In [1]:
#!pip install imblearn

In [2]:
#!pip install xgboost

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, precision_score
#from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier

from imblearn.over_sampling import SMOTE

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [2]:
# reading in full data set
brfss_total = pd.read_csv("../csv_data/brfss_total.csv")

In [3]:
brfss_total.head()

Unnamed: 0.1,Unnamed: 0,_STATE,DISPCODE,PHYSHLTH,MENTHLTH,USENOW3,HISPANC2,MARITAL,CHILDREN,EMPLOY,...,ACETTHEM,ACEHVSEX,MSCODE,_IMPAGE,_RFHLTH,_SMOKER3,_PRACE,_EDUCAG,_INCOMG,_TOTINDA
0,0,5.0,110.0,0.0,0.0,3.0,2.0,1.0,1.0,2.0,...,1.0,1.0,5.0,53.0,1.0,4.0,1.0,3.0,5.0,1.0
1,1,5.0,110.0,15.0,0.0,3.0,2.0,2.0,0.0,0.0,...,1.0,1.0,5.0,64.0,2.0,3.0,1.0,2.0,2.0,1.0
2,2,5.0,110.0,6.0,0.0,3.0,2.0,1.0,0.0,0.0,...,1.0,1.0,5.0,58.0,1.0,4.0,1.0,3.0,2.0,1.0
3,3,5.0,110.0,30.0,0.0,3.0,2.0,1.0,0.0,0.0,...,1.0,1.0,5.0,76.0,2.0,4.0,1.0,1.0,0.0,2.0
4,4,5.0,110.0,13.0,0.0,3.0,2.0,3.0,0.0,0.0,...,1.0,0.0,5.0,82.0,2.0,3.0,1.0,2.0,0.0,2.0


In [4]:
# just making sure no nulls are present
brfss_total.isna().sum()

Unnamed: 0    0
_STATE        0
DISPCODE      0
PHYSHLTH      0
MENTHLTH      0
USENOW3       0
HISPANC2      0
MARITAL       0
CHILDREN      0
EMPLOY        0
RENTHOM1      0
SEX           0
QLACTLM2      0
ACEDEPRS      0
ACEDRINK      0
ACEDRUGS      0
ACEPRISN      0
ACEDIVRC      0
ACEPUNCH      0
ACEHURT       0
ACESWEAR      0
ACETOUCH      0
ACETTHEM      0
ACEHVSEX      0
MSCODE        0
_IMPAGE       0
_RFHLTH       0
_SMOKER3      0
_PRACE        0
_EDUCAG       0
_INCOMG       0
_TOTINDA      0
dtype: int64

In [5]:
brfss_total.drop(columns=['Unnamed: 0'], inplace=True)

**Variables I will try to predict with my models:**
- USENOW3: Do you currently use chewing tobacco, snuff, or snus every day, some days, or not at all?
    - classification
    - 0 = Don't know, Not sure or Refused, 1 = every day, 2 = some days, 3 = not at all
- QLACTLM2: Are you limited in any way in any activities because of physical, mental, or emotional problems?
    - classification
    - 0 = Don't know, Not sure or Refused, 1 = yes, 2 = no
- _RFHLTH: Adults with good or better health vs. fair or poor health
    - classification
    - based off of GENHLTH
    - 0 = Don't know, Not sure or Refused, 1 = Good or Better Health, 2 = Fair or Poor Health
- _SMOKER3: Four-level smoker status: Everyday smoker, Someday smoker, Former smoker, Non-smoker
    - classification
    - based off of SMOKE100 & SMOKEDAY
    - 0 = Don't know, Not sure or Refused, 1 = Current smoker (now smokes every day), 2 = Current smoker (now smokes some days), 3 = Former smoker, 4 = Never smoked

**Will OneHotEncode/ dummify ordinal/nominal features**

**Will only use a sample of the data set for models so they can run faster**

**Will use SMOTE to compensensate for imbalanced classes**

**Will aggregate all ACEs into two groups: Abuse and Household Challenges**

In [6]:
np.random.seed(151)

In [7]:
# taking a small sample so that my models will run a little faster
brfss_total_sample = brfss_total.sample(frac=0.05, axis=0)

brfss_total_sample.shape

(5878, 31)

In [8]:
# creating X variable with all features
X_all = brfss_total_sample.drop(columns=['USENOW3', 'QLACTLM2', '_RFHLTH', '_SMOKER3'])

In [9]:
# creating the 4 y's
y_tobacco = brfss_total_sample['USENOW3']
y_activity = brfss_total_sample['QLACTLM2']
y_health = brfss_total_sample['_RFHLTH']
y_smoker = brfss_total_sample['_SMOKER3']

In [10]:
#original baseline for tobacco
y_tobacco.value_counts(normalize=True)

3.0    0.966315
1.0    0.019905
2.0    0.011398
0.0    0.002382
Name: USENOW3, dtype: float64

In [11]:
#original baseline for activity
y_activity.value_counts(normalize=True)

2.0    0.717251
1.0    0.277305
0.0    0.005444
Name: QLACTLM2, dtype: float64

In [12]:
#original baseline for health
y_health.value_counts(normalize=True)

1.0    0.815073
2.0    0.182375
0.0    0.002552
Name: _RFHLTH, dtype: float64

In [13]:
#original baseline for smoker
y_smoker.value_counts(normalize=True)

4.0    0.524498
3.0    0.305376
1.0    0.120449
2.0    0.044063
0.0    0.005614
Name: _SMOKER3, dtype: float64

In [14]:
# splitting X up so I can do some engineering on the nominal data and ACE columns
X_num = X_all[['PHYSHLTH', 'MENTHLTH', 'CHILDREN']]
X_cat = X_all[['_STATE', 'DISPCODE', 'HISPANC2', 'MARITAL', 'EMPLOY', 'RENTHOM1', 'SEX', 'MSCODE', 
               '_IMPAGE', '_PRACE', '_EDUCAG', '_INCOMG','_TOTINDA']]
ace = X_all[['ACEDEPRS', 'ACEDRINK', 'ACEDRUGS', 'ACEPRISN', 'ACEDIVRC', 'ACEPUNCH', 'ACEHURT', 'ACESWEAR', 
                       'ACETOUCH', 'ACETTHEM', 'ACEHVSEX']]

In [15]:
# updating ACE columns to be a count depending on the question
# first 6 questions are yes or no, so yes will be be counted as 1 and no will be counted as 0
# last 5 are questions of frequency, never = 0, once = 1, more than once will equal 2 (since not given an exact number)
ace['ACEDEPRS'] = ace['ACEDEPRS'].map({1:1, 2:0, 0:0})
ace['ACEDRINK'] = ace['ACEDRINK'].map({1:1, 2:0, 0:0})
ace['ACEDRUGS'] = ace['ACEDRUGS'].map({1:1, 2:0, 0:0})
ace['ACEPRISN'] = ace['ACEPRISN'].map({1:1, 2:0, 0:0})
ace['ACEDIVRC'] = ace['ACEDIVRC'].map({1:1, 2:0, 0:0})
ace['ACEPUNCH'] = ace['ACEPUNCH'].map({1:0, 2:1, 3:2})
ace['ACEHURT'] = ace['ACEHURT'].map({1:0, 2:1, 3:2, 0:0})
ace['ACESWEAR'] = ace['ACESWEAR'].map({1:0, 2:1, 3:2, 0:0})
ace['ACETOUCH'] = ace['ACETOUCH'].map({1:0, 2:1, 3:2, 0:0})
ace['ACETTHEM'] = ace['ACETTHEM'].map({1:0, 2:1, 3:2, 0:0})
ace['ACEHVSEX'] = ace['ACEHVSEX'].map({1:0, 2:1, 3:2, 0:0})

In [16]:
ace['count'] = ace.sum(axis = 1)
X_num['ACE_Count'] = ace['count']

In [17]:
X_cat = X_cat.astype(str)

In [18]:
# dummifying nominal variables for X_all
X_dummies = pd.get_dummies(X_cat, drop_first=True)
X_dummies.head()

Unnamed: 0,_STATE_15.0,_STATE_19.0,_STATE_22.0,_STATE_27.0,_STATE_30.0,_STATE_32.0,_STATE_37.0,_STATE_40.0,_STATE_47.0,_STATE_5.0,...,_EDUCAG_2.0,_EDUCAG_3.0,_EDUCAG_4.0,_INCOMG_1.0,_INCOMG_2.0,_INCOMG_3.0,_INCOMG_4.0,_INCOMG_5.0,_TOTINDA_1.0,_TOTINDA_2.0
12235,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,1,0
55470,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,1,1,0
29353,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
44625,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,1,0
53217,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [19]:
X_num.head()

Unnamed: 0,PHYSHLTH,MENTHLTH,CHILDREN,ACE_Count
12235,0.0,0.0,0.0,0.0
55470,0.0,0.0,0.0,0.0
29353,0.0,0.0,0.0,0.0
44625,0.0,0.0,0.0,0.0
53217,0.0,0.0,0.0,0.0


In [20]:
# merging numerical and nominal data into one data frame
X_all = X_num.merge(X_dummies, left_index=True, right_index=True)

In [21]:
X_all.shape

(5878, 137)

In [22]:
# to compensate for unbalanced classes in my y's will use SMOTE

sm = SMOTE(random_state=151)
X_all1, y_tobacco = sm.fit_resample(X_all, y_tobacco)

sm2 = SMOTE(random_state=151)
X_all2, y_activity = sm2.fit_resample(X_all, y_activity)

sm3 = SMOTE(random_state=151)
X_all3, y_health = sm3.fit_resample(X_all, y_health)

sm4 = SMOTE(random_state=151)
X_all4, y_smoker = sm4.fit_resample(X_all, y_smoker)

In [23]:
# new baseline for tobacco
y_tobacco.value_counts(normalize=True)

0.0    0.25
1.0    0.25
2.0    0.25
3.0    0.25
Name: USENOW3, dtype: float64

In [24]:
# looks like SMOTE has increased the size of my y's more than 4x, so will probably take some time for models to run
y_tobacco.shape

(22720,)

In [25]:
# new baseline for activity
y_activity.value_counts(normalize=True)

0.0    0.333333
1.0    0.333333
2.0    0.333333
Name: QLACTLM2, dtype: float64

In [26]:
# new baseline for health
y_health.value_counts(normalize=True)

0.0    0.333333
2.0    0.333333
1.0    0.333333
Name: _RFHLTH, dtype: float64

In [27]:
# new baseline for smoker
y_smoker.value_counts(normalize=True)

0.0    0.2
2.0    0.2
1.0    0.2
3.0    0.2
4.0    0.2
Name: _SMOKER3, dtype: float64

In [28]:
X_all1.shape

(22720, 137)

In [29]:
# creating training and testing sets for all y's (stratified on y, but since the classes are equal probably didn't have to)
X_train_all, X_test_all, y_train_tobacco, y_test_tobacco = train_test_split(X_all1, y_tobacco, random_state = 151, stratify=y_tobacco)
X_train_all2, X_test_all2, y_train_activity, y_test_activity = train_test_split(X_all2, y_activity, random_state = 151, stratify=y_activity)
X_train_all3, X_test_all3, y_train_health, y_test_health = train_test_split(X_all3, y_health, random_state = 151, stratify=y_health)
X_train_all4, X_test_all4, y_train_smoker, y_test_smoker = train_test_split(X_all4, y_smoker, random_state = 151, stratify=y_smoker)

### Pipeline and Gridsearch with all features as predictors (Logistic Regression)

In [30]:
pipe_all_log = make_pipeline(SelectKBest(f_classif), StandardScaler(), LogisticRegression(max_iter=10_000))

params_all_log = {'selectkbest__k': range(1, 137, 15),
         'logisticregression__C': [0.01, 0.5, 1]}

gs_all_log = GridSearchCV(pipe_all_log, params_all_log, cv=3)

gs_all_log.fit(X_train_all, y_train_tobacco)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(max_iter=10000))]),
             param_grid={'logisticregression__C': [0.01, 0.5, 1],
                         'selectkbest__k': range(1, 137, 15)})

In [31]:
pipe2_all_log = make_pipeline(SelectKBest(f_classif), StandardScaler(), LogisticRegression(max_iter=10_000))

gs2_all_log = GridSearchCV(pipe2_all_log, params_all_log, cv=3)

gs2_all_log.fit(X_train_all2, y_train_activity)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(max_iter=10000))]),
             param_grid={'logisticregression__C': [0.01, 0.5, 1],
                         'selectkbest__k': range(1, 137, 15)})

In [32]:
pipe3_all_log = make_pipeline(SelectKBest(f_classif), StandardScaler(), LogisticRegression(max_iter=10_000))

gs3_all_log = GridSearchCV(pipe3_all_log, params_all_log, cv=3)

gs3_all_log.fit(X_train_all3, y_train_health)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(max_iter=10000))]),
             param_grid={'logisticregression__C': [0.01, 0.5, 1],
                         'selectkbest__k': range(1, 137, 15)})

In [33]:
pipe4_all_log = make_pipeline(SelectKBest(f_classif), StandardScaler(), LogisticRegression(max_iter=10_000))

gs4_all_log = GridSearchCV(pipe4_all_log, params_all_log, cv=3)

gs4_all_log.fit(X_train_all4, y_train_smoker)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(max_iter=10000))]),
             param_grid={'logisticregression__C': [0.01, 0.5, 1],
                         'selectkbest__k': range(1, 137, 15)})

In [34]:
tobacco_all_log_preds = gs_all_log.predict(X_test_all)
activity_all_log_preds = gs2_all_log.predict(X_test_all2)
health_all_log_preds = gs3_all_log.predict(X_test_all3)
smoker_all_log_preds = gs4_all_log.predict(X_test_all4)

tobacco_all_log_prec = precision_score(y_test_tobacco, tobacco_all_log_preds, average='micro')
activity_all_log_prec = precision_score(y_test_activity, activity_all_log_preds, average='micro')
health_all_log_prec = precision_score(y_test_health, health_all_log_preds, average='micro')
smoker_all_log_prec = precision_score(y_test_smoker, smoker_all_log_preds, average='micro')

In [35]:
print(f' training accuracy for tobacco: {gs_all_log.score(X_train_all, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_all_log.score(X_train_all2, y_train_activity)}')
print(f' training accuracy for health: {gs3_all_log.score(X_train_all3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_all_log.score(X_train_all4, y_train_smoker)}')

 training accuracy for tobacco: 0.8605292495452679
 training accuracy for activity: 0.8634201819685691
 training accuracy for health: 0.8897476487568675
 training accuracy for smoker: 0.6099106135554977


In [36]:
print(f' testing accuracy for tobacco: {gs_all_log.score(X_test_all, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_all_log.score(X_test_all2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_all_log.score(X_test_all3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_all_log.score(X_test_all4, y_test_smoker)}')

 testing accuracy for tobacco: 0.8582995951417004
 testing accuracy for activity: 0.8465116279069768
 testing accuracy for health: 0.8930167597765363
 testing accuracy for smoker: 0.5934409161894847


In [37]:
print(f'Precision for tobacco: {tobacco_all_log_prec}')
print(f'Precision for activity: {activity_all_log_prec}')
print(f'Precision for health: {health_all_log_prec}')
print(f'Precision for smoker: {smoker_all_log_prec}')

Precision for tobacco: 0.8582995951417004
Precision for activity: 0.8465116279069768
Precision for health: 0.8930167597765363
Precision for smoker: 0.5934409161894847


In [38]:
print(gs_all_log.best_params_)
print(gs2_all_log.best_params_)
print(gs3_all_log.best_params_)
print(gs4_all_log.best_params_)

{'logisticregression__C': 0.5, 'selectkbest__k': 136}
{'logisticregression__C': 0.5, 'selectkbest__k': 136}
{'logisticregression__C': 1, 'selectkbest__k': 136}
{'logisticregression__C': 1, 'selectkbest__k': 136}


**observations**

### Pipeline and Gridsearch with all features as predictors (Random Forest Classifier)

In [39]:
pipe_all_rfc = make_pipeline(SelectKBest(f_classif), StandardScaler(), RandomForestClassifier())

params_all_rfc = {'selectkbest__k': range(1, 137, 15),
                  'randomforestclassifier__n_estimators': [100, 300, 500],
                  'randomforestclassifier__max_depth': [None, 3, 5], }
                 #'randomforestclassifier__min_samples_split': [1, 3, 5],
                 #'randomforestclassifier__min_samples_leaf': [1, 3, 5]}

gs_all_rfc = GridSearchCV(pipe_all_rfc, params_all_rfc, cv=3)

gs_all_rfc.fit(X_train_all, y_train_tobacco)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier())]),
             param_grid={'randomforestclassifier__max_depth': [None, 3, 5],
                         'randomforestclassifier__n_estimators': [100, 300,
                                                                  500],
                         'selectkbest__k': range(1, 137, 15)})

In [40]:
pipe2_all_rfc = make_pipeline(SelectKBest(f_classif), StandardScaler(), RandomForestClassifier())

gs2_all_rfc = GridSearchCV(pipe2_all_rfc, params_all_rfc, cv=3)

gs2_all_rfc.fit(X_train_all2, y_train_activity)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier())]),
             param_grid={'randomforestclassifier__max_depth': [None, 3, 5],
                         'randomforestclassifier__n_estimators': [100, 300,
                                                                  500],
                         'selectkbest__k': range(1, 137, 15)})

In [41]:
pipe3_all_rfc = make_pipeline(SelectKBest(f_classif), StandardScaler(), RandomForestClassifier())

gs3_all_rfc = GridSearchCV(pipe3_all_rfc, params_all_rfc, cv=3)

gs3_all_rfc.fit(X_train_all3, y_train_health)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier())]),
             param_grid={'randomforestclassifier__max_depth': [None, 3, 5],
                         'randomforestclassifier__n_estimators': [100, 300,
                                                                  500],
                         'selectkbest__k': range(1, 137, 15)})

In [42]:
pipe4_all_rfc = make_pipeline(SelectKBest(f_classif), StandardScaler(), RandomForestClassifier())

gs4_all_rfc = GridSearchCV(pipe4_all_rfc, params_all_rfc, cv=3)

gs4_all_rfc.fit(X_train_all4, y_train_smoker)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier())]),
             param_grid={'randomforestclassifier__max_depth': [None, 3, 5],
                         'randomforestclassifier__n_estimators': [100, 300,
                                                                  500],
                         'selectkbest__k': range(1, 137, 15)})

In [43]:
tobacco_all_rfc_preds = gs_all_rfc.predict(X_test_all)
activity_all_rfc_preds = gs2_all_rfc.predict(X_test_all2)
health_all_rfc_preds = gs3_all_rfc.predict(X_test_all3)
smoker_all_rfc_preds = gs4_all_rfc.predict(X_test_all4)

tobacco_all_rfc_prec = precision_score(y_test_tobacco, tobacco_all_rfc_preds, average='micro')
activity_all_rfc_prec = precision_score(y_test_activity, activity_all_rfc_preds, average='micro')
health_all_rfc_prec = precision_score(y_test_health, health_all_rfc_preds, average='micro')
smoker_all_rfc_prec = precision_score(y_test_smoker, smoker_all_rfc_preds, average='micro')

In [44]:
print(f' training accuracy for tobacco: {gs_all_rfc.score(X_train_all, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_all_rfc.score(X_train_all2, y_train_activity)}')
print(f' training accuracy for health: {gs3_all_rfc.score(X_train_all3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_all_rfc.score(X_train_all4, y_train_smoker)}')

 training accuracy for tobacco: 0.9990611981458664
 training accuracy for activity: 0.9998966087675765
 training accuracy for health: 0.9997206443802961
 training accuracy for smoker: 0.9997396511325176


In [45]:
print(f' testing accuracy for tobacco: {gs_all_rfc.score(X_test_all, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_all_rfc.score(X_test_all2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_all_rfc.score(X_test_all3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_all_rfc.score(X_test_all4, y_test_smoker)}')

 testing accuracy for tobacco: 0.9927829607463474
 testing accuracy for activity: 0.8986046511627906
 testing accuracy for health: 0.9444134078212291
 testing accuracy for smoker: 0.8079125455491931


In [46]:
print(f'Precision for tobacco: {tobacco_all_rfc_prec}')
print(f'Precision for activity: {activity_all_rfc_prec}')
print(f'Precision for health: {health_all_rfc_prec}')
print(f'Precision for smoker: {smoker_all_rfc_prec}')

Precision for tobacco: 0.9927829607463474
Precision for activity: 0.8986046511627906
Precision for health: 0.9444134078212291
Precision for smoker: 0.8079125455491931


In [47]:
print(gs_all_rfc.best_params_)
print(gs2_all_rfc.best_params_)
print(gs3_all_rfc.best_params_)
print(gs4_all_rfc.best_params_)

{'randomforestclassifier__max_depth': None, 'randomforestclassifier__n_estimators': 500, 'selectkbest__k': 136}
{'randomforestclassifier__max_depth': None, 'randomforestclassifier__n_estimators': 300, 'selectkbest__k': 136}
{'randomforestclassifier__max_depth': None, 'randomforestclassifier__n_estimators': 100, 'selectkbest__k': 106}
{'randomforestclassifier__max_depth': None, 'randomforestclassifier__n_estimators': 500, 'selectkbest__k': 106}


### Pipeline and Gridsearch with all features as predictors (Extra Trees Classifier)

In [48]:
pipe_all_etc = make_pipeline(SelectKBest(f_classif), StandardScaler(), ExtraTreesClassifier())

params_all_etc = {'selectkbest__k': range(1, 137, 15),
                  'extratreesclassifier__n_estimators': [100, 300, 500],
                  'extratreesclassifier__max_depth': [None, 3, 5], }
                 #'extratreesclassifier__min_samples_split': [1, 3, 5],
                 #'extratreesclassifier__min_samples_leaf': [1, 3, 5]}

gs_all_etc = GridSearchCV(pipe_all_etc, params_all_etc, cv=3)

gs_all_etc.fit(X_train_all, y_train_tobacco)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('extratreesclassifier',
                                        ExtraTreesClassifier())]),
             param_grid={'extratreesclassifier__max_depth': [None, 3, 5],
                         'extratreesclassifier__n_estimators': [100, 300, 500],
                         'selectkbest__k': range(1, 137, 15)})

In [49]:
pipe2_all_etc = make_pipeline(SelectKBest(f_classif), StandardScaler(), ExtraTreesClassifier())

gs2_all_etc = GridSearchCV(pipe2_all_etc, params_all_etc, cv=3)

gs2_all_etc.fit(X_train_all2, y_train_activity)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('extratreesclassifier',
                                        ExtraTreesClassifier())]),
             param_grid={'extratreesclassifier__max_depth': [None, 3, 5],
                         'extratreesclassifier__n_estimators': [100, 300, 500],
                         'selectkbest__k': range(1, 137, 15)})

In [50]:
pipe3_all_etc = make_pipeline(SelectKBest(f_classif), StandardScaler(), ExtraTreesClassifier())

gs3_all_etc = GridSearchCV(pipe3_all_etc, params_all_etc, cv=3)

gs3_all_etc.fit(X_train_all3, y_train_health)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('extratreesclassifier',
                                        ExtraTreesClassifier())]),
             param_grid={'extratreesclassifier__max_depth': [None, 3, 5],
                         'extratreesclassifier__n_estimators': [100, 300, 500],
                         'selectkbest__k': range(1, 137, 15)})

In [51]:
pipe4_all_etc = make_pipeline(SelectKBest(f_classif), StandardScaler(), ExtraTreesClassifier())

gs4_all_etc = GridSearchCV(pipe4_all_etc, params_all_etc, cv=3)

gs4_all_etc.fit(X_train_all4, y_train_smoker)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('extratreesclassifier',
                                        ExtraTreesClassifier())]),
             param_grid={'extratreesclassifier__max_depth': [None, 3, 5],
                         'extratreesclassifier__n_estimators': [100, 300, 500],
                         'selectkbest__k': range(1, 137, 15)})

In [52]:
tobacco_all_etc_preds = gs_all_etc.predict(X_test_all)
activity_all_etc_preds = gs2_all_etc.predict(X_test_all2)
health_all_etc_preds = gs3_all_etc.predict(X_test_all3)
smoker_all_etc_preds = gs4_all_etc.predict(X_test_all4)

tobacco_all_etc_prec = precision_score(y_test_tobacco, tobacco_all_etc_preds, average='micro')
activity_all_etc_prec = precision_score(y_test_activity, activity_all_etc_preds, average='micro')
health_all_etc_prec = precision_score(y_test_health, health_all_etc_preds, average='micro')
smoker_all_etc_prec = precision_score(y_test_smoker, smoker_all_etc_preds, average='micro')

In [53]:
print(f' training accuracy for tobacco: {gs_all_etc.score(X_train_all, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_all_etc.score(X_train_all2, y_train_activity)}')
print(f' training accuracy for health: {gs3_all_etc.score(X_train_all3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_all_etc.score(X_train_all4, y_train_smoker)}')

 training accuracy for tobacco: 0.9990611981458664
 training accuracy for activity: 0.9998966087675765
 training accuracy for health: 0.9996275258403948
 training accuracy for smoker: 0.999826434088345


In [54]:
print(f' testing accuracy for tobacco: {gs_all_etc.score(X_test_all, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_all_etc.score(X_test_all2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_all_etc.score(X_test_all3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_all_etc.score(X_test_all4, y_test_smoker)}')

 testing accuracy for tobacco: 0.991550783312797
 testing accuracy for activity: 0.8976744186046511
 testing accuracy for health: 0.9441340782122905
 testing accuracy for smoker: 0.7902134305049453


In [55]:
print(f'Precision for tobacco: {tobacco_all_etc_prec}')
print(f'Precision for activity: {activity_all_etc_prec}')
print(f'Precision for health: {health_all_etc_prec}')
print(f'Precision for smoker: {smoker_all_etc_prec}')

Precision for tobacco: 0.991550783312797
Precision for activity: 0.8976744186046511
Precision for health: 0.9441340782122905
Precision for smoker: 0.7902134305049453


In [56]:
print(gs_all_etc.best_params_)
print(gs2_all_etc.best_params_)
print(gs3_all_etc.best_params_)
print(gs4_all_etc.best_params_)

{'extratreesclassifier__max_depth': None, 'extratreesclassifier__n_estimators': 500, 'selectkbest__k': 136}
{'extratreesclassifier__max_depth': None, 'extratreesclassifier__n_estimators': 500, 'selectkbest__k': 136}
{'extratreesclassifier__max_depth': None, 'extratreesclassifier__n_estimators': 500, 'selectkbest__k': 76}
{'extratreesclassifier__max_depth': None, 'extratreesclassifier__n_estimators': 300, 'selectkbest__k': 136}


### Pipeline and Gridsearch with all features as predictors (Ada Boost Classifier)

In [57]:
pipe_all_abc = make_pipeline(SelectKBest(f_classif), StandardScaler(), AdaBoostClassifier())

params_all_abc = {'selectkbest__k': range(1, 137, 15),
                  'adaboostclassifier__learning_rate': [0.5, 1.0],
                  'adaboostclassifier__n_estimators': [10, 15, 20, 25], }

gs_all_abc = GridSearchCV(pipe_all_abc, params_all_abc, cv=3)

gs_all_abc.fit(X_train_all, y_train_tobacco)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('adaboostclassifier',
                                        AdaBoostClassifier())]),
             param_grid={'adaboostclassifier__learning_rate': [0.5, 1.0],
                         'adaboostclassifier__n_estimators': [10, 15, 20, 25],
                         'selectkbest__k': range(1, 137, 15)})

In [58]:
pipe2_all_abc = make_pipeline(SelectKBest(f_classif), StandardScaler(), AdaBoostClassifier())

gs2_all_abc = GridSearchCV(pipe2_all_abc, params_all_abc, cv=3)

gs2_all_abc.fit(X_train_all2, y_train_activity)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('adaboostclassifier',
                                        AdaBoostClassifier())]),
             param_grid={'adaboostclassifier__learning_rate': [0.5, 1.0],
                         'adaboostclassifier__n_estimators': [10, 15, 20, 25],
                         'selectkbest__k': range(1, 137, 15)})

In [59]:
pipe3_all_abc = make_pipeline(SelectKBest(f_classif), StandardScaler(), AdaBoostClassifier())

gs3_all_abc = GridSearchCV(pipe3_all_abc, params_all_abc, cv=3)

gs3_all_abc.fit(X_train_all3, y_train_health)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('adaboostclassifier',
                                        AdaBoostClassifier())]),
             param_grid={'adaboostclassifier__learning_rate': [0.5, 1.0],
                         'adaboostclassifier__n_estimators': [10, 15, 20, 25],
                         'selectkbest__k': range(1, 137, 15)})

In [60]:
pipe4_all_abc = make_pipeline(SelectKBest(f_classif), StandardScaler(), AdaBoostClassifier())

gs4_all_abc = GridSearchCV(pipe4_all_abc, params_all_abc, cv=3)

gs4_all_abc.fit(X_train_all4, y_train_smoker)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('adaboostclassifier',
                                        AdaBoostClassifier())]),
             param_grid={'adaboostclassifier__learning_rate': [0.5, 1.0],
                         'adaboostclassifier__n_estimators': [10, 15, 20, 25],
                         'selectkbest__k': range(1, 137, 15)})

In [61]:
tobacco_all_abc_preds = gs_all_abc.predict(X_test_all)
activity_all_abc_preds = gs2_all_abc.predict(X_test_all2)
health_all_abc_preds = gs3_all_abc.predict(X_test_all3)
smoker_all_abc_preds = gs4_all_abc.predict(X_test_all4)

tobacco_all_abc_prec = precision_score(y_test_tobacco, tobacco_all_abc_preds, average='micro')
activity_all_abc_prec = precision_score(y_test_activity, activity_all_abc_preds, average='micro')
health_all_abc_prec = precision_score(y_test_health, health_all_abc_preds, average='micro')
smoker_all_abc_prec = precision_score(y_test_smoker, smoker_all_abc_preds, average='micro')

In [62]:
print(f' training accuracy for tobacco: {gs_all_abc.score(X_train_all, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_all_abc.score(X_train_all2, y_train_activity)}')
print(f' training accuracy for health: {gs3_all_abc.score(X_train_all3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_all_abc.score(X_train_all4, y_train_smoker)}')

 training accuracy for tobacco: 0.6989966555183946
 training accuracy for activity: 0.7797766749379652
 training accuracy for health: 0.7882484402644566
 training accuracy for smoker: 0.4865052503688276


In [63]:
print(f' testing accuracy for tobacco: {gs_all_abc.score(X_test_all, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_all_abc.score(X_test_all2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_all_abc.score(X_test_all3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_all_abc.score(X_test_all4, y_test_smoker)}')

 testing accuracy for tobacco: 0.6972364020418941
 testing accuracy for activity: 0.7643410852713178
 testing accuracy for health: 0.7829608938547487
 testing accuracy for smoker: 0.47683498178032274


In [64]:
print(f'Precision for tobacco: {tobacco_all_abc_prec}')
print(f'Precision for activity: {activity_all_abc_prec}')
print(f'Precision for health: {health_all_abc_prec}')
print(f'Precision for smoker: {smoker_all_abc_prec}')

Precision for tobacco: 0.6972364020418941
Precision for activity: 0.7643410852713178
Precision for health: 0.7829608938547487
Precision for smoker: 0.47683498178032274


In [65]:
print(gs_all_abc.best_params_)
print(gs2_all_abc.best_params_)
print(gs3_all_abc.best_params_)
print(gs4_all_abc.best_params_)

{'adaboostclassifier__learning_rate': 1.0, 'adaboostclassifier__n_estimators': 25, 'selectkbest__k': 31}
{'adaboostclassifier__learning_rate': 1.0, 'adaboostclassifier__n_estimators': 25, 'selectkbest__k': 31}
{'adaboostclassifier__learning_rate': 1.0, 'adaboostclassifier__n_estimators': 25, 'selectkbest__k': 31}
{'adaboostclassifier__learning_rate': 1.0, 'adaboostclassifier__n_estimators': 25, 'selectkbest__k': 121}


### Pipeline and Gridsearch with all features as predictors (XG Boost Classifier)

In [66]:
pipe_all_xgb = make_pipeline(SelectKBest(f_classif), StandardScaler(), xgb.XGBClassifier())

params_all_xgb = {'selectkbest__k': range(1, 137, 15),
                  'xgbclassifier__learning_rate': [0.5, 1.0],
                  'xgbclassifier__n_estimators': [10, 15, 20, 25],
                 'xgbclassifier__max_depth': [3, 5]}

gs_all_xgb = GridSearchCV(pipe_all_xgb, params_all_xgb, cv=3)

gs_all_xgb.fit(X_train_all, y_train_tobacco)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None...
             

In [67]:
pipe2_all_xgb = make_pipeline(SelectKBest(f_classif), StandardScaler(), xgb.XGBClassifier())

gs2_all_xgb = GridSearchCV(pipe2_all_xgb, params_all_xgb, cv=3)

gs2_all_xgb.fit(X_train_all2, y_train_activity)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None...
             

In [68]:
pipe3_all_xgb = make_pipeline(SelectKBest(f_classif), StandardScaler(), xgb.XGBClassifier())

gs3_all_xgb = GridSearchCV(pipe3_all_xgb, params_all_xgb, cv=3)

gs3_all_xgb.fit(X_train_all3, y_train_health)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None...
             

In [69]:
pipe4_all_xgb = make_pipeline(SelectKBest(f_classif), StandardScaler(), xgb.XGBClassifier())

gs4_all_xgb = GridSearchCV(pipe4_all_xgb, params_all_xgb, cv=3)

gs4_all_xgb.fit(X_train_all4, y_train_smoker)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None...
             

In [70]:
tobacco_all_xgb_preds = gs_all_xgb.predict(X_test_all)
activity_all_xgb_preds = gs2_all_xgb.predict(X_test_all2)
health_all_xgb_preds = gs3_all_xgb.predict(X_test_all3)
smoker_all_xgb_preds = gs4_all_xgb.predict(X_test_all4)

tobacco_all_xgb_prec = precision_score(y_test_tobacco, tobacco_all_xgb_preds, average='micro')
activity_all_xgb_prec = precision_score(y_test_activity, activity_all_xgb_preds, average='micro')
health_all_xgb_prec = precision_score(y_test_health, health_all_xgb_preds, average='micro')
smoker_all_xgb_prec = precision_score(y_test_smoker, smoker_all_xgb_preds, average='micro')

In [71]:
print(f' training accuracy for tobacco: {gs_all_xgb.score(X_train_all, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_all_xgb.score(X_train_all2, y_train_activity)}')
print(f' training accuracy for health: {gs3_all_xgb.score(X_train_all3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_all_xgb.score(X_train_all4, y_train_smoker)}')

 training accuracy for tobacco: 0.9943085137593147
 training accuracy for activity: 0.9478908188585607
 training accuracy for health: 0.9720644380296117
 training accuracy for smoker: 0.83190141456218


In [72]:
print(f' testing accuracy for tobacco: {gs_all_xgb.score(X_test_all, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_all_xgb.score(X_test_all2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_all_xgb.score(X_test_all3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_all_xgb.score(X_test_all4, y_test_smoker)}')

 testing accuracy for tobacco: 0.988558352402746
 testing accuracy for activity: 0.8793798449612403
 testing accuracy for health: 0.9290502793296089
 testing accuracy for smoker: 0.7139510671525248


In [73]:
print(f'Precision for tobacco: {tobacco_all_xgb_prec}')
print(f'Precision for activity: {activity_all_xgb_prec}')
print(f'Precision for health: {health_all_xgb_prec}')
print(f'Precision for smoker: {smoker_all_xgb_prec}')

Precision for tobacco: 0.988558352402746
Precision for activity: 0.8793798449612403
Precision for health: 0.9290502793296089
Precision for smoker: 0.7139510671525248


In [74]:
print(gs_all_xgb.best_params_)
print(gs2_all_xgb.best_params_)
print(gs3_all_xgb.best_params_)
print(gs4_all_xgb.best_params_)

{'selectkbest__k': 91, 'xgbclassifier__learning_rate': 1.0, 'xgbclassifier__max_depth': 5, 'xgbclassifier__n_estimators': 25}
{'selectkbest__k': 136, 'xgbclassifier__learning_rate': 1.0, 'xgbclassifier__max_depth': 5, 'xgbclassifier__n_estimators': 25}
{'selectkbest__k': 76, 'xgbclassifier__learning_rate': 1.0, 'xgbclassifier__max_depth': 5, 'xgbclassifier__n_estimators': 25}
{'selectkbest__k': 61, 'xgbclassifier__learning_rate': 1.0, 'xgbclassifier__max_depth': 5, 'xgbclassifier__n_estimators': 25}
