In [91]:
#!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.7.0-py3-none-any.whl (167 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.7.0 imblearn-0.0


In [1]:
#!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.2.1-py3-none-win_amd64.whl (86.5 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.2.1


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, precision_score
#from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier

from imblearn.over_sampling import SMOTE

import xgboost as xgb

In [3]:
# reading in full data set
brfss_total = pd.read_csv("./brfss_total.csv")

In [4]:
brfss_total.head()

Unnamed: 0,_STATE,DISPCODE,PHYSHLTH,MENTHLTH,USENOW3,HISPANC2,MARITAL,CHILDREN,EMPLOY,RENTHOM1,...,ACETTHEM,ACEHVSEX,MSCODE,_IMPAGE,_RFHLTH,_SMOKER3,_PRACE,_EDUCAG,_INCOMG,_TOTINDA
14697,5.0,110.0,0.0,0.0,3.0,2.0,1.0,1.0,2.0,0.0,...,1.0,1.0,5.0,53.0,1.0,4.0,1.0,3.0,5.0,1.0
14699,5.0,110.0,15.0,0.0,3.0,2.0,2.0,0.0,0.0,0.0,...,1.0,1.0,5.0,64.0,2.0,3.0,1.0,2.0,2.0,1.0
14700,5.0,110.0,6.0,0.0,3.0,2.0,1.0,0.0,0.0,0.0,...,1.0,1.0,5.0,58.0,1.0,4.0,1.0,3.0,2.0,1.0
14701,5.0,110.0,30.0,0.0,3.0,2.0,1.0,0.0,0.0,0.0,...,1.0,1.0,5.0,76.0,2.0,4.0,1.0,1.0,0.0,2.0
14704,5.0,110.0,13.0,0.0,3.0,2.0,3.0,0.0,0.0,0.0,...,1.0,0.0,5.0,82.0,2.0,3.0,1.0,2.0,0.0,2.0


In [5]:
# just making sure no nulls are present
brfss_total.isna().sum()

_STATE      0
DISPCODE    0
PHYSHLTH    0
MENTHLTH    0
USENOW3     0
HISPANC2    0
MARITAL     0
CHILDREN    0
EMPLOY      0
RENTHOM1    0
SEX         0
QLACTLM2    0
ACEDEPRS    0
ACEDRINK    0
ACEDRUGS    0
ACEPRISN    0
ACEDIVRC    0
ACEPUNCH    0
ACEHURT     0
ACESWEAR    0
ACETOUCH    0
ACETTHEM    0
ACEHVSEX    0
MSCODE      0
_IMPAGE     0
_RFHLTH     0
_SMOKER3    0
_PRACE      0
_EDUCAG     0
_INCOMG     0
_TOTINDA    0
dtype: int64

In [6]:
brfss_total.drop(columns=['Unnamed: 0', 'index'], inplace=True)
brfss_total

Unnamed: 0,_STATE,DISPCODE,PHYSHLTH,MENTHLTH,USENOW3,HISPANC2,MARITAL,CHILDREN,EMPLOY,RENTHOM1,...,ACETTHEM,ACEHVSEX,MSCODE,_IMPAGE,_RFHLTH,_SMOKER3,_PRACE,_EDUCAG,_INCOMG,_TOTINDA
0,5.0,110.0,0.0,0.0,3.0,2.0,1.0,1.0,2.0,0.0,...,1.0,1.0,5.0,53.0,1.0,4.0,1.0,3.0,5.0,1.0
1,5.0,110.0,15.0,0.0,3.0,2.0,2.0,0.0,0.0,0.0,...,1.0,1.0,5.0,64.0,2.0,3.0,1.0,2.0,2.0,1.0
2,5.0,110.0,6.0,0.0,3.0,2.0,1.0,0.0,0.0,0.0,...,1.0,1.0,5.0,58.0,1.0,4.0,1.0,3.0,2.0,1.0
3,5.0,110.0,30.0,0.0,3.0,2.0,1.0,0.0,0.0,0.0,...,1.0,1.0,5.0,76.0,2.0,4.0,1.0,1.0,0.0,2.0
4,5.0,110.0,13.0,0.0,3.0,2.0,3.0,0.0,0.0,0.0,...,1.0,0.0,5.0,82.0,2.0,3.0,1.0,2.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117550,55.0,1100.0,10.0,20.0,3.0,2.0,5.0,0.0,1.0,1.0,...,1.0,1.0,0.0,21.0,1.0,1.0,1.0,2.0,2.0,2.0
117551,55.0,1100.0,0.0,0.0,3.0,1.0,1.0,0.0,1.0,2.0,...,1.0,1.0,0.0,30.0,1.0,2.0,1.0,2.0,3.0,2.0
117552,55.0,1100.0,0.0,0.0,1.0,2.0,2.0,0.0,1.0,2.0,...,1.0,1.0,0.0,62.0,1.0,3.0,1.0,4.0,5.0,1.0
117553,55.0,1100.0,15.0,30.0,3.0,2.0,2.0,1.0,0.0,2.0,...,1.0,1.0,0.0,39.0,2.0,1.0,1.0,1.0,1.0,2.0


**Variables I will try to predict with my models:**
- USENOW3: Do you currently use chewing tobacco, snuff, or snus every day, some days, or not at all?
    - classification
    - 0 = Don't know, Not sure or Refused, 1 = every day, 2 = some days, 3 = not at all
- QLACTLM2: Are you limited in any way in any activities because of physical, mental, or emotional problems?
    - classification
    - 0 = Don't know, Not sure or Refused, 1 = yes, 2 = no
- _RFHLTH: Adults with good or better health vs. fair or poor health
    - classification
    - based off of GENHLTH
    - 0 = Don't know, Not sure or Refused, 1 = Good or Better Health, 2 = Fair or Poor Health
- _SMOKER3: Four-level smoker status: Everyday smoker, Someday smoker, Former smoker, Non-smoker
    - classification
    - based off of SMOKE100 & SMOKEDAY
    - 0 = Don't know, Not sure or Refused, 1 = Current smoker (now smokes every day), 2 = Current smoker (now smokes some days), 3 = Former smoker, 4 = Never smoked

**Will make first three y's binary**
- turning y's binary made it so that I couldn't stratify in train_test_split...so will not binarize my y's for now

**Will OneHotEncode/ dummify ordinal/nominal features**

**Will only use a sample of the data set for models so they can run faster**

**Will use SMOTE to compensensate for imbalanced classes**

**Will aggregate all ACEs into two groups: Abuse and Household Challenges**

In [7]:
# taking a small sample so that my models will run a little faster
brfss_total_sample = brfss_total.sample(frac=0.05, axis=0)

brfss_total_sample.shape

(5878, 32)

In [8]:
# creating X variables for ACE and all features
X_ace = brfss_total_sample[['ACEDEPRS', 'ACEDRINK', 'ACEDRUGS', 'ACEPRISN', 'ACEDIVRC', 'ACEPUNCH', 'ACEHURT', 'ACESWEAR', 
                       'ACETOUCH', 'ACETTHEM', 'ACEHVSEX']]

In [9]:
# creating the 4 y's
y_tobacco = brfss_total_sample['USENOW3']
y_activity = brfss_total_sample['QLACTLM2']
y_health = brfss_total_sample['_RFHLTH']
y_smoker = brfss_total_sample['_SMOKER3']

In [10]:
#original baseline for tobacco
y_tobacco.value_counts(normalize=True)

3.0    0.964954
1.0    0.017353
2.0    0.014801
0.0    0.002892
Name: USENOW3, dtype: float64

In [11]:
#original baseline for activity
y_activity.value_counts(normalize=True)

2.0    0.720823
1.0    0.272372
0.0    0.006805
Name: QLACTLM2, dtype: float64

In [12]:
#original baseline for health
y_health.value_counts(normalize=True)

1.0    0.810480
2.0    0.185777
0.0    0.003743
Name: _RFHLTH, dtype: float64

In [13]:
#original baseline for smoker
y_smoker.value_counts(normalize=True)

4.0    0.518374
3.0    0.316094
1.0    0.116366
2.0    0.043552
0.0    0.005614
Name: _SMOKER3, dtype: float64

In [14]:
# changing variables to strings since ACE questions are nominal and so I could dummify them
X_ace = X_ace.astype(str)

In [15]:
# dummifying X_ace
X_ace = pd.get_dummies(X_ace, drop_first=True)

In [16]:
X_ace.shape

(5878, 28)

In [17]:
# to compensate for unbalanced classes in my y's will use SMOTE

sm = SMOTE(random_state=151)
X_ace1, y_tobacco = sm.fit_resample(X_ace, y_tobacco)

sm2 = SMOTE(random_state=151)
X_ace2, y_activity = sm2.fit_resample(X_ace, y_activity)

sm3 = SMOTE(random_state=151)
X_ace3, y_health = sm3.fit_resample(X_ace, y_health)

sm4 = SMOTE(random_state=151)
X_ace4, y_smoker = sm4.fit_resample(X_ace, y_smoker)

In [18]:
# new baseline for tobacco
y_tobacco.value_counts(normalize=True)

0.0    0.25
1.0    0.25
2.0    0.25
3.0    0.25
Name: USENOW3, dtype: float64

In [19]:
# looks like SMOTE has increased the size of my y's more than 4x, so will probably take some time for models to run
y_tobacco.shape

(22688,)

In [20]:
# new baseline for activity
y_activity.value_counts(normalize=True)

0.0    0.333333
1.0    0.333333
2.0    0.333333
Name: QLACTLM2, dtype: float64

In [21]:
# new baseline for health
y_health.value_counts(normalize=True)

0.0    0.333333
2.0    0.333333
1.0    0.333333
Name: _RFHLTH, dtype: float64

In [22]:
# new baseline for smoker
y_smoker.value_counts(normalize=True)

0.0    0.2
2.0    0.2
3.0    0.2
1.0    0.2
4.0    0.2
Name: _SMOKER3, dtype: float64

In [23]:
X_ace1.shape

(22688, 28)

In [24]:
# creating training and testing sets for all y's (stratified on y, but since the classes are equal probably didn't have to)
X_train_ace, X_test_ace, y_train_tobacco, y_test_tobacco = train_test_split(X_ace1, y_tobacco, random_state = 151, stratify=y_tobacco)
X_train_ace2, X_test_ace2, y_train_activity, y_test_activity = train_test_split(X_ace2, y_activity, random_state = 151, stratify=y_activity)
X_train_ace3, X_test_ace3, y_train_health, y_test_health = train_test_split(X_ace3, y_health, random_state = 151, stratify=y_health)
X_train_ace4, X_test_ace4, y_train_smoker, y_test_smoker = train_test_split(X_ace4, y_smoker, random_state = 151, stratify=y_smoker)

### Pipeline and Gridsearch with just ACE as predictors (Logistic Regression)

In [25]:
pipe_ace_log = make_pipeline(SelectKBest(f_classif), StandardScaler(), LogisticRegression(max_iter=10_000))

params_ace_log = {'selectkbest__k': range(1, 28),
         'logisticregression__C': [0.01, 0.5, 1]}

gs_ace_log = GridSearchCV(pipe_ace_log, params_ace_log, n_jobs=-1, cv=5)

gs_ace_log.fit(X_train_ace, y_train_tobacco)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(max_iter=10000))]),
             n_jobs=-1,
             param_grid={'logisticregression__C': [0.01, 0.5, 1],
                         'selectkbest__k': range(1, 28)})

In [26]:
pipe2_ace_log = make_pipeline(SelectKBest(f_classif), StandardScaler(), LogisticRegression(max_iter=10_000))

gs2_ace_log = GridSearchCV(pipe2_ace_log, params_ace_log, n_jobs=-1, cv=5)

gs2_ace_log.fit(X_train_ace2, y_train_activity)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(max_iter=10000))]),
             n_jobs=-1,
             param_grid={'logisticregression__C': [0.01, 0.5, 1],
                         'selectkbest__k': range(1, 28)})

In [27]:
pipe3_ace_log = make_pipeline(SelectKBest(f_classif), StandardScaler(), LogisticRegression(max_iter=10_000))

gs3_ace_log = GridSearchCV(pipe3_ace_log, params_ace_log, n_jobs=-1, cv=5)

gs3_ace_log.fit(X_train_ace3, y_train_health)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(max_iter=10000))]),
             n_jobs=-1,
             param_grid={'logisticregression__C': [0.01, 0.5, 1],
                         'selectkbest__k': range(1, 28)})

In [28]:
pipe4_ace_log = make_pipeline(SelectKBest(f_classif), StandardScaler(), LogisticRegression(max_iter=10_000))

gs4_ace_log = GridSearchCV(pipe4_ace_log, params_ace_log, n_jobs=-1, cv=5)

gs4_ace_log.fit(X_train_ace4, y_train_smoker)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(max_iter=10000))]),
             n_jobs=-1,
             param_grid={'logisticregression__C': [0.01, 0.5, 1],
                         'selectkbest__k': range(1, 28)})

In [29]:
tobacco_ace_log_preds = gs_ace_log.predict(X_test_ace)
activity_ace_log_preds = gs2_ace_log.predict(X_test_ace2)
health_ace_log_preds = gs3_ace_log.predict(X_test_ace3)
smoker_ace_log_preds = gs4_ace_log.predict(X_test_ace4)

tobacco_ace_log_prec = precision_score(y_test_tobacco, tobacco_ace_log_preds, average='micro')
activity_ace_log_prec = precision_score(y_test_activity, activity_ace_log_preds, average='micro')
health_ace_log_prec = precision_score(y_test_health, health_ace_log_preds, average='micro')
smoker_ace_log_prec = precision_score(y_test_smoker, smoker_ace_log_preds, average='micro')

In [30]:
print(f' training accuracy for tobacco: {gs_ace_log.score(X_train_ace, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_ace_log.score(X_train_ace2, y_train_activity)}')
print(f' training accuracy for health: {gs3_ace_log.score(X_train_ace3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_ace_log.score(X_train_ace4, y_train_smoker)}')

 training accuracy for tobacco: 0.5274447578749413
 training accuracy for activity: 0.5115913143816218
 training accuracy for health: 0.5453866965201978
 training accuracy for smoker: 0.3950638893751094


In [31]:
print(f' testing accuracy for tobacco: {gs_ace_log.score(X_test_ace, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_ace_log.score(X_test_ace2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_ace_log.score(X_test_ace3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_ace_log.score(X_test_ace4, y_test_smoker)}')

 testing accuracy for tobacco: 0.5357898448519041
 testing accuracy for activity: 0.513530522341095
 testing accuracy for health: 0.5519171564511615
 testing accuracy for smoker: 0.39590443686006827


In [32]:
print(f'Precision for tobacco: {tobacco_ace_log_prec}')
print(f'Precision for activity: {activity_ace_log_prec}')
print(f'Precision for health: {health_ace_log_prec}')
print(f'Precision for smoker: {smoker_ace_log_prec}')

Precision for tobacco: 0.5357898448519041
Precision for activity: 0.513530522341095
Precision for health: 0.5519171564511615
Precision for smoker: 0.39590443686006827


In [33]:
print(gs_ace_log.best_params_)
print(gs2_ace_log.best_params_)
print(gs3_ace_log.best_params_)
print(gs4_ace_log.best_params_)

{'logisticregression__C': 1, 'selectkbest__k': 27}
{'logisticregression__C': 1, 'selectkbest__k': 27}
{'logisticregression__C': 1, 'selectkbest__k': 27}
{'logisticregression__C': 1, 'selectkbest__k': 26}


**Looks like Logistic Regression, even with hyperparamater tuning, didn't improve the scores from baseline. Interesting that predicting tobacco at an almost 97% accuracy and 97% precision only used one ACE question as a predictor. Will explore more.**

### Pipeline and Gridsearch with just ACE as predictors (Random Forest Classifier)

In [34]:
pipe_ace_rfc = make_pipeline(SelectKBest(f_classif), StandardScaler(), RandomForestClassifier())

params_ace_rfc = {'selectkbest__k': range(1, 28, 4),
                  'randomforestclassifier__n_estimators': [100, 300, 500],
                  'randomforestclassifier__max_depth': [None, 3, 5], }
                 #'randomforestclassifier__min_samples_split': [1, 3, 5],
                 #'randomforestclassifier__min_samples_leaf': [1, 3, 5]}

gs_ace_rfc = GridSearchCV(pipe_ace_rfc, params_ace_rfc, n_jobs=-1, cv=3)

gs_ace_rfc.fit(X_train_ace, y_train_tobacco)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'randomforestclassifier__max_depth': [None, 3, 5],
                         'randomforestclassifier__n_estimators': [100, 300,
                                                                  500],
                         'selectkbest__k': range(1, 28, 4)})

In [35]:
pipe2_ace_rfc = make_pipeline(SelectKBest(f_classif), StandardScaler(), RandomForestClassifier())

gs2_ace_rfc = GridSearchCV(pipe2_ace_rfc, params_ace_rfc, n_jobs=-1, cv=3)

gs2_ace_rfc.fit(X_train_ace2, y_train_activity)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'randomforestclassifier__max_depth': [None, 3, 5],
                         'randomforestclassifier__n_estimators': [100, 300,
                                                                  500],
                         'selectkbest__k': range(1, 28, 4)})

In [36]:
pipe3_ace_rfc = make_pipeline(SelectKBest(f_classif), StandardScaler(), RandomForestClassifier())

gs3_ace_rfc = GridSearchCV(pipe3_ace_rfc, params_ace_rfc, n_jobs=-1, cv=3)

gs3_ace_rfc.fit(X_train_ace3, y_train_health)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'randomforestclassifier__max_depth': [None, 3, 5],
                         'randomforestclassifier__n_estimators': [100, 300,
                                                                  500],
                         'selectkbest__k': range(1, 28, 4)})

In [37]:
pipe4_ace_rfc = make_pipeline(SelectKBest(f_classif), StandardScaler(), RandomForestClassifier())

gs4_ace_rfc = GridSearchCV(pipe4_ace_rfc, params_ace_rfc, n_jobs=-1, cv=3)

gs4_ace_rfc.fit(X_train_ace4, y_train_smoker)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'randomforestclassifier__max_depth': [None, 3, 5],
                         'randomforestclassifier__n_estimators': [100, 300,
                                                                  500],
                         'selectkbest__k': range(1, 28, 4)})

In [38]:
tobacco_ace_rfc_preds = gs_ace_rfc.predict(X_test_ace)
activity_ace_rfc_preds = gs2_ace_rfc.predict(X_test_ace2)
health_ace_rfc_preds = gs3_ace_rfc.predict(X_test_ace3)
smoker_ace_rfc_preds = gs4_ace_rfc.predict(X_test_ace4)

tobacco_ace_rfc_prec = precision_score(y_test_tobacco, tobacco_ace_rfc_preds, average='micro')
activity_ace_rfc_prec = precision_score(y_test_activity, activity_ace_rfc_preds, average='micro')
health_ace_rfc_prec = precision_score(y_test_health, health_ace_rfc_preds, average='micro')
smoker_ace_rfc_prec = precision_score(y_test_smoker, smoker_ace_rfc_preds, average='micro')

In [39]:
print(f' training accuracy for tobacco: {gs_ace_rfc.score(X_train_ace, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_ace_rfc.score(X_train_ace2, y_train_activity)}')
print(f' training accuracy for health: {gs3_ace_rfc.score(X_train_ace3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_ace_rfc.score(X_train_ace4, y_train_smoker)}')

 training accuracy for tobacco: 0.7689821344616832
 training accuracy for activity: 0.7262142032938215
 training accuracy for health: 0.6503405168392574
 training accuracy for smoker: 0.6041484333975145


In [40]:
print(f' testing accuracy for tobacco: {gs_ace_rfc.score(X_test_ace, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_ace_rfc.score(X_test_ace2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_ace_rfc.score(X_test_ace3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_ace_rfc.score(X_test_ace4, y_test_smoker)}')

 testing accuracy for tobacco: 0.7581100141043724
 testing accuracy for activity: 0.7026431718061674
 testing accuracy for health: 0.6336411978729359
 testing accuracy for smoker: 0.5452874770280913


In [41]:
print(f'Precision for tobacco: {tobacco_ace_rfc_prec}')
print(f'Precision for activity: {activity_ace_rfc_prec}')
print(f'Precision for health: {health_ace_rfc_prec}')
print(f'Precision for smoker: {smoker_ace_rfc_prec}')

Precision for tobacco: 0.7581100141043724
Precision for activity: 0.7026431718061674
Precision for health: 0.6336411978729359
Precision for smoker: 0.5452874770280913


In [42]:
print(gs_ace_rfc.best_params_)
print(gs2_ace_rfc.best_params_)
print(gs3_ace_rfc.best_params_)
print(gs4_ace_rfc.best_params_)

{'randomforestclassifier__max_depth': None, 'randomforestclassifier__n_estimators': 500, 'selectkbest__k': 25}
{'randomforestclassifier__max_depth': None, 'randomforestclassifier__n_estimators': 100, 'selectkbest__k': 25}
{'randomforestclassifier__max_depth': None, 'randomforestclassifier__n_estimators': 500, 'selectkbest__k': 25}
{'randomforestclassifier__max_depth': None, 'randomforestclassifier__n_estimators': 500, 'selectkbest__k': 25}


### Pipeline and Gridsearch with just ACE as predictors (Extra Trees Classifier)

In [43]:
pipe_ace_etc = make_pipeline(SelectKBest(f_classif), StandardScaler(), ExtraTreesClassifier())

params_ace_etc = {'selectkbest__k': range(1, 28, 4),
                  'extratreesclassifier__n_estimators': [100, 300, 500],
                  'extratreesclassifier__max_depth': [None, 3, 5], }
                 #'extratreesclassifier__min_samples_split': [1, 3, 5],
                 #'extratreesclassifier__min_samples_leaf': [1, 3, 5]}

gs_ace_etc = GridSearchCV(pipe_ace_etc, params_ace_etc, n_jobs=-1, cv=3)

gs_ace_etc.fit(X_train_ace, y_train_tobacco)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('extratreesclassifier',
                                        ExtraTreesClassifier())]),
             n_jobs=-1,
             param_grid={'extratreesclassifier__max_depth': [None, 3, 5],
                         'extratreesclassifier__n_estimators': [100, 300, 500],
                         'selectkbest__k': range(1, 28, 4)})

In [44]:
pipe2_ace_etc = make_pipeline(SelectKBest(f_classif), StandardScaler(), ExtraTreesClassifier())

gs2_ace_etc = GridSearchCV(pipe2_ace_etc, params_ace_etc, n_jobs=-1, cv=3)

gs2_ace_etc.fit(X_train_ace2, y_train_activity)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('extratreesclassifier',
                                        ExtraTreesClassifier())]),
             n_jobs=-1,
             param_grid={'extratreesclassifier__max_depth': [None, 3, 5],
                         'extratreesclassifier__n_estimators': [100, 300, 500],
                         'selectkbest__k': range(1, 28, 4)})

In [45]:
pipe3_ace_etc = make_pipeline(SelectKBest(f_classif), StandardScaler(), ExtraTreesClassifier())

gs3_ace_etc = GridSearchCV(pipe3_ace_etc, params_ace_etc, n_jobs=-1, cv=3)

gs3_ace_etc.fit(X_train_ace3, y_train_health)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('extratreesclassifier',
                                        ExtraTreesClassifier())]),
             n_jobs=-1,
             param_grid={'extratreesclassifier__max_depth': [None, 3, 5],
                         'extratreesclassifier__n_estimators': [100, 300, 500],
                         'selectkbest__k': range(1, 28, 4)})

In [46]:
pipe4_ace_etc = make_pipeline(SelectKBest(f_classif), StandardScaler(), ExtraTreesClassifier())

gs4_ace_etc = GridSearchCV(pipe4_ace_etc, params_ace_etc, n_jobs=-1, cv=3)

gs4_ace_etc.fit(X_train_ace4, y_train_smoker)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('extratreesclassifier',
                                        ExtraTreesClassifier())]),
             n_jobs=-1,
             param_grid={'extratreesclassifier__max_depth': [None, 3, 5],
                         'extratreesclassifier__n_estimators': [100, 300, 500],
                         'selectkbest__k': range(1, 28, 4)})

In [47]:
tobacco_ace_etc_preds = gs_ace_etc.predict(X_test_ace)
activity_ace_etc_preds = gs2_ace_etc.predict(X_test_ace2)
health_ace_etc_preds = gs3_ace_etc.predict(X_test_ace3)
smoker_ace_etc_preds = gs4_ace_etc.predict(X_test_ace4)

tobacco_ace_etc_prec = precision_score(y_test_tobacco, tobacco_ace_etc_preds, average='micro')
activity_ace_etc_prec = precision_score(y_test_activity, activity_ace_etc_preds, average='micro')
health_ace_etc_prec = precision_score(y_test_health, health_ace_etc_preds, average='micro')
smoker_ace_etc_prec = precision_score(y_test_smoker, smoker_ace_etc_preds, average='micro')

In [48]:
print(f' training accuracy for tobacco: {gs_ace_etc.score(X_train_ace, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_ace_etc.score(X_train_ace2, y_train_activity)}')
print(f' training accuracy for health: {gs3_ace_etc.score(X_train_ace3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_ace_etc.score(X_train_ace4, y_train_smoker)}')

 training accuracy for tobacco: 0.7689821344616832
 training accuracy for activity: 0.7262142032938215
 training accuracy for health: 0.6503405168392574
 training accuracy for smoker: 0.6041484333975145


In [49]:
print(f' testing accuracy for tobacco: {gs_ace_etc.score(X_test_ace, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_ace_etc.score(X_test_ace2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_ace_etc.score(X_test_ace3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_ace_etc.score(X_test_ace4, y_test_smoker)}')

 testing accuracy for tobacco: 0.7582863187588152
 testing accuracy for activity: 0.7007551919446192
 testing accuracy for health: 0.6322418136020151
 testing accuracy for smoker: 0.5410868994486742


In [50]:
print(f'Precision for tobacco: {tobacco_ace_etc_prec}')
print(f'Precision for activity: {activity_ace_etc_prec}')
print(f'Precision for health: {health_ace_etc_prec}')
print(f'Precision for smoker: {smoker_ace_etc_prec}')

Precision for tobacco: 0.7582863187588152
Precision for activity: 0.7007551919446192
Precision for health: 0.6322418136020151
Precision for smoker: 0.5410868994486742


In [51]:
print(gs_ace_etc.best_params_)
print(gs2_ace_etc.best_params_)
print(gs3_ace_etc.best_params_)
print(gs4_ace_etc.best_params_)

{'extratreesclassifier__max_depth': None, 'extratreesclassifier__n_estimators': 300, 'selectkbest__k': 25}
{'extratreesclassifier__max_depth': None, 'extratreesclassifier__n_estimators': 500, 'selectkbest__k': 25}
{'extratreesclassifier__max_depth': None, 'extratreesclassifier__n_estimators': 300, 'selectkbest__k': 25}
{'extratreesclassifier__max_depth': None, 'extratreesclassifier__n_estimators': 100, 'selectkbest__k': 25}


### Pipeline and Gridsearch with just ACE as predictors (Gradient Boost Classifier)

In [57]:
pipe_ace_gbc = make_pipeline(SelectKBest(f_classif), StandardScaler(), GradientBoostingClassifier())

params_ace_gbc = {'selectkbest__k': range(1, 28, 4),
                  'gradientboostingclassifier__loss': ['deviance', 'exponential'],
                  'gradientboostingclassifier__learning_rate': [0.01, 0.1, 0.5],
                  'gradientboostingclassifier__n_estimators': [100, 500],
                  'gradientboostingclassifier__max_depth': [3, 5]}

gs_ace_gbc = GridSearchCV(pipe_ace_gbc, params_ace_gbc, n_jobs=-1, cv=3)

gs_ace_gbc.fit(X_train_ace, y_train_tobacco)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('gradientboostingclassifier',
                                        GradientBoostingClassifier())]),
             n_jobs=-1,
             param_grid={'gradientboostingclassifier__learning_rate': [0.01,
                                                                       0.1,
                                                                       0.5],
                         'gradientboostingclassifier__loss': ['deviance',
                                                              'exponential'],
                         'gradientboostingclassifier__max_depth': [3, 5],
                         'gradientboostingclassifier__n_estimators': [100, 500],
                         'selectkbest__k': range(1, 28, 4)})

In [58]:
pipe2_ace_gbc = make_pipeline(SelectKBest(f_classif), StandardScaler(), GradientBoostingClassifier())

gs2_ace_gbc = GridSearchCV(pipe2_ace_gbc, params_ace_gbc, n_jobs=-1, cv=3)

gs2_ace_gbc.fit(X_train_ace2, y_train_activity)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('gradientboostingclassifier',
                                        GradientBoostingClassifier())]),
             n_jobs=-1,
             param_grid={'gradientboostingclassifier__learning_rate': [0.01,
                                                                       0.1,
                                                                       0.5],
                         'gradientboostingclassifier__loss': ['deviance',
                                                              'exponential'],
                         'gradientboostingclassifier__max_depth': [3, 5],
                         'gradientboostingclassifier__n_estimators': [100, 500],
                         'selectkbest__k': range(1, 28, 4)})

In [59]:
pipe3_ace_gbc = make_pipeline(SelectKBest(f_classif), StandardScaler(), GradientBoostingClassifier())

gs3_ace_gbc = GridSearchCV(pipe3_ace_gbc, params_ace_gbc, n_jobs=-1, cv=3)

gs3_ace_gbc.fit(X_train_ace3, y_train_health)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('gradientboostingclassifier',
                                        GradientBoostingClassifier())]),
             n_jobs=-1,
             param_grid={'gradientboostingclassifier__learning_rate': [0.01,
                                                                       0.1,
                                                                       0.5],
                         'gradientboostingclassifier__loss': ['deviance',
                                                              'exponential'],
                         'gradientboostingclassifier__max_depth': [3, 5],
                         'gradientboostingclassifier__n_estimators': [100, 500],
                         'selectkbest__k': range(1, 28, 4)})

In [60]:
pipe4_ace_gbc = make_pipeline(SelectKBest(f_classif), StandardScaler(), GradientBoostingClassifier())

gs4_ace_gbc = GridSearchCV(pipe4_ace_gbc, params_ace_gbc, n_jobs=-1, cv=3)

gs4_ace_gbc.fit(X_train_ace4, y_train_smoker)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('gradientboostingclassifier',
                                        GradientBoostingClassifier())]),
             n_jobs=-1,
             param_grid={'gradientboostingclassifier__learning_rate': [0.01,
                                                                       0.1,
                                                                       0.5],
                         'gradientboostingclassifier__loss': ['deviance',
                                                              'exponential'],
                         'gradientboostingclassifier__max_depth': [3, 5],
                         'gradientboostingclassifier__n_estimators': [100, 500],
                         'selectkbest__k': range(1, 28, 4)})

In [61]:
tobacco_ace_gbc_preds = gs_ace_gbc.predict(X_test_ace)
activity_ace_gbc_preds = gs2_ace_gbc.predict(X_test_ace2)
health_ace_gbc_preds = gs3_ace_gbc.predict(X_test_ace3)
smoker_ace_gbc_preds = gs4_ace_gbc.predict(X_test_ace4)

tobacco_ace_gbc_prec = precision_score(y_test_tobacco, tobacco_ace_gbc_preds, average='micro')
activity_ace_gbc_prec = precision_score(y_test_activity, activity_ace_gbc_preds, average='micro')
health_ace_gbc_prec = precision_score(y_test_health, health_ace_gbc_preds, average='micro')
smoker_ace_gbc_prec = precision_score(y_test_smoker, smoker_ace_gbc_preds, average='micro')

In [62]:
print(f' training accuracy for tobacco: {gs_ace_gbc.score(X_train_ace, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_ace_gbc.score(X_train_ace2, y_train_activity)}')
print(f' training accuracy for health: {gs3_ace_gbc.score(X_train_ace3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_ace_gbc.score(X_train_ace4, y_train_smoker)}')

 training accuracy for tobacco: 0.7689821344616832
 training accuracy for activity: 0.7240113290674499
 training accuracy for health: 0.6484746711446964
 training accuracy for smoker: 0.6041484333975145


In [63]:
print(f' testing accuracy for tobacco: {gs_ace_gbc.score(X_test_ace, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_ace_gbc.score(X_test_ace2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_ace_gbc.score(X_test_ace3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_ace_gbc.score(X_test_ace4, y_test_smoker)}')

 testing accuracy for tobacco: 0.7588152327221439
 testing accuracy for activity: 0.6954059156702328
 testing accuracy for health: 0.6280436607892528
 testing accuracy for smoker: 0.5410868994486742


In [64]:
print(f'Precision for tobacco: {tobacco_ace_gbc_prec}')
print(f'Precision for activity: {activity_ace_gbc_prec}')
print(f'Precision for health: {health_ace_gbc_prec}')
print(f'Precision for smoker: {smoker_ace_gbc_prec}')

Precision for tobacco: 0.7588152327221439
Precision for activity: 0.6954059156702328
Precision for health: 0.6280436607892528
Precision for smoker: 0.5410868994486742


In [65]:
print(gs_ace_gbc.best_params_)
print(gs2_ace_gbc.best_params_)
print(gs3_ace_gbc.best_params_)
print(gs4_ace_gbc.best_params_)

{'gradientboostingclassifier__learning_rate': 0.5, 'gradientboostingclassifier__loss': 'deviance', 'gradientboostingclassifier__max_depth': 5, 'gradientboostingclassifier__n_estimators': 100, 'selectkbest__k': 25}
{'gradientboostingclassifier__learning_rate': 0.5, 'gradientboostingclassifier__loss': 'deviance', 'gradientboostingclassifier__max_depth': 5, 'gradientboostingclassifier__n_estimators': 100, 'selectkbest__k': 25}
{'gradientboostingclassifier__learning_rate': 0.1, 'gradientboostingclassifier__loss': 'deviance', 'gradientboostingclassifier__max_depth': 5, 'gradientboostingclassifier__n_estimators': 500, 'selectkbest__k': 25}
{'gradientboostingclassifier__learning_rate': 0.5, 'gradientboostingclassifier__loss': 'deviance', 'gradientboostingclassifier__max_depth': 5, 'gradientboostingclassifier__n_estimators': 500, 'selectkbest__k': 25}


### Pipeline and Gridsearch with just ACE as predictors (Ada Boost Classifier)

In [66]:
pipe_ace_abc = make_pipeline(SelectKBest(f_classif), StandardScaler(), AdaBoostClassifier())

params_ace_abc = {'selectkbest__k': range(1, 28, 4),
                  'adaboostclassifier__learning_rate': [0.5, 1.0],
                  'adaboostclassifier__n_estimators': [10, 15, 20, 25], }

gs_ace_abc = GridSearchCV(pipe_ace_abc, params_ace_abc, n_jobs=-1, cv=3)

gs_ace_abc.fit(X_train_ace, y_train_tobacco)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('adaboostclassifier',
                                        AdaBoostClassifier())]),
             n_jobs=-1,
             param_grid={'adaboostclassifier__learning_rate': [0.5, 1.0],
                         'adaboostclassifier__n_estimators': [10, 15, 20, 25],
                         'selectkbest__k': range(1, 28, 4)})

In [68]:
pipe2_ace_abc = make_pipeline(SelectKBest(f_classif), StandardScaler(), AdaBoostClassifier())

gs2_ace_abc = GridSearchCV(pipe2_ace_abc, params_ace_abc, n_jobs=-1, cv=3)

gs2_ace_abc.fit(X_train_ace2, y_train_activity)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('adaboostclassifier',
                                        AdaBoostClassifier())]),
             n_jobs=-1,
             param_grid={'adaboostclassifier__learning_rate': [0.5, 1.0],
                         'adaboostclassifier__n_estimators': [10, 15, 20, 25],
                         'selectkbest__k': range(1, 28, 4)})

In [69]:
pipe3_ace_abc = make_pipeline(SelectKBest(f_classif), StandardScaler(), AdaBoostClassifier())

gs3_ace_abc = GridSearchCV(pipe3_ace_abc, params_ace_abc, n_jobs=-1, cv=3)

gs3_ace_abc.fit(X_train_ace3, y_train_health)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('adaboostclassifier',
                                        AdaBoostClassifier())]),
             n_jobs=-1,
             param_grid={'adaboostclassifier__learning_rate': [0.5, 1.0],
                         'adaboostclassifier__n_estimators': [10, 15, 20, 25],
                         'selectkbest__k': range(1, 28, 4)})

In [70]:
pipe4_ace_abc = make_pipeline(SelectKBest(f_classif), StandardScaler(), AdaBoostClassifier())

gs4_ace_abc = GridSearchCV(pipe4_ace_abc, params_ace_abc, n_jobs=-1, cv=3)

gs4_ace_abc.fit(X_train_ace4, y_train_smoker)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('adaboostclassifier',
                                        AdaBoostClassifier())]),
             n_jobs=-1,
             param_grid={'adaboostclassifier__learning_rate': [0.5, 1.0],
                         'adaboostclassifier__n_estimators': [10, 15, 20, 25],
                         'selectkbest__k': range(1, 28, 4)})

In [71]:
tobacco_ace_abc_preds = gs_ace_abc.predict(X_test_ace)
activity_ace_abc_preds = gs2_ace_abc.predict(X_test_ace2)
health_ace_abc_preds = gs3_ace_abc.predict(X_test_ace3)
smoker_ace_abc_preds = gs4_ace_abc.predict(X_test_ace4)

tobacco_ace_abc_prec = precision_score(y_test_tobacco, tobacco_ace_abc_preds, average='micro')
activity_ace_abc_prec = precision_score(y_test_activity, activity_ace_abc_preds, average='micro')
health_ace_abc_prec = precision_score(y_test_health, health_ace_abc_preds, average='micro')
smoker_ace_abc_prec = precision_score(y_test_smoker, smoker_ace_abc_preds, average='micro')

In [72]:
print(f' training accuracy for tobacco: {gs_ace_abc.score(X_train_ace, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_ace_abc.score(X_train_ace2, y_train_activity)}')
print(f' training accuracy for health: {gs3_ace_abc.score(X_train_ace3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_ace_abc.score(X_train_ace4, y_train_smoker)}')

 training accuracy for tobacco: 0.5980253878702397
 training accuracy for activity: 0.6180635686562467
 training accuracy for health: 0.4761638212519825
 training accuracy for smoker: 0.376597234377735


In [73]:
print(f' testing accuracy for tobacco: {gs_ace_abc.score(X_test_ace, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_ace_abc.score(X_test_ace2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_ace_abc.score(X_test_ace3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_ace_abc.score(X_test_ace4, y_test_smoker)}')

 testing accuracy for tobacco: 0.6020803949224259
 testing accuracy for activity: 0.6340465701699182
 testing accuracy for health: 0.49006437167646233
 testing accuracy for smoker: 0.3880283538986611


In [74]:
print(f'Precision for tobacco: {tobacco_ace_abc_prec}')
print(f'Precision for activity: {activity_ace_abc_prec}')
print(f'Precision for health: {health_ace_abc_prec}')
print(f'Precision for smoker: {smoker_ace_abc_prec}')

Precision for tobacco: 0.6020803949224259
Precision for activity: 0.6340465701699182
Precision for health: 0.49006437167646233
Precision for smoker: 0.3880283538986611


In [75]:
print(gs_ace_abc.best_params_)
print(gs2_ace_abc.best_params_)
print(gs3_ace_abc.best_params_)
print(gs4_ace_abc.best_params_)

{'adaboostclassifier__learning_rate': 1.0, 'adaboostclassifier__n_estimators': 25, 'selectkbest__k': 21}
{'adaboostclassifier__learning_rate': 1.0, 'adaboostclassifier__n_estimators': 20, 'selectkbest__k': 25}
{'adaboostclassifier__learning_rate': 1.0, 'adaboostclassifier__n_estimators': 10, 'selectkbest__k': 9}
{'adaboostclassifier__learning_rate': 1.0, 'adaboostclassifier__n_estimators': 25, 'selectkbest__k': 25}


### Pipeline and Gridsearch with just ACE as predictors (XG Boost Classifier)

In [79]:
pipe_ace_xgb = make_pipeline(SelectKBest(f_classif), StandardScaler(), xgb.XGBClassifier())

params_ace_xgb = {'selectkbest__k': range(1, 28, 4),
                  'xgbclassifier__learning_rate': [0.5, 1.0],
                  'xgbclassifier__n_estimators': [10, 15, 20, 25],
                 'xgbclassifier__max_depth': [3, 5]}

gs_ace_xgb = GridSearchCV(pipe_ace_xgb, params_ace_xgb, n_jobs=-1, cv=3)

gs_ace_xgb.fit(X_train_ace, y_train_tobacco)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None...
             

In [80]:
pipe2_ace_xgb = make_pipeline(SelectKBest(f_classif), StandardScaler(), xgb.XGBClassifier())

gs2_ace_xgb = GridSearchCV(pipe2_ace_xgb, params_ace_xgb, n_jobs=-1, cv=3)

gs2_ace_xgb.fit(X_train_ace2, y_train_activity)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None...
             

In [81]:
pipe3_ace_xgb = make_pipeline(SelectKBest(f_classif), StandardScaler(), xgb.XGBClassifier())

gs3_ace_xgb = GridSearchCV(pipe3_ace_xgb, params_ace_xgb, n_jobs=-1, cv=3)

gs3_ace_xgb.fit(X_train_ace3, y_train_health)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None...
             

In [82]:
pipe4_ace_xgb = make_pipeline(SelectKBest(f_classif), StandardScaler(), xgb.XGBClassifier())

gs4_ace_xgb = GridSearchCV(pipe4_ace_xgb, params_ace_xgb, n_jobs=-1, cv=3)

gs4_ace_xgb.fit(X_train_ace4, y_train_smoker)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None...
             

In [85]:
tobacco_ace_xgb_preds = gs_ace_xgb.predict(X_test_ace)
activity_ace_xgb_preds = gs2_ace_xgb.predict(X_test_ace2)
health_ace_xgb_preds = gs3_ace_xgb.predict(X_test_ace3)
smoker_ace_xgb_preds = gs4_ace_xgb.predict(X_test_ace4)

tobacco_ace_xgb_prec = precision_score(y_test_tobacco, tobacco_ace_xgb_preds, average='micro')
activity_ace_xgb_prec = precision_score(y_test_activity, activity_ace_xgb_preds, average='micro')
health_ace_xgb_prec = precision_score(y_test_health, health_ace_xgb_preds, average='micro')
smoker_ace_xgb_prec = precision_score(y_test_smoker, smoker_ace_xgb_preds, average='micro')

In [86]:
print(f' training accuracy for tobacco: {gs_ace_xgb.score(X_train_ace, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_ace_xgb.score(X_train_ace2, y_train_activity)}')
print(f' training accuracy for health: {gs3_ace_xgb.score(X_train_ace3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_ace_xgb.score(X_train_ace4, y_train_smoker)}')

 training accuracy for tobacco: 0.7594029149036201
 training accuracy for activity: 0.6930661911255638
 training accuracy for health: 0.6282302453587089
 training accuracy for smoker: 0.5292315771048486


In [87]:
print(f' testing accuracy for tobacco: {gs_ace_xgb.score(X_test_ace, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_ace_xgb.score(X_test_ace2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_ace_xgb.score(X_test_ace3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_ace_xgb.score(X_test_ace4, y_test_smoker)}')

 testing accuracy for tobacco: 0.7593441466854725
 testing accuracy for activity: 0.6869100062932662
 testing accuracy for health: 0.6238455079764903
 testing accuracy for smoker: 0.5161459700708847


In [88]:
print(f'Precision for tobacco: {tobacco_ace_xgb_prec}')
print(f'Precision for activity: {activity_ace_xgb_prec}')
print(f'Precision for health: {health_ace_xgb_prec}')
print(f'Precision for smoker: {smoker_ace_xgb_prec}')

Precision for tobacco: 0.7593441466854725
Precision for activity: 0.6869100062932662
Precision for health: 0.6238455079764903
Precision for smoker: 0.5161459700708847


In [89]:
print(gs_ace_xgb.best_params_)
print(gs2_ace_xgb.best_params_)
print(gs3_ace_xgb.best_params_)
print(gs4_ace_xgb.best_params_)

{'selectkbest__k': 25, 'xgbclassifier__learning_rate': 1.0, 'xgbclassifier__max_depth': 5, 'xgbclassifier__n_estimators': 25}
{'selectkbest__k': 25, 'xgbclassifier__learning_rate': 1.0, 'xgbclassifier__max_depth': 5, 'xgbclassifier__n_estimators': 25}
{'selectkbest__k': 25, 'xgbclassifier__learning_rate': 1.0, 'xgbclassifier__max_depth': 5, 'xgbclassifier__n_estimators': 25}
{'selectkbest__k': 25, 'xgbclassifier__learning_rate': 0.5, 'xgbclassifier__max_depth': 5, 'xgbclassifier__n_estimators': 25}
