In [91]:
#!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.7.0-py3-none-any.whl (167 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.7.0 imblearn-0.0


In [1]:
#!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.2.1-py3-none-win_amd64.whl (86.5 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.2.1


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, precision_score
#from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier

from imblearn.over_sampling import SMOTE

import xgboost as xgb

In [2]:
# reading in full data set
brfss_total = pd.read_pickle('./Pickled_Data/brfss_total.pkl')

In [3]:
brfss_total.head()

Unnamed: 0,_STATE,DISPCODE,PHYSHLTH,MENTHLTH,USENOW3,HISPANC2,MARITAL,CHILDREN,EMPLOY,RENTHOM1,...,ACETTHEM,ACEHVSEX,MSCODE,_IMPAGE,_RFHLTH,_SMOKER3,_PRACE,_EDUCAG,_INCOMG,_TOTINDA
14697,5.0,110.0,0.0,0.0,3.0,2.0,1.0,1.0,2.0,0.0,...,1.0,1.0,5.0,53.0,1.0,4.0,1.0,3.0,5.0,1.0
14699,5.0,110.0,15.0,0.0,3.0,2.0,2.0,0.0,0.0,0.0,...,1.0,1.0,5.0,64.0,2.0,3.0,1.0,2.0,2.0,1.0
14700,5.0,110.0,6.0,0.0,3.0,2.0,1.0,0.0,0.0,0.0,...,1.0,1.0,5.0,58.0,1.0,4.0,1.0,3.0,2.0,1.0
14701,5.0,110.0,30.0,0.0,3.0,2.0,1.0,0.0,0.0,0.0,...,1.0,1.0,5.0,76.0,2.0,4.0,1.0,1.0,0.0,2.0
14704,5.0,110.0,13.0,0.0,3.0,2.0,3.0,0.0,0.0,0.0,...,1.0,0.0,5.0,82.0,2.0,3.0,1.0,2.0,0.0,2.0


In [4]:
# just making sure no nulls are present
brfss_total.isna().sum()

_STATE      0
DISPCODE    0
PHYSHLTH    0
MENTHLTH    0
USENOW3     0
HISPANC2    0
MARITAL     0
CHILDREN    0
EMPLOY      0
RENTHOM1    0
SEX         0
QLACTLM2    0
ACEDEPRS    0
ACEDRINK    0
ACEDRUGS    0
ACEPRISN    0
ACEDIVRC    0
ACEPUNCH    0
ACEHURT     0
ACESWEAR    0
ACETOUCH    0
ACETTHEM    0
ACEHVSEX    0
MSCODE      0
_IMPAGE     0
_RFHLTH     0
_SMOKER3    0
_PRACE      0
_EDUCAG     0
_INCOMG     0
_TOTINDA    0
dtype: int64

In [5]:
# resetting the index so it starts at 0
brfss_total = brfss_total.reset_index()
brfss_total.drop(columns='index')

Unnamed: 0,_STATE,DISPCODE,PHYSHLTH,MENTHLTH,USENOW3,HISPANC2,MARITAL,CHILDREN,EMPLOY,RENTHOM1,...,ACETTHEM,ACEHVSEX,MSCODE,_IMPAGE,_RFHLTH,_SMOKER3,_PRACE,_EDUCAG,_INCOMG,_TOTINDA
0,5.0,110.0,0.0,0.0,3.0,2.0,1.0,1.0,2.0,0.0,...,1.0,1.0,5.0,53.0,1.0,4.0,1.0,3.0,5.0,1.0
1,5.0,110.0,15.0,0.0,3.0,2.0,2.0,0.0,0.0,0.0,...,1.0,1.0,5.0,64.0,2.0,3.0,1.0,2.0,2.0,1.0
2,5.0,110.0,6.0,0.0,3.0,2.0,1.0,0.0,0.0,0.0,...,1.0,1.0,5.0,58.0,1.0,4.0,1.0,3.0,2.0,1.0
3,5.0,110.0,30.0,0.0,3.0,2.0,1.0,0.0,0.0,0.0,...,1.0,1.0,5.0,76.0,2.0,4.0,1.0,1.0,0.0,2.0
4,5.0,110.0,13.0,0.0,3.0,2.0,3.0,0.0,0.0,0.0,...,1.0,0.0,5.0,82.0,2.0,3.0,1.0,2.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117550,55.0,1100.0,10.0,20.0,3.0,2.0,5.0,0.0,1.0,1.0,...,1.0,1.0,0.0,21.0,1.0,1.0,1.0,2.0,2.0,2.0
117551,55.0,1100.0,0.0,0.0,3.0,1.0,1.0,0.0,1.0,2.0,...,1.0,1.0,0.0,30.0,1.0,2.0,1.0,2.0,3.0,2.0
117552,55.0,1100.0,0.0,0.0,1.0,2.0,2.0,0.0,1.0,2.0,...,1.0,1.0,0.0,62.0,1.0,3.0,1.0,4.0,5.0,1.0
117553,55.0,1100.0,15.0,30.0,3.0,2.0,2.0,1.0,0.0,2.0,...,1.0,1.0,0.0,39.0,2.0,1.0,1.0,1.0,1.0,2.0


**Variables I will try to predict with my models:**
- USENOW3: Do you currently use chewing tobacco, snuff, or snus every day, some days, or not at all?
    - classification
    - 0 = Don't know, Not sure or Refused, 1 = every day, 2 = some days, 3 = not at all
- QLACTLM2: Are you limited in any way in any activities because of physical, mental, or emotional problems?
    - classification
    - 0 = Don't know, Not sure or Refused, 1 = yes, 2 = no
- _RFHLTH: Adults with good or better health vs. fair or poor health
    - classification
    - based off of GENHLTH
    - 0 = Don't know, Not sure or Refused, 1 = Good or Better Health, 2 = Fair or Poor Health
- _SMOKER3: Four-level smoker status: Everyday smoker, Someday smoker, Former smoker, Non-smoker
    - classification
    - based off of SMOKE100 & SMOKEDAY
    - 0 = Don't know, Not sure or Refused, 1 = Current smoker (now smokes every day), 2 = Current smoker (now smokes some days), 3 = Former smoker, 4 = Never smoked

**Will make first three y's binary**
- turning y's binary made it so that I couldn't stratify in train_test_split...so will not binarize my y's for now

**Will OneHotEncode/ dummify ordinal/nominal features**

**Will only use a sample of the data set for models so they can run faster**

**Will use SMOTE to compensensate for imbalanced classes**

**Will aggregate all ACEs into two groups: Abuse and Household Challenges**

In [6]:
# taking a small sample so that my models will run a little faster
brfss_total_sample = brfss_total.sample(frac=0.05, axis=0)

brfss_total_sample.shape

(5878, 32)

In [24]:
# creating X variable with all features
X_all = brfss_total_sample.drop(columns=['USENOW3', 'QLACTLM2', '_RFHLTH', '_SMOKER3'])

In [8]:
# creating the 4 y's
y_tobacco = brfss_total_sample['USENOW3']
y_activity = brfss_total_sample['QLACTLM2']
y_health = brfss_total_sample['_RFHLTH']
y_smoker = brfss_total_sample['_SMOKER3']

In [9]:
#original baseline for tobacco
y_tobacco.value_counts(normalize=True)

3.0    0.968357
2.0    0.014801
1.0    0.014291
0.0    0.002552
Name: USENOW3, dtype: float64

In [10]:
#original baseline for activity
y_activity.value_counts(normalize=True)

2.0    0.725587
1.0    0.267438
0.0    0.006975
Name: QLACTLM2, dtype: float64

In [11]:
#original baseline for health
y_health.value_counts(normalize=True)

1.0    0.813882
2.0    0.181354
0.0    0.004764
Name: _RFHLTH, dtype: float64

In [12]:
#original baseline for smoker
y_smoker.value_counts(normalize=True)

4.0    0.520075
3.0    0.306397
1.0    0.119599
2.0    0.048146
0.0    0.005784
Name: _SMOKER3, dtype: float64

In [25]:
X_num = X_all[['PHYSHLTH', 'MENTHLTH', 'CHILDREN']]
X_cat = X_all[['_STATE', 'DISPCODE', 'PHYSHLTH', 'MENTHLTH', 'HISPANC2', 'MARITAL' , 'CHILDREN',
               'EMPLOY', 'RENTHOM1', 'SEX', 'ACEDEPRS', 'ACEDRINK', 'ACEDRUGS', 'ACEPRISN',
               'ACEDIVRC', 'ACEPUNCH', 'ACEHURT', 'ACESWEAR', 'ACETOUCH', 'ACETTHEM', 'ACEHVSEX', 
               'MSCODE', '_IMPAGE', '_PRACE', '_EDUCAG', '_INCOMG','_TOTINDA']]

In [26]:
X_num

Unnamed: 0,PHYSHLTH,MENTHLTH,CHILDREN
55751,30.0,0.0,0.0
79433,0.0,0.0,2.0
102899,1.0,3.0,0.0
79996,0.0,0.0,0.0
53082,1.0,30.0,0.0
...,...,...,...
49090,0.0,0.0,3.0
5615,0.0,14.0,1.0
103816,0.0,15.0,0.0
89740,0.0,0.0,0.0


In [29]:
X_cat = X_cat.astype(str)

In [30]:
# dummifying nominal variables for X_all
X_dummies = pd.get_dummies(X_cat, drop_first=True)
X_dummies.head()

Unnamed: 0,_STATE_15.0,_STATE_19.0,_STATE_22.0,_STATE_27.0,_STATE_30.0,_STATE_32.0,_STATE_37.0,_STATE_40.0,_STATE_47.0,_STATE_5.0,...,_EDUCAG_2.0,_EDUCAG_3.0,_EDUCAG_4.0,_INCOMG_1.0,_INCOMG_2.0,_INCOMG_3.0,_INCOMG_4.0,_INCOMG_5.0,_TOTINDA_1.0,_TOTINDA_2.0
55751,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,1,0,0,1,0
79433,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
102899,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,1,0
79996,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,1,0
53082,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,1


In [35]:
X_all = X_num.merge(X_dummies, left_index=True, right_index=True)

In [36]:
X_all.shape

(5878, 217)

In [37]:
# to compensate for unbalanced classes in my y's will use SMOTE

sm = SMOTE(random_state=151)
X_all1, y_tobacco = sm.fit_resample(X_all, y_tobacco)

sm2 = SMOTE(random_state=151)
X_all2, y_activity = sm2.fit_resample(X_all, y_activity)

sm3 = SMOTE(random_state=151)
X_all3, y_health = sm3.fit_resample(X_all, y_health)

sm4 = SMOTE(random_state=151)
X_all4, y_smoker = sm4.fit_resample(X_all, y_smoker)

In [38]:
# new baseline for tobacco
y_tobacco.value_counts(normalize=True)

0.0    0.25
1.0    0.25
2.0    0.25
3.0    0.25
Name: USENOW3, dtype: float64

In [39]:
# looks like SMOTE has increased the size of my y's more than 4x, so will probably take some time for models to run
y_tobacco.shape

(22768,)

In [40]:
# new baseline for activity
y_activity.value_counts(normalize=True)

0.0    0.333333
2.0    0.333333
1.0    0.333333
Name: QLACTLM2, dtype: float64

In [41]:
# new baseline for health
y_health.value_counts(normalize=True)

0.0    0.333333
1.0    0.333333
2.0    0.333333
Name: _RFHLTH, dtype: float64

In [42]:
# new baseline for smoker
y_smoker.value_counts(normalize=True)

0.0    0.2
1.0    0.2
2.0    0.2
4.0    0.2
3.0    0.2
Name: _SMOKER3, dtype: float64

In [43]:
X_all1.shape

(22768, 217)

In [44]:
# creating training and testing sets for all y's (stratified on y, but since the classes are equal probably didn't have to)
X_train_all, X_test_all, y_train_tobacco, y_test_tobacco = train_test_split(X_all1, y_tobacco, random_state = 151, stratify=y_tobacco)
X_train_all2, X_test_all2, y_train_activity, y_test_activity = train_test_split(X_all2, y_activity, random_state = 151, stratify=y_activity)
X_train_all3, X_test_all3, y_train_health, y_test_health = train_test_split(X_all3, y_health, random_state = 151, stratify=y_health)
X_train_all4, X_test_all4, y_train_smoker, y_test_smoker = train_test_split(X_all4, y_smoker, random_state = 151, stratify=y_smoker)

### Pipeline and Gridsearch with just all features predictors (Logistic Regression)

In [45]:
pipe_all_log = make_pipeline(SelectKBest(f_classif), StandardScaler(), LogisticRegression(max_iter=10_000))

params_all_log = {'selectkbest__k': range(1, 217, 20),
         'logisticregression__C': [0.01, 0.5, 1]}

gs_all_log = GridSearchCV(pipe_all_log, params_all_log, n_jobs=-1, cv=3)

gs_all_log.fit(X_train_all, y_train_tobacco)

  f = msb / msw


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(max_iter=10000))]),
             n_jobs=-1,
             param_grid={'logisticregression__C': [0.01, 0.5, 1],
                         'selectkbest__k': range(1, 217, 20)})

In [46]:
pipe2_all_log = make_pipeline(SelectKBest(f_classif), StandardScaler(), LogisticRegression(max_iter=10_000))

gs2_all_log = GridSearchCV(pipe2_all_log, params_all_log, n_jobs=-1, cv=3)

gs2_all_log.fit(X_train_all2, y_train_activity)

  f = msb / msw


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(max_iter=10000))]),
             n_jobs=-1,
             param_grid={'logisticregression__C': [0.01, 0.5, 1],
                         'selectkbest__k': range(1, 217, 20)})

In [47]:
pipe3_all_log = make_pipeline(SelectKBest(f_classif), StandardScaler(), LogisticRegression(max_iter=10_000))

gs3_all_log = GridSearchCV(pipe3_all_log, params_all_log, n_jobs=-1, cv=3)

gs3_all_log.fit(X_train_all3, y_train_health)

  f = msb / msw


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(max_iter=10000))]),
             n_jobs=-1,
             param_grid={'logisticregression__C': [0.01, 0.5, 1],
                         'selectkbest__k': range(1, 217, 20)})

In [48]:
pipe4_all_log = make_pipeline(SelectKBest(f_classif), StandardScaler(), LogisticRegression(max_iter=10_000))

gs4_all_log = GridSearchCV(pipe4_all_log, params_all_log, n_jobs=-1, cv=3)

gs4_all_log.fit(X_train_all4, y_train_smoker)

  f = msb / msw


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(max_iter=10000))]),
             n_jobs=-1,
             param_grid={'logisticregression__C': [0.01, 0.5, 1],
                         'selectkbest__k': range(1, 217, 20)})

In [49]:
tobacco_all_log_preds = gs_all_log.predict(X_test_all)
activity_all_log_preds = gs2_all_log.predict(X_test_all2)
health_all_log_preds = gs3_all_log.predict(X_test_all3)
smoker_all_log_preds = gs4_all_log.predict(X_test_all4)

tobacco_all_log_prec = precision_score(y_test_tobacco, tobacco_all_log_preds, average='micro')
activity_all_log_prec = precision_score(y_test_activity, activity_all_log_preds, average='micro')
health_all_log_prec = precision_score(y_test_health, health_all_log_preds, average='micro')
smoker_all_log_prec = precision_score(y_test_smoker, smoker_all_log_preds, average='micro')

In [50]:
print(f' training accuracy for tobacco: {gs_all_log.score(X_train_all, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_all_log.score(X_train_all2, y_train_activity)}')
print(f' training accuracy for health: {gs3_all_log.score(X_train_all3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_all_log.score(X_train_all4, y_train_smoker)}')

 training accuracy for tobacco: 0.9206488639025533
 training accuracy for activity: 0.8851604835348061
 training accuracy for health: 0.9265143069490895
 training accuracy for smoker: 0.659077030445782


In [51]:
print(f' testing accuracy for tobacco: {gs_all_log.score(X_test_all, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_all_log.score(X_test_all2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_all_log.score(X_test_all3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_all_log.score(X_test_all4, y_test_smoker)}')

 testing accuracy for tobacco: 0.9128601546029516
 testing accuracy for activity: 0.8849640512660206
 testing accuracy for health: 0.9147157190635451
 testing accuracy for smoker: 0.6240188383045526


In [52]:
print(f'Precision for tobacco: {tobacco_all_log_prec}')
print(f'Precision for activity: {activity_all_log_prec}')
print(f'Precision for health: {health_all_log_prec}')
print(f'Precision for smoker: {smoker_all_log_prec}')

Precision for tobacco: 0.9128601546029516
Precision for activity: 0.8849640512660206
Precision for health: 0.9147157190635451
Precision for smoker: 0.6240188383045526


In [53]:
print(gs_all_log.best_params_)
print(gs2_all_log.best_params_)
print(gs3_all_log.best_params_)
print(gs4_all_log.best_params_)

{'logisticregression__C': 1, 'selectkbest__k': 181}
{'logisticregression__C': 1, 'selectkbest__k': 201}
{'logisticregression__C': 1, 'selectkbest__k': 201}
{'logisticregression__C': 0.5, 'selectkbest__k': 201}


**observations**

### Pipeline and Gridsearch with just all features as predictors (Random Forest Classifier)

In [54]:
pipe_all_rfc = make_pipeline(SelectKBest(f_classif), StandardScaler(), RandomForestClassifier())

params_all_rfc = {'selectkbest__k': range(1, 217, 20),
                  'randomforestclassifier__n_estimators': [100, 300, 500],
                  'randomforestclassifier__max_depth': [None, 3, 5], }
                 #'randomforestclassifier__min_samples_split': [1, 3, 5],
                 #'randomforestclassifier__min_samples_leaf': [1, 3, 5]}

gs_all_rfc = GridSearchCV(pipe_all_rfc, params_all_rfc, n_jobs=-1, cv=3)

gs_all_rfc.fit(X_train_all, y_train_tobacco)

  f = msb / msw


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'randomforestclassifier__max_depth': [None, 3, 5],
                         'randomforestclassifier__n_estimators': [100, 300,
                                                                  500],
                         'selectkbest__k': range(1, 217, 20)})

In [55]:
pipe2_all_rfc = make_pipeline(SelectKBest(f_classif), StandardScaler(), RandomForestClassifier())

gs2_all_rfc = GridSearchCV(pipe2_all_rfc, params_all_rfc, n_jobs=-1, cv=3)

gs2_all_rfc.fit(X_train_all2, y_train_activity)

  f = msb / msw


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'randomforestclassifier__max_depth': [None, 3, 5],
                         'randomforestclassifier__n_estimators': [100, 300,
                                                                  500],
                         'selectkbest__k': range(1, 217, 20)})

In [56]:
pipe3_all_rfc = make_pipeline(SelectKBest(f_classif), StandardScaler(), RandomForestClassifier())

gs3_all_rfc = GridSearchCV(pipe3_all_rfc, params_all_rfc, n_jobs=-1, cv=3)

gs3_all_rfc.fit(X_train_all3, y_train_health)

  f = msb / msw


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'randomforestclassifier__max_depth': [None, 3, 5],
                         'randomforestclassifier__n_estimators': [100, 300,
                                                                  500],
                         'selectkbest__k': range(1, 217, 20)})

In [57]:
pipe4_all_rfc = make_pipeline(SelectKBest(f_classif), StandardScaler(), RandomForestClassifier())

gs4_all_rfc = GridSearchCV(pipe4_all_rfc, params_all_rfc, n_jobs=-1, cv=3)

gs4_all_rfc.fit(X_train_all4, y_train_smoker)

  f = msb / msw


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'randomforestclassifier__max_depth': [None, 3, 5],
                         'randomforestclassifier__n_estimators': [100, 300,
                                                                  500],
                         'selectkbest__k': range(1, 217, 20)})

In [58]:
tobacco_all_rfc_preds = gs_all_rfc.predict(X_test_all)
activity_all_rfc_preds = gs2_all_rfc.predict(X_test_all2)
health_all_rfc_preds = gs3_all_rfc.predict(X_test_all3)
smoker_all_rfc_preds = gs4_all_rfc.predict(X_test_all4)

tobacco_all_rfc_prec = precision_score(y_test_tobacco, tobacco_all_rfc_preds, average='micro')
activity_all_rfc_prec = precision_score(y_test_activity, activity_all_rfc_preds, average='micro')
health_all_rfc_prec = precision_score(y_test_health, health_all_rfc_preds, average='micro')
smoker_all_rfc_prec = precision_score(y_test_smoker, smoker_all_rfc_preds, average='micro')

In [59]:
print(f' training accuracy for tobacco: {gs_all_rfc.score(X_train_all, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_all_rfc.score(X_train_all2, y_train_activity)}')
print(f' training accuracy for health: {gs3_all_rfc.score(X_train_all3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_all_rfc.score(X_train_all4, y_train_smoker)}')

 training accuracy for tobacco: 1.0
 training accuracy for activity: 1.0
 training accuracy for health: 0.999721293199554
 training accuracy for smoker: 0.9992148652185292


In [60]:
print(f' testing accuracy for tobacco: {gs_all_rfc.score(X_test_all, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_all_rfc.score(X_test_all2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_all_rfc.score(X_test_all3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_all_rfc.score(X_test_all4, y_test_smoker)}')

 testing accuracy for tobacco: 0.9947294448348559
 testing accuracy for activity: 0.9105970615817442
 testing accuracy for health: 0.9467670011148273
 testing accuracy for smoker: 0.8113553113553114


In [61]:
print(f'Precision for tobacco: {tobacco_all_rfc_prec}')
print(f'Precision for activity: {activity_all_rfc_prec}')
print(f'Precision for health: {health_all_rfc_prec}')
print(f'Precision for smoker: {smoker_all_rfc_prec}')

Precision for tobacco: 0.9947294448348559
Precision for activity: 0.9105970615817442
Precision for health: 0.9467670011148273
Precision for smoker: 0.8113553113553114


In [42]:
print(gs_all_rfc.best_params_)
print(gs2_all_rfc.best_params_)
print(gs3_all_rfc.best_params_)
print(gs4_all_rfc.best_params_)

{'randomforestclassifier__max_depth': None, 'randomforestclassifier__n_estimators': 500, 'selectkbest__k': 25}
{'randomforestclassifier__max_depth': None, 'randomforestclassifier__n_estimators': 100, 'selectkbest__k': 25}
{'randomforestclassifier__max_depth': None, 'randomforestclassifier__n_estimators': 500, 'selectkbest__k': 25}
{'randomforestclassifier__max_depth': None, 'randomforestclassifier__n_estimators': 500, 'selectkbest__k': 25}


### Pipeline and Gridsearch with just all features as predictors (Extra Trees Classifier)

In [62]:
pipe_all_etc = make_pipeline(SelectKBest(f_classif), StandardScaler(), ExtraTreesClassifier())

params_all_etc = {'selectkbest__k': range(1, 217, 20),
                  'extratreesclassifier__n_estimators': [100, 300, 500],
                  'extratreesclassifier__max_depth': [None, 3, 5], }
                 #'extratreesclassifier__min_samples_split': [1, 3, 5],
                 #'extratreesclassifier__min_samples_leaf': [1, 3, 5]}

gs_all_etc = GridSearchCV(pipe_all_etc, params_all_etc, n_jobs=-1, cv=3)

gs_all_etc.fit(X_train_all, y_train_tobacco)

  f = msb / msw


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('extratreesclassifier',
                                        ExtraTreesClassifier())]),
             n_jobs=-1,
             param_grid={'extratreesclassifier__max_depth': [None, 3, 5],
                         'extratreesclassifier__n_estimators': [100, 300, 500],
                         'selectkbest__k': range(1, 217, 20)})

In [63]:
pipe2_all_etc = make_pipeline(SelectKBest(f_classif), StandardScaler(), ExtraTreesClassifier())

gs2_all_etc = GridSearchCV(pipe2_all_etc, params_all_etc, n_jobs=-1, cv=3)

gs2_all_etc.fit(X_train_all2, y_train_activity)

  f = msb / msw


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('extratreesclassifier',
                                        ExtraTreesClassifier())]),
             n_jobs=-1,
             param_grid={'extratreesclassifier__max_depth': [None, 3, 5],
                         'extratreesclassifier__n_estimators': [100, 300, 500],
                         'selectkbest__k': range(1, 217, 20)})

In [64]:
pipe3_all_etc = make_pipeline(SelectKBest(f_classif), StandardScaler(), ExtraTreesClassifier())

gs3_all_etc = GridSearchCV(pipe3_all_etc, params_all_etc, n_jobs=-1, cv=3)

gs3_all_etc.fit(X_train_all3, y_train_health)

  f = msb / msw


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('extratreesclassifier',
                                        ExtraTreesClassifier())]),
             n_jobs=-1,
             param_grid={'extratreesclassifier__max_depth': [None, 3, 5],
                         'extratreesclassifier__n_estimators': [100, 300, 500],
                         'selectkbest__k': range(1, 217, 20)})

In [65]:
pipe4_all_etc = make_pipeline(SelectKBest(f_classif), StandardScaler(), ExtraTreesClassifier())

gs4_all_etc = GridSearchCV(pipe4_all_etc, params_all_etc, n_jobs=-1, cv=3)

gs4_all_etc.fit(X_train_all4, y_train_smoker)

  f = msb / msw


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('extratreesclassifier',
                                        ExtraTreesClassifier())]),
             n_jobs=-1,
             param_grid={'extratreesclassifier__max_depth': [None, 3, 5],
                         'extratreesclassifier__n_estimators': [100, 300, 500],
                         'selectkbest__k': range(1, 217, 20)})

In [66]:
tobacco_all_etc_preds = gs_all_etc.predict(X_test_all)
activity_all_etc_preds = gs2_all_etc.predict(X_test_all2)
health_all_etc_preds = gs3_all_etc.predict(X_test_all3)
smoker_all_etc_preds = gs4_all_etc.predict(X_test_all4)

tobacco_all_etc_prec = precision_score(y_test_tobacco, tobacco_all_etc_preds, average='micro')
activity_all_etc_prec = precision_score(y_test_activity, activity_all_etc_preds, average='micro')
health_all_etc_prec = precision_score(y_test_health, health_all_etc_preds, average='micro')
smoker_all_etc_prec = precision_score(y_test_smoker, smoker_all_etc_preds, average='micro')

In [67]:
print(f' training accuracy for tobacco: {gs_all_etc.score(X_train_all, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_all_etc.score(X_train_all2, y_train_activity)}')
print(f' training accuracy for health: {gs3_all_etc.score(X_train_all3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_all_etc.score(X_train_all4, y_train_smoker)}')

 training accuracy for tobacco: 1.0
 training accuracy for activity: 1.0
 training accuracy for health: 1.0
 training accuracy for smoker: 0.9992148652185292


In [68]:
print(f' testing accuracy for tobacco: {gs_all_etc.score(X_test_all, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_all_etc.score(X_test_all2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_all_etc.score(X_test_all3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_all_etc.score(X_test_all4, y_test_smoker)}')

 testing accuracy for tobacco: 0.9947294448348559
 testing accuracy for activity: 0.909659268521413
 testing accuracy for health: 0.9484392419175028
 testing accuracy for smoker: 0.8108320251177394


In [69]:
print(f'Precision for tobacco: {tobacco_all_etc_prec}')
print(f'Precision for activity: {activity_all_etc_prec}')
print(f'Precision for health: {health_all_etc_prec}')
print(f'Precision for smoker: {smoker_all_etc_prec}')

Precision for tobacco: 0.9947294448348559
Precision for activity: 0.909659268521413
Precision for health: 0.9484392419175028
Precision for smoker: 0.8108320251177394


In [70]:
print(gs_all_etc.best_params_)
print(gs2_all_etc.best_params_)
print(gs3_all_etc.best_params_)
print(gs4_all_etc.best_params_)

{'extratreesclassifier__max_depth': None, 'extratreesclassifier__n_estimators': 300, 'selectkbest__k': 181}
{'extratreesclassifier__max_depth': None, 'extratreesclassifier__n_estimators': 500, 'selectkbest__k': 201}
{'extratreesclassifier__max_depth': None, 'extratreesclassifier__n_estimators': 500, 'selectkbest__k': 201}
{'extratreesclassifier__max_depth': None, 'extratreesclassifier__n_estimators': 300, 'selectkbest__k': 201}


### Pipeline and Gridsearch with just all features as predictors (Gradient Boost Classifier)

In [92]:
pipe_all_gbc = make_pipeline(SelectKBest(f_classif), StandardScaler(), GradientBoostingClassifier())

params_all_gbc = {'selectkbest__k': range(1, 217, 20),
                  'gradientboostingclassifier__loss': ['deviance', 'exponential'],
                  'gradientboostingclassifier__learning_rate': [0.01, 0.1, 0.5],
                  'gradientboostingclassifier__n_estimators': [100, 500],
                  'gradientboostingclassifier__max_depth': [3, 5]}

gs_all_gbc = GridSearchCV(pipe_all_gbc, params_all_gbc, n_jobs=-1, cv=3)

gs_all_gbc.fit(X_train_all, y_train_tobacco)

KeyboardInterrupt: 

In [None]:
pipe2_all_gbc = make_pipeline(SelectKBest(f_classif), StandardScaler(), GradientBoostingClassifier())

gs2_all_gbc = GridSearchCV(pipe2_all_gbc, params_all_gbc, n_jobs=-1, cv=3)

gs2_all_gbc.fit(X_train_all2, y_train_activity)

In [None]:
pipe3_all_gbc = make_pipeline(SelectKBest(f_classif), StandardScaler(), GradientBoostingClassifier())

gs3_all_gbc = GridSearchCV(pipe3_all_gbc, params_all_gbc, n_jobs=-1, cv=3)

gs3_all_gbc.fit(X_train_all3, y_train_health)

In [None]:
pipe4_all_gbc = make_pipeline(SelectKBest(f_classif), StandardScaler(), GradientBoostingClassifier())

gs4_all_gbc = GridSearchCV(pipe4_all_gbc, params_all_gbc, n_jobs=-1, cv=3)

gs4_all_gbc.fit(X_train_all4, y_train_smoker)

In [None]:
tobacco_all_gbc_preds = gs_all_gbc.predict(X_test_all)
activity_all_gbc_preds = gs2_all_gbc.predict(X_test_all2)
health_all_gbc_preds = gs3_all_gbc.predict(X_test_all3)
smoker_all_gbc_preds = gs4_all_gbc.predict(X_test_all4)

tobacco_all_gbc_prec = precision_score(y_test_tobacco, tobacco_all_gbc_preds, average='micro')
activity_all_gbc_prec = precision_score(y_test_activity, activity_all_gbc_preds, average='micro')
health_all_gbc_prec = precision_score(y_test_health, health_all_gbc_preds, average='micro')
smoker_all_gbc_prec = precision_score(y_test_smoker, smoker_all_gbc_preds, average='micro')

In [None]:
print(f' training accuracy for tobacco: {gs_all_gbc.score(X_train_all, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_all_gbc.score(X_train_all2, y_train_activity)}')
print(f' training accuracy for health: {gs3_all_gbc.score(X_train_all3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_all_gbc.score(X_train_all4, y_train_smoker)}')

In [None]:
print(f' testing accuracy for tobacco: {gs_all_gbc.score(X_test_all, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_all_gbc.score(X_test_all2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_all_gbc.score(X_test_all3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_all_gbc.score(X_test_all4, y_test_smoker)}')

In [None]:
print(f'Precision for tobacco: {tobacco_all_gbc_prec}')
print(f'Precision for activity: {activity_all_gbc_prec}')
print(f'Precision for health: {health_all_gbc_prec}')
print(f'Precision for smoker: {smoker_all_gbc_prec}')

In [None]:
print(gs_all_gbc.best_params_)
print(gs2_all_gbc.best_params_)
print(gs3_all_gbc.best_params_)
print(gs4_all_gbc.best_params_)

### Pipeline and Gridsearch with just all features as predictors (Ada Boost Classifier)

In [73]:
pipe_all_abc = make_pipeline(SelectKBest(f_classif), StandardScaler(), AdaBoostClassifier())

params_all_abc = {'selectkbest__k': range(1, 217, 20),
                  'adaboostclassifier__learning_rate': [0.5, 1.0],
                  'adaboostclassifier__n_estimators': [10, 15, 20, 25], }

gs_all_abc = GridSearchCV(pipe_all_abc, params_all_abc, n_jobs=-1, cv=3)

gs_all_abc.fit(X_train_all, y_train_tobacco)

  f = msb / msw


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('adaboostclassifier',
                                        AdaBoostClassifier())]),
             n_jobs=-1,
             param_grid={'adaboostclassifier__learning_rate': [0.5, 1.0],
                         'adaboostclassifier__n_estimators': [10, 15, 20, 25],
                         'selectkbest__k': range(1, 217, 20)})

In [74]:
pipe2_all_abc = make_pipeline(SelectKBest(f_classif), StandardScaler(), AdaBoostClassifier())

gs2_all_abc = GridSearchCV(pipe2_all_abc, params_all_abc, n_jobs=-1, cv=3)

gs2_all_abc.fit(X_train_all2, y_train_activity)

  f = msb / msw


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('adaboostclassifier',
                                        AdaBoostClassifier())]),
             n_jobs=-1,
             param_grid={'adaboostclassifier__learning_rate': [0.5, 1.0],
                         'adaboostclassifier__n_estimators': [10, 15, 20, 25],
                         'selectkbest__k': range(1, 217, 20)})

In [75]:
pipe3_all_abc = make_pipeline(SelectKBest(f_classif), StandardScaler(), AdaBoostClassifier())

gs3_all_abc = GridSearchCV(pipe3_all_abc, params_all_abc, n_jobs=-1, cv=3)

gs3_all_abc.fit(X_train_all3, y_train_health)

  f = msb / msw


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('adaboostclassifier',
                                        AdaBoostClassifier())]),
             n_jobs=-1,
             param_grid={'adaboostclassifier__learning_rate': [0.5, 1.0],
                         'adaboostclassifier__n_estimators': [10, 15, 20, 25],
                         'selectkbest__k': range(1, 217, 20)})

In [76]:
pipe4_all_abc = make_pipeline(SelectKBest(f_classif), StandardScaler(), AdaBoostClassifier())

gs4_all_abc = GridSearchCV(pipe4_all_abc, params_all_abc, n_jobs=-1, cv=3)

gs4_all_abc.fit(X_train_all4, y_train_smoker)

  f = msb / msw


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('adaboostclassifier',
                                        AdaBoostClassifier())]),
             n_jobs=-1,
             param_grid={'adaboostclassifier__learning_rate': [0.5, 1.0],
                         'adaboostclassifier__n_estimators': [10, 15, 20, 25],
                         'selectkbest__k': range(1, 217, 20)})

In [77]:
tobacco_all_abc_preds = gs_all_abc.predict(X_test_all)
activity_all_abc_preds = gs2_all_abc.predict(X_test_all2)
health_all_abc_preds = gs3_all_abc.predict(X_test_all3)
smoker_all_abc_preds = gs4_all_abc.predict(X_test_all4)

tobacco_all_abc_prec = precision_score(y_test_tobacco, tobacco_all_abc_preds, average='micro')
activity_all_abc_prec = precision_score(y_test_activity, activity_all_abc_preds, average='micro')
health_all_abc_prec = precision_score(y_test_health, health_all_abc_preds, average='micro')
smoker_all_abc_prec = precision_score(y_test_smoker, smoker_all_abc_preds, average='micro')

In [78]:
print(f' training accuracy for tobacco: {gs_all_abc.score(X_train_all, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_all_abc.score(X_train_all2, y_train_activity)}')
print(f' training accuracy for health: {gs3_all_abc.score(X_train_all3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_all_abc.score(X_train_all4, y_train_smoker)}')

 training accuracy for tobacco: 0.752225345514172
 training accuracy for activity: 0.7338474364318466
 training accuracy for health: 0.7934782608695652
 training accuracy for smoker: 0.5017883625577947


In [79]:
print(f' testing accuracy for tobacco: {gs_all_abc.score(X_test_all, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_all_abc.score(X_test_all2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_all_abc.score(X_test_all3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_all_abc.score(X_test_all4, y_test_smoker)}')

 testing accuracy for tobacco: 0.7454321855235418
 testing accuracy for activity: 0.7464832760237574
 testing accuracy for health: 0.7984949832775919
 testing accuracy for smoker: 0.501046572475144


In [80]:
print(f'Precision for tobacco: {tobacco_all_abc_prec}')
print(f'Precision for activity: {activity_all_abc_prec}')
print(f'Precision for health: {health_all_abc_prec}')
print(f'Precision for smoker: {smoker_all_abc_prec}')

Precision for tobacco: 0.7454321855235418
Precision for activity: 0.7464832760237574
Precision for health: 0.7984949832775919
Precision for smoker: 0.501046572475144


In [81]:
print(gs_all_abc.best_params_)
print(gs2_all_abc.best_params_)
print(gs3_all_abc.best_params_)
print(gs4_all_abc.best_params_)

{'adaboostclassifier__learning_rate': 1.0, 'adaboostclassifier__n_estimators': 25, 'selectkbest__k': 41}
{'adaboostclassifier__learning_rate': 1.0, 'adaboostclassifier__n_estimators': 25, 'selectkbest__k': 101}
{'adaboostclassifier__learning_rate': 1.0, 'adaboostclassifier__n_estimators': 25, 'selectkbest__k': 81}
{'adaboostclassifier__learning_rate': 0.5, 'adaboostclassifier__n_estimators': 25, 'selectkbest__k': 41}


### Pipeline and Gridsearch with just all features as predictors (XG Boost Classifier)

In [82]:
pipe_all_xgb = make_pipeline(SelectKBest(f_classif), StandardScaler(), xgb.XGBClassifier())

params_all_xgb = {'selectkbest__k': range(1, 217, 20),
                  'xgbclassifier__learning_rate': [0.5, 1.0],
                  'xgbclassifier__n_estimators': [10, 15, 20, 25],
                 'xgbclassifier__max_depth': [3, 5]}

gs_all_xgb = GridSearchCV(pipe_all_xgb, params_all_xgb, n_jobs=-1, cv=3)

gs_all_xgb.fit(X_train_all, y_train_tobacco)

  f = msb / msw


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None...
             

In [83]:
pipe2_all_xgb = make_pipeline(SelectKBest(f_classif), StandardScaler(), xgb.XGBClassifier())

gs2_all_xgb = GridSearchCV(pipe2_all_xgb, params_all_xgb, n_jobs=-1, cv=3)

gs2_all_xgb.fit(X_train_all2, y_train_activity)

  f = msb / msw


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None...
             

In [84]:
pipe3_all_xgb = make_pipeline(SelectKBest(f_classif), StandardScaler(), xgb.XGBClassifier())

gs3_all_xgb = GridSearchCV(pipe3_all_xgb, params_all_xgb, n_jobs=-1, cv=3)

gs3_all_xgb.fit(X_train_all3, y_train_health)

  f = msb / msw


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None...
             

In [85]:
pipe4_all_xgb = make_pipeline(SelectKBest(f_classif), StandardScaler(), xgb.XGBClassifier())

gs4_all_xgb = GridSearchCV(pipe4_all_xgb, params_all_xgb, n_jobs=-1, cv=3)

gs4_all_xgb.fit(X_train_all4, y_train_smoker)

  f = msb / msw


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('selectkbest', SelectKBest()),
                                       ('standardscaler', StandardScaler()),
                                       ('xgbclassifier',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=None, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                                      max_delta_step=None...
             

In [86]:
tobacco_all_xgb_preds = gs_all_xgb.predict(X_test_all)
activity_all_xgb_preds = gs2_all_xgb.predict(X_test_all2)
health_all_xgb_preds = gs3_all_xgb.predict(X_test_all3)
smoker_all_xgb_preds = gs4_all_xgb.predict(X_test_all4)

tobacco_all_xgb_prec = precision_score(y_test_tobacco, tobacco_all_xgb_preds, average='micro')
activity_all_xgb_prec = precision_score(y_test_activity, activity_all_xgb_preds, average='micro')
health_all_xgb_prec = precision_score(y_test_health, health_all_xgb_preds, average='micro')
smoker_all_xgb_prec = precision_score(y_test_smoker, smoker_all_xgb_preds, average='micro')

In [87]:
print(f' training accuracy for tobacco: {gs_all_xgb.score(X_train_all, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_all_xgb.score(X_train_all2, y_train_activity)}')
print(f' training accuracy for health: {gs3_all_xgb.score(X_train_all3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_all_xgb.score(X_train_all4, y_train_smoker)}')

 training accuracy for tobacco: 0.9965448582806278
 training accuracy for activity: 0.9462275948311797
 training accuracy for health: 0.9704570791527313
 training accuracy for smoker: 0.8524818982814272


In [88]:
print(f' testing accuracy for tobacco: {gs_all_xgb.score(X_test_all, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_all_xgb.score(X_test_all2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_all_xgb.score(X_test_all3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_all_xgb.score(X_test_all4, y_test_smoker)}')

 testing accuracy for tobacco: 0.988756148981026
 testing accuracy for activity: 0.8837136605189122
 testing accuracy for health: 0.9280936454849499
 testing accuracy for smoker: 0.7192569335426479


In [89]:
print(f'Precision for tobacco: {tobacco_all_xgb_prec}')
print(f'Precision for activity: {activity_all_xgb_prec}')
print(f'Precision for health: {health_all_xgb_prec}')
print(f'Precision for smoker: {smoker_all_xgb_prec}')

Precision for tobacco: 0.988756148981026
Precision for activity: 0.8837136605189122
Precision for health: 0.9280936454849499
Precision for smoker: 0.7192569335426479


In [90]:
print(gs_all_xgb.best_params_)
print(gs2_all_xgb.best_params_)
print(gs3_all_xgb.best_params_)
print(gs4_all_xgb.best_params_)

{'selectkbest__k': 121, 'xgbclassifier__learning_rate': 1.0, 'xgbclassifier__max_depth': 5, 'xgbclassifier__n_estimators': 25}
{'selectkbest__k': 81, 'xgbclassifier__learning_rate': 1.0, 'xgbclassifier__max_depth': 5, 'xgbclassifier__n_estimators': 25}
{'selectkbest__k': 161, 'xgbclassifier__learning_rate': 1.0, 'xgbclassifier__max_depth': 5, 'xgbclassifier__n_estimators': 25}
{'selectkbest__k': 61, 'xgbclassifier__learning_rate': 1.0, 'xgbclassifier__max_depth': 5, 'xgbclassifier__n_estimators': 25}
