In [3]:
#!pip install imblearn

In [4]:
#!pip install xgboost

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, precision_score
#from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier

from imblearn.over_sampling import SMOTE

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [6]:
# reading in full data set
brfss_total = pd.read_csv("./brfss_total.csv")

In [7]:
brfss_total.head()

Unnamed: 0.1,Unnamed: 0,index,_STATE,DISPCODE,PHYSHLTH,MENTHLTH,USENOW3,HISPANC2,MARITAL,CHILDREN,...,ACETTHEM,ACEHVSEX,MSCODE,_IMPAGE,_RFHLTH,_SMOKER3,_PRACE,_EDUCAG,_INCOMG,_TOTINDA
0,0,14697,5.0,110.0,0.0,0.0,3.0,2.0,1.0,1.0,...,1.0,1.0,5.0,53.0,1.0,4.0,1.0,3.0,5.0,1.0
1,1,14699,5.0,110.0,15.0,0.0,3.0,2.0,2.0,0.0,...,1.0,1.0,5.0,64.0,2.0,3.0,1.0,2.0,2.0,1.0
2,2,14700,5.0,110.0,6.0,0.0,3.0,2.0,1.0,0.0,...,1.0,1.0,5.0,58.0,1.0,4.0,1.0,3.0,2.0,1.0
3,3,14701,5.0,110.0,30.0,0.0,3.0,2.0,1.0,0.0,...,1.0,1.0,5.0,76.0,2.0,4.0,1.0,1.0,0.0,2.0
4,4,14704,5.0,110.0,13.0,0.0,3.0,2.0,3.0,0.0,...,1.0,0.0,5.0,82.0,2.0,3.0,1.0,2.0,0.0,2.0


In [8]:
# just making sure no nulls are present
brfss_total.isna().sum()

Unnamed: 0    0
index         0
_STATE        0
DISPCODE      0
PHYSHLTH      0
MENTHLTH      0
USENOW3       0
HISPANC2      0
MARITAL       0
CHILDREN      0
EMPLOY        0
RENTHOM1      0
SEX           0
QLACTLM2      0
ACEDEPRS      0
ACEDRINK      0
ACEDRUGS      0
ACEPRISN      0
ACEDIVRC      0
ACEPUNCH      0
ACEHURT       0
ACESWEAR      0
ACETOUCH      0
ACETTHEM      0
ACEHVSEX      0
MSCODE        0
_IMPAGE       0
_RFHLTH       0
_SMOKER3      0
_PRACE        0
_EDUCAG       0
_INCOMG       0
_TOTINDA      0
dtype: int64

In [9]:
brfss_total.drop(columns=['Unnamed: 0', 'index'], inplace=True)

**Variables I will try to predict with my models:**
- USENOW3: Do you currently use chewing tobacco, snuff, or snus every day, some days, or not at all?
    - classification
    - 0 = Don't know, Not sure or Refused, 1 = every day, 2 = some days, 3 = not at all
- QLACTLM2: Are you limited in any way in any activities because of physical, mental, or emotional problems?
    - classification
    - 0 = Don't know, Not sure or Refused, 1 = yes, 2 = no
- _RFHLTH: Adults with good or better health vs. fair or poor health
    - classification
    - based off of GENHLTH
    - 0 = Don't know, Not sure or Refused, 1 = Good or Better Health, 2 = Fair or Poor Health
- _SMOKER3: Four-level smoker status: Everyday smoker, Someday smoker, Former smoker, Non-smoker
    - classification
    - based off of SMOKE100 & SMOKEDAY
    - 0 = Don't know, Not sure or Refused, 1 = Current smoker (now smokes every day), 2 = Current smoker (now smokes some days), 3 = Former smoker, 4 = Never smoked

**Will make first three y's binary**
- turning y's binary made it so that I couldn't stratify in train_test_split...so will not binarize my y's for now

**Will OneHotEncode/ dummify ordinal/nominal features**

**Will only use a sample of the data set for models so they can run faster**

**Will use SMOTE to compensensate for imbalanced classes**

**Will aggregate all ACEs into two groups: Abuse and Household Challenges**

In [10]:
ace = brfss_total[['ACEDEPRS', 'ACEDRINK', 'ACEDRUGS', 'ACEPRISN', 'ACEDIVRC', 'ACEPUNCH', 'ACEHURT', 'ACESWEAR', 
                       'ACETOUCH', 'ACETTHEM', 'ACEHVSEX']]
ace.head()

Unnamed: 0,ACEDEPRS,ACEDRINK,ACEDRUGS,ACEPRISN,ACEDIVRC,ACEPUNCH,ACEHURT,ACESWEAR,ACETOUCH,ACETTHEM,ACEHVSEX
0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0
1,2.0,1.0,2.0,2.0,2.0,1.0,1.0,3.0,1.0,1.0,1.0
2,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0
3,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0
4,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0


In [11]:
# updating ace columns to be a count depending on the question
# first 6 questions are yes or no, so yes will be be counted as 1 and no will be counted as 0
# last 5 are questions of frequency, never = 0, once = 1, more than once will equal 2 (since not given an exact number)
ace['ACEDEPRS'] = ace['ACEDEPRS'].map({1:1, 2:0, 0:0})
ace['ACEDRINK'] = ace['ACEDRINK'].map({1:1, 2:0, 0:0})
ace['ACEDRUGS'] = ace['ACEDRUGS'].map({1:1, 2:0, 0:0})
ace['ACEPRISN'] = ace['ACEPRISN'].map({1:1, 2:0, 0:0})
ace['ACEDIVRC'] = ace['ACEDIVRC'].map({1:1, 2:0, 0:0})
ace['ACEPUNCH'] = ace['ACEPUNCH'].map({1:0, 2:1, 3:2})
ace['ACEHURT'] = ace['ACEHURT'].map({1:0, 2:1, 3:2, 0:0})
ace['ACESWEAR'] = ace['ACESWEAR'].map({1:0, 2:1, 3:2, 0:0})
ace['ACETOUCH'] = ace['ACETOUCH'].map({1:0, 2:1, 3:2, 0:0})
ace['ACETTHEM'] = ace['ACETTHEM'].map({1:0, 2:1, 3:2, 0:0})
ace['ACEHVSEX'] = ace['ACEHVSEX'].map({1:0, 2:1, 3:2, 0:0})

In [12]:
ace['count'] = ace.sum(axis = 1)
ace.head()

Unnamed: 0,ACEDEPRS,ACEDRINK,ACEDRUGS,ACEPRISN,ACEDIVRC,ACEPUNCH,ACEHURT,ACESWEAR,ACETOUCH,ACETTHEM,ACEHVSEX,count
0,0,0,0,0,0,0.0,0,0,0,0,0,0.0
1,0,1,0,0,0,0.0,0,2,0,0,0,3.0
2,0,0,0,0,0,0.0,0,0,0,0,0,0.0
3,0,0,0,0,0,0.0,0,0,0,0,0,0.0
4,0,0,0,0,0,0.0,0,0,0,0,0,0.0


In [13]:
brfss_total['ACE_Count'] = ace['count']

In [14]:
brfss_total.drop(columns=['ACEDEPRS', 'ACEDRINK', 'ACEDRUGS', 'ACEPRISN', 'ACEDIVRC', 'ACEPUNCH', 'ACEHURT', 'ACESWEAR', 
                       'ACETOUCH', 'ACETTHEM', 'ACEHVSEX'], inplace=True)

In [15]:
brfss_total.head()

Unnamed: 0,_STATE,DISPCODE,PHYSHLTH,MENTHLTH,USENOW3,HISPANC2,MARITAL,CHILDREN,EMPLOY,RENTHOM1,...,QLACTLM2,MSCODE,_IMPAGE,_RFHLTH,_SMOKER3,_PRACE,_EDUCAG,_INCOMG,_TOTINDA,ACE_Count
0,5.0,110.0,0.0,0.0,3.0,2.0,1.0,1.0,2.0,0.0,...,2.0,5.0,53.0,1.0,4.0,1.0,3.0,5.0,1.0,0.0
1,5.0,110.0,15.0,0.0,3.0,2.0,2.0,0.0,0.0,0.0,...,2.0,5.0,64.0,2.0,3.0,1.0,2.0,2.0,1.0,3.0
2,5.0,110.0,6.0,0.0,3.0,2.0,1.0,0.0,0.0,0.0,...,1.0,5.0,58.0,1.0,4.0,1.0,3.0,2.0,1.0,0.0
3,5.0,110.0,30.0,0.0,3.0,2.0,1.0,0.0,0.0,0.0,...,1.0,5.0,76.0,2.0,4.0,1.0,1.0,0.0,2.0,0.0
4,5.0,110.0,13.0,0.0,3.0,2.0,3.0,0.0,0.0,0.0,...,2.0,5.0,82.0,2.0,3.0,1.0,2.0,0.0,2.0,0.0


In [16]:
# taking a small sample so that my models will run a little faster
brfss_total_sample = brfss_total.sample(frac=0.05, axis=0)

brfss_total_sample.shape

(5878, 21)

In [17]:
X = brfss_total_sample[['ACE_Count']]

In [18]:
# creating the 4 y's
y_tobacco = brfss_total_sample['USENOW3']
y_activity = brfss_total_sample['QLACTLM2']
y_health = brfss_total_sample['_RFHLTH']
y_smoker = brfss_total_sample['_SMOKER3']

In [19]:
#original baseline for tobacco
y_tobacco.value_counts(normalize=True)

3.0    0.962913
1.0    0.018203
2.0    0.015481
0.0    0.003403
Name: USENOW3, dtype: float64

In [20]:
#original baseline for activity
y_activity.value_counts(normalize=True)

2.0    0.716570
1.0    0.276114
0.0    0.007315
Name: QLACTLM2, dtype: float64

In [21]:
#original baseline for health
y_health.value_counts(normalize=True)

1.0    0.808438
2.0    0.188329
0.0    0.003232
Name: _RFHLTH, dtype: float64

In [22]:
#original baseline for smoker
y_smoker.value_counts(normalize=True)

4.0    0.530793
3.0    0.302314
1.0    0.119939
2.0    0.041000
0.0    0.005954
Name: _SMOKER3, dtype: float64

In [23]:
# to compensate for unbalanced classes in my y's will use SMOTE

sm = SMOTE(random_state=151)
X1, y_tobacco = sm.fit_resample(X, y_tobacco)

sm2 = SMOTE(random_state=151)
X2, y_activity = sm2.fit_resample(X, y_activity)

sm3 = SMOTE(random_state=151)
X3, y_health = sm3.fit_resample(X, y_health)

sm4 = SMOTE(random_state=151)
X4, y_smoker = sm4.fit_resample(X, y_smoker)

In [24]:
# new baseline for tobacco
y_tobacco.value_counts(normalize=True)

2.0    0.25
0.0    0.25
1.0    0.25
3.0    0.25
Name: USENOW3, dtype: float64

In [25]:
# looks like SMOTE has increased the size of my y's more than 4x, so will probably take some time for models to run
y_tobacco.shape

(22640,)

In [26]:
# new baseline for activity
y_activity.value_counts(normalize=True)

0.0    0.333333
1.0    0.333333
2.0    0.333333
Name: QLACTLM2, dtype: float64

In [27]:
# new baseline for health
y_health.value_counts(normalize=True)

0.0    0.333333
2.0    0.333333
1.0    0.333333
Name: _RFHLTH, dtype: float64

In [28]:
# new baseline for smoker
y_smoker.value_counts(normalize=True)

0.0    0.2
2.0    0.2
1.0    0.2
3.0    0.2
4.0    0.2
Name: _SMOKER3, dtype: float64

In [29]:
X1.shape

(22640, 1)

In [30]:
# creating training and testing sets for all y's (stratified on y, but since the classes are equal probably didn't have to)
X_train_ace, X_test_ace, y_train_tobacco, y_test_tobacco = train_test_split(X1, y_tobacco, random_state = 151, stratify=y_tobacco)
X_train_ace2, X_test_ace2, y_train_activity, y_test_activity = train_test_split(X2, y_activity, random_state = 151, stratify=y_activity)
X_train_ace3, X_test_ace3, y_train_health, y_test_health = train_test_split(X3, y_health, random_state = 151, stratify=y_health)
X_train_ace4, X_test_ace4, y_train_smoker, y_test_smoker = train_test_split(X4, y_smoker, random_state = 151, stratify=y_smoker)

### Gridsearch with just ACE count as predictor (Logistic Regression)

In [32]:
params_ace_log = {'C': [0.01, 0.5, 1]}

gs_ace_log = GridSearchCV(LogisticRegression(max_iter=10_000), params_ace_log, cv=3)

gs_ace_log.fit(X_train_ace, y_train_tobacco)

GridSearchCV(cv=3, estimator=LogisticRegression(max_iter=10000),
             param_grid={'C': [0.01, 0.5, 1]})

In [33]:
gs2_ace_log = GridSearchCV(LogisticRegression(max_iter=10_000), params_ace_log, cv=3)

gs2_ace_log.fit(X_train_ace2, y_train_activity)

GridSearchCV(cv=3, estimator=LogisticRegression(max_iter=10000),
             param_grid={'C': [0.01, 0.5, 1]})

In [34]:
gs3_ace_log = GridSearchCV(LogisticRegression(max_iter=10_000), params_ace_log, cv=3)

gs3_ace_log.fit(X_train_ace3, y_train_health)

GridSearchCV(cv=3, estimator=LogisticRegression(max_iter=10000),
             param_grid={'C': [0.01, 0.5, 1]})

In [35]:
gs4_ace_log = GridSearchCV(LogisticRegression(max_iter=10_000), params_ace_log, cv=3)

gs4_ace_log.fit(X_train_ace4, y_train_smoker)

GridSearchCV(cv=3, estimator=LogisticRegression(max_iter=10000),
             param_grid={'C': [0.01, 0.5, 1]})

In [36]:
tobacco_ace_log_preds = gs_ace_log.predict(X_test_ace)
activity_ace_log_preds = gs2_ace_log.predict(X_test_ace2)
health_ace_log_preds = gs3_ace_log.predict(X_test_ace3)
smoker_ace_log_preds = gs4_ace_log.predict(X_test_ace4)

tobacco_ace_log_prec = precision_score(y_test_tobacco, tobacco_ace_log_preds, average='micro')
activity_ace_log_prec = precision_score(y_test_activity, activity_ace_log_preds, average='micro')
health_ace_log_prec = precision_score(y_test_health, health_ace_log_preds, average='micro')
smoker_ace_log_prec = precision_score(y_test_smoker, smoker_ace_log_preds, average='micro')

In [37]:
print(f' training accuracy for tobacco: {gs_ace_log.score(X_train_ace, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_ace_log.score(X_train_ace2, y_train_activity)}')
print(f' training accuracy for health: {gs3_ace_log.score(X_train_ace3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_ace_log.score(X_train_ace4, y_train_smoker)}')

 training accuracy for tobacco: 0.38645465253239103
 training accuracy for activity: 0.4021314762055503
 training accuracy for health: 0.42386831275720166
 training accuracy for smoker: 0.27957264957264955


In [38]:
print(f' testing accuracy for tobacco: {gs_ace_log.score(X_test_ace, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_ace_log.score(X_test_ace2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_ace_log.score(X_test_ace3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_ace_log.score(X_test_ace4, y_test_smoker)}')

 testing accuracy for tobacco: 0.38303886925795055
 testing accuracy for activity: 0.39632795188350745
 testing accuracy for health: 0.43125701459034793
 testing accuracy for smoker: 0.2794871794871795


In [39]:
print(f'Precision for tobacco: {tobacco_ace_log_prec}')
print(f'Precision for activity: {activity_ace_log_prec}')
print(f'Precision for health: {health_ace_log_prec}')
print(f'Precision for smoker: {smoker_ace_log_prec}')

Precision for tobacco: 0.38303886925795055
Precision for activity: 0.39632795188350745
Precision for health: 0.43125701459034793
Precision for smoker: 0.2794871794871795


In [40]:
print(gs_ace_log.best_params_)
print(gs2_ace_log.best_params_)
print(gs3_ace_log.best_params_)
print(gs4_ace_log.best_params_)

{'C': 0.01}
{'C': 0.01}
{'C': 0.01}
{'C': 0.01}


**Observations**

### Gridsearch with just ACE count as predictor (Random Forest Classifier)

In [41]:
params_ace_rfc = {'n_estimators': [100, 300, 500],
                  'max_depth': [None, 3, 5], 
                  'min_samples_split': [1, 3, 5],
                  'min_samples_leaf': [1, 3, 5]}

gs_ace_rfc = GridSearchCV(RandomForestClassifier(), params_ace_rfc, cv=3)

gs_ace_rfc.fit(X_train_ace, y_train_tobacco)

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [None, 3, 5],
                         'min_samples_leaf': [1, 3, 5],
                         'min_samples_split': [1, 3, 5],
                         'n_estimators': [100, 300, 500]})

In [42]:
gs2_ace_rfc = GridSearchCV(RandomForestClassifier(), params_ace_rfc, cv=3)

gs2_ace_rfc.fit(X_train_ace2, y_train_activity)

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [None, 3, 5],
                         'min_samples_leaf': [1, 3, 5],
                         'min_samples_split': [1, 3, 5],
                         'n_estimators': [100, 300, 500]})

In [43]:
gs3_ace_rfc = GridSearchCV(RandomForestClassifier(), params_ace_rfc, cv=3)

gs3_ace_rfc.fit(X_train_ace3, y_train_health)

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [None, 3, 5],
                         'min_samples_leaf': [1, 3, 5],
                         'min_samples_split': [1, 3, 5],
                         'n_estimators': [100, 300, 500]})

In [44]:
gs4_ace_rfc = GridSearchCV(RandomForestClassifier(), params_ace_rfc, cv=3)

gs4_ace_rfc.fit(X_train_ace4, y_train_smoker)

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [None, 3, 5],
                         'min_samples_leaf': [1, 3, 5],
                         'min_samples_split': [1, 3, 5],
                         'n_estimators': [100, 300, 500]})

In [45]:
tobacco_ace_rfc_preds = gs_ace_rfc.predict(X_test_ace)
activity_ace_rfc_preds = gs2_ace_rfc.predict(X_test_ace2)
health_ace_rfc_preds = gs3_ace_rfc.predict(X_test_ace3)
smoker_ace_rfc_preds = gs4_ace_rfc.predict(X_test_ace4)

tobacco_ace_rfc_prec = precision_score(y_test_tobacco, tobacco_ace_rfc_preds, average='micro')
activity_ace_rfc_prec = precision_score(y_test_activity, activity_ace_rfc_preds, average='micro')
health_ace_rfc_prec = precision_score(y_test_health, health_ace_rfc_preds, average='micro')
smoker_ace_rfc_prec = precision_score(y_test_smoker, smoker_ace_rfc_preds, average='micro')

In [46]:
print(f' training accuracy for tobacco: {gs_ace_rfc.score(X_train_ace, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_ace_rfc.score(X_train_ace2, y_train_activity)}')
print(f' training accuracy for health: {gs3_ace_rfc.score(X_train_ace3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_ace_rfc.score(X_train_ace4, y_train_smoker)}')

 training accuracy for tobacco: 0.4658421672555948
 training accuracy for activity: 0.5264324153213042
 training accuracy for health: 0.5390011223344556
 training accuracy for smoker: 0.34205128205128205


In [47]:
print(f' testing accuracy for tobacco: {gs_ace_rfc.score(X_test_ace, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_ace_rfc.score(X_test_ace2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_ace_rfc.score(X_test_ace3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_ace_rfc.score(X_test_ace4, y_test_smoker)}')

 testing accuracy for tobacco: 0.45918727915194346
 testing accuracy for activity: 0.5343463121240899
 testing accuracy for health: 0.5420875420875421
 testing accuracy for smoker: 0.32666666666666666


In [48]:
print(f'Precision for tobacco: {tobacco_ace_rfc_prec}')
print(f'Precision for activity: {activity_ace_rfc_prec}')
print(f'Precision for health: {health_ace_rfc_prec}')
print(f'Precision for smoker: {smoker_ace_rfc_prec}')

Precision for tobacco: 0.45918727915194346
Precision for activity: 0.5343463121240899
Precision for health: 0.5420875420875421
Precision for smoker: 0.32666666666666666


In [49]:
print(gs_ace_rfc.best_params_)
print(gs2_ace_rfc.best_params_)
print(gs3_ace_rfc.best_params_)
print(gs4_ace_rfc.best_params_)

{'max_depth': None, 'min_samples_leaf': 5, 'min_samples_split': 3, 'n_estimators': 500}
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 100}
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 100}
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 500}


### Gridsearch with just ACE count as predictor (Extra Trees Classifier)

In [50]:
params_ace_etc = {'n_estimators': [100, 300, 500],
                  'max_depth': [None, 3, 5],
                 'min_samples_split': [1, 3, 5],
                 'min_samples_leaf': [1, 3, 5]}

gs_ace_etc = GridSearchCV(ExtraTreesClassifier(), params_ace_etc, cv=3)

gs_ace_etc.fit(X_train_ace, y_train_tobacco)

GridSearchCV(cv=3, estimator=ExtraTreesClassifier(),
             param_grid={'max_depth': [None, 3, 5],
                         'min_samples_leaf': [1, 3, 5],
                         'min_samples_split': [1, 3, 5],
                         'n_estimators': [100, 300, 500]})

In [51]:
gs2_ace_etc = GridSearchCV(ExtraTreesClassifier(), params_ace_etc, cv=3)

gs2_ace_etc.fit(X_train_ace2, y_train_activity)

GridSearchCV(cv=3, estimator=ExtraTreesClassifier(),
             param_grid={'max_depth': [None, 3, 5],
                         'min_samples_leaf': [1, 3, 5],
                         'min_samples_split': [1, 3, 5],
                         'n_estimators': [100, 300, 500]})

In [52]:
gs3_ace_etc = GridSearchCV(ExtraTreesClassifier(), params_ace_etc, cv=3)

gs3_ace_etc.fit(X_train_ace3, y_train_health)

GridSearchCV(cv=3, estimator=ExtraTreesClassifier(),
             param_grid={'max_depth': [None, 3, 5],
                         'min_samples_leaf': [1, 3, 5],
                         'min_samples_split': [1, 3, 5],
                         'n_estimators': [100, 300, 500]})

In [53]:
gs4_ace_etc = GridSearchCV(ExtraTreesClassifier(), params_ace_etc, cv=3)

gs4_ace_etc.fit(X_train_ace4, y_train_smoker)

GridSearchCV(cv=3, estimator=ExtraTreesClassifier(),
             param_grid={'max_depth': [None, 3, 5],
                         'min_samples_leaf': [1, 3, 5],
                         'min_samples_split': [1, 3, 5],
                         'n_estimators': [100, 300, 500]})

In [54]:
tobacco_ace_etc_preds = gs_ace_etc.predict(X_test_ace)
activity_ace_etc_preds = gs2_ace_etc.predict(X_test_ace2)
health_ace_etc_preds = gs3_ace_etc.predict(X_test_ace3)
smoker_ace_etc_preds = gs4_ace_etc.predict(X_test_ace4)

tobacco_ace_etc_prec = precision_score(y_test_tobacco, tobacco_ace_etc_preds, average='micro')
activity_ace_etc_prec = precision_score(y_test_activity, activity_ace_etc_preds, average='micro')
health_ace_etc_prec = precision_score(y_test_health, health_ace_etc_preds, average='micro')
smoker_ace_etc_prec = precision_score(y_test_smoker, smoker_ace_etc_preds, average='micro')

In [55]:
print(f' training accuracy for tobacco: {gs_ace_etc.score(X_train_ace, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_ace_etc.score(X_train_ace2, y_train_activity)}')
print(f' training accuracy for health: {gs3_ace_etc.score(X_train_ace3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_ace_etc.score(X_train_ace4, y_train_smoker)}')

 training accuracy for tobacco: 0.470906949352179
 training accuracy for activity: 0.5264324153213042
 training accuracy for health: 0.5390011223344556
 training accuracy for smoker: 0.34196581196581194


In [56]:
print(f' testing accuracy for tobacco: {gs_ace_etc.score(X_test_ace, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_ace_etc.score(X_test_ace2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_ace_etc.score(X_test_ace3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_ace_etc.score(X_test_ace4, y_test_smoker)}')

 testing accuracy for tobacco: 0.4586572438162544
 testing accuracy for activity: 0.5343463121240899
 testing accuracy for health: 0.5432098765432098
 testing accuracy for smoker: 0.32794871794871794


In [57]:
print(f'Precision for tobacco: {tobacco_ace_etc_prec}')
print(f'Precision for activity: {activity_ace_etc_prec}')
print(f'Precision for health: {health_ace_etc_prec}')
print(f'Precision for smoker: {smoker_ace_etc_prec}')

Precision for tobacco: 0.4586572438162544
Precision for activity: 0.5343463121240899
Precision for health: 0.5432098765432098
Precision for smoker: 0.32794871794871794


In [58]:
print(gs_ace_etc.best_params_)
print(gs2_ace_etc.best_params_)
print(gs3_ace_etc.best_params_)
print(gs4_ace_etc.best_params_)

{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 100}
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}
{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}


### Gridsearch with just ACE count as predictor (Gradient Boost Classifier)
- won't be running this model, it takes too long (even using the Cloud it takes hours for one model to run)

pipe_ace_gbc = make_pipeline(StandardScaler(), GradientBoostingClassifier())

params_ace_gbc = {'gradientboostingclassifier__loss': ['deviance', 'exponential'],
                  'gradientboostingclassifier__learning_rate': [0.01, 0.1, 0.5],
                  'gradientboostingclassifier__n_estimators': [100, 500],
                  'gradientboostingclassifier__max_depth': [3, 5]}

gs_ace_gbc = GridSearchCV(pipe_ace_gbc, params_ace_gbc, cv=3)

gs_ace_gbc.fit(X_train_ace, y_train_tobacco)

pipe2_ace_gbc = make_pipeline(StandardScaler(), GradientBoostingClassifier())

gs2_ace_gbc = GridSearchCV(pipe2_ace_gbc, params_ace_gbc, cv=3)

gs2_ace_gbc.fit(X_train_ace2, y_train_activity)

pipe3_ace_gbc = make_pipeline(StandardScaler(), GradientBoostingClassifier())

gs3_ace_gbc = GridSearchCV(pipe3_ace_gbc, params_ace_gbc, cv=3)

gs3_ace_gbc.fit(X_train_ace3, y_train_health)

pipe4_ace_gbc = make_pipeline(StandardScaler(), GradientBoostingClassifier())

gs4_ace_gbc = GridSearchCV(pipe4_ace_gbc, params_ace_gbc, cv=3)

gs4_ace_gbc.fit(X_train_ace4, y_train_smoker)

tobacco_ace_gbc_preds = gs_ace_gbc.predict(X_test_ace)
activity_ace_gbc_preds = gs2_ace_gbc.predict(X_test_ace2)
health_ace_gbc_preds = gs3_ace_gbc.predict(X_test_ace3)
smoker_ace_gbc_preds = gs4_ace_gbc.predict(X_test_ace4)

tobacco_ace_gbc_prec = precision_score(y_test_tobacco, tobacco_ace_gbc_preds, average='micro')
activity_ace_gbc_prec = precision_score(y_test_activity, activity_ace_gbc_preds, average='micro')
health_ace_gbc_prec = precision_score(y_test_health, health_ace_gbc_preds, average='micro')
smoker_ace_gbc_prec = precision_score(y_test_smoker, smoker_ace_gbc_preds, average='micro')

print(f' training accuracy for tobacco: {gs_ace_gbc.score(X_train_ace, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_ace_gbc.score(X_train_ace2, y_train_activity)}')
print(f' training accuracy for health: {gs3_ace_gbc.score(X_train_ace3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_ace_gbc.score(X_train_ace4, y_train_smoker)}')

print(f' testing accuracy for tobacco: {gs_ace_gbc.score(X_test_ace, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_ace_gbc.score(X_test_ace2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_ace_gbc.score(X_test_ace3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_ace_gbc.score(X_test_ace4, y_test_smoker)}')

print(f'Precision for tobacco: {tobacco_ace_gbc_prec}')
print(f'Precision for activity: {activity_ace_gbc_prec}')
print(f'Precision for health: {health_ace_gbc_prec}')
print(f'Precision for smoker: {smoker_ace_gbc_prec}')

print(gs_ace_gbc.best_params_)
print(gs2_ace_gbc.best_params_)
print(gs3_ace_gbc.best_params_)
print(gs4_ace_gbc.best_params_)

### Gridsearch with just ACE count as predictor (Ada Boost Classifier)

In [59]:
params_ace_abc = {'learning_rate': [0.5, 1.0],
                  'n_estimators': [10, 15, 20, 25], }

gs_ace_abc = GridSearchCV(AdaBoostClassifier(), params_ace_abc, cv=3)

gs_ace_abc.fit(X_train_ace, y_train_tobacco)

GridSearchCV(cv=3, estimator=AdaBoostClassifier(),
             param_grid={'learning_rate': [0.5, 1.0],
                         'n_estimators': [10, 15, 20, 25]})

In [60]:
gs2_ace_abc = GridSearchCV(AdaBoostClassifier(), params_ace_abc, cv=3)

gs2_ace_abc.fit(X_train_ace2, y_train_activity)

GridSearchCV(cv=3, estimator=AdaBoostClassifier(),
             param_grid={'learning_rate': [0.5, 1.0],
                         'n_estimators': [10, 15, 20, 25]})

In [61]:
gs3_ace_abc = GridSearchCV(AdaBoostClassifier(), params_ace_abc, cv=3)

gs3_ace_abc.fit(X_train_ace3, y_train_health)

GridSearchCV(cv=3, estimator=AdaBoostClassifier(),
             param_grid={'learning_rate': [0.5, 1.0],
                         'n_estimators': [10, 15, 20, 25]})

In [62]:
gs4_ace_abc = GridSearchCV(AdaBoostClassifier(), params_ace_abc, cv=3)

gs4_ace_abc.fit(X_train_ace4, y_train_smoker)

GridSearchCV(cv=3, estimator=AdaBoostClassifier(),
             param_grid={'learning_rate': [0.5, 1.0],
                         'n_estimators': [10, 15, 20, 25]})

In [63]:
tobacco_ace_abc_preds = gs_ace_abc.predict(X_test_ace)
activity_ace_abc_preds = gs2_ace_abc.predict(X_test_ace2)
health_ace_abc_preds = gs3_ace_abc.predict(X_test_ace3)
smoker_ace_abc_preds = gs4_ace_abc.predict(X_test_ace4)

tobacco_ace_abc_prec = precision_score(y_test_tobacco, tobacco_ace_abc_preds, average='micro')
activity_ace_abc_prec = precision_score(y_test_activity, activity_ace_abc_preds, average='micro')
health_ace_abc_prec = precision_score(y_test_health, health_ace_abc_preds, average='micro')
smoker_ace_abc_prec = precision_score(y_test_smoker, smoker_ace_abc_preds, average='micro')

In [64]:
print(f' training accuracy for tobacco: {gs_ace_abc.score(X_train_ace, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_ace_abc.score(X_train_ace2, y_train_activity)}')
print(f' training accuracy for health: {gs3_ace_abc.score(X_train_ace3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_ace_abc.score(X_train_ace4, y_train_smoker)}')

 training accuracy for tobacco: 0.40683156654888103
 training accuracy for activity: 0.5061728395061729
 training accuracy for health: 0.4990647212869435
 training accuracy for smoker: 0.321965811965812


In [65]:
print(f' testing accuracy for tobacco: {gs_ace_abc.score(X_test_ace, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_ace_abc.score(X_test_ace2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_ace_abc.score(X_test_ace3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_ace_abc.score(X_test_ace4, y_test_smoker)}')

 testing accuracy for tobacco: 0.4070671378091873
 testing accuracy for activity: 0.51218740107629
 testing accuracy for health: 0.5061728395061729
 testing accuracy for smoker: 0.3128205128205128


In [66]:
print(f'Precision for tobacco: {tobacco_ace_abc_prec}')
print(f'Precision for activity: {activity_ace_abc_prec}')
print(f'Precision for health: {health_ace_abc_prec}')
print(f'Precision for smoker: {smoker_ace_abc_prec}')

Precision for tobacco: 0.4070671378091873
Precision for activity: 0.51218740107629
Precision for health: 0.5061728395061729
Precision for smoker: 0.3128205128205128


In [67]:
print(gs_ace_abc.best_params_)
print(gs2_ace_abc.best_params_)
print(gs3_ace_abc.best_params_)
print(gs4_ace_abc.best_params_)

{'learning_rate': 0.5, 'n_estimators': 25}
{'learning_rate': 1.0, 'n_estimators': 25}
{'learning_rate': 1.0, 'n_estimators': 20}
{'learning_rate': 1.0, 'n_estimators': 10}


### Gridsearch with just ACE count as predictor (XG Boost Classifier)

In [68]:
params_ace_xgb = {'learning_rate': [0.5, 1.0],
                  'n_estimators': [10, 15, 20, 25],
                 'max_depth': [3, 5]}

gs_ace_xgb = GridSearchCV(xgb.XGBClassifier(), params_ace_xgb, cv=3)

gs_ace_xgb.fit(X_train_ace, y_train_tobacco)

GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None,

In [69]:
gs2_ace_xgb = GridSearchCV(xgb.XGBClassifier(), params_ace_xgb, cv=3)

gs2_ace_xgb.fit(X_train_ace2, y_train_activity)

GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None,

In [70]:
gs3_ace_xgb = GridSearchCV(xgb.XGBClassifier(), params_ace_xgb, cv=3)

gs3_ace_xgb.fit(X_train_ace3, y_train_health)

GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None,

In [71]:
gs4_ace_xgb = GridSearchCV(xgb.XGBClassifier(), params_ace_xgb, cv=3)

gs4_ace_xgb.fit(X_train_ace4, y_train_smoker)

GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None,

In [72]:
tobacco_ace_xgb_preds = gs_ace_xgb.predict(X_test_ace)
activity_ace_xgb_preds = gs2_ace_xgb.predict(X_test_ace2)
health_ace_xgb_preds = gs3_ace_xgb.predict(X_test_ace3)
smoker_ace_xgb_preds = gs4_ace_xgb.predict(X_test_ace4)

tobacco_ace_xgb_prec = precision_score(y_test_tobacco, tobacco_ace_xgb_preds, average='micro')
activity_ace_xgb_prec = precision_score(y_test_activity, activity_ace_xgb_preds, average='micro')
health_ace_xgb_prec = precision_score(y_test_health, health_ace_xgb_preds, average='micro')
smoker_ace_xgb_prec = precision_score(y_test_smoker, smoker_ace_xgb_preds, average='micro')

In [73]:
print(f' training accuracy for tobacco: {gs_ace_xgb.score(X_train_ace, y_train_tobacco)}')
print(f' training accuracy for activity: {gs2_ace_xgb.score(X_train_ace2, y_train_activity)}')
print(f' training accuracy for health: {gs3_ace_xgb.score(X_train_ace3, y_train_health)}')
print(f' training accuracy for smoker: {gs4_ace_xgb.score(X_train_ace4, y_train_smoker)}')

 training accuracy for tobacco: 0.4640164899882214
 training accuracy for activity: 0.5264324153213042
 training accuracy for health: 0.5390011223344556
 training accuracy for smoker: 0.34196581196581194


In [74]:
print(f' testing accuracy for tobacco: {gs_ace_xgb.score(X_test_ace, y_test_tobacco)}')
print(f' testing accuracy for activity: {gs2_ace_xgb.score(X_test_ace2, y_test_activity)}')
print(f' testing accuracy for health: {gs3_ace_xgb.score(X_test_ace3, y_test_health)}')
print(f' testing accuracy for smoker: {gs4_ace_xgb.score(X_test_ace4, y_test_smoker)}')

 testing accuracy for tobacco: 0.45971731448763253
 testing accuracy for activity: 0.5343463121240899
 testing accuracy for health: 0.5420875420875421
 testing accuracy for smoker: 0.32743589743589746


In [75]:
print(f'Precision for tobacco: {tobacco_ace_xgb_prec}')
print(f'Precision for activity: {activity_ace_xgb_prec}')
print(f'Precision for health: {health_ace_xgb_prec}')
print(f'Precision for smoker: {smoker_ace_xgb_prec}')

Precision for tobacco: 0.45971731448763253
Precision for activity: 0.5343463121240899
Precision for health: 0.5420875420875421
Precision for smoker: 0.32743589743589746


In [76]:
print(gs_ace_xgb.best_params_)
print(gs2_ace_xgb.best_params_)
print(gs3_ace_xgb.best_params_)
print(gs4_ace_xgb.best_params_)

{'learning_rate': 0.5, 'max_depth': 3, 'n_estimators': 20}
{'learning_rate': 0.5, 'max_depth': 5, 'n_estimators': 15}
{'learning_rate': 0.5, 'max_depth': 3, 'n_estimators': 25}
{'learning_rate': 0.5, 'max_depth': 5, 'n_estimators': 25}
