In [1]:
import pandas as pd
#import and merge the data
df_2016_implicate = pd.read_sas('nsch_2016_implicate.sas7bdat')
df_2016_topical = pd.read_sas('nsch_2016_topical.sas7bdat')
nsch_2016 = pd.merge(df_2016_topical, df_2016_implicate, on='HHID')
nsch_2017 = pd.read_sas('nsch_2017_topical.sas7bdat')

col_2016 = set(nsch_2016.columns)
col_2017 = set(nsch_2017.columns)
common_col = list(col_2016.intersection(col_2017))
data = pd.concat([nsch_2016[common_col], nsch_2017[common_col]])

In [2]:
interested_var = ['SC_AGE_YEARS', 'SC_RACE_R', 'K2Q01', 'K6Q71_R', 'K6Q72_R', 'K7Q84_R',\
                  'K7Q85_R', 'K7Q82_R', 'K7Q83_R', 'BULLIED', 'K7Q70_R',\
                  'MEMORYCOND', 'ERRANDALONE', 'HEADACHE', 'K2Q33A', 'K2Q32A', 'K2Q34A',\
                  'SUBABUSE', 'K2Q31A', 'HCABILITY', 'K2Q05', 'BIRTHWT_VL', 'MOMAGE',\
                  'WGTCONC', 'K4Q01', 'TREATNEED', 'CONFIDENT', 'NEWACTIVITY', 'HURTSAD',\
                  'CALMDOWN', 'TEMPER', 'K7Q04R_R', 'REPEATED', 'K7Q30', 'K7Q31', 'K7Q32',\
                  'K7Q37', 'PHYSACTIV', 'MAKEFRIEND', 'K11Q43R', 'BEDTIME', 'HOURSLEEP05',\
                  'HOURSLEEP', 'K7Q60_R', 'K7Q91_R', 'K6Q60_R', 'K6Q61_R', 'K8Q21',\
                  'K8Q30', 'K8Q31', 'K8Q32', 'K8Q34', 'K8Q35', 'K6Q20', 'K8Q11', 'K9Q40',\
                  'K9Q41', 'TALKABOUT', 'WKTOSOLVE', 'STRENGTHS', 'HOPEFUL', 'ACE1',\
                  'FOODSIT', 'K11Q60', 'K11Q61', 'K11Q62', 'S9Q34', 'K10Q11', 'K10Q12',\
                  'K10Q13', 'K10Q14', 'K10Q20', 'K10Q22', 'K10Q23', 'K10Q30', 'K10Q31',\
                  'K10Q40_R', 'GOFORHELP', 'K10Q41_R', 'K9Q96', 'ACE3', 'ACE4', 'ACE5',\
                  'ACE6', 'ACE7', 'ACE8', 'ACE9', 'ACE10', 'A1_PHYSHEALTH', 'A1_MENTHEALTH']

In [3]:
data = data[interested_var]

In [4]:
data = data.dropna(axis='columns', thresh=data.shape[0]*0.5).dropna()

In [5]:
data['ACE'] = (2-data['ACE3']) +\
           (2-data['ACE4']) +\
           (2-data['ACE5']) +\
           (2-data['ACE6']) +\
           (2-data['ACE7']) +\
           (2-data['ACE8']) +\
           (2-data['ACE9']) +\
           (2-data['ACE10'])

In [6]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
import xgboost as xgb
from pyearth import Earth
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import warnings
from sklearn.model_selection import RandomizedSearchCV
warnings.filterwarnings('ignore')


X = data.drop(columns=['K2Q33A'])
y = data['K2Q33A']

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.3, random_state=42) 

In [7]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
sampler1 = RandomUnderSampler(random_state=42)
sampler2 = SMOTE(random_state=42)
sampler3 = ADASYN(random_state=42)
sampler4 = RandomOverSampler(random_state=42)

sampler = [sampler1, sampler2, sampler3, sampler4]

In [8]:
for s in sampler:
    dt_clf = Pipeline([('sampler', s), ('dt', DecisionTreeClassifier())])
    dt_params = {'dt__criterion': ['gini', 'entropy'],\
                 'dt__splitter': ['best', 'random'],\
                 'dt__max_depth': [None, 2, 3, 4, 5, 6, 7, 50, 100, 1000],\
                }

    dt_tune = RandomizedSearchCV(dt_clf, dt_params, cv=10, scoring='accuracy').fit(X_train, y_train)

    print("Decision tree:")
    print(dt_tune.best_params_)

Decision tree:
{'dt__splitter': 'random', 'dt__max_depth': 6, 'dt__criterion': 'entropy'}
Decision tree:
{'dt__splitter': 'best', 'dt__max_depth': 5, 'dt__criterion': 'gini'}
Decision tree:
{'dt__splitter': 'best', 'dt__max_depth': 6, 'dt__criterion': 'gini'}
Decision tree:
{'dt__splitter': 'random', 'dt__max_depth': 1000, 'dt__criterion': 'entropy'}


In [11]:
dt1 = make_pipeline(sampler1, DecisionTreeClassifier(criterion='entropy', max_depth=6, splitter='random'))
dt2 = make_pipeline(sampler2, DecisionTreeClassifier(criterion='gini', max_depth=5, splitter='best'))
dt3 = make_pipeline(sampler3, DecisionTreeClassifier(criterion='gini', max_depth=6, splitter='best'))
dt4 = make_pipeline(sampler4, DecisionTreeClassifier(criterion='entropy', max_depth=1000, splitter='random'))

In [9]:
for s in sampler:
    bg_clf = Pipeline([('sampler', s), ('bg', BaggingClassifier())])
    bg_params = {'bg__n_estimators': [10, 15, 20, 25, 30, 35, 40, 50, 70, 100, 200],\
                 'bg__max_samples': [1, 2, 3, 4, 5, 6, 7, 8, 9],\
                 'bg__max_features': [1.0, 0.8, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1],\
                 'bg__bootstrap':[True, False],\
                 'bg__bootstrap_features': [True, False]}

    bg_tune = RandomizedSearchCV(bg_clf, bg_params, cv=10, scoring='accuracy').fit(X_train, y_train)

    print("Bagging:")
    print(bg_tune.best_params_)

Bagging:
{'bg__n_estimators': 100, 'bg__max_samples': 6, 'bg__max_features': 0.2, 'bg__bootstrap_features': False, 'bg__bootstrap': False}
Bagging:
{'bg__n_estimators': 40, 'bg__max_samples': 8, 'bg__max_features': 0.1, 'bg__bootstrap_features': True, 'bg__bootstrap': True}
Bagging:
{'bg__n_estimators': 200, 'bg__max_samples': 3, 'bg__max_features': 0.8, 'bg__bootstrap_features': True, 'bg__bootstrap': False}
Bagging:
{'bg__n_estimators': 200, 'bg__max_samples': 7, 'bg__max_features': 0.1, 'bg__bootstrap_features': True, 'bg__bootstrap': True}


In [12]:
bg1=make_pipeline(sampler1, BaggingClassifier(n_estimators=100, max_samples=6,\
                                             max_features=0.2, bootstrap_features=False, bootstrap=False))
bg2=make_pipeline(sampler2, BaggingClassifier(n_estimators=40, max_samples=8,\
                                             max_features=0.1, bootstrap_features=True, bootstrap=True))
bg3=make_pipeline(sampler3, BaggingClassifier(n_estimators=200, max_samples=3,\
                                             max_features=0.8, bootstrap_features=True, bootstrap=False))
bg4=make_pipeline(sampler4, BaggingClassifier(n_estimators=200, max_samples=7,\
                                             max_features=0.1, bootstrap_features=True, bootstrap=True))

In [10]:
for s in sampler:
    rf_clf = Pipeline([('sampler', s), ('rf', RandomForestClassifier())])
    rf_params = {'rf__bootstrap': [True, False],
                 'rf__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                 'rf__max_features': ['auto', 'sqrt'],
                 'rf__min_samples_leaf': [1, 2, 4],
                 'rf__min_samples_split': [2, 5, 10],
                 'rf__n_estimators': [  200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

    rf_tune = RandomizedSearchCV(rf_clf, rf_params, cv=10, scoring='accuracy').fit(X_train, y_train)
    
    print("random forest:")
    print(rf_tune.best_params_)

random forest:
{'rf__n_estimators': 800, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1, 'rf__max_features': 'auto', 'rf__max_depth': 10, 'rf__bootstrap': True}
random forest:
{'rf__n_estimators': 400, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 4, 'rf__max_features': 'auto', 'rf__max_depth': 70, 'rf__bootstrap': True}
random forest:
{'rf__n_estimators': 800, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 4, 'rf__max_features': 'auto', 'rf__max_depth': 70, 'rf__bootstrap': True}
random forest:
{'rf__n_estimators': 1000, 'rf__min_samples_split': 10, 'rf__min_samples_leaf': 2, 'rf__max_features': 'sqrt', 'rf__max_depth': 50, 'rf__bootstrap': False}


In [13]:
rf1 = make_pipeline(sampler1, RandomForestClassifier(n_estimators=800, min_samples_split=2,\
                                                  min_samples_leaf=1, max_features='auto',\
                                                  max_depth=10, bootstrap=True))
rf2 = make_pipeline(sampler2, RandomForestClassifier(n_estimators=400, min_samples_split=2,\
                                                  min_samples_leaf=4, max_features='auto',\
                                                  max_depth=70, bootstrap=True))
rf3 = make_pipeline(sampler3, RandomForestClassifier(n_estimators=800, min_samples_split=5,\
                                                  min_samples_leaf=4, max_features='auto',\
                                                  max_depth=70, bootstrap=True))
rf4 = make_pipeline(sampler4, RandomForestClassifier(n_estimators=1000, min_samples_split=10,\
                                                  min_samples_leaf=2, max_features='sqrt',\
                                                  max_depth=50, bootstrap=False))

In [14]:
models = {'Decision Tree_1': dt1,\
          'Decision Tree_2': dt2,\
          'Decision Tree_3': dt3,\
          'Decision Tree_4': dt4,\
          'Bagging1': bg1,\
          'Bagging2': bg2,\
          'Bagging3': bg3,\
          'Bagging4': bg4,\
          'Random Forest1': rf1,\
          'Random Forest2': rf2,\
          'Random Forest3': rf3,\
          'Random Forest4': rf4}

auc_ls = [np.mean(cross_val_score(models[key], X_train, y_train, cv=5, scoring='roc_auc')) for key in models.keys()]
acc_ls = [np.mean(cross_val_score(models[key], X_train, y_train, cv=5, scoring='accuracy')) for key in models.keys()]

In [15]:
df = pd.DataFrame({'Models': [k for k in models],\
                   'Accuracy': acc_ls,\
                   'ROC/AUC': auc_ls})

In [16]:
df

Unnamed: 0,Models,Accuracy,ROC/AUC
0,Decision Tree_1,0.808363,0.86489
1,Decision Tree_2,0.876753,0.866969
2,Decision Tree_3,0.877328,0.866158
3,Decision Tree_4,0.861368,0.682355
4,Bagging1,0.836874,0.824166
5,Bagging2,0.856777,0.774943
6,Bagging3,0.843567,0.801564
7,Bagging4,0.824734,0.819175
8,Random Forest1,0.825838,0.887798
9,Random Forest2,0.908181,0.888889


In [18]:
from skater.model import InMemoryModel
from skater.core.explanations import Interpretation

rf4.fit(X, y)
model = InMemoryModel(rf4.predict_proba, examples=X)
interpreter = Interpretation(X)
interpreter.load_data(X)
importance = interpreter.feature_importance.feature_importance(model)

[77/77] features ████████████████████ Time elapsed: 604 seconds

In [23]:
#importance = importance.to_frame(name='Importance Score')
importance = importance.sort_values(by='Importance Score', ascending=False)
importance.reset_index(level=0, inplace=True)
importance = importance.rename(columns={'index': 'feature'})

In [24]:
figure1_df = importance[0:10]

In [25]:
importance

Unnamed: 0,feature,feature.1,Importance Score
0,0,HCABILITY,0.084981
1,1,K2Q32A,0.056596
2,2,K7Q85_R,0.052396
3,3,MAKEFRIEND,0.046240
4,4,K8Q31,0.040008
5,5,SC_AGE_YEARS,0.035379
6,6,K2Q01,0.028532
7,7,BULLIED,0.026461
8,8,MEMORYCOND,0.023656
9,9,K2Q31A,0.023277
