In [79]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

## Read Data + Initial X, y Split

In [80]:
data = pd.read_csv("data/heart_2020_cleaned_v2.csv")
focus_cols = [
    'Smoking', 'Stroke', 'PhysicalHealth', 'DiffWalking',
    'Diabetic_Yes', 'GenHealth_Poor', 'GenHealth_Fair',
    'KidneyDisease', 'AgeCategory_80_or_older', 'SkinCancer'
]

X = data[focus_cols]
y = data['HeartDisease']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 0
)

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## KNN Model

### Initial "Exploration" w/ Cross-Validation

In [81]:
knn = KNeighborsClassifier()
cv_results = cross_validate(
    knn, X_train, y_train, cv = 3,
    scoring = 'accuracy', return_estimator = True
)
print('CV Results (optimizing for accuracy):', cv_results['test_score'], "\n")

cv_results = cross_validate(
    knn, X_train, y_train, cv = 3,
    scoring = 'roc_auc', return_estimator = True
)
print('CV Results (optimizing for roc_auc):', cv_results['test_score'])

CV Results (optimizing for accuracy): [0.89913761 0.90493156 0.90586912] 

CV Results (optimizing for roc_auc): [0.6691108  0.67625005 0.6531881 ]


### Grid Search Parameter Optimization

In [82]:
parameters = {
    'n_neighbors': range(5, 15, 2),
    'weights': ['uniform', 'distance']
}

clf = GridSearchCV(
    KNeighborsClassifier(),
    parameters, cv = 3
)
clf.fit(X_train, y_train)

print("Scoring Results (CV Results):", clf.cv_results_['mean_test_score'], "\n")
print("Scoring Results (Test Dataset):", clf.score(X_test, y_test))

Scoring Results (CV Results): [0.90331276 0.90256275 0.90712525 0.90468765 0.90906253 0.90587498
 0.90993753 0.90575004 0.90981262 0.90600003] 

Scoring Results (Test Dataset): 0.90825


## Logistic Regression

### Grid Search Parameter Optimization

In [83]:
parameters = {
    'C': [pow(10,i) for i in range(0, 10)], 
    'penalty': ['l1','l2']
}
clf = GridSearchCV(
    LogisticRegression(solver = 'liblinear'), 
    parameters, cv = 3
)
clf.fit(X_train, y_train)

print("Scoring Results (CV Results):", clf.cv_results_['mean_test_score'], "\n")
print("Scoring Results (Test Dataset):", clf.score(X_test, y_test))

Scoring Results (CV Results): [0.91162502 0.91156251 0.91156251 0.91156251 0.91156251 0.91156251
 0.91156251 0.91156251 0.91156251 0.91156251 0.91156251 0.91156251
 0.91156251 0.91156251 0.91156251 0.91156251 0.91156251 0.91156251
 0.91156251 0.91156251] 

Scoring Results (Test Dataset): 0.915


### View Most "Important" Coefficients

In [84]:
coeffs = list(clf.best_estimator_.coef_[0])
features = focus_cols

df_coeffs_logistic = pd.DataFrame(
    zip(features, coeffs),
    columns = ['features', 'coeffs']
).sort_values(
    by = 'coeffs', key = lambda x: abs(x), ascending = False
)

df_coeffs_logistic

Unnamed: 0,features,coeffs
4,Diabetic_Yes,0.272987
8,AgeCategory_80_or_older,0.268208
6,GenHealth_Fair,0.238349
0,Smoking,0.22982
5,GenHealth_Poor,0.218338
1,Stroke,0.186559
9,SkinCancer,0.165848
7,KidneyDisease,0.126322
2,PhysicalHealth,0.099015
3,DiffWalking,0.096985


## Ensemble Methods

In [85]:
models = {
    'GBC':GradientBoostingClassifier(), 
    'ABC':AdaBoostClassifier(), 
    'RFC': RandomForestClassifier(), 
    'ETC': ExtraTreesClassifier()
}

for k in models.keys():
    models[k].fit(X_train, y_train)
    print(k, "Test Set Accuracy:", models[k].score(X_test, y_test))
    
    coeffs = models[k].feature_importances_
    features = focus_cols

    df_coeffs = pd.DataFrame(
        zip(features, coeffs),
        columns = ['features', 'coeffs']
    ).sort_values(
        by = 'coeffs', key = lambda x: abs(x), ascending = False
    )

    display(df_coeffs)
    print("\n", "----------------", "\n")

GBC Test Set Accuracy: 0.912


Unnamed: 0,features,coeffs
3,DiffWalking,0.144401
4,Diabetic_Yes,0.14158
5,GenHealth_Poor,0.141394
8,AgeCategory_80_or_older,0.115125
1,Stroke,0.114262
2,PhysicalHealth,0.098763
6,GenHealth_Fair,0.097206
7,KidneyDisease,0.058499
0,Smoking,0.048783
9,SkinCancer,0.039987



 ---------------- 

ABC Test Set Accuracy: 0.9145


Unnamed: 0,features,coeffs
2,PhysicalHealth,0.54
3,DiffWalking,0.1
5,GenHealth_Poor,0.08
6,GenHealth_Fair,0.08
1,Stroke,0.04
4,Diabetic_Yes,0.04
8,AgeCategory_80_or_older,0.04
9,SkinCancer,0.04
0,Smoking,0.02
7,KidneyDisease,0.02



 ---------------- 

RFC Test Set Accuracy: 0.9065


Unnamed: 0,features,coeffs
2,PhysicalHealth,0.368939
4,Diabetic_Yes,0.087757
3,DiffWalking,0.084746
8,AgeCategory_80_or_older,0.075915
1,Stroke,0.071782
0,Smoking,0.066693
5,GenHealth_Poor,0.064841
6,GenHealth_Fair,0.063894
9,SkinCancer,0.059426
7,KidneyDisease,0.056008



 ---------------- 

ETC Test Set Accuracy: 0.90875


Unnamed: 0,features,coeffs
2,PhysicalHealth,0.37448
4,Diabetic_Yes,0.090278
3,DiffWalking,0.086873
8,AgeCategory_80_or_older,0.076584
5,GenHealth_Poor,0.075581
1,Stroke,0.072591
6,GenHealth_Fair,0.063061
0,Smoking,0.057542
9,SkinCancer,0.052958
7,KidneyDisease,0.050052



 ---------------- 

