In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from patsy import dmatrix
from sklearn.metrics import mean_squared_error
data = pd.read_csv("data/heart_2020_cleaned_v2.csv")
#data1 = pd.read_csv("/data/heart_2020_cleaned_v2.csv",lines=True)

In [2]:
# add all the parameters to the X (ie. age categoriy and race)
# KNN analysis

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
X=data[['Smoking', 'Stroke', 'PhysicalHealth','DiffWalking', 
        'Diabetic_Yes','GenHealth_Poor', 'GenHealth_Fair', 
        'KidneyDisease','AgeCategory_80_or_older','SkinCancer']]
y=data['HeartDisease']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=0)

from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier()
cv_results = cross_validate(knn,X_train,y_train,cv=3)
print('cv_results:',cv_results)
print(sum(y_train)/len(y_train))
cv_results = cross_validate(knn, X_train, y_train, cv=3,scoring='roc_auc')
print('cv_results with roc_auc:',cv_results)

cv_results: {'fit_time': array([0.57243562, 0.230371  , 0.92517662]), 'score_time': array([1.66420341, 1.57017732, 2.71882105]), 'test_score': array([0.90607424, 0.90830677, 0.91280705])}
0.084125
cv_results with roc_auc: {'fit_time': array([0.294559  , 0.15470457, 0.11107183]), 'score_time': array([0.78737855, 0.53224659, 0.45896077]), 'test_score': array([0.66862606, 0.65816457, 0.66241479])}


In [3]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

lr_model = LogisticRegression(solver='liblinear') # LogisticRegression is a␣classification!!
cv_results = cross_validate(lr_model, X_train, y_train, cv=3)
cv_results

parameters = {'C':[pow(10,i) for i in range(0, 10)],'penalty':['l1','l2']}
clf = GridSearchCV(LogisticRegression(solver='liblinear'), parameters,cv=3) #liblinear allow l1 panelty as well
clf.fit(X_train,y_train)
print(clf.cv_results_)

clf.score(X_test, y_test)

{'mean_fit_time': array([0.02510118, 0.03181601, 0.02827287, 0.03320162, 0.02844914,
       0.03478591, 0.03021042, 0.03609729, 0.03338774, 0.04648137,
       0.06766534, 0.12232916, 0.07077161, 0.06553022, 0.05035845,
       0.06078879, 0.04691315, 0.04716269, 0.0403978 , 0.04336294]), 'std_fit_time': array([0.00067041, 0.00074689, 0.00137001, 0.00168825, 0.00149708,
       0.00096094, 0.00099343, 0.00094008, 0.00533551, 0.0095762 ,
       0.00918135, 0.05695722, 0.00232198, 0.0051235 , 0.00201251,
       0.00106541, 0.00731203, 0.00301247, 0.0028307 , 0.0008442 ]), 'mean_score_time': array([0.00418361, 0.00431617, 0.00419052, 0.00363135, 0.00468397,
       0.00379451, 0.00437689, 0.00430465, 0.00523472, 0.00679652,
       0.00725396, 0.01141214, 0.01289113, 0.00745845, 0.00646385,
       0.00837183, 0.00616097, 0.0053943 , 0.00635131, 0.00432587]), 'std_score_time': array([0.00106395, 0.0004602 , 0.00020582, 0.00025205, 0.00048986,
       0.0002632 , 0.00054657, 0.00084248, 0.0003582

0.915

In [4]:
clf.best_estimator_.coef_

array([[0.56695067, 0.9206346 , 0.01342906, 0.42595948, 0.82868613,
        0.99638965, 0.69848587, 0.79958334, 1.00751219, 0.61086924]])

In [5]:
# Linear Regression

from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

print("Linear Model Score: ", linear_model.score(X_test, y_test))
print("Linear Model Coefficient: ",linear_model.coef_)

cv_results = cross_validate(linear_model, X_train, y_train, cv=3)
cv_results

Linear Model Score:  0.10541440445855843
Linear Model Coefficient:  [0.03648357 0.14302255 0.00137218 0.04520277 0.08292745 0.15545769
 0.07013543 0.11662035 0.10651632 0.05790824]


{'fit_time': array([0.03028965, 0.02142096, 0.01856732]),
 'score_time': array([0.01151848, 0.00398684, 0.0069766 ]),
 'test_score': array([0.12789679, 0.11120832, 0.13007865])}

In [7]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
models = {'GBC':GradientBoostingClassifier(), 
          'ABC':AdaBoostClassifier(), 
          'RFC': RandomForestClassifier(), 
          'ETC': ExtraTreesClassifier()}
for k in models.keys():
    models[k].fit(X_train, y_train)
    print(k, "Accuracy:", models[k].score(X_test, y_test))

GBC Accuracy: 0.91275
ABC Accuracy: 0.915
RFC Accuracy: 0.90875
ETC Accuracy: 0.91


In [8]:
gbt_model = GradientBoostingClassifier(max_depth=2,n_estimators=200)
gbt_model.fit(X_train, y_train)
print("Accuracy:", gbt_model.score(X_test, y_test))

Accuracy: 0.915


In [9]:
abc_model = AdaBoostClassifier(n_estimators=200)
abc_model.fit(X_train, y_train)
print("Accuracy:", abc_model.score(X_test, y_test))

Accuracy: 0.91475
