In [1]:
import pandas as pd
import numpy as np
# Show every column when displaying wide frames. The fully-qualified key is
# required: the bare pattern 'max_columns' also matches
# 'styler.render.max_columns' on recent pandas and raises OptionError.
pd.set_option('display.max_columns', None)

from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, classification_report, recall_score

from sklearn.preprocessing import MinMaxScaler
from time import time

In [2]:
# head() on these frames shows an 'Unnamed: 0' column, which looks like a row
# index written out when the CSVs were saved (NOTE(review): confirm against
# the files). Later cells drop only the label column, so a row number left in
# the frame would be fed to every model as a feature. Drop it when present;
# errors='ignore' makes this a no-op if the column does not exist.
INDEX_ARTIFACT = 'Unnamed: 0'

df_diab_train = pd.read_csv('data/diabetes_data.csv').drop(columns=INDEX_ARTIFACT, errors='ignore')
df_heart_train = pd.read_csv('data/heart_disease_data.csv').drop(columns=INDEX_ARTIFACT, errors='ignore')

df_diab_test = pd.read_csv('data/test_diabetes_data.csv').drop(columns=INDEX_ARTIFACT, errors='ignore')
df_heart_test = pd.read_csv('data/test_heart_disease_data.csv').drop(columns=INDEX_ARTIFACT, errors='ignore')

In [3]:
# Preview the first five rows of the heart-disease training frame.
df_heart_train.head(n=5)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,IsMale
0,40,True,289,False,172,False,0.0,1,False,False,True,False,False,False,True,False,True
1,49,True,180,False,156,False,1.0,0,True,False,False,True,False,False,True,False,False
2,37,True,283,False,98,False,0.0,1,False,False,True,False,False,False,False,True,True
3,48,True,214,False,108,True,1.5,0,True,True,False,False,False,False,True,False,False
4,54,True,195,False,122,False,0.0,1,False,False,False,True,False,False,True,False,True


In [4]:
# Preview the first five rows of the diabetes training frame.
df_diab_train.head(n=5)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Age,Education,Income,IsMale,Diabetes
0,False,False,False,66.0,False,False,False,False,True,True,False,True,False,5.0,0.0,30.0,True,9.0,4.0,5.0,False,True
1,False,False,False,34.0,False,False,False,True,False,True,False,True,False,3.0,0.0,0.0,False,7.0,6.0,8.0,False,True
2,False,False,False,30.0,False,False,False,True,True,True,False,True,False,3.0,0.0,2.0,False,6.0,6.0,8.0,True,True
3,False,False,False,39.0,False,False,False,True,True,True,False,True,False,2.0,2.0,10.0,True,4.0,5.0,5.0,False,True
4,False,False,False,37.0,True,False,False,False,False,True,False,True,False,2.0,0.0,0.0,False,7.0,6.0,7.0,True,True


# Algorithms to Explore
* Decision Trees (with pruning)
* Neural Network
* Boosting (AdaBoost)
* SVM
* kNN

In [5]:
def get_recall(model, X_test, y_test):
    """Return the recall of ``model`` for the positive (``True``) class.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test : feature matrix passed to ``model.predict``.
    y_test : true labels aligned with ``X_test``.

    Returns
    -------
    float
        Recall of the ``True`` class on the supplied data.
    """
    y_pred = model.predict(X_test)
    # Ask for the single metric directly instead of building a whole
    # classification report and indexing it by the string key 'True', which
    # raises KeyError whenever the labels are not rendered exactly as 'True'
    # (e.g. 0/1 integer labels).
    return recall_score(y_test, y_pred, pos_label=True)

In [6]:
import json

In [7]:
# Result registries keyed by model label. The cells below store a
# two-element list per model, ordered [diabetes, heart disease].
train_times, test_times = dict(), dict()   # elapsed seconds from time()
scores, acc_scores = dict(), dict()        # recall / accuracy on the test sets

## DT

In [8]:
# Diabetes: every column except the 'Diabetes' label is used as a feature.
X_train_diab, y_train_diab = df_diab_train.drop(columns='Diabetes'), df_diab_train['Diabetes']
X_test_diab, y_test_diab = df_diab_test.drop(columns='Diabetes'), df_diab_test['Diabetes']

# Heart disease: every column except the 'HeartDisease' label is a feature.
X_train_heart, y_train_heart = df_heart_train.drop(columns='HeartDisease'), df_heart_train['HeartDisease']
X_test_heart, y_test_heart = df_heart_test.drop(columns='HeartDisease'), df_heart_test['HeartDisease']

In [9]:
# Read the tuned hyper-parameters once, before any timer starts, so the
# recorded durations measure model fitting only and not file I/O or JSON
# parsing (which can dominate for a fast-fitting model).
with open('params/dt_params.json') as f:
    dt_params = json.load(f)
diab_params = dt_params['diab']
heart_params = dt_params['heart']

diab_dt = DecisionTreeClassifier(**diab_params)
diab_start = time()
diab_dt.fit(X_train_diab, y_train_diab)
diab_end = time()

heart_dt = DecisionTreeClassifier(**heart_params)
heart_start = time()
heart_dt.fit(X_train_heart, y_train_heart)
heart_end = time()

# Fit durations in seconds, ordered [diabetes, heart disease].
train_times['DT'] = [diab_end-diab_start, heart_end-heart_start]

In [10]:
# Evaluate the decision trees on the held-out sets. Each timed span wraps the
# get_recall call, i.e. prediction plus the recall computation.
diab_start = time()
diab_score = get_recall(diab_dt, X_test_diab, y_test_diab)
diab_end = time()

heart_start = time()
heart_score = get_recall(heart_dt, X_test_heart, y_test_heart)
heart_end = time()

# Record results as [diabetes, heart disease].
test_times.update(DT=[diab_end - diab_start, heart_end - heart_start])
scores.update(DT=[diab_score, heart_score])
acc_scores.update(DT=[
    diab_dt.score(X_test_diab, y_test_diab),
    heart_dt.score(X_test_heart, y_test_heart),
])

## NN

In [11]:
# Read the tuned hyper-parameters once, before any timer starts, so the
# recorded durations measure model fitting only and not file I/O or JSON
# parsing.
with open('params/nn_params.json') as f:
    nn_params = json.load(f)
diab_params = nn_params['diab']
heart_params = nn_params['heart']

diab_nn = MLPClassifier(**diab_params)
diab_start = time()
diab_nn.fit(X_train_diab, y_train_diab)
diab_end = time()

heart_nn = MLPClassifier(**heart_params)
heart_start = time()
heart_nn.fit(X_train_heart, y_train_heart)
heart_end = time()

# Fit durations in seconds, ordered [diabetes, heart disease].
train_times['NN'] = [diab_end-diab_start, heart_end-heart_start]

In [12]:
# Evaluate the neural networks on the held-out sets. Each timed span wraps
# the get_recall call, i.e. prediction plus the recall computation.
diab_start = time()
diab_score = get_recall(diab_nn, X_test_diab, y_test_diab)
diab_end = time()

heart_start = time()
heart_score = get_recall(heart_nn, X_test_heart, y_test_heart)
heart_end = time()

# Record results as [diabetes, heart disease].
test_times.update(NN=[diab_end - diab_start, heart_end - heart_start])
scores.update(NN=[diab_score, heart_score])
acc_scores.update(NN=[
    diab_nn.score(X_test_diab, y_test_diab),
    heart_nn.score(X_test_heart, y_test_heart),
])

## AdaBoost

In [13]:
import pickle as pk

# Read the tuned hyper-parameters once, before any timer starts, so the
# recorded durations measure model fitting only and not file I/O or
# unpickling.
# NOTE(security): pickle.load executes arbitrary code embedded in the file;
# only load params files that were produced locally / come from a trusted
# source.
with open('params/ada_params.pk', 'rb') as f:
    ada_params = pk.load(f)
diab_params = ada_params['diab']
heart_params = ada_params['heart']

diab_ada = AdaBoostClassifier(**diab_params)
diab_start = time()
diab_ada.fit(X_train_diab, y_train_diab)
diab_end = time()

heart_ada = AdaBoostClassifier(**heart_params)
heart_start = time()
heart_ada.fit(X_train_heart, y_train_heart)
heart_end = time()

# Fit durations in seconds, ordered [diabetes, heart disease].
train_times['Ada'] = [diab_end-diab_start, heart_end-heart_start]

In [14]:
# Evaluate the AdaBoost models on the held-out sets. Each timed span wraps
# the get_recall call, i.e. prediction plus the recall computation.
diab_start = time()
diab_score = get_recall(diab_ada, X_test_diab, y_test_diab)
diab_end = time()

heart_start = time()
heart_score = get_recall(heart_ada, X_test_heart, y_test_heart)
heart_end = time()

# Record results as [diabetes, heart disease].
test_times.update(Ada=[diab_end - diab_start, heart_end - heart_start])
scores.update(Ada=[diab_score, heart_score])
acc_scores.update(Ada=[
    diab_ada.score(X_test_diab, y_test_diab),
    heart_ada.score(X_test_heart, y_test_heart),
])

## kNN

In [15]:
# Read the tuned hyper-parameters once, before any timer starts, so the
# recorded durations measure model fitting only and not file I/O or JSON
# parsing (which can dominate for a fast-fitting model).
with open('params/knn_params.json') as f:
    knn_params = json.load(f)
diab_params = knn_params['diab']
heart_params = knn_params['heart']

diab_knn = KNeighborsClassifier(**diab_params)
diab_start = time()
diab_knn.fit(X_train_diab, y_train_diab)
diab_end = time()

heart_knn = KNeighborsClassifier(**heart_params)
heart_start = time()
heart_knn.fit(X_train_heart, y_train_heart)
heart_end = time()

# Fit durations in seconds, ordered [diabetes, heart disease].
train_times['kNN'] = [diab_end-diab_start, heart_end-heart_start]

In [16]:
# Evaluate the kNN models on the held-out sets. Each timed span wraps the
# get_recall call, i.e. prediction plus the recall computation.
diab_start = time()
diab_score = get_recall(diab_knn, X_test_diab, y_test_diab)
diab_end = time()

heart_start = time()
heart_score = get_recall(heart_knn, X_test_heart, y_test_heart)
heart_end = time()

# Record results as [diabetes, heart disease].
test_times.update(kNN=[diab_end - diab_start, heart_end - heart_start])
scores.update(kNN=[diab_score, heart_score])
acc_scores.update(kNN=[
    diab_knn.score(X_test_diab, y_test_diab),
    heart_knn.score(X_test_heart, y_test_heart),
])

## SVM

In [17]:
# Min-max scale the diabetes features: the scaler is fitted on the training
# split and the same fitted scaler is applied to the test split.
# NOTE: this rebinds X_train_diab / X_test_diab to the scaled arrays, so any
# cell run after this one sees scaled features.
scaler = MinMaxScaler()  # default feature_range is (0, 1)

X_train_diab, y_train_diab = (
    scaler.fit_transform(df_diab_train.drop(columns='Diabetes')),
    df_diab_train['Diabetes'],
)
X_test_diab, y_test_diab = (
    scaler.transform(df_diab_test.drop(columns='Diabetes')),
    df_diab_test['Diabetes'],
)

In [18]:
# Min-max scale the heart-disease features: the scaler is fitted on the
# training split and the same fitted scaler is applied to the test split.
# NOTE: this rebinds `scaler` and X_train_heart / X_test_heart, so any cell
# run after this one sees the scaled heart-disease features.
scaler = MinMaxScaler()  # default feature_range is (0, 1)

X_train_heart, y_train_heart = (
    scaler.fit_transform(df_heart_train.drop(columns='HeartDisease')),
    df_heart_train['HeartDisease'],
)
X_test_heart, y_test_heart = (
    scaler.transform(df_heart_test.drop(columns='HeartDisease')),
    df_heart_test['HeartDisease'],
)

In [None]:
# Read the tuned hyper-parameters once, before any timer starts, so the
# recorded durations measure model fitting only and not file I/O or JSON
# parsing.
with open('params/svm_params.json') as f:
    svm_params = json.load(f)
diab_params = svm_params['diab']
heart_params = svm_params['heart']

diab_svm = SVC(**diab_params)
diab_start = time()
diab_svm.fit(X_train_diab, y_train_diab)
diab_end = time()

heart_svm = SVC(**heart_params)
heart_start = time()
heart_svm.fit(X_train_heart, y_train_heart)
heart_end = time()

# Fit durations in seconds, ordered [diabetes, heart disease].
train_times['SVM'] = [diab_end-diab_start, heart_end-heart_start]

In [None]:
# Evaluate the SVMs on the held-out sets. Each timed span wraps the
# get_recall call, i.e. prediction plus the recall computation.
diab_start = time()
diab_score = get_recall(diab_svm, X_test_diab, y_test_diab)
diab_end = time()

heart_start = time()
heart_score = get_recall(heart_svm, X_test_heart, y_test_heart)
heart_end = time()

# Record results as [diabetes, heart disease].
test_times.update(SVM=[diab_end - diab_start, heart_end - heart_start])
scores.update(SVM=[diab_score, heart_score])
acc_scores.update(SVM=[
    diab_svm.score(X_test_diab, y_test_diab),
    heart_svm.score(X_test_heart, y_test_heart),
])

## Convert into visuals

In [None]:
# Side-by-side bar charts of per-model timings (log-scaled y axis); one bar
# group per model, one bar per dataset.
fig, axs = plt.subplots(1,2, figsize=(15,5))

train_times_df = pd.DataFrame(train_times, index=['Diabetes', 'Heart Disease'])

train_times_df.T.plot(kind='bar', logy=True, title='Train Times',ax=axs[0], grid=True, xlabel='Model', ylabel='Time (log scale)');

# Build this frame from test_times: it was previously built from train_times,
# so the 'Test Times' panel silently repeated the training durations.
test_times_df = pd.DataFrame(test_times, index=['Diabetes', 'Heart Disease'])

test_times_df.T.plot(kind='bar', logy=True, title='Test Times', xlabel='Model', ax=axs[1],grid=True, ylabel='Time (log scale)');

In [None]:
# Side-by-side bar charts of recall and accuracy per model; one bar group per
# model, one bar per dataset.
fig, axs = plt.subplots(1,2, figsize=(15,5))
scores_df = pd.DataFrame(scores, index=['Diabetes', 'Heart Disease'])

# These axes are linear (no logy=True), so the y labels must not claim a log
# scale as they previously did.
scores_df.T.plot(kind='bar', title='Recall Scores for All Models', ax=axs[0], grid=True, xlabel='Model', ylabel='Recall');

acc_scores_df = pd.DataFrame(acc_scores, index=['Diabetes', 'Heart Disease'])

acc_scores_df.T.plot(kind='bar', title='Accuracy Scores for All Models',ax=axs[1], grid=True, xlabel='Model', ylabel='Accuracy');

In [None]:
# Recall per model: rows are datasets (Diabetes, Heart Disease), columns are models.
scores_df

In [None]:
# Accuracy per model: rows are datasets (Diabetes, Heart Disease), columns are models.
acc_scores_df