## Slipt data into train, validation, and test

In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix


import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)


# ML
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Helpers functions

def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))
    
    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

### Read in Data

In [3]:
df = pd.read_csv('datasets/hdps_cleaned.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,0.0,4.0,150.0,407.0,0.0,2.0,154.0,0.0,4.0,2.0,3.0,7.0,1
1,59.0,1.0,2.0,140.0,221.0,0.0,0.0,164.0,1.0,0.0,1.0,0.0,3.0,0
2,64.0,1.0,3.0,125.0,309.0,0.0,0.0,131.0,1.0,1.8,2.0,0.0,7.0,1
3,52.0,1.0,4.0,108.0,233.0,1.0,0.0,147.0,0.0,0.1,1.0,3.0,7.0,0
4,47.0,1.0,4.0,110.0,275.0,0.0,2.0,118.0,1.0,1.0,2.0,1.0,3.0,1


### One Hot encoding

In [4]:
dataset = pd.get_dummies(df, columns=['cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal'], drop_first=True)

In [5]:
dataset.head()

Unnamed: 0,age,sex,trestbps,chol,thalach,oldpeak,target,cp_2.0,cp_3.0,cp_4.0,...,restecg_1.0,restecg_2.0,exang_1.0,slope_2.0,slope_3.0,ca_1.0,ca_2.0,ca_3.0,thal_6.0,thal_7.0
0,63.0,0.0,150.0,407.0,154.0,4.0,1,0,0,1,...,0,1,0,1,0,0,0,1,0,1
1,59.0,1.0,140.0,221.0,164.0,0.0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
2,64.0,1.0,125.0,309.0,131.0,1.8,1,0,1,0,...,0,0,1,1,0,0,0,0,0,1
3,52.0,1.0,108.0,233.0,147.0,0.1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1
4,47.0,1.0,110.0,275.0,118.0,1.0,1,0,0,1,...,0,1,1,1,0,1,0,0,0,0


### split into train (0.6), validation (0.2) and test (0.2) set

In [6]:
features = dataset.drop('target', axis=1)
labels = dataset['target']

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.4, random_state=14)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=14)

In [7]:
for dataset in [y_train, y_val, y_test]:
    print(round(len(dataset)/ len(labels), 2))

0.6
0.2
0.2


## Support Vector Machine

In [8]:
SVC()

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [9]:
svc = SVC()
parameters = {
    'kernel': ['linear', 'sigmoid', 'rbf'],
    'C': [0.1, 1, 10, 30, 50, 100]
}

cv = GridSearchCV(svc, parameters, cv=5)
cv.fit(X_train, y_train)

print_results(cv)

BEST PARAMS: {'C': 10, 'kernel': 'linear'}

0.825 (+/-0.059) for {'C': 0.1, 'kernel': 'linear'}
0.554 (+/-0.008) for {'C': 0.1, 'kernel': 'sigmoid'}
0.554 (+/-0.008) for {'C': 0.1, 'kernel': 'rbf'}
0.825 (+/-0.054) for {'C': 1, 'kernel': 'linear'}
0.554 (+/-0.008) for {'C': 1, 'kernel': 'sigmoid'}
0.778 (+/-0.105) for {'C': 1, 'kernel': 'rbf'}
0.834 (+/-0.072) for {'C': 10, 'kernel': 'linear'}
0.554 (+/-0.008) for {'C': 10, 'kernel': 'sigmoid'}
0.776 (+/-0.103) for {'C': 10, 'kernel': 'rbf'}
0.819 (+/-0.077) for {'C': 30, 'kernel': 'linear'}
0.554 (+/-0.008) for {'C': 30, 'kernel': 'sigmoid'}
0.776 (+/-0.103) for {'C': 30, 'kernel': 'rbf'}
0.81 (+/-0.069) for {'C': 50, 'kernel': 'linear'}
0.554 (+/-0.008) for {'C': 50, 'kernel': 'sigmoid'}
0.776 (+/-0.103) for {'C': 50, 'kernel': 'rbf'}
0.793 (+/-0.106) for {'C': 100, 'kernel': 'linear'}
0.554 (+/-0.008) for {'C': 100, 'kernel': 'sigmoid'}
0.776 (+/-0.103) for {'C': 100, 'kernel': 'rbf'}


> ### 1. Training the models

In [10]:
svm1 = SVC(C=10, kernel='linear')
svm1.fit(X_train, y_train)

svm2 = SVC(C=0.1, kernel='linear')
svm2.fit(X_train, y_train)

svm3 = SVC(C=1, kernel='linear')
svm3.fit(X_train, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

> ### 2.  Evaluate models on Validation set

In [11]:
for mdl in [svm1, svm2, svm3]:
    y_pred = mdl.predict(X_val)
    accuracy = round(accuracy_score(y_val, y_pred),3)
    precision = round(precision_score(y_val, y_pred),3)
    recall = round(recall_score(y_val, y_pred),3)
    print('Kernel: {} & C: {} -- A: {} / P: {} / R: {}'. format(mdl.kernel,
                                                                mdl.C,
                                                                accuracy,
                                                                precision,
                                                                recall))

Kernel: linear & C: 10 -- A: 0.843 / P: 0.891 / R: 0.759
Kernel: linear & C: 0.1 -- A: 0.826 / P: 0.854 / R: 0.759
Kernel: linear & C: 1 -- A: 0.852 / P: 0.894 / R: 0.778


> ### 3. Evaluate the best model on the test set

In [12]:
for mdl in [svm1, svm2, svm3]:
    y_pred = mdl.predict(X_test)
    accuracy = round(accuracy_score(y_test, y_pred),3)
    precision = round(precision_score(y_test, y_pred),3)
    recall = round(recall_score(y_val, y_pred),3)
    print('Kernel: {} & C: {} -- A: {} / P: {} / R: {}'. format(mdl.kernel,
                                                                mdl.C,
                                                                accuracy,
                                                                precision,
                                                                recall))





# y_pred = svm3.predict(X_test)
# accuracy = round(accuracy_score(y_test, y_pred), 3)
# precision = round(precision_score(y_test, y_pred), 3)
# recall = round(recall_score(y_test, y_pred), 3)
# print('Kernel: {} & C: {} -- A: {} / P: {} / R: {}'. format(svm3.kernel,
#                                                                 svm3.C,
#                                                                 accuracy,
#                                                                 precision,
#                                                                 recall))

Kernel: linear & C: 10 -- A: 0.87 / P: 0.894 / R: 0.352
Kernel: linear & C: 0.1 -- A: 0.852 / P: 0.872 / R: 0.333
Kernel: linear & C: 1 -- A: 0.87 / P: 0.878 / R: 0.37


> ### Write out pickled model

In [13]:
joblib.dump(svm1, 'models/SVM_model.pkl')

['models/SVM_model.pkl']

## Decision Trees

In [14]:
DecisionTreeClassifier()

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [15]:
import numpy as np

dt = DecisionTreeClassifier()

depth_range = range(1, 15)
leaf_range = range(1,20)
criterion = ['gini', 'entropy']

param_grid = dict(max_depth=depth_range, min_samples_leaf=leaf_range, criterion=criterion)
# parameters = {
#     'criterion':['gini','entropy'],
#     'max_depth': np.arange(1, len(X_train))
# }

cv = GridSearchCV(dt, param_grid, cv=5)
cv.fit(X_train, y_train)

print_results(cv)

BEST PARAMS: {'criterion': 'entropy', 'max_depth': 12, 'min_samples_leaf': 1}

0.729 (+/-0.111) for {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 1}
0.729 (+/-0.111) for {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 2}
0.729 (+/-0.111) for {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 3}
0.729 (+/-0.111) for {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 4}
0.729 (+/-0.111) for {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 5}
0.729 (+/-0.111) for {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 6}
0.729 (+/-0.111) for {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 7}
0.729 (+/-0.111) for {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 8}
0.729 (+/-0.111) for {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 9}
0.729 (+/-0.111) for {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 10}
0.729 (+/-0.111) for {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 11}
0.729 (+/-0.111) 

> ### 1.  Training the best 3 models

In [16]:
dt1 = DecisionTreeClassifier(criterion='entropy', max_depth=8, min_samples_leaf=2)
dt1.fit(X_train, y_train)

dt2 = DecisionTreeClassifier(criterion='entropy', max_depth=8, min_samples_leaf=1)
dt2.fit(X_train, y_train)

dt3 = DecisionTreeClassifier(criterion='entropy', max_depth=12, min_samples_leaf=1)
dt3.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=12,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

> ### 2.  Evaluate models on Validation set

In [17]:
for mdl in [dt1, dt2, dt3]:
    y_pred = mdl.predict(X_val)
    accuracy = round(accuracy_score(y_val, y_pred),3)
    precision = round(precision_score(y_val, y_pred),3)
    recall = round(recall_score(y_val, y_pred),3)
    print('Criterion: {}, Max Depth: {} & Min Samples Leaf: {} -- A: {} / P: {} / R: {}'. format(mdl.criterion,
                                                                                                 mdl.max_depth,
                                                                                                 mdl.min_samples_leaf,
                                                                                                 accuracy,
                                                                                                 precision,
                                                                                                 recall))
                                                                                                 

Criterion: entropy, Max Depth: 8 & Min Samples Leaf: 2 -- A: 0.835 / P: 0.83 / R: 0.815
Criterion: entropy, Max Depth: 8 & Min Samples Leaf: 1 -- A: 0.861 / P: 0.797 / R: 0.944
Criterion: entropy, Max Depth: 12 & Min Samples Leaf: 1 -- A: 0.887 / P: 0.836 / R: 0.944


> ### 3.  Evaluate the best models on the Test set

In [18]:
for mdl in [dt1, dt2, dt3]:
    y_pred = mdl.predict(X_test)
    accuracy = round(accuracy_score(y_test, y_pred),3)
    precision = round(precision_score(y_test, y_pred),3)
    recall = round(recall_score(y_test, y_pred),3)
    print('Criterion: {}, Max Depth: {} & Min Samples Leaf: {} -- A: {} / P: {} / R: {}'. format(mdl.criterion,
                                                                                                 mdl.max_depth,
                                                                                                 mdl.min_samples_leaf,
                                                                                                 accuracy,
                                                                                                 precision,
                                                                                                 recall))

# y_pred = dt2.predict(X_test)
# accuracy = round(accuracy_score(y_test, y_pred),3)
# precision = round(precision_score(y_test, y_pred),3)
# recall = round(recall_score(y_val, y_pred),3)
# print('Criterion: {}, Max Depth: {} & Min Samples Leaf: {} -- A: {} / P: {} / R: {}'. format(dt2.criterion,
#                                                                                                  dt2.max_depth,
#                                                                                                  dt2.min_samples_leaf,
#                                                                                                  accuracy,
#                                                                                                  precision,
#                                                                                                  recall))

Criterion: entropy, Max Depth: 8 & Min Samples Leaf: 2 -- A: 0.878 / P: 0.865 / R: 0.865
Criterion: entropy, Max Depth: 8 & Min Samples Leaf: 1 -- A: 0.878 / P: 0.852 / R: 0.885
Criterion: entropy, Max Depth: 12 & Min Samples Leaf: 1 -- A: 0.913 / P: 0.875 / R: 0.942


> ### Write out pickled model

In [19]:
joblib.dump(dt3, 'models/DT_model.pkl')

['models/DT_model.pkl']

##  K Nearest Neighbor

In [20]:
KNeighborsClassifier()

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [21]:
knn = KNeighborsClassifier()

k_range = list(range(1,31))
weight_options = ["uniform", "distance"]
metrics = ['euclidean', 'manhattan']

param_grid = dict(n_neighbors= k_range, weights=weight_options, metric=metrics)

cv = GridSearchCV(knn, param_grid, cv=5)
cv.fit(X_train, y_train)

print_results(cv)

BEST PARAMS: {'metric': 'manhattan', 'n_neighbors': 26, 'weights': 'distance'}

0.773 (+/-0.185) for {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.773 (+/-0.185) for {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.708 (+/-0.087) for {'metric': 'euclidean', 'n_neighbors': 2, 'weights': 'uniform'}
0.773 (+/-0.185) for {'metric': 'euclidean', 'n_neighbors': 2, 'weights': 'distance'}
0.726 (+/-0.079) for {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.813 (+/-0.11) for {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.706 (+/-0.061) for {'metric': 'euclidean', 'n_neighbors': 4, 'weights': 'uniform'}
0.799 (+/-0.156) for {'metric': 'euclidean', 'n_neighbors': 4, 'weights': 'distance'}
0.694 (+/-0.062) for {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.799 (+/-0.123) for {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.714 (+/-0.08) for {'metric': 'euclidean', 'n_neighbors': 6, 'wei

In [22]:
# Training the 3 models

knn1 = KNeighborsClassifier(n_neighbors=26, metric='manhattan', weights='distance')
knn1.fit(X_train, y_train)

knn2 = KNeighborsClassifier(n_neighbors=24, metric='manhattan', weights='distance')
knn2.fit(X_train, y_train)

knn3 = KNeighborsClassifier(n_neighbors=28, metric='manhattan', weights='distance')
knn3.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=None, n_neighbors=28, p=2,
                     weights='distance')

In [23]:
# Evaluate on the validation set

for mdl in [knn1, knn2, knn3]:
    y_pred = mdl.predict(X_val)
    accuracy = round(accuracy_score(y_val, y_pred),3)
    precision = round(precision_score(y_val, y_pred),3)
    recall = round(recall_score(y_val, y_pred),3)
    print('Neighbors: {}, Metric: {} & Weights: {} -- A: {} / P: {} / R: {}'. format(mdl.n_neighbors,
                                                                                                 mdl.metric,
                                                                                                 mdl.weights,
                                                                                                 accuracy,
                                                                                                 precision,
                                                                                                 recall))
                                                                                                 

Neighbors: 26, Metric: manhattan & Weights: distance -- A: 0.861 / P: 0.839 / R: 0.87
Neighbors: 24, Metric: manhattan & Weights: distance -- A: 0.852 / P: 0.825 / R: 0.87
Neighbors: 28, Metric: manhattan & Weights: distance -- A: 0.87 / P: 0.868 / R: 0.852


In [24]:
# Evaluate the best model on the Test set

for mdl in [knn1, knn2, knn3]:
    y_pred = mdl.predict(X_test)
    accuracy = round(accuracy_score(y_test, y_pred),4)
    precision = round(precision_score(y_test, y_pred),4)
    recall = round(recall_score(y_test, y_pred),3)
    print('Neighbors: {}, Metric: {} & Weights: {} -- A: {} / P: {} / R: {}'. format(mdl.n_neighbors,
                                                                                                 mdl.metric,
                                                                                                 mdl.weights,
                                                                                                 accuracy,
                                                                                                 precision,
                                                                                                 recall))

# y_pred = knn3.predict(X_test)
# accuracy = round(accuracy_score(y_test, y_pred),3)
# precision = round(precision_score(y_test, y_pred),3)
# recall = round(recall_score(y_val, y_pred),3)
# print('Neighbors: {}, Metric: {} & Weights: {} -- A: {} / P: {} / R: {}'. format(knn3.n_neighbors,
#                                                                                                  knn3.metric,
#                                                                                                  knn3.weights,
#                                                                                                  accuracy,
#                                                                                                  precision,
#                                                                                                  recall))

Neighbors: 26, Metric: manhattan & Weights: distance -- A: 0.8435 / P: 0.9048 / R: 0.731
Neighbors: 24, Metric: manhattan & Weights: distance -- A: 0.8435 / P: 0.8864 / R: 0.75
Neighbors: 28, Metric: manhattan & Weights: distance -- A: 0.8348 / P: 0.9024 / R: 0.712


In [25]:
joblib.dump(knn1, 'models/KNN_model.pkl')

['models/KNN_model.pkl']

## Stacking (SVM, KNN, Decision Tree) Meta leaner Logistic Regression

In [28]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingClassifier

### Read in Models

In [29]:
svm_model = joblib.load('models/SVM_model.pkl')
dt_model = joblib.load('models/DT_model.pkl')
knn_model = joblib.load('models/KNN_model.pkl')
lr = LogisticRegression()

sclf = StackingClassifier(classifiers=[svm_model, dt_model, knn_model], 
                          meta_classifier=lr)

print('3-fold cross validation:\n')

for clf, label in zip([svm_model, dt_model, knn_model, sclf], 
                      ['SVM', 
                       'Decision Tree', 
                       'KNN',
                       'StackingClassifier']):

    scores = model_selection.cross_val_score(clf, X_train, y_train, 
                                              cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))

3-fold cross validation:

Accuracy: 0.83 (+/- 0.04) [SVM]
Accuracy: 0.84 (+/- 0.04) [Decision Tree]
Accuracy: 0.86 (+/- 0.04) [KNN]
Accuracy: 0.88 (+/- 0.05) [StackingClassifier]


In [34]:

y_pred = sclf.predict(X_val)
accuracy = round(accuracy_score(y_val, y_pred),3)
precision = round(precision_score(y_val, y_pred),3)
recall = round(recall_score(y_val, y_pred),3)
print(' A: {} / P: {} / R: {}'. format(accuracy,precision,recall))
                                                                                                 





# print('5-fold cross validation:\n')

# for clf, label in zip([svm_model, dt_model, knn_model, sclf], 
#                       ['SVM', 
#                        'Decision Tree', 
#                        'KNN',
#                        'StackingClassifier']):

#     scores = model_selection.cross_val_score(clf, X_val, y_val, 
#                                               cv=5, scoring='accuracy')
#     print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
#           % (scores.mean(), scores.std(), label))


5-fold cross validation:

Accuracy: 0.79 (+/- 0.05) [SVM]
Accuracy: 0.78 (+/- 0.06) [Decision Tree]
Accuracy: 0.70 (+/- 0.09) [KNN]
Accuracy: 0.78 (+/- 0.04) [StackingClassifier]


In [35]:
print('5-fold cross validation:\n')

for clf, label in zip([svm_model, dt_model, knn_model, sclf], 
                      ['SVM', 
                       'Decision Tree', 
                       'KNN',
                       'StackingClassifier']):

    scores = model_selection.cross_val_score(clf, X_test, y_test, 
                                              cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))

5-fold cross validation:

Accuracy: 0.78 (+/- 0.13) [SVM]
Accuracy: 0.77 (+/- 0.10) [Decision Tree]
Accuracy: 0.73 (+/- 0.08) [KNN]
Accuracy: 0.80 (+/- 0.14) [StackingClassifier]


### models = {}

# Comment
# for mld in ['SVM', 'DT', 'KNN']:
#     models[mdl] = joblib.load('models/{}_models.pkl'.format(mdl))