In [164]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from numpy import mean, std
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [165]:
df = pd.read_csv('heart.csv')

In [166]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.00,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.10,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.60,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.00,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.90,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.00,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.80,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.00,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.00,2,0,2,1


In [167]:
df.count()

age         1025
sex         1025
cp          1025
trestbps    1025
chol        1025
fbs         1025
restecg     1025
thalach     1025
exang       1025
oldpeak     1025
slope       1025
ca          1025
thal        1025
target      1025
dtype: int64

In [168]:
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [169]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.43,0.7,0.94,131.61,246.0,0.15,0.53,149.11,0.34,1.07,1.39,0.75,2.32,0.51
std,9.07,0.46,1.03,17.52,51.59,0.36,0.53,23.01,0.47,1.18,0.62,1.03,0.62,0.5
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


# Split the dataset into:
# x = independent variables
# y = dependent/target variables

In [170]:
x = df.drop(['target'], axis=1)
y = df[['target']]

In [171]:
x

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52,1,0,125,212,0,1,168,0,1.00,2,2,3
1,53,1,0,140,203,1,0,155,1,3.10,0,0,3
2,70,1,0,145,174,0,1,125,1,2.60,0,0,3
3,61,1,0,148,203,0,1,161,0,0.00,2,1,3
4,62,0,0,138,294,1,1,106,0,1.90,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.00,2,0,2
1021,60,1,0,125,258,0,0,141,1,2.80,1,1,3
1022,47,1,0,110,275,0,0,118,1,1.00,1,1,2
1023,50,0,0,110,254,0,0,159,0,0.00,2,0,2


In [172]:
y

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0
...,...
1020,1
1021,0
1022,0
1023,1


# Hold-out set = 70/30
# or test size of 30%

In [173]:
trainX, testX, trainY, testY = train_test_split(x, y, test_size=0.3)

# DTC

In [174]:
dtc = DecisionTreeClassifier()
dtc.fit(trainX, trainY)

DecisionTreeClassifier()

In [175]:
y_predicted = dtc.predict(testX)

In [176]:
conf = confusion_matrix(testY, y_predicted)

In [177]:
tp = conf[0][0]
fn = conf[0][1]
fp = conf[1][0]
tn = conf[1][1]

precision = tp/(tp+fp)
recall = tp/(tp+fn)
accuracy = (tp+tn)/(tp+fn+fp+tn)
sensitivity = tp/(tp+fn)
specificity = tn/(tp+tn)
f1 = 2*(precision*recall)/(precision+recall)
dtcres = precision, recall, accuracy, sensitivity, specificity, f1
dtcpre, dtcrec, dtcacc, dtcsen, dtcspe, dtcf1 = precision, recall, accuracy, sensitivity, specificity, f1
precision, recall, accuracy, sensitivity, specificity, f1

(0.9805194805194806,
 0.9741935483870968,
 0.9772727272727273,
 0.9741935483870968,
 0.4983388704318937,
 0.9773462783171522)

# NB

In [178]:
nb = GaussianNB()
nb.fit(trainX, trainY.values.ravel())

GaussianNB()

In [179]:
y_predicted = nb.predict(testX)

In [180]:
conf = confusion_matrix(testY, y_predicted)

In [181]:
tp = conf[0][0]
fn = conf[0][1]
fp = conf[1][0]
tn = conf[1][1]

precision = tp/(tp+fp)
recall = tp/(tp+fn)
accuracy = (tp+tn)/(tp+fn+fp+tn)
sensitivity = tp/(tp+fn)
specificity = tn/(tp+tn)
f1 = 2*(precision*recall)/(precision+recall)
nbres = precision, recall, accuracy, sensitivity, specificity, f1
nbpre, nbrec, nbacc, nbsen, nbspe, nbf1 = precision, recall, accuracy, sensitivity, specificity, f1
precision, recall, accuracy, sensitivity, specificity, f1

(0.8533333333333334,
 0.8258064516129032,
 0.8409090909090909,
 0.8258064516129032,
 0.5057915057915058,
 0.839344262295082)

# SVM

In [182]:
svm = SVC(kernel='linear')
svm.fit(trainX, trainY.values.ravel())

SVC(kernel='linear')

In [183]:
y_predicted = svm.predict(testX)

In [184]:
conf = confusion_matrix(testY, y_predicted)

In [185]:
tp = conf[0][0]
fn = conf[0][1]
fp = conf[1][0]
tn = conf[1][1]

precision = tp/(tp+fp)
recall = tp/(tp+fn)
accuracy = (tp+tn)/(tp+fn+fp+tn)
sensitivity = tp/(tp+fn)
specificity = tn/(tp+tn)
f1 = 2*(precision*recall)/(precision+recall)
svmres = precision, recall, accuracy, sensitivity, specificity, f1
svmpre, svmrec, svmacc, svmsen, svmspe, svmf1 = precision, recall, accuracy, sensitivity, specificity, f1
precision, recall, accuracy, sensitivity, specificity, f1

(0.9318181818181818,
 0.7935483870967742,
 0.8668831168831169,
 0.7935483870967742,
 0.5393258426966292,
 0.8571428571428571)

# RFC

In [186]:
rfc = RandomForestClassifier()
rfc.fit(trainX, trainY.values.ravel())

RandomForestClassifier()

In [187]:
y_predicted = rfc.predict(testX)

In [188]:
conf = confusion_matrix(testY, y_predicted)

In [189]:
tp = conf[0][0]
fn = conf[0][1]
fp = conf[1][0]
tn = conf[1][1]

precision = tp/(tp+fp)
recall = tp/(tp+fn)
accuracy = (tp+tn)/(tp+fn+fp+tn)
sensitivity = tp/(tp+fn)
specificity = tn/(tp+tn)
f1 = 2*(precision*recall)/(precision+recall)
rfcres = precision, recall, accuracy, sensitivity, specificity, f1
rfcpre, rfcrec, rfcacc, rfcsen, rfcspe, rfcf1 = precision, recall, accuracy, sensitivity, specificity, f1
precision, recall, accuracy, sensitivity, specificity, f1

(0.9810126582278481,
 1.0,
 0.9902597402597403,
 1.0,
 0.4918032786885246,
 0.9904153354632589)

# KNN

In [190]:
knn = KNeighborsClassifier()
knn.fit(trainX, trainY.values.ravel())

KNeighborsClassifier()

In [191]:
y_predicted = knn.predict(testX)

In [192]:
conf = confusion_matrix(testY, y_predicted)

In [193]:
tp = conf[0][0]
fn = conf[0][1]
fp = conf[1][0]
tn = conf[1][1]

precision = tp/(tp+fp)
recall = tp/(tp+fn)
accuracy = (tp+tn)/(tp+fn+fp+tn)
sensitivity = tp/(tp+fn)
specificity = tn/(tp+tn)
f1 = 2*(precision*recall)/(precision+recall)
knnres = precision, recall, accuracy, sensitivity, specificity, f1
knnpre, knnrec, knnacc, knnsen, knnspe, knnf1 = precision, recall, accuracy, sensitivity, specificity, f1
precision, recall, accuracy, sensitivity, specificity, f1

(0.6797385620915033,
 0.6709677419354839,
 0.6753246753246753,
 0.6709677419354839,
 0.5,
 0.6753246753246753)

# LR

In [194]:
# lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr = LogisticRegression(max_iter=1000)
lr.fit(trainX, trainY.values.ravel())

LogisticRegression(max_iter=1000)

In [195]:
y_predicted = lr.predict(testX)

In [196]:
conf = confusion_matrix(testY, y_predicted)

In [197]:
tp = conf[0][0]
fn = conf[0][1]
fp = conf[1][0]
tn = conf[1][1]

precision = tp/(tp+fp)
recall = tp/(tp+fn)
accuracy = (tp+tn)/(tp+fn+fp+tn)
sensitivity = tp/(tp+fn)
specificity = tn/(tp+tn)
f1 = 2*(precision*recall)/(precision+recall)
lrres = precision, recall, accuracy, sensitivity, specificity, f1
lrpre, lrrec, lracc, lrsen, lrspe, lrf1 = precision, recall, accuracy, sensitivity, specificity, f1
precision, recall, accuracy, sensitivity, specificity, f1

(0.9148936170212766,
 0.832258064516129,
 0.8766233766233766,
 0.832258064516129,
 0.5222222222222223,
 0.8716216216216215)

# Results

In [198]:
print('__________________________________________________________________________________________________________________________')
print("DTC:\n", dtcres, "\nNB:\n", nbres, "\nSVM:\n", svmres, "\nRFC:\n", rfcres, "\nKNN:\n", knnres, "\nLR:\n", lrres)
print('__________________________________________________________________________________________________________________________')

__________________________________________________________________________________________________________________________
DTC:
 (0.9805194805194806, 0.9741935483870968, 0.9772727272727273, 0.9741935483870968, 0.4983388704318937, 0.9773462783171522) 
NB:
 (0.8533333333333334, 0.8258064516129032, 0.8409090909090909, 0.8258064516129032, 0.5057915057915058, 0.839344262295082) 
SVM:
 (0.9318181818181818, 0.7935483870967742, 0.8668831168831169, 0.7935483870967742, 0.5393258426966292, 0.8571428571428571) 
RFC:
 (0.9810126582278481, 1.0, 0.9902597402597403, 1.0, 0.4918032786885246, 0.9904153354632589) 
KNN:
 (0.6797385620915033, 0.6709677419354839, 0.6753246753246753, 0.6709677419354839, 0.5, 0.6753246753246753) 
LR:
 (0.9148936170212766, 0.832258064516129, 0.8766233766233766, 0.832258064516129, 0.5222222222222223, 0.8716216216216215)
__________________________________________________________________________________________________________________________


# Tabled Results

In [199]:
results = [("DTC", dtcpre, dtcrec, dtcacc, dtcsen, dtcspe, dtcf1),
          ("NB", nbpre, nbrec, nbacc, nbsen, nbspe, nbf1),
          ("SVM", svmpre, svmrec, svmacc, svmsen, svmspe, svmf1),
          ("RFC", rfcpre, rfcrec, rfcacc, rfcsen, rfcspe, rfcf1),
          ("KNN", knnpre, knnrec, knnacc, knnsen, knnspe, knnf1),
          ("LR", lrpre, lrrec, lracc, lrsen, lrspe, lrf1)]
          
pd.options.display.float_format = '{:,.2f}'.format
dtable = pd.DataFrame(results, columns = ['MODEL', 'Precision', 'Recall', 'Accuracy', 'Sensitivity', 'Specificity', 'F1-Score'])
# dtable.sort_values(by=['Accuracy'], ascending=False)
dtable

Unnamed: 0,MODEL,Precision,Recall,Accuracy,Sensitivity,Specificity,F1-Score
0,DTC,0.98,0.97,0.98,0.97,0.5,0.98
1,NB,0.85,0.83,0.84,0.83,0.51,0.84
2,SVM,0.93,0.79,0.87,0.79,0.54,0.86
3,RFC,0.98,1.0,0.99,1.0,0.49,0.99
4,KNN,0.68,0.67,0.68,0.67,0.5,0.68
5,LR,0.91,0.83,0.88,0.83,0.52,0.87


# Performance by Precision

In [200]:
results = [("DTC", dtcpre),
          ("NB", nbpre),
          ("SVM", svmpre),
          ("RFC", rfcpre),
          ("KNN", knnpre),
          ("LR", lrpre)]
pd.options.display.float_format = '{:,.2f}'.format
dtable = pd.DataFrame(results, columns = ['MODEL', 'Precision'])
dtable.sort_values(by=['Precision'], ascending=False)
# dtable

Unnamed: 0,MODEL,Precision
3,RFC,0.98
0,DTC,0.98
2,SVM,0.93
5,LR,0.91
1,NB,0.85
4,KNN,0.68


# Performance by Recall

In [201]:
results = [("DTC", dtcrec),
          ("NB", nbrec),
          ("SVM", svmrec),
          ("RFC", rfcrec),
          ("KNN", knnrec),
          ("LR", lrrec)]
pd.options.display.float_format = '{:,.2f}'.format
dtable = pd.DataFrame(results, columns = ['MODEL', 'Recall'])
dtable.sort_values(by=['Recall'], ascending=False)
# dtable

Unnamed: 0,MODEL,Recall
3,RFC,1.0
0,DTC,0.97
5,LR,0.83
1,NB,0.83
2,SVM,0.79
4,KNN,0.67


# Performance by Specificity

In [202]:
results = [("DTC", dtcspe),
          ("NB", nbspe),
          ("SVM", svmspe),
          ("RFC", rfcspe),
          ("KNN", knnspe),
          ("LR", lrspe)]
pd.options.display.float_format = '{:,.2f}'.format
dtable = pd.DataFrame(results, columns = ['MODEL', 'Specificity'])
dtable.sort_values(by=['Specificity'], ascending=False)
# dtable

Unnamed: 0,MODEL,Specificity
2,SVM,0.54
5,LR,0.52
1,NB,0.51
4,KNN,0.5
0,DTC,0.5
3,RFC,0.49


# Performance by F1-Score

In [203]:
results = [("DTC", dtcf1),
          ("NB", nbf1),
          ("SVM", svmf1),
          ("RFC", rfcf1),
          ("KNN", knnf1),
          ("LR", lrf1)]
# pd.options.display.float_format = '{:,.2f}'.format
pd.options.display.float_format = '{:.2f}'.format
dtable = pd.DataFrame(results, columns = ['MODEL', 'F1-Score'])
dtable.sort_values(by=['F1-Score'], ascending=False)
# dtable

Unnamed: 0,MODEL,F1-Score
3,RFC,0.99
0,DTC,0.98
5,LR,0.87
2,SVM,0.86
1,NB,0.84
4,KNN,0.68


# K-Fold
# n = 10

In [204]:
cv = KFold(n_splits=10, random_state=1, shuffle=True)
specificityScorer = make_scorer(recall_score, pos_label=0)

# Cross-validation

In [205]:
# DTC
dtcPreScore = cross_val_score(dtc, x, y, scoring='precision', cv=cv, n_jobs=-1)
dtcRecScore = cross_val_score(dtc, x, y, scoring='recall', cv=cv, n_jobs=-1)
dtcSpeScore = cross_val_score(dtc, x, y, scoring=specificityScorer, cv=cv, n_jobs=-1)
dtcF1Score = cross_val_score(dtc, x, y, scoring='f1', cv=cv, n_jobs=-1)
dtcCVRes = dtcPreScore, dtcRecScore, dtcSpeScore, dtcF1Score

In [206]:
# NB
nbPreScore = cross_val_score(nb, x, y, scoring='precision', cv=cv, n_jobs=-1)
nbRecScore = cross_val_score(nb, x, y, scoring='recall', cv=cv, n_jobs=-1)
nbSpeScore = cross_val_score(nb, x, y, scoring=specificityScorer, cv=cv, n_jobs=-1)
nbF1Score = cross_val_score(nb, x, y, scoring='f1', cv=cv, n_jobs=-1)
nbCVRes = nbPreScore, nbRecScore, nbSpeScore, nbF1Score

In [None]:
# SVM
svmPreScore = cross_val_score(svm, x, y, scoring='precision', cv=cv, n_jobs=-1)
svmRecScore = cross_val_score(svm, x, y, scoring='recall', cv=cv, n_jobs=-1)
svmSpeScore = cross_val_score(svm, x, y, scoring=specificityScorer, cv=cv, n_jobs=-1)
svmF1Score = cross_val_score(svm, x, y, scoring='f1', cv=cv, n_jobs=-1)
svmCVRes = svmPreScore, svmRecScore, svmSpeScore, svmF1Score

In [208]:
# RFC
rfcPreScore = cross_val_score(rfc, x, y, scoring='precision', cv=cv, n_jobs=-1)
rfcRecScore = cross_val_score(rfc, x, y, scoring='recall', cv=cv, n_jobs=-1)
rfcSpeScore = cross_val_score(rfc, x, y, scoring=specificityScorer, cv=cv, n_jobs=-1)
rfcF1Score = cross_val_score(rfc, x, y, scoring='f1', cv=cv, n_jobs=-1)
rfcCVRes = rfcPreScore, rfcRecScore, rfcSpeScore, rfcF1Score

In [209]:
# KNN
knnPreScore = cross_val_score(knn, x, y, scoring='precision', cv=cv, n_jobs=-1)
knnRecScore = cross_val_score(knn, x, y, scoring='recall', cv=cv, n_jobs=-1)
knnSpeScore = cross_val_score(knn, x, y, scoring=specificityScorer, cv=cv, n_jobs=-1)
knnF1Score = cross_val_score(knn, x, y, scoring='f1', cv=cv, n_jobs=-1)
knnCVRes = knnPreScore, knnRecScore, knnSpeScore, knnF1Score

In [210]:
# LR
lrPreScore = cross_val_score(lr, x, y, scoring='precision', cv=cv, n_jobs=-1)
lrRecScore = cross_val_score(lr, x, y, scoring='recall', cv=cv, n_jobs=-1)
lrSpeScore = cross_val_score(lr, x, y, scoring=specificityScorer, cv=cv, n_jobs=-1)
lrF1Score = cross_val_score(lr, x, y, scoring='f1', cv=cv, n_jobs=-1)
lrCVRes = lrPreScore, lrRecScore, lrSpeScore, lrF1Score

# K-Fold Cross Validation Results

In [216]:
scores = [("DTC", mean(dtcCVRes[0]), mean(dtcCVRes[1]), mean(dtcCVRes[2]), mean(dtcCVRes[3])),
          ("NB", mean(nbCVRes[0]), mean(nbCVRes[1]), mean(nbCVRes[2]), mean(nbCVRes[3])),
          ("SVM", mean(svmCVRes[0]), mean(svmCVRes[1]), mean(svmCVRes[2]), mean(svmCVRes[3])),
          ("RFC", mean(rfcCVRes[0]), mean(rfcCVRes[1]), mean(rfcCVRes[2]), mean(rfcCVRes[3])),
          ("KNN", mean(knnCVRes[0]), mean(knnCVRes[1]), mean(knnCVRes[2]), mean(knnCVRes[3])),
          ("LR", mean(lrCVRes[0]), mean(lrCVRes[1]), mean(lrCVRes[2]), mean(lrCVRes[3]))]

pd.options.display.float_format = '{:.2f}'.format
dtable = pd.DataFrame(scores, columns = ['MODEL', 'Precision', 'Recall', 'Specificity', 'F1-Score'])
# dtable.sort_values(by=['Precision'], ascending=False)
dtable

Unnamed: 0,MODEL,Precision,Recall,Specificity,F1-Score
0,DTC,1.0,1.0,1.0,1.0
1,NB,0.81,0.86,0.79,0.83
2,SVM,0.81,0.91,0.77,0.86
3,RFC,1.0,1.0,1.0,1.0
4,KNN,0.78,0.73,0.77,0.75
5,LR,0.82,0.89,0.79,0.85


# Performance by Precision

In [212]:
scores = [("DTC", mean(dtcCVRes[0])),
          ("NB", mean(nbCVRes[0])),
          ("SVM", mean(svmCVRes[0])),
          ("RFC", mean(rfcCVRes[0])),
          ("KNN", mean(knnCVRes[0])),
          ("LR", mean(lrCVRes[0]))]

pd.options.display.float_format = '{:.2f}'.format
dtable = pd.DataFrame(scores, columns = ['MODEL', 'Precision'])
dtable.sort_values(by=['Precision'], ascending=False)
# dtable

Unnamed: 0,MODEL,Precision
0,DTC,1.0
3,RFC,1.0
5,LR,0.82
1,NB,0.81
2,SVM,0.81
4,KNN,0.78


# Performance by Recall

In [213]:
scores = [("DTC", mean(dtcCVRes[1])),
          ("NB", mean(nbCVRes[1])),
          ("SVM", mean(svmCVRes[1])),
          ("RFC", mean(rfcCVRes[1])),
          ("KNN", mean(knnCVRes[1])),
          ("LR", mean(lrCVRes[1]))]

pd.options.display.float_format = '{:.2f}'.format
dtable = pd.DataFrame(scores, columns = ['MODEL', 'Recall'])
dtable.sort_values(by=['Recall'], ascending=False)
# dtable

Unnamed: 0,MODEL,Recall
0,DTC,1.0
3,RFC,1.0
2,SVM,0.91
5,LR,0.89
1,NB,0.86
4,KNN,0.73


# Performance by Specificity

In [214]:
scores = [("DTC", mean(dtcCVRes[2])),
          ("NB", mean(nbCVRes[2])),
          ("SVM", mean(svmCVRes[2])),
          ("RFC", mean(rfcCVRes[2])),
          ("KNN", mean(knnCVRes[2])),
          ("LR", mean(lrCVRes[2]))]

pd.options.display.float_format = '{:.2f}'.format
dtable = pd.DataFrame(scores, columns = ['MODEL', 'Specificity'])
dtable.sort_values(by=['Specificity'], ascending=False)
# dtable

Unnamed: 0,MODEL,Specificity
0,DTC,1.0
3,RFC,1.0
1,NB,0.79
5,LR,0.79
4,KNN,0.77
2,SVM,0.77


# Performance by F1-Score

In [215]:
scores = [("DTC", mean(dtcCVRes[3])),
          ("NB", mean(nbCVRes[3])),
          ("SVM", mean(svmCVRes[3])),
          ("RFC", mean(rfcCVRes[3])),
          ("KNN", mean(knnCVRes[3])),
          ("LR", mean(lrCVRes[3]))]

pd.options.display.float_format = '{:.2f}'.format
dtable = pd.DataFrame(scores, columns = ['MODEL', 'F1-Score'])
dtable.sort_values(by=['F1-Score'], ascending=False)
# dtable

Unnamed: 0,MODEL,F1-Score
0,DTC,1.0
3,RFC,1.0
2,SVM,0.86
5,LR,0.85
1,NB,0.83
4,KNN,0.75
