In [39]:
import matplotlib.pyplot as plt
import numpy as np
import csv

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import accuracy_score
from numpy import mean

In [40]:
# Load heart.csv into a feature matrix X and a label vector y.
# The first row is the header; in every following row the last column is
# taken as the label and the remaining columns as features.
X, y = [], []
# `with` guarantees the handle is closed even if parsing fails;
# newline='' is what the csv module expects for files it reads.
with open("heart.csv", 'r', newline='') as file:
  csv_reader = csv.reader(file)

  header = next(csv_reader)

  for features in csv_reader:
    if not features:
      # csv.reader yields [] for blank lines (e.g. a trailing newline).
      continue
    y.append(features[-1])
    X.append(features[:-1])

# Every cell must parse as a number; a non-numeric value raises ValueError.
X = np.array(X, dtype=float)
y = np.array(y, dtype=float)

In [41]:
# Baseline: fit each model on the full data set and score it on that same
# data. This is training accuracy, so it is an optimistic number.
#
# Nothing varies between repetitions here (same data, and neither estimator
# is given any source of randomness with these settings), so fitting once
# gives the same accuracy as averaging repeated identical fits.
model_logR = LogisticRegression(max_iter=2000, fit_intercept=True)
model_logR.fit(X, y)

model_KNN = KNeighborsClassifier(n_neighbors=5)
model_KNN.fit(X, y)

y_pred_logR = model_logR.predict(X)
y_pred_KNN = model_KNN.predict(X)

accuracy_logR = accuracy_score(y, y_pred_logR)
accuracy_KNN = accuracy_score(y, y_pred_KNN)

print("LogR Accuracy (Basic without split) = %.3f" % accuracy_logR)
print("KNN Accuracy (Basic without split) = %.3f" % accuracy_KNN)

LogR Accuracy (Basic without split) = 0.848
KNN Accuracy (Basic without split) = 0.766


In [42]:
# Hold-out evaluation: average test accuracy over several random,
# stratified 80/20 train/test splits.
# NOTE(review): KNN is distance-based and the features are used unscaled
# here - consider standardizing them; confirm against the data.
lst_y_pred_logR, lst_y_pred_KNN, lst_y_test = [], [], []
iterations = 10

# The loop index doubles as the split seed: each iteration gets a different
# partition, but a fresh "Restart & Run All" reproduces the same numbers.
for seed in range(iterations):
  X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.20,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=seed)

  model_logR = LogisticRegression(max_iter=2000, fit_intercept=True)
  model_logR.fit(X_train, y_train)

  model_KNN = KNeighborsClassifier(n_neighbors=5)
  model_KNN.fit(X_train, y_train)

  y_pred_logR = model_logR.predict(X_test)
  y_pred_KNN = model_KNN.predict(X_test)

  lst_y_pred_logR.append(y_pred_logR)
  lst_y_pred_KNN.append(y_pred_KNN)
  lst_y_test.append(y_test)


# Sum the per-split accuracies; the division by `iterations` in the print
# statements turns the sums into means.
accuracy_logR = 0
accuracy_KNN = 0

for i in range(iterations):
  accuracy_logR += accuracy_score(lst_y_test[i], lst_y_pred_logR[i])
  accuracy_KNN += accuracy_score(lst_y_test[i], lst_y_pred_KNN[i])

print("LogR Accuracy (with Train and Test split) = %.3f" % (accuracy_logR/iterations))
print("KNN Accuracy (with Train and Test split) = %.3f" % (accuracy_KNN/iterations))

LogR Accuracy (with Train and Test split) = 0.821
KNN Accuracy (with Train and Test split) = 0.641


In [43]:
# Estimators under comparison (same hyper-parameters for both runs below).
model_logR = LogisticRegression(max_iter=2000, fit_intercept=True)
model_KNN = KNeighborsClassifier(n_neighbors=5)

# 10-fold cross-validation repeated 10 times with a fixed seed, so the
# fold assignment is reproducible.
cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=1)

# Settings shared by both evaluations; n_jobs=-1 runs folds in parallel.
cv_options = dict(scoring='accuracy', cv=cv, n_jobs=-1)

# One accuracy value per fold for each model.
scores_logR = cross_val_score(model_logR, X, y, **cv_options)
scores_KNN = cross_val_score(model_KNN, X, y, **cv_options)

# Report the mean accuracy across all folds.
print("LogR Accuracy (with RepeatedKFold) = %.3f" % mean(scores_logR))
print("KNN Accuracy (with RepeatedKFold) = %.3f" % mean(scores_KNN))

LogR Accuracy (with RepeatedKFold) = 0.831
KNN Accuracy (with RepeatedKFold) = 0.648
