In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

We will be running 3 models: 1) KNN, 2) Random Forest, 3) Logistic Regression. First we will introduce our data.

In [None]:
# Read the heart-disease data from the CSV in the working directory,
# then preview the first five rows.
heart = pd.read_csv(filepath_or_buffer="heart.csv")
heart.head(n=5)

In [None]:
# Print the frame's column names, dtypes and non-null counts.
heart.info()

Now we will describe each variable. 

age: in years

sex: 0 - Female, 1 - Male

cp: types of chest pain. 0 - Typical Angina, 1 - Atypical Angina, 2 - non-anginal pain, 3 - Asymptomatic 

trtbps: resting systolic blood pressure (mm Hg). Measures the pressure in your arteries when your heart beats.

chol: serum cholesterol (HDL, LDL and triglycerides) (mg/dl)

fbs: fasting blood sugar (1 if >= 120 mg/dl, 0 otherwise)

restecg: resting electrocardiographic results: 0 - normal, 1 - ST-T wave abnormality, 2 - showing probable or definite left ventricular hypertrophy.

thalachh: maximum heart rate achieved during strenuous exercise

exng: exercise induced heart pain: 1 - yes, 0 - no

oldpeak: ST depression induced from exercise

slp: slope of the peak exercise ST segment

caa: coronary arteries with an abnormality(0-4)

thall: Thallium stress test to check blood flow.


In [None]:
# Summary statistics for the columns of `heart`.
heart.describe()

Create dummy (indicator) variables for the categorical columns.

In [None]:
# Indicator columns for the chest-pain code, relabelled from the numeric
# codes 0-3 to short readable names.
cp = pd.get_dummies(heart["cp"]).rename(
    columns={0: "TA", 1: "ATA", 2: "NAP", 3: "ASY"}
)

In [None]:
# Indicator columns for the resting-ECG code, relabelled from the numeric
# codes 0-2 to short readable names.
restecg = pd.get_dummies(heart["restecg"]).rename(
    columns={0: "Normal", 1: "STT_ab", 2: "hyper"}
)

In [None]:
# Indicator columns for the ST-slope code, relabelled from the numeric
# codes 0-2 to short readable names.
slp = pd.get_dummies(heart["slp"]).rename(
    columns={0: "up", 1: "flat", 2: "down"}
)

In [None]:
# Indicator columns for the thallium-test code. The level coded 0 gets no
# indicator column; the remaining codes 1-3 are given readable names.
thall = (
    pd.get_dummies(heart["thall"])
    .drop(columns=0)
    .rename(columns={1: "normal", 2: "FD", 3: "RD"})
)

In [None]:
# Append the indicator columns to the frame and, in the same step, remove
# the original coded columns they replace.
heart = pd.concat([heart, cp, restecg, slp, thall], axis=1).drop(
    columns=["cp", "restecg", "slp", "thall"]
)
heart

In [None]:
# Move the target column "output" to the far right of the frame so that
# positional slicing (all-but-last = features, last = target) stays valid.
# Assigning the popped column back appends it after every existing column,
# so no column count has to be hard-coded: a fixed position would raise if
# fewer indicator columns were produced, or leave "output" in the middle of
# the frame if more were.
value = heart.pop("output")
heart["output"] = value
heart

#### What is the naive classifier?

In [None]:
# Share of observations whose target equals 1, rounded to three decimals.
# This is the accuracy obtained by always predicting the positive class.
naive = round((heart["output"] == 1).sum() / len(heart), 3)
naive

In [None]:
# Bar chart of how many observations fall in each target class; the figure
# is also written to Counts.png.
ax = sns.countplot(data=heart, x="output")
ax.set_title("Counts for Heart Disease")
ax.set_xlabel("Heart Disease")
ax.set_ylabel("Counts")
ax.set_xticks([0, 1])
ax.set_xticklabels(["No", "Yes"])
ax.figure.savefig("Counts.png", bbox_inches="tight")
plt.show()

If we said "yes" to every observation, we would be correct 54.5% of the time. Our models need to beat this baseline.

Now we will run 10-fold cross validation on our KNN and Random Forest to get adequate tuning parameters. For the Random Forest we will use 5 features, since this is roughly the square root of the number of features.

In [None]:
# Feature matrix and target vector. The target is selected by name rather
# than by position so the split cannot silently pick up the wrong column.
X = heart.drop(columns="output")
y = heart["output"]

# 10-fold splitter shared by every tuning run below. The shuffle is seeded
# so the folds -- and therefore the chosen hyper-parameters -- are the same
# on every Restart & Run All, and every candidate model is scored on
# identical folds.
kf = KFold(n_splits=10, shuffle=True, random_state=10)

In [None]:
# Mean cross-validated score of a KNN classifier for every neighbour count
# from 1 to 199, using the shared splitter `kf`.
accuracy = [
    cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=kf).mean()
    for k in range(1, 200)
]

In [None]:
# Cross-validated score as a function of the neighbour count; the figure is
# also written to CVKNN.png.
fig, ax = plt.subplots()
ax.plot(np.arange(1, 200), accuracy)
ax.set(
    title="Accuracy Score vs. k-value",
    xlabel="k-value",
    ylabel="Accuracy Score",
)
fig.savefig("CVKNN.png", bbox_inches="tight")
plt.show()

We will use a k-value of 15, since this roughly maximizes our accuracy score.

In [None]:
# Exhaustive search over tree depth (1-19) and forest size (5-65 in steps
# of 5) for a random forest limited to 5 features per split, scored with
# the shared splitter `kf`.
forest = RandomForestClassifier(max_features=5, random_state=10)
parameters = dict(
    max_depth=np.arange(1, 20),
    n_estimators=np.arange(5, 70, 5),
)
rf = GridSearchCV(forest, parameters, cv=kf)
rf.fit(X, y)

In [None]:
# Hyper-parameter combination chosen by the grid search.
rf.best_params_

The grid search selects a max_depth of 6 and n_estimators of 40.

Backward Variable selection was done for the Logistic Regression in R

We will now train our models

In [None]:
# Hold out 20% of the rows as a test set; the seed fixes which rows those are.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

# KNN

In [None]:
# Fit KNN with 15 neighbours on the training split.
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(xtrain, ytrain)

# Hard predictions drive accuracy and the confusion matrix; the predicted
# probability of class 1 drives the ROC curve and AUC.
pred = knn.predict(xtest)
y_score = knn.predict_proba(xtest)[:, 1]

score_k = accuracy_score(ytest, pred)
cm_k = confusion_matrix(ytest, pred)

auc_k = round(roc_auc_score(ytest, y_score), 3)
fpr_k, tpr_k, _ = roc_curve(ytest, y_score)

In [None]:
# ROC curve for KNN with the diagonal chance line for reference.
fig, ax = plt.subplots()
ax.plot(fpr_k, tpr_k, label=f"AUC = {auc_k}")
ax.set_ylabel("True Positive Rate")
ax.set_xlabel("False Positive Rate")
ax.set_title("ROC Curve")
ax.axline((0, 0), (1, 1), color="k")
ax.legend()
plt.show()

In [None]:
# Print the KNN test-set accuracy, then display its confusion matrix.
print(score_k)
cm_k

# Random Forest

In [None]:
# Fit the random forest with the tuned depth and size on the training split.
forest = RandomForestClassifier(
    n_estimators=40,
    max_depth=6,
    max_features=5,
    random_state=10,
)
forest.fit(xtrain, ytrain)

# Hard predictions drive accuracy and the confusion matrix; the predicted
# probability of class 1 drives the ROC curve and AUC.
pred = forest.predict(xtest)
y_score = forest.predict_proba(xtest)[:, 1]

score_t = accuracy_score(ytest, pred)
cm_t = confusion_matrix(ytest, pred)

auc_rf = round(roc_auc_score(ytest, y_score), 3)
fpr_rf, tpr_rf, _ = roc_curve(ytest, y_score)

### Feature Importance

In [None]:
# Collect (feature name, importance) pairs from the fitted forest.
# The names come from `xtrain`, the frame the forest was fitted on, so each
# importance is paired with the column it actually belongs to. Pairing with
# all of `heart`'s columns would include the target and only line up as
# long as the target happens to be the last column.
feature_names = list(xtrain.columns)
assert len(feature_names) == len(forest.feature_importances_), (
    "feature names and importances are misaligned"
)

df_dic = {"name": [], "importance": []}
for feature, importance in zip(feature_names, forest.feature_importances_):
    print(feature, "=", importance)
    df_dic["name"].append(feature)
    df_dic["importance"].append(importance)

In [None]:
# Turn the collected pairs into a frame ordered from most to least
# important, with a fresh 0..n-1 index.
df = (
    pd.DataFrame(df_dic)
    .sort_values(by="importance", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Horizontal bar chart of the forest's feature importances; the figure is
# also written to Importance.png.
fig, ax = plt.subplots()
ax.barh(df["name"], df["importance"])
ax.set(title="Feature Importance", xlabel="Importance", ylabel="Feature")
fig.savefig("Importance.png", bbox_inches="tight")
plt.show()

In [None]:
# ROC curve for the random forest with the diagonal chance line for reference.
fig, ax = plt.subplots()
ax.plot(fpr_rf, tpr_rf, label=f"AUC = {auc_rf}")
ax.set_ylabel("True Positive Rate")
ax.set_xlabel("False Positive Rate")
ax.set_title("ROC Curve")
ax.axline((0, 0), (1, 1), color="k")
ax.legend()
plt.show()

In [None]:
# Print the random-forest test-set accuracy, then display its confusion matrix.
print(score_t)
cm_t

# Logistic Regression

##### Using all features

In [None]:
# Logistic regression on every feature; the iteration cap is raised to 5000.
log_reg = LogisticRegression(max_iter=5000)
log_reg.fit(X=xtrain, y=ytrain)

In [None]:
# Hard predictions drive accuracy and the confusion matrix; the predicted
# probability of class 1 drives the ROC curve and AUC.
pred = log_reg.predict(xtest)
y_score = log_reg.predict_proba(xtest)[:, 1]

score_lr_1 = accuracy_score(ytest, pred)
cm_lr_1 = confusion_matrix(ytest, pred)

auc_lr_1 = round(roc_auc_score(ytest, y_score), 3)
fpr_lr_1, tpr_lr_1, _ = roc_curve(ytest, y_score)

In [None]:
# ROC curve for the all-feature logistic regression with the diagonal
# chance line for reference.
fig, ax = plt.subplots()
ax.plot(fpr_lr_1, tpr_lr_1, label=f"AUC = {auc_lr_1}")
ax.set_ylabel("True Positive Rate")
ax.set_xlabel("False Positive Rate")
ax.set_title("ROC Curve")
ax.axline((0, 0), (1, 1), color="k")
ax.legend()
plt.show()

In [None]:
# Print the all-feature logistic-regression test accuracy, then display its
# confusion matrix.
print(score_lr_1)
cm_lr_1

##### After Feature Selection

In [None]:
# Restrict the frame to the selected predictors plus the target (kept last).
heart = heart[["trtbps", "thalachh", "ATA", "exng", "STT_ab", "normal", "flat", "oldpeak",
              "sex", "FD", "caa", "TA", "output"]]

# No constant column is added: scikit-learn's LogisticRegression fits its
# own intercept by default, so a column of ones would only duplicate it
# and contribute a penalised, meaningless entry to `coef_`.
X = heart.iloc[:, :-1]
y = heart.iloc[:, -1]

# Same split settings as for the full feature set (20% test, seed 10).
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.20, random_state = 10)
heart

In [None]:
# Logistic regression on the current training split; the iteration cap is
# raised to 5000.
log_reg = LogisticRegression(max_iter=5000)
log_reg.fit(X=xtrain, y=ytrain)

In [None]:
# Fitted coefficients of the logistic regression, one per column of xtrain.
log_reg.coef_

In [None]:
# Hard predictions drive accuracy and the confusion matrix; the predicted
# probability of class 1 drives the ROC curve and AUC.
pred = log_reg.predict(xtest)
y_score = log_reg.predict_proba(xtest)[:, 1]

score_lr = accuracy_score(ytest, pred)
cm_lr = confusion_matrix(ytest, pred)

auc_lr = round(roc_auc_score(ytest, y_score), 3)
fpr_lr, tpr_lr, _ = roc_curve(ytest, y_score)

In [None]:
# ROC curve for the feature-selected logistic regression with the diagonal
# chance line for reference.
fig, ax = plt.subplots()
ax.plot(fpr_lr, tpr_lr, label=f"AUC = {auc_lr}")
ax.set_ylabel("True Positive Rate")
ax.set_xlabel("False Positive Rate")
ax.set_title("ROC Curve")
ax.axline((0, 0), (1, 1), color="k")
ax.legend()
plt.show()

In [None]:
# Print the feature-selected logistic-regression test accuracy, then display
# its confusion matrix.
print(score_lr)
cm_lr

# Results

In [None]:
# All four ROC curves on one set of axes with the diagonal chance line for
# reference; the figure is also written to ROC.png.
fig, ax = plt.subplots()
ax.plot(fpr_k, tpr_k, label=f"KNN AUC = {auc_k}")
ax.plot(fpr_rf, tpr_rf, label=f"RF AUC = {auc_rf}")
ax.plot(fpr_lr_1, tpr_lr_1, label=f"LR AUC (all features) = {auc_lr_1}")
ax.plot(fpr_lr, tpr_lr, label=f"LR AUC = {auc_lr}")
ax.set(
    title="ROC Curve's",
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
)
ax.legend()
ax.axline((0, 0), (1, 1), color="k")
fig.savefig("ROC.png", bbox_inches="tight")
plt.show()

In [None]:
# Test-set accuracy of each model, rounded to four decimals.
print(f"KNN Accuracy Score: {round(score_k, 4)}")
print(f"Random Forest Accuracy Score: {round(score_t, 4)}")
print(f"Logistic Regression Accuracy Score (all features): {round(score_lr_1, 4)}")
print(f"Logistic Regression Accuracy Score: {round(score_lr, 4)}")

In [None]:
# Each model's confusion matrix, printed on the line after its caption.
print("KNN Confusion Matrix: ", cm_k, sep="\n")
print("Random Forest Confusion Matrix: ", cm_t, sep="\n")
print("Logistic Regression Confusion Matrix (all features): ", cm_lr_1, sep="\n")
print("Logistic Regression Confusion Matrix: ", cm_lr, sep="\n")