In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

## 1. Exploring the Data
We decided on a dataset from [Kaggle](https://www.kaggle.com/datasets/mexwell/heart-disease-dataset?resource=download) that uses the UCI dataset and more.

In [3]:
data = pd.read_csv("heart_statlog_cleveland_hungary_final.csv")

In [4]:
#Basic info about the dataset, appears to be no NaNs
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  1190 non-null   int64  
 1   sex                  1190 non-null   int64  
 2   chest pain type      1190 non-null   int64  
 3   resting bp s         1190 non-null   int64  
 4   cholesterol          1190 non-null   int64  
 5   fasting blood sugar  1190 non-null   int64  
 6   resting ecg          1190 non-null   int64  
 7   max heart rate       1190 non-null   int64  
 8   exercise angina      1190 non-null   int64  
 9   oldpeak              1190 non-null   float64
 10  ST slope             1190 non-null   int64  
 11  target               1190 non-null   int64  
dtypes: float64(1), int64(11)
memory usage: 111.7 KB


In [5]:
#Checking to see if there are any outliers
data.describe()

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
count,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0
mean,53.720168,0.763866,3.232773,132.153782,210.363866,0.213445,0.698319,139.732773,0.387395,0.922773,1.62437,0.528571
std,9.358203,0.424884,0.93548,18.368823,101.420489,0.409912,0.870359,25.517636,0.48736,1.086337,0.610459,0.499393
min,28.0,0.0,1.0,0.0,0.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0
25%,47.0,1.0,3.0,120.0,188.0,0.0,0.0,121.0,0.0,0.0,1.0,0.0
50%,54.0,1.0,4.0,130.0,229.0,0.0,0.0,140.5,0.0,0.6,2.0,1.0
75%,60.0,1.0,4.0,140.0,269.75,0.0,2.0,160.0,1.0,1.6,2.0,1.0
max,77.0,1.0,4.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,3.0,1.0


In [6]:
#Checking to see how balanced the targets are
data["target"].value_counts()

target
1    629
0    561
Name: count, dtype: int64

## 2. Preparing Data

In [8]:
data.shape

(1190, 12)

In [9]:
X = data.drop("target", axis=1)
y = data["target"]

In [10]:
#Using sklearns train_test_split to split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=321) 

In [11]:
#Normalizing with Standard Scaler since data is nominal, binary, numerical (no categorical)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
#Saving scaler with pickle for frontend implementation
with open("scaler.pickle", "wb") as f:
    pickle.dump(scaler, f, pickle.HIGHEST_PROTOCOL)

## Choosing Models and Training
The chosen models were Logistic Regression, Decision Tree, Random Forest, SVM, and K-Nearest Neighbors

In [14]:
#Storing model names and classes in a dict for easy looping during training
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier() 
}

#accuracy, precision, recall, f1, roc_auc
list_of_scores = []

In [15]:
#Training
for name, model in models.items():
    score_list = [name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    #Saving the best model with pickle for frontend implementation
    if name == "Random Forest":
        with open("model.pickle", "wb") as f:
            pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)
    score_list.append(metrics.accuracy_score(y_test, y_pred))
    score_list.append(metrics.precision_score(y_test, y_pred))
    score_list.append(metrics.recall_score(y_test, y_pred))
    score_list.append(metrics.f1_score(y_test, y_pred))
    score_list.append(metrics.roc_auc_score(y_test, y_pred))
    list_of_scores.append(score_list)

In [16]:
score_df = pd.DataFrame(list_of_scores, columns=["Model", "Accuracy", "Precision", "Recall", "F1", "ROC_AUC"])

In [17]:
score_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,ROC_AUC
0,Logistic Regression,0.815126,0.829268,0.816,0.822581,0.81508
1,Decision Tree,0.89916,0.891473,0.92,0.905512,0.898053
2,Random Forest,0.907563,0.905512,0.92,0.912698,0.906903
3,SVM,0.836134,0.825758,0.872,0.848249,0.83423
4,KNN,0.819328,0.810606,0.856,0.832685,0.817381


It appears that the Random Forest scores best in all categories, so this model was chosen for the CDSS.