# Heart Disease Predictor

This project takes a cleaned dataset containing key indicators of heart disease. 

It compares the performance of three models - K-Nearest Neighbor, Random Forest classifier with randomized search for hyperparameter tuning,and XGBoost classifier.

In [200]:
# Load in libraries

from sklearn.model_selection import train_test_split as tts
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error

##### EDA

In [201]:
# Pull in heart disease dataset from Kaggle
### Get 2020 dataset: https://www.kaggle.com/datasets/kamilpytlak/personal-key-indicators-of-heart-disease
df = pd.read_csv("Heart_Disease_Prediction.csv")
df.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence


In [202]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      270 non-null    int64  
 1   Sex                      270 non-null    int64  
 2   Chest pain type          270 non-null    int64  
 3   BP                       270 non-null    int64  
 4   Cholesterol              270 non-null    int64  
 5   FBS over 120             270 non-null    int64  
 6   EKG results              270 non-null    int64  
 7   Max HR                   270 non-null    int64  
 8   Exercise angina          270 non-null    int64  
 9   ST depression            270 non-null    float64
 10  Slope of ST              270 non-null    int64  
 11  Number of vessels fluro  270 non-null    int64  
 12  Thallium                 270 non-null    int64  
 13  Heart Disease            270 non-null    object 
dtypes: float64(1), int64(12), 

In [203]:
df.isnull().sum()

Age                        0
Sex                        0
Chest pain type            0
BP                         0
Cholesterol                0
FBS over 120               0
EKG results                0
Max HR                     0
Exercise angina            0
ST depression              0
Slope of ST                0
Number of vessels fluro    0
Thallium                   0
Heart Disease              0
dtype: int64

In [204]:
df['Heart Disease'].value_counts(normalize=True)

Heart Disease
Absence     0.555556
Presence    0.444444
Name: proportion, dtype: float64

In [205]:
# Determine if 'AgeCategory' should be numeric or categorical feature 
print(df['Age'].unique())

[70 67 57 64 74 65 56 59 60 63 53 44 61 71 46 40 48 43 47 54 51 58 66 37
 50 42 62 49 52 45 41 76 39 35 55 34 38 69 68 77 29]


In [206]:
### Bin age values by deciles
df['AgeBin'] = pd.qcut(df['Age'], 10, labels=False)

df.drop(columns = ['Age'], inplace = True)

In [207]:
numerical_cols = list(df.select_dtypes(include=['float64', 'int64']).columns)
categorical_cols = list(df.select_dtypes(include=['object']).columns)


print(f"Numerical features:\n {', '.join(numerical_cols)}\n")
print(f"Categorical features:\n {', '.join(categorical_cols)}\n")

Numerical features:
 Sex, Chest pain type, BP, Cholesterol, FBS over 120, EKG results, Max HR, Exercise angina, ST depression, Slope of ST, Number of vessels fluro, Thallium, AgeBin

Categorical features:
 Heart Disease



In [208]:
# See unique categorical values 

categ_list = list(df[categorical_cols])
for i in categ_list:
    print(i)
    print(df[i].unique())

Heart Disease
['Presence' 'Absence']


##### Variable Encoding

In [209]:
# Encode Yes/No values
def encode_yes_no(value: str):
    ### 1 == Yes.
    if value.lower() == "presence":
        return 1
    ### 0 == No.
    elif value.lower() == "absence":
        return 0
    ### 2 == Else.
    else:
        return 2
    
df['Heart Disease'] = df['Heart Disease'].apply(lambda x: encode_yes_no(x))
df_target = df['Heart Disease']

# Drop old columns
df.drop(columns = ['Heart Disease'], inplace = True)

df_numerical = df[df.select_dtypes(include=['float64', 'int64']).columns]


##### Train-Test-Split the Data


In [210]:
# Drop target variable
xTrain, xTest, yTrain, yTest = tts(df_numerical, df_target, random_state=17)

In [211]:
# Standardize features
scaler = StandardScaler()
xTrain = scaler.fit_transform(xTrain)
xTest = scaler.transform(xTest)

### KNN

In [212]:
# Create a KNN classifier
knn = KNeighborsClassifier(n_neighbors=3)

# Fit the classifier to the data
knn.fit(xTrain, yTrain)

# Make a prediction
yPred = knn.predict(xTest)

# Create and log MSE metrics using predictions of X_test and its actual value y_test
mse = mean_squared_error(yTest, yPred)
print("MSE:")
print(f"{mse}")

### View Confusion Matrix...
print(confusion_matrix(yTest, yPred))
print("\n")

### View Accuracy Score...
print("Acccuracy Score:")
print(f"{round(accuracy_score(yTest, yPred), 2)*100}%")

### View Precision Score...
print("\nPrecision Score:")
print(f"{round(precision_score(yTest, yPred), 2)*100}%")

### View Recall Score...
print("\nRecall Score:")
print(f"{round(recall_score(yTest, yPred), 2)*100}%")

print("\nF1 Score:")
print(f"{round(f1_score(yTest, yPred), 2)*100}%")

MSE:
0.19117647058823528
[[35  6]
 [ 7 20]]


Acccuracy Score:
81.0%

Precision Score:
77.0%

Recall Score:
74.0%

F1 Score:
75.0%


##### Try out RandomizedSearchCV with Random Forest Classifier

In [213]:
# dictionary containing hyperparameter names and list of values we want to try
parameters = {"n_estimators": [20,50,100,200], 
              "max_depth": [3, 5, 7]}

# Instantiate Model
rf = RandomForestClassifier()
grid_rf_model = RandomizedSearchCV(rf, parameters)

# Train  Model
grid_rf_model.fit(xTrain, yTrain)

best_rf = grid_rf_model.best_estimator_
for p in parameters:
    print(f"Best '{p}': {best_rf.get_params()[p]}")

Best 'n_estimators': 100
Best 'max_depth': 7


In [214]:
yPred = best_rf.predict(xTest)

# Create and log MSE metrics using predictions of X_test and its actual value y_test
mse = mean_squared_error(yTest, yPred)
print("MSE:")
print(f"{mse}")

### View Confusion Matrix...
print(confusion_matrix(yTest, yPred))
print("\n")

### View Accuracy Score...
print("Acccuracy Score:")
print(f"{round(accuracy_score(yTest, yPred), 2)*100}%")

### View Precision Score...
print("\nPrecision Score:")
print(f"{round(precision_score(yTest, yPred), 2)*100}%")

### View Recall Score...
print("\nRecall Score:")
print(f"{round(recall_score(yTest, yPred), 2)*100}%")

print("\nF1 Score:")
print(f"{round(f1_score(yTest, yPred), 2)*100}%")

MSE:
0.14705882352941177
[[39  2]
 [ 8 19]]


Acccuracy Score:
85.0%

Precision Score:
90.0%

Recall Score:
70.0%

F1 Score:
79.0%


##### XGBoost Modeling

In [215]:
!pip install xgboost

[0m

In [216]:
import xgboost

classifier = xgboost.XGBClassifier()
classifier.fit(xTrain, yTrain)

In [217]:
# Get Predictions
yPred = classifier.predict(xTest)

# Create and log MSE metrics using predictions of X_test and its actual value y_test
mse = mean_squared_error(yTest, yPred)
print("MSE:")
print(f"{mse}")

### View Confusion Matrix...
print(confusion_matrix(yTest, yPred))
print("\n")

### View Accuracy Score...
print("Acccuracy Score:")
print(f"{round(accuracy_score(yTest, yPred), 2)*100}%")

### View Precision Score...
print("\nPrecision Score:")
print(f"{round(precision_score(yTest, yPred), 2)*100}%")

### View Recall Score...
print("\nRecall Score:")
print(f"{round(recall_score(yTest, yPred), 2)*100}%")

print("\nF1 Score:")
print(f"{round(f1_score(yTest, yPred), 2)*100}%")

MSE:
0.19117647058823528
[[36  5]
 [ 8 19]]


Acccuracy Score:
81.0%

Precision Score:
79.0%

Recall Score:
70.0%

F1 Score:
75.0%
