# Heart Disease Predictor

This project takes a cleaned dataset containing key indicators of heart disease. 

It compares the performance of three models: a Random Forest classifier tuned with randomized hyperparameter search, an XGBoost classifier, and a K-Nearest Neighbors classifier.

In [56]:
# Load in libraries

from sklearn.model_selection import train_test_split as tts
import pandas as pd

##### EDA

In [57]:
# Pull in heart disease dataset from Kaggle
### Source: https://www.kaggle.com/datasets/kamilpytlak/personal-key-indicators-of-heart-disease
### The file read below is the 2022 "no NaNs" CSV (see the file name), expected in the notebook's working directory.
df = pd.read_csv("heart_2022_no_nans.csv")
df.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.6,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.7,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No


In [58]:
# Summarize column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246022 entries, 0 to 246021
Data columns (total 40 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      246022 non-null  object 
 1   Sex                        246022 non-null  object 
 2   GeneralHealth              246022 non-null  object 
 3   PhysicalHealthDays         246022 non-null  float64
 4   MentalHealthDays           246022 non-null  float64
 5   LastCheckupTime            246022 non-null  object 
 6   PhysicalActivities         246022 non-null  object 
 7   SleepHours                 246022 non-null  float64
 8   RemovedTeeth               246022 non-null  object 
 9   HadHeartAttack             246022 non-null  object 
 10  HadAngina                  246022 non-null  object 
 11  HadStroke                  246022 non-null  object 
 12  HadAsthma                  246022 non-null  object 
 13  HadSkinCancer              24

In [59]:
# Class balance of the target, shown as proportions (normalize=True) rather than raw counts.
df['HadHeartAttack'].value_counts(normalize=True)

HadHeartAttack
No     0.945391
Yes    0.054609
Name: proportion, dtype: float64

In [60]:
# Number of distinct values in each object-dtype column, laid out as a single wide row
unique_counts = df.select_dtypes(include=['O']).nunique()
display(unique_counts.to_frame().T)

Unnamed: 0,State,Sex,GeneralHealth,LastCheckupTime,PhysicalActivities,RemovedTeeth,HadHeartAttack,HadAngina,HadStroke,HadAsthma,...,ChestScan,RaceEthnicityCategory,AgeCategory,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,54,2,5,4,2,4,2,2,2,2,...,2,5,13,2,2,2,2,4,2,3


In [61]:
# Determine if 'AgeCategory' should be a numeric or categorical feature:
# list its distinct values to see how ages are represented.
print(df['AgeCategory'].unique())

['Age 65 to 69' 'Age 70 to 74' 'Age 75 to 79' 'Age 80 or older'
 'Age 50 to 54' 'Age 40 to 44' 'Age 60 to 64' 'Age 55 to 59'
 'Age 45 to 49' 'Age 35 to 39' 'Age 25 to 29' 'Age 30 to 34'
 'Age 18 to 24']


In [62]:
# Group column names by dtype: float64 columns are treated as numerical features,
# object columns as categorical ones (the target is still an object column here).
numerical_cols = df.select_dtypes(include=['float64']).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

print(f"Numerical features:\n {', '.join(numerical_cols)}\n")
print(f"Categorical features:\n {', '.join(categorical_cols)}\n")

Numerical features:
 PhysicalHealthDays, MentalHealthDays, SleepHours, HeightInMeters, WeightInKilograms, BMI

Categorical features:
 State, Sex, GeneralHealth, LastCheckupTime, PhysicalActivities, RemovedTeeth, HadHeartAttack, HadAngina, HadStroke, HadAsthma, HadSkinCancer, HadCOPD, HadDepressiveDisorder, HadKidneyDisease, HadArthritis, HadDiabetes, DeafOrHardOfHearing, BlindOrVisionDifficulty, DifficultyConcentrating, DifficultyWalking, DifficultyDressingBathing, DifficultyErrands, SmokerStatus, ECigaretteUsage, ChestScan, RaceEthnicityCategory, AgeCategory, AlcoholDrinkers, HIVTesting, FluVaxLast12, PneumoVaxEver, TetanusLast10Tdap, HighRiskLastYear, CovidPos



In [63]:
# Print the categorical column names as a plain Python list.
print(categorical_cols)

['State', 'Sex', 'GeneralHealth', 'LastCheckupTime', 'PhysicalActivities', 'RemovedTeeth', 'HadHeartAttack', 'HadAngina', 'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis', 'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty', 'DifficultyConcentrating', 'DifficultyWalking', 'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus', 'ECigaretteUsage', 'ChestScan', 'RaceEthnicityCategory', 'AgeCategory', 'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'TetanusLast10Tdap', 'HighRiskLastYear', 'CovidPos']


In [64]:
# List the distinct values of every categorical column, one column at a time

categ_list = df[categorical_cols].columns.tolist()
for column in categ_list:
    # Column name on one line, its array of unique values on the next
    print(column, df[column].unique(), sep="\n")

State
['Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California' 'Colorado'
 'Connecticut' 'Delaware' 'District of Columbia' 'Florida' 'Georgia'
 'Hawaii' 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas' 'Kentucky'
 'Louisiana' 'Maine' 'Maryland' 'Massachusetts' 'Michigan' 'Minnesota'
 'Mississippi' 'Missouri' 'Montana' 'Nebraska' 'Nevada' 'New Hampshire'
 'New Jersey' 'New Mexico' 'New York' 'North Carolina' 'North Dakota'
 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania' 'Rhode Island' 'South Carolina'
 'South Dakota' 'Tennessee' 'Texas' 'Utah' 'Vermont' 'Virginia'
 'Washington' 'West Virginia' 'Wisconsin' 'Wyoming' 'Guam' 'Puerto Rico'
 'Virgin Islands']
Sex
['Female' 'Male']
GeneralHealth
['Very good' 'Fair' 'Good' 'Excellent' 'Poor']
LastCheckupTime
['Within past year (anytime less than 12 months ago)'
 '5 or more years ago'
 'Within past 2 years (1 year but less than 2 years ago)'
 'Within past 5 years (2 years but less than 5 years ago)']
PhysicalActivities
['Yes' 'No']
RemovedTeeth
['None of t

##### Variable Encoding

In [65]:
# Encode Yes/No values
def encode_yes_no(value: str) -> int:
    """Map a Yes/No answer to an integer code.

    Matching is case-insensitive.

    Args:
        value: Raw answer, normally the string "Yes" or "No".

    Returns:
        1 for "yes", 0 for "no", and 2 for anything else. Non-string
        values (e.g. None or a float NaN) also map to 2 instead of
        raising AttributeError on ``.lower()``.
    """
    # Anything that is not text cannot be a Yes/No answer -> "else" bucket.
    if not isinstance(value, str):
        return 2

    answer = value.lower()
    ### 1 == Yes.
    if answer == "yes":
        return 1
    ### 0 == No.
    if answer == "no":
        return 0
    ### 2 == Else.
    return 2
    
# Remove the target from the feature frame and encode it to integers in one step.
# NOTE: pop() mutates `df` in place, so this cell can only run once per data load.
df_target = df.pop("HadHeartAttack").apply(encode_yes_no)

# With the target gone, the remaining object-dtype columns are the categorical features.
df_categorical = df.select_dtypes(include=['object'])

In [None]:
# Encode Categorical Variables 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# ColumnTransformer selects columns by name, index, mask, slice or callable,
# so pass the categorical column *names* rather than the DataFrame slice itself.
categorical_feature_names = list(df_categorical.columns)

# One-hot encode the categorical columns; every other column is passed through
# unchanged and appended after the encoded ones.
columnTransformer = ColumnTransformer(
    [('encoder', OneHotEncoder(), categorical_feature_names)],
    remainder='passthrough',
)
x_encoded = columnTransformer.fit_transform(df)


  (0, 0)	1.0
  (0, 54)	1.0
  (0, 60)	1.0
  (0, 64)	1.0
  (0, 66)	1.0
  (0, 70)	1.0
  (0, 71)	1.0
  (0, 73)	1.0
  (0, 75)	1.0
  (0, 77)	1.0
  (0, 79)	1.0
  (0, 81)	1.0
  (0, 83)	1.0
  (0, 86)	1.0
  (0, 87)	1.0
  (0, 91)	1.0
  (0, 93)	1.0
  (0, 95)	1.0
  (0, 97)	1.0
  (0, 99)	1.0
  (0, 101)	1.0
  (0, 105)	1.0
  (0, 107)	1.0
  (0, 111)	1.0
  (0, 117)	1.0
  :	:
  (246021, 83)	1.0
  (246021, 85)	1.0
  (246021, 87)	1.0
  (246021, 91)	1.0
  (246021, 93)	1.0
  (246021, 95)	1.0
  (246021, 97)	1.0
  (246021, 99)	1.0
  (246021, 101)	1.0
  (246021, 106)	1.0
  (246021, 107)	1.0
  (246021, 112)	1.0
  (246021, 113)	1.0
  (246021, 128)	1.0
  (246021, 131)	1.0
  (246021, 134)	1.0
  (246021, 136)	1.0
  (246021, 138)	1.0
  (246021, 139)	1.0
  (246021, 143)	1.0
  (246021, 147)	1.0
  (246021, 150)	5.0
  (246021, 151)	1.83
  (246021, 152)	108.86
  (246021, 153)	32.55


##### Train-Test-Split the Data


In [83]:
# Split the encoded features and the target into train and test sets.
# Stratify on the target so the rare positive class (~5% of rows) keeps the
# same proportion in both splits.
xTrain, xTest, yTrain, yTest = tts(
    x_encoded,
    df_target,
    test_size=0.25,  # scikit-learn's default, made explicit
    random_state=23,
    stratify=df_target,
)

##### KNN

In [84]:
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    mean_squared_error,
    precision_score,
    recall_score,
)
from sklearn.neighbors import KNeighborsClassifier

# Create a KNN classifier
# NOTE(review): the features are not scaled before fitting; the passthrough numeric
# columns (e.g. weight in kg) likely dominate the distance - consider scaling.
knn = KNeighborsClassifier(n_neighbors=3)

# Fit the classifier to the data
knn.fit(xTrain, yTrain)

# Make a prediction
yPred = knn.predict(xTest)

# Create and log MSE metrics using predictions of xTest and its actual value yTest
# (when labels are 0/1 this is the share of misclassified rows).
mse = mean_squared_error(yTest, yPred)
print("MSE:")
print(f"{mse}")

### View Confusion Matrix...
print(confusion_matrix(yTest, yPred))
print("\n")

# Percentages use the `%` format spec: `round(x, 2)*100` can print float
# artefacts such as 56.99999999999999%.

### View Accuracy Score...
print("Acccuracy Score:")
print(f"{accuracy_score(yTest, yPred):.0%}")

### View Precision Score...
print("\nPrecision Score:")
print(f"{precision_score(yTest, yPred):.0%}")

### View Recall Score...
print("\nRecall Score:")
print(f"{recall_score(yTest, yPred):.0%}")

### View F1 Score...
print("\nF1 Score:")
print(f"{f1_score(yTest, yPred):.0%}")

##### Try out RandomizedSearchCV with Random Forest Classifier

In [None]:
# These names were used without being imported anywhere in the notebook.
# `randint` is presumably scipy.stats.randint: RandomizedSearchCV samples from
# distribution objects that expose rvs().
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Search space: hyperparameter name -> distribution or list of values to sample from.
# randint(5, 100) draws integers from 5 up to (but not including) 100.
parameters = {"n_estimators": randint(5, 100), 
              "max_depth": [3, 5, 7]}

# Instantiate Model
# Seed both the forest and the search (same seed as the train/test split) so
# re-running the notebook reproduces the same candidates and the same trees.
rf = RandomForestClassifier(random_state=23)
grid_rf_model = RandomizedSearchCV(rf, parameters, random_state=23)

# Train  Model
grid_rf_model.fit(xTrain, yTrain)

# Report the tuned values of the searched hyperparameters
best_rf = grid_rf_model.best_estimator_
for p in parameters:
    print(f"Best '{p}': {best_rf.get_params()[p]}")

In [None]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    mean_squared_error,
    precision_score,
    recall_score,
)

# Predict on the held-out split with the best estimator found by the search
yPred = best_rf.predict(xTest)

# Create and log MSE metrics using predictions of xTest and its actual value yTest
# (when labels are 0/1 this is the share of misclassified rows).
mse = mean_squared_error(yTest, yPred)
print("MSE:")
print(f"{mse}")

### View Confusion Matrix...
print(confusion_matrix(yTest, yPred))
print("\n")

# Percentages use the `%` format spec: `round(x, 2)*100` can print float
# artefacts such as 56.99999999999999%.

### View Accuracy Score...
print("Acccuracy Score:")
print(f"{accuracy_score(yTest, yPred):.0%}")

### View Precision Score...
print("\nPrecision Score:")
print(f"{precision_score(yTest, yPred):.0%}")

### View Recall Score...
print("\nRecall Score:")
print(f"{recall_score(yTest, yPred):.0%}")

### View F1 Score...
print("\nF1 Score:")
print(f"{f1_score(yTest, yPred):.0%}")

##### XGBoost Modeling

In [None]:
import xgboost

# XGBoost classifier with the library's default hyperparameters (no arguments passed).
classifier = xgboost.XGBClassifier()

# Train on the same training split as the other models. As the cell's last
# expression, the result of fit() is also displayed.
classifier.fit(xTrain, yTrain)

In [None]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    mean_squared_error,
    precision_score,
    recall_score,
)

# Get Predictions
yPred = classifier.predict(xTest)

# Create and log MSE metrics using predictions of xTest and its actual value yTest
# (when labels are 0/1 this is the share of misclassified rows).
mse = mean_squared_error(yTest, yPred)
print("MSE:")
print(f"{mse}")

### View Confusion Matrix...
print(confusion_matrix(yTest, yPred))
print("\n")

# Percentages use the `%` format spec: `round(x, 2)*100` can print float
# artefacts such as 56.99999999999999%.

### View Accuracy Score...
print("Acccuracy Score:")
print(f"{accuracy_score(yTest, yPred):.0%}")

### View Precision Score...
print("\nPrecision Score:")
print(f"{precision_score(yTest, yPred):.0%}")

### View Recall Score...
print("\nRecall Score:")
print(f"{recall_score(yTest, yPred):.0%}")

### View F1 Score...
print("\nF1 Score:")
print(f"{f1_score(yTest, yPred):.0%}")