# Heart Disease Predictor - Random Forest Classifier vs. XGBoost

This project takes a cleaned dataset containing key indicators of heart disease. It uses a Random Forest classifier with randomized search for hyperparameter tuning and compares performance to XGBoost classifier.

In [1]:
# Load in libraries

from sklearn.model_selection import train_test_split as tts
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

##### EDA

In [2]:
# Pull in heart disease dataset from Kaggle
### Get 2020 dataset: https://www.kaggle.com/datasets/kamilpytlak/personal-key-indicators-of-heart-disease
df = pd.read_csv("heart_2020_cleaned.csv")
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

In [3]:
ycol = ["HeartDisease"]
df['HeartDisease'].value_counts(normalize=True)

No     0.914405
Yes    0.085595
Name: HeartDisease, dtype: float64

In [4]:
# Count of unique values
display(pd.DataFrame(df.select_dtypes(include=['O']).nunique()).T)

Unnamed: 0,HeartDisease,Smoking,AlcoholDrinking,Stroke,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,Asthma,KidneyDisease,SkinCancer
0,2,2,2,2,2,2,13,6,4,2,5,2,2,2


In [5]:
# Determine if 'AgeCategory' should be numeric or categorical feature 
print(df['AgeCategory'].unique())

['55-59' '80 or older' '65-69' '75-79' '40-44' '70-74' '60-64' '50-54'
 '45-49' '18-24' '35-39' '30-34' '25-29']


In [6]:
numerical_cols = list(df.select_dtypes(include=['float64', 'int64']).columns)
categorical_cols = list(df.select_dtypes(include=['object']).columns)

# Age acts more like a numeric feature
categorical_cols.remove('AgeCategory')
numerical_cols.append('AgeCategory')

print(f"Numerical features:\n {', '.join(numerical_cols)}\n")
print(f"Categorical features:\n {', '.join(categorical_cols)}\n")

Numerical features:
 BMI, PhysicalHealth, MentalHealth, SleepTime, AgeCategory

Categorical features:
 HeartDisease, Smoking, AlcoholDrinking, Stroke, DiffWalking, Sex, Race, Diabetic, PhysicalActivity, GenHealth, Asthma, KidneyDisease, SkinCancer



In [7]:
# See unique categorical values 

categ_list = list(df[categorical_cols])
for i in categ_list:
    print(i)
    print(df[i].unique())

HeartDisease
['No' 'Yes']
Smoking
['Yes' 'No']
AlcoholDrinking
['No' 'Yes']
Stroke
['No' 'Yes']
DiffWalking
['No' 'Yes']
Sex
['Female' 'Male']
Race
['White' 'Black' 'Asian' 'American Indian/Alaskan Native' 'Other'
 'Hispanic']
Diabetic
['Yes' 'No' 'No, borderline diabetes' 'Yes (during pregnancy)']
PhysicalActivity
['Yes' 'No']
GenHealth
['Very good' 'Fair' 'Good' 'Poor' 'Excellent']
Asthma
['Yes' 'No']
KidneyDisease
['No' 'Yes']
SkinCancer
['Yes' 'No']


##### Variable Encoding

In [8]:
# Change age to numerical values
df['AgeCategory'] = df['AgeCategory'].apply(lambda x : int(x[:2]))
print(df['AgeCategory'].unique())

[55 80 65 75 40 70 60 50 45 18 35 30 25]


In [9]:
# Encode Yes/No values
def integer_encode_yes_no(value: str):
    ### 1 == Yes.
    if value.lower() == "yes":
        return 1
    ### 0 == No.
    elif value.lower() == "no":
        return 0
    ### 2 == Else.
    else:
        return 2
    
# Apply a function along a dataframe axis
df["HeartDisease"] = df["HeartDisease"].apply(lambda x: integer_encode_yes_no(x))
df["Smoking"] = df["Smoking"].apply(lambda x: integer_encode_yes_no(x))
df["AlcoholDrinking"] = df["AlcoholDrinking"].apply(lambda x: integer_encode_yes_no(x))
df["DiffWalking"] = df["DiffWalking"].apply(lambda x: integer_encode_yes_no(x))
df["Diabetic"] = df["Diabetic"].apply(lambda x: integer_encode_yes_no(x))
df["PhysicalActivity"] = df["PhysicalActivity"].apply(lambda x: integer_encode_yes_no(x))
df["Asthma"] = df["Asthma"].apply(lambda x: integer_encode_yes_no(x))
df["KidneyDisease"] = df["KidneyDisease"].apply(lambda x: integer_encode_yes_no(x))
df["SkinCancer"] = df["SkinCancer"].apply(lambda x: integer_encode_yes_no(x))
df["Stroke"] = df["Stroke"].apply(lambda x: integer_encode_yes_no(x))

# Encode Sex Values
def integer_encode_sex_values(value: str):
    ### 1 === Male.
    if value.lower() == "male":
        return 1
    ### 0 == Female.
    elif value.lower() == "female":
        return 0
    ### 2 == Else.
    else:
        return 2

df["Sex"] = df["Sex"].apply(lambda x: integer_encode_sex_values(x))



# Encode Race Values
def integer_encode_race_values(value: str):
    ### 0 if White...
    if value == "White":
        return 0
    ### 1 if Hispanic...
    elif value == "Hispanic":
        return 1
    ### 2 if Black...
    elif value == "Black":
        return 2
    ### 3 if Other...
    elif value == "Other":
        return 3
    ### 4 if Asian...
    elif value == "Asian":
        return 4
    ### 5 if American Indian/Alaskan Native
    elif value == "American Indian/Alaskan Native":
        return 5
    ### 99 if Else...
    else:
        return 99
df["Race"] = df["Race"].apply(lambda x: integer_encode_race_values(x))

# Encode health values
def integer_encode_health_values(value: str):
    ### 0 if Very good...
    if value == "Very good":
        return 0
    ### 1 if Good...
    elif value == "Good":
        return 1
    ### 2 if Excellent...
    elif value == "Excellent":
        return 2
    ### 3 if Fair...
    elif value == "Fair":
        return 3
    ### 4 if Poor...
    elif value == "Poor":
        return 4
    ### 99 if Else...
    else:
        return 99
df["GenHealth"] = df["GenHealth"].apply(lambda x: integer_encode_health_values(x))

In [10]:
# Verify encoding
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.6,1,0,0,3.0,30.0,0,0,55,0,1,1,0,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,0,80,0,0,1,0,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,1,65,0,1,1,3,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,0,75,0,0,0,1,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,0,40,0,0,1,0,8.0,0,0,0


##### Train-Test-Split the Data


In [11]:
# Drop target variable
xTrain, xTest, yTrain, yTest = tts(df.drop(["HeartDisease"], axis=1), df["HeartDisease"], random_state=23)

##### Try out RandomizedSearchCV with Random Forest Classifier

In [12]:
# dictionary containing hyperparameter names and list of values we want to try
parameters = {"n_estimators": randint(5, 100), 
              "max_depth": [3, 5, 7]}

# Instantiate Model
rf = RandomForestClassifier()
grid_rf_model = RandomizedSearchCV(rf, parameters)

# Train  Model
grid_rf_model.fit(xTrain, yTrain)

best_rf = grid_rf_model.best_estimator_
for p in parameters:
    print(f"Best '{p}': {best_rf.get_params()[p]}")

Best 'n_estimators': 35
Best 'max_depth': 7


In [13]:
yPred = best_rf.predict(xTest)

# Create and log MSE metrics using predictions of X_test and its actual value y_test
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(yTest, yPred)
print("MSE:")
print(f"{mse}")

### View Confusion Matrix...
from sklearn.metrics import confusion_matrix
print(confusion_matrix(yTest, yPred))
print("\n")

### View Accuracy Score...
from sklearn.metrics import accuracy_score
print("Acccuracy Score:")
print(f"{round(accuracy_score(yTest, yPred), 2)*100}%")

### View Precision Score...
from sklearn.metrics import precision_score
print("\nPrecision Score:")
print(f"{round(precision_score(yTest, yPred), 2)*100}%")

### View Recall Score...
from sklearn.metrics import recall_score
print("\nRecall Score:")
print(f"{round(recall_score(yTest, yPred), 2)*100}%")

from sklearn.metrics import f1_score
print("\nF1 Score:")
print(f"{round(f1_score(yTest, yPred), 2)*100}%")

MSE:
0.08275275488123679
[[73147    94]
 [ 6522   186]]


Acccuracy Score:
92.0%

Precision Score:
66.0%

Recall Score:
3.0%

F1 Score:
5.0%


##### XGBoost Modeling

In [14]:
import xgboost

classifier = xgboost.XGBClassifier()

classifier.fit(xTrain, yTrain)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [15]:
# Get Predictions
yPred = classifier.predict(xTest)

# Create and log MSE metrics using predictions of X_test and its actual value y_test
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(yTest, yPred)
print("MSE:")
print(f"{mse}")

### View Confusion Matrix...
from sklearn.metrics import confusion_matrix
print(confusion_matrix(yTest, yPred))
print("\n")

### View Accuracy Score...
from sklearn.metrics import accuracy_score
print("Acccuracy Score:")
print(f"{round(accuracy_score(yTest, yPred), 2)*100}%")

### View Precision Score...
from sklearn.metrics import precision_score
print("\nPrecision Score:")
print(f"{round(precision_score(yTest, yPred), 2)*100}%")

### View Recall Score...
from sklearn.metrics import recall_score
print("\nRecall Score:")
print(f"{round(recall_score(yTest, yPred), 2)*100}%")

from sklearn.metrics import f1_score
print("\nF1 Score:")
print(f"{round(f1_score(yTest, yPred), 2)*100}%")

MSE:
0.08310297814856972
[[72639   602]
 [ 6042   666]]


Acccuracy Score:
92.0%

Precision Score:
53.0%

Recall Score:
10.0%

F1 Score:
17.0%
