In [29]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [15]:
# Load the heart-disease dataset from the local CSV file.
# Source: https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset/data
# Caveat: the target column is encoded the "wrong" way round in this dataset:
# 0 means heart disease is present, 1 means it is absent.
# See https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset/discussion/401933
HEART_CSV = 'heart.csv'
data = pd.read_csv(HEART_CSV)

In [16]:
# Preview the loaded DataFrame; pandas truncates the rendering to the first and last rows.
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
# The mean of `sex` is about 0.70, so the sample is skewed towards the group
# encoded as 1 (male, assuming the usual 1 = male / 0 = female encoding for this dataset).
summary_stats = data.describe()
summary_stats

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [18]:
# Separate the label from the predictors.
# y is the `target` column; X is every remaining column.
y = data['target']
X = data.drop(columns='target')

In [26]:
# Hold out 20% of the rows as a test set so the model can later be evaluated
# on data it has not seen during training. The fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# We instantiate our random forest model.
# A random forest is stochastic (bootstrap sampling of rows and random feature
# selection at each split), so we pin random_state to make the trained model,
# and therefore the metrics reported further down, reproducible when the
# notebook is re-run from a fresh kernel. The seed matches the one used for
# train_test_split.
model = RandomForestClassifier(random_state=42)

In [27]:
# Fit the random forest on the training portion of the data.
# fit() returns the estimator itself, which the notebook displays as the cell output.
model.fit(X=X_train, y=y_train)

In [28]:
# Use the held-out test features (every column except `target`) to predict
# the target label, i.e. whether each person has heart disease.
y_pred = model.predict(X=X_test)

In [30]:
# Accuracy = fraction of test rows whose predicted label equals the actual label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.99


    Note: scikit-learn treats label 1 as the "positive" class; because of this dataset's encoding, "positive" here means NOT having heart disease.
    True Negatives (TN): 102  <-  People who have heart disease (target=0) and were correctly predicted as having it
    False Positives (FP): 0   <-  People who have heart disease (target=0) but were incorrectly predicted as not having it (target=1)
    False Negatives (FN): 3   <-  People who do not have heart disease (target=1) but were incorrectly predicted as having it (target=0)
    True Positives (TP): 100  <-  People who do not have heart disease (target=1) and were correctly predicted as not having it

In [31]:
# Build the confusion matrix for the test set and print it.
# Layout (scikit-learn convention): rows are the actual labels, columns are the
# predicted labels, in sorted label order, so the matrix reads [[TN, FP], [FN, TP]].
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[102   0]
 [  3 100]]


We also print a classification report, which shows the precision, recall, and F1-score for each class. We notice that the model makes a few mistakes and tells people they have heart disease when they do not, but there are no cases where the model tells someone they don't have heart disease when they do in fact have heart disease.

In [25]:
# Per-class precision, recall, F1-score and support for the test-set predictions.
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       102
           1       1.00      0.97      0.99       103

    accuracy                           0.99       205
   macro avg       0.99      0.99      0.99       205
weighted avg       0.99      0.99      0.99       205

