In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import joblib

In [2]:
# Load the dataset
df = pd.read_csv(".\data\heart.csv")

In [4]:
#check number of rows and columns
df.shape

(1025, 14)

In [3]:
# check for missing values
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [5]:
#check the first five records
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [6]:
#data statistics
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


The provided table contains statistical summary information for the dataset's columns. Here's an interpretation of each column:

- **age:** The age of the individuals in the dataset ranges from 29 to 77 years, with an average age of approximately 54.43 years.

- **sex:** The 'sex' column is binary, with 0 representing females and 1 representing males. The mean value is approximately 0.70, indicating that there are slightly more males than females in the dataset.

- **cp (chest pain type):** This categorical variable has values ranging from 0 to 3, representing different types of chest pain. The mean value is approximately 0.94, suggesting that on average, chest pain type 1 is the most common.

- **trestbps (resting blood pressure):** The resting blood pressure varies from 94 to 200 mm Hg, with an average value of approximately 131.61 mm Hg.

- **chol (cholesterol level):** Cholesterol levels range from 126 to 564 mg/dL, with an average cholesterol level of approximately 246 mg/dL.

- **fbs (fasting blood sugar):** Fasting blood sugar is binary (0 or 1), with an average value of approximately 0.15. This indicates that a small portion of the individuals have elevated fasting blood sugar.

- **restecg (resting electrocardiographic results):** This categorical variable ranges from 0 to 2, representing different resting electrocardiographic results. The mean value is approximately 0.53, indicating that result type 1 is the most common.

- **thalach (maximum heart rate achieved):** The maximum heart rate achieved ranges from 71 to 202 beats per minute, with an average of approximately 149.11 bpm.

- **exang (exercise-induced angina):** This binary variable (0 or 1) indicates whether exercise induced angina. On average, it occurs in approximately 33.66% of cases.

- **oldpeak (ST depression induced by exercise):** The ST depression induced by exercise ranges from 0 to 6.2, with an average value of approximately 1.07.

- **slope (slope of the peak exercise ST segment):** This categorical variable ranges from 0 to 2, representing different slope types. The mean value is approximately 1.39.

- **ca (number of major vessels):** The number of major vessels ranges from 0 to 4, with an average value of approximately 0.75.

- **thal (thalassemia type):** Thalassemia type ranges from 0 to 3, with an average value of approximately 2.32.

- **target:** The 'target' column is binary, with 0 indicating no heart disease and 1 indicating the presence of heart disease. On average, approximately 51.31% of individuals in the dataset have heart disease.

In [8]:
# Split features and target variable
X = df.drop("target", axis=1)
y = df["target"]

In [9]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [10]:
# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [11]:
# Initialize the models
models = [
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ("Support Vector Machine", SVC(random_state=42)),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("Logistic Regression", LogisticRegression(random_state=42))
]

In [12]:
best_model = None
best_accuracy = 0

for model_name, model in models:
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    
    print(f"{model_name} Accuracy: {accuracy}")
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

Random Forest Accuracy: 0.9853658536585366
Gradient Boosting Accuracy: 0.9317073170731708
Support Vector Machine Accuracy: 0.8878048780487805
K-Nearest Neighbors Accuracy: 0.8341463414634146
Logistic Regression Accuracy: 0.7951219512195122


In [13]:
print(f"Best Model: {best_model}, Best Accuracy: {best_accuracy}")

Best Model: RandomForestClassifier(random_state=42), Best Accuracy: 0.9853658536585366


In [15]:
# Save the best model to a file
joblib.dump(best_model, "./model/best_heart_disease_model.pkl")

['./model/best_heart_disease_model.pkl']