In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

In [4]:
df = pd.read_csv("../data/cleaned_heart_disease.csv")

In [5]:
df.head()

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num,sex_Male,dataset_Hungary,...,cp_non-anginal,cp_typical angina,fbs_True,restecg_normal,restecg_st-t abnormality,exang_True,slope_flat,slope_upsloping,thal_normal,thal_reversable defect
0,1,1.007386,0.705176,0.303643,0.489727,1.368109,-0.3614,-0.871794,1,0,...,0,1,1,0,0,0,0,0,0,0
1,2,1.432034,1.518569,0.789967,-1.181478,0.611589,4.411152,0.879408,1,0,...,0,0,0,0,0,1,1,0,1,0
2,3,1.432034,-0.650479,0.266939,-0.345875,1.651804,2.820301,0.003807,1,0,...,0,0,0,0,0,1,1,0,0,1
3,4,-1.752828,-0.108217,0.459634,1.961979,2.502889,-0.3614,-0.871794,1,0,...,1,0,0,1,0,0,0,0,1,0
4,5,-1.32818,-0.108217,0.037541,1.36512,0.517024,-0.3614,-0.871794,0,0,...,0,0,0,0,0,0,0,1,1,0


In [6]:
df['num'] = (df['num'] >0).astype(int)

print(df['num'].value_counts())
print(df['num'].unique())

num
1    509
0    411
Name: count, dtype: int64
[0 1]


In [7]:
X = df.drop(columns=['num'])
y = df['num']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Training set:", X_train.shape)
print("Testing set:", X_test.shape)

Training set: (736, 22)
Testing set: (184, 22)


In [8]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(f"âœ… {name} trained successfully.")

âœ… Logistic Regression trained successfully.
âœ… Decision Tree trained successfully.
âœ… Random Forest trained successfully.
âœ… SVM trained successfully.


In [12]:
results = []

for name, model in models.items():
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_proba) if y_proba is not None else None
    })

In [15]:
results = pd.DataFrame(results)
print("\nðŸ“ˆ Model Performance Summary:")
print(results)


ðŸ“ˆ Model Performance Summary:
                     Accuracy  Precision  Recall  F1-Score    AUC
Model                                                            
Logistic Regression     0.848      0.849   0.882     0.865  0.934
Decision Tree           0.810      0.832   0.824     0.828  0.808
Random Forest           0.891      0.866   0.951     0.907  0.955
SVM                     0.799      0.874   0.745     0.804  0.743
