In [18]:
import pandas as pd

df = pd.read_csv(r"C:\Users\gulat\Downloads\heart.csv")
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [19]:
df.shape
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [20]:
X = df.drop("target", axis=1)
y = df["target"]

In [21]:
X.shape, y.shape

((1025, 13), (1025,))

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [23]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [24]:
X_train_scaled.shape
X_test_scaled.shape


(205, 13)

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


In [26]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}


In [27]:
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    print(f"{name} trained")


Logistic Regression trained
Decision Tree trained
KNN trained
Naive Bayes trained
Random Forest trained
XGBoost trained


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [28]:
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    confusion_matrix,
    classification_report
)
import numpy as np


In [29]:
results = []

for name, model in models.items():
    # Predictions
    y_pred = model.predict(X_test_scaled)
    
    # Probabilities (for AUC)
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test_scaled)[:, 1]
    else:
        y_prob = model.decision_function(X_test_scaled)
    
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    
    results.append([name, acc, auc, prec, rec, f1, mcc])


In [30]:
results_df = pd.DataFrame(
    results,
    columns=["Model", "Accuracy", "AUC", "Precision", "Recall", "F1", "MCC"]
)

results_df


Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1,MCC
0,Logistic Regression,0.809756,0.92981,0.761905,0.914286,0.831169,0.630908
1,Decision Tree,0.985366,0.985714,1.0,0.971429,0.985507,0.971151
2,KNN,0.863415,0.962905,0.873786,0.857143,0.865385,0.726935
3,Naive Bayes,0.829268,0.904286,0.807018,0.87619,0.840183,0.660163
4,Random Forest,1.0,1.0,1.0,1.0,1.0,1.0
5,XGBoost,1.0,1.0,1.0,1.0,1.0,1.0


In [31]:
cm = confusion_matrix(y_test, models["Logistic Regression"].predict(X_test_scaled))
cm


array([[70, 30],
       [ 9, 96]])

In [32]:
import os
import joblib

os.makedirs("model", exist_ok=True)

for name, model in models.items():
    filename = name.replace(" ", "_").lower() + ".pkl"
    joblib.dump(model, f"model/{filename}")
    print(f"Saved: model/{filename}")

joblib.dump(scaler, "model/scaler.pkl")
print("Saved: model/scaler.pkl")


Saved: model/logistic_regression.pkl
Saved: model/decision_tree.pkl
Saved: model/knn.pkl
Saved: model/naive_bayes.pkl
Saved: model/random_forest.pkl
Saved: model/xgboost.pkl
Saved: model/scaler.pkl
