In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix , classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import pickle


# Import top 10 ML Library

In [64]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifie

# Load dataset

In [7]:
df = pd.read_csv("heart.csv")
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [10]:
df['Sex'].value_counts()

Sex
M    725
F    193
Name: count, dtype: int64

In [11]:
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [13]:
df.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [14]:
df.shape

(918, 12)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [18]:
df[['HeartDisease']]  #0 is No heart disease detected   and 1 is Heart disease present.

Unnamed: 0,HeartDisease
0,0
1,1
2,0
3,1
4,0
...,...
913,1
914,1
915,1
916,1


In [19]:
df['HeartDisease'].value_counts()

HeartDisease
1    508
0    410
Name: count, dtype: int64

In [23]:
var = df[['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']]
for col in var:
    print(f"\nColumn: {col}")
    print(df[col].value_counts())


Column: Sex
Sex
M    725
F    193
Name: count, dtype: int64

Column: ChestPainType
ChestPainType
ASY    496
NAP    203
ATA    173
TA      46
Name: count, dtype: int64

Column: RestingECG
RestingECG
Normal    552
LVH       188
ST        178
Name: count, dtype: int64

Column: ExerciseAngina
ExerciseAngina
N    547
Y    371
Name: count, dtype: int64

Column: ST_Slope
ST_Slope
Flat    460
Up      395
Down     63
Name: count, dtype: int64


# Label Encoder

In [35]:
label_encoder = preprocessing.LabelEncoder()    # 0 is gradute and 1 is not gradute
obj = (df.dtypes == 'object')                      # 0 is no and 1 is yes
for col in list(obj[obj].index):                   # 0 is approved and 1 is rejected.
  df[col] = label_encoder.fit_transform(df[col])

In [36]:
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,3,110,264,0,1,132,0,1.2,1,1
914,68,1,0,144,193,1,1,141,0,3.4,1,1
915,57,1,0,130,131,0,1,115,1,1.2,1,1
916,57,0,1,130,236,0,0,174,0,0.0,1,1


# Splitting Data into Training and Testing Sets


In [49]:
X = df.drop(columns=['HeartDisease'])
y  = df['Sex']

In [50]:
X

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,1,1,140,289,0,1,172,0,0.0,2
1,49,0,2,160,180,0,1,156,0,1.0,1
2,37,1,1,130,283,0,2,98,0,0.0,2
3,48,0,0,138,214,0,1,108,1,1.5,1
4,54,1,2,150,195,0,1,122,0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,3,110,264,0,1,132,0,1.2,1
914,68,1,0,144,193,1,1,141,0,3.4,1
915,57,1,0,130,131,0,1,115,1,1.2,1
916,57,0,1,130,236,0,0,174,0,0.0,1


In [54]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize features (optional but recommended for some models)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [58]:
X_train

array([[-1.66859275,  0.51528365, -0.80328342, ...,  1.17935554,
         0.53258119, -2.24895016],
       [-0.17160837, -1.94067869,  1.34465841, ..., -0.84792072,
        -0.83712656,  1.0757818 ],
       [-0.27853583, -1.94067869,  1.34465841, ..., -0.84792072,
         0.53258119,  1.0757818 ],
       ...,
       [-0.59931819,  0.51528365, -0.80328342, ..., -0.84792072,
        -0.38055731, -0.58658418],
       [-0.27853583, -1.94067869, -0.80328342, ...,  1.17935554,
         0.25863964, -0.58658418],
       [ 1.00459365,  0.51528365, -0.80328342, ..., -0.84792072,
        -0.83712656, -0.58658418]], shape=(734, 11))

In [57]:
X_test

array([[ 0.57688382,  0.51528365, -0.80328342, ...,  1.17935554,
        -0.83712656, -0.58658418],
       [-0.59931819, -1.94067869,  1.34465841, ..., -0.84792072,
        -0.65449886,  1.0757818 ],
       [ 0.25610145,  0.51528365,  0.27068749, ..., -0.84792072,
        -0.83712656, -2.24895016],
       ...,
       [-0.59931819, -1.94067869, -0.80328342, ...,  1.17935554,
         0.53258119, -0.58658418],
       [ 0.36302891,  0.51528365,  1.34465841, ..., -0.84792072,
         0.62389504,  1.0757818 ],
       [ 1.00459365,  0.51528365,  0.27068749, ..., -0.84792072,
        -0.65449886,  1.0757818 ]], shape=(184, 11))

# Initialize models


In [75]:
from lightgbm import LGBMClassifier

# Initialize all models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(probability=True),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier()
}

# Show initialized models
for name, model in models.items():
    print(f"{name} ")

Logistic Regression 
K-Nearest Neighbors 
Support Vector Machine 
Decision Tree 
Random Forest 
Naive Bayes 
Gradient Boosting 
AdaBoost 
XGBoost 
LightGBM 


In [76]:
# 6. Train and evaluate each model
for name, model in models.items():
    print(f"\n📌 Evaluating: {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Accuracy
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")

    # Classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion matrix
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))


📌 Evaluating: Logistic Regression
Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        39
           1       1.00      1.00      1.00       145

    accuracy                           1.00       184
   macro avg       1.00      1.00      1.00       184
weighted avg       1.00      1.00      1.00       184

Confusion Matrix:
[[ 39   0]
 [  0 145]]

📌 Evaluating: K-Nearest Neighbors
Accuracy: 0.9891
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.97        39
           1       0.99      1.00      0.99       145

    accuracy                           0.99       184
   macro avg       0.99      0.97      0.98       184
weighted avg       0.99      0.99      0.99       184

Confusion Matrix:
[[ 37   2]
 [  0 145]]

📌 Evaluating: Support Vector Machine
Accuracy: 1.0000
Classification Report:
              precision    recall  f

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


# Save The All Model

In [77]:
# Apply StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save the scaler
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)


In [78]:
# Re-initialize all models
models = {
    "logistic_regression.pkl": LogisticRegression(max_iter=1000),
    "knn.pkl": KNeighborsClassifier(),
    "svm.pkl": SVC(probability=True),
    "decision_tree.pkl": DecisionTreeClassifier(),
    "random_forest.pkl": RandomForestClassifier(),
    "naive_bayes.pkl": GaussianNB(),
    "gradient_boosting.pkl": GradientBoostingClassifier(),
    "adaboost.pkl": AdaBoostClassifier(),
    "xgboost.pkl": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "lightgbm.pkl": LGBMClassifier()
}

# Train and save each model
for filename, model in models.items():
    model.fit(X_train, y_train)
    with open(filename, "wb") as f:
        pickle.dump(model, f)
    print(f"✅ Saved: {filename}")


✅ Saved: logistic_regression.pkl
✅ Saved: knn.pkl
✅ Saved: svm.pkl
✅ Saved: decision_tree.pkl
✅ Saved: random_forest.pkl
✅ Saved: naive_bayes.pkl
✅ Saved: gradient_boosting.pkl
✅ Saved: adaboost.pkl


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Saved: xgboost.pkl
[LightGBM] [Info] Number of positive: 580, number of negative: 154
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000191 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 382
[LightGBM] [Info] Number of data points in the train set: 734, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.790191 -> initscore=1.326076
[LightGBM] [Info] Start training from score 1.326076
✅ Saved: lightgbm.pkl


# Save The Best Model

In [79]:
import pickle
from sklearn.metrics import accuracy_score

best_model = None
best_accuracy = 0
best_model_name = ""

# Re-train and evaluate each model on scaled data
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name}: Accuracy = {acc:.4f}")

    # Check if this is the best model so far
    if acc > best_accuracy:
        best_accuracy = acc
        best_model = model
        best_model_name = name

# Save the best model
with open("best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

print(f"\n✅ Best Model Saved: {best_model_name} with Accuracy = {best_accuracy:.4f}")


logistic_regression.pkl: Accuracy = 1.0000
knn.pkl: Accuracy = 0.9891
svm.pkl: Accuracy = 1.0000
decision_tree.pkl: Accuracy = 1.0000
random_forest.pkl: Accuracy = 1.0000
naive_bayes.pkl: Accuracy = 1.0000
gradient_boosting.pkl: Accuracy = 1.0000
adaboost.pkl: Accuracy = 1.0000
xgboost.pkl: Accuracy = 1.0000
[LightGBM] [Info] Number of positive: 580, number of negative: 154
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000172 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 382
[LightGBM] [Info] Number of data points in the train set: 734, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.790191 -> initscore=1.326076
[LightGBM] [Info] Start training from score 1.326076
lightgbm.pkl: Accuracy = 0.9891

✅ Best Model Saved: logistic_regression.pkl with Accuracy = 1.0000


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
