Import

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import joblib



In [3]:
# Load dataset
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,Patient_ID,State_Name,Age,Gender,Diabetes,Hypertension,Obesity,Smoking,Alcohol_Consumption,Physical_Activity,...,Diastolic_BP,Air_Pollution_Exposure,Family_History,Stress_Level,Healthcare_Access,Heart_Attack_History,Emergency_Response_Time,Annual_Income,Health_Insurance,Heart_Attack_Risk
0,1,Rajasthan,42,Female,0,0,1,1,0,0,...,119,1,0,4,0,0,157,611025,0,0
1,2,Himachal Pradesh,26,Male,0,0,0,0,1,1,...,115,0,0,7,0,0,331,174527,0,0
2,3,Assam,78,Male,0,0,1,0,0,1,...,117,0,1,10,1,0,186,1760112,1,0
3,4,Odisha,58,Male,1,0,1,0,0,1,...,65,0,0,1,1,1,324,1398213,0,0
4,5,Karnataka,22,Male,0,0,0,0,0,1,...,109,0,0,9,0,0,209,97987,0,1


In [4]:
df.shape

(10000, 26)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Patient_ID               10000 non-null  int64 
 1   State_Name               10000 non-null  object
 2   Age                      10000 non-null  int64 
 3   Gender                   10000 non-null  object
 4   Diabetes                 10000 non-null  int64 
 5   Hypertension             10000 non-null  int64 
 6   Obesity                  10000 non-null  int64 
 7   Smoking                  10000 non-null  int64 
 8   Alcohol_Consumption      10000 non-null  int64 
 9   Physical_Activity        10000 non-null  int64 
 10  Diet_Score               10000 non-null  int64 
 11  Cholesterol_Level        10000 non-null  int64 
 12  Triglyceride_Level       10000 non-null  int64 
 13  LDL_Level                10000 non-null  int64 
 14  HDL_Level                10000 non-null

In [6]:
print(df.isna().sum())

Patient_ID                 0
State_Name                 0
Age                        0
Gender                     0
Diabetes                   0
Hypertension               0
Obesity                    0
Smoking                    0
Alcohol_Consumption        0
Physical_Activity          0
Diet_Score                 0
Cholesterol_Level          0
Triglyceride_Level         0
LDL_Level                  0
HDL_Level                  0
Systolic_BP                0
Diastolic_BP               0
Air_Pollution_Exposure     0
Family_History             0
Stress_Level               0
Healthcare_Access          0
Heart_Attack_History       0
Emergency_Response_Time    0
Annual_Income              0
Health_Insurance           0
Heart_Attack_Risk          0
dtype: int64


In [7]:
print(df['Heart_Attack_Risk'].value_counts()) 

Heart_Attack_Risk
0    6993
1    3007
Name: count, dtype: int64


In [8]:
X = df.drop(columns=['Heart_Attack_Risk'])
y = df['Heart_Attack_Risk']

In [9]:
X.shape, y.shape

((10000, 25), (10000,))

In [10]:
# Identify categorical vs numeric
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
num_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()

# Pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ]
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [11]:
X_train.shape, X_test.shape

((8000, 25), (2000, 25))

In [12]:
# Logistic Regression
pipe_lr = Pipeline(steps=[('preprocessor', preprocessor),
                          ('clf', LogisticRegression(max_iter=1000, class_weight='balanced'))])

# Random Forest
pipe_rf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('clf', RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42))])

# XGBoost
pipe_xgb = Pipeline(steps=[('preprocessor', preprocessor),
                           ('clf', XGBClassifier(n_estimators=200, eval_metric='logloss', random_state=42))])

models = {"LogisticRegression": pipe_lr, "RandomForest": pipe_rf, "XGBoost": pipe_xgb}
results = {}

for name, model in models.items():
    print(f"\n▶ Training {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:,1]

    print(classification_report(y_test, y_pred))
    print("ROC AUC:", roc_auc_score(y_test, y_prob))

    results[name] = (model, roc_auc_score(y_test, y_prob))



▶ Training LogisticRegression
              precision    recall  f1-score   support

           0       0.70      0.52      0.60      1399
           1       0.30      0.47      0.36       601

    accuracy                           0.51      2000
   macro avg       0.50      0.50      0.48      2000
weighted avg       0.58      0.51      0.53      2000

ROC AUC: 0.4953728536784654

▶ Training RandomForest


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

           0       0.70      1.00      0.82      1399
           1       0.00      0.00      0.00       601

    accuracy                           0.70      2000
   macro avg       0.35      0.50      0.41      2000
weighted avg       0.49      0.70      0.58      2000

ROC AUC: 0.48043111373824177

▶ Training XGBoost
              precision    recall  f1-score   support

           0       0.70      0.87      0.77      1399
           1       0.28      0.11      0.16       601

    accuracy                           0.64      2000
   macro avg       0.49      0.49      0.47      2000
weighted avg       0.57      0.64      0.59      2000

ROC AUC: 0.49310596230490283


In [13]:
best_model = max(results.items(), key=lambda x: x[1][1])[0]
print("✅ Best model:", best_model)

joblib.dump(results[best_model][0], "heart_disease_pipeline.pkl")


✅ Best model: LogisticRegression


['heart_disease_pipeline.pkl']