In [1]:
# %%
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# %% STEP 1: LOAD DATA
df = pd.read_csv("framingham.csv")
print("✅ Original shape:", df.shape)

# %% STEP 2: DATA CLEANING
df['glucose'] = df['glucose'].fillna(df['glucose'].mean())
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
print("✅ After cleaning:", df.shape)

# %% STEP 3: REMOVE OUTLIERS (Using IQR Method)
def remove_outliers_iqr(data, columns):
    for col in columns:
        Q1 = data[col].quantile(0.25)
        Q3 = data[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        data = data[(data[col] >= lower) & (data[col] <= upper)]
    return data

numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols.remove("TenYearCHD")  # Don't apply to target
df = remove_outliers_iqr(df, numeric_cols)
print("🧹 After outlier removal:", df.shape)

# %% STEP 4: DROP HIGHLY CORRELATED FEATURES
corr_matrix = df.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop_corr = [column for column in upper.columns if any(upper[column] > 0.80)]
df.drop(columns=to_drop_corr, inplace=True)
print("🔁 Dropped correlated features:", to_drop_corr)

# %% STEP 5: SPLIT FEATURES & TARGET
X = df.drop("TenYearCHD", axis=1)
y = df["TenYearCHD"]

# %% STEP 6: DROP LOW-IMPORTANCE FEATURES
rf_temp = RandomForestClassifier(random_state=42)
rf_temp.fit(X, y)
importances = pd.DataFrame({'Feature': X.columns, 'Importance': rf_temp.feature_importances_})
low_importance = importances[importances['Importance'] < 0.01]['Feature'].tolist()
X.drop(columns=low_importance, inplace=True)
print("🔻 Dropped low-importance features:", low_importance)

# %% STEP 7: BALANCE WITH SMOTE
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X, y)

# %% STEP 8: TRAIN-TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)

# %% STEP 9: SCALING
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# %% STEP 10: MODEL TRAINING
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

# %% STEP 11: EVALUATION
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"\n🎯 Test Accuracy: {accuracy * 100:.2f}%")
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("📉 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# %% STEP 12: CROSS-VALIDATION (5-Fold)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, StandardScaler().fit_transform(X_resampled), y_resampled, cv=kfold, scoring='accuracy')
print(f"\n🔁 5-Fold Cross-Validation Accuracy: {cv_scores.mean() * 100:.2f}% ± {cv_scores.std() * 100:.2f}%")



✅ Original shape: (4240, 16)
✅ After cleaning: (3989, 16)
🧹 After outlier removal: (3291, 16)
🔁 Dropped correlated features: []
🔻 Dropped low-importance features: ['BPMeds', 'prevalentStroke', 'diabetes']

🎯 Test Accuracy: 92.97%

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.94      0.93       568
           1       0.94      0.92      0.93       585

    accuracy                           0.93      1153
   macro avg       0.93      0.93      0.93      1153
weighted avg       0.93      0.93      0.93      1153

📉 Confusion Matrix:
 [[535  33]
 [ 48 537]]

🔁 5-Fold Cross-Validation Accuracy: 92.77% ± 1.34%


In [2]:
X['education'].value_counts()

education
1.0    1300
2.0    1023
3.0     577
4.0     391
Name: count, dtype: int64

In [3]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3291 entries, 0 to 4239
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   male           3291 non-null   int64  
 1   age            3291 non-null   int64  
 2   education      3291 non-null   float64
 3   currentSmoker  3291 non-null   int64  
 4   cigsPerDay     3291 non-null   float64
 5   prevalentHyp   3291 non-null   int64  
 6   totChol        3291 non-null   float64
 7   sysBP          3291 non-null   float64
 8   diaBP          3291 non-null   float64
 9   BMI            3291 non-null   float64
 10  heartRate      3291 non-null   float64
 11  glucose        3291 non-null   float64
dtypes: float64(8), int64(4)
memory usage: 334.2 KB


In [4]:
import pickle

# Assume these variables exist
# best_model = your trained ML model
# best_model_name = name of the best model (optional)
# scaler = your fitted scaler (e.g., StandardScaler or MinMaxScaler)

# Create a dictionary to store both
model_package = {
    "model": model,
    "scaler": scaler
}

with open('heart_disease_model.pkl', 'wb') as f:
    pickle.dump(model_package, f)

print(f"Best model and scaler saved to 'heart_disease_model.pkl'")


Best model and scaler saved to 'heart_disease_model.pkl'
