# 🫀 Heart Disease — Full Pipeline (Colab)
**Feature Engineering v2 · Optuna · 3-Model Ensemble · Stacking**

### Before running
1. Run section 1 (Install) to install the dependencies
2. Run section 2 (Imports), then section 3 (Upload Data) to upload `train_clean.csv` and then `test.csv`
3. Run all remaining cells top to bottom
- **2-fold CV** to keep runtime short
- **20 Optuna trials** per model
- Early stopping handled correctly per model type throughout

---
## 1 · Install

In [None]:
# Install/upgrade the tuning and boosting libraries used by the later cells.
# NOTE(review): xgboost is imported in the Imports cell but is not installed
# here — presumably the copy preinstalled in Colab is relied upon; confirm.
!pip install optuna catboost lightgbm --upgrade --quiet
print('✅ Done')


---
## 2 · Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings, io
warnings.filterwarnings('ignore')

import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

from lightgbm import LGBMClassifier
from lightgbm import early_stopping as lgbm_es, log_evaluation as lgbm_log
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, recall_score, precision_score, confusion_matrix

sns.set_theme(style='whitegrid')
plt.rcParams['figure.dpi'] = 100

# ── Global CV config ───────────────────────────────────────────────────────
# N_FOLDS sets the number of StratifiedKFold splits used for tuning and
# retraining (and the divisor when averaging per-fold test predictions).
# N_TRIALS is the trial budget requested for each Optuna study.
N_FOLDS  = 2   # keep low for Colab stability
N_TRIALS = 20  # per model

print('✅ Imports done')


---
## 3 · Upload Data

In [None]:
# Interactive upload of the two CSV files through the Colab file picker.
# Each files.upload() call returns a mapping of the uploaded files; only the
# first value of each mapping is used, so upload exactly one file per prompt,
# in the order the prompts appear (train first, then test).
from google.colab import files

print('📁 Upload train_clean.csv')
up = files.upload()
train_bytes = list(up.values())[0]

print('📁 Upload test.csv')
up = files.upload()
test_bytes = list(up.values())[0]

# Parse the raw uploaded bytes in memory.
df   = pd.read_csv(io.BytesIO(train_bytes))
test = pd.read_csv(io.BytesIO(test_bytes))
print(f'✅ Train: {df.shape} | Test: {test.shape}')


---
## 4 · Groups & Feature Engineering

In [None]:
def rebuild_groups(data):
    """Return a copy of *data* with the sex/surveillance grouping columns added.

    Adds three columns:
      * ``sex_label`` – ``'Male'`` where ``Sex == 1``, ``'Female'`` where ``Sex == 0``.
      * ``under_surveillance`` – 1 when ``Number of vessels fluro > 0``,
        ``Thallium == 6`` or ``EKG results == 2``; otherwise 0.
      * ``group`` – ``'<sex_label> / Naive'`` or ``'<sex_label> / Surveilled'``.

    The input frame is not modified.
    """
    out = data.copy()

    # Any one of the three markers flags the row as "surveilled".
    has_vessels = out['Number of vessels fluro'] > 0
    thallium_is_six = out['Thallium'] == 6
    ekg_is_two = out['EKG results'] == 2
    surveilled = (has_vessels | thallium_is_six | ekg_is_two).astype(int)

    sex_names = out['Sex'].map({1: 'Male', 0: 'Female'})
    status_names = surveilled.map({0: 'Naive', 1: 'Surveilled'})

    out['sex_label'] = sex_names
    out['under_surveillance'] = surveilled
    out['group'] = sex_names + ' / ' + status_names
    return out

def engineer_features(data):
    """Return a copy of *data* with the v1 interaction and v2 clinical features.

    Requires the ``under_surveillance`` column in addition to the raw columns
    read below. New columns are appended in a fixed order; the input frame is
    not modified.
    """
    d = data.copy()

    # Short aliases for the source columns used more than once.
    sex = d['Sex']
    age = d['Age']
    max_hr = d['Max HR']
    chest = d['Chest pain type']
    slope = d['Slope of ST']
    surv = d['under_surveillance']
    thal = d['Thallium']
    vessels = d['Number of vessels fluro']
    angina = d['Exercise angina']
    st_dep = d['ST depression']

    # 220 - age minus the achieved max HR.
    hr_reserve = (220 - age) - max_hr

    # Right-closed age buckets (0,40], (40,50], (50,60], (60,70], (70,110] -> 0..4.
    age_bin = pd.cut(age, bins=[0, 40, 50, 60, 70, 110],
                     labels=[0, 1, 2, 3, 4]).astype(int)

    # Hand-weighted additive score over five indicators.
    risk_score = (
        vessels * 2
        + (thal == 7).astype(int) * 3
        + angina
        + (st_dep > 1).astype(int)
        + (slope == 2).astype(int) * 2
    )

    # Insertion order of this dict is the column order of the result.
    new_columns = {
        # v1 — interactions
        'sex_x_chest_pain':   sex * chest,
        'sex_x_slope':        sex * slope,
        'sex_x_max_hr':       sex * max_hr,
        'surv_x_cholesterol': surv * d['Cholesterol'],
        'surv_x_max_hr':      surv * max_hr,
        'surv_x_bp':          surv * d['BP'],
        'hr_reserve':         hr_reserve,
        # v2 — clinical
        'thallium_x_vessels': thal * vessels,
        'thallium_x_chest':   thal * chest,
        'thallium_x_angina':  thal * angina,
        'age_bin':            age_bin,
        'risk_score':         risk_score,
        'st_x_slope':         st_dep * slope,
        'hr_res_x_angina':    hr_reserve * angina,
        'age_x_sex':          age * sex,
    }
    for col_name, col_values in new_columns.items():
        d[col_name] = col_values
    return d

# Apply grouping first: engineer_features reads the 'under_surveillance'
# column that rebuild_groups creates.
df   = rebuild_groups(df)
df   = engineer_features(df)
test = rebuild_groups(test)
test = engineer_features(test)

# Model inputs: every train column except identifiers, label columns and the
# string-valued grouping helpers.
FEATURE_COLS = [c for c in df.columns if c not in
                ['id', 'target', 'sex_label', 'group', 'Heart Disease', 'sample_weight']]

X      = df[FEATURE_COLS].copy()
y      = df['target']
groups = df['group']

# Per-group training weights; 'Female / Naive' rows get the largest weight.
weight_map = {
    'Female / Naive':      4.0,
    'Female / Surveilled': 2.0,
    'Male / Naive':        1.5,
    'Male / Surveilled':   1.0,
}
sample_weights = groups.map(weight_map)
skf            = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
# Integer codes of the group labels; the CV loops pass these (not the target)
# to skf.split as the stratification labels.
group_encoded  = pd.Categorical(groups).codes

print(f'Features: {len(FEATURE_COLS)}')
print(df['group'].value_counts())


---
## 5 · Fit Helper
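`fit_model` applies the early-stopping mechanism appropriate to each model type. This cell also defines the CV runner (`run_cv`), the per-group report (`evaluate_by_group`) and the Optuna patience callback (`OptunaES`).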

In [None]:
def fit_model(model, X_train, y_train, X_val, y_val, w_train, es_rounds=50):
    """Fit *model* once with sample weights and validation-based early stopping.

    The model family is detected from the class name, because each library
    exposes early stopping differently:

    * ``LGBMClassifier`` – via the ``early_stopping`` callback.
    * ``XGBClassifier``  – via the ``early_stopping_rounds`` estimator parameter.
    * anything else (CatBoost) – left to the overfitting-detector settings
      given to the constructor; ``es_rounds`` is not used in this branch.

    Args:
        model: Unfitted classifier instance.
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation split used as the early-stopping eval set.
        w_train: Per-row sample weights for the training split.
        es_rounds: Early-stopping patience for LightGBM and XGBoost.

    Returns:
        The same ``model`` object, fitted in place.
    """
    name = type(model).__name__
    if name == 'LGBMClassifier':
        model.fit(X_train, y_train,
                  sample_weight=w_train,
                  eval_set=[(X_val, y_val)],
                  callbacks=[lgbm_es(es_rounds, verbose=False), lgbm_log(-1)])
    elif name == 'XGBClassifier':
        # xgboost 2.0 removed `early_stopping_rounds` from fit(); passing it
        # there raises TypeError. It is an estimator parameter instead, so set
        # it on the model before fitting.
        model.set_params(early_stopping_rounds=es_rounds)
        model.fit(X_train, y_train,
                  sample_weight=w_train,
                  eval_set=[(X_val, y_val)],
                  verbose=False)
    else:  # CatBoost — od_wait set in constructor
        model.fit(X_train, y_train,
                  sample_weight=w_train,
                  eval_set=[(X_val, y_val)],
                  verbose=False)
    return model


def run_cv(model_fn, es_rounds=50):
    """Cross-validate a model factory on the module-level training data.

    Uses the module-level ``X``, ``y``, ``sample_weights``, ``groups``,
    ``group_encoded`` and ``skf``. Folds are stratified on ``group_encoded``.
    Each fold's validation split doubles as the early-stopping eval set and as
    the source of that fold's out-of-fold (OOF) predictions.

    Args:
        model_fn: Zero-argument callable returning a fresh, unfitted model.
        es_rounds: Early-stopping patience forwarded to ``fit_model``.

    Returns:
        ``(auc, fn_rate, oof)`` where ``auc`` is the ROC-AUC of the pooled OOF
        probabilities (not a mean of per-fold AUCs), ``fn_rate`` is the
        false-negative rate at threshold 0.5 within the ``'Female / Naive'``
        group, and ``oof`` is the array of OOF probabilities.
    """
    oof = np.zeros(len(X))
    for train_idx, val_idx in skf.split(X, group_encoded):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        w_tr        = sample_weights.iloc[train_idx]
        model       = model_fn()
        fit_model(model, X_tr, y_tr, X_val, y_val, w_tr, es_rounds)
        oof[val_idx] = model.predict_proba(X_val)[:, 1]

    mean_auc = roc_auc_score(y, oof)
    mask     = groups == 'Female / Naive'
    yt, yp   = y[mask], (oof[mask] >= 0.5).astype(int)
    # Pin the label set so the matrix is always 2x2; without it the 4-way
    # unpack fails whenever the slice contains a single class. The thresholded
    # predictions are 0/1, so the target is taken to be 0/1 as well.
    tn, fp, fn, tp = confusion_matrix(yt, yp, labels=[0, 1]).ravel()
    return mean_auc, fn / max(fn + tp, 1), oof


def evaluate_by_group(y_true, y_pred_proba, threshold=0.5):
    """Print AUC, recall and false-negative rate for each patient group.

    Groups come from the module-level ``groups`` series and are reported in
    sorted order. A group whose true labels contain fewer than two distinct
    values is skipped.

    Args:
        y_true: True labels, indexable by a boolean mask built from ``groups``.
        y_pred_proba: Predicted probabilities, in the same row order.
        threshold: Probability cut-off for a positive prediction.
    """
    hard_preds = (y_pred_proba >= threshold).astype(int)
    rows = []
    for label in sorted(groups.unique()):
        in_group = groups == label
        truth = y_true[in_group]
        preds = hard_preds[in_group]
        probs = y_pred_proba[in_group]
        if truth.nunique() < 2:
            continue
        _tn, _fp, missed, caught = confusion_matrix(truth, preds).ravel()
        rows.append({
            'group':   label,
            'n':       int(in_group.sum()),
            'AUC':     round(roc_auc_score(truth, probs), 4),
            'Recall':  round(recall_score(truth, preds, zero_division=0), 3),
            'FN_rate': round(missed / max(missed + caught, 1), 3),
        })
    report = pd.DataFrame(rows).set_index('group')
    print(report.to_string())


# Optuna patience-based early stopping
class OptunaES:
    """Optuna callback that stops a study after a run of non-improving trials.

    "Improvement" is measured on the first objective only: the largest
    ``values[0]`` among ``study.best_trials`` must exceed the best value seen
    so far by more than 1e-5. After ``patience`` consecutive calls without
    such an improvement, ``study.stop()`` is called.
    """

    def __init__(self, patience=8):
        self.patience = patience
        self._best = -np.inf   # best first-objective value seen so far
        self._count = 0        # consecutive calls without improvement

    def __call__(self, study, trial):
        first_objective = [t.values[0] for t in study.best_trials]
        current = max(first_objective) if first_objective else -np.inf

        improved = current > self._best + 1e-5
        if improved:
            self._best = current
            self._count = 0
        else:
            self._count += 1

        if self._count < self.patience:
            return
        print(f'⏹ Optuna ES after {self.patience} non-improving trials')
        study.stop()

print('✅ Helpers ready')


---
## 6 · Optuna — LightGBM

In [None]:
def lgbm_objective(trial):
    """Optuna objective for LightGBM.

    Samples one hyper-parameter set from *trial*, cross-validates it with
    ``run_cv`` and returns ``(auc, fn_rate)``: the two study objectives.
    """
    params = {
        'n_estimators':      trial.suggest_int('n_estimators', 300, 2000),
        'learning_rate':     trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'num_leaves':        trial.suggest_int('num_leaves', 31, 255),
        'max_depth':         trial.suggest_int('max_depth', 3, 9),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'subsample':         trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree':  trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha':         trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda':        trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
    }

    def build_model():
        # A fresh estimator per fold, all sharing this trial's parameters.
        return LGBMClassifier(**params, verbose=-1, random_state=42, n_jobs=-1)

    auc, fn_rate, _oof = run_cv(build_model)
    return auc, fn_rate

# Two-objective study (maximize AUC, minimize Female/Naive FN rate) persisted
# to SQLite so that re-running the cell resumes instead of starting over.
lgbm_study = optuna.create_study(
    directions=['maximize', 'minimize'],
    sampler=optuna.samplers.TPESampler(seed=42),
    storage='sqlite:////content/lgbm_study.db',
    study_name='lgbm_v1', load_if_exists=True,
)
# Count only COMPLETE trials toward the budget: `study.trials` also contains
# failed trials and stale RUNNING entries left by an interrupted session, and
# counting those would make a resumed study run fewer trials than requested.
n_done = len(lgbm_study.get_trials(
    deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,)))
print(f'LightGBM: {n_done} done, running {max(0, N_TRIALS-n_done)} more...')
lgbm_study.optimize(lgbm_objective, n_trials=max(0, N_TRIALS-n_done),
                    callbacks=[OptunaES(patience=8)], show_progress_bar=True)
print(f'✅ {len(lgbm_study.best_trials)} Pareto-optimal trials')


---
## 7 · Optuna — XGBoost

In [None]:
def xgb_objective(trial):
    """Optuna objective for XGBoost.

    Samples one hyper-parameter set from *trial*, cross-validates it with
    ``run_cv`` and returns ``(auc, fn_rate)``: the two study objectives.
    """
    params = {
        'n_estimators':     trial.suggest_int('n_estimators', 300, 2000),
        'learning_rate':    trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'max_depth':        trial.suggest_int('max_depth', 3, 9),
        'subsample':        trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'gamma':            trial.suggest_float('gamma', 0.0, 5.0),
        'reg_alpha':        trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda':       trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
    }

    def build_model():
        # A fresh estimator per fold, all sharing this trial's parameters.
        return XGBClassifier(**params, eval_metric='auc', verbosity=0,
                             random_state=42, n_jobs=-1)

    auc, fn_rate, _oof = run_cv(build_model)
    return auc, fn_rate

# Two-objective study (maximize AUC, minimize Female/Naive FN rate) persisted
# to SQLite so that re-running the cell resumes instead of starting over.
xgb_study = optuna.create_study(
    directions=['maximize', 'minimize'],
    sampler=optuna.samplers.TPESampler(seed=42),
    storage='sqlite:////content/xgb_study.db',
    study_name='xgboost_v1', load_if_exists=True,
)
# Count only COMPLETE trials toward the budget: `study.trials` also contains
# failed trials and stale RUNNING entries left by an interrupted session, and
# counting those would make a resumed study run fewer trials than requested.
n_done = len(xgb_study.get_trials(
    deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,)))
print(f'XGBoost: {n_done} done, running {max(0, N_TRIALS-n_done)} more...')
xgb_study.optimize(xgb_objective, n_trials=max(0, N_TRIALS-n_done),
                   callbacks=[OptunaES(patience=8)], show_progress_bar=True)
print(f'✅ {len(xgb_study.best_trials)} Pareto-optimal trials')


---
## 8 · Optuna — CatBoost

In [None]:
def cat_objective(trial):
    """Optuna objective for CatBoost.

    Samples one hyper-parameter set from *trial*, cross-validates it with
    ``run_cv`` and returns ``(auc, fn_rate)``: the two study objectives.
    """
    params = {
        'iterations':          trial.suggest_int('iterations', 300, 2000),
        'learning_rate':       trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'depth':               trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg':         trial.suggest_float('l2_leaf_reg', 1.0, 20.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength':     trial.suggest_float('random_strength', 0.0, 10.0),
        'border_count':        trial.suggest_int('border_count', 32, 255),
    }

    def build_model():
        # Early stopping for CatBoost is configured here (od_type / od_wait),
        # since fit_model passes no patience to this model family.
        return CatBoostClassifier(**params, od_type='Iter', od_wait=50,
                                  random_seed=42, verbose=0, thread_count=-1)

    auc, fn_rate, _oof = run_cv(build_model)
    return auc, fn_rate

# Two-objective study (maximize AUC, minimize Female/Naive FN rate) persisted
# to SQLite so that re-running the cell resumes instead of starting over.
cat_study = optuna.create_study(
    directions=['maximize', 'minimize'],
    sampler=optuna.samplers.TPESampler(seed=42),
    storage='sqlite:////content/cat_study.db',
    study_name='catboost_v1', load_if_exists=True,
)
# Count only COMPLETE trials toward the budget: `study.trials` also contains
# failed trials and stale RUNNING entries left by an interrupted session, and
# counting those would make a resumed study run fewer trials than requested.
n_done = len(cat_study.get_trials(
    deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,)))
print(f'CatBoost: {n_done} done, running {max(0, N_TRIALS-n_done)} more...')
cat_study.optimize(cat_objective, n_trials=max(0, N_TRIALS-n_done),
                   callbacks=[OptunaES(patience=8)], show_progress_bar=True)
print(f'✅ {len(cat_study.best_trials)} Pareto-optimal trials')


---
## 9 · Pareto Front & Best Trial Selection

In [None]:
def plot_pareto(study, name, ax):
    """Scatter every scored trial and the Pareto front of *study* onto *ax*.

    The x-axis is the second objective (FN rate), the y-axis the first (AUC).
    Trials without exactly two objective values are left out of the grey
    "All" cloud; the Pareto points come from ``study.best_trials``.

    Args:
        study: Object exposing ``trials`` and ``best_trials``.
        name: Title for the axis.
        ax: Matplotlib-style axis to draw on.
    """
    scored = [t.values for t in study.trials if t.values and len(t.values) == 2]
    front = [t.values for t in study.best_trials]

    cloud_auc = [pair[0] for pair in scored]
    cloud_fn = [pair[1] for pair in scored]
    front_auc = [pair[0] for pair in front]
    front_fn = [pair[1] for pair in front]

    ax.scatter(cloud_fn, cloud_auc, alpha=0.2, color='grey', s=15, label='All')
    ax.scatter(front_fn, front_auc, color='steelblue', s=50, zorder=5, label='Pareto')
    ax.set_xlabel('FN Rate Female/Naive (↓)')
    ax.set_ylabel('AUC (↑)')
    ax.set_title(name)
    ax.legend(fontsize=8)

# One panel per model family, drawn side by side for visual comparison of the
# AUC / FN-rate trade-off found by each study.
fig, axes = plt.subplots(1, 3, figsize=(16, 4))
plot_pareto(lgbm_study, 'LightGBM', axes[0])
plot_pareto(xgb_study,  'XGBoost',  axes[1])
plot_pareto(cat_study,  'CatBoost', axes[2])
plt.suptitle('Pareto Front — AUC vs Female/Naive FN Rate', fontsize=12)
plt.tight_layout(); plt.show()

def best_trial(study, fn_penalty=0.5):
    best_score, best = -np.inf, None
    for t in study.best_trials:
        score = t.values[0] - fn_penalty * t.values[1]
        if score > best_score:
            best_score, best = score, t
    return best

# Collapse each study's Pareto front to a single configuration using the
# default penalty in best_trial (AUC - 0.5 * FN rate).
lgbm_best = best_trial(lgbm_study)
xgb_best  = best_trial(xgb_study)
cat_best  = best_trial(cat_study)

# Report the chosen objective values and hyper-parameters per model.
# NOTE: best_trial returns None for an empty Pareto front, in which case the
# attribute access below fails.
for name, b in [('LightGBM', lgbm_best), ('XGBoost', xgb_best), ('CatBoost', cat_best)]:
    print(f'\n=== {name} ===')
    print(f'  AUC: {b.values[0]:.4f}  FN: {b.values[1]:.3f}')
    print(f'  {b.params}')


---
## 10 · Retrain & Generate Predictions

In [None]:
# Retrain each model family with its selected hyper-parameters, collecting
# out-of-fold (OOF) predictions on train and fold-averaged predictions on test.
X_test = test[FEATURE_COLS].copy()

# Pre-allocated output buffers; they are filled in place by the loop below.
oof_lgbm = np.zeros(len(X)); test_lgbm = np.zeros(len(X_test))
oof_xgb  = np.zeros(len(X)); test_xgb  = np.zeros(len(X_test))
oof_cat  = np.zeros(len(X)); test_cat  = np.zeros(len(X_test))

# (display name, model factory, OOF buffer, test buffer) per model family.
# The fixed keyword arguments mirror the ones used in the Optuna objectives.
retrain_configs = [
    ('LightGBM',
     lambda: LGBMClassifier(**lgbm_best.params, verbose=-1, random_state=42, n_jobs=-1),
     oof_lgbm, test_lgbm),
    ('XGBoost',
     lambda: XGBClassifier(**xgb_best.params, eval_metric='auc',
                           verbosity=0, random_state=42, n_jobs=-1),
     oof_xgb, test_xgb),
    ('CatBoost',
     lambda: CatBoostClassifier(**cat_best.params, od_type='Iter', od_wait=50,
                                random_seed=42, verbose=0, thread_count=-1),
     oof_cat, test_cat),
]

for name, model_fn, oof_, test_ in retrain_configs:
    print(f'Retraining {name}...')
    # Same splitter and stratification labels as run_cv.
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, group_encoded)):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        w_tr        = sample_weights.iloc[train_idx]
        model       = model_fn()
        fit_model(model, X_tr, y_tr, X_val, y_val, w_tr)
        # oof_ / test_ alias the arrays stored in retrain_configs, so the
        # indexed assignment and the in-place `+=` update oof_lgbm, test_lgbm,
        # etc. directly. Rebinding (`test_ = test_ + ...`) would break this.
        oof_[val_idx]  = model.predict_proba(X_val)[:, 1]
        test_         += model.predict_proba(X_test)[:, 1] / N_FOLDS
    print(f'  OOF AUC: {roc_auc_score(y, oof_):.4f}')


---
## 11 · Stacking Meta-Learner

In [None]:
# Level-2 inputs: one column of base-model probabilities per model family.
meta_train = np.column_stack([oof_lgbm, oof_xgb, oof_cat])
meta_test  = np.column_stack([test_lgbm, test_xgb, test_cat])

# Single source of truth for the meta-level fold count: it sizes the splitter
# AND is the divisor when averaging per-fold test predictions, so the two can
# never drift apart.
META_FOLDS = 5
meta_skf  = StratifiedKFold(n_splits=META_FOLDS, shuffle=True, random_state=99)
oof_meta  = np.zeros(len(X))
test_meta = np.zeros(len(X_test))

meta_model = Pipeline([
    ('scaler', RobustScaler()),
    ('clf', LogisticRegression(C=1.0, max_iter=1000, random_state=42)),
])

# The pipeline is refit from scratch on each fold's training part; its
# predictions on the held-out part form the stacked OOF vector.
for tr_idx, val_idx in meta_skf.split(meta_train, y):
    meta_model.fit(meta_train[tr_idx], y.iloc[tr_idx])
    oof_meta[val_idx]  = meta_model.predict_proba(meta_train[val_idx])[:, 1]
    test_meta         += meta_model.predict_proba(meta_test)[:, 1] / META_FOLDS

# Baseline to beat: the unweighted mean of the three base models.
avg_oof  = (oof_lgbm + oof_xgb + oof_cat) / 3

auc_stack = roc_auc_score(y, oof_meta)
auc_avg   = roc_auc_score(y, avg_oof)
print(f'Stacking AUC:       {auc_stack:.4f}')
print(f'Simple average AUC: {auc_avg:.4f}')

# Keep whichever combiner scores the higher OOF AUC; ties go to the average.
if auc_stack > auc_avg:
    print('→ Using stacking')
    final_oof, final_test = oof_meta, test_meta
else:
    print('→ Using simple average')
    final_oof  = avg_oof
    final_test = (test_lgbm + test_xgb + test_cat) / 3

print(f'\nFinal OOF AUC: {roc_auc_score(y, final_oof):.4f}')


---
## 12 · Per-Group Evaluation

In [None]:
# Per-group AUC / recall / FN rate of the chosen ensemble's OOF predictions,
# at the default 0.5 threshold.
print('=== Final Ensemble — Per-Group ===')
evaluate_by_group(y, final_oof)


---
## 13 · Download Submission

In [None]:
# Write the final test predictions to CSV and trigger a browser download.
from google.colab import files

# The 'Heart Disease' column holds the ensemble's predicted probabilities
# (final_test), not thresholded class labels.
submission = pd.DataFrame({
    'id':            test['id'],
    'Heart Disease': final_test,
})
submission.to_csv('/content/submission_final.csv', index=False)
print('✅ submission_final.csv ready')
# Quick sanity check of the output before downloading.
print(submission.head())
print(submission['Heart Disease'].describe().round(3))
files.download('/content/submission_final.csv')
