Verify whether oversampling-before-split inflates performance on CDC's BRFSS dataset.
Replicating: https://pmc.ncbi.nlm.nih.gov/articles/PMC11678659/

The script runs three experiments using BRFSS (2021):

A) "LEAKY": SMOTE-ENN is applied to the dataset BEFORE the train/test split.

B) "PROPER": Split first; oversampling (SMOTE-ENN) happens ONLY inside the training pipeline.

C) "LEAKY (undersampling)": RandomUnderSampler is applied to the dataset BEFORE the train/test split.


In [1]:
# https://www.cdc.gov/brfss/annual_data/annual_data.htm
# datafile downloaded from: https://www.cdc.gov/brfss/annual_data/2021/files/LLCP2021XPT.zip

# Directory that holds the BRFSS 2021 transport file; the load cell joins this
# path with the file name "LLCP2021.XPT".
XPT_PATH = r"C:\Users\meltawil\Rutgers University\DOPPS Data Kidney Project - General\6.0 Team Working Folders\Mohamed\NHANES Files"
# WSL alternative. NOTE(review): this variant already ends with the file name,
# while the load cell appends "LLCP2021.XPT" itself -- drop the file name
# before enabling it.
# XPT_PATH = "/mnt/c/Users/meltawil/Rutgers University/DOPPS Data Kidney Project - General/6.0 Team Working Folders/Mohamed/NHANES Files/LLCP2021.XPT"

random_state = 42      # seed passed to every split, sampler and model that accepts one
TEST_SIZE = 0.30       # fraction of rows held out as the test set
SUBSAMPLE_FRAC = 0.1   # keep 10% of the cleaned rows to speed up processing


### 1) IMPORTS

In [2]:
import os
import warnings
import time
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, classification_report
)

from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler


### 2) HELPER FUNCTIONS

In [3]:
def recode_alcday5(series):
    """Convert raw ALCDAY5 codes into an approximate per-month day count.

    Code handling:
      * 101-107 (days per week)      -> (code - 100) * 4.345
      * 201-230 (days in past 30)    -> code - 200
      * 888 (no drinks in past 30)   -> 0
      * anything else (777, 999, missing, non-numeric) -> NaN

    Returns a float Series aligned to the index of ``series``.
    """
    codes = pd.to_numeric(series, errors="coerce")

    # The three code ranges are disjoint, so the order of the rules is irrelevant.
    per_week = codes.between(101, 107).to_numpy()
    per_month = codes.between(201, 230).to_numpy()
    abstained = (codes == 888).to_numpy()

    monthly = np.select(
        [per_week, per_month, abstained],
        [
            ((codes - 100) * 4.345).to_numpy(),  # weeks -> ~days/month
            (codes - 200).to_numpy(),
            0.0,
        ],
        default=np.nan,
    )
    return pd.Series(monthly, index=codes.index, dtype="float")


def recode_food_frequency(series):
    """Convert FRUIT2 / FVGREEN1 / FRENCHF1 frequency codes to times per month.

    Code handling:
      * 101-199 (times per day)   -> (code - 100) * 30
      * 201-299 (times per week)  -> (code - 200) * 4.3
      * 300 (less than monthly)   -> 0.5
      * 301-399 (times per month) -> code - 300
      * 555 (never)               -> 0
      * anything else (777, 999, missing, non-numeric) -> NaN

    Returns a float Series aligned to the index of ``series``.
    """
    codes = pd.to_numeric(series, errors="coerce")

    # (condition, monthly value) pairs; the conditions never overlap.
    rules = [
        (codes.between(101, 199), (codes - 100) * 30),
        (codes.between(201, 299), (codes - 200) * 4.3),
        (codes == 300, 0.5),
        (codes.between(301, 399), codes - 300),
        (codes == 555, 0.0),
    ]
    conditions = [cond.to_numpy() for cond, _ in rules]
    values = [
        val.to_numpy() if isinstance(val, pd.Series) else val
        for _, val in rules
    ]

    # Codes matching no rule (including 777 and 999) fall through to NaN.
    monthly = np.select(conditions, values, default=np.nan)
    return pd.Series(monthly, index=codes.index, dtype="float")


def prepare_brfss_dataset(df):
    """Derive the 18 model features and the heart-disease target from raw BRFSS.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw BRFSS frame with upper-case column names. Every column is read
        with ``df[...]`` (KeyError if absent) except HTM4 and WTKG3, which are
        read with ``df.get`` and so may be missing.

    Returns
    -------
    (X, y)
        ``X`` has one column per feature, in the order they are created below.
        ``y`` is the recoded ``_MICHD`` column, named ``"Heart_Disease"``
        (1 -> 1, 2 -> 0, 7/9 -> NaN). Rows are not filtered here.
    """
    unknown = {7: np.nan, 9: np.nan}   # codes 7 and 9 are recoded to missing
    yes_no = {1: 1, 2: 0, **unknown}   # 1 -> 1, 2 -> 0

    # Target: Heart disease (_MICHD)
    y = df["_MICHD"].replace(yes_no).rename("Heart_Disease")

    X = pd.DataFrame(index=df.index)

    X["General_Health"] = df["GENHLTH"].replace({1: 1, 2: 2, 3: 3, 4: 4, 5: 5, **unknown})
    # Codes 1..4 are reversed to 4..1 and code 8 becomes 0.
    X["Checkup"] = df["CHECKUP1"].replace({1: 4, 2: 3, 3: 2, 4: 1, 7: np.nan, 8: 0, 9: np.nan})

    binary_block = (
        ("Exercise", "EXERANY2"),
        ("Skin_Cancer", "CHCSCNCR"),
        ("Other_Cancer", "CHCOCNCR"),
        ("Depression", "ADDEPEV3"),
    )
    for feature, raw_column in binary_block:
        X[feature] = df[raw_column].replace(yes_no)

    # Only code 1 counts as diabetes; codes 2, 3 and 4 are all mapped to 0.
    X["Diabetes"] = df["DIABETE4"].replace({1: 1, 2: 0, 3: 0, 4: 0, **unknown})
    X["Arthritis"] = df["HAVARTH5"].replace(yes_no)
    X["Sex"] = df["SEXVAR"]  # used as-is, no recoding

    # Age group codes 1..13 are flipped to 12..0; code 14 becomes missing.
    age_recode = {code: 13 - code for code in range(1, 14)}
    age_recode[14] = np.nan
    X["Age_Category"] = df["_AGEG5YR"].replace(age_recode)

    X["Height_cm"] = pd.to_numeric(df.get("HTM4"), errors="coerce")
    X["Weight_kg"] = pd.to_numeric(df.get("WTKG3"), errors="coerce") * 0.01
    height_m = X["Height_cm"] * 0.01
    X["BMI"] = X["Weight_kg"] / (height_m ** 2)

    X["Smoking_History"] = df["SMOKE100"].replace(yes_no)
    X["Alcohol_Consumption"] = recode_alcday5(df["ALCDAY5"])

    diet_block = (
        ("Fruit_Consumption", "FRUIT2"),
        ("Green_Vegetables_Consumption", "FVGREEN1"),
        ("FriedPotato_Consumption", "FRENCHF1"),
    )
    for feature, raw_column in diet_block:
        X[feature] = recode_food_frequency(df[raw_column])

    return X, y


In [4]:
def clean_brfss_dataset(X, y, drop_na=True, drop_dupes=True,
                        height_range=(140, 210), weight_range=(45, 200),
                        verbose=True):
    """Remove duplicate, incomplete and implausible rows from (X, y).

    Steps, in order:
      1. ``X`` and ``y`` are joined column-wise (``y`` must be named
         ``"Heart_Disease"``).
      2. Exact duplicate rows are dropped when ``drop_dupes`` is true.
      3. Rows containing any NaN are dropped when ``drop_na`` is true.
      4. Rows are kept only if ``Height_cm`` and ``Weight_kg`` lie inside
         ``height_range`` / ``weight_range`` (bounds inclusive). This step
         always runs, and NaN heights/weights fail it.

    With ``verbose`` the row count after every step is printed.
    Returns the cleaned ``(X, y)`` pair with the original index labels.
    """
    combined = pd.concat([X, y], axis=1)
    n_before = len(combined)

    deduped = combined.drop_duplicates() if drop_dupes else combined
    n_after_dupes = len(deduped)

    complete = deduped.dropna() if drop_na else deduped
    n_after_na = len(complete)

    height_ok = complete["Height_cm"].between(*height_range)
    weight_ok = complete["Weight_kg"].between(*weight_range)
    plausible = complete[height_ok & weight_ok]
    n_after_range = len(plausible)

    if verbose:
        print(f"Initial rows: {n_before:,}")
        if drop_dupes:
            print(f"After dropping duplicates: {n_after_dupes:,} (removed {n_before - n_after_dupes:,})")
        if drop_na:
            print(f"After dropping NaNs: {n_after_na:,} (removed {n_after_dupes - n_after_na:,})")
        print(f"After filtering Height[{height_range[0]}–{height_range[1]}] & "
              f"Weight[{weight_range[0]}–{weight_range[1]}]: {n_after_range:,} "
              f"(removed {n_after_na - n_after_range:,})")
        print("\n")

    # Separate the target from the features again.
    return plausible.drop(columns=["Heart_Disease"]), plausible["Heart_Disease"]


In [5]:
def split_brfss_dataset(X, y, test_size=TEST_SIZE, subsample_frac=SUBSAMPLE_FRAC,
                        random_state=random_state, verbose=True):
    """Drop unlabeled rows and make one stratified train/test split.

    Parameters
    ----------
    X, y : features and target; rows where ``y`` is NaN are removed, the
        remaining labels are cast to ``int`` and both indexes are reset.
    test_size : fraction of rows assigned to the test set.
    subsample_frac : accepted for backward compatibility but NOT used here;
        this function performs no subsampling.
    random_state : seed forwarded to ``train_test_split``.
    verbose : when true, print the size and positive rate of the full data
        and of each split.

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    labelled = y.notna()
    X = X.loc[labelled].reset_index(drop=True)
    y = y.loc[labelled].astype(int).reset_index(drop=True)

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,  # keep the class ratio equal in both parts
    )

    if verbose:
        print("\n=== Class balance (original and splits) ===")
        for tag, labels in (("ALL", y), ("TRAIN", y_train), ("TEST", y_test)):
            share = float((labels == 1).mean())
            size = len(labels)
            print(f"{tag:>10s}  n={size:>8d}  positives={share*100:5.2f}%  (class balance)")
        print("\n")

    return X_train, X_test, y_train, y_test


In [6]:
# Helper transformer to cast to float32
# to_float32 = FunctionTransformer(lambda x: x.astype(np.float32))

def make_knn_pipeline(leaky=False):
    """Build the KNN pipeline: scale -> [resample] -> float32 cast -> KNN.

    Parameters
    ----------
    leaky : bool
        When true the resampling step is left out. When false the
        module-level ``smoteenn`` object is inserted after the scaler, so
        that global must already exist when this function is called.

    Returns
    -------
    An imblearn ``Pipeline`` (not yet fitted).
    """
    classifier = KNeighborsClassifier(
        n_neighbors=2,
        metric="euclidean",
        weights="uniform",
        algorithm="brute",
    )

    steps = [("scaler", MinMaxScaler())]
    if not leaky:
        # Resolved from module scope at call time.
        steps.append(("smoteenn", smoteenn))
    steps.append(("to32", FunctionTransformer(lambda x: x.astype(np.float32))))
    steps.append(("clf", classifier))

    return ImbPipeline(steps)


# def evaluate(pipe, Xval, yval, label):
#     preds = pipe.predict(Xval)

#     try:
#         proba = pipe.predict_proba(Xval)[:, 1]
#         auc = roc_auc_score(yval, proba)
#     except Exception:
#         proba, auc = None, np.nan

#     acc = accuracy_score(yval, preds)
#     f1  = f1_score(yval, preds)
#     cm  = confusion_matrix(yval, preds)
#     cr  = classification_report(yval, preds, digits=3, output_dict=True)  # structured dict

#     # Print summary
#     print(f"\n[{label}]")
#     print(f"ACC: {acc:.4f}  F1: {f1:.4f}  AUC: {auc:.4f}")
#     print("Confusion matrix:\n", cm)
#     print("Classification report:\n", classification_report(yval, preds, digits=3))

#     # Return as dictionary
#     return {
#         "label": label,
#         "accuracy": acc,
#         "f1": f1,
#         "auc": auc,
#         "confusion_matrix": cm,
#         "classification_report": cr
#     }

from sklearn.preprocessing import LabelBinarizer

def evaluate(pipe, Xval, yval, label):
    """Score a fitted pipeline on a validation set.

    Parameters
    ----------
    pipe : fitted estimator exposing ``predict`` and, optionally,
        ``predict_proba``.
    Xval, yval : validation features and true labels.
    label : text used as a prefix in the printed ROC status line.

    Returns
    -------
    dict with keys ``label``, ``accuracy``, ``f1``, ``auc``,
    ``confusion_matrix``, ``classification_report`` (as a dict), ``fpr`` and
    ``tpr``. If the ROC part is unavailable or fails, ``auc`` stays NaN and
    ``fpr`` / ``tpr`` stay empty lists.
    """
    predictions = pipe.predict(Xval)

    # Flat array of the true labels for the ROC helpers.
    truth = np.array(yval).ravel()

    auc = np.nan
    fpr, tpr = [], []
    if hasattr(pipe, "predict_proba"):
        # Best effort: a ROC failure is reported but must not stop the run.
        try:
            scores = pipe.predict_proba(Xval)
            has_class_columns = scores.ndim == 2 and scores.shape[1] > 1
            # Use column 1 when there are several columns, else flatten.
            scores = scores[:, 1] if has_class_columns else scores.ravel()

            auc = roc_auc_score(truth, scores)
            fpr, tpr, _ = roc_curve(truth, scores)

            print(f"[{label}] ROC computed: {len(fpr)} points")
        except Exception as e:
            print(f"[{label}] ROC failed: {e}")

    return {
        "label": label,
        "accuracy": accuracy_score(yval, predictions),
        "f1": f1_score(yval, predictions),
        "auc": auc,
        "confusion_matrix": confusion_matrix(yval, predictions),
        "classification_report": classification_report(
            yval, predictions, digits=3, output_dict=True
        ),
        "fpr": fpr,   # always present, possibly empty
        "tpr": tpr,
    }

# def evaluate(pipe, Xval, yval, label):
#     preds = pipe.predict(Xval)

#     # Force numpy array
#     y_bin = np.array(yval).ravel()

#     proba, auc, fpr, tpr = None, np.nan, [], []
#     if hasattr(pipe, "predict_proba"):
#         try:
#             raw_proba = pipe.predict_proba(Xval)
#             print(f"[{label}] predict_proba shape: {raw_proba.shape}")

#             # Handle binary output
#             if raw_proba.ndim == 2 and raw_proba.shape[1] > 1:
#                 proba = raw_proba[:, 1]
#             else:
#                 proba = raw_proba.ravel()

#             print(f"[{label}] proba sample: {proba[:10]}")  # first 10 probs

#             auc = roc_auc_score(y_bin, proba)
#             fpr, tpr, _ = roc_curve(y_bin, proba)

#             print(f"[{label}] ROC lengths: fpr={len(fpr)}, tpr={len(tpr)}")

#         except Exception as e:
#             print(f"[{label}] ROC failed: {e}")

#     acc = accuracy_score(yval, preds)
#     f1  = f1_score(yval, preds)
#     cm  = confusion_matrix(yval, preds)
#     cr  = classification_report(yval, preds, digits=3, output_dict=True)

#     return {
#         "label": label,
#         "accuracy": acc,
#         "f1": f1,
#         "auc": auc,
#         "confusion_matrix": cm,
#         "classification_report": cr,
#         "fpr": fpr,
#         "tpr": tpr
#     }


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier

# NEW: use SciKeras wrapper instead of the removed TF one
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization


def build_ann(input_dim=None):
    """Create and compile the feed-forward network used as the 'ann' model.

    Layout: Dense(128, relu) -> Dropout(0.3) -> BatchNorm ->
    Dense(64, relu) -> Dropout(0.3) -> BatchNorm -> Dense(1, sigmoid).

    Parameters
    ----------
    input_dim : int or None
        Passed to the first Dense layer as ``input_dim``.

    Returns
    -------
    A compiled Keras ``Sequential`` model (Adam optimizer, accuracy metric).
    """
    stack = (
        Dense(128, activation="relu", input_dim=input_dim),
        Dropout(0.3),
        BatchNormalization(),
        Dense(64, activation="relu"),
        Dropout(0.3),
        BatchNormalization(),
        Dense(1, activation="sigmoid"),
    )

    model = Sequential()
    for layer in stack:
        model.add(layer)

    # NOTE(review): MSE with a sigmoid output is unusual for a binary
    # classifier (binary cross-entropy is the common choice) -- confirm this
    # matches the paper being replicated before changing it.
    model.compile(optimizer="adam", loss="mse", metrics=["accuracy"])
    return model



def make_pipeline(model_name="knn", leaky=False, smoteenn=None, n_features=None):
    """
    Build an imbalanced-learn pipeline with the classifier chosen by `model_name`.

    Pipeline layout: MinMaxScaler -> [sampler] -> float32 cast -> classifier.

    Only the requested classifier is instantiated. In particular the Keras
    ANN (which needs the feature count) is built only for ``model_name="ann"``,
    so the other models do not depend on a module-level ``X`` or on
    constructing TensorFlow / XGBoost objects.

    Parameters
    ----------
    model_name : str
        One of: 'knn', 'rf', 'lr', 'nb', 'svc', 'xgb', 'ann'
    leaky : bool
        If True, skip the sampler step (data is assumed to be resampled
        already, outside the pipeline).
    smoteenn : sampler or None
        SMOTE-ENN object (or any imblearn sampler) inserted after the scaler
        when `leaky` is False. If None, no sampler step is added.
    n_features : int, optional
        Input width for the ANN. Defaults to ``X.shape[1]`` of the
        module-level feature matrix ``X`` (the previous behaviour).

    Returns
    -------
    imblearn.pipeline.Pipeline
        Unfitted pipeline.

    Raises
    ------
    ValueError
        If `model_name` is not one of the supported names.
    """

    def _ann_input_dim():
        # Fall back to the global feature matrix only when actually needed.
        return X.shape[1] if n_features is None else n_features

    # Zero-argument factories: nothing is constructed until one is called.
    factories = {
        "knn": lambda: KNeighborsClassifier(
            n_neighbors=2, metric="euclidean", weights="uniform", algorithm="brute"
        ),
        "rf": lambda: RandomForestClassifier(
            n_estimators=300, max_depth=None,
            min_samples_split=2, min_samples_leaf=1, random_state=42
        ),
        "lr": lambda: LogisticRegression(
            solver="saga", penalty="l1", max_iter=500, C=0.08858667904100823
        ),
        "nb": lambda: GaussianNB(var_smoothing=1e-7, priors=[0.3, 0.7]),
        "svc": lambda: SVC(kernel="rbf", probability=True, random_state=42),
        "xgb": lambda: XGBClassifier(
            colsample_bytree=1.0, learning_rate=0.2, max_depth=7,
            n_estimators=300, subsample=0.9,
            reg_alpha=0.5, reg_lambda=0.5, use_label_encoder=False,
            eval_metric="logloss", random_state=42
        ),
        "ann": lambda: KerasClassifier(
            model=build_ann,
            model__input_dim=_ann_input_dim(),  # tell SciKeras how many features
            epochs=50,
            batch_size=32,
            verbose=0
        ),
    }

    # Validate before building anything.
    if model_name not in factories:
        raise ValueError(f"Unknown model_name: {model_name}")

    steps = [("scaler", MinMaxScaler())]

    if not leaky and smoteenn is not None:
        steps.append(("smoteenn", smoteenn))

    steps.extend([
        ("to32", FunctionTransformer(lambda x: x.astype(np.float32))),
        ("clf", factories[model_name]()),
    ])

    return ImbPipeline(steps)


ImportError: cannot import name 'kullback_leibler_divergence' from 'keras.losses' (C:\Users\meltawil\anaconda3\envs\gpu-env\lib\site-packages\keras\losses\__init__.py)

### 3) LOAD & PREPROCESS DATA

In [None]:
# ===== LOAD  =====
# XPT_PATH is the containing directory; the file name is appended here.
df = pd.read_sas(os.path.join(XPT_PATH, "LLCP2021.XPT"), format="xport", encoding="utf-8")
df.columns = [str(c).upper() for c in df.columns]
print(f"Rows: {len(df):,}, Columns: {len(df.columns):,}\n")

# ===== MAP FIELDS  =====
X, y = prepare_brfss_dataset(df)

# ===== Target distribution =====
# Reported on the raw target (before cleaning), missing labels included.
counts = y.value_counts(dropna=False)
percentages = (counts / len(y) * 100).round(2)
print(pd.DataFrame({"Count": counts, "Percent": percentages}))
print("\n")

# ===== DATA CLEANING  =====
X_all, y_all = clean_brfss_dataset(X, y)

# ===== SUBSAMPLE  =====
if SUBSAMPLE_FRAC < 1.0:
    # Stratified draw of SUBSAMPLE_FRAC of the cleaned rows; the rest is discarded.
    X, _, y, _ = train_test_split(
        X_all, y_all,
        test_size=(1 - SUBSAMPLE_FRAC),
        random_state=random_state,
        stratify=y_all,
    )
    print(f"Subsampled dataset: {len(X):,} rows, positives={y.sum():,} ({y.mean()*100:.2f}%)\n")
else:
    # No subsampling: continue with the full CLEANED data. (Copying X, y here
    # would silently reuse the uncleaned frames from prepare_brfss_dataset.)
    X, y = X_all.copy(), y_all.copy()

# ===== DATA SPLITTING  =====
X_train, X_test, y_train, y_test = split_brfss_dataset(X, y)


In [None]:
# # Combine features + target
# df_corr = X.copy()
# df_corr["Heart_Disease"] = y

# # Compute correlations (numeric only)
# corr = df_corr.corr(numeric_only=True)

# # Plot heatmap with annotations
# plt.figure(figsize=(14, 10))
# sns.heatmap(corr,cmap="coolwarm",annot=True,fmt=".2f",center=0,cbar_kws={"shrink": 0.75})
# plt.title("Correlation Heatmap of Features and Heart_Disease", fontsize=16, pad=20)
# plt.tight_layout()
# plt.show()


In [None]:
# Display the (rows, columns) shape of the feature matrix X used by the experiments.
X.shape

### 4) EXPERIMENT A — **LEAKY**

In [None]:
# ======================================================================
# EXPERIMENT A: LEAKY (SMOTE-ENN applied BEFORE split)
# ======================================================================
# The whole of (X, y) is resampled first and only then divided into train
# and test parts, so the test part is drawn from resampled data.
print("\n" + "="*70)
print("EXPERIMENT A: LEAKY (SMOTE-ENN applied BEFORE split)")
print("="*70)

# This sampler object is also the one handed to make_pipeline in Experiment B.
smoteenn = SMOTEENN(random_state=random_state, sampling_strategy="auto", n_jobs=-1)

# 1) Apply SMOTE-ENN globally (this is the 'leaky' mistake)
# NOTE(review): resampling here runs on the unscaled features, whereas the
# Experiment B pipeline resamples after MinMaxScaler -- confirm this
# difference is intended.
X_leaky, y_leaky = smoteenn.fit_resample(X, y)
print(f"After SMOTE-ENN (global): rows={len(X_leaky):,}, "
      f"positives={(y_leaky==1).sum():,}, negatives={(y_leaky==0).sum():,}")

# 2) Split AFTER resampling (leakage!) & run pipeline
Xtr_L, Xte_L, ytr_L, yte_L = train_test_split(X_leaky, y_leaky,test_size=TEST_SIZE,random_state=random_state,stratify=y_leaky)

# 3) Run ALL models
# model_names is reused by the later experiment cells; "svc" and "ann" are
# currently left out.
model_names = ["knn", "rf", "lr", "nb", "xgb"] #, "svc", "ann"]
results_leaky = {}  # model name -> dict returned by evaluate()

for name in model_names:
    print(f"\n--- Training {name.upper()} ---")
    start = time.time()
    # leaky=True: the pipeline adds no sampler step; data is already resampled.
    pipe = make_pipeline(model_name=name, leaky=True)
    pipe.fit(Xtr_L, ytr_L)
    results_leaky[name] = evaluate(pipe, Xte_L, yte_L, f"LEAKY {name.upper()}")
    end = time.time()
    elapsed = end - start  # covers pipeline construction, fit and evaluation
    print(f"Time taken for {name.upper()}: {elapsed:.2f} seconds\n\n")

print("\nCompleted Experiment A (Leaky).")


In [None]:
def build_comparison_table(results_dict):
    """
    Build a comparison DataFrame from evaluate() outputs.

    Parameters
    ----------
    results_dict : dict
        Mapping of {model_name: evaluate_output}. Each output must provide
        the keys "auc", "accuracy", "f1", "confusion_matrix" (2-D array,
        row 0 = true class 0) and "classification_report" (dict form).

    Returns
    -------
    pandas.DataFrame
        Indexed by "Method" (upper-cased model name) with the columns
        AUC, ACC, F1, Precision, Recall and Specificity. An empty
        `results_dict` yields an empty table with those columns instead of
        raising.
    """
    columns = ["Method", "AUC", "ACC", "F1", "Precision", "Recall", "Specificity"]
    summary_keys = ("accuracy", "macro avg", "weighted avg")

    rows = []
    for name, res in results_dict.items():
        cr = res["classification_report"]

        # Per-class entries only; the aggregate rows of the report are skipped.
        class_keys = [k for k in cr if k not in summary_keys]
        if len(class_keys) == 2:
            # Binary case: the positive class is the *larger* label.
            pos_class = sorted(class_keys)[-1]
        else:
            # Fallback: just take the last class.
            pos_class = class_keys[-1]

        cm = res["confusion_matrix"]
        rows.append({
            "Method": name.upper(),
            "AUC": res["auc"],
            "ACC": res["accuracy"],
            "F1": res["f1"],
            "Precision": cr[pos_class]["precision"],
            "Recall": cr[pos_class]["recall"],
            # Share of true class-0 rows that were predicted as class 0.
            "Specificity": cm[0, 0] / cm[0].sum(),
        })

    # Explicit columns keep set_index working when `rows` is empty.
    return pd.DataFrame(rows, columns=columns).set_index("Method")


def plot_all_rocs(results_dict, title="ROC Curves Comparison"):
    """Draw the ROC curve of every model in ``results_dict`` on one figure.

    ``results_dict`` maps a model name to an evaluate() result. Entries whose
    "fpr"/"tpr" are missing or empty are reported on stdout and skipped. A
    dashed diagonal labelled "Chance" is always drawn, then the figure is shown.
    """
    plt.figure(figsize=(8, 6))

    for name, res in results_dict.items():
        fpr = res.get("fpr")
        tpr = res.get("tpr")
        drawable = fpr is not None and tpr is not None and len(fpr) > 0
        if not drawable:
            print(f"[{name.upper()}] No ROC curve data available.")
            continue
        auc = res.get("auc")
        plt.plot(fpr, tpr, label=f"{name.upper()} (AUC={auc:.4f})")

    # Reference line of a classifier with no skill.
    plt.plot([0, 1], [0, 1], "k--", label="Chance")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(title)
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.show()



In [None]:
# Summarize Experiment A: one row per model, metrics rounded to 4 decimals.
df_leaky  = build_comparison_table(results_leaky)

print("\n=== LEAKY Results ===")
print(df_leaky.round(4))

# ROC curves of all Experiment A models on a single figure.
plot_all_rocs(results_leaky, title="Experiment A: Leaky ROC Curves")


### 5) EXPERIMENT B — **PROPER**

In [None]:
# ======================================================================
# EXPERIMENT B: PROPER (split first; resample only on the training folds)
# ======================================================================

# Split first; then run SMOTE-ENN ONLY within training via imblearn Pipeline.
# Uses the single X_train / X_test split made in the load cell, plus the
# `smoteenn` sampler and `model_names` list defined in the Experiment A cell.
print("\n" + "="*70)
print("EXPERIMENT B: PROPER (split first; resample only on the training folds)")
print("="*70)


# Run ALL models
results_proper1 = {}  # model name -> dict returned by evaluate()

for name in model_names:
    print(f"\n--- Training {name.upper()} ---")
    # leaky=False with a sampler: make_pipeline inserts it after the scaler.
    pipe = make_pipeline(model_name=name, leaky=False, smoteenn=smoteenn)
    pipe.fit(X_train, y_train)
    results_proper1[name] = evaluate(pipe, X_test, y_test, f"PROPER {name.upper()}")

print("\nCompleted Experiment B (Proper).")


In [None]:
# Summarize Experiment B. The table must be built from results_proper1;
# printing df_leaky here would repeat the Experiment A numbers under the
# PROPER heading.
df_proper1 = build_comparison_table(results_proper1)

print("\n=== PROPER (1) Results ===")
print(df_proper1.round(4))

plot_all_rocs(results_proper1, title="Experiment B: Proper (1) ROC Curves")


### 6) EXPERIMENT C — **LEAKY** RandomUnderSampler applied BEFORE split

In [None]:
# ======================================================================
# EXPERIMENT C: LEAKY (RandomUnderSampler applied BEFORE split)
# ======================================================================
# Same protocol as Experiment A, with random undersampling in place of SMOTE-ENN.
print("\n" + "="*70)
print("EXPERIMENT C: LEAKY (RandomUnderSampler applied BEFORE split)")
print("="*70)

rus = RandomUnderSampler(random_state=random_state, sampling_strategy="auto")

# 1) Apply undersampling globally (this is the 'leaky' mistake)
# NOTE(review): this resamples the full cleaned data (X_all, y_all), while
# Experiment A resamples the subsampled X, y -- confirm this is intended.
X_leaky_us, y_leaky_us = rus.fit_resample(X_all, y_all)
print(f"After RandomUnderSampler (global): rows={len(X_leaky_us):,}, "
      f"positives={(y_leaky_us==1).sum():,}, negatives={(y_leaky_us==0).sum():,}")

# 2) Split AFTER resampling (leakage!)
Xtr_L_us, Xte_L_us, ytr_L_us, yte_L_us = train_test_split(
    X_leaky_us, y_leaky_us,
    test_size=TEST_SIZE,
    random_state=random_state,
    stratify=y_leaky_us
)

# 3) Run ALL models
model_names = ["knn", "rf", "lr", "nb", "xgb"]  # add "svc", "ann" later if needed
results_leaky_us = {}  # model name -> dict returned by evaluate()

for name in model_names:
    print(f"\n--- Training {name.upper()} (Undersample) ---")
    start = time.time()
    pipe = make_pipeline(model_name=name, leaky=True)  # still "leaky" since resampling was global
    pipe.fit(Xtr_L_us, ytr_L_us)
    results_leaky_us[name] = evaluate(pipe, Xte_L_us, yte_L_us, f"LEAKY-US {name.upper()}")
    end = time.time()
    # Elapsed time covers pipeline construction, fit and evaluation.
    print(f"Time taken for {name.upper()}: {end-start:.2f} seconds\n")

print("\nCompleted Experiment C (Leaky with Random Undersampling).")


In [None]:
# Summarize Experiment C. results_leaky_us is a plain dict (it has no
# .round()), so it is converted to a table first.
df_leaky_us = build_comparison_table(results_leaky_us)

print("\n=== LEAKY (Random Undersampling) Results ===")
print(df_leaky_us.round(4))

plot_all_rocs(results_leaky_us, title="Experiment C: Leaky (Random Undersampling) ROC Curves")


### 7) COMPARISON