# 00. Introduction
The aim of this notebook is to add thorough, EDA-driven feature engineering on:
 - Original competition features
 - New features derived from raw data (general EDA rules)
 - New features derived from reconstructed columns (medical/logic rules)
 - Auto-generated meaningful features targeting ~20, ~50, ~100, and ~200 extras
We evaluate each feature set with 5-fold CV (LightGBM GPU), log AUC, save OOF & submissions,
and export per-set feature importances.

Newly created features in version 2 saved in the dataset I created, reading those files here to run other models.

## 01. Config

In [1]:
VERSION = "FE_012"
TIME_LIMIT_HOURS = 11.5
N_SPLITS = 10
SEED = 42

# Select model(s) to run (keep your structure)
MODELS_TO_RUN = [
    # "xgboost_gpu",
    # "lightgbm_gpu",
    # "catboost_gpu",
    # "logreg",
    "hist_gbdt",
]

TRAIN_PATH = "/kaggle/input/playground-series-s5e12/train.csv"
TEST_PATH  = "/kaggle/input/playground-series-s5e12/test.csv"
SAMPLE_SUB_PATH = "/kaggle/input/playground-series-s5e12/sample_submission.csv"

USE_PREBUILT_RECON = True
RECON_TRAIN_PATH = "/kaggle/input/s05e12-outputs-diabetes-prediction/train_reconstructed_vreconstruct_007.csv"
RECON_TEST_PATH  = "/kaggle/input/s05e12-outputs-diabetes-prediction/test_reconstructed_vreconstruct_007.csv"

# Read engineered feature sets you created earlier
USE_PREBUILT_ENGINEERED = True
ENGINEERED_DIR = "/kaggle/input/s05e12-outputs-diabetes-prediction"
ENGINEERED_VERSION = "FE_002"   # matches your saved engineered filenames

TARGET = "diagnosed_diabetes"
ID_COL = "id"
SUB_TARGET_COL = "diagnosed_diabetes"

# Paths to 6 feature-importance CSVs (LightGBM GPU)
FI_PATHS = {
    "auto_100": "/kaggle/input/s05e12-outputs-diabetes-prediction/71_03-feat_importance_lightgbm_gpu_auto_100_vFE_008.csv",
    "auto_200": "/kaggle/input/s05e12-outputs-diabetes-prediction/71_03-feat_importance_lightgbm_gpu_auto_200_vFE_008.csv",
    "auto_20":  "/kaggle/input/s05e12-outputs-diabetes-prediction/71_03-feat_importance_lightgbm_gpu_auto_20_vFE_008.csv",
    "auto_50":  "/kaggle/input/s05e12-outputs-diabetes-prediction/71_03-feat_importance_lightgbm_gpu_auto_50_vFE_008.csv",
    "raw_eda":  "/kaggle/input/s05e12-outputs-diabetes-prediction/71_03-feat_importance_lightgbm_gpu_raw_eda_vFE_008.csv",
    "recon_fe": "/kaggle/input/s05e12-outputs-diabetes-prediction/71_03-feat_importance_lightgbm_gpu_recon_fe_vFE_008.csv",
}

TOPK_LIST = [5, 10, 15, 20, 25]
WEIGHT_EPS = 1e-3  # to avoid zeroing features during weighting

# Output
import os
OUTPUT_DIR = f"model_outputs_v{VERSION}"
RESULTS_CSV = f"{OUTPUT_DIR}/results_v{VERSION}.csv"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Running version:", VERSION)
print("Models:", MODELS_TO_RUN)
print("Time limit (hours):", TIME_LIMIT_HOURS)

Running version: FE_012
Models: ['hist_gbdt']
Time limit (hours): 11.5


## 02. Imports

In [2]:
import os, time, gc, warnings, json, math
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from IPython.display import display

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import mutual_info_classif

# LightGBM / XGBoost / CatBoost
import lightgbm as lgb
from lightgbm.callback import early_stopping, log_evaluation
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool

# Extra models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier

In [3]:
START_TIME = time.time()
def time_up():
    return (time.time() - START_TIME) >= (TIME_LIMIT_HOURS * 3600)

def seconds_to_str(s):
    m, s = divmod(int(s), 60); h, m = divmod(m, 60)
    return f"{h:02d}:{m:02d}:{s:02d}"

def set_seed(seed=SEED):
    import random
    np.random.seed(seed); random.seed(seed)
set_seed(SEED)

print("Setup complete.")

Setup complete.


## 03. Data Load

In [4]:
train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
sample = pd.read_csv(SAMPLE_SUB_PATH)

feature_cols_base = [c for c in train.columns if c not in [TARGET, ID_COL]]

if USE_PREBUILT_RECON:
    train_recon = pd.read_csv(RECON_TRAIN_PATH)
    test_recon  = pd.read_csv(RECON_TEST_PATH)
    if TARGET in train_recon.columns and TARGET in train.columns:
        try: train_recon[TARGET] = train_recon[TARGET].astype(train[TARGET].dtype)
        except: pass
    for df, ref in [(train_recon, train), (test_recon, test)]:
        if ID_COL in df.columns and ID_COL in ref.columns:
            try: df[ID_COL] = df[ID_COL].astype(ref[ID_COL].dtype)
            except: pass
    print("Loaded reconstructed CSVs:",
          f"train_recon {train_recon.shape} | test_recon {test_recon.shape}")
else:
    train_recon, test_recon = train.copy(), test.copy()

print("Files OK.")
print("Rows: train", len(train), "| test", len(test))
print("Train features:", len(feature_cols_base))

Loaded reconstructed CSVs: train_recon (700000, 32) | test_recon (300000, 31)
Files OK.
Rows: train 700000 | test 300000
Train features: 24


## 04. Helpers

In [5]:
def get_num_cat_cols(df, exclude=None):
    exclude = set(exclude or [])
    num_cols = [c for c in df.columns
                if c not in exclude and pd.api.types.is_numeric_dtype(df[c])]
    cat_cols = [c for c in df.columns
                if c not in exclude and c not in num_cols]
    return num_cols, cat_cols

def safe_div(a, b):
    with np.errstate(divide='ignore', invalid='ignore'):
        out = np.where(b==0, np.nan, a/b)
    return out

def add_cols(df, new_cols: dict):
    for k, v in new_cols.items():
        df[k] = v
    return df

def cap_outliers(s: pd.Series, q_low=0.01, q_high=0.99):
    lo, hi = s.quantile(q_low), s.quantile(q_high)
    return s.clip(lo, hi)

def log1p_if_positive(s: pd.Series):
    if (s.dropna() >= 0).all():
        return np.log1p(s)
    return s

def quantile_bucket(s: pd.Series, q=5):
    try:
        return pd.qcut(s, q, labels=False, duplicates="drop")
    except Exception:
        return pd.Series(np.nan, index=s.index)

def write_results_row(row_dict, results_csv=RESULTS_CSV):
    df_row = pd.DataFrame([row_dict])
    if os.path.exists(results_csv):
        prev = pd.read_csv(results_csv)
        out = pd.concat([prev, df_row], ignore_index=True)
    else:
        out = df_row
    out.to_csv(results_csv, index=False)

def save_oof_and_sub(set_name, model_name, oof, test_pred, ids_train, ids_test):
    oof_path = f"{OUTPUT_DIR}/oof_{model_name}_{set_name}_v{VERSION}.csv"
    sub_path = f"{OUTPUT_DIR}/sub_{model_name}_{set_name}_v{VERSION}.csv"
    pd.DataFrame({ID_COL: ids_train, "oof_pred": oof}).to_csv(oof_path, index=False)
    if test_pred is not None and ids_test is not None:
        pd.DataFrame({ID_COL: ids_test, SUB_TARGET_COL: test_pred}).to_csv(sub_path, index=False)
    else:
        sub_path = None
    return oof_path, sub_path

def save_importance_csv(set_name, model_name, feat_names, fold_importances):
    imp = (pd.DataFrame({"feature": feat_names, "importance": fold_importances})
             .groupby("feature", as_index=False)["importance"].sum()
             .sort_values("importance", ascending=False))
    imp_path = f"{OUTPUT_DIR}/feat_importance_{model_name}_{set_name}_v{VERSION}.csv"
    imp.to_csv(imp_path, index=False)
    return imp_path

In [6]:
def fit_predict_lightgbm(est, Xtr, ytr, Xva, yva, Xte):
    est.set_params(bagging_seed=SEED, feature_fraction_seed=SEED, data_random_seed=SEED, verbosity=-1)
    est.fit(
        Xtr, ytr,
        eval_set=[(Xva, yva)],
        eval_metric="auc",
        callbacks=[early_stopping(200, verbose=False), log_evaluation(0)]
    )
    va = est.predict_proba(Xva)[:, 1]
    te = est.predict_proba(Xte)[:, 1] if Xte is not None else None
    return va, te, getattr(est, "feature_importances_", None)

def fit_predict_xgb(est, Xtr, ytr, Xva, yva, Xte):
    est.fit(Xtr, ytr, eval_set=[(Xva, yva)], verbose=False)
    va = est.predict_proba(Xva)[:, 1]
    te = est.predict_proba(Xte)[:, 1] if Xte is not None else None
    return va, te, getattr(est, "feature_importances_", None)

def fit_predict_cat(est, Xtr, ytr, Xva, yva, Xte):
    trp = Pool(Xtr, ytr); vap = Pool(Xva, yva)
    est.fit(trp, eval_set=vap, verbose=False, use_best_model=True)
    va = est.predict_proba(Xva)[:, 1]
    te = est.predict_proba(Xte)[:, 1] if Xte is not None else None
    return va, te, getattr(est, "feature_importances_", None)

def fit_predict_sklearn(est, Xtr, ytr, Xva, yva, Xte):
    est_use = est
    if not hasattr(est_use, "predict_proba"):
        est_use = CalibratedClassifierCV(est_use, method="isotonic", cv=3)
    est_use.fit(Xtr, ytr)
    va = est_use.predict_proba(Xva)[:, 1]
    te = est_use.predict_proba(Xte)[:, 1] if Xte is not None else None
    imp = getattr(est, "feature_importances_", None)
    return va, te, imp

## 05. Preprocessing

In [7]:
def make_preprocessors(train_df, target_col=TARGET, id_col=ID_COL):
    feature_cols = [c for c in train_df.columns if c not in [target_col, id_col]]
    num_cols = [c for c in feature_cols if pd.api.types.is_numeric_dtype(train_df[c])]
    cat_cols = [c for c in feature_cols if c not in num_cols]

    preproc_ohe_sparse = ColumnTransformer(
        transformers=[
            ("num", Pipeline([("imp", SimpleImputer(strategy="median"))]), num_cols),
            ("cat", Pipeline([
                ("imp", SimpleImputer(strategy="most_frequent")),
                ("ohe", OneHotEncoder(handle_unknown="ignore", sparse=True))
            ]), cat_cols),
        ],
        remainder="drop"
    )

    preproc_ordscale = ColumnTransformer(
        transformers=[
            ("num", Pipeline([
                ("imp", SimpleImputer(strategy="median")),
                ("sc", StandardScaler(with_mean=True))
            ]), num_cols),
            ("cat", Pipeline([
                ("imp", SimpleImputer(strategy="most_frequent")),
                ("ord", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
            ]), cat_cols),
        ],
        remainder="drop"
    )

    return feature_cols, num_cols, cat_cols, preproc_ohe_sparse, preproc_ordscale

def preprocessor_for_model(model_name, train_df):
    feature_cols, num_cols, cat_cols, preproc_ohe_sparse, preproc_ordscale = make_preprocessors(train_df)
    if model_name in ["xgboost_gpu", "lightgbm_gpu", "catboost_gpu", "hist_gbdt"]:
        return feature_cols, preproc_ohe_sparse
    elif model_name in ["logreg"]:
        return feature_cols, preproc_ordscale
    else:
        return feature_cols, preproc_ohe_sparse

## 06. Model Zoo

In [8]:
def make_xgboost_gpu():
    return XGBClassifier(
        n_estimators=2000,
        learning_rate=0.03,
        max_depth=5,
        subsample=0.85,
        colsample_bytree=0.80,
        reg_alpha=0.0,
        reg_lambda=0.0,
        tree_method="gpu_hist",
        predictor="gpu_predictor",
        objective="binary:logistic",
        eval_metric="auc",
        random_state=SEED
    )

def make_lightgbm_gpu():
    return lgb.LGBMClassifier(
        objective="binary",
        n_estimators=3500,
        learning_rate=0.025,
        num_leaves=63,
        subsample=0.90,
        colsample_bytree=0.80,
        min_data_in_leaf=25,
        reg_alpha=0.0,
        reg_lambda=0.0,
        device="gpu",
        random_state=SEED,
        verbosity=-1
    )

def make_catboost_gpu():
    return CatBoostClassifier(
        iterations=3500,
        learning_rate=0.025,
        depth=6,
        l2_leaf_reg=3.0,
        loss_function="Logloss",
        eval_metric="AUC",
        task_type="GPU",
        random_seed=SEED,
        verbose=False
    )

def make_logreg():
    return LogisticRegression(
        max_iter=4000,
        solver="lbfgs",
        n_jobs=-1
    )

def make_hist_gbdt():
    return HistGradientBoostingClassifier(
        learning_rate=0.05,
        max_depth=None,
        max_bins=255,
        early_stopping=True,
        l2_regularization=0.0,
        random_state=SEED
    )

FACTORIES = {
    "xgboost_gpu": make_xgboost_gpu,
    "lightgbm_gpu": make_lightgbm_gpu,
    "catboost_gpu": make_catboost_gpu,
    "logreg": make_logreg,
    "hist_gbdt": make_hist_gbdt,
}

## 07. EDA Features

In [9]:
def fe_raw_basic(train_df: pd.DataFrame, test_df: pd.DataFrame):
    tr = train_df.copy(); te = test_df.copy()
    exclude = [TARGET, ID_COL]
    num_cols = [c for c in tr.columns if c not in exclude and pd.api.types.is_numeric_dtype(tr[c])]
    cat_cols = [c for c in tr.columns if c not in exclude and c not in num_cols]

    # Numeric caps + log1p
    for c in num_cols:
        lo, hi = tr[c].quantile(0.01), tr[c].quantile(0.99)
        tr[f"{c}_cap"] = tr[c].clip(lo, hi)
        te[f"{c}_cap"] = te[c].clip(lo, hi)
        if (tr[c].dropna() >= 0).all():
            tr[f"{c}_log1p"] = np.log1p(tr[c])
            te[f"{c}_log1p"] = np.log1p(te[c])

    # Missingness ratios
    tr["num_nan_ratio"] = tr[num_cols].isna().mean(axis=1) if num_cols else 0.0
    te["num_nan_ratio"] = te[num_cols].isna().mean(axis=1) if num_cols else 0.0

    # Simple interactions among top-variance numeric columns
    if len(num_cols) >= 4:
        var_rank = tr[num_cols].var().sort_values(ascending=False)
        topk = list(var_rank.index[:6])
        for i in range(len(topk)):
            for j in range(i+1, len(topk)):
                a, b = topk[i], topk[j]
                tr[f"{a}_plus_{b}"]  = tr[a] + tr[b]
                te[f"{a}_plus_{b}"]  = te[a] + te[b]
                tr[f"{a}_minus_{b}"] = tr[a] - tr[b]
                te[f"{a}_minus_{b}"] = te[a] - te[b]
                tr[f"{a}_ratio_{b}"] = np.where(tr[b]==0, np.nan, tr[a]/tr[b])
                te[f"{a}_ratio_{b}"] = np.where(te[b]==0, np.nan, te[a]/te[b])

    # Quantile buckets for skewed numerics
    for c in num_cols:
        if tr[c].dropna().skew() > 1.0:
            try:
                tr[f"{c}_q5"] = pd.qcut(tr[c], 5, labels=False, duplicates="drop")
                te[f"{c}_q5"] = pd.qcut(te[c], 5, labels=False, duplicates="drop")
            except Exception:
                pass

    return tr, te


def _consistent_stage_codes(tr_stage: pd.Series, te_stage: pd.Series):
    both = pd.concat([tr_stage, te_stage], axis=0)
    codes = pd.Categorical(both).codes
    tr_codes = pd.Series(codes[:len(tr_stage)], index=tr_stage.index).astype(float)
    te_codes = pd.Series(codes[len(tr_stage):], index=te_stage.index).astype(float)
    return tr_codes, te_codes


def fe_recon_basic(train_df: pd.DataFrame, test_df: pd.DataFrame):
    tr = train_df.copy(); te = test_df.copy()
    cols = set(train_df.columns)

    gf  = "glucose_fasting"
    gpp = "glucose_postprandial"
    ins = "insulin_level"
    a1c = "hba1c"
    rs  = "diabetes_risk_score"
    stg = "diabetes_stage"

    # Glucose deltas & ratios
    if gf in cols and gpp in cols:
        tr["glucose_delta"] = tr[gpp] - tr[gf]
        te["glucose_delta"] = te[gpp] - te[gf]
        tr["glucose_ratio"] = safe_div(tr[gpp], tr[gf])
        te["glucose_ratio"] = safe_div(te[gpp], te[gf])

    # Insulin resistance proxy
    if gf in cols and ins in cols:
        tr["insulin_resistance_proxy"] = safe_div(tr[gf], (tr[ins] + 1e-3))
        te["insulin_resistance_proxy"] = safe_div(te[gf], (te[ins] + 1e-3))

    # eAG from HbA1c (mg/dL)
    if a1c in cols:
        tr["a1c_eag"] = 28.7 * tr[a1c] - 46.7
        te["a1c_eag"] = 28.7 * te[a1c] - 46.7

    # Risk buckets & stage numeric encodings
    if rs in cols:
        tr["risk_bucket_q5"] = quantile_bucket(tr[rs], q=5)
        te["risk_bucket_q5"] = quantile_bucket(te[rs], q=5)

    if stg in cols:
        if pd.api.types.is_numeric_dtype(tr[stg]):
            tr["stage_code"] = tr[stg].astype(float)
            te["stage_code"] = te[stg].astype(float)
        else:
            tr_codes, te_codes = _consistent_stage_codes(tr[stg], te[stg])
            tr["stage_code"] = tr_codes
            te["stage_code"] = te_codes

    # Safety caps and logs
    for c in [x for x in [gf, gpp, ins, a1c, rs] if x in cols]:
        tr[f"{c}_cap"] = cap_outliers(tr[c])
        te[f"{c}_cap"] = cap_outliers(te[c])
        tr[f"{c}_log1p"] = log1p_if_positive(tr[c])
        te[f"{c}_log1p"] = log1p_if_positive(te[c])

    return tr, te

## 08. Auto Generated


In [10]:
def auto_generate_features(train_df: pd.DataFrame, test_df: pd.DataFrame, budget=20):
    tr = train_df.copy(); te = test_df.copy()
    exclude = [TARGET, ID_COL]
    num_cols = [c for c in tr.columns if c not in exclude and pd.api.types.is_numeric_dtype(tr[c])]
    cat_cols = [c for c in tr.columns if c not in exclude and c not in num_cols]

    cand = {}

    # A) z-scores and squares for top-variance numerics
    var_rank = tr[num_cols].var().sort_values(ascending=False) if len(num_cols) else pd.Series(dtype=float)
    topn = list(var_rank.index[:min(12, len(var_rank))])
    for c in topn:
        mu, sd = tr[c].mean(), tr[c].std(ddof=0) or 1.0
        cand[f"{c}_z"] = (tr[c] - mu) / sd
        cand[f"{c}_2"] = tr[c] * tr[c]

    # B) pairwise interactions
    for i in range(len(topn)):
        for j in range(i+1, len(topn)):
            a, b = topn[i], topn[j]
            cand[f"{a}_x_{b}"] = tr[a] * tr[b]
            cand[f"{a}_r_{b}"] = np.where(tr[b]==0, np.nan, tr[a]/tr[b])

    # C) frequency encodings for low-card cats
    for c in cat_cols:
        if tr[c].nunique(dropna=True) <= 20:
            freq = tr[c].value_counts(dropna=False) / len(tr)
            cand[f"{c}_freq"] = tr[c].map(freq)

    # D) quantile buckets on skewed numerics
    skewed = [c for c in num_cols if tr[c].dropna().skew() > 1.0]
    for c in skewed[:10]:
        try:
            cand[f"{c}_q5auto"] = pd.qcut(tr[c], 5, labels=False, duplicates="drop")
        except Exception:
            pass

    C = pd.DataFrame(index=tr.index, data=cand)

    # MI selection
    MI_X = C.copy()
    for col in MI_X.columns:
        if not pd.api.types.is_numeric_dtype(MI_X[col]):
            MI_X[col] = pd.Categorical(MI_X[col]).codes
    mi = mutual_info_classif(MI_X.fillna(-999), tr[TARGET].astype(int), random_state=SEED)
    mi_series = pd.Series(mi, index=MI_X.columns).sort_values(ascending=False)
    keep = list(mi_series.head(min(budget, len(mi_series))).index)

    # Re-create kept features on test
    for k in keep:
        if k.endswith("_z"):
            base = k[:-2]; mu, sd = tr[base].mean(), tr[base].std(ddof=0) or 1.0
            tr[k] = (tr[base] - mu) / sd
            te[k] = (te[base] - mu) / sd
        elif k.endswith("_2"):
            base = k[:-2]
            tr[k] = tr[base] * tr[base]; te[k] = te[base] * te[base]
        elif "_x_" in k:
            a, b = k.split("_x_")
            tr[k] = tr[a] * tr[b]; te[k] = te[a] * te[b]
        elif "_r_" in k:
            a, b = k.split("_r_")
            tr[k] = np.where(tr[b]==0, np.nan, tr[a]/tr[b])
            te[k] = np.where(te[b]==0, np.nan, te[a]/te[b])
        elif k.endswith("_freq"):
            base = k[:-5]; freq = tr[base].value_counts(dropna=False) / len(tr)
            tr[k] = tr[base].map(freq); te[k] = te[base].map(freq)
        elif k.endswith("_q5auto"):
            base = k.replace("_q5auto", "")
            try:
                tr[k] = pd.qcut(tr[base], 5, labels=False, duplicates="drop")
                te[k] = pd.qcut(te[base], 5, labels=False, duplicates="drop")
            except Exception:
                tr[k] = np.nan; te[k] = np.nan
        else:
            tr[k] = C[k]; te[k] = np.nan

    return tr, te, keep, mi_series

## 09. Train

In [11]:
def evaluate_feature_set_for_model(model_name: str, set_name: str,
                                   train_df: pd.DataFrame, test_df: pd.DataFrame):
    feature_cols, preproc = preprocessor_for_model(model_name, train_df)

    X = train_df[feature_cols].copy()
    y = train_df[TARGET].astype(int)
    X_te = test_df[feature_cols].copy() if test_df is not None else None

    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
    oof = np.zeros(len(train_df), dtype=np.float32)
    test_preds, fold_aucs = [], []
    fold_importance_sums = None

    t0 = time.time()
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
        if time_up():
            print(f"[{set_name} | {model_name}] Time limit hit after fold {fold-1}.")
            break

        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

        fitted_pre = preproc.fit(X_tr)
        Xtr_t = fitted_pre.transform(X_tr)
        Xva_t = fitted_pre.transform(X_va)
        Xte_t = fitted_pre.transform(X_te) if X_te is not None else None

        est = FACTORIES[model_name]()
        if model_name == "lightgbm_gpu":
            va_pred, te_pred, imp = fit_predict_lightgbm(est, Xtr_t, y_tr, Xva_t, y_va, Xte_t)
        elif model_name == "xgboost_gpu":
            va_pred, te_pred, imp = fit_predict_xgb(est, Xtr_t, y_tr, Xva_t, y_va, Xte_t)
        elif model_name == "catboost_gpu":
            va_pred, te_pred, imp = fit_predict_cat(est, Xtr_t, y_tr, Xva_t, y_va, Xte_t)
        else:
            va_pred, te_pred, imp = fit_predict_sklearn(est, Xtr_t, y_tr, Xva_t, y_va, Xte_t)

        oof[va_idx] = va_pred
        if te_pred is not None:
            test_preds.append(te_pred)

        fold_aucs.append(roc_auc_score(y_va, va_pred))

        try:
            names = fitted_pre.get_feature_names_out()
        except Exception:
            names = np.array([f"f_{i}" for i in range(Xtr_t.shape[1])])
        if imp is not None:
            imp = np.asarray(imp, dtype=float)
            if fold_importance_sums is None:
                fold_importance_sums = pd.Series(0.0, index=names)
            k = min(len(fold_importance_sums), len(imp))
            fold_importance_sums.iloc[:k] += imp[:k]

        del Xtr_t, Xva_t, Xte_t
        gc.collect()

    elapsed = time.time() - t0
    if not fold_aucs:
        print(f"[{set_name} | {model_name}] No folds completed.")
        return None

    cv_mean, cv_std = float(np.mean(fold_aucs)), float(np.std(fold_aucs))
    test_pred = np.mean(test_preds, axis=0) if test_preds else None

    oof_path, sub_path = save_oof_and_sub(set_name, model_name, oof, test_pred,
                                          train_df[ID_COL].values,
                                          (test_df[ID_COL].values if (test_df is not None and test_pred is not None) else None))
    imp_path = None
    if fold_importance_sums is not None:
        imp_path = save_importance_csv(set_name, model_name, fold_importance_sums.index, fold_importance_sums.values)

    row = {
        "version": VERSION,
        "feature_set": set_name,
        "model": model_name,
        "cv_auc_mean": cv_mean,
        "cv_auc_std": cv_std,
        "folds_completed": len(fold_aucs),
        "train_time_sec": round(elapsed, 2),
        "train_time_hms": seconds_to_str(elapsed),
        "timestamp": pd.Timestamp.utcnow().isoformat(),
        "oof_path": oof_path,
        "sub_path": sub_path,
        "importance_path": imp_path
    }
    write_results_row(row)
    print(f"[{set_name} | {model_name}] CV AUC: {cv_mean:.6f} ± {cv_std:.6f} | time {seconds_to_str(elapsed)}")
    print(f"[{set_name} | {model_name}] OOF -> {oof_path}")
    if sub_path: print(f"[{set_name} | {model_name}] SUB -> {sub_path}")
    if imp_path: print(f"[{set_name} | {model_name}] IMP -> {imp_path}")
    return row

## 10. Building Features

In [12]:
# # Ensure reconstructed frames exist (reuse prebuilt CSVs if configured earlier)
# if 'train_recon' not in globals() or 'test_recon' not in globals():
#     if USE_PREBUILT_RECON:
#         train_recon = pd.read_csv(RECON_TRAIN_PATH)
#         test_recon  = pd.read_csv(RECON_TEST_PATH)
#         if TARGET in train_recon.columns and TARGET in train.columns:
#             try: train_recon[TARGET] = train_recon[TARGET].astype(train[TARGET].dtype)
#             except: pass
#         for df, ref in [(train_recon, train), (test_recon, test)]:
#             if ID_COL in df.columns and ID_COL in ref.columns:
#                 try: df[ID_COL] = df[ID_COL].astype(ref[ID_COL].dtype)
#                 except: pass
#         print("Loaded reconstructed CSVs:",
#               f"train_recon {train_recon.shape} | test_recon {test_recon.shape}")
#     else:
#         train_recon, test_recon = train.copy(), test.copy()
#         print("Using original train/test as fallback for reconstruction-dependent FE.")

# # B. Raw-EDA features on competition columns (ENGINEERED)
# train_raw, test_raw = fe_raw_basic(train, test)

# # C. Recon-aware features (ENGINEERED on reconstructed data)
# train_recon_fe, test_recon_fe = fe_recon_basic(train_recon, test_recon)

# # D. Auto-generated sets on reconstructed space (ENGINEERED; MI-selected)
# train_auto20,  test_auto20,  keep20,  mi20  = auto_generate_features(train_recon_fe, test_recon_fe, budget=20)
# train_auto50,  test_auto50,  keep50,  mi50  = auto_generate_features(train_recon_fe, test_recon_fe, budget=50)
# train_auto100, test_auto100, keep100, mi100 = auto_generate_features(train_recon_fe, test_recon_fe, budget=100)
# train_auto200, test_auto200, keep200, mi200 = auto_generate_features(train_recon_fe, test_recon_fe, budget=200)

# print("Engineered feature sets ready:")
# for name, df in [
#     ("raw_eda",   train_raw),
#     ("recon_fe",  train_recon_fe),
#     ("auto_20",   train_auto20),
#     ("auto_50",   train_auto50),
#     ("auto_100",  train_auto100),
#     ("auto_200",  train_auto200),
# ]:
#     print(f" - {name:10s} -> shape {df.shape}")

In [13]:
# to_save = {
#     f"{OUTPUT_DIR}/train_raw_eda_v{VERSION}.csv":  train_raw,
#     f"{OUTPUT_DIR}/test_raw_eda_v{VERSION}.csv":   test_raw,
#     f"{OUTPUT_DIR}/train_recon_fe_v{VERSION}.csv": train_recon_fe,
#     f"{OUTPUT_DIR}/test_recon_fe_v{VERSION}.csv":  test_recon_fe,
#     f"{OUTPUT_DIR}/train_auto20_v{VERSION}.csv":   train_auto20,
#     f"{OUTPUT_DIR}/test_auto20_v{VERSION}.csv":    test_auto20,
#     f"{OUTPUT_DIR}/train_auto50_v{VERSION}.csv":   train_auto50,
#     f"{OUTPUT_DIR}/test_auto50_v{VERSION}.csv":    test_auto50,
#     f"{OUTPUT_DIR}/train_auto100_v{VERSION}.csv":  train_auto100,
#     f"{OUTPUT_DIR}/test_auto100_v{VERSION}.csv":   test_auto100,
#     f"{OUTPUT_DIR}/train_auto200_v{VERSION}.csv":  train_auto200,
#     f"{OUTPUT_DIR}/test_auto200_v{VERSION}.csv":   test_auto200,
# }
# for path, df in to_save.items():
#     df.to_csv(path, index=False)
# print("Saved engineered datasets to:", OUTPUT_DIR)

In [14]:
def _load_engineered_pair(name: str, must_have_target: bool = True):
    """
    Loads train_<name>_v{VERSION}.csv and test_<name>_v{VERSION}.csv
    from ENGINEERED_DIR, aligns dtypes for ID and TARGET, and returns (train_df, test_df).
    """
    tr_path = f"{ENGINEERED_DIR}/train_{name}_v{ENGINEERED_VERSION}.csv"
    te_path = f"{ENGINEERED_DIR}/test_{name}_v{ENGINEERED_VERSION}.csv"

    if not os.path.exists(tr_path):
        raise FileNotFoundError(f"Missing engineered TRAIN file: {tr_path}")
    if not os.path.exists(te_path):
        raise FileNotFoundError(f"Missing engineered TEST file: {te_path}")

    tr_df = pd.read_csv(tr_path)
    te_df = pd.read_csv(te_path)

    # Align ID dtype
    if ID_COL in tr_df.columns:
        try: tr_df[ID_COL] = tr_df[ID_COL].astype(train[ID_COL].dtype)
        except: pass
    if ID_COL in te_df.columns:
        try: te_df[ID_COL] = te_df[ID_COL].astype(test[ID_COL].dtype)
        except: pass

    # Ensure target presence for train if required
    if must_have_target and TARGET not in tr_df.columns:
        # If the saved engineered file didn't include TARGET, merge it from original train
        tr_df = tr_df.merge(train[[ID_COL, TARGET]], on=ID_COL, how="left")

    # Safety: ensure no duplicate columns sneaked in
    tr_df = tr_df.loc[:, ~tr_df.columns.duplicated()]
    te_df = te_df.loc[:, ~te_df.columns.duplicated()]

    return tr_df, te_df


# Only read engineered sets we need to evaluate now
if USE_PREBUILT_ENGINEERED:
    print("Reading prebuilt engineered feature sets from:", ENGINEERED_DIR)

    # B. Raw-EDA features on competition columns (ENGINEERED)
    train_raw,    test_raw    = _load_engineered_pair("raw_eda",    must_have_target=True)

    # C. Recon-aware features (ENGINEERED)
    train_recon_fe, test_recon_fe = _load_engineered_pair("recon_fe", must_have_target=True)

    # D. Auto-generated sets (ENGINEERED; MI-selected)
    train_auto20,  test_auto20  = _load_engineered_pair("auto20",   must_have_target=True)
    train_auto50,  test_auto50  = _load_engineered_pair("auto50",   must_have_target=True)
    train_auto100, test_auto100 = _load_engineered_pair("auto100",  must_have_target=True)
    train_auto200, test_auto200 = _load_engineered_pair("auto200",  must_have_target=True)

    print("Engineered feature sets loaded:")
    for name, df in [
        ("raw_eda",   train_raw),
        ("recon_fe",  train_recon_fe),
        ("auto20",    train_auto20),
        ("auto50",    train_auto50),
        ("auto100",   train_auto100),
        ("auto200",   train_auto200),
    ]:
        print(f" - {name:10s} -> shape {df.shape}")
else:
    raise RuntimeError("USE_PREBUILT_ENGINEERED is False. Set it True to read prebuilt datasets.")

Reading prebuilt engineered feature sets from: /kaggle/input/s05e12-outputs-diabetes-prediction
Engineered feature sets loaded:
 - raw_eda    -> shape (700000, 112)
 - recon_fe   -> shape (700000, 48)
 - auto20     -> shape (700000, 68)
 - auto50     -> shape (700000, 98)
 - auto100    -> shape (700000, 148)
 - auto200    -> shape (700000, 215)


## 11. Feature Importance

In [15]:
def _read_fi_csv(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    
    # Normalize importance >= 0, handle column names
    if "feature" not in df.columns:
        # try to infer
        maybe = [c for c in df.columns if c.lower().startswith("feat")]
        if maybe:
            df = df.rename(columns={maybe[0]: "feature"})
    if "importance" not in df.columns:
        maybe = [c for c in df.columns if c.lower().startswith("import")]
        if maybe:
            df = df.rename(columns={maybe[0]: "importance"})
    df = df[["feature","importance"]].copy()
    df["importance"] = df["importance"].fillna(0).astype(float).clip(lower=0)
    # Combine duplicates by sum (just in case)
    df = df.groupby("feature", as_index=False)["importance"].sum().sort_values("importance", ascending=False)
    return df

fi_tables = {k: _read_fi_csv(v) for k, v in FI_PATHS.items()}

print("Loaded feature-importance tables:")
for k, v in fi_tables.items():
    print(k, "->", v.shape, "| top3:", list(v.head(3)["feature"]))

Loaded feature-importance tables:
auto_100 -> (166, 2) | top3: ['num__physical_activity_minutes_per_week_z', 'num__triglycerides', 'num__physical_activity_minutes_per_week_2']
auto_200 -> (233, 2) | top3: ['num__physical_activity_minutes_per_week_z', 'num__physical_activity_minutes_per_week', 'num__physical_activity_minutes_per_week_2']
auto_20 -> (86, 2) | top3: ['num__physical_activity_minutes_per_week', 'num__triglycerides', 'num__cholesterol_total']
auto_50 -> (116, 2) | top3: ['num__physical_activity_minutes_per_week', 'num__triglycerides', 'num__screen_time_hours_per_day']
raw_eda -> (128, 2) | top3: ['num__physical_activity_minutes_per_week', 'num__physical_activity_minutes_per_week_cap', 'num__physical_activity_minutes_per_week_log1p']
recon_fe -> (66, 2) | top3: ['num__physical_activity_minutes_per_week', 'num__triglycerides', 'num__cholesterol_total']


In [16]:
def _weights_from_fi(fi_df: pd.DataFrame) -> dict:
    """Return dict feature -> normalized weight in [0,1], with epsilon floor."""
    if fi_df.empty:
        return {}
    imp = fi_df["importance"].astype(float).values
    m = imp.max() if imp.size else 0.0
    if m <= 0:
        w = np.zeros_like(imp)
    else:
        w = imp / m
    w = np.maximum(w, 0.0)
    weights = dict(zip(fi_df["feature"], w))
    return weights

def _apply_feature_weights(df: pd.DataFrame, weights: dict) -> pd.DataFrame:
    """Multiply NUMERIC columns by (weight + WEIGHT_EPS) if column in weights."""
    if not weights:
        return df
    out = df.copy()
    for col, w in weights.items():
        if col in out.columns and pd.api.types.is_numeric_dtype(out[col]):
            out[col] = out[col] * (float(w) + WEIGHT_EPS)
    return out

def _subset_by_topk(train_df: pd.DataFrame, test_df: pd.DataFrame, fi_df: pd.DataFrame, k: int):
    """Keep only ID, TARGET and Top-K features that exist in df."""
    top_feats = list(fi_df.head(k)["feature"])
    keep_feats = [c for c in top_feats if c in train_df.columns]
    cols = [ID_COL] + ([TARGET] if TARGET in train_df.columns else []) + keep_feats
    tr_k = train_df[cols].copy()
    te_cols = [c for c in cols if c != TARGET]
    te_k = test_df[te_cols].copy()
    return tr_k, te_k, keep_feats

def _signature_from_cols(cols: list) -> str:
    """Create a stable signature (for dedup) from a list of feature columns."""
    return "|".join(sorted(cols))

## 12. Run List

In [17]:
# Map feature set names to loaded dataframes
feature_sets = {
    "raw_eda":   (train_raw,     test_raw),
    "recon_fe":  (train_recon_fe, test_recon_fe),
    "auto_20":   (train_auto20,  test_auto20),
    "auto_50":   (train_auto50,  test_auto50),
    "auto_100":  (train_auto100, test_auto100),
    "auto_200":  (train_auto200, test_auto200),
}

# Safety: align TARGET presence in train and ID presence in test
for name, (tr_df, te_df) in feature_sets.items():
    if TARGET not in tr_df.columns:
        tr_df = tr_df.merge(train[[ID_COL, TARGET]], on=ID_COL, how="left")
        feature_sets[name] = (tr_df, te_df)

# 12A. Build full-set (unweighted) and full-set (weighted) tasks
run_bundles = []  # list of tuples: (set_name, train_df, test_df)

# Full sets
for set_name, (tr_df, te_df) in feature_sets.items():
    # Unweighted
    run_bundles.append((f"{set_name}_full_unweighted", tr_df, te_df))
    # Weighted
    fi_df = fi_tables.get(set_name.replace("auto_", "auto_"), pd.DataFrame(columns=["feature","importance"]))
    w = _weights_from_fi(fi_df)
    tr_w = _apply_feature_weights(tr_df.drop(columns=[TARGET], errors="ignore"), w)
    if TARGET in tr_df.columns:
        tr_w = tr_w.join(tr_df[[ID_COL, TARGET]].set_index(ID_COL), on=ID_COL)
    te_w = _apply_feature_weights(te_df, w)
    run_bundles.append((f"{set_name}_full_weighted", tr_w, te_w))

# 12B. Build Top-K subsets across ALL sets, deduplicate by feature-signature
seen_sigs = set()
topk_pairs = []  # (set_name, k, weighted_flag, tr_df, te_df)

for set_name, (tr_df, te_df) in feature_sets.items():
    fi_df = fi_tables[set_name]  # guaranteed by mapping above
    for k in TOPK_LIST:
        # Unweighted subset
        tr_k, te_k, keep_feats = _subset_by_topk(tr_df, te_df, fi_df, k)
        sig = _signature_from_cols(keep_feats)
        if sig not in seen_sigs:
            seen_sigs.add(sig)
            topk_pairs.append((f"{set_name}_top{k}_unweighted", tr_k, te_k))
        # Weighted subset (weights from fi of the SAME set)
        w = _weights_from_fi(fi_df)
        tr_kw = _apply_feature_weights(tr_k.drop(columns=[TARGET], errors="ignore"), w)
        if TARGET in tr_k.columns:
            tr_kw = tr_kw.join(tr_k[[ID_COL, TARGET]].set_index(ID_COL), on=ID_COL)
        te_kw = _apply_feature_weights(te_k, w)
        sig_w = _signature_from_cols(keep_feats) + "_w"
        if sig_w not in seen_sigs:
            seen_sigs.add(sig_w)
            topk_pairs.append((f"{set_name}_top{k}_weighted", tr_kw, te_kw))

# Merge topk tasks into run_bundles
run_bundles.extend(topk_pairs)

print(f"Total run bundles prepared: {len(run_bundles)}")
for name, _, _ in run_bundles[:6]:
    print("  ->", name)

Total run bundles prepared: 14
  -> raw_eda_full_unweighted
  -> raw_eda_full_weighted
  -> recon_fe_full_unweighted
  -> recon_fe_full_weighted
  -> auto_20_full_unweighted
  -> auto_20_full_weighted


## 1. The RUN

In [18]:
results = []

for set_name, tr_df, te_df in run_bundles:
    for model_name in MODELS_TO_RUN:
        if time_up():
            print("\n=== Global time limit reached; stopping. ===")
            break
        try:
            row = evaluate_feature_set_for_model(model_name, set_name, tr_df, te_df)
            if row is not None:
                results.append(row)
        except Exception as e:
            print(f"[{set_name} | {model_name}] ERROR:", e)
            continue

# Summary table
if os.path.exists(RESULTS_CSV):
    results_df = pd.read_csv(RESULTS_CSV).sort_values(
        ["cv_auc_mean","feature_set","model"], ascending=[False, True, True]
    )
    display(results_df)
else:
    print("No results CSV produced.")

[raw_eda_full_unweighted | hist_gbdt] CV AUC: 0.714821 ± 0.001524 | time 00:09:37
[raw_eda_full_unweighted | hist_gbdt] OOF -> model_outputs_vFE_012/oof_hist_gbdt_raw_eda_full_unweighted_vFE_012.csv
[raw_eda_full_unweighted | hist_gbdt] SUB -> model_outputs_vFE_012/sub_hist_gbdt_raw_eda_full_unweighted_vFE_012.csv
[raw_eda_full_weighted | hist_gbdt] CV AUC: 0.714821 ± 0.001524 | time 00:09:38
[raw_eda_full_weighted | hist_gbdt] OOF -> model_outputs_vFE_012/oof_hist_gbdt_raw_eda_full_weighted_vFE_012.csv
[raw_eda_full_weighted | hist_gbdt] SUB -> model_outputs_vFE_012/sub_hist_gbdt_raw_eda_full_weighted_vFE_012.csv
[recon_fe_full_unweighted | hist_gbdt] CV AUC: 0.717725 ± 0.001632 | time 00:05:27
[recon_fe_full_unweighted | hist_gbdt] OOF -> model_outputs_vFE_012/oof_hist_gbdt_recon_fe_full_unweighted_vFE_012.csv
[recon_fe_full_unweighted | hist_gbdt] SUB -> model_outputs_vFE_012/sub_hist_gbdt_recon_fe_full_unweighted_vFE_012.csv
[recon_fe_full_weighted | hist_gbdt] CV AUC: 0.717725 ± 0

Unnamed: 0,version,feature_set,model,cv_auc_mean,cv_auc_std,folds_completed,train_time_sec,train_time_hms,timestamp,oof_path,sub_path,importance_path
4,FE_012,auto_20_full_unweighted,hist_gbdt,0.717853,0.001708,10,409.08,00:06:49,2025-12-23T16:02:04.065329+00:00,model_outputs_vFE_012/oof_hist_gbdt_auto_20_fu...,model_outputs_vFE_012/sub_hist_gbdt_auto_20_fu...,
5,FE_012,auto_20_full_weighted,hist_gbdt,0.717853,0.001708,10,404.34,00:06:44,2025-12-23T16:08:50.804460+00:00,model_outputs_vFE_012/oof_hist_gbdt_auto_20_fu...,model_outputs_vFE_012/sub_hist_gbdt_auto_20_fu...,
2,FE_012,recon_fe_full_unweighted,hist_gbdt,0.717725,0.001632,10,327.19,00:05:27,2025-12-23T15:49:35.354690+00:00,model_outputs_vFE_012/oof_hist_gbdt_recon_fe_f...,model_outputs_vFE_012/sub_hist_gbdt_recon_fe_f...,
3,FE_012,recon_fe_full_weighted,hist_gbdt,0.717725,0.001632,10,334.0,00:05:34,2025-12-23T15:55:11.696389+00:00,model_outputs_vFE_012/oof_hist_gbdt_recon_fe_f...,model_outputs_vFE_012/sub_hist_gbdt_recon_fe_f...,
6,FE_012,auto_50_full_unweighted,hist_gbdt,0.717689,0.001643,10,545.23,00:09:05,2025-12-23T16:17:59.668392+00:00,model_outputs_vFE_012/oof_hist_gbdt_auto_50_fu...,model_outputs_vFE_012/sub_hist_gbdt_auto_50_fu...,
7,FE_012,auto_50_full_weighted,hist_gbdt,0.717689,0.001643,10,545.67,00:09:05,2025-12-23T16:27:08.062159+00:00,model_outputs_vFE_012/oof_hist_gbdt_auto_50_fu...,model_outputs_vFE_012/sub_hist_gbdt_auto_50_fu...,
8,FE_012,auto_100_full_unweighted,hist_gbdt,0.717363,0.001619,10,752.18,00:12:32,2025-12-23T16:39:44.195233+00:00,model_outputs_vFE_012/oof_hist_gbdt_auto_100_f...,model_outputs_vFE_012/sub_hist_gbdt_auto_100_f...,
9,FE_012,auto_100_full_weighted,hist_gbdt,0.717363,0.001619,10,754.73,00:12:34,2025-12-23T16:52:23.972384+00:00,model_outputs_vFE_012/oof_hist_gbdt_auto_100_f...,model_outputs_vFE_012/sub_hist_gbdt_auto_100_f...,
10,FE_012,auto_200_full_unweighted,hist_gbdt,0.717318,0.001648,10,1035.03,00:17:15,2025-12-23T17:09:43.969924+00:00,model_outputs_vFE_012/oof_hist_gbdt_auto_200_f...,model_outputs_vFE_012/sub_hist_gbdt_auto_200_f...,
11,FE_012,auto_200_full_weighted,hist_gbdt,0.717318,0.001648,10,1031.33,00:17:11,2025-12-23T17:27:02.637353+00:00,model_outputs_vFE_012/oof_hist_gbdt_auto_200_f...,model_outputs_vFE_012/sub_hist_gbdt_auto_200_f...,
