In [1]:
# =========================================================
# BEST PIPELINE
# + AdaBoost Residual
# + RandomForest Residual
# + Soft Voting
# + Logit Temperature Scaling
# =========================================================

import numpy as np
import pandas as pd
import gc
import warnings
from itertools import combinations

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from scipy.stats import rankdata

warnings.filterwarnings("ignore")

# =====================
# LOAD DATA
# =====================
train = pd.read_csv("/kaggle/input/playground-series-s6e2/train.csv")
test  = pd.read_csv("/kaggle/input/playground-series-s6e2/test.csv")

TARGET = "Heart Disease"
train[TARGET] = train[TARGET].map({"Absence":0, "Presence":1}).astype(np.uint8)

y = train[TARGET].values
test_ids = test["id"]

X_tr_raw = train.drop(columns=[TARGET, "id"])
X_te_raw = test.drop(columns=["id"])

cat_cols = [
    'Sex','Chest pain type','FBS over 120','EKG results',
    'Exercise angina','Slope of ST',
    'Number of vessels fluro','Thallium'
]
num_cols = ['Age','BP','Cholesterol','Max HR','ST depression']

# =====================
# FREQUENCY ENCODING
# =====================
def freq_encode(tr, te, cols):
    tr_out, te_out = pd.DataFrame(index=tr.index), pd.DataFrame(index=te.index)
    for c in cols:
        freq = tr[c].value_counts(normalize=True)
        tr_out[c+"_freq"] = tr[c].map(freq)
        te_out[c+"_freq"] = te[c].map(freq)
    return tr_out.fillna(0), te_out.fillna(0)

tr_freq, te_freq = freq_encode(X_tr_raw, X_te_raw, cat_cols + num_cols)

# =====================
# TARGET ENCODING
# =====================
skf_te = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

tr_te = pd.DataFrame(index=X_tr_raw.index)
te_te = pd.DataFrame(index=X_te_raw.index)

for c in cat_cols + num_cols:
    tr_te[c+"_te"] = 0.0
    for tr_i, val_i in skf_te.split(X_tr_raw, y):
        means = train.iloc[tr_i].groupby(c)[TARGET].mean()
        tr_te.iloc[val_i, tr_te.columns.get_loc(c+"_te")] = (
            X_tr_raw.iloc[val_i][c].map(means)
        )
    te_te[c+"_te"] = X_te_raw[c].map(train.groupby(c)[TARGET].mean())

tr_te = tr_te.fillna(0)
te_te = te_te.fillna(0)

# =====================
# CORRELATION FEATURE GROWTH
# =====================
base = tr_te.copy()
corr_scores = {}

for a, b in combinations(base.columns, 2):
    corr_scores[(a,b)] = abs(np.corrcoef(base[a], base[b])[0,1])

top_pairs = sorted(corr_scores, key=corr_scores.get, reverse=True)[:8]

for a, b in top_pairs:
    tr_te[f"{a}_x_{b}"] = tr_te[a] * tr_te[b]
    te_te[f"{a}_x_{b}"] = te_te[a] * te_te[b]

X_train = pd.concat([tr_freq, tr_te], axis=1).fillna(0)
X_test  = pd.concat([te_freq, te_te], axis=1).fillna(0)

# =====================
# MODEL CONFIGS
# =====================
cat_params = {
    "iterations": 10000,
    "learning_rate": 0.01,
    "depth": 2,
    "loss_function": "Logloss",
    "auto_class_weights": "Balanced",
    "bootstrap_type": "Bernoulli",
    "subsample": 0.9,
    "l2_leaf_reg": 12,
    "random_strength": 1.2,
    "task_type": "GPU",
    "verbose": False
}

xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.01,
    "max_depth": 2,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "n_estimators": 10000,
    "tree_method": "hist",
    "predictor": "gpu_predictor",
    "random_state": 42,
    "n_jobs": -1
}

# =====================
# CV TRAINING
# =====================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof = np.zeros(len(X_train))
test_pred = np.zeros(len(X_test))

for fold, (tr, val) in enumerate(skf.split(X_train, y)):
    print(f"\nFold {fold+1}")

    cb = CatBoostClassifier(**cat_params, random_seed=42+fold)
    cb.fit(X_train.iloc[tr], y[tr])
    val_cb = cb.predict_proba(X_train.iloc[val])[:,1]
    te_cb  = cb.predict_proba(X_test)[:,1]

    xgb = XGBClassifier(**xgb_params)
    xgb.fit(X_train.iloc[tr], y[tr])
    val_xgb = xgb.predict_proba(X_train.iloc[val])[:,1]
    te_xgb  = xgb.predict_proba(X_test)[:,1]

    # Rank stacking
    rank_cb_val = rankdata(val_cb) / len(val_cb)
    rank_xgb_val = rankdata(val_xgb) / len(val_xgb)

    rank_cb_te = rankdata(te_cb) / len(te_cb)
    rank_xgb_te = rankdata(te_xgb) / len(te_xgb)

    val_rank = (rank_cb_val + rank_xgb_val) / 2
    te_rank  = (rank_cb_te + rank_xgb_te) / 2

    # META FEATURES
    val_meta = np.column_stack([
        rank_cb_val,
        rank_xgb_val,
        val_rank,
        np.abs(rank_cb_val - rank_xgb_val)
    ])

    te_meta = np.column_stack([
        rank_cb_te,
        rank_xgb_te,
        te_rank,
        np.abs(rank_cb_te - rank_xgb_te)
    ])

    # AdaBoost Residual
    ada = AdaBoostClassifier(
        estimator=DecisionTreeClassifier(
            max_depth=3,
            min_samples_leaf=40,
            random_state=42
        ),
        n_estimators=600,
        learning_rate=0.03,
        algorithm="SAMME",
        random_state=42
    )
    ada.fit(val_meta, y[val])
    val_ada = ada.predict_proba(val_meta)[:,1]
    te_ada  = ada.predict_proba(te_meta)[:,1]

    # RandomForest Residual
    rf = RandomForestClassifier(
        n_estimators=800,
        max_depth=6,
        min_samples_leaf=50,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1
    )
    rf.fit(val_meta, y[val])
    val_rf = rf.predict_proba(val_meta)[:,1]
    te_rf  = rf.predict_proba(te_meta)[:,1]

    # SOFT VOTING
    val_final = (val_rank + val_ada + val_rf) / 3
    te_final  = (te_rank + te_ada + te_rf) / 3

    oof[val] = val_final
    test_pred += te_final / skf.n_splits

    print(" Fold AUC:", roc_auc_score(y[val], val_final))
    gc.collect()

print("\nFinal CV AUC:", roc_auc_score(y, oof))

# =====================
# TEMPERATURE SCALING
# =====================
def safe_logit(p, eps=1e-6):
    p = np.clip(p, eps, 1-eps)
    return np.log(p/(1-p))

def sigmoid(x):
    return 1/(1+np.exp(-x))

temperature = 1.02
test_pred = sigmoid(safe_logit(test_pred) * temperature)

print("Applied temperature scaling:", temperature)

# =====================
# SAVE SUBMISSION
# =====================
submission = pd.DataFrame({
    "id": test_ids,
    TARGET: test_pred
})

submission.to_csv("submission.csv", index=False)

print("\nðŸš€ FINAL SUBMISSION SAVED")



Fold 1
 Fold AUC: 0.9564900361138138

Fold 2
 Fold AUC: 0.9552335019274563

Fold 3
 Fold AUC: 0.9560306632777964

Fold 4
 Fold AUC: 0.9557610639743124

Fold 5
 Fold AUC: 0.9565558908566627

Final CV AUC: 0.9560145611405322
Applied temperature scaling: 1.02

ðŸš€ FINAL SUBMISSION SAVED
