In [1]:
# ==============================
# 1) Imports & Config
# ==============================
from __future__ import annotations

import os
import sys
import math
import json
import numpy as np
import pandas as pd

# Models
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from catboost import CatBoostClassifier

# Metrics
from sklearn.metrics import log_loss, roc_auc_score, brier_score_loss, f1_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

# Utils
from pprint import pprint

# Repro
SEED = 42
TAU  = 300   # keep at 300 as you requested
np.random.seed(SEED)

pd.set_option("display.max_columns", 200)
print(f"Config set. SEED={SEED}, TAU={TAU}")


Config set. SEED=42, TAU=300


### DATA PREPARATION

In [3]:
from pathlib import Path

DATA_CSV = Path("cleaned/mafia_clean.csv")   # put the CSV next to this notebook or provide an absolute path
OUT_DIR  = Path("cleaned"); OUT_DIR.mkdir(exist_ok=True, parents=True)

df = pd.read_csv(DATA_CSV)
print("Loaded:", df.shape, "columns:", len(df.columns))
assert {'id','game_id','player_id','role','team','game_points','team_win'}.issubset(df.columns), \
    "Missing required columns in the cleaned dataset."

# Basic coercions
df['id'] = pd.to_numeric(df['id'], errors='coerce').astype('int64')
df['game_id'] = pd.to_numeric(df['game_id'], errors='coerce').astype('int64')
df['player_id'] = pd.to_numeric(df['player_id'], errors='coerce').astype('int64')
df['team_win'] = pd.to_numeric(df['team_win'], errors='coerce').astype('int8')
df['team'] = df['team'].astype('category')
df['role'] = df['role'].astype('category')

# Seat/position optional column name normalization (if present)
if 'place' in df.columns:
    df['place'] = pd.to_numeric(df['place'], errors='coerce').fillna(0).astype('int16')

# Meta eras
bins   = [0, 200_000, 400_000, 600_000, 800_000, 1_000_000_000]
labels = [1, 2, 3, 4, 5]
df['meta_period'] = pd.cut(df['id'], bins=bins, labels=labels, include_lowest=True).astype('int8')

# Gap per player (id as time proxy)
df = df.sort_values(['player_id','id']).copy()
df['gap_id'] = df.groupby('player_id')['id'].diff().fillna(0).astype('int64')
df['gap_id_clipped'] = np.clip(df['gap_id'], 0, 5000).astype('int32')
GAP_THRESH = 381  # adjust via quantiles if desired
df['long_break_flag'] = (df['gap_id'] >= GAP_THRESH).astype('int8')

# Restore global order
df = df.sort_values('id').reset_index(drop=True)

def compute_elos(dfin, init=1500, k=24, tau=300.0):
    d = dfin.sort_values('id').copy()
    elo_global, elo_side, elo_role = {}, {}, {}
    last_seen = {}
    outs = []

    for gid, g in d.groupby('game_id', sort=False):
        cur = g.copy()
        cur['pre_elo']      = [elo_global.get(pid, init) for pid in cur['player_id']]
        cur['pre_elo_side'] = [elo_side.get((pid, team), init) for pid, team in zip(cur['player_id'], cur['team'])]
        cur['pre_elo_role'] = [elo_role.get((pid, role), init) for pid, role in zip(cur['player_id'], cur['role'])]

        maf_mask  = cur['team'].eq('mafia')
        mafia_mu  = cur.loc[maf_mask, 'pre_elo'].mean()
        citizen_mu= cur.loc[~maf_mask, 'pre_elo'].mean()
        exp_mafia = 1.0 / (1.0 + 10 ** ((citizen_mu - mafia_mu)/400))
        mafia_res = int(cur.loc[maf_mask, 'team_win'].iloc[0])

        for _, r in cur.iterrows():
            pid, side, role, rid = int(r['player_id']), r['team'], r['role'], int(r['id'])
            gap = rid - last_seen.get(pid, rid)
            decay = float(np.exp(-max(gap,0)/float(tau)))
            exp = exp_mafia if side=='mafia' else (1-exp_mafia)
            act = mafia_res if side=='mafia' else (1-mafia_res)
            delta = k * decay * (act - exp)

            elo_global[pid] = elo_global.get(pid,  init) + delta
            elo_side[(pid, side)] = elo_side.get((pid, side), init) + delta
            elo_role[(pid, role)] = elo_role.get((pid, role), init) + delta
            last_seen[pid] = rid

        outs.append(cur[['game_id','player_id','pre_elo','pre_elo_side','pre_elo_role']])

    elo_df = pd.concat(outs, ignore_index=True)
    return d.merge(elo_df, on=['game_id','player_id'], how='left')

work_players = compute_elos(df, init=1500, k=24, tau=300.0)

def add_rolling_stats_side(df, windows=(5,20)):
    d = df.sort_values(['player_id','id']).copy()
    for side in ['mafia','citizens']:
        mask = d['team'].eq(side)
        d.loc[mask, f'roll5_win_rate_{side}']  = d.loc[mask].groupby('player_id')['team_win'].shift(1).rolling(windows[0], min_periods=1).mean().values
        d.loc[mask, f'roll20_win_rate_{side}'] = d.loc[mask].groupby('player_id')['team_win'].shift(1).rolling(windows[1], min_periods=1).mean().values
        d.loc[~mask, f'roll5_win_rate_{side}']  = 0.0
        d.loc[~mask, f'roll20_win_rate_{side}'] = 0.0
    return d

work_players = add_rolling_stats_side(work_players)

def add_role_history_stats(df, windows=(5,20,50)):
    d = df.sort_values(['player_id','role','id']).copy()
    out = []
    for (pid, role), g in d.groupby(['player_id','role'], sort=False):
        g = g.copy()
        past = g['team_win'].shift(1)
        g['games_in_role'] = np.arange(len(g), dtype=np.int32)
        for w in windows:
            g[f'win_rate_role_{role}_last{w}'] = past.rolling(w, min_periods=1).mean()
        out.append(g)
    return pd.concat(out, ignore_index=True).sort_values('id').reset_index(drop=True)

work_players = add_role_history_stats(work_players, windows=(5,20,50))

from itertools import combinations

def add_synergy_features(df):
    d = df.copy()
    game_order = (d.groupby('game_id')['id'].max().sort_values().index.tolist())
    pair_counts = {}
    out_rows = []

    for gid in game_order:
        g = d[d['game_id'] == gid]
        for team in ['mafia', 'citizens']:
            players = g.loc[g['team']==team, 'player_id'].dropna().astype(int).tolist()
            vals = [pair_counts.get((a,b,team), 0) for a,b in combinations(sorted(players), 2)] if len(players)>=2 else []
            s_mean = float(np.mean(vals)) if vals else 0.0
            s_max  = float(np.max(vals))  if vals else 0.0
            out_rows.append((gid, team, s_mean, s_max))
        # update after
        for team in ['mafia', 'citizens']:
            players = g.loc[g['team']==team, 'player_id'].dropna().astype(int).tolist()
            if len(players)>=2:
                for a,b in combinations(sorted(players), 2):
                    pair_counts[(a,b,team)] = pair_counts.get((a,b,team),0) + 1

    team_synergy = pd.DataFrame(out_rows, columns=['game_id','team','synergy_mean_team','synergy_max_team'])
    return d.merge(team_synergy, on=['game_id','team'], how='left')

work_players = add_synergy_features(work_players)

from itertools import product

def add_enemy_familiarity_features(df):
    d = df.sort_values('id').copy()
    game_order = (d.groupby('game_id')['id'].max().sort_values().index.tolist())
    faced_counts = {}
    out_rows = []

    for gid in game_order:
        g = d[d['game_id'] == gid]
        maf = g[g['team']=='mafia']['player_id'].dropna().astype(int).tolist()
        cit = g[g['team']=='citizens']['player_id'].dropna().astype(int).tolist()

        pairs_maf = [faced_counts.get(tuple(sorted([a,b])), 0) for a,b in product(maf, cit)]
        pairs_cit = [faced_counts.get(tuple(sorted([a,b])), 0) for a,b in product(cit, maf)]

        def stats(vals):
            return (float(np.mean(vals)) if vals else 0.0,
                    float(np.max(vals))  if vals else 0.0)

        maf_mean, maf_max = stats(pairs_maf)
        cit_mean, cit_max = stats(pairs_cit)

        out_rows.append((gid,'mafia',    maf_mean, maf_max))
        out_rows.append((gid,'citizens', cit_mean, cit_max))

        for a,b in product(maf, cit):
            key = tuple(sorted([int(a),int(b)]))
            faced_counts[key] = faced_counts.get(key, 0) + 1

    fam = pd.DataFrame(out_rows, columns=['game_id','team','enemy_fam_mean_team','enemy_fam_max_team'])
    return d.merge(fam, on=['game_id','team'], how='left')

work_players = add_enemy_familiarity_features(work_players)

def add_streak_features(df):
    d = df.sort_values(['player_id','id']).copy()
    win_streaks, loss_streaks = [], []

    for pid, g in d.groupby('player_id', sort=False):
        prev = g['team_win'].shift(1).values
        w_stk = np.zeros(len(g), dtype=np.int16)
        l_stk = np.zeros(len(g), dtype=np.int16)
        cur_w = cur_l = 0
        for i, v in enumerate(prev):
            if np.isnan(v):
                cur_w = cur_l = 0
            else:
                if v == 1:
                    cur_w += 1; cur_l = 0
                else:
                    cur_l += 1; cur_w = 0
            w_stk[i] = cur_w
            l_stk[i] = cur_l
        win_streaks.append(pd.Series(w_stk, index=g.index))
        loss_streaks.append(pd.Series(l_stk, index=g.index))

    d['win_streak']  = pd.concat(win_streaks).sort_index()
    d['loss_streak'] = pd.concat(loss_streaks).sort_index()
    return d.sort_values('id').reset_index(drop=True)

work_players = add_streak_features(work_players)

def add_games_played_feature(df):
    d = df.sort_values(['player_id','id']).copy()
    # number of *prior* appearances (shift to avoid leakage)
    d['games_played'] = d.groupby('player_id').cumcount().astype('int32')
    return d.sort_values('id').reset_index(drop=True)

work_players = add_games_played_feature(work_players)

def build_team_agg(work_players, add_ratios=False, ratio_eps=1e-3):
    agg_funcs = {}

    def add_agg(col, funcs):
        if col in work_players.columns:
            agg_funcs[col] = funcs

    def q25(x): return np.nanpercentile(x, 25)
    def q75(x): return np.nanpercentile(x, 75)

    # Core
    add_agg('pre_elo', ['mean','std','min','max', q25, q75])
    add_agg('pre_elo_side', ['mean'])
    add_agg('pre_elo_role', ['mean'])
    add_agg('gap_id_clipped', ['mean','max'])
    add_agg('long_break_flag', ['sum'])
    add_agg('place', ['mean','std','min','max'])
    add_agg('games_played', ['mean','std','min','max'])  # if present

    # Optional blocks
    add_agg('win_streak', ['mean','max'])
    add_agg('loss_streak', ['mean','max'])
    add_agg('synergy_mean_team', ['mean'])
    add_agg('synergy_max_team',  ['mean'])
    add_agg('enemy_fam_mean_team', ['mean'])
    add_agg('enemy_fam_max_team',  ['mean'])
    add_agg('roll5_win_rate_mafia',  ['mean'])
    add_agg('roll20_win_rate_mafia', ['mean'])
    add_agg('roll5_win_rate_citizens',  ['mean'])
    add_agg('roll20_win_rate_citizens', ['mean'])
    if 'meta_period' in work_players.columns:
        agg_funcs['meta_period'] = ['first']

    base = work_players.groupby(['game_id','team']).agg(agg_funcs)
    base.columns = ['_'.join([str(x) for x in c if x not in (None,)]).replace('<function ','').replace('>','')
                    for c in base.columns]
    base = base.reset_index()

    # --- NEW: meta-period normalization for Elo stats (remove era drift) ---
    if 'meta_period_first' in base.columns:
        elo_cols = [c for c in base.columns if c.startswith('pre_elo_')]
        for col in elo_cols:
            # center within meta-period
            base[f'{col}_norm'] = base[col] - base.groupby('meta_period_first')[col].transform('mean')

    # Role-specific singletons/means
    full_idx = base.set_index(['game_id','team']).index
    # Role-specific singletons/means
    full_idx = base.set_index(['game_id','team']).index

    def single_role_stat(role, value_col, out_name):
        s = (work_players[work_players['role']==role]
             .groupby(['game_id','team'])[value_col].mean()).reindex(full_idx)
        s.name = out_name; return s

    def mean_role_stat(role, value_col, out_name):
        s = (work_players[work_players['role']==role]
             .groupby(['game_id','team'])[value_col].mean()).reindex(full_idx)
        s.name = out_name; return s

    pieces = [
        single_role_stat('don','pre_elo_role','don_pre_elo_role'),
        single_role_stat('sheriff','pre_elo_role','sheriff_pre_elo_role'),
        single_role_stat('don','place','don_place'),
        single_role_stat('sheriff','place','sheriff_place'),
        mean_role_stat('black','pre_elo_role','black_mean_pre_elo_role'),
        mean_role_stat('red','pre_elo_role','red_mean_pre_elo_role'),
        single_role_stat('don','games_in_role','don_games_in_role'),
        single_role_stat('sheriff','games_in_role','sheriff_games_in_role'),
        mean_role_stat('black','games_in_role','black_mean_games_in_role'),
        mean_role_stat('red','games_in_role','red_mean_games_in_role'),
        single_role_stat('don','win_rate_role_don_last20','don_wr20'),
        single_role_stat('sheriff','win_rate_role_sheriff_last20','sheriff_wr20'),
        mean_role_stat('black','win_rate_role_black_last20','black_mean_wr20'),
        mean_role_stat('red','win_rate_role_red_last20','red_mean_wr20'),
    ]
    role_feats = pd.concat(pieces, axis=1).reset_index()
    team_agg = base.merge(role_feats, on=['game_id','team'], how='left')

    # Label & time proxy
    labels  = work_players.groupby(['game_id','team'])['team_win'].max().rename('team_win_team')
    gmaxid  = work_players.groupby('game_id')['id'].max().rename('game_max_id')
    team_agg = team_agg.merge(labels, on=['game_id','team']).merge(gmaxid, on='game_id')

    # Safe deltas / ratios
    wide = team_agg.pivot(index='game_id', columns='team')
    wide.columns = [f"{a}__{b}" for a,b in wide.columns]
    wide = wide.reset_index()

    def side_cols(side): 
        return [c for c in wide.columns if c.endswith(f"__{side}") and c!='game_id']
    maf_cols = side_cols('mafia')

    delta = pd.DataFrame({'game_id': wide['game_id']})
    skip_prefixes = ('team_win_team','meta_period')
    for mcol in maf_cols:
        base_name = mcol[:-len("__mafia")]
        if base_name.startswith(skip_prefixes): 
            continue
        ccol = base_name + "__citizens"
        if ccol in wide.columns:
            delta[base_name + "__delta_maf_minus_cit"] = wide[mcol] - wide[ccol]
            if add_ratios:
                delta[base_name + "__ratio_maf_over_cit"] = (wide[mcol] + ratio_eps) / (wide[ccol] + ratio_eps)

    team_tall = team_agg.merge(delta, on='game_id', how='left')

    # --- NEW: a few safe interactions (helps tree models separate regimes) ---
    def safe_mul(a, b): 
        return (team_tall.get(a) if a in team_tall else 0) * (team_tall.get(b) if b in team_tall else 0)

    def safe_diff(a, b): 
        return (team_tall.get(a) if a in team_tall else 0) - (team_tall.get(b) if b in team_tall else 0)

    # Names used below exist after delta creation; if any is missing in your run, it's treated as 0
    team_tall['elo_synergy_product'] = safe_mul('pre_elo_mean__delta_maf_minus_cit',
                                                'synergy_mean_team_mean__delta_maf_minus_cit')
    team_tall['elo_enemy_gap']       = safe_diff('pre_elo_mean__delta_maf_minus_cit',
                                                'enemy_fam_mean_team_mean__delta_maf_minus_cit')
    team_tall['elo_streak_mix']      = safe_mul('pre_elo_mean__delta_maf_minus_cit',
                                                'win_streak_mean__delta_maf_minus_cit')

    return team_tall

team_tall = build_team_agg(work_players, add_ratios=False)  # ratios often redundant

team_only = [c for c in team_tall.columns if c.startswith((
    'pre_elo_', 'gap_id_clipped_', 'long_break_flag_', 'place_',
    'win_streak_', 'loss_streak_', 'synergy_mean_team_', 'synergy_max_team_',
    'enemy_fam_', 'games_played_', 
    'don_pre_elo_role', 'sheriff_pre_elo_role', 'black_mean_pre_elo_role', 'red_mean_pre_elo_role',
    'don_games_in_role', 'sheriff_games_in_role', 'black_mean_games_in_role', 'red_mean_games_in_role',
    'don_wr20', 'sheriff_wr20', 'black_mean_wr20', 'red_mean_wr20',
    'meta_period_first'
))]
delta_feats = [c for c in team_tall.columns if c.endswith('__delta_maf_minus_cit')]

# NEW: explicitly add our interactions and meta-normalized Elo columns
extra_feats = [c for c in ['elo_synergy_product','elo_enemy_gap','elo_streak_mix']
               if c in team_tall.columns]
meta_norm_feats = [c for c in team_tall.columns if c.endswith('_norm')]

forbidden_tokens = {'team_win','team_win_team'}
USED_FEATS = [c for c in sorted(set(team_only + delta_feats + extra_feats + meta_norm_feats))
              if not any(tok in c for tok in forbidden_tokens)]

X = team_tall[USED_FEATS].fillna(0)
y = team_tall['team_win_team'].astype(int).values
groups = team_tall['game_id'].values
time_key = team_tall['game_max_id'].values

q70, q85 = np.quantile(time_key, [0.70, 0.85])
train_mask = time_key <= q85
cal_mask   = (time_key > q70) & (time_key <= q85)
test_mask  = time_key > q85

print("Shapes | X:", X.shape, "| y:", y.shape)
print("Split sizes | train:", train_mask.sum(), "cal:", cal_mask.sum(), "test:", test_mask.sum())


  df = pd.read_csv(DATA_CSV)


Loaded: (802820, 21) columns: 21


  for (pid, role), g in d.groupby(['player_id','role'], sort=False):


Shapes | X: (160564, 105) | y: (160564,)
Split sizes | train: 136480 cal: 24084 test: 24084


In [4]:
# ==============================
# 3) Canonical splits (train / cal / holdout)
# ==============================
n = len(y)
n_holdout = int(round(n * 0.15))
idx_all = np.arange(n)

# Holdout: last 15%
idx_holdout = idx_all[-n_holdout:]
idx_fitpool = idx_all[:-n_holdout]

# Calibration: 15% of fit pool (from the tail of fit pool)
n_cal = int(round(len(idx_fitpool) * 0.15))
idx_cal = idx_fitpool[-n_cal:]
idx_train = idx_fitpool[:-n_cal]

# Inner split for LGBM early stopping (from train)
n_va = int(round(len(idx_train) * 0.15))
idx_inner_va = idx_train[-n_va:]
idx_inner_tr = idx_train[:-n_va]

sizes = {
    "train": len(idx_train),
    "cal": len(idx_cal),
    "fit": len(idx_fitpool),
    "inner_tr": len(idx_inner_tr),
    "inner_va": len(idx_inner_va),
    "test": len(idx_holdout),
}
print(sizes)

# Slice arrays (keep as DataFrames/Series)
def _slice_xy(X, y, idx):
    Xs = X.iloc[idx] if hasattr(X, "iloc") else pd.DataFrame(X)[idx]
    ys = y.iloc[idx] if hasattr(y, "iloc") else pd.Series(y)[idx]
    return Xs.reset_index(drop=True), ys.reset_index(drop=True)

X_train, y_train   = _slice_xy(X, y, idx_train)
X_cal,   y_cal     = _slice_xy(X, y, idx_cal)
X_inner_tr, y_inner_tr = _slice_xy(X, y, idx_inner_tr)
X_inner_va, y_inner_va = _slice_xy(X, y, idx_inner_va)
X_holdout, y_holdout   = _slice_xy(X, y, idx_holdout)

print("Check shapes:",
      "train", X_train.shape, 
      "cal", X_cal.shape,
      "inner_tr", X_inner_tr.shape,
      "inner_va", X_inner_va.shape,
      "holdout", X_holdout.shape)


{'train': 116007, 'cal': 20472, 'fit': 136479, 'inner_tr': 98606, 'inner_va': 17401, 'test': 24085}
Check shapes: train (116007, 105) cal (20472, 105) inner_tr (98606, 105) inner_va (17401, 105) holdout (24085, 105)


In [5]:
# ==============================
# 4) LightGBM — train + calibrate + evaluate
# ==============================
lgb_params_tuned = {
    "n_estimators": 5000,
    "learning_rate": 0.02,
    "num_leaves": 78,
    "min_data_in_leaf": 134,
    "subsample": 0.8140936140036887,
    "colsample_bytree": 0.7844939514101106,
    "reg_lambda": 0.1904075276204348,
    "reg_alpha": 0.5453556057858624,
    "objective": "binary",
    "boosting_type": "gbdt",
    "n_jobs": -1,
    "random_state": SEED,
}
pprint(lgb_params_tuned)

# Train with early stopping on inner_va
lgb = LGBMClassifier(**lgb_params_tuned)
lgb.fit(
    X_inner_tr, y_inner_tr,
    eval_set=[(X_inner_va, y_inner_va)],
    eval_metric="logloss",
    callbacks=[early_stopping(stopping_rounds=100), log_evaluation(200)]
)
print("LGBM fitted with best_iteration_ =", getattr(lgb, "best_iteration_", None))

# Calibrate on calibration set
lgb_cal = CalibratedClassifierCV(lgb, cv="prefit", method="sigmoid")
lgb_cal.fit(X_cal, y_cal)

# Evaluate on holdout
p_lgb = lgb_cal.predict_proba(X_holdout)[:, 1]
metrics_lgb = {
    "LogLoss": log_loss(y_holdout, p_lgb),
    "AUC": roc_auc_score(y_holdout, p_lgb),
    "Brier": brier_score_loss(y_holdout, p_lgb),
}
print("\nLGBM (calibrated) — Holdout")
for k,v in metrics_lgb.items():
    print(f"{k}: {v:.10f}")


{'boosting_type': 'gbdt',
 'colsample_bytree': 0.7844939514101106,
 'learning_rate': 0.02,
 'min_data_in_leaf': 134,
 'n_estimators': 5000,
 'n_jobs': -1,
 'num_leaves': 78,
 'objective': 'binary',
 'random_state': 42,
 'reg_alpha': 0.5453556057858624,
 'reg_lambda': 0.1904075276204348,
 'subsample': 0.8140936140036887}
[LightGBM] [Info] Number of positive: 49303, number of negative: 49303
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.069993 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15051
[LightGBM] [Info] Number of data points in the train set: 98606, number of used features: 88
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training until validation scores don't improve for 100 rounds
[200]	valid_0's binary_logloss: 0.658859
Early stopping, best iteration is:
[257]	valid_0's binary_logloss: 0.658613
LGBM fitted with best_iteration_ = 257





LGBM (calibrated) — Holdout
LogLoss: 0.6665559744
AUC: 0.6297505701
Brier: 0.2370229242


In [6]:
# ==============================
# 5) CatBoost — baseline (train + calibrate + evaluate)
# ==============================
cat_params = dict(
    loss_function="Logloss",
    depth=8,
    learning_rate=0.03,
    iterations=2000,
    random_seed=SEED,
    eval_metric="Logloss",
    verbose=200,
    od_type="Iter",
    od_wait=100,
    subsample=1.0,
    l2_leaf_reg=7.0,
    bagging_temperature=1.0,
)

cat = CatBoostClassifier(**cat_params)
cat.fit(
    X_train, y_train,
    eval_set=(X_inner_va, y_inner_va),
    use_best_model=True,
    verbose=200
)
print("CatBoost fitted.")

# Calibrate on the same calibration set
cat_cal = CalibratedClassifierCV(cat, cv="prefit", method="sigmoid")
cat_cal.fit(X_cal, y_cal)

# Evaluate on holdout
p_cat = cat_cal.predict_proba(X_holdout)[:, 1]
metrics_cat = {
    "LogLoss": log_loss(y_holdout, p_cat),
    "AUC": roc_auc_score(y_holdout, p_cat),
    "Brier": brier_score_loss(y_holdout, p_cat),
}
print("\nCatBoost (calibrated) — Holdout")
for k,v in metrics_cat.items():
    print(f"{k}: {v:.10f}")


0:	learn: 0.6919197	test: 0.6917088	best: 0.6917088 (0)	total: 327ms	remaining: 10m 53s
200:	learn: 0.6582651	test: 0.6522251	best: 0.6522251 (200)	total: 32.3s	remaining: 4m 49s
400:	learn: 0.6483437	test: 0.6432742	best: 0.6432742 (400)	total: 1m 3s	remaining: 4m 14s
600:	learn: 0.6364242	test: 0.6316679	best: 0.6316679 (600)	total: 1m 34s	remaining: 3m 39s
800:	learn: 0.6256300	test: 0.6219296	best: 0.6219296 (800)	total: 2m 4s	remaining: 3m 5s
1000:	learn: 0.6151933	test: 0.6123993	best: 0.6123993 (1000)	total: 2m 33s	remaining: 2m 33s
1200:	learn: 0.6048901	test: 0.6029702	best: 0.6029702 (1200)	total: 3m 3s	remaining: 2m 2s
1400:	learn: 0.5947692	test: 0.5939215	best: 0.5939215 (1400)	total: 3m 33s	remaining: 1m 31s
1600:	learn: 0.5854162	test: 0.5851037	best: 0.5851037 (1600)	total: 4m 3s	remaining: 1m
1800:	learn: 0.5764603	test: 0.5768919	best: 0.5768919 (1800)	total: 4m 34s	remaining: 30.4s
1999:	learn: 0.5673948	test: 0.5684793	best: 0.5684793 (1999)	total: 5m 5s	remaining: 



In [7]:
# ==============================
# 6) Blend search (LGBM vs Cat baseline)
# ==============================
def eval_metrics(y, p):
    return (
        log_loss(y, p),
        roc_auc_score(y, p),
        brier_score_loss(y, p),
    )

ll_lgb, auc_lgb, br_lgb = eval_metrics(y_holdout, p_lgb)
ll_cat, auc_cat, br_cat = eval_metrics(y_holdout, p_cat)

print(f"\nLGBM  : LogLoss={ll_lgb:.6f}  AUC={auc_lgb:.6f}  Brier={br_lgb:.6f}")
print(f"Cat   : LogLoss={ll_cat:.6f}  AUC={auc_cat:.6f}  Brier={br_cat:.6f}")

ws = np.linspace(0.0, 1.0, 41)  # 0.00..1.00 step 0.025
best = None
for w in ws:
    p_blend = w * p_lgb + (1 - w) * p_cat
    ll, auc, br = eval_metrics(y_holdout, p_blend)
    if best is None or ll < best[0]:
        best = (ll, auc, br, w)

ll_b, auc_b, br_b, w_star = best
print(f"\nBest blend (LGBM vs Cat): w_lgbm={w_star:.2f}, w_cat={1-w_star:.2f}")
print(f"→ LogLoss={ll_b:.6f}  AUC={auc_b:.6f}  Brier={br_b:.6f}")

p_blend = w_star * p_lgb + (1 - w_star) * p_cat



LGBM  : LogLoss=0.666556  AUC=0.629751  Brier=0.237023
Cat   : LogLoss=0.667096  AUC=0.628840  Brier=0.237282

Best blend (LGBM vs Cat): w_lgbm=0.58, w_cat=0.42
→ LogLoss=0.665779  AUC=0.631850  Brier=0.236652


In [8]:
# ==============================
# 7) Optional: CatBoost tuning (GridSearchCV)
# ==============================
RUN_CAT_TUNE = False  # flip to True if you want to tune

if RUN_CAT_TUNE:
    grid = {
        "depth": [6, 8],
        "learning_rate": [0.02, 0.03],
        "iterations": [1500, 2000],
        "subsample": [0.8, 1.0],
        "l2_leaf_reg": [3, 7],
        "bagging_temperature": [0.5, 1.0],
    }
    base = CatBoostClassifier(
        loss_function="Logloss",
        random_seed=SEED,
        eval_metric="Logloss",
        verbose=False
    )
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
    gs = GridSearchCV(
        estimator=base,
        param_grid=grid,
        scoring="neg_log_loss",
        cv=cv,
        n_jobs=-1,
        verbose=1
    )
    print("[Cat tune] Fitting grid...")
    gs.fit(X_train, y_train)
    print("\n[Cat tune] Best params:", gs.best_params_)
    print("[Cat tune] CV LogLoss:", -gs.best_score_)

    # Refit best on full train (X_train,y_train) with eval on inner_va for OD
    best_params = dict(gs.best_params_)
    best_params.update(dict(loss_function="Logloss", random_seed=SEED, eval_metric="Logloss", verbose=200,
                            od_type="Iter", od_wait=100))
    cat_tuned = CatBoostClassifier(**best_params)
    cat_tuned.fit(X_train, y_train, eval_set=(X_inner_va, y_inner_va), use_best_model=True, verbose=200)

    # Calibrate on calibration set
    cat_tuned_cal = CalibratedClassifierCV(cat_tuned, cv="prefit", method="sigmoid")
    cat_tuned_cal.fit(X_cal, y_cal)

    # Predict on holdout (same length as y_holdout)
    p_cat_tuned = cat_tuned_cal.predict_proba(X_holdout)[:, 1]

    ll_t, auc_t, br_t = eval_metrics(y_holdout, p_cat_tuned)
    print(f"\n[Cat tune] Tuned Cat — Holdout  LogLoss={ll_t:.6f}  AUC={auc_t:.6f}  Brier={br_t:.6f}")

    # Blend LGBM with tuned Cat
    best_t = None
    for w in ws:
        p = w * p_lgb + (1 - w) * p_cat_tuned
        ll, auc, br = eval_metrics(y_holdout, p)
        if best_t is None or ll < best_t[0]:
            best_t = (ll, auc, br, w)
    ll_bt, auc_bt, br_bt, w_star_t = best_t
    print(f"\nBest blend (LGBM vs CatTuned): w_lgbm={w_star_t:.2f}, w_catT={1-w_star_t:.2f}")
    print(f"→ LogLoss={ll_bt:.6f}  AUC={auc_bt:.6f}  Brier={br_bt:.6f}")


In [9]:
# ==============================
# 8) F1 threshold search (diagnostic)
# ==============================
# Use the best currently available prediction (blend if it improved LogLoss; else Cat or LGBM)
p_best = p_blend if 'p_blend' in globals() else (p_cat if metrics_cat['LogLoss'] <= metrics_lgb['LogLoss'] else p_lgb)

ths = np.linspace(0.05, 0.95, 19)
best_f1, best_t = -1, None
for t in ths:
    f1 = f1_score(y_holdout, (p_best >= t).astype(int))
    if f1 > best_f1:
        best_f1, best_t = f1, t
print(f"Best F1 threshold (holdout): {best_t:.2f}, F1={best_f1:.4f}")


Best F1 threshold (holdout): 0.35, F1=0.6712


In [10]:
# ==============================
# 9) Feature importances (LightGBM)
# ==============================
if hasattr(lgb, "feature_importances_"):
    fi = pd.Series(lgb.feature_importances_, index=X_inner_tr.columns).sort_values(ascending=False)
    topk = fi.head(20)
    print("Top 20 features (LGBM):")
    print(topk)
else:
    print("LightGBM importances not available.")


Top 20 features (LGBM):
roll20_win_rate_citizens_mean__delta_maf_minus_cit    1752
roll5_win_rate_citizens_mean__delta_maf_minus_cit     1086
gap_id_clipped_mean                                    630
pre_elo_side_mean__delta_maf_minus_cit                 584
gap_id_clipped_max                                     581
roll20_win_rate_mafia_mean__delta_maf_minus_cit        553
gap_id_clipped_mean__delta_maf_minus_cit               464
pre_elo_min__delta_maf_minus_cit                       452
pre_elo_q25__delta_maf_minus_cit                       426
pre_elo_role_mean__delta_maf_minus_cit                 390
elo_synergy_product                                    382
pre_elo_side_mean                                      373
enemy_fam_mean_team_mean                               367
elo_streak_mix                                         348
place_std                                              333
roll5_win_rate_mafia_mean__delta_maf_minus_cit         317
synergy_mean_team_mean__delta_ma

In [11]:
# ==============================
# 10) Final summary table
# ==============================
rows = []

rows.append({
    "Model": "LGBM (calibrated)",
    "LogLoss": metrics_lgb["LogLoss"],
    "AUC":     metrics_lgb["AUC"],
    "Brier":   metrics_lgb["Brier"],
})

rows.append({
    "Model": "CatBoost (calibrated)",
    "LogLoss": metrics_cat["LogLoss"],
    "AUC":     metrics_cat["AUC"],
    "Brier":   metrics_cat["Brier"],
})

if 'p_blend' in globals():
    rows.append({
        "Model": f"Blend* ({w_star:.2f}·LGBM + {(1-w_star):.2f}·Cat)",
        "LogLoss": ll_b,
        "AUC":     auc_b,
        "Brier":   br_b,
    })

if 'RUN_CAT_TUNE' in globals() and RUN_CAT_TUNE and 'p_cat_tuned' in globals():
    # tuned cat
    rows.append({
        "Model": "CatBoost (tuned + cal)",
        "LogLoss": ll_t,
        "AUC":     auc_t,
        "Brier":   br_t,
    })
    # tuned blend
    rows.append({
        "Model": f"Blend** ({w_star_t:.2f}·LGBM + {(1-w_star_t):.2f}·CatT)",
        "LogLoss": ll_bt,
        "AUC":     auc_bt,
        "Brier":   br_bt,
    })

summary = pd.DataFrame(rows).set_index("Model")

# Pretty print without requiring jinja2
def _fmt(x): 
    try: 
        return f"{x:.10f}"
    except: 
        return x

print(summary.to_string(float_format=lambda x: f"{x:.10f}"))


                                   LogLoss          AUC        Brier
Model                                                               
LGBM (calibrated)             0.6665559744 0.6297505701 0.2370229242
CatBoost (calibrated)         0.6670964744 0.6288397346 0.2372824973
Blend* (0.58·LGBM + 0.42·Cat) 0.6657793963 0.6318499647 0.2366521003


In [12]:
# ==============================
# 11) Export predictor function (optional)
# ==============================
def make_predict_proba_fn(model="blend"):
    """
    Returns a function f(X_df) -> proba for chosen model:
    - "lgbm": calibrated LightGBM
    - "cat": calibrated CatBoost
    - "blend": best 2-way blend found on holdout (if available; else falls back to better single model)
    """
    if model == "lgbm":
        def f(X_df):
            return lgb_cal.predict_proba(X_df)[:, 1]
        return f
    elif model == "cat":
        def f(X_df):
            return cat_cal.predict_proba(X_df)[:, 1]
        return f
    elif model == "blend":
        if 'p_blend' in globals():
            w = w_star
            def f(X_df):
                p1 = lgb_cal.predict_proba(X_df)[:, 1]
                p2 = cat_cal.predict_proba(X_df)[:, 1]
                return w * p1 + (1 - w) * p2
            return f
        else:
            # choose better single by LogLoss
            if metrics_cat["LogLoss"] <= metrics_lgb["LogLoss"]:
                return make_predict_proba_fn("cat")
            else:
                return make_predict_proba_fn("lgbm")
    else:
        raise ValueError("model must be one of {'lgbm','cat','blend'}")

predict_proba = make_predict_proba_fn("blend")
print("Predictor ready.")


Predictor ready.
