# Mafia — Per-Game Outcome Modeling (Pre-Game Only)

Goal: predict the **winning side** (mafia vs citizens) from the lineup and past form.
- Label per team row: `team_win_team` (1/0)
- Validation: GroupKFold by `game_id` + chronological holdout by `game_max_id`
- Metrics: LogLoss (primary), ROC-AUC, Brier, per-game accuracy
- Features: per-player Elo & rolling stats → aggregated to team → deltas (mafia - citizens)


In [1]:
# pip install pandas numpy lightgbm scikit-learn pyarrow fastparquet
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import GroupKFold
from sklearn.metrics import log_loss, roc_auc_score, brier_score_loss
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

DATA_PLAYERS = Path("cleaned/mafia_clean.csv")  # cleaned player-level dataset
OUT_DIR = Path("./team_data"); OUT_DIR.mkdir(exist_ok=True, parents=True)
ARTS = Path("./artifacts_game"); ARTS.mkdir(exist_ok=True, parents=True)
SEED = 42
np.random.seed(SEED)

## 1) Load cleaned player-level data
We expect: 10 rows per game, 3 mafia / 7 citizens, winners per game ∈ {3,7}, no duplicates.

In [2]:
df = pd.read_csv(DATA_PLAYERS)
df.shape, df.head(3)

  df = pd.read_csv(DATA_PLAYERS)


((802820, 21),
         id  game_id  place  player_id original_nickname   role  killed_first  \
 0  1000001   100001     10          2             Teddy    red             0   
 1  1000002   100001      7         26          Искатель    red             0   
 2  1000003   100001      9         50             Лоску  black             0   
 
    best_move  game_points  game_autobonus  ...  best_move_bonus  penalty  \
 0          0          0.0             0.0  ...              0.0      0.0   
 1          0          0.0             0.0  ...              0.0      0.0   
 2          0          2.0             0.0  ...              0.0      0.0   
 
    fouls  penalty_disciplinary   Ci  created_at updated_at      team team_win  \
 0    0.0                   0.0  0.0         NaN        NaN  citizens        0   
 1    0.0                   0.0  0.0         NaN        NaN  citizens        0   
 2    0.0                   0.0  0.0         NaN        NaN     mafia        1   
 
    total_points  


## 2) Pre-game player features: Elo + rolling stats (leakage-safe)
- Sort by `id` (time proxy).
- Elo updates only from past results.
- Rolling win rate (5, 20), career win rate, games played.

In [3]:
# Inspect distribution of per-player id gaps
tmp = (df.sort_values(['player_id','id'])
         .groupby('player_id')['id']
         .diff()
         .dropna())
print(tmp.describe(percentiles=[.5,.8,.9,.95,.99]))

# Choose a threshold (start with the 95th percentile as a proxy)
GAP_THRESH = tmp.quantile(0.95)
print("Chosen GAP_THRESH (id units):", GAP_THRESH)

count    732003.000000
mean        135.976756
std        1368.720034
min           1.000000
50%          42.000000
80%         127.000000
90%         228.000000
95%         381.000000
99%        1174.000000
max      245724.000000
Name: id, dtype: float64
Chosen GAP_THRESH (id units): 381.0


In [4]:
def compute_elos(df, init=1500, k=24):
    """
    Pre-game Elo features:
    - pre_elo:   global player skill
    - pre_elo_side: player skill when playing this side ('mafia' or 'citizens')
    - pre_elo_role: player skill when playing this exact role ('don','black','sheriff','red')
    """
    df = df.sort_values('id').copy()
    elo_global = {}
    elo_side   = {}  # key: (player_id, side)
    elo_role   = {}  # key: (player_id, role)

    outs = []

    for gid, g in df.groupby('game_id', sort=False):
        cur = g.copy()

        # pre-game ratings (BEFORE updating with this game's result)
        cur['pre_elo']       = [elo_global.get(pid, init) for pid in cur['player_id']]
        cur['pre_elo_side']  = [elo_side.get((pid, team), init) for pid, team in zip(cur['player_id'], cur['team'])]
        cur['pre_elo_role']  = [elo_role.get((pid, role), init) for pid, role in zip(cur['player_id'], cur['role'])]

        # expected result based on global ELO means (stable & data-rich)
        maf = cur['team'].eq('mafia')
        mafia_mean = cur.loc[maf, 'pre_elo'].mean()
        cit_mean   = cur.loc[~maf, 'pre_elo'].mean()
        exp_mafia  = 1.0 / (1.0 + 10 ** ((cit_mean - mafia_mean)/400))

        mafia_res = int(cur.loc[maf, 'team_win'].iloc[0])
        cit_res   = 1 - mafia_res

        # update ratings AFTER the game
        for _, r in cur.iterrows():
            pid, side, role = r['player_id'], r['team'], r['role']
            exp = exp_mafia if side == 'mafia' else (1 - exp_mafia)
            act = mafia_res   if side == 'mafia' else (1 - mafia_res)
            delta = k * (act - exp)
            elo_global[pid] = elo_global.get(pid, init) + delta
            elo_side[(pid, side)] = elo_side.get((pid, side), init) + delta
            elo_role[(pid, role)] = elo_role.get((pid, role), init) + delta

        outs.append(cur[['game_id','player_id','pre_elo','pre_elo_side','pre_elo_role']])

    elo_df = pd.concat(outs, ignore_index=True)
    return df.merge(elo_df, on=['game_id','player_id'], how='left')

def add_rolling_stats_side(df, windows=(5, 20)):
    """
    Adds BOTH:
    - Generic per-player rolling: roll{w}_win_rate, games_played, career_win_rate
    - Side-specific rolling:      roll{w}_win_rate_mafia, roll{w}_win_rate_citizens
    All computed leakage-safe (uses shift(1)).
    """
    df = df.sort_values(['player_id','id']).copy()
    out = []
    for pid, g in df.groupby('player_id', sort=False):
        g = g.copy()
        g['win_shift'] = g['team_win'].shift(1)

        # Generic rolling win-rates (all games for this player)
        for w in windows:
            g[f'roll{w}_win_rate'] = g['win_shift'].rolling(w, min_periods=1).mean()

        # Side-specific rolling win-rates
        for side in ['mafia','citizens']:
            mask = g['team'].eq(side)
            for w in windows:
                g.loc[mask, f'roll{w}_win_rate_{side}'] = g.loc[mask, 'win_shift'].rolling(w, min_periods=1).mean()

        # Career stats (overall)
        g['games_played']   = np.arange(len(g))               # 0,1,2,... (pre-game count for current row)
        g['career_win_rate'] = g['win_shift'].expanding().mean()

        g.drop(columns=['win_shift'], inplace=True)
        out.append(g)
    return pd.concat(out, ignore_index=True)

def add_break_features(df, gap_thresh):
    """
    Adds:
    - gap_id: id difference since last game for this player (NaN for first game)
    - long_break_flag: 1 if gap_id >= gap_thresh else 0
    - gap_id_clipped: replace NaN with 0, clip huge tails for stability
    """
    d = df.sort_values(['player_id','id']).copy()
    d['gap_id'] = d.groupby('player_id')['id'].diff()
    d['gap_id_clipped'] = d['gap_id'].fillna(0).clip(lower=0)
    d['long_break_flag'] = (d['gap_id_clipped'] >= gap_thresh).astype('int8')
    return d.sort_values('id').reset_index(drop=True)

work_players = compute_elos(df)
work_players = add_rolling_stats_side(work_players)
work_players = add_break_features(work_players, GAP_THRESH)

# This preview now works again:
work_players[['game_id','team','player_id',
              'pre_elo',
              'roll5_win_rate','roll20_win_rate','career_win_rate','games_played']].head()

Unnamed: 0,game_id,team,player_id,pre_elo,roll5_win_rate,roll20_win_rate,career_win_rate,games_played
0,100001,citizens,2,1500.0,,,,0
1,100001,citizens,26,1500.0,,,,0
2,100001,mafia,50,1500.0,,,,0
3,100001,mafia,74,1500.0,,,,0
4,100001,citizens,98,1500.0,,,,0


### Synergy features (co-play history)
For each game and team, compute how many times each pair of teammates had previously played together on the **same side**. Aggregate within the team (mean, max). Uses chronological order by `id`, no leakage.


In [5]:
from itertools import combinations

def add_synergy_features(df):
    """
    For each game_id and team:
      - Compute prior same-team co-plays for all unordered pairs within that team (before this game).
      - Store team-level aggregates: synergy_mean_team, synergy_max_team.
    Implementation detail:
      - Iterate games in chronological order (by max id in the game).
      - Maintain a dictionary that counts how many times pair (a,b) have co-played on a given side.
    """
    df = df.copy()
    # game chronological order (by last id within game)
    game_order = (df.groupby('game_id')['id'].max()
                    .sort_values().index.tolist())

    # dict key: (min(pid), max(pid), team) -> count of prior co-plays on that team
    pair_counts = {}

    # output rows: (game_id, team, synergy_mean, synergy_max)
    out_rows = []

    for gid in game_order:
        g = df[df['game_id'] == gid]
        for team in ['mafia', 'citizens']:
            team_players = g.loc[g['team'] == team, 'player_id'].tolist()
            pair_vals = []
            for a, b in combinations(sorted(team_players), 2):
                key = (a, b, team)
                pair_vals.append(pair_counts.get(key, 0))

            if len(pair_vals) == 0:
                s_mean, s_max = 0.0, 0.0
            else:
                s_mean = float(np.mean(pair_vals))
                s_max  = float(np.max(pair_vals))

            out_rows.append((gid, team, s_mean, s_max))

        # AFTER recording features, update counts with this game
        for team in ['mafia', 'citizens']:
            team_players = g.loc[g['team'] == team, 'player_id'].tolist()
            for a, b in combinations(sorted(team_players), 2):
                key = (a, b, team)
                pair_counts[key] = pair_counts.get(key, 0) + 1

    team_synergy = pd.DataFrame(out_rows, columns=['game_id','team','synergy_mean_team','synergy_max_team'])
    # attach to every player-row (team-wise constant)
    return df.merge(team_synergy, on=['game_id','team'], how='left')

# AFTER: call it
work_players = add_synergy_features(work_players)
work_players[['game_id','team','synergy_mean_team','synergy_max_team']].head()


Unnamed: 0,game_id,team,synergy_mean_team,synergy_max_team
0,100001,citizens,0.0,0.0
1,100001,citizens,0.0,0.0
2,100001,mafia,0.0,0.0
3,100001,mafia,0.0,0.0
4,100001,citizens,0.0,0.0


### Streak features (momentum)
Compute per-player **pre-game** win_streak and loss_streak using only past results. Aggregate to team (mean, max).


In [6]:
def add_streak_features(df):
    """
    Adds per-player pre-game streaks:
      - win_streak: consecutive wins ending just before this game
      - loss_streak: consecutive losses ending just before this game
    """
    df = df.sort_values(['player_id','id']).copy()
    win_streaks = []
    loss_streaks = []

    for pid, g in df.groupby('player_id', sort=False):
        prev = g['team_win'].shift(1).values  # past outcomes only
        w_stk = np.zeros(len(g), dtype=np.int16)
        l_stk = np.zeros(len(g), dtype=np.int16)
        cur_w, cur_l = 0, 0
        for i, v in enumerate(prev):
            if np.isnan(v):
                cur_w, cur_l = 0, 0
            else:
                if v == 1:  # previous result was win
                    cur_w += 1
                    cur_l = 0
                else:       # previous result was loss
                    cur_l += 1
                    cur_w = 0
            w_stk[i] = cur_w
            l_stk[i] = cur_l
        win_streaks.append(pd.Series(w_stk, index=g.index))
        loss_streaks.append(pd.Series(l_stk, index=g.index))

    df['win_streak']  = pd.concat(win_streaks).sort_index()
    df['loss_streak'] = pd.concat(loss_streaks).sort_index()
    return df.sort_values('id').reset_index(drop=True)

# AFTER: call it
work_players = add_streak_features(work_players)
work_players[['game_id','player_id','win_streak','loss_streak']].head()


Unnamed: 0,game_id,player_id,win_streak,loss_streak
0,100001,2,0,0
1,100001,26,0,0
2,100001,50,0,0
3,100001,74,0,0
4,100001,98,0,0


## 3) Aggregate to team level (two rows per game) + matchup deltas
- Aggregate per-team: mean/std/min/max/q25/q75 of `pre_elo`; mean of rolling stats.
- Create deltas: (mafia - citizens) for each aggregate to encode matchup strength.
- Add: `team_win_team` (label) and `game_max_id` (time proxy per game).


In [7]:
def q25(x): return x.quantile(0.25)

def q75(x): return x.quantile(0.75)

def build_team_agg(work_players):
    # 1) Base numeric aggregates (+ seat, breaks, streaks, side/role elos)
    agg_funcs = {
        'pre_elo': ['mean','std','min','max', q25, q75],
        'pre_elo_side': ['mean'],
        'pre_elo_role': ['mean'],
        'gap_id_clipped': ['mean','max'],
        'long_break_flag': ['sum'],
        'place': ['mean','std','min','max'],
        'win_streak': ['mean','max'],
        'loss_streak': ['mean','max'],
        'synergy_mean_team': ['mean'],  # team-wise constant per game, but keep API consistent
        'synergy_max_team':  ['mean'],
        'games_played': ['mean','std','min','max'],  # ensure std included
    }
    for side in ['mafia','citizens']:
        agg_funcs[f'roll5_win_rate_{side}']  = ['mean']
        agg_funcs[f'roll20_win_rate_{side}'] = ['mean']

    base = work_players.groupby(['game_id','team']).agg(agg_funcs)
    base.columns = ['_'.join(filter(None, map(str, c))).replace('<function ','').replace('>','') for c in base.columns]
    base = base.reset_index()

    # 2) Role-specific Elo & seats
    def single_role_stat(role, value_col):
        s = (work_players[work_players['role'] == role]
             .groupby(['game_id','team'])[value_col]
             .mean()
             .rename(f'{role}_{value_col}'))
        
        return s

    don_elo     = single_role_stat('don', 'pre_elo_role')
    sheriff_elo = single_role_stat('sheriff', 'pre_elo_role')
    don_place   = single_role_stat('don', 'place')
    sheriff_place = single_role_stat('sheriff', 'place')

    def mean_role_stat(role, value_col):
        s = (work_players[work_players['role'] == role]
             .groupby(['game_id','team'])[value_col]
             .mean()
             .rename(f'{role}_mean_{value_col}'))
        
        return s

    black_elo_mean = mean_role_stat('black', 'pre_elo_role')
    red_elo_mean   = mean_role_stat('red', 'pre_elo_role')

    role_feats = pd.concat([don_elo, sheriff_elo, don_place, sheriff_place,
                            black_elo_mean, red_elo_mean], axis=1).reset_index()

    team_agg = base.merge(role_feats, on=['game_id','team'], how='left')

    # 3) Label & time proxy
    labels  = work_players.groupby(['game_id','team'])['team_win'].max().rename('team_win_team')
    gmaxid  = work_players.groupby('game_id')['id'].max().rename('game_max_id')
    team_agg = team_agg.merge(labels, on=['game_id','team']).merge(gmaxid, on='game_id')

    # 4) SAFE deltas
    wide = team_agg.pivot(index='game_id', columns='team')
    wide.columns = [f"{a}__{b}" for a,b in wide.columns]
    wide = wide.reset_index()

    def side_cols(side): return [c for c in wide.columns if c.endswith(f"__{side}") and c != 'game_id']
    maf_cols = side_cols('mafia')

    delta = pd.DataFrame({'game_id': wide['game_id']})

    for mcol in maf_cols:
        if 'team_win_team' in mcol:
            continue  # never delta the target
        base_name = mcol[:-len("__mafia")]
        ccol = base_name + "__citizens"
        if ccol in wide.columns:
            delta[base_name + "__delta_maf_minus_cit"] = wide[mcol] - wide[ccol]

    team_tall = team_agg.merge(delta, on='game_id', how='left')
    
    return team_tall

# build new team dataset
team_tall = build_team_agg(work_players)

# Save aggregated dataset
OUT_DIR.mkdir(exist_ok=True, parents=True)
team_tall.to_csv(OUT_DIR/"mafia_team_agg.csv", index=False)
team_tall.to_parquet(OUT_DIR/"mafia_team_agg.parquet", index=False)
team_tall.head(3)


Unnamed: 0,game_id,team,pre_elo_mean,pre_elo_std,pre_elo_min,pre_elo_max,pre_elo_q25,pre_elo_q75,pre_elo_side_mean,pre_elo_role_mean,...,roll20_win_rate_mafia_mean__delta_maf_minus_cit,roll5_win_rate_citizens_mean__delta_maf_minus_cit,roll20_win_rate_citizens_mean__delta_maf_minus_cit,don_pre_elo_role__delta_maf_minus_cit,sheriff_pre_elo_role__delta_maf_minus_cit,don_place__delta_maf_minus_cit,sheriff_place__delta_maf_minus_cit,black_mean_pre_elo_role__delta_maf_minus_cit,red_mean_pre_elo_role__delta_maf_minus_cit,game_max_id__delta_maf_minus_cit
0,100001,citizens,1500.0,0.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,,,,,,,,,,0
1,100001,mafia,1500.0,0.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,,,,,,,,,,0
2,100002,citizens,1500.0,0.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,,,,,,,,,,0


## 4) Features, CV strategy, and baselines
- `X`: team aggregates + delta features
- `y`: `team_win_team`
- `groups`: `game_id`
- Chronological split by `game_max_id` (70/15/15) for calibration & final holdout

In [8]:
team_only = [c for c in team_tall.columns if c.startswith((
    'pre_elo_', 'gap_id_clipped_', 'long_break_flag_',
    'place_', 'win_streak_', 'loss_streak_',
    'synergy_mean_team_', 'synergy_max_team_',
    'games_played_', 'don_pre_elo_role', 'sheriff_pre_elo_role',
    'don_place', 'sheriff_place', 'black_mean_pre_elo_role', 'red_mean_pre_elo_role',
    'roll5_win_rate_mafia_', 'roll20_win_rate_mafia_', 'roll5_win_rate_citizens_', 'roll20_win_rate_citizens_'
))]

delta_feats = [c for c in team_tall.columns if c.endswith('__delta_maf_minus_cit')]

forbidden_tokens = {'team_win','team_win_team'}
USED_FEATS = [c for c in sorted(set(team_only + delta_feats))
              if not any(tok in c for tok in forbidden_tokens)]

X = team_tall[USED_FEATS].fillna(0)
y = team_tall['team_win_team'].astype(int).values
groups = team_tall['game_id'].values
time_key = team_tall['game_max_id'].values

q70, q85 = np.quantile(time_key, [0.70, 0.85])
train_mask = time_key <= q85
cal_mask   = (time_key > q70) & (time_key <= q85)
test_mask  = time_key > q85

### Removing features

In [9]:
low_var = X.columns[X.std() < 1e-6]
print("Near-constant features:", list(low_var))
X.drop(columns=low_var, inplace=True)
USED_FEATS = [c for c in USED_FEATS if c not in set(low_var)]

corr = X[train_mask].corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if (upper[col] > 0.98).any()]
print("Highly collinear to drop:", to_drop[:20])
X.drop(columns=to_drop, inplace=True)
USED_FEATS = [c for c in USED_FEATS if c not in set(to_drop)]

# # quick one-pass using final model (or average across folds if you saved them)
# imp = pd.DataFrame({'feature': X.columns,
#                     'gain': final.booster_.feature_importance(importance_type='gain')})
# zero_imp = imp.loc[imp['gain'] == 0, 'feature'].tolist()
# print("Zero-importance:", zero_imp[:20])
# X.drop(columns=zero_imp, inplace=True)
# USED_FEATS = [c for c in USED_FEATS if c not in set(zero_imp)]

Near-constant features: ['black_mean_pre_elo_role__delta_maf_minus_cit', 'don_place__delta_maf_minus_cit', 'don_pre_elo_role__delta_maf_minus_cit', 'game_max_id__delta_maf_minus_cit', 'red_mean_pre_elo_role__delta_maf_minus_cit', 'roll20_win_rate_citizens_mean__delta_maf_minus_cit', 'roll20_win_rate_mafia_mean__delta_maf_minus_cit', 'roll5_win_rate_citizens_mean__delta_maf_minus_cit', 'roll5_win_rate_mafia_mean__delta_maf_minus_cit', 'sheriff_place__delta_maf_minus_cit', 'sheriff_pre_elo_role__delta_maf_minus_cit']
Highly collinear to drop: ['don_pre_elo_role', 'red_mean_pre_elo_role', 'roll5_win_rate_citizens_mean', 'roll5_win_rate_mafia_mean', 'sheriff_pre_elo_role']


In [10]:
print("Features:", len(USED_FEATS))
print("Rows:", len(team_tall), "| Train:", train_mask.sum(), "Cal:", cal_mask.sum(), "Test:", test_mask.sum())

Features: 55
Rows: 160564 | Train: 136480 Cal: 24084 Test: 24084


In [11]:
# Baseline-0 constant 0.5 on holdout
p_const = np.repeat(0.5, len(y))
print("Baseline-0 (holdout) LogLoss:", log_loss(y[test_mask], p_const[test_mask]))
print("Baseline-0 (holdout) Brier  :", brier_score_loss(y[test_mask], p_const[test_mask]))
# ROC-AUC is undefined for a constant predictor; skip.

# Baseline-1 Logistic Regression with GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

logreg = Pipeline([
    ('scaler', StandardScaler(with_mean=True)),  # center & scale (X is dense)
    ('lr', LogisticRegression(
        max_iter=5000,          # more steps
        solver='lbfgs',         # good default for L2
        penalty='l2',
        n_jobs=-1,
        random_state=SEED
    ))
])

gkf = GroupKFold(n_splits=5)
lls, aucs, brs = [], [], []
for tr, va in gkf.split(X, y, groups=groups):
    logreg.fit(X.iloc[tr], y[tr])
    p = logreg.predict_proba(X.iloc[va])[:,1]
    lls.append(log_loss(y[va], p))
    aucs.append(roc_auc_score(y[va], p))
    brs.append(brier_score_loss(y[va], p))
print(f"Baseline-1 (LogReg) | LogLoss {np.mean(lls):.4f}±{np.std(lls):.4f} | "
      f"AUC {np.mean(aucs):.4f}±{np.std(aucs):.4f} | Brier {np.mean(brs):.4f}±{np.std(brs):.4f}")


Baseline-0 (holdout) LogLoss: 0.6931471805599454
Baseline-0 (holdout) Brier  : 0.25
Baseline-1 (LogReg) | LogLoss 0.6841±0.0010 | AUC 0.5746±0.0039 | Brier 0.2455±0.0005


## 5) LightGBM (main model) + CV + calibration + holdout
### Model tuning (LightGBM)
Slightly higher capacity + early stopping + class_weight='balanced'. Calibrate with sigmoid.


In [12]:
from lightgbm import LGBMClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss, roc_auc_score, brier_score_loss
from sklearn.model_selection import GroupKFold
from lightgbm import early_stopping, log_evaluation
import numpy as np

params = dict(
    n_estimators=3000,
    learning_rate=0.01,
    num_leaves=127,
    min_data_in_leaf=40,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.5,
    reg_alpha=0.3,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

gkf = GroupKFold(n_splits=5)
lls, aucs, brs = [], [], []
for k,(tr,va) in enumerate(gkf.split(X,y,groups=groups),1):
    model = LGBMClassifier(**params)
    model.fit(
    X.iloc[tr], y[tr],
    eval_set=[(X.iloc[va], y[va])],
    eval_metric='logloss',
    callbacks=[
        early_stopping(stopping_rounds=100),  # same meaning as early_stopping_rounds
        log_evaluation(0)                     # silences per-iteration logs; use log_evaluation(50) to see progress
    ]
)
    p = model.predict_proba(X.iloc[va])[:,1]
    lls.append(log_loss(y[va], p)); aucs.append(roc_auc_score(y[va], p)); brs.append(brier_score_loss(y[va], p))
    print(f"Fold {k}: LogLoss={lls[-1]:.4f} AUC={aucs[-1]:.4f} Brier={brs[-1]:.4f}")

print(f"\nCV | LogLoss {np.mean(lls):.4f}±{np.std(lls):.4f} | "
      f"AUC {np.mean(aucs):.4f}±{np.std(aucs):.4f} | Brier {np.mean(brs):.4f}±{np.std(brs):.4f}")

q70, q85 = np.quantile(time_key, [0.70, 0.85])
train_mask = time_key <= q85
cal_mask   = (time_key > q70) & (time_key <= q85)
test_mask  = time_key > q85

# make a tiny val split from the train window (e.g., last 10% by time among train_mask)
tr_time = time_key[train_mask]
tr_q90 = np.quantile(tr_time, 0.90)
inner_tr = train_mask & (time_key <= tr_q90)
inner_va = train_mask & (time_key >  tr_q90)

final = LGBMClassifier(**params)
final.fit(
    X[inner_tr], y[inner_tr],
    eval_set=[(X[inner_va], y[inner_va])],
    eval_metric='logloss',
    callbacks=[early_stopping(stopping_rounds=100), log_evaluation(0)]
)


calibrated = CalibratedClassifierCV(final, cv='prefit', method='sigmoid').fit(X[cal_mask], y[cal_mask])

p_test = calibrated.predict_proba(X[test_mask])[:,1]
print("\nHoldout (last 15%)")
print("LogLoss:", log_loss(y[test_mask], p_test))
print("ROC-AUC:", roc_auc_score(y[test_mask], p_test))
print("Brier  :", brier_score_loss(y[test_mask], p_test))


[LightGBM] [Info] Number of positive: 64225, number of negative: 64225
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.079494 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8362
[LightGBM] [Info] Number of data points in the train set: 128450, number of used features: 55
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[253]	valid_0's binary_logloss: 0.68381
Fold 1: LogLoss=0.6838 AUC=0.5770 Brier=0.2453
[LightGBM] [Info] Number of positive: 64225, number of negative: 64225
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074530 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8364
[LightGBM] [Info] Number of data points in the train set: 128450, number of used features: 55
[LightGBM] [Inf




Holdout (last 15%)
LogLoss: 0.6810904739046589
ROC-AUC: 0.5912541792703008
Brier  : 0.243958086341504


### Feature cleaning

In [13]:
import pandas as pd
fi = pd.DataFrame({
    'feature': X.columns,
    'gain': final.booster_.feature_importance(importance_type='gain')
}).sort_values('gain', ascending=False)
fi.head(15)


Unnamed: 0,feature,gain
0,black_mean_pre_elo_role,83972.693232
1,don_place,15973.395452
41,pre_elo_side_mean__delta_maf_minus_cit,13711.242615
27,place_std__delta_maf_minus_cit,10598.319596
35,pre_elo_q25__delta_maf_minus_cit,10245.660873
13,gap_id_clipped_mean__delta_maf_minus_cit,9368.866647
39,pre_elo_role_mean__delta_maf_minus_cit,8837.018718
40,pre_elo_side_mean,8660.291512
38,pre_elo_role_mean,8497.468383
36,pre_elo_q75,8256.890153


### Sanity-Check 0 — Context
We assume you already have these variables from the notebook:
- `team_tall`  : the per-team dataset (2 rows per game)
- `USED_FEATS` : the list of feature column names used to train
- `X, y`       : features (DataFrame) and labels (numpy array)
- `groups`     : `game_id` for GroupKFold
- `time_key`   : `game_max_id` (time proxy per game)
- masks: `train_mask`, `cal_mask`, `test_mask`
- models: `final` (LightGBM before calibration), `calibrated` (after isotonic)

In [14]:
from sklearn.metrics import log_loss, roc_auc_score, brier_score_loss

# Raw (uncalibrated) predictions on holdout
p_test_raw = final.predict_proba(X[test_mask])[:, 1]
print("RAW (no calibration) — holdout")
print("LogLoss:", log_loss(y[test_mask], p_test_raw))
print("AUC    :", roc_auc_score(y[test_mask], p_test_raw))
print("Brier  :", brier_score_loss(y[test_mask], p_test_raw))

# Calibrated (your current numbers)
p_test_cal = calibrated.predict_proba(X[test_mask])[:, 1]
print("\nCALIBRATED — holdout")
print("LogLoss:", log_loss(y[test_mask], p_test_cal))
print("AUC    :", roc_auc_score(y[test_mask], p_test_cal))
print("Brier  :", brier_score_loss(y[test_mask], p_test_cal))


RAW (no calibration) — holdout
LogLoss: 0.6806256995681352
AUC    : 0.5912541792703008
Brier  : 0.2437700147688812

CALIBRATED — holdout
LogLoss: 0.6810904739046589
AUC    : 0.5912541792703008
Brier  : 0.243958086341504


In [15]:
rng = np.random.default_rng(42)
y_shuffled = y.copy()
y_shuffled[test_mask] = rng.permutation(y_shuffled[test_mask])

print("Label-shuffled holdout:")
print("LogLoss:", log_loss(y_shuffled[test_mask], p_test_cal))
print("AUC    :", roc_auc_score(y_shuffled[test_mask], p_test_cal))
print("Brier  :", brier_score_loss(y_shuffled[test_mask], p_test_cal))


Label-shuffled holdout:
LogLoss: 0.7147548988978578
AUC    : 0.5003733989940153
Brier  : 0.26041983603755


In [16]:
# Shuffle the order of X within the holdout
X_test = X[test_mask].copy()
X_test_shuffled = X_test.sample(frac=1.0, random_state=123)

# Predict on shuffled features (same trained model)
p_test_shufX = calibrated.predict_proba(X_test_shuffled)[:, 1]

print("Feature-shuffled holdout:")
print("LogLoss:", log_loss(y[test_mask], p_test_shufX))
print("AUC    :", roc_auc_score(y[test_mask], p_test_shufX))
print("Brier  :", brier_score_loss(y[test_mask], p_test_shufX))


Feature-shuffled holdout:
LogLoss: 0.7133639023716307
AUC    : 0.5044940318639509
Brier  : 0.25976301299755994


In [17]:
suspects = {'team_win','team_win_team',
            'game_points','total_points',
            'game_bonus','game_autobonus','best_move_bonus',
            'killed_first','best_move'}
bad = [c for c in USED_FEATS if any(s in c for s in suspects)]
print("Forbidden features found:", bad)

Forbidden features found: []


## 6) Convert to per-game winner & accuracy
Pick the side with larger probability within each game on the holdout.

In [18]:
hold = team_tall.loc[test_mask, ['game_id','team','team_win_team']].copy()
hold['proba'] = p_test

# True side per game
true_side = hold[hold['team_win_team']==1].groupby('game_id')['team'].first()

# Predicted side by higher prob
pred_side = hold.groupby('game_id').apply(lambda g: g.loc[g['proba'].idxmax(),'team'])
acc = (pred_side == true_side.reindex(pred_side.index)).mean()
print("Per-game accuracy (holdout):", round(acc, 4))


Per-game accuracy (holdout): 0.5797


  pred_side = hold.groupby('game_id').apply(lambda g: g.loc[g['proba'].idxmax(),'team'])


## 7) Save model & inference helpers


In [19]:
import joblib, json

joblib.dump(calibrated, ARTS/"lgbm_calibrated_pergame.joblib")
json.dump(USED_FEATS, open(ARTS/"pergame_features.json","w"))
print("Saved to:", ARTS)

def build_team_features_from_players(df_players: pd.DataFrame) -> pd.DataFrame:
    w = compute_elo(df_players)
    w = add_rolling_stats(w)

    aggs = {
        'pre_elo': ['mean','std','min','max', q25, q75],
        'roll5_win_rate': ['mean'],
        'roll20_win_rate': ['mean'],
        'career_win_rate': ['mean'],
        'games_played': ['mean','min','max', 'std']
    }
    if 'roll5_pts_mean' in w.columns:
        aggs['roll5_pts_mean'] = ['mean']
    if 'roll20_pts_mean' in w.columns:
        aggs['roll20_pts_mean'] = ['mean']

    ta = w.groupby(['game_id','team']).agg(aggs)
    ta.columns = ['_'.join(filter(None, map(str, c))).replace('<function ','').replace('>','') for c in ta.columns]
    ta = ta.reset_index()
    if 'id' in w.columns:
        gmaxid = w.groupby('game_id')['id'].max().rename('game_max_id')
        ta = ta.merge(gmaxid, on='game_id', how='left')

    wide = ta.pivot(index='game_id', columns='team')
    wide.columns = [f"{a}__{b}" for a,b in wide.columns]
    wide = wide.reset_index()

    def side_cols(side): return [c for c in wide.columns if c.endswith(f"__{side}") and c != 'game_id']
    maf_cols = side_cols('mafia')

    delta = pd.DataFrame({'game_id': wide['game_id']})
    for mcol in maf_cols:
        base = mcol[:-len("__mafia")]
        ccol = base + "__citizens"
        if ccol in wide.columns:
            delta[base + "__delta_maf_minus_cit"] = wide[mcol] - wide[ccol]

    team_tall_new = ta.merge(delta, on='game_id', how='left')
    return team_tall_new

def predict_game_winner_from_players(df_players_new: pd.DataFrame):
    tt = build_team_features_from_players(df_players_new)
    X_new = tt[USED_FEATS].fillna(0)
    proba = calibrated.predict_proba(X_new)[:,1]
    out = tt[['game_id','team']].copy()
    out['p_team_win'] = proba
    winners = out.loc[out.groupby('game_id')['p_team_win'].idxmax()].rename(columns={'team':'pred_team'})
    winners = winners[['game_id','pred_team','p_team_win']]
    return out, winners

print("Inference functions ready.")


Saved to: artifacts_game
Inference functions ready.


## 8) “Daily” monitoring (simulated)
Score newest ~2% of games by `game_max_id` to catch recent drift.

In [20]:
cut = np.quantile(time_key, 0.98)
daily_mask = time_key > cut
p_daily = calibrated.predict_proba(X[daily_mask])[:,1]
print("DAILY LogLoss:", log_loss(y[daily_mask], p_daily))
print("DAILY ROC-AUC:", roc_auc_score(y[daily_mask], p_daily))
print("DAILY Brier  :", brier_score_loss(y[daily_mask], p_daily))


DAILY LogLoss: 0.6759773134520557
DAILY ROC-AUC: 0.6053565862139021
DAILY Brier  : 0.2414604019967574
