setup

In [2]:
# --- Setup & configuration ---
import os, sys, math, json, time, zipfile
from pathlib import Path

import numpy as np
import pandas as pd
import requests

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import brier_score_loss, roc_auc_score
from sklearn.model_selection import StratifiedKFold


SEASON_START_YEAR = 2025      # 2025-26 season
TEAM = "TOR"                  # Leafs
DATA_DIR = Path("../data")
DATA_DIR.mkdir(exist_ok=True)

# If you already have local copies of your uploaded files, set these paths:
SKATERS_CSV = DATA_DIR / "raw/skaters.csv"          # put your uploaded skaters.csv here
TEAMS_CSV   = DATA_DIR / "raw/teams.csv"            # optional
TXN_CSV     = DATA_DIR / "clean/transactions.csv"  # will create a template if missing



2) Helper: schedule fetch + simple rest features

What this does: pulls the Leafs’ regular-season schedule from the NHL API and computes simple back-to-back and rest features. Writes data/schedule.csv.

In [3]:
def load_leafs_schedule_from_csv(
    csv_path: str,
    team_abbr: str = "TOR",
    date_col: str = "Date",
    home_col: str = "Home Team",
    away_col: str = "Away Team",
) -> pd.DataFrame:
    """
    Read a FixtureDownload-style schedule CSV for the Leafs and return a clean
    DataFrame with modeling features:
      - date (datetime64[ns])
      - home_team, away_team, opponent
      - home (1 if Leafs are home, else 0)
      - back_to_back (1 if game is the day after previous one)
      - rest_days (days since last Leafs game)
      - rest_diff (placeholder=0)
    """
    df = pd.read_csv(csv_path)

    # Basic column normalization
    rename_map = {date_col: "date", home_col: "home_team", away_col: "away_team"}
    for k in (date_col, home_col, away_col):
        if k not in df.columns:
            raise KeyError(f"Expected column '{k}' not found in CSV. Found: {list(df.columns)}")
    df = df.rename(columns=rename_map)

    # Keep only columns we need (+ optional refs)
    keep = ["date", "home_team", "away_team"]
    for opt in ["Location", "Result", "Match Number", "Round Number"]:
        if opt in df.columns:
            keep.append(opt)
    df = df[keep].copy()

    # --- FIX: robust date parsing (your CSV is DD/MM/YYYY, no time) ---
    # First try strict day-first; then fill any leftovers via generic parser (ISO etc.)
    date_str = df["date"].astype(str).str.strip()
    parsed = pd.to_datetime(date_str, format="%d/%m/%Y", errors="coerce")
    mask = parsed.isna()
    if mask.any():
        parsed.loc[mask] = pd.to_datetime(date_str[mask], errors="coerce")  # handles ISO like YYYY-MM-DD
    df["date"] = parsed

    # Sort by date
    df = df.sort_values("date").reset_index(drop=True)

    # Leafs perspective
    df["home"] = (df["home_team"] == team_abbr).astype(int)
    df["opponent"] = np.where(df["home"] == 1, df["away_team"], df["home_team"])

    # Rest features (Leafs sequence)
    df["back_to_back"] = 0
    df["rest_days"] = 2
    last_date = None
    for i, r in df.iterrows():
        cur_date = r["date"]
        if pd.notna(cur_date) and last_date is not None:
            delta = (cur_date - last_date).days
            df.at[i, "rest_days"] = max(delta, 0)
            if delta == 1:
                df.at[i, "back_to_back"] = 1
        if pd.notna(cur_date):
            last_date = cur_date

    # Placeholder until you compute opponent rest and subtract
    df["rest_diff"] = 0

    # Reorder
    order = ["date", "home_team", "away_team", "opponent", "home",
             "back_to_back", "rest_days", "rest_diff"]
    order += [c for c in ["Location", "Result", "Match Number", "Round Number"] if c in df.columns]
    return df[order]

# Use it:
schedule_df = load_leafs_schedule_from_csv("../data/raw/schedule.csv")
# Optional quick check:
print({"date_nulls_after_parse": int(schedule_df["date"].isna().sum())})
schedule_df.tail(10)


{'date_nulls_after_parse': 0}


Unnamed: 0,date,home_team,away_team,opponent,home,back_to_back,rest_days,rest_diff,Location,Result,Match Number,Round Number
72,2026-03-25,Toronto Maple Leafs,New York Rangers,Toronto Maple Leafs,0,1,1,0,Scotiabank Arena,,1136,21
73,2026-03-28,St. Louis Blues,Toronto Maple Leafs,St. Louis Blues,0,0,3,0,Enterprise Center,,1160,22
74,2026-03-31,Anaheim Ducks,Toronto Maple Leafs,Anaheim Ducks,0,0,3,0,Honda Center,,1176,22
75,2026-04-03,San Jose Sharks,Toronto Maple Leafs,San Jose Sharks,0,0,3,0,SAP Center at San Jose,,1202,23
76,2026-04-04,Los Angeles Kings,Toronto Maple Leafs,Los Angeles Kings,0,1,1,0,Crypto.com Arena,,1217,23
77,2026-04-08,Toronto Maple Leafs,Washington Capitals,Toronto Maple Leafs,0,0,4,0,Scotiabank Arena,,1245,23
78,2026-04-09,New York Islanders,Toronto Maple Leafs,New York Islanders,0,1,1,0,UBS Arena,,1252,24
79,2026-04-11,Toronto Maple Leafs,Florida Panthers,Toronto Maple Leafs,0,0,2,0,Scotiabank Arena,,1270,24
80,2026-04-13,Toronto Maple Leafs,Dallas Stars,Toronto Maple Leafs,0,0,2,0,Scotiabank Arena,,1285,24
81,2026-04-15,Ottawa Senators,Toronto Maple Leafs,Ottawa Senators,0,0,2,0,Canadian Tire Centre,,1304,24


adding elo to each game

In [4]:
# --- Composite Elo (EV/PP/PK + Save%) and attach to schedule_df ---

# Keep your dictionary exactly
name_to_abbr = {
    "Anaheim Ducks": "ANA","Arizona Coyotes": "ARI","Boston Bruins": "BOS","Buffalo Sabres": "BUF",
    "Calgary Flames": "CGY","Carolina Hurricanes": "CAR","Chicago Blackhawks": "CHI","Colorado Avalanche": "COL",
    "Columbus Blue Jackets": "CBJ","Dallas Stars": "DAL","Detroit Red Wings": "DET","Edmonton Oilers": "EDM",
    "Florida Panthers": "FLA","Los Angeles Kings": "LAK","Minnesota Wild": "MIN","Montreal Canadiens": "MTL",
    "Nashville Predators": "NSH","New Jersey Devils": "NJD","New York Islanders": "NYI","New York Rangers": "NYR",
    "Ottawa Senators": "OTT","Philadelphia Flyers": "PHI","Pittsburgh Penguins": "PIT","San Jose Sharks": "SJS",
    "Seattle Kraken": "SEA","St. Louis Blues": "STL","Tampa Bay Lightning": "TBL","Toronto Maple Leafs": "TOR",
    "Utah Hockey Club": "UTA","Vancouver Canucks": "VAN","Vegas Golden Knights": "VGK","Washington Capitals": "WSH",
    "Winnipeg Jets": "WPG"
}

# ---------- Base team metrics from teams.csv (latest season) ----------
teams_df = pd.read_csv("../data/raw/teams.csv").copy()
teams_df["season"] = pd.to_numeric(teams_df["season"], errors="coerce")
latest_season = int(teams_df["season"].max())
t = teams_df.loc[teams_df["season"] == latest_season, ["team","situation","iceTime","xGoalsFor","xGoalsAgainst"]].copy()

# Map situations to EV/PP/PK
SIT_MAP = {"5on5":"EV", "5on4":"PP", "4on5":"PK"}
t["SIT"] = t["situation"].map(SIT_MAP)

# Rates per 60
t["iceTime"] = pd.to_numeric(t["iceTime"], errors="coerce").replace(0, pd.NA)
t["xGoalsFor"] = pd.to_numeric(t["xGoalsFor"], errors="coerce")
t["xGoalsAgainst"] = pd.to_numeric(t["xGoalsAgainst"], errors="coerce")
t["xGF60"] = (t["xGoalsFor"] / t["iceTime"]) * 60.0
t["xGA60"] = (t["xGoalsAgainst"] / t["iceTime"]) * 60.0
t["net_xG60"] = t["xGF60"] - t["xGA60"]

# Aggregate
pp = t.loc[t["SIT"]=="PP"].groupby("team", as_index=False)[["xGF60"]].mean()
pk = t.loc[t["SIT"]=="PK"].groupby("team", as_index=False)[["xGA60"]].mean()
ev = t.loc[t["SIT"]=="EV"].groupby("team", as_index=False)[["net_xG60"]].mean()

# League means for fill
pp_mean = float(pp["xGF60"].mean()) if len(pp) else 0.0
pk_mean = float(pk["xGA60"].mean()) if len(pk) else 0.0
ev_mean = float(ev["net_xG60"].mean()) if len(ev) else 0.0

all_teams = pd.DataFrame({"team": t["team"].dropna().unique()})
pp = all_teams.merge(pp, on="team", how="left"); pp["xGF60"] = pp["xGF60"].fillna(pp_mean)
pk = all_teams.merge(pk, on="team", how="left"); pk["xGA60"] = pk["xGA60"].fillna(pk_mean)
ev = all_teams.merge(ev, on="team", how="left"); ev["net_xG60"] = ev["net_xG60"].fillna(ev_mean)

# Standardize -> z-scores
def zscore(s):
    s = s.astype(float); mu = s.mean(); sd = s.std(ddof=0)
    return (s - mu) / (sd if sd != 0 else 1.0)

ev_z = zscore(ev["net_xG60"])                    # higher is better
pp_z = zscore(pp["xGF60"])                       # higher is better
pk_z = zscore(pk["xGA60"].mean() - pk["xGA60"])  # lower xGA60 => better PK



sv_df = pd.read_csv("../data/clean/team_save_percentages.csv") 

# Map full team names -> acronyms to align with 'team' codes in teams.csv
sv_df = sv_df.rename(columns={"team":"team_name","savePct":"savePct"})
sv_df["team"] = sv_df["team_name"].map(name_to_abbr)
sv = all_teams.merge(sv_df[["team","savePct"]], on="team", how="left")
# Fill missing with league mean (e.g., expansion/rebrand edge cases)
sv["savePct"] = pd.to_numeric(sv["savePct"], errors="coerce")
sv["savePct"] = sv["savePct"].fillna(sv["savePct"].mean())
sv_z = zscore(sv["savePct"])                      # higher is better

# ---------- Combine (favor PP a bit more, include SV%) ----------
# Weights sum to 1.0
W_EV, W_PP, W_PK, W_SV = 0.60, 0.20, 0.15, 0.05   # PP favored; modest SV influence

comp = all_teams.copy()
comp["ev_z"] = ev_z.values
comp["pp_z"] = pp_z.values
comp["pk_z"] = pk_z.values
comp["sv_z"] = sv_z.values

comp["z_composite"] = (
    W_EV*comp["ev_z"] +
    W_PP*comp["pp_z"] +
    W_PK*comp["pk_z"] +
    W_SV*comp["sv_z"]
)

# Elo mapping (same scale as before)
comp["elo"] = 1500.0 + 100.0 * comp["z_composite"]

# Build acronym → Elo dict
teams_elos = comp[["team","elo"]].copy()
elo_map = dict(zip(teams_elos["team"], teams_elos["elo"]))

# ---- Normalize schedule_df to acronyms and attach Elo (unchanged) ----
sch = schedule_df.copy()
sch["home_team"] = sch["home_team"].map(name_to_abbr)
sch["away_team"] = sch["away_team"].map(name_to_abbr)

TEAM = "TOR"
sch["home"] = (sch["home_team"] == TEAM).astype(int)
sch["opponent"] = np.where(sch["home"]==1, sch["away_team"], sch["home_team"])

sch["elo_for"] = np.where(sch["home"]==1, sch["home_team"].map(elo_map), sch["away_team"].map(elo_map))
sch["elo_against"] = np.where(sch["home"]==1, sch["away_team"].map(elo_map), sch["home_team"].map(elo_map))

# Save outputs
sch.to_csv("../data/clean/schedule_with_elos.csv", index=False)
teams_elos.to_csv("../data/clean/teams_elos.csv", index=False)

sch.tail(10)


Unnamed: 0,date,home_team,away_team,opponent,home,back_to_back,rest_days,rest_diff,Location,Result,Match Number,Round Number,elo_for,elo_against
72,2026-03-25,TOR,NYR,NYR,1,1,1,0,Scotiabank Arena,,1136,21,1501.123969,1478.252309
73,2026-03-28,STL,TOR,STL,0,0,3,0,Enterprise Center,,1160,22,1501.123969,1472.227119
74,2026-03-31,ANA,TOR,ANA,0,0,3,0,Honda Center,,1176,22,1501.123969,1375.575028
75,2026-04-03,SJS,TOR,SJS,0,0,3,0,SAP Center at San Jose,,1202,23,1501.123969,1326.2505
76,2026-04-04,LAK,TOR,LAK,0,1,1,0,Crypto.com Arena,,1217,23,1501.123969,1605.0989
77,2026-04-08,TOR,WSH,WSH,1,0,4,0,Scotiabank Arena,,1245,23,1501.123969,1549.680981
78,2026-04-09,NYI,TOR,NYI,0,1,1,0,UBS Arena,,1252,24,1501.123969,1452.205223
79,2026-04-11,TOR,FLA,FLA,1,0,2,0,Scotiabank Arena,,1270,24,1501.123969,1632.53545
80,2026-04-13,TOR,DAL,DAL,1,0,2,0,Scotiabank Arena,,1285,24,1501.123969,1532.127545
81,2026-04-15,OTT,TOR,OTT,0,0,2,0,Canadian Tire Centre,,1304,24,1501.123969,1463.97935


add in transactions from offseason, quantify player impact

In [41]:
# --- Compute Leafs offseason Elo bump and apply it to the schedule ---

import pandas as pd
import numpy as np

# 1) Compute Elo bump from transactions + skaters (GS/GP)
sk = pd.read_csv(SKATERS_CSV).rename(columns={"name": "player"})
tx = pd.read_csv(TXN_CSV)

# If skaters has multiple seasons, use the latest
if "season" in sk.columns:
    sk["season"] = pd.to_numeric(sk["season"], errors="coerce")
    latest_season = int(sk["season"].max())
    sk = sk[sk["season"] == latest_season].copy()

# Normalize names to avoid mismatches (e.g., "Mitch Marner" vs "Mitchell Marner")
def _norm_name(s):
    if not isinstance(s, str):
        return ""
    s = s.strip().lower()
    return "".join(ch for ch in s if ch.isalnum() or ch.isspace())

sk["player_norm"] = sk["player"].astype(str).map(_norm_name)
tx["player_norm"] = tx["player"].astype(str).map(_norm_name)

# Impact = gameScore per game (safe divide)
gp = pd.to_numeric(sk.get("games_played"), errors="coerce").replace(0, np.nan)
gs = pd.to_numeric(sk.get("gameScore"), errors="coerce")
sk["impact"] = (gs / gp).astype(float)

# Average per player (if multiple rows)
per_player = sk.groupby("player_norm", as_index=False)["impact"].mean()

# Merge onto transactions (IN = to TOR, OUT = from TOR)
tx_in  = tx.loc[tx["team_to"]=="TOR"].merge(per_player, on="player_norm", how="left")
tx_out = tx.loc[tx["team_from"]=="TOR"].merge(per_player, on="player_norm", how="left")

# Compute net impact and convert to Elo
in_sum  = tx_in["impact"].fillna(0).sum()
out_sum = tx_out["impact"].fillna(0).sum()
net_impact = float(in_sum - out_sum)
ELO_PER_UNIT = 10.0  # your chosen scale: 10 Elo per 1 GS/GP unit
elo_bump = net_impact * ELO_PER_UNIT

print("=== Offseason impact (GS/GP) ===")
print(f"IN  total impact: {in_sum:.3f}")
print(f"OUT total impact: {out_sum:.3f}")
print(f"NET impact (IN-OUT): {net_impact:.3f}")
print(f"Elo bump applied to TOR games: {elo_bump:+.1f}  (factor {ELO_PER_UNIT} per GS/GP)")

# 2) Apply the bump to Leafs' side of each game in the schedule
sch = pd.read_csv("../data/clean/schedule_with_elos.csv")

is_home = sch["home_team"] == "TOR"
is_away = sch["away_team"] == "TOR"

# Add the same bump to Toronto's pregame strength whether home or away
sch.loc[is_home, "elo_for"] += elo_bump
sch.loc[is_away, "elo_for"] += elo_bump

sch["elo_for"] = sch["elo_for"].round(1)

# Save new elo
TEAMS_ELOS_PATH = "../data/clean/teams_elos.csv"
teams_elos = pd.read_csv(TEAMS_ELOS_PATH)
curr_elo = teams_elos.loc[teams_elos["team"] == "TOR", "elo"]
new_elo  = int(curr_elo) + elo_bump
teams_elos.loc[teams_elos["team"] == "TOR", "elo"] = new_elo
teams_elos.to_csv(TEAMS_ELOS_PATH, index=False)


sch.to_csv("../data/clean/schedule_with_elos_bumped.csv", index=False)
sch.head(10)

teams_elos.head()


=== Offseason impact (GS/GP) ===
IN  total impact: 0.823
OUT total impact: 1.044
NET impact (IN-OUT): -0.221
Elo bump applied to TOR games: -2.2  (factor 10.0 per GS/GP)


  new_elo  = int(curr_elo) + elo_bump


Unnamed: 0,team,elo
0,VGK,1605.298458
1,TOR,1498.788076
2,MIN,1493.800423
3,PHI,1512.719274
4,BUF,1441.737514


backtest data for training

In [42]:
# Load raw Leafs backtest
bt = pd.read_csv("../data/raw/backtest.csv")

# Load precomputed team Elos
teams_elos = pd.read_csv("../data/clean/teams_elos.csv")
elo_map = dict(zip(teams_elos["team"], teams_elos["elo"]))

# Special case: map UTA to ARI if missing (carryover issue)
if "UTA" in elo_map and "ARI" not in elo_map:
    elo_map["ARI"] = elo_map["UTA"]

# --------------------------
# Build modeling schema
# --------------------------
df = pd.DataFrame()
df["date"] = pd.to_datetime(bt["Date"])
df["opponent"] = bt["Opponent"].map(name_to_abbr)

# Home flag (0=away,1=home)
df["home"] = (bt["is_away"] == 0).astype(int)

TEAM = "TOR"
df["home_team"] = np.where(df["home"] == 1, TEAM, df["opponent"])
df["away_team"] = np.where(df["home"] == 1, df["opponent"], TEAM)

# --------------------------
# Results & labels (with OT/SO handled as OTL for losses)
# --------------------------
# Basic win label
df["win"] = (bt["result"] == "W").astype(int)

# Normalize extra_time if present (values often "no", "OT", "SO")
extra_time = bt.get("extra_time")
if extra_time is None:
    extra_time = pd.Series(["no"] * len(bt))
extra_time_norm = (
    extra_time.fillna("no").astype(str).str.strip().str.lower()
)

# Normalize shootout indicator if present; else infer from extra_time
if "so" in bt.columns:
    try:
        so_flag = bt["so"].fillna(0).astype(int).clip(0, 1)
    except Exception:
        # Fall back to inference if non-numeric
        so_flag = (extra_time_norm == "so").astype(int)
else:
    so_flag = (extra_time_norm == "so").astype(int)

# A game that went beyond regulation (OT or SO)
went_beyond_reg = (extra_time_norm != "no") | (so_flag == 1)

# OTL = loss that went beyond regulation (OT or SO)
df["otl"] = ((bt["result"] == "L") & went_beyond_reg).astype(int)

# Regulation losses only (not OT/SO)
df["loss_reg"] = ((bt["result"] == "L") & (~went_beyond_reg)).astype(int)

# Standings points per game: 2 for W, 1 for OTL, 0 for reg L
df["points_earned"] = 2 * df["win"] + df["otl"]

# --------------------------
# Rest & back-to-back
# --------------------------
df = df.sort_values("date").reset_index(drop=True)
df["back_to_back"] = 0
df["rest_days"] = 2
last = None
for i, r in df.iterrows():
    if last is not None:
        d = (r["date"] - last).days
        df.at[i, "rest_days"] = max(d, 0)
        if d == 1:
            df.at[i, "back_to_back"] = 1
    last = r["date"]
df["rest_diff"] = 0  # placeholder

# --------------------------
# Attach Elo from precomputed map
# --------------------------
team_for = df["away_team"].where(df["home"] == 0, df["home_team"])
df["elo_for"] = team_for.map(elo_map)
df["elo_against"] = df["opponent"].map(elo_map)

# --------------------------
# Optional: quick season summary (W-L-OL and points)
# --------------------------
wins = int(df["win"].sum())
ols = int(df["otl"].sum())
reg_losses = int(df["loss_reg"].sum())
true_record = f"{wins}-{reg_losses}-{ols}"
standings_points = int(2 * wins + ols)
print({"games": int(len(df)), "true_record": true_record, "standings_points": standings_points})

# --------------------------
# Save
# --------------------------
df.to_csv("../data/clean/backtest_with_elos.csv", index=False)
bt = df  # keep using bt downstream
df.head(15)


{'games': 82, 'true_record': '52-26-4', 'standings_points': 108}


Unnamed: 0,date,opponent,home,home_team,away_team,win,otl,loss_reg,points_earned,back_to_back,rest_days,rest_diff,elo_for,elo_against
0,2024-10-09,MTL,0,MTL,TOR,0,0,1,0,0,2,0,1498.788076,1408.218896
1,2024-10-10,NJD,0,NJD,TOR,1,0,0,2,1,1,0,1498.788076,1578.122046
2,2024-10-12,PIT,1,TOR,PIT,1,0,0,2,0,2,0,1498.788076,1489.689508
3,2024-10-16,LAK,1,TOR,LAK,1,0,0,2,0,4,0,1498.788076,1605.0989
4,2024-10-19,NYR,1,TOR,NYR,0,0,1,0,0,3,0,1498.788076,1478.252309
5,2024-10-21,TBL,1,TOR,TBL,1,0,0,2,0,2,0,1498.788076,1546.676774
6,2024-10-22,CBJ,0,CBJ,TOR,0,0,1,0,1,1,0,1498.788076,1448.659061
7,2024-10-24,STL,1,TOR,STL,0,0,1,0,0,2,0,1498.788076,1472.227119
8,2024-10-26,BOS,0,BOS,TOR,0,1,0,1,0,2,0,1498.788076,1457.435121
9,2024-10-28,WPG,0,WPG,TOR,1,0,0,2,0,2,0,1498.788076,1598.478975


ML attempt

Elo-based model

In [43]:
# --- Softer, more random Elo probabilities ---

# Existing knobs you already have (keep or tweak)
HOME_EDGE = 5.0            # very slight home advantage
B2B_PENALTY = 10.0         # smaller penalty than before
REST_PTS_PER_DAY = 3.0     # rest_diff in Elo points per day

# New "randomness / humility" dials
ELO_SCALE_DEN = 600.0      # was ~400; larger => flatter probs (less Elo confidence)
ALPHA_ELO = 0.65           # weight on Elo; (1-ALPHA_ELO) shrinks toward 0.5
NOISE_SD = 35.0            # Elo-points SD; set 0.0 for deterministic

# Optional, reproducible noise
_rng = np.random.default_rng(42)

def elo_prob_row(row):
    # Base Elo difference from your files
    diff = row["elo_for"] - row["elo_against"]

    # Context tweaks
    if row["home"] == 1:
        diff += HOME_EDGE
    if row["back_to_back"] == 1:
        diff -= B2B_PENALTY
    diff += REST_PTS_PER_DAY * row.get("rest_diff", 0)

    # Inject noise in Elo *before* probability (set NOISE_SD=0 for none)
    if NOISE_SD and NOISE_SD > 0:
        diff = diff + _rng.normal(0.0, NOISE_SD)

    # Elo logistic with a gentler denominator (flatter probs)
    p_elo = 1.0 / (1.0 + 10.0 ** (-(diff) / ELO_SCALE_DEN))

    # Blend with a coin flip to further damp confidence
    p = ALPHA_ELO * p_elo + (1.0 - ALPHA_ELO) * 0.5
    return float(p)

bt_soft = bt.copy()
bt_soft["win_prob"] = bt_soft.apply(elo_prob_row, axis=1)
bt_soft["prediction"] = (bt_soft["win_prob"] >= 0.5).astype(int)

brier = float(((bt_soft["win_prob"] - bt_soft["win"])**2).mean())
true_w = int(bt_soft["win"].sum()); true_l = int(len(bt_soft) - true_w)
pred_w = int(bt_soft["prediction"].sum()); pred_l = int(len(bt_soft) - pred_w)

print({
    "n_games": len(bt_soft),
    "brier": round(brier, 4),
    "true_record": f"{true_w}-{true_l}",
    "predicted_record": f"{pred_w}-{pred_l}",
})



{'n_games': 82, 'brier': 0.2511, 'true_record': '52-30', 'predicted_record': '46-36'}


In [71]:
def predict_schedule(sch_df, backtest_csv="../data/raw/backtest.csv"):
    """
    sch_df must have: ['date','home_team','away_team','opponent','home','back_to_back','rest_diff','elo_for','elo_against']
    backtest_csv is used only to learn the OTL share among losses for allocating predicted OTLs.
    """
    sch = sch_df.copy()

    # Probabilities & hard predictions
    sch["win_prob"] = sch.apply(elo_prob_row, axis=1)
    sch["prediction"] = (sch["win_prob"] >= 0.5).astype(int)

    # OTL share from backtest (treat SO as OTL)
    bt = pd.read_csv(backtest_csv)
    extra_time_norm = bt["extra_time"].fillna("no").astype(str).str.strip().str.lower()
    if "so" in bt.columns:
        try:
            so_flag = bt["so"].fillna(0).astype(int).clip(0, 1)
        except Exception:
            so_flag = (extra_time_norm == "so").astype(int)
    else:
        so_flag = (extra_time_norm == "so").astype(int)
    went_beyond_reg = (extra_time_norm != "no") | (so_flag == 1)

    win = (bt["result"] == "W").astype(int)
    otl = ((bt["result"] == "L") & went_beyond_reg).astype(int)
    losses_true = int((win == 0).sum())
    p_otl = (int(otl.sum()) / losses_true) if losses_true > 0 else 0.0

    # Allocate OTL among predicted losses: pick losses closest to 0.5
    pred_losses_idx = np.where(sch["prediction"].values == 0)[0]
    k_otl = int(round(p_otl * len(pred_losses_idx)))

    predicted_result = np.array(["W"] * len(sch), dtype=object)
    predicted_result[pred_losses_idx] = "L"
    if len(pred_losses_idx) > 0 and k_otl > 0:
        closeness = np.abs(sch.loc[pred_losses_idx, "win_prob"].values - 0.5)
        otl_loss_positions = pred_losses_idx[np.argsort(closeness)[:k_otl]]
        predicted_result[otl_loss_positions] = "OTL"

    sch["predicted_result"] = predicted_result

    # Output dataframe
    out_cols = [
        "date","home_team","away_team","opponent","home","back_to_back","rest_diff",
        "elo_for","elo_against","win_prob","predicted_result"
    ]
    preds = sch[out_cols].copy()
    preds["win_prob"] = preds["win_prob"].round(3)

    # Summary (optional)
    pred_w = int((preds["predicted_result"] == "W").sum())
    pred_otl = int((preds["predicted_result"] == "OTL").sum())
    pred_reg_l = int((preds["predicted_result"] == "L").sum())
    summary = {
        "n_games": len(preds),
        "pred_record_W-L-OTL": f"{pred_w}-{pred_reg_l}-{pred_otl}",
        "pred_points": int(2 * pred_w + pred_otl),
        "avg_win_prob": round(float(preds["win_prob"].mean()), 3),
        "assumed_otl_share_from_backtest": round(p_otl, 3),
    }

    return preds, summary

# ---- Example usage in your notebook ----
sch = pd.read_csv("../data/clean/schedule_with_elos_bumped.csv")
preds_df, summary = predict_schedule(sch)
display(preds_df)   # Jupyter display
print(summary)

Unnamed: 0,date,home_team,away_team,opponent,home,back_to_back,rest_diff,elo_for,elo_against,win_prob,predicted_result
0,2020-03-10,MTL,TOR,MTL,0,0,0,1498.9,1408.218896,0.608,W
1,2025-10-08,TOR,MTL,MTL,1,0,0,1498.9,1408.218896,0.543,W
2,2025-10-11,DET,TOR,DET,0,0,0,1498.9,1442.899590,0.524,W
3,2025-10-13,TOR,DET,DET,1,0,0,1498.9,1442.899590,0.531,W
4,2025-10-14,TOR,NSH,NSH,1,1,0,1498.9,1487.773147,0.493,OTL
...,...,...,...,...,...,...,...,...,...,...,...
77,2026-04-08,TOR,WSH,WSH,1,0,0,1498.9,1549.680981,0.506,W
78,2026-04-09,NYI,TOR,NYI,0,1,0,1498.9,1452.205223,0.515,W
79,2026-04-11,TOR,FLA,FLA,1,0,0,1498.9,1632.535450,0.374,L
80,2026-04-13,TOR,DAL,DAL,1,0,0,1498.9,1532.127545,0.493,OTL


{'n_games': 82, 'pred_record_W-L-OTL': '50-28-4', 'pred_points': 104, 'avg_win_prob': 0.504, 'assumed_otl_share_from_backtest': 0.133}
