setup

In [72]:
# Setup & config

import numpy as np
import pandas as pd
from sklearn.metrics import brier_score_loss, roc_auc_score

# Team context: abbreviation of the club whose schedule is being predicted.
TEAM = "TOR"

# Full franchise name -> abbreviation. Used to normalize schedule and
# save-percentage data onto the abbreviations that key the Elo map.
# Relocated/renamed franchises keep every name they have used so that data
# from different seasons still resolves.
NAME_TO_ABBR = {
    "Anaheim Ducks": "ANA","Arizona Coyotes": "ARI","Boston Bruins": "BOS","Buffalo Sabres": "BUF",
    "Calgary Flames": "CGY","Carolina Hurricanes": "CAR","Chicago Blackhawks": "CHI","Colorado Avalanche": "COL",
    "Columbus Blue Jackets": "CBJ","Dallas Stars": "DAL","Detroit Red Wings": "DET","Edmonton Oilers": "EDM",
    "Florida Panthers": "FLA","Los Angeles Kings": "LAK","Minnesota Wild": "MIN","Montreal Canadiens": "MTL",
    "Nashville Predators": "NSH","New Jersey Devils": "NJD","New York Islanders": "NYI","New York Rangers": "NYR",
    "Ottawa Senators": "OTT","Philadelphia Flyers": "PHI","Pittsburgh Penguins": "PIT","San Jose Sharks": "SJS",
    "Seattle Kraken": "SEA","St. Louis Blues": "STL","Tampa Bay Lightning": "TBL","Toronto Maple Leafs": "TOR",
    "Utah Hockey Club": "UTA","Vancouver Canucks": "VAN","Vegas Golden Knights": "VGK","Washington Capitals": "WSH",
    "Winnipeg Jets": "WPG",
    # 2025-26 name of the Utah franchise; without it those games get no Elo.
    "Utah Mammoth": "UTA",
}

# Input files (paths are relative to the notebook's working directory).
SCHEDULE_CSV = "../data/raw/schedule.csv"
TEAMS_CSV = "../data/raw/teams.csv"
SAVE_PCT_CSV = "../data/clean/team_save_percentages.csv"
BACKTEST_CSV = "../data/raw/backtest.csv"

# Elo knobs (deterministic)
HOME_EDGE = 5.0           # Elo pts for home ice
B2B_PENALTY = 10.0        # Elo pts penalty for back-to-back
REST_PTS_PER_DAY = 3.0    # Elo pts per rest_diff day
ELO_SCALE_DEN = 600.0     # larger => flatter probabilities
ALPHA_ELO = 0.65          # 1.0 => pure Elo, 0.0 => coin flip

def elo_prob_row(row: pd.Series) -> float:
    """
    Return the win probability for a single schedule row.

    The Elo gap (elo_for - elo_against) is adjusted by the home-ice bonus,
    the back-to-back penalty and the rest-day differential, pushed through the
    Elo logistic curve, then blended with a coin flip according to ALPHA_ELO.
    A missing "rest_diff" entry is treated as 0.
    """
    rating_gap = float(row["elo_for"]) - float(row["elo_against"])
    home_bonus = HOME_EDGE if int(row["home"]) == 1 else 0.0
    fatigue = B2B_PENALTY if int(row["back_to_back"]) == 1 else 0.0
    rest_bonus = REST_PTS_PER_DAY * float(row.get("rest_diff", 0))

    adjusted_gap = rating_gap + home_bonus - fatigue + rest_bonus
    elo_win_prob = 1.0 / (1.0 + 10.0 ** (-(adjusted_gap) / ELO_SCALE_DEN))

    # Shrink toward 0.5 so the model never gets more confident than ALPHA_ELO allows.
    blended = ALPHA_ELO * elo_win_prob + (1.0 - ALPHA_ELO) * 0.5
    return float(blended)

def allocate_otl(loss_idx: np.ndarray, probs: np.ndarray, expected_share: float) -> set:
    """
    Choose which predicted losses should be relabelled as overtime losses.

    ``expected_share`` is clamped to [0, 1] and multiplied by the number of
    losses to get the OTL count (rounded). The games picked are the losses
    whose win probability sits nearest to 0.5. Returns the chosen game
    indices as a set; an empty set when there is nothing to allocate.
    """
    share = max(0.0, min(1.0, expected_share))
    n_losses = len(loss_idx)
    n_otl = int(round(share * n_losses))
    if not n_losses or not n_otl:
        return set()

    distance_from_even = np.abs(probs[loss_idx] - 0.5)
    nearest_first = np.argsort(distance_from_even)
    chosen = loss_idx[nearest_first[:n_otl]]
    return set(chosen)


load schedule

In [73]:
def load_leafs_schedule_from_csv(
    csv_path: str,
    team_abbr: str = TEAM,
    date_col: str = "Date",
    home_col: str = "Home Team",
    away_col: str = "Away Team",
) -> pd.DataFrame:
    """
    Load a FixtureDownload-style schedule CSV and derive per-game context.

    Parameters
    ----------
    csv_path : path of the schedule CSV.
    team_abbr : team whose point of view is used for ``home``/``opponent``.
        Matched against both the raw team names and their NAME_TO_ABBR
        abbreviations, so the CSV may hold either form.
    date_col, home_col, away_col : names of the source columns.

    Returns
    -------
    DataFrame sorted by date with columns:
      date (datetime64[ns]), home_team, away_team, opponent, home,
      back_to_back, rest_days, rest_diff (placeholder=0),
      plus passthrough columns if present: Location, Result, Match Number, Round Number.
    ``home_team``/``away_team``/``opponent`` keep the spelling found in the CSV.

    Raises
    ------
    KeyError if any of the three required columns is missing.

    Notes
    -----
    Rest features are computed between consecutive rows, so the CSV is
    assumed to contain only this team's games (not checked here).
    """
    optional_cols = ("Location", "Result", "Match Number", "Round Number")

    df = pd.read_csv(csv_path)

    # Validate, then normalize the column names we need
    for col in (date_col, home_col, away_col):
        if col not in df.columns:
            raise KeyError(f"Expected column '{col}' not found. Got: {list(df.columns)}")
    df = df.rename(columns={date_col: "date", home_col: "home_team", away_col: "away_team"})

    # Keep a tidy subset (preserve useful references if present)
    passthrough = [c for c in optional_cols if c in df.columns]
    df = df[["date", "home_team", "away_team"] + passthrough].copy()

    # Date parsing: the CSV is expected to be DD/MM/YYYY; anything that does
    # not match (e.g. ISO dates) gets a second, generic parse. Values that
    # fail both end up as NaT.
    date_str = df["date"].astype(str).str.strip()
    parsed = pd.to_datetime(date_str, format="%d/%m/%Y", errors="coerce")
    unparsed = parsed.isna()
    if unparsed.any():
        parsed.loc[unparsed] = pd.to_datetime(date_str[unparsed], errors="coerce")
    df["date"] = parsed

    # Team POV. Compare on normalized names: a CSV with full team names would
    # otherwise never equal the abbreviation and every game would look "away".
    home_norm = df["home_team"].map(NAME_TO_ABBR).fillna(df["home_team"])
    is_home = (home_norm == team_abbr) | (df["home_team"] == team_abbr)
    df["home"] = is_home.astype(int)
    df["opponent"] = np.where(df["home"] == 1, df["away_team"], df["home_team"])

    # Vectorized rest features. The first game has no predecessor and is
    # given 2 days; negative gaps are clipped to 0.
    df = df.sort_values("date").reset_index(drop=True)
    d_days = df["date"].diff().dt.days.fillna(2).clip(lower=0).astype(int)
    df["rest_days"] = d_days
    df["back_to_back"] = (d_days == 1).astype(int)

    # Placeholder until opponent rest is computed
    df["rest_diff"] = 0

    order = ["date", "home_team", "away_team", "opponent", "home", "back_to_back", "rest_days", "rest_diff"]
    return df[order + passthrough]


elo mapping

In [74]:
def build_elo_map(
    teams_csv: str = "../data/raw/teams.csv",
    save_pct_csv: str = "../data/clean/team_save_percentages.csv",
) -> tuple[dict, pd.DataFrame]:
    """
    Build a composite strength metric (EV/PP/PK + Save%) and map it to Elo.

    Parameters
    ----------
    teams_csv : CSV with columns season, team, situation, iceTime,
        xGoalsFor, xGoalsAgainst. Only rows of the latest season are used.
    save_pct_csv : CSV with columns team (full name or abbreviation) and
        savePct.

    Returns
    -------
    (elo_map, comp_df) where ``elo_map`` maps team abbreviation -> Elo and
    ``comp_df`` holds one row per team with the four z-scores, the weighted
    composite and the resulting Elo (1500 + 100 * composite).

    Raises
    ------
    ValueError if ``teams_csv`` has no numeric season value.
    """
    teams_df = pd.read_csv(teams_csv).copy()
    teams_df["season"] = pd.to_numeric(teams_df["season"], errors="coerce")
    latest = teams_df["season"].max()
    if pd.isna(latest):
        raise ValueError(f"No numeric 'season' values found in {teams_csv!r}")
    latest_season = int(latest)

    t = teams_df.loc[
        teams_df["season"] == latest_season,
        ["team", "situation", "iceTime", "xGoalsFor", "xGoalsAgainst"],
    ].copy()
    SIT_MAP = {"5on5": "EV", "5on4": "PP", "4on5": "PK"}
    t["SIT"] = t["situation"].map(SIT_MAP)

    # Rates per 60 (zero ice time becomes NaN so it cannot divide by zero)
    t["iceTime"] = pd.to_numeric(t["iceTime"], errors="coerce").replace(0, np.nan)
    t["xGoalsFor"] = pd.to_numeric(t["xGoalsFor"], errors="coerce")
    t["xGoalsAgainst"] = pd.to_numeric(t["xGoalsAgainst"], errors="coerce")
    t["xGF60"] = (t["xGoalsFor"] / t["iceTime"]) * 60.0
    t["xGA60"] = (t["xGoalsAgainst"] / t["iceTime"]) * 60.0
    t["net_xG60"] = t["xGF60"] - t["xGA60"]

    # Aggregate by situation (one row per team in each frame)
    pp = t.loc[t["SIT"] == "PP"].groupby("team", as_index=False)[["xGF60"]].mean()
    pk = t.loc[t["SIT"] == "PK"].groupby("team", as_index=False)[["xGA60"]].mean()
    ev = t.loc[t["SIT"] == "EV"].groupby("team", as_index=False)[["net_xG60"]].mean()

    # Left-merge onto the full team list so every frame has the same row
    # order, and fill gaps with the league mean.
    all_teams = pd.DataFrame({"team": t["team"].dropna().unique()})
    pp = all_teams.merge(pp, on="team", how="left")
    pp["xGF60"] = pp["xGF60"].fillna(pp["xGF60"].mean())
    pk = all_teams.merge(pk, on="team", how="left")
    pk["xGA60"] = pk["xGA60"].fillna(pk["xGA60"].mean())
    ev = all_teams.merge(ev, on="team", how="left")
    ev["net_xG60"] = ev["net_xG60"].fillna(ev["net_xG60"].mean())

    # z-scores (with safe denominator)
    def z(s: pd.Series) -> pd.Series:
        s = s.astype(float)
        mu = s.mean()
        sd = s.std(ddof=0)
        return (s - mu) / (sd if sd != 0 else 1.0)

    ev_z = z(ev["net_xG60"])            # higher better
    pp_z = z(pp["xGF60"])               # higher better
    # PK: lower xGA60 is better -> invert before z
    pk_z = z(-pk["xGA60"])

    # Goalie save %: map full names -> abbr, keeping values that are already
    # abbreviations, then collapse to one row per team. Without the collapse
    # a duplicated team would add rows to the merge and break the positional
    # assignment into ``comp`` below.
    sv_df = pd.read_csv(save_pct_csv).rename(columns={"team": "team_name"})
    sv_df["team"] = sv_df["team_name"].map(NAME_TO_ABBR).fillna(sv_df["team_name"])
    sv_df["savePct"] = pd.to_numeric(sv_df["savePct"], errors="coerce")
    sv_by_team = sv_df.groupby("team", as_index=False)[["savePct"]].mean()
    sv = all_teams.merge(sv_by_team, on="team", how="left")
    sv["savePct"] = sv["savePct"].fillna(sv["savePct"].mean())
    sv_z = z(sv["savePct"])

    # Composite -> Elo
    W_EV, W_PP, W_PK, W_SV = 0.60, 0.20, 0.15, 0.05
    comp = all_teams.copy()
    comp["ev_z"] = ev_z.values
    comp["pp_z"] = pp_z.values
    comp["pk_z"] = pk_z.values
    comp["sv_z"] = sv_z.values
    comp["z_composite"] = W_EV*comp["ev_z"] + W_PP*comp["pp_z"] + W_PK*comp["pk_z"] + W_SV*comp["sv_z"]
    comp["elo"] = 1500.0 + 100.0 * comp["z_composite"]

    elo_map = dict(zip(comp["team"], comp["elo"]))
    return elo_map, comp


attach to sched

In [75]:
def attach_elo_to_schedule(schedule_df: pd.DataFrame, elo_map: dict, team_abbr: str = TEAM) -> pd.DataFrame:
    """
    Return a copy of the schedule with team abbreviations and Elo ratings.

    Full team names in home_team/away_team are converted through NAME_TO_ABBR
    (values without a mapping are left as they are), ``home`` and ``opponent``
    are rebuilt from the normalized names, and ``elo_for``/``elo_against`` are
    looked up in ``elo_map``. Teams missing from ``elo_map`` get NaN.
    Expects schedule_df to carry: date, home_team, away_team, back_to_back,
    rest_days, rest_diff.
    """
    out = schedule_df.copy()

    # Normalize both sides to abbreviations where a mapping exists
    for side in ("home_team", "away_team"):
        raw_names = out[side]
        out[side] = raw_names.map(NAME_TO_ABBR).fillna(raw_names)

    # Rebuild the team-POV columns from the normalized names
    out["home"] = (out["home_team"] == team_abbr).astype(int)
    at_home = out["home"] == 1
    out["opponent"] = np.where(at_home, out["away_team"], out["home_team"])

    # Look up each side once, then orient by home/away
    home_elo = out["home_team"].map(elo_map)
    away_elo = out["away_team"].map(elo_map)
    out["elo_for"] = np.where(at_home, home_elo, away_elo)
    out["elo_against"] = np.where(at_home, away_elo, home_elo)

    required = [
        "date", "home_team", "away_team", "opponent", "home",
        "back_to_back", "rest_days", "rest_diff", "elo_for", "elo_against",
    ]
    optional = [c for c in ("Location", "Result", "Match Number", "Round Number") if c in out.columns]
    return out[required + optional]


predict

In [76]:
import numpy as np
import pandas as pd

def _blend_elo_prob(diff: np.ndarray) -> np.ndarray:
    """Turn Elo differences into win probabilities, shrunk toward 0.5 by ALPHA_ELO and clipped off 0/1."""
    p_elo = 1.0 / (1.0 + 10.0 ** (-(diff) / ELO_SCALE_DEN))
    p = ALPHA_ELO * p_elo + (1.0 - ALPHA_ELO) * 0.5
    return np.clip(p, 1e-6, 1 - 1e-6)


def _backtest_otl_share(backtest_csv: str) -> float:
    """Return the share of non-wins in the backtest that went beyond regulation (OT or SO), in [0, 1]."""
    bt = pd.read_csv(backtest_csv)
    res = bt["result"].astype(str).str.strip().str.upper()

    # Build the extra-time flags on the backtest's own index. When the column
    # is absent every game counts as regulation; a bare scalar default would
    # give a length-1 Series that only lines up through index alignment.
    if "extra_time" in bt.columns:
        ext = bt["extra_time"]
    else:
        ext = pd.Series("no", index=bt.index)
    ext = ext.fillna("no").astype(str).str.strip().str.lower()
    ext = ext.replace({
        "overtime": "ot", "otl": "ot", "ot/so": "ot",
        "shootout": "so", "shoot-out": "so",
        "": "no"  # blank -> no
    })
    # Anything other than "no" is treated as beyond regulation
    beyond = ext != "no"
    if "so" in bt.columns:
        so_col = pd.to_numeric(bt["so"], errors="coerce").fillna(0).astype(int)
        beyond = beyond | (so_col == 1)

    wins_true = (res == "W").astype(int).values
    otl_true = ((res == "L") & beyond).astype(int).values
    losses_true = int((wins_true == 0).sum())
    p_otl = (int(otl_true.sum()) / losses_true) if losses_true > 0 else 0.0
    return float(np.clip(p_otl, 0.0, 1.0))


def _closest_to_even(candidates: np.ndarray, probs: np.ndarray, k: int) -> np.ndarray:
    """Return the k entries of ``candidates`` whose probability is nearest to 0.5."""
    closeness = np.abs(probs[candidates] - 0.5)
    return candidates[np.argsort(closeness)[:k]]


def predict_schedule(
    sch_df: pd.DataFrame,
    backtest_csv: str = "../data/raw/backtest.csv",
    n_sims: int = 0,
    rng_seed: int = 42,
    use_elo_noise: bool = False,
    elo_noise_sd: float = 35.0,
) -> tuple:
    """
    Predict W / L / OTL for every game of a schedule that already carries Elo.

    Parameters
    ----------
    sch_df : schedule with elo_for, elo_against, home, back_to_back, rest_diff.
    backtest_csv : historical results (column ``result``, optionally
        ``extra_time`` and ``so``) used to estimate the share of losses that
        go beyond regulation. A shootout counts as OTL.
    n_sims : 0 for deterministic predictions, > 0 for that many simulations.
    rng_seed : seed of the simulation random generator.
    use_elo_noise : if True, add N(0, elo_noise_sd) Elo noise per game per sim.
    elo_noise_sd : standard deviation of that noise, in Elo points.

    n_sims == 0 -> deterministic predictions with OTL allocation.
    n_sims  > 0 -> simulations; display-case per game via MEAN wins + OTL allocated
                   among display losses to match backtest share.

    Returns
    -------
      if n_sims == 0: (preds_df, None, summary)
      else:           (display_df, sims_totals_df, summary)

    Raises
    ------
    ValueError if n_sims is negative.
    """
    if n_sims < 0:
        raise ValueError("n_sims must be >= 0")

    sch = sch_df.copy().reset_index(drop=True)

    # ---- Base Elo probability (deterministic) ----
    diff_base = (
        (sch["elo_for"].astype(float) - sch["elo_against"].astype(float))
        + HOME_EDGE * sch["home"].astype(int)
        - B2B_PENALTY * sch["back_to_back"].astype(int)
        + REST_PTS_PER_DAY * sch["rest_diff"].astype(float)
    ).values
    p_base = _blend_elo_prob(diff_base)

    # ---- OTL share from backtest (SO counts as OTL) ----
    p_otl = _backtest_otl_share(backtest_csv)

    base_cols = [c for c in ["date","home_team","away_team","opponent","home","back_to_back","rest_diff","elo_for","elo_against"] if c in sch.columns]

    # ======================
    # Deterministic path
    # ======================
    if n_sims == 0:
        preds = sch[base_cols].copy()
        preds["win_prob"] = np.round(p_base, 3)

        pred_win = (p_base >= 0.5).astype(int)
        loss_idx = np.where(pred_win == 0)[0]

        # Allocate OTLs among predicted losses to match backtest share
        k_otl = int(round(p_otl * len(loss_idx)))
        res_cat = np.array(["W"] * len(preds), dtype=object)
        if len(loss_idx) > 0:
            res_cat[loss_idx] = "L"
            if k_otl > 0:
                res_cat[_closest_to_even(loss_idx, p_base, k_otl)] = "OTL"

        preds["predicted_result"] = res_cat

        pred_w  = int((preds["predicted_result"] == "W").sum())
        pred_ol = int((preds["predicted_result"] == "OTL").sum())
        pred_rl = int((preds["predicted_result"] == "L").sum())
        summary = {
            "n_games": len(preds),
            "pred_record_W-L-OTL": f"{pred_w}-{pred_rl}-{pred_ol}",
            "pred_points": int(2 * pred_w + pred_ol),
            "avg_win_prob": float(preds["win_prob"].mean()),
            "assumed_otl_share_from_backtest": round(p_otl, 3),
            "mode": "deterministic",
        }
        return preds, None, summary

    # ======================
    # Simulation path
    # ======================
    rng = np.random.default_rng(rng_seed)
    n_games = len(sch)
    wins_sims = np.zeros((n_sims, n_games), dtype=int)
    otl_sims  = np.zeros((n_sims, n_games), dtype=int)

    # The draws stay in a per-sim loop on purpose: the order of calls on
    # ``rng`` determines the random stream, so reordering them would change
    # the results obtained for a given seed.
    for s in range(n_sims):
        if use_elo_noise:
            p = _blend_elo_prob(diff_base + rng.normal(0.0, elo_noise_sd, size=n_games))
        else:
            p = p_base
        w = rng.binomial(1, p, size=n_games)
        wins_sims[s] = w
        loss_positions = np.where(w == 0)[0]
        if len(loss_positions) > 0 and p_otl > 0.0:
            otl_sims[s, loss_positions] = rng.binomial(1, p_otl, size=len(loss_positions))

    # Per-game win frequency across sims
    win_rate = wins_sims.mean(axis=0)

    # Display-case: W by mean-rule; OTL allocated among remaining losses by backtest share
    display_res = np.full(n_games, "L", dtype=object)
    display_res[win_rate >= 0.5] = "W"
    remaining_losses = np.where(win_rate < 0.5)[0]
    k_display_otl = int(round(p_otl * len(remaining_losses)))
    if k_display_otl > 0 and len(remaining_losses) > 0:
        display_res[_closest_to_even(remaining_losses, p_base, k_display_otl)] = "OTL"

    display_df = sch[base_cols].copy()
    display_df["win_prob"] = np.round(p_base, 3)
    display_df["win_rate"] = np.round(win_rate, 3)
    display_df["display_result"] = display_res

    # Per-sim totals. OTL flags are only ever set on lost games, so the row
    # sum of ``otl_sims`` is the OTL count of that sim.
    wins_per_sim = wins_sims.sum(axis=1).tolist()
    otl_per_sim = otl_sims.sum(axis=1).tolist()
    sim_rows = []
    for s, (w, ol) in enumerate(zip(wins_per_sim, otl_per_sim)):
        rl = n_games - w - ol
        sim_rows.append({"sim": s, "wins": w, "reg_losses": rl, "otl": ol, "points": 2 * w + ol,
                         "record_W-L-OTL": f"{w}-{rl}-{ol}"})
    sims_totals = pd.DataFrame(sim_rows)

    # Summary
    disp_w  = int((display_df["display_result"] == "W").sum())
    disp_ol = int((display_df["display_result"] == "OTL").sum())
    disp_rl = int((display_df["display_result"] == "L").sum())
    summary = {
        "n_games": n_games,
        "display_record_W-L-OTL": f"{disp_w}-{disp_rl}-{disp_ol}",
        "display_points": int(2 * disp_w + disp_ol),
        "mean_points": float(sims_totals["points"].mean()),
        "median_points": float(sims_totals["points"].median()),
        "assumed_otl_share_from_backtest": round(p_otl, 3),
        "n_sims": n_sims,
        "mode": "simulation_mean_rule",
    }
    return display_df, sims_totals, summary


In [77]:

# 1) Load schedule (parses DD/MM/YYYY and adds rest features)
schedule_df = load_leafs_schedule_from_csv(SCHEDULE_CSV)

# 2) Build team Elo map from latest season composites
elo_map, _ = build_elo_map(TEAMS_CSV, SAVE_PCT_CSV)

# 3) Attach Elo to schedule, from the point of view of the configured TEAM
#    (kept in one place so the loader and this step cannot disagree)
sch_with_elos = attach_elo_to_schedule(schedule_df, elo_map, team_abbr=TEAM)

# 4) Predict with simulations (set n_sims=0 for deterministic)
display_df, sims_totals, summary = predict_schedule(
    sch_df=sch_with_elos,
    backtest_csv=BACKTEST_CSV,
    n_sims=100,
    rng_seed=7,
    use_elo_noise=False
)

# The summary keys differ by mode ("display_*" for simulations, "pred_*" for
# the deterministic path), so read whichever one is present.
rec = summary.get("display_record_W-L-OTL", summary.get("pred_record_W-L-OTL"))
pts = summary.get("display_points", summary.get("pred_points"))
print(f"Predicted record (W-L-OTL): {rec} | Points: {pts}")


# Last expression of the cell: shown by the notebook as the cell output
display_df, sims_totals, summary

Predicted record (W-L-OTL): 41-36-5 | Points: 87


(         date home_team away_team opponent  home  back_to_back  rest_diff  \
 0  2025-10-08       TOR       MTL      MTL     1             0          0   
 1  2025-10-11       DET       TOR      DET     0             0          0   
 2  2025-10-13       TOR       DET      DET     1             0          0   
 3  2025-10-14       TOR       NSH      NSH     1             1          0   
 4  2025-10-16       TOR       NYR      NYR     1             0          0   
 ..        ...       ...       ...      ...   ...           ...        ...   
 77 2026-04-08       TOR       WSH      WSH     1             0          0   
 78 2026-04-09       NYI       TOR      NYI     0             1          0   
 79 2026-04-11       TOR       FLA      FLA     1             0          0   
 80 2026-04-13       TOR       DAL      DAL     1             0          0   
 81 2026-04-15       OTT       TOR      OTT     0             0          0   
 
         elo_for  elo_against  win_prob  win_rate display_resu