setup

In [3]:
# --- Setup & configuration ---
import os, sys, math, json, time, zipfile
from pathlib import Path

import numpy as np
import pandas as pd
import requests

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import brier_score_loss, roc_auc_score

SEASON_START_YEAR = 2025      # 2025-26 season
TEAM = "TOR"                  # Leafs
DATA_DIR = Path("../data")
DATA_DIR.mkdir(exist_ok=True)

# If you already have local copies of your uploaded files, set these paths:
SKATERS_CSV = DATA_DIR / "raw/skaters.csv"          # put your uploaded skaters.csv here
TEAMS_CSV   = DATA_DIR / "raw/teams.csv"            # optional
TXN_CSV     = DATA_DIR / "clean/transactions.csv"  # will create a template if missing



2) Helper: schedule fetch + simple rest features

What this does: pulls the Leafs’ regular-season schedule from the NHL API and computes simple back-to-back and rest features. Writes data/schedule.csv.

In [4]:
def load_leafs_schedule_from_csv(
    csv_path: str,
    team_abbr: str = TEAM,
    date_col: str = "Date",
    home_col: str = "Home Team",
    away_col: str = "Away Team",
) -> pd.DataFrame:
    """
    Read a FixtureDownload-style schedule CSV for the Leafs and return a clean
    DataFrame with modeling features:
      - date (datetime64[ns])
      - home_team, away_team, opponent
      - home (1 if Leafs are home, else 0)
      - back_to_back (1 if game is the day after previous one)
      - rest_days (days since last Leafs game)
      - rest_diff (placeholder=0; compute once you also load opponent schedules)
    """
    df = pd.read_csv(csv_path)

    # Basic column normalization
    rename_map = {
        date_col: "date",
        home_col: "home_team",
        away_col: "away_team",
    }
    for k, v in list(rename_map.items()):
        if k not in df.columns:
            raise KeyError(f"Expected column '{k}' not found in CSV. Found: {list(df.columns)}")
    df = df.rename(columns=rename_map)

    # Keep only columns we need; keep 'Location' and 'Result' if present for reference
    keep = ["date", "home_team", "away_team"]
    if "Location" in df.columns: keep.append("Location")
    if "Result" in df.columns: keep.append("Result")
    if "Match Number" in df.columns: keep.append("Match Number")
    if "Round Number" in df.columns: keep.append("Round Number")
    df = df[keep].copy()

    # Parse date and sort
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df = df.sort_values("date").reset_index(drop=True)

    # Derive home flag and opponent from Leafs' perspective
    df["home"] = (df["home_team"] == team_abbr).astype(int)
    df["opponent"] = np.where(df["home"] == 1, df["away_team"], df["home_team"])

    # Compute rest features for Leafs only (sequence through Leafs' games)
    # We assume the CSV is only Leafs' games; if it contained all NHL games,
    # we would first filter to rows where TOR is either home or away.
    df["back_to_back"] = 0
    df["rest_days"] = 2

    last_date = None
    for i, r in df.iterrows():
        cur_date = r["date"]
        if pd.notna(cur_date) and last_date is not None:
            delta = (cur_date - last_date).days
            df.at[i, "rest_days"] = max(delta, 0)
            if delta == 1:
                df.at[i, "back_to_back"] = 1
        if pd.notna(cur_date):
            last_date = cur_date

    # Placeholder until you compute opponent rest and subtract:
    df["rest_diff"] = 0

    # Reorder columns nicely
    order = ["date", "home_team", "away_team", "opponent", "home",
             "back_to_back", "rest_days", "rest_diff"]
    # include optional columns if present
    order += [c for c in ["Location", "Result", "Match Number", "Round Number"] if c in df.columns]
    df = df[order]

    return df

# Use it:
schedule_df = load_leafs_schedule_from_csv("../data/raw/schedule.csv")  # or your actual path
schedule_df.head(10)

Unnamed: 0,date,home_team,away_team,opponent,home,back_to_back,rest_days,rest_diff,Location,Result,Match Number,Round Number
0,2025-01-11 23:00:00,Philadelphia Flyers,Toronto Maple Leafs,Philadelphia Flyers,0,0,2,0,Xfinity Mobile Arena,,188,4
1,2025-03-12 00:00:00,Florida Panthers,Toronto Maple Leafs,Florida Panthers,0,0,59,0,Amerant Bank Arena,,413,8
2,2025-04-11 00:30:00,Toronto Maple Leafs,Pittsburgh Penguins,Toronto Maple Leafs,0,0,30,0,Scotiabank Arena,,199,4
3,2025-05-12 00:00:00,Carolina Hurricanes,Toronto Maple Leafs,Carolina Hurricanes,0,0,30,0,Lenovo Center,,431,9
4,2025-06-11 00:00:00,Toronto Maple Leafs,Utah Hockey Club,Toronto Maple Leafs,0,0,30,0,Scotiabank Arena,,213,5
5,2025-07-12 00:00:00,Toronto Maple Leafs,Montreal Canadiens,Toronto Maple Leafs,0,0,31,0,Scotiabank Arena,,444,9
6,2025-08-10 23:00:00,Toronto Maple Leafs,Montreal Canadiens,Toronto Maple Leafs,0,0,29,0,Scotiabank Arena,,4,1
7,2025-09-11 00:00:00,Toronto Maple Leafs,Boston Bruins,Toronto Maple Leafs,0,0,31,0,Scotiabank Arena,,234,5
8,2025-09-12 00:30:00,Toronto Maple Leafs,Tampa Bay Lightning,Toronto Maple Leafs,0,1,1,0,Scotiabank Arena,,461,9
9,2025-10-11 00:00:00,Toronto Maple Leafs,Carolina Hurricanes,Toronto Maple Leafs,0,0,28,0,Scotiabank Arena,,246,5


adding elo to each game

In [116]:
# --- Composite Elo (EV/PP/PK + Save%) and attach to schedule_df ---

# Keep your dictionary exactly
name_to_abbr = {
    "Anaheim Ducks": "ANA","Arizona Coyotes": "ARI","Boston Bruins": "BOS","Buffalo Sabres": "BUF",
    "Calgary Flames": "CGY","Carolina Hurricanes": "CAR","Chicago Blackhawks": "CHI","Colorado Avalanche": "COL",
    "Columbus Blue Jackets": "CBJ","Dallas Stars": "DAL","Detroit Red Wings": "DET","Edmonton Oilers": "EDM",
    "Florida Panthers": "FLA","Los Angeles Kings": "LAK","Minnesota Wild": "MIN","Montreal Canadiens": "MTL",
    "Nashville Predators": "NSH","New Jersey Devils": "NJD","New York Islanders": "NYI","New York Rangers": "NYR",
    "Ottawa Senators": "OTT","Philadelphia Flyers": "PHI","Pittsburgh Penguins": "PIT","San Jose Sharks": "SJS",
    "Seattle Kraken": "SEA","St. Louis Blues": "STL","Tampa Bay Lightning": "TBL","Toronto Maple Leafs": "TOR",
    "Utah Hockey Club": "UTA","Vancouver Canucks": "VAN","Vegas Golden Knights": "VGK","Washington Capitals": "WSH",
    "Winnipeg Jets": "WPG"
}

# ---------- Base team metrics from teams.csv (latest season) ----------
teams_df = pd.read_csv("../data/raw/teams.csv").copy()
teams_df["season"] = pd.to_numeric(teams_df["season"], errors="coerce")
latest_season = int(teams_df["season"].max())
t = teams_df.loc[teams_df["season"] == latest_season, ["team","situation","iceTime","xGoalsFor","xGoalsAgainst"]].copy()

# Map situations to EV/PP/PK
SIT_MAP = {"5on5":"EV", "5on4":"PP", "4on5":"PK"}
t["SIT"] = t["situation"].map(SIT_MAP)

# Rates per 60
t["iceTime"] = pd.to_numeric(t["iceTime"], errors="coerce").replace(0, pd.NA)
t["xGoalsFor"] = pd.to_numeric(t["xGoalsFor"], errors="coerce")
t["xGoalsAgainst"] = pd.to_numeric(t["xGoalsAgainst"], errors="coerce")
t["xGF60"] = (t["xGoalsFor"] / t["iceTime"]) * 60.0
t["xGA60"] = (t["xGoalsAgainst"] / t["iceTime"]) * 60.0
t["net_xG60"] = t["xGF60"] - t["xGA60"]

# Aggregate
pp = t.loc[t["SIT"]=="PP"].groupby("team", as_index=False)[["xGF60"]].mean()
pk = t.loc[t["SIT"]=="PK"].groupby("team", as_index=False)[["xGA60"]].mean()
ev = t.loc[t["SIT"]=="EV"].groupby("team", as_index=False)[["net_xG60"]].mean()

# League means for fill
pp_mean = float(pp["xGF60"].mean()) if len(pp) else 0.0
pk_mean = float(pk["xGA60"].mean()) if len(pk) else 0.0
ev_mean = float(ev["net_xG60"].mean()) if len(ev) else 0.0

all_teams = pd.DataFrame({"team": t["team"].dropna().unique()})
pp = all_teams.merge(pp, on="team", how="left"); pp["xGF60"] = pp["xGF60"].fillna(pp_mean)
pk = all_teams.merge(pk, on="team", how="left"); pk["xGA60"] = pk["xGA60"].fillna(pk_mean)
ev = all_teams.merge(ev, on="team", how="left"); ev["net_xG60"] = ev["net_xG60"].fillna(ev_mean)

# Standardize -> z-scores
def zscore(s):
    s = s.astype(float); mu = s.mean(); sd = s.std(ddof=0)
    return (s - mu) / (sd if sd != 0 else 1.0)

ev_z = zscore(ev["net_xG60"])                    # higher is better
pp_z = zscore(pp["xGF60"])                       # higher is better
pk_z = zscore(pk["xGA60"].mean() - pk["xGA60"])  # lower xGA60 => better PK



sv_df = pd.read_csv("../data/clean/team_save_percentages.csv") 

# Map full team names -> acronyms to align with 'team' codes in teams.csv
sv_df = sv_df.rename(columns={"team":"team_name","savePct":"savePct"})
sv_df["team"] = sv_df["team_name"].map(name_to_abbr)
sv = all_teams.merge(sv_df[["team","savePct"]], on="team", how="left")
# Fill missing with league mean (e.g., expansion/rebrand edge cases)
sv["savePct"] = pd.to_numeric(sv["savePct"], errors="coerce")
sv["savePct"] = sv["savePct"].fillna(sv["savePct"].mean())
sv_z = zscore(sv["savePct"])                      # higher is better

# ---------- Combine (favor PP a bit more, include SV%) ----------
# Weights sum to 1.0
W_EV, W_PP, W_PK, W_SV = 0.50, 0.30, 0.15, 0.05   # PP favored; modest SV influence

comp = all_teams.copy()
comp["ev_z"] = ev_z.values
comp["pp_z"] = pp_z.values
comp["pk_z"] = pk_z.values
comp["sv_z"] = sv_z.values

comp["z_composite"] = (
    W_EV*comp["ev_z"] +
    W_PP*comp["pp_z"] +
    W_PK*comp["pk_z"] +
    W_SV*comp["sv_z"]
)

# Elo mapping (same scale as before)
comp["elo"] = 1500.0 + 100.0 * comp["z_composite"]

# Build acronym → Elo dict
teams_elos = comp[["team","elo"]].copy()
elo_map = dict(zip(teams_elos["team"], teams_elos["elo"]))

# ---- Normalize schedule_df to acronyms and attach Elo (unchanged) ----
sch = schedule_df.copy()
sch["home_team"] = sch["home_team"].map(name_to_abbr)
sch["away_team"] = sch["away_team"].map(name_to_abbr)

TEAM = "TOR"
sch["home"] = (sch["home_team"] == TEAM).astype(int)
sch["opponent"] = np.where(sch["home"]==1, sch["away_team"], sch["home_team"])

sch["elo_for"] = np.where(sch["home"]==1, sch["home_team"].map(elo_map), sch["away_team"].map(elo_map))
sch["elo_against"] = np.where(sch["home"]==1, sch["away_team"].map(elo_map), sch["home_team"].map(elo_map))

# Save outputs
sch.to_csv("../data/clean/schedule_with_elos.csv", index=False)
teams_elos.to_csv("../data/clean/teams_elos.csv", index=False)

sch.head(10)


Unnamed: 0,date,home_team,away_team,opponent,home,back_to_back,rest_days,rest_diff,Location,Result,Match Number,Round Number,elo_for,elo_against
0,2025-01-11 23:00:00,PHI,TOR,PHI,0,0,2,0,Xfinity Mobile Arena,,188,4,1513.081515,1488.225653
1,2025-03-12 00:00:00,FLA,TOR,FLA,0,0,59,0,Amerant Bank Arena,,413,8,1513.081515,1635.15809
2,2025-04-11 00:30:00,TOR,PIT,PIT,1,0,30,0,Scotiabank Arena,,199,4,1513.081515,1504.218514
3,2025-05-12 00:00:00,CAR,TOR,CAR,0,0,30,0,Lenovo Center,,431,9,1513.081515,1618.234455
4,2025-06-11 00:00:00,TOR,UTA,UTA,1,0,30,0,Scotiabank Arena,,213,5,1513.081515,1559.680293
5,2025-07-12 00:00:00,TOR,MTL,MTL,1,0,31,0,Scotiabank Arena,,444,9,1513.081515,1406.68834
6,2025-08-10 23:00:00,TOR,MTL,MTL,1,0,29,0,Scotiabank Arena,,4,1,1513.081515,1406.68834
7,2025-09-11 00:00:00,TOR,BOS,BOS,1,0,31,0,Scotiabank Arena,,234,5,1513.081515,1459.125175
8,2025-09-12 00:30:00,TOR,TBL,TBL,1,1,1,0,Scotiabank Arena,,461,9,1513.081515,1547.066849
9,2025-10-11 00:00:00,TOR,CAR,CAR,1,0,28,0,Scotiabank Arena,,246,5,1513.081515,1618.234455


add in transactions from offseason, quantify player impact

In [117]:
# --- Compute Leafs offseason Elo bump and apply it to the schedule ---

import pandas as pd
import numpy as np

# 1) Compute Elo bump from transactions + skaters (GS/GP)
sk = pd.read_csv(SKATERS_CSV).rename(columns={"name": "player"})
tx = pd.read_csv(TXN_CSV)

# If skaters has multiple seasons, use the latest
if "season" in sk.columns:
    sk["season"] = pd.to_numeric(sk["season"], errors="coerce")
    latest_season = int(sk["season"].max())
    sk = sk[sk["season"] == latest_season].copy()

# Normalize names to avoid mismatches (e.g., "Mitch Marner" vs "Mitchell Marner")
def _norm_name(s):
    if not isinstance(s, str):
        return ""
    s = s.strip().lower()
    return "".join(ch for ch in s if ch.isalnum() or ch.isspace())

sk["player_norm"] = sk["player"].astype(str).map(_norm_name)
tx["player_norm"] = tx["player"].astype(str).map(_norm_name)

# Impact = gameScore per game (safe divide)
gp = pd.to_numeric(sk.get("games_played"), errors="coerce").replace(0, np.nan)
gs = pd.to_numeric(sk.get("gameScore"), errors="coerce")
sk["impact"] = (gs / gp).astype(float)

# Average per player (if multiple rows)
per_player = sk.groupby("player_norm", as_index=False)["impact"].mean()

# Merge onto transactions (IN = to TOR, OUT = from TOR)
tx_in  = tx.loc[tx["team_to"]=="TOR"].merge(per_player, on="player_norm", how="left")
tx_out = tx.loc[tx["team_from"]=="TOR"].merge(per_player, on="player_norm", how="left")

# Compute net impact and convert to Elo
in_sum  = tx_in["impact"].fillna(0).sum()
out_sum = tx_out["impact"].fillna(0).sum()
net_impact = float(in_sum - out_sum)
ELO_PER_UNIT = 10.0  # your chosen scale: 10 Elo per 1 GS/GP unit
elo_bump = net_impact * ELO_PER_UNIT

print("=== Offseason impact (GS/GP) ===")
print(f"IN  total impact: {in_sum:.3f}")
print(f"OUT total impact: {out_sum:.3f}")
print(f"NET impact (IN-OUT): {net_impact:.3f}")
print(f"Elo bump applied to TOR games: {elo_bump:+.1f}  (factor {ELO_PER_UNIT} per GS/GP)")

# 2) Apply the bump to Leafs' side of each game in the schedule
sch = pd.read_csv("../data/clean/schedule_with_elos.csv")

is_home = sch["home_team"] == "TOR"
is_away = sch["away_team"] == "TOR"

# Add the same bump to Toronto's pregame strength whether home or away
sch.loc[is_home, "elo_for"] += elo_bump
sch.loc[is_away, "elo_for"] += elo_bump

sch["elo_for"] = sch["elo_for"].round(1)

# Save new elo
TEAMS_ELOS_PATH = "../data/clean/teams_elos.csv"
teams_elos = pd.read_csv(TEAMS_ELOS_PATH)
curr_elo = teams_elos.loc[teams_elos["team"] == "TOR", "elo"]
new_elo  = int(curr_elo) + elo_bump
teams_elos.loc[teams_elos["team"] == "TOR", "elo"] = new_elo
teams_elos.to_csv(TEAMS_ELOS_PATH, index=False)


sch.to_csv("../data/clean/schedule_with_elos_bumped.csv", index=False)
sch.head(10)

teams_elos.head()


=== Offseason impact (GS/GP) ===
IN  total impact: 0.823
OUT total impact: 1.044
NET impact (IN-OUT): -0.221
Elo bump applied to TOR games: -2.2  (factor 10.0 per GS/GP)


  new_elo  = int(curr_elo) + elo_bump


Unnamed: 0,team,elo
0,VGK,1613.848639
1,TOR,1510.788076
2,MIN,1500.46928
3,PHI,1488.225653
4,BUF,1452.338744


backtest data for training

In [118]:


# Load raw Leafs backtest
bt = pd.read_csv("../data/raw/backtest.csv")

# Load precomputed team Elos
teams_elos = pd.read_csv("../data/clean/teams_elos.csv")
elo_map = dict(zip(teams_elos["team"], teams_elos["elo"]))

# Special case: map UTA to ARI if missing (carryover issue)
if "UTA" in elo_map and "ARI" not in elo_map:
    elo_map["ARI"] = elo_map["UTA"]

# Build modeling schema
df = pd.DataFrame()
df["date"] = pd.to_datetime(bt["Date"])
df["opponent"] = bt["Opponent"].map(name_to_abbr)

# Home flag (0=away,1=home)
df["home"] = (bt["is_away"] == 0).astype(int)

TEAM = "TOR"
df["home_team"] = np.where(df["home"] == 1, TEAM, df["opponent"])
df["away_team"] = np.where(df["home"] == 1, df["opponent"], TEAM)

# Win label
df["win"] = (bt["result"] == "W").astype(int)

# Rest & back-to-back
df = df.sort_values("date").reset_index(drop=True)
df["back_to_back"] = 0
df["rest_days"] = 2
last = None
for i, r in df.iterrows():
    if last is not None:
        d = (r["date"] - last).days
        df.at[i, "rest_days"] = max(d, 0)
        if d == 1:
            df.at[i, "back_to_back"] = 1
    last = r["date"]
df["rest_diff"] = 0  # placeholder

# Attach Elo from precomputed map
team_for = df["away_team"].where(df["home"] == 0, df["home_team"])
df["elo_for"] = team_for.map(elo_map)
df["elo_against"] = df["opponent"].map(elo_map)

# Save
df.to_csv("../data/clean/backtest_with_elos.csv", index=False)
bt = df  # so you can keep using bt downstream
df.head(15)


Unnamed: 0,date,opponent,home,home_team,away_team,win,back_to_back,rest_days,rest_diff,elo_for,elo_against
0,2023-10-11,MTL,1,TOR,MTL,1,0,2,0,1510.788076,1406.68834
1,2023-10-14,MIN,1,TOR,MIN,1,0,3,0,1510.788076,1500.46928
2,2023-10-16,CHI,1,TOR,CHI,0,0,2,0,1510.788076,1373.652382
3,2023-10-19,FLA,0,FLA,TOR,0,0,3,0,1510.788076,1635.15809
4,2023-10-21,TBL,0,TBL,TOR,1,0,2,0,1510.788076,1547.066849
5,2023-10-24,WSH,0,WSH,TOR,1,0,3,0,1510.788076,1543.60118
6,2023-10-26,DAL,0,DAL,TOR,1,0,2,0,1510.788076,1533.931059
7,2023-10-28,NSH,0,NSH,TOR,0,0,2,0,1510.788076,1485.566625
8,2023-10-31,LAK,1,TOR,LAK,0,0,3,0,1510.788076,1579.591924
9,2023-11-02,BOS,0,BOS,TOR,0,0,2,0,1510.788076,1459.125175


ML attempt

In [119]:
# --- Softer, more random Elo probabilities ---

# Existing knobs you already have (keep or tweak)
HOME_EDGE = 5.0            # very slight home advantage
B2B_PENALTY = 10.0         # smaller penalty than before
REST_PTS_PER_DAY = 3.0     # rest_diff in Elo points per day

# New "randomness / humility" dials
ELO_SCALE_DEN = 600.0      # was ~400; larger => flatter probs (less Elo confidence)
ALPHA_ELO = 0.65           # weight on Elo; (1-ALPHA_ELO) shrinks toward 0.5
NOISE_SD = 35.0            # Elo-points SD; set 0.0 for deterministic

# Optional, reproducible noise
_rng = np.random.default_rng(42)

def elo_prob_row(row):
    # Base Elo difference from your files
    diff = row["elo_for"] - row["elo_against"]

    # Context tweaks
    if row["home"] == 1:
        diff += HOME_EDGE
    if row["back_to_back"] == 1:
        diff -= B2B_PENALTY
    diff += REST_PTS_PER_DAY * row.get("rest_diff", 0)

    # Inject noise in Elo *before* probability (set NOISE_SD=0 for none)
    if NOISE_SD and NOISE_SD > 0:
        diff = diff + _rng.normal(0.0, NOISE_SD)

    # Elo logistic with a gentler denominator (flatter probs)
    p_elo = 1.0 / (1.0 + 10.0 ** (-(diff) / ELO_SCALE_DEN))

    # Blend with a coin flip to further damp confidence
    p = ALPHA_ELO * p_elo + (1.0 - ALPHA_ELO) * 0.5
    return float(p)

bt_soft = bt.copy()
bt_soft["win_prob"] = bt_soft.apply(elo_prob_row, axis=1)
bt_soft["prediction"] = (bt_soft["win_prob"] >= 0.5).astype(int)

brier = float(((bt_soft["win_prob"] - bt_soft["win"])**2).mean())
true_w = int(bt_soft["win"].sum()); true_l = int(len(bt_soft) - true_w)
pred_w = int(bt_soft["prediction"].sum()); pred_l = int(len(bt_soft) - pred_w)

print({
    "n_games": len(bt_soft),
    "brier": round(brier, 4),
    "true_record": f"{true_w}-{true_l}",
    "predicted_record": f"{pred_w}-{pred_l}",
})



{'n_games': 82, 'brier': 0.2495, 'true_record': '46-36', 'predicted_record': '48-34'}


In [120]:
sch = pd.read_csv("../data/clean/schedule_with_elos_bumped.csv").copy()

# Use your already-defined elo_prob_row (and knobs) as-is:
sch["win_prob"] = sch.apply(elo_prob_row, axis=1)
sch["prediction"] = (sch["win_prob"] >= 0.5).astype(int)

pred_w = int(sch["prediction"].sum())
pred_l = int(len(sch) - pred_w)
print({
    "n_games": len(sch),
    "predicted_record": f"{pred_w}-{pred_l}",
    "avg_win_prob": round(sch["win_prob"].mean(), 3),
})

{'n_games': 82, 'predicted_record': '47-35', 'avg_win_prob': np.float64(0.505)}
