setup

In [12]:
# --- Setup & configuration ---
import os, sys, math, json, time, zipfile
from pathlib import Path

import numpy as np
import pandas as pd
import requests

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import brier_score_loss, roc_auc_score

SEASON_START_YEAR = 2025      # 2025-26 season
TEAM = "TOR"                  # Leafs
DATA_DIR = Path("../data")
DATA_DIR.mkdir(exist_ok=True)

# If you already have local copies of your uploaded files, set these paths:
SKATERS_CSV = DATA_DIR / "raw/skaters.csv"          # put your uploaded skaters.csv here
TEAMS_CSV   = DATA_DIR / "raw/teams.csv"            # optional
TXN_CSV     = DATA_DIR / "transactions_2025_offseason.csv"  # will create a template if missing



2) Helper: schedule fetch + simple rest features

What this does: pulls the Leafs’ regular-season schedule from the NHL API and computes simple back-to-back and rest features. Writes data/schedule.csv.

In [17]:
def load_leafs_schedule_from_csv(
    csv_path: str,
    team_abbr: str = TEAM,
    date_col: str = "Date",
    home_col: str = "Home Team",
    away_col: str = "Away Team",
) -> pd.DataFrame:
    """
    Read a FixtureDownload-style schedule CSV for the Leafs and return a clean
    DataFrame with modeling features:
      - date (datetime64[ns])
      - home_team, away_team, opponent
      - home (1 if Leafs are home, else 0)
      - back_to_back (1 if game is the day after previous one)
      - rest_days (days since last Leafs game)
      - rest_diff (placeholder=0; compute once you also load opponent schedules)
    """
    df = pd.read_csv(csv_path)

    # Basic column normalization
    rename_map = {
        date_col: "date",
        home_col: "home_team",
        away_col: "away_team",
    }
    for k, v in list(rename_map.items()):
        if k not in df.columns:
            raise KeyError(f"Expected column '{k}' not found in CSV. Found: {list(df.columns)}")
    df = df.rename(columns=rename_map)

    # Keep only columns we need; keep 'Location' and 'Result' if present for reference
    keep = ["date", "home_team", "away_team"]
    if "Location" in df.columns: keep.append("Location")
    if "Result" in df.columns: keep.append("Result")
    if "Match Number" in df.columns: keep.append("Match Number")
    if "Round Number" in df.columns: keep.append("Round Number")
    df = df[keep].copy()

    # Parse date and sort
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df = df.sort_values("date").reset_index(drop=True)

    # Derive home flag and opponent from Leafs' perspective
    df["home"] = (df["home_team"] == team_abbr).astype(int)
    df["opponent"] = np.where(df["home"] == 1, df["away_team"], df["home_team"])

    # Compute rest features for Leafs only (sequence through Leafs' games)
    # We assume the CSV is only Leafs' games; if it contained all NHL games,
    # we would first filter to rows where TOR is either home or away.
    df["back_to_back"] = 0
    df["rest_days"] = 2

    last_date = None
    for i, r in df.iterrows():
        cur_date = r["date"]
        if pd.notna(cur_date) and last_date is not None:
            delta = (cur_date - last_date).days
            df.at[i, "rest_days"] = max(delta, 0)
            if delta == 1:
                df.at[i, "back_to_back"] = 1
        if pd.notna(cur_date):
            last_date = cur_date

    # Placeholder until you compute opponent rest and subtract:
    df["rest_diff"] = 0

    # Reorder columns nicely
    order = ["date", "home_team", "away_team", "opponent", "home",
             "back_to_back", "rest_days", "rest_diff"]
    # include optional columns if present
    order += [c for c in ["Location", "Result", "Match Number", "Round Number"] if c in df.columns]
    df = df[order]

    return df

# Use it:
schedule_df = load_leafs_schedule_from_csv("../data/raw/schedule.csv")  # or your actual path
schedule_df.head()

Unnamed: 0,date,home_team,away_team,opponent,home,back_to_back,rest_days,rest_diff,Location,Result,Match Number,Round Number
0,2025-01-11 23:00:00,Philadelphia Flyers,Toronto Maple Leafs,Philadelphia Flyers,0,0,2,0,Xfinity Mobile Arena,,188,4
1,2025-03-12 00:00:00,Florida Panthers,Toronto Maple Leafs,Florida Panthers,0,0,59,0,Amerant Bank Arena,,413,8
2,2025-04-11 00:30:00,Toronto Maple Leafs,Pittsburgh Penguins,Toronto Maple Leafs,0,0,30,0,Scotiabank Arena,,199,4
3,2025-05-12 00:00:00,Carolina Hurricanes,Toronto Maple Leafs,Carolina Hurricanes,0,0,30,0,Lenovo Center,,431,9
4,2025-06-11 00:00:00,Toronto Maple Leafs,Utah Hockey Club,Toronto Maple Leafs,0,0,30,0,Scotiabank Arena,,213,5


adding elo to each game

In [23]:
name_to_abbr = {
    "Anaheim Ducks": "ANA",
    "Arizona Coyotes": "ARI",
    "Boston Bruins": "BOS",
    "Buffalo Sabres": "BUF",
    "Calgary Flames": "CGY",
    "Carolina Hurricanes": "CAR",
    "Chicago Blackhawks": "CHI",
    "Colorado Avalanche": "COL",
    "Columbus Blue Jackets": "CBJ",
    "Dallas Stars": "DAL",
    "Detroit Red Wings": "DET",
    "Edmonton Oilers": "EDM",
    "Florida Panthers": "FLA",
    "Los Angeles Kings": "LAK",
    "Minnesota Wild": "MIN",
    "Montr√©al Canadiens": "MTL",
    "Nashville Predators": "NSH",
    "New Jersey Devils": "NJD",
    "New York Islanders": "NYI",
    "New York Rangers": "NYR",
    "Ottawa Senators": "OTT",
    "Philadelphia Flyers": "PHI",
    "Pittsburgh Penguins": "PIT",
    "San Jose Sharks": "SJS",
    "Seattle Kraken": "SEA",
    "St. Louis Blues": "STL",
    "Tampa Bay Lightning": "TBL",
    "Toronto Maple Leafs": "TOR",
    "Utah Hockey Club": "UTA",
    "Vancouver Canucks": "VAN",
    "Vegas Golden Knights": "VGK",
    "Washington Capitals": "WSH",
    "Winnipeg Jets": "WPG"
}

# Load team-level data (acronyms + xGoals%)
teams_df = pd.read_csv("../data/raw/teams.csv")

# Average by team (teams.csv may have multiple situations: 5v5, PP, etc.)
teams_elos = teams_df.groupby("team", as_index=False)["xGoalsPercentage"].mean()

# Handle 0–1 vs 0–100 scale
if teams_elos["xGoalsPercentage"].max() <= 1.5:
    xgp = teams_elos["xGoalsPercentage"] * 100
else:
    xgp = teams_elos["xGoalsPercentage"]

teams_elos["elo"] = 1500 + 20 * (xgp - 50)

# Build acronym → Elo dict
elo_map = dict(zip(teams_elos["team"], teams_elos["elo"]))

# Normalize schedule_df to acronyms
sch = schedule_df.copy()
sch["home_team"] = sch["home_team"].map(name_to_abbr)
sch["away_team"] = sch["away_team"].map(name_to_abbr)

# Recompute Leafs "home" and opponent
TEAM = "TOR"
sch["home"] = (sch["home_team"] == TEAM).astype(int)
sch["opponent"] = np.where(sch["home"]==1, sch["away_team"], sch["home_team"])

# Attach Elo ratings
sch["elo_for"] = np.where(sch["home"]==1,
                          sch["home_team"].map(elo_map),
                          sch["away_team"].map(elo_map))
sch["elo_against"] = np.where(sch["home"]==1,
                              sch["away_team"].map(elo_map),
                              sch["home_team"].map(elo_map))

# Save enriched schedule
sch.to_csv("../data/clean/schedule_with_elos.csv", index=False)

sch.head(10)

Unnamed: 0,date,home_team,away_team,opponent,home,back_to_back,rest_days,rest_diff,Location,Result,Match Number,Round Number,elo_for,elo_against
0,2025-01-11 23:00:00,PHI,TOR,PHI,0,0,2,0,Xfinity Mobile Arena,,188,4,1480.0,1500.0
1,2025-03-12 00:00:00,FLA,TOR,FLA,0,0,59,0,Amerant Bank Arena,,413,8,1480.0,1564.0
2,2025-04-11 00:30:00,TOR,PIT,PIT,1,0,30,0,Scotiabank Arena,,199,4,1480.0,1440.0
3,2025-05-12 00:00:00,CAR,TOR,CAR,0,0,30,0,Lenovo Center,,431,9,1480.0,1576.0
4,2025-06-11 00:00:00,TOR,UTA,UTA,1,0,30,0,Scotiabank Arena,,213,5,1480.0,1512.0
5,2025-07-12 00:00:00,TOR,,,1,0,31,0,Scotiabank Arena,,444,9,1480.0,
6,2025-08-10 23:00:00,TOR,,,1,0,29,0,Scotiabank Arena,,4,1,1480.0,
7,2025-09-11 00:00:00,TOR,BOS,BOS,1,0,31,0,Scotiabank Arena,,234,5,1480.0,1440.0
8,2025-09-12 00:30:00,TOR,TBL,TBL,1,1,1,0,Scotiabank Arena,,461,9,1480.0,1564.0
9,2025-10-11 00:00:00,TOR,CAR,CAR,1,0,28,0,Scotiabank Arena,,246,5,1480.0,1576.0
