In [5]:
# ---------- Daily Barttorvik Ratings ----------

import asyncio
import os
from datetime import datetime, timedelta
from typing import List
import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PWTimeout

# URL template for the T-Rank table page. The {year}, {begin} and {end}
# placeholders are filled in with str.format(); every other query parameter is
# sent exactly as written here.
BASE_URL = (
    "https://barttorvik.com/trank.php"
    "?year={year}&sort=&hteam=&t2value=&conlimit=All&state=All"
    "&begin={begin}&end={end}&top=0&revquad=0&quad=5&venue=All&type=All&mingames=0#"
)

# ---------- HTML fetch ----------

async def goto_and_get_html(page, url: str, table_selector: str = "table", timeout_ms: int = 30000) -> str:
    """Navigate `page` to `url` and return the page's HTML.

    After navigation the function waits up to 20 s for `table_selector`; if that
    times out it waits up to 10 s for the network to go idle instead. It then
    polls the page content up to six times, one second apart, while the HTML
    still contains the browser-verification markers, and finally returns a
    fresh copy of the page content.

    Args:
        page: Playwright page (async API).
        url: Address to load.
        table_selector: CSS selector that signals the table has rendered.
        timeout_ms: Navigation timeout in milliseconds.
    """
    await page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
    try:
        await page.wait_for_selector(table_selector, timeout=20000)
    except PWTimeout:
        await page.wait_for_load_state("networkidle", timeout=10000)

    # Markers that show up while the interstitial verification page is displayed.
    challenge_markers = ("Verifying your browser", "js_test_submitted")
    polls_left = 6
    while polls_left > 0:
        snapshot = await page.content()
        if not any(marker in snapshot for marker in challenge_markers):
            break
        await asyncio.sleep(1)
        polls_left -= 1

    return await page.content()

# ---------- Table parsing ----------

def parse_first_table(html: str) -> List[List[str]]:
    """Return the text of every non-empty row of the first <table> in `html`.

    Each row is a list of the stripped text of its <th>/<td> cells, in document
    order. An empty list is returned when the document has no table.
    """
    first_table = BeautifulSoup(html, "html.parser").select_one("table")
    if not first_table:
        return []

    row_texts = (
        [cell.get_text(strip=True) for cell in tr.select("th, td")]
        for tr in first_table.select("tr")
    )
    # Rows without any th/td cells are dropped.
    return [texts for texts in row_texts if texts]


def rows_to_dataframe(rows: List[List[str]]) -> pd.DataFrame:
    """Build a DataFrame from scraped table rows.

    The second row supplies the column names and everything after it is data;
    the first row is ignored. Data rows are truncated or right-padded with ""
    to the header width. Fewer than three rows yields an empty DataFrame.
    """
    if len(rows) < 3:
        return pd.DataFrame()

    header, body = rows[1], rows[2:]
    width = len(header)

    padded = []
    for record in body:
        clipped = record[:width]
        missing = width - len(clipped)
        padded.append(clipped + [""] * missing)

    return pd.DataFrame(padded, columns=header)

# ---------- Orchestrator ----------

async def scrape_barttorvik_daily(
    year: int = 2021,
    begin: str = "20201101",
    end: str = "20210313",
    output_dir: str = "daily_csvs",
    master_csv: str = "barttorvik_2021_all.csv",
    table_selector: str = "table",
    headless: bool = True,
    pause_sec: float = 3.8
):
    """Scrape one ratings table per day and store the results as CSV.

    For every day D from `begin` to `end` (both YYYYMMDD, inclusive) the page
    for the window begin..D is loaded, its first table is parsed, a "Date"
    column holding D is prepended, and the frame is written to
    `<output_dir>/barttorvik_<D>.csv` and appended to `master_csv`.

    The master file gets a header only on the first write of a run, and only
    when os.path.exists(master_csv) was False at start-up. Errors for a single
    day are printed and the loop moves on. The coroutine sleeps `pause_sec`
    seconds after every day and returns None.
    """
    os.makedirs(output_dir, exist_ok=True)
    start_dt = datetime.strptime(begin, "%Y%m%d")
    final_dt = datetime.strptime(end, "%Y%m%d")

    needs_header = not os.path.exists(master_csv)

    async with async_playwright() as pw:
        browser = await pw.chromium.launch(channel="chrome", headless=headless)
        context = await browser.new_context(
            user_agent=("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/119.0.0.0 Safari/537.36"),
            locale="en-US",
            timezone_id="America/New_York",
        )
        page = await context.new_page()

        total_rows = 0
        # Both bounds are midnight datetimes, so the day count is exact.
        day_count = (final_dt - start_dt).days + 1
        for offset in range(day_count):
            end_str = (start_dt + timedelta(days=offset)).strftime("%Y%m%d")
            url = BASE_URL.format(year=year, begin=begin, end=end_str)

            try:
                html = await goto_and_get_html(page, url, table_selector=table_selector)
                df = rows_to_dataframe(parse_first_table(html))

                if df.empty:
                    print(f"⚠️  {end_str}: no data (empty table)")
                else:
                    df.insert(0, "Date", end_str)

                    # Per-day snapshot file.
                    daily_path = os.path.join(output_dir, f"barttorvik_{end_str}.csv")
                    df.to_csv(daily_path, index=False)
                    print(f"✔️  {end_str}: saved {len(df)} rows to {daily_path}")

                    # Running master file: header once, then append-only.
                    if needs_header:
                        df.to_csv(master_csv, index=False)
                        needs_header = False
                    else:
                        df.to_csv(master_csv, mode="a", header=False, index=False)

                    total_rows += len(df)

            except Exception as e:
                print(f"❌ {end_str}: ERROR {e}")

            await asyncio.sleep(pause_sec)

        await browser.close()

    print(f"\n✅ Done! {total_rows} total rows saved across days.")

# ---------- Run ----------
# In Jupyter or async environment:
# Scrapes the 2026 season table for each day from 2025-11-03 through 2025-11-09.
# NOTE(review): master_csv is an s3:// URI while scrape_barttorvik_daily decides
# whether to write a header with os.path.exists(); that local-filesystem check
# presumably never sees the S3 object, so each run would start the master file
# over with a fresh header — confirm this is intended.
await scrape_barttorvik_daily(
    year=2026,
    begin="20251103",
    end="20251109",
    output_dir="daily_csvs_2026",
    master_csv="s3://collegebasketballinsiders/torvik/2026/team-ratings.csv"
)

✔️  20251103: saved 225 rows to daily_csvs_2026/barttorvik_20251103.csv
✔️  20251104: saved 267 rows to daily_csvs_2026/barttorvik_20251104.csv
✔️  20251105: saved 284 rows to daily_csvs_2026/barttorvik_20251105.csv
✔️  20251106: saved 299 rows to daily_csvs_2026/barttorvik_20251106.csv
✔️  20251107: saved 332 rows to daily_csvs_2026/barttorvik_20251107.csv
✔️  20251108: saved 363 rows to daily_csvs_2026/barttorvik_20251108.csv
✔️  20251109: saved 363 rows to daily_csvs_2026/barttorvik_20251109.csv

✅ Done! 2133 total rows saved across days.


In [8]:
# Turn the raw scraped master file into the tidy daily ratings table and write
# it back to the same S3 key.
daily_torvik_2026_df = pd.read_csv("s3://collegebasketballinsiders/torvik/2026/team-ratings.csv")

# Drop rows whose Team cell is the literal header text. .copy() detaches the
# filtered frame so the column assignments below act on an independent object
# (no SettingWithCopyWarning).
daily_torvik_2026_df = daily_torvik_2026_df[daily_torvik_2026_df['Team'] != "Team"].copy()
# Keep only the leading team-name characters of the cell. The apostrophe is part
# of the allowed set so names such as "Saint Mary's" are not cut at the quote.
daily_torvik_2026_df['Team'] = daily_torvik_2026_df['Team'].str.extract(r"^([A-Za-z\s.&']+)")[0].str.strip()
# WAB may carry a leading "+". Going through str first keeps this working even
# when read_csv already parsed the column as numeric.
daily_torvik_2026_df['WAB'] = (
    daily_torvik_2026_df['WAB'].astype(str).str.replace("+", "", regex=False).astype("float")
)
daily_torvik_2026_df['season'] = 2026
daily_torvik_2026_df = daily_torvik_2026_df[['season','Date', 'Team', 'Rk', 'Conf', 'G', 'AdjOE', 'AdjDE', 'Barthag',
       'EFG%', 'EFGD%', 'TOR', 'TORD', 'ORB', 'DRB', 'FTR', 'FTRD', '2P%',
       '2P%D', '3P%', '3P%D', '3PR', '3PRD', 'Adj T.', 'WAB']].sort_values(["Date","Team"], ascending=True)
daily_torvik_2026_df.columns = ['season', 'date', 'team', 'rank', 'conf', 'games', 'adj_off_eff', 'adj_def_eff', 'barthag',
       'efg_pct', 'efgd_pct', 'tor', 'tord', 'orb', 'drb', 'ftr', 'ftrd', 'two_pt_pct',
       'two_pt_def_pct', 'three_pt_pct', 'three_pt_def_pct', 'three_pt_rt', 'three_pt_def_rt', 'adj_tempo', 'wab']

# index=False: the scraper writes this file without an index column, so do not
# add an unnamed one here.
# NOTE(review): this overwrites the raw file with renamed columns, so the cell
# cannot be re-run on its own output (there is no 'Team' column any more).
daily_torvik_2026_df.to_csv("s3://collegebasketballinsiders/torvik/2026/team-ratings.csv", index=False)

In [9]:
import os
import time
import json
import math
import datetime as dt
from typing import Iterable, Optional, Dict, Any, List

import requests
from requests.adapters import HTTPAdapter, Retry
import pandas as pd
from dotenv import load_dotenv
import os

# Optional .env support
# NOTE(review): dotenv is also imported unconditionally in this cell's import
# block, so a missing python-dotenv fails there before this guard is reached.
try:
    from dotenv import load_dotenv
    load_dotenv()
except Exception:
    pass

# Read once when the cell runs; None when the variable is not set.
API_KEY = os.getenv("KENPOM_API_KEY")

API_BASE = "https://kenpom.com/api.php"
# Built when the cell runs, so this holds "Bearer None" if API_KEY is unset.
HEADERS = {"Authorization": f"Bearer {API_KEY}"}


def session_with_retries(total=5, backoff=0.5):
    """Return a requests.Session whose https:// adapter retries GET requests.

    Args:
        total: Retry budget, applied to the total, connect and read counters.
        backoff: Backoff factor handed to the Retry policy.

    Retries are triggered for status codes 429, 500, 502, 503 and 504; with
    raise_on_status=False the last response is returned instead of raising once
    the budget is exhausted.
    """
    policy = Retry(
        total=total, connect=total, read=total,
        backoff_factor=backoff,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
        raise_on_status=False,
    )
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=policy))
    return session

def season_date_window(y: int,
                       start: Optional[str] = None,
                       end: Optional[str] = None) -> Iterable[dt.date]:
    """Yield every calendar day from `start` to `end`, both inclusive.

    Args:
        y: Season end year; used only to build the default bounds.
        start: ISO date (YYYY-MM-DD). Defaults to Nov 1 of the previous year.
        end: ISO date (YYYY-MM-DD). Defaults to Apr 15 of year `y`.

    Nothing is yielded when `end` is earlier than `start`.
    """
    first = dt.date.fromisoformat(start if start is not None else f"{y-1}-11-01")
    last = dt.date.fromisoformat(end if end is not None else f"{y}-04-15")
    for offset in range((last - first).days + 1):
        yield first + dt.timedelta(days=offset)

def fetch_archive_for_date(s: requests.Session, day: dt.date) -> List[Dict[str, Any]]:
    """Fetch the "archive" endpoint for one day.

    Returns the decoded JSON list, or [] when the server answers 404 or the
    payload is not a list. Other HTTP error statuses raise via raise_for_status().
    """
    query = {"endpoint": "archive", "d": day.isoformat()}
    resp = s.get(API_BASE, params=query, headers=HEADERS, timeout=30)
    if resp.status_code == 404:
        # Treated as "no snapshot for this day".
        return []
    resp.raise_for_status()
    payload = resp.json()
    return payload if isinstance(payload, list) else []

def fetch_preseason_for_year(s: requests.Session, y: int) -> List[Dict[str, Any]]:
    """Fetch the "archive" endpoint with preseason=true for season `y`.

    Returns the decoded JSON list, or [] when the server answers 404 or the
    payload is not a list. Other HTTP error statuses raise via raise_for_status().
    """
    query = {"endpoint": "archive", "preseason": "true", "y": y}
    resp = s.get(API_BASE, params=query, headers=HEADERS, timeout=30)
    if resp.status_code == 404:
        return []
    resp.raise_for_status()
    payload = resp.json()
    return payload if isinstance(payload, list) else []

def fetch_teams_for_year(s: requests.Session, y: int) -> pd.DataFrame:
    """Fetch the "teams" endpoint for season `y` as a DataFrame.

    A falsy payload yields an empty frame that still has the Season, TeamName,
    TeamID and ConfShort columns. HTTP error statuses raise via raise_for_status().
    """
    resp = s.get(API_BASE, params={"endpoint": "teams", "y": y}, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    payload = resp.json()
    if payload:
        return pd.DataFrame(payload)
    return pd.DataFrame(columns=["Season","TeamName","TeamID","ConfShort"])

def normalize_rows(rows: List[Dict[str, Any]]) -> pd.DataFrame:
    """Flatten API rows into a DataFrame with a stable leading column order.

    - Empty input returns an empty DataFrame.
    - When ArchiveDate is absent but DataThrough is present, ArchiveDate is
      copied from DataThrough.
    - Known columns come first in a fixed order, followed by any others in
      their original order.
    - ArchiveDate, when present, is converted to datetime.date values;
      unparseable entries are coerced to missing.
    """
    if not rows:
        return pd.DataFrame()

    df = pd.json_normalize(rows)

    lacks_archive_date = "ArchiveDate" not in df.columns
    if lacks_archive_date and "DataThrough" in df.columns:
        df["ArchiveDate"] = df["DataThrough"]  # fallback

    leading = (
        "ArchiveDate", "Season", "TeamName", "Seed", "ConfShort", "Event",
        "AdjEM", "RankAdjEM",
        "AdjOE", "RankAdjOE",
        "AdjDE", "RankAdjDE",
        "AdjTempo", "RankAdjTempo",
        "AdjEMFinal", "RankAdjEMFinal",
        "AdjOEFinal", "RankAdjOEFinal",
        "AdjDEFinal", "RankAdjDEFinal",
        "AdjTempoFinal", "RankAdjTempoFinal",
        "RankChg", "AdjEMChg", "AdjTChg",
        "Preseason",
    )
    head = [name for name in leading if name in df.columns]
    tail = [name for name in df.columns if name not in leading]
    df = df[head + tail]

    if "ArchiveDate" in df.columns:
        parsed = pd.to_datetime(df["ArchiveDate"], errors="coerce")
        df["ArchiveDate"] = parsed.dt.date
    return df

def scrape_daily_kenpom(y: int,
                        include_preseason: bool = True,
                        start: Optional[str] = None,
                        end: Optional[str] = None,
                        sleep_sec: float = 0.25) -> pd.DataFrame:
    """Collect the daily ratings archive for season `y` into one DataFrame.

    Args:
        y: Season end year.
        include_preseason: Also fetch the preseason snapshot and put it first.
        start, end: Optional ISO dates bounding the daily loop; the defaults
            come from season_date_window().
        sleep_sec: Pause after every daily request, including failed ones.

    Returns:
        All snapshots concatenated and de-duplicated, left-joined with TeamID
        on TeamName when the team list is available, sorted by ArchiveDate,
        Season, RankAdjEM and TeamName (whichever exist). An empty DataFrame
        when nothing was fetched.

    Raises:
        RuntimeError: when KENPOM_API_KEY is not set.
        requests.HTTPError: when the team list or preseason request fails.
            HTTP errors for individual days are reported and skipped.
    """
    if not API_KEY:
        raise RuntimeError("Set KENPOM_API_KEY in your environment (export KENPOM_API_KEY=...).")

    frames = []
    skipped_days = []

    # The context manager closes the session's pooled connections on exit.
    with session_with_retries() as s:
        # Optional: team list (useful for mapping / validation)
        teams_df = fetch_teams_for_year(s, y)

        if include_preseason:
            pre = fetch_preseason_for_year(s, y)
            pdf = normalize_rows(pre)
            if not pdf.empty:
                # Mark as preseason if field not provided
                if "Preseason" not in pdf.columns:
                    pdf["Preseason"] = "true"
                frames.append(pdf)

        for day in season_date_window(y, start=start, end=end):
            try:
                rows = fetch_archive_for_date(s, day)
            except requests.HTTPError as e:
                # Still best-effort, but report the gap instead of hiding it.
                skipped_days.append(day)
                print(f"KenPom archive {day.isoformat()}: skipped after HTTP error: {e}")
                rows = []
            df = normalize_rows(rows)
            if not df.empty:
                # Ensure a season column exists
                if "Season" not in df.columns:
                    df["Season"] = y
                frames.append(df)
            # Pause after failures too, so an erroring server is not hit
            # again immediately.
            time.sleep(sleep_sec)

    if skipped_days:
        print(f"KenPom archive: {len(skipped_days)} day(s) skipped because of HTTP errors.")

    if not frames:
        return pd.DataFrame()

    out = pd.concat(frames, ignore_index=True).drop_duplicates()

    # Optional: join TeamID if you want a stable key
    if "TeamName" in out.columns and not teams_df.empty:
        out = out.merge(
            teams_df[["TeamName", "TeamID"]],
            on="TeamName",
            how="left",
            validate="m:1"
        )

    # Sort for readability
    sort_cols = [c for c in ["ArchiveDate", "Season", "RankAdjEM", "TeamName"] if c in out.columns]
    if sort_cols:
        out = out.sort_values(sort_cols).reset_index(drop=True)

    return out

def _get_api_key() -> str:
    key = os.getenv("KENPOM_API_KEY")
    if not key:
        raise RuntimeError("KENPOM_API_KEY not set. Use a .env file or export it.")
    return key

def _session_with_retries(total: int = 5, backoff: float = 0.5) -> requests.Session:
    """Return a retrying requests.Session.

    Thin alias for session_with_retries(), which builds the same Retry policy
    (same counters, backoff factor, status list and allowed methods). Delegating
    keeps one copy of the retry configuration in this cell.
    """
    return session_with_retries(total=total, backoff=backoff)

def fetch_height_for_season(y: int, c: Optional[str] = None) -> pd.DataFrame:
    """
    Return Height snapshot for season y (optionally filtered by conference short name c).

    The JSON list from the "height" endpoint is flattened and lightly typed:
    DataThrough becomes datetime.date, TeamName/ConfShort become pandas strings,
    Season and every rank column become nullable Int64, and the listed metric
    columns become float64. A non-list or empty payload yields an empty frame.

    Raises:
        RuntimeError: when KENPOM_API_KEY is not set.
        requests.HTTPError: on an HTTP error status.
    """
    headers = {"Authorization": f"Bearer {_get_api_key()}"}
    params = {"endpoint": "height", "y": y}
    if c:
        params["c"] = c
    # Close the session once the single request is done.
    with _session_with_retries() as s:
        resp = s.get(API_BASE, params=params, headers=headers, timeout=45)
        resp.raise_for_status()
        data = resp.json()
    df = pd.json_normalize(data) if isinstance(data, list) else pd.DataFrame()
    if df.empty:
        return df

    # Light normalization (no file I/O)
    if "DataThrough" in df.columns:
        df["DataThrough"] = pd.to_datetime(df["DataThrough"], errors="coerce").dt.date
    for col in [x for x in ["TeamName", "ConfShort"] if x in df.columns]:
        df[col] = df[col].astype("string")
    if "Season" in df.columns:
        df["Season"] = pd.to_numeric(df["Season"], errors="coerce").astype("Int64")
    # Rank columns to Int64; metric columns to float64.
    # Rank columns are recognised by a "Rank" suffix (AvgHgtRank) or prefix
    # (RankContinuity); the suffix-only test missed the prefixed form.
    rank_cols = [
        name for name in df.columns
        if name.lower().endswith("rank") or name.lower().startswith("rank")
    ]
    for col in rank_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")
    for col in ["AvgHgt","HgtEff","Hgt5","Hgt4","Hgt3","Hgt2","Hgt1","Exp","Bench","Continuity"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("float64")
    return df

def fetch_height_range(start_season: int, end_season: int, c: Optional[str] = None) -> pd.DataFrame:
    """
    Fetch the height snapshot for each season in [start_season, end_season]
    and return them stacked in season order.

    Seasons that come back empty are left out; if every season is empty the
    result is an empty DataFrame. Columns are reordered so the known ones come
    first, followed by any others in their original order.
    """
    per_season = (
        fetch_height_for_season(season, c=c)
        for season in range(start_season, end_season + 1)
    )
    frames: List[pd.DataFrame] = [snapshot for snapshot in per_season if not snapshot.empty]
    if not frames:
        return pd.DataFrame()
    out = pd.concat(frames, ignore_index=True)

    # Purely cosmetic column order.
    preferred = [
        "Season","TeamName","ConfShort","DataThrough",
        "AvgHgt","AvgHgtRank","HgtEff","HgtEffRank",
        "Hgt5","Hgt5Rank","Hgt4","Hgt4Rank","Hgt3","Hgt3Rank","Hgt2","Hgt2Rank","Hgt1","Hgt1Rank",
        "Exp","ExpRank","Bench","BenchRank","Continuity","RankContinuity"
    ]
    known = [name for name in preferred if name in out.columns]
    others = [name for name in out.columns if name not in preferred]
    return out.reindex(columns=known + others)


# Driver: pull the daily ratings archive (plus preseason) for one season, keep
# the core rating columns and write them to S3, then do the same for the height
# snapshot.
year = 2026
df = scrape_daily_kenpom(year, include_preseason=True)
# NOTE(review): this selection raises KeyError if the scrape returned an empty
# frame or if any listed column is absent (TeamID only exists when the teams
# join ran).
df = df[['ArchiveDate','Season','TeamID','TeamName','Seed','ConfShort','Event',
         'AdjEM','RankAdjEM','AdjOE','RankAdjOE','AdjDE','RankAdjDE',
         'AdjTempo','RankAdjTempo']].copy()

# NOTE(review): no index=False here, so the frame's index is written as an
# unnamed first column — confirm downstream readers expect that.
df.to_csv(f"s3://collegebasketballinsiders/kenpom/{year}/team-ratings.csv")

# `df` is rebound to the height table; the ratings frame is no longer referenced.
df = fetch_height_range(year,year)
df.to_csv(f"s3://collegebasketballinsiders/kenpom/{year}/team-height.csv")

In [10]:
# ---------- Game Ids (dated subfolders) ----------
import os
import re
import json
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Optional import: only needed if OUT_DIR is s3://...
try:
    import s3fs
except Exception:
    s3fs = None

# ---------------- Config ----------------
# Date bounds are YYYYMMDD strings; daterange() parses them with "%Y%m%d".
START_DATE = "20251102"  # inclusive
END_DATE   = "20251109"  # inclusive
GROUP = 50               # 50 = NCAA Division I
OUT_DIR = "s3://collegebasketballinsiders/daily-box-score-ids"
OVERWRITE = False        # True to overwrite existing daily files
PAUSE_SECONDS = 3.4      # be polite (optional)

# Browser-like User-Agent sent with every ESPN request in this cell.
# NOTE(review): this rebinds the notebook-level name HEADERS that the KenPom
# fetch helpers read at call time; calling those helpers after this cell has run
# would send this dict instead of their Authorization header.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/119.0.0.0 Safari/537.36"
    )
}

# ---------------- Helpers ----------------
def is_s3_path(path: str) -> bool:
    """Return True when `path` is a string starting with "s3://" (any case)."""
    if not isinstance(path, str):
        return False
    return path.lower().startswith("s3://")

def s3_join(base: str, *parts: str) -> str:
    """Join S3 URI segments with single slashes, leaving the scheme/bucket intact.

    Trailing slashes are removed from `base`, leading/trailing slashes from each
    part, and None parts are skipped. With no usable tail the trimmed base is
    returned on its own.
    """
    root = base.rstrip("/")
    segments = [piece.strip("/") for piece in parts if piece is not None]
    suffix = "/".join(segments)
    if not suffix:
        return root
    return f"{root}/{suffix}"

class FS:
    """
    Tiny facade over local fs vs S3 so the rest of the code stays clean.

    The backend is chosen once from `root`: an s3:// root routes every call
    through an s3fs.S3FileSystem, anything else goes to the local filesystem.
    """
    def __init__(self, root: str):
        """Pick the backend for `root`; raises RuntimeError if S3 is needed but s3fs is missing."""
        self.root = root
        self._is_s3 = is_s3_path(root)
        self._fs = None
        if self._is_s3:
            if s3fs is None:
                raise RuntimeError(
                    "s3fs is required for S3 paths. Install with: pip install s3fs"
                )
            # Uses standard AWS credential chain (env vars, ~/.aws/credentials, IAM role, etc.)
            self._fs = s3fs.S3FileSystem(anon=False)

    @staticmethod
    def _ensure_local_parent(path: str) -> None:
        """Create the parent directory of a local file path, if it has one."""
        parent = os.path.dirname(path)
        # A bare filename has no parent component, and os.makedirs("") raises.
        if parent:
            os.makedirs(parent, exist_ok=True)

    def join(self, *parts: str) -> str:
        """Join path segments using the rules of the active backend."""
        if self._is_s3:
            return s3_join(*parts)
        return os.path.join(*parts)

    def mkdir(self, path: str):
        """Create a local directory (and parents); does nothing on S3."""
        if self._is_s3:
            # No-op: S3 prefixes are virtual; objects create "folders"
            return
        os.makedirs(path, exist_ok=True)

    def exists(self, path: str) -> bool:
        """Return whether `path` exists on the active backend."""
        if self._is_s3:
            return self._fs.exists(path)
        return os.path.exists(path)

    def open_write_text(self, path: str):
        """Open `path` for UTF-8 text writing and return the file object."""
        if self._is_s3:
            # Encoding is passed explicitly so S3 and local writes agree.
            return self._fs.open(path, "w", encoding="utf-8")
        self._ensure_local_parent(path)
        return open(path, "w", encoding="utf-8")

    def to_csv(self, df: pd.DataFrame, path: str):
        """Write `df` to `path` as CSV without the index."""
        if self._is_s3:
            # newline="" stops the text wrapper from translating the line
            # terminators that pandas writes.
            with self._fs.open(path, "w", encoding="utf-8", newline="") as f:
                df.to_csv(f, index=False)
        else:
            self._ensure_local_parent(path)
            df.to_csv(path, index=False)

# ---------------- Data fetchers ----------------
def fetch_games_api(date_yyyymmdd: str, group: int = GROUP) -> pd.DataFrame:
    """Preferred: ESPN public JSON API (no HTML parsing).

    Args:
        date_yyyymmdd: Scoreboard date as YYYYMMDD.
        group: ESPN group id passed as the `groups` query parameter.

    Returns:
        One row per event with game_id, home_team and away_team (display
        names); an empty DataFrame when the response lists no events.

    Raises:
        requests.HTTPError: on an HTTP error status.
    """
    # The scoreboard is served under /apis/site/v2/, the same path family as the
    # summary endpoint used elsewhere in this notebook; this previously said /apis/v2/.
    url = (
        "https://site.api.espn.com/apis/site/v2/sports/basketball/"
        "mens-college-basketball/scoreboard"
    )
    # NOTE(review): `limit` is raised so busy days are not truncated by the
    # API's default page size — confirm the cap ESPN actually honours.
    params = {"dates": date_yyyymmdd, "groups": group, "limit": 1000}
    r = requests.get(url, params=params, headers=HEADERS, timeout=30)
    r.raise_for_status()
    data = r.json()

    rows = []
    # `or []` also covers an explicit null for "events".
    for e in data.get("events") or []:
        gid = e.get("id")
        comp = (e.get("competitions") or [{}])[0]
        comps = comp.get("competitors") or []
        home = next((c for c in comps if c.get("homeAway") == "home"), {})
        away = next((c for c in comps if c.get("homeAway") == "away"), {})
        rows.append({
            "game_id": gid,
            "home_team": (home.get("team") or {}).get("displayName"),
            "away_team": (away.get("team") or {}).get("displayName"),
        })
    return pd.DataFrame(rows)

def fetch_scoreboard_html(date_yyyymmdd: str, group: int = GROUP) -> str:
    """Fallback: download the scoreboard page for the date/group and return its HTML text.

    Raises requests.HTTPError on an HTTP error status.
    """
    page_url = f"https://www.espn.com/mens-college-basketball/scoreboard/_/date/{date_yyyymmdd}/group/{group}"
    response = requests.get(page_url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    return response.text

def parse_games_from_html(html: str) -> pd.DataFrame:
    """
    Parse game blocks from server-rendered DOM.
    <section class="Scoreboard" id="<game_id>">…</section>

    Returns one row per game with game_id, home_team and away_team. If no such
    sections exist, game ids are taken from links matching
    /mens-college-basketball/game/_/gameId/<digits>; team names are then None
    and each id is reported once, in first-seen order.
    """
    soup = BeautifulSoup(html, "html.parser")
    games = []

    # Primary: section blocks with game id
    for sec in soup.select("section.Scoreboard[id]"):
        gid = sec.get("id")
        away = sec.select_one(".ScoreboardScoreCell__Item--away .ScoreCell__TeamName--shortDisplayName")
        home = sec.select_one(".ScoreboardScoreCell__Item--home .ScoreCell__TeamName--shortDisplayName")
        games.append({
            "game_id": gid,
            "home_team": home.get_text(strip=True) if home else None,
            "away_team": away.get_text(strip=True) if away else None,
        })

    # Secondary: backup to anchor pattern if nothing found
    if not games:
        seen_ids = set()
        for a in soup.find_all("a", href=True):
            m = re.search(r"/mens-college-basketball/game/_/gameId/(\d+)", a["href"])
            if not m:
                continue
            gid = m.group(1)
            # Several links can point at the same game; keep the first only.
            if gid in seen_ids:
                continue
            seen_ids.add(gid)
            games.append({"game_id": gid, "home_team": None, "away_team": None})

    return pd.DataFrame(games)

def get_games_for_date(date_yyyymmdd: str, group: int = GROUP) -> pd.DataFrame:
    """Try API first; if empty/error, fall back to HTML.

    Returns a frame with a leading "date" column plus the game columns, or an
    empty DataFrame when neither source yields games. Failures of either
    source are printed rather than raised, so a broken endpoint can be told
    apart from a day without games.
    """
    try:
        df_api = fetch_games_api(date_yyyymmdd, group=group)
        if not df_api.empty:
            df_api.insert(0, "date", date_yyyymmdd)
            return df_api
    except Exception as exc:
        # Best-effort: report and continue with the HTML fallback.
        print(f"{date_yyyymmdd}: scoreboard API failed ({exc}); falling back to HTML")

    try:
        html = fetch_scoreboard_html(date_yyyymmdd, group=group)
        df_html = parse_games_from_html(html)
        if not df_html.empty:
            df_html.insert(0, "date", date_yyyymmdd)
        return df_html
    except Exception as exc:
        print(f"{date_yyyymmdd}: HTML scoreboard fallback failed ({exc})")
        return pd.DataFrame()

def daterange(start_yyyymmdd: str, end_yyyymmdd: str):
    """Yield each day from start to end (inclusive) as a YYYYMMDD string.

    Nothing is yielded when the end date precedes the start date.
    """
    first = datetime.strptime(start_yyyymmdd, "%Y%m%d")
    last = datetime.strptime(end_yyyymmdd, "%Y%m%d")
    for offset in range((last - first).days + 1):
        yield (first + timedelta(days=offset)).strftime("%Y%m%d")

# ---------------- Main ----------------
def main():
    """Fetch game ids for every day in START_DATE..END_DATE and save them.

    Each day with games gets `<OUT_DIR>/<YYYYMMDD>/game_ids.csv` and
    `game_ids.txt` (one id per line). Days whose CSV already exists are skipped
    unless OVERWRITE is set. Per-day errors are printed and the loop continues.
    """
    fs = FS(OUT_DIR)

    days_written = 0
    games_written = 0

    for day in daterange(START_DATE, END_DATE):
        # Dated subfolder like: daily-box-score-ids/20251103/
        # fs.join picks S3 or local joining; fs.mkdir does nothing on S3.
        day_dir = fs.join(OUT_DIR, day)
        fs.mkdir(day_dir)

        csv_path = fs.join(day_dir, "game_ids.csv")
        txt_path = fs.join(day_dir, "game_ids.txt")

        already_done = fs.exists(csv_path) if not OVERWRITE else False
        if already_done:
            print(f"⏭️  {day}: {csv_path} exists, skipping (set OVERWRITE=True to redo)")
            continue

        try:
            df = get_games_for_date(day, group=GROUP)
            if df is not None and not df.empty:
                # Fixed column order for the output file.
                df = df[["date", "game_id", "home_team", "away_team"]]

                fs.to_csv(df, csv_path)

                # Plain text list of ids, one per line.
                with fs.open_write_text(txt_path) as f:
                    for gid in df["game_id"].astype(str):
                        f.write(gid + "\n")

                days_written += 1
                games_written += len(df)
                print(f"✅ {day}: saved {len(df)} games → {csv_path} and {txt_path}")
            else:
                print(f"— {day}: no games")
        except Exception as e:
            print(f"❌ {day}: ERROR {e}")

        # Be polite to the server.
        if PAUSE_SECONDS:
            try:
                time.sleep(PAUSE_SECONDS)
            except Exception:
                pass

    print(f"\nDone. Wrote {games_written} games across {days_written} days into '{OUT_DIR}/<YYYYMMDD>/' folders.")

if __name__ == "__main__":
    main()

— 20251102: no games
✅ 20251103: saved 169 games → s3://collegebasketballinsiders/daily-box-score-ids/20251103/game_ids.csv and s3://collegebasketballinsiders/daily-box-score-ids/20251103/game_ids.txt
✅ 20251104: saved 36 games → s3://collegebasketballinsiders/daily-box-score-ids/20251104/game_ids.csv and s3://collegebasketballinsiders/daily-box-score-ids/20251104/game_ids.txt
✅ 20251105: saved 35 games → s3://collegebasketballinsiders/daily-box-score-ids/20251105/game_ids.csv and s3://collegebasketballinsiders/daily-box-score-ids/20251105/game_ids.txt
✅ 20251106: saved 44 games → s3://collegebasketballinsiders/daily-box-score-ids/20251106/game_ids.csv and s3://collegebasketballinsiders/daily-box-score-ids/20251106/game_ids.txt
✅ 20251107: saved 76 games → s3://collegebasketballinsiders/daily-box-score-ids/20251107/game_ids.csv and s3://collegebasketballinsiders/daily-box-score-ids/20251107/game_ids.txt
✅ 20251108: saved 69 games → s3://collegebasketballinsiders/daily-box-score-ids/202

In [19]:
# ---------- Game Box Scores (S3-ready) ----------

import os
import csv
import requests
from typing import Any, Dict, List, Tuple, Optional


try:
    import s3fs
except Exception:
    s3fs = None

import glob
import pandas as pd
import time

# ----------------------------
# Config
# ----------------------------
# Game summary endpoint; {gid} is filled in with the game id via str.format().
SUMMARY_URL = "https://site.api.espn.com/apis/site/v2/sports/basketball/mens-college-basketball/summary?event={gid}"
UA_HEADERS = {"User-Agent": "Mozilla/5.0"}

# Where to SAVE box score CSVs (can be s3://bucket/prefix or local)
# NOTE(review): OUT_DIR is also assigned in the game-id cell with a different
# prefix; whichever cell ran last determines the value seen by code that reads
# OUT_DIR at call time.
OUT_DIR = "s3://collegebasketballinsiders/boxscores-2026"

# Where to READ daily game-id CSVs (can be s3://bucket/prefix or local)
# Example matches output from your first script: s3://.../daily-box-score-ids/<YYYYMMDD>/*.csv
IDS_DIR = "s3://collegebasketballinsiders/daily-box-score-ids"
IDS_DATE = "20251109"  # which day’s game_id CSVs to ingest

# ----------------------------
# Simple FS facade (local vs S3)
# ----------------------------
def is_s3_path(path: str) -> bool:
    """Return True when `path` is a string starting with "s3://" (any case)."""
    if isinstance(path, str):
        return path.lower().startswith("s3://")
    return False

def s3_join(base: str, *parts: str) -> str:
    """Join S3 URI segments with single slashes.

    Trailing slashes are trimmed from `base`, surrounding slashes from each
    part, and None parts are ignored. With no usable tail the trimmed base is
    returned on its own.
    """
    prefix = base.rstrip("/")
    cleaned = [segment.strip("/") for segment in parts if segment is not None]
    joined = "/".join(cleaned)
    return prefix if not joined else f"{prefix}/{joined}"
# --- Add this helper near your FS code ---
def _ensure_s3_scheme(p: str) -> str:
    if p and not p.lower().startswith("s3://"):
        return f"s3://{p}"
    return p

class FS:
    """Minimal local-vs-S3 file facade.

    The backend is chosen once from `root_hint`: an s3:// hint routes every call
    through an s3fs.S3FileSystem, anything else (or no hint) uses the local
    filesystem.
    """
    def __init__(self, root_hint: Optional[str] = None):
        """Pick the backend; raises RuntimeError if S3 is needed but s3fs is missing."""
        self._is_s3 = is_s3_path(root_hint) if root_hint else False
        self._fs = None
        if self._is_s3:
            if s3fs is None:
                raise RuntimeError("s3fs is required for S3 paths. Install: pip install s3fs")
            self._fs = s3fs.S3FileSystem(anon=False)

    def join(self, *parts: str) -> str:
        """Join path segments using the rules of the active backend."""
        if self._is_s3:
            return s3_join(*parts)
        return os.path.join(*parts)

    def mkdirs_for_file(self, path: str):
        """Create the parent directory of a local file path; does nothing on S3."""
        if self._is_s3:
            # S3: no-op, prefixes are virtual
            return
        parent = os.path.dirname(path)
        # A bare filename has no parent component, and os.makedirs("") raises.
        if parent:
            os.makedirs(parent, exist_ok=True)

    def open_text_write(self, path: str):
        """Open `path` for UTF-8 text writing with newline translation disabled."""
        if self._is_s3:
            # Same encoding/newline settings as the local branch, so csv
            # writers produce the same bytes on both backends.
            return self._fs.open(path, "w", newline="", encoding="utf-8")
        self.mkdirs_for_file(path)
        return open(path, "w", newline="", encoding="utf-8")

    def open_binary_read(self, path: str):
        """Open `path` for binary reading."""
        if self._is_s3:
            return self._fs.open(path, "rb")
        return open(path, "rb")

    def exists(self, path: str) -> bool:
        """Return whether `path` exists on the active backend."""
        if self._is_s3:
            return self._fs.exists(path)
        return os.path.exists(path)

    def glob(self, pattern: str) -> List[str]:
        """Return the paths matching `pattern`; S3 results always carry the s3:// scheme."""
        if self._is_s3:
            results = self._fs.glob(pattern)
            # Some s3fs versions return keys without the scheme. Normalize.
            return [_ensure_s3_scheme(p) for p in results]
        return glob.glob(pattern)

# Instantiate two facades: one for OUT_DIR and one for IDS_DIR
# Each picks its backend (S3 or local) from its own directory setting when this
# cell runs; _write_csv below always goes through fs_out.
fs_out = FS(OUT_DIR)
fs_ids = FS(IDS_DIR)

# ----------------------------
# Fetch
# ----------------------------
def _get_summary(game_id: str) -> Dict[str, Any]:
    """Download and decode the summary JSON for `game_id`.

    Raises requests.HTTPError on an HTTP error status.
    """
    response = requests.get(SUMMARY_URL.format(gid=game_id), headers=UA_HEADERS, timeout=20)
    response.raise_for_status()
    payload = response.json()
    return payload

# ----------------------------
# Helpers
# ----------------------------
def _first_comp(summary: Dict[str, Any]) -> Dict[str, Any]:
    return (summary.get("header", {}).get("competitions") or [{}])[0]

def _competitors(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Return the competitors list of the first competition, or [] when absent or empty."""
    listed = _first_comp(summary).get("competitors")
    return listed if listed else []

def _home_comp(comp_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    return next((c for c in comp_list if c.get("homeAway") == "home"), comp_list[0] if comp_list else {})

def _away_comp(comp_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    return next((c for c in comp_list if c.get("homeAway") == "away"), comp_list[1] if len(comp_list) > 1 else {})

def _team_name(team_obj: Dict[str, Any]) -> str:
    return team_obj.get("displayName") or team_obj.get("name") or team_obj.get("location") or ""

def _parse_value_to_int(val: Any) -> Optional[int]:
    if val is None:
        return None
    try:
        return int(val)
    except Exception:
        try:
            return int(str(val).split(".")[0])
        except Exception:
            return None

def _extract_period_num(item: Dict[str, Any]) -> Optional[int]:
    """Return the period number of a line-score item as an int, or None.

    item["period"] may be a dict carrying "number" or the number itself.
    """
    raw = item.get("period")
    number = raw.get("number") if isinstance(raw, dict) else raw
    return _parse_value_to_int(number)

def _extract_score_from_item(item: Dict[str, Any]) -> Optional[int]:
    """Return the score of a line-score item as an int, or None.

    The keys "value", "displayValue" and "score" are tried in that order. A key
    that is present but does not parse (for example a null "value") no longer
    ends the search; the next key is tried instead.
    """
    for key in ("value", "displayValue", "score"):
        if key in item:
            parsed = _parse_value_to_int(item[key])
            if parsed is not None:
                return parsed
    return None

def _half_scores(competitor: Dict[str, Any]) -> Tuple[Optional[int], Optional[int]]:
    """Return (first-half, second-half) points for a competitor.

    Line scores come from "linescores" or, failing that, "scoreByPeriod".
    Scores tagged with period 1 and 2 are preferred; if neither tag is found,
    the first two parseable scores in list order are used. (None, None) is
    returned when nothing usable is available.
    """
    lines = competitor.get("linescores") or competitor.get("scoreByPeriod") or []
    if not isinstance(lines, list) or not lines:
        return (None, None)

    period_scores: Dict[int, int] = {}
    ordered_scores: List[int] = []

    for entry in (candidate for candidate in lines if isinstance(candidate, dict)):
        score = _extract_score_from_item(entry)
        period = _extract_period_num(entry)
        if score is None:
            continue
        ordered_scores.append(score)
        if period is not None:
            period_scores[period] = score

    first_half = period_scores.get(1)
    second_half = period_scores.get(2)
    if first_half is None and second_half is None:
        # No period tags: fall back to positional order.
        if len(ordered_scores) >= 2:
            return ordered_scores[0], ordered_scores[1]
        return (None, None)
    return first_half, second_half

def _write_csv(path: str, rows: List[Dict[str, Any]], header: Optional[List[str]] = None):
    """
    S3/local-aware CSV writer (uses fs_out).

    Args:
        path: Destination file (local path or s3:// URI).
        rows: One dict per CSV row.
        header: Column names and order. When omitted, the columns are the union
            of all row keys in first-seen order, so the layout is the same on
            every run.

    With no rows, the file holds only the header line (if `header` is given) or
    is created empty.
    """
    fs_out.mkdirs_for_file(path)

    if not rows:
        with fs_out.open_text_write(path) as f:
            if header:
                csv.DictWriter(f, fieldnames=header).writeheader()
        return

    # dict.fromkeys keeps insertion order; a set would give a column order that
    # can differ from one interpreter run to the next.
    cols = header or list(dict.fromkeys(k for r in rows for k in r.keys()))
    with fs_out.open_text_write(path) as f:
        w = csv.DictWriter(f, fieldnames=cols)
        w.writeheader()
        w.writerows(rows)

# ----------------------------
# Parsers (summary-only)
# ----------------------------
def parse_game_info(summary: Dict[str, Any], game_id: str) -> List[Dict[str, Any]]:
    """Build the single game-level row for a summary payload.

    The row holds the game id, the ISO timestamp split at its first "T" into
    date_utc/time_utc, the neutral-site flag, both team names, half scores and
    final scores. Missing pieces come back as "" or None rather than raising.

    Returns:
        A one-element list containing the row dict.
    """
    header = summary.get("header", {}) or {}
    comp = _first_comp(summary)
    comps = _competitors(summary)

    home = _home_comp(comps)
    away = _away_comp(comps)

    # Read the date from `comp`, which already tolerates a missing, null or empty
    # competitions list; indexing header["competitions"][0] directly raised on [].
    dt_iso = comp.get("date") or header.get("date") or ""
    # partition leaves time_utc empty when there is no "T" separator.
    date_utc, _, time_utc = dt_iso.partition("T")

    home_score = _parse_value_to_int(home.get("score"))
    away_score = _parse_value_to_int(away.get("score"))

    home_1h, home_2h = _half_scores(home)
    away_1h, away_2h = _half_scores(away)

    row = {
        "game_id": game_id,
        "date_utc": date_utc,
        "time_utc": time_utc,
        "neutral_site": bool(comp.get("neutralSite")),
        "home_team": _team_name((home.get("team") or {})),
        "away_team": _team_name((away.get("team") or {})),
        "home_1h": home_1h,
        "away_1h": away_1h,
        "home_2h": home_2h,
        "away_2h": away_2h,
        "home_score": home_score,
        "away_score": away_score,
    }
    return [row]

def parse_team_stats(summary: Dict[str, Any], game_id: str) -> List[Dict[str, Any]]:
    """
    Flatten ``summary["boxscore"]["teams"]`` into one row per team entry.

    Each row carries the game id, team identity fields, ``homeAway`` and
    ``displayOrder``, plus one column per statistic (keyed by the stat's
    ``name``, falling back to ``label``) holding its ``displayValue``.
    """
    boxscore = summary.get("boxscore", {}) or {}
    out: List[Dict[str, Any]] = []
    for entry in boxscore.get("teams") or []:
        info = entry.get("team", {}) or {}
        record = {
            "game_id": game_id,
            "team_id": info.get("id"),
            "team": _team_name(info),
            "abbreviation": info.get("abbreviation"),
            "homeAway": entry.get("homeAway"),
            "displayOrder": entry.get("displayOrder"),
        }
        for stat in entry.get("statistics") or []:
            stat_key = stat.get("name") or stat.get("label")
            if not stat_key:
                # Statistics without a name or label cannot become a column.
                continue
            record[stat_key] = stat.get("displayValue")
        out.append(record)
    return out

def parse_player_stats(summary: Dict[str, Any], game_id: str) -> List[Dict[str, Any]]:
    """
    Flatten ``summary["boxscore"]["players"]`` into one row per athlete entry.

    Every row repeats the game/team identity fields, adds the athlete's
    id, name, jersey, position and status flags, then pairs each entry of
    the stats pack's ``keys`` with the athlete's ``stats`` value at the
    same position.
    """
    boxscore = summary.get("boxscore", {}) or {}
    out: List[Dict[str, Any]] = []

    for block in boxscore.get("players") or []:
        info = block.get("team", {}) or {}
        # Fields shared by every athlete of this team block.
        shared = {
            "game_id": game_id,
            "team_id": info.get("id"),
            "team": _team_name(info),
            "abbreviation": info.get("abbreviation"),
        }

        for pack in block.get("statistics") or []:
            stat_keys = pack.get("keys") or []
            for entry in pack.get("athletes") or []:
                person = entry.get("athlete", {}) or {}
                pos = person.get("position") or {}
                record = {
                    **shared,
                    "athlete_id": person.get("id"),
                    "athlete_name": person.get("displayName"),
                    "jersey": person.get("jersey"),
                    "position": pos.get("abbreviation") or pos.get("displayName"),
                    "starter": entry.get("starter"),
                    "didNotPlay": entry.get("didNotPlay"),
                    "ejected": entry.get("ejected"),
                }
                # zip stops at the shorter of keys/values, same as the pairwise loop.
                record.update(zip(stat_keys, entry.get("stats") or []))
                out.append(record)
    return out

def parse_officials(summary: Dict[str, Any], game_id: str) -> List[Dict[str, Any]]:
    """
    Return one ``{"game_id", "official_name"}`` row per official.

    The officials list is taken from the first non-empty location among
    ``summary["officials"]``, ``summary["gameInfo"]["officials"]`` and the
    first competition's ``officials``. Each name is ``fullName``, falling
    back to ``displayName``.
    """
    officials = (
        summary.get("officials")
        # "gameInfo" may be present but null; `or {}` keeps the chained .get safe.
        or (summary.get("gameInfo") or {}).get("officials")
        or (_first_comp(summary) or {}).get("officials")
        or []
    )
    return [
        {"game_id": game_id, "official_name": off.get("fullName") or off.get("displayName")}
        for off in officials
    ]

# ----------------------------
# Public: single-game saver
# ----------------------------
def save_single_game(game_id: str, outdir: str = OUT_DIR, season_tag: str = "2026") -> Dict[str, str]:
    """
    Writes FOUR CSVs using only the summary endpoint to S3/local:
      1) {gid}_game_info.csv
      2) {gid}_team_stats.csv
      3) {gid}_player_stats.csv
      4) {gid}_officials.csv

    Args:
        game_id: Game whose summary is fetched and parsed.
        outdir: Root output location (S3 or local, decided by ``is_s3_path``).
        season_tag: Suffix of the four subfolders (``game-info-<tag>`` etc.).
            Defaults to "2026", the value that used to be hard-coded.

    Returns:
        Mapping from a label for each CSV to the path it was written to.
    """
    summary = _get_summary(game_id)

    game_info_rows  = parse_game_info(summary, game_id)
    team_stats_rows = parse_team_stats(summary, game_id)
    player_rows     = parse_player_stats(summary, game_id)
    officials_rows  = parse_officials(summary, game_id)

    def _dest(folder: str, suffix: str) -> str:
        """Build <outdir>/<folder>-<season_tag>/<game_id>_<suffix>.csv for S3 or local."""
        rel = f"{folder}-{season_tag}/{game_id}_{suffix}.csv"
        return s3_join(outdir, rel) if is_s3_path(outdir) else os.path.join(outdir, rel)

    game_info_path    = _dest("game-info", "game_info")
    team_stats_path   = _dest("team-stats", "team_stats")
    player_stats_path = _dest("player-stats", "player_stats")
    officials_path    = _dest("officials", "officials")

    # game_info and officials have fixed schemas; the stats files take
    # whatever columns the parsed rows contain.
    _write_csv(
        game_info_path,
        game_info_rows,
        header=[
            "game_id","date_utc","time_utc","neutral_site",
            "home_team","away_team",
            "home_1h","away_1h","home_2h","away_2h",
            "home_score","away_score",
        ],
    )
    _write_csv(team_stats_path, team_stats_rows)
    _write_csv(player_stats_path, player_rows)
    _write_csv(officials_path, officials_rows, header=["game_id","official_name"])

    return {
        "game_info_csv": game_info_path,
        "team_stats_csv": team_stats_path,
        "player_stats_csv": player_stats_path,
        "officials_csv": officials_path,
    }

# ----------------------------
# Pull IDs and run
# ----------------------------
def _read_ids_from_folder(ids_dir: str, ids_date: str) -> List[str]:
    """
    Read all game_id CSVs under <ids_dir>/<ids_date>/*.csv from S3/local.
    Expects a 'game_id' column.

    Returns:
        Game ids as strings, in first-seen order with duplicates removed.
        An empty list when no file is found, none is readable, or the
        combined data has no 'game_id' column.
    """
    pattern = (s3_join(ids_dir, ids_date, "*.csv") if is_s3_path(ids_dir)
               else os.path.join(ids_dir, ids_date, "*.csv"))

    files = fs_ids.glob(pattern)
    if not files:
        print(f"⚠️  No CSV files found at {pattern}")
        return []

    dfs = []
    for fpath in files:
        try:
            # Use the fs (not string inspection) to decide how to open
            if fs_ids._is_s3:  # S3-backed
                with fs_ids.open_binary_read(fpath) as fh:
                    dfs.append(pd.read_csv(fh))
            else:              # local
                dfs.append(pd.read_csv(fpath))
        except Exception as e:
            print(f"⚠️  Skipping unreadable file {fpath}: {e}")

    if not dfs:
        return []

    combined = pd.concat(dfs, ignore_index=True)
    if "game_id" not in combined.columns:
        print("⚠️  Combined CSVs missing 'game_id' column.")
        return []

    ids: List[str] = []
    seen = set()
    for raw in combined["game_id"].dropna().tolist():
        # A single NaN makes pandas hold the column as float, so 401812600
        # would otherwise stringify as "401812600.0".
        if isinstance(raw, float) and raw.is_integer():
            raw = int(raw)
        gid = str(raw).strip()
        # The same game can be listed in more than one file; keep the first occurrence.
        if gid and gid not in seen:
            seen.add(gid)
            ids.append(gid)
    return ids

def main():
    """
    Save the four CSVs for every game id found under IDS_DIR/IDS_DATE.

    A failure on one game is printed and the loop moves on; there is a
    half-second pause after every game, successful or not.
    """
    game_ids = _read_ids_from_folder(IDS_DIR, IDS_DATE)
    if not game_ids:
        print("No game IDs to process.")
        return

    for position, gid in enumerate(game_ids, start=1):
        print(f"{position}: saving {gid}")
        try:
            save_single_game(gid, OUT_DIR)
        except Exception as e:
            print(f"❌  {gid}: ERROR {e}")
        time.sleep(0.5)

if __name__ == "__main__":
    main()

1: saving 401812600
2: saving 401828296
3: saving 401826732
4: saving 401823022
5: saving 401822758
6: saving 401812574
7: saving 401828576
8: saving 401823485
9: saving 401823417
10: saving 401823393
11: saving 401823256
12: saving 401820542
13: saving 401818548
14: saving 401813320
15: saving 401813287
16: saving 401813254
17: saving 401811103
18: saving 401826896
19: saving 401825577
20: saving 401819916
21: saving 401819814
22: saving 401828912
23: saving 401823515
24: saving 401828276
25: saving 401824089
26: saving 401823555
27: saving 401827088
28: saving 401826772
29: saving 401823858
30: saving 401823808
31: saving 401827257
32: saving 401829390
33: saving 401819824
34: saving 401824896


In [20]:
import s3fs
import pandas as pd
from pandas.errors import EmptyDataError, ParserError

# Initialize S3 filesystem (uses your AWS credentials)
fs = s3fs.S3FileSystem(anon=False)


def combine_s3_csvs(glob_pattern: str, dest_url: str) -> pd.DataFrame:
    """
    Concatenate every readable, non-empty CSV matching ``glob_pattern`` and
    upload the result to ``dest_url``.

    Args:
        glob_pattern: Bucket-relative glob (no ``s3://`` prefix) passed to ``fs.glob``.
        dest_url: Full ``s3://`` URL of the combined CSV.

    Returns:
        The combined DataFrame that was written.

    Raises:
        RuntimeError: If no file yields any rows.
    """
    files = fs.glob(glob_pattern)
    print(f"Found {len(files)} files")

    parts = []
    for path in files:
        s3_url = f"s3://{path}"
        try:
            df_part = pd.read_csv(s3_url, storage_options={"anon": False})
        except (EmptyDataError, ParserError, UnicodeDecodeError) as e:
            print(f"[skip] Could not read {s3_url}: {e}")
            continue
        if df_part.empty:
            print(f"[skip] Empty file: {s3_url}")
            continue
        parts.append(df_part)

    if not parts:
        raise RuntimeError("No valid CSV files found.")

    combined = pd.concat(parts, ignore_index=True)
    # to_csv keeps its default index=True, so the row index is written as an
    # unnamed first column, exactly as before.
    combined.to_csv(dest_url)
    return combined


# Per-game team stats -> one season file
df = combine_s3_csvs(
    "collegebasketballinsiders/boxscores-2026/team-stats-2026/*.csv",
    "s3://collegebasketballinsiders/box-scores/2026/teams/team-stats.csv",
)

# Per-game game info -> one season file
df = combine_s3_csvs(
    "collegebasketballinsiders/boxscores-2026/game-info-2026/*.csv",
    "s3://collegebasketballinsiders/box-scores/2026/game-info/game-info.csv",
)

Found 463 files
Found 463 files


In [29]:
import pandas as pd
import numpy as np

import warnings
# Silence every warning from here on (no category or module filter is given).
warnings.filterwarnings("ignore")

# Torvik team-ratings file stored under the 2026 prefix.
torvik_df = pd.read_csv("s3://collegebasketballinsiders/torvik/2026/team-ratings.csv")
# Team-name map; the first CSV column becomes the index. Later cells read its
# team_id, torrvik, kenpom and espn_2 columns.
map_df = pd.read_csv("s3://collegebasketballinsiders/general/map.csv", index_col=0)

In [30]:
# Keep the modelling columns, parse the YYYYMMDD date, then attach team_id by
# matching the Torvik team name against the map's "torrvik" column (inner join).
torvik_df = (
    torvik_df[[
        'season', 'date', 'team', 'rank', 'conf', 'games',
        'adj_off_eff', 'adj_def_eff', 'barthag', 'efg_pct', 'efgd_pct', 'tor',
        'tord', 'orb', 'drb', 'ftr', 'ftrd', 'two_pt_pct', 'two_pt_def_pct',
        'three_pt_pct', 'three_pt_def_pct', 'three_pt_rt', 'three_pt_def_rt',
        'adj_tempo', 'wab',
    ]]
    .assign(date=lambda frame: pd.to_datetime(frame["date"], format="%Y%m%d"))
    .merge(map_df[["team_id", "torrvik"]], left_on="team", right_on="torrvik")
    .drop(columns="torrvik")
)

In [35]:
# Source column -> notebook column, in output order, for the KenPom ratings file.
kenpom_renames = {
    'ArchiveDate': 'date', 'Season': 'season', 'TeamID': 'team_id', 'TeamName': 'team',
    'ConfShort': 'conf', 'AdjEM': 'adj_em', 'RankAdjEM': 'adj_em_rank',
    'AdjOE': 'adj_oe', 'RankAdjOE': 'adj_oe_rank',
    'AdjDE': 'adj_def', 'RankAdjDE': 'adj_def_rank',
    'AdjTempo': 'adj_tempo', 'RankAdjTempo': 'adj_tempo_rank',
}
kenpom_df = (
    pd.read_csv("s3://collegebasketballinsiders/kenpom/2026/team-ratings.csv")
    .loc[:, list(kenpom_renames)]
    .rename(columns=kenpom_renames)
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
)

# Same idea for the height/experience file; team_id comes from the name map
# (inner join on the map's "kenpom" name column) and keeps its name.
height_renames = {
    'Season': 'season', 'team_id': 'team_id', 'TeamName': 'team', 'ConfShort': 'conf',
    'AvgHgt': 'avg_height', 'AvgHgtRank': 'avg_height_rank',
    'HgtEff': 'eff_height', 'HgtEffRank': 'eff_height_rank',
    'Hgt5': 'height_5', 'Hgt5Rank': 'height_5_rank',
    'Hgt4': 'height_4', 'Hgt4Rank': 'height_4_rank',
    'Hgt3': 'height_3', 'Hgt3Rank': 'height_3_rank',
    'Hgt2': 'height_2', 'Hgt2Rank': 'height_2_rank',
    'Hgt1': 'height_1', 'Hgt1Rank': 'height_1_rank',
    'Exp': 'exp', 'ExpRank': 'exp_rank',
    'Bench': 'bench', 'BenchRank': 'bench_rank',
    'Continuity': 'continuity', 'RankContinuity': 'continuity_rank',
}
height_df = (
    pd.read_csv("s3://collegebasketballinsiders/kenpom/2026/team-height.csv")
    .merge(map_df[["team_id", "kenpom"]], left_on="TeamName", right_on="kenpom")
    .loc[:, list(height_renames)]
    .rename(columns=height_renames)
)

In [36]:
# KenPom rows are the base; Torvik columns are attached per (date, team_id).
# For the overlapping names the KenPom (_x) side supplies season/conf/team,
# and both tempo columns are kept under source-specific names.
rating_df = (
    kenpom_df.merge(torvik_df, on=['date', 'team_id'], how="left")
    .rename(columns={
        'season_x': 'season',
        'conf_x': 'conf',
        'team_x': 'team',
        'adj_tempo_x': 'adj_tempo_km',
        'adj_tempo_y': 'adj_tempo_tvk',
    })
    .loc[:, [
        'date', 'season', 'team_id', 'team', 'conf', 'adj_em',
        'adj_em_rank', 'adj_oe', 'adj_oe_rank', 'adj_def', 'adj_def_rank',
        'adj_tempo_km', 'adj_tempo_rank', 'rank',
        'games', 'adj_off_eff', 'adj_def_eff', 'barthag', 'efg_pct', 'efgd_pct',
        'tor', 'tord', 'orb', 'drb', 'ftr', 'ftrd', 'two_pt_pct',
        'two_pt_def_pct', 'three_pt_pct', 'three_pt_def_pct', 'three_pt_rt',
        'three_pt_def_rt', 'adj_tempo_tvk', 'wab',
    ]]
)

In [38]:
# Rating columns carried over from the KenPom/Torvik merge.
rating_base_cols = [
    'date', 'season', 'team_id', 'team', 'conf', 'adj_em',
    'adj_em_rank', 'adj_oe', 'adj_oe_rank', 'adj_def', 'adj_def_rank',
    'adj_tempo_km', 'adj_tempo_rank', 'rank', 'games', 'adj_off_eff',
    'adj_def_eff', 'barthag', 'efg_pct', 'efgd_pct', 'tor', 'tord', 'orb',
    'drb', 'ftr', 'ftrd', 'two_pt_pct', 'two_pt_def_pct', 'three_pt_pct',
    'three_pt_def_pct', 'three_pt_rt', 'three_pt_def_rt', 'adj_tempo_tvk',
    'wab',
]
# Season-level roster columns contributed by height_df.
height_feature_cols = [
    'avg_height', 'avg_height_rank',
    'eff_height', 'eff_height_rank', 'height_5', 'height_5_rank',
    'height_4', 'height_4_rank', 'height_3', 'height_3_rank', 'height_2',
    'height_2_rank', 'height_1', 'height_1_rank', 'exp', 'exp_rank',
    'bench', 'bench_rank', 'continuity', 'continuity_rank',
]

# Selecting the base columns first makes the cell safe to re-run: height
# columns from a previous run are dropped before merging again. Leaving
# team/conf out of the right side keeps the rating_df values with no _x/_y suffixes.
rating_df = rating_df[rating_base_cols].merge(
    height_df[['season', 'team_id'] + height_feature_cols],
    on=['season', 'team_id'],
    how="left",
)

In [39]:
def shift_ratings_one_game(rating_df: pd.DataFrame, make_new_cols=False) -> pd.DataFrame:
    """
    Lag every rating/stat column by one row within each (season, team_id)
    group, after ordering rows by date.

    ID/meta columns and the height/exp/bench/continuity families are left
    untouched. The input frame is not modified; the returned copy is sorted
    by (season, team_id, date) and keeps the original index labels.

    With make_new_cols=True the lagged values go into new ``<col>_lag1``
    columns; otherwise they overwrite the originals.
    """
    # Columns that must never be lagged: identifiers plus roster-level families.
    static_cols = {
        "date", "season", "team_id", "team", "conf",
        "avg_height", "avg_height_rank",
        "eff_height", "eff_height_rank",
        "height_5", "height_5_rank",
        "height_4", "height_4_rank",
        "height_3", "height_3_rank",
        "height_2", "height_2_rank",
        "height_1", "height_1_rank",
        "exp", "exp_rank",
        "bench", "bench_rank",
        "continuity", "continuity_rank",
    }

    out = rating_df.copy()
    lag_cols = [name for name in out.columns if name not in static_cols]

    # Dates must be real datetimes so the sort below is chronological.
    if "date" in out.columns:
        out["date"] = pd.to_datetime(out["date"])
    out = out.sort_values(["season", "team_id", "date"])

    previous = out.groupby(["season", "team_id"], dropna=False)[lag_cols].shift(1)

    targets = [f"{name}_lag1" for name in lag_cols] if make_new_cols else lag_cols
    out[targets] = previous
    return out

# Lag the rating columns in place (make_new_cols defaults to False), so each
# row holds the values from the team's previous row within the season.
kp_tvk_features_df = shift_ratings_one_game(rating_df=rating_df)

In [40]:
# Keep only rows that still have an adj_em value.
kp_tvk_features_df = kp_tvk_features_df[kp_tvk_features_df['adj_em'].notna()]

In [68]:
### ESPN GAME STATS
# Slate to score and the season label stamped on it; edit these two values
# instead of hunting for literals further down the cell.
TODAY_DATE_UTC = "2025-11-09"
CURRENT_SEASON = 2026

game_info_df = pd.read_csv("s3://collegebasketballinsiders/box-scores/2026/game-info/game-info.csv")
team_stats_df = pd.read_csv("s3://collegebasketballinsiders/box-scores/2026/teams/team-stats.csv")

# NOTE(review): the filter compares the raw date_utc string, so a game whose
# UTC date differs from its local calendar date lands on the UTC day; confirm
# that is the intended slate definition.
# Both merges are inner joins: games whose home or away name has no espn_2
# match in map_df are dropped.
espn_lookup = map_df[["espn_2", "team_id"]]
todays_games_df = (
    game_info_df[game_info_df['date_utc'] == TODAY_DATE_UTC]
    .merge(espn_lookup, left_on="home_team", right_on="espn_2")
    .assign(home_team_id=lambda frame: frame["team_id"])
    .drop(columns=["espn_2", "team_id"])
    .merge(espn_lookup, left_on="away_team", right_on="espn_2")
    .assign(away_team_id=lambda frame: frame["team_id"], season=CURRENT_SEASON)
    .loc[:, ['game_id', 'season', 'date_utc', 'time_utc', 'neutral_site', 'home_team',
             'home_team_id', 'away_team', 'away_team_id']]
)

In [69]:
# Attach map team ids for the home side, then the away side (inner joins on the
# ESPN team name), trim to the modelling columns, and keep only games that have
# a second-half home score.
espn_name_to_id = map_df[["espn_2", "team_id"]]
game_info_df = (
    game_info_df
    .merge(espn_name_to_id, left_on="home_team", right_on="espn_2")
    .assign(home_team_id=lambda frame: frame["team_id"])
    .drop(columns=["espn_2", "team_id"])
    .merge(espn_name_to_id, left_on="away_team", right_on="espn_2")
    .assign(away_team_id=lambda frame: frame["team_id"])
    .loc[:, ['game_id', 'date_utc', 'time_utc', 'neutral_site',
             'home_team', 'home_team_id', 'away_team', 'away_team_id',
             'home_1h', 'away_1h', 'home_2h', 'away_2h',
             'home_score', 'away_score']]
    .loc[lambda frame: frame['home_2h'].notna()]
)

In [70]:
# Replace the team_id read from the stats file with the map's team_id (the _y
# side of the merge), then keep only the box-score columns used downstream.
team_stats_df = (
    team_stats_df
    .merge(map_df[["espn_2", "team_id"]], left_on="team", right_on="espn_2")
    .assign(team_id=lambda frame: frame["team_id_y"])
    .loc[:, ['game_id', 'team', 'team_id', 'assists', 'defensiveRebounds', 'fouls',
             'totalRebounds',
             'pointsInPaint', 'technicalFouls',
             'offensiveRebounds', 'turnoverPoints', 'steals', 'blocks', 'fastBreakPoints',
             'turnovers']]
)

In [71]:
def season_grab(df, cutoff_month: int = 4, cutoff_day: int = 21, min_season: int = 2021):
    """
    Assign a season year to every row based on date_utc.

    A season runs from the cutoff day (April 21 by default) of one year up to,
    but not including, the cutoff day of the next, and is labelled with the
    later year. Example: 2024-04-21 through 2025-04-20 is season 2025.
    Anything earlier than the first tracked season is labelled ``min_season``.

    The season is computed from the date, so later seasons need no new
    hard-coded boundary.

    Args:
        df: Frame with a ``date_utc`` column parseable by ``pd.to_datetime``.
        cutoff_month: Month of the season rollover.
        cutoff_day: Day of month of the season rollover (inclusive start).
        min_season: Lowest season label ever returned.

    Returns:
        A copy of ``df`` with ``date_utc`` as datetime and an integer ``season`` column.

    Raises:
        ValueError: If any date is missing, since the season cannot be cast to int.
    """
    out = df.copy()
    out['date_utc'] = pd.to_datetime(out['date_utc'])

    dates = out['date_utc']
    # True for dates on or after the cutoff day within their calendar year.
    rolls_over = (dates.dt.month > cutoff_month) | (
        (dates.dt.month == cutoff_month) & (dates.dt.day >= cutoff_day)
    )
    out['season'] = (
        (dates.dt.year + rolls_over.astype(int))
        .clip(lower=min_season)
        .astype(int)
    )
    return out
# Adds the season column and converts date_utc to datetime on a copy of the frame.
game_info_df = season_grab(game_info_df)


In [83]:
import pandas as pd
import numpy as np

# ======================
# Configuration: box-score stats and rolling windows
# ======================
# Box-score stat columns the team-stats builder looks for in team_stats_df;
# only the ones actually present there are used.
BASE_STATS = [
    'assists', 'defensiveRebounds', 'fouls', 'totalRebounds',
    'pointsInPaint', 'technicalFouls', 'offensiveRebounds',
    'turnoverPoints', 'steals', 'blocks', 'fastBreakPoints', 'turnovers'
]
# Rolling-average window lengths, counted in prior games per team and season.
ROLL_WINDOWS = [1, 3, 5, 10]

# --- Robust datetime combiner (handles "HHZ", "HH:MMZ", "HH:MM:SSZ" or without Z):
def _combine_utc_datetime(date_series: pd.Series, time_series: pd.Series) -> pd.Series:
    d = date_series.astype(str).str.strip()
    t = time_series.astype(str).str.strip().str.upper()
    # default blank -> noon UTC
    t = t.mask((t.eq("")) | t.isna(), "12:00:00Z")
    # strip Z, normalize to HH:MM:SS, then add Z back
    s = pd.Series(t.str.replace("Z", "", regex=False))
    s = s.where(~s.str.match(r"^\d{1,2}$"), s + ":00:00")   # HH -> HH:00:00
    s = s.where(~s.str.match(r"^\d{1,2}:\d{2}$"), s + ":00")# HH:MM -> HH:MM:00
    t_full = s.astype(str) + "Z"
    return pd.to_datetime(d + " " + t_full, errors="coerce", utc=True)

# ======================
# Feature builders
# ======================

def build_team_stats_features_no_dup(
    game_info_df: pd.DataFrame,
    team_stats_df: pd.DataFrame,
    windows: list[int] = ROLL_WINDOWS,
    local_time_zone: str | None = None,
    agg: str | dict = "sum",
) -> pd.DataFrame:
    """
    Attach per-game box-score stats and leakage-safe rolling averages of them
    to each game row.

    Steps:
      1. Collapse team_stats_df to one row per (game_id, team_id) using ``agg``.
      2. Join those stats onto each game for the home and the away side.
      3. Reshape to one row per (game, team) with the team's own stats and the
         opponent's ("allowed_*") stats.
      4. Per (team_id, season), ordered by game time, shift by one game and take
         rolling means over each window (min_periods=1), so a game's features
         only use earlier games.
      5. Join the rolling columns back as ``home_ra{w}_*`` / ``away_ra{w}_*``.

    Args:
        game_info_df: One row per game; must contain game_id, date_utc,
            time_utc, season and the home/away team name and id columns.
        team_stats_df: Box-score rows; must contain game_id and team_id.
        windows: Rolling window lengths, in games.
        local_time_zone: If given, game_datetime_local is converted to it;
            otherwise it equals the UTC datetime.
        agg: Aggregation name applied to every stat, or a per-stat dict
            (stats missing from the dict use "sum").

    Returns:
        The game rows with datetime columns, per-side stats and rolling
        features. If none of BASE_STATS exists in team_stats_df, only the
        datetime columns are added.

    Raises:
        KeyError: When required columns are missing from either input.
    """
    req_game = ['game_id','date_utc','time_utc','season',
                'home_team','home_team_id','away_team','away_team_id']
    missing = [c for c in req_game if c not in game_info_df.columns]
    if missing:
        raise KeyError(f"Missing in game_info_df: {missing}")

    if not {'game_id','team_id'}.issubset(team_stats_df.columns):
        raise KeyError("team_stats_df must include 'game_id' and 'team_id'.")

    # Datetime columns are needed by both the early-return and the full path.
    gif = game_info_df.copy()
    gif["game_datetime_utc"] = _combine_utc_datetime(gif["date_utc"], gif["time_utc"])
    gif["game_datetime_local"] = (
        gif["game_datetime_utc"].dt.tz_convert(local_time_zone)
        if local_time_zone else gif["game_datetime_utc"]
    )

    stats = [s for s in BASE_STATS if s in team_stats_df.columns]
    if not stats:
        return gif

    # One row per (game, team), so the joins below cannot fan out.
    agg_map = {s: agg if isinstance(agg, str) else agg.get(s, "sum") for s in stats}
    ts_clean = (
        team_stats_df
        .groupby(["game_id","team_id"], as_index=False)
        .agg(agg_map)
    )
    assert not ts_clean.duplicated(["game_id","team_id"]).any(), "Aggregation failed to ensure uniqueness."

    home_stats = ts_clean.rename(columns={"team_id":"home_team_id", **{s: f"{s}_home" for s in stats}})
    away_stats = ts_clean.rename(columns={"team_id":"away_team_id", **{s: f"{s}_away" for s in stats}})
    merged = (
        gif.merge(home_stats, on=["game_id","home_team_id"], how="left")
           .merge(away_stats, on=["game_id","away_team_id"], how="left")
    )

    # Long format: each game contributes a home row and an away row, each with
    # the team's own stats and the opponent's stats as "allowed_*".
    home_payload = {
        'game_id': merged['game_id'],
        'season': merged['season'],
        'game_dt': merged['game_datetime_utc'],
        'team_id': merged['home_team_id'],
        'side': 'home'
    }
    away_payload = {
        'game_id': merged['game_id'],
        'season': merged['season'],
        'game_dt': merged['game_datetime_utc'],
        'team_id': merged['away_team_id'],
        'side': 'away'
    }
    for s in stats:
        home_payload[s] = merged.get(f"{s}_home")
        home_payload[f"allowed_{s}"] = merged.get(f"{s}_away")
        away_payload[s] = merged.get(f"{s}_away")
        away_payload[f"allowed_{s}"] = merged.get(f"{s}_home")

    long_team = pd.concat([pd.DataFrame(home_payload), pd.DataFrame(away_payload)], ignore_index=True)
    long_team = long_team.sort_values(['team_id','season','game_dt'], kind='mergesort')
    g = long_team.groupby(['team_id','season'], group_keys=False)
    group_keys = [long_team['team_id'], long_team['season']]

    # Collect the rolling columns and attach them in one concat. This avoids
    # copying the whole frame for every (column, window) pair and inserting
    # columns one at a time.
    rolled = {}
    for s in stats:
        for col in (s, f"allowed_{s}"):
            shifted = g[col].shift(1)  # use only past games
            by_team = shifted.groupby(group_keys)
            for w in windows:
                rolled[f"ra{w}_{col}"] = (
                    by_team.rolling(window=w, min_periods=1)
                           .mean()
                           # drop the two group-key levels to get back the row index
                           .reset_index(level=[0,1], drop=True)
                )
    long_team = pd.concat(
        [long_team, pd.DataFrame(rolled, index=long_team.index)],
        axis=1,
    )

    roll_cols = [c for c in long_team.columns if c.startswith("ra")]
    home_feats = (
        long_team[long_team['side']=='home'][['game_id','team_id'] + roll_cols]
        .rename(columns={'team_id':'home_team_id', **{c: f"home_{c}" for c in roll_cols}})
    )
    away_feats = (
        long_team[long_team['side']=='away'][['game_id','team_id'] + roll_cols]
        .rename(columns={'team_id':'away_team_id', **{c: f"away_{c}" for c in roll_cols}})
    )

    out = (
        merged.merge(home_feats, on=['game_id','home_team_id'], how='left')
              .merge(away_feats, on=['game_id','away_team_id'], how='left')
    )

    # Column order: identifiers, rolling features, then everything else.
    id_cols = ['game_id','date_utc','time_utc','season','home_team','home_team_id','away_team','away_team_id']
    roll_outs = [c for c in out.columns if c.startswith('home_ra') or c.startswith('away_ra')]
    ordered = [c for c in id_cols if c in out.columns] + roll_outs
    ordered += [c for c in out.columns if c not in ordered]
    return out[ordered]

# --- Game-level rolling features (points/totals/margins) plus calendar/time features:
# Columns build_cbb_features_multiroll requires on its input; a KeyError lists
# any that are missing. The total/margin columns may be entirely NaN, in which
# case the builder recomputes them from the score columns.
REQUIRED_COLS = [
    "game_id", "date_utc", "time_utc", "neutral_site",
    "home_team", "home_team_id", "away_team", "away_team_id",
    "home_1h", "away_1h", "home_2h", "away_2h",
    "home_score", "away_score", "season",
    "total", "1h_total", "2h_total",
    "margin", "1h_margin", "2h_margin",
]

def _to_flag(x) -> int:
    if pd.isna(x):
        return 0
    if isinstance(x, (int, float)) and not pd.isna(x):
        return int(x != 0)
    s = str(x).strip().lower()
    return int(s in {"1","true","t","y","yes","neutral","neutral_site"})

def build_cbb_features_multiroll(
    games_df: pd.DataFrame,
    windows: list[int] = [1, 3, 5, 10],
    local_time_zone: str | None = None,
) -> pd.DataFrame:
    """
    Build game-level features: calendar/time fields, rest days, and rolling
    averages of points, points allowed, margins and totals for both teams.

    Rolling averages are taken per (team_id, season) in game-time order on
    values shifted by one game, so a game's own result never feeds its features.

    Args:
        games_df: One row per game containing every column in REQUIRED_COLS.
        windows: Rolling window lengths, in games. The default list is not mutated.
        local_time_zone: If given, calendar fields are derived from the game
            time converted to this zone; otherwise from the UTC time.

    Returns:
        The input rows plus datetime, calendar, rest-day, neutral-flag and
        ``home_ra_*`` / ``away_ra_*`` columns, ordered identifiers first.

    Raises:
        KeyError: If any REQUIRED_COLS column is missing.
    """
    missing = [c for c in REQUIRED_COLS if c not in games_df.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    df = games_df.copy()
    df["game_datetime_utc"] = _combine_utc_datetime(df["date_utc"], df["time_utc"])
    df["game_datetime_local"] = (
        df["game_datetime_utc"].dt.tz_convert(local_time_zone)
        if local_time_zone else df["game_datetime_utc"]
    )

    # Calendar features are read from the local datetime column.
    gdl = "game_datetime_local"
    df["game_dow"]        = df[gdl].dt.weekday
    df["game_month"]      = df[gdl].dt.month
    df["game_dayofyear"]  = df[gdl].dt.dayofyear
    df["game_weekofyear"] = df[gdl].dt.isocalendar().week.astype(int)
    df["game_hour"]       = df[gdl].dt.hour
    df["is_weekend"]      = df["game_dow"].isin([5, 6]).astype(int)
    df["neutral_site_flag"] = df["neutral_site"].apply(_to_flag).astype(int)

    # Totals/margins are recomputed only when a column is entirely NaN. The
    # `not in df` half cannot trigger here because REQUIRED_COLS was checked above.
    if "1h_total" not in df or df["1h_total"].isna().all():
        df["1h_total"] = df["home_1h"] + df["away_1h"]
    if "2h_total" not in df or df["2h_total"].isna().all():
        df["2h_total"] = df["home_2h"] + df["away_2h"]
    if "total" not in df or df["total"].isna().all():
        df["total"] = df["home_score"] + df["away_score"]
    if "1h_margin" not in df or df["1h_margin"].isna().all():
        df["1h_margin"] = df["home_1h"] - df["away_1h"]
    if "2h_margin" not in df or df["2h_margin"].isna().all():
        df["2h_margin"] = df["home_2h"] - df["away_2h"]
    if "margin" not in df or df["margin"].isna().all():
        df["margin"] = df["home_score"] - df["away_score"]

    # Long format: one row per (game, team). pts_* are the team's own points,
    # opp_* the opponent's.
    home_side = pd.DataFrame({
        "game_id":     df["game_id"],
        "season":      df["season"],
        "game_dt":     df["game_datetime_utc"],
        "team":        df["home_team"],
        "team_id":     df["home_team_id"],
        "opponent":    df["away_team"],
        "opponent_id": df["away_team_id"],
        "side":        "home",
        "pts_1h":      df["home_1h"],
        "pts_2h":      df["home_2h"],
        "pts_g":       df["home_score"],
        "opp_1h":      df["away_1h"],
        "opp_2h":      df["away_2h"],
        "opp_g":       df["away_score"],
        "is_neutral":  df["neutral_site_flag"].astype(int),
    })
    away_side = pd.DataFrame({
        "game_id":     df["game_id"],
        "season":      df["season"],
        "game_dt":     df["game_datetime_utc"],
        "team":        df["away_team"],
        "team_id":     df["away_team_id"],
        "opponent":    df["home_team"],
        "opponent_id": df["home_team_id"],
        "side":        "away",
        "pts_1h":      df["away_1h"],
        "pts_2h":      df["away_2h"],
        "pts_g":       df["away_score"],
        "opp_1h":      df["home_1h"],
        "opp_2h":      df["home_2h"],
        "opp_g":       df["home_score"],
        "is_neutral":  df["neutral_site_flag"].astype(int),
    })
    long_team = pd.concat([home_side, away_side], ignore_index=True)

    # Margins and totals from the team's point of view. Note there is no
    # second-half total here, only first-half and full-game.
    long_team["mgn_1h"] = long_team["pts_1h"] - long_team["opp_1h"]
    long_team["mgn_2h"] = long_team["pts_2h"] - long_team["opp_2h"]
    long_team["mgn_g"]  = long_team["pts_g"]  - long_team["opp_g"]
    long_team["tot_1h"] = long_team["pts_1h"] + long_team["opp_1h"]
    long_team["tot_g"]  = long_team["pts_g"]  + long_team["opp_g"]

    # Stable sort, so rows with equal keys keep their concat order.
    long_team = long_team.sort_values(["team_id", "season", "game_dt"], kind="mergesort")

    # days since last (this will compute rest for today's rows too)
    long_team["days_since_last_game"] = (
        long_team.groupby(["team_id", "season"], group_keys=False)["game_dt"]
                 .diff()
                 .dt.total_seconds()
                 .div(86400.0)
    )

    # Source column -> base name of its rolling-average columns.
    base_feats = {
        "pts_1h": "ra_pts_1h",
        "pts_2h": "ra_pts_2h",
        "pts_g":  "ra_pts_g",
        "opp_1h": "ra_allowed_1h",
        "opp_2h": "ra_allowed_2h",
        "opp_g":  "ra_allowed_g",
        "mgn_1h": "ra_mgn_1h",
        "mgn_2h": "ra_mgn_2h",
        "mgn_g":  "ra_mgn_g",
        "tot_1h": "ra_tot_1h",
        "tot_g":  "ra_tot_g",
    }
    g = long_team.groupby(["team_id", "season"], group_keys=False)
    for src_col, base_name in base_feats.items():
        shifted = g[src_col].shift(1)  # <-- leakage-safe
        # The shifted values are stored as a temporary column so they can be
        # re-grouped for the rolling mean.
        long_team[f"__shifted__{src_col}"] = shifted
        for w in windows:
            long_team[f"{base_name}_{w}"] = (
                long_team.groupby(["team_id", "season"], group_keys=False)[f"__shifted__{src_col}"]
                        .rolling(window=w, min_periods=1)
                        .mean()
                        # drop the two group-key levels so the result aligns on the row index
                        .reset_index(level=[0,1], drop=True)
            )
    long_team.drop(columns=[c for c in long_team.columns if c.startswith("__shifted__")], inplace=True)

    roll_cols = [c for c in long_team.columns if c.startswith("ra_")]
    keep_cols = ["game_id", "team", "team_id", "side", "days_since_last_game", "is_neutral"] + roll_cols

    # Split back into per-side feature tables with home_/away_ prefixes.
    home_feats = (
        long_team[long_team["side"] == "home"][keep_cols]
        .drop(columns=["side"])
        .rename(columns={
            "team": "home_team",
            "team_id": "home_team_id",
            "days_since_last_game": "home_days_since_last",
            "is_neutral": "home_is_neutral",
            **{c: f"home_{c}" for c in roll_cols}
        })
    )
    away_feats = (
        long_team[long_team["side"] == "away"][keep_cols]
        .drop(columns=["side"])
        .rename(columns={
            "team": "away_team",
            "team_id": "away_team_id",
            "days_since_last_game": "away_days_since_last",
            "is_neutral": "away_is_neutral",
            **{c: f"away_{c}" for c in roll_cols}
        })
    )

    # Both *_is_neutral columns come from the same neutral_site_flag.
    out = (
        df.merge(home_feats, on=["game_id", "home_team", "home_team_id"], how="left")
          .merge(away_feats, on=["game_id", "away_team", "away_team_id"], how="left")
    )

    # Column order: identifiers, calendar fields, spacing, rolling features, then the rest.
    id_cols   = [
        "game_id","date_utc","time_utc","game_datetime_utc","game_datetime_local",
        "season","neutral_site","neutral_site_flag",
        "home_team","home_team_id","away_team","away_team_id"
    ]
    time_cols = ["game_dow","game_month","game_dayofyear","game_weekofyear","game_hour","is_weekend"]
    spacing   = ["home_days_since_last","away_days_since_last","home_is_neutral","away_is_neutral"]
    roll_outs = [c for c in out.columns if c.startswith("home_ra_") or c.startswith("away_ra_")]
    ordered = [c for c in id_cols + time_cols + spacing + roll_outs if c in out.columns]
    ordered += [c for c in out.columns if c not in ordered]
    return out[ordered]

# ============================================
# Minimal-tweak inference via simple append
# ============================================
def build_inference_features_via_append(
    todays_games_df: pd.DataFrame,
    hist_games_df: pd.DataFrame,
    team_stats_df: pd.DataFrame,
    local_time_zone: str | None = "America/New_York",
) -> pd.DataFrame:
    """
    Build inference features for today's games by appending them to history.

    Today's games (with target columns set to NaN) are concatenated after the
    historical games, the same builders used elsewhere in this notebook
    (``build_cbb_features_multiroll`` and ``build_team_stats_features_no_dup``)
    are run on the combined frame, and only the rows whose ``game_id`` belongs
    to today's games are returned. No asof merges are needed.

    To keep the promise of one row per today's game, two guards are applied:
    duplicate ``game_id`` rows in ``todays_games_df`` are dropped, and any
    history row that carries one of today's ``game_id`` values is removed
    before concatenation. Without the second guard a game present in both
    frames would appear twice in the combined schedule, so its recorded result
    could feed the rolling features of its own placeholder row and the final
    merge would multiply rows.

    Parameters
    ----------
    todays_games_df : pd.DataFrame
        Schedule rows to score. Must contain ``game_id``, ``season``,
        ``date_utc``, ``time_utc``, ``neutral_site``, ``home_team``,
        ``home_team_id``, ``away_team`` and ``away_team_id``; other columns
        are ignored.
    hist_games_df : pd.DataFrame
        Historical games. It is copied, not modified.
    team_stats_df : pd.DataFrame
        Passed through unchanged to ``build_team_stats_features_no_dup``.
    local_time_zone : str | None
        Passed through unchanged to both builders.

    Returns
    -------
    pd.DataFrame
        Rows for today's games with the game-level features plus the
        ``home_ra*`` / ``away_ra*`` columns of the team-stats builder, sorted
        by ``game_datetime_utc`` with a fresh index.

    Raises
    ------
    KeyError
        If ``todays_games_df`` lacks any of the required schedule columns.
    """
    # 1) Reduce today's schedule to the columns the training code expects,
    #    keeping a single row per game_id.
    schedule_cols = ['game_id', 'season', 'date_utc', 'time_utc', 'neutral_site',
                     'home_team', 'home_team_id', 'away_team', 'away_team_id']
    today = todays_games_df[schedule_cols].drop_duplicates(subset='game_id').copy()

    # Add target columns (NaN) so build_cbb_features_multiroll can run without error.
    add_cols = ["home_1h", "away_1h", "home_2h", "away_2h", "home_score", "away_score",
                "total", "1h_total", "2h_total", "margin", "1h_margin", "2h_margin"]
    for c in add_cols:
        if c not in today.columns:
            today[c] = np.nan

    today_ids = set(today['game_id'].tolist())

    # 2) Align history to the required columns (the caller's frame is untouched).
    hist = hist_games_df.copy()
    for c in ["total", "1h_total", "2h_total", "margin", "1h_margin", "2h_margin"]:
        if c not in hist.columns:
            hist[c] = np.nan

    # Drop history rows for games that are being scored now, so each of today's
    # games occurs exactly once in the combined schedule.
    if 'game_id' in hist.columns:
        hist = hist[~hist['game_id'].isin(today_ids)]

    # 3) Concatenate history + today (today last so time order is natural).
    combined = pd.concat([hist, today], ignore_index=True, sort=False)

    # 4) Build game-level rolling/time features on the combined frame.
    game_feats_all = build_cbb_features_multiroll(
        combined, windows=ROLL_WINDOWS, local_time_zone=local_time_zone
    )

    # 5) Build team boxscore rolling features on the combined schedule.
    # (Original author's note: team_stats_df only contains historical rows, so
    # today's games simply won't match and get NaN FOR/AGAINST values.)
    ts_feats_all = build_team_stats_features_no_dup(
        game_info_df=combined[['game_id', 'date_utc', 'time_utc', 'season',
                               'home_team', 'home_team_id', 'away_team', 'away_team_id']],
        team_stats_df=team_stats_df,
        windows=ROLL_WINDOWS,
        local_time_zone=local_time_zone
    )

    # 6) Extract today's rows (by game_id) from both feature sets.
    gf_today = game_feats_all[game_feats_all['game_id'].isin(today_ids)].copy()
    ts_today = ts_feats_all[ts_feats_all['game_id'].isin(today_ids)].copy()

    # Keep only the rolling cols from the team-stats frame (avoid duplicate id/time cols).
    ts_roll_cols = [c for c in ts_today.columns if c.startswith('home_ra') or c.startswith('away_ra')]
    out = gf_today.merge(
        ts_today[['game_id', 'home_team_id', 'away_team_id'] + ts_roll_cols],
        on=['game_id', 'home_team_id', 'away_team_id'],
        how='left'
    )

    # 7) Return today's games with all rolling/time features, in kickoff order.
    out = out.sort_values('game_datetime_utc').reset_index(drop=True)
    return out


In [91]:
# Build features for today's slate, keeping a single row per (home, away) pairing.
game_features_df = build_inference_features_via_append(
    todays_games_df=todays_games_df,
    hist_games_df=game_info_df,
    team_stats_df=team_stats_df,
).drop_duplicates(subset=["home_team", "away_team"])

# Restrict the result to game ids that appear in today's schedule.
game_features_df = game_features_df.loc[
    game_features_df["game_id"].isin(list(todays_games_df["game_id"]))
]

In [94]:
def _game_feature_columns() -> List[str]:
    """
    Return the ordered list of columns kept from ``game_features_df``.

    The rolling-average names follow two fixed patterns, so they are generated
    instead of being spelled out one by one:

    * ``<side>_ra_<metric>_<window>``          (TEAM GAME INFO FEATURES)
    * ``<side>_ra<window>_[allowed_]<stat>``   (TEAM GAME STATS FEATURES)

    with ``side`` in (home, away) and ``window`` in (1, 3, 5, 10).
    """
    windows = (1, 3, 5, 10)
    sides = ("home", "away")

    ## GAME INFO
    game_info = [
        "game_id", "date_utc", "season", "neutral_site",
        "home_team", "home_team_id", "away_team", "away_team_id",
    ]
    ## TARGETS
    targets = [
        "home_1h", "away_1h", "home_2h", "away_2h", "home_score", "away_score",
        "total", "1h_total", "2h_total", "margin", "1h_margin", "2h_margin",
    ]
    ## GAME INFO FEATURES
    game_info_features = [
        "neutral_site_flag", "game_dow", "game_month", "game_dayofyear",
        "game_weekofyear", "game_hour", "is_weekend",
    ]

    ## TEAM GAME INFO FEATURES
    days_since_last = ["home_days_since_last", "away_days_since_last"]
    # Note: the list has tot_1h and tot_g but no tot_2h.
    score_metrics = [
        "pts_1h", "pts_2h", "pts_g",
        "allowed_1h", "allowed_2h", "allowed_g",
        "mgn_1h", "mgn_2h", "mgn_g",
        "tot_1h", "tot_g",
    ]
    score_rolls = [
        f"{side}_ra_{metric}_{window}"
        for side in sides
        for metric in score_metrics
        for window in windows
    ]

    ## TEAM GAME STATS FEATURES
    box_stats = [
        "assists", "defensiveRebounds", "fouls", "totalRebounds",
        "pointsInPaint", "technicalFouls", "offensiveRebounds",
        "turnoverPoints", "steals", "blocks", "fastBreakPoints", "turnovers",
    ]
    # For every stat: the plain columns first, then the "allowed_" columns.
    box_rolls = [
        f"{side}_ra{window}_{prefix}{stat}"
        for side in sides
        for stat in box_stats
        for prefix in ("", "allowed_")
        for window in windows
    ]

    return (
        game_info
        + targets
        + game_info_features
        + days_since_last
        + score_rolls
        + box_rolls
    )


game_features = game_features_df[_game_feature_columns()]


In [97]:
# Keep one ratings row per team: after sorting by date (then team_id), the last
# row of each team_id is retained.
kp_tvk_features_df = (
    kp_tvk_features_df
    .sort_values(by=["date", "team_id"])
    .drop_duplicates(subset=["team_id"], keep="last")
)

In [99]:
def merge_team_ratings(features_df: pd.DataFrame, team_ratings_df: pd.DataFrame) -> pd.DataFrame:
    """
    Attach team ratings to every game row, once for each side.

    Every column of ``team_ratings_df`` is suffixed with ``_home`` and
    left-joined on ``home_team_id == team_id_home``; the same is then done with
    ``_away`` on ``away_team_id == team_id_away``. The two suffixed join keys
    are removed from the result.

    Parameters
    ----------
    features_df : pd.DataFrame
        Game rows containing ``home_team_id`` and ``away_team_id``.
    team_ratings_df : pd.DataFrame
        Ratings rows containing ``team_id``.

    Returns
    -------
    pd.DataFrame
        ``features_df`` columns followed by the ``*_home`` and ``*_away``
        rating columns; unmatched teams get NaN ratings (left join).
    """
    result = features_df
    for side in ("home", "away"):
        side_ratings = team_ratings_df.add_suffix(f"_{side}")
        result = result.merge(
            side_ratings,
            how="left",
            left_on=f"{side}_team_id",
            right_on=f"team_id_{side}",
        )

    # The suffixed keys only duplicate home_team_id / away_team_id.
    join_keys = [key for key in ("team_id_home", "team_id_away") if key in result.columns]
    return result.drop(columns=join_keys)

# Join the per-team ratings onto today's game features (home side, then away side).
inference_df = merge_team_ratings(
    features_df=game_features,
    team_ratings_df=kp_tvk_features_df,
)

In [100]:
def _inference_columns() -> List[str]:
    """
    Return the ordered list of columns kept in ``inference_df``.

    Patterned names are generated rather than listed one by one:

    * ``<side>_ra_<metric>_<window>``          (TEAM GAME INFO FEATURES)
    * ``<side>_ra<window>_[allowed_]<stat>``   (TEAM GAME STATS FEATURES)
    * ``<rating>_<side>``                      (RATING FEATURES)

    with ``side`` in (home, away) and ``window`` in (1, 3, 5, 10).
    """
    windows = (1, 3, 5, 10)
    sides = ("home", "away")

    ## GAME INFO
    game_info = [
        "game_id", "date_utc", "season", "neutral_site",
        "home_team", "home_team_id", "conf_home",
        "away_team", "away_team_id", "conf_away",
    ]
    ## GAME INFO FEATURES
    game_info_features = [
        "neutral_site_flag", "game_dow", "game_month", "game_dayofyear",
        "game_weekofyear", "game_hour", "is_weekend",
    ]

    ## TEAM GAME INFO FEATURES
    days_since_last = ["home_days_since_last", "away_days_since_last"]
    # Note: the list has tot_1h and tot_g but no tot_2h.
    score_metrics = [
        "pts_1h", "pts_2h", "pts_g",
        "allowed_1h", "allowed_2h", "allowed_g",
        "mgn_1h", "mgn_2h", "mgn_g",
        "tot_1h", "tot_g",
    ]
    score_rolls = [
        f"{side}_ra_{metric}_{window}"
        for side in sides
        for metric in score_metrics
        for window in windows
    ]

    ## TEAM GAME STATS FEATURES
    box_stats = [
        "assists", "defensiveRebounds", "fouls", "totalRebounds",
        "pointsInPaint", "technicalFouls", "offensiveRebounds",
        "turnoverPoints", "steals", "blocks", "fastBreakPoints", "turnovers",
    ]
    # For every stat: the plain columns first, then the "allowed_" columns.
    box_rolls = [
        f"{side}_ra{window}_{prefix}{stat}"
        for side in sides
        for stat in box_stats
        for prefix in ("", "allowed_")
        for window in windows
    ]

    ## RATING FEATURES
    rating_names = [
        "adj_em", "adj_em_rank", "adj_oe", "adj_oe_rank",
        "adj_def", "adj_def_rank", "adj_tempo_km", "adj_tempo_rank",
        "rank", "games", "adj_off_eff", "adj_def_eff", "barthag",
        "efg_pct", "efgd_pct", "tor", "tord", "orb", "drb", "ftr", "ftrd",
        "two_pt_pct", "two_pt_def_pct", "three_pt_pct", "three_pt_def_pct",
        "three_pt_rt", "three_pt_def_rt", "adj_tempo_tvk", "wab",
        "avg_height", "avg_height_rank", "eff_height", "eff_height_rank",
        "height_5", "height_5_rank", "height_4", "height_4_rank",
        "height_3", "height_3_rank", "height_2", "height_2_rank",
        "height_1", "height_1_rank", "exp", "exp_rank",
        "bench", "bench_rank", "continuity", "continuity_rank",
    ]
    # All home ratings first, then all away ratings.
    ratings = [f"{name}_{side}" for side in sides for name in rating_names]

    return (
        game_info
        + game_info_features
        + days_since_last
        + score_rolls
        + box_rolls
        + ratings
    )


inference_df = inference_df[_inference_columns()]

In [111]:
import os
import json
import numpy as np
import pandas as pd
from joblib import load as joblib_load
from typing import List

# Root directory of the saved artifacts; _load_artifact looks for
# <OUT_DIR>/<target>/<target>_final_model.joblib.
OUT_DIR = "../predicting/training_reports"

# One artifact is loaded and one prediction column is produced per target.
TARGETS: List[str] = [
    "home_1h", "away_1h",
    "home_2h", "away_2h",
    "home_score", "away_score",
    "total", "1h_total", "2h_total",
    "margin", "1h_margin", "2h_margin",
]

# Identifier columns carried into the final predictions view.
ID_VIEW = [
    "game_id",
    "date_utc",
    "home_team",
    "home_team_id",
    "away_team",
    "away_team_id",
]

# Columns that should never be fed as raw features (per the original note,
# this mirrors the training setup).
DATE_COL = "date_utc"
RAW_TIME_COLS_TO_EXCLUDE = [
    DATE_COL,
    "season",
    "game_datetime_utc",
    "game_datetime_local",
    "date_home",
    "date_away",
]
ID_COLS = ["game_id", "home_team_id", "away_team_id"]

def _robust_joblib_load(path: str):
    """
    Load a joblib artifact, retrying once after shimming a private sklearn name.

    If the first load raises ``AttributeError``, the function makes sure
    ``sklearn.compose._column_transformer`` exposes ``_RemainderColsList``
    (installing an empty ``list`` subclass when the attribute is absent) and
    loads again. Should the shim or the second load fail for any reason, the
    AttributeError from the first attempt is the one that is raised.
    """
    try:
        return joblib_load(path)
    except AttributeError as first_error:
        try:
            from sklearn.compose import _column_transformer as ct_module

            # Per the original note: newer sklearn versions define this private
            # class, older ones do not.
            shim_needed = not hasattr(ct_module, "_RemainderColsList")
            if shim_needed:
                class _RemainderColsList(list):
                    pass

                ct_module._RemainderColsList = _RemainderColsList

            # Second and final attempt.
            return joblib_load(path)
        except Exception:
            # Surface the first failure rather than whatever the retry raised.
            raise first_error

def _load_artifact(target: str):
    """
    Load the saved artifact for ``target`` and check its required keys.

    The file is expected at ``<OUT_DIR>/<target>/<target>_final_model.joblib``.

    Raises
    ------
    FileNotFoundError
        If that file does not exist.
    ValueError
        If the loaded object lacks 'preprocessor', 'model' or 'features'
        (the first missing key, in that order, is reported).
    """
    artifact_path = os.path.join(OUT_DIR, target, f"{target}_final_model.joblib")
    if not os.path.exists(artifact_path):
        raise FileNotFoundError(f"Missing artifact for {target}: {artifact_path}")

    artifact = _robust_joblib_load(artifact_path)

    required_keys = ("preprocessor", "model", "features")
    first_missing = next((key for key in required_keys if key not in artifact), None)
    if first_missing is not None:
        raise ValueError(f"Artifact for {target} missing key: '{first_missing}'")
    return artifact

def _ensure_columns(df: pd.DataFrame, expected_cols: List[str]) -> pd.DataFrame:
    """Add missing expected columns as NaN, drop extras, enforce expected order."""
    X = df.copy()
    for c in expected_cols:
        if c not in X.columns:
            X[c] = np.nan
    return X[expected_cols]

def predict_with_artifacts(inference_df: pd.DataFrame) -> pd.DataFrame:
    """
    Score ``inference_df`` with every saved per-target artifact.

    For each name in ``TARGETS`` the artifact is loaded, the frame is aligned
    to the artifact's 'features' list, passed through its 'preprocessor', and
    the 'model' prediction is stored as ``pred_<target>``.

    Returns
    -------
    pd.DataFrame
        The ``ID_VIEW`` columns plus one ``pred_<target>`` column per target,
        sorted by ``date_utc`` when that sort succeeds.

    Raises
    ------
    KeyError
        If any ``ID_VIEW`` column is absent from ``inference_df``.
    ValueError
        If a model exposes ``n_features_in_`` and it differs from the width of
        the transformed matrix.
    """
    # The final view cannot be built without the identifier columns.
    absent_ids = [col for col in ID_VIEW if col not in inference_df.columns]
    if absent_ids:
        raise KeyError(f"inference_df missing ID/display columns: {absent_ids}")

    results = inference_df[ID_VIEW].copy()

    for target in TARGETS:
        print(f"[predict] {target} …")
        artifact = _load_artifact(target)
        preprocessor = artifact["preprocessor"]
        estimator = artifact["model"]
        feature_names = artifact["features"]

        # Present exactly the artifact's feature list, in its order, then transform.
        aligned = _ensure_columns(inference_df, feature_names)
        transformed = preprocessor.transform(aligned)

        # Sanity check, only possible when the model reports its input width.
        expected_width = getattr(estimator, "n_features_in_", None)
        if expected_width is not None and transformed.shape[1] != expected_width:
            raise ValueError(
                f"[{target}] Feature count mismatch: model expects {expected_width}, got {transformed.shape[1]}.\n"
                f"Check that you are using the same preprocessor and feature set as training."
            )

        results[f"pred_{target}"] = estimator.predict(transformed).astype(float)

    # Best-effort ordering for readability; a failed sort leaves the rows as they are.
    try:
        results = results.sort_values("date_utc").reset_index(drop=True)
    except Exception:
        pass

    return results

In [115]:
# Run every per-target model over today's inference frame.
final_predictions = predict_with_artifacts(inference_df=inference_df)


[predict] home_1h …
[predict] away_1h …
[predict] home_2h …
[predict] away_2h …
[predict] home_score …
[predict] away_score …
[predict] total …
[predict] 1h_total …
[predict] 2h_total …
[predict] margin …
[predict] 1h_margin …
[predict] 2h_margin …


In [119]:
# Date folder (YYYYMMDD string) used in both S3 paths below.
date = "20251109"

# Game ids listed for that date.
game_ids = list(
    pd.read_csv(
        f"s3://collegebasketballinsiders/daily-box-score-ids/{date}/game_ids.csv"
    )["game_id"]
)

# Keep only the predictions for those games, then write them out.
final_predictions = final_predictions.loc[final_predictions["game_id"].isin(game_ids)]
final_predictions.to_csv(f"s3://collegebasketballinsiders/predictions/{date}/preds.csv")