In [1]:
import os
import time
import json
import math
import datetime as dt
from typing import Iterable, Optional, Dict, Any, List

import requests
from requests.adapters import HTTPAdapter, Retry
import pandas as pd
from dotenv import load_dotenv
import os

# Best-effort .env loading: any exception raised here is ignored and the
# process environment is used as-is.
# NOTE: the import block above also imports `load_dotenv` unconditionally, so
# a missing python-dotenv package fails there before this guard is reached.
try:
    from dotenv import load_dotenv
    load_dotenv()
except Exception:
    pass

# Single endpoint URL that every request in this notebook is sent to.
API_BASE = "https://kenpom.com/api.php"

# Read once, when this cell runs; None if the variable is not set.
API_KEY = os.environ.get("KENPOM_API_KEY")

# Bearer-token header reused by the archive / preseason / teams fetchers.
# Built eagerly, so it reads "Bearer None" when API_KEY is missing.
HEADERS = {"Authorization": f"Bearer {API_KEY}"}

In [2]:

def session_with_retries(total=5, backoff=0.5):
    """Create a ``requests.Session`` with automatic retries on ``https://`` URLs.

    Args:
        total: Retry budget; used for the overall, connect and read limits.
        backoff: Backoff factor handed to the retry policy.

    Returns:
        A new session with the retrying adapter mounted for ``https://``.
        The caller owns the session and is responsible for closing it.
    """
    retry_policy = Retry(
        total=total,
        connect=total,
        read=total,
        backoff_factor=backoff,
        # Throttling and transient server-side failures are worth retrying.
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
        # Hand back the final response instead of raising inside the adapter;
        # callers decide what to do via `raise_for_status()`.
        raise_on_status=False,
    )
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=retry_policy))
    return session

def season_date_window(y: int,
                       start: Optional[str] = None,
                       end: Optional[str] = None) -> Iterable[dt.date]:
    """Yield each calendar day of the scrape window for season ``y``, in order.

    The default window is Nov 1 of the previous year through Apr 15 of the
    season's end year, both bounds inclusive.

    Args:
        y: Season end year (e.g. 2026 for the 2025-26 season).
        start: Optional ISO ``YYYY-MM-DD`` first day; overrides the default.
        end: Optional ISO ``YYYY-MM-DD`` last day; overrides the default.

    Yields:
        ``datetime.date`` objects from ``start`` to ``end``. Nothing is
        yielded when ``end`` is earlier than ``start``.

    Raises:
        ValueError: If a bound is not a valid ISO date. Because this is a
            generator, the error surfaces on first iteration, not at call time.
    """
    first_day = dt.date.fromisoformat(f"{y-1}-11-01" if start is None else start)
    last_day = dt.date.fromisoformat(f"{y}-04-15" if end is None else end)
    # Walk proleptic ordinals so the inclusive upper bound is explicit.
    for ordinal in range(first_day.toordinal(), last_day.toordinal() + 1):
        yield dt.date.fromordinal(ordinal)

def fetch_archive_for_date(s: requests.Session, day: dt.date) -> List[Dict[str, Any]]:
    """Fetch the ``archive`` endpoint rows for a single calendar day.

    Args:
        s: Session used to issue the GET request.
        day: Date requested; sent as the ISO-formatted ``d`` query parameter.

    Returns:
        The decoded JSON payload when it is a list; an empty list when the
        server answers 404 or the payload is any other JSON type.

    Raises:
        requests.HTTPError: For any other error status (``raise_for_status``).
    """
    response = s.get(
        API_BASE,
        params={"endpoint": "archive", "d": day.isoformat()},
        headers=HEADERS,
        timeout=30,
    )
    if response.status_code == 404:
        # day not available
        return []
    response.raise_for_status()
    payload = response.json()
    return payload if isinstance(payload, list) else []

def fetch_preseason_for_year(s: requests.Session, y: int) -> List[Dict[str, Any]]:
    """Fetch the preseason ``archive`` rows for season ``y``.

    Args:
        s: Session used to issue the GET request.
        y: Season year, sent as the ``y`` query parameter together with
            ``preseason=true``.

    Returns:
        The decoded JSON payload when it is a list; an empty list when the
        server answers 404 or the payload is any other JSON type.

    Raises:
        requests.HTTPError: For any other error status (``raise_for_status``).
    """
    response = s.get(
        API_BASE,
        params={"endpoint": "archive", "preseason": "true", "y": y},
        headers=HEADERS,
        timeout=30,
    )
    if response.status_code == 404:
        return []
    response.raise_for_status()
    payload = response.json()
    return payload if isinstance(payload, list) else []

def fetch_teams_for_year(s: requests.Session, y: int) -> pd.DataFrame:
    """Fetch the ``teams`` endpoint for season ``y`` as a DataFrame.

    Args:
        s: Session used to issue the GET request.
        y: Season year, sent as the ``y`` query parameter.

    Returns:
        A frame built directly from the JSON payload. When the payload is
        empty/falsy, an empty frame with the columns ``Season``, ``TeamName``,
        ``TeamID`` and ``ConfShort`` is returned instead.

    Raises:
        requests.HTTPError: For any error status, 404 included.
    """
    response = s.get(
        API_BASE,
        params={"endpoint": "teams", "y": y},
        headers=HEADERS,
        timeout=30,
    )
    response.raise_for_status()
    payload = response.json()
    if payload:
        return pd.DataFrame(payload)
    # Keep a predictable schema even when no teams came back.
    return pd.DataFrame(columns=["Season","TeamName","TeamID","ConfShort"])

def normalize_rows(rows: List[Dict[str, Any]]) -> pd.DataFrame:
    """Turn a list of API row dicts into a tidy, consistently ordered frame.

    Steps:
      1. Flatten the rows with ``pd.json_normalize``.
      2. If there is no ``ArchiveDate`` column but ``DataThrough`` exists,
         copy ``DataThrough`` into ``ArchiveDate``.
      3. Move the well-known identifier/metric columns to the front (in a
         fixed order); every other column follows in its original order.
      4. Convert ``ArchiveDate`` to ``datetime.date`` values; values that
         cannot be parsed are coerced to a missing value.

    Args:
        rows: Row dictionaries as returned by the fetch helpers.

    Returns:
        The normalized frame, or a completely empty ``DataFrame`` when
        ``rows`` is empty.
    """
    if not rows:
        return pd.DataFrame()

    frame = pd.json_normalize(rows)

    # Fallback for payloads that carry only a DataThrough stamp.
    lacks_archive_date = "ArchiveDate" not in frame.columns
    if lacks_archive_date and "DataThrough" in frame.columns:
        frame["ArchiveDate"] = frame["DataThrough"]

    # Columns that should lead the output, in display order.
    leading = [
        "ArchiveDate", "Season", "TeamName", "Seed", "ConfShort", "Event",
        "AdjEM", "RankAdjEM",
        "AdjOE", "RankAdjOE",
        "AdjDE", "RankAdjDE",
        "AdjTempo", "RankAdjTempo",
        "AdjEMFinal", "RankAdjEMFinal",
        "AdjOEFinal", "RankAdjOEFinal",
        "AdjDEFinal", "RankAdjDEFinal",
        "AdjTempoFinal", "RankAdjTempoFinal",
        "RankChg", "AdjEMChg", "AdjTChg",
        "Preseason"
    ]
    front = [name for name in leading if name in frame.columns]
    rest = [name for name in frame.columns if name not in leading]
    frame = frame[front + rest]

    if "ArchiveDate" in frame.columns:
        parsed = pd.to_datetime(frame["ArchiveDate"], errors="coerce")
        frame["ArchiveDate"] = parsed.dt.date
    return frame

def scrape_daily_kenpom(y: int,
                        include_preseason: bool = True,
                        start: Optional[str] = None,
                        end: Optional[str] = None,
                        sleep_sec: float = 0.25) -> pd.DataFrame:
    """Download the daily archive ratings for season ``y`` into one DataFrame.

    One request is made per day in ``season_date_window(y, start, end)``,
    optionally preceded by the preseason snapshot. Days whose request ends in
    an HTTP error are skipped; a single ``RuntimeWarning`` summarising the
    skipped days is emitted at the end so gaps are not silent.

    Args:
        y: Season end year.
        include_preseason: Also fetch the preseason snapshot; rows get a
            ``Preseason`` column set to ``"true"`` if the API did not send one.
        start: Optional ISO ``YYYY-MM-DD`` first day (defaults per
            ``season_date_window``).
        end: Optional ISO ``YYYY-MM-DD`` last day.
        sleep_sec: Pause after every daily request, successful or not.
            Non-positive values disable the pause.

    Returns:
        De-duplicated rows sorted by whichever of ``ArchiveDate``, ``Season``,
        ``RankAdjEM``, ``TeamName`` are present, with ``TeamID`` left-joined
        from the teams endpoint when possible. An empty ``DataFrame`` when no
        rows were retrieved at all.

    Raises:
        RuntimeError: If the module-level ``API_KEY`` is not set.
        requests.HTTPError: If the teams or preseason request fails (only the
            per-day requests are treated as best-effort).
        pandas.errors.MergeError: If one ``TeamName`` maps to several
            different ``TeamID`` values in the teams list.
    """
    import warnings  # local: only needed for the skipped-day summary

    if not API_KEY:
        raise RuntimeError("Set KENPOM_API_KEY in your environment (export KENPOM_API_KEY=...).")

    frames = []
    skipped = []  # "YYYY-MM-DD (error)" for every day that raised HTTPError

    # The context manager closes the session (and its pooled connections)
    # even when one of the requests below raises.
    with session_with_retries() as s:
        # Optional: team list (useful for mapping / validation)
        teams_df = fetch_teams_for_year(s, y)

        if include_preseason:
            pre = fetch_preseason_for_year(s, y)
            pdf = normalize_rows(pre)
            if not pdf.empty:
                # Mark as preseason if field not provided
                if "Preseason" not in pdf.columns:
                    pdf["Preseason"] = "true"
                frames.append(pdf)

        for day in season_date_window(y, start=start, end=end):
            try:
                rows = fetch_archive_for_date(s, day)
            except requests.HTTPError as exc:
                # Best-effort: remember the gap and move on to the next day.
                skipped.append(f"{day.isoformat()} ({exc})")
                rows = []
            df = normalize_rows(rows)
            if not df.empty:
                # Ensure a season column exists
                if "Season" not in df.columns:
                    df["Season"] = y
                frames.append(df)
            # Throttle after *every* request; backing off matters most
            # right after a failure.
            if sleep_sec > 0:
                time.sleep(sleep_sec)

    if skipped:
        preview = "; ".join(skipped[:5])
        more = "" if len(skipped) <= 5 else f"; ... (+{len(skipped) - 5} more)"
        warnings.warn(
            f"scrape_daily_kenpom({y}): skipped {len(skipped)} day(s) "
            f"after HTTP errors: {preview}{more}",
            RuntimeWarning,
            stacklevel=2,
        )

    if not frames:
        return pd.DataFrame()

    out = pd.concat(frames, ignore_index=True).drop_duplicates()

    # Optional: join TeamID if you want a stable key. Skipped when the ratings
    # already carry TeamID (a merge would produce TeamID_x / TeamID_y) or when
    # the teams frame lacks one of the two columns needed for the join.
    can_join = (
        "TeamName" in out.columns
        and "TeamID" not in out.columns
        and not teams_df.empty
        and {"TeamName", "TeamID"}.issubset(teams_df.columns)
    )
    if can_join:
        # Exact duplicate rows are harmless; dropping them keeps the m:1
        # validation from failing after the whole scrape has completed.
        team_lookup = teams_df[["TeamName", "TeamID"]].drop_duplicates()
        out = out.merge(
            team_lookup,
            on="TeamName",
            how="left",
            validate="m:1"
        )

    # Sort for readability
    sort_cols = [c for c in ["ArchiveDate", "Season", "RankAdjEM", "TeamName"] if c in out.columns]
    if sort_cols:
        out = out.sort_values(sort_cols).reset_index(drop=True)

    return out

def _get_api_key() -> str:
    key = os.getenv("KENPOM_API_KEY")
    if not key:
        raise RuntimeError("KENPOM_API_KEY not set. Use a .env file or export it.")
    return key

def _session_with_retries(total: int = 5, backoff: float = 0.5) -> requests.Session:
    """Return a retrying ``requests.Session`` for the height helpers.

    This used to be a line-for-line copy of ``session_with_retries``; it now
    delegates to it so the retry policy (status list, allowed methods,
    backoff) is defined in exactly one place and the two cannot drift apart.

    Args:
        total: Retry budget; used for the overall, connect and read limits.
        backoff: Backoff factor handed to the retry policy.

    Returns:
        A new session with the retrying adapter mounted for ``https://``.
        The caller owns the session and is responsible for closing it.
    """
    return session_with_retries(total=total, backoff=backoff)

def fetch_height_for_season(y: int, c: Optional[str] = None) -> pd.DataFrame:
    """
    Return Height snapshot for season y (optionally filtered by conference short name c).

    Args:
        y: Season year, sent as the ``y`` query parameter.
        c: Optional conference short name; sent as ``c`` only when truthy.

    Returns:
        One row per JSON record with light dtype normalization applied:
        ``DataThrough`` as ``datetime.date``, ``TeamName``/``ConfShort`` as
        pandas ``string``, ``Season`` and every rank column as nullable
        ``Int64``, and the height/experience metrics as ``float64``.
        An empty ``DataFrame`` when the payload is not a list or has no rows.

    Raises:
        RuntimeError: If ``KENPOM_API_KEY`` is not set.
        requests.HTTPError: If the response has an error status.
    """
    headers = {"Authorization": f"Bearer {_get_api_key()}"}
    params = {"endpoint": "height", "y": y}
    if c:
        params["c"] = c

    # A fresh session is created per call; the context manager makes sure it
    # is closed so looping over many seasons does not pile up open sessions.
    with _session_with_retries() as s:
        resp = s.get(API_BASE, params=params, headers=headers, timeout=45)
        resp.raise_for_status()
        data = resp.json()

    df = pd.json_normalize(data) if isinstance(data, list) else pd.DataFrame()
    if df.empty:
        return df

    # Light normalization (no file I/O)
    if "DataThrough" in df.columns:
        df["DataThrough"] = pd.to_datetime(df["DataThrough"], errors="coerce").dt.date
    for col in [x for x in ["TeamName", "ConfShort"] if x in df.columns]:
        df[col] = df[col].astype("string")
    if "Season" in df.columns:
        df["Season"] = pd.to_numeric(df["Season"], errors="coerce").astype("Int64")

    # Rank columns to Int64; metric columns to float64.
    # Rank columns are named with "Rank" as a suffix (e.g. AvgHgtRank) or as
    # a prefix (RankContinuity), so both positions are checked.
    rank_cols = [
        name for name in df.columns
        if name.lower().endswith("rank") or name.lower().startswith("rank")
    ]
    for col in rank_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")
    for col in ["AvgHgt","HgtEff","Hgt5","Hgt4","Hgt3","Hgt2","Hgt1","Exp","Bench","Continuity"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("float64")
    return df

def fetch_height_range(start_season: int, end_season: int, c: Optional[str] = None) -> pd.DataFrame:
    """
    Fetch the Height snapshot for every season in [start_season, end_season]
    (inclusive) and stack the non-empty results into a single DataFrame.

    Args:
        start_season: First season to fetch.
        end_season: Last season to fetch (inclusive).
        c: Optional conference short name forwarded to each per-season fetch.

    Returns:
        The concatenated rows with the well-known columns moved to the front;
        an empty ``DataFrame`` when no season produced rows (including when
        ``end_season < start_season``).
    """
    seasons = range(start_season, end_season + 1)
    snapshots = (fetch_height_for_season(season, c=c) for season in seasons)
    frames: List[pd.DataFrame] = [snap for snap in snapshots if not snap.empty]
    if not frames:
        return pd.DataFrame()

    combined = pd.concat(frames, ignore_index=True)

    # Cosmetic only: known columns first, everything else in original order.
    leading = [
        "Season","TeamName","ConfShort","DataThrough",
        "AvgHgt","AvgHgtRank","HgtEff","HgtEffRank",
        "Hgt5","Hgt5Rank","Hgt4","Hgt4Rank","Hgt3","Hgt3Rank","Hgt2","Hgt2Rank","Hgt1","Hgt1Rank",
        "Exp","ExpRank","Bench","BenchRank","Continuity","RankContinuity"
    ]
    ordered = [name for name in leading if name in combined.columns]
    ordered.extend(name for name in combined.columns if name not in leading)
    return combined.reindex(columns=ordered)

In [None]:
# Season (end year) to export.
year = 2026

# --- Daily team ratings --------------------------------------------------
# Full scrape (preseason snapshot + one request per day in the window).
df = scrape_daily_kenpom(year, include_preseason=True)

# Keep only the export columns, in this order. Selecting by label raises
# KeyError if any of them is absent from the scrape result.
df = df.loc[:, [
    'ArchiveDate', 'Season', 'TeamID', 'TeamName', 'Seed', 'ConfShort', 'Event',
    'AdjEM', 'RankAdjEM', 'AdjOE', 'RankAdjOE', 'AdjDE', 'RankAdjDE',
    'AdjTempo', 'RankAdjTempo',
]].copy()

# NOTE: `to_csv` is called with its defaults, so the frame index is written
# as the first (unnamed) column of the file.
df.to_csv(f"s3://collegebasketballinsiders/kenpom/{year}/team-ratings.csv")

# --- Team height / experience ---------------------------------------------
# `df` is rebound here: from this point on it holds the height data.
df = fetch_height_range(year, year)
df.to_csv(f"s3://collegebasketballinsiders/kenpom/{year}/team-height.csv")