In [3]:
%%writefile ../pyproject.toml
[build-system]
requires = ["setuptools>=70", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "data_science"
version = "0.0.1"
description = "General data science/ML environment"
authors = [{ name = "Geoffrey Hadfield" }]
requires-python = ">=3.10,<3.12"   # stay on 3.10/3.11; 3.12 still shaky for some wheels

dependencies = [
  "numpy>=1.26",              # keep modern NumPy
  "pandas>=2.2",
  "scikit-learn>=1.5",
  "joblib",
  "matplotlib",
  "seaborn",
  "jupyterlab<5.0",
  "ipykernel<6.30",
  "dash",
  "dash-bootstrap-components",
  "plotly",
  "opencv-python-headless",
  "pillow",
  "tqdm",
  "statsmodels",
  "streamlit",
  "xgboost",
  "lightgbm",
  "requests",
  "tenacity",                 # imported directly by fetch_utils (retry/back-off); declare it instead of relying on transitive installs
  "IPython",
  "tabulate",
  "pyarrow>=10.0.0",
  "requests-cache",
  "diskcache",
  "unidecode",
  "cpi>=2.0.0",
  "lxml",
  "duckdb>=0.10.0",
  "apache-airflow>=2.9.0",
  # ---- Explainability stack ----
  "shap>=0.46.0",             # supports NumPy 2, so fine with 1.26+
  "numba>=0.58.1,<0.61",      # 0.58.1 adds NumPy 1.26 support; 0.60 adds NumPy2
  # llvmlite will be pulled transitively with the correct version
  # ---- NBA tooling ----
  "nba_api<=1.4.1",
  "beautifulsoup4",
]

[project.optional-dependencies]
spark = [
  "pyspark",
  "install-jdk>=1.1.0",
]
dev = [
  "pytest",
  "black",
  "flake8",
  "mypy",
]

[tool.black]
line-length = 88
target-version = ["py310"]

[tool.flake8]
# NOTE(review): flake8 does not read pyproject.toml on its own; this table only
# takes effect when a bridging plugin (e.g. Flake8-pyproject) is installed.
# Otherwise mirror these settings in setup.cfg / tox.ini / .flake8.
max-line-length = 88
extend-ignore = ["E203"]

[tool.mypy]
python_version = "3.10"
ignore_missing_imports = true
strict_optional = true

[tool.setuptools.packages.find]
where = ["src"]





Overwriting ../pyproject.toml


# Tests:

In [4]:
%%writefile ../src/salary_nba_data_pull/__init__.py
"""
NBA Data Pull Package

A comprehensive package for fetching, processing, and analyzing NBA player data
including salaries, statistics, and advanced metrics.
"""

# Package version string (kept separately from the project version in pyproject.toml).
__version__ = "0.1.0"
# Names exported by ``from salary_nba_data_pull import *``. None of them is
# imported in this file, so they are resolved as submodules on demand.
# NOTE(review): "main" and "notebook_helper" are assumed to exist as submodules;
# data_utils also imports a "quality" submodule that is not listed here – confirm.
__all__ = [
    "main",
    "fetch_utils", 
    "process_utils",
    "scrape_utils",
    "data_utils",
    "settings",
    "notebook_helper"
] 

Overwriting ../src/salary_nba_data_pull/__init__.py


In [5]:
%%writefile ../src/salary_nba_data_pull/settings.py
# src/salary_nba_data_pull/settings.py
from pathlib import Path
import os
import typing as _t

# 🗂️  Central data directory (override via env if needed)
DATA_PROCESSED_DIR = Path(__file__).resolve().parent.parent.parent.joinpath(
    "data", "new_processed"
)  # three levels up from this file = project root

# An explicit DATA_PROCESSED_DIR environment variable wins over the default,
# e.g. `DATA_PROCESSED_DIR=/tmp/demo python main.py …`
ENV_OVERRIDE: _t.Optional[str] = os.getenv("DATA_PROCESSED_DIR")
if ENV_OVERRIDE:
    DATA_PROCESSED_DIR = Path(ENV_OVERRIDE).expanduser().resolve()

# Previous location of processed data, kept for backward compatibility.
# It is never affected by the environment override.
LEGACY_DATA_PROCESSED_DIR = Path(__file__).resolve().parent.parent.parent.joinpath(
    "data", "processed"
)

Overwriting ../src/salary_nba_data_pull/settings.py


In [6]:
%%writefile ../src/salary_nba_data_pull/fetch_utils.py
import threading
import time
import random
import logging
import os
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache, wraps
from http import HTTPStatus
from typing import Callable
import requests
from nba_api.stats.endpoints import commonallplayers, commonplayerinfo, playercareerstats, leaguestandings
from requests.exceptions import RequestException
from json.decoder import JSONDecodeError
from joblib import Memory
from unidecode import unidecode
from tenacity import (
    retry, retry_if_exception, wait_random_exponential,
    stop_after_attempt, before_log
)

# Drives both the semaphore size and the per-call sleep in _throttle()
# (each call sleeps 60 / REQUESTS_PER_MIN seconds).
REQUESTS_PER_MIN = 8   # ↓ a bit safer for long pulls (NBA suggests ≤10)
_SEM = threading.BoundedSemaphore(REQUESTS_PER_MIN)

# Set up joblib memory for caching API responses
# The cache directory is resolved relative to this file (two levels up, then
# data/cache/nba_api); `memory` backs every @memory.cache decorator below.
cache_dir = os.path.join(os.path.dirname(__file__), '../../data/cache/nba_api')
memory = Memory(cache_dir, verbose=0)

def _throttle():
    """
    Block the caller for ``60 / REQUESTS_PER_MIN`` seconds while holding one
    permit of the module-level semaphore.

    The permit is taken with a ``with`` block so it is always released, even if
    the sleep is interrupted (e.g. KeyboardInterrupt); the previous manual
    acquire/release pair leaked the permit in that case.

    NOTE(review): up to REQUESTS_PER_MIN callers can sleep concurrently, so with
    several worker threads the aggregate request rate can exceed
    REQUESTS_PER_MIN per minute. Confirm whether a global spacing between calls
    is required.
    """
    with _SEM:
        time.sleep(60 / REQUESTS_PER_MIN)

def _needs_retry(exc: Exception) -> bool:
    """
    Retry predicate for tenacity.

    True for an HTTPError whose response status is 429 or 503, and for
    connection errors / timeouts; False for everything else.
    """
    retryable_statuses = (
        HTTPStatus.TOO_MANY_REQUESTS,
        HTTPStatus.SERVICE_UNAVAILABLE,
    )
    has_response = isinstance(exc, requests.HTTPError) and exc.response is not None
    if has_response and exc.response.status_code in retryable_statuses:
        return True
    transient_types = (requests.ConnectionError, requests.Timeout)
    return isinstance(exc, transient_types)

def _respect_retry_after(resp: requests.Response):
    """
    Honour a numeric ``Retry-After`` header on ``resp`` by sleeping that long.

    Does nothing when ``resp`` is None, the header is absent, or its value is
    not an integer number of seconds.
    """
    if resp is None:
        return
    if 'Retry-After' not in resp.headers:
        return
    try:
        delay = int(resp.headers['Retry-After'])
        logging.warning("↺ server asked to wait %ss", delay)
        time.sleep(delay)
    except ValueError:
        # header unparsable, ignore
        return

def _make_retry(fn: Callable) -> Callable:
    """
    Wrap ``fn`` with a tenacity retry policy.

    Policy: retry only when ``_needs_retry`` accepts the exception, wait with
    random exponential back-off (multiplier 2, capped at 60 s), stop after five
    attempts and re-raise the last exception. Before tenacity schedules the next
    attempt, an HTTPError's ``Retry-After`` header is honoured via
    ``_respect_retry_after``.

    The ``before_sleep`` hook uses tenacity's ``before_sleep_log``, which is the
    callback designed for that hook; the previous ``before_log`` is meant for
    the ``before`` hook and logs a "starting call" message instead.
    """
    # Local import: before_sleep_log is not part of the module-level tenacity import.
    from tenacity import before_sleep_log

    @retry(
        retry=retry_if_exception(_needs_retry),
        wait=wait_random_exponential(multiplier=2, max=60),
        stop=stop_after_attempt(5),
        before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING),
        reraise=True,
    )
    @wraps(fn)
    def _wrapper(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except requests.HTTPError as exc:
            # Sleep for the server-suggested delay first, then let tenacity decide.
            _respect_retry_after(exc.response)
            raise
    return _wrapper

@memory.cache
@_make_retry
def fetch_with_retry(endpoint, *, timeout=90, debug=False, **kwargs):
    """
    Call an endpoint class and return the first DataFrame of its result.

    The function body is wrapped by ``_make_retry`` (inner decorator) and then
    by ``memory.cache`` (outer decorator).

    Parameters
    ----------
    endpoint : callable
        Called as ``endpoint(timeout=timeout, **kwargs)``; the returned object
        must provide ``get_data_frames()``.
    timeout : int
        Forwarded to ``endpoint``.
    debug : bool
        When True, the elapsed time and kwargs are logged at DEBUG level.
    **kwargs
        Forwarded to ``endpoint`` unchanged.

    Returns
    -------
    The first element of ``get_data_frames()``.

    NOTE(review): ``timeout`` and ``debug`` are ordinary arguments of the cached
    callable, so presumably they take part in the joblib cache key – confirm.
    """
    _throttle()  # rate-limit before every real (non-cached) call
    start = time.perf_counter()
    resp = endpoint(timeout=timeout, **kwargs)
    df = resp.get_data_frames()[0]
    if debug:
        logging.debug("✓ %s in %.1fs %s", endpoint.__name__,
                      time.perf_counter() - start, kwargs)
    return df

@memory.cache
def fetch_all_players(season: str, debug: bool = False) -> dict[str, dict]:
    """
    Build a name → ids lookup from CommonAllPlayers restricted to the current
    season (``is_only_current_season=1``).

    Keys are the ASCII-folded, stripped, lower-cased display names; each value
    is ``{'player_id': int, 'team_id': int}``. When two rows normalise to the
    same key, the later row wins.
    """
    roster_df = fetch_with_retry(
        commonallplayers.CommonAllPlayers,
        season=season,
        is_only_current_season=1,
        league_id="00",
        debug=debug,
    )

    players: dict[str, dict] = {}
    records = roster_df.to_dict("records") if roster_df is not None else []
    for rec in records:
        key = unidecode(rec["DISPLAY_FIRST_LAST"]).strip().lower()
        players[key] = {
            "player_id": int(rec["PERSON_ID"]),
            "team_id": int(rec["TEAM_ID"]),
        }

    if debug:
        print(f"[fetch_all_players] {len(players)} active players for {season}")
    return players

@lru_cache(maxsize=None)
def fetch_season_players(season: str, debug: bool = False) -> dict[str, dict]:
    """
    Build a name → ids lookup for every player whose career window
    (FROM_YEAR..TO_YEAR) contains the start year of ``season``.

    The full player table is requested (``is_only_current_season=0``) and then
    filtered locally. Keys are ASCII-folded, stripped, lower-cased display
    names; values are ``{'player_id': int, 'team_id': int}``. Results are
    memoised in-process per ``(season, debug)``.
    """
    everyone = fetch_with_retry(
        commonallplayers.CommonAllPlayers,
        season=season,
        is_only_current_season=0,
        league_id="00",
        debug=debug,
    )

    players: dict[str, dict] = {}
    if everyone is not None:
        start_year = int(season[:4])
        # career window must enclose the season's start year
        debuted = everyone.FROM_YEAR.astype(int) <= start_year
        not_retired = everyone.TO_YEAR.astype(int) >= start_year
        for rec in everyone[debuted & not_retired].to_dict("records"):
            key = unidecode(rec["DISPLAY_FIRST_LAST"]).strip().lower()
            players[key] = {
                "player_id": int(rec["PERSON_ID"]),
                "team_id": int(rec["TEAM_ID"]),
            }

    if debug:
        print(f"[fetch_season_players] {len(players)} players for {season}")
    return players

@memory.cache
def fetch_player_info(player_id, debug=False):
    """Return the first CommonPlayerInfo result frame for ``player_id`` (via fetch_with_retry)."""
    return fetch_with_retry(commonplayerinfo.CommonPlayerInfo, player_id=player_id, debug=debug)

@memory.cache
def fetch_career_stats(player_id, debug=False):
    """Return the first PlayerCareerStats result frame for ``player_id`` (via fetch_with_retry)."""
    return fetch_with_retry(playercareerstats.PlayerCareerStats, player_id=player_id, debug=debug)

@memory.cache
def fetch_league_standings(season, debug=False):
    """Return the first LeagueStandings result frame for ``season`` (via fetch_with_retry)."""
    return fetch_with_retry(leaguestandings.LeagueStandings, season=season, debug=debug)

def clear_cache():
    """
    Clear the joblib memory cache.

    Only the ``memory`` store is cleared; the in-process ``lru_cache`` on
    ``fetch_season_players`` is not touched by this function.
    """
    memory.clear()

if __name__ == "__main__":
    # Manual smoke test: runs each fetch helper for one season and one sample
    # player and prints the results.
    # Example usage
    debug = True
    season = "2022-23"
    sample_player_name = "LeBron James"

    # Fetch all players
    all_players = fetch_all_players(season, debug=debug)
    print(f"Total players fetched: {len(all_players)}")

    # Fetch player info for a sample player
    # (fetch_all_players keys are lower-cased, hence the .lower() lookup)
    if sample_player_name.lower() in all_players:
        sample_player_id = all_players[sample_player_name.lower()]['player_id']
        player_info = fetch_player_info(sample_player_id, debug=debug)
        print(f"Sample player info for {sample_player_name}:")
        print(player_info)

        # Fetch career stats for the sample player
        career_stats = fetch_career_stats(sample_player_id, debug=debug)
        print(f"Sample player career stats for {sample_player_name}:")
        print(career_stats)
    else:
        print(f"Player {sample_player_name} not found in the {season} season data.")

    # Fetch league standings
    standings = fetch_league_standings(season, debug=debug)
    print("League standings:")
    print(standings)


Overwriting ../src/salary_nba_data_pull/fetch_utils.py


In [7]:
%%writefile ../src/salary_nba_data_pull/scrape_utils.py
import pandas as pd
import requests
import time
import random
import re
from bs4 import BeautifulSoup
from io import StringIO
from typing import Optional
import os
import requests_cache
from unidecode import unidecode
from pathlib import Path
from datetime import datetime
from salary_nba_data_pull.settings import DATA_PROCESSED_DIR
from functools import lru_cache
import threading
# Lock and in-process memo used by _season_advanced_df().
_ADV_LOCK   = threading.Lock()
_ADV_CACHE: dict[str, pd.DataFrame] = {}   # season -> DataFrame

# Install cache for all requests
# NOTE(review): this runs at import time and is process-wide, so presumably it
# also caches HTTP traffic from other requests-based libraries imported in the
# same process (e.g. nba_api) – confirm that this is intended.
requests_cache.install_cache('nba_scraping', expire_after=86400)  # 24 hours

# Create cached session with stale-if-error capability
# (same cache name as the global cache above; used by _season_advanced_df)
session = requests_cache.CachedSession(
    'nba_scraping',
    expire_after=86400,
    stale_if_error=True       # <-- NEW: serve expired cache if remote 429s
)

def scrape_salary_cap_history(*, debug: bool = False) -> pd.DataFrame | None:
    """
    Pull historical cap / tax / apron lines, with local fallbacks.

    Strategy:
    1. Scrape RealGM (a ``<pre>`` block if present, otherwise the first table).
    2. If anything in step 1 raises, read ``salary_cap_history_inflated.csv``
       from DATA_PROCESSED_DIR when that file exists.
    3. Otherwise build a one-row frame from a hard-coded cap value so callers
       still have *something* to merge.

    Parameters
    ----------
    debug : bool
        Print progress and failure details.

    Returns
    -------
    pd.DataFrame | None
        The scraped or fallback frame; None only if every strategy fails.
    """
    url = "https://basketball.realgm.com/nba/info/salary_cap"

    try:
        # Use the shared browser UA and fail fast on HTTP errors so an error
        # page is never parsed as if it were the cap table.
        resp = requests.get(url, headers=UA, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        # -------- 1️⃣  RealGM table (new markup) --------------------
        blk = soup.find("pre")                      # new 2025 layout
        if blk:                                     # parse fixed‑width block
            # NOTE(review): whitespace splitting assumes no header label
            # contains a space – verify against the live page.
            rows = [r.strip().split() for r in blk.text.strip().splitlines()]
            header = rows[0]
            data = rows[1:]
            df = pd.DataFrame(data, columns=header)
        else:
            # Legacy table path (kept for safety)
            tbl = soup.select_one("table")
            if not tbl:
                raise ValueError("salary_cap table not found")
            # read_html expects a file-like object; literal HTML strings are
            # deprecated in pandas 2.1+.
            df = pd.read_html(StringIO(str(tbl)))[0]

        # ---- normalise ----
        df["Season"] = df["Season"].str.extract(r"(\d{4}-\d{4})", expand=False)
        money_cols = [c for c in df.columns if c != "Season"]
        for c in money_cols:
            stripped = df[c].astype(str).str.replace(r"[$,]", "", regex=True)
            # Blank / non-numeric cells become NaN instead of aborting the whole
            # scrape (astype(float) cannot convert pd.NA in an object column).
            df[c] = pd.to_numeric(stripped, errors="coerce")

        if debug:
            print(f"[salary‑cap] scraped {len(df)} rows from RealGM")

        return df

    except Exception as exc:
        if debug:
            print(f"[salary‑cap] primary scrape failed → {exc!s}")

        # -------- 2️⃣  local cached CSV ----------------------------
        fallback = DATA_PROCESSED_DIR / "salary_cap_history_inflated.csv"
        if fallback.exists():
            if debug:
                print(f"[salary‑cap] using cached CSV at {fallback}")
            return pd.read_csv(fallback)

        # -------- 3️⃣  minimal hard-coded fallback ------------------
        try:
            # NOTE(review): the label is derived from the calendar year, so
            # before the new season starts it names the *next* season, and the
            # "YYYY-YY" format differs from the "YYYY-YYYY" pattern extracted in
            # the RealGM path. The cap value is also fixed at the 2024-25
            # figure. Confirm what downstream merges expect.
            year = datetime.now().year
            cap = 140.588  # 2024-25 cap as fallback
            df = pd.DataFrame(
                {"Season": [f"{year}-{str(year+1)[-2:]}"],
                 "Salary Cap": [cap * 1_000_000]}
            )
            if debug:
                print("[salary‑cap] built minimal one‑row DataFrame "
                      "from fallback values")
            return df
        except Exception as fallback_exc:
            # Best-effort: report instead of silently swallowing.
            if debug:
                print(f"[salary‑cap] minimal fallback failed → {fallback_exc!s}")

    if debug:
        print("[salary‑cap] giving up – no data available")
    return None

# User-Agent header to avoid Cloudflare blocks
# (sent by _get_hoopshype_soup and _season_advanced_df)
UA = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/126.0.0.0 Safari/537.36"
    )
}
DELAY_BETWEEN_REQUESTS = 3  # seconds

# Define column templates to guarantee DataFrame structure
# (column names for player-level and team-level salary frames)
PLAYER_COLS = ["Player", "Salary", "Season"]
TEAM_COLS = ["Team", "Team_Salary", "Season"]

# Salary parsing pattern
_salary_pat = re.compile(r"\$?\d[\d,]*")

def _clean_salary(text: str) -> int | None:
    """Return salary as int or None when text has no digits."""
    m = _salary_pat.search(text)
    return int(m.group(0).replace(",", "").replace("$", "")) if m else None

# Name normalisation helper (ASCII folding via unidecode)
def _normalise_name(raw: str) -> str:
    """
    Produce a comparison key for a player name.

    The name is ASCII-folded, cut at the first "," and then at the first "(",
    and finally stripped and lower-cased.
    """
    folded = unidecode(raw)
    before_comma = folded.partition(",")[0]
    before_paren = before_comma.partition("(")[0]
    return before_paren.strip().lower()


# ------- INTERNAL HELPER --------
def _get_hoopshype_soup(url: str, debug: bool = False) -> Optional[BeautifulSoup]:
    """
    Fetch ``url`` with a realistic UA and return it parsed, or None.

    Up to two attempts are made, but only network-level errors
    (``requests.RequestException``) trigger the second attempt. A non-200
    status or a detected Cloudflare challenge returns None immediately.

    Parameters
    ----------
    url : str
        Page to download.
    debug : bool
        Print progress information.

    Returns
    -------
    BeautifulSoup | None
        Parsed page if it looks OK, else None.
    """
    max_attempts = 2
    for attempt in range(max_attempts):
        try:
            if debug:
                print(f"[fetch] {url} (attempt {attempt+1})")
            resp = requests.get(url, headers=UA, timeout=30)
            if resp.status_code != 200:
                if debug:
                    print(f"  -> HTTP {resp.status_code}, skipping.")
                return None
            html = resp.text
            # crude Cloudflare challenge check
            if ("Access denied" in html) or ("cf-chl" in html):
                if debug:
                    print("  -> Cloudflare challenge detected; giving up.")
                return None
            return BeautifulSoup(html, "html.parser")
        except requests.RequestException as e:
            is_last = attempt + 1 >= max_attempts
            if debug:
                outcome = "giving up." if is_last else "retrying…"
                print(f"  -> network error {e}, {outcome}")
            # Back off only when another attempt follows; sleeping after the
            # final failure just delayed the None result.
            if not is_last:
                time.sleep(2 ** attempt + random.random())
    return None
# --------------------------------------------------------------------------


def _espn_salary_url(year: int, page: int = 1) -> str:
    """
    Compose the ESPN salary-listing URL for ``year``.

    Page 1 has no page segment; every other page inserts ``/page/<n>`` before
    the trailing ``/seasontype/4``:
      page 1 → https://www.espn.com/nba/salaries/_/year/2024/seasontype/4
      page 3 → https://www.espn.com/nba/salaries/_/year/2024/page/3/seasontype/4
    """
    page_segment = "" if page == 1 else f"/page/{page}"
    return f"https://www.espn.com/nba/salaries/_/year/{year}{page_segment}/seasontype/4"


def _scrape_espn_player_salaries(season_start: int, debug: bool = False) -> list[dict]:
    """
    Deprecated stub: always raises NotImplementedError.

    Player salary scraping no longer exists; callers should read the
    pre-loaded salary parquet instead.
    """
    reason = "Salary scraping was removed – consume pre-loaded salary parquet instead."
    raise NotImplementedError(reason)


def scrape_player_salary_data(start_season: int, end_season: int,
                              player_filter: str | None = None,
                              debug: bool = False) -> pd.DataFrame:
    """
    DEPRECATED: Salary scraping was removed – consume pre-loaded salary parquet instead.
    """
    raise NotImplementedError(
        "Salary scraping was removed – consume pre-loaded salary parquet instead."
    )
# --------------------------------------------------------------------------


def _scrape_espn_team_salaries(season: str, debug: bool = False) -> list[dict]:
    """
    Deprecated stub: always raises NotImplementedError.

    Team salary scraping no longer exists; callers should read the pre-loaded
    salary parquet instead.
    """
    reason = "Team salary scraping was removed – consume pre-loaded salary parquet instead."
    raise NotImplementedError(reason)


def scrape_team_salary_data(season: str, debug: bool = False) -> pd.DataFrame:
    """
    Deprecated stub: always raises NotImplementedError.

    Team salary scraping no longer exists; callers should read the pre-loaded
    salary parquet instead. All arguments are ignored.
    """
    reason = "Team salary scraping was removed – consume pre-loaded salary parquet instead."
    raise NotImplementedError(reason)

# --- Season‑level advanced stats --------------------------------------------
# Columns that _season_advanced_df coerces to numeric and that
# scrape_advanced_metrics returns; names missing from the scraped table are
# skipped by both functions.
ADV_METRIC_COLS = [
    "PER", "TS%", "3PAr", "FTr", "ORB%", "DRB%", "TRB%", "AST%", "STL%", "BLK%",
    "TOV%", "USG%", "OWS", "DWS", "WS", "WS/48", "OBPM", "DBPM", "BPM", "VORP",
    "ORtg", "DRtg",  # extra goodies if you want them
]

def _season_advanced_df(season: str) -> pd.DataFrame:
    """
    Memoised download of the *season‑wide* advanced‑stats table.

    The cache is checked once without the lock and once more after acquiring
    it, so concurrent callers for the same season wait for the first download
    instead of issuing duplicate requests. The frame is kept in ``_ADV_CACHE``
    for the life of the process.

    Parameters
    ----------
    season : str
        Season label whose first four characters are the start year
        (e.g. "2022-23" → page for 2023).

    Returns
    -------
    pd.DataFrame
        The scraped table plus a ``player_key`` column (normalised name), with
        the ADV_METRIC_COLS that are present coerced to numeric. The same
        object is returned to every caller, so treat it as read-only.

    Raises
    ------
    requests.HTTPError
        If the page request returns an error status.
    """
    if season in _ADV_CACHE:            # fast path, no lock
        return _ADV_CACHE[season]

    with _ADV_LOCK:                     # only one thread may enter the block
        if season in _ADV_CACHE:        # re-check now that we hold the lock
            return _ADV_CACHE[season]

        end_year = int(season[:4]) + 1
        url = f"https://www.basketball-reference.com/leagues/NBA_{end_year}_advanced.html"
        print(f"[adv] fetching {url}")
        resp = session.get(url, headers=UA, timeout=30)
        resp.raise_for_status()

        df = pd.read_html(StringIO(resp.text), header=0)[0]
        # Drop repeated header rows and rows without a player name (a NaN name
        # would crash _normalise_name). Copy so the column assignment below
        # writes to an independent frame rather than a filtered view.
        is_player_row = df["Player"].notna() & (df["Player"] != "Player")
        df = df.loc[is_player_row].copy()
        df["player_key"] = df["Player"].map(_normalise_name)

        avail = [c for c in ADV_METRIC_COLS if c in df.columns]
        if avail:
            df[avail] = df[avail].apply(pd.to_numeric, errors="coerce")

        _ADV_CACHE[season] = df                # memoise
        time.sleep(random.uniform(1.5, 2.5))   # be polite
        return df

def scrape_advanced_metrics(player_name: str,
                            season: str,
                            *,
                            debug: bool = False) -> dict:
    """
    Look up one player's advanced metrics in the cached season table.

    Returns a dict of the ADV_METRIC_COLS present in the table, taken from the
    first row whose ``player_key`` equals the normalised ``player_name``; an
    empty dict when no row matches.
    """
    season_df = _season_advanced_df(season)
    wanted_key = _normalise_name(player_name)
    matches = season_df.loc[season_df.player_key == wanted_key]

    if matches.empty:
        if debug:
            print(f"[adv] no advanced stats for {player_name} in {season}")
        return {}

    first_match = matches.iloc[0]
    # skip metric names the scraped table does not provide
    result = {
        metric: first_match[metric]
        for metric in ADV_METRIC_COLS
        if metric in first_match.index
    }
    if debug:
        print(f"[adv] {player_name} → {result}")
    return result
# --- End of new season-level advanced stats ---------------------------------

def load_injury_data(
    file_path: str | Path | None = None,
    *,
    base_dir: str | Path | None = None,
    debug: bool = False,
):
    """
    Load the historical injury CSV. By default we look inside the *new*
    processed folder; pass ``file_path`` to override a specific file,
    or ``base_dir`` to point at a different processed directory.
    """
    root = Path(base_dir) if base_dir else DATA_PROCESSED_DIR
    if file_path is None:
        file_path = root / "NBA Player Injury Stats(1951 - 2023).csv"
    file_path = Path(file_path).expanduser().resolve()

    try:
        injury = (
            pd.read_csv(file_path)
            .assign(Date=lambda d: pd.to_datetime(d["Date"]))
        )
        injury["Season"] = injury["Date"].apply(
            lambda x: (
                f"{x.year}-{str(x.year + 1)[-2:]}"
                if x.month >= 10
                else f"{x.year - 1}-{str(x.year)[-2:]}"
            )
        )
        if debug:
            print(f"[load_injury_data] loaded {len(injury):,} rows from {file_path}")
        return injury
    except FileNotFoundError:
        if debug:
            print(f"[load_injury_data] ✖ no injury file at {file_path}")
        return None

if __name__ == "__main__":
    # Manual smoke test of the helpers in this module.
    # The salary scrapers are deprecated stubs that always raise
    # NotImplementedError, so they are called defensively and replaced by empty
    # frames; otherwise the script would stop at step 2.
    debug = True
    start_season = 2022
    end_season = 2023
    sample_player = "Ja Morant"  # Example player
    season_label = f"{start_season}-{str(start_season+1)[-2:]}"

    print("1. Testing scrape_salary_cap_history:")
    salary_cap_history = scrape_salary_cap_history(debug=debug)

    print("\n2. Testing scrape_player_salary_data:")
    try:
        player_salary_data = scrape_player_salary_data(
            start_season, end_season, player_filter=sample_player, debug=debug
        )
    except NotImplementedError as exc:
        print(f"Skipped: {exc}")
        player_salary_data = pd.DataFrame(columns=PLAYER_COLS)

    print("\n3. Testing scrape_team_salary_data:")
    try:
        team_salary_data = scrape_team_salary_data(season_label, debug=debug)
    except NotImplementedError as exc:
        print(f"Skipped: {exc}")
        team_salary_data = pd.DataFrame(columns=TEAM_COLS)

    print("\n4. Testing scrape_advanced_metrics:")
    advanced_metrics = scrape_advanced_metrics(sample_player, season_label, debug=debug)
    print(f"Advanced Metrics for {sample_player}:")
    print(advanced_metrics)

    print("\n5. Testing load_injury_data and merge_injury_data:")
    injury_data = load_injury_data()
    if injury_data is not None:
        print(injury_data.head())
    else:
        print("No injury data loaded.")
    if not player_salary_data.empty and injury_data is not None:
        from salary_nba_data_pull.process_utils import merge_injury_data
        merged_data = merge_injury_data(player_salary_data, injury_data)
        print("Merged data with injury info:")
        optional_cols = ['Injured', 'Injury_Periods', 'Total_Days_Injured', 'Injury_Risk']
        columns_to_display = ['Player', 'Season', 'Salary'] + [
            col for col in optional_cols if col in merged_data.columns
        ]
        print(merged_data[columns_to_display].head())

    if not player_salary_data.empty:
        avg_salary = player_salary_data['Salary'].mean()
        print(f"Average salary for {sample_player} from {start_season} to {end_season}: ${avg_salary:,.2f}")

    if not team_salary_data.empty:
        highest_team_salary = team_salary_data.loc[team_salary_data['Team_Salary'].idxmax()]
        print(f"Team with highest salary in {start_season}-{end_season}: {highest_team_salary['Team']} (${highest_team_salary['Team_Salary']:,.2f})")

    # load_injury_data returns None when the CSV is missing, so check for None
    # before touching .empty.
    if injury_data is not None and not injury_data.empty:
        injury_count = injury_data['Relinquished'].str.contains(sample_player, case=False, na=False).sum()
        print(f"Number of injuries/illnesses for {sample_player} from {start_season} to {end_season}: {injury_count}")

    print("\nAll tests completed.")


Overwriting ../src/salary_nba_data_pull/scrape_utils.py


In [8]:
%%writefile ../src/salary_nba_data_pull/process_utils.py
import pandas as pd
import numpy as np
import logging
import sqlite3
from datetime import datetime
from functools import lru_cache
from salary_nba_data_pull.fetch_utils import fetch_all_players, fetch_career_stats, fetch_player_info, fetch_league_standings
from salary_nba_data_pull.scrape_utils import scrape_advanced_metrics

# --- CPI lazy‑loader --------------------------------------------------
_CPI_AVAILABLE = False  # toggled at runtime

# The result is memoised per `debug` value; with maxsize=1 only the most recent
# `debug` value stays cached, so alternating True/False re-runs the check.
@lru_cache(maxsize=1)
def _ensure_cpi_ready(debug: bool = False) -> bool:
    """
    Import `cpi` lazily and guarantee its internal SQLite DB is usable.
    Returns True when inflation data are available, False otherwise.

    Flow: import the package, run one lookup as a sanity check, and on
    ``sqlite3.OperationalError`` rebuild the database via
    ``cpi.update(rebuild=True)``. A missing package or any other exception
    yields False; the reason is logged only when ``debug`` is True.

    NOTE(review): the sanity query looks up series id "0000". If that lookup
    raises anything other than sqlite3.OperationalError (for example because
    the id does not exist, or the attribute path differs in the installed cpi
    version), the outer handler returns False and inflation is skipped –
    confirm against the installed cpi version.
    """
    global _CPI_AVAILABLE
    try:
        import importlib
        cpi = importlib.import_module("cpi")        # late import
        try:
            _ = cpi.models.Series.get_by_id("0000")  # 1‑row sanity query
            _CPI_AVAILABLE = True
            return True
        except sqlite3.OperationalError:
            if debug:
                logging.warning("[CPI] DB invalid – rebuilding from BLS…")
            cpi.update(rebuild=True)                # expensive network call
            _CPI_AVAILABLE = True
            return True
    except ModuleNotFoundError:
        if debug:
            logging.warning("[CPI] package not installed")
    except Exception as e:
        if debug:
            logging.error("[CPI] unexpected CPI failure: %s", e)
    # Reached only via one of the two handlers above.
    return False
# ---------------------------------------------------------------------

def inflate_value(value: float, year_str: str,
                  *, debug: bool = False, skip_inflation: bool = False) -> float:
    """
    Convert ``value`` from the dollars of ``year_str`` to 2022 dollars.

    ``year_str`` may be "YYYY" or "YYYY‑YY"; only its first four characters are
    used. The input is returned unchanged when ``skip_inflation`` is set, when
    CPI data are unavailable, when the year is the current year or later, or
    when the conversion raises.
    """
    if skip_inflation:
        return value
    if not _ensure_cpi_ready(debug):
        return value

    try:
        import cpi                                       # safe: DB ready
        source_year = int(year_str[:4])
        if source_year < datetime.now().year:
            return float(cpi.inflate(value, source_year, to=2022))
        return value
    except Exception as e:
        if debug:
            logging.error("[CPI] inflate failed for %s: %s", year_str, e)
        return value
# ---------------------------------------------------------------------

def calculate_percentages(df, debug=False):
    """
    Add shooting percentages and per-36 rates to ``df`` in place.

    Each derived column is written only when all of its source columns exist:
    FG%, 3P%, FT% (made / attempted × 100), TS% (PTS / (2·(FGA + 0.44·FTA)) ×
    100) and PTS/AST/TRB per 36 minutes. Values are rounded to two decimals and
    infinities (division by zero) become NaN. An empty frame is returned
    untouched. The same ``df`` object is returned.
    """
    if df.empty:
        return df

    def _has(*names):
        return all(name in df.columns for name in names)

    def _store(target, values):
        df[target] = values.round(2).replace([np.inf, -np.inf], np.nan)

    # Shooting percentages: (made, attempted, output column)
    shooting = (("FG", "FGA", "FG%"), ("3P", "3PA", "3P%"), ("FT", "FTA", "FT%"))
    for made, attempted, target in shooting:
        if _has(attempted, made):
            _store(target, df[made] / df[attempted] * 100)

    # True shooting percentage
    if _has("PTS", "FGA", "FTA"):
        _store("TS%", df["PTS"] / (2 * (df["FGA"] + 0.44 * df["FTA"])) * 100)

    # Per-36-minute rates: (stat, output column)
    per_36 = (("PTS", "PTS_per_36"), ("AST", "AST_per_36"), ("TRB", "TRB_per_36"))
    for stat, target in per_36:
        if _has(stat, "MP"):
            _store(target, df[stat] / df["MP"] * 36)

    if debug:
        print("Percentage calculations completed")

    return df

def process_player_data(player_name: str, season: str,
                        all_players: dict[str, dict], *,
                        debug: bool = False) -> dict | None:
    """
    Build a single‑player dict **including Games Started (GS)** and keep the
    schema aligned with dataset 1.
    """
    meta = all_players.get(player_name.lower().strip())
    if not meta:
        return None

    pid = meta["player_id"]
    info_df   = fetch_player_info(pid, debug=debug)
    career_df = fetch_career_stats(pid, debug=debug)
    if career_df is None or career_df.empty:
        return None

    season_row = career_df.loc[career_df.SEASON_ID.eq(season)]
    if season_row.empty:
        return None
    season_row = season_row.iloc[0]

    data = {
        # ---------- BASIC ------------
        "Player": player_name,
        "Season": season,
        "Team":   season_row["TEAM_ABBREVIATION"],
        "Age":    season_row["PLAYER_AGE"],
        "GP":     season_row["GP"],
        "GS":     season_row.get("GS", 0),        # <-- NEW
        "MP":     season_row["MIN"],
        # ---------- SCORING ----------
        "PTS": season_row["PTS"],
        "FG":  season_row["FGM"],  "FGA": season_row["FGA"],
        "3P":  season_row["FG3M"], "3PA": season_row["FG3A"],
        "FT":  season_row["FTM"],  "FTA": season_row["FTA"],
        # ---------- OTHER ------------
        "TRB": season_row["REB"], "AST": season_row["AST"],
        "STL": season_row["STL"], "BLK": season_row["BLK"],
        "TOV": season_row["TOV"], "PF":  season_row["PF"],
    }

    # roster meta
    if info_df is not None and not info_df.empty:
        ir = info_df.iloc[0]
        data["Position"]          = ir.get("POSITION", "")
        data["TeamID"]            = ir.get("TEAM_ID", None)
        data["Years_of_Service"]  = ir.get("SEASON_EXP", None)
    else:
        data["TeamID"] = meta.get("team_id")

    # ---------- Derived shooting splits ----------
    two_att          = data["FGA"] - data["3PA"]
    data["2P"]       = data["FG"] - data["3P"]
    data["2PA"]      = two_att
    data["eFG%"]     = round((data["FG"] + 0.5 * data["3P"]) / data["FGA"] * 100 ,2) if data["FGA"] else None
    data["2P%"]      = round(data["2P"] / two_att * 100 ,2)                           if two_att else None

    # ---------- Advanced metrics ----------
    try:
        data.update(scrape_advanced_metrics(player_name, season, debug=debug))
    except Exception as exc:
        if debug:
            logging.warning("%s advanced scrape failed: %s", player_name, exc)

    return data

def merge_injury_data(player_data: pd.DataFrame,
                      injury_data: pd.DataFrame | None) -> pd.DataFrame:
    """
    Attach four injury‑related columns. If a player has no injuries, leave the fields as NA
    (pd.NA) instead of empty strings so repeated runs compare equal.
    """
    import pandas as pd

    if player_data.empty:
        return player_data

    out = player_data.copy()

    # Ensure columns exist with NA defaults
    defaults = {
        "Injured": False,
        "Injury_Periods": pd.NA,
        "Total_Days_Injured": 0,
        "Injury_Risk": "Low Risk",
    }
    for c, v in defaults.items():
        if c not in out.columns:
            out[c] = v

    if injury_data is None or injury_data.empty:
        # normalize empties just in case
        out["Injury_Periods"] = out["Injury_Periods"].replace("", pd.NA)
        return out

    # Process each player/season
    for idx, row in out.iterrows():
        pname = row["Player"]
        season = row["Season"]

        mask = (injury_data["Season"] == season) & \
               (injury_data["Relinquished"].str.contains(pname, case=False, na=False))
        player_inj = injury_data.loc[mask]

        if player_inj.empty:
            continue  # keep defaults

        periods = []
        total_days = 0
        for _, inj in player_inj.iterrows():
            start = inj["Date"]
            # find the first acquired record after start
            got_back = injury_data[
                (injury_data["Date"] > start) &
                (injury_data["Acquired"].str.contains(pname, case=False, na=False))
            ]
            if not got_back.empty:
                end = got_back.iloc[0]["Date"]
            else:
                end_year = int(season.split("-")[1])
                end = pd.Timestamp(f"{end_year}-06-30")

            total_days += (end - start).days
            periods.append(f"{start:%Y-%m-%d} - {end:%Y-%m-%d}")

        out.at[idx, "Injured"] = True
        out.at[idx, "Injury_Periods"] = "; ".join(periods) if periods else pd.NA
        out.at[idx, "Total_Days_Injured"] = total_days

        if total_days < 10:
            risk = "Low Risk"
        elif total_days <= 20:
            risk = "Moderate Risk"
        else:
            risk = "High Risk"
        out.at[idx, "Injury_Risk"] = risk

    # final normalization
    out["Injury_Periods"] = out["Injury_Periods"].replace("", pd.NA)

    return out



Overwriting ../src/salary_nba_data_pull/process_utils.py


In [9]:
%%writefile ../src/salary_nba_data_pull/data_utils.py

import pandas as pd
import numpy as np
from pathlib import Path
from salary_nba_data_pull.process_utils import (
    inflate_value
)
from salary_nba_data_pull.quality import (
    ExpectedSchema, audit_dataframe, write_audit_reports
)
from salary_nba_data_pull.settings import DATA_PROCESSED_DIR

# Columns that clean_dataframe() must keep even when every value is NaN.
# NOTE: merge_salary_cap_data() rebinds this name at runtime to a superset that
# also contains the salary-cap column names, so the set can grow after import.
PRESERVE_EVEN_IF_ALL_NA = {
    "3P%", "Injured", "Injury_Periods", "Total_Days_Injured", "Injury_Risk"
}

# --- NEW helper ------------------------------------------------------
def load_salary_cap_parquet(path: str | Path, *, debug: bool = False) -> pd.DataFrame:
    """
    Load the pre‑inflated salary‑cap parquet file; fall back to CSV loader
    if the parquet is not found.
    """
    path = Path(path).expanduser().with_suffix(".parquet")
    if path.exists():
        if debug:
            print(f"[salary-cap] loading Parquet: {path}")
        return pd.read_parquet(path)
    # fallback to old CSV helper for legacy compatibility
    csv_path = path.with_suffix(".csv")
    if csv_path.exists():
        return load_salary_cap_csv(csv_path, debug=debug)
    raise FileNotFoundError(f"No salary‑cap parquet or CSV found at {path}")

def load_salary_cap_csv(path: str | Path, *, debug: bool = False) -> pd.DataFrame:
    """
    Load the preprocessed salary cap CSV (inflated) instead of scraping.
    We DO NOT fill or coerce silently – if a required column is missing,
    we log it and let the caller decide.
    """
    path = Path(path).expanduser().resolve()
    if debug:
        print(f"[salary-cap] loading local file: {path}")
    df = pd.read_csv(path)
    if debug:
        print(f"[salary-cap] rows={len(df)}, cols={df.columns.tolist()}")
    return df

def clean_dataframe(df):
    """
    Return a tidied version of ``df``.

    Steps, in order:
      1. drop columns whose label starts with ``Unnamed``;
      2. drop repeated column labels (first occurrence wins);
      3. drop columns that are entirely NaN, except those listed in
         ``PRESERVE_EVEN_IF_ALL_NA``;
      4. drop rows that are entirely NaN;
      5. collapse every column whose label contains ``Season`` into a single
         ``Season`` column;
      6. drop ``3PAr`` and ``FTr`` if present;
      7. round numeric columns to two decimals.

    Returns:
        The cleaned DataFrame.
    """
    # Labels are compared through their text form so integer/None labels do not
    # break the ``.str`` accessor.
    labels_as_text = df.columns.astype(str)
    df = df.loc[:, ~labels_as_text.str.contains('^Unnamed')]

    # Remove duplicate columns
    df = df.loc[:, ~df.columns.duplicated()]

    # Remove columns with all NaN values **except** ones we want to keep
    all_na = df.columns[df.isna().all()]
    to_drop = [c for c in all_na if c not in PRESERVE_EVEN_IF_ALL_NA]
    df = df.drop(columns=to_drop)

    # Remove rows with all NaN values
    df = df.dropna(axis=0, how='all')

    # Ensure only one 'Season' column exists.
    season_columns = [col for col in df.columns
                      if isinstance(col, str) and 'Season' in col]
    if len(season_columns) > 1:
        if 'Season' in season_columns:
            # A real 'Season' column already exists: keep it. Renaming another
            # column onto the same label and then dropping 'Season' would remove
            # both copies and lose the season entirely.
            keep = 'Season'
        else:
            keep = season_columns[0]
            df = df.rename(columns={keep: 'Season'})
        df = df.drop(columns=[col for col in season_columns if col != keep])

    # Remove '3PAr' and 'FTr' columns
    columns_to_remove = ['3PAr', 'FTr']
    df = df.drop(columns=columns_to_remove, errors='ignore')

    # Round numeric columns to 2 decimal places
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    df[numeric_columns] = df[numeric_columns].round(2)

    return df

def merge_salary_cap_data(player_data: pd.DataFrame,
                          salary_cap_data: pd.DataFrame,
                          *,
                          debug: bool = False) -> pd.DataFrame:
    """
    Attach salary-cap columns to the player rows, joining on the season's
    starting year (first four characters of ``Season``).

    The join is a left merge, so every player row survives. When either input
    is empty, ``player_data`` itself is returned untouched. A
    ``Salary_Cap_Inflated`` column is computed with ``inflate_value`` if the cap
    table does not already carry one. The cap column names are added to the
    module-level ``PRESERVE_EVEN_IF_ALL_NA`` set (a lasting side effect) before
    the result is passed through ``clean_dataframe``.
    """
    global PRESERVE_EVEN_IF_ALL_NA

    if player_data.empty or salary_cap_data.empty:
        if debug:
            print("[merge_salary_cap_data] one side empty -> returning player_data unchanged")
        return player_data

    # Work on private copies so the caller's frames are left alone.
    players = player_data.copy()
    cap_table = salary_cap_data.copy()

    # Join key: the year in which the season starts.
    players["Season_Year"] = players["Season"].str[:4].astype(int)
    cap_table["Season_Year"] = cap_table["Season"].str[:4].astype(int)

    if "Salary_Cap_Inflated" not in cap_table.columns:
        if debug:
            print("[merge_salary_cap_data] computing Salary_Cap_Inflated")

        def _inflate_row(r):
            return inflate_value(r.get("Salary Cap", np.nan), r.get("Season", ""))

        cap_table["Salary_Cap_Inflated"] = cap_table.apply(_inflate_row, axis=1)

    merged = players.merge(cap_table, on="Season_Year", how="left", suffixes=("", "_cap"))

    # Everything the cap table contributed, minus the join key.
    cap_cols = [name for name in cap_table.columns if name != "Season_Year"]

    # Where both sides had the same column, the merge produced a "<name>_cap"
    # twin: keep the player-side value and fall back to the cap-side one.
    for name in cap_cols:
        twin = f"{name}_cap"
        if twin not in merged.columns:
            continue
        merged[name] = merged[name].where(merged[name].notna(), merged[twin])
        merged = merged.drop(columns=[twin])

    merged = merged.drop(columns=["Season_Year"])

    # Shield the cap columns from clean_dataframe's all-NaN pruning.
    PRESERVE_EVEN_IF_ALL_NA = PRESERVE_EVEN_IF_ALL_NA.union(cap_cols)

    merged = clean_dataframe(merged)

    if debug:
        lost = [name for name in cap_cols if name not in merged.columns]
        if lost:
            print(f"[merge_salary_cap_data] WARNING missing cap cols after merge: {lost}")

    return merged

def load_external_salary_data(season: str,
                              root: Path | str = DATA_PROCESSED_DIR / "salary_external",
                              *, debug: bool = False) -> pd.DataFrame:
    """
    Load the player-salary parquet that an upstream job left on disk.

    The file is looked up at ``{root}/season={season}/part.parquet``. If it is
    not there, an empty frame with the columns ``Player``, ``Salary`` and
    ``Season`` is returned instead of raising.
    """
    salary_file = Path(root) / f"season={season}/part.parquet"

    if salary_file.exists():
        if debug:
            print(f"[salary‑ext] loading {salary_file}")
        return pd.read_parquet(salary_file)

    if debug:
        print(f"[salary‑ext] no salary file at {salary_file}")
    return pd.DataFrame(columns=["Player", "Salary", "Season"])

def validate_data(df: pd.DataFrame,
                  *,
                  name: str = "player_dataset",
                  save_reports: bool = True) -> pd.DataFrame:
    """
    Audit ``df`` against the player-dataset expectations and return it unchanged.

    Salary columns are optional: they are neither required nor range-checked.
    With ``save_reports=True`` the audit tables are written under
    ``DATA_PROCESSED_DIR / "audits"`` using ``name`` as the file prefix. A
    one-line notice is printed when the audit flags missing required columns.
    """
    required = ["Season", "Player", "Team"]            # salary deliberately absent
    key_dtypes = {"Season": "object", "Player": "object"}

    schema = ExpectedSchema(
        expected_cols=df.columns,
        required_cols=required,
        dtypes=key_dtypes,
        # salary / team-salary are not part of the range or constant checks
        non_negative_cols=["GP", "MP", "PTS", "TRB", "AST"],
        non_constant_cols=["PTS"],
        unique_key=["Season", "Player"],
    )

    reports = audit_dataframe(df, schema, name=name)

    if save_reports:
        write_audit_reports(reports, DATA_PROCESSED_DIR / "audits", prefix=name)

    # Short console summary of anything required but absent.
    overview = reports["cols_overview"]
    absent = overview.query("missing_required == True")
    if not absent.empty:
        print(f"[validate_data] Missing required columns: {absent['column'].tolist()}")

    return df


Overwriting ../src/salary_nba_data_pull/data_utils.py


In [10]:
%%writefile ../src/salary_nba_data_pull/quality.py
# src/salary_nba_data_pull/quality.py
from __future__ import annotations
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterable, Mapping, Any
import pandas as pd
import numpy as np

@dataclass
class ExpectedSchema:
    """
    Declarative description of what a dataframe is *intended* to look like.

    Instances are consumed by ``audit_dataframe`` / ``assert_dataframe_ok``,
    which compare a real dataframe against these expectations. Only
    ``expected_cols`` is mandatory; every other field defaults to "no
    constraint" (empty list / empty mapping).
    """
    # All columns we care about (order doesn't matter; the audit turns it into a set)
    expected_cols: Iterable[str]

    # Subset that must be present in the dataframe
    required_cols: Iterable[str] = field(default_factory=list)

    # Expected pandas dtypes keyed by column name, in string form
    # (e.g. 'float64', 'object'); compared against str(df[col].dtype)
    dtypes: Mapping[str, str] = field(default_factory=dict)

    # Numeric columns that must be >= 0
    non_negative_cols: Iterable[str] = field(default_factory=list)

    # Columns that must hold more than one distinct non-null value
    non_constant_cols: Iterable[str] = field(default_factory=list)

    # Unique key columns (together must be unique across rows)
    unique_key: Iterable[str] = field(default_factory=list)

    # Allowed value sets (enums), keyed by column name; nulls are not checked
    allowed_values: Mapping[str, Iterable[Any]] = field(default_factory=dict)

def _series_is_constant(s: pd.Series) -> bool:
    return s.nunique(dropna=True) <= 1

def audit_dataframe(df: pd.DataFrame,
                    schema: ExpectedSchema,
                    *,
                    name: str = "dataset") -> dict[str, pd.DataFrame]:
    """
    Return a dict of small DataFrames summarising quality checks.
    Nothing is printed; caller decides how to persist/log.

    Args:
        df: dataframe to inspect (never modified).
        schema: expectations to check against.
        name: accepted for symmetry with ``assert_dataframe_ok``; unused here.

    Returns:
        Mapping with the keys ``cols_overview``, ``null_report``,
        ``type_report``, ``value_report``, ``constant_report``, ``enum_report``
        and ``unique_report``. Every report has its columns even when it has no
        rows, so callers can filter on them unconditionally.
    """
    exp = set(schema.expected_cols)
    req = set(schema.required_cols)
    present = set(df.columns)
    non_negative = set(schema.non_negative_cols)
    non_constant = set(schema.non_constant_cols)

    # --- Column overview
    # Required columns are part of the universe: a column that is required but
    # neither expected nor present would otherwise never be listed, and
    # `missing_required` could never become True for it.
    universe = sorted(exp | present | req)
    cols_overview = pd.DataFrame({
        "column": universe,
        "expected": [c in exp for c in universe],
        "present": [c in present for c in universe],
        "required": [c in req for c in universe],
        "missing_required": [(c in req) and (c not in present) for c in universe],
    })

    # --- Null report
    null_report = (df.isna().sum().to_frame("null_count")
                     .assign(total_rows=len(df))
                     .assign(null_pct=lambda d: 100 * d["null_count"] / d["total_rows"])
                     .reset_index()
                     .rename(columns={"index": "column"}))

    # --- Dtype report
    type_rows = []
    for col in df.columns:
        exp_type = schema.dtypes.get(col)
        actual_type = str(df[col].dtype)
        type_rows.append({
            "column": col,
            "expected_dtype": exp_type,
            "actual_dtype": actual_type,
            # no expectation recorded for the column counts as a match
            "matches": (exp_type is None) or (actual_type == exp_type),
        })
    type_report = pd.DataFrame(
        type_rows,
        columns=["column", "expected_dtype", "actual_dtype", "matches"],
    )

    # --- Value checks (numeric columns only)
    value_rows = []
    for col in df.select_dtypes(include=[np.number]).columns:
        series = df[col]
        negatives = int((series < 0).sum())
        should_be_non_negative = col in non_negative
        value_rows.append({
            "column": col,
            "min": series.min(skipna=True),
            "max": series.max(skipna=True),
            "negatives": negatives,
            "zeros": int((series == 0).sum()),
            "non_zero_pct": 100 * (series != 0).sum() / len(series),
            "should_be_non_negative": should_be_non_negative,
            "violates_non_negative": negatives > 0 and should_be_non_negative,
        })
    # Explicit columns: a frame without numeric columns must still produce a
    # report that can be filtered on `violates_non_negative`.
    value_report = pd.DataFrame(
        value_rows,
        columns=["column", "min", "max", "negatives", "zeros", "non_zero_pct",
                 "should_be_non_negative", "violates_non_negative"],
    )

    # --- Constant columns
    constant_rows = []
    for col in df.columns:
        is_constant = _series_is_constant(df[col])
        should_vary = col in non_constant
        constant_rows.append({
            "column": col,
            "is_constant": is_constant,
            "should_not_be_constant": should_vary,
            "violates": is_constant and should_vary,
        })
    constant_report = pd.DataFrame(
        constant_rows,
        columns=["column", "is_constant", "should_not_be_constant", "violates"],
    )

    # --- Allowed values (nulls are never reported as bad)
    enum_rows = []
    for col, allowed in schema.allowed_values.items():
        if col not in df.columns:
            continue
        bad = ~df[col].isin(allowed) & df[col].notna()
        enum_rows.append({
            "column": col,
            "bad_count": int(bad.sum()),
            "sample_bad": df.loc[bad, col].drop_duplicates().head(5).tolist()
        })
    enum_report = pd.DataFrame(enum_rows, columns=["column", "bad_count", "sample_bad"])

    # --- Unique key: counts every row that shares its key with another row
    uniq_report = pd.DataFrame()
    if schema.unique_key:
        dup_mask = df.duplicated(subset=list(schema.unique_key), keep=False)
        uniq_report = pd.DataFrame({
            "duplicate_rows": [int(dup_mask.sum())],
            "subset": [list(schema.unique_key)]
        })

    return {
        "cols_overview": cols_overview,
        "null_report": null_report,
        "type_report": type_report,
        "value_report": value_report,
        "constant_report": constant_report,
        "enum_report": enum_report,
        "unique_report": uniq_report
    }

def assert_dataframe_ok(df: pd.DataFrame,
                        schema: ExpectedSchema,
                        *, name: str = "dataset") -> None:
    """
    Run ``audit_dataframe`` and fail loudly when a critical check is violated.

    Checked, in this order: missing required columns, dtype mismatches,
    negative values in non-negative columns, constant columns that should vary,
    duplicated unique-key rows. Intended for pytest / CI use.

    Raises:
        AssertionError: one line per failed check, prefixed with ``[name]``.
    """
    reports = audit_dataframe(df, schema, name=name)

    missing_required = reports["cols_overview"].query("missing_required == True")
    dtype_mismatch = reports["type_report"].query("matches == False")
    negatives = reports["value_report"].query("violates_non_negative == True")
    stuck = reports["constant_report"].query("violates == True")
    unique_report = reports["unique_report"]
    dupes = 0 if unique_report.empty else unique_report["duplicate_rows"].iloc[0]

    # (did the check fail?, lazily rendered message) in reporting order
    checks = [
        (not missing_required.empty,
         lambda: f"Missing required cols: {missing_required['column'].tolist()}"),
        (not dtype_mismatch.empty,
         lambda: f"Dtype mismatches: {dtype_mismatch[['column','expected_dtype','actual_dtype']].to_dict('records')}"),
        (not negatives.empty,
         lambda: f"Negative values in non-negative cols: {negatives['column'].tolist()}"),
        (not stuck.empty,
         lambda: f"Constant-but-shouldn't cols: {stuck['column'].tolist()}"),
        (bool(dupes),
         lambda: f"Duplicate key rows: {dupes}"),
    ]
    problems = [render() for failed, render in checks if failed]

    if problems:
        raise AssertionError(f"[{name}] data quality failures:\n" + "\n".join(problems))

def write_audit_reports(reports: Mapping[str, pd.DataFrame],
                        out_dir: Path,
                        prefix: str) -> None:
    """
    Dump every audit table to ``out_dir`` as ``{prefix}_{report name}.csv``.

    ``out_dir`` (and any missing parents) is created first; the frames are
    written without their index.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    for report_name in reports:
        target = out_dir / f"{prefix}_{report_name}.csv"
        reports[report_name].to_csv(target, index=False)

Overwriting ../src/salary_nba_data_pull/quality.py


In [11]:
%%writefile ../src/salary_nba_data_pull/main.py
import argparse
import pandas as pd
import logging
import time
import glob
import os
import hashlib
import numpy as np
from pathlib import Path
import pyarrow.parquet as pq
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm
import requests_cache
from salary_nba_data_pull.fetch_utils import fetch_all_players, fetch_season_players, fetch_league_standings
from salary_nba_data_pull.process_utils import (
    process_player_data,
    inflate_value,
    calculate_percentages,
    _ensure_cpi_ready,
)
from salary_nba_data_pull.scrape_utils import (
    scrape_salary_cap_history,
    load_injury_data,
    _season_advanced_df,
)
from salary_nba_data_pull.process_utils import merge_injury_data
from salary_nba_data_pull.data_utils import (
    clean_dataframe,
    merge_salary_cap_data,
    validate_data,
    load_salary_cap_csv,
    load_salary_cap_parquet,
    load_external_salary_data,
)
from salary_nba_data_pull.settings import DATA_PROCESSED_DIR

# Enable requests-cache for all HTTP traffic.
# Executed at import time: installs a cache named "nba_pull" on the sqlite
# backend and only stores responses with status code 200.
requests_cache.install_cache("nba_pull", backend="sqlite", allowable_codes=(200,))

# CPI self-test, also executed at import time. Per the original note it logs a
# warning once per run if CPI is unavailable (the helper lives in process_utils
# and is not visible here -- TODO confirm).
_ensure_cpi_ready(debug=False)

# Default number of worker threads; update_data falls back to this value when
# its `max_workers` argument is falsy.
DEFAULT_WORKERS = 8                # tweak ≤ CPU cores

def _almost_equal_numeric(a: pd.Series, b: pd.Series, atol=1e-6, rtol=1e-9):
    # Handle NA values first
    mask = a.isna() & b.isna()
    
    # For non-NA values, compare them
    both_numeric = pd.api.types.is_numeric_dtype(a) and pd.api.types.is_numeric_dtype(b)
    if not both_numeric:
        # For non-numeric columns, use pandas equals but handle NA carefully
        non_na_mask = ~(a.isna() | b.isna())
        eq_result = pd.Series(False, index=a.index)
        if non_na_mask.any():
            eq_result[non_na_mask] = a[non_na_mask].eq(b[non_na_mask])
        return eq_result | mask
    else:
        # For numeric columns, use numpy isclose
        non_na_mask = ~(a.isna() | b.isna())
        diff_ok = pd.Series(False, index=a.index)
        if non_na_mask.any():
            diff_ok[non_na_mask] = np.isclose(
                a[non_na_mask].astype(float), 
                b[non_na_mask].astype(float), 
                atol=atol, rtol=rtol
            )
        return diff_ok | mask

def _diff_report(old_df: pd.DataFrame,
                 new_df: pd.DataFrame,
                 key_cols=("Season","Player"),
                 numeric_atol=1e-6,
                 numeric_rtol=1e-9,
                 max_print=10):
    """
    Return (is_equal:boolean, summary_str:str, diff_rows:DataFrame)
    diff_rows has: key cols + column + old_val + new_val
    """
    # ensure same columns
    common = [c for c in new_df.columns if c in old_df.columns]
    old = old_df.reindex(columns=common)
    new = new_df.reindex(columns=common)

    # align order by keys if present
    if all(k in common for k in key_cols):
        old = old.sort_values(list(key_cols)).reset_index(drop=True)
        new = new.sort_values(list(key_cols)).reset_index(drop=True)
    else:
        key_cols = ("__row__",)
        old["__row__"] = range(len(old))
        new["__row__"] = range(len(new))
        old = old.sort_values("__row__").reset_index(drop=True)
        new = new.sort_values("__row__").reset_index(drop=True)

    if len(old) != len(new):
        return (False,
                f"Row count differs: old={len(old)}, new={len(new)}",
                pd.DataFrame())

    diffs = []
    for col in common:
        eq_mask = _almost_equal_numeric(old[col], new[col],
                                        atol=numeric_atol, rtol=numeric_rtol)
        if not eq_mask.all():
            idxs = np.where(~eq_mask)[0]
            for i in idxs:
                row_key_vals = {k: new.iloc[i][k] for k in key_cols}
                diffs.append({
                    **row_key_vals,
                    "column": col,
                    "old": old.iloc[i][col],
                    "new": new.iloc[i][col],
                })

    if not diffs:
        return (True, "No value-level diffs (within tolerance).", pd.DataFrame())

    diff_df = pd.DataFrame(diffs)
    # build summary
    cols_changed = diff_df["column"].nunique()
    rows_changed = diff_df[key_cols[0]].nunique()
    examples = diff_df.head(max_print)
    summary = (f"{len(diff_df)} cell diffs, {rows_changed} rows, "
               f"{cols_changed} columns. Showing first {len(examples)}:")
    return (False, summary, examples)

def _file_md5(path: str, chunk: int = 1 << 20) -> str:
    """Return md5 hexdigest for *path* streaming in 1 MiB chunks."""
    h = hashlib.md5()
    with open(path, "rb") as f:
        for blk in iter(lambda: f.read(chunk), b""):
            h.update(blk)
    return h.hexdigest()

def _season_partition_identical(season: str,
                                base_dir: Path | str,
                                new_df: pd.DataFrame) -> bool:
    """
    Return True if on-disk parquet for `season` is byte-wise equivalent (after
    canonical sort & column alignment) to `new_df`.
    """
    ckpt = Path(base_dir) / f"season={season}" / "part.parquet"
    if not ckpt.exists():
        return False

    try:
        old_df = pd.read_parquet(ckpt)
    except Exception as exc:
        logging.warning("[identical] failed to read %s → %s", ckpt, exc)
        return False

    # STEP B1: align columns and sort only by stable key
    cols = sorted(set(old_df.columns) | set(new_df.columns))
    key = ["Season","Player"]

    old_cmp = (old_df.reindex(columns=cols)
                     .sort_values(key)
                     .reset_index(drop=True))
    new_cmp = (new_df.reindex(columns=cols)
                     .sort_values(key)
                     .reset_index(drop=True))

    return old_cmp.equals(new_cmp)   # NaNs treated equal if aligned

def _season_partition_exists(season, base_dir):
    """Check if a season partition already exists in Parquet format."""
    return os.path.exists(os.path.join(base_dir, f"season={season}"))

def _player_task(args):
    """Wrapper for ThreadPoolExecutor."""
    (player_name, season, salary, all_players, debug) = args
    stats = process_player_data(player_name, season, all_players, debug=debug)
    if stats:
        stats['Salary'] = salary
    return stats

# ----------------------------------------------------------------------
def _pull_season_rows(season: str,
                      salary_df: pd.DataFrame,
                      season_has_salary: set,
                      ckpt_dir: Path,
                      *,
                      debug: bool,
                      helper_debug: bool,
                      small_debug: bool,
                      max_workers: int):
    """
    Fetch standings and roster for one season and run the per-player workers once.

    Side effect: (re)writes ``ckpt_dir / "missing_players.txt"`` with the salary
    rows whose player name is not in the fetched roster.

    Returns:
        ``(df_season, standings_df, missing)``: the raw per-player rows, the
        standings table (empty frame when none was returned) and the array of
        unmatched player names.
    """
    # --- Standings (wins/losses)
    standings_df = fetch_league_standings(season, debug=helper_debug)
    if standings_df is None:
        standings_df = pd.DataFrame()

    # --- Roster: salary rows drive the task list when available, otherwise
    #     every player on the fetched roster is processed without a salary.
    players_this_season = fetch_season_players(season, debug=helper_debug)
    if season in season_has_salary:
        rows = salary_df.query("Season == @season")
    else:
        rows = pd.DataFrame(columns=["Player", "Salary"])

    if not rows.empty:
        args = [
            (row.Player, season, row.Salary, players_this_season, helper_debug)
            for _, row in rows.iterrows()
        ]
    else:
        args = [
            (name.title(), season, None, players_this_season, helper_debug)
            for name in players_this_season.keys()
        ]

    # --- pre‑fetch season‑wide advanced table so workers reuse the cache
    _ = _season_advanced_df(season)

    # --- Player processing in parallel.
    # ThreadPoolExecutor rejects max_workers=0, so keep at least one worker even
    # when there is nothing to do (e.g. the roster fetch came back empty).
    results, failures = [], 0
    n_workers = max(1, min(max_workers or DEFAULT_WORKERS, len(args)))
    with ThreadPoolExecutor(max_workers=n_workers) as pool:
        futures = [pool.submit(_player_task, a) for a in args]
        for fut in tqdm(as_completed(futures),
                        total=len(args), desc=f"{season} workers", disable=small_debug):
            try:
                res = fut.result()
                if res:
                    results.append(res)
            except Exception as exc:
                failures += 1
                logging.exception("Worker failed for %s: %s", season, exc)
        if failures and debug:
            print(f"⚠️  {failures} worker threads raised exceptions")

    missing = rows.loc[~rows.Player.str.lower().isin(players_this_season.keys()),
                       "Player"].unique()
    (ckpt_dir / "missing_players.txt").write_text("\n".join(missing))

    return pd.DataFrame(results), standings_df, missing


def update_data(existing_data,
                start_year: int,
                end_year: int,
                *,
                player_filter: str = "all",
                min_avg_minutes: float | None = None,
                debug: bool = False,
                small_debug: bool = False,          # --- NEW
                max_workers: int = 8,
                output_base: str | Path = DATA_PROCESSED_DIR,
                overwrite: bool = False,
                min_season_rows: int = 150,
                max_retries: int = 1,
                retry_wait_s: float = 90) -> pd.DataFrame:
    """
    Pull seasons in [start_year, end_year] and write under `output_base`.
    When `small_debug` is True, suppress per‑player chatter and show only
    concise per‑season summaries.

    Args:
        existing_data: unused; kept for call compatibility.
        start_year, end_year: first and last season start year (inclusive).
        player_filter: unused; kept for call compatibility.
        min_avg_minutes: when set, rows with ``MP`` below it are dropped.
        debug, small_debug: verbosity switches (helper-level output is enabled
            only for ``debug and not small_debug``).
        max_workers: thread count for per-player processing.
        output_base: directory receiving ``season=<season>/part.parquet``.
        overwrite: rewrite a season partition even if it is unchanged.
        min_season_rows: a season yielding fewer raw rows is treated as suspect
            and fetched again.
        max_retries: number of extra fetch attempts for a suspect season.
        retry_wait_s: seconds to sleep before each retry.

    Returns:
        All processed seasons concatenated (empty frame if none produced rows).

    Raises:
        AssertionError: a season ends up with duplicate (Season, Player) keys.
    """
    output_base = Path(output_base)
    output_base.mkdir(parents=True, exist_ok=True)

    # Decide low-level debug for helpers
    helper_debug = debug and not small_debug

    injury = load_injury_data(debug=helper_debug)

    # Pull salary from parquet (or leave empty).
    # NOTE(review): this resolves to `<output_base>/../salary_external`, whereas
    # load_external_salary_data defaults to `DATA_PROCESSED_DIR / "salary_external"`;
    # confirm which location is intended.
    salary_dir = Path(output_base).parent / "salary_external"
    salary_df = pd.concat(
        [load_external_salary_data(f"{y}-{str(y+1)[-2:]}", root=salary_dir)
         for y in range(start_year, end_year + 1)],
        ignore_index=True
    )

    # if salary not available we'll still proceed
    season_has_salary = set(salary_df["Season"].unique())

    out_frames: list[pd.DataFrame] = []
    season_summaries: list[str] = []
    attempts_allowed = max(0, int(max_retries)) + 1

    for y in tqdm(range(start_year, end_year + 1),
                  desc="Seasons", disable=small_debug):
        season = f"{y}-{str(y+1)[-2:]}"
        ckpt_dir = output_base / f"season={season}"
        ckpt_dir.mkdir(parents=True, exist_ok=True)

        # ---------- fetch, with a bounded retry for suspiciously small seasons ----------
        # The retry happens in place so that seasons already processed are kept
        # and the remaining seasons are still visited. A retried season is always
        # rewritten, as the original single-season retry did.
        season_overwrite = overwrite
        for attempt in range(1, attempts_allowed + 1):
            df_season, standings_df, missing = _pull_season_rows(
                season, salary_df, season_has_salary, ckpt_dir,
                debug=debug, helper_debug=helper_debug,
                small_debug=small_debug, max_workers=max_workers,
            )
            print(f"[dbg] {season} processed players:", len(df_season))

            if len(df_season) >= min_season_rows:
                break
            if attempt < attempts_allowed:
                logging.warning("%s produced only %d rows; retrying after %s s",
                                season, len(df_season), retry_wait_s)
                time.sleep(retry_wait_s)
                season_overwrite = True
            else:
                logging.warning("%s still has only %d rows after %d attempt(s); "
                                "continuing with what was fetched",
                                season, len(df_season), attempt)

        if df_season.empty:
            # Build tiny summary anyway
            season_summaries.append(f"{season}: 0 players processed.")
            continue

        # --- Merge W/L (validate to prevent row blow‑ups)
        if not standings_df.empty:
            stand_df = standings_df.copy()
            if 'W' in stand_df.columns:
                stand_df.rename(columns={'W': 'Wins', 'L': 'Losses'}, inplace=True)
            if 'WINS' in stand_df.columns:
                stand_df.rename(columns={'WINS': 'Wins', 'LOSSES': 'Losses'}, inplace=True)
            if 'TEAM_ID' in stand_df.columns:
                stand_df.rename(columns={'TEAM_ID': 'TeamID'}, inplace=True)

            print(f"[dbg] {season} before standings merge:", len(df_season))
            df_season = pd.merge(
                df_season,
                stand_df[['TeamID', 'Wins', 'Losses']].drop_duplicates('TeamID'),
                on='TeamID', how='left', validate='m:1'
            )
            print(f"[dbg] {season} after standings merge:", len(df_season))

        # --- Minutes filter, injuries, percentages, cleanup
        merged_tmp2 = df_season if min_avg_minutes is None else df_season.query("MP >= @min_avg_minutes")
        print(f"[dbg] {season} after MP filter:", len(merged_tmp2))

        merged_tmp3 = merged_tmp2.pipe(merge_injury_data, injury_data=injury)
        print(f"[dbg] {season} after injury merge:", len(merged_tmp3))

        merged = (merged_tmp3
                    .pipe(calculate_percentages, debug=helper_debug)
                    .pipe(clean_dataframe))

        # ---- FINAL: enforce key uniqueness ----
        dups = merged.duplicated(subset=["Season","Player"], keep=False)
        if dups.any():
            print(f"[dbg] {season} DUPLICATE KEYS detected ({dups.sum()} rows). Dumping...")
            print(merged.loc[dups, ["Season","Player","Team","MP"]]
                        .sort_values(["Player","Team"]))
            # Hard fail so we never persist dirty data:
            raise AssertionError(f"Duplicate (Season,Player) keys in season {season}")

        # Deterministic sort & string normalization (blank strings -> NA)
        key_cols = ["Season","Player"]
        merged = merged.sort_values(key_cols).reset_index(drop=True)
        obj_cols = merged.select_dtypes(include=["object"]).columns
        for c in obj_cols:
            merged[c] = merged[c].replace(r"^\s*$", pd.NA, regex=True)

        print(f"[dbg] {season} final merged:", len(merged))

        # Skip identical season unless overwrite (or a retry) forces a rewrite
        if (not season_overwrite
            and (ckpt_dir / "part.parquet").exists()
            and _season_partition_identical(season, output_base, merged)):
            if debug and not small_debug:
                print(f"✓  {season} unchanged – skipping")
            out_frames.append(merged)
            continue
        elif debug and not small_debug and (ckpt_dir / "part.parquet").exists():
            print(f"↻  {season} differs – re-scraping")

        parquet_path = ckpt_dir / "part.parquet"
        merged.to_parquet(parquet_path, index=False)
        (ckpt_dir / "part.md5").write_text(_file_md5(parquet_path))

        out_frames.append(merged)
        logging.info("wrote %s", ckpt_dir)

        # --- concise summary
        if small_debug:
            n_players = len(merged)
            n_missing = len(missing)
            n_cols = merged.shape[1]
            season_summaries.append(
                f"{season}: {n_players} rows, {n_missing} missing roster matches, {n_cols} cols."
            )

    # Print all summaries once
    if small_debug and season_summaries:
        print("\n--- Season Summaries ---")
        for line in season_summaries:
            print(line)
        print("------------------------\n")

    return pd.concat(out_frames, ignore_index=True) if out_frames else pd.DataFrame()

def get_timestamp():
    """Current local time formatted as ``YYYY-MM-DD_HH-MM-SS`` (safe to embed in file names)."""
    return f"{datetime.now():%Y-%m-%d_%H-%M-%S}"

def remove_old_logs(log_dir, days_to_keep=7):
    """
    Delete ``stat_pull_log_*.txt`` files in ``log_dir`` whose modification time
    is more than ``days_to_keep`` days in the past. Other files are left alone.
    """
    now = datetime.now()
    max_age = timedelta(days=days_to_keep)
    pattern = os.path.join(log_dir, 'stat_pull_log_*.txt')

    for candidate in glob.glob(pattern):
        modified = datetime.fromtimestamp(os.path.getmtime(candidate))
        if now - modified <= max_age:
            continue            # still fresh enough
        os.remove(candidate)

def persist_final_dataset(new_data: pd.DataFrame,
                          seasons_loaded: list[str],
                          *,
                          output_base: Path,
                          debug: bool = False,
                          numeric_atol: float = 1e-6,
                          numeric_rtol: float = 1e-9,
                          max_print: int = 15) -> None:
    """
    Compare & overwrite the master **Parquet** deterministically.

    The rows of the existing master that belong to ``seasons_loaded`` are
    compared with ``new_data``; key-level and cell-level differences are printed
    and written as CSV files under ``output_base / "audits"``. When differences
    exist, the master is rewritten as "old rows whose (Season, Player) key is
    not in ``new_data``" plus ``new_data``, sorted by key.

    Args:
        new_data: freshly pulled rows. It is not modified.
        seasons_loaded: season labels covered by ``new_data``.
        output_base: directory holding ``nba_player_data_final_inflated.parquet``.
        debug: print progress notes.
        numeric_atol, numeric_rtol: tolerances for the cell-level comparison.
        max_print: number of example diffs printed to the console.

    Raises:
        AssertionError: the written master does not read back with the same row count.
    """
    output_base = Path(output_base)
    final_parquet = output_base / "nba_player_data_final_inflated.parquet"
    join_keys     = ["Season", "Player"]

    if final_parquet.exists():
        old_master = pd.read_parquet(final_parquet)
        if debug:
            print(f"[persist] loaded {len(old_master):,} rows from existing master")
    else:
        old_master = pd.DataFrame(columns=new_data.columns)

    # Work on a private copy so the caller's frame keeps its original key values.
    new_slice = new_data.reset_index(drop=True)

    # Canonicalize types for reliable joins
    for df_ in (old_master, new_slice):
        for k in join_keys:
            if k in df_.columns:
                df_[k] = df_[k].astype(str).str.strip()

    # Slice old master down to the seasons that were just loaded
    old_slice = old_master.merge(
        pd.DataFrame({ "Season": seasons_loaded }).drop_duplicates(),
        on="Season", how="inner"
    ).reset_index(drop=True)

    # Early exit if both empty
    if len(old_slice) == len(new_slice) == 0:
        if debug: print("[persist] nothing to compare/write")
        return

    # One timestamp for every audit file written by this call
    stamp = get_timestamp()
    audits_dir = output_base / "audits"

    # Key diff
    old_keys = old_slice[join_keys].drop_duplicates()
    new_keys = new_slice[join_keys].drop_duplicates()

    add = new_keys.merge(old_keys, on=join_keys, how="left", indicator=True)
    add = add[add["_merge"]=="left_only"].drop(columns="_merge")

    rem = old_keys.merge(new_keys, on=join_keys, how="left", indicator=True)
    rem = rem[rem["_merge"]=="left_only"].drop(columns="_merge")

    if len(add) or len(rem):
        print(f"[persist] Keys added={len(add)}, removed={len(rem)}")
        audits_dir.mkdir(parents=True, exist_ok=True)
        add.to_csv(audits_dir / f"keys_added_{stamp}.csv", index=False)
        rem.to_csv(audits_dir / f"keys_removed_{stamp}.csv", index=False)

    # Normalize NULL-like text columns
    null_like_cols = ["Injury_Periods"]
    for col in null_like_cols:
        if col in new_slice.columns:
            new_slice[col] = new_slice[col].replace("", pd.NA)
        if col in old_slice.columns:
            old_slice[col] = old_slice[col].replace("", pd.NA)

    # Value-level diff (kept for visibility)
    equal, summary, example_df = _diff_report(old_slice, new_slice,
                                              key_cols=join_keys,
                                              numeric_atol=numeric_atol,
                                              numeric_rtol=numeric_rtol,
                                              max_print=max_print)
    if equal:
        if debug:
            print("[persist] No changes detected – master Parquet left untouched")
        return

    print("[persist] Detected differences:")
    print("  " + summary)
    if not example_df.empty:
        print(example_df.to_string(index=False))
    audits_dir.mkdir(parents=True, exist_ok=True)
    # The audit file must hold *every* cell diff. One row can differ in several
    # columns, so the cap is rows x columns rather than the row count.
    cell_cap = max(len(new_slice), 1) * max(new_slice.shape[1], 1)
    _, _, full_diff = _diff_report(old_slice, new_slice,
                                   key_cols=join_keys,
                                   numeric_atol=numeric_atol,
                                   numeric_rtol=numeric_rtol,
                                   max_print=cell_cap)
    span = (f"{seasons_loaded[0]}_{seasons_loaded[-1]}"
            if len(seasons_loaded) else "unknown")
    full_diff.to_csv(audits_dir / f"diff_{span}_{stamp}.csv", index=False)

    # STEP C1: keep old rows whose key is not in the new data, then append new.
    # NOTE(review): old rows from `seasons_loaded` whose key disappeared from
    # `new_data` (the "removed" keys reported above) are retained here -- confirm
    # whether they are meant to be dropped from the master.
    remover = old_master.merge(new_keys, on=join_keys, how="left", indicator=True)
    remover = remover[remover["_merge"]=="left_only"].drop(columns="_merge")
    updated_master = pd.concat([remover, new_slice], ignore_index=True)

    # STEP C2: deterministic sort before writing
    updated_master = updated_master.sort_values(join_keys).reset_index(drop=True)
    updated_master.to_parquet(final_parquet, index=False)

    # quick sanity read‑back (explicit raise so `python -O` cannot strip it)
    reloaded = pd.read_parquet(final_parquet)
    if len(reloaded) != len(updated_master):
        raise AssertionError("[persist] row mismatch after Parquet write/read")
    if debug:
        print(f"[persist] Master Parquet updated → {final_parquet}")

def main(start_year: int,
         end_year: int,
         player_filter: str = "all",
         min_avg_minutes: float = 15,
         debug: bool = False,
         small_debug: bool = False,      # --- NEW
         workers: int = 8,
         overwrite: bool = False,
         output_base: str | Path = DATA_PROCESSED_DIR) -> None:
    """
    Entry point: pull the requested seasons, attach salary-cap data, validate
    the result and persist it under `output_base`.

    Parameters
    ----------
    start_year, end_year : passed positionally to `update_data` as the pull range.
    player_filter, min_avg_minutes, overwrite : forwarded to `update_data`.
    debug : selects DEBUG (instead of INFO) for the log file; forwarded to
        `update_data` and `persist_final_dataset`.
    small_debug : `True` prints only high-signal info. Forwarded to
        `update_data`; in this function it suppresses the "Completed pull"
        line, shortens the closing line and keeps the salary-cap helpers quiet.
    workers : forwarded to `update_data` as `max_workers`.
    output_base : output directory (created if missing). Log files are written
        to a sibling `stat_pull_output` directory.

    When both `debug` and `small_debug` are True, the prints and salary-cap
    helpers in this function follow `small_debug`; `update_data` receives both
    flags and decides for itself how to combine them.
    """
    t0 = time.time()
    output_base = Path(output_base)
    output_base.mkdir(parents=True, exist_ok=True)

    log_dir = output_base.parent / "stat_pull_output"
    log_dir.mkdir(parents=True, exist_ok=True)
    remove_old_logs(log_dir)

    log_file = log_dir / f"stat_pull_log_{get_timestamp()}.txt"
    # NOTE(review): logging.basicConfig does nothing when the root logger already
    # has handlers, so repeated calls in one process keep logging to the first file.
    logging.basicConfig(filename=log_file,
                        level=logging.DEBUG if debug else logging.INFO,
                        format="%(asctime)s - %(levelname)s - %(message)s")

    # Salary-cap helpers are verbose only in full-debug mode.
    helper_debug = debug and not small_debug

    updated = update_data(None, start_year, end_year,
                          player_filter=player_filter,
                          min_avg_minutes=min_avg_minutes,
                          debug=debug,
                          small_debug=small_debug,          # --- NEW
                          max_workers=workers,
                          output_base=str(output_base),
                          overwrite=overwrite)

    if not small_debug:  # keep your old prints in full/quiet modes
        print(f"✔ Completed pull: {len(updated):,} rows added")

    if not updated.empty:
        # ---------------- Salary Cap -----------------
        # Try the local file first; scrape only when the loader reports it missing.
        cap_file = output_base / "salary_cap_history_inflated"

        try:
            salary_cap = load_salary_cap_parquet(cap_file, debug=helper_debug)
        except FileNotFoundError:
            # Last resort: no local file, so scrape and cache whatever comes back.
            if helper_debug:
                print("[salary-cap] local file missing, attempting scrape…")
            salary_cap = scrape_salary_cap_history(debug=helper_debug)
            if salary_cap is not None:
                # Save as both Parquet and CSV for compatibility
                salary_cap.to_parquet(f"{cap_file}.parquet", index=False)
                salary_cap.to_csv(f"{cap_file}.csv", index=False)

        if salary_cap is not None:
            updated = merge_salary_cap_data(updated, salary_cap, debug=helper_debug)
        else:
            if debug:
                print("[salary-cap] No data merged — check local file path.")

        # --------------- Validate --------------------
        updated = validate_data(updated, name="player_dataset", save_reports=True)

        seasons_this_run = sorted(updated["Season"].unique().tolist())
        persist_final_dataset(updated,
                              seasons_loaded=seasons_this_run,
                              output_base=output_base,
                              debug=debug)

    if not small_debug:
        print(f"Process finished in {time.time() - t0:.1f} s — log: {log_file}")
    else:
        # minimal closing line
        print(f"Done in {time.time() - t0:.1f}s. Log: {log_file}")
        
# ----------------------------------------------------------------------
# argparse snippet
if __name__ == "__main__":
    # Default window: previous calendar year through the current one.
    this_year = datetime.now().year

    parser = argparse.ArgumentParser()
    parser.add_argument("--start_year", type=int, default=this_year - 1)
    parser.add_argument("--end_year", type=int, default=this_year)
    parser.add_argument("--player_filter", default="all")
    parser.add_argument("--min_avg_minutes", type=float, default=15)
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("--small_debug", action="store_true")   # --- NEW
    parser.add_argument("--workers", type=int, default=8)
    parser.add_argument("--overwrite", action="store_true")
    parser.add_argument(
        "--output_base",
        default=str(DATA_PROCESSED_DIR),
        help="Destination root for parquet + csv outputs",
    )

    # Option names mirror main()'s parameters, so the namespace unpacks directly.
    cli_args = parser.parse_args()
    main(**vars(cli_args))


Overwriting ../src/salary_nba_data_pull/main.py


# Testing in a notebook

In [1]:
# %%writefile ../src/salary_nba_data_pull/notebook_helper.py
"""
Notebook/REPL helper utilities for salary_nba_data_pull.

Goals
-----
• Work no matter where the notebook is opened (absolute paths).
• Avoid NameError on __file__.
• Keep hot‑reload for iterative dev.
• Forward arbitrary args to main() so we can test all scenarios.

Use:
>>> import salary_nba_data_pull.notebook_helper as nb
>>> nb.quick_pull(2024, workers=12, debug=True)
"""

from __future__ import annotations
import sys, importlib, inspect, os
from pathlib import Path
import requests_cache
from typing import Iterable

def _find_repo_root(start: Path | None = None) -> Path:
    """Find the repository root by looking for pyproject.toml or .git."""
    markers = {"pyproject.toml", ".git"}
    here = (start or Path.cwd()).resolve()
    for p in [here] + list(here.parents):
        if any((p / m).exists() for m in markers):
            return p
    return here

# Put the repo root and its src/ directory at the front of sys.path (defensive)
ROOT = _find_repo_root()
SRC  = ROOT / "src"
for p in (ROOT, SRC):
    if not p.is_dir():
        continue
    if str(p) in sys.path:
        continue
    sys.path.insert(0, str(p))

# Sanity print (can be silenced): shown under a Jupyter kernel or when run as a script
if "JPY_PARENT_PID" in os.environ or __name__ == "__main__":
    print(f"[notebook_helper] sys.path[0:3]={sys.path[:3]}")

# Package imports have to wait until sys.path has been adjusted above
try:
    from salary_nba_data_pull import main as nba_main
    from salary_nba_data_pull.settings import DATA_PROCESSED_DIR
    from salary_nba_data_pull.fetch_utils import clear_cache as _cc
except ImportError as e:
    # Show where we looked before re-raising, so a bad path is easy to spot.
    print(f"❌ Failed to import salary_nba_data_pull: {e}")
    print(f"   ROOT={ROOT}")
    print(f"   SRC={SRC}")
    print(f"   sys.path[0:3]={sys.path[:3]}")
    raise
else:
    print("✅ salary_nba_data_pull imported successfully")
    
    
def _reload():
    """Reload the main module so code edits are picked up."""
    importlib.reload(nba_main)

def quick_pull(season: int, **kwargs):
    """Reload the package, then run ``main`` for one season.

    ``season`` is used as both ``start_year`` and ``end_year``; any extra
    keyword arguments are passed straight through to ``nba_main.main``.
    """
    _reload()
    print(f"[quick_pull] season={season}, kwargs={kwargs}")
    run = nba_main.main
    run(start_year=season, end_year=season, **kwargs)

def historical_pull(start_year: int, end_year: int, **kwargs):
    """Reload the package, then run ``main`` over ``start_year``..``end_year``.

    Extra keyword arguments are passed straight through to ``nba_main.main``.
    """
    _reload()
    print(f"[historical_pull] {start_year}-{end_year}, kwargs={kwargs}")
    run = nba_main.main
    run(start_year=start_year, end_year=end_year, **kwargs)

def check_existing_data(base: Path | str | None = None) -> list[str]:
    base = Path(base) if base else DATA_PROCESSED_DIR
    seasons = sorted(d.name.split("=", 1)[-1] for d in base.glob("season=*") if d.is_dir())
    print(f"[check_existing_data] found {len(seasons)} seasons in {base}")
    return seasons

def load_parquet_data(season: str | None = None, *, base: Path | str | None = None):
    import pandas as pd
    base = Path(base) if base else DATA_PROCESSED_DIR
    files = list(base.glob(f"season={season}/part.parquet")) if season else list(base.glob("season=*/part.parquet"))
    if not files:
        print("[load_parquet_data] No parquet files found.")
        return pd.DataFrame()
    print(f"[load_parquet_data] loading {len(files)} files from {base}")
    return pd.concat((pd.read_parquet(f) for f in files), ignore_index=True)

def clear_all_caches():
    """Call ``requests_cache.clear()`` and then ``fetch_utils.clear_cache``, and confirm."""
    for purge in (requests_cache.clear, _cc):
        purge()
    print("✅ caches cleared")

def print_args():
    """Print one line per parameter of ``nba_main.main``: name, default and kind."""
    for param in inspect.signature(nba_main.main).parameters.values():
        name = param.name
        print(f"{name:<15} default={param.default!r}  kind={param.kind}")

def query_data(sql: str, db: str | None = None):
    """
    Run arbitrary SQL against the DuckDB lake. Example:
        query_data("SELECT COUNT(*) FROM parquet_scan('data/new_processed/season=*/part.parquet')")

    Parameters
    ----------
    sql : statement to execute.
    db : path to the DuckDB file; defaults to ``nba_stats.duckdb`` next to
        ``DATA_PROCESSED_DIR``.

    Returns the result of ``fetchdf()``. The database is opened read-only and
    the connection is closed when the call returns.
    """
    import duckdb
    db = db or (DATA_PROCESSED_DIR.parent / "nba_stats.duckdb")
    with duckdb.connect(str(db), read_only=True) as con:
        return con.execute(sql).fetchdf()


if __name__ == "__main__":
    # Show the signature of main() so the keyword arguments below are easy to check.
    print_args()
    # quick_pull(2023, workers=4, debug=True)

    # Single start year here; widen the range (e.g. 2012, 2024) for a multi-season pull.
    historical_pull(
        2024,
        2024,
        workers=6,
        min_avg_minutes=10,
        overwrite=False,
        debug=True,
    )
    check_existing_data()              # see which seasons are cached
    df = load_parquet_data("2023-24")  # inspect a single season


[notebook_helper] sys.path[0:3]=['C:\\docker_projects\\coach_analysis', 'C:\\Users\\ghadf\\AppData\\Roaming\\uv\\python\\cpython-3.10.17-windows-x86_64-none\\python310.zip', 'C:\\Users\\ghadf\\AppData\\Roaming\\uv\\python\\cpython-3.10.17-windows-x86_64-none\\DLLs']


  from .autonotebook import tqdm as notebook_tqdm


✅ salary_nba_data_pull imported successfully
start_year      default=<class 'inspect._empty'>  kind=POSITIONAL_OR_KEYWORD
end_year        default=<class 'inspect._empty'>  kind=POSITIONAL_OR_KEYWORD
player_filter   default='all'  kind=POSITIONAL_OR_KEYWORD
min_avg_minutes default=15  kind=POSITIONAL_OR_KEYWORD
debug           default=False  kind=POSITIONAL_OR_KEYWORD
small_debug     default=False  kind=POSITIONAL_OR_KEYWORD
workers         default=8  kind=POSITIONAL_OR_KEYWORD
overwrite       default=False  kind=POSITIONAL_OR_KEYWORD
output_base     default=WindowsPath('C:/docker_projects/coach_analysis/data/new_processed')  kind=POSITIONAL_OR_KEYWORD
[historical_pull] 2024-2024, kwargs={'workers': 6, 'min_avg_minutes': 10, 'overwrite': False, 'debug': True}
[load_injury_data] loaded 37,667 rows from C:\docker_projects\coach_analysis\data\new_processed\NBA Player Injury Stats(1951 - 2023).csv


Seasons:   0%|          | 0/1 [00:00<?, ?it/s]

[fetch_season_players] 578 players for 2024-25
[adv] fetching https://www.basketball-reference.com/leagues/NBA_2025_advanced.html





[adv] Trey Alexander → {'PER': 2.9, 'TS%': 0.374, '3PAr': 0.415, 'FTr': 0.098, 'ORB%': 1.0, 'DRB%': 9.8, 'TRB%': 5.6, 'AST%': 11.4, 'STL%': 0.8, 'BLK%': 0.8, 'TOV%': 10.5, 'USG%': 17.3, 'OWS': -0.2, 'DWS': 0.0, 'WS': -0.2, 'WS/48': -0.077, 'OBPM': -7.9, 'DBPM': -1.8, 'BPM': -9.7, 'VORP': -0.2}
[adv] Steven Adams → {'PER': 16.6, 'TS%': 0.541, '3PAr': 0.012, 'FTr': 0.557, 'ORB%': 21.8, 'DRB%': 22.0, 'TRB%': 21.9, 'AST%': 10.9, 'STL%': 1.3, 'BLK%': 3.2, 'TOV%': 20.6, 'USG%': 13.6, 'OWS': 1.0, 'DWS': 1.3, 'WS': 2.3, 'WS/48': 0.137, 'OBPM': 0.0, 'DBPM': 0.3, 'BPM': 0.3, 'VORP': 0.5}
[adv] Santi Aldama → {'PER': 16.8, 'TS%': 0.588, '3PAr': 0.503, 'FTr': 0.145, 'ORB%': 6.1, 'DRB%': 20.3, 'TRB%': 13.3, 'AST%': 15.2, 'STL%': 1.5, 'BLK%': 1.6, 'TOV%': 9.3, 'USG%': 18.5, 'OWS': 2.9, 'DWS': 2.0, 'WS': 5.0, 'WS/48': 0.144, 'OBPM': 2.5, 'DBPM': 0.7, 'BPM': 3.2, 'VORP': 2.2}
[adv] Precious Achiuwa → {'PER': 13.6, 'TS%': 0.53, '3PAr': 0.11, 'FTr': 0.211, 'ORB%': 10.0, 'DRB%': 21.0, 'TRB%': 15.5, 'AST%

2024-25 workers:  11%|█         | 63/578 [00:00<00:01, 318.25it/s][A


[adv] Jalen Brunson → {'PER': 21.6, 'TS%': 0.605, '3PAr': 0.331, 'FTr': 0.373, 'ORB%': 1.4, 'DRB%': 7.9, 'TRB%': 4.7, 'AST%': 32.3, 'STL%': 1.3, 'BLK%': 0.3, 'TOV%': 10.5, 'USG%': 29.5, 'OWS': 6.8, 'DWS': 1.4, 'WS': 8.3, 'WS/48': 0.172, 'OBPM': 4.7, 'DBPM': -1.3, 'BPM': 3.3, 'VORP': 3.1}
[adv] Moses Brown → {'PER': 22.4, 'TS%': 0.706, '3PAr': 0.0, 'FTr': 0.224, 'ORB%': 11.3, 'DRB%': 29.0, 'TRB%': 20.3, 'AST%': 2.9, 'STL%': 2.4, 'BLK%': 2.9, 'TOV%': 17.0, 'USG%': 23.5, 'OWS': 0.2, 'DWS': 0.2, 'WS': 0.4, 'WS/48': 0.146, 'OBPM': -0.3, 'DBPM': 0.5, 'BPM': 0.1, 'VORP': 0.1}
[adv] Kobe Bufkin → {'PER': 10.5, 'TS%': 0.483, '3PAr': 0.404, 'FTr': 0.383, 'ORB%': 4.3, 'DRB%': 14.5, 'TRB%': 9.3, 'AST%': 18.2, 'STL%': 1.1, 'BLK%': 1.5, 'TOV%': 12.7, 'USG%': 20.8, 'OWS': 0.0, 'DWS': 0.1, 'WS': 0.0, 'WS/48': 0.018, 'OBPM': -3.4, 'DBPM': -0.5, 'BPM': -4.0, 'VORP': -0.1}
[adv] Thomas Bryant → {'PER': 17.2, 'TS%': 0.614, '3PAr': 0.461, 'FTr': 0.199, 'ORB%': 9.7, 'DRB%': 19.2, 'TRB%': 14.5, 'AST%': 7.6, 

2024-25 workers:  23%|██▎       | 131/578 [00:00<00:01, 330.06it/s][A

[adv] Pj Dozier → {'PER': 9.5, 'TS%': 0.621, '3PAr': 1.0, 'FTr': 2.0, 'ORB%': 3.2, 'DRB%': 12.5, 'TRB%': 7.9, 'AST%': 18.1, 'STL%': 1.4, 'BLK%': 0.0, 'TOV%': 15.1, 'USG%': 8.2, 'OWS': 0.0, 'DWS': 0.0, 'WS': 0.1, 'WS/48': 0.107, 'OBPM': -2.0, 'DBPM': 2.7, 'BPM': 0.7, 'VORP': 0.0}
[adv] Chris Duarte → {'PER': 16.5, 'TS%': 0.61, '3PAr': 0.577, 'FTr': 0.308, 'ORB%': 7.2, 'DRB%': 20.9, 'TRB%': 14.2, 'AST%': 16.6, 'STL%': 0.0, 'BLK%': 0.0, 'TOV%': 3.3, 'USG%': 17.2, 'OWS': 0.2, 'DWS': 0.0, 'WS': 0.3, 'WS/48': 0.171, 'OBPM': 2.2, 'DBPM': -1.0, 'BPM': 1.2, 'VORP': 0.1}
[adv] Alex Ducas → {'PER': 10.1, 'TS%': 0.583, '3PAr': 0.7, 'FTr': 0.067, 'ORB%': 7.9, 'DRB%': 14.3, 'TRB%': 11.2, 'AST%': 4.8, 'STL%': 1.9, 'BLK%': 0.0, 'TOV%': 11.5, 'USG%': 11.8, 'OWS': 0.1, 'DWS': 0.2, 'WS': 0.3, 'WS/48': 0.125, 'OBPM': -2.1, 'DBPM': 0.1, 'BPM': -2.0, 'VORP': 0.0}
[adv] Andre Drummond → {'PER': 16.0, 'TS%': 0.535, '3PAr': 0.085, 'FTr': 0.385, 'ORB%': 15.8, 'DRB%': 31.6, 'TRB%': 23.4, 'AST%': 6.8, 'STL%': 2.6




[adv] James Harden → {'PER': 20.0, 'TS%': 0.582, '3PAr': 0.516, 'FTr': 0.446, 'ORB%': 2.3, 'DRB%': 16.1, 'TRB%': 9.3, 'AST%': 36.8, 'STL%': 2.1, 'BLK%': 2.0, 'TOV%': 18.0, 'USG%': 29.6, 'OWS': 4.0, 'DWS': 4.3, 'WS': 8.3, 'WS/48': 0.143, 'OBPM': 3.5, 'DBPM': 0.8, 'BPM': 4.3, 'VORP': 4.4}
[adv] Jaden Hardy → {'PER': 10.4, 'TS%': 0.549, '3PAr': 0.476, 'FTr': 0.208, 'ORB%': 1.1, 'DRB%': 9.4, 'TRB%': 5.3, 'AST%': 13.2, 'STL%': 1.5, 'BLK%': 0.5, 'TOV%': 15.4, 'USG%': 25.3, 'OWS': -0.8, 'DWS': 0.5, 'WS': -0.3, 'WS/48': -0.016, 'OBPM': -2.8, 'DBPM': -1.9, 'BPM': -4.7, 'VORP': -0.6}
[adv] Elijah Harkless → {'PER': 7.8, 'TS%': 0.425, '3PAr': 0.714, 'FTr': 0.171, 'ORB%': 7.1, 'DRB%': 9.3, 'TRB%': 8.2, 'AST%': 7.7, 'STL%': 3.5, 'BLK%': 0.7, 'TOV%': 7.4, 'USG%': 12.3, 'OWS': 0.0, 'DWS': 0.1, 'WS': 0.1, 'WS/48': 0.023, 'OBPM': -5.2, 'DBPM': 1.5, 'BPM': -3.8, 'VORP': -0.1}
[adv] Gary Harris → {'PER': 6.9, 'TS%': 0.524, '3PAr': 0.759, 'FTr': 0.09, 'ORB%': 3.1, 'DRB%': 7.5, 'TRB%': 5.2, 'AST%': 5.8, 'S

2024-25 workers:  46%|████▌     | 267/578 [00:00<00:00, 328.89it/s][A


[adv] Deandre Jordan → {'PER': 15.4, 'TS%': 0.621, '3PAr': 0.0, 'FTr': 0.457, 'ORB%': 14.0, 'DRB%': 30.3, 'TRB%': 22.6, 'AST%': 9.5, 'STL%': 1.1, 'BLK%': 3.5, 'TOV%': 19.2, 'USG%': 12.8, 'OWS': 0.9, 'DWS': 0.9, 'WS': 1.8, 'WS/48': 0.127, 'OBPM': -1.4, 'DBPM': 0.2, 'BPM': -1.2, 'VORP': 0.2}
[adv] no advanced stats for Nikola Jovic in 2024-25
[adv] Johnny Juzang → {'PER': 11.8, 'TS%': 0.573, '3PAr': 0.661, 'FTr': 0.112, 'ORB%': 3.6, 'DRB%': 12.0, 'TRB%': 7.8, 'AST%': 7.9, 'STL%': 1.5, 'BLK%': 0.6, 'TOV%': 6.7, 'USG%': 17.6, 'OWS': 1.3, 'DWS': 0.3, 'WS': 1.6, 'WS/48': 0.06, 'OBPM': -0.9, 'DBPM': -1.4, 'BPM': -2.3, 'VORP': -0.1}
[adv] Yuki Kawamura → {'PER': 11.0, 'TS%': 0.53, '3PAr': 0.767, 'FTr': 0.3, 'ORB%': 0.0, 'DRB%': 13.5, 'TRB%': 6.8, 'AST%': 25.1, 'STL%': 1.0, 'BLK%': 0.0, 'TOV%': 12.8, 'USG%': 16.9, 'OWS': 0.1, 'DWS': 0.1, 'WS': 0.2, 'WS/48': 0.095, 'OBPM': -2.6, 'DBPM': -0.2, 'BPM': -2.8, 'VORP': 0.0}
[adv] Luke Kennard → {'PER': 13.6, 'TS%': 0.631, '3PAr': 0.606, 'FTr': 0.132, 

2024-25 workers:  58%|█████▊    | 337/578 [00:01<00:00, 337.19it/s][A

[adv] De'Anthony Melton → {'PER': 13.3, 'TS%': 0.539, '3PAr': 0.648, 'FTr': 0.148, 'ORB%': 5.2, 'DRB%': 12.8, 'TRB%': 8.9, 'AST%': 21.1, 'STL%': 2.8, 'BLK%': 1.6, 'TOV%': 14.8, 'USG%': 23.5, 'OWS': 0.0, 'DWS': 0.2, 'WS': 0.2, 'WS/48': 0.076, 'OBPM': -1.6, 'DBPM': 0.8, 'BPM': -0.8, 'VORP': 0.0}
[adv] Jack Mcveigh → {'PER': 4.0, 'TS%': 0.412, '3PAr': 0.765, 'FTr': 0.0, 'ORB%': 4.9, 'DRB%': 7.6, 'TRB%': 6.2, 'AST%': 3.0, 'STL%': 0.0, 'BLK%': 4.2, 'TOV%': 10.5, 'USG%': 18.2, 'OWS': -0.1, 'DWS': 0.0, 'WS': 0.0, 'WS/48': -0.031, 'OBPM': -4.8, 'DBPM': -3.0, 'BPM': -7.7, 'VORP': -0.1}
[adv] Sam Merrill → {'PER': 11.1, 'TS%': 0.583, '3PAr': 0.868, 'FTr': 0.068, 'ORB%': 3.0, 'DRB%': 8.9, 'TRB%': 6.1, 'AST%': 9.6, 'STL%': 1.8, 'BLK%': 0.9, 'TOV%': 7.4, 'USG%': 14.3, 'OWS': 1.8, 'DWS': 1.5, 'WS': 3.3, 'WS/48': 0.113, 'OBPM': -0.4, 'DBPM': 0.7, 'BPM': 0.3, 'VORP': 0.8}
[adv] no advanced stats for Vasilije Micic in 2024-25
[adv] Khris Middleton → {'PER': 16.7, 'TS%': 0.588, '3PAr': 0.395, 'FTr': 0.2



[adv] Julian Phillips → {'PER': 10.2, 'TS%': 0.571, '3PAr': 0.533, 'FTr': 0.265, 'ORB%': 6.4, 'DRB%': 9.4, 'TRB%': 7.9, 'AST%': 4.4, 'STL%': 1.6, 'BLK%': 1.5, 'TOV%': 7.8, 'USG%': 12.9, 'OWS': 1.0, 'DWS': 0.8, 'WS': 1.9, 'WS/48': 0.08, 'OBPM': -2.6, 'DBPM': -0.4, 'BPM': -3.0, 'VORP': -0.3}
[adv] Scotty Pippen Jr. → {'PER': 15.8, 'TS%': 0.582, '3PAr': 0.369, 'FTr': 0.288, 'ORB%': 4.1, 'DRB%': 12.1, 'TRB%': 8.1, 'AST%': 27.1, 'STL%': 2.8, 'BLK%': 1.6, 'TOV%': 17.1, 'USG%': 19.3, 'OWS': 2.4, 'DWS': 2.3, 'WS': 4.7, 'WS/48': 0.133, 'OBPM': 0.0, 'DBPM': 1.8, 'BPM': 1.9, 'VORP': 1.7}
[adv] Jalen Pickett → {'PER': 10.9, 'TS%': 0.545, '3PAr': 0.561, 'FTr': 0.044, 'ORB%': 2.6, 'DRB%': 8.8, 'TRB%': 5.9, 'AST%': 19.5, 'STL%': 1.4, 'BLK%': 0.7, 'TOV%': 11.6, 'USG%': 13.2, 'OWS': 0.8, 'DWS': 0.3, 'WS': 1.1, 'WS/48': 0.078, 'OBPM': -0.8, 'DBPM': -0.7, 'BPM': -1.5, 'VORP': 0.1}
[adv] Mason Plumlee → {'PER': 14.8, 'TS%': 0.645, '3PAr': 0.02, 'FTr': 0.634, 'ORB%': 10.2, 'DRB%': 28.4, 'TRB%': 19.5, 'AST%



[adv] no advanced stats for Alperen Sengun in 2024-25
[adv] Brice Sensabaugh → {'PER': 13.4, 'TS%': 0.612, '3PAr': 0.621, 'FTr': 0.122, 'ORB%': 3.2, 'DRB%': 12.6, 'TRB%': 7.9, 'AST%': 12.0, 'STL%': 1.5, 'BLK%': 0.5, 'TOV%': 14.5, 'USG%': 21.5, 'OWS': 0.9, 'DWS': 0.3, 'WS': 1.2, 'WS/48': 0.042, 'OBPM': 0.4, 'DBPM': -1.6, 'BPM': -1.2, 'VORP': 0.3}
[adv] Collin Sexton → {'PER': 16.6, 'TS%': 0.593, '3PAr': 0.313, 'FTr': 0.29, 'ORB%': 3.7, 'DRB%': 6.8, 'TRB%': 5.2, 'AST%': 25.1, 'STL%': 1.3, 'BLK%': 0.3, 'TOV%': 14.1, 'USG%': 27.0, 'OWS': 2.5, 'DWS': -0.1, 'WS': 2.4, 'WS/48': 0.066, 'OBPM': 1.7, 'DBPM': -2.3, 'BPM': -0.6, 'VORP': 0.6}
[adv] Landry Shamet → {'PER': 9.7, 'TS%': 0.601, '3PAr': 0.678, 'FTr': 0.078, 'ORB%': 1.7, 'DRB%': 7.5, 'TRB%': 4.6, 'AST%': 4.7, 'STL%': 1.5, 'BLK%': 0.2, 'TOV%': 8.5, 'USG%': 14.8, 'OWS': 0.5, 'DWS': 0.5, 'WS': 1.0, 'WS/48': 0.065, 'OBPM': -1.9, 'DBPM': -0.6, 'BPM': -2.4, 'VORP': -0.1}
[adv] Terrence Shannon Jr. → {'PER': 11.9, 'TS%': 0.56, '3PAr': 0.272, 'F

2024-25 workers: 100%|██████████| 578/578 [00:01<00:00, 320.66it/s]

[adv] Keaton Wallace → {'PER': 10.1, 'TS%': 0.503, '3PAr': 0.506, 'FTr': 0.068, 'ORB%': 1.5, 'DRB%': 9.6, 'TRB%': 5.5, 'AST%': 21.0, 'STL%': 2.6, 'BLK%': 1.7, 'TOV%': 16.5, 'USG%': 16.4, 'OWS': -0.1, 'DWS': 0.5, 'WS': 0.4, 'WS/48': 0.034, 'OBPM': -3.3, 'DBPM': 1.0, 'BPM': -2.3, 'VORP': 0.0}[adv] Cason Wallace → {'PER': 12.3, 'TS%': 0.562, '3PAr': 0.425, 'FTr': 0.076, 'ORB%': 4.1, 'DRB%': 8.8, 'TRB%': 6.5, 'AST%': 11.5, 'STL%': 3.1, 'BLK%': 1.9, 'TOV%': 10.5, 'USG%': 12.8, 'OWS': 1.8, 'DWS': 3.4, 'WS': 5.2, 'WS/48': 0.133, 'OBPM': -1.5, 'DBPM': 2.6, 'BPM': 1.0, 'VORP': 1.5}

[adv] Ja'Kobe Walter → {'PER': 10.8, 'TS%': 0.523, '3PAr': 0.473, 'FTr': 0.211, 'ORB%': 5.6, 'DRB%': 10.1, 'TRB%': 7.8, 'AST%': 10.2, 'STL%': 1.9, 'BLK%': 0.9, 'TOV%': 10.4, 'USG%': 18.2, 'OWS': 0.3, 'DWS': 0.9, 'WS': 1.2, 'WS/48': 0.052, 'OBPM': -2.3, 'DBPM': -0.6, 'BPM': -3.0, 'VORP': -0.3}
[adv] Jordan Walsh → {'PER': 6.9, 'TS%': 0.464, '3PAr': 0.663, 'FTr': 0.145, 'ORB%': 6.2, 'DRB%': 12.4, 'TRB%': 9.3, 'AST%': 


Seasons:   0%|          | 0/1 [00:07<?, ?it/s]


[dbg] 2024-25 after injury merge: 555
Percentage calculations completed


KeyError: "Columns not found: 'ORB', 'DRB'"

# Dags

# NBA Data Pipeline DAG Architecture

## 🏗️ Architecture Overview

This document details the **simplified DAG architecture** that focuses on core data sources while removing salary scraping complexity.

## 📊 DAG Comparison

| Aspect | Monolithic DAG | Split DAGs | Benefit |
|--------|----------------|------------|---------|
| **Failure Isolation** | One failure blocks all | Isolated failures | ✅ Higher reliability |
| **Scheduling** | Single cadence for all | Source-specific cadences | ✅ Optimized resource usage |
| **Maintenance** | All-or-nothing updates | Independent iteration | ✅ Faster development |
| **Monitoring** | Single SLA for everything | Granular SLAs | ✅ Better observability |
| **Parsing Speed** | Large file slows DagBag | Smaller files | ✅ Faster Airflow startup |

## 🗓️ Current DAG Set

| # | DAG file | Purpose | Schedule | SLA | Retries |
|---|----------|---------|----------|-----|---------|
| 1 | `nba_advanced_ingest.py` | Advanced metrics (Basketball‑Reference) | `@daily` | 1 h | 2 |
| 2 | `injury_etl.py`          | Injury CSV processing | `@monthly` | 1 h | 1 |
| 3 | `nba_data_loader.py`     | Load all sources into DuckDB | `@daily` | 3 h | 2 |

> **Salary cap**: the yearly cap/parquet is committed by the build pipeline
> and version‑controlled; no Airflow DAG is required.

### Dependency graph

```
nba_advanced_ingest ──┐
                      ├──► nba_data_loader
injury_etl ───────────┘
```

## 🗓️ DAG Scheduling Strategy

### 1. `nba_advanced_ingest` - Daily
**Rationale**: Advanced stats update daily
- **Schedule**: `@daily`
- **SLA**: 1 hour
- **Retries**: 2 with 5-minute delays
- **Sources**: Basketball-Reference

### 2. `injury_etl` - Monthly
**Rationale**: Injury data updates monthly
- **Schedule**: `@monthly`
- **SLA**: 1 hour
- **Retries**: 1 with 5-minute delays
- **Sources**: Local CSV files

### 3. `nba_data_loader` - Daily
**Rationale**: Loads all data into DuckDB daily
- **Schedule**: `@daily`
- **SLA**: 3 hours
- **Dependencies**: Advanced metrics and injury ETL via ExternalTaskSensor

## 🔗 Dependency Management

### ExternalTaskSensor Configuration

```python
# Wait for advanced metrics
wait_advanced = ExternalTaskSensor(
    task_id="wait_advanced_ingest",
    external_dag_id="nba_advanced_ingest",
    external_task_id="scrape_advanced_metrics",
    timeout=3600,                     # 1 hour timeout
    mode="reschedule",
    poke_interval=300,                # Check every 5 minutes
)
```

### Timeout Strategy

| DAG | Timeout | Rationale |
|-----|---------|-----------|
| Daily DAGs | 1 hour | Normal operation time |
| Monthly DAGs | 2 hours | Allow for monthly task completion |

## 📈 Performance Metrics

### Success Criteria

| Metric | Target | Measurement |
|--------|--------|-------------|
| **Ingest Success Rate** | >95% | Successful DAG runs / Total runs |
| **Data Quality** | >99% | Valid rows / Total rows |
| **SLA Compliance** | >90% | On-time completions / Total runs |

### Monitoring Dashboard

```sql
-- DAG Performance Query
SELECT 
    dag_id,
    COUNT(*) as total_runs,
    AVG(CASE WHEN state = 'success' THEN 1 ELSE 0 END) as success_rate,
    AVG(duration) / 60.0 as avg_duration_minutes  -- duration is stored in seconds
FROM airflow.task_instance 
WHERE start_date >= CURRENT_DATE - 30
GROUP BY dag_id;
```

## 🔄 Removed Components

### Salary Scraping (Removed)
- ❌ `nba_salary_ingest.py` - Player & team salary scraping
- ❌ `salary_cap_snapshot.py` - Yearly salary cap scraping
- ❌ ESPN/HoopsHype scrapers in `scrape_utils.py`

### Salary Cap Handling (Updated)
- ✅ **Build pipeline**: Yearly cap data committed to version control
- ✅ **No DAG required**: Parquet files pre-baked by build process
- ✅ **Loader compatibility**: Still loads cap data if available

## 🛠️ Implementation Details

### Error Handling Strategy

1. **Primary Source Failure**: Graceful degradation when data unavailable
2. **Rate Limiting**: Exponential backoff with jitter
3. **Data Validation**: Quality gates before loading to DuckDB
4. **Alerting**: Email notifications for critical failures

### Retry Configuration

```python
default_args = dict(
    retries=2,                           # Standard retries
    retry_delay=timedelta(minutes=5),    # Standard delays
    sla=timedelta(hours=1),              # Standard SLA
)
```

### Data Quality Gates

```python
# Quality checks before loading
if len(df) == 0:
    raise ValueError(f"No data found for season {season}")

required_cols = ["Season", "Player", "Team"]
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise ValueError(f"Missing required columns: {missing_cols}")
```

## 📊 Cost-Benefit Analysis

### Pros of Simplified Architecture

| Benefit | Impact | Metric |
|---------|--------|--------|
| **Reliability** | High | 95%+ uptime per source |
| **Maintainability** | High | Independent development cycles |
| **Simplicity** | High | Fewer DAGs to manage |
| **Monitoring** | High | Granular observability |

### Cons of Simplified Architecture

| Drawback | Mitigation | Status |
|----------|------------|--------|
| **Fewer data sources** | External salary data | ✅ Addressed |
| **Reduced functionality** | Core metrics preserved | ✅ Minimized |

## 🚀 Deployment Checklist

### Pre-Deployment
- [x] All DAG files created and tested
- [x] Salary scraping removed and stubbed
- [x] ExternalTaskSensor dependencies configured
- [x] Data quality gates implemented
- [x] Monitoring and alerting configured

### Deployment
- [x] Deploy new DAGs to Airflow
- [x] Disable old monolithic DAG
- [x] Verify all DAGs are running
- [x] Check data flow end-to-end
- [x] Monitor for 24 hours

### Post-Deployment
- [x] Compare performance metrics
- [x] Validate data quality
- [x] Update documentation
- [x] Train team on new architecture

## 📚 References

- [Airflow Best Practices](https://airflow.apache.org/docs/apache-airflow/stable/best-practices.html)
- [ExternalTaskSensor Guide](https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/sensors.html)
- [DAG Design Patterns](https://medium.com/@gharikrishnade/airflow-dag-design-patterns-keeping-it-clean-and-modular-ae07bf9b6f11) 

In [13]:
%%writefile ../dags/nba_api_ingest.py
# dags/nba_api_ingest.py
"""
Pulls roster + box‑score data from nba_api once per hour and writes Parquet
partitions under data/new_processed/season=<YYYY-YY>/part.parquet.

Why hourly?
• The NBA Stats endpoints update within minutes after a game ends.
• Hourly keeps your lake near‑real‑time without hammering the API.
"""
from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime, timedelta
import os, sys, pathlib

# Allow `salary_nba_data_pull` imports
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
from salary_nba_data_pull.main import main as pull_main

# Defaults applied to every task in this DAG.
default_args = dict(
    owner="data_eng",
    email=["alerts@example.com"],
    email_on_failure=True,
    depends_on_past=False,               # explicit
    retries=2,
    retry_delay=timedelta(minutes=5),
    sla=timedelta(hours=1),
)

with DAG(
    dag_id="nba_api_ingest",
    default_args=default_args,
    schedule="@hourly",            # unified scheduling API (Airflow ≥ 2.4)
    start_date=datetime(2025, 7, 1),
    catchup=False,
    max_active_runs=1,             # avoid overlapping pulls
    params={"season": "2024-25"},  # visible & overridable in the UI
    tags=["nba", "api", "ingest"],
) as dag:

    def pull_season(**context):
        """Run the package entry point for the season given in ``params``.

        The first four characters of the season label (e.g. ``"2024-25"``)
        are parsed as the start year, which is used for both ends of the range.
        """
        first_year = int(context["params"]["season"][:4])
        pull_main(
            start_year=first_year,
            end_year=first_year,
            small_debug=True,
            workers=8,
            overwrite=False,
        )

    PythonOperator(task_id="scrape_season_data", python_callable=pull_season)

Overwriting ../dags/nba_api_ingest.py


In [14]:
%%writefile ../dags/nba_advanced_ingest.py
# dags/nba_advanced_ingest.py
"""
Daily scrape of Basketball‑Reference season‑level advanced metrics.
"""
from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime, timedelta
import os, sys
from pathlib import Path

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
from salary_nba_data_pull.scrape_utils import _season_advanced_df

# Defaults applied to every task in this DAG.
default_args = dict(
    owner="data_eng",
    email=["alerts@example.com"],
    email_on_failure=True,
    depends_on_past=False,
    retries=2,
    retry_delay=timedelta(minutes=5),
    sla=timedelta(hours=1),
)

with DAG(
    dag_id="nba_advanced_ingest",
    default_args=default_args,
    schedule="@daily",
    start_date=datetime(2025, 7, 1),
    catchup=False,
    max_active_runs=1,
    params={"season": "2024-25"},
    tags=["nba", "advanced", "ingest"],
) as dag:

    def scrape_adv(**ctx):
        """Fetch the advanced-metrics frame for ``params["season"]`` and write it to Parquet.

        Raises ``ValueError`` when the frame is empty, so the task fails
        instead of writing an empty file.
        """
        season = ctx["params"]["season"]
        metrics = _season_advanced_df(season)
        if metrics.empty:
            raise ValueError(f"No advanced data for {season}")

        target_dir = Path("/workspace/data/new_processed/advanced_metrics")
        target_dir.mkdir(parents=True, exist_ok=True)
        target_file = target_dir / f"advanced_{season}.parquet"
        metrics.to_parquet(target_file, index=False)

    PythonOperator(task_id="scrape_advanced_metrics", python_callable=scrape_adv)

Overwriting ../dags/nba_advanced_ingest.py


In [15]:
%%writefile ../dags/nba_data_loader.py
# dags/nba_data_loader.py
"""
Fan‑in loader: waits for api_ingest + advanced_ingest + injury_etl,
then materialises season tables and a joined view in DuckDB.
"""
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.sensors.external_task import ExternalTaskSensor
from datetime import datetime, timedelta
from pathlib import Path
import sys, os, duckdb, pandas as pd

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
from salary_nba_data_pull.data_utils import validate_data

# Root directory holding the processed parquet files and the DuckDB database.
DATA_ROOT = Path("/workspace/data")

# Operator-level defaults handed to the DAG below via ``default_args=``:
# alert e-mail on failure, two retries spaced five minutes apart, 3-hour SLA.
default_args = dict(
    owner="data_eng",
    email=["alerts@example.com"],
    email_on_failure=True,
    depends_on_past=False,
    retries=2,
    retry_delay=timedelta(minutes=5),
    sla=timedelta(hours=3),
)

with DAG(
    dag_id="nba_data_loader",
    start_date=datetime(2025, 7, 1),
    schedule="@daily",
    catchup=False,
    max_active_runs=1,
    default_args=default_args,
    tags=["nba", "loader", "duckdb"],
    params={"season": "2024-25"},
) as dag:

    # ─── sensors (one per upstream DAG) ────────────────────────────────
    sensor_args = dict(
        poke_interval=300,
        mode="reschedule",   # avoids tying up a worker slot
    )
    wait_api = ExternalTaskSensor(
        task_id="wait_api_ingest",
        external_dag_id="nba_api_ingest",
        external_task_id="scrape_season_data",
        timeout=3600,
        **sensor_args,
    )
    wait_adv = ExternalTaskSensor(
        task_id="wait_advanced_ingest",
        external_dag_id="nba_advanced_ingest",
        external_task_id="scrape_advanced_metrics",
        timeout=3600,
        **sensor_args,
    )
    # The injury ETL gets a longer timeout and a slower poll than the other
    # two sensors, so it spells out its own settings instead of sensor_args.
    wait_injury = ExternalTaskSensor(
        task_id="wait_injury_etl",
        external_dag_id="injury_etl",
        external_task_id="process_injury_data",
        timeout=7200,
        poke_interval=600,
        mode="reschedule",
    )

    # ─── loader task ───────────────────────────────────────────────────
    def load_to_duckdb(**ctx):
        """Validate the player parquet and (re)build the DuckDB tables and view.

        The season label is read from ``ctx["params"]["season"]``. Every source
        parquet that exists on disk is loaded into a table named after its
        alias (dashes replaced by underscores); a missing file is skipped with
        a warning. The player file is passed through ``validate_data`` before
        it is loaded. Finally the ``v_player_full_<season>`` view joining the
        player, advanced and injury tables is (re)created.

        Raises:
            ValueError: if the season label contains characters other than
                ASCII letters, digits, ``_`` and ``-``.
        """
        import logging
        import re

        log = logging.getLogger(__name__)

        season = ctx["params"]["season"]
        # The season is a trigger-time param that ends up inside SQL identifiers
        # and file paths, so reject anything that could break out of either.
        if not re.fullmatch(r"[A-Za-z0-9_-]+", season):
            raise ValueError(f"Unsupported season label: {season!r}")
        # Dashes are not valid in unquoted SQL identifiers.
        season_id = season.replace("-", "_")

        db = DATA_ROOT / "nba_stats.duckdb"
        sources = {
            f"player_{season}": DATA_ROOT / f"new_processed/season={season}/part.parquet",
            f"advanced_{season}": DATA_ROOT / f"new_processed/advanced_metrics/advanced_{season}.parquet",
            "injury_master": DATA_ROOT / "new_processed/injury_reports/injury_master.parquet",
        }

        con = duckdb.connect(str(db))
        try:
            for alias, path in sources.items():
                if not path.exists():
                    # Keep the original best-effort skip, but make it visible:
                    # the view below then uses a table from an earlier run or
                    # fails if that table was never created.
                    log.warning("Source parquet for %s not found at %s; skipping", alias, path)
                    continue
                if alias.startswith("player"):
                    df = pd.read_parquet(path)
                    # NOTE(review): the return value is ignored; presumably
                    # validate_data reports problems itself — confirm.
                    validate_data(df, name=alias, save_reports=True)
                con.execute(
                    f"CREATE OR REPLACE TABLE {alias.replace('-', '_')} AS "
                    f"SELECT * FROM read_parquet('{path}')"
                )

            # Plain (non-materialised) view over the tables created above;
            # a wildcard parquet scan would work here too.
            con.execute(f"""
                CREATE OR REPLACE VIEW v_player_full_{season_id} AS
                SELECT *
                FROM player_{season_id} p
                LEFT JOIN advanced_{season_id} a USING(player, season)
                LEFT JOIN injury_master i USING(player, season)
            """)
        finally:
            # Always release the database file, even when validation or SQL fails.
            con.close()

    loader = PythonOperator(
        task_id="validate_and_load",
        python_callable=load_to_duckdb,
    )

    # Fan-in: the loader runs only after all three sensors succeed.
    [wait_api, wait_adv, wait_injury] >> loader

Overwriting ../dags/nba_data_loader.py
