In [8]:
%%writefile ../pyproject.toml
[build-system]
requires = ["setuptools>=70"]
build-backend = "setuptools.build_meta"

[project]
name = "data_science"
version = "0.0.1"
description = "General data science/ML environment"
authors = [{ name = "Geoffrey Hadfield" }]
requires-python = ">=3.10,<3.12"

# Main runtime deps only. Dev tools moved to extras.
dependencies = [
  "numpy",
  "pandas",
  "scikit-learn",
  "joblib",
  "matplotlib",
  "seaborn",
  "jupyter",
  "jupyterlab",
  "ipykernel",
  "dash",
  "dash-bootstrap-components",
  "plotly",
  "opencv-python-headless",
  "pillow",
  "tqdm",
  "statsmodels",
  "nba_api",
  "requests",
  "streamlit",
  "xgboost",
  "lightgbm",

  # ---- numba/llvmlite pair (keep in sync). Relax pins unless you must fix.
  # "numba==0.60.*",
  # "llvmlite==0.43.*",
  "shap>=0.46.0",
  "numba>=0.57.1",        # loose floor; keep only if tested, otherwise use the matching pinned pair above
  "llvmlite>=0.41",       # same caveat as numba: the two versions must stay compatible

  "beautifulsoup4",
  # CPI lib: use latest to avoid resolution failures on some platforms.
  "cpi>=2.0.0",
  "lxml",
  "IPython",
  "tabulate",
  "pyarrow",
  "requests-cache",
  "diskcache",
  "unidecode",
]

[project.optional-dependencies]
spark = [
  "pyspark",
  "install-jdk>=1.1.0",
]
dev = [
  "pytest",
  "black",
  "flake8",
  "mypy",
]

[tool.black]
line-length = 88
target-version = ["py310"]

[tool.flake8]
max-line-length = 88
extend-ignore = ["E203"]

[tool.mypy]
python_version = "3.10"
ignore_missing_imports = true
strict_optional = true

[tool.pytest.ini_options]
addopts = "-ra -q"
testpaths = ["tests"]

# Optional: uv-specific config (pin index, etc.)
[tool.uv]
# Example: index-url = "https://pypi.org/simple"



Overwriting ../pyproject.toml


In [9]:
%%writefile ../src/salary_nba_data_pull/settings.py
# src/salary_nba_data_pull/settings.py
from pathlib import Path
import os
import typing as _t

# 🗂️  Central data directory (override via env if needed)
# Three levels up from this file is the project root
# (<root>/src/salary_nba_data_pull/settings.py).
_PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
_DATA_ROOT = _PROJECT_ROOT / "data"

# Default location of processed data.
DATA_PROCESSED_DIR = Path(_DATA_ROOT / "new_processed")

# A non-empty DATA_PROCESSED_DIR environment variable replaces the default,
# e.g. `DATA_PROCESSED_DIR=/tmp/demo python main.py …`.
ENV_OVERRIDE: _t.Optional[str] = os.getenv("DATA_PROCESSED_DIR")
if ENV_OVERRIDE:
    DATA_PROCESSED_DIR = Path(ENV_OVERRIDE).expanduser().resolve()

# Older location, kept for backward compatibility; never affected by the
# environment override.
LEGACY_DATA_PROCESSED_DIR = Path(_DATA_ROOT / "processed")

Overwriting ../src/salary_nba_data_pull/settings.py


In [10]:
%%writefile ../src/salary_nba_data_pull/fetch_utils.py
import threading
import time
import random
import logging
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache
from nba_api.stats.endpoints import commonallplayers, commonplayerinfo, playercareerstats, leaguestandings
from requests.exceptions import RequestException
from json.decoder import JSONDecodeError
from joblib import Memory
import os
from unidecode import unidecode

# Request budget used by _throttle(): it sizes the semaphore below and sets
# the per-call sleep (60 / REQUESTS_PER_MIN seconds).
REQUESTS_PER_MIN = 20            # public guidance for nba_api
# One permit per budgeted request; _throttle() acquires and releases permits.
_SEM = threading.BoundedSemaphore(REQUESTS_PER_MIN)

# Set up joblib memory for caching API responses
# The cache directory is data/cache/nba_api, resolved relative to this
# module's directory (two levels up).
cache_dir = os.path.join(os.path.dirname(__file__), '../../data/cache/nba_api')
memory = Memory(cache_dir, verbose=0)

def _throttle():
    """Hold one rate-limit permit for ``60 / REQUESTS_PER_MIN`` seconds.

    The permit is taken through a ``with`` block so it is always handed back,
    even when the sleep is interrupted (e.g. ``KeyboardInterrupt``). A plain
    acquire / sleep / release sequence leaks the permit in that case, and
    enough leaked permits would block every later caller.

    NOTE(review): the permit is released *before* the caller issues its
    request, and up to ``REQUESTS_PER_MIN`` threads can sleep concurrently, so
    this paces each individual thread but does not by itself cap the global
    request rate at ``REQUESTS_PER_MIN`` per minute. Confirm that is acceptable
    for the thread counts used by callers.
    """
    with _SEM:
        time.sleep(60 / REQUESTS_PER_MIN)

class _FetchFailed(RuntimeError):
    """Internal signal that every retry failed.

    Raised (instead of returning ``None``) inside the cached helper so that
    joblib, which does not store results of calls that raise, never persists a
    failure to disk.
    """


@memory.cache(ignore=["debug"])
def _fetch_with_retry_cached(endpoint, max_retries=4, timeout=120, debug=False, **kwargs):
    """Cached worker behind :func:`fetch_with_retry`.

    Returns the first DataFrame of the endpoint response, or raises
    ``_FetchFailed`` once ``max_retries`` attempts have failed. ``debug`` is
    excluded from the cache key because it only controls logging.
    """
    for attempt in range(1, max_retries + 1):
        start = time.time()
        try:
            _throttle()
            resp = endpoint(timeout=timeout, **kwargs)
            df = resp.get_data_frames()[0]
        except (RequestException, JSONDecodeError, KeyError) as e:
            if debug:
                logging.debug("%s attempt %d failed: %s",
                              endpoint.__name__, attempt, e)
            if attempt < max_retries:
                # exponential back-off with jitter before the next attempt
                time.sleep(2 ** attempt + random.random())
            continue
        if debug:
            logging.debug("%s OK in %.2fs kwargs=%s", endpoint.__name__,
                          time.time()-start, kwargs)
        return df
    raise _FetchFailed(getattr(endpoint, "__name__", repr(endpoint)))


def fetch_with_retry(endpoint, max_retries=4, timeout=120, debug=False, **kwargs):
    """Thread-safe, rate-limited wrapper around nba_api endpoints with caching.

    Parameters
    ----------
    endpoint : callable
        nba_api endpoint class; called as ``endpoint(timeout=timeout, **kwargs)``.
    max_retries : int
        Maximum number of attempts. Values below 1 yield ``None`` immediately.
    timeout : int
        Forwarded to the endpoint as its ``timeout`` argument.
    debug : bool
        When true, per-attempt outcomes are sent to ``logging.debug``.
    **kwargs
        Forwarded unchanged to the endpoint.

    Returns
    -------
    The first DataFrame from ``get_data_frames()``, or ``None`` when every
    attempt raised ``RequestException``, ``JSONDecodeError`` or ``KeyError``.

    Successful results are cached on disk through joblib. Failures are *not*
    cached: previously the ``None`` from a transient outage was stored and
    returned on every later call with the same arguments.

    NOTE: the cached helper has a new name, so entries written under the old
    ``fetch_with_retry`` cache key are not reused.
    """
    try:
        return _fetch_with_retry_cached(
            endpoint, max_retries=max_retries, timeout=timeout, debug=debug, **kwargs
        )
    except _FetchFailed:
        return None

@memory.cache
def fetch_all_players(season: str, debug: bool = False) -> dict[str, dict]:
    """Build a name -> ids lookup from ``CommonAllPlayers`` (current season only).

    The endpoint is queried with ``is_only_current_season=1`` and
    ``league_id="00"``. Keys are the ASCII-folded, stripped, lower-cased
    ``DISPLAY_FIRST_LAST`` value; each value is
    ``{"player_id": int, "team_id": int}``. If two rows fold to the same key
    the later row wins. When the fetch yields ``None`` the result is an empty
    dict.

    NOTE(review): the whole return value, including an empty dict after a
    failed fetch, is stored by ``@memory.cache``.
    """
    roster = fetch_with_retry(
        commonallplayers.CommonAllPlayers,
        season=season,
        is_only_current_season=1,
        league_id="00",
        debug=debug,
    )

    lookup: dict[str, dict] = {}
    if roster is not None:
        columns = zip(
            roster["DISPLAY_FIRST_LAST"], roster["PERSON_ID"], roster["TEAM_ID"]
        )
        for display_name, person_id, team_id in columns:
            key = unidecode(display_name).strip().lower()
            lookup[key] = {"player_id": int(person_id), "team_id": int(team_id)}

    if debug:
        print(f"[fetch_all_players] {len(lookup)} active players for {season}")
    return lookup

@lru_cache(maxsize=None)
def fetch_season_players(season: str, debug: bool = False) -> dict[str, dict]:
    """
    Build a name -> ids lookup for players whose career window covers ``season``.

    ``CommonAllPlayers`` is queried with ``is_only_current_season=0`` and the
    rows are kept when ``FROM_YEAR <= start_year <= TO_YEAR``, where
    ``start_year`` is the first four characters of ``season``. Keys are the
    ASCII-folded, stripped, lower-cased display names; values are
    ``{"player_id": int, "team_id": int}``. An empty dict is returned when the
    fetch yields ``None``.

    The result is memoised per ``(season, debug)`` by ``lru_cache``; the same
    dict object is handed to every caller, so treat it as read-only.
    """
    everyone = fetch_with_retry(
        commonallplayers.CommonAllPlayers,
        season=season,
        is_only_current_season=0,
        league_id="00",
        debug=debug,
    )

    lookup: dict[str, dict] = {}
    if everyone is not None:
        start_year = int(season[:4])
        began_by_then = everyone["FROM_YEAR"].astype(int) <= start_year
        still_listed = everyone["TO_YEAR"].astype(int) >= start_year
        in_window = everyone[began_by_then & still_listed]
        columns = zip(
            in_window["DISPLAY_FIRST_LAST"],
            in_window["PERSON_ID"],
            in_window["TEAM_ID"],
        )
        for display_name, person_id, team_id in columns:
            key = unidecode(display_name).strip().lower()
            lookup[key] = {"player_id": int(person_id), "team_id": int(team_id)}

    if debug:
        print(f"[fetch_season_players] {len(lookup)} players for {season}")
    return lookup

@memory.cache
def fetch_player_info(player_id, debug=False):
    """Fetch ``CommonPlayerInfo`` for ``player_id`` via ``fetch_with_retry``.

    Returns whatever ``fetch_with_retry`` returns (a DataFrame, or ``None``
    when it gives up). NOTE(review): this outer ``@memory.cache`` stores that
    return value as-is, ``None`` included.
    """
    endpoint = commonplayerinfo.CommonPlayerInfo
    return fetch_with_retry(endpoint, player_id=player_id, debug=debug)

@memory.cache
def fetch_career_stats(player_id, debug=False):
    """Fetch ``PlayerCareerStats`` for ``player_id`` via ``fetch_with_retry``.

    Returns whatever ``fetch_with_retry`` returns (a DataFrame, or ``None``
    when it gives up). NOTE(review): this outer ``@memory.cache`` stores that
    return value as-is, ``None`` included.
    """
    endpoint = playercareerstats.PlayerCareerStats
    return fetch_with_retry(endpoint, player_id=player_id, debug=debug)

@memory.cache
def fetch_league_standings(season, debug=False):
    """Fetch ``LeagueStandings`` for ``season`` via ``fetch_with_retry``.

    Returns whatever ``fetch_with_retry`` returns (a DataFrame, or ``None``
    when it gives up). NOTE(review): this outer ``@memory.cache`` stores that
    return value as-is, ``None`` included.
    """
    endpoint = leaguestandings.LeagueStandings
    return fetch_with_retry(endpoint, season=season, debug=debug)

def clear_cache() -> None:
    """Clear the joblib memory cache.

    Delegates to ``memory.clear()`` on the module-level ``Memory`` object that
    backs the ``@memory.cache``-decorated functions in this module. The
    in-process ``lru_cache`` on ``fetch_season_players`` is not touched here.
    """
    memory.clear()

if __name__ == "__main__":
    # Manual smoke run: roster lookup, one player's info and career stats,
    # then league standings for a fixed season.
    debug = True
    season = "2022-23"
    sample_player_name = "LeBron James"

    # Roster lookup
    all_players = fetch_all_players(season, debug=debug)
    print(f"Total players fetched: {len(all_players)}")

    if sample_player_name.lower() not in all_players:
        print(f"Player {sample_player_name} not found in the {season} season data.")
    else:
        sample_player_id = all_players[sample_player_name.lower()]['player_id']

        # Biographical info for the sample player
        player_info = fetch_player_info(sample_player_id, debug=debug)
        print(f"Sample player info for {sample_player_name}:")
        print(player_info)

        # Career stats for the sample player
        career_stats = fetch_career_stats(sample_player_id, debug=debug)
        print(f"Sample player career stats for {sample_player_name}:")
        print(career_stats)

    # League standings
    standings = fetch_league_standings(season, debug=debug)
    print("League standings:")
    print(standings)


Overwriting ../src/salary_nba_data_pull/fetch_utils.py


In [None]:
%%writefile ../src/salary_nba_data_pull/scrape_utils.py
import pandas as pd
import requests
import time
import random
import re
from bs4 import BeautifulSoup
from io import StringIO
from typing import Optional
import os
import requests_cache
from unidecode import unidecode
from pathlib import Path
from salary_nba_data_pull.settings import DATA_PROCESSED_DIR

# Install cache for all requests
# Runs at import time; cached responses expire after 86400 seconds.
# NOTE(review): install_cache patches `requests` process-wide, so presumably
# any other library using `requests` in this process is cached too, and the
# 'nba_scraping' cache name is presumably resolved relative to the current
# working directory — confirm both are intended.
requests_cache.install_cache('nba_scraping', expire_after=86400)  # 24 hours

def scrape_salary_cap_history(debug=False):
    """Scrape the RealGM salary-cap history table into a DataFrame.

    The first table carrying one of the classes ``basketball`` / ``compact`` /
    ``tablesaw`` is read; header cells become column names. ``Season`` is
    reduced to its ``YYYY-YYYY`` part, ``Salary Cap`` is converted to float,
    and every other column is converted with ``pd.to_numeric(errors='coerce')``
    after stripping ``$`` and ``,``.

    Parameters
    ----------
    debug : bool
        Print progress and error details.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when the request fails, the table (or its thead/tbody) is
        missing, or parsing raises. This is deliberately best-effort: errors
        are reported only through ``debug`` output.
    """
    url = "https://basketball.realgm.com/nba/info/salary_cap"

    def _strip_money(series):
        # regex=False: "$" must be removed literally, never read as a regex
        # end-of-string anchor (the default differs across pandas versions).
        return (series.str.replace('$', '', regex=False)
                      .str.replace(',', '', regex=False))

    try:
        # A timeout keeps a stalled connection from hanging the caller forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find("table", class_=["basketball", "compact", "tablesaw"])

        if not table:
            if debug:
                print("Could not find the salary cap table on the page.")
            return None

        thead, tbody = table.find('thead'), table.find('tbody')
        if thead is None or tbody is None:
            if debug:
                print("Salary cap table has no thead/tbody section.")
            return None

        headers = [th.text.strip() for th in thead.find_all('th')]
        data = []
        for row in tbody.find_all('tr'):
            cols = row.find_all('td')
            if cols:
                data.append([col.text.strip() for col in cols])

        df = pd.DataFrame(data, columns=headers)

        # Clean up the data
        df['Season'] = df['Season'].str.extract(r'(\d{4}-\d{4})')
        df['Salary Cap'] = _strip_money(df['Salary Cap']).astype(float)

        # Convert other columns to float, handling non-numeric values
        for col in df.columns:
            if col not in ['Season', 'Salary Cap']:
                df[col] = pd.to_numeric(_strip_money(df[col]), errors='coerce')

        if debug:
            print("Salary cap data scraped successfully")
            print(df.head())
        return df
    except Exception as e:
        if debug:
            print(f"Error scraping salary cap history: {str(e)}")
        return None

# User-Agent header to avoid Cloudflare blocks
# Sent as the request headers by _get_hoopshype_soup().
UA = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/126.0.0.0 Safari/537.36"
    )
}
# Pause applied after each season in scrape_player_salary_data().
DELAY_BETWEEN_REQUESTS = 3  # seconds

# Define column templates to guarantee DataFrame structure
# (passed as ``columns=`` so that even an empty result has these headers).
PLAYER_COLS = ["Player", "Salary", "Season"]
TEAM_COLS = ["Team", "Team_Salary", "Season"]

# Salary parsing pattern
_salary_pat = re.compile(r"\$?\d[\d,]*")

def _clean_salary(text: str) -> int | None:
    """Return salary as int or None when text has no digits."""
    m = _salary_pat.search(text)
    return int(m.group(0).replace(",", "").replace("$", "")) if m else None

# Name normalization pattern with unidecode
def _normalise_name(raw: str) -> str:
    """Return a comparison key for a player name.

    Steps, in order: transliterate to ASCII with ``unidecode``; drop
    everything from the first comma; drop everything from the first opening
    parenthesis; strip surrounding whitespace; lower-case.
    """
    ascii_name = unidecode(raw)
    before_comma = ascii_name.split(",")[0]
    before_paren = before_comma.split("(")[0]
    return before_paren.strip().lower()


# ------- INTERNAL HELPER --------
def _get_hoopshype_soup(url: str, debug: bool = False) -> Optional[BeautifulSoup]:
    """
    Fetch ``url`` with a browser-like User-Agent and parse it as HTML.

    Despite its name, this helper is also used for ESPN pages in this module.

    Behaviour:
      * up to two attempts are made, but only network-level errors
        (``requests.RequestException``) trigger the second attempt;
      * a non-200 status or a page that looks like a Cloudflare challenge
        returns ``None`` immediately, without retrying;
      * the back-off sleep happens only when another attempt follows
        (previously the function also slept after the final failure).

    Returns the parsed ``BeautifulSoup`` document, or ``None`` on any failure.
    """
    max_attempts = 2
    for attempt in range(max_attempts):
        try:
            if debug:
                print(f"[fetch] {url} (attempt {attempt+1})")
            resp = requests.get(url, headers=UA, timeout=30)
            if resp.status_code != 200:
                if debug:
                    print(f"  -> HTTP {resp.status_code}, skipping.")
                return None
            html = resp.text
            # crude Cloudflare challenge check
            if ("Access denied" in html) or ("cf-chl" in html):
                if debug:
                    print("  -> Cloudflare challenge detected; giving up.")
                return None
            return BeautifulSoup(html, "html.parser")
        except requests.RequestException as e:
            will_retry = attempt + 1 < max_attempts
            if debug:
                if will_retry:
                    print(f"  -> network error {e}, retrying…")
                else:
                    print(f"  -> network error {e}, giving up.")
            if will_retry:
                time.sleep(2 ** attempt + random.random())
    return None
# --------------------------------------------------------------------------


def _scrape_espn_player_salaries(season_start: int, debug: bool = False) -> list[dict]:
    """Collect player salaries from ESPN's paginated salary listing.

    Pages 1..29 for year ``season_start + 1`` are requested in order. Paging
    stops at the first page that cannot be fetched, has no table, or has
    fewer than three table rows. For each data row with at least four cells,
    cell 1 is the player name and cell 3 the salary; rows whose salary cell
    contains no digits are skipped. Names are normalised and then title-cased.

    Returns a list of ``{"Player", "Salary", "Season"}`` dicts, with ``Season``
    formatted as ``YYYY-YY``.
    """
    espn_year = season_start + 1
    season_label = f"{season_start}-{str(season_start+1)[-2:]}"
    collected = []

    for page in range(1, 30):
        page_url = f"https://www.espn.com/nba/salaries/_/year/{espn_year}/page/{page}"
        soup = _get_hoopshype_soup(page_url, debug)
        if soup is None:
            break

        tbl = soup.find("table")
        if not tbl:
            break
        table_rows = tbl.find_all("tr")
        if len(table_rows) < 3:
            break

        for tr in table_rows[1:]:
            cells = tr.find_all("td")
            if len(cells) >= 4:
                amount = _clean_salary(cells[3].get_text(strip=True))
                # repeated header rows ('SALARY', etc.) carry no digits
                if amount is not None:
                    display = _normalise_name(cells[1].get_text(strip=True)).title()
                    collected.append({
                        "Player":  display,
                        "Salary":  amount,
                        "Season":  season_label,
                    })

        time.sleep(0.5)

    return collected


def scrape_player_salary_data(start_season: int, end_season: int,
                              player_filter: str | None = None,
                              debug: bool = False) -> pd.DataFrame:
    """
    Pull player salaries for each season in ``start_season..end_season``
    (inclusive start years) – HoopsHype first, ESPN as a fallback.

    Parameters
    ----------
    start_season, end_season : int
        First and last season start years (e.g. 2022 for 2022-23).
    player_filter : str or None
        When given and not ``"all"`` (case-insensitive), only rows whose
        normalised name equals the normalised filter are kept. The filter is
        now applied to the ESPN fallback rows as well; previously a filtered
        call that found nothing on HoopsHype returned *every* ESPN player.
    debug : bool
        Print progress information.

    Returns
    -------
    pandas.DataFrame
        Columns ``PLAYER_COLS``; ``Player`` is the normalised name in title
        case. HoopsHype rows with an unparseable salary get ``0``.
    """
    out: list[dict] = []

    # Normalise the filter once instead of once per table row.
    wanted: str | None = None
    if player_filter and player_filter.lower() != "all":
        wanted = _normalise_name(player_filter)

    for yr in range(start_season, end_season + 1):
        canon = f"{yr}-{str(yr+1)[-2:]}"
        season_rows: list[dict] = []

        for slug in [canon, f"{yr}-{yr+1}"]:          # HH dual slugs
            url = f"https://hoopshype.com/salaries/players/{slug}/"
            soup = _get_hoopshype_soup(url, debug)
            if soup is None:
                continue

            table = soup.find("table", class_="hh-salaries-ranking-table")
            if not table:
                if debug:
                    print(f"  -> salary table not found for {slug}")
                continue

            for row in table.find_all("tr")[1:]:
                tds = row.find_all("td")
                if len(tds) < 3:
                    continue
                clean_name = _normalise_name(tds[1].get_text(strip=True))
                if wanted is not None and clean_name != wanted:
                    continue
                salary_int = _clean_salary(tds[2].get_text(strip=True)) or 0
                season_rows.append({"Player": clean_name.title(),
                                    "Salary": salary_int,
                                    "Season": canon})
            if season_rows:
                break  # success for this season

        if not season_rows:
            espn_rows = _scrape_espn_player_salaries(yr, debug)
            if wanted is not None:
                # honour the caller's filter on the fallback source too
                espn_rows = [r for r in espn_rows
                             if _normalise_name(r["Player"]) == wanted]
            if debug and espn_rows:
                print(f"  -> ESPN fallback added {len(espn_rows)} rows")
            season_rows.extend(espn_rows)

        if debug:
            print(f"  -> scraped {len(season_rows)} rows for {canon}")
        out.extend(season_rows)
        time.sleep(DELAY_BETWEEN_REQUESTS)

    return pd.DataFrame(out, columns=PLAYER_COLS)
# --------------------------------------------------------------------------


def _scrape_espn_team_salaries(season: str, debug: bool = False) -> list[dict]:
    """Parse ESPN's team-salary table for ``season``.

    The page for year ``int(season[:4]) + 1`` is fetched. Rows need at least
    four cells; cell 1 is used as the team name (cell 0 is skipped as the rank
    column) and cell 3 as the salary. Rows whose salary cell has no digits are
    ignored.

    Returns a list of ``{"Team", "Team_Salary", "Season"}`` dicts; the list is
    empty when the page cannot be fetched or contains no ``<table>``
    (previously a missing table raised ``AttributeError``).
    """
    rows, year = [], int(season[:4]) + 1
    url = f"https://www.espn.com/nba/salaries/_/type/team/year/{year}"
    soup = _get_hoopshype_soup(url, debug)
    if not soup:
        return rows

    tbl = soup.find("table")
    if tbl is None:
        if debug:
            print("  -> ESPN team salary table not found")
        return rows

    for tr in tbl.find_all("tr")[1:]:
        tds = tr.find_all("td")
        if len(tds) < 4:          # rank | team | conf | salary
            continue
        team_name = tds[1].get_text(strip=True)   # skip rank col
        salary_int = _clean_salary(tds[3].get_text(strip=True))
        if salary_int is not None:
            rows.append(
                {"Team": team_name, "Team_Salary": salary_int, "Season": season}
            )
    return rows


def scrape_team_salary_data(season: str, debug: bool = False) -> pd.DataFrame:
    """
    Team payrolls for a single season (YYYY-YY or YYYY-YYYY slug resilient).

    HoopsHype is tried first, using ``season`` as given and then the
    ``YYYY-YYYY`` form; the first slug whose page contains the salary table is
    used. If that produces no rows, ESPN is used as a fallback.

    Salary cells are parsed with ``_clean_salary`` (as everywhere else in this
    module); rows whose cell has no digits are skipped instead of raising
    ``ValueError`` from a bare ``int()``.

    Returns a DataFrame with columns ``TEAM_COLS`` (possibly empty). ``Season``
    always carries the ``season`` argument as passed in.
    """
    records = []
    # dict.fromkeys drops the duplicate slug when `season` is already YYYY-YYYY
    slugs = list(dict.fromkeys([season, f"{season[:4]}-{int(season[:4])+1}"]))
    for slug in slugs:
        url = f"https://hoopshype.com/salaries/{slug}/"
        soup = _get_hoopshype_soup(url, debug)
        if soup is None:
            continue

        table = soup.find("table", class_="hh-salaries-ranking-table")
        if not table:
            if debug:
                print(f"  -> team salary table missing for {slug}")
            continue

        for row in table.find_all("tr")[1:]:
            cols = row.find_all("td")
            if len(cols) < 3:
                continue
            salary = _clean_salary(cols[2].get_text(strip=True))
            if salary is None:
                continue
            team = cols[1].get_text(strip=True)
            records.append({"Team": team, "Team_Salary": salary, "Season": season})
        break  # stop after first hit

    # Fallback if no records found
    if not records:
        espn_rows = _scrape_espn_team_salaries(season, debug)
        if debug and espn_rows:
            print(f"  -> ESPN team salary fallback added {len(espn_rows)} rows")
        records.extend(espn_rows)

    df = pd.DataFrame(records, columns=TEAM_COLS)  # guarantees columns
    if debug and not df.empty:
        print(df.head())
    return df

def scrape_advanced_metrics(player_name, season, debug=False, max_retries=3, retry_delay=60):
    """Scrape one season of advanced metrics for a player from basketball-reference.

    The site search is queried for ``player_name``; the first result link whose
    href contains ``players`` is followed, and the table with id ``advanced``
    is read. The first row whose ``Season`` contains the start year of
    ``season`` supplies the values.

    Parameters
    ----------
    player_name : str
        Name to search for. It is URL-encoded with ``quote_plus`` so that
        apostrophes, ampersands and non-ASCII characters survive the query
        string (spaces still become ``+``).
    season : str
        Season label; only the part before the first ``-`` is used.
    debug : bool
        Print progress and error details.
    max_retries : int
        Number of attempts.
    retry_delay : int
        Seconds to wait after an HTTP 429 and between attempts.

    Returns
    -------
    dict
        Metric name -> value for the metrics present in the table; ``{}`` when
        the player is not found or every attempt fails. Errors are reported
        only through ``debug`` output (best-effort by design).
    """
    from urllib.parse import quote_plus  # local import: stdlib, used only here

    def make_request(url):
        # timeout: a stalled connection raises (and is retried by the loop
        # below) instead of blocking forever
        response = requests.get(url, timeout=30)
        if response.status_code == 429:
            if debug:
                print(f"Rate limit hit. Waiting for {retry_delay} seconds before retrying.")
            time.sleep(retry_delay)
            return None
        return response

    for attempt in range(max_retries):
        try:
            search_url = (
                "https://www.basketball-reference.com/search/search.fcgi?search="
                + quote_plus(player_name)
            )
            response = make_request(search_url)
            if response is None:
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            search_results = soup.find('div', {'class': 'search-results'})

            if search_results:
                for item in search_results.find_all('div', {'class': 'search-item'}):
                    link = item.find('a')
                    # .get(): anchors without an href are skipped, not fatal
                    href = link.get('href', '') if link else ''
                    if href and 'players' in href:
                        player_url = f"https://www.basketball-reference.com{href}"
                        break
                else:
                    if debug:
                        print(f"No player URL found for {player_name}")
                    return {}
            else:
                if debug:
                    print(f"No search results found for {player_name}")
                return {}

            time.sleep(2)  # Wait 2 seconds between requests

            response = make_request(player_url)
            if response is None:
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table', {'id': 'advanced'})
            if table:
                df = pd.read_html(StringIO(str(table)))[0]
                if isinstance(df.columns, pd.MultiIndex):
                    df.columns = df.columns.droplevel()
                df['Season'] = df['Season'].astype(str)
                df = df[df['Season'].str.contains(season.split('-')[0], na=False)]
                if not df.empty:
                    row = df.iloc[0]
                    metrics = ['PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']
                    result = {col: row[col] for col in metrics if col in row.index}
                    if debug:
                        print(f"Scraped advanced metrics for {player_name} in season {season}: {result}")
                    return result
                else:
                    if debug:
                        print(f"No advanced metrics found for {player_name} in season {season}")
            else:
                if debug:
                    print(f"No advanced stats table found for {player_name}")

        except Exception as e:
            if debug:
                print(f"Error scraping advanced metrics for {player_name}: {e}")

        if attempt < max_retries - 1:
            if debug:
                print(f"Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

    if debug:
        print(f"Failed to scrape advanced metrics for {player_name} after {max_retries} attempts")
    return {}

def load_injury_data(
    file_path: str | Path | None = None,
    *,
    base_dir: str | Path | None = None,
    debug: bool = False,
):
    """
    Load the historical injury CSV. By default we look inside the *new*
    processed folder; pass ``file_path`` to override a specific file,
    or ``base_dir`` to point at a different processed directory.
    """
    root = Path(base_dir) if base_dir else DATA_PROCESSED_DIR
    if file_path is None:
        file_path = root / "NBA Player Injury Stats(1951 - 2023).csv"
    file_path = Path(file_path).expanduser().resolve()

    try:
        injury = (
            pd.read_csv(file_path)
            .assign(Date=lambda d: pd.to_datetime(d["Date"]))
        )
        injury["Season"] = injury["Date"].apply(
            lambda x: (
                f"{x.year}-{str(x.year + 1)[-2:]}"
                if x.month >= 10
                else f"{x.year - 1}-{str(x.year)[-2:]}"
            )
        )
        if debug:
            print(f"[load_injury_data] loaded {len(injury):,} rows from {file_path}")
        return injury
    except FileNotFoundError:
        if debug:
            print(f"[load_injury_data] ✖ no injury file at {file_path}")
        return None

def merge_injury_data(player_data, injury_data):
    """Annotate each player-season row with injury information.

    Parameters
    ----------
    player_data : pandas.DataFrame
        Needs ``Player`` and ``Season`` columns (``Season`` starting with the
        four-digit start year, e.g. ``2022-23`` or ``2022-2023``).
    injury_data : pandas.DataFrame or None
        Needs ``Date`` (datetime), ``Season``, ``Relinquished`` and
        ``Acquired`` columns. ``None`` returns ``player_data`` unchanged.

    Returns
    -------
    pandas.DataFrame
        A copy of ``player_data`` with four extra columns: ``Injured``,
        ``Injury_Periods``, ``Total_Days_Injured`` and ``Injury_Risk``
        (< 10 days: Low, 10-20: Moderate, otherwise High).

    For every injury row of the same season whose ``Relinquished`` text
    contains the player name, the period ends at the earliest later date on
    which the name appears in ``Acquired``; without one, it ends on 30 June
    of the year after the season's start year.

    Fixes over the previous version:
      * names are matched literally (``regex=False``), so characters such as
        ``.`` or ``(`` in a name are no longer interpreted as regex syntax;
      * the fallback end year is derived from the four-digit start year
        rather than from the (possibly two-digit) suffix after the dash;
      * the return date is the earliest matching date, not merely the first
        matching row in file order;
      * rows with a missing or blank player name are left at their defaults,
        and a period can no longer have a negative length.
    """
    if injury_data is None:
        return player_data

    all_players_df = player_data.copy()
    all_players_df['Injured'] = False
    all_players_df['Injury_Periods'] = ''
    all_players_df['Total_Days_Injured'] = 0
    all_players_df['Injury_Risk'] = 'Low Risk'

    for index, row in all_players_df.iterrows():
        player = row['Player']
        if not isinstance(player, str) or not player.strip():
            continue  # an empty pattern would match every injury row

        out_mask = injury_data['Relinquished'].str.contains(
            player, case=False, na=False, regex=False
        )
        player_injuries = injury_data[(injury_data['Season'] == row['Season']) & out_mask]
        if player_injuries.empty:
            continue

        back_mask = injury_data['Acquired'].str.contains(
            player, case=False, na=False, regex=False
        )
        return_dates = injury_data.loc[back_mask, 'Date']

        # Assume an injury lasts until the end of the season when no return
        # date is recorded.
        season_end = pd.Timestamp(year=int(str(row['Season'])[:4]) + 1, month=6, day=30)

        periods = []
        total_days = 0
        for start_date in player_injuries['Date']:
            later_returns = return_dates[return_dates > start_date]
            end_date = later_returns.min() if not later_returns.empty else season_end
            if end_date < start_date:
                end_date = start_date  # off-season entry after the fallback date
            total_days += (end_date - start_date).days
            periods.append(f"{start_date.strftime('%Y-%m-%d')} - {end_date.strftime('%Y-%m-%d')}")

        all_players_df.at[index, 'Injured'] = True
        all_players_df.at[index, 'Injury_Periods'] = '; '.join(periods)
        all_players_df.at[index, 'Total_Days_Injured'] = total_days

        # Categorize injury risk based on total days
        if total_days < 10:
            risk = 'Low Risk'
        elif total_days <= 20:
            risk = 'Moderate Risk'
        else:
            risk = 'High Risk'
        all_players_df.at[index, 'Injury_Risk'] = risk

    return all_players_df

if __name__ == "__main__":
    # Manual smoke run of every scraper in this module (hits the network).
    debug = True
    start_season = 2022
    end_season = 2023
    sample_player = "Ja Morant"  # Example player

    print("1. Testing scrape_salary_cap_history:")
    salary_cap_history = scrape_salary_cap_history(debug=debug)

    print("\n2. Testing scrape_player_salary_data:")
    player_salary_data = scrape_player_salary_data(start_season, end_season, player_filter=sample_player, debug=debug)

    print("\n3. Testing scrape_team_salary_data:")
    team_salary_data = scrape_team_salary_data(f"{start_season}-{str(start_season+1)[-2:]}", debug=debug)

    print("\n4. Testing scrape_advanced_metrics:")
    advanced_metrics = scrape_advanced_metrics(sample_player, f"{start_season}-{str(start_season+1)[-2:]}", debug=debug)
    print(f"Advanced Metrics for {sample_player}:")
    print(advanced_metrics)

    print("\n5. Testing load_injury_data and merge_injury_data:")
    injury_data = load_injury_data()
    if injury_data is not None:
        print(injury_data.head())
    else:
        print("No injury data loaded.")
    if not player_salary_data.empty and injury_data is not None:
        merged_data = merge_injury_data(player_salary_data, injury_data)
        print("Merged data with injury info:")
        columns_to_display = ['Player', 'Season', 'Salary'] + [
            col
            for col in ('Injured', 'Injury_Periods', 'Total_Days_Injured', 'Injury_Risk')
            if col in merged_data.columns
        ]
        print(merged_data[columns_to_display].head())

    if not player_salary_data.empty:
        avg_salary = player_salary_data['Salary'].mean()
        print(f"Average salary for {sample_player} from {start_season} to {end_season}: ${avg_salary:,.2f}")

    if not team_salary_data.empty:
        highest_team_salary = team_salary_data.loc[team_salary_data['Team_Salary'].idxmax()]
        print(f"Team with highest salary in {start_season}-{end_season}: {highest_team_salary['Team']} (${highest_team_salary['Team_Salary']:,.2f})")

    # load_injury_data() returns None when the CSV is missing, so check for
    # None before touching `.empty` (this used to raise AttributeError).
    if injury_data is not None and not injury_data.empty:
        injury_count = injury_data['Relinquished'].str.contains(sample_player, case=False).sum()
        print(f"Number of injuries/illnesses for {sample_player} from {start_season} to {end_season}: {injury_count}")

    print("\nAll tests completed.")


Overwriting ../src/salary_nba_data_pull/scrape_utils.py


In [12]:
%%writefile ../src/salary_nba_data_pull/process_utils.py
import pandas as pd
import numpy as np
import logging
import sqlite3
from datetime import datetime
from functools import lru_cache
from salary_nba_data_pull.fetch_utils import fetch_all_players, fetch_career_stats

# --- CPI lazy‑loader --------------------------------------------------
_CPI_AVAILABLE = False  # toggled at runtime

@lru_cache(maxsize=1)
def _ensure_cpi_ready(debug: bool = False) -> bool:
    """
    Import `cpi` lazily and guarantee its internal SQLite DB is usable.
    Returns True when inflation data are available, False otherwise.
    """
    global _CPI_AVAILABLE
    try:
        import importlib
        cpi = importlib.import_module("cpi")        # late import
        try:
            _ = cpi.models.Series.get_by_id("0000")  # 1‑row sanity query
            _CPI_AVAILABLE = True
            return True
        except sqlite3.OperationalError:
            if debug:
                logging.warning("[CPI] DB invalid – rebuilding from BLS…")
            cpi.update(rebuild=True)                # expensive network call
            _CPI_AVAILABLE = True
            return True
    except ModuleNotFoundError:
        if debug:
            logging.warning("[CPI] package not installed")
    except Exception as e:
        if debug:
            logging.error("[CPI] unexpected CPI failure: %s", e)
    return False
# ---------------------------------------------------------------------

def inflate_value(value: float, year_str: str,
                  *, debug: bool = False, skip_inflation: bool = False,
                  target_year: int = 2022) -> float:
    """
    Inflate `value` from the dollars of `year_str` (YYYY or YYYY-YY) to
    `target_year` USD.

    Parameters
    ----------
    value : float
        Amount to convert.
    year_str : str
        Source year; only the first four characters are parsed.
    debug : bool
        Log CPI problems.
    skip_inflation : bool
        Return `value` untouched without consulting CPI at all.
    target_year : int
        Year whose dollars the result is expressed in. Defaults to 2022,
        which was previously hard-coded.

    Returns
    -------
    float
        The adjusted amount. The original `value` is returned unchanged when
        inflation is skipped, CPI data are unavailable, the source year is the
        current year or later, or the conversion raises.
    """
    if skip_inflation or not _ensure_cpi_ready(debug):
        return value
    try:
        import cpi                                       # safe: DB ready
        year = int(year_str[:4])
        if year >= datetime.now().year:
            return value
        return float(cpi.inflate(value, year, to=target_year))
    except Exception as e:
        if debug:
            logging.error("[CPI] inflate failed for %s: %s", year_str, e)
        return value
# ---------------------------------------------------------------------

def calculate_percentages(df, debug=False):
    """
    Add shooting percentages and per-36 rates to ``df`` and return it.

    Columns are added in place (the same object is returned) and only when
    their inputs are present:

    * ``FG%``, ``3P%``, ``FT%`` – makes / attempts * 100
    * ``TS%`` – ``PTS / (2 * (FGA + 0.44 * FTA)) * 100``
    * ``PTS_per_36``, ``AST_per_36``, ``TRB_per_36`` – stat / ``MP`` * 36

    Every derived column is rounded to two decimals and infinite values are
    replaced by NaN. An empty frame is returned untouched.
    """
    if df.empty:
        return df

    def _store(column, values):
        # round first, then turn +/-inf from division by zero into NaN
        df[column] = values.round(2)
        df[column] = df[column].replace([np.inf, -np.inf], np.nan)

    # Shooting percentages: (attempts, makes, output column)
    shooting = (
        ('FGA', 'FG', 'FG%'),
        ('3PA', '3P', '3P%'),
        ('FTA', 'FT', 'FT%'),
    )
    for attempts, makes, target in shooting:
        if attempts in df.columns and makes in df.columns:
            _store(target, df[makes] / df[attempts] * 100)

    # True shooting percentage
    if all(name in df.columns for name in ('PTS', 'FGA', 'FTA')):
        _store('TS%', df['PTS'] / (2 * (df['FGA'] + 0.44 * df['FTA'])) * 100)

    # Per-36-minute rates: (stat, output column)
    per_36 = (
        ('PTS', 'PTS_per_36'),
        ('AST', 'AST_per_36'),
        ('TRB', 'TRB_per_36'),
    )
    for stat, target in per_36:
        if stat in df.columns and 'MP' in df.columns:
            _store(target, df[stat] / df['MP'] * 36)

    if debug:
        print("Percentage calculations completed")

    return df

def process_player_data(player_name, season, all_players, debug=False):
    """
    Build the statistics record for one player in one season.

    The player is looked up in ``all_players`` by lower-cased, stripped name;
    the matching entry's ``player_id`` is used to fetch career stats, from
    which the first row whose ``SEASON_ID`` equals ``season`` is taken.

    Returns a dict of renamed stat fields plus default injury fields, or
    ``None`` when ``all_players`` is empty, the player is unknown, no stats or
    no season row exist, or fetching/processing raises.  Diagnostics are only
    printed when ``debug`` is true.
    """
    # (output field, key in the stats row, fallback when the key is absent),
    # listed in the order the fields appear in the resulting dict.
    stat_fields = (
        ('Age', 'PLAYER_AGE', None),
        ('GP', 'GP', 0),
        ('MP', 'MIN', 0),
        ('PTS', 'PTS', 0),
        ('TRB', 'REB', 0),
        ('AST', 'AST', 0),
        ('STL', 'STL', 0),
        ('BLK', 'BLK', 0),
        ('TOV', 'TOV', 0),
        ('FG', 'FGM', 0),
        ('FGA', 'FGA', 0),
        ('3P', 'FG3M', 0),
        ('3PA', 'FG3A', 0),
        ('FT', 'FTM', 0),
        ('FTA', 'FTA', 0),
        ('PER', 'PER', None),
        ('WS', 'WS', None),
        ('VORP', 'VORP', None),
        ('Team', 'TEAM_ABBREVIATION', 'Unknown'),
    )

    if not all_players:
        if debug:
            print(f"No players data available for {player_name}")
        return None

    # Keys of all_players are matched against the normalised name.
    lookup_key = player_name.lower().strip()
    if lookup_key not in all_players:
        if debug:
            print(f"Player {player_name} not found in all_players")
        return None

    player_id = all_players[lookup_key]['player_id']
    if debug:
        print(f"Processing {player_name} (ID: {player_id})")

    try:
        career_stats = fetch_career_stats(player_id, debug=debug)
        if career_stats is None or career_stats.empty:
            if debug:
                print(f"No career stats found for {player_name}")
            return None

        season_stats = career_stats[career_stats['SEASON_ID'] == season]
        if season_stats.empty:
            if debug:
                print(f"No stats found for {player_name} in season {season}")
            return None

        # Only the first matching row is used.
        stats_row = season_stats.iloc[0]

        player_data = {'Player': player_name, 'Season': season}
        player_data.update(
            (out_key, stats_row.get(src_key, fallback))
            for out_key, src_key, fallback in stat_fields
        )
        # Injury defaults; merge_injury_data overwrites them later.
        player_data.update({
            'Injured': False,
            'Injury_Periods': '',
            'Total_Days_Injured': 0,
            'Injury_Risk': 'Low Risk',
        })

        if debug:
            print(f"Successfully processed {player_name}: {len(player_data)} fields")
        return player_data

    except Exception as e:
        # Best effort: any failure for one player yields None instead of aborting.
        if debug:
            print(f"Error processing {player_name}: {e!s}")
        return None

def merge_injury_data(player_data, injury_data):
    """
    Annotate each player-season row with injury information.

    Parameters
    ----------
    player_data : DataFrame with at least ``Player`` and ``Season`` columns.
    injury_data : DataFrame with ``Season``, ``Date``, ``Relinquished`` and
        ``Acquired`` columns, or ``None``.  ``Date`` must hold datetime-like
        values (they are subtracted and formatted with ``strftime``).

    Returns
    -------
    A copy of ``player_data`` with ``Injured``, ``Injury_Periods``,
    ``Total_Days_Injured`` and ``Injury_Risk`` filled in.  ``player_data``
    itself is returned untouched when ``injury_data`` is ``None`` or the frame
    is empty.

    Notes
    -----
    * Player names are matched as literal, case-insensitive substrings, so
      characters such as ``.`` or ``(`` in a name are not treated as regex
      syntax.
    * An injury ends at the earliest later ``Acquired`` entry for the player.
      Without one it is assumed to last until June 30 of the year after the
      season's starting year (e.g. "2023-24" -> 2024-06-30).
    * Risk buckets: < 10 days "Low Risk", 10-20 days "Moderate Risk",
      otherwise "High Risk".
    """
    if injury_data is None or player_data.empty:
        return player_data

    # Work on a copy so the caller's frame is not modified.
    merged_data = player_data.copy()

    # Initialise injury columns that do not exist yet.
    injury_defaults = {
        'Injured': False,
        'Injury_Periods': '',
        'Total_Days_Injured': 0,
        'Injury_Risk': 'Low Risk',
    }
    for column, default in injury_defaults.items():
        if column not in merged_data.columns:
            merged_data[column] = default

    for index, row in merged_data.iterrows():
        player_name = row['Player']
        season = row['Season']

        # Injuries for this player in this season (literal name match).
        was_relinquished = injury_data['Relinquished'].str.contains(
            player_name, case=False, na=False, regex=False
        )
        player_injuries = injury_data[(injury_data['Season'] == season) & was_relinquished]
        if player_injuries.empty:
            continue

        # Rows where the player was acquired back; the same for every injury below.
        was_acquired = injury_data['Acquired'].str.contains(
            player_name, case=False, na=False, regex=False
        )
        return_dates = injury_data.loc[was_acquired, 'Date']

        periods = []
        total_days = 0
        for _, injury in player_injuries.iterrows():
            start_date = injury['Date']

            later_returns = return_dates[return_dates > start_date]
            if not later_returns.empty:
                # Earliest return after the injury, independent of row order.
                end_date = later_returns.min()
            else:
                # No return recorded: assume out until the end of the season.
                # The end year is derived from the 4-digit start year because
                # the part after the dash may be only two digits ("2023-24").
                end_year = int(str(season)[:4]) + 1
                end_date = pd.Timestamp(year=end_year, month=6, day=30)

            total_days += (end_date - start_date).days
            periods.append(f"{start_date.strftime('%Y-%m-%d')} - {end_date.strftime('%Y-%m-%d')}")

        merged_data.at[index, 'Injured'] = True
        merged_data.at[index, 'Injury_Periods'] = '; '.join(periods)
        merged_data.at[index, 'Total_Days_Injured'] = total_days

        # Categorize injury risk by total days missed.
        if total_days < 10:
            risk = 'Low Risk'
        elif total_days <= 20:
            risk = 'Moderate Risk'
        else:
            risk = 'High Risk'
        merged_data.at[index, 'Injury_Risk'] = risk

    return merged_data


Overwriting ../src/salary_nba_data_pull/process_utils.py


In [None]:
%%writefile ../src/salary_nba_data_pull/data_utils.py

import pandas as pd
import numpy as np
from salary_nba_data_pull.process_utils import (
    inflate_value
)
from salary_nba_data_pull.quality import (
    ExpectedSchema, audit_dataframe, write_audit_reports
)
from salary_nba_data_pull.settings import DATA_PROCESSED_DIR

def clean_dataframe(df):
    """
    Return a tidied version of ``df``.

    Steps, in order: drop columns whose name starts with "Unnamed", drop
    repeated column names (first occurrence kept), drop all-NaN columns and
    rows, collapse multiple columns containing "Season" into a single
    ``Season`` column (the first one), drop ``3PAr``/``FTr`` and round numeric
    columns to 2 decimals.
    """
    is_unnamed = df.columns.str.contains('^Unnamed')
    df = df.loc[:, ~is_unnamed]

    is_repeat = df.columns.duplicated()
    df = df.loc[:, ~is_repeat]

    # Fully empty columns first, then fully empty rows.
    df = df.dropna(axis=1, how='all').dropna(axis=0, how='all')

    # Keep the first "Season"-like column under the canonical name.
    season_like = [name for name in df.columns if 'Season' in name]
    if len(season_like) > 1:
        keeper, *extras = season_like
        df = df.rename(columns={keeper: 'Season'})
        for extra in extras:
            df = df.drop(columns=[extra])

    df = df.drop(columns=['3PAr', 'FTr'], errors='ignore')

    numeric_names = df.select_dtypes(include=[np.number]).columns
    df[numeric_names] = df[numeric_names].round(2)

    return df

def merge_salary_cap_data(player_data, salary_cap_data):
    """
    Left-join league salary-cap figures onto player rows by season start year.

    Parameters
    ----------
    player_data : DataFrame with a string ``Season`` column whose first four
        characters are the season's starting year.
    salary_cap_data : DataFrame with ``Season`` (same format) and
        ``Salary Cap`` columns, plus any other cap-related columns.

    Returns
    -------
    A new, cleaned DataFrame (see ``clean_dataframe``) containing the player
    rows with the cap columns and an inflation-adjusted
    ``Salary_Cap_Inflated`` column.  Neither input frame is modified.
    """
    # Derive the join key on new frames so the caller's DataFrames stay
    # untouched (no stray Season_Year / Salary_Cap_Inflated columns).
    players = player_data.assign(
        Season_Year=player_data['Season'].str[:4].astype(int)
    )
    caps = salary_cap_data.assign(
        Season_Year=salary_cap_data['Season'].str[:4].astype(int)
    )

    # Add inflation-adjusted salary cap
    caps['Salary_Cap_Inflated'] = caps.apply(
        lambda row: inflate_value(row['Salary Cap'], row['Season']),
        axis=1
    )

    # Columns present on both sides keep the player value and get a "_cap"
    # suffixed twin holding the cap-table value.
    merged_data = pd.merge(players, caps, on='Season_Year', how='left', suffixes=('', '_cap'))

    # Where a cap column already existed on the player side, fill its gaps
    # from the cap table and discard the suffixed twin.
    cap_columns = ['Mid-Level Exception', 'Salary Cap', 'Luxury Tax', '1st Apron', '2nd Apron', 'BAE',
                   'Standard /Non-Taxpayer', 'Taxpayer', 'Team Room /Under Cap', 'Salary_Cap_Inflated']
    for col in cap_columns:
        cap_twin = f'{col}_cap'
        if cap_twin in merged_data.columns:
            merged_data[col] = merged_data[col].fillna(merged_data[cap_twin])
            merged_data = merged_data.drop(columns=[cap_twin])

    # The join key was only needed for the merge.
    merged_data = merged_data.drop(columns=['Season_Year'])

    return clean_dataframe(merged_data)

def validate_data(df: pd.DataFrame,
                  *,
                  name: str = "player_dataset",
                  save_reports: bool = True) -> pd.DataFrame:
    """
    Audit ``df`` against the player-dataset schema and return it unchanged.

    Parameters
    ----------
    df : the dataset to audit.
    name : label passed to the audit and used as the report file prefix.
    save_reports : when true, the audit reports are written as CSV files
        under ``DATA_PROCESSED_DIR / "audits"``.

    A one-line summary is printed if the audit flags missing required columns.
    """
    required = ["Season", "Player", "Salary", "Team"]
    expected_dtypes = {
        "Season": "object",
        "Player": "object",
        "Salary": "float64",
    }
    must_be_non_negative = ["Salary", "GP", "MP", "PTS", "TRB", "AST", "Team_Salary"]
    must_vary = ["Salary", "PTS", "Team_Salary"]

    schema = ExpectedSchema(
        expected_cols=df.columns,           # every current column counts as expected
        required_cols=required,
        dtypes=expected_dtypes,
        non_negative_cols=must_be_non_negative,
        non_constant_cols=must_vary,
        unique_key=["Season", "Player"]
    )

    reports = audit_dataframe(df, schema, name=name)

    if save_reports:
        audit_dir = DATA_PROCESSED_DIR / "audits"
        write_audit_reports(reports, audit_dir, prefix=name)

    # One-line console summary of required columns that are absent.
    overview = reports["cols_overview"]
    absent_required = overview.query("missing_required == True")
    if not absent_required.empty:
        print(f"[validate_data] Missing required columns: {absent_required['column'].tolist()}")

    return df


Overwriting ../src/salary_nba_data_pull/data_utils.py


In [14]:
%%writefile ../src/salary_nba_data_pull/quality.py
# src/salary_nba_data_pull/quality.py
from __future__ import annotations
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterable, Mapping, Any
import pandas as pd
import numpy as np

@dataclass
class ExpectedSchema:
    """
    Declarative description of what a DataFrame is *intended* to look like.

    Only ``expected_cols`` is mandatory; every other field defaults to an
    empty list/dict.  The fields are consumed by ``audit_dataframe``.
    """
    # All columns we care about (order doesn't matter; converted to a set by the audit)
    expected_cols: Iterable[str]

    # Subset of columns that must be present in the DataFrame
    required_cols: Iterable[str] = field(default_factory=list)

    # Expected pandas dtypes keyed by column, in string form (e.g. 'float64', 'object');
    # compared against str(df[col].dtype)
    dtypes: Mapping[str, str] = field(default_factory=dict)

    # Numeric columns whose values must be >= 0
    non_negative_cols: Iterable[str] = field(default_factory=list)

    # Columns that must hold more than one distinct non-null value
    non_constant_cols: Iterable[str] = field(default_factory=list)

    # Columns that, taken together, must identify each row uniquely
    unique_key: Iterable[str] = field(default_factory=list)

    # Allowed value sets (enums) keyed by column; nulls are not counted as violations
    allowed_values: Mapping[str, Iterable[Any]] = field(default_factory=dict)

def _series_is_constant(s: pd.Series) -> bool:
    return s.nunique(dropna=True) <= 1

def audit_dataframe(df: pd.DataFrame,
                    schema: ExpectedSchema,
                    *,
                    name: str = "dataset") -> dict[str, pd.DataFrame]:
    """
    Return a dict of small DataFrames summarising quality checks.
    Nothing is printed; caller decides how to persist/log.

    Parameters
    ----------
    df : the DataFrame to audit.
    schema : the expectations to check ``df`` against.
    name : accepted for interface symmetry with the other audit helpers;
        not used by this function.

    Returns
    -------
    dict with these keys (each value a DataFrame):

    * ``cols_overview``  - one row per expected, required or present column
      with ``expected`` / ``present`` / ``required`` / ``missing_required``.
    * ``null_report``    - null count and percentage per column.
    * ``type_report``    - expected vs. actual dtype per column.
    * ``value_report``   - min/max/negatives/zeros for numeric columns.
    * ``constant_report``- whether each column is constant and whether that
      violates the schema.
    * ``enum_report``    - values outside the allowed sets.
    * ``unique_report``  - number of rows sharing a duplicated unique key
      (an empty frame when the schema has no unique key).

    Every report except ``unique_report`` always carries its columns, even
    when it has no rows, so callers can filter on them unconditionally.
    """
    # Materialise the schema's iterables once: they are consulted repeatedly
    # below and may be one-shot iterators.
    exp = set(schema.expected_cols)
    req = set(schema.required_cols)
    non_negative = set(schema.non_negative_cols)
    non_constant = set(schema.non_constant_cols)
    unique_key = list(schema.unique_key)

    present = set(df.columns)

    # --- Column overview
    # Required columns are part of the universe even when they are neither
    # expected nor present; otherwise a missing required column could never
    # be reported.
    all_cols = sorted(exp | req | present)
    cols_overview = pd.DataFrame({
        "column": all_cols,
        "expected": [c in exp for c in all_cols],
        "present":  [c in present for c in all_cols],
        "required": [c in req for c in all_cols],
        "missing_required": [c in req and c not in present for c in all_cols],
    })

    # --- Null report
    null_report = (df.isna().sum().to_frame("null_count")
                     .assign(total_rows=len(df))
                     .assign(null_pct=lambda d: 100 * d["null_count"] / d["total_rows"])
                     .reset_index()
                     .rename(columns={"index": "column"}))

    # --- Dtype report (columns without an expected dtype always match)
    type_rows = []
    for col in df.columns:
        exp_type = schema.dtypes.get(col)
        actual_type = str(df[col].dtype)
        type_rows.append({
            "column": col,
            "expected_dtype": exp_type,
            "actual_dtype": actual_type,
            "matches": (exp_type is None) or (actual_type == exp_type)
        })
    type_report = pd.DataFrame(
        type_rows,
        columns=["column", "expected_dtype", "actual_dtype", "matches"],
    )

    # --- Value checks (numeric columns only)
    n_rows = len(df)
    value_rows = []
    for col in df.select_dtypes(include=[np.number]).columns:
        series = df[col]
        negatives = int((series < 0).sum())
        should_be_non_negative = col in non_negative
        value_rows.append({
            "column": col,
            "min": series.min(skipna=True),
            "max": series.max(skipna=True),
            "negatives": negatives,
            "zeros": int((series == 0).sum()),
            # NaN compares unequal to 0, so missing values count as non-zero.
            "non_zero_pct": 100 * (series != 0).sum() / n_rows if n_rows else float("nan"),
            "should_be_non_negative": should_be_non_negative,
            "violates_non_negative": negatives > 0 and should_be_non_negative,
        })
    value_report = pd.DataFrame(
        value_rows,
        columns=["column", "min", "max", "negatives", "zeros", "non_zero_pct",
                 "should_be_non_negative", "violates_non_negative"],
    )

    # --- Constant columns
    constant_rows = []
    for col in df.columns:
        is_constant = bool(_series_is_constant(df[col]))
        should_vary = col in non_constant
        constant_rows.append({
            "column": col,
            "is_constant": is_constant,
            "should_not_be_constant": should_vary,
            "violates": is_constant and should_vary,
        })
    constant_report = pd.DataFrame(
        constant_rows,
        columns=["column", "is_constant", "should_not_be_constant", "violates"],
    )

    # --- Allowed values (nulls are not treated as violations)
    enum_rows = []
    for col, allowed in schema.allowed_values.items():
        if col not in df.columns:
            continue
        bad = ~df[col].isin(list(allowed)) & df[col].notna()
        enum_rows.append({
            "column": col,
            "bad_count": int(bad.sum()),
            "sample_bad": df.loc[bad, col].drop_duplicates().head(5).tolist()
        })
    enum_report = pd.DataFrame(enum_rows, columns=["column", "bad_count", "sample_bad"])

    # --- Unique key: count every row that shares its key with another row
    uniq_report = pd.DataFrame()
    if unique_key:
        dup_mask = df.duplicated(subset=unique_key, keep=False)
        uniq_report = pd.DataFrame({
            "duplicate_rows": [int(dup_mask.sum())],
            "subset": [unique_key]
        })

    return {
        "cols_overview": cols_overview,
        "null_report": null_report,
        "type_report": type_report,
        "value_report": value_report,
        "constant_report": constant_report,
        "enum_report": enum_report,
        "unique_report": uniq_report
    }

def _flagged_rows(report: pd.DataFrame, column: str, value: bool) -> pd.DataFrame:
    """Rows of ``report`` whose ``column`` equals ``value``; no rows if the column is absent."""
    if column not in report.columns:
        # A report built from zero rows may have no columns at all.
        return report.iloc[0:0]
    return report[report[column] == value]


def assert_dataframe_ok(df: pd.DataFrame,
                        schema: ExpectedSchema,
                        *, name: str = "dataset") -> None:
    """
    Raise AssertionError with a concise message if critical checks fail.
    Designed for pytest or CI.

    The checks are: missing required columns, dtype mismatches, negative
    values in non-negative columns, constant columns that must vary, and
    duplicated unique keys.  Reports that come back without rows or columns
    (for instance when ``df`` has no numeric columns) count as "no
    violations" rather than causing an error.

    Parameters
    ----------
    df : the DataFrame to check.
    schema : expectations passed on to ``audit_dataframe``.
    name : label used as prefix of the failure message.

    Raises
    ------
    AssertionError
        When at least one check fails; the message lists every failure.
    """
    rep = audit_dataframe(df, schema, name=name)
    bad_missing = _flagged_rows(rep["cols_overview"], "missing_required", True)
    bad_types = _flagged_rows(rep["type_report"], "matches", False)
    bad_nonneg = _flagged_rows(rep["value_report"], "violates_non_negative", True)
    bad_constant = _flagged_rows(rep["constant_report"], "violates", True)
    # The unique report is empty when the schema declares no unique key.
    dupes = rep["unique_report"]["duplicate_rows"].iloc[0] if not rep["unique_report"].empty else 0

    msgs = []
    if not bad_missing.empty:
        msgs.append(f"Missing required cols: {bad_missing['column'].tolist()}")
    if not bad_types.empty:
        msgs.append(f"Dtype mismatches: {bad_types[['column','expected_dtype','actual_dtype']].to_dict('records')}")
    if not bad_nonneg.empty:
        msgs.append(f"Negative values in non-negative cols: {bad_nonneg['column'].tolist()}")
    if not bad_constant.empty:
        msgs.append(f"Constant-but-shouldn't cols: {bad_constant['column'].tolist()}")
    if dupes:
        msgs.append(f"Duplicate key rows: {dupes}")

    if msgs:
        raise AssertionError(f"[{name}] data quality failures:\n" + "\n".join(msgs))

def write_audit_reports(reports: Mapping[str, pd.DataFrame],
                        out_dir: Path,
                        prefix: str) -> None:
    """
    Write every report as ``<prefix>_<report key>.csv`` inside ``out_dir``.

    ``out_dir`` (and any missing parents) is created first; the DataFrame
    index is not written.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    for report_key, frame in reports.items():
        target = out_dir / f"{prefix}_{report_key}.csv"
        frame.to_csv(target, index=False)

Overwriting ../src/salary_nba_data_pull/quality.py


In [None]:
%%writefile ../src/salary_nba_data_pull/main.py
import argparse
import pandas as pd
import logging
import time
import glob
import os
import hashlib
from pathlib import Path
import pyarrow.parquet as pq
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm
import requests_cache
from salary_nba_data_pull.fetch_utils import fetch_all_players, fetch_season_players
from salary_nba_data_pull.process_utils import (
    process_player_data,
    inflate_value,
    calculate_percentages,
    _ensure_cpi_ready,
)
from salary_nba_data_pull.scrape_utils import (
    scrape_salary_cap_history,
    merge_injury_data,
    scrape_player_salary_data,
    scrape_team_salary_data,
    load_injury_data,
)
from salary_nba_data_pull.data_utils import (
    clean_dataframe,
    merge_salary_cap_data,
    validate_data,
)
from salary_nba_data_pull.settings import DATA_PROCESSED_DIR

# Import-time side effect: install a process-wide requests-cache named
# "nba_pull" (SQLite backend). Only responses with HTTP status 200 are cached.
requests_cache.install_cache("nba_pull", backend="sqlite", allowable_codes=(200,))

# CPI self-test, also run at import time. Per the original note it logs a
# warning once per run if CPI is unavailable (the helper lives in process_utils).
_ensure_cpi_ready(debug=False)

# Fallback thread-pool size, used by update_data when max_workers is falsy.
DEFAULT_WORKERS = 8                # tweak ≤ CPU cores

def _file_md5(path: str, chunk: int = 1 << 20) -> str:
    """Return md5 hexdigest for *path* streaming in 1 MiB chunks."""
    h = hashlib.md5()
    with open(path, "rb") as f:
        for blk in iter(lambda: f.read(chunk), b""):
            h.update(blk)
    return h.hexdigest()

def _season_partition_identical(season: str, base_dir: str, new_df: pd.DataFrame) -> bool:
    """True when an on‑disk parquet for *season* has same schema & md5 as *new_df*."""
    ckpt = Path(base_dir) / f"season={season}" / "part.parquet"
    md5_file = ckpt.with_suffix(".md5")
    if not ckpt.exists():
        return False

    # ---------- 1. schema check (fast) ----------
    stored_schema = pq.read_schema(ckpt)
    if set(stored_schema.names) != set(new_df.columns):
        return False

    # ---------- 2. content hash ----------
    if not md5_file.exists():
        return False                       # force re‑write → hash will be created
    stored_hash = md5_file.read_text().strip()
    new_hash = hashlib.md5(new_df.to_parquet(index=False)).hexdigest()
    return stored_hash == new_hash

def _season_partition_exists(season, base_dir):
    """Check if a season partition already exists in Parquet format."""
    return os.path.exists(os.path.join(base_dir, f"season={season}"))

def _player_task(args):
    """Wrapper so we can map in ThreadPoolExecutor."""
    (player_name, season, salary, all_players, debug) = args
    stats = process_player_data(player_name, season, all_players, debug)
    if stats:
        stats['Salary'] = salary
    return stats

# ----------------------------------------------------------------------
def update_data(existing_data,
                start_year: int,
                end_year: int,
                *,
                player_filter: str = "all",
                min_avg_minutes: float | None = None,
                debug: bool = False,
                max_workers: int = 8,
                output_base: str | Path = DATA_PROCESSED_DIR,
                overwrite: bool = False) -> pd.DataFrame:
    """
    Pull the seasons in [start_year, end_year] and write everything under
    `output_base` (parquet partitions + per-season artifacts).

    For each season the function scrapes team payrolls, processes every
    player found in the salary data on a thread pool, merges payroll and
    injury data, derives percentages, cleans the frame and stores it as
    ``season=<season>/part.parquet`` with a ``part.md5`` next to it.  Players
    from the salary data that are not in the season roster are listed in
    ``missing_players.txt``.

    The freshly built season frame is compared with the partition already on
    disk; when they are identical and `overwrite` is false the files are left
    untouched.  (The comparison has to happen after processing: the stored
    partition holds the processed rows, so it can only ever match processed
    rows, never the raw salary slice.)

    Parameters
    ----------
    existing_data : unused; kept so existing callers keep working.
    start_year, end_year : first and last season start year (inclusive).
    player_filter : forwarded to ``scrape_player_salary_data``.
    min_avg_minutes : when truthy, rows with ``MP`` below it are dropped.
    debug : print progress details and forward the flag to the scrapers.
    max_workers : thread-pool size; ``DEFAULT_WORKERS`` is used when falsy.
    output_base : root directory of the ``season=<season>`` partitions.
    overwrite : rewrite a partition even when its content is unchanged.

    Returns
    -------
    The processed rows of every season that produced data, concatenated; an
    empty DataFrame when no season produced any.
    """
    output_base = Path(output_base)
    output_base.mkdir(parents=True, exist_ok=True)

    injury = load_injury_data()
    salary_df = scrape_player_salary_data(start_year, end_year,
                                          player_filter, debug)

    out_frames: list[pd.DataFrame] = []

    for y in tqdm(range(start_year, end_year + 1), desc="Seasons"):
        season = f"{y}-{str(y+1)[-2:]}"
        ckpt_dir = output_base / f"season={season}"
        ckpt_dir.mkdir(parents=True, exist_ok=True)
        parquet_path = ckpt_dir / "part.parquet"

        # -------- scrape per-season data ---------------------------------
        team_payroll = scrape_team_salary_data(season, debug)
        if team_payroll.empty:
            # Keep the merge below working when no payroll was scraped.
            team_payroll = pd.DataFrame(columns=["Team", "Team_Salary", "Season"])

        # A falsy roster is treated as "no players known" instead of crashing.
        players_this_season = fetch_season_players(season, debug) or {}
        rows = salary_df.query("Season == @season")

        args = [(row.Player, season, row.Salary, players_this_season, debug)
                for _, row in rows.iterrows()]

        with ThreadPoolExecutor(max_workers=max_workers or DEFAULT_WORKERS) as pool:
            results = [r for r in
                       tqdm(pool.map(_player_task, args), total=len(args), leave=False, desc=season)
                       if r]

        # -------- handle roster gaps -------------------------------------
        # Names are normalised like process_player_data does (lower + strip);
        # null names are dropped so the join below only sees strings.
        salary_names = rows["Player"].dropna().astype(str)
        in_roster = (salary_names.str.lower().str.strip()
                     .isin(list(players_this_season)))
        missing = salary_names[~in_roster].unique()
        if missing.size and debug:
            print(f"⚠️  {len(missing)} players not in roster for {season}")
        # Explicit UTF-8: player names contain characters that the platform
        # default encoding (e.g. cp1252 on Windows) cannot represent.
        (ckpt_dir / "missing_players.txt").write_text("\n".join(missing), encoding="utf-8")

        df_season = pd.DataFrame(results)
        if df_season.empty:
            continue

        season_stats = pd.merge(df_season, team_payroll, on=["Team", "Season"], how="left")
        if min_avg_minutes:
            season_stats = season_stats[season_stats["MP"] >= min_avg_minutes]
        merged = (season_stats
                    .pipe(merge_injury_data, injury_data=injury)
                    .pipe(calculate_percentages, debug=debug)
                    .pipe(clean_dataframe))

        # -------- write parquet + md5 (only when something changed) ------
        if not overwrite and _season_partition_identical(season, output_base, merged):
            if debug:
                print(f"✓  {season} unchanged – parquet left untouched")
        else:
            if debug and parquet_path.exists():
                print(f"↻  {season} differs – rewriting parquet")
            merged.to_parquet(parquet_path, index=False)
            (ckpt_dir / "part.md5").write_text(_file_md5(parquet_path))
            logging.info("wrote %s", ckpt_dir)

        out_frames.append(merged)

    return pd.concat(out_frames, ignore_index=True) if out_frames else pd.DataFrame()

def get_timestamp():
    """Current local time as ``YYYY-MM-DD_HH-MM-SS``, safe to embed in file names."""
    return f"{datetime.now():%Y-%m-%d_%H-%M-%S}"

def remove_old_logs(log_dir, days_to_keep=7):
    """
    Delete ``stat_pull_log_*.txt`` files in ``log_dir`` whose last
    modification lies more than ``days_to_keep`` days in the past.
    """
    now = datetime.now()
    max_age = timedelta(days=days_to_keep)
    pattern = os.path.join(log_dir, 'stat_pull_log_*.txt')
    for candidate in glob.glob(pattern):
        modified_at = datetime.fromtimestamp(os.path.getmtime(candidate))
        age = now - modified_at
        if age > max_age:
            os.remove(candidate)

def main(start_year: int,
         end_year: int,
         player_filter: str = "all",
         min_avg_minutes: float = 15,
         debug: bool = False,
         workers: int = 8,
         overwrite: bool = False,
         output_base: str | Path = DATA_PROCESSED_DIR) -> None:
    """
    CLI / notebook entry point.
    All artefacts go under `output_base`; log files go to a sibling
    `stat_pull_output` directory (old logs are pruned first).

    Parameters
    ----------
    start_year, end_year : first and last season start year (inclusive).
    player_filter : forwarded to ``update_data``.
    min_avg_minutes : minimum ``MP`` a row needs to be kept.
    debug : verbose console output and DEBUG-level logging.
    workers : thread-pool size for per-player processing.
    overwrite : forwarded to ``update_data``.
    output_base : destination root for parquet partitions and CSV files.

    When the pull returns rows, the salary-cap history is scraped, saved and
    merged in, the result is audited and finally written to
    ``nba_player_data_final_inflated.csv``.
    """
    t0 = time.time()
    output_base = Path(output_base)
    output_base.mkdir(parents=True, exist_ok=True)

    log_dir = output_base.parent / "stat_pull_output"
    log_dir.mkdir(parents=True, exist_ok=True)
    remove_old_logs(log_dir)

    log_file = log_dir / f"stat_pull_log_{get_timestamp()}.txt"
    # force=True replaces handlers left on the root logger by an earlier call;
    # without it basicConfig does nothing on repeated runs in the same
    # process (e.g. a notebook kernel) and the new log file stays empty.
    logging.basicConfig(filename=log_file,
                        level=logging.DEBUG if debug else logging.INFO,
                        format="%(asctime)s - %(levelname)s - %(message)s",
                        force=True)

    updated = update_data(None, start_year, end_year,
                          player_filter=player_filter,
                          min_avg_minutes=min_avg_minutes,
                          debug=debug,
                          max_workers=workers,
                          output_base=str(output_base),
                          overwrite=overwrite)

    print(f"✔ Completed pull: {len(updated):,} rows added")

    if not updated.empty:
        salary_cap = scrape_salary_cap_history(debug=debug)
        if salary_cap is not None:
            cap_csv = output_base / "salary_cap_history_inflated.csv"
            salary_cap.to_csv(cap_csv, index=False)
            updated = merge_salary_cap_data(updated, salary_cap)

        # Run comprehensive data audit (validate_data is imported at module level)
        updated = validate_data(updated, name="player_dataset", save_reports=True)

        final_csv = output_base / "nba_player_data_final_inflated.csv"
        updated.to_csv(final_csv, index=False, float_format="%.2f")
        if debug:
            print(f"Updated flat‑file saved to {final_csv}")

    print(f"Process finished in {time.time() - t0:.1f} s — log: {log_file}")
# ----------------------------------------------------------------------

# argparse snippet (bottom of file – keep imports above):
if __name__ == "__main__":
    # Command-line entry: every option maps 1:1 onto a parameter of main().
    this_year = datetime.now().year
    parser = argparse.ArgumentParser()
    parser.add_argument("--start_year", type=int, default=this_year - 1)
    parser.add_argument("--end_year", type=int, default=this_year)
    parser.add_argument("--player_filter", default="all")
    parser.add_argument("--min_avg_minutes", type=float, default=15)
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("--workers", type=int, default=8)
    parser.add_argument("--overwrite", action="store_true")
    parser.add_argument("--output_base",
                        default=str(DATA_PROCESSED_DIR),
                        help="Destination root for parquet + csv outputs")
    cli_options = parser.parse_args()
    # The parsed namespace is expanded into keyword arguments.
    main(**vars(cli_options))



Overwriting ../src/salary_nba_data_pull/main.py


In [1]:
# %%writefile ../src/salary_nba_data_pull/notebook_helper.py
"""
Notebook/REPL helper utilities for salary_nba_data_pull.

Goals
-----
• Work no matter where the notebook is opened (absolute paths).
• Avoid NameError on __file__.
• Keep hot‑reload for iterative dev.
• Forward arbitrary args to main() so we can test all scenarios.

Use:
>>> import salary_nba_data_pull.notebook_helper as nb
>>> nb.quick_pull(2024, workers=12, debug=True)
"""

from __future__ import annotations

import sys
import importlib
import inspect
from pathlib import Path
from typing import Iterable
from salary_nba_data_pull.settings import DATA_PROCESSED_DIR

# ----------  PATH / ROOT DISCOVERY  ---------------------------------
def find_repo_root(start: Path | None = None) -> Path:
    """
    Locate the repository root by walking upward from ``start``.

    The first directory (``start`` itself, then each parent) that contains
    ``pyproject.toml`` or ``.git`` is returned.  ``start`` defaults to the
    current working directory; the resolved start is returned when no marker
    is found.
    """
    origin = (start or Path.cwd()).resolve()
    for candidate in (origin, *origin.parents):
        for marker in ("pyproject.toml", ".git"):
            if (candidate / marker).exists():
                return candidate
    return origin

# Determine where this code lives: the module file when imported normally,
# otherwise the current working directory.
try:
    # __file__ is undefined in Jupyter; this will raise NameError there.
    _MODULE_PATH = Path(__file__).resolve()
except NameError:
    _MODULE_PATH = Path.cwd()

# Nearest ancestor containing pyproject.toml or .git (see find_repo_root).
ROOT = find_repo_root(_MODULE_PATH)

# Import-time side effect: make ROOT importable and announce it on stdout.
# NOTE(review): salary_nba_data_pull.settings is imported above, before this
# path tweak, so the package must already be importable; and the package
# appears to live under ROOT/src rather than ROOT itself. Confirm this
# insertion is still needed.
if str(ROOT) not in sys.path:
    # Put project root at the *front* so local package wins over installed ones.
    sys.path.insert(0, str(ROOT))
    print(f"[notebook_helper] Added ROOT to sys.path: {ROOT}")

# ----------  IMPORT PACKAGE MAIN (ABSOLUTE)  ------------------------
from salary_nba_data_pull import main as nba_main  # noqa: E402  (import after sys.path tweak)

def _reload():
    """
    Reload the main module so code edits are picked up.

    Only ``salary_nba_data_pull.main`` itself is re-executed; modules it
    imports are not reloaded by this call.
    """
    importlib.reload(nba_main)

# ----------  USER-FACING WRAPPERS  ----------------------------------
def quick_pull(season: int, **kwargs):
    """
    Pull a single season.

    ``season`` is used as both ``start_year`` and ``end_year`` of
    ``nba_main.main()``.  All kwargs are passed straight into
    ``nba_main.main()`` so you can test: debug, workers, overwrite,
    player_filter, etc.  The main module is reloaded first so recent code
    edits take effect.
    """
    _reload()
    print(f"[quick_pull] season={season}, kwargs={kwargs}")
    nba_main.main(start_year=season, end_year=season, **kwargs)

def historical_pull(start_year: int, end_year: int, **kwargs):
    """
    Pull an inclusive range of seasons.

    ``start_year`` and ``end_year`` plus all kwargs are forwarded to
    ``nba_main.main()``; the main module is reloaded first so recent code
    edits take effect.
    """
    _reload()
    print(f"[historical_pull] {start_year}-{end_year}, kwargs={kwargs}")
    nba_main.main(start_year=start_year, end_year=end_year, **kwargs)

def check_existing_data(base: Path | str | None = None) -> list[str]:
    """
    List the seasons that already have a ``season=<season>`` directory.

    ``base`` defaults to ``DATA_PROCESSED_DIR``.  The season labels (the part
    after ``season=``) are returned sorted, and a one-line summary is printed.
    """
    root = DATA_PROCESSED_DIR if not base else Path(base)
    found = []
    for entry in root.glob("season=*"):
        # Plain files that happen to match the pattern are ignored.
        if entry.is_dir():
            found.append(entry.name.split("=", 1)[-1])
    found.sort()
    print(f"[check_existing_data] found {len(found)} seasons in {root}")
    return found

def load_parquet_data(
    season: str | None = None, *,
    base: Path | str | None = None
) -> pd.DataFrame:
    """
    Load parquet partitions into a DataFrame.

    Parameters
    ----------
    season : season label such as "2023-24"; when None (or empty) every
        ``season=*`` partition is loaded.
    base : root directory holding the partitions; defaults to
        ``DATA_PROCESSED_DIR``.

    Returns
    -------
    The partitions concatenated in sorted path order with a fresh index, or
    an empty DataFrame when no ``part.parquet`` file matches.
    """
    import pandas as pd
    base = Path(base) if base else DATA_PROCESSED_DIR
    pattern = f"season={season}/part.parquet" if season else "season=*/part.parquet"
    # glob yields matches in arbitrary, filesystem-dependent order; sorting
    # makes the row order of the result reproducible.
    files = sorted(base.glob(pattern))

    if not files:
        print("[load_parquet_data] No parquet files found.")
        return pd.DataFrame()

    print(f"[load_parquet_data] loading {len(files)} files from {base}")
    return pd.concat((pd.read_parquet(f) for f in files), ignore_index=True)

def clear_all_caches():
    """
    Empty the HTTP cache and the fetch_utils cache.

    Calls ``requests_cache.clear()`` followed by ``fetch_utils.clear_cache()``
    (described by the original author as the joblib memory cache) and prints
    a confirmation.
    """
    import requests_cache
    from salary_nba_data_pull.fetch_utils import clear_cache as clear_fetch_cache

    requests_cache.clear()
    clear_fetch_cache()
    print("✅ caches cleared")

def print_args():
    """
    Print one line per parameter of ``nba_main.main()``.

    Each line shows the parameter name (left-aligned, padded to 15
    characters), the repr of its default, and its kind. Parameters
    without a default show ``inspect``'s empty sentinel.
    """
    main_params = inspect.signature(nba_main.main).parameters
    for param in main_params.values():
        print(f"{param.name:<15} default={param.default!r}  kind={param.kind}")

# ----------  CLI-ish entry point for quick manual test --------------
if __name__ == "__main__":
    # Show the table of arguments accepted by nba_main.main().
    print_args()

    # Single-season smoke test.
    quick_pull(2023, workers=4, debug=True)

    # Multi-season pull over an inclusive range.
    historical_pull(
        2019,
        2024,
        workers=6,
        min_avg_minutes=10,
        overwrite=False,
        debug=True,
    )

    # See which seasons are cached, then inspect a single season.
    check_existing_data()
    df = load_parquet_data("2023-24")

[notebook_helper] Added ROOT to sys.path: C:\docker_projects\coach_analysis
start_year      default=<class 'inspect._empty'>  kind=POSITIONAL_OR_KEYWORD
end_year        default=<class 'inspect._empty'>  kind=POSITIONAL_OR_KEYWORD
player_filter   default='all'  kind=POSITIONAL_OR_KEYWORD
min_avg_minutes default=15  kind=POSITIONAL_OR_KEYWORD
debug           default=False  kind=POSITIONAL_OR_KEYWORD
workers         default=8  kind=POSITIONAL_OR_KEYWORD
overwrite       default=False  kind=POSITIONAL_OR_KEYWORD
output_base     default=WindowsPath('C:/docker_projects/coach_analysis/data/new_processed')  kind=POSITIONAL_OR_KEYWORD
[quick_pull] season=2023, kwargs={'workers': 4, 'debug': True}
[fetch] https://hoopshype.com/salaries/players/2023-24/ (attempt 1)
  -> HTTP 404, skipping.
[fetch] https://hoopshype.com/salaries/players/2023-2024/ (attempt 1)
  -> HTTP 404, skipping.
[fetch] https://www.espn.com/nba/salaries/_/year/2024/page/1 (attempt 1)
[fetch] https://www.espn.com/nba/salaries/_

Seasons:   0%|          | 0/1 [00:00<?, ?it/s]

↻  2023-24 differs – re‑scraping
[fetch] https://hoopshype.com/salaries/2023-24/ (attempt 1)
  -> HTTP 404, skipping.
[fetch] https://hoopshype.com/salaries/2023-2024/ (attempt 1)
  -> HTTP 404, skipping.
[fetch] https://www.espn.com/nba/salaries/_/type/team/year/2024 (attempt 1)
  -> ESPN team salary fallback added 40 rows
                Team  Team_Salary   Season
0  Stephen Curry, PG     51915615  2023-24
1   Kevin Durant, PF     47649433  2023-24
2   LeBron James, SF     47607350  2023-24
3    Nikola Jokic, C     47607350  2023-24
4     Joel Embiid, C     46900000  2023-24
[fetch_season_players] 596 players for 2023-24
Processing Stephen Curry (ID: 201939)
Processing Kevin Durant (ID: 201142)
Processing Lebron James (ID: 2544)
Processing Nikola Jokic (ID: 203999)
Successfully processed Kevin Durant: 25 fields
Processing Joel Embiid (ID: 203954)
Successfully processed Stephen Curry: 25 fields
Processing Bradley Beal (ID: 203078)
Successfully processed Nikola Jokic: 25 fields
Process

2023-24:   0%|          | 0/475 [00:00<?, ?it/s]

Successfully processed Damian Lillard: 25 fields
Processing Klay Thompson (ID: 202691)
Successfully processed Kawhi Leonard: 25 fields
Processing Rudy Gobert (ID: 203497)
Successfully processed Jimmy Butler Iii: 25 fields
Processing Fred Vanvleet (ID: 1627832)
Successfully processed Paul George: 25 fields
Processing Anthony Davis (ID: 203076)
Successfully processed Klay Thompson: 25 fields
Processing Luka Doncic (ID: 1629029)
Successfully processed Fred Vanvleet: 25 fields
Processing Zach Lavine (ID: 203897)
Successfully processed Rudy Gobert: 25 fields
Processing Trae Young (ID: 1629027)
Successfully processed Anthony Davis: 25 fields
Processing Tobias Harris (ID: 202699)
Successfully processed Luka Doncic: 25 fields
Processing Ben Simmons (ID: 1627732)
Successfully processed Trae Young: 25 fields
Processing Pascal Siakam (ID: 1627783)
Successfully processed Zach Lavine: 25 fields
Processing Kyrie Irving (ID: 202681)
Successfully processed Tobias Harris: 25 fields
Processing Jrue Holi

Seasons:   0%|          | 0/6 [00:00<?, ?it/s]

↻  2019-20 differs – re‑scraping
[fetch] https://hoopshype.com/salaries/2019-20/ (attempt 1)
  -> HTTP 404, skipping.
[fetch] https://hoopshype.com/salaries/2019-2020/ (attempt 1)
  -> HTTP 404, skipping.
[fetch] https://www.espn.com/nba/salaries/_/type/team/year/2020 (attempt 1)
  -> ESPN team salary fallback added 40 rows
                    Team  Team_Salary   Season
0      Stephen Curry, PG     40231758  2019-20
1  Russell Westbrook, PG     38506482  2019-20
2         Chris Paul, PG     38506482  2019-20
3       Kevin Durant, PF     38199000  2019-20
4       James Harden, SG     38199000  2019-20
[fetch_season_players] 562 players for 2019-20
Processing Stephen Curry (ID: 201939)
Processing Russell Westbrook (ID: 201566)
Processing Chris Paul (ID: 101108)
Processing Kevin Durant (ID: 201142)
Processing James Harden (ID: 201935)
Processing John Wall (ID: 202322)
Successfully processed Russell Westbrook: 25 fields
Processing Lebron James (ID: 2544)
Successfully processed Stephen Curr

2019-20:   0%|          | 0/520 [00:00<?, ?it/s]

Successfully processed Gordon Hayward: 25 fields
Processing Paul Millsap (ID: 200794)
Successfully processed Kemba Walker: 25 fields
Processing Damian Lillard (ID: 203081)
Successfully processed Khris Middleton: 25 fields
Processing Kevin Love (ID: 201567)
Successfully processed Kyrie Irving: 25 fields
Processing Nikola Vucevic (ID: 202696)
Successfully processed Mike Conley: 25 fields
Processing Al Horford (ID: 201143)
Successfully processed Paul George: 25 fields
Processing Demar Derozan (ID: 201942)
Successfully processed Damian Lillard: 25 fields
Processing Cj Mccollum (ID: 203468)
Successfully processed Paul Millsap: 25 fields
Processing Joel Embiid (ID: 203954)
Successfully processed Kevin Love: 25 fields
Processing Andrew Wiggins (ID: 203952)
Successfully processed Nikola Vucevic: 25 fields
Processing Nikola Jokic (ID: 203999)
Successfully processed Demar Derozan: 25 fields
Processing Devin Booker (ID: 1626164)
Successfully processed Al Horford: 25 fields
Processing Karl-Anthony

2020-21:   0%|          | 0/556 [00:00<?, ?it/s]

Successfully processed Kemba Walker: 25 fields
Processing Ben Simmons (ID: 1627732)
Successfully processed Khris Middleton: 25 fields
Processing Pascal Siakam (ID: 1627783)
Successfully processed Anthony Davis: 25 fields
Processing Kyle Lowry (ID: 200768)
Successfully processed Damian Lillard: 25 fields
Processing Steven Adams (ID: 203500)
Successfully processed Kyrie Irving: 25 fields
Processing Joel Embiid (ID: 203954)
Successfully processed Ben Simmons: 25 fields
Processing Andrew Wiggins (ID: 203952)
Successfully processed Kevin Love: 25 fields
Processing Nikola Jokic (ID: 203999)
Successfully processed Kyle Lowry: 25 fields
Processing Devin Booker (ID: 1626164)
Successfully processed Pascal Siakam: 25 fields
Processing Karl-Anthony Towns (ID: 1626157)
Successfully processed Steven Adams: 25 fields
Processing Kristaps Porzingis (ID: 204001)
Successfully processed Joel Embiid: 25 fields
Processing Cj Mccollum (ID: 203468)
Successfully processed Nikola Jokic: 25 fields
Processing Bra

2021-22:   0%|          | 0/497 [00:00<?, ?it/s]

Successfully processed Bradley Beal: 25 fields
Processing Kristaps Porzingis (ID: 204001)
Successfully processed Pascal Siakam: 25 fields
Processing Joel Embiid (ID: 203954)
Successfully processed Jrue Holiday: 25 fields
Processing Andrew Wiggins (ID: 203952)
No stats found for Ben Simmons in season 2021-22
Processing Nikola Jokic (ID: 203999)
Successfully processed Karl-Anthony Towns: 25 fields
Processing Kevin Love (ID: 201567)
Successfully processed Devin Booker: 25 fields
Processing Cj Mccollum (ID: 203468)
Successfully processed Kristaps Porzingis: 25 fields
Processing D'Angelo Russell (ID: 1626156)
Successfully processed Joel Embiid: 25 fields
Processing Chris Paul (ID: 101108)
Successfully processed Nikola Jokic: 25 fields
Processing Gordon Hayward (ID: 202330)
Successfully processed Andrew Wiggins: 25 fields
Processing Jamal Murray (ID: 1627750)
Successfully processed D'Angelo Russell: 25 fields
Processing Brandon Ingram (ID: 1627742)
Successfully processed Kevin Love: 25 field

2022-23:   0%|          | 0/529 [00:00<?, ?it/s]

Successfully processed Khris Middleton: 25 fields
Processing Ben Simmons (ID: 1627732)
Successfully processed Jimmy Butler Iii: 25 fields
Processing Pascal Siakam (ID: 1627783)
Successfully processed Tobias Harris: 25 fields
Processing Myles Turner (ID: 1626167)
Successfully processed Luka Doncic: 25 fields
Processing Devin Booker (ID: 1626164)
Successfully processed Zach Lavine: 25 fields
Processing Karl-Anthony Towns (ID: 1626157)
Successfully processed Trae Young: 25 fields
Processing Kristaps Porzingis (ID: 204001)
Successfully processed Ben Simmons: 25 fields
Processing Jrue Holiday (ID: 201950)
Successfully processed Devin Booker: 25 fields
Processing Joel Embiid (ID: 203954)
Successfully processed Pascal Siakam: 25 fields
Processing Andrew Wiggins (ID: 203952)
Successfully processed Karl-Anthony Towns: 25 fields
Processing Cj Mccollum (ID: 203468)
Successfully processed Myles Turner: 25 fields
Processing Nikola Jokic (ID: 203999)
Successfully processed Kristaps Porzingis: 25 fie

2023-24:   0%|          | 0/475 [00:00<?, ?it/s]

Successfully processed Anthony Davis: 25 fields
Processing Ben Simmons (ID: 1627732)
Successfully processed Fred Vanvleet: 25 fields
Processing Pascal Siakam (ID: 1627783)
Successfully processed Luka Doncic: 25 fields
Processing Kyrie Irving (ID: 202681)
Successfully processed Trae Young: 25 fields
Processing Jrue Holiday (ID: 201950)
Successfully processed Tobias Harris: 25 fields
Processing Devin Booker (ID: 1626164)
Successfully processed Zach Lavine: 25 fields
Processing Karl-Anthony Towns (ID: 1626157)
Successfully processed Ben Simmons: 25 fields
Processing Kristaps Porzingis (ID: 204001)
Successfully processed Pascal Siakam: 25 fields
Processing Cj Mccollum (ID: 203468)
Successfully processed Kyrie Irving: 25 fields
Processing James Harden (ID: 201935)
Successfully processed Karl-Anthony Towns: 25 fields
Processing Ja Morant (ID: 1629630)
Successfully processed Jrue Holiday: 25 fields
Processing Darius Garland (ID: 1629636)
Successfully processed Devin Booker: 25 fields
Processi

2024-25:   0%|          | 0/491 [00:00<?, ?it/s]

Successfully processed Jimmy Butler Iii: 25 fields
Processing Anthony Davis (ID: 203076)
Successfully processed Damian Lillard: 25 fields
Processing Luka Doncic (ID: 1629029)
Successfully processed Rudy Gobert: 25 fields
Processing Trae Young (ID: 1629027)
Successfully processed Giannis Antetokounmpo: 25 fields
Processing Fred Vanvleet (ID: 1627832)
Successfully processed Zach Lavine: 25 fields
Processing Lauri Markkanen (ID: 1628374)
Successfully processed Lebron James: 25 fields
Processing Anthony Edwards (ID: 1630162)
Successfully processed Anthony Davis: 25 fields
Processing Tyrese Haliburton (ID: 1630169)
Successfully processed Trae Young: 25 fields
Processing Pascal Siakam (ID: 1627783)
Successfully processed Luka Doncic: 25 fields
Processing Kyrie Irving (ID: 202681)
Successfully processed Lauri Markkanen: 25 fields
Processing Domantas Sabonis (ID: 1627734)
Successfully processed Anthony Edwards: 25 fields
Processing Ja Morant (ID: 1629630)
Successfully processed Pascal Siakam: 

# Tests: