## NBA Analytics and Betting Value Analysis Notebook

In [1]:
# -- Cell 01: imports, config, folders -----------------------------------------
import os
import re
import json
import time
import warnings
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import requests
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

# Display & randomness
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 160)
np.random.seed(42)

# Folders
DATA_DIR = "data_raw"
DEBUG_DIR = "_rotowire_debug"
EXPORT_DIR = "exports"

for d in (DATA_DIR, DEBUG_DIR, EXPORT_DIR):
    os.makedirs(d, exist_ok=True)

print("‚úÖ Env ready | DATA_DIR:", DATA_DIR, "| DEBUG_DIR:", DEBUG_DIR, "| EXPORT_DIR:", EXPORT_DIR)

‚úÖ Env ready | DATA_DIR: data_raw | DEBUG_DIR: _rotowire_debug | EXPORT_DIR: exports


In [2]:
# -- Cell 02: odds scraper + wide->long helper ---------------------------------
import re
import json
import requests
import pandas as pd
from datetime import datetime

class NBAOddsScraper:
    def __init__(self):
        self.session = requests.Session()
        self.setup_headers()

    def setup_headers(self):
        self.headers = {
            "accept": "*/*",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/130.0.0.0 Safari/537.36",
            "referer": "https://www.rotowire.com/",
        }

    def get_player_props_odds_wide_raw(self, book: str = "mgm") -> pd.DataFrame:
        """
        Pulls the RotoWire player-props page for a single book and extracts all JSON 'data: [...]' blobs.
        Returns a single wide DataFrame with the raw columns RotoWire emits.
        """
        url = f"https://www.rotowire.com/betting/nba/player-props.php?book={book}"
        try:
            r = self.session.get(url, headers=self.headers, timeout=20)
            r.raise_for_status()
        except Exception as e:
            print(f"‚ùå Failed to GET odds page: {e}")
            return pd.DataFrame()

        matches = re.findall(r"data:\s*(\[\{.*?\}\])", r.text, flags=re.DOTALL)
        frames = []
        for m in matches:
            try:
                rows = json.loads(m)
                if isinstance(rows, list) and rows:
                    frames.append(pd.DataFrame(rows))
            except Exception:
                continue

        if not frames:
            print("‚ö†Ô∏è No odds JSON blocks found.")
            return pd.DataFrame()

        df = pd.concat(frames, ignore_index=True)

        # Normalize a few columns
        base_cols = [c for c in ["name","gameID","playerID","firstName","lastName","team","opp","logo","playerLink"] if c in df.columns]
        other_cols = [c for c in df.columns if c not in base_cols]
        df = df[base_cols + other_cols]

        if "opp" in df.columns and "opponent" not in df.columns:
            df = df.rename(columns={"opp": "opponent"})

        df["asof_date"] = datetime.utcnow().strftime("%Y-%m-%d")
        if "game_date" not in df.columns:
            df["game_date"] = df["asof_date"]
        df["book"] = book

        print(f"‚úÖ Fetched {len(df)} odds rows | {len(df.columns)} columns | book={book}")
        return df

def odds_wide_to_long_from_columns(wide: pd.DataFrame,
                                   books=("mgm","draftkings","fanduel","caesars","betrivers","espnbet","hardrock"),
                                   markets=("PTS","REB","AST")) -> pd.DataFrame:
    """
    Convert the scraped wide odds table into tidy long format:
    columns: player, team, opponent, game_date, market, book, line, over_odds, under_odds
    Works by scanning for patterns like '{book}_{suffix}' where suffix in {'pts','reb','ast'}.
    """
    if wide.empty:
        return pd.DataFrame()

    # base identity cols best-effort
    base_map = {
        "name": "player",
        "team": "team",
        "opponent": "opponent",
        "game_date": "game_date",
    }
    present_keys = [src for src in base_map if src in wide.columns]
    base = wide[present_keys].rename(columns={k: base_map[k] for k in present_keys}).copy()

    rows = []
    suffix_map = {"PTS":"pts","REB":"reb","AST":"ast"}

    for m in markets:
        suf = suffix_map[m]
        for b in books:
            line_col  = f"{b}_{suf}"
            over_col  = f"{b}_{suf}Over"
            under_col = f"{b}_{suf}Under"

            if line_col not in wide.columns:
                continue  # this book-market not present

            # Use get to avoid KeyErrors if over/under missing
            sub = pd.DataFrame({
                "player":   base.get("player", pd.Series([""]*len(wide))),
                "team":     base.get("team", pd.Series([""]*len(wide))),
                "opponent": base.get("opponent", pd.Series([""]*len(wide))),
                "game_date":base.get("game_date", pd.Series([""]*len(wide))),
                "market":   m,
                "book":     b,
                "line":     wide[line_col],
                "over_odds":wide.get(over_col),
                "under_odds":wide.get(under_col),
            })
            rows.append(sub)

    out = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame()
    # numeric cleaning
    def _num_float(x):
        try:
            if pd.isna(x): return np.nan
            s = str(x).strip()
            if s=="" or s.lower()=="none": return np.nan
            return float(re.search(r"[-+]?\d+(?:\.\d+)?", s).group())
        except Exception:
            return np.nan

    def _num_int(x):
        try:
            if pd.isna(x): return np.nan
            s = str(x).strip()
            if s=="" or s.lower()=="none": return np.nan
            return int(re.search(r"[-+]?\d+", s).group())
        except Exception:
            return np.nan

    if not out.empty:
        out["line"] = out["line"].apply(_num_float)
        if "over_odds" in out.columns:
            out["over_odds"] = out["over_odds"].apply(_num_int)
        if "under_odds" in out.columns:
            out["under_odds"] = out["under_odds"].apply(_num_int)
        out = out.dropna(subset=["line"]).reset_index(drop=True)

    return out

# ---- Run scrape + save -------------------------------------------------------
scraper = NBAOddsScraper()
odds_wide_mgm = scraper.get_player_props_odds_wide_raw(book="mgm")
odds_long = odds_wide_to_long_from_columns(odds_wide_mgm)

print("\nüîé Long odds preview:")
print(odds_long.head(12).to_string(index=False))

# Save both forms
stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
wide_path = f"{DATA_DIR}/rotowire_odds_wide_mgm_{stamp}.csv"
long_path = f"{DATA_DIR}/rotowire_odds_long_mgm_{stamp}.csv"
odds_wide_mgm.to_csv(wide_path, index=False)
odds_long.to_csv(long_path, index=False)
print(f"\nüíæ Saved: {wide_path}\nüíæ Saved: {long_path}")

‚úÖ Fetched 1138 odds rows | 264 columns | book=mgm

üîé Long odds preview:
         player team opponent  game_date market book  line  over_odds  under_odds
    Luka Doncic  LAL      UTA 2025-11-18    PTS  mgm  31.5     -115.0      -115.0
   Devin Booker  PHX     @POR 2025-11-18    PTS  mgm  29.5     -120.0      -110.0
   Jaylen Brown  BOS     @BKN 2025-11-18    PTS  mgm  26.5     -110.0      -120.0
Cade Cunningham  DET     @ATL 2025-11-18    PTS  mgm  27.5     -110.0      -120.0
  Stephen Curry  GSW     @ORL 2025-11-18    PTS  mgm  26.5     -120.0      -110.0
Lauri Markkanen  UTA     @LAL 2025-11-18    PTS  mgm  26.5     -115.0      -118.0
   De'Aaron Fox  SAS      MEM 2025-11-18    PTS  mgm  25.5     -115.0      -115.0
    Deni Avdija  POR      PHX 2025-11-18    PTS  mgm  25.5     -105.0      -125.0
 Shaedon Sharpe  POR      PHX 2025-11-18    PTS  mgm  25.5     -105.0      -125.0
   Franz Wagner  ORL      GSW 2025-11-18    PTS  mgm  24.5     -115.0      -115.0
 Michael Porter  BKN 

In [3]:
# pip install selenium webdriver-manager bs4 pandas lxml

import os, re, time, pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# ---------------- helpers ----------------

def _clean_list(xs):
    return [re.sub(r"\s+\(.*?\)\s*$", "", x) for x in xs]

def _try_click_consent(driver, timeout=6):
    XPATHS = [
        "//button[contains(.,'Accept')]",
        "//button[contains(.,'I Agree')]",
        "//button[contains(.,'Agree')]",
        "//button[contains(.,'ŒëœÄŒøŒ¥ŒøœáŒÆ')]",
        "//button[contains(.,'Œ£œÖŒºœÜœâŒΩœé')]",
    ]
    end = time.time() + timeout
    for xp in XPATHS:
        try:
            btn = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.XPATH, xp)))
            btn.click()
            return True
        except Exception:
            if time.time() > end: break
    return False

def _progress_scroll(driver, steps=10, pause=0.8):
    h = driver.execute_script("return document.body.scrollHeight || document.documentElement.scrollHeight;")
    for i in range(1, steps + 1):
        y = int(h * i / steps)
        driver.execute_script(f"window.scrollTo(0, {y});")
        time.sleep(pause)

def _extract_team(side):
    team_el = side.select_one(".lineup__abbr, .lineup__team-name, .lineup__name")
    if team_el:
        return team_el.get_text(strip=True)
    logo = side.select_one("img[alt]")
    return (logo.get("alt") or "").strip() if logo else ""

def _extract_status(side):
    status_el = side.select_one(".lineup__status")
    txt = (status_el.get_text(" ", strip=True) if status_el else "").upper()
    if "CONFIRM" in txt:  return "CONFIRMED"
    if "EXPECT" in txt or "PROBABLE" in txt: return "EXPECTED"
    return "UNKNOWN"

def _extract_starters(side):
    # Try several variants for starters content
    containers = side.select(".lineup__list--starters, .lineup__list, .lineup__players")
    if not containers:
        containers = [side]

    names = []
    for blk in containers:
        for a in blk.select("a.lineup__player-link, .lineup__player a"):
            t = a.get_text(" ", strip=True)
            if t: names.append(t)
        if not names:
            for row in blk.select(".lineup__player"):
                t = row.get_text(" ", strip=True)
                if re.match(r"^(PG|SG|SF|PF|C)\b", t): names.append(t)
        if not names:
            for li in blk.select("li"):
                t = li.get_text(" ", strip=True)
                if re.match(r"^(PG|SG|SF|PF|C)\b", t): names.append(t)

    if not names:
        txt = side.get_text("\n", strip=True)
        names = re.findall(r"(?:^|\n)(?:PG|SG|SF|PF|C)\s+[^\n]+", txt)

    return _clean_list(names)[:5]

# ---------------- main ----------------

def fetch_rotowire_lineups_selenium(date: str | None = None,
                                    wait_sec: float = 14.0,
                                    headless: bool = False) -> pd.DataFrame:
    """
    Render Rotowire lineups & parse BOTH sides per game (global side selectors).
    Returns:
      game_time, team, side (AWAY/HOME), lineup_status, starters,
      starter_1..starter_5, lineup_confirmed (0/1)
    """
    base = "https://www.rotowire.com/basketball/nba-lineups.php"
    url = base if not date else f"{base}?date={date}"

    opts = Options()
    if headless: opts.add_argument("--headless=new")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--window-size=1400,1000")
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    opts.add_experimental_option("useAutomationExtension", False)
    opts.add_argument("--disable-blink-features=AutomationControlled")
    opts.add_argument("--lang=en-US,en;q=0.9")
    opts.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
    driver.get(url)

    _try_click_consent(driver, timeout=6)
    time.sleep(1.2)
    try:
        WebDriverWait(driver, int(wait_sec)).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".lineup, .lineup.is-nba"))
        )
    except Exception:
        pass

    _progress_scroll(driver, steps=10, pause=0.8)
    time.sleep(1.0)

    # quick diagnostics
    blocks = driver.find_elements(By.CSS_SELECTOR, ".lineup.is-nba, .lineup")
    players = driver.find_elements(By.CSS_SELECTOR, ".lineup__player, a.lineup__player-link")
    print(f"diagnostics: lineup blocks={len(blocks)}, player nodes={len(players)}")

    html = driver.page_source
    os.makedirs("_rotowire_debug", exist_ok=True)
    with open("_rotowire_debug/last_lineups.html", "w", encoding="utf-8") as f:
        f.write(html)
    try:
        driver.save_screenshot("_rotowire_debug/last_lineups.png")
    except Exception:
        pass
    driver.quit()

    # -------- parse globally by side classes ----------
    soup = BeautifulSoup(html, "lxml")

    # game time map: find each game container time
    game_time_map = {}
    for gi, g in enumerate(soup.select(".lineup__main, .lineup.is-nba, .lineup")):
        t = g.select_one(".lineup__time, .game-time")
        game_time_map[id(g)] = t.get_text(strip=True) if t else ""

    # Select **visit/away** & **home** side boxes explicitly
    visit_sel = (
        '[class*="lineup__box"][class*="is-visit"], '
        '[class*="lineup__team"][class*="is-visit"], '
        '[class*="lineup__side"][class*="is-visit"], '
        '[class*="visit"]'
    )
    home_sel = (
        '[class*="lineup__box"][class*="is-home"], '
        '[class*="lineup__team"][class*="is-home"], '
        '[class*="lineup__side"][class*="is-home"], '
        '[class*="home"]'
    )

    visit_boxes = soup.select(visit_sel)
    home_boxes  = soup.select(home_sel)

    rows = []

    def add_rows(boxes, side_label):
        for box in boxes:
            # nearest parent game container for time
            parent = box.find_parent(lambda tag: tag.has_attr("class") and any(
                c in {"lineup__main","lineup","lineup is-nba"} for c in tag.get("class", [])
            ))
            game_time = game_time_map.get(id(parent), "") if parent else ""
            team = _extract_team(box)
            starters = _extract_starters(box)
            status = _extract_status(box)
            if starters or team:
                rows.append({
                    "game_time": game_time,
                    "team": team,
                    "side": side_label,
                    "lineup_status": status,
                    "starters": starters,
                    "starter_1": starters[0] if len(starters)>0 else None,
                    "starter_2": starters[1] if len(starters)>1 else None,
                    "starter_3": starters[2] if len(starters)>2 else None,
                    "starter_4": starters[3] if len(starters)>3 else None,
                    "starter_5": starters[4] if len(starters)>4 else None,
                    "lineup_confirmed": int(status == "CONFIRMED"),
                })

    add_rows(visit_boxes, "AWAY")
    add_rows(home_boxes,  "HOME")

    df = pd.DataFrame(rows)

    if not df.empty:
        df = df.drop_duplicates(
            subset=["game_time","team","side","starter_1","starter_2","starter_3","starter_4","starter_5"]
        )
        all_na = df[["starter_1","starter_2","starter_3","starter_4","starter_5"]].isna().all(axis=1)
        df = df[~all_na].reset_index(drop=True)
    else:
        print("‚ö†Ô∏è Parsed zero rows. Check _rotowire_debug/last_lineups.html & .png")

    return df


# ---------- run it ----------
df_lineups = fetch_rotowire_lineups_selenium(wait_sec=14.0, headless=False)
print("‚úÖ Shape:", df_lineups.shape)
print(df_lineups.sort_values(["game_time","side"]).head(12).to_string(index=False))


diagnostics: lineup blocks=9, player nodes=103
‚úÖ Shape: (12, 11)
game_time team side lineup_status                                                                       starters     starter_1           starter_2     starter_3     starter_4     starter_5  lineup_confirmed
               AWAY      EXPECTED             [Stephen Curry, Will Richard, Moses Moody, Jimmy Butler, D. Green] Stephen Curry        Will Richard   Moses Moody  Jimmy Butler      D. Green                 0
               AWAY      EXPECTED              [C. Cunningham, D. Robinson, A. Thompson, T. Harris, Jalen Duren] C. Cunningham         D. Robinson   A. Thompson     T. Harris   Jalen Duren                 0
               AWAY      EXPECTED       [Derrick White, P. Pritchard, Jaylen Brown, Jordan Walsh, Neemias Queta] Derrick White        P. Pritchard  Jaylen Brown  Jordan Walsh Neemias Queta                 0
               AWAY      EXPECTED           [Cam Spencer, Jaylen Wells, Cedric Coward, Jaren Jackson, Zac

In [4]:
# -- Cell 03: parse saved HTML to starters + MNP count --------------------------
import os
import re
import pandas as pd
from bs4 import BeautifulSoup

def _txt(x):
    return re.sub(r"\s+", " ", x.get_text(" ", strip=True)) if x else ""

def _clean_player(n):
    if not n:
        return n
    n = re.sub(r"\s+\(.*?\)\s*$", "", n).strip()
    n = re.sub(r"^(PG|SG|SF|PF|C)\s+", "", n, flags=re.I)
    return n

def _get_mnp_from_ul(ul):
    """Extract 'May Not Play' entries from a team UL."""
    mnp = []
    title = ul.find("li", class_=lambda c: c and "lineup__title" in c and re.search(
        r"may\s+not\s+play", _txt(ul.find("li", class_=c)) if ul.find("li", class_=c) else "", re.I
    ))
    if title:
        for li in title.find_all_next("li"):
            if "lineup__title" in (li.get("class") or []):
                break
            if "lineup__player" in (li.get("class") or []):
                a = li.select_one("a")
                tag = li.select_one(".lineup__inj")
                nm = _txt(a) if a else ""
                if nm:
                    mnp.append(f"{nm} ({_txt(tag)})" if tag else nm)
        return [_clean_player(x) for x in mnp if x and x.lower() != "none"]

    for li in ul.select(".lineup__notplay li, .lineup__status--out, .lineup__inj-list li"):
        nm = _txt(li)
        if nm:
            mnp.append(_clean_player(nm))
    return [x for x in mnp if x and x.lower() != "none"]

def _extract_starters_from_ul(ul):
    names = []
    for li in ul.select("li.lineup__player.is-pct-play-100 a"):
        nm = _txt(li)
        if nm:
            names.append(nm)
    if len(names) < 5:
        for li in ul.select("li.lineup__player a"):
            nm = _txt(li)
            if nm:
                names.append(nm)
            if len(names) >= 5:
                break
    names = [_clean_player(n) for n in names]
    return names[:5]

def _lineup_status(ul):
    st = _txt(ul.select_one(".lineup__status"))
    stU = st.upper()
    if "CONFIRM" in stU: return "CONFIRMED"
    if "EXPECT" in stU or "PROBABLE" in stU: return "EXPECTED"
    return "UNKNOWN"

def parse_rotowire_lineups_flexible(html_path: str) -> pd.DataFrame:
    with open(html_path, "r", encoding="utf-8", errors="ignore") as f:
        html = f.read()
    soup = BeautifulSoup(html, "lxml")

    diag = {
        "lineup__teams": len(soup.select("div.lineup__teams")),
        "ul.lineup__list": len(soup.select("ul.lineup__list")),
        "ul.is-visit": len(soup.select("ul.lineup__list.is-visit")),
        "ul.is-home": len(soup.select("ul.lineup__list.is-home")),
        "see-proj-minutes buttons": len(soup.select("button.see-proj-minutes")),
        "header abbr": len(soup.select(".lineup__hdr .lineup__abbr")),
        "header team": len(soup.select(".lineup__hdr .lineup__team")),
        "player anchors": len(soup.select("a.lineup__player-link, .lineup__player a")),
        "MNP titles": len(soup.find_all(string=re.compile(r"^\s*may\s+not\s+play\s*$", re.I))),
    }
    print("DOM diagnostics:", diag)

    rows = []

    # Strategy A: by matchup blocks
    for teams_div in soup.select("div.lineup__teams"):
        time_el = teams_div.find_previous("div", class_="lineup__time")
        game_time = _txt(time_el)

        uls = teams_div.select("ul.lineup__list")
        if len(uls) < 1:
            continue

        away_ul = None
        home_ul = None
        for ul in uls:
            classes = " ".join(ul.get("class", [])).lower()
            if "is-visit" in classes or "visit" in classes or "away" in classes:
                away_ul = ul
            if "is-home" in classes or "home" in classes:
                home_ul = home_ul or ul

        if away_ul is None and home_ul is None and len(uls) >= 2:
            away_ul, home_ul = uls[0], uls[1]
        elif away_ul is None and len(uls) >= 1:
            away_ul = uls[0]
        elif home_ul is None and len(uls) >= 2:
            home_ul = next((u for u in uls if u is not away_ul), None)

        header_abbrs = [_txt(el) for el in teams_div.select(".lineup__abbr") if _txt(el)]
        if not header_abbrs:
            parent_main = teams_div.find_parent(["div","section"])
            if parent_main:
                header_abbrs = [_txt(el) for el in parent_main.select(".lineup__abbr") if _txt(el)]

        for idx, (side, ul) in enumerate([("AWAY", away_ul), ("HOME", home_ul)]):
            if not ul:
                continue
            btn = ul.select_one("button.see-proj-minutes")
            team = btn["data-team"].strip().upper() if btn and btn.has_attr("data-team") else None
            if not team and header_abbrs and idx < len(header_abbrs):
                team = header_abbrs[idx].upper()

            starters = _extract_starters_from_ul(ul)
            mnp = _get_mnp_from_ul(ul)
            status = _lineup_status(ul)

            if team or starters or mnp:
                rows.append({
                    "game_time": game_time,
                    "team": team,
                    "side": side,
                    "lineup_status": status,
                    "starters": starters,
                    "may_not_play": mnp,
                    "may_not_play_count": len(mnp),
                    "lineup_confirmed": int(status == "CONFIRMED"),
                })

    # Strategy B: global scan if A found nothing
    if not rows:
        print("Fallback B: scanning all ul.lineup__list globally...")
        for ul in soup.select("ul.lineup__list"):
            side = "AWAY" if "is-visit" in (ul.get("class") or []) else ("HOME" if "is-home" in (ul.get("class") or []) else None)
            btn = ul.select_one("button.see-proj-minutes")
            team = btn["data-team"].strip().upper() if btn and btn.has_attr("data-team") else None
            starters = _extract_starters_from_ul(ul)
            mnp = _get_mnp_from_ul(ul)
            status = _lineup_status(ul)

            if side and (team or starters or mnp):
                rows.append({
                    "game_time": "",
                    "team": team,
                    "side": side,
                    "lineup_status": status,
                    "starters": starters,
                    "may_not_play": mnp,
                    "may_not_play_count": len(mnp),
                    "lineup_confirmed": int(status == "CONFIRMED"),
                })

    df = pd.DataFrame(rows)
    for i in range(5):
        col = f"starter_{i+1}"
        if "starters" in df.columns:
            df[col] = df["starters"].apply(lambda xs: xs[i] if isinstance(xs, list) and len(xs) > i else None)

    print(f"‚Üí Parsed rows: {len(df)}")

    # Save a copy for downstream
    stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
    out_path = f"{DATA_DIR}/lineups_parsed_{stamp}.csv"
    df.to_csv(out_path, index=False)
    print(f"üíæ Saved parsed lineups to {out_path}")

    return df

# ---- RUN IT (point to the saved HTML) ----
HTML_PATH = f"{DEBUG_DIR}/last_lineups.html"
if not os.path.exists(HTML_PATH) and os.path.exists("last_lineups.html"):
    HTML_PATH = "last_lineups.html"

df_lineups = parse_rotowire_lineups_flexible(HTML_PATH)

if df_lineups.empty:
    print("\n‚ö†Ô∏è Still empty. Check DOM diagnostics and ensure Cell 3 ran successfully.")
else:
    cols = ["game_time","team","side","lineup_status","may_not_play_count",
            "starter_1","starter_2","starter_3","starter_4","starter_5"]
    print("\n‚úÖ Preview:")
    print(df_lineups[cols].sort_values(["game_time","side","team"], na_position="last").to_string(index=False))

DOM diagnostics: {'lineup__teams': 6, 'ul.lineup__list': 12, 'ul.is-visit': 6, 'ul.is-home': 6, 'see-proj-minutes buttons': 12, 'header abbr': 0, 'header team': 0, 'player anchors': 103, 'MNP titles': 12}
Fallback B: scanning all ul.lineup__list globally...
‚Üí Parsed rows: 12
üíæ Saved parsed lineups to data_raw/lineups_parsed_20251118_183337.csv

‚úÖ Preview:
game_time team side lineup_status  may_not_play_count     starter_1           starter_2     starter_3     starter_4           starter_5
           BOS AWAY      EXPECTED                   6 Derrick White        P. Pritchard  Jaylen Brown  Jordan Walsh       Neemias Queta
           DET AWAY      EXPECTED                  11   D. Robinson         Jalen Duren C. Cunningham   D. Robinson         A. Thompson
           GSW AWAY      EXPECTED                   8 Stephen Curry        Will Richard   Moses Moody  Jimmy Butler            D. Green
           MEM AWAY      EXPECTED                  10   Cam Spencer        Jaylen Wells Ced

In [5]:
# -- Cell 04: Parse "May Not Play" (MNP) from saved HTML -----------------------
# pip install bs4 lxml pandas
import os, re, pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

# Folders (fallbacks, in case Cell 1 wasn't run)
DATA_DIR = "data_raw"; DEBUG_DIR = "_rotowire_debug"
os.makedirs(DATA_DIR, exist_ok=True); os.makedirs(DEBUG_DIR, exist_ok=True)

HTML_PATH = f"{DEBUG_DIR}/last_lineups.html" if os.path.exists(f"{DEBUG_DIR}/last_lineups.html") else "last_lineups.html"

LIKELIHOOD_MAP = {
    "is-pct-play-100": 100, "is-pct-play-90": 90, "is-pct-play-75": 75,
    "is-pct-play-60": 60, "is-pct-play-50": 50, "is-pct-play-40": 40,
    "is-pct-play-25": 25, "is-pct-play-10": 10, "is-pct-play-0": 0
}

def _txt(node): 
    return re.sub(r"\s+", " ", node.get_text(" ", strip=True)) if node else ""

def _likelihood_from_classes(classes):
    for c in classes or []:
        if c in LIKELIHOOD_MAP:
            return LIKELIHOOD_MAP[c]
    return None

def _clean_player(n):
    if not n: return n
    n = re.sub(r"\s+\(.*?\)\s*$", "", n).strip()
    n = re.sub(r"^(PG|SG|SF|PF|C)\s+", "", n, flags=re.I)
    return n

def parse_rotowire_mnp_final(html_path: str) -> pd.DataFrame:
    with open(html_path, "r", encoding="utf-8", errors="ignore") as f:
        soup = BeautifulSoup(f.read(), "lxml")

    rows = []

    # Primary structure
    games = soup.select("div.lineup.is-nba[data-lnum]")
    print(f"Found {len(games)} games in HTML.")

    for game in games:
        game_time = _txt(game.select_one(".lineup__time"))
        # Pair teams by .lineup__team, then iterate their ULs
        team_blocks = game.select(".lineup__team")
        teams = []
        for tb in team_blocks:
            abbr = _txt(tb.select_one(".lineup__abbr")) or _txt(tb.select_one(".lineup__team-name"))
            side = "AWAY" if "is-visit" in (tb.get("class") or []) else ("HOME" if "is-home" in (tb.get("class") or []) else None)
            teams.append((abbr, side))

        ul_lists = game.select("ul.lineup__list")
        for idx, ul in enumerate(ul_lists):
            team, side = (teams[idx] if idx < len(teams) else (None, None))
            # Find the MNP title in this UL
            mnp_title = ul.find("li", class_="lineup__title", string=lambda s: s and "MAY NOT PLAY" in s.upper())
            if not mnp_title:
                continue

            for li in mnp_title.find_next_siblings("li"):
                classes = li.get("class") or []
                if "lineup__title" in classes:
                    break
                if "lineup__player" not in classes:
                    continue

                pos = _txt(li.select_one(".lineup__pos"))
                a = li.select_one("a")
                player = _clean_player(_txt(a))
                if not player:
                    continue

                status = _txt(li.select_one(".lineup__inj"))
                title_text = (li.get("title") or "").strip()
                likelihood_pct = _likelihood_from_classes(classes)

                rows.append({
                    "game_time": game_time,
                    "team": team,
                    "side": side,
                    "position": pos,
                    "player": player,
                    "status": status,
                    "title_text": title_text,
                    "likelihood_pct": likelihood_pct
                })

    # Fallback: global scan (if nothing found in primary structure)
    if not rows:
        print("Fallback: global MNP scan‚Ä¶")
        for ul in soup.select("ul.lineup__list"):
            title = ul.find("li", class_="lineup__title", string=lambda s: s and "MAY NOT PLAY" in s.upper())
            if not title:
                continue
            for li in title.find_next_siblings("li"):
                classes = li.get("class") or []
                if "lineup__title" in classes:
                    break
                if "lineup__player" not in classes:
                    continue
                player = _clean_player(_txt(li.select_one("a")))
                if not player:
                    continue
                rows.append({
                    "game_time": "",
                    "team": None,
                    "side": None,
                    "position": _txt(li.select_one(".lineup__pos")),
                    "player": player,
                    "status": _txt(li.select_one(".lineup__inj")),
                    "title_text": (li.get("title") or "").strip(),
                    "likelihood_pct": _likelihood_from_classes(classes)
                })

    df = pd.DataFrame(rows)
    if df.empty:
        print("‚ö†Ô∏è No 'May Not Play' players found. Check if Rotowire changed markup or re-run Cell 3.")
        return df

    df = df.sort_values(["game_time","side","team","player"], na_position="last").reset_index(drop=True)
    print(f"‚úÖ Parsed {len(df)} 'May Not Play' players across {df['team'].nunique(dropna=True)} teams.")
    return df

# ---- RUN ----
mnp_df = parse_rotowire_mnp_final(HTML_PATH)
if not mnp_df.empty:
    print(mnp_df.head(30).to_string(index=False))
    stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
    out_csv = f"{DATA_DIR}/may_not_play_players_{stamp}.csv"
    mnp_df.to_csv(out_csv, index=False)
    print(f"\nüíæ Saved: {out_csv}")

Found 6 games in HTML.
‚úÖ Parsed 43 'May Not Play' players across 12 teams.
  game_time team side position        player status            title_text  likelihood_pct
10:30 PM ET  UTA AWAY        F      G. Niang    Out Very Unlikely To Play               0
10:30 PM ET  UTA AWAY        F   K. Anderson   Ques       Toss Up To Play              50
10:30 PM ET  UTA AWAY        C K. Filipowski   Ques       Toss Up To Play              50
10:30 PM ET  UTA AWAY        F  T. Hendricks   Ques       Toss Up To Play              50
10:30 PM ET  UTA AWAY        C    W. Kessler    OFS   Very Likely To Play               0
10:30 PM ET  LAL HOME        G    G. Vincent   Ques       Toss Up To Play              50
10:30 PM ET  LAL HOME        F      L. James   Ques       Toss Up To Play              50
11:00 PM ET  PHX AWAY        G      G. Allen    Out Very Unlikely To Play               0
11:00 PM ET  PHX AWAY        G      J. Green    Out Very Unlikely To Play               0
11:00 PM ET  POR HOME  

In [6]:
# -- Cell 05: NBA betting analysis functions -------------------------------------
def get_daily_matchups(date=None):
    """Get NBA games for a specific date"""
    if date is None:
        date = datetime.now().strftime('%Y-%m-%d')
    # Placeholder demo; replace with a real schedule API if desired
    sample_matchups = [
        {'home_team': 'GSW', 'away_team': 'LAL', 'time': '7:30 PM ET'},
        {'home_team': 'BOS', 'away_team': 'MIA', 'time': '8:00 PM ET'},
        {'home_team': 'DEN', 'away_team': 'DAL', 'time': '9:00 PM ET'},
    ]
    return sample_matchups

def calculate_player_correlations(player_a_logs, player_b_logs):
    """Calculate correlation between two players' performances"""
    merged = pd.merge(player_a_logs, player_b_logs, on='GAME_DATE', suffixes=('_a', '_b'))
    correlations = {}
    for stat in ['PTS', 'REB', 'AST']:
        if f'{stat}_a' in merged.columns and f'{stat}_b' in merged.columns:
            corr = merged[f'{stat}_a'].corr(merged[f'{stat}_b'])
            correlations[stat] = corr
    return correlations

# Export results to Excel
def export_analysis(results, filename='nba_betting_analysis.xlsx'):
    """Export analysis results to Excel"""
    with pd.ExcelWriter(filename, engine='openpyxl') as writer:
        if 'value_bets' in results:
            pd.DataFrame(results['value_bets']).to_excel(writer, sheet_name='Value_Bets', index=False)
        if 'predictions' in results:
            predictions_df = pd.DataFrame.from_dict(results['predictions'], orient='index')
            predictions_df.to_excel(writer, sheet_name='Player_Predictions')
    print(f"Analysis exported to {filename}")

In [7]:
# -- Cell 06: starter flags (time-aware) + injury flags ------------------------
from datetime import datetime, timedelta
import pytz
import numpy as np
import pandas as pd
import os

# Folders (fallbacks, in case Cell 1 wasn't run)
DATA_DIR = "data_raw"; os.makedirs(DATA_DIR, exist_ok=True)

def compute_time_based_prob(game_time_str: str, lineup_status: str) -> float:
    """Rough start probability based on status and hours to tip (ET)."""
    try:
        if not game_time_str:
            return 0.7
        game_time_clean = game_time_str.replace("ET", "").strip()
        base_dt = datetime.strptime(game_time_clean, "%I:%M %p")
        now_et = datetime.now(pytz.timezone("US/Eastern"))
        game_dt = now_et.replace(hour=base_dt.hour, minute=base_dt.minute, second=0, microsecond=0)
        hours_to_tip = (game_dt - now_et).total_seconds() / 3600.0
        if hours_to_tip < -3:
            game_dt += timedelta(days=1)
            hours_to_tip = (game_dt - now_et).total_seconds() / 3600.0
    except Exception:
        hours_to_tip = 6.0

    st = (lineup_status or "").upper()
    if "CONFIRM" in st:
        return 1.0
    if "EXPECT" in st or "PROBABLE" in st:
        if hours_to_tip > 6: return 0.70
        if hours_to_tip > 2: return 0.85
        return 0.95
    # unknown
    return 0.60 if hours_to_tip > 4 else 0.80

def build_starter_flags_timeaware(df_lineups: pd.DataFrame, mnp_df: pd.DataFrame) -> pd.DataFrame:
    """From df_lineups (with 'starters' list per row), emit per-player start_prob."""
    mnp_players = set(mnp_df["player"].str.strip()) if (isinstance(mnp_df, pd.DataFrame) and not mnp_df.empty) else set()
    rows = []

    if df_lineups is None or df_lineups.empty or "starters" not in df_lineups.columns:
        return pd.DataFrame(columns=["player","team","is_starter","start_prob"])

    for _, row in df_lineups.iterrows():
        team = row.get("team", None)
        lineup_status = row.get("lineup_status", "")
        game_time = row.get("game_time", "")
        starters = row.get("starters", [])
        starters = starters if isinstance(starters, list) else []

        for p in starters:
            p_clean = (p or "").strip()
            if not p_clean:
                continue
            prob = compute_time_based_prob(game_time, lineup_status)
            if p_clean in mnp_players:
                prob *= 0.6  # penalize if on MNP
            rows.append({
                "player": p_clean,
                "team": team,
                "is_starter": 1,
                "start_prob": round(float(np.clip(prob, 0.0, 1.0)), 2),
            })

    df_out = pd.DataFrame(rows).drop_duplicates(subset=["player"])
    print(f"‚úÖ Created {len(df_out)} starter probability rows.")
    return df_out

def build_injury_flags(mnp_df: pd.DataFrame) -> pd.DataFrame:
    """Return DataFrame with may_not_play flag and injury_prob derived from status."""
    if mnp_df is None or mnp_df.empty:
        return pd.DataFrame(columns=["player","may_not_play","injury_prob"])

    def map_status_to_prob(status: str):
        """Map injury status string to (may_not_play, injury_prob)."""
        status = str(status).lower().strip()
        if "out" in status:
            return 1, 1.0
        elif "ques" in status or "doubt" in status:
            return 1, 0.5
        elif "prob" in status:
            return 0, 0.25
        return 0, 0.0

    mapped = mnp_df.dropna(subset=["player"]).copy()
    mapped["player"] = mapped["player"].str.strip()

    if "status" in mapped.columns:
        mapped[["may_not_play", "injury_prob"]] = mapped["status"].apply(
            lambda s: pd.Series(map_status_to_prob(s))
        )
    else:
        mapped["injury_prob"] = mapped["likelihood_pct"].fillna(40) / 100.0
        mapped["may_not_play"] = 1

    return mapped[["player", "may_not_play", "injury_prob"]].drop_duplicates(subset=["player"])

# ---- RUN (expects df_lineups from Cell 3 and mnp_df from Cell 5) ------------
starter_flags_df = build_starter_flags_timeaware(df_lineups, mnp_df)
injury_flags_df = build_injury_flags(mnp_df)

print("‚úÖ Starter flags sample:")
print(starter_flags_df.head(10).to_string(index=False))

print("\n‚úÖ Injury flags sample:")
print(injury_flags_df.head(10).to_string(index=False))

# Save outputs
stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
sf_path  = f"{DATA_DIR}/starter_flags_{stamp}.csv"
inj_path = f"{DATA_DIR}/injury_flags_{stamp}.csv"
starter_flags_df.to_csv(sf_path, index=False)
injury_flags_df.to_csv(inj_path, index=False)
print(f"\nüíæ Saved starter flags ‚Üí {sf_path}\nüíæ Saved injury flags ‚Üí {inj_path}")

‚úÖ Created 57 starter probability rows.
‚úÖ Starter flags sample:
       player team  is_starter  start_prob
Stephen Curry  GSW           1        0.70
 Will Richard  GSW           1        0.70
  Moses Moody  GSW           1        0.70
 Jimmy Butler  GSW           1        0.70
     D. Green  GSW           1        0.70
 Desmond Bane  ORL           1        0.70
 Franz Wagner  ORL           1        0.70
  T. da Silva  ORL           1        0.70
    W. Carter  ORL           1        0.70
     J. Suggs  ORL           1        0.42

‚úÖ Injury flags sample:
       player  may_not_play  injury_prob
     G. Niang           1.0          1.0
  K. Anderson           1.0          0.5
K. Filipowski           1.0          0.5
 T. Hendricks           1.0          0.5
   W. Kessler           0.0          0.0
   G. Vincent           1.0          0.5
     L. James           1.0          0.5
     G. Allen           1.0          1.0
     J. Green           1.0          1.0
    B. Wesley           

## NBA

In [8]:
# -- Cell 07: Download per-season player stats from NBA Stats API --------------
# (Be polite: retries + small random delays)
import os, time, random, requests, pandas as pd
from datetime import datetime

# Folders (fallbacks)
DATA_DIR = "data_raw"; os.makedirs(DATA_DIR, exist_ok=True)

URL = "https://stats.nba.com/stats/leaguedashplayerstats"

BASE_PARAMS = {
    "College": "", "Conference": "", "Country": "", "DateFrom": "", "DateTo": "",
    "Division": "", "DraftPick": "", "DraftYear": "", "GameScope": "", "GameSegment": "",
    "Height": "", "ISTRound": "", "LastNGames": "0", "LeagueID": "00", "Location": "",
    "MeasureType": "Base", "Month": "0", "OpponentTeamID": "0", "Outcome": "",
    "PORound": "0", "PaceAdjust": "N", "PerMode": "PerGame", "Period": "0",
    "PlayerExperience": "", "PlayerPosition": "", "PlusMinus": "N", "Rank": "N",
    "SeasonSegment": "", "SeasonType": "Regular Season", "ShotClockRange": "",
    "StarterBench": "", "TeamID": "0", "VsConference": "", "VsDivision": "", "Weight": ""
}

HEADERS = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "en-US,en;q=0.9",
    "Origin": "https://www.nba.com",
    "Referer": "https://www.nba.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36",
    "x-nba-stats-origin": "stats",
    "x-nba-stats-token": "true"
}

SEASONS = ["2023-24", "2024-25", "2025-26"]

def fetch_season(season: str, retries: int = 3) -> pd.DataFrame:
    params = BASE_PARAMS.copy()
    params["Season"] = season
    for attempt in range(1, retries + 1):
        try:
            print(f"‚Üí Attempt {attempt} fetching {season}‚Ä¶")
            r = requests.get(URL, headers=HEADERS, params=params, timeout=30)
            r.raise_for_status()
            js = r.json()
            rs = js["resultSets"][0]
            df = pd.DataFrame(rs["rowSet"], columns=rs["headers"])
            return df
        except requests.exceptions.Timeout:
            print(f"‚ö†Ô∏è Timeout {attempt}/{retries}; retrying‚Ä¶")
            time.sleep(2 * attempt)
        except requests.exceptions.RequestException as e:
            print(f"‚ùå Error {attempt}/{retries}: {e}"); time.sleep(2 * attempt)
    raise RuntimeError(f"Failed to fetch {season} after {retries} attempts.")

all_paths = []
for season in SEASONS:
    print(f"\nüèÄ Fetching NBA stats for {season}‚Ä¶")
    df = fetch_season(season)
    path = f"{DATA_DIR}/nba_player_stats_{season.replace('-','_')}.csv"
    df.to_csv(path, index=False)
    print(f"‚úÖ {season}: saved {len(df)} rows ‚Üí {path}")
    all_paths.append(path)
    time.sleep(random.uniform(3, 6))  # throttle politely

print("\nüéâ Done! Saved:", all_paths)


üèÄ Fetching NBA stats for 2023-24‚Ä¶
‚Üí Attempt 1 fetching 2023-24‚Ä¶
‚úÖ 2023-24: saved 572 rows ‚Üí data_raw/nba_player_stats_2023_24.csv

üèÄ Fetching NBA stats for 2024-25‚Ä¶
‚Üí Attempt 1 fetching 2024-25‚Ä¶
‚úÖ 2024-25: saved 569 rows ‚Üí data_raw/nba_player_stats_2024_25.csv

üèÄ Fetching NBA stats for 2025-26‚Ä¶
‚Üí Attempt 1 fetching 2025-26‚Ä¶
‚úÖ 2025-26: saved 468 rows ‚Üí data_raw/nba_player_stats_2025_26.csv

üéâ Done! Saved: ['data_raw/nba_player_stats_2023_24.csv', 'data_raw/nba_player_stats_2024_25.csv', 'data_raw/nba_player_stats_2025_26.csv']


## GAME LOGS

In [9]:
# -- Cell 08: Download player game logs (box scores) by season ------------------
import os, time, requests, pandas as pd
from datetime import datetime

# Folders (fallbacks)
DATA_DIR = "data_raw"; os.makedirs(DATA_DIR, exist_ok=True)

def get_box_scores(season: str, season_type: str = "Regular Season") -> pd.DataFrame:
    url = "https://stats.nba.com/stats/leaguegamelog"
    params = {
        "Counter": 1000, "DateFrom": "", "DateTo": "", "Direction": "DESC",
        "ISTRound": "", "LeagueID": "00", "PlayerOrTeam": "P",
        "Season": season, "SeasonType": season_type, "Sorter": "DATE"
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36",
        "Referer": "https://www.nba.com/", "Origin": "https://www.nba.com",
        "Accept": "application/json, text/plain, */*"
    }
    r = requests.get(url, params=params, headers=headers, timeout=30)
    r.raise_for_status()
    data = r.json()["resultSets"][0]
    df = pd.DataFrame(data["rowSet"], columns=data["headers"])
    return df

SEASONS = ["2023-24","2024-25","2025-26"]
# already saved once season: ["2023-24", "2024-25"]
saved = []
for season in SEASONS:
    print(f"Fetching {season}‚Ä¶")
    df = get_box_scores(season)
    path = f"{DATA_DIR}/nba_boxscores_{season}.csv"
    df.to_csv(path, index=False)
    print(f"‚úÖ Saved {len(df)} records ‚Üí {path}")
    saved.append(path)
    time.sleep(2)  # polite delay

print("\nüéâ Box score downloads complete.")

Fetching 2023-24‚Ä¶
‚úÖ Saved 26401 records ‚Üí data_raw/nba_boxscores_2023-24.csv
Fetching 2024-25‚Ä¶
‚úÖ Saved 26306 records ‚Üí data_raw/nba_boxscores_2024-25.csv
Fetching 2025-26‚Ä¶
‚úÖ Saved 4605 records ‚Üí data_raw/nba_boxscores_2025-26.csv

üéâ Box score downloads complete.


In [10]:
# -- Cell 09: Fetch BBRef Advanced tables, align, and enrich season CSVs -------
import os, io, unicodedata, requests, pandas as pd
from datetime import datetime

# Folders (work even if Cell 1 didn't run)
DATA_DIR   = "data_raw"
ENRICH_DIR = "data_enriched"
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(ENRICH_DIR, exist_ok=True)

ADV_COLS_KEEP = [
    "Player","Pos","Age","Tm","G","MP",
    "PER","TS%","3PAr","FTr",
    "ORB%","DRB%","TRB%",
    "AST%","STL%","BLK%",
    "TOV%","USG%",
    "ORtg","DRtg",
    "OWS","DWS","WS","WS/48",
    "OBPM","DBPM","BPM","VORP"
]

TEAM_ABBR_MAP = {
    "BRK": "BKN",
    "PHO": "PHX",
    "CHO": "CHA",
    "UTH": "UTA",
    "NJN": "BKN",
    "SEA": "OKC",
    "VAN": "MEM",
}

def normalize_name(s: str):
    if pd.isna(s): return s
    s = str(s).strip().lower()
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    for ch in [".","'","`","‚Äô","‚Äú","‚Äù",","]:
        s = s.replace(ch, "")
    return " ".join(s.split())

def fetch_advanced_table(season_end_year: int, retries: int = 3) -> pd.DataFrame:
    """
    season_end_year=2025 -> https://www.basketball-reference.com/leagues/NBA_2025_advanced.html
    """
    url = f"https://www.basketball-reference.com/leagues/NBA_{season_end_year}_advanced.html"
    headers = {
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/120.0.0.0 Safari/537.36")
    }
    last_err = None
    for attempt in range(1, retries+1):
        try:
            r = requests.get(url, headers=headers, timeout=30)
            r.raise_for_status()
            tables = pd.read_html(io.StringIO(r.text), header=0)
            if not tables:
                raise RuntimeError("No tables parsed from page.")
            df = tables[0].copy()
            # Drop duplicate header rows
            if "Rk" in df.columns:
                df = df[df["Rk"] != "Rk"].copy()
                df.drop(columns=["Rk"], inplace=True, errors="ignore")
            df.columns = [c.strip() for c in df.columns]

            # Ensure team column name
            team_col = None
            for c in df.columns:
                if c.lower() in ("tm","team","team_name"):
                    team_col = c; break
            if not team_col:
                raise KeyError(f"Team column not found. Columns: {df.columns.tolist()}")
            df.rename(columns={team_col: "Tm"}, inplace=True)

            keep = [c for c in ADV_COLS_KEEP if c in df.columns]
            df = df[keep].copy()

            for c in df.columns:
                if c not in {"Player","Pos","Tm"}:
                    df[c] = pd.to_numeric(df[c], errors="coerce")

            df["Tm"] = df["Tm"].replace(TEAM_ABBR_MAP)
            df["player_key"] = df["Player"].map(normalize_name)
            df["team_key"] = df["Tm"].astype(str).str.strip().str.upper()
            return df
        except Exception as e:
            last_err = e
    raise last_err

def load_averages_csv(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    # map columns to canonical 'Player' and 'Team'
    col_map = {}
    for c in df.columns:
        cl = c.strip().lower()
        if cl == "player_name": col_map[c] = "Player"
        elif cl in ("team_abbreviation","tm","team"): col_map[c] = "Team"
    df = df.rename(columns=col_map)
    if "Player" not in df.columns or "Team" not in df.columns:
        raise ValueError(f"'Player' and 'Team' required. Got: {list(df.columns)}")
    df["player_key"] = df["Player"].map(normalize_name)
    df["team_key"]   = df["Team"].astype(str).str.strip().str.upper()
    return df

def merge_advanced_into_averages(df_avg: pd.DataFrame, df_adv: pd.DataFrame) -> pd.DataFrame:
    adv_team = df_adv[df_adv["Tm"] != "TOT"].copy()
    adv_tot  = df_adv[df_adv["Tm"] == "TOT"].copy()

    adv_cols = [c for c in df_adv.columns if c not in {"Player","Pos","Age","Tm","player_key","team_key"}]
    meta_cols = [c for c in ["Pos","Age"] if c in df_adv.columns]
    add_cols = meta_cols + adv_cols

    merged = df_avg.merge(
        adv_team[["player_key","team_key"] + add_cols],
        on=["player_key","team_key"], how="left"
    )

    # Fill gaps from TOT by player
    probe = "PER" if "PER" in merged.columns else ("WS/48" if "WS/48" in merged.columns else None)
    missing = merged[probe].isna() if probe else merged.isna().any(axis=1)
    if missing.any() and not adv_tot.empty:
        fb = merged.loc[missing, ["player_key"]].merge(
            adv_tot[["player_key"] + add_cols], on="player_key", how="left"
        )
        for col in add_cols:
            if col in merged.columns and col in fb.columns:
                merged.loc[missing, col] = merged.loc[missing, col].fillna(fb[col])
    return merged

# --- Build both seasons and save into ENRICH_DIR ------------------------------
# Map: "2023-24" -> 2024, "2024-25" -> 2025
pairs = [
    ("2023_24", 2024),
    ("2024_25", 2025),
    ("2025_26", 2026),
]
out_paths = []
for tag, yr in pairs:
    avg_path = os.path.join(DATA_DIR, f"nba_player_stats_{tag}.csv")
    if not os.path.exists(avg_path):
        raise FileNotFoundError(f"Missing averages CSV: {avg_path}. Run Cell 8 first.")
    df_avg = load_averages_csv(avg_path)
    df_adv = fetch_advanced_table(yr)
    df_enriched = merge_advanced_into_averages(df_avg, df_adv)

    outp = os.path.join(ENRICH_DIR, f"nba_player_stats_{tag}_enriched.csv")
    df_enriched.to_csv(outp, index=False)
    print(f"‚úÖ Saved: {outp}")
    out_paths.append(outp)

# Optional combined
combined = pd.concat([pd.read_csv(p) for p in out_paths], ignore_index=True)
combined.to_csv(os.path.join(ENRICH_DIR, "nba_player_stats_2023_25_combined.csv"), index=False)
print("üèÄ Combined ‚Üí", os.path.join(ENRICH_DIR, "nba_player_stats_2023_25_combined.csv"))

‚úÖ Saved: data_enriched\nba_player_stats_2023_24_enriched.csv
‚úÖ Saved: data_enriched\nba_player_stats_2024_25_enriched.csv
‚úÖ Saved: data_enriched\nba_player_stats_2025_26_enriched.csv
üèÄ Combined ‚Üí data_enriched\nba_player_stats_2023_25_combined.csv


In [11]:
# -- Cell 10: Game-log ‚Üí features (rolling, team ratings, opponent allowances) -
import numpy as np
import pandas as pd

def standardize_logs_cols(df_logs: pd.DataFrame) -> pd.DataFrame:
    colmap = {}
    for c in df_logs.columns:
        cl = c.strip().lower()
        if cl in {"game_date","game_date_est","date"}: colmap[c] = "GAME_DATE"
        elif cl in {"player","player_name"}: colmap[c] = "PLAYER_NAME"
        elif cl in {"team","team_abbreviation","tm"}: colmap[c] = "TEAM_ABBREVIATION"
        elif cl in {"opp","opponent","opponent_abbreviation"}: colmap[c] = "OPPONENT_ABBREVIATION"
        elif cl in {"min","minutes"}: colmap[c] = "MIN"
    df = df_logs.rename(columns=colmap).copy()
    df["GAME_DATE"] = pd.to_datetime(df["GAME_DATE"])
    df = df.sort_values(["PLAYER_NAME","GAME_DATE"])
    return df

def add_shooting_efficiency(df: pd.DataFrame) -> pd.DataFrame:
    for col in ["FGA","FTA","PTS"]:
        if col not in df.columns: df[col] = 0.0
    denom = 2 * (df["FGA"].astype(float) + 0.44 * df["FTA"].astype(float))
    df["TS_game"] = np.where(denom > 0, df["PTS"].astype(float)/denom, np.nan)
    return df

def rolling_player_form(df: pd.DataFrame, windows=(3,5,10,20)) -> pd.DataFrame:
    df = df.sort_values(["PLAYER_NAME","GAME_DATE"]).copy()
    g = df.groupby("PLAYER_NAME", group_keys=False)
    for w in windows:
        for stat in ["PTS","REB","AST","MIN","TS_game"]:
            if stat not in df.columns: df[stat] = np.nan
            df[f"{stat}_roll{w}"] = g[stat].shift(1).rolling(w, min_periods=1).mean()
    if {"FGA","TEAM_ABBREVIATION"}.issubset(df.columns):
        df["teamFGA_game"] = df.groupby(["TEAM_ABBREVIATION","GAME_DATE"])["FGA"].transform("sum")
        df["usage_share"] = np.where(df["teamFGA_game"]>0, df["FGA"]/df["teamFGA_game"], np.nan)
        df["usage_share_roll5"] = g["usage_share"].shift(1).rolling(5, min_periods=1).mean()
    return df

def team_daily_ratings(df: pd.DataFrame, windows=(5,10)) -> pd.DataFrame:
    # Poss ‚âà FGA + 0.44*FTA - OREB + TOV  (OREB optional)
    for c in ["TEAM_ABBREVIATION","OPPONENT_ABBREVIATION","GAME_DATE","PTS","FGA","FTA","TOV","OREB"]:
        if c not in df.columns: df[c] = 0.0

    g = df.groupby(["GAME_DATE","TEAM_ABBREVIATION"], as_index=False).agg(
        PTS_team=("PTS","sum"), FGA=("FGA","sum"), FTA=("FTA","sum"),
        TOV=("TOV","sum"), OREB=("OREB","sum")
    )
    g["poss"] = g["FGA"] + 0.44*g["FTA"] - g["OREB"] + g["TOV"]

    opp = g.rename(columns={
        "TEAM_ABBREVIATION":"OPPONENT_ABBREVIATION",
        "PTS_team":"PTS_opp", "poss":"poss_opp"
    })[["GAME_DATE","OPPONENT_ABBREVIATION","PTS_opp","poss_opp"]]

    g2 = g.merge(opp, on=["GAME_DATE"], how="left")
    g2["ORtg_g"] = np.where(g2["poss"]>0, 100*g2["PTS_team"]/g2["poss"], np.nan)
    g2["DRtg_g"] = np.where(g2["poss_opp"]>0, 100*g2["PTS_opp"]/g2["poss_opp"], np.nan)
    g2["Pace_g"] = (g2["poss"] + g2["poss_opp"]) / 2.0

    g2 = g2.sort_values(["TEAM_ABBREVIATION","GAME_DATE"])
    for w in windows:
        for stat in ["ORtg_g","DRtg_g","Pace_g"]:
            g2[f"{stat}_roll{w}"] = (
                g2.groupby("TEAM_ABBREVIATION")[stat].shift(1).rolling(w, min_periods=1).mean()
            )
    keep = ["GAME_DATE","TEAM_ABBREVIATION","ORtg_g_roll5","DRtg_g_roll5","Pace_g_roll5",
            "ORtg_g_roll10","DRtg_g_roll10","Pace_g_roll10"]
    return g2[keep].drop_duplicates(subset=["GAME_DATE","TEAM_ABBREVIATION"], keep="last")

def opponent_position_allowances(df: pd.DataFrame, window=10) -> pd.DataFrame:
    if "START_POSITION" not in df.columns:
        df["START_POSITION"] = np.nan
    base = (df.groupby(["GAME_DATE","OPPONENT_ABBREVIATION","START_POSITION"], as_index=False)
              .agg(PTS_allowed=("PTS","sum"), AST_allowed=("AST","sum"), REB_allowed=("REB","sum"))
              .sort_values(["OPPONENT_ABBREVIATION","START_POSITION","GAME_DATE"]))
    for stat in ["PTS_allowed","AST_allowed","REB_allowed"]:
        base[f"{stat}_roll{window}"] = (
            base.groupby(["OPPONENT_ABBREVIATION","START_POSITION"])[stat]
                .shift(1).rolling(window, min_periods=3).mean()
        )
    wide = base.pivot_table(
        index=["GAME_DATE","OPPONENT_ABBREVIATION"],
        columns="START_POSITION",
        values=[f"PTS_allowed_roll{window}",f"AST_allowed_roll{window}",f"REB_allowed_roll{window}"]
    )
    wide.columns = [f"{a}_{b}" for a,b in wide.columns.to_flat_index()]
    return wide.reset_index()

def assemble_player_game_features(df_logs: pd.DataFrame, df_enriched_season: pd.DataFrame) -> pd.DataFrame:
    df = standardize_logs_cols(df_logs)
    # Extract OPPONENT_ABBREVIATION from MATCHUP column
    df["OPPONENT_ABBREVIATION"] = df["MATCHUP"].str.extract(r"(?:vs\.|@)\s+([A-Z]+)")
    # Normalize opp_key from abbreviation
    abbrev_to_key = {
        "ATL": "ATLANTAHAWKS", "BOS": "BOSTONCELTICS", "BKN": "BROOKLYNNETS", "CHA": "CHARLOTTEHORNETS",
        "CHI": "CHICAGOBULLS", "CLE": "CAVALIERS", "DAL": "DALLASMAVERICKS", "DEN": "DENVERNUGGETS",
        "DET": "DETPISTONS", "GSW": "WARRIORS", "HOU": "ROCKETS", "IND": "PACERS",
        "LAC": "CLIPPERS", "LAL": "LAKERS", "MEM": "GRIZZLIES", "MIA": "HEAT",
        "MIL": "BUCKS", "MIN": "TIMBERWOLVES", "NOP": "PELICANS", "NYK": "KNICKS",
        "OKC": "THUNDER", "ORL": "MAGIC", "PHI": "SIXERS", "PHX": "SUNS",
        "POR": "BLAZERS", "SAC": "KINGS", "SAS": "SPURS", "TOR": "RAPTORS",
        "UTA": "UTAHJAZZ", "WAS": "WASHINGTONWIZARDS"
    }

    df["opp_key"] = df["OPPONENT_ABBREVIATION"].map(abbrev_to_key)
    
    df = add_shooting_efficiency(df)
    df = rolling_player_form(df)

    tr = team_daily_ratings(df)
    df = df.merge(tr, on=["GAME_DATE","TEAM_ABBREVIATION"], how="left")

    oppw = opponent_position_allowances(df)
    df = df.merge(oppw, on=["GAME_DATE","OPPONENT_ABBREVIATION"], how="left")

    # Merge season-enriched (PER/TS%/USG%/ORtg/DRtg/etc.)
    def _norm(s):
        s = str(s).strip().lower()
        s = unicodedata.normalize("NFKD", s)
        s = "".join(ch for ch in s if not unicodedata.combining(ch))
        for ch in [".","'","`","‚Äô","‚Äú","‚Äù",","]:
            s = s.replace(ch,"")
        return " ".join(s.split())

    tmp = df_enriched_season.copy()
    df["player_key"] = df["PLAYER_NAME"].map(_norm)
    df["team_key"]   = df["TEAM_ABBREVIATION"].astype(str).str.upper()
    tmp["player_key"] = tmp["Player"].map(_norm)
    tmp["team_key"]   = tmp["Team"].astype(str).str.upper()

    keep_adv = [c for c in ["PER","TS%","USG%","ORtg","DRtg","WS/48","BPM","VORP","Pos","Age"] if c in tmp.columns]
    df = df.merge(tmp[["player_key","team_key"] + keep_adv], on=["player_key","team_key"], how="left")

    # Situational flags
    if "MATCHUP" in df.columns:
        df["HOME"] = df["MATCHUP"].str.contains(" vs. ", regex=False).astype(int)
    else:
        df["HOME"] = np.nan

    # Rest flags + next-game targets (minutes too)
    df = df.sort_values(["PLAYER_NAME","GAME_DATE"])
    df["prev_date"] = df.groupby("PLAYER_NAME")["GAME_DATE"].shift(1)
    df["days_rest"] = (df["GAME_DATE"] - df["prev_date"]).dt.days
    df["is_b2b"]    = (df["days_rest"] == 0).astype(int)

    for target, src in [("PTS_next","PTS"), ("REB_next","REB"), ("AST_next","AST"), ("MIN_next","MIN")]:
        if src not in df.columns: df[src] = np.nan
        df[target] = df.groupby("PLAYER_NAME")[src].shift(-1)

    return df


In [12]:
# -- cell 12a --
import requests
import pandas as pd
import time

# Base URL
url = "https://www.dunkest.com/api/stats/defense-vs-position"

# Define the combinations with corrected mappings
stats_ids = [4, 26, 5]  # 4: pts, 26: reb, 5: ast
position_ids = [1, 2, 3]  # 1: G, 2: F, 3: C
season_ids = [25, 19, 13]  # 25: 2025‚Äì26, 19: 2024‚Äì25, 13: 2023‚Äì24

headers = {
    "User-Agent": "Mozilla/5.0",
    "Accept": "application/json"
}

all_data = []

for season_id in season_ids:
    for stats_id in stats_ids:
        for position_id in position_ids:
            params = {
                "season_id": season_id,
                "stats_id": stats_id,
                "position_id": position_id
            }
            
            try:
                response = requests.get(url, params=params, headers=headers)
                data = response.json()
                
                # Add identifiers to each record
                for record in data:
                    record['season_id'] = season_id   # ‚úÖ FIXED: correct single season_id
                    record['stats_id'] = stats_id
                    record['position_id'] = position_id

                    # Map IDs to readable names
                    stat_names = {4: 'PTS', 26: 'REB', 5: 'AST'}
                    position_names = {1: 'G', 2: 'F', 3: 'C'}

                    record['stat_type'] = stat_names.get(stats_id, f'unknown_{stats_id}')
                    record['position'] = position_names.get(position_id, f'unknown_{position_id}')
                
                all_data.extend(data)
                
                time.sleep(0.5)  # be polite to API
                
            except Exception as e:
                print(f"‚ùå Error fetching stats_id={stats_id}, position_id={position_id}, season_id={season_id}: {e}")

# Convert to DataFrame
df_combined = pd.DataFrame(all_data)

print(f"\n‚úÖ Total records: {len(df_combined)}")
print(f"Columns: {df_combined.columns.tolist()}")
print(df_combined.head())

# Optional: Save to CSV
df_combined.to_csv('defense_vs_position_combined.csv', index=False)
print(f"\nüíæ Saved to defense_vs_position_combined.csv")



‚úÖ Total records: 810
Columns: ['id', 'name', 'l3', 'l5', 'l10', 'all', 'season_id', 'stats_id', 'position_id', 'stat_type', 'position']
  id               name     l3     l5    l10    all  season_id  stats_id  position_id stat_type position
0  1      Atlanta Hawks  20.67  21.25  19.92  19.11         25         4            1       PTS        G
1  2     Boston Celtics  16.00  16.00  16.35  18.32         25         4            1       PTS        G
2  3      Brooklyn Nets  18.50  19.14  18.31  18.90         25         4            1       PTS        G
3  4  Charlotte Hornets  18.40  19.40  17.47  17.88         25         4            1       PTS        G
4  5      Chicago Bulls  20.57  20.91  20.33  18.75         25         4            1       PTS        G

üíæ Saved to defense_vs_position_combined.csv


In [13]:
# -- Cell 11: Load logs + build features_all with season weights ------------------------------
import os
import pandas as pd

DATA_DIR   = "data_raw"
ENRICH_DIR = "data_enriched"

paths = {
    "logs_2324": os.path.join(DATA_DIR, "nba_boxscores_2023-24.csv"),
    "logs_2425": os.path.join(DATA_DIR, "nba_boxscores_2024-25.csv"),
    "logs_2526": os.path.join(DATA_DIR, "nba_boxscores_2025-26.csv"),
    "enr_2324":  os.path.join(ENRICH_DIR, "nba_player_stats_2023_24_enriched.csv"),
    "enr_2425":  os.path.join(ENRICH_DIR, "nba_player_stats_2024_25_enriched.csv"),
    "enr_2526":  os.path.join(ENRICH_DIR, "nba_player_stats_2025_26_enriched.csv")
}

# Check all required files exist
for k, p in paths.items():
    if not os.path.exists(p):
        raise FileNotFoundError(f"Missing required file {p} (from earlier cells).")

# Load boxscores + enriched stats
logs_2324     = pd.read_csv(paths["logs_2324"])
logs_2425     = pd.read_csv(paths["logs_2425"])
logs_2526     = pd.read_csv(paths["logs_2526"])
enriched_2324 = pd.read_csv(paths["enr_2324"])
enriched_2425 = pd.read_csv(paths["enr_2425"])
enriched_2526 = pd.read_csv(paths["enr_2526"])

# Apply weights to enriched stats before combining
enriched_2324["season_weight"] = 0.1
enriched_2425["season_weight"] = 0.3
enriched_2526["season_weight"] = 0.6

# Optionally mark source season (if needed later)
enriched_2324["season"] = "2023-24"
enriched_2425["season"] = "2024-25"
enriched_2526["season"] = "2025-26"

# Feature engineering
feat_2324 = assemble_player_game_features(logs_2324, enriched_2324)
feat_2425 = assemble_player_game_features(logs_2425, enriched_2425)
feat_2526 = assemble_player_game_features(logs_2526, enriched_2526)

# Combine all seasons
features_all = pd.concat([feat_2324, feat_2425, feat_2526], ignore_index=True)

# Parse game dates
if "GAME_DATE" in features_all.columns:
    features_all["GAME_DATE"] = pd.to_datetime(features_all["GAME_DATE"])

display(features_all.head())

Unnamed: 0,SEASON_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,FANTASY_PTS,VIDEO_AVAILABLE,OPPONENT_ABBREVIATION,opp_key,TS_game,PTS_roll3,REB_roll3,AST_roll3,MIN_roll3,TS_game_roll3,PTS_roll5,REB_roll5,AST_roll5,MIN_roll5,TS_game_roll5,PTS_roll10,REB_roll10,AST_roll10,MIN_roll10,TS_game_roll10,PTS_roll20,REB_roll20,AST_roll20,MIN_roll20,TS_game_roll20,teamFGA_game,usage_share,usage_share_roll5,ORtg_g_roll5,DRtg_g_roll5,Pace_g_roll5,ORtg_g_roll10,DRtg_g_roll10,Pace_g_roll10,START_POSITION,player_key,team_key,PER,TS%,USG%,WS/48,BPM,VORP,Pos,Age,HOME,prev_date,days_rest,is_b2b,PTS_next,REB_next,AST_next,MIN_next
0,22023,1630639,A.J. Lawson,1610612742,DAL,Dallas Mavericks,22300277,2023-12-01,DAL vs. MEM,L,4,0,1,0.0,0,1,0.0,1,2,0.5,0,1,1,0,0,0,0,0,1,4,2.2,1,MEM,GRIZZLIES,0.265957,,,,,,,,,,,,,,,,,,,,,92,0.01087,,94.719871,111.136901,101.024,94.719871,112.807308,99.83,,aj lawson,DAL,11.2,0.519,20.0,0.036,-4.6,-0.2,SG,23.0,1,NaT,,0,12.0,0.0,2.0,19.0
1,22023,1630639,A.J. Lawson,1610612742,DAL,Dallas Mavericks,22300287,2023-12-02,DAL vs. OKC,L,19,4,10,0.4,3,7,0.429,1,2,0.5,0,0,0,2,0,1,0,1,12,9,18.0,1,OKC,THUNDER,0.551471,1.0,1.0,0.0,4.0,0.265957,1.0,1.0,0.0,4.0,0.265957,1.0,1.0,0.0,4.0,0.265957,1.0,1.0,0.0,4.0,0.265957,87,0.114943,0.01087,117.1875,109.79431,103.896,117.1875,115.510621,102.764,,aj lawson,DAL,11.2,0.519,20.0,0.036,-4.6,-0.2,SG,23.0,1,2023-12-01,1.0,0,4.0,1.0,0.0,7.0
2,22023,1630639,A.J. Lawson,1610612742,DAL,Dallas Mavericks,22301213,2023-12-06,DAL vs. UTA,W,7,2,2,1.0,0,0,,0,0,,1,0,1,0,0,0,0,0,4,7,5.2,1,UTA,UTAHJAZZ,1.0,6.5,0.5,1.0,11.5,0.408714,6.5,0.5,1.0,11.5,0.408714,6.5,0.5,1.0,11.5,0.408714,6.5,0.5,1.0,11.5,0.408714,101,0.019802,0.062906,139.52164,102.83723,104.28,139.52164,105.263825,103.66,,aj lawson,DAL,11.2,0.519,20.0,0.036,-4.6,-0.2,SG,23.0,1,2023-12-02,4.0,0,2.0,1.0,0.0,7.0
3,22023,1630639,A.J. Lawson,1610612742,DAL,Dallas Mavericks,22301226,2023-12-08,DAL @ POR,W,7,1,3,0.333,0,2,0.0,0,0,,0,1,1,0,0,0,0,1,2,-4,3.2,1,POR,BLAZERS,0.333333,5.666667,0.666667,0.666667,10.0,0.605809,5.666667,0.666667,0.666667,10.0,0.605809,5.666667,0.666667,0.666667,10.0,0.605809,5.666667,0.666667,0.666667,10.0,0.605809,91,0.032967,0.048538,117.172853,110.954684,103.672,117.172853,115.883515,103.93,,aj lawson,DAL,11.2,0.519,20.0,0.036,-4.6,-0.2,SG,23.0,0,2023-12-06,2.0,0,4.0,1.0,1.0,14.0
4,22023,1630639,A.J. Lawson,1610612742,DAL,Dallas Mavericks,22300299,2023-12-11,DAL @ MEM,W,14,2,7,0.286,0,4,0.0,0,0,,1,0,1,1,0,0,1,0,4,1,5.7,1,MEM,GRIZZLIES,0.285714,6.0,0.666667,0.666667,11.0,0.628268,4.75,0.75,0.5,9.25,0.53769,4.75,0.75,0.5,9.25,0.53769,4.75,0.75,0.5,9.25,0.53769,86,0.081395,0.044645,118.389897,113.679259,102.652,118.389897,119.2089,102.326,,aj lawson,DAL,11.2,0.519,20.0,0.036,-4.6,-0.2,SG,23.0,0,2023-12-08,3.0,0,0.0,0.0,0.0,0.0


In [14]:
# üß± Normalize team and position keys in matchup_df
df_combined["team_key"] = df_combined["name"].str.upper().str.replace(" ", "")
df_combined["position"] = df_combined["position"].map({"G": "PG", "F": "SF", "C": "C"})
display(df_combined.head())
display(features_all.head())
# üß± Map season_id to readable string
season_id_map = {
    25: "2025-26",
    19: "2024-25",
    13: "2023-24"
}
df_combined["season"] = df_combined["season_id"].map(season_id_map)

# üß± Normalize keys in features_all
features_all["opp_key"] = features_all["OPPONENT_ABBREVIATION"].astype(str).str.upper().str.replace(" ", "")
features_all["dvp_position"] = features_all["Pos"].copy()
features_all["season"] = features_all["SEASON_ID"].astype(int).apply(lambda x: f"{x - 20000}-{x - 19999}")
features_all["season"] = features_all["season"].astype(str)

# üß© Safe merge function that drops prior columns if they exist
def merge_matchup_stat(df_base, stat: str, suffix: str):
    score_col = f"matchup_score_{suffix}"

    # Remove any previously merged version of the same stat
    df_base = df_base.drop(columns=[score_col, "team_key", "position"], errors="ignore")

    # Prepare stat-specific df
    df_stat = df_combined[df_combined["stat_type"] == stat].copy()
    df_stat = df_stat.rename(columns={"all": score_col})
    df_stat = df_stat[["team_key", "position", "season", score_col]]

    # Merge
    df_merged = pd.merge(
        df_base,
        df_stat,
        left_on=["opp_key", "dvp_position", "season"],
        right_on=["team_key", "position", "season"],
        how="left"
    )

    return df_merged.drop(columns=["team_key", "position"], errors="ignore")

# ‚úÖ Apply matchup stats
features_all = merge_matchup_stat(features_all, "PTS", "pts")
features_all = merge_matchup_stat(features_all, "REB", "reb")
features_all = merge_matchup_stat(features_all, "AST", "ast")

# üßπ Keep only the final matchup score columns
features_all = features_all.drop(columns=[
    col for col in features_all.columns 
    if col.startswith("matchup_score_") and col not in [
        "matchup_score_pts", "matchup_score_reb", "matchup_score_ast"
    ]
])


# ‚úÖ Check result
display(features_all.head())


Unnamed: 0,id,name,l3,l5,l10,all,season_id,stats_id,position_id,stat_type,position,team_key
0,1,Atlanta Hawks,20.67,21.25,19.92,19.11,25,4,1,PTS,PG,ATLANTAHAWKS
1,2,Boston Celtics,16.0,16.0,16.35,18.32,25,4,1,PTS,PG,BOSTONCELTICS
2,3,Brooklyn Nets,18.5,19.14,18.31,18.9,25,4,1,PTS,PG,BROOKLYNNETS
3,4,Charlotte Hornets,18.4,19.4,17.47,17.88,25,4,1,PTS,PG,CHARLOTTEHORNETS
4,5,Chicago Bulls,20.57,20.91,20.33,18.75,25,4,1,PTS,PG,CHICAGOBULLS


Unnamed: 0,SEASON_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,FANTASY_PTS,VIDEO_AVAILABLE,OPPONENT_ABBREVIATION,opp_key,TS_game,PTS_roll3,REB_roll3,AST_roll3,MIN_roll3,TS_game_roll3,PTS_roll5,REB_roll5,AST_roll5,MIN_roll5,TS_game_roll5,PTS_roll10,REB_roll10,AST_roll10,MIN_roll10,TS_game_roll10,PTS_roll20,REB_roll20,AST_roll20,MIN_roll20,TS_game_roll20,teamFGA_game,usage_share,usage_share_roll5,ORtg_g_roll5,DRtg_g_roll5,Pace_g_roll5,ORtg_g_roll10,DRtg_g_roll10,Pace_g_roll10,START_POSITION,player_key,team_key,PER,TS%,USG%,WS/48,BPM,VORP,Pos,Age,HOME,prev_date,days_rest,is_b2b,PTS_next,REB_next,AST_next,MIN_next
0,22023,1630639,A.J. Lawson,1610612742,DAL,Dallas Mavericks,22300277,2023-12-01,DAL vs. MEM,L,4,0,1,0.0,0,1,0.0,1,2,0.5,0,1,1,0,0,0,0,0,1,4,2.2,1,MEM,GRIZZLIES,0.265957,,,,,,,,,,,,,,,,,,,,,92,0.01087,,94.719871,111.136901,101.024,94.719871,112.807308,99.83,,aj lawson,DAL,11.2,0.519,20.0,0.036,-4.6,-0.2,SG,23.0,1,NaT,,0,12.0,0.0,2.0,19.0
1,22023,1630639,A.J. Lawson,1610612742,DAL,Dallas Mavericks,22300287,2023-12-02,DAL vs. OKC,L,19,4,10,0.4,3,7,0.429,1,2,0.5,0,0,0,2,0,1,0,1,12,9,18.0,1,OKC,THUNDER,0.551471,1.0,1.0,0.0,4.0,0.265957,1.0,1.0,0.0,4.0,0.265957,1.0,1.0,0.0,4.0,0.265957,1.0,1.0,0.0,4.0,0.265957,87,0.114943,0.01087,117.1875,109.79431,103.896,117.1875,115.510621,102.764,,aj lawson,DAL,11.2,0.519,20.0,0.036,-4.6,-0.2,SG,23.0,1,2023-12-01,1.0,0,4.0,1.0,0.0,7.0
2,22023,1630639,A.J. Lawson,1610612742,DAL,Dallas Mavericks,22301213,2023-12-06,DAL vs. UTA,W,7,2,2,1.0,0,0,,0,0,,1,0,1,0,0,0,0,0,4,7,5.2,1,UTA,UTAHJAZZ,1.0,6.5,0.5,1.0,11.5,0.408714,6.5,0.5,1.0,11.5,0.408714,6.5,0.5,1.0,11.5,0.408714,6.5,0.5,1.0,11.5,0.408714,101,0.019802,0.062906,139.52164,102.83723,104.28,139.52164,105.263825,103.66,,aj lawson,DAL,11.2,0.519,20.0,0.036,-4.6,-0.2,SG,23.0,1,2023-12-02,4.0,0,2.0,1.0,0.0,7.0
3,22023,1630639,A.J. Lawson,1610612742,DAL,Dallas Mavericks,22301226,2023-12-08,DAL @ POR,W,7,1,3,0.333,0,2,0.0,0,0,,0,1,1,0,0,0,0,1,2,-4,3.2,1,POR,BLAZERS,0.333333,5.666667,0.666667,0.666667,10.0,0.605809,5.666667,0.666667,0.666667,10.0,0.605809,5.666667,0.666667,0.666667,10.0,0.605809,5.666667,0.666667,0.666667,10.0,0.605809,91,0.032967,0.048538,117.172853,110.954684,103.672,117.172853,115.883515,103.93,,aj lawson,DAL,11.2,0.519,20.0,0.036,-4.6,-0.2,SG,23.0,0,2023-12-06,2.0,0,4.0,1.0,1.0,14.0
4,22023,1630639,A.J. Lawson,1610612742,DAL,Dallas Mavericks,22300299,2023-12-11,DAL @ MEM,W,14,2,7,0.286,0,4,0.0,0,0,,1,0,1,1,0,0,1,0,4,1,5.7,1,MEM,GRIZZLIES,0.285714,6.0,0.666667,0.666667,11.0,0.628268,4.75,0.75,0.5,9.25,0.53769,4.75,0.75,0.5,9.25,0.53769,4.75,0.75,0.5,9.25,0.53769,86,0.081395,0.044645,118.389897,113.679259,102.652,118.389897,119.2089,102.326,,aj lawson,DAL,11.2,0.519,20.0,0.036,-4.6,-0.2,SG,23.0,0,2023-12-08,3.0,0,0.0,0.0,0.0,0.0


Unnamed: 0,SEASON_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,FANTASY_PTS,VIDEO_AVAILABLE,OPPONENT_ABBREVIATION,opp_key,TS_game,PTS_roll3,REB_roll3,AST_roll3,MIN_roll3,TS_game_roll3,PTS_roll5,REB_roll5,AST_roll5,MIN_roll5,TS_game_roll5,PTS_roll10,REB_roll10,AST_roll10,MIN_roll10,TS_game_roll10,PTS_roll20,REB_roll20,AST_roll20,MIN_roll20,TS_game_roll20,teamFGA_game,usage_share,usage_share_roll5,ORtg_g_roll5,DRtg_g_roll5,Pace_g_roll5,ORtg_g_roll10,DRtg_g_roll10,Pace_g_roll10,START_POSITION,player_key,PER,TS%,USG%,WS/48,BPM,VORP,Pos,Age,HOME,prev_date,days_rest,is_b2b,PTS_next,REB_next,AST_next,MIN_next,dvp_position,season,matchup_score_pts,matchup_score_reb,matchup_score_ast
0,22023,1630639,A.J. Lawson,1610612742,DAL,Dallas Mavericks,22300277,2023-12-01,DAL vs. MEM,L,4,0,1,0.0,0,1,0.0,1,2,0.5,0,1,1,0,0,0,0,0,1,4,2.2,1,MEM,MEM,0.265957,,,,,,,,,,,,,,,,,,,,,92,0.01087,,94.719871,111.136901,101.024,94.719871,112.807308,99.83,,aj lawson,11.2,0.519,20.0,0.036,-4.6,-0.2,SG,23.0,1,NaT,,0,12.0,0.0,2.0,19.0,SG,2023-2024,,,
1,22023,1630639,A.J. Lawson,1610612742,DAL,Dallas Mavericks,22300287,2023-12-02,DAL vs. OKC,L,19,4,10,0.4,3,7,0.429,1,2,0.5,0,0,0,2,0,1,0,1,12,9,18.0,1,OKC,OKC,0.551471,1.0,1.0,0.0,4.0,0.265957,1.0,1.0,0.0,4.0,0.265957,1.0,1.0,0.0,4.0,0.265957,1.0,1.0,0.0,4.0,0.265957,87,0.114943,0.01087,117.1875,109.79431,103.896,117.1875,115.510621,102.764,,aj lawson,11.2,0.519,20.0,0.036,-4.6,-0.2,SG,23.0,1,2023-12-01,1.0,0,4.0,1.0,0.0,7.0,SG,2023-2024,,,
2,22023,1630639,A.J. Lawson,1610612742,DAL,Dallas Mavericks,22301213,2023-12-06,DAL vs. UTA,W,7,2,2,1.0,0,0,,0,0,,1,0,1,0,0,0,0,0,4,7,5.2,1,UTA,UTA,1.0,6.5,0.5,1.0,11.5,0.408714,6.5,0.5,1.0,11.5,0.408714,6.5,0.5,1.0,11.5,0.408714,6.5,0.5,1.0,11.5,0.408714,101,0.019802,0.062906,139.52164,102.83723,104.28,139.52164,105.263825,103.66,,aj lawson,11.2,0.519,20.0,0.036,-4.6,-0.2,SG,23.0,1,2023-12-02,4.0,0,2.0,1.0,0.0,7.0,SG,2023-2024,,,
3,22023,1630639,A.J. Lawson,1610612742,DAL,Dallas Mavericks,22301226,2023-12-08,DAL @ POR,W,7,1,3,0.333,0,2,0.0,0,0,,0,1,1,0,0,0,0,1,2,-4,3.2,1,POR,POR,0.333333,5.666667,0.666667,0.666667,10.0,0.605809,5.666667,0.666667,0.666667,10.0,0.605809,5.666667,0.666667,0.666667,10.0,0.605809,5.666667,0.666667,0.666667,10.0,0.605809,91,0.032967,0.048538,117.172853,110.954684,103.672,117.172853,115.883515,103.93,,aj lawson,11.2,0.519,20.0,0.036,-4.6,-0.2,SG,23.0,0,2023-12-06,2.0,0,4.0,1.0,1.0,14.0,SG,2023-2024,,,
4,22023,1630639,A.J. Lawson,1610612742,DAL,Dallas Mavericks,22300299,2023-12-11,DAL @ MEM,W,14,2,7,0.286,0,4,0.0,0,0,,1,0,1,1,0,0,1,0,4,1,5.7,1,MEM,MEM,0.285714,6.0,0.666667,0.666667,11.0,0.628268,4.75,0.75,0.5,9.25,0.53769,4.75,0.75,0.5,9.25,0.53769,4.75,0.75,0.5,9.25,0.53769,86,0.081395,0.044645,118.389897,113.679259,102.652,118.389897,119.2089,102.326,,aj lawson,11.2,0.519,20.0,0.036,-4.6,-0.2,SG,23.0,0,2023-12-08,3.0,0,0.0,0.0,0.0,0.0,SG,2023-2024,,,


In [15]:
# cell 13a: Add data validation checks
def validate_features(features_df):
    """Validate feature dataframe for modeling"""
    required_cols = ['PLAYER_NAME', 'GAME_DATE', 'PTS', 'REB', 'AST', 'MIN']
    missing_cols = [col for col in required_cols if col not in features_df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")
    
    # Check for data freshness
    latest_date = pd.to_datetime(features_all['GAME_DATE']).max()
    days_old = (pd.Timestamp.now() - latest_date).days
    if days_old > 30:
        print(f"‚ö†Ô∏è Warning: Data is {days_old} days old")
    
    return True

# Add to Cell 12 after loading features_all
validate_features(features_all)

print(pd.to_datetime(features_all['GAME_DATE']).max())
print(pd.Timestamp.now())

2025-11-17 00:00:00
2025-11-18 20:34:31.808878


In [16]:
# -- Cell 12: Build TODAY minutes features (starter/injury ‚Üí minutes signals) --
import os, re, numpy as np, pandas as pd
from datetime import datetime

# Expect df_lineups (Cell 3), mnp_df (Cell 5), and starter_flags_df / injury_flags_df (Cell 7)
assert 'features_all' in globals() and isinstance(features_all, pd.DataFrame) and not features_all.empty, \
    "features_all must exist (Cell 12)."
assert 'starter_flags_df' in globals(), "starter_flags_df missing (Cell 7)."
assert 'injury_flags_df' in globals(),  "injury_flags_df missing (Cell 7)."

def _norm_player(name: str) -> str:
    if not isinstance(name, str): return ""
    s = re.sub(r"[.\-`'‚Äô]", "", name).strip().lower()
    s = re.sub(r"\s+", " ", s)
    return s

# Latest row per player for context
keep_cols = [
    "PLAYER_ID","PLAYER_NAME","TEAM_ABBREVIATION","OPPONENT_ABBREVIATION","GAME_DATE",
    "MIN_roll5","MIN_roll10","days_rest","is_b2b","HOME",
    "USG%","TS%","PER","BPM"
]
keep_cols = [c for c in keep_cols if c in features_all.columns]

latest = (features_all.sort_values(["PLAYER_NAME","GAME_DATE"])
                     .groupby("PLAYER_NAME", as_index=False)
                     .tail(1)[keep_cols]
                     .copy())

# Fabricate PLAYER_ID if missing (rare)
if "PLAYER_ID" not in latest.columns:
    latest["PLAYER_ID"] = latest["PLAYER_NAME"].factorize()[0] + 1

latest["player_key"] = latest["PLAYER_NAME"].map(_norm_player)
latest = latest.rename(columns={
    "TEAM_ABBREVIATION": "team",
    "OPPONENT_ABBREVIATION": "opponent"
})

# Normalize and merge flags
sf = starter_flags_df.copy()
sf["player_key"] = sf["player"].map(_norm_player)
sf["team"] = sf["team"].astype(str).str.upper().str.strip()

inj = injury_flags_df.copy()
inj["player_key"] = inj["player"].map(_norm_player)

# Merge starter and injury flags into today frame
today = (latest.merge(sf[["player_key","team","is_starter","start_prob"]],
                      on=["player_key","team"], how="left")
               .merge(inj[["player_key","may_not_play","injury_prob"]],
                      on="player_key", how="left"))

today["is_starter"]   = today["is_starter"].fillna(0).astype(int)
today["start_prob"]   = today["start_prob"].fillna(0.70)
today["may_not_play"] = today["may_not_play"].fillna(0).astype(int)
today["injury_prob"]  = today["injury_prob"].fillna(0.0)

# ========== ‚úÖ NEW FIXED: Teammate absence proxy from full team rosters ==========

# Build full roster from latest features
roster_all = (features_all.sort_values(["PLAYER_NAME","GAME_DATE"])
                         .groupby("PLAYER_NAME", as_index=False)
                         .tail(1)[["PLAYER_NAME","TEAM_ABBREVIATION","USG%"]].copy())
roster_all["player_key"] = roster_all["PLAYER_NAME"].map(_norm_player)
roster_all = roster_all.rename(columns={"TEAM_ABBREVIATION": "team"})

# Merge in injury flags
inj = injury_flags_df.copy()
inj["player_key"] = inj["player"].map(_norm_player)
roster_all = roster_all.merge(inj[["player_key","injury_prob"]], on="player_key", how="left")
roster_all["injury_prob"] = roster_all["injury_prob"].fillna(0.0)

# Calculate expected missing usage per team
roster_all["usg_missing_expected"] = roster_all["USG%"].fillna(0.0) * roster_all["injury_prob"].clip(0, 1)

by_team_abs = roster_all.groupby("team", as_index=False).agg(
    teammates_out=("injury_prob", lambda s: float((s > 0).sum())),
    missing_usage_share=("usg_missing_expected","sum")
)

# Merge team-level features into today
today = today.merge(by_team_abs, on="team", how="left")
today["teammates_out"] = today["teammates_out"].fillna(0.0)
today["missing_usage_share"] = today["missing_usage_share"].fillna(0.0)

# Final output expected by minutes model
minutes_today = today.rename(columns={"GAME_DATE":"last_game_date"})[[
    "PLAYER_ID","PLAYER_NAME","team","opponent",
    "is_starter","start_prob","may_not_play","injury_prob",
    "teammates_out","missing_usage_share",
    *(c for c in ["MIN_roll5","MIN_roll10","days_rest","is_b2b","HOME","USG%","TS%","PER","BPM"] if c in today.columns),
]].copy()

print("‚úÖ minutes_today shape:", minutes_today.shape)
print(minutes_today.head(30).to_string(index=False))

‚úÖ minutes_today shape: (756, 19)
 PLAYER_ID        PLAYER_NAME team opponent  is_starter  start_prob  may_not_play  injury_prob  teammates_out  missing_usage_share  MIN_roll5  MIN_roll10  days_rest  is_b2b  HOME  USG%   TS%  PER   BPM
   1630639        A.J. Lawson  TOR      SAS           0         0.7             0          0.0            0.0                  0.0      24.60   18.900000        2.0       0     0  20.0 0.542 12.7  -2.5
   1631260           AJ Green  MIL      CLE           0         0.7             0          0.0            0.0                  0.0      26.20   27.500000        2.0       0     0  12.8 0.713 10.7  -2.2
   1631100         AJ Griffin  ATL      IND           0         0.7             0          0.0            1.0                 29.5       6.00    7.700000        2.0       0     0  17.3 0.382  1.2  -9.6
   1642358         AJ Johnson  WAS      BKN           0         0.7             0          0.0            0.0                  0.0       5.80    9.555556    

In [17]:
# -- Cell 13 (Fixed): bridge/standardize minutes_today -------------------------------
import pandas as pd
import numpy as np
import re

# ‚úÖ Skip processing if minutes_today from Cell 12 already exists with predictions
if "minutes_today" in globals() and "pred_minutes" in minutes_today.columns:
    print("‚úÖ Reusing minutes_today from Cell 12. Rows:", len(minutes_today))
    print(minutes_today.head(30).to_string(index=False))
else:
    def _find_minutes_df():
        candidates = []
        for name, obj in globals().items():
            if not isinstance(obj, pd.DataFrame):
                continue
            lname = name.lower()
            if re.search(r"(minutes|mins)", lname) and re.search(r"(today|pred|proj|features)", lname):
                candidates.append((name, obj))
        prio = ["minutes_today", "features_today_minutes", "minutes_pred", "minutes_predictions", "df_minutes_today"]
        for p in prio:
            for name, df in candidates:
                if name == p:
                    return name, df
        return candidates[0] if candidates else (None, None)

    name, df_src = _find_minutes_df()
    print(f"Detected minutes source: {name or 'None'}")

    if df_src is None or df_src.empty:
        print("‚ö†Ô∏è No minutes_today found; building fallback from features_all.")
        latest = (features_all.sort_values(["PLAYER_NAME","GAME_DATE"])
                            .groupby("PLAYER_NAME", as_index=False).tail(1).copy())

        base_min = latest.get("MIN_roll5", pd.Series(24, index=latest.index)).fillna(24).clip(10, 38)

        minutes_today = latest[["PLAYER_ID","PLAYER_NAME","TEAM_ABBREVIATION","OPPONENT_ABBREVIATION"]].copy()
        minutes_today["pred_minutes"] = base_min

        for flags_name in ["starter_flags_df", "injury_flags_df"]:
            if flags_name in globals() and isinstance(globals()[flags_name], pd.DataFrame):
                flags = globals()[flags_name]
                key_cols = [c for c in ["PLAYER_ID","PLAYER_NAME"] if c in flags.columns]
                if key_cols:
                    minutes_today = minutes_today.merge(
                        flags.drop_duplicates(subset=key_cols),
                        on=key_cols, how="left"
                    )

        minutes_today["start_prob"]   = minutes_today.get("start_prob", 0.75)
        minutes_today["is_starter"]   = minutes_today.get("is_starter", 0).fillna(0).astype(int)
        minutes_today["may_not_play"] = minutes_today.get("may_not_play", 0).fillna(0).astype(int)
        minutes_today["injury_prob"]  = minutes_today.get("injury_prob", 0.0).fillna(0.0)

    else:
        # Standardization logic from detected df_src
        df = df_src.copy()
        cmap = {}
        for c in df.columns:
            cl = c.lower()
            if cl in ["player_id","id"]: cmap[c] = "PLAYER_ID"
            elif cl in ["player_name","player","name"]: cmap[c] = "PLAYER_NAME"
            elif cl in ["team","team_abbreviation"]: cmap[c] = "TEAM_ABBREVIATION"
            elif cl in ["opponent","opp","opponent_abbreviation"]: cmap[c] = "OPPONENT_ABBREVIATION"
            elif cl in ["pred_minutes","minutes","min_pred"]: cmap[c] = "pred_minutes"
            elif cl in ["start_prob","starter_prob"]: cmap[c] = "start_prob"
            elif cl in ["is_starter","starter_flag"]: cmap[c] = "is_starter"
            elif cl in ["may_not_play","dnp_flag"]: cmap[c] = "may_not_play"
            elif cl in ["injury_prob","inj_prob"]: cmap[c] = "injury_prob"

        df = df.rename(columns=cmap)

        if "PLAYER_ID" not in df.columns or df["PLAYER_ID"].isna().all():
            latest = (features_all.sort_values(["PLAYER_NAME","GAME_DATE"])
                                .groupby("PLAYER_NAME", as_index=False).tail(1)[
                                    ["PLAYER_ID","PLAYER_NAME","TEAM_ABBREVIATION","OPPONENT_ABBREVIATION"]
                                ])
            df = df.merge(latest, on="PLAYER_NAME", how="left")

        req = ["PLAYER_ID","PLAYER_NAME","TEAM_ABBREVIATION","OPPONENT_ABBREVIATION",
               "pred_minutes","start_prob","is_starter","may_not_play","injury_prob"]
        for r in req:
            if r not in df.columns:
                if r == "pred_minutes":
                    base = (features_all.sort_values(["PLAYER_NAME","GAME_DATE"])
                                     .groupby("PLAYER_NAME", as_index=False).tail(1))
                    base = base[["PLAYER_NAME","MIN_roll5"]].rename(columns={"MIN_roll5":"pred_minutes"})
                    df = df.merge(base, on="PLAYER_NAME", how="left")
                    df["pred_minutes"] = df["pred_minutes"].fillna(24).clip(10, 38)
                elif r in ["start_prob","injury_prob"]:
                    df[r] = 0.75 if r == "start_prob" else 0.0
                elif r in ["is_starter","may_not_play"]:
                    df[r] = 0
                else:
                    df[r] = np.nan

        df["pred_minutes"] = pd.to_numeric(df["pred_minutes"], errors="coerce").fillna(24).clip(0, 48)
        df["is_starter"]   = df["is_starter"].fillna(0).astype(int)
        df["may_not_play"] = df["may_not_play"].fillna(0).astype(int)
        df["start_prob"]   = df["start_prob"].fillna(0.75)
        df["injury_prob"]  = df["injury_prob"].fillna(0.0)

        minutes_today = df[["PLAYER_ID","PLAYER_NAME","TEAM_ABBREVIATION","OPPONENT_ABBREVIATION",
                            "pred_minutes","start_prob","is_starter","may_not_play","injury_prob"]].copy()

    print("‚úÖ Finalized minutes_today rows:", len(minutes_today))
    print(minutes_today.head(30).to_string(index=False))


Detected minutes source: minutes_today
‚úÖ Finalized minutes_today rows: 756
 PLAYER_ID        PLAYER_NAME TEAM_ABBREVIATION OPPONENT_ABBREVIATION  pred_minutes  start_prob  is_starter  may_not_play  injury_prob
   1630639        A.J. Lawson               TOR                   SAS         24.60         0.7           0             0          0.0
   1631260           AJ Green               MIL                   CLE         26.20         0.7           0             0          0.0
   1631100         AJ Griffin               ATL                   IND         10.00         0.7           0             0          0.0
   1642358         AJ Johnson               WAS                   BKN         10.00         0.7           0             0          0.0
    203932       Aaron Gordon               DEN                   CHI         30.40         0.7           0             0          0.0
   1628988      Aaron Holiday               HOU                   ORL         10.00         0.7           0      

In [18]:
## -- Cell 14: Fetch projected minutes from Rotowire -------------------------------
import time
import json
import requests
import pandas as pd

# ============================
# Settings
# ============================

BASE_URL = "https://www.rotowire.com/basketball/ajax/get-projected-minutes.php"

# All NBA team codes (you can trim this list to just the teams playing today)
TEAMS = [
    "ATL","BOS","BKN","CHA","CHI","CLE","DAL","DEN","DET",
    "GSW","HOU","IND","LAC","LAL","MEM","MIA","MIL","MIN",
    "NOP","NYK","OKC","ORL","PHI","PHX","POR","SAC","SAS",
    "TOR","UTA","WAS"
]

# Create a session to reuse TCP connection and headers
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (compatible; projected-minutes-notebook/1.0)",
    "Accept": "application/json,*/*",
    "X-Requested-With": "XMLHttpRequest",
})

all_rows = []         # flattened player rows (if possible)
raw_per_team = {}     # raw JSON per team in case you want to inspect it

for team in TEAMS:
    params = {"team": team}
    try:
        resp = session.get(BASE_URL, params=params, timeout=10)
    except Exception as e:
        print(f"Request failed for {team}: {e}")
        continue

    if resp.status_code != 200:
        print(f"Team {team}: HTTP {resp.status_code}")
        continue

    try:
        data = resp.json()
    except json.JSONDecodeError:
        print(f"Team {team}: could not decode JSON")
        continue

    raw_per_team[team] = data  # always keep raw, whatever the shape is

    # ============================
    # Try to flatten into rows
    # ============================

    # Case 1: endpoint returns a list of player dicts
    if isinstance(data, list):
        for row in data:
            if isinstance(row, dict):
                row_with_team = {"team": team, **row}
                all_rows.append(row_with_team)
            else:
                # non-dict elements, keep as raw
                all_rows.append({"team": team, "raw": row})

    # Case 2: common pattern: {"players": [ {...}, {...}, ... ], ...}
    elif isinstance(data, dict) and "players" in data and isinstance(data["players"], list):
        for row in data["players"]:
            if isinstance(row, dict):
                row_with_team = {"team": team, **row}
                all_rows.append(row_with_team)
            else:
                all_rows.append({"team": team, "raw": row})

    # Fallback: unknown structure ‚Üí store whole object in "raw" column
    else:
        all_rows.append({"team": team, "raw": data})

    # Tiny delay to be polite to the server (adjust or remove at your own risk)
    time.sleep(0.3)

# ============================
# Build DataFrame
# ============================

df_minutes = pd.DataFrame(all_rows)

print("Shape:", df_minutes.shape)
display(df_minutes.head(20))

# If you want to inspect what the raw JSON looks like for a specific team, e.g. LAC:
# raw_per_team["LAC"]

Shape: (388, 17)


Unnamed: 0,team,raw,id,firstname,lastname,name,link,pos,inj,lineup,proj,avg,min,max,gap,stdev,hasRecentGames
0,ATL,{'error': 'You must be a paid subscriber to vi...,,,,,,,,,,,,,,,
1,BOS,,3894.0,Jaylen,Brown,Jaylen Brown,/basketball/player/jaylen-brown-3894,F,No,SF,34.0,33.0,27.0,38.0,1.0,5.467175,True
2,BOS,,4108.0,Jayson,Tatum,Jayson Tatum,/basketball/player/jayson-tatum-4108,F,Out,BE,0.0,,,,0.0,15.0,False
3,BOS,,4143.0,Chris,Boucher,Chris Boucher,/basketball/player/chris-boucher-4143,F,No,BE,0.0,12.0,9.0,15.0,-12.0,2.828427,True
4,BOS,,4195.0,Derrick,White,Derrick White,/basketball/player/derrick-white-4195,G,No,PG,34.0,33.0,27.0,39.0,1.0,5.748043,True
5,BOS,,4373.0,Anfernee,Simons,Anfernee Simons,/basketball/player/anfernee-simons-4373,G,No,BE,24.0,21.0,16.0,27.0,3.0,5.43047,True
6,BOS,,5131.0,Payton,Pritchard,Payton Pritchard,/basketball/player/payton-pritchard-5131,G,No,SG,31.0,31.0,28.0,34.0,0.0,3.238827,True
7,BOS,,5290.0,Xavier,Tillman,Xavier Tillman,/basketball/player/xavier-tillman-5290,F,No,BE,0.0,6.0,2.0,10.0,-6.0,4.145781,True
8,BOS,,5333.0,Neemias,Queta,Neemias Queta,/basketball/player/neemias-queta-5333,C,No,C,27.0,26.0,21.0,31.0,1.0,5.192302,True
9,BOS,,5338.0,Sam,Hauser,Sam Hauser,/basketball/player/sam-hauser-5338,F,No,BE,18.0,19.0,13.0,24.0,-1.0,5.318834,True


In [19]:
# -- Cell 15 (Updated): Inject Rotowire projected minutes into minutes_today ---------
import numpy as np
import pandas as pd
import re

assert 'df_minutes' in globals(), "df_minutes (Rotowire results) not found."
assert 'minutes_today' in globals(), "minutes_today not found. Run Cells 13b‚Äì14 first."

def _norm_player_local(name: str) -> str:
    if not isinstance(name, str):
        return ""
    s = re.sub(r"[.\-`'‚Äô]", "", name).strip().lower()
    s = re.sub(r"\s+", " ", s)
    return s

# Step 1: Prepare Rotowire minutes
rw = df_minutes.copy()
if "name" not in rw.columns:
    raise RuntimeError("df_minutes must have a 'name' column from Rotowire.")

rw["player_key"] = rw["name"].map(_norm_player_local)
rw["rotowire_minutes"] = pd.to_numeric(rw.get("proj", np.nan), errors="coerce")

# ‚úÖ Treat 0.0 projections as missing
rw["rotowire_minutes"] = rw["rotowire_minutes"].replace(0.0, np.nan)

rw = (
    rw.dropna(subset=["player_key", "rotowire_minutes"])
      .groupby("player_key", as_index=False)
      .agg(rotowire_minutes=("rotowire_minutes", "mean"))
)
print(f"‚úÖ Rotowire players with valid minutes: {len(rw)}")

# Step 2: Normalize player keys in minutes_today
mt = minutes_today.copy()

if "PLAYER_NAME" in mt.columns:
    mt["player_key"] = mt["PLAYER_NAME"].map(_norm_player_local)
elif "player" in mt.columns:
    mt["player_key"] = mt["player"].map(_norm_player_local)
else:
    raise RuntimeError("minutes_today must have PLAYER_NAME or player column.")

if "team" not in mt.columns and "TEAM_ABBREVIATION" in mt.columns:
    mt["team"] = mt["TEAM_ABBREVIATION"]

# Step 2b: Build active player pool
active_keys = set(rw["player_key"].dropna())

current_logs_name = None
for nm in ["logs_2526", "logs_2425", "logs_2324"]:
    if nm in globals():
        current_logs_name = nm
        break

if current_logs_name is not None:
    logs_df = globals()[current_logs_name]
    if isinstance(logs_df, pd.DataFrame) and "PLAYER_NAME" in logs_df.columns:
        log_keys = logs_df["PLAYER_NAME"].map(_norm_player_local).dropna().unique()
        active_keys.update(log_keys)
        print(f"Using {current_logs_name} for active players, count={len(log_keys)}")
else:
    print("‚ö†Ô∏è No logs_25xx DataFrame found; active players based only on Rotowire.")

before_n = len(mt)
mt = mt[mt["player_key"].isin(active_keys)].copy()
after_n = len(mt)
print(f"üîç Filtered minutes_today from {before_n} ‚Üí {after_n} rows based on active players.")

# Step 3: Merge
mt = mt.merge(rw, on="player_key", how="left")
if "rotowire_minutes" not in mt.columns:
    mt["rotowire_minutes"] = np.nan

# Step 4: Internal projection fallback
if "pred_minutes" in mt.columns:
    mt["pred_minutes_internal"] = mt["pred_minutes"]
elif "projected_minutes" in mt.columns:
    mt["pred_minutes_internal"] = mt["projected_minutes"]
else:
    mt["pred_minutes_internal"] = np.nan

# Step 5: Blend
w_rw = 0.6
w_internal = 0.4

base_internal = mt["pred_minutes_internal"].fillna(mt["rotowire_minutes"])
mt["pred_minutes"] = np.where(
    mt["rotowire_minutes"].notna(),
    w_rw * mt["rotowire_minutes"] + w_internal * base_internal,
    base_internal
)

mt["pred_minutes"] = mt["pred_minutes"].clip(0, 48)

# Step 6: Save
minutes_today = mt

print("‚úÖ minutes_today enriched with blended Rotowire + internal minutes.")
cols_to_show = [c for c in [
    "PLAYER_NAME", "team", "TEAM_ABBREVIATION",
    "pred_minutes_internal", "rotowire_minutes", "pred_minutes"
] if c in minutes_today.columns]

print(minutes_today[cols_to_show].head(25).to_string(index=False))


‚úÖ Rotowire players with valid minutes: 61
Using logs_2526 for active players, count=468
üîç Filtered minutes_today from 756 ‚Üí 468 rows based on active players.
‚úÖ minutes_today enriched with blended Rotowire + internal minutes.
      PLAYER_NAME team TEAM_ABBREVIATION  pred_minutes_internal  rotowire_minutes  pred_minutes
         AJ Green  MIL               MIL                  26.20               NaN         26.20
       AJ Johnson  WAS               WAS                  10.00               NaN         10.00
     Aaron Gordon  DEN               DEN                  30.40               NaN         30.40
    Aaron Holiday  HOU               HOU                  10.00               NaN         10.00
    Aaron Nesmith  IND               IND                  31.80               NaN         31.80
    Aaron Wiggins  OKC               OKC                  27.40               NaN         27.40
       Ace Bailey  UTA               UTA                  25.80              27.0         26.5

In [20]:
# -- Cell 15b: Fix low internal minute projections for known starters ------------
from collections import defaultdict

# Reference to use: current logs like logs_2526
log_df = globals().get("logs_2526", None)
if log_df is None or not isinstance(log_df, pd.DataFrame):
    raise RuntimeError("‚ùå logs_2526 not found or invalid.")

log_df["player_key"] = log_df["PLAYER_NAME"].map(_norm_player_local)

# Compute full-season average minutes per player
season_mins = (
    log_df.groupby("player_key")["MIN"]
          .mean()
          .dropna()
          .to_dict()
)

# Threshold: if pred_minutes_internal < 10 and season avg > 20, then override
def should_override(row):
    internal = row["pred_minutes_internal"]
    season_avg = season_mins.get(row["player_key"], None)
    return pd.notna(internal) and internal < 10 and season_avg and season_avg > 20

# Apply fix
minutes_today["minutes_avg_season"] = minutes_today["player_key"].map(season_mins)
minutes_today["corrected_pred_minutes"] = minutes_today.apply(
    lambda row: row["minutes_avg_season"] if should_override(row) else row["pred_minutes"],
    axis=1
)

minutes_today["pred_minutes"] = minutes_today["corrected_pred_minutes"].clip(0, 48)
minutes_today.drop(columns=["corrected_pred_minutes"], inplace=True)

print("‚úÖ Applied season average fallback to correct underprojected minutes.")
print(
    minutes_today[
        ["PLAYER_NAME", "team", "pred_minutes_internal", "rotowire_minutes", "minutes_avg_season", "pred_minutes"]
    ]
    .sort_values("pred_minutes", ascending=False)
    .head(25)
    .to_string(index=False)
)


‚úÖ Applied season average fallback to correct underprojected minutes.
          PLAYER_NAME team  pred_minutes_internal  rotowire_minutes  minutes_avg_season  pred_minutes
         Tyrese Maxey  PHI                  38.00               NaN           40.461538         38.00
        James Wiseman  IND                  38.00               NaN           20.000000         38.00
         James Harden  LAC                  38.00               NaN           36.230769         38.00
         Cooper Flagg  DAL                  37.60               NaN           33.800000         37.60
        Miles Bridges  CHA                  37.20               NaN           35.214286         37.20
          Josh Giddey  CHI                  37.00               NaN           34.363636         37.00
         Franz Wagner  ORL                  36.80               NaN           35.714286         36.80
        Amen Thompson  HOU                  36.40               NaN           35.500000         36.40
Giannis Ant

In [35]:
# -- New Cell: Scrape and build odds_long (minimal version) ------------

from datetime import datetime
import numpy as np, pandas as pd

BOOKS = ("mgm","draftkings","fanduel","caesars","betrivers")
scraper = NBAOddsScraper()

long_parts = []
for b in BOOKS:
    wide_b = scraper.get_player_props_odds_wide_raw(book=b)
    if wide_b.empty:
        print(f"‚ö†Ô∏è {b}: no rows scraped.")
        continue
    long_b = odds_wide_to_long_from_columns(wide_b, books=(b,), markets=("PTS","REB","AST"))
    if long_b.empty:
        print(f"‚ö†Ô∏è {b}: long table empty after conversion.")
    else:
        long_parts.append(long_b)

odds_long = pd.concat(long_parts, ignore_index=True) if long_parts else pd.DataFrame()
print(f"‚úÖ Combined long odds rows: {len(odds_long)}")

print(odds_long.head(20).to_string(index=False))

# 1. Normalize opponent names
odds_today = odds_long.copy()
odds_today["opp_clean"] = odds_today["opponent"].astype(str).str.replace("@", "", regex=False).str.upper().str.strip()
odds_today["team_clean"] = odds_today["team"].astype(str).str.upper().str.strip()

# 2. Create team-opponent matchup (both directions appear)
games_raw = odds_today[["team_clean", "opp_clean"]].dropna().drop_duplicates()

# 3. Create consistent keys by sorting team names alphabetically
games_raw["match_key"] = games_raw.apply(
    lambda r: "_vs_".join(sorted([r["team_clean"], r["opp_clean"]])), axis=1
)

# 4. Deduplicate using the match_key, and re-expand into team/opponent
games_deduped = (
    games_raw.drop_duplicates("match_key")[["team_clean", "opp_clean"]]
    .rename(columns={"team_clean": "team", "opp_clean": "opponent"})
    .reset_index(drop=True)
)

today_games_clean = pd.concat([
    games_deduped,
    games_deduped.rename(columns={"team": "opponent", "opponent": "team"})
]).drop_duplicates().reset_index(drop=True)

print("üìÖ today_games_clean (team vs opponent):")
print(today_games_clean.to_string(index=False))



‚úÖ Fetched 1139 odds rows | 264 columns | book=mgm
‚úÖ Fetched 1139 odds rows | 264 columns | book=draftkings
‚úÖ Fetched 1139 odds rows | 264 columns | book=fanduel
‚úÖ Fetched 1139 odds rows | 264 columns | book=caesars
‚úÖ Fetched 1139 odds rows | 264 columns | book=betrivers
‚úÖ Combined long odds rows: 3057
                  player team opponent  game_date market book  line  over_odds  under_odds
             Luka Doncic  LAL      UTA 2025-11-18    PTS  mgm  31.5     -120.0      -110.0
            Devin Booker  PHX     @POR 2025-11-18    PTS  mgm  29.5     -120.0      -110.0
            Jaylen Brown  BOS     @BKN 2025-11-18    PTS  mgm  26.5     -110.0      -120.0
           Stephen Curry  GSW     @ORL 2025-11-18    PTS  mgm  26.5     -120.0      -110.0
         Lauri Markkanen  UTA     @LAL 2025-11-18    PTS  mgm  26.5     -120.0      -110.0
         Cade Cunningham  DET     @ATL 2025-11-18    PTS  mgm  26.5     -120.0      -110.0
            De'Aaron Fox  SAS      MEM 2025-11-1

In [43]:
# -- Rebuild today_games_clean from odds_long with game_id ---------------

from datetime import datetime

assert "odds_long" in globals(), "odds_long missing. Run the odds scraping cell first."

today_str = datetime.utcnow().strftime("%Y-%m-%d")
odds_today = odds_long[odds_long["game_date"] == today_str].copy()

if odds_today.empty:
    raise ValueError(f"‚ùå No odds found for today ({today_str})")

# Normalize opponent and team
odds_today["opp_clean"] = odds_today["opponent"].astype(str).str.replace("@","", regex=False).str.strip()
odds_today["is_away"] = odds_today["opponent"].astype(str).str.startswith("@")
odds_today["home_team"] = odds_today.apply(lambda r: r["opp_clean"] if r["is_away"] else r["team"], axis=1)
odds_today["away_team"] = odds_today.apply(lambda r: r["team"] if r["is_away"] else r["opp_clean"], axis=1)

# Create game key
odds_today["game_key"] = (
    odds_today["game_date"].astype(str) + "_" +
    odds_today["home_team"] + "_" +
    odds_today["away_team"]
)

games_unique = (
    odds_today[["game_date", "game_key", "home_team", "away_team"]]
    .drop_duplicates("game_key")
    .reset_index(drop=True)
)

# Final clean table for today
today_games_clean = pd.concat([
    games_unique.assign(
        game_id = games_unique["game_key"],
        team = games_unique["home_team"],
        opponent = games_unique["away_team"],
        home = 1
    )[["game_id", "team", "opponent", "home", "game_date"]],
    games_unique.assign(
        game_id = games_unique["game_key"],
        team = games_unique["away_team"],
        opponent = games_unique["home_team"],
        home = 0
    )[["game_id", "team", "opponent", "home", "game_date"]],
], ignore_index=True)

print("‚úÖ today_games_clean ready:")
print(today_games_clean.head(10).to_string(index=False))


‚úÖ today_games_clean ready:
           game_id team opponent  home  game_date
2025-11-18_LAL_UTA  LAL      UTA     1 2025-11-18
2025-11-18_POR_PHX  POR      PHX     1 2025-11-18
2025-11-18_BKN_BOS  BKN      BOS     1 2025-11-18
2025-11-18_ORL_GSW  ORL      GSW     1 2025-11-18
2025-11-18_ATL_DET  ATL      DET     1 2025-11-18
2025-11-18_SAS_MEM  SAS      MEM     1 2025-11-18
2025-11-18_LAL_UTA  UTA      LAL     0 2025-11-18
2025-11-18_POR_PHX  PHX      POR     0 2025-11-18
2025-11-18_BKN_BOS  BOS      BKN     0 2025-11-18
2025-11-18_ORL_GSW  GSW      ORL     0 2025-11-18


In [44]:
# -- Cell 16: Robust minutes cleaning + 240-min normalization --------------

import numpy as np
import pandas as pd

nba_boxscores_2025_26 = pd.read_csv("data_raw/nba_boxscores_2025-26.csv")

# Safety checks
assert "minutes_today" in globals(), "minutes_today missing ‚Äî run Cells 13b‚Äì14b first."
assert "today_games_clean" in globals(), "today_games_clean missing ‚Äî run the schedule cell."
assert "nba_boxscores_2025_26" in globals(), "nba_boxscores_2025_26 missing ‚Äî required for filtering active teams."

mt = minutes_today.copy()

# 1) Standardize 'team' column
if "team" not in mt.columns:
    if "TEAM_ABBREVIATION" in mt.columns:
        mt["team"] = mt["TEAM_ABBREVIATION"]
    else:
        raise ValueError("minutes_today must have either 'team' or 'TEAM_ABBREVIATION'.")

mt["team"] = mt["team"].astype(str).str.upper().str.strip()

# 2) Restrict to teams playing today
valid_teams = set(today_games_clean["team"].astype(str).str.upper().unique())
mt = mt[mt["team"].isin(valid_teams)].copy()

# 3) Filter out teams with no boxscore history this season
played_teams = set(nba_boxscores_2025_26["TEAM_ABBREVIATION"].astype(str).str.upper().unique())
mt = mt[mt["team"].isin(played_teams)].copy()

# 4) Map TODAY's opponent from today_games_clean
opp_map = (
    today_games_clean
    .assign(team=lambda d: d["team"].astype(str).str.upper().str.strip())
    .set_index("team")["opponent"]
    .to_dict()
)
mt["opponent"] = mt["team"].map(opp_map)
mt = mt.dropna(subset=["opponent"])

# 5) Remove duplicate PLAYER_IDs (keep highest minutes)
mt = mt.sort_values("pred_minutes", ascending=False)
mt = mt.drop_duplicates(subset=["PLAYER_ID"], keep="first")

# 6) Remove low-confidence players
if "may_not_play" in mt.columns:
    mt = mt[mt["may_not_play"] == 0]
mt = mt[mt["pred_minutes"] > 0].copy()

# 7) Map game_id to enable per-game normalization
if "game_id" not in today_games_clean.columns:
    raise ValueError("‚ùå today_games_clean must contain 'game_id' column.")

game_team_lookup = today_games_clean.set_index("team")["game_id"].to_dict()
mt["game_id"] = mt["team"].map(game_team_lookup)
mt = mt.dropna(subset=["game_id"])

# 8) Normalize predicted minutes to exactly 240 per team per game
def normalize_minutes(df):
    total = df["pred_minutes"].sum()
    if total < 1:
        return df
    scale = 240.0 / total
    df["pred_minutes"] = df["pred_minutes"] * scale
    return df

mt = (
    mt.groupby(["game_id", "team"], group_keys=False)
      .apply(normalize_minutes)
      .reset_index(drop=True)
)

# 9) Final checks
mins_team_check = mt.groupby(["game_id", "team"])["pred_minutes"].sum()
print("‚è± Final normalized minutes per team:")
print(mins_team_check.round(2).to_string())

if not np.allclose(mins_team_check.values, 240.0, atol=0.2):
    raise ValueError("‚ùå Minutes normalization failed: team totals not ~240.")

# Output
minutes_today_clean = mt.copy()
minutes_today = minutes_today_clean.copy()  # overwrite for downstream use

print("‚úÖ minutes_today_clean ready and applied to minutes_today.")
print(minutes_today_clean.head(20).to_string(index=False))


‚è± Final normalized minutes per team:
game_id             team
2025-11-18_ATL_DET  ATL     240.0
                    DET     240.0
2025-11-18_BKN_BOS  BKN     240.0
                    BOS     240.0
2025-11-18_LAL_UTA  LAL     240.0
                    UTA     240.0
2025-11-18_ORL_GSW  GSW     240.0
                    ORL     240.0
2025-11-18_POR_PHX  PHX     240.0
                    POR     240.0
2025-11-18_SAS_MEM  MEM     240.0
                    SAS     240.0
‚úÖ minutes_today_clean ready and applied to minutes_today.
 PLAYER_ID        PLAYER_NAME TEAM_ABBREVIATION OPPONENT_ABBREVIATION  pred_minutes  start_prob  is_starter  may_not_play  injury_prob        player_key team  rotowire_minutes  pred_minutes_internal  minutes_avg_season opponent            game_id
   1641739     Toumani Camara               POR                   DAL     28.387518         0.7           0             0          0.0    toumani camara  POR               NaN                   36.2           34.461538   

In [45]:
# -- Cell 17: per-minute rate models + today projections -----------------------
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error

# ---------------------------------------------------------------------------
# 0) Safety checks / inputs from previous cells
# ---------------------------------------------------------------------------
assert 'features_all' in globals() and isinstance(features_all, pd.DataFrame) and not features_all.empty, \
    "features_all missing. Run Cell 7 + Cell 0_data first."
assert 'minutes_today' in globals() and isinstance(minutes_today, pd.DataFrame) and not minutes_today.empty, \
    "minutes_today missing. Run Cells 5‚Äì7 first."

# ---------------------------------------------------------------------------
# 1) Build leakage-safe rate targets for training
#    We predict per-minute rates (not raw totals); later multiply by pred_minutes.
# ---------------------------------------------------------------------------
train = features_all.copy()

# Ensure numeric MIN
if "MIN" not in train.columns:
    raise RuntimeError("MIN column not found in features_all.")
train["MIN"] = pd.to_numeric(train["MIN"], errors="coerce")

# Filter out super-low minute games (noisy rate)
train = train[train["MIN"].fillna(0) >= 6].copy()

# Targets as same-day rates (no look-ahead)
for stat in ["PTS","REB","AST"]:
    if stat not in train.columns:
        train[stat] = np.nan
    train[f"{stat}_per_min"] = train[stat] / train["MIN"].replace(0, np.nan)

# Drop impossible rows
for stat in ["PTS_per_min","REB_per_min","AST_per_min"]:
    train = train[~np.isinf(train[stat])].copy()

# ---------------------------------------------------------------------------
# 2) Feature set for rate models
#    Avoid using future minutes; keep context/skill/opponent/usage style features.
# ---------------------------------------------------------------------------
CANDIDATE_FEATURES = [
    # form / efficiency (all must be shift(1) upstream in features_all)
    "TS_game_roll5","TS_game_roll10",
    "MIN_roll5","MIN_roll10",            # ok: proxy for role, but target is rate not minutes
    "PTS_roll5","PTS_roll10",
    "REB_roll5","REB_roll10",
    "AST_roll5","AST_roll10",
    "usage_share_roll5",

    # season labels
    "PER","TS%","USG%","ORtg","DRtg","WS/48","BPM","VORP",

    # team/matchup context (shifted rolling at team level)
    "ORtg_g_roll5","DRtg_g_roll5","Pace_g_roll5",

    # optional opponent allowances by position (if present from Cell 7)
    # Common column names look like: PTS_allowed_roll10_PG, AST_allowed_roll10_C, etc.
    # We'll auto-include any *_allowed_roll10_* columns if present:
] + [c for c in features_all.columns if "_allowed_roll10_" in c]

# Situational flags (can influence rate a bit)
SITUATIONAL = ["HOME","days_rest","is_b2b"]
CANDIDATE_FEATURES += [c for c in SITUATIONAL if c in features_all.columns]

# Robust final feature list (present in the dataframe)
RATE_FEATURES = [c for c in CANDIDATE_FEATURES if c in train.columns]

print(f"Using {len(RATE_FEATURES)} features for rate models.")

# ---------------------------------------------------------------------------
# 3) Train one model per stat rate with GroupKFold by player
# ---------------------------------------------------------------------------
models_rate = {}
cv_scores_rate = {}
gkf = GroupKFold(n_splits=5)

def train_rate_model(df: pd.DataFrame, target_col: str, feat_cols: list[str]):
    df_ = df.dropna(subset=feat_cols + [target_col, "PLAYER_ID"]).copy()
    X = df_[feat_cols]
    y = df_[target_col]
    groups = df_["PLAYER_ID"]

    fold_mae = []
    for tr, te in gkf.split(X, y, groups):
        Xtr, Xte = X.iloc[tr], X.iloc[te]
        ytr, yte = y.iloc[tr], y.iloc[te]

        model = XGBRegressor(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=5,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_lambda=1.0,
            reg_alpha=0.0,
            random_state=42,
            n_jobs=-1,
            verbosity=0
        )
        model.fit(Xtr, ytr)
        pred = model.predict(Xte)
        fold_mae.append(mean_absolute_error(yte, pred))

    model.fit(X, y)  # final fit
    return model, float(np.mean(fold_mae)), float(np.std(fold_mae))

for stat in ["PTS","REB","AST"]:
    target = f"{stat}_per_min"
    model, m, s = train_rate_model(train, target, RATE_FEATURES)
    models_rate[stat] = model
    cv_scores_rate[stat] = (m, s)
    print(f"üìè {stat}_per_min MAE: {m:.4f} ¬± {s:.4f}")

# ---------------------------------------------------------------------------
# 4) Project TODAY's rates and totals = rate * predicted minutes
# ---------------------------------------------------------------------------
# Build today's feature frame: take latest per player and align with RATE_FEATURES.
latest_today = (
    features_all.sort_values(["PLAYER_NAME","GAME_DATE"])
                .groupby("PLAYER_NAME", as_index=False)
                .tail(1)
                .copy()
)

# Ensure all feature columns exist; if not, fill with neutral values
for c in RATE_FEATURES:
    if c not in latest_today.columns:
        latest_today[c] = 0.0

X_today = latest_today[RATE_FEATURES].copy()

# Join minutes prediction
mt = minutes_today.rename(columns={"PLAYER_NAME":"PLAYER_NAME_mins"})
proj_base = latest_today.merge(
    mt[["PLAYER_ID","pred_minutes","start_prob","is_starter","may_not_play","injury_prob"]],
    on="PLAYER_ID", how="left"
)

# If a player lacks a minutes prediction, give a conservative fallback
proj_base["pred_minutes"] = proj_base["pred_minutes"].fillna(proj_base.get("MIN_roll5", 24)).clip(0, 48)

# Predict rates
out_frames = []
for stat in ["PTS","REB","AST"]:
    mu_rate = models_rate[stat].predict(X_today)

    # Simple uncertainty model:
    #   - empirical residual SD in rate space per stat (global)
    #   - then combine with minutes variance (approx) for total SD
    # Build residual SD once from training data
    df_t = train.dropna(subset=RATE_FEATURES + [f"{stat}_per_min","PLAYER_ID"]).copy()
    pred_rate_t = models_rate[stat].predict(df_t[RATE_FEATURES])
    resid = (df_t[f"{stat}_per_min"] - pred_rate_t).values
    sd_rate = np.nanstd(resid, ddof=1) if len(resid) > 8 else 0.0

    # Minutes uncertainty proxy from minutes model signals:
    #   baseline 3.0 min SD, boosted if not confirmed starter or has injury_prob
    min_sd = 3.0 \
             + 4.0*(1.0 - proj_base["start_prob"].fillna(0.7).values) \
             + 4.0*(proj_base["injury_prob"].fillna(0.0).values)

    # Combine:
    #   Var(total) ‚âà Var(rate*min) ‚âà E[min]^2 * Var(rate) + E[rate]^2 * Var(min)
    pred_min = proj_base["pred_minutes"].values
    var_total = (pred_min**2) * (sd_rate**2) + (mu_rate**2) * (min_sd**2)
    sd_total = np.sqrt(np.maximum(var_total, 1e-6))

    totals = mu_rate * pred_min

    df_out = pd.DataFrame({
        "PLAYER_ID": proj_base["PLAYER_ID"],
        "PLAYER_NAME": proj_base["PLAYER_NAME"],
        "TEAM_ABBREVIATION": proj_base["TEAM_ABBREVIATION"],
        "OPPONENT_ABBREVIATION": proj_base["OPPONENT_ABBREVIATION"],
        "market": stat,
        "projection_mean": totals,
        "projection_sd": sd_total,
        "pred_minutes": pred_min,
        "pred_rate": mu_rate,
        "start_prob": proj_base["start_prob"].round(2),
        "is_starter": proj_base["is_starter"].fillna(0).astype(int),
        "may_not_play": proj_base["may_not_play"].fillna(0).astype(int),
        "injury_prob": proj_base["injury_prob"].fillna(0.0).round(2)
    })
    out_frames.append(df_out)

proj_base["OPPONENT_ABBREVIATION"] = proj_base["OPPONENT_ABBREVIATION"].fillna("UNK").astype(str)

df_projections_all = pd.concat(out_frames, ignore_index=True)

# Normalize export columns like your Cell 16 pipeline expects
df_projections_all = df_projections_all.rename(columns={
    "PLAYER_NAME": "player",
    "TEAM_ABBREVIATION": "team",
    "OPPONENT_ABBREVIATION": "opponent",
    "pred_minutes": "projected_minutes"
})
df_projections_all["game_date"] = pd.Timestamp.utcnow().strftime("%Y-%m-%d")

print("‚úÖ Projections ready:")
print(df_projections_all.groupby("market")["player"].count().to_dict())
print(df_projections_all.head(40).to_string(index=False))

Using 23 features for rate models.
üìè PTS_per_min MAE: 0.1620 ¬± 0.0021
üìè REB_per_min MAE: 0.0774 ¬± 0.0028
üìè AST_per_min MAE: 0.0568 ¬± 0.0014
‚úÖ Projections ready:
{'AST': 756, 'PTS': 756, 'REB': 756}
 PLAYER_ID             player team opponent market  projection_mean  projection_sd  projected_minutes  pred_rate  start_prob  is_starter  may_not_play  injury_prob  game_date
   1630639        A.J. Lawson  TOR      SAS    PTS        11.208425       4.996829          24.600000   0.455627         NaN           0             0          0.0 2025-11-18
   1631260           AJ Green  MIL      CLE    PTS         8.700673       5.110123          26.200000   0.332087         NaN           0             0          0.0 2025-11-18
   1631100         AJ Griffin  ATL      IND    PTS         2.273416       1.949360           6.000000   0.378903         NaN           0             0          0.0 2025-11-18
   1642358         AJ Johnson  WAS      BKN    PTS         1.466789       1.520712      

In [None]:
# -- Cell 18 (fixed): Assign today's opponent to each player -----------------------

assert "today_games_clean" in globals()

# Create lookup: team -> opponent
opp_map = today_games_clean.set_index("team")["opponent"].to_dict()

# Assign opponent to each player
df_projections_all["opponent"] = df_projections_all["team"].map(opp_map)

print(df_projections_all[["player","team","opponent"]].head(20))

# Filter only teams in today‚Äôs slate
valid_teams = set(today_games_clean["team"].unique())

df_projections_all = df_projections_all[
    df_projections_all["team"].isin(valid_teams)
].copy()

print("Remaining teams:", df_projections_all["team"].unique())


AssertionError: 

In [47]:
# -- Sanity check: projected minutes per team today -------------------------
import pandas as pd

assert "minutes_today" in globals(), "minutes_today missing."
assert "today_games_clean" in globals(), "today_games_clean from Cell X missing."

# Map players to today's teams only
mt_today = minutes_today.copy()
if "team" not in mt_today.columns and "TEAM_ABBREVIATION" in mt_today.columns:
    mt_today["team"] = mt_today["TEAM_ABBREVIATION"]

teams_today = today_games_clean["team"].unique()
mt_today = mt_today[mt_today["team"].isin(teams_today)].copy()

# Sum minutes per team
mins_by_team = (
    mt_today.groupby("team")["pred_minutes"]
    .sum()
    .reset_index()
    .rename(columns={"pred_minutes":"team_total_minutes"})
)

print("‚è± Projected total minutes per team:")
print(mins_by_team.to_string(index=False))


‚è± Projected total minutes per team:
team  team_total_minutes
 ATL               240.0
 BKN               240.0
 BOS               240.0
 DET               240.0
 GSW               240.0
 LAL               240.0
 MEM               240.0
 ORL               240.0
 PHX               240.0
 POR               240.0
 SAS               240.0
 UTA               240.0


In [None]:
import matplotlib.pyplot as plt

for stat, model in models_rate.items():
    importance = model.feature_importances_
    sorted_idx = np.argsort(importance)[::-1]
    sorted_feats = [RATE_FEATURES[i] for i in sorted_idx]
    sorted_imp = importance[sorted_idx]

    plt.figure(figsize=(8, 5))
    plt.title(f"Feature Importance: {stat}_per_min")
    plt.barh(sorted_feats[:12][::-1], sorted_imp[:12][::-1])  # Top 12
    plt.tight_layout()
    plt.show()


In [None]:
print("Features used in rate models:\n", RATE_FEATURES)


## Analysis

In [None]:
# # -- cell 8_rate_model ---------------------------------------------------------
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import TimeSeriesSplit
# from sklearn.metrics import mean_absolute_error
# from xgboost import XGBRegressor

# # ---------------------------------------------------------------------------
# # 0) Safety checks
# # ---------------------------------------------------------------------------
# assert 'features_all' in globals() and isinstance(features_all, pd.DataFrame) and not features_all.empty, \
#     "features_all missing. Run Cell 7 + Cell 0_data first."
# assert 'minutes_today' in globals() and isinstance(minutes_today, pd.DataFrame) and not minutes_today.empty, \
#     "minutes_today missing. Build your base minutes first."

# fa = features_all.copy()
# if "GAME_DATE" in fa.columns:
#     fa["GAME_DATE"] = pd.to_datetime(fa["GAME_DATE"])

# # Minimal id keys (best-effort)
# if "PLAYER_ID" not in fa.columns:
#     # fabricate a stable id per name if you don't have PLAYER_ID
#     fa["PLAYER_ID"] = fa["PLAYER_NAME"].factorize()[0] + 1

# # ---------------------------------------------------------------------------
# # 1) Build leakage-safe RATE targets using next game's totals/minutes
# #    target_rate(stat) = stat_next / max(MIN_next, 1)
# # ---------------------------------------------------------------------------
# fa = fa.sort_values(["PLAYER_NAME","GAME_DATE"]).copy()
# for col in ["MIN","PTS","REB","AST"]:
#     if col not in fa.columns:
#         fa[col] = np.nan

# fa["MIN_next"] = fa.groupby("PLAYER_NAME")["MIN"].shift(-1)
# fa["PTS_next"] = fa.groupby("PLAYER_NAME")["PTS"].shift(-1)
# fa["REB_next"] = fa.groupby("PLAYER_NAME")["REB"].shift(-1)
# fa["AST_next"] = fa.groupby("PLAYER_NAME")["AST"].shift(-1)

# def _rate_next(numer_next, min_next):
#     m = np.maximum(min_next.astype(float), 1.0)
#     return numer_next.astype(float) / m

# fa["PTS_rate_next"] = _rate_next(fa["PTS_next"], fa["MIN_next"])
# fa["REB_rate_next"] = _rate_next(fa["REB_next"], fa["MIN_next"])
# fa["AST_rate_next"] = _rate_next(fa["AST_next"], fa["MIN_next"])

# # ---------------------------------------------------------------------------
# # 2) Feature set (reuse what you already engineered if present)
# #    Keep it robust to missing columns.
# # ---------------------------------------------------------------------------
# BASE_FEATURES = [
#     "MIN_roll5","MIN_roll10","TS_game_roll5","TS_game_roll10",
#     "usage_share_roll5","ORtg_g_roll5","DRtg_g_roll5","Pace_g_roll5",
#     "PER","TS%","USG%","ORtg","DRtg","WS/48","BPM","VORP",
#     "days_rest","HOME",
#     # Extras if your pipeline created them:
#     "PTS_roll5","PTS_roll10","REB_roll5","REB_roll10","AST_roll5","AST_roll10",
#     "PTS_trend","REB_trend","AST_trend",
#     "PTS_roll5_std","REB_roll5_std","AST_roll5_std",
#     "usage_minutes_interact","ts_usage_interact",
#     "opp_ORtg_g_roll5","opp_DRtg_g_roll5","opp_Pace_g_roll5","pace_diff5"
# ]
# TARGETS = {
#     "PTS": "PTS_rate_next",
#     "REB": "REB_rate_next",
#     "AST": "AST_rate_next",
# }

# def _present(cols): 
#     return [c for c in cols if c in fa.columns]

# # ---------------------------------------------------------------------------
# # 3) Train XGBRegressor per stat on RATE targets (time-ordered CV)
# # ---------------------------------------------------------------------------
# models_rate = {}
# cv_scores = {}
# tscv = TimeSeriesSplit(n_splits=5)

# for stat, tgt in TARGETS.items():
#     feat_cols = _present(BASE_FEATURES)
#     need = feat_cols + ["PLAYER_ID","GAME_DATE","MIN_next", tgt]
#     data = fa.dropna(subset=[c for c in need if c in fa.columns]).copy()

#     # Optional: restrict to games where next minutes >= 6 to reduce noisy targets
#     data = data[data["MIN_next"] >= 6].copy()

#     if data.empty:
#         print(f"‚ö†Ô∏è No training data for {stat}. Skipping.")
#         continue

#     data = data.sort_values("GAME_DATE")
#     X = data[feat_cols].fillna(0.0)
#     y = data[tgt].astype(float)

#     fold_mae=[]
#     for tr_idx, te_idx in tscv.split(X):
#         Xtr, Xte = X.iloc[tr_idx], X.iloc[te_idx]
#         ytr, yte = y.iloc[tr_idx], y.iloc[te_idx]
#         model = XGBRegressor(
#             n_estimators=500, learning_rate=0.05, max_depth=5,
#             subsample=0.85, colsample_bytree=0.9,
#             reg_lambda=1.0, reg_alpha=0.0,
#             random_state=42, n_jobs=-1, verbosity=0
#         )
#         model.fit(Xtr, ytr)
#         pred = model.predict(Xte)
#         fold_mae.append(mean_absolute_error(yte, pred))
#     cv_scores[stat] = (float(np.mean(fold_mae)), float(np.std(fold_mae)))
#     print(f"Rate {stat} MAE (TimeSeries CV): {np.mean(fold_mae):.3f} ¬± {np.std(fold_mae):.3f}")

#     final_model = XGBRegressor(
#         n_estimators=600, learning_rate=0.045, max_depth=5,
#         subsample=0.9, colsample_bytree=0.9,
#         reg_lambda=1.0, reg_alpha=0.0,
#         random_state=42, n_jobs=-1, verbosity=0
#     )
#     final_model.fit(X, y)
#     models_rate[stat] = (final_model, feat_cols)

# # ---------------------------------------------------------------------------
# # 4) Build today's feature rows and merge with minutes_today
# #    Use each player's latest historical row as "today context".
# # ---------------------------------------------------------------------------
# latest = (
#     fa.sort_values(["PLAYER_NAME","GAME_DATE"])
#       .groupby("PLAYER_NAME", as_index=False)
#       .tail(1)
#       .copy()
# )

# # Normalize team/opponent keys
# for c in ["TEAM_ABBREVIATION","OPPONENT_ABBREVIATION"]:
#     if c not in latest.columns:
#         latest[c] = np.nan

# # align to players we have minutes for today
# key_cols = ["PLAYER_NAME","TEAM_ABBREVIATION","OPPONENT_ABBREVIATION"]
# mt = minutes_today.copy()
# if "PLAYER_ID" not in mt.columns:
#     # map PLAYER_ID from latest by name when missing
#     name_to_id = latest.set_index("PLAYER_NAME")["PLAYER_ID"].to_dict()
#     mt["PLAYER_ID"] = mt["PLAYER_NAME"].map(name_to_id)

# today = pd.merge(
#     latest,
#     mt[["PLAYER_ID","PLAYER_NAME","TEAM_ABBREVIATION","OPPONENT_ABBREVIATION",
#         "pred_minutes","start_prob"]].drop_duplicates("PLAYER_ID"),
#     on=["PLAYER_ID","PLAYER_NAME","TEAM_ABBREVIATION","OPPONENT_ABBREVIATION"],
#     how="inner",
#     suffixes=("","")
# )

# if today.empty:
#     raise RuntimeError("No overlap between latest history and minutes_today. Check name/team keys.")

# # ---------------------------------------------------------------------------
# # 5) Predict per-minute rates ‚Üí multiply by predicted minutes ‚Üí totals
# #    Also compute a reasonable per-player SD per market.
# # ---------------------------------------------------------------------------
# proj_frames = []
# for stat, (model, feat_cols) in models_rate.items():
#     cols = [c for c in feat_cols if c in today.columns]
#     if not cols:
#         print(f"‚ö†Ô∏è No features present for {stat}. Skipping.")
#         continue

#     Xp = today[cols].fillna(0.0)
#     rate_hat = model.predict(Xp)                      # predicted stat per minute
#     mins_hat = today["pred_minutes"].astype(float).clip(lower=0, upper=48).values
#     total_hat = np.clip(rate_hat * mins_hat, 0, None)

#     # SD heuristic:
#     # - player-specific per-minute volatility from last 10 games
#     # - scaled by predicted minutes
#     # - floor to avoid zero SD
#     hist = (
#         fa.assign(rate=lambda d: np.where(d["MIN"]>0, d[stat] / d["MIN"], np.nan))
#           .sort_values(["PLAYER_NAME","GAME_DATE"])
#     )
#     sd_map = (
#         hist.groupby("PLAYER_NAME")["rate"]
#             .apply(lambda s: s.tail(10).std(ddof=1))
#             .replace([np.inf,-np.inf], np.nan)
#     ).to_dict()
#     sd_rate = today["PLAYER_NAME"].map(sd_map).astype(float).fillna(0.10)   # fallback 0.10 per min
#     sd_total = (sd_rate.values * np.sqrt(np.maximum(mins_hat, 1.0))).clip(0.75, None)

#     dfp = pd.DataFrame({
#         "player": today["PLAYER_NAME"],
#         "team": today["TEAM_ABBREVIATION"],
#         "opponent": today["OPPONENT_ABBREVIATION"],
#         "game_date": pd.Timestamp.utcnow().strftime("%Y-%m-%d"),
#         "market": stat,
#         "projected_minutes": mins_hat,
#         "start_prob": today.get("start_prob", pd.Series(0.75, index=today.index)).fillna(0.75).values,
#         "projection_mean": total_hat,
#         "projection_sd": sd_total
#     })
#     proj_frames.append(dfp)

# df_projections_all = pd.concat(proj_frames, ignore_index=True) if proj_frames else pd.DataFrame()
# df_projections_pts = df_projections_all[df_projections_all["market"].eq("PTS")].copy() if not df_projections_all.empty else pd.DataFrame()
# df_projections_reb = df_projections_all[df_projections_all["market"].eq("REB")].copy() if not df_projections_all.empty else pd.DataFrame()
# df_projections_ast = df_projections_all[df_projections_all["market"].eq("AST")].copy() if not df_projections_all.empty else pd.DataFrame()

# print("\n‚úÖ Rate models trained and projections built.")
# print("CV (MAE on rate targets):", cv_scores)
# print("Projection rows by market:", df_projections_all["market"].value_counts().to_dict() if not df_projections_all.empty else {})
# print(df_projections_all.head(10).to_string(index=False))


In [None]:
# #--cell 8--#
# from sklearn.model_selection import TimeSeriesSplit
# from sklearn.metrics import mean_absolute_error
# from xgboost import XGBRegressor
# import numpy as np
# import pandas as pd

# # --- Load your logs and enriched season files ---
# logs_2324 = pd.read_csv("nba_boxscores_2023-24.csv")
# logs_2425 = pd.read_csv("nba_boxscores_2024-25.csv")
# enriched_2324 = pd.read_csv("nba_player_stats_2023_24_enriched.csv")
# enriched_2425 = pd.read_csv("nba_player_stats_2024_25_enriched.csv")

# # --- Build feature tables per season and concatenate ---
# feat_2324 = assemble_player_game_features(logs_2324, enriched_2324)
# feat_2425 = assemble_player_game_features(logs_2425, enriched_2425)
# features_all = pd.concat([feat_2324, feat_2425], ignore_index=True)

# # --- Base feature pool ---
# BASE_FEATURES = [
#     "MIN_roll5", "MIN_roll10", "TS_game_roll5", "TS_game_roll10", "usage_share_roll5",
#     "ORtg_g_roll5", "DRtg_g_roll5", "Pace_g_roll5",
#     "PER", "TS%", "USG%", "ORtg", "DRtg", "WS/48", "BPM", "VORP",
#     "days_rest", "HOME"
# ]
# STAT_ROLLING = {
#     "PTS": ["PTS_roll5", "PTS_roll10"],
#     "REB": ["REB_roll5", "REB_roll10"],
#     "AST": ["AST_roll5", "AST_roll10"],
# }
# TARGETS = {
#     "PTS": "PTS_next",
#     "REB": "REB_next",
#     "AST": "AST_next",
# }

# models = {}
# feature_cols_by_stat = {}
# cv_scores = {}
# tscv = TimeSeriesSplit(n_splits=5)

# for stat, target_col in TARGETS.items():
#     cand_feats = BASE_FEATURES + STAT_ROLLING[stat]
#     feat_cols = [c for c in cand_feats if c in features_all.columns]
#     feature_cols_by_stat[stat] = feat_cols

#     data = features_all.dropna(subset=feat_cols + [target_col]).copy()
#     if data.empty:
#         print(f"‚ö†Ô∏è No training data for {stat}. Skipping.")
#         continue

#     data_sorted = data.sort_values("GAME_DATE")
#     X = data_sorted[feat_cols]
#     y = data_sorted[target_col]

#     maes = []
#     for train_idx, test_idx in tscv.split(X):
#         Xtr, Xte = X.iloc[train_idx], X.iloc[test_idx]
#         ytr, yte = y.iloc[train_idx], y.iloc[test_idx]
#         model = XGBRegressor(
#             n_estimators=300,
#             learning_rate=0.1,
#             max_depth=4,
#             subsample=0.8,
#             colsample_bytree=0.8,
#             random_state=42,
#             n_jobs=-1,
#             verbosity=0
#         )
#         model.fit(Xtr, ytr)
#         pred = model.predict(Xte)
#         maes.append(mean_absolute_error(yte, pred))

#     cv_scores[stat] = (float(np.mean(maes)), float(np.std(maes)))
#     print(f"XGBoost Player {stat} MAE (TimeSeries CV): {np.mean(maes):.2f} ¬± {np.std(maes):.2f}")

#     final_model = XGBRegressor(
#         n_estimators=300,
#         learning_rate=0.1,
#         max_depth=4,
#         subsample=0.8,
#         colsample_bytree=0.8,
#         random_state=42,
#         n_jobs=-1,
#         verbosity=0
#     )
#     final_model.fit(X, y)
#     models[stat] = final_model

# if "PTS" in models:
#     model = models["PTS"]
#     feature_cols = feature_cols_by_stat["PTS"]


In [None]:
# -- cell 16 (feature importances ‚Äî robust for models_rate/models_mean/models) ---
import os
from datetime import datetime
import pandas as pd

try:
    from IPython.display import display
except Exception:
    def display(x): print(x)

# 1) Detect trained models + their feature maps
_models = None
_feat_map = {}

def _is_tuple_model(v):
    # supports (model, feat_cols) or [model, feat_cols]
    return isinstance(v, (tuple, list)) and len(v) >= 1

if "models_rate" in globals() and isinstance(models_rate, dict) and models_rate:
    # models_rate can be {"PTS": model} OR {"PTS": (model, feat_cols)}
    _models = {}
    _feat_map = {}
    for k, v in models_rate.items():
        if _is_tuple_model(v):
            _models[k] = v[0]
            _feat_map[k] = list(v[1]) if len(v) > 1 else []
        else:
            _models[k] = v
            # fallback: use RATE_FEATURES if present, else empty
            _feat_map[k] = list(globals().get("RATE_FEATURES", []))
elif "models_mean" in globals() and isinstance(models_mean, dict) and models_mean:
    _models = models_mean
    if "feature_bags" in globals() and isinstance(feature_bags, dict):
        _feat_map = feature_bags
    else:
        # fallback: use RATE_FEATURES if present
        rf = list(globals().get("RATE_FEATURES", []))
        _feat_map = {k: rf for k in _models.keys()}
elif "models" in globals() and isinstance(models, dict) and models:
    _models = models
    if "feature_cols_by_stat" in globals() and isinstance(feature_cols_by_stat, dict):
        _feat_map = feature_cols_by_stat
    else:
        rf = list(globals().get("RATE_FEATURES", []))
        _feat_map = {k: rf for k in _models.keys()}

if not _models:
    raise RuntimeError("No trained models found (expected models_rate / models_mean / models). Train first.")

# 2) Importance extractor for XGBoost (gain -> weight -> sklearn attr)
def _xgb_importances(mdl, feat_cols):
    # Try booster-based importances
    booster = None
    try:
        booster = mdl.get_booster()
    except Exception:
        pass

    if booster is not None:
        # prefer 'gain', else 'weight'
        try:
            raw = booster.get_score(importance_type="gain")
        except Exception:
            raw = booster.get_score(importance_type="weight")

        s = pd.Series(raw, dtype=float)
        if not s.empty:
            # Map f0,f1,... to actual names if available
            feat_names = None
            try:
                feat_names = booster.feature_names
            except Exception:
                pass

            if feat_names and all(isinstance(x, str) for x in feat_names):
                if all(k.startswith("f") and k[1:].isdigit() for k in s.index):
                    idx_map = {f"f{i}": feat_names[i] for i in range(len(feat_names))}
                    s.index = [idx_map.get(k, k) for k in s.index]
            elif feat_cols:
                # last-resort: align by index order if keys look like f0,f1,...
                if all(k.startswith("f") and k[1:].isdigit() for k in s.index) and len(feat_cols) >= len(s):
                    idx_map = {f"f{i}": feat_cols[i] for i in range(len(feat_cols))}
                    s.index = [idx_map.get(k, k) for k in s.index]
            return s.sort_values(ascending=False)

    # Fallback to sklearn-style attribute
    if hasattr(mdl, "feature_importances_") and feat_cols:
        s = pd.Series(mdl.feature_importances_, index=feat_cols, dtype=float)
        return s.sort_values(ascending=False)

    # Last fallback: return empty
    return pd.Series(dtype=float)

# 3) Build & display per-stat importances
rows = []
print("=== Feature Importances (top 15 by stat) ===")
for stat, mdl in _models.items():
    feat_cols = _feat_map.get(stat, [])
    imp = _xgb_importances(mdl, feat_cols)

    print(f"\nTop 15 ‚Äî {stat}:")
    if imp.empty:
        print("(no importance info available)")
        continue

    display(imp.head(15))
    for feat, val in imp.items():
        rows.append({"stat": stat, "feature": feat, "importance": float(val)})

# 4) Save tidy CSV + normalized pivot
imp_df = pd.DataFrame(rows)
if not imp_df.empty:
    out_dir = "model_outputs_rate"
    os.makedirs(out_dir, exist_ok=True)

    imp_df["importance_norm"] = (
        imp_df.groupby("stat")["importance"].transform(lambda x: x / (x.sum() if x.sum() else 1.0))
    )

    ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
    out_path = os.path.join(out_dir, f"feature_importances_{ts}.csv")
    imp_df.sort_values(["stat", "importance"], ascending=[True, False]).to_csv(out_path, index=False)
    print(f"\n‚úÖ Saved importances to: {out_path}")

    feature_importance_wide = imp_df.pivot_table(
        index="feature", columns="stat", values="importance_norm", aggfunc="max", fill_value=0.0
    ).sort_values(by=list(_models.keys())[0] if _models else None, ascending=False)
    print("\n(Preview) Normalized importance wide table:")
    display(feature_importance_wide.head(20))
else:
    print("\n Nothing to export ‚Äî no importances produced.")


## team-level predictions

In [None]:
# #--cell 10--#
# # Team game table
# team_games = features_all.groupby(["GAME_DATE","TEAM_ABBREVIATION"], as_index=False)\
#     .agg(
#         team_pts=("PTS","sum"),
#         team_pts_next=("PTS_next","sum"),
#         or5=("ORtg_g_roll5","mean"),
#         dr5=("DRtg_g_roll5","mean"),
#         pace5=("Pace_g_roll5","mean"),
#     )

# # Join opponent features (same date)
# opp = team_games.rename(columns={
#     "TEAM_ABBREVIATION":"OPPONENT_ABBREVIATION",
#     "team_pts":"opp_pts",
#     "team_pts_next":"opp_pts_next",
#     "or5":"opp_or5","dr5":"opp_dr5","pace5":"opp_pace5"
# })
# team_matchups = team_games.merge(opp, on=["GAME_DATE"], how="inner")

# # Simple features for team total prediction
# team_feature_cols = ["or5","dr5","pace5","opp_or5","opp_dr5","opp_pace5"]
# tm = team_matchups.dropna(subset=team_feature_cols + ["team_pts_next"]).copy()

# from sklearn.linear_model import Ridge
# X_tm = tm[team_feature_cols]
# y_tm = tm["team_pts_next"]
# ridge = Ridge(alpha=5.0).fit(X_tm, y_tm)
# print("Team PTS baseline R^2:", ridge.score(X_tm, y_tm))


In [None]:
#-- Cell 17 (value bet finder) --
import pandas as pd
import numpy as np
from datetime import datetime

def american_to_prob(odds):
    if pd.isna(odds): return np.nan
    o = float(odds)
    return 100.0/(o+100.0) if o>0 else (-o)/(-o+100.0)

def devig_pair(p_over, p_under):
    if pd.isna(p_over) or pd.isna(p_under): return (np.nan, np.nan)
    s = p_over + p_under
    if s <= 0: return (np.nan, np.nan)
    return (p_over/s, p_under/s)

def kelly_fraction(p, american_odds, cap=0.25):
    if pd.isna(p) or pd.isna(american_odds): return 0.0
    o = float(american_odds)
    b = o/100.0 if o>0 else 100.0/(-o)
    f = (p*(b+1)-1)/b
    return float(max(0.0, min(f, cap)))

def ev_flat_over(p, american_odds):
    if pd.isna(p) or pd.isna(american_odds): return np.nan
    o = float(american_odds)
    win = o/100.0 if o>0 else 100.0/(-o)
    lose = 1.0
    return p*win - (1-p)*lose

# Normal CDF helper (if SciPy available) to turn mean/sd into p_over
try:
    from scipy.stats import norm
    def p_over_from_normal(mu, sd, line):
        if pd.isna(mu) or pd.isna(sd) or pd.isna(line) or sd <= 0: return np.nan
        return 1.0 - norm.cdf((line - mu)/sd)
except Exception:
    def p_over_from_normal(mu, sd, line): return np.nan

def build_value_bets_excel(
    df_projections, df_odds, outfile_path=None,
    join_keys=("player","team","opponent","market","line","book","game_date"),
    cap_kelly=0.25
):
    def _norm(x): return None if pd.isna(x) else str(x).strip()
    proj, odds = df_projections.copy(), df_odds.copy()
    for k in join_keys:
        if k in proj: proj[k] = proj[k].map(_norm)
        if k in odds: odds[k] = odds[k].map(_norm)

    merged = proj.merge(odds, on=list(join_keys), how="inner", suffixes=("", "_odds"))

    if "p_over_model" not in merged.columns or merged["p_over_model"].isna().all():
        merged["p_over_model"] = merged.apply(
            lambda r: p_over_from_normal(r.get("projection_mean"), r.get("projection_sd"), r.get("line")), axis=1
        )

    merged["p_over_imp"]  = merged["over_odds"].map(american_to_prob)
    merged["p_under_imp"] = merged["under_odds"].map(american_to_prob)
    merged[["p_over_fair","p_under_fair"]] = merged.apply(
        lambda r: pd.Series(devig_pair(r["p_over_imp"], r["p_under_imp"])), axis=1
    )

    merged["edge_over"]       = merged["p_over_model"] - merged["p_over_fair"]
    merged["kelly_frac_over"] = merged.apply(lambda r: kelly_fraction(r["p_over_model"], r["over_odds"], cap=cap_kelly), axis=1)
    merged["EV_over_1u"]      = merged.apply(lambda r: ev_flat_over(r["p_over_model"], r["over_odds"]), axis=1)
    merged["asof_date"]       = merged.get("asof_date") if "asof_date" in merged else datetime.utcnow().strftime("%Y-%m-%d")

    preferred = [
        "asof_date","game_date","book","player","team","opponent","market","line","lineup_status",
        "over_odds","under_odds","p_over_imp","p_under_imp","p_over_fair","p_under_fair","p_over_model",
        "edge_over","kelly_frac_over","EV_over_1u",
        "projected_minutes","projection_mean","projection_sd","start_prob",
        "opponent_allowance_idx","team_orating","opp_drating",
    ]
    cols = [c for c in preferred if c in merged.columns] + [c for c in merged.columns if c not in preferred]
    bets = merged[cols].sort_values(["edge_over","EV_over_1u"], ascending=False).reset_index(drop=True)

    summary = pd.DataFrame({
        "n_bets":[len(bets)],
        "avg_edge_pp":[bets["edge_over"].mean()*100.0 if len(bets) else np.nan],
        "avg_kelly_pct":[bets["kelly_frac_over"].mean()*100.0 if len(bets) else np.nan],
        "avg_ev_1u":[bets["EV_over_1u"].mean() if len(bets) else np.nan],
    })
    by_market = bets.groupby("market", dropna=False).agg(
        n=("player","count"),
        avg_edge_pp=("edge_over", lambda x: 100.0*x.mean()),
        avg_kelly_pct=("kelly_frac_over", lambda x: 100.0*x.mean()),
        avg_ev_1u=("EV_over_1u","mean")
    ).reset_index()
    by_book = bets.groupby("book", dropna=False).agg(
        n=("player","count"),
        avg_edge_pp=("edge_over", lambda x: 100.0*x.mean()),
        avg_ev_1u=("EV_over_1u","mean")
    ).reset_index()

    if outfile_path is None:
        outfile_path = f"nba_value_bets_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}.xlsx"
    with pd.ExcelWriter(outfile_path, engine="openpyxl") as w:
        bets.to_excel(w, sheet_name="Bets", index=False)
        summary.to_excel(w, sheet_name="Summary", index=False, startrow=0)
        by_market.to_excel(w, sheet_name="Summary", index=False, startrow=5)
        by_book.to_excel(w, sheet_name="Summary", index=False, startrow=5+len(by_market)+3)

        dd = pd.DataFrame([
            ("asof_date","UTC run date"), ("game_date","Game date"),
            ("player","Player"), ("team","Team abbr"), ("opponent","Opponent abbr"),
            ("market","PTS/REB/AST/3PM/PRA etc."), ("line","Book line"), ("book","Sportsbook id"),
            ("lineup_status","EXPECTED/CONFIRMED/UNKNOWN"),
            ("over_odds","American odds Over"), ("under_odds","American odds Under"),
            ("p_over_imp","Implied prob Over (pre-vig)"), ("p_under_imp","Implied prob Under (pre-vig)"),
            ("p_over_fair","De-vigged prob Over"), ("p_under_fair","De-vigged prob Under"),
            ("p_over_model","Model prob Over"), ("edge_over","p_model ‚àí p_fair"),
            ("kelly_frac_over","Kelly fraction (cap)"), ("EV_over_1u","EV if staking 1u"),
            ("projected_minutes","Projected minutes"), ("projection_mean","Projected mean"),
            ("projection_sd","Projected stdev"), ("start_prob","Start probability"),
            ("opponent_allowance_idx","Opponent allowance index"),
            ("team_orating","Team ORtg"), ("opp_drating","Opponent DRtg"),
        ], columns=["column","description"])
        dd.to_excel(w, sheet_name="Data_Dictionary", index=False)

    return bets, outfile_path


In [None]:
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import TimeSeriesSplit
# from sklearn.metrics import mean_absolute_error
# from xgboost import XGBRegressor

# # Safety: need features_all built (from your Cell 7 + 0_data)
# assert 'features_all' in globals() and isinstance(features_all, pd.DataFrame) and not features_all.empty, \
#     "features_all missing. Run Cell 7 + 0_data first."

# # Ensure GAME_DATE is datetime and sort
# if "GAME_DATE" in features_all.columns:
#     features_all["GAME_DATE"] = pd.to_datetime(features_all["GAME_DATE"])
# features_all = features_all.sort_values(["PLAYER_NAME","GAME_DATE"]).copy()

# # ---------------- Features / Targets ----------------
# BASE_FEATURES = [
#     "MIN_roll5","MIN_roll10",
#     "TS_game_roll5","TS_game_roll10",
#     "usage_share_roll5",
#     "ORtg_g_roll5","DRtg_g_roll5","Pace_g_roll5",
#     "PER","TS%","USG%","ORtg","DRtg","WS/48","BPM","VORP",
#     "days_rest","HOME"
# ]
# STAT_ROLLING = {
#     "PTS": ["PTS_roll5","PTS_roll10"],
#     "REB": ["REB_roll5","REB_roll10"],
#     "AST": ["AST_roll5","AST_roll10"],
# }
# TARGETS = {
#     "PTS": "PTS_next",
#     "REB": "REB_next",
#     "AST": "AST_next",
# }

# models = {}
# feature_cols_by_stat = {}
# cv_scores = {}

# # Time-aware CV across all players chronologically
# tscv = TimeSeriesSplit(n_splits=5)

# for stat, target_col in TARGETS.items():
#     # pick available cols only
#     cand = BASE_FEATURES + STAT_ROLLING[stat]
#     feat_cols = [c for c in cand if c in features_all.columns]
#     feature_cols_by_stat[stat] = feat_cols

#     # drop rows missing features/target
#     data = features_all.dropna(subset=feat_cols + [target_col]).copy()
#     if data.empty:
#         print(f"‚ö†Ô∏è No training data for {stat}. Skipping.")
#         continue

#     data = data.sort_values("GAME_DATE")
#     X = data[feat_cols]
#     y = data[target_col]

#     # CV MAE for sanity
#     fold_mae = []
#     for tr, te in tscv.split(X):
#         Xtr, Xte = X.iloc[tr], X.iloc[te]
#         ytr, yte = y.iloc[tr], y.iloc[te]
#         model = XGBRegressor(
#             n_estimators=400,
#             learning_rate=0.05,
#             max_depth=5,
#             subsample=0.85,
#             colsample_bytree=0.9,
#             reg_lambda=1.0,
#             reg_alpha=0.0,
#             random_state=42,
#             n_jobs=-1,
#             verbosity=0
#         )
#         model.fit(Xtr, ytr)
#         pred = model.predict(Xte)
#         fold_mae.append(mean_absolute_error(yte, pred))

#     cv_scores[stat] = (float(np.mean(fold_mae)), float(np.std(fold_mae)))
#     print(f"‚úî {stat} MAE (TimeSeries CV): {np.mean(fold_mae):.2f} ¬± {np.std(fold_mae):.2f}")

#     # Fit final model on all data
#     final_model = XGBRegressor(
#         n_estimators=400,
#         learning_rate=0.05,
#         max_depth=5,
#         subsample=0.85,
#         colsample_bytree=0.9,
#         reg_lambda=1.0,
#         reg_alpha=0.0,
#         random_state=42,
#         n_jobs=-1,
#         verbosity=0
#     )
#     final_model.fit(X, y)
#     models[stat] = final_model

# print("\nDone. Trained models:", list(models.keys()))


In [None]:
# # === Cell 16: projections for PTS/REB/AST using your trained RF models ===
# import pandas as pd
# import numpy as np
# from datetime import datetime

# # Safety checks
# if "models" not in globals() or not models:
#     raise RuntimeError("No trained models found. Run Cell 7 first to populate `models` and `feature_cols_by_stat`.")

# # We'll project for these markets
# MARKETS = ["PTS", "REB", "AST"]

# # Latest row per player as basis for "next game"
# latest = features_all.sort_values(["PLAYER_NAME","GAME_DATE"]).groupby("PLAYER_NAME").tail(1).copy()

# # Helper: per-stat stdev from last N actual games
# def _player_sd_map(stat: str, n=10):
#     def _sd(g):
#         s = g[stat].tail(n)
#         if s.notna().sum() >= 4:
#             return float(s.std(ddof=1))
#         return float(features_all[stat].std(ddof=1))
#     return features_all.groupby("PLAYER_NAME").apply(_sd)

# # Normalize export keys common to all markets
# base_cols = {
#     "PLAYER_NAME": "player",
#     "TEAM_ABBREVIATION": "team",
#     "OPPONENT_ABBREVIATION": "opponent",
# }
# base_out = latest.rename(columns=base_cols)[["player","team","opponent"]].copy()
# base_out["game_date"] = datetime.utcnow().strftime("%Y-%m-%d")
# base_out["projected_minutes"] = latest.get("MIN_roll5", pd.Series(index=latest.index)).fillna(30).clip(lower=10, upper=40).values
# base_out["start_prob"] = 0.90
# base_out["lineup_status"] = "EXPECTED"

# # Build one projection frame per market
# proj_frames = {}
# for stat in MARKETS:
#     if stat not in models:
#         print(f"‚ö†Ô∏è Skipping {stat}: model not found in `models`.")
#         continue
#     feat_cols = feature_cols_by_stat.get(stat, [])
#     if not feat_cols:
#         print(f"‚ö†Ô∏è Skipping {stat}: no feature columns recorded in `feature_cols_by_stat`.")
#         continue

#     X_pred = latest[feat_cols].fillna(method="ffill").fillna(0)
#     pred_mean = models[stat].predict(X_pred)

#     # per-player SD
#     sd_map = _player_sd_map(stat)
#     pred_sd = latest["PLAYER_NAME"].map(sd_map)
#     # conservative fallback SD = 15% of mean (min 1.0)
#     sd_fallback = np.maximum(np.abs(pred_mean) * 0.15, 1.0)
#     pred_sd = np.where(np.isnan(pred_sd), sd_fallback, pred_sd)

#     dfp = base_out.copy()
#     dfp["projection_mean"] = pred_mean
#     dfp["projection_sd"] = pred_sd
#     dfp["market"] = stat

#     # Expose per-market frames
#     proj_frames[stat] = dfp[["player","team","opponent","game_date","market",
#                              "projection_mean","projection_sd","projected_minutes","start_prob","lineup_status"]].copy()

# # Individual frames (kept for backward compatibility)
# df_projections_pts = proj_frames.get("PTS", pd.DataFrame())
# df_projections_reb = proj_frames.get("REB", pd.DataFrame())
# df_projections_ast = proj_frames.get("AST", pd.DataFrame())

# # Combined projections across markets
# df_projections_all = pd.concat(list(proj_frames.values()), ignore_index=True) if proj_frames else pd.DataFrame()

# print("Projection rows by market:",
#       {k: len(v) for k, v in proj_frames.items()})


# display(df_projections_all.head(9))

In [None]:
# # --- Cell 18: Helper: turn wide props (per-book columns) into a long, tidy table ---
# import re
# import numpy as np
# import pandas as pd

# def odds_wide_to_long_from_columns(
#     wide_df: pd.DataFrame,
#     *,
#     books: tuple[str, ...] = ("mgm","draftkings","fanduel","caesars","betrivers","espnbet","hardrock"),
#     markets: tuple[str, ...] = ("PTS","REB","AST"),
#     player_cols=("name","player","PLAYER_NAME"),
#     team_cols=("team","TEAM","team_name","TEAM_ABBREVIATION"),
#     opp_cols=("opponent","opp","OPPONENT","OPPONENT_ABBREVIATION"),
#     date_cols=("game_date","GAME_DATE","date")
# ) -> pd.DataFrame:
#     """
#     Convert a 'wide' props frame into a tidy long format:
#     one row per (player, market, book), with numeric line and American odds.

#     Expected column patterns (flexible by regex):
#       <book>_<suffix>                 -> the line (e.g., mgm_pts, fanduel_ast)
#       <book>_<suffix>_over_odds       -> over odds (American)
#       <book>_<suffix>_under_odds      -> under odds (American)

#     Suffixes recognized per market:
#       PTS:  'pts','points'
#       REB:  'reb','rebounds'
#       AST:  'ast','assists'
#     """
#     df = wide_df.copy()

#     # Identify reference columns
#     def _first_col(cands):
#         for c in cands:
#             if c in df.columns: return c
#         return None

#     player_col = _first_col(player_cols)
#     team_col   = _first_col(team_cols)
#     opp_col    = _first_col(opp_cols)
#     date_col   = _first_col(date_cols)

#     # Fallbacks if totally missing
#     if player_col is None:
#         raise ValueError("Could not find a player name column in wide_df. "
#                          f"Tried {player_cols}. Got columns: {list(df.columns)[:20]}...")

#     # Normalize helpers
#     def _num_float(x):
#         if pd.isna(x): return np.nan
#         m = re.search(r"[-+]?\d+(?:\.\d+)?", str(x))
#         return float(m.group()) if m else np.nan

#     def _num_int(x):
#         if pd.isna(x): return np.nan
#         m = re.search(r"[-+]?\d+", str(x))
#         return int(m.group()) if m else np.nan

#     # Market suffix map (flex)
#     market_suffixes = {
#         "PTS": ("pts","points"),
#         "REB": ("reb","rebounds"),
#         "AST": ("ast","assists"),
#     }

#     # Build long rows
#     long_rows = []
#     # Iterate rows once; pull columns per book/market dynamically
#     for _, row in df.iterrows():
#         base = {
#             "player": row[player_col],
#             "team": row[team_col] if team_col else np.nan,
#             "opponent": row[opp_col] if opp_col else np.nan,
#             "game_date": row[date_col] if date_col else np.nan,
#         }
#         for mkt in markets:
#             suffixes = market_suffixes.get(mkt, ())
#             for b in books:
#                 # Find the *line* column by trying allowed suffixes
#                 line_val = np.nan
#                 over_val = np.nan
#                 under_val = np.nan
#                 line_col_used = None

#                 for suf in suffixes:
#                     # exact line column (most common)
#                     c_line = f"{b}_{suf}"
#                     if c_line in df.columns and pd.notna(row[c_line]):
#                         line_val = row[c_line]
#                         line_col_used = c_line
#                         # odds columns (several sites use these names)
#                         for over_name in (f"{b}_{suf}_over_odds", f"{b}_{suf}_o_odds", f"{b}_{suf}_over"):
#                             if over_name in df.columns:
#                                 over_val = row[over_name]
#                                 break
#                         for under_name in (f"{b}_{suf}_under_odds", f"{b}_{suf}_u_odds", f"{b}_{suf}_under"):
#                             if under_name in df.columns:
#                                 under_val = row[under_name]
#                                 break
#                         break  # found a suffix match

#                 # If not found, try a looser search (e.g., 'mgm_pts_line')
#                 if (isinstance(line_val, float) and np.isnan(line_val)) or line_col_used is None:
#                     pat = re.compile(rf"^{re.escape(b)}_({ '|'.join(map(re.escape, suffixes)) })(_line)?$", re.I)
#                     for c in df.columns:
#                         if pat.match(str(c)) and pd.notna(row[c]):
#                             line_val = row[c]
#                             line_col_used = c
#                             # odds columns with same base
#                             base_prefix = re.sub(r"(_line)?$", "", c)
#                             for over_name in (f"{base_prefix}_over_odds", f"{base_prefix}_o_odds", f"{base_prefix}_over"):
#                                 if over_name in df.columns:
#                                     over_val = row[over_name]
#                                     break
#                             for under_name in (f"{base_prefix}_under_odds", f"{base_prefix}_u_odds", f"{base_prefix}_under"):
#                                 if under_name in df.columns:
#                                     under_val = row[under_name]
#                                     break
#                             break

#                 # Only emit a row if we actually found a line
#                 if pd.notna(line_val):
#                     long_rows.append({
#                         **base,
#                         "market": mkt,
#                         "book": b,
#                         "line": _num_float(line_val),
#                         "over_odds": _num_int(over_val),
#                         "under_odds": _num_int(under_val),
#                     })

#     out = pd.DataFrame(long_rows)

#     # Clean up: drop obviously invalid lines
#     if not out.empty:
#         out = out[pd.notna(out["line"])]
#         # remove zero/negative lines that can't be real for these markets (optional)
#         out = out[out["line"] > 0]

#         # De-duplicate best-effort (sometimes the page contains duplicates per book)
#         out = (out.sort_values(["player","market","book","line"])
#                   .drop_duplicates(subset=["player","market","book"], keep="last")
#                   .reset_index(drop=True))

#     return out


In [None]:
# # --- Cell 19: robust wide->long adapter for Rotowire props ---
# import re
# import numpy as np
# import pandas as pd

# def odds_wide_to_long_rotowire(
#     wide_df: pd.DataFrame,
#     *,
#     books=("mgm","draftkings","fanduel","caesars","betrivers","espnbet","hardrock"),
#     markets=("PTS","REB","AST"),
#     player_cols=("name","player","PLAYER_NAME"),
#     team_cols=("team","TEAM_ABBREVIATION"),
#     opp_cols=("opponent","opp","OPPONENT_ABBREVIATION"),
#     date_cols=("game_date","GAME_DATE")
# ) -> pd.DataFrame:
#     df = wide_df.copy()

#     def _first_col(cols):
#         for c in cols:
#             if c in df.columns: return c
#         return None

#     ply = _first_col(player_cols)
#     tm  = _first_col(team_cols)
#     opp = _first_col(opp_cols)
#     dt  = _first_col(date_cols)
#     if ply is None:
#         raise ValueError(f"No player column found. Tried {player_cols}. Got sample: {list(df.columns)[:25]}")

#     # market suffixes we‚Äôll search (order matters)
#     suffixes = {"PTS": ("pts","p","points"),
#                 "REB": ("reb","rebounds"),
#                 "AST": ("ast","assists")}

#     # helpers
#     def _num_float(x):
#         if pd.isna(x): return np.nan
#         m = re.search(r"[-+]?\d+(?:\.\d+)?", str(x))
#         return float(m.group()) if m else np.nan

#     def _num_int(x):
#         if pd.isna(x): return np.nan
#         m = re.search(r"[-+]?\d+", str(x))
#         return int(m.group()) if m else np.nan

#     cols_lc = {c.lower(): c for c in df.columns}  # lower->actual

#     def _find(name_like: str):
#         return cols_lc.get(name_like.lower())

#     rows = []
#     for _, r in df.iterrows():
#         base = {
#             "player": r[ply],
#             "team": r[tm] if tm else np.nan,
#             "opponent": r[opp] if opp else np.nan,
#             "game_date": r[dt] if dt else np.nan,
#         }
#         for mkt in markets:
#             for book in books:
#                 ln = np.nan; ov = np.nan; un = np.nan; used = None
#                 # find the line column (e.g. mgm_pts / fanduel_p / caesars_ast)
#                 for suf in suffixes[mkt]:
#                     for cand in (f"{book}_{suf}", f"{book}_{suf}_line"):
#                         real = _find(cand)
#                         if real and pd.notna(r.get(real)):
#                             ln = r[real]; used = real
#                             break
#                     if used: break

#                 if used:
#                     # odds columns around that base; support camel & underscore
#                     base_prefix = re.sub(r"_line$", "", used, flags=re.I)
#                     over_cands  = [f"{base_prefix}Over", f"{base_prefix}_over",
#                                    f"{base_prefix}_o", f"{base_prefix}_over_odds"]
#                     under_cands = [f"{base_prefix}Under", f"{base_prefix}_under",
#                                    f"{base_prefix}_u", f"{base_prefix}_under_odds"]
#                     for oc in over_cands:
#                         c = _find(oc)
#                         if c and pd.notna(r.get(c)): ov = r[c]; break
#                     for uc in under_cands:
#                         c = _find(uc)
#                         if c and pd.notna(r.get(c)): un = r[c]; break

#                     rows.append({
#                         **base,
#                         "market": mkt,
#                         "book": book,
#                         "line": _num_float(ln),
#                         "over_odds": _num_int(ov),
#                         "under_odds": _num_int(un),
#                     })

#     out = pd.DataFrame(rows)
#     if not out.empty:
#         out = out[pd.notna(out["line"]) & (out["line"] > 0)]
#         out = (out.sort_values(["player","market","book","line"])
#                  .drop_duplicates(subset=["player","market","book"], keep="last")
#                  .reset_index(drop=True))
#         if "game_date" in out and out["game_date"].isna().all():
#             out["game_date"] = pd.Timestamp.utcnow().strftime("%Y-%m-%d")
#     return out


In [25]:
# === Cell 20 (fixed): wide‚Üílong with robust Over/Under detection ===
import re, numpy as np, pandas as pd

def odds_wide_to_long_rotowire_final_v2(
    wide_df: pd.DataFrame,
    *,
    books=("mgm","draftkings","fanduel","betrivers"),  # focus on these
    markets=("PTS","REB","AST"),
    player_cols=("name","player","PLAYER_NAME"),
    team_cols=("team","TEAM_ABBREVIATION","TEAM"),
    opp_cols=("opponent","opp","OPPONENT_ABBREVIATION","OPPONENT"),
    date_cols=("game_date","GAME_DATE","asof_date"),
) -> pd.DataFrame:
    df = wide_df.copy()
    if df.empty:
        return pd.DataFrame()

    # ‚Äî‚Äî‚Äî helpers to pick identity columns ‚Äî‚Äî‚Äî
    def _first_col(cands):
        for c in cands:
            if c in df.columns:
                return c
        return None

    ply = _first_col(player_cols)
    tm  = _first_col(team_cols)
    opp = _first_col(opp_cols)
    dt  = _first_col(date_cols)
    if ply is None:
        raise ValueError("No player column found in wide odds frame.")

    # market suffixes we‚Äôll search (ALL of them, not just the first)
    suf_map = {
        "PTS": ("pts", "points", "p"),
        "REB": ("reb", "rebounds"),
        "AST": ("ast", "assists"),
    }

    # numeric cleaners
    def _num_float(x):
        if pd.isna(x): return np.nan
        m = re.search(r"[-+]?\d+(?:\.\d+)?", str(x))
        return float(m.group()) if m else np.nan

    def _num_int(x):
        if pd.isna(x): return np.nan
        m = re.search(r"[-+]?\d+", str(x))
        return int(m.group()) if m else np.nan

    # prebuild a case-insensitive lookup for columns
    lc_to_real = {c.lower(): c for c in df.columns}

    def _get_series_any(names):
        """return the first non-missing column among name list (case-insensitive)"""
        for n in names:
            key = n.lower()
            if key in lc_to_real:
                return df[lc_to_real[key]]
        return pd.Series([np.nan] * len(df))

    rows = []
    # iterate rows once; for each book+market pick the most plausible columns
    for idx, r in df.iterrows():
        base = {
            "player":   r[ply],
            "team":     (r[tm]  if tm  else np.nan),
            "opponent": (r[opp] if opp else np.nan),
            "game_date":(r[dt]  if dt  else np.nan),
        }

        for mkt in markets:
            suffixes = suf_map[mkt]
            # --- line column candidates, e.g. mgm_pts / draftkings_points ---
            for book in books:
                line = np.nan
                over = np.nan
                under = np.nan

                # 1) try explicit line columns
                line_names = []
                for suf in suffixes:
                    line_names += [f"{book}_{suf}", f"{book}_{suf}_line"]
                # pick the first present
                for nm in line_names:
                    key = nm.lower()
                    if key in lc_to_real and pd.notna(r[lc_to_real[key]]):
                        line = r[lc_to_real[key]]
                        break

                # 2) over/under with lots of spellings (camel + underscore + *_odds)
                # build patterns that include ANY of the suffixes
                # e.g. ^mgm_.*(pts|points)\w*(over(_odds)?|overodds)?$
                suf_pat = "(" + "|".join(map(re.escape, suffixes)) + ")"
                # scan all columns once; pick the first non-na for over/under
                for c in df.columns:
                    cl = c.lower()
                    if cl.startswith(book + "_"):
                        if re.search(suf_pat, cl):
                            if re.search(r"(over)(_odds|odds)?$", cl):
                                if pd.notna(r[c]) and pd.isna(over):
                                    over = r[c]
                            elif re.search(r"(under)(_odds|odds)?$", cl):
                                if pd.notna(r[c]) and pd.isna(under):
                                    under = r[c]

                # skip if absolutely nothing present for this book+market on this row
                if pd.isna(line) and pd.isna(over) and pd.isna(under):
                    continue

                rows.append({
                    **base,
                    "market": mkt,
                    "book": book,
                    "line": _num_float(line),
                    "over_odds": _num_int(over),
                    "under_odds": _num_int(under),
                })

    out = pd.DataFrame(rows)

    if out.empty:
        return out

    # keep ONLY rows with a real line and at least one price
    has_line = out["line"].notna()
    has_price = out["over_odds"].notna() | out["under_odds"].notna()
    out = out[has_line & has_price].copy()

    # positive / plausible lines
    out = out[out["line"] > 0]

    # dedupe within (player, market, book) keeping the most recent non-na price/line
    out = (out.sort_values(["player", "market", "book", "line"])
              .drop_duplicates(subset=["player", "market", "book"], keep="last")
              .reset_index(drop=True))

    # fill missing dates with today if needed
    if "game_date" in out and out["game_date"].isna().all():
        out["game_date"] = pd.Timestamp.utcnow().strftime("%Y-%m-%d")

    return out


In [32]:
# === Cell 21 (fixed): fetch odds for multiple books ‚Üí long ‚Üí join with projections ===
from datetime import datetime
import re, unicodedata, numpy as np, pandas as pd
from statistics import NormalDist

BOOKS = ("mgm","draftkings","fanduel","caesars","betrivers")

scraper = NBAOddsScraper()

# 1) Scrape EACH book and convert to long immediately (avoids NaN rows for non-present books)
long_parts = []
for b in BOOKS:
    wide_b = scraper.get_player_props_odds_wide_raw(book=b)
    if wide_b.empty:
        print(f"‚ö†Ô∏è {b}: no rows scraped.")
        continue
    # Use the converter that already works for you in Cell 2
    long_b = odds_wide_to_long_from_columns(wide_b, books=(b,), markets=("PTS","REB","AST"))
    if long_b.empty:
        print(f"‚ö†Ô∏è {b}: long table empty after conversion.")
    else:
        long_parts.append(long_b)

odds_long = pd.concat(long_parts, ignore_index=True) if long_parts else pd.DataFrame()
print(f"‚úÖ Combined long odds rows: {len(odds_long)}")

# 2) Row-level fallback line within each (player, market, game_date) group
if not odds_long.empty:
    grp = ["player","market","game_date"]
    # take the first non-null line within the group
    line_fallback = odds_long.groupby(grp)["line"].transform(lambda s: s.dropna().iloc[0] if s.dropna().size else np.nan)
    missing_before = odds_long["line"].isna().sum()
    odds_long["line"] = odds_long["line"].fillna(line_fallback)
    missing_after  = odds_long["line"].isna().sum()
    print(f"üõü Filled {missing_before - missing_after} missing lines via group fallback.")

    # Keep only rows with a usable line and at least one price
    odds_long = odds_long[(odds_long["line"].notna()) & ( (odds_long["over_odds"].notna()) | (odds_long["under_odds"].notna()) )].copy()
    print(f"‚úÖ Usable odds rows after filters: {len(odds_long)}")

# 3) Join with projections
if "df_projections_all" not in globals() or df_projections_all.empty:
    raise RuntimeError("df_projections_all missing ‚Äì run the projection cell first.")

def _norm_player(s):
    if not isinstance(s, str): return ""
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    s = re.sub(r"[.\-`'‚Äô]", "", s).strip().lower()
    s = re.sub(r"\s+"," ", s)
    return s

odds_long["player_key"] = odds_long["player"].map(_norm_player)
df_projections_all["player_key"] = df_projections_all["player"].map(_norm_player)

def p_over_from_normal(mu, sd, line):
    if pd.isna(mu) or pd.isna(sd) or pd.isna(line) or sd <= 0: return np.nan
    z = (line - mu) / sd
    return 1.0 - NormalDist().cdf(z)

def implied_prob(a):
    if pd.isna(a): return np.nan
    a = float(a)
    return (-a)/(-a+100.0) if a < 0 else 100.0/(a+100.0)

joined = []
for mkt in ("PTS","REB","AST"):
    proj = df_projections_all.query("market == @mkt")
    odds = odds_long.query("market == @mkt")
    if proj.empty or odds.empty:
        print(f"‚ö†Ô∏è Skipping {mkt} (proj empty? {proj.empty}, odds empty? {odds.empty})")
        continue

    dfj = proj.merge(
        odds,
        on=["player_key","market"],
        how="inner",
        suffixes=("_proj","_odds")
    )
    if dfj.empty:
        print(f"‚ö†Ô∏è No matches for {mkt} after merge.")
        continue

    # model P(over)
    dfj["p_over_model"] = dfj.apply(
        lambda r: p_over_from_normal(r["projection_mean"], r["projection_sd"], r["line"]),
        axis=1
    )

    # market implied
    dfj["p_over_imp"]  = dfj["over_odds"].map(implied_prob)
    dfj["p_under_imp"] = dfj["under_odds"].map(implied_prob)

    # edge vs implied (de-vig will happen in Cell 22)
    dfj["edge_over"] = dfj["p_over_model"] - dfj["p_over_imp"]

    joined.append(dfj)

df_proj_join_all = pd.concat(joined, ignore_index=True) if joined else pd.DataFrame()

print(f"üîó Joined frame size: {len(df_proj_join_all)}")
print(odds_long.head(10))
print(df_proj_join_all.head(10))

# Optional: export a trace file to inspect later
df_proj_join_all.to_csv(f"nba_player_props_joined_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}.csv", index=False)


‚úÖ Fetched 1125 odds rows | 264 columns | book=mgm
‚úÖ Fetched 1125 odds rows | 264 columns | book=draftkings
‚úÖ Fetched 1125 odds rows | 264 columns | book=fanduel
‚úÖ Fetched 1125 odds rows | 264 columns | book=caesars
‚úÖ Fetched 1125 odds rows | 264 columns | book=betrivers
‚úÖ Combined long odds rows: 2970
üõü Filled 0 missing lines via group fallback.
‚úÖ Usable odds rows after filters: 990


RuntimeError: df_projections_all missing ‚Äì run the projection cell first.

In [None]:
# def peek_cols(df, book, key="pts"):
#     cols = [c for c in df.columns if re.match(fr"^{book}_.{{0,12}}{key}", c, re.I) or c.lower().startswith(f"{book}_{key}")]
#     print(book, key, "->", cols[:20])

# peek_cols(wide_raw, "mgm", "pts")
# peek_cols(wide_raw, "fanduel", "p")     # note the short 'p'
# peek_cols(wide_raw, "caesars", "ast")

# print(df_proj_join_all['under_odds'].unique())

In [31]:
# === Cell 22 (Final) ‚Äî Price, Edge, EV & Kelly Calculations ==================
import os, re
import numpy as np
import pandas as pd
from datetime import datetime
from statistics import NormalDist

# ---------------------------------------------------------------------
# 0Ô∏è‚É£ Safety check
# ---------------------------------------------------------------------
assert "df_proj_join_all" in globals() and not df_proj_join_all.empty, \
    "Run Cell 21 first to build df_proj_join_all."

df = df_proj_join_all.copy()
print(f"üìà Starting pricing with {len(df):,} merged projection/odds rows...")
print(df.head(20))
# ---------------------------------------------------------------------
# 1Ô∏è‚É£ Normalize & coalesce entity columns
# ---------------------------------------------------------------------
def _coalesce(df_, target, candidates):
    s = pd.Series(index=df_.index, dtype=object)
    for c in candidates:
        if c in df_:
            s = s.fillna(df_[c])
    df_[target] = s

_coalesce(df, "player",   ["player_odds","player_proj","player"])
_coalesce(df, "team",     ["team_odds","team_proj","team"])
_coalesce(df, "opponent", ["opponent_odds","opponent_proj","opponent"])
_coalesce(df, "game_date",["game_date_odds","game_date_proj","game_date"])

# drop duplicate versions of these columns
to_drop = [c for c in [
    "player_odds","player_proj","team_odds","team_proj",
    "opponent_odds","opponent_proj","game_date_odds","game_date_proj"
] if c in df.columns]
df.drop(columns=to_drop, inplace=True, errors="ignore")

# deduplicate columns
df = df.loc[:, ~df.columns.duplicated()].copy()

# ---------------------------------------------------------------------
# 2Ô∏è‚É£ Filter to rows with valid odds
# ---------------------------------------------------------------------
if not {"over_odds","under_odds"}.issubset(df.columns):
    raise RuntimeError("Missing odds columns. Re-run Cell 21 to rebuild df_proj_join_all.")

priced = df.dropna(subset=["over_odds","under_odds"], how="all").copy()
print(f"‚úÖ {len(priced):,} rows with valid odds available for pricing.")
print(priced.head(5))
# ---------------------------------------------------------------------
# 3Ô∏è‚É£ Numeric coercion helpers
# ---------------------------------------------------------------------
def _num_int(x):
    if pd.isna(x): return np.nan
    m = re.search(r"[-+]?\d+", str(x))
    return int(m.group()) if m else np.nan

def _num_float(x):
    if pd.isna(x): return np.nan
    m = re.search(r"[-+]?\d+(?:\.\d+)?", str(x))
    return float(m.group()) if m else np.nan

for c in ["line","projection_mean","projection_sd"]:
    if c in priced.columns:
        priced[c] = priced[c].apply(_num_float)
for c in ["over_odds","under_odds"]:
    if c in priced.columns:
        priced[c] = priced[c].apply(_num_int)

# ---------------------------------------------------------------------
# 4Ô∏è‚É£ Handle missing SD (fallback 15% of mean, min = 1.0)
# ---------------------------------------------------------------------
if ("projection_sd" not in priced.columns) or priced["projection_sd"].fillna(0).eq(0).all():
    priced["projection_sd"] = (priced["projection_mean"].abs() * 0.15).clip(lower=1.0)

# ---------------------------------------------------------------------
# 5Ô∏è‚É£ Model probability: P(over) from Normal(Œº, œÉ)
# ---------------------------------------------------------------------
def p_over_from_normal(mu, sd, line):
    if pd.isna(mu) or pd.isna(sd) or pd.isna(line) or sd <= 0: 
        return np.nan
    z = (line - mu) / sd
    return 1.0 - NormalDist().cdf(z)

priced["p_over_model"] = priced.apply(
    lambda r: p_over_from_normal(r["projection_mean"], r["projection_sd"], r["line"]), axis=1
)

# ---------------------------------------------------------------------
# 6Ô∏è‚É£ Market-implied probabilities + de-vig + edge
# ---------------------------------------------------------------------
def implied_prob(a):
    if pd.isna(a): return np.nan
    a = float(a)
    return (-a)/(-a+100.0) if a < 0 else 100.0/(a+100.0)

priced["p_over_imp"]  = priced["over_odds"].map(implied_prob)
priced["p_under_imp"] = priced["under_odds"].map(implied_prob)

def devig_pair(p_o, p_u):
    if pd.isna(p_o) or pd.isna(p_u): return (np.nan, np.nan)
    s = p_o + p_u
    if s <= 0: return (np.nan, np.nan)
    return (p_o/s, p_u/s)

fair = priced.apply(
    lambda r: pd.Series(devig_pair(r["p_over_imp"], r["p_under_imp"]),
                        index=["p_over_fair","p_under_fair"]),
    axis=1
)
priced = pd.concat([priced, fair], axis=1)

priced["edge_over"] = np.where(
    priced["p_over_fair"].notna(),
    priced["p_over_model"] - priced["p_over_fair"],
    priced["p_over_model"] - priced["p_over_imp"]
)

# ---------------------------------------------------------------------
# 7Ô∏è‚É£ EV and Kelly (capped ‚â§ 25%)
# ---------------------------------------------------------------------
def kelly_fraction(p, american, cap=0.25):
    if pd.isna(p) or pd.isna(american): return 0.0
    a = float(american)
    b = (a/100.0) if a > 0 else (100.0/abs(a))
    f = (p*(b+1)-1)/b
    return float(max(0.0, min(f, cap)))

def ev_flat_over(p, american):
    if pd.isna(p) or pd.isna(american): return np.nan
    a = float(american)
    win = (a/100.0) if a > 0 else (100.0/abs(a))
    lose = 1.0
    return p*win - (1-p)*lose

priced["kelly_frac_over"] = priced.apply(lambda r: kelly_fraction(r["p_over_model"], r["over_odds"]), axis=1)
priced["EV_over_1u"]      = priced.apply(lambda r: ev_flat_over(r["p_over_model"], r["over_odds"]), axis=1)

# ---------------------------------------------------------------------
# 8Ô∏è‚É£ Clean duplicates and column conflicts
# ---------------------------------------------------------------------
if priced.columns.duplicated().any():
    print("‚ö†Ô∏è Duplicate columns detected ‚Äî removing duplicates.")
    priced = priced.loc[:, ~priced.columns.duplicated()].copy()

if priced.columns.duplicated().any():
    raise RuntimeError("Column duplication persists ‚Äî please inspect DataFrame.")

# ---------------------------------------------------------------------
# 9Ô∏è‚É£ Build sorted slates
# ---------------------------------------------------------------------
cols_keep = [
    "game_date","book","player","team","opponent","market","line","lineup_status",
    "over_odds","under_odds","p_over_imp","p_under_imp","p_over_fair","p_under_fair",
    "p_over_model","edge_over","EV_over_1u","kelly_frac_over",
    "projected_minutes","projection_mean","projection_sd","start_prob"
]
cols_keep = [c for c in cols_keep if c in priced.columns]

priced_sorted = priced.sort_values(["player","market","edge_over"], ascending=[True,True,False])
best_per_player = priced_sorted.drop_duplicates(subset=["player","market"], keep="first")[cols_keep].reset_index(drop=True)

# ---------------------------------------------------------------------
# üîü Filtering thresholds for ‚Äúvalue slate‚Äù
# ---------------------------------------------------------------------
EDGE_MIN   = 0.02   # ‚â• 2% model edge
EV_MIN     = 0.00   # non-negative EV
KELLY_MIN  = 0.01   # ‚â• 1% Kelly fraction
MIN_MINUTES= 14
START_PROB = 0.50

slate = best_per_player[
    (best_per_player["edge_over"] >= EDGE_MIN) &
    (best_per_player["EV_over_1u"] >= EV_MIN) &
    (best_per_player["kelly_frac_over"] >= KELLY_MIN) &
    (best_per_player["projected_minutes"].fillna(0) >= MIN_MINUTES) &
    (best_per_player["start_prob"].fillna(1.0) >= START_PROB)
].sort_values(["edge_over","EV_over_1u"], ascending=False).reset_index(drop=True)

print(f"\n‚úÖ Final value slate built ‚Äî {len(slate)} bets meet thresholds.")

# ---------------------------------------------------------------------
# 1Ô∏è‚É£1Ô∏è‚É£  Save outputs
# ---------------------------------------------------------------------
stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
os.makedirs("data/bets", exist_ok=True)
csv_all   = f"data/bets/nba_priced_candidates_{stamp}.csv"
csv_slate = f"data/bets/nba_priced_slate_{stamp}.csv"
xlsx_path = f"data/bets/nba_priced_{stamp}.xlsx"

best_per_player.to_csv(csv_all, index=False)
slate.to_csv(csv_slate, index=False)
with pd.ExcelWriter(xlsx_path, engine="openpyxl") as w:
    best_per_player.to_excel(w, sheet_name="Candidates", index=False)
    slate.to_excel(w, sheet_name="Slate", index=False)

print("\nüìÅ Files saved:")
print(f"  ‚Ä¢ {csv_all}")
print(f"  ‚Ä¢ {csv_slate}")
print(f"  ‚Ä¢ {xlsx_path}")

# expose to later cells
df_best = best_per_player.copy()

# ---------------------------------------------------------------------
# ‚úÖ  Quick summary preview
# ---------------------------------------------------------------------
print("\nEdge quantiles:")
print(best_per_player["edge_over"].quantile([0.1,0.25,0.5,0.75,0.9]))

print("\nTop 10 by edge:")
display(best_per_player.sort_values("edge_over", ascending=False).head(10))


AssertionError: Run Cell 21 first to build df_proj_join_all.

In [30]:
# === Cell 23: Diagnostics + more-forgiving slate builder (FIXED) ===
import os
import numpy as np, pandas as pd
from statistics import NormalDist

assert 'df_best' in globals() and isinstance(df_best, pd.DataFrame), "Run the priced-slate cell first."

ND = NormalDist()
dfD = df_best.copy()

# --- Helpers -----------------------------------------------------------------
def american_to_decimal(a):
    """Convert American odds to Decimal odds; return NaN if not valid."""
    if pd.isna(a): return np.nan
    a = float(a)
    if a == 0: return np.nan
    return 1.0 + (a/100.0 if a > 0 else 100.0/abs(a))

def implied_from_decimal(d):
    """Break-even probability from decimal odds."""
    return np.nan if (pd.isna(d) or d <= 1) else 1.0 / d

def kelly_fraction_decimal(p, dec_odds, cap=0.25):
    if pd.isna(p) or pd.isna(dec_odds) or dec_odds <= 1: return 0.0
    b = dec_odds - 1.0
    f = (p*(b+1) - 1) / b
    return float(max(0.0, min(f, cap)))

def ev_flat_decimal(p, dec_odds):
    if pd.isna(p) or pd.isna(dec_odds) or dec_odds <= 1: return np.nan
    return p*(dec_odds-1) - (1-p)*1.0

def p_over_from_normal(mu, sd, line):
    if pd.isna(mu) or pd.isna(sd) or pd.isna(line) or sd <= 0: return np.nan
    z = (line - mu) / sd
    return 1.0 - ND.cdf(z)

# --- Ensure we have decimal odds columns -------------------------------------
# Build from American odds if missing
if "over_dec" not in dfD.columns:
    dfD["over_dec"] = dfD.get("over_odds").map(american_to_decimal)
if "under_dec" not in dfD.columns:
    dfD["under_dec"] = dfD.get("under_odds").map(american_to_decimal)

# --- Quick coverage checks ----------------------------------------------------
have_over  = dfD['over_dec'].notna()  & (dfD['over_dec']  > 1.0)
have_under = dfD['under_dec'].notna() & (dfD['under_dec'] > 1.0)
print("Coverage:")
print(f"  with over_dec:  {have_over.sum()} / {len(dfD)}")
print(f"  with under_dec: {have_under.sum()} / {len(dfD)}")
print(f"  both prices:    {(have_over & have_under).sum()} / {len(dfD)}")

# --- Edge distribution snapshot ----------------------------------------------
q = dfD['edge_over'].dropna().quantile([0.1,0.25,0.5,0.75,0.9]) if 'edge_over' in dfD else pd.Series(dtype=float)
print("\nEdge quantiles (model - fair/imp):")
print(q.to_string())

# --- Top 20 by edge (even if below your threshold) ---------------------------
cols_preview = [c for c in [
    "player","team","opponent","market","line","book",
    "over_dec","under_dec","p_over_model","p_over_fair","p_over_imp","edge_over","EV_over_1u","kelly_frac_over"
] if c in dfD.columns]
print("\nTop by edge (first 20):")
print(dfD.sort_values('edge_over', ascending=False).head(20)[cols_preview].to_string(index=False))

# =========================
# Alternative slates
# =========================

# 1) EV-positive slate (uses model p_over and OVER decimal price)
MIN_EV  = 0.01   # > 0.01u per 1u stake
MIN_DEC = 1.01   # must have a real price
slate_ev = dfD[
    (dfD["EV_over_1u"] > MIN_EV) &
    (dfD["over_dec"].fillna(0) > MIN_DEC)
].copy()

# 2) Lower-edge slate (relax edge threshold)
MIN_EDGE_RELAXED = 0.005   # 0.5%
slate_edge_relaxed = dfD[
    (dfD["edge_over"] >= MIN_EDGE_RELAXED) &
    (dfD["over_dec"].fillna(0) > MIN_DEC)
].copy()

# 3) Price-only slate (ignore de-vig; compare model vs break-even p from OVER decimal)
dfD["p_over_price"] = dfD["over_dec"].map(implied_from_decimal)
dfD["edge_vs_price"] = dfD["p_over_model"] - dfD["p_over_price"]
MIN_EDGE_PRICE = 0.01  # 1% vs break-even
slate_price_only = dfD[
    (dfD["edge_vs_price"] >= MIN_EDGE_PRICE) &
    (dfD["over_dec"].fillna(0) > MIN_DEC)
].copy()

# 4) Sensitivity slate: try a slightly tighter SD (10% of mean) if SD looked fallback-ish
need_sd_tighten = dfD["projection_sd"].isna() | (dfD["projection_sd"] <= 0)
sd_tight = (dfD["projection_mean"].abs() * 0.10).clip(lower=0.75)
p_model_tight = []
for mu, sd, line, tight_sd in zip(dfD["projection_mean"], dfD["projection_sd"], dfD["line"], sd_tight):
    use_sd = sd if pd.notna(sd) and sd > 0 else tight_sd
    p_model_tight.append(p_over_from_normal(mu, use_sd, line))
dfD["p_over_model_tight"] = p_model_tight
dfD["EV_over_1u_tight"] = dfD.apply(lambda r: ev_flat_decimal(r["p_over_model_tight"], r["over_dec"]), axis=1)
dfD["edge_over_tight"] = np.where(dfD["p_over_fair"].notna(),
                                  dfD["p_over_model_tight"] - dfD["p_over_fair"],
                                  dfD["p_over_model_tight"] - dfD["p_over_imp"])
slate_tight = dfD[
    (dfD["EV_over_1u_tight"] > MIN_EV) &
    (dfD["over_dec"].fillna(0) > MIN_DEC)
].copy()

def _keep_cols(d):
    keep = [c for c in [
        "asof_date","game_date","book","player","team","opponent","market","line","lineup_status",
        "over_dec","under_dec",
        "p_over_imp","p_under_imp","p_over_fair","p_under_fair",
        "p_over_model","edge_over","EV_over_1u","kelly_frac_over",
        "p_over_price","edge_vs_price",
        "p_over_model_tight","edge_over_tight","EV_over_1u_tight",
        "projected_minutes","projection_mean","projection_sd","start_prob"
    ] if c in d.columns]
    return d[keep].sort_values(["market","edge_over"], ascending=[True, False])

print("\nSlate sizes:")
print(f"  EV-positive (>{MIN_EV:.2f}u):           {len(slate_ev)}")
print(f"  Relaxed edge (‚â•{MIN_EDGE_RELAXED*100:.1f}%): {len(slate_edge_relaxed)}")
print(f"  Price-only edge (‚â•{MIN_EDGE_PRICE*100:.1f}%): {len(slate_price_only)}")
print(f"  Tight-SD EV-positive:                  {len(slate_tight)}")

# Preview a few from each
for name, slate_df in [
    ("EV-positive", slate_ev),
    ("Relaxed-edge", slate_edge_relaxed),
    ("Price-only", slate_price_only),
    ("Tight-SD EV+", slate_tight),
]:
    if not slate_df.empty:
        print(f"\n{name} ‚Äî top 10")
        print(_keep_cols(slate_df).head(10).to_string(index=False))

# Save all variants
ts = pd.Timestamp.utcnow().strftime("%Y%m%d_%H%M%S")
os.makedirs("data/bets", exist_ok=True)
_keep_cols(slate_ev).to_csv(f"data/bets/nba_slate_evpos_{ts}.csv", index=False)
_keep_cols(slate_edge_relaxed).to_csv(f"data/bets/nba_slate_edge_relaxed_{ts}.csv", index=False)
_keep_cols(slate_price_only).to_csv(f"data/bets/nba_slate_price_only_{ts}.csv", index=False)
_keep_cols(slate_tight).to_csv(f"data/bets/nba_slate_tightsd_{ts}.csv", index=False)
print(f"\nSaved CSVs with the four slate variants (timestamp {ts}).")


AssertionError: Run the priced-slate cell first.

In [None]:
# -- Cell 19: Export clean value-bets CSV (adds PUnderModel) --
import os
from datetime import datetime

# Merge (already built earlier): best_per_player + dfD on ["player","market"]
merged_df = best_per_player.merge(
    dfD, on=["player", "market"], how="inner", suffixes=("_best", "_dfD")
)

# Optional quick peek that this column exists
if "p_over_model_dfD" in merged_df.columns:
    display(merged_df["p_over_model_dfD"].head())

# Columns we want to keep from the merge
# NOTE: use line_dfD (from dfD) as the line source since that's what you previewed
selected_columns = [
    "player",
    "team_best",
    "opponent_best",
    "market",
    "line_dfD",                 # keep dfD line
    "over_odds_best",
    "under_odds_best",
    "p_over_imp_best",
    "p_under_imp_best",
    "p_over_fair_best",
    "p_under_fair_best",
    "projected_minutes_best",
    "projection_mean_best",
    "projection_sd_best",
    "p_over_model_dfD",
]

# Clean column names for output
column_rename = {
    "player": "Player",
    "team_best": "Team",
    "opponent_best": "Opponent",
    "market": "Market",
    "line_dfD": "Line",                 # map dfD line -> Line
    "over_odds_best": "OverOdds",
    "under_odds_best": "UnderOdds",
    "p_over_imp_best": "POverImp",
    "p_under_imp_best": "PUnderImp",
    "p_over_fair_best": "POverFair",
    "p_under_fair_best": "PUnderFair",
    "projected_minutes_best": "ProjMins",
    "projection_mean_best": "ProjMean",
    "projection_sd_best": "ProjSD",
    "p_over_model_dfD": "POverModel",
}

# Build clean frame
missing = [c for c in selected_columns if c not in merged_df.columns]
if missing:
    raise KeyError(f"Missing expected columns in merged_df: {missing}")

merged_df_clean = merged_df[selected_columns].rename(columns=column_rename)

# Add PUnderModel = 1 - POverModel (clip to [0,1] for safety)
merged_df_clean["PUnderModel"] = (1.0 - merged_df_clean["POverModel"]).clip(lower=0.0, upper=1.0)

# Export
os.makedirs("data/bets", exist_ok=True)
csv_path = os.path.join("data/bets", f"value_bets_top100_{datetime.now().strftime('%Y%m%d')}.csv")
merged_df_clean.to_csv(csv_path, index=False)

print(f"Saved clean value bets to: {csv_path}")
print(f"Columns: {list(merged_df_clean.columns)}")
print(f"Rows: {len(merged_df_clean)}")
print("\nPreview:")
print(merged_df_clean.head().to_string(index=False))

In [None]:
# === Convert American odds ‚Üí Decimal odds and save ===
import os
import pandas as pd
from datetime import datetime

# --- Load latest value bets CSV (use today‚Äôs date automatically) ---
today_str = datetime.now().strftime("%Y%m%d")
input_path = f"data/bets/value_bets_top100_{today_str}.csv"
df = pd.read_csv(input_path)

# --- Convert American ‚Üí Decimal ---
def american_to_decimal(american):
    """Convert American odds to decimal odds."""
    if pd.isna(american):
        return None
    try:
        american = float(american)
        if american > 0:
            return 1 + (american / 100)
        elif american < 0:
            return 1 + (100 / abs(american))
        else:
            return None
    except Exception:
        return None

# Apply conversion
df["OverDecimal"] = df["OverOdds"].apply(american_to_decimal)
df["UnderDecimal"] = df["UnderOdds"].apply(american_to_decimal)

# --- Option A: keep both versions (recommended) ---
# rename old odds for clarity
df.rename(columns={"OverOdds": "OverOdds_American", "UnderOdds": "UnderOdds_American"}, inplace=True)

# --- Option B: if you really want to remove them, uncomment this ---
# df.drop(columns=["OverOdds", "UnderOdds"], inplace=True)

# --- Save updated CSV ---
output_path = f"data/bets/value_bets_top100_{today_str}_decimal.csv"
os.makedirs("data/bets", exist_ok=True)
df.to_csv(output_path, index=False)

print(f"‚úÖ Converted odds and saved to: {output_path}")
print("Preview:")
print(df[["Player", "Market", "OverDecimal", "UnderDecimal"]].head())


In [None]:
# === Visual diagnostics for model vs decimal odds (with market labels) ===
import os, glob, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

# ---------- Load latest decimal CSV ----------
cand = sorted(glob.glob("data/bets/value_bets_top100_*_decimal.csv"))
if not cand:
    cand = sorted(glob.glob("data/bets/value_bets_top100_*.csv"))
    if not cand:
        raise FileNotFoundError("No value bets file found in data/bets.")
path = cand[-1]
print(f"Using file: {path}")

df = pd.read_csv(path)

# ---------- Inspect columns ----------
print("All columns in CSV:", list(df.columns))

# ---------- Map column names (robustly) ----------
def find_col(df, candidates):
    cols_lc = {c.lower(): c for c in df.columns}
    for name in candidates:
        c = cols_lc.get(name.lower())
        if c: return c
    return None

col_player   = find_col(df, ["Player"])
col_team     = find_col(df, ["Team"])
col_opp      = find_col(df, ["Opponent"])
col_market   = find_col(df, ["Market"])
col_line     = find_col(df, ["Line", "line_dfD", "posted_line"])
col_p_model  = find_col(df, ["POverModel","p_over_model","P_over_model","pModel","p_model"])
col_over_dec = find_col(df, ["OverDecimal","over_dec","OverDec","OverDecimalOdds"])

required = [col_player, col_market, col_p_model, col_over_dec]
if any(x is None for x in required):
    missing = [n for n, x in zip(
        ["Player","Market","POverModel","OverDecimal"], required) if x is None]
    raise KeyError(f"Missing required columns: {missing}")

# ---------- Build working frame ----------
opt_cols = [col_team, col_opp, col_line, find_col(df, ["ProjMean"]), find_col(df, ["ProjSD"]), "ProjMins", "POverImp", "POverFair"]
keep = [c for c in [col_player, col_market, col_over_dec, col_p_model] + opt_cols if c and c in df.columns]
d = df[keep].copy()

# Coerce numerics (robust)
def _to_float(s):
    try:
        return float(s)
    except Exception:
        m = re.search(r"[-+]?\d+(?:\.\d+)?", str(s))
        return float(m.group()) if m else np.nan

for c in [col_over_dec, col_p_model, col_line] if col_line else [col_over_dec, col_p_model]:
    d[c] = d[c].apply(_to_float)

# ---------- Derived metrics ----------
d["p_over_price"] = 1.0 / d[col_over_dec]
d["edge_over"]    = d[col_p_model] - d["p_over_price"]
d["EV_over_1u"]   = d[col_p_model] * (d[col_over_dec] - 1.0) - (1.0 - d[col_p_model])

# Label like "Player o7.5"
def fmt_line(x):
    return "" if pd.isna(x) else f"{x:g}"
d["label"] = d.apply(
    lambda r: f"{r[col_player]} o{fmt_line(r[col_line])}" if col_line else f"{r[col_player]}",
    axis=1
)

# Filter usable rows
viz = d[
    d[col_over_dec].notna() &
    d[col_p_model].notna() &
    (d[col_over_dec] > 1.0) &
    (d[col_p_model].between(0.01, 0.99))
].copy()

print("Usable rows for visuals:", len(viz))
if viz.empty:
    print("No usable rows to visualize. Sample:")
    print(d.head(10))
else:
    # ---------- Output paths ----------
    outdir = "data/bets/visuals"
    os.makedirs(outdir, exist_ok=True)
    stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")

    # ---------- Color by market ----------
    color_map = {"PTS": "C0", "REB": "C1", "AST": "C2"}
    fig, ax = plt.subplots(figsize=(10, 8))
    for mkt, grp in viz.groupby(col_market):
        ax.scatter(
            grp[col_over_dec], grp[col_p_model],
            alpha=0.65, s=40, label=mkt, c=color_map.get(str(mkt), "C3")
        )

    ax.set_xlabel("Over decimal odds")
    ax.set_ylabel("Model P(Over)")
    ax.set_title(f"Model Probability vs Over Decimal Odds\n({len(viz)} bets)")

    # Break-even curve & shaded "value zone"
    x_min = max(1.01, float(viz[col_over_dec].min()))
    x_max = float(viz[col_over_dec].max())
    x = np.linspace(x_min, x_max, 300)
    y = 1.0 / x
    ax.plot(x, y, color="red", linewidth=2, label="Break-even line")
    ax.fill_between(x, y, 1.0, color="green", alpha=0.08, label="Value zone")

    # Label top N by edge
    TOP_N_LABELS = 20
    to_label = viz.sort_values("edge_over", ascending=False).head(TOP_N_LABELS)
    for _, r in to_label.iterrows():
        ax.annotate(
            r["label"],
            (r[col_over_dec], r[col_p_model]),
            textcoords="offset points", xytext=(5, 4),
            fontsize=8, color="black"
        )

    ax.legend(loc="best", title="Market")
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    scatter_path = os.path.join(outdir, f"prob_vs_decimal_{stamp}.png")
    fig.savefig(scatter_path, dpi=150, bbox_inches="tight")
    plt.show()
    print("Saved:", scatter_path)

    # ---------- Calibration plot ----------
    fig, ax = plt.subplots(figsize=(10, 8))
    viz_cal = viz.copy()
    viz_cal["p_market_over"] = 1.0 / viz_cal[col_over_dec]
    for mkt, grp in viz_cal.groupby(col_market):
        ax.scatter(
            grp["p_market_over"], grp[col_p_model],
            alpha=0.65, s=40, label=mkt, c=color_map.get(str(mkt), "C3")
        )
    ax.plot([0, 1], [0, 1], color="red", linewidth=2, label="Perfect calibration")
    ax.set_xlabel("Market implied P(Over) = 1 / OverDecimal")
    ax.set_ylabel("Model P(Over)")
    ax.set_title(f"Calibration: Model vs Market\n({len(viz_cal)} bets)")
    ax.grid(True, alpha=0.3)
    ax.legend(loc="best", title="Market")
    fig.tight_layout()
    calib_path = os.path.join(outdir, f"calibration_{stamp}.png")
    fig.savefig(calib_path, dpi=150, bbox_inches="tight")
    plt.show()
    print("Saved:", calib_path)

    # ---------- Edge distribution ----------
    edges = viz["edge_over"].dropna()
    if not edges.empty:
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.hist(edges, bins=30, alpha=0.75, edgecolor="black")
        ax.set_xlabel("Edge = Model P(Over) - Market implied P")
        ax.set_ylabel("Count")
        ax.set_title(f"Edge Distribution (mean={edges.mean():.3f}, sd={edges.std():.3f})")
        ax.grid(True, alpha=0.3)
        fig.tight_layout()
        hist_path = os.path.join(outdir, f"edge_distribution_{stamp}.png")
        fig.savefig(hist_path, dpi=150, bbox_inches="tight")
        plt.show()
        print("Saved:", hist_path)

    # ---------- Top 20 preview + export ----------
    top = viz.sort_values("edge_over", ascending=False).head(50).copy()
    show_cols = [c for c in [
        col_player, col_team, col_opp, col_market, col_line,
        col_over_dec, col_p_model, "p_over_price", "edge_over", "EV_over_1u"
    ] if c in top.columns]
    print("\nTop 20 value bets (by edge):")
    if not top.empty:
        print(top.head(20)[show_cols].to_string(index=False,
              float_format=lambda x: f"{x:.3f}" if isinstance(x, float) else str(x)))
    outdir_csv = os.path.join("data/bets/visuals")
    os.makedirs(outdir_csv, exist_ok=True)
    top_path = os.path.join(outdir_csv, f"top_value_{stamp}.csv")
    top.to_csv(top_path, index=False)
    print("Saved ranked value table ‚Üí", top_path)

print("\nVisualization complete. Check the 'data/bets/visuals' folder for results.")


In [None]:
# === Visual: Model P(Under) vs Under Decimal Odds ‚Äî color by market with legend (top 20 labels) ===
import os, glob, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

# -------- 1) Load latest decimal bets file --------
cands = sorted(glob.glob("data/bets/value_bets_top100_*_decimal.csv"))
if not cands:
    cands = sorted(glob.glob("data/bets/value_bets_top100_*.csv"))
    if not cands:
        raise FileNotFoundError("No value bets files found in data/bets/")
path = cands[-1]
print("Using:", path)
df = pd.read_csv(path)

# -------- 2) Column resolution (tolerant to naming) --------
def find_col(dframe, names):
    cols = {re.sub(r"[\W_]+","", c).lower(): c for c in dframe.columns.astype(str)}
    for n in names:
        k = re.sub(r"[\W_]+","", n).lower()
        if k in cols: return cols[k]
    return None

col_player = find_col(df, ["Player"])
col_market = find_col(df, ["Market"])
col_line   = find_col(df, ["Line","line_dfD","posted_line","line"])
col_punder = find_col(df, ["PUnderModel","p_under_model"])
col_pover  = find_col(df, ["POverModel","p_over_model"])  # fallback to compute p_under
col_underD = find_col(df, ["UnderDecimal","under_dec","UnderDec","UnderDecimalOdds"])
col_underUS= find_col(df, ["UnderOdds_American","UnderOdds","under_odds"])  # for conversion if needed

# American ‚Üí decimal (fallback)
def american_to_decimal(a):
    if pd.isna(a): return np.nan
    try:
        a = float(a)
    except Exception:
        m = re.search(r"[-+]?\d+(?:\.\d+)?", str(a))
        a = float(m.group()) if m else np.nan
    if pd.isna(a): return np.nan
    return 1.0 + (a/100.0) if a > 0 else 1.0 + (100.0/abs(a))

# Build UnderDecimal if missing
if col_underD is None and col_underUS is not None:
    df["UnderDecimal_fallback"] = df[col_underUS].map(american_to_decimal)
    col_underD = "UnderDecimal_fallback"

need = [col_player, col_market, col_line, col_underD]
miss = [n for n,v in zip(["Player","Market","Line","UnderDecimal"], need) if v is None]
if miss:
    raise KeyError(f"Missing required columns in file: {miss}")

# Canonicalize core columns
df = df.rename(columns={
    col_player:"player",
    col_market:"market",
    col_line:"line",
    col_underD:"under_dec",
})

# p_under_model: use direct column if present, else 1 - p_over_model
if col_punder:
    df = df.rename(columns={col_punder:"p_under_model"})
    df["p_under_model"] = pd.to_numeric(df["p_under_model"], errors="coerce")
elif col_pover:
    df["p_under_model"] = 1.0 - pd.to_numeric(df[col_pover], errors="coerce")
else:
    raise KeyError("Need either PUnderModel/p_under_model or POverModel/p_over_model to derive P(Under).")

# Ensure numeric types
for c in ["line","under_dec"]:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# -------- 3) Metrics & filter for plotting --------
df["p_under_price"] = 1.0 / df["under_dec"]
df["edge_under"]    = df["p_under_model"] - df["p_under_price"]

viz = df[
    df["under_dec"].notna() & (df["under_dec"] > 1.0) &
    df["p_under_model"].between(0.01, 0.99)
].copy()

if viz.empty:
    raise SystemExit("No usable rows to plot (need UnderDecimal>1 and valid PUnderModel).")

# Short label "Player u7.5"
def fmt_line(x):
    return "" if pd.isna(x) else f"{x:g}"
viz["label"] = viz.apply(lambda r: f"{r['player']} u{fmt_line(r['line'])}", axis=1)

# -------- 4) Scatter colored by market with legend (mirrors Over plot style) --------
MARKETS = ["PTS","REB","AST"]
color_map = {"PTS": "C0", "REB": "C1", "AST": "C2"}  # consistent with OVER plot

fig, ax = plt.subplots(figsize=(10, 8))
for mkt, grp in viz.groupby(viz["market"].astype(str).str.upper()):
    ax.scatter(
        grp["under_dec"], grp["p_under_model"],
        s=40, alpha=0.65, label=mkt, c=color_map.get(mkt, "C3")
    )

ax.set_xlabel("Under decimal odds")
ax.set_ylabel("Model P(Under)")
ax.set_title(f"Model Probability vs Under Decimal Odds\n({len(viz)} bets)")

# Break-even line and shaded "value zone" for UNDERS (y > 1/x)
x_min = max(1.01, float(viz["under_dec"].min()))
x_max = float(viz["under_dec"].max())
x = np.linspace(x_min, x_max, 300)
y = 1.0 / x
ax.plot(x, y, color="red", linewidth=2, label="Break-even line")
ax.fill_between(x, y, 1.0, color="green", alpha=0.08, label="Value zone")

# Label only the top 20 by edge_under
TOP_N_LABELS = 20
to_label = viz.sort_values("edge_under", ascending=False).head(TOP_N_LABELS)
for _, r in to_label.iterrows():
    ax.annotate(
        r["label"],
        (r["under_dec"], r["p_under_model"]),
        textcoords="offset points", xytext=(5, 4),
        fontsize=8, color="black"
    )

ax.legend(loc="best", title="Market")
ax.grid(True, alpha=0.3)
fig.tight_layout()

# Save
outdir = "data/bets/visuals"
os.makedirs(outdir, exist_ok=True)
stamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
out_path = os.path.join(outdir, f"prob_vs_under_decimal_{stamp}.png")
fig.savefig(out_path, dpi=150, bbox_inches="tight")
plt.show()
print("Saved:", out_path)


In [None]:
# === EVALUATE YESTERDAY'S BETS (Europe/Athens) ‚Äî model-driven suggestions (OVER/UNDER) ===
import os, re, glob
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

# ------------------ settings: yesterday in Europe/Athens ------------------
TZ = ZoneInfo("Europe/Athens")
today_local = datetime.now(TZ).date()
ydate = today_local - timedelta(days=1)
ystr = ydate.strftime("%Y%m%d")
print(f"Evaluating bets for YESTERDAY (Europe/Athens): {ydate} ({ystr})")
print(f"Today is: {today_local}")

# where to save evaluation CSVs
os.makedirs("data/eval", exist_ok=True)

# ------------------ helpers ------------------
def _norm_player(s):
    if not isinstance(s, str): return ""
    return re.sub(r"[.`'‚Äô\-]", "", s.strip()).lower()

def pick_col(df, candidates, default=np.nan):
    for c in candidates:
        if c in df.columns:
            return df[c]
    return pd.Series([default]*len(df))

def _first_float(x):
    if pd.isna(x): return np.nan
    m = re.search(r"[-+]?\d+(?:\.\d+)?", str(x))
    return float(m.group()) if m else np.nan

def infer_opponent(df):
    if "OPPONENT_ABBREVIATION" in df.columns:
        return df["OPPONENT_ABBREVIATION"]
    matchup = pick_col(df, ["MATCHUP","Matchup"])
    team = pick_col(df, ["TEAM_ABBREVIATION","TEAM"])
    out = []
    for t, m in zip(team.fillna(""), matchup.fillna("")):
        opp = np.nan
        if isinstance(m, str) and m:
            parts = re.split(r"[@vVsS]+\.*", m)
            if len(parts) >= 2:
                cand = parts[-1].strip().upper()
                if cand == str(t).upper() and len(parts) >= 2:
                    cand = parts[0].strip().upper()
                opp = cand
        out.append(opp)
    return pd.Series(out, index=df.index)

def parse_date8_from_name(path):
    m = re.search(r"(20\d{6})", os.path.basename(path))
    return m.group(1) if m else None

# ------------------ 1) Find the bets file in data/bets with the requested structure ------------------
required_cols = {
    "Player","Team","Opponent","Market","Line",
    "OverOdds_American","UnderOdds_American","POverImp","PUnderImp",
    "POverFair","PUnderFair","ProjMins","ProjMean","ProjSD",
    "POverModel","PUnderModel","OverDecimal","UnderDecimal"
}

# Accept case-insensitive match and allow underscores vs camel
def has_required_columns(path):
    try:
        if path.lower().endswith(".csv"):
            head = pd.read_csv(path, nrows=0)
        else:
            with pd.ExcelFile(path) as xf:
                head = pd.read_excel(path, sheet_name=xf.sheet_names[0], nrows=0)
        cols_norm = {re.sub(r"[\W_]+", "", c).lower() for c in head.columns.astype(str)}
        need_norm = {re.sub(r"[\W_]+", "", c).lower() for c in required_cols}
        return need_norm.issubset(cols_norm)
    except Exception:
        return False

candidates = sorted(
    glob.glob("data/bets/*.csv") + glob.glob("data/bets/*.xlsx")
)

# Filter to files that have the required columns
candidates = [p for p in candidates if has_required_columns(p)]
if not candidates:
    raise FileNotFoundError(
        "No bets files in data/bets matching the required columns: "
        "Player, Team, Opponent, Market, Line, OverOdds_American, UnderOdds_American, "
        "POverImp, PUnderImp, POverFair, PUnderFair, ProjMins, ProjMean, ProjSD, "
        "POverModel, PUnderModel, OverDecimal, UnderDecimal"
    )

print(f"Found {len(candidates)} candidate files with required columns:")
for c in candidates:
    print(f"  - {os.path.basename(c)}")

# Look for bets file with the SAME date as yesterday
bets_path = None
for p in candidates:
    ds = parse_date8_from_name(p)
    if ds == ystr:  # Exact match for yesterday's date
        bets_path = p
        print(f"Found exact match for yesterday: {os.path.basename(bets_path)}")
        break

# If no exact match, use the most recent file BEFORE yesterday
if bets_path is None:
    dated_files = []
    for p in candidates:
        ds = parse_date8_from_name(p)
        if ds and ds <= ystr:  # Only consider files dated on or before yesterday
            dated_files.append((ds, p))
    
    if dated_files:
        dated_files.sort(key=lambda x: x[0])
        bets_path = dated_files[-1][1]  # Use the most recent one
        print(f"Warning: No bets file found for {ystr}. Using most recent available: {os.path.basename(bets_path)}")
    else:
        # Fallback to latest file (even if future dated)
        bets_path = candidates[-1]
        print(f"Warning: No dated bets files found. Using: {os.path.basename(bets_path)}")

print("Using bets file:", os.path.basename(bets_path))

# ------------------ 2) Load bets ------------------
if bets_path.lower().endswith(".csv"):
    bets = pd.read_csv(bets_path)
else:
    with pd.ExcelFile(bets_path) as xf:
        bets = pd.read_excel(bets_path, sheet_name=xf.sheet_names[0])

print(f"Loaded bets rows: {len(bets)}")

# Map to canonical names using EXACT column names from your file
cols = {c.lower(): c for c in bets.columns}
def col(name_variants):
    for v in name_variants:
        key = v.lower()
        if key in cols: return cols[key]
    return None

# Use the exact column names from your file
cn_player = col(["Player"])
cn_team   = col(["Team"])
cn_opp    = col(["Opponent"])
cn_mkt    = col(["Market"])
cn_line   = col(["Line"])
cn_pom    = col(["POverModel"])
cn_pum    = col(["PUnderModel"])
cn_od     = col(["OverDecimal"])
cn_ud     = col(["UnderDecimal"])

# Check for required columns
need = [cn_player, cn_team, cn_opp, cn_mkt, cn_line, cn_pom, cn_od, cn_ud]
if any(x is None for x in need):
    missing = [n for n, x in zip(
        ["Player","Team","Opponent","Market","Line","POverModel","OverDecimal","UnderDecimal"], need) if x is None]
    raise KeyError(f"Missing expected columns in bets file: {missing}")

# Rename columns to standard names for processing
bets = bets.rename(columns={
    cn_player: "player",
    cn_team: "team", 
    cn_opp: "opponent",
    cn_mkt: "market", 
    cn_line: "line",
    cn_pom: "p_over_model", 
    cn_pum: "p_under_model",
    cn_od: "over_dec", 
    cn_ud: "under_dec"
})

# Convert numeric columns
bets["line"] = pd.to_numeric(bets["line"], errors="coerce")
bets["p_over_model"] = pd.to_numeric(bets["p_over_model"], errors="coerce")
bets["p_under_model"] = pd.to_numeric(bets["p_under_model"], errors="coerce")
bets["over_dec"] = pd.to_numeric(bets["over_dec"], errors="coerce")
bets["under_dec"] = pd.to_numeric(bets["under_dec"], errors="coerce")

# Add key for joining to box scores
bets["player_key"] = bets["player"].map(_norm_player)

# ------------------ 3) Load boxscores for yesterday (from data_raw/) ------------------
BOXSCORE_DIR = "data_raw"

def load_boxscores_for_date(target_date):
    """Load boxscores for a specific date, handling various file naming patterns"""
    target_date_str = target_date.strftime("%Y%m%d")
    
    # Try multiple file patterns
    patterns = [
        f"nba_boxscores_{target_date_str}.csv",
        f"nba_boxscores_*{target_date_str}*.csv",
        "nba_boxscores_*.csv"  # season file
    ]
    
    for pattern in patterns:
        matches = glob.glob(os.path.join(BOXSCORE_DIR, pattern))
        if matches:
            # Use the most recent file if multiple matches
            box_file = sorted(matches)[-1]
            box = pd.read_csv(box_file)
            print(f"Loaded boxscores from: {os.path.basename(box_file)}")
            
            # Filter to target date
            date_cols = ["GAME_DATE", "GAME_DATE_EST", "GAME_DATE_LCL", "Date", "date"]
            for date_col in date_cols:
                if date_col in box.columns:
                    box_dates = pd.to_datetime(box[date_col], errors='coerce').dt.date
                    filtered = box[box_dates == target_date].copy()
                    if len(filtered) > 0:
                        print(f"Filtered to {target_date} using column '{date_col}': {len(filtered)} rows")
                        return filtered
            
            # If no date filtering worked but we have data, return all
            if len(box) > 0:
                print(f"Warning: Could not filter by date. Using all {len(box)} rows.")
                return box
    
    raise FileNotFoundError(f"No boxscore data found for {target_date}")

try:
    box = load_boxscores_for_date(ydate)
except FileNotFoundError as e:
    print(e)
    # Create empty evaluation file
    eval_out = os.path.join("data/eval", f"value_bets_eval_{ystr}.csv")
    bets.assign(
        actual=np.nan, suggestion="NO BET", suggested_prob=np.nan, result_model="NA"
    ).to_csv(eval_out, index=False)
    print(f"Saved empty evaluation to: {eval_out}")
    raise SystemExit

# ------------------ 4) Normalize box & join ------------------
box = box.copy()
box["player"] = pick_col(box, ["PLAYER_NAME","Player"])
box["player_key"] = box["player"].map(_norm_player)
box["PTS"] = pd.to_numeric(pick_col(box, ["PTS","Points"]), errors="coerce")
box["REB"] = pd.to_numeric(pick_col(box, ["REB","Rebounds"]), errors="coerce")
box["AST"] = pd.to_numeric(pick_col(box, ["AST","Assists"]), errors="coerce")
box["TEAM_ABBREVIATION"] = pick_col(box, ["TEAM_ABBREVIATION","TEAM"])
box["OPPONENT_ABBREVIATION"] = infer_opponent(box)

joined = bets.merge(
    box[["player_key","PTS","REB","AST","TEAM_ABBREVIATION","OPPONENT_ABBREVIATION"]],
    on="player_key", how="left", suffixes=("","_box")
)

def pick_actual(row):
    m = str(row.get("market","")).upper()
    return row.get(m, np.nan) if m in ["PTS","REB","AST"] else np.nan

joined["actual"] = joined.apply(pick_actual, axis=1)

# ------------------ 5) Model suggestion & grading ------------------
THRESH = 0.59  # model-confidence threshold

def suggest_side(row):
    po = row.get("p_over_model", np.nan)
    pu = row.get("p_under_model", np.nan)
    # If both are NaN ‚Üí NO BET
    if pd.isna(po) and pd.isna(pu):
        return "NO BET", np.nan
    # Determine which side meets threshold and pick the higher prob if both do
    cand = []
    if pd.notna(po) and po >= THRESH:
        cand.append(("OVER", float(po)))
    if pd.notna(pu) and pu >= THRESH:
        cand.append(("UNDER", float(pu)))
    if not cand:
        return "NO BET", max([v for v in [po, pu] if pd.notna(v)] + [np.nan])
    # pick the larger probability among the qualifying sides
    cand.sort(key=lambda x: x[1], reverse=True)
    return cand[0]

joined[["suggestion","suggested_prob"]] = joined.apply(
    lambda r: pd.Series(suggest_side(r)), axis=1
)

def grade_row(row):
    side = row.get("suggestion", "NO BET")
    act  = row.get("actual", np.nan)
    line = row.get("line", np.nan)
    if side == "NO BET" or pd.isna(act) or pd.isna(line):
        return "NA"
    if side == "OVER":
        if act > line:  return "WIN"
        if act == line: return "PUSH"
        return "LOSS"
    if side == "UNDER":
        if act < line:  return "WIN"
        if act == line: return "PUSH"
        return "LOSS"
    return "NA"

joined["result_model"] = joined.apply(grade_row, axis=1)

# ------------------ 6) Summary & save ------------------
is_bet = joined["suggestion"].isin(["OVER","UNDER"])
graded = joined.loc[is_bet & joined["result_model"].isin(["WIN","LOSS","PUSH"])]

wins   = (graded["result_model"]=="WIN").sum()
losses = (graded["result_model"]=="LOSS").sum()
pushes = (graded["result_model"]=="PUSH").sum()
hitrate = wins / max(wins+losses, 1)

print(f"Suggested bets (THRESH={THRESH:.2f}): {is_bet.sum()} of {len(joined)} rows")
print(f"Graded bets: {len(graded)}  (WIN={wins}, LOSS={losses}, PUSH={pushes})")
print(f"Hit rate (excl. pushes): {hitrate:.1%}")

# Side-specific breakdown
graded_over  = graded.loc[joined["suggestion"]=="OVER"]
graded_under = graded.loc[joined["suggestion"]=="UNDER"]
def _rate(g):
    w = (g["result_model"]=="WIN").sum()
    l = (g["result_model"]=="LOSS").sum()
    return w / max(w+l,1)

print(f"OVER bets graded:  {len(graded_over)}  | Hit: {_rate(graded_over):.1%}")
print(f"UNDER bets graded: {len(graded_under)} | Hit: {_rate(graded_under):.1%}")

eval_out = os.path.join("data/eval", f"value_bets_eval_{ystr}.csv")
joined.to_csv(eval_out, index=False)
print(f"Saved evaluation to: {eval_out}")

# Preview a few rows with the actual column names from your file
cols_preview = [
    "player","team","opponent","market","line","actual",
    "p_over_model","p_under_model","over_dec","under_dec",
    "suggestion","suggested_prob","result_model",
    "TEAM_ABBREVIATION","OPPONENT_ABBREVIATION"
]
print("\nPreview:")
print(joined[ [c for c in cols_preview if c in joined.columns] ].head(25).to_string(index=False))

In [None]:
import pandas as pd

# Load feature importance file
fi = pd.read_csv("model_outputs_rate/feature_importances_20251106_140212.csv")

# Preview to see what columns exist
print(fi.head())

# Normalize and compute mean importance
cols = [c for c in fi.columns if "importance" in c.lower()]
fi["mean_importance"] = fi[cols].mean(axis=1)
fi = fi.sort_values("mean_importance", ascending=False)

# Show top & bottom
print("üèÜ Top 20 most important features:")
print(fi.head(20).to_string(index=False))

print("\nü™∂ Bottom 20 least important features:")
print(fi.tail(20).to_string(index=False))


## TEAM LEVEL PREDICTIONS

In [None]:
# -- Cell 20: Build team-game dataset for team models (fixed) ----------
import numpy as np
import pandas as pd

# We assume logs_2324, logs_2425, logs_2526 already loaded in your data cells.
for name in ["logs_2324", "logs_2425", "logs_2526"]:
    assert name in globals(), f"{name} missing. Make sure your log DataFrames are loaded."

def _build_team_games_from_logs(logs: pd.DataFrame, season_label: str) -> pd.DataFrame:
    """
    Aggregate player boxscores into team-level game rows.
    Each row = one team in one game (so 2 rows per game_id).
    Does NOT require an 'opponent' column in the logs; we reconstruct opponent later.
    """
    df = logs.copy()

    # Standardize some expected columns if needed
    col_map = {
        "TEAM_ABBREVIATION": "team",
        "GAME_ID": "game_id",
        "GAME_DATE": "game_date",
        "HOME": "home",          # 1 if home, 0 if away (if exists)
        "PLUS_MINUS": "plus_minus",
    }
    for old, new in col_map.items():
        if old in df.columns and new not in df.columns:
            df = df.rename(columns={old: new})

    # Minimal sanity
    if "team" not in df.columns or "game_id" not in df.columns:
        raise RuntimeError("logs must have TEAM_ABBREVIATION and GAME_ID (or mapped).")

    # We aggregate common stats
    agg_spec = {}
    for c in ["PTS","REB","AST","TOV",
              "FGM","FGA","FG3M","FG3A","FTM","FTA",
              "OREB","DREB"]:
        if c in df.columns:
            agg_spec[c] = "sum"

    # Some team-level flags
    if "home" in df.columns:
        agg_spec["home"] = "max"   # if any player flagged home, team is home
    if "game_date" in df.columns:
        agg_spec["game_date"] = "max"
    if "plus_minus" in df.columns:
        agg_spec["plus_minus"] = "sum"

    # Aggregate by game and team (no opponent column needed yet)
    team_games = (
        df.groupby(["game_id","team"], as_index=False)
          .agg(agg_spec)
    )

    # Reconstruct opponent for each (game_id, team) pair
    # For each game_id, there should be exactly 2 teams.
    opp_map = team_games[["game_id","team"]].rename(columns={"team": "opponent"})

    team_games = team_games.merge(opp_map, on="game_id", how="left")

    # Remove self matches so each row has the *other* team as opponent
    team_games = team_games[team_games["team"] != team_games["opponent"]].copy()

    # If there were more than two teams per game_id for some reason,
    # this may duplicate rows. We keep the first opponent per (game_id, team).
    team_games = (
        team_games.sort_values(["game_id","team","opponent"])
                  .drop_duplicates(subset=["game_id","team"])
                  .reset_index(drop=True)
    )

    # Derived team-level fields
    if "FG3M" in team_games.columns and "FGM" in team_games.columns:
        team_games["FG2M"] = team_games["FGM"] - team_games["FG3M"]
    if "FG3A" in team_games.columns and "FGA" in team_games.columns:
        team_games["FG2A"] = team_games["FGA"] - team_games["FG3A"]

    # Possessions (simple estimate)
    if all(c in team_games.columns for c in ["FGA","FTA","TOV","OREB"]):
        team_games["POSS"] = (
            team_games["FGA"]
            + 0.44 * team_games["FTA"]
            + team_games["TOV"]
            - team_games["OREB"]
        )
    else:
        team_games["POSS"] = np.nan

    # Offensive Rating (points per 100 possessions)
    if "PTS" in team_games.columns:
        team_games["OFF_RTG"] = 100 * team_games["PTS"] / team_games["POSS"].replace(0, np.nan)
    else:
        team_games["OFF_RTG"] = np.nan

    # Mark season for weighting later if desired
    team_games["season"] = season_label

    return team_games

team_games_2324 = _build_team_games_from_logs(logs_2324, "2023-24")
team_games_2425 = _build_team_games_from_logs(logs_2425, "2024-25")
team_games_2526 = _build_team_games_from_logs(logs_2526, "2025-26")

team_games_all = pd.concat([team_games_2324, team_games_2425, team_games_2526], ignore_index=True)

# Parse dates
if "game_date" in team_games_all.columns:
    team_games_all["game_date"] = pd.to_datetime(team_games_all["game_date"])

print("team_games_all shape:", team_games_all.shape)
display(team_games_all.head())

In [None]:
# -- Cell 21: Add rolling team features ---------------------------------
import numpy as np
import pandas as pd

assert "team_games_all" in globals(), "team_games_all missing. Run Cell 20_team first."

def add_team_rolling_features(df_team: pd.DataFrame, windows=(5, 10)) -> pd.DataFrame:
    """
    For each team, compute rolling averages and sums over last N games.
    """
    df = df_team.sort_values(["team","game_date"]).copy()

    stat_cols = [c for c in [
        "PTS","REB","AST","TOV","FGM","FGA","FG3M","FG3A",
        "FTM","FTA","FG2M","FG2A","POSS","OFF_RTG"
    ] if c in df.columns]

    for w in windows:
        rolled = (
            df.groupby("team", group_keys=False)[stat_cols]
              .rolling(w, min_periods=1)
              .mean()
              .add_suffix(f"_roll{w}")
        )
        for col in rolled.columns:
            df[col] = rolled[col].values

    # Simple rest features: days since last game, B2B, 3-in-4, etc.
    if "game_date" in df.columns:
        df["prev_game_date"] = (
            df.groupby("team")["game_date"]
              .shift(1)
        )
        df["days_rest"] = (df["game_date"] - df["prev_game_date"]).dt.days.fillna(3)
        df["is_b2b"] = (df["days_rest"] == 1).astype(int)
        df["is_3in4"] = (
            df.groupby("team")["is_b2b"]
              .rolling(3, min_periods=1)
              .sum()
              .reset_index(level=0, drop=True) >= 2
        ).astype(int)
    else:
        df["days_rest"] = 3
        df["is_b2b"] = 0
        df["is_3in4"] = 0

    # Win / loss target and spread
    if "PTS" in df.columns:
        # Merge opponent points to compute WIN & margin
        opp_pts = df[["game_id","team","PTS"]].rename(columns={"team":"team_tmp","PTS":"PTS_opp"})
        df = df.merge(
            opp_pts,
            left_on=["game_id","opponent"],
            right_on=["game_id","team_tmp"],
            how="left"
        )
        df.drop(columns=["team_tmp"], inplace=True)
        df["POINT_MARGIN"] = df["PTS"] - df["PTS_opp"]
        df["WIN"] = (df["POINT_MARGIN"] > 0).astype(int)
    else:
        df["POINT_MARGIN"] = np.nan
        df["WIN"] = np.nan

    return df

team_features_all = add_team_rolling_features(team_games_all)

print("team_features_all shape:", team_features_all.shape)
display(team_features_all.head())

In [None]:
# -- Cell 22: Train team-level prop models ------------------------------
import numpy as np
import pandas as pd
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error, log_loss

assert "team_features_all" in globals(), "team_features_all missing. Run Cell 21_team first."

df = team_features_all.copy()

# Basic sanity: drop rows with missing target essentials
df = df.dropna(subset=["PTS","REB","AST","TOV"]).reset_index(drop=True)

# Feature columns for team models
TEAM_FEATURES = []

# Rolling stats
for base in ["PTS","REB","AST","TOV","FG3M","FG3A","FGA","FTM","FTA","OFF_RTG","POSS"]:
    for w in [5, 10]:
        col = f"{base}_roll{w}"
        if col in df.columns:
            TEAM_FEATURES.append(col)

# Scheduling and context
for col in ["home","days_rest","is_b2b","is_3in4"]:
    if col in df.columns:
        TEAM_FEATURES.append(col)

TEAM_FEATURES = sorted(set(TEAM_FEATURES))
print(f"Using {len(TEAM_FEATURES)} features for team models.")

gkf = GroupKFold(n_splits=8)
models_team_reg = {}
cv_scores_team_reg = {}

def train_team_regressor(df_in: pd.DataFrame, target_col: str):
    df_ = df_in.dropna(subset=TEAM_FEATURES + [target_col, "team"]).copy()
    X = df_[TEAM_FEATURES]
    y = df_[target_col]
    groups = df_["team"]

    fold_mae = []
    model = XGBRegressor(
        n_estimators=600,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        reg_alpha=0.0,
        random_state=42,
        n_jobs=-1,
        verbosity=0
    )

    for tr_idx, te_idx in gkf.split(X, y, groups):
        X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
        y_tr, y_te = y.iloc[tr_idx], y.iloc[te_idx]
        model.fit(X_tr, y_tr)
        pred = model.predict(X_te)
        fold_mae.append(mean_absolute_error(y_te, pred))

    model.fit(X, y)
    return model, float(np.mean(fold_mae)), float(np.std(fold_mae))

# Train regressors for common team prop stats
TEAM_TARGETS = ["PTS","REB","AST","TOV","FG3M","FG2M","FTM"]
for tgt in TEAM_TARGETS:
    if tgt not in df.columns:
        print(f"Skipping {tgt}: not in data.")
        continue
    model, m, s = train_team_regressor(df, tgt)
    models_team_reg[tgt] = model
    cv_scores_team_reg[tgt] = (m, s)
    print(f"üìä Team {tgt} MAE: {m:.3f} ¬± {s:.3f}")

# Train classification model for WIN
df_cls = df.dropna(subset=TEAM_FEATURES + ["WIN"]).copy()
X = df_cls[TEAM_FEATURES]
y = df_cls["WIN"].astype(int)
groups = df_cls["team"]

win_model = XGBClassifier(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    reg_alpha=0.0,
    random_state=42,
    n_jobs=-1,
    eval_metric="logloss",
    verbosity=0
)

fold_ll = []
for tr_idx, te_idx in gkf.split(X, y, groups):
    X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
    y_tr, y_te = y.iloc[tr_idx], y.iloc[te_idx]
    win_model.fit(X_tr, y_tr)
    p_te = win_model.predict_proba(X_te)[:,1]
    # Clip probs for logloss stability
    p_te = np.clip(p_te, 1e-4, 1-1e-4)
    fold_ll.append(log_loss(y_te, p_te))

win_model.fit(X, y)
cv_win_logloss = (float(np.mean(fold_ll)), float(np.std(fold_ll)))

print(f"\nüèÜ WIN model logloss: {cv_win_logloss[0]:.4f} ¬± {cv_win_logloss[1]:.4f}")

In [None]:
# -- Cell 23: Build today's games from odds_long ------------------------
import pandas as pd
from datetime import datetime

assert "odds_long" in globals(), "odds_long not found. Run the odds scraper cell first."

# Step 1: filter to today's date
today_str = datetime.utcnow().strftime("%Y-%m-%d")
odds_today = odds_long[odds_long["game_date"] == today_str].copy()

if odds_today.empty:
    print(f"‚ö†Ô∏è No odds found for today ({today_str}).")
else:
    # Clean opponent & away flag
    odds_today["opp_clean"] = odds_today["opponent"].astype(str).str.replace("@","", regex=False).str.strip()
    odds_today["is_away"] = odds_today["opponent"].astype(str).str.startswith("@")

    # Determine home/away teams
    odds_today["home_team"] = odds_today.apply(
        lambda r: r["opp_clean"] if r["is_away"] else r["team"], axis=1
    )
    odds_today["away_team"] = odds_today.apply(
        lambda r: r["team"] if r["is_away"] else r["opp_clean"], axis=1
    )

    # Build unique game key
    odds_today["game_key"] = (
        odds_today["game_date"].astype(str) + "_" +
        odds_today["home_team"] + "_" +
        odds_today["away_team"]
    )

    games_unique = (
        odds_today[["game_date","game_key","home_team","away_team"]]
        .drop_duplicates("game_key")
        .reset_index(drop=True)
    )

    print("üìå Games detected from odds:")
    print(games_unique.to_string(index=False))

    # Build final today_games (home and away rows)
    today_games = pd.concat([
        # Home side
        games_unique.assign(
            game_id = games_unique["game_key"],
            team = games_unique["home_team"],
            opponent = games_unique["away_team"],
            home = 1
        )[["game_id","team","opponent","home","game_date"]],

        # Away side
        games_unique.assign(
            game_id = games_unique["game_key"],
            team = games_unique["away_team"],
            opponent = games_unique["home_team"],
            home = 0
        )[["game_id","team","opponent","home","game_date"]],
    ], ignore_index=True)

    print("\n‚úÖ today_games:")
    print(today_games.to_string(index=False))

In [None]:
# -- Cell 24: Enrich team_features_all with opponent, pace, rest, Elo, matchup ---
import numpy as np
import pandas as pd
from collections import defaultdict

assert "team_features_all" in globals(), "team_features_all missing."
assert "team_games_all" in globals(), "team_games_all missing."

# Base frame: one row per team-game with existing rolling stats
df = team_features_all.sort_values(["team", "game_date"]).copy()

# -------------------------------------------------------------------
# 1) Merge in opponent RAW stats for each game (what this team allows)
# -------------------------------------------------------------------
opp_raw = (
    team_games_all[
        [
            "game_id",
            "team",
            "PTS",
            "REB",
            "AST",
            "TOV",
            "FG3M",
            "FG2M",
            "FTM",
            "POSS",
        ]
    ]
    .rename(
        columns={
            "team": "opponent",
            "PTS": "opp_PTS_raw",
            "REB": "opp_REB_raw",
            "AST": "opp_AST_raw",
            "TOV": "opp_TOV_raw",
            "FG3M": "opp_FG3M_raw",
            "FG2M": "opp_FG2M_raw",
            "FTM": "opp_FTM_raw",
            "POSS": "opp_POSS_raw",
        }
    )
)

df = df.merge(opp_raw, on=["game_id", "opponent"], how="left")

# -------------------------------------------------------------------
# 2) Rolling ALLOWED stats per team ‚Üí defensive profile
#     (shifted by 1 to avoid look-ahead leakage)
# -------------------------------------------------------------------
for stat in ["PTS", "REB", "AST", "TOV", "FG3M", "FG2M", "FTM", "POSS"]:
    col = f"opp_{stat}_raw"
    if col not in df.columns:
        continue
    for w in (5, 10):
        roll_col = f"{stat}_allowed_roll{w}"
        df[roll_col] = (
            df.groupby("team")[col]
              .transform(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
        )

# -------------------------------------------------------------------
# 3) Rest days and simple flags
# -------------------------------------------------------------------
df = df.sort_values(["team", "game_date"])
df["prev_game_date"] = df.groupby("team")["game_date"].shift(1)
df["days_rest"] = (df["game_date"] - df["prev_game_date"]).dt.days.clip(lower=0)
df["is_b2b"] = (df["days_rest"] == 1).astype(int)

# Rest advantage vs opponent in the same game
if "game_id" in df.columns:
    mean_rest_by_game = df.groupby("game_id")["days_rest"].transform("mean")
    df["is_rest_advantage"] = (df["days_rest"] > mean_rest_by_game).astype(int)

# -------------------------------------------------------------------
# 4) Projected pace as the average of available rolling possessions
# -------------------------------------------------------------------
pace_cols = [c for c in df.columns if "POSS_roll" in c]
if pace_cols:
    df["proj_pace"] = df[pace_cols].mean(axis=1)

# -------------------------------------------------------------------
# 5) Build Elo ratings (FiveThirtyEight-style)
# -------------------------------------------------------------------
def compute_elo_table(games: pd.DataFrame,
                      base_elo: float = 1500.0,
                      k_factor: float = 20.0,
                      hca_points: float = 100.0,
                      carry_over: float = 0.75,
                      mean_elo: float = 1500.0) -> pd.DataFrame:
    """
    Compute per-team pre-game Elo ratings for each game_id/team.
    Expect columns: game_id, game_date, season, team, opponent, PTS, home.
    """
    required_cols = {"game_id", "game_date", "season", "team", "opponent", "PTS"}
    missing = required_cols - set(games.columns)
    if missing:
        raise ValueError(f"compute_elo_table: games missing columns: {missing}")

    games_local = games.copy()
    games_local = games_local.sort_values(["season", "game_date", "game_id"])

    # Ensure 'home' exists; if not, approximate (first row home)
    if "home" not in games_local.columns:
        # Assume that for each game, exactly one row is home (we'll assign later if absent)
        games_local["home"] = 0

    # Per-team current Elo and last season tracking
    elo = defaultdict(lambda: base_elo)
    last_season = {}

    records = []

    # Group by game_id in chronological order
    # Use sort=False to preserve order from the sorted frame
    for gid, g in games_local.groupby("game_id", sort=False):
        if len(g) != 2:
            # Skip games without exactly 2 team-rows
            continue

        g = g.copy()

        # Identify home/away
        if "home" in g.columns and g["home"].isin([0, 1]).all():
            g_home = g[g["home"] == 1]
            if len(g_home) == 1:
                home_row = g_home.iloc[0]
                away_row = g[g["team"] != home_row["team"]].iloc[0]
            else:
                # Fallback: first row home
                home_row = g.iloc[0]
                away_row = g.iloc[1]
        else:
            home_row = g.iloc[0]
            away_row = g.iloc[1]

        # Make sure season is defined for both rows (assume same)
        season = home_row["season"]

        for row in (home_row, away_row):
            t = row["team"]
            s = row["season"]
            if t not in last_season:
                last_season[t] = s
            elif last_season[t] != s:
                # New season for this team: regress Elo to mean
                elo[t] = carry_over * elo[t] + (1 - carry_over) * mean_elo
                last_season[t] = s

        team_home = home_row["team"]
        team_away = away_row["team"]

        pts_home = home_row["PTS"]
        pts_away = away_row["PTS"]

        # Pre-game Elo
        Ra = elo[team_home]
        Rb = elo[team_away]

        # Home-court advantage: +HCA for home
        elo_diff = (Ra + hca_points) - Rb
        exp_home = 1.0 / (1.0 + 10.0 ** (-elo_diff / 400.0))
        exp_away = 1.0 - exp_home

        # Margin of victory multiplier (approx 538's formula)
        margin = abs(pts_home - pts_away)
        mov_mult = ((margin + 3) ** 0.8) / (7.5 + 0.006 * abs(elo_diff))

        # Outcome
        if pts_home > pts_away:
            # Home win
            Ra_new = Ra + k_factor * mov_mult * (1 - exp_home)
            Rb_new = Rb - k_factor * mov_mult * (1 - exp_home)
        else:
            # Away win
            Ra_new = Ra - k_factor * mov_mult * (1 - exp_away)
            Rb_new = Rb + k_factor * mov_mult * (1 - exp_away)

        # Store pre-game Elo for each team
        records.append(
            {
                "game_id": home_row["game_id"],
                "team": team_home,
                "elo_pre": Ra,
                "opp_team": team_away,
                "opp_elo_pre": Rb,
                "elo_diff_pre": Ra - Rb,
            }
        )
        records.append(
            {
                "game_id": away_row["game_id"],
                "team": team_away,
                "elo_pre": Rb,
                "opp_team": team_home,
                "opp_elo_pre": Ra,
                "elo_diff_pre": Rb - Ra,
            }
        )

        # Update Elo
        elo[team_home] = Ra_new
        elo[team_away] = Rb_new

    elo_df = pd.DataFrame(records)
    return elo_df


# Compute Elo from team_games_all and merge into df
elo_input_cols = ["game_id", "game_date", "season", "team", "opponent", "PTS"]
missing_elo_cols = set(elo_input_cols) - set(team_games_all.columns)
if missing_elo_cols:
    raise ValueError(f"team_games_all missing required Elo cols: {missing_elo_cols}")

elo_table = compute_elo_table(team_games_all[elo_input_cols + (["home"] if "home" in team_games_all.columns else [])])

df = df.merge(
    elo_table[["game_id", "team", "elo_pre", "opp_elo_pre", "elo_diff_pre"]],
    on=["game_id", "team"],
    how="left",
)

# -------------------------------------------------------------------
# 6) Opponent rolling features (offense + defense)
#     ‚Üí we self-merge df to bring opponent rolling stats into each row.
# -------------------------------------------------------------------
rolling_cols = [
    c
    for c in df.columns
    if (("_roll5" in c or "_roll10" in c) and not c.startswith("opp_"))
]

if rolling_cols:
    opp_feats = df[["game_id", "team"] + rolling_cols].copy()
    rename_map = {"team": "opponent"}
    rename_map.update({c: f"opp_{c}" for c in rolling_cols})
    opp_feats = opp_feats.rename(columns=rename_map)

    df = df.merge(opp_feats, on=["game_id", "opponent"], how="left")

# -------------------------------------------------------------------
# 7) Matchup features ‚Üí team strengths vs opponent weaknesses
# -------------------------------------------------------------------
# Example: PTS_roll10 - opp_PTS_allowed_roll10, etc.
stats_for_matchup = ["PTS", "FG3M", "FG2M", "REB", "AST", "TOV", "POSS"]

for stat in stats_for_matchup:
    for w in (5, 10):
        off_col = f"{stat}_roll{w}"
        def_col = f"opp_{stat}_allowed_roll{w}"
        if off_col in df.columns and def_col in df.columns:
            df[f"matchup_{stat}_roll{w}"] = df[off_col] - df[def_col]

# Pace matchup (optionally: own pace + opp allowed pace)
for w in (5, 10):
    off_col = f"POSS_roll{w}"
    def_col = f"opp_POSS_allowed_roll{w}"
    if off_col in df.columns and def_col in df.columns:
        df[f"matchup_POSS_roll{w}"] = df[off_col] + df[def_col]

team_features_all_enriched = df

print("‚úÖ Built team_features_all_enriched:", team_features_all_enriched.shape)
print(team_features_all_enriched.tail().to_string(index=False))

In [None]:
# -- Cell 25: Train team models with enriched features --------------
import numpy as np
import pandas as pd
from xgboost import XGBRegressor, XGBClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error, log_loss

assert "team_features_all_enriched" in globals(), "team_features_all_enriched missing; run Cell 36_team_fixed."

df = team_features_all_enriched.copy()

# Basic sanity: drop rows with missing target essentials
df = df.dropna(subset=["PTS", "REB", "AST", "TOV"]).reset_index(drop=True)

# -------------------------------------------------------------------
# Build candidate feature list:
#   - rolling offense/defense
#   - opponent rolling
#   - matchup features
#   - Elo
#   - pace & rest
#   - home & injuries
# -------------------------------------------------------------------
candidate_feats = []
for c in df.columns:
    if any(
        tok in c
        for tok in [
            "_roll5",
            "_roll10",
            "_allowed_roll5",
            "_allowed_roll10",
            "proj_pace",
            "days_rest",
            "is_b2b",
            "is_rest_advantage",
            "home",
            "missing_usage_team",
            "elo_pre",
            "elo_diff_pre",
            "opp_",
            "matchup_",
        ]
    ):
        candidate_feats.append(c)

# Remove true leakage: raw stats (non-rolled) and explicit labels
leak_exact = {"PTS", "REB", "AST", "TOV", "FGM", "FGA"}
leak_contains = ["_raw", "WIN", "LOSS", "TARGET", "label"]

TEAM_FEATURES = [
    c
    for c in candidate_feats
    if c not in leak_exact and not any(tok in c for tok in leak_contains)
]

TEAM_FEATURES = sorted(set(TEAM_FEATURES))

print(f"Using {len(TEAM_FEATURES)} team features")
print(TEAM_FEATURES[:50])

X = df[TEAM_FEATURES].fillna(0.0)

# Use season if present, fall back to game_id as grouping key for CV
groups = df["season"] if "season" in df.columns else df["game_id"]

n_groups = groups.nunique()
n_splits = min(5, n_groups)

if n_splits < 2:
    print(f"‚ö†Ô∏è Not enough groups for CV (only {n_groups}). Training models without cross-validation.")
    gkf = None
else:
    print(f"Using GroupKFold with n_splits={n_splits} (groups={n_groups})")
    gkf = GroupKFold(n_splits=n_splits)

# -------------------------------------------------------------------
# Regression models for team stats
# -------------------------------------------------------------------
targets_reg = ["PTS", "REB", "AST", "TOV", "FG2M", "FG3M", "FTM"]
models_team_reg = {}

for tgt in targets_reg:
    if tgt not in df.columns:
        print(f"‚ö†Ô∏è Skipping {tgt}: not found in dataframe.")
        continue

    y = df[tgt]
    model = XGBRegressor(
        n_estimators=700,
        max_depth=5,
        learning_rate=0.02,
        subsample=0.85,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
        tree_method="hist",
    )

    fold_mae = []
    if gkf is not None:
        for tr, te in gkf.split(X, y, groups):
            model.fit(X.iloc[tr], y.iloc[tr])
            pred = model.predict(X.iloc[te])
            fold_mae.append(mean_absolute_error(y.iloc[te], pred))

        print(f"üìà {tgt} MAE: {np.mean(fold_mae):.3f} ¬± {np.std(fold_mae):.3f}")
    else:
        print(f"üìà {tgt}: no CV (not enough groups), training on full data.")

    model.fit(X, y)
    models_team_reg[tgt] = model

# -------------------------------------------------------------------
# Win / loss model
# -------------------------------------------------------------------
if "WIN" in df.columns:
    y_win = df["WIN"].astype(int)
elif "opp_PTS_raw" in df.columns:
    y_win = (df["PTS"] > df["opp_PTS_raw"]).astype(int)
else:
    raise ValueError("Cannot build WIN label; add WIN or opp_PTS_raw to team_features_all_enriched.")

win_model = XGBClassifier(
    n_estimators=800,
    max_depth=4,
    learning_rate=0.03,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
    eval_metric="logloss",
)

fold_ll = []
if gkf is not None:
    for tr, te in gkf.split(X, y_win, groups):
        win_model.fit(X.iloc[tr], y_win.iloc[tr])
        p = win_model.predict_proba(X.iloc[te])[:, 1]
        p = np.clip(p, 1e-4, 1 - 1e-4)
        fold_ll.append(log_loss(y_win.iloc[te], p))

    print(f"üèÜ WIN model logloss: {np.mean(fold_ll):.4f} ¬± {np.std(fold_ll):.4f}")
else:
    print("üèÜ WIN model: no CV (not enough groups), training on full data.")

win_model.fit(X, y_win)

In [None]:
# -- Cell 26: Build enriched today_team (schedule + features) ------
import pandas as pd
import numpy as np

assert "today_games" in globals(), "today_games missing; build it from schedule."
assert "team_features_all_enriched" in globals(), "team_features_all_enriched missing; run Cell 36_team_fixed."

# Use the latest available game before today for each team
today_date = today_games["game_date"].max()

hist = team_features_all_enriched[team_features_all_enriched["game_date"] < today_date].copy()
hist = hist.sort_values(["team", "game_date"])

team_latest = (
    hist.groupby("team")
        .tail(1)
        .drop_duplicates("team", keep="last")
)

print("team_latest shape:", team_latest.shape)

today_team = (
    today_games.merge(team_latest, on="team", how="left", suffixes=("", "_feat"))
)

# Standardize potential _x / _y columns from merges
rename = {}
for col in ["game_id_x", "game_date_x", "opponent_x"]:
    if col in today_team.columns:
        rename[col] = col.replace("_x", "")
today_team = today_team.rename(columns=rename)

if today_team.isna().sum().sum() > 0:
    print("‚ö†Ô∏è Some missing values in today_team ‚Äî filling with zeros.")
    today_team = today_team.fillna(0)

print("‚úÖ today_team (enriched) built. Shape:", today_team.shape)
print(today_team.head().to_string(index=False))

In [None]:
# -- Cell 27: Predict today team props + improved Monte Carlo ------
import numpy as np
import pandas as pd

assert "today_team" in globals(), "today_team missing; run Cell 38_team_fixed."
assert "models_team_reg" in globals(), "models_team_reg missing; run Cell 37_team_fixed."
assert "win_model" in globals(), "win_model missing; run Cell 37_team_fixed."
assert "TEAM_FEATURES" in globals(), "TEAM_FEATURES missing; run Cell 37_team_fixed."

# Standardize column names in case of _x / _y from merges
tt = today_team.rename(
    columns={
        "game_id_x": "game_id",
        "opponent_x": "opponent",
        "game_date_x": "game_date",
    }
)

# Build feature matrix for today
X_today = tt[TEAM_FEATURES].fillna(0.0)

# Minimal frame with identifiers
id_cols = [c for c in ["game_id", "team", "opponent", "home"] if c in tt.columns]
df_team_today = tt[id_cols].copy()

# Predict core team stats
for tgt, model in models_team_reg.items():
    df_team_today[f"pred_{tgt}"] = model.predict(X_today)

# Simple injury-based adjustment: reduce scoring / assists if a lot of usage missing
if "missing_usage_team" in tt.columns:
    mu = tt["missing_usage_team"].clip(0.0, 0.5)  # in [0, 0.5] after clipping
    scale_pts = 1.0 - 0.3 * mu                     # up to -15% PTS if half usage missing
    scale_ast = 1.0 - 0.2 * mu                     # up to -10% AST

    if "pred_PTS" in df_team_today.columns:
        df_team_today["pred_PTS"] = df_team_today["pred_PTS"] * scale_pts.values
    if "pred_AST" in df_team_today.columns:
        df_team_today["pred_AST"] = df_team_today["pred_AST"] * scale_ast.values

# --- Win probabilities from classifier (use the model's own feature list) ---
booster = win_model.get_booster()
clf_features = booster.feature_names

missing = [f for f in clf_features if f not in tt.columns]
if missing:
    raise ValueError(f"Today's data is missing classifier features: {missing}")

X_today_clf = tt[clf_features].fillna(0.0)

win_probs = win_model.predict_proba(X_today_clf)[:, 1]
df_team_today["win_prob"] = win_probs

print("‚úÖ Today team predictions (enriched):")
print(df_team_today.to_string(index=False))

# --- Monte Carlo simulation of game scores / totals / margins -------------
def simulate_games(df_today: pd.DataFrame, n_sims: int = 5000, stat: str = "PTS") -> pd.DataFrame:
    rows = []

    for gid, g in df_today.groupby("game_id"):
        g = g.sort_values("home", ascending=False).reset_index(drop=True)
        if len(g) != 2:
            print(f"Skipping {gid}: expected 2 teams, found {len(g)}")
            continue

        home = g.iloc[0]
        away = g.iloc[1]

        mean_H = home.get(f"pred_{stat}", np.nan)
        mean_A = away.get(f"pred_{stat}", np.nan)

        if np.isnan(mean_H) or np.isnan(mean_A):
            print(f"Skipping {gid}: missing pred_{stat} for one of the teams.")
            continue

        # Simple residual model; can be calibrated from training residuals
        resid_std = 12.0
        sims_H = np.random.normal(mean_H, resid_std, size=n_sims)
        sims_A = np.random.normal(mean_A, resid_std, size=n_sims)

        total = sims_H + sims_A
        margin = sims_H - sims_A
        win_H = (margin > 0).mean()

        rows.append(
            {
                "game_id": gid,
                "team_home": home["team"],
                "team_away": away["team"],
                "sim_home_win_prob": win_H,
                "sim_avg_home_pts": float(sims_H.mean()),
                "sim_avg_away_pts": float(sims_A.mean()),
                "sim_avg_total": float(total.mean()),
                "sim_avg_margin": float(margin.mean()),
            }
        )

    return pd.DataFrame(rows)

df_sim_games = simulate_games(df_team_today, n_sims=5000, stat="PTS")

print("\nüé≤ Simulation summary (enriched):")
print(df_sim_games.to_string(index=False))

In [None]:
# -- Cell 28: Aggregate player projections to team level (robust) ----------

import pandas as pd

assert "df_projections_all" in globals(), "Run Cell 15 first."
assert "minutes_today_clean" in globals(), "Run Cell 14c first."
assert "today_games_clean" in globals(), "today_games_clean missing."

# 1) Keep only players on teams in today's slate
slate_teams = set(minutes_today_clean["team"].unique())
proj = df_projections_all[df_projections_all["team"].isin(slate_teams)].copy()

# 2) Force correct opponent from today_games_clean
proj = proj.drop(columns=["opponent"], errors="ignore")

opp_map = (
    today_games_clean
    .assign(team=lambda d: d["team"].astype(str).str.upper().str.strip())
    .set_index("team")["opponent"]
    .to_dict()
)

proj["opponent"] = proj["team"].map(opp_map)

# 3) Build team-level player sums
team_player_agg = (
    proj.pivot_table(
        index=["team", "opponent"],
        columns="market",
        values="projection_mean",
        aggfunc="sum"
    )
    .reset_index()
)

team_player_agg.columns.name = None

print("‚úÖ team_player_agg built from player projections:")
print(team_player_agg.head().to_string(index=False))

In [None]:
# -- Cell 29: Blend team model and player-aggregate projections (robust) ----

import numpy as np
import pandas as pd

assert "df_team_today" in globals(), "Run 39_team_fixed first (team model)."
assert "team_player_agg" in globals(), "Run Cell X2 first."
assert "df_projections_all" in globals(), "Run Cell 15 first."

keys = ["team", "opponent"]

# 1) Join team model predictions with player sums
blend = df_team_today.merge(
    team_player_agg,
    on=keys,
    how="left",
    suffixes=("", "_playersum")
)

# Fill missing player sums with 0 (shouldn't happen if everything is aligned)
for stat in ["PTS", "REB", "AST"]:
    col_sum = f"{stat}_playersum"
    if col_sum in blend.columns:
        blend[col_sum] = blend[col_sum].fillna(0.0)

# 2) Compute scale factors: team_model / player_sum (clamped)
scales = {}
for stat in ["PTS", "REB", "AST"]:
    if stat not in blend.columns:
        continue
    col_sum = f"{stat}_playersum"
    if col_sum not in blend.columns:
        continue

    team_pred = blend[stat].values
    player_sum = blend[col_sum].values

    with np.errstate(divide="ignore", invalid="ignore"):
        raw_scale = np.where(player_sum > 1e-6, team_pred / player_sum, 1.0)

    scale = np.clip(raw_scale, 0.8, 1.2)
    scales[stat] = scale
    blend[f"{stat}_scale"] = scale

print("\nüîç Blend preview:")
cols = [c for c in [
    "team","opponent",
    "PTS","PTS_playersum","PTS_scale",
    "REB","REB_playersum","REB_scale",
    "AST","AST_playersum","AST_scale"
] if c in blend.columns]
print(blend[cols].head().to_string(index=False))

# 3) Apply scales back to player projections
df_blended = df_projections_all.copy()

for stat in ["PTS", "REB", "AST"]:
    if stat not in scales:
        continue

    scale_map = (
        blend[keys + [f"{stat}_scale"]]
        .drop_duplicates(keys)
        .set_index(keys)[f"{stat}_scale"]
        .to_dict()
    )

    mask = df_blended["market"] == stat
    df_blended.loc[mask, "projection_mean"] = df_blended[mask].apply(
        lambda r: r["projection_mean"] * scale_map.get((r["team"], r["opponent"]), 1.0),
        axis=1
    )

df_projections_final = df_blended.copy()

print("\nüéØ Final blended player projections (first 30 rows):")
print(df_projections_final.head(30).to_string(index=False))