### get data

In [2]:
from nba_api.stats.endpoints import teamgamelogs
import numpy as np
import pandas as pd

In [3]:

seasons = ["2023-24","2024-25"]
all_rows = []

for s in seasons:
    # pulls all teams if team_id_nullable=''
    r = teamgamelogs.TeamGameLogs(
        season_nullable=s,
        season_type_nullable="Regular Season",
        team_id_nullable="",
        league_id_nullable="00",  # NBA
    )
    df = r.get_data_frames()[0]
    all_rows.append(df)

team_game_rows = pd.concat(all_rows, ignore_index=True)


In [None]:
import re

def to_game_rows(team_game_rows: pd.DataFrame) -> pd.DataFrame:
    """
    Input:  TeamGameLogs-style dataframe with 2 rows per GAME_ID (one per team).
    Output: 1 row per GAME_ID with HOME_* / AWAY_* columns + HOME_WIN label.
    """

    df = team_game_rows.copy()

    required = {"GAME_ID", "GAME_DATE", "TEAM_ID", "MATCHUP", "WL"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {sorted(missing)}")

    # Normalize date for safe merge/sort
    df["GAME_DATE"] = pd.to_datetime(df["GAME_DATE"])

    # Home/away flag from MATCHUP string (common NBA stats convention)
    # - Home: "XYZ vs. ABC" (or "XYZ vs ABC")
    # - Away: "XYZ @ ABC"
    matchup = df["MATCHUP"].astype(str)
    df["IS_HOME"] = matchup.str.contains(r"\bvs\.?\b", flags=re.IGNORECASE, regex=True)
    df["IS_AWAY"] = matchup.str.contains(r"@", regex=True)

    # Keep rows we can confidently classify
    df = df[df["IS_HOME"] | df["IS_AWAY"]].copy()

    # If any GAME_ID doesn't have exactly one home + one away row, drop it (play it safe)
    counts = df.groupby("GAME_ID")[["IS_HOME", "IS_AWAY"]].sum()
    good_game_ids = counts[(counts["IS_HOME"] == 1) & (counts["IS_AWAY"] == 1)].index
    df = df[df["GAME_ID"].isin(good_game_ids)].copy()

    # Split
    home = df[df["IS_HOME"]].copy()
    away = df[df["IS_AWAY"]].copy()

    # Columns to keep unprefixed (keys)
    keys = ["GAME_ID", "GAME_DATE"]

    def prefix_columns(d: pd.DataFrame, pfx: str) -> pd.DataFrame:
        rename = {}
        for c in d.columns:
            if c in keys:
                continue
            rename[c] = f"{pfx}{c}"
        return d.rename(columns=rename)

    home = prefix_columns(home, "HOME_")
    away = prefix_columns(away, "AWAY_")

    # Merge into single game row
    games = home[keys + [c for c in home.columns if c not in keys]].merge(
        away[keys + [c for c in away.columns if c not in keys]],
        on=keys,
        how="inner",
        validate="one_to_one",
    )

    # Label: home team win
    games["HOME_WIN"] = (games["HOME_WL"] == "W").astype(int)

    drop_cols = [c for c in games.columns if c.endswith("IS_HOME") or c.endswith("IS_AWAY")]
    if drop_cols:
        games = games.drop(columns=drop_cols)

    return games


# Example usage:
# games_df = to_game_rows(team_game_rows)
# games_df.head()

games_df = to_game_rows(team_game_rows)

games_df


Unnamed: 0,GAME_ID,GAME_DATE,HOME_SEASON_YEAR,HOME_TEAM_ID,HOME_TEAM_ABBREVIATION,HOME_TEAM_NAME,HOME_MATCHUP,HOME_WL,HOME_MIN,HOME_FGM,...,AWAY_TOV_RANK,AWAY_STL_RANK,AWAY_BLK_RANK,AWAY_BLKA_RANK,AWAY_PF_RANK,AWAY_PFD_RANK,AWAY_PTS_RANK,AWAY_PLUS_MINUS_RANK,AWAY_AVAILABLE_FLAG,HOME_WIN
0,0022301196,2024-04-14,2023-24,1610612760,OKC,Oklahoma City Thunder,OKC vs. DAL,W,48.0,54,...,1500,839,2080,1459,381,1494,2430,2453,1,1
1,0022301188,2024-04-14,2023-24,1610612754,IND,Indiana Pacers,IND vs. ATL,W,48.0,65,...,1920,1516,1003,1459,78,195,1114,2446,1,1
2,0022301200,2024-04-14,2023-24,1610612758,SAC,Sacramento Kings,SAC vs. POR,W,48.0,43,...,2077,205,2080,1459,1208,1020,2449,2437,1,1
3,0022301197,2024-04-14,2023-24,1610612759,SAS,San Antonio Spurs,SAS vs. DET,W,48.0,49,...,1228,1161,1752,382,6,1494,2287,2362,1,1
4,0022301191,2024-04-14,2023-24,1610612753,ORL,Orlando Magic,ORL vs. MIL,W,48.0,42,...,1920,351,1367,1096,968,1254,2412,2321,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450,0022400065,2024-10-23,2024-25,1610612748,MIA,Miami Heat,MIA vs. ORL,L,48.0,32,...,1092,1073,187,2116,1694,971,1025,260,1,0
2451,0022400067,2024-10-23,2024-25,1610612761,TOR,Toronto Raptors,TOR vs. CLE,L,48.0,36,...,850,539,109,1573,1887,215,95,69,1,0
2452,0022400072,2024-10-23,2024-25,1610612757,POR,Portland Trail Blazers,POR vs. GSW,L,48.0,36,...,1953,137,889,750,2368,416,52,35,1,0
2453,0022400061,2024-10-22,2024-25,1610612738,BOS,Boston Celtics,BOS vs. NYK,W,48.0,48,...,640,2421,1712,384,92,1892,1548,2268,1,1


### STEP 5

In [5]:
def _infer_is_home_away(matchup_series: pd.Series) -> tuple[pd.Series, pd.Series]:
    m = matchup_series.astype(str)
    is_home = m.str.contains(r"\bvs\.?\b", case=False, regex=True)
    is_away = m.str.contains("@", regex=False)
    return is_home, is_away

def _ensure_season_col(df: pd.DataFrame) -> pd.DataFrame:
    """
    Prefer an existing season column if present; otherwise derive a simple season label from date.
    This is used to reset rolling features each season (recommended).
    """
    for cand in ["SEASON", "SEASON_ID", "SEASON_YEAR"]:
        if cand in df.columns:
            df = df.rename(columns={cand: "SEASON"})
            return df

    # Derive season like 2023-24 based on date: season starts ~Oct, ends ~Jun.
    # If month >= 10 => season starts that year, else season starts previous year.
    d = pd.to_datetime(df["GAME_DATE"])
    start_year = np.where(d.dt.month >= 10, d.dt.year, d.dt.year - 1)
    df["SEASON"] = start_year.astype(int).astype(str) + "-" + (start_year.astype(int) + 1).astype(str).str[-2:]
    return df

def add_pregame_team_features(
    team_game_rows: pd.DataFrame,
    windows=(5, 10, 20),
    base_stats=("PTS", "REB", "AST", "TOV", "PLUS_MINUS", "FG_PCT", "FG3_PCT", "FT_PCT"),
    reset_each_season=True,
) -> pd.DataFrame:
    """
    Produces pregame features for each team-game row using ONLY PRIOR GAMES.
    Key rule: EVERYTHING that uses game outcomes/stats is shifted by 1.
    """
    df = team_game_rows.copy()

    required = {"GAME_ID", "GAME_DATE", "TEAM_ID", "MATCHUP", "WL"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {sorted(missing)}")

    df["GAME_DATE"] = pd.to_datetime(df["GAME_DATE"])
    df["IS_HOME"], df["IS_AWAY"] = _infer_is_home_away(df["MATCHUP"])

    df["WIN"] = (df["WL"] == "W").astype(int)

    # Season column for rolling reset
    if reset_each_season:
        df = _ensure_season_col(df)

    group_keys = ["TEAM_ID"]
    if reset_each_season:
        group_keys = ["TEAM_ID", "SEASON"]

    df = df.sort_values(group_keys + ["GAME_DATE", "GAME_ID"]).reset_index(drop=True)
    g = df.groupby(group_keys, group_keys=False)

    # Rest features
    prev_date = g["GAME_DATE"].shift(1)
    df["DAYS_REST"] = (df["GAME_DATE"] - prev_date).dt.days - 1
    df["DAYS_REST"] = df["DAYS_REST"].fillna(7).clip(lower=0)
    df["B2B"] = (df["DAYS_REST"] == 0).astype(int)

    # Pick available stats
    available_stats = [c for c in base_stats if c in df.columns]

    # Season-to-date win% (pregame)
    df["S2D_WINPCT"] = g["WIN"].transform(lambda s: s.shift(1).expanding(min_periods=1).mean())

    # Streak (pregame): positive win streak, negative loss streak
    def streak_from_win(win: pd.Series) -> pd.Series:
        # compute from shifted series (pregame)
        x = win.shift(1).fillna(0).astype(int).to_numpy()
        out = np.zeros_like(x, dtype=int)
        cur = 0
        for i in range(len(x)):
            if i == 0:
                cur = 0
            else:
                if x[i] == 1:
                    cur = cur + 1 if cur >= 0 else 1
                else:
                    cur = cur - 1 if cur <= 0 else -1
            out[i] = cur
        return pd.Series(out, index=win.index)

    df["STREAK"] = g["WIN"].apply(streak_from_win)

    # Overall rolling features (shifted)
    for w in windows:
        df[f"ROLL_WINPCT_{w}"] = g["WIN"].transform(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
        for col in available_stats:
            df[f"ROLL_{col}_{w}"] = g[col].transform(lambda s: s.shift(1).rolling(w, min_periods=1).mean())

    # Home/away split rolling features (shifted) computed safely on subsets
    for w in windows:
        home_mask = df["IS_HOME"]
        away_mask = df["IS_AWAY"]

        # WINPCT splits
        if home_mask.any():
            sub = df.loc[home_mask, group_keys + ["WIN"]].copy()
            df.loc[home_mask, f"HOME_ROLL_WINPCT_{w}"] = (
                sub.groupby(group_keys)["WIN"]
                   .transform(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
                   .to_numpy()
            )
        if away_mask.any():
            sub = df.loc[away_mask, group_keys + ["WIN"]].copy()
            df.loc[away_mask, f"AWAY_ROLL_WINPCT_{w}"] = (
                sub.groupby(group_keys)["WIN"]
                   .transform(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
                   .to_numpy()
            )

        # Stat splits
        for col in available_stats:
            if home_mask.any():
                sub = df.loc[home_mask, group_keys + [col]].copy()
                df.loc[home_mask, f"HOME_ROLL_{col}_{w}"] = (
                    sub.groupby(group_keys)[col]
                       .transform(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
                       .to_numpy()
                )
            if away_mask.any():
                sub = df.loc[away_mask, group_keys + [col]].copy()
                df.loc[away_mask, f"AWAY_ROLL_{col}_{w}"] = (
                    sub.groupby(group_keys)[col]
                       .transform(lambda s: s.shift(1).rolling(w, min_periods=1).mean())
                       .to_numpy()
                )

    # Forward-fill split columns within group
    split_cols = [c for c in df.columns if c.startswith("HOME_ROLL_") or c.startswith("AWAY_ROLL_")]
    if split_cols:
        df[split_cols] = df.groupby(group_keys)[split_cols].ffill()

    return df


In [6]:
def build_game_feature_table(games_df, team_feat_df):
    games = games_df[["GAME_ID","GAME_DATE","HOME_TEAM_ID","AWAY_TEAM_ID","HOME_WIN"]].copy()
    games["GAME_DATE"] = pd.to_datetime(games["GAME_DATE"])

    need = {"GAME_ID", "GAME_DATE", "HOME_TEAM_ID", "AWAY_TEAM_ID", "HOME_WIN"}
    missing = need - set(games.columns)
    if missing:
        raise ValueError(f"games_df missing: {sorted(missing)}")

    # Keep only pregame features from team_feat_df
    pregame_cols = [
        "TEAM_ID", "GAME_ID", "GAME_DATE",
        "DAYS_REST", "B2B", "S2D_WINPCT", "STREAK",
    ] + [c for c in team_feat_df.columns if c.startswith(("ROLL_", "HOME_ROLL_", "AWAY_ROLL_"))]

    tf = team_feat_df[pregame_cols].copy()
    tf["GAME_DATE"] = pd.to_datetime(tf["GAME_DATE"])

    # Home join
    home = tf.rename(columns={"TEAM_ID": "HOME_TEAM_ID"}).copy()
    home_cols = {c: f"HOME_{c}" for c in home.columns if c not in ["GAME_ID", "GAME_DATE", "HOME_TEAM_ID"]}
    home = home.rename(columns=home_cols)

    out = games.merge(
        home,
        on=["GAME_ID", "GAME_DATE", "HOME_TEAM_ID"],
        how="left",
        validate="one_to_one",
    )

    # Away join
    away = tf.rename(columns={"TEAM_ID": "AWAY_TEAM_ID"}).copy()
    away_cols = {c: f"AWAY_{c}" for c in away.columns if c not in ["GAME_ID", "GAME_DATE", "AWAY_TEAM_ID"]}
    away = away.rename(columns=away_cols)

    out = out.merge(
        away,
        on=["GAME_ID", "GAME_DATE", "AWAY_TEAM_ID"],
        how="left",
        validate="one_to_one",
    )

    # DIFF features: home - away for matched pregame fields
    for c in list(out.columns):
        if c.startswith("HOME_"):
            base = c[len("HOME_"):]
            ac = "AWAY_" + base
            if ac in out.columns and pd.api.types.is_numeric_dtype(out[c]) and pd.api.types.is_numeric_dtype(out[ac]):
                out["DIFF_" + base] = out[c] - out[ac]

    return out


In [7]:
def make_model_matrix(model_df: pd.DataFrame, fillna=True):
    y = model_df["HOME_WIN"].astype(int)

    # safest feature set: only DIFF_* built from pregame features
    X = model_df[[c for c in model_df.columns if c.startswith("DIFF_")]].copy()

    if fillna:
        X = X.fillna(0.0)

    return X, y


In [None]:
def leakage_audit(X):
    box = ["PTS","FGM","FGA","FG3M","FG3A","FTM","FTA","OREB","DREB","REB","AST","STL","BLK","TOV","PF","PLUS_MINUS"]
    sus = [c for c in X.columns if c.startswith("DIFF_") and ("ROLL_" not in c) and any(k in c for k in box)]
    print("Suspicious non-rolling DIFF boxscore:", sus[:50])


In [9]:
# 1) Team-level pregame features
team_feat = add_pregame_team_features(
    team_game_rows,
    windows=(5, 10, 20),
    base_stats=("PTS","REB","AST","TOV","PLUS_MINUS","FG_PCT","FG3_PCT","FT_PCT"),
    reset_each_season=True,
)

# 2) Merge to game-level + DIFF features
model_df = build_game_feature_table(games_df, team_feat)

# 3) Build X, y
X, y = make_model_matrix(model_df, fillna=True)

# 4) Quick sanity checks
print("Rows:", len(model_df))
print("X shape:", X.shape)
print("Home win rate:", y.mean())

# 5) Leakage check
leakage_audit(X)


Rows: 2455
X shape: (2455, 86)
Home win rate: 0.5437881873727087
Suspicious non-rolling DIFF boxscore: []


In [None]:
from sklearn.model_selection import train_test_split

def time_split_by_date(model_df, X, y, train_end, val_end):
    """
    train: GAME_DATE <= train_end
    val:   train_end < GAME_DATE <= val_end
    test:  GAME_DATE > val_end
    """
    d = pd.to_datetime(model_df["GAME_DATE"])

    train_mask = d <= pd.to_datetime(train_end)
    val_mask   = (d > pd.to_datetime(train_end)) & (d <= pd.to_datetime(val_end))
    test_mask  = d > pd.to_datetime(val_end)

    X_train, y_train = X.loc[train_mask], y.loc[train_mask]
    X_val, y_val     = X.loc[val_mask],   y.loc[val_mask]
    X_test, y_test   = X.loc[test_mask],  y.loc[test_mask]

    return X_train, y_train, X_val, y_val, X_test, y_test

X_train, y_train, X_val, y_val, X_test, y_test = time_split_by_date(
    model_df, X, y,
    train_end="2024-06-30",
    val_end="2024-12-31"
)


### Define XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score, brier_score_loss

model = XGBClassifier(
    n_estimators=5000,
    learning_rate=0.02,
    max_depth=4,
    subsample=0.85,
    colsample_bytree=0.85,
    min_child_weight=5,
    reg_lambda=1.0,
    reg_alpha=0.0,
    objective="binary:logistic",
    eval_metric="logloss",
    tree_method="hist",
    n_jobs=-1,
    random_state=42,
    early_stopping_rounds=200,
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=100
)



[0]	validation_0-logloss:0.68660
[100]	validation_0-logloss:0.62626
[200]	validation_0-logloss:0.62671
[300]	validation_0-logloss:0.62681
[350]	validation_0-logloss:0.62769


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.85
,device,
,early_stopping_rounds,200
,enable_categorical,False


In [37]:
def eval_set(name, model, Xs, ys):
    p = model.predict_proba(Xs)[:, 1]
    pred = (p >= 0.5).astype(int)

    print(f"\n{name}")
    print("  logloss :", log_loss(ys, p))
    print("  brier   :", brier_score_loss(ys, p))
    print("  auc     :", roc_auc_score(ys, p))
    print("  acc     :", accuracy_score(ys, pred))

eval_set("TEST", model, X_test, y_test)



TEST
  logloss : 0.6187691982948487
  brier   : 0.21466734792534856
  auc     : 0.7126092020966803
  acc     : 0.6639784946236559


### Query the model

In [None]:
team_map = (
    team_game_rows[["TEAM_ID", "TEAM_ABBREVIATION"]]
    .dropna()
    .drop_duplicates()
    .set_index("TEAM_ABBREVIATION")["TEAM_ID"]
    .to_dict()
)

In [18]:
def latest_team_features_before(team_feat: pd.DataFrame, team_id: int, asof_date: str) -> pd.Series:
    asof = pd.to_datetime(asof_date)
    sub = team_feat[(team_feat["TEAM_ID"] == team_id) & (pd.to_datetime(team_feat["GAME_DATE"]) < asof)]
    if sub.empty:
        raise ValueError(f"No history for TEAM_ID={team_id} before {asof_date}")
    return sub.sort_values("GAME_DATE").iloc[-1]


In [None]:
def make_matchup_features(team_feat, home_id, away_id, asof_date, feature_cols):
    h = latest_team_features_before(team_feat, home_id, asof_date)
    a = latest_team_features_before(team_feat, away_id, asof_date)

    row = {}

    # Example DIFF fields correspond to base names like "ROLL_PTS_10", "DAYS_REST", "S2D_WINPCT", etc.
    for c in feature_cols:
        assert c.startswith("DIFF_")
        base = c[len("DIFF_"):]
        # base exists in team_feat as a column (pregame feature)
        if base not in team_feat.columns:
            if base == "DAYS_REST" and "DAYS_REST" in h.index:
                row[c] = float(h["DAYS_REST"]) - float(a["DAYS_REST"])
            elif base == "B2B" and "B2B" in h.index:
                row[c] = float(h["B2B"]) - float(a["B2B"])
            else:
                row[c] = 0.0
        else:
            row[c] = float(h[base]) - float(a[base])

    X_one = pd.DataFrame([row], columns=feature_cols).fillna(0.0)
    return X_one


In [None]:
def predict_matchup(model, team_feat, home_abbr, away_abbr, asof_date, feature_cols):
    home_id = int(team_map[home_abbr])
    away_id = int(team_map[away_abbr])

    X_one = make_matchup_features(team_feat, home_id, away_id, asof_date, feature_cols)
    p_home = float(model.predict_proba(X_one)[:, 1][0])
    return p_home

# Example:
feature_cols = list(X.columns) 
team1 = "NYK"
team2 = "SAS"
p = predict_matchup(model, team_feat, team1, team2, "2025-12-17", feature_cols)

winner = team1 if p > .5 else team2
eval_set("Model Accuracy", model, X_test, y_test)
print(f"Predicted Winner: {winner}")
print(f"New York Wins Probability: {p*100:.2f}%")
print(f"San Antonio Spurs Probability: {(1-p)*100:.2f}%")




Model Accuracy
  acc: 66.40%
Predicted Winner: NYK
New York Wins Probability: 72.65%
San Antonio Spurs Probability: 27.35%
