In [4]:
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import (
    RandomForestRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    HistGradientBoostingRegressor,
)
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score

STATS_PATH = "kaggle_seasons.csv"            # stats file (renamed)
ROSTER_PATH = "player_teams_2025_26.csv"     # 2025-26 roster file (Team + semicolon "Roster")

# ------------ helpers

def find_col(cols, candidates):
    low = {c.lower(): c for c in cols}
    for cand in candidates:
        if cand.lower() in low:
            return low[cand.lower()]
    return None


def season_to_year(s):
    if pd.isna(s):
        return np.nan
    s = str(s)
    if "-" in s:
        a, b = s.split("-", 1)
        a = int(a)
        b = int(b) if len(b) == 4 else int(str(a)[:2] + b)
        return b
    return int(float(s))


def normalize_name(name):
    if pd.isna(name):
        return name
    s = str(name).strip()
    s = s.replace("’", "'").replace(".", "")
    s = s.replace(" Jr", "").replace(" Jr.", "")
    s = s.replace(" III", "").replace(" II", "")
    return s


TEAM_MAP = {
    'BRK': 'BKN', 'NJN': 'BKN',
    'CHO': 'CHA', 'CHH': 'CHA', 'CHA': 'CHA',
    'NOH': 'NOP', 'NOK': 'NOP', 'NOP': 'NOP',
    'SEA': 'OKC', 'OKC': 'OKC',
    'PHO': 'PHX',
    'WSB': 'WAS',
    'VAN': 'MEM', 'MEM': 'MEM',
}

# ------------ load & standardize stats
raw = pd.read_csv(STATS_PATH)

player_col = find_col(raw.columns, ['Player', 'player_name', 'Name'])
team_col   = find_col(raw.columns, ['Tm', 'team_abbreviation', 'Team'])
season_col = find_col(raw.columns, ['Year', 'season', 'Season'])
pts_col    = find_col(raw.columns, ['PTS', 'pts', 'pts_per_game'])
g_col      = find_col(raw.columns, ['G', 'gp', 'games', 'games_played'])
mp_col     = find_col(raw.columns, ['MP', 'mp', 'min', 'minutes'])

if not all([player_col, team_col, season_col, pts_col]):
    raise ValueError("Required columns not found. Need player, team, season, points.")

stats = raw.copy()
stats['Player'] = stats[player_col].map(normalize_name)
stats['Tm']     = stats[team_col]
stats['Year']   = stats[season_col].apply(season_to_year)
stats['PTS']    = pd.to_numeric(stats[pts_col], errors='coerce')
stats['G']      = pd.to_numeric(stats[g_col], errors='coerce') if g_col else np.nan
stats['MP']     = pd.to_numeric(stats[mp_col], errors='coerce') if mp_col else np.nan
stats['Tm_norm']= stats['Tm'].map(TEAM_MAP).fillna(stats['Tm'])

# choose one row per (Player, Year): highest games
stats['_g_rank'] = stats.groupby(['Player', 'Year'])['G'].rank(ascending=False, method='first')
base = (
    stats.sort_values(['Player', 'Year', '_g_rank'])
         .drop_duplicates(['Player', 'Year'])
         .drop(columns=['_g_rank'])
)

# targets and next team from stats itself
base = base.sort_values(['Player', 'Year'])
base['PTS_next'] = base.groupby('Player')['PTS'].shift(-1)

next_primary = (
    stats.assign(Year=stats['Year'])
         .sort_values(['Player', 'Year', 'G'], ascending=[True, True, False])
         .drop_duplicates(['Player', 'Year'])[['Player', 'Year', 'Tm_norm']]
         .rename(columns={'Tm_norm': 'team_next'})
)
next_primary['Year'] = next_primary['Year'] - 1
base = base.merge(next_primary, on=['Player', 'Year'], how='left')

# keep rows with a real next season & known team
base = base[(base['PTS_next'].notna()) & (base['team_next'].notna())]

# last 20 seasons
max_year = int(base['Year'].max())
cutoff   = max_year - 19
train_df = base[base['Year'] >= cutoff].copy()

# drop junk index cols if present
junk = [c for c in train_df.columns if c.strip().lower() in ("unnamed: 0", "unnamed:0", "index")]
if junk:
    train_df.drop(columns=junk, inplace=True)

# teammate context (leave-one-out means) by (Year, team_next)
# only include columns that exist AND have at least one non-null value
cand_cols = [c for c in ['PTS', 'MP'] if c in train_df.columns and train_df[c].notna().any()]
grp = train_df.groupby(['Year', 'team_next'])
team_sum = grp[cand_cols].sum().add_prefix('sum_') if cand_cols else pd.DataFrame()
team_cnt = grp.size().rename('cnt_players')
team_aggs = pd.concat([team_sum, team_cnt], axis=1).reset_index()
train_df = train_df.merge(team_aggs, on=['Year', 'team_next'], how='left')

for col in cand_cols:
    train_df[f'teammates_mean_{col}'] = (
        (train_df[f'sum_{col}'] - train_df[col]) / (train_df['cnt_players'] - 1).clip(lower=1)
    )
train_df.drop(columns=[c for c in train_df.columns if c.startswith('sum_')] + ['cnt_players'], inplace=True, errors='ignore')

# --------- feature lists (avoid duplicates!)
drop_cols = ['Player', 'Year', 'Tm', 'Tm_norm', 'team_next', 'PTS_next']
# numeric base
numeric_base = train_df.select_dtypes('number').columns.tolist()
# teammate features
teammate_feats = [c for c in train_df.columns if c.startswith('teammates_mean_')]
# numeric feats exclude target/ids & keep unique
numeric_feats = [c for c in numeric_base if c not in drop_cols and not c.startswith('teammates_mean_')]
# remove columns that are entirely NaN
numeric_feats = [c for c in numeric_feats if not train_df[c].isna().all()]
# combine with teammate feats
num_cols = list(dict.fromkeys(numeric_feats + teammate_feats))
cat_feats = ['team_next']
all_feature_cols = list(dict.fromkeys(num_cols + cat_feats))

# OHE compatibility across sklearn versions
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# small factory so we can optionally scale for non-tree models
from sklearn.pipeline import make_pipeline

def make_preprocessor(scale_numeric: bool):
    steps = [('imputer', SimpleImputer(strategy='median'))]
    if scale_numeric:
        steps.append(('scaler', StandardScaler()))
    num_pipe = Pipeline(steps)
    return ColumnTransformer([
        ('num', num_pipe, num_cols),
        ('cat', ohe, cat_feats)
    ])

# candidate models: (estimator, needs_scaling)
models = {
    'RandomForest': (RandomForestRegressor(n_estimators=700, random_state=42, n_jobs=-1), False),
    'ExtraTrees'  : (ExtraTreesRegressor(n_estimators=700, random_state=42, n_jobs=-1), False),
    'HistGB'      : (HistGradientBoostingRegressor(random_state=42), False),
    'GBM'         : (GradientBoostingRegressor(random_state=42), False),
    'Ridge'       : (Ridge(), True),
    'Lasso'       : (Lasso(max_iter=5000), True),
    'ElasticNet'  : (ElasticNet(max_iter=5000), True),
    'SVR'         : (SVR(), True),
    'KNN'         : (KNeighborsRegressor(n_neighbors=15), True),
}

# --- build X, y
X = train_df[all_feature_cols]
y = train_df['PTS_next']

# --- compare models with 5-fold CV (R2 and RMSE)
results = []
for name, (est, need_scale) in models.items():
    pre = make_preprocessor(scale_numeric=need_scale)
    pipe = Pipeline([('prep', pre), ('model', est)])
    scores = cross_validate(
        pipe, X, y,
        cv=5,
        scoring={'r2': 'r2', 'rmse': 'neg_root_mean_squared_error'},
        n_jobs=-1,
        return_train_score=False
    )
    r2_mean = scores['test_r2'].mean()
    r2_std  = scores['test_r2'].std()
    rmse_mean = -scores['test_rmse'].mean()
    rmse_std  =  scores['test_rmse'].std()
    results.append({
        'model': name,
        'r2_mean': r2_mean,
        'r2_std': r2_std,
        'rmse_mean': rmse_mean,
        'rmse_std': rmse_std
    })

cv_df = pd.DataFrame(results).sort_values(['r2_mean', 'rmse_mean'], ascending=[False, True])
print("\nCV comparison (higher R2, lower RMSE is better):\n", cv_df.to_string(index=False))

# --- choose the best by R2 then RMSE
best_name = cv_df.iloc[0]['model']
best_est, best_scale = models[best_name]
print(f"\nSelected best model: {best_name}")

# --- final train/test evaluation with the winner
pre_best = make_preprocessor(scale_numeric=best_scale)
best_pipe = Pipeline([('prep', pre_best), ('model', best_est)])
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
best_pipe.fit(X_tr, y_tr)

y_hat = best_pipe.predict(X_te)
rmse = float(np.sqrt(mean_squared_error(y_te, y_hat)))
r2   = float(r2_score(y_te, y_hat))
print({
    'test_model': best_name,
    'test_RMSE': rmse,
    'test_R2': r2
})

# ------------ inference for 2025-26 using roster (still works with the chosen model)
ros_raw = pd.read_csv(ROSTER_PATH)
r_team = find_col(ros_raw.columns, ['Team'])
r_rost = find_col(ros_raw.columns, ['Roster'])
if not all([r_team, r_rost]):
    raise ValueError("Roster file must have columns for Team and Roster (semicolon-separated).")

ros = ros_raw[[r_team, r_rost]].copy()
ros['Player'] = ros[r_rost].astype(str).str.split(';')
ros = ros.explode('Player')
ros['Player'] = ros['Player'].astype(str).str.strip().map(normalize_name)
ros = ros.rename(columns={r_team: 'team_2025_26'})[['Player', 'team_2025_26']].drop_duplicates()

# base for inference: last season rows from the already-built base table
base_inf = base[base['Year'] == max_year].copy()
inf_df = base_inf.merge(ros, on='Player', how='left')
inf_df['team_2025_26'] = inf_df['team_2025_26'].fillna('Unknown')

# compute teammate context for 2025-26 using last-season stats grouped by 2025-26 team
cand_cols_inf = [c for c in ['PTS', 'MP'] if c in inf_df.columns and inf_df[c].notna().any()]
grp_inf = inf_df.groupby('team_2025_26')
team_sum_inf = grp_inf[cand_cols_inf].sum().add_prefix('sum_') if cand_cols_inf else pd.DataFrame()
team_cnt_inf = grp_inf.size().rename('cnt_players')
team_aggs_inf = pd.concat([team_sum_inf, team_cnt_inf], axis=1).reset_index()
inf_df = inf_df.merge(team_aggs_inf, on='team_2025_26', how='left')
for col in cand_cols_inf:
    inf_df[f'teammates_mean_{col}'] = (
        (inf_df[f'sum_{col}'] - inf_df[col]) / (inf_df['cnt_players'] - 1).clip(lower=1)
    )
inf_df.drop(columns=[c for c in inf_df.columns if c.startswith('sum_')] + ['cnt_players'], inplace=True, errors='ignore')
inf_df['team_next'] = inf_df['team_2025_26']  # align with training categorical

# columns for inference must match training
feature_cols_inf = list(dict.fromkeys(num_cols + cat_feats))


def predict_ppg_2025(player_name: str) -> float:
    row = inf_df[inf_df['Player'] == normalize_name(player_name)]
    if row.empty:
        raise ValueError(f"{player_name} not found in last-season base or 2025-26 roster.")
    return float(best_pipe.predict(row[feature_cols_inf])[0])

# quick spot-check
for p in ["Anthony Davis", "DeMar DeRozan", "Zach LaVine"]:
    try:
        print(p, "->", round(predict_ppg_2025(p), 2))
    except Exception as e:
        print(p, "->", e)



CV comparison (higher R2, lower RMSE is better):
        model  r2_mean   r2_std  rmse_mean  rmse_std
         GBM 0.770568 0.025891   2.938230  0.048287
      HistGB 0.767260 0.025729   2.960164  0.061489
RandomForest 0.763304 0.026848   2.984586  0.055564
       Ridge 0.761176 0.026209   2.998310  0.045655
         SVR 0.755081 0.027871   3.035366  0.029482
  ExtraTrees 0.752220 0.028473   3.053181  0.050328
         KNN 0.745423 0.025791   3.097016  0.040273
       Lasso 0.706489 0.018353   3.332034  0.089883
  ElasticNet 0.701451 0.015181   3.362296  0.111417

Selected best model: GBM
{'test_model': 'GBM', 'test_RMSE': 3.059027476576049, 'test_R2': 0.7502706356546678}
Anthony Davis -> 20.73
DeMar DeRozan -> 24.45
Zach LaVine -> 22.02


In [8]:
# Assumes you've already built:
# - train_df, num_cols, cat_feats
# - inf_df, feature_cols_inf, normalize_name
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# OneHotEncoder version-safe
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

pre = ColumnTransformer([
    ('num', SimpleImputer(strategy='median'), num_cols),
    ('cat', ohe, cat_feats),
])

X = train_df[num_cols + cat_feats]
y = train_df['PTS_next']
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

gbm = GradientBoostingRegressor(
    random_state=42,
    validation_fraction=0.1,   # early stopping
    n_iter_no_change=15,
    tol=1e-4,
    loss='squared_error'       # we'll also try 'huber' via the search space below
)

pipe = Pipeline([('prep', pre), ('model', gbm)])

# Local search around your best params
param_dist = {
    "model__learning_rate":   [0.015, 0.02, 0.025, 0.03],
    "model__n_estimators":    [1200, 1600, 2000, 2400],
    "model__max_depth":       [2, 3, 4],
    "model__min_samples_leaf":[5, 10, 15, 20],
    "model__min_samples_split":[5, 10, 20, 40],
    "model__subsample":       [0.6, 0.7, 0.8, 0.9],
    "model__max_features":    ["sqrt", 0.3, 0.5, 0.7, None],
    "model__loss":            ["squared_error", "huber"],
    "model__alpha":           [0.85, 0.9, 0.95],  # only used if loss='huber'
}

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=30,             # you said 150 fits took ~30s; this should be fine
    cv=5,
    scoring="r2",
    n_jobs=-1,
    random_state=42,
    verbose=1,
    refit=True,
)

search.fit(X_tr, y_tr)
best = search.best_estimator_
print("Best CV R2:", search.best_score_)
print("Best params:", search.best_params_)

# Optionally expand the forest if the LR is small (post-tune bump)
lr = best.get_params()["model__learning_rate"]
trees = best.get_params()["model__n_estimators"]
if lr <= 0.02 and trees < 3000:
    best.set_params(model__n_estimators=3000)

# Final fit on the training split
best.fit(X_tr, y_tr)

# Held-out test metrics
pred = best.predict(X_te)
rmse = float(np.sqrt(mean_squared_error(y_te, pred)))
r2   = float(r2_score(y_te, pred))
print({"GBM_test_RMSE": rmse, "GBM_test_R2": r2})

# Use tuned GBM for 2025–26 predictions
def predict_ppg_2025(name: str) -> float:
    row = inf_df[inf_df['Player'] == normalize_name(name)]
    if row.empty:
        raise ValueError(f"{name} not found in last-season base or 2025-26 roster.")
    return float(best.predict(row[feature_cols_inf])[0])

for p in ["Anthony Davis","DeMar DeRozan","Zach LaVine", "LeBron James", "Luka Doncic", "Jimmy Butler"]:
    try:
        print(p, "->", round(predict_ppg_2025(p), 2))
    except Exception as e:
        print(p, "->", e)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best CV R2: 0.7811520474902518
Best params: {'model__subsample': 0.7, 'model__n_estimators': 1600, 'model__min_samples_split': 10, 'model__min_samples_leaf': 15, 'model__max_features': 0.5, 'model__max_depth': 3, 'model__loss': 'huber', 'model__learning_rate': 0.015, 'model__alpha': 0.95}
{'GBM_test_RMSE': 3.036927728602918, 'GBM_test_R2': 0.7538659092398875}
Anthony Davis -> 20.34
DeMar DeRozan -> 24.47
Zach LaVine -> 22.55
LeBron James -> 26.14
Luka Doncic -> 28.57
Jimmy Butler -> 20.13
