In [None]:
# Extend the import path, then build the college player lookup and spot-check it.
# NOTE(review): this is an absolute, machine-specific path; it only works on the
# machine that has this exact directory — consider deriving it from the repo root.
import sys
sys.path.append("/Users/chase/Desktop/Comp_Sci/Capstone/Dynasty/data/CFB_Data")

from src.process.process_college import build_player_dict

# Sanity check: number of entries, then one sample lookup by player name.
player_dict = build_player_dict(verbose=False)
print(len(player_dict))
print(player_dict.get("Rashee Rice"))


In [None]:
# Extend the import path with the project directory, then build the combine lookup.
# NOTE(review): absolute, machine-specific path — consider deriving it from the repo root.
import sys
sys.path.append("/Users/chase/Desktop/Comp_Sci/Capstone/Dynasty")

from src.process.process_combine import build_combine_dict

# Sanity check: number of entries, then one sample lookup by player name.
player_combine = build_combine_dict(verbose=False)
print(len(player_combine))
print(player_combine.get("Amon-Ra St Brown"))


In [None]:
# Build the pro QB lookup for years 2016..2024 (range end is exclusive), s_type="REG".
# Importing `src` here relies on an earlier cell having extended sys.path.
from src.process.process_pro_qb import run_pro_qb_player

qb_dict = run_pro_qb_player(years=range(2016, 2025), s_type="REG", verbose=False)
print(len(qb_dict))
print(qb_dict.get("Patrick Mahomes"))

In [None]:
# Build the pro WR lookup for years 2016..2024 (range end is exclusive), s_type="REG".
# Importing `src` here relies on an earlier cell having extended sys.path.
from src.process.process_pro_wr import run_pro_wr_player

wr_dict = run_pro_wr_player(years=range(2016, 2025), s_type="REG", verbose=False)
print(len(wr_dict))
print(wr_dict.get("Rashee Rice"))

In [None]:
# Build the pro RB lookup for years 2016..2024 (range end is exclusive), s_type="REG".
# Importing `src` here relies on an earlier cell having extended sys.path.
from src.process.process_pro_rb import run_pro_rb_player

rb_dict = run_pro_rb_player(years=range(2016, 2025), s_type="REG", verbose=False)
print(len(rb_dict))
print(rb_dict.get("Christian McCaffrey"))

In [None]:
# Build the pro TE lookup for years 2016..2024 (range end is exclusive), s_type="REG".
# Importing `src` here relies on an earlier cell having extended sys.path.
from src.process.process_pro_te import run_pro_te_player

te_dict = run_pro_te_player(years=range(2016, 2025), s_type="REG", verbose=False)
print(len(te_dict))
print(te_dict.get("Travis Kelce"))

In [None]:
# Smoke test of the fantasycalc client helpers: single lookup, name search,
# full rankings, and a CSV snapshot.
# NOTE(review): presumably these calls hit a remote service — confirm before
# running offline.
from src.scrapers.fantasycalc_client import (
        get_player_value, search_players,
        get_rankings_df, save_current_rankings
    )

# Look up one player
row = get_player_value("Breece Hall")
print(row)

# Search for possible name matches
print(search_players("Harrison"))

# Get full rankings as a DataFrame
# NOTE: this rebinds the notebook-global `df`, which later cells also reuse.
df = get_rankings_df(dynasty=True, num_qbs=2, teams=12, ppr=1.0)
print(df.head())

# Save CSV snapshot(s) to Market_Value/
path = save_current_rankings(dynasty=True, num_qbs=2, teams=12, ppr=1.0)
print("Saved:", path)

In [None]:
# Run the batch feature-scatter plotter for running backs.
from src.visuals.plot_feature_scatter_batch import main

# RB plots using RB CSV under data/Bakery/RB and saving under tests/feature_scatter/RB
# max_plots=10000 is a very high cap — presumably meant as "no practical limit".
main(position="RB", max_plots=10000)

In [None]:
# Demo of the packaged name cleaner from src.utils.
# NOTE(review): appending "../src" puts the `src` directory itself on sys.path,
# which does not by itself make `src.utils` importable; this import presumably
# succeeds only because an earlier cell added the project directory — confirm.
import sys
sys.path.append("../src")

from src.utils import clean_player_name

# The trailing comments are the outputs the author expected, not captured output.
print(clean_player_name("Amon-Ra St. Brown"))   # Amon-Ra StBrown
print(clean_player_name("amon-ra st brown"))    # Amon-Ra StBrown
print(clean_player_name(None))                  # ""


In [None]:
import re

def clean_player_name(player_name):
    """Normalize a player name for use as a lookup key.

    Steps, in order:
      1. strip surrounding whitespace;
      2. drop generational suffixes (Jr, Sr, II, III, IV, V), case-insensitive,
         including an optional trailing period;
      3. remove every character that is not a word character, whitespace,
         apostrophe, or hyphen (this also removes the period in "St.");
      4. collapse runs of whitespace to single spaces;
      5. title-case the result (apostrophes/hyphens start a new capital, so
         "o'neal" -> "O'Neal" and "amon-ra" -> "Amon-Ra");
      6. fuse a standalone "St" with the following capitalized word:
         "St Brown" -> "StBrown".

    Title-casing happens BEFORE the "St" fusion: ``str.title()`` lowercases
    every letter that follows another letter, so fusing first would turn
    "StBrown" into "Stbrown".

    Note: ``str.title()`` also lowercases interior capitals in names such as
    "McCaffrey" ("Mccaffrey"); that behavior is intentionally left unchanged.

    Args:
        player_name: The raw name. Non-string values (e.g. None, NaN) are
            returned unchanged.

    Returns:
        The cleaned name as ``str``, or ``player_name`` itself if it is not a string.
    """
    if not isinstance(player_name, str):
        return player_name

    s = player_name.strip()

    # 1) Remove common suffixes (Jr, Sr, II, III, IV, V), case-insensitive.
    #    The trailing \b forces "III" to match as a whole, not as "II" + "I".
    suffixes = ['Jr', 'Sr', 'II', 'III', 'IV', 'V']
    s = re.sub(r'\b(?:' + '|'.join(suffixes) + r')\b\.?', '', s, flags=re.IGNORECASE)

    # 2) Keep only word chars, whitespace, apostrophes, and hyphens
    s = re.sub(r"[^\w\s'-]", '', s)

    # 3) Collapse extra spaces
    s = ' '.join(s.split())

    # 4) Normalize to Title Case
    s = s.title()

    # 5) Fuse 'St <Surname>' -> 'St<Surname>'. After step 4 the prefix is always
    #    spelled 'St' and the surname starts with a capital, so no IGNORECASE needed.
    s = re.sub(r"\bSt\s+(?=[A-Z])", "St", s)

    return s


def strip_name_marks(s: object) -> object:
    """Remove every '*' from a string; apostrophes and hyphens are left alone.

    Non-string inputs are handed back unchanged.
    """
    if isinstance(s, str):
        # Splitting on the asterisk and re-joining drops each occurrence.
        return "".join(s.split("*"))
    return s


# ---- quick checks ----
# Print "<raw> -> <cleaned>" for a handful of tricky names (hyphen, "St." prefix,
# apostrophe, suffixes, all-caps) using the clean_player_name defined in this cell.
tests = [
    "Amon-Ra St. Brown",
    "amon-ra st. brown",
    "O'Neal Jr.",
    "jean-baptiste iii",
    "ST. JOHN",
]
for t in tests:
    print(t, "->", clean_player_name(t))


In [2]:
from src.visuals.plot_feature_scatter_batch import main

# Generate up to 900 plots (max_plots=900) for RB on a 3-column x 2-row grid,
# i.e. presumably 6 plots per PDF page.
main(position="RB", max_plots=900, cols=3, rows=2)


[RB] Wrote multi-page PDF → /Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/tests/feature_scatter/RB/RB_feature_scatters.pdf


[PosixPath('/Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/tests/feature_scatter/RB/RB_feature_scatters.pdf')]

In [None]:
import pandas as pd

# The path is relative to the kernel's working directory.
# NOTE(review): the original comment said this assumes Dynasty/notebooks/, but
# "./data/..." looks relative to the Dynasty project root — confirm.
# NOTE: rebinds the notebook-global `df` used by the following cells.
df = pd.read_csv("./data/Bakery/RB/Bakery_RB_2017.csv")

# Clean up column names
df.columns = [c.strip() for c in df.columns]
print("Columns:", df.columns.tolist())
df.head()


In [None]:
# Column names (as spelled in the 2017 sheet) used as regression inputs in the
# following cells.
features = [
    "DOM++", "40 Time", "BMI", "YPC",
    "ELU", "YCO/A", "Break%", "Draft Cap", "BAMA"
]

# Column the models below try to reproduce.
target = "RB Grade"


In [None]:
from sklearn.preprocessing import StandardScaler

# Build the design matrix / target from the `df`, `features`, `target` globals.
# .copy() keeps the sign flips below from writing back into `df`.
X = df[features].copy()
y = df[target]

# invert "lower is better"
# (negating makes a larger value mean "better" for every feature)
X["40 Time"]      = -X["40 Time"]
X["Draft Cap"] = -X["Draft Cap"]

# drop rows with missing values
mask = X.notna().all(axis=1) & y.notna()
X = X[mask]
y = y[mask]

# normalize
# NOTE: the scaler is fit on all rows; there is no train/test split in this cell.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
from sklearn.linear_model import LinearRegression
import pandas as pd

# Unconstrained least squares on the standardized features from the previous cell.
model = LinearRegression()
model.fit(X_scaled, y)

# Coefficients are per standardized feature, listed largest first.
weights = pd.Series(model.coef_, index=features).sort_values(ascending=False)
print("Approximate Weights for RB Grade:")
print(weights)

print("\nIntercept:", model.intercept_)
# This R² is computed on the same rows the model was fit on (in-sample).
print("R² (fit quality):", round(model.score(X_scaled, y), 4))


In [None]:
from sklearn.linear_model import Lasso

# L1-regularized fit with coefficients constrained to be non-negative.
# Uses X_scaled / y / features / pd from earlier cells.
lasso_pos = Lasso(alpha=0.01, positive=True, max_iter=10000)
lasso_pos.fit(X_scaled, y)

weights_lasso = pd.Series(lasso_pos.coef_, index=features).sort_values(ascending=False)
print("Lasso (positive, shrunk weights):\n", weights_lasso)
# In-sample R² (same rows used for fitting).
print("\nR²:", round(lasso_pos.score(X_scaled, y), 4))


In [None]:
from sklearn.linear_model import LinearRegression

# Least squares with coefficients constrained to be non-negative (no shrinkage).
# Uses X_scaled / y / features / pd from earlier cells.
model_pos = LinearRegression(positive=True)
model_pos.fit(X_scaled, y)

weights_pos = pd.Series(model_pos.coef_, index=features).sort_values(ascending=False)
print("Non-Negative Weights:\n", weights_pos)
# In-sample R² (same rows used for fitting).
print("\nR²:", round(model_pos.score(X_scaled, y), 4))


In [None]:
# ===== Reverse-engineer Bakery RB Grade from Bakery_RB_Overall.csv (non-negative weights, no Breakout Age) =====
import re, json, numpy as np, pandas as pd
from pathlib import Path
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from scipy.optimize import nnls

# ---------- config ----------
CSV_PATH = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
ROOT = CSV_PATH.parent
OUT_DIR = Path("./data/Bakery/_derived"); OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------- helpers ----------
def find_col(frame, candidates):
    """Return the actual column of `frame` matching the first usable alias.

    Matching ignores case and all whitespace. Aliases are tried in the order
    given; the original (un-normalized) column name is returned, or None when
    no alias matches. If two columns normalize to the same key, the later
    column wins.
    """
    def _squash(label):
        # "Draft  Cap" and "draftcap" both become "draftcap".
        return re.sub(r"\s+", "", label).lower()

    by_key = {}
    for column in frame.columns:
        by_key[_squash(column)] = column

    for wanted in map(_squash, candidates):
        if wanted in by_key:
            return by_key[wanted]
    return None

def to_num(series):
    """Coerce a column of messy text to numbers; unparseable cells become NaN.

    Each value is stringified and trimmed, then cleaned in this order:
    percent signs, the word "round", a leading "r", a trailing ordinal suffix
    (st/nd/rd/th), thousands commas, and finally anything that is not a digit,
    dot, or minus sign.
    """
    cleanup_steps = (
        ('%', '', False),
        (r'(?i)round\s*', '', True),
        (r'(?i)^r\s*', '', True),
        (r'(?i)(st|nd|rd|th)$', '', True),
        (',', '', False),
        (r'[^0-9\.\-]', '', True),
    )
    text = series.astype(str).str.strip()
    for pattern, replacement, is_regex in cleanup_steps:
        text = text.str.replace(pattern, replacement, regex=is_regex)
    return pd.to_numeric(text, errors='coerce')

# canonical features to look for (NO Breakout Age)
ALIASES = {
    "DOM++":        ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":      ["40 Time","Forty","40"],
    "Draft Age":     ["Draft Age","Age at Draft","DraftAge","Age (Draft)","AgeDraft","Age_at_Draft"],
    "BMI":          ["BMI"],
    "YPC":          ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":          ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":        ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":       ["Break%","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital":["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":         ["Bama","Bama Rating","BamaAdj"],
    # optional extras if present
    "Shuttle":      ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":   ["3 Cone","Three Cone","3-Cone"],
    "Vertical":     ["Vertical","Vertical Jump"],
    "Broad":        ["Broad","Broad Jump"],
    "Speed Score":  ["Speed Score","SpeedScore"],
    "Rec Yards":    ["Receiving Yards","Rec Yds","RecYds"],
    "Targets":      ["Targets","Target Share","Tgt%"],
}
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]

# ---------- load ----------
if not CSV_PATH.exists():
    # Fall back to any similarly named overall file. Path.glob yields entries in
    # arbitrary (filesystem-dependent) order, so sort to make the pick deterministic.
    candidates = sorted(ROOT.glob("Bakery_RB_Overall*.csv"))
    if not candidates:
        raise FileNotFoundError(f"Could not find {CSV_PATH} or any Bakery_RB_Overall*.csv under {ROOT}")
    CSV_PATH = candidates[0]

# NOTE: rebinds the notebook-global `df`.
df = pd.read_csv(CSV_PATH)
# Header cells may carry stray leading/trailing whitespace.
df.columns = [c.strip() for c in df.columns]
print("Loaded:", CSV_PATH)
print("Rows x Cols:", df.shape)

# ---------- map target + features ----------
# Resolve the target column via the case/whitespace-insensitive alias lookup.
y_col = find_col(df, TARGET_CANDS)
if not y_col:
    raise ValueError(f"Could not find RB Grade in columns:\n{df.columns.tolist()}")

# canonical feature name -> actual sheet column; features with no match are dropped.
mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
mapped = {k:v for k,v in mapped.items() if v is not None}

if not mapped:
    raise ValueError("No usable feature columns found. Inspect df.columns for header names.")

print("\nUsing features (canonical <- sheet column):")
for k,v in mapped.items():
    print(f"  {k:<12} <- {v}")

# Parse every mapped column (and the target) to numeric; unparseable cells become NaN.
X_raw = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y_raw = to_num(df[y_col])

# ---------- drop rows with NaN TARGET ----------
mask = y_raw.notna()
dropped = len(y_raw) - mask.sum()
if dropped:
    print(f"\nDropped {dropped} rows with NaN RB Grade.")
# Reset both indexes so X_raw and y stay positionally aligned.
X_raw = X_raw.loc[mask].reset_index(drop=True)
y = y_raw.loc[mask].reset_index(drop=True)

# ---------- keep columns with enough data (loose thresholds for real-world sheets) ----------
# A feature survives only if it has at least 5 non-NaN values and more than one
# distinct value (a constant column carries no signal).
keep = [c for c in X_raw.columns if X_raw[c].notna().sum() >= 5 and X_raw[c].nunique(dropna=True) > 1]
if not keep:
    raise ValueError("All candidate features are too sparse/constant. "
                     "Relax thresholds or ensure the Overall file has those columns filled.")
# Take an explicit copy: the sign flips below assign into X_raw, and assigning
# into a column-subset slice triggers pandas' chained-assignment warning.
X_raw = X_raw[keep].copy()
print("Kept features:", keep)

# ---------- invert where lower is better (NO Breakout Age) ----------
# After negation a larger value means "better" for every feature.
for c in ["40 Time","Draft Capital","Shuttle","Three Cone"]:
    if c in X_raw.columns:
        X_raw[c] = -X_raw[c]

# ---------- impute X (median) + standardize ----------
# Both transformers are fit on all rows; this cell has no train/test split,
# so every R² below is in-sample.
imp = SimpleImputer(strategy="median")
X_imputed = imp.fit_transform(X_raw)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# final NaN guards
if np.isnan(X_scaled).any():
    raise ValueError("X still contains NaNs after imputation/standardization. Please inspect your data.")

if y.isna().any():
    raise ValueError("y contains NaNs after filtering; this should not happen.")

# ---------- fit non-negative models ----------
# results: model name -> (in-sample R², Series of coefficients indexed by feature)
results = {}

# (A) Positive OLS
ols_pos = LinearRegression(positive=True)
ols_pos.fit(X_scaled, y)
# R² is undefined when y is constant, hence the variance guard.
r2_ols = float(ols_pos.score(X_scaled, y)) if y.var() > 0 else float("nan")
results["OLS_Positive"] = (r2_ols, pd.Series(ols_pos.coef_, index=X_raw.columns))

# (B) NNLS with mean intercept (stable, non-negative)
# The intercept is fixed to mean(y); NNLS then fits the centered target.
# NOTE(review): since the columns of X_scaled are zero-mean, (A) and (B) should
# produce near-identical fits — confirm.
y_mean = float(y.mean())
w_nnls, _ = nnls(X_scaled, (y - y_mean).to_numpy())
y_pred = y_mean + X_scaled @ w_nnls
# Manual R² = 1 - SS_res / SS_tot.
r2_nnls = float(1 - np.sum((y - y_pred)**2) / np.sum((y - y_mean)**2)) if y.var() > 0 else float("nan")
results["NNLS_Positive"] = (r2_nnls, pd.Series(w_nnls, index=X_raw.columns))

# ---------- report ----------
# One row per model: its R² plus one "w:<feature>" column per coefficient.
rows = []
for name, (r2, coefs) in results.items():
    row = {"Model": name, "R2": r2}
    row.update({f"w:{k}": v for k,v in coefs.items()})
    rows.append(row)

comp = pd.DataFrame(rows).set_index("Model").sort_values("R2", ascending=False)
pd.set_option("display.max_columns", None)
print("\n=== Model comparison (non-negative only) ===")
# `display` is provided by the IPython/Jupyter environment.
display(comp.round(4))

# comp is sorted by R² descending, so the first index entry is the best model.
best_name = comp.index[0]
best_r2, best_coefs = results[best_name]
print(f"\nBest non-negative model: {best_name}  (R²={best_r2:.3f})")
print("\nSorted weights (standardized):")
print(best_coefs.sort_values(ascending=False).round(4))

# ---------- save artifacts for reuse ----------
# Three files are written under OUT_DIR:
#   rb_weights_<model>.csv  - standardized coefficients of the best model
#   rb_scaler.json          - everything needed to redo the preprocessing
#   rb_feature_mapping.json - which sheet column fed each canonical feature
weights_path = OUT_DIR / f"rb_weights_{best_name}.csv"
scaler_path  = OUT_DIR / "rb_scaler.json"
meta_path    = OUT_DIR / "rb_feature_mapping.json"

# Features that were negated before imputation/scaling ("lower is better").
# Without this list and the imputer medians the saved means/scales cannot be
# applied correctly to new rows.
inverted_features = [c for c in ["40 Time","Draft Capital","Shuttle","Three Cone"] if c in X_raw.columns]

best_coefs.to_csv(weights_path, header=["coef"])
with open(scaler_path, "w") as f:
    json.dump({
        "means": scaler.mean_.tolist(),
        "scales": scaler.scale_.tolist(),
        "feature_order": list(X_raw.columns),
        "intercept_mean": y_mean,
        "model": best_name,
        # Medians are of the already-negated columns, in feature_order.
        "imputer_medians": imp.statistics_.tolist(),
        "inverted_features": inverted_features
    }, f, indent=2)

with open(meta_path, "w") as f:
    json.dump({"mapped_columns": mapped, "kept_features": keep, "target": y_col}, f, indent=2)

print(f"\nSaved weights → {weights_path}")
print(f"Saved scaler   → {scaler_path}")
print(f"Saved mapping  → {meta_path}")


In [None]:
# =========================
# Bakery RB Grade: Multi-seed 80/20 Train/Test Model Bake-off
# - Source file: ./data/Bakery/RB/Bakery_RB_Overall.csv
# - No Breakout Age is used
# - Cleans, imputes, standardizes (per split, train-only to avoid leakage)
# - Models: OLS_Positive, Ridge, Lasso, RandomForest, GradientBoosting, NNLS_Positive
# - Metrics averaged across multiple random seeds: R2, MAE, RMSE
# =========================

import re
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from scipy.optimize import nnls

# ---------- Config ----------
CSV_PATH = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
SEEDS = [0, 1, 2, 3, 4, 42, 123, 777, 1337, 2025]  # 10 runs
TEST_SIZE = 0.20

# Canonical feature names (NO Breakout Age)
ALIASES = {
    "DOM++":        ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":      ["40 Time","Forty","40"],
    "BMI":          ["BMI"],
    "YPC":          ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":          ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":        ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":       ["Break%","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital":["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":         ["Bama","Bama Rating","BamaAdj"],
    # Optional extras if present (auto-used)
    "Shuttle":      ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":   ["3 Cone","Three Cone","3-Cone"],
    "Vertical":     ["Vertical","Vertical Jump"],
    "Broad":        ["Broad","Broad Jump"],
    "Speed Score":  ["Speed Score","SpeedScore"],
    "Rec Yards":    ["Receiving Yards","Rec Yds","RecYds"],
    "Targets":      ["Targets","Target Share","Tgt%"],
}
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]

# ---------- Helpers ----------
def find_col(frame: pd.DataFrame, candidates):
    """Look up a column of `frame` by alias, ignoring case and whitespace.

    Aliases are checked in order; the first one that matches a column yields
    that column's original name. Returns None if nothing matches. When several
    columns share a normalized key, the last such column is the one returned.
    """
    lookup = dict(
        (re.sub(r"\s+", "", col).lower(), col) for col in frame.columns
    )
    normalized = (re.sub(r"\s+", "", cand).lower() for cand in candidates)
    return next((lookup[key] for key in normalized if key in lookup), None)

def to_num(series: pd.Series) -> pd.Series:
    """Parse a text-like column into numbers, yielding NaN where parsing fails.

    Values are stringified and trimmed, then stripped (in this order) of percent
    signs, the word "round", a leading "r", a trailing ordinal suffix, commas,
    and any remaining character other than digits, '.' and '-'.
    """
    text = series.astype(str).str.strip()
    no_percent = text.str.replace("%", "", regex=False)
    no_round_word = no_percent.str.replace(r"(?i)round\s*", "", regex=True)
    no_r_prefix = no_round_word.str.replace(r"(?i)^r\s*", "", regex=True)
    no_ordinal = no_r_prefix.str.replace(r"(?i)(st|nd|rd|th)$", "", regex=True)
    no_commas = no_ordinal.str.replace(",", "", regex=False)
    numeric_text = no_commas.str.replace(r"[^0-9\.\-]", "", regex=True)
    return pd.to_numeric(numeric_text, errors="coerce")

def build_clean_matrix(csv_path: Path):
    """Load the Bakery RB sheet and return a numeric feature matrix and target.

    Steps: read the CSV, trim header whitespace, resolve the target and feature
    columns through the alias tables, parse everything to numeric, drop rows
    without a target, drop features that are too sparse or constant, and negate
    the "lower is better" features so that larger always means better.

    Feature NaNs are NOT imputed here; callers impute per train/test split.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        (X, y, keep): the feature DataFrame (may contain NaN), the target Series
        (no NaN), and the list of kept canonical feature names.

    Raises:
        ValueError: if the target column cannot be found, or if no feature
            passes the sparsity/constancy filter.
    """
    df = pd.read_csv(csv_path)
    df.columns = [c.strip() for c in df.columns]

    y_col = find_col(df, TARGET_CANDS)
    if not y_col:
        raise ValueError(f"Could not find RB Grade. Columns:\n{df.columns.tolist()}")

    # canonical feature name -> actual sheet column (unmatched features dropped)
    mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
    mapped = {k:v for k,v in mapped.items() if v is not None}

    X = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
    y = to_num(df[y_col])

    # Drop rows with missing target; reset indexes so X and y stay aligned.
    mask = y.notna()
    X, y = X.loc[mask].reset_index(drop=True), y.loc[mask].reset_index(drop=True)

    # Keep loose: feature must have at least 5 non-NaN values and >1 unique value
    keep = [c for c in X.columns if X[c].notna().sum() >= 5 and X[c].nunique(dropna=True) > 1]
    if not keep:
        raise ValueError("All candidate features are too sparse/constant. Check your CSV.")
    # Explicit copy: the sign flips below assign into X, and assigning into a
    # column-subset slice triggers pandas' chained-assignment warning.
    X = X[keep].copy()

    # Invert "lower is better"
    for c in ["40 Time", "Draft Capital", "Shuttle", "Three Cone"]:
        if c in X.columns:
            X[c] = -X[c]

    return X, y, keep

# ---------- Load & prepare once (feature set decided here) ----------
X_all, y_all, kept_features = build_clean_matrix(CSV_PATH)
print("Features used:", kept_features)
print("Dataset size:", len(y_all))

# ---------- Model factory (per-seed) ----------
def get_models(random_state: int):
    """Create a fresh set of candidate regressors for one seed.

    Returns a dict mapping model name to an unfitted estimator. The
    "NNLS_Positive" entry is the placeholder string "custom"; the caller fits
    that model by hand. Ridge and Lasso only receive `random_state` when their
    parameter list actually contains it.
    """
    def _seeded_if_supported(estimator_cls, **kwargs):
        # Probe a default instance for a random_state parameter.
        if "random_state" in estimator_cls().get_params():
            return estimator_cls(random_state=random_state, **kwargs)
        return estimator_cls(**kwargs)

    models = {}
    models["OLS_Positive"] = LinearRegression(positive=True)
    models["Ridge"] = _seeded_if_supported(Ridge, alpha=1.0)
    models["Lasso"] = _seeded_if_supported(Lasso, alpha=0.01, max_iter=5000)
    models["RandomForest"] = RandomForestRegressor(
        n_estimators=300, max_depth=None, random_state=random_state
    )
    models["GradientBoosting"] = GradientBoostingRegressor(random_state=random_state)
    models["NNLS_Positive"] = "custom"
    return models

# ---------- Run multiple 80/20 splits ----------
all_runs = []  # collects per-run metrics

# One train/test split per seed; every model is evaluated on each split.
for seed in SEEDS:
    # Split indices (avoid leakage by fitting imputer/scaler on train only)
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X_all, y_all, test_size=TEST_SIZE, random_state=seed
    )

    # Impute + scale (train-only fit)
    imp = SimpleImputer(strategy="median")
    scaler = StandardScaler()

    X_train_imp = imp.fit_transform(X_train_raw)
    X_test_imp  = imp.transform(X_test_raw)

    X_train = scaler.fit_transform(X_train_imp)
    X_test  = scaler.transform(X_test_imp)

    # Fresh estimators for this seed.
    models = get_models(seed)

    for name, model in models.items():
        if name == "NNLS_Positive":
            # Non-negative least squares with mean intercept from TRAIN
            # (`model` is the placeholder string "custom" for this entry)
            mu = float(y_train.mean())
            w, _ = nnls(X_train, (y_train - mu).to_numpy())
            y_pred = mu + X_test @ w
        else:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

        # Held-out metrics for this (seed, model) pair.
        all_runs.append({
            "seed": seed,
            "model": name,
            "R2": r2_score(y_test, y_pred),
            "MAE": mean_absolute_error(y_test, y_pred),
            "RMSE": np.sqrt(mean_squared_error(y_test, y_pred))  # version-safe RMSE
        })

# ---------- Aggregate results ----------
# Mean and standard deviation of each metric across seeds, one row per model,
# ordered by mean R² (best first).
runs_df = pd.DataFrame(all_runs)
summary = (runs_df
           .groupby("model")
           .agg(R2_mean=("R2","mean"),   R2_std=("R2","std"),
                MAE_mean=("MAE","mean"), MAE_std=("MAE","std"),
                RMSE_mean=("RMSE","mean"), RMSE_std=("RMSE","std"))
           .sort_values("R2_mean", ascending=False))

pd.set_option("display.max_columns", None)
print("\n=== 10x 80/20 Train/Test — Averaged Metrics ===")
# `display` is provided by the IPython/Jupyter environment.
display(summary.round(4))

# head(100) shows up to 100 rows, i.e. more than "first few" for 6 models x 10 seeds.
print("\nPer-run results (first few rows):")
display(runs_df.sort_values(["model","seed"]).head(100))


In [None]:
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Randomized hyper-parameter search for RandomForest and GradientBoosting.
#
# Use same X_all, y_all from your pipeline (built by build_clean_matrix in the
# bake-off cell). X_all is NOT imputed, and GradientBoostingRegressor rejects
# NaN inputs, so each model is wrapped in a Pipeline that median-imputes first.
# Putting the imputer inside the pipeline also means it is refit on the training
# part of every CV fold, so no validation-fold statistics leak into training.

param_dist_rf = {
    "n_estimators": [300, 500, 800, 1200],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", 0.5, 0.7, 1.0],
}

param_dist_gb = {
    "n_estimators": [300, 500, 1000, 2000],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [2, 3, 4, 5],
    "subsample": [0.7, 0.85, 1.0],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

cv = KFold(n_splits=5, shuffle=True, random_state=42)

rf = RandomForestRegressor(random_state=42)
gb = GradientBoostingRegressor(random_state=42)


def _impute_then(model):
    """Wrap `model` in a pipeline that median-imputes missing features first."""
    return Pipeline([("impute", SimpleImputer(strategy="median")), ("model", model)])


def _for_model_step(param_dist):
    """Prefix parameter names so they address the pipeline's 'model' step."""
    return {f"model__{name}": values for name, values in param_dist.items()}


def _without_step_prefix(best_params):
    """Undo the 'model__' prefix so printed parameters use the estimator's own names."""
    return {name.split("__", 1)[1]: value for name, value in best_params.items()}


rf_search = RandomizedSearchCV(_impute_then(rf), _for_model_step(param_dist_rf), n_iter=30, cv=cv,
                               scoring="r2", n_jobs=-1, random_state=42, verbose=2)
gb_search = RandomizedSearchCV(_impute_then(gb), _for_model_step(param_dist_gb), n_iter=30, cv=cv,
                               scoring="r2", n_jobs=-1, random_state=42, verbose=2)

# Fit searches
# NOTE: best_estimator_ of each search is now the whole imputer+model Pipeline.
rf_search.fit(X_all, y_all)
gb_search.fit(X_all, y_all)

# best_score_ is the mean cross-validated R² of the best parameter setting.
print("Best RF params:", _without_step_prefix(rf_search.best_params_))
print("Best RF R²:", rf_search.best_score_)

print("Best GB params:", _without_step_prefix(gb_search.best_params_))
print("Best GB R²:", gb_search.best_score_)


In [None]:
# ===========================
# Bakery RB Grade — Feature Engineering + XGBoost/LightGBM + Stacking
# Source: ./data/Bakery/RB/Bakery_RB_Overall.csv
# Goals:
#   - richer features (interactions/ratios)
#   - tuned GB/RF + (optional) XGBoost/LightGBM
#   - stacking & simple blending
# Outputs:
#   - Test-set metrics for each model and ensembles
# ===========================

import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor

# Optional boosters (handled gracefully if not installed)
_has_xgb = _has_lgbm = False
try:
    from xgboost import XGBRegressor
    _has_xgb = True
except Exception:
    pass
try:
    from lightgbm import LGBMRegressor
    _has_lgbm = True
except Exception:
    pass

# ---------- Config ----------
CSV_PATH = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
TEST_SIZE = 0.20
RANDOM_STATE = 42
N_JOBS = -1
CV = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Canonical features (NO Breakout Age)
ALIASES = {
    "DOM++":        ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":      ["40 Time","Forty","40"],
    "BMI":          ["BMI"],
    "YPC":          ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":          ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":        ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":       ["Break%","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital":["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":         ["Bama","Bama Rating","BamaAdj"],
    # optional extras if present
    "Shuttle":      ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":   ["3 Cone","Three Cone","3-Cone"],
    "Vertical":     ["Vertical","Vertical Jump"],
    "Broad":        ["Broad","Broad Jump"],
    "Speed Score":  ["Speed Score","SpeedScore"],
    "Rec Yards":    ["Receiving Yards","Rec Yds","RecYds"],
    "Targets":      ["Targets","Target Share","Tgt%"],
}
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]

# ---------- Helpers ----------
def find_col(frame, candidates):
    """Resolve an alias list to a real column name of `frame`.

    Comparison is done on lower-cased names with all whitespace removed. The
    first alias (in order) that matches wins and the column's original spelling
    is returned; None means no alias matched. Columns that collapse to the same
    normalized key shadow earlier ones.
    """
    def canon(label):
        return re.sub(r"\s+", "", label).lower()

    table = {canon(col): col for col in frame.columns}
    for cand in candidates:
        hit = table.get(canon(cand))
        if hit is not None:
            return hit
    return None

def to_num(series):
    """Turn a messy text column into floats/ints; cells that cannot be parsed become NaN.

    After stringifying and trimming, substitutions run in a fixed order:
    '%' -> '', the word "round" -> '', a leading 'r' -> '', a trailing
    st/nd/rd/th -> '', ',' -> '', then every character outside [0-9.-] -> ''.
    """
    cleaned = series.astype(str).str.strip()
    # (pattern, treat pattern as regex?) — every match is replaced by ''.
    substitutions = [
        ('%', False),
        (r'(?i)round\s*', True),
        (r'(?i)^r\s*', True),
        (r'(?i)(st|nd|rd|th)$', True),
        (',', False),
        (r'[^0-9\.\-]', True),
    ]
    for pattern, use_regex in substitutions:
        cleaned = cleaned.str.replace(pattern, '', regex=use_regex)
    return pd.to_numeric(cleaned, errors='coerce')

def load_and_map(csv_path: Path):
    """Read the sheet at `csv_path` and return (features, target) as numeric data.

    Headers are whitespace-trimmed, the target and feature columns are resolved
    through TARGET_CANDS / ALIASES, all values are parsed with to_num, rows
    lacking a target are removed, and features with fewer than 5 non-NaN values
    or only one distinct value are discarded. No sign inversion or imputation
    happens here.

    Raises:
        ValueError: when the target column is missing or no feature survives
            the filter.
    """
    sheet = pd.read_csv(csv_path)
    sheet.columns = [name.strip() for name in sheet.columns]

    target_col = find_col(sheet, TARGET_CANDS)
    if not target_col:
        raise ValueError(f"Could not find RB Grade. Columns:\n{sheet.columns.tolist()}")

    # canonical feature name -> sheet column, skipping features with no match
    resolved = {}
    for canonical, aliases in ALIASES.items():
        source_col = find_col(sheet, aliases)
        if source_col is not None:
            resolved[canonical] = source_col

    features = pd.DataFrame(
        {canonical: to_num(sheet[source_col]) for canonical, source_col in resolved.items()}
    )
    target = to_num(sheet[target_col])

    # drop NaN target (indexes reset so both stay aligned)
    has_target = target.notna()
    features = features.loc[has_target].reset_index(drop=True)
    target = target.loc[has_target].reset_index(drop=True)

    # keep loose: ≥5 non-nan and >1 unique
    usable = [
        name for name in features.columns
        if features[name].notna().sum() >= 5 and features[name].nunique(dropna=True) > 1
    ]
    if not usable:
        raise ValueError("All candidate features are too sparse/constant. Check your CSV.")
    return features[usable], target

def add_engineered_features(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `X` with sign-normalized and derived feature columns.

    In order: "lower is better" columns are negated; pairwise products and
    ratios are appended when both inputs exist (a zero denominator yields NaN);
    the two ratio columns are clipped to [-10, 10]; finally a 5-bin quantile
    index (0-4, computed on first-occurrence ranks) is appended for selected
    columns as "<name>_q". The input frame is not modified.

    NOTE(review): the quantile bins are computed over every row passed in, so
    calling this before a train/test split lets test rows influence the bins.
    """
    out = X.copy()

    def _has(*names):
        return all(name in out.columns for name in names)

    # Invert "lower is better" BEFORE interactions
    for lower_is_better in ("40 Time", "Draft Capital", "Shuttle", "Three Cone"):
        if _has(lower_is_better):
            out[lower_is_better] = -out[lower_is_better]

    # Interactions (created only if both inputs exist)
    products = (
        ("BMI", "40 Time", "BMIx40"),
        ("ELU", "YCO/A", "ELUxYCOA"),
        ("DOM++", "Draft Capital", "DOMxDraft"),
        ("YPC", "ELU", "YPCxELU"),
    )
    for left, right, new_name in products:
        if _has(left, right):
            out[new_name] = out[left] * out[right]

    # Ratios (created only if both inputs exist); 0 in the denominator -> NaN
    ratios = (
        ("YCO/A", "YPC", "YCOA_to_YPC"),
        ("Rec Yards", "DOM++", "RecYds_to_DOM"),
    )
    for numer, denom, new_name in ratios:
        if _has(numer, denom):
            out[new_name] = out[numer] / out[denom].replace(0, np.nan)

    # Clip extreme ratios to reduce noise
    for ratio_name in ("YCOA_to_YPC", "RecYds_to_DOM"):
        if _has(ratio_name):
            out[ratio_name] = out[ratio_name].clip(lower=-10, upper=10)

    # Quantile bins; ranking with method="first" breaks ties so qcut gets unique edges
    for base in ("DOM++", "YPC", "ELU", "YCO/A", "Break%"):
        if _has(base):
            ordinal = out[base].rank(method="first")
            out[f"{base}_q"] = pd.qcut(ordinal, q=5, labels=False)

    return out

def metrics(y_true, y_pred):
    """Score predictions against the truth; returns a dict with keys R2, MAE, RMSE."""
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the MSE, cast to a plain Python float.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    return {"R2": r2, "MAE": mae, "RMSE": rmse}

# ---------- Build dataset ----------
# NOTE: rebinds the notebook-globals X_raw / y used by earlier cells.
X_raw, y = load_and_map(CSV_PATH)
# Engineered features are computed on ALL rows, before the split below.
# NOTE(review): the quantile-bin columns therefore see test rows — confirm this is acceptable.
X_feat = add_engineered_features(X_raw)

# Split
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_feat, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Impute (median) — train only
imp = SimpleImputer(strategy="median")
X_train_imp = imp.fit_transform(X_train_raw)
X_test_imp  = imp.transform(X_test_raw)

# Standardize for linear/meta models
# (tree models below use the imputed, unscaled matrices)
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train_imp)
X_test_std  = scaler.transform(X_test_imp)

# ---------- Base models (with tuning) ----------
# results: list of (model name, best CV R² or NaN when untuned, test-set metrics dict)
results = []

# Gradient Boosting (tuned)
gb = GradientBoostingRegressor(random_state=RANDOM_STATE)
gb_param = {
    "n_estimators": [500, 1000, 2000],
    "learning_rate": [0.01, 0.03, 0.05, 0.1],
    "max_depth": [2,3,4,5],
    "subsample": [0.7, 0.85, 1.0],
    "min_samples_split": [2,5,10],
    "min_samples_leaf": [1,2,4],
}
# 35 random parameter draws, each scored by 5-fold CV on the training split.
gb_search = RandomizedSearchCV(
    gb, gb_param, n_iter=35, scoring="r2", cv=CV, n_jobs=N_JOBS, random_state=RANDOM_STATE, verbose=0
)
gb_search.fit(X_train_imp, y_train)
gb_best = gb_search.best_estimator_
y_pred = gb_best.predict(X_test_imp)
results.append(("GradientBoosting(Tuned)", gb_search.best_score_, metrics(y_test, y_pred)))

# Random Forest (tuned)
rf = RandomForestRegressor(random_state=RANDOM_STATE)
rf_param = {
    "n_estimators": [600, 1000, 1500],
    "max_depth": [None, 8, 12, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", 0.5, 0.7, 1.0],
}
rf_search = RandomizedSearchCV(
    rf, rf_param, n_iter=35, scoring="r2", cv=CV, n_jobs=N_JOBS, random_state=RANDOM_STATE, verbose=0
)
rf_search.fit(X_train_imp, y_train)
rf_best = rf_search.best_estimator_
y_pred = rf_best.predict(X_test_imp)
results.append(("RandomForest(Tuned)", rf_search.best_score_, metrics(y_test, y_pred)))

# Lasso / Ridge on standardized features (baseline linear)
# Fixed alphas, no tuning, hence NaN in the CV-score slot.
lasso = Lasso(alpha=0.005, max_iter=20000).fit(X_train_std, y_train)
y_pred = lasso.predict(X_test_std)
results.append(("Lasso", np.nan, metrics(y_test, y_pred)))

ridge = Ridge(alpha=1.0).fit(X_train_std, y_train)
y_pred = ridge.predict(X_test_std)
results.append(("Ridge", np.nan, metrics(y_test, y_pred)))

# XGBoost (tuned) — if available
if _has_xgb:
    # Tune XGBoost with the same randomized-search protocol as the other tree
    # models: 40 random draws, each scored by cross-validation on the train split.
    xgb = XGBRegressor(
        random_state=RANDOM_STATE, objective="reg:squarederror", nthread=-1
    )
    xgb_param = {
        "n_estimators": [800, 1200, 2000],
        "max_depth": [3,4,5,6],
        "learning_rate": [0.01, 0.03, 0.05, 0.1],
        "subsample": [0.7, 0.85, 1.0],
        "colsample_bytree": [0.7, 0.9, 1.0],
        "reg_alpha": [0, 0.1, 1.0],
        "reg_lambda": [1.0, 2.0, 5.0],
    }
    xgb_search = RandomizedSearchCV(
        xgb, xgb_param, n_iter=40, scoring="r2", cv=CV, n_jobs=N_JOBS, random_state=RANDOM_STATE, verbose=0
    )
    xgb_search.fit(X_train_imp, y_train)
    # The fitted attribute is `best_estimator_` (single trailing underscore);
    # the previous `best_estimator__` raised AttributeError whenever xgboost was installed.
    xgb_best = xgb_search.best_estimator_
    y_pred = xgb_best.predict(X_test_imp)
    results.append(("XGBoost(Tuned)", xgb_search.best_score_, metrics(y_test, y_pred)))
else:
    # Downstream blending/stacking skips models that are None.
    xgb_best = None

# LightGBM (tuned) — if available
if _has_lgbm:
    # Tune LightGBM with randomized search: 40 random draws, each scored by
    # cross-validation on the train split.
    lgbm = LGBMRegressor(random_state=RANDOM_STATE)
    lgbm_param = {
        "n_estimators": [800, 1200, 2000],
        "learning_rate": [0.01, 0.03, 0.05, 0.1],
        "max_depth": [-1, 4, 6, 8],
        "num_leaves": [31, 63, 127],
        "subsample": [0.7, 0.85, 1.0],
        "colsample_bytree": [0.7, 0.9, 1.0],
        "min_child_samples": [5, 10, 20],
        "reg_alpha": [0, 0.1, 1.0],
        "reg_lambda": [1.0, 2.0, 5.0],
    }
    lgbm_search = RandomizedSearchCV(
        lgbm, lgbm_param, n_iter=40, scoring="r2", cv=CV, n_jobs=N_JOBS, random_state=RANDOM_STATE, verbose=0
    )
    lgbm_search.fit(X_train_imp, y_train)
    lgbm_best = lgbm_search.best_estimator_
    y_pred = lgbm_best.predict(X_test_imp)
    results.append(("LightGBM(Tuned)", lgbm_search.best_score_, metrics(y_test, y_pred)))
else:
    # Downstream blending/stacking skips models that are None.
    lgbm_best = None

# ---------- Simple blend (average of available boosted/forest models) ----------
# Unweighted mean of the test-set predictions of every tuned tree model that exists.
blend_preds = []
for model in [gb_best, rf_best, xgb_best, lgbm_best]:
    if model is not None:
        blend_preds.append(model.predict(X_test_imp))
# gb_best and rf_best are always present, so this condition holds whenever the
# cell ran this far.
if len(blend_preds) >= 2:
    y_blend = np.mean(blend_preds, axis=0)
    results.append(("Blend(GB+RF+XGB+LGBM avail.)", np.nan, metrics(y_test, y_blend)))

# ---------- Stacking ensemble (meta-learner: Ridge on the base models' predictions) ----------
# The stack is fit on the imputed (not standardized) matrix; with
# passthrough=False the Ridge meta-learner sees only base-model predictions.
base_estimators = []
# NOTE(review): base_for_stack_preds is never used in this cell — presumably a leftover.
base_for_stack_preds = []

# Collect the tuned base models that are available (None = library not installed).
# The `x if x is not None else None` expressions are no-ops kept from the original.
for name, model in [
    ("gb", gb_best),
    ("rf", rf_best),
    ("xgb", xgb_best if xgb_best is not None else None),
    ("lgbm", lgbm_best if lgbm_best is not None else None),
]:
    if model is not None:
        base_estimators.append((name, model))

if len(base_estimators) >= 2:
    stack = StackingRegressor(
        estimators=base_estimators,
        final_estimator=Ridge(alpha=1.0),
        passthrough=False, n_jobs=N_JOBS
    )
    stack.fit(X_train_imp, y_train)
    y_pred = stack.predict(X_test_imp)
    results.append(("Stacking(Ridge meta)", np.nan, metrics(y_test, y_pred)))

# ---------- Report ----------
# One row per model/ensemble: best CV R² (NaN where no search was run) plus the
# test-set R2 / MAE / RMSE, sorted by test R² (best first).
rows = []
for name, cv_best_r2, m in results:
    row = {"Model": name, "CV_R2_best": cv_best_r2, **m}
    rows.append(row)
report = pd.DataFrame(rows).sort_values("R2", ascending=False)
pd.set_option("display.max_columns", None)
# `display` is provided by the IPython/Jupyter environment.
display(report.round(4))

# Tip: If you're still under 0.90 R², consider:
# - Adding more years/features (agility jumps, bench, SOS, OL strength, conference)
# - More interactions (e.g., ELU×Break%, DOM×YPC, BMI×YCO/A)
# - Bayesian optimization (Optuna) for tighter tuning
# - Calibrating/denoising target (e.g., year-wise z-score of RB Grade)


In [None]:
# ===========================
# Leakage-free OOF stacking + NNLS blending for best test R²
# ===========================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.linear_model import RidgeCV, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from scipy.optimize import nnls

# Optional boosters (skip if not installed)
_has_xgb = _has_lgbm = False
try:
    from xgboost import XGBRegressor
    _has_xgb = True
except Exception:
    pass
try:
    from lightgbm import LGBMRegressor
    _has_lgbm = True
except Exception:
    pass

# ---------- config ----------
# Input sheet, hold-out fraction, seed and parallelism used throughout this cell.
CSV_PATH = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
TEST_SIZE = 0.20
RANDOM_STATE = 42
N_JOBS = -1
# NOTE(review): CV is defined here but the searches below build their own KFold
# objects; it does not appear to be used in this cell.
CV = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Canonical feature name -> acceptable column headers in the CSV.
# find_col compares names case-insensitively with all whitespace removed and
# returns the first alias (in list order) that matches.
# NOTE(review): the "Rec Yards" list does not contain "Rec Yards" itself, so a
# column with exactly that header would not be picked up — confirm intent.
ALIASES = {
    "DOM++":        ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":      ["40 Time","Forty","40"],
    "BMI":          ["BMI"],
    "YPC":          ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":          ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":        ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":       ["Break%","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital":["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":         ["Bama","Bama Rating","BamaAdj"],
    "Shuttle":      ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":   ["3 Cone","Three Cone","3-Cone"],
    "Vertical":     ["Vertical","Vertical Jump"],
    "Broad":        ["Broad","Broad Jump"],
    "Speed Score":  ["Speed Score","SpeedScore"],
    "Rec Yards":    ["Receiving Yards","Rec Yds","RecYds"],
    "Targets":      ["Targets","Target Share","Tgt%"],
}
# Acceptable headers for the regression target column.
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]

def find_col(frame, candidates):
    """Return the first column of `frame` that matches one of `candidates`.

    Names are compared case-insensitively with all whitespace removed.
    Candidates are tried in order; the original column label is returned,
    or None when nothing matches.
    """
    def _squash(label):
        return re.sub(r"\s+", "", label).lower()

    # normalised label -> original label (a later duplicate overwrites an earlier one)
    lookup = {}
    for column in frame.columns:
        lookup[_squash(column)] = column

    matches = (lookup[_squash(cand)] for cand in candidates if _squash(cand) in lookup)
    return next(matches, None)

def to_num(series):
    """Coerce a column of messy text to numbers.

    Values are stringified and stripped, then cleaned in a fixed order: percent
    signs, the word "round", a leading "r", a trailing ordinal suffix, commas,
    and finally anything that is not a digit, dot or minus sign. Whatever still
    fails to parse becomes NaN.
    """
    # (pattern, is_regex) pairs, applied top to bottom; order matters.
    cleanup_steps = (
        ('%', False),
        (r'(?i)round\s*', True),
        (r'(?i)^r\s*', True),
        (r'(?i)(st|nd|rd|th)$', True),
        (',', False),
        (r'[^0-9\.\-]', True),
    )
    text = series.astype(str).str.strip()
    for pattern, is_regex in cleanup_steps:
        text = text.str.replace(pattern, '', regex=is_regex)
    return pd.to_numeric(text, errors='coerce')

def load_and_map(csv_path: Path):
    """Load the CSV and return (X, y) with canonical, numeric feature columns.

    Columns are located through ALIASES / TARGET_CANDS, converted with to_num,
    and rows without a parsable target are dropped. Features with fewer than
    five non-null values or only one distinct value are discarded.

    Raises:
        ValueError: if no target column is found, or if no feature survives
            the sparsity/constant filter.
    """
    raw = pd.read_csv(csv_path)
    raw.columns = [label.strip() for label in raw.columns]

    target_col = find_col(raw, TARGET_CANDS)
    if not target_col:
        raise ValueError("RB Grade column not found.")

    # canonical feature name -> numeric Series, for every alias group that resolved
    numeric = {}
    for feat, alts in ALIASES.items():
        source_col = find_col(raw, alts)
        if source_col is not None:
            numeric[feat] = to_num(raw[source_col])
    features = pd.DataFrame(numeric)

    target = to_num(raw[target_col])
    has_target = target.notna()
    features = features.loc[has_target].reset_index(drop=True)
    target = target.loc[has_target].reset_index(drop=True)

    def _usable(col):
        return features[col].notna().sum() >= 5 and features[col].nunique(dropna=True) > 1

    keep = [col for col in features.columns if _usable(col)]
    if not keep:
        raise ValueError("All candidate features are too sparse/constant.")
    return features[keep], target

def add_features(X):
    """Return a copy of X with sign flips, interactions, ratios and quintile bins.

    * "Lower is better" columns are negated.
    * Products and ratios are added only when both source columns exist;
      ratios treat a zero denominator as missing and are clipped to [-10, 10].
    * "<col>_q" holds the 0-4 quintile of the column's rank.

    NOTE(review): the quintile bins are computed over every row passed in. In
    this cell the function runs before the train/test split, so the bins also
    reflect test rows.
    """
    out = X.copy()

    for col in ("40 Time", "Draft Capital", "Shuttle", "Three Cone"):
        if col in out.columns:
            out[col] = -out[col]

    products = (
        ("BMI", "40 Time", "BMIx40"),
        ("ELU", "YCO/A", "ELUxYCOA"),
        ("DOM++", "Draft Capital", "DOMxDraft"),
        ("YPC", "ELU", "YPCxELU"),
    )
    for left, right, label in products:
        if left in out.columns and right in out.columns:
            out[label] = out[left] * out[right]

    ratios = (
        ("YCO/A", "YPC", "YCOA_to_YPC"),
        ("Rec Yards", "DOM++", "RecYds_to_DOM"),
    )
    for numer, denom, label in ratios:
        if numer in out.columns and denom in out.columns:
            safe_denom = out[denom].replace(0, np.nan)
            out[label] = (out[numer] / safe_denom).clip(-10, 10)

    for col in ("DOM++", "YPC", "ELU", "YCO/A", "Break%"):
        if col in out.columns:
            # rank(method="first") breaks ties so qcut always gets distinct edges
            ranks = out[col].rank(method="first")
            out[f"{col}_q"] = pd.qcut(ranks, 5, labels=False)
    return out

def rmse(y_true,y_pred):
    """Root-mean-squared error between two aligned arrays/Series, as a Python float."""
    residual = y_true - y_pred
    mean_sq = (residual ** 2).mean()
    return float(np.sqrt(mean_sq))

# 1) build data
# X0 holds the raw canonical features; X adds the engineered columns.
# NOTE(review): add_features derives rank-based quintile columns over all rows,
# i.e. before the split below — confirm this is acceptable for a "leakage-free" run.
X0, y = load_and_map(CSV_PATH)
X = add_features(X0)

# 2) split once (final evaluation split)
X_tr_raw, X_te_raw, y_tr, y_te = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# 3) imputers (fit on train only)
# Medians are learned from the training rows and reused for the test rows.
# The *_imp outputs are what the OOF loop below indexes positionally.
imp = SimpleImputer(strategy="median")
X_tr_imp = imp.fit_transform(X_tr_raw)
X_te_imp = imp.transform(X_te_raw)

# 4) base models with light tuning on TRAIN ONLY
# Gradient boosting: random search over 35 sampled parameter combinations,
# scored by R2 on a shuffled 4-fold split of the training rows.
gb = GradientBoostingRegressor(random_state=RANDOM_STATE)
gb_param = {
    "n_estimators": [600,1000,1500,2000],
    "learning_rate": [0.01,0.03,0.05,0.1],
    "max_depth": [2,3,4,5],
    "subsample": [0.7,0.85,1.0],
    "min_samples_split": [2,5,10],
    "min_samples_leaf": [1,2,4],
}
gb_search = RandomizedSearchCV(gb, gb_param, n_iter=35, scoring="r2", cv=KFold(4, shuffle=True, random_state=7),
                               n_jobs=N_JOBS, random_state=RANDOM_STATE, verbose=0)
gb_search.fit(X_tr_imp, y_tr)
# Winning configuration; the OOF loop further down refits this same object.
gb_best = gb_search.best_estimator_

# Random forest: same search recipe as above (30 draws, shuffled 4-fold CV with
# its own fold seed), fitted on the training rows only.
rf = RandomForestRegressor(random_state=RANDOM_STATE)
rf_param = {
    "n_estimators": [800,1200,1600],
    "max_depth": [None,10,14,20],
    "min_samples_split": [2,5,10],
    "min_samples_leaf": [1,2,4],
    "max_features": ["sqrt",0.6,0.9,1.0],
}
rf_search = RandomizedSearchCV(rf, rf_param, n_iter=30, scoring="r2", cv=KFold(4, shuffle=True, random_state=8),
                               n_jobs=N_JOBS, random_state=RANDOM_STATE, verbose=0)
rf_search.fit(X_tr_imp, y_tr)
rf_best = rf_search.best_estimator_

# Base learners for stacking; the optional boosters below append themselves.
base_models = [("gb", gb_best), ("rf", rf_best)]

# XGBoost is optional: this branch only runs when the import at the top succeeded.
if _has_xgb:
    # NOTE(review): nthread=-1 on the estimator combined with n_jobs=N_JOBS on the
    # search may oversubscribe CPU cores — confirm this is intended.
    xgb = XGBRegressor(objective="reg:squarederror", random_state=RANDOM_STATE, nthread=-1)
    xgb_param = {
        "n_estimators":[800,1200,2000],
        "max_depth":[3,4,5,6],
        "learning_rate":[0.01,0.03,0.05,0.1],
        "subsample":[0.7,0.85,1.0],
        "colsample_bytree":[0.7,0.9,1.0],
        "reg_alpha":[0,0.1,1.0], "reg_lambda":[1.0,2.0,5.0]
    }
    xgb_search = RandomizedSearchCV(xgb, xgb_param, n_iter=35, scoring="r2", cv=KFold(4, shuffle=True, random_state=9),
                                    n_jobs=N_JOBS, random_state=RANDOM_STATE, verbose=0)
    xgb_search.fit(X_tr_imp, y_tr)
    # Register the tuned booster as a third base learner.
    base_models.append(("xgb", xgb_search.best_estimator_))

# LightGBM is optional: this branch only runs when the import at the top succeeded.
if _has_lgbm:
    lgbm = LGBMRegressor(random_state=RANDOM_STATE)
    lgbm_param = {
        "n_estimators":[800,1200,2000],
        "learning_rate":[0.01,0.03,0.05,0.1],
        "max_depth":[-1,4,6,8],
        "num_leaves":[31,63,127],
        "subsample":[0.7,0.85,1.0],
        "colsample_bytree":[0.7,0.9,1.0],
        "min_child_samples":[5,10,20],
        "reg_alpha":[0,0.1,1.0], "reg_lambda":[1.0,2.0,5.0],
    }
    lgbm_search = RandomizedSearchCV(lgbm, lgbm_param, n_iter=35, scoring="r2", cv=KFold(4, shuffle=True, random_state=10),
                                     n_jobs=N_JOBS, random_state=RANDOM_STATE, verbose=0)
    lgbm_search.fit(X_tr_imp, y_tr)
    # Register the tuned booster as an additional base learner.
    base_models.append(("lgbm", lgbm_search.best_estimator_))

# 5) OOF predictions for meta training (no leakage)
# For every base model, each training row is predicted by a fit that did not
# see that row. One column per base model in both matrices.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
oof_preds = np.zeros((len(X_tr_imp), len(base_models)))
te_preds = np.zeros((len(X_te_imp), len(base_models)))

for m_idx, (name, model) in enumerate(base_models):
    fold_te = []  # test-set predictions from each fold's fit
    oof = np.zeros(len(X_tr_imp))
    for tr_idx, val_idx in kf.split(X_tr_imp):
        X_tr_f, X_val_f = X_tr_imp[tr_idx], X_tr_imp[val_idx]
        # y_val_f is unpacked for symmetry but not used afterwards.
        y_tr_f, y_val_f = y_tr.iloc[tr_idx], y_tr.iloc[val_idx]
        # Refits the tuned estimator object in place; after the loop it is left
        # fitted on the last fold's training rows.
        model.fit(X_tr_f, y_tr_f)
        oof[val_idx] = model.predict(X_val_f)
        fold_te.append(model.predict(X_te_imp))
    oof_preds[:, m_idx] = oof
    # Test-time feature for the meta models = mean of the per-fold predictions.
    te_preds[:, m_idx] = np.mean(fold_te, axis=0)

# 6) meta models on OOF preds
# The linear meta-learners are trained on standardized OOF columns; the scaler
# is fitted on the train OOF matrix and reused for the test matrix.
sc_meta = StandardScaler()
Z_tr = sc_meta.fit_transform(oof_preds)
Z_te = sc_meta.transform(te_preds)

ridge_meta = RidgeCV(alphas=np.logspace(-3, 2, 30)).fit(Z_tr, y_tr)
elas_meta  = ElasticNetCV(l1_ratio=[.1,.3,.5,.7,.9], alphas=np.logspace(-3,1,20), max_iter=20000).fit(Z_tr, y_tr)

# small GB meta (nonlinear combiner)
# Trained on the raw (unscaled) OOF columns, so it must also be given the raw
# te_preds matrix at prediction time.
gb_meta = GradientBoostingRegressor(random_state=RANDOM_STATE, n_estimators=500, learning_rate=0.03, max_depth=2)
gb_meta.fit(oof_preds, y_tr)

# 7) non-negative weighted blend (NNLS) on train
# Model: y ≈ mean(y) + sum_j w_j * (pred_j - mean(pred_j)), with w_j >= 0.
# Both sides are centred before solving. Previously only the target was centred,
# so the uncentred prediction columns (whose mean is roughly mean(y)) could only
# push the blend above mean(y), which drives the non-negative weights toward
# zero unless the target happens to be zero-mean.
y_mean = y_tr.mean()
oof_center = oof_preds.mean(axis=0)  # per-model means, taken from TRAIN OOF only
w_nnls, _ = nnls(oof_preds - oof_center, (y_tr - y_mean).to_numpy())
blend_tr = y_mean + (oof_preds - oof_center) @ w_nnls
# Test predictions are centred with the train-derived means, so no test
# statistics enter the blend.
blend_te = y_mean + (te_preds - oof_center) @ w_nnls

# 8) evaluate on TEST
def eval_and_print(name, y_true, y_hat):
    """Print one aligned metrics line and return (name, R2, MAE, RMSE).

    `name` is left-padded to 28 characters so successive calls line up.
    """
    abs_errors = np.abs(y_true - y_hat)
    row = (
        name,
        r2_score(y_true, y_hat),
        float(np.mean(abs_errors)),
        rmse(y_true, y_hat),
    )
    print("{:28s}  R2={:.4f}  MAE={:.4f}  RMSE={:.4f}".format(*row))
    return row

# Collect one (name, R2, MAE, RMSE) tuple per model, all measured on the held-out test split.
results = []
# individual tuned bases
for name, model in base_models:
    # Refit on the full training split first: the OOF loop left each model
    # fitted on only its last fold.
    y_hat = model.fit(X_tr_imp, y_tr).predict(X_te_imp)
    results.append(eval_and_print(f"BASE {name}", y_te, y_hat))

# stackers
# Linear metas take the standardized matrix; the GB meta takes the raw one,
# matching how each was trained.
results.append(eval_and_print("Stack Ridge meta", y_te, ridge_meta.predict(Z_te)))
results.append(eval_and_print("Stack ElasticNet meta", y_te, elas_meta.predict(Z_te)))
results.append(eval_and_print("Stack GB meta", y_te, gb_meta.predict(te_preds)))
results.append(eval_and_print("NNLS non-neg blend", y_te, blend_te))

# simple average of all bases
avg_te = te_preds.mean(axis=1)
results.append(eval_and_print("Simple average blend", y_te, avg_te))

# summary table
summary = pd.DataFrame(results, columns=["Model","R2","MAE","RMSE"]).sort_values("R2", ascending=False)
display(summary.round(4))


In [None]:
# ===========================
# Final: Stack Ridge Meta on RB Grade
# ===========================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# ---------- config ----------
# Input sheet, seed, hold-out fraction and parallelism for this cell.
CSV_PATH = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
RANDOM_STATE = 42
TEST_SIZE = 0.20
N_JOBS = -1

# Canonical feature name -> acceptable CSV headers (matched by find_col,
# case-insensitively and ignoring whitespace; first alias that matches wins).
# NOTE(review): "Rec Yards" is not among its own aliases — a column with
# exactly that header would be skipped; confirm intent.
ALIASES = {
    "DOM++": ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time": ["40 Time","Forty","40"],
    "BMI": ["BMI"],
    "YPC": ["YPC","Yards per Carry","Yards/Carry"],
    "ELU": ["ELU","Elusiveness"],
    "YCO/A": ["YCO/A","YAC/A","Yards After Contact / Att"],
    "Break%": ["Break%","Breakaway %"],
    "Draft Capital":["Draft Capital","Draft Cap","Draft Round","Round"],
    "Bama": ["Bama","Bama Rating"],
    "Rec Yards":["Receiving Yards","Rec Yds"]
}
# Acceptable headers for the regression target column.
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]

# ---------- helpers ----------
def find_col(frame, candidates):
    """Return the first column of `frame` that matches one of `candidates`.

    Names are compared case-insensitively with all whitespace removed.
    Candidates are tried in order; the original column label is returned,
    or None when nothing matches.
    """
    def _squash(label):
        return re.sub(r"\s+","",label).lower()

    # normalised label -> original label (a later duplicate overwrites an earlier one)
    lookup = {}
    for column in frame.columns:
        lookup[_squash(column)] = column

    matches = (lookup[_squash(cand)] for cand in candidates if _squash(cand) in lookup)
    return next(matches, None)

def to_num(series):
    """Coerce a text column to numbers.

    Values are stringified and stripped, percent signs are removed, and every
    remaining character that is not a digit, dot or minus sign is dropped.
    Anything that still fails to parse becomes NaN.
    """
    text = series.astype(str).str.strip()
    without_pct = text.str.replace('%','',regex=False)
    digits_only = without_pct.str.replace(r'[^0-9\.\-]','',regex=True)
    return pd.to_numeric(digits_only, errors='coerce')

def load_and_map(csv_path: Path):
    """Load the CSV and return (X, y) restricted to rows with a parsable target.

    Feature columns are located through ALIASES and renamed to their canonical
    keys; all values are converted with to_num. Alias groups with no matching
    column are omitted from X.

    Raises:
        ValueError: if none of TARGET_CANDS matches a column. Without this
            check the lookup below would be `df[None]` and fail with an
            uninformative KeyError.
    """
    df = pd.read_csv(csv_path)
    df.columns = [c.strip() for c in df.columns]
    y_col = find_col(df, TARGET_CANDS)
    if y_col is None:
        raise ValueError("RB Grade column not found.")
    mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
    mapped = {k:v for k,v in mapped.items() if v}
    X = pd.DataFrame({feat: to_num(df[col]) for feat,col in mapped.items()})
    y = to_num(df[y_col])
    # keep only rows whose target parsed to a number; reindex both from 0
    mask = y.notna()
    return X.loc[mask].reset_index(drop=True), y.loc[mask].reset_index(drop=True)

def add_features(X):
    """Return a copy of X with the "lower is better" columns negated.

    Only "40 Time" and "Draft Capital" are flipped, and only if present.
    """
    flipped = X.copy()
    lower_is_better = [c for c in ("40 Time", "Draft Capital") if c in flipped.columns]
    for col in lower_is_better:
        flipped[col] = -flipped[col]
    return flipped

def rmse(y_true,y_pred):
    """Root-mean-squared error between two aligned arrays/Series."""
    residual = y_true - y_pred
    mean_sq = (residual ** 2).mean()
    return np.sqrt(mean_sq)

# ---------- load ----------
# X0: canonical numeric features; X: same with "lower is better" columns negated.
X0,y = load_and_map(CSV_PATH)
X = add_features(X0)

# split
# Single seeded hold-out split; the test rows are not used for tuning below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# impute
# Medians are learned on the training rows only and applied to both splits.
imp = SimpleImputer(strategy="median")
X_train_imp = imp.fit_transform(X_train)
X_test_imp  = imp.transform(X_test)

# ---------- base models ----------
# Each base learner is picked by a small random search (5 draws, R2, cv=3)
# on the training rows.
# NOTE(review): an integer `cv` gives unshuffled folds for regressors; if the
# CSV rows are ordered (e.g. by grade or year) that may bias the search — confirm.
gb = GradientBoostingRegressor(random_state=RANDOM_STATE)
gb_param = {
    "n_estimators":[800,1200],
    "learning_rate":[0.03,0.05],
    "max_depth":[3,4],
    "subsample":[0.8,1.0]
}
gb_best = RandomizedSearchCV(gb, gb_param, n_iter=5, scoring="r2",
                             cv=3, n_jobs=N_JOBS, random_state=RANDOM_STATE).fit(X_train_imp,y_train).best_estimator_

rf = RandomForestRegressor(random_state=RANDOM_STATE)
rf_param = {
    "n_estimators":[800,1200],
    "max_depth":[None,12,16],
    "min_samples_split":[2,5],
    "min_samples_leaf":[1,2]
}
rf_best = RandomizedSearchCV(rf, rf_param, n_iter=5, scoring="r2",
                             cv=3, n_jobs=N_JOBS, random_state=RANDOM_STATE).fit(X_train_imp,y_train).best_estimator_

# (label, estimator) pairs consumed by the stacking loop.
base_models = [("gb", gb_best), ("rf", rf_best)]

# ---------- OOF stacking ----------
# Build meta-features: each training row is predicted by a fit that did not see
# it; test rows get the mean of the five per-fold predictions.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
oof_preds = np.zeros((len(X_train_imp), len(base_models)))
test_preds = np.zeros((len(X_test_imp), len(base_models)))

for m_idx, (name, model) in enumerate(base_models):
    fold_preds = []  # test predictions from each fold's fit
    oof = np.zeros(len(X_train_imp))
    for tr_idx, val_idx in kf.split(X_train_imp):
        # Refits the tuned estimator in place on this fold's training rows.
        model.fit(X_train_imp[tr_idx], y_train.iloc[tr_idx])
        oof[val_idx] = model.predict(X_train_imp[val_idx])
        fold_preds.append(model.predict(X_test_imp))
    oof_preds[:,m_idx] = oof
    test_preds[:,m_idx] = np.mean(fold_preds, axis=0)

# Standardize the meta-features with statistics from the train OOF matrix only.
sc_meta = StandardScaler()
Z_train = sc_meta.fit_transform(oof_preds)
Z_test  = sc_meta.transform(test_preds)

# Ridge meta-learner; alpha is picked by RidgeCV from a log-spaced grid.
ridge_meta = RidgeCV(alphas=np.logspace(-3,2,30)).fit(Z_train,y_train)
y_pred = ridge_meta.predict(Z_test)

# ---------- evaluate ----------
# Hold-out metrics for the stacked prediction.
print("\n=== Stack Ridge Meta Results on Test Set ===")
print("R²:", round(r2_score(y_test,y_pred),4))
print("MAE:", round(mean_absolute_error(y_test,y_pred),4))
print("RMSE:", round(rmse(y_test,y_pred),4))

# show predictions vs actuals
# Rows follow the order of the test split; no player identifier is carried here.
results = pd.DataFrame({
    "Actual_RB_Grade": y_test.values,
    "Predicted_RB_Grade": y_pred
})
print("\nSample predictions:")
print(results.head(15).round(3))


In [None]:
# ===========================
# Stack Ridge Meta — 80/20 split, year-aware features, names in output
# Source CSV: ./data/Bakery/RB/Bakery_RB_Overall.csv
# Output CSV: ./data/Bakery/_derived/stack_ridge_predictions.csv
# ===========================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# ---------- config ----------
# Input sheet, output location for test-set predictions, seed, hold-out
# fraction and parallelism.
CSV_PATH   = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
OUT_PATH   = Path("./data/Bakery/_derived/stack_ridge_predictions.csv")
RANDOM_SEED = 42
TEST_SIZE   = 0.20
N_JOBS      = -1

# canonical feature names (NO Breakout Age)
# Each key maps to the CSV headers accepted for it; find_col matches them
# case-insensitively, ignoring whitespace, first alias wins.
# NOTE(review): "Rec Yards" is not among its own aliases — confirm intent.
ALIASES = {
    "DOM++":        ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":      ["40 Time","Forty","40"],
    "BMI":          ["BMI"],
    "YPC":          ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":          ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":        ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":       ["Break%","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital":["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":         ["Bama","Bama Rating","BamaAdj"],
    # optional, used if present
    "Shuttle":      ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":   ["3 Cone","Three Cone","3-Cone"],
    "Rec Yards":    ["Receiving Yards","Rec Yds","RecYds"],
}
# Accepted headers for the target, the player-name column and the year column.
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]
NAME_CANDS   = ["Player","Player Name","Name"]
YEAR_CANDS   = ["Year","Draft Year","Class Year","class_year","Draft Class","DraftClass"]

# ---------- helpers ----------
def find_col(frame, candidates):
    """Return the first column of `frame` that matches one of `candidates`.

    Names are compared case-insensitively with all whitespace removed.
    Candidates are tried in order; the original column label is returned,
    or None when nothing matches.
    """
    def _squash(label):
        return re.sub(r"\s+","",label).lower()

    # normalised label -> original label (a later duplicate overwrites an earlier one)
    lookup = {}
    for column in frame.columns:
        lookup[_squash(column)] = column

    matches = (lookup[_squash(cand)] for cand in candidates if _squash(cand) in lookup)
    return next(matches, None)

def to_num(series):
    """Coerce a column of messy text to numbers.

    Values are stringified and stripped, then cleaned in a fixed order: percent
    signs, the word "round", a leading "r", a trailing ordinal suffix, commas,
    and finally anything that is not a digit, dot or minus sign. Whatever still
    fails to parse becomes NaN.
    """
    # (pattern, is_regex) pairs, applied top to bottom; order matters.
    cleanup_steps = (
        ('%', False),
        (r'(?i)round\s*', True),
        (r'(?i)^r\s*', True),
        (r'(?i)(st|nd|rd|th)$', True),
        (',', False),
        (r'[^0-9\.\-]', True),
    )
    text = series.astype(str).str.strip()
    for pattern, is_regex in cleanup_steps:
        text = text.str.replace(pattern, '', regex=is_regex)
    return pd.to_numeric(text, errors='coerce')

def rmse(y_true, y_pred):
    """Root-mean-squared error as a Python float (square root of sklearn's MSE)."""
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))

def load_sheet(path: Path):
    """Read the RB sheet and return (X, y, names, years) for rows with a target.

    Returns:
        X: DataFrame of canonical numeric features (alias groups that matched).
        y: numeric target Series.
        names: Series of player names as strings; missing names become "".
            If the sheet has no name column, every name is "".
        years: Series of years, or None when the sheet has no year column.
            Integer dtype when every kept row has a year, otherwise float
            with NaN for the missing ones.
        All four are restricted to rows whose target parsed and are reindexed
        from 0 so they stay positionally aligned.

    Raises:
        ValueError: if no target column is found.
    """
    df = pd.read_csv(path)
    df.columns = [c.strip() for c in df.columns]
    y_col   = find_col(df, TARGET_CANDS)
    if y_col is None:
        raise ValueError("RB Grade column not found.")
    # find_col already matches "Player" case-insensitively, so falling back to
    # the literal "Player" could only ever raise a KeyError; keep None instead.
    name_c  = find_col(df, NAME_CANDS)   # can be None
    year_c  = find_col(df, YEAR_CANDS)   # can be None

    mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
    mapped = {k:v for k,v in mapped.items() if v is not None}

    X = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
    y = to_num(df[y_col])

    # drop rows with missing target
    mask = y.notna()

    if name_c is not None:
        # fill BEFORE stringifying: after astype(str) a missing value is the
        # text "nan" and fillna has nothing left to fill
        names = df[name_c].fillna("").astype(str)
    else:
        names = pd.Series("", index=df.index)

    years = None
    if year_c is not None:
        # Convert only the rows that are kept, and tolerate blanks, so a missing
        # year on one row does not abort the whole load.
        years = pd.to_numeric(df.loc[mask, year_c], errors="coerce")
        if years.notna().all():
            years = years.astype(int)
        years = years.reset_index(drop=True)

    return X.loc[mask].reset_index(drop=True), y.loc[mask].reset_index(drop=True), \
           names.loc[mask].reset_index(drop=True), \
           years

def basic_interactions(X):
    """Return a copy of X with sign flips, four products and one clipped ratio.

    "Lower is better" columns are negated first, so the products that involve
    them use the flipped values. Each derived column is added only when both
    of its source columns exist.
    """
    frame = X.copy()

    # invert "lower is better" BEFORE interactions
    for col in ("40 Time", "Draft Capital", "Shuttle", "Three Cone"):
        if col in frame.columns:
            frame[col] = -frame[col]

    # light interactions
    products = (
        ("BMI", "40 Time", "BMIx40"),
        ("ELU", "YCO/A", "ELUxYCOA"),
        ("DOM++", "Draft Capital", "DOMxDraft"),
        ("YPC", "ELU", "YPCxELU"),
    )
    for left, right, label in products:
        if left in frame.columns and right in frame.columns:
            frame[label] = frame[left] * frame[right]

    if "YCO/A" in frame.columns and "YPC" in frame.columns:
        # a zero denominator becomes NaN rather than +/-inf
        safe_ypc = frame["YPC"].replace(0, np.nan)
        frame["YCOA_to_YPC"] = (frame["YCO/A"] / safe_ypc).clip(-10, 10)
    return frame

def add_year_z(train_df, test_df, year_series, cols_for_z):
    """
    Append per-year z-score columns ("<col>_yrz") to both frames.

    The per-year mean and std come from `train_df` only and are applied to
    train and test alike. `year_series` is looked up by each frame's index
    labels. A z-score is 0.0 wherever it cannot be computed: missing value,
    a year absent from the training rows, or a zero/undefined std.

    Returns the two inputs untouched when `year_series` is None.
    """
    if year_series is None:
        return train_df, test_df  # no year information available

    # Per-year statistics, estimated on the training rows only.
    fit_frame = train_df.copy()
    fit_frame["__YR__"] = year_series.loc[fit_frame.index].values
    yearly = fit_frame.groupby("__YR__")[cols_for_z].agg(["mean","std"])

    def zify(df):
        scored = df.assign(**{"__YR__": year_series.loc[df.index].values})
        row_year = scored["__YR__"]
        for col in cols_for_z:
            center = row_year.map(yearly[(col,"mean")])
            spread = row_year.map(yearly[(col,"std")]).replace(0,np.nan)
            z_score = (scored[col]-center)/spread
            scored[col+"_yrz"] = z_score.fillna(0.0).values
        # the helper year column must not leak into the feature matrix
        return scored.drop(columns="__YR__")

    return zify(train_df), zify(test_df)

# ---------- pipeline ----------
# 1) load
X0, y, names, years = load_sheet(CSV_PATH)

# 2) interactions first
X1 = basic_interactions(X0)

# 3) split (keep names/years aligned)
# Split row positions rather than the objects themselves: `years` is None when
# the sheet has no year column, and train_test_split cannot index None. For a
# given seed the partition depends only on the number of rows, so the rows
# selected are the same as when splitting the frames directly.
idx_tr, idx_te = train_test_split(
    np.arange(len(X1)), test_size=TEST_SIZE, random_state=RANDOM_SEED
)
X_tr_raw, X_te_raw = X1.iloc[idx_tr], X1.iloc[idx_te]
y_tr, y_te = y.iloc[idx_tr], y.iloc[idx_te]
names_tr, names_te = names.iloc[idx_tr], names.iloc[idx_te]
years_tr = years.iloc[idx_tr] if years is not None else None
years_te = years.iloc[idx_te] if years is not None else None

# 4) year-aware z features (train-only stats applied to test)
# add_year_z looks years up by index label, so it is given the full series;
# it returns both frames unchanged when there is no year column.
cols_for_z = [c for c in ["DOM++","40 Time","BMI","YPC","ELU","YCO/A","Break%","Draft Capital","Bama"] if c in X_tr_raw.columns]
X_tr_raw, X_te_raw = add_year_z(X_tr_raw, X_te_raw, pd.Series(years) if years is not None else None, cols_for_z)

# 5) impute (median) — fit on train only
imp = SimpleImputer(strategy="median")
X_tr_imp = imp.fit_transform(X_tr_raw)
X_te_imp = imp.transform(X_te_raw)

# 6) base models (light tuning to keep fast/reproducible)
# Each learner is chosen by a 6-draw random search scored by R2 with cv=3,
# fitted on the imputed training rows only.
# NOTE(review): an integer `cv` gives unshuffled folds for regressors; confirm
# the row order of the sheet does not make the folds unrepresentative.
gb = GradientBoostingRegressor(random_state=RANDOM_SEED)
gb_param = {
    "n_estimators":[800,1200],
    "learning_rate":[0.03,0.05],
    "max_depth":[3,4],
    "subsample":[0.8,1.0],
    "min_samples_split":[2,5],
    "min_samples_leaf":[1,2],
}
gb_best = RandomizedSearchCV(gb, gb_param, n_iter=6, scoring="r2", cv=3, n_jobs=N_JOBS,
                             random_state=RANDOM_SEED).fit(X_tr_imp, y_tr).best_estimator_

rf = RandomForestRegressor(random_state=RANDOM_SEED)
rf_param = {
    "n_estimators":[800,1200],
    "max_depth":[None,12,16],
    "min_samples_split":[2,5],
    "min_samples_leaf":[1,2],
    "max_features":["sqrt", 0.8, 1.0],
}
rf_best = RandomizedSearchCV(rf, rf_param, n_iter=6, scoring="r2", cv=3, n_jobs=N_JOBS,
                             random_state=RANDOM_SEED).fit(X_tr_imp, y_tr).best_estimator_

# (label, estimator) pairs consumed by the stacking loop.
base_models = [("gb", gb_best), ("rf", rf_best)]

# 7) OOF stacking (leakage-free)
# oof[:, j]: prediction for each training row from a fit that excluded it.
# te[:, j]:  mean of the five per-fold predictions for each test row.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
oof = np.zeros((len(X_tr_imp), len(base_models)))
te  = np.zeros((len(X_te_imp), len(base_models)))
for j,(nm, mdl) in enumerate(base_models):
    preds_te_folds = []
    fold_oof = np.zeros(len(X_tr_imp))
    for tr_idx, val_idx in kf.split(X_tr_imp):
        # Refits the tuned estimator in place; y_tr is indexed positionally
        # because X_tr_imp is a plain array.
        mdl.fit(X_tr_imp[tr_idx], y_tr.iloc[tr_idx])
        fold_oof[val_idx] = mdl.predict(X_tr_imp[val_idx])
        preds_te_folds.append(mdl.predict(X_te_imp))
    oof[:,j] = fold_oof
    te[:,j]  = np.mean(preds_te_folds, axis=0)

# 8) meta learner (RidgeCV on standardized OOF)
# Scaler statistics come from the train OOF matrix and are reused for test.
sc_meta = StandardScaler()
Z_tr = sc_meta.fit_transform(oof)
Z_te = sc_meta.transform(te)

ridge_meta = RidgeCV(alphas=np.logspace(-3, 2, 30)).fit(Z_tr, y_tr)
y_hat = ridge_meta.predict(Z_te)

# 9) metrics + output
# Hold-out metrics for the stacked prediction.
r2   = r2_score(y_te, y_hat)
mae  = mean_absolute_error(y_te, y_hat)
rmse_val = rmse(y_te, y_hat)

print("\n=== Stack Ridge Meta — Test Results ===")
print(f"R²:   {r2:.4f}")
print(f"MAE:  {mae:.4f}")
print(f"RMSE: {rmse_val:.4f}")

# One row per test player; Error is predicted minus actual, so a positive
# value means the model over-predicted.
pred_df = pd.DataFrame({
    "Player": names_te.values,
    "Actual_RB_Grade": y_te.values,
    "Predicted_RB_Grade": y_hat,
    "Error": (y_hat - y_te.values)
}).sort_values("Actual_RB_Grade", ascending=False)

print("\nTop 15 by Actual RB Grade (test set):")
print(pred_df.head(15).round(3))

# Create the output folder if needed, then write the table (overwrites an existing file).
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
pred_df.to_csv(OUT_PATH, index=False)
print(f"\nSaved predictions → {OUT_PATH}")


In [None]:
# ===========================
# Apply Stack Ridge Meta to entire dataset
# Train on ALL rows, predict for ALL rows
# ===========================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# ---------- config ----------
# Input sheet, output file for the all-player predictions, seed and parallelism.
CSV_PATH   = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
OUT_PATH   = Path("./data/Bakery/_derived/stack_ridge_all_players.csv")
RANDOM_SEED = 42
N_JOBS      = -1

# canonical features (no Breakout Age)
# Each key maps to the CSV headers accepted for it; find_col matches them
# case-insensitively, ignoring whitespace, first alias wins.
# NOTE(review): "Rec Yards" is not among its own aliases — confirm intent.
ALIASES = {
    "DOM++":        ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":      ["40 Time","Forty","40"],
    "BMI":          ["BMI"],
    "YPC":          ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":          ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":        ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":       ["Break%","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital":["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":         ["Bama","Bama Rating","BamaAdj"],
    "Shuttle":      ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":   ["3 Cone","Three Cone","3-Cone"],
    "Rec Yards":    ["Receiving Yards","Rec Yds","RecYds"],
}
# Accepted headers for the target and the player-name column.
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]
NAME_CANDS   = ["Player","Player Name","Name"]

# ---------- helpers ----------
def find_col(frame, candidates):
    """Return the first column of `frame` that matches one of `candidates`.

    Names are compared case-insensitively with all whitespace removed.
    Candidates are tried in order; the original column label is returned,
    or None when nothing matches.
    """
    def _squash(label):
        return re.sub(r"\s+","",label).lower()

    # normalised label -> original label (a later duplicate overwrites an earlier one)
    lookup = {}
    for column in frame.columns:
        lookup[_squash(column)] = column

    matches = (lookup[_squash(cand)] for cand in candidates if _squash(cand) in lookup)
    return next(matches, None)

def to_num(series):
    """Coerce a column of messy text to numbers.

    Values are stringified and stripped, then cleaned in a fixed order: percent
    signs, the word "round", a leading "r", a trailing ordinal suffix, commas,
    and finally anything that is not a digit, dot or minus sign. Whatever still
    fails to parse becomes NaN.
    """
    # (pattern, is_regex) pairs, applied top to bottom; order matters.
    cleanup_steps = (
        ('%', False),
        (r'(?i)round\s*', True),
        (r'(?i)^r\s*', True),
        (r'(?i)(st|nd|rd|th)$', True),
        (',', False),
        (r'[^0-9\.\-]', True),
    )
    text = series.astype(str).str.strip()
    for pattern, is_regex in cleanup_steps:
        text = text.str.replace(pattern, '', regex=is_regex)
    return pd.to_numeric(text, errors='coerce')

def basic_interactions(X):
    """Return a copy of X with "lower is better" columns negated and four products added.

    The sign flip happens first, so products involving "40 Time" or
    "Draft Capital" use the negated values. A product is added only when both
    of its source columns exist.
    """
    frame = X.copy()
    for col in ("40 Time", "Draft Capital", "Shuttle", "Three Cone"):
        if col in frame.columns:
            frame[col] = -frame[col]

    products = (
        ("BMI", "40 Time", "BMIx40"),
        ("ELU", "YCO/A", "ELUxYCOA"),
        ("DOM++", "Draft Capital", "DOMxDraft"),
        ("YPC", "ELU", "YPCxELU"),
    )
    for left, right, label in products:
        if left in frame and right in frame:
            frame[label] = frame[left]*frame[right]
    return frame

# ---------- load ----------
df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]

y_col = find_col(df, TARGET_CANDS)
if y_col is None:
    # fail early with a clear message instead of a KeyError on df[None]
    raise ValueError("RB Grade column not found.")
# find_col already matches "Player" case-insensitively, so a literal "Player"
# fallback could only raise a KeyError; keep None and use blank names instead.
name_col = find_col(df, NAME_CANDS)

mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
mapped = {k:v for k,v in mapped.items() if v is not None}

X = pd.DataFrame({feat: to_num(df[col]) for feat,col in mapped.items()})
y = to_num(df[y_col])
if name_col is not None:
    # fill BEFORE stringifying: after astype(str) a missing value is the text
    # "nan" and fillna has nothing left to fill
    names = df[name_col].fillna("").astype(str)
else:
    names = pd.Series("", index=df.index)

# keep only rows whose target parsed to a number
mask = y.notna()
X, y, names = X[mask], y[mask], names[mask]

# interactions
X = basic_interactions(X)

# impute
# Medians come from every kept row: this cell trains on the whole dataset.
imp = SimpleImputer(strategy="median")
X_imp = imp.fit_transform(X)

# ---------- train base models (with light tuning) ----------
# Every parameter list has a single value and n_iter=1, so each "search" just
# fits that one fixed configuration (with 3-fold CV scoring) on all rows.
gb = GradientBoostingRegressor(random_state=RANDOM_SEED)
gb_param = {"n_estimators":[800],"learning_rate":[0.05],"max_depth":[4],"subsample":[0.85]}
gb_best = RandomizedSearchCV(gb, gb_param, n_iter=1, scoring="r2", cv=3, n_jobs=N_JOBS).fit(X_imp,y).best_estimator_

rf = RandomForestRegressor(random_state=RANDOM_SEED)
rf_param = {"n_estimators":[800],"max_depth":[12],"min_samples_split":[2],"min_samples_leaf":[1]}
rf_best = RandomizedSearchCV(rf, rf_param, n_iter=1, scoring="r2", cv=3, n_jobs=N_JOBS).fit(X_imp,y).best_estimator_

# (label, estimator) pairs consumed by the stacking loop.
base_models = [("gb", gb_best), ("rf", rf_best)]

# ---------- stacking with OOF ----------
# oof[:, j]: prediction for every row from a fit that excluded that row.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
oof = np.zeros((len(X_imp), len(base_models)))
for j,(nm,mdl) in enumerate(base_models):
    fold_preds = np.zeros(len(X_imp))
    for tr_idx,val_idx in kf.split(X_imp):
        # y is indexed positionally because X_imp is a plain array
        mdl.fit(X_imp[tr_idx], y.iloc[tr_idx])
        fold_preds[val_idx] = mdl.predict(X_imp[val_idx])
    oof[:,j] = fold_preds

# meta learner (Ridge on OOF)
sc_meta = StandardScaler()
Z = sc_meta.fit_transform(oof)
ridge_meta = RidgeCV(alphas=np.logspace(-3,2,30)).fit(Z,y)

# retrain base models on full data
# NOTE(review): the predictions below are made on the same rows the base
# models were just fitted on (in-sample), while the scaler and Ridge weights
# were learned from out-of-fold predictions. The resulting y_hat is therefore
# not an out-of-sample estimate for these players — confirm this is intended.
full_preds = []
for nm,mdl in base_models:
    mdl.fit(X_imp,y)
    full_preds.append(mdl.predict(X_imp))
# rows = players, columns = base models (same column order as `oof`)
stack_inputs = np.vstack(full_preds).T
stack_inputs = sc_meta.transform(stack_inputs)

y_hat = ridge_meta.predict(stack_inputs)

# ---------- results ----------
# One row per player with a target; Error is predicted minus actual.
out = pd.DataFrame({
    "Player": names.values,
    "Actual_RB_Grade": y.values,
    "Predicted_RB_Grade": y_hat,
    "Error": y_hat - y.values
}).sort_values("Actual_RB_Grade", ascending=False)

print("\n=== Full Dataset Results (Stack Ridge Meta) ===")
print(out.head(20).round(3))

# Create the output folder if needed, then write the table (overwrites an existing file).
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUT_PATH, index=False)
print(f"\nSaved → {OUT_PATH}")


In [None]:
# ===========================
# Full-dataset application + metrics + feature impact (Stack Ridge Meta)
# ===========================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# ---------- config ----------
# Input sheet and the two derived outputs (per-player predictions, feature impact).
CSV_PATH    = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
OUT_DIR     = Path("./data/Bakery/_derived")
PLAYERS_CSV = OUT_DIR / "stack_ridge_all_players.csv"
IMPACT_CSV  = OUT_DIR / "stack_ridge_feature_impact.csv"

RANDOM_SEED = 42
N_JOBS      = -1
N_SPLITS    = 5  # for OOF stacking

# canonical features (no Breakout Age)
# Each key maps to the CSV headers accepted for it; find_col matches them
# case-insensitively, ignoring whitespace, first alias wins.
# NOTE(review): "Rec Yards" is not among its own aliases — confirm intent.
ALIASES = {
    "DOM++":        ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":      ["40 Time","Forty","40"],
    "BMI":          ["BMI"],
    "YPC":          ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":          ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":        ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":       ["Break%","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital":["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":         ["Bama","Bama Rating","BamaAdj"],
    "Shuttle":      ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":   ["3 Cone","Three Cone","3-Cone"],
    "Rec Yards":    ["Receiving Yards","Rec Yds","RecYds"],
}
# Accepted headers for the target and the player-name column.
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]
NAME_CANDS   = ["Player","Player Name","Name"]

# ---------- helpers ----------
def find_col(frame, candidates):
    """Return the first column of `frame` that matches one of `candidates`.

    Names are compared case-insensitively with all whitespace removed.
    Candidates are tried in order; the original column label is returned,
    or None when nothing matches.
    """
    def _squash(label):
        return re.sub(r"\s+","",label).lower()

    # normalised label -> original label (a later duplicate overwrites an earlier one)
    lookup = {}
    for column in frame.columns:
        lookup[_squash(column)] = column

    matches = (lookup[_squash(cand)] for cand in candidates if _squash(cand) in lookup)
    return next(matches, None)

def to_num(series):
    """Coerce a column of messy text to numbers.

    Values are stringified and stripped, then cleaned in a fixed order: percent
    signs, the word "round", a leading "r", a trailing ordinal suffix, commas,
    and finally anything that is not a digit, dot or minus sign. Whatever still
    fails to parse becomes NaN.
    """
    # (pattern, is_regex) pairs, applied top to bottom; order matters.
    cleanup_steps = (
        ('%', False),
        (r'(?i)round\s*', True),
        (r'(?i)^r\s*', True),
        (r'(?i)(st|nd|rd|th)$', True),
        (',', False),
        (r'[^0-9\.\-]', True),
    )
    text = series.astype(str).str.strip()
    for pattern, is_regex in cleanup_steps:
        text = text.str.replace(pattern, '', regex=is_regex)
    return pd.to_numeric(text, errors='coerce')

def basic_interactions(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of X with "lower is better" columns negated and four products added.

    The sign flip happens first, so products involving "40 Time" or
    "Draft Capital" use the negated values. A product is added only when both
    of its source columns exist.
    """
    frame = X.copy()

    # invert "lower is better"
    for col in ("40 Time", "Draft Capital", "Shuttle", "Three Cone"):
        if col in frame.columns:
            frame[col] = -frame[col]

    # a few high-signal interactions: (left factor, right factor, new column)
    products = (
        ("BMI", "40 Time", "BMIx40"),
        ("ELU", "YCO/A", "ELUxYCOA"),
        ("DOM++", "Draft Capital", "DOMxDraft"),
        ("YPC", "ELU", "YPCxELU"),
    )
    for left, right, label in products:
        if left in frame and right in frame:
            frame[label] = frame[left] * frame[right]
    return frame

def rmse(y_true, y_pred) -> float:
    """Root-mean-squared error between `y_true` and `y_pred`, as a plain Python float."""
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))

# ---------- load ----------
df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]  # tolerate padded header cells

# Resolve the target / name columns up front so a schema mismatch fails with a
# readable message instead of an opaque KeyError from df[None].
y_col    = find_col(df, TARGET_CANDS)
name_col = find_col(df, NAME_CANDS)
if y_col is None:
    raise KeyError(f"No target column found in {CSV_PATH}; expected one of {TARGET_CANDS}")
if name_col is None:
    raise KeyError(f"No player-name column found in {CSV_PATH}; expected one of {NAME_CANDS}")

# canonical feature name -> actual CSV column, keeping only features that exist
mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
mapped = {k:v for k,v in mapped.items() if v is not None}

X0 = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y  = to_num(df[y_col])
# fillna BEFORE the str cast: casting first turns missing names into the
# literal text "nan", after which fillna has nothing left to fill.
names = df[name_col].fillna("").astype(str)

# drop rows with missing target
mask = y.notna()
X0 = X0.loc[mask].reset_index(drop=True)
y = y.loc[mask].reset_index(drop=True)
names = names.loc[mask].reset_index(drop=True)

# interactions
X = basic_interactions(X0)
# SimpleImputer discards columns that are entirely NaN, which would leave
# X_imp with fewer columns than feature_order; drop them explicitly first.
X = X.dropna(axis=1, how="all")

# impute full matrix (for later refit + predictions)
imp = SimpleImputer(strategy="median")
X_imp = imp.fit_transform(X)
feature_order = list(X.columns)

# ---------- base models (tuned lightly) ----------
# Tune each base model with a small randomized search (6 sampled settings, 3-fold CV,
# scored by R²). NOTE: the search is fit on the full X_imp / y, so hyperparameters
# are chosen with knowledge of every row that is later used for the OOF metrics.
gb = GradientBoostingRegressor(random_state=RANDOM_SEED)
gb_param = { "n_estimators":[800,1200], "learning_rate":[0.03,0.05], "max_depth":[3,4], "subsample":[0.85,1.0],
             "min_samples_split":[2,5], "min_samples_leaf":[1,2] }
gb_best = RandomizedSearchCV(gb, gb_param, n_iter=6, scoring="r2", cv=3, n_jobs=N_JOBS,
                             random_state=RANDOM_SEED).fit(X_imp, y).best_estimator_

rf = RandomForestRegressor(random_state=RANDOM_SEED)
rf_param = { "n_estimators":[800,1200], "max_depth":[None,12,16], "min_samples_split":[2,5], "min_samples_leaf":[1,2],
             "max_features":["sqrt",0.8,1.0] }
rf_best = RandomizedSearchCV(rf, rf_param, n_iter=6, scoring="r2", cv=3, n_jobs=N_JOBS,
                             random_state=RANDOM_SEED).fit(X_imp, y).best_estimator_

# (name, estimator) pairs; the names later label the meta-learner coefficients
base_models = [("gb", gb_best), ("rf", rf_best)]

# ---------- OOF stacking for honest metrics ----------
# Each base model is refit on the training folds and predicts only its held-out
# fold, so every row of `oof` comes from a model that did not train on that row.
# The loop refits gb_best / rf_best in place (mdl is the same object).
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
oof = np.zeros((len(X_imp), len(base_models)))
for j,(nm, mdl) in enumerate(base_models):
    fold_preds = np.zeros(len(X_imp))
    for tr_idx, val_idx in kf.split(X_imp):
        mdl.fit(X_imp[tr_idx], y.iloc[tr_idx])
        fold_preds[val_idx] = mdl.predict(X_imp[val_idx])
    oof[:, j] = fold_preds

# meta on standardized OOF
sc_meta = StandardScaler()
Z = sc_meta.fit_transform(oof)
ridge_meta = RidgeCV(alphas=np.logspace(-3, 2, 30)).fit(Z, y)

# OOF metrics (generalization estimate)
# CAVEAT: ridge_meta is scored on the same Z it was just fit on, and the imputer
# and hyperparameter search both saw all rows, so these numbers lean optimistic.
y_oof_hat = ridge_meta.predict(Z)
oof_r2  = r2_score(y, y_oof_hat)
oof_mae = mean_absolute_error(y, y_oof_hat)
oof_rmse= rmse(y, y_oof_hat)

# ---------- full refit on all data, then predict all rows ----------
# refit bases on ALL data
# After this loop gb_best / rf_best hold the all-rows fit, so any
# feature_importances_ read afterwards describe that fit.
base_preds_full = []
for nm, mdl in base_models:
    mdl.fit(X_imp, y)
    base_preds_full.append(mdl.predict(X_imp))
# one column per base model, in base_models order (same layout as `oof`)
stack_inputs = np.vstack(base_preds_full).T
stack_inputs_std = sc_meta.transform(stack_inputs)  # use same scaler as OOF
y_full_hat = ridge_meta.predict(stack_inputs_std)

# full fit metrics (in-sample)
# The bases predict rows they were trained on, so these measure fit, not generalization.
full_r2  = r2_score(y, y_full_hat)
full_mae = mean_absolute_error(y, y_full_hat)
full_rmse= rmse(y, y_full_hat)

# ---------- export per-player predictions ----------
OUT_DIR.mkdir(parents=True, exist_ok=True)

# One row per player: actual grade, stacked in-sample prediction, signed error.
players_out = pd.DataFrame({
    "Player": names.values,
    "Actual_RB_Grade": y.values,
    "Predicted_RB_Grade": y_full_hat,
})
players_out["Error"] = players_out["Predicted_RB_Grade"] - players_out["Actual_RB_Grade"]
players_out = players_out.sort_values("Actual_RB_Grade", ascending=False)  # best actual grade first
players_out.to_csv(PLAYERS_CSV, index=False)

# ---------- feature impact ----------
# 1) raw importances from bases
gb_imp = pd.Series(gb_best.feature_importances_, index=feature_order)
rf_imp = pd.Series(rf_best.feature_importances_, index=feature_order)

# 2) normalize each to sum to 1 (avoid bias from scale)
gb_imp_n = gb_imp / (gb_imp.sum() + 1e-12)
rf_imp_n = rf_imp / (rf_imp.sum() + 1e-12)

# 3) meta weights from Ridge on standardized base predictions
meta_coef = pd.Series(ridge_meta.coef_, index=[nm for nm,_ in base_models])
# Allow signs but normalize by L1 to represent relative influence
meta_w = meta_coef / (meta_coef.abs().sum() + 1e-12)

# 4) combine: weighted sum of normalized importances
# (Only GB and RF here; extend if you add more bases)
combined = meta_w.get("gb",0.0)*gb_imp_n + meta_w.get("rf",0.0)*rf_imp_n
impact_df = pd.DataFrame({
    "Impact_MetaWeighted": combined,
    "GB_Importance": gb_imp_n,
    "RF_Importance": rf_imp_n,
}).sort_values("Impact_MetaWeighted", ascending=False)
# The feature names live in the index. Writing with index=False produced a CSV
# of unlabeled numbers, so export the index as a "Feature" column instead.
impact_df.to_csv(IMPACT_CSV, index_label="Feature")

# ---------- print summary ----------
# Metric and path lines are gathered first and emitted in one write;
# the text is the same as printing them one by one.
summary_lines = [
    "\n=== Stack Ridge Meta — Metrics ===",
    f"OOF R²:   {oof_r2:.4f}   (generalization estimate)",
    f"OOF MAE:  {oof_mae:.4f}",
    f"OOF RMSE: {oof_rmse:.4f}",
    f"\nFull Fit R²:   {full_r2:.4f}   (in-sample)",
    f"Full Fit MAE:  {full_mae:.4f}",
    f"Full Fit RMSE: {full_rmse:.4f}",
    f"\nSaved per-player predictions → {PLAYERS_CSV}",
    f"Saved feature impact table  → {IMPACT_CSV}",
]
print("\n".join(summary_lines))

# Show top 15 players + top 15 features
top_players = players_out.head(15).round(3)
print("\nTop players by Actual RB Grade:")
print(top_players)

top_features = impact_df.head(15).round(4)
print("\nTop features by meta-weighted impact:")
print(top_features)


In [None]:
# ===========================
# Ablation: does DOMxDraft add value?
# Trains Stack Ridge Meta twice on ALL rows:
#   (A) with interactions: ["BMIx40","ELUxYCOA","DOMxDraft","YPCxELU"]
#   (B) same but WITHOUT "DOMxDraft"
# Saves metrics, per-player predictions, and feature impacts for both runs.
# ===========================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# ---------- config ----------
CSV_PATH    = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")  # input table, relative to the working directory
OUT_DIR     = Path("./data/Bakery/_derived")                  # all CSV outputs are written here
RANDOM_SEED = 42   # shared by the base models, the hyperparameter search and KFold
N_JOBS      = -1   # passed as n_jobs to RandomizedSearchCV
N_SPLITS    = 5    # folds for OOF stacking

# Canonical feature name -> accepted CSV header spellings (matched ignoring case
# and whitespace, first hit wins). Every list starts with the canonical name.
ALIASES = {
    "DOM++":        ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":      ["40 Time","Forty","40"],
    "BMI":          ["BMI"],
    "YPC":          ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":          ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":        ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":       ["Break%","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital":["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":         ["Bama","Bama Rating","BamaAdj"],
    "Shuttle":      ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":   ["3 Cone","Three Cone","3-Cone"],
    # "Rec Yards" itself was missing, so a column with exactly that header was skipped
    "Rec Yards":    ["Rec Yards","Receiving Yards","Rec Yds","RecYds"],
}
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]   # accepted headers for the target
NAME_CANDS   = ["Player","Player Name","Name"]     # accepted headers for the player name

# ---------- helpers ----------
def find_col(frame, candidates):
    """Return the first column of `frame` that matches one of `candidates`.

    Matching ignores letter case and every whitespace character. Candidates
    are tried in the order given; None is returned when none of them match.
    """
    def _squash(label):
        # canonical comparison form: whitespace removed, lower-cased
        return re.sub(r"\s+","",label).lower()

    lookup = {}
    for column in frame.columns:
        lookup[_squash(column)] = column  # a later duplicate key replaces an earlier one
    hits = (lookup[_squash(cand)] for cand in candidates if _squash(cand) in lookup)
    return next(hits, None)

def to_num(series):
    """Coerce a column of messy values to numbers; unparseable entries become NaN.

    Text cleanup removes '%', the word "round", a leading "R", a trailing
    ordinal suffix (st/nd/rd/th), thousands commas, and finally every character
    that is not a digit, '.' or '-'.

    Args:
        series: pandas Series of strings and/or numbers.

    Returns:
        Numeric pandas Series with the same index as `series`.
    """
    # Already-numeric data needs no text cleanup. Round-tripping it through str
    # renders very small/large floats as e.g. "1e-05", which the final character
    # filter would mangle into "1-05" and silently turn into NaN.
    if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
        nums = pd.to_numeric(series, errors='coerce')
        if pd.api.types.is_float_dtype(nums):
            # the text path maps +/-inf to NaN; keep that for numeric input too
            nums = nums.mask(nums.isin([float("inf"), float("-inf")]))
        return nums
    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def rmse(y_true,y_pred):
    """Root-mean-squared error between `y_true` and `y_pred`, as a plain Python float."""
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))

def load_base_table(path: Path):
    """Load the CSV at `path` and split it into features, target and player names.

    Column headers are stripped, then resolved through ALIASES / TARGET_CANDS /
    NAME_CANDS with `find_col`. Features and target are parsed with `to_num`.
    Rows whose target is missing or unparseable are dropped, and the three
    results are re-indexed 0..n-1 so they stay positionally aligned.

    Args:
        path: location of the CSV file.

    Returns:
        (X, y, names): feature DataFrame keyed by canonical feature name (only
        features found in the file), numeric target Series, and player-name
        Series of str.

    Raises:
        KeyError: if no target column or no player-name column can be found.
    """
    df = pd.read_csv(path)
    df.columns = [c.strip() for c in df.columns]
    y_col = find_col(df, TARGET_CANDS)
    if y_col is None:
        # previously surfaced as an opaque KeyError from df[None]
        raise KeyError(f"No target column found in {path}; expected one of {TARGET_CANDS}")
    name_col = find_col(df, NAME_CANDS)
    if name_col is None:
        raise KeyError(f"No player-name column found in {path}; expected one of {NAME_CANDS}")
    mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
    mapped = {k:v for k,v in mapped.items() if v is not None}

    X = pd.DataFrame({feat: to_num(df[col]) for feat,col in mapped.items()})
    y = to_num(df[y_col])
    # fillna BEFORE the str cast: casting first turns missing names into the
    # literal text "nan", after which fillna has nothing left to fill.
    names = df[name_col].fillna("").astype(str)
    mask = y.notna()
    return X.loc[mask].reset_index(drop=True), y.loc[mask].reset_index(drop=True), names.loc[mask].reset_index(drop=True)

def add_interactions(X: pd.DataFrame, interactions: list) -> pd.DataFrame:
    """Return a copy of `X` with lower-is-better columns negated plus the requested products.

    `interactions` names which product columns to build ("BMIx40", "ELUxYCOA",
    "DOMxDraft", "YPCxELU"). A product is added only when it is requested and
    both of its input columns exist; it uses the already-negated values.
    """
    out = X.copy()

    # invert "lower is better" BEFORE interactions
    for col in ("40 Time", "Draft Capital", "Shuttle", "Three Cone"):
        if col in out.columns:
            out[col] = -out[col]

    # (new column, left factor, right factor) in the fixed order columns are appended
    recipes = (
        ("BMIx40",    "BMI",   "40 Time"),
        ("ELUxYCOA",  "ELU",   "YCO/A"),
        ("DOMxDraft", "DOM++", "Draft Capital"),
        ("YPCxELU",   "YPC",   "ELU"),
    )
    for new_col, left, right in recipes:
        if new_col not in interactions:
            continue
        if left in out.columns and right in out.columns:
            out[new_col] = out[left] * out[right]
    return out

def train_stack_ridge_allrows(X: pd.DataFrame, y: pd.Series, names: pd.Series, tag: str):
    """
    Train GB + RF base models with a Ridge meta-learner on ALL rows and export results.

    Steps:
      1. Median-impute X and tune each base model with RandomizedSearchCV.
      2. Collect out-of-fold (OOF) base predictions with K-fold, standardize
         them, fit RidgeCV on them, and score that fit for the OOF metrics.
      3. Refit both bases on ALL rows and predict every row (in-sample).
      4. Write per-player predictions and a meta-weighted feature-impact table
         to OUT_DIR, using `tag` as the file-name suffix.

    Caveat: the imputer and the hyperparameter search see every row, and the
    Ridge meta-learner is scored on the same OOF matrix it was fit on, so the
    "OOF" metrics lean optimistic.

    Args:
        X: feature frame; NaNs are allowed and are median-imputed.
        y: target Series, positionally aligned with X.
        names: player names, positionally aligned with X.
        tag: suffix that distinguishes this run's output files.

    Returns:
        (metrics, top_players, top_features): a dict of OOF / full-fit metrics
        plus the two output paths, the first 8 rows of the per-player table,
        and the first 12 rows of the feature-impact table.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    players_csv = OUT_DIR / f"stack_ridge_all_players_{tag}.csv"
    impact_csv  = OUT_DIR / f"stack_ridge_feature_impact_{tag}.csv"

    # SimpleImputer discards columns that are entirely NaN, which would leave
    # X_imp with fewer columns than feature_order and break the importance
    # tables below; drop such columns explicitly so the two stay aligned.
    X = X.dropna(axis=1, how="all")

    # impute
    imp = SimpleImputer(strategy="median")
    X_imp = imp.fit_transform(X)
    feature_order = list(X.columns)

    # base models (light but solid tuning): 6 sampled settings, 3-fold CV, scored by R²
    gb = GradientBoostingRegressor(random_state=RANDOM_SEED)
    gb_param = {
        "n_estimators":[800,1200],
        "learning_rate":[0.03,0.05],
        "max_depth":[3,4],
        "subsample":[0.85,1.0],
        "min_samples_split":[2,5],
        "min_samples_leaf":[1,2],
    }
    gb_best = RandomizedSearchCV(gb, gb_param, n_iter=6, scoring="r2", cv=3, n_jobs=N_JOBS,
                                 random_state=RANDOM_SEED).fit(X_imp,y).best_estimator_

    rf = RandomForestRegressor(random_state=RANDOM_SEED)
    rf_param = {
        "n_estimators":[800,1200],
        "max_depth":[None,12,16],
        "min_samples_split":[2,5],
        "min_samples_leaf":[1,2],
        "max_features":["sqrt",0.8,1.0],
    }
    rf_best = RandomizedSearchCV(rf, rf_param, n_iter=6, scoring="r2", cv=3, n_jobs=N_JOBS,
                                 random_state=RANDOM_SEED).fit(X_imp,y).best_estimator_

    base_models = [("gb", gb_best), ("rf", rf_best)]

    # OOF stacking: each row is predicted by a model that did not train on it.
    # The loop refits gb_best / rf_best in place.
    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
    oof = np.zeros((len(X_imp), len(base_models)))
    for j, (_, mdl) in enumerate(base_models):
        fold_preds = np.zeros(len(X_imp))
        for tr_idx, val_idx in kf.split(X_imp):
            mdl.fit(X_imp[tr_idx], y.iloc[tr_idx])
            fold_preds[val_idx] = mdl.predict(X_imp[val_idx])
        oof[:,j] = fold_preds

    sc_meta = StandardScaler()
    Z = sc_meta.fit_transform(oof)
    ridge_meta = RidgeCV(alphas=np.logspace(-3,2,30)).fit(Z, y)

    # OOF metrics
    y_oof = ridge_meta.predict(Z)
    oof_r2, oof_mae, oof_rmse = r2_score(y,y_oof), mean_absolute_error(y,y_oof), rmse(y,y_oof)

    # Refit bases on ALL rows, predict ALL rows; the importances read below
    # therefore describe the all-rows fit.
    base_full_preds = []
    for _, mdl in base_models:
        mdl.fit(X_imp, y)
        base_full_preds.append(mdl.predict(X_imp))
    # one column per base model, standardized with the scaler fit on the OOF matrix
    stack_inputs_std = sc_meta.transform(np.vstack(base_full_preds).T)
    y_hat = ridge_meta.predict(stack_inputs_std)

    # Full-fit metrics (in-sample)
    full_r2, full_mae, full_rmse = r2_score(y,y_hat), mean_absolute_error(y,y_hat), rmse(y,y_hat)

    # Save per-player predictions
    players_out = pd.DataFrame({
        "Player": names.values,
        "Actual_RB_Grade": y.values,
        "Predicted_RB_Grade": y_hat,
        "Error": y_hat - y.values
    }).sort_values("Actual_RB_Grade", ascending=False)
    players_out.to_csv(players_csv, index=False)

    # Feature impact (meta-weighted): each base's importances are normalized to
    # sum to 1, then blended using the signed, L1-normalized Ridge coefficients.
    gb_imp = pd.Series(gb_best.feature_importances_, index=feature_order)
    rf_imp = pd.Series(rf_best.feature_importances_, index=feature_order)
    gb_imp_n = gb_imp / (gb_imp.sum() + 1e-12)
    rf_imp_n = rf_imp / (rf_imp.sum() + 1e-12)
    meta_coef = pd.Series(ridge_meta.coef_, index=[nm for nm,_ in base_models])
    meta_w = meta_coef / (meta_coef.abs().sum() + 1e-12)
    combined = meta_w.get("gb",0.0)*gb_imp_n + meta_w.get("rf",0.0)*rf_imp_n
    impact_df = pd.DataFrame({
        "Impact_MetaWeighted": combined,
        "GB_Importance": gb_imp_n,
        "RF_Importance": rf_imp_n,
    }).sort_values("Impact_MetaWeighted", ascending=False)
    # The feature names live in the index. Writing with index=False produced a
    # CSV of unlabeled numbers, so export the index as a "Feature" column.
    impact_df.to_csv(impact_csv, index_label="Feature")

    metrics = {
        "tag": tag,
        "OOF_R2": oof_r2, "OOF_MAE": oof_mae, "OOF_RMSE": oof_rmse,
        "Full_R2": full_r2, "Full_MAE": full_mae, "Full_RMSE": full_rmse,
        "players_csv": str(players_csv),
        "impact_csv": str(impact_csv),
    }
    return metrics, players_out.head(8), impact_df.head(12)

# ===========================
# Run ablation
# ===========================
X_base, y, names = load_base_table(CSV_PATH)

# Run (B) is run (A) minus the DOMxDraft product.
interactions_with = ["BMIx40","ELUxYCOA","DOMxDraft","YPCxELU"]
interactions_wo   = [name for name in interactions_with if name != "DOMxDraft"]

X_with = add_interactions(X_base, interactions_with)
X_wo   = add_interactions(X_base, interactions_wo)

m_with, sample_players_with, top_feats_with = train_stack_ridge_allrows(
    X_with, y, names, tag="with_domxdraft")
m_wo, sample_players_wo, top_feats_wo = train_stack_ridge_allrows(
    X_wo, y, names, tag="no_domxdraft")

# Compare: one row per run, indexed by tag, without the path columns
summary = pd.DataFrame([m_with, m_wo])
summary = summary.drop(columns=["players_csv","impact_csv"])
delta = summary.set_index("tag")
metric_cols = ["OOF_R2","OOF_MAE","OOF_RMSE","Full_R2","Full_MAE","Full_RMSE"]
print("\n=== Ablation Summary (OOF = generalization estimate) ===")
print(delta[metric_cols].round(4))

# (heading, table, decimals) for each preview printed below
previews = (
    ("\nTop players (with DOMxDraft):", sample_players_with, 3),
    ("\nTop players (no DOMxDraft):", sample_players_wo, 3),
    ("\nTop features by impact (with DOMxDraft):", top_feats_with, 4),
    ("\nTop features by impact (no DOMxDraft):", top_feats_wo, 4),
)
for heading, table, decimals in previews:
    print(heading)
    print(table.round(decimals))

print(f"\nPer-player CSVs:\n  {m_with['players_csv']}\n  {m_wo['players_csv']}")
print(f"Feature impact CSVs:\n  {m_with['impact_csv']}\n  {m_wo['impact_csv']}")


In [None]:
# ===========================
# One-by-one interaction ablation for Stack Ridge Meta
# - Baseline uses all interactions in `ALL_INTERACTIONS`
# - Runs ablation removing each interaction individually + "no interactions"
# - Outputs OOF (CV) metrics, Full-fit metrics, ranked Δ vs baseline
# - Saves per-player predictions and feature impact for every run
# ===========================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# ---------- config ----------
CSV_PATH    = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")  # input table, relative to the working directory
OUT_DIR     = Path("./data/Bakery/_derived/ablation")         # all CSV outputs are written here
RANDOM_SEED = 42   # shared by the base models, the hyperparameter search and KFold
N_JOBS      = -1   # passed as n_jobs to RandomizedSearchCV
N_SPLITS    = 5    # folds for OOF stacking

# choose your baseline interactions here
ALL_INTERACTIONS = ["BMIx40", "ELUxYCOA", "DOMxDraft", "YPCxELU"]

# canonical features (no Breakout Age)
# Canonical feature name -> accepted CSV header spellings (matched ignoring case
# and whitespace, first hit wins). Every list starts with the canonical name.
ALIASES = {
    "DOM++":        ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":      ["40 Time","Forty","40"],
    "BMI":          ["BMI"],
    "YPC":          ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":          ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":        ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":       ["Break%","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital":["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":         ["Bama","Bama Rating","BamaAdj"],
    "Shuttle":      ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":   ["3 Cone","Three Cone","3-Cone"],
    # "Rec Yards" itself was missing, so a column with exactly that header was skipped
    "Rec Yards":    ["Rec Yards","Receiving Yards","Rec Yds","RecYds"],
}
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]   # accepted headers for the target
NAME_CANDS   = ["Player","Player Name","Name"]     # accepted headers for the player name

# ---------- helpers ----------
def find_col(frame, candidates):
    """Return the first column of `frame` that matches one of `candidates`.

    Matching ignores letter case and every whitespace character. Candidates
    are tried in the order given; None is returned when none of them match.
    """
    def _squash(label):
        # canonical comparison form: whitespace removed, lower-cased
        return re.sub(r"\s+","",label).lower()

    lookup = {}
    for column in frame.columns:
        lookup[_squash(column)] = column  # a later duplicate key replaces an earlier one
    hits = (lookup[_squash(cand)] for cand in candidates if _squash(cand) in lookup)
    return next(hits, None)

def to_num(series):
    """Coerce a column of messy values to numbers; unparseable entries become NaN.

    Text cleanup removes '%', the word "round", a leading "R", a trailing
    ordinal suffix (st/nd/rd/th), thousands commas, and finally every character
    that is not a digit, '.' or '-'.

    Args:
        series: pandas Series of strings and/or numbers.

    Returns:
        Numeric pandas Series with the same index as `series`.
    """
    # Already-numeric data needs no text cleanup. Round-tripping it through str
    # renders very small/large floats as e.g. "1e-05", which the final character
    # filter would mangle into "1-05" and silently turn into NaN.
    if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
        nums = pd.to_numeric(series, errors='coerce')
        if pd.api.types.is_float_dtype(nums):
            # the text path maps +/-inf to NaN; keep that for numeric input too
            nums = nums.mask(nums.isin([float("inf"), float("-inf")]))
        return nums
    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def rmse(y_true,y_pred):
    """Root-mean-squared error between `y_true` and `y_pred`, as a plain Python float."""
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))

def load_base_table(path: Path):
    """Load the CSV at `path` and split it into features, target and player names.

    Column headers are stripped, then resolved through ALIASES / TARGET_CANDS /
    NAME_CANDS with `find_col`. Features and target are parsed with `to_num`.
    Rows whose target is missing or unparseable are dropped, and the three
    results are re-indexed 0..n-1 so they stay positionally aligned.

    Args:
        path: location of the CSV file.

    Returns:
        (X, y, names): feature DataFrame keyed by canonical feature name (only
        features found in the file), numeric target Series, and player-name
        Series of str.

    Raises:
        KeyError: if no target column or no player-name column can be found.
    """
    df = pd.read_csv(path)
    df.columns = [c.strip() for c in df.columns]
    y_col = find_col(df, TARGET_CANDS)
    if y_col is None:
        # previously surfaced as an opaque KeyError from df[None]
        raise KeyError(f"No target column found in {path}; expected one of {TARGET_CANDS}")
    name_col = find_col(df, NAME_CANDS)
    if name_col is None:
        raise KeyError(f"No player-name column found in {path}; expected one of {NAME_CANDS}")
    mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
    mapped = {k:v for k,v in mapped.items() if v is not None}

    X = pd.DataFrame({feat: to_num(df[col]) for feat,col in mapped.items()})
    y = to_num(df[y_col])
    # fillna BEFORE the str cast: casting first turns missing names into the
    # literal text "nan", after which fillna has nothing left to fill.
    names = df[name_col].fillna("").astype(str)
    mask = y.notna()
    return X.loc[mask].reset_index(drop=True), y.loc[mask].reset_index(drop=True), names.loc[mask].reset_index(drop=True)

def add_interactions(X: pd.DataFrame, interactions: list) -> pd.DataFrame:
    """Return a copy of `X` with lower-is-better columns negated plus the requested products.

    `interactions` names which product columns to build ("BMIx40", "ELUxYCOA",
    "DOMxDraft", "YPCxELU"). A product is added only when it is requested and
    both of its input columns exist; it uses the already-negated values.
    """
    out = X.copy()

    # invert "lower is better" BEFORE interactions
    for col in ("40 Time", "Draft Capital", "Shuttle", "Three Cone"):
        if col in out.columns:
            out[col] = -out[col]

    # (new column, left factor, right factor) in the fixed order columns are appended
    recipes = (
        ("BMIx40",    "BMI",   "40 Time"),
        ("ELUxYCOA",  "ELU",   "YCO/A"),
        ("DOMxDraft", "DOM++", "Draft Capital"),
        ("YPCxELU",   "YPC",   "ELU"),
    )
    for new_col, left, right in recipes:
        if new_col not in interactions:
            continue
        if left in out.columns and right in out.columns:
            out[new_col] = out[left] * out[right]
    return out

def train_stack_ridge_allrows(X: pd.DataFrame, y: pd.Series, names: pd.Series, tag: str):
    """
    Train GB + RF base models with a Ridge meta-learner on ALL rows and export results.

    Steps:
      1. Median-impute X and tune each base model with RandomizedSearchCV.
      2. Collect out-of-fold (OOF) base predictions with K-fold, standardize
         them, fit RidgeCV on them, and score that fit for the OOF metrics.
      3. Refit both bases on ALL rows and predict every row (in-sample).
      4. Write per-player predictions and a meta-weighted feature-impact table
         to OUT_DIR as players_<tag>.csv and impact_<tag>.csv.

    Caveat: the imputer and the hyperparameter search see every row, and the
    Ridge meta-learner is scored on the same OOF matrix it was fit on, so the
    "OOF" metrics lean optimistic.

    Args:
        X: feature frame; NaNs are allowed and are median-imputed.
        y: target Series, positionally aligned with X.
        names: player names, positionally aligned with X.
        tag: suffix that distinguishes this run's output files.

    Returns:
        dict with the run tag, OOF and full-fit R2 / MAE / RMSE, and the two
        output paths as strings.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    players_csv = OUT_DIR / f"players_{tag}.csv"
    impact_csv  = OUT_DIR / f"impact_{tag}.csv"

    # SimpleImputer discards columns that are entirely NaN, which would leave
    # X_imp with fewer columns than feature_order and break the importance
    # tables below; drop such columns explicitly so the two stay aligned.
    X = X.dropna(axis=1, how="all")

    # impute
    imp = SimpleImputer(strategy="median")
    X_imp = imp.fit_transform(X)
    feature_order = list(X.columns)

    # base models (light tuning to keep fast): 6 sampled settings, 3-fold CV, scored by R²
    gb = GradientBoostingRegressor(random_state=RANDOM_SEED)
    gb_param = {
        "n_estimators":[800,1200],
        "learning_rate":[0.03,0.05],
        "max_depth":[3,4],
        "subsample":[0.85,1.0],
        "min_samples_split":[2,5],
        "min_samples_leaf":[1,2],
    }
    gb_best = RandomizedSearchCV(gb, gb_param, n_iter=6, scoring="r2", cv=3, n_jobs=N_JOBS,
                                 random_state=RANDOM_SEED).fit(X_imp,y).best_estimator_

    rf = RandomForestRegressor(random_state=RANDOM_SEED)
    rf_param = {
        "n_estimators":[800,1200],
        "max_depth":[None,12,16],
        "min_samples_split":[2,5],
        "min_samples_leaf":[1,2],
        "max_features":["sqrt",0.8,1.0],
    }
    rf_best = RandomizedSearchCV(rf, rf_param, n_iter=6, scoring="r2", cv=3, n_jobs=N_JOBS,
                                 random_state=RANDOM_SEED).fit(X_imp,y).best_estimator_
    base_models = [("gb", gb_best), ("rf", rf_best)]

    # OOF stacking: each row is predicted by a model that did not train on it.
    # The loop refits gb_best / rf_best in place.
    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
    oof = np.zeros((len(X_imp), len(base_models)))
    for j, (_, mdl) in enumerate(base_models):
        fold_preds = np.zeros(len(X_imp))
        for tr_idx, val_idx in kf.split(X_imp):
            mdl.fit(X_imp[tr_idx], y.iloc[tr_idx])
            fold_preds[val_idx] = mdl.predict(X_imp[val_idx])
        oof[:,j] = fold_preds

    sc_meta = StandardScaler()
    Z = sc_meta.fit_transform(oof)
    ridge_meta = RidgeCV(alphas=np.logspace(-3, 2, 30)).fit(Z, y)

    # OOF metrics
    y_oof = ridge_meta.predict(Z)
    oof_r2, oof_mae, oof_rmse = r2_score(y,y_oof), mean_absolute_error(y,y_oof), rmse(y,y_oof)

    # Refit bases on ALL rows => predict ALL rows; the importances read below
    # therefore describe the all-rows fit.
    full_preds = []
    for _, mdl in base_models:
        mdl.fit(X_imp, y)
        full_preds.append(mdl.predict(X_imp))
    # one column per base model, standardized with the scaler fit on the OOF matrix
    stack_inputs_std = sc_meta.transform(np.vstack(full_preds).T)
    y_hat = ridge_meta.predict(stack_inputs_std)

    # Full-fit metrics
    full_r2, full_mae, full_rmse = r2_score(y,y_hat), mean_absolute_error(y,y_hat), rmse(y,y_hat)

    # Save per-player predictions
    players_out = pd.DataFrame({
        "Player": names.values,
        "Actual_RB_Grade": y.values,
        "Predicted_RB_Grade": y_hat,
        "Error": y_hat - y.values
    }).sort_values("Actual_RB_Grade", ascending=False)
    players_out.to_csv(players_csv, index=False)

    # Feature impact (meta-weighted): each base's importances are normalized to
    # sum to 1, then blended using the signed, L1-normalized Ridge coefficients.
    gb_imp = pd.Series(gb_best.feature_importances_, index=feature_order)
    rf_imp = pd.Series(rf_best.feature_importances_, index=feature_order)
    gb_imp_n = gb_imp / (gb_imp.sum() + 1e-12)
    rf_imp_n = rf_imp / (rf_imp.sum() + 1e-12)
    meta_coef = pd.Series(ridge_meta.coef_, index=[nm for nm,_ in base_models])
    meta_w = meta_coef / (meta_coef.abs().sum() + 1e-12)
    combined = meta_w.get("gb",0.0)*gb_imp_n + meta_w.get("rf",0.0)*rf_imp_n
    impact_df = pd.DataFrame({
        "Impact_MetaWeighted": combined,
        "GB_Importance": gb_imp_n,
        "RF_Importance": rf_imp_n,
    }).sort_values("Impact_MetaWeighted", ascending=False)
    # The feature names live in the index. Writing with index=False produced a
    # CSV of unlabeled numbers, so export the index as a "Feature" column.
    impact_df.to_csv(impact_csv, index_label="Feature")

    return {
        "tag": tag,
        "OOF_R2": oof_r2, "OOF_MAE": oof_mae, "OOF_RMSE": oof_rmse,
        "Full_R2": full_r2, "Full_MAE": full_mae, "Full_RMSE": full_rmse,
        "players_csv": str(players_csv),
        "impact_csv": str(impact_csv),
    }

# ---------- load base table once ----------
X_base, y, names = load_base_table(CSV_PATH)

# ---------- baseline: every interaction switched on ----------
X_all = add_interactions(X_base, ALL_INTERACTIONS)
baseline = train_stack_ridge_allrows(X_all, y, names, tag="baseline_all_interactions")

# ---------- leave-one-out runs: drop a single interaction each time ----------
results = [baseline]
for rem in ALL_INTERACTIONS:
    interactions = list(filter(lambda x: x != rem, ALL_INTERACTIONS))
    X_ab = add_interactions(X_base, interactions)
    res = train_stack_ridge_allrows(X_ab, y, names, tag=f"minus_{rem}")
    results.append(res)

# ---------- final run: no interaction columns at all ----------
X_none = add_interactions(X_base, [])
res_none = train_stack_ridge_allrows(X_none, y, names, tag="no_interactions")
results.append(res_none)

# ---------- summarize: one row per run plus R² deltas against the baseline ----------
base_oof_r2 = float(baseline["OOF_R2"])
base_full_r2 = float(baseline["Full_R2"])
tab = pd.DataFrame(results)
tab["ΔOOF_R2_vsBaseline"] = tab["OOF_R2"] - base_oof_r2
tab["ΔFull_R2_vsBaseline"] = tab["Full_R2"] - base_full_r2

# Most negative ΔOOF first; ties broken by higher OOF R².
ranked = tab.sort_values(["ΔOOF_R2_vsBaseline","OOF_R2"], ascending=[True, False])
ranked = ranked.reset_index(drop=True)

pd.set_option("display.max_columns", None)
report_cols = ["tag","OOF_R2","ΔOOF_R2_vsBaseline","Full_R2","ΔFull_R2_vsBaseline","players_csv","impact_csv"]
print("\n=== Interaction Ablation — Ranked by ΔOOF_R2 (higher is better; negative means worse than baseline) ===")
display(ranked[report_cols].round(4))

print("\nOpen these CSVs to inspect predictions & feature impacts for any run (paths shown above).")


In [None]:
# ===========================
# Ablation: does DOMxDraft add value? (Now includes Draft Age)
# Trains Stack Ridge Meta twice on ALL rows:
#   (A) with interactions: ["BMIx40","ELUxYCOA","DOMxDraft","YPCxELU"]
#   (B) same but WITHOUT "DOMxDraft"
# Adds "Draft Age" as a feature (younger is better → inverted in preprocessing).
# Saves metrics, per-player predictions, and feature impacts for both runs.
# ===========================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# ---------- config ----------
CSV_PATH    = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")  # input table, relative to the working directory
OUT_DIR     = Path("./data/Bakery/_derived")                  # all CSV outputs are written here
RANDOM_SEED = 42   # shared by the base models, the hyperparameter search and KFold
N_JOBS      = -1   # passed as n_jobs to RandomizedSearchCV
N_SPLITS    = 5    # folds for OOF stacking

# Canonical feature name -> accepted CSV header spellings (matched ignoring case
# and whitespace, first hit wins). Every list includes the canonical name.
ALIASES = {
    "DOM++":        ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":      ["40 Time","Forty","40"],
    "BMI":          ["BMI"],
    "YPC":          ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":          ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":        ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":       ["Break%","Break %","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital":["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":         ["Bama","Bama Rating","BAMA","BamaAdj"],
    "Shuttle":      ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":   ["3 Cone","Three Cone","3-Cone"],
    # "Rec Yards" itself was missing, so a column with exactly that header was skipped
    "Rec Yards":    ["Rec Yards","Receiving Yards","Rec Yds","RecYds"],
    # NEW: Draft Age (younger is better -> will be inverted)
    "Draft Age":    ["Draft Age","Age at Draft","DraftAge","Age (Draft)","AgeDraft","Age_at_Draft"]
}
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]   # accepted headers for the target
NAME_CANDS   = ["Player","Player Name","Name"]     # accepted headers for the player name

# ---------- helpers ----------
def find_col(frame, candidates):
    """Return the first column of `frame` that matches one of `candidates`.

    Matching ignores letter case and every whitespace character. Candidates
    are tried in the order given; None is returned when none of them match.
    """
    def _squash(label):
        # canonical comparison form: whitespace removed, lower-cased
        return re.sub(r"\s+","",label).lower()

    lookup = {}
    for column in frame.columns:
        lookup[_squash(column)] = column  # a later duplicate key replaces an earlier one
    hits = (lookup[_squash(cand)] for cand in candidates if _squash(cand) in lookup)
    return next(hits, None)

def to_num(series):
    """Coerce a column of messy values to numbers; unparseable entries become NaN.

    Text cleanup removes '%', the word "round", a leading "R", a trailing
    ordinal suffix (st/nd/rd/th), thousands commas, and finally every character
    that is not a digit, '.' or '-'.

    Args:
        series: pandas Series of strings and/or numbers.

    Returns:
        Numeric pandas Series with the same index as `series`.
    """
    # Already-numeric data needs no text cleanup. Round-tripping it through str
    # renders very small/large floats as e.g. "1e-05", which the final character
    # filter would mangle into "1-05" and silently turn into NaN.
    if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
        nums = pd.to_numeric(series, errors='coerce')
        if pd.api.types.is_float_dtype(nums):
            # the text path maps +/-inf to NaN; keep that for numeric input too
            nums = nums.mask(nums.isin([float("inf"), float("-inf")]))
        return nums
    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def rmse(y_true,y_pred):
    """Root-mean-squared error between `y_true` and `y_pred`, as a plain Python float."""
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))

def load_base_table(path: Path):
    """Load the CSV at `path` and split it into features, target and player names.

    Column headers are stripped, then resolved through ALIASES / TARGET_CANDS /
    NAME_CANDS with `find_col`. Features and target are parsed with `to_num`.
    Rows whose target is missing or unparseable are dropped, and the three
    results are re-indexed 0..n-1 so they stay positionally aligned.

    Args:
        path: location of the CSV file.

    Returns:
        (X, y, names): feature DataFrame keyed by canonical feature name (only
        features found in the file), numeric target Series, and player-name
        Series of str.

    Raises:
        KeyError: if no target column or no player-name column can be found.
    """
    df = pd.read_csv(path)
    df.columns = [c.strip() for c in df.columns]
    y_col = find_col(df, TARGET_CANDS)
    if y_col is None:
        # previously surfaced as an opaque KeyError from df[None]
        raise KeyError(f"No target column found in {path}; expected one of {TARGET_CANDS}")
    name_col = find_col(df, NAME_CANDS)
    if name_col is None:
        raise KeyError(f"No player-name column found in {path}; expected one of {NAME_CANDS}")
    mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
    mapped = {k:v for k,v in mapped.items() if v is not None}

    X = pd.DataFrame({feat: to_num(df[col]) for feat,col in mapped.items()})
    y = to_num(df[y_col])
    # fillna BEFORE the str cast: casting first turns missing names into the
    # literal text "nan", after which fillna has nothing left to fill.
    names = df[name_col].fillna("").astype(str)
    mask = y.notna()
    return X.loc[mask].reset_index(drop=True), y.loc[mask].reset_index(drop=True), names.loc[mask].reset_index(drop=True)

def add_interactions(X: pd.DataFrame, interactions: list) -> pd.DataFrame:
    """Return a copy of `X` with lower-is-better columns negated plus the requested products.

    `interactions` names which product columns to build ("BMIx40", "ELUxYCOA",
    "DOMxDraft", "YPCxELU"). A product is added only when it is requested and
    both of its input columns exist; it uses the already-negated values.
    """
    out = X.copy()

    # invert "lower is better" BEFORE interactions: faster 40, earlier round,
    # quicker shuttle/3-cone, YOUNGER age => larger is better after invert
    for col in ("40 Time", "Draft Capital", "Shuttle", "Three Cone", "Draft Age"):
        if col in out.columns:
            out[col] = -out[col]

    # (new column, left factor, right factor) in the fixed order columns are appended
    recipes = (
        ("BMIx40",    "BMI",   "40 Time"),
        ("ELUxYCOA",  "ELU",   "YCO/A"),
        ("DOMxDraft", "DOM++", "Draft Capital"),
        ("YPCxELU",   "YPC",   "ELU"),
    )
    for new_col, left, right in recipes:
        if new_col not in interactions:
            continue
        if left in out.columns and right in out.columns:
            out[new_col] = out[left] * out[right]
    return out

def train_stack_ridge_allrows(X: pd.DataFrame, y: pd.Series, names: pd.Series, tag: str):
    """
    Train GB + RF base models with a Ridge meta-learner on ALL rows and export results.

    Steps:
      1. Median-impute X and tune each base model with RandomizedSearchCV.
      2. Collect out-of-fold (OOF) base predictions with K-fold, standardize
         them, fit RidgeCV on them, and score that fit for the OOF metrics.
      3. Refit both bases on ALL rows and predict every row (in-sample).
      4. Write per-player predictions and a meta-weighted feature-impact table
         to OUT_DIR, using `tag` as the file-name suffix.

    Caveat: the imputer and the hyperparameter search see every row, and the
    Ridge meta-learner is scored on the same OOF matrix it was fit on, so the
    "OOF" metrics lean optimistic.

    Args:
        X: feature frame; NaNs are allowed and are median-imputed.
        y: target Series, positionally aligned with X.
        names: player names, positionally aligned with X.
        tag: suffix that distinguishes this run's output files.

    Returns:
        (metrics, top_players, top_features): a dict of OOF / full-fit metrics
        plus the two output paths, the first 8 rows of the per-player table,
        and the first 12 rows of the feature-impact table.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    players_csv = OUT_DIR / f"stack_ridge_all_players_{tag}.csv"
    impact_csv  = OUT_DIR / f"stack_ridge_feature_impact_{tag}.csv"

    # SimpleImputer discards columns that are entirely NaN, which would leave
    # X_imp with fewer columns than feature_order and break the importance
    # tables below; drop such columns explicitly so the two stay aligned.
    X = X.dropna(axis=1, how="all")

    # impute
    imp = SimpleImputer(strategy="median")
    X_imp = imp.fit_transform(X)
    feature_order = list(X.columns)

    # base models (light but solid tuning): 6 sampled settings, 3-fold CV, scored by R²
    gb = GradientBoostingRegressor(random_state=RANDOM_SEED)
    gb_param = {
        "n_estimators":[800,1200],
        "learning_rate":[0.03,0.05],
        "max_depth":[3,4],
        "subsample":[0.85,1.0],
        "min_samples_split":[2,5],
        "min_samples_leaf":[1,2],
    }
    gb_best = RandomizedSearchCV(gb, gb_param, n_iter=6, scoring="r2", cv=3, n_jobs=N_JOBS,
                                 random_state=RANDOM_SEED).fit(X_imp,y).best_estimator_

    rf = RandomForestRegressor(random_state=RANDOM_SEED)
    rf_param = {
        "n_estimators":[800,1200],
        "max_depth":[None,12,16],
        "min_samples_split":[2,5],
        "min_samples_leaf":[1,2],
        "max_features":["sqrt",0.8,1.0],
    }
    rf_best = RandomizedSearchCV(rf, rf_param, n_iter=6, scoring="r2", cv=3, n_jobs=N_JOBS,
                                 random_state=RANDOM_SEED).fit(X_imp,y).best_estimator_

    base_models = [("gb", gb_best), ("rf", rf_best)]

    # OOF stacking: each row is predicted by a model that did not train on it.
    # The loop refits gb_best / rf_best in place.
    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
    oof = np.zeros((len(X_imp), len(base_models)))
    for j, (_, mdl) in enumerate(base_models):
        fold_preds = np.zeros(len(X_imp))
        for tr_idx, val_idx in kf.split(X_imp):
            mdl.fit(X_imp[tr_idx], y.iloc[tr_idx])
            fold_preds[val_idx] = mdl.predict(X_imp[val_idx])
        oof[:,j] = fold_preds

    sc_meta = StandardScaler()
    Z = sc_meta.fit_transform(oof)
    ridge_meta = RidgeCV(alphas=np.logspace(-3,2,30)).fit(Z, y)

    # OOF metrics
    y_oof = ridge_meta.predict(Z)
    oof_r2, oof_mae, oof_rmse = r2_score(y,y_oof), mean_absolute_error(y,y_oof), rmse(y,y_oof)

    # Refit bases on ALL rows, predict ALL rows; the importances read below
    # therefore describe the all-rows fit.
    base_full_preds = []
    for _, mdl in base_models:
        mdl.fit(X_imp, y)
        base_full_preds.append(mdl.predict(X_imp))
    # one column per base model, standardized with the scaler fit on the OOF matrix
    stack_inputs_std = sc_meta.transform(np.vstack(base_full_preds).T)
    y_hat = ridge_meta.predict(stack_inputs_std)

    # Full-fit metrics (in-sample)
    full_r2, full_mae, full_rmse = r2_score(y,y_hat), mean_absolute_error(y,y_hat), rmse(y,y_hat)

    # Save per-player predictions
    players_out = pd.DataFrame({
        "Player": names.values,
        "Actual_RB_Grade": y.values,
        "Predicted_RB_Grade": y_hat,
        "Error": y_hat - y.values
    }).sort_values("Actual_RB_Grade", ascending=False)
    players_out.to_csv(players_csv, index=False)

    # Feature impact (meta-weighted): each base's importances are normalized to
    # sum to 1, then blended using the signed, L1-normalized Ridge coefficients.
    gb_imp = pd.Series(gb_best.feature_importances_, index=feature_order)
    rf_imp = pd.Series(rf_best.feature_importances_, index=feature_order)
    gb_imp_n = gb_imp / (gb_imp.sum() + 1e-12)
    rf_imp_n = rf_imp / (rf_imp.sum() + 1e-12)
    meta_coef = pd.Series(ridge_meta.coef_, index=[nm for nm,_ in base_models])
    meta_w = meta_coef / (meta_coef.abs().sum() + 1e-12)
    combined = meta_w.get("gb",0.0)*gb_imp_n + meta_w.get("rf",0.0)*rf_imp_n
    impact_df = pd.DataFrame({
        "Impact_MetaWeighted": combined,
        "GB_Importance": gb_imp_n,
        "RF_Importance": rf_imp_n,
    }).sort_values("Impact_MetaWeighted", ascending=False)
    # The feature names live in the index. Writing with index=False produced a
    # CSV of unlabeled numbers, so export the index as a "Feature" column.
    impact_df.to_csv(impact_csv, index_label="Feature")

    metrics = {
        "tag": tag,
        "OOF_R2": oof_r2, "OOF_MAE": oof_mae, "OOF_RMSE": oof_rmse,
        "Full_R2": full_r2, "Full_MAE": full_mae, "Full_RMSE": full_rmse,
        "players_csv": str(players_csv),
        "impact_csv": str(impact_csv),
    }
    return metrics, players_out.head(8), impact_df.head(12)

# ===========================
# Run ablation
# ===========================
X_base, y, names = load_base_table(CSV_PATH)

# Run (B) is run (A) minus the DOMxDraft product.
interactions_with = ["BMIx40","ELUxYCOA","DOMxDraft","YPCxELU"]
interactions_wo   = [name for name in interactions_with if name != "DOMxDraft"]

X_with = add_interactions(X_base, interactions_with)
X_wo   = add_interactions(X_base, interactions_wo)

m_with, sample_players_with, top_feats_with = train_stack_ridge_allrows(
    X_with, y, names, tag="with_domxdraft_plus_draftage")
m_wo, sample_players_wo, top_feats_wo = train_stack_ridge_allrows(
    X_wo, y, names, tag="no_domxdraft_plus_draftage")

# Compare: one row per run, indexed by tag, without the path columns
summary = pd.DataFrame([m_with, m_wo])
summary = summary.drop(columns=["players_csv","impact_csv"])
delta = summary.set_index("tag")
metric_cols = ["OOF_R2","OOF_MAE","OOF_RMSE","Full_R2","Full_MAE","Full_RMSE"]
print("\n=== Ablation Summary (OOF = generalization estimate) ===")
print(delta[metric_cols].round(4))

# (heading, table, decimals) for each preview printed below
previews = (
    ("\nTop players (with DOMxDraft):", sample_players_with, 3),
    ("\nTop players (no DOMxDraft):", sample_players_wo, 3),
    ("\nTop features by impact (with DOMxDraft):", top_feats_with, 4),
    ("\nTop features by impact (no DOMxDraft):", top_feats_wo, 4),
)
for heading, table, decimals in previews:
    print(heading)
    print(table.round(decimals))

print(f"\nPer-player CSVs:\n  {m_with['players_csv']}\n  {m_wo['players_csv']}")
print(f"Feature impact CSVs:\n  {m_with['impact_csv']}\n  {m_wo['impact_csv']}")


In [None]:
# ===========================
# Full-dataset application + metrics + feature impact (Stack Ridge Meta)
# Includes Draft Age feature (younger is better -> inverted)
# ===========================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# ---------- config ----------
# Input table and output files; all paths are relative to the kernel's working directory.
CSV_PATH    = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
OUT_DIR     = Path("./data/Bakery/_derived")
PLAYERS_CSV = OUT_DIR / "stack_ridge_all_players.csv"
IMPACT_CSV  = OUT_DIR / "stack_ridge_feature_impact.csv"

RANDOM_SEED = 42  # shared by the base estimators, the hyper-parameter searches and the KFold split
N_JOBS      = -1  # passed as n_jobs to RandomizedSearchCV
N_SPLITS    = 5  # for OOF stacking

# canonical features (no Breakout Age) + Draft Age
# Maps canonical feature name -> header spellings to look for, in priority order.
# find_col() ignores case and whitespace, so entries such as "Break %" / "BAMA"
# are already covered by "Break%" / "Bama".
# NOTE(review): "Rec Yards" is not among its own aliases, so a column spelled
# exactly "Rec Yards" would not be picked up — confirm this is intended.
ALIASES = {
    "DOM++":        ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":      ["40 Time","Forty","40"],
    "BMI":          ["BMI"],
    "YPC":          ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":          ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":        ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":       ["Break%","Break %","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital":["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":         ["Bama","Bama Rating","BamaAdj","BAMA"],
    "Shuttle":      ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":   ["3 Cone","Three Cone","3-Cone"],
    "Rec Yards":    ["Receiving Yards","Rec Yds","RecYds"],
    # NEW: Draft Age (younger is better -> will be inverted)
    "Draft Age":    ["Draft Age","Age at Draft","DraftAge","Age (Draft)","AgeDraft","Age_at_Draft"]
}
# Accepted header spellings for the regression target and the player-name column.
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]
NAME_CANDS   = ["Player","Player Name","Name"]

# ---------- helpers ----------
def find_col(frame, candidates):
    """Return the actual column label in ``frame`` matching the first usable candidate.

    Matching ignores case and all whitespace. Candidates are tried in the
    order given; ``None`` is returned when none of them matches a column.
    If two columns normalise to the same key, the later column wins.
    """
    def _squash(label):
        # "RB  Grade" -> "rbgrade"
        return re.sub(r"\s+","",label).lower()

    lookup = {}
    for column in frame.columns:
        lookup[_squash(column)] = column
    hits = (lookup[_squash(cand)] for cand in candidates if _squash(cand) in lookup)
    return next(hits, None)

def to_num(series):
    """Coerce a column of messy values to numbers.

    Float-typed input is returned through ``pd.to_numeric`` untouched. Sending
    it through ``str`` would render very small/large values in scientific
    notation (``1e-05``, ``2e+16``), and the character stripping below would
    then turn them into NaN or a wrong number.

    Everything else is treated as text and cleaned before parsing: percent
    signs, a "round" label, a leading "R", a trailing ordinal suffix
    (st/nd/rd/th), thousands commas, and finally every character that is not a
    digit, dot or minus sign are removed.

    Returns a Series on the input's index; unparseable entries become NaN.
    """
    if pd.api.types.is_float_dtype(series):
        return pd.to_numeric(series, errors='coerce')
    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def basic_interactions(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with sign-flipped columns and pairwise product features.

    Columns where a lower raw value is better (times, draft round, draft age)
    are negated first. Each product column is then added only when both of its
    inputs are present, and is built from the already-negated values.
    The input frame is not modified.
    """
    out = X.copy()

    # invert "lower is better" — faster times, earlier draft round, younger draft age
    lower_is_better = ("40 Time","Draft Capital","Shuttle","Three Cone","Draft Age")
    for col in lower_is_better:
        if col not in out.columns:
            continue
        out[col] = -out[col]

    # (new column, left factor, right factor), added in this order
    products = (
        ("BMIx40",    "BMI",   "40 Time"),
        ("ELUxYCOA",  "ELU",   "YCO/A"),
        ("DOMxDraft", "DOM++", "Draft Capital"),
        ("YPCxELU",   "YPC",   "ELU"),
    )
    for new_col, left, right in products:
        if left in out and right in out:
            out[new_col] = out[left] * out[right]
    return out

def rmse(y_true, y_pred) -> float:
    """Root-mean-squared error of ``y_pred`` against ``y_true`` as a plain Python float."""
    mse = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse)
    return float(root)

# ---------- load ----------
df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]

# Resolve the target up front so a missing column fails with a readable message
# instead of the opaque `KeyError: None` that `df[None]` would raise below.
y_col = find_col(df, TARGET_CANDS)
if y_col is None:
    raise KeyError(f"No target column found in {CSV_PATH}; looked for any of {TARGET_CANDS}")
name_col = find_col(df, NAME_CANDS)  # may be None; handled when `names` is built

# canonical feature name -> actual column label, keeping only features present in the file
mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
mapped = {k:v for k,v in mapped.items() if v is not None}
if not mapped:
    raise KeyError(f"None of the expected feature columns were found in {CSV_PATH}")

X0 = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y  = to_num(df[y_col])
if name_col is None:
    # Names are only used for reporting, so fall back to row placeholders.
    names = pd.Series([f"row_{i}" for i in range(len(df))], index=df.index)
else:
    # fillna must come before astype(str); otherwise missing names become the text "nan"
    names = df[name_col].fillna("").astype(str)

# drop rows with missing target
mask = y.notna()
X0, y, names = X0.loc[mask].reset_index(drop=True), y.loc[mask].reset_index(drop=True), names.loc[mask].reset_index(drop=True)

# interactions (+ invert step)
X = basic_interactions(X0)

# SimpleImputer silently discards columns that are entirely missing, which would
# leave `feature_order` longer than X_imp and misalign the importances later on.
X = X.dropna(axis=1, how="all")

# impute full matrix (for later refit + predictions)
imp = SimpleImputer(strategy="median")
X_imp = imp.fit_transform(X)
feature_order = list(X.columns)

# ---------- base models (tuned lightly) ----------
# Each search samples 6 configurations from its small grid, scores them with
# 3-fold CV (R²) on the fully imputed matrix, and keeps the refit best estimator.
gb = GradientBoostingRegressor(random_state=RANDOM_SEED)
gb_param = { "n_estimators":[800,1200], "learning_rate":[0.03,0.05], "max_depth":[3,4], "subsample":[0.85,1.0],
             "min_samples_split":[2,5], "min_samples_leaf":[1,2] }
gb_best = RandomizedSearchCV(gb, gb_param, n_iter=6, scoring="r2", cv=3, n_jobs=N_JOBS,
                             random_state=RANDOM_SEED).fit(X_imp, y).best_estimator_

rf = RandomForestRegressor(random_state=RANDOM_SEED)
rf_param = { "n_estimators":[800,1200], "max_depth":[None,12,16], "min_samples_split":[2,5], "min_samples_leaf":[1,2],
             "max_features":["sqrt",0.8,1.0] }
rf_best = RandomizedSearchCV(rf, rf_param, n_iter=6, scoring="r2", cv=3, n_jobs=N_JOBS,
                             random_state=RANDOM_SEED).fit(X_imp, y).best_estimator_

# The same estimator objects are refit in place by the loops below, so after the
# "full refit" section gb_best / rf_best hold the all-rows fit.
base_models = [("gb", gb_best), ("rf", rf_best)]

# ---------- OOF stacking for honest metrics ----------
# oof[i, j] = prediction of base model j for row i from the fold in which row i was held out.
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
oof = np.zeros((len(X_imp), len(base_models)))
for j,(nm, mdl) in enumerate(base_models):
    fold_preds = np.zeros(len(X_imp))
    for tr_idx, val_idx in kf.split(X_imp):
        mdl.fit(X_imp[tr_idx], y.iloc[tr_idx])
        fold_preds[val_idx] = mdl.predict(X_imp[val_idx])
    oof[:, j] = fold_preds

# meta on standardized OOF
sc_meta = StandardScaler()
Z = sc_meta.fit_transform(oof)
ridge_meta = RidgeCV(alphas=np.logspace(-3, 2, 30)).fit(Z, y)

# OOF metrics (generalization estimate)
# NOTE(review): ridge_meta is scored on the same Z it was just fit on, so these
# numbers are out-of-fold for the base models only, not for the meta-learner.
y_oof_hat = ridge_meta.predict(Z)
oof_r2  = r2_score(y, y_oof_hat)
oof_mae = mean_absolute_error(y, y_oof_hat)
oof_rmse= rmse(y, y_oof_hat)

# ---------- full refit on all data, then predict all rows ----------
# refit bases on ALL data
base_preds_full = []
for nm, mdl in base_models:
    mdl.fit(X_imp, y)
    base_preds_full.append(mdl.predict(X_imp))
# one column per base model, in base_models order (matches the OOF layout)
stack_inputs = np.vstack(base_preds_full).T
stack_inputs_std = sc_meta.transform(stack_inputs)  # use same scaler as OOF
y_full_hat = ridge_meta.predict(stack_inputs_std)

# full fit metrics (in-sample)
full_r2  = r2_score(y, y_full_hat)
full_mae = mean_absolute_error(y, y_full_hat)
full_rmse= rmse(y, y_full_hat)

# ---------- export per-player predictions ----------
OUT_DIR.mkdir(parents=True, exist_ok=True)
# One row per player with the in-sample (full-refit) prediction, best actual grade first.
players_out = pd.DataFrame({
    "Player": names.values,
    "Actual_RB_Grade": y.values,
    "Predicted_RB_Grade": y_full_hat,
    "Error": (y_full_hat - y.values)
}).sort_values("Actual_RB_Grade", ascending=False)
players_out.to_csv(PLAYERS_CSV, index=False)

# ---------- feature impact ----------
# 1) raw importances from bases (taken from the all-rows refit of each base model)
gb_imp = pd.Series(gb_best.feature_importances_, index=feature_order)
rf_imp = pd.Series(rf_best.feature_importances_, index=feature_order)

# 2) normalize each to sum to 1 (avoid bias from scale)
gb_imp_n = gb_imp / (gb_imp.sum() + 1e-12)
rf_imp_n = rf_imp / (rf_imp.sum() + 1e-12)

# 3) meta weights from Ridge on standardized base predictions
meta_coef = pd.Series(ridge_meta.coef_, index=[nm for nm,_ in base_models])
# Allow signs but normalize by L1 to represent relative influence
meta_w = meta_coef / (meta_coef.abs().sum() + 1e-12)

# 4) combine: weighted sum of normalized importances
# (a negative ridge coefficient makes that model's contribution negative)
combined = meta_w.get("gb",0.0)*gb_imp_n + meta_w.get("rf",0.0)*rf_imp_n
impact_df = pd.DataFrame({
    "Impact_MetaWeighted": combined,
    "GB_Importance": gb_imp_n,
    "RF_Importance": rf_imp_n,
}).sort_values("Impact_MetaWeighted", ascending=False)
# The feature names live only in the index, so it must be written out;
# index=False produced a table of numbers with no way to tell which row was which.
impact_df.to_csv(IMPACT_CSV, index_label="Feature")

# ---------- print summary ----------
print("\n=== Stack Ridge Meta — Metrics (with Draft Age) ===")
print(f"OOF R²:   {oof_r2:.4f}   (generalization estimate)")
print(f"OOF MAE:  {oof_mae:.4f}")
print(f"OOF RMSE: {oof_rmse:.4f}")
print(f"\nFull Fit R²:   {full_r2:.4f}   (in-sample)")
print(f"Full Fit MAE:  {full_mae:.4f}")
print(f"Full Fit RMSE: {full_rmse:.4f}")

print(f"\nSaved per-player predictions → {PLAYERS_CSV}")
print(f"Saved feature impact table  → {IMPACT_CSV}")

# Show top 15 players + top 15 features
print("\nTop players by Actual RB Grade:")
print(players_out.head(15).round(3))

print("\nTop features by meta-weighted impact:")
print(impact_df.head(15).round(4))


In [None]:
# ===========================
# Tightened Stack Meta (with Draft Age, isotonic calibration, NNLS meta)
# ===========================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import RepeatedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.isotonic import IsotonicRegression
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    ExtraTreesRegressor,
    HistGradientBoostingRegressor
)
from sklearn.linear_model import RidgeCV  # kept for feature impact ref
from scipy.optimize import nnls

# ---------- config ----------
# Input table and output files; all paths are relative to the kernel's working directory.
CSV_PATH    = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
OUT_DIR     = Path("./data/Bakery/_derived")
PLAYERS_CSV = OUT_DIR / "stack_ridge_all_players_tight.csv"
IMPACT_CSV  = OUT_DIR / "stack_ridge_feature_impact_tight.csv"

RANDOM_SEED = 42  # shared by the estimators, the searches and the repeated K-fold split
N_JOBS      = -1  # passed as n_jobs to the forest estimators and to RandomizedSearchCV
N_SPLITS    = 5
N_REPEATS   = 3   # more stable OOF
N_ITER      = 30  # random search iterations per base

# canonical features + Draft Age
# Maps canonical feature name -> header spellings to look for, in priority order.
# find_col() ignores case and whitespace, so "Break %" / "BAMA" are already
# covered by "Break%" / "Bama".
# NOTE(review): "Rec Yards" is not among its own aliases, so a column spelled
# exactly "Rec Yards" would not be picked up — confirm this is intended.
ALIASES = {
    "DOM++":        ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":      ["40 Time","Forty","40"],
    "BMI":          ["BMI"],
    "YPC":          ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":          ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":        ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":       ["Break%","Break %","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital":["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":         ["Bama","Bama Rating","BAMA","BamaAdj"],
    "Shuttle":      ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":   ["3 Cone","Three Cone","3-Cone"],
    "Rec Yards":    ["Receiving Yards","Rec Yds","RecYds"],
    "Draft Age":    ["Draft Age","Age at Draft","DraftAge","Age (Draft)","AgeDraft","Age_at_Draft"],
}
# Accepted header spellings for the regression target and the player-name column.
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]
NAME_CANDS   = ["Player","Player Name","Name"]

# ---------- helpers ----------
def find_col(frame, candidates):
    """Return the actual column label in ``frame`` matching the first usable candidate.

    Matching ignores case and all whitespace. Candidates are tried in the
    order given; ``None`` is returned when none of them matches a column.
    If two columns normalise to the same key, the later column wins.
    """
    def _squash(label):
        # "RB  Grade" -> "rbgrade"
        return re.sub(r"\s+","",label).lower()

    lookup = {}
    for column in frame.columns:
        lookup[_squash(column)] = column
    hits = (lookup[_squash(cand)] for cand in candidates if _squash(cand) in lookup)
    return next(hits, None)

def to_num(series):
    """Coerce a column of messy values to numbers.

    Float-typed input is returned through ``pd.to_numeric`` untouched. Sending
    it through ``str`` would render very small/large values in scientific
    notation (``1e-05``, ``2e+16``), and the character stripping below would
    then turn them into NaN or a wrong number.

    Everything else is treated as text and cleaned before parsing: percent
    signs, a "round" label, a leading "R", a trailing ordinal suffix
    (st/nd/rd/th), thousands commas, and finally every character that is not a
    digit, dot or minus sign are removed.

    Returns a Series on the input's index; unparseable entries become NaN.
    """
    if pd.api.types.is_float_dtype(series):
        return pd.to_numeric(series, errors='coerce')
    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def rmse(y_true, y_pred) -> float:
    """Root-mean-squared error of ``y_pred`` against ``y_true`` as a plain Python float."""
    mse = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse)
    return float(root)

def winsorize_df(X: pd.DataFrame, lower=0.01, upper=0.99):
    """Return a copy of ``X`` with each column clipped to its own quantile band.

    The ``lower`` and ``upper`` quantiles are computed per column. A column is
    left unchanged when either bound is not finite (for example when the column
    holds no values). The input frame is not modified.
    """
    clipped = X.copy()
    bounds = clipped.quantile([lower, upper])
    for col in clipped.columns:
        lo = bounds.loc[lower, col]
        hi = bounds.loc[upper, col]
        if not (np.isfinite(lo) and np.isfinite(hi)):
            continue
        clipped[col] = clipped[col].clip(lo, hi)
    return clipped

def basic_interactions(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with sign-flipped columns and pairwise product features.

    Columns where a lower raw value is better (times, draft round, draft age)
    are negated first. Each product column is then added only when both of its
    inputs are present, and is built from the already-negated values.
    The input frame is not modified.
    """
    out = X.copy()

    # invert "lower is better"
    lower_is_better = ("40 Time","Draft Capital","Shuttle","Three Cone","Draft Age")
    for col in lower_is_better:
        if col not in out.columns:
            continue
        out[col] = -out[col]

    # (new column, left factor, right factor), added in this order
    products = (
        ("BMIx40",    "BMI",   "40 Time"),
        ("ELUxYCOA",  "ELU",   "YCO/A"),
        ("DOMxDraft", "DOM++", "Draft Capital"),
        ("YPCxELU",   "YPC",   "ELU"),
    )
    for new_col, left, right in products:
        if left in out and right in out:
            out[new_col] = out[left] * out[right]
    return out

# ---------- load & build feature matrix ----------
df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]

# Resolve the target up front so a missing column fails with a readable message
# instead of the opaque `KeyError: None` that `df[None]` would raise below.
y_col = find_col(df, TARGET_CANDS)
if y_col is None:
    raise KeyError(f"No target column found in {CSV_PATH}; looked for any of {TARGET_CANDS}")
name_col = find_col(df, NAME_CANDS)  # may be None; handled when `names` is built

# canonical feature name -> actual column label, keeping only features present in the file
mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
mapped = {k:v for k,v in mapped.items() if v is not None}
if not mapped:
    raise KeyError(f"None of the expected feature columns were found in {CSV_PATH}")

X_raw = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y     = to_num(df[y_col])
if name_col is None:
    # Names are only used for reporting, so fall back to row placeholders.
    names = pd.Series([f"row_{i}" for i in range(len(df))], index=df.index)
else:
    # fillna must come before astype(str); otherwise missing names become the text "nan"
    names = df[name_col].fillna("").astype(str)

# drop rows with missing target
mask = y.notna()
X_raw, y, names = X_raw.loc[mask].reset_index(drop=True), y.loc[mask].reset_index(drop=True), names.loc[mask].reset_index(drop=True)

# filter sparse/constant features:
# keep a column only if it has at least max(10, 25% of rows) values and more than two distinct values
keep = [c for c in X_raw.columns if X_raw[c].notna().sum() >= max(10, int(0.25*len(X_raw))) and X_raw[c].nunique(dropna=True) > 2]
X_raw = X_raw[keep]

# winsorize to reduce outlier drag
X_raw = winsorize_df(X_raw, 0.01, 0.99)

# add interactions + invert step
X = basic_interactions(X_raw)

# SimpleImputer silently discards columns that are entirely missing (an
# interaction of two columns with no overlapping rows, for instance), which would
# leave `feature_order` longer than X_imp and misalign the importances later on.
X = X.dropna(axis=1, how="all")

# impute
imp = SimpleImputer(strategy="median")
X_imp = imp.fit_transform(X)
feature_order = list(X.columns)

# ---------- build tuned base models ----------
# All pre-sampled candidate values come from this seeded generator so that a
# Restart & Run All reproduces the same search space.
rng = np.random.RandomState(RANDOM_SEED)

# Numeric hyper-parameters: N_ITER values are drawn up front and handed to
# RandomizedSearchCV as the candidate list for that parameter.
gb = GradientBoostingRegressor(random_state=RANDOM_SEED)
gb_param = {
    "n_estimators":   rng.randint(600, 1400, N_ITER),
    "learning_rate":  rng.uniform(0.02, 0.08, N_ITER),
    "max_depth":      rng.randint(2, 5, N_ITER),
    "subsample":      rng.uniform(0.75, 1.0, N_ITER),
    "min_samples_split": rng.randint(2, 8, N_ITER),
    "min_samples_leaf":  rng.randint(1, 4, N_ITER),
}

# Categorical / mixed-type options are given as plain lists. The previous
# np.random.choice(...) calls had two problems:
#   * they drew from the unseeded global NumPy RNG, so runs were not reproducible;
#   * np.random.choice turns ["sqrt", 0.6, 0.8, 1.0] into a string array, so the
#     numeric options reached the estimator as "0.6", "0.8", "1.0" — not valid
#     max_features values — and those candidates could not be fit.
rf = RandomForestRegressor(random_state=RANDOM_SEED, n_jobs=N_JOBS)
rf_param = {
    "n_estimators":   rng.randint(700, 1400, N_ITER),
    "max_depth":      [None, 10, 12, 16, 20],
    "min_samples_split": rng.randint(2, 8, N_ITER),
    "min_samples_leaf":  rng.randint(1, 4, N_ITER),
    "max_features":   ["sqrt", 0.6, 0.8, 1.0],
}

et = ExtraTreesRegressor(random_state=RANDOM_SEED, n_jobs=N_JOBS)
et_param = {
    "n_estimators":   rng.randint(700, 1400, N_ITER),
    "max_depth":      [None, 12, 16, 20],
    "min_samples_split": rng.randint(2, 8, N_ITER),
    "min_samples_leaf":  rng.randint(1, 4, N_ITER),
    "max_features":   ["sqrt", 0.6, 0.8, 1.0],
}

hgb = HistGradientBoostingRegressor(random_state=RANDOM_SEED)
hgb_param = {
    "learning_rate":  rng.uniform(0.02, 0.1, N_ITER),
    "max_depth":      rng.randint(2, 10, N_ITER),
    "max_bins":       rng.randint(128, 255, N_ITER),
    "l2_regularization": rng.uniform(0.0, 1.0, N_ITER),
}

def fit_best(base, param_dist):
    """Run a randomized search for ``base`` and return the refit best estimator.

    Samples ``N_ITER`` configurations from ``param_dist``, scores each with
    3-fold CV (R²) on the module-level ``X_imp`` / ``y``, and returns
    ``best_estimator_`` (refit on all rows by the search).
    """
    search = RandomizedSearchCV(base, param_distributions=param_dist, n_iter=N_ITER,
                                scoring="r2", cv=3, n_jobs=N_JOBS, random_state=RANDOM_SEED, verbose=0)
    return search.fit(X_imp, y).best_estimator_

gb_best  = fit_best(gb,  gb_param)
rf_best  = fit_best(rf,  rf_param)
et_best  = fit_best(et,  et_param)
hgb_best = fit_best(hgb, hgb_param)

# Column order of every stacked prediction matrix below follows this list.
base_models = [("gb", gb_best), ("hgb", hgb_best), ("rf", rf_best), ("et", et_best)]

# ---------- OOF predictions with RepeatedKFold ----------
# oof[i, j] = mean held-out prediction of base model j for row i across the repeated folds.
rkf = RepeatedKFold(n_splits=N_SPLITS, n_repeats=N_REPEATS, random_state=RANDOM_SEED)
oof = np.zeros((len(X_imp), len(base_models)))

for j,(nm, mdl) in enumerate(base_models):
    fold_pred = np.zeros(len(X_imp))
    # average multiple OOF passes per sample across repeats/folds
    counts = np.zeros(len(X_imp))  # number of held-out predictions accumulated per row
    for tr_idx, val_idx in rkf.split(X_imp):
        # refits the shared estimator object in place on the training part of the fold
        mdl.fit(X_imp[tr_idx], y.iloc[tr_idx])
        p = mdl.predict(X_imp[val_idx])
        fold_pred[val_idx] += p
        counts[val_idx] += 1
    # sum -> mean; rows that never received a prediction stay at 0 instead of dividing by zero
    fold_pred = np.divide(fold_pred, counts, out=np.zeros_like(fold_pred), where=counts>0)
    oof[:, j] = fold_pred

# ---------- non-negative meta (NNLS) on OOF + isotonic calibration ----------
# standardize OOF before NNLS? For NNLS we keep raw scale and let it pick non-negative weights
# No intercept column is included, so the blend is a pure non-negative weighted sum;
# the second return value of nnls is discarded.
w_nnls, _ = nnls(oof, y.values)
stack_oof = oof @ w_nnls

# calibrate with isotonic regression
iso = IsotonicRegression(out_of_bounds="clip")
stack_oof_cal = iso.fit_transform(stack_oof, y.values)

# OOF metrics
# NOTE(review): both the NNLS weights and the isotonic map were fit against y on
# these same rows, so these figures are out-of-fold for the base models only.
oof_r2   = r2_score(y, stack_oof_cal)
oof_mae  = mean_absolute_error(y, stack_oof_cal)
oof_rmse = rmse(y, stack_oof_cal)

# ---------- refit bases on ALL rows + produce calibrated predictions ----------
base_full_preds = []
for nm, mdl in base_models:
    mdl.fit(X_imp, y)
    base_full_preds.append(mdl.predict(X_imp))
# one column per base model, in base_models order (matches the OOF layout and w_nnls)
base_full_preds = np.vstack(base_full_preds).T

y_full_raw = base_full_preds @ w_nnls
y_full_hat = iso.transform(y_full_raw)  # calibrated

# full fit (in-sample) metrics
full_r2   = r2_score(y, y_full_hat)
full_mae  = mean_absolute_error(y, y_full_hat)
full_rmse = rmse(y, y_full_hat)

# ---------- export per-player predictions ----------
OUT_DIR.mkdir(parents=True, exist_ok=True)
# One row per player with the calibrated in-sample prediction, best actual grade first.
players_out = pd.DataFrame({
    "Player": names.values,
    "Actual_RB_Grade": y.values,
    "Predicted_RB_Grade": y_full_hat,
    "Error": (y_full_hat - y.values)
}).sort_values("Actual_RB_Grade", ascending=False)
players_out.to_csv(PLAYERS_CSV, index=False)

# ---------- feature impact (meta-weighted) ----------
# compute normalized importances for each base
def norm_imp(estimator, feats):
    """Return the estimator's feature importances as a Series over ``feats`` summing to ~1.

    Estimators without a ``feature_importances_`` attribute (the
    HistGradientBoosting model here) get an all-zero vector rather than a
    made-up ranking, so they contribute nothing to the blended impact.
    """
    if hasattr(estimator, "feature_importances_"):
        s = pd.Series(estimator.feature_importances_, index=feats)
    else:
        s = pd.Series(np.zeros(len(feats)), index=feats)
    # the epsilon keeps the all-zero case at zero instead of producing NaN
    s = s / (s.sum() + 1e-12)
    return s

imps = [norm_imp(m, feature_order) for _,m in base_models]

# convert meta weights to convex weights (non-negative, L1 normalized)
meta_w = w_nnls / (np.sum(w_nnls) + 1e-12)

# blend importances
# (models with an all-zero importance vector still hold meta weight, so the
#  blended column can sum to less than 1)
combined = sum(meta_w[j] * imps[j] for j in range(len(imps)))
impact_df = pd.DataFrame({"Impact_MetaWeighted": combined})
for j,(nm,_) in enumerate(base_models):
    impact_df[f"{nm.upper()}_Importance"] = imps[j]
impact_df = impact_df.sort_values("Impact_MetaWeighted", ascending=False)
# The feature names live only in the index, so it must be written out;
# index=False produced a table of numbers with no way to tell which row was which.
impact_df.to_csv(IMPACT_CSV, index_label="Feature")

# ---------- summary ----------
print("\n=== Tightened Stack Meta — Metrics (Draft Age included) ===")
print(f"OOF R²:   {oof_r2:.4f}")
print(f"OOF MAE:  {oof_mae:.4f}")
print(f"OOF RMSE: {oof_rmse:.4f}")
print(f"\nFull Fit R²:   {full_r2:.4f}")
print(f"Full Fit MAE:  {full_mae:.4f}")
print(f"Full Fit RMSE: {full_rmse:.4f}")

print(f"\nSaved per-player predictions → {PLAYERS_CSV}")
print(f"Saved feature impact table  → {IMPACT_CSV}")

print("\nTop players by Actual RB Grade:")
print(players_out.head(15).round(3))

print("\nTop features by meta-weighted impact:")
print(impact_df.head(15).round(4))


In [None]:
# ===========================
# Apply Stack Ridge Meta to entire dataset
# Train on ALL rows, predict for ALL rows
# Now includes Draft Age (younger is better -> inverted)
# ===========================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# ---------- config ----------
# Input table and output file; paths are relative to the kernel's working directory.
CSV_PATH    = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
# NOTE(review): this is the same file the "Full-dataset application + metrics" cell
# writes as PLAYERS_CSV, so whichever cell runs last overwrites the other's output.
OUT_PATH    = Path("./data/Bakery/_derived/stack_ridge_all_players.csv")
RANDOM_SEED = 42  # shared by the base estimators and the KFold split
N_JOBS      = -1  # passed as n_jobs to RandomizedSearchCV

# canonical features (+ Draft Age; still no Breakout Age)
# Maps canonical feature name -> header spellings to look for, in priority order.
# NOTE(review): "Rec Yards" is not among its own aliases, so a column spelled
# exactly "Rec Yards" would not be picked up — confirm this is intended.
ALIASES = {
    "DOM++":         ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":       ["40 Time","Forty","40"],
    "BMI":           ["BMI"],
    "YPC":           ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":           ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":         ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":        ["Break%","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital": ["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":          ["Bama","Bama Rating","BamaAdj","BAMA"],
    "Shuttle":       ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":    ["3 Cone","Three Cone","3-Cone"],
    "Rec Yards":     ["Receiving Yards","Rec Yds","RecYds"],
    # NEW: Draft Age (younger is better -> will invert)
    "Draft Age":     ["Draft Age","Age at Draft","DraftAge","Age (Draft)","AgeDraft","Age_at_Draft"],
}
# Accepted header spellings for the regression target and the player-name column.
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]
NAME_CANDS   = ["Player","Player Name","Name"]

# ---------- helpers ----------
def find_col(frame, candidates):
    """Return the actual column label in ``frame`` matching the first usable candidate.

    Matching ignores case and all whitespace. Candidates are tried in the
    order given; ``None`` is returned when none of them matches a column.
    If two columns normalise to the same key, the later column wins.
    """
    def _squash(label):
        # "RB  Grade" -> "rbgrade"
        return re.sub(r"\s+","",label).lower()

    lookup = {}
    for column in frame.columns:
        lookup[_squash(column)] = column
    hits = (lookup[_squash(cand)] for cand in candidates if _squash(cand) in lookup)
    return next(hits, None)

def to_num(series):
    """Coerce a column of messy values to numbers.

    Float-typed input is returned through ``pd.to_numeric`` untouched. Sending
    it through ``str`` would render very small/large values in scientific
    notation (``1e-05``, ``2e+16``), and the character stripping below would
    then turn them into NaN or a wrong number.

    Everything else is treated as text and cleaned before parsing: percent
    signs, a "round" label, a leading "R", a trailing ordinal suffix
    (st/nd/rd/th), thousands commas, and finally every character that is not a
    digit, dot or minus sign are removed.

    Returns a Series on the input's index; unparseable entries become NaN.
    """
    if pd.api.types.is_float_dtype(series):
        return pd.to_numeric(series, errors='coerce')
    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def basic_interactions(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with sign-flipped columns and pairwise product features.

    Columns where a lower raw value is better (times, draft round, draft age)
    are negated first. Each product column is then added only when both of its
    inputs are present, and is built from the already-negated values.
    The input frame is not modified.
    """
    out = X.copy()

    # invert "lower is better" (faster times, earlier rounds, younger age)
    lower_is_better = ("40 Time","Draft Capital","Shuttle","Three Cone","Draft Age")
    for col in lower_is_better:
        if col not in out.columns:
            continue
        out[col] = -out[col]

    # (new column, left factor, right factor), added in this order
    products = (
        ("BMIx40",    "BMI",   "40 Time"),
        ("ELUxYCOA",  "ELU",   "YCO/A"),
        ("DOMxDraft", "DOM++", "Draft Capital"),
        ("YPCxELU",   "YPC",   "ELU"),
    )
    for new_col, left, right in products:
        if left in out and right in out:
            out[new_col] = out[left] * out[right]
    return out

# ---------- load ----------
df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]

# Resolve the target up front so a missing column fails with a readable message
# instead of the opaque `KeyError: None` that `df[None]` would raise below.
y_col = find_col(df, TARGET_CANDS)
if y_col is None:
    raise KeyError(f"No target column found in {CSV_PATH}; looked for any of {TARGET_CANDS}")
name_col = find_col(df, NAME_CANDS)  # may be None; handled when `names` is built

# canonical feature name -> actual column label, keeping only features present in the file
mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
mapped = {k:v for k,v in mapped.items() if v is not None}
if not mapped:
    raise KeyError(f"None of the expected feature columns were found in {CSV_PATH}")

X = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y = to_num(df[y_col])
if name_col is None:
    # Names are only used for reporting, so fall back to row placeholders.
    names = pd.Series([f"row_{i}" for i in range(len(df))], index=df.index)
else:
    # fillna must come before astype(str); otherwise missing names become the text "nan"
    names = df[name_col].fillna("").astype(str)

# drop rows with missing target (original row labels are kept; later code indexes by position)
mask = y.notna()
X, y, names = X.loc[mask], y.loc[mask], names.loc[mask]

# interactions (+ invert step)
X = basic_interactions(X)

# SimpleImputer silently discards columns that are entirely missing; dropping
# them here keeps X's columns aligned with the columns of X_imp.
X = X.dropna(axis=1, how="all")

# impute
imp = SimpleImputer(strategy="median")
X_imp = imp.fit_transform(X)

# ---------- train base models (with light tuning) ----------
# Each "grid" holds a single configuration and n_iter=1, so the search only
# cross-validates that one setting and returns it refit on all rows.
# NOTE(review): no random_state is passed to RandomizedSearchCV here, unlike the other cells.
gb = GradientBoostingRegressor(random_state=RANDOM_SEED)
gb_param = {"n_estimators":[800], "learning_rate":[0.05], "max_depth":[4], "subsample":[0.85]}
gb_best = RandomizedSearchCV(gb, gb_param, n_iter=1, scoring="r2", cv=3, n_jobs=N_JOBS).fit(X_imp, y).best_estimator_

rf = RandomForestRegressor(random_state=RANDOM_SEED)
rf_param = {"n_estimators":[800], "max_depth":[12], "min_samples_split":[2], "min_samples_leaf":[1]}
rf_best = RandomizedSearchCV(rf, rf_param, n_iter=1, scoring="r2", cv=3, n_jobs=N_JOBS).fit(X_imp, y).best_estimator_

# The same estimator objects are refit in place by the loops below.
base_models = [("gb", gb_best), ("rf", rf_best)]

# ---------- stacking with OOF ----------
# oof[i, j] = prediction of base model j for row i from the fold in which row i was held out.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
oof = np.zeros((len(X_imp), len(base_models)))
for j,(nm, mdl) in enumerate(base_models):
    fold_preds = np.zeros(len(X_imp))
    for tr_idx, val_idx in kf.split(X_imp):
        # y keeps its original row labels after masking, hence positional .iloc
        mdl.fit(X_imp[tr_idx], y.iloc[tr_idx])
        fold_preds[val_idx] = mdl.predict(X_imp[val_idx])
    oof[:, j] = fold_preds

# meta learner (Ridge on standardized OOF)
sc_meta = StandardScaler()
Z = sc_meta.fit_transform(oof)
ridge_meta = RidgeCV(alphas=np.logspace(-3, 2, 30)).fit(Z, y)

# retrain base models on full data
full_preds = []
for nm, mdl in base_models:
    mdl.fit(X_imp, y)
    full_preds.append(mdl.predict(X_imp))
# one column per base model, in base_models order; scaled with the scaler fit on the OOF matrix
stack_inputs = np.vstack(full_preds).T
stack_inputs = sc_meta.transform(stack_inputs)

# in-sample predictions: the bases have just been fit on these same rows
y_hat = ridge_meta.predict(stack_inputs)

# ---------- results ----------
out = pd.DataFrame({
    "Player": names.values,
    "Actual_RB_Grade": y.values,
    "Predicted_RB_Grade": y_hat,
    "Error": y_hat - y.values
}).sort_values("Actual_RB_Grade", ascending=False)

print("\n=== Full Dataset Results (Stack Ridge Meta + Draft Age) ===")
print(out.head(20).round(3))

# create the output folder if needed, then write every player's row
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUT_PATH, index=False)
print(f"\nSaved → {OUT_PATH}")


In [None]:
# ===========================
# Apply Stack Ridge Meta to entire dataset
# Train on ALL rows, predict for ALL rows
# Draft Age is INCLUDED in the STACK META features (not just in bases)
# ===========================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# ---------- config ----------
# Input table and output file; paths are relative to the kernel's working directory.
CSV_PATH    = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
# NOTE(review): same OUT_PATH as the previous "Apply Stack Ridge Meta" cell, so this
# cell's results overwrite that cell's file even though the meta-model differs.
OUT_PATH    = Path("./data/Bakery/_derived/stack_ridge_all_players.csv")
RANDOM_SEED = 42  # shared by the base estimators and the KFold split
N_JOBS      = -1  # passed as n_jobs to RandomizedSearchCV

# canonical features (+ Draft Age; still no Breakout Age)
# Maps canonical feature name -> header spellings to look for, in priority order.
# NOTE(review): "Rec Yards" is not among its own aliases, so a column spelled
# exactly "Rec Yards" would not be picked up — confirm this is intended.
ALIASES = {
    "DOM++":         ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":       ["40 Time","Forty","40"],
    "BMI":           ["BMI"],
    "YPC":           ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":           ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":         ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":        ["Break%","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital": ["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":          ["Bama","Bama Rating","BamaAdj","BAMA"],
    "Shuttle":       ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":    ["3 Cone","Three Cone","3-Cone"],
    "Rec Yards":     ["Receiving Yards","Rec Yds","RecYds"],
    # Draft Age (younger is better -> will invert)
    "Draft Age":     ["Draft Age","Age at Draft","DraftAge","Age (Draft)","AgeDraft","Age_at_Draft"],
}
# Accepted header spellings for the regression target and the player-name column.
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]
NAME_CANDS   = ["Player","Player Name","Name"]

# ---------- helpers ----------
def find_col(frame, candidates):
    """Return the actual column label in ``frame`` matching the first usable candidate.

    Matching ignores case and all whitespace. Candidates are tried in the
    order given; ``None`` is returned when none of them matches a column.
    If two columns normalise to the same key, the later column wins.
    """
    def _squash(label):
        # "RB  Grade" -> "rbgrade"
        return re.sub(r"\s+","",label).lower()

    lookup = {}
    for column in frame.columns:
        lookup[_squash(column)] = column
    hits = (lookup[_squash(cand)] for cand in candidates if _squash(cand) in lookup)
    return next(hits, None)

def to_num(series):
    """Coerce a column of messy values to numbers.

    Float-typed input is returned through ``pd.to_numeric`` untouched. Sending
    it through ``str`` would render very small/large values in scientific
    notation (``1e-05``, ``2e+16``), and the character stripping below would
    then turn them into NaN or a wrong number.

    Everything else is treated as text and cleaned before parsing: percent
    signs, a "round" label, a leading "R", a trailing ordinal suffix
    (st/nd/rd/th), thousands commas, and finally every character that is not a
    digit, dot or minus sign are removed.

    Returns a Series on the input's index; unparseable entries become NaN.
    """
    if pd.api.types.is_float_dtype(series):
        return pd.to_numeric(series, errors='coerce')
    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def basic_interactions(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with sign-flipped columns and pairwise product features.

    Columns where a lower raw value is better (times, draft round, draft age)
    are negated first. Each product column is then added only when both of its
    inputs are present, and is built from the already-negated values.
    The input frame is not modified.
    """
    out = X.copy()

    # invert "lower is better" (faster times, earlier rounds, younger age)
    lower_is_better = ("40 Time","Draft Capital","Shuttle","Three Cone","Draft Age")
    for col in lower_is_better:
        if col not in out.columns:
            continue
        out[col] = -out[col]

    # baseline interactions: (new column, left factor, right factor), added in this order
    products = (
        ("BMIx40",    "BMI",   "40 Time"),
        ("ELUxYCOA",  "ELU",   "YCO/A"),
        ("DOMxDraft", "DOM++", "Draft Capital"),
        ("YPCxELU",   "YPC",   "ELU"),
    )
    for new_col, left, right in products:
        if left in out and right in out:
            out[new_col] = out[left] * out[right]
    return out

# ---------- load ----------
df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]

# Resolve the target up front so a missing column fails with a readable message
# instead of the opaque `KeyError: None` that `df[None]` would raise below.
y_col = find_col(df, TARGET_CANDS)
if y_col is None:
    raise KeyError(f"No target column found in {CSV_PATH}; looked for any of {TARGET_CANDS}")
name_col = find_col(df, NAME_CANDS)  # may be None; handled when `names` is built

# canonical feature name -> actual column label, keeping only features present in the file
mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
mapped = {k:v for k,v in mapped.items() if v is not None}
if not mapped:
    raise KeyError(f"None of the expected feature columns were found in {CSV_PATH}")

X = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y = to_num(df[y_col])
if name_col is None:
    # Names are only used for reporting, so fall back to row placeholders.
    names = pd.Series([f"row_{i}" for i in range(len(df))], index=df.index)
else:
    # fillna must come before astype(str); otherwise missing names become the text "nan"
    names = df[name_col].fillna("").astype(str)

# drop rows with missing target (original row labels are kept; later code indexes by position)
mask = y.notna()
X, y, names = X.loc[mask], y.loc[mask], names.loc[mask]

# interactions (+ invert step)
X = basic_interactions(X)

# SimpleImputer silently discards columns that are entirely missing. Without this
# drop, `feature_order` would be longer than X_imp and `draft_age_idx` could
# point at the wrong column of the imputed matrix.
X = X.dropna(axis=1, how="all")

# impute
imp = SimpleImputer(strategy="median")
X_imp = imp.fit_transform(X)
feature_order = list(X.columns)

# keep track of Draft Age column index (post inversion)
draft_age_idx = feature_order.index("Draft Age") if "Draft Age" in feature_order else None

# ---------- train base models (with light tuning) ----------
# Each "grid" holds a single configuration and n_iter=1, so the search only
# cross-validates that one setting and returns it refit on all rows.
# NOTE(review): no random_state is passed to RandomizedSearchCV here, unlike the other cells.
gb = GradientBoostingRegressor(random_state=RANDOM_SEED)
gb_param = {"n_estimators":[800], "learning_rate":[0.05], "max_depth":[4], "subsample":[0.85]}
gb_best = RandomizedSearchCV(gb, gb_param, n_iter=1, scoring="r2", cv=3, n_jobs=N_JOBS).fit(X_imp, y).best_estimator_

rf = RandomForestRegressor(random_state=RANDOM_SEED)
rf_param = {"n_estimators":[800], "max_depth":[12], "min_samples_split":[2], "min_samples_leaf":[1]}
rf_best = RandomizedSearchCV(rf, rf_param, n_iter=1, scoring="r2", cv=3, n_jobs=N_JOBS).fit(X_imp, y).best_estimator_

# The same estimator objects are refit in place by the loops below.
base_models = [("gb", gb_best), ("rf", rf_best)]

# ---------- stacking with OOF ----------
# oof[i, j] = prediction of base model j for row i from the fold in which row i was held out.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
oof = np.zeros((len(X_imp), len(base_models)))
for j,(nm, mdl) in enumerate(base_models):
    fold_preds = np.zeros(len(X_imp))
    for tr_idx, val_idx in kf.split(X_imp):
        # y keeps its original row labels after masking, hence positional .iloc
        mdl.fit(X_imp[tr_idx], y.iloc[tr_idx])
        fold_preds[val_idx] = mdl.predict(X_imp[val_idx])
    oof[:, j] = fold_preds

# === META INPUTS: OOF preds + Draft Age (imputed & standardized) ===
# With Draft Age: the scaler is fit on Draft Age only and the OOF columns enter unscaled.
# Without it: the scaler is fit on the OOF columns instead.
# The predict-time branches further down mirror this choice.
if draft_age_idx is not None:
    draft_age_meta = X_imp[:, draft_age_idx].reshape(-1, 1)
    sc_meta = StandardScaler()
    Z = np.hstack([oof, sc_meta.fit_transform(draft_age_meta)])
else:
    sc_meta = StandardScaler()
    Z = sc_meta.fit_transform(oof)

# meta learner (Ridge on meta inputs above)
ridge_meta = RidgeCV(alphas=np.logspace(-3, 2, 30)).fit(Z, y)

# retrain base models on full data
full_preds = []
for nm, mdl in base_models:
    mdl.fit(X_imp, y)
    full_preds.append(mdl.predict(X_imp))
# one column per base model, in base_models order (matches the OOF layout)
stack_inputs = np.vstack(full_preds).T

# add Draft Age to meta at predict-time too (use same scaler)
if draft_age_idx is not None:
    draft_age_meta_full = X_imp[:, draft_age_idx].reshape(-1, 1)
    stack_inputs_meta = np.hstack([stack_inputs, sc_meta.transform(draft_age_meta_full)])
else:
    stack_inputs_meta = sc_meta.transform(stack_inputs)

# in-sample predictions: the bases have just been fit on these same rows
y_hat = ridge_meta.predict(stack_inputs_meta)

# ---------- results ----------
out = pd.DataFrame({
    "Player": names.values,
    "Actual_RB_Grade": y.values,
    "Predicted_RB_Grade": y_hat,
    "Error": y_hat - y.values
}).sort_values("Actual_RB_Grade", ascending=False)

print("\n=== Full Dataset Results (Stack Ridge Meta + Draft Age in META) ===")
print(out.head(20).round(3))

# create the output folder if needed, then write every player's row
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUT_PATH, index=False)
print(f"\nSaved → {OUT_PATH}")


In [None]:
# ===== Reverse-engineer Bakery RB Grade from Bakery_RB_Overall.csv (non-negative weights, no Breakout Age) =====
import re, json, numpy as np, pandas as pd
from pathlib import Path
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from scipy.optimize import nnls

# ---------- config ----------
# Input sheet, the folder it lives in, and the folder derived artifacts are written to.
CSV_PATH = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
ROOT = CSV_PATH.parent

OUT_DIR = Path("./data/Bakery/_derived")
OUT_DIR.mkdir(parents=True, exist_ok=True)  # created up front so later writes cannot fail on a missing folder

# ---------- helpers ----------
def find_col(frame, candidates):
    """Return the actual column name in ``frame`` matching the first usable candidate.

    Names are compared case-insensitively with all whitespace removed.
    Candidates are tried in the order given; ``None`` is returned when no
    candidate matches. When two columns normalize to the same key, the later
    column is the one returned.
    """
    def _squash(label):
        return re.sub(r"\s+", "", label).lower()

    lookup = {}
    for column in frame.columns:
        lookup[_squash(column)] = column  # later duplicates overwrite earlier ones

    for wanted in map(_squash, candidates):
        if wanted in lookup:
            return lookup[wanted]
    return None

def to_num(series):
    """Coerce a column of spreadsheet-style values to numbers (NaN when unparseable).

    Columns that already hold plain numeric values are handed straight to
    ``pd.to_numeric``. Sending them through the text clean-up below would
    stringify very small/large floats in scientific notation (``1e-05``) and
    the final character filter would then strip the ``e``, silently turning a
    valid number into NaN.

    Text columns are cleaned before parsing: percent signs, ``Round``/``R``
    prefixes, ordinal suffixes (``1st``, ``2nd`` ...), thousands separators
    and finally every character other than digits, ``.`` and ``-``.
    """
    dtype = series.dtype
    if isinstance(dtype, np.dtype) and dtype.kind in "iuf":
        # already int/uint/float: nothing to clean, avoid the lossy str round-trip
        return pd.to_numeric(series, errors='coerce')
    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

# Canonical feature name -> header spellings to try, in order (NO Breakout Age).
# Dict order is significant: it becomes the column order of the feature matrix.
ALIASES = {
    "DOM++": ["DOM++", "DOMpp", "DOM_plus_plus", "DOMpp_Weighted", "DOM"],
    "40 Time": ["40 Time", "Forty", "40"],
    "Draft Age": ["Draft Age", "Age at Draft", "DraftAge", "Age (Draft)", "AgeDraft", "Age_at_Draft"],
    "BMI": ["BMI"],
    "YPC": ["YPC", "Yards per Carry", "Yards/Carry", "Rushing YPC"],
    "ELU": ["ELU", "Elusiveness", "Elusiveness Rating"],
    "YCO/A": ["YCO/A", "YAC/A", "Yards After Contact / Att", "Yards After Contact per Attempt"],
    "Break%": ["Break%", "Breakaway %", "Breakaway Percentage", "Breakaway%"],
    "Draft Capital": ["Draft Capital", "Draft Cap", "Draft Round", "Round", "Rnd"],
    "Bama": ["Bama", "Bama Rating", "BamaAdj"],
    # optional extras, used only when the sheet has them
    "Shuttle": ["Shuttle", "Short Shuttle", "20 Shuttle", "20 Yard Shuttle"],
    "Three Cone": ["3 Cone", "Three Cone", "3-Cone"],
    "Vertical": ["Vertical", "Vertical Jump"],
    "Broad": ["Broad", "Broad Jump"],
    "Speed Score": ["Speed Score", "SpeedScore"],
    "Rec Yards": ["Receiving Yards", "Rec Yds", "RecYds"],
    "Targets": ["Targets", "Target Share", "Tgt%"],
}

# Header spellings accepted for the target column.
TARGET_CANDS = ["RB Grade", "RBGrade", "RB_Grade"]

# ---------- load ----------
# If the configured file is missing, use the first "Bakery_RB_Overall*.csv" found
# in the same folder; CSV_PATH is rebound so the log line below shows the file used.
if not CSV_PATH.exists():
    # fall back to any similarly named overall file
    candidates = list(ROOT.glob("Bakery_RB_Overall*.csv"))
    if not candidates:
        raise FileNotFoundError(f"Could not find {CSV_PATH} or any Bakery_RB_Overall*.csv under {ROOT}")
    CSV_PATH = candidates[0]

df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]
print("Loaded:", CSV_PATH)
print("Rows x Cols:", df.shape)

# ---------- map target + features ----------
y_col = find_col(df, TARGET_CANDS)
if not y_col:
    raise ValueError(f"Could not find RB Grade in columns:\n{df.columns.tolist()}")

# canonical feature name -> actual sheet column; features with no matching column are dropped
mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
mapped = {k:v for k,v in mapped.items() if v is not None}

if not mapped:
    raise ValueError("No usable feature columns found. Inspect df.columns for header names.")

print("\nUsing features (canonical <- sheet column):")
for k,v in mapped.items():
    print(f"  {k:<12} <- {v}")

# Numeric feature frame (columns named by canonical feature) and numeric target
X_raw = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y_raw = to_num(df[y_col])

# ---------- drop rows with NaN TARGET ----------
mask = y_raw.notna()
dropped = len(y_raw) - mask.sum()
if dropped:
    print(f"\nDropped {dropped} rows with NaN RB Grade.")
X_raw = X_raw.loc[mask].reset_index(drop=True)
y = y_raw.loc[mask].reset_index(drop=True)

# ---------- keep columns with enough data (loose thresholds for real-world sheets) ----------
# A feature survives when it has at least 5 non-missing values and more than one distinct value.
keep = [c for c in X_raw.columns if X_raw[c].notna().sum() >= 5 and X_raw[c].nunique(dropna=True) > 1]
if not keep:
    raise ValueError("All candidate features are too sparse/constant. "
                     "Relax thresholds or ensure the Overall file has those columns filled.")
X_raw = X_raw[keep]
print("Kept features:", keep)

# ---------- invert where lower is better (NO Breakout Age) ----------
# Negating makes "higher is better" hold for these columns, which matters because
# the models below only allow non-negative weights.
# NOTE(review): "Draft Age" is not in this list although later cells in this notebook
# invert it; with non-negative weights an un-inverted lower-is-better feature would
# tend to be clamped to a zero weight — confirm this is intended.
for c in ["40 Time","Draft Capital","Shuttle","Three Cone"]:
    if c in X_raw.columns:
        X_raw[c] = -X_raw[c]

# ---------- impute X (median) + standardize ----------
imp = SimpleImputer(strategy="median")
X_imputed = imp.fit_transform(X_raw)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# final NaN guards
if np.isnan(X_scaled).any():
    raise ValueError("X still contains NaNs after imputation/standardization. Please inspect your data.")

if y.isna().any():
    raise ValueError("y contains NaNs after filtering; this should not happen.")

# ---------- fit non-negative models ----------
# results maps model name -> (in-sample R², Series of weights indexed by feature name)
results = {}

# (A) Positive OLS
ols_pos = LinearRegression(positive=True)
ols_pos.fit(X_scaled, y)
r2_ols = float(ols_pos.score(X_scaled, y)) if y.var() > 0 else float("nan")
results["OLS_Positive"] = (r2_ols, pd.Series(ols_pos.coef_, index=X_raw.columns))

# (B) NNLS with mean intercept (stable, non-negative)
# The target is centered on its mean before solving; the mean is added back as the intercept.
y_mean = float(y.mean())
w_nnls, _ = nnls(X_scaled, (y - y_mean).to_numpy())
y_pred = y_mean + X_scaled @ w_nnls
r2_nnls = float(1 - np.sum((y - y_pred)**2) / np.sum((y - y_mean)**2)) if y.var() > 0 else float("nan")
results["NNLS_Positive"] = (r2_nnls, pd.Series(w_nnls, index=X_raw.columns))

# ---------- report ----------
# One row per model: R² plus one "w:<feature>" column per weight.
rows = []
for name, (r2, coefs) in results.items():
    row = {"Model": name, "R2": r2}
    row.update({f"w:{k}": v for k,v in coefs.items()})
    rows.append(row)

comp = pd.DataFrame(rows).set_index("Model").sort_values("R2", ascending=False)
pd.set_option("display.max_columns", None)
print("\n=== Model comparison (non-negative only) ===")
display(comp.round(4))

# comp is sorted by R² descending, so its first index entry is the best model
best_name = comp.index[0]
best_r2, best_coefs = results[best_name]
print(f"\nBest non-negative model: {best_name}  (R²={best_r2:.3f})")
print("\nSorted weights (standardized):")
print(best_coefs.sort_values(ascending=False).round(4))

# ---------- save artifacts for reuse ----------
# Three files are written to OUT_DIR:
#   - rb_weights_<model>.csv   : the winning model's weights, one row per feature
#   - rb_scaler.json           : everything needed to re-apply the preprocessing
#   - rb_feature_mapping.json  : which sheet column fed each canonical feature
weights_path = OUT_DIR / f"rb_weights_{best_name}.csv"
scaler_path  = OUT_DIR / "rb_scaler.json"
meta_path    = OUT_DIR / "rb_feature_mapping.json"

best_coefs.to_csv(weights_path, header=["coef"])
with open(scaler_path, "w") as f:
    json.dump({
        "means": scaler.mean_.tolist(),
        "scales": scaler.scale_.tolist(),
        "feature_order": list(X_raw.columns),
        # Per-feature medians used to fill missing values before scaling. Without
        # them the weights cannot be applied to rows that have gaps.
        "imputer_medians": imp.statistics_.tolist(),
        "intercept_mean": y_mean,
        "model": best_name
    }, f, indent=2)

with open(meta_path, "w") as f:
    json.dump({"mapped_columns": mapped, "kept_features": keep, "target": y_col}, f, indent=2)

print(f"\nSaved weights → {weights_path}")
print(f"Saved scaler   → {scaler_path}")
print(f"Saved mapping  → {meta_path}")


In [None]:
# ===========================
# Apply Stack Ridge Meta to entire dataset
# Train on ALL rows, predict for ALL rows
# Draft Age treated like any other feature (base models only)
# ===========================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# ---------- config ----------
# Input sheet, output CSV, and the knobs shared by every estimator in this cell.
CSV_PATH = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
OUT_PATH = Path("./data/Bakery/_derived/stack_ridge_all_players.csv")
RANDOM_SEED = 42
N_JOBS = -1

# Canonical feature name -> header spellings to try, in order (+ Draft Age; no Breakout Age).
# Dict order is significant: it becomes the column order of the feature matrix.
ALIASES = {
    "DOM++": ["DOM++", "DOMpp", "DOM_plus_plus", "DOMpp_Weighted", "DOM"],
    "40 Time": ["40 Time", "Forty", "40"],
    "BMI": ["BMI"],
    "YPC": ["YPC", "Yards per Carry", "Yards/Carry", "Rushing YPC"],
    "ELU": ["ELU", "Elusiveness", "Elusiveness Rating"],
    "YCO/A": ["YCO/A", "YAC/A", "Yards After Contact / Att", "Yards After Contact per Attempt"],
    "Break%": ["Break%", "Breakaway %", "Breakaway Percentage", "Breakaway%"],
    "Draft Capital": ["Draft Capital", "Draft Cap", "Draft Round", "Round", "Rnd"],
    "Bama": ["Bama", "Bama Rating", "BamaAdj", "BAMA"],
    "Shuttle": ["Shuttle", "Short Shuttle", "20 Shuttle", "20 Yard Shuttle"],
    "Three Cone": ["3 Cone", "Three Cone", "3-Cone"],
    "Rec Yards": ["Receiving Yards", "Rec Yds", "RecYds"],
    # Draft Age (treated as a normal feature)
    "Draft Age": ["Draft Age", "Age at Draft", "DraftAge", "Age (Draft)", "AgeDraft", "Age_at_Draft"],
}

# Header spellings accepted for the target and the player-name columns.
TARGET_CANDS = ["RB Grade", "RBGrade", "RB_Grade"]
NAME_CANDS = ["Player", "Player Name", "Name"]

# ---------- helpers ----------
def find_col(frame, candidates):
    """Look up the first candidate header that exists in ``frame``.

    Comparison ignores case and every whitespace character. Returns the
    column's real name, or ``None`` when no candidate matches. If two columns
    collapse to the same normalized key, the later column is returned.
    """
    normalize = lambda text: re.sub(r"\s+", "", text).lower()

    by_key = {}
    for actual in frame.columns:
        by_key[normalize(actual)] = actual  # later duplicates overwrite earlier ones

    matches = (by_key[normalize(c)] for c in candidates if normalize(c) in by_key)
    return next(matches, None)

def to_num(series):
    """Coerce a column of spreadsheet-style values to numbers (NaN when unparseable).

    Columns that already hold plain numeric values are handed straight to
    ``pd.to_numeric``. Sending them through the text clean-up below would
    stringify very small/large floats in scientific notation (``1e-05``) and
    the final character filter would then strip the ``e``, silently turning a
    valid number into NaN.

    Text columns are cleaned before parsing: percent signs, ``Round``/``R``
    prefixes, ordinal suffixes (``1st``, ``2nd`` ...), thousands separators
    and finally every character other than digits, ``.`` and ``-``.
    """
    dtype = series.dtype
    if isinstance(dtype, np.dtype) and dtype.kind in "iuf":
        # already int/uint/float: nothing to clean, avoid the lossy str round-trip
        return pd.to_numeric(series, errors='coerce')
    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def basic_interactions(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with lower-is-better columns negated and product features added.

    Negated when present: 40 Time, Draft Capital, Shuttle, Three Cone, Draft Age.
    Product columns (each added only when both inputs exist, computed from the
    already-negated values): BMIx40, ELUxYCOA, DOMxDraft, YPCxELU.
    The input frame is not modified.
    """
    feats = X.copy()

    # invert "lower is better" (faster times, earlier rounds, younger age)
    lower_is_better = ("40 Time", "Draft Capital", "Shuttle", "Three Cone", "Draft Age")
    for col in lower_is_better:
        if col in feats.columns:
            feats[col] = -feats[col]

    # baseline interactions: (new column, left factor, right factor)
    products = (
        ("BMIx40", "BMI", "40 Time"),
        ("ELUxYCOA", "ELU", "YCO/A"),
        ("DOMxDraft", "DOM++", "Draft Capital"),
        ("YPCxELU", "YPC", "ELU"),
    )
    for new_col, left, right in products:
        if left in feats and right in feats:
            feats[new_col] = feats[left] * feats[right]
    return feats

# ---------- load ----------
df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]

y_col    = find_col(df, TARGET_CANDS)
name_col = find_col(df, NAME_CANDS) or "Player"
# Fail early with a readable message rather than a KeyError on df[None] further down.
if not y_col:
    raise ValueError(f"Could not find target column in {TARGET_CANDS}. Available: {list(df.columns)}")

# canonical feature name -> actual sheet column; features with no matching column are dropped
mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
mapped = {k:v for k,v in mapped.items() if v is not None}
if not mapped:
    raise ValueError("No usable feature columns found from ALIASES.")

X = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y = to_num(df[y_col])
# fillna must come before astype(str): once NaN has been stringified to "nan"
# there is nothing left for fillna to replace.
names = df[name_col].fillna("").astype(str)

# keep only rows that have a target value
mask = y.notna()
X, y, names = X.loc[mask], y.loc[mask], names.loc[mask]

# interactions (+ invert)
X = basic_interactions(X)

# impute
imp = SimpleImputer(strategy="median")
X_imp = imp.fit_transform(X)

# ---------- base models (light tuning) ----------
# Each grid holds a single candidate, so the search only fixes these hyper-parameters.
gb = GradientBoostingRegressor(random_state=RANDOM_SEED)
gb_param = {"n_estimators":[800], "learning_rate":[0.05], "max_depth":[4], "subsample":[0.85]}
gb_best = RandomizedSearchCV(gb, gb_param, n_iter=1, scoring="r2", cv=3, n_jobs=N_JOBS).fit(X_imp, y).best_estimator_

rf = RandomForestRegressor(random_state=RANDOM_SEED)
rf_param = {"n_estimators":[800], "max_depth":[12], "min_samples_split":[2], "min_samples_leaf":[1]}
rf_best = RandomizedSearchCV(rf, rf_param, n_iter=1, scoring="r2", cv=3, n_jobs=N_JOBS).fit(X_imp, y).best_estimator_

base_models = [("gb", gb_best), ("rf", rf_best)]

# ---------- stacking with OOF (meta sees ONLY OOF predictions) ----------
# Every row is predicted by a model that was fit without that row's fold.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
oof = np.zeros((len(X_imp), len(base_models)))
for j,(nm, mdl) in enumerate(base_models):
    fold_preds = np.zeros(len(X_imp))
    for tr_idx, val_idx in kf.split(X_imp):
        mdl.fit(X_imp[tr_idx], y.iloc[tr_idx])
        fold_preds[val_idx] = mdl.predict(X_imp[val_idx])
    oof[:, j] = fold_preds

# meta learner (Ridge on standardized OOF preds)
sc_meta = StandardScaler()
Z = sc_meta.fit_transform(oof)
ridge_meta = RidgeCV(alphas=np.logspace(-3, 2, 30)).fit(Z, y)

# retrain base models on full data
# These predictions are in-sample: each model predicts the same rows it was just fit on.
full_preds = []
for nm, mdl in base_models:
    mdl.fit(X_imp, y)
    full_preds.append(mdl.predict(X_imp))
stack_inputs = np.vstack(full_preds).T
stack_inputs = sc_meta.transform(stack_inputs)

y_hat = ridge_meta.predict(stack_inputs)

# ---------- results ----------
# One row per player, sorted by the actual grade; Error = predicted - actual.
out = pd.DataFrame({
    "Player": names.values,
    "Actual_RB_Grade": y.values,
    "Predicted_RB_Grade": y_hat,
    "Error": y_hat - y.values
}).sort_values("Actual_RB_Grade", ascending=False)

print("\n=== Full Dataset Results (Stack Ridge Meta; Draft Age as normal feature) ===")
print(out.head(20).round(3))

OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUT_PATH, index=False)
print(f"\nSaved → {OUT_PATH}")


In [None]:
# ===========================
# Stack Ridge Meta with 80/20 Train-Test (Draft Age included as a normal feature)
# - Base models: GradientBoosting + RandomForest
# - Meta: RidgeCV on OOF predictions
# - Interactions: BMIx40, ELUxYCOA, DOMxDraft, YPCxELU
# - Inversions: 40 Time, Draft Capital, Shuttle, Three Cone, Draft Age (lower is better)
# - Predictions clipped to [0, 15]
# ===========================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# ---------- config ----------
# Input sheet, output folder, and the train/test protocol for this cell.
CSV_PATH = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
OUT_DIR = Path("./data/Bakery/_derived")
TEST_SIZE = 0.20
SEEDS = [42, 1337, 7]  # one independent 80/20 split per seed; add/remove seeds as desired
N_JOBS = -1

# Canonical feature name -> header spellings to try, in order (+ Draft Age; no Breakout Age).
# Dict order is significant: it becomes the column order of the feature matrix.
ALIASES = {
    "DOM++": ["DOM++", "DOMpp", "DOM_plus_plus", "DOMpp_Weighted", "DOM"],
    "40 Time": ["40 Time", "Forty", "40"],
    "BMI": ["BMI"],
    "YPC": ["YPC", "Yards per Carry", "Yards/Carry", "Rushing YPC"],
    "ELU": ["ELU", "Elusiveness", "Elusiveness Rating"],
    "YCO/A": ["YCO/A", "YAC/A", "Yards After Contact / Att", "Yards After Contact per Attempt"],
    "Break%": ["Break%", "Break %", "Breakaway %", "Breakaway Percentage", "Breakaway%"],
    "Draft Capital": ["Draft Capital", "Draft Cap", "Draft Round", "Round", "Rnd"],
    "Bama": ["Bama", "Bama Rating", "BamaAdj", "BAMA"],
    "Shuttle": ["Shuttle", "Short Shuttle", "20 Shuttle", "20 Yard Shuttle"],
    "Three Cone": ["3 Cone", "Three Cone", "3-Cone"],
    "Rec Yards": ["Receiving Yards", "Rec Yds", "RecYds"],
    # Draft Age (treated as a normal feature)
    "Draft Age": ["Draft Age", "Age at Draft", "DraftAge", "Age (Draft)", "AgeDraft", "Age_at_Draft"],
}

# Header spellings accepted for the target and the player-name columns.
TARGET_CANDS = ["RB Grade", "RBGrade", "RB_Grade"]
NAME_CANDS = ["Player", "Player Name", "Name"]

# ---------- helpers ----------
def find_col(frame, candidates):
    """Return the actual column name in ``frame`` matching the first usable candidate.

    Names are compared case-insensitively with all whitespace removed.
    Candidates are tried in the order given; ``None`` is returned when no
    candidate matches. When two columns normalize to the same key, the later
    column is the one returned.
    """
    def _squash(label):
        return re.sub(r"\s+", "", label).lower()

    lookup = {}
    for column in frame.columns:
        lookup[_squash(column)] = column  # later duplicates overwrite earlier ones

    for wanted in map(_squash, candidates):
        if wanted in lookup:
            return lookup[wanted]
    return None

def to_num(series):
    """Coerce a column of spreadsheet-style values to numbers (NaN when unparseable).

    Columns that already hold plain numeric values are handed straight to
    ``pd.to_numeric``. Sending them through the text clean-up below would
    stringify very small/large floats in scientific notation (``1e-05``) and
    the final character filter would then strip the ``e``, silently turning a
    valid number into NaN.

    Text columns are cleaned before parsing: percent signs, ``Round``/``R``
    prefixes, ordinal suffixes (``1st``, ``2nd`` ...), thousands separators
    and finally every character other than digits, ``.`` and ``-``.
    """
    dtype = series.dtype
    if isinstance(dtype, np.dtype) and dtype.kind in "iuf":
        # already int/uint/float: nothing to clean, avoid the lossy str round-trip
        return pd.to_numeric(series, errors='coerce')
    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def basic_interactions(X: pd.DataFrame) -> pd.DataFrame:
    """Build the model feature frame from the raw numeric features.

    Works on a copy of ``X``. Columns where a lower raw value is better
    (40 Time, Draft Capital, Shuttle, Three Cone, Draft Age) are negated when
    present, then pairwise product columns are appended for every pair whose
    two inputs both exist: BMIx40, ELUxYCOA, DOMxDraft, YPCxELU.
    """
    out = X.copy()

    # invert "lower is better" (faster times, earlier rounds, younger age)
    to_negate = [c for c in ("40 Time", "Draft Capital", "Shuttle", "Three Cone", "Draft Age")
                 if c in out.columns]
    for col in to_negate:
        out[col] = -out[col]

    # high-signal interactions, keyed by the name of the new column
    pairs = {
        "BMIx40": ("BMI", "40 Time"),
        "ELUxYCOA": ("ELU", "YCO/A"),
        "DOMxDraft": ("DOM++", "Draft Capital"),
        "YPCxELU": ("YPC", "ELU"),
    }
    for product_name, (a, b) in pairs.items():
        if a in out and b in out:
            out[product_name] = out[a] * out[b]
    return out

def rmse(y_true, y_pred) -> float:
    """Root-mean-squared error between ``y_true`` and ``y_pred`` as a plain Python float."""
    mse = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse)
    return float(root)

# ---------- load ----------
df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]

y_col    = find_col(df, TARGET_CANDS)
name_col = find_col(df, NAME_CANDS) or "Player"
if not y_col:
    raise ValueError(f"Could not find target column in {TARGET_CANDS}. Available: {list(df.columns)}")

# canonical feature name -> actual sheet column; features with no matching column are dropped
mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
mapped = {k:v for k,v in mapped.items() if v is not None}
if not mapped:
    raise ValueError("No usable feature columns found from ALIASES.")

X_all = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y_all = to_num(df[y_col])
# fillna must come before astype(str): once NaN has been stringified to "nan"
# there is nothing left for fillna to replace.
names_all = df[name_col].fillna("").astype(str)

# Keep only rows that have a target value; indexes are reset so the three
# objects stay positionally aligned for train_test_split.
mask = y_all.notna()
X_all = X_all.loc[mask].reset_index(drop=True)
y_all = y_all.loc[mask].reset_index(drop=True)
names_all = names_all.loc[mask].reset_index(drop=True)

# build features
X_all = basic_interactions(X_all)

# ---------- runner for one split ----------
def run_one(seed: int) -> dict:
    """Train and evaluate the Ridge-meta stack on one 80/20 split.

    Reads the module-level ``X_all``, ``y_all``, ``names_all``, ``TEST_SIZE``,
    ``N_JOBS`` and ``OUT_DIR``. ``seed`` drives the split, the estimators, the
    searches and the K-fold shuffling.

    Side effects: creates ``OUT_DIR`` if needed, writes the test-set
    predictions to ``stack_ridge_test_predictions_seed{seed}.csv`` there, and
    prints the test metrics.

    Returns a dict with keys ``seed``, ``R2``, ``MAE``, ``RMSE`` (all computed
    on the held-out rows after clipping) and ``csv`` (path of the saved file).
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    pred_path = OUT_DIR / f"stack_ridge_test_predictions_seed{seed}.csv"

    # split
    X_train, X_test, y_train, y_test, n_train, n_test = train_test_split(
        X_all, y_all, names_all, test_size=TEST_SIZE, random_state=seed
    )

    # impute (fit on train only)
    imp = SimpleImputer(strategy="median")
    Xtr = imp.fit_transform(X_train)
    Xte = imp.transform(X_test)

    # base models (light tuning, CV on train only)
    # Each grid holds a single candidate, so the search only fixes these hyper-parameters.
    gb = GradientBoostingRegressor(random_state=seed)
    gb_param = {"n_estimators":[800], "learning_rate":[0.05], "max_depth":[4], "subsample":[0.85]}
    gb_best = RandomizedSearchCV(gb, gb_param, n_iter=1, scoring="r2", cv=3, n_jobs=N_JOBS, random_state=seed).fit(Xtr, y_train).best_estimator_

    rf = RandomForestRegressor(random_state=seed, n_jobs=N_JOBS)
    rf_param = {"n_estimators":[800], "max_depth":[12], "min_samples_split":[2], "min_samples_leaf":[1]}
    rf_best = RandomizedSearchCV(rf, rf_param, n_iter=1, scoring="r2", cv=3, n_jobs=N_JOBS, random_state=seed).fit(Xtr, y_train).best_estimator_

    base_models = [("gb", gb_best), ("rf", rf_best)]

    # OOF for meta — on TRAIN only
    # Every training row is predicted by a model that was fit without that row's fold.
    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    oof = np.zeros((len(Xtr), len(base_models)))
    for j,(nm, mdl) in enumerate(base_models):
        fold_preds = np.zeros(len(Xtr))
        for tr_idx, val_idx in kf.split(Xtr):
            mdl.fit(Xtr[tr_idx], y_train.iloc[tr_idx])
            fold_preds[val_idx] = mdl.predict(Xtr[val_idx])
        oof[:, j] = fold_preds

    # meta (Ridge) on standardized OOF
    sc_meta = StandardScaler()
    Ztr = sc_meta.fit_transform(oof)
    ridge_meta = RidgeCV(alphas=np.logspace(-3, 2, 30)).fit(Ztr, y_train)

    # fit bases on full TRAIN, predict TEST
    base_preds_test = []
    for nm, mdl in base_models:
        mdl.fit(Xtr, y_train)
        base_preds_test.append(mdl.predict(Xte))
    stack_inputs_te = np.vstack(base_preds_test).T
    # reuse the scaler fitted on the OOF predictions so test inputs share the meta model's scale
    stack_inputs_te = sc_meta.transform(stack_inputs_te)

    y_pred_test = ridge_meta.predict(stack_inputs_te)

    # clip predictions to [0, 15] per requirement
    y_pred_test = np.clip(y_pred_test, 0.0, 15.0)

    # metrics on TEST
    r2   = r2_score(y_test, y_pred_test)
    mae  = mean_absolute_error(y_test, y_pred_test)
    rmse_val = rmse(y_test, y_pred_test)

    # save test predictions
    # One row per held-out player, sorted by the actual grade; Error = predicted - actual.
    out = pd.DataFrame({
        "Player": n_test.values,
        "Actual_RB_Grade": y_test.values,
        "Predicted_RB_Grade": y_pred_test,
        "Error": y_pred_test - y_test.values
    }).sort_values("Actual_RB_Grade", ascending=False)
    out.to_csv(pred_path, index=False)

    # print quick preview
    print(f"\n=== Seed {seed} — Test Metrics ===")
    print(f"R²: {r2:.4f} | MAE: {mae:.4f} | RMSE: {rmse_val:.4f}")
    print(f"Max predicted (test): {out['Predicted_RB_Grade'].max():.3f}")
    print(f"Saved test predictions → {pred_path}")

    return {"seed": seed, "R2": r2, "MAE": mae, "RMSE": rmse_val, "csv": str(pred_path)}

# ---------- run multiple iterations ----------
# One independent 80/20 experiment per seed, then a table ranked by test R².
results = list(map(run_one, SEEDS))
summary = pd.DataFrame(results)
summary = summary.sort_values("R2", ascending=False)

print("\n=== Summary across seeds (80/20 splits) ===")
summary_text = summary.to_string(index=False, float_format=lambda value: f"{value:.4f}")
print(summary_text)


In [None]:
# ===========================
# 80/20 Stack with NNLS + Isotonic (push R^2 toward .90)
# Bases: GB, RF, ExtraTrees, HistGB
# Meta: NNLS (non-negative) + Isotonic calibration
# Draft Age treated like any other feature (inverted)
# Predictions clipped to [0, 15]
# ===========================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.isotonic import IsotonicRegression
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    ExtraTreesRegressor,
    HistGradientBoostingRegressor
)
from scipy.optimize import nnls

# ---------- config ----------
# Input sheet, output folder, and the train/test + search protocol for this cell.
CSV_PATH = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
OUT_DIR = Path("./data/Bakery/_derived")
TEST_SIZE = 0.20
SEEDS = [42, 1337, 7]
N_JOBS = -1
N_FOLDS = 5
N_ITER = 20  # random search iterations per base (cap by grid size)

# Canonical feature name -> header spellings to try, in order.
# Dict order is significant: it becomes the column order of the feature matrix.
ALIASES = {
    "DOM++": ["DOM++", "DOMpp", "DOM_plus_plus", "DOMpp_Weighted", "DOM"],
    "40 Time": ["40 Time", "Forty", "40"],
    "BMI": ["BMI"],
    "YPC": ["YPC", "Yards per Carry", "Yards/Carry", "Rushing YPC"],
    "ELU": ["ELU", "Elusiveness", "Elusiveness Rating"],
    "YCO/A": ["YCO/A", "YAC/A", "Yards After Contact / Att", "Yards After Contact per Attempt"],
    "Break%": ["Break%", "Break %", "Breakaway %", "Breakaway Percentage", "Breakaway%"],
    "Draft Capital": ["Draft Capital", "Draft Cap", "Draft Round", "Round", "Rnd"],
    "Bama": ["Bama", "Bama Rating", "BamaAdj", "BAMA"],
    "Shuttle": ["Shuttle", "Short Shuttle", "20 Shuttle", "20 Yard Shuttle"],
    "Three Cone": ["3 Cone", "Three Cone", "3-Cone"],
    "Rec Yards": ["Receiving Yards", "Rec Yds", "RecYds"],
    "Draft Age": ["Draft Age", "Age at Draft", "DraftAge", "Age (Draft)", "AgeDraft", "Age_at_Draft"],
}

# Header spellings accepted for the target and the player-name columns.
TARGET_CANDS = ["RB Grade", "RBGrade", "RB_Grade"]
NAME_CANDS = ["Player", "Player Name", "Name"]

def find_col(frame, candidates):
    """Look up the first candidate header that exists in ``frame``.

    Comparison ignores case and every whitespace character. Returns the
    column's real name, or ``None`` when no candidate matches. If two columns
    collapse to the same normalized key, the later column is returned.
    """
    normalize = lambda text: re.sub(r"\s+", "", text).lower()

    by_key = {}
    for actual in frame.columns:
        by_key[normalize(actual)] = actual  # later duplicates overwrite earlier ones

    matches = (by_key[normalize(c)] for c in candidates if normalize(c) in by_key)
    return next(matches, None)

def to_num(series):
    """Coerce a column of spreadsheet-style values to numbers (NaN when unparseable).

    Columns that already hold plain numeric values are handed straight to
    ``pd.to_numeric``. Sending them through the text clean-up below would
    stringify very small/large floats in scientific notation (``1e-05``) and
    the final character filter would then strip the ``e``, silently turning a
    valid number into NaN.

    Text columns are cleaned before parsing: percent signs, ``Round``/``R``
    prefixes, ordinal suffixes (``1st``, ``2nd`` ...), thousands separators
    and finally every character other than digits, ``.`` and ``-``.
    """
    dtype = series.dtype
    if isinstance(dtype, np.dtype) and dtype.kind in "iuf":
        # already int/uint/float: nothing to clean, avoid the lossy str round-trip
        return pd.to_numeric(series, errors='coerce')
    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def basic_interactions(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with lower-is-better columns negated and product features added.

    Negated when present: 40 Time, Draft Capital, Shuttle, Three Cone, Draft Age.
    Product columns (each added only when both inputs exist, computed from the
    already-negated values): BMIx40, ELUxYCOA, DOMxDraft, YPCxELU.
    The input frame is not modified.
    """
    feats = X.copy()

    # invert "lower is better"
    lower_is_better = ("40 Time", "Draft Capital", "Shuttle", "Three Cone", "Draft Age")
    for col in lower_is_better:
        if col in feats.columns:
            feats[col] = -feats[col]

    # a few strong interactions: (new column, left factor, right factor)
    products = (
        ("BMIx40", "BMI", "40 Time"),
        ("ELUxYCOA", "ELU", "YCO/A"),
        ("DOMxDraft", "DOM++", "Draft Capital"),
        ("YPCxELU", "YPC", "ELU"),
    )
    for new_col, left, right in products:
        if left in feats and right in feats:
            feats[new_col] = feats[left] * feats[right]
    return feats

def rmse(y_true, y_pred) -> float:
    """Root-mean-squared error between ``y_true`` and ``y_pred`` as a plain Python float."""
    mse = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse)
    return float(root)

def space_size(param_grid: dict) -> int:
    """Count the distinct parameter combinations in a grid of option lists.

    The result is the product of the lengths of the grid's values; an empty
    grid yields 1.
    """
    sizes = [len(options) for options in param_grid.values()]
    total = 1
    while sizes:
        total *= sizes.pop()
    return total

# ---------- load ----------
df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]
y_col    = find_col(df, TARGET_CANDS)
name_col = find_col(df, NAME_CANDS) or "Player"
if not y_col:
    raise ValueError(f"Target column not found. Available: {list(df.columns)}")

# canonical feature name -> actual sheet column; features with no matching column are dropped
mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
mapped = {k:v for k,v in mapped.items() if v is not None}
if not mapped:
    raise ValueError("No usable feature columns found from ALIASES.")

X_all = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y_all = to_num(df[y_col])
# fillna must come before astype(str): once NaN has been stringified to "nan"
# there is nothing left for fillna to replace.
names_all = df[name_col].fillna("").astype(str)

# Keep only rows that have a target value; indexes are reset so the three
# objects stay positionally aligned for train_test_split.
mask = y_all.notna()
X_all = X_all.loc[mask].reset_index(drop=True)
y_all = y_all.loc[mask].reset_index(drop=True)
names_all = names_all.loc[mask].reset_index(drop=True)
X_all = basic_interactions(X_all)

# ---------- run one seed ----------
from sklearn.utils._param_validation import InvalidParameterError  # safe import; unused but helps debuggers

def run_one(seed: int) -> dict:
    """Train and evaluate the NNLS + isotonic stack on one 80/20 split.

    Reads the module-level ``X_all``, ``y_all``, ``names_all``, ``TEST_SIZE``,
    ``N_JOBS``, ``N_FOLDS``, ``N_ITER`` and ``OUT_DIR``. ``seed`` drives the
    split, the estimators, the randomized searches and the K-fold shuffling.

    Side effects: creates ``OUT_DIR`` if needed, writes the test-set
    predictions to ``stack_nnls_iso_test_predictions_seed{seed}.csv`` there,
    and prints the test metrics.

    Returns a dict with keys ``seed``, ``R2``, ``MAE``, ``RMSE`` (computed on
    the held-out rows after calibration and clipping), ``csv`` (path of the
    saved file) and ``top15`` (first 15 rows of the saved table, which is
    sorted by actual grade descending).
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    pred_path = OUT_DIR / f"stack_nnls_iso_test_predictions_seed{seed}.csv"

    X_train, X_test, y_train, y_test, n_train, n_test = train_test_split(
        X_all, y_all, names_all, test_size=TEST_SIZE, random_state=seed
    )

    # median imputation is fitted on the training rows only and then applied to the test rows
    imp = SimpleImputer(strategy="median")
    Xtr = imp.fit_transform(X_train)
    Xte = imp.transform(X_test)

    # --- tune bases on TRAIN only ---
    # n_iter is capped at the number of distinct combinations in each grid.
    gb = GradientBoostingRegressor(random_state=seed)
    gb_param = {
        "n_estimators": [600, 900, 1200],
        "learning_rate": [0.03, 0.05, 0.07],
        "max_depth": [3, 4],
        "subsample": [0.8, 1.0],
        "min_samples_leaf": [1, 2],
    }
    gb_best = RandomizedSearchCV(
        gb, gb_param, n_iter=min(N_ITER, space_size(gb_param)),
        scoring="r2", cv=3, n_jobs=N_JOBS, random_state=seed
    ).fit(Xtr, y_train).best_estimator_

    rf = RandomForestRegressor(random_state=seed, n_jobs=N_JOBS)
    rf_param = {
        "n_estimators": [700, 1000, 1300],
        "max_depth": [None, 12, 16],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2],
        "max_features": ["sqrt", 0.8, 1.0],
    }
    rf_best = RandomizedSearchCV(
        rf, rf_param, n_iter=min(N_ITER, space_size(rf_param)),
        scoring="r2", cv=3, n_jobs=N_JOBS, random_state=seed
    ).fit(Xtr, y_train).best_estimator_

    et = ExtraTreesRegressor(random_state=seed, n_jobs=N_JOBS)
    et_param = {
        "n_estimators": [700, 1000, 1300],
        "max_depth": [None, 12, 16],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2],
        "max_features": ["sqrt", 0.8, 1.0],
    }
    et_best = RandomizedSearchCV(
        et, et_param, n_iter=min(N_ITER, space_size(et_param)),
        scoring="r2", cv=3, n_jobs=N_JOBS, random_state=seed
    ).fit(Xtr, y_train).best_estimator_

    hgb = HistGradientBoostingRegressor(random_state=seed)
    hgb_param = {
        "learning_rate": [0.03, 0.05, 0.08],
        "max_depth": [3, 6, 9],
        "l2_regularization": [0.0, 0.1, 0.5],
        "max_bins": [128, 255],
    }
    hgb_best = RandomizedSearchCV(
        hgb, hgb_param, n_iter=min(N_ITER, space_size(hgb_param)),
        scoring="r2", cv=3, n_jobs=N_JOBS, random_state=seed
    ).fit(Xtr, y_train).best_estimator_

    base_models = [("gb", gb_best), ("rf", rf_best), ("et", et_best), ("hgb", hgb_best)]

    # --- OOF for meta (TRAIN only) ---
    # Every training row is predicted by a model that was fit without that row's fold.
    kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
    oof = np.zeros((len(Xtr), len(base_models)))
    for j,(nm, mdl) in enumerate(base_models):
        fold_pred = np.zeros(len(Xtr))
        for tr_idx, val_idx in kf.split(Xtr):
            mdl.fit(Xtr[tr_idx], y_train.iloc[tr_idx])
            fold_pred[val_idx] = mdl.predict(Xtr[val_idx])
        oof[:, j] = fold_pred

    # --- NNLS meta on OOF ---
    # Non-negative blend weights, one per base model; no intercept term is fitted.
    w_nnls, _ = nnls(oof, y_train.values)
    stack_train_raw = oof @ w_nnls

    # --- Isotonic calibration on TRAIN ---
    # Maps the blended OOF score onto the target; inputs outside the fitted range are clipped.
    iso = IsotonicRegression(out_of_bounds="clip")
    _ = iso.fit_transform(stack_train_raw, y_train.values)  # fit only; discard transformed train

    # --- Predict TEST ---
    # Bases are refit on the full training set before predicting the held-out rows.
    base_preds_test = []
    for nm, mdl in base_models:
        mdl.fit(Xtr, y_train)
        base_preds_test.append(mdl.predict(Xte))
    base_preds_test = np.vstack(base_preds_test).T

    y_pred_raw = base_preds_test @ w_nnls
    y_pred_test = iso.transform(y_pred_raw)

    # clip to [0, 15]
    y_pred_test = np.clip(y_pred_test, 0.0, 15.0)

    # metrics
    r2   = r2_score(y_test, y_pred_test)
    mae  = mean_absolute_error(y_test, y_pred_test)
    rmse_val = rmse(y_test, y_pred_test)

    # save
    # One row per held-out player, sorted by the actual grade; Error = predicted - actual.
    out = pd.DataFrame({
        "Player": n_test.values,
        "Actual_RB_Grade": y_test.values,
        "Predicted_RB_Grade": y_pred_test,
        "Error": y_pred_test - y_test.values
    }).sort_values("Actual_RB_Grade", ascending=False)
    pred_path.parent.mkdir(parents=True, exist_ok=True)
    out.to_csv(pred_path, index=False)

    preview15 = out.head(15)

    print(f"\n=== Seed {seed} — TEST ===")
    print(f"R²: {r2:.4f} | MAE: {mae:.4f} | RMSE: {rmse_val:.4f} | MaxPred: {out['Predicted_RB_Grade'].max():.3f}")
    return {"seed": seed, "R2": r2, "MAE": mae, "RMSE": rmse_val, "csv": str(pred_path), "top15": preview15}

# ---------- run ----------
OUT_DIR.mkdir(parents=True, exist_ok=True)
results = list(map(run_one, SEEDS))

# Summary table: every per-seed field except the embedded top-15 preview frame.
scalar_rows = [
    {field: value for field, value in record.items() if field != 'top15'}
    for record in results
]
summary = pd.DataFrame(scalar_rows).sort_values("R2", ascending=False)

# NOTE: "best" is picked by R² on the held-out rows, so the figures reported
# for it are an optimistic pick across seeds.
best = max(results, key=lambda record: record["R2"])

print("\n=== Summary across seeds ===")
print(summary.to_string(index=False, float_format=lambda value: f"{value:.4f}"))
print(f"\nBest seed: {best['seed']}  R²={best['R2']:.4f}  MAE={best['MAE']:.4f}  RMSE={best['RMSE']:.4f}")
print(f"Test predictions CSV: {best['csv']}")

print("\n=== Top 15 players (Actual vs Predicted) — Best Seed ===")
preview_cols = ["Player","Actual_RB_Grade","Predicted_RB_Grade","Error"]
print(best["top15"][preview_cols].round(3).to_string(index=False))


In [None]:
# ===========================
# Full-dataset stack (ALL rows) with NNLS + Isotonic
# Bases: GB, RF, ExtraTrees, HistGB
# Meta: NNLS (non-negative) + Isotonic calibration
# Draft Age treated like any other feature (inverted)
# Predictions clipped to [0, 15]
# Saves:
#   - data/Bakery/_derived/stack_nnls_iso_all_players.csv
#   - data/Bakery/_derived/stack_nnls_iso_feature_impact.csv
# ===========================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.isotonic import IsotonicRegression
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    ExtraTreesRegressor,
    HistGradientBoostingRegressor
)
from scipy.optimize import nnls

# ---------- config ----------
# Input sheet, output folder, and the two CSVs this cell writes into it.
CSV_PATH = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
OUT_DIR = Path("./data/Bakery/_derived")
ALL_PRED_CSV = OUT_DIR / "stack_nnls_iso_all_players.csv"
IMPACT_CSV = OUT_DIR / "stack_nnls_iso_feature_impact.csv"

# Reproducibility, parallelism and search settings.
RANDOM_SEED = 42
N_JOBS = -1
N_FOLDS = 5
N_ITER = 20  # cap by grid-size helper below

# Canonical feature name -> header spellings to try, in order (+ Draft Age).
# Dict order is significant: it becomes the column order of the feature matrix.
ALIASES = {
    "DOM++": ["DOM++", "DOMpp", "DOM_plus_plus", "DOMpp_Weighted", "DOM"],
    "40 Time": ["40 Time", "Forty", "40"],
    "BMI": ["BMI"],
    "YPC": ["YPC", "Yards per Carry", "Yards/Carry", "Rushing YPC"],
    "ELU": ["ELU", "Elusiveness", "Elusiveness Rating"],
    "YCO/A": ["YCO/A", "YAC/A", "Yards After Contact / Att", "Yards After Contact per Attempt"],
    "Break%": ["Break%", "Break %", "Breakaway %", "Breakaway Percentage", "Breakaway%"],
    "Draft Capital": ["Draft Capital", "Draft Cap", "Draft Round", "Round", "Rnd"],
    "Bama": ["Bama", "Bama Rating", "BamaAdj", "BAMA"],
    "Shuttle": ["Shuttle", "Short Shuttle", "20 Shuttle", "20 Yard Shuttle"],
    "Three Cone": ["3 Cone", "Three Cone", "3-Cone"],
    "Rec Yards": ["Receiving Yards", "Rec Yds", "RecYds"],
    "Draft Age": ["Draft Age", "Age at Draft", "DraftAge", "Age (Draft)", "AgeDraft", "Age_at_Draft"],
}

# Header spellings accepted for the target and the player-name columns.
TARGET_CANDS = ["RB Grade", "RBGrade", "RB_Grade"]
NAME_CANDS = ["Player", "Player Name", "Name"]

def find_col(frame, candidates):
    """Return the first column of `frame` matching any candidate name, else None.

    Matching ignores case and all whitespace. Candidates are tried in order; if
    two columns normalize to the same key, the later column wins.
    """
    def _squash(label):
        return re.sub(r"\s+", "", label).lower()

    lookup = {}
    for column in frame.columns:
        lookup[_squash(column)] = column
    hits = (lookup[_squash(name)] for name in candidates if _squash(name) in lookup)
    return next(hits, None)

def to_num(series):
    """Coerce a column to numeric, tolerating common text decorations.

    Numeric (non-boolean) input is returned via `pd.to_numeric` unchanged.
    Text input is cleaned first: '%' and thousands commas are removed, the word
    "round" and a leading "R" are stripped, a trailing ordinal suffix
    (st/nd/rd/th) is dropped, and any remaining character that is not a digit,
    '.' or '-' is deleted. Values that still fail to parse become NaN.
    """
    # Round-tripping numbers through str would mangle scientific notation:
    # 1e-05 -> "1e-05" -> "1-05" -> NaN. Skip the text cleanup for numeric data.
    if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
        return pd.to_numeric(series, errors='coerce')

    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def basic_interactions(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `X` with sign-flipped columns and pairwise interaction terms.

    Columns where a lower value is better are negated. The products below are
    appended, in this order, whenever both inputs are present (they use the
    already-negated values).
    """
    out = X.copy()

    # Flip the "lower is better" metrics.
    for col in ("40 Time", "Draft Capital", "Shuttle", "Three Cone", "Draft Age"):
        if col not in out.columns:
            continue
        out[col] = -out[col]

    # (new column, left factor, right factor)
    products = (
        ("BMIx40", "BMI", "40 Time"),
        ("ELUxYCOA", "ELU", "YCO/A"),
        ("DOMxDraft", "DOM++", "Draft Capital"),
        ("YPCxELU", "YPC", "ELU"),
    )
    for new_col, left, right in products:
        if left in out and right in out:
            out[new_col] = out[left] * out[right]
    return out

def rmse(y_true, y_pred) -> float:
    """Root-mean-squared error between `y_true` and `y_pred` as a plain float."""
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))

def space_size(param_grid: dict) -> int:
    """Number of distinct combinations in a discrete parameter grid (1 if empty)."""
    sizes = [len(options) for options in param_grid.values()]
    total = 1
    while sizes:
        total *= sizes.pop()
    return total

# ---------- load ----------
# Read the RB table, resolve target/name/feature columns by alias, and build
# the numeric design matrix used by every later step of this cell.
df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]
y_col    = find_col(df, TARGET_CANDS)
name_col = find_col(df, NAME_CANDS) or "Player"
if not y_col:
    raise ValueError(f"Target column not found. Available: {list(df.columns)}")

mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
mapped = {k:v for k,v in mapped.items() if v is not None}
if not mapped:
    raise ValueError("No usable feature columns found from ALIASES.")

X_all = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y_all = to_num(df[y_col])
if name_col in df.columns:
    # fillna must run before astype(str); otherwise NaN becomes the string "nan".
    names_all = df[name_col].fillna("").astype(str)
else:
    # No name column at all: label rows by position instead of raising KeyError.
    names_all = pd.Series([f"row_{i}" for i in range(len(df))], index=df.index)

# Keep only rows with a usable target, re-indexed 0..n-1 so positions line up.
mask = y_all.notna()
X_all, y_all, names_all = X_all.loc[mask].reset_index(drop=True), y_all.loc[mask].reset_index(drop=True), names_all.loc[mask].reset_index(drop=True)
X_all = basic_interactions(X_all)

# SimpleImputer discards columns with no observed values; drop them here too so
# X_all.columns stays aligned with the columns of X_imp.
X_all = X_all.dropna(axis=1, how="all")

# impute (fit on ALL, we are producing in-sample predictions)
imp = SimpleImputer(strategy="median")
X_imp = imp.fit_transform(X_all)

# ---------- tune base models on ALL (for final refit/pred) ----------
def _tuned(estimator, grid):
    """Random-search `grid` for `estimator` on all rows; return the refit best estimator."""
    search = RandomizedSearchCV(
        estimator,
        grid,
        n_iter=min(N_ITER, space_size(grid)),
        scoring="r2",
        cv=3,
        n_jobs=N_JOBS,
        random_state=RANDOM_SEED,
    )
    search.fit(X_imp, y_all)
    return search.best_estimator_

# The two forest models search the same grid values (kept as separate dicts).
_forest_grid = {
    "n_estimators": [700, 1000, 1300],
    "max_depth": [None, 12, 16],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
    "max_features": ["sqrt", 0.8, 1.0],
}

# Gradient boosting
gb_param = {
    "n_estimators": [600, 900, 1200],
    "learning_rate": [0.03, 0.05, 0.07],
    "max_depth": [3, 4],
    "subsample": [0.8, 1.0],
    "min_samples_leaf": [1, 2],
}
gb = GradientBoostingRegressor(random_state=RANDOM_SEED)
gb_best = _tuned(gb, gb_param)

# Random forest
rf_param = {key: list(values) for key, values in _forest_grid.items()}
rf = RandomForestRegressor(random_state=RANDOM_SEED, n_jobs=N_JOBS)
rf_best = _tuned(rf, rf_param)

# Extra trees
et_param = {key: list(values) for key, values in _forest_grid.items()}
et = ExtraTreesRegressor(random_state=RANDOM_SEED, n_jobs=N_JOBS)
et_best = _tuned(et, et_param)

# Histogram gradient boosting
hgb_param = {
    "learning_rate": [0.03, 0.05, 0.08],
    "max_depth": [3, 6, 9],
    "l2_regularization": [0.0, 0.1, 0.5],
    "max_bins": [128, 255],
}
hgb = HistGradientBoostingRegressor(random_state=RANDOM_SEED)
hgb_best = _tuned(hgb, hgb_param)

# Column order of every stacked prediction matrix follows this list.
base_models = [("gb", gb_best), ("rf", rf_best), ("et", et_best), ("hgb", hgb_best)]

# ---------- build OOF on ALL rows for meta + calib (no leakage in meta fit) ----------
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)
oof = np.zeros((len(X_imp), len(base_models)))
for col_idx, (_, model) in enumerate(base_models):
    # Every row is predicted by a copy of the model fitted without that row.
    for fit_rows, held_rows in kf.split(X_imp):
        model.fit(X_imp[fit_rows], y_all.iloc[fit_rows])
        oof[held_rows, col_idx] = model.predict(X_imp[held_rows])

# NNLS meta on OOF: non-negative blend weights, one per base model
w_nnls, _ = nnls(oof, y_all.values)
stack_oof_raw = oof @ w_nnls

# Isotonic calibration on OOF (maps raw blend -> target scale)
iso = IsotonicRegression(out_of_bounds="clip")
iso.fit(stack_oof_raw, y_all.values)

# ---------- final fit on ALL, predict ALL ----------
fitted_preds = []
for _, model in base_models:
    fitted_preds.append(model.fit(X_imp, y_all).predict(X_imp))
base_preds_all = np.vstack(fitted_preds).T

# Blend with the OOF-derived weights, calibrate, then enforce [0, 15].
y_pred_raw = base_preds_all @ w_nnls
y_pred_all = np.clip(iso.transform(y_pred_raw), 0.0, 15.0)

# ---------- summary + save per-player ----------
OUT_DIR.mkdir(parents=True, exist_ok=True)
per_player = pd.DataFrame(
    {
        "Player": names_all.values,
        "Actual_RB_Grade": y_all.values,
        "Predicted_RB_Grade": y_pred_all,
        "Error": y_pred_all - y_all.values,
    }
)
per_player = per_player.sort_values("Actual_RB_Grade", ascending=False)
per_player.to_csv(ALL_PRED_CSV, index=False)

# in-sample fit quality (for context only; the models have seen these rows)
r2_full, mae_full, rmse_full = (
    r2_score(y_all, y_pred_all),
    mean_absolute_error(y_all, y_pred_all),
    rmse(y_all, y_pred_all),
)

print("\n=== Full-dataset (in-sample) fit quality ===")
print(f"R²: {r2_full:.4f} | MAE: {mae_full:.4f} | RMSE: {rmse_full:.4f}")
print(f"Saved per-player predictions → {ALL_PRED_CSV}")

print("\nTop 20 by Actual RB Grade:")
top_rows = per_player.head(20).round(3)
print(top_rows.to_string(index=False))

# ---------- feature impact (meta-weighted) ----------
# Combine GB/RF/ET importances using NNLS weights (normalized)
def safe_importances(model, feature_names):
    """Return `model`'s feature importances, scaled to sum to ~1, indexed by feature name.

    A model without a `feature_importances_` attribute yields an all-zero Series.
    """
    raw = getattr(model, "feature_importances_", None)
    if raw is None:
        return pd.Series(0.0, index=feature_names)
    scores = pd.Series(raw, index=feature_names)
    # The epsilon keeps the division defined when every importance is zero.
    return scores / (scores.sum() + 1e-12)

feature_order = list(X_all.columns)   # after interactions
imp_gb = safe_importances(gb_best, feature_order)
imp_rf = safe_importances(rf_best, feature_order)
imp_et = safe_importances(et_best, feature_order)
# HistGB is left out of the impact aggregation (no importances are read from it).

# Normalize the NNLS weights over ONLY the bases aggregated below, so the
# meta-weighted impact sums to ~1 even when HistGB carries blend weight.
_impact_bases = ["gb", "rf", "et"]
meta_w = pd.Series(w_nnls, index=[nm for nm,_ in base_models])
meta_w = meta_w.reindex(_impact_bases).fillna(0.0)
meta_w = meta_w / (meta_w.abs().sum() + 1e-12)

combined = (
    meta_w.get("gb",0.0)*imp_gb +
    meta_w.get("rf",0.0)*imp_rf +
    meta_w.get("et",0.0)*imp_et
)

impact_df = pd.DataFrame({
    "Impact_MetaWeighted": combined,
    "GB_Importance": imp_gb,
    "RF_Importance": imp_rf,
    "ET_Importance": imp_et
}).sort_values("Impact_MetaWeighted", ascending=False)

# Feature names live in the index; write it out (as "Feature") so the CSV rows
# are identifiable. index=False produced a table of anonymous numbers.
impact_df.to_csv(IMPACT_CSV, index_label="Feature")
print(f"\nSaved feature impact table → {IMPACT_CSV}")
print("\nTop 15 features by impact:")
print(impact_df.head(15).round(4).to_string())


In [None]:
# ===========================
# Full-dataset stack (ALL rows) with NNLS (no isotonic for in-sample use)
# Bases: GB, RF, ExtraTrees, HistGB
# Draft Age de-emphasized: winsorize + scale
# Predictions clipped to [0, 15]
# Saves:
#   - data/Bakery/_derived/stack_nnls_all_players.csv
#   - data/Bakery/_derived/stack_nnls_feature_impact.csv
# ===========================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    ExtraTreesRegressor,
    HistGradientBoostingRegressor
)
from scipy.optimize import nnls

# ---------- config ----------
# Input table and output files (paths are relative to the working directory).
CSV_PATH = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
OUT_DIR = Path("./data/Bakery/_derived")
ALL_PRED_CSV = OUT_DIR.joinpath("stack_nnls_all_players.csv")
IMPACT_CSV = OUT_DIR.joinpath("stack_nnls_feature_impact.csv")

# Seed, parallelism, OOF fold count and random-search budget.
RANDOM_SEED, N_JOBS, N_FOLDS = 42, -1, 5
N_ITER = 20  # upper bound; each search is further capped by its grid size

# De-emphasize Draft Age
DRAFT_AGE_SCALE = 0.30  # try 0.2–0.4 to lower its influence
# Per-column (lower, upper) quantiles used to clip heavy tails.
WINSOR = {
    "Draft Age": (0.05, 0.95),
    "Break%": (0.01, 0.99),
}

# Canonical feature name -> accepted column spellings.
# Key order fixes the feature-column order; list order fixes alias priority.
ALIASES = {
    "DOM++": ["DOM++", "DOMpp", "DOM_plus_plus", "DOMpp_Weighted", "DOM"],
    "40 Time": ["40 Time", "Forty", "40"],
    "BMI": ["BMI"],
    "YPC": ["YPC", "Yards per Carry", "Yards/Carry", "Rushing YPC"],
    "ELU": ["ELU", "Elusiveness", "Elusiveness Rating"],
    "YCO/A": [
        "YCO/A",
        "YAC/A",
        "Yards After Contact / Att",
        "Yards After Contact per Attempt",
    ],
    "Break%": ["Break%", "Break %", "Breakaway %", "Breakaway Percentage", "Breakaway%"],
    "Draft Capital": ["Draft Capital", "Draft Cap", "Draft Round", "Round", "Rnd"],
    "Bama": ["Bama", "Bama Rating", "BamaAdj", "BAMA"],
    "Shuttle": ["Shuttle", "Short Shuttle", "20 Shuttle", "20 Yard Shuttle"],
    "Three Cone": ["3 Cone", "Three Cone", "3-Cone"],
    "Rec Yards": ["Receiving Yards", "Rec Yds", "RecYds"],
    "Draft Age": [
        "Draft Age",
        "Age at Draft",
        "DraftAge",
        "Age (Draft)",
        "AgeDraft",
        "Age_at_Draft",
    ],
}
# Accepted spellings for the target and player-name columns.
TARGET_CANDS = ["RB Grade", "RBGrade", "RB_Grade"]
NAME_CANDS = ["Player", "Player Name", "Name"]

def find_col(frame, candidates):
    """Return the first column of `frame` matching any candidate name, else None.

    Matching ignores case and all whitespace. Candidates are tried in order; if
    two columns normalize to the same key, the later column wins.
    """
    def _squash(label):
        return re.sub(r"\s+", "", label).lower()

    lookup = {}
    for column in frame.columns:
        lookup[_squash(column)] = column
    hits = (lookup[_squash(name)] for name in candidates if _squash(name) in lookup)
    return next(hits, None)

def to_num(series):
    """Coerce a column to numeric, tolerating common text decorations.

    Numeric (non-boolean) input is returned via `pd.to_numeric` unchanged.
    Text input is cleaned first: '%' and thousands commas are removed, the word
    "round" and a leading "R" are stripped, a trailing ordinal suffix
    (st/nd/rd/th) is dropped, and any remaining character that is not a digit,
    '.' or '-' is deleted. Values that still fail to parse become NaN.
    """
    # Round-tripping numbers through str would mangle scientific notation:
    # 1e-05 -> "1e-05" -> "1-05" -> NaN. Skip the text cleanup for numeric data.
    if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
        return pd.to_numeric(series, errors='coerce')

    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def winsorize(col: pd.Series, lo_hi=(0.01, 0.99)) -> pd.Series:
    """Clip `col` to its own (lower, upper) quantiles given as fractions in `lo_hi`."""
    lower_q, upper_q = lo_hi[0], lo_hi[1]
    floor = col.quantile(lower_q)
    ceiling = col.quantile(upper_q)
    return col.clip(floor, ceiling)

def basic_interactions(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `X` with sign flips, a scaled Draft Age, and interaction terms.

    "Lower is better" columns are negated, Draft Age is then multiplied by
    DRAFT_AGE_SCALE, and the products below are appended (in this order) when
    both inputs exist. No interaction involves Draft Age.
    """
    out = X.copy()

    # Flip the "lower is better" metrics.
    for col in ("40 Time", "Draft Capital", "Shuttle", "Three Cone", "Draft Age"):
        if col not in out.columns:
            continue
        out[col] = -out[col]

    # Shrink Draft Age's numeric range (applied after the sign flip).
    if "Draft Age" in out.columns:
        out["Draft Age"] = out["Draft Age"] * DRAFT_AGE_SCALE

    # (new column, left factor, right factor)
    products = (
        ("BMIx40", "BMI", "40 Time"),
        ("ELUxYCOA", "ELU", "YCO/A"),
        ("DOMxDraft", "DOM++", "Draft Capital"),
        ("YPCxELU", "YPC", "ELU"),
    )
    for new_col, left, right in products:
        if left in out and right in out:
            out[new_col] = out[left] * out[right]
    return out

def rmse(y_true, y_pred) -> float:
    """Root-mean-squared error between `y_true` and `y_pred` as a plain float."""
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))

def space_size(param_grid: dict) -> int:
    """Number of distinct combinations in a discrete parameter grid (1 if empty)."""
    sizes = [len(options) for options in param_grid.values()]
    total = 1
    while sizes:
        total *= sizes.pop()
    return total

# ---------- load ----------
# Read the RB table, resolve target/name/feature columns by alias, and build
# the numeric design matrix used by every later step of this cell.
df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]
y_col    = find_col(df, TARGET_CANDS)
name_col = find_col(df, NAME_CANDS) or "Player"
if not y_col:
    raise ValueError(f"Target column not found. Available: {list(df.columns)}")

mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
mapped = {k:v for k,v in mapped.items() if v is not None}
if not mapped:
    raise ValueError("No usable feature columns found from ALIASES.")

X_all = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y_all = to_num(df[y_col])
if name_col in df.columns:
    # fillna must run before astype(str); otherwise NaN becomes the string "nan".
    names_all = df[name_col].fillna("").astype(str)
else:
    # No name column at all: label rows by position instead of raising KeyError.
    names_all = pd.Series([f"row_{i}" for i in range(len(df))], index=df.index)

# Keep only rows with a usable target, re-indexed 0..n-1 so positions line up.
mask = y_all.notna()
X_all, y_all, names_all = X_all.loc[mask].reset_index(drop=True), y_all.loc[mask].reset_index(drop=True), names_all.loc[mask].reset_index(drop=True)

# winsorize selected columns BEFORE inversion/interaction
for col, q in WINSOR.items():
    if col in X_all.columns:
        X_all[col] = winsorize(X_all[col], q)

# interactions + inversions + DraftAge scale
X_all = basic_interactions(X_all)

# SimpleImputer discards columns with no observed values; drop them here too so
# X_all.columns stays aligned with the columns of X_imp.
X_all = X_all.dropna(axis=1, how="all")

# impute (fit on ALL; in-sample use)
imp = SimpleImputer(strategy="median")
X_imp = imp.fit_transform(X_all)

# ---------- tune base models on ALL ----------
def _tuned(estimator, grid):
    """Random-search `grid` for `estimator` on all rows; return the refit best estimator."""
    search = RandomizedSearchCV(
        estimator,
        grid,
        n_iter=min(N_ITER, space_size(grid)),
        scoring="r2",
        cv=3,
        n_jobs=N_JOBS,
        random_state=RANDOM_SEED,
    )
    search.fit(X_imp, y_all)
    return search.best_estimator_

# The two forest models search the same grid values (kept as separate dicts).
_forest_grid = {
    "n_estimators": [700, 1000, 1300],
    "max_depth": [None, 12, 16],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
    "max_features": ["sqrt", 0.8, 1.0],
}

# Gradient boosting
gb_param = {
    "n_estimators": [600, 900, 1200],
    "learning_rate": [0.03, 0.05, 0.07],
    "max_depth": [3, 4],
    "subsample": [0.8, 1.0],
    "min_samples_leaf": [1, 2],
}
gb = GradientBoostingRegressor(random_state=RANDOM_SEED)
gb_best = _tuned(gb, gb_param)

# Random forest
rf_param = {key: list(values) for key, values in _forest_grid.items()}
rf = RandomForestRegressor(random_state=RANDOM_SEED, n_jobs=N_JOBS)
rf_best = _tuned(rf, rf_param)

# Extra trees
et_param = {key: list(values) for key, values in _forest_grid.items()}
et = ExtraTreesRegressor(random_state=RANDOM_SEED, n_jobs=N_JOBS)
et_best = _tuned(et, et_param)

# Histogram gradient boosting
hgb_param = {
    "learning_rate": [0.03, 0.05, 0.08],
    "max_depth": [3, 6, 9],
    "l2_regularization": [0.0, 0.1, 0.5],
    "max_bins": [128, 255],
}
hgb = HistGradientBoostingRegressor(random_state=RANDOM_SEED)
hgb_best = _tuned(hgb, hgb_param)

# Column order of every stacked prediction matrix follows this list.
base_models = [("gb", gb_best), ("rf", rf_best), ("et", et_best), ("hgb", hgb_best)]

# ---------- build OOF on ALL rows for NNLS meta ----------
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)
oof = np.zeros((len(X_imp), len(base_models)))
for col_idx, (_, model) in enumerate(base_models):
    # Every row is predicted by a copy of the model fitted without that row.
    for fit_rows, held_rows in kf.split(X_imp):
        model.fit(X_imp[fit_rows], y_all.iloc[fit_rows])
        oof[held_rows, col_idx] = model.predict(X_imp[held_rows])

# NNLS meta on OOF (no isotonic to avoid flat steps in-sample)
w_nnls, _ = nnls(oof, y_all.values)

# ---------- final fit on ALL, predict ALL ----------
fitted_preds = []
for _, model in base_models:
    fitted_preds.append(model.fit(X_imp, y_all).predict(X_imp))
base_preds_all = np.vstack(fitted_preds).T

# Blend with the OOF-derived weights, then enforce [0, 15].
y_pred_all = np.clip(base_preds_all @ w_nnls, 0.0, 15.0)

# ---------- summary + save per-player ----------
OUT_DIR.mkdir(parents=True, exist_ok=True)
per_player = pd.DataFrame(
    {
        "Player": names_all.values,
        "Actual_RB_Grade": y_all.values,
        "Predicted_RB_Grade": y_pred_all,
        "Error": y_pred_all - y_all.values,
    }
)
per_player = per_player.sort_values("Actual_RB_Grade", ascending=False)
per_player.to_csv(ALL_PRED_CSV, index=False)

# In-sample metrics: the models have seen these rows.
r2_full, mae_full, rmse_full = (
    r2_score(y_all, y_pred_all),
    mean_absolute_error(y_all, y_pred_all),
    rmse(y_all, y_pred_all),
)

print("\n=== Full-dataset (in-sample) fit quality (no isotonic) ===")
print(f"R²: {r2_full:.4f} | MAE: {mae_full:.4f} | RMSE: {rmse_full:.4f}")
print(f"Saved per-player predictions → {ALL_PRED_CSV}")

print("\nTop 20 by Actual RB Grade:")
top_rows = per_player.head(20).round(3)
print(top_rows.to_string(index=False))

# ---------- feature impact (meta-weighted) ----------
def safe_importances(model, feature_names):
    """Return `model`'s feature importances, scaled to sum to ~1, indexed by feature name.

    A model without a `feature_importances_` attribute yields an all-zero Series.
    """
    raw = getattr(model, "feature_importances_", None)
    if raw is None:
        return pd.Series(0.0, index=feature_names)
    scores = pd.Series(raw, index=feature_names)
    # The epsilon keeps the division defined when every importance is zero.
    return scores / (scores.sum() + 1e-12)

feature_order = list(X_all.columns)   # after interactions & scaling
imp_gb = safe_importances(gb_best, feature_order)
imp_rf = safe_importances(rf_best, feature_order)
imp_et = safe_importances(et_best, feature_order)

# Only GB/RF/ET are aggregated below, so normalize the NNLS weights over those
# three; otherwise any weight on HistGB makes the impact sum to less than 1.
_impact_bases = ["gb", "rf", "et"]
meta_w = pd.Series(w_nnls, index=[nm for nm,_ in base_models])
meta_w = meta_w.reindex(_impact_bases).fillna(0.0)
meta_w = meta_w / (meta_w.abs().sum() + 1e-12)

combined = (
    meta_w.get("gb",0.0)*imp_gb +
    meta_w.get("rf",0.0)*imp_rf +
    meta_w.get("et",0.0)*imp_et
)

impact_df = pd.DataFrame({
    "Impact_MetaWeighted": combined,
    "GB_Importance": imp_gb,
    "RF_Importance": imp_rf,
    "ET_Importance": imp_et
}).sort_values("Impact_MetaWeighted", ascending=False)

# Feature names live in the index; write it out (as "Feature") so the CSV rows
# are identifiable. index=False produced a table of anonymous numbers.
impact_df.to_csv(IMPACT_CSV, index_label="Feature")
print(f"\nSaved feature impact table → {IMPACT_CSV}")
print("\nTop 15 features by impact:")
print(impact_df.head(15).round(4).to_string())

# Quick check: show Draft Age impact after scaling
if "Draft Age" in impact_df.index:
    print("\nDraft Age impact (after scaling):")
    print(impact_df.loc["Draft Age"].round(5).to_string())


In [None]:
# ===========================
# 80/20 Stack (NNLS + Isotonic) with targeted feature influences
# - Push Break% impact up (~0.06) and Draft Age down (~0.02)
# - Bases: GB, RF, ExtraTrees, HistGB
# - Meta: NNLS (non-negative) + Isotonic calibration
# - Draft Age is winsorized and down-scaled (see config); test preds are clipped to [0, 15]
# ===========================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.isotonic import IsotonicRegression
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    ExtraTreesRegressor,
    HistGradientBoostingRegressor
)
from scipy.optimize import nnls

# -----------------------------
# Config (tune these two first)
# -----------------------------
# Input table and output directory (paths are relative to the working directory).
CSV_PATH = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
OUT_DIR = Path("./data/Bakery/_derived")

TEST_SIZE = 0.20
SEEDS = [42, 1337, 7]  # try a few seeds; best R2 is reported
N_JOBS, N_FOLDS = -1, 5
N_ITER = 20  # random-search iterations per base (capped by grid size)

# Targeted influence nudges
BREAK_SCALE = 7.0  # ↑ to boost Break% impact (~5–9 is typical)
DRAFT_AGE_SCALE = 0.12  # ↓ to reduce Draft Age impact (~0.08–0.15)
# Per-column (lower, upper) quantiles used to clip tails.
WINSOR = {
    "Draft Age": (0.10, 0.90),  # tighter caps to avoid outlier dominance
    "Break%": (0.01, 0.99),  # keep tails so it can matter
}

# Canonical column aliases (Draft Age included; no Breakout Age).
# Key order fixes the feature-column order; list order fixes alias priority.
ALIASES = {
    "DOM++": ["DOM++", "DOMpp", "DOM_plus_plus", "DOMpp_Weighted", "DOM"],
    "40 Time": ["40 Time", "Forty", "40"],
    "BMI": ["BMI"],
    "YPC": ["YPC", "Yards per Carry", "Yards/Carry", "Rushing YPC"],
    "ELU": ["ELU", "Elusiveness", "Elusiveness Rating"],
    "YCO/A": [
        "YCO/A",
        "YAC/A",
        "Yards After Contact / Att",
        "Yards After Contact per Attempt",
    ],
    "Break%": ["Break%", "Break %", "Breakaway %", "Breakaway Percentage", "Breakaway%"],
    "Draft Capital": ["Draft Capital", "Draft Cap", "Draft Round", "Round", "Rnd"],
    "Bama": ["Bama", "Bama Rating", "BamaAdj", "BAMA"],
    "Shuttle": ["Shuttle", "Short Shuttle", "20 Shuttle", "20 Yard Shuttle"],
    "Three Cone": ["3 Cone", "Three Cone", "3-Cone"],
    "Rec Yards": ["Receiving Yards", "Rec Yds", "RecYds"],
    "Draft Age": [
        "Draft Age",
        "Age at Draft",
        "DraftAge",
        "Age (Draft)",
        "AgeDraft",
        "Age_at_Draft",
    ],
}
# Accepted spellings for the target and player-name columns.
TARGET_CANDS = ["RB Grade", "RBGrade", "RB_Grade"]
NAME_CANDS = ["Player", "Player Name", "Name"]

# -----------------------------
# Helpers
# -----------------------------
def find_col(frame, candidates):
    """Return the first column of `frame` matching any candidate name, else None.

    Matching ignores case and all whitespace. Candidates are tried in order; if
    two columns normalize to the same key, the later column wins.
    """
    def _squash(label):
        return re.sub(r"\s+", "", label).lower()

    lookup = {}
    for column in frame.columns:
        lookup[_squash(column)] = column
    hits = (lookup[_squash(name)] for name in candidates if _squash(name) in lookup)
    return next(hits, None)

def to_num(series):
    """Coerce a column to numeric, tolerating common text decorations.

    Numeric (non-boolean) input is returned via `pd.to_numeric` unchanged.
    Text input is cleaned first: '%' and thousands commas are removed, the word
    "round" and a leading "R" are stripped, a trailing ordinal suffix
    (st/nd/rd/th) is dropped, and any remaining character that is not a digit,
    '.' or '-' is deleted. Values that still fail to parse become NaN.
    """
    # Round-tripping numbers through str would mangle scientific notation:
    # 1e-05 -> "1e-05" -> "1-05" -> NaN. Skip the text cleanup for numeric data.
    if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
        return pd.to_numeric(series, errors='coerce')

    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def winsorize_col(x: pd.Series, lo: float, hi: float) -> pd.Series:
    """Clip `x` to its own `lo` and `hi` quantiles (both given as fractions)."""
    floor, ceiling = (x.quantile(q) for q in (lo, hi))
    return x.clip(floor, ceiling)

def basic_interactions(X: pd.DataFrame) -> pd.DataFrame:
    """Return an engineered copy of `X`: winsorize, scale, sign-flip, add interactions.

    Steps run in that order, so the scale factors act on the clipped ranges and
    the interaction products use the negated values.

    NOTE(review): the base learners in this cell are all tree ensembles, which
    split on thresholds; multiplying one column by a positive constant is not
    expected to change their splits or importances. Confirm the BREAK_SCALE /
    DRAFT_AGE_SCALE nudges have the intended effect.
    """
    out = X.copy()

    # 1) Clip the configured columns to their own quantile bounds.
    for name, bounds in WINSOR.items():
        if name not in out.columns:
            continue
        lower_q, upper_q = bounds
        out[name] = winsorize_col(out[name], lower_q, upper_q)

    # 2) Per-feature scale factors (nudges).
    for name, factor in (("Break%", BREAK_SCALE), ("Draft Age", DRAFT_AGE_SCALE)):
        if name in out.columns:
            out[name] = out[name] * factor

    # 3) Negate the "lower is better" timing/age/round metrics.
    for name in ("40 Time", "Draft Capital", "Shuttle", "Three Cone", "Draft Age"):
        if name in out.columns:
            out[name] = -out[name]

    # 4) Interaction products: (new column, left factor, right factor).
    products = (
        ("BMIx40", "BMI", "40 Time"),
        ("ELUxYCOA", "ELU", "YCO/A"),
        ("DOMxDraft", "DOM++", "Draft Capital"),
        ("YPCxELU", "YPC", "ELU"),
    )
    for new_col, left, right in products:
        if left in out and right in out:
            out[new_col] = out[left] * out[right]

    return out

def rmse(y_true, y_pred) -> float:
    """Root-mean-squared error between `y_true` and `y_pred` as a plain float."""
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))

def space_size(param_grid: dict) -> int:
    """Number of distinct combinations in a discrete parameter grid (1 if empty)."""
    sizes = [len(options) for options in param_grid.values()]
    total = 1
    while sizes:
        total *= sizes.pop()
    return total

# -----------------------------
# Load / prepare data
# -----------------------------
# Read the RB table, resolve target/name/feature columns by alias, and build
# the engineered feature frame that run_one() splits into train/test.
df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]

y_col    = find_col(df, TARGET_CANDS)
name_col = find_col(df, NAME_CANDS) or "Player"
if not y_col:
    raise ValueError(f"Target not found. Available: {list(df.columns)}")

mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
mapped = {k:v for k,v in mapped.items() if v is not None}
if not mapped:
    raise ValueError("No usable feature columns found from ALIASES.")

X_all = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y_all = to_num(df[y_col])
if name_col in df.columns:
    # fillna must run before astype(str); otherwise NaN becomes the string "nan".
    names_all = df[name_col].fillna("").astype(str)
else:
    # No name column at all: label rows by position instead of raising KeyError.
    names_all = pd.Series([f"row_{i}" for i in range(len(df))], index=df.index)

# Keep only rows with a usable target, re-indexed 0..n-1 so positions line up.
mask = y_all.notna()
X_all, y_all, names_all = X_all.loc[mask].reset_index(drop=True), y_all.loc[mask].reset_index(drop=True), names_all.loc[mask].reset_index(drop=True)

# Build engineered features (winsorize + scale + invert + interactions).
# NOTE: this runs on ALL rows before any train/test split, so the winsorizing
# quantiles are computed with the future test rows included.
X_all = basic_interactions(X_all)

# Columns with no observed values would be discarded by the median imputer and
# desync the feature names from the fitted matrices; drop them up front.
X_all = X_all.dropna(axis=1, how="all")

# -----------------------------
# Run one seed (train/test)
# -----------------------------
def run_one(seed: int):
    """Run one 80/20 train/test evaluation of the NNLS + isotonic stack.

    Splits the module-level X_all / y_all / names_all with `seed`, tunes the
    four base regressors on the training part only, builds out-of-fold
    predictions to fit non-negative blend weights and an isotonic calibrator,
    then scores the calibrated blend (clipped to [0, 15]) on the test part.

    Writes two CSVs under OUT_DIR: per-player test predictions and a
    meta-weighted feature-impact table (with a "Feature" column).

    Returns:
        dict with keys "seed", "R2", "MAE", "RMSE", "csv", "impact_csv".
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    pred_path = OUT_DIR / f"stack_nnls_iso_test_predictions_seed{seed}.csv"
    impact_path = OUT_DIR / f"stack_nnls_iso_feature_impact_seed{seed}.csv"

    X_train, X_test, y_train, y_test, _, n_test = train_test_split(
        X_all, y_all, names_all, test_size=TEST_SIZE, random_state=seed
    )

    # Median imputation is learned on TRAIN and only applied to TEST.
    imp = SimpleImputer(strategy="median")
    Xtr = imp.fit_transform(X_train)
    Xte = imp.transform(X_test)

    # ---- tune bases on TRAIN only ----
    def _tuned(estimator, grid):
        """Random-search `grid` on the training rows; return the refit best estimator."""
        return RandomizedSearchCV(
            estimator, grid, n_iter=min(N_ITER, space_size(grid)),
            scoring="r2", cv=3, n_jobs=N_JOBS, random_state=seed
        ).fit(Xtr, y_train).best_estimator_

    gb_param = {
        "n_estimators": [600, 900, 1200],
        "learning_rate": [0.03, 0.05, 0.07],
        "max_depth": [3, 4],
        "subsample": [0.8, 1.0],
        "min_samples_leaf": [1, 2],
    }
    gb_best = _tuned(GradientBoostingRegressor(random_state=seed), gb_param)

    rf_param = {
        "n_estimators": [700, 1000, 1300],
        "max_depth": [None, 12, 16],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2],
        "max_features": ["sqrt", 0.8, 1.0],
    }
    rf_best = _tuned(RandomForestRegressor(random_state=seed, n_jobs=N_JOBS), rf_param)

    et_param = {
        "n_estimators": [700, 1000, 1300],
        "max_depth": [None, 12, 16],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2],
        "max_features": ["sqrt", 0.8, 1.0],
    }
    et_best = _tuned(ExtraTreesRegressor(random_state=seed, n_jobs=N_JOBS), et_param)

    hgb_param = {
        "learning_rate": [0.03, 0.05, 0.08],
        "max_depth": [3, 6, 9],
        "l2_regularization": [0.0, 0.1, 0.5],
        "max_bins": [128, 255],
    }
    hgb_best = _tuned(HistGradientBoostingRegressor(random_state=seed), hgb_param)

    base_models = [("gb", gb_best), ("rf", rf_best), ("et", et_best), ("hgb", hgb_best)]

    # ---- OOF for meta on TRAIN ----
    # Each training row is predicted by a model that was fitted without it.
    kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
    oof = np.zeros((len(Xtr), len(base_models)))
    for j,(nm, mdl) in enumerate(base_models):
        fold_pred = np.zeros(len(Xtr))
        for tr_idx, val_idx in kf.split(Xtr):
            mdl.fit(Xtr[tr_idx], y_train.iloc[tr_idx])
            fold_pred[val_idx] = mdl.predict(Xtr[val_idx])
        oof[:, j] = fold_pred

    # ---- NNLS meta + isotonic calibration ----
    w_nnls, _ = nnls(oof, y_train.values)
    stack_train_raw = oof @ w_nnls

    iso = IsotonicRegression(out_of_bounds="clip")
    iso.fit(stack_train_raw, y_train.values)

    # ---- Predict TEST ----
    # Refit every base on the full training set before scoring the held-out rows.
    base_preds_test = []
    for nm, mdl in base_models:
        mdl.fit(Xtr, y_train)
        base_preds_test.append(mdl.predict(Xte))
    base_preds_test = np.vstack(base_preds_test).T

    y_pred_raw = base_preds_test @ w_nnls
    y_pred_test = iso.transform(y_pred_raw)
    y_pred_test = np.clip(y_pred_test, 0.0, 15.0)

    # ---- Metrics ----
    r2   = r2_score(y_test, y_pred_test)
    mae  = mean_absolute_error(y_test, y_pred_test)
    rmse_val = rmse(y_test, y_pred_test)

    # ---- Save Test Predictions ----
    out = pd.DataFrame({
        "Player": n_test.values,
        "Actual_RB_Grade": y_test.values,
        "Predicted_RB_Grade": y_pred_test,
        "Error": y_pred_test - y_test.values
    }).sort_values("Actual_RB_Grade", ascending=False)
    out.to_csv(pred_path, index=False)

    # ---- Meta-weighted feature impact (for inspection) ----
    # The bases were just refit on all TRAIN rows, so their importances line up
    # with the training feature names.
    feature_order = list(X_train.columns)
    fi = {}
    for nm, mdl in base_models:
        if hasattr(mdl, "feature_importances_"):
            fi[nm] = pd.Series(mdl.feature_importances_, index=feature_order)
        else:
            # Fallback: uniform vector.
            # NOTE(review): HistGradientBoostingRegressor does not appear to
            # expose feature_importances_, so "hgb" likely always lands here and
            # pulls the aggregate toward uniform in proportion to its NNLS
            # weight — confirm this is intended.
            fi[nm] = pd.Series(np.full(len(feature_order), 1.0 / len(feature_order)), index=feature_order)

    # Normalize each base FI then aggregate with NNLS weights
    fi_norm = {k: v / (v.sum() + 1e-12) for k, v in fi.items()}
    model_weights = pd.Series(w_nnls, index=[nm for nm,_ in base_models])
    model_weights = model_weights / (model_weights.sum() + 1e-12)

    combined = sum(model_weights.get(nm, 0.0) * fi_norm[nm] for nm,_ in base_models)
    impact_df = pd.DataFrame({"Impact_MetaWeighted": combined})
    for nm,_ in base_models:
        impact_df[f"{nm.upper()}_Importance"] = fi_norm[nm]
    impact_df = impact_df.sort_values("Impact_MetaWeighted", ascending=False)

    # Feature names live in the index; write it out (as "Feature") so the CSV
    # rows are identifiable. index=False produced a table of anonymous numbers.
    impact_df.to_csv(impact_path, index_label="Feature")

    print(f"\n=== Seed {seed} — TEST ===")
    print(f"R²: {r2:.4f} | MAE: {mae:.4f} | RMSE: {rmse_val:.4f} | MaxPred: {out['Predicted_RB_Grade'].max():.3f}")
    print("Top 15 on TEST (by Actual):")
    print(out.head(15).round(3).to_string(index=False))
    print(f"Saved predictions → {pred_path}")
    print(f"Saved feature impact → {impact_path}")
    return {"seed": seed, "R2": r2, "MAE": mae, "RMSE": rmse_val, "csv": str(pred_path), "impact_csv": str(impact_path)}

# -----------------------------
# Run across seeds
# -----------------------------
OUT_DIR.mkdir(parents=True, exist_ok=True)
# One full tune/stack/evaluate pass per seed, ranked by test R² (best first).
results = list(map(run_one, SEEDS))
summary = pd.DataFrame(results)
summary = summary.sort_values("R2", ascending=False)

print("\n=== Summary across seeds (80/20) ===")
summary_text = summary.to_string(index=False, float_format=lambda x: f"{x:.4f}")
print(summary_text)

# Top row of the ranked table = seed with the highest test R².
best = summary.iloc[0]
print(f"\nBest seed: {int(best['seed'])}  |  R²={best['R2']:.4f}  MAE={best['MAE']:.4f}  RMSE={best['RMSE']:.4f}")
print(f"Test predictions CSV: {best['csv']}")
print(f"Feature impact CSV:   {best['impact_csv']}")


In [None]:
# ===========================
# 80/20 Stack (NNLS) + Linear (Ridge) calibration  — no plateaus
# Bases: GB, RF, ExtraTrees, HistGB
# Draft Age treated like any other feature (with down-weight)
# Break% up-weight
# Predictions clipped to [0, 15]
# ===========================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    ExtraTreesRegressor,
    HistGradientBoostingRegressor
)
from scipy.optimize import nnls

# -----------------------------
# Config (same as before)
# -----------------------------
# Input table and output directory (paths are relative to the working directory).
CSV_PATH = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
OUT_DIR = Path("./data/Bakery/_derived")

TEST_SIZE = 0.20
SEEDS = [42, 1337, 7]
N_JOBS, N_FOLDS, N_ITER = -1, 5, 20

# Targeted influence nudges
BREAK_SCALE = 7.0  # increase Break% influence
DRAFT_AGE_SCALE = 0.12  # reduce Draft Age influence
# Per-column (lower, upper) quantiles used to clip tails.
WINSOR = {
    "Draft Age": (0.10, 0.90),
    "Break%": (0.01, 0.99),
}

# Canonical feature name -> accepted column spellings.
# Key order fixes the feature-column order; list order fixes alias priority.
ALIASES = {
    "DOM++": ["DOM++", "DOMpp", "DOM_plus_plus", "DOMpp_Weighted", "DOM"],
    "40 Time": ["40 Time", "Forty", "40"],
    "BMI": ["BMI"],
    "YPC": ["YPC", "Yards per Carry", "Yards/Carry", "Rushing YPC"],
    "ELU": ["ELU", "Elusiveness", "Elusiveness Rating"],
    "YCO/A": [
        "YCO/A",
        "YAC/A",
        "Yards After Contact / Att",
        "Yards After Contact per Attempt",
    ],
    "Break%": ["Break%", "Break %", "Breakaway %", "Breakaway Percentage", "Breakaway%"],
    "Draft Capital": ["Draft Capital", "Draft Cap", "Draft Round", "Round", "Rnd"],
    "Bama": ["Bama", "Bama Rating", "BamaAdj", "BAMA"],
    "Shuttle": ["Shuttle", "Short Shuttle", "20 Shuttle", "20 Yard Shuttle"],
    "Three Cone": ["3 Cone", "Three Cone", "3-Cone"],
    "Rec Yards": ["Receiving Yards", "Rec Yds", "RecYds"],
    "Draft Age": [
        "Draft Age",
        "Age at Draft",
        "DraftAge",
        "Age (Draft)",
        "AgeDraft",
        "Age_at_Draft",
    ],
}
# Accepted spellings for the target and player-name columns.
TARGET_CANDS = ["RB Grade", "RBGrade", "RB_Grade"]
NAME_CANDS = ["Player", "Player Name", "Name"]

# -----------------------------
# Helpers
# -----------------------------
def find_col(frame, candidates):
    """Return the first column of ``frame`` matching any name in ``candidates``.

    Names are compared with all whitespace removed and case ignored. Candidates
    are tried in the order given; the column's original spelling is returned,
    or None when nothing matches.
    """
    def _squash(label):
        return re.sub(r"\s+", "", label).lower()

    # When two headers normalise to the same key, the later one wins.
    lookup = {}
    for column in frame.columns:
        lookup[_squash(column)] = column

    hits = (lookup[_squash(c)] for c in candidates if _squash(c) in lookup)
    return next(hits, None)

def to_num(series):
    """Coerce a column to numbers, tolerating common text decorations.

    Text values have percent signs, the word "round", a leading "r", a trailing
    ordinal suffix (st/nd/rd/th), thousands separators and any remaining
    characters other than digits, "." and "-" removed before parsing.
    Values that still fail to parse become NaN.

    Columns that already hold integers or floats are returned as a copy,
    untouched.
    """
    # Stringifying floats renders tiny/huge values in scientific notation
    # ("1e-05", "1e+20"); the character filter below would then strip the "e"
    # and "+" and yield NaN or a silently wrong number.
    if isinstance(series.dtype, np.dtype) and series.dtype.kind in "iuf":
        return series.copy()

    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def winsorize_col(x: pd.Series, lo: float, hi: float) -> pd.Series:
    """Clip ``x`` to the values found at its ``lo`` and ``hi`` quantiles."""
    lower_bound, upper_bound = (x.quantile(q) for q in (lo, hi))
    return x.clip(lower_bound, upper_bound)

def basic_interactions(X: pd.DataFrame) -> pd.DataFrame:
    """Return a transformed copy of ``X`` with engineered columns appended.

    Applied in order, each step only to columns that are present:
      1. clip the WINSOR columns to their configured quantiles,
      2. multiply Break% by BREAK_SCALE and Draft Age by DRAFT_AGE_SCALE,
      3. negate the "lower is better" columns,
      4. add pairwise product columns.
    The products are computed from the already scaled/negated values.
    """
    out = X.copy()

    # 1) Winsorize
    for col in WINSOR:
        if col not in out.columns:
            continue
        q_lo, q_hi = WINSOR[col]
        out[col] = winsorize_col(out[col], q_lo, q_hi)

    # 2) Influence nudges
    for col, factor in (("Break%", BREAK_SCALE), ("Draft Age", DRAFT_AGE_SCALE)):
        if col in out.columns:
            out[col] = out[col] * factor

    # 3) Flip sign so that larger always means better
    lower_is_better = ("40 Time", "Draft Capital", "Shuttle", "Three Cone", "Draft Age")
    for col in lower_is_better:
        if col in out.columns:
            out[col] = -out[col]

    # 4) Interaction products: (new column, left parent, right parent)
    products = (
        ("BMIx40",    "BMI",   "40 Time"),
        ("ELUxYCOA",  "ELU",   "YCO/A"),
        ("DOMxDraft", "DOM++", "Draft Capital"),
        ("YPCxELU",   "YPC",   "ELU"),
    )
    for new_col, left, right in products:
        if left in out and right in out:
            out[new_col] = out[left] * out[right]

    return out

def rmse(y_true, y_pred) -> float:
    """Root-mean-squared error between ``y_true`` and ``y_pred`` as a built-in float."""
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))

def space_size(param_grid: dict) -> int:
    """Count the distinct combinations in a parameter grid.

    This is the product of the lengths of the grid's value lists; an empty
    grid yields 1.
    """
    total = 1
    for options in param_grid.values():
        total = total * len(options)
    return total

# -----------------------------
# Load & feature build
# -----------------------------
# Read the raw table; headers are stripped so stray padding cannot defeat alias matching.
df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]

y_col    = find_col(df, TARGET_CANDS)
name_col = find_col(df, NAME_CANDS) or "Player"
if not y_col:
    raise ValueError(f"Target not found. Available: {list(df.columns)}")

# Keep only the canonical features whose column actually exists in this CSV.
mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
mapped = {k:v for k,v in mapped.items() if v is not None}
if not mapped:
    raise ValueError("No usable feature columns found from ALIASES.")

X_all = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y_all = to_num(df[y_col])
if name_col in df.columns:
    # Fill blanks BEFORE casting: astype(str) turns NaN into the literal "nan",
    # after which fillna has nothing left to replace.
    names_all = df[name_col].fillna("").astype(str)
else:
    # No name column matched: label rows by position instead of failing with KeyError.
    names_all = pd.Series([f"row_{i}" for i in df.index], index=df.index)

# Rows without a numeric target cannot be used for training or scoring.
mask = y_all.notna()
X_all     = X_all.loc[mask].reset_index(drop=True)
y_all     = y_all.loc[mask].reset_index(drop=True)
names_all = names_all.loc[mask].reset_index(drop=True)

# NOTE(review): winsor quantiles are computed here on ALL rows, i.e. before the
# train/test split, so test rows influence the clipping bounds — confirm this is acceptable.
X_all = basic_interactions(X_all)

# -----------------------------
# Train/test for a seed
# -----------------------------
def run_one(seed: int):
    """Fit, stack, calibrate and score the RB Grade ensemble for one seed.

    Steps (all fitting uses the training split only):
      1. Split X_all / y_all / names_all with ``test_size=TEST_SIZE``.
      2. Median-impute the features.
      3. Tune GB, RF, ET and HGB regressors with RandomizedSearchCV (3-fold, R²).
      4. Collect out-of-fold predictions (N_FOLDS KFold) and solve NNLS for
         non-negative blend weights.
      5. Fit a Ridge calibrator on the blended out-of-fold predictions.
      6. Refit each base model on the full training split, predict the test
         split, apply the calibrator and clip to [0, 15].

    Side effects: writes a predictions CSV and a feature-impact CSV under
    OUT_DIR and prints a report.

    Returns:
        dict with keys "seed", "R2", "MAE", "RMSE", "csv", "impact_csv".
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    pred_path   = OUT_DIR / f"stack_nnls_lin_test_predictions_seed{seed}.csv"
    impact_path = OUT_DIR / f"stack_nnls_lin_feature_impact_seed{seed}.csv"

    X_train, X_test, y_train, y_test, _, n_test = train_test_split(
        X_all, y_all, names_all, test_size=TEST_SIZE, random_state=seed
    )

    # SimpleImputer discards columns that have no observed value at fit time.
    # Drop them up front so the imputed matrix stays aligned with the feature
    # names used for the importance table below.
    usable = [c for c in X_train.columns if X_train[c].notna().any()]
    X_train, X_test = X_train[usable], X_test[usable]

    imp = SimpleImputer(strategy="median")
    Xtr = imp.fit_transform(X_train)
    Xte = imp.transform(X_test)

    def _tune(estimator, grid):
        """Random-search ``grid`` for ``estimator`` and return the refit best model."""
        search = RandomizedSearchCV(
            estimator, grid, n_iter=min(N_ITER, space_size(grid)),
            scoring="r2", cv=3, n_jobs=N_JOBS, random_state=seed
        )
        return search.fit(Xtr, y_train).best_estimator_

    # --- tune bases ---
    gb_best = _tune(GradientBoostingRegressor(random_state=seed), {
        "n_estimators": [600, 900, 1200],
        "learning_rate": [0.03, 0.05, 0.07],
        "max_depth": [3, 4],
        "subsample": [0.8, 1.0],
        "min_samples_leaf": [1, 2],
    })
    rf_best = _tune(RandomForestRegressor(random_state=seed, n_jobs=N_JOBS), {
        "n_estimators": [700, 1000, 1300],
        "max_depth": [None, 12, 16],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2],
        "max_features": ["sqrt", 0.8, 1.0],
    })
    et_best = _tune(ExtraTreesRegressor(random_state=seed, n_jobs=N_JOBS), {
        "n_estimators": [700, 1000, 1300],
        "max_depth": [None, 12, 16],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2],
        "max_features": ["sqrt", 0.8, 1.0],
    })
    hgb_best = _tune(HistGradientBoostingRegressor(random_state=seed), {
        "learning_rate": [0.03, 0.05, 0.08],
        "max_depth": [3, 6, 9],
        "l2_regularization": [0.0, 0.1, 0.5],
        "max_bins": [128, 255],
    })

    base_models = [("gb", gb_best), ("rf", rf_best), ("et", et_best), ("hgb", hgb_best)]

    # --- OOF for NNLS meta ---
    # Each column holds one model's predictions for rows it did not train on.
    kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)
    oof = np.zeros((len(Xtr), len(base_models)))
    for j, (nm, mdl) in enumerate(base_models):
        fold_pred = np.zeros(len(Xtr))
        for tr_idx, val_idx in kf.split(Xtr):
            mdl.fit(Xtr[tr_idx], y_train.iloc[tr_idx])
            fold_pred[val_idx] = mdl.predict(Xtr[val_idx])
        oof[:, j] = fold_pred

    # --- NNLS stack weights ---
    w_nnls, _ = nnls(oof, y_train.values)
    stack_train_raw = oof @ w_nnls

    # --- Linear calibration (Ridge) on TRAIN ---
    calib = Ridge(alpha=1.0, random_state=seed)
    calib.fit(stack_train_raw.reshape(-1,1), y_train.values)

    # --- Predict TEST ---
    # The OOF loop left every model fitted on the last fold only, so refit on all of TRAIN.
    base_preds_test = []
    for nm, mdl in base_models:
        mdl.fit(Xtr, y_train)
        base_preds_test.append(mdl.predict(Xte))
    base_preds_test = np.vstack(base_preds_test).T

    y_pred_raw = base_preds_test @ w_nnls
    y_pred_test = calib.predict(y_pred_raw.reshape(-1,1))

    # clip to [0, 15]
    y_pred_test = np.clip(y_pred_test, 0.0, 15.0)

    # metrics
    r2   = r2_score(y_test, y_pred_test)
    mae  = mean_absolute_error(y_test, y_pred_test)
    rmse_val = rmse(y_test, y_pred_test)

    # save predictions
    out = pd.DataFrame({
        "Player": n_test.values,
        "Actual_RB_Grade": y_test.values,
        "Predicted_RB_Grade": y_pred_test,
        "Error": y_pred_test - y_test.values
    }).sort_values("Actual_RB_Grade", ascending=False)
    out.to_csv(pred_path, index=False)

    # meta-weighted feature impact
    feature_order = list(X_train.columns)
    fi = {}
    for nm, mdl in base_models:
        if hasattr(mdl, "feature_importances_"):
            fi[nm] = pd.Series(mdl.feature_importances_, index=feature_order)
        else:
            # Model exposes no importances: spread its share evenly across features.
            fi[nm] = pd.Series(np.full(len(feature_order), 1.0 / len(feature_order)), index=feature_order)

    # The small epsilon keeps the divisions finite when a sum is zero.
    fi_norm = {k: v / (v.sum() + 1e-12) for k, v in fi.items()}
    model_weights = pd.Series(w_nnls, index=[nm for nm,_ in base_models])
    model_weights = model_weights / (model_weights.sum() + 1e-12)

    combined = sum(model_weights.get(nm, 0.0) * fi_norm[nm] for nm,_ in base_models)
    impact_df = pd.DataFrame({"Impact_MetaWeighted": combined})
    for nm,_ in base_models:
        impact_df[f"{nm.upper()}_Importance"] = fi_norm[nm]
    impact_df = impact_df.sort_values("Impact_MetaWeighted", ascending=False)
    # The feature names live in the index; writing with index=False produced a
    # table of importances with no way to tell which feature each row was.
    impact_df.to_csv(impact_path, index_label="Feature")

    print(f"\n=== Seed {seed} — TEST ===")
    print(f"R²: {r2:.4f} | MAE: {mae:.4f} | RMSE: {rmse_val:.4f} | MaxPred: {out['Predicted_RB_Grade'].max():.3f}")
    print("Top 15 on TEST (by Actual):")
    print(out.head(15).round(3).to_string(index=False))
    print(f"Saved predictions → {pred_path}")
    print(f"Saved feature impact → {impact_path}")

    return {"seed": seed, "R2": r2, "MAE": mae, "RMSE": rmse_val,
            "csv": str(pred_path), "impact_csv": str(impact_path)}

# -----------------------------
# Run across seeds
# -----------------------------
OUT_DIR.mkdir(parents=True, exist_ok=True)

# One independent split/fit per seed; each entry is the metrics dict returned by run_one.
results = list(map(run_one, SEEDS))
summary = pd.DataFrame(results)
summary = summary.sort_values("R2", ascending=False)

print("\n=== Summary across seeds (80/20) ===")
print(summary.to_string(index=False, float_format=lambda val: f"{val:.4f}"))

# After the descending sort, the first row is the seed with the highest test R².
best = summary.iloc[0]
print(f"\nBest seed: {int(best['seed'])}  |  R²={best['R2']:.4f}  MAE={best['MAE']:.4f}  RMSE={best['RMSE']:.4f}")
print(f"Test predictions CSV: {best['csv']}")
print(f"Feature impact CSV:   {best['impact_csv']}")


In [None]:
# ============================================================
# RB Grade — Non-negative linear "line of best fit" (NNLS)
#  - 80/20 train/test by seeds
#  - Interaction-only polynomial features (degree=2)
#  - Impute (median) + Standardize
#  - Non-negative least squares (weights >= 0)
#  - Intercept = y_train.mean()
#  - Predictions clipped to [0, 15]
#  - Saves predictions and coefficient tables per seed
# ============================================================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from scipy.optimize import nnls

# ---------------- Config ----------------
# Input table and output folder, both relative to the notebook's working directory.
CSV_PATH     = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
OUT_DIR      = Path("./data/Bakery/_derived")
TEST_SIZE    = 0.20                                          # fraction of rows held out
SEEDS        = [3, 7, 11, 19, 23, 29, 31, 37, 41, 42, 1337]  # one split/fit per seed
CLIP_RANGE   = (0.0, 15.0)                                   # (min, max) applied to predictions

# polynomial features
POLY_DEGREE           = 2
POLY_INTERACTION_ONLY = True  # only pairwise interactions, no squares

# columns (Draft Age included; still no Breakout Age)
# Canonical feature name -> accepted CSV header spellings, tried in order.
ALIASES = {
    "DOM++":         ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":       ["40 Time","Forty","40"],
    "BMI":           ["BMI"],
    "YPC":           ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":           ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":         ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":        ["Break%","Break %","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital": ["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":          ["Bama","Bama Rating","BamaAdj","BAMA"],
    "Shuttle":       ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":    ["3 Cone","Three Cone","3-Cone"],
    # The canonical spelling was missing from its own alias list; it is appended last
    # so any header that already matched keeps matching first.
    "Rec Yards":     ["Receiving Yards","Rec Yds","RecYds","Rec Yards"],
    "Draft Age":     ["Draft Age","Age at Draft","DraftAge","Age (Draft)","AgeDraft","Age_at_Draft"],
}
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]   # accepted headers for the target column
NAME_CANDS   = ["Player","Player Name","Name"]     # accepted headers for the player-name column

# -------------- helpers --------------
def find_col(frame, candidates):
    """Look up a column by any of several accepted spellings.

    Comparison ignores case and every whitespace character. The first
    candidate that matches decides the result; the column's real header is
    returned, or None if no candidate matches.
    """
    pattern = r"\s+"

    # Later headers overwrite earlier ones that normalise to the same key.
    by_key = {}
    for header in frame.columns:
        by_key[re.sub(pattern, "", header).lower()] = header

    for wanted in candidates:
        wanted_key = re.sub(pattern, "", wanted).lower()
        if wanted_key not in by_key:
            continue
        return by_key[wanted_key]
    return None

def to_num(series):
    """Coerce a column to numbers, tolerating common text decorations.

    Text values have percent signs, the word "round", a leading "r", a trailing
    ordinal suffix (st/nd/rd/th), thousands separators and any remaining
    characters other than digits, "." and "-" removed before parsing.
    Values that still fail to parse become NaN.

    Columns that already hold integers or floats are returned as a copy,
    untouched.
    """
    # Stringifying floats renders tiny/huge values in scientific notation
    # ("1e-05", "1e+20"); the character filter below would then strip the "e"
    # and "+" and yield NaN or a silently wrong number.
    if isinstance(series.dtype, np.dtype) and series.dtype.kind in "iuf":
        return series.copy()

    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def rmse(y_true, y_pred) -> float:
    """Square root of the mean squared error, returned as a built-in float."""
    squared_error = mean_squared_error(y_true, y_pred)
    root = np.sqrt(squared_error)
    return float(root)

# -------------- load --------------
# Read the raw table; headers are stripped so stray padding cannot defeat alias matching.
df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]

y_col    = find_col(df, TARGET_CANDS)
name_col = find_col(df, NAME_CANDS) or "Player"
if not y_col:
    raise ValueError(f"Target column not found. Available: {list(df.columns)}")

# Keep only the canonical features whose column actually exists in this CSV.
mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
mapped = {k:v for k,v in mapped.items() if v is not None}
if not mapped:
    raise ValueError("No usable feature columns found from ALIASES.")

X_all = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y_all = to_num(df[y_col])
if name_col in df.columns:
    # Fill blanks BEFORE casting: astype(str) turns NaN into the literal "nan",
    # after which fillna has nothing left to replace.
    names_all = df[name_col].fillna("").astype(str)
else:
    # No name column matched: label rows by position instead of failing with KeyError.
    names_all = pd.Series([f"row_{i}" for i in df.index], index=df.index)

# Rows without a numeric target cannot be used for training or scoring.
mask = y_all.notna()
X_all     = X_all.loc[mask].reset_index(drop=True)
y_all     = y_all.loc[mask].reset_index(drop=True)
names_all = names_all.loc[mask].reset_index(drop=True)

OUT_DIR.mkdir(parents=True, exist_ok=True)
base_feature_names = list(X_all.columns)

# -------------- runner --------------
def run_one(seed: int):
    """Fit and score the non-negative linear model for one seed.

    Steps: split with ``test_size=TEST_SIZE`` -> median imputation ->
    polynomial expansion (POLY_DEGREE / POLY_INTERACTION_ONLY, no bias column)
    -> standardisation -> NNLS on the mean-centred target -> predictions
    clipped to CLIP_RANGE.

    Side effects: writes a predictions CSV and a coefficient CSV under OUT_DIR
    and prints a report.

    Returns:
        dict with keys "seed", "R2", "MAE", "RMSE", "pred_csv", "coef_csv" and
        "coefs" (a pandas Series of weights indexed by feature name).
    """
    pred_path = OUT_DIR / f"rb_nnls_test_predictions_seed{seed}.csv"
    coef_path = OUT_DIR / f"rb_nnls_coeffs_seed{seed}.csv"

    X_train, X_test, y_train, y_test, _, n_test = train_test_split(
        X_all, y_all, names_all, test_size=TEST_SIZE, random_state=seed
    )

    # SimpleImputer discards columns that have no observed value at fit time,
    # which would leave fewer columns than names and make get_feature_names_out
    # raise. Drop such columns explicitly and name features from what remains.
    kept = [c for c in X_train.columns if X_train[c].notna().any()]
    X_train, X_test = X_train[kept], X_test[kept]

    # impute
    imputer = SimpleImputer(strategy="median")
    Xtr_base = imputer.fit_transform(X_train)
    Xte_base = imputer.transform(X_test)

    # polynomial interactions
    poly = PolynomialFeatures(
        degree=POLY_DEGREE,
        interaction_only=POLY_INTERACTION_ONLY,
        include_bias=False
    )
    Xtr_poly = poly.fit_transform(Xtr_base)
    Xte_poly = poly.transform(Xte_base)
    feature_names = poly.get_feature_names_out(kept)

    # standardize
    scaler = StandardScaler()
    Xtr = scaler.fit_transform(Xtr_poly)
    Xte = scaler.transform(Xte_poly)

    # ---------- Non-negative least squares ----------
    # center y so the intercept = y_mean (keeps weights non-negative)
    # NOTE(review): no sign inversion is applied to "lower is better" inputs in
    # this cell, and NNLS cannot give a feature a negative weight — confirm
    # that leaving them un-inverted is intended.
    y_mean = float(y_train.mean())
    y_center = y_train - y_mean

    # NNLS solve: min ||X w - y_center||  s.t. w >= 0
    w, _ = nnls(Xtr, y_center.to_numpy())

    # predictions: add back mean, clip to [0, 15]
    y_pred = y_mean + Xte @ w
    y_pred = np.clip(y_pred, CLIP_RANGE[0], CLIP_RANGE[1])

    # metrics
    r2   = r2_score(y_test, y_pred)
    mae  = mean_absolute_error(y_test, y_pred)
    rmse_val = rmse(y_test, y_pred)

    # outputs
    pred_df = pd.DataFrame({
        "Player": n_test.values,
        "Actual_RB_Grade": y_test.values,
        "Predicted_RB_Grade": y_pred,
        "Error": y_pred - y_test.values
    }).sort_values("Actual_RB_Grade", ascending=False)
    pred_df.to_csv(pred_path, index=False)

    coef = pd.Series(w, index=feature_names, name="weight").sort_values(ascending=False)
    # all weights must be >= 0 by construction
    assert (coef >= -1e-12).all(), "Found a negative weight — should not happen with NNLS."
    coef.to_csv(coef_path, header=True)

    print(f"\n=== Seed {seed} — TEST (NNLS, non-negative weights) ===")
    print(f"R²: {r2:.4f} | MAE: {mae:.4f} | RMSE: {rmse_val:.4f} | MaxPred: {pred_df['Predicted_RB_Grade'].max():.3f}")
    print("Top 15 (by Actual):")
    print(pred_df.head(15).round(3).to_string(index=False))
    print(f"Saved predictions → {pred_path}")
    print(f"Saved coefficients → {coef_path}")

    return {"seed": seed, "R2": r2, "MAE": mae, "RMSE": rmse_val, "pred_csv": str(pred_path), "coef_csv": str(coef_path), "coefs": coef}

# -------------- run all seeds --------------
results = list(map(run_one, SEEDS))

# The per-seed coefficient Series does not belong in the flat metrics table.
summary_rows = []
for res in results:
    summary_rows.append({key: val for key, val in res.items() if key != 'coefs'})
summary = pd.DataFrame(summary_rows).sort_values("R2", ascending=False)

print("\n=== Summary across seeds (NNLS, non-negative weights) ===")
print(summary.to_string(index=False, float_format=lambda val: f"{val:.4f}"))

# Aggregate coefficient stability across seeds
# One column per seed, outer-joined on feature name.
first = results[0]
coef_df = first["coefs"].to_frame(name=f"seed_{first['seed']}")
for res in results[1:]:
    seed_column = res["coefs"].rename(f"seed_{res['seed']}")
    coef_df = coef_df.join(seed_column, how="outer")

# Rows where the statistic is undefined are reported as 0.0.
coef_mean = coef_df.mean(axis=1).fillna(0.0)
coef_std  = coef_df.std(axis=1).fillna(0.0)

coef_agg = pd.DataFrame({"weight_mean": coef_mean, "weight_std": coef_std})
ranked_index = coef_agg["weight_mean"].sort_values(ascending=False).index
coef_agg = coef_agg.reindex(ranked_index)

agg_path = OUT_DIR / "rb_nnls_coeffs_aggregate.csv"
coef_agg.to_csv(agg_path)

print(f"\nSaved aggregate coefficient table → {agg_path}")
print("Top 20 features by mean weight (non-negative):")
print(coef_agg.head(20).round(5))


In [None]:
# ============================================================
# RB Grade — Lean model with minimal interactions (Fixed)
#   • Greedy forward selection now seeds with best single feature
#   • Won't evaluate CV with 0 columns
#   • Interactions only considered when both parents are selected
#   • 80/20 train–test; GB tuned; predictions clipped to [0, 15]
# ============================================================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

# ---------------- Config ----------------
# Input table and output folder, both relative to the notebook's working directory.
CSV_PATH    = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
OUT_DIR     = Path("./data/Bakery/_derived")
TEST_SIZE   = 0.20              # fraction of rows held out
SEEDS       = [42, 1337, 7]     # one split/selection/fit per seed
CV_FOLDS    = 5                 # folds used by greedy selection and final tuning
CLIP_RANGE  = (0.0, 15.0)       # (min, max) applied to predictions
IMPROVE_MIN = 0.002     # minimum CV R² gain to accept a new feature

# Candidate inputs for greedy selection; only those found in the CSV are used.
BASE_FEATURES = [
    "DOM++","40 Time","BMI","YPC","ELU","YCO/A","Break%","Draft Capital","Bama","Draft Age",
    # "Shuttle","Three Cone","Rec Yards",  # optional if present
]

# Interaction column name -> (parent A, parent B); the column is the product A * B.
INTERACTIONS = {
    "DOMxDraft": ("DOM++", "Draft Capital"),
    "YPCxELU":   ("YPC",   "ELU"),
    # "ELUxYCOA":  ("ELU",   "YCO/A"),  # optional if you want to allow it
}

# Canonical feature name -> accepted CSV header spellings, tried in order.
ALIASES = {
    "DOM++":         ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":       ["40 Time","Forty","40"],
    "BMI":           ["BMI"],
    "YPC":           ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":           ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":         ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":        ["Break%","Break %","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital": ["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":          ["Bama","Bama Rating","BamaAdj","BAMA"],
    "Shuttle":       ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":    ["3 Cone","Three Cone","3-Cone"],
    # The canonical spelling was missing from its own alias list, so enabling the
    # optional "Rec Yards" feature could not match a column with that exact header.
    "Rec Yards":     ["Receiving Yards","Rec Yds","RecYds","Rec Yards"],
    "Draft Age":     ["Draft Age","Age at Draft","DraftAge","Age (Draft)","AgeDraft","Age_at_Draft"],
}
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]   # accepted headers for the target column
NAME_CANDS   = ["Player","Player Name","Name"]     # accepted headers for the player-name column

# ---------- helpers ----------
def find_col(frame, candidates):
    """Resolve a list of alternative header names to an actual column.

    Headers and candidates are compared after dropping all whitespace and
    lower-casing. Returns the real header for the earliest matching candidate,
    or None when none of them is present.
    """
    def _key(text):
        return re.sub(r"\s+", "", text).lower()

    # dict(...) keeps the last header for any duplicated normalised key.
    headers = list(frame.columns)
    index = dict(zip(map(_key, headers), headers))

    matches = [index[_key(name)] for name in candidates if _key(name) in index]
    return matches[0] if matches else None

def to_num(series):
    """Coerce a column to numbers, tolerating common text decorations.

    Text values have percent signs, the word "round", a leading "r", a trailing
    ordinal suffix (st/nd/rd/th), thousands separators and any remaining
    characters other than digits, "." and "-" removed before parsing.
    Values that still fail to parse become NaN.

    Columns that already hold integers or floats are returned as a copy,
    untouched.
    """
    # Stringifying floats renders tiny/huge values in scientific notation
    # ("1e-05", "1e+20"); the character filter below would then strip the "e"
    # and "+" and yield NaN or a silently wrong number.
    if isinstance(series.dtype, np.dtype) and series.dtype.kind in "iuf":
        return series.copy()

    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def rmse(y_true, y_pred):
    """Root-mean-squared error between ``y_true`` and ``y_pred`` as a built-in float."""
    # mean_squared_error is imported at the top of this cell; the function-local
    # re-import was redundant.
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

def add_interactions(X, use_interactions):
    """Return a copy of ``X`` with the requested interaction columns appended.

    Every entry of INTERACTIONS whose name is in ``use_interactions`` and whose
    two parent columns both exist in the frame is added as their product.
    """
    out = X.copy()
    for label, parents in INTERACTIONS.items():
        if label not in use_interactions:
            continue
        left, right = parents
        if left in out.columns and right in out.columns:
            out[label] = out[left] * out[right]
    return out

def build_X(df, mapped_cols, use_interactions=None, invert_lower_better=True):
    """Assemble the numeric feature frame from the raw table.

    Args:
        df: raw DataFrame.
        mapped_cols: mapping of canonical feature name -> column name in ``df``.
        use_interactions: interaction names to append; nothing is added when falsy.
        invert_lower_better: when True, negate the "lower is better" columns
            that are present.

    Returns:
        DataFrame with one column per mapped feature, plus any interactions.
    """
    numeric = {}
    for feature, source_col in mapped_cols.items():
        numeric[feature] = to_num(df[source_col])
    features = pd.DataFrame(numeric)

    if invert_lower_better:
        lower_is_better = ("40 Time", "Draft Capital", "Shuttle", "Three Cone", "Draft Age")
        for col in lower_is_better:
            if col in features.columns:
                features[col] = -features[col]

    if not use_interactions:
        return features
    return add_interactions(features, use_interactions)

# ---------- load ----------
# Read the raw table; headers are stripped so stray padding cannot defeat alias matching.
df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]

y_col    = find_col(df, TARGET_CANDS)
name_col = find_col(df, NAME_CANDS) or "Player"
if not y_col:
    raise ValueError(f"Target column not found. Available: {list(df.columns)}")

# Resolve each base feature to a real column; features with no match are left out.
mapped = {}
for feat in BASE_FEATURES:
    col = find_col(df, ALIASES.get(feat, [feat]))
    if col is not None:
        mapped[feat] = col

available_features = list(mapped.keys())
if not available_features:
    raise ValueError("None of the base features were found in the CSV.")

X0 = build_X(df, mapped, use_interactions=None)
y  = to_num(df[y_col])
if name_col in df.columns:
    # Fill blanks BEFORE casting: astype(str) turns NaN into the literal "nan",
    # after which fillna has nothing left to replace.
    names = df[name_col].fillna("").astype(str)
else:
    # No name column matched: label rows by position instead of failing with KeyError.
    names = pd.Series([f"row_{i}" for i in df.index], index=df.index)

# Rows without a numeric target cannot be used for training or scoring.
mask = y.notna()
X0    = X0.loc[mask].reset_index(drop=True)
y     = y.loc[mask].reset_index(drop=True)
names = names.loc[mask].reset_index(drop=True)

# An interaction is only a candidate when both of its parents exist in the data.
allowed_interactions = []
for name, (a,b) in INTERACTIONS.items():
    if a in X0.columns and b in X0.columns:
        allowed_interactions.append(name)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# --------- greedy forward selection (fixed) ---------
def greedy_select(X_train_df, y_train, base_feats, interactions, random_state=42):
    """Greedy forward selection of base features and interactions by CV R².

    Starts from the single base feature with the best cross-validated R², then
    repeatedly adds the one candidate (base feature or eligible interaction)
    that raises the score the most, stopping when the best gain is below
    IMPROVE_MIN.

    Args:
        X_train_df: training features as a DataFrame.
        y_train: training target.
        base_feats: candidate base feature names (columns of ``X_train_df``).
        interactions: list of candidate interaction names (keys of INTERACTIONS).
        random_state: seed for the scoring model and the KFold shuffle.

    Returns:
        (selected_feats, selected_inters, best) where ``best`` is the CV R² of
        the final selection.
    """
    imp = SimpleImputer(strategy="median")

    def cv_score(cols, inters):
        """Mean CV_FOLDS-fold R² of a fixed-parameter GB model on ``cols`` plus ``inters``."""
        # guard: no columns -> invalid
        if len(cols) == 0:
            return -1e12
        X_use = X_train_df[cols].copy()
        if inters:
            for iname, (a,b) in INTERACTIONS.items():
                if iname in inters and a in X_use.columns and b in X_use.columns:
                    X_use[iname] = X_use[a]*X_use[b]
        # The imputer is fitted on the whole training frame, before the folds are made.
        Xt = imp.fit_transform(X_use)
        gb = GradientBoostingRegressor(
            random_state=random_state,
            n_estimators=600, learning_rate=0.05, max_depth=3, subsample=0.9,
            min_samples_leaf=2
        )
        kf = KFold(n_splits=CV_FOLDS, shuffle=True, random_state=random_state)
        cv = cross_val_score(gb, Xt, y_train, scoring="r2", cv=kf)
        return float(np.mean(cv))

    # --- Step 1: pick the best single feature to seed ---
    best_first_feat = None
    best_first_score = -1e12
    for f in base_feats:
        sc = cv_score([f], [])
        if sc > best_first_score:
            best_first_score = sc
            best_first_feat = f
    if best_first_feat is None:
        # fallback: at least take one feature arbitrarily to avoid empty set
        best_first_feat = base_feats[0]
        best_first_score = cv_score([best_first_feat], [])

    selected_feats = [best_first_feat]
    selected_inters = []
    best = best_first_score

    # remaining candidates
    feat_cands = [f for f in base_feats if f != best_first_feat]
    inter_cands = interactions.copy()

    # Each pass adds at most one candidate: the single best-scoring addition.
    while True:
        improved = False
        best_try = None
        best_score = best

        # try adding one more base feature
        for f in feat_cands:
            score = cv_score(selected_feats + [f], selected_inters)
            if score > best_score + 1e-9:
                best_try = ("feat", f)
                best_score = score

        # try adding an interaction ONLY if its parents are already selected
        # (at most two interactions are ever selected)
        if len(selected_inters) < 2:
            for inter in inter_cands:
                a, b = INTERACTIONS[inter]
                if a in selected_feats and b in selected_feats:
                    score = cv_score(selected_feats, selected_inters + [inter])
                    if score > best_score + 1e-9:
                        best_try = ("inter", inter)
                        best_score = score

        # Accept the round's winner only if it clears the minimum-gain threshold.
        if best_try and (best_score - best) >= IMPROVE_MIN:
            if best_try[0] == "feat":
                f = best_try[1]
                selected_feats.append(f)
                feat_cands.remove(f)
            else:
                i = best_try[1]
                selected_inters.append(i)
                inter_cands.remove(i)
            best = best_score
            improved = True

        if not improved:
            break

    return selected_feats, selected_inters, best

# --------- one full train/test run ----------
def run_one(seed: int):
    """Select features, tune a gradient-boosting model and score it for one seed.

    Steps: split X0 / y / names with ``test_size=TEST_SIZE`` -> greedy feature
    and interaction selection on the training split -> median imputation ->
    RandomizedSearchCV over GB hyperparameters -> test predictions clipped to
    CLIP_RANGE.

    Side effects: writes a predictions CSV under OUT_DIR and prints a report.

    Returns:
        dict with keys "seed", "R2", "MAE", "RMSE", "cvR2", "pred_csv",
        "features", "interactions".
    """
    pred_path = OUT_DIR / f"lean_gb_predictions_seed{seed}.csv"

    X_train_df, X_test_df, y_train, y_test, _, n_test = train_test_split(
        X0, y, names, test_size=TEST_SIZE, random_state=seed
    )

    feats_sel, inters_sel, cv_r2 = greedy_select(
        X_train_df, y_train,
        base_feats=[f for f in available_features if f in X_train_df.columns],
        interactions=allowed_interactions,
        random_state=seed
    )

    # build final train/test matrices with the selected set
    X_train = X_train_df[feats_sel].copy()
    X_test  = X_test_df[feats_sel].copy()
    for iname in inters_sel:
        a, b = INTERACTIONS[iname]
        X_train[iname] = X_train[a] * X_train[b]
        X_test[iname]  = X_test[a] * X_test[b]

    imp = SimpleImputer(strategy="median")
    Xtr = imp.fit_transform(X_train)
    Xte = imp.transform(X_test)

    # final tuning
    gb = GradientBoostingRegressor(random_state=seed)
    param = {
        "n_estimators": [700, 900, 1200],
        "learning_rate": [0.03, 0.05, 0.07],
        "max_depth": [3, 4],
        "subsample": [0.85, 1.0],
        "min_samples_leaf": [1, 2],
    }
    # Cast to a built-in int so n_iter is never a NumPy scalar.
    grid_size = int(np.prod([len(v) for v in param.values()]))
    search = RandomizedSearchCV(
        gb, param, n_iter=min(12, grid_size),
        scoring="r2", cv=CV_FOLDS, random_state=seed, n_jobs=-1
    ).fit(Xtr, y_train)

    # With the default refit=True the search has already refit the winning
    # configuration on all of Xtr, so fitting it again only repeats the work.
    best_gb = search.best_estimator_

    y_pred = best_gb.predict(Xte)
    y_pred = np.clip(y_pred, CLIP_RANGE[0], CLIP_RANGE[1])

    r2   = r2_score(y_test, y_pred)
    mae  = mean_absolute_error(y_test, y_pred)
    rse  = rmse(y_test, y_pred)

    out = pd.DataFrame({
        "Player": n_test.values,
        "Actual_RB_Grade": y_test.values,
        "Predicted_RB_Grade": y_pred,
        "Error": y_pred - y_test.values
    }).sort_values("Actual_RB_Grade", ascending=False)
    out.to_csv(pred_path, index=False)

    print(f"\n=== Seed {seed} — TEST (lean GB) ===")
    print(f"Selected features ({len(feats_sel)}): {feats_sel}")
    print(f"Selected interactions ({len(inters_sel)}): {inters_sel}")
    print(f"Greedy CV R² (train only): {cv_r2:.4f}")
    print(f"TEST  R²: {r2:.4f} | MAE: {mae:.4f} | RMSE: {rse:.4f} | MaxPred: {out['Predicted_RB_Grade'].max():.3f}")
    print("\nTop 15 by Actual (TEST):")
    print(out.head(15).round(3).to_string(index=False))
    print(f"\nSaved predictions → {pred_path}")

    return {
        "seed": seed, "R2": r2, "MAE": mae, "RMSE": rse,
        "cvR2": cv_r2, "pred_csv": str(pred_path),
        "features": feats_sel, "interactions": inters_sel
    }

# --------- run across seeds ----------
OUT_DIR.mkdir(parents=True, exist_ok=True)

# One independent split/selection/fit per seed, ranked by test R².
results = list(map(run_one, SEEDS))
summary = pd.DataFrame(results)
summary = summary.sort_values("R2", ascending=False)

print("\n=== Summary across seeds (lean features) ===")
report = summary[["seed","R2","MAE","RMSE","cvR2","features","interactions"]]
print(report.to_string(index=False, float_format=lambda val: f"{val:.4f}"))


In [None]:
# ===============================================================
# RB Grade — Wide search over feature subsets & hyperparameters
# - Randomly sample feature combos (+ limited interactions)
# - Tune multiple models (GB, RF, ET, HGB) with RandomizedSearchCV
# - 80/20 train-test; CV on TRAIN only; metrics on TEST
# - Predictions clipped to [0, 15]
# - Logs leaderboard + meta for reproducibility
# ===============================================================
import re, json, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import (
    GradientBoostingRegressor, RandomForestRegressor,
    ExtraTreesRegressor, HistGradientBoostingRegressor
)

# ---------------- Search controls ----------------
# Input table and output folder, both relative to the notebook's working directory.
CSV_PATH            = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
OUT_DIR             = Path("./data/Bakery/_derived")
TEST_SIZE           = 0.20

SEEDS               = [42, 1337, 7]   # run search for multiple seeds
N_SUBSETS           = 60              # random feature subsets per seed
MAX_BASE_FEATS      = 7               # cap base features per subset
MAX_INTERACTIONS    = 2               # cap interactions per subset
N_ITER_PER_MODEL    = 20              # RandomizedSearchCV iterations per model
CV_FOLDS            = 5

CLIP_MIN, CLIP_MAX  = 0.0, 15.0       # clip predictions

# ---------------- Feature space ----------------
# Candidate inputs; only those found in the CSV are used.
BASE_FEATURES = [
    "DOM++","40 Time","BMI","YPC","ELU","YCO/A","Break%","Draft Capital","Bama","Draft Age"
]

# Interaction column name -> (parent A, parent B); the column is the product A * B.
INTERACTIONS = {
    "DOMxDraft": ("DOM++", "Draft Capital"),
    "YPCxELU":   ("YPC",   "ELU"),
    "ELUxYCOA":  ("ELU",   "YCO/A"),
}

# Canonical feature name -> accepted CSV header spellings, tried in order.
ALIASES = {
    "DOM++":         ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":       ["40 Time","Forty","40"],
    "BMI":           ["BMI"],
    "YPC":           ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":           ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":         ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":        ["Break%","Break %","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital": ["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":          ["Bama","Bama Rating","BamaAdj","BAMA"],
    "Shuttle":       ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":    ["3 Cone","Three Cone","3-Cone"],
    # The canonical spelling was missing from its own alias list; it is appended last
    # so any header that already matched keeps matching first.
    "Rec Yards":     ["Receiving Yards","Rec Yds","RecYds","Rec Yards"],
    "Draft Age":     ["Draft Age","Age at Draft","DraftAge","Age (Draft)","AgeDraft","Age_at_Draft"],
}
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]   # accepted headers for the target column
NAME_CANDS   = ["Player","Player Name","Name"]     # accepted headers for the player-name column

# ---------------- Utilities ----------------
def find_col(frame, candidates):
    """Find which column of ``frame`` corresponds to one of ``candidates``.

    Matching is case-insensitive and ignores whitespace. The candidates are
    checked in order and the real header of the first hit is returned; the
    result is None when there is no hit.
    """
    strip_ws = lambda text: re.sub(r"\s+", "", text).lower()

    # Duplicated normalised keys resolve to the later header.
    normalised = {}
    for header in frame.columns:
        normalised[strip_ws(header)] = header

    found = None
    for option in candidates:
        probe = strip_ws(option)
        if probe in normalised:
            found = normalised[probe]
            break
    return found

def to_num(series):
    """Coerce a column to numbers, tolerating common text decorations.

    Text values have percent signs, the word "round", a leading "r", a trailing
    ordinal suffix (st/nd/rd/th), thousands separators and any remaining
    characters other than digits, "." and "-" removed before parsing.
    Values that still fail to parse become NaN.

    Columns that already hold integers or floats are returned as a copy,
    untouched.
    """
    # Stringifying floats renders tiny/huge values in scientific notation
    # ("1e-05", "1e+20"); the character filter below would then strip the "e"
    # and "+" and yield NaN or a silently wrong number.
    if isinstance(series.dtype, np.dtype) and series.dtype.kind in "iuf":
        return series.copy()

    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def rmse(y_true, y_pred):
    """Root-mean-squared error between ``y_true`` and ``y_pred`` as a built-in float."""
    mean_sq = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mean_sq))

def invert_cols(X):
    """Negate the "lower is better" columns of ``X`` and return the same frame.

    The frame is modified in place (no copy is made), so calling this twice on
    the same object restores the original signs.
    """
    lower_is_better = ("40 Time", "Draft Capital", "Shuttle", "Three Cone", "Draft Age")
    present = [col for col in lower_is_better if col in X.columns]
    for col in present:
        X[col] = -X[col]
    return X

def add_interactions(X, inter_names):
    """Return a copy of ``X`` extended with the requested product features.

    Each name is looked up in the module-level INTERACTIONS table; the product
    column is added only when both of its operand columns are present.
    """
    augmented = X.copy()
    for label in inter_names:
        left, right = INTERACTIONS[label]
        if left not in augmented.columns or right not in augmented.columns:
            continue
        augmented[label] = augmented[left] * augmented[right]
    return augmented

# ---------------- Load & prep ----------------
df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]

y_col    = find_col(df, TARGET_CANDS)
name_col = find_col(df, NAME_CANDS)
if not y_col:
    raise ValueError(f"Could not find target column among {TARGET_CANDS}")

# Resolve each base feature to the header actually present in the CSV;
# features without a matching header are left out of the search.
mapped = {}
for feat in BASE_FEATURES:
    col = find_col(df, ALIASES.get(feat, [feat]))
    if col is not None:
        mapped[feat] = col
if not mapped:
    raise ValueError("No usable feature columns found among BASE_FEATURES.")

# Only interactions whose two operands were both found can be sampled.
ALLOWED_INTERS = {k:v for k,v in INTERACTIONS.items() if v[0] in mapped and v[1] in mapped}

X_all_raw = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y_all     = to_num(df[y_col])
if name_col is None:
    # No recognizable name column: label rows by index instead of failing on df["Player"].
    names_all = pd.Series([f"row_{i}" for i in df.index], index=df.index)
else:
    # astype(str) alone turns NaN into the text "nan", so blank missing names explicitly.
    names_all = df[name_col].astype(str).where(df[name_col].notna(), "")

# Keep only rows that have a usable target value.
mask = y_all.notna()
X_all_raw, y_all, names_all = X_all_raw.loc[mask].reset_index(drop=True), y_all.loc[mask].reset_index(drop=True), names_all.loc[mask].reset_index(drop=True)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------------- Model spaces ----------------
def model_spaces(random_state):
    """Return the candidate model families for one search round.

    Each entry is a ``(tag, estimator, param_grid)`` triple, and every
    estimator is seeded with ``random_state``.
    """
    def forest_grid():
        # RF and ET use the same search space; hand each its own fresh dict.
        return {
            "n_estimators": [600, 900, 1200, 1500],
            "max_depth": [None, 12, 16, 20],
            "min_samples_split": [2, 4, 6],
            "min_samples_leaf": [1, 2, 3],
            "max_features": ["sqrt", 0.7, 0.9, 1.0],
        }

    gb_grid = {
        "n_estimators": [400, 600, 800, 1000, 1200],
        "learning_rate": [0.03, 0.05, 0.07, 0.1],
        "max_depth": [2, 3, 4],
        "subsample": [0.8, 0.9, 1.0],
        "min_samples_leaf": [1, 2, 3],
    }
    hgb_grid = {
        "learning_rate": [0.03, 0.05, 0.08, 0.1],
        "max_depth": [3, 6, 9],
        "l2_regularization": [0.0, 0.1, 0.3, 0.5],
        "max_bins": [128, 255],
    }

    spaces = []
    spaces.append(("GB", GradientBoostingRegressor(random_state=random_state), gb_grid))
    spaces.append(("RF", RandomForestRegressor(random_state=random_state, n_jobs=-1), forest_grid()))
    spaces.append(("ET", ExtraTreesRegressor(random_state=random_state, n_jobs=-1), forest_grid()))
    spaces.append(("HGB", HistGradientBoostingRegressor(random_state=random_state), hgb_grid))
    return spaces

# ---------------- Random subset generator ----------------
def sample_subset(rng, base_pool, max_bases, allowed_inters, max_inters):
    """Draw a random feature subset plus compatible interaction terms.

    Args:
        rng: ``numpy.random.Generator`` that supplies all randomness.
        base_pool: list of base feature names to draw from.
        max_bases: largest number of base features to pick.
        allowed_inters: mapping ``name -> (feature_a, feature_b)``.
        max_inters: largest number of interaction terms to pick.

    Returns:
        ``(bases, inter_names)``: the chosen base features and the chosen
        interaction names. An interaction is eligible only when both of its
        operands are among the chosen bases.
    """
    # Normally at least 3 bases are drawn. Clamp the lower bound to the upper
    # one so a small pool or max_bases < 3 no longer makes rng.integers raise
    # "low >= high". For max_bases >= 3 the bounds are the same as before.
    upper = max(0, min(max_bases, len(base_pool)))
    lower = min(3, upper)
    n_bases = rng.integers(low=lower, high=upper + 1)
    bases = rng.choice(base_pool, size=int(n_bases), replace=False).tolist()

    inter_names = []
    if allowed_inters and max_inters > 0:
        eligible = [name for name,(a,b) in allowed_inters.items() if a in bases and b in bases]
        if eligible:
            # k may be 0: a subset is allowed to carry no interactions at all.
            k = rng.integers(low=0, high=min(max_inters, len(eligible)) + 1)
            if k > 0:
                inter_names = rng.choice(eligible, size=int(k), replace=False).tolist()
    return bases, inter_names

# ---------------- One full search (per seed) ----------------
def run_seed(seed: int):
    """Run the complete feature-subset / model search for one seed.

    The module-level data (X_all_raw, y_all, names_all) is split into train and
    test with ``seed``. N_SUBSETS feature subsets are evaluated: the first is
    the best single-feature baseline (by CV R² on train), the rest come from
    sample_subset. For each subset every model family from model_spaces is
    tuned with RandomizedSearchCV on the training data, and the family with the
    best CV score is evaluated on the test data.

    Files written to OUT_DIR: the top-15 leaderboard CSV, the test predictions
    of the top leaderboard row, and a JSON meta file.

    Returns:
        dict: the meta record that was written to the JSON file.
    """
    pred_path  = OUT_DIR / f"rb_wide_best_preds_seed{seed}.csv"
    meta_path  = OUT_DIR / f"rb_wide_best_meta_seed{seed}.json"
    board_path = OUT_DIR / f"rb_wide_leaderboard_seed{seed}.csv"

    rng = np.random.default_rng(seed)

    X_tr_raw, X_te_raw, y_tr, y_te, _n_tr, n_te = train_test_split(
        X_all_raw, y_all, names_all, test_size=TEST_SIZE, random_state=seed
    )

    leaderboard = []
    preds_by_subset = {}   # subset_idx -> test predictions, so the winner can be saved

    for subset_idx in range(N_SUBSETS):
        if subset_idx == 0:
            # best single feature baseline
            best_feat, best_cv = None, -1e9
            for f in X_tr_raw.columns:
                imp = SimpleImputer(strategy="median")
                X_single = imp.fit_transform(X_tr_raw[[f]])
                kf = KFold(n_splits=CV_FOLDS, shuffle=True, random_state=seed)
                cv = cross_val_score(GradientBoostingRegressor(random_state=seed), X_single, y_tr, scoring="r2", cv=kf).mean()
                if cv > best_cv: best_cv, best_feat = cv, f
            bases, inters = [best_feat], []
        else:
            bases, inters = sample_subset(rng, list(X_tr_raw.columns), MAX_BASE_FEATS, ALLOWED_INTERS, MAX_INTERACTIONS)

        # build matrices (interaction products use the raw, not yet inverted, columns)
        Xtr_df, Xte_df = X_tr_raw[bases].copy(), X_te_raw[bases].copy()
        for iname in inters:
            a,b = INTERACTIONS[iname]
            Xtr_df[iname], Xte_df[iname] = Xtr_df[a]*Xtr_df[b], Xte_df[a]*Xte_df[b]
        Xtr_df, Xte_df = invert_cols(Xtr_df), invert_cols(Xte_df)

        imp = SimpleImputer(strategy="median")
        Xtr, Xte = imp.fit_transform(Xtr_df), imp.transform(Xte_df)

        # tune models
        best_cv, best_tag, best_est = -1e9, None, None
        for tag, est, grid in model_spaces(seed):
            # never ask for more iterations than the grid has combinations
            n_iter = min(N_ITER_PER_MODEL, int(np.prod([len(v) for v in grid.values()])))
            search = RandomizedSearchCV(est, grid, n_iter=n_iter, scoring="r2", cv=CV_FOLDS, random_state=seed, n_jobs=-1)
            search.fit(Xtr, y_tr)
            if search.best_score_ > best_cv:
                best_cv, best_tag, best_est = search.best_score_, tag, search.best_estimator_

        # RandomizedSearchCV refits best_estimator_ on all of Xtr by default,
        # so no additional fit is needed before predicting.
        y_pred = np.clip(best_est.predict(Xte), CLIP_MIN, CLIP_MAX)

        leaderboard.append({
            "seed": seed, "subset_idx": subset_idx, "model": best_tag,
            "cvR2_mean": best_cv, "TEST_R2": r2_score(y_te, y_pred),
            "TEST_MAE": mean_absolute_error(y_te, y_pred),
            "TEST_RMSE": rmse(y_te, y_pred),
            "n_features": len(bases) + len(inters),
            "bases": "|".join(bases), "interactions": "|".join(inters),
            "max_pred": float(np.max(y_pred)),
        })
        preds_by_subset[subset_idx] = pd.DataFrame({
            "Player": n_te.values,
            "Actual_RB_Grade": y_te.values,
            "Predicted_RB_Grade": y_pred,
            "Error": y_pred - y_te.values,
        })

    # leaderboard
    # NOTE(review): rows are ranked by TEST_R2, so the "best" subset is chosen on
    # the test set and its test metrics are optimistic. Confirm this is intended.
    board = pd.DataFrame(leaderboard).sort_values(["TEST_R2","cvR2_mean"], ascending=False).head(15)
    board.to_csv(board_path, index=False)

    best_row = board.iloc[0]
    print(f"\n=== Seed {seed} — top of board ===")
    print(board[["subset_idx","model","n_features","cvR2_mean","TEST_R2","TEST_MAE","TEST_RMSE","bases","interactions"]]
          .to_string(index=False, float_format=lambda x: f"{x:.4f}"))

    # save the winner's test predictions; the meta file points at this CSV
    best_preds = preds_by_subset[int(best_row["subset_idx"])]
    best_preds.sort_values("Actual_RB_Grade", ascending=False).to_csv(pred_path, index=False)

    # save meta
    meta = {
        "seed": seed, "leaderboard_csv": str(board_path), "best_predictions_csv": str(pred_path),
        "best_bases": best_row["bases"].split("|"), "best_interactions": best_row["interactions"].split("|") if best_row["interactions"] else [],
        "best_model_tag": best_row["model"], "best_cvR2": float(best_row["cvR2_mean"]),
        "best_test_R2": float(best_row["TEST_R2"]), "best_test_MAE": float(best_row["TEST_MAE"]),
        "best_test_RMSE": float(best_row["TEST_RMSE"]), "max_pred_test": float(best_row["max_pred"])
    }
    with open(meta_path, "w") as f: json.dump(meta, f, indent=2)

    return meta

# ---------------- Run across seeds & summarize ----------------
# One search per configured seed; each call returns that run's meta record.
all_meta = []
for seed_value in SEEDS:
    all_meta.append(run_seed(seed_value))

# Best test R² first.
summary = pd.DataFrame(all_meta).sort_values("best_test_R2", ascending=False)

summary_cols = [
    "seed", "best_cvR2", "best_test_R2", "best_test_MAE",
    "best_test_RMSE", "best_model_tag", "best_bases", "best_interactions",
]
print("\n=== Overall summary across seeds ===")
print(summary[summary_cols].to_string(index=False, float_format="{:.4f}".format))


In [None]:
# ===============================================================
# Subset 28 reproducer (seed=42, HGB model, 5 features)
# Features: ELU, Draft Capital, Draft Age, DOM++, BMI
# Output: CSV with Player, Actual_RB_Grade, Predicted_RB_Grade, Error
# ===============================================================
import re, json, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import HistGradientBoostingRegressor

# ---------------- Config ----------------
# Input table and output folder.
CSV_PATH = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
OUT_DIR = Path("./data/Bakery/_derived")

# Split / CV settings. Subset 28 came out of the seed-42 search.
SEED = 42
TEST_SIZE = 0.20
CV_FOLDS = 5

# Predictions are clipped to this range.
CLIP_MIN = 0.0
CLIP_MAX = 15.0

# The five features of the winning subset.
WIN_FEATURES = [
    "ELU",
    "Draft Capital",
    "Draft Age",
    "DOM++",
    "BMI",
]

# Accepted header spellings per feature, tried in order.
ALIASES = {
    "DOM++": ["DOM++", "DOMpp", "DOM_plus_plus", "DOMpp_Weighted", "DOM"],
    "BMI": ["BMI"],
    "ELU": ["ELU", "Elusiveness", "Elusiveness Rating"],
    "Draft Capital": ["Draft Capital", "Draft Cap", "Draft Round", "Round", "Rnd"],
    "Draft Age": [
        "Draft Age", "Age at Draft", "DraftAge",
        "Age (Draft)", "AgeDraft", "Age_at_Draft",
    ],
}
TARGET_CANDS = ["RB Grade", "RBGrade", "RB_Grade"]
NAME_CANDS = ["Player", "Player Name", "Name"]

# HistGradientBoosting search space; cross-validation picks the best combination.
HGB_PARAM = {
    "learning_rate": [0.03, 0.05, 0.08, 0.1],
    "max_depth": [3, 6, 9],
    "l2_regularization": [0.0, 0.1, 0.3, 0.5],
    "max_bins": [128, 255],
}

# ---------------- Helpers ----------------
def find_col(frame, candidates):
    """Return the first column of ``frame`` that matches one of ``candidates``.

    Names are compared after removing all whitespace and lower-casing.
    Candidates are tried in order; ``None`` is returned when none match.
    """
    def _squash(label):
        return re.sub(r"\s+", "", label).lower()

    lookup = {}
    for column in frame.columns:
        # Columns that collapse to the same key overwrite earlier ones.
        lookup[_squash(column)] = column

    hits = (lookup[_squash(cand)] for cand in candidates if _squash(cand) in lookup)
    return next(hits, None)

def to_num(series):
    """Coerce a column of numbers and/or number-like text to a numeric Series.

    Numeric (non-boolean) columns are handed to ``pd.to_numeric`` unchanged.
    Everything else is cleaned as text: ``%`` signs, the word "round", a
    leading "r", ordinal suffixes (st/nd/rd/th), thousands separators and any
    remaining character that is not a digit, ``.`` or ``-`` are removed.
    Values that still cannot be parsed become NaN.
    """
    # Numeric data must not go through str(): a float rendered as "1e-05"
    # would lose its "e" in the character filter below and come back as NaN.
    if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
        return pd.to_numeric(series, errors='coerce')

    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def rmse(y_true, y_pred):
    """Root-mean-squared error of ``y_pred`` against ``y_true`` as a plain float."""
    from sklearn.metrics import mean_squared_error

    mse = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse)
    return float(root)

# ---------------- Load ----------------
df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]

y_col    = find_col(df, TARGET_CANDS)
name_col = find_col(df, NAME_CANDS)
if not y_col:
    raise ValueError(f"Target column not found among {TARGET_CANDS}. Available: {list(df.columns)}")

# map the 5 winning columns (all of them are required here)
mapped = {}
for feat in WIN_FEATURES:
    col = find_col(df, ALIASES.get(feat, [feat]))
    if col is None:
        raise ValueError(f"Could not find required feature '{feat}' in the CSV headers.")
    mapped[feat] = col

X_raw = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y_all = to_num(df[y_col])
if name_col is None:
    # No recognizable name column: label rows by index instead of failing on df["Player"].
    names = pd.Series([f"row_{i}" for i in df.index], index=df.index)
else:
    # astype(str) alone turns NaN into the text "nan", so blank missing names explicitly.
    names = df[name_col].astype(str).where(df[name_col].notna(), "")

# keep only rows with target
mask = y_all.notna()
X_raw, y_all, names = X_raw.loc[mask].reset_index(drop=True), y_all.loc[mask].reset_index(drop=True), names.loc[mask].reset_index(drop=True)

# Invert 'lower is better' features (earlier round/younger age should increase grade)
for c in ["Draft Capital", "Draft Age"]:
    if c in X_raw.columns:
        X_raw[c] = -X_raw[c]

# ---------------- Split, Impute, Tune, Predict ----------------
X_train_df, X_test_df, y_train, y_test, n_train, n_test = train_test_split(
    X_raw, y_all, names, test_size=TEST_SIZE, random_state=SEED
)

# Medians are learned on the training rows only and reused for the test rows.
imp = SimpleImputer(strategy="median")
Xtr = imp.fit_transform(X_train_df)
Xte = imp.transform(X_test_df)

hgb = HistGradientBoostingRegressor(random_state=SEED)
# never ask for more iterations than the grid has combinations
n_iter = min(20, int(np.prod([len(v) for v in HGB_PARAM.values()])))
search = RandomizedSearchCV(
    hgb, HGB_PARAM, n_iter=n_iter, scoring="r2",
    cv=KFold(n_splits=CV_FOLDS, shuffle=True, random_state=SEED),
    random_state=SEED, n_jobs=-1
)
search.fit(Xtr, y_train)
best_hgb = search.best_estimator_

y_pred = np.clip(best_hgb.predict(Xte), CLIP_MIN, CLIP_MAX)

# ---------------- Report & Save ----------------
r2   = r2_score(y_test, y_pred)
mae  = mean_absolute_error(y_test, y_pred)
rse  = rmse(y_test, y_pred)

OUT_DIR.mkdir(parents=True, exist_ok=True)
pred_csv = OUT_DIR / "subset28_seed42_hgb_predictions.csv"

# One row per test player, highest actual grade first.
out = pd.DataFrame({
    "Player": n_test.values,
    "Actual_RB_Grade": y_test.values,
    "Predicted_RB_Grade": y_pred,
    "Error": y_pred - y_test.values
}).sort_values("Actual_RB_Grade", ascending=False)
out.to_csv(pred_csv, index=False)

print("\n=== Subset 28 replica (HGB, 5 features) — TEST ===")
print(f"Features used: {WIN_FEATURES}")
print(f"Best HGB params: {search.best_params_}")
print(f"R²: {r2:.4f} | MAE: {mae:.4f} | RMSE: {rse:.4f} | MaxPred: {out['Predicted_RB_Grade'].max():.3f}")

print("\nTop 20 by Actual (TEST):")
print(out.head(20).round(3).to_string(index=False))

print(f"\nSaved predictions → {pred_csv}")


In [None]:
# ===============================================================
# Subset 28 full-dataset predictions + PDF
# Features: ELU, Draft Capital, Draft Age, DOM++, BMI
# Outputs:
#   data/Bakery/_derived/subset28_seed42_hgb_full_predictions.csv
#   data/Bakery/_derived/subset28_seed42_hgb_full_report.pdf
# ===============================================================
import re, json, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import HistGradientBoostingRegressor

# Use matplotlib for a simple PDF (no seaborn, no custom colors)
import matplotlib.pyplot as plt

# ---------------- Config ----------------
# Root that every path below is resolved against; adjust if you run from elsewhere.
PROJECT_ROOT = Path(".")

# The CSV is looked for under the root first, then one directory up.
CANDIDATE_CSV_PATHS = [
    PROJECT_ROOT / folder / "Bakery_RB_Overall.csv"
    for folder in ("data/Bakery/RB", "../data/Bakery/RB")
]
OUT_DIR = PROJECT_ROOT / "data/Bakery/_derived"

SEED = 42
CV_FOLDS = 5

# Predictions are clipped to this range.
CLIP_MIN = 0.0
CLIP_MAX = 15.0

WIN_FEATURES = [
    "ELU",
    "Draft Capital",
    "Draft Age",
    "DOM++",
    "BMI",
]

# Accepted header spellings per feature, tried in order.
ALIASES = {
    "DOM++": ["DOM++", "DOMpp", "DOM_plus_plus", "DOMpp_Weighted", "DOM"],
    "BMI": ["BMI"],
    "ELU": ["ELU", "Elusiveness", "Elusiveness Rating"],
    "Draft Capital": ["Draft Capital", "Draft Cap", "Draft Round", "Round", "Rnd"],
    "Draft Age": [
        "Draft Age", "Age at Draft", "DraftAge",
        "Age (Draft)", "AgeDraft", "Age_at_Draft",
    ],
}
TARGET_CANDS = ["RB Grade", "RBGrade", "RB_Grade"]
NAME_CANDS = ["Player", "Player Name", "Name"]

# HistGradientBoosting search space.
HGB_PARAM = {
    "learning_rate": [0.03, 0.05, 0.08, 0.1],
    "max_depth": [3, 6, 9],
    "l2_regularization": [0.0, 0.1, 0.3, 0.5],
    "max_bins": [128, 255],
}

# ---------------- Helpers ----------------
def find_existing_path(paths):
    """Return the first entry of ``paths`` that exists on disk, as a ``Path``.

    Entries are checked in order; ``None`` is returned when none exist.
    """
    existing = (Path(candidate) for candidate in paths if Path(candidate).exists())
    return next(existing, None)

def find_col(frame, candidates):
    """Return the first column of ``frame`` that matches one of ``candidates``.

    Names are compared after removing all whitespace and lower-casing.
    Candidates are tried in order; ``None`` is returned when none match.
    """
    def _squash(label):
        return re.sub(r"\s+", "", label).lower()

    lookup = {}
    for column in frame.columns:
        # Columns that collapse to the same key overwrite earlier ones.
        lookup[_squash(column)] = column

    hits = (lookup[_squash(cand)] for cand in candidates if _squash(cand) in lookup)
    return next(hits, None)

def to_num(series):
    """Coerce a column of numbers and/or number-like text to a numeric Series.

    Numeric (non-boolean) columns are handed to ``pd.to_numeric`` unchanged.
    Everything else is cleaned as text: ``%`` signs, the word "round", a
    leading "r", ordinal suffixes (st/nd/rd/th), thousands separators and any
    remaining character that is not a digit, ``.`` or ``-`` are removed.
    Values that still cannot be parsed become NaN.
    """
    # Numeric data must not go through str(): a float rendered as "1e-05"
    # would lose its "e" in the character filter below and come back as NaN.
    if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
        return pd.to_numeric(series, errors='coerce')

    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def rmse(y_true, y_pred):
    """Root-mean-squared error of ``y_pred`` against ``y_true`` as a plain float."""
    from sklearn.metrics import mean_squared_error

    mse = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse)
    return float(root)

# ---------------- Load ----------------
CSV_PATH = find_existing_path(CANDIDATE_CSV_PATHS)
if CSV_PATH is None:
    raise FileNotFoundError(f"Could not find Bakery_RB_Overall.csv in any of: {CANDIDATE_CSV_PATHS}")

df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]

y_col    = find_col(df, TARGET_CANDS)
name_col = find_col(df, NAME_CANDS)
if not y_col:
    raise ValueError(f"Target column not found among {TARGET_CANDS}. Available: {list(df.columns)}")

# map the 5 subset-28 columns (all of them are required here)
mapped = {}
for feat in WIN_FEATURES:
    col = find_col(df, ALIASES.get(feat, [feat]))
    if col is None:
        raise ValueError(f"Could not find required feature '{feat}' in the CSV headers.")
    mapped[feat] = col

X_raw = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y_all = to_num(df[y_col])
if name_col is None:
    # No recognizable name column: label rows by index instead of failing on df["Player"].
    names = pd.Series([f"row_{i}" for i in df.index], index=df.index)
else:
    # astype(str) alone turns NaN into the text "nan", so blank missing names explicitly.
    names = df[name_col].astype(str).where(df[name_col].notna(), "")

# keep only rows with target
mask = y_all.notna()
X_raw, y_all, names = X_raw.loc[mask].reset_index(drop=True), y_all.loc[mask].reset_index(drop=True), names.loc[mask].reset_index(drop=True)

# Invert 'lower is better' (earlier round / younger age -> higher grade)
for c in ["Draft Capital", "Draft Age"]:
    if c in X_raw.columns:
        X_raw[c] = -X_raw[c]

# ---------------- Impute, tune (CV), fit on ALL rows ----------------
imp = SimpleImputer(strategy="median")
X_all = imp.fit_transform(X_raw)

hgb = HistGradientBoostingRegressor(random_state=SEED)
# never ask for more iterations than the grid has combinations
n_iter = min(20, int(np.prod([len(v) for v in HGB_PARAM.values()])))
cv = KFold(n_splits=CV_FOLDS, shuffle=True, random_state=SEED)

search = RandomizedSearchCV(
    hgb, HGB_PARAM, n_iter=n_iter, scoring="r2",
    cv=cv, random_state=SEED, n_jobs=-1
)
search.fit(X_all, y_all)
best_hgb = search.best_estimator_

y_pred = np.clip(best_hgb.predict(X_all), CLIP_MIN, CLIP_MAX)

# ---------------- Report & Save ----------------
# R2 / MAE / RMSE are computed on the same rows the model was fitted on, so
# they are in-sample figures. CV_R2 is the cross-validated R² of the selected
# parameters and is reported next to them as the out-of-sample estimate.
R2   = r2_score(y_all, y_pred)
MAE  = mean_absolute_error(y_all, y_pred)
RMSE = rmse(y_all, y_pred)
CV_R2 = float(search.best_score_)

OUT_DIR.mkdir(parents=True, exist_ok=True)
pred_csv = OUT_DIR / "subset28_seed42_hgb_full_predictions.csv"

# One row per player, highest actual grade first.
out_df = pd.DataFrame({
    "Player": names.values,
    "Actual_RB_Grade": y_all.values,
    "Predicted_RB_Grade": y_pred,
    "Error": y_pred - y_all.values
}).sort_values("Actual_RB_Grade", ascending=False)
out_df.to_csv(pred_csv, index=False)

# ----- Build PDF (matplotlib table) -----
report_pdf = OUT_DIR / "subset28_seed42_hgb_full_report.pdf"

top_k = 25
tbl = out_df.head(top_k).copy().round(3)
tbl.insert(0, "#", range(1, len(tbl) + 1))

title_lines = [
    "Bakery RB — Subset 28 Full-Dataset Report",
    f"Model: HistGradientBoosting (seed={SEED})",
    f"Features: {', '.join(WIN_FEATURES)}",
    f"Best Params: {search.best_params_}",
    f"Rows: {len(out_df)} | in-sample R²={R2:.4f} | MAE={MAE:.3f} | RMSE={RMSE:.3f} | CV R²={CV_R2:.4f}",
    f"CSV: {pred_csv}",
]

plt.figure(figsize=(10.5, 13.5))
plt.axis('off')
# Header lines are stacked downwards from the top-left corner.
ypos = 0.98
for line in title_lines:
    plt.text(0.02, ypos, line, fontsize=11, ha='left', va='top')
    ypos -= 0.035

col_labels = list(tbl.columns)
cell_text  = tbl.values.tolist()
the_table = plt.table(cellText=cell_text, colLabels=col_labels, cellLoc='left', loc='center')
the_table.auto_set_font_size(False)
the_table.set_fontsize(9)
the_table.scale(1, 1.2)

plt.tight_layout()
plt.savefig(report_pdf, format="pdf")
plt.close()

print("\n=== Subset 28 (HGB, 5 features) — FULL DATASET ===")
print(f"Best HGB params: {search.best_params_}")
print(f"In-sample R²: {R2:.4f} | MAE: {MAE:.4f} | RMSE: {RMSE:.4f}")
print(f"Cross-validated R² ({CV_FOLDS}-fold, best params): {CV_R2:.4f}")
print(f"\nSaved predictions CSV → {pred_csv}")
print(f"Saved PDF report      → {report_pdf}")


In [None]:
# ============================================================
# RB Grade — Non-negative linear "line of best fit" (NNLS)
#  - 80/20 train/test by seeds
#  - Interaction-only polynomial features (degree=2)
#  - Impute (median) + Standardize
#  - Non-negative least squares (weights >= 0)
#  - Intercept = y_train.mean()
#  - Predictions clipped to [0, 15]
#  - Saves predictions and coefficient tables per seed
# ============================================================
import re, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from scipy.optimize import nnls

# ---------------- Config ----------------
CSV_PATH     = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
OUT_DIR      = Path("./data/Bakery/_derived")
TEST_SIZE    = 0.20
SEEDS        = [3, 7, 11, 19, 23, 29, 31, 37, 41, 42, 1337]
CLIP_RANGE   = (0.0, 15.0)   # (min, max) applied to predictions

# polynomial features
POLY_DEGREE           = 2
POLY_INTERACTION_ONLY = True  # only pairwise interactions, no squares

# columns (Draft Age included; still no Breakout Age)
# Every entry that resolves to a CSV header becomes a model feature, in this order.
ALIASES = {
    "DOM++":         ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":       ["40 Time","Forty","40"],
    "BMI":           ["BMI"],
    "YPC":           ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":           ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":         ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":        ["Break%","Break %","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital": ["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":          ["Bama","Bama Rating","BamaAdj","BAMA"],
    "Shuttle":       ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":    ["3 Cone","Three Cone","3-Cone"],
    # The canonical spelling was the only one missing from its own list, so a
    # column headed "Rec Yards" was silently left out of the model. It is
    # appended last so the precedence of the existing spellings is unchanged.
    "Rec Yards":     ["Receiving Yards","Rec Yds","RecYds","Rec Yards"],
    "Draft Age":     ["Draft Age","Age at Draft","DraftAge","Age (Draft)","AgeDraft","Age_at_Draft"],
}
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]
NAME_CANDS   = ["Player","Player Name","Name"]

# -------------- helpers --------------
def find_col(frame, candidates):
    """Return the first column of ``frame`` that matches one of ``candidates``.

    Names are compared after removing all whitespace and lower-casing.
    Candidates are tried in order; ``None`` is returned when none match.
    """
    def _squash(label):
        return re.sub(r"\s+", "", label).lower()

    lookup = {}
    for column in frame.columns:
        # Columns that collapse to the same key overwrite earlier ones.
        lookup[_squash(column)] = column

    hits = (lookup[_squash(cand)] for cand in candidates if _squash(cand) in lookup)
    return next(hits, None)

def to_num(series):
    """Coerce a column of numbers and/or number-like text to a numeric Series.

    Numeric (non-boolean) columns are handed to ``pd.to_numeric`` unchanged.
    Everything else is cleaned as text: ``%`` signs, the word "round", a
    leading "r", ordinal suffixes (st/nd/rd/th), thousands separators and any
    remaining character that is not a digit, ``.`` or ``-`` are removed.
    Values that still cannot be parsed become NaN.
    """
    # Numeric data must not go through str(): a float rendered as "1e-05"
    # would lose its "e" in the character filter below and come back as NaN.
    if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
        return pd.to_numeric(series, errors='coerce')

    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def rmse(y_true, y_pred) -> float:
    """Root-mean-squared error of ``y_pred`` against ``y_true`` as a plain float."""
    mse = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse)
    return float(root)

# -------------- load --------------
df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]

y_col    = find_col(df, TARGET_CANDS)
name_col = find_col(df, NAME_CANDS)
if not y_col:
    raise ValueError(f"Target column not found. Available: {list(df.columns)}")

# Every ALIASES entry that resolves to a header becomes a feature; the rest are dropped.
mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
mapped = {k:v for k,v in mapped.items() if v is not None}
if not mapped:
    raise ValueError("No usable feature columns found from ALIASES.")

X_all = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y_all = to_num(df[y_col])
if name_col is None:
    # No recognizable name column: label rows by index instead of failing on df["Player"].
    names_all = pd.Series([f"row_{i}" for i in df.index], index=df.index)
else:
    # astype(str) alone turns NaN into the text "nan", so blank missing names explicitly.
    names_all = df[name_col].astype(str).where(df[name_col].notna(), "")

# Keep only rows that have a usable target value.
mask = y_all.notna()
X_all, y_all, names_all = X_all.loc[mask].reset_index(drop=True), y_all.loc[mask].reset_index(drop=True), names_all.loc[mask].reset_index(drop=True)

OUT_DIR.mkdir(parents=True, exist_ok=True)
base_feature_names = list(X_all.columns)

# -------------- runner --------------
def run_one(seed: int):
    """Fit and evaluate the non-negative least-squares model for one split.

    Pipeline: train/test split with ``seed`` -> median imputation ->
    polynomial expansion (POLY_DEGREE, POLY_INTERACTION_ONLY) -> standardization
    -> NNLS on the mean-centred training target. Predictions are the training
    mean plus the weighted features, clipped to CLIP_RANGE.

    Writes the test predictions and the coefficient table to OUT_DIR and
    prints a short report.

    NOTE(review): unlike the tree-search cells, "lower is better" columns
    (e.g. 40 Time, Draft Capital) are not negated here. With weights forced
    to be >= 0, a feature that relates negatively to the grade can only get a
    zero weight. Confirm this is intended.

    Returns:
        dict with keys ``seed``, ``R2``, ``MAE``, ``RMSE``, ``pred_csv``,
        ``coef_csv`` and ``coefs`` (the weight Series, largest first).
    """
    pred_path = OUT_DIR / f"rb_nnls_test_predictions_seed{seed}.csv"
    coef_path = OUT_DIR / f"rb_nnls_coeffs_seed{seed}.csv"

    X_train, X_test, y_train, y_test, n_train, n_test = train_test_split(
        X_all, y_all, names_all, test_size=TEST_SIZE, random_state=seed
    )

    # impute
    imputer = SimpleImputer(strategy="median")
    Xtr_base = imputer.fit_transform(X_train)
    Xte_base = imputer.transform(X_test)
    # SimpleImputer discards columns that are entirely missing in the training
    # split (their statistics_ entry is NaN). Name only the surviving columns,
    # otherwise get_feature_names_out below fails on the length mismatch.
    kept_names = [c for c, stat in zip(X_train.columns, imputer.statistics_) if not np.isnan(stat)]

    # polynomial interactions
    poly = PolynomialFeatures(
        degree=POLY_DEGREE,
        interaction_only=POLY_INTERACTION_ONLY,
        include_bias=False
    )
    Xtr_poly = poly.fit_transform(Xtr_base)
    Xte_poly = poly.transform(Xte_base)
    feature_names = poly.get_feature_names_out(kept_names)

    # standardize
    scaler = StandardScaler()
    Xtr = scaler.fit_transform(Xtr_poly)
    Xte = scaler.transform(Xte_poly)

    # ---------- Non-negative least squares ----------
    # center y so the intercept = y_mean (keeps weights non-negative)
    y_mean = float(y_train.mean())
    y_center = y_train - y_mean

    # NNLS solve: min ||X w - y_center||  s.t. w >= 0
    w, _ = nnls(Xtr, y_center.to_numpy())

    # predictions: add back mean, clip to [0, 15]
    y_pred = y_mean + Xte @ w
    y_pred = np.clip(y_pred, CLIP_RANGE[0], CLIP_RANGE[1])

    # metrics
    r2   = r2_score(y_test, y_pred)
    mae  = mean_absolute_error(y_test, y_pred)
    rmse_val = rmse(y_test, y_pred)

    # outputs
    pred_df = pd.DataFrame({
        "Player": n_test.values,
        "Actual_RB_Grade": y_test.values,
        "Predicted_RB_Grade": y_pred,
        "Error": y_pred - y_test.values
    }).sort_values("Actual_RB_Grade", ascending=False)
    pred_df.to_csv(pred_path, index=False)

    coef = pd.Series(w, index=feature_names, name="weight").sort_values(ascending=False)
    # all weights must be >= 0 by construction
    assert (coef >= -1e-12).all(), "Found a negative weight — should not happen with NNLS."
    coef.to_csv(coef_path, header=True)

    print(f"\n=== Seed {seed} — TEST (NNLS, non-negative weights) ===")
    print(f"R²: {r2:.4f} | MAE: {mae:.4f} | RMSE: {rmse_val:.4f} | MaxPred: {pred_df['Predicted_RB_Grade'].max():.3f}")
    print("Top 15 (by Actual):")
    print(pred_df.head(15).round(3).to_string(index=False))
    print(f"Saved predictions → {pred_path}")
    print(f"Saved coefficients → {coef_path}")

    return {"seed": seed, "R2": r2, "MAE": mae, "RMSE": rmse_val, "pred_csv": str(pred_path), "coef_csv": str(coef_path), "coefs": coef}

# -------------- run all seeds --------------
# One NNLS fit per configured seed.
results = []
for seed_value in SEEDS:
    results.append(run_one(seed_value))

# Scalar metrics only; the coefficient Series is handled separately below.
summary_rows = [
    {key: val for key, val in res.items() if key != 'coefs'}
    for res in results
]
summary = pd.DataFrame(summary_rows).sort_values("R2", ascending=False)

print("\n=== Summary across seeds (NNLS, non-negative weights) ===")
print(summary.to_string(index=False, float_format="{:.4f}".format))

# Aggregate coefficient stability across seeds:
# one column of weights per seed, rows aligned on feature name.
first = results[0]
coef_df = first["coefs"].to_frame(name=f"seed_{first['seed']}")
for res in results[1:]:
    seed_column = res["coefs"].rename(f"seed_{res['seed']}")
    coef_df = coef_df.join(seed_column, how="outer")

coef_mean = coef_df.mean(axis=1).fillna(0.0)
coef_std  = coef_df.std(axis=1).fillna(0.0)

coef_agg = pd.DataFrame({"weight_mean": coef_mean, "weight_std": coef_std})
by_mean_desc = coef_agg["weight_mean"].sort_values(ascending=False).index
coef_agg = coef_agg.reindex(by_mean_desc)

agg_path = OUT_DIR / "rb_nnls_coeffs_aggregate.csv"
coef_agg.to_csv(agg_path)

print(f"\nSaved aggregate coefficient table → {agg_path}")
print("Top 20 features by mean weight (non-negative):")
print(coef_agg.head(20).round(5))


In [None]:
#### ===============================================================
# RB Grade — Wide search over feature subsets & hyperparameters
# - Randomly sample feature combos (+ limited interactions)
# - Tune multiple models (GB, RF, ET, HGB) with RandomizedSearchCV
# - 80/20 train-test; CV on TRAIN only; metrics on TEST
# - Predictions clipped to [0, 15]
# - Logs leaderboard + meta for reproducibility
# ===============================================================
import re, json, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import (
    GradientBoostingRegressor, RandomForestRegressor,
    ExtraTreesRegressor, HistGradientBoostingRegressor
)

# ---------------- Search controls ----------------
CSV_PATH            = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
OUT_DIR             = Path("./data/Bakery/_derived")
TEST_SIZE           = 0.20

SEEDS               = [456, 123, 789] # run search for multiple seeds
N_SUBSETS           = 60              # random feature subsets per seed
MAX_BASE_FEATS      = 8               # cap base features per subset
MAX_INTERACTIONS    = 3               # cap interactions per subset
N_ITER_PER_MODEL    = 25              # RandomizedSearchCV iterations per model
CV_FOLDS            = 5

CLIP_MIN, CLIP_MAX  = 0.0, 15.0       # clip predictions

# ---------------- Feature space ----------------
# Features the search may draw from (only those found in the CSV are used).
BASE_FEATURES = [
    "DOM++","40 Time","BMI","YPC","ELU","YCO/A","Break%","Draft Capital","Bama","Draft Age"
]

# Interaction name -> the two base features whose product it is.
INTERACTIONS = {
    "DOMxDraft": ("DOM++", "Draft Capital"),
    "YPCxELU":   ("YPC",   "ELU"),
    "ELUxYCOA":  ("ELU",   "YCO/A"),
}

# Canonical feature name -> header spellings accepted in the CSV.
# find_col tries the spellings in list order, ignoring case and whitespace.
ALIASES = {
    "DOM++":         ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":       ["40 Time","Forty","40"],
    "BMI":           ["BMI"],
    "YPC":           ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":           ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":         ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":        ["Break%","Break %","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital": ["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":          ["Bama","Bama Rating","BamaAdj","BAMA"],
    "Shuttle":       ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":    ["3 Cone","Three Cone","3-Cone"],
    # The canonical spelling was the only one missing from its own list; it is
    # appended last so the precedence of the existing spellings is unchanged.
    "Rec Yards":     ["Receiving Yards","Rec Yds","RecYds","Rec Yards"],
    "Draft Age":     ["Draft Age","Age at Draft","DraftAge","Age (Draft)","AgeDraft","Age_at_Draft"],
}
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]
NAME_CANDS   = ["Player","Player Name","Name"]

# ---------------- Utilities ----------------
def find_col(frame, candidates):
    """Return the first column of ``frame`` that matches one of ``candidates``.

    Names are compared after removing all whitespace and lower-casing.
    Candidates are tried in order; ``None`` is returned when none match.
    """
    def _squash(label):
        return re.sub(r"\s+", "", label).lower()

    lookup = {}
    for column in frame.columns:
        # Columns that collapse to the same key overwrite earlier ones.
        lookup[_squash(column)] = column

    hits = (lookup[_squash(cand)] for cand in candidates if _squash(cand) in lookup)
    return next(hits, None)

def to_num(series):
    """Coerce a column of numbers and/or number-like text to a numeric Series.

    Numeric (non-boolean) columns are handed to ``pd.to_numeric`` unchanged.
    Everything else is cleaned as text: ``%`` signs, the word "round", a
    leading "r", ordinal suffixes (st/nd/rd/th), thousands separators and any
    remaining character that is not a digit, ``.`` or ``-`` are removed.
    Values that still cannot be parsed become NaN.
    """
    # Numeric data must not go through str(): a float rendered as "1e-05"
    # would lose its "e" in the character filter below and come back as NaN.
    if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
        return pd.to_numeric(series, errors='coerce')

    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

def rmse(y_true, y_pred):
    """Root-mean-squared error of ``y_pred`` against ``y_true`` as a plain float."""
    mse = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse)
    return float(root)

def invert_cols(X):
    """Return a copy of ``X`` with the "lower is better" columns negated.

    Only the columns listed below that are present in ``X`` are flipped.
    The input frame is left untouched, so calling this twice on the same
    frame cannot silently undo the inversion.
    """
    lower_is_better = ("40 Time", "Draft Capital", "Shuttle", "Three Cone", "Draft Age")
    flipped = X.copy()
    for c in lower_is_better:
        if c in flipped.columns:
            flipped[c] = -flipped[c]
    return flipped

def add_interactions(X, inter_names):
    """Return a copy of ``X`` extended with the requested product features.

    Each name is looked up in the module-level INTERACTIONS table; the product
    column is added only when both of its operand columns are present.
    """
    augmented = X.copy()
    for label in inter_names:
        left, right = INTERACTIONS[label]
        if left not in augmented.columns or right not in augmented.columns:
            continue
        augmented[label] = augmented[left] * augmented[right]
    return augmented

# ---------------- Load & prep ----------------
df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]

y_col    = find_col(df, TARGET_CANDS)
name_col = find_col(df, NAME_CANDS)
if not y_col:
    raise ValueError(f"Could not find target column among {TARGET_CANDS}")

# Resolve each base feature to the header actually present in the CSV;
# features without a matching header are left out of the search.
mapped = {}
for feat in BASE_FEATURES:
    col = find_col(df, ALIASES.get(feat, [feat]))
    if col is not None:
        mapped[feat] = col
if not mapped:
    raise ValueError("No usable feature columns found among BASE_FEATURES.")

# Only interactions whose two operands were both found can be sampled.
ALLOWED_INTERS = {k:v for k,v in INTERACTIONS.items() if v[0] in mapped and v[1] in mapped}

X_all_raw = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y_all     = to_num(df[y_col])
if name_col is None:
    # No recognizable name column: label rows by index instead of failing on df["Player"].
    names_all = pd.Series([f"row_{i}" for i in df.index], index=df.index)
else:
    # astype(str) alone turns NaN into the text "nan", so blank missing names explicitly.
    names_all = df[name_col].astype(str).where(df[name_col].notna(), "")

# Keep only rows that have a usable target value.
mask = y_all.notna()
X_all_raw, y_all, names_all = X_all_raw.loc[mask].reset_index(drop=True), y_all.loc[mask].reset_index(drop=True), names_all.loc[mask].reset_index(drop=True)

OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------------- Model spaces ----------------
def model_spaces(random_state):
    """Return the candidate ``(tag, estimator, param_grid)`` triples for the search.

    Every estimator is seeded with ``random_state``.  The list order is
    GB, RF, ET, HGB.
    """
    def forest_grid():
        # RF and ET are tuned over the same values; hand each its own fresh dict.
        return {
            "n_estimators": [600, 900, 1200, 1500],
            "max_depth": [None, 12, 16, 20],
            "min_samples_split": [2, 4, 6],
            "min_samples_leaf": [1, 2, 3],
            "max_features": ["sqrt", 0.7, 0.9, 1.0],
        }

    gb_grid = {
        "n_estimators": [400, 600, 800, 1000, 1200],
        "learning_rate": [0.03, 0.05, 0.07, 0.1],
        "max_depth": [2, 3, 4],
        "subsample": [0.8, 0.9, 1.0],
        "min_samples_leaf": [1, 2, 3],
    }
    hgb_grid = {
        "learning_rate": [0.03, 0.05, 0.08, 0.1],
        "max_depth": [3, 6, 9],
        "l2_regularization": [0.0, 0.1, 0.3, 0.5],
        "max_bins": [128, 255],
    }

    spaces = []
    spaces.append(("GB", GradientBoostingRegressor(random_state=random_state), gb_grid))
    spaces.append(("RF", RandomForestRegressor(random_state=random_state, n_jobs=-1), forest_grid()))
    spaces.append(("ET", ExtraTreesRegressor(random_state=random_state, n_jobs=-1), forest_grid()))
    spaces.append(("HGB", HistGradientBoostingRegressor(random_state=random_state), hgb_grid))
    return spaces

# ---------------- Random subset generator ----------------
def sample_subset(rng, base_pool, max_bases, allowed_inters, max_inters):
    """Draw a random feature subset: base features plus optional interactions.

    Parameters
    ----------
    rng : numpy.random.Generator
        Source of randomness; it is advanced by every draw made here.
    base_pool : sequence of str
        Base feature names to choose from.
    max_bases : int
        Upper bound on how many base features are drawn.
    allowed_inters : dict
        Interaction name -> (parent_a, parent_b).
    max_inters : int
        Upper bound on how many interactions are drawn.

    Returns
    -------
    (bases, inter_names) : tuple of two lists of str
        ``inter_names`` only ever contains interactions whose two parents are
        both in ``bases``.
    """
    most = min(max_bases, len(base_pool))
    # Prefer at least three bases, but never let the lower bound exceed the upper
    # bound: with max_bases < 3 the old bound (min(3, len(pool))) made
    # rng.integers raise.  Inputs that worked before draw exactly as before.
    fewest = min(3, most)
    n_bases = rng.integers(low=fewest, high=most + 1)
    bases = rng.choice(base_pool, size=int(n_bases), replace=False).tolist()

    inter_names = []
    if allowed_inters and max_inters > 0:
        eligible = [name for name,(a,b) in allowed_inters.items() if a in bases and b in bases]
        if eligible:
            # k may be 0, so a subset can carry no interactions even when some are eligible
            k = rng.integers(low=0, high=min(max_inters, len(eligible)) + 1)
            if k > 0:
                inter_names = rng.choice(eligible, size=int(k), replace=False).tolist()
    return bases, inter_names

# ---------------- One full search (per seed) ----------------
def _best_single_feature(X_train, y_train, seed):
    """Return the column of ``X_train`` with the best mean CV R² under a default GB model.

    Each column is median-imputed on its own and scored with shuffled K-fold CV
    (``CV_FOLDS`` splits, seeded with ``seed``).
    """
    best_feat, best_cv = None, -1e9
    for f in X_train.columns:
        X_single = SimpleImputer(strategy="median").fit_transform(X_train[[f]])
        kf = KFold(n_splits=CV_FOLDS, shuffle=True, random_state=seed)
        cv = cross_val_score(
            GradientBoostingRegressor(random_state=seed), X_single, y_train, scoring="r2", cv=kf
        ).mean()
        if cv > best_cv:
            best_cv, best_feat = cv, f
    return best_feat


def run_seed(seed: int):
    """Run one full wide search for ``seed`` and persist its results.

    Subset 0 is the best single-feature baseline; every later subset is drawn by
    ``sample_subset``.  For each subset all model families from ``model_spaces``
    are tuned with ``RandomizedSearchCV`` and the family with the best CV R² is
    scored on the held-out split.

    Side effects: writes the top-15 leaderboard CSV, the winning subset's test
    predictions CSV and a meta JSON into ``OUT_DIR``, and prints the board.

    Returns
    -------
    dict
        The meta record that was written to the JSON file.

    Raises
    ------
    ValueError
        If ``N_SUBSETS`` is smaller than 1 (there would be nothing to rank).
    """
    if N_SUBSETS < 1:
        raise ValueError(f"N_SUBSETS must be at least 1, got {N_SUBSETS}")

    pred_path  = OUT_DIR / f"rb_wide_best_preds_seed{seed}.csv"
    meta_path  = OUT_DIR / f"rb_wide_best_meta_seed{seed}.json"
    board_path = OUT_DIR / f"rb_wide_leaderboard_seed{seed}.csv"

    rng = np.random.default_rng(seed)

    X_tr_raw, X_te_raw, y_tr, y_te, n_tr, n_te = train_test_split(
        X_all_raw, y_all, names_all, test_size=TEST_SIZE, random_state=seed
    )

    leaderboard = []
    test_preds = {}  # subset_idx -> clipped test predictions, so the winner's can be saved

    for subset_idx in range(N_SUBSETS):
        if subset_idx == 0:
            # best single feature baseline
            bases, inters = [_best_single_feature(X_tr_raw, y_tr, seed)], []
        else:
            bases, inters = sample_subset(rng, list(X_tr_raw.columns), MAX_BASE_FEATS, ALLOWED_INTERS, MAX_INTERACTIONS)

        # build matrices: interactions come from the RAW parents, inversion happens afterwards
        Xtr_df = invert_cols(add_interactions(X_tr_raw[bases], inters))
        Xte_df = invert_cols(add_interactions(X_te_raw[bases], inters))

        imp = SimpleImputer(strategy="median")
        Xtr, Xte = imp.fit_transform(Xtr_df), imp.transform(Xte_df)

        # tune models
        best_cv, best_tag, best_est = -1e9, None, None
        for tag, est, grid in model_spaces(seed):
            n_iter = min(N_ITER_PER_MODEL, int(np.prod([len(v) for v in grid.values()])))
            search = RandomizedSearchCV(est, grid, n_iter=n_iter, scoring="r2", cv=CV_FOLDS, random_state=seed, n_jobs=-1)
            search.fit(Xtr, y_tr)
            if search.best_score_ > best_cv:
                best_cv, best_tag, best_est = search.best_score_, tag, search.best_estimator_

        # RandomizedSearchCV (refit=True by default) has already refit best_estimator_
        # on all of (Xtr, y_tr), so no further fit is needed before predicting.
        y_pred = np.clip(best_est.predict(Xte), CLIP_MIN, CLIP_MAX)
        test_preds[subset_idx] = y_pred

        leaderboard.append({
            "seed": seed, "subset_idx": subset_idx, "model": best_tag,
            "cvR2_mean": best_cv, "TEST_R2": r2_score(y_te, y_pred),
            "TEST_MAE": mean_absolute_error(y_te, y_pred),
            "TEST_RMSE": rmse(y_te, y_pred),
            "n_features": len(bases) + len(inters),
            "bases": "|".join(bases), "interactions": "|".join(inters),
            "max_pred": float(np.max(y_pred)),
        })

    # leaderboard
    # NOTE(review): ranking by TEST_R2 picks the winner on the held-out split, so the
    # reported best test score is optimistically biased; cvR2_mean is the unbiased guide.
    board = pd.DataFrame(leaderboard).sort_values(["TEST_R2","cvR2_mean"], ascending=False).head(15)
    board.to_csv(board_path, index=False)

    best_row = board.iloc[0]
    print(f"\n=== Seed {seed} — top of board ===")
    print(board[["subset_idx","model","n_features","cvR2_mean","TEST_R2","TEST_MAE","TEST_RMSE","bases","interactions"]]
          .to_string(index=False, float_format=lambda x: f"{x:.4f}"))

    # save the winning subset's test predictions (the meta below points at this file)
    best_pred = test_preds[int(best_row["subset_idx"])]
    pd.DataFrame({
        "Player": n_te.values,
        "Actual_RB_Grade": y_te.values,
        "Predicted_RB_Grade": best_pred,
        "Error": best_pred - y_te.values,
    }).to_csv(pred_path, index=False)

    # save meta
    meta = {
        "seed": seed, "leaderboard_csv": str(board_path), "best_predictions_csv": str(pred_path),
        "best_bases": best_row["bases"].split("|"), "best_interactions": best_row["interactions"].split("|") if best_row["interactions"] else [],
        "best_model_tag": best_row["model"], "best_cvR2": float(best_row["cvR2_mean"]),
        "best_test_R2": float(best_row["TEST_R2"]), "best_test_MAE": float(best_row["TEST_MAE"]),
        "best_test_RMSE": float(best_row["TEST_RMSE"]), "max_pred_test": float(best_row["max_pred"])
    }
    with open(meta_path, "w") as f:
        json.dump(meta, f, indent=2)

    return meta

# ---------------- Run across seeds & summarize ----------------
# One meta record per seed, in SEEDS order.
all_meta = list(map(run_seed, SEEDS))
# Best held-out R² first.
summary = pd.DataFrame(all_meta).sort_values("best_test_R2", ascending=False)
print("\n=== Overall summary across seeds ===")
print(
    summary.loc[:, [
        "seed", "best_cvR2", "best_test_R2", "best_test_MAE", "best_test_RMSE",
        "best_model_tag", "best_bases", "best_interactions",
    ]].to_string(index=False, float_format="{:.4f}".format)
)


In [None]:
# ===============================================================
# Use TOP result from wide-search leaderboard to score ALL players
# - Reads leaderboard -> gets best model, bases, interactions
# - Rebuilds those features from Bakery_RB_Overall.csv
# - Tunes the chosen model (light CV), fits on ALL rows
# - Saves Player | Actual_RB_Grade | Predicted_RB_Grade | Error
# ===============================================================
import re, json, numpy as np, pandas as pd
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    ExtraTreesRegressor,
    HistGradientBoostingRegressor
)

# ---------------- Config ----------------
PROJECT_ROOT   = Path(".")  # adjust if you run elsewhere
# Point this to your leaderboard (works with either local project path or an absolute path like /mnt/data/...):
LEADERBOARD_CSV = PROJECT_ROOT / "data/Bakery/_derived/rb_wide_leaderboard_seed789.csv"
# Example if you stored it in /mnt/data:
# LEADERBOARD_CSV = Path("/mnt/data/rb_wide_leaderboard_seed789.csv")

DATA_CSV      = PROJECT_ROOT / "data/Bakery/RB/Bakery_RB_Overall.csv"
OUT_DIR       = PROJECT_ROOT / "data/Bakery/_derived"
OUT_CSV       = OUT_DIR / "rb_top_model_full_predictions.csv"

SEED          = 789   # this can be any; only affects tuning randomness
CV_FOLDS      = 5     # K-fold splits used while tuning the chosen model
CLIP_MIN, CLIP_MAX = 0.0, 15.0   # predictions are clipped into this range before saving
N_JOBS        = -1    # passed to the RF/ET estimators and to RandomizedSearchCV

# Aliases (so we can map headers reliably)
# canonical feature name -> candidate CSV headers, tried in order; matching ignores
# case and whitespace.  Every entry lists its own canonical name as a candidate.
ALIASES = {
    "DOM++":         ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":       ["40 Time","Forty","40"],
    "BMI":           ["BMI"],
    "YPC":           ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":           ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":         ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":        ["Break%","Break %","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital": ["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":          ["Bama","Bama Rating","BamaAdj","BAMA"],
    "Shuttle":       ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":    ["3 Cone","Three Cone","3-Cone"],
    # "Rec Yards" itself was missing here, so a CSV header spelled exactly like the
    # feature name could not be resolved; appended last so earlier matches are unchanged.
    "Rec Yards":     ["Receiving Yards","Rec Yds","RecYds","Rec Yards"],
    "Draft Age":     ["Draft Age","Age at Draft","DraftAge","Age (Draft)","AgeDraft","Age_at_Draft"],
}
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]   # accepted headers for the target column
NAME_CANDS   = ["Player","Player Name","Name"]     # accepted headers for the player-name column

# Interaction definitions used in the wide search
# interaction name -> (parent feature A, parent feature B); the column is the product A*B
INTERACTIONS = {
    "DOMxDraft": ("DOM++", "Draft Capital"),
    "YPCxELU":   ("YPC",   "ELU"),
    "ELUxYCOA":  ("ELU",   "YCO/A"),
}

# Light grids to (re)fit the chosen model family
def model_space(tag, seed):
    """Return ``(estimator, param_grid)`` for the model family named by ``tag``.

    ``tag`` is one of "GB", "RF", "ET" or "HGB"; the estimator is seeded with
    ``seed``.  Any other tag raises ``ValueError``.
    """
    def forest_grid():
        # RF and ET are tuned over the same light grid.
        return {
            "n_estimators": [800, 1100, 1400],
            "max_depth": [None, 12, 16, 20],
            "min_samples_split": [2, 4],
            "min_samples_leaf": [1, 2],
            "max_features": ["sqrt", 0.8, 1.0],
        }

    if tag == "GB":
        return GradientBoostingRegressor(random_state=seed), {
            "n_estimators": [600, 800, 1000, 1200],
            "learning_rate": [0.03, 0.05, 0.07, 0.1],
            "max_depth": [2, 3, 4],
            "subsample": [0.85, 1.0],
            "min_samples_leaf": [1, 2],
        }
    if tag == "RF":
        return RandomForestRegressor(random_state=seed, n_jobs=N_JOBS), forest_grid()
    if tag == "ET":
        return ExtraTreesRegressor(random_state=seed, n_jobs=N_JOBS), forest_grid()
    if tag == "HGB":
        return HistGradientBoostingRegressor(random_state=seed), {
            "learning_rate": [0.03, 0.05, 0.08, 0.1],
            "max_depth": [3, 6, 9],
            "l2_regularization": [0.0, 0.1, 0.3, 0.5],
            "max_bins": [128, 255],
        }
    raise ValueError(f"Unknown model tag in leaderboard: {tag}")

# ---------------- Helpers ----------------
def find_col(frame, candidates):
    """Return the header in ``frame.columns`` matching the first usable candidate.

    Headers and candidates are compared after removing all whitespace and
    lower-casing.  Candidates are tried in order; ``None`` is returned when
    none of them matches.
    """
    def squash(label):
        return re.sub(r"\s+", "", label).lower()

    lookup = {}
    for column in frame.columns:
        lookup[squash(column)] = column  # a later header with the same key wins

    hits = (lookup[key] for key in map(squash, candidates) if key in lookup)
    return next(hits, None)

def to_num(series):
    """Parse a text-ish column into numbers, tolerating common decorations.

    Handles things like "45%", "Round 2", "R3", "1st" and "1,200": values are
    stringified and trimmed, the substitutions below run in order, and the
    result goes through ``pd.to_numeric``.  Unparseable values become NaN.
    """
    substitutions = [
        # pattern,                 treat as regex?
        ('%',                      False),  # percent sign
        (r'(?i)round\s*',          True),   # the word "round"
        (r'(?i)^r\s*',             True),   # leading "R" as in "R1"
        (r'(?i)(st|nd|rd|th)$',    True),   # trailing ordinal suffix
        (',',                      False),  # thousands separator
        (r'[^0-9\.\-]',            True),   # every remaining non-numeric character
    ]
    cleaned = series.astype(str).str.strip()
    for pattern, as_regex in substitutions:
        cleaned = cleaned.str.replace(pattern, '', regex=as_regex)
    return pd.to_numeric(cleaned, errors='coerce')

def invert_lower_better(df):
    """Flip the sign of every "lower is better" column that ``df`` contains.

    Earlier round / faster time / younger age / faster shuttles → higher grade.
    The frame is modified in place and the same object is returned.
    """
    candidates = ("40 Time", "Draft Capital", "Shuttle", "Three Cone", "Draft Age")
    present = [name for name in candidates if name in df.columns]
    for name in present:
        df[name] = -df[name]
    return df

def add_interactions(X_df, inter_names):
    """Return a copy of ``X_df`` extended with the requested interaction columns.

    Blank names are skipped.  Each remaining name is resolved through the
    module-level ``INTERACTIONS`` mapping to its two parents, and the product
    column is added only when both parents exist.  ``X_df`` is left untouched.
    """
    out = X_df.copy()
    for name in filter(None, inter_names):
        left, right = INTERACTIONS[name]
        if left not in out.columns or right not in out.columns:
            continue
        out[name] = out[left] * out[right]
    return out

def rmse(y_true, y_pred):
    """Root-mean-squared error of ``y_pred`` against ``y_true``, as a built-in float."""
    from sklearn.metrics import mean_squared_error  # local import keeps the helper self-contained
    squared_error = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(squared_error))

# ---------------- Load leaderboard & pick the top row ----------------
def _split_pipe(value):
    """Split a '|'-joined leaderboard cell into a list; blank or NaN cells give []."""
    if pd.isna(value) or value == "":
        return []
    return str(value).split("|")


board = pd.read_csv(LEADERBOARD_CSV)
if board.empty:
    raise ValueError(f"Leaderboard is empty: {LEADERBOARD_CSV}")

# Sort by TEST_R2 desc (tie-breaker by cvR2_mean if present); keep file order if neither exists
sort_cols = [c for c in ["TEST_R2","cvR2_mean"] if c in board.columns]
if sort_cols:
    board_sorted = board.sort_values(sort_cols, ascending=[False]*len(sort_cols)).reset_index(drop=True)
else:
    board_sorted = board.reset_index(drop=True)
best = board_sorted.iloc[0]

best_tag  = best["model"]
bases     = _split_pipe(best["bases"])
inters    = _split_pipe(best["interactions"])   # an empty CSV cell is read back as NaN
if not bases:
    raise ValueError(f"Top leaderboard row lists no base features: {LEADERBOARD_CSV}")

print("== Using TOP leaderboard row ==")
print(f"Model: {best_tag}")
print(f"Bases: {bases}")
print(f"Interactions: {inters}")
if "TEST_R2" in best:
    print(f"Leaderboard TEST R²: {best['TEST_R2']:.4f}")

# ---------------- Load data & build features ----------------
df = pd.read_csv(DATA_CSV)
df.columns = [c.strip() for c in df.columns]

y_col    = find_col(df, TARGET_CANDS)
name_col = find_col(df, NAME_CANDS) or "Player"
if not y_col:
    raise ValueError(f"Could not find target column in {TARGET_CANDS}. Available: {list(df.columns)}")
if name_col not in df.columns:
    raise ValueError(f"Could not find player-name column in {NAME_CANDS}. Available: {list(df.columns)}")

# map bases (canonical feature name -> actual CSV header); every base is required
mapped = {}
for feat in bases:
    col = find_col(df, ALIASES.get(feat, [feat]))
    if col is None:
        raise ValueError(f"Could not find required feature '{feat}' in the CSV headers.")
    mapped[feat] = col

# restrict interactions to those whose parents exist
valid_inters = []
for name in inters:
    if not name:
        continue
    if name not in INTERACTIONS:
        print(f"Warning: interaction '{name}' not recognized; skipping.")
        continue
    a,b = INTERACTIONS[name]
    if a in mapped and b in mapped:
        valid_inters.append(name)

# Build interactions from the RAW parent values and only then flip the
# "lower is better" columns: that is the order the wide search used, so the
# refit model sees the same feature definitions the leaderboard row was scored on.
X_base = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
X_all  = add_interactions(X_base, valid_inters)   # returns a copy; X_base stays raw
X_all  = invert_lower_better(X_all)

y_all   = to_num(df[y_col])
# Blank out missing names BEFORE stringifying; the reverse order yields the text "nan".
names   = df[name_col].fillna("").astype(str)

# keep rows with target
mask    = y_all.notna()
X_all   = X_all.loc[mask].reset_index(drop=True)
y_all   = y_all.loc[mask].reset_index(drop=True)
names   = names.loc[mask].reset_index(drop=True)

# impute
imp = SimpleImputer(strategy="median")
X_imp = imp.fit_transform(X_all)

# ---------------- Train chosen model family on ALL rows ----------------
est, grid = model_space(best_tag, SEED)
n_iter = min(20, int(np.prod([len(v) for v in grid.values()])))
cv = KFold(n_splits=CV_FOLDS, shuffle=True, random_state=SEED)

search = RandomizedSearchCV(est, grid, n_iter=n_iter, scoring="r2",
                            cv=cv, random_state=SEED, n_jobs=N_JOBS)
search.fit(X_imp, y_all)
best_est = search.best_estimator_   # already refit on every row by the search

# These are IN-SAMPLE predictions: the model has seen every row it is scoring.
y_pred = best_est.predict(X_imp)
y_pred = np.clip(y_pred, CLIP_MIN, CLIP_MAX)

# ---------------- Save results (ALL players) ----------------
OUT_DIR.mkdir(parents=True, exist_ok=True)
out = pd.DataFrame({
    "Player": names.values,
    "Actual_RB_Grade": y_all.values,
    "Predicted_RB_Grade": y_pred,
    "Error": y_pred - y_all.values
}).sort_values("Actual_RB_Grade", ascending=False)
out.to_csv(OUT_CSV, index=False)

# quick in-sample metrics (just for sense check)
R2   = r2_score(y_all, y_pred)
MAE  = mean_absolute_error(y_all, y_pred)
RMSE = rmse(y_all, y_pred)

print("\n=== Top model applied to FULL dataset ===")
print(f"Chosen model: {best_tag}  | tuned params: {search.best_params_}")
print(f"In-sample R²: {R2:.4f} | MAE: {MAE:.4f} | RMSE: {RMSE:.4f}")
print(f"Saved → {OUT_CSV}")
print("\nPreview:")
print(out.head(15).round(3).to_string(index=False))


In [None]:
from src.visuals.plot_feature_scatter_batch import main

# Generate the RB feature scatter plots with the batch plotter.
# max_plots=900 is the plot cap passed in; cols=3 x rows=2 gives 6 plots per PDF page
# (page format per the original note — confirm against main()).
main(position="RB", max_plots=900, cols=3, rows=2)
