In [None]:
import sys
sys.path.append("/Users/chase/Desktop/Comp_Sci/Capstone/Dynasty")

from src.process_college import build_player_dict

player_dict = build_player_dict(verbose=False)
print(len(player_dict))
print(player_dict.get("Cameron Ward"))


In [None]:
import sys
sys.path.append("/Users/chase/Desktop/Comp_Sci/Capstone/Dynasty")

from src.process_combine import build_combine_dict

player_combine = build_combine_dict(verbose=False)
print(len(player_combine))
print(player_combine.get("Amon-Ra St Brown"))


In [None]:
from src.process_pro_qb import run_pro_qb_player

qb_dict = run_pro_qb_player(years=range(2016, 2025), s_type="REG", verbose=False)
print(len(qb_dict))
print(qb_dict.get("Patrick Mahomes"))

In [None]:
from src.process_pro_wr import run_pro_wr_player

wr_dict = run_pro_wr_player(years=range(2016, 2025), s_type="REG", verbose=False)
print(len(wr_dict))
print(wr_dict.get("Rashee Rice"))

In [None]:
from src.process_pro_rb import run_pro_rb_player

rb_dict = run_pro_rb_player(years=range(2016, 2025), s_type="REG", verbose=False)
print(len(rb_dict))
print(rb_dict.get("Christian McCaffrey"))

In [None]:
from src.process_pro_te import run_pro_te_player

te_dict = run_pro_te_player(years=range(2016, 2025), s_type="REG", verbose=False)
print(len(te_dict))
print(te_dict.get("Travis Kelce"))

In [None]:
from src.fantasycalc_client import (
        get_player_value, search_players,
        get_rankings_df, save_current_rankings
    )

# Look up one player
row = get_player_value("Breece Hall")
print(row)

# Search for possible name matches
print(search_players("Harrison"))

# Get full rankings as a DataFrame
df = get_rankings_df(dynasty=True, num_qbs=2, teams=12, ppr=1.0)
print(df.head())

# Save CSV snapshot(s) to Market_Value/
path = save_current_rankings(dynasty=True, num_qbs=2, teams=12, ppr=1.0)
print("Saved:", path)

In [None]:
from src.plot_scatter_wr_combine import main
main(years=range(2020, 2026), limit=50, pick_min=1, pick_max=32)


In [None]:
import sys
sys.path.append("../src")

from src.utils import clean_player_name

print(clean_player_name("Amon-Ra St. Brown"))   # Amon-Ra StBrown
print(clean_player_name("amon-ra st brown"))    # Amon-Ra StBrown
print(clean_player_name(None))                  # ""


In [None]:
import re

def clean_player_name(player_name):
    """Clean names while preserving apostrophes/hyphens, fusing 'St. X' -> 'StX',
    and normalizing casing to Title Case."""
    if not isinstance(player_name, str):
        return player_name

    s = player_name.strip()

    # 1) Remove common suffixes (Jr, Sr, II, III, IV, V), case-insensitive
    suffixes = ['Jr', 'Sr', 'II', 'III', 'IV', 'V']
    s = re.sub(r'\b(?:' + '|'.join(suffixes) + r')\b\.?', '', s, flags=re.IGNORECASE)

    # 2) Keep only word chars, whitespace, apostrophes, and hyphens
    s = re.sub(r"[^\w\s'-]", '', s)

    # 3) Fuse 'St. ' or 'St ' (any case) before a capitalized surname -> 'StSurname'
    s = re.sub(r"\bSt[.\s]+(?=[A-Z])", "St", s, flags=re.IGNORECASE)

    # 4) Collapse extra spaces
    s = ' '.join(s.split())

    # 5) Normalize to Title Case (preserves apostrophes/hyphens properly)
    s = s.title()

    # Fix common cases where title-casing breaks (e.g., "O'Neal" -> "O'Neal", not "O'Neal")
    # The default .title() already does this okay, but just in case:
    s = re.sub(r"\bO'([A-Z])", lambda m: "O'" + m.group(1).upper(), s)

    return s


def strip_name_marks(s: object) -> object:
    """Strip common extraneous marks like '*' without touching apostrophes or hyphens."""
    if not isinstance(s, str):
        return s
    return s.replace("*", "")


# ---- quick checks ----
tests = [
    "Amon-Ra St. Brown",
    "amon-ra st. brown",
    "O'Neal Jr.",
    "jean-baptiste iii",
    "ST. JOHN",
]
for t in tests:
    print(t, "->", clean_player_name(t))


In [None]:
import pandas as pd

df = pd.read_csv("./data/Bakery/RB/Bakery_RB_2017.csv")
pd.set_option("display.max_columns", None)
print(df.shape)
df.head()


In [None]:
import pandas as pd

# adjust path (this assumes you’re in Dynasty/notebooks/)
df = pd.read_csv("./data/Bakery/RB/Bakery_RB_2017.csv")

# Clean up column names
df.columns = [c.strip() for c in df.columns]
print("Columns:", df.columns.tolist())
df.head()


In [None]:
features = [
    "DOM++", "40 Time", "BMI", "YPC",
    "ELU", "YCO/A", "Break%", "Draft Cap", "BAMA"
]

target = "RB Grade"


In [None]:
from sklearn.preprocessing import StandardScaler

X = df[features].copy()
y = df[target]

# invert "lower is better"
X["40 Time"]      = -X["40 Time"]
X["Draft Cap"] = -X["Draft Cap"]

# drop rows with missing values
mask = X.notna().all(axis=1) & y.notna()
X = X[mask]
y = y[mask]

# normalize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
from sklearn.linear_model import LinearRegression
import pandas as pd

model = LinearRegression()
model.fit(X_scaled, y)

weights = pd.Series(model.coef_, index=features).sort_values(ascending=False)
print("Approximate Weights for RB Grade:")
print(weights)

print("\nIntercept:", model.intercept_)
print("R² (fit quality):", round(model.score(X_scaled, y), 4))


In [None]:
from sklearn.linear_model import Lasso

lasso_pos = Lasso(alpha=0.01, positive=True, max_iter=10000)
lasso_pos.fit(X_scaled, y)

weights_lasso = pd.Series(lasso_pos.coef_, index=features).sort_values(ascending=False)
print("Lasso (positive, shrunk weights):\n", weights_lasso)
print("\nR²:", round(lasso_pos.score(X_scaled, y), 4))


In [None]:
from sklearn.linear_model import LinearRegression

model_pos = LinearRegression(positive=True)
model_pos.fit(X_scaled, y)

weights_pos = pd.Series(model_pos.coef_, index=features).sort_values(ascending=False)
print("Non-Negative Weights:\n", weights_pos)
print("\nR²:", round(model_pos.score(X_scaled, y), 4))


In [2]:
# ===== Reverse-engineer Bakery RB Grade from Bakery_RB_Overall.csv (non-negative weights, no Breakout Age) =====
import re, json, numpy as np, pandas as pd
from pathlib import Path
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from scipy.optimize import nnls

# ---------- config ----------
CSV_PATH = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")
ROOT = CSV_PATH.parent
OUT_DIR = Path("./data/Bakery/_derived"); OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------- helpers ----------
def find_col(frame, candidates):
    norm = {re.sub(r"\s+", "", c).lower(): c for c in frame.columns}
    for cand in candidates:
        key = re.sub(r"\s+", "", cand).lower()
        if key in norm:
            return norm[key]
    return None

def to_num(series):
    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

# canonical features to look for (NO Breakout Age)
ALIASES = {
    "DOM++":        ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":      ["40 Time","Forty","40"],
    "BMI":          ["BMI"],
    "YPC":          ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":          ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":        ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":       ["Break%","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital":["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":         ["Bama","Bama Rating","BamaAdj"],
    # optional extras if present
    "Shuttle":      ["Shuttle","Short Shuttle","20 Shuttle","20 Yard Shuttle"],
    "Three Cone":   ["3 Cone","Three Cone","3-Cone"],
    "Vertical":     ["Vertical","Vertical Jump"],
    "Broad":        ["Broad","Broad Jump"],
    "Speed Score":  ["Speed Score","SpeedScore"],
    "Rec Yards":    ["Receiving Yards","Rec Yds","RecYds"],
    "Targets":      ["Targets","Target Share","Tgt%"],
}
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]

# ---------- load ----------
if not CSV_PATH.exists():
    # fall back to any similarly named overall file
    candidates = list(ROOT.glob("Bakery_RB_Overall*.csv"))
    if not candidates:
        raise FileNotFoundError(f"Could not find {CSV_PATH} or any Bakery_RB_Overall*.csv under {ROOT}")
    CSV_PATH = candidates[0]

df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]
print("Loaded:", CSV_PATH)
print("Rows x Cols:", df.shape)

# ---------- map target + features ----------
y_col = find_col(df, TARGET_CANDS)
if not y_col:
    raise ValueError(f"Could not find RB Grade in columns:\n{df.columns.tolist()}")

mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
mapped = {k:v for k,v in mapped.items() if v is not None}

if not mapped:
    raise ValueError("No usable feature columns found. Inspect df.columns for header names.")

print("\nUsing features (canonical <- sheet column):")
for k,v in mapped.items():
    print(f"  {k:<12} <- {v}")

X_raw = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y_raw = to_num(df[y_col])

# ---------- drop rows with NaN TARGET ----------
mask = y_raw.notna()
dropped = len(y_raw) - mask.sum()
if dropped:
    print(f"\nDropped {dropped} rows with NaN RB Grade.")
X_raw = X_raw.loc[mask].reset_index(drop=True)
y = y_raw.loc[mask].reset_index(drop=True)

# ---------- keep columns with enough data (loose thresholds for real-world sheets) ----------
keep = [c for c in X_raw.columns if X_raw[c].notna().sum() >= 5 and X_raw[c].nunique(dropna=True) > 1]
if not keep:
    raise ValueError("All candidate features are too sparse/constant. "
                     "Relax thresholds or ensure the Overall file has those columns filled.")
X_raw = X_raw[keep]
print("Kept features:", keep)

# ---------- invert where lower is better (NO Breakout Age) ----------
for c in ["40 Time","Draft Capital","Shuttle","Three Cone"]:
    if c in X_raw.columns:
        X_raw[c] = -X_raw[c]

# ---------- impute X (median) + standardize ----------
imp = SimpleImputer(strategy="median")
X_imputed = imp.fit_transform(X_raw)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# final NaN guards
if np.isnan(X_scaled).any():
    raise ValueError("X still contains NaNs after imputation/standardization. Please inspect your data.")

if y.isna().any():
    raise ValueError("y contains NaNs after filtering; this should not happen.")

# ---------- fit non-negative models ----------
results = {}

# (A) Positive OLS
ols_pos = LinearRegression(positive=True)
ols_pos.fit(X_scaled, y)
r2_ols = float(ols_pos.score(X_scaled, y)) if y.var() > 0 else float("nan")
results["OLS_Positive"] = (r2_ols, pd.Series(ols_pos.coef_, index=X_raw.columns))

# (B) NNLS with mean intercept (stable, non-negative)
y_mean = float(y.mean())
w_nnls, _ = nnls(X_scaled, (y - y_mean).to_numpy())
y_pred = y_mean + X_scaled @ w_nnls
r2_nnls = float(1 - np.sum((y - y_pred)**2) / np.sum((y - y_mean)**2)) if y.var() > 0 else float("nan")
results["NNLS_Positive"] = (r2_nnls, pd.Series(w_nnls, index=X_raw.columns))

# ---------- report ----------
rows = []
for name, (r2, coefs) in results.items():
    row = {"Model": name, "R2": r2}
    row.update({f"w:{k}": v for k,v in coefs.items()})
    rows.append(row)

comp = pd.DataFrame(rows).set_index("Model").sort_values("R2", ascending=False)
pd.set_option("display.max_columns", None)
print("\n=== Model comparison (non-negative only) ===")
display(comp.round(4))

best_name = comp.index[0]
best_r2, best_coefs = results[best_name]
print(f"\nBest non-negative model: {best_name}  (R²={best_r2:.3f})")
print("\nSorted weights (standardized):")
print(best_coefs.sort_values(ascending=False).round(4))

# ---------- save artifacts for reuse ----------
weights_path = OUT_DIR / f"rb_weights_{best_name}.csv"
scaler_path  = OUT_DIR / "rb_scaler.json"
meta_path    = OUT_DIR / "rb_feature_mapping.json"

best_coefs.to_csv(weights_path, header=["coef"])
with open(scaler_path, "w") as f:
    json.dump({
        "means": scaler.mean_.tolist(),
        "scales": scaler.scale_.tolist(),
        "feature_order": list(X_raw.columns),
        "intercept_mean": y_mean,
        "model": best_name
    }, f, indent=2)

with open(meta_path, "w") as f:
    json.dump({"mapped_columns": mapped, "kept_features": keep, "target": y_col}, f, indent=2)

print(f"\nSaved weights → {weights_path}")
print(f"Saved scaler   → {scaler_path}")
print(f"Saved mapping  → {meta_path}")


Loaded: data/Bakery/RB/Bakery_RB_Overall.csv
Rows x Cols: (247, 125)

Using features (canonical <- sheet column):
  DOM++        <- DOM++
  40 Time      <- 40 Time
  BMI          <- BMI
  YPC          <- YPC
  ELU          <- ELU
  YCO/A        <- YCO/A
  Break%       <- Break %
  Draft Capital <- Draft Cap
  Bama         <- BAMA
  Rec Yards    <- Rec Yds
Kept features: ['DOM++', '40 Time', 'BMI', 'YPC', 'ELU', 'YCO/A', 'Break%', 'Draft Capital', 'Bama', 'Rec Yards']

=== Model comparison (non-negative only) ===


Unnamed: 0_level_0,R2,w:DOM++,w:40 Time,w:BMI,w:YPC,w:ELU,w:YCO/A,w:Break%,w:Draft Capital,w:Bama,w:Rec Yards
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
OLS_Positive,0.7117,1.3924,0.6178,0.387,0.352,0.8815,0.0,0.0366,0.0426,1.0676,0.0
NNLS_Positive,0.7117,1.3924,0.6178,0.387,0.352,0.8815,0.0,0.0366,0.0426,1.0676,0.0



Best non-negative model: OLS_Positive  (R²=0.712)

Sorted weights (standardized):
DOM++            1.3924
Bama             1.0676
ELU              0.8815
40 Time          0.6178
BMI              0.3870
YPC              0.3520
Draft Capital    0.0426
Break%           0.0366
YCO/A            0.0000
Rec Yards        0.0000
dtype: float64

Saved weights → data/Bakery/_derived/rb_weights_OLS_Positive.csv
Saved scaler   → data/Bakery/_derived/rb_scaler.json
Saved mapping  → data/Bakery/_derived/rb_feature_mapping.json


In [3]:
# ===== Evaluate Bakery RB Grade with multiple models (train/test split) =====
import re, json, numpy as np, pandas as pd
from pathlib import Path
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from scipy.optimize import nnls

# ---------- config ----------
CSV_PATH = Path("./data/Bakery/RB/Bakery_RB_Overall.csv")

ALIASES = {
    "DOM++":        ["DOM++","DOMpp","DOM_plus_plus","DOMpp_Weighted","DOM"],
    "40 Time":      ["40 Time","Forty","40"],
    "BMI":          ["BMI"],
    "YPC":          ["YPC","Yards per Carry","Yards/Carry","Rushing YPC"],
    "ELU":          ["ELU","Elusiveness","Elusiveness Rating"],
    "YCO/A":        ["YCO/A","YAC/A","Yards After Contact / Att","Yards After Contact per Attempt"],
    "Break%":       ["Break%","Breakaway %","Breakaway Percentage","Breakaway%"],
    "Draft Capital":["Draft Capital","Draft Cap","Draft Round","Round","Rnd"],
    "Bama":         ["Bama","Bama Rating","BamaAdj"],
}
TARGET_CANDS = ["RB Grade","RBGrade","RB_Grade"]

# ---------- helpers ----------
def find_col(frame, candidates):
    norm = {re.sub(r"\s+", "", c).lower(): c for c in frame.columns}
    for cand in candidates:
        key = re.sub(r"\s+", "", cand).lower()
        if key in norm:
            return norm[key]
    return None

def to_num(series):
    s = series.astype(str).str.strip()
    s = (s.str.replace('%','',regex=False)
           .str.replace(r'(?i)round\s*','',regex=True)
           .str.replace(r'(?i)^r\s*','',regex=True)
           .str.replace(r'(?i)(st|nd|rd|th)$','',regex=True)
           .str.replace(',','',regex=False)
           .str.replace(r'[^0-9\.\-]','',regex=True))
    return pd.to_numeric(s, errors='coerce')

# ---------- load data ----------
df = pd.read_csv(CSV_PATH)
df.columns = [c.strip() for c in df.columns]

y_col = find_col(df, TARGET_CANDS)
mapped = {feat: find_col(df, alts) for feat, alts in ALIASES.items()}
mapped = {k:v for k,v in mapped.items() if v is not None}

X_raw = pd.DataFrame({feat: to_num(df[col]) for feat, col in mapped.items()})
y_raw = to_num(df[y_col])

# drop NaN target
mask = y_raw.notna()
X_raw, y_raw = X_raw.loc[mask], y_raw.loc[mask]

# invert where lower = better
for c in ["40 Time","Draft Capital"]:
    if c in X_raw.columns:
        X_raw[c] = -X_raw[c]

# impute + scale
imp = SimpleImputer(strategy="median")
scaler = StandardScaler()

X_imp = imp.fit_transform(X_raw)
X_scaled = scaler.fit_transform(X_imp)

# ---------- train/test split ----------
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_raw, test_size=0.2, random_state=42
)

# ---------- models ----------
models = {
    "OLS_Positive": LinearRegression(positive=True),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.01, max_iter=5000),
    "RandomForest": RandomForestRegressor(n_estimators=200, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "NNLS_Positive": "custom"  # handled separately
}

results = []

# fit sklearn models
for name, model in models.items():
    if name == "NNLS_Positive":
        w, _ = nnls(X_train, y_train - y_train.mean())
        y_pred = y_train.mean() + X_test @ w
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    results.append({
        "Model": name,
        "R2": r2_score(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
        "RMSE": mean_squared_error(y_test, y_pred, squared=False)
    })

# ---------- results ----------
comp = pd.DataFrame(results).set_index("Model").sort_values("R2", ascending=False)
pd.set_option("display.max_columns", None)
display(comp.round(4))


TypeError: got an unexpected keyword argument 'squared'