In [1]:
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path

# Put the parent of the current working directory at the front of sys.path
# (once) so the `project_code` import below can be resolved.
PROJECT_ROOT = Path.cwd().resolve().parents[0]
_project_root_entry = str(PROJECT_ROOT)
if sys.path.count(_project_root_entry) == 0:
    sys.path.insert(0, _project_root_entry)

from project_code.functions import *
from sklearn.metrics import log_loss
from scipy.stats import pearsonr, spearmanr

In [2]:
# Seed NumPy and PyTorch from one constant so the two seeds cannot drift
# apart when only one of the calls is edited.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x2789a046d90>

In [3]:
# Directory scanned by latest_timestamp() for timestamped export files
# (relative to the notebook's working directory).
exports = Path("../exports")

def latest_timestamp(prefix="wp_model_", suffix=".joblib", directory=None):
    """
    Return the newest ``YYYYMMDD_HHMM`` timestamp among matching files.

    A file matches when its name is exactly
    ``{prefix}{8 digits}_{4 digits}{suffix}``.

    Parameters
    ----------
    prefix, suffix : str
        Literal text expected before / after the timestamp (regex-escaped).
    directory : path-like, optional
        Folder to scan. Defaults to the notebook-level ``exports`` path.

    Returns
    -------
    str
        The greatest matching timestamp. The format is fixed-width and
        zero-padded, so string order equals chronological order.

    Raises
    ------
    FileNotFoundError
        If no file name in the folder matches the pattern.
    """
    folder = exports if directory is None else Path(directory)
    pat = re.compile(rf"^{re.escape(prefix)}(\d{{8}}_\d{{4}}){re.escape(suffix)}$")
    matches = (pat.match(p.name) for p in folder.iterdir())
    ts_list = [m.group(1) for m in matches if m]
    if not ts_list:
        raise FileNotFoundError(f"No files matching {prefix}YYYYMMDD_HHMM{suffix} in {folder.resolve()}")
    return max(ts_list)

# Resolve the newest wp_model export; the cells below load every artifact
# using this timestamp. The defaults are prefix="wp_model_", suffix=".joblib".
ts = latest_timestamp()
print(f"Loading timestamp: {ts}")


Loading timestamp: 20260202_2252


In [None]:
# All artifacts below share the timestamp `ts` resolved from the newest
# wp_model export.
# NOTE: joblib.load unpickles arbitrary objects; only load exports that were
# produced by this project.

# ---------- Variables ----------
test_season   = joblib.load(f"../exports/test_season_{ts}.joblib")
wp_features   = joblib.load(f"../exports/wp_features_{ts}.joblib")
wp_base_features   = joblib.load(f"../exports/wp_base_features_{ts}.joblib")
go_features   = joblib.load(f"../exports/go_features_{ts}.joblib")
fg_features   = joblib.load(f"../exports/fg_features_{ts}.joblib")
punt_features = joblib.load(f"../exports/punt_features_{ts}.joblib")
go_folds = joblib.load(f"../exports/go_folds_{ts}.joblib")

# ---------- DataFrames ----------
# NOTE(review): these two names have no "_" before the timestamp, unlike every
# other export here; confirm they match what the export step actually writes.
raw_pbp = pd.read_parquet(f"../exports/raw_pbp{ts}.parquet")
pbp = pd.read_parquet(f"../exports/pbp{ts}.parquet")
pbp_fourth_train = pd.read_parquet(f"../exports/pbp_fourth_train_{ts}.parquet")
pbp_fourth_test  = pd.read_parquet(f"../exports/pbp_fourth_test_{ts}.parquet")

# ---------- Models ----------
wp_model   = joblib.load(f"../exports/wp_model_{ts}.joblib")
go_model   = joblib.load(f"../exports/go_model_{ts}.joblib")
fg_model   = joblib.load(f"../exports/fg_model_{ts}.joblib")
punt_model = joblib.load(f"../exports/punt_model_{ts}.joblib")

# Objects
X_scaler = joblib.load(f"../exports/X_scaler_{ts}.joblib")
y_scaler = joblib.load(f"../exports/y_scaler_{ts}.joblib")

In [None]:
# One hand-built fourth-down game state, pushed through create_df_with_ewp
# with test=True as a single-row DataFrame.
# NOTE(review): the name `state` is reassigned to a DataFrame in a later cell.
state = dict(
    yardline_100=4,
    down=4,
    ydstogo=4,
    game_seconds_remaining=1250,
    half_seconds_remaining=1250,
    score_differential=0,
    posteam_timeouts_remaining=2,
    defteam_timeouts_remaining=2,
    temp_F=20,
    wind_mph=10,
    possession_spread_line=-4.5,
    total_line=43.5,
)

_single_state_df = pd.DataFrame([state])
test_results, fourth_test_results = create_df_with_ewp(
    _single_state_df,
    wp_model=wp_model, go_model=go_model, fg_model=fg_model, punt_model=punt_model,
    wp_features=wp_features, wp_base_features=wp_base_features,
    go_features=go_features, fg_features=fg_features, punt_features=punt_features,
    test=True,
)

In [None]:
# Calibration + Brier + Murphy decomposition
# Score only rows with both an outcome and a prediction; a single NaN would
# otherwise turn the base rate and everything derived from it into NaN.
scored = pbp_fourth_test[["win_actual", "wp_pred"]].astype(float).dropna()
y = scored["win_actual"].to_numpy()
p = scored["wp_pred"].to_numpy()

# Bin predicted probabilities
bins = np.linspace(0, 1, 11)  # 10 equal-width bins
wp_bin = pd.cut(scored["wp_pred"], bins=bins, include_lowest=True)

# Aggregate by bin (observed=False keeps empty bins visible in the table)
cal_table = (
    scored
    .groupby(wp_bin, observed=False)
    .agg(
        wp_pred_mean=("wp_pred", "mean"),
        win_rate=("win_actual", "mean"),
        count=("win_actual", "count"),
    )
    .reset_index()
)

N = len(scored)

# Empty bins have NaN means and zero weight; NaN * 0 is NaN, so they must be
# excluded before summing or the whole decomposition becomes NaN.
filled = cal_table["count"] > 0
p_bar = cal_table.loc[filled, "wp_pred_mean"].to_numpy(dtype=float)
o_bar = cal_table.loc[filled, "win_rate"].to_numpy(dtype=float)
n_k  = cal_table.loc[filled, "count"].to_numpy(dtype=float)

o = y.mean()  # base rate

reliability = np.sum((n_k / N) * (p_bar - o_bar) ** 2)
resolution  = np.sum((n_k / N) * (o_bar - o) ** 2)
uncertainty = o * (1.0 - o)

# Exact Brier score from the raw predictions. REL - RES + UNC only reproduces
# it when all forecasts inside a bin are identical, so report both.
brier = np.mean((p - y) ** 2)
brier_binned = reliability - resolution + uncertainty

print(f"Brier score:  {brier:.5f}")
print(f"Binned (REL - RES + UNC): {brier_binned:.5f}")
print(f"Reliability:  {reliability:.5f}")
print(f"Resolution:   {resolution:.5f}")
print(f"Uncertainty:  {uncertainty:.5f}")

x = cal_table["wp_pred_mean"].to_numpy()
yhat = cal_table["win_rate"].to_numpy()

plt.figure(figsize=(6, 6))
plt.plot(x, yhat, marker="o")
plt.plot([0, 1], [0, 1], "--", color="gray")  # perfect-calibration diagonal
plt.xlabel("Predicted WP")
plt.ylabel("Observed Win Rate")
plt.title("Calibration Curve")
plt.show()

In [None]:
# Show the per-bin calibration table: mean predicted WP, observed win rate, row count.
cal_table

In [None]:
# Perspective-flip diagnostic: predict WP for each test play and for a mirrored
# copy of it, then measure how far the two predictions are from summing to 1.
# NOTE(review): any wp_features columns derived from the flipped inputs are not
# recomputed here unless predict_wp does so; confirm.
state = pbp_fourth_test[wp_features].copy()

# Mirror: negate the score differential and spread, swap the two timeout
# columns, and reflect the yard line around 100.
state_flipped = state.copy()
timeout_cols = ["posteam_timeouts_remaining", "defteam_timeouts_remaining"]
state_flipped["score_differential"] = state_flipped["score_differential"] * -1
state_flipped[timeout_cols] = state_flipped[timeout_cols[::-1]].values
state_flipped["yardline_100"] = 100 - state_flipped["yardline_100"]
state_flipped["possession_spread_line"] = state_flipped["possession_spread_line"] * -1

wp_raw = predict_wp(state, wp_model, wp_features)
wp_raw_flipped = predict_wp(state_flipped, wp_model, wp_features)

flip_err = wp_raw + wp_raw_flipped - 1
for _label, _value in (
    ("flip_err mean:", flip_err.mean()),
    ("flip_err abs mean:", np.abs(flip_err).mean()),
    ("flip_err std:", flip_err.std()),
):
    print(_label, _value)

In [None]:
bins = np.linspace(0, 1, 11)

# Function to compute empirical win fraction per bin
def empirical_win_fraction(pred_col):
    """
    Mean of ``win_actual`` within each predicted-probability bin.

    Parameters
    ----------
    pred_col : str
        Column of ``pbp_fourth_test`` holding the predicted probability.

    Returns
    -------
    pandas.Series
        One value per bin in ``bins`` (NaN for bins with no rows), indexed by
        the bin interval.
    """
    # include_lowest keeps predictions of exactly 0.0, matching the binning
    # used for the WP calibration table.
    pred_bin = pd.cut(pbp_fourth_test[pred_col], bins=bins, include_lowest=True)
    # observed=False keeps empty bins so every column has the same index.
    return pbp_fourth_test.groupby(pred_bin, observed=False)['win_actual'].mean()

# Compute results for each play type. The frame is built directly from the
# grouped results, so the row index always matches the pd.cut categories; the
# table does not rely on a hand-made rounded index lining up with them.
calibration_df = pd.DataFrame(
    {col: empirical_win_fraction(col).round(3) for col in ('ewp_punt', 'ewp_fg', 'ewp_go')}
)
calibration_df.index.name = "predicted_wp_bin"

# Bins indicate predicted wp bin; columns are how often a team actually won in that predicted wp bin
calibration_df

In [None]:
ewp_columns = ['ewp_punt', 'ewp_fg', 'ewp_go']

# Flag expected-WP values outside [0, 1]. NaN compares False on both sides,
# so missing values are not counted as violations.
_ewp_values = pbp_fourth_test[ewp_columns]
violation_mask = _ewp_values.lt(0) | _ewp_values.gt(1)

# Number of out-of-range values in each EWP column
violations = violation_mask.sum()

if violations.any():
    print("Violations detected:")
    print(violations)
    print(pbp_fourth_test[violation_mask.any(axis=1)].head())
else:
    print("No EWP violations detected.")

In [None]:
# Conditional mean regret
# Conditional mean regret: average regret_actual over rows where follow_model == 0
pbp_fourth_test.loc[pbp_fourth_test["follow_model"] == 0, "regret_actual"].mean()

In [None]:
# Compute regret stats by play type ---
# `== True` is kept on purpose: rows where `disagreed` is missing are excluded.
df_disagree = pbp_fourth_test[pbp_fourth_test["disagreed"] == True].copy()

# Coerce BEFORE aggregating so non-numeric entries become NaN and cannot
# break the mean/median.
df_disagree['regret_actual'] = pd.to_numeric(df_disagree['regret_actual'], errors='coerce')

# Compute stats (size counts all rows; mean/median/quantile skip NaN)
_regret_groups = df_disagree.groupby('play_type_actual')['regret_actual']
regret_by_play = _regret_groups.agg(['size', 'mean', 'median'])
regret_by_play['95th'] = _regret_groups.quantile(0.95)

print("Regret by Play Type Conditioned on Coach Disagreement:")
regret_by_play

In [None]:
# Regret vs margin for disagreements
# Decile bins of decision_margin; duplicates="drop" merges repeated edges.
disagree_bins = pd.qcut(df_disagree["decision_margin"], q=10, duplicates="drop")
regret_by_margin = df_disagree.groupby(disagree_bins, observed=False)["regret_actual"].agg(
    mean="mean",
    # Series.quantile skips NaN like mean/count do; np.percentile would
    # return NaN for any bin containing a missing regret.
    p95=lambda x: x.quantile(0.95),
    count="count"
)
print("Regret vs Decision Margin (Disagreements)")
display(regret_by_margin)

# Bin decision margin
# NOTE: this adds a `margin_bin` column to pbp_fourth_test in place.
# duplicates="drop" matches the binning above and avoids a ValueError when
# many plays share the same margin.
pbp_fourth_test["margin_bin"] = pd.qcut(pbp_fourth_test["decision_margin"], q=10, duplicates="drop")
follow_by_margin = pbp_fourth_test.groupby("margin_bin", observed=False).agg(
    follow_rate=("follow_model", "mean"),
    count=("follow_model", "size")
)
print("Follow Model Rate vs Decision Margin")
follow_by_margin

In [None]:
# Actual play call vs. recommended play, restricted to the season stored in
# `test_season`; rows missing either label are dropped.
test = pbp_fourth_test.loc[pbp_fourth_test["season"] == test_season]

labels = ["punt", "field_goal", "go"]

cm_df = test[["play_type_actual", "recommended_play"]].dropna()

_actual = cm_df["play_type_actual"].astype(str)
_recommended = cm_df["recommended_play"].astype(str)
cm = confusion_matrix(_actual, _recommended, labels=labels)

# Rows are actual calls, columns are recommendations.
confusion = (
    pd.DataFrame(cm, index=labels, columns=labels)
    .rename_axis(index="Actual", columns="Recommended")
)

confusion

In [None]:
# Row-normalise the confusion table: each row becomes the share of that
# actual play type assigned to each recommendation.
_row_totals = confusion.sum(axis=1)
confusion_norm = confusion.div(_row_totals, axis=0)
confusion_norm.round(4)

In [None]:
def p_go_from_margin(margin, scale):
    """
    Map a margin to a probability with a logistic curve.

    Evaluates ``1 / (1 + exp(-margin / scale))``. A margin of 0 maps to 0.5;
    a smaller ``scale`` makes the curve steeper.

    Parameters
    ----------
    margin : scalar, numpy.ndarray or pandas.Series
        Margin value(s).
    scale : float
        Divisor applied to ``margin`` before the logistic transform.

    Returns
    -------
    Same container type as ``margin`` (a Series stays a Series), with values
    in [0, 1].
    """
    # expit is the numerically stable logistic function: unlike the literal
    # formula it does not overflow in exp() for large negative margin / scale.
    from scipy.special import expit

    return expit(margin / scale)

In [None]:
def tune_scale_cv(df, folds, scale_grid):
    """
    Score each candidate logistic scale by mean validation log-loss.

    For every fold, the target is ``play_type_actual == "go"`` and the
    prediction is ``p_go_from_margin(go_margin, scale)`` on the validation
    rows. Nothing is fitted, so the training indices of each fold are unused.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``play_type_actual`` and ``go_margin``; rows are selected
        positionally (``iloc``).
    folds : iterable of (train_idx, valid_idx)
        Positional index pairs; only ``valid_idx`` is used.
    scale_grid : iterable of float
        Candidate scales.

    Returns
    -------
    list of (scale, mean_log_loss)
        Sorted by ascending loss, so element 0 is the best scale.
    """
    # Targets and margins do not depend on the scale: extract them once per
    # fold. This single pass also makes a one-shot iterator of folds safe.
    fold_data = []
    for _, va_idx in folds:
        dval = df.iloc[va_idx]
        y = (dval["play_type_actual"] == "go").astype(int).to_numpy()
        fold_data.append((y, dval["go_margin"].to_numpy()))

    losses = []
    for s in scale_grid:
        fold_losses = []
        for y, margin in fold_data:
            p = p_go_from_margin(margin, scale=s)
            p = np.clip(p, 1e-6, 1 - 1e-6)
            # labels=[0, 1] keeps log_loss defined when a validation fold
            # happens to contain only one class.
            fold_losses.append(log_loss(y, p, labels=[0, 1]))
        losses.append((s, float(np.mean(fold_losses))))
    losses.sort(key=lambda x: x[1])

    return losses

In [None]:
# Grid-search the logistic scale on the training fourth downs: 30
# log-spaced candidates between 0.002 and 0.05.
scale_grid = np.geomspace(0.002, 0.05, num=30)
_train_positional = pbp_fourth_train.reset_index(drop=True)
losses = tune_scale_cv(_train_positional, go_folds, scale_grid)

# tune_scale_cv returns results sorted by loss, so the first entry is the best.
best_scale, best_loss = losses[0]
print(best_scale, best_loss)

In [None]:
# Coach-level residual: how much more (or less) often each coach goes for it
# than the margin-based probability implies, relative to the league mean.
df = pbp_fourth_test.copy()

_p_go_unclipped = p_go_from_margin(df["go_margin"], scale=best_scale)
df["p_go_model"] = _p_go_unclipped.clip(lower=.01, upper=.99)

df["coach_go"] = df["play_type_actual"].eq("go").astype(int)

# actual go decision (0/1) minus model probability
df["go_diff"] = df["coach_go"].sub(df["p_go_model"])

league_bias = df["go_diff"].mean()

_coach_mean_diff = df.groupby("possession_coach")["go_diff"].mean()
rai = _coach_mean_diff - league_bias

In [None]:
# Per-coach RAI on the test fourth downs, plus a version shrunk toward zero
# for coaches with few plays.
df = pbp_fourth_test.copy()

_p_go_unclipped = p_go_from_margin(df["go_margin"], scale=best_scale)
df["p_go_model"] = _p_go_unclipped.clip(lower=.01, upper=.99)

df["coach_go"] = df["play_type_actual"].eq("go").astype(int)

# actual go decision (0/1) minus model probability
df["go_diff"] = df["coach_go"] - df["p_go_model"]

# Centre the residuals on the league-wide mean before averaging per coach.
league_bias = df["go_diff"].mean()
df["relative_go_diff"] = df["go_diff"] - league_bias

rai_raw = (
    df.groupby("possession_coach")["relative_go_diff"]
    .agg(rai="mean", n="size")
    .sort_values("rai")
)

k = 30  # prior strength
# Weight n / (n + k): approaches 1 for large samples and 0 for small ones.
_shrink_weight = rai_raw["n"] / (rai_raw["n"] + k)
rai_raw["rai_shrunk"] = _shrink_weight * rai_raw["rai"]

In [None]:
# Ten coaches with the highest shrunk RAI
rai_raw.sort_values(by="rai_shrunk", ascending=False).iloc[:10]

In [None]:
def create_features(df):
    """
    Return a copy of ``df`` with derived football features added.

    Each feature is computed only when its base columns exist; otherwise it
    is filled with a constant (0 for the flags, NaN for the numeric features).
    The input frame is not modified.

    Added columns
    -------------
    is_red_zone : 1 if ``yardline_100 <= 20`` else 0
    is_goal_to_go : 1 if ``ydstogo >= yardline_100`` else 0
    log_ydstogo : ``log1p`` of ``ydstogo`` clipped at 0
    log_game_seconds_remaining : ``log1p`` of ``game_seconds_remaining`` clipped at 0
    abs_score_differential : ``|score_differential|``
    score_time_ratio : ``|score_differential| / (seconds_remaining + 1)``,
        with the seconds clipped at 0
    """

    df = df.copy()
    cols = set(df.columns)

    if "yardline_100" in cols:
        df["is_red_zone"] = (df["yardline_100"] <= 20).astype(int)
    else:
        df["is_red_zone"] = 0

    if {"ydstogo", "yardline_100"} <= cols:
        df["is_goal_to_go"] = (df["ydstogo"] >= df["yardline_100"]).astype(int)
    else:
        df["is_goal_to_go"] = 0

    if "ydstogo" in cols:
        # clip guards log1p against negative inputs
        df["log_ydstogo"] = np.log1p(df["ydstogo"].clip(lower=0))
    else:
        df["log_ydstogo"] = np.nan

    if "game_seconds_remaining" in cols:
        df["log_game_seconds_remaining"] = np.log1p(df["game_seconds_remaining"].clip(lower=0))
    else:
        df["log_game_seconds_remaining"] = np.nan

    if "score_differential" in cols:
        df["abs_score_differential"] = df["score_differential"].abs()
    else:
        df["abs_score_differential"] = np.nan

    if {"score_differential", "game_seconds_remaining"} <= cols:
        # Same clip as the log feature: without it a clock value of -1 makes
        # the denominator 0 and the ratio infinite.
        seconds_left = df["game_seconds_remaining"].clip(lower=0)
        df["score_time_ratio"] = df["score_differential"].abs() / (seconds_left + 1)
    else:
        df["score_time_ratio"] = np.nan

    return df

In [None]:
# Ten coaches with the lowest shrunk RAI
rai_raw.sort_values(by="rai_shrunk").iloc[:10]

In [None]:
# Field position: ten left-closed bins [1, 11), [11, 21), ..., [91, 101),
# each labelled with its inclusive integer range.
yard_edges = list(range(1, 102, 10))
yard_labels = [f"{lo}–{hi - 1}" for lo, hi in zip(yard_edges[:-1], yard_edges[1:])]

df["yardline_bin"] = pd.cut(
    df["yardline_100"], bins=yard_edges, labels=yard_labels,
    right=False, include_lowest=True,
)

# Distance to go: right-closed bins [0, 1], (1, 3], (3, 6], (6, 9], (9, inf)
togo_labels = ["1", "2–3", "4–6", "7–9", "10+"]
togo_bins = [0, 1, 3, 6, 9, np.inf]

df["ydstogo_bin"] = pd.cut(
    df["ydstogo"], bins=togo_bins, labels=togo_labels,
    right=True, include_lowest=True,
)

In [None]:
# Mean go_diff for every (distance-to-go, field-position) cell, with cells
# backed by fewer than `min_n` plays blanked out.
_go_diff_by_cell = df.groupby(["ydstogo_bin", "yardline_bin"], observed=False)["go_diff"]

heat = _go_diff_by_cell.mean().unstack("yardline_bin")
counts = _go_diff_by_cell.size().unstack("yardline_bin")

min_n = 10
heat = heat.mask(counts < min_n)

heat.round(3)

In [None]:
# Per-coach go-for-it rate and sample size (computed once; the cell
# previously built this identical table twice).
agg = df.groupby("possession_coach").agg(
    go_rate=("coach_go", "mean"),
    n=("coach_go", "size")
)

# rai_raw already carries `rai_shrunk` (n / (n + k) * rai), so a plain copy
# gives the same frame as recomputing it here.
rai = rai_raw.copy()

# One row per coach with both the go rate and the shrunk RAI; coaches missing
# either value are dropped.
rai_plot = (
    agg
    .join(rai_raw[["rai_shrunk"]])
    .dropna()
)

x0 = rai_plot["go_rate"].mean()     # league-average aggression
y0 = 0.0                            # neutral intelligence

In [None]:
import matplotlib.pyplot as plt

# Scatter of go-for-it rate (x) against shrunk RAI (y), one point per coach.
fig, ax = plt.subplots(figsize=(8, 6))

ax.scatter(
    rai_plot["go_rate"],
    rai_plot["rai_shrunk"],
    alpha=0.7
)

# Reference lines at the mean go rate (x0) and at zero RAI (y0)
ax.axvline(x0)
ax.axhline(y0)

# --- ADD LABELS
# Every coach is labelled exactly once, nudged off the marker. The label call
# lives inside the loop so it never depends on leftover loop variables.
for coach, r in rai_plot.iterrows():
    ax.text(
        r["go_rate"] + 0.002,
        r["rai_shrunk"] + 0.002,
        str(coach),
        fontsize=8,
        alpha=0.8
    )

ax.set_xlabel("Aggression: Go-for-it rate")
ax.set_ylabel("Intelligence: RAI (shrunk)")
ax.set_title("Aggression vs Intelligence")

plt.show()

In [None]:
# Run the EWP pipeline over the full `pbp` frame (no test flag), returning the
# all-plays frame and the fourth-down frame.
_ewp_kwargs = dict(
    wp_model=wp_model,
    go_model=go_model,
    fg_model=fg_model,
    punt_model=punt_model,
    wp_features=wp_features,
    wp_base_features=wp_base_features,
    go_features=go_features,
    fg_features=fg_features,
    punt_features=punt_features,
)
pbp_all, pbp_fourth_all = create_df_with_ewp(pbp, **_ewp_kwargs)

In [None]:
# Team-season RAI over all fourth downs.
df = pbp_fourth_all.copy()

_p_go_unclipped = p_go_from_margin(df["go_margin"], scale=best_scale)
df["p_go_model"] = _p_go_unclipped.clip(0.01, 0.99)

df["coach_go"] = df["play_type_actual"].eq("go").astype(int)

# decision residual: actual go (0/1) minus model p(go)
df["go_diff"] = df["coach_go"] - df["p_go_model"]

# league bias by season (important: the league's aggressiveness changes over time)
league_bias_by_season = df.groupby("season")["go_diff"].mean()

# RAI per team-season: mean residual, then de-biased with that season's league mean
rai_team_season = (
    df.groupby(["season", "posteam"])["go_diff"].mean()
    .rename("rai_raw")
    .reset_index()
    .assign(
        league_bias=lambda t: t["season"].map(league_bias_by_season),
        rai=lambda t: t["rai_raw"] - t["league_bias"],
    )
)

In [None]:
# Stickiness check: each team's mean RAI over 2022-2024 next to its 2025 RAI.
_is_past = rai_team_season["season"].isin([2022, 2023, 2024])
past = (
    rai_team_season.loc[_is_past]
    .groupby("posteam")["rai"]
    .mean()
    .rename("rai_22_24")
)

_is_future = rai_team_season["season"] == 2025
future = (
    rai_team_season.loc[_is_future]
    .set_index("posteam")["rai"]
    .rename("rai_2025")
)

# The inner join keeps only teams present in both periods.
stick_df = past.to_frame().join(future, how="inner")

In [None]:
# Correlation between past (2022-2024 mean) and 2025 team RAI.
x = stick_df["rai_22_24"].to_numpy()
y = stick_df["rai_2025"].to_numpy()

pearson_r, pearson_p = pearsonr(x, y)
spearman_r, spearman_p = spearmanr(x, y)

# Report the statistics; without this the cell computes them and shows nothing.
print(f"Pearson  r = {pearson_r:.3f} (p = {pearson_p:.3g}, n = {len(x)})")
print(f"Spearman r = {spearman_r:.3f} (p = {spearman_p:.3g})")

In [None]:
# Past vs. future RAI scatter with a least-squares line and zero axes.
fig, ax = plt.subplots(figsize=(8, 6))

# scatter
ax.scatter(x, y, alpha=0.8)

# regression line (degree-1 polynomial fit: slope first, then intercept)
coef = np.polyfit(x, y, 1)
_slope, _intercept = coef[0], coef[1]
x_line = np.linspace(x.min(), x.max(), 100)
y_line = _slope * x_line + _intercept
ax.plot(x_line, y_line, lw=2)

# reference lines
for _draw_zero_line in (ax.axhline, ax.axvline):
    _draw_zero_line(0, color="black", lw=1, alpha=0.6)

ax.set_xlabel("Average RAI (2022–2024)")
ax.set_ylabel("RAI (2025)")
ax.set_title("Past RAI vs Future RAI (Out-of-Sample)");

In [None]:
# NOTE(review): these shell commands execute on every "Run All". They stage
# everything under the current directory, commit with a fixed message, and
# push. Confirm this side effect is intended, and that no data or export
# files get staged by `git add .`.
!git add .
!git commit -m "Update analysis and functions notebooks"
!git push