In [None]:
import pandas as pd
import numpy as np
import jupyter_black
import nfl_data_py as nfl
from typing import Literal, Tuple

# from src.modelling.metrics import success_rate_lambda
# from visualisation.plots import plot_team_scatter

# Activate the jupyter_black extension for this notebook session.
jupyter_black.load()
# Lift pandas' column display limit so wide frames are shown in full.
pd.set_option("display.max_columns", None)

In [None]:
# Load play-by-play data for seasons 2022, 2023 and 2024 (range end is exclusive).
pbp = nfl.import_pbp_data(range(2022, 2025))

In [None]:
# Keep only the rows whose play_type is "pass" or "run"; the cell output is
# the number of rows kept.
pbp_rp = pbp.query('play_type=="pass" or play_type=="run"')
len(pbp_rp)

In [None]:
# Filter for neutral game states: keep plays where both `wp` and `def_wp`
# are at least the threshold.
neutral_wp_threshold = 0.025

pbp_rp_neu = pbp_rp.query(
    f"wp>={neutral_wp_threshold} and def_wp>={neutral_wp_threshold}"
)

# Rush-only subset of the neutral plays. The rushing cells further down read
# `pbp_r_neu`, so it has to be assigned here for the notebook to survive a
# fresh "Restart Kernel -> Run All".
pbp_r_neu = pbp_rp_neu.query('play_type=="run"')

len(pbp_rp_neu)

In [None]:
# Load the weekly data for the 2024 season; the cell output is its row count.
pwk = nfl.import_weekly_data(years=[2024])
len(pwk)

In [None]:
# Number of weekly rows per position group.
pwk["position_group"].value_counts()

In [None]:
# Print a random 4-row sample of the weekly data rendered as a Markdown table.
# No random_state is passed, so the rows differ on every run.
print(pwk.sample(4).to_markdown())
# pwk.sample(4)

In [None]:
# Columns selected from the weekly data when building the quarterback frame.
qb_cols = [
    # identifiers
    "player_id",
    "player_name",
    "player_display_name",
    "recent_team",
    "season",
    "week",
    "opponent_team",
    # passing
    "completions",
    "attempts",
    "passing_yards",
    "passing_tds",
    "interceptions",
    # sacks
    "sacks",
    "sack_yards",
    "sack_fumbles",
    "sack_fumbles_lost",
    # passing detail
    "passing_air_yards",
    "passing_yards_after_catch",
    "passing_first_downs",
    "passing_epa",
    "passing_2pt_conversions",
    "pacr",
    "dakota",
    # rushing
    "carries",
    "rushing_yards",
    "rushing_tds",
    "rushing_fumbles",
    "rushing_fumbles_lost",
    "rushing_first_downs",
    "rushing_epa",
    "rushing_2pt_conversions",
    # fantasy
    "fantasy_points",
]

In [None]:
# Rows with position 'QB' and season_type 'REG', restricted to the qb_cols
# columns. .copy() gives pwk_qb its own data, independent of pwk.
pwk_qb = pwk.query("position=='QB'").query("season_type=='REG'")[qb_cols].copy()
len(pwk_qb)

In [None]:
# Eyeball four random quarterback rows (unseeded, so they change per run).
pwk_qb.sample(4)

In [None]:
# NOTE(review): `week` is not read by any of the cells visible here (the week
# lookup below hard-codes 2) — confirm whether it is still needed.
week = 13

In [None]:
# Mean EPA for every (season, week, posteam) combination, highest first.
off_epa_df = (
    pbp_r_neu.groupby(["season", "week", "posteam"])
    .agg({"epa": "mean"})
    .sort_values(by="epa", ascending=False)
)
# Percentile rank of each row's mean EPA across ALL rows of the frame (every
# season and week together), expressed on a 0-10 scale. Scale first and round
# last: rounding before multiplying by 10 reintroduces binary floating-point
# noise, so values would not be clean one-decimal numbers.
off_epa_df["epa_percentile"] = (off_epa_df["epa"].rank(pct=True) * 10).round(1)

In [None]:
# Per-team rows for week 2 of the 2024 season (both index levels are dropped).
off_epa_df.xs((2024, 2), level=["season", "week"])

In [None]:
# Mean EPA per (season, posteam), highest first, labelled as EPA per rush.
(
    pbp_r_neu.groupby(["season", "posteam"])["epa"]
    .mean()
    .to_frame("epa_per_rush")
    .sort_values(by="epa_per_rush", ascending=False)
)

In [None]:
def success_rate_lambda(x):
    """Return 1 when ``x`` is strictly positive, otherwise 0.

    Any value for which ``x > 0`` is false maps to 0; that includes zero,
    negative numbers and NaN.

    Written as a ``def`` rather than a lambda bound to a name so the callable
    carries a real ``__name__`` and a docstring; the name and call signature
    are unchanged.
    """
    return 1 if x > 0 else 0

In [None]:
def calculate_epa_metrics(
    data: pd.DataFrame, team: Literal["posteam", "defteam"] = "posteam"
) -> pd.DataFrame:
    """Summarise EPA and success rate per team, with ranks and percentiles.

    Parameters
    ----------
    data
        Play-level frame. Must contain an ``epa`` column and the column named
        by ``team``. It is not modified.
    team
        Column to group by. With ``"posteam"`` a higher value is treated as
        better; with ``"defteam"`` a lower value is treated as better.

    Returns
    -------
    pd.DataFrame
        One row per team, indexed by ``team`` and sorted by ``epa_rank``.
        Columns are ``n`` followed by the remaining columns in alphabetical
        order:

        * ``n`` - number of rows in the group.
        * ``epa`` - mean EPA, rounded to 3 decimals.
        * ``success_rate`` - share of plays with ``epa > 0``, rounded to
          2 decimals. Plays whose EPA is missing are left out of this share,
          matching how the mean EPA already skips them.
        * ``<metric>_rank`` - integer rank, 1 = best. Ties share the lowest
          rank of the tied group.
        * ``<metric>_percentile`` - percentile rank on a 0-10 scale,
          10 = best, rounded to 1 decimal.
    """
    lower_is_better = team == "defteam"

    # Only these two columns are used, so copy just them rather than the whole
    # (potentially very wide) play-by-play frame.
    plays = data[[team, "epa"]].copy()
    epa = plays["epa"]
    # Vectorised success flag; rows without an EPA become NaN instead of 0 so
    # they drop out of the group mean rather than counting as failures.
    plays["success"] = epa.gt(0).astype(float).where(epa.notna())

    epa_df = plays.groupby(team).agg(
        n=("epa", "size"),
        epa=("epa", "mean"),
        success_rate=("success", "mean"),
    )

    # Ranks and percentiles are computed on the unrounded values.
    for col in ["epa", "success_rate"]:
        # method="min" yields whole-number ranks, so the int cast never
        # truncates a fractional average rank produced by ties.
        epa_df[f"{col}_rank"] = (
            epa_df[col].rank(method="min", ascending=lower_is_better).astype(int)
        )
        # Scale to 0-10 before rounding so the result is a clean 1-decimal
        # value (rounding first and multiplying after reintroduces float noise).
        epa_df[f"{col}_percentile"] = (
            epa_df[col].rank(ascending=not lower_is_better, pct=True) * 10
        ).round(1)

    epa_df["epa"] = epa_df["epa"].round(3)
    epa_df["success_rate"] = epa_df["success_rate"].round(2)

    ordered_cols = ["n", *sorted(c for c in epa_df.columns if c != "n")]
    # Stable sort keeps tied teams in a deterministic (group-key) order.
    return epa_df[ordered_cols].sort_values("epa_rank", kind="stable")


def dual_epa_metrics(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Build the offensive and defensive EPA summaries for the same plays.

    Calls ``calculate_epa_metrics`` twice on ``data``: once with its default
    grouping and once grouped by ``"defteam"``.

    Returns
    -------
    tuple of (offense summary, defense summary), in that order.
    """
    offense_summary = calculate_epa_metrics(data)
    defense_summary = calculate_epa_metrics(data, "defteam")
    return offense_summary, defense_summary

In [None]:
# Offensive and defensive EPA summaries computed from the season==2023 rows.
off_rush_epa_df, def_rush_epa_df = dual_epa_metrics(pbp_r_neu.query("season==2023"))

In [None]:
# Display the offensive summary table.
off_rush_epa_df

In [None]:
# Plot the offensive summary: the "success_rate" and "epa" columns are passed.
# NOTE(review): the only import of plot_team_scatter in this notebook is the
# commented-out line in the first cell, so this call fails with a NameError on
# a fresh kernel until that import is restored.
plot_team_scatter(off_rush_epa_df, "success_rate", "epa", alpha=0.9)

In [None]:
# Display the defensive summary table.
def_rush_epa_df

In [None]:
# Plot the defensive summary with flip_def=True: the "success_rate" and "epa"
# columns are passed.
# NOTE(review): the only import of plot_team_scatter in this notebook is the
# commented-out line in the first cell, so this call fails with a NameError on
# a fresh kernel until that import is restored.
plot_team_scatter(def_rush_epa_df, "success_rate", "epa", flip_def=True, alpha=0.9)