In [1]:
import pandas as pd

In [2]:
import sys
from pathlib import Path
repo_root = Path.cwd().resolve().parents[3]
print(f"Adding {repo_root} to sys.path")
sys.path.append(str(repo_root))
import utils

Adding /home/mrmath/sports_betting_empire/sports_betting_empire to sys.path


In [3]:
ybc_yac = utils.rush_yard_stats_from_s3("ybc_yac", 2018, 2025)

In [4]:
ybc_yac = ybc_yac[ybc_yac['Pos.'] == 'RB']

In [5]:
def engineer_ybc_trends(
    df: pd.DataFrame,
    entity_col: str,         # "Player", "Team", or "Opp"
    prefix: str = "",        # "", "team_", "opp_"
    windows=(3, 5),
):
    """
    Build leak-free rolling YBC/YAC trend features.

    - entity_col: grouping key (Player / Team / Opp)
    - prefix: feature prefix (e.g., 'opp_')
    """

    df = df.copy()

    # -------------------------
    # Numeric casting
    # -------------------------
    num_cols = ["YBC/Att", "YAC/Att", "Att/Br"]
    for c in num_cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    if "Date" in df.columns:
        df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

    # -------------------------
    # Base per-carry metrics
    # -------------------------
    df["ybc_per_att"] = df["YBC/Att"]
    df["yac_per_att"] = df["YAC/Att"]
    df["brk_tkl_per_att"] = df["Att/Br"]
    
    # -------------------------
    # Sorting
    # -------------------------
    sort_keys = [entity_col]
    if "Date" in df.columns:
        sort_keys.append("Date")

    df = df.sort_values(sort_keys)

    g = df.groupby(entity_col, group_keys=False)

    base_cols = ["ybc_per_att", "yac_per_att", "brk_tkl_per_att"]

    # -------------------------
    # Rolling trends (past-only)
    # -------------------------
    for col in base_cols:
        shifted = g[col].shift(1)
        for w in windows:
            df[f"{prefix}{col}_{w}_g_ma"] = (
                shifted.rolling(w, min_periods=1).mean()
            )

    # -------------------------
    # Final selection
    # -------------------------
    key_cols = [
        c for c in
        list(set([entity_col, "Season", "Week", "Date", "Team", "Opp"]))
        if c in df.columns
    ]

    feature_cols = [
        c for c in df.columns
        if c.startswith(prefix) and "_g_ma" in c
    ]

    return df[key_cols + feature_cols]


In [6]:
ybc_yac_player_level_df = engineer_ybc_trends(ybc_yac, entity_col="Player", prefix="player_")

opp_def = (
    ybc_yac
    .groupby(["Opp", "Date"], as_index=False)
    .agg({
        "YBC/Att": "mean",
        "YAC/Att": "mean",
        "Att/Br": "mean",
    })
)

opp_ybc_features = engineer_ybc_trends(
    opp_def,
    entity_col="Opp",
    prefix="opp_",
)

team_def = (
    ybc_yac
    .groupby(["Team", "Date"], as_index=False)
    .agg({
        "YBC/Att": "mean",
        "YAC/Att": "mean",
        "Att/Br": "mean",
    })
)
team_ybc_features = engineer_ybc_trends(
    team_def,
    entity_col="Team",
    prefix="team_",
)


In [7]:
opp_ybc_features

Unnamed: 0,Date,Opp,opp_ybc_per_att_3_g_ma,opp_ybc_per_att_5_g_ma,opp_yac_per_att_3_g_ma,opp_yac_per_att_5_g_ma,opp_brk_tkl_per_att_3_g_ma,opp_brk_tkl_per_att_5_g_ma
0,2018-09-09,ARI,,,,,,
1,2018-09-16,ARI,3.300000,3.300000,3.033333,3.033333,9.00,9.0000
2,2018-09-23,ARI,2.175000,2.175000,2.516667,2.516667,12.25,12.2500
3,2018-09-30,ARI,3.066667,3.066667,2.244444,2.244444,12.25,12.2500
4,2018-10-07,ARI,3.016667,3.087500,1.900000,2.183333,11.25,10.5000
...,...,...,...,...,...,...,...,...
4249,2025-12-07,WAS,2.827778,2.510000,1.933333,1.820000,,11.0000
4250,2025-12-14,WAS,2.116667,2.283333,2.311111,2.253333,7.00,9.0000
4251,2025-12-20,WAS,2.000000,2.486667,1.994444,2.096667,7.25,7.2500
4252,2025-12-25,WAS,2.866667,2.650000,2.611111,2.286667,5.75,5.7500


In [8]:
ybc_yac_player_level_df['team_key'] = ybc_yac_player_level_df['Team'] + "_" + ybc_yac_player_level_df['Date'].dt.strftime('%Y-%m-%d')
ybc_yac_player_level_df['opp_key'] = ybc_yac_player_level_df['Opp'] + "_" + ybc_yac_player_level_df['Date'].dt.strftime('%Y-%m-%d')
opp_ybc_features['opp_key'] = opp_ybc_features['Opp'] + "_" + opp_ybc_features['Date'].dt.strftime('%Y-%m-%d')
team_ybc_features['team_key'] = team_ybc_features['Team'] + "_" + team_ybc_features['Date'].dt.strftime('%Y-%m-%d')

In [9]:
merged_dfs = pd.merge(
    ybc_yac_player_level_df,
    opp_ybc_features[["opp_key"] + [c for c in opp_ybc_features.columns if c.startswith("opp_") and "_g_ma" in c]],
    on="opp_key",
    how="left",
)
merged_dfs = pd.merge(
    merged_dfs,
    team_ybc_features[["team_key"] + [c for c in team_ybc_features.columns if c.startswith("team_") and "_g_ma" in c]],
    on="team_key",
    how="left",
)
final_df = merged_dfs.drop(columns=["team_key", "opp_key"])
final_df

Unnamed: 0,Date,Player,Week,Team,Opp,player_ybc_per_att_3_g_ma,player_ybc_per_att_5_g_ma,player_yac_per_att_3_g_ma,player_yac_per_att_5_g_ma,player_brk_tkl_per_att_3_g_ma,...,opp_yac_per_att_3_g_ma,opp_yac_per_att_5_g_ma,opp_brk_tkl_per_att_3_g_ma,opp_brk_tkl_per_att_5_g_ma,team_ybc_per_att_3_g_ma,team_ybc_per_att_5_g_ma,team_yac_per_att_3_g_ma,team_yac_per_att_5_g_ma,team_brk_tkl_per_att_3_g_ma,team_brk_tkl_per_att_5_g_ma
0,2020-09-13,AJ Dillon,1,GNB,MIN,,,,,,...,2.388889,1.793333,4.800000,12.900000,2.150000,2.050000,3.122222,2.873333,4.375,6.812500
1,2020-09-20,AJ Dillon,2,GNB,DET,4.500000,4.500000,2.500000,2.50,,...,2.722222,2.206667,5.016667,6.137500,3.066667,2.740000,3.255556,2.943333,5.050,6.062500
2,2020-09-27,AJ Dillon,3,GNB,NOR,2.650000,2.650000,2.550000,2.55,2.5,...,0.866667,1.180000,7.900000,6.575000,3.211111,2.796667,3.377778,3.086667,4.200,4.687500
3,2020-10-05,AJ Dillon,4,GNB,ATL,2.650000,2.650000,2.550000,2.55,2.5,...,2.038889,2.306667,8.000000,8.000000,3.033333,2.780000,2.788889,3.166667,5.000,4.200000
4,2020-10-18,AJ Dillon,6,GNB,TAM,1.900000,2.766667,1.300000,1.70,2.5,...,1.600000,1.520000,10.000000,11.750000,2.233333,2.640000,2.366667,2.693333,2.500,3.775000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13474,2025-11-16,Zonovan Knight,11,ARI,SFO,2.133333,2.300000,1.166667,0.86,7.0,...,1.633333,1.710000,13.000000,15.000000,2.033333,3.646667,2.283333,1.650000,6.750,7.125000
13475,2025-11-23,Zonovan Knight,12,ARI,JAX,2.233333,2.300000,1.300000,1.26,7.0,...,2.305556,1.913333,11.250000,11.625000,2.255556,1.906667,2.472222,1.836667,6.750,6.000000
13476,2025-11-30,Zonovan Knight,13,ARI,TAM,1.533333,1.740000,1.400000,1.44,5.0,...,0.994444,0.826667,11.250000,10.333333,1.705556,1.770000,2.705556,2.126667,4.500,6.750000
13477,2025-12-07,Zonovan Knight,14,ARI,LAR,1.066667,1.420000,2.800000,2.06,5.5,...,1.105556,1.263333,13.750000,8.125000,1.300000,1.793333,2.377778,2.623333,5.500,6.333333


In [12]:
final_df.to_csv("ybc_yac_feature_engineering.csv", index=False)