In [1]:
import pandas as pd

In [2]:
import sys
from pathlib import Path
# Make the repository root importable so the project-local `utils` module resolves.
# parents[3] is the directory four levels above the current working directory,
# so this cell depends on the directory the kernel was started from.
repo_root = Path.cwd().resolve().parents[3]
repo_root_entry = str(repo_root)
if repo_root_entry not in sys.path:
    # Guarded so that re-running this cell does not pile up duplicate entries.
    print(f"Adding {repo_root} to sys.path")
    sys.path.append(repo_root_entry)
import utils

Adding /home/mrmath/sports_betting_empire/sports_betting_empire to sys.path


In [3]:
# Load the "base_stats" table through the project helper.
# NOTE(review): presumably 2018 and 2025 are the first/last season to fetch and the
# data comes from S3 (going by the helper's name) — confirm in utils.
# Later cells read these columns from it: Player, Team, Date, Week, season, Att,
# Yds, Succ%, Pos. and is_starter.
base_stats = utils.rush_yard_stats_from_s3("base_stats", 2018, 2025)

In [4]:
def _rolling_rush_features(player_games):
    """Build the rolling pre-game feature table for one player.

    Parameters
    ----------
    player_games : pandas.DataFrame
        All rows for a single player, already sorted by 'Date'. Reads the
        'Date', 'Yds', 'Att', 'Succ%' and 'Pos.' columns.

    Returns
    -------
    pandas.DataFrame
        Same index as ``player_games``. Every feature is computed on values
        shifted by one game, so a row only reflects games played before it.
    """
    windows = (1, 3, 5, 10)

    yards = player_games['Yds']
    attempts = player_games['Att']

    # shift(1) drops the current game from every window (no target leakage).
    prior_yards = yards.shift(1)
    prior_attempts = attempts.shift(1)
    prior_success = player_games['Succ%'].shift(1)
    # A game with zero attempts would make Yds / Att +/-inf, which then poisons
    # the rolling mean/std; mask those games to NaN so the windows skip them.
    prior_ypc = (yards / attempts.where(attempts != 0)).shift(1)

    features = {'Date': pd.to_datetime(player_games['Date'])}

    # (column prefix, shifted series, whether a 5-vs-10 delta is emitted)
    mean_specs = (
        ('rush_yards', prior_yards, True),
        ('rush_attempts', prior_attempts, True),
        ('ypc', prior_ypc, True),
        ('success_rate', prior_success, False),
    )
    for prefix, prior, with_long_delta in mean_specs:
        means = {w: prior.rolling(w, min_periods=1).mean() for w in windows}
        for w in windows:
            features[f'{prefix}_{w}ma'] = means[w]
        # Short-minus-long deltas: positive means the recent trend is up.
        features[f'{prefix}_delta_3_5'] = means[3] - means[5]
        if with_long_delta:
            features[f'{prefix}_delta_5_10'] = means[5] - means[10]

    # Volatility: standard deviation over the previous five games (needs >= 2).
    features['rush_yards_vol_5'] = prior_yards.rolling(5, min_periods=2).std()
    features['ypc_vol_5'] = prior_ypc.rolling(5, min_periods=2).std()

    # Floor / ceiling of rushing yards over the previous 3 and 5 games.
    features['min_rush_yards_3ma'] = prior_yards.rolling(3, min_periods=1).min()
    features['min_rush_yards_5ma'] = prior_yards.rolling(5, min_periods=1).min()
    features['max_rush_yards_3ma'] = prior_yards.rolling(3, min_periods=1).max()
    features['max_rush_yards_5ma'] = prior_yards.rolling(5, min_periods=1).max()

    # Scalar: the position from the player's earliest row, broadcast to all rows.
    features['Pos.'] = player_games['Pos.'].iloc[0]

    return pd.DataFrame(features)


# Player name -> rolling feature table covering that player's whole history
# (all teams). Grouping by player directly replaces the former team-by-team
# loop, which re-filtered the full frame for every player on every team.
offense_rush_stats_LOOKUP = {}
for player_name, player_games in base_stats.sort_values(['Date']).groupby('Player', sort=False):
    offense_rush_stats_LOOKUP[player_name] = _rolling_rush_features(player_games)



In [5]:
# Convert Date to pandas datetimes in place; generate_train_df relies on the
# `.dt` accessor of this column (rush_df["Date"].dt.date).
base_stats['Date'] = pd.to_datetime(base_stats['Date'])

In [6]:
# Keep only rows whose position is 'RB'. This rebinds base_stats, so the
# unfiltered frame is no longer reachable under this name. In the cell order
# shown here, offense_rush_stats_LOOKUP was built before this filter and
# therefore still contains players of every position.
base_stats = base_stats[base_stats['Pos.'] == 'RB']

In [7]:
# Membership lookups keyed by (Player, Week, season):
#   starter_lookup_by_week_season -> the row's `is_starter` value, stored only when truthy
#   rusher_lookup_by_week_season  -> 1 for every player that has a row for that week
starter_lookup_by_week_season = {}
rusher_lookup_by_week_season = {}
base_stats = base_stats.sort_values(['Date'])
# Walk the four needed columns in lockstep instead of building a full row
# Series with .iloc[i] several times per iteration.
for player, week, season, is_starter in zip(
    base_stats['Player'],
    base_stats['Week'],
    base_stats['season'],
    base_stats['is_starter'],
):
    key = (player, week, season)
    if is_starter:
        starter_lookup_by_week_season[key] = is_starter
    rusher_lookup_by_week_season[key] = 1

In [11]:
def generate_train_df(rush_df, stats_lookup=None, active_lookup=None):
    """
    Build training dataset for RB workload prediction.

    Key modeling ideas:
    - Capture teammate competition within same game
    - Model recency-weighted injury impact of other RBs
    - Estimate how carry share changes when injured RBs return
    - Use rolling moving averages (1/3/5 windows) to capture workload trends

    Parameters
    ----------
    rush_df : pandas.DataFrame
        One row per player-game. Columns read here: Player, Team, Date
        (datetime64), Week, season, Att, Yds and is_starter.
    stats_lookup : dict, optional
        Maps a player name to a DataFrame of rolling features holding a
        datetime "Date" column and rush_attempts_{1,3,5}ma columns. Defaults
        to the notebook-level ``offense_rush_stats_LOOKUP``.
    active_lookup : container, optional
        Supports ``(Player, Week, season) in active_lookup`` for players who
        appeared that week. Defaults to the notebook-level
        ``rusher_lookup_by_week_season``.

    Returns
    -------
    pandas.DataFrame
        One row per input row whose player has a stats row dated on or before
        the game. Contains Player, Team, Date, Att, Rush_yards, Starter, every
        column of the player's stats row, others_rush_attempts_*ma,
        pct_of_carries_*ma, others_been_injured_*ma and
        carries_before_injury_*ma.
    """

    # Fall back to the lookups built in earlier notebook cells.
    if stats_lookup is None:
        stats_lookup = offense_rush_stats_LOOKUP
    if active_lookup is None:
        active_lookup = rusher_lookup_by_week_season

    windows = (1, 3, 5)

    rush_df = rush_df.sort_values("Date").copy()
    rush_df["game_date"] = rush_df["Date"].dt.date

    rows = []

    for row in rush_df.itertuples(index=False):

        player_key = row.Player
        team = row.Team
        game_date = row.game_date
        week = row.Week
        season = row.season

        # -------------------------------------------------
        # PLAYER STATS UP TO CURRENT GAME
        # -------------------------------------------------

        if player_key not in stats_lookup:
            continue

        player_full_history = stats_lookup[player_key]
        ps = player_full_history[player_full_history["Date"].dt.date <= game_date]

        if ps.empty:
            continue

        # Latest stats row dated on or before this game.
        player_stats_on_date = ps.iloc[-1]

        # -------------------------------------------------
        # OTHER RUSHERS ACTIVE IN THIS GAME
        # -------------------------------------------------

        same_game_teammates = rush_df[
            (rush_df["Team"] == team) &
            (rush_df["game_date"] == game_date) &
            (rush_df["Player"] != player_key)
        ]

        # -------------------------------------------------
        # PREVIOUS TEAMMATES THIS SEASON (for injury logic)
        # -------------------------------------------------

        # rush_df is date-sorted, so keep="last" leaves each teammate's most
        # recent appearance before this game.
        prev_teammates = rush_df[
            (rush_df["Team"] == team) &
            (rush_df["game_date"] < game_date) &
            (rush_df["season"] == season) &
            (rush_df["Player"] != player_key)
        ].drop_duplicates(["Player", "season"], keep="last")

        # Accumulators keyed by window / recency bucket.
        others_been_injured = {w: 0 for w in windows}
        carries_before_injury = {w: 0 for w in windows}

        for teammate in prev_teammates.itertuples(index=False):

            key = teammate.Player
            last_active_week = teammate.Week

            # Skip if active this week
            if (key, week, season) in active_lookup:
                continue

            week_diff = week - last_active_week

            # Absences longer than five weeks are ignored.
            if week_diff > 5:
                continue

            stats_df = stats_lookup.get(key)
            if stats_df is None:
                continue

            stats_df = stats_df[stats_df["Date"].dt.date < game_date]
            if stats_df.empty:
                continue

            last_val = stats_df.iloc[-1]["rush_attempts_5ma"]
            if pd.isna(last_val):
                continue

            # Recency-weighted injury impact: the absent teammate's 5-game
            # attempt average goes into the bucket matching weeks missed.
            if week_diff < 2:
                others_been_injured[1] += last_val
            elif 2 <= week_diff <= 3:
                others_been_injured[3] += last_val
            elif 4 <= week_diff <= 5:
                others_been_injured[5] += last_val

            # -------------------------------------------------
            # CARRIES BEFORE INJURY (GUARD INCLUDED)
            # -------------------------------------------------

            if len(stats_df) < 2:
                continue  # need a second-to-last row to compare against

            pre_injury_row = stats_df.iloc[-2]
            pre_injury_date = pre_injury_row["Date"]

            # The current player's stats row on that same date, if any.
            player_hist = player_full_history[
                player_full_history["Date"] == pre_injury_date
            ]

            if player_hist.empty:
                continue

            player_pre = player_hist.iloc[-1]

            for window in windows:

                col = f"rush_attempts_{window}ma"

                player_val = player_pre.get(col, 0)
                teammate_val = pre_injury_row.get(col, 0)

                # A first-game row has NaN averages. `denom <= 0` is False for
                # NaN, so without this guard one NaN share would poison the
                # running total for the rest of the loop.
                if pd.isna(player_val) or pd.isna(teammate_val):
                    continue

                denom = player_val + teammate_val
                if denom <= 0:
                    continue

                carries_before_injury[window] += player_val / denom

        # -------------------------------------------------
        # ACTIVE TEAMMATE MOVING AVERAGES
        # -------------------------------------------------

        other_stats = {f"others_rush_attempts_{w}ma": 0 for w in windows}

        for teammate in same_game_teammates.itertuples(index=False):

            key = teammate.Player
            stats_df = stats_lookup.get(key)
            if stats_df is None:
                continue

            stats_df = stats_df[stats_df["Date"].dt.date <= game_date]
            if stats_df.empty:
                continue

            latest = stats_df.iloc[-1]

            for window in windows:
                val = latest[f"rush_attempts_{window}ma"]
                if pd.notna(val):
                    other_stats[f"others_rush_attempts_{window}ma"] += val

        # -------------------------------------------------
        # CARRY SHARE CALCULATIONS
        # -------------------------------------------------

        pct = {}

        for window in windows:
            player_val = player_stats_on_date[f"rush_attempts_{window}ma"]
            other_val = other_stats[f"others_rush_attempts_{window}ma"]

            denom = player_val + other_val
            # A NaN denominator (no prior games) fails `> 0` and yields 0.
            pct[f"pct_of_carries_{window}ma"] = (
                player_val / denom if denom > 0 else 0
            )

        # -------------------------------------------------
        # BUILD TRAINING ROW
        # -------------------------------------------------

        data_row = {
            "Player": player_key,
            "Team": team,
            "Date": game_date,
            "Att": row.Att,
            "Rush_yards": row.Yds,
            "Starter": row.is_starter,
            # Unpacked after "Date" above, so a "Date" key in the stats row
            # replaces game_date in the output.
            **player_stats_on_date.to_dict(),
            **other_stats,
            **pct,
            "others_been_injured_1ma": others_been_injured[1],
            "others_been_injured_3ma": others_been_injured[3],
            "others_been_injured_5ma": others_been_injured[5],
            "carries_before_injury_1ma": carries_before_injury[1],
            "carries_before_injury_3ma": carries_before_injury[3],
            "carries_before_injury_5ma": carries_before_injury[5],
        }

        rows.append(data_row)

    return pd.DataFrame(rows)


In [12]:
# Build the training rows from base_stats (RB-only and date-sorted in the cell
# order shown here). generate_train_df also reads the notebook-level
# offense_rush_stats_LOOKUP and rusher_lookup_by_week_season, so the cells that
# build them must have run first.
train_df = generate_train_df(base_stats)

In [13]:
# Write the features to a CSV at a path relative to the current working
# directory; index=False leaves the DataFrame index out of the file.
train_df.to_csv('base_stats_feature_engineering.csv', index=False)