In [11]:
import pandas as pd

In [12]:
import sys
from pathlib import Path
repo_root = Path.cwd().resolve().parents[3]
print(f"Adding {repo_root} to sys.path")
sys.path.append(str(repo_root))
import utils

Adding /home/mrmath/sports_betting_empire/sports_betting_empire to sys.path


In [13]:
base_stats = utils.rush_yard_stats_from_s3("base_stats", 2018, 2025)

In [14]:
offense_rush_stats_LOOKUP = {}
for k, v in base_stats.sort_values(['Date']).groupby(['Team']):
    for i in v['Player'].unique():
        player_data = base_stats[base_stats['Player'] == i].sort_values(['Date'])

        # existing rolling means (already great)
        rush_yards_1ma = player_data['Yds'].shift(1).rolling(1, min_periods=1).mean()
        rush_yards_3ma = player_data['Yds'].shift(1).rolling(3, min_periods=1).mean()
        rush_yards_5ma = player_data['Yds'].shift(1).rolling(5, min_periods=1).mean()
        rush_yards_10ma = player_data['Yds'].shift(1).rolling(10, min_periods=1).mean()


        rush_attempts_1ma = player_data['Att'].shift(1).rolling(1, min_periods=1).mean()
        rush_attempts_3ma = player_data['Att'].shift(1).rolling(3, min_periods=1).mean()
        rush_attempts_5ma = player_data['Att'].shift(1).rolling(5, min_periods=1).mean()
        rush_attempts_10ma = player_data['Att'].shift(1).rolling(10, min_periods=1).mean()

        ypc_1ma = (player_data['Yds'] / player_data['Att']).shift(1).rolling(1, min_periods=1).mean()
        ypc_3ma = (player_data['Yds'] / player_data['Att']).shift(1).rolling(3, min_periods=1).mean()
        ypc_5ma = (player_data['Yds'] / player_data['Att']).shift(1).rolling(5, min_periods=1).mean()
        ypc_10ma = (player_data['Yds'] / player_data['Att']).shift(1).rolling(10, min_periods=1).mean()

        success_rate_1ma = player_data['Succ%'].shift(1).rolling(1, min_periods=1).mean()
        success_rate_3ma = player_data['Succ%'].shift(1).rolling(3, min_periods=1).mean()
        success_rate_5ma = player_data['Succ%'].shift(1).rolling(5, min_periods=1).mean()
        success_rate_10ma = player_data['Succ%'].shift(1).rolling(10, min_periods=1).mean()



        base_player_stats_ma = {
            'Date': pd.to_datetime(player_data['Date']),
            'rush_yards_1ma': rush_yards_1ma,
            'rush_yards_3ma': rush_yards_3ma,
            'rush_yards_5ma': rush_yards_5ma,
            'rush_yards_10ma': rush_yards_10ma,

            'rush_attempts_1ma': rush_attempts_1ma,
            'rush_attempts_3ma': rush_attempts_3ma,
            'rush_attempts_5ma': rush_attempts_5ma,
            'rush_attempts_10ma': rush_attempts_10ma,

            'ypc_1ma': ypc_1ma,
            'ypc_3ma': ypc_3ma,
            'ypc_5ma': ypc_5ma,
            'ypc_10ma': ypc_10ma,

            'success_rate_1ma': success_rate_1ma,
            'success_rate_3ma': success_rate_3ma,
            'success_rate_5ma': success_rate_5ma,
            'success_rate_10ma': success_rate_10ma,


            'Pos.': player_data['Pos.'].iloc[0],
        }

        offense_rush_stats_LOOKUP[i] = pd.DataFrame(base_player_stats_ma)



In [15]:
# Ensure date is datetime
import numpy as np
base_stats['Date'] = pd.to_datetime(base_stats['Date'])

# Filter RBs only once
rb_stats = base_stats[base_stats['Pos.'] == 'RB'].copy()

# Compute RB rush yards allowed and attempts allowed per team/date
team_rb_summary = (
    rb_stats
    .groupby(['Team', 'Date'], as_index=False)
    .agg(
        RB_rush_yards_allowed=('Yds', 'sum'),
        RB_rush_attempts_allowed=('Att', 'sum'),
        Opp=('Opp', 'first')
    )
)

# Compute RB YPC allowed safely
team_rb_summary['RB_ypc_allowed'] = team_rb_summary['RB_rush_yards_allowed'] / team_rb_summary['RB_rush_attempts_allowed'].replace(0, np.nan)

# Compute strength_of_offense by joining with offense lookup
rb_stats_unique = rb_stats[['Player', 'Date']].drop_duplicates()
strength_map = {}
for k, v in base_stats.groupby(['Team', "Date"]):
    offense_strength = 0
    for player in v['Player'].unique():
        if player in offense_rush_stats_LOOKUP:
            player_offense_stats = offense_rush_stats_LOOKUP[player]
            player_stats_on_date = player_offense_stats[player_offense_stats['Date'] == k[1]]
            if not player_stats_on_date.empty:
                offense_strength += player_stats_on_date['rush_yards_5ma'].values[0]
    strength_map[k] = offense_strength
team_rb_summary['strength_of_offense'] = team_rb_summary.apply(lambda row: strength_map.get((row['Team'], row['Date']), 0), axis=1)

In [16]:
team_rb_summary

Unnamed: 0,Team,Date,RB_rush_yards_allowed,RB_rush_attempts_allowed,Opp,RB_ypc_allowed,strength_of_offense
0,ARI,2018-09-09,61,13,WAS,4.692308,
1,ARI,2018-09-16,54,15,LAR,3.600000,
2,ARI,2018-09-23,41,17,CHI,2.411765,
3,ARI,2018-09-30,72,25,SEA,2.880000,64.0
4,ARI,2018-10-07,54,19,SFO,2.842105,69.5
...,...,...,...,...,...,...,...
4249,WAS,2025-12-07,84,17,MIN,4.941176,152.8
4250,WAS,2025-12-14,102,27,NYG,3.777778,86.0
4251,WAS,2025-12-20,91,24,PHI,3.791667,141.6
4252,WAS,2025-12-25,103,12,DAL,8.583333,64.2


In [17]:
defense_rush_stats_LOOKUP = {}
for k, v in team_rb_summary.sort_values(['Date']).groupby('Opp'):
    v = v.sort_values(['Date']).reset_index(drop=True)
    RB_rush_yards_allowed_1ma = v['RB_rush_yards_allowed'].shift(1).rolling(1, min_periods=1).mean()
    RB_rush_yards_allowed_3ma = v['RB_rush_yards_allowed'].shift(1).rolling(3, min_periods=1).mean()
    RB_rush_yards_allowed_5ma = v['RB_rush_yards_allowed'].shift(1).rolling(5, min_periods=1).mean()

    RB_ypc_allowed_1ma = v['RB_ypc_allowed'].shift(1).rolling(1, min_periods=1).mean()
    RB_ypc_allowed_3ma = v['RB_ypc_allowed'].shift(1).rolling(3, min_periods=1).mean()
    RB_ypc_allowed_5ma = v['RB_ypc_allowed'].shift(1).rolling(5, min_periods=1).mean()

    min_rush_yards_allowed_3ma = v['RB_rush_yards_allowed'].shift(1).rolling(3, min_periods=1).min()
    min_rush_yards_allowed_5ma = v['RB_rush_yards_allowed'].shift(1).rolling(5, min_periods=1).min()
    max_rush_yards_allowed_3ma = v['RB_rush_yards_allowed'].shift(1).rolling(3, min_periods=1).max()
    max_rush_yards_allowed_5ma = v['RB_rush_yards_allowed'].shift(1).rolling(5, min_periods=1).max()
    
    strength_of_offense_1ma = v['strength_of_offense'].shift(1).rolling(1, min_periods=1).mean()
    strength_of_offense_3ma = v['strength_of_offense'].shift(1).rolling(3, min_periods=1).mean()
    strength_of_offense_5ma = v['strength_of_offense'].shift(1).rolling(5, min_periods=1).mean()

    defense_performance_relative_1ma = RB_rush_yards_allowed_1ma - strength_of_offense_1ma
    defense_performance_relative_3ma = RB_rush_yards_allowed_3ma - strength_of_offense_3ma
    defense_performance_relative_5ma = RB_rush_yards_allowed_5ma - strength_of_offense_5ma

    RB_rush_yards_allowed_delta_3_5 = RB_rush_yards_allowed_3ma - RB_rush_yards_allowed_5ma
    RB_rush_yards_allowed_delta_1_3 = RB_rush_yards_allowed_1ma - RB_rush_yards_allowed_3ma

    ypc_allowed_delta_3_5 = RB_ypc_allowed_3ma - RB_ypc_allowed_5ma
    ypc_allowed_delta_1_3 = RB_ypc_allowed_1ma - RB_ypc_allowed_3ma

    defense_relative_delta_3_5 = defense_performance_relative_3ma - defense_performance_relative_5ma
    defense_relative_delta_1_3 = defense_performance_relative_1ma - defense_performance_relative_3ma

    RB_rush_yards_allowed_vol_5 = v['RB_rush_yards_allowed'].shift(1).rolling(5, min_periods=1).std()
    ypc_allowed_vol_5 = v['RB_ypc_allowed'].shift(1).rolling(5, min_periods=1).std()

    defense_rush_stats_LOOKUP[k] = pd.DataFrame({
        'Date': pd.to_datetime(v['Date']),
        'RB_rush_yards_allowed_1ma': RB_rush_yards_allowed_1ma,
        'RB_rush_yards_allowed_3ma': RB_rush_yards_allowed_3ma,
        'RB_rush_yards_allowed_5ma': RB_rush_yards_allowed_5ma,
        'RB_ypc_allowed_1ma': RB_ypc_allowed_1ma,
        'RB_ypc_allowed_3ma': RB_ypc_allowed_3ma,
        'RB_ypc_allowed_5ma': RB_ypc_allowed_5ma,
        'min_rush_yards_allowed_3ma': min_rush_yards_allowed_3ma,
        'min_rush_yards_allowed_5ma': min_rush_yards_allowed_5ma,
        'max_rush_yards_allowed_3ma': max_rush_yards_allowed_3ma,
        'max_rush_yards_allowed_5ma': max_rush_yards_allowed_5ma,
        'defense_performance_relative_1ma': defense_performance_relative_1ma,
        'defense_performance_relative_3ma': defense_performance_relative_3ma,
        'defense_performance_relative_5ma': defense_performance_relative_5ma,
        'RB_rush_yards_allowed_delta_3_5': RB_rush_yards_allowed_delta_3_5,
        'RB_rush_yards_allowed_delta_1_3': RB_rush_yards_allowed_delta_1_3,
        'ypc_allowed_delta_3_5': ypc_allowed_delta_3_5,
        'ypc_allowed_delta_1_3': ypc_allowed_delta_1_3,
        'defense_relative_delta_3_5': defense_relative_delta_3_5,
        'defense_relative_delta_1_3': defense_relative_delta_1_3,
        'RB_rush_yards_allowed_vol_5': RB_rush_yards_allowed_vol_5,
        'ypc_allowed_vol_5': ypc_allowed_vol_5,
    })  

In [18]:
defense_train = []
for k, v in base_stats.groupby(['Team', 'Date']):
    team = k[0]
    date = k[1]
    opp = v['Opp'].iloc[0]

    
    defense_features = defense_rush_stats_LOOKUP.get(opp, pd.DataFrame())
    defense_row = defense_features[defense_features['Date'] == date]

    if not defense_row.empty:
        combined_row = defense_row.iloc[0].to_dict()
        combined_row['Team'] = team
        combined_row['Date'] = date
        combined_row['Opp'] = opp
        defense_train.append(combined_row)    

In [19]:
defense_train_df = pd.DataFrame(defense_train)

In [20]:
defense_train_df.to_csv('defense_stats_feature_engineering.csv', index=False)