In [1]:
import pandas as pd
import numpy as np

In [2]:
import sys
from pathlib import Path
repo_root = Path.cwd().resolve().parents[3]
print(f"Adding {repo_root} to sys.path")
sys.path.append(str(repo_root))
import utils

Adding /home/mrmath/sports_betting_empire/sports_betting_empire to sys.path


In [3]:
base_stats_df = pd.read_csv('/home/mrmath/sports_betting_empire/sports_betting_empire/americanfootball_nfl/player_rush_yards/modeling/feature_engineering/base_stats_feature_engineering.csv')

defense_stats_df = pd.read_csv('/home/mrmath/sports_betting_empire/sports_betting_empire/americanfootball_nfl/player_rush_yards/modeling/feature_engineering/defense_stats_feature_engineering.csv')

pbp_df = pd.read_csv('/home/mrmath/sports_betting_empire/sports_betting_empire/americanfootball_nfl/player_rush_yards/modeling/feature_engineering/play_by_play_feature_engineering.csv')

ybc_yac_df = pd.read_csv('/home/mrmath/sports_betting_empire/sports_betting_empire/americanfootball_nfl/player_rush_yards/modeling/feature_engineering/ybc_yac_feature_engineering.csv')

In [4]:
spread_point_diff_df = pd.read_csv('/home/mrmath/sports_betting_empire/sports_betting_empire/americanfootball_nfl/player_rush_yards/modeling/feature_engineering/point_diff_spread_train.csv')
spread_point_diff_df['team_key'] = spread_point_diff_df['team'] + "_" + spread_point_diff_df['date'].astype(str)

In [5]:
import re

def clean_player_name(name: str) -> str:
    """
    Remove generational suffixes from player names.

    Why this matters:
    - Player name keys must be consistent across datasets
    - Some sources include suffixes (e.g., "Jr.", "III")
    - Others omit them
    - Removing them prevents join mismatches and duplicate identities

    Handles:
    - Jr, Jr.
    - Sr, Sr.
    - II, III, IV, V, VI
    - Case-insensitive
    - Extra whitespace
    """

    if not isinstance(name, str):
        return name

    # Normalize whitespace
    name = name.strip()

    # Regex to remove suffix at end of string
    # \b ensures we only match whole suffix tokens
    suffix_pattern = r"\b(JR|SR|II|III|IV|V|VI)\.?$"

    # Remove suffix (case-insensitive)
    cleaned = re.sub(suffix_pattern, "", name, flags=re.IGNORECASE)

    # Remove any leftover trailing spaces
    return cleaned.strip()


In [6]:
base_stats_df['clean_player_name'] = base_stats_df['Player'].apply(clean_player_name)
pbp_df['clean_player_name'] = pbp_df['player'].apply(clean_player_name)
ybc_yac_df['clean_player_name'] = ybc_yac_df['Player'].apply(clean_player_name)

In [7]:
base_stats_df['player_key'] = base_stats_df['clean_player_name'] + "_" + base_stats_df['Date'].astype(str)
pbp_df['player_key'] = pbp_df['clean_player_name'] + "_" + pbp_df['Date'].astype(str)
ybc_yac_df['player_key'] = ybc_yac_df['clean_player_name'] + "_" + ybc_yac_df['Date'].astype(str)

In [8]:
merged_df = pd.merge(base_stats_df, pbp_df, on='player_key', how='left', suffixes=('_base', '_pbp'))
merged_df = pd.merge(merged_df, ybc_yac_df, on='player_key', how='left', suffixes=('', '_ybc_yac'))

In [9]:
merged_df['team_key'] = merged_df['Team'] + "_" + merged_df['Date'].astype(str)

In [10]:
merged_df = pd.merge(merged_df, spread_point_diff_df, on='team_key', how='left', suffixes=('', '_spread_point_diff'))

In [11]:
defense_stats_df['team_key'] = defense_stats_df['Team'] + "_" + defense_stats_df['Date'].astype(str)
final_df = pd.merge(merged_df, defense_stats_df, on='team_key', how='left', suffixes=('', '_defense'))

In [12]:
spread_point_diff_df

Unnamed: 0,spread,team,date,point_diff_3_ma,point_diff_5_ma,point_diff_3_sum,point_diff_5_sum,point_scored_3_ma,point_scored_5_ma,points_allowed_3_ma,points_allowed_5_ma,opp_point_diff_3_ma,opp_point_diff_5_ma,opp_point_diff_3_sum,opp_point_diff_5_sum,opp_point_scored_3_ma,opp_point_scored_5_ma,opp_points_allowed_3_ma,opp_points_allowed_5_ma,team_key
0,-1.0,PHI,2018-09-06,,,,,,,,,,,,,,,,,PHI_2018-09-06
1,1.0,ATL,2018-09-06,,,,,,,,,,,,,,,,,ATL_2018-09-06
2,-1.0,CIN,2018-09-09,,,,,,,,,,,,,,,,,CIN_2018-09-09
3,-1.0,TEN,2018-09-09,,,,,,,,,,,,,,,,,TEN_2018-09-09
4,-3.5,LAC,2018-09-09,,,,,,,,,,,,,,,,,LAC_2018-09-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4249,8.0,NYJ,2026-01-04,-27.666667,-20.8,-83.0,-104.0,12.000000,14.6,39.666667,35.4,2.000000,6.0,6.0,30.0,23.333333,27.0,21.333333,21.0,NYJ_2026-01-04
4250,10.5,GNB,2026-01-04,-10.333333,-3.4,-31.0,-17.0,22.000000,25.0,32.333333,28.4,8.000000,5.8,24.0,29.0,24.333333,20.8,16.333333,15.0,GNB_2026-01-04
4251,8.0,CLE,2026-01-04,-8.000000,-8.8,-24.0,-44.0,12.000000,14.6,20.000000,23.4,7.666667,7.2,23.0,36.0,27.333333,29.6,19.666667,22.4,CLE_2026-01-04
4252,14.5,LAC,2026-01-04,5.333333,7.2,16.0,36.0,22.000000,23.8,16.666667,16.6,0.333333,1.8,1.0,9.0,24.666667,25.0,24.333333,23.2,LAC_2026-01-04


In [13]:
spread_point_diff_df['date'].min()

'2018-09-06'

In [14]:
train_cols = [col for col in final_df.columns if 'delta_3_5' in col or col.endswith('3ma') or col.endswith('5ma') or col.endswith('g_ma')] + ['Starter']
train_cols += ['others_been_injured_1ma']
train_cols += ['carries_before_injury_1ma']
train_cols += ['spread']
train_cols += ['point_diff_3_ma', 'point_diff_5_ma',
       'point_diff_3_sum', 'point_diff_5_sum', 'point_scored_3_ma',
       'point_scored_5_ma', 'points_allowed_3_ma', 'points_allowed_5_ma',
       'opp_point_diff_3_ma', 'opp_point_diff_5_ma', 'opp_point_diff_3_sum',
       'opp_point_diff_5_sum', 'opp_point_scored_3_ma',
       'opp_point_scored_5_ma', 'opp_points_allowed_3_ma',
       'opp_points_allowed_5_ma']

In [15]:
final_df['Date'] = pd.to_datetime(final_df['Date'])
final_df['season'] = final_df['Date'].apply(lambda x: x.year if x.month >= 3 else x.year - 1)

In [16]:
final_df[train_cols + ['Rush_yards']].corr()['Rush_yards'].sort_values(ascending=False)

Rush_yards                  1.000000
pct_of_carries_3ma          0.649969
pct_of_carries_5ma          0.647841
rush_attempts_3ma           0.637186
rush_attempts_5ma           0.637117
                              ...   
opp_point_diff_5_ma        -0.051169
rushes_one_to_two_5ma      -0.054466
spread                     -0.068975
others_rush_attempts_5ma   -0.501672
others_rush_attempts_3ma   -0.503975
Name: Rush_yards, Length: 166, dtype: float64

In [17]:
import xgboost as xgb
from sklearn.metrics import r2_score

reg_model = xgb.XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
all_preds = []
for season in range(2019, 2024):
    train_data = final_df[final_df['season'] < season]
    test_data = final_df[final_df['season'] == season]

    train_data = train_data[(train_data['rush_attempts_3ma'] >= 1) | (train_data['Starter'] == 1)]
    test_data = test_data[(test_data['rush_attempts_3ma'] >= 1) | (test_data['Starter'] == 1)]
    X_train = train_data[train_cols]
    y_train = train_data['Rush_yards']
    X_test = test_data[train_cols]
    y_test = test_data['Rush_yards']

    reg_model.fit(X_train, y_train)
    predictions = reg_model.predict(X_test)
    X_test = X_test.copy()

    X_test.loc[:, 'predicted_rush_yards'] = predictions
    X_test = X_test.copy()
    X_test.loc[:, 'actual_rush_yards'] = y_test
    all_preds.append(X_test)


    print(
        f"Season {season} - "
        f"MAE: {np.mean(np.abs(predictions - y_test)):.2f}, "
        f"RMSE: {np.sqrt(np.mean((predictions - y_test) ** 2)):.2f}, "
        f"R^2: {r2_score(y_test, predictions):.2f}"
    )


Season 2019 - MAE: 22.64, RMSE: 31.32, R^2: 0.32
Season 2020 - MAE: 21.27, RMSE: 30.08, R^2: 0.35
Season 2021 - MAE: 19.81, RMSE: 27.98, R^2: 0.40
Season 2022 - MAE: 20.28, RMSE: 28.80, R^2: 0.41
Season 2023 - MAE: 18.76, RMSE: 26.63, R^2: 0.41


In [18]:
concat_df = pd.concat(all_preds)

In [19]:
concat_df[['predicted_rush_yards', 'actual_rush_yards']].corr()

Unnamed: 0,predicted_rush_yards,actual_rush_yards
predicted_rush_yards,1.0,0.618325
actual_rush_yards,0.618325,1.0


In [20]:
from sklearn.inspection import permutation_importance

perm = permutation_importance(
    reg_model,
    X_test[train_cols],
    y_test,
    n_repeats=5,
    random_state=42,
)

sorted_idx = perm.importances_mean.argsort()[::-1]
new_train_cols = []
for i in sorted_idx:
    print(f"{train_cols[i]}: {perm.importances_mean[i]:.4f}")
    if perm.importances_mean[i] > -0:  # Threshold for importance
        new_train_cols.append(train_cols[i])

Starter: 0.0837
pct_of_carries_3ma: 0.0414
rush_yards_5ma: 0.0251
pct_of_carries_5ma: 0.0138
opponent_left tackle_diff_3ma: 0.0093
min_rush_yards_5ma: 0.0066
opp_points_allowed_3_ma: 0.0044
max_rush_yards_5ma: 0.0041
spread: 0.0040
total_diff_3ma: 0.0040
left end_diff_3ma: 0.0035
team_rushes_forty_plus_5ma: 0.0033
total_diff_5ma: 0.0029
ypc_3ma: 0.0029
right tackle_diff_3ma: 0.0027
left end_diff_5ma: 0.0027
left guard_diff_3ma: 0.0024
opponent_up the middle_diff_3ma: 0.0023
team_yac_per_att_5_g_ma: 0.0023
opponent_rushes_one_to_two_3ma: 0.0022
opp_brk_tkl_per_att_5_g_ma: 0.0021
player_ybc_per_att_3_g_ma: 0.0020
rush_attempts_3ma: 0.0020
right guard_diff_3ma: 0.0020
point_diff_5_ma: 0.0019
opponent_rushes_forty_plus_5ma: 0.0019
others_rush_attempts_3ma: 0.0018
team_rushes_ten_plus_5ma: 0.0017
others_rush_attempts_5ma: 0.0017
team_rushes_less_than_eq_zero_5ma: 0.0016
opp_point_diff_5_ma: 0.0015
team_rushes_forty_plus_3ma: 0.0014
team_left guard_diff_3ma: 0.0014
RB_rush_yards_allowed_5ma:

In [22]:
new_train_cols

['Starter',
 'pct_of_carries_3ma',
 'rush_yards_5ma',
 'pct_of_carries_5ma',
 'opponent_left tackle_diff_3ma',
 'min_rush_yards_5ma',
 'opp_points_allowed_3_ma',
 'max_rush_yards_5ma',
 'spread',
 'total_diff_3ma',
 'left end_diff_3ma',
 'team_rushes_forty_plus_5ma',
 'total_diff_5ma',
 'ypc_3ma',
 'right tackle_diff_3ma',
 'left end_diff_5ma',
 'left guard_diff_3ma',
 'opponent_up the middle_diff_3ma',
 'team_yac_per_att_5_g_ma',
 'opponent_rushes_one_to_two_3ma',
 'opp_brk_tkl_per_att_5_g_ma',
 'player_ybc_per_att_3_g_ma',
 'rush_attempts_3ma',
 'right guard_diff_3ma',
 'point_diff_5_ma',
 'opponent_rushes_forty_plus_5ma',
 'others_rush_attempts_3ma',
 'team_rushes_ten_plus_5ma',
 'others_rush_attempts_5ma',
 'team_rushes_less_than_eq_zero_5ma',
 'opp_point_diff_5_ma',
 'team_rushes_forty_plus_3ma',
 'team_left guard_diff_3ma',
 'RB_rush_yards_allowed_5ma',
 'team_up the middle_diff_5ma',
 'opponent_rushes_one_to_two_5ma',
 'max_rush_yards_3ma',
 'max_rush_yards_allowed_5ma',
 'ypc_d

In [21]:
import xgboost as xgb
from sklearn.metrics import r2_score
final_df = final_df.sort_values('Date')
reg_model = xgb.XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
all_preds = []
for season in range(2019, 2024):
    train_data = final_df[final_df['season'] < season]
    test_data = final_df[final_df['season'] == season]

    train_data = train_data[(train_data['rush_attempts_3ma'] >= 1) | (train_data['Starter'] == 1)]
    test_data = test_data[(test_data['rush_attempts_3ma'] >= 1) | (test_data['Starter'] == 1)]
    X_train = train_data[new_train_cols]
    y_train = train_data['Rush_yards']
    X_test = test_data[new_train_cols]
    y_test = test_data['Rush_yards']

    reg_model.fit(X_train, y_train)
    predictions = reg_model.predict(X_test)
    X_test = X_test.copy()

    X_test.loc[:, 'predicted_rush_yards'] = predictions
    X_test = X_test.copy()
    X_test.loc[:, 'actual_rush_yards'] = y_test
    all_preds.append(X_test)


    print(
        f"Season {season} - "
        f"MAE: {np.mean(np.abs(predictions - y_test)):.2f}, "
        f"RMSE: {np.sqrt(np.mean((predictions - y_test) ** 2)):.2f}, "
        f"R^2: {r2_score(y_test, predictions):.2f}"
    )

Season 2019 - MAE: 22.10, RMSE: 30.64, R^2: 0.35
Season 2020 - MAE: 20.52, RMSE: 29.10, R^2: 0.39
Season 2021 - MAE: 19.99, RMSE: 28.25, R^2: 0.39
Season 2022 - MAE: 20.22, RMSE: 28.97, R^2: 0.40
Season 2023 - MAE: 18.44, RMSE: 26.22, R^2: 0.43


In [23]:
reg_model.save_model('rush_yard_regressor.json')