In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

In [20]:
rb_train = pd.read_csv('../data/final/rb_train_updated.csv')
rb_test = pd.read_csv('../data/final/rb_test_updated.csv')

In [21]:
rb_train[rb_train.player_display_name == 'Derrick Henry'].sample(3)

Unnamed: 0,season,player_display_name,games,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_fumbles_lost,rushing_first_downs,rushing_epa,receptions,targets,receiving_yards_after_catch,receiving_first_downs,receiving_epa,target_share,fantasy_points,salary_per_year,smoothed_salary_per_year
1461,2023,Derrick Henry,17,280,1167.0,12,0.0,0.0,63.0,-2.098503,28,36,275.0,7.0,3.174481,0.084112,218.66,8.0,9.225233
736,2016,Derrick Henry,14,110,490.0,5,0.0,0.0,29.0,-0.293348,13,15,124.0,6.0,3.780821,0.061475,92.7,12.5,13.564606
833,2017,Derrick Henry,18,211,928.0,6,2.0,0.0,45.0,-11.590342,16,23,174.0,5.0,5.051808,0.065341,154.0,12.5,13.564606


In [22]:
### using avg of the most recent 2 season stats before each contract 

rct_seasons_per_contract = (
    rb_train.sort_values(['player_display_name', 'salary_per_year', 'season'], ascending=[True, True, False])
            .groupby(['player_display_name', 'salary_per_year'])
            .head(2)
)

contract_latest = (
    rct_seasons_per_contract.groupby(['player_display_name', 'salary_per_year'])['season'].max().reset_index()
)

top_contracts_per_player = (
    contract_latest.sort_values(['player_display_name', 'season'], ascending=[True, False])
                   .groupby('player_display_name')
                   .head(2)
)

final = rct_seasons_per_contract.merge(top_contracts_per_player, on=['player_display_name', 'salary_per_year'])
final[final.player_display_name == 'Derrick Henry']

Unnamed: 0,season_x,player_display_name,games,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_fumbles_lost,rushing_first_downs,rushing_epa,receptions,targets,receiving_yards_after_catch,receiving_first_downs,receiving_epa,target_share,fantasy_points,salary_per_year,smoothed_salary_per_year,season_y
242,2023,Derrick Henry,17,280,1167.0,12,0.0,0.0,63.0,-2.098503,28,36,275.0,7.0,3.174481,0.084112,218.66,8.0,9.225233,2023
243,2022,Derrick Henry,16,349,1538.0,13,4.0,2.0,65.0,-22.311296,33,41,397.0,11.0,14.124216,0.11326,269.76,8.0,9.225233,2023
244,2019,Derrick Henry,18,386,1986.0,18,5.0,3.0,94.0,4.75416,23,29,275.0,5.0,6.436225,0.068558,339.42,12.5,13.564606,2019
245,2018,Derrick Henry,16,215,1059.0,12,0.0,0.0,51.0,10.600936,15,18,115.0,4.0,-2.62559,0.055046,186.36,12.5,13.564606,2019


In [23]:
preds = [col for col in rb_train.columns if col not in ['season', 'player_display_name', 'salary_per_year', 'smoothed_salary_per_year']]
grouped_df = final.groupby(['player_display_name', 'smoothed_salary_per_year'])[preds].mean().reset_index()
grouped_df[grouped_df.player_display_name == 'Derrick Henry']

Unnamed: 0,player_display_name,smoothed_salary_per_year,games,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_fumbles_lost,rushing_first_downs,rushing_epa,receptions,targets,receiving_yards_after_catch,receiving_first_downs,receiving_epa,target_share,fantasy_points
170,Derrick Henry,9.225233,16.5,314.5,1352.5,12.5,2.0,1.0,64.0,-12.2049,30.5,38.5,336.0,9.0,8.649349,0.098686,244.21
171,Derrick Henry,13.564606,17.0,300.5,1522.5,15.0,2.5,1.5,72.5,7.677548,19.0,23.5,195.0,4.5,1.905317,0.061802,262.89


In [24]:
## filling na's with 0 -- only occur in the advanced metrics where player has no attempts
X = grouped_df[preds]
X = X.fillna(0)

y = grouped_df['smoothed_salary_per_year']

In [None]:
## lasso for feature selection 

from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
#X_scaled = scaler.fit_transform(X)
y = grouped_df['smoothed_salary_per_year'].values.ravel()

lasso = LassoCV(cv=5, random_state=0)
lasso.fit(X, y)

In [26]:
lasso_coef = pd.Series(lasso.coef_, index=preds)
selected_features = lasso_coef[lasso_coef != 0].index.tolist()
print("Selected Features:", selected_features)

Selected Features: ['games', 'carries', 'rushing_yards', 'rushing_first_downs', 'rushing_epa', 'targets', 'receiving_yards_after_catch', 'receiving_epa', 'fantasy_points']


In [29]:
import statsmodels.api as sm

X_selected = X[selected_features]
X_selected = sm.add_constant(X_selected)

y = grouped_df['smoothed_salary_per_year'].values.ravel()  # flatten to 1D
ols_model = sm.OLS(y, X_selected).fit()

print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.545
Model:                            OLS   Adj. R-squared:                  0.538
Method:                 Least Squares   F-statistic:                     75.80
Date:                Thu, 17 Apr 2025   Prob (F-statistic):           1.79e-91
Time:                        21:28:08   Log-Likelihood:                -1262.1
No. Observations:                 579   AIC:                             2544.
Df Residuals:                     569   BIC:                             2588.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

In [30]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

X_test = rb_test[selected_features]
X_test = X_test.fillna(0)
X_test = sm.add_constant(X_test)

y_test = rb_test['smoothed_salary_per_year'].values.ravel()

y_pred = ols_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)
print("R² Score:", r2)

MSE: 2.2420439222989144
RMSE: 1.4973456255316988
MAE: 1.2281637321391503
R² Score: 0.8147875432378877


In [31]:
rb_test['predicted_smoothed_salary'] = y_pred
rb_test.sample(10)

Unnamed: 0,season,player_display_name,games,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_fumbles_lost,rushing_first_downs,rushing_epa,receptions,targets,receiving_yards_after_catch,receiving_first_downs,receiving_epa,target_share,fantasy_points,salary_per_year,smoothed_salary_per_year,predicted_smoothed_salary
12,2024,Darrynton Evans,1,3,3.0,0,0.0,0.0,0.0,-1.294813,0,0,0.0,0.0,,,0.3,1.21,0.197907,-0.623138
6,2024,Miles Sanders,11,55,205.0,2,0.0,0.0,12.0,-8.659847,24,30,150.0,5.0,-17.532581,0.095847,53.3,1.3375,0.303606,1.898633
2,2024,Dare Ogunbowale,17,32,126.0,0,0.0,0.0,5.0,-7.770873,20,29,174.0,8.0,5.249691,0.068558,38.4,1.8,0.687027,-0.437848
8,2024,Jonathan Ward,2,5,22.0,0,0.0,0.0,1.0,0.204633,0,0,0.0,0.0,,,2.2,1.17,0.164746,-0.61567
20,2024,Jordan Mason,12,153,789.0,3,3.0,1.0,37.0,-5.4424,11,14,59.0,5.0,4.628491,0.059072,104.0,5.346,3.626722,2.837834
17,2024,Javonte Williams,18,146,542.0,4,2.0,2.0,30.0,-24.837296,54,72,408.0,14.0,-5.380488,0.125874,110.2,3.0,1.681848,3.807647
9,2024,Patrick Taylor,5,39,183.0,1,0.0,0.0,11.0,2.139868,3,11,19.0,2.0,-10.156942,0.083333,26.8,1.17,0.164746,0.537147
4,2024,Saquon Barkley,20,436,2504.0,18,2.0,1.0,99.0,30.810126,46,58,337.0,15.0,16.555139,0.125541,409.7,20.6,16.272552,13.019499
18,2024,Tyler Goodson,9,32,153.0,1,0.0,0.0,6.0,2.687648,11,15,67.0,3.0,1.303773,0.069124,33.4,1.03,0.048683,-0.619384
1,2024,Samaje Perine,20,21,100.0,1,0.0,0.0,8.0,0.036445,29,38,330.0,17.0,16.01346,0.0608,55.9,1.8,0.687027,1.133603
