In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

In [2]:
rb_train = pd.read_csv('../data/final/rb_train_updated.csv')
rb_test = pd.read_csv('../data/final/rb_test_updated.csv')
rb_train.sample(5)

Unnamed: 0,season,player_display_name,games,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_fumbles_lost,rushing_first_downs,rushing_epa,receptions,targets,receiving_yards_after_catch,receiving_first_downs,receiving_epa,target_share,fantasy_points,salary_per_year,smoothed_salary_per_year
1147,2020,James Conner,14,180,758.0,7,2.0,0.0,44.0,3.264991,40,48,272.0,12.0,-0.491178,0.088725,144.3,1.75,1.286447
912,2018,Tevin Coleman,16,167,800.0,4,1.0,0.0,36.0,-11.832124,32,44,275.0,15.0,12.861834,0.072848,161.6,4.25,4.624665
1196,2020,Devin Singletary,19,172,750.0,2,1.0,1.0,43.0,-16.565423,46,59,305.0,9.0,-0.907245,0.086131,116.3,2.75,2.746516
941,2018,Christian McCaffrey,16,219,1098.0,7,2.0,1.0,53.0,7.454052,107,124,859.0,41.0,34.296967,0.227941,278.5,16.015853,17.695942
1199,2020,Trayveon Williams,6,26,157.0,0,1.0,1.0,5.0,-14.436734,5,5,41.0,1.0,-0.41158,0.058824,16.7,0.9075,-0.072924


In [3]:
preds = [col for col in rb_train.columns if col not in ['season', 'player_display_name', 'salary_per_year', 'smoothed_salary_per_year']]

In [5]:
### using avg of the most recent 2 season stats before each contract 

rct_seasons_per_contract = (
    rb_train.sort_values(['player_display_name', 'salary_per_year', 'season'], ascending=[True, True, False])
            .groupby(['player_display_name', 'salary_per_year'])
            .head(2)
)


contract_latest = (
    rct_seasons_per_contract.groupby(['player_display_name', 'salary_per_year'])['season'].max().reset_index()
)

top_contracts_per_player = (
    contract_latest.sort_values(['player_display_name', 'season'], ascending=[True, False])
                   .groupby('player_display_name')
                   .head(2)
)

final = rct_seasons_per_contract.merge(top_contracts_per_player, on=['player_display_name', 'salary_per_year'])
final[final.player_display_name == 'Nick Chubb']

Unnamed: 0,season_x,player_display_name,games,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_fumbles_lost,rushing_first_downs,rushing_epa,receptions,targets,receiving_yards_after_catch,receiving_first_downs,receiving_epa,target_share,fantasy_points,salary_per_year,smoothed_salary_per_year,season_y
598,2023,Nick Chubb,2,28,170.0,0,0.0,0.0,8.0,4.113699,4,4,20.0,1.0,0.750114,0.137931,19.1,2.275,1.77989,2023
599,2022,Nick Chubb,17,302,1525.0,12,1.0,1.0,69.0,13.225027,27,37,259.0,12.0,7.964626,0.072978,254.4,2.275,1.77989,2023
600,2020,Nick Chubb,14,221,1212.0,12,1.0,1.0,63.0,17.199724,22,27,218.0,8.0,8.068899,0.072193,219.5,12.2,18.147493,2020
601,2019,Nick Chubb,16,299,1513.0,8,2.0,2.0,63.0,-21.48616,36,49,318.0,13.0,-2.660069,0.094595,221.1,12.2,18.147493,2020


In [6]:
grouped_df = final.groupby(['player_display_name', 'smoothed_salary_per_year'])[preds].mean().reset_index()

In [7]:
X = grouped_df[preds]
X = X.fillna(0)
y = grouped_df['smoothed_salary_per_year']

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 1.0], 
}

gbm = GradientBoostingRegressor(random_state=42)

grid_search = GridSearchCV(
    estimator=gbm,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X, y)

print("Best Parameters:", grid_search.best_params_)
print("Best MSE:", grid_search.best_score_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
Best MSE: -6.110173068276545


In [9]:
### fitting gbm based on grid search results

gbm = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05,
                                max_depth=3, random_state = 30, subsample = .8, min_samples_leaf = 5, min_samples_split = 5)
gbm.fit(X, y)

In [10]:
X_test = rb_test[preds]
X_test = X_test.fillna(0)

y_test = rb_test['smoothed_salary_per_year'].values.ravel()
y_pred = gbm.predict(X_test)
y_train_pred = gbm.predict(X)

In [11]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


mse_train = mean_squared_error(y, y_train_pred)
mae_train = mean_absolute_error(y, y_train_pred)
r2_train = r2_score(y, y_train_pred)

mse_test = mean_squared_error(y_test, y_pred)
mae_test = mean_absolute_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

print("Training Set:")
print(f"  MSE: {mse_train:.4f}")
print(f"  MAE: {mae_train:.4f}")
print(f"  R²:  {r2_train:.4f}\n")

print("Test Set:")
print(f"  MSE: {mse_test:.4f}")
print(f"  MAE: {mae_test:.4f}")
print(f"  R²:  {r2_test:.4f}")

Training Set:
  MSE: 1.6265
  MAE: 0.8561
  R²:  0.8385

Test Set:
  MSE: 4.2631
  MAE: 1.2218
  R²:  0.6478


In [12]:
rb_test['predicted_smoothed_salary'] = y_pred
rb_test.sample(5)

Unnamed: 0,season,player_display_name,games,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_fumbles_lost,rushing_first_downs,rushing_epa,receptions,targets,receiving_yards_after_catch,receiving_first_downs,receiving_epa,target_share,fantasy_points,salary_per_year,smoothed_salary_per_year,predicted_smoothed_salary
18,2024,Tyler Goodson,9,32,153.0,1,0.0,0.0,6.0,2.687648,11,15,67.0,3.0,1.303773,0.069124,33.4,1.03,0.048683,0.190853
13,2024,Clyde Edwards-Helaire,2,13,46.0,0,0.0,0.0,2.0,-1.958973,3,5,35.0,1.0,0.48212,0.068493,7.0,1.17,0.164746,-0.152565
12,2024,Darrynton Evans,1,3,3.0,0,0.0,0.0,0.0,-1.294813,0,0,0.0,0.0,,,0.3,1.21,0.197907,-0.212608
21,2024,Aaron Shampklin,3,6,17.0,0,0.0,0.0,1.0,-2.41447,0,0,0.0,0.0,,,1.7,0.96,-0.009348,-0.212608
4,2024,Saquon Barkley,20,436,2504.0,18,2.0,1.0,99.0,30.810126,46,58,337.0,15.0,16.555139,0.125541,409.7,20.6,16.272552,9.012965


In [13]:
importances = gbm.feature_importances_
feature_names = X.columns
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
for i in range(len(importances)):
    print(f"{i + 1}. {feature_names[indices[i]]}: {importances[indices[i]]:.4f}")

Feature ranking:
1. fantasy_points: 0.2521
2. receiving_yards_after_catch: 0.2492
3. rushing_yards: 0.1377
4. rushing_first_downs: 0.0690
5. rushing_epa: 0.0636
6. carries: 0.0440
7. target_share: 0.0408
8. receiving_first_downs: 0.0322
9. receiving_epa: 0.0243
10. rushing_tds: 0.0217
11. targets: 0.0174
12. games: 0.0168
13. receptions: 0.0148
14. rushing_fumbles: 0.0126
15. rushing_fumbles_lost: 0.0037


In [14]:
rb_test.to_csv('../data/dashboard/rb.csv')