In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Never truncate the column list when a wide DataFrame is displayed.
pd.options.display.max_columns = None

In [55]:
# Load the pre-split wide-receiver train and test tables.
wr_train = pd.read_csv('../../data/final/wr_train_2024.csv')
wr_test = pd.read_csv('../../data/final/wr_test_2024.csv')

# Preview a handful of rows. The sample is seeded so the rows shown are the
# same on every Restart & Run All instead of changing with each execution.
wr_train.sample(5, random_state=40)

Unnamed: 0,season,player_display_name,games,receptions,targets,receiving_yards,receiving_tds,receiving_yards_after_catch,receiving_first_downs,receiving_epa,target_share,fantasy_points,salary_per_year,smoothed_salary_per_year,year_signed
1039,2016,Golden Tate,17,94,140,1102.0,4,627.0,48.0,23.419964,0.228758,134.6,9.375,6.957494,2019.0
1733,2019,Josh Reynolds,16,21,43,326.0,1,139.0,15.0,3.595845,0.073883,40.9,1.75,0.753058,2021.0
759,2014,Leonard Hankerson,1,0,1,0.0,0,0.0,0.0,-0.804459,0.03125,0.0,1.0,-0.42524,2015.0
2218,2021,Ihmir Smith-Marsette,3,5,6,116.0,2,17.0,4.0,11.404099,0.078947,23.6,0.207,-0.771429,2022.0
2039,2021,Willie Snead,7,4,8,38.0,0,12.0,3.0,-4.782851,0.03252,3.8,1.12,0.06953,2022.0


In [56]:
def latest_season_per_contract(player_seasons):
    """Keep the most recent season row for each (player, contract) pair.

    Rows are ordered by ``season`` and the last row of every
    (``player_display_name``, ``year_signed``) group is kept. Any surviving
    row that has a missing value in any column is then dropped.

    NOTE(review): the intent appears to be "the season just before the
    contract was signed"; the code itself simply takes the latest season
    present for that contract -- confirm the input only holds pre-signing
    seasons.

    The sort is stable, so when a group has several rows for the same season
    the one appearing last in the input is kept rather than an arbitrary one.

    Parameters
    ----------
    player_seasons : pd.DataFrame
        Must contain ``season``, ``player_display_name`` and ``year_signed``.

    Returns
    -------
    pd.DataFrame
        A new, filtered frame; the input frame is not modified.
    """
    return (
        player_seasons
        .sort_values('season', kind='stable')
        .groupby(['player_display_name', 'year_signed'])
        .tail(1)
        .dropna()
    )


# Apply the identical filter to both splits so they are prepared the same way.
wr_train = latest_season_per_contract(wr_train)
wr_test = latest_season_per_contract(wr_test)

In [57]:
# (rows, columns) of each split after filtering: train first, then test.
for split in (wr_train, wr_test):
    print(split.shape)

(1146, 15)
(121, 15)


In [58]:
# Columns kept out of the model inputs: season, the player name, and both
# salary columns (the raw salary is the target below).
non_features = ['season', 'player_display_name', 'salary_per_year', 'smoothed_salary_per_year']
preds = [name for name in wr_train.columns if name not in non_features]

X = wr_train[preds]
y = wr_train['salary_per_year']

# Gradient-boosted regression trees. random_state is fixed so the fit is
# repeatable even though subsample < 1 fits each tree on a random subset of rows.
gbm_params = dict(
    n_estimators=300,
    learning_rate=.05,
    max_depth=3,
    random_state=40,
    subsample=.8,
    min_samples_leaf=5,
    min_samples_split=5,
)
gbm = GradientBoostingRegressor(**gbm_params)
gbm.fit(X, y)

In [59]:
def regression_metrics(y_true, y_hat):
    """Return ``(MSE, MAE, R²)`` of predictions ``y_hat`` against ``y_true``."""
    return (
        mean_squared_error(y_true, y_hat),
        mean_absolute_error(y_true, y_hat),
        r2_score(y_true, y_hat),
    )


def print_metrics(title, mse, mae, r2):
    """Print one labelled block of metrics, each rounded to four decimals."""
    print(f"{title}:")
    print(f"  MSE: {mse:.4f}")
    print(f"  MAE: {mae:.4f}")
    print(f"  R²:  {r2:.4f}")


# wr_test had every row containing a missing value dropped when it was
# filtered, so no imputation is needed here. Zero-filling only the test
# features would also evaluate the model on inputs it was never trained on.
X_test = wr_test[preds]
y_test = wr_test['salary_per_year'].to_numpy()

y_pred = gbm.predict(X_test)
y_train_pred = gbm.predict(X)  # in-sample predictions, for the train/test gap

mse_train, mae_train, r2_train = regression_metrics(y, y_train_pred)
mse_test, mae_test, r2_test = regression_metrics(y_test, y_pred)

print_metrics("Training Set", mse_train, mae_train, r2_train)
print()  # blank line between the two reports
print_metrics("Test Set", mse_test, mae_test, r2_test)

Training Set:
  MSE: 1.5826
  MAE: 0.7750
  R²:  0.9141

Test Set:
  MSE: 11.9952
  MAE: 1.7737
  R²:  0.8404


In [61]:
# Attach the model's prediction to every test row, then show the players the
# model prices more than 3 above their actual salary_per_year.
wr_test['predicted_salary'] = y_pred
model_above_actual = (wr_test['predicted_salary'] - wr_test['salary_per_year']) > 3
wr_test[model_above_actual]

Unnamed: 0,season,player_display_name,games,receptions,targets,receiving_yards,receiving_tds,receiving_yards_after_catch,receiving_first_downs,receiving_epa,target_share,fantasy_points,salary_per_year,smoothed_salary_per_year,year_signed,predicted_smoothed_salary,predicted_salary
33,2021,Tim Patrick,16,53,85,734.0,5,202.0,39.0,37.658799,0.173469,103.4,1.295,0.206156,2024.0,6.963319,6.963319
237,2023,Rashid Shaheed,15,46,75,719.0,5,151.0,27.0,12.090541,0.142045,111.6,3.0925,1.66481,2024.0,8.912508,8.912508
122,2023,Mike Evans,19,90,155,1450.0,14,357.0,61.0,63.872143,0.248397,229.0,20.5,15.790829,2024.0,24.921141,24.921141
154,2023,Josh Reynolds,19,48,78,740.0,6,209.0,39.0,17.742736,0.119816,108.0,4.5,2.806983,2024.0,7.521132,7.521132


In [64]:
# Players whose actual salary_per_year exceeds the model's prediction by more
# than 10. Requires the predicted_salary column to be present on wr_test.
actual_above_model = (wr_test['salary_per_year'] - wr_test['predicted_salary']) > 10
wr_test[actual_above_model]

Unnamed: 0,season,player_display_name,games,receptions,targets,receiving_yards,receiving_tds,receiving_yards_after_catch,receiving_first_downs,receiving_epa,target_share,fantasy_points,salary_per_year,smoothed_salary_per_year,year_signed,predicted_smoothed_salary,predicted_salary
79,2022,D.J. Moore,17,63,118,888.0,7,186.0,44.0,17.145478,0.276347,136.1,27.5,21.471262,2024.0,15.555902,15.555902
203,2023,Darnell Mooney,14,31,61,414.0,1,181.0,20.0,-4.804643,0.151365,47.9,13.0,9.704651,2024.0,2.663813,2.663813
204,2023,Justin Jefferson,10,68,100,1074.0,5,260.0,51.0,42.822706,0.268097,136.2,35.0,27.557439,2024.0,23.546803,23.546803
211,2023,Jerry Jeudy,16,54,87,758.0,2,323.0,28.0,16.952774,0.195506,87.8,17.5,13.356358,2024.0,6.457197,6.457197
212,2023,Tee Higgins,12,42,76,656.0,5,235.0,33.0,26.474023,0.178824,95.6,21.816,16.85875,2024.0,4.781008,4.781008
220,2023,Jaylen Waddle,15,74,109,1045.0,4,421.0,49.0,44.9779,0.219758,130.6,28.25,22.079879,2024.0,18.061282,18.061282


In [62]:
# Rank the model inputs by the fitted model's feature importances, highest first.
importances = gbm.feature_importances_
feature_names = X.columns
indices = np.argsort(importances)[::-1]  # ascending argsort, reversed

print("Feature ranking:")
ranked = zip(feature_names[indices], importances[indices])
for rank, (name, score) in enumerate(ranked, start=1):
    print(f"{rank}. {name}: {score:.4f}")

Feature ranking:
1. fantasy_points: 0.4030
2. target_share: 0.1496
3. year_signed: 0.1489
4. receiving_yards: 0.1310
5. receiving_first_downs: 0.0470
6. receiving_epa: 0.0309
7. targets: 0.0285
8. receiving_yards_after_catch: 0.0240
9. games: 0.0159
10. receiving_tds: 0.0120
11. receptions: 0.0091


In [63]:
# Export the test rows (including any prediction columns added above) to the
# dashboard data folder. With to_csv's defaults the DataFrame index is written
# too, as an unnamed first column.
dashboard_csv = '../../data/dashboard/wr.csv'
wr_test.to_csv(dashboard_csv)