In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Remove the cap on displayed columns so wide frames are shown in full.
pd.set_option('display.max_columns', None)

In [149]:
# Load the untouched train / test QB tables. The *_orig frames are kept
# unmodified so later cells can pull the salary columns back from them.
train_csv_path = '../../data/final/qb_train_updated_med.csv'
test_csv_path = '../../data/final/qb_test_updated_med.csv'

qb_train_orig, qb_test_orig = (
    pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path)
)

In [150]:
def _last_two_complete_seasons(season_rows):
    """Keep the last two rows (by ascending ``season``) of every contract.

    A contract is identified by (player_display_name, year_signed). After the
    per-contract selection, any row containing a missing value is dropped, so
    a contract can end up with fewer than two rows (or none).

    NOTE(review): the original comment described this step as keeping the
    "year prior to contract", but two rows per contract are retained here.
    """
    ordered = season_rows.sort_values('season')
    latest = ordered.groupby(['player_display_name', 'year_signed']).tail(2)
    return latest.dropna()


qb_train = _last_two_complete_seasons(qb_train_orig)
qb_test = _last_two_complete_seasons(qb_test_orig)

In [151]:
# Columns identifying a contract, and the salary columns re-attached from the
# original frames after the per-contract averaging.
contract_keys = ['player_display_name', 'year_signed']
salary_cols = ['salary_per_year', 'smoothed_salary_per_year']

# Everything that is not averaged: identifiers, salary variants, passing_epa.
non_feature_cols = ['season', 'player_display_name', 'salary_per_year', 'smoothed_salary_per_year', 'year_signed', 'mean_adj_salary_per_year', 'med_adj_salary_per_year', 'passing_epa']
preds = [name for name in qb_train.columns if name not in non_feature_cols]


def _average_per_contract(season_rows, salary_source):
    """Average the `preds` columns per contract and attach its salary columns.

    The right join keeps every row of `salary_source`, so the merged frame is
    de-duplicated on the contract keys afterwards (first occurrence wins).

    NOTE(review): contracts present in `salary_source` but absent from
    `season_rows` come out with NaN features; a later cell drops them.
    """
    averaged = season_rows.groupby(contract_keys)[preds].mean().reset_index()
    with_salary = averaged.merge(
        salary_source[contract_keys + salary_cols],
        on=contract_keys,
        how='right'
    )
    return with_salary.drop_duplicates(subset=contract_keys)


qb_train = _average_per_contract(qb_train, qb_train_orig)
qb_test = _average_per_contract(qb_test, qb_test_orig)

In [152]:
# Sanity check: (rows, columns) of the train frame, then of the test frame.
for frame in (qb_train, qb_test):
    print(frame.shape)

(447, 17)
(38, 17)


In [159]:
# Discard every row that still has a missing value in any column
# (features or salary) in both splits.
qb_train, qb_test = qb_train.dropna(), qb_test.dropna()

In [160]:
# Preview five random training rows. The seed is fixed so that a
# Restart & Run All shows the same rows every time.
qb_train.sample(5, random_state=20)

Unnamed: 0,player_display_name,year_signed,games,completions,attempts,passing_yards,passing_tds,interceptions,sacks,sack_fumbles,passing_first_downs,passing_2pt_conversions,pacr,dakota,rushing_yards,rushing_tds,fantasy_points,salary_per_year,smoothed_salary_per_year
336,Tyrod Taylor,2015.0,3.0,1.0,5.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.074074,-0.272305,64.0,0.0,5.08,1.116667,-0.661635
683,Drew Brees,2020.0,14.5,362.5,489.0,3864.5,32.0,6.0,18.0,2.5,197.5,0.5,1.158931,0.17569,10.5,2.5,285.68,25.0,6.27746
9,Donovan McNabb,2002.0,18.0,357.5,612.0,3757.5,27.0,14.0,50.0,6.5,181.0,0.5,0.0,0.064914,637.0,5.5,320.25,8.398529,1.032648
473,Mark Sanchez,2015.0,9.0,198.0,309.0,2418.0,14.0,11.0,23.0,3.0,117.0,0.0,0.936483,0.091686,87.0,1.0,139.42,4.5,0.224474
876,Joe Flacco,2022.0,2.0,27.0,42.0,338.0,3.0,0.0,2.0,1.0,14.0,0.0,1.005952,0.140893,3.0,0.0,23.82,3.5,0.317013


In [167]:
# Model inputs: every remaining column except identifiers, the salary
# variants, and the passing_epa / dakota metrics. year_signed is not in the
# exclusion list, so it is used as a feature.
not_features = frozenset([
    'season', 'player_display_name', 'salary_per_year', 'smoothed_salary_per_year',
    'mean_adj_salary_per_year', 'med_adj_salary_per_year', 'passing_epa', 'dakota',
])
preds = [name for name in qb_train.columns if name not in not_features]

X = qb_train[preds]
y = qb_train['salary_per_year']  # target: raw (unsmoothed) salary per year

# Hyperparameters of the boosted-tree regressor; random_state pins the
# row subsampling so refits are repeatable.
gbm_params = dict(
    n_estimators=400,
    learning_rate=.01,
    max_depth=2,
    random_state=20,
    subsample=.8,
    min_samples_leaf=5,
    min_samples_split=5,
)
gbm = GradientBoostingRegressor(**gbm_params)
gbm.fit(X, y)

In [168]:
def _regression_metrics(y_true, y_hat):
    """Return ``(mse, mae, r2)`` for one set of targets and predictions."""
    return (
        mean_squared_error(y_true, y_hat),
        mean_absolute_error(y_true, y_hat),
        r2_score(y_true, y_hat),
    )


# Test features use exactly the columns the model was fitted on. Any
# remaining gaps are zero-filled before prediction.
# NOTE(review): qb_test already went through dropna() in an earlier cell, so
# this fill should be a no-op — confirm zero-imputation is really intended.
X_test = qb_test[preds].fillna(0)
y_test = qb_test['salary_per_year'].to_numpy()

y_pred = gbm.predict(X_test)
y_train_pred = gbm.predict(X)  # in-sample predictions, for comparison

mse_train, mae_train, r2_train = _regression_metrics(y, y_train_pred)
mse_test, mae_test, r2_test = _regression_metrics(y_test, y_pred)

print("Training Set:")
print(f"  MSE: {mse_train:.4f}")
print(f"  MAE: {mae_train:.4f}")
print(f"  R²:  {r2_train:.4f}\n")

print("Test Set:")
print(f"  MSE: {mse_test:.4f}")
print(f"  MAE: {mae_test:.4f}")
print(f"  R²:  {r2_test:.4f}")

Training Set:
  MSE: 18.6032
  MAE: 2.7613
  R²:  0.8330

Test Set:
  MSE: 77.6486
  MAE: 5.4894
  R²:  0.7838


In [184]:
# Store the model's prediction next to the actual salary, then show the
# contracts where the prediction exceeds the actual salary_per_year by
# more than 3.
qb_test['predicted_salary'] = y_pred
overpredicted = (qb_test['predicted_salary'] - qb_test['salary_per_year']) > 3
qb_test.loc[overpredicted]

Unnamed: 0,player_display_name,year_signed,games,completions,attempts,passing_yards,passing_tds,interceptions,sacks,sack_fumbles,passing_first_downs,passing_2pt_conversions,pacr,dakota,rushing_yards,rushing_tds,fantasy_points,salary_per_year,smoothed_salary_per_year,predicted_salary
9,Russell Wilson,2024.0,15.0,294.5,465.0,3297.0,21.0,9.5,50.0,5.5,138.5,1.0,0.894569,0.067798,309.0,3.0,240.83,1.21,-0.393551,30.705484
18,Desmond Ridder,2024.0,9.5,161.0,251.5,1772.0,7.0,6.0,20.0,3.5,86.5,1.0,0.821948,0.031846,128.5,2.5,109.43,0.985,-0.450483,4.368788
39,Joshua Dobbs,2024.0,13.0,262.0,417.0,2464.0,13.0,10.0,30.0,7.0,126.0,1.0,0.742169,0.019909,421.0,6.0,202.66,2.25,-0.130399,12.463007
47,Baker Mayfield,2024.0,19.0,412.0,643.0,4730.0,34.0,12.0,48.0,6.0,219.0,2.0,0.861096,0.095922,194.0,1.0,324.6,33.333333,7.734656,39.167025
51,Jake Browning,2024.0,9.0,171.0,243.0,1936.0,12.0,7.0,24.0,3.0,93.0,1.0,1.245817,0.121133,127.0,3.0,143.44,0.9725,-0.453646,9.056528


In [185]:
# Contracts where the actual salary_per_year exceeds the prediction by
# more than 10.
underpredicted = (qb_test['salary_per_year'] - qb_test['predicted_salary']) > 10
qb_test.loc[underpredicted]

Unnamed: 0,player_display_name,year_signed,games,completions,attempts,passing_yards,passing_tds,interceptions,sacks,sack_fumbles,passing_first_downs,passing_2pt_conversions,pacr,dakota,rushing_yards,rushing_tds,fantasy_points,salary_per_year,smoothed_salary_per_year,predicted_salary
0,Jared Goff,2024.0,18.5,433.0,651.5,4925.0,31.5,9.5,30.0,5.0,252.0,1.5,1.098829,0.110034,48.5,1.0,311.1,53.0,12.710937,39.68942
2,Tua Tagovailoa,2024.0,15.5,333.5,499.5,4185.5,27.5,11.5,26.0,3.5,196.0,0.0,0.990273,0.133806,84.5,0.0,257.87,53.1,12.73624,32.55052
3,Dak Prescott,2024.0,16.0,380.0,557.0,4145.0,33.5,14.0,32.5,2.5,207.0,1.0,0.926414,0.140569,257.5,2.0,309.55,60.0,14.482156,39.310705
7,Trevor Lawrence,2024.0,17.5,404.5,617.0,4317.0,25.5,13.5,33.0,7.5,207.0,2.0,0.901756,0.08832,332.0,4.5,296.88,55.0,13.217,39.516495
10,Kirk Cousins,2024.0,13.0,335.5,496.5,3575.5,24.5,9.5,31.5,4.5,179.0,1.5,0.99111,0.111693,61.5,1.5,233.07,45.0,10.686687,27.705265
58,Jordan Love,2024.0,19.0,409.0,634.0,4625.0,37.0,13.0,30.0,5.0,230.0,3.0,0.869198,0.120195,249.0,4.0,355.9,55.0,13.217,43.457031


In [186]:
# Rank the fitted model's features from most to least important.
importances = gbm.feature_importances_
feature_names = X.columns
indices = np.argsort(importances)[::-1]  # ascending argsort, reversed

print("Feature ranking:")
for rank, col_pos in enumerate(indices, start=1):
    print(f"{rank}. {feature_names[col_pos]}: {importances[col_pos]:.4f}")

Feature ranking:
1. fantasy_points: 0.6752
2. year_signed: 0.1904
3. passing_tds: 0.0748
4. rushing_yards: 0.0146
5. pacr: 0.0120
6. interceptions: 0.0110
7. sacks: 0.0061
8. passing_2pt_conversions: 0.0054
9. rushing_tds: 0.0041
10. passing_first_downs: 0.0021
11. games: 0.0017
12. sack_fumbles: 0.0014
13. completions: 0.0005
14. passing_yards: 0.0005
15. attempts: 0.0002


In [187]:
# Write the test frame (including predicted_salary) for the dashboard.
# The DataFrame index is written too, as the first, unnamed CSV column.
dashboard_csv_path = '../../data/dashboard/qb.csv'
qb_test.to_csv(dashboard_csv_path)