In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

In [37]:
qb_train = pd.read_csv('../data/final/qb_train_updated.csv')
qb_test = pd.read_csv('../data/final/qb_test_updated.csv')
qb_train.sample(5)

Unnamed: 0,season,player_display_name,games,completions,attempts,passing_yards,passing_tds,interceptions,sacks,sack_fumbles,passing_first_downs,passing_epa,passing_2pt_conversions,pacr,dakota,rushing_yards,rushing_tds,fantasy_points,salary_per_year,smoothed_salary_per_year
338,2012,Tom Brady,18,455,731,5491.0,39,10.0,28.0,1,289.0,207.017332,0,0.877998,0.158286,36.0,4,383.24,11.4,1.584186
488,2014,Brandon Weeden,4,24,41,303.0,3,2.0,1.0,0,16.0,2.738845,0,0.863248,0.036213,-1.0,0,20.02,2.0,-0.326643
126,2006,Donovan McNabb,10,180,316,2612.0,18,6.0,21.0,1,113.0,36.704778,0,0.93553,0.104538,212.0,3,199.68,12.1,1.468996
83,2004,Steve McNair,8,129,215,1343.0,8,9.0,13.0,3,76.0,4.639694,0,0.0,0.064079,128.0,1,82.52,5.5,0.302264
753,2019,Case Keenum,10,160,247,1707.0,11,5.0,15.0,4,83.0,-3.21123,0,1.004118,0.054235,12.0,1,103.48,6.0,0.931482


In [38]:
preds = [col for col in qb_train.columns if col not in ['season', 'player_display_name', 'salary_per_year', 'smoothed_salary_per_year']]

In [39]:
### using avg of the most recent 2 season stats before each contract 

rct_seasons_per_contract = (
    qb_train.sort_values(['player_display_name', 'salary_per_year', 'season'], ascending=[True, True, False])
            .groupby(['player_display_name', 'salary_per_year'])
            .head(2)
)


contract_latest = (
    rct_seasons_per_contract.groupby(['player_display_name', 'salary_per_year'])['season'].max().reset_index()
)

rct_contracts_per_player = (
    contract_latest.sort_values(['player_display_name', 'season'], ascending=[True, False])
                   .groupby('player_display_name')
                   .head(2)
)

final = rct_seasons_per_contract.merge(rct_contracts_per_player, on=['player_display_name', 'salary_per_year'])
grouped_df = final.groupby(['player_display_name', 'smoothed_salary_per_year'])[preds].mean().reset_index()

In [40]:
X = grouped_df[preds]
X = X.fillna(0)
y = grouped_df['smoothed_salary_per_year']

In [41]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 1.0], 
}

gbm = GradientBoostingRegressor(random_state=42)

grid_search = GridSearchCV(
    estimator=gbm,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X, y)

print("Best Parameters:", grid_search.best_params_)
print("Best MSE:", grid_search.best_score_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
Best MSE: -3.566145712896438


In [42]:
### fitting gbm based on grid search results

gbm = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05,
                                max_depth=3, random_state = 30, subsample = 1.0)
gbm.fit(X, y)

In [43]:
X_test = qb_test[preds]
X_test = X_test.fillna(0)

y_test = qb_test['smoothed_salary_per_year'].values.ravel()
y_pred = gbm.predict(X_test)

In [44]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R-squared (R²): {r2:.4f}")

Mean Squared Error (MSE): 0.58
Mean Absolute Error (MAE): 0.50
R-squared (R²): 0.9273


In [12]:
qb_test['predicted_smoothed_salary'] = y_pred
qb_test.sample(10)

Unnamed: 0,season,player_display_name,games,completions,attempts,passing_yards,passing_tds,interceptions,sacks,sack_fumbles,passing_first_downs,passing_epa,passing_2pt_conversions,pacr,dakota,rushing_yards,rushing_tds,fantasy_points,salary_per_year,smoothed_salary_per_year,predicted_smoothed_salary
12,2024,Daniel Jones,10,216,341,2070.0,8,7.0,29.0,4,106.0,-33.381812,0,0.82503,0.052998,265.0,2,135.3,14.0,2.383453,0.651503
7,2024,Kyle Allen,1,1,1,19.0,0,0.0,0.0,0,1.0,1.432553,0,1.461538,,0.0,0,0.76,1.27,-0.355655,-0.487888
15,2024,Mac Jones,10,171,262,1672.0,8,8.0,14.0,1,77.0,-2.45747,1,0.86722,0.044203,92.0,1,96.08,3.5,0.124173,0.007622
14,2024,Justin Fields,11,106,162,1106.0,5,1.0,16.0,2,45.0,8.069526,0,0.880573,0.097063,289.0,5,119.14,20.0,3.67447,1.472189
9,2024,Josh Allen,19,365,565,4367.0,32,6.0,19.0,6,199.0,150.465796,2,0.92994,0.174354,636.0,14,438.28,55.0,11.205403,11.426394
4,2024,Jacoby Brissett,7,95,161,826.0,2,1.0,18.0,3,41.0,-38.167678,1,0.755718,0.007904,62.0,0,45.24,6.25,0.715889,-0.265419
11,2024,Jarrett Stidham,2,0,0,0.0,0,0.0,0.0,0,0.0,,0,,,5.0,0,0.5,6.0,0.662097,-0.369559
13,2024,Kyle Trask,3,1,1,5.0,0,0.0,0.0,0,0.0,0.297811,0,0.0,,-4.0,0,-0.2,2.787,-0.029243,-0.422551
8,2024,Mason Rudolph,8,146,228,1530.0,9,9.0,11.0,2,78.0,-1.936141,1,0.93865,0.071265,106.0,1,95.8,4.0,0.231758,0.190526
0,2024,Andy Dalton,6,106,160,989.0,7,6.0,7.0,1,50.0,-18.095891,0,0.931262,0.028268,34.0,0,58.96,4.0,0.231758,-0.2066


In [46]:
qb_test.to_csv('../data/dashboard/qb.csv')