In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

In [3]:
qb_train = pd.read_csv('../data/final/qb_train_updated.csv')
qb_test = pd.read_csv('../data/final/qb_test_updated.csv')

In [4]:
## example data
qb_train[qb_train.player_display_name == 'Tua Tagovailoa']

Unnamed: 0,season,player_display_name,games,completions,attempts,passing_yards,passing_tds,interceptions,sacks,sack_fumbles,passing_first_downs,passing_epa,passing_2pt_conversions,pacr,dakota,rushing_yards,rushing_tds,fantasy_points,salary_per_year,smoothed_salary_per_year
864,2020,Tua Tagovailoa,10,186,290,1814.0,11,5.0,20.0,1,100.0,-13.510189,1,0.834023,0.054589,109.0,3,135.46,53.1,12.73624
927,2021,Tua Tagovailoa,13,263,388,2653.0,16,10.0,20.0,7,137.0,20.855968,1,0.970018,0.084615,128.0,3,180.92,53.1,12.73624
987,2022,Tua Tagovailoa,13,259,400,3548.0,25,8.0,21.0,4,162.0,97.368233,0,0.925646,0.148105,70.0,0,230.92,53.1,12.73624
1040,2023,Tua Tagovailoa,18,408,599,4823.0,30,15.0,31.0,3,230.0,109.62352,0,1.054899,0.119507,99.0,0,284.82,53.1,12.73624


In [7]:
## using avg of the most recent 2 season stats before each contract 

qb_recent = qb_train.sort_values(by=['player_display_name', 'season'], ascending=[True, False])
qb_recent = qb_recent.groupby('player_display_name').head(2)
qb_recent[qb_recent.player_display_name == 'Tua Tagovailoa']

Unnamed: 0,season,player_display_name,games,completions,attempts,passing_yards,passing_tds,interceptions,sacks,sack_fumbles,passing_first_downs,passing_epa,passing_2pt_conversions,pacr,dakota,rushing_yards,rushing_tds,fantasy_points,salary_per_year,smoothed_salary_per_year
1040,2023,Tua Tagovailoa,18,408,599,4823.0,30,15.0,31.0,3,230.0,109.62352,0,1.054899,0.119507,99.0,0,284.82,53.1,12.73624
987,2022,Tua Tagovailoa,13,259,400,3548.0,25,8.0,21.0,4,162.0,97.368233,0,0.925646,0.148105,70.0,0,230.92,53.1,12.73624


In [30]:
### aggregting data -- avg of metrics during 2 seasons leading up to contract

preds = [col for col in qb_train.columns if col not in ['season', 'player_display_name', 'salary_per_year', 'smoothed_salary_per_year']]
grouped_df = qb_recent.groupby(['player_display_name', 'smoothed_salary_per_year'])[preds].mean().reset_index()
grouped_df.sample(3)

Unnamed: 0,player_display_name,smoothed_salary_per_year,games,completions,attempts,passing_yards,passing_tds,interceptions,sacks,sack_fumbles,passing_first_downs,passing_epa,passing_2pt_conversions,pacr,dakota,rushing_yards,rushing_tds,fantasy_points
18,Blaine Gabbert,-0.050057,6.0,7.0,11.0,67.0,0.0,0.0,1.0,0.0,4.0,0.191835,0.0,0.744444,0.084953,-7.0,0.0,1.98
145,Josh Rosen,-0.545696,5.0,58.0,109.0,567.0,1.0,5.0,16.0,1.0,26.0,-49.465133,0.0,0.630701,-0.038763,13.0,0.0,17.98
4,Alex Tanney,-0.636848,1.0,10.0,14.0,99.0,1.0,0.0,3.0,0.0,7.0,-0.563155,0.0,0.804878,0.074431,0.0,0.0,7.96


In [31]:
## filling na's with 0 -- only occur in the advanced metrics where player has no attempts
X = grouped_df[preds]
X = X.fillna(0)

y = grouped_df['smoothed_salary_per_year']

In [32]:
## lasso for feature selection 

from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
#X_scaled = scaler.fit_transform(X)
y = grouped_df['smoothed_salary_per_year'].values.ravel()

lasso = LassoCV(cv=5, random_state=0)
lasso.fit(X, y)

In [11]:
#lasso_coef = pd.Series(lasso.coef_, index=preds)
#print("LASSO Coefficients:\n", lasso_coef)

In [33]:
lasso_coef = pd.Series(lasso.coef_, index=preds)
selected_features = lasso_coef[lasso_coef != 0].index.tolist()
print("Selected Features:", selected_features)

Selected Features: ['completions', 'passing_epa', 'rushing_yards', 'fantasy_points']


In [34]:
import statsmodels.api as sm

X_selected = X[selected_features]
X_selected = sm.add_constant(X_selected)

y = grouped_df['smoothed_salary_per_year'].values.ravel()  # flatten to 1D
ols_model = sm.OLS(y, X_selected).fit()

print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.737
Method:                 Least Squares   F-statistic:                     185.5
Date:                Thu, 17 Apr 2025   Prob (F-statistic):           8.84e-75
Time:                        21:33:47   Log-Likelihood:                -531.65
No. Observations:                 264   AIC:                             1073.
Df Residuals:                     259   BIC:                             1091.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.5882      0.160     -3.

In [35]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

X_test = qb_test[selected_features]
X_test = X_test.fillna(0)
X_test = sm.add_constant(X_test)

y_test = qb_test['smoothed_salary_per_year'].values.ravel()

y_pred = ols_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)
print("R² Score:", r2)

MSE: 0.6443792854712679
RMSE: 0.8027323872071364
MAE: 0.6065788415222657
R² Score: 0.9198923872278869


In [36]:
qb_test['predicted_smoothed_salary'] = y_pred
qb_test.sample(10)

Unnamed: 0,season,player_display_name,games,completions,attempts,passing_yards,passing_tds,interceptions,sacks,sack_fumbles,passing_first_downs,passing_epa,passing_2pt_conversions,pacr,dakota,rushing_yards,rushing_tds,fantasy_points,salary_per_year,smoothed_salary_per_year,predicted_smoothed_salary
4,2024,Jacoby Brissett,7,95,161,826.0,2,1.0,18.0,3,41.0,-38.167678,1,0.755718,0.007904,62.0,0,45.24,6.25,0.715889,-0.513605
3,2024,Brandon Allen,3,17,30,199.0,1,2.0,2.0,2,8.0,-12.354178,0,0.825726,0.004662,4.0,0,6.36,1.4225,-0.322841,-0.817112
14,2024,Justin Fields,11,106,162,1106.0,5,1.0,16.0,2,45.0,8.069526,0,0.880573,0.097063,289.0,5,119.14,20.0,3.67447,2.862154
11,2024,Jarrett Stidham,2,0,0,0.0,0,0.0,0.0,0,0.0,,0,,,5.0,0,0.5,6.0,0.662097,-0.551404
1,2024,Jimmy Garoppolo,1,27,41,334.0,2,1.0,3.0,0,16.0,3.024632,0,1.511312,0.035713,5.0,0,19.86,3.005,0.017664,-0.181023
15,2024,Mac Jones,10,171,262,1672.0,8,8.0,14.0,1,77.0,-2.45747,1,0.86722,0.044203,92.0,1,96.08,3.5,0.124173,1.683162
0,2024,Andy Dalton,6,106,160,989.0,7,6.0,7.0,1,50.0,-18.095891,0,0.931262,0.028268,34.0,0,58.96,4.0,0.231758,0.08107
5,2024,Nick Mullens,4,2,2,38.0,0,0.0,0.0,0,2.0,4.868726,0,1.583333,,-2.0,0,1.32,2.25,-0.144789,-0.415698
17,2024,Skylar Thompson,2,21,33,187.0,0,0.0,6.0,2,8.0,-15.511025,0,0.813043,0.049576,4.0,0,7.88,1.1,-0.392234,-0.886473
9,2024,Josh Allen,19,365,565,4367.0,32,6.0,19.0,6,199.0,150.465796,2,0.92994,0.174354,636.0,14,438.28,55.0,11.205403,12.884521
