In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

In [12]:
wr_train = pd.read_csv('../data/final/wr_train_updated.csv')
wr_test = pd.read_csv('../data/final/wr_test_updated.csv')

In [14]:
# Step 1: Get top 2 seasons per contract
top_seasons_per_contract = (
    wr_train.sort_values(['player_display_name', 'salary_per_year', 'season'], ascending=[True, True, False])
            .groupby(['player_display_name', 'salary_per_year'])
            .head(2)
)

# Step 2: Get the most recent season per contract (to sort contracts per player)
contract_latest = (
    top_seasons_per_contract.groupby(['player_display_name', 'salary_per_year'])['season'].max().reset_index()
)

# Step 3: Get top 2 contracts per player by latest season
top_contracts_per_player = (
    contract_latest.sort_values(['player_display_name', 'season'], ascending=[True, False])
                   .groupby('player_display_name')
                   .head(2)
)

# Step 4: Merge to keep only rows from top 2 contracts
final = top_seasons_per_contract.merge(top_contracts_per_player, on=['player_display_name', 'salary_per_year'])

In [None]:
preds = [col for col in wr_train.columns if col not in ['season', 'player_display_name', 'salary_per_year', 'smoothed_salary_per_year']]
grouped_df = final.groupby(['player_display_name', 'smoothed_salary_per_year'])[preds].mean().reset_index()

Unnamed: 0,player_display_name,smoothed_salary_per_year,games,receptions,targets,receiving_yards,receiving_tds,receiving_yards_after_catch,receiving_first_downs,receiving_epa,target_share,fantasy_points
0,A.J. Brown,22.065259,14.5,72.0,115.0,1084.5,9.0,351.5,54.5,56.385506,0.270404,164.95
1,A.J. Brown,25.122968,18.5,103.5,162.5,1549.0,9.5,538.0,68.5,63.470761,0.292725,207.9
2,A.J. Green,2.261734,17.0,54.0,95.0,848.0,3.0,181.0,37.0,26.662374,0.16436,102.8
3,A.J. Green,5.57516,16.0,47.0,104.0,523.0,2.0,86.0,34.0,-15.264712,0.183746,64.3
4,A.J. Jenkins,-0.612286,7.5,9.0,16.5,125.0,0.0,56.5,5.0,4.435864,0.073488,12.7


In [16]:
X = grouped_df[preds]
X = X.fillna(0)

y = grouped_df['smoothed_salary_per_year']

In [17]:
## lasso for feature selection 

from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
#X_scaled = scaler.fit_transform(X)
y = grouped_df['smoothed_salary_per_year'].values.ravel()

lasso = LassoCV(cv=5, random_state=0)
lasso.fit(X, y)

In [18]:
lasso_coef = pd.Series(lasso.coef_, index=preds)
selected_features = lasso_coef[lasso_coef != 0].index.tolist()
print("Selected Features:", selected_features)

Selected Features: ['games', 'targets', 'receiving_yards', 'receiving_yards_after_catch', 'fantasy_points']


In [19]:
import statsmodels.api as sm

X_selected = X[selected_features]
X_selected = sm.add_constant(X_selected)

y = grouped_df['smoothed_salary_per_year'].values.ravel()
ols_model = sm.OLS(y, X_selected).fit()

print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.548
Model:                            OLS   Adj. R-squared:                  0.545
Method:                 Least Squares   F-statistic:                     215.8
Date:                Thu, 17 Apr 2025   Prob (F-statistic):          8.83e-151
Time:                        21:51:22   Log-Likelihood:                -2269.5
No. Observations:                 897   AIC:                             4551.
Df Residuals:                     891   BIC:                             4580.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

In [20]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

X_test = wr_test[selected_features]
X_test = X_test.fillna(0)
X_test = sm.add_constant(X_test)

y_test = wr_test['smoothed_salary_per_year'].values.ravel()

y_pred = ols_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)
print("R² Score:", r2)

MSE: 5.760024985848884
RMSE: 2.4000052053795393
MAE: 1.4851469058082527
R² Score: 0.6067314907640325


In [21]:
wr_test['predicted_smoothed_salary'] = y_pred
wr_test.sample(10)

Unnamed: 0,season,player_display_name,games,receptions,targets,receiving_yards,receiving_tds,receiving_yards_after_catch,receiving_first_downs,receiving_epa,target_share,fantasy_points,salary_per_year,smoothed_salary_per_year,predicted_smoothed_salary
22,2024,Cody White,1,2,3,44.0,0,13.0,2.0,2.547461,0.088235,4.4,1.03,-0.107225,0.2068
28,2024,Jalen Reagor,7,7,12,100.0,0,26.0,3.0,-7.275862,0.057416,7.7,1.3375,0.09796,-0.832183
47,2024,Ronnie Bell,3,2,6,22.0,0,2.0,1.0,-4.306386,0.069767,2.2,1.03,-0.107225,-0.616127
44,2024,Kendric Pryor,1,1,1,9.0,0,0.0,0.0,0.680168,0.027027,0.9,1.03,-0.107225,-0.199311
31,2024,D'Wayne Eskridge,3,3,3,44.0,0,29.0,2.0,3.129415,0.032258,4.4,1.27,0.05292,-0.027899
39,2024,Bo Melton,12,9,20,107.0,0,7.0,6.0,-4.357463,0.062893,16.1,1.03,-0.107225,-2.125972
32,2024,Dyami Brown,18,44,58,537.0,2,309.0,22.0,11.17255,0.104882,64.3,10.0,5.878179,3.617561
14,2024,KhaDarel Hodge,6,7,12,131.0,1,89.0,6.0,8.256851,0.067039,25.1,2.75,1.040478,1.008279
41,2024,Justyn Ross,1,0,1,0.0,0,0.0,0.0,-1.216528,0.025,0.0,0.96,-0.153934,-0.258048
26,2024,Kristian Wilkerson,1,2,3,18.0,1,11.0,2.0,2.861493,0.081081,7.8,1.1,-0.060516,0.189004
