In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor

# Never elide columns when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [83]:
# Load the cleaned WR train and test tables (paths are relative to this notebook).
wr_train = pd.read_csv('../../data/cleaned/wr_train.csv')
wr_test = pd.read_csv('../../data/cleaned/wr_test.csv')

# Preview five random training rows. The draw is seeded so that a
# Restart & Run All shows the same rows every time instead of a fresh sample.
wr_train.sample(5, random_state=40)

Unnamed: 0,season,player_display_name,games,receptions,targets,receiving_yards,receiving_tds,receiving_yards_after_catch,receiving_first_downs,receiving_epa,target_share,fantasy_points,salary_per_year,smoothed_salary_per_year,mean_adj_salary_per_year,med_adj_salary_per_year,year_signed
1731,2019,Chad Williams,1,0,1,0.0,0,0.0,0.0,-1.084458,0.028571,0.0,0.1428,-1.042785,-0.392195,-1.174359,2020.0
1655,2019,Amari Cooper,16,79,119,1189.0,8,240.0,54.0,52.464829,0.206597,167.5,20.0,18.787277,7.056775,55.39886,2020.0
2059,2021,DeVante Parker,9,40,73,515.0,2,104.0,29.0,24.061897,0.237013,63.5,5.4,6.209127,1.579922,12.262658,2023.0
1109,2016,De'Anthony Thomas,7,7,9,35.0,0,24.0,2.0,-2.197605,0.05,6.4,0.88,-0.374357,-0.115652,2.534419,2018.0
1546,2018,Keelan Cole,13,38,70,491.0,1,136.0,25.0,-4.619933,0.154525,51.1,3.259,2.069156,0.776775,7.703704,2020.0


In [84]:
### keep one row per (player, contract): the latest season on record for that pair

def _latest_season_per_contract(frame):
    """Keep each player/contract pair's last season row, then drop incomplete rows.

    Rows are ordered by ``season``; the final row of every
    (``player_display_name``, ``year_signed``) group is retained; any retained
    row that contains a missing value is then discarded (it is not replaced
    by an earlier season).

    NOTE(review): this selects the most recent season present in the file for
    each pair, which is only "the year prior to the contract" if the cleaned
    data has no later seasons for that pair -- confirm upstream.
    """
    by_season = frame.sort_values('season')
    contract_groups = by_season.groupby(['player_display_name', 'year_signed'])
    return contract_groups.tail(1).dropna()


wr_train = _latest_season_per_contract(wr_train)
wr_test = _latest_season_per_contract(wr_test)

In [85]:
# Report (rows, columns) of the filtered frames: train first, then test.
for split_frame in (wr_train, wr_test):
    print(split_frame.shape)

(1146, 17)
(121, 17)


In [101]:
## every column is a predictor except the season / player identifiers, the four
## salary columns (the candidate targets) and fantasy_points; original column
## order is preserved, and year_signed stays in the predictor list
excluded_columns = {
    'season',
    'player_display_name',
    'salary_per_year',
    'mean_adj_salary_per_year',
    'med_adj_salary_per_year',
    'smoothed_salary_per_year',
    'fantasy_points',
}
preds = [name for name in wr_train.columns if name not in excluded_columns]

X = wr_train[preds]

## pick which target you want to model on by changing target_column.
## other salary columns in the frame: 'salary_per_year' (an older
## 'salary_per_year_y' name came from a join and is not a column of the frame
## loaded above), 'smoothed_salary_per_year', 'med_adj_salary_per_year'
target_column = 'mean_adj_salary_per_year'
y = wr_train[target_column]

## gradient-boosted trees; random_state is fixed so the fit is repeatable
gbm_params = dict(
    n_estimators=300,
    learning_rate=.05,
    max_depth=3,
    random_state=40,
    subsample=.8,
    min_samples_leaf=5,
    min_samples_split=5,
)
gbm = GradientBoostingRegressor(**gbm_params)
gbm.fit(X, y)

In [102]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def _regression_metrics(actual, predicted):
    """Return ``(mse, mae, r2)`` comparing ``predicted`` against ``actual``."""
    return (
        mean_squared_error(actual, predicted),
        mean_absolute_error(actual, predicted),
        r2_score(actual, predicted),
    )


def _print_metrics(title, mse, mae, r2):
    """Print one titled MSE / MAE / R² block, followed by a blank line."""
    print(title)
    print(f"  MSE: {mse:.4f}")
    print(f"  MAE: {mae:.4f}")
    print(f"  R²:  {r2:.4f}\n")


# Same predictor columns the model was fitted on; any missing value becomes 0.
X_test = wr_test[preds]
X_test = X_test.fillna(0)

# Score against the column the model was actually trained on. Taking the name
# from the training target `y` means switching the target in the fitting cell
# can no longer leave this cell evaluating against a different salary column.
y_test = wr_test[y.name].to_numpy()

y_pred = gbm.predict(X_test)
y_train_pred = gbm.predict(X)

mse_train, mae_train, r2_train = _regression_metrics(y, y_train_pred)
mse_test, mae_test, r2_test = _regression_metrics(y_test, y_pred)

# The model label is hard-coded; update it by hand when the target changes.
print('Model: WR GBM -- Mean \n')
_print_metrics("Training Set:", mse_train, mae_train, r2_train)
_print_metrics("Test Set:", mse_test, mae_test, r2_test)


importances = gbm.feature_importances_
feature_names = X.columns.tolist()

# year_signed is left out of the ranking (presumably because it is a time
# control rather than a performance stat -- confirm). The remaining values are
# not renormalised after the removal.
if 'year_signed' in feature_names:
    idx = feature_names.index('year_signed')
    del feature_names[idx]
    importances = np.delete(importances, idx)

# Indices of the remaining features, most important first.
indices = np.argsort(importances)[::-1]

# Show at most three features, but never index past the end when fewer remain.
top_k = min(3, len(feature_names))
print(f"Top {top_k} Feature Importances:")
for rank, feature_idx in enumerate(indices[:top_k], start=1):
    print(f"{rank}. {feature_names[feature_idx]}: {importances[feature_idx]:.4f}")

Model: WR GBM -- Mean 

Training Set:
  MSE: 0.2469
  MAE: 0.3057
  R²:  0.9047

Test Set:
  MSE: 1.7270
  MAE: 0.6703
  R²:  0.8367

Top 3 Feature Importances:
1. receiving_yards: 0.3877
2. target_share: 0.1776
3. receiving_first_downs: 0.1259


In [103]:
## adjust this depending on your target; the column names used so far are
## 'predicted_smoothed_salary', 'predicted_mean_salary', 'predicted_median_salary'
prediction_column = 'predicted_mean_salary'

# Store the test-set predictions next to the rows they were made for
# (re-running overwrites the column instead of adding a second one).
wr_test[prediction_column] = y_pred

In [100]:
# Write the test frame to the dashboard data directory. Pick the file name that
# matches the modelled target:
#   wr_smoothed.csv | wr_mean_adj.csv | wr_med_adj.csv
# NOTE(review): this cell's recorded execution count (100) is lower than the
# fit / predict cells (101-103), so the file on disk may predate the current
# predictions -- re-run this cell after the prediction column is assigned.
dashboard_csv = '../../data/dashboard/wr/wr_mean_adj.csv'
wr_test.to_csv(dashboard_csv)