In [145]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [173]:
# Read the cleaned running-back train/test CSVs (train first, then test).
# The *_orig frames are left unmodified; later cells read the salary columns
# back out of them.
rb_train_orig, rb_test_orig = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/cleaned/rb_train.csv',
        '../../data/cleaned/rb_test.csv',
    )
)

In [174]:
### keep, per (player, contract year), only the row with the latest season
# NOTE(review): the original intent is "season data for the year prior to the
# contract"; that only holds if the input contains no seasons at or after
# year_signed -- confirm against the cleaning step.

def _latest_season_complete_rows(frame):
    """Return the last-season row of every player/contract group, without NaN rows.

    The frame is ordered by ``season``, the final row of each
    ``(player_display_name, year_signed)`` group is kept, and any kept row
    that still contains a missing value is dropped.
    """
    ordered = frame.sort_values('season')
    latest = ordered.groupby(['player_display_name', 'year_signed']).tail(1)
    return latest.dropna()


rb_train = _latest_season_complete_rows(rb_train_orig)
rb_test = _latest_season_complete_rows(rb_test_orig)

In [175]:
# Columns that are identifiers, salary columns, or deliberately left out
# (passing_epa); everything else in rb_train is averaged below.
_excluded_from_mean = {
    'season', 'player_display_name', 'salary_per_year',
    'smoothed_salary_per_year', 'year_signed', 'passing_epa',
}
preds = [col for col in rb_train.columns if col not in _excluded_from_mean]


def _mean_features_with_salary(latest_rows, source_rows, feature_cols):
    """Average the feature columns per contract and re-attach the salary columns.

    Steps, in order:
      1. mean of ``feature_cols`` per (player_display_name, year_signed);
      2. right-merge onto the salary columns of ``source_rows`` so the result
         follows the row order of ``source_rows``;
      3. keep the first row per (player_display_name, year_signed);
      4. drop rows with any missing value (this also removes contracts that
         have no feature row in ``latest_rows``).
    """
    contract_key = ['player_display_name', 'year_signed']
    salary_cols = ['salary_per_year', 'smoothed_salary_per_year']

    averaged = latest_rows.groupby(contract_key)[feature_cols].mean().reset_index()
    merged = averaged.merge(
        source_rows[contract_key + salary_cols],
        on=contract_key,
        how='right',
    )
    return merged.drop_duplicates(subset=contract_key).dropna()


rb_train = _mean_features_with_salary(rb_train, rb_train_orig, preds)
rb_test = _mean_features_with_salary(rb_test, rb_test_orig, preds)

In [176]:
# Sanity check: (rows, columns) of the modelling frames, train then test.
print(rb_train.shape, rb_test.shape, sep='\n')

(635, 21)
(69, 21)


In [185]:
# Feature columns: every column of rb_train except identifiers, the candidate
# salary targets and fantasy_points. year_signed is NOT excluded here, so it
# is passed to the model as a feature.
_non_feature_cols = (
    'season', 'player_display_name', 'salary_per_year',
    'mean_adj_salary_per_year', 'med_adj_salary_per_year',
    'smoothed_salary_per_year', 'fantasy_points',
)
preds = [name for name in rb_train.columns if name not in _non_feature_cols]

X = rb_train.loc[:, preds]

### pick which target you want to model on ###
# Other targets that have been used here: 'salary_per_year_y',
# 'smoothed_salary_per_year', 'med_adj_salary_per_year'.
# NOTE(review): 'salary_per_year_y' looks stale -- the merge in the earlier
# cell does not appear to produce a suffixed column; confirm before using it.
y = rb_train['mean_adj_salary_per_year']

# Shallow trees with a small learning rate; subsample < 1 makes this stochastic
# gradient boosting, so the fit depends on random_state and on row order.
_gbm_params = dict(
    n_estimators=200,
    learning_rate=.05,
    max_depth=2,
    random_state=40,
    subsample=.8,
    min_samples_leaf=5,
    min_samples_split=5,
)
gbm = GradientBoostingRegressor(**_gbm_params)
gbm.fit(X, y)

In [186]:
# Evaluate the fitted GBM on the training and test frames and list the most
# important features. The metric functions are imported in the notebook's
# top import cell.

# Zero-fill any missing feature value so predict() never sees NaN.
X_test = rb_test[preds].fillna(0)

# Must be the same target column the model was fitted on. Other targets that
# have been used here: 'salary_per_year_y', 'smoothed_salary_per_year',
# 'med_adj_salary_per_year'.
y_test = rb_test['mean_adj_salary_per_year'].values.ravel()

y_pred = gbm.predict(X_test)
y_train_pred = gbm.predict(X)

mse_train = mean_squared_error(y, y_train_pred)
mae_train = mean_absolute_error(y, y_train_pred)
r2_train = r2_score(y, y_train_pred)

mse_test = mean_squared_error(y_test, y_pred)
mae_test = mean_absolute_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

# Keep this label in sync with the target selected above.
print('Model: RB GBM -- Mean \n')
print("Training Set:")
print(f"  MSE: {mse_train:.4f}")
print(f"  MAE: {mae_train:.4f}")
print(f"  R²:  {r2_train:.4f}\n")

print("Test Set:")
print(f"  MSE: {mse_test:.4f}")
print(f"  MAE: {mae_test:.4f}")
print(f"  R²:  {r2_test:.4f}\n")


importances = gbm.feature_importances_
feature_names = X.columns.tolist()

# year_signed is a model input but is left out of the ranking; the remaining
# importances are not re-normalised after it is removed.
if 'year_signed' in feature_names:
    idx = feature_names.index('year_signed')
    del feature_names[idx]
    importances = np.delete(importances, idx)

# Descending order of importance. Slicing (rather than a fixed range(3))
# avoids an IndexError when fewer than three features remain.
indices = np.argsort(importances)[::-1]
print("Top 3 Feature Importances:")
for rank, feat_idx in enumerate(indices[:3], start=1):
    print(f"{rank}. {feature_names[feat_idx]}: {importances[feat_idx]:.4f}")

Model: RB GBM -- Median 

Training Set:
  MSE: 0.5179
  MAE: 0.4567
  R²:  0.7992

Test Set:
  MSE: 1.6174
  MAE: 0.7210
  R²:  0.6605

Top 3 Feature Importances:
1. rushing_yards: 0.5000
2. target_share: 0.1377
3. receiving_yards_after_catch: 0.1110


In [187]:
## adjust this depending on your target
# Column names used for the other targets: predicted_smoothed_salary,
# predicted_median_salary.
rb_test = rb_test.assign(predicted_mean_salary=y_pred)

In [188]:
# Write the test frame (including the prediction column) for the dashboard.
# File names used for the other targets: rb_smoothed.csv, rb_med_adj.csv
# (same directory). The DataFrame index is written as the first column,
# since index=False is not passed.
rb_test.to_csv(path_or_buf='../../data/dashboard/rb/rb_mean_adj.csv')