In [38]:
import pandas as pd
import numpy as np

import plotly.express as px
from scipy.special import inv_boxcox

# Load the data

In [39]:
df = pd.read_csv('data/NBA_Player_Dataset-2013-2021.csv')

box_cox_transformer = 0.16106145323461019 #need to pull from data_prep.ipynb

In [40]:
#Can be used to filter out earlier years
#df = df[~df['Year'].isin([2013, 2014, 2015])]

# Scale the data

In [41]:
df.columns

Index(['Year', 'FULL NAME', 'TEAM', 'POS', 'AGE', 'GP', 'MPG', 'MIN%', 'USG%',
       'FTA', 'FT%', '2PA', '2P%', '3PA', '3P%', 'eFG%', 'TS%', 'PPG', 'RPG',
       'TRB%', 'APG', 'AST%', 'SPG', 'BPG', 'VI', 'ORTG', 'DRTG',
       'TO_100_Games', 'Salary', 'Cap Maximum', 'Salary%OfCap', 'Traded',
       'Center', 'Forward', 'Guard', 'Salary_Scaled', 'Salary_BoxCox',
       'Salary_Scaled_BoxCox'],
      dtype='object')

In [42]:
#variables we need to scale

x_to_scale = df[['Year', 'AGE', 'GP', 'MPG', 'MIN%', 'USG%',
       'FTA', 'FT%', '2PA', '2P%', '3PA', '3P%', 'eFG%', 'TS%', 'PPG', 'RPG',
       'TRB%', 'APG', 'AST%', 'SPG', 'BPG', 'VI', 'TO_100_Games']]

In [43]:
#scaler function
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x_to_scale)

x_scaled = scaler.transform(x_to_scale)

x_scaled_df = pd.DataFrame(x_scaled, columns = x_to_scale.columns)

In [44]:
x_scaled_df

Unnamed: 0,Year,AGE,GP,MPG,MIN%,USG%,FTA,FT%,2PA,2P%,...,TS%,PPG,RPG,TRB%,APG,AST%,SPG,BPG,VI,TO_100_Games
0,1.482654,-0.953826,0.525742,0.392690,0.393190,-0.076881,0.015324,-0.558461,0.469838,-0.195886,...,-0.293161,0.032396,1.275948,1.077135,-0.443400,-0.672336,-0.269983,0.352158,0.079020,-0.208811
1,1.482654,0.504425,0.770085,0.706184,0.704493,-1.108214,0.641433,-0.771311,0.297944,0.521576,...,0.326449,-0.244682,2.531283,1.793029,0.719887,0.251317,0.441002,0.830343,1.104846,1.096346
2,1.482654,-0.440957,-0.125837,1.387224,1.389359,1.146735,1.376067,0.249401,1.287289,0.490040,...,0.622382,1.694868,2.693262,1.444756,0.886071,0.533846,2.052568,0.902071,1.745987,0.258468
3,1.482654,2.395189,-0.166561,0.306209,0.315365,0.709729,-0.043113,0.825063,0.530956,0.734450,...,0.724109,0.798437,0.790012,0.632119,-0.554190,-0.726669,-0.767672,1.475893,0.249991,-0.821106
4,1.482654,-0.705664,0.444295,-0.126197,-0.117865,0.482486,0.006976,0.546907,0.259745,-0.483659,...,-0.547479,-0.049097,-0.404581,-0.374002,0.055151,0.229584,-0.175185,-0.149936,0.356848,-0.144358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4205,-1.606858,-1.544690,0.932979,0.468361,0.450262,1.618701,1.826865,-0.297236,1.333128,0.040640,...,-0.330153,0.716944,-0.181861,-0.606184,0.609097,0.881574,1.057189,-0.436847,0.677418,0.774085
4206,-1.606858,0.346074,0.607190,0.879147,0.870521,1.461379,1.484592,0.592863,1.023718,-0.148581,...,0.326449,1.515582,-0.424829,-1.012503,-0.221822,-0.433273,0.227707,-0.508575,-0.305665,-0.692202
4207,-1.606858,-0.362963,1.218045,1.538566,1.508692,0.989413,1.017098,0.046226,2.750298,0.072177,...,-0.154443,1.515582,0.951991,-0.122472,0.221335,-0.129011,3.545637,0.136975,0.292734,-0.353828
4208,-1.606858,-1.308345,1.340217,-0.309970,-0.327994,0.080441,0.775002,0.133301,0.504217,-0.471833,...,-0.283913,-0.423968,0.263581,0.786907,-0.443400,-0.281142,-0.341081,0.232612,0.207248,0.339033


In [45]:
#add the categorical features to scaled features for input
X = pd.concat([x_scaled_df, df[['Center', 'Forward', 'Guard']]], axis = 1)

#choose our target variable
#y = df['Salary']
y = df['Salary_BoxCox']

# Feature Selection
SelectKBest
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html

### Scorer for regression
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html#sklearn.feature_selection.f_regression

In [47]:
#remove highly correlated variables first from data explore correlation section

X_subset = X.drop(['MIN%', 'FTA', 'MPG', 'AST%', 'PPG'], axis = 1)

In [48]:
#Use SelectKbest and f_regression for variable score

from sklearn.feature_selection import SelectKBest, f_regression

selectKBest = SelectKBest(f_regression, k=6)

selectKBest.fit(X_subset, y)

X_best_features = selectKBest.transform(X_subset)
x_best_feature_names = selectKBest.get_feature_names_out()

In [49]:
X_best_features_df = pd.DataFrame(X_best_features, columns = x_best_feature_names)
X_best_features_df.head()

Unnamed: 0,GP,2PA,RPG,APG,SPG,VI
0,0.525742,0.469838,1.275948,-0.4434,-0.269983,0.07902
1,0.770085,0.297944,2.531283,0.719887,0.441002,1.104846
2,-0.125837,1.287289,2.693262,0.886071,2.052568,1.745987
3,-0.166561,0.530956,0.790012,-0.55419,-0.767672,0.249991
4,0.444295,0.259745,-0.404581,0.055151,-0.175185,0.356848


# Grid Search CV

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

https://scikit-learn.org/stable/modules/grid_search.html

# Random Forest

In [69]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

#call the model you want to use, define any parameters you want to hold steady across the search
random_forest = RandomForestRegressor(verbose = 1)

#Create a search grid of parameters
rf_param_grid = [
  {'n_estimators': [100, 250], 'max_depth': [10, 25], 'max_features': [5, 10, 20]},
 ]

#Create grid search
rf_gridsearch = GridSearchCV(random_forest, rf_param_grid, 
                            scoring = 'neg_root_mean_squared_error', 
                            verbose = 2, 
                            n_jobs = -1)

#grid search maximizes, so scorer needs to be negative in this case.  Real MSE is just the positive version

In [70]:
#Utilize full dataset for random forest
rf_gridsearch.fit(X, y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:    5.3s finished


GridSearchCV(estimator=RandomForestRegressor(verbose=1), n_jobs=-1,
             param_grid=[{'max_depth': [10, 25], 'max_features': [5, 10, 20],
                          'n_estimators': [100, 250]}],
             scoring='neg_root_mean_squared_error', verbose=2)

In [71]:
random_forest_results = pd.DataFrame(rf_gridsearch.cv_results_)

#replace -mse with +mse
random_forest_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] = random_forest_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] * -1.0

#add label to pull merge data later
random_forest_results['Model Type'] = 'Random Forest'
random_forest_results['Training Data'] = 'Full Dataset'

In [72]:
#write to csv to save results
random_forest_results.to_csv('results/rf_results_all_vars.csv', index = False)

# KNN
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html

Will try with full and subset of X data

In [73]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor()

knn_param_grid = [
  {'n_neighbors': [5, 10, 20, 50], 'weights':['uniform', 'distance'], 'p': [1, 2]},
 ]

knn_grid_search = GridSearchCV(knn, knn_param_grid, verbose = 2, n_jobs = -1, scoring = 'neg_root_mean_squared_error')

In [74]:
#Try KNN with full
knn_grid_search.fit(X, y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


GridSearchCV(estimator=KNeighborsRegressor(), n_jobs=-1,
             param_grid=[{'n_neighbors': [5, 10, 20, 50], 'p': [1, 2],
                          'weights': ['uniform', 'distance']}],
             scoring='neg_root_mean_squared_error', verbose=2)

In [75]:
knn_results = pd.DataFrame(knn_grid_search.cv_results_)

#replace -mse with +mse
knn_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] =\
knn_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] * -1.0

#add label to pull merge data later
knn_results['Model Type'] = 'KNN'
knn_results['Training Data'] = 'Full Dataset'

In [76]:
#write to csv to save results
knn_results.to_csv('results/knn_results_all_vars.csv', index = False)

### KNN Subset

In [78]:
#Try KNN with subset of X
knn_grid_search.fit(X_best_features_df, y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


GridSearchCV(estimator=KNeighborsRegressor(), n_jobs=-1,
             param_grid=[{'n_neighbors': [5, 10, 20, 50], 'p': [1, 2],
                          'weights': ['uniform', 'distance']}],
             scoring='neg_root_mean_squared_error', verbose=2)

In [80]:
knn_results2 = pd.DataFrame(knn_grid_search.cv_results_)

#replace -mse with +mse
knn_results2[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] =\
knn_results2[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] * -1.0

#add label to pull merge data later
knn_results2['Model Type'] = 'KNN'
knn_results2['Training Data'] = 'Subset Dataset'

In [81]:
#write to csv to save results
knn_results2.to_csv('results/knn_results_subset_vars.csv', index = False)

# Support Vector Machine
https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html

In [84]:
from sklearn.svm import SVR

svm = SVR()

svm_param_grid = [
  {'kernel': ['linear'], 'degree': [2, 3, 4], 'C': [.001, .01, .1, 1, 10, 100, 1000]},
  {'kernel': ['poly', 'rbf', 'sigmoid'], 'gamma': ['scale', 'auto'], 'C': [.001, .01, .1, 1, 10, 100, 1000]}
]

svm_grid_search = GridSearchCV(svm, svm_param_grid, 
                                verbose = 2, 
                                n_jobs = -1, 
                                scoring = 'neg_root_mean_squared_error')

In [85]:
#Try KNN with full and subset
svm_grid_search.fit(X, y)

Fitting 5 folds for each of 63 candidates, totalling 315 fits


GridSearchCV(estimator=SVR(), n_jobs=-1,
             param_grid=[{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                          'degree': [2, 3, 4], 'kernel': ['linear']},
                         {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                          'gamma': ['scale', 'auto'],
                          'kernel': ['poly', 'rbf', 'sigmoid']}],
             scoring='neg_root_mean_squared_error', verbose=2)

In [86]:
svm_results = pd.DataFrame(svm_grid_search.cv_results_)

#replace -mse with +mse
svm_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] =\
svm_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] * -1.0

#add label to pull merge data later
svm_results['Model Type'] = 'SVM'
svm_results['Training Data'] = 'Full Dataset'

In [87]:
#write to csv to save results
svm_results.to_csv('results/svm_results_all_vars.csv', index = False)

### SVM Subset X

In [88]:
#Try KNN with full and subset
svm_grid_search.fit(X_best_features_df, y)

Fitting 5 folds for each of 63 candidates, totalling 315 fits


GridSearchCV(estimator=SVR(), n_jobs=-1,
             param_grid=[{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                          'degree': [2, 3, 4], 'kernel': ['linear']},
                         {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                          'gamma': ['scale', 'auto'],
                          'kernel': ['poly', 'rbf', 'sigmoid']}],
             scoring='neg_root_mean_squared_error', verbose=2)

In [89]:
svm_results2 = pd.DataFrame(svm_grid_search.cv_results_)

#replace -mse with +mse
svm_results2[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] =\
svm_results2[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] * -1.0

#add label to pull merge data later
svm_results2['Model Type'] = 'SVM'
svm_results2['Training Data'] = 'Subset Dataset'

In [90]:
#write to csv to save results
svm_results2.to_csv('results/svm_results_subset_vars.csv', index = False)

# Linear Regression (OLS)
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

In [91]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()

linreg_param_grid = [
  {'fit_intercept': [True]}
]

linreg_grid_search = GridSearchCV(linreg, linreg_param_grid, 
                                verbose = 2, 
                                n_jobs = -1, 
                                scoring = 'neg_root_mean_squared_error')

In [92]:
#Linear Regression with subset dataset
linreg_grid_search.fit(X_best_features_df, y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


GridSearchCV(estimator=LinearRegression(), n_jobs=-1,
             param_grid=[{'fit_intercept': [True]}],
             scoring='neg_root_mean_squared_error', verbose=2)

In [93]:
linreg_results = pd.DataFrame(linreg_grid_search.cv_results_)

#replace -mse with +mse
linreg_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] =\
linreg_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] * -1.0

#add label to pull merge data later
linreg_results['Model Type'] = 'Linear Regression'
linreg_results['Training Data'] = 'Subset Dataset'

In [94]:
#write to csv to save results
linreg_results.to_csv('results/linreg_results_subset_vars.csv', index = False)

# Ridge Regression

In [95]:
from sklearn.linear_model import Ridge

ridge = Ridge()

ridge_param_grid = [
  {'alpha': [.001, .01, .1, 1, 10]}
]

ridge_grid_search = GridSearchCV(ridge, ridge_param_grid, 
                                verbose = 2, 
                                n_jobs = -1, 
                                scoring = 'neg_root_mean_squared_error')

In [96]:
#Full dataset with ridge - will self select features
ridge_grid_search.fit(X, y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


GridSearchCV(estimator=Ridge(), n_jobs=-1,
             param_grid=[{'alpha': [0.001, 0.01, 0.1, 1, 10]}],
             scoring='neg_root_mean_squared_error', verbose=2)

In [97]:
ridge_results = pd.DataFrame(ridge_grid_search.cv_results_)

#replace -mse with +mse
ridge_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] =\
ridge_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] * -1.0

#add label to pull merge data later
ridge_results['Model Type'] = 'Ridge Regression'
ridge_results['Training Data'] = 'Full Dataset'

In [103]:
#write to csv to save results
ridge_results.to_csv('results/ridge_results_all_vars.csv', index = False)

# Lasso Regression
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso

In [99]:
from sklearn.linear_model import Lasso

lasso = Lasso()

lasso_param_grid = [
  {'alpha': [.001, .01, .1, 1, 10]}
]

lasso_grid_search = GridSearchCV(lasso, lasso_param_grid, 
                                verbose = 2, 
                                n_jobs = -1, 
                                scoring = 'neg_root_mean_squared_error')

In [100]:
#Full dataset with lasso - will self select features
lasso_grid_search.fit(X, y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


GridSearchCV(estimator=Lasso(), n_jobs=-1,
             param_grid=[{'alpha': [0.001, 0.01, 0.1, 1, 10]}],
             scoring='neg_root_mean_squared_error', verbose=2)

In [101]:
lasso_results = pd.DataFrame(lasso_grid_search.cv_results_)

#replace -mse with +mse
lasso_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] =\
lasso_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] * -1.0

#add label to pull merge data later
lasso_results['Model Type'] = 'Lasso Regression'
lasso_results['Training Data'] = 'Full Dataset'

In [104]:
#write to csv to save results
lasso_results.to_csv('results/lasso_results_all_vars.csv', index = False)

# Elastic Net

# Notes

Once we come up with best performing model, can grab the best estimator from all the training.

Train the model on all previous season data.  Apply a scaling factor where appropriate on current season data, then predict on it.  

Inaccuracies aren't exactly *wrong*.  We can consider the results what a player is actually worth.  Doing some testing on the simple models, Steph Curry in 2019 was worst prediction.  Something like 30M difference.  But that season he was hurt and only played 5 games.  So maybe he wasn't worth his 40M contract?  

Anyway we can see what our model thinks a player is worth, highlight some highs and lows.  Then do a groupby on teams and do the same.  