In [234]:
import pandas as pd
import numpy as np

import plotly.express as px
from scipy.special import inv_boxcox

# Load the data

In [235]:
df = pd.read_csv('data/NBA_Player_Dataset-2013-2021.csv')

box_cox_transformer = 0.16106145323461019 #need to pull from data_prep.ipynb

In [236]:
#Can be used to filter out earlier years
#df = df[~df['Year'].isin([2013, 2014, 2015])]

# Scale the data

In [237]:
df.columns

Index(['Year', 'FULL NAME', 'TEAM', 'POS', 'AGE', 'GP', 'MPG', 'MIN%', 'USG%',
       'FTA', 'FT%', '2PA', '2P%', '3PA', '3P%', 'eFG%', 'TS%', 'PPG', 'RPG',
       'TRB%', 'APG', 'AST%', 'SPG', 'BPG', 'VI', 'ORTG', 'DRTG',
       'TO_100_Games', 'Salary', 'Cap Maximum', 'Salary%OfCap', 'Traded',
       'Center', 'Forward', 'Guard', 'Salary_Scaled', 'Salary_BoxCox',
       'Salary_Scaled_BoxCox'],
      dtype='object')

In [238]:
#variables we need to scale

x_to_scale = df[['AGE', 'GP', 'MPG', 'MIN%', 'USG%',
       'FTA', 'FT%', '2PA', '2P%', '3PA', '3P%', 'eFG%', 'TS%', 'PPG', 'RPG',
       'TRB%', 'APG', 'AST%', 'SPG', 'BPG', 'VI', 'TO_100_Games']]

In [239]:
#scaler function
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x_to_scale)

x_scaled = scaler.transform(x_to_scale)

x_scaled_df = pd.DataFrame(x_scaled, columns = x_to_scale.columns)

In [240]:
x_scaled_df

Unnamed: 0,AGE,GP,MPG,MIN%,USG%,FTA,FT%,2PA,2P%,3PA,...,TS%,PPG,RPG,TRB%,APG,AST%,SPG,BPG,VI,TO_100_Games
0,1.054938,0.482581,1.246624,1.249235,1.006838,1.273851,0.917979,0.808159,0.245669,1.166583,...,0.789723,1.547901,0.546992,-0.257996,4.043414,3.665499,3.171747,-0.651949,2.259476,0.178108
1,-0.783816,-1.268083,-0.180363,-0.172433,0.394914,-0.644776,0.148347,-0.722653,-0.567050,-0.642869,...,-0.274237,-0.179525,-0.789344,-0.935353,-0.554199,-0.596403,-0.175151,-0.436866,-0.732954,-0.095976
2,0.345907,0.930425,0.316920,0.299727,0.272529,0.214435,1.150321,0.498943,-0.306664,0.310201,...,0.012569,0.130108,-0.708354,-1.032118,0.775231,1.023554,-0.080204,-0.723643,0.079277,0.016882
3,-1.067428,-0.168829,-0.104690,-0.104981,0.202595,-0.160948,0.276619,-0.337087,1.022880,-0.235397,...,0.729586,0.178997,1.134170,1.677307,-0.720378,-0.966058,-0.804178,0.901426,-0.027596,-0.418428
4,-0.367851,-0.535247,-1.277630,-1.272409,-0.828935,-0.628092,-0.202586,-0.741740,1.780366,-0.815526,...,1.784295,-0.912867,-0.445136,1.174128,-0.720378,-0.324598,-1.053415,-0.711694,0.635014,0.823012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4205,-0.363124,1.011851,1.452024,1.461966,2.178236,3.551179,0.859894,2.495487,0.332464,1.753619,...,0.752716,2.753840,0.870952,-0.064466,0.886017,0.632155,2.768220,0.805834,1.276249,-0.611899
4206,-0.363124,-1.064517,-1.115473,-1.111563,-0.339395,-0.819955,-1.381236,-0.848630,0.056297,-0.746463,...,-0.801592,-0.912867,-0.688106,0.158094,-0.526503,0.061365,-0.780441,-0.651949,0.271647,-0.757002
4207,-1.213961,-1.064517,-1.369520,-1.365803,-0.426813,-0.794930,0.230635,-0.871534,1.665954,-0.891495,...,1.215308,-0.912867,-0.384393,2.006309,-0.886557,-0.694253,-0.982204,0.805834,0.335771,0.065250
4208,0.109564,0.441868,0.479078,0.481327,0.290013,0.439666,0.985746,0.193544,-0.211979,0.545016,...,0.179102,0.407148,-0.789344,-1.206295,-0.332627,-0.465937,0.821797,-0.723643,-0.775703,-0.692512


In [241]:
#add the categorical features to scaled features for input
X = pd.concat([x_scaled_df, df[['Center', 'Forward', 'Guard']]], axis = 1)

#choose our target variable
#y = df['Salary']
y = df['Salary_BoxCox']

# Feature Selection
SelectKBest
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html

### Scorer for regression
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html#sklearn.feature_selection.f_regression

In [242]:
#remove highly correlated variables first from data explore correlation section

X_subset = X.drop(['MIN%', 'FTA', 'MPG', 'AST%', 'PPG'], axis = 1)

In [243]:
#Use SelectKbest and f_regression for variable score

from sklearn.feature_selection import SelectKBest, f_regression

selectKBest = SelectKBest(f_regression, k=6)

selectKBest.fit(X_subset, y)

X_best_features = selectKBest.transform(X_subset)
x_best_feature_names = selectKBest.get_feature_names_out()

In [244]:
X_best_features_df = pd.DataFrame(X_best_features, columns = x_best_feature_names)
X_best_features_df.head()

Unnamed: 0,GP,2PA,RPG,APG,SPG,VI
0,0.482581,0.808159,0.546992,4.043414,3.171747,2.259476
1,-1.268083,-0.722653,-0.789344,-0.554199,-0.175151,-0.732954
2,0.930425,0.498943,-0.708354,0.775231,-0.080204,0.079277
3,-0.168829,-0.337087,1.13417,-0.720378,-0.804178,-0.027596
4,-0.535247,-0.74174,-0.445136,-0.720378,-1.053415,0.635014


# Grid Search CV

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

https://scikit-learn.org/stable/modules/grid_search.html

# Random Forest

In [245]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

#call the model you want to use, define any parameters you want to hold steady across the search
random_forest = RandomForestRegressor(verbose = 1)

#Create a search grid of parameters
rf_param_grid = [
  {'n_estimators': [100, 250], 'max_depth': [10, 25], 'max_features': [5, 10, 20]},
 ]

#Create grid search
rf_gridsearch = GridSearchCV(random_forest, rf_param_grid, 
                            scoring = 'neg_root_mean_squared_error', 
                            verbose = 2, 
                            n_jobs = -1)

#grid search maximizes, so scorer needs to be negative in this case.  Real MSE is just the positive version

In [246]:
#Utilize full dataset for random forest
rf_gridsearch.fit(X, y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:    5.5s finished


GridSearchCV(estimator=RandomForestRegressor(verbose=1), n_jobs=-1,
             param_grid=[{'max_depth': [10, 25], 'max_features': [5, 10, 20],
                          'n_estimators': [100, 250]}],
             scoring='neg_root_mean_squared_error', verbose=2)

In [247]:
random_forest_results = pd.DataFrame(rf_gridsearch.cv_results_)

#replace -mse with +mse
random_forest_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] = random_forest_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] * -1.0

#add label to pull merge data later
random_forest_results['Model Type'] = 'Random Forest'
random_forest_results['Training Data'] = 'Full Dataset'

In [248]:
#write to csv to save results
random_forest_results.to_csv('results/rf_results_all_vars.csv', index = False)

# KNN
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html

Will try with full and subset of X data

In [249]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor()

knn_param_grid = [
  {'n_neighbors': [5, 10, 20, 50], 'weights':['uniform', 'distance'], 'p': [1, 2]},
 ]

knn_grid_search = GridSearchCV(knn, knn_param_grid, verbose = 2, n_jobs = -1, scoring = 'neg_root_mean_squared_error')

In [250]:
#Try KNN with full
knn_grid_search.fit(X, y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


GridSearchCV(estimator=KNeighborsRegressor(), n_jobs=-1,
             param_grid=[{'n_neighbors': [5, 10, 20, 50], 'p': [1, 2],
                          'weights': ['uniform', 'distance']}],
             scoring='neg_root_mean_squared_error', verbose=2)

In [251]:
knn_results = pd.DataFrame(knn_grid_search.cv_results_)

#replace -mse with +mse
knn_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] =\
knn_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] * -1.0

#add label to pull merge data later
knn_results['Model Type'] = 'KNN'
knn_results['Training Data'] = 'Full Dataset'

In [252]:
#write to csv to save results
knn_results.to_csv('results/knn_results_all_vars.csv', index = False)

### KNN Subset

In [253]:
#Try KNN with subset of X
knn_grid_search.fit(X_best_features_df, y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


GridSearchCV(estimator=KNeighborsRegressor(), n_jobs=-1,
             param_grid=[{'n_neighbors': [5, 10, 20, 50], 'p': [1, 2],
                          'weights': ['uniform', 'distance']}],
             scoring='neg_root_mean_squared_error', verbose=2)

In [254]:
knn_results2 = pd.DataFrame(knn_grid_search.cv_results_)

#replace -mse with +mse
knn_results2[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] =\
knn_results2[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] * -1.0

#add label to pull merge data later
knn_results2['Model Type'] = 'KNN'
knn_results2['Training Data'] = 'Subset Dataset'

In [255]:
#write to csv to save results
knn_results2.to_csv('results/knn_results_subset_vars.csv', index = False)

# Support Vector Machine
https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html

In [256]:
from sklearn.svm import SVR

svm = SVR()

svm_param_grid = [
  {'kernel': ['linear'], 'degree': [2, 3, 4], 'C': [.001, .01, .1, 1, 10, 100, 1000]},
  {'kernel': ['poly', 'rbf', 'sigmoid'], 'gamma': ['scale', 'auto'], 'C': [.001, .01, .1, 1, 10, 100, 1000]}
]

svm_grid_search = GridSearchCV(svm, svm_param_grid, 
                                verbose = 2, 
                                n_jobs = -1, 
                                scoring = 'neg_root_mean_squared_error')

In [257]:
#Try KNN with full and subset
svm_grid_search.fit(X, y)

Fitting 5 folds for each of 63 candidates, totalling 315 fits


GridSearchCV(estimator=SVR(), n_jobs=-1,
             param_grid=[{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                          'degree': [2, 3, 4], 'kernel': ['linear']},
                         {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                          'gamma': ['scale', 'auto'],
                          'kernel': ['poly', 'rbf', 'sigmoid']}],
             scoring='neg_root_mean_squared_error', verbose=2)

In [258]:
svm_results = pd.DataFrame(svm_grid_search.cv_results_)

#replace -mse with +mse
svm_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] =\
svm_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] * -1.0

#add label to pull merge data later
svm_results['Model Type'] = 'SVM'
svm_results['Training Data'] = 'Full Dataset'

In [259]:
#write to csv to save results
svm_results.to_csv('results/svm_results_all_vars.csv', index = False)

### SVM Subset X

In [260]:
#Try KNN with full and subset
svm_grid_search.fit(X_best_features_df, y)

Fitting 5 folds for each of 63 candidates, totalling 315 fits


GridSearchCV(estimator=SVR(), n_jobs=-1,
             param_grid=[{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                          'degree': [2, 3, 4], 'kernel': ['linear']},
                         {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                          'gamma': ['scale', 'auto'],
                          'kernel': ['poly', 'rbf', 'sigmoid']}],
             scoring='neg_root_mean_squared_error', verbose=2)

In [261]:
svm_results2 = pd.DataFrame(svm_grid_search.cv_results_)

#replace -mse with +mse
svm_results2[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] =\
svm_results2[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] * -1.0

#add label to pull merge data later
svm_results2['Model Type'] = 'SVM'
svm_results2['Training Data'] = 'Subset Dataset'

In [262]:
#write to csv to save results
svm_results2.to_csv('results/svm_results_subset_vars.csv', index = False)

# Linear Regression (OLS)
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

In [263]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()

linreg_param_grid = [
  {'fit_intercept': [True]}
]

linreg_grid_search = GridSearchCV(linreg, linreg_param_grid, 
                                verbose = 2, 
                                n_jobs = -1, 
                                scoring = 'neg_root_mean_squared_error')

In [264]:
#Linear Regression with subset dataset
linreg_grid_search.fit(X_best_features_df, y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


GridSearchCV(estimator=LinearRegression(), n_jobs=-1,
             param_grid=[{'fit_intercept': [True]}],
             scoring='neg_root_mean_squared_error', verbose=2)

In [265]:
linreg_results = pd.DataFrame(linreg_grid_search.cv_results_)

#replace -mse with +mse
linreg_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] =\
linreg_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] * -1.0

#add label to pull merge data later
linreg_results['Model Type'] = 'Linear Regression'
linreg_results['Training Data'] = 'Subset Dataset'

In [266]:
#write to csv to save results
linreg_results.to_csv('results/linreg_results_subset_vars.csv', index = False)

# Ridge Regression

In [267]:
from sklearn.linear_model import Ridge

ridge = Ridge()

ridge_param_grid = [
  {'alpha': [.001, .01, .1, 1, 10]}
]

ridge_grid_search = GridSearchCV(ridge, ridge_param_grid, 
                                verbose = 2, 
                                n_jobs = -1, 
                                scoring = 'neg_root_mean_squared_error')

In [268]:
#Full dataset with ridge - will self select features
ridge_grid_search.fit(X, y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


GridSearchCV(estimator=Ridge(), n_jobs=-1,
             param_grid=[{'alpha': [0.001, 0.01, 0.1, 1, 10]}],
             scoring='neg_root_mean_squared_error', verbose=2)

In [269]:
ridge_results = pd.DataFrame(ridge_grid_search.cv_results_)

#replace -mse with +mse
ridge_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] =\
ridge_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] * -1.0

#add label to pull merge data later
ridge_results['Model Type'] = 'Ridge Regression'
ridge_results['Training Data'] = 'Full Dataset'

In [270]:
#write to csv to save results
ridge_results.to_csv('results/ridge_results_all_vars.csv', index = False)

# Lasso Regression
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso

In [271]:
from sklearn.linear_model import Lasso

lasso = Lasso()

lasso_param_grid = [
  {'alpha': [.001, .01, .1, 1, 10]}
]

lasso_grid_search = GridSearchCV(lasso, lasso_param_grid, 
                                verbose = 2, 
                                n_jobs = -1, 
                                scoring = 'neg_root_mean_squared_error')

In [272]:
#Full dataset with lasso - will self select features
lasso_grid_search.fit(X, y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


  model = cd_fast.enet_coordinate_descent(


GridSearchCV(estimator=Lasso(), n_jobs=-1,
             param_grid=[{'alpha': [0.001, 0.01, 0.1, 1, 10]}],
             scoring='neg_root_mean_squared_error', verbose=2)

In [273]:
lasso_results = pd.DataFrame(lasso_grid_search.cv_results_)

#replace -mse with +mse
lasso_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] =\
lasso_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] * -1.0

#add label to pull merge data later
lasso_results['Model Type'] = 'Lasso Regression'
lasso_results['Training Data'] = 'Full Dataset'

In [274]:
#write to csv to save results
lasso_results.to_csv('results/lasso_results_all_vars.csv', index = False)

# Elastic Net

In [275]:
from sklearn.linear_model import ElasticNet

elastic_net = ElasticNet(max_iter = 5000)

elastic_net_param_grid = [
  {'alpha': [.001, .01, .1, 1, 10], 'l1_ratio': [.1, .3, .5, .7, .9]}
]

elastic_net_grid_search = GridSearchCV(elastic_net, elastic_net_param_grid, 
                                verbose = 2, 
                                n_jobs = -1, 
                                scoring = 'neg_root_mean_squared_error')

In [276]:
#Full dataset with lasso - will self select features
elastic_net_grid_search.fit(X, y)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


  model = cd_fast.enet_coordinate_descent(


GridSearchCV(estimator=ElasticNet(max_iter=5000), n_jobs=-1,
             param_grid=[{'alpha': [0.001, 0.01, 0.1, 1, 10],
                          'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]}],
             scoring='neg_root_mean_squared_error', verbose=2)

In [277]:
elastic_net_results = pd.DataFrame(elastic_net_grid_search.cv_results_)

#replace -mse with +mse
elastic_net_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] =\
elastic_net_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] * -1.0

#add label to pull merge data later
elastic_net_results['Model Type'] = 'Elastic Net Regression'
elastic_net_results['Training Data'] = 'Full Dataset'

In [278]:
#write to csv to save results
elastic_net_results.to_csv('results/elastic_results_all_vars.csv', index = False)

# AdaBoost

In [279]:
from sklearn.ensemble import AdaBoostRegressor

ada_boost = AdaBoostRegressor()

ada_boost_param_grid = [
  {'n_estimators': [50, 100, 250, 500], 'learning_rate': [.001, .01, .1, 1]}
]

ada_boost_grid_search = GridSearchCV(ada_boost, ada_boost_param_grid, 
                                verbose = 2, 
                                n_jobs = -1, 
                                scoring = 'neg_root_mean_squared_error')

In [280]:
#Full dataset with ada - will self select features
ada_boost_grid_search.fit(X, y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


GridSearchCV(estimator=AdaBoostRegressor(), n_jobs=-1,
             param_grid=[{'learning_rate': [0.001, 0.01, 0.1, 1],
                          'n_estimators': [50, 100, 250, 500]}],
             scoring='neg_root_mean_squared_error', verbose=2)

In [281]:
ada_results = pd.DataFrame(ada_boost_grid_search.cv_results_)

#replace -mse with +mse
ada_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] =\
ada_results[['split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score']] * -1.0

#add label to pull merge data later
ada_results['Model Type'] = 'AdaBoost DT Regression'
ada_results['Training Data'] = 'Full Dataset'

In [282]:
#write to csv to save results
ada_results.to_csv('results/ada_results_all_vars.csv', index = False)