In [82]:
import pandas as pd
import glob

# Data import

In [83]:
df = pd.read_csv('data/NBA_Player_Dataset-2013-2021.csv')

box_cox_transformer = 0.16106145323461019 #need to pull from data_prep.ipynb

# Scale Data

In [84]:
#variables we need to scale

x_to_scale = df[['AGE', 'GP', 'MPG', 'MIN%', 'USG%',
       'FTA', 'FT%', '2PA', '2P%', '3PA', '3P%', 'eFG%', 'TS%', 'PPG', 'RPG',
       'TRB%', 'APG', 'AST%', 'SPG', 'BPG', 'VI', 'TO_100_Games']]

#scaler function
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x_to_scale)

x_scaled = scaler.transform(x_to_scale)

x_scaled_df = pd.DataFrame(x_scaled, columns = x_to_scale.columns)

In [85]:
#add the categorical features to scaled features for input
X_with_year = pd.concat([x_scaled_df, df[['Year', 'Center', 'Forward', 'Guard']]], axis = 1)

#choose our target variable
#y = df['Salary']
y = df['Salary_BoxCox']

# Read in results from models_final.ipynb

In [86]:
df_list = []

for file in glob.glob('results/*.csv'):
    temp_df = pd.read_csv(file)
    df_list.append(temp_df)

results_df = pd.concat(df_list, axis = 0)

In [87]:
#Remove the columns specific to certain models, all parameters are stored in 'params' column

results_df = results_df[['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'params', 
                        'mean_test_score', 'std_test_score', 'rank_test_score', 'Model Type', 'Training Data']]

In [88]:
#so far best model is RF with full dataset.  
results_df.sort_values('mean_test_score', ascending = True).to_csv('model_results.csv', index = False)
results_df.sort_values('mean_test_score', ascending = True).head(15)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,mean_test_score,std_test_score,rank_test_score,Model Type,Training Data
9,5.725135,0.047653,0.090401,0.003772,"{'max_depth': 25, 'max_features': 10, 'n_estim...",9.215617,0.849504,1,Random Forest,Full Dataset
3,4.205863,0.07262,0.0708,0.003544,"{'max_depth': 10, 'max_features': 10, 'n_estim...",9.230938,0.861173,2,Random Forest,Full Dataset
8,2.375351,0.035147,0.037999,0.001264,"{'max_depth': 25, 'max_features': 10, 'n_estim...",9.234943,0.831438,3,Random Forest,Full Dataset
11,8.722401,0.308318,0.070528,0.004964,"{'max_depth': 25, 'max_features': 20, 'n_estim...",9.240034,0.865563,4,Random Forest,Full Dataset
5,7.454335,0.122594,0.072349,0.006241,"{'max_depth': 10, 'max_features': 20, 'n_estim...",9.260602,0.893415,5,Random Forest,Full Dataset
7,3.529395,0.055916,0.09495,0.003066,"{'max_depth': 25, 'max_features': 5, 'n_estima...",9.262326,0.877414,6,Random Forest,Full Dataset
2,1.706747,0.048957,0.04,0.014183,"{'max_depth': 10, 'max_features': 10, 'n_estim...",9.263493,0.88508,7,Random Forest,Full Dataset
10,4.127105,0.048558,0.03718,0.001332,"{'max_depth': 25, 'max_features': 20, 'n_estim...",9.266665,0.880879,8,Random Forest,Full Dataset
49,1.310396,0.019908,0.604287,0.018062,"{'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}",9.274834,0.973603,1,SVM,Full Dataset
46,1.314544,0.016748,0.601202,0.009714,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}",9.280998,0.968731,2,SVM,Full Dataset


In [89]:
#worst models
results_df.sort_values('mean_test_score', ascending = True).tail(15)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,mean_test_score,std_test_score,rank_test_score,Model Type,Training Data
41,1.290662,0.017405,0.264293,0.006952,"{'C': 1, 'gamma': 'scale', 'kernel': 'sigmoid'}",45.506384,7.131972,57,SVM,Full Dataset
44,1.239022,0.013881,0.237146,0.001699,"{'C': 1, 'gamma': 'auto', 'kernel': 'sigmoid'}",100.125252,5.81508,56,SVM,Subset Dataset
41,1.216151,0.014955,0.238001,0.008742,"{'C': 1, 'gamma': 'scale', 'kernel': 'sigmoid'}",100.198039,6.523099,57,SVM,Subset Dataset
50,1.304272,0.016399,0.2592,0.005153,"{'C': 10, 'gamma': 'auto', 'kernel': 'sigmoid'}",419.306408,49.107381,58,SVM,Full Dataset
47,1.282089,0.019177,0.263037,0.00648,"{'C': 10, 'gamma': 'scale', 'kernel': 'sigmoid'}",473.120078,67.644174,59,SVM,Full Dataset
50,1.208386,0.008968,0.237458,0.001804,"{'C': 10, 'gamma': 'auto', 'kernel': 'sigmoid'}",995.57481,49.704919,58,SVM,Subset Dataset
47,1.216775,0.018423,0.240801,0.003188,"{'C': 10, 'gamma': 'scale', 'kernel': 'sigmoid'}",996.822272,56.6864,59,SVM,Subset Dataset
56,1.30821,0.102762,0.256767,0.004269,"{'C': 100, 'gamma': 'auto', 'kernel': 'sigmoid'}",4187.710436,474.24684,60,SVM,Full Dataset
53,1.287463,0.024872,0.265584,0.018219,"{'C': 100, 'gamma': 'scale', 'kernel': 'sigmoid'}",4328.790909,836.781366,61,SVM,Full Dataset
56,1.241107,0.017614,0.241732,0.002708,"{'C': 100, 'gamma': 'auto', 'kernel': 'sigmoid'}",9942.33419,486.112016,60,SVM,Subset Dataset


In [90]:
#parameters for best model
best_params = results_df.sort_values('mean_test_score', ascending = True).iloc[0]['params']
best_params

"{'max_depth': 25, 'max_features': 10, 'n_estimators': 250}"

# Train model on previous years, predict on 2021 season

In [73]:
X_train = X_with_year[X_with_year['Year'] != 2021].drop('Year', axis = 1)
y_train = df['Salary_BoxCox'][df['Year'] != 2021]

X_test = X_with_year[X_with_year['Year'] == 2021].drop('Year', axis = 1)
y_test = df['Salary_BoxCox'][df['Year'] == 2021]

In [81]:
X_train

Unnamed: 0,AGE,GP,MPG,MIN%,USG%,FTA,FT%,2PA,2P%,3PA,...,TRB%,APG,AST%,SPG,BPG,VI,TO_100_Games,Center,Forward,Guard
537,-1.152357,0.485019,-0.850477,-0.846832,0.150362,0.023672,-0.935786,-0.168080,0.466387,-0.912200,...,1.154529,-0.775768,-0.759268,-0.720273,0.136975,0.036277,0.097337,0,1,0
538,-0.355872,-1.714062,-1.877441,-1.874131,-0.024440,-0.894620,-3.398072,-1.012270,-2.521726,-0.905284,...,-0.277260,-0.886557,-0.052945,-1.502357,-0.962850,-0.647607,-2.077924,0,0,1
539,0.305895,0.362847,0.835906,0.834203,-1.213095,0.232375,-1.250223,0.129870,1.049818,-0.898367,...,1.406059,-0.000243,-0.433273,0.701697,0.615160,0.292734,0.774085,1,0,0
540,-0.639487,0.607190,1.462895,1.461996,0.884531,2.060612,0.467088,1.990144,0.679260,-0.863785,...,0.999741,1.938568,1.500964,1.270485,1.499802,2.130672,0.339033,1,1,0
541,2.196659,-0.940312,0.646729,0.655203,0.465006,-0.502259,1.047588,-0.210098,0.091887,-0.365804,...,-0.083775,0.138243,-0.090978,-0.341081,2.695265,0.506447,-0.563298,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4205,-1.544690,0.932979,0.468361,0.450262,1.618701,1.826865,-0.297236,1.333128,0.040640,0.381168,...,-0.606184,0.609097,0.881574,1.057189,-0.436847,0.677418,0.774085,0,0,1
4206,0.346074,0.607190,0.879147,0.870521,1.461379,1.484592,0.592863,1.023718,-0.148581,1.501626,...,-1.012503,-0.221822,-0.433273,0.227707,-0.508575,-0.305665,-0.692202,0,0,1
4207,-0.362963,1.218045,1.538566,1.508692,0.989413,1.017098,0.046226,2.750298,0.072177,1.100474,...,-0.122472,0.221335,-0.129011,3.545637,0.136975,0.292734,-0.353828,0,1,0
4208,-1.308345,1.340217,-0.309970,-0.327994,0.080441,0.775002,0.133301,0.504217,-0.471833,-0.912200,...,0.786907,-0.443400,-0.281142,-0.341081,0.232612,0.207248,0.339033,1,0,0


In [76]:
from sklearn.ensemble import RandomForestRegressor

rand_forest = RandomForestRegressor(max_depth =  25, max_features =  10, n_estimators = 250)

In [77]:
rand_forest.fit(X_train, y_train)

RandomForestRegressor(max_depth=25, max_features=10, n_estimators=250)

In [91]:
y_pred = rand_forest.predict(X_test)

In [94]:
from scipy.special import inv_boxcox

y_pred_salary = inv_boxcox(y_pred, box_cox_transformer)
y_test_salary = inv_boxcox(y_test, box_cox_transformer)

In [135]:
results = pd.DataFrame(zip(y_test_salary, y_pred_salary), columns = ['True Salary', 'Predicted Salary'])
results['Error'] = results['Predicted Salary'] - results['True Salary']

In [136]:
player_team = df[['FULL NAME', 'TEAM']][df['Year'] == 2021]

In [137]:
predicting_2021 = pd.merge(player_team, results, left_index=True, right_index=True)

In [138]:
predicting_2021

Unnamed: 0,FULL NAME,TEAM,True Salary,Predicted Salary,Error
0,Precious Achiuwa,Tor,2.693401e+06,2.651795e+06,-4.160573e+04
1,Steven Adams,Mem,1.693989e+07,1.309893e+07,-3.840954e+06
2,Bam Adebayo,Mia,2.787475e+07,1.491731e+07,-1.295743e+07
3,LaMarcus Aldridge,Bro,2.624316e+06,8.721541e+06,6.097226e+06
4,Nickeil Alexander-Walker,Nor,3.239581e+06,2.489418e+06,-7.501623e+05
...,...,...,...,...,...
532,Thaddeus Young,San,1.408097e+07,5.549210e+06,-8.531762e+06
533,Trae Young,Atl,8.265438e+06,8.680063e+06,4.146254e+05
534,Omer Yurtseven,Mia,1.479823e+06,1.827956e+06,3.481326e+05
535,Cody Zeller,Por,2.374079e+06,4.781456e+06,2.407376e+06


In [145]:
#Overpaid players (worst value players)
predicting_2021.sort_values('Error', ascending = True).head(25)

Unnamed: 0,FULL NAME,TEAM,True Salary,Predicted Salary,Error
503,Kemba Walker,Nyk,34677690.0,7127757.0,-27549930.0
188,Blake Griffin,Bro,32138950.0,4760307.0,-27378640.0
481,Klay Thompson,Gol,37663890.0,13835670.0,-23828220.0
305,Kevin Love,Cle,31001590.0,8449786.0,-22551810.0
513,Russell Westbrook,Lal,43837830.0,22013030.0,-21824800.0
519,Andrew Wiggins,Gol,31319870.0,13631490.0,-17688380.0
200,Tobias Harris,Phi,35696980.0,18208180.0,-17488800.0
437,D'Angelo Russell,Min,29767880.0,14292600.0,-15475280.0
133,Goran Dragic,Tor,19196240.0,3737722.0,-15458520.0
407,Kristaps Porzingis,Dal,31390450.0,16208690.0,-15181760.0


In [146]:
#Underpaid Players (Best value players)
predicting_2021.sort_values('Error', ascending = False).head(25)

Unnamed: 0,FULL NAME,TEAM,True Salary,Predicted Salary,Error
134,Andre Drummond,Phi,2385890.0,9806653.0,7420763.0
69,Jalen Brunson,Dal,1790651.0,9075288.0,7284636.0
243,Reggie Jackson,Lac,10306860.0,17123710.0,6816845.0
406,Bobby Portis,Mil,4317591.0,10946870.0,6629280.0
3,LaMarcus Aldridge,Bro,2624316.0,8721541.0,6097226.0
493,Jonas Valanciunas,Nor,13892560.0,19851940.0,5959388.0
171,Shai Gilgeous-Alexander,Okc,5456754.0,11070260.0,5613506.0
469,Jae'Sean Tate,Hou,1508541.0,7097733.0,5589192.0
12,Carmelo Anthony,Lal,2624316.0,7905524.0,5281208.0
59,Miles Bridges,Cha,5383286.0,10534160.0,5150876.0


In [149]:
#Best Predicted players
predicting_2021['Absolute Error'] = abs(predicting_2021['Error'])
predicting_2021.sort_values('Absolute Error', ascending = True).head(25)

Unnamed: 0,FULL NAME,TEAM,True Salary,Predicted Salary,Error,Absolute Error
190,Kyle Guy,Mia,422898.2,423192.0,293.823164,293.823164
512,Quinndary Weatherspoon,Gol,344661.4,344987.3,325.885443,325.885443
507,Yuta Watanabe,Tor,1751664.0,1750437.0,-1226.910025,1226.910025
392,Eric Paschall,Uta,1771351.0,1772888.0,1537.680927,1537.680927
270,Georgios Kalaitzakis,Mil,460102.2,458305.4,-1796.836916,1796.836916
323,Miles McBride,Nyk,919797.8,922133.9,2336.010708,2336.010708
509,Tremont Waters,Tor,190917.3,194277.0,3359.675254,3359.675254
531,Moses Wright,Lac,171983.0,182476.3,10493.283737,10493.283737
129,Damyean Dotson,Nyk,204642.7,218339.6,13696.913762,13696.913762
242,Justin Jackson,Bos,306886.4,323792.5,16906.144471,16906.144471


In [141]:
#Group by team
team_predictions = predicting_2021.groupby('TEAM').sum().reset_index()

### The fact that just about every team has a lower predicted salary tells me we need to scale 2021 to a full season... 

In [142]:
team_predictions.sort_values('Error', ascending = True)

Unnamed: 0,TEAM,True Salary,Predicted Salary,Error
2,Bro,192055100.0,127719000.0,-64336120.0
5,Cle,132602400.0,71324800.0,-61277600.0
19,Nyk,132868800.0,73791590.0,-59077210.0
17,Min,136081700.0,78112650.0,-57969040.0
9,Gol,165471000.0,112396400.0,-53074650.0
13,Lal,172339200.0,121594900.0,-50744290.0
0,Atl,130736600.0,80980230.0,-49756410.0
28,Uta,152784600.0,103530000.0,-49254660.0
1,Bos,123681400.0,75488620.0,-48192740.0
27,Tor,120163000.0,74060670.0,-46102380.0
