In [82]:
import pandas as pd
import glob

# Data import

In [83]:
df = pd.read_csv('data/NBA_Player_Dataset-2013-2021.csv')

box_cox_transformer = 0.16106145323461019 #need to pull from data_prep.ipynb

# Scale Data

In [156]:
#First scaling 2021 season to a full season
#roughly 10 games left for all teams out of 82

scale_factor = 1 + 10/82

#columns to scale

cols_to_scale = ['GP', 'FTA', '2PA', '3PA']

df[cols_to_scale] = df[cols_to_scale] * scale_factor

In [157]:
#variables we need to scale

x_to_scale = df[['AGE', 'GP', 'MPG', 'MIN%', 'USG%',
       'FTA', 'FT%', '2PA', '2P%', '3PA', '3P%', 'eFG%', 'TS%', 'PPG', 'RPG',
       'TRB%', 'APG', 'AST%', 'SPG', 'BPG', 'VI', 'TO_100_Games']]

#scaler function
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x_to_scale)

x_scaled = scaler.transform(x_to_scale)

x_scaled_df = pd.DataFrame(x_scaled, columns = x_to_scale.columns)

In [158]:
#add the categorical features to scaled features for input
X_with_year = pd.concat([x_scaled_df, df[['Year', 'Center', 'Forward', 'Guard']]], axis = 1)

#choose our target variable
#y = df['Salary']
y = df['Salary_BoxCox']

# Read in results from models_final.ipynb

In [159]:
df_list = []

for file in glob.glob('results/*.csv'):
    temp_df = pd.read_csv(file)
    df_list.append(temp_df)

results_df = pd.concat(df_list, axis = 0)

In [160]:
#Remove the columns specific to certain models, all parameters are stored in 'params' column

results_df = results_df[['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'params', 
                        'mean_test_score', 'std_test_score', 'rank_test_score', 'Model Type', 'Training Data']]

In [161]:
#so far best model is RF with full dataset.  
results_df.sort_values('mean_test_score', ascending = True).to_csv('model_results.csv', index = False)
results_df.sort_values('mean_test_score', ascending = True).head(15)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,mean_test_score,std_test_score,rank_test_score,Model Type,Training Data
9,6.598142,0.402557,0.102035,0.010738,"{'max_depth': 25, 'max_features': 10, 'n_estim...",9.15513,0.215472,1,Random Forest,Full Dataset
11,9.665229,0.637191,0.075954,0.00636,"{'max_depth': 25, 'max_features': 20, 'n_estim...",9.174885,0.193764,2,Random Forest,Full Dataset
3,4.772411,0.690266,0.084805,0.012553,"{'max_depth': 10, 'max_features': 10, 'n_estim...",9.193707,0.231902,3,Random Forest,Full Dataset
8,2.73806,0.348547,0.0498,0.013287,"{'max_depth': 25, 'max_features': 10, 'n_estim...",9.195151,0.17875,4,Random Forest,Full Dataset
4,3.501711,0.512405,0.034989,0.001272,"{'max_depth': 10, 'max_features': 20, 'n_estim...",9.204922,0.226176,5,Random Forest,Full Dataset
5,9.083368,1.173974,0.094001,0.035769,"{'max_depth': 10, 'max_features': 20, 'n_estim...",9.207776,0.194854,6,Random Forest,Full Dataset
2,2.037275,0.327192,0.038315,0.008223,"{'max_depth': 10, 'max_features': 10, 'n_estim...",9.211908,0.247775,7,Random Forest,Full Dataset
7,4.118319,0.53139,0.118671,0.022865,"{'max_depth': 25, 'max_features': 5, 'n_estima...",9.227281,0.207246,8,Random Forest,Full Dataset
10,4.781488,0.240398,0.044402,0.004756,"{'max_depth': 25, 'max_features': 20, 'n_estim...",9.25222,0.204945,9,Random Forest,Full Dataset
6,1.541104,0.04305,0.044734,0.004706,"{'max_depth': 25, 'max_features': 5, 'n_estima...",9.275502,0.217739,10,Random Forest,Full Dataset


In [162]:
#worst models
results_df.sort_values('mean_test_score', ascending = True).tail(15)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,mean_test_score,std_test_score,rank_test_score,Model Type,Training Data
41,1.368311,0.016403,0.278649,0.012914,"{'C': 1, 'gamma': 'scale', 'kernel': 'sigmoid'}",50.436944,4.067735,57,SVM,Full Dataset
44,1.268169,0.04297,0.2526,0.022811,"{'C': 1, 'gamma': 'auto', 'kernel': 'sigmoid'}",100.884112,5.798606,56,SVM,Subset Dataset
41,1.27201,0.061068,0.250168,0.016202,"{'C': 1, 'gamma': 'scale', 'kernel': 'sigmoid'}",100.918061,6.711417,57,SVM,Subset Dataset
50,1.371754,0.026226,0.280721,0.013525,"{'C': 10, 'gamma': 'auto', 'kernel': 'sigmoid'}",463.562443,36.662448,58,SVM,Full Dataset
47,1.368837,0.053628,0.267229,0.007819,"{'C': 10, 'gamma': 'scale', 'kernel': 'sigmoid'}",520.176297,46.029433,59,SVM,Full Dataset
47,1.372343,0.08879,0.265795,0.008835,"{'C': 10, 'gamma': 'scale', 'kernel': 'sigmoid'}",1002.214085,67.673235,58,SVM,Subset Dataset
50,1.411286,0.124818,0.263104,0.005914,"{'C': 10, 'gamma': 'auto', 'kernel': 'sigmoid'}",1002.429532,59.529284,59,SVM,Subset Dataset
56,1.385706,0.047284,0.277602,0.005083,"{'C': 100, 'gamma': 'auto', 'kernel': 'sigmoid'}",3913.28795,685.142338,60,SVM,Full Dataset
53,1.406578,0.056377,0.272704,0.009103,"{'C': 100, 'gamma': 'scale', 'kernel': 'sigmoid'}",4729.386173,510.797848,61,SVM,Full Dataset
56,1.341692,0.035011,0.257934,0.007764,"{'C': 100, 'gamma': 'auto', 'kernel': 'sigmoid'}",10009.943693,588.147454,60,SVM,Subset Dataset


In [163]:
#parameters for best model
best_params = results_df.sort_values('mean_test_score', ascending = True).iloc[0]['params']
best_params

"{'max_depth': 25, 'max_features': 10, 'n_estimators': 250}"

# Train model on previous years, predict on 2021 season

In [164]:
X_train = X_with_year[X_with_year['Year'] != 2021].drop('Year', axis = 1)
y_train = df['Salary_BoxCox'][df['Year'] != 2021]

X_test = X_with_year[X_with_year['Year'] == 2021].drop('Year', axis = 1)
y_test = df['Salary_BoxCox'][df['Year'] == 2021]

In [165]:
X_train[cols_to_scale]

Unnamed: 0,GP,FTA,2PA,3PA
537,0.485019,0.023672,-0.168080,-0.912200
538,-1.714062,-0.894620,-1.012270,-0.905284
539,0.362847,0.232375,0.129870,-0.898367
540,0.607190,2.060612,1.990144,-0.863785
541,-0.940312,-0.502259,-0.210098,-0.365804
...,...,...,...,...
4205,0.932979,1.826865,1.333128,0.381168
4206,0.607190,1.484592,1.023718,1.501626
4207,1.218045,1.017098,2.750298,1.100474
4208,1.340217,0.775002,0.504217,-0.912200


### Should remove outliers here

In [166]:
#remove outliers

### Train our best model

In [167]:
from sklearn.ensemble import RandomForestRegressor

rand_forest = RandomForestRegressor(max_depth =  25, max_features =  10, n_estimators = 250)

In [168]:
rand_forest.fit(X_train, y_train)

RandomForestRegressor(max_depth=25, max_features=10, n_estimators=250)

### Predict

In [169]:
y_pred = rand_forest.predict(X_test)

In [170]:
from scipy.special import inv_boxcox

y_pred_salary = inv_boxcox(y_pred, box_cox_transformer)
y_test_salary = inv_boxcox(y_test, box_cox_transformer)

In [171]:
results = pd.DataFrame(zip(y_test_salary, y_pred_salary), columns = ['True Salary', 'Predicted Salary'])
results['Error'] = results['Predicted Salary'] - results['True Salary']

In [172]:
player_team = df[['FULL NAME', 'TEAM']][df['Year'] == 2021]

In [173]:
predicting_2021 = pd.merge(player_team, results, left_index=True, right_index=True)

In [174]:
predicting_2021

Unnamed: 0,FULL NAME,TEAM,True Salary,Predicted Salary,Error
0,Precious Achiuwa,Tor,2.693401e+06,2.615641e+06,-7.776038e+04
1,Steven Adams,Mem,1.693989e+07,1.246445e+07,-4.475439e+06
2,Bam Adebayo,Mia,2.787475e+07,1.548094e+07,-1.239380e+07
3,LaMarcus Aldridge,Bro,2.624316e+06,9.018845e+06,6.394529e+06
4,Nickeil Alexander-Walker,Nor,3.239581e+06,2.330614e+06,-9.089661e+05
...,...,...,...,...,...
532,Thaddeus Young,San,1.408097e+07,5.774636e+06,-8.306335e+06
533,Trae Young,Atl,8.265438e+06,7.996446e+06,-2.689915e+05
534,Omer Yurtseven,Mia,1.479823e+06,1.898720e+06,4.188967e+05
535,Cody Zeller,Por,2.374079e+06,4.247613e+06,1.873534e+06


In [175]:
#Overpaid players (worst value players)
predicting_2021.sort_values('Error', ascending = True).head(25)

Unnamed: 0,FULL NAME,TEAM,True Salary,Predicted Salary,Error
503,Kemba Walker,Nyk,34677690.0,6309393.0,-28368300.0
188,Blake Griffin,Bro,32138950.0,4808967.0,-27329980.0
513,Russell Westbrook,Lal,43837830.0,19986340.0,-23851490.0
481,Klay Thompson,Gol,37663890.0,14553670.0,-23110220.0
305,Kevin Love,Cle,31001590.0,9878006.0,-21123590.0
519,Andrew Wiggins,Gol,31319870.0,13402950.0,-17916920.0
200,Tobias Harris,Phi,35696980.0,19001740.0,-16695240.0
133,Goran Dragic,Tor,19196240.0,3315976.0,-15880270.0
437,D'Angelo Russell,Min,29767880.0,14077800.0,-15690080.0
407,Kristaps Porzingis,Dal,31390450.0,15752450.0,-15638000.0


In [176]:
#Underpaid Players (Best value players)
predicting_2021.sort_values('Error', ascending = False).head(25)

Unnamed: 0,FULL NAME,TEAM,True Salary,Predicted Salary,Error
243,Reggie Jackson,Lac,10306860.0,17668780.0,7361917.0
134,Andre Drummond,Phi,2385890.0,9691409.0,7305519.0
69,Jalen Brunson,Dal,1790651.0,8657813.0,6867162.0
3,LaMarcus Aldridge,Bro,2624316.0,9018845.0,6394529.0
469,Jae'Sean Tate,Hou,1508541.0,7591026.0,6082485.0
406,Bobby Portis,Mil,4317591.0,10306490.0,5988898.0
12,Carmelo Anthony,Lal,2624316.0,8181101.0,5556785.0
171,Shai Gilgeous-Alexander,Okc,5456754.0,10885430.0,5428676.0
493,Jonas Valanciunas,Nor,13892560.0,19283360.0,5390799.0
402,Jakob Poeltl,San,8685575.0,13189990.0,4504414.0


In [177]:
#Best Predicted players
predicting_2021['Absolute Error'] = abs(predicting_2021['Error'])
predicting_2021.sort_values('Absolute Error', ascending = True).head(25)

Unnamed: 0,FULL NAME,TEAM,True Salary,Predicted Salary,Error,Absolute Error
129,Damyean Dotson,Nyk,204642.7,203681.8,-960.842779,960.842779
314,Naji Marshall,Nor,1508541.0,1514576.0,6034.734946,6034.734946
460,Cassius Stanley,Det,255426.3,262413.3,6986.975612,6986.975612
500,Dean Wade,Cle,1771351.0,1761892.0,-9458.949282,9458.949282
531,Moses Wright,Lac,171983.0,181824.2,9841.25368,9841.25368
491,Rayjon Tucker,Den,190917.3,201344.7,10427.412578,10427.412578
303,Robin Lopez,Orl,4965029.0,4975529.0,10499.406462,10499.406462
509,Tremont Waters,Tor,190917.3,174642.7,-16274.626762,16274.626762
79,Facundo Campazzo,Den,3178553.0,3159637.0,-18916.478512,18916.478512
212,Haywood Highsmith,Mia,544651.3,521263.6,-23387.726285,23387.726285


In [178]:
#Group by team
team_predictions = predicting_2021.groupby('TEAM').sum().reset_index()

### The fact that just about every team has a lower predicted salary tells me we need to scale 2021 to a full season... 

In [179]:
#Sorted by most overpaid/underperforming teams.  Looks about right.  
team_predictions.sort_values('Error', ascending = True)

Unnamed: 0,TEAM,True Salary,Predicted Salary,Error,Absolute Error
2,Bro,192055100.0,126057600.0,-65997500.0,91179820.0
19,Nyk,132868800.0,73867890.0,-59000910.0,63882130.0
5,Cle,132602400.0,75209990.0,-57392410.0,61508410.0
17,Min,136081700.0,79687830.0,-56393860.0,67916850.0
9,Gol,165471000.0,113002400.0,-52468660.0,83180380.0
13,Lal,172339200.0,120124900.0,-52214320.0,86726890.0
0,Atl,130736600.0,79612590.0,-51124040.0,58489910.0
28,Uta,152784600.0,102756900.0,-50027700.0,57204100.0
27,Tor,120163000.0,70879270.0,-49283780.0,52102550.0
1,Bos,123681400.0,75669650.0,-48011710.0,49460430.0
