# Creating Priors for 2016/17 season

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# read in all our training data

# MAIN training set for after we've validated
main_train_rookies = pd.read_csv("../data/pre_2016_17/main_train_rookies.csv")
main_train_rookies.drop(main_train_rookies.columns[0], axis = 1, inplace = True)

main_train_vets = pd.read_csv("../data/pre_2016_17/main_train_vets.csv")
main_train_vets.drop(main_train_vets.columns[0], axis = 1, inplace = True)

# training set before validation
train_rookies = pd.read_csv("../data/pre_2016_17/train_rookies.csv")
train_rookies.drop(train_rookies.columns[0], axis = 1, inplace = True)

train_vets = pd.read_csv("../data/pre_2016_17/train_vets.csv")
train_vets.drop(train_vets.columns[0], axis = 1, inplace = True)

# validation dataset
validate_rookies = pd.read_csv("../data/pre_2016_17/validate_rookies.csv")
validate_rookies.drop(validate_rookies.columns[0], axis = 1, inplace = True)

validate_vets = pd.read_csv("../data/pre_2016_17/validate_vets.csv")
validate_vets.drop(validate_vets.columns[0], axis = 1, inplace = True)

In [3]:
# FIRST - with team rating included as a covariate

# x and y for training
x_rookies1 = np.array(train_rookies[['rating', 'mu']])
y_rookies = np.array(train_rookies['coefs'])

x_vets1 = np.array(train_vets[['rating', 'mu']])
y_vets = np.array(train_vets['coefs'])

# x and y for validation
x_rookies_validate1 = np.array(validate_rookies[['rating', 'mu']])
y_rookies_validate = np.array(validate_rookies['coefs'])

x_vets_validate1 = np.array(validate_vets[['rating', 'mu']])
y_vets_validate = np.array(validate_vets['coefs'])

# SECOND - without team rating as a covariate
# Note that we don't need to change the y variables since they stay the same regardless of the covariates
x_rookies2 = np.array(train_rookies['mu']).reshape(-1, 1)
x_vets2 = np.array(train_vets['mu']).reshape(-1, 1)
x_rookies_validate2 = np.array(validate_rookies['mu']).reshape(-1, 1)
x_vets_validate2 = np.array(validate_vets['mu']).reshape(-1, 1)

# Now create dataset for main training sets
x_main_rookies = np.array(main_train_rookies['mu']).reshape(-1, 1)
y_main_rookies = np.array(main_train_rookies['coefs'])
x_main_vets = np.array(main_train_vets['mu']).reshape(-1, 1)
y_main_vets = np.array(main_train_vets['coefs']).reshape(-1, 1)


In [4]:
rf_rookie2 = RandomForestRegressor(max_depth = 2, n_estimators = 200).fit(x_rookies2, y_rookies)

preds_rookie_rf2 = rf_rookie2.predict(x_rookies_validate2)
mse_rf_rookie2 = np.mean((y_rookies_validate - preds_rookie_rf2)**2)

rf_vet2 = RandomForestRegressor(max_depth = 2, n_estimators = 50).fit(x_rookies2, y_rookies)

preds_vet_rf2 = rf_vet2.predict(x_vets_validate2)
mse_rf_vet2 = np.mean((y_vets_validate - preds_vet_rf2)**2)


In [5]:
# read in contract data for 2015/16 season which will be used as the new data in our model to get priors

newdata_vets = pd.read_csv("../data/Contract+team2016_NonRookie.csv")
newdata_rookies = pd.read_csv("../data/Contract+team2016_Rookie.csv")

newdata_vets.drop(newdata_vets.columns[0], axis = 1, inplace = True)
newdata_rookies.drop(newdata_rookies.columns[0], axis = 1, inplace = True)

In [6]:
x_final_rookies = np.array(newdata_rookies['mu']).reshape(-1, 1)
x_final_vets = np.array(newdata_vets['mu']).reshape(-1, 1)

In [7]:
# train rookie model and veteran model on all of our main data

rf_rookie2 = RandomForestRegressor(max_depth = 2, n_estimators = 200).fit(x_main_rookies, y_main_rookies)

rf_vet2 = RandomForestRegressor(max_depth = 2, n_estimators = 50).fit(x_main_vets, y_main_vets)

# NOTE - keep the MSE's from validation set and this will be used as our standard error in the priors
mse_vets = mse_rf_vet2
mse_rookies = mse_rf_rookie2

priors_rookies_means = rf_rookie2.predict(x_final_rookies)
priors_vets_means = rf_vet2.predict(x_final_vets)

sigma_rookies = np.sqrt(mse_rookies)
sigma_vets = np.sqrt(mse_vets)

newdata_vets['finalpriors'] = priors_vets_means
newdata_rookies['finalpriors'] = priors_rookies_means

newdata_vets['finalse'] = sigma_vets
newdata_rookies['finalse'] = sigma_rookies

  """


In [8]:
# Now add player id and index columns by merging with the player index map for 2015/16

player_index_map_2016 = pd.read_csv("../data/player_index_map_2016-17.csv")
player_index_map_2016.drop(player_index_map_2016.columns[0], axis = 1, inplace = True)

player_index_map_2016.head()

Unnamed: 0,player_id,index,player_name
0,203585.0,0,Rodney McGruder
1,202337.0,1,Luke Babbitt
2,201609.0,2,Goran Dragic
3,201961.0,3,Wayne Ellington
4,2754.0,4,Tony Allen


In [9]:
newdata_vets = newdata_vets.merge(player_index_map_2016, how = "inner", left_on = "name", right_on = "player_name")
newdata_rookies = newdata_rookies.merge(player_index_map_2016, how = "inner", left_on = "name", right_on = "player_name")

newdata_vets

Unnamed: 0,rating,Team,Type,mu,sd,name,finalpriors,finalse,player_id,index,player_name
0,7.072487,San Antonio Spurs,Non-rookie,6.858335,5,LaMarcus Aldridge,3.257040,4.382986,200746.0,184,LaMarcus Aldridge
1,7.072487,San Antonio Spurs,Non-rookie,0.221270,5,Joel Anthony,-1.127424,4.382986,201202.0,348,Joel Anthony
2,7.072487,San Antonio Spurs,Non-rookie,5.166667,5,Pau Gasol,2.904536,4.382986,2200.0,375,Pau Gasol
3,7.072487,San Antonio Spurs,Non-rookie,4.666667,5,Manu Ginobili,1.890003,4.382986,1938.0,350,Manu Ginobili
4,7.072487,San Antonio Spurs,Non-rookie,3.333333,5,Danny Green,0.509107,4.382986,201980.0,426,Danny Green
...,...,...,...,...,...,...,...,...,...,...,...
248,-13.379977,Philadelphia 76ers,Non-rookie,3.675676,5,Andrew Bogut,0.509107,4.382986,101106.0,349,Andrew Bogut
249,-13.379977,Philadelphia 76ers,Non-rookie,0.019224,5,Justin Harper,-1.098821,4.382986,202712.0,450,Justin Harper
250,-13.379977,Philadelphia 76ers,Non-rookie,3.000000,5,Gerald Henderson,0.509107,4.382986,201945.0,376,Gerald Henderson
251,-13.379977,Philadelphia 76ers,Non-rookie,2.666667,5,Sergio Rodriguez,0.509107,4.382986,200771.0,44,Sergio Rodriguez


In [16]:
newdata_vets.to_csv("../data/final_priors_vets_2016_17.csv")
newdata_rookies.to_csv("../data/final_priors_rookies_2016_17.csv")