# Creating Priors for 2017/18 season

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# read in all our training data

# MAIN training set for after we've validated
main_train_rookies = pd.read_csv("../data/pre_2017_18/main_train_rookies.csv")
main_train_rookies.drop(main_train_rookies.columns[0], axis = 1, inplace = True)

main_train_vets = pd.read_csv("../data/pre_2017_18/main_train_vets.csv")
main_train_vets.drop(main_train_vets.columns[0], axis = 1, inplace = True)

# training set before validation
train_rookies = pd.read_csv("../data/pre_2017_18/train_rookies.csv")
train_rookies.drop(train_rookies.columns[0], axis = 1, inplace = True)

train_vets = pd.read_csv("../data/pre_2017_18/train_vets.csv")
train_vets.drop(train_vets.columns[0], axis = 1, inplace = True)

# validation dataset
validate_rookies = pd.read_csv("../data/pre_2017_18/validate_rookies.csv")
validate_rookies.drop(validate_rookies.columns[0], axis = 1, inplace = True)

validate_vets = pd.read_csv("../data/pre_2017_18/validate_vets.csv")
validate_vets.drop(validate_vets.columns[0], axis = 1, inplace = True)

In [2]:
# FIRST - with team rating included as a covariate

# x and y for training
x_rookies1 = np.array(train_rookies[['rating', 'mu']])
y_rookies = np.array(train_rookies['coefs'])

x_vets1 = np.array(train_vets[['rating', 'mu']])
y_vets = np.array(train_vets['coefs'])

# x and y for validation
x_rookies_validate1 = np.array(validate_rookies[['rating', 'mu']])
y_rookies_validate = np.array(validate_rookies['coefs'])

x_vets_validate1 = np.array(validate_vets[['rating', 'mu']])
y_vets_validate = np.array(validate_vets['coefs'])

# SECOND - without team rating as a covariate
# Note that we don't need to change the y variables since they stay the same regardless of the covariates
x_rookies2 = np.array(train_rookies['mu']).reshape(-1, 1)
x_vets2 = np.array(train_vets['mu']).reshape(-1, 1)
x_rookies_validate2 = np.array(validate_rookies['mu']).reshape(-1, 1)
x_vets_validate2 = np.array(validate_vets['mu']).reshape(-1, 1)

# Now create dataset for main training sets
x_main_rookies = np.array(main_train_rookies['mu']).reshape(-1, 1)
y_main_rookies = np.array(main_train_rookies['coefs'])
x_main_vets = np.array(main_train_vets['mu']).reshape(-1, 1)
y_main_vets = np.array(main_train_vets['coefs']).reshape(-1, 1)


In [3]:
rf_rookie2 = RandomForestRegressor(max_depth = 2, n_estimators = 200).fit(x_rookies2, y_rookies)

preds_rookie_rf2 = rf_rookie2.predict(x_rookies_validate2)
mse_rf_rookie2 = np.mean((y_rookies_validate - preds_rookie_rf2)**2)

rf_vet2 = RandomForestRegressor(max_depth = 2, n_estimators = 50).fit(x_rookies2, y_rookies)

preds_vet_rf2 = rf_vet2.predict(x_vets_validate2)
mse_rf_vet2 = np.mean((y_vets_validate - preds_vet_rf2)**2)


In [4]:
# read in contract data for 2015/16 season which will be used as the new data in our model to get priors

newdata_vets = pd.read_csv("../data/Contract+team2017_NonRookie.csv")
newdata_rookies = pd.read_csv("../data/Contract+team2017_Rookie.csv")

newdata_vets.drop(newdata_vets.columns[0], axis = 1, inplace = True)
newdata_rookies.drop(newdata_rookies.columns[0], axis = 1, inplace = True)

In [5]:
x_final_rookies = np.array(newdata_rookies['mu']).reshape(-1, 1)
x_final_vets = np.array(newdata_vets['mu']).reshape(-1, 1)

In [6]:
# train rookie model and veteran model on all of our main data

rf_rookie2 = RandomForestRegressor(max_depth = 2, n_estimators = 200).fit(x_main_rookies, y_main_rookies)

rf_vet2 = RandomForestRegressor(max_depth = 2, n_estimators = 50).fit(x_main_vets, y_main_vets)

# NOTE - keep the MSE's from validation set and this will be used as our standard error in the priors
mse_vets = mse_rf_vet2
mse_rookies = mse_rf_rookie2

priors_rookies_means = rf_rookie2.predict(x_final_rookies)
priors_vets_means = rf_vet2.predict(x_final_vets)

sigma_rookies = np.sqrt(mse_rookies)
sigma_vets = np.sqrt(mse_vets)

newdata_vets['finalpriors'] = priors_vets_means
newdata_rookies['finalpriors'] = priors_rookies_means

newdata_vets['finalse'] = sigma_vets
newdata_rookies['finalse'] = sigma_rookies

  """


In [7]:
# Now add player id and index columns by merging with the player index map for 2017/18

player_index_map_2017 = pd.read_csv("../data/player_index_map_2017-18.csv")
player_index_map_2017.drop(player_index_map_2017.columns[0], axis = 1, inplace = True)

player_index_map_2017.head()

Unnamed: 0,player_id,index,player_name
0,201152.0,0,Thaddeus Young
1,203506.0,1,Victor Oladipo
2,203922.0,2,Glenn Robinson III
3,1626202.0,3,Joe Young
4,1628388.0,4,TJ Leaf


In [8]:
newdata_vets = newdata_vets.merge(player_index_map_2017, how = "inner", left_on = "name", right_on = "player_name")
newdata_rookies = newdata_rookies.merge(player_index_map_2017, how = "inner", left_on = "name", right_on = "player_name")

newdata_vets

Unnamed: 0,Team,rating,Type,mu,sd,name,finalpriors,finalse,player_id,index,player_name
0,Golden State Warriors,13.430316,Non-rookie,0.490461,5,Omri Casspi,-0.662198,3.662797,201956.0,36,Omri Casspi
1,Golden State Warriors,13.430316,Non-rookie,11.560850,5,Stephen Curry,3.284871,3.662797,201939.0,290,Stephen Curry
2,Golden State Warriors,13.430316,Non-rookie,8.333333,5,Kevin Durant,3.284871,3.662797,201142.0,391,Kevin Durant
3,Golden State Warriors,13.430316,Non-rookie,5.466667,5,Draymond Green,2.521518,3.662797,203110.0,199,Draymond Green
4,Golden State Warriors,13.430316,Non-rookie,4.938272,5,Andre Iguodala,1.834240,3.662797,2738.0,198,Andre Iguodala
...,...,...,...,...,...,...,...,...,...,...,...
232,Portland Trailblazers,,Non-rookie,5.710383,5,Evan Turner,3.197303,3.662797,202323.0,326,Evan Turner
233,Toronto Raptors,,Non-rookie,9.246658,5,DeMar DeRozan,3.284871,3.662797,201942.0,411,DeMar DeRozan
234,Toronto Raptors,,Non-rookie,6.687243,5,Serge Ibaka,3.325913,3.662797,201586.0,229,Serge Ibaka
235,Toronto Raptors,,Non-rookie,9.567901,5,Kyle Lowry,3.284871,3.662797,200768.0,5,Kyle Lowry


In [9]:
newdata_vets.to_csv("../data/final_priors_vets_2017_18.csv")
newdata_rookies.to_csv("../data/final_priors_rookies_2017_18.csv")