# Bayesian Regression Model 2015/16 using priors from optimized random forest model

In [1]:
import pymc3 as pm
import pandas as pd
import numpy as np
import arviz as az

data = pd.read_csv("../data/shifts_data_final_2016_17.csv")
data.drop(data.columns[0], axis = 1, inplace = True)
data.head()

Unnamed: 0,point_diff_per_100,home_team,away_team,0,1,2,3,4,5,6,...,476,477,478,479,480,481,482,483,484,485
0,-4.227543,Celtics,Heat,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,72.337963,Celtics,Heat,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-52.083333,Celtics,Heat,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,Celtics,Heat,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-30.281008,Celtics,Heat,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
priors_df_vets = pd.read_csv("../data/final_priors_vets_2016_17.csv")
priors_df_vets.drop(priors_df_vets.columns[0], axis = 1, inplace = True)
# need to rename the index column to idx
priors_df_vets.columns = ['rating', 'team', 'type', 'mu' ,'sd', 'name', 'finalpriors', 'finalse', 'player_id', 'idx', 'player_name']

priors_df_rookies = pd.read_csv("../data/final_priors_rookies_2016_17.csv")
priors_df_rookies.drop(priors_df_rookies.columns[0], axis = 1, inplace = True)
# need to rename the index column to idx
priors_df_rookies.columns = ['rating', 'team', 'type', 'mu' ,'sd', 'name', 'finalpriors', 'finalse', 'player_id', 'idx', 'player_name']

priors_df_vets.sort_values(by = ['idx'], inplace = True)
priors_df_rookies.sort_values(by = ['idx'], inplace = True)

In [3]:
n_players = np.shape(data)[1] - 3
prior_means = np.zeros(n_players)
prior_sigmas = np.full(n_players, 4)

for i in range(len(prior_means)):
    if i in np.array(priors_df_vets['idx']):
        prior_means[i] = priors_df_vets.loc[priors_df_vets['idx'] == i]['finalpriors'].iloc[0]
        prior_sigmas[i] = priors_df_vets.loc[priors_df_vets['idx'] == i]['finalse'].iloc[0]
    elif i in np.array(priors_df_rookies['idx']):
        prior_means[i] = priors_df_rookies.loc[priors_df_rookies['idx'] == i]['finalpriors'].iloc[0]
        prior_sigmas[i] = priors_df_rookies.loc[priors_df_rookies['idx'] == i]['finalse'].iloc[0]
        


In [4]:
home_teams = data['home_team']
away_teams = data['away_team']
# now drop these columns from the main training dataframe
data.drop(['home_team', 'away_team'], axis = 1, inplace = True)
data.head()

Unnamed: 0,point_diff_per_100,0,1,2,3,4,5,6,7,8,...,476,477,478,479,480,481,482,483,484,485
0,-4.227543,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,72.337963,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-52.083333,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-30.281008,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# need to rename columns now since numbers confuse pymc3
new_cols = []
for i in range(np.shape(data)[1]):
    if i == 0:
        new_cols.append("point_diff")
    else:
        new_cols.append("p" + str(i-1))

# x_df = data.iloc[:20000,]
x_df = data
x_df.columns = new_cols
x_df

Unnamed: 0,point_diff,p0,p1,p2,p3,p4,p5,p6,p7,p8,...,p476,p477,p478,p479,p480,p481,p482,p483,p484,p485
0,-4.227543,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,72.337963,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-52.083333,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-30.281008,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33380,15.409270,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33381,-58.740602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33382,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33383,144.675926,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
x = np.array(x_df.iloc[:,1:])
y = np.array(x_df.iloc[:,0])

x_shape = np.shape(x)[1]
    
with pm.Model() as model:
    # priors
    sigma = pm.HalfCauchy("sigma", beta=10) # arbitrarily defined
    intercept = pm.Normal("Intercept", 0, sigma=20) # arbitrarily defined
    x_prior_means = prior_means # defined above
    x_prior_sigmas = prior_sigmas # defined above
#     x_prior_means = np.zeros(x_shape) # just testing with mean zero to compare to ridge
    x_coeff = pm.Normal("x", mu = x_prior_means, sigma=x_prior_sigmas, shape = x_shape) # original method - no list comprehension

    likelihood = pm.Normal("y", mu=intercept + x_coeff.dot(x.T), sigma=sigma, observed=y) # original method - no list comprehension
    
    trace = pm.sample(1000, tune = 1000, cores = 1)
    
    

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [x, Intercept, sigma]


  return np.where(x < 0.6931471805599453, np.log(-np.expm1(-x)), np.log1p(-np.exp(-x)))


Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 1551 seconds.


# Save the Trace

In [7]:
with model:
    path = pm.save_trace(trace, directory = "trace_2016_17")

In [8]:
with model:
    results_df = az.summary(trace)

In [13]:
player_index_map = pd.read_csv("../data/player_index_map_2016-17.csv")
player_index_map.head()

Unnamed: 0.1,Unnamed: 0,player_id,index,player_name
0,0,203585.0,0,Rodney McGruder
1,1,202337.0,1,Luke Babbitt
2,2,201609.0,2,Goran Dragic
3,3,201961.0,3,Wayne Ellington
4,4,2754.0,4,Tony Allen


In [31]:
player_index_map.loc[player_index_map['index'] == 41]
player_index_map.loc[player_index_map['player_name'] == "Stephen Curry"]

Unnamed: 0.1,Unnamed: 0,player_id,index,player_name
279,279,201939.0,279,Stephen Curry


In [29]:
print((results_df.loc[results_df['mean'] > 4]).sort_values(by=['mean']))

          mean     sd  hdi_3%  hdi_97%  mcse_mean  mcse_sd  ess_bulk  \
x[179]   4.005  2.727  -1.534    8.882      0.035    0.042    5777.0   
x[189]   4.049  2.099  -0.012    7.961      0.028    0.024    5746.0   
x[300]   4.111  2.488  -0.685    8.558      0.039    0.033    4102.0   
x[380]   4.176  2.489  -0.527    8.756      0.039    0.034    4092.0   
x[224]   4.215  2.826  -1.250    9.297      0.047    0.044    3683.0   
x[38]    4.272  2.338  -0.139    8.653      0.035    0.031    4484.0   
x[298]   4.404  2.696  -0.786    9.549      0.041    0.043    4215.0   
x[106]   4.416  2.717  -0.604    9.593      0.039    0.038    4885.0   
x[246]   4.435  2.718  -0.865    9.232      0.043    0.037    4084.0   
x[77]    4.626  2.453   0.108    9.230      0.033    0.030    5650.0   
x[118]   4.916  2.534   0.527    9.808      0.042    0.036    3643.0   
x[391]   4.922  2.587   0.229    9.780      0.037    0.032    4829.0   
x[49]    5.045  2.620   0.520   10.227      0.043    0.034    37

In [32]:
with model:
    tmp = trace.get_values("x")

# np.shape(tmp)
# np.mean(tmp, axis = 0)
tmp_df = pd.DataFrame(tmp)
tmp_df.to_csv(r'../data/bayesian_posterior_samples_2016_17.csv')

In [33]:
results_df.to_csv(r'../data/bayesian_results_df_2016_17.csv')