# Bayesian Regression Model 2015/16 using priors from optimized random forest model

In [1]:
import pymc3 as pm
import pandas as pd
import numpy as np
import arviz as az

data = pd.read_csv("../data/shifts_data_final_2015_16.csv")
data.drop(data.columns[0], axis = 1, inplace = True)
data.head()

Unnamed: 0,point_diff_per_100,home_team,away_team,0,1,2,3,4,5,6,...,466,467,468,469,470,471,472,473,474,475
0,-26.939655,Hawks,Pistons,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-32.349896,Hawks,Pistons,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,Hawks,Pistons,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8.373526,Hawks,Pistons,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,104.166667,Hawks,Pistons,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
priors_df_vets = pd.read_csv("../data/final_priors_vets_2015_16.csv")
priors_df_vets.drop(priors_df_vets.columns[0], axis = 1, inplace = True)
# need to rename the index column to idx
priors_df_vets.columns = ['rating', 'team', 'type', 'mu' ,'sd', 'name', 'finalpriors', 'finalse', 'player_id', 'idx', 'player_name']

priors_df_rookies = pd.read_csv("../data/final_priors_rookies_2015_16.csv")
priors_df_rookies.drop(priors_df_rookies.columns[0], axis = 1, inplace = True)
# need to rename the index column to idx
priors_df_rookies.columns = ['rating', 'team', 'type', 'mu' ,'sd', 'name', 'finalpriors', 'finalse', 'player_id', 'idx', 'player_name']

priors_df_vets.sort_values(by = ['idx'], inplace = True)
priors_df_rookies.sort_values(by = ['idx'], inplace = True)

In [3]:
prior_means = np.zeros(476)
prior_sigmas = np.full(476, 4)

for i in range(len(prior_means)):
    if i in np.array(priors_df_vets['idx']):
        prior_means[i] = priors_df_vets.loc[priors_df_vets['idx'] == i]['finalpriors'].iloc[0]
        prior_sigmas[i] = priors_df_vets.loc[priors_df_vets['idx'] == i]['finalse'].iloc[0]
    elif i in np.array(priors_df_rookies['idx']):
        prior_means[i] = priors_df_rookies.loc[priors_df_rookies['idx'] == i]['finalpriors'].iloc[0]
        prior_sigmas[i] = priors_df_rookies.loc[priors_df_rookies['idx'] == i]['finalse'].iloc[0]
        


In [4]:
home_teams = data['home_team']
away_teams = data['away_team']
# now drop these columns from the main training dataframe
data.drop(['home_team', 'away_team'], axis = 1, inplace = True)
data.head()

Unnamed: 0,point_diff_per_100,0,1,2,3,4,5,6,7,8,...,466,467,468,469,470,471,472,473,474,475
0,-26.939655,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-32.349896,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8.373526,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,104.166667,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# need to rename columns now since numbers confuse pymc3
new_cols = []
for i in range(np.shape(data)[1]):
    if i == 0:
        new_cols.append("point_diff")
    else:
        new_cols.append("p" + str(i-1))

# x_df = data.iloc[:20000,]
x_df = data
x_df.columns = new_cols
x_df

Unnamed: 0,point_diff,p0,p1,p2,p3,p4,p5,p6,p7,p8,...,p466,p467,p468,p469,p470,p471,p472,p473,p474,p475
0,-26.939655,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-32.349896,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8.373526,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,104.166667,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33884,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33885,-8.768238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33886,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33887,72.337963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
x = np.array(x_df.iloc[:,1:])
y = np.array(x_df.iloc[:,0])

x_shape = np.shape(x)[1]
    
with pm.Model() as model:
    # priors
    sigma = pm.HalfCauchy("sigma", beta=10) # arbitrarily defined
    intercept = pm.Normal("Intercept", 0, sigma=20) # arbitrarily defined
    x_prior_means = prior_means # defined above
    x_prior_sigmas = prior_sigmas # defined above
#     x_prior_means = np.zeros(x_shape) # just testing with mean zero to compare to ridge
    x_coeff = pm.Normal("x", mu = x_prior_means, sigma=x_prior_sigmas, shape = x_shape) # original method - no list comprehension

    likelihood = pm.Normal("y", mu=intercept + x_coeff.dot(x.T), sigma=sigma, observed=y) # original method - no list comprehension
    
    trace = pm.sample(1000, tune = 1000, cores = 1)
    
    

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [x, Intercept, sigma]


  return np.where(x < 0.6931471805599453, np.log(-np.expm1(-x)), np.log1p(-np.exp(-x)))


Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 1641 seconds.


## Save the trace:

In [33]:
with model:
    path = pm.save_trace(trace, directory = "trace_2015_16")

In [7]:
with model:
    results_df = az.summary(trace)

In [10]:
player_index_map_2015 = pd.read_csv("../data/player_index_map_2015-16.csv")
player_index_map_2015.head()

Unnamed: 0.1,Unnamed: 0,player_id,index,player_name
0,0,201952,0,Jeff Teague
1,1,203471,1,Dennis Schroder
2,2,203488,2,Mike Muscala
3,3,203145,3,Kent Bazemore
4,4,203503,4,Tony Snell


In [32]:
player_index_map_2015.loc[player_index_map_2015['index'] == 163]
# player_index_map_2015.loc[player_index_map_2015['player_name'] == "Kevin Durant"]

Unnamed: 0.1,Unnamed: 0,player_id,index,player_name
256,256,201142,256,Kevin Durant


In [9]:
print((results_df.loc[results_df['mean'] > 4]).sort_values(by=['mean']))

          mean     sd  hdi_3%  hdi_97%  mcse_mean  mcse_sd  ess_bulk  \
x[459]   4.032  2.114   0.198    8.061      0.032    0.026    4279.0   
x[200]   4.053  2.348  -0.224    8.482      0.039    0.033    3540.0   
x[32]    4.271  2.326  -0.041    8.618      0.033    0.031    5132.0   
x[42]    4.316  2.268  -0.178    8.340      0.034    0.032    4427.0   
x[183]   4.318  2.159   0.178    8.259      0.033    0.027    4200.0   
x[405]   4.325  2.317  -0.121    8.792      0.034    0.030    4593.0   
x[114]   4.430  2.629  -0.869    8.828      0.038    0.035    4715.0   
x[256]   4.487  2.134   0.404    8.285      0.028    0.023    5986.0   
x[439]   4.565  2.240   0.283    8.797      0.034    0.029    4253.0   
x[201]   4.607  2.213   0.653    8.779      0.031    0.030    4954.0   
x[413]   4.616  2.212   0.378    8.599      0.030    0.026    5356.0   
x[427]   4.619  2.158   0.657    8.759      0.029    0.026    5501.0   
x[48]    4.631  2.208   0.546    8.845      0.032    0.033    48

In [34]:
with model:
    tmp = trace.get_values("x")

# np.shape(tmp)
# np.mean(tmp, axis = 0)
tmp_df = pd.DataFrame(tmp)
tmp_df.to_csv(r'../data/bayesian_posterior_samples_2015_16.csv')

In [35]:
results_df.to_csv(r'../data/bayesian_results_df_2015_16.csv')