# Bayesian Regression run at set time intervals to create time series of estimates for each player

This will allow us to see how players improve/worsen over the course of a season

In [1]:
import pymc3 as pm
import pandas as pd
import numpy as np
import arviz as az

data = pd.read_csv("../data/shifts_data_final_2015_16.csv")
data.drop(data.columns[0], axis = 1, inplace = True)
data.head()

Unnamed: 0,point_diff_per_100,home_team,away_team,0,1,2,3,4,5,6,...,466,467,468,469,470,471,472,473,474,475
0,-26.939655,Hawks,Pistons,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-32.349896,Hawks,Pistons,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,Hawks,Pistons,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8.373526,Hawks,Pistons,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,104.166667,Hawks,Pistons,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
priors_df_vets = pd.read_csv("../data/final_priors_vets_2015_16.csv")
priors_df_vets.drop(priors_df_vets.columns[0], axis = 1, inplace = True)
# need to rename the index column to idx
priors_df_vets.columns = ['rating', 'team', 'type', 'mu' ,'sd', 'name', 'finalpriors', 'finalse', 'player_id', 'idx', 'player_name']

priors_df_rookies = pd.read_csv("../data/final_priors_rookies_2015_16.csv")
priors_df_rookies.drop(priors_df_rookies.columns[0], axis = 1, inplace = True)
# need to rename the index column to idx
priors_df_rookies.columns = ['rating', 'team', 'type', 'mu' ,'sd', 'name', 'finalpriors', 'finalse', 'player_id', 'idx', 'player_name']

priors_df_vets.sort_values(by = ['idx'], inplace = True)
priors_df_rookies.sort_values(by = ['idx'], inplace = True)

In [3]:
prior_means = np.zeros(476)
prior_sigmas = np.full(476, 4)

for i in range(len(prior_means)):
    if i in np.array(priors_df_vets['idx']):
        prior_means[i] = priors_df_vets.loc[priors_df_vets['idx'] == i]['finalpriors'].iloc[0]
        prior_sigmas[i] = priors_df_vets.loc[priors_df_vets['idx'] == i]['finalse'].iloc[0]
    elif i in np.array(priors_df_rookies['idx']):
        prior_means[i] = priors_df_rookies.loc[priors_df_rookies['idx'] == i]['finalpriors'].iloc[0]
        prior_sigmas[i] = priors_df_rookies.loc[priors_df_rookies['idx'] == i]['finalse'].iloc[0]
        


In [4]:
home_teams = data['home_team']
away_teams = data['away_team']
# now drop these columns from the main training dataframe
data.drop(['home_team', 'away_team'], axis = 1, inplace = True)
data.head()

Unnamed: 0,point_diff_per_100,0,1,2,3,4,5,6,7,8,...,466,467,468,469,470,471,472,473,474,475
0,-26.939655,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-32.349896,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8.373526,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,104.166667,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# need to rename columns now since numbers confuse pymc3
new_cols = []
for i in range(np.shape(data)[1]):
    if i == 0:
        new_cols.append("point_diff")
    else:
        new_cols.append("p" + str(i-1))

# # x_df = data.iloc[:20000,]
# x_df = data
# x_df.columns = new_cols
# x_df

Unnamed: 0,point_diff,p0,p1,p2,p3,p4,p5,p6,p7,p8,...,p466,p467,p468,p469,p470,p471,p472,p473,p474,p475
0,-26.939655,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-32.349896,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8.373526,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,104.166667,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33884,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33885,-8.768238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33886,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33887,72.337963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# get +/- estimates in intervals of 5000 shifts (roughly monthly)
nrow = np.shape(data)[0]
upper_lim = 0
done = False
cur_iter = 1
while not done:
    upper_lim += 5000
    upper_lim = min(upper_lim, nrow)
    if upper_lim >= nrow: 
        done = True # this will be our last iteration
    x_df = data.iloc[:upper_lim,]
    x_df.columns = new_cols

    x = np.array(x_df.iloc[:,1:])
    y = np.array(x_df.iloc[:,0])

    x_shape = np.shape(x)[1]

    with pm.Model() as model:
        # priors
        sigma = pm.HalfCauchy("sigma", beta=10) # arbitrarily defined
        intercept = pm.Normal("Intercept", 0, sigma=20) # arbitrarily defined
        x_prior_means = prior_means # defined above
        x_prior_sigmas = prior_sigmas # defined above
    #     x_prior_means = np.zeros(x_shape) # just testing with mean zero to compare to ridge
        x_coeff = pm.Normal("x", mu = x_prior_means, sigma=x_prior_sigmas, shape = x_shape) # original method - no list comprehension

        likelihood = pm.Normal("y", mu=intercept + x_coeff.dot(x.T), sigma=sigma, observed=y) # original method - no list comprehension

        trace = pm.sample(1000, tune = 1000, cores = 1)
        
        trace_name = "trace_2015_16" + str(cur_iter)
        path = pm.save_trace(trace, directory = trace_name)
        
        cur_iter += 1 # increment current iteration counter



Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [x, Intercept, sigma]


  return np.where(x < 0.6931471805599453, np.log(-np.expm1(-x)), np.log1p(-np.exp(-x)))
Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 129 seconds.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [x, Intercept, sigma]


  return np.where(x < 0.6931471805599453, np.log(-np.expm1(-x)), np.log1p(-np.exp(-x)))


Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 314 seconds.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [x, Intercept, sigma]


  return np.where(x < 0.6931471805599453, np.log(-np.expm1(-x)), np.log1p(-np.exp(-x)))
Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 516 seconds.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [x, Intercept, sigma]


  return np.where(x < 0.6931471805599453, np.log(-np.expm1(-x)), np.log1p(-np.exp(-x)))


Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 900 seconds.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [x, Intercept, sigma]


  return np.where(x < 0.6931471805599453, np.log(-np.expm1(-x)), np.log1p(-np.exp(-x)))


Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 1192 seconds.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [x, Intercept, sigma]


  return np.where(x < 0.6931471805599453, np.log(-np.expm1(-x)), np.log1p(-np.exp(-x)))
Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 1341 seconds.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [x, Intercept, sigma]


  return np.where(x < 0.6931471805599453, np.log(-np.expm1(-x)), np.log1p(-np.exp(-x)))


Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 1684 seconds.


In [8]:
with model:
    trace1 = pm.load_trace("trace_2015_161")
    trace2 = pm.load_trace("trace_2015_162")
    trace3 = pm.load_trace("trace_2015_163")
    trace4 = pm.load_trace("trace_2015_164")
    trace5 = pm.load_trace("trace_2015_165")
    trace6 = pm.load_trace("trace_2015_166")
    trace7 = pm.load_trace("trace_2015_167")

In [10]:
with model:
    results_df1 = az.summary(trace1)
    results_df2 = az.summary(trace2)
    results_df3 = az.summary(trace3)
    results_df4 = az.summary(trace4)
    results_df5 = az.summary(trace5)
    results_df6 = az.summary(trace6)
    results_df7 = az.summary(trace7)

In [12]:
print((results_df1.loc[results_df1['mean'] > 4]).sort_values(by=['mean']))

          mean     sd  hdi_3%  hdi_97%  mcse_mean  mcse_sd  ess_bulk  \
x[301]   4.058  2.757  -1.562    8.791      0.063    0.054    1864.0   
x[42]    4.059  2.865  -1.137    9.593      0.061    0.047    2200.0   
x[439]   4.162  2.778  -0.907    9.463      0.060    0.047    2133.0   
x[8]     4.218  2.829  -0.989    9.436      0.068    0.051    1756.0   
x[201]   4.261  2.821  -0.802    9.570      0.066    0.051    1849.0   
x[256]   4.405  2.821  -0.606    9.997      0.060    0.051    2229.0   
x[78]    4.445  2.784  -1.063    9.416      0.062    0.048    2066.0   
x[459]   4.697  2.686  -0.331    9.626      0.060    0.046    2018.0   
x[9]     4.865  2.698   0.073   10.106      0.062    0.051    1892.0   
x[82]    5.019  2.645   0.109    9.939      0.063    0.047    1784.0   
x[183]   5.025  2.705  -0.196   10.064      0.063    0.049    1883.0   
x[115]   5.132  2.801   0.261   10.659      0.056    0.045    2512.0   
x[138]   5.800  2.690   0.651   10.735      0.062    0.047    18

In [13]:
print((results_df7.loc[results_df7['mean'] > 4]).sort_values(by=['mean']))

          mean     sd  hdi_3%  hdi_97%  mcse_mean  mcse_sd  ess_bulk  \
x[200]   4.017  2.352  -0.585    8.233      0.035    0.030    4358.0   
x[459]   4.096  2.154  -0.024    7.972      0.030    0.027    5231.0   
x[32]    4.237  2.266   0.148    8.436      0.033    0.030    4652.0   
x[42]    4.280  2.255  -0.389    8.056      0.034    0.034    4361.0   
x[405]   4.280  2.281   0.180    8.642      0.036    0.033    4007.0   
x[183]   4.325  2.288  -0.109    8.320      0.037    0.031    3871.0   
x[114]   4.393  2.599  -0.445    9.265      0.040    0.039    4226.0   
x[256]   4.508  2.198  -0.064    8.147      0.033    0.030    4508.0   
x[439]   4.529  2.214   0.358    8.616      0.033    0.027    4532.0   
x[201]   4.594  2.135   0.384    8.280      0.033    0.029    4214.0   
x[413]   4.619  2.136   0.610    8.733      0.028    0.026    5936.0   
x[427]   4.666  2.302   0.296    8.967      0.037    0.036    3927.0   
x[48]    4.718  2.219   0.506    8.742      0.034    0.028    42

In [14]:
with model:
    tmp1 = trace1.get_values("x")
    tmp2 = trace2.get_values("x")
    tmp3 = trace3.get_values("x")
    tmp4 = trace4.get_values("x")
    tmp5 = trace5.get_values("x")
    tmp6 = trace6.get_values("x")
    tmp7 = trace7.get_values("x")


tmp_df1 = pd.DataFrame(tmp1)
tmp_df2 = pd.DataFrame(tmp2)
tmp_df3 = pd.DataFrame(tmp3)
tmp_df4 = pd.DataFrame(tmp4)
tmp_df5 = pd.DataFrame(tmp5)
tmp_df6 = pd.DataFrame(tmp6)
tmp_df7 = pd.DataFrame(tmp7)

tmp_df1.to_csv(r'../data/bayesian_posterior_samples_2015_16_month1.csv')
tmp_df2.to_csv(r'../data/bayesian_posterior_samples_2015_16_month2.csv')
tmp_df3.to_csv(r'../data/bayesian_posterior_samples_2015_16_month3.csv')
tmp_df4.to_csv(r'../data/bayesian_posterior_samples_2015_16_month4.csv')
tmp_df5.to_csv(r'../data/bayesian_posterior_samples_2015_16_month5.csv')
tmp_df6.to_csv(r'../data/bayesian_posterior_samples_2015_16_month6.csv')
tmp_df7.to_csv(r'../data/bayesian_posterior_samples_2015_16_month7.csv')

In [15]:
results_df1.to_csv(r'../data/bayesian_results_df_2015_16_month1.csv')
results_df2.to_csv(r'../data/bayesian_results_df_2015_16_month2.csv')
results_df3.to_csv(r'../data/bayesian_results_df_2015_16_month3.csv')
results_df4.to_csv(r'../data/bayesian_results_df_2015_16_month4.csv')
results_df5.to_csv(r'../data/bayesian_results_df_2015_16_month5.csv')
results_df6.to_csv(r'../data/bayesian_results_df_2015_16_month6.csv')
results_df7.to_csv(r'../data/bayesian_results_df_2015_16_month7.csv')