## Comparing the MSE of Bayes vs Frequentist regression

In [1]:
import pandas as pd
import numpy as np
import pymc as pm

In [2]:
data = pd.read_csv('wr_model_data.csv')

In [3]:
data = data.dropna()

In [4]:
data['P_five'] = data['P_five'].astype(object)
data

Unnamed: 0,Year,Round,Pick,Overall,Name,Team,Position,School,Rec.Yards,Rec.TD,Rec,CONF,P_five,Pos,RAS,AllTime
1,2010,1,24,24,Dez Bryant,COWBOYS,WR,Oklahoma State,1480.0,19.0,87.0,B12,1,WR,9.05,9.17
2,2010,2,7,39,Arrelious Benn,BUCCANEERS,WR,Illinois,1055.0,3.0,67.0,B10,1,WR,8.23,8.21
3,2010,2,28,60,Golden Tate,SEAHAWKS,WR,Notre Dame,1496.0,15.0,93.0,ACC,1,WR,7.49,7.43
5,2010,3,14,78,Brandon LaFell,PANTHERS,WR,Louisiana State,929.0,8.0,63.0,SEC,1,WR,4.88,4.94
6,2010,3,18,82,Emmanuel Sanders,STEELERS,WR,Southern Methodist,1339.0,7.0,98.0,CUSA,0,WR,9.38,9.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,2022,3,24,88,Jalen Tolbert,COWBOYS,WR,South Alabama,1474.0,8.0,82.0,SB,0,WR,8.62,8.62
274,2022,3,35,99,David Bell,BROWNS,WR,Purdue,1286.0,6.0,93.0,B10,1,WR,3.99,3.99
276,2022,4,20,125,Erik Ezukanma,DOLPHINS,WR,Texas Tech,748.0,6.0,46.0,B12,1,WR,7.03,7.03
279,2022,4,43,148,Khalil Shakir,BILLS,WR,Boise State,1117.0,7.0,77.0,NP5,0,WR,8.31,8.31


In [11]:
data_train = data[data['Year'] < 2020]
data_test = data[data['Year'] >=2020]

In [12]:
X = data_train[['Rec.Yards','Rec.TD','Rec','RAS']]
Y = data_train['Overall']

meanx = X.mean().values
scalex = X.std().values
zX = ((X - meanx)/scalex).values

meany = Y.mean()
scaley = Y.std()
zY = ((Y - meany)/scaley).values

In [18]:
X_test = data_test[['Rec.Yards','Rec.TD','Rec','RAS']]
Y_test = data_test['Overall']

meanx_test = X_test.mean().values
scalex_test = X_test.std().values
zX_test = ((X_test - meanx_test)/scalex_test).values

meany_test = Y_test.mean()
scaley_test = Y_test.std()
zY_test = ((Y_test - meany_test)/scaley_test).values

In [15]:
with pm.Model() as wr_model:
    Y = pm.MutableData(name = 'Y', value = zY)
    X1 = pm.MutableData(name = 'X1', value = zX[:,0])
    X2 = pm.MutableData(name = 'X2', value = zX[:,1])
    X3 = pm.MutableData(name = 'X3', value = zX[:,2])
    X4 = pm.MutableData(name = 'X4', value = zX[:,3])
    
    beta0 = pm.Normal('beta0', mu=0, sigma=2) 
    beta1 = pm.Normal('beta1', mu=0, sigma=2) 
    beta2 = pm.Normal('beta2', mu=0, sigma=2)
    beta3 = pm.Normal('beta3', mu=0, sigma=2)
    beta4 = pm.Normal('beta4', mu=0, sigma=2)
    beta5 = pm.Normal('beta5', mu=0, sigma=2)
    mu = beta0 + beta1*X1 + beta2*X2 + beta3*X3 + beta4*X4
    
    nu = pm.Exponential('nu', 1/29.)
    sigma = pm.Uniform('sigma', 10**-5, 10)
    
    likelihood = pm.StudentT('likelihood', nu=nu, mu=mu, lam=1/sigma**2, observed = Y)
    trace = pm.sample(1000, cores = 4,target_accept=0.95)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta0, beta1, beta2, beta3, beta4, beta5, nu, sigma]


  return _boost._beta_ppf(q, a, b)
  return _boost._beta_ppf(q, a, b)
  return _boost._beta_ppf(q, a, b)
  return _boost._beta_ppf(q, a, b)
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 74 seconds.


In [16]:
with wr_model:
    trace.extend(pm.sample_prior_predictive())
    trace.extend(pm.sample_posterior_predictive(trace))

Sampling: [beta0, beta1, beta2, beta3, beta4, beta5, likelihood, nu, sigma]
Sampling: [likelihood]


In [19]:
pm.set_data(new_data={'X1':zX_test[:,0],'X2':zX_test[:,1],'X3':zX_test[:,2],'X4':zX_test[:,3], 'Y':zY_test}, model=wr_model)
ppc_power = pm.sample_posterior_predictive(trace, model=wr_model, var_names=['likelihood'])

Sampling: [likelihood]


In [20]:
predict_p = pd.DataFrame({
    'predict':ppc_power['posterior_predictive']['likelihood'].mean(axis=1)[0].to_numpy()})

In [31]:
data_test = data_test.reset_index()
data_test['predict'] = predict_p

In [32]:
## MSE
((data_test['Overall'] - data_test['predict'])**2).mean()

9553.955723068026

In [33]:
data_test['predict'] = data_test['predict']

In [34]:
data_test['actual'] = zY_test

In [35]:
data_test

Unnamed: 0,level_0,index,Year,Round,Pick,Overall,Name,Team,Position,School,Rec.Yards,Rec.TD,Rec,CONF,P_five,Pos,RAS,AllTime,predict,actual
0,0,211,2020,1,15,15,Jerry Jeudy,BRONCOS,WR,Alabama,1315.0,14.0,68.0,SEC,1,WR,6.76,6.75,-0.261375,-1.193707
1,1,212,2020,1,17,17,CeeDee Lamb,COWBOYS,WR,Oklahoma,1327.0,14.0,62.0,B12,1,WR,7.44,7.43,-0.354912,-1.157583
2,2,213,2020,1,21,21,Jalen Reagor,EAGLES,WR,Texas Christian,1061.0,9.0,72.0,B12,1,WR,6.04,6.11,0.241091,-1.085334
3,3,214,2020,1,22,22,Justin Jefferson,VIKINGS,WR,Louisiana State,1540.0,18.0,111.0,SEC,1,WR,9.69,9.68,-0.988991,-1.067272
4,4,215,2020,1,25,25,Brandon Aiyuk,49ERS,WR,Arizona State,1192.0,16.0,65.0,P12,1,WR,8.46,8.46,-0.578106,-1.013085
5,5,216,2020,2,1,33,Tee Higgins,BENGALS,WR,Clemson,1167.0,13.0,59.0,ACC,1,WR,4.13,4.08,0.241144,-0.868588
6,6,220,2020,2,17,49,Chase Claypool,STEELERS,WR,Notre Dame,1037.0,13.0,66.0,ACC,1,WR,9.98,9.99,-0.525661,-0.579594
7,7,222,2020,2,27,59,Denzel Mims,JETS,WR,Baylor,1087.0,8.0,61.0,B12,1,WR,9.77,9.78,-0.160307,-0.398973
8,8,223,2020,3,2,66,Antonio Gibson,TEAM,WR,Memphis,735.0,8.0,38.0,Amer,0,WR,9.29,9.31,0.028453,-0.272538
9,9,226,2020,3,28,92,Devin Duvernay,RAVENS,WR,Texas,1386.0,9.0,106.0,B12,1,WR,7.86,7.92,-0.093396,0.197078
