## Comparing the MSE of Bayes vs Frequentist regression

In [13]:
import pandas as pd
import numpy as np
import pymc as pm

In [2]:
##Using the data as is, no feature transformations
data = pd.read_csv('wr_model_data.csv')

In [4]:
data = data.dropna()

In [7]:
data['P_five'] = data['P_five'].astype(object)
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['P_five'] = data['P_five'].astype(object)


Unnamed: 0,Year,Round,Pick,Overall,Name,Team,Position,School,Rec.Yards,Rec.TD,Rec,CONF,P_five,Pos,RAS,AllTime
1,2010,1,24,24,Dez Bryant,COWBOYS,WR,Oklahoma State,1480.0,19.0,87.0,B12,1,WR,9.05,9.17
2,2010,2,7,39,Arrelious Benn,BUCCANEERS,WR,Illinois,1055.0,3.0,67.0,B10,1,WR,8.23,8.21
3,2010,2,28,60,Golden Tate,SEAHAWKS,WR,Notre Dame,1496.0,15.0,93.0,ACC,1,WR,7.49,7.43
5,2010,3,14,78,Brandon LaFell,PANTHERS,WR,Louisiana State,929.0,8.0,63.0,SEC,1,WR,4.88,4.94
6,2010,3,18,82,Emmanuel Sanders,STEELERS,WR,Southern Methodist,1339.0,7.0,98.0,CUSA,0,WR,9.38,9.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,2022,3,24,88,Jalen Tolbert,COWBOYS,WR,South Alabama,1474.0,8.0,82.0,SB,0,WR,8.62,8.62
274,2022,3,35,99,David Bell,BROWNS,WR,Purdue,1286.0,6.0,93.0,B10,1,WR,3.99,3.99
276,2022,4,20,125,Erik Ezukanma,DOLPHINS,WR,Texas Tech,748.0,6.0,46.0,B12,1,WR,7.03,7.03
279,2022,4,43,148,Khalil Shakir,BILLS,WR,Boise State,1117.0,7.0,77.0,NP5,0,WR,8.31,8.31


In [16]:
data_train = data[data['Year'] < 2020]
data_test = data[data['Year'] >=2020]

In [18]:
with pm.Model() as wr_model:
    Y = pm.MutableData(name = 'Y', value = data_train['Overall'])
    X1 = pm.MutableData(name = 'X1', value = data_train['Rec.Yards'])
    X2 = pm.MutableData(name = 'X2', value = data_train['Rec.TD'])
    X3 = pm.MutableData(name = 'X3', value = data_train['Rec'])
    X4 = pm.MutableData(name = 'X4', value = data_train['P_five'])
    X5 = pm.MutableData(name = 'X5', value = data_train['RAS'])
    
    beta0 = pm.Normal('beta0', mu=0, sigma=2) 
    beta1 = pm.Normal('beta1', mu=0, sigma=2) 
    beta2 = pm.Normal('beta2', mu=0, sigma=2)
    beta3 = pm.Normal('beta3', mu=0, sigma=2)
    beta4 = pm.Normal('beta4', mu=0, sigma=2)
    beta5 = pm.Normal('beta5', mu=0, sigma=2)
    mu = beta0 + beta1*X1 + beta2*X2 + beta3*X3 + beta4*X4 + beta5*X5
    
    nu = pm.Exponential('nu', 1/29.)
    sigma = pm.Uniform('sigma', 10**-5, 10)
    
    likelihood = pm.StudentT('likelihood', nu=nu, mu=mu, lam=1/sigma**2, observed = Y)
    trace = pm.sample(10000, cores = 4,target_accept=0.95)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta0, beta1, beta2, beta3, beta4, beta5, nu, sigma]


  return _boost._beta_ppf(q, a, b)
  return _boost._beta_ppf(q, a, b)
  return _boost._beta_ppf(q, a, b)
  return _boost._beta_ppf(q, a, b)
Sampling 4 chains for 1_000 tune and 10_000 draw iterations (4_000 + 40_000 draws total) took 717 seconds.


In [20]:
with wr_model:
    trace.extend(pm.sample_prior_predictive())
    trace.extend(pm.sample_posterior_predictive(trace))

Sampling: [beta0, beta1, beta2, beta3, beta4, beta5, likelihood, nu, sigma]
Sampling: [likelihood]


In [21]:
pm.set_data(new_data={'X1':data_test['Rec.Yards'],'X2':data_test['Rec.TD'],'X3':data_test['Rec'],'X4':data_test['P_five'],'X5':data_test['RAS'], 'Y':data_test['Overall']}, model=wr_model)
ppc_power = pm.sample_posterior_predictive(trace, model=wr_model, var_names=['likelihood'])

Sampling: [likelihood]


In [22]:
predict_p = pd.DataFrame({
    'predict':ppc_power['posterior_predictive']['likelihood'].mean(axis=1)[0].to_numpy()})

In [23]:
data_test = data_test.reset_index()
data_test['predict'] = predict_p

In [24]:
## MSE
((data_test['Overall'] - data_test['predict'])**2).mean()

8420873241546.693

In [27]:
data_test['predict'].round()

0         1912.0
1       -20922.0
2        76968.0
3      -401995.0
4        -4536.0
5      1001619.0
6     19255653.0
7        28920.0
8        34793.0
9        -9963.0
10         954.0
11     1297063.0
12        -962.0
13         734.0
14       29043.0
15      -17959.0
16      -90681.0
17       -1788.0
18       -7139.0
19        -153.0
20        5507.0
21     -178210.0
22       -9272.0
23      -35606.0
24      616475.0
25     2137858.0
26       -4985.0
27      -33320.0
28      -74783.0
29       -1336.0
30       14567.0
31      -41296.0
32      -67122.0
33      396726.0
34       28340.0
35       71872.0
36       -1060.0
37        5592.0
38        3906.0
39       20538.0
40       -2132.0
41       -1231.0
42      -10692.0
43       27957.0
44      367850.0
Name: predict, dtype: float64