# Lesson 5: Bayesian Regression

## Intro to Quantified Cognition

## Lesson plan

- Extension of BEST
- Introduce Bayesian regression
- Example with simulated data
- Robust regression
- Introduce hierarchical models
- Hierarchical example
- Real-world data analysis in teams


In [None]:
# load matplotlib inline mode
%matplotlib inline

# import some useful libraries
import numpy as np                # numerical analysis linear algebra
import pandas as pd               # efficient tables
import matplotlib.pyplot as plt   # plotting
from scipy import stats

import pymc3 as pm

import dists

## Independent BEST

Let's extend the BEST example to two independent samples!


In [None]:
# draw two small samples that may or may not differ significantly
A = dists.normal(mean=0.2, std=0.5).rvs(10)
B = dists.normal(mean=0.4, std=1.0).rvs(12)

# overlay semi-transparent histograms of both samples
for sample in (A, B):
    plt.hist(sample, bins='auto', alpha=0.3);

# frequentist baseline: two-sample t-test on the independent groups
stats.ttest_ind(A, B)

In [None]:
# pool both samples once; the pooled mean and std set the scale of the priors
pooled = np.append(A, B)
overall_mean = pooled.mean()
overall_std = pooled.std()

# define a model
with pm.Model() as model:
    # priors: each group mean is centered on the pooled mean with twice the
    # pooled std as its spread; each group std gets a half-Cauchy prior
    mu_A = pm.Normal('mu_A', mu=overall_mean, sd=overall_std * 2.0)
    sd_A = pm.HalfCauchy('sd_A', beta=5)
    mu_B = pm.Normal('mu_B', mu=overall_mean, sd=overall_std * 2.0)
    sd_B = pm.HalfCauchy('sd_B', beta=5)
    # both groups share one degrees-of-freedom parameter; the +1 keeps nu > 1
    nu = pm.Exponential('df_minus_one', lam=1/29.) + 1.

    # Student-t likelihood for each group's observations
    data_A = pm.StudentT('data_A', mu=mu_A, sd=sd_A, nu=nu, observed=A)
    data_B = pm.StudentT('data_B', mu=mu_B, sd=sd_B, nu=nu, observed=B)

    # derived quantities stored in the trace alongside the parameters
    diff_of_means = pm.Deterministic('difference of means', mu_A - mu_B)
    diff_of_stds = pm.Deterministic('difference of stds', sd_A - sd_B)
    # effect size: mean difference scaled by the root-mean-square of the stds
    rms_sd = np.sqrt((sd_A**2 + sd_B**2) / 2)
    effect_size = pm.Deterministic('effect size', diff_of_means / rms_sd)

    

In [None]:
# sample the posterior of the BEST model (2000 draws, 2 cores)
with model:
    trace = pm.sample(draws=2000, cores=2)

In [None]:
# plot the sampled traces for a visual check of the sampler
pm.traceplot(trace);

In [None]:
# posterior summaries for the group parameters and the effect size
best_vars = ['mu_A', 'mu_B', 'sd_A', 'sd_B', 'df_minus_one', 'effect size']
pm.plot_posterior(trace, varnames=best_vars);

In [None]:
# posteriors of the derived differences, each compared against zero
diff_vars = ['difference of means', 'difference of stds', 'effect size']
pm.plot_posterior(trace, varnames=diff_vars, ref_val=0.0);


## Linear Regression

One of the most common and flexible statistical approaches.

It involves building a model that predicts the dependent variable ($y$) from a weighted combination of independent variables ($x$):

$$y = \beta_0 + \beta_1 x + \epsilon$$


In [None]:
# ground-truth settings for the simulated linear trend
nsamples = 100
true_slope = 0.5
true_intercept = 1.0
true_sigma = 0.5

# uniform sampling over x
x = dists.uniform(0, 1).rvs(nsamples)

# noiseless line first, then the observations with normal noise added
y_true = true_intercept + true_slope*x
noise = dists.normal(mean=0.0, std=true_sigma).rvs(nsamples)
y = y_true + noise

# collect the simulated observations in a table
data = pd.DataFrame({'x': x, 'y': y})

# scatter of the noisy data with the true line on top
plt.plot(x, y, 'o')
plt.plot(x, y_true, '-')

In [None]:
# peek at the first rows of the simulated x/y table
data.head()

In [None]:
# standard linear regression: Gaussian noise around intercept + slope * x
with pm.Model() as model:
    # priors on the regression parameters
    intercept = pm.Normal('intercept', mu=0, sd=20)
    slope = pm.Normal('slope', mu=0, sd=20)
    sigma = pm.HalfCauchy('sigma', beta=10)

    # expected value of y at each observed x
    mu_y = intercept + slope * x

    # Gaussian likelihood of the observed y around the regression line
    likelihood = pm.Normal('y', mu=mu_y, sd=sigma, observed=y)
    

In [None]:
# draw posterior samples for the linear model (2000 draws, 2 cores)
with model:
    trace = pm.sample(draws=2000, cores=2)

In [None]:
# plot the sampled traces for a visual check of the sampler
pm.traceplot(trace);

In [None]:
# posterior summaries of the regression parameters, each compared against zero
regression_vars = ['intercept', 'slope', 'sigma']
pm.plot_posterior(trace, varnames=regression_vars, ref_val=0.0);

## Dealing with outliers

Sometimes data can be messy. You can either assume every observation affects the statistical inference equally, or you can try to downplay the effect of potential outliers.

The latter approach is known as robust regression.


In [None]:
# three hand-picked outlier points appended to the simulated data
outlier_x = [.1, .15, .3]
outlier_y = [3.54, 4.1, 3.2]
x_out = np.append(x, outlier_x)
y_out = np.append(y, outlier_y)

# scatter of the contaminated data with the true line on top
plt.plot(x_out, y_out, 'o')
plt.plot(x, y_true, '-')

In [None]:
# same linear model with Gaussian noise, now fit to the data with outliers
with pm.Model() as model:
    # priors on the regression parameters
    intercept = pm.Normal('intercept', mu=0, sd=20)
    slope = pm.Normal('slope', mu=0, sd=20)
    sigma = pm.HalfCauchy('sigma', beta=10)

    # expected value of y at each observed x (outliers included)
    mu_out = intercept + slope * x_out

    # Gaussian likelihood of the observed y around the regression line
    likelihood = pm.Normal('y_out', mu=mu_out, sd=sigma, observed=y_out)
    

In [None]:
# sample the posterior of the Gaussian-noise model (2000 draws, 2 cores)
with model:
    trace = pm.sample(draws=2000, cores=2)

In [None]:
# plot the sampled traces for a visual check of the sampler
pm.traceplot(trace);

In [None]:
# intercept and sigma are compared against their true values;
# the slope is compared against 0.0
regression_vars = ['intercept', 'slope', 'sigma']
reference_values = [true_intercept, 0.0, true_sigma]
pm.plot_posterior(trace, varnames=regression_vars, ref_val=reference_values);

In [None]:
# let's check with the posterior predictives
def lm(x, samples):
    """Return the regression line at x for the given intercept/slope samples."""
    return samples['intercept'] + x*samples['slope']

# contaminated data with the true line on top
plt.plot(x_out, y_out, 'o')
plt.plot(x, y_true, '-')

# overlay regression lines from the trace, evaluated on a grid over [0, 1]
x_grid = np.linspace(0, 1, 100)
pm.plot_posterior_predictive_glm(trace, eval=x_grid, lm=lm,
                                 samples=200, color="green", alpha=.15)

In [None]:
# Can we fix it?
# robust variant: swap the Gaussian likelihood for a Student-t
with pm.Model() as model:
    # priors on the regression parameters
    intercept = pm.Normal('intercept', mu=0, sd=20)
    slope = pm.Normal('slope', mu=0, sd=20)
    sigma = pm.HalfCauchy('sigma', beta=10)
    # degrees of freedom of the Student-t; the +1 keeps nu > 1
    nu = pm.Exponential('df_minus_one', lam=1/29.) + 1.

    # expected value of y at each observed x (outliers included)
    mu_out = intercept + slope * x_out

    # Student-t likelihood of the observed y around the regression line
    likelihood = pm.StudentT('y_out', mu=mu_out, sd=sigma, nu=nu,
                             observed=y_out)
    

In [None]:
# sample the posterior of the robust model (2000 draws, 2 cores)
with model:
    trace = pm.sample(draws=2000, cores=2)

In [None]:
# plot the sampled traces for a visual check of the sampler
pm.traceplot(trace);

In [None]:
# intercept and sigma are compared against their true values;
# slope and df_minus_one are compared against 0.0
robust_vars = ['intercept', 'slope', 'sigma', 'df_minus_one']
reference_values = [true_intercept, 0.0, true_sigma, 0.0]
pm.plot_posterior(trace, varnames=robust_vars, ref_val=reference_values);

In [None]:
# let's check with the posterior predictives
def lm(x, samples):
    """Return the regression line at x for the given intercept/slope samples."""
    return samples['intercept'] + x*samples['slope']

# contaminated data with the true line on top
plt.plot(x_out, y_out, 'o')
plt.plot(x, y_true, '-')

# overlay regression lines from the trace, evaluated on a grid over [0, 1]
x_grid = np.linspace(0, 1, 100)
pm.plot_posterior_predictive_glm(trace, eval=x_grid, lm=lm,
                                 samples=200, color="green", alpha=.15)

## Hierarchical Regression

What should we do when we have multiple groups/subjects?

Ideally we should share information across groups to inform the models fit to the individuals.

This is called multi-level or hierarchical modeling.

In [None]:
# read the flanker data file and preview its first rows
flanker_path = 'data/flanker_dat.csv'
dat = pd.read_csv(flanker_path)
dat.head()

In [None]:
# reaction times are usually analysed on the log scale
dat['log_rt'] = np.log(dat.rt)
# shift subject ids down by 101 (presumably so they start at 0 -- confirm
# against the data); NOTE: re-running this cell subtracts 101 again
dat['subj'] = dat['subj'].sub(101)
dat.columns

In [None]:
# mean log reaction time within each condition
dat.groupby(['condition'])['log_rt'].mean()

In [None]:
# numeric coding of the condition column: '=' -> 1, '~' -> 2,
# every other condition keeps the default code 0
dat['num_cond'] = 0
for symbol, code in (('=', 1), ('~', 2)):
    dat.loc[dat['condition'] == symbol, 'num_cond'] = code
dat.num_cond.unique()

In [None]:
# get various ways to index the data
# factorize(sort=True) maps the sorted unique subject ids onto 0..n_subj-1,
# so subj_idx is always a valid index into length-n_subj parameter vectors,
# even when the ids are not contiguous or do not start at zero. For ids that
# already are 0..n_subj-1 the codes equal the ids themselves.
subj_idx, subj_levels = pd.factorize(dat.subj, sort=True)
# use the numeric condition code rather than the raw condition strings,
# which cannot be used as an index
cond_idx = dat.num_cond.values

n_subj = len(subj_levels)
n_cond = len(dat.condition.unique())

In [None]:
with pm.Model() as hierarchical_model:
    # Hyperpriors for group nodes: the across-subject mean and spread of the
    # intercepts (a) and slopes (b).
    # `sd` is a standard deviation, so use 100 directly; the previous
    # 100**2 (a variance-style value) made the prior sd 10,000.
    mu_a = pm.Normal('mu_a', mu=0., sd=100)
    sigma_a = pm.HalfCauchy('sigma_a', 5)
    mu_b = pm.Normal('mu_b', mu=0., sd=100)
    sigma_b = pm.HalfCauchy('sigma_b', 5)

    # Intercept for each subject, distributed around group mean mu_a.
    # Instead of fixed values for mu and sd, a and b (vectors of length
    # n_subj) share a common group distribution.
    a = pm.Normal('a', mu=mu_a, sd=sigma_a, shape=n_subj)
    # Slope for each subject, distributed around group mean mu_b
    b = pm.Normal('b', mu=mu_b, sd=sigma_b, shape=n_subj)

    # Model error
    eps = pm.HalfCauchy('eps', 5)

    # Per-trial prediction: the trial's subject picks its own intercept and
    # slope, and the slope multiplies the numeric condition code
    log_rt_est = a[subj_idx] + b[subj_idx] * dat.num_cond.values

    # Data likelihood
    log_rt_like = pm.Normal('log_rt_like', mu=log_rt_est, sd=eps, observed=dat.log_rt)

In [None]:
# sample the posterior of the hierarchical model (2000 draws, 2 cores)
with hierarchical_model:
    trace = pm.sample(draws=2000, cores=2)

In [None]:
# plot the sampled traces for a visual check of the sampler
pm.traceplot(trace);