# Lesson 6: Bayesian Data Analysis

## Intro to Quantified Cognition

## Lesson plan

- Deep dive into various Bayesian models of one dataset
- Explore other datasets
- Real-world data analysis in teams


In [None]:
# load matplotlib inline mode
%matplotlib inline

# import some useful libraries
import numpy as np                # numerical analysis linear algebra
import pandas as pd               # efficient tables
import matplotlib.pyplot as plt   # plotting
from scipy import stats

import pymc3 as pm
import theano.tensor as T

from sklearn import preprocessing

import dists

## Hierarchical Regression

What should we do when we have multiple groups/subjects?

Ideally, we should share information across groups so that what we learn from the whole sample informs the model fit to each individual.

This is called multi-level or hierarchical modeling.

## Flanker dataset

We will explore a number of ways to analyze a flanker task dataset. 

You can then use these as a guide when you analyze some of the other datasets we have available.

In [None]:
# load the flanker dataset
dat = pd.read_csv('data/flanker_dat.csv')

# Drop rows with a missing reaction time. Take an explicit copy: columns are
# added to `dat` in later cells, and assigning into a filtered slice of the
# original frame raises pandas' SettingWithCopyWarning.
dat = dat[dat['rt'].notna()].copy()

# preview the first few rows
dat.head()

In [None]:
# Encode the subject and condition labels as integer codes so they can be
# used to index per-subject / per-condition model parameters.
# see: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
le = preprocessing.LabelEncoder()

# subjects: integer code per row and number of distinct subjects
subj_idx = le.fit_transform(dat['subj'])
n_subj = len(dat['subj'].unique())

# conditions: the same encoder object is re-fit here, so afterwards `le`
# holds the condition classes, not the subject classes
cond_idx = le.fit_transform(dat['condition'])
n_cond = len(dat['condition'].unique())


In [None]:
# reaction times are analyzed on the log scale from here on
dat['log_rt'] = np.log(dat['rt'])

# stimulus-location features: absolute x/y coordinates and the angle of
# the (loc_x, loc_y) position, in radians and in degrees
dat['abs_x'] = dat['loc_x'].abs()
dat['abs_y'] = dat['loc_y'].abs()
dat['angle_rad'] = np.arctan2(dat['loc_y'], dat['loc_x'])
dat['angle_deg'] = dat['angle_rad']*180/np.pi

# mean log RT as a function of presentation angle
dat.groupby('angle_deg')['log_rt'].mean().plot()


# list the columns now available
dat.columns

In [None]:
# histogram of log RT: the distribution is roughly normal on this scale
plt.hist(dat['log_rt'].to_numpy(), bins='auto');

## Explore effects on reaction time

In [None]:
# mean log RT within each condition
dat.groupby('condition')['log_rt'].mean()

In [None]:
# grand mean and spread of log RT (reused below to scale the priors)
print('Overall mean:', dat['log_rt'].mean())
print('Overall std:', dat['log_rt'].std())

In [None]:
# Plot the Gamma(.5, .5) density that is used below as the prior on the
# half-Cauchy scale parameter
x = np.linspace(1, 50, 100)
gamma_logp = pm.Gamma.dist(alpha=.5, beta=.5).logp(x).eval()
plt.plot(x, np.exp(gamma_logp))

### Hierarchical version of *t*-test

In [None]:
# Hierarchical analogue of a t-test on log RT: every subject gets its own
# mean and sd in each condition, and those are drawn from condition-level
# distributions
with pm.Model() as hierarchical_model:

    # condition-level hyperpriors for the subject means, centered on the
    # overall mean of the data
    mu_mu = pm.Normal('mu_mu',
                      mu=dat.log_rt.mean(),
                      sd=dat.log_rt.std()*2,
                      shape=n_cond)
    sigma_mu = pm.HalfCauchy('sigma_mu', beta=10, shape=n_cond)

    # subject-by-condition means
    mu = pm.Normal('mu', mu=mu_mu, sd=sigma_mu,
                   shape=(n_subj, n_cond))

    # condition-level hyperprior on the sd scale (Gelman suggests gamma prior)
    sd_scale = pm.Gamma('sd_scale', alpha=.5, beta=.5,
                        shape=n_cond)

    # subject-by-condition sds
    sd = pm.HalfCauchy('sd', beta=sd_scale,
                       shape=(n_subj, n_cond))

    # degrees of freedom, shared by all observations; the +1 shifts the
    # exponential so that nu is always greater than 1
    nu = pm.Exponential('df_minus_one', lam=1/29.) + 1.

    # likelihood: each observation uses the mean and sd of its own
    # subject/condition cell
    log_rt_like = pm.StudentT('log_rt_like',
                              nu=nu,
                              mu=mu[subj_idx, cond_idx],
                              sd=sd[subj_idx, cond_idx],
                              observed=dat.log_rt)

    # track the pairwise differences between the condition-level means
    mudiff01 = pm.Deterministic('mu_diff_01', mu_mu[1]-mu_mu[0])
    mudiff02 = pm.Deterministic('mu_diff_02', mu_mu[2]-mu_mu[0])
    mudiff12 = pm.Deterministic('mu_diff_12', mu_mu[2]-mu_mu[1])

In [None]:
# draw posterior samples for the hierarchical t-test model
with hierarchical_model:
    trace = pm.sample(draws=2000, cores=2)

In [None]:
# Visual check of the sampled traces for the model fit above; the trailing
# semicolon suppresses the notebook's text output for the returned object.
pm.traceplot(trace);

In [None]:
# posterior of each pairwise condition difference, with zero (no
# difference) marked as the reference value
pm.plot_posterior(
    trace,
    varnames=['mu_diff_01', 'mu_diff_12', 'mu_diff_02'],
    ref_val=0.0,
);

### Hierarchical version of linear model

In [None]:
# need to make a numerical condition variable:
# '=' -> 1, '~' -> 2, and every other condition label -> 0
dat['num_cond'] = (dat['condition']
                   .map({'=': 1, '~': 2})
                   .fillna(0)
                   .astype('int64'))
dat.num_cond.unique()

In [None]:
# Hierarchical linear regression of log RT on the numeric condition code,
# with a separate intercept and slope for every subject
with pm.Model() as hierarchical_linear_model:
    # group-level hyperpriors for the intercepts
    mu_inter = pm.Normal('mu_inter',
                         mu=dat.log_rt.mean(),
                         sd=dat.log_rt.std()*10)
    sigma_inter = pm.HalfCauchy('sigma_inter', beta=10)

    # group-level hyperpriors for the slopes (centered on no effect)
    mu_slope = pm.Normal('mu_slope',
                         mu=0.,
                         sd=dat.log_rt.std()*10)
    sigma_slope = pm.HalfCauchy('sigma_slope', beta=10)

    # per-subject intercepts, distributed around the group mean
    intercept = pm.Normal('intercept',
                          mu=mu_inter, sd=sigma_inter,
                          shape=n_subj)

    # per-subject slopes, distributed around the group mean
    slope = pm.Normal('slope',
                      mu=mu_slope, sd=sigma_slope,
                      shape=n_subj)

    # residual noise, shared across subjects
    eps = pm.HalfCauchy('eps', beta=10)

    # predicted mean for each observation from its subject's line
    log_rt_est = intercept[subj_idx] + slope[subj_idx] * dat.num_cond.values

    # Data likelihood (could also replace with Student's t)
    log_rt_like = pm.Normal('log_rt_like',
                            mu=log_rt_est, sd=eps,
                            observed=dat.log_rt)

    # Alternative: Student's t likelihood
    # prior on df (fixed for all params)
    #nu = pm.Exponential('df_minus_one', 1/29.) + 1.
    #log_rt_like = pm.StudentT('log_rt_like', mu=log_rt_est, sd=eps, nu=nu, observed=dat.log_rt)


In [None]:
# draw posterior samples for the hierarchical linear model
with hierarchical_linear_model:
    trace = pm.sample(draws=2000, cores=2)

In [None]:
# Visual check of the sampled traces for the linear model fit above; the
# trailing semicolon suppresses the notebook's text output.
pm.traceplot(trace);

In [None]:
# Posterior of the group-level slope, with zero (no condition effect) as the
# reference value. This model names its slope hyperparameter 'mu_slope';
# 'mu_slope_cond' and 'mu_slope_y' are not variables of this model, so
# requesting them would fail.
pm.plot_posterior(trace, varnames=['mu_slope'], ref_val=0.0);

In [None]:
# WAIC of the fitted linear model
with hierarchical_linear_model:
    waic_result = pm.waic(trace)
    print(waic_result)
    

### More-complicated linear model

In [None]:
# Hierarchical linear regression of log RT with per-subject intercepts and
# condition slopes. The terms for a second predictor (abs_y) are present
# but commented out, so only the condition slope is currently in the model.
with pm.Model() as hierarchical_linear_model:
    # group-level hyperpriors for the intercepts
    mu_inter = pm.Normal('mu_inter',
                         mu=dat.log_rt.mean(),
                         sd=dat.log_rt.std()*10)
    sigma_inter = pm.HalfCauchy('sigma_inter', beta=10)

    # group-level hyperpriors for the slopes (centered on no effect)
    mu_slope_cond = pm.Normal('mu_slope_cond',
                              mu=0.,
                              sd=dat.log_rt.std()*10)
    sigma_slope_cond = pm.HalfCauchy('sigma_slope_cond', beta=10)
    #mu_slope_y = pm.Normal('mu_slope_y', mu=0., sd=dat.log_rt.std()*10)
    #sigma_slope_y = pm.HalfCauchy('sigma_slope_y', 10)

    # per-subject intercepts, distributed around the group mean
    intercept = pm.Normal('intercept',
                          mu=mu_inter, sd=sigma_inter,
                          shape=n_subj)

    # per-subject slopes, distributed around the group mean
    slope_cond = pm.Normal('slope_cond',
                           mu=mu_slope_cond, sd=sigma_slope_cond,
                           shape=n_subj)
    #slope_y = pm.Normal('slope_y', mu=mu_slope_y, sd=sigma_slope_y, shape=n_subj)

    # residual noise, shared across subjects
    eps = pm.HalfCauchy('eps', beta=10)

    # predicted mean for each observation from its subject's line
    # (to include abs_y, add: slope_y[subj_idx] * dat.abs_y.values)
    log_rt_est = (intercept[subj_idx] +
                  slope_cond[subj_idx] * dat.num_cond.values)

    # Data likelihood (could also replace with Student's t)
    log_rt_like = pm.Normal('log_rt_like',
                            mu=log_rt_est, sd=eps,
                            observed=dat.log_rt)

    # Alternative: Student's t likelihood
    # prior on df (fixed for all params)
    #nu = pm.Exponential('df_minus_one', 1/29.) + 1.
    #log_rt_like = pm.StudentT('log_rt_like', mu=log_rt_est, sd=eps, nu=nu, observed=dat.log_rt)


In [None]:
# draw posterior samples for the second hierarchical linear model
with hierarchical_linear_model:
    trace = pm.sample(draws=2000, cores=2)

In [None]:
# Posterior of the group-level condition slope, with zero (no effect) as the
# reference value. 'mu_slope_y' is commented out of the model definition, so
# it is not in the trace and requesting it would fail; add it back to this
# list if the abs_y terms are re-enabled in the model.
pm.plot_posterior(trace, varnames=['mu_slope_cond'], ref_val=0.0);

In [None]:
# WAIC of the fitted second linear model
with hierarchical_linear_model:
    waic_result = pm.waic(trace)
    print(waic_result)

## Evaluate percent correct

In [None]:
# mean of the `correct` column within each condition
dat.groupby('condition')['correct'].mean()

In [None]:
# same per-condition means of `correct`, drawn as a line plot
dat.groupby('condition')['correct'].mean().plot()

In [None]:
# Plot a candidate prior curve over [0, 1]. The active line is the
# half-Cauchy(0.1) density; the commented-out lines are alternative curves
# that can be swapped in for comparison.
x = np.linspace(0, 1, 100)
#y = np.exp(pm.Beta.dist(mu=.75, sd=.1).logp(x).eval())
y = np.exp(pm.HalfCauchy.dist(beta=.1).logp(x).eval())
#y = np.sqrt(x*(1-x))
#y = np.exp(pm.TruncatedNormal.dist(mu=dat.correct.mean(), 
#                              sd=dat.correct.std()*2, 
#                              lower=0.0, upper=1.0).logp(x).eval())
plt.plot(x, y)

In [None]:
# Hierarchical model of response accuracy: each subject has a probability of
# a correct response in each condition, drawn from a condition-level Beta
with pm.Model() as perf_model:

    # condition-level mean accuracy, restricted to [0, 1] and centered on
    # the overall mean of `correct`
    mu = pm.TruncatedNormal('mu',
                            mu=dat.correct.mean(),
                            sd=dat.correct.std()*2,
                            lower=0.0, upper=1.0,
                            shape=n_cond)

    # condition-level spread of the subject accuracies
    sd = pm.HalfCauchy('sd', beta=0.1, shape=n_cond)

    # subject-by-condition probability of a correct response
    # (this is a prior over per-subject accuracy, not the likelihood)
    prob = pm.Beta('beta',
                   mu=mu, sd=sd,
                   shape=(n_subj, n_cond))

    # likelihood: every observed response is a single trial (n=1) with the
    # success probability of its subject/condition cell
    # (assumes `correct` is coded 0/1 - TODO confirm)
    perf = pm.Binomial('perf',
                       n=1,
                       p=prob[subj_idx, cond_idx],
                       observed=dat.correct)

    # track the pairwise differences between condition means ...
    mudiff01 = pm.Deterministic('mu_diff_01', mu[1]-mu[0])
    mudiff02 = pm.Deterministic('mu_diff_02', mu[2]-mu[0])
    mudiff12 = pm.Deterministic('mu_diff_12', mu[2]-mu[1])

    # ... and between condition sds
    sddiff01 = pm.Deterministic('sd_diff_01', sd[1]-sd[0])
    sddiff02 = pm.Deterministic('sd_diff_02', sd[2]-sd[0])
    sddiff12 = pm.Deterministic('sd_diff_12', sd[2]-sd[1])


In [None]:
# draw posterior samples for the accuracy model
with perf_model:
    trace = pm.sample(draws=2000, cores=2)

In [None]:
# Visual check of the sampled traces for the accuracy model fit above; the
# trailing semicolon suppresses the notebook's text output.
pm.traceplot(trace);

In [None]:
# posterior of each pairwise condition difference in mean and in sd, with
# zero (no difference) marked as the reference value
pm.plot_posterior(
    trace,
    varnames=[
        'mu_diff_01', 'mu_diff_12', 'mu_diff_02',
        'sd_diff_01', 'sd_diff_12', 'sd_diff_02',
    ],
    ref_val=0.0,
);

In [None]:
# Sampled values of the 'beta' variable; the model declares it with shape
# (n_subj, n_cond), so the axes here are (sample, subject, condition)
probs = trace.get_values('beta')

# for the first subject, overlay histograms of the sampled differences
# between condition 0 and 1, and between condition 1 and 2
subj0 = probs[:, 0, :]
for first, second in [(0, 1), (1, 2)]:
    plt.hist(subj0[:, first] - subj0[:, second], bins='auto', alpha=.3);

In [None]:
# For every subject (axis 1 of `probs`), the fraction of samples in which
# the condition-0 value exceeds the condition-1 value; then show how those
# fractions are distributed across subjects
pdiff = list((probs[:, :, 0] > probs[:, :, 1]).mean(axis=0))
plt.hist(pdiff, bins='auto');