## Notebook setup

In [1]:
# Standard libraries
import os
import sys

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Numerical libraries
import jax.numpy as jnp
import numpy as np
import pandas as pd
import patsy as pt

import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf

import numpyro
import numpyro.distributions as dist

from numpyro import handlers
from numpyro.diagnostics import hpdi
from numpyro.mcmc import MCMC, NUTS

from statsmodels.formula.api import ols

# Internal libraries
sys.path.append('../../src')
from lib.reconstruction.bayes.preprocessing.pivots import (load_tidy, 
                                                           compute_extra_tidy, 
                                                           compute_per_subject_pivot, 
                                                           compute_per_trial_pivot)

In [2]:
# Notebook configuration
pd.set_option('display.max_columns', 40)
# os.environ['THEANO_FLAGS'] = 'device=cuda,floatX=float32'

sns.set_context('paper')
sns.set_style('white')

colors = sns.cubehelix_palette(n_colors=2, start=0.5, hue=1, rot=.1, light=.65) 
colors += sns.cubehelix_palette(n_colors=2, start=2.5, hue=1, rot=.1, light=.65)

%matplotlib inline

## Load data

In [3]:
# Location of the tidy reconstruction data, relative to this notebook
TIDY_DATA_PATH = '../../etc/reconstruction/tidy_data.csv'

# Each frame below is computed from the one produced on the previous line
tidy_df, board_set_df = load_tidy(TIDY_DATA_PATH)
extra_tidy_df = compute_extra_tidy(tidy_df)
per_trial_df = compute_per_trial_pivot(extra_tidy_df)
per_subject_df = compute_per_subject_pivot(per_trial_df)

### Data preprocessing

In [4]:
def _count_distinct(column):
    """Return how many distinct values `column` of extra_tidy_df holds."""
    return len(extra_tidy_df[column].unique())


# Factor sizes, counted from the distinct values present in the tidy data
num_position_levels = _count_distinct('position_type')
num_condition_levels = _count_distinct('condition_indicator')
num_subjects = _count_distinct('usubject')

# One interaction level per (position, condition) pair
num_interaction_levels = num_position_levels * num_condition_levels

def get_model_inputs(df, target_column, count_column, subject_column='usubject'):
    """Extract the response, counts and integer factor codes from a data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains `target_column`, `count_column`, `subject_column`
        and the columns `position_type`, `condition_indicator` and
        `interaction`.
    target_column : str
        Name of the column returned as `y`.
    count_column : str
        Name of the column returned as `n`.
    subject_column : str, optional
        Name of the column holding the subject identifier; defaults to
        'usubject'.

    Returns
    -------
    tuple
        `(y, n, num_obs, p_cat, c_cat, i_cat, s_cat)` where `y` and `n` are
        the raw column values, `num_obs` is `len(y)`, and the four `*_cat`
        arrays are the categorical codes of the position, condition,
        interaction and subject columns respectively.
    """
    def _codes(column):
        # Map each distinct value of the column to a code 0..k-1
        return df[column].astype('category').cat.codes.values

    y = df[target_column].values
    n = df[count_column].values
    num_obs = len(y)

    p_cat = _codes('position_type')
    c_cat = _codes('condition_indicator')
    i_cat = _codes('interaction')
    # Subject codes must come from the subject identifier. They were
    # previously read from the interaction column, duplicating i_cat.
    s_cat = _codes(subject_column)

    return y, n, num_obs, p_cat, c_cat, i_cat, s_cat
    

# Per-subject model inputs: 'errors_2' is the target, 'occupied' the count
model_inputs = get_model_inputs(per_subject_df,
                                target_column='errors_2',
                                count_column='occupied')
y, n, num_obs, p_cat, c_cat, i_cat, s_cat = model_inputs

In [5]:
# Per-trial design matrices for a sum-coded model that includes a subject term
formula = 'errors_2 ~ C(usubject, Sum) + C(condition_indicator, Sum) * C(position_type, Sum)'
# formula = 'errors_2 ~ C(condition_indicator, Sum) * C(position_type, Sum)'

# patsy.dmatrices returns (outcome, predictors): the left-hand side of the
# formula comes first. The names were previously unpacked the other way
# round, so the integer cast below hit the design matrix instead of the
# response.
endogenous_df, exogenous_df = pt.dmatrices(formula, per_trial_df,
                                           return_type='dataframe',
                                           NA_action='raise')
endogenous_df = endogenous_df.astype(int)

In [6]:
# Complement of the error count; recomputed from existing columns, so
# re-running this cell gives the same result.
extra_tidy_df['successes_2'] = extra_tidy_df['occupied'] - extra_tidy_df['errors_2']
# formula = 'errors_2 ~ C(usubject, Sum) + C(condition_indicator, Sum) * C(position_type, Sum)'
formula = 'errors_2 ~ C(condition_indicator, Sum) * C(position_type, Sum)'

# patsy.dmatrices returns (outcome, predictors): the left-hand side of the
# formula comes first. The names were previously unpacked the other way
# round, so the integer cast below hit the design matrix instead of the
# response.
endogenous_df, exogenous_df = pt.dmatrices(formula, extra_tidy_df,
                                           return_type='dataframe',
                                           NA_action='raise')
endogenous_df = endogenous_df.astype(int)

# sm_model = sm.GLM(endogenous_df, extra_tidy_df[['successes_2', 'errors_2']], 
#                   family=Bernoulli())

# The Logit model is built straight from the formula and the frame; it does
# not consume the design matrices computed above.
sm_model = sm.Logit.from_formula(formula, extra_tidy_df)

In [7]:
# Fit the Logit model built from the formula; the optimiser report is printed as cell output
result = sm_model.fit()

Optimization terminated successfully.
         Current function value: 0.328225
         Iterations 6


In [8]:
# Display the fit summary tables for the Logit result
result.summary()

0,1,2,3
Dep. Variable:,errors_2,No. Observations:,131328.0
Model:,Logit,Df Residuals:,131324.0
Method:,MLE,Df Model:,3.0
Date:,"Sat, 28 Sep 2019",Pseudo R-squ.:,0.006555
Time:,15:59:41,Log-Likelihood:,-43105.0
converged:,True,LL-Null:,-43390.0
Covariance Type:,nonrobust,LLR p-value:,5.792e-123

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.1894,0.009,-236.471,0.000,-2.208,-2.171
"C(condition_indicator, Sum)[S.0]",0.0663,0.009,7.162,0.000,0.048,0.084
"C(position_type, Sum)[S.0]",0.2079,0.009,22.452,0.000,0.190,0.226
"C(condition_indicator, Sum)[S.0]:C(position_type, Sum)[S.0]",0.0048,0.009,0.514,0.607,-0.013,0.023


In [9]:
# Fitted coefficients on their own, shown with more decimal places than the summary table
result.params

Intercept                                                     -2.189402
C(condition_indicator, Sum)[S.0]                               0.066306
C(position_type, Sum)[S.0]                                     0.207879
C(condition_indicator, Sum)[S.0]:C(position_type, Sum)[S.0]    0.004756
dtype: float64

In [10]:
# Inspect the position_type category codes returned by get_model_inputs
p_cat

array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1], dtype=int8)

In [None]:
def get_coefficient(name, shape=1):
    """Sample a zero-centred Normal coefficient with its own sampled scale.

    Registers two numpyro sample sites, in this order:
    `sigma_{name}`, drawn from Gamma(1.64, .32), and `a_{name}`, drawn from
    a Normal with mean `jnp.zeros(shape)` and scale `sigma_{name}`.

    Parameters
    ----------
    name : str
        Suffix used to build the two site names.
    shape : int or tuple of int, optional
        Shape passed to `jnp.zeros` for the Normal's mean; defaults to 1.

    Returns
    -------
    The sampled value of the `a_{name}` site.
    """
    scale = numpyro.sample(f'sigma_{name}', dist.Gamma(1.64, .32))
    coefficient_prior = dist.Normal(jnp.zeros(shape), scale)
    return numpyro.sample(f'a_{name}', coefficient_prior)

def model(condition=None, position=None, subject=None):
    intercept = numpyro.sample('a', dist.Normal(0, 1))
    a_condition = get_coefficient('condition')