## Notebook setup

In [None]:
# Standard libraries
import os
import sys

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Numerical libraries
import numpy as np
import pandas as pd
import patsy as pt
import pymc3 as pm
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf

from statsmodels.formula.api import ols
from theano import tensor as T

# Internal libraries
sys.path.append('../../src')
from lib.reconstruction.bayes.preprocessing.pivots import (load_tidy, 
                                                           compute_extra_tidy, 
                                                           compute_per_subject_pivot, 
                                                           compute_per_trial_pivot)
from lib.reconstruction.features import count_all_features

In [None]:
# Notebook configuration
# Raise pandas' display limits so wider/longer frames are shown before truncation.
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', 100)
# Optional Theano GPU flags, left disabled.
# os.environ['THEANO_FLAGS'] = 'device=cuda,floatX=float32'

# Global seaborn look applied to every figure in the notebook.
sns.set_context('paper')
sns.set_style('white')

# Four-colour palette: two colours from each of two cubehelix ramps
# (start=0.5 and start=2.5), appended in that order.
colors = sns.cubehelix_palette(n_colors=2, start=0.5, hue=1, rot=.1, light=.65) 
colors += sns.cubehelix_palette(n_colors=2, start=2.5, hue=1, rot=.1, light=.65)

# Render matplotlib figures inline in the cell output.
%matplotlib inline

## Load data

In [None]:
tidy_df, board_set_df = load_tidy('../../etc/reconstruction/tidy_data.csv')

# One position was duplicated between real/fake positions; drop it from data.
# Only positions with exactly this many rows in the tidy table are kept.
EXPECTED_ROWS_PER_POSITION = 38

rows_per_position = tidy_df.pivot_table(index='Position ID', values='Subject ID', aggfunc=len)
valid_ids = rows_per_position.loc[
    rows_per_position['Subject ID'] == EXPECTED_ROWS_PER_POSITION
].index.tolist()

# Take an explicit copy of the filtered frame: the 'Position ID' assignment below
# would otherwise write into a slice of the original (SettingWithCopyWarning).
tidy_df = tidy_df.loc[tidy_df['Position ID'].isin(valid_ids)].copy()

# Keep the matching boards, ordered by position ID and re-indexed 0..n-1.
board_set_df = (board_set_df.loc[valid_ids]
                            .sort_index()
                            .reset_index(drop=True))

# Re-number the surviving position IDs as 0..n-1 in sorted order so they line up
# with the freshly reset index of board_set_df.
upids = tidy_df.sort_values('Position ID')['Position ID'].unique()
pid_map = dict(zip(upids, range(len(upids))))
tidy_df['Position ID'] = tidy_df['Position ID'].map(pid_map)

# Derived tables, computed by the project's pivot helpers.
extra_tidy_df = compute_extra_tidy(tidy_df)
per_trial_df = compute_per_trial_pivot(extra_tidy_df)
per_subject_df = compute_per_subject_pivot(per_trial_df)

In [None]:
# Add features
# count_all_features is applied to every row of tidy_df; each result is indexed by
# feature name below and its per-feature values are concatenated across rows.
all_features = tidy_df.apply(count_all_features, axis=1)
feature_names = list(all_features.iloc[0].keys())
# Each feature name ends in a one-character suffix ('b' or 'w'); stripping it gives
# the base feature. Sorted so the model formula's term order is identical on every
# run (a bare set has no stable order for strings).
feature_base_names = sorted(set(fn[:-1] for fn in feature_names))


def _stack_feature(key):
    """Concatenate the per-row values of feature `key` into one flat array."""
    return np.concatenate(all_features.map(lambda x: x[key]).values)


for fn in feature_names:
    extra_tidy_df[f'feature_{fn}'] = _stack_feature(fn)

for bn in feature_base_names:
    # Base feature = 'b' variant + 'w' variant. The two arrays must be assigned
    # separately: a chained `w = b = ...` would overwrite b and yield 2 * w.
    b = _stack_feature(f'{bn}b')
    w = _stack_feature(f'{bn}w')
    extra_tidy_df[f'basef_{bn}'] = b + w

# Make sure the 'same' and 'opposite' columns are float-valued.
extra_tidy_df['same'] = extra_tidy_df['same'].astype(float)
extra_tidy_df['opposite'] = extra_tidy_df['opposite'].astype(float)

In [None]:
# Display the table to check the feature_* and basef_* columns added to it.
extra_tidy_df

### Data preprocessing

In [None]:
# Number of distinct values of each factor (missing values, if any, count as a level).
num_position_levels = extra_tidy_df['position_type'].nunique(dropna=False)
num_condition_levels = extra_tidy_df['condition_indicator'].nunique(dropna=False)
num_subjects = extra_tidy_df['usubject'].nunique(dropna=False)

# Cells of the full position-type x condition crossing.
num_interaction_levels = num_position_levels * num_condition_levels

In [None]:
# Look up each row's piece count: position_id is used as a key into the index of
# board_set_df['Num Pieces'].
# NOTE(review): assumes position_id holds the 0..n-1 IDs that match board_set_df's
# reset index — confirm against compute_extra_tidy.
extra_tidy_df['n_pieces'] = extra_tidy_df.position_id.map(board_set_df['Num Pieces'])

In [None]:
# Per-trial model: Sum-coded subject effect plus the Sum-coded
# condition x position-type crossing, built from per_trial_df.
formula = 'errors_2 ~ C(usubject, Sum) + C(condition_indicator, Sum) * C(position_type, Sum)'
# formula = 'errors_2 ~ C(condition_indicator, Sum) * C(position_type, Sum)'
# NOTE(review): the result is unpacked in the same order as in the Bayesian section,
# where exogenous_df is read for the outcome ('errors_2') and endogenous_df for the
# predictor columns — so the names look swapped relative to their contents, and the
# astype(int) below appears to apply to the predictors rather than the outcome.
# TODO confirm intent.
exogenous_df, endogenous_df = pt.dmatrices(formula, per_trial_df, 
                                           return_type='dataframe', 
                                           NA_action='raise')
endogenous_df = endogenous_df.astype(int)

In [None]:
# successes_2 is the complement of errors_2 within 'occupied'.
extra_tidy_df['successes_2'] = extra_tidy_df['occupied'].sub(extra_tidy_df['errors_2'])

# Right-hand-side terms of the frequentist logit; joined with ' + ' below.
# Alternative with a per-subject effect, kept for reference:
# formula = 'errors_2 ~ C(usubject, Sum) + C(condition_indicator, Sum) * C(position_type, Sum)'
logit_terms = [
    'C(condition_indicator, Sum) * C(position_type, Sum)',
    'n_pieces',
    'same',
    'opposite',
]
formula = 'errors_2 ~ ' + ' + '.join(logit_terms)

# Explicit design-matrix construction, not needed when fitting from the formula:
# exogenous_df, endogenous_df = pt.dmatrices(formula, extra_tidy_df, 
#                                            return_type='dataframe', 
#                                            NA_action='raise')
# endogenous_df = endogenous_df.astype(int)

# Fit the logit directly from the formula and show the coefficient table.
sm_model = sm.Logit.from_formula(formula, extra_tidy_df)
result = sm_model.fit()
result.summary()

### Bayesian

In [None]:
# Make sure 'same' and 'opposite' are float columns before building the design matrix.
extra_tidy_df['same'] = extra_tidy_df['same'].astype(float)
extra_tidy_df['opposite'] = extra_tidy_df['opposite'].astype(float)

# Important interactions:
# 1. color and feature
#    "Should a heuristic function distinguish between feature colors?"
# 2. training and feature
#    "Does prior experience with the game bias subjects towards features from the game model?"
# 3. same and opposite
#    "Do different colors in context have a simple impact?"
# 4. training and num_pieces (and position type?)
#    "Does game experience provide a subject with a more efficient position encoding scheme?"

# Main-effects formula. condition_indicator enters twice: once with patsy's default
# coding (that column is later used as an integer index into the coefficients) and
# once Sum-coded (used as a regressor).
formula = 'errors_2 ~ C(condition_indicator) + C(condition_indicator, Sum) + C(position_type, Sum) + '
formula += 'n_pieces + same + opposite + C(color, Sum) + '
# One term per base feature column (basef_<name>).
formula += ' + '.join(f'basef_{bn}' for bn in feature_base_names)

# Interactions
# formula += ' + ' + ' + '.join(f'basef_{bn}:C(color, Sum)' for bn in feature_base_names)
# formula += ' + ' + ' + '.join(f'basef_{bn}:C(condition_indicator, Sum)' for bn in feature_base_names)
# formula += ' + ' + 'C(condition_indicator, Sum):n_pieces'

# old
# Feature names including color
# formula += ' + '.join(f'feature_{fn}' for fn in feature_names)


# Work on a copy restricted to rows where 'occupied' equals 1.
df = extra_tidy_df.copy()
df = df.loc[df['occupied'] == 1]
# Optional centring of the continuous predictors, left disabled.
# df['n_pieces'] = df.n_pieces - df.n_pieces.max() + .5 * (df.n_pieces.max() - df.n_pieces.min())
# df['same'] = df.same - .5
# df['opposite'] = df.opposite - .5

# NOTE(review): the names are swapped relative to their contents — the model cell
# reads the outcome 'errors_2' from exogenous_df and the predictor columns from
# endogenous_df. Downstream code relies on this assignment order.
exogenous_df, endogenous_df = pt.dmatrices(formula, df, 
                                           return_type='dataframe', 
                                           NA_action='raise')

# Columns to store as integers when present; names the formula did not produce are
# skipped by the membership check below.
index_cols = ['Intercept', 
              'C(condition_indicator)[T.1]', 
              'C(position_type)[T.1]', 
              'C(condition_indicator)[T.0]:C(position_type)[T.0]']

for c in index_cols:
    if c in endogenous_df.columns:
        endogenous_df[c] = endogenous_df[c].astype(int)
    
# Display the predictor table for inspection.
endogenous_df

In [None]:
def create_coeff(name, shape=(2,), mu=0):
    """Create a Normal prior named ``a_<name>`` with unit standard deviation.

    Intended to be called inside the ``with pm.Model()`` block, as the model
    definition in this cell does.

    Parameters
    ----------
    name : str
        Suffix of the variable name; the variable is registered as ``a_<name>``.
    shape : tuple, optional
        Shape of the coefficient vector. The default ``(2,)`` gives one
        coefficient per condition.
    mu : float or array-like, optional
        Prior mean, broadcast against ``shape``. Defaults to 0.

    Returns
    -------
    The ``pm.Normal`` random variable.
    """
    # Earlier hierarchical variant, kept for reference:
    #     sigma_prior = pm.Gamma(f'sigma_prior_{name}', 1.64, .32)
    #     sigma = pm.Gamma(f'sigma_{name}', 1.64, .32)
    #     a_prior = pm.Normal(f'a_prior_{name}', mu=mu, sigma=sigma, shape=shape)

    # Honour the `mu` argument: a hard-coded (0, 0) mean ignores it and only
    # broadcasts against shapes whose last dimension is 2.
    return pm.Normal(f'a_{name}', mu=mu, sigma=1, shape=shape)


# Per-row condition code, used below to pick which of the two entries of each
# shape-(2,) coefficient applies to that row.
# NOTE(review): relies on this column having been cast to int in the design-matrix
# cell; a float column cannot be used as an index.
condition = endogenous_df['C(condition_indicator)[T.1]'].values


# Logistic regression of errors_2 in which every predictor except condition itself
# gets a separate coefficient per condition.
with pm.Model() as logistic_anova:
    b0 = pm.Normal(f'intercept', mu=0, sigma=2)
    
    # Coefficients
    # Condition enters as a single scalar coefficient on its Sum-coded column.
#     b_condition = create_coeff('condition')
    b_condition = pm.Normal(f'a_condition', mu=0, sigma=2)
    
    # Position level
    b_position = create_coeff('position')
    b_n_pieces = create_coeff('n_pieces')
    
    # Piece level
    b_color = create_coeff('color')
    b_same = create_coeff('same')
    b_opposite = create_coeff('opposite')
    
    # One coefficient pair per base feature; these names must match the basef_*
    # columns read in the regression equation below.
    b_f1101 = create_coeff('f1101')
    b_f1001 = create_coeff('f1001')
    b_f1111 = create_coeff('f1111')
    b_f1010 = create_coeff('f1010')
    b_f1100 = create_coeff('f1100')
    b_f1110 = create_coeff('f1110')
    
    # Regression equation
    # Linear predictor on the logit scale; `b_x[condition]` selects, row by row, the
    # coefficient belonging to that row's condition.
    mu = b0
    mu += b_condition * endogenous_df['C(condition_indicator, Sum)[S.0]'].values
    mu += b_position[condition] * endogenous_df['C(position_type, Sum)[S.0]'].values
    mu += b_n_pieces[condition] * endogenous_df.n_pieces.values
    mu += b_color[condition] * endogenous_df['C(color, Sum)[S.-1]'].values
    mu += b_same[condition] * endogenous_df.same.values
    mu += b_opposite[condition] * endogenous_df.opposite.values
    mu += b_f1101[condition] * endogenous_df.basef_1101.values
    mu += b_f1001[condition] * endogenous_df.basef_1001.values
    mu += b_f1111[condition] * endogenous_df.basef_1111.values
    mu += b_f1010[condition] * endogenous_df.basef_1010.values
    mu += b_f1100[condition] * endogenous_df.basef_1100.values
    mu += b_f1110[condition] * endogenous_df.basef_1110.values
    
    # Map the linear predictor to a probability.
    mu = pm.invlogit(mu)
    
    # Bernoulli likelihood on the observed outcome column.
    # NOTE(review): 'tagerts' looks like a typo for 'targets'; left unchanged because
    # it is the variable's registered name in the model.
    y = pm.Bernoulli('tagerts', p=mu, observed=exogenous_df['errors_2'].values)

In [None]:
# Fixed seed so that Restart & Run All reproduces the same posterior draws.
RANDOM_SEED = 42

with logistic_anova:
    # NUTS with a high acceptance target and a tree-depth cap of 10.
    step = pm.NUTS(target_accept=.99, max_treedepth=10)
    # 6 chains on 6 cores: 2000 tuning steps, then 2500 draws per chain.
    trace = pm.sample(2500, step, tune=2000, chains=6, cores=6,
                      random_seed=RANDOM_SEED)

In [None]:
# Trace plots for a subset of the model's variables, as a visual sampling check.
sns.set_style('white')
pm.traceplot(trace, var_names=['intercept', 'a_condition', 'a_position', 'a_n_pieces'])
# Trailing semicolon suppresses the cell's text output.
sns.despine();

In [None]:
sns.set_style('white')

# A single panel is enough: only one distribution is drawn in this cell.
fig, ax = plt.subplots(figsize=(4, 3), dpi=150)

# 'a_position' holds one coefficient per condition (shape (2,)), so its samples form
# a 2-column array that can be differenced column-wise. 'a_condition' is a scalar
# variable; indexing its samples with [:, 0] raises an IndexError.
position = trace.get_values('a_position')

sns.distplot(position[:, 0] - position[:, 1], ax=ax)
ax.set(xlabel='a_position[0] - a_position[1]',
       ylabel='Density',
       title='Posterior difference in position coefficient between conditions')
sns.despine()
plt.tight_layout();

In [None]:
sns.set_style('white')

# One row of four panels: intercept (1), a_condition (1) and a_position (2).
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(16, 3), dpi=150)

# Earlier, longer variable lists kept for reference:
# var_names = ['intercept', 'a_position', 'a_condition', 
#              'a_num_pieces', 'a_interaction', 'a_same', 'a_color',
#              'a_opposite', 'a_features']

# var_names += [f'a_feature_{fn}' for fn in feature_names]

var_names = ['intercept', 'a_condition', 'a_position']

# Histogram posteriors with a reference line at 0 and a +/-0.01 ROPE.
pm.plot_posterior(
    trace,
    var_names=var_names,
    ax=axes,
    kind='hist',
    color='#87ceeb',
    ref_val=0,
    rope=(-.01, .01),
    round_to=3,
)

sns.despine()
plt.tight_layout();

In [None]:
sns.set_style('white')

# 3 x 3 grid of panels for the piece- and position-level covariates.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(12, 9), dpi=150)

# Earlier, longer variable lists kept for reference:
# var_names = ['intercept', 'a_position', 'a_condition', 
#              'a_num_pieces', 'a_interaction', 'a_same', 'a_color',
#              'a_opposite', 'a_features']

# var_names += [f'a_feature_{fn}' for fn in feature_names]

var_names = ['a_n_pieces', 'a_same', 'a_opposite', 'a_color',] # 'a_condition_x_n_pieces', 'a_same_x_opposite']

# Histogram posteriors with a reference line at 0 and a +/-0.01 ROPE.
pm.plot_posterior(
    trace,
    var_names=var_names,
    ax=axes,
    kind='hist',
    color='#87ceeb',
    ref_val=0,
    rope=(-.01, .01),
    round_to=3,
)

sns.despine()
plt.tight_layout();

In [None]:
sns.set_style('white')

# 6 x 3 grid of panels for the per-feature coefficients.
fig, axes = plt.subplots(nrows=6, ncols=3, figsize=(12, 18), dpi=150)

# Trace variable names follow the 'a_f<base feature>' pattern.
var_names = list(map('a_f{}'.format, feature_base_names))

# Histogram posteriors with a reference line at 0 and a +/-0.01 ROPE.
pm.plot_posterior(
    trace,
    var_names=var_names,
    ax=axes,
    kind='hist',
    color='#87ceeb',
    ref_val=0,
    rope=(-.01, .01),
    round_to=3,
)

sns.despine()
plt.tight_layout();