## Notebook setup

In [1]:
# Standard libraries
import os
import sys

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Scientific libraries
import numpy as np
import pandas as pd

import pymc3 as pm

# Internal libraries
sys.path.append('../../src')

# import lib.reconstruction.errors as errs
from lib.reconstruction.errors import get_errors_per_location
from lib.reconstruction.neighbors import get_adjacency, get_adjacency_per_location

from lib.reconstruction.bayes.data import BayesDFCompute
from lib.reconstruction.bayes.binomial import build_binomial_model

In [2]:
# Notebook configuration
pd.set_option('display.max_columns', 40)

sns.set_style('white')

colors = sns.cubehelix_palette(n_colors=2, start=0.5, hue=1, rot=.1, light=.65) 
colors += sns.cubehelix_palette(n_colors=2, start=2.5, hue=1, rot=.1, light=.65)

%matplotlib inline

## Load data

In [3]:
# Read the tidy reconstruction data; the first CSV column is used as the index
tidy = pd.read_csv('../../etc/reconstruction/tidy_data.csv', index_col=0)

# Relabel the 'Naive' condition as 'Untrained' and coerce position IDs to int
tidy['Condition'] = tidy['Condition'].replace({'Naive': 'Untrained'})
tidy['Position ID'] = tidy['Position ID'].map(int)

vals = ['Black Position', 'White Position', 'Is Real', 'Num Pieces']

# One row per Position ID, taking the first unique value of each attribute
# (presumably these attributes are constant within a position - verify)
per_position = tidy.pivot_table(index='Position ID',
                                values=vals,
                                aggfunc=lambda column: column.unique()[0])
board_set = per_position[vals]

### Data preprocessing

In [4]:
# Compute the adjacency of each location, one board (row of board_set) at a time
adjacencies = board_set.apply(get_adjacency_per_location, axis=1)

adjacency_column_names = ['adjacency_all', 'adjacency_same', 'adjacency_opposite']

# Spread each per-board result over the three named columns, keeping the
# index of board_set so the frame can be looked up by Position ID
adjacency_df = pd.DataFrame(data=adjacencies.tolist(),
                            columns=adjacency_column_names,
                            index=board_set.index)

In [5]:
def get_occupied_mask(row):
    """Return the per-location occupancy of one board as a list of ints.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'Black Position' and 'White Position'; each is iterated
        element by element and every element is parsed with ``int``.

    Returns
    -------
    list of int
        Element-wise sum of the parsed black and white positions. Empty
        position strings yield an empty list.
    """
    # np.array is the array constructor; np.stack is meant for joining arrays
    # and raises on an empty sequence.
    black = np.array([int(c) for c in row['Black Position']], dtype=int)
    white = np.array([int(c) for c in row['White Position']], dtype=int)
    return (black + white).tolist()

def get_condition_mask(condition, n_locations=36):
    """Repeat a condition label once per board location.

    Parameters
    ----------
    condition : object
        The label to repeat.
    n_locations : int, optional
        Number of repetitions; defaults to 36, the value that was previously
        hard-coded.

    Returns
    -------
    list
        A new list containing `condition` repeated `n_locations` times.
    """
    return [condition] * n_locations

# Attach two list-valued columns to every trial: the occupancy list computed
# from the row's black/white positions, and the condition label repeated per
# location. The order of these assignments fixes the column order of `tidy`.
tidy['occupied'] = tidy.apply(get_occupied_mask, axis=1)
tidy['condition_mask'] = tidy['Condition'].map(get_condition_mask)

In [6]:
# Look up each trial's adjacency entries in adjacency_df by Position ID
for _adjacency_col in ('adjacency_same', 'adjacency_opposite'):
    tidy[_adjacency_col] = tidy['Position ID'].map(adjacency_df[_adjacency_col])

In [7]:
# Auxiliary data structures

n_boards = len(board_set)

# Get a dummy array of location indices for convenience (one copy per board)
location_indices = np.arange(36, dtype=np.uint8)
board_set['location_idx'] = np.tile(location_indices, (n_boards, 1)).tolist()

# Get distances to center as a dummy field: Euclidean distance from every
# cell of a 4 x 9 grid to the grid's midpoint, in row-major cell order
blank_board = np.zeros((4, 9))
center = tuple(dim / 2 - .5 for dim in blank_board.shape)

offsets = np.argwhere(blank_board == 0) - center
distances = np.sqrt((offsets ** 2).sum(axis=1))
board_set['distance_to_center'] = np.tile(distances, (n_boards, 1)).tolist()

In [8]:
# Number the distinct subjects 0..N-1 (in the order `unique()` returns them)
# and attach that index to every trial
subject_ids = tidy['Subject ID'].unique()
unique_ids = np.arange(len(subject_ids))
subject_idx_map = {subject: idx for subject, idx in zip(subject_ids, unique_ids)}

tidy['subject_idx'] = tidy['Subject ID'].map(subject_idx_map)

In [9]:
get_model_df = BayesDFCompute()

# Build the model frame and, in the same expression, filter for occupied
# positions only (rows whose 'occupied' value is the string '1')
model_df = get_model_df(tidy, board_set).loc[lambda df: df['occupied'] == '1']

In [11]:
# Boolean row selectors over model_df: by training condition ...
trained_sel = model_df['condition_mask'].eq('Trained')
untrained_sel = model_df['condition_mask'].eq('Untrained')
# ... and by position type ('1' is treated as natural, '0' as synthetic)
natural_sel = model_df['position_type'].eq('1')
synthetic_sel = model_df['position_type'].eq('0')

### Difference between stimulus types, per condition

#### Trained

In [14]:
# TODO: figure out how to generate one big dict covering every comparison
# (original note, meaning unclear: "full of bad datas?")

# Rows for trained subjects, split by position type
trained_natural = model_df.loc[natural_sel & trained_sel]
trained_synthetic = model_df.loc[synthetic_sel & trained_sel]

pos_natural = trained_natural['position_id'].values.astype(int)
err_natural = trained_natural['errors_1'].values.astype(int)

pos_synthetic = trained_synthetic['position_id'].values.astype(int)
err_synthetic = trained_synthetic['errors_1'].values.astype(int)

# Input for build_binomial_model: per group, 'x' holds position ids and 'y'
# the matching error values
data_dict = {
    'natural': {'x': pos_natural, 'y': err_natural},
    'synthetic': {'x': pos_synthetic, 'y': err_synthetic},
}

In [15]:
# Build the model from the current data_dict (expected to be the
# natural-vs-synthetic dict for trained subjects from the preceding cell)
trained_model = build_binomial_model(data_dict)

In [None]:
# Sample the trained-subjects model: 16000 draws after 4000 tuning steps,
# on 4 cores, with a NUTS target acceptance rate of .98.
# The options are unpacked straight into pm.sample because its `nuts_kwargs=`
# keyword is deprecated; step-method arguments are passed directly instead.
nuts_kwargs = {'target_accept': .98}
with trained_model:
    trained_trace = pm.sample(16000, cores=4, tune=4000, **nuts_kwargs)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [p_synthetic, p_natural, kappa_synthetic, mu_synthetic, kappa_natural, mu_natural, rate, shape, kappa, mu]
Sampling 4 chains:  34%|███▍      | 27265/80000 [24:26<1:41:08,  8.69draws/s]

In [None]:
# Trace plots for the trained-subjects trace, for a visual check of the
# chains; despine then strips the default (top and right) axes spines
pm.traceplot(trained_trace)
sns.despine();

In [None]:
# Fraction of posterior samples of the difference in means that lie strictly
# within one posterior standard deviation of zero
samples = trained_trace.get_values('difference in means')
sd = np.std(samples)

within_one_sd = (samples > -sd) & (samples < sd)
np.count_nonzero(within_one_sd) / len(samples)

In [None]:
# Posterior histograms for the trained-subjects comparison, each with a 95%
# credible interval and a reference line at zero
posterior_vars = ['difference in means', 'difference in variances', 'effect size']
pm.plot_posterior(trained_trace,
                  var_names=posterior_vars,
                  kind='hist', credible_interval=.95, ref_val=0);

#### Untrained

In [None]:
# Per-position error data for untrained subjects, split by position type.
# Reads from model_df, the frame this notebook builds and filters earlier;
# the bayes_model_df name used before is never defined in the notebook, so
# the cell raised NameError on a fresh kernel.
# NOTE(review): 'errors_1' mirrors the error column used for the trained
# comparison - confirm it is the intended column for this comparison too.
pos_natural = model_df.loc[natural_sel & untrained_sel, 'position_id'].values.astype(int)
err_natural = model_df.loc[natural_sel & untrained_sel, 'errors_1'].values.astype(int)

pos_synthetic = model_df.loc[synthetic_sel & untrained_sel, 'position_id'].values.astype(int)
err_synthetic = model_df.loc[synthetic_sel & untrained_sel, 'errors_1'].values.astype(int)

# Per group, 'x' holds position ids and 'y' the matching error values
data_dict = {'natural': {'x': pos_natural, 'y': err_natural},
             'synthetic': {'x': pos_synthetic, 'y': err_synthetic}}

untrained_model = build_binomial_model(data_dict)

# Last expression of the cell: renders the model graph as the cell output
pm.model_to_graphviz(untrained_model)

In [None]:
# Sample the untrained-subjects model: 16000 draws after 4000 tuning steps,
# on 4 cores, with a NUTS target acceptance rate of .98.
# The options are unpacked straight into pm.sample because its `nuts_kwargs=`
# keyword is deprecated; step-method arguments are passed directly instead.
nuts_kwargs = {'target_accept': .98}
with untrained_model:
    untrained_trace = pm.sample(16000, cores=4, tune=4000, **nuts_kwargs)

In [None]:
# Trace plots for the untrained-subjects trace, for a visual check of the
# chains; despine then strips the default (top and right) axes spines
pm.traceplot(untrained_trace)
sns.despine();

In [None]:
sns.set_style('white')

# Posterior histogram of the difference in means for untrained subjects,
# with a 95% credible interval and a reference line at zero
ax = pm.plot_posterior(untrained_trace,
                       var_names=['difference in means'],
                       kind='hist', figsize=(4, 3),
                       credible_interval=.95, ref_val=0)
plt.setp(ax,
         xlabel=r'$\Delta$ error rate',
         title='Natural - Synthetic; Untrained subjects');

### Difference between conditions, per stimulus type

#### Natural

In [None]:
# Per-subject error data for natural positions, split by training condition.
# Reads from model_df, the frame this notebook builds and filters earlier;
# the bayes_model_df name used before is never defined in the notebook, so
# the cell raised NameError on a fresh kernel.
# NOTE(review): 'errors_1' mirrors the error column used for the trained
# comparison, and 'subject' is carried over unchanged - confirm both against
# the columns BayesDFCompute actually produces.
sub_trained = model_df.loc[natural_sel & trained_sel, 'subject'].values.astype(int)
err_trained = model_df.loc[natural_sel & trained_sel, 'errors_1'].values.astype(int)

sub_untrained = model_df.loc[natural_sel & untrained_sel, 'subject'].values.astype(int)
err_untrained = model_df.loc[natural_sel & untrained_sel, 'errors_1'].values.astype(int)

# Per group, 'x' holds subject ids and 'y' the matching error values
data_dict = {'trained': {'x': sub_trained, 'y': err_trained},
             'untrained': {'x': sub_untrained, 'y': err_untrained}}

natural_model = build_binomial_model(data_dict)

# Last expression of the cell: renders the model graph as the cell output
pm.model_to_graphviz(natural_model)

In [None]:
# Sample the natural-positions model: 16000 draws after 4000 tuning steps,
# on 4 cores, with a NUTS target acceptance rate of .98.
# The options are unpacked straight into pm.sample because its `nuts_kwargs=`
# keyword is deprecated; step-method arguments are passed directly instead.
nuts_kwargs = {'target_accept': .98}
with natural_model:
    natural_trace = pm.sample(16000, cores=4, tune=4000, **nuts_kwargs)

In [None]:
# Trace plots for the natural-positions trace, for a visual check of the
# chains; despine then strips the default (top and right) axes spines
pm.traceplot(natural_trace)
sns.despine();

In [None]:
sns.set_style('white')

# Posterior histogram of the difference in means for natural positions,
# with a 95% credible interval and a reference line at zero
ax = pm.plot_posterior(natural_trace,
                       var_names=['difference in means'],
                       kind='hist', figsize=(4, 3),
                       credible_interval=.95, ref_val=0)
plt.setp(ax,
         xlabel=r'$\Delta$ error rate',
         title='Trained - Untrained; Natural positions');

#### Synthetic

In [None]:
# Per-subject error data for synthetic positions, split by training condition.
# Reads from model_df, the frame this notebook builds and filters earlier;
# the bayes_model_df name used before is never defined in the notebook, so
# the cell raised NameError on a fresh kernel.
# NOTE(review): 'errors_1' mirrors the error column used for the trained
# comparison, and 'subject' is carried over unchanged - confirm both against
# the columns BayesDFCompute actually produces.
sub_trained = model_df.loc[synthetic_sel & trained_sel, 'subject'].values.astype(int)
err_trained = model_df.loc[synthetic_sel & trained_sel, 'errors_1'].values.astype(int)

sub_untrained = model_df.loc[synthetic_sel & untrained_sel, 'subject'].values.astype(int)
err_untrained = model_df.loc[synthetic_sel & untrained_sel, 'errors_1'].values.astype(int)

# Per group, 'x' holds subject ids and 'y' the matching error values
data_dict = {'trained': {'x': sub_trained, 'y': err_trained},
             'untrained': {'x': sub_untrained, 'y': err_untrained}}

synthetic_model = build_binomial_model(data_dict)

# Last expression of the cell: renders the model graph as the cell output
pm.model_to_graphviz(synthetic_model)

In [None]:
# Sample the synthetic-positions model: 16000 draws after 4000 tuning steps,
# on 4 cores, with a NUTS target acceptance rate of .98.
# The options are unpacked straight into pm.sample because its `nuts_kwargs=`
# keyword is deprecated; step-method arguments are passed directly instead.
nuts_kwargs = {'target_accept': .98}
with synthetic_model:
    synthetic_trace = pm.sample(16000, cores=4, tune=4000, **nuts_kwargs)

In [None]:
# Trace plots for the synthetic-positions trace, for a visual check of the
# chains; despine then strips the default (top and right) axes spines
pm.traceplot(synthetic_trace)
sns.despine();

In [None]:
sns.set_style('white')

# Posterior histogram of the difference in means for synthetic positions,
# with a 95% credible interval and a reference line at zero.
# This cell previously re-plotted untrained_trace under the untrained-subjects
# title, so the synthetic comparison sampled in this section was never shown;
# the title follows the wording used for the natural-positions plot.
ax = pm.plot_posterior(synthetic_trace,
                       var_names=['difference in means'],
                       ref_val=0, credible_interval=.95,
                       kind='hist',
                       figsize=(4, 3))
plt.setp(ax,
         title='Trained - Untrained; Synthetic positions',
         xlabel=r'$\Delta$ error rate');

In [None]:
# Fraction of posterior samples of the difference in means that fall strictly
# inside (-.01, .01).
# NOTE(review): this reads untrained_trace although the cell sits in the
# Synthetic section - confirm which trace is intended.
sample = untrained_trace.get_values('difference in means')

inside_band = (sample > -.01) & (sample < .01)
np.count_nonzero(inside_band) / len(sample)

In [None]:
# Fraction of `sample` values below -.01
np.count_nonzero(sample < -.01) / len(sample)

In [None]:
# Fraction of `sample` values above .01
np.count_nonzero(sample > .01) / len(sample)