# Preprocess

This notebook should really be split into a set of utility libraries and scripts,
but... well, at least I've learned better by now.

In [5]:
import pandas as pd
import numpy as np
import json
import ast
import re
from utils import Labeler
import os
import seaborn as sns
import matplotlib.pyplot as plt

## Saving variables

In [2]:
# Keep an EXPERIMENT value that is already defined in this namespace
# (e.g. set before this notebook is run); otherwise default to 2.
try:
    EXPERIMENT
except NameError:
    EXPERIMENT = 2

In [3]:
def load_variables(path='variables.csv'):
    """Load the saved key -> value store from ``path`` as a Series.

    The file is read as a headerless two-column CSV (key, value).
    A missing or empty file yields an empty Series; any other problem
    (e.g. a malformed file) is raised instead of being silently swallowed,
    because starting from an empty store would make the next save overwrite
    every previously recorded variable.

    NOTE(review): assumes the file was written without a header row --
    confirm against how ``variables.csv`` is saved.
    """
    try:
        # read_csv replaces Series.from_csv, which no longer exists in
        # current pandas releases.
        table = pd.read_csv(path, index_col=0, header=None)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        return pd.Series(dtype=object)
    series = table.iloc[:, 0]
    # Drop the positional names (0 / 1) that read_csv assigns.
    series.index.name = None
    series.name = None
    return series


VARIABLES = load_variables()

def writevar(key, val):
    """Record ``val`` in VARIABLES under an experiment-prefixed key and echo it.

    The stored key is ``EXP<EXPERIMENT>_<key>``.
    """
    prefixed = 'EXP{}_{}'.format(EXPERIMENT, key)
    VARIABLES[prefixed] = val
    print('{} = {}'.format(prefixed, val))

def sed_escape(val):
    """Return ``str(val)`` escaped for the replacement side of ``s/.../.../``.

    Backslashes are doubled first, then ``&`` (which sed expands to the
    whole match) and ``/`` (the delimiter used by the generated commands)
    are backslash-escaped.
    """
    text = str(val).replace('\\', '\\\\')
    for special in '&/':
        text = text.replace(special, '\\' + special)
    return text


def savevars():
    """Persist VARIABLES to ``variables.csv`` and to a sed script for the paper.

    The sed script contains one ``s/`KEY`/VALUE/g`` command per variable.
    """
    # header=False pins the headerless key,value layout regardless of the
    # pandas version's default for Series.to_csv.
    VARIABLES.to_csv('variables.csv', header=False)
    with open('../paper/variables.sed', 'w') as f:
        for key, val in VARIABLES.items():
            f.write('s/`{}`/{}/g'.format(key, sed_escape(val)) + '\n')

def write_lm_var(model, var, name):
    """Record coefficient, standard error and p-value of ``var`` from ``model``.

    ``model`` must expose ``params``, ``bse`` and ``pvalues`` indexable by
    ``var`` (presumably a fitted statsmodels result -- confirm with callers).
    Writes ``<name>_BETA``, ``<name>_SE``, ``<name>_P`` and a LaTeX summary
    ``<name>_RESULT`` through ``writevar``.
    """
    beta = np.round(model.params[var], 2)
    se = np.round(model.bse[var], 2)
    p = model.pvalues[var]

    # Only very small p-values get an inequality; everything else is
    # reported rounded to three decimals.
    p_desc = 'p < 0.001' if p < .001 else 'p = {}'.format(np.round(p, 3))

    for template, value in (('{}_BETA', beta), ('{}_SE', se), ('{}_P', p)):
        writevar(template.format(name), value)

    # NOTE(review): this is a raw string, so every `\\` below is two literal
    # backslashes in the stored value -- confirm that is what the downstream
    # sed/LaTeX step expects.
    summary = r'$\\beta = %s,\\ \\text{SE} = %s,\\ %s$' % (beta, se, p_desc)
    writevar('{}_RESULT'.format(name), summary)

## Load Data

In [4]:
from glob import glob
def get_data():
    """Read every CSV of the current experiment into a dict of DataFrames.

    Keys are the file names without their 4-character ``.csv`` suffix.
    """
    pattern = '../data/{}/*.csv'.format(EXPERIMENT)
    return {
        os.path.basename(path)[:-4]: pd.read_csv(path)
        for path in glob(pattern)
    }

def misformatted(df, n_trials=138):
    """Return the set of pids whose number of non-null ``correct`` values
    differs from ``n_trials``.

    Parameters
    ----------
    df : DataFrame with ``pid`` and ``correct`` columns.
    n_trials : expected number of recorded responses per participant.
    """
    # Take the pids from the groupby result itself: groupby sorts its keys,
    # whereas df.pid.unique() is in order of appearance, so combining the
    # two can flag the wrong participants.
    counts = df.groupby('pid').correct.count()  # count() ignores NaN
    return set(counts[counts != n_trials].index)

def failed_catch(df, max_fails=1):
    """Return the set of pids with more than ``max_fails`` incorrect
    responses on rows whose ``kind`` is 'control'."""
    control = df[df.kind == 'control']
    # Per-participant count of control rows where `correct` equals False.
    n_failed = control.correct.eq(False).groupby(control.pid).sum()
    return set(n_failed[n_failed > max_fails].index)


# Load every CSV for the current experiment, then decide who is excluded.
data = get_data()
# Participants with more control-trial failures than failed_catch allows.
caught_pid = failed_catch(data['ball'])

# "Started" = has rows in the ball table; "finished" = has rows in the
# survey-text table.
started_pid = list(data['ball'].pid.unique())
finished_pid = list(data['survey-text'].pid.unique())
unfinished_pid = set(started_pid) - set(finished_pid)
# Keep only participant records for people who produced ball-task data.
data['participants'] = data['participants'].query('pid == @started_pid')
writevar('N_PARTICIPANT', len(data['participants']))
writevar('N_UNFINISHED', len(unfinished_pid))
# Count catch failures only among those who finished, so the two exclusion
# counts do not overlap.
writevar('N_CAUGHT', len(caught_pid - unfinished_pid))

drop_pid = unfinished_pid | caught_pid
# pidx is an unbroken index for participants in the analysis.
# One Labeler instance is shared across all tables -- presumably so a given
# pid receives the same pidx everywhere; confirm against utils.Labeler.
x_labeler = Labeler()
for k, df in data.items():
    # Remove excluded participants (pids absent from a table are ignored).
    df = df.set_index('pid').drop(drop_pid, errors='ignore').reset_index()
    df['pidx'] = df.pid.apply(x_labeler)
    data[k] = df
writevar('N_REMAIN', len(data['participants']))

# Sanity check on the filtered ball data: any participant with an unexpected
# trial count must already be in drop_pid. Since dropped pids were removed
# above, this effectively asserts that no remaining participant is misformatted.
misformat_pids = misformatted(data['ball'])
assert not misformat_pids - drop_pid

EXP1_N_PARTICIPANT = 119
EXP1_N_UNFINISHED = 9
EXP1_N_CAUGHT = 8
EXP1_N_REMAIN = 102


set()

## Reformat data

In [63]:
def fix_conditions(df):
    """Return a copy of ``df`` with normalized condition columns.

    * ``show_feedback`` is renamed to ``feedback`` (no-op when absent).
    * A categorical ``cue_type`` column is added: 'color' where
      ``color_cue`` is truthy, otherwise 'wall' where ``wall_cue`` is
      truthy, otherwise 'none'.

    NOTE(review): the categories are ['none', 'color'], so rows classified
    as 'wall' end up as NaN. This is kept from the original code -- confirm
    it is intended.
    """
    df = df.rename(columns={'show_feedback': 'feedback'})
    has_color = df['color_cue'].astype(bool)
    has_wall = df['wall_cue'].astype(bool)
    cue = np.where(has_color, 'color', np.where(has_wall, 'wall', 'none'))
    # pd.Categorical replaces astype('category', categories=...), whose
    # keyword form has been removed from pandas.
    df['cue_type'] = pd.Categorical(cue, categories=['none', 'color'])
    return df

# Work on pidx-indexed copies so the frames stored in `data` stay untouched.
bdf = data['ball'].copy().set_index('pidx')
pdf = data['participants'].copy().set_index('pidx')

# set_index consumed the column; restore it as a categorical.
pdf['pidx'] = pd.Categorical(pdf.index)
bdf['pidx'] = pd.Categorical(bdf.index)

# Normalize dtypes of the trial-level columns.
bdf['correct'] = bdf['correct'].astype(int)
bdf['good_cue'] = bdf['good_cue'].astype(bool)
bdf['rt'] = bdf['rt'] / 1000  # presumably milliseconds -> seconds; confirm

# One label per distinct (hole, goes_in) combination.
stim_key = bdf['hole'].astype(str) + bdf['goes_in'].astype(str)
bdf['stim'] = stim_key.apply(Labeler()).astype('category')

def cue_on(r):
    """Tell whether a cue was shown on trial row ``r``.

    A cue counts as shown when the row has a color or wall cue and its
    ``cue_present`` entry (treated as True when missing) is truthy.
    """
    has_cue = r.color_cue or r.wall_cue
    if not has_cue:
        return has_cue
    return r.get('cue_present', True)

# cuex: whether a cue was actually shown on the trial.
bdf['cuex'] = bdf.apply(cue_on, axis=1).astype(bool)
# Signed code: +1 for a good cue, -1 for a bad one, zeroed when no cue was shown.
cue_names = {-1: 'deceitful', 0: 'neutral', 1: 'honest'}
bdf['cue'] = ((bdf.good_cue.astype(int) * 2 - 1) * bdf.cuex).apply(cue_names.get)

# `trials_completed` is optional; skip it when the column is absent.
if 'trials_completed' in bdf.columns:
    bdf['trialx'] = bdf['trials_completed']

bdf = fix_conditions(bdf)
pdf = fix_conditions(pdf)
try:
    bdf['block'] = bdf['block_idx'].astype(int)
except (KeyError, ValueError, TypeError):
    # Best effort: the column is missing or cannot be cast to int.
    # Other errors (and KeyboardInterrupt) are no longer swallowed.
    pass
# Copy each participant's feedback condition onto their trials.
# .loc replaces the deprecated .ix; pdf is indexed by pidx, so this is a
# label lookup.
bdf['feedback_cond'] = bdf.pidx.apply(lambda p: pdf.loc[p].feedback)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix


In [6]:
# Columns removed from the trial table; any that are absent are ignored.
drop = [
    'draw_trajectory',
    'block_idx',
    'score',
    'trials_completed',
]
bdf = bdf.drop(labels=drop, axis='columns', errors='ignore')

## Create Dataframes

In [7]:
def query_subset(df, col, subset):
    """Return a copy of the rows of ``df`` whose ``col`` value is in ``subset``."""
    keep = df[col].apply(lambda value: value in subset)
    selected = df.loc[keep]
    return selected.copy()

In [8]:
# Analysis trials (not instruction or control)
adf = query_subset(bdf, 'kind', {'standard', 'easy', 'hard', 'critical'})
adf['logrt'] = np.log(adf.rt)
# Numeric response code (yes -> 1, no -> 0); unmapped responses become None.
# `.values` replaces the removed `.as_matrix()` and returns the same ndarray.
adf['responsex'] = response = adf.response.apply({'yes': 1, 'no': 0}.get).values
# Ordinal difficulty code; 'critical' shares the code of 'hard'.
kind_map = {'easy': 0, 'standard': 1, 'hard':2, 'critical':2}
adf['kindx'] = adf.kind.apply(kind_map.get).values

In [9]:
# Normal trials (not critical)
ndf = query_subset(adf, 'kind', {'easy', 'standard', 'hard'})

# Clip extreme (> 3 sd) reaction times.
threshold = np.mean(ndf.rt) + 3 * np.std(ndf.rt)
n_clipped = (ndf.rt > threshold).sum()
# clip(upper=...) replaces the removed Series.clip_upper.
ndf['rt'] = ndf.rt.clip(upper=threshold)
# Ordered difficulty labels, with 'standard' relabelled as 'medium'.
# pd.Categorical replaces the removed astype('category', categories=...).
ndf['difficulty'] = pd.Categorical(
    ndf['kind'].str.replace('standard', 'medium'),
    categories=['easy', 'medium', 'hard'],
)
writevar('N_RT_CLIP', n_clipped)
writevar('THRESHOLD_RT_CLIP', threshold)

EXP3_N_RT_CLIP = 100
EXP3_THRESHOLD_RT_CLIP = 9.031841356928114


In [11]:
# Critical trials only, as an independent copy of those rows.
cdf = adf[adf.kind == 'critical'].copy()

In [12]:
# Add info to participant dataframe.
# pdf is indexed by pidx, so the per-participant summaries must be keyed by
# pidx too. Grouping by the raw `pid` would align on the wrong labels and
# yield misplaced or NaN values wherever pid and pidx differ.
per_participant = ndf.groupby(level='pidx')
pdf['rt'] = per_participant.rt.mean()
pdf['correct'] = per_participant.correct.mean()
pdf['total_correct'] = per_participant.correct.sum()

In [21]:
# Cue quiz
def add_pdf_cols(df, cols):
    """Return a copy of ``df`` with participant-level columns ``cols`` added.

    Each value is looked up in the module-level ``pdf`` by the row's
    ``pidx``; a pidx missing from ``pdf`` raises KeyError.
    """
    df = df.copy()
    for col in cols:
        # .loc replaces the deprecated .ix (label lookup on the pidx index).
        df[col] = df.pidx.apply(lambda x, col=col: pdf.loc[x, col])
    return df

def is_cue_survey(x):
    """Return True when the serialized question list ``x`` starts with one
    of the two cue-quiz prompts."""
    cue_prompts = ('["Which player is', '["The ball is')
    return x.startswith(cue_prompts)

# Multiple-choice survey rows, keeping only columns without any missing value.
qdf = data['survey-multi-choice'].dropna(axis=1).copy()
# `correct` is stored as text; parse it into a Python literal (indexed like
# a sequence below).
qdf.correct = qdf.correct.apply(ast.literal_eval)
# Keep only cue-quiz rows; reset_index moves the old index into a column.
qdf = qdf[qdf.questions.apply(is_cue_survey)].reset_index()
# "perfect" = the first two entries of `correct` sum to 2.
qdf['perfect'] = qdf.correct.apply(lambda x: sum(x[:2]) == 2)
# Attach each participant's feedback condition. This second reset_index adds
# another column holding the previous (already reset) index.
qdf = add_pdf_cols(qdf, ['feedback']).reset_index()
if EXPERIMENT == 1:
    # Block number cycles 1..10 by row position -- assumes quiz rows come in
    # runs of ten; TODO confirm against the data layout.
    qdf['block'] = (qdf.index % 10) + 1
    final_qdf = qdf[qdf.block == qdf.block.max()]
else:
    final_qdf = qdf

## Plotting

In [22]:
def jitter(x, amt=.01):
    """Return ``x`` plus uniform random noise drawn from [0, amt) per element."""
    noise = np.random.random(x.shape)
    return x + noise * amt

In [23]:
# Global seaborn look for every figure in this notebook.
sns.set_context('notebook', font_scale=1.3)
sns.set_style('white')
sns.set_palette('deep', color_codes=True)

# Palettes: gray marks the neutral (no cue) condition.
COLOR = sns.color_palette('deep')
CUE_COLOR = [(0.7, 0.7, 0.7)] + list(COLOR)
C_MAP = {
    'neutral': (0.7, 0.7, 0.7),
    'honest': COLOR[0],
    'deceitful': sns.color_palette('Paired')[4],
}
# Shared keyword arguments for plotting calls.
PK = {'markers': '.', 'palette': C_MAP, 'dodge': True}

In [24]:
# Figures go into a per-experiment folder under the paper directory.
# Earlier layout, kept for reference:
# fig_dir = os.path.join('figs', CODE_VERSION)
fig_dir = os.path.join('../paper/figs', str(EXPERIMENT))
# Create the folder (and parents) if needed; an existing folder is fine.
os.makedirs(fig_dir, exist_ok=True)

def join(seq):
    """Return the items of ``seq`` as strings, underscore-separated and with
    a leading underscore (just '_' for an empty sequence)."""
    parts = [str(item) for item in seq]
    return '_' + '_'.join(parts)

def savefig(name):
    """Save the current matplotlib figure as ``<fig_dir>/<name lowercased>.pdf``
    and print the path."""
    path = os.path.join(fig_dir, name.lower() + '.pdf')
    print(path)
    plt.savefig(path)
    
def plot(**kwargs1):
    """Decorator factory: run a plotting function immediately and save its figure.

    The decorated function is called once at decoration time with no
    positional arguments and with ``kwargs1`` as keyword arguments. The
    figure name is the function name (minus a leading ``plot_``) followed
    by the non-None values of ``kwargs1`` joined with underscores.
    The wrapper is returned so the plot can be regenerated later; keyword
    arguments given to the decorator override those passed at call time.
    """
    def decorator(func):
        def wrapped(*args, **kwargs):
            kwargs.update(kwargs1)
            # Suffix built from the decorator's parameters; collapses to ''
            # when there are none.
            suffix = join(v for v in kwargs1.values() if v is not None).rstrip('_')
            name = func.__name__ + suffix
            prefix = 'plot_'
            if name.startswith(prefix):
                name = name[len(prefix):]
            func(*args, **kwargs)
            savefig(name)

        # Draw and save right away, as part of decoration.
        wrapped()
        return wrapped

    return decorator

In [25]:
def feedback_labels(g):
    """Title the two panels of ``g`` 'No Feedback' and 'Feedback'.

    Does nothing when ``g.axes`` does not contain exactly two axes.
    """
    try:
        first, second = g.axes.ravel()
    except ValueError:
        # Not a two-panel grid: leave the titles alone.
        return
    first.set_title('No Feedback')
    second.set_title('Feedback')

## Stats

In [26]:
def pval(x):
    """Format p-value ``x`` for reporting.

    Values under 0.05 are reported as the tightest of the bounds
    0.001 / 0.01 / 0.05; larger values are printed with two decimals.
    A value that compares false everywhere (NaN) yields float('nan').
    """
    cutoffs = (
        (0.001, "p < 0.001"),
        (0.01, "p < 0.01"),
        (0.05, "p < 0.05"),
    )
    for cutoff, label in cutoffs:
        if x < cutoff:
            return label
    if x >= 0.05:
        return "p = {:.2f}".format(x)
    return float('nan')

In [27]:
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
pandas2ri.activate()
from rpy2.robjects.conversion import ri2py
%load_ext rpy2.ipython

In [29]:
%%R

library('pwr')
library('lme4')
library('car')
library('lsmeans')
library('multcomp')







Attaching package: ‘TH.data’



    geyser




In [30]:
def prep_for_r(df):
    """Return a copy of ``df`` with columns cast for use in R.

    * ``feedback`` <- ``feedback_cond`` as int
    * ``cue``      <- ``cuex`` as int
    * ``good_cue`` as int, ``stim`` as str
    * ``pidx`` (if present) is replaced by a string ``pid`` column
    """
    df = df.copy()
    # Bracket assignment always creates or replaces a column; attribute
    # assignment (df.cue = ...) only sets a plain attribute when the
    # column does not exist yet.
    df['cue'] = df['cuex'].astype(int)
    df['good_cue'] = df['good_cue'].astype(int)
    # The original first cast df.feedback and then overwrote it with this
    # value; only the final assignment is kept.
    df['feedback'] = df['feedback_cond'].astype(int)
    df['stim'] = df['stim'].astype(str)
    if 'pidx' in df:
        df['pid'] = df['pidx'].astype(str)
        del df['pidx']
    return df

In [31]:
def to_snake_case(name):
    """Convert ``name`` to lower snake_case.

    Underscores are inserted at CamelCase word boundaries, and the
    characters ``.``, ``:`` and ``/`` are replaced by underscores.
    """
    substitutions = (
        ('(.)([A-Z][a-z]+)', r'\1_\2'),    # split before a capitalized word
        (r'[.:\/]', '_'),                  # separators -> underscore
        ('([a-z0-9])([A-Z])', r'\1_\2'),   # split lower/digit -> upper
    )
    for pattern, replacement in substitutions:
        name = re.sub(pattern, replacement, name)
    return name.lower()

def reformat_name(name):
    """Return ``name`` in snake_case with every non-word character removed."""
    # Raw string: '\W' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.sub(r'\W', '', to_snake_case(name))

In [33]:
def get_rtable(results, p_col=None):
    """Convert an R result into a DataFrame with cleaned column names.

    When ``p_col`` is given, a ``signif`` column is added holding the
    ``pval``-formatted entries of that column (looked up under its
    cleaned name).
    """
    tbl = ri2py(results).rename(columns=reformat_name)
    if not p_col:
        return tbl
    p_values = tbl[reformat_name(p_col)]
    tbl['signif'] = p_values.apply(pval)
    return tbl
    

In [34]:
from IPython.display import display, Latex

def save_analysis(table, name, tex, display_tex=True):
    """Write one variable per row of ``table`` and persist all variables.

    ``name`` and ``tex`` are format templates (or callables taking the row
    and returning one) that are filled from the row's fields; the row's
    index label is available as ``idx``. The filled name is cleaned and
    upper-cased before being passed to ``writevar``. When ``display_tex``
    is true, each filled ``tex`` string is also rendered as LaTeX.
    """
    def fill(template, row):
        # A callable chooses the template per row; a string is used as is.
        chosen = template(row) if callable(template) else template
        return chosen.format_map(row)

    for idx, row in table.iterrows():
        row['idx'] = idx
        var_name = reformat_name(fill(name, row)).upper()
        latex = fill(tex, row)

        writevar(var_name, latex)
        if display_tex:
            display(Latex(latex))
    savevars()

In [35]:
def signif_stars(g, comps_tbl):
    """Annotate the panels of ``g`` with significance stars.

    Rows of ``comps_tbl`` labelled up to 6 are used; the row labelled ``i``
    is drawn on panel ``(i - 1) // 3`` at x position ``(i - 1) % 3``.
    Only rows whose ``signif`` text contains '<' get stars
    (3 for p < .001, 2 for p < .01, 1 for p < .05).
    """
    def star(ax, x, p):
        n_star = 3 if p < .001 else 2 if p < .01 else 1 if p < .05 else 0
        if n_star:
            # Shift left in proportion to the star count (presumably to keep
            # the text centred on x).
            g.axes.flat[ax].text(x - .04 * n_star, .9, '*' * n_star, fontsize=20)
    
    # .loc replaces the deprecated .ix; the slice is label-based and
    # includes the row labelled 6.
    for i, row in comps_tbl.loc[:6].iterrows():
        i -= 1
        if '<' in row.signif:
            star(i // 3, i % 3, row.p_value)