In [None]:
from __future__ import absolute_import, division

import os
import urllib, cStringIO

import pymongo as pm

import numpy as np
import scipy.stats as stats
import pandas as pd
import json
import re

from PIL import Image
import base64
import sys

import matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import seaborn as sns
sns.set_context('talk')
sns.set_style('white')

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [None]:
# directory & file hierarchy (everything is derived from the project root)
proj_dir = os.path.abspath('../../..')
analysis_dir = os.getcwd()
results_dir = os.path.join(proj_dir, 'results')
plot_dir = os.path.join(results_dir, 'plots')
csv_dir = os.path.join(results_dir, 'csv')
exp_dir = os.path.abspath(os.path.join(proj_dir, 'experiments'))
sketch_dir = os.path.abspath(os.path.join(proj_dir, 'sketches'))

## add helpers to python path (guarded, so re-running this cell does not add it twice)
helpers_dir = os.path.join(proj_dir, 'analysis', 'python')
if helpers_dir not in sys.path:
    sys.path.append(helpers_dir)

## create whichever output directories are still missing
for out_dir in (results_dir, plot_dir, csv_dir):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

# import the analysis helpers, then reload them so edits made to the module
# since the first import are picked up
import analysis_helpers as h
if sys.version_info[0] >= 3:
    from importlib import reload  # `reload` is only a builtin on Python 2
reload(h)

### setup

In [None]:
# set vars
# auth.txt contains the password for the sketchloop user; dtype=str keeps an
# all-digit password from being parsed as a number
auth = pd.read_csv('auth.txt', header=None, dtype=str)
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'rxdhawkins.me' ## cocolab ip address

# credentials embedded in a MongoDB URI must be percent-escaped: a raw '@',
# ':' or '/' in the password would otherwise corrupt the URI
try:
    from urllib.parse import quote_plus  # Python 3
except ImportError:
    from urllib import quote_plus  # Python 2

# have to fix this to be able to analyze from local
# NOTE(review): the client connects to 127.0.0.1 and `host` is not used --
# presumably the database is reached through a tunnel; confirm
conn = pm.MongoClient('mongodb://{}:{}@127.0.0.1'.format(quote_plus(user), quote_plus(pswd)))
db = conn['3dObjects']
coll = db['graphical_conventions_recog']

# which iteration name should we use?
iterationName = 'pilot1'

### get basic participation stats

In [None]:
## worker ids of the researchers; these are excluded from the worker list below
researchers = ['A4SSYO0HDVD4E', 'A1BOIDKD33QSDK']
## minimum number of correct trials a worker needs to count as a top worker
num_correct_thresh = 0

## every distinct worker id recorded under the current iteration
iteration_query = {'$and': [{'iterationName': iterationName}]}
all_worker_ids = coll.find(iteration_query).distinct('workerId')

## keep ids longer than 10 characters that are not on the researcher list
workers = [wid for wid in all_worker_ids if len(wid) > 10 and wid not in researchers]
print('{} workers performed this task'.format(len(workers)))

In [None]:
## keep only the workers whose number of correct trials reaches `num_correct_thresh`
# imported in this cell because it is the first one to call clear_output;
# without it a fresh-kernel run fails here with a NameError
from IPython.display import clear_output

top_workers = []
for i, w in enumerate(workers):
    # 1-based progress counter
    print('Analyzing {} | {} of {}'.format(w, str(i + 1).zfill(3), len(workers)))
    clear_output(wait=True)
    # no sort needed: the sum below does not depend on the order of the records
    R = coll.find({'$and': [{'iterationName': iterationName}, {'workerId': w}]})
    num_correct = np.sum([r['correct'] for r in R])
    if num_correct >= num_correct_thresh:
        top_workers.append(w)

print('{} workers got at least {} correct.'.format(len(top_workers), num_correct_thresh))

### construct group dataframe

In [None]:
## build the group dataframe: one row per recognition trial of each top worker
from IPython.display import clear_output

## one accumulator list per output column
rep, correct, rt, condition, orig_correct = [], [], [], [], []
generalization, target = [], []
distractor1, distractor2, distractor3 = [], [], []

num_top_workers = len(top_workers)
for position, worker_id in enumerate(top_workers, 1):
    print('Now analyzing {} | {} of {}'.format(worker_id, str(position).zfill(3), num_top_workers))
    clear_output(wait=True)
    worker_query = {'$and': [{'iterationName': iterationName}, {'workerId': worker_id}]}
    ## this worker's records, sorted by their `time` field
    for trial in coll.find(worker_query).sort('time'):
        rep.append(trial['repetition'])
        correct.append(trial['correct'])
        rt.append(trial['rt'])
        condition.append(trial['condition'])
        ## the value is read from 'outcome' when that key exists, otherwise from 'original_correct'
        outcome_key = 'outcome' if 'outcome' in trial else 'original_correct'
        orig_correct.append(trial[outcome_key])
        generalization.append(trial['Generalization'])
        target.append(trial['target'])
        distractor1.append(trial['distractor1'])
        distractor2.append(trial['distractor2'])
        distractor3.append(trial['distractor3'])

## make dataframe: each list is stacked as a row, then transposed into a column
named_columns = [
    ('repetition', rep),
    ('correct', correct),
    ('rt', rt),
    ('condition', condition),
    ('orig_correct', orig_correct),
    ('generalization', generalization),
    ('target', target),
    ('distractor1', distractor1),
    ('distractor2', distractor2),
    ('distractor3', distractor3),
]
X = pd.DataFrame([values for _, values in named_columns]).transpose()
X.columns = [name for name, _ in named_columns]

## convert datatypes to numeric (the transposed frame holds generic objects)
for numeric_col in ('correct', 'rt', 'orig_correct'):
    X[numeric_col] = pd.to_numeric(X[numeric_col])

print('Finished analyzing top workers.')
print('There are {} observation in the dataframe.'.format(X.shape[0]))

#### preprocessing helper 

In [None]:
## helper that expands a column of item dicts into one column per dict key
def dict2cols(X,item='target'):
    '''
    Replace a dict-valued column with one column per dictionary key.

    X = dataframe containing group data
    item = name of the dict-valued column to expand: 'target', 'distractor1', ...

    The keys 'objectname', 'shapenetid' and 'url' are prefixed with the item
    name (e.g. 'target_url') to keep them unique across items; any other key
    keeps its own name. Returns a new dataframe without the original dict
    column; X itself is not modified.
    '''
    ## new, item-specific names for the known dictionary keys
    unique_names = {'objectname': '{}_objectname'.format(item),
                    'shapenetid': '{}_shapenetid'.format(item),
                    'url': '{}_url'.format(item)}
    ## one column per key, one row per trial
    expanded = X[item].apply(pd.Series).rename(columns=unique_names)
    ## attach the new columns and leave out the old dictionary column
    return X.join(expanded).drop(labels=[item], axis=1)

## expand every item column in turn; a column that is no longer present
## (already expanded) is skipped, so this cell can be run more than once
items = ['target','distractor1','distractor2','distractor3']
for item in items:
    print('Unrolling {}'.format(item))
    clear_output(wait=True)
    if item not in X.columns:
        continue
    X = dict2cols(X, item=item)

print('Finished unrolling item dictionaries into separate columns.')

In [None]:
## mean of `correct` within each condition x repetition cell
correct_by_cell = X.groupby(['condition', 'repetition'])['correct']
correct_by_cell.mean()

### visualize recognizability x repetition

In [None]:
## trials from the 'repeated' condition only, ordered by target name
X2 = X.query("condition=='repeated'")
X2 = X2.sort_values(by=['target_objectname'])
targ_list = np.unique(X2.target_objectname.values)
sns.set_context('talk')

## plot recognizability, collapsing across target
fig = plt.figure(figsize=(4,4))
sns.lineplot(data=X2,x='repetition',y='correct')
plt.ylim(0,1)
plt.yticks(np.linspace(0, 1, 11)) ## 0.0, 0.1, ..., 1.0 (arange(0, 1, 0.1) would stop short of 1.0)
plt.xticks(np.arange(0, 8, 1))
## dotted reference line at 0.25 -- presumably chance level with one target and three distractors; confirm
plt.plot([0,7],[0.25,0.25],color='black',linestyle=':')

## plot recognizability, split out by target
## FacetGrid creates its own figure, so there is no plt.figure() call here
## (one would only leave an empty figure in the output)
g = sns.FacetGrid(X2, col="target_objectname", col_wrap=4,height=3, margin_titles=False)
g.map(sns.lineplot, "repetition", "correct", alpha=.7)
g.set_titles("{col_name}")
plt.tight_layout()

### visualize rt x repetition

In [None]:
## trials from the 'repeated' condition only, ordered by target name
X2 = X.query("condition=='repeated'")
X2 = X2.sort_values(by=['target_objectname'])
targ_list = np.unique(X2.target_objectname.values)
sns.set_context('talk')

## plot rt, collapsing across target
fig = plt.figure(figsize=(4,4))
sns.lineplot(data=X2,x='repetition',y='rt')
## rt is not a proportion, so a 0-1 axis limit (as used for accuracy) does not
## fit; the axis spans the tick range instead
rt_axis_max = 10000
plt.ylim(0, rt_axis_max)
plt.yticks(np.arange(0, rt_axis_max + 1, 2000))
plt.xticks(np.arange(0, 8, 1))

## plot rt, split out by target
## FacetGrid creates its own figure, so there is no plt.figure() call here
## (one would only leave an empty figure in the output)
g = sns.FacetGrid(X2, col="target_objectname", col_wrap=4,height=3, margin_titles=False)
g.map(sns.lineplot, "repetition", "rt", alpha=.7)
g.set_titles("{col_name}")
plt.tight_layout()

#### for use while running experiment to check on final scores of individual participants

In [None]:
## highest `score` stored for one worker in the current iteration
w = top_workers[0]
worker_query = {'$and': [{'iterationName': iterationName}, {'workerId': w}]}
## sorting by score in descending order makes find_one return the top-scoring record
best_record = coll.find_one(worker_query, sort=[("score", -1)])
best_record["score"]