In [None]:
import os
import sys
import urllib, io

import numpy as np
import scipy.stats as stats
import pandas as pd

import pymongo as pm
from collections import Counter
import json
import re
import ast

from PIL import Image, ImageOps, ImageDraw, ImageFont 

from io import BytesIO
import base64

import  matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

import seaborn as sns
sns.set_context('talk')
sns.set_style('darkgrid')

from IPython.display import clear_output

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

### setup paths

In [None]:
## directory & file hierarchy (every location is derived from the notebook's working directory)
will_dir = os.getcwd()
proj_dir = os.path.abspath('..')
analysis_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))  # parent of the working directory

# raw data and experiment code
datavol_dir = os.path.join(proj_dir, 'data')
png_dir = os.path.abspath(os.path.join(datavol_dir, 'png'))
exp_dir = os.path.abspath(os.path.join(proj_dir, 'experiments'))

# analysis outputs
results_dir = os.path.join(proj_dir, 'results')
plot_dir, csv_dir, json_dir = (os.path.join(results_dir, leaf) for leaf in ('plots', 'csv', 'json'))

## add helpers to python path
_stimuli_dir = os.path.join(proj_dir, 'stimuli')
if _stimuli_dir not in sys.path:
    sys.path.append(_stimuli_dir)

## create the output directories on first run (json_dir is not created here)
for _out_dir in (results_dir, plot_dir, csv_dir):
    if not os.path.exists(_out_dir):
        os.makedirs(_out_dir)

# project-local module, found through the 'stimuli' directory added to sys.path above
import blockworld_helpers as utils
    
# ## Assign variables within imported analysis helpers
# import analysis_helpers as h
# if sys.version_info[0]>=3:
#     from importlib import reload
# reload(h)

### connect to db [skip this if on compute server that is behind vpn]

#### remember to establish tunnel to cogtoolslab server first: `ssh -fNL 27017:127.0.0.1:27017 USER@cogtoolslab.org`

In [None]:
## set vars
# auth.txt holds the password for the sketchloop MongoDB user (single value, no header row)
auth = pd.read_csv(os.path.join(analysis_dir,'auth.txt'), header = None)
pswd = str(auth.values[0][0])  # str(): read_csv yields a number for an all-digit password
user = 'sketchloop'
host = 'cogtoolslab.org' ## remote server; not used below because the client connects to 127.0.0.1

## have to fix this to be able to analyze from local
import pymongo as pm
from urllib.parse import quote_plus

# Credentials embedded in a MongoDB URI must be percent-escaped; otherwise a password
# containing characters such as '@', ':' or '/' yields an invalid connection string.
conn = pm.MongoClient('mongodb://' + quote_plus(user) + ':' + quote_plus(pswd) + '@127.0.0.1')
db = conn['block_construction']
coll = db['silhouette']

## which iteration name should we use?
iterationName = 'testing'

In [None]:
# list every distinct iterationName value stored in the collection
coll.distinct('iterationName')

### query over mongodb records and organize into tidy dataframe

In [None]:
## define how we will initially query records in the database:
## records that have a blockVertices field and belong to the iteration chosen in `iterationName`
query = coll.find({'blockVertices':{'$exists':True},'iterationName':iterationName })

In [None]:
## convert into pandas dataframe (one row per returned record)
## NOTE(review): other cells sort on / select 'timeAbsolute'; confirm 'time_absolute' is a real field
df = pd.DataFrame(list(query.sort('time_absolute')))

In [None]:
# every record for one participant (matched on randID); display the trialList column
# NOTE(review): other cells sort on / select 'timeAbsolute'; confirm 'time_absolute' is a real field
query = coll.find({'randID':'v31lrjcfgdbtbrqb5utri'})
df = pd.DataFrame(list(query.sort('time_absolute')))
df['trialList']

### Show some reduced dataframes

Check that all data events are recorded.

In [None]:
# get whole sequence of data for one person (matched on randID) and show a reduced set of columns
# NOTE(review): other cells sort on / select 'timeAbsolute'; confirm 'time_absolute' is a real field
query = coll.find({'randID':"v1mpsbnlolhzyo9un4s4t"})
df = pd.DataFrame(list(query.sort('time_absolute')))
df[['normedIncrementalScore','trialNum','condition','phase','eventType','score', 'points','iterationName','numBlocksExplore','buildTime','relativePlacementTime','numBlocks','blockNum']]

In [None]:
# get whole sequence of data for one iteration (pilot4) and show a reduced set of columns
# NOTE(review): other cells sort on / select 'timeAbsolute'; confirm 'time_absolute' is a real field
query = coll.find({'iterationName':"pilot4"})
df = pd.DataFrame(list(query.sort('time_absolute')))
df[['normedIncrementalScore','trialNum','condition','phase','eventType',
    'score', 'points','iterationName','numBlocksExplore','buildTime',
    'relativePlacementTime','numBlocks','blockNum','success']]

In [None]:
# trial_end records of trialNum 15 in pilot4 for every record that has a workerId,
# ordered by workerId (not a single person's sequence)
query = coll.find({"$and":[
                        {'workerId':{'$exists':True}},
                        {'iterationName':"pilot4"},
                        {'eventType':'trial_end'},
                        {'trialNum':15}]})
df = pd.DataFrame(list(query.sort('workerId')))
df[['workerId','trialNum','condition','phase','eventType','score', 'points','iterationName','numBlocks','normedScore']]

In [None]:
# every record for one worker; display the raw 'score' column as a list
# NOTE(review): other cells sort on / select 'timeAbsolute'; confirm 'time_absolute' is a real field
query = coll.find({"workerId":"A1RFS3YXD1ZIKG"})
df = pd.DataFrame(list(query.sort('time_absolute')))
#df[['workerId','trialNum','condition','phase','eventType','score', 'points','iterationName','normedIncrementalScore']]
list(df['score'])

In [None]:
# get whole sequence of data for one person (matched on randID); nothing is displayed by this cell
# NOTE(review): other cells sort on / select 'timeAbsolute'; confirm 'time_absolute' is a real field
query = coll.find({'randID':"l6krowoij8h48cr5dydt7e"})
df = pd.DataFrame(list(query.sort('time_absolute')))


In [None]:
# get sequence of data for one person, excluding 'settled' and 'initial' events
# NOTE(review): other cells sort on / select 'timeAbsolute'; confirm 'time_absolute' is a real field
query = coll.find({"$and":[
                        {'randID':"v31lrjcfgdbtbrqb5utri"},
                        {'eventType':{"$ne":'settled'}},
                        {'eventType':{"$ne":'initial'}}]
                     })
df = pd.DataFrame(list(query.sort('time_absolute')))
df[['trialNum','phase','condition','eventType','score','normedScore']]

### Basic analyses: accuracy and nblocks

In [None]:
# get sequence of trial_end data for all people in pilot1 (practice trials excluded)
# note that this currently lumps everyone together: rows are only ordered by workerId
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        {'iterationName':'pilot1'}]
                     })
df_full = pd.DataFrame(list(query.sort('workerId')))
df = df_full[['workerId','trialNum','phase','condition','eventType','score','normedScore','numBlocks']]
df

In [None]:
# get sequence of trial_end data for all people in pilot2 (practice trials excluded)
# note that this currently lumps everyone together: rows are only ordered by workerId
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        {'iterationName':'pilot2'}]
                     })
df_full = pd.DataFrame(list(query.sort('workerId')))
df = df_full[['workerId','trialNum','phase','condition','eventType','score','normedScore','numBlocks']]
df

In [None]:
# get sequence of trial_end data for a single pilot4 worker (practice trials excluded),
# ordered by timeAbsolute
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        {'iterationName':'pilot4'},
                        {'workerId': 'A20N1NK5X5S88F'}]
                     })
df_full = pd.DataFrame(list(query.sort('timeAbsolute')))
df = df_full[['workerId','trialNum','phase','condition','eventType','score','normedScore','numBlocks']]
df

In [None]:
# get sequence of trial_end data for all people in pilot2 (practice trials excluded)
# NOTE(review): this cell repeats the pilot2 trial_end query that already appears earlier in the notebook
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        {'iterationName':'pilot2'}]
                     })
df_full = pd.DataFrame(list(query.sort('workerId')))
df = df_full[['workerId','trialNum','phase','condition','eventType','score','normedScore','numBlocks']]
df

In [None]:
# get sequence of trial_end data for all people in pilot2 (practice trials excluded)
# NOTE(review): this cell repeats the pilot2 trial_end query that already appears earlier in the notebook
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        {'iterationName':'pilot2'}]
                     })
df_full = pd.DataFrame(list(query.sort('workerId')))
df = df_full[['workerId','trialNum','phase','condition','eventType','score','normedScore','numBlocks']]
df

In [None]:
# Time taken from first block settled to last block settled, for one pilot2 worker
query = coll.find({"$and":[
                        {'workerId':'A3DS5B06ZCD3E3'},
                        #{'condition':{'$ne':'practice'}},
                        {'eventType':'settled'},
                        {'iterationName':'pilot2'}]
                     })
df_full = pd.DataFrame(list(query.sort('timeAbsolute')))
# note: `df` is rebound to a plain list of timestamps here, not a DataFrame
df = list(df_full['timeAbsolute'])
# divides the elapsed span by 1000*60 -- presumably milliseconds to minutes; TODO confirm units.
# Raises IndexError when the query returns no records.
(df[-1] - df[0])/(1000*60)


In [None]:
# get survey_data records for all people in pilot4, ordered by workerId
# the displayed value is only the 'comments' column; the wider column selection is taken first
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'survey_data'},
                        {'iterationName':'pilot4'}]
                     })
df_survey = pd.DataFrame(list(query.sort('workerId')))
list(df_survey[['workerId','age','comments','difficulty','fun','strategies','inputDevice','sex','score']]['comments'])

### Sanity checks

- workerId not associated with multiple gameIDs
- each participant has exactly one trial_end record for each trialNum (0-15)
- 8 mental and 8 physical trials

In [None]:
# Ensure a one-to-one mapping between gameID and workerId.
# A violation should only happen if a repeat worker gets through.

# first-trial (trialNum 0) trial_end records from pilot2 and pilot3, practice excluded
query = coll.find({"$and":[
                        {'workerId':{'$exists':True}},
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        {"$or":[{'iterationName':'pilot3'},
                                {'iterationName':'pilot2'}]},
                        {'trialNum':0}]
                     })

df_trial_end_full = pd.DataFrame(list(query.sort('timeAbsolute')))
#df_trial_end_full[['workerId','gameID']]

# Comparing the means of the two value_counts() only checks that the NUMBER of unique
# workerIds equals the number of unique gameIDs, which offsetting violations can satisfy
# (one worker with two games plus two workers sharing one game). Check the pairing itself.
id_pairs = df_trial_end_full[['workerId','gameID']].drop_duplicates()
assert id_pairs['workerId'].is_unique, 'a workerId is associated with more than one gameID'
assert id_pairs['gameID'].is_unique, 'a gameID is associated with more than one workerId'

In [None]:
# get ids of people with trial 15 data (pilot3, practice excluded)
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        {'iterationName':'pilot3'},
                        {'trialNum':15}]
                     })
# materialise the cursor with list() first, as every other cell in this notebook does,
# rather than relying on pandas to consume a pymongo cursor directly
complete_data_df = pd.DataFrame(list(query))
complete_data_ids = list(complete_data_df['workerId'])

In [None]:
# all pilot3 trial_end records (practice excluded), ordered by timeAbsolute
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        {'iterationName':'pilot3'}]
                     })

df_trial_end_full = pd.DataFrame(list(query.sort('timeAbsolute')))



# filter dataframe for complete datasets
# (relies on `complete_data_ids` holding the pilot3 ids computed in the preceding cell)
df_trial_end_full_filtered = df_trial_end_full[df_trial_end_full.workerId.isin(complete_data_ids)]



# reduce to crucial information
df_trial_end_reduced_filtered = df_trial_end_full_filtered[['gameID','trialNum','phase','condition','eventType','score','normedScore','numBlocks','timeAbsolute','timeRelative','buildTime','currBonus','exploreResets','buildResets','allVertices','nPracticeAttempts','exploreStartTime','buildStartTime','buildFinishTime']]

# order rows by game, then chronologically within each game
df_for_analysis = df_trial_end_reduced_filtered.sort_values(by=['gameID', 'timeAbsolute'])
df_for_analysis


In [None]:
# By condition: summary statistics of normedScore, numBlocks and buildTime for each condition
df_for_analysis[['condition','normedScore','numBlocks','buildTime']].groupby(by=['condition']).describe()

In [None]:
# By trial number: summary statistics of the selected columns for each trialNum
df_for_analysis[['condition','normedScore','numBlocks','buildTime','trialNum']].groupby(by='trialNum').describe()

### Helper functions - need to test

In [None]:
def get_light_df(df_full):
    ''' Return df_full restricted to its most essential columns.

    input:
        df_full: dataframe containing at least the columns listed below
    output:
        dataframe with only those columns, in the listed order
    '''
    essential_cols = ['randID', 'trialNum', 'phase', 'condition',
                      'eventType', 'score', 'normedScore', 'numBlocks']
    return df_full[essential_cols]

def compress_vertices(vert_dict, world_height=None):
    ''' Convert block vertices from dicts to plain (x, y) tuples, flipping the y axis.

    input:
        vert_dict: iterable of blocks; each block is an iterable of corners,
                   and each corner is a mapping with 'x' and 'y' entries
        world_height: value that y is subtracted from. Defaults to the
                   module-level `world_size`, which is only defined in a later
                   cell; passing it explicitly removes that hidden dependency.
    output:
        list (one entry per block) of lists of (x, world_height - y) tuples
    '''
    if world_height is None:
        world_height = world_size  # notebook-level constant, looked up at call time
    return [[(corner['x'], world_height - corner['y']) for corner in block]
            for block in vert_dict]
    
def get_world_vertices(row):
    ''' Get the block vertices stored in one observation.

    input:
        row: mapping / dataframe row with an 'allVertices' entry
    output:
        the result of compress_vertices() applied to row['allVertices']
    '''
    return compress_vertices(row['allVertices'])

def draw_world_from_row(row):
    ''' Renders state of world from one observation

    input:
        row: mapping / dataframe row with an 'allVertices' entry
    output:
        None; the figure is shown as a side effect of draw_world()
    '''
    # The previous call to get_final_vertices() raised NameError: no such function is
    # defined in this notebook. get_world_vertices() is the helper that extracts vertices.
    world_verts = get_world_vertices(row)
    draw_world(world_verts)

# TODO: def draw_block_in_context(row)
#       draw new block in figure showing old blocks in a different color

In [None]:
# check final bonus amount: gameID and score on the trialNum 15 rows
df_for_analysis[df_for_analysis.trialNum == 15][['gameID','score']]

In [None]:
# Overall accuracy and nblocks: summary statistics across all rows of df_for_analysis
df_for_analysis[['condition','normedScore','numBlocks']].describe()

In [None]:
# workerIds that have a trial_end record for trialNum 15 in pilot2
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        {'iterationName':'pilot2'},
                        {'trialNum':15}]
                     })
df_full = pd.DataFrame(list(query.sort('workerId')))
complete_data_ids = list(df_full['workerId'])
# bare expression in the middle of a cell: produces no output
complete_data_ids

# survey responses; note this query reads pilot1 while the ids above come from pilot2
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'survey_data'},
                        {'iterationName':'pilot1'}] # NOTE(review): no filter on participant effort is applied here
                     })
df_full = pd.DataFrame(list(query.sort('workerId')))
df_full[['workerId','age','comments','difficulty','fun','strategies','inputDevice','sex','score']]

In [None]:
# trial_end records of trialNum 15 in pilot2 (practice excluded), reduced to key columns;
# nothing is displayed by this cell
query = coll.find({"$and":[
                        {'iterationName':'pilot2'},
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        {'trialNum':15}]
                     })
df_full = pd.DataFrame(list(query.sort('workerId')))
df = df_full[['workerId','trialNum','phase','condition','eventType','normedScore','numBlocks','buildTime']]


In [None]:
# pilot2 records whose trialNum equals the STRING '18' (practice excluded).
# NOTE(review): other cells match trialNum against integers (e.g. 15); confirm the string is intended.
# The cursor is created but never consumed in this cell.
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'trialNum':'18'},
                        {'iterationName':'pilot2'}]
                     })

In [None]:
# pilot2 trial_end records (practice excluded), summarised per trial number
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        {'iterationName':'pilot2'}]
                     })
df_full = pd.DataFrame(list(query.sort('workerId')))
df = df_full[['workerId','trialNum','phase','condition','eventType','normedScore','numBlocks','buildTime']]
df.groupby(by='trialNum').describe()

## trial_end data by trial number

In [None]:
# get full datasets: workerIds with a trial_end record for trialNum 15 in pilot2
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        {'iterationName':'pilot2'},
                        {'trialNum':15}]
                     })
df_full = pd.DataFrame(list(query.sort('workerId')))
complete_data_ids = list(df_full['workerId'])
# bare expression in the middle of a cell: produces no output
complete_data_ids


# get all trial end data
query = coll.find({"$and":[
                        {'workerId':{'$exists':True}},
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        #{'workerId':{'$ne':''}},# filter out participants who weren't trying
                        {'iterationName':'pilot2'}
                    ]})
df_full = pd.DataFrame(list(query.sort('workerId')))
df = df_full[['workerId','trialNum','phase','condition','eventType','score','normedScore','numBlocks']]
# keep only workers with a complete dataset, then summarise per trial number
df_filtered = df[df.workerId.isin(complete_data_ids)]
df_filtered.groupby(by='trialNum').describe()

# Useful queries






In [None]:
# get whole sequence of data for one game (matched on gameID)
# NOTE(review): other cells sort on / select 'timeAbsolute'; confirm 'time_absolute' is a real field
query = coll.find({'gameID':'7970-01a11233-665c-40d2-b7a3-3dabeb8f2a35'})
df = pd.DataFrame(list(query.sort('time_absolute')))
df[['normedIncrementalScore','trialNum','phase','eventType','score']]


In [None]:
## get successful practice trials: practice_attempt events with success == True for one participant
## NOTE(review): other cells sort on / select 'timeAbsolute'; confirm 'time_absolute' is a real field
query = coll.find({'randID':"tjj25al28dtwvg86troo6i", 'eventType':'practice_attempt', 'success':True})
df = pd.DataFrame(list(query.sort('time_absolute')))


In [None]:
## Get list of block placements from any person: 'dataTesting' records that have both
## blockVertices and trialList fields, ordered by phase
query = coll.find({'blockVertices':{'$exists':True},'trialList':{'$exists':True},'iterationName':'dataTesting' })
df = pd.DataFrame(list(query.sort('phase')))


In [None]:
# Get survey data from someone (survey_data events for one randID)
# NOTE(review): other cells sort on / select 'timeAbsolute'; confirm 'time_absolute' is a real field
query = coll.find({'eventType':"survey_data", 'randID' :'eab9cf17pm7qdfnfocxwk'})
df = pd.DataFrame(list(query.sort('time_absolute')))

In [None]:
## get vertices of blocks at each settled-block event, for one participant
## NOTE(review): other cells sort on / select 'timeAbsolute'; confirm 'time_absolute' is a real field
query = coll.find({'eventType':'settled', 'randID':"5310wg9k0d06l9jn2tlui8"})
df = pd.DataFrame(list(query.sort('time_absolute')))
listOfVertices = df['allVertices'] # list of vertices is a list of whole worlds of blocks
# hard-coded index 13: fails if the query returned fewer than 14 settled events
vertices = listOfVertices[13] # vertices is one world, which contains multiple blocks

In [None]:
# Expression for converting vertices dict to vertices list: (x, world_size - y) per corner, per block
# Uses `vertices` from the previous cell and `world_size`, which is first assigned in the
# later drawing cell -- on a fresh kernel run top-to-bottom this raises NameError.
list(map(lambda block: list(map(lambda corner: (corner['x'],world_size-corner['y']), block)), vertices))

In [None]:
# Find ids of datasets that have trial_end data for last trial (trialNum 15) in pilot1
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        {'iterationName':'pilot1'},
                        {'trialNum':15}]
                     })
df_full = pd.DataFrame(list(query.sort('workerId')))
# note: overwrites any complete_data_ids computed for another iteration in an earlier cell
complete_data_ids = list(df_full['workerId'])
complete_data_ids

# Draw world from vertices

In [None]:
import numpy as np
from PIL import Image

from matplotlib import pylab, mlab, pyplot
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.path import Path
import matplotlib.patches as patches
import copy
import json
import datetime
import random
from random import randint
import string
import os

# side length of the square block-world canvas; used for the axis limits and the y-axis flip
world_size = 900

def patch_for_block(b):
    ''' Build the patch for a single block, using the fixed block colour.

    input:
        b: vertices of one block, in the form get_patch() accepts
    output:
        patch returned by get_patch()
    '''
    block_color = '#29335C'
    return get_patch(b, color=block_color)

def patches_for_world(blocks):
    ''' Build one patch per block.

    input:
        blocks: iterable of blocks, each in the form patch_for_block() accepts
    output:
        list of patches, in the same order as `blocks`
    '''
    return [patch_for_block(block) for block in blocks]

def draw_world(vertices):
    ''' Render a whole world of blocks.

    input:
        vertices: iterable of blocks, each a sequence of (x, y) vertices
    output:
        figure returned by render_blockworld()
    '''
    world_patches = patches_for_world(vertices)
    return render_blockworld(world_patches)

def get_patch(verts,
              color='orange',
              line_width = 0.2):
    '''
    Build a filled matplotlib patch from a polygon's vertices.

    input:
        verts: array or list of (x,y) vertices of convex polygon. 
                last vertex = first vertex, so len(verts) is num_vertices + 1
        color: facecolor
        line_width: edge width    
    output:
        patch matplotlib.path patch object
    '''
    # move the pen to the first vertex, then draw a straight segment to each later one
    n_segments = len(verts) - 1
    codes = [Path.MOVETO] + [Path.LINETO] * n_segments
    outline = Path(verts, codes)
    return patches.PathPatch(outline, facecolor=color, lw=line_width)

def render_blockworld(patches,
                      xlim=(0,world_size),
                      ylim=(0,world_size),
                      figsize=(4,4)):
    
    '''
    Draw a list of patches on a fresh figure with hidden axes, and show it.

    input: 
        patches: list of patches generated by get_patch() function
        xlim, ylim: axis limits (the defaults are bound to world_size when this cell runs)
        figsize: defaults to square aspect ratio
    output:
        visualization of block placement (the figure is shown, then returned)
    '''
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)
    for block_patch in patches:
        ax.add_patch(block_patch)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    # the subplot just created is the current axes, so hide its ticks and labels directly
    for axis in (ax.get_xaxis(), ax.get_yaxis()):
        axis.set_visible(False)
    plt.show()
    return fig


In [None]:
## get end state of trials: every build-phase 'settled' event for one worker
## NOTE(review): other cells sort on / select 'timeAbsolute'; confirm 'time_absolute' is a real field
query = coll.find({"$and":[{'eventType':'settled', 'phase':'build', 'workerId':"A2XKVWHXJV0HWZ"}]})
df = pd.DataFrame(list(query.sort('time_absolute')))

# Draw at most the first 100 worlds. The previous range(0,100) indexing raised KeyError
# whenever the query returned fewer than 100 records (or none at all).
max_worlds = 100
world_states = df['allVertices'].iloc[:max_worlds] if 'allVertices' in df else []
for vertices in world_states:
    collapsed_verts = compress_vertices(vertices)  # same (x, world_size - y) conversion as before
    fig = draw_world(collapsed_verts)
    plt.close(fig)  # release each figure once shown, so the loop does not accumulate open figures