In [97]:
import os
import sys
import urllib, io

import numpy as np
import scipy.stats as stats
import pandas as pd

import pymongo as pm
from collections import Counter
import json
import re
import ast

from PIL import Image, ImageOps, ImageDraw, ImageFont 

from io import BytesIO
import base64

import  matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

import seaborn as sns
sns.set_context('talk')
sns.set_style('darkgrid')

from IPython.display import clear_output

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [98]:
## directory & file hierarchy
proj_dir = os.path.abspath('..')
datavol_dir = os.path.join(proj_dir,'data')
analysis_dir = os.path.abspath(os.path.join(os.getcwd(),'..'))
results_dir = os.path.join(proj_dir,'results')
plot_dir = os.path.join(results_dir,'plots')
csv_dir = os.path.join(results_dir,'csv')
json_dir = os.path.join(results_dir,'json')
exp_dir = os.path.abspath(os.path.join(proj_dir,'experiments'))
png_dir = os.path.abspath(os.path.join(datavol_dir,'png'))
jefan_dir = os.path.join(analysis_dir,'jefan')
will_dir = os.path.join(analysis_dir,'will')

## add helpers to python path
if os.path.join(proj_dir,'stimuli') not in sys.path:
    sys.path.append(os.path.join(proj_dir,'stimuli'))
    
if not os.path.exists(results_dir):
    os.makedirs(results_dir)
    
if not os.path.exists(plot_dir):
    os.makedirs(plot_dir)   
    
if not os.path.exists(csv_dir):
    os.makedirs(csv_dir)       

In [99]:
## set vars 
auth = pd.read_csv(os.path.join(analysis_dir,'auth.txt'), header = None) # this auth.txt file contains the password for the sketchloop user
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org' ## cocolab ip address

## have to fix this to be able to analyze from local
import pymongo as pm
conn = pm.MongoClient('mongodb://sketchloop:' + pswd + '@127.0.0.1')
db = conn['block_construction']
coll = db['silhouette']

## which iteration name should we use?
iterationName = 'Exp2Pilot1'


## Sanity Checks

In [100]:
# Ensure one to one gameID and workerId 
# Should only happen if a repeat worker gets through

query = coll.find({"$and":[
                        {'workerId':{'$exists':True}},
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        {"$or":[{'iterationName':'pilot2'},
                                {'iterationName':'pilot3'},
                                {'iterationName':'pilot4'},
                                {'iterationName':'Exp2Pilot1'}]},
                        {'trialNum':0}]
                     })

df_trial_end_full = pd.DataFrame(list(query.sort('timeAbsolute')))
#df_trial_end_full[['workerId','gameID']]


assert (np.mean(df_trial_end_full['workerId'].value_counts()) == np.mean(df_trial_end_full['gameID'].value_counts()))

### Find full datasets for Silhouette_1

In [5]:
# get ids of people with trial 15 data
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        {'iterationName': iterationName}]
                     })
complete_data_df = pd.DataFrame(query)
complete_data_ids = list(complete_data_df['workerId'])

In [6]:
complete_data_df['workerId'].nunique()

1

In [7]:
complete_data_df

Unnamed: 0,F1Score,_id,aID,allBlockBodyProperties,allBlockDims,allVertices,blockColor,blockColorID,blockColors,blockOptions,...,timeThresholdRed,timeThresholdYellow,timeToBuild,trialNum,version,vertices,workerId,worldHeightUnits,worldScale,worldWidthUnits
0,0,5e20a705d920b335b21bac7a,3LYA37P8IR190VJLC747L1FCIEKBK6,"[{'id': 23, 'angle': 0.0009042862596682584, 'p...","[[1, 2], [2, 1], [2, 2], [2, 4], [4, 2]]","[[{'x': 440.0146822146286, 'y': 592.6725852904...",#F7EA31,7,"[#78878C, #791E94, #6B4623, #FF4A1C, #E85D75, ...","{'friction': 0.9, 'frictionStatic': 1.4, 'dens...",...,15000,30000,12773,0,82,"[{'x': -247.50000000000017, 'y': 702.428571428...",A1RFS3YXD1ZIKG,8,2.2,8
1,0,5e20a74ed920b335b21bac8d,3LYA37P8IR190VJLC747L1FCIEKBK6,"[{'id': 42, 'angle': 0.000278705060374706, 'po...","[[1, 2], [2, 1], [2, 2], [2, 4], [4, 2]]","[[{'x': 549.7040112548426, 'y': 592.7159051700...",#791E94,1,"[#78878C, #791E94, #6B4623, #FF4A1C, #E85D75, ...","{'friction': 0.9, 'frictionStatic': 1.4, 'dens...",...,15000,30000,19073,1,82,"[{'x': -247.50000000000017, 'y': 702.428571428...",A1RFS3YXD1ZIKG,8,2.2,8


## Collect data from db and filter with sanity checks

In [None]:
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        {'iterationName':iterationName}]
                     })

df_trial_end_full = pd.DataFrame(list(query.sort('timeAbsolute')))



# filter dataframe for complete datasets
df_trial_end_full_filtered = df_trial_end_full[df_trial_end_full.workerId.isin(complete_data_ids)]


# reduce to crucial information
df_trial_end_reduced_filtered = df_trial_end_full_filtered[['gameID','trialNum','phase','condition',
                                                            'eventType','score','normedScore','numBlocks',
                                                            'timeAbsolute','timeRelative','buildTime',
                                                            'currBonus','exploreResets','buildResets',
                                                            'allVertices','nPracticeAttempts','exploreStartTime',
                                                            'buildStartTime','buildFinishTime','targetName','numBlocksExplore']]

df = df_trial_end_reduced_filtered.sort_values(by=['gameID', 'timeAbsolute'])


### Integrate reset data before sending

In [None]:
# Resets

query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'reset'},
                        {'phase':'build'},
                        {'iterationName':iterationName}]
                     })
df_resets_full = pd.DataFrame(list(query.sort('timeAbsolute')))
df_resets_full_filtered = df_resets_full[df_resets_full.workerId.isin(complete_data_ids)]
df_resets_reduced_filtered = df_resets_full_filtered[['gameID','trialNum','phase','condition','numBlocks']]
pre_reset_blocks = df_resets_reduced_filtered.groupby(
    ['gameID','trialNum','phase','condition'])['numBlocks'].apply(list).reset_index()

# Merge pre-reset blocks with build data
pre_reset_blocks = pre_reset_blocks.rename(columns = {'numBlocks':'preResetBuildBlocks'})

pre_reset_blocks = pre_reset_blocks.fillna(value={'preResetBuildBlocks': 0})
df = df.merge(pre_reset_blocks, on=['gameID', 'trialNum','phase','condition'], how='left')

df2.merge(df1, on['a'], how='left')

# Rename and add totals
df = df.fillna(value={'preResetBuildBlocks': 0})
df = df.rename(columns = {'numBlocks':'finalBuildBlocks'})
df['totalBuildBlocks'] = df['finalBuildBlocks'] + df['preResetBuildBlocks'].apply(np.sum)

In [None]:
df

In [None]:
## save out to csv dir, where all the csv's go to live
out_path = os.path.join(csv_dir,'block_silhouette_{}.csv'.format(iterationName))
df_for_analysis.to_csv(out_path)

## Settled Block Data

In [None]:
print('Loading iteration: ' + iterationName)
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'settled'},
                        {'iterationName':iterationName}]
                     })

df_settled_full = pd.DataFrame(list(query))


# filter dataframe for complete datasets
df_settled_full_filtered = df_settled_full[df_settled_full.workerId.isin(complete_data_ids)]

print('Loaded ' + str(df_settled_full_filtered.shape[0]) + ' complete sets of settled blocks')
# reduce to crucial information

In [None]:

df_settled_reduced_filtered = df_settled_full_filtered[['gameID','trialNum','phase','condition',
                                                            'eventType','numBlocks', 'timeAbsolute','timeRelative',
                                                            'normedScore','currBonus','score','incrementalScore','normedIncrementalScore',
                                                            'currBonus','allVertices','targetName','relativePlacementTime','iterationName',
                                                            'blockKind'
                                                           ]]

df_settled_reduced_filtered = df_settled_reduced_filtered.sort_values(by=['gameID', 'timeAbsolute'])

buildstart = df_for_analysis[['gameID','trialNum','buildStartTime','exploreStartTime']]

# copy across time variables that were not saved in the correct place in pilot 3
df_settled_reduced_filtered = df_settled_reduced_filtered.merge(buildstart, on=['gameID', 'trialNum'], how='left')
df_settled_reduced_filtered['timePlaced'] = df_settled_reduced_filtered['timeAbsolute'] - df_settled_reduced_filtered['buildStartTime']
df_settled_reduced_filtered.loc[(df_settled_reduced_filtered.timePlaced < 0),'timePlaced'] = df_settled_reduced_filtered[df_settled_reduced_filtered.timePlaced < 0]['timeAbsolute'] - df_settled_reduced_filtered[df_settled_reduced_filtered.timePlaced < 0]['exploreStartTime']

# 
df_settled_reduced_filtered.loc[(df_settled_reduced_filtered.iterationName == 'pilot4'),'timePlaced'] = df_settled_reduced_filtered[(df_settled_reduced_filtered.iterationName == 'pilot4')]['relativePlacementTime'] 
df_settled_reduced_filtered.loc[(df_settled_reduced_filtered.timePlaced <= 0),'timePlaced'] = 0



In [None]:
## save out to csv dir, where all the csv's go to live
out_path = os.path.join(csv_dir,'block_silhouette_settled_{}.csv'.format(iterationName))
df_settled_reduced_filtered.to_csv(out_path)

In [None]:
## load in dataframe
data_path = os.path.join(csv_dir,'block_silhouette_settled_{}.csv'.format(iterationName))
df = pd.read_csv(data_path)
df

## Explore End

In [None]:
# Ensure one to one gameID and workerId 
# Should only happen if a repeat worker gets through

query = coll.find({"$and":[
                        {'workerId':{'$exists':True}},
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'explore_end'},
                        {"$or":[{'iterationName':'pilot2'},
                                {'iterationName':'pilot3'},
                                {'iterationName':'pilot4'}]},
                        {'trialNum':0}]
                     })

df_explore_end_full = pd.DataFrame(list(query.sort('timeAbsolute')))

assert (np.mean(df_trial_end_full['workerId'].value_counts()) == np.mean(df_trial_end_full['gameID'].value_counts()))

In [None]:
# get ids of people with trial 15 data
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'explore_end'},
                        {'iterationName': iterationName},
                        {'trialNum':15}]
                     })
complete_data_df = pd.DataFrame(query)
complete_data_ids = list(complete_data_df['workerId'])

In [None]:
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'explore_end'},
                        {'iterationName':iterationName}]
                     })

df_explore_end_full = pd.DataFrame(list(query.sort('timeAbsolute')))



# filter dataframe for complete datasets
df_explore_end_full_filtered = df_explore_end_full[df_explore_end_full.workerId.isin(complete_data_ids)]


# reduce to crucial information
df_explore_end_reduced_filtered = df_explore_end_full_filtered[['gameID','trialNum','phase','condition',
                                                            'eventType','score','normedScore','numBlocks',
                                                            'timeAbsolute','timeRelative',
                                                            'currBonus','exploreResets',
                                                            'allVertices','nPracticeAttempts','exploreStartTime',
                                                            'targetName','numBlocksExplore']]

df_explore = df_explore_end_reduced_filtered.sort_values(by=['gameID', 'timeAbsolute'])

In [None]:
df_explore

## Integrate reset data before sending

In [None]:
# Resets

query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'reset'},
                        {'phase':'explore'},
                        {'iterationName':iterationName}]
                     })
df_resets_full = pd.DataFrame(list(query.sort('timeAbsolute')))
df_resets_full_filtered = df_resets_full[df_resets_full.workerId.isin(complete_data_ids)]
df_resets_reduced_filtered = df_resets_full_filtered[['gameID','trialNum','phase','condition','numBlocks']]
pre_reset_blocks = df_resets_reduced_filtered.groupby(
    ['gameID','trialNum','phase','condition'])['numBlocks'].apply(list).reset_index()

# Merge pre-reset blocks with explore data
pre_reset_blocks = pre_reset_blocks.rename(columns = {'numBlocks':'preResetExploreBlocks'})
df_explore = df_explore.merge(pre_reset_blocks, on=['gameID', 'trialNum','phase','condition'], how='left')

# Rename and add totals
df_explore.loc[df_explore.preResetExploreBlocks.isnull(),'preResetExploreBlocks'] = df_explore.preResetExploreBlocks.loc[df_explore.preResetExploreBlocks.isnull()].apply(lambda x:[])
df_explore = df_explore.rename(columns = {'numBlocksExplore':'finalExploreBlocks'})
df_explore['totalExploreBlocks'] = df_explore['finalExploreBlocks'] + df_explore['preResetExploreBlocks'].apply(np.sum)

#df_explore = df_explore.fillna(value={'totalExploreBlocks': 0 })


In [None]:
df_explore['numAttempts'] = df_explore['preResetExploreBlocks'].apply(len) + 1

In [None]:
## save out to csv dir, where all the csv's go to live
out_path = os.path.join(csv_dir,'block_silhouette_explore_{}.csv'.format(iterationName))
df_explore.to_csv(out_path)

### Test data saving

In [62]:
query = coll.find({"$and":[
                        #{'condition':{'$ne':'practice'}},
                        {'eventType':'initial'},
                        {'iterationName':iterationName}]
                     })
df = pd.DataFrame(list(query.sort('timeAbsolute')))
df

Unnamed: 0,F1Score,_id,aID,allBlockDims,blockBodyProperties,blockCenterX,blockCenterY,blockColor,blockColorID,blockColors,...,timeRelative,timeThresholdRed,timeThresholdYellow,trialNum,version,vertices,workerId,worldHeightUnits,worldScale,worldWidthUnits
0,0,5e210433e0e28b3fb63a7f66,3VAR3R6G1QF9OO6T7NH97CVPSP8O8T,"[[1, 2], [2, 1], [2, 2], [2, 4], [4, 2]]","{'id': 6, 'type': 'body', 'label': 'Rectangle ...",495.0,424.786115,#B13B00,0,,...,179629.295,15000,30000,,,"[{'x': -247.50000000000017, 'y': 702.428571428...",A12FTSX85NQ8N9,8,2.2,8
1,0,5e210438e0e28b3fb63a7f68,3VAR3R6G1QF9OO6T7NH97CVPSP8O8T,"[[1, 2], [2, 1], [2, 2], [2, 4], [4, 2]]","{'id': 8, 'type': 'body', 'label': 'Rectangle ...",495.0,278.536115,#B13B00,0,,...,184479.075,15000,30000,,,"[{'x': -247.50000000000017, 'y': 702.428571428...",A12FTSX85NQ8N9,8,2.2,8
2,0,5e21043be0e28b3fb63a7f6a,3VAR3R6G1QF9OO6T7NH97CVPSP8O8T,"[[1, 2], [2, 1], [2, 2], [2, 4], [4, 2]]","{'id': 10, 'type': 'body', 'label': 'Rectangle...",495.0,373.508113,#B13B00,0,,...,187334.91,15000,30000,,,"[{'x': -247.50000000000017, 'y': 702.428571428...",A12FTSX85NQ8N9,8,2.2,8
3,0,5e210441e0e28b3fb63a7f6c,3VAR3R6G1QF9OO6T7NH97CVPSP8O8T,"[[1, 2], [2, 1], [2, 2], [2, 4], [4, 2]]","{'id': 18, 'type': 'body', 'label': 'Rectangle...",550.0,591.286115,#B13B00,0,,...,193802.02,15000,30000,,,"[{'x': -247.50000000000017, 'y': 702.428571428...",A12FTSX85NQ8N9,8,2.2,8
4,0,5e210444e0e28b3fb63a7f6e,3VAR3R6G1QF9OO6T7NH97CVPSP8O8T,"[[1, 2], [2, 1], [2, 2], [2, 4], [4, 2]]","{'id': 20, 'type': 'body', 'label': 'Rectangle...",440.030353,674.490706,#B13B00,0,,...,196690.675,15000,30000,,,"[{'x': -247.50000000000017, 'y': 702.428571428...",A12FTSX85NQ8N9,8,2.2,8
5,0,5e210448e0e28b3fb63a7f70,3VAR3R6G1QF9OO6T7NH97CVPSP8O8T,"[[1, 2], [2, 1], [2, 2], [2, 4], [4, 2]]","{'id': 22, 'type': 'body', 'label': 'Rectangle...",440.028912,619.953196,#B13B00,0,,...,200215.45,15000,30000,,,"[{'x': -247.50000000000017, 'y': 702.428571428...",A12FTSX85NQ8N9,8,2.2,8
6,0,5e21044ce0e28b3fb63a7f72,3VAR3R6G1QF9OO6T7NH97CVPSP8O8T,"[[1, 2], [2, 1], [2, 2], [2, 4], [4, 2]]","{'id': 24, 'type': 'body', 'label': 'Rectangle...",412.5,537.286115,#B13B00,0,,...,204568.575,15000,30000,,,"[{'x': -247.50000000000017, 'y': 702.428571428...",A12FTSX85NQ8N9,8,2.2,8
7,0,5e210450e0e28b3fb63a7f74,3VAR3R6G1QF9OO6T7NH97CVPSP8O8T,"[[1, 2], [2, 1], [2, 2], [2, 4], [4, 2]]","{'id': 26, 'type': 'body', 'label': 'Rectangle...",495.0,422.536115,#B13B00,0,,...,208520.095,15000,30000,,,"[{'x': -247.50000000000017, 'y': 702.428571428...",A12FTSX85NQ8N9,8,2.2,8
8,0,5e210456e0e28b3fb63a7f76,3VAR3R6G1QF9OO6T7NH97CVPSP8O8T,"[[1, 2], [2, 1], [2, 2], [2, 4], [4, 2]]","{'id': 28, 'type': 'body', 'label': 'Rectangle...",495.0,314.536115,#B13B00,0,,...,214302.05,15000,30000,,,"[{'x': -247.50000000000017, 'y': 702.428571428...",A12FTSX85NQ8N9,8,2.2,8
9,0,5e2104bae0e28b3fb63a7f7a,3VAR3R6G1QF9OO6T7NH97CVPSP8O8T,"[[1, 2], [2, 1], [2, 2], [2, 4], [4, 2]]","{'id': 35, 'type': 'body', 'label': 'Rectangle...",385.0,645.161111,#6B4623,2,"[#78878C, #791E94, #6B4623, #FF4A1C, #E85D75, ...",...,315104.545,15000,30000,0.0,82.0,"[{'x': -247.50000000000017, 'y': 702.428571428...",A12FTSX85NQ8N9,8,2.2,8


In [47]:
df[['gameID','trialNum','phase','condition',
    'eventType','score','normedScore','numBlocks',
    'timeAbsolute','timeRelative','buildTime',
    'currBonus','allVertices','nPracticeAttempts',
    'buildStartTime','buildFinishTime','targetName',
    'blockColor','blockColorID','numTargets', 'prePostSetSize',
    'numRepetitions','repetition','targetID',
    'bonusThresholdLow','bonusThresholdMid','bonusThresholdHigh',
    'timeThresholdYellow','timeThresholdRed','devMode',
    'timeBonus'
   ]]

KeyError: "None of [Index(['gameID', 'trialNum', 'phase', 'condition', 'eventType', 'score',\n       'normedScore', 'numBlocks', 'timeAbsolute', 'timeRelative', 'buildTime',\n       'currBonus', 'allVertices', 'nPracticeAttempts', 'buildStartTime',\n       'buildFinishTime', 'targetName', 'blockColor', 'blockColorID',\n       'numTargets', 'prePostSetSize', 'numRepetitions', 'repetition',\n       'targetID', 'bonusThresholdLow', 'bonusThresholdMid',\n       'bonusThresholdHigh', 'timeThresholdYellow', 'timeThresholdRed',\n       'devMode', 'timeBonus'],\n      dtype='object')] are in the [columns]"

In [94]:
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        {'iterationName':iterationName}]
                     })

df = pd.DataFrame(list(query.sort('gameID')))

# df[['gameID','trialNum','phase','condition',
#     'eventType','score','normedScore','numBlocks',
#     'timeAbsolute','timeRelative','buildTime',
#     'currBonus','allVertices','nPracticeAttempts',
#     'buildStartTime','buildFinishTime','targetName',
#     'blockColor','blockColorID','numTargets', 'prePostSetSize',
#     'numRepetitions','repetition','targetID',
#     'bonusThresholdLow','bonusThresholdMid','bonusThresholdHigh',
#     'timeThresholdYellow','timeThresholdRed','devMode',
#     'timeBonus'
#    ]]

df[['gameID','trialNum','phase','condition','repetition','normedScore','targetName','currBonus','timeBonus','score']]

Unnamed: 0,gameID,trialNum,phase,condition,repetition,normedScore,targetName,currBonus,timeBonus,score
0,0576-854f9e5c-1d8f-45e2-8037-d67a78e0d9da,0,pre,repeated,0,0.478350,hand_selected_006,0.00,0.0,0.000
1,0576-854f9e5c-1d8f-45e2-8037-d67a78e0d9da,1,pre,control,0,0.314096,hand_selected_011,0.00,0.0,0.000
2,0576-854f9e5c-1d8f-45e2-8037-d67a78e0d9da,2,pre,repeated,0,0.500030,hand_selected_009,0.00,0.0,0.000
3,0576-854f9e5c-1d8f-45e2-8037-d67a78e0d9da,3,pre,control,0,0.417431,hand_selected_004,0.00,0.0,0.000
4,0576-854f9e5c-1d8f-45e2-8037-d67a78e0d9da,4,pre,repeated,0,0.752928,hand_selected_012,0.01,0.0,0.010
5,0576-854f9e5c-1d8f-45e2-8037-d67a78e0d9da,5,pre,control,0,0.568097,hand_selected_005,0.00,0.0,0.010
6,0576-854f9e5c-1d8f-45e2-8037-d67a78e0d9da,6,pre,repeated,0,0.570508,hand_selected_008,0.00,0.0,0.010
7,0576-854f9e5c-1d8f-45e2-8037-d67a78e0d9da,7,pre,control,0,0.484642,hand_selected_016,0.00,0.0,0.010
8,0576-854f9e5c-1d8f-45e2-8037-d67a78e0d9da,8,repeated,repeated,1,0.606160,hand_selected_012,0.00,0.0,0.010
9,0576-854f9e5c-1d8f-45e2-8037-d67a78e0d9da,9,repeated,repeated,1,0.544478,hand_selected_006,0.00,0.0,0.010


In [96]:
# get sequence of trial_end data for all people
# note that these currently lump everyone together, and don't
query = coll.find({"$and":[
                        {'eventType':'survey_data'},
                        {'iterationName':iterationName}]
                     })
df_survey = pd.DataFrame(list(query.sort('workerId')))
list(df_survey[['workerId','age','comments','difficulty','fun','strategies','inputDevice','sex','score']]['strategies'])

['When I first started I kept trying to make it just like the picture, but the scale was off, so then I just went for trying to get it as close as I could',
 'No',
 'Not really']