In [3]:
import os
import sys
import urllib, io

import numpy as np
import scipy.stats as stats
import pandas as pd

import pymongo as pm
from collections import Counter
import json
import re
import ast

from PIL import Image, ImageOps, ImageDraw, ImageFont 

from io import BytesIO
import base64

import  matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

import seaborn as sns
sns.set_context('talk')
sns.set_style('darkgrid')

from IPython.display import clear_output

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [4]:
## directory & file hierarchy
proj_dir = os.path.abspath('..')
datavol_dir = os.path.join(proj_dir,'data')
analysis_dir = os.path.abspath(os.path.join(os.getcwd(),'..'))
results_dir = os.path.join(proj_dir,'results')
plot_dir = os.path.join(results_dir,'plots')
csv_dir = os.path.join(results_dir,'csv')
json_dir = os.path.join(results_dir,'json')
exp_dir = os.path.abspath(os.path.join(proj_dir,'experiments'))
png_dir = os.path.abspath(os.path.join(datavol_dir,'png'))
jefan_dir = os.path.join(analysis_dir,'jefan')
will_dir = os.path.join(analysis_dir,'will')

## add helpers to python path
if os.path.join(proj_dir,'stimuli') not in sys.path:
    sys.path.append(os.path.join(proj_dir,'stimuli'))
    
if not os.path.exists(results_dir):
    os.makedirs(results_dir)
    
if not os.path.exists(plot_dir):
    os.makedirs(plot_dir)   
    
if not os.path.exists(csv_dir):
    os.makedirs(csv_dir)       

In [5]:
# set vars 
auth = pd.read_csv(os.path.join(analysis_dir,'auth.txt'), header = None) # this auth.txt file contains the password for the sketchloop user
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org' ## cocolab ip address

# have to fix this to be able to analyze from local
import pymongo as pm
conn = pm.MongoClient('mongodb://sketchloop:' + pswd + '@127.0.0.1')
db = conn['block_construction']
coll = db['silhouette']

# which iteration name should we use?
iterationName = 'Exp2Pilot3'

# variables to check integrity of data
numTrials = 24

## Sanity Checks

In [6]:
# Ensure one to one gameID and workerId 
# Should only happen if a repeat worker gets through

query = coll.find({"$and":[
                        {'workerId':{'$exists':True}},
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        {"$or":[{'iterationName':'pilot2'},
                                {'iterationName':'pilot3'},
                                {'iterationName':'pilot4'},
                                {'iterationName':'Exp2Pilot1'},
                                {'iterationName':'Exp2Pilot1_turk'},
                                {'iterationName':'Exp2Pilot1_turk'}]},
                        {'trialNum':0}]
                     })

df_trial_end_full = pd.DataFrame(list(query.sort('timeAbsolute')))
#df_trial_end_full[['workerId','gameID']]


assert (np.mean(df_trial_end_full['workerId'].value_counts()) == np.mean(df_trial_end_full['gameID'].value_counts()))

### Find full datasets for Silhouette_1

In [7]:
# Assuming that if trial 23 saves, then 0-22 have also saved 
# get ids of people with trial 23 data
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        {'iterationName': iterationName},
                        {'trialNum': numTrials-1}]
                     })
complete_data_df = pd.DataFrame(query)
complete_data_ids = list(complete_data_df['workerId'])

In [8]:
complete_data_ids

['A2ZDEERVRN5AMC',
 'A17FGZ1I5P9RZA',
 'A2LANO898EAYKJ',
 'ALLP45O3BDWYM',
 'A1LA6CIGBNDOH9',
 'AY832D29HUURG',
 'A28UGNCW3YMSTH',
 'A1ROEDVMTO9Y3X',
 'ARL7HOWLEHNOP',
 'AMV1E7FFPVAW4',
 'A1VR1XQEQQXYUE',
 'A3CWYWKQXX4RIZ',
 'A3FT3XPTOWHJMY',
 'A1CY7IOJ9YH136',
 'A1X84T4EFW04GZ',
 'A3G16WWK0QUQ80',
 'A314ERJIHRSDY7',
 'A2HHWFGVV9UUC5',
 'A3LXD82BMSRT2F',
 'A7O82NXM2PI12',
 'ANGJ99ZU0TTGO',
 'A2EA2PN47ZWILX',
 'A5NHP0N1XC09K',
 'A1F669OTXWIJW0',
 'AT468RB7BWBQW',
 'A3V57BKH58EUIY',
 'AEF74ZYJTTEIA',
 'AVT79B8F5O9LI',
 'A1YFVXP4A1CXSF',
 'A2GLSQQQE9UZA6',
 'A2LAMCJLVCRQ4T',
 'A1M682B2WUSYJP',
 'A2GA29WTMFW2W',
 'A1TMZLYXQAK8Q0',
 'A2196WCNDZULFS',
 'A2MCG5W6LHSRG9',
 'A1OVGCI9KUL4MI',
 'A2LF84L3K71GR2',
 'AQOXSP4W3ITSW',
 'AG9LWKO86TNHG',
 'A2UCTX06NM6Y02',
 'AFIK3VBMMX6G6',
 'A3LI18V0QQ34YK',
 'AILDNTO2TWB4A',
 'A3K0E7TSPX25GH',
 'AEQ8K4HBO323D',
 'A3MLUEOP3CCLXL',
 'A39MKVROUZ1UWR',
 'A2QTSQ26FID1FK',
 'A1H3IOEYN0VNB2',
 'A3QEVFM3UD67BA']

In [9]:
# Filter for full datasets
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'trial_end'},
                        {'iterationName':iterationName}]
                     })

df_trial_end_full = pd.DataFrame(list(query.sort('timeAbsolute')))


# filter dataframe for complete datasets
df_trial_end_full_filtered = df_trial_end_full[df_trial_end_full.workerId.isin(complete_data_ids)]

# reduce to crucial information
df_trial_end_reduced_filtered = df_trial_end_full_filtered[[
    'gameID','trialNum','phase','condition','eventType','targetName','repetition','targetID', #trial identifiers
    'nullScore','F1Score','normedScore','rawScoreDiscrete','nullScoreDiscrete','normedScoreDiscrete','scoreGapDiscrete', #scoring
    'numBlocks','nPracticeAttempts','blockColor','blockColorID','blockFell','doNothingRepeats',#misc. trial info
    'score','currBonus','timeBonus', #bonusing
    'timeAbsolute','timeRelative','buildTime','buildStartTime','buildFinishTime','timeToBuild', #timing 
    'discreteWorld','allVertices', #world reconstruction
    'browser','browserVersion','os','devMode', #developer info
    #below here should be the same for every trial in a dataset
    'iterationName',
    'numTargets', 'prePostSetSize','numRepetitions', #pre-post info
    'bonusThresholdLow','bonusThresholdMid','bonusThresholdHigh','timeThresholdYellow','timeThresholdRed', #bonus info
    ]]

#Fix error in data-saving- normedScoreDiscrete saved as rawScoreDiscrete
df_trial_end_reduced_filtered['normedScoreDiscrete'] = df_trial_end_reduced_filtered['rawScoreDiscrete']
df_trial_end_reduced_filtered.drop(['rawScoreDiscrete'], axis=1)


df = df_trial_end_reduced_filtered.sort_values(by=['gameID', 'timeAbsolute'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [24]:
targetMaps = {}

with open(os.path.join(csv_dir,'targetMaps.txt')) as json_file:
    targetMaps = json.load(json_file)

In [155]:
def getPrecision(arr1,arr2):
    print(arr1)
    print(arr2)
    prod = np.multiply(arr1,arr2)
    false_pos = np.subtract(arr2,prod)
    numerator = np.sum(prod)
    denominator = np.add(numerator,np.sum(false_pos))
    recall = numerator/denominator
    return recall

def getRecall(arr1,arr2):
    prod = np.multiply(arr1,arr2)
    false_neg = np.subtract(arr1,arr2)
    numerator = np.sum(prod)
    denominator = np.add(np.sum(prod),np.sum(false_neg))
    recall = numerator/denominator
    return recall

def getF1Score(targetName, discreteWorld):
    targetMap = targetMaps[targetName]
    arr1 = 1*np.logical_not(np.array(targetMap))
    arr2 = 1*np.logical_not(np.array(discreteWorld))
    recall = getRecall(arr1, arr2)
    precision = getPrecision(arr1, arr2)
    numerator = np.multiply(precision, recall)
    denominator = np.add(precision, recall)
    quotient = np.divide(numerator, denominator)
    f1Score = np.multiply(2, quotient)
    #print('recall ' + recall);
    return f1Score

def getF1ScoreLambda(row):
    return(getF1Score(row['targetName'], row['discreteWorld']))
    
def getNullScore(targetName):
    targetMap = targetMaps[targetName]
    arr1 = 1*np.logical_not(np.array(targetMap))
    arr2 = 1*np.zeros(arr1.shape)
    recall = getRecall(arr1, arr2)
    precision = getPrecision(arr1, arr2)
    numerator = np.multiply(precision, recall)
    denominator = np.add(precision, recall)
    quotient = np.divide(numerator, denominator)
    f1Score = np.multiply(2, quotient)
    print('recall ', str(recall));
    print('precision ', str(precision));
    print('quotient ', str(quotient));
    return f1Score

In [156]:
getNullScore('hand_selected_009')

[[0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 0 0 0 0 0 0]
 [1 1 0 0 1 1 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 0 0 0 0 0]
 [1 1 0 0 1 1 1 1 0 0 0 0 0]
 [1 1 1 1 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0

  


nan

In [149]:
getF1Score(df.targetName[0],df.discreteWorld[0])

0.9010989010989011

In [151]:
df['rawF1DiscreteScore'] =  df.apply(getF1ScoreLambda, axis=1)

  app.launch_new_instance()


742     0.883721
754     0.500000
766     0.717949
778     0.731707
792     0.551724
803     0.869565
815     0.370370
826     0.776471
839     0.818182
853     0.800000
866     0.375000
881     0.886076
895     0.588235
909     0.613333
923     0.826667
933     0.555556
946     0.611111
959     0.727273
970     0.776471
985     0.857143
992     0.794521
1000    0.631579
1007    0.736842
1013    0.631579
710     0.400000
715     1.000000
721     1.000000
729     0.896552
739     0.949495
752     0.864865
          ...   
654     1.000000
658     1.000000
663     1.000000
669     1.000000
672     1.000000
676     1.000000
150     0.492754
158     0.717949
167     0.731707
175     0.686567
185     0.864865
193     1.000000
201     0.916667
208     0.857143
215     0.594595
221     0.833333
230     1.000000
238     0.760563
246     0.950000
259     0.980392
266     0.844444
274     0.724638
281     0.322581
288     0.594595
296     0.989474
301     0.780488
307     0.871795
312     0.8571

In [159]:
np.sum(df['rawF1DiscreteScore']0)

0

In [152]:
# Make new column: phase_extended
# Same as phase but with 'repeated' split into 'repetition 1' and 'repetition 2'

df['phase_extended'] = df['phase']
df.loc[(df.phase=='repeated') & (df.repetition==1),'phase_extended'] = 'repetition 1'
df.loc[(df.phase=='repeated') & (df.repetition==2),'phase_extended'] = 'repetition 2'

phase_dict = {
    'pre':0,
    'repetition 1':1,
    'repetition 2':2,
    'post':3
}

ordered_phases = ['pre','repetition 1','repetition 2','post']
df['phase_number'] = df.phase_extended.astype("category",
                                              ordered=True,
                                              categories=ordered_phases).cat.codes

#df['phase_number'] 

  exec(code_obj, self.user_global_ns, self.user_ns)


In [153]:
out_path = os.path.join(csv_dir,'block_silhouette_{}.csv'.format(iterationName))
df.to_csv(out_path)

## Initial Block Data
Initial block placements (before physics, after snapping, before falling)

In [None]:
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'initial'},
                        {'iterationName':iterationName}]
                     })

df_initial_full = pd.DataFrame(list(query))

# filter dataframe for complete datasets
df_initial_full_filtered = df_initial_full[df_initial_full.workerId.isin(complete_data_ids)]

print('Loaded ' + str(df_initial_full_filtered.shape[0]) + ' complete sets of initial blocks')
# reduce to crucial information

In [None]:
df_initial_full_filtered.columns

In [None]:
df_initial_reduced_filtered = df_initial_full_filtered[[
    'gameID','trialNum','phase','condition','eventType','targetName','repetition','targetID','blockNum', #trial identifiers
    'nullScore','incrementalScore','normedIncrementalScore','rawScoreDiscrete','incrementalNormedScoreDiscretePrevious', #scoring
    'score','currBonus', #bonusing
    'timeAbsolute','timeRelative','timeBlockSelected','timeBlockPlaced','relativePlacementTime', #timing 
    'discreteWorld','vertices','blockKind','blockColorID','blockColor','blockCenterX', 'blockCenterY', #world reconstruction
    'x_index','y_index','x_discrete','y_discrete','width_discrete','height_discrete'
    ]]

df_initial_reduced_filtered = df_initial_reduced_filtered.sort_values(by=['gameID', 'timeAbsolute'])

In [None]:
df_initial_reduced_filtered['rawScoreDiscrete']

In [None]:
out_path = os.path.join(csv_dir,'block_silhouette_initial_{}.csv'.format(iterationName))
df_initial_reduced_filtered.to_csv(out_path)

## Settled Block Data
Block data after coming to rest (after physics)

In [None]:
query = coll.find({"$and":[
                        {'condition':{'$ne':'practice'}},
                        {'eventType':'settled'},
                        {'iterationName':iterationName}]
                     })

df_settled_full = pd.DataFrame(list(query))


# filter dataframe for complete datasets
df_settled_full_filtered = df_settled_full[df_settled_full.workerId.isin(complete_data_ids)]

print('Loaded ' + str(df_settled_full_filtered.shape[0]) + ' complete sets of settled blocks')
# reduce to crucial information

In [None]:
df_settled_full_filtered.columns

In [None]:
df_settled_reduced_filtered = df_settled_full_filtered[[
    'gameID','trialNum','phase','condition','eventType','targetName','repetition','targetID', #trial identifiers
    'nullScore','incrementalScore','normedIncrementalScore','rawScoreDiscrete','incrementalNormedScoreDiscrete','numBlocks','blockFell', #scoring
    'score','currBonus', #bonusing
    'timeAbsolute','timeRelative',#timing 
    'discreteWorld','allVertices','blockKind','blockColorID','blockColor','blockCenterX', 'blockCenterY',#world reconstruction
    'x_index','y_index','x_discrete','y_discrete'
    ]]

df_settled_reduced_filtered = df_settled_reduced_filtered.sort_values(by=['gameID', 'timeAbsolute'])

In [None]:
out_path = os.path.join(csv_dir,'block_silhouette_settled_{}.csv'.format(iterationName))
df_settled_reduced_filtered.to_csv(out_path)

In [None]:
query = coll.find({"$and":[
                        {'eventType':'survey_data'},
                        {'iterationName':iterationName}]
                     })
df_survey = pd.DataFrame(list(query.sort('absoluteTime')))
df_survey[['gameID','age','comments','difficulty','fun','strategies','inputDevice','sex','score']]

In [None]:
out_path = os.path.join(csv_dir,'block_silhouette_survey_{}.csv'.format(iterationName))
df_survey.to_csv(out_path)