# Analyses for SPP

In [None]:
import os
import sys
import urllib, io

import numpy as np
import scipy.stats as stats
import pandas as pd
from random import random

import pymongo as pm
from collections import Counter
import json
import re
import ast

from PIL import Image, ImageOps, ImageDraw, ImageFont 

from io import BytesIO
import base64

import  matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import matplotlib as mpl
# Font type 42 makes the PDF backend embed TrueType fonts, which keeps text
# editable in exported figures (see matplotlib's rcParams documentation).
mpl.rcParams['pdf.fonttype'] = 42

import seaborn as sns
# Notebook-wide seaborn defaults; several later cells override both of these.
sns.set_context('talk')
sns.set_style('darkgrid')

from IPython.display import clear_output

import warnings
# Silence DeprecationWarnings and the two numpy "size changed" messages so
# they do not clutter cell output.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [None]:
## directory & file hierarchy
# Every path is derived from the parent of the current working directory.
proj_dir = os.path.abspath('..')
datavol_dir = os.path.join(proj_dir,'data')
analysis_dir = os.path.abspath(os.path.join(os.getcwd(),'..'))
results_dir = os.path.join(proj_dir,'results')
plot_dir = os.path.join(results_dir,'plots')
csv_dir = os.path.join(results_dir,'csv')
json_dir = os.path.join(results_dir,'json')
exp_dir = os.path.abspath(os.path.join(proj_dir,'experiments'))
png_dir = os.path.abspath(os.path.join(datavol_dir,'png'))
jefan_dir = os.path.join(analysis_dir,'jefan')
will_dir = os.path.join(analysis_dir,'will')

## add helpers to python path
if os.path.join(proj_dir,'stimuli') not in sys.path:
    sys.path.append(os.path.join(proj_dir,'stimuli'))

## make sure the output folders exist
# exist_ok=True replaces the separate "exists?" check, so there is no window
# between checking for a folder and creating it.
for out_dir in (results_dir, plot_dir, csv_dir):
    os.makedirs(out_dir, exist_ok=True)

### Load Data

In [None]:
# Trial_end data
iterationName = 'pilot4'

# The CSV file name embeds the iteration name chosen above.
trial_path = os.path.join(csv_dir, f'block_silhouette_{iterationName}.csv')
df = pd.read_csv(trial_path)

In [None]:
# Settled_block data (read from the csv folder for the current iteration)
settled_path = os.path.join(csv_dir, f'block_silhouette_settled_{iterationName}.csv')
dfs = pd.read_csv(settled_path)

In [None]:
# Explore_end data (read from the csv folder for the current iteration)
explore_path = os.path.join(csv_dir, f'block_silhouette_explore_{iterationName}.csv')
dfe = pd.read_csv(explore_path)

In [None]:
# Exclude outliers
# gameIDs that have a row with trialNum 15 and a score of 0.
low_scores = df.loc[(df.trialNum == 15) & (df.score == 0), 'gameID']

# Drop those games from all three tables.
df = df[~df.gameID.isin(low_scores)]
dfs = dfs[~dfs.gameID.isin(low_scores)]
dfe = dfe[~dfe.gameID.isin(low_scores)]

n = df['gameID'].nunique()
print(f'{n} participants with score > 0 and full set of trials')


## Preprocessing

### Block Data

In [None]:
# Add a condition for physical with 0 actions
dfs['extendedCondition'] = dfs['condition']
trialInfo = df[['gameID','trialNum','numBlocksExplore']]
dfs = dfs.merge(trialInfo, on=['gameID', 'trialNum'], how='left')
# The mask is built from dfs itself: a mask taken from df carries df's index
# and does not line up row-for-row with the merged dfs.
dfs.loc[(dfs.condition == 'physical') & (dfs.numBlocksExplore == 0),'extendedCondition'] = 'no_action_physical'

# Add useful variables to dataframe
# astype(str) converts each trial number individually; str(Series) would
# instead paste the printed representation of the whole column into every row.
dfs['subject_trial'] = dfs['gameID'] + '_' + dfs['trialNum'].astype(str)
dfs['condition_number'] = np.where(dfs['condition']=='mental', 100, 2)
dfs['condition_code'] = dfs['condition_number'] + dfs['trialNum']
# Round timePlaced to the nearest multiple of 10000. np.round is used because
# the np.round_ alias no longer exists in NumPy 2.0.
dfs['time_bin'] = np.round(dfs['timePlaced']/10000)*10000

# Split the settled-block rows by phase.
dfs_build = dfs[dfs.phase == 'build']
dfs_explore = dfs[dfs.phase == 'explore']

In [None]:
# Attach the explore-phase summary columns to the trial table; a left join
# keeps every row of df.
df = pd.merge(
    df,
    dfe[['gameID', 'trialNum', 'condition',
         'totalExploreBlocks', 'numAttempts', 'finalExploreBlocks']],
    how='left',
    on=['gameID', 'trialNum', 'condition'],
)

### Explore Data

## Basic summaries

### Did people use the physical practice phase? 
In what percentage of physical trials did people place >0 blocks?

What is the distribution of total time from first block placed to last block placed? (Did they use the full time?)

In [None]:
# Restrict the explore-phase table to the physical condition.
physical_explore = dfe[dfe.condition == 'physical']

# Number of rows with at least one block placed, and with none.
n_something = len(physical_explore[physical_explore.totalExploreBlocks > 0])
n_nothing = len(physical_explore[physical_explore.totalExploreBlocks == 0])

print(f'In {n_something} physical explore phases, participant placed at least one block')
print(f'In {n_nothing} physical explore phases, participant placed zero blocks')
print(f'{100*n_something/(n_something+n_nothing)}%')

In [None]:
def percent_used_exactly(totalExploreBlocks, how_many = 0):
    """Return the percentage of entries that differ from ``how_many``.

    Despite the name, the numerator counts entries that are NOT equal to
    ``how_many``; with the default of 0 this is the share of non-zero entries.

    Args:
        totalExploreBlocks: array-like supporting boolean-mask indexing
            (e.g. a pandas Series).
        how_many: the value entries are compared against.

    Returns:
        100 * (# entries != how_many) / (# entries != how_many + # entries == how_many).

    Raises:
        ZeroDivisionError: if the input has no entries.
    """
    differing = totalExploreBlocks[totalExploreBlocks != how_many]
    matching = totalExploreBlocks[totalExploreBlocks == how_many]
    count_differing = len(differing)
    total = count_differing + len(matching)
    return 100 * count_differing / total

In [None]:
def percent_used_at_least(totalExploreBlocks, how_many = 1):
    """Return the percentage of entries that are >= ``how_many``.

    Also prints the two underlying counts (entries >= how_many, then entries
    < how_many), one per line.

    Args:
        totalExploreBlocks: array-like supporting boolean-mask indexing
            (e.g. a pandas Series).
        how_many: threshold an entry must reach to be counted.

    Returns:
        100 * (# entries >= how_many) / (# entries >= how_many + # entries < how_many).

    Raises:
        ZeroDivisionError: if neither comparison matches any entry.
    """
    reached = totalExploreBlocks[totalExploreBlocks >= how_many]
    fell_short = totalExploreBlocks[totalExploreBlocks < how_many]
    counts = (len(reached), len(fell_short))
    for count in counts:
        print(count)
    return 100 * counts[0] / sum(counts)

In [None]:
# Percentage of physical explore rows with at least one block placed
# (the call also prints the two underlying counts).
percent_used_at_least(physical_explore['totalExploreBlocks'],1)

In [None]:
# For every (gameID, trialNum) group, the percentage of totalExploreBlocks
# entries that are non-zero (percent_used_exactly with its default how_many = 0).
summaryUsedExplores = physical_explore.groupby(['gameID','trialNum'])['totalExploreBlocks'].apply(percent_used_exactly)

In [None]:
# Display the per-(gameID, trialNum) percentages.
summaryUsedExplores

In [None]:
# Sum the per-trial percentages within each gameID, then count the gameIDs
# whose total is exactly 0.
sum((pd.DataFrame(summaryUsedExplores).reset_index().groupby(by='gameID').agg(sum))['totalExploreBlocks']==0)

In [None]:
# Display the per-gameID sums (the grouped sum also covers the trialNum column).
(pd.DataFrame(summaryUsedExplores).reset_index().groupby(by='gameID').agg(sum))

In [None]:
# Percentage of non-zero totalExploreBlocks entries per (gameID, trialNum) group.
summaryUsedExplores = physical_explore.groupby(['gameID','trialNum'])['totalExploreBlocks'].apply(percent_used_exactly)
# Per participant: percentage of those per-trial values that are non-zero,
# shown as a histogram across participants.
sns.distplot(summaryUsedExplores.groupby('gameID').apply(percent_used_exactly),
            bins = 10,
            kde = False)
plt.ylabel('count')
plt.xlabel('percent explore phases used')
plt.title('Participant use of explore phase')
plt.tight_layout()
# The original call named `percent_used`, which is not defined anywhere in
# this notebook; percent_used_exactly is the helper with this signature.
# NOTE(review): with how_many = 100 this returns the percentage of entries
# that are != 100 - confirm that this is the intended summary.
percent_used_exactly(summaryUsedExplores, how_many = 100)

In [None]:
# Number of physical explore rows with at least one block placed.
physical_explore[physical_explore['totalExploreBlocks']>0].shape[0]

In [None]:
# Number of physical explore rows with zero blocks placed.
physical_explore[physical_explore['totalExploreBlocks']==0].shape[0]

In [None]:
# Percentage of non-zero totalExploreBlocks entries per (trialNum, gameID)
# group; percent_used_exactly returns values on a 0-100 scale.
trialExploreUsage = physical_explore.groupby(['trialNum','gameID'])['totalExploreBlocks'].apply(percent_used_exactly)
trialExploreUsage = trialExploreUsage.reset_index()

sns.set_context('poster')
b = sns.lineplot(data = trialExploreUsage,
                palette='husl',
                x='trialNum',
                y='totalExploreBlocks')
plt.legend(bbox_to_anchor=(1.0,1))
plt.ylabel('PercentExploresUsed')
plt.xlabel('trialNum')
# Ticks span 0-100 to match the percentage scale of the plotted values
# (ticks on 0-1 would all sit at the bottom of the axis).
plt.yticks(np.linspace(0,100,6))
plt.tight_layout()
plt.title('mean explore phase use over time', fontsize = 18)


In [None]:
# Percentage of non-zero totalExploreBlocks entries per (targetName, gameID)
# group; percent_used_exactly returns values on a 0-100 scale.
trialExploreUsage = physical_explore.groupby(['targetName','gameID'])['totalExploreBlocks'].apply(percent_used_exactly)
trialExploreUsage = trialExploreUsage.reset_index()

sns.set_context('poster')
b = sns.barplot(data = trialExploreUsage,
                palette='husl',
                x='targetName',
                y='totalExploreBlocks')
plt.ylabel('PercentExploresUsed')
plt.xlabel('targetName')
# Ticks span 0-100 to match the percentage scale of the plotted values
# (ticks on 0-1 would all sit at the bottom of the axis).
plt.yticks(np.linspace(0,100,6))
plt.tight_layout()
plt.title('mean explore phase use by structure', fontsize = 18)


In [None]:
# Histogram (9 bins, no KDE) of totalExploreBlocks across physical explore rows.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; the
# title string also reads "places" where "placed" looks intended.
fig = plt.figure(figsize=(8,6))
sns.set_context('poster')
sns.set_style('whitegrid', {'legend.frameon':False})
c = sns.distplot(physical_explore['totalExploreBlocks'],
                 bins=9, 
                 kde=False, 
                 #hue='gameID', 
                 #kind='point',
                );
plt.ylabel('count')
plt.xlabel('blocks placed')
plt.title('Blocks places in physical explore phase')
plt.tight_layout()


In [None]:
# Block counts for physical explore rows in which at least one block was placed.
y = physical_explore.loc[physical_explore['totalExploreBlocks'] > 0, 'totalExploreBlocks']

# Mean number of blocks across those rows.
np.mean(y)

In [None]:
# Standard deviation of totalExploreBlocks over physical explore rows with at
# least one block placed.
np.std(physical_explore[physical_explore['totalExploreBlocks']>0]['totalExploreBlocks'])

### Explore vs. exploit question: how many attempts did they make?
Proportion of trials that had 0 attempts (no blocks), 1 attempt, 2+ attempts.

"Attempt" operationalized as explore bout in which >= 1 block placed. 

How many "failures" (tower collapsed/fell over)?

Insofar as num_blocks / attempt is "high," this is suggestive of instrumental planning as opposed to exploration as such. 

In [None]:
# numAttempts for physical explore rows in which at least one block was placed.
real_attempts = physical_explore['numAttempts'][physical_explore.totalExploreBlocks>0]

# Histogram of those attempt counts (default binning, no KDE).
fig = plt.figure(figsize=(8,6))
sns.set_context('poster')
sns.set_style('whitegrid', {'legend.frameon':False})
c = sns.distplot(real_attempts,
                 #bins=9, 
                 kde=False, 
                 #kind='point',
                );
plt.ylabel('count')
plt.xlabel('n attempts')
plt.title('Number of attempts')
plt.tight_layout()


In [None]:
# Number of physical explore rows with exactly 1, 2 and 3 attempts.
one, two, three = (
    int((physical_explore['numAttempts'] == attempts).sum())
    for attempts in (1, 2, 3)
)

print(one, two, three)



In [None]:
# finalExploreBlocks for physical explore rows in which at least one block
# was placed overall.
final_attempts = physical_explore['finalExploreBlocks'][physical_explore.totalExploreBlocks > 0]

# Histogram of those block counts (default binning, no KDE).
fig = plt.figure(figsize=(8,6))
sns.set_context('poster')
sns.set_style('whitegrid', {'legend.frameon':False})
c = sns.distplot(final_attempts,
                 #bins=9, 
                 kde=False, 
                 #kind='point',
                );
plt.ylabel('count')
plt.xlabel('n blocks')
plt.title('Number blocks in final attempt')
plt.tight_layout()


In [None]:
# numBlock_data is another name for df (no copy is made).
numBlock_data = df

# Histogram of numBlocks over all rows of the trial table (default binning, no KDE).
fig = plt.figure(figsize=(8,6))
sns.set_context('poster')
sns.set_style('whitegrid', {'legend.frameon':False})
c = sns.distplot(numBlock_data['numBlocks'],
                 #bins=9, 
                 kde=False, 
                 #kind='point',
                );
plt.ylabel('count')
plt.xlabel('n blocks')
plt.title('Number blocks in reconstruction')
plt.tight_layout()

In [None]:
# finalExploreBlocks comes from the physical explore table, so it is reported
# under the "explore" label (the two labels were swapped originally).
print('explore mean: {}'.format(np.mean(physical_explore['finalExploreBlocks'])) +\
      ', sd: {}'.format(np.std(physical_explore['finalExploreBlocks'])))

# numBlocks comes from the trial table and is the quantity plotted as
# "Number blocks in reconstruction".
print('reconstruction mean: {}'.format(np.mean(numBlock_data['numBlocks'])) +\
      ', sd: {}'.format(np.std(numBlock_data['numBlocks'])))

In [None]:
# Display the column names of the trial table.
list(df.columns)

In [None]:
# Bare expression in the middle of a cell: it produces no output here.
df

# Column to plot and summarise; change this name to inspect another column.
col_name = 'totalExploreBlocks'

# Histogram (9 bins, no KDE) of col_name over rows that are not practice
# trials and have buildTime > 0.
fig = plt.figure(figsize=(8,6))
sns.set_context('poster')
sns.set_style('whitegrid', {'legend.frameon':False})
c = sns.distplot(df[(df.condition!='practice') & (df.buildTime>0)][col_name],
                 bins=9, 
                 kde=False, 
                 #kind='point',
                );
plt.ylabel('count')
plt.xlabel(col_name)
plt.title(col_name)
plt.tight_layout()

# NOTE(review): the mean/sd below are computed over all rows of numBlock_data,
# not over the filtered subset plotted above - confirm this is intended.
print(col_name + ' mean: {}'.format(np.mean(numBlock_data[col_name])) +\
      ', sd: {}'.format(np.std(numBlock_data[col_name])))


In [None]:
import ast

def newAppend(a,b):
    """Parse the literal in string ``a`` and return a copy of it with ``b`` appended.

    Args:
        a: string holding a Python literal (parsed with ast.literal_eval);
            the parsed value must provide ``copy`` and ``append``, e.g. a list.
        b: element to append.

    Returns:
        The copied, extended container.
    """
    parsed = ast.literal_eval(a)
    extended = parsed.copy()
    extended.append(b)
    return extended

# Per physical explore row: the parsed preResetExploreBlocks list with
# finalExploreBlocks appended as the last entry.
attemptBlocks = physical_explore.apply(
    lambda row: newAppend(row['preResetExploreBlocks'],row['finalExploreBlocks']),axis=1)


# Long format: one record per attempt.
block_and_attempts = []

# NOTE(review): `subject` is the position of the row within attemptBlocks
# (from enumerate), not a participant identifier.
for subject, attemptList in enumerate(attemptBlocks):
    for an, ab in enumerate(attemptList):
        # [1-based attempt number, blocks in attempt, is last attempt?, row position, attempts in row]
        block_and_attempts.append([an+1,ab,an==len(attemptList)-1,subject,len(attemptList)])

df_block_and_attempts = pd.DataFrame(block_and_attempts, columns = 
                                     ['attempt_number', 'attempt_blocks', 'final_attempt','subject','subjectAttempts']) 
        
# Add uniform noise in [-0.1, 0.1) so identical block counts do not overlap in the plot.
df_block_and_attempts['attempt_blocks_jittered'] = df_block_and_attempts['attempt_blocks'].apply(lambda x: x + ((random()-0.5)/5))

# Swarm plot of jittered block counts by attempt number, coloured by the
# total number of attempts in that row.
fig = plt.figure(figsize=(8,6))
sns.set_context('poster')
sns.set_style('whitegrid', {'legend.frameon':False})
sns.swarmplot(x="attempt_number", y="attempt_blocks_jittered", hue="subjectAttempts", dodge = True,
              palette=["r", "c", "y"], data=df_block_and_attempts)
plt.legend(bbox_to_anchor=(1.0,1))
plt.ylabel('Number of blocks',fontsize=14)
plt.xlabel('Number attempt',fontsize=14)
plt.title('Number of blocks placed in each attempt, by number of attempts total', fontsize=16)


In [None]:
import ast

def newAppend(a,b):
    """Parse the literal in string ``a`` and return a copy of it with ``b`` appended.

    Args:
        a: string holding a Python literal (parsed with ast.literal_eval);
            the parsed value must provide ``copy`` and ``append``, e.g. a list.
        b: element to append.

    Returns:
        The copied, extended container.
    """
    parsed = ast.literal_eval(a)
    extended = parsed.copy()
    extended.append(b)
    return extended

# Per physical explore row: the parsed preResetExploreBlocks list with
# finalExploreBlocks appended as the last entry.
attemptBlocks = physical_explore.apply(
    lambda row: newAppend(row['preResetExploreBlocks'],row['finalExploreBlocks']),axis=1)


# Long format: one record per attempt.
block_and_attempts = []

# `subject` is the position of the row within attemptBlocks (from enumerate),
# i.e. an index over explore phases rather than a participant identifier.
for subject, attemptList in enumerate(attemptBlocks):
    for an, ab in enumerate(attemptList):
        # [1-based attempt number, blocks in attempt, is last attempt?, row position, attempts in row]
        block_and_attempts.append([an+1,ab,an==len(attemptList)-1,subject,len(attemptList)])

df_block_and_attempts = pd.DataFrame(block_and_attempts, columns =
                                     ['attempt_number', 'attempt_blocks', 'final_attempt','subject','subjectAttempts'])

# Add uniform noise in [-0.1, 0.1) so identical block counts do not overlap in the plot.
df_block_and_attempts['attempt_blocks_jittered'] = df_block_and_attempts['attempt_blocks'].apply(lambda x: x + ((random()-0.5)/5))

fig = plt.figure(figsize=(8,6))
sns.set_context('poster')
sns.set_style('whitegrid', {'legend.frameon':False})
sns.swarmplot(x="subject", y="attempt_blocks_jittered", hue="attempt_number", dodge = True,
              palette=["r", "c", "y"], data=df_block_and_attempts)
plt.legend(bbox_to_anchor=(1.0,1))
plt.ylabel('Number of blocks',fontsize=14)
# Axis label and title describe what this plot shows: x is the explore-phase
# index and colour is the attempt number (the original text was carried over
# from the plot keyed on attempt number).
plt.xlabel('Explore phase index',fontsize=14)
plt.title('Number of blocks placed in each attempt, by explore phase', fontsize=16)


### Within the final prototyping attempt, what did people build?

How similar are the prototypes to the final reconstructions?

- Intersection over Union for every matched time step within-participant, between-phase, within-structure -- compared to IOU for between-participant, btw-phase, within-structure.
- block_id "match proportion" metric: compare the observed value to a null distribution with participants shuffled

If this phase was meant to be practice, then for those blocks that they did get to place in the practice phase, did they place *those* blocks more accurately in the build phase than they did in the practice phase?

- Compute difference in area under the curve ...
- Examine cumulative surface area as function of block placement by phase

How much did the within-trial observed accuracy timecourse deviate from optimal accuracy timecourse?

In [None]:
# Display the column names of the trial table.
list(df.columns)

In [None]:
# Keep trials with more than two blocks from games not listed in low_scores.
# .copy() makes df an independent frame, so the column assignments below do
# not write into a slice of the previous df (avoids SettingWithCopyWarning).
df = df[(df.numBlocks>2) & (~df.gameID.isin(low_scores))].copy()


# Split the physical condition by number of explore blocks. The assignments
# run from the widest range to the narrowest, so later lines overwrite earlier
# ones; the order must be kept.
df['extendedCondition'] = df['condition']
df.loc[(df.condition == 'physical') & (df.totalExploreBlocks >= 6),'extendedCondition'] = '>=6 blocks'
df.loc[(df.condition == 'physical') & (df.totalExploreBlocks < 6),'extendedCondition'] = '3<x<6 blocks'
df.loc[(df.condition == 'physical') & (df.totalExploreBlocks <= 3),'extendedCondition'] = '<=3 blocks'
df.loc[(df.condition == 'physical') & (df.totalExploreBlocks == 0),'extendedCondition'] = '0 blocks'

# Bar plot of normedScore by condition, split by the extended condition.
fig = plt.figure(figsize=(10,6))
sns.set_context('poster')
sns.set_style('whitegrid', {'legend.frameon':False})
b = sns.barplot(data = df,
                palette='husl',
                hue='extendedCondition',
                hue_order=['mental','0 blocks','<=3 blocks','3<x<6 blocks','>=6 blocks'],
                #hue_order=['mental','no_action_physical','not_very_physical','physical','very_physical'],
                x='condition',
                y='normedScore')
plt.legend(bbox_to_anchor=(1.0,1))
plt.ylabel('Normed F1 score')
plt.xlabel('Condition')
plt.yticks(np.linspace(0,1,6))
plt.tight_layout()
plt.title('Accuracy of reconstruction by exploration', fontsize = 18)


In [None]:
# Swarm plot of normedScore by condition, coloured by totalExploreBlocks
# (10-colour "hls" palette).
fig = plt.figure(figsize=(8,6))
sns.set_context('poster')
sns.set_style('whitegrid', {'legend.frameon':False})
sns.swarmplot(data=df, x="condition", y="normedScore", hue="totalExploreBlocks", dodge = True,
              palette=sns.color_palette("hls", 10))
plt.legend(bbox_to_anchor=(1.0,1))
plt.title('Accuracy of reconstruction by numblocks in exploration phase', fontsize = 18)

In [None]:
## save out to csv dir, where all the csv's go to live
# Write the current trial table (index included) to the csv folder, tagged
# with the iteration name.
out_path = os.path.join(csv_dir, f'full_temp_block_silhouette_{iterationName}.csv')
df.to_csv(out_path)