# Analysis Notebook for CurioTower experiment

(Need to add link to pre-registration)

The goal of this experiment is to gather human judgments on the "interestingness" and "stability" of towers, both generated in curiodrop and TDW

### Establish connection to mongo
First thing you need to do is to establish an ssh tunnel (aka remote port forwarding) to the server, so that requests to the mongodb can be made "as if" the mongodb server is running on your local computer. Run this from the command line before you begin data analysis if you plan to fetch data from mongo:

`ssh -fNL 27017:127.0.0.1:27017 USERNAME@cogtoolslab.org`

### Load packages

In [1]:
import os
import sys
import urllib, io
os.getcwd()
sys.path.append("..")
sys.path.append("../utils")
sys.path.append("../analysis/utils")


import numpy as np
import scipy.stats as stats
import pandas as pd

import pymongo as pm
from collections import Counter
import json
import re
import ast

from PIL import Image, ImageOps, ImageDraw, ImageFont 

from io import BytesIO
import base64

import  matplotlib
from matplotlib import pylab, mlab, pyplot
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
plt.style.use('seaborn-white')

import seaborn as sns
sns.set_context('talk')
sns.set_style('darkgrid')

from IPython.display import clear_output

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

# import drawing_utils as drawing
# import importlib
# import scoring

### Set up directory paths to plots and data

In [2]:
## directory & file hierarchy
proj_dir = os.path.abspath('..')
datavol_dir = os.path.join(proj_dir,'data')
analysis_dir =  os.path.abspath('.')
results_dir = os.path.join(proj_dir,'results')
plot_dir = os.path.join(results_dir,'plots')
csv_dir = os.path.join(results_dir,'csv')
json_dir = os.path.join(results_dir,'json')
exp_dir = os.path.abspath(os.path.join(proj_dir,'behavioral_experiments'))
png_dir = os.path.abspath(os.path.join(datavol_dir,'png'))

## add helpers to python path
if os.path.join(proj_dir,'stimuli') not in sys.path:
    sys.path.append(os.path.join(proj_dir,'stimuli'))
    
if not os.path.exists(results_dir):
    os.makedirs(results_dir)
    
if not os.path.exists(plot_dir):
    os.makedirs(plot_dir)   
    
if not os.path.exists(csv_dir):
    os.makedirs(csv_dir)       
    
## add helpers to python path
if os.path.join(proj_dir,'utils') not in sys.path:
    sys.path.append(os.path.join(proj_dir,'utils'))   

def make_dir_if_not_exists(dir_name):   
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    return dir_name

## create directories that don't already exist        
result = [make_dir_if_not_exists(x) for x in [results_dir,plot_dir,csv_dir]]

In [3]:
# set vars 
auth = pd.read_csv(os.path.join(analysis_dir,'auth.txt'), header = None) # this auth.txt file contains the password for the sketchloop user
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org'

# have to fix this to be able to analyze from local
import pymongo as pm
conn = pm.MongoClient('mongodb://sketchloop:' + pswd + '@127.0.0.1')


#### Connect to database

In [4]:
# db = conn['curiotower']
#coll = db['tdw-height3Jitter3']
#print('Iterations List:', coll.distinct('iterationName'))

db = conn['curiotower']
coll = db['curiodrop']
print('Iterations List:', coll.distinct('iterationName'))

iterationName = 'run_1'

Iterations List: ['testing', 'run_1']


In [None]:
### Print count and example record

In [5]:
# how many records?
print('We have {} records in mongo.'.format(coll.estimated_document_count()))

survey = coll.find({'iterationName':iterationName, 'eventType':'survey'})
df_survey = pd.DataFrame(survey)
df_survey.head(1)

We have 2138 records in mongo.


### Construct tidy dataframe with game data

In [7]:
df = coll.find({
            'iterationName':iterationName
#             'prolificID': {'$exists' : True},
#             'studyID': {'$exists' : True},
#             'sessionID': {'$exists' : True},
#             'eventType': 'rating-task'
})
df = pd.DataFrame(df)
df['button_pressed'] = pd.to_numeric(df['button_pressed'])
print('unique Prolific IDs:', len(df['prolificID'].unique()))
print(df.shape)

df.head(2)

unique Prolific IDs: 29
(1942, 43)


Unnamed: 0,_id,type,iterationName,condition,prompt,towerID,image_url,stim_version,catch_trial,games,...,trial_index,time_elapsed,internal_node_id,workerId,hitID,aID,eventType,rt,responses,question_order
0,6015abbc8c76cb634b3ef33d,image-button-response,run_1,interesting,How interesting is this?,121319_06,https://curiotower.s3.amazonaws.com/121319_06.png,curiodrop,False,[],...,1,14802,0.0-1.0,,,,,,,
1,6015abc18c76cb634b3ef33e,image-button-response,run_1,interesting,How interesting is this?,121319_07,https://curiotower.s3.amazonaws.com/121319_07.png,curiodrop,False,[],...,2,20041,0.0-2.0,,,,,,,


## Or read in most recent data directly from csv

In [8]:
#df = pd.read_csv('curiotower_raw_data_{}.csv'.format(iterationName))

## Include catch-trial checks:

1. Check whether subjects rated the catch trials as stable <3 or interesting >3. 
2. Check that subjects completed all trials

Remove those who fail either catch

In [9]:
df.prolificID.unique()

array([None, '5f750da0715498178417079e', '5d1f655b51bab90018162918',
       '5fe28987eb5270e97ffca1e8', '5b244233a7cee100011d6924',
       '59d585d3d1ab390001da2f35', nan, '5ae230ae8ee5bf00018c89a7',
       '5def71cd68885201653393ab', '5b701784798af00001d3c0e6',
       '5c2e2ead23477600011a4782', '5d53aae0707297001a5efc0b',
       '5849916abb147f0001786ac5', '5a69d77a56585a0001633482',
       '5b1bf2e768a5ed000173833c', '5f04bc351fca9b773618505d',
       '5e7e069af8a96f493e008ab9', '5eeb6916424ca42d92eac0f9',
       '5fcfc5d2efeec61409aa6cf7', '59c558114374e1000125c522',
       '5d4c28e28f11b70015ba045f', '5a2d62ae5dba1200014b0362',
       '5e8452d8cd38010131ff3ca5', '5f8a2cff75ffa70e8c69e5ca',
       '5fd1fe1751a3a223a052b9dc', '5e2e147d047efa000c6b833d',
       '5f4fdd11275a74100788dac6', '5f342cc0c5947d2eb10a264a',
       '5fe848f115b59b69cf9b06b6'], dtype=object)

In [10]:
df_catch = df[df['catch_trial'] == True]

#return array of gameIDs that failed catch trials
failed_attention_check = df_catch[((df_catch['button_pressed'] <3) & (df_catch['condition'] == 'stable')) |
                       ((df_catch['button_pressed'] >1) & (df_catch['condition'] == 'interesting'))]['prolificID'].unique()
failed_attention_check

array([None, '5c2e2ead23477600011a4782'], dtype=object)

In [11]:
#num_stims = 69
num_stims = len(df['towerID'].unique())
df = df[df.type == 'image-button-response']
df = df[~df['towerID'].str.contains("0999", na=False)]
df_response_count = df.groupby(['prolificID'])['button_pressed'].agg(['count']).reset_index()
failed_completion_check = df_response_count[df_response_count['count'] <num_stims]['prolificID']
failed_completion_check

1     59c558114374e1000125c522
10    5d1f655b51bab90018162918
Name: prolificID, dtype: object

In [12]:
remove_ID = list(failed_attention_check) + list(failed_completion_check)
print('We will remove:' ,remove_ID)
print('-'*40)
print('Our total ID count, after removing catches is:', len(df[~df.prolificID.isin(remove_ID)]['prolificID'].unique()))

We will remove: [None, '5c2e2ead23477600011a4782', '59c558114374e1000125c522', '5d1f655b51bab90018162918']
----------------------------------------
Our total ID count, after removing catches is: 24


In [13]:
#run to get all successful completions
#df[~df.prolificID.isin(remove_ID)]['prolificID'].unique()

In [14]:
#Remove failed IDs from dataset
df = df[~df.prolificID.isin(remove_ID)]
#Remove catch trials from dataset
df = df[df['catch_trial'] == False]
#Only keep button responses
df = df[df.type == 'image-button-response']

df.to_csv('curiotower_cooltower_raw_data_{}.csv'.format(iterationName))
print(df.shape)
df.head(2)

(1656, 43)


Unnamed: 0,_id,type,iterationName,condition,prompt,towerID,image_url,stim_version,catch_trial,games,...,trial_index,time_elapsed,internal_node_id,workerId,hitID,aID,eventType,rt,responses,question_order
4,6015afff3316296bbf016db5,image-button-response,run_1,interesting,How interesting is this?,121119_04,https://curiotower.s3.amazonaws.com/121119_04.png,curiodrop,False,[],...,1,53222,0.0-1.0,,,,,,,
5,6015b0123316296bbf016db6,image-button-response,run_1,interesting,How interesting is this?,121119_12,https://curiotower.s3.amazonaws.com/121119_12.png,curiodrop,False,[],...,2,71358,0.0-2.0,,,,,,,


In [15]:
#df = pd.read_csv('curiotower_raw_data_run_0.csv')

 ### Create df of tower_level ratings

In [16]:
df_tower = df.groupby(['towerID', 'condition'])['button_pressed'].agg(['mean', 'std', 'size']).reset_index()
#df_tower_var = df.groupby(['towerID', 'condition'])['button_pressed'].std().reset_index()
#df_tower_count = df.groupby(['towerID', 'condition'])['button_pressed'].count().reset_index()
#df_tower['sd'] = df_tower_var['button_pressed']
#df_tower['count'] = df_tower_count['button_pressed']
# df_tower['mean'] = df_tower['button_pressed']
#df_tower = df_tower.drop(['button_pressed'], axis = 1)
df_tower.sort_values(by = 'mean', ascending = False)

Unnamed: 0,towerID,condition,mean,std,size
40,121619_08b,interesting,3.083333,0.974308,24
23,121319_03,interesting,3.041667,0.858673,24
39,121619_08-help,interesting,2.916667,1.017955,24
10,121119_09,interesting,2.791667,0.832971,24
63,121819_07,interesting,2.791667,0.931533,24
...,...,...,...,...,...
66,121919_01b,interesting,0.791667,0.658005,24
29,121619_02,interesting,0.541667,0.832971,24
64,121819_08,interesting,0.458333,0.508977,24
25,121319_05,interesting,0.375000,0.769670,24


In [17]:
df_tower.groupby(['condition'])['std'].mean()

condition
interesting    0.919448
Name: std, dtype: float64