# Analysis Notebook for human physics benchmark experiments

[Preregistration dominoes_pilot](https://github.com/cogtoolslab/human-physics-benchmarking/blob/master/experiments/dominoes_pilot/preregistration_dominoes_pilot.md)

In [None]:
study = "dominoes_pilot" # name of the mongo collection holding the experiment records; also used in output file names and plot titles
# study = "towers_pilot"
bucket_name = 'human-physics-benchmarking-dominoes-pilot' #name of S3 bucket/stimuli collection
# bucket_name = 'human-physics-benchmarking-towers-pilot' #name of S3 bucket/stimuli collection
stim_version = 'example' #the version of the stimuli uploaded; joined to bucket_name with '_' to name the stimulus collection
iterationName = 'run_1' # only records with this iterationName are fetched from mongo

### Establish connection to mongo
The first thing you need to do is establish an SSH tunnel (local port forwarding) to the server, so that requests to MongoDB can be made "as if" the MongoDB server were running on your local computer. Run the command below (here, or from a terminal) before you begin data analysis if you plan to fetch data from mongo. Replace the username with your own.

In [None]:
# Open the SSH tunnel: -f sends ssh to the background, -N runs no remote command,
# -L forwards local port 27017 to 127.0.0.1:27017 as seen from the server.
# Replace `fbinder` with your own username.
# NOTE(review): the surrounding backticks make the shell run ssh inside a command
# substitution — presumably unintended; confirm before relying on it.
!`ssh -fNL 27017:127.0.0.1:27017 fbinder@cogtoolslab.org`

### Load packages

In [None]:
import os
import sys
import urllib, io
os.getcwd()
sys.path.append("..")
sys.path.append("../utils")
sys.path.append("../analysis/utils")


import numpy as np
import scipy.stats as stats
import pandas as pd

import pymongo as pm
from collections import Counter
import json
import re
import ast

from PIL import Image, ImageOps, ImageDraw, ImageFont 

from io import BytesIO
import base64

import  matplotlib
from matplotlib import pylab, mlab, pyplot
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
plt.style.use('seaborn-white')

import seaborn as sns
sns.set_context('talk')
sns.set_style('darkgrid')
%matplotlib inline

from IPython.display import clear_output

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

### Helper functions

In [None]:
# aggregation helper for use with pd.agg
def item(x):
    """Return the last element of ``x`` as a scalar.

    Intended as a "pick one representative value" aggregator for columns
    that are constant within a group.
    """
    last_entry = x.iloc[-1:]  # one-element slice holding the final value
    return last_entry.item()

### Set up directory paths to plots and data

In [None]:
## directory & file hierarchy
proj_dir = os.path.abspath('..')  # parent of the current working directory
datavol_dir = os.path.join(proj_dir,'data')
analysis_dir =  os.path.abspath('.')  # the current working directory
results_dir = os.path.join(proj_dir,'results')
plot_dir = os.path.join(results_dir,'plots')
csv_dir = os.path.join(results_dir,'csv')
json_dir = os.path.join(results_dir,'json')
exp_dir = os.path.abspath(os.path.join(proj_dir,'behavioral_experiments'))
png_dir = os.path.abspath(os.path.join(datavol_dir,'png'))

## add helper directories to the python path
## (guarded, so re-running the cell does not append duplicates)
for helper_dir in (os.path.join(proj_dir,'stimuli'), os.path.join(proj_dir,'utils')):
    if helper_dir not in sys.path:
        sys.path.append(helper_dir)

## create the output directories that are written to; exist_ok makes this
## safe to re-run and avoids the check-then-create race.
## (json_dir, exp_dir and png_dir are only defined here, not created.)
for out_dir in (results_dir, plot_dir, csv_dir):
    os.makedirs(out_dir, exist_ok=True)

def make_dir_if_not_exists(dir_name):
    """Create ``dir_name`` (and any missing parents) unless it already exists.

    Args:
        dir_name: path of the directory to ensure.

    Returns:
        ``dir_name`` unchanged, so the call can be used inside expressions.

    Raises:
        FileExistsError: if the path exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the directory in between.
    os.makedirs(dir_name, exist_ok=True)
    return dir_name

## make sure the output directories exist (already existing ones are left alone)
result = list(map(make_dir_if_not_exists, (results_dir, plot_dir, csv_dir)))

In [None]:
# set vars
# auth.txt contains the password for the sketchloop user. Place it in the repo folder.
# dtype=str keeps an all-digit password from being parsed as a number, which
# would make the string concatenation below fail.
auth = pd.read_csv(os.path.join(proj_dir,'auth.txt'), header = None, dtype = str)
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org'  # not used below: the client connects to 127.0.0.1

# have to fix this to be able to analyze from local
import pymongo as pm
from urllib.parse import quote_plus

# User name and password are percent-escaped so that characters such as
# '@', ':' or '/' in the password cannot break the connection URI.
conn = pm.MongoClient('mongodb://' + quote_plus(user) + ':' + quote_plus(pswd) + '@127.0.0.1')

#### Connect to database

In [None]:
# experiment records: one collection per study
db = conn['human_physics_benchmarking']
coll = db[study]

# served stimuli: the collection is named "<bucket_name>_<stim_version>"
stim_db = conn['stimuli']
stim_coll_name = '_'.join([bucket_name, stim_version])
stim_coll = stim_db[stim_coll_name]

iteration_names = coll.distinct('iterationName')
print('Iterations List:', iteration_names)

In [None]:
# report the estimated number of documents in the study collection
n_records = coll.estimated_document_count()
print('We have {} records in mongo.'.format(n_records))

### Construct tidy dataframe with game data

In [None]:
# get dataframe of served stims (one row per document in the stimulus collection)
stim_df = pd.DataFrame(stim_coll.find({}))
# NOTE: set_index returns a new dataframe and the result is not assigned, so this
# line only displays the frame indexed by '_id'. stim_df itself keeps its default
# integer index, which the completeness check further down uses for positional lookups.
stim_df.set_index('_id')

In [None]:
# display the served-stimuli dataframe
stim_df

In [None]:
# fetch every record that belongs to the selected iteration
iteration_filter = {
    'iterationName': iterationName,
#     'prolificID': {'$exists' : True},
#     'studyID': {'$exists' : True},
#     'sessionID': {'$exists' : True},
}
df = pd.DataFrame(coll.find(iteration_filter))

# convert the button responses to numeric values
df['button_pressed'] = pd.to_numeric(df['button_pressed'])
# print('unique Prolific IDs:', len(df['prolificID'].unique()))
print("Shape:",df.shape)

# show the last ten records
df.tail(10)

Let's figure out which gameids are complete

In [None]:
# Which gameIDs have a recorded entry for every stimulus that was served to them?
# Note that this also excludes complete games whose gameID is not in the stim
# database anymore (ie if the entry has been dropped).
complete_gameids = []

# Every (gameID, stim_ID) combination with at least one record, built once so that
# each lookup below is a set membership test instead of a string-built df.query
# (which rescans the whole frame and breaks on IDs containing quotes).
recorded_pairs = set(zip(df['gameID'], df['stim_ID']))

for gameid in df['gameID'].unique():
    # find the stimulus set that was served to this game; if several rows list
    # the game, the last one wins
    served_stims = None
    for _, stim_row in stim_df.iterrows():
        if gameid in stim_row['games']:
            served_stims = stim_row['stims']
    if served_stims is None:
        # no entry for this gameID in the stimulus database
#         print("No recorded entry for game_ID in stimulus database:",gameid)
        continue
    # the game is complete only if every served stimulus has at least one record
    if all((gameid, s['stim_ID']) in recorded_pairs for s in served_stims.values()):
        complete_gameids.append(gameid)

print("Completed games:",complete_gameids)

In [None]:
#mark unfinished entries: True if the record's gameID is in complete_gameids, False otherwise
df['complete_experiment'] = df['gameID'].isin(complete_gameids)

In [None]:
#how many started games? (unique gameIDs among the fetched records, complete or not)
print('We have {} unique games in mongo.'.format(len(df['gameID'].unique())))

In [None]:
# how many completed games? (unique gameIDs among records flagged as complete)
completed_game_ids = df.loc[df['complete_experiment'] == True, 'gameID']
print('We have {} unique completed games in mongo.'.format(len(completed_game_ids.unique())))

In [None]:
#exclude unfinished games ⚠️ — from here on df only holds records whose gameID is in complete_gameids
df = df[df['gameID'].isin(complete_gameids)]

In [None]:
# Generate some useful views
is_prediction = df['condition'] == 'prediction'
is_video_trial = df['trial_type'] == 'video-button-response'
df_trial_entries = df[is_prediction & is_video_trial]  # only experimental trials

## Structures
Let's look at the rating for the structures

In [None]:
# Add a constant dummy column so that 'count' in the aggregation yields the number
# of trials per stimulus. assign() returns a new frame, which avoids setting a
# column on a filtered slice of df (SettingWithCopyWarning).
df_trial_entries = df_trial_entries.assign(c=1)

# one row per stimulus
per_stim_agg = df_trial_entries.groupby('stim_ID').agg({
    # fraction of trials answered correctly; anything that is not True counts as incorrect
    'correct' : lambda cs: np.mean([1 if c == True else 0 for c in cs]),
    # number of trials recorded for the stimulus
    'c' : 'count',
    # last value in the group, taken as representative for the stimulus
    'middle_objects' : item
})

In [None]:
#A view of the different structures: stimuli with more than two recorded trials, most-seen first
per_stim_agg.sort_values('c',ascending=False).query("c > 2")

In [None]:
# save the per-stimulus aggregate as per_stim_agg_<study>.csv inside csv_dir
per_stim_agg.to_csv(os.path.join(csv_dir,"per_stim_agg_"+study+".csv"))

In [None]:
# histogram of the mean correctness of each stimulus
plt.hist(per_stim_agg['correct'])
plt.xlabel("Mean correctness per stimulus")
plt.ylabel("Count")
plt.title("Histogram mean correctness per stimulus on "+study)
plt.show()

## Subjects
Let's look at the distribution between subjects

In [None]:
# one row per game (i.e. per subject): fraction of trials answered correctly,
# where anything that is not True counts as incorrect
per_person_agg = df_trial_entries.groupby('gameID').agg({
    'correct' : lambda cs: np.mean([int(c == True) for c in cs]),
})

Histogram over average rate of correct guesses between **subjects**

In [None]:
# histogram of the mean correctness of each subject (one value per gameID)
plt.hist(per_person_agg['correct'])
plt.xlabel("Mean correctness per subject")
plt.ylabel("Count")
plt.title("Histogram mean correctness per subject on "+study)
plt.show()

## Summary
Let's look at some basic summaries

### Overall correct on trials

In [None]:
# overall percentage of correct responses over all rows of df_trial_entries, rounded to two decimals
str(round(df_trial_entries['correct'].mean() * 100,2)) + '% correct across all subjects and structures, excluding familiarization trials'