# Across scenario basic analysis

### Load packages

In [None]:
import os
import sys
import urllib, io
os.getcwd()
sys.path.append("..")
sys.path.append("../utils")
sys.path.append("../analysis/utils")


import numpy as np
import scipy.stats as stats
import pandas as pd

import pymongo as pm
from collections import Counter
import json
import re
import ast

from PIL import Image, ImageOps, ImageDraw, ImageFont 

from io import BytesIO
import base64

from tqdm.notebook import tqdm

import  matplotlib
from matplotlib import pylab, mlab, pyplot
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.core.pylabtools import figsize, getfigs
plt = pyplot
import matplotlib as mpl
# Embed fonts as TrueType (Type 42) in PDF output.
mpl.rcParams['pdf.fonttype'] = 42
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and 3.8 removed the old aliases, so fall back to the new name whenever the
# legacy one is not available.
plt.style.use('seaborn-white' if 'seaborn-white' in plt.style.available
              else 'seaborn-v0_8-white')

import seaborn as sns
# Apply seaborn's 'talk' plotting context and the 'darkgrid' axes style.
sns.set_context('talk')
sns.set_style('darkgrid')
%matplotlib inline

from IPython.display import clear_output

import warnings
# Silence DeprecationWarnings, then the two numpy "size changed" messages.
warnings.filterwarnings("ignore", category=DeprecationWarning)
for ignored_message in ("numpy.dtype size changed", "numpy.ufunc size changed"):
    warnings.filterwarnings("ignore", message=ignored_message)

In [None]:
# show every column when a dataframe is displayed (no truncation)
pd.options.display.max_columns = None

### Helper functions

In [None]:
# aggregation helper intended for use with pd.agg
def item(x):
    """Return the last element of `x` as a plain scalar.

    Slices off the final entry and unwraps it with `.item()`, which
    raises `ValueError` if that slice does not hold exactly one element
    (i.e. when `x` is empty).
    """
    last_entry = x.iloc[-1:]
    return last_entry.item()

### Set up directory paths to plots and data

In [None]:
## directory & file hierarchy
proj_dir = os.path.abspath('..')
datavol_dir = os.path.join(proj_dir,'data')
analysis_dir =  os.path.abspath('.')
results_dir = os.path.join(proj_dir,'results')
plot_dir = os.path.join(results_dir,'plots')
csv_dir = os.path.join(results_dir,'csv')
json_dir = os.path.join(results_dir,'json')
exp_dir = os.path.abspath(os.path.join(proj_dir,'behavioral_experiments'))
png_dir = os.path.abspath(os.path.join(datavol_dir,'png'))

## add helpers to python path (skip entries that are already present)
for helper_dir in (os.path.join(proj_dir,'stimuli'), os.path.join(proj_dir,'utils')):
    if helper_dir not in sys.path:
        sys.path.append(helper_dir)

## create the output directories; exist_ok=True replaces the separate
## exists()-then-makedirs() checks, which could race with another process
## NOTE(review): json_dir is defined above but is not created here.
for out_dir in (results_dir, plot_dir, csv_dir):
    os.makedirs(out_dir, exist_ok=True)

def make_dir_if_not_exists(dir_name):
    """Create `dir_name` (including missing parents) if needed and return it.

    Uses `exist_ok=True` rather than a separate existence check, so the call
    does not fail if the directory appears between the check and the create.

    Parameters
    ----------
    dir_name : str
        Path of the directory to create.

    Returns
    -------
    str
        `dir_name`, unchanged.
    """
    os.makedirs(dir_name, exist_ok=True)
    return dir_name

## make sure each output directory exists; `result` keeps the returned paths
result = list(map(make_dir_if_not_exists, (results_dir, plot_dir, csv_dir)))

## Load in data

Assumes the CSVs exported by `basic_analysis.ipynb` are already in the `results/csv` folder.

In [None]:
# pilot study names; each one is the scenario name followed by "_pilot"
studies = [
    scenario_name + "_pilot"
    for scenario_name in ("drop", "collision", "rollingsliding", "dominoes")
]

In [None]:
# read the per-study trial CSVs and stack them into a single dataframe
trial_frames = []
for study_name in studies:
    trial_csv = os.path.join(csv_dir, 'trial_entries_' + study_name + '.csv')
    trial_frames.append(pd.read_csv(trial_csv))
df = pd.concat(trial_frames)
print("Loaded dataframes")

In [None]:
# Display the `study` column of the combined trial dataframe.
df['study']

In [None]:
# scenario = the part of the study name before the first underscore
df['scenario'] = df['study'].map(lambda study_name: study_name.partition('_')[0])

In [None]:
# Display the derived `scenario` column.
df['scenario']

In [None]:
# List the columns available in the combined trial dataframe.
df.columns

## Plots

### Per Distractor/ Occluder
How might distractors and occluders affect human subjects' performance? The following plots explore how average accuracy varies with the number of distractors/occluders in a single trial.

In [None]:
# Encode response as boolean (True where the raw response is "YES").
# Guarded so the cell is safe to re-run: comparing the already-boolean column
# to "YES" again would silently turn every response into False.
if df['response'].dtype != bool:
    df['response'] = df['response'] == "YES"

In [None]:
# One row per stimulus: its scenario (taken from the first trial), the fraction
# of trials marked correct, the mean of `response`, and the count of `c`.
per_confuser_spec = {
    'scenario': lambda col: col.iloc[0],
    'correct': lambda col: (col == True).mean(),
    'response': 'mean',
    'c': 'count',
}
per_confuser_agg = df.groupby('stim_ID').agg(per_confuser_spec)

In [None]:
per_confuser_agg = per_confuser_agg.reset_index()
# Take an explicit copy: adding columns to a boolean-mask selection otherwise
# triggers pandas' SettingWithCopyWarning.
others = per_confuser_agg[per_confuser_agg['scenario']!='dominoes'].copy()
# Pull the digit preceding "_occ" / "_dis" out of the stimulus ID; IDs without
# such a tag get '0'. expand=False returns a Series instead of a 1-column frame.
others['occluder_num'] = others['stim_ID'].str.extract(r'(\d{1})_occ', expand=False).fillna('0')
others['distractor_num'] = others['stim_ID'].str.extract(r'(\d{1})_dis', expand=False).fillna('0')
others

In [None]:
# Take an explicit copy: adding columns to a boolean-mask selection otherwise
# triggers pandas' SettingWithCopyWarning.
dominoes = per_confuser_agg[per_confuser_agg['scenario']=='dominoes'].copy()
# Pull the digit following "_d" / "_o" out of the stimulus ID; IDs without such
# a tag get '0'. expand=False returns a Series instead of a 1-column frame.
dominoes['distractor_num'] = dominoes['stim_ID'].str.extract(r'_d(\d{1})', expand=False).fillna('0')
dominoes['occluder_num'] = dominoes['stim_ID'].str.extract(r'_o(\d{1})', expand=False).fillna('0')
dominoes

In [None]:
# stack the non-domino and domino per-stimulus tables back into one frame
per_confuser_agg = pd.concat(objs=[others, dominoes], axis=0)

In [None]:
# Non-null counts of every column per (scenario, occluder_num) group.
per_confuser_agg.groupby(['scenario', 'occluder_num']).count()

In [None]:
# Non-null counts of every column per (scenario, distractor_num) group.
per_confuser_agg.groupby(['scenario', 'distractor_num']).count()

### occluder

In [None]:
# one facet per scenario; per-stimulus accuracy as violins, one per occluder count
occluder_levels = ['0','1','2']
g = sns.FacetGrid(data=per_confuser_agg, col="scenario", hue='occluder_num', height=6)
g.map(sns.violinplot, 'occluder_num', "correct", order=occluder_levels)
# optional: g.set(ylim=(0, 1))

### distractor

In [None]:
# one facet per scenario; per-stimulus accuracy as violins, one per distractor count
distractor_levels = ['0','1','2','3']
g = sns.FacetGrid(data=per_confuser_agg, col="scenario", hue='distractor_num', height=6)
g.map(sns.violinplot, 'distractor_num', "correct", order=distractor_levels)

### Per Num of Dominoes 
Here, we check whether increasing the number of dominoes affects human subjects' accuracy.

In [None]:
# keep only the dominoes trials, then summarise them per stimulus
dominoes = df.loc[df['scenario'] == 'dominoes']
per_dominoes_spec = {
    'scenario': lambda col: col.iloc[0],
    'correct': lambda col: (col == True).mean(),
    'c': 'count',
    'middle_objects': lambda col: col.iloc[0],
}
per_dominoes_agg = dominoes.groupby('stim_ID').agg(per_dominoes_spec)

In [None]:
def _count_quoted_names(raw):
    """Count the single-quoted lowercase words in `raw`; the 0 placeholder counts as 0."""
    if raw == 0:
        return 0
    return len(re.findall(r'(\'[a-z]+\')', raw))

# missing `middle_objects` entries are replaced by 0 before counting
per_dominoes_agg['num_middle_objects'] = per_dominoes_agg['middle_objects'].fillna(0).apply(_count_quoted_names)
per_dominoes_agg

In [None]:
# Inspect the stimuli that have exactly 8 middle objects.
per_dominoes_agg[per_dominoes_agg['num_middle_objects']==8]

In [None]:
# Number of stimuli (rows with a non-null `c`) for each middle-object count.
per_dominoes_agg.groupby(['num_middle_objects']).count()[['c']]

In [None]:
# Mean per-stimulus accuracy for each number of middle dominoes (no error bars).
# NOTE(review): `ci` is deprecated since seaborn 0.12 in favour of
# `errorbar=None`; kept here for compatibility with older seaborn versions.
ax = sns.barplot(x='num_middle_objects', y="correct", data= per_dominoes_agg,  palette="Blues_d",ci = None)
# Labels previously read "Accueacy"; corrected spelling.
ax.set_title('Accuracy per Number of middle dominoes \n')
ax.set_xlabel('Number of middle dominoes')
ax.set_ylabel('Accuracy')

## Participant Info
Here, we check information about our participants, such as their age and gender.

In [None]:
# Study names used to locate the Prolific export CSVs.
# NOTE: this rebinds `studies`, which earlier held the "*_pilot" names used to
# build the trial_entries_*.csv paths; re-running the trial-loading cell after
# this one will therefore look for different files.
studies = [
    "drop",
    "collision",
    "rollingsliding",
    "dominoes"
]

In [None]:
# read the per-study Prolific exports and stack them into a single dataframe
prolific_frames = []
for study_name in studies:
    prolific_csv = os.path.join(csv_dir, 'prolific_export_' + study_name + '.csv')
    prolific_frames.append(pd.read_csv(prolific_csv))
df2 = pd.concat(prolific_frames)
print("Loaded dataframes on prolific info")

In [None]:
# left-join the Prolific export onto the trials, matching participant and session ids
trial_keys = ['prolificID','sessionID']
prolific_keys = ['participant_id','session_id']
whole_df = df.merge(df2, how='left', left_on=trial_keys, right_on=prolific_keys)
whole_df.head()

In [None]:
# List the columns of the merged trial + Prolific dataframe.
whole_df.columns

In [None]:
# One row per gameID: fraction of trials marked correct, count of `c`, and the
# scenario / demographic values taken from that game's first row.
take_first = lambda col: col.iloc[0]
demographic_columns = [
    'age',
    'Sex',
    'First language',
    'Student Status',
    'Nationality',
    'Current Country of Residence',
]
per_person_spec = {
    'scenario': take_first,
    'correct': lambda col: (col == True).mean(),
    'c': 'count',
}
per_person_spec.update({column: take_first for column in demographic_columns})
per_person_agg = whole_df.groupby('gameID').agg(per_person_spec)
per_person_agg

In [None]:
# Display the per-participant `age` column.
per_person_agg['age']

In [None]:
# step histogram of participant age, coloured by the `Sex` column
histogram_options = dict(x="age", hue="Sex", element="step")
ax = sns.histplot(data=per_person_agg, **histogram_options)
ax.set_title('Participant Age for different gender \n')

In [None]:
# per-nationality counts, keeping only nationalities whose `c` count exceeds 3
nations = (
    per_person_agg.groupby("Nationality")
    .count()
    .reset_index()
    .loc[lambda counts: counts['c'] > 3]
)

In [None]:
from matplotlib import pyplot

# figure size in inches (width, height)
a4_dims = (20, 8.27)
fig, ax = pyplot.subplots(figsize=a4_dims)

# bar chart of the `c` count for each nationality kept in `nations`
sns.barplot(data=nations, x="Nationality", y="c", ax=ax)
ax.set_title('Participant Nationality \n')
ax.set_ylabel('Count')