In [21]:
import sys
# Appending python modules so we can run simulations
sys.path.append("../python")
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import stimuli_generation
import os
import numpy as np
import json
import models as m

from statsmodels.formula.api import ols
from statsmodels.iolib.smpickle import load_pickle
from statannot import add_stat_annotation
from utility import load_scene
from pymunk.vec2d import Vec2d
from matplotlib.pyplot import figure

# Toggle: should the cleaned data be written out to a JSON file?
# NOTE(review): no cell in this part of the notebook reads this flag — confirm it is used further down.
saving = False

In [2]:
# Configure plotting in a single call. `sns.set(...)` resets context, style and
# palette to seaborn's defaults for every argument it is not given, so calling
# it last with only `rc` discarded the palette/style/context chosen before it.
sns.set(context="talk",
        style="whitegrid",
        palette="tab10",
        rc={'figure.figsize': (12, 10)})

In [3]:
# Relative path of the CSV file that is loaded as the raw data.
# NOTE(review): the variable is named "exp1" but the path points at experiment3 — confirm this is the intended file.
data_path_exp1 = "../experiments/experiment3/data/data.csv"

In [4]:
# Load the raw CSV export into a DataFrame
df_exp1 = pd.read_csv(data_path_exp1)

In [5]:
# Tag every row with a constant experiment version (used later as a grouping key)
df_exp1['experiment_version'] = 1

In [6]:
# Drop the jsPsych output columns that this analysis does not use
unused_columns = [
    'success', 'failed_images', 'failed_audio', 'failed_video',
    'time_elapsed', 'scene_sp', 'internal_node_id', 'session_id',
    'study_id', 'value', 'stimulus', 'timeout', 'trial_type',
]
df_exp1 = df_exp1.drop(columns=unused_columns)

In [7]:
# Work on an independent copy so the cleaning steps never touch df_exp1.
# (Plain assignment would only bind a second name to the same object.)
responses_exp1 = df_exp1.copy()

In [8]:
# Drop the attempts, condition, response and trial_index columns; none of them is used from here on
responses_exp1 = responses_exp1.drop(
    columns=['attempts', 'condition', 'response', 'trial_index']
)

In [9]:
# Cast scene_type to str, then keep only its first two underscore-separated
# fields (this strips the trailing "_goalpos_..." part)
scene_type_parts = responses_exp1['scene_type'].astype(str).str.split('_')
responses_exp1['scene_type'] = scene_type_parts.str[:2].str.join('_')

In [10]:
# Cast scene to str and keep only the text before the first "." (drops the ".mp4" extension)
responses_exp1['scene'] = responses_exp1['scene'].astype(str).str.split('.').str[0]

In [11]:
# Keep only the rows whose task is coded as 'response'
is_response = responses_exp1['task'] == 'response'
responses_exp1 = responses_exp1.loc[is_response]

In [12]:
# The task column is no longer needed, so drop it
responses_exp1 = responses_exp1.drop(columns='task')

In [13]:
# sim_cond = first underscore-separated field of scene_type (e.g. 'low' from 'low_nocol')
responses_exp1['sim_cond'] = responses_exp1['scene_type'].str.split('_').str[0]

In [14]:
# Scenes whose name contains 'yessp' are labelled 'straight'; all others 'not_straight'
has_yessp = responses_exp1['scene'].str.contains('yessp', regex=False)
responses_exp1['path_cond'] = has_yessp.map({True: 'straight', False: 'not_straight'})

In [15]:
# z-score response times over all trials within each experiment version.
# `transform` returns a result aligned to the original row index, so the column
# assignment stays row-aligned; `groupby(...).apply` may instead prepend the
# group keys to the index (pandas >= 2.0), which breaks the assignment.
responses_exp1['var_zrt'] = (
    responses_exp1.groupby('experiment_version')['rt']
    .transform(lambda x: (x - x.mean()) / x.std())
)

In [16]:
# z-score response times within each participant (per experiment version).
# `transform` returns a result aligned to the original row index, so the column
# assignment stays row-aligned; `groupby(...).apply` may instead prepend the
# group keys to the index (pandas >= 2.0), which breaks the assignment.
responses_exp1['part_zrt'] = (
    responses_exp1.groupby(['experiment_version', 'subject_id'])['rt']
    .transform(lambda x: (x - x.mean()) / x.std())
)

In [17]:
# Sanity check: the two z-scorings should not be identical (expect False)
bool((responses_exp1['var_zrt'] == responses_exp1['part_zrt']).all())

False

In [18]:
# Count the trials whose participant-wise z-scored RT is not within 2 SD
within_2sd = responses_exp1['part_zrt'].abs() <= 2
excluded = int((~within_2sd).sum())
print(f"{excluded} of {len(responses_exp1)} ({excluded/len(responses_exp1)*100:.4}%) are excluded via 2-SD exclusion")

105 of 2256 (4.654%) are excluded via 2-SD exclusion


In [19]:
# Keep only trials whose participant-wise z-scored RT lies within 2 SD
responses_exp1 = responses_exp1[responses_exp1['part_zrt'].abs() <= 2]

In [20]:
# Display the cleaned, outlier-filtered responses
responses_exp1

Unnamed: 0,subject_id,rt,correct_response,correct,task_condition,scene,scene_type,scene_index,scene_col,timed_out,experiment_version,sim_cond,path_cond,var_zrt,part_zrt
58,61716480d6da1714ff60ed0d,884.0,f,True,No,low_nocol_yessp_1,low_nocol,1.0,no,False,1,low,straight,-0.528320,-0.789426
63,61716480d6da1714ff60ed0d,726.0,j,False,Yes,low_yescol_yessp_2,low_yescol,2.0,yes,False,1,low,straight,-0.746416,-0.953251
68,61716480d6da1714ff60ed0d,885.0,f,True,No,low_nocol_nosp_2,low_nocol,2.0,no,False,1,low,not_straight,-0.526940,-0.788389
73,61716480d6da1714ff60ed0d,981.0,f,True,No,med_nocol_nosp_3,med_nocol,3.0,no,False,1,med,not_straight,-0.394425,-0.688849
78,61716480d6da1714ff60ed0d,1189.0,j,True,Yes,high_yescol_yessp_2,high_yescol,2.0,yes,False,1,high,straight,-0.107311,-0.473180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15859,5f145461435271023220d6ac,1551.0,f,True,No,med_nocol_nosp_1,med_nocol,1.0,no,False,1,med,not_straight,0.392378,0.910210
15864,5f145461435271023220d6ac,1231.0,f,True,No,high_nocol_nosp_1,high_nocol,1.0,no,False,1,high,not_straight,-0.049336,0.300333
15869,5f145461435271023220d6ac,791.0,f,True,No,med_nocol_nosp_2,med_nocol,2.0,no,False,1,med,not_straight,-0.656693,-0.538248
15874,5f145461435271023220d6ac,804.0,f,True,No,low_nocol_nosp_4,low_nocol,4.0,no,False,1,low,not_straight,-0.638748,-0.513472


In [22]:
# Make an independent copy of our responses dataframe, so the columns added to
# model_exp1 below do not also appear on responses_exp1 (plain assignment only
# aliases the same object) and are not written into a filtered slice.
model_exp1 = responses_exp1.copy()

In [23]:
# Display the dataframe that the model columns will be added to
model_exp1

Unnamed: 0,subject_id,rt,correct_response,correct,task_condition,scene,scene_type,scene_index,scene_col,timed_out,experiment_version,sim_cond,path_cond,var_zrt,part_zrt
58,61716480d6da1714ff60ed0d,884.0,f,True,No,low_nocol_yessp_1,low_nocol,1.0,no,False,1,low,straight,-0.528320,-0.789426
63,61716480d6da1714ff60ed0d,726.0,j,False,Yes,low_yescol_yessp_2,low_yescol,2.0,yes,False,1,low,straight,-0.746416,-0.953251
68,61716480d6da1714ff60ed0d,885.0,f,True,No,low_nocol_nosp_2,low_nocol,2.0,no,False,1,low,not_straight,-0.526940,-0.788389
73,61716480d6da1714ff60ed0d,981.0,f,True,No,med_nocol_nosp_3,med_nocol,3.0,no,False,1,med,not_straight,-0.394425,-0.688849
78,61716480d6da1714ff60ed0d,1189.0,j,True,Yes,high_yescol_yessp_2,high_yescol,2.0,yes,False,1,high,straight,-0.107311,-0.473180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15859,5f145461435271023220d6ac,1551.0,f,True,No,med_nocol_nosp_1,med_nocol,1.0,no,False,1,med,not_straight,0.392378,0.910210
15864,5f145461435271023220d6ac,1231.0,f,True,No,high_nocol_nosp_1,high_nocol,1.0,no,False,1,high,not_straight,-0.049336,0.300333
15869,5f145461435271023220d6ac,791.0,f,True,No,med_nocol_nosp_2,med_nocol,2.0,no,False,1,med,not_straight,-0.656693,-0.538248
15874,5f145461435271023220d6ac,804.0,f,True,No,low_nocol_nosp_4,low_nocol,4.0,no,False,1,low,not_straight,-0.638748,-0.513472


In [24]:
# Directory that is scanned for scene JSON files (the trailing "/" matters: file names are appended directly).
# NOTE(review): the variable is named "exp1" but the path points at pilot3 — confirm this is the intended directory.
scenedir_exp1 = "../data/json/pilot3/trial/"

In [25]:
# Names of all JSON files in the scene directory. os.listdir returns entries in
# arbitrary order, so sort them to make the processing order reproducible.
json_files = sorted(fname for fname in os.listdir(scenedir_exp1) if fname.endswith('.json'))

In [26]:
# Model results: run the simulation model on every scene file and collect one
# results frame per scene.
sim_frames = []

for scene_json in json_files:
    # Build the path to this scene's JSON file and load it
    scenedir = os.path.join(scenedir_exp1, scene_json)
    # File name without extension (not used in this cell; kept in case later cells read it)
    name = scene_json.split('.')[0]
    scene = load_scene(scenedir)
    # Get model results for the simulation
    simulation_result = m.simulation(scene.args)
    # Collect those results; they are concatenated once after the loop
    sim_df = pd.DataFrame({"scene": scene.args['name'],
                           "collision_prob": simulation_result['collision_probability'],
                           "sim_time": simulation_result['simulation_time']})
    sim_frames.append(sim_df)

# Concatenate once instead of growing a DataFrame inside the loop, which would
# re-copy all previously collected rows on every iteration. With no scene files
# the result is an empty DataFrame, as before.
model_sim_exp1 = pd.concat(sim_frames) if sim_frames else pd.DataFrame({})

In [27]:
# Mean RT per scene, indexed by scene name
df = model_exp1.groupby('scene').rt.mean()
# Broadcast each scene's mean onto all of that scene's rows in one vectorised
# lookup (no per-scene Python loop, and no loop variable overwriting `scene`)
model_exp1['avg_rt'] = model_exp1['scene'].map(df)

In [28]:
# z-score the avg_rt column. Computed directly on the Series so that mean and
# std are unambiguously taken over the whole column, rather than depending on
# how Series.transform dispatches a user-defined function.
# NOTE(review): the statistics are taken over rows, not over unique scenes, so
# scenes with more rows weigh more — confirm that is intended.
model_exp1['mean_rt_z'] = (
    (model_exp1['avg_rt'] - model_exp1['avg_rt'].mean()) / model_exp1['avg_rt'].std()
)

In [29]:
# Attach the simulation results to every row by scene name. This is an inner
# join, so rows whose scene has no simulation result are dropped.
model_exp1 = pd.merge(model_exp1, model_sim_exp1, how='inner', on='scene')

In [30]:
# z-score the sim_time column. Computed directly on the Series so that mean and
# std are unambiguously taken over the whole column, rather than depending on
# how Series.transform dispatches a user-defined function.
model_exp1['sim_time_z'] = (
    (model_exp1['sim_time'] - model_exp1['sim_time'].mean()) / model_exp1['sim_time'].std()
)

In [31]:
# Display the dataframe with the model columns added
model_exp1

Unnamed: 0,subject_id,rt,correct_response,correct,task_condition,scene,scene_type,scene_index,scene_col,timed_out,experiment_version,sim_cond,path_cond,var_zrt,part_zrt,avg_rt,mean_rt_z,collision_prob,sim_time,sim_time_z
0,61716480d6da1714ff60ed0d,884.0,f,True,No,low_nocol_yessp_1,low_nocol,1.0,no,False,1,low,straight,-0.528320,-0.789426,863.956522,-1.359604,0,180,-0.999585
1,611fbf86423c9a875559377d,1097.0,f,True,No,low_nocol_yessp_1,low_nocol,1.0,no,False,1,low,straight,-0.234304,-0.514677,863.956522,-1.359604,0,180,-0.999585
2,58209f4785e40e00012f2a7d,961.0,j,True,No,low_nocol_yessp_1,low_nocol,1.0,no,False,1,low,straight,-0.422032,-0.850806,863.956522,-1.359604,0,180,-0.999585
3,61119c082735b4dccb04645f,818.0,j,True,No,low_nocol_yessp_1,low_nocol,1.0,no,False,1,low,straight,-0.619423,-0.918544,863.956522,-1.359604,0,180,-0.999585
4,616c14b549a12a6ba7d5b2a1,731.0,j,True,No,low_nocol_yessp_1,low_nocol,1.0,no,False,1,low,straight,-0.739515,-1.037211,863.956522,-1.359604,0,180,-0.999585
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2099,60d5dd79e75ea00daa99ea69,1240.0,j,True,No,high_nocol_nosp_4,high_nocol,4.0,no,False,1,high,not_straight,-0.036913,0.333504,1526.966667,1.465242,0,710,1.275072
2100,60d65ae29c431eaafde45c51,1598.0,j,True,No,high_nocol_nosp_4,high_nocol,4.0,no,False,1,high,not_straight,0.457255,0.010937,1526.966667,1.465242,0,710,1.275072
2101,60d5f406db993fe9113ca832,1454.0,j,True,No,high_nocol_nosp_4,high_nocol,4.0,no,False,1,high,not_straight,0.258483,1.175878,1526.966667,1.465242,0,710,1.275072
2102,5f8a32a3ee8d0a0f13e8ab34,1189.0,j,True,No,high_nocol_nosp_4,high_nocol,4.0,no,False,1,high,not_straight,-0.107311,-0.362715,1526.966667,1.465242,0,710,1.275072


In [73]:
# Per-scene summary statistics of the participant-wise z-scored RTs
grp = model_exp1.groupby('scene')['part_zrt']
emp_data = grp.describe()

In [74]:
# Mapping from each index label of emp_data to its 'mean' value
Y = dict(emp_data['mean'])

In [85]:
def squared_distance(X,Y,keys):
    """Return the sum of squared differences between X and Y over ``keys``.

    X and Y must be indexable by every element of ``keys`` (e.g. dicts).
    X, Y and ``keys`` must all have the same length; otherwise an
    AssertionError is raised.
    """
    assert len(X) == len(Y) == len(keys)
    total = 0
    for key in keys:
        diff = X[key] - Y[key]
        total += diff ** 2
    return total

In [87]:
# Short alias for the distance function; call it as rho(X, Y, keys)
rho = squared_distance

In [83]:
# Set of index labels of emp_data; passed to rho as the keys to compare over
scenes = set(emp_data.index)

In [88]:
# Sanity check: the distance between Y and itself should be 0
rho(Y,Y,scenes)

0.0