In [1]:
import json
import pandas as pd
import numpy as np
import scipy.stats

In [2]:
filename = 'full_data_no_id.json'
with open(filename) as f:
    data = json.load(f)

# Positions of the fields used below, after a row has been split on '","'.
RT_FIELD = 1          # stored as `rt`; kept as the raw string (converted with int() later)
ROW_TYPE_FIELD = 6    # a row is kept only when this equals "single-stim-button"
ROW_PHASE_FIELD = 13  # ... and this equals "test"
ACTION_FIELD = 14     # stored as `action`
INSTANCE_FIELD = 20   # stored as `instance`; used as a dictionary key

p_id = 0  # participant ids are assigned sequentially, one per element of `data`

# instances_data:    {instance: {'full': {p_id: entry, ...}, 'default': {p_id: entry, ...}}, ...}
# participants_data: {'full': {p_id: {instance: entry, ...}, ...}, 'default': {...}}
# where entry = {'order': int, 'responses': [(rt, action), ...]}.
# 'order' is the 1-based rank in which the participant first responded to the
# instance, and it is the same number in both structures.
instances_data = {}
participants_data = {'full': {}, 'default': {}}

for data_block in data:
    # Split the participant's content into rows once; the same rows are used
    # for the condition check and for the response extraction.
    rows = data_block['content'].strip().split('\n')

    # The condition is decided by the first field of the fourth row.
    if "complete" in rows[3].strip().split('","')[0]:
        condition = 'full'
    else:
        condition = 'default'

    participants_data[condition][p_id] = {}
    order = 0  # number of distinct test instances seen so far for this participant
    for item in rows:
        info = item.strip().split('","')
        if info[ROW_TYPE_FIELD] != "single-stim-button" or info[ROW_PHASE_FIELD] != "test":
            continue  # not a test-instance row

        instance = info[INSTANCE_FIELD]
        rt = info[RT_FIELD]
        action = info[ACTION_FIELD]

        seen = participants_data[condition][p_id]
        if instance not in seen:
            # First response to this instance: bump the counter once and use
            # that single value for both structures. Previously the
            # participant view stored the counter before the increment and the
            # instance view stored it after, so the two disagreed by one.
            order += 1
            seen[instance] = {'order': order, 'responses': []}
        seen[instance]['responses'].append((rt, action))

        by_condition = instances_data.setdefault(instance, {'full': {}, 'default': {}})
        if p_id not in by_condition[condition]:
            by_condition[condition][p_id] = {'order': seen[instance]['order'], 'responses': []}
        by_condition[condition][p_id]['responses'].append((rt, action))

    p_id += 1
    


In [3]:
# mark the outliers and output the processed dataset
#
# For every (condition, instance) cell, the first response time of each
# participant is z-scored within that cell. Participants whose |z| exceeds
# Z_THRESHOLD are flagged as outliers in both instances_data and
# participants_data and are left out of the flat per-row lists built here.

Z_THRESHOLD = 3  # exclusion criterion on the z-score of the initial response time

all_ipt = []
all_instance = []
all_pid = []
all_condition = []
all_order = []
optimalCost = []
startHierarchy = []
bfs_speed = []
greedy_gc_speed = []
astar_gc_speed = []

model_speed = []
random_speed = []
tree_gc_speed = []
conflict_move = []
hc_speed = []

lh_speeds = {}
for i in range(1, 8):
    lh_speeds['lh' + str(i)] = []

# Read each instance's info file once; it is needed for both conditions.
instance_info = {}
for instance in instances_data:
    with open("../dataset/TOLdataset/" + instance + "/info_v_new.json") as f: # pick-and-put model
        instance_info[instance] = json.load(f)

for condition in ['full', 'default']:
    for instance in instances_data:
        info_dict = instance_info[instance]
        participants = instances_data[instance][condition]
        if not participants:
            # No participant in this condition responded to the instance;
            # there is nothing to z-score.
            continue

        # Initial response time = rt of the first recorded response.
        initial_times = [int(participants[p]['responses'][0][0]) for p in participants]

        zscores = scipy.stats.zscore(initial_times)
        for p, ipt, s in zip(participants, initial_times, zscores):
            # The z-score is NaN when the cell has a single participant or all
            # times are equal (zero standard deviation). Such a value is not
            # evidence of an outlier, so only a defined |z| above the threshold
            # excludes a row. bool() yields a plain Python bool, which
            # json.dump can serialise.
            is_outlier = bool(abs(s) > Z_THRESHOLD)
            participants[p]['outlier'] = is_outlier
            participants_data[condition][p][instance]['outlier'] = is_outlier
            if is_outlier:
                continue

            # Instance-level predictors, repeated on every retained row.
            optimalCost.append(info_dict['sol_length(BFS)'])
            startHierarchy.append(3 - info_dict['start_hierarchy'])  # stored as 3 minus the file's value
            model_speed.append(info_dict['model_time'])
            bfs_speed.append(info_dict['num_expanded_nodes(BFS)'])
            astar_gc_speed.append(info_dict['num_expanded_nodes(astar+gc)'])
            greedy_gc_speed.append(info_dict['num_expanded_nodes(greedy+gc)'])
            tree_gc_speed.append(info_dict['num_expanded_nodes(tree+gc)'])
            random_speed.append(info_dict['num_expanded_nodes(RW)'])
            hc_speed.append(info_dict['num_expanded_nodes(hc+gc)'])
            conflict_move.append(info_dict['first_cm'])
            for i in range(1, 8):
                lh_speeds['lh' + str(i)].append(info_dict['num_expanded_nodes(lh' + str(i) + ')'])

            # Row-level values.
            all_ipt.append(ipt)
            all_instance.append(instance)
            all_pid.append(p)
            all_condition.append(1 if condition == 'full' else 0)  # 1 = 'full', 0 = 'default'
            all_order.append(participants[p]['order'])

# Persist both views, now carrying the 'outlier' flag. json.dump writes the
# integer participant ids as string keys.
instance_data_output = "instances_data.json"
with open(instance_data_output, 'w') as f:
    json.dump(instances_data, f)

participants_data_output = "participants_data.json"
with open(participants_data_output, 'w') as f:
    json.dump(participants_data, f)

In [4]:
# output the dataset for R to run regression models

# Column name -> per-row values. Every list was filled in lockstep by the
# outlier-marking cell, so position k across all columns describes one
# retained (participant, instance) row. Insertion order is the column order.
data = dict(
    ipt=all_ipt,
    instance=all_instance,
    pid=all_pid,
    condition=all_condition,
    order=all_order,
    model=model_speed,
    bfs=bfs_speed,
    astar_gc=astar_gc_speed,
    greedy_gc=greedy_gc_speed,
    tree_gc=tree_gc_speed,
    start_hierarchy=startHierarchy,
    optimal_cost=optimalCost,
    random=random_speed,
    hc=hc_speed,
    cm=conflict_move,
)

# Append the lh1 .. lh7 columns after the fixed ones.
for i in range(1, 8):
    data.update({'lh' + str(i): lh_speeds['lh' + str(i)]})

df = pd.DataFrame(data, columns=list(data))

print(df)

        ipt instance  pid  condition  order  model  bfs  astar_gc  greedy_gc  \
0     15106  TOL_112    0          1      1     35   27        18         19   
1     18402  TOL_112   15          1     15     35   27        18         19   
2      4773  TOL_112   38          1     21     35   27        18         19   
3     23290  TOL_112   39          1     35     35   27        18         19   
4     57899  TOL_112   51          1      7     35   27        18         19   
...     ...      ...  ...        ...    ...    ...  ...       ...        ...   
9155   5816   TOL_53  167          0     35     22   18         8          9   
9156   1689   TOL_53  183          0     37     22   18         8          9   
9157    825   TOL_53  211          0     27     22   18         8          9   
9158   4447   TOL_53  214          0     27     22   18         8          9   
9159   3757   TOL_53  216          0     35     22   18         8          9   

      tree_gc  ...  random  hc  cm  lh1

In [5]:
# Write the regression dataset to disk; the DataFrame index is written too,
# as the first column (pandas' default).
regression_data_output = "all_info_ipt_v.csv" # pick-and-put model
df.to_csv(regression_data_output)