The goal of this notebook is to gather the data from the files generated by our software and by Google Forms, and to put everything into tidy CSV files with standardized column names.

------------------------

Column names will be formatted with no spaces, using capital letters to separate words and a lower-case first letter (camelCase).

# Imports

In [1]:
import pandas as pd
import numpy as np
import os
import json
import glob
import datetime as dt

-------------------------------------------
# Json files

## listing all the files

In [6]:
# Collect the path of every JSON result file found under raw/json/.
# NOTE(review): glob gives no ordering guarantee, so `results` and the rows
# of the dataframes built from it follow the file system's listing order.
jsonPathes = glob.glob("raw/json/*.json")

In [7]:
# number of JSON result files found
len(jsonPathes)

92

In [5]:
# Load every JSON result file and tag it with the device it was recorded on.
results = []
for jsonpath in jsonPathes:
    with open(jsonpath) as jsonfile:
        loaded = json.load(jsonfile)

    # VR and non-VR experiments produce the same JSON; the only marker is
    # the substring "VR" in the file name of VR experiments.
    loaded["device"] = "VR" if "VR" in os.path.basename(jsonpath) else "Comp"

    results.append(loaded)

# Slots

We will first build a dataframe where each row corresponds to a slot/object association.

In [112]:
# One dict per (experiment, slot): which answer object was placed in which slot.
slots = [
    {
        # columns identifying the experiment
        'participantId': int(result['experimentID']),
        'device': result['device'],
        'datasetId': result['chosenDataset'],
        # the answer itself; slotIndex is shifted by one to get slotId
        'slotId': pair['slotIndex'] + 1,
        # answer names look like 'true4(Clone)' / 'false2(Clone)'
        'isTrue': "true" in pair['answer'],
        # the digits of the answer name form the answer id
        'answerId': int(''.join(filter(str.isdigit, pair['answer']))),
    }
    for result in results
    for pair in result["Pairs"]
]

In [113]:
# one row per slot/answer association, columns taken from the dict keys
slotsDf = pd.DataFrame(slots)

In [114]:
# display the slot table
slotsDf

Unnamed: 0,participantId,device,datasetId,slotId,isTrue,answerId
0,16,VR,1,1,True,4
1,16,VR,1,2,True,2
2,16,VR,1,3,False,2
3,16,VR,1,4,True,6
4,16,VR,1,5,True,5
...,...,...,...,...,...,...
638,13,Comp,2,3,True,4
639,13,Comp,2,4,True,6
640,13,Comp,2,5,False,3
641,13,Comp,2,6,False,5


Does each experiment have exactly one answer for each slot?

In [126]:
# Flag the slot rows of every experiment that does not have exactly
# 7 distinct slotId values.
experimentKeys = ["participantId", "device", "datasetId"]

# Number of distinct slots per experiment. The column is selected *before*
# calling nunique(): the only parameter of nunique() is `dropna`, so the
# former nunique("slotId") did not pick a column, it merely passed a truthy
# `dropna`.
nbSlotByExperiment = (slotsDf
    .groupby(experimentKeys)["slotId"]
    .nunique()
    .rename("nbSlot"))

# Attach the count to every slot row and keep the rows of faulty experiments.
pbSlots = (slotsDf
    .set_index(experimentKeys)
    .join(nbSlotByExperiment)
    .query("nbSlot != 7"))

In [127]:
# display the slot rows of the experiments flagged above
pbSlots

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,slotId,isTrue,answerId,nbSlot
participantId,device,datasetId,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
11,Comp,1,1,True,3,6
11,Comp,1,2,True,4,6
11,Comp,1,3,True,5,6
11,Comp,1,4,False,2,6
11,Comp,1,5,True,7,6
11,Comp,1,7,True,2,6
11,Comp,1,7,True,6,6
14,Comp,1,1,False,1,6
14,Comp,1,2,False,3,6
14,Comp,1,3,False,7,6


These experiments had a bug, so let's delete them.

In [128]:
# (participantId, device, datasetId) keys of the flagged rows, one entry per row
pbSlots.index

MultiIndex([(11, 'Comp', 1),
            (11, 'Comp', 1),
            (11, 'Comp', 1),
            (11, 'Comp', 1),
            (11, 'Comp', 1),
            (11, 'Comp', 1),
            (11, 'Comp', 1),
            (14, 'Comp', 1),
            (14, 'Comp', 1),
            (14, 'Comp', 1),
            (14, 'Comp', 1),
            (14, 'Comp', 1),
            (14, 'Comp', 1)],
           names=['participantId', 'device', 'datasetId'])

In [132]:
# Remove every slot row that belongs to one of the flagged experiments.
experimentKeys = ["participantId", "device", "datasetId"]
slotsByExperiment = slotsDf.set_index(experimentKeys)
slotsDf = slotsByExperiment.drop(pbSlots.index).reset_index()

In [133]:
# Sanity check: list the experiments whose number of distinct slotId values
# is not 7 (expected to be empty after the clean-up).
# nunique() has no column parameter (its only parameter is `dropna`), so the
# misleading "slotId" argument was removed; the displayed result is the same.
slotsDf.groupby(["participantId","device","datasetId"]).nunique().query('slotId != 7')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,slotId,isTrue,answerId
participantId,device,datasetId,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [134]:
# Export the cleaned slot table as CSV, without the dataframe index.
# NOTE(review): `path` is not defined in the cells visible here; presumably
# it is set in an earlier cell. Confirm before running.
slotsCsvPath = path + "/clean-data/slots.csv"
slotsDf.to_csv(slotsCsvPath, index=False)

# experiment

Now we will build a dataframe where each row corresponds to an experiment (i.e. there should be 4 rows for each participant).

## experiment order

We need to compute the experiment order, a value in $[\![1;4]\!]$ that describes the order in which the participant took the experiments. Let's compute this first. We'll do it in vanilla Python because we don't have that many data points.

In [11]:
# look at one raw result to see the available fields
results[0]

{'InspectionTimes': [],
 'Pairs': [{'answer': 'true4(Clone)', 'slotIndex': 0},
  {'answer': 'true2(Clone)', 'slotIndex': 1},
  {'answer': 'false2(Clone)', 'slotIndex': 2},
  {'answer': 'true6(Clone)', 'slotIndex': 3},
  {'answer': 'true5(Clone)', 'slotIndex': 4},
  {'answer': 'true7(Clone)', 'slotIndex': 5},
  {'answer': 'true3(Clone)', 'slotIndex': 6}],
 'chosenDataset': 1,
 'device': 'VR',
 'experimentID': '16',
 'pauseTime': 184.9094696044922,
 'sortingTime': 269.09613037109375,
 'time': '4/27/2022 1:36:32 PM'}

In [12]:
# Group the raw results by participant: entry i of resultsByParticipant
# holds the results whose experimentID is i+1.
participantIds = [int(result['experimentID']) for result in results]

# An ID of 0 or below would be silently filed under another participant
# through negative list indexing, so reject it explicitly.
if any(participantId < 1 for participantId in participantIds):
    raise ValueError("experimentID must be a positive integer")

# Keep the original 25 entries, but grow the table when a larger ID shows up
# instead of failing with an IndexError.
nbParticipants = max([25] + participantIds)
resultsByParticipant = [[] for _ in range(nbParticipants)]

for participantId, result in zip(participantIds, results):
    resultsByParticipant[participantId - 1].append(result)

In [13]:
# number of results per participant (position i corresponds to experimentID i+1)
[len(res) for res in resultsByParticipant]

[4, 3, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 4]

In [14]:
# Layout of the "time" field of the results, e.g. '4/27/2022 1:36:32 PM'.
# NOTE(review): this name shadows the builtin format(); it is kept because a
# later cell reads it.
format = "%m/%d/%Y %I:%M:%S %p"

for participant in resultsByParticipant:
    # only participants with exactly 4 results are ranked here
    if len(participant) != 4:
        continue
    # put this participant's experiments in chronological order (in place)
    participant.sort(key=lambda exp: dt.datetime.strptime(exp["time"], format))
    # store each experiment's 1-based rank directly in its result dict
    for rank, exp in enumerate(participant, start=1):
        exp["experimentOrder"] = rank

## clean dataframe

In [22]:
# One dict per experiment, carrying its identifiers, timings and timestamp.
experiments = []

for result in results:
    # columns identifying the experiment
    identity = {
        'participantId': int(result['experimentID']),
        'device': result['device'],
        'datasetId': result['chosenDataset'],
    }
    # measurements, plus the timestamp parsed into a datetime
    measures = {
        'pauseTime': result['pauseTime'],
        'sortingTime': result['sortingTime'],
        'time': dt.datetime.strptime(result["time"], format),
    }
    experiment = {**identity, **measures}
    # the order is only present on results that were ranked beforehand
    if 'experimentOrder' in result:
        experiment['experimentOrder'] = result['experimentOrder']
    experiments.append(experiment)

In [23]:
# one row per experiment, columns taken from the dict keys
experimentDf = pd.DataFrame(experiments)

In [24]:
# cast to pandas' nullable 'Int8' dtype so the order is stored as an integer
# even though some experiments have no order yet
experimentDf['experimentOrder'] = experimentDf['experimentOrder'].astype('Int8')

We had problems with some experiments, so we need to enter their experimentOrder by hand, based on the notes taken on paper during the experiments.

In [25]:
# experiments that still have no experimentOrder
experimentDf[experimentDf['experimentOrder'].isna()]

Unnamed: 0,participantId,device,datasetId,pauseTime,sortingTime,time,experimentOrder
39,2,VR,2,14.771276,146.338394,2022-04-29 17:06:31,
40,2,VR,1,14.461515,222.809418,2022-04-29 17:13:40,
41,2,Comp,2,3.333272,103.000214,2022-04-29 17:31:08,
66,4,Comp,2,1514.756104,332.973419,2022-04-27 17:28:27,


In [45]:
# Manually entered experiment orders (from the paper notes) for the
# experiments that could not be ranked automatically.
# Each value is keyed by (participantId, device, datasetId) rather than
# assigned positionally: the row order comes from the file listing, so a
# positional list such as [1, 2, 4] could land on the wrong experiments.
manualOrders = {
    (2, "VR", 2): 1,
    (2, "VR", 1): 2,
    (2, "Comp", 2): 4,
    (4, "Comp", 2): 2,
}

for (participantId, device, datasetId), order in manualOrders.items():
    # boolean mask selecting exactly the rows of this experiment
    isExperiment = (
        (experimentDf["participantId"] == participantId)
        & (experimentDf["device"] == device)
        & (experimentDf["datasetId"] == datasetId)
    )
    experimentDf.loc[isExperiment, "experimentOrder"] = order

  user_expressions, allow_stdin)


In [46]:
# check that no experiment is left without an experimentOrder
experimentDf[experimentDf['experimentOrder'].isna()]

Unnamed: 0,participantId,device,datasetId,pauseTime,sortingTime,time,experimentOrder


Let's now check that each participant corresponds to 4 experiments.

In [135]:
# Number of experiment rows recorded for each participant.
nbExpByParticipant = experimentDf.groupby("participantId").size().rename("nbExp")

# Attach that count to every experiment row ...
experimentsWithCount = (
    experimentDf.set_index("participantId").join(nbExpByParticipant).reset_index()
)

# ... and display the rows of participants that do not have exactly 4 experiments.
experimentsWithCount.query("nbExp != 4")


Unnamed: 0,participantId,device,datasetId,pauseTime,sortingTime,time,experimentOrder,nbExp
4,2,VR,2,14.771276,146.338394,2022-04-29 17:06:31,1,3
5,2,VR,1,14.461515,222.809418,2022-04-29 17:13:40,2,3
6,2,Comp,2,3.333272,103.000214,2022-04-29 17:31:08,4,3
11,4,Comp,2,1514.756104,332.973419,2022-04-27 17:28:27,2,1


Participants 2 and 4 didn't do all the experiments because of a problem.


We also need to delete the rows that correspond to the bugged experiments we deleted before.

In [138]:
# Remove the experiments whose slot rows were flagged in pbSlots.
experimentKeys = ["participantId", "device", "datasetId"]
experimentsByKey = experimentDf.set_index(experimentKeys)
experimentDf = experimentsByKey.drop(pbSlots.index).reset_index()

In [139]:
# Export the cleaned experiment table as CSV, without the dataframe index.
# NOTE(review): `path` is not defined in the cells visible here; presumably
# it is set in an earlier cell. Confirm before running.
experimentsCsvPath = path + "/clean-data/experiments.csv"
experimentDf.to_csv(experimentsCsvPath, index=False)

# Focus Time

we will create a clean dataframe with the time spent focusing on each answer where available

In [48]:
# look at a raw result to see the shape of its "InspectionTimes" entries
results[10]

{'InspectionTimes': [{'name': 'true1(Clone)',
   'observationTime': 4.0499162673950195},
  {'name': 'true2(Clone)', 'observationTime': 5.783239841461182},
  {'name': 'true3(Clone)', 'observationTime': 0.0},
  {'name': 'true4(Clone)', 'observationTime': 0.0},
  {'name': 'true5(Clone)', 'observationTime': 2.7165987491607666},
  {'name': 'true6(Clone)', 'observationTime': 4.4665679931640625},
  {'name': 'true7(Clone)', 'observationTime': 0.0},
  {'name': 'false1(Clone)', 'observationTime': 0.0},
  {'name': 'false2(Clone)', 'observationTime': 3.833280086517334},
  {'name': 'false3(Clone)', 'observationTime': 4.616592884063721},
  {'name': 'false4(Clone)', 'observationTime': 7.983212947845459},
  {'name': 'false5(Clone)', 'observationTime': 3.466609239578247},
  {'name': 'false6(Clone)', 'observationTime': 1.4833027124404907},
  {'name': 'false7(Clone)', 'observationTime': 0.0}],
 'Pairs': [{'answer': 'false2(Clone)', 'slotIndex': 0},
  {'answer': 'true2(Clone)', 'slotIndex': 1},
  {'answer

In [49]:
# One dict per (experiment, answer object) with the recorded observation time.
# Results with an empty "InspectionTimes" list contribute no rows.
focusTimes = [
    {
        # columns identifying the experiment
        'participantId': int(result['experimentID']),
        'device': result['device'],
        'datasetId': result['chosenDataset'],
        # columns identifying the answer; names look like 'true1(Clone)'
        'isTrue': "true" in focus['name'],
        'answerId': int(''.join(filter(str.isdigit, focus['name']))),
        # the measured time itself
        'focusTime': focus['observationTime'],
    }
    for result in results
    for focus in result["InspectionTimes"]
]

In [50]:
# one row per (experiment, answer) focus time, columns taken from the dict keys
focusTimesDf = pd.DataFrame(focusTimes)

In [51]:
# display the focus-time table
focusTimesDf

Unnamed: 0,participantId,device,datasetId,isTrue,answerId,focusTime
0,13,Comp,1,True,1,15.083120
1,13,Comp,1,True,2,6.349893
2,13,Comp,1,True,3,7.566512
3,13,Comp,1,True,4,0.000000
4,13,Comp,1,True,5,6.783242
...,...,...,...,...,...,...
625,13,Comp,2,False,3,9.400105
626,13,Comp,2,False,4,5.749927
627,13,Comp,2,False,5,3.287387
628,13,Comp,2,False,6,15.450157


In [53]:
# Export the focus-time table as CSV, without the dataframe index.
# NOTE(review): `path` is not defined in the cells visible here; presumably
# it is set in an earlier cell. Confirm before running.
focusTimesCsvPath = path + "/clean-data/focusTimes.csv"
focusTimesDf.to_csv(focusTimesCsvPath, index=False)