## Import packages

In [1]:
import numpy as np 
import csv 
import glob
import pandas as pd
import os
import sys
from tqdm import tqdm
from joblib import Parallel, delayed
import ast

## Import data

#### Import magic carpet 2020 data

In [2]:
# Resolve the working directory of the notebook and the directory above it.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

# Put the parent directory on the import path exactly once: without the guard
# every re-run of this cell would append another duplicate entry to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [3]:
# Load the magic carpet 2020 choice files into one dataframe.
# Only files whose name ends in "game.csv" are read; the participant ID is the
# integer found before the first underscore of the file name.
path_to_data = os.path.abspath(os.path.join(current_dir, '../two_step_task_datasets/magic_carpet_2020_dataset/choices'))

# glob yields matches in an arbitrary, file-system dependent order. Sorting
# keeps the participant order (and the row order after concatenation) the same
# on every machine and every re-run.
file_paths = sorted(glob.glob(os.path.join(path_to_data, "*game.csv")))

# Fail early and name the directory, rather than letting pd.concat raise an
# opaque "No objects to concatenate" error below.
if not file_paths:
    raise FileNotFoundError(f"No '*game.csv' files found in {path_to_data}")

df_magic_carpet_list = []
par_ids_magic_carpet = []

for file_path in file_paths:
    # File names start with the numeric participant ID followed by '_'.
    participant_id_str = os.path.basename(file_path).split('_')[0]
    participant_id = int(participant_id_str)
    par_ids_magic_carpet.append(participant_id)

    # Tag every row of this file with the participant it came from.
    df = pd.read_csv(file_path)
    df['ParticipantID'] = participant_id
    df_magic_carpet_list.append(df)

# Single frame holding the rows of every participant, with a fresh 0..N-1 index.
df_magic_carpet = pd.concat(df_magic_carpet_list, ignore_index=True)

#### Import spaceship 2020 data

In [4]:
# Load the spaceship 2020 choice files into one dataframe.
# Every "*.csv" in the directory is considered, except files ending in
# "_practice.csv"; the participant ID is the file name without its extension.
path_to_data = os.path.abspath(os.path.join(current_dir, '../two_step_task_datasets/spaceship_2020_dataset/choices'))

# glob yields matches in an arbitrary, file-system dependent order. Sorting
# keeps the participant order (and the row order after concatenation) the same
# on every machine and every re-run.
file_paths = sorted(glob.glob(os.path.join(path_to_data, "*.csv")))

df_spaceship_list = []
par_ids_spaceship = []

for file_path in file_paths:
    # Practice runs are not part of the analysed data.
    if file_path.endswith("_practice.csv"):
        continue

    basename = os.path.basename(file_path)
    participant_id_str = os.path.splitext(basename)[0]
    participant_id = int(participant_id_str)
    par_ids_spaceship.append(participant_id)

    # Tag every row of this file with the participant it came from.
    df = pd.read_csv(file_path)
    df['ParticipantID'] = participant_id
    df_spaceship_list.append(df)

# Fail early and name the directory, rather than letting pd.concat raise an
# opaque "No objects to concatenate" error below.
if not df_spaceship_list:
    raise FileNotFoundError(f"No non-practice '*.csv' files found in {path_to_data}")

# Single frame holding the rows of every participant, with a fresh 0..N-1 index.
df_spaceship = pd.concat(df_spaceship_list, ignore_index=True)

#### Import magic carpet 2023 dataset

In [5]:
# Load the magic carpet 2023 task-behaviour files into one dataframe.
# Only files whose name ends in "story.csv" are read; the participant ID is the
# integer found before the first underscore of the file name.
path_to_data = os.path.abspath(os.path.join(current_dir, '../two_step_task_datasets/magic_carpet_2023_dataset/task_behaviour'))

# glob yields matches in an arbitrary, file-system dependent order. Sorting
# keeps the participant order (and the row order after concatenation) the same
# on every machine and every re-run.
file_paths = sorted(glob.glob(os.path.join(path_to_data, "*story.csv")))

# Fail early and name the directory, rather than letting pd.concat raise an
# opaque "No objects to concatenate" error below.
if not file_paths:
    raise FileNotFoundError(f"No '*story.csv' files found in {path_to_data}")

# Dedicated list for the 2023 data, so that the list built for the 2020 magic
# carpet dataset is not overwritten by this cell.
df_magic_carpet_2023_list = []
par_ids_magic_carpet_2023 = []

for file_path in file_paths:
    # File names start with the numeric participant ID followed by '_'.
    participant_id_str = os.path.basename(file_path).split('_')[0]
    participant_id = int(participant_id_str)
    par_ids_magic_carpet_2023.append(participant_id)

    # Tag every row of this file with the participant it came from.
    df = pd.read_csv(file_path)
    df['ParticipantID'] = participant_id
    df_magic_carpet_2023_list.append(df)

# Single frame holding the rows of every participant, with a fresh 0..N-1 index.
df_magic_carpet_2023 = pd.concat(df_magic_carpet_2023_list, ignore_index=True)

## Check number of outliers

#### Check outliers for magic carpet 2020 dataset

In [72]:
# Count outlier trials in the magic carpet 2020 dataset, per participant and
# in total.
#
# A trial is an outlier when either response time column (rt1 / rt2) is
# missing (NaN) or below MIN_RT. The value -1, which the earlier version of
# this check tested for explicitly, is below the threshold and is therefore
# already covered (presumably it marks a missed response -- confirm against
# the dataset description).
MIN_RT = 0.1

# Participant handled as a whole-participant exclusion: all of their trials
# count as bad in the overall total, and they are left out of the
# "excluding bad participant" counters entirely.
BAD_PARTICIPANT_ID = 4960

n_bad_trials = 0
n_total_trials = 0

n_bad_trials_excluding_bad_participant = 0
n_total_trials_excluding_bad_participant = 0

for par in par_ids_magic_carpet:

    df_par = df_magic_carpet[df_magic_carpet['ParticipantID'] == par]
    T = len(df_par)

    # Row-wise flags; a trial that is bad on both stages is counted once.
    bad_rt1 = df_par['rt1'].isna() | (df_par['rt1'] < MIN_RT)
    bad_rt2 = df_par['rt2'].isna() | (df_par['rt2'] < MIN_RT)

    # Sorted positions (within this participant's rows) of the flagged trials.
    badtrials = np.flatnonzero((bad_rt1 | bad_rt2).to_numpy())

    par_outlier_percentage = (len(badtrials) / T) * 100

    print(f'Participant {par}, had {par_outlier_percentage:.3g} % of the trials defined as outliers')

    n_total_trials += T

    if par == BAD_PARTICIPANT_ID:
        # Excluded participant: every trial counts as an outlier overall.
        n_bad_trials += T
    else:
        n_bad_trials += len(badtrials)
        n_bad_trials_excluding_bad_participant += len(badtrials)
        n_total_trials_excluding_bad_participant += T

Participant 46080, had 3.83 % of the trials defined as outliers
Participant 53366, had 0.495 % of the trials defined as outliers
Participant 3187, had 0 % of the trials defined as outliers
Participant 68012, had 0.985 % of the trials defined as outliers
Participant 9695, had 0.495 % of the trials defined as outliers
Participant 61120, had 0.495 % of the trials defined as outliers
Participant 4862, had 1.47 % of the trials defined as outliers
Participant 55027, had 0 % of the trials defined as outliers
Participant 99461, had 0 % of the trials defined as outliers
Participant 4960, had 44.1 % of the trials defined as outliers
Participant 16691, had 0.985 % of the trials defined as outliers
Participant 75759, had 0.495 % of the trials defined as outliers
Participant 4936, had 9.91 % of the trials defined as outliers
Participant 17946, had 1.47 % of the trials defined as outliers
Participant 33765, had 0 % of the trials defined as outliers
Participant 92975, had 0 % of the trials defined as

In [73]:
# Overall outlier share in percent, from the counters accumulated by the
# outlier loop that was run last.
total_outlier_percentage = (n_bad_trials * 100) / n_total_trials
print(
    f'Total percentage of trials defined as outliers: {total_outlier_percentage:.3g}%'
)

Total percentage of trials defined as outliers: 6.53%


In [74]:
# Same percentage as above, but computed from the counters that leave the
# excluded participant out. Note that this overwrites total_outlier_percentage.
total_outlier_percentage = (
    n_bad_trials_excluding_bad_participant * 100
) / n_total_trials_excluding_bad_participant
print(
    f'Total percentage of trials defined as outliers, excluding bad participant: {total_outlier_percentage:.3g}%'
)

Total percentage of trials defined as outliers, excluding bad participant: 1.07%


#### Check outliers for spaceship dataset

In [9]:
# Count outlier trials in the spaceship dataset, per participant and in total.
# A trial is flagged when rt1 or rt2 is below 0.1, equal to -1, or missing.
n_total_trials = 0
n_bad_trials = 0

for par in par_ids_spaceship:

    df_par = df_spaceship[df_spaceship['ParticipantID'] == par]
    T = len(df_par)
    n_total_trials += T

    rt1 = df_par['rt1']
    rt2 = df_par['rt2']

    # Row-wise flags for each response time column.
    flagged_rt1 = (rt1 < 0.1) | (rt1 == -1) | rt1.isna()
    flagged_rt2 = (rt2 < 0.1) | (rt2 == -1) | rt2.isna()

    # Sorted positions of trials flagged on either column; a trial that is
    # flagged on both is counted once.
    badtrials = np.flatnonzero((flagged_rt1 | flagged_rt2).to_numpy())
    n_bad_trials += len(badtrials)

    par_outlier_percentage = (len(badtrials) / T) * 100
    print(f'Participant {par}, had {par_outlier_percentage:.3g} % of the trials defined as outliers')

Participant 19470, had 2.39 % of the trials defined as outliers
Participant 36455, had 10.8 % of the trials defined as outliers
Participant 73081, had 6 % of the trials defined as outliers
Participant 36495, had 21.9 % of the trials defined as outliers
Participant 80449, had 25.3 % of the trials defined as outliers
Participant 36848, had 15.6 % of the trials defined as outliers
Participant 81838, had 5.18 % of the trials defined as outliers
Participant 78906, had 3.2 % of the trials defined as outliers
Participant 13442, had 60.4 % of the trials defined as outliers
Participant 21541, had 14.4 % of the trials defined as outliers
Participant 5136, had 10.4 % of the trials defined as outliers
Participant 62517, had 0.4 % of the trials defined as outliers
Participant 65532, had 7.6 % of the trials defined as outliers
Participant 12251, had 3.86 % of the trials defined as outliers
Participant 28896, had 1.2 % of the trials defined as outliers
Participant 1968, had 31.6 % of the trials defin

In [10]:
# Overall outlier share in percent, from the counters accumulated by the
# outlier loop that was run last.
total_outlier_percentage = (n_bad_trials * 100) / n_total_trials
print(
    f'Total percentage of trials defined as outliers: {total_outlier_percentage:.3g}%'
)

Total percentage of trials defined as outliers: 15%


#### Check outliers for magic carpet 2023 dataset

In [11]:
# Count outlier trials in the magic carpet 2023 dataset, per participant and
# in total. A trial is an outlier when rt1 or rt2 is missing, blank /
# non-numeric, or below 0.1.
n_bad_trials = 0
n_total_trials = 0

for par in par_ids_magic_carpet_2023:

    df_par = df_magic_carpet_2023[df_magic_carpet_2023['ParticipantID'] == par]

    T = len(df_par)

    n_total_trials += T

    # Coerce to numbers first. The earlier check combined `== ''` with `< 0.1`
    # on the raw column, which cannot both be meaningful: on a numeric column
    # the string comparison never matches, and on a string column the `<`
    # comparison raises TypeError. With errors='coerce', blank or otherwise
    # non-numeric entries become NaN and are caught by isna().
    rt1 = pd.to_numeric(df_par['rt1'], errors='coerce')
    rt2 = pd.to_numeric(df_par['rt2'], errors='coerce')

    bad_rt1 = rt1.isna() | (rt1 < 0.1)
    bad_rt2 = rt2.isna() | (rt2 < 0.1)

    # Sorted positions of trials flagged on either column; a trial that is
    # flagged on both is counted once.
    badtrials = np.flatnonzero((bad_rt1 | bad_rt2).to_numpy())

    par_outlier_percentage = (len(badtrials) / T) * 100

    print(f'Participant {par}, had {par_outlier_percentage:.3g} % of the trials defined as outliers')

    n_bad_trials += len(badtrials)
    

Participant 94, had 2.67 % of the trials defined as outliers
Participant 2, had 1.33 % of the trials defined as outliers
Participant 15, had 1.33 % of the trials defined as outliers
Participant 27, had 1.33 % of the trials defined as outliers
Participant 70, had 0 % of the trials defined as outliers
Participant 38, had 0.667 % of the trials defined as outliers
Participant 16, had 2.67 % of the trials defined as outliers
Participant 91, had 0.667 % of the trials defined as outliers
Participant 14, had 0 % of the trials defined as outliers
Participant 3, had 1.33 % of the trials defined as outliers
Participant 23, had 2 % of the trials defined as outliers
Participant 62, had 0.667 % of the trials defined as outliers
Participant 34, had 0.667 % of the trials defined as outliers
Participant 99, had 0.667 % of the trials defined as outliers
Participant 54, had 1.33 % of the trials defined as outliers
Participant 41, had 0 % of the trials defined as outliers
Participant 82, had 1.33 % of the

In [12]:
# Overall outlier share in percent, from the counters accumulated by the
# outlier loop that was run last.
total_outlier_percentage = (n_bad_trials * 100) / n_total_trials
print(
    f'Total percentage of trials defined as outliers: {total_outlier_percentage:.3g}%'
)

Total percentage of trials defined as outliers: 1%
