In [148]:
import hashlib
import pandas as pd

In [149]:
# Toggle: when True, the save cells at the end of each experiment section
# write the cleaned dataframe to disk as JSON; when False they only print a notice.
saving_clean_data_to_disk = True
# NOTE(review): data_dir is not referenced by any visible cell (the save cells
# use hard-coded '../data/empirical/...' paths) — confirm whether it is still needed.
data_dir = '../data/cleaned_data'

## Cleaning data

### Anonymizing data

In [253]:
# Memo of identifiers hashed so far: UTF-8 encoded input -> SHA-256 hex digest
ids = {}


def sha256(data):
    """Return the SHA-256 hex digest of the string ``data``.

    The input is UTF-8 encoded before hashing. Each digest is stored in the
    module-level ``ids`` dict (keyed by the encoded bytes), so a repeated
    input is answered from the cache instead of being hashed again.
    """
    encoded = data.encode('utf-8')
    if encoded not in ids:
        ids[encoded] = hashlib.sha256(encoded).hexdigest()
    return ids[encoded]

### Experiment 1

In [286]:
# Location of the Experiment 1 trial-level export (relative to this notebook)
data_path_exp1 = '../data/empirical/raw_anonymized_data_exp1.csv'

In [287]:
# Load the complete Experiment 1 export; this frame is kept as-is and the
# cleaning below works on a derived frame (df_exp1).
df_full_data_exp1 = pd.read_csv(data_path_exp1)

In [288]:
# Tag every row with its experiment number; this column is later used
# (together with subject_id) as a grouping key for the per-participant z-scores.
df_full_data_exp1['experiment_version'] = 1

#### Demographics

In [219]:
# Location of the Experiment 1 demographics file (read here, and written back by the demographics save cell)
demographics_data_path_exp1 = '../data/empirical/demo_data_exp1.csv'

In [220]:
# Load the Experiment 1 demographics table
df_demographic_data_exp1 = pd.read_csv(demographics_data_path_exp1)

In [217]:
# Keep only the two demographic columns we report. `.copy()` makes the result
# an independent frame, so adding `experiment_version` to it afterwards does
# not operate on a slice of the loaded table (avoids SettingWithCopyWarning).
# NOTE(review): the recorded output of this cell is a KeyError for these
# columns — confirm the file at demographics_data_path_exp1 still has 'Age'
# and 'Sex' headers (that file is overwritten by this notebook's own save cell).
df_demographic_data_exp1 = df_demographic_data_exp1[['Age', 'Sex']].copy()

KeyError: "None of [Index(['Age', 'Sex'], dtype='object')] are in the [columns]"

In [None]:
# Tag the demographics rows with their experiment number
df_demographic_data_exp1['experiment_version'] = 1

In [None]:
# Number of participants per reported sex
df_demographic_data_exp1['Sex'].value_counts()

In [None]:
# Mean age, computed only over rows whose Age is not one of the placeholder
# strings (those cannot be cast to int)
age_placeholders_exp1 = ['CONSENT_REVOKED', 'DATA_EXPIRED']
has_usable_age_exp1 = ~df_demographic_data_exp1['Age'].isin(age_placeholders_exp1)
df_demographic_data_exp1.loc[has_usable_age_exp1, 'Age'].astype(int).mean()

In [None]:
# Write the trimmed demographics back to the CSV they were loaded from.
# The target has a .csv extension and is read with pd.read_csv at the top of
# this section, so it must be written as CSV: JSON text in that file cannot be
# parsed back into 'Age'/'Sex' columns on the next run.
df_demographic_data_exp1.to_csv(demographics_data_path_exp1)

#### Data

Here, we're removing unused columns that contain metadata from jsPsych. These are unnecessary for our analyses. In line with good practice with dataframes, we are making a copy of our original dataframe `df_full_data_exp1` for analysis and calling it `df_exp1`.

In [222]:
# jsPsych metadata columns that are not needed for the analyses
unused_columns_exp1 = [
    'success', 'failed_images', 'failed_audio', 'failed_video',
    'time_elapsed', 'scene_sp', 'internal_node_id', 'session_id',
    'study_id', 'value', 'stimulus', 'timeout', 'trial_type',
    'timed_out', 'response', 'correct_response', 'task_condition',
]
# drop() returns a new frame, so df_full_data_exp1 keeps all of its columns
df_exp1 = df_full_data_exp1.drop(columns=unused_columns_exp1)

**Cleaning responses:**

In [223]:
# Drop three more columns we no longer use: attempts, condition and trial_index
no_longer_used_exp1 = ['attempts', 'condition', 'trial_index']
df_exp1 = df_exp1.drop(columns=no_longer_used_exp1)

**Cleaning scene types:** Our variable `scene_type` codes which scene type the participant is viewing (e.g. stim_2, stim_4, or stim_6). jsPsych records these data verbosely (e.g. "stim_2_goalpos_1_negative.mp4"). Here, we're cleaning up the `scene_type` entries so they just say what scene type the user viewed. For example, the entry "stim_2_goalpos_1_negative.mp4" will be changed to "stim_2".

In [224]:
# Cast every scene_type entry to str so the string split below is always valid
df_exp1['scene_type'] = df_exp1['scene_type'].astype(str)
# Keep only the first two underscore-separated tokens,
# e.g. "stim_2_goalpos_1_negative.mp4" -> "stim_2"
df_exp1['scene_type'] = df_exp1['scene_type'].map(
    lambda raw_type: '_'.join(raw_type.split('_', 2)[:2])
)

**Cleaning scenes:** The `scene` variable codes the actual scene the participant viewed (e.g. "stim_2_goalpos_1_negative.mp4"). We want to remove the ".mp4" substring from the entries in this variable.

In [225]:
# Cast every scene entry to str so the string operation below is always valid
df_exp1['scene'] = df_exp1['scene'].astype(str)
# Keep everything before the first '.', which strips the ".mp4" extension
df_exp1['scene'] = df_exp1['scene'].map(lambda file_name: file_name.partition('.')[0])

**Only viewing responses:** Since we removed the participants who failed the comprehension check, we are now only interested in the remaining participants' responses. So, we're removing the data that isn't coded as "response" in the `task` variable.

In [226]:
# Keep only the rows whose task is coded as 'response'
is_response_exp1 = df_exp1['task'] == 'response'
df_exp1 = df_exp1[is_response_exp1]

In [227]:
# After filtering on it, the task column is no longer needed
df_exp1 = df_exp1.drop(columns='task')

**Extract sim time condition**: We're just grabbing the simulation time condition from the scene condition

In [228]:
# The simulation time condition is the first underscore-separated token of scene_type.
# NOTE(review): if scene_type has already been reduced to values like "stim_2"
# (as the notebook text describes), this token is "stim" for every row — confirm
# that this is the intended value.
df_exp1['simulation_time_condition'] = df_exp1['scene_type'].map(
    lambda scene_type: scene_type.split('_', 1)[0]
)

**Extract path condition**: We're just grabbing the path condition from the scene

In [229]:
def _label_path_condition(scene_name):
    """Return 'straight' when the scene name contains 'yessp', else 'not_straight'."""
    if 'yessp' in scene_name:
        return 'straight'
    return 'not_straight'


df_exp1['path_condition'] = df_exp1['scene'].map(_label_path_condition)

**Compute z-scores:** We need to compute the z-scores of response times `rt`. We will do so participant-wise (per participant) and add them as new variable `participant_z_rt`.

In [230]:
# Calculate participant-wise z-score for response time.
# transform() returns a Series aligned to df_exp1's own index, so the
# assignment is row-for-row correct. groupby(...).apply() can return a result
# indexed by the group keys instead (depending on the pandas version), which
# does not align with the frame on assignment.
# std() is the sample standard deviation (pandas default ddof=1); a participant
# with a single trial therefore gets NaN.
df_exp1['participant_z_rt'] = (
    df_exp1
    .groupby(['experiment_version', 'subject_id'])['rt']
    .transform(lambda rt: (rt - rt.mean()) / rt.std())
)

**Removing outlier response times**: We can now remove outlier response times. We use the 2-SD method, following Berger and Kiefer (2021), where 2-SD outlier exclusion gave the best results for avoiding Type-I errors (false positives) while retaining as much of the original data as possible.

In [231]:
# Count how many trials fall outside +/- 2 SD of their participant's mean RT
within_2sd_exp1 = df_exp1['participant_z_rt'].abs() <= 2
excluded = len(df_exp1) - int(within_2sd_exp1.sum())
print(f'{excluded} of {len(df_exp1)} ({excluded/len(df_exp1)*100:.4}%) are excluded via 2-SD exclusion')

105 of 2256 (4.654%) are excluded via 2-SD exclusion


In [232]:
# Keep only trials whose participant-wise z-scored RT lies within +/- 2 SD
keep_trial_exp1 = df_exp1['participant_z_rt'].abs() <= 2
df_exp1 = df_exp1.loc[keep_trial_exp1]

**Clarifying variable names**: Now we rename some columns to make their values more clear.



In [233]:
# Give scene_col a descriptive name, then renumber the rows 0..n-1 because
# the filtering steps above left gaps in the index
df_exp1 = (
    df_exp1
    .rename(columns={'scene_col': 'scene_collision_condition'})
    .reset_index(drop=True)
)

In [234]:
# Save file locally
if saving_clean_data_to_disk:
    # Write first, report second: the confirmation should only be printed
    # once to_json has returned without raising.
    df_exp1.to_json('../data/empirical/data_exp1.json')
    print('Data saved...')
else:
    print('Data was not saved...')

Data saved...


### Experiment 2

In [235]:
# Location of the Experiment 2 trial-level export (relative to this notebook)
data_path_exp2 = '../data/empirical/raw_anonymized_data_exp2.csv'

In [236]:
# Load the complete Experiment 2 export; the cleaning below works on a
# derived frame (df_exp2).
df_full_data_exp2 = pd.read_csv(data_path_exp2)

In [237]:
# Tag every row with its experiment number; this column is later used
# (together with subject_id) as a grouping key for the per-participant z-scores.
df_full_data_exp2['experiment_version'] = 2

In [238]:
# Replace each subject_id with its SHA-256 hex digest.
# This cell is not idempotent: running it a second time hashes the digests again.
df_full_data_exp2['subject_id'] = df_full_data_exp2['subject_id'].map(sha256)

#### Demographics

In [239]:
# Location of the Experiment 2 demographics file (read here, and written back by the demographics save cell)
demographics_data_path_exp2 = '../data/empirical/demo_data_exp2.csv'

In [240]:
# Load the Experiment 2 demographics table
df_demographic_data_exp2 = pd.read_csv(demographics_data_path_exp2)

In [241]:
# Keep only the two demographic columns we report, taken from the
# Experiment 2 table loaded just above (not from the Experiment 1 frame).
# `.copy()` makes the result an independent frame, so adding
# `experiment_version` to it afterwards does not operate on a slice
# (avoids SettingWithCopyWarning).
df_demographic_data_exp2 = df_demographic_data_exp2[['Age', 'Sex']].copy()

KeyError: "None of [Index(['Age', 'Sex'], dtype='object')] are in the [columns]"

In [None]:
# Tag the demographics rows with their experiment number
df_demographic_data_exp2['experiment_version'] = 2

In [None]:
# Number of participants per reported sex
df_demographic_data_exp2['Sex'].value_counts()

In [None]:
# Mean age, computed only over rows whose Age is not one of the placeholder
# strings (those cannot be cast to int)
age_placeholders_exp2 = ['CONSENT_REVOKED', 'DATA_EXPIRED']
has_usable_age_exp2 = ~df_demographic_data_exp2['Age'].isin(age_placeholders_exp2)
df_demographic_data_exp2.loc[has_usable_age_exp2, 'Age'].astype(int).mean()

In [None]:
# Write the demographics frame back to the same CSV path it was loaded from.
# NOTE(review): this overwrites the input file, and to_csv writes the row index
# as an extra unnamed column by default — confirm both are intended.
df_demographic_data_exp2.to_csv(demographics_data_path_exp2)

#### Data

In [242]:
# jsPsych metadata columns that are removed before analysis
unused_columns_exp2 = [
    'success', 'failed_images', 'failed_audio', 'failed_video',
    'time_elapsed', 'scene_sp', 'internal_node_id', 'session_id',
    'study_id', 'value', 'stimulus', 'timeout', 'trial_type',
    'timed_out', 'correct_response', 'response', 'task_condition',
]
# drop() returns a new frame, so df_full_data_exp2 keeps all of its columns
df_exp2 = df_full_data_exp2.drop(columns=unused_columns_exp2)

In [243]:
# Keep only the rows whose task is coded as "response"
is_response_exp2 = df_exp2['task'] == "response"
df_exp2 = df_exp2[is_response_exp2]
# Drop the columns we no longer use: task (just filtered on), plus
# attempts, condition and trial_index
df_exp2 = df_exp2.drop(columns=['task', 'attempts', 'condition', 'trial_index'])

In [244]:
# Cast every scene_type entry to str so later string operations are always valid
df_exp2['scene_type'] = df_exp2['scene_type'].astype(str)

In [245]:
# Keep only the first two underscore-separated tokens of each scene_type entry
# (this removes the "_goalpos_..." tail)
df_exp2['scene_type'] = df_exp2['scene_type'].map(
    lambda raw_type: '_'.join(raw_type.split('_', 2)[:2])
)

In [246]:
# Cast every scene entry to str so later string operations are always valid
df_exp2['scene'] = df_exp2['scene'].astype(str)

In [247]:
# Keep everything before the first '.', which strips the ".mp4" extension
df_exp2['scene'] = df_exp2['scene'].map(lambda file_name: file_name.partition('.')[0])

In [248]:
# Calculate participant-wise z-score for response time.
# transform() returns a Series aligned to df_exp2's own index, so the
# assignment is row-for-row correct. groupby(...).apply() can return a result
# indexed by the group keys instead (depending on the pandas version), which
# does not align with the frame on assignment.
# std() is the sample standard deviation (pandas default ddof=1); a participant
# with a single trial therefore gets NaN.
df_exp2['participant_z_rt'] = (
    df_exp2
    .groupby(['experiment_version', 'subject_id'])['rt']
    .transform(lambda rt: (rt - rt.mean()) / rt.std())
)

In [249]:
# Count how many trials fall outside +/- 2 SD of their participant's mean RT
within_2sd_exp2 = df_exp2['participant_z_rt'].abs() <= 2
excluded = len(df_exp2) - int(within_2sd_exp2.sum())
print(f'{excluded} of {len(df_exp2)} ({excluded/len(df_exp2)*100:.4}%) are excluded via 2-SD exclusion')

439 of 9048 (4.852%) are excluded via 2-SD exclusion


In [250]:
# Keep only trials whose participant-wise z-scored RT lies within +/- 2 SD
keep_trial_exp2 = df_exp2['participant_z_rt'].abs() <= 2
df_exp2 = df_exp2.loc[keep_trial_exp2]

In [251]:
# Rename some columns to make their values more clear.
df_exp2 = df_exp2.rename(columns={'scene_col': 'scene_collision_condition'})
# Renumber rows 0..n-1, as is done for Experiment 1 before saving: the
# filtering steps above leave gaps in the index, and to_json writes the
# index labels into the output file.
df_exp2 = df_exp2.reset_index(drop=True)

In [252]:
# Save file locally
if saving_clean_data_to_disk:
    # Write first, report second: the confirmation should only be printed
    # once to_json has returned without raising.
    df_exp2.to_json('../data/empirical/data_exp2.json')
    print('Data saved...')
else:
    print('Data not saved...')

Data saved...
