# Clean Data

In [7]:
# Import libraries
import glob
import os
import re

import janitor  # imported for its side effect: adds clean_names() to DataFrames
import numpy as np
import pandas as pd

In [8]:

# read and sort psychopy data ----

files = glob.glob('data/01_raw/pavlovia/*.csv')
files = list(filter(lambda file: os.stat(file).st_size > 2, files)) # ignores files of 2kb of under (empty files)

experiment = pd.concat(map(pd.read_csv, files))

cols = list(range(0, 14)) # first few cols
cols.extend([27, 28, 29, 30, 31, 48, 49, 62]) # add remaining relevant columns
experiment = experiment.drop(experiment.columns[cols],axis=1)

# fix incorrect condition labels (depress = depression)
experiment.emotion = experiment.emotion.str.replace(r'depress\b','depression',regex=True)
experiment = experiment.loc[experiment['list'] != 'prac']

# make time in milliseconds
experiment['mouse.time'] = experiment['mouse.time'] *1000

# merge columns with minor difference in heading names
experiment['trials.thisN'] = experiment.fillna(0)['trials.thisN'] + experiment.fillna(0)['trials_2.thisN']

# drop redundant columns
experiment = experiment.drop([
    'expName',
    'trials.thisIndex',
    'mouse_fixation.clicked_name', 
    'trials.thisRepN', 
    'trials.thisTrialN', 
    'trials.ran',
    'condition',
    'mouse.leftButton',
    'mouse.midButton',
    'mouse.rightButton'
    ], 
    axis=1
)

experiment = experiment.drop(experiment.filter(regex='trials_2|mouse_fixation|mouse_sentence').columns, axis=1)

# reorder cols
cols_to_order = ['participant', 'list', 'item', 'trials.thisN', 'emotion', 'tense', 'sentence', 'correct']
new_columns = cols_to_order + (experiment.columns.drop(cols_to_order).tolist())
experiment = experiment[new_columns]

# make new variables
experiment['clicked_up'] = np.where(experiment['mouse.y']>0, 1, 0)
experiment['clicked_right'] = np.where(experiment['mouse.x']>0, 1, 0)

# clean names
experiment = experiment.clean_names()

In [9]:
# read and sort consent data ----

consent = pd.read_csv('data/01_raw/qualtrics/consent-data.csv').drop([0, 1])
consent = consent[['participant', 'Progress', 'Q5', 'Q7', 'Q8']].rename(
    columns={'Q5': 'consent', 'Q7': 'age_years', 'Q8': 'gender'}
).clean_names()

# read and sort questionnaire data ----

qdat = pd.read_csv('data/01_raw/qualtrics/questionnaire-data.csv').drop([0, 1]).clean_names()
das_participants = qdat.filter(regex = 'participant|das')

# make subscales then sum them
das = pd.DataFrame(columns = ['participant', 'depression_sum', 'anxiety_sum', 'stress_sum'])
das['participant'] = das_participants.participant.astype(str)
das['depression_sum'] = das_participants.iloc[:, 0:7].astype(int).sum(axis=1)
das['anxiety_sum'] = das_participants.iloc[:, 7:14].astype(int).sum(axis=1)
das['stress_sum'] = das_participants.iloc[:, 14:21].astype(int).sum(axis=1)

In [10]:
# merge data ----

cleaned_data = experiment.merge(consent, on='participant', how='left')
cleaned_data = cleaned_data.merge(das, on='participant', how='left')

In [11]:
# make reaction time data ----

# filter to correct items only; note now uses only anxiety/depression stimuli as there's no way to determine a correct selection for neutral
cleaned_data_times = cleaned_data[(cleaned_data['correct'] == 1) & (cleaned_data['emotion'] != 'neutral')]

# fix cleaned names for mouse times; uniquely identify the click x, y, and time from the array of all x, ys, and times for mouse movements
cols=pd.Series(cleaned_data_times.columns)

for dup in cols[cols.duplicated()].unique(): 
    cols[cols[cols == dup].index.values.tolist()] = [dup + '_click' if i != 0 else dup for i in range(sum(cols == dup))]
cleaned_data_times.columns=cols

# remove outliers
cleaned_data_times = cleaned_data_times[(cleaned_data_times['mouse_time_click'] > 300) & (cleaned_data_times['mouse_time_click'] < 4000)]

In [12]:
# save cleaned data ----

cleaned_data.to_csv('data/02_cleaned/cleaned_data.csv', encoding='utf-8', index=False)
cleaned_data_times.to_csv('data/02_cleaned/cleaned_data_times.csv', encoding='utf-8', index=False)