In [2]:
import numpy as np
import pandas as pd
from os import listdir
import pickle

# Data preparation
This notebook prepares the raw data for further statistical analyses. Participants whose proportion of DAS choices is extreme (90 % or more) are excluded.

### load data sets

In [3]:
# set path and load file names
path = '../data_empirical_raw/' # directory with the raw data files
# listdir returns the entries in arbitrary order; sort them so that the participant IDs
# (assigned by list position further below) are the same on every machine and run.
# Meta files and hidden entries (e.g. '.DS_Store', '.ipynb_checkpoints') are not data files.
fileNames = sorted(
    file for file in listdir(path)
    if not file.startswith('meta_') and not file.startswith('.')
)
print('Number of participants: {}'.format(len(fileNames)))

Number of participants: 74


## Extract the important information from the raw data files
The resulting data includes: participant ID, block no., trial no., time out, used sequence of actions, reaction time, reward

In [33]:
# this function loads and prepares the data of one participant and returns a pandas DataFrame
def prepare_data_participant(fname, path):
    """Load one raw participant file and return one row per experimental trial.

    Parameters
    ----------
    fname : str
        Name of the raw csv file.
    path : str
        Directory of the file; it is prepended to `fname` as-is, so it has to
        end with a path separator.

    Returns
    -------
    pandas.DataFrame
        The 'Feedback' rows of the experiment (columns 'trialN' through 'reward'
        in file order) plus a column 'RT' holding the summed duration of all
        'Move' rows of the same block and trial. 'RT' is NaN for trials
        without any 'Move' row.
    """
    # define columns to use of the raw file
    cols = ['sender', 'sender_id', 'move', 'blockN', 'trialN', 'duration','reward', 'moveCount', 'movesTrial']

    # load data from current participant
    df = pd.read_csv(path+fname, header=0, usecols=cols) # read data from csv

    # filter data for relevant rows
    filterExperiment = df['sender_id'].str[0:4]=='0_12' # just experiment (not training as well)
    filterMoves = df['sender']=='Move' # only moves
    filterFeedback = df['sender']=='Feedback' # only feedback with difference and reward (incl bonus)

    # get df with just the feedback rows (one row per trial)
    dfFeedback = df.loc[filterExperiment & filterFeedback]
    dfTrials = dfFeedback.loc[:, 'trialN':'reward'].reset_index(drop=True)

    # get RT for the whole trial: sum of the move durations per (block, trial)
    rtPerTrial = (
        df.loc[filterExperiment & filterMoves]
        .groupby(['blockN', 'trialN'], sort=False)['duration']
        .sum()
    )

    # look the RT up by (block, trial) instead of by row position, so that a trial
    # without any move does not shift the RTs of all following trials
    trialKeys = pd.MultiIndex.from_frame(dfFeedback[['blockN', 'trialN']])
    dfTrials['RT'] = rtPerTrial.reindex(trialKeys).to_numpy()

    return dfTrials

In [34]:
# loop over all participants and collect their prepared data
participantFrames = []
for idx, fileName in enumerate(fileNames):
    dfParticipant = prepare_data_participant(fileName, path) # get prepared data
    dfParticipant['Participant_ID'] = idx # add participant ID (position in fileNames)
    participantFrames.append(dfParticipant)

# concatenate once at the end; calling pd.concat inside the loop copies all
# previously collected rows on every iteration
if participantFrames:
    df = pd.concat(participantFrames, ignore_index=True)
else:
    df = pd.DataFrame() # no files found: same empty result as before

## Define time out trials and add action sequence IDs

In [35]:
# flag time out trials: 1 if fewer than four moves were completed, 0 otherwise
df['time_out'] = (df['moveCount'] < 4).astype('int64')

In [36]:
# load file with the sequences (IDs)
# NOTE: unpickling can execute arbitrary code; only load this file from a trusted source
with open('../data_task/sequences.pkl', 'rb') as f: # closed automatically, even if loading fails
    sequences = pickle.load(f)

In [37]:
# parse the comma-separated move strings into lists of integers (excl. time out trials)
completedTrials = df['time_out']==0
df.loc[completedTrials, 'movesTrial'] = df.loc[completedTrials, 'movesTrial'].apply(
    lambda moves: [int(move) for move in moves.split(',')]
)

In [38]:
# add the sequence IDs: the row index in `sequences` whose entries all equal the
# moves of the trial; time out trials keep NaN
df['sequence_ID'] = np.nan
noTimeOut = df['time_out']==0
df.loc[noTimeOut, 'sequence_ID'] = df.loc[noTimeOut, 'movesTrial'].apply(
    lambda moves: np.where((sequences == moves).all(axis=1))[0][0]
)

### Calculate DAS proportions to exclude participants with extremely high or low proportions

In [39]:
idxDAS = 4 # set the DAS index (compared against sequence_ID when counting DAS choices)

In [40]:
# take the action sequences of all valid (non time out) trials and count how often
# each participant has used each sequence of actions
dfSequences = (
    df.loc[df['time_out']==0, ['Participant_ID', 'sequence_ID']]
    .groupby('Participant_ID')
    .value_counts()
    .rename('count_sequence')
    .reset_index()
)

In [41]:
# calculate the proportion of DAS choices
# Position i of every array below belongs to Participant_ID i. Participants who never
# chose the DAS have no row with sequence_ID == idxDAS, so both counts are reindexed
# onto the full ID range (missing -> 0) to keep numerator and denominator aligned.
participantIDs = np.arange(int(df['Participant_ID'].max()) + 1)

countsAll = dfSequences.groupby('Participant_ID')['count_sequence'].sum()
countsDAS = (
    dfSequences.loc[dfSequences['sequence_ID']==idxDAS]
    .groupby('Participant_ID')['count_sequence']
    .sum()
)

nTrialsParticipants = countsAll.reindex(participantIDs, fill_value=0).to_numpy()
nDASParticipants = countsDAS.reindex(participantIDs, fill_value=0).to_numpy()

# a participant without any valid trial yields 0/0 = NaN
with np.errstate(divide='ignore', invalid='ignore'):
    pDASParticipants = nDASParticipants/nTrialsParticipants

Find the participants to keep, i.e. those with $p(\text{DAS})<90\%$

In [50]:
# positions (used as Participant_IDs for the exclusion) of the participants with p(DAS) below 0.90
belowThreshold = pDASParticipants < .90
pDAS90 = np.where(belowThreshold)

Print $p(\text{DAS})$ of the outliers

In [51]:
# show the rounded p(DAS) of every participant above 0.90
pDASParticipants[pDASParticipants > .90].round(2)

array([1.  , 0.99, 0.94, 0.91])

Exclude the outliers from the DataFrame

In [52]:
# keep only the rows of participants whose ID is listed in pDAS90
isKept = df['Participant_ID'].isin(pDAS90[0])
df = df.loc[isKept]

### create a new participant ID column without missing IDs

In [21]:
# remove old ID column and add new one
# nTrials is no longer needed to build the IDs; kept in case later cells refer to it
nTrials = len(df['trialN'].unique())*len(df['blockN'].unique())

# map the remaining old IDs, in ascending order, onto 0..k-1. Deriving the new ID from
# the old one per row does not require every participant to have exactly nTrials rows.
newIDColumn = pd.factorize(df['Participant_ID'], sort=True)[0]
df.pop('Participant_ID')
df.insert(0, 'Participant_ID', newIDColumn) # insert new ID column at first position

### save the resulting panda for further analyses

In [23]:
# write the cleaned data to csv; index=False leaves the row index out of the file
df.to_csv('../data_empirical/data_cleaned.csv', index=False)

In [28]:
# print date of last changes and version numbers
# (loads the `watermark` IPython extension first)
%load_ext watermark

%watermark -n -u -v -iv -w

Last updated: Wed Apr 17 2024

Python implementation: CPython
Python version       : 3.11.6
IPython version      : 8.16.1

seaborn: 0.13.0
pandas : 2.1.1
numpy  : 1.25.2

Watermark: 2.4.3

