# Lexical Decision Data Preprocessing

## Loading merged participants data

In [1]:
import os
import pandas as pd
import numpy as np

In [113]:
data = pd.read_csv("data/LD_participants_data.csv", index_col=0)

In [114]:
data

Unnamed: 0,is_word,rt,word,is_test,is_correct,age,modality,experiment_id,subject
0,1,3.811557,угол,0,1,19,1,1,1101
1,0,1.839332,грунир,0,1,19,1,1,1101
2,1,1.276595,номер,0,1,19,1,1,1101
3,1,1.294384,стена,0,1,19,1,1,1101
4,0,1.252378,фызюк,0,1,19,1,1,1101
...,...,...,...,...,...,...,...,...,...
15243,0,0.859841,малянц,1,1,19,1,26,1126
15244,0,0.811616,щекниль,1,1,19,1,26,1126
15245,0,0.979536,шелир,1,1,19,1,26,1126
15246,0,0.813491,лоур,1,1,19,1,26,1126


## Detecting outliers

### Selecting test data sample

In [115]:
data = data[data['is_test'] == 1].reset_index(drop=True).drop(columns='is_test')

### Detecting outliers using IQR

In [116]:
N_STIMULI = len(data['word'].unique())
N_STIMULI

288

In [117]:
def Q1(x):
    return np.percentile(x, q=25)
def Q3(x):
    return np.percentile(x, q=75)

data_word_q = data.groupby(['modality', "word",]).agg(
    q1=("rt", Q1),
    q3=("rt", Q3),
)
data_subject_q = data.groupby(['modality', "subject",]).agg(
    q1=("rt", Q1),
    q3=("rt", Q3),
)
for data_q in [data_word_q, data_subject_q]:
    data_q['iqr'] = data_q['q3'] - data_q['q1']
    data_q['min'] = data_q['q1'] - 1.5 * data_q['iqr']
    data_q['max'] = data_q['q3'] + 1.5 * data_q['iqr']

def is_outlier(x, feature):
    data_q = data_word_q if feature == 'word' else data_subject_q
    if x['rt'] > data_q.loc[(x['modality'], x[feature]), 'max'] or x['rt'] < data_q.loc[(x['modality'], x[feature]), 'min']:
        return 1
    return 0

data['is_outlier_word'] = [is_outlier(data.loc[i], 'word') for i in range(len(data))]
data['is_outlier_subject'] = [is_outlier(data.loc[i], 'subject') for i in range(len(data))]
data['is_outlier'] = ((data['is_outlier_subject'] + data['is_outlier_word']) > 0).astype(int)

#### detecting participants with > 20% outliers from data

In [118]:
outlier_threshold = 0.2
outliers = data.groupby(['modality', "subject",]).sum()[['is_outlier']]
names = [x[1] for x in outliers[outliers['is_outlier'] > N_STIMULI * outlier_threshold].index.values]
names

[1019, 1023, 1107]

In [119]:
outliers.sort_values(by='is_outlier', ascending=False).head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,is_outlier
modality,subject,Unnamed: 2_level_1
0,1019,204
0,1023,146
1,1107,89
1,1122,53
0,1008,50


### Detecting participants with low accuracy

In [120]:
max_error_rate = 0.15
accuracy = (data.groupby(['modality', "subject",]).sum()[['is_correct']] / N_STIMULI)
names += [x[1] for x in accuracy[accuracy['is_correct'] <= (1 - max_error_rate)].index.values]
names

[1019, 1023, 1107, 1008, 1023, 1108]

In [121]:
(data.groupby(['modality', "subject",]).sum()[['is_correct']] / N_STIMULI).sort_values(by='is_correct').head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,is_correct
modality,subject,Unnamed: 2_level_1
1,1108,0.649306
0,1008,0.826389
0,1023,0.84375
1,1118,0.885417
1,1125,0.885417


### Eliminating participants with outstanding results

In [122]:
data = data[~data['subject'].isin(names)].reset_index(drop=True)

## Reaction Time Standartization

In [None]:
def get_double_centred_zscore(row):
    return (row['rt'] - statistics['subject'].loc[(row['modality'], row['subject']), 'Mean']
            - statistics['word'].loc[(row['modality'], row['word']), 'Mean']
            + statistics['common'].loc[row['modality'], 'Mean']
    ) / statistics['common'].loc[row['modality'], 'SD']

def get_zscore(row, feature):
    return (row['rt'] - statistics[feature].loc[(row['modality'], row[feature]), 'Mean']
    ) / statistics[feature].loc[(row['modality'], row[feature]), 'SD']

statistics = {
    'subject': data.groupby(['modality', "subject",]).agg(
        Mean=("rt", np.mean),
        SD=("rt", np.std),
    ),
    'word': data.groupby(['modality', "word",]).agg(
        Mean=("rt", np.mean),
        SD=("rt", np.std),
    ),
    'common': data.groupby(['modality']).agg(
        Mean=("rt", np.mean),
        SD=("rt", np.std),
    )
}

for feature in ['subject', 'word']:
    data[f'rtz_{feature}']  = [get_zscore(row, feature) for i, row in data.iterrows()]

data['rtz'] = [get_double_centred_zscore(row) for i, row in data.iterrows()]

## Adding Stimuli Features

In [139]:
words = pd.read_csv('data/normal_words_features.csv')
pseudowords = pd.read_csv('data/pseudowords_features.csv')

In [140]:
data = pd.merge(data, pd.concat([words, pseudowords]), on='word', how='left')
data

Unnamed: 0,is_word,rt,word,is_correct,age,modality,experiment_id,subject,is_outlier_word,is_outlier_subject,...,MedianValency,OLD20,PLD20,PUP,OUP,Duration,NormOLD,NormPLD,LengthOrth,LengthPhon
0,0,1.179121,тунка,1,19,1,1,1101,0,0,...,,4.00,4.00,0.800000,0.800000,0.997292,0.800000,0.800000,5,5
1,0,1.619867,малянц,1,19,1,1,1101,0,0,...,,5.00,5.00,0.833333,0.833333,1.069229,0.833333,0.833333,6,6
2,1,1.447418,тетя,1,19,1,1,1101,0,0,...,0.073537,3.75,3.60,1.000000,1.000000,0.966104,0.937500,0.900000,4,4
3,0,1.085082,шехня,1,19,1,1,1101,0,0,...,,4.55,4.90,0.600000,0.600000,0.992563,0.910000,0.980000,5,5
4,0,1.206604,выремь,1,19,1,1,1101,0,0,...,,5.00,5.00,0.800000,0.833333,0.936500,0.833333,1.000000,6,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12955,0,0.859841,малянц,1,19,1,26,1126,0,0,...,,5.00,5.00,0.833333,0.833333,1.069229,0.833333,0.833333,6,6
12956,0,0.811616,щекниль,1,19,1,26,1126,0,0,...,,5.95,6.00,0.714286,0.571429,1.012583,0.850000,0.857143,7,7
12957,0,0.979536,шелир,1,19,1,26,1126,0,0,...,,4.00,4.30,0.600000,0.800000,0.963812,0.800000,0.860000,5,5
12958,0,0.813491,лоур,1,19,1,26,1126,0,0,...,,3.80,3.85,0.750000,0.750000,0.852479,0.950000,0.962500,4,4


In [141]:
data.to_csv('data/LD_preprocessed_data.csv', index=False)