In [1]:
import pandas as pd

In [3]:
# Load the merged trial-level export and preview its first rows.
DATA_PATH = 'neurolinguistics_study_version_2-merged.csv'
df = pd.read_csv(DATA_PATH)

df.head()

Unnamed: 0,workerid,proliferate.condition,correct_answer,prime,probeContent,response,rt,slide_number_in_experiment,target,trial_type,...,subject_information.gender,subject_information.otherLanguage,subject_information.problems,subject_information.prolific_ID,system.Browser,system.OS,system.screenH,system.screenW,time_in_minutes,error
0,19,condition1,1,準,準,f,2195,8,腦,con,...,Female,普通话，英文,无,66d0e03800acfdbc181f1450,Chrome,Mac,900,1440,4.313183,
1,19,condition1,1,輸,輸,f,2056,9,贏,sem-sem,...,Female,普通话，英文,无,66d0e03800acfdbc181f1450,Chrome,Mac,900,1440,4.313183,
2,19,condition1,2,湧,敢,j,1828,10,敢,phon-sem,...,Female,普通话，英文,无,66d0e03800acfdbc181f1450,Chrome,Mac,900,1440,4.313183,
3,19,condition1,2,炸,查,j,1898,11,查,con,...,Female,普通话，英文,无,66d0e03800acfdbc181f1450,Chrome,Mac,900,1440,4.313183,
4,19,condition1,1,繪,繪,f,2126,12,港,con,...,Female,普通话，英文,无,66d0e03800acfdbc181f1450,Chrome,Mac,900,1440,4.313183,


In [4]:
# Number of rows recorded for each workerid (sanity check that every
# participant contributed the same number of rows).
df['workerid'].value_counts()

workerid
19    180
33    180
20    180
13    180
27    180
34    180
18    180
26    180
29    180
31    180
21    180
28    180
22    180
25    180
23    180
17    180
15    180
30    180
14    180
24    180
Name: count, dtype: int64

In [5]:
# Columns to report per participant: every 'subject_information.*' column
# whose name does not contain one of the keywords below.
keywords = ['comments', 'problems', 'assess','gender']
info_cols = [
    col
    for col in df.columns
    if col.startswith('subject_information.')
    and not any(k in col for k in keywords)
]
# NOTE(review): subject_information.prolific_ID is not filtered out, so
# participant identifiers end up in the saved cell output — confirm that this
# is acceptable before sharing the notebook.

# Participants in order of first appearance in the data.
subject_ids = df['workerid'].unique()

# Print the distinct values of each selected column, one participant at a time.
for workerid in subject_ids:
    df_subj = df.loc[df['workerid'] == workerid]
    print(f'----- workerid: {workerid} -----')
    for col in info_cols:
        unique_val = df_subj[col].unique()
        print(f'{col}: {unique_val}')


----- workerid: 19 -----
subject_information.age: [25]
subject_information.chinese_country_years: [9]
subject_information.chinese_family_years: [25]
subject_information.chinese_school_work_years_spoken: [9]
subject_information.chinese_school_work_years_written: [9]
subject_information.education: [3]
subject_information.otherLanguage: ['普通话，英文']
subject_information.prolific_ID: ['66d0e03800acfdbc181f1450']
----- workerid: 33 -----
subject_information.age: [32]
subject_information.chinese_country_years: [19]
subject_information.chinese_family_years: [32]
subject_information.chinese_school_work_years_spoken: [18]
subject_information.chinese_school_work_years_written: [18]
subject_information.education: [3]
subject_information.otherLanguage: ['普通话，粤语，英语']
subject_information.prolific_ID: ['6595d5e0e40533319c0188c2']
----- workerid: 14 -----
subject_information.age: [30]
subject_information.chinese_country_years: [0]
subject_information.chinese_family_years: [0]
subject_information.chinese_

In [16]:
# Participants left out of the analysis.
# NOTE(review): the reason for excluding workers 14 and 15 is not recorded in
# this cell — presumably based on the background questionnaire printed
# earlier; confirm and document the criterion.
remove_set = {14,15}

# Remaining participants, in order of first appearance in the data.
subjects = [
    wid for wid in df['workerid'].unique().tolist() if wid not in remove_set
]
n_subjects = len(subjects)

print('Total number of subjects: ', n_subjects)

Total number of subjects:  18


In [17]:
# List the columns available for analysis.
# NOTE(review): the earlier df.head() output shows an 'Unnamed: 0' column that
# is absent from this listing, and the execution counts jump from 5 to 16 —
# presumably a since-removed cell dropped it. Confirm the notebook still
# reproduces these outputs after Restart Kernel -> Run All.
df.columns

Index(['workerid', 'proliferate.condition', 'correct_answer', 'prime',
       'probeContent', 'response', 'rt', 'slide_number_in_experiment',
       'target', 'trial_type', 'catch_trials', 'subject_information.age',
       'subject_information.assess',
       'subject_information.chinese_country_years',
       'subject_information.chinese_family_years',
       'subject_information.chinese_school_work_years_spoken',
       'subject_information.chinese_school_work_years_written',
       'subject_information.comments', 'subject_information.education',
       'subject_information.gender', 'subject_information.otherLanguage',
       'subject_information.problems', 'subject_information.prolific_ID',
       'system.Browser', 'system.OS', 'system.screenH', 'system.screenW',
       'time_in_minutes', 'error'],
      dtype='object')

In [18]:
def compute_accuracy(df):
    """Return the fraction of trials in ``df`` that were answered correctly.

    A trial counts as correct when ``correct_answer`` is 1 and ``response``
    is 'f', or when ``correct_answer`` is 2 and ``response`` is 'j'. Any other
    combination (including missing values) counts as incorrect.

    Parameters
    ----------
    df : pandas.DataFrame
        Trial rows with ``correct_answer`` and ``response`` columns.

    Returns
    -------
    float
        Proportion of correct trials in the range [0, 1] (a fraction, not a
        percentage). NaN when ``df`` contains no rows, so that an empty
        selection does not raise ZeroDivisionError.
    """
    ntrials = len(df)
    if ntrials == 0:
        # No trials: accuracy is undefined rather than an error.
        return float('nan')

    correct_answer = df['correct_answer']
    response = df['response']

    # Vectorised form of the per-row key/answer check.
    is_correct = (
        ((correct_answer == 1) & (response == 'f'))
        | ((correct_answer == 2) & (response == 'j'))
    )

    return float(is_correct.sum() / ntrials)

In [23]:
def compute_rt(df, trial_type, rt_offset=1200):
    """Return the mean offset-corrected reaction time for one trial type.

    Selects the rows of ``df`` whose ``trial_type`` equals ``trial_type``,
    subtracts ``rt_offset`` from each ``rt`` value and averages the result.
    All selected trials are included, whether or not they were answered
    correctly.

    Parameters
    ----------
    df : pandas.DataFrame
        Trial rows with ``trial_type`` and ``rt`` columns.
    trial_type : str
        Value of the ``trial_type`` column to select.
    rt_offset : int or float, optional
        Amount subtracted from every raw ``rt``. Defaults to 1200, which was
        previously hard-coded as ``300 + 300 + 300 + 300`` — presumably four
        300 ms display phases that precede the response window; TODO confirm
        against the experiment timing.

    Returns
    -------
    float
        Mean corrected reaction time, or NaN when no row matches
        ``trial_type``.

    Raises
    ------
    AssertionError
        If any corrected reaction time is not strictly greater than 0
        (missing ``rt`` values also fail this check).
    """
    rt = df.loc[df['trial_type'] == trial_type, 'rt'] - rt_offset

    # A non-positive corrected RT means the response was logged before the
    # offset had elapsed, so the trial (or the offset) is suspect.
    assert (rt > 0).all(), "Some RTs are not greater than 0"

    return rt.mean()
    

In [24]:
# Build one summary record per (participant, trial type) pair.
results = []

for workerid in subjects:
    df_workerid = df[df['workerid'] == workerid]

    # Accuracy is computed once over all of this participant's trials and
    # repeated on each of their per-trial-type records.
    overall_accuracy = compute_accuracy(df_workerid)

    for trial_type in df_workerid['trial_type'].unique():
        results.append(dict(
            workerid=workerid,
            accuracy=overall_accuracy,
            trial_type=trial_type,
            mean_rt=compute_rt(df_workerid, trial_type),
        ))

# Assemble the collected records into a single table.
results_df = pd.DataFrame(results)


In [25]:
# Plain-text dump of the per-participant, per-trial-type summary table.
print(results_df)

    workerid  accuracy trial_type      mean_rt
0         19  0.994444        con   805.782609
1         19  0.994444    sem-sem   676.904762
2         19  0.994444   phon-sem   674.440000
3         19  0.994444  phon-phon   748.478261
4         33  1.000000   phon-sem  1134.480000
..       ...       ...        ...          ...
67        20  0.955556        con   977.000000
68        24  0.994444    sem-sem   539.608696
69        24  0.994444   phon-sem   644.500000
70        24  0.994444  phon-phon   558.285714
71        24  0.994444        con   538.407407

[72 rows x 4 columns]


In [26]:
# Grand mean RT per trial type: the average of the per-participant means
# stored in results_df (each participant carries equal weight).
trial_types = df['trial_type'].unique()

for this_type in trial_types:
    df_type = results_df[results_df['trial_type'] == this_type]
    mean_rt = df_type['mean_rt'].mean()
    print(this_type, mean_rt)

con 821.6467025940395
sem-sem 786.6876058835754
phon-sem 827.4852086895642
phon-phon 954.1025860466403
