In [2]:
# !pip install seaborn


In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns
import scipy.stats as stats
import re

In [4]:
# Set up files
# os.listdir returns entries in arbitrary order, so the names are sorted to make
# anything built by iterating over these lists come out in a reproducible order.
dir_firstSess = 'data/corr_firstSess'
files_firstSess = sorted(f for f in os.listdir(dir_firstSess) if f.endswith('.csv'))

dir_secondSess = 'data/corr_secondSess'
files_secondSess = sorted(f for f in os.listdir(dir_secondSess) if f.endswith('.csv'))

# One CSV file per session, so the file counts are the session counts
numTotalSessions = len(files_firstSess) + len(files_secondSess)

print('Number of total sessions: ', numTotalSessions)
print('Number of subjects who completed the second session: ', len(files_secondSess))

Number of total sessions:  360
Number of subjects who completed the second session:  171


In [5]:
# print('------first session------')
# for f in files_firstSess:
#     print(f)
    
# print('------second session------')
# for f in files_secondSess:
#     print(f)


## Acquire thresholds for all tasks

### Functions:

In [6]:
def checkIfComplete(mainOutput):
    """
    Verify that a session's output table is flagged as complete.

    The first non-missing value of the 'experimentCompleteBool' column is
    compared, as a string, against 'True'.

    Parameters
    ----------
    mainOutput : pandas.DataFrame
        Table read from one session CSV. Must contain the columns
        'experimentCompleteBool' and (when the session is incomplete)
        'ProlificParticipantID'.

    Raises
    ------
    AssertionError
        If the session is not flagged as complete. The message names the
        participant so the offending file can be located.
    """
    complete_val = mainOutput['experimentCompleteBool'].dropna().iloc[0]
    complete_bool = str(complete_val) == 'True'
    if not complete_bool:
        participant_ids = mainOutput['ProlificParticipantID'].dropna()
        prolificID = participant_ids.iloc[0] if len(participant_ids) else 'unknown'
        # Raised explicitly (rather than via `assert`) so the check still runs
        # when Python is started with -O; the exception type is unchanged.
        raise AssertionError(
            f'Fatal: experiment not complete! (ProlificParticipantID: {prolificID})'
        )

In [7]:
def parse_condition(condition_str):
    """
    Split a condition name of the form <task>_<meridian>_block<repeat>.

    The task is everything before the first underscore; the meridian is
    whatever lies between that underscore and the trailing '_block<digits>'.

    Example: 'acuity_R8_block1' -> ('acuity', 'R8', 1)

    Returns a (task, meridian, repeat) tuple with repeat converted to int.
    Raises ValueError when the string does not follow the pattern.
    """
    parsed = re.match(r"^(.*?)_(.*?)_block(\d+)$", condition_str)

    # Guard clause: reject anything that does not fit the naming scheme
    if parsed is None:
        raise ValueError(f"String '{condition_str}' is not in the expected format.")

    task_part, meridian_part, repeat_digits = parsed.groups()
    return task_part, meridian_part, int(repeat_digits)


In [8]:
def getThresholds(mydir, files, condition_names, linear_scale_bool = True, convert_to_wpm_bool = False):
    """
    Collect one QUEST threshold per (session file, condition) into a tidy table.

    Parameters
    ----------
    mydir : str
        Directory that holds the session CSV files.
    files : list of str
        CSV file names inside `mydir`, one per session.
    condition_names : list of str
        Values of the 'conditionName' column to extract. Each must follow the
        <task>_<meridian>_block<repeat> pattern accepted by `parse_condition`.
    linear_scale_bool : bool, default True
        If True the stored log10 estimate is returned as 10 ** estimate;
        otherwise the log10 value is returned unchanged.
    convert_to_wpm_bool : bool, default False
        If True the log10 estimate is replaced by log10(60) - estimate before
        the scale conversion (on the linear scale this is 60 / value).

    Returns
    -------
    pandas.DataFrame
        One row per file and condition with the columns 'prolificID',
        'conditionName', 'taskName', 'meridian', 'repeat', 'threshold',
        'numTrialsSent', 'questSD' and 'readingCQAccuracy' (always NaN here).
        An empty table with these columns is returned when `files` is empty.

    Raises
    ------
    AssertionError
        If a session is not flagged complete (via `checkIfComplete`) or a
        condition does not have exactly one non-missing
        'questMeanAtEndOfTrialsLoop' value.
    """
    columns = ['prolificID', 'conditionName', 'taskName', 'meridian', 'repeat',
               'threshold', 'numTrialsSent', 'questSD', 'readingCQAccuracy']
    all_data = []

    for file_name in files:

        # Read the CSV file
        mainOutput = pd.read_csv(os.path.join(mydir, file_name))
        checkIfComplete(mainOutput)

        # The participant ID is the same for every condition in a file
        prolificID = mainOutput['ProlificParticipantID'].dropna().iloc[0]

        for condition_name in condition_names:

            taskName, meridian, repeat = parse_condition(condition_name)
            condition_data = mainOutput[mainOutput['conditionName'] == condition_name]

            quest_means = condition_data['questMeanAtEndOfTrialsLoop'].dropna()
            assert len(quest_means) == 1, (
                f"Fatal: expected exactly one 'questMeanAtEndOfTrialsLoop' value for "
                f"condition '{condition_name}' in '{file_name}', found {len(quest_means)}"
            )
            condition_logThreshold = quest_means.iloc[0]

            if convert_to_wpm_bool:
                condition_logThreshold = np.log10(60) - condition_logThreshold

            # No "threshold != 0" sentinel check: the value is always assigned
            # here, and a log threshold of exactly 0 is a legitimate result.
            if linear_scale_bool:
                # Convert to linear scale
                condition_threshold = 10 ** condition_logThreshold
            else:
                condition_threshold = condition_logThreshold

            # number of trials flagged as given to QUEST
            trial_sent = condition_data['trialGivenToQuest']
            num_trial_sent = sum(str(this_trial) == 'True' for this_trial in trial_sent)
            # questSD
            questSD = condition_data['questSDAtEndOfTrialsLoop'].dropna().iloc[0]

            all_data.append({
                'prolificID': prolificID,
                'conditionName': condition_name,
                'taskName': taskName,
                'meridian': meridian,
                'repeat': repeat,
                'threshold': condition_threshold,
                'numTrialsSent': num_trial_sent,
                'questSD': questSD,
                'readingCQAccuracy': np.nan,
            })

    # Build the table once, after all sessions; passing `columns` keeps the
    # schema even when no rows were collected.
    return pd.DataFrame(all_data, columns=columns)
            


In [9]:
def getOrdReadingSpeed(mydir, files, condition_dict, sessionName):
    '''
    Compute ordinary reading speed (words per minute) and comprehension-question
    accuracy for every session file and reading condition.

    Parameters
    ----------
    mydir : str
        Directory that holds the session CSV files.
    files : list of str
        CSV file names inside `mydir`, one per session.
    condition_dict : dict
        Maps each reading condition name (a 'conditionName' value following the
        <task>_<meridian>_block<repeat> pattern) to the list of
        'questionAndAnswerNickname' values of its comprehension questions.
    sessionName : int
        Stored in the 'repeat' column for every row, replacing the block number
        parsed from the condition name.

    Returns
    -------
    pandas.DataFrame
        One row per file and condition with the columns 'prolificID',
        'conditionName', 'taskName', 'meridian', 'repeat', 'threshold' (words
        per minute), 'numTrialsSent' (NaN), 'questSD' (NaN) and
        'readingCQAccuracy' (percent of questions answered correctly).
        An empty table with these columns is returned when `files` is empty.
        No rows are excluded here based on accuracy or reading speed.

    Raises
    ------
    AssertionError
        If a session is not flagged complete (via `checkIfComplete`) or the
        computed reading speed is exactly zero.
    ValueError
        If a question nickname does not match exactly one row (from `.item()`).
    '''
    columns = ['prolificID', 'conditionName', 'taskName', 'meridian', 'repeat',
               'threshold', 'numTrialsSent', 'questSD', 'readingCQAccuracy']
    all_data = []

    for file_name in files:

        # Read the CSV file
        mainOutput = pd.read_csv(os.path.join(mydir, file_name))
        checkIfComplete(mainOutput)

        # The participant ID is the same for every condition in a file
        prolificID = mainOutput['ProlificParticipantID'].dropna().iloc[0]

        for condition_name, question_labels in condition_dict.items():

            taskName, meridian, _ = parse_condition(condition_name)
            repeat = sessionName # for reading, the repeats in one session are assigned to the same repeat

            # Percentage of comprehension questions answered correctly
            num_questions = len(question_labels)
            question_correct_bool = np.full(num_questions, np.nan)
            for qq, question_label in enumerate(question_labels):
                qq_data = mainOutput[mainOutput['questionAndAnswerNickname'] == question_label]
                question_correct_bool[qq] = (
                    qq_data['questionAndAnswerCorrectAnswer'].item()
                    == qq_data['questionAndAnswerResponse'].item()
                )
            percent_correct = sum(question_correct_bool) / num_questions * 100

            # calculate reading speed
            speed_data = mainOutput[mainOutput['conditionName'] == condition_name]
            numWords = speed_data['readingPageWords'].dropna()
            reading_time = speed_data['readingPageDurationOnsetToOffsetSec'].dropna()

            # exclude first and last page (explicitly positional)
            numWords_sum = numWords.iloc[1:-1].sum()
            reading_time_sum = reading_time.iloc[1:-1].sum()

            # seconds -> minutes
            wordsPerMin = numWords_sum / (reading_time_sum / 60)

            assert wordsPerMin != 0, (
                f"Fatal: reading speed is zero for condition '{condition_name}' "
                f"in '{file_name}' (no words counted on the included pages)"
            )

            all_data.append({
                'prolificID': prolificID,
                'conditionName': condition_name,
                'taskName': taskName,
                'meridian': meridian,
                'repeat': repeat,
                'threshold': wordsPerMin,
                'numTrialsSent': np.nan,
                'questSD': np.nan,
                'readingCQAccuracy': percent_correct,
            })

    # Build the table once, after all sessions; passing `columns` keeps the
    # schema even when no rows were collected.
    return pd.DataFrame(all_data, columns=columns)
            

### Acquire thresholds:

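- letter acuity (QUEST estimate in log deg; converted to linear deg by `getThresholds`)
- crowding acuity (QUEST estimate in log deg; converted to linear deg by `getThresholds`)
- RSVP reading speed (QUEST estimate is word duration in log sec; converted to words per min with `convert_to_wpm_bool=True`)
- ordinary reading speed (words per min)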

In [10]:
# first session

# Crowding (two blocks per side) and acuity (one block per side), default linear scale
thresholds_names_sess1 = [
    'crowding_R8_block1', 'crowding_L8_block1',
    'crowding_R8_block2', 'crowding_L8_block2',
    'acuity_R8_block1', 'acuity_L8_block1',
]
df_firstSess = getThresholds(
    mydir=dir_firstSess,
    files=files_firstSess,
    condition_names=thresholds_names_sess1,
)

# RSVP: same extraction, with the words-per-minute conversion switched on
thresholds_rsvp_sess1 = ['rsvp_foveal_block1']
df_firstSess_rsvp = getThresholds(
    mydir=dir_firstSess,
    files=files_firstSess,
    condition_names=thresholds_rsvp_sess1,
    convert_to_wpm_bool=True,
)

# Ordinary reading: condition name -> nicknames of its comprehension questions
thresholds_names_read1 = {
    'reading_Beaver_block1': ['Beaver_1', 'Beaver_2', 'Beaver_3', 'Beaver_4', 'Beaver_5'],
    'reading_Winter_block2': ['Winter_1', 'Winter_2', 'Winter_3', 'Winter_4', 'Winter_5'],
}
df_firstSess_reading = getOrdReadingSpeed(
    mydir=dir_firstSess,
    files=files_firstSess,
    condition_dict=thresholds_names_read1,
    sessionName=1,
)

In [11]:
# second session

# Crowding (blocks 3 and 4 per side) and acuity (block 2 per side), default linear scale
thresholds_names_sess2 = [
    'crowding_R8_block3', 'crowding_L8_block3',
    'crowding_R8_block4', 'crowding_L8_block4',
    'acuity_R8_block2', 'acuity_L8_block2',
]
df_secondSess = getThresholds(
    mydir=dir_secondSess,
    files=files_secondSess,
    condition_names=thresholds_names_sess2,
)

# RSVP: same extraction, with the words-per-minute conversion switched on
thresholds_rsvp_sess2 = ['rsvp_foveal_block2']
df_secondSess_rsvp = getThresholds(
    mydir=dir_secondSess,
    files=files_secondSess,
    condition_names=thresholds_rsvp_sess2,
    convert_to_wpm_bool=True,
)

# Ordinary reading: condition name -> nicknames of its comprehension questions
thresholds_names_read2 = {
    'reading_Desert_block1': ['Desert_1', 'Desert_2', 'Desert_3', 'Desert_4', 'Desert_5'],
    'reading_Islands_block2': ['Islands_1', 'Islands_2', 'Islands_3', 'Islands_4', 'Islands_5'],
}
df_secondSess_reading = getOrdReadingSpeed(
    mydir=dir_secondSess,
    files=files_secondSess,
    condition_dict=thresholds_names_read2,
    sessionName=2,
)

In [12]:
# merge data frames

# Stack every per-task table from both sessions into one long table with a fresh index
session_frames = [
    df_firstSess, df_firstSess_rsvp, df_firstSess_reading,
    df_secondSess, df_secondSess_rsvp, df_secondSess_reading,
]
df_all_sessions = pd.concat(session_frames, ignore_index=True)

# Flag any 0 or negative value in df_all_sessions['threshold']
has_nonpositive_threshold = df_all_sessions['threshold'].le(0).any()
if has_nonpositive_threshold:
    print("Check if you wanted to use logged thresholds.")

In [13]:
# only keep participants who completed both sessions
def _has_all_repeats(subject_rows):
    # Keep a participant only when their rows carry exactly the repeat labels 1 to 4
    return set(subject_rows["repeat"]) == {1, 2, 3, 4}

df_both_sessions = df_all_sessions.groupby("prolificID").filter(_has_all_repeats)

# Sanity check: the row count must equal participants x thresholds per participant
num_thresholds_per_subj = 18
num_subj_both_sessions = df_both_sessions["prolificID"].nunique()
assert num_subj_both_sessions == len(df_both_sessions) / num_thresholds_per_subj, 'Fatal: Number of thresholds per subject does not match'

print(f'\nNumber of participants with both sessions: {df_both_sessions["prolificID"].nunique()}')



Number of participants with both sessions: 169


In [14]:
# save to csv
# Write the both-session table without the DataFrame index column
output_path = 'tidy_both_sessions_thresholds.csv'
df_both_sessions.to_csv(path_or_buf=output_path, index=False)