This notebook extracts the thresholds estimated by QUEST after every trial (and the ordinary reading speed on every page).
It saves the results in long form (one row per trial-wise threshold or per-page reading speed) as "tidy_both_sessions_thresholds_per_trial_log.csv".

In [35]:
# !pip install seaborn


In [36]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns
import scipy.stats as stats
import re

In [37]:
# Set up files
# os.listdir returns entries in arbitrary order, so the file names are sorted to
# keep the processing order (and therefore the row order of the saved CSV)
# reproducible across machines and runs.
dir_firstSess = 'data/corr_firstSess'
files_firstSess = sorted(f for f in os.listdir(dir_firstSess) if f.endswith('.csv'))

dir_secondSess = 'data/corr_secondSess'
files_secondSess = sorted(f for f in os.listdir(dir_secondSess) if f.endswith('.csv'))

numTotalSessions = len(files_firstSess) + len(files_secondSess)

print('Number of total sessions: ', numTotalSessions)
print('Number of subjects who completed the second session: ', len(files_secondSess))

Number of total sessions:  360
Number of subjects who completed the second session:  171


In [38]:
# print('------first session------')
# for f in files_firstSess:
#     print(f)
    
# print('------second session------')
# for f in files_secondSess:
#     print(f)


## Acquire thresholds for all tasks

### Functions:

In [39]:
def checkIfComplete(mainOutput):
    """
    Verify that a session's output table is flagged as complete.

    Parameters
    ----------
    mainOutput : pandas.DataFrame
        One session's data; must contain the column 'experimentCompleteBool'.

    Returns
    -------
    None
        Returns silently when the first non-missing completion flag is True.

    Raises
    ------
    AssertionError
        If the first non-missing flag is not True, or if the column holds no
        flag at all. The message names the participant when an ID is available.
    """
    complete_flags = mainOutput['experimentCompleteBool'].dropna()

    # Comparing the string form accepts both a boolean True and the text 'True'.
    if not complete_flags.empty and str(complete_flags.iloc[0]) == 'True':
        return

    # Identify the offending session in the error message; fall back to
    # 'unknown' when the ID column is missing or empty.
    prolificID = 'unknown'
    if 'ProlificParticipantID' in mainOutput.columns:
        participant_ids = mainOutput['ProlificParticipantID'].dropna()
        if not participant_ids.empty:
            prolificID = participant_ids.iloc[0]

    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    raise AssertionError(
        f'Fatal: experiment not complete! (ProlificParticipantID: {prolificID})'
    )

In [40]:
def parse_condition(condition_str):
    """
    Split a condition name into its task, meridian and block number.

    The name must follow the layout <task>_<meridian>_block<repeat>; for
    example 'acuity_R8_block1' yields ('acuity', 'R8', 1).

    Raises ValueError when the string does not follow that layout.
    """
    found = re.match(r"^(.*?)_(.*?)_block(\d+)$", condition_str)

    # Guard clause: reject anything that does not fit the pattern.
    if found is None:
        raise ValueError(f"String '{condition_str}' is not in the expected format.")

    task_part, meridian_part, block_digits = found.groups()
    return task_part, meridian_part, int(block_digits)


In [41]:
def getThresholds(mydir, files, condition_names, linear_scale_bool = False, staircase_length = 23, convert_to_wpm_bool = False):
    """
    Extract one QUEST threshold per session file and condition.

    For every file and condition, the threshold is the value of
    'questMeanBeforeThisTrialResponse' at the `staircase_length`-th trial that
    was given to QUEST. If the staircase is shorter than that, the last
    available value is used; if no trial was given to QUEST, a warning is
    printed and the threshold is NaN.

    Parameters
    ----------
    mydir : str
        Directory that contains the session CSV files.
    files : list of str
        File names (relative to `mydir`) to process.
    condition_names : list of str
        Condition names of the form <task>_<meridian>_block<repeat>.
    linear_scale_bool : bool
        If True, return 10 ** value instead of the logged value.
    staircase_length : int
        1-based position in the staircase from which the threshold is read.
    convert_to_wpm_bool : bool
        If True, replace the value v by log10(60) - v before any linear
        conversion.

    Returns
    -------
    pandas.DataFrame
        One row per (file, condition) with the columns prolificID,
        conditionName, taskName, meridian, repeat, threshold, numTrialsSent,
        questSD and readingCQAccuracy (always NaN here).

    Raises
    ------
    AssertionError
        If a file is not marked complete (via checkIfComplete) or has no
        participant ID.
    ValueError
        If a condition name cannot be parsed (via parse_condition).
    """
    all_data = []

    for file_name in files:

        # Read the CSV file
        mainOutput = pd.read_csv(os.path.join(mydir, file_name))
        checkIfComplete(mainOutput)

        # The participant ID belongs to the file, so look it up once per file.
        participant_ids = mainOutput['ProlificParticipantID'].dropna()
        assert not participant_ids.empty, 'Fatal: no prolificID'
        prolificID = participant_ids.iloc[0]
        assert prolificID, 'Fatal: no prolificID'

        for condition_name in condition_names:

            taskName, meridian, repeat = parse_condition(condition_name)
            condition_data = mainOutput[mainOutput['conditionName'] == condition_name]

            # A single mask drives both the staircase and the trial count, so
            # the two can never disagree. Comparing the string form accepts a
            # boolean True as well as the text 'True'.
            sent_to_quest = condition_data['trialGivenToQuest'].astype(str) == 'True'
            num_trial_sent = int(sent_to_quest.sum())

            condition_staircase_full = condition_data.loc[
                sent_to_quest, 'questMeanBeforeThisTrialResponse'
            ].dropna()

            if len(condition_staircase_full) > staircase_length - 1:
                condition_logThreshold = condition_staircase_full.iloc[staircase_length - 1]
            elif len(condition_staircase_full) == 0:
                print('Warning: No trials sent to QUEST (file name: {}, condition {})'.format(file_name, condition_name))
                condition_logThreshold = np.nan
            else:
                # Staircase ended early: fall back to its last estimate.
                condition_logThreshold = condition_staircase_full.iloc[-1]
            assert np.isscalar(condition_logThreshold), "Threshold extraction did not return a single value"

            if convert_to_wpm_bool:
                condition_logThreshold = np.log10(60) - condition_logThreshold

            # Every branch above assigns a value, so no "unassigned" sentinel is
            # needed; a threshold of exactly 0 is a legitimate logged value.
            if linear_scale_bool:
                condition_threshold = 10 ** condition_logThreshold
            else:
                condition_threshold = condition_logThreshold

            # questSD: first recorded value, or NaN when none was recorded
            # (e.g. when no trial reached QUEST).
            questSD_values = condition_data['questSDAtEndOfTrialsLoop'].dropna()
            questSD = questSD_values.iloc[0] if not questSD_values.empty else np.nan

            all_data.append({
                'prolificID': prolificID,
                'conditionName': condition_name,
                'taskName': taskName,
                'meridian': meridian,
                'repeat': repeat,
                'threshold': condition_threshold,
                'numTrialsSent': num_trial_sent,
                'questSD': questSD,
                'readingCQAccuracy': np.nan,
            })

    return pd.DataFrame(all_data)
            


In [42]:
def getOrdReadingSpeed(mydir, files, condition_dict, sessionName, log_bool = True):
    '''
    Compute one ordinary-reading speed per session file and reading condition.

    The speed is the total word count divided by the total reading time (in
    minutes), pooled over all pages except the first and the last. The
    comprehension-question accuracy is reported alongside; it is not used to
    exclude any value.

    Parameters
    ----------
    mydir : str
        Directory that contains the session CSV files.
    files : list of str
        File names (relative to `mydir`) to process.
    condition_dict : dict
        Maps each reading condition name to the list of
        'questionAndAnswerNickname' labels of its comprehension questions.
    sessionName : any
        Stored in the 'repeat' column; all reading conditions of one session
        share this value instead of their own block number.
    log_bool : bool
        If True (default), return log10 of the words-per-minute value.

    Returns
    -------
    pandas.DataFrame
        One row per (file, condition) with the columns prolificID,
        conditionName, taskName, meridian, repeat, threshold, numTrialsSent
        (NaN), questSD (NaN) and readingCQAccuracy (percent correct).
        Empty when `files` is empty.
    '''
    all_data = []

    for file_name in files:

        # Read the CSV file
        mainOutput = pd.read_csv(os.path.join(mydir, file_name))
        checkIfComplete(mainOutput)
        prolificID = mainOutput['ProlificParticipantID'].dropna().iloc[0]

        for condition_name, question_labels in condition_dict.items():

            # The block number in the name is ignored: for reading, the repeats
            # in one session are assigned to the same repeat.
            taskName, meridian, _ = parse_condition(condition_name)
            repeat = sessionName

            # Comprehension accuracy: each label must match exactly one row
            # (.item() raises otherwise).
            num_questions = len(question_labels)
            num_correct = 0
            for label in question_labels:
                qq_data = mainOutput[mainOutput['questionAndAnswerNickname'] == label]
                num_correct += bool(qq_data['questionAndAnswerCorrectAnswer'].item()
                                    == qq_data['questionAndAnswerResponse'].item())
            percent_correct = num_correct / num_questions * 100 if num_questions else np.nan

            # calculate reading speed
            speed_data = mainOutput[mainOutput['conditionName'] == condition_name]
            numWords = speed_data['readingPageWords'].dropna()
            reading_time = speed_data['readingPageDurationOnsetToOffsetSec'].dropna()

            # exclude first and last page (positional slice)
            numWords_sum = numWords.iloc[1:-1].sum()
            reading_time_sum = reading_time.iloc[1:-1].sum()

            if reading_time_sum == 0:
                # Nothing to pool (e.g. fewer than three pages): report NaN
                # instead of dividing by zero.
                print(f'Warning: no reading speed data, session name: {file_name}, condition: {condition_name}')
                readingSpeed = np.nan
            else:
                readingSpeed = numWords_sum / (reading_time_sum / 60)
                assert readingSpeed != 0, 'Fatal: Threshold not assigned'

            if log_bool:
                readingSpeed = np.log10(readingSpeed)

            all_data.append({
                'prolificID': prolificID,
                'conditionName': condition_name,
                'taskName': taskName,
                'meridian': meridian,
                'repeat': repeat,
                'threshold': readingSpeed,
                'numTrialsSent': np.nan,
                'questSD': np.nan,
                'readingCQAccuracy': percent_correct,
            })

    # Built once, after all files: also defined when `files` is empty.
    return pd.DataFrame(all_data)
            

In [43]:
def get_trialwise_thresholds(
    mydir, files, condition_names,
    value_col="questMeanBeforeThisTrialResponse",
    linear_scale_bool=False,
    convert_to_wpm_bool=False
):
    """
    Return one row per QUEST trial with the running threshold value
    taken directly from `value_col` (default: 'questMeanBeforeThisTrialResponse').

    Parameters
    ----------
    mydir : str
        Directory that contains the session CSV files.
    files : list of str
        File names (relative to `mydir`) to process.
    condition_names : list of str
        Condition names of the form <task>_<meridian>_block<repeat>.
    value_col : str
        Column from which the per-trial value is read.
    linear_scale_bool : bool
        If True, return 10 ** value instead of the logged value.
    convert_to_wpm_bool : bool
        If True, replace the value v by log10(60) - v
        (log10(sec/word) -> log10(words/min)) before any linear conversion.

    Returns
    -------
    pandas.DataFrame
        Columns: prolificID, conditionName, taskName, meridian, repeat,
        trial_index (1-based within the trials given to QUEST), threshold,
        numTrialsSent (trials given to QUEST in that condition) and
        readingCQAccuracy (always NaN here). Conditions without any trial
        given to QUEST, or files without a 'trialGivenToQuest' column,
        contribute no rows (a message is printed instead).

    Raises
    ------
    AssertionError
        If a file has no participant ID, a condition has no rows at all, or
        `value_col` is missing.
    ValueError
        If a condition name cannot be parsed, or a value in `value_col` is not
        numeric.
    """
    rows = []

    for fname in files:
        main = pd.read_csv(os.path.join(mydir, fname))
        checkIfComplete(main)

        # NaN is truthy, so a plain `assert prolificID` cannot detect a missing
        # ID; check explicitly that at least one non-missing ID exists.
        if 'ProlificParticipantID' in main.columns:
            participant_ids = main['ProlificParticipantID'].dropna()
        else:
            participant_ids = pd.Series(dtype=object)
        assert not participant_ids.empty, f'Fatal: no prolific ID, session name: {fname}'
        prolificID = participant_ids.iloc[0]

        for condition_name in condition_names:
            # subset to condition
            cond = main[main['conditionName'] == condition_name]
            assert not cond.empty,f'Fatal: empty condition, session name: {fname}'

            # parse condition into parts
            taskName, meridian, repeat = parse_condition(condition_name)

            if 'trialGivenToQuest' not in cond.columns:
                # nothing to extract for non-QUEST tasks: skip instead of
                # failing with a KeyError on the filter below
                print(f'Warning: no column named trialGivenToQuest, session name: {fname}')
                continue

            # must have the value column
            assert value_col in cond.columns, f'Fatal: no value column {value_col}, session name: {fname}'

            # only trials sent to QUEST
            cond = cond[cond['trialGivenToQuest'].eq(True)]
            if cond.empty:
                # Reported but not fatal: the condition simply yields no rows.
                print(f'Fatal: no trials were sent to quest, session name: {fname}; condition: {condition_name}')
                continue

            # Non-numeric entries raise; with errors='coerce' they would become
            # NaN instead. Existing NaNs are kept so trial_index stays aligned.
            vals = pd.to_numeric(cond[value_col], errors='raise')

            # optional unit conversions (apply in log domain first)
            if convert_to_wpm_bool:
                # log10(sec/word) -> log10(words/min)
                vals = np.log10(60) - vals

            if linear_scale_bool:
                vals = 10 ** vals

            # `cond` now holds exactly the trials given to QUEST.
            numTrialsSent_total = len(cond)

            # emit rows
            for trial_index, value in enumerate(vals, start=1):
                rows.append({
                    'prolificID': prolificID,
                    'conditionName': condition_name,
                    'taskName': taskName,
                    'meridian': meridian,
                    'repeat': repeat,
                    'trial_index': trial_index,   # 1-based
                    'threshold': value,           # taken directly from value_col
                    'numTrialsSent': numTrialsSent_total,
                    'readingCQAccuracy': np.nan
                })

    return pd.DataFrame(rows)


In [44]:
def getOrdReadingSpeed_pagewise(mydir, files, condition_dict, sessionName, log_bool = True):
    '''
    Compute the ordinary-reading speed of every page, one row per page.

    For each session file and reading condition, the speed of a page is its
    word count divided by its reading time in minutes. The first and the last
    page of each condition are excluded. The comprehension-question accuracy of
    the condition is repeated on each of its rows; it is not used to exclude
    any value.

    Parameters
    ----------
    mydir : str
        Directory that contains the session CSV files.
    files : list of str
        File names (relative to `mydir`) to process.
    condition_dict : dict
        Maps each reading condition name to the list of
        'questionAndAnswerNickname' labels of its comprehension questions.
    sessionName : any
        Stored in the 'repeat' column; all reading conditions of one session
        share this value instead of their own block number.
    log_bool : bool
        If True (default), return log10 of the words-per-minute values.

    Returns
    -------
    pandas.DataFrame
        One row per included page with the columns prolificID, conditionName,
        taskName, meridian, repeat, trial_index (1-based among the included
        pages), threshold, numTrialsSent (NaN) and readingCQAccuracy (percent
        correct). Empty when `files` is empty.
    '''
    all_data = []

    for file_name in files:

        # Read the CSV file
        mainOutput = pd.read_csv(os.path.join(mydir, file_name))
        checkIfComplete(mainOutput)
        prolificID = mainOutput['ProlificParticipantID'].dropna().iloc[0]

        for condition_name, question_labels in condition_dict.items():

            # The block number in the name is ignored: for reading, the repeats
            # in one session are assigned to the same repeat.
            taskName, meridian, _ = parse_condition(condition_name)
            repeat = sessionName

            # Comprehension accuracy: each label must match exactly one row
            # (.item() raises otherwise).
            num_questions = len(question_labels)
            num_correct = 0
            for label in question_labels:
                qq_data = mainOutput[mainOutput['questionAndAnswerNickname'] == label]
                num_correct += bool(qq_data['questionAndAnswerCorrectAnswer'].item()
                                    == qq_data['questionAndAnswerResponse'].item())
            percent_correct = num_correct / num_questions * 100 if num_questions else np.nan

            # calculate reading speed
            speed_data = mainOutput[mainOutput['conditionName'] == condition_name]
            numWords = speed_data['readingPageWords'].dropna()
            reading_time = speed_data['readingPageDurationOnsetToOffsetSec'].dropna()

            # exclude first and last page (positional slice); the division
            # aligns the two series on their row labels
            readingSpeed = numWords.iloc[1:-1] / (reading_time.iloc[1:-1] / 60)

            if readingSpeed.empty:
                print(f'Warning: no reading speed data, session name: {file_name}, condition: {condition_name}')

            if log_bool:
                readingSpeed = np.log10(readingSpeed)

            for page_number, page_speed in enumerate(readingSpeed, start=1):
                # A fresh dict per page: appending one shared dict would make
                # every row of the condition show the last page's values.
                all_data.append({
                    'prolificID': prolificID,
                    'conditionName': condition_name,
                    'taskName': taskName,
                    'meridian': meridian,
                    'repeat': repeat,
                    'trial_index': page_number,
                    'threshold': page_speed,
                    'numTrialsSent': np.nan,
                    'readingCQAccuracy': percent_correct,
                })

    # Built once, after all files: also defined when `files` is empty.
    return pd.DataFrame(all_data)
            

### Acquire thresholds:

- letter acuity (log deg)
- crowding acuity (log deg)
- RSVP reading speed (log10 words per min, converted from word duration in log sec)
- ordinary reading speed (log10 words per min, one value per page)

In [45]:
# First session: trial-wise QUEST values for crowding/acuity and RSVP,
# plus page-wise ordinary reading speed.

thresholds_names_sess1 = [
    'crowding_R8_block1',
    'crowding_L8_block1',
    'crowding_R8_block2',
    'crowding_L8_block2',
    'acuity_R8_block1',
    'acuity_L8_block1',
]
df_firstSess = get_trialwise_thresholds(
    mydir=dir_firstSess,
    files=files_firstSess,
    condition_names=thresholds_names_sess1,
)

# RSVP values are converted to (logged) words per minute.
thresholds_rsvp_sess1 = ['rsvp_foveal_block1']
df_firstSess_rsvp = get_trialwise_thresholds(
    mydir=dir_firstSess,
    files=files_firstSess,
    condition_names=thresholds_rsvp_sess1,
    convert_to_wpm_bool=True,
)

# Each reading condition maps to the nicknames of its comprehension questions.
thresholds_names_read1 = {
    'reading_Beaver_block1': ['Beaver_1', 'Beaver_2', 'Beaver_3', 'Beaver_4', 'Beaver_5'],
    'reading_Winter_block2': ['Winter_1', 'Winter_2', 'Winter_3', 'Winter_4', 'Winter_5'],
}
df_firstSess_reading = getOrdReadingSpeed_pagewise(
    mydir=dir_firstSess,
    files=files_firstSess,
    condition_dict=thresholds_names_read1,
    sessionName=1,
)

Fatal: no trials were sent to quest, session name: FreeSilverFish342_67bdf5a74b3f256db8907428_CrowdingReadingAcuity_firstSess20_0001_2025-07-02_19h51.21.712_EDT.csv; condition: crowding_R8_block1
Fatal: no trials were sent to quest, session name: FreeSilverFish342_67bdf5a74b3f256db8907428_CrowdingReadingAcuity_firstSess20_0001_2025-07-02_19h51.21.712_EDT.csv; condition: crowding_L8_block1
Fatal: no trials were sent to quest, session name: FreeSilverFish342_67bdf5a74b3f256db8907428_CrowdingReadingAcuity_firstSess20_0001_2025-07-02_19h51.21.712_EDT.csv; condition: crowding_R8_block2
Fatal: no trials were sent to quest, session name: FreeSilverFish342_67bdf5a74b3f256db8907428_CrowdingReadingAcuity_firstSess20_0001_2025-07-02_19h51.21.712_EDT.csv; condition: crowding_L8_block2
Fatal: no trials were sent to quest, session name: FreeSilverFish342_67bdf5a74b3f256db8907428_CrowdingReadingAcuity_firstSess20_0001_2025-07-02_19h51.21.712_EDT.csv; condition: acuity_R8_block1
Fatal: no trials were 

In [46]:
# Second session: trial-wise QUEST values for crowding/acuity and RSVP,
# plus page-wise ordinary reading speed.

thresholds_names_sess2 = [
    'crowding_R8_block3',
    'crowding_L8_block3',
    'crowding_R8_block4',
    'crowding_L8_block4',
    'acuity_R8_block2',
    'acuity_L8_block2',
]
df_secondSess = get_trialwise_thresholds(
    mydir=dir_secondSess,
    files=files_secondSess,
    condition_names=thresholds_names_sess2,
)

# RSVP values are converted to (logged) words per minute.
thresholds_rsvp_sess2 = ['rsvp_foveal_block2']
df_secondSess_rsvp = get_trialwise_thresholds(
    mydir=dir_secondSess,
    files=files_secondSess,
    condition_names=thresholds_rsvp_sess2,
    convert_to_wpm_bool=True,
)

# Each reading condition maps to the nicknames of its comprehension questions.
thresholds_names_read2 = {
    'reading_Desert_block1': ['Desert_1', 'Desert_2', 'Desert_3', 'Desert_4', 'Desert_5'],
    'reading_Islands_block2': ['Islands_1', 'Islands_2', 'Islands_3', 'Islands_4', 'Islands_5'],
}
df_secondSess_reading = getOrdReadingSpeed_pagewise(
    mydir=dir_secondSess,
    files=files_secondSess,
    condition_dict=thresholds_names_read2,
    sessionName=2,
)

Fatal: no trials were sent to quest, session name: UpBronzeWhale573_5e38397719e44f0244389781_CrowdingReadingAcuity_secondSess13_0001_2025-07-01_17h48.33.278_ADT.csv; condition: crowding_R8_block3
Fatal: no trials were sent to quest, session name: UpBronzeWhale573_5e38397719e44f0244389781_CrowdingReadingAcuity_secondSess13_0001_2025-07-01_17h48.33.278_ADT.csv; condition: crowding_L8_block3
Fatal: no trials were sent to quest, session name: UpBronzeWhale573_5e38397719e44f0244389781_CrowdingReadingAcuity_secondSess13_0001_2025-07-01_17h48.33.278_ADT.csv; condition: crowding_R8_block4
Fatal: no trials were sent to quest, session name: UpBronzeWhale573_5e38397719e44f0244389781_CrowdingReadingAcuity_secondSess13_0001_2025-07-01_17h48.33.278_ADT.csv; condition: crowding_L8_block4
Fatal: no trials were sent to quest, session name: UpBronzeWhale573_5e38397719e44f0244389781_CrowdingReadingAcuity_secondSess13_0001_2025-07-01_17h48.33.278_ADT.csv; condition: acuity_R8_block2
Fatal: no trials were 

In [47]:
# Stack the six per-session tables into one long data frame with a fresh index.
session_frames = [
    df_firstSess, df_firstSess_rsvp, df_firstSess_reading,
    df_secondSess, df_secondSess_rsvp, df_secondSess_reading,
]
df_all_sessions = pd.concat(session_frames, ignore_index=True)

# Flag any zero or negative value in df_all_sessions['threshold'] as a
# reminder to confirm that logged thresholds are intended.
has_nonpositive_threshold = df_all_sessions['threshold'].le(0).any()
if has_nonpositive_threshold:
    print("Check if you wanted to use logged thresholds.")

Check if you wanted to use logged thresholds.


In [48]:
# Keep a participant only when their rows cover exactly the repeats 1, 2, 3 and 4,
# which is how completion of both sessions is detected here.
rows_by_participant = df_all_sessions.groupby("prolificID")
df_both_sessions = rows_by_participant.filter(
    lambda participant_rows: {1, 2, 3, 4} == set(participant_rows["repeat"])
)

# num_thresholds_per_subj = 18
# assert df_both_sessions["prolificID"].nunique() == len(df_both_sessions) / num_thresholds_per_subj, 'Fatal: Number of thresholds per subject does not match'

num_participants_both = df_both_sessions["prolificID"].nunique()
print(f'\nNumber of participants with both sessions: {num_participants_both}')



Number of participants with both sessions: 168


In [49]:
# Work on an independent copy so that df_both_sessions itself is left untouched.
# If the thresholds were not already log-transformed, the transform would go here:
# df_both_sessions_log['threshold'] = np.log10(df_both_sessions_log['threshold'])
df_both_sessions_log = df_both_sessions.copy(deep=True)

In [50]:
# Write the long-form table to disk; the row index is not saved.
output_path = 'tidy_both_sessions_thresholds_per_trial_log.csv'
df_both_sessions_log.to_csv(path_or_buf=output_path, index=False)