In [1]:
# !pip install seaborn


In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns
import scipy.stats as stats
import re

In [3]:
# Set up files
dir_firstSess = 'data/corr_firstSess'
files_firstSess = [f for f in os.listdir(dir_firstSess) if f.endswith('.csv')]

dir_secondSess = 'data/corr_secondSess'
files_secondSess = [f for f in os.listdir(dir_secondSess) if f.endswith('.csv')]

numTotalSessions = len(files_firstSess) + len(files_secondSess)

print('Number of total sessions: ', numTotalSessions)
print('Number of subjects who completed the second session: ', len(files_secondSess))

Number of total sessions:  360
Number of subjects who completed the second session:  171


## Acquire thresholds for all tasks

### Functions:

In [4]:
def checkIfComplete(mainOutput):
    
    complete_val = mainOutput['experimentCompleteBool'].dropna().iloc[0]
    complete_bool = str(complete_val) == 'True'
    if not complete_bool:
        prolificID = mainOutput['ProlificParticipantID'].dropna().iloc[0]
    assert complete_bool, 'Fatal: experiment not complete!'                        

In [26]:
def get_rsvp_responses(mydir, files): 
    
    all_data = []
    
    numSess = len(files)
    
    # for sess in range(numSess):
    sess = 0
    # Read the CSV file
    file_path = os.path.join(mydir, files[sess])
    mainOutput = pd.read_csv(file_path)
    # checkIfComplete(mainOutput)


    cond_threshold = {}
    prolificID = mainOutput['ProlificParticipantID'].dropna().iloc[0]
    
    condition_threshold = 0
    # Select rows where conditionName contains the RSVP foveal prefix
    condition_data = mainOutput[mainOutput['conditionName'].str.contains('rsvp_foveal_block', na=False)]

    rsvp_responses = condition_data['rsvpReadingResponsesBool'].dropna()
    # Split each response string like '1,1,1' into three parts and store as a matrix
    resp_list = []
    for resp in rsvp_responses:
        if pd.isna(resp):
            # represent missing as [np.nan, np.nan, np.nan]
            resp_list.append([np.nan, np.nan, np.nan])
            continue
        # ensure it's a string and strip whitespace
        resp_str = str(resp).strip()
        parts = [p.strip() for p in resp_str.split(',')]
        # If fewer than 3 parts, pad with NaN; if more, truncate
        parts = parts[:3] + [np.nan] * max(0, 3 - len(parts))
        # convert to numeric where possible
        numeric_parts = []
        for p in parts:
            try:
                numeric_parts.append(int(p))
            except Exception:
                try:
                    numeric_parts.append(float(p))
                except Exception:
                    numeric_parts.append(np.nan)
        resp_list.append(numeric_parts)

    # Convert to numpy matrix (rows=trials, cols=3)
    if len(resp_list) > 0:
        resp_matrix = np.array(resp_list)
    else:
        resp_matrix = np.empty((0, 3))

    # simple check: warn if fewer than expected trials
    if resp_matrix.shape[0] < 24:
        print(f"Warning: fewer than 24 trials ({resp_matrix.shape[0]}), prolificID: {prolificID}")
    print(resp_matrix)
    cond_threshold['prolificID'] = prolificID
    cond_threshold['threshold'] = condition_threshold
    cond_threshold['rsvp_response_matrix'] = resp_matrix.tolist()

    all_data.append(cond_threshold)

    # all_data.append(subj_thresholds)
    
    all_data_df = pd.DataFrame(all_data)
        
    return all_data_df
            


### Acquire thresholds:

- letter acuity (log deg)
- crowding acuity (log deg)
- RSVP reading speed (word duration, log sec)
- ordinary reading speed (words per min)

In [27]:
# first session
df_firstSess_rsvp = get_rsvp_responses(dir_firstSess, files_firstSess)


[[1 1 1]
 [1 1 1]
 [1 1 1]
 [1 1 1]
 [1 0 1]
 [1 0 1]
 [1 1 0]
 [1 0 0]
 [1 1 1]
 [1 0 0]
 [1 0 1]
 [1 1 0]
 [1 1 0]
 [1 1 0]
 [1 1 0]
 [1 1 0]
 [1 1 1]
 [1 1 0]
 [1 1 1]
 [1 1 0]
 [1 1 1]
 [1 1 0]
 [1 1 0]
 [1 1 1]
 [1 0 0]]


In [None]:
# second session

thresholds_rsvp_sess2 = ['rsvp_foveal_block2']
df_secondSess_rsvp = getThresholds(dir_secondSess, files_secondSess, thresholds_rsvp_sess2, convert_to_wpm_bool=True)


# Old codes

In [None]:
# # merge data frames

# df_all_sessions = pd.concat([df_firstSess, df_firstSess_rsvp, df_firstSess_reading,
#                               df_secondSess, df_secondSess_rsvp, df_secondSess_reading], ignore_index=True)

# # Check if there are any 0 or negative values in df_both_sessions
# if (df_all_sessions['threshold'] <= 0).any().any():
#     print("Check if you wanted to use logged thresholds.")
    
# # only keep participants who completed both sessions
# df_both_sessions = df_all_sessions.groupby("prolificID").filter(lambda g: set(g["repeat"]) == {1, 2, 3, 4})

# num_thresholds_per_subj = 18
# assert df_both_sessions["prolificID"].nunique() == len(df_both_sessions) / num_thresholds_per_subj, 'Fatal: Number of thresholds per subject does not match'

# print(f'\nNumber of participants with both sessions: {df_both_sessions["prolificID"].nunique()}')

# df_both_sessions_log = df_both_sessions.copy()
# df_both_sessions_log['threshold'] = np.log10(df_both_sessions_log['threshold'])
