# Logan 2021 Ranschburg Dataset

I need to process the serial recall data here into the EMBAM format. 

I try to describe the format here:

In [1]:
from typing import TypedDict
from jaxcmr.typing import Integer, Array, NotRequired

class RecallDataset(TypedDict):
    """
    A typed dictionary representing a dataset for free or serial recall experiments.
    Each key maps to a 2D integer array of shape (n_trials, ?).
    Rows correspond to trials; columns vary by field.
    Zeros are used to indicate unused or padding entries, with values starting from 1.

    Required fields:
        - subject:       Subject IDs (one per trial).
        - listLength:    The length of the list presented in each trial.
        - pres_itemnos:  Within-list item numbers (1-based indices; 0 indicates padding).
        - recalls:       Within-list item numbers for recalled items
                         (1-based indices; 0 indicates padding).

    Optional fields (needed only for semantic analysis):
        - pres_itemids:  Cross-list item IDs presented in each trial
                         (points to a global word pool).
        - rec_itemids:   Cross-list item IDs corresponding to items recalled.

    Optional fields (misc):
        - irt, session, listtype / list_type.
        - You can add as many as needed, with `NotRequired[...]`.
    """

    # REQUIRED FIELDS

    subject: Integer[Array, "n_trials 1"]
    """Subject ID for each trial (shape: [n_trials, 1])."""

    listLength: Integer[Array, "n_trials 1"]
    """List length for each trial (shape: [n_trials, 1])."""

    pres_itemnos: Integer[Array, "n_trials num_presented"]
    """Per-trial within-list item numbers (shape: [n_trials, num_presented]).
    1-based indices with 0 for unused/padding entries."""

    recalls: Integer[Array, "n_trials num_recalled"]
    """Within-list item numbers for recalled items (shape: [n_trials, num_recalled]).
    1-based indices with 0 for unused/padding entries."""

    # OPTIONAL FIELDS, REQUIRED FOR SEMANTIC ANALYSIS
    # Both ID fields are NotRequired so that datasets without a global word pool
    # (such as letter-sequence data) still satisfy this type.
    pres_itemids: NotRequired[Integer[Array, "n_trials num_presented"]]
    """Per-trial cross-list item IDs (shape: [n_trials, num_presented]).
    These IDs reference a global word pool and may repeat across trials."""

    rec_itemids: NotRequired[Integer[Array, "n_trials num_recalled"]]
    """Cross-list item IDs for recalled items (shape: [n_trials, num_recalled])."""

    # OPTIONAL FIELDS, MISC
    irt: NotRequired[Integer[Array, "n_trials num_recalled"]]
    """Item response times for recalled items (shape: [n_trials, num_recalled])."""

    session: NotRequired[Integer[Array, "n_trials 1"]]
    """Session IDs for each trial (shape: [n_trials, 1])."""

    listtype: NotRequired[Integer[Array, "n_trials 1"]]
    """List type for each trial (shape: [n_trials, 1])."""

    list_type: NotRequired[Integer[Array, "n_trials 1"]]
    """List type for each trial (shape: [n_trials, 1]).
    Declared alongside `listtype`; both spellings are accepted."""

The key fields are `pres_itemnos` and `recalls`. Nonzero values in `recalls` identify the first study position of the recalled item. Repeated study events and recall events are possible. Intrusions during recall -- recalls of items that weren't presented in the corresponding study sequence -- should be discarded. In the CSV, there is one row per trial. Use the "Word" column to identify the sequence of letters that participants studied, and the "WordTyped" column to identify the recall sequence. The subject field comes from the "Subject" column. Use the "BlockNum" column to define the session field. Use the "Lag" column to define listtype; values can be 0, 1, 2, 3, or 4. (The conversion cell below stores Lag under the key `lag` and also keeps the "Condition" column as `condition`.) Don't bother trying to extract IRT, pres_itemids or rec_itemids. Assume I load data with pd.read_csv("data/raw/GordonRanschburg_Exp1B_Data.csv").

In [2]:
import pandas as pd
import numpy as np
from jaxcmr.helpers import save_dict_to_hdf5

# Raw trial-level data: one row per trial.
RAW_DATA_PATH = "data/raw/GordonRanschburg_Exp1B_Data.csv"

# Columns the conversion below reads; checked up front so a schema change
# produces a readable error instead of a KeyError deep in the conversion loop.
REQUIRED_COLUMNS = ["Subject", "BlockNum", "Condition", "Lag", "Word", "WordTyped"]

df = pd.read_csv(RAW_DATA_PATH)

missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
assert not missing_columns, f"Missing expected columns: {missing_columns}"

df

Unnamed: 0,Subject,Group,TrialNumber,BlockNum,IsPractice,Condition,Lag,Position,Word,WordTyped,...,WordPres1.RT,WordPres2.RT,WordPres3.RT,WordPres4.RT,WordPres5.RT,WordPres6.RT,WordPres7.RT,WordPres8.RT,WordPres9.RT,WordPres10.RT
0,1,2,1,1,0,3,2,2-5,vhzmhr,vhzhmr,...,1671,143,88,559,263,145,289,0,0,0
1,1,2,2,1,0,3,4,0,drytgv,drytgr,...,641,588,96,425,820,563,242,0,0,0
2,1,2,3,1,0,3,1,2-4,jpypke,jpypke,...,530,149,213,240,366,88,256,0,0,0
3,1,2,4,1,0,3,0,2-3,pmmnaq,pmmanq,...,130,200,96,1353,73,238,244,0,0,0
4,1,2,5,1,0,3,4,0,ibaslj,ibaslj,...,285,218,157,119,1491,377,260,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13819,24,4,572,3,0,1,4,0,rmotce,rmotce,...,635,149,110,193,296,152,233,0,0,0
13820,24,4,573,3,0,1,4,0,giwfzn,giwfzn,...,1268,104,905,381,233,367,225,0,0,0
13821,24,4,574,3,0,1,3,1-5,dqiodb,dqiodb,...,531,405,389,122,439,196,211,0,0,0
13822,24,4,575,3,0,1,4,0,ksjbpz,ksjbpz,...,451,138,257,226,783,224,188,0,0,0


In [3]:
def _as_letters(cell) -> str:
    """Return a CSV cell as a string of letters.

    pandas represents an empty cell as NaN, and ``str(NaN)`` is the three-letter
    string "nan"; without this guard a blank response would be scored as the
    participant typing n, a, n.
    """
    return "" if pd.isna(cell) else str(cell)


def _item_numbers(studied_seq: str) -> list[int]:
    """Number the studied letters 1, 2, 3, ... in order of first appearance.

    A repeated letter reuses the number assigned at its first appearance, so
    every presentation of the same letter carries the same within-list number
    (e.g. "vhzmhr" -> [1, 2, 3, 4, 2, 5], "aabb" -> [1, 1, 2, 2]).
    """
    number_of = {}
    # setdefault returns the existing number for a repeat, otherwise assigns the next one.
    return [number_of.setdefault(letter, len(number_of) + 1) for letter in studied_seq]


def _recalled_first_positions(studied_seq: str, recall_seq: str) -> list[int]:
    """Map each typed letter to the 1-based FIRST study position of that letter.

    Letters that were never studied (intrusions) are dropped. A letter typed
    more than once maps to the same first study position each time.
    """
    first_position = {}
    for position, letter in enumerate(studied_seq, start=1):
        first_position.setdefault(letter, position)
    return [first_position[letter] for letter in recall_seq if letter in first_position]


# 1) Parse the study ('Word') and recall ('WordTyped') strings once, one per trial.
#    Each string is a run of single letters, one letter per item.
studied_seqs = [_as_letters(cell) for cell in df['Word'].tolist()]
recall_seqs = [_as_letters(cell) for cell in df['WordTyped'].tolist()]
n_trials = len(df)

# 2) Longest study list and longest typed response set the (padded) array widths.
#    The typed length is an upper bound on recalls kept, since intrusions are dropped.
max_studied_len = max((len(seq) for seq in studied_seqs), default=0)
max_recalled_len = max((len(seq) for seq in recall_seqs), default=0)

# 3) Per-trial scalar fields, each shaped (n_trials, 1).
subject = df['Subject'].to_numpy(dtype=int).reshape(n_trials, 1)
session = df['BlockNum'].to_numpy(dtype=int).reshape(n_trials, 1)   # 'BlockNum' -> session
lag = df['Lag'].to_numpy(dtype=int).reshape(n_trials, 1)
condition = df['Condition'].to_numpy(dtype=int).reshape(n_trials, 1)
listLength = np.array([len(seq) for seq in studied_seqs], dtype=int).reshape(n_trials, 1)

# 4) Zero-padded per-trial matrices.
#    - pres_itemnos: (n_trials, max_studied_len)
#    - recalls:      (n_trials, max_recalled_len)
pres_itemnos = np.zeros((n_trials, max_studied_len), dtype=int)
recalls = np.zeros((n_trials, max_recalled_len), dtype=int)

for i, (studied_seq, recall_seq) in enumerate(zip(studied_seqs, recall_seqs)):
    item_numbers = _item_numbers(studied_seq)
    pres_itemnos[i, :len(item_numbers)] = item_numbers

    # Recalled items are written left-aligned; skipped intrusions leave no gap.
    positions = _recalled_first_positions(studied_seq, recall_seq)
    recalls[i, :len(positions)] = positions

# 5) Pack it all into a dictionary.
recall_dataset = {
    "subject":      subject,
    "listLength":   listLength,
    "pres_itemnos": pres_itemnos,
    "recalls":      recalls,
    "session":      session,
    "lag":     lag,
    "condition": condition,
}

recall_dataset

{'subject': array([[ 1],
        [ 1],
        [ 1],
        ...,
        [24],
        [24],
        [24]]),
 'listLength': array([[6],
        [6],
        [6],
        ...,
        [6],
        [6],
        [6]]),
 'pres_itemnos': array([[1, 2, 3, 4, 2, 5],
        [1, 2, 3, 4, 5, 6],
        [1, 2, 3, 2, 4, 5],
        ...,
        [1, 2, 3, 4, 1, 5],
        [1, 2, 3, 4, 5, 6],
        [1, 2, 3, 1, 4, 5]]),
 'recalls': array([[1, 2, 3, ..., 0, 0, 0],
        [1, 2, 3, ..., 0, 0, 0],
        [1, 2, 3, ..., 0, 0, 0],
        ...,
        [1, 2, 3, ..., 0, 0, 0],
        [1, 2, 3, ..., 0, 0, 0],
        [1, 2, 3, ..., 0, 0, 0]]),
 'session': array([[1],
        [1],
        [1],
        ...,
        [3],
        [3],
        [3]]),
 'lag': array([[2],
        [4],
        [1],
        ...,
        [3],
        [4],
        [2]]),
 'condition': array([[3],
        [3],
        [3],
        ...,
        [1],
        [1],
        [1]])}

In [4]:
# Sanity check: sum the recalls matrix down each column (one total per output position).
np.sum(recall_dataset["recalls"], axis=0)

array([14902, 28324, 40783, 51088, 55148, 49101,  1833,   150,    14])

In [5]:
# Report the array shape stored under every field of the dataset.
for field_name, field_values in recall_dataset.items():
    print(f"{field_name}: {field_values.shape}")

subject: (13824, 1)
listLength: (13824, 1)
pres_itemnos: (13824, 6)
recalls: (13824, 9)
session: (13824, 1)
lag: (13824, 1)
condition: (13824, 1)


In [6]:
# Write the converted dataset to disk. The helper comes from jaxcmr.helpers;
# NOTE(review): the on-disk layout is defined there — confirm before relying on it.
save_dict_to_hdf5(recall_dataset, "data/GordonRanschburg2021.h5")