In [1]:
# default_exp datasets

# HowaKaha05 Dataset
> Kahana, M. J., & Howard, M. W. (2005). Spacing and lag effects in free recall of pure lists. Psychonomic Bulletin & Review, 12(1), 159-164.

Sixty-six students studied and attempted free recall of 15 different lists of high-frequency nouns drawn from the Toronto Noun Pool (Friendly, Franklin, Hoffman, & Rubin, 1982). The lists consisted of 30 words, each repeated three times for a total of 90 presentations per list. List
presentation was auditory, and the subjects made their responses
vocally into a headset microphone. The words were presented at a rate
of one word every 1.5 sec. After list presentation, the subjects were given a distractor task
involving simple arithmetic problems of the form A + B + C = ?.
The subjects had to correctly answer 15 problems in a row before
they could proceed to the recall phase.

There were three list types: massed, spaced short, and spaced
long. In the massed lists, each word was repeated three times successively. In the spaced-short lists, the presentation order was randomized, subject to the constraint that the lag between repetitions
was at least 2 and no more than 6. For the spaced-long lists, presentation order was randomized, subject to the constraint that interrepetition lags were at least 6 and not more than 20.

As is typical in free recall studies, we took measures to eliminate warm-up effects by excluding the first 2 lists
from our data analyses. One of these first 2 practice lists was massed,
and the other was randomly chosen to be either spaced short or
spaced long. Of the subsequent 12 lists, 4 were massed, 4 were
spaced short, and 4 were spaced long, presented in an individually
randomized order for each subject.

List type codes (stored in the `condition` column):

- 0 - massed
- 1 - spaced long
- 2 - spaced short

In [2]:
from jaxcmr_research.helpers.array import find_first

In [21]:
# export

import numpy as np
import pandas as pd
from psifr import fr

def prepare_howakaha05_data(path, subject_count=66, trial_count=15, list_length=90):
    """
    Prepares data formatted like `../data/HowaKaha05.dat` for fitting.

    The file is read as consecutive five-line, tab-separated records, one per
    trial. Within each record, line 1 supplies the subject id (field 0) and
    the list type (field 2), line 2 the presented item numbers, and line 4 the
    recall tokens. The last field of lines 2 and 4 is dropped before parsing
    (presumably an empty field left by a trailing tab -- TODO confirm).

    Parameters
    ----------
    path : str
        Location of the data file.
    subject_count : int, default 66
        Number of subjects in the file.
    trial_count : int, default 15
        Number of trials (lists) per subject; the first
        `subject_count * trial_count` records are read.
    list_length : int, default 90
        Width that every row of `trials` is zero-padded to. Returned unchanged.

    Returns
    -------
    trials : np.ndarray, shape (subject_count * trial_count, list_length)
        Per trial, the 1-indexed first study position of each recalled item in
        recall order, zero-padded on the right. Tokens that are not integers,
        items that were not presented in the trial, and repeated recalls are
        dropped.
    merged : pd.DataFrame
        Output of `fr.merge_free_recall` on `data`, with `condition` and
        `first_input` as list keys.
    list_length : int
        The `list_length` argument.
    presentations : np.ndarray
        Per trial, presented items recoded as 1-indexed ranks by order of
        first appearance within the list.
    list_types : np.ndarray
        List type code of each trial.
    data : pd.DataFrame
        Long-format study and recall events with columns `subject`, `list`,
        `trial_type`, `position`, `item`, `condition`, `first_input`.
    subjects : np.ndarray
        Subject id of each trial.
    """
    lines_per_trial = 5
    n_trials = subject_count * trial_count

    with open(path) as f:
        lines = [line.split('\t') for line in f.read().split('\n')]

    # header line of each record: subject id in field 0, list type in field 2
    headers = [lines[lines_per_trial * i] for i in range(n_trials)]
    subjects = np.array([int(header[0]) for header in headers])
    list_types = np.array([int(header[2]) for header in headers])

    # second line of each record: item numbers in presentation order
    pres_itemnos = np.array([
        [int(token) for token in lines[lines_per_trial * i + 1][:-1]]
        for i in range(n_trials)])

    # Per trial, look up each item's first study position once. Study events,
    # recall events and the recall matrix all read `first_input` from this
    # same mapping, so the merge keys used below agree on both sides.
    presentations = []
    first_inputs = []
    first_position_by_itemno = []
    for itemnos in pres_itemnos:
        first_position = {}
        for position, itemno in enumerate(itemnos, start=1):
            first_position.setdefault(int(itemno), position)

        # dicts keep insertion order, so key order is order of first appearance
        rank = {itemno: index for index, itemno in enumerate(first_position, start=1)}

        presentations.append([rank[int(itemno)] for itemno in itemnos])
        first_inputs.append([first_position[int(itemno)] for itemno in itemnos])
        first_position_by_itemno.append(first_position)
    presentations = np.array(presentations)

    # track recalls, discarding intrusions and repeated recalls
    trials = np.zeros((n_trials, list_length), dtype=np.int64)
    for i in range(n_trials):
        recalled = []
        for token in lines[lines_per_trial * i + 3][:-1]:
            try:
                itemno = int(token)
            except ValueError:
                # not an integer, so it cannot name a presented item
                continue
            position = first_position_by_itemno[i].get(itemno)
            if position is not None and position not in recalled:
                recalled.append(position)
        if recalled:
            trials[i, :len(recalled)] = recalled

    # encode dataset into psifr format
    rows = []
    list_index = 0
    for trial_index in range(n_trials):
        subject = subjects[trial_index]
        list_type = list_types[trial_index]
        presentation = presentations[trial_index]

        # lists are numbered from 1 within each subject's run of trials
        if trial_index == 0 or subject != subjects[trial_index - 1]:
            list_index = 0
        list_index += 1

        # add study events
        study_events = zip(presentation, first_inputs[trial_index])
        for position, (item, first_input) in enumerate(study_events, start=1):
            rows.append([
                subject, list_index, 'study', position, item, list_type, first_input])

        # add recall events; zeros are padding, not recalls
        for recall_index, first_input in enumerate(trials[trial_index]):
            if first_input != 0:
                rows.append([
                    subject, list_index, 'recall', recall_index + 1,
                    presentation[first_input - 1], list_type, first_input])

    data = pd.DataFrame(rows, columns=[
        'subject', 'list', 'trial_type', 'position', 'item', 'condition', 'first_input'])
    merged = fr.merge_free_recall(data, list_keys=['condition', 'first_input'])

    return trials, merged, list_length, presentations, list_types, data, subjects

In [22]:
# Parse the raw data file once; the names below feed the export cells that follow.
(
    trials,
    events,
    list_length,
    presentations,
    list_types,
    rep_data,
    subjects,
) = prepare_howakaha05_data('data/HowaKaha05.dat')

events.head()

Unnamed: 0,subject,list,item,input,output,study,recall,repeat,intrusion,condition,first_input,prior_list,prior_input
0,118,1,1,,3.0,False,True,0,True,0,1,1.0,1.0
1,118,1,1,,3.0,False,True,0,True,0,1,1.0,2.0
2,118,1,1,,3.0,False,True,0,True,0,1,1.0,3.0
3,118,1,1,,3.0,False,True,0,True,0,1,,
4,118,1,1,,3.0,False,True,0,True,0,1,,


In [26]:
# Recall matrices are built from `first_input` when the merged events carry
# that column, and from `input` otherwise.
if "first_input" in events.columns:
    df_value = "first_input"
else:
    df_value = "input"

# Rows without an output position get the placeholder label "N/A" so they
# stay in the pivot. The final pivot column is sliced off afterwards --
# presumably the "N/A" one; verify the column ordering if pandas changes.
events[["output"]] = events[["output"]].fillna("N/A")
trials_df = events.pivot_table(
    values=df_value,
    index=["subject", "list"],
    columns="output",
    dropna=False,
)
trials_array = trials_df.to_numpy(na_value=0)
trials_array = trials_array[:, :-1].astype("int64")

# One subject id per (subject, list) row, as a column vector.
events["subject_index"] = events["subject"]
subjects = events.pivot_table(
    values="subject_index",
    index=["subject", "list"],
    dropna=False,
)
subjects = subjects.values.astype("int64")

# One condition code per (subject, list) row.
condition = events.pivot_table(
    values="condition",
    index=["subject", "list"],
    dropna=False,
    aggfunc=lambda group: max(group),
).values

# Every list has 90 presentations; stored as a column vector aligned with `subjects`.
list_length = np.full((subjects.size, 1), 90)

pres_itemnos = presentations

pres_itemids = pres_itemnos.copy()
rec_itemids = trials_array.copy()

result = dict(
    subject=subjects,
    pres_itemnos=pres_itemnos,
    pres_itemids=pres_itemids,
    rec_itemids=rec_itemids,
    recalls=trials_array,
    listLength=list_length,
    condition=condition,
)

# Report the rank and shape of each entry; all are expected to be 2-D.
for key, value in result.items():
    print(key, value.ndim, value.shape)

subject 2 (990, 1)
pres_itemnos 2 (990, 90)
pres_itemids 2 (990, 90)
rec_itemids 2 (990, 30)
recalls 2 (990, 30)
listLength 2 (990, 1)
condition 2 (990, 1)


In [27]:
import h5py

def save_dict_to_hdf5(data_dict, filename):
    """Write each entry of `data_dict` as a dataset under a "data" group in an HDF5 file.

    Parameters
    ----------
    data_dict : dict
        Maps dataset names to values exposing a `.T` attribute (e.g. NumPy
        arrays). Each value is stored transposed.
    filename : str
        Path of the HDF5 file, opened in "w" mode.
    """
    with h5py.File(filename, "w") as hdf5_file:
        group = hdf5_file.create_group("data")
        for name in data_dict:
            # NOTE(review): values are written transposed, presumably to suit
            # the reader of this file -- confirm against the consumer.
            group.create_dataset(name, data=data_dict[name].T)


# Persist the assembled arrays to disk.
save_dict_to_hdf5(data_dict=result, filename="data/HowardKahana2005.h5")