In [1]:
# default_exp datasets

# HowaKaha05 Dataset
> Kahana, M. J., & Howard, M. W. (2005). Spacing and lag effects in free recall of pure lists. Psychonomic Bulletin & Review, 12(1), 159-164.

Sixty-six students studied and attempted free recall of 15 different lists of high-frequency nouns drawn from the Toronto Noun Pool (Friendly, Franklin, Hoffman, & Rubin, 1982). The lists consisted of 30 words, each repeated three times for a total of 90 presentations per list. List
presentation was auditory, and the subjects made their responses
vocally into a headset microphone. The words were presented at a rate
of one word every 1.5 sec. After list presentation, the subjects were given a distractor task
involving simple arithmetic problems of the form $A + B + C = ?$.
The subjects had to correctly answer 15 problems in a row before
they could proceed to the recall phase.

There were three list types: massed, spaced short, and spaced
long. In the massed lists, each word was repeated three times successively. In the spaced-short lists, the presentation order was randomized, subject to the constraint that the lag between repetitions
was at least 2 and no more than 6. For the spaced-long lists, presentation order was randomized, subject to the constraint that interrepetition lags were at least 6 and not more than 20.

As is typical in free recall studies, we took measures to eliminate warm-up effects by excluding the first 2 lists
from our data analyses. One of these first 2 practice lists was massed,
and the other was randomly chosen to be either spaced short or
spaced long. Of the subsequent 12 lists, 4 were massed, 4 were
spaced short, and 4 were spaced long, presented in an individually
randomized order for each subject.

List types are coded in the `condition` column as follows:

- 0 - massed
- 1 - spaced long
- 2 - spaced short

In [1]:
# export

import numpy as np
import pandas as pd
from psifr import fr

def prepare_howakaha05_data(path):
    """
    Prepares data formatted like `../data/HowaKaha05.dat` for fitting.

    The file is read as tab-separated text in which every trial occupies five
    consecutive lines. Three of them are used:

    - line 1 of the record: subject id (field 0) and list type code (field 2),
    - line 2 of the record: item numbers in presentation order,
    - line 4 of the record: recalled item numbers in output order.

    The last tab-separated field of the presentation and recall lines is
    discarded before parsing. The number of subjects (66), trials per subject
    (15) and the padded recall length (90) are fixed inside the function, so
    the file must hold at least 66 * 15 five-line records.

    Parameters
    ----------
    path : str or path-like
        Location of the raw data file.

    Returns
    -------
    trials : np.ndarray
        One row per trial, zero-padded to `list_length`. Each non-zero entry
        is the 1-based study position at which a recalled item was first
        presented, in recall order. Recalls that are not studied item numbers
        and repeated recalls are dropped.
    merged : pd.DataFrame
        Result of `fr.merge_free_recall` applied to `data` with
        `list_keys=['condition', 'first_input']`.
    list_length : int
        Always 90.
    presentations : np.ndarray
        One row per trial; every presented item number is replaced by the
        0-based rank of its first appearance within that trial.
    list_types : np.ndarray
        Integer list type code of each trial.
    data : pd.DataFrame
        Long-format study and recall events with columns `subject`, `list`,
        `trial_type`, `position`, `item`, `condition`, `first_input`.
    subjects : np.ndarray
        Subject id of each trial.
    """

    with open(path) as f:
        howa_data = f.read()

    subject_count = 66
    trial_count = 15
    lines_per_trial = 5
    list_length = 90
    total_trials = subject_count * trial_count

    lines = [each.split('\t') for each in howa_data.split('\n')]

    # position of the lines we need inside each five-line trial record
    info_offset, presentation_offset, recall_offset = 0, 1, 3
    trial_starts = range(0, total_trials * lines_per_trial, lines_per_trial)

    # build vectors/matrices tracking list types and presentation item numbers across trials
    list_types = np.array([int(lines[start + info_offset][2]) for start in trial_starts])
    subjects = np.array([int(lines[start + info_offset][0]) for start in trial_starts])

    pres_itemnos = np.array([
        [int(each) for each in lines[start + presentation_offset][:-1]]
        for start in trial_starts])

    # convert pres_itemnos into rows of unique indices for easier model encoding
    presentations = []
    for itemnos in pres_itemnos:
        first_seen = {}
        # setdefault hands out the next unused index the first time an item shows up
        presentations.append(
            [first_seen.setdefault(itemno, len(first_seen)) for itemno in itemnos])
    presentations = np.array(presentations)

    # track recalls, discarding intrusions
    trials = []
    for i, start in enumerate(trial_starts):
        studied = pres_itemnos[i]
        recalled = []

        for token in lines[start + recall_offset][:-1]:
            # anything that is not an integer cannot be a studied item number
            try:
                itemno = int(token)
            except ValueError:
                continue

            if itemno in studied:
                # identify the item by the 1-based position of its first presentation
                position = np.where(studied == itemno)[0][0] + 1
                if position not in recalled:
                    recalled.append(position)

        # pad with zeros to make sure the list is the right length
        recalled.extend([0] * (list_length - len(recalled)))
        trials.append(recalled)

    trials = np.array(trials)

    # encode dataset into psifr format
    data = []
    for trial_index, trial in enumerate(trials):
        presentation = presentations[trial_index]
        subject = subjects[trial_index]
        condition = list_types[trial_index]

        # every time the subject changes, reset list_index
        if not data or data[-1][0] != subject:
            list_index = 0
        list_index += 1

        # add study events
        for presentation_index, presentation_event in enumerate(presentation):
            data.append([
                subject, list_index, 'study', presentation_index+1,
                presentation_event, condition, presentation_index+1])

        # add recall events (zeros are padding, not recalls)
        for recall_index, recall_event in enumerate(trial):
            if recall_event != 0:
                data.append([
                    subject, list_index, 'recall', recall_index+1,
                    presentation[recall_event-1], condition, recall_event])

    data = pd.DataFrame(data, columns=[
        'subject', 'list', 'trial_type', 'position', 'item', 'condition', 'first_input'])
    merged = fr.merge_free_recall(data, list_keys=['condition', 'first_input'])

    return trials, merged, list_length, presentations, list_types, data, subjects

In [2]:
# Parse the raw data file and unpack the seven values returned by
# prepare_howakaha05_data; `events` is the merged psifr table and `rep_data`
# the unmerged long-format events frame.
trials, events, list_length, presentations, list_types, rep_data, subjects = prepare_howakaha05_data(
    '../../data/HowaKaha05.dat')

# Preview the first rows of the merged events table.
events.head()

Unnamed: 0,subject,list,item,input,output,study,recall,repeat,intrusion,condition,first_input
0,118,1,0,1,3.0,True,True,0,False,0,1
1,118,1,0,2,,True,False,0,False,0,2
2,118,1,0,3,,True,False,0,False,0,3
3,118,1,1,4,9.0,True,True,0,False,0,4
4,118,1,1,5,,True,False,0,False,0,5


In [8]:
# Inspect the list type code and the presentation order of the trial at index 0.
list_types[0], presentations[0]

(0,
 array([ 0,  0,  0,  1,  1,  1,  2,  2,  2,  3,  3,  3,  4,  4,  4,  5,  5,
         5,  6,  6,  6,  7,  7,  7,  8,  8,  8,  9,  9,  9, 10, 10, 10, 11,
        11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15, 16, 16, 16,
        17, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22,
        22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28,
        28, 28, 29, 29, 29]))

In [15]:
# Inspect the list type code and the presentation order of the trial at index 3.
list_types[3], presentations[3]

(1,
 array([ 0,  1,  2,  3,  4,  5,  6,  0,  7,  3,  8,  5,  9,  6,  4,  3,  7,
         2,  0,  1,  9, 10, 11, 12,  6,  8,  1,  5,  4, 11,  2, 12, 10,  7,
        13, 14, 15, 16,  9, 17, 13, 18, 10,  8, 16, 11, 19, 17, 12, 18, 14,
        20, 21, 15, 13, 22, 23, 20, 16, 15, 14, 17, 24, 19, 18, 25, 26, 27,
        28, 23, 29, 19, 21, 22, 25, 24, 20, 26, 27, 28, 29, 24, 23, 26, 27,
        22, 25, 28, 29, 21]))

In [10]:
# Inspect the list type code and the presentation order of the trial at index 1.
list_types[1], presentations[1]

(2,
 array([ 0,  1,  2,  3,  1,  2,  0,  1,  3,  0,  2,  4,  5,  3,  6,  4,  5,
         7,  4,  8,  6,  9,  5,  7,  8,  9,  6,  8,  7, 10,  9, 11, 12, 13,
        11, 10, 12, 13, 11, 14, 12, 10, 13, 15, 16, 14, 17, 15, 18, 14, 16,
        17, 18, 15, 16, 17, 19, 20, 18, 21, 20, 22, 19, 20, 21, 19, 22, 23,
        24, 25, 21, 26, 22, 23, 24, 25, 26, 23, 24, 25, 26, 27, 28, 29, 27,
        28, 29, 27, 28, 29]))

In [3]:
# Count the events flagged as intrusions in the merged table. Recalls of
# unstudied items are dropped while parsing, so this is expected to be 0.
np.sum(events.intrusion)

0

In [4]:
# Select the events of subject 118's first list for manual inspection.
# NOTE(review): `test` is not referenced again in the visible cells — confirm
# whether this cell is still needed.
test = events.loc[(events.subject==118) & (events.list==1)]

In [5]:
# Rebuild the presentation matrix from the events table: one row per
# (subject, list) pair, one column per input position, values taken from
# `item`. Cells with no value are filled with 0 before casting to int64.
presentations_df = events.pivot_table(index=['subject', 'list'], columns='input', values='item')
alt_presentations = presentations_df.to_numpy(na_value=0).astype('int64')
# Confirm the reconstruction matches the array returned by the parser.
np.all(presentations==alt_presentations)

True

In [6]:
# Show the last row of the presentation matrix rebuilt from the events table.
alt_presentations[-1]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  3,  2,  1,  9,  4,  7,  3, 10,
        0,  6,  2,  5,  4,  7,  9,  1,  8,  0,  6, 11, 12, 13, 14, 10,  9,
       15,  5, 16,  8, 14, 17, 11, 15, 18, 19, 10, 17, 13, 12, 11, 15, 14,
       18, 13, 19, 16, 20, 21, 22, 23, 24, 25, 26, 17, 21, 18, 12, 22, 19,
       24, 27, 23, 26, 28, 16, 20, 29, 27, 25, 26, 28, 22, 21, 27, 29, 25,
       23, 20, 28, 24, 29], dtype=int64)

In [8]:
# Rebuild the recall matrix from the events table: one row per
# (subject, list) pair, one column per output position, values taken from
# `first_input`. Cells with no value are filled with 0 before casting to int64.
trials_df = events.pivot_table(index=['subject', 'list'], columns='output', values='first_input', dropna=False)
alt_trials = trials_df.to_numpy(na_value=0).astype('int64')

# Compare against the first 30 columns of `trials` only; `trials` itself is
# zero-padded to 90 columns by the parser.
np.all(trials[:,:30]==alt_trials)

True

In [9]:
# Show the zero-padded recall sequence of the first trial as returned by the parser.
trials[0]

array([22, 85,  1, 61, 13, 82, 16, 70,  4, 10, 73, 64,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0], dtype=int64)

In [10]:
# Show the first row of the recall matrix rebuilt from the events table.
alt_trials[0]

array([22, 85,  1, 61, 13, 82, 16, 70,  4, 10, 73, 64,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int64)

In [11]:
# Cross-check the parser output against the package's events_metadata helper.
from compmemlearn.datasets import events_metadata

# Only the first three values returned by events_metadata are used here.
# NOTE(review): 'subject > -1' is presumably a query that keeps every trial —
# confirm against the events_metadata implementation.
alt_trials, alt_list_lengths, alt_presentations = events_metadata(events, 'subject > -1')[:3]
# NOTE(review): this tuple of shapes is not the cell's last expression, so its
# value is computed and discarded without being displayed.
np.shape(alt_trials), np.shape(alt_list_lengths), np.shape(alt_presentations)
# Displayed result: whether the recall and presentation matrices agree.
np.all(trials[:, :30]==alt_trials), np.all(presentations==alt_presentations)

(True, True)

In [12]:
# Write the merged events table to CSV, omitting the DataFrame index.
events.to_csv('../../data/HowardKahana2005.csv', index=False)