In [1]:
# default_exp datasets

# Generic Data Operations
Functions for data processing performed prior to actual behavioral analysis but after instantiation of the data object.

In [25]:
# export

import numpy as np
import pandas as pd

def events_metadata(events):
    """
    Return key metadata about a recall events dataframe as lists of numpy arrays.

    One array is generated per list length for each kind of metadata, and the
    returned lists are aligned with `list_lengths`.

    Parameters
    ----------
    events : pandas.DataFrame
        Events table. The columns `subject`, `list`, `input`, `output` and
        `item` are read. The columns `list length`, `first_input` and
        `item_string_index` are used when present.

    Returns
    -------
    trials : list of numpy.ndarray
        int32 arrays with one row per (subject, list) trial and one column per
        `output` value. Cells hold `first_input` when that column exists,
        otherwise `input`; missing cells are filled with 0.
    list_lengths : list of int
        Unique values of `list length` when that column exists, otherwise a
        single entry equal to the maximum of `input`.
    presentations : list of numpy.ndarray
        int32 arrays with one row per trial and one column per `input` value,
        holding `item`; missing cells are filled with 0.
    pres_item_string_ids : list of numpy.ndarray or None
        `item_string_index` arranged by `input`, or None when the events table
        has no `item_string_index` column.
    trial_item_string_ids : list of numpy.ndarray or None
        `item_string_index` arranged by `output`, or None when the events
        table has no `item_string_index` column.
    """

    trial_index = ['subject', 'list']

    # list lengths for efficient presentation simulation
    if 'list length' in events.columns:
        list_lengths = [int(each) for each in pd.unique(events["list length"])]
        ll_specific_trial_query = '`list length` == {}'
    else:
        list_lengths = [int(np.max(events.input))]
        # query that no list length is substituted into; used to select every trial
        ll_specific_trial_query = 'subject != -1'

    def _pivot_to_int_array(columns, values, **to_numpy_kwargs):
        # one row per trial, one column per unique value of `columns`, as int32
        table = events.pivot_table(
            index=trial_index, columns=columns, values=values, dropna=False)
        return table.to_numpy(**to_numpy_kwargs).astype('int32')

    # None of the tables below depend on the list length, so build each once
    # instead of once per list length.
    trial_details = events.pivot_table(
        index=trial_index, dropna=False, aggfunc='first').reset_index()

    df_value = 'first_input' if 'first_input' in events.columns else 'input'
    trials_array = _pivot_to_int_array('output', df_value, na_value=0)
    presentations_array = _pivot_to_int_array('input', 'item', na_value=0)

    has_item_strings = 'item_string_index' in events.columns
    if has_item_strings:
        # NOTE(review): no fill value is given here, so cells without an event
        # are cast from NaN to int32 and their content is unspecified --
        # confirm which sentinel downstream code expects.
        pres_item_string_ids_array = _pivot_to_int_array('input', 'item_string_index')
        trial_item_string_ids_array = _pivot_to_int_array('output', 'item_string_index')
        pres_item_string_ids = []
        trial_item_string_ids = []
    else:
        # Bind both names so the return statement works without the column.
        pres_item_string_ids = None
        trial_item_string_ids = None

    trials = []
    presentations = []
    for list_length in list_lengths:

        # rows of trial_details (and of every pivot above) with this list length
        list_length_mask = trial_details.eval(
            ll_specific_trial_query.format(list_length)).to_numpy(dtype='bool')

        # keep matching trials and at most list_length columns (slicing clips)
        trials.append(trials_array[list_length_mask][:, :list_length])
        presentations.append(presentations_array[list_length_mask][:, :list_length])

        if has_item_strings:
            pres_item_string_ids.append(
                pres_item_string_ids_array[list_length_mask][:, :list_length])
            trial_item_string_ids.append(
                trial_item_string_ids_array[list_length_mask][:, :list_length])

    return trials, list_lengths, presentations, pres_item_string_ids, trial_item_string_ids

In [4]:
# export

def generate_trial_mask(events, trial_query):
    """
    Return mask vector(s) selecting trials that match a query based on elements in events dataframe.

    Generates a mask vector for each list length, and returns a list of masks for each list length.

    Parameters
    ----------
    events : pandas.DataFrame
        Events table. The columns `subject`, `list` and `input` are read, and
        `list length` is used when present.
    trial_query : str
        Expression passed to `DataFrame.eval` on a per-trial table that has
        one row per (subject, list) pair and, for every other column, the
        first value that pivot_table's 'first' aggregation finds for the trial.

    Returns
    -------
    list of numpy.ndarray
        One boolean vector per list length. Each vector has one entry per
        trial of that list length, True where the trial matches `trial_query`.
    """

    # infer list length(s)
    if 'list length' in events.columns:
        list_lengths = [int(each) for each in pd.unique(events["list length"])]
        ll_specific_trial_query = '`list length` == {}'
    else:
        list_lengths = [int(np.max(events.input))]
        # query that no list length is substituted into; used to select every trial
        ll_specific_trial_query = 'subject != -1'

    # one row per trial, so masks line up with arrays pivoted on (subject, list)
    trial_details = events.pivot_table(
        index=['subject', 'list'], dropna=False, aggfunc='first').reset_index()

    # the query result does not depend on list length, so evaluate it once
    trial_mask = trial_details.eval(trial_query).to_numpy(dtype='bool')

    trial_masks = []
    for list_length in list_lengths:

        # select the trials with this list length, then keep their query results
        list_length_mask = trial_details.eval(
            ll_specific_trial_query.format(list_length)).to_numpy(dtype='bool')
        trial_masks.append(trial_mask[list_length_mask])

    return trial_masks

## Tests

1. Make sure `presentations` array is properly generated from the LohnasKahana2014 & HowardKahana2005 datasets as well as a generic dataset.
2. Make sure trials array is 1-indexed and generally selects the correct presentation items. 
3. Make sure trial_mask function can efficiently select between task conditions and/or subjects. 
4. Examine case where filtering happens by list length.
5. Perform a timing comparison between applying a dataframe-based recall analysis and generating a dataframe from an array-based recall analysis.
6. Make sure outputs can be easily handled by numba-jit compiled functions
7. Make sure semantics of item and position elements are consistent across datasets.
8. Make sure metadata are properly subsetted by list length in our Murdock 1962 dataset and in a generic dataset.

### Murdock 1962 Dataset

In [5]:
events = pd.read_csv('../../../compmemlearn/data/Murdock1962.csv')

# events_metadata returns five values; only the first three are checked here
trials, list_lengths, presentations = events_metadata(events)[:3]

# the masks do not depend on the loop variable, so build them once
subject_masks = generate_trial_mask(events, "subject == 1")
short_list_masks = generate_trial_mask(events, "`list length`==20")

assert len(trials) == 3
for list_length_index, list_length in enumerate([20, 30, 40]):

    # confirm list length subsetting
    assert list_lengths[list_length_index] == list_length
    assert np.shape(presentations[list_length_index])[1] == list_length

    # confirm presentations is 0-indexed
    assert np.min(presentations[list_length_index]) == 0
    assert np.max(presentations[list_length_index]) == list_length - 1

    # show the first trial: trials should be 1-indexed except for non-recall events
    print(trials[list_length_index][0])

    # confirm the mask identifies 80 trials per subject, and identifies the right trials
    assert np.sum(subject_masks[list_length_index]) == 80
    assert np.all(subject_masks[list_length_index][:80])

    # confirm masking on a list length of 20 only selects trials in 20-item lists
    if list_length == 20:
        assert np.sum(short_list_masks[list_length_index]) > 0
    else:
        assert np.sum(short_list_masks[list_length_index]) == 0

[20 19 13 18  1  9  2 17 16  0  0  0  0  0  0]
[30 27  6 29 15 23  8  0  0  0  0  0  0  0  0]
[39 40 29 31  7  8  1  0  0  0  0  0  0  0  0]


### Lohnas 2014 Dataset

In [6]:
events = pd.read_csv('../../../compmemlearn/data/LohnasKahana2014.csv')

# events_metadata returns five values; only the first three are checked here
trials, list_lengths, presentations = events_metadata(events)[:3]

# this dataset should yield a single list length
assert len(trials) == 1
assert len(list_lengths) == 1
assert len(presentations) == 1

# unwrap the single-list-length results; the next cell reads `trials`
trials = trials[0]
list_length = list_lengths[0]
presentations = presentations[0]

# confirm presentations is 0-indexed (independent of condition, so checked once)
assert np.min(presentations) == 0
assert np.max(presentations) == list_length - 1

for condition in [1, 2, 3, 4]:

    condition_mask = generate_trial_mask(events, f"condition == {condition}")

    # confirm trial mask can select trials with different presentation orders
    print(presentations[condition_mask[0]][0])

    # confirm size of trial mask and trials array match even when some participants make 0 recalls
    assert len(condition_mask[0]) == len(trials)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39]
[ 0  0  1  1  2  2  3  3  4  4  5  5  6  6  7  7  8  8  9  9 10 10 11 11
 12 12 13 13 14 14 15 15 16 16 17 17 18 18 19 19]
[ 0  1  2  3  4  5  4  6  1  0  7  2  3  7  5  6  8  9 10 11 10 12 13 14
  9  8 15 12 11 13 15 16 14 17 18 19 16 19 18 17]
[ 0  1  2  3  4  5  6  7  8  9 10 11 11 12 13 14 15 16  9 17 18 19 18 20
 21 22 19 23 24 25 21 26 27 23 28 29 30 31 32 33]


In [7]:
# Inspect the recall sequence stored for the trial at index 3; events_metadata
# fills output positions that have no value with 0.
trials[3]

array([ 1,  3,  5,  7,  9, 13, 15, 23, 17, 19, 21, 27, 25, 31, 33, 35, 37,
       39,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0])