**Imports**

In [None]:
import pandas as pd
import numpy as np
from os import path
from IPython.display import display
from tqdm.notebook import tqdm
from python_scripts.utils import loc_utils as lut

# Combine raw data files

## Main data files

Concatenate main raw data and codify participant IDs into a more readable form.

In [None]:
def combine_main_raw(input_data_paths, save_path, save_codes=''):
    """Concatenate raw main data files and replace participant IDs with integer codes.

    Every input CSV must contain a ``sid`` column with the raw participant ID.
    IDs are coded per file (in the order of the pandas categorical categories)
    and the numbering continues across files, so codes from different files
    never collide.

    Args:
        input_data_paths: Iterable of CSV paths to combine, in order.
        save_path: Destination CSV for the combined data; falsy to skip saving.
        save_codes: Destination CSV for the lookup table with columns ``code``
            and ``uid``; falsy to skip saving.
    """
    frames = []
    code_to_uid = {}
    next_code = 0    # first code to hand out for the file currently being read
    for input_data_path in input_data_paths:
        df = pd.read_csv(input_data_path)

        # Codify subject IDs
        uniqids = df.sid.astype('category')
        categories = uniqids.cat.categories

        # `cat.codes` uses the smallest integer dtype that fits the categories of
        # this file alone, so widen it before adding the cross-file offset.
        df['sid'] = uniqids.cat.codes.astype('int64') + next_code
        code_to_uid.update(enumerate(categories, next_code))

        next_code += len(categories)
        frames.append(df)

    # Combine dataframes
    df = pd.concat(frames)
    df_codes = pd.DataFrame({'code': list(code_to_uid.keys()), 'uid': list(code_to_uid.values())})

    # Save combined data
    if save_path:
        print('saving combined data to {}'.format(path.abspath(save_path)))
        df.to_csv(path.join(save_path), index=False)

    if save_codes:
        # Report the codes file itself, not the combined-data path
        print('saving codes to {}'.format(path.abspath(save_codes)))
        df_codes.to_csv(path.join(save_codes), index=False)
    
# Combine the two raw main files and write both the combined data and the code lookup table.
combine_main_raw(
    input_data_paths = ('data/raw/ig_main.csv', 'data/raw/eg_main.csv'),
    save_path = 'data/combined_main.csv',
    save_codes = 'data/uid_codes.csv'
)

## Extra data files (self-reports)

Concatenate extra raw data and codify participant IDs using codes from the previous function. Then convert data to long format and clean up for convenience.

In [None]:
def combine_extra_raw(input_data_paths, main_data_path, codes_path, save_path=''):
    """Combine raw self-report files into a long-format ratings table.

    Raw participant IDs are replaced by the integer codes stored in
    ``codes_path``. Columns named ``<stub>.<suffix>`` are melted into one
    ``rating`` column, with the stub stored as ``item`` and the suffix as
    ``family``. Each family is then mapped to its activity using the main
    data, and a per-subject mean-centred rating (``rating_norm``) is added.

    Args:
        input_data_paths: Iterable of CSV paths with the raw self-report data.
        main_data_path: CSV providing ``sid``, ``family`` and ``activity`` columns.
        codes_path: CSV with ``code`` and ``uid`` columns.
        save_path: Destination CSV; falsy to skip saving.
    """
    # Combine data and use previously generated codes
    df = pd.concat([pd.read_csv(p) for p in input_data_paths]).set_index('sid')
    codes = pd.read_csv(codes_path).rename(columns={'uid': 'sid'}).set_index('sid')
    df = df.merge(codes, on='sid').reset_index()
    # Replace the whole column so `sid` takes the dtype of the codes; an in-place
    # `.loc` write can leave an object column that is later merged against ints.
    df['sid'] = df['code']
    df.drop(columns=['code', 'age', 'gender', 'race', 'ethnicity', 'thoughts', 'comments'], inplace=True)

    # Reformat column names ("a.b.suffix" -> "ab%suffix") to convert to long format
    rename_dict = {}
    for col in df.columns:
        if '.' in col:
            *stub_parts, suffix = col.split('.')
            rename_dict[col] = '%'.join([''.join(stub_parts), suffix])
    df = df.rename(columns=rename_dict)

    # Convert to long format
    df = pd.melt(df, id_vars=['sid', 'group'], value_vars=None, var_name='item%family', value_name='rating')
    split_cols = df.loc[:, 'item%family'].str.split('%', expand=True)
    split_cols.columns = ['item', 'family']
    df = pd.concat([df, split_cols], axis=1).filter(items=['sid', 'group', 'family', 'item', 'rating'])

    # Identify activity type for each family
    act_df = pd.read_csv(main_data_path).filter(items=['sid', 'family', 'activity']).drop_duplicates()
    df = df.merge(act_df, on=['sid', 'family']).filter(items=['sid', 'group', 'activity', 'item', 'rating'])

    # Sort values and rename items
    df = df.sort_values(by=['group', 'sid', 'item', 'activity']).reset_index(drop=True)
    df['item'] = df.item.replace({'futurelearn0': 'lrn1',
                                  'futurelearn1': 'lrn2',
                                  'interested': 'int',
                                  'progress': 'prog',
                                  'time': 'time',
                                  'rule': 'rule',
                                  'complex': 'comp'
                                  })

    # Add normalized scores. Only `rating` is averaged: taking the mean of the
    # whole frame would also hit the string columns (group, activity, item).
    mean_ratings = df.groupby('sid')['rating'].mean().rename('norm').reset_index()
    df = df.merge(mean_ratings, on='sid')
    df['rating_norm'] = df.rating - df.norm
    df = df.drop(columns='norm')
    display(df.head())

    # Save combined data
    if save_path:
        print('saving combined data to {}'.format(path.abspath(save_path)))
        df.to_csv(path.join(save_path), index=False)

        

# Combine the two raw self-report files, reusing the main data and the code table written earlier.
combine_extra_raw(
    input_data_paths = ('data/raw/ig_extra.csv', 'data/raw/eg_extra.csv'),
    main_data_path = 'data/combined_main.csv',
    codes_path = 'data/uid_codes.csv',
    save_path = 'data/combined_extra.csv'
)

# Exclude outliers

Exclude outliers based on allocation bias and response bias. Report the number of exclusions in each group based on allocation bias, then exclude from the remaining data according to response bias and report.

In [None]:
def make_clean_dataset(input_data_path, save_path, **kwargs):
    """Flag outlier subjects by allocation bias and response bias and optionally drop them.

    Two per-subject values are written to every row of that subject:

    * ``alloc_bias``: standard deviation of the subject's trial counts over
      activities A1-A4 (left at 0 when no allocation criterion is given).
    * ``resp_bias``: share of the most frequent response within each family,
      averaged over families.

    Args:
        input_data_path: CSV with ``sid``, ``group``, ``activity``, ``family``
            and ``response`` columns.
        save_path: Destination CSV; falsy to skip saving.
        **kwargs:
            ab_crit: Allocation-bias threshold (flag if ``alloc_bias >= ab_crit``);
                ``None`` or any falsy value disables this criterion.
            rb_crit: Number of standard deviations above the mean response bias
                beyond which a subject is flagged (required).
            exclude: Whether flagged subjects are removed from the output
                (default ``True``).
    """
    ab_crit = kwargs.get('ab_crit')
    rb_crit = kwargs['rb_crit']
    # Use the caller's flag rather than whatever global named `exclude` happens to exist
    exclude_outliers = kwargs.get('exclude', True)

    # Define a response bias function
    def rbf(x):
        _, response_counts = np.unique(x.response, return_counts=True)
        return np.max(response_counts) / np.sum(response_counts)

    # Open combined data file
    df = pd.read_csv(input_data_path, index_col=None).set_index('sid')

    # Initialize columns to record values of interest (float, since both hold fractions)
    df['alloc_bias'], df['resp_bias'] = 0.0, 0.0

    # Calculate values of interest
    activities = ('A1', 'A2', 'A3', 'A4')
    for sid, sdf in tqdm(df.groupby(by='sid'), desc='Progress: '):
        # Allocation variance
        if ab_crit:
            counts = [(sdf.activity == a).sum() for a in activities]
            df.loc[sid, 'alloc_bias'] = np.std(counts)

        # Response bias
        response_bias = sdf.groupby('family').apply(rbf).mean()
        df.loc[sid, 'resp_bias'] = response_bias

    # Detect high allocation variance and response bias (one row per subject)
    df_ = df.reset_index().groupby('sid').head(1).reset_index()
    if ab_crit:
        df_['high_ab'] = df_.alloc_bias >= ab_crit
    else:
        # Criterion disabled: comparing against None is not a valid ordering
        df_['high_ab'] = False
    rb_threshold = df_.resp_bias.mean() + rb_crit * df_.resp_bias.std()
    df_['high_rb'] = np.logical_and(df_.resp_bias > rb_threshold, ~df_.high_ab)

    display(df_.groupby(by='group')[['high_ab', 'high_rb']].sum().astype(int))
    print('Found {} outliers'.format(np.logical_or(df_.high_ab, df_.high_rb).sum()))

    # Exclude outliers
    outlier = df_.loc[df_.high_ab | df_.high_rb, 'sid']
    df = df.loc[~df.index.isin(outlier), :] if exclude_outliers else df
    display(df.reset_index().groupby(by='group')['sid'].nunique())

    # Save data
    if save_path:
        print('saving to {}'.format(path.abspath(save_path)))
        df.reset_index().to_csv(save_path, index=False)
    

# Toggle outlier exclusion; the output file name reflects the choice.
exclude = True
save_path = 'data/clean_data.csv' if exclude else 'data/unclean_data.csv'

make_clean_dataset(
    input_data_path = 'data/combined_main.csv',
    save_path = save_path,

    # Set outlier criteria
    ab_crit = None,   # allocation variance critical value
    rb_crit = 2 ,    # response bias critical value
    
    # Exclude outliers?
    exclude = exclude
)

# Calculate heuristics

## Time-window approach

|Heuristic|Description ($t_i$ = trial number $i$; $w$ = window size)|
|:-------:|:--------------------------------------------------------|
| **PC**  | overall competence ($t_0$ to $t_i$)                     |
| **rPC** | recent competence ($t_{i-w}$ to $t_i$)                  |
| **rLP** | recent learning progress ($t_{i-w}$ to $t_i$)           |
| **SC**  | self-challenge                                          |

In [None]:
def rlp_func(x, w, abs_lp=False):
    '''
    Compute recent learning progress (LP) over the sequence x.
    LP is the mean of the last 9 elements of x (x[-9:]) minus the mean of its first 10 elements (x[:10]).
    If `abs_lp` is truthy, the absolute value of that difference is returned; otherwise the signed difference.
    `w` is accepted for the caller's convenience but is not used in the computation.
    '''
    early_mean = np.mean(x[:10])
    late_mean = np.mean(x[-9:])
    progress = late_mean - early_mean
    if abs_lp:
        return np.abs(progress)
    return progress


def make_heuristics_dataset(input_data_path, save_path='', **kwargs):
    """Compute time-window performance heuristics for every subject and trial.

    For each activity A1-A4 the following columns are added (suffix = activity number):

    * ``pc``: cumulative mean of ``correct`` over the subject's trials on that activity.
    * ``rpc``: rolling mean of ``correct`` over the last ``pc_window`` trials on that activity.
    * ``rlp``: ``rlp_func`` applied to a rolling window of ``lp_window`` trials, in absolute value.

    Values are forward-filled within a subject so every row carries the latest
    value for all four activities. ``sc`` (self-challenge) is
    ``1 - (current_rpc - rpc_min) / (rpc_max - rpc_min)``, where ``current_rpc``
    is the rpc of the activity on that row and ``rpc_min``/``rpc_max`` are running
    extremes of the row-wise min/max over ``rpc1``-``rpc4``. It is set to NaN on
    rows whose stage is ``'train'``.

    Args:
        input_data_path: CSV with ``sid``, ``activity``, ``group``, ``stage``,
            ``trial`` and ``correct`` columns.
        save_path: Destination CSV; falsy to skip saving.
        **kwargs:
            pc_window: Window size for recent competence (required).
            lp_window: Window size for recent learning progress (required).
    """
    pc_window = kwargs['pc_window']
    lp_window = kwargs['lp_window']

    # Read clean data and drop unused data
    df = pd.read_csv(input_data_path, index_col=None).set_index(['sid','activity'])
    df = df.loc[:, 'group,stage,trial,correct'.split(',')]
    df = df.loc[df.trial <= 60+250].copy()    # copy so new columns are added to an independent frame

    # Add new columns
    activities = 'A1,A2,A3,A4'.split(',')
    for heuristic in ['pc','rpc','rlp']:
        for a in activities:
            df['{}{}'.format(heuristic, a[1])] = np.nan
    df['sc'] = np.nan

    # Calculate dynamic performance heuristics for each subject
    act_codes = {'A1':1, 'A2':2, 'A3':3, 'A4':4}
    for i, sdf in tqdm(df.groupby('sid'), desc='Progress'):
        for a in activities:
            x = sdf.loc[(i, a), 'correct'].astype(int)

            # Overall competence (pc)
            pc = np.cumsum(x) / np.arange(1, x.size+1)
            df.loc[(i, a), 'pc{}'.format(a[1])] = pc

            # Recent competence (rpc)
            rpc = x.rolling(min_periods=pc_window, window=pc_window).mean()
            df.loc[(i, a), 'rpc{}'.format(a[1])] = rpc

            # Recent learning progress (rlp). The window and its min_periods both use
            # lp_window, and the absolute-LP flag is passed explicitly (the window size
            # used to be passed in its place, which only worked because it is truthy).
            rlp = x.rolling(min_periods=lp_window, window=lp_window).apply(
                rlp_func, args=(lp_window, True), raw=False
            )
            df.loc[(i, a), 'rlp{}'.format(a[1])] = rlp
        
        # Carry the latest value of every heuristic forward within the subject
        df.loc[(i, slice(None)), :] = df.loc[(i, slice(None)), :].ffill(axis=0)

        # Self-challenge (sc)
        rpc_max = df.loc[(i, slice(None)), 'rpc1':'rpc4'].max(axis=1).rolling(min_periods=1, window=250).max()
        rpc_min = df.loc[(i, slice(None)), 'rpc1':'rpc4'].min(axis=1).rolling(min_periods=1, window=250).min()
        act_inds = np.array([act_codes[a] for a in sdf.index.get_level_values(1).tolist()]) - 1
        current_rpc = df.loc[(i, slice(None)), 'rpc1':'rpc4'].values[np.arange(sdf.shape[0]), act_inds]
        sc = 1 - (current_rpc-rpc_min)/(rpc_max-rpc_min)
        df.loc[(i, slice(None)), 'sc'] = sc

    df = df.reset_index().sort_values(by=['sid', 'trial'])
    df.loc[df.stage=='train', 'sc'] = np.nan    # SC is not defined in familiarization stage
    display(df.loc[(df.sid == 5) & (df.trial >= 60) & (df.trial < 90), :])    # Display data excerpt
    
    # Save data
    if save_path:
        print('saving to {}'.format(path.abspath(save_path)))
        df.to_csv(save_path, index=False)
    
    
# Compute the time-window heuristics from the clean data, with 15-trial windows for both rPC and rLP.
make_heuristics_dataset(
    input_data_path = 'data/clean_data.csv',
    save_path = 'data/heuristics_data.csv',
    pc_window = 15,
    lp_window = 15
)

## Q-learning and RPE approach

In [None]:
def q_func(outcomes, init, lrate):
    """Run a delta-rule (Q-learning) update over a sequence of outcomes.

    Starting from ``init``, each outcome ``o`` produces a prediction error
    ``ope = o - q`` and the updated estimate ``q + lrate * ope``.

    Args:
        outcomes: Sequence (or scalar) of observed outcomes, in trial order.
        init: Initial value estimate held before the first outcome.
        lrate: Learning rate applied to each prediction error.

    Returns:
        Tuple ``(qs, opes)`` of arrays with one entry per outcome: the estimate
        after each update and the prediction error that produced it. Both are
        empty when ``outcomes`` is empty.
    """
    # atleast_1d also covers a 0-d array, e.g. from squeezing a single-trial column
    outcomes = np.atleast_1d(np.asarray(outcomes))
    qs, opes = [], []
    q = init
    # Every outcome, including the first one, is used exactly once
    for o in outcomes:
        ope = o - q
        q = q + lrate * ope
        opes.append(ope)
        qs.append(q)
    return np.array(qs), np.array(opes)


def make_heuristics_dataset(input_data_path, save_path, init_q, lrate, ope_smoothing, **kwargs):
    """Compute Q-learning confidence and smoothed prediction errors per subject and activity.

    For each activity A1-A4 two columns are added (suffix = activity number):

    * ``conf``: the running estimate returned by ``q_func`` for the ``correct`` outcomes.
    * ``ope``: absolute value of the rolling mean (window ``ope_smoothing``) of the
      outcome prediction errors returned by ``q_func``.

    Values are forward-filled within a subject so every row carries the latest
    value for all four activities.

    Args:
        input_data_path: CSV with ``sid``, ``activity``, ``group``, ``stage``,
            ``trial`` and ``correct`` columns.
        save_path: Destination CSV; falsy to skip saving.
        init_q: Initial estimate passed to ``q_func``.
        lrate: Learning rate passed to ``q_func``.
        ope_smoothing: Rolling-window size used to smooth prediction errors.
        **kwargs: Accepted for call compatibility; not used.
    """
    # Read clean data and drop unused data
    df = pd.read_csv(input_data_path, index_col=None).set_index(['sid','activity'])
    df = df.loc[:, 'group,stage,trial,correct'.split(',')]
    df = df.loc[df.trial <= 60+250].copy()    # copy so new columns are added to an independent frame

    # Add new columns
    activities = 'A1,A2,A3,A4'.split(',')
    for heuristic in ['conf','ope']:
        for a in activities:
            df['{}{}'.format(heuristic, a[1])] = np.nan

    # Calculate dynamic performance heuristics for each subject
    for i, sdf in tqdm(df.groupby('sid'), desc='Progress'):
        for a in activities:
            x = sdf.loc[(i, a), 'correct'].astype(int)

            # Compute confidence and outcome prediction errors using Q-updating
            conf, opes = q_func(x.values.squeeze(), init_q, lrate)
            df.loc[(i, a), 'conf{}'.format(a[1])] = conf

            # Smoothed absolute outcome prediction error (ope)
            opes = np.abs(pd.Series(opes).rolling(window=ope_smoothing, min_periods=1).mean()).values
            df.loc[(i, a), 'ope{}'.format(a[1])] = opes
        
        # Carry the latest value of every heuristic forward within the subject
        df.loc[(i, slice(None)), :] = df.loc[(i, slice(None)), :].ffill(axis=0)

    df = df.reset_index().sort_values(by=['sid', 'trial'])
    display(df.loc[(df.sid == 0) & (df.trial >= 1) & (df.trial < 70), :])    # Display data excerpt
    
    # Save data
    if save_path:
        print('saving to {}'.format(path.abspath(save_path)))
        df.to_csv(save_path, index=False)
    
    
# Compute the Q-learning based heuristics from the clean data and save them to a separate file.
make_heuristics_dataset(
    input_data_path = 'data/clean_data.csv',
    save_path = 'data/heuristics_data_alt.csv',
    init_q = .5,
    lrate = .2,
    ope_smoothing = 6,
)

# NAM designation

## NAM dataset

In [None]:
def get_mps(df, **kwargs):
    '''
    Find mastery points: for every column of df, the row position of its first non-zero
    entry, or kwargs['invalid_val'] when the column has none.
    The result is a Series (one entry per column) with dtype kwargs['dtype'].
    '''
    nonzero = df.values != 0
    reached = nonzero.any(axis=0)
    first_hit = nonzero.argmax(axis=0)
    points = np.where(reached, first_hit, kwargs['invalid_val'])
    return pd.Series(points, dtype=kwargs['dtype'])


def make_nam_dataset(input_data_path, save_path, **kwargs):
    """Build a per-subject table of mastery points and number of activities mastered (NAM).

    Rows with trial 60 to 310 are kept and re-numbered from 0. An activity counts
    as mastered once its recent competence (``rpc1``-``rpc3``) reaches ``crit``.

    Output columns: ``sid``, ``group``, ``nam`` (how many of the three activities
    reached the criterion) and ``mp1``-``mp3`` (row position of the first trial
    at which the criterion was reached, or 250 if it never was).

    Args:
        input_data_path: CSV with ``sid``, ``group``, ``trial`` and ``rpc1``-``rpc3`` columns.
        save_path: Destination CSV; falsy to skip saving.
        **kwargs:
            crit: Mastery criterion compared against recent competence (required).
    """
    # Load data
    df = pd.read_csv(input_data_path, index_col='sid')

    # Select free-play trials (copy so the trial shift does not write into a slice)
    df = df.loc[(df.trial <= 60+250) & (df.trial >= 60)].copy()
    df['trial'] -= 60
    
    # Get group dataset
    group_df = df.groupby('sid').head(1)[['group']]

    # Evaluate each trial's recent PC to True if mastery criterion was reached
    mastered = df.reset_index().set_index(['sid','trial']).loc[:, 'rpc1':'rpc3'] >= kwargs['crit']
    
    # For each subject, find mastery points and NAM
    by_sid = mastered.groupby('sid')
    mastery_points = by_sid.apply(get_mps, invalid_val=250, dtype='int')
    mastery_points.rename(columns={0:'mp1', 1:'mp2', 2:'mp3'}, inplace=True)
    nam = by_sid.any().sum(axis=1).to_frame(name='nam')

    # Display output dataset excerpt
    nam = group_df.merge(nam, on='sid')
    nam_df = nam.merge(mastery_points, on='sid').reset_index()
    display(nam_df.head(10))
    
    # Save data
    if save_path:
        print('saving to {}'.format(path.abspath(save_path)))
        nam_df.to_csv(save_path, index=False)


# Build the NAM dataset with a mastery criterion of 13/15 on recent competence.
make_nam_dataset(
    input_data_path = 'data/heuristics_data.csv', 
    save_path = 'data/nam_data.csv',
    crit = 13/15)

## Mastery as a function of criterion dataset

In [None]:
def make_nam_dataset(input_data_path, save_path, crits=(10, 11, 12, 13, 14), pc_window=15):
    """Build a long-format table of mastery (0/1) per subject, activity and criterion.

    Rows with trial 60 to 310 are kept. For every criterion ``c`` in ``crits``,
    an activity counts as mastered if its recent competence (``rpc1``-``rpc3``)
    reached ``c / pc_window`` on any kept trial.

    Output columns: ``sid``, ``group``, ``activity``, ``crit_val``, ``mastery``.

    Args:
        input_data_path: CSV with ``sid``, ``group``, ``trial`` and ``rpc1``-``rpc3`` columns.
        save_path: Destination CSV; falsy to skip saving.
        crits: Criteria to evaluate, as numbers of correct trials out of ``pc_window``.
        pc_window: Denominator converting each criterion into a proportion.
    """
    # Load data
    df = pd.read_csv(input_data_path, index_col='sid')

    # Select free-play trials (copy so the trial shift does not write into a slice)
    df = df.loc[(df.trial <= 60+250) & (df.trial >= 60)].copy()
    df['trial'] -= 60
    
    # Get group dataset
    group_df = df.groupby('sid').head(1)[['group']]

    # For different criteria, calculate if mastery points were reached
    rpc = df.reset_index().set_index(['sid','trial']).loc[:, 'rpc1':'rpc3']    # same for every criterion
    dfs = []
    for crit in crits:
        mastered = rpc >= crit/pc_window
        mastered = mastered.groupby('sid').any()
        mastered.rename(columns={'rpc1':'mp_1', 'rpc2':'mp_2', 'rpc3':'mp_3'}, inplace=True)
        mastered = pd.wide_to_long(mastered.reset_index(), 'mp', i='sid', j='activity', sep='_')
        mastered = mastered.sort_values(by=['sid', 'activity']).astype(int)
        mastered = mastered.rename(columns={'mp':f'crit_{crit}'})
        dfs.append(mastered)
    
    # Join dfs
    df = pd.concat(dfs, axis=1).reset_index()
    df = pd.wide_to_long(df, 'crit', i=['sid','activity'], j='crit_val', sep='_').reset_index()
    df = df.merge(group_df, on='sid').rename(columns={'crit':'mastery'})
    df = df.filter(items=['sid','group','activity','crit_val','mastery'])
    display(df.sort_values(by=['sid','activity','crit_val']).head(10))
    
    # Save data
    if save_path:
        print('saving to {}'.format(path.abspath(save_path)))
        df.to_csv(save_path, index=False)


# Build the mastery-by-criterion dataset from the time-window heuristics.
make_nam_dataset(
    input_data_path = 'data/heuristics_data.csv', 
    save_path = 'data/mcrits.csv')

# Learning dataset

In [None]:
def make_learning_dataset(heuristics_data_path, nam_data_path, save_path='', **kwargs):
    """Summarise each subject's free-play learning into one row.

    Output columns: ``sid``, ``group``, ``nam``, ``progressive`` (whether the
    mastery points mp1-mp3 are in increasing order), ``dwipc``/``dwfpc``
    (difficulty-weighted recent competence on the first/last free-play trial),
    ``ipc``/``fpc`` (their unweighted means over rpc1-rpc3), ``sc_flat`` (mean
    self-challenge) and ``sc_lep`` (mean of the per-interval self-challenge means,
    where intervals are bounded by 0, the mastery points and 250).

    Args:
        heuristics_data_path: CSV produced by the time-window heuristics step.
        nam_data_path: CSV with ``sid``, ``group``, ``nam`` and ``mp1``-``mp3`` columns.
        save_path: Destination CSV; falsy to skip saving.
        **kwargs:
            difficulty_weights: Array of three weights applied to rpc1-rpc3 (required).
    """
    difficulty_weights = kwargs['difficulty_weights']

    # Load heuristics data
    df = pd.read_csv(heuristics_data_path, index_col='sid')
    
    # Add NAM classification
    df = df.merge(pd.read_csv(nam_data_path, index_col='sid').drop(columns='group'), on='sid')
    
    # Annotate switch trials: a row is a switch if its activity differs from the previous row's
    choices = df.activity.values
    switches = np.zeros(len(df), dtype='int64')
    switches[1:] = choices[:-1] != choices[1:]
    df['switch'] = switches
    df.loc[df.stage=='train', 'switch'] = 0  # changing activity during forced stage is not switching
    df.loc[df.trial==61, 'switch'] = 0       # choosing activity for the first time is not switching
    
    # Select free-play trials and re-number them from 0 (copy so the shift does not write into a slice)
    df = df.loc[(df.trial <= 60+250) & (df.trial > 60)].copy()
    df['trial'] -= 61
    
    # For each subject, compute learning stats from free play stage
    df.reset_index(inplace=True)
    outdf = []
    for i, sdf in tqdm(df.groupby('sid'), desc='Progress: '):
        _sdf = sdf.set_index('trial') # index subject data by trial
        last = _sdf.shape[0] - 1      # label of the last trial, assuming trials run 0..n-1 without gaps
    
        # Get subject information (group, nam, mps) as a pandas Series
        profile = _sdf.head(1)[['group', 'nam', 'mp1', 'mp2', 'mp3']].iloc[0]

        # Get intervals between consecutive mastery points
        mps = profile['mp1':'mp3'].values
        sorted_lep_bounds = np.sort(np.unique([0] + mps.tolist() + [250]))  
        lep_intervals = pd.IntervalIndex.from_arrays(sorted_lep_bounds[:-1], sorted_lep_bounds[1:], closed='right')

        # Calculate self-challenge (SC) summaries
        sc = _sdf.sc
        sc_flat = np.mean(sc)
        sc_lep = sc.groupby(pd.cut(_sdf.index.astype(int), lep_intervals)).mean().mean()   

        # Calculate weighted initial (dwipc) and final (dwfpc) performances (+ flat performances)
        dwipc = (_sdf.loc[0, 'rpc1':'rpc3'].values * difficulty_weights).sum()
        dwfpc = (_sdf.loc[last, 'rpc1':'rpc3'].values * difficulty_weights).sum()
        ipc = _sdf.loc[0, 'rpc1':'rpc3'].sum()/3
        fpc = _sdf.loc[last, 'rpc1':'rpc3'].sum()/3
        
        # Get profile info and see if subject mastered activities in order of difficulty
        sid = i
        group = profile['group']
        nam = profile['nam']
        progressive = (np.diff(np.array([1,2,3])[np.argsort(mps)]) == 1).all()
        
        # Store subject's learning stats
        outdf.append(
            pd.Series(
                data = [sid,group,nam,progressive,dwipc,dwfpc,ipc,fpc,sc_flat,sc_lep], 
                index='sid,group,nam,progressive,dwipc,dwfpc,ipc,fpc,sc_flat,sc_lep'.split(',')
            )
        )
    
    outdf = pd.DataFrame(outdf).sort_values(by=['group','sid'])
    display(outdf.head())
    display(outdf.groupby('group').mean())
    
    # Save data
    if save_path:
        print('saving to {}'.format(path.abspath(save_path)))
        outdf.to_csv(save_path, index=False)
    

# Build the learning dataset; difficulty weights 1/6, 2/6 and 3/6 are applied to rpc1-rpc3.
make_learning_dataset(
    heuristics_data_path = 'data/heuristics_data.csv',
    nam_data_path = 'data/nam_data.csv',
    save_path = 'data/learning_data.csv',
    difficulty_weights = np.array([1,2,3])/6
)

# Choice-modeling dataset

In [None]:
def prep_modeling_data(heuristics_data_path, nam_data_path, save_path):
    """Prepare the trial-level dataset used for choice modelling.

    Adds, for each activity 1-4: ``ch`` (one-hot activity choice on the row),
    ``abst`` (cumulative number of trials the subject spent on the activity),
    ``relt`` (``abst`` divided by the subject's running trial count) and
    ``prev`` (one-hot choice of the subject's previous row; NaN on the first row).
    Rows with trial <= 60 are dropped at the end and trials are re-numbered from 1.

    Args:
        heuristics_data_path: CSV produced by the time-window heuristics step.
        nam_data_path: CSV with ``sid``, ``group`` and NAM columns.
        save_path: Destination CSV; falsy to skip saving.
    """
    # Load data
    df = pd.read_csv(heuristics_data_path, index_col='sid')
    
    # Combine with NAM dataset
    df = df.merge(pd.read_csv(nam_data_path, index_col='sid').drop(columns='group'), on='sid')
    
    # Encode activity choices as one-hot vectors. Integer dummies keep the columns
    # summable and written as 0/1 regardless of the pandas default dummy dtype.
    activity_codes = df.activity.str.get(1)
    df = pd.concat([df, pd.get_dummies(activity_codes, prefix='ch', prefix_sep='', dtype=int)], axis = 1)
    
    add_data, act_inds = [], ['1','2','3','4']
    for _, sdf in tqdm(df.groupby('sid'), desc='Progress'):
        choices = sdf.loc[:, 'ch1':'ch4']

        # Get relative time data
        trials_per_activity = choices.cumsum(axis=0)
        trials_total = np.tile(np.arange(trials_per_activity.shape[0]) + 1, [4, 1]).T
        relt = trials_per_activity / trials_total
        relt.columns = ['relt' + ind for ind in act_inds]
        
        # Get absolute time data
        abst = trials_per_activity
        abst.columns = ['abst' + ind for ind in act_inds]
        
        # Get previous trial choice data: shift returns a new frame, so the subject's
        # data is not modified and the first row becomes NaN without slice assignment
        prev = choices.shift(1)
        prev.columns = ['prev' + ind for ind in act_inds]
        
        # Store into list
        add_data.append([relt, prev, abst])
    
    add_data = pd.concat([pd.concat(h, axis=0) for h in zip(*add_data)], axis=1)
    df = pd.concat([df, add_data], axis=1).reset_index()

    # Exclude training trials (copy so the trial shift does not write into a slice)
    df = df.loc[df.trial.gt(60), :].copy()
    df['trial'] -= 60
    
    # Save data
    if save_path:
        print('saving to {}'.format(path.abspath(save_path)))
        df.to_csv(save_path, index=False)
    

# Build the choice-modelling dataset from the heuristics and NAM datasets.
prep_modeling_data(
    heuristics_data_path = 'data/heuristics_data.csv',
    nam_data_path = 'data/nam_data.csv',
    save_path = 'data/model_data.csv'
)

# Fitted parameters dataset

In [None]:
def prep_fitted_data(fitted_models_data_path, save_path=''):
    """Expand comma-separated fitted parameters into one column per variable.

    Each input row is identified by ``sid`` and ``vars`` (a comma-separated list
    of the variables in the model) and stores its fitted values in ``params`` as
    a comma-separated string whose last entry is ``tau``. Rows with a missing
    ``aic`` are skipped. Variables absent from a model get NaN.

    Args:
        fitted_models_data_path: CSV with ``sid``, ``vars``, ``params``, ``aic``,
            ``group`` and ``nam`` columns.
        save_path: Destination CSV; falsy to skip saving.
    """
    # Load data
    df = pd.read_csv(fitted_models_data_path).set_index(['sid', 'vars'])

    # Initialize empty dict to turn into DF.
    # The longest `vars` string is taken as the full variable list
    # (presumably the full model contains every variable; verify against the fits file).
    var_names = df.index.get_level_values(1).to_series().unique()
    var_names = max(list(var_names), key=len).split(',')
    col_names = ['sid', 'vars'] + var_names + ['tau']
    df_dict = {name: [] for name in col_names}
    
    # Iterate through DF and extract model params from stored csv strings
    for (sid, model_vars), row in df.iterrows():
        # NaN must be detected by value: an identity test against np.nan misses
        # NaN objects coming out of a DataFrame
        if pd.isna(row.aic):
            continue
        df_dict['sid'].append(sid)
        df_dict['vars'].append(model_vars)
        params = [float(p) for p in row.params.split(',')]
        df_dict['tau'].append(params.pop())
        vars_included = model_vars.split(',')
        for vn in var_names:
            df_dict[vn].append(params[vars_included.index(vn)] if vn in vars_included else np.nan)

    # Generate DF from df_dict and merge with initial DF
    df = df.filter(items=['group','nam','aic']).merge(
        right = pd.DataFrame(df_dict).set_index(['sid', 'vars']),
        on = ['sid', 'vars']
    ).reset_index()
    
    display(df.head())

    # Save data
    if save_path:
        print('saving to {}'.format(path.abspath(save_path)))
        df.to_csv(save_path, index=False)
    

# Clean the raw parameter fits into one column per model variable.
prep_fitted_data(
    fitted_models_data_path = 'data/model_results/param_fits_raw.csv',
    save_path = 'data/model_results/param_fits_clean.csv'
)

# Exposure-competence dataset

In [None]:
def prep_data(data_path, nam_data_path, save_path):
    """Build average accuracy-by-exposure curves per NAM level and activity.

    For each NAM value (1, 2, 3) and each activity, the subjects' ``correct``
    sequences are stacked (padded with NaN via ``lut.boolean_indexing``) and
    averaged position by position. Each curve is extended to 310 points by
    repeating the mean of its last (up to) 50 values.

    Output columns: ``nam``, ``A1``-``A4``.

    Args:
        data_path: CSV with ``sid``, ``activity``, ``stage`` and ``correct`` columns.
        nam_data_path: CSV with ``sid`` and ``nam`` columns.
        save_path: Destination CSV; falsy to skip saving.
    """
    full_df = pd.read_csv(data_path, index_col='sid').filter(items=['activity', 'stage', 'correct'])
    full_df = full_df.merge(pd.read_csv(nam_data_path, index_col='sid').filter(items=['nam']), on='sid')
    display(full_df.head())
    activities = ['A1','A2','A3','A4']
    
    out_dfs = []
    for nam in [1,2,3]:
        # Collect one list of outcomes per subject, grouped by activity
        out_dict = {a: [] for a in activities}
        df = full_df.loc[full_df.nam.eq(nam), :]
        for _, sdf in df.groupby('sid'):
            for act_ind, sub_sdf in sdf.groupby('activity'):
                out_dict[act_ind].append(sub_sdf.loc[:, 'correct'].tolist())

        curves = {}
        for k, v in out_dict.items():
            rect_array = lut.boolean_indexing(v, fillval=np.nan)
            pc = np.nanmean(rect_array, axis=0)
            data_size = pc.size
            pad_size = 310 - data_size
            # Clamp the start so curves shorter than 50 points average what they have
            # instead of producing an empty slice from a negative start index
            tail_mean = pc[max(data_size-50, 0):data_size].mean()
            curves[k] = np.concatenate([pc, np.full(pad_size, tail_mean)])

        df = pd.DataFrame(curves).ffill()
        df['nam'] = nam
        out_dfs.append(df)
    
    df = pd.concat(out_dfs, axis=0).filter(items=['nam']+activities)
    display(df.head())
    
    # Save data
    if save_path:
        print('saving to {}'.format(path.abspath(save_path)))
        df.to_csv(save_path, index=False)
    
    
# Build the exposure-competence dataset with NumPy floating-point warnings suppressed for the call.
with np.errstate(all='ignore'):
    prep_data(
        data_path = 'data/clean_data.csv',
        nam_data_path = 'data/nam_data.csv',
        save_path = 'data/exposure_data.csv'
    )