**Imports**

In [None]:
import pandas as pd
import numpy as np
from os import path
from IPython.display import display
from tqdm.notebook import tqdm
from python_scripts.utils import loc_utils as lut

# Combine raw data files
Concatenate raw data and codify participant IDs into a more readable form.

In [None]:
def combine_main_raw(input_data_paths, save_path):
    '''Concatenate raw main-task CSV files and codify subject IDs.

    Parameters
    ----------
    input_data_paths : iterable of str
        Paths of the raw CSV files to combine; each must have a ``sid`` column.
    save_path : str
        Path of the combined CSV file to write.
    '''
    # Open main files and combine them; a fresh RangeIndex avoids the duplicate
    # row labels that stacking several files would otherwise produce
    df = pd.concat(
        [pd.read_csv(p) for p in input_data_paths],
        ignore_index=True,
    )

    # Codify subject IDs. The column is replaced outright instead of being
    # written through ``.loc[:, 'sid']``: putting integer codes into a column of
    # another dtype in place is something recent pandas warns about or rejects.
    df['sid'] = df['sid'].astype('category').cat.codes

    # Save combined data
    print('saving to {}'.format(path.abspath(save_path)))
    df.to_csv(save_path, index=False)
    
    
# Merge the two raw main-task files into a single CSV with codified subject IDs
combine_main_raw(
    input_data_paths=('data/raw/ig_main.csv', 'data/raw/eg_main.csv'),
    save_path='data/combined_main.csv',
)

# Exclude outliers
Exclude outliers based on allocation bias and response bias. Report the number of exclusions in each group based on allocation bias, then exclude participants from the remaining data according to response bias and report those exclusions as well.

In [None]:
def make_clean_dataset(input_data_path, save_path, **kwargs):
    '''Flag outlier participants, drop them and save the cleaned trial data.

    Two statistics are computed per participant:

    * ``alloc_bias`` -- standard deviation of the participant's trial counts
      over activities A1-A4;
    * ``resp_bias`` -- share of the most frequent response within each
      ``family``, averaged over families.

    A participant is excluded when ``alloc_bias >= ab_crit`` or, if not already
    flagged on that ground, when ``resp_bias`` exceeds the mean over all
    participants by more than ``rb_crit`` standard deviations.

    Parameters
    ----------
    input_data_path : str
        CSV file with ``sid``, ``group``, ``activity``, ``family`` and
        ``response`` columns.
    save_path : str
        Path of the cleaned CSV file; nothing is written when it is falsy.
    **kwargs
        ``ab_crit`` (allocation bias critical value) and ``rb_crit`` (response
        bias critical value, in standard deviations). Both are required.

    Raises
    ------
    KeyError
        If ``ab_crit`` or ``rb_crit`` is missing.
    '''
    # Look the criteria up first so that a missing one fails before the
    # per-subject loop rather than after it
    ab_crit, rb_crit = kwargs['ab_crit'], kwargs['rb_crit']

    # Define a response bias function
    def rbf(responses):
        _, response_counts = np.unique(responses, return_counts=True)
        return np.max(response_counts) / np.sum(response_counts)

    # Open combined data file
    df = pd.read_csv(input_data_path, index_col=None).set_index('sid')

    # Initialize columns to record values of interest. They are created as
    # floats because the values written below are floats; starting from integer
    # columns makes every write an incompatible-dtype assignment.
    df['alloc_bias'], df['resp_bias'] = 0.0, 0.0

    # Calculate values of interest
    activities = ('A1', 'A2', 'A3', 'A4')
    for sid, sdf in tqdm(df.groupby(by='sid'), desc='Progress: '):
        # Allocation bias: spread of the number of trials across activities
        counts = [(sdf.activity == a).sum() for a in activities]
        df.loc[sid, 'alloc_bias'] = np.std(counts)

        # Response bias: only the response column is handed to rbf, so the
        # grouping column is never part of what apply() operates on
        df.loc[sid, 'resp_bias'] = sdf.groupby('family')['response'].apply(rbf).mean()

    # Detect high allocation bias and response bias (one row per subject)
    df_ = df.reset_index().groupby('sid').head(1).reset_index(drop=True)
    df_['high_ab'] = df_.alloc_bias >= ab_crit
    rb_threshold = df_.resp_bias.mean() + rb_crit * df_.resp_bias.std()
    df_['high_rb'] = (df_.resp_bias > rb_threshold) & ~df_.high_ab

    display(df_.groupby(by='group')[['high_ab', 'high_rb']].sum().astype(int))
    is_outlier = df_.high_ab | df_.high_rb
    print('Found {} outliers'.format(is_outlier.sum()))

    # Exclude outliers
    outlier = df_.loc[is_outlier, 'sid']
    df = df.loc[~df.index.isin(outlier), :]
    display(df.reset_index().groupby(by='group')['sid'].nunique())

    # Save data
    if save_path:
        print('saving to {}'.format(path.abspath(save_path)))
        df.reset_index().to_csv(save_path, index=False)
    

# Build the cleaned dataset from the combined file using the outlier criteria below
make_clean_dataset(
    input_data_path='data/combined_main.csv',
    save_path='data/clean_data.csv',
    ab_crit=100,  # critical value for allocation bias
    rb_crit=2,    # critical value for response bias
)

# Calculate heuristics
|Heuristic|Description ($t_i$ = trial number $i$; $w$ = window size)|
|:-------:|:--------------------------------------------------------|
| **PC**  | overall competence ($t_0$ to $t_i$)                     |
| **rPC** | recent competence ($t_{i-w}$ to $t_i$)                  |
| **rLP** | recent learning progress ($t_{i-w}$ to $t_i$)           |
| **SC**  | self-challenge                                          |

In [None]:
def rlp_func(x, subwindow_1, subwindow_2, abs_lp=True):
    '''Recent learning progress (LP) within the window x.

    The mean of the first `subwindow_1` entries of `x` is compared with the
    mean of its last `subwindow_2` entries. The signed result is
    (early mean - late mean); with `abs_lp` set, its absolute value is returned.
    '''
    early_mean = np.mean(x[:subwindow_1])
    late_mean = np.mean(x[-subwindow_2:])
    change = early_mean - late_mean
    if abs_lp:
        return np.abs(change)
    return change


def make_heuristics_dataset(input_data_path, save_path, **kwargs):
    '''Compute trial-by-trial performance heuristics for every subject.

    For each subject and each activity A1-A4 the following columns are filled
    (N is the activity number):

    * ``pcN``  -- overall competence: running mean of ``correct``;
    * ``rpcN`` -- recent competence: mean of ``correct`` over the last
      ``window_size`` trials of that activity;
    * ``rlpN`` -- recent learning progress: ``rlp_func`` applied to the same
      rolling window.

    Values are forward-filled over the subject's rows so that every row carries
    the latest value of every activity. ``sc`` (self-challenge) is derived from
    the recent competence of the activity on the current row, rescaled by the
    rolling minimum and maximum of recent competence across activities.

    Parameters
    ----------
    input_data_path : str
        Cleaned CSV file with ``sid``, ``activity``, ``group``, ``stage``,
        ``trial`` and ``correct`` columns.
    save_path : str
        Path of the output CSV file; nothing is written when it is falsy.
    **kwargs
        ``window_size``, ``subwindow_size_1`` and ``subwindow_size_2``
        (all required).
    '''
    # Read clean data and drop unused data
    df = pd.read_csv(input_data_path, index_col=None).set_index(['sid','activity'])
    df = df.loc[:, 'group,stage,trial,correct'.split(',')]
    # Copy so that the columns added below go into an independent frame
    df = df.loc[df.trial <= 60+250].copy()

    # Add new columns
    activities = 'A1,A2,A3,A4'.split(',')
    for heuristic in ['pc','rpc','rlp']:
        for a in activities:
            df['{}{}'.format(heuristic, a[1])] = np.nan
    df['sc'] = np.nan

    # Calculate dynamic performance heuristics for each subject
    # (all subjects are processed; no subject filter is applied here)
    act_codes = {'A1':1, 'A2':2, 'A3':3, 'A4':4}
    window = kwargs['window_size']
    for i, sdf in tqdm(df.groupby('sid'), desc='Progress: '):
        for a in activities:
            x = sdf.loc[(i, a), 'correct'].astype(int)

            # Overall competence (pc)
            pc = np.cumsum(x) / np.arange(1, x.size+1)
            df.loc[(i, a), 'pc{}'.format(a[1])] = pc

            # Recent competence (rpc); undefined until a full window is available
            rpc = x.rolling(min_periods=window, window=window).mean()
            df.loc[(i, a), 'rpc{}'.format(a[1])] = rpc

            # Recent learning progress (rlp)
            rlp = x.rolling(min_periods=window, window=window).apply(
                rlp_func, args=(kwargs['subwindow_size_1'], kwargs['subwindow_size_2']), raw=False
            )
            df.loc[(i, a), 'rlp{}'.format(a[1])] = rlp

        # Carry the latest value of every heuristic forward over the subject's rows
        df.loc[(i, slice(None)), :] = df.loc[(i, slice(None)), :].ffill(axis=0)

        # Self-challenge (sc)
        rpc_all = df.loc[(i, slice(None)), 'rpc1':'rpc4']
        rpc_max = rpc_all.max(axis=1).rolling(min_periods=1, window=250).max()
        rpc_min = rpc_all.min(axis=1).rolling(min_periods=1, window=250).min()
        act_inds = np.array([act_codes[a] for a in sdf.index.get_level_values(1).tolist()]) - 1
        # On each row pick the rpc column of the activity on that row; the row
        # count comes from the data rather than being assumed to be 60+250
        current_rpc = rpc_all.values[np.arange(act_inds.size), act_inds]
        # NOTE: where rpc_max equals rpc_min the denominator is zero and sc is
        # not a finite number
        sc = 1 - (current_rpc-rpc_min)/(rpc_max-rpc_min)
        df.loc[(i, slice(None)), 'sc'] = sc

    df = df.reset_index().sort_values(by=['sid', 'trial'])
    df.loc[df.stage=='train', 'sc'] = np.nan    # SC is not defined in familiarization stage
    display(df.loc[(df.sid == 0) & (df.trial >= 1) & (df.trial < 70), :])    # Display data excerpt

    # Save data
    if save_path:
        print('saving to {}'.format(path.abspath(save_path)))
        df.to_csv(save_path, index=False)
    
    
# Compute the heuristics from the cleaned data: rolling window of 15 trials,
# with sub-windows of 10 and 6 trials passed on to the learning-progress function
make_heuristics_dataset(
    input_data_path='data/clean_data.csv',
    save_path='data/heuristics_data.csv',
    window_size=15,
    subwindow_size_1=10,
    subwindow_size_2=6,
)

# NAM designation

In [None]:
def get_mps(df, **kwargs):
    '''Find mastery points.

    For every column of `df`, return the position of the first row whose value
    is non-zero, or `invalid_val` if the column has no such row. The result is
    a Series (one entry per column, in column order) of dtype `dtype`.
    `invalid_val` and `dtype` are required keyword arguments.
    '''
    nonzero = df.values != 0
    has_hit = nonzero.any(axis=0)
    first_hit = nonzero.argmax(axis=0)    # 0 for all-zero columns; replaced below
    points = np.where(has_hit, first_hit, kwargs['invalid_val'])
    return pd.Series(points, dtype=kwargs['dtype'])


def make_nam_dataset(input_data_path, save_path, **kwargs):
    '''Build the per-subject NAM dataset from the heuristics dataset.

    Trials 60 to 310 are kept and renumbered from 0. A trial counts as
    "mastered" for an activity when its recent competence (rpc1..rpc3) is at
    least `crit`. For every subject the output holds `nam`, the number of these
    activities mastered on at least one trial, and `mp1`..`mp3`, the position of
    the first mastered trial per activity (250 when there is none).

    input_data_path : path of the heuristics CSV file
    save_path       : path of the output CSV file (skipped when falsy)
    crit (keyword)  : mastery criterion, required
    '''
    # Load data
    trials = pd.read_csv(input_data_path, index_col='sid')

    # Select trials 60..310 and shift their numbering to start at 0
    in_range = (trials.trial >= 60) & (trials.trial <= 60+250)
    trials = trials.loc[in_range]
    trials = trials.assign(trial=trials.trial - 60)

    # Flag every trial whose recent PC reaches the mastery criterion
    recent_pc = trials.reset_index().set_index(['sid','trial']).loc[:, 'rpc1':'rpc3']
    mastered = recent_pc >= kwargs['crit']

    # Per subject: first mastered trial of each activity, and number mastered
    per_subject = mastered.groupby('sid')
    mastery_points = per_subject.apply(get_mps, invalid_val=250, dtype='int')
    mastery_points = mastery_points.rename(columns={0:'mp1', 1:'mp2', 2:'mp3'})
    nam = per_subject.any().sum(axis=1).to_frame(name='nam')

    # Assemble the output dataset and show an excerpt
    nam_df = nam.merge(mastery_points, on='sid').reset_index()
    display(nam_df.head(10))

    # Save data
    if save_path:
        print('saving to {}'.format(path.abspath(save_path)))
        nam_df.to_csv(save_path, index=False)


# Derive the NAM dataset; the mastery criterion is 13/15
make_nam_dataset(
    input_data_path='data/heuristics_data.csv',
    save_path='data/nam_data.csv',
    crit=13/15,
)

# Learning dataset

In [None]:
def make_learning_dataset(input_data_path, save_path, **kwargs):
    '''Summarise each subject's free-play stage into one row of learning stats.

    Output columns: ``sid``, ``group``, ``nam``, ``progressive`` (activities
    mastered in the order 1, 2, 3), ``dwipc`` / ``dwfpc`` (difficulty-weighted
    recent competence on the first / last free-play trial) and three
    self-challenge summaries: ``sc_flat`` (mean over all trials), ``sc_lep``
    (mean of the means within intervals between mastery points) and
    ``sc_streaks`` (mean of the means within intervals between switches).

    Parameters
    ----------
    input_data_path : str
        Heuristics CSV file (as written by ``make_heuristics_dataset``).
    save_path : str
        Path of the output CSV file; nothing is written when it is falsy.
    **kwargs
        ``difficulty_weights`` (required): weights applied to rpc1..rpc3.
        ``nam_data_path`` (optional): NAM CSV file, default 'data/nam_data.csv'.
    '''
    n_forced, n_free = 60, 250    # forced-stage and free-play trial counts
    difficulty_weights = kwargs['difficulty_weights']
    nam_data_path = kwargs.get('nam_data_path', 'data/nam_data.csv')

    # Load data
    df = pd.read_csv(input_data_path, index_col='sid')

    # Combine with NAM dataset
    df = df.merge(pd.read_csv(nam_data_path, index_col='sid'), on='sid')

    # Annotate switch trials: a row is a switch when its activity differs from
    # that of the preceding row
    choices = df.activity.values
    switches = np.zeros(len(df), dtype=int)
    switches[1:] = choices[:-1] != choices[1:]
    df['switch'] = switches
    df.loc[df.stage=='train', 'switch'] = 0          # changing activity during forced stage is not switching
    df.loc[df.trial==n_forced+1, 'switch'] = 0       # choosing activity for the first time is not switching

    # Select free-play trials and renumber them from 0 (copy: independent frame)
    df = df.loc[(df.trial <= n_forced+n_free) & (df.trial > n_forced)].copy()
    df['trial'] -= n_forced + 1

    # For each subject, compute learning stats from free play stage
    df.reset_index(inplace=True)
    outdf = []
    for i, sdf in tqdm(df.groupby('sid'), desc='Progress: '):
        _sdf = sdf.set_index('trial') # index subject data by trial

        # Get subject information (group, nam, mps) as a pandas Series
        profile = _sdf.head(1)[['group', 'nam', 'mp1', 'mp2', 'mp3']].iloc[0]

        # Get intervals between consecutive mastery points
        # (np.unique already returns the bounds sorted)
        mps = profile['mp1':'mp3'].values
        sorted_lep_bounds = np.unique([0] + mps.tolist() + [n_free])
        lep_intervals = pd.IntervalIndex.from_arrays(sorted_lep_bounds[:-1], sorted_lep_bounds[1:], closed='right')

        # Get intervals between consecutive switches
        # NOTE(review): the intervals are right-closed and the first one starts
        # at 1, so trials 0 and 1 fall outside every streak (and trial 0 outside
        # every mastery interval above) -- confirm this is intended.
        switch_trials = _sdf.switch.values.nonzero()[0].tolist() + [n_free]
        if switch_trials[0] != 1: switch_trials.insert(0,1)
        streaks = pd.IntervalIndex.from_arrays(switch_trials[:-1], switch_trials[1:], closed='right')

        # Calculate self-challenge (SC) summaries. observed=True keeps only the
        # intervals that contain trials; empty intervals would only contribute
        # NaN means, which the outer mean skips anyway.
        sc = _sdf.sc
        trial_numbers = _sdf.index.astype(int)
        sc_flat = np.mean(sc)
        sc_lep = sc.groupby(pd.cut(trial_numbers, lep_intervals), observed=True).mean().mean()
        sc_streaks = sc.groupby(pd.cut(trial_numbers, streaks), observed=True).mean().mean()

        # Calculate weighted initial (dwipc) and final (dwfpc) performances
        dwipc = (_sdf.loc[0, 'rpc1':'rpc3'].values * difficulty_weights).sum()
        dwfpc = (_sdf.loc[n_free-1, 'rpc1':'rpc3'].values * difficulty_weights).sum()

        # Get profile info and see if subject mastered activities in order of difficulty
        sid = i
        group = profile['group']
        nam = profile['nam']
        progressive = (np.diff(np.array([1,2,3])[np.argsort(mps)]) == 1).all()

        # Store subject's learning stats
        outdf.append(
            pd.Series(
                data = [sid,group,nam,progressive,dwipc,dwfpc,sc_flat,sc_lep,sc_streaks],
                index='sid,group,nam,progressive,dwipc,dwfpc,sc_flat,sc_lep,sc_streaks'.split(',')
            )
        )

    outdf = pd.DataFrame(outdf).sort_values(by=['group','sid'])
    display(outdf.head())

    # Save data
    if save_path:
        print('saving to {}'.format(path.abspath(save_path)))
        outdf.to_csv(save_path, index=False)


# The heuristics file is now passed in explicitly; the NAM file is read from
# the default location 'data/nam_data.csv'
make_learning_dataset(
    input_data_path='data/heuristics_data.csv',
    save_path='data/learning_data.csv',
    difficulty_weights=np.array([1,2,3])/6,
)