# Imports

In [None]:
%matplotlib notebook
import utils.vis_utils as vut
import utils.loc_utils as lut

import ipywidgets as wid
import numpy as np
import pandas as pd
import scipy as sp
import contextlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import matplotlib as mpl
mpl.use('Agg')
from IPython.display import display
from itertools import combinations
from tqdm import tqdm_notebook, tqdm
import numdifftools as nd
import statsmodels.api as sm
from statsmodels.formula.api import ols, logit
from IPython.display import display, clear_output

from collections import OrderedDict
import itertools

# Plot palettes. `colors` is one four-colour cycle repeated twice (8 entries).
colors = ['#43799d', '#cc5b46', '#ffbb00', '#71bc78'] * 2
gcolors = [
    '#008fd5',
    '#fc4f30',
    '#e5ae38',
    '#6d904f',
]

# Short and long display labels keyed by the integer codes 0 and 1
# (presumably the `grp` column values -- confirm against the data).
glabels = dict(enumerate(['F', 'S']))
fullglabels = dict(enumerate(['Free', 'Strategic']))

@contextlib.contextmanager
def temp_seed(seed):
    """Seed NumPy's global RNG for the duration of a ``with`` block.

    The global generator state is captured before seeding and put back on
    exit (also when the block raises), so code after the block continues
    the random stream it would have produced without the ``with``.

    Parameters
    ----------
    seed
        Value passed to ``np.random.seed``.
    """
    saved_state = np.random.get_state()
    np.random.seed(seed)
    try:
        yield
    finally:
        # Restore the pre-block stream regardless of how the block ended.
        np.random.set_state(saved_state)
        
import warnings
# Suppress every warning category for the rest of the session. This also
# hides deprecation notices emitted by the libraries used below.
warnings.filterwarnings("ignore")

# Preprocess data

In [None]:
def onehotize(x, n_classes=4):
    """One-hot encode a 1-D vector of integer class labels.

    Parameters
    ----------
    x : array-like of int
        Class labels in ``range(n_classes)``.
    n_classes : int, optional
        Number of columns in the output. Defaults to 4, the value that was
        previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(x.size, n_classes)`` holding a single 1 per
        row, in the column given by that row's label.

    Raises
    ------
    IndexError
        If any label lies outside ``range(n_classes)``. Negative labels are
        rejected explicitly; plain NumPy indexing would wrap them around to
        the last columns and silently produce a wrong encoding.
    """
    labels = np.asarray(x)
    out = np.zeros([labels.size, n_classes])
    if labels.size == 0:
        # Nothing to encode; also avoids indexing with an empty float array.
        return out
    if labels.min() < 0 or labels.max() >= n_classes:
        raise IndexError(
            'labels must lie in [0, {}), got range [{}, {}]'.format(
                n_classes, labels.min(), labels.max()))
    out[np.arange(labels.size), labels] = 1
    return out
    

def prep_data():
    """Build the per-trial regressor table used by the choice models.

    Reads ``../data/ntm_data.pkl`` and ``../data/trials_data2.pkl`` through
    ``lut.unpickle``, keeps rows with ``sid >= 0`` and ``trial <= 310``, and
    builds one block of rows per subject. For each activity id 1-4 (column
    ``t0``) four regressors are derived from the ``cor`` column of the trials
    on which that activity was selected:

    * ``PC<k>``  -- rolling mean of ``cor`` over the last 15 selections;
    * ``rLP<k>`` -- rolling 15-selection statistic computed by ``calc_lp``;
    * ``dLP<k>`` -- rolling PC minus its first defined value;
    * ``mLP<k>`` -- cumulative mean of ``cor`` minus its first value.

    Each regressor is placed on the subject's full trial axis, forward-filled
    so that trials on which another activity was selected carry the last
    known value, and the first ``burn_in`` rows are dropped. ``CH<k>`` are
    one-hot indicators of the selected activity and ``RELT<k>`` their running
    proportions.

    Returns
    -------
    pandas.DataFrame
        All subjects concatenated. Columns: ``sid, grp, ntm, trial, tid,
        lpo1-3, PC1-4, rLP1-4, dLP1-4, mLP1-4, CH1-4, RELT1-4``. Earlier
        versions returned ``None`` and discarded the table.
    """
    n_trials = 250 + 15 * 4       # rows expected per subject in the trial table
    burn_in = 59                  # leading rows removed from every output column
    n_kept = n_trials - burn_in   # 251 rows kept per subject
    window = 15                   # number of selections in the rolling windows
    task_ids = [1, 2, 3, 4]

    def calc_lp(x):
        # Mean of the last 9 values minus mean of the first 10 values.
        # NOTE(review): with a 15-value window these two slices overlap and
        # have unequal lengths -- confirm this is intended.
        return np.mean(x[-9:]) - np.mean(x[:10])

    def spread(values, mask):
        # Place `values` on the rows selected by `mask`, carry the last known
        # value forward over the remaining rows, then drop the burn-in rows.
        full = np.full(n_trials, np.nan)
        full[mask] = values
        # Series.ffill() replaces fillna(method='ffill'), which pandas 2.x
        # deprecates.
        return pd.Series(full).ffill()[burn_in:]

    def gain_from_first(series):
        # Difference from the first non-NaN value; that first entry itself is
        # set to its distance from 0.5.
        vals = np.asarray(series, dtype=float)
        first = np.nonzero(~np.isnan(vals))[0][0]
        base = vals[first]
        gain = vals - base
        gain[first] = base - 0.5
        return gain

    ntmdf = lut.unpickle('../data/ntm_data.pkl')[['sid', 'ntm', 'trial', 'lp1', 'lp2', 'lp3']]
    ntmdf = ntmdf.loc[ntmdf.trial == 0].reset_index(drop=True).drop(columns='trial')
    ntmdf = ntmdf.rename(columns={'lp1': 'lpo1', 'lp2': 'lpo2', 'lp3': 'lpo3'}).set_index('sid')

    df = lut.unpickle('../data/trials_data2.pkl')
    df = df.loc[(df.sid >= 0) & (df.trial <= 310), :]

    colnames = (['sid', 'grp', 'ntm', 'trial', 'tid'] + ['lpo1', 'lpo2', 'lpo3']
                + [prefix + str(tid)
                   for prefix in ('PC', 'rLP', 'dLP', 'mLP')
                   for tid in task_ids])

    sdf_out_list = []
    with tqdm_notebook(total=df.sid.unique().shape[0]) as progbar:
        for _, sdf_in in df.groupby('sid'):
            sid = sdf_in.sid.values[0]
            grp = sdf_in.grp.values[0]
            ntm, lpo1, lpo2, lpo3 = ntmdf.loc[sid].values

            # Per-subject constants plus the trial counter and selected activity.
            id_cols = [np.full(n_kept, sid), np.full(n_kept, grp), np.full(n_kept, ntm),
                       np.arange(0, n_kept), sdf_in.t0.values[burn_in:]]
            lpo_cols = [np.full(n_kept, value) for value in (lpo1, lpo2, lpo3)]

            pcs, rlps, dlps, mlps = [], [], [], []
            for tid in task_ids:
                mask = (sdf_in.t0 == tid).values
                cor = sdf_in.loc[sdf_in.t0 == tid, 'cor'].dropna().astype('int')

                rolling_pc = cor.rolling(window=window).mean()
                pcs.append(spread(rolling_pc, mask))

                rolling_lp = cor.rolling(window=window).apply(calc_lp, raw=True)
                rlps.append(spread(rolling_lp, mask))

                dlps.append(spread(gain_from_first(rolling_pc), mask))

                # A window spanning every selection with min_periods=1 gives
                # the cumulative mean.
                cumulative_pc = cor.rolling(min_periods=1, window=cor.shape[0]).mean()
                mlps.append(spread(gain_from_first(cumulative_pc), mask))

            stacked = np.stack(id_cols + lpo_cols + pcs + rlps + dlps + mlps, axis=1)
            sdf_out = pd.DataFrame(stacked, columns=colnames)

            dummies = onehotize(sdf_out.tid.values.astype(int) - 1)
            for j, tid in enumerate(task_ids):
                sdf_out['CH' + str(tid)] = dummies[:, j]
            for tid in task_ids:
                # Running share of trials on which this activity was selected.
                sdf_out['RELT' + str(tid)] = (np.cumsum(sdf_out['CH' + str(tid)])
                                              / np.arange(1, n_kept + 1))
            sdf_out_list.append(sdf_out)
            progbar.update()

    df = pd.concat(sdf_out_list, ignore_index=True)
    int_cols = ['sid', 'grp', 'trial', 'ntm', 'lpo1', 'lpo2', 'lpo3', 'CH1', 'CH2', 'CH3', 'CH4']
    df = df.astype({col: int for col in int_cols})
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(df.loc[df.sid==0, 'PC1':'RELT4'].head())

    # Saving is currently disabled; the table is returned to the caller instead.
#     lut.dopickle('data/choiceModelData_PC_dLP_rLP_mLP_RELT.pkl', data=df)
    return df

    
# Manual run switch: the condition is always true; change 1 to 0 to skip this step.
if 1:
    prep_data()

# Fit params

In [None]:
def fit_params(Nseeds, lp_abs=True):
    """Fit the softmax choice model to every subject, once per random seed.

    The model assigns each of the four activities a utility equal to a
    weighted sum of the regressors named in ``bounds`` (currently PC and rLP)
    and turns utilities into choice probabilities with a softmax scaled by
    ``tau``. The loss is the negative log-likelihood of the observed choices
    (``CH1``-``CH4``). The first row of every subject is left out.

    Parameters
    ----------
    Nseeds : int
        Number of seeds. Seed ``s`` seeds NumPy's global RNG and yields one
        initial guess, which is shared by all subjects for that seed.
    lp_abs : bool, optional
        If true, the LP regressors enter as absolute values; otherwise they
        are used signed.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(Nseeds, n_subjects, 10)``. Columns are ``sid, grp,
        ntm, loss, aic, aic_0, aic_diff`` followed by one column per key of
        ``bounds`` (``pc_coef, rlp_coef, tau``). ``aic_0`` is twice the loss
        with all parameters set to zero (uniform choice probabilities) and
        ``aic_diff = aic_0 - aic``. Earlier versions returned ``None`` and
        discarded the fits.
    """
    apply_bounds = False

    # Keys select the regressors that enter the model; the last key is the
    # softmax scale. Bounds are used for the initial guess and, when
    # `apply_bounds` is set, as optimizer constraints.
    bounds = OrderedDict({
#         'intercept': [-1, 1],
        'pc_coef':[-1, 1],
#         'dlp_coef':[-1, 1],
        'rlp_coef':[-1, 1],
#         'relt_coef': [-1, 1],      
        'tau':[1,10]})
    func = np.abs if lp_abs else np.copy

    def rand_params(bounds):
        # One uniform draw per (lower, upper) pair.
        return np.array([np.random.uniform(lo, hi) for lo, hi in bounds])

    def neg_log_likelihood(params, *args):
        # args = (regressor_1, ..., regressor_k, choices); each is (trials, 4).
        coeffs = np.asarray(params[:-1], dtype=float)
        inps = np.stack(args[:-1], axis=0).astype(float)
        chosen = args[-1].astype(bool)
        logits = (coeffs[:, None, None] * inps).sum(axis=0) * params[-1]
        # Subtracting the row maximum leaves the softmax unchanged but keeps
        # exp() from overflowing to inf, which previously produced NaN losses.
        logits = logits - logits.max(axis=1, keepdims=True)
        log_p = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
        return -np.sum(log_p[chosen])

    # Estimate the params
    # NOTE(review): this path is 'data/...' while other loaders in this file
    # use '../data/...' -- confirm which directory is intended.
    df = lut.unpickle('data/choiceModelData_PC_dLP_rLP_mLP_RELT.pkl')
    df = df.loc[df.ntm != 0, :]
    print(df.sid.unique().size)
    display(df.head())

    n_params = len(bounds)
    arr = []

    for seed in tqdm_notebook(range(Nseeds), desc='Seed:'):
        np.random.seed(seed)
        data_dict = {'sid': [], 'grp': [], 'ntm': [], 'loss': [], 'aic': [], 'aic_0': [], 'aic_diff': []}
        for k in bounds.keys(): data_dict[k] = []

        init_guess = rand_params(list(bounds.values()))
        for _, sdf in tqdm_notebook(df.groupby('sid'), desc='sid:', leave=False):
            sid, grp, ntm = sdf.sid.values[0], sdf.grp.values[0], sdf.ntm.values[0]
            pcs = sdf.loc[:, 'PC1':'PC4'].values[1:, :].astype(float)
            dlps = func(sdf.loc[:, 'dLP1':'dLP4'].values[1:, :].astype(float))
            rlps = func(sdf.loc[:, 'rLP1':'rLP4'].values[1:, :].astype(float))
            relts = sdf.loc[:, 'RELT1':'RELT4'].values[1:, :]
            chs = sdf.loc[:, 'CH1':'CH4'].values[1:, :]

            # Regressors keyed by parameter name; only the entries named in
            # `bounds` are used, with the choices ('tau' slot) last.
            available = {'pc_coef': pcs, 'dlp_coef': dlps, 'rlp_coef': rlps,
                         'relt_coef': relts, 'tau': chs}
            data = tuple(available[k] for k in bounds.keys())

            if apply_bounds:
                x, f, _ = sp.optimize.fmin_l_bfgs_b(func=neg_log_likelihood, x0=init_guess, args=data,
                                                    approx_grad=True, disp=False,
                                                    bounds=tuple(bounds.values()))
            else:
                res = sp.optimize.minimize(neg_log_likelihood, x0=init_guess, args=data)
                x, f = res.x, res.fun
            # A NaN loss is replaced by a large penalty value.
            if np.isnan(f): f = 10e5

            # Loss with all parameters at zero, i.e. uniform choice probabilities.
            baseline = neg_log_likelihood([0 for k in bounds.keys()], *data)

            # Store params   
            data_dict['sid'].append(sid)
            data_dict['grp'].append(grp)
            data_dict['ntm'].append(ntm)
            data_dict['loss'].append(f)
            for pos, k in enumerate(bounds.keys()):
                data_dict[k].append(x[pos])
            data_dict['aic'].append(2*f + 2*n_params)
            data_dict['aic_0'].append(2*baseline)
            data_dict['aic_diff'].append(data_dict['aic_0'][-1] - data_dict['aic'][-1])

        arr.append(pd.DataFrame(data_dict).values)

    arr = np.stack(arr, axis=0)
    # Only used by the disabled save call below.
    signedFlag = 'unsigned' if lp_abs else 'signed'
#     lut.dopickle('data/choiceModelParamFits_PC_rLP_{}_{}seeds'.format(signedFlag, Nseeds), data=arr)
    return arr

    
# Manual run switch: the condition is always true; change 1 to 0 to skip this step.
if 1: 
    fit_params(500, lp_abs=True)

# Model comparison

In [None]:
def model_comparison(Nseeds):
    """Fit every combination of the five regressors per subject and log the best fits.

    For each subject and each non-empty subset of ``PC, dLP, rLP, mLP, RELT``
    (31 model forms) the softmax choice model is fitted from ``Nseeds``
    random initial guesses, and the fit with the lowest negative
    log-likelihood is kept. The LP regressors enter as absolute values and
    the first row of every subject is left out. One row per (subject, model
    form) is written to
    ``data/choiceModelComparison_PC_dLP_rLP_mLP_RELT_<Nseeds>seeds.csv``;
    coefficients of regressors that are not part of a model form are NaN.

    Parameters
    ----------
    Nseeds : int
        Number of random restarts per subject and model form.

    Notes
    -----
    * ``aic`` is ``2 * loss + 2 * k`` where ``k`` is the number of fitted
      parameters of that model form (its coefficients plus ``tau``). It was
      previously penalised by the number of candidate models, which gave all
      forms the same penalty.
    * The CSV is started with a header when the subject id is 0 or when the
      file does not exist yet (or is empty); otherwise rows are appended.
      Previously the header was only written for subject 0, which the
      subject filter below never lets through.
    """
    import os  # local import: only needed for the output-file check

    def rand_params(bounds):
        # One uniform draw per (lower, upper) pair.
        return np.array([np.random.uniform(lo, hi) for lo, hi in bounds])

    def neg_log_likelihood(params, *args):
        # args = (regressor_1, ..., regressor_k, choices); each is (trials, 4).
        coeffs = np.asarray(params[:-1], dtype=float)
        inps = np.stack(args[:-1], axis=0).astype(float)
        chosen = args[-1].astype(bool)
        logits = (coeffs[:, None, None] * inps).sum(axis=0) * params[-1]
        # Subtracting the row maximum leaves the softmax unchanged but keeps
        # exp() from overflowing to inf, which previously produced NaN losses.
        logits = logits - logits.max(axis=1, keepdims=True)
        log_p = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
        return -np.sum(log_p[chosen])

    # Estimate the params
    df = lut.unpickle('../data/choiceModelData_PC_dLP_rLP_mLP_RELT.pkl')
#     df = df.loc[df.ntm != 0, :]
    # NOTE(review): the hard-coded `sid >= 216` looks like a resume point from
    # an interrupted run -- confirm before doing a full run.
    df = df.loc[(df.ntm != 0) & (df.sid >= 216), :]

    cols = ['form','sid','grp','ntm','loss','aic','tau']
    varlist = ['PC', 'dLP', 'rLP', 'mLP', 'RELT']
    for var in varlist: cols.append(var.lower()+'_coef')
    varnames = np.array(varlist)

    # Key order must match `varlist`, with the softmax scale last.
    bounds = OrderedDict({
        'pc_coef':[-1, 1],
        'dlp_coef':[-1, 1],
        'rlp_coef':[-1, 1],
        'mlp_coef': [-1, 1],
        'relt_coef': [-1, 1],
        'tau':[1,10]})
    param_names = list(bounds.keys())

    # Every non-empty subset of regressor indices, smallest subsets first.
    models = [combo
              for size in range(1, len(varlist) + 1)
              for combo in combinations(range(len(varlist)), size)]

    out_path = 'data/choiceModelComparison_PC_dLP_rLP_mLP_RELT_{}seeds.csv'.format(Nseeds)

    pb_label = wid.Label('Model form: ')
    display(pb_label)
    for _, sdf in tqdm_notebook(df.groupby('sid')):
        data_dict = dict(zip(cols, [[] for _ in cols]))
        sid, grp, ntm = sdf.sid.values[0], sdf.grp.values[0], sdf.ntm.values[0]
        pcs = sdf.loc[:, 'PC1':'PC4'].values[1:, :].astype(float)
        dlps = np.abs(sdf.loc[:, 'dLP1':'dLP4'].values[1:, :].astype(float))
        rlps = np.abs(sdf.loc[:, 'rLP1':'rLP4'].values[1:, :].astype(float))
        mlps = np.abs(sdf.loc[:, 'mLP1':'mLP4'].values[1:, :].astype(float))
        relts = sdf.loc[:, 'RELT1':'RELT4'].values[1:, :]
        chs = sdf.loc[:, 'CH1':'CH4'].values[1:, :]

        data = {'pc_coef':pcs, 'dlp_coef':dlps, 'rlp_coef':rlps, 'mlp_coef':mlps, 'relt_coef': relts, 'tau':chs}
        data_tuple = tuple(data[k] for k in param_names)

        for model in models:
            members = list(model)
            form = ' + '.join(varnames[members])
            subdata = [data_tuple[mi] for mi in members] + [data_tuple[-1]]
            xs, fs = [], []
            pb_label.value = 'Model form: {}'.format(form)
            # NOTE(review): the RNG is not re-seeded here, so initial guesses
            # differ between runs -- confirm whether that is intended.
            for seed in tqdm_notebook(range(Nseeds), leave=False):
                init_guess = rand_params(list(bounds.values())).tolist()
                subguess = [init_guess[mi] for mi in members] + [init_guess[-1]]
                res = sp.optimize.minimize(neg_log_likelihood, x0=subguess, args=tuple(subdata))
                xs.append(res.x)
                fs.append(res.fun)
            if np.all(np.isnan(fs)):
                print('"ValueError: All-NaN slice encountered" error, skipped')
                continue
            x, f = xs[np.nanargmin(fs)], np.nanmin(fs)

            # Store params
            vec = np.full(len(varlist)+1, np.nan)
            vec[members] = x[:-1]
            vec[-1] = x[-1]
            data_dict['form'].append(form)
            data_dict['sid'].append(sid)
            data_dict['grp'].append(grp)
            data_dict['ntm'].append(ntm)
            data_dict['loss'].append(f)
            # Penalise by this model's own parameter count: coefficients + tau.
            data_dict['aic'].append(2*f + 2*(len(members) + 1))
            for pos, k in enumerate(param_names):
                data_dict[k].append(vec[pos])

        # Write this subject's rows
        fdf = pd.DataFrame(data_dict)
        start_file = (sid == 0
                      or not os.path.isfile(out_path)
                      or os.path.getsize(out_path) == 0)
        if start_file:
            fdf.to_csv(out_path, index=False)
        else:
            fdf.to_csv(out_path, index=False, mode='a', header=False)

    

# Manual run switch: the condition is always true; change 1 to 0 to skip this step.
if 1: 
    model_comparison(300)