In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from os.path import join

In [2]:
def scale(x, a, b):
    """
    Min-max rescale an array onto the interval [a, b].

    The minimum and maximum are taken over the whole array (all axes),
    so a 2-D input is rescaled jointly rather than column by column.

    Parameters
    ----------
    x : array-like exposing ``.min()`` and ``.max()`` (e.g. np.ndarray)
        Values to rescale.
    a : number
        Value the smallest entry of ``x`` is mapped to.
    b : number
        Value the largest entry of ``x`` is mapped to.

    Returns
    -------
    Array of the same shape as ``x``. If all entries of ``x`` are equal,
    the denominator is zero and the result is not finite.
    """
    lowest = x.min()
    highest = x.max()
    target_width = b - a
    offset_from_lowest = x - lowest
    return target_width * offset_from_lowest / (highest - lowest) + a

# Krajbich 2010
- Data from: original author

In [3]:
krajbich2010_raw = pd.read_stata(join('data', 'krajbich_2010_natneuro', 'original', 'data_nature2010.dta'))


def _krajbich2010_trial_row(fixations, subject_index, trial_index):
    """
    Collapse the fixation-level rows of one trial into a one-row DataFrame.

    Parameters
    ----------
    fixations : pd.DataFrame
        Non-empty rows belonging to a single trial, with the columns
        'rt', 'choice', 'leftrating', 'rightrating', 'roi' and
        'event_duration'.
    subject_index : int
        Zero-based subject index written to the 'subject' column.
    trial_index : int
        Zero-based trial index written to the 'trial' column.

    Returns
    -------
    pd.DataFrame with one row and the columns subject, trial, rt, choice,
    item_value_0, item_value_1, gaze_0, gaze_1.
    """
    # The denominator is the summed duration of *all* rows passed in,
    # whatever their 'roi' value, so gaze_0 + gaze_1 may be below 1.
    total_duration = fixations['event_duration'].sum()
    dwell_0 = fixations.loc[fixations['roi'] == 1, 'event_duration'].sum()
    dwell_1 = fixations.loc[fixations['roi'] == 2, 'event_duration'].sum()
    # Trial-level variables are read from the first row of the trial.
    # NOTE(review): `1 - choice` flips the raw choice coding; presumably the
    # raw file codes a left choice as 1 -- confirm against the data docs.
    return pd.DataFrame(dict(subject=subject_index,
                             trial=trial_index,
                             rt=fixations['rt'].values[0],
                             choice=(1 - fixations['choice'].values[0]),
                             item_value_0=fixations['leftrating'].values[0],
                             item_value_1=fixations['rightrating'].values[0],
                             gaze_0=dwell_0 / total_duration,
                             gaze_1=dwell_1 / total_duration),
                        index=[trial_index])


krajbich2010_prep_all_list = []
krajbich2010_prep_nofirst_list = []

# Wrap the iterable itself (not the enumerate object) so that tqdm knows
# the number of subjects and can display actual progress.
for s, subject in enumerate(tqdm(krajbich2010_raw['subject'].unique())):
    subject_data = krajbich2010_raw.loc[krajbich2010_raw['subject'] == subject]
    for t, trial in enumerate(subject_data['trial'].unique()):
        trial_data = subject_data.loc[subject_data['trial'] == trial]
        krajbich2010_prep_all_list.append(_krajbich2010_trial_row(trial_data, s, t))

        # Same summary, but ignoring rows flagged as the first fixation.
        # Trials consisting only of a first fixation are skipped.
        trial_data_nofirst = trial_data.loc[trial_data['fix_num'] != 1]
        if len(trial_data_nofirst) > 0:
            krajbich2010_prep_nofirst_list.append(_krajbich2010_trial_row(trial_data_nofirst, s, t))

krajbich2010_prep_all = pd.concat(krajbich2010_prep_all_list).reset_index(drop=True)
del krajbich2010_prep_all_list
krajbich2010_prep_nofirst = pd.concat(krajbich2010_prep_nofirst_list).reset_index(drop=True)
del krajbich2010_prep_nofirst_list

# remove trials without gaze data (drops every row containing a NaN)
krajbich2010_prep_nofirst = krajbich2010_prep_nofirst.dropna()
krajbich2010_prep_all.to_csv(join('data', 'krajbich_2010_natneuro', 'krajbich2010_prep_all.csv'), index=False)
krajbich2010_prep_nofirst.to_csv(join('data', 'krajbich_2010_natneuro', 'krajbich2010_prep_nofirst.csv'), index=False)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




# Krajbich 2011
- Data from: original author

In [4]:
krajbich2011_raw = pd.read_csv(join('data', 'krajbich_2011_pnas', 'original', 'data_pnas2011.csv'), index_col=0)


def _krajbich2011_trial_row(fixations, subject_index, trial_index):
    """
    Collapse the fixation-level rows of one trial into a one-row DataFrame.

    Parameters
    ----------
    fixations : pd.DataFrame
        Non-empty rows belonging to a single trial, with the columns
        'rt', 'choice', 'rating1', 'rating2', 'rating3',
        'fixation_position' (0, 1 or 2) and 'eventduration'.
    subject_index : int
        Zero-based subject index written to the 'subject' column.
    trial_index : int
        Zero-based trial index written to the 'trial' column.

    Returns
    -------
    pd.DataFrame with one row and the columns subject, trial, rt, choice,
    item_value_0..2, gaze_0..2.
    """
    n_items = 3
    # The denominator is the summed duration of all rows passed in.
    total_duration = fixations['eventduration'].sum()
    # Trial-level variables are read from the first row of the trial.
    row = dict(subject=subject_index,
               trial=trial_index,
               rt=fixations['rt'].values[0],
               choice=fixations['choice'].values[0])
    for i in range(n_items):
        row['item_value_{}'.format(i)] = fixations['rating{}'.format(i + 1)].values[0]
    for i in range(n_items):
        dwell = fixations.loc[fixations['fixation_position'] == i, 'eventduration'].sum()
        row['gaze_{}'.format(i)] = dwell / total_duration
    return pd.DataFrame(row, index=[trial_index])


krajbich2011_prep_all_list = []
krajbich2011_prep_nofirst_list = []

# Wrap the iterable itself (not the enumerate object) so that tqdm knows
# the number of subjects and can display actual progress.
for s, subject in enumerate(tqdm(krajbich2011_raw['subject'].unique())):
    subject_data = krajbich2011_raw.loc[krajbich2011_raw['subject'] == subject].copy()
    # Turn the three indicator columns into a single 0/1/2 position code.
    # NOTE(review): argmax returns 0 when none of the indicators is set, so
    # such rows would count as position 0 -- confirm the raw data always
    # flags exactly one roi / one choice per row.
    subject_data['fixation_position'] = np.argmax(subject_data[['leftroi', 'middleroi', 'rightroi']].values, axis=1)
    subject_data['choice'] = np.argmax(subject_data[['choice1', 'choice2', 'choice3']].values, axis=1)
    for t, trial in enumerate(subject_data['trial'].unique()):
        trial_data = subject_data.loc[subject_data['trial'] == trial]
        krajbich2011_prep_all_list.append(_krajbich2011_trial_row(trial_data, s, t))

        # Same summary, but ignoring rows flagged as the first fixation.
        # Trials consisting only of a first fixation are skipped.
        trial_data_nofirst = trial_data.loc[trial_data['fix_num'] != 1]
        if len(trial_data_nofirst) > 0:
            krajbich2011_prep_nofirst_list.append(_krajbich2011_trial_row(trial_data_nofirst, s, t))

krajbich2011_prep_all = pd.concat(krajbich2011_prep_all_list).reset_index(drop=True)
del krajbich2011_prep_all_list
krajbich2011_prep_nofirst = pd.concat(krajbich2011_prep_nofirst_list).reset_index(drop=True)
del krajbich2011_prep_nofirst_list

# remove trials without gaze data (drops every row containing a NaN)
krajbich2011_prep_nofirst = krajbich2011_prep_nofirst.dropna()
krajbich2011_prep_all.to_csv(join('data', 'krajbich_2011_pnas', 'krajbich2011_prep_all.csv'), index=False)
krajbich2011_prep_nofirst.to_csv(join('data', 'krajbich_2011_pnas', 'krajbich2011_prep_nofirst.csv'), index=False)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




# Folke 2016

Data obtained from https://github.com/BDMLab/Folke_De_Martino_NHB_2016_Github

## Experiment 2

This is data from a 3-alternative forced choice task in which participants chose between snack foods while their eye movements were recorded. The data set includes only gaze data aggregated at the trial level, not individual fixations. Therefore, an analysis without the first fixation is not possible for this data set.

In [5]:
folke2016_raw = pd.read_csv('data/folke_2016_nhb/original/exp2_main_data.csv')

# Rename the raw columns to the naming scheme shared by all data sets.
folke2016_prep = folke2016_raw.rename(columns={'Participant': 'subject',
                                               'Value_Position_1': 'item_value_0',
                                               'Value_Position_2': 'item_value_1',
                                               'Value_Position_3': 'item_value_2',
                                               'DT_Position_1': 'dwell_0',
                                               'DT_Position_2': 'dwell_1',
                                               'DT_Position_3': 'dwell_2',
                                               'Chosen_Position': 'choice',
                                               'Choice_RT': 'rt'})

# Convert choice to zero-based indexing
folke2016_prep['choice'] = folke2016_prep['choice'] - 1

# Convert subject to consecutive 0 based index (in order of first appearance).
# factorize maps every original ID in a single pass; overwriting the column
# ID by ID could merge two participants whenever a raw ID happens to equal an
# index that was already handed out.
folke2016_prep['subject'] = pd.factorize(folke2016_prep['subject'])[0]

# Add trial variable: running 0-based counter within each subject, in row order
folke2016_prep['trial'] = folke2016_prep.groupby('subject').cumcount()

# Scale values to a range between 1 and 10
# (minimum and maximum are taken jointly over all subjects and positions)
n_items = 3
item_values = folke2016_prep[['item_value_{}'.format(i)
                             for i in range(n_items)]].values
scaled_values = scale(item_values, a=1, b=10)
for i in range(n_items):
    folke2016_prep['item_value_{}'.format(i)] = scaled_values[:, i]

# Compute gaze: each position's share of the trial's summed dwell time
dwells = folke2016_prep[['dwell_0', 'dwell_1', 'dwell_2']].values
gaze = dwells / dwells.sum(axis=1, keepdims=True)
for i in range(n_items):
    folke2016_prep['gaze_{}'.format(i)] = gaze[:, i]

# Reduce to only needed columns
folke2016_prep = folke2016_prep[['subject', 'trial',
                                 'choice', 'rt',
                                 'item_value_0', 'item_value_1', 'item_value_2',
                                 'gaze_0', 'gaze_1', 'gaze_2'
                                 ]].copy()

folke2016_prep.to_csv(join('data', 'folke_2016_nhb', 'folke2016_prep_all.csv'), index=False)

# Tavares 2017

- Paper: https://www.frontiersin.org/articles/10.3389/fnins.2017.00468/full  
- Data obtained from: http://www.rnl.caltech.edu/publications/index.html

## Experiment 1

This is data from a 2-alternative forced choice task, where participants made perceptual choices about which of 2 line segments was more similar to a given template.

In [6]:
tavares2017_trials_raw = pd.read_csv(join('data', 'tavares_2017_fns', 'original', 'expdata.csv'))
tavares2017_fixations_raw = pd.read_csv('data/tavares_2017_fns/original/fixations.csv')

tavares2017_trials = (tavares2017_trials_raw
                      .rename(columns={'parcode': 'subject',
                                       'item_left': 'item_value_0',
                                       'item_right': 'item_value_1'})
                      .drop(columns='valid'))
# NOTE(review): unlike the other data sets, 'subject' keeps the raw parcode
# and 'trial' the raw trial number here (no zero-based recoding) -- confirm
# this is intended.

# Recode choice variable (note that this is not documented well, could be the other way around.)
# Assign the result back instead of calling replace(inplace=True) on the
# column selection, which does not reliably update the frame itself.
tavares2017_trials['choice'] = tavares2017_trials['choice'].replace({1: 1, -1: 0})

# Scale item values
n_items = 2
item_values = tavares2017_trials[['item_value_{}'.format(i)
                                  for i in range(n_items)]].values
## Convert to absolute values, since we don't care about direction of tilt, but absolute discrepancy to template
item_values = np.abs(item_values)

# Scale jointly to [1, 10], then invert (11 - x) so that the smallest
# absolute discrepancy receives the highest value (10) and the largest the
# lowest value (1).
scaled_values = scale(item_values, 1, 10)
for i in range(n_items):
    tavares2017_trials['item_value_{}'.format(i)] = 11 - scaled_values[:, i]

# Compute gaze for each trial
## Drop fixations that are not to item 1 or 2
tavares2017_fixations_all = tavares2017_fixations_raw[tavares2017_fixations_raw['fix_item'].isin([1, 2])].copy()
tavares2017_fixations_all = tavares2017_fixations_all.rename(columns={'parcode': 'subject',
                                                                      'fix_item': 'fixation_position',
                                                                      'fix_time': 'fixation_duration'}).reset_index(drop=True)
## Drop the first remaining (item-directed) fixation of every trial.
## cumcount numbers the rows within each (subject, trial) group in file order;
## filtering on it keeps the 'subject' and 'trial' columns intact, which
## groupby.apply does not guarantee across pandas versions.
fixation_rank = tavares2017_fixations_all.groupby(['subject', 'trial']).cumcount()
tavares2017_fixations_nofirst = tavares2017_fixations_all[fixation_rank > 0].reset_index(drop=True)

# Wrap the iterable itself so that tqdm knows the number of subjects.
for subject_code in tqdm(tavares2017_trials['subject'].unique()):
    subject_fixations = (
        ('all', tavares2017_fixations_all[tavares2017_fixations_all['subject'] == subject_code]),
        ('nofirst', tavares2017_fixations_nofirst[tavares2017_fixations_nofirst['subject'] == subject_code]),
    )
    subject_rows = tavares2017_trials['subject'] == subject_code
    # Only trials that have at least one item-directed fixation are visited;
    # all other trials keep NaN in the gaze columns.
    for trial in subject_fixations[0][1]['trial'].unique():
        trial_rows = subject_rows & (tavares2017_trials['trial'] == trial)
        for label, fixations in subject_fixations:
            trial_fixations = fixations[fixations['trial'] == trial]
            trial_dwell = trial_fixations['fixation_duration'].sum()
            # fixation_position 1 -> gaze_0, fixation_position 2 -> gaze_1.
            # A trial without remaining fixations yields 0 / 0 = NaN here.
            for item, position in enumerate([1, 2]):
                item_dwell = trial_fixations.loc[trial_fixations['fixation_position'] == position,
                                                 'fixation_duration'].sum()
                tavares2017_trials.loc[trial_rows, 'gaze_{}_{}'.format(item, label)] = item_dwell / trial_dwell

tavares2017_prep_all = (tavares2017_trials
                        .rename(columns={'gaze_0_all': 'gaze_0',
                                         'gaze_1_all': 'gaze_1'})
                        .drop(columns=['gaze_0_nofirst', 'gaze_1_nofirst']))
tavares2017_prep_all.to_csv(join('data', 'tavares_2017_fns', 'tavares2017_prep_all.csv'), index=False)
tavares2017_prep_nofirst = (tavares2017_trials
                            .rename(columns={'gaze_0_nofirst': 'gaze_0',
                                             'gaze_1_nofirst': 'gaze_1'})
                            .drop(columns=['gaze_0_all', 'gaze_1_all'])
                            .dropna())  # remove trials without gaze data
tavares2017_prep_nofirst.to_csv(join('data', 'tavares_2017_fns', 'tavares2017_prep_nofirst.csv'), index=False)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))






# Combine to single file

In [7]:
def _label_dataset(frames, dataset, n_items, subject_offsets):
    """
    Tag prepared frames with their data set and shift their subject IDs.

    The frames are modified in place: the columns 'dataset' and 'n_items'
    are (over)written and the matching offset is added to 'subject'.

    Parameters
    ----------
    frames : sequence of pd.DataFrame
        Prepared frames belonging to one data set.
    dataset : str
        Label written to the 'dataset' column.
    n_items : int
        Number of choice alternatives, written to the 'n_items' column.
    subject_offsets : sequence of int
        One offset per frame, added to that frame's 'subject' column.
    """
    for frame, offset in zip(frames, subject_offsets):
        frame['dataset'] = dataset
        frame['n_items'] = n_items
        frame['subject'] += offset


# NOTE: this cell shifts the 'subject' columns in place, so running it twice
# without re-running the preparation cells shifts the IDs twice.

# Subject counts are taken from the "all" frames, before any shifting.
n_krajbich2010 = krajbich2010_prep_all['subject'].unique().size
n_krajbich2011 = krajbich2011_prep_all['subject'].unique().size
n_folke2016 = folke2016_prep['subject'].unique().size

_label_dataset([krajbich2010_prep_all, krajbich2010_prep_nofirst],
               'krajbich2010', 2, [0, 0])

n_subjects = n_krajbich2010
_label_dataset([krajbich2011_prep_all, krajbich2011_prep_nofirst],
               'krajbich2011', 3, [n_subjects, n_subjects])

n_subjects += n_krajbich2011
_label_dataset([folke2016_prep], 'folke2016', 3, [n_subjects])

n_subjects += n_folke2016
# (Folke is skipped in nofirst analyses, so its subjects are not counted
# in the offset of the nofirst frame.)
_label_dataset([tavares2017_prep_all, tavares2017_prep_nofirst],
               'tavares2017', 2, [n_subjects, n_subjects - n_folke2016])

variables = ['subject', 'trial', 'rt', 'choice', 'item_value_0', 'item_value_1', 'item_value_2', 'gaze_0', 'gaze_1', 'gaze_2', 'n_items', 'dataset']

# The frames do not share the same columns (two-item data sets have no
# item_value_2 / gaze_2, which become NaN). sort=False silences the pandas
# warning about sorting the non-concatenation axis; the final column order is
# fixed by `variables` either way.
combined_all = pd.concat([krajbich2010_prep_all, krajbich2011_prep_all, folke2016_prep, tavares2017_prep_all],
                         sort=False)[variables]
combined_nofirst = pd.concat([krajbich2010_prep_nofirst, krajbich2011_prep_nofirst, tavares2017_prep_nofirst],
                             sort=False)[variables]

combined_all.to_csv(join('data', 'data_all.csv'), index=False)
combined_nofirst.to_csv(join('data', 'data_nofirst.csv'), index=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [8]:
# Report how many distinct subject IDs each combined data set contains.
participant_reports = (
    ('Full data set contains {} participants.', combined_all),
    ('Data set without first fixations contains {} participants.', combined_nofirst),
)
for report_template, combined_frame in participant_reports:
    print(report_template.format(len(combined_frame['subject'].unique())))

Full data set contains 118 participants.
Data set without first fixations contains 94 participants.
