# Process data

Here we import the data from all conditions (one experiment at a time) and do the necessary processing. This results in one large `.csv` file per experiment (e.g. `EXPERIMENT1DATA.csv`), which is ready for the next stage: parameter estimation.

In [1]:
from glob import glob
import os
import numpy as np
import pandas as pd

# Experiment 1

In [2]:
def import_files(files, paradigm, reward_mag_level):
    """Import raw discounting data from a list of filenames.

    The user can adapt this function to come up with the appropriately
    structured dataframe for their own raw data.

    inputs:
        files: iterable of paths to raw `.csv` files; each one is read with
            `pd.read_csv`.
        paradigm: value written to a new `paradigm` column on every row.
        reward_mag_level: value written to a new `reward_mag` column on
            every row.

    returns:
        One dataframe with the rows of all files stacked in the order given.
        The columns `block_order`, `group`, `index` and `trial` are dropped
        and `A`/`B` are renamed to `RA`/`RB`. Each file keeps its own row
        index, so index labels repeat across files.

    raises:
        ValueError: if `files` is empty.
        KeyError: if a file lacks one of the columns to be dropped.
    """
    files = list(files)
    if not files:
        # Fail early with a message naming the condition; otherwise
        # `pd.concat` raises a bare "No objects to concatenate".
        raise ValueError(
            f'No files to import for paradigm={paradigm!r}, '
            f'reward_mag={reward_mag_level!r}')

    # Build each per-file frame with a method chain (no in-place mutation).
    frames = [
        pd.read_csv(fname)
        .assign(paradigm=paradigm, reward_mag=reward_mag_level)
        .drop(columns=['block_order', 'group', 'index', 'trial'])
        .rename(columns={'A': 'RA', 'B': 'RB'})
        for fname in files
    ]
    return pd.concat(frames)

def _new_col_of_value(df, colname, value):
    df[colname] = pd.Series(value, index=df.index)
    return df

def _generate_trial_col(df):
    df = df.reset_index()
    df['trial'] = df.index
    return df

In [3]:
expt = 1
reward_levels = ['low', 'high']
paradigms = ['deferred', 'online']

# Collect one dataframe per (reward level, paradigm) condition, then stack them.
condition_frames = []

for reward_level in reward_levels:
    for paradigm in paradigms:
        file_location = f'data/raw_data_expt{expt}/{paradigm}_{reward_level}'
        # `glob` returns matches in arbitrary, OS-dependent order. Sort them so
        # the row order of `data` (and anything later derived from row order)
        # is the same on every machine and every run.
        files = sorted(glob(file_location + '/*.csv'))
        print(f'{len(files)} files found in {file_location}')
        condition_frames.append(import_files(files, paradigm, reward_level))

data = pd.concat(condition_frames)

43 files found in data/raw_data_expt1/deferred_low
34 files found in data/raw_data_expt1/online_low
44 files found in data/raw_data_expt1/deferred_high
36 files found in data/raw_data_expt1/online_high


We need to create a new `id` column that numbers the participants from 0 up to (total number of participants − 1).

In [4]:
# Give every participant a compact integer `id`: codes are handed out in
# order of first appearance of each `Participant` value.
factors, keys = pd.factorize(data['Participant'])
data['id'] = factors
data.head()

Unnamed: 0,Participant,RA,RB,DA,DB,R,paradigm,reward_mag,id
0,1012,30,30,0,7,0,deferred,low,0
1,1012,27,30,0,7,0,deferred,low,0
2,1012,24,30,0,7,1,deferred,low,0
3,1012,21,30,0,7,1,deferred,low,0
4,1012,18,30,0,7,1,deferred,low,0


Recode the condition values into numerical values. We are going to factorize `paradigm` and `reward_mag` together so we end up with conditions = 0, 1, 2, 3.

In [5]:
# multi-column factorize: one (paradigm, reward_mag) tuple per row
tuples = data[['paradigm', 'reward_mag']].apply(tuple, axis=1)

# work out factoring and print the condition key. Important for decoding the results by condition!
print('*** CONDITION KEY ***')
factors, keys = pd.factorize(tuples)
for i in np.unique(factors):
    print(f'{i} = {keys[i]}')

# Reuse the codes computed above instead of factorizing a second time.
data['condition'] = factors

*** CONDITION KEY ***
0 = ('deferred', 'low')
1 = ('online', 'low')
2 = ('deferred', 'high')
3 = ('online', 'high')


In [6]:
# Display only: the result is not assigned, so `data` itself is unchanged and
# keeps the repeating per-file index shown in the `index` column of the output.
# NOTE(review): if a unique row index is wanted in the saved file, this would
# need to be `data = data.reset_index(drop=True)` — confirm intent.
data.reset_index()

Unnamed: 0,index,Participant,RA,RB,DA,DB,R,paradigm,reward_mag,id,condition
0,0,1012,30,30,0,7,0,deferred,low,0,0
1,1,1012,27,30,0,7,0,deferred,low,0,0
2,2,1012,24,30,0,7,1,deferred,low,0,0
3,3,1012,21,30,0,7,1,deferred,low,0,0
4,4,1012,18,30,0,7,1,deferred,low,0,0
...,...,...,...,...,...,...,...,...,...,...,...
7845,45,2066,30,60,0,29,1,online,high,156,3
7846,46,2066,24,60,0,29,1,online,high,156,3
7847,47,2066,18,60,0,29,0,online,high,156,3
7848,48,2066,12,60,0,29,0,online,high,156,3


Change `trial` column to equate to each unique trial in the whole dataset rather than counting actual trial number in each experiment.

In [7]:
# Number the rows of the combined dataset 0, 1, 2, ... so every trial is unique.
data['trial'] = np.arange(len(data))

In [8]:
# `to_csv` does not create missing folders, so make sure the output directory
# exists; this lets the notebook run top to bottom on a fresh checkout.
os.makedirs('data/processed', exist_ok=True)
data.to_csv('data/processed/EXPERIMENT1DATA.csv')

# Experiment 2

In [9]:
def import_files(files, paradigm, domain):
    """Import raw discounting data from a list of filenames.

    The user can adapt this function to come up with the appropriately
    structured dataframe for their own raw data.

    inputs:
        files: iterable of paths to raw `.csv` files; each one is read with
            `pd.read_csv`.
        paradigm: value written to a new `paradigm` column on every row.
        domain: value written to a new `domain` column on every row.

    returns:
        One dataframe with the rows of all files stacked in the order given.
        The columns `block_order`, `index` and `trial` are dropped and
        `A`/`B` are renamed to `RA`/`RB`. Each file keeps its own row index,
        so index labels repeat across files.

    raises:
        ValueError: if `files` is empty.
        KeyError: if a file lacks one of the columns to be dropped.
    """
    files = list(files)
    if not files:
        # Fail early with a message naming the condition; otherwise
        # `pd.concat` raises a bare "No objects to concatenate".
        raise ValueError(
            f'No files to import for paradigm={paradigm!r}, '
            f'domain={domain!r}')

    # Build each per-file frame with a method chain (no in-place mutation).
    frames = [
        pd.read_csv(fname)
        .assign(paradigm=paradigm, domain=domain)
        .drop(columns=['block_order', 'index', 'trial'])
        .rename(columns={'A': 'RA', 'B': 'RB'})
        for fname in files
    ]
    return pd.concat(frames)

def _new_col_of_value(df, colname, value):
    df[colname] = pd.Series(value, index=df.index)
    return df

def _generate_trial_col(df):
    df = df.reset_index()
    df['trial'] = df.index
    return df

In [10]:
expt = 2
domains = ['gain', 'loss']
paradigms = ['deferred', 'online']

# Collect one dataframe per (domain, paradigm) condition, then stack them.
condition_frames = []

for domain in domains:
    for paradigm in paradigms:
        file_location = f'data/raw_data_expt{expt}/{paradigm}_{domain}'
        # `glob` returns matches in arbitrary, OS-dependent order. Sort them so
        # the row order of `data` (and anything later derived from row order)
        # is the same on every machine and every run.
        files = sorted(glob(file_location + '/*.csv'))
        print(f'{len(files)} files found in {file_location}')
        condition_frames.append(import_files(files, paradigm, domain))

data = pd.concat(condition_frames)

46 files found in data/raw_data_expt2/deferred_gain
33 files found in data/raw_data_expt2/online_gain
37 files found in data/raw_data_expt2/deferred_loss
44 files found in data/raw_data_expt2/online_loss


In [11]:
# Give every participant a compact integer `id`: codes are handed out in
# order of first appearance of each `Participant` value.
factors, keys = pd.factorize(data['Participant'])
data['id'] = factors
data.head()

Unnamed: 0,Participant,RA,RB,DA,DB,R,paradigm,domain,id
0,3017,30,30,0,29,1,deferred,gain,0
1,3017,27,30,0,29,1,deferred,gain,0
2,3017,24,30,0,29,1,deferred,gain,0
3,3017,21,30,0,29,1,deferred,gain,0
4,3017,18,30,0,29,1,deferred,gain,0


In [12]:
# multi-column factorize: one (paradigm, domain) tuple per row
tuples = data[['paradigm', 'domain']].apply(tuple, axis=1)

# work out factoring and print the condition key. Important for decoding the results by condition!
print('*** CONDITION KEY ***')
factors, keys = pd.factorize(tuples)
for i in np.unique(factors):
    print(f'{i} = {keys[i]}')

# Reuse the codes computed above instead of factorizing a second time.
data['condition'] = factors

*** CONDITION KEY ***
0 = ('deferred', 'gain')
1 = ('online', 'gain')
2 = ('deferred', 'loss')
3 = ('online', 'loss')


In [13]:
# Display only: the result is not assigned, so `data` itself is unchanged and
# keeps the repeating per-file index shown in the `index` column of the output.
# NOTE(review): if a unique row index is wanted in the saved file, this would
# need to be `data = data.reset_index(drop=True)` — confirm intent.
data.reset_index()

Unnamed: 0,index,Participant,RA,RB,DA,DB,R,paradigm,domain,id,condition
0,0,3017,30,30,0,29,1,deferred,gain,0,0
1,1,3017,27,30,0,29,1,deferred,gain,0,0
2,2,3017,24,30,0,29,1,deferred,gain,0,0
3,3,3017,21,30,0,29,1,deferred,gain,0,0
4,4,3017,18,30,0,29,1,deferred,gain,0,0
...,...,...,...,...,...,...,...,...,...,...,...
7995,45,4006,-15,-30,0,7,1,online,loss,159,3
7996,46,4006,-12,-30,0,7,1,online,loss,159,3
7997,47,4006,-9,-30,0,7,1,online,loss,159,3
7998,48,4006,-6,-30,0,7,0,online,loss,159,3


In [14]:
# Number the rows of the combined dataset 0, 1, 2, ... so every trial is unique.
data['trial'] = np.arange(len(data))

In [15]:
# `to_csv` does not create missing folders, so make sure the output directory
# exists; this lets the notebook run top to bottom on a fresh checkout.
os.makedirs('data/processed', exist_ok=True)
data.to_csv('data/processed/EXPERIMENT2DATA.csv')