# Create data for pooled analysis

### Content

+ [1. Notebook description](#1.-Notebook-Description)
+ [2. Pooled Data](#2.-pooled-data)
+ [3. Individual Subjects](#3.-individual-subjects)

---

# 1. Notebook Description

To run the pooled analysis, we have to transform the data for each subject according to some model and then combine the results.
We load a model config file, transform the data according to the config object, and concatenate all samples into a single dataframe. We can do this because the row index is unique over all sessions, trials, presentations *and* subjects.


---

**Imports:**

In [None]:
from os import path

import pandas as pd
import yaml

from digits.data import matimport, select, utils
from digits.utils import getoutname, dotdict
from digits.transform.dimreduction import SubsampleTransform, FFTransform

Specify the subject IDs and which models (transformation schemes) to use:

In [None]:
# Numeric subject IDs. The processing cells convert each ID to a string and
# open '<id>.h5' through matimport.Importer.
subjects = [
    3130, 3131, 3132, 3134, 3135, 3136, 3138, 3146,
    3147, 3149, 3154, 3156, 3157, 3158, 3159, 3161,
    3162, 3233, 3237, 3239, 3240, 3241, 3242, 3243,
    3245, 3248, 3250, 3251, 3252, 3253, 3255, 3260,
]

# Model config files (one transformation scheme each); the processing cells
# read them from '../../../jobs/configs/'.
configs = [
    'short_lda_1.yaml',
    'short_lda_4.yaml',
    'short_nofft20.yaml',
]

# Data root directory handed to matimport.Importer.
dataroot = '../../../data/thomas/artcorr/'

# 2. pooled data

In [None]:
# Build one pooled dataset per model config: transform every subject's data
# according to the config and concatenate all subjects into one frame.
for config_file in configs:
    # The pooled output name depends only on the config, so decide whether to
    # skip *before* touching any subject. (Checking inside the subject loop
    # skipped every subject and then failed on the undefined accumulator.)
    outfile = config_file+'.h5'
    if path.exists(path.join(dataroot, 'transformed', outfile)):
        print('skipping: '+outfile)
        continue

    with open('../../../jobs/configs/'+config_file, 'r') as f:
        # safe_load: plain yaml.load() without a Loader can construct
        # arbitrary objects and is rejected by PyYAML >= 6.
        config = dotdict(yaml.safe_load(f)['config'])

    # Per-subject pieces, concatenated once after the loop.
    samples_parts = []
    targets_parts = []

    for subject in [str(x) for x in subjects]:
        imp = matimport.Importer(dataroot=dataroot)
        imp.open(subject+'.h5')
        # NOTE(review): the importer/store is never explicitly closed here;
        # confirm whether matimport.Importer needs a close() call.

        samples = imp.store.samples
        targets = imp.store.targets
        # Restrict to the time window given by the config.
        samples = select.fromtimerange(samples, config.t0, config.t1)
        # Apply the session blacklist (['01']) to samples and targets alike.
        samples, targets = select.fromsessionblacklist(samples, targets, ['01'])
        # Apply the channel blacklist.
        samples = select.fromchannelblacklist(samples, ['LHEOG', 'RHEOG', 'IOL'])
        samples = SubsampleTransform(width=config.subsample_width, verbose=True).transform(samples)
        if config.fft:
            samples = FFTransform(verbose=True, bins=config.size, fmin=config.fmin, fmax=config.fmax,
                                  power=config.power, rate=config.subsample_width/1000.).transform(samples)

        samples_parts.append(samples)
        targets_parts.append(targets)

    # verify_integrity=True raises if any row index value occurs twice, i.e.
    # it enforces the uniqueness of the row index across subjects.
    samples_all = pd.concat(samples_parts, verify_integrity=True)
    targets_all = pd.concat(targets_parts, verify_integrity=True)
    # Drop the references to the per-subject pieces before further processing.
    del samples_parts, targets_parts

    samples_all = utils.remove_duplicate_columns(samples_all, factor=2)
    pool = matimport.Importer(dataroot=dataroot)
    pool.ds = dotdict({
        'samples': samples_all,
        'targets': targets_all,
    })
    pool.save(outfile)
    print('saved: '+outfile)
    del samples_all, targets_all

# 3. individual subjects

Do this again for each individual subject, without pooling, in case we need the data.

In [None]:
# For every model config, transform each subject's data separately and save
# one output file per (subject, config) pair, without pooling.
for config_file in configs:
    with open('../../../jobs/configs/'+config_file, 'r') as f:
        # safe_load: plain yaml.load() without a Loader can construct
        # arbitrary objects and is rejected by PyYAML >= 6.
        config = dotdict(yaml.safe_load(f)['config'])

    for subject in [str(x) for x in subjects]:

        # Skip subjects whose transformed file is already present.
        outfile = subject+'_'+config_file+'.h5'
        if path.exists(path.join(dataroot, 'transformed', outfile)):
            print('skipping: '+outfile)
            continue

        imp = matimport.Importer(dataroot=dataroot)
        imp.open(subject+'.h5')
        # NOTE(review): the importer/store is never explicitly closed here;
        # confirm whether matimport.Importer needs a close() call.

        samples = imp.store.samples
        targets = imp.store.targets
        # Restrict to the time window given by the config.
        samples = select.fromtimerange(samples, config.t0, config.t1)
        # Apply the session blacklist (['01']) to samples and targets alike.
        samples, targets = select.fromsessionblacklist(samples, targets, ['01'])
        # Apply the channel blacklist.
        samples = select.fromchannelblacklist(samples, ['LHEOG', 'RHEOG', 'IOL'])
        samples = SubsampleTransform(width=config.subsample_width, verbose=True).transform(samples)
        if config.fft:
            samples = FFTransform(verbose=True, bins=config.size, fmin=config.fmin, fmax=config.fmax,
                                  power=config.power, rate=config.subsample_width/1000.).transform(samples)

        samples = utils.remove_duplicate_columns(samples, factor=2)
        pool = matimport.Importer(dataroot=dataroot)
        pool.ds = dotdict({
            'samples': samples,
            'targets': targets,
        })

        pool.save(outfile)
        print('saved: '+outfile)