# Generate weight files

### Content

+ [1. Notebook Description](#1.-Notebook-Description)
+ [2. Generate Weights for best Classifiers](#2.-Generate-Weights-for-best-Classifiers)


---

# 1. Notebook Description

Given model configurations and pre-transformed input data, we fit a fixed classifier on each subject's whole dataset (once for every pair of target classes) and save the `coef_` attribute of the fitted estimator to a new HDF5 file.

**Requirements**:

+ model config yaml files
+ pre-transformed subject data for those models


---

**Imports:**

In [None]:
from digits.utils import dotdict
from digits.data import matimport, select, utils
from digits.transform.dimreduction import SubsampleTransform, FFTransform
from digits.transform.shaper import CSPFlatten, CSPWrap

from mne.decoding import CSP
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

from itertools import combinations
import numpy as np
import pandas as pd
import yaml
from os import path
import warnings

In [None]:
# The mne package triggers a noisy FutureWarning about boolean array-likes
# used as an index; suppress warnings whose message starts with that text.
warnings.filterwarnings(
    action="ignore",
    message="in the future, boolean array-likes will be handled as a boolean array index",
)

---
# 2. Generate Weights for best Classifiers


---

In [None]:
# Subject IDs to process; each one is turned into a string and used to build
# the per-subject input and output file names.
subjects = [
    3130, 3131, 3132, 3134, 3135, 3136, 3138, 3146,
    3147, 3149, 3154, 3156, 3157, 3158, 3159, 3161,
    3162, 3233, 3237, 3239, 3240, 3241, 3242, 3243,
    3245, 3248, 3250, 3251, 3252, 3253, 3255, 3260,
]

In [None]:
def _make_ldacsp():
    """Return a new, unfitted CSP -> flatten -> shrinkage-LDA pipeline.

    The final step is named 'lda' so that its ``coef_`` can be retrieved
    through ``named_steps['lda']`` after fitting.
    """
    return Pipeline([
        ('wrap', CSPWrap()),
        ('csp', CSP(n_components=6, reg='ledoit_wolf', transform_into='csp_space')),
        ('flat', CSPFlatten()),
        ('lda', LDA(solver='lsqr', shrinkage='auto')),
    ])


# Each entry maps a classifier name to (config file name, unfitted estimator).
# Names containing 'ldacsp' are treated as pipelines when the weights are
# extracted, so keep that substring for pipeline-based models.

# best models
# NOTE(review): 'svc' points at the 'short_lda_1.yaml' config -- confirm that
# sharing the LDA config is intentional.
best_models = {
    'svc': ('short_lda_1.yaml', SVC(kernel='linear', C=1.274275e-06, cache_size=1024)),
    'lda': ('short_lda_1.yaml', LDA(shrinkage=0.0444444444444, solver='lsqr')),
    'ldacsp': ('short_lda_4.yaml', _make_ldacsp()),
}

# a time domain model for the document
time_domain_models = {
    'svc_t': ('short_nofft20.yaml', SVC(kernel='linear', C=1.274275e-06, cache_size=1024)),
    'lda_t': ('short_nofft20.yaml', LDA(shrinkage=0.0444444444444, solver='lsqr')),
    'ldacsp_t': ('short_nofft20.yaml', _make_ldacsp()),
}

# Model set that is actually processed. Previously both dicts were assigned to
# `models` in sequence, so the first one was silently discarded; the selection
# is now explicit. Switch to `best_models` to generate weights for those.
models = time_domain_models

# Root directory of the subject data; transformed inputs are read from its
# 'transformed' sub-directory.
dataroot = '../../data/thomas/artcorr/'

In [None]:
# For every configured classifier and every subject: fit the classifier on all
# samples of each pair of target labels (0..9) and store the first row of the
# fitted `coef_` per pair in one HDF5 file per (classifier, subject).
for clfname, (config_file, clf) in models.items():
    # safe_load instead of yaml.load(f): calling yaml.load without a Loader is
    # deprecated and raises a TypeError on PyYAML >= 6.
    # NOTE(review): assumes the config files contain plain YAML without
    # Python-specific tags -- confirm.
    # `config` is not used further in this cell; loading it still makes a
    # missing or malformed config file fail before any fitting starts.
    with open('../../jobs/configs/'+config_file, 'r') as f:
        config = dotdict(yaml.safe_load(f)['config'])

    print("------------------\n{}\n------------------".format(clfname))
    for subject in [str(x) for x in subjects]:
        # The input name keeps the full config file name including its
        # '.yaml' suffix (e.g. '<subject>_short_nofft20.yaml.h5').
        # NOTE(review): confirm this matches the transformed files on disk.
        infile = subject+'_'+config_file+'.h5'
        # The output is written relative to the current working directory.
        outfile = 'weights_'+clfname+'_'+subject+'.h5'
        if path.exists(outfile):
            # Existing results are never overwritten; delete the file to redo.
            print("skipping {}".format(subject))
            continue
        print("running {}".format(subject)) 
        print('{} -> {}'.format(infile,outfile))

        # Open the input read-only and close it as soon as the data is loaded;
        # the handle used to stay open and was lost when `store` was rebound.
        with pd.HDFStore(path.join(dataroot, 'transformed', infile), mode='r') as store:
            samples = store['samples']
            targets = store['targets']

        weights = {}
        for d1, d2 in combinations(np.arange(10), 2):
            # Restrict the data to the two labels of this pair.
            tmp_samples, tmp_targets = select.fromtargetlist(samples, targets, [d1, d2])

            # The same estimator object is re-fitted for every pair.
            clf.fit(tmp_samples, tmp_targets['label'])

            if 'ldacsp' in clfname:
                # Pipeline model: the weights live on its final 'lda' step.
                weights[(clfname, d1, d2)] = clf.named_steps['lda'].coef_[0]
            else:
                weights[(clfname, d1, d2)] = clf.coef_[0]

        # One column per (classifier, d1, d2) key, one row per weight.
        df_weights = pd.DataFrame(weights)
        df_weights.columns.names = ['type', 'd1', 'd2']
        df_weights.index.names = ['weights']

        # The context manager closes the file even if the write fails.
        with pd.HDFStore(outfile) as out_store:
            out_store['weights'] = df_weights