In [None]:
#Logistic Regression classifier

Classifying data with logit regression.

In [2]:
import numpy as np
import pandas as pd

from scipy.signal import butter, lfilter

from sklearn.linear_model import LogisticRegression
from sklearn.lda import LDA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import LeaveOneLabelOut
from sklearn.decomposition import PCA

from glob import glob

from mne.io import RawArray
from mne.channels import read_montage
from mne.epochs import concatenate_epochs
from mne import create_info, find_events, Epochs, concatenate_raws, pick_types
from mne.decoding import CSP

from joblib import Parallel, delayed

from preprocessing import *

import os


In [3]:
### Data preparation

def prepare_data_train(fname):
    """ read and prepare training data """
    data = pd.read_csv(fname)
    events_fname = fname.replace('_data','_events')
    labels= pd.read_csv(events_fname)
    clean=data.drop(['id' ], axis=1)
    labels=labels.drop(['id' ], axis=1)
    return  clean,labels

def prepare_data_test(fname):
    """ read and prepare test data """
    data = pd.read_csv(fname)
    return data

-

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
def fit(X,y):
    # Do here you training
    clf = LDA()
    clf.fit(X,y)
    return clf

def predict(clf,X):
    # do here your prediction
    preds = clf.predict_proba(X)
    return np.atleast_2d(preds[:,clf.classes_==1])

In [9]:
### Parameters
cols = ['HandStart','FirstDigitTouch',
        'BothStartLoadPhase','LiftOff',
        'Replace','BothReleased']
scaler= StandardScaler()
nfilters = 4
csp = CSP(n_components=nfilters, reg='lws')
bands = [[5,20]]
subsample = 50
cut = 3
cpast = [20,7]
subjects = range(1,13)
series = range(1,9)

### Don't think I need this

# events = ['HandStart','FirstDigitTouch',
#            'BothStartLoadPhase','LiftOff',
#            'Replace','BothReleased']

In [15]:
pred_tot = []
y_tot = []
auc_tot = []
for subject in subjects:
    y_raw= []
    raw = []
    sequence = []

    ################ READ DATA ################################################
    
    for ser in series:
        if ser==2 and subject ==2:
            continue
        fname =  '../train/subj%d_series%d_data.csv' % (subject,ser)
        print("subject=" + str(subject) + "ser=" + str(ser))
        data,labels=prepare_data_train(fname)
        raw.append(data) 
        y_raw.append(labels)
        sequence.extend([ser]*len(data))

    X = pd.concat(raw)
    y = pd.concat(y_raw)
    #transform in numpy array
    #transform train data in numpy array
    X = np.asarray(X.astype(float))
    y = np.asarray(y.astype(float))
    sequence = np.asarray(sequence)


    ################ Train classifiers ########################################
    cv = LeaveOneLabelOut(sequence)
    pred = np.empty((X.shape[0],6))
    
    for train, test in cv:
        test_s = test[cpast[0]*cpast[1]:]
        train_s = train[cpast[0]*cpast[1]:]
        X_train = X[train]
        X_test = X[test]
        y_train = y[train_s]
        #apply preprocessing
        X_train = data_preprocess_train(X_train,events=labels,lpass=cut,sca=1,cpast=cpast,fbands=bands)
        X_test=data_preprocess_test(X_test,sca=1,lpass=cut,cpast=cpast,fbands=bands)
        print(y_train.shape,X_train.shape)
        clfs = Parallel(n_jobs=6)(delayed(fit)(X_train[::subsample,:],y_train[::subsample,i]) for i in range(6))
        preds = Parallel(n_jobs=6)(delayed(predict)(clfs[i],X_test) for i in range(6))
        print(preds[0].shape)
        pred[test_s,:] = np.concatenate(preds,axis=1)
        
    pred_tot.append(pred)
    y_tot.append(y)
    # get AUC
    auc = [roc_auc_score(y[:,i],pred[:,i]) for i in range(6)]     
    auc_tot.append(auc)
    print(auc)

subject=1ser=1
subject=1ser=2
subject=1ser=3
subject=1ser=4
(605608, 6) (605608, 352)
(119296, 1)
(453150, 6) (453150, 352)
(271754, 1)
(507490, 6) (507490, 352)
(217414, 1)
(608864, 6) (608864, 352)
(116040, 1)
[0.90039367430254835, 0.95031895304267699, 0.95155662028057775, 0.91123394255309353, 0.9234649529345621, 0.91523167385468684]
subject=2ser=1
subject=2ser=3
subject=2ser=4
(430659, 6) (430659, 352)
(291674, 1)
(569788, 6) (569788, 352)
(152545, 1)
(444419, 6) (444419, 352)
(277914, 1)
[0.81372133758455112, 0.78309795283040917, 0.82201424815376389, 0.78250908828859056, 0.81637466074493847, 0.77944242150231613]
subject=3ser=1
subject=3ser=2
subject=3ser=3
subject=3ser=4
(618433, 6) (618433, 352)
(122062, 1)
(547835, 6) (547835, 352)
(192660, 1)
(511338, 6) (511338, 352)
(229157, 1)
(544279, 6) (544279, 352)
(196216, 1)
[0.82744841993713736, 0.84477614472893947, 0.8643497151068722, 0.83836983574682455, 0.79497703694176125, 0.81554585663216417]
subject=4ser=1
subject=4ser=2
subject=

In [13]:
np.average(auc_tot)

0.8553658740812603

In [16]:
np.average(auc_tot)

0.86121884238883961

In [6]:
clfs[0].coef_[0][[18,19,20,29]]

array([-1.66823505,  1.77759264, -0.55579713,  0.42364353])

In [98]:
idx = []
for i, subject in enumerate(subjects):
    X_test = prepare_data_test('../test/subj%d_series10_data.csv' %(subject))
    idx.append(np.array(X_test['id']))

In [6]:
pred_tot = []
y_tot = []
auc_tot = []
ids_tot = []
idx = []
for i, subject in enumerate(subjects):
    y_raw= []
    raw = []
    raw_test = []
    ################ READ DATA ################################################
    
    for ser in series:
        if ser==2 and subject ==2:
            continue
        fname =  '../train/subj%d_series%d_data.csv' % (subject,ser)
        data,labels=prepare_data_train(fname)
        raw.append(data) 
        y_raw.append(labels)   

    for k in range(9,11):
        X_test = prepare_data_test('../test/subj%d_series%d_data.csv' %(subject,k))
        idx.append(np.array(X_test['id']))   
        X_test = X_test.drop(['id'],axis=1)
        raw_test.append(X_test)
    X_test = pd.concat(raw_test)
    X_train = pd.concat(raw)
    y_train = pd.concat(y_raw)
    #transform in numpy array
    #transform train data in numpy array
    X_train = np.asarray(X_train.astype(float))
    y_train = np.asarray(y_train.astype(float))
    #sequence = np.asarray(sequence)


    ################ Train classifiers ########################################
    #cv = LeaveOneLabelOut(sequence)
    pred = np.empty((X_test.shape[0],6))
    
    #apply preprocessing
    X_train = data_preprocess_train(X_train,events=labels,lpass=cut,sca=1,cpast=cpast,fbands=bands)
    X_test=data_preprocess_test(X_test,sca=1,lpass=cut,cpast=cpast,fbands=bands)
    clfs = Parallel(n_jobs=3)(delayed(fit)(X_train[::subsample,:],y_train[cpast[0]*cpast[1]:][::subsample,i]) for i in range(6))
    preds = Parallel(n_jobs=3)(delayed(predict)(clfs[i],X_test) for i in range(6))
    #print(len(test),len(preds[0]))
    pred[cpast[0]*cpast[1]:,:] = np.concatenate(preds,axis=1)
        
    pred_tot.append(pred)

ids_tot=np.concatenate(idx)
    
submission_file = 'grasp_sub_fifth.csv'
submission = pd.DataFrame(index=ids_tot,
                          columns=cols,
                          data=np.concatenate(pred_tot))
                   
submission.to_csv(submission_file,index_label='id',float_format='%.3f')

  values = values[:, slicer]
  imask = (-mask).ravel()
  result = arr_idx[key]


In [None]:
submission = pd.DataFrame(index=ids_tot,
                          columns=cols,
                          data=np.concatenate(pred_tot))
                   
submission.to_csv(submission_file,index_label='id',float_format='%.3f')

In [5]:
fn, fn2 = "../train/subj1_series1_data.csv", "../train/subj1_series2_data.csv"
data, labels = prepare_data_train(fn)


In [4]:
fn, fn2 = "../train/subj1_series1_data.csv", "../train/subj1_series2_data.csv"
data, labels = prepare_data_train(fn)

y_train = labels["FirstDigitTouch"]
X_train = data_preprocess_train(data,sca=1)
lr1 = LogisticRegression()
lr1.fit(X_train[::subsample],y_train[::subsample]) 

eval_lr(lr1,sca=1)

  "got %s" % (estimator, X.dtype))


0.74308979879279602

In [72]:
y_train2 = labels["FirstDigitTouch"][100:]
X_train2 = data_preprocess_train(data,cpast=[20,5],sca=1)
lr2 = LogisticRegression()
lr2.fit(X_train2[::subsample],y_train2[::subsample]) 

eval_lr(lr2,cpast=[20,5],sca=1)

  "got %s" % (estimator, X.dtype))


0.83923819765432472

In [75]:
band=[0.5,35]
cut = 30

y_train3 = labels["FirstDigitTouch"]
X_train3 = data_preprocess_train(data,lpass=cut,sca=1)
lr3 = LogisticRegression()
lr3.fit(X_train3[::subsample],y_train3[::subsample]) 

eval_lr(lr3,lpass=cut,sca=1)

0.75988739853284293

In [82]:
cut2 = 35

y_train4 = labels["FirstDigitTouch"][100:]
X_train4 = data_preprocess_train(data,lpass=cut2,cpast=[20,5])
lr4 = LogisticRegression()
lr4.fit(X_train4[::subsample],y_train4[::subsample]) 

eval_lr(lr4,lpass=cut2,cpast=[20,5])

0.81979610555588567

In [107]:
cut2 = 35

y_train5 = labels["FirstDigitTouch"]
X_train5 = data_preprocess_train(data,labels,lpass=cut2,csp_filt=1)
lr5 = LogisticRegression()
lr5.fit(X_train5[::subsample],y_train5[::subsample]) 

eval_lr(lr5,lpass=cut2,csp_filt=1)

0.78239142541246032

In [6]:
cut2 = 35
cpast = [19,10]

y_train6 = labels["FirstDigitTouch"][cpast[0]*cpast[1]:]
X_train6 = data_preprocess_train(data,labels,lpass=cut2,cpast=cpast,csp_filt=1)
lr6 = LogisticRegression()
lr6.fit(X_train6[::subsample],y_train6[::subsample]) 

eval_lr(lr6,lpass=cut2,cpast=cpast,csp_filt=1)

0.84497900944935445

In [None]:
eval_lr(lr5,bpass=1,cpast=1,pca=pca)

In [None]:
def eval_lr(lr,cpast=[],lpass=None,bpass=None,pca=None,sca=None,csp_filt=None):
    fns = glob("../train/subj1_series[2-9]_data.csv")
    scores = np.zeros(len(fns))
    for i, fn in enumerate(fns):
        data, labels = prepare_data_train(fn)
        y_test = labels['FirstDigitTouch']
        if len(cpast)>0:
            y_test = y_test[cpast[0]*cpast[1]:]
        X_test = data_preprocess_test(data,sca=sca,cpast=cpast,bpass=bpass,pca=pca,lpass=lpass,csp_filt=csp_filt)
        pred = lr.predict_proba(X_test)[:,1]
        
        scores[i] = roc_auc_score(y_test,pred)
        
    return np.average(scores)

In [None]:
### Preprocessing functions, should be imported from preprocessing but here just in case


def concat_past(X,interval=20,num_past=5):
    frames = []
    for i in range(num_past):
        X_trunc  = X[i*interval:-(num_past-i)*interval]
        frames.append(X_trunc)
    X_out = np.concatenate(frames,axis=1)
    return X_out


def pd_concat_past(X,interval=20,num_past=5):
    frames = []
    for i in range(num_past):
        X_trunc  = X[i*interval:-(num_past-i)*interval]
        X_trunc = X_trunc.rename(index=lambda x: x-i*interval)
        frames.append(X_trunc)
    X_out = pd.concat(frames,axis=1)
    return X_out

def butter_bandpass(lowcut, highcut, fs, order=5):
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    b, a = butter(order, [low, high], btype='band')
    return b, a


def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    y = lfilter(b, a, data,axis=0)
    return y

def butter_lowpass(highcut,fs,order=5):
    nyq = 0.5*fs
    high = highcut/nyq
    b,a = butter(order,high,btype="low")
    return b,a

def butter_lowpass_filter(data,highcut,fs,order=5):
    b, a = butter_lowpass(highcut,fs,order=order)
    y = lfilter(b,a,data,axis=0)
    return y


def creat_mne_raw_object(data, events=[]):
    """Create a mne raw instance from csv file"""

    ch_names = list(data.columns)
    
    # read EEG standard montage from mne
    montage = read_montage('standard_1005',ch_names)

    ch_type = ['eeg']*len(ch_names)
    data = np.array(data[ch_names]).T

    if len(events)>0:
        events_names =list(events.columns)
        events_data = np.array(events[events_names]).T     
        # define channel type, the first is EEG, the last 6 are stimulations
        ch_type.extend(['stim']*6)
        ch_names.extend(events_names)
        # concatenate event file and data
        data = np.concatenate((data,events_data))
        
    # create and populate MNE info structure
    info = create_info(ch_names,sfreq=500.0, ch_types=ch_type, montage=montage)
    #info['filename'] = fname
    
    # create raw object 
    raw = RawArray(data,info,verbose=False)
    
    return raw

def fit_CSP(data,events=[]):
    epochs_tot = []
    y = []
    # read and concatenate all the files
    #raw = concatenate_raws([creat_mne_raw_object(fname) for fname in fnames])
    raw = creat_mne_raw_object(data,events=events)
    # pick eeg signal
    picks = pick_types(raw.info,eeg=True)

    events = find_events(raw,stim_channel='HandStart', verbose=False)

    epochs = Epochs(raw, events, {'during' : 1}, 0, 2, proj=False,
                    picks=picks, baseline=None, preload=True,
                    add_eeg_ref=False, verbose=False)

    epochs_tot.append(epochs)
    y.extend([1]*len(epochs))
    
    epochs_rest = Epochs(raw, events, {'before' : 1}, -2, 0, proj=False,
                    picks=picks, baseline=None, preload=True,
                    add_eeg_ref=False, verbose=False)
    
    epochs_rest.times = epochs.times
    
    y.extend([-1]*len(epochs_rest))
    epochs_tot.append(epochs_rest)
        
    epochs = concatenate_epochs(epochs_tot)

    X = epochs.get_data()
    y = np.array(y)
    

    csp.fit(X,y)