In [108]:
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
import time
from scipy import signal
import regex as reg
from pyriemann.estimation import XdawnCovariances
from pyriemann.tangentspace import TangentSpace

# Pre-Processing

things to do: ICA filtering to remove artifacts, butterworth filter, epoching ...

In [29]:
train_labels = pd.read_csv('Data/TrainLabels.csv')
submission = pd.read_csv('Data/SampleSubmission.csv')

In [3]:
submission

Unnamed: 0,IdFeedBack,Prediction
0,S01_Sess01_FB001,0
1,S01_Sess01_FB002,0
2,S01_Sess01_FB003,0
3,S01_Sess01_FB004,0
4,S01_Sess01_FB005,0
...,...,...
3395,S25_Sess05_FB096,0
3396,S25_Sess05_FB097,0
3397,S25_Sess05_FB098,0
3398,S25_Sess05_FB099,0


There are 60 feedbacks for each session, AKA, 12 5 letter words. Each feedback/letter was either a right or wrong prediction from the user. Using the EEG data, we must train a model on the tendencies within the EEG data itself, whenever a feedback was presented.

In [4]:
train_labels

Unnamed: 0,IdFeedBack,Prediction
0,S02_Sess01_FB001,1
1,S02_Sess01_FB002,1
2,S02_Sess01_FB003,0
3,S02_Sess01_FB004,0
4,S02_Sess01_FB005,1
...,...,...
5435,S26_Sess05_FB096,1
5436,S26_Sess05_FB097,0
5437,S26_Sess05_FB098,0
5438,S26_Sess05_FB099,0


Collecting all the names of the training files, and then running a loop through each file, it is imported as a DataFrame, and then turned into an array, where it is appended to the training/test set.

In [43]:
train_files = glob.glob('Data/train/Data*.csv')
test_files = glob.glob('Data/test/Data*.csv')
train_files[0:6]

['Data/train\\Data_S02_Sess01.csv',
 'Data/train\\Data_S02_Sess02.csv',
 'Data/train\\Data_S02_Sess03.csv',
 'Data/train\\Data_S02_Sess04.csv',
 'Data/train\\Data_S02_Sess05.csv',
 'Data/train\\Data_S06_Sess01.csv']

In [128]:
training_subjects = int(16)
num_of_fb = int(340)
freq = int(200)
epoch_time = 1.3
epoch = int(freq * epoch_time)
num_of_cols = int(59)
eeg_cols = int(56)

In [37]:
200*1.3

260.0

In [109]:
'''
extract_d(files)
Ingest Data by looping through files

Epoch 1.3 seconds after feedbackevent == 1 using epoch_d function

Append values to list of arrays called temp


Input: 
    files: array of string of file names (Data_S*_Sess*.csv)
Output: 
    temp: final array of appended values
'''
def extract_d(files):
    start = time.time()
    
    training_subjects = 16 #num of training subjects
    num_of_fb = 340 #num of feedbacks / subject
    freq = 200 #sampling rate
    epoch_time = 1.3 #proposed epoching time in seconds
    epoch = freq * epoch_time #epoch in indices 
    num_of_cols = int(59) 
    eeg_cols = int(56)
    b_s = int(-0.4*freq) #index where baseline starts relative to feedback
    b_e = int(-0.3*freq) #index where baseline ends relative to feedback
    order = 5 #butterworth order
    low_pass = 1 #low frequency pass for butterworth filter
    high_pass = 40 #high frequency pass for butterworth filter
    
    temp = np.empty((1,260,num_of_cols), float)
    for i, f in enumerate(files):
        print(i,f, temp.shape)
        df = pd.read_csv(f) #read each file
        index_fb = df[df['FeedBackEvent'] == 1].index.values
        df = np.array(df) 
        print(df.shape)
        eeg = df[:,1:57] #only eeg values
        eeg_filtered = butter_filter(order, low_pass, high_pass, freq, eeg) #butterworth filter applied
        df[:,1:57] = eeg_filtered
        for j, indx in enumerate(index_fb): #epoching 260 indexes (1.3 seconds) after each stimulus
            epoch_array = df[indx:(indx+int(epoch)),:]
            baseline_array = df[indx+b_s:indx+b_e,:] #baseline correction of 100ms (20 indexes), 400ms to 300ms before fb
            
            epoch_array = epoch_array.reshape((1,int(epoch),int(epoch_array.shape[1])))
            baseline_array = baseline_array.reshape((1,20,int(baseline_array.shape[1])))
            
            baseline_mean = np.mean(baseline_array, axis = 1) #noise subtracted from epoched data
            
            epoch_array = epoch_array - baseline_mean
            if i == 0:
                temp = np.vstack((temp,epoch_array)) #stacking the first epoch
            else:
                temp = np.vstack((temp,epoch_array))
                
    now = time.time()
    print('Elapsed Time: ' + str(int(now-start)) + ' seconds')
    return temp

In [110]:
def butter_filter(order, low_pass, high_pass, fs,sig):
    nyq = 0.5 * fs
    lp = low_pass / nyq
    hp = high_pass / nyq
    sos = signal.butter(order, [lp, hp], btype='band', output = 'sos')
    return signal.sosfilt(sos, sig)

In [112]:
train = extract_d(train_files)

0 Data/train\Data_S02_Sess01.csv (1, 260, 59)
(132001, 59)
1 Data/train\Data_S02_Sess02.csv (61, 260, 59)
(128001, 59)
2 Data/train\Data_S02_Sess03.csv (121, 260, 59)
(127001, 59)
3 Data/train\Data_S02_Sess04.csv (181, 260, 59)
(128001, 59)
4 Data/train\Data_S02_Sess05.csv (241, 260, 59)
(196001, 59)
5 Data/train\Data_S06_Sess01.csv (341, 260, 59)
(132001, 59)
6 Data/train\Data_S06_Sess02.csv (401, 260, 59)
(132601, 59)
7 Data/train\Data_S06_Sess03.csv (461, 260, 59)
(132601, 59)
8 Data/train\Data_S06_Sess04.csv (521, 260, 59)
(132001, 59)
9 Data/train\Data_S06_Sess05.csv (581, 260, 59)
(196001, 59)
10 Data/train\Data_S07_Sess01.csv (681, 260, 59)
(134401, 59)
11 Data/train\Data_S07_Sess02.csv (741, 260, 59)
(136001, 59)
12 Data/train\Data_S07_Sess03.csv (801, 260, 59)
(136001, 59)
13 Data/train\Data_S07_Sess04.csv (861, 260, 59)
(135001, 59)
14 Data/train\Data_S07_Sess05.csv (921, 260, 59)
(203001, 59)
15 Data/train\Data_S11_Sess01.csv (1021, 260, 59)
(145001, 59)
16 Data/train\Data_S

In [113]:
test = extract_d(test_files)

0 Data/test\Data_S01_Sess01.csv (1, 260, 59)
(127401, 59)
1 Data/test\Data_S01_Sess02.csv (61, 260, 59)
(120801, 59)
2 Data/test\Data_S01_Sess03.csv (121, 260, 59)
(120801, 59)
3 Data/test\Data_S01_Sess04.csv (181, 260, 59)
(123001, 59)
4 Data/test\Data_S01_Sess05.csv (241, 260, 59)
(194001, 59)
5 Data/test\Data_S03_Sess01.csv (341, 260, 59)
(138001, 59)
6 Data/test\Data_S03_Sess02.csv (401, 260, 59)
(131001, 59)
7 Data/test\Data_S03_Sess03.csv (461, 260, 59)
(131001, 59)
8 Data/test\Data_S03_Sess04.csv (521, 260, 59)
(132001, 59)
9 Data/test\Data_S03_Sess05.csv (581, 260, 59)
(218001, 59)
10 Data/test\Data_S04_Sess01.csv (681, 260, 59)
(128001, 59)
11 Data/test\Data_S04_Sess02.csv (741, 260, 59)
(128801, 59)
12 Data/test\Data_S04_Sess03.csv (801, 260, 59)
(130001, 59)
13 Data/test\Data_S04_Sess04.csv (861, 260, 59)
(129001, 59)
14 Data/test\Data_S04_Sess05.csv (921, 260, 59)
(193201, 59)
15 Data/test\Data_S05_Sess01.csv (1021, 260, 59)
(133401, 59)
16 Data/test\Data_S05_Sess02.csv (10

In [114]:
np.save('Data/X_epochs_train(bs_bw).npy',train[1:,:,:])
np.save('Data/X_epochs_test.npy(bs_bw)',test[1:,:,:])

In [115]:
train = np.load('Data/X_epochs_train.npy')
test = np.load('Data/X_epochs_test.npy')

In [116]:
test.shape

(3400, 260, 59)

In [131]:
train = np.reshape(train, (5440, num_of_cols, epoch))
test = np.reshape(test, (3400, num_of_cols, epoch))

In [132]:
train.shape

(5440, 59, 260)

After epoching shape of train and test data, no other preprocessing done yet

In [133]:
print(train.shape)
print(test.shape)

(5440, 59, 260)
(3400, 59, 260)


Dropping EOG, Time, and FeedBackEvent columns, and reshaping EEG data into

In [134]:
EEG_train = train[:,1:57,:].reshape(5440*epoch, eeg_cols)
EEG_test = test[:,1:57,:].reshape(3400*epoch, eeg_cols)

In [135]:
print(EEG_train.shape)
print(EEG_test.shape)

(1414400, 56)
(884000, 56)


Reshape back to apply XdawnCovariance

In [136]:
train_filtered = EEG_train.reshape(5440, int(eeg_cols), int(epoch))
test_filtered = EEG_test.reshape(3400, int(eeg_cols), int(epoch))

In [137]:
print(train_filtered.shape)
print(test_filtered.shape)

(5440, 56, 260)
(3400, 56, 260)


Apply 5th filter XdawnCovariance, and then tangent space to convert from reimann model to eucilidean space

In [138]:
Y_train = train_labels.Prediction.values

In [139]:
XC= XdawnCovariances(nfilter=5)
X_train = XC.fit_transform(train_filtered, Y_train)
X_test = XC.transform(test_filtered)
X_train = TangentSpace(metric='riemann').fit_transform(X_train, y = Y_train)
X_test = TangentSpace(metric='riemann').transform(X_test)

In [140]:
X_train

array([[-0.3052637 , -0.03078197, -0.47191025, ...,  2.1746006 ,
         0.08279183,  2.19204785],
       [-0.26662873, -0.01810491, -0.47822838, ...,  2.23375307,
         0.09457055,  2.21326101],
       [-0.25089669, -0.05888838, -0.50475629, ...,  2.20956996,
         0.13514079,  2.19849806],
       ...,
       [-0.94649855, -0.06894671, -0.084074  , ...,  2.67473811,
         0.36364172,  2.17976418],
       [-0.77543791, -0.01144206, -0.12651094, ...,  2.68098828,
         0.35218516,  2.19277779],
       [-0.85746837,  0.03984905, -0.02320009, ...,  2.69008051,
         0.31847183,  2.23020558]])

In [22]:
X_train

array([[ 0.68029903, -0.05177231,  0.08244949, ...,  1.45567708,
         0.2645651 ,  0.02435095],
       [ 0.45721737,  0.16505541,  0.5825956 , ...,  1.34716367,
         0.85486182, -0.13663167],
       [ 0.46666901, -0.14740266,  0.26084896, ...,  1.42327569,
        -0.15015616, -1.34352809],
       ...,
       [-0.46727637,  0.2491673 , -1.1063111 , ...,  1.92021248,
         1.4446069 ,  2.03622266],
       [ 0.1490003 , -0.14054646, -0.85591812, ...,  2.14846434,
         0.02407762,  2.46802044],
       [-0.07702727,  0.19267916, -1.12229464, ...,  1.93536439,
         1.1099897 ,  1.41722724]])

In [141]:
X_test

array([[ 6.71421172,  0.01426299,  0.13618461, ..., 10.68567618,
         0.31619202,  9.65583231],
       [ 6.72126797,  0.0513897 ,  0.0909533 , ..., 10.55535183,
         0.37926217,  9.51646688],
       [ 6.64475402,  0.06201443,  0.186526  , ..., 10.4098938 ,
         0.29485153,  9.41886387],
       ...,
       [ 4.90717174,  0.26493555,  0.47055065, ..., 11.02948111,
         0.18070586, 10.41510942],
       [ 4.85112995,  0.23628849,  0.4607396 , ..., 11.10800369,
         0.17243078, 10.51230865],
       [ 4.89261389,  0.24387666,  0.52946291, ..., 11.07268492,
         0.16222918, 10.48461482]])

In [21]:
X_test

array([[-4.47828769, -1.06996745,  0.1476379 , ...,  0.34029365,
         0.67616555,  0.50864833],
       [-5.00791152, -0.20163138,  0.13796314, ...,  1.51407949,
        -0.79294735,  0.65056717],
       [-4.79634856,  0.26443099,  0.27299077, ..., -0.7216058 ,
        -0.30756265,  1.68098416],
       ...,
       [-3.80145331,  0.51697185,  0.21389248, ...,  0.10877602,
        -0.3098083 ,  1.80046052],
       [-3.90641219,  0.19644335,  0.80647617, ...,  0.71042944,
         1.21563236,  1.20049132],
       [-3.48413773, -0.06080935,  0.06308115, ...,  0.29101443,
        -0.09743197,  1.03960154]])

In [142]:
print('X_train shape: ', X_train.shape)
print('X_test shape: ', X_test.shape)

X_train shape:  (5440, 210)
X_test shape:  (3400, 210)


In [143]:
np.save('Data/X_train_final(bs).npy',X_train)
np.save('Data/X_test_final(bs).npy',X_test)