In [1]:
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
import time
from scipy import signal
import regex as reg
from pyriemann.estimation import XdawnCovariances
from pyriemann.tangentspace import TangentSpace



# Pre-Processing

things to do: ICA filtering to remove artifacts, butterworth filter, epoching ...

In [2]:
train_labels = pd.read_csv('Data/TrainLabels.csv')
submission = pd.read_csv('Data/SampleSubmission.csv')

In [3]:
submission

Unnamed: 0,IdFeedBack,Prediction
0,S01_Sess01_FB001,0
1,S01_Sess01_FB002,0
2,S01_Sess01_FB003,0
3,S01_Sess01_FB004,0
4,S01_Sess01_FB005,0
...,...,...
3395,S25_Sess05_FB096,0
3396,S25_Sess05_FB097,0
3397,S25_Sess05_FB098,0
3398,S25_Sess05_FB099,0


There are 60 feedbacks for each session, AKA, 12 5 letter words. Each feedback/letter was either a right or wrong prediction from the user. Using the EEG data, we must train a model on the tendencies within the EEG data itself, whenever a feedback was presented.

In [4]:
train_labels

Unnamed: 0,IdFeedBack,Prediction
0,S02_Sess01_FB001,1
1,S02_Sess01_FB002,1
2,S02_Sess01_FB003,0
3,S02_Sess01_FB004,0
4,S02_Sess01_FB005,1
...,...,...
5435,S26_Sess05_FB096,1
5436,S26_Sess05_FB097,0
5437,S26_Sess05_FB098,0
5438,S26_Sess05_FB099,0


Collecting all the names of the training files, and then running a loop through each file, it is imported as a DataFrame, and then turned into an array, where it is appended to the training/test set.

In [5]:
train_files = glob.glob('Data/train/Data*.csv')
test_files = glob.glob('Data/test/Data*.csv')
train_files[0:6]

['Data/train\\Data_S02_Sess01.csv',
 'Data/train\\Data_S02_Sess02.csv',
 'Data/train\\Data_S02_Sess03.csv',
 'Data/train\\Data_S02_Sess04.csv',
 'Data/train\\Data_S02_Sess05.csv',
 'Data/train\\Data_S06_Sess01.csv']

In [7]:
'''
extract_d(files)
Ingest Data by looping through files

Epoch 1.3 seconds after feedbackevent == 1 using epoch_d function

Append values to list of arrays called temp


Input: 
    files: array of string of file names (Data_S*_Sess*.csv)
Output: 
    temp: final array of appended values
'''
def extract_d(files):
    start = time.time()
    
    freq = 200
    epoch_time = 1.3
    epoch = freq * epoch_time
    temp = np.empty((1,260,59), float)
    for i, f in enumerate(files):
        print(i,f, temp.shape)
        df = pd.read_csv(f)
        #using regex to extract subject and session numbers
        [(subject, session)] = reg.findall('Data/.+S(\d\d).+s(\d\d)',f)
        #df.loc[:,'Subject'] = subject
        #df.loc[:,'Session'] = session
        index_fb = df[df['FeedBackEvent'] == 1].index.values
        df = np.array(df)
        for j, indx in enumerate(index_fb):
            epoch_array = df[indx:(indx+int(epoch)),:]
            #feedback_col = np.ones((epoch_array.shape[0],1)) + j
            #epoch_array = np.append(epoch_array, feedback_col, 1)
            #print(indx, epoch_array.shape)
            epoch_array = epoch_array.reshape((1,int(epoch),int(epoch_array.shape[1])))
            if i == 0:
                temp = np.vstack((temp,epoch_array))
                #temp[0] = epoch_array
            else:
                temp = np.vstack((temp,epoch_array))
                
    now = time.time()
    print('Elapsed Time: ' + str(int(now-start)) + ' seconds')
    return temp

In [48]:
def butter_filter(order, low_pass, high_pass, fs,sig):
    nyq = 0.5 * fs
    lp = low_pass / nyq
    hp = high_pass / nyq
    sos = signal.butter(order, [lp, hp], btype='band', output = 'sos')
    return signal.sosfilt(sos, sig)

In [35]:
train = extract_d(train_files)

0 Data/train\Data_S02_Sess01.csv (1, 260, 59)
1 Data/train\Data_S02_Sess02.csv (61, 260, 59)
2 Data/train\Data_S02_Sess03.csv (121, 260, 59)
3 Data/train\Data_S02_Sess04.csv (181, 260, 59)
4 Data/train\Data_S02_Sess05.csv (241, 260, 59)
5 Data/train\Data_S06_Sess01.csv (341, 260, 59)
6 Data/train\Data_S06_Sess02.csv (401, 260, 59)
7 Data/train\Data_S06_Sess03.csv (461, 260, 59)
8 Data/train\Data_S06_Sess04.csv (521, 260, 59)
9 Data/train\Data_S06_Sess05.csv (581, 260, 59)
10 Data/train\Data_S07_Sess01.csv (681, 260, 59)
11 Data/train\Data_S07_Sess02.csv (741, 260, 59)
12 Data/train\Data_S07_Sess03.csv (801, 260, 59)
13 Data/train\Data_S07_Sess04.csv (861, 260, 59)
14 Data/train\Data_S07_Sess05.csv (921, 260, 59)
15 Data/train\Data_S11_Sess01.csv (1021, 260, 59)
16 Data/train\Data_S11_Sess02.csv (1081, 260, 59)
17 Data/train\Data_S11_Sess03.csv (1141, 260, 59)
18 Data/train\Data_S11_Sess04.csv (1201, 260, 59)
19 Data/train\Data_S11_Sess05.csv (1261, 260, 59)
20 Data/train\Data_S12_Sess0

In [37]:
test = extract_d(test_files)

0 Data/test\Data_S01_Sess01.csv (1, 260, 59)
1 Data/test\Data_S01_Sess02.csv (61, 260, 59)
2 Data/test\Data_S01_Sess03.csv (121, 260, 59)
3 Data/test\Data_S01_Sess04.csv (181, 260, 59)
4 Data/test\Data_S01_Sess05.csv (241, 260, 59)
5 Data/test\Data_S03_Sess01.csv (341, 260, 59)
6 Data/test\Data_S03_Sess02.csv (401, 260, 59)
7 Data/test\Data_S03_Sess03.csv (461, 260, 59)
8 Data/test\Data_S03_Sess04.csv (521, 260, 59)
9 Data/test\Data_S03_Sess05.csv (581, 260, 59)
10 Data/test\Data_S04_Sess01.csv (681, 260, 59)
11 Data/test\Data_S04_Sess02.csv (741, 260, 59)
12 Data/test\Data_S04_Sess03.csv (801, 260, 59)
13 Data/test\Data_S04_Sess04.csv (861, 260, 59)
14 Data/test\Data_S04_Sess05.csv (921, 260, 59)
15 Data/test\Data_S05_Sess01.csv (1021, 260, 59)
16 Data/test\Data_S05_Sess02.csv (1081, 260, 59)
17 Data/test\Data_S05_Sess03.csv (1141, 260, 59)
18 Data/test\Data_S05_Sess04.csv (1201, 260, 59)
19 Data/test\Data_S05_Sess05.csv (1261, 260, 59)
20 Data/test\Data_S08_Sess01.csv (1361, 260, 59)

In [39]:
np.save('Data/X_epochs_train.npy',train[1:,:,:])
np.save('Data/X_epochs_test.npy',test[1:,:,:])

In [40]:
train = np.load('Data/X_epochs_train.npy')
test = np.load('Data/X_epochs_test.npy')

After epoching shape of train and test data, no other preprocessing done yet

In [41]:
print(train.shape)
print(test.shape)

(5440, 260, 59)
(3400, 260, 59)


In [44]:
EEG_train.reshape(5440*260, 56)

array([[ 830.677222,  979.638619,  847.257758, ...,  932.304475,
         750.347476,  969.756009],
       [ 882.209104, 1030.107902,  913.80757 , ..., 1000.885015,
         828.234082, 1158.873289],
       [ 838.759996,  984.945818,  848.060414, ...,  946.093298,
         754.041038, 1171.585312],
       ...,
       [  92.333956,  193.835793,  -32.235108, ...,  252.57769 ,
          -1.893835,  233.479999],
       [ 153.879763,  261.568258,  115.160815, ...,  339.478242,
          99.196563,  296.020809],
       [  96.201981,  177.913618,   40.380734, ...,  247.905035,
          -4.242856,  213.572852]])

Dropping EOG, Time, and FeedBackEvent columns, and reshaping EEG data into

In [46]:
EEG_train = train[:,:,1:57].reshape(5440*260, 56)
EEG_test = test[:,:,1:57].reshape(3400*260, 56)

In [50]:
print(EEG_train.shape)
print(EEG_test.shape)

(1414400, 56)
(884000, 56)


In [49]:
order = 5
low_pass = 1
high_pass = 40
fs = 200
train_filtered = butter_filter(order, low_pass, high_pass, fs, EEG_train)
test_filtered = butter_filter(order, low_pass, high_pass, fs, EEG_test)

In [55]:
print(train_filtered.shape)
print(test_filtered.shape)

(1414400, 56)
(884000, 56)


In [56]:
train_filtered = train_filtered.reshape(5440, 260, 56)
test_filtered = test_filtered.reshape(3400, 260, 56)

In [57]:
print(train_filtered.shape)
print(test_filtered.shape)

(5440, 260, 56)
(3400, 260, 56)


Apply 5th filter XdawnCovariance, and then tangent space to convert from reimann model to eucilidean space

In [None]:
Y_train = train_labels.Prediction.values

In [60]:
XC= XdawnCovariances(nfilter=5)
X_train = XC.fit_transform(train_filtered, Y_train)
X_train = TangentSpace(metric='riemann').fit_transform(X_train, y = Y_train)

In [63]:
X_test = XC.transform(test_filtered)
X_test = TangentSpace(metric='riemann').transform(X_test)

In [64]:
print('X_train shape: ', X_train.shape)
print('X_test shape: ', X_test.shape)

X_train shape:  (5440, 210)
X_test shape:  (3400, 210)


In [65]:
np.save('Data/X_train_final.npy',X_train)
np.save('Data/X_test_final.npy',X_test)