In [2]:
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
import time
from scipy import signal
import regex as reg
from pyriemann.estimation import XdawnCovariances
from pyriemann.tangentspace import TangentSpace



# Pre-Processing

In [14]:
train_labels = pd.read_csv('Data/TrainLabels.csv')
submission = pd.read_csv('Data/SampleSubmission.csv')

In [15]:
Y_train = train_labels.Prediction.values
true_labels = pd.read_csv('Data/true_labels.csv', header = None)
Y_test = true_labels[0].values

In [16]:
Y_test.shape

(3400,)

In [17]:
true_labels

Unnamed: 0,0
0,1
1,1
2,0
3,1
4,0
...,...
3395,1
3396,0
3397,1
3398,1


In [18]:
submission

Unnamed: 0,IdFeedBack,Prediction
0,S01_Sess01_FB001,0
1,S01_Sess01_FB002,0
2,S01_Sess01_FB003,0
3,S01_Sess01_FB004,0
4,S01_Sess01_FB005,0
...,...,...
3395,S25_Sess05_FB096,0
3396,S25_Sess05_FB097,0
3397,S25_Sess05_FB098,0
3398,S25_Sess05_FB099,0


There are 60 feedbacks for each session, AKA, 12 5 letter words. Each feedback/letter was either a right or wrong prediction from the user. Using the EEG data, we must train a model on the tendencies within the EEG data itself, whenever a feedback was presented.

In [19]:
train_labels

Unnamed: 0,IdFeedBack,Prediction
0,S02_Sess01_FB001,1
1,S02_Sess01_FB002,1
2,S02_Sess01_FB003,0
3,S02_Sess01_FB004,0
4,S02_Sess01_FB005,1
...,...,...
5435,S26_Sess05_FB096,1
5436,S26_Sess05_FB097,0
5437,S26_Sess05_FB098,0
5438,S26_Sess05_FB099,0


Collecting all the names of the training files, and then running a loop through each file, it is imported as a DataFrame, and then turned into an array, where it is appended to the training/test set.

In [20]:
train_files = glob.glob('Data/train/Data*.csv')
test_files = glob.glob('Data/test/Data*.csv')
train_files[0:6]

['Data/train\\Data_S02_Sess01.csv',
 'Data/train\\Data_S02_Sess02.csv',
 'Data/train\\Data_S02_Sess03.csv',
 'Data/train\\Data_S02_Sess04.csv',
 'Data/train\\Data_S02_Sess05.csv',
 'Data/train\\Data_S06_Sess01.csv']

In [21]:
training_subjects = int(16)
num_of_fb = int(340)
freq = int(200)
epoch_time = 1.3
epoch = int(freq * epoch_time)
num_of_cols = int(59)
eeg_cols = int(56)

In [25]:
'''
extract_d(files)
Ingest Data by looping through files

Epoch 1.3 seconds after feedbackevent == 1 using epoch_d function

Append values to list of arrays called temp


Input: 
    files: array of string of file names (Data_S*_Sess*.csv)
Output: 
    temp: final array of appended values
'''
def extract_d(files):
    start = time.time()
    
    training_subjects = 16 #num of training subjects
    num_of_fb = 340 #num of feedbacks / subject
    freq = 200 #sampling rate
    epoch_time = 1.3 #proposed epoching time in seconds
    epoch = freq * epoch_time #epoch in indices 
    num_of_cols = int(59) 
    eeg_cols = int(56)
    b_s = int(-0.4*freq) #index where baseline starts relative to feedback (-400ms)
    b_e = int(-0.3*freq) #index where baseline ends relative to feedback (-300ms)
    order = 5 #butterworth order
    low_pass = 1 #low frequency pass for butterworth filter
    high_pass = 40 #high frequency pass for butterworth filter
    
    channels = ['Fp1', 'Fp2', 'AF7', 'AF3', 'AF4', 'AF8', 'F7', 'F5', 'F3', 'F1',
       'Fz', 'F2', 'F4', 'F6', 'F8', 'FT7', 'FC5', 'FC3', 'FC1', 'FCz',
       'FC2', 'FC4', 'FC6', 'FT8', 'T7', 'C5', 'C3', 'C1', 'Cz', 'C2',
       'C4', 'C6', 'T8', 'TP7', 'CP5', 'CP3', 'CP1', 'CPz', 'CP2', 'CP4',
       'CP6', 'TP8', 'P7', 'P5', 'P3', 'P1', 'Pz', 'P2', 'P4', 'P6', 'P8',
       'PO7', 'POz', 'P08', 'O1', 'O2']
    
    temp = np.empty((1,len(channels), 260), float)
    for i, f in enumerate(files):
        print(i,f, temp.shape)
        df = pd.read_csv(f) #read each file
        index_fb = df[df['FeedBackEvent'] == 1].index.values
        df_array = np.array(df) 
        
        #uncomment below for butterworth filter
        
        ##Bandpass
        eeg = df_array[:,1:57] #only eeg values to apply butterworth filter, dropping EOG, Time, and Feedback columns
        for i, channel in enumerate(channels): # apply butterworth channel by channel
            raw_eeg = df[channel].values
            eeg_filtered = butter_filter(order, low_pass, high_pass, freq, raw_eeg) #butterworth filter applied
            eeg[:,i] = eeg_filtered
        df = np.array(df)
        df[:,1:57] = eeg #replacing old eeg values with new ones
        ##
        
        #df = np.array(df)
        
        for j, indx in enumerate(index_fb): #epoching 260 indexes (1.3 seconds) after each stimulus
            epoch_array = eeg[indx:(indx+int(epoch)),:]
            epoch_array = epoch_array.reshape((1,int(epoch_array.shape[1]), int(epoch)))
            
            #uncomment below for baseline correction
            
            ## Baseline correction
            baseline_array = eeg[indx+b_s:indx+b_e,:] #baseline correction of 100ms (20 indexes), 400ms to 300ms before fb
            print(baseline_array.shape)
            baseline_mean = np.mean(baseline_array, axis = 0)
            baseline_mean = baseline_mean.reshape((1,int(baseline_array.shape[1]), 1))
            epoch_array = epoch_array - baseline_mean #EEG noise subtracted from epoched data
            ##
            
            if i == 0:
                temp = np.vstack((temp,epoch_array)) #stacking the first epoch
            else:
                temp = np.vstack((temp,epoch_array))
    print('Final Shape:', temp.shape)
    now = time.time()
    print('Elapsed Time: ' + str(int(now-start)) + ' seconds')
    return temp

In [26]:
def butter_filter(order, low_pass, high_pass, fs,sig):
    nyq = 0.5 * fs
    lp = low_pass / nyq
    hp = high_pass / nyq
    sos = signal.butter(order, [lp, hp], btype='band', output = 'sos')
    return signal.sosfilt(sos, sig)

In [78]:
train = extract_d(train_files)

0 Data/train\Data_S02_Sess01.csv (1, 56, 260)
1 Data/train\Data_S02_Sess02.csv (61, 56, 260)
2 Data/train\Data_S02_Sess03.csv (121, 56, 260)
3 Data/train\Data_S02_Sess04.csv (181, 56, 260)
4 Data/train\Data_S02_Sess05.csv (241, 56, 260)
5 Data/train\Data_S06_Sess01.csv (341, 56, 260)
6 Data/train\Data_S06_Sess02.csv (401, 56, 260)
7 Data/train\Data_S06_Sess03.csv (461, 56, 260)
8 Data/train\Data_S06_Sess04.csv (521, 56, 260)
9 Data/train\Data_S06_Sess05.csv (581, 56, 260)
10 Data/train\Data_S07_Sess01.csv (681, 56, 260)
11 Data/train\Data_S07_Sess02.csv (741, 56, 260)
12 Data/train\Data_S07_Sess03.csv (801, 56, 260)
13 Data/train\Data_S07_Sess04.csv (861, 56, 260)
14 Data/train\Data_S07_Sess05.csv (921, 56, 260)
15 Data/train\Data_S11_Sess01.csv (1021, 56, 260)
16 Data/train\Data_S11_Sess02.csv (1081, 56, 260)
17 Data/train\Data_S11_Sess03.csv (1141, 56, 260)
18 Data/train\Data_S11_Sess04.csv (1201, 56, 260)
19 Data/train\Data_S11_Sess05.csv (1261, 56, 260)
20 Data/train\Data_S12_Sess0

In [79]:
test = extract_d(test_files)

0 Data/test\Data_S01_Sess01.csv (1, 56, 260)
1 Data/test\Data_S01_Sess02.csv (61, 56, 260)
2 Data/test\Data_S01_Sess03.csv (121, 56, 260)
3 Data/test\Data_S01_Sess04.csv (181, 56, 260)
4 Data/test\Data_S01_Sess05.csv (241, 56, 260)
5 Data/test\Data_S03_Sess01.csv (341, 56, 260)
6 Data/test\Data_S03_Sess02.csv (401, 56, 260)
7 Data/test\Data_S03_Sess03.csv (461, 56, 260)
8 Data/test\Data_S03_Sess04.csv (521, 56, 260)
9 Data/test\Data_S03_Sess05.csv (581, 56, 260)
10 Data/test\Data_S04_Sess01.csv (681, 56, 260)
11 Data/test\Data_S04_Sess02.csv (741, 56, 260)
12 Data/test\Data_S04_Sess03.csv (801, 56, 260)
13 Data/test\Data_S04_Sess04.csv (861, 56, 260)
14 Data/test\Data_S04_Sess05.csv (921, 56, 260)
15 Data/test\Data_S05_Sess01.csv (1021, 56, 260)
16 Data/test\Data_S05_Sess02.csv (1081, 56, 260)
17 Data/test\Data_S05_Sess03.csv (1141, 56, 260)
18 Data/test\Data_S05_Sess04.csv (1201, 56, 260)
19 Data/test\Data_S05_Sess05.csv (1261, 56, 260)
20 Data/test\Data_S08_Sess01.csv (1361, 56, 260)

In [80]:
np.save('Data/X_epochs_train(bs).npy',train[1:,:,:])
np.save('Data/X_epochs_test(bs).npy',test[1:,:,:])

In [27]:
train = np.load('Data/X_epochs_train(bs).npy')
test = np.load('Data/X_epochs_test(bs).npy')

In [28]:
print(train.shape)
print(test.shape)

(5440, 56, 260)
(3400, 56, 260)


Apply 5th filter XdawnCovariance, and then tangent space to convert from reimann model to eucilidean space

In [29]:
Y_train = train_labels.Prediction.values

In [30]:
XC= XdawnCovariances(nfilter=5)
X_train = XC.fit_transform(train, Y_train)
X_test = XC.transform(test)

X_train = TangentSpace(metric='riemann').fit_transform(X_train, y = Y_train)
X_test = TangentSpace(metric='riemann').transform(X_test)

In [31]:
X_train

array([[ 0.19073496, -0.07466847, -0.19129241, ..., -0.20151839,
         0.08601067, -0.189216  ],
       [ 0.15148   , -0.03674552,  0.01863182, ..., -0.43904765,
        -0.04368968,  0.19220105],
       [ 0.25723319,  0.05925857, -0.07264738, ..., -0.43044296,
        -0.13808513, -0.38827351],
       ...,
       [ 0.17275684, -0.02291625, -0.0772277 , ...,  0.25046887,
         0.04403163,  0.4463929 ],
       [ 0.26628135,  0.002303  , -0.03140624, ...,  0.33832585,
        -0.24549856,  0.04278776],
       [ 0.21824943, -0.03334318,  0.04486274, ...,  0.24278182,
        -0.04612597,  0.54603802]])

In [32]:
X_test

array([[-0.32261308, -0.14771117,  0.14257296, ...,  3.50834738,
        -0.70118841,  3.33885254],
       [-0.87957478, -0.22925568,  0.2134341 , ...,  3.00364927,
        -0.12668068,  3.67561133],
       [-0.78168907,  0.07258029, -0.14375001, ...,  4.83669489,
         0.18522625,  5.30737553],
       ...,
       [-0.27090926, -0.02230058,  0.25668129, ...,  2.10703304,
        -0.09521843,  2.17275406],
       [-1.1621088 , -0.14258795,  0.28117237, ...,  3.34071485,
         0.35123836,  3.05871719],
       [-0.47244632,  0.09273379,  0.12204881, ...,  2.76882907,
         0.36561909,  2.698652  ]])

In [33]:
print('X_train shape: ', X_train.shape)
print('X_test shape: ', X_test.shape)

X_train shape:  (5440, 210)
X_test shape:  (3400, 210)


In [93]:
np.save('Data/X_train_final(bs).npy',X_train)
np.save('Data/X_test_final(bs).npy',X_test)