# Classifying Events with:
* Logistic Regression
* Random Forests
* SVM
* Naive Bayes



In [None]:
#Use mne XDawn for preprocessing

import numpy as np
import matplotlib.pyplot as plt
import matplotlib

from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler

import mne
from mne import io, pick_types, read_events, Epochs
# from mne.datasets import sample
from mne.preprocessing import Xdawn
from mne.decoding import Vectorizer
from mne.viz import tight_layout

import os
import os.path as op

# Notebook-wide plotting defaults: large figures and a readable font size.
matplotlib.rcParams.update({
    'figure.figsize': (15.0, 10.0),
    'font.size': 15,
})

# Root folder holding the data for each run of each subject.
# Raw strings keep the Windows backslashes literal: "\e" and "\i" are not
# valid escape sequences and trigger a warning in ordinary string literals.
drive_data_path = r'E:\eeg_data'

# Folder on the external HDD that holds, per subject, one file containing
# all of that subject's runs.
drive_all_data_path = r'E:\eeg_data\ica_140_500_0.1'

In [None]:
# Load the "all runs" epochs files of five randomly chosen subjects.

# Draw five subject numbers from 1-24 (randint excludes the upper bound).
# NOTE(review): randint samples with replacement, so the same subject can be
# drawn more than once -- confirm that this is acceptable.
np.random.seed(41)
numb_subj = np.random.randint(1, 25, 5)
print("subject number used for classification:",numb_subj)

# One epochs file name per drawn subject, kept as a NumPy array.
files = np.array(['subject' + str(subj) + '_all_runs-epo.fif'
                  for subj in numb_subj])

# Read each file from the all-runs folder with its data preloaded.
all_data = [
    mne.read_epochs(op.join(drive_all_data_path, fname), preload=True)
    for fname in files
]

In [None]:
# For every subject, keep only the epochs whose event id (last column of
# `events`) is 4 or 5.
all_epochs = [
    subj_epochs[np.isin(subj_epochs.events[:, -1], (4, 5))]
    for subj_epochs in all_data
]

In [None]:
# Concatenate the per-subject entries of `all_epochs` into one epochs object.
epochs = mne.concatenate_epochs(all_epochs)

In [None]:
# Keep only the first 60 channels, by their position in `ch_names`.
epochs = epochs.pick_channels(epochs.ch_names[:60])

In [None]:
# Classification pipeline: Xdawn with 3 components, flatten each epoch into a
# feature vector, min-max scale the features, then fit an L1-penalised
# logistic regression.
# The solver is named explicitly because penalty='l1' is only supported by
# some solvers: scikit-learn's current default ('lbfgs') rejects it, while
# 'liblinear' (the default in older releases) supports it.
clf = make_pipeline(Xdawn(n_components = 3),
                    Vectorizer(),
                    MinMaxScaler(),
                    LogisticRegression(penalty='l1', solver='liblinear'))
# Stratified, shuffled 10-fold cross-validator with a fixed seed.
cv = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)


# Cross-validation: each epoch is predicted once, by the pipeline fitted on
# the folds that do not contain it.
labels = epochs.events[:,-1]
preds = np.empty(len(labels))

for train, test in cv.split(epochs, labels):
    clf.fit(epochs[train], labels[train])
    preds[test] = clf.predict(epochs[test])

# NOTE(review): the report attaches these names to the sorted label values,
# i.e. presumably 4 -> 'new' and 5 -> 'scramble' -- confirm the event coding.
target_names = ['new', 'scramble']
report = classification_report(labels, preds, target_names=target_names)
print(report)

In [None]:
# Confusion matrix with every row (true class) rescaled so it sums to 1.
cm = confusion_matrix(labels, preds)
cm_normalized = cm.astype(float) / cm.sum(axis=1, keepdims=True)

In [None]:
# Plot the row-normalised confusion matrix as a heat map.
plt.imshow(cm_normalized, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Normalized Confusion matrix', fontsize = 35)
plt.colorbar()
tick_marks = np.arange(len(target_names))
plt.xticks(tick_marks, target_names, rotation=45, fontsize = 20)
plt.yticks(tick_marks, target_names, fontsize = 20)
plt.ylabel('True label', fontsize = 35)
plt.xlabel('Predicted label',fontsize = 35)
# Adjust the layout only after every label has been set, so the large axis
# labels are taken into account when the margins are computed.
tight_layout()
plt.show()

In [None]:
# NOTE: `test` and the fitted `clf` are whatever the last iteration of the
# cross-validation loop left behind, so this scores the final fold only and
# is not the cross-validated score.
clf.score(epochs[test],labels[test])

## More tests by sampling more files

In [None]:
# Draw 10 random samples of 5 subject indices each (one row per sample).
# NOTE(review): np.random.choice(25, 5) draws from 0-24 with replacement, so
# a row can repeat an index and can contain 0 -- confirm this matches the
# subject numbering used in the file names.
np.random.seed(42)
subjects = np.array([np.random.choice(25, 5) for _ in range(10)])
print(subjects)

In [None]:
def read_file(path, file_name):
    """Read an epochs file from disk with its data preloaded.
    path: folder that contains the file
    file_name: name of the epochs file inside `path`

    Returns the result of mne.read_epochs for the joined path.
    """
    full_path = op.join(path, file_name)
    return mne.read_epochs(full_path, preload=True)

In [None]:
def filter_new_scrambled(epochs):
    """Keep only the epochs whose event id is 4 or 5.
    epochs: mne.epochs.EpochsFIF object

    The event id is read from the last column of `epochs.events`; the
    selection is returned and the input is indexed, not modified here.
    """
    event_ids = epochs.events[:, -1]
    keep = np.isin(event_ids, (4, 5))
    return epochs[keep]

In [None]:
# Load the "all runs" epochs file of subject 1 from the all-runs folder.
epochs = read_file(drive_all_data_path,'subject1_all_runs-epo.fif');

In [None]:
# Keep only the epochs selected by filter_new_scrambled.
epochs = filter_new_scrambled(epochs)
# Classification fails when all 61 channels are kept (cause unknown); any
# smaller number of channels seems to work, so only the first 60 are used.
epochs = epochs.pick_channels(epochs.ch_names[:60])

## Trying to classify one *all runs* file
* using 60 channels

In [None]:
# Classification pipeline: Xdawn with 3 components, flatten each epoch into a
# feature vector, min-max scale the features, then fit an L2-penalised
# logistic regression.
clf = make_pipeline(Xdawn(n_components = 3),
                    Vectorizer(),
                    MinMaxScaler(),
                    LogisticRegression(penalty='l2'))
# Stratified, shuffled 10-fold cross-validator with a fixed seed.
cv = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42)


# Cross-validation: each epoch is predicted once, by the pipeline fitted on
# the folds that do not contain it.
# Labels are the event ids stored in the last column of `events`.
labels = epochs.events[:,-1]
# Filled fold by fold below; every index is written exactly once.
preds = np.empty(len(labels))

for train, test in cv.split(epochs, labels):
    clf.fit(epochs[train], labels[train])
    preds[test] = clf.predict(epochs[test])

# NOTE(review): the report attaches these names to the sorted label values,
# i.e. presumably 4 -> 'new' and 5 -> 'scramble' -- confirm the event coding.
target_names = ['new', 'scramble']
report = classification_report(labels, preds, target_names=target_names)
print(report)

In [None]:
# NOTE: `test` and the fitted `clf` are whatever the last iteration of the
# cross-validation loop left behind, so this scores the final fold only and
# is not the cross-validated score.
clf.score(epochs[test],labels[test])

# Trying to classify the same file but with one channel
* How will classification perform if only one channel is used?
* I will choose one random channel.

In [None]:
# Seed the generator so the channel choice is reproducible, then draw one
# channel index from 0-59 (the 60 channels kept earlier).
np.random.seed(42)
chan_numb = np.random.choice(60)

In [None]:
# Display the index of the randomly chosen channel.
chan_numb

In [None]:
# Keep only the randomly chosen channel (a one-element slice of `ch_names`).
# The result is not reassigned, so this presumably modifies `epochs` in
# place -- confirm against the MNE version in use.
epochs.pick_channels(epochs.ch_names[chan_numb:chan_numb+1])

# Observations of the plots
* Looking at 200ms sections of each plot and trying to see if there are clear differences between the events.
* At 600-800 ms, all panels except the bottom-left one show large differences between the two traces.

In [None]:
# Overlay pairs of epochs, one pair per panel of a 2x2 grid, using the first
# channel of the data. Each trace is labelled with its event id.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(25, 15))

fig.suptitle("Plots of Pairs of New and Scrambled Events of Channel {}".format(epochs.ch_names[-1]), fontsize=20)

# Epoch indices drawn in each panel, listed in row-major panel order.
epoch_pairs = [(0, 1), (3, 4), (5, 6), (8, 9)]

for ax, pair in zip(axs.flat, epoch_pairs):
    for idx in pair:
        ax.plot(epochs._data[idx][0], label=epochs.events[idx][-1])
    ax.legend()

## Convert the epoched data of one channel into a pandas dataframe

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Pre-allocate one row per epoch and one column per time sample. The column
# count is read from the epochs instead of being hard-coded (it was 1401), so
# the cell keeps working if the epoch length changes.
np_epochs = np.empty(shape = (len(epochs), len(epochs.times)))

In [None]:
# Inspect the data array of the first epoch.
epochs[0]._data

In [None]:
# Copy every epoch's samples into its row of the pre-allocated array with one
# vectorised assignment. This uses the public get_data() accessor and avoids
# building a one-epoch copy of `epochs` on every loop iteration.
# The reshape flattens each epoch to a single row; with the one remaining
# channel that row has exactly one value per time sample.
np_epochs[:] = epochs.get_data().reshape(len(epochs), -1)
    
# pd.DataFrame(epochs._data.reshape(-1,1))

In [None]:
# Build a DataFrame with one row per epoch and one column per time sample,
# then append each epoch's event id as the last column, 'event'.
epoch_df = pd.DataFrame(np_epochs).assign(event=epochs.events[:, -1])

In [None]:
# Preview the first rows: one column per time sample plus the 'event' column.
epoch_df.head()

In [None]:
# X holds every column except the last one, i.e. the per-sample values.
X = epoch_df.iloc[:,:-1].values
# y holds the event ids as a 1-D array. Selecting the column with -1 rather
# than the slice -1: avoids an (n, 1) column vector, which scikit-learn
# estimators warn about when it is passed as the target.
y = epoch_df.iloc[:, -1].values
# Hold out 30% of the epochs for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = .30, random_state = 42)

In [None]:
# L1-penalised logistic regression on the per-sample features.
# The solver is named explicitly because penalty='l1' is rejected by
# scikit-learn's current default solver ('lbfgs'); 'liblinear' supports it.
clf = LogisticRegression(penalty='l1', solver='liblinear')
# Fit on the training split.
clf.fit(X_train,y_train)

In [None]:
# Score the fitted classifier on the held-out split. The score is not good.
clf.score(X_test,y_test)

# Playing around with what the features and samples are.
* This is not a good idea, but nonetheless offers insight on what not to do.