# Part 2: Machine Learning

In [1]:
import mne
from mne.decoding import Vectorizer

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

# Models
from sklearn import svm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
import mne.viz

import numpy as np

In [2]:
data_file = '1-P-cleaned'
epochs = mne.read_epochs(data_file + '.fif', verbose='error')


In [3]:
epochs.set_eeg_reference(['O1'])

0,1
Number of events,371
Events,FN: 99 FP: 104 FU: 94 NN: 30 NP: 27 NU: 17
Time range,0.000 – 1.496 sec
Baseline,off


In [4]:
epochs.set_eeg_reference('average')

0,1
Number of events,371
Events,FN: 99 FP: 104 FU: 94 NN: 30 NP: 27 NU: 17
Time range,0.000 – 1.496 sec
Baseline,off


In [5]:
#epochs_SN = epochs['Standard', 'Novel'] # Standard vs. Novel
epochs_UN = epochs['FU', 'FN'] # Unpleasant vs. Neutral
epochs_UP = epochs['FU', 'FP'] # Unpleasant vs. Pleasant
epochs_NP = epochs['FN', 'FP'] # Neutral vs. Pleasant

# Dataset with unpleasant and neutral events
data_UN = epochs_UN.get_data()
labels_UN = epochs_UN.events[:,-1]

In [6]:
train_data_UN, test_data_UN, labels_train_UN, labels_test_UN = train_test_split(data_UN, labels_UN, 
                                                                                test_size=0.3, 
                                                                                random_state=42)


In [7]:
clf_svm_0 = make_pipeline(Vectorizer(), StandardScaler(), svm.SVC(kernel='rbf', C=1))

In [8]:
scores = cross_val_score(clf_svm_0, train_data_UN, labels_train_UN, cv=5)
for i in range(len(scores)):   
    print('Accuracy of ' + str(i+1) + 'th fold is ' + str(scores[i]) + '\n')

Accuracy of 1th fold is 0.8148148148148148

Accuracy of 2th fold is 0.5925925925925926

Accuracy of 3th fold is 0.5555555555555556

Accuracy of 4th fold is 0.6666666666666666

Accuracy of 5th fold is 0.5185185185185185



In [9]:
#svm
clf_svm_pip = make_pipeline(Vectorizer(), StandardScaler(), svm.SVC(random_state=42))
parameters = {'svc__kernel':['linear', 'rbf', 'sigmoid'], 'svc__C':[0.1, 1, 10]}
gs_cv_svm = GridSearchCV(clf_svm_pip, parameters, scoring='accuracy', cv=StratifiedKFold(n_splits=5), 
                         return_train_score=True)

In [10]:
gs_cv_svm.fit(train_data_UN, labels_train_UN)
print('Best Parameters: {}'.format(gs_cv_svm.best_params_))
print('Best Score: {}'.format(gs_cv_svm.best_score_))

Best Parameters: {'svc__C': 0.1, 'svc__kernel': 'linear'}
Best Score: 0.6962962962962964


In [11]:
#Prediction
predictions_svm = gs_cv_svm.predict(test_data_UN)

#Evaluate
report_svm = classification_report(labels_test_UN, predictions_svm, target_names=['Standard', 'Novel'])
print('SVM Clasification Report:\n {}'.format(report_svm))

acc_svm = accuracy_score(labels_test_UN, predictions_svm)
print("Accuracy of SVM model: {}".format(acc_svm))

precision_svm,recall_svm,fscore_svm,support_svm=precision_recall_fscore_support(labels_test_UN,
                                                                                predictions_svm,
                                                                                average='macro', 
                                                                                zero_division=0)
print('Precision: {0}, Recall: {1}, f1-score:{2}'.format(precision_svm,recall_svm,fscore_svm))

SVM Clasification Report:
               precision    recall  f1-score   support

    Standard       0.71      0.86      0.77        28
       Novel       0.83      0.67      0.74        30

    accuracy                           0.76        58
   macro avg       0.77      0.76      0.76        58
weighted avg       0.77      0.76      0.76        58

Accuracy of SVM model: 0.7586206896551724
Precision: 0.7696078431372549, Recall: 0.7619047619047619, f1-score:0.7574671445639187


In [12]:
# Logistic Regression
clf_lr_pip = make_pipeline(Vectorizer(), StandardScaler(), LogisticRegression(random_state=42, 
                                                                              solver='liblinear'))
parameters = {'logisticregression__penalty':['l1', 'l2']}
gs_cv_lr = GridSearchCV(clf_lr_pip, parameters, scoring='accuracy')
gs_cv_lr.fit(train_data_UN, labels_train_UN)

print('Best Parameters: {}'.format(gs_cv_lr.best_params_))
print('Best Score: {}'.format(gs_cv_lr.best_score_))

#Predictions
predictions_lr = gs_cv_lr.predict(test_data_UN)

#Evaluation
report_lr = classification_report(labels_test_UN, predictions_lr, target_names=['Unpleasant', 'Neutral'])
print('LR Clasification Report:\n {}'.format(report_lr))

acc_lr = accuracy_score(labels_test_UN, predictions_lr)
print("Accuracy of LR model: {}".format(acc_lr))

precision_lr,recall_lr,fscore_lr,support_lr=precision_recall_fscore_support(labels_test_UN,predictions_lr,
                                                                            average='macro')
print('Precision: {0}, Recall: {1}, f1-score:{2}'.format(precision_lr,recall_lr,fscore_lr))

Best Parameters: {'logisticregression__penalty': 'l1'}
Best Score: 0.7333333333333332
LR Clasification Report:
               precision    recall  f1-score   support

  Unpleasant       0.75      0.86      0.80        28
     Neutral       0.85      0.73      0.79        30

    accuracy                           0.79        58
   macro avg       0.80      0.80      0.79        58
weighted avg       0.80      0.79      0.79        58

Accuracy of LR model: 0.7931034482758621
Precision: 0.7980769230769231, Recall: 0.7952380952380952, f1-score:0.7928571428571427


In [13]:
# Linear Discriminant Analysis
clf_lda_pip = make_pipeline(Vectorizer(), StandardScaler(), LinearDiscriminantAnalysis(solver='svd'))
clf_lda_pip.fit(train_data_UN,labels_train_UN)

#Predictions
predictions_lda = clf_lda_pip.predict(test_data_UN)

#Evaluation
report_lda = classification_report(labels_test_UN, predictions_lda, target_names=['Unpleasant', 'Neutral'])
print('LDA Clasification Report:\n {}'.format(report_lda))

acc_lda = accuracy_score(labels_test_UN, predictions_lda)
print("Accuracy of LDA model: {}".format(acc_lda))

precision_lda,recall_lda,fscore_lda,support_lda=precision_recall_fscore_support(labels_test_UN,
                                                                                predictions_lda,
                                                                                average='macro')
print('Precision: {0}, Recall: {1}, f1-score:{2}'.format(precision_lda,recall_lda,fscore_lda))

LDA Clasification Report:
               precision    recall  f1-score   support

  Unpleasant       0.67      0.64      0.65        28
     Neutral       0.68      0.70      0.69        30

    accuracy                           0.67        58
   macro avg       0.67      0.67      0.67        58
weighted avg       0.67      0.67      0.67        58

Accuracy of LDA model: 0.6724137931034483
Precision: 0.6720430107526881, Recall: 0.6714285714285715, f1-score:0.6715350223546944


In [14]:
accuracies.append([acc_svm, acc_lr, acc_lda])
f1_scores.append([fscore_svm, fscore_lr, fscore_lda])

NameError: name 'accuracies' is not defined

In [None]:
accuracies, f1_scores = [], []
accuracies.append([acc_svm, acc_lr, acc_lda])
f1_scores.append([fscore_svm, fscore_lr, fscore_lda])

In [None]:
# Dataset with unpleasant and pleasant events
data_UP = epochs_UP.get_data()
labels_UP = epochs_UP.events[:,-1]
train_data_UP, test_data_UP, labels_train_UP, labels_test_UP = train_test_split(data_UP, labels_UP, 
                                                                                test_size=0.3, 
                                                                                random_state=42)

In [None]:
# SVM
clf_svm_pip = make_pipeline(Vectorizer(), StandardScaler(), svm.SVC(random_state=42))
parameters = {'svc__kernel':['linear', 'rbf', 'sigmoid'], 'svc__C':[0.1, 1, 10]}
gs_cv_svm = GridSearchCV(clf_svm_pip, parameters, scoring='accuracy', cv=StratifiedKFold(n_splits=5), 
                         return_train_score=True)
gs_cv_svm.fit(train_data_UP, labels_train_UP)

print('Best Parameters: {}'.format(gs_cv_svm.best_params_))
print('Best Score: {}'.format(gs_cv_svm.best_score_))

# Make prediction
predictions_svm = gs_cv_svm.predict(test_data_UP)
#Evaluation
report_svm = classification_report(labels_test_UP, predictions_svm, target_names=['Unpleasant', 'Pleasant'])
print('SVM Clasification Report:\n {}'.format(report_svm))

acc_svm = accuracy_score(labels_test_UP, predictions_svm)
print("Accuracy of SVM model: {}".format(acc_svm))

precision_svm,recall_svm,fscore_svm,support_svm=precision_recall_fscore_support(labels_test_UP,
                                                                                predictions_svm,
                                                                                average='macro')
print('Precision: {0}, Recall: {1}, f1-score:{2}'.format(precision_svm,recall_svm,fscore_svm))

In [None]:
#Logistic Regression
clf_lr_pip = make_pipeline(Vectorizer(), StandardScaler(), LogisticRegression(random_state=42, 
                                                                              solver='liblinear'))
parameters = {'logisticregression__penalty':['l1', 'l2']}
gs_cv_lr = GridSearchCV(clf_lr_pip, parameters, scoring='accuracy')
gs_cv_lr.fit(train_data_UP, labels_train_UP)

print('Best Parameters: {}'.format(gs_cv_lr.best_params_))
print('Best Score: {}'.format(gs_cv_lr.best_score_))

# Prediction
predictions_lr = gs_cv_lr.predict(test_data_UP)

#Evaluation
report_lr = classification_report(labels_test_UP, predictions_lr, target_names=['Unpleasant', 'Pleasant'])
print('LR Clasification Report:\n {}'.format(report_lr))

acc_lr = accuracy_score(labels_test_UP, predictions_lr)
print("Accuracy of LR model: {}".format(acc_lr))

precision_lr,recall_lr,fscore_lr,support_lr=precision_recall_fscore_support(labels_test_UP,
                                                                            predictions_lr,average='macro')
print('Precision: {0}, Recall: {1}, f1-score:{2}'.format(precision_lr,recall_lr,fscore_lr))

In [None]:
#LDA
clf_lda_pip = make_pipeline(Vectorizer(), StandardScaler(), LinearDiscriminantAnalysis(solver='svd'))
clf_lda_pip.fit(train_data_UP,labels_train_UP)

#Prediction
predictions_lda = clf_lda_pip.predict(test_data_UP)

#Evaluation
report_lda = classification_report(labels_test_UP, predictions_lda, target_names=['Unpleasant', 'Plesant'])
print('LDA Clasification Report:\n {}'.format(report_lda))

acc_lda = accuracy_score(labels_test_UP, predictions_lda)
print("Accuracy of LDA model: {}".format(acc_lda))

precision_lda,recall_lda,fscore_lda,support_lda=precision_recall_fscore_support(labels_test_UP,
                                                                                predictions_lda,
                                                                                average='macro')
print('Precision: {0}, Recall: {1}, f1-score:{2}'.format(precision_lda,recall_lda,fscore_lda))

In [None]:
accuracies.append([acc_svm, acc_lr, acc_lda])
f1_scores.append([fscore_svm, fscore_lr, fscore_lda])

In [None]:
# Dataset with neutral and pleasant events
data_NP = epochs_NP.get_data()
labels_NP = epochs_NP.events[:,-1]
train_data_NP, test_data_NP, labels_train_NP, labels_test_NP = train_test_split(data_NP, labels_NP, 
                                                                                test_size=0.3, 
                                                                                random_state=42)

In [None]:
# SVM
clf_svm_pip = make_pipeline(Vectorizer(), StandardScaler(), svm.SVC(random_state=42))
parameters = {'svc__kernel':['linear', 'rbf', 'sigmoid'], 'svc__C':[0.1, 1, 10]}
gs_cv_svm = GridSearchCV(clf_svm_pip, parameters, scoring='accuracy', cv=StratifiedKFold(n_splits=5), 
                         return_train_score=True)
gs_cv_svm.fit(train_data_NP, labels_train_NP)

print('Best Parameters: {}'.format(gs_cv_svm.best_params_))
print('Best Score: {}'.format(gs_cv_svm.best_score_))

# Prediction
predictions_svm = gs_cv_svm.predict(test_data_NP)

#Evaluation
report_svm = classification_report(labels_test_NP, predictions_svm, target_names=['Neutral', 'Pleasant'])
print('SVM Clasification Report:\n {}'.format(report_svm))

acc_svm = accuracy_score(labels_test_NP, predictions_svm)
print("Accuracy of SVM model: {}".format(acc_svm))

precision_svm,recall_svm,fscore_svm,support_svm=precision_recall_fscore_support(labels_test_NP,
                                                                                predictions_svm,
                                                                                average='macro')
print('Precision: {0}, Recall: {1}, f1-score:{2}'.format(precision_svm,recall_svm,fscore_svm))

In [None]:
#Logistic Regression
clf_lr_pip = make_pipeline(Vectorizer(), StandardScaler(), LogisticRegression(random_state=42, 
                                                                              solver='liblinear'))
parameters = {'logisticregression__penalty':['l1', 'l2']}
gs_cv_lr = GridSearchCV(clf_lr_pip, parameters, scoring='accuracy')
gs_cv_lr.fit(train_data_NP, labels_train_NP)

print('Best Parameters: {}'.format(gs_cv_lr.best_params_))
print('Best Score: {}'.format(gs_cv_lr.best_score_))

# Prediction
predictions_lr = gs_cv_lr.predict(test_data_NP)

#Evaluation
report_lr = classification_report(labels_test_NP, predictions_lr, target_names=['Neutral', 'Pleasant'])
print('LR Clasification Report:\n {}'.format(report_lr))

acc_lr = accuracy_score(labels_test_NP, predictions_lr)
print("Accuracy of LR model: {}".format(acc_lr))

precision_lr,recall_lr,fscore_lr,support_lr=precision_recall_fscore_support(labels_test_NP,predictions_lr,
                                                                            average='macro')
print('Precision: {0}, Recall: {1}, f1-score:{2}'.format(precision_lr,recall_lr,fscore_lr))

In [None]:
clf_lda_pip = make_pipeline(Vectorizer(), StandardScaler(), LinearDiscriminantAnalysis(solver='svd'))
clf_lda_pip.fit(train_data_NP,labels_train_NP)

#Prediction
predictions_lda = clf_lda_pip.predict(test_data_NP)

#Evaluation
report_lda = classification_report(labels_test_NP, predictions_lda, target_names=['Neutral', 'Plesant'])
print('LDA Clasification Report:\n {}'.format(report_lda))

acc_lda = accuracy_score(labels_test_NP, predictions_lda)
print("Accuracy of LDA model: {}".format(acc_lda))

precision_lda,recall_lda,fscore_lda,support_lda=precision_recall_fscore_support(labels_test_NP,
                                                                                predictions_lda,
                                                                                average='macro')
print('Precision: {0}, Recall: {1}, f1-score:{2}'.format(precision_lda,recall_lda,fscore_lda))

In [None]:
accuracies.append([acc_svm, acc_lr, acc_lda])
f1_scores.append([fscore_svm, fscore_lr, fscore_lda])

In [None]:
def plotEvalMetrics(tasks, labels, evalMetric, metricName):
    width = 0.2  # the width of the bars

    # Set position of bar on X axis
    rects1 = np.arange(len(evalMetric[:][0]))
    rects2 = [x + width for x in rects1]
    rects3 = [x + width for x in rects2]

    plt.bar(rects1, list(zip(*evalMetric))[0], color='#87CEFA', width=width, edgecolor='white', label=labels[0])
    plt.bar(rects2, list(zip(*evalMetric))[1], color='#FFE4E1', width=width, edgecolor='white', label=labels[1])
    plt.bar(rects3, list(zip(*evalMetric))[2], color='#CD5C5C', width=width, edgecolor='white', label=labels[2])

    plt.xlabel('Classification Tasks')
    plt.xticks([r + width for r in range(len(evalMetric[:][0]))], tasks)
    plt.ylabel(metricName)

    plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left', )
    plt.show()

In [None]:
#Plot Accuracies
tasks = ['UN', 'UP', 'NP']
labels = ['SVM', 'LR', 'LDA']
plotEvalMetrics(tasks, labels, accuracies, 'Accuracy')
print(accuracies)

In [None]:
#Plot F1 Scores
tasks = ['UN', 'UP', 'NP']
labels = ['SVM', 'LR', 'LDA']
plotEvalMetrics(tasks, labels, f1_scores, 'F1-Scores')

Little projects:
 - decode identity
 - bias in the data (reference)

Get unfiltered data

Examples on two different dataset -> can choose which one, compare performance, 3 way classification for visual, pleasant vs. unpleasant vs. neutral / familiar vs. novel