In [None]:
%run ../preprocessing/Artifact_Removal/preprocecssing_helpers.ipynb
%run ../preprocessing/StimCodes.ipynb
%run ../Classification/ConcatEpochTrails.ipynb
%run ../PCA/Emmanuil-PCA.ipynb
%run ../preprocessing/Artifact_Removal/Extract_Describer_Events.ipynb
# %run ../preprocessing/frequency_bands.ipynb
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['figure.figsize'] = (20.0, 15.0)
matplotlib.rcParams.update({'font.size': 15})
import numpy as np

import mne 

from mne.decoding import Vectorizer
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.model_selection import ShuffleSplit

from mne.decoding import UnsupervisedSpatialFilter
from sklearn.decomposition import PCA

from sklearn.metrics import confusion_matrix
from mne.viz import tight_layout

import os
import time

# Exploring Word vs Non-word classification
* Non-word vs word classification becomes more intricate than just classifying audio vs visual.
    * We can say that there are word vs non-word classification tasks for both Auditory and Visual stimuli.

In [None]:
# Directory holding the saved epoch files.
# Raw string: '\s' is not a valid escape sequence and newer Python versions warn
# about it; the resulting path value is unchanged.
# NOTE(review): hard-coded absolute local path -- consider a configurable data dir.
path = r'E:\stim_code_epochs'
# os.listdir returns entries in arbitrary order; sort them so the positional
# indexing used further down (epoch_files[2], epoch_files[3]) is reproducible.
epoch_files = sorted(os.listdir(path))
epoch_files

## Visualization of Word vs Non-word EEG

In [None]:
# Load the file at position 2 of the directory listing as a preloaded Epochs object.
epoch = mne.read_epochs(os.path.join(path,epoch_files[2]), preload = True)

In [None]:
# Replace the event array with the one produced by
# convert_epoch_events_to_stim_combinations (not defined in the visible cells;
# presumably provided by one of the %run helper notebooks -- confirm).
new_events = convert_epoch_events_to_stim_combinations(epoch_object=epoch)
epoch.events = new_events

In [None]:
# Montage setup: when the 'Nasium' channel is present, drop it together with
# 'LL4', 'L12' and 'VEOG' (in place), then attach the electrode montage.
# NOTE(review): mne.channels.read_montage is not available in newer MNE
# releases -- confirm the MNE version this notebook targets.
if 'Nasium' in epoch.ch_names:
    epoch.drop_channels(ch_names=['Nasium', 'LL4', 'L12', 'VEOG']);
montage = mne.channels.read_montage(kind = 'ANT_DukeWaveGuard_128_electrode_montages_updated_V4')
epoch.set_montage(montage);


In [None]:
# Inspect the event array currently attached to the epochs.
epoch.events

In [None]:
# Inspect the event-id mapping (name not defined in the visible cells;
# presumably comes from one of the %run helper notebooks -- confirm).
modality_lexicality_event_ids

In [None]:
# Attach the modality/lexicality event-id mapping to the epochs object.
epoch.event_id = modality_lexicality_event_ids

In [None]:
def get_frequency_band(band, epoch_object):
    """
    Description:
        returns a band-pass filtered copy of an epoch object; the object
        passed in is left unmodified (filtering is applied to a copy).

    Variables:
        band : name of the frequency band, one of
            'Theta':(4,7),
            'Alpha':(8,12),
            'Beta':(13,25),
            'Gamma':(30,45)
            the two numbers are passed to ``filter`` as l_freq and h_freq.
        epoch_object: mne.epoch (anything exposing ``copy()`` and
            ``filter(l_freq=..., h_freq=...)``)
    -----
    returns: epoch object (the filtered copy)

    raises: KeyError if ``band`` is not one of the names listed above

    """
    iter_freqs = {
        'Theta':(4,7),
        'Alpha':(8,12),
        'Beta':(13,25),
        'Gamma':(30,45)
    }
    # Fail with a message that names the valid options instead of a bare KeyError.
    if band not in iter_freqs:
        raise KeyError("Unknown frequency band {0!r}; expected one of {1}".format(
            band, sorted(iter_freqs)))

    l_freq, h_freq = iter_freqs[band]
    # Work on a copy so the caller's epochs keep their original spectrum.
    copy_epoch = epoch_object.copy()
    copy_epoch.filter(l_freq=l_freq, h_freq=h_freq)

    return copy_epoch


In [None]:
# Gamma-band (30-45) filtered copy of the epochs; `epoch` itself is not filtered.
gamma_epoch = get_frequency_band('Gamma', epoch)

In [None]:
# Feature extraction via get_mean_band (not defined in the visible cells;
# presumably from a %run helper notebook -- confirm what it returns),
# followed by swapping axes 1 and 2 of the resulting array.
mean_stats = get_mean_band(gamma_epoch)
mean_stats = mean_stats.swapaxes(1,2)

In [None]:
# Collapse every event code into a binary lexicality label, regardless of
# modality: a code whose leading decimal digit is odd becomes 100 (lexical),
# any other code becomes 101 (non-lexical).
# NOTE(review): epoch.events is rewritten in place, so this cell is not
# idempotent -- on a second run 101 has leading digit 1 and turns into 100.
for row in epoch.events:
    leading_digit = int(str(row[-1])[0])
    row[-1] = 100 if leading_digit % 2 else 101

In [None]:
# classification pipeline: flatten each sample to a 1-D feature vector,
# standardise every feature, then fit an L1-penalised logistic regression.
start = time.time()
clf = make_pipeline(Vectorizer(),
                    StandardScaler(),
                    # penalty='l1' is only supported by the 'liblinear' and 'saga'
                    # solvers. Current scikit-learn defaults to 'lbfgs', which
                    # rejects it; 'liblinear' was the default in older releases,
                    # so naming it keeps the original behaviour there.
                    LogisticRegression(penalty='l1', solver='liblinear'))
end = time.time()
print("clf elapsed time: {0}".format(end - start))

start_master = time.time()
cv = StratifiedKFold(n_splits=5, shuffle = True, random_state = 42)

# Out-of-fold predictions: every sample is in exactly one test fold, so the
# uninitialised buffer is completely overwritten by the loop below.
labels = epoch.events[:,-1]
preds = np.empty(len(mean_stats))
for train, test in cv.split(mean_stats, labels):
    start = time.time()
    clf.fit(mean_stats[train], labels[train])
    preds[test] = clf.predict(mean_stats[test])
    end = time.time()
    print("kfold elapsed time: {0}".format(end - start))
end = time.time()
print("classification elapsed time: {0}".format(end - start_master))


start = time.time()
# target_names are matched to the labels in sorted order: 100 -> Lexical,
# 101 -> Non-Lexical.
target_names = ['Lexical', 'Non-Lexical']
report = classification_report(labels, preds, target_names=target_names)
print(report)
end = time.time()
print("reporting elapsed time: {0}".format(end - start))

In [None]:
# Confusion matrix of the out-of-fold predictions, normalised per row so each
# true-class row sums to 1.
cm3 = confusion_matrix(labels, preds)
cm3_normalized = cm3.astype(float) / cm3.sum(axis=1)[:, np.newaxis]

# Plot confusion matrix
plt.imshow(cm3_normalized, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Lexical vs. Non-Lexical', size = 20)
plt.colorbar()
tick_marks = np.arange(len(target_names))
plt.xticks(tick_marks, target_names, rotation=45, size = 20)
plt.yticks(tick_marks, target_names,size = 20)
plt.ylabel('True label',size = 20)
plt.xlabel('Predicted label',size = 20)
# Compute the layout only after the axis labels exist so they are accounted for.
tight_layout()
plt.show()

In [None]:
#  X_train, X_test, y_train, y_test = train_test_split(gamma_epoch.get_data())

In [None]:
# Channels kept for the "visual" selection: an explicit list of right-side
# names followed by their left-side counterparts.
# Fixed: the entry ';LE4' (stray semicolon) could never match a channel name,
# so LE4 -- the counterpart of RE4 -- was discarded as a "bad" channel.
# NOTE(review): 'L12' is listed here but is dropped earlier whenever 'Nasium'
# is present in the recording -- confirm that is intended.
visual_channels = ['RC7','RD7','RE4','R12','R13','RR12','RR13',
                   'LC7','LD7','LE4','L12','L13','LL12','LL13']
# Channels kept for the "audio" selection: any channel whose name contains one
# of the substrings LD, LC, LA, RD, RC or RA.
audio_channels = [x for x in epoch.ch_names if 'LD' in x or 'LC' in x or 'LA' in x
                  or 'RD' in x or 'RC' in x or 'RA' in x]

In [None]:
# Every channel that belongs to neither selection is removed from `epoch`
# (in place); only the visual/audio channels remain afterwards.
bad_channels = [
    name for name in epoch.ch_names
    if not (name in visual_channels or name in audio_channels)
]
epoch.drop_channels(ch_names=bad_channels)

In [None]:
# Check the data shape after the channel selection.
epoch.get_data().shape

In [None]:
# Recompute the gamma-band copy, now from the channel-reduced epochs.
gamma_epoch = get_frequency_band('Gamma', epoch)

In [None]:
# Rebuild the features from the channel-reduced gamma epochs (get_mean_band is
# defined outside the visible cells -- confirm its output), then swap axes 1 and 2.
mean_stats = get_mean_band(gamma_epoch)
mean_stats = mean_stats.swapaxes(1,2)

In [None]:
# classification pipeline (features from the reduced channel set): flatten each
# sample, standardise every feature, then fit an L1-penalised logistic regression.
start = time.time()
clf = make_pipeline(Vectorizer(),
                    StandardScaler(),
                    # penalty='l1' is only supported by the 'liblinear' and 'saga'
                    # solvers. Current scikit-learn defaults to 'lbfgs', which
                    # rejects it; 'liblinear' was the default in older releases,
                    # so naming it keeps the original behaviour there.
                    LogisticRegression(penalty='l1', solver='liblinear'))
end = time.time()
print("clf elapsed time: {0}".format(end - start))

start_master = time.time()
cv = StratifiedKFold(n_splits=5, shuffle = True, random_state = 42)

# Out-of-fold predictions: every sample is in exactly one test fold, so the
# uninitialised buffer is completely overwritten by the loop below.
labels = epoch.events[:,-1]
preds = np.empty(len(mean_stats))
for train, test in cv.split(mean_stats, labels):
    start = time.time()
    clf.fit(mean_stats[train], labels[train])
    preds[test] = clf.predict(mean_stats[test])
    end = time.time()
    print("kfold elapsed time: {0}".format(end - start))
end = time.time()
print("classification elapsed time: {0}".format(end - start_master))


start = time.time()
# target_names are matched to the labels in sorted order: 100 -> Lexical,
# 101 -> Non-Lexical.
target_names = ['Lexical', 'Non-Lexical']
report = classification_report(labels, preds, target_names=target_names)
print(report)
end = time.time()
print("reporting elapsed time: {0}".format(end - start))

In [None]:
# Lexical vs Non-Lexical needs more thought...

In [None]:
# Load two epoch files (positions 2 and 3 of the directory listing), preloaded.
epoch_1 = mne.read_epochs(os.path.join(path,epoch_files[2]), preload = True)
epoch_2 = mne.read_epochs(os.path.join(path,epoch_files[3]), preload = True)

In [None]:
# Merge both recordings into one Epochs object; this rebinds `epoch`, replacing
# the channel-reduced object used in the cells above.
epoch = mne.concatenate_epochs([epoch_1, epoch_2])

In [None]:
# Montage setup for the concatenated epochs: when the 'Nasium' channel is
# present, drop it together with 'LL4', 'L12' and 'VEOG' (in place), then
# attach the electrode montage.
# NOTE(review): mne.channels.read_montage is not available in newer MNE
# releases -- confirm the MNE version this notebook targets.
if 'Nasium' in epoch.ch_names:
    epoch.drop_channels(ch_names=['Nasium', 'LL4', 'L12', 'VEOG']);
montage = mne.channels.read_montage(kind = 'ANT_DukeWaveGuard_128_electrode_montages_updated_V4')
epoch.set_montage(montage);

In [None]:
# Split by audio and visual
## Visual - Lexical vs Non-Lexical

# The new event array is computed from the full concatenated object, the last
# epoch is then dropped, and the array is assigned to the shortened object.
# NOTE(review): presumably the helper returns one row fewer than the number of
# epochs, which is why the last epoch is removed first -- confirm.
new_events = convert_epoch_events_to_stim_combinations(epoch)
epoch = epoch[:-1]
epoch.events = new_events

In [None]:
# Attach the modality/lexicality event-id mapping to the concatenated epochs.
epoch.event_id = modality_lexicality_event_ids

In [None]:
# Inspect the event-id mapping that was just attached.
modality_lexicality_event_ids

In [None]:
# Separate the labels into 4 different types, written back into epoch.events:
#   100  audio  - lexical      (code < 700, odd leading digit)
#   101  audio  - non-lexical  (code < 700, even leading digit)
#   200  visual - lexical      (code > 700, odd leading digit)
#   201  visual - non-lexical  (everything else)
# NOTE(review): a code of exactly 700 falls into the last bucket even though
# its leading digit is odd -- confirm that 700 never occurs.
# NOTE(review): the rewrite is in place, so re-running this cell relabels the
# already converted codes (e.g. 101 -> 100, 200 -> 101).
for row in epoch.events:
    code = row[-1]
    odd_leading_digit = int(str(code)[0]) % 2 != 0
    if code < 700:
        row[-1] = 100 if odd_leading_digit else 101
    elif code > 700 and odd_leading_digit:
        row[-1] = 200
    else:
        row[-1] = 201

In [None]:
# Gamma-band (30-45) filtered copy of the epochs.
# NOTE(review): epoch[:-1] removes one more trailing epoch on top of the one
# already dropped when the events were replaced -- confirm this is intended.
gamma_epoch = get_frequency_band('Gamma', epoch[:-1])

In [None]:
# Check the shape of the gamma-filtered data.
gamma_epoch.get_data().shape

In [None]:
# Features for the full gamma-filtered set (get_mean_band is defined outside the
# visible cells -- confirm its output), then swap axes 1 and 2.
# `mean_stats` is reassigned from the audio-only data before it is next used.
mean_stats = get_mean_band(gamma_epoch)
mean_stats = mean_stats.swapaxes(1,2)

In [None]:
# Split the gamma-filtered epochs into the four modality/lexicality groups,
# selecting on the relabelled code in the last column of the events array.
_codes = gamma_epoch.events[:, -1]
audio_lexical = gamma_epoch[_codes == 100]
audio_non_lexical = gamma_epoch[_codes == 101]
visual_lexical = gamma_epoch[_codes == 200]
visual_non_lexical = gamma_epoch[_codes == 201]

In [None]:
# Report the data shape of each group, in the order audio lexical,
# audio non-lexical, visual lexical, visual non-lexical.
for group in (audio_lexical, audio_non_lexical, visual_lexical, visual_non_lexical):
    print(group.get_data().shape)

In [None]:
# Combine audio-lexical and audio-non-lexical epochs into one class-balanced
# data set. The per-class size is derived from the data (size of the smaller
# group) instead of the hard-coded count 354, which only matched one specific
# recording and would silently unbalance the classes for any other input.
n_audio_per_class = min(audio_lexical.events.shape[0],
                        audio_non_lexical.events.shape[0])
audio_lexical_data = audio_lexical.get_data()[:n_audio_per_class]
audio_non_lexical_data = audio_non_lexical.get_data()[:n_audio_per_class]
audio_lexicality_data = np.concatenate((audio_lexical_data, audio_non_lexical_data))
# Labels in the same order as the data: lexical (100) first, then non-lexical (101).
audio_lexicality_labels = np.concatenate((audio_lexical.events[:n_audio_per_class, -1],
                                          audio_non_lexical.events[:n_audio_per_class, -1]))

In [None]:
# Features for the balanced audio set, then swap axes 1 and 2.
# NOTE(review): get_mean_band receives a plain array here, whereas the earlier
# calls passed an Epochs object -- confirm the helper accepts both.
mean_stats = get_mean_band(audio_lexicality_data)
mean_stats = mean_stats.swapaxes(1,2)

In [None]:
# Check the shape of the audio feature array.
mean_stats.shape

In [None]:
# Check the shape of the balanced audio data it was derived from.
audio_lexicality_data.shape

In [None]:
# classification pipeline (audio lexical vs. audio non-lexical): flatten each
# sample, standardise every feature, then fit an L1-penalised logistic regression.
start = time.time()
clf = make_pipeline(Vectorizer(),
                    StandardScaler(),
                    # penalty='l1' is only supported by the 'liblinear' and 'saga'
                    # solvers. Current scikit-learn defaults to 'lbfgs', which
                    # rejects it; 'liblinear' was the default in older releases,
                    # so naming it keeps the original behaviour there.
                    LogisticRegression(penalty='l1', solver='liblinear'))
end = time.time()
print("clf elapsed time: {0}".format(end - start))

start_master = time.time()
cv = StratifiedKFold(n_splits=5, shuffle = True, random_state = 42)

# Out-of-fold predictions: every sample is in exactly one test fold, so the
# uninitialised buffer is completely overwritten by the loop below.
labels = audio_lexicality_labels
preds = np.empty(len(mean_stats))
for train, test in cv.split(mean_stats, labels):
    start = time.time()
    clf.fit(mean_stats[train], labels[train])
    preds[test] = clf.predict(mean_stats[test])
    end = time.time()
    print("kfold elapsed time: {0}".format(end - start))
end = time.time()
print("classification elapsed time: {0}".format(end - start_master))


start = time.time()
# target_names are matched to the labels in sorted order: 100 -> Audio Lexical,
# 101 -> Audio Non-Lexical.
target_names = ['Audio Lexical', 'Audio Non-Lexical']
report = classification_report(labels, preds, target_names=target_names)
print(report)
end = time.time()
print("reporting elapsed time: {0}".format(end - start))

In [None]:
# Confusion matrix of the out-of-fold audio predictions, normalised per row so
# each true-class row sums to 1.
cm3 = confusion_matrix(labels, preds)
cm3_normalized = cm3.astype(float) / cm3.sum(axis=1)[:, np.newaxis]

# Plot confusion matrix
plt.imshow(cm3_normalized, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Audio Lexical vs. Audio Non-Lexical', size = 30)
plt.colorbar()
tick_marks = np.arange(len(target_names))
plt.xticks(tick_marks, target_names, rotation=45, size = 30)
plt.yticks(tick_marks, target_names,size = 30)
plt.ylabel('True label',size = 30)
plt.xlabel('Predicted label',size = 30)
# Compute the layout only after the axis labels exist so they are accounted for.
tight_layout()
plt.show()

In [None]:
# Re-check the shape of the full gamma-filtered data before the visual split.
gamma_epoch.get_data().shape

In [None]:
# Combine visual-lexical and visual-non-lexical epochs into one class-balanced
# data set. The per-class size is derived from the data (size of the smaller
# group) instead of the hard-coded count 300, which only matched one specific
# recording and would silently unbalance the classes for any other input.
n_visual_per_class = min(visual_lexical.events.shape[0],
                         visual_non_lexical.events.shape[0])
visual_lexical_data = visual_lexical.get_data()[:n_visual_per_class]
visual_non_lexical_data = visual_non_lexical.get_data()[:n_visual_per_class]
visual_lexicality_data = np.concatenate((visual_lexical_data, visual_non_lexical_data))
# Labels in the same order as the data: lexical (200) first, then non-lexical (201).
visual_lexicality_labels = np.concatenate((visual_lexical.events[:n_visual_per_class, -1],
                                           visual_non_lexical.events[:n_visual_per_class, -1]))

In [None]:
# Check the shape of the balanced visual data.
visual_lexicality_data.shape

In [None]:
# Features for the balanced visual set, then swap axes 1 and 2.
# Fixed: the get_mean_band call had been lost, leaving only the parenthesised
# raw data, so the "mean stats" were the unprocessed epochs and the visual
# branch was not comparable with the audio branch, which does call the helper.
visual_lexicality_mean_stats = get_mean_band(visual_lexicality_data)
visual_lexicality_mean_stats = visual_lexicality_mean_stats.swapaxes(1,2)

In [None]:
# classification pipeline (visual lexical vs. visual non-lexical): flatten each
# sample, standardise every feature, then fit an L1-penalised logistic regression.
start = time.time()
clf = make_pipeline(Vectorizer(),
                    StandardScaler(),
                    # penalty='l1' is only supported by the 'liblinear' and 'saga'
                    # solvers. Current scikit-learn defaults to 'lbfgs', which
                    # rejects it; 'liblinear' was the default in older releases,
                    # so naming it keeps the original behaviour there.
                    LogisticRegression(penalty='l1', solver='liblinear'))
end = time.time()
print("clf elapsed time: {0}".format(end - start))

start_master = time.time()
cv = StratifiedKFold(n_splits=5, shuffle = True, random_state = 42)

# Out-of-fold predictions: every sample is in exactly one test fold, so the
# uninitialised buffer is completely overwritten by the loop below.
labels = visual_lexicality_labels
preds = np.empty(len(visual_lexicality_mean_stats))
for train, test in cv.split(visual_lexicality_mean_stats, labels):
    start = time.time()
    clf.fit(visual_lexicality_mean_stats[train], labels[train])
    preds[test] = clf.predict(visual_lexicality_mean_stats[test])
    end = time.time()
    print("kfold elapsed time: {0}".format(end - start))
end = time.time()
print("classification elapsed time: {0}".format(end - start_master))


start = time.time()
# target_names are matched to the labels in sorted order: 200 -> Visual Lexical,
# 201 -> Visual Non-Lexical.
target_names = ['Visual Lexical', 'Visual Non-Lexical']
report = classification_report(labels, preds, target_names=target_names)
print(report)
end = time.time()
print("reporting elapsed time: {0}".format(end - start))

In [None]:
# Confusion matrix of the out-of-fold visual predictions, normalised per row so
# each true-class row sums to 1.
cm3 = confusion_matrix(labels, preds)
cm3_normalized = cm3.astype(float) / cm3.sum(axis=1)[:, np.newaxis]

# Plot confusion matrix
plt.imshow(cm3_normalized, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Visual Lexical vs. Visual Non-Lexical', size = 30)
plt.colorbar()
tick_marks = np.arange(len(target_names))
plt.xticks(tick_marks, target_names, rotation=45, size = 30)
plt.yticks(tick_marks, target_names,size = 30)
plt.ylabel('True label',size = 30)
plt.xlabel('Predicted label',size = 30)
# Compute the layout only after the axis labels exist so they are accounted for.
tight_layout()
plt.show()

In [None]:
# Check the shape of the visual feature array.
# Fixed: the name was misspelled ("visual_lexality_mean_stats"), which raised a
# NameError because no such variable is ever defined.
visual_lexicality_mean_stats.shape