In [2]:
import numpy as np
import pandas as pd
import multiprocessing as mp
import matplotlib.pyplot as plt
import os
import mne
from wordfreq import word_frequency
from wordfreq import zipf_frequency
import re

from reader import EXTRACTED_DATA_PATH
from reader import EEG_CHANNEL_COUNT
from reader import MISSING_DATA_SYMBOL
from reader import EEG_FEATURES
from reader import ET_FEATURES

from reader import get_subjects_list

from eeg_plotter import MISSING_CHANNELS
from eeg_plotter import LANGUAGE_CHANNELS
from eeg_plotter import TRT_RANGES

from eeg_plotter import device_channel_to_data_channel
from eeg_plotter import channel_fill
from eeg_plotter import get_evoked_for_eeg_data
from eeg_plotter import get_raw_word_eeg
from eeg_plotter import get_raw_word_eeg_mean


def normalize_word(word):
    """Strip punctuation from both ends of ``word`` and lower-case it.

    Leading and trailing runs of characters that are neither letters nor
    digits (underscores count as punctuation here) are removed; characters
    inside the word, such as apostrophes or hyphens, are kept.

    The character class is Unicode-aware so that accented letters at the
    edges of a word (e.g. the final letter of "Café") are not mistaken for
    punctuation. For pure-ASCII input the result is the same as with an
    ASCII-only ``[^a-zA-Z0-9]`` class.

    Parameters:
        word: the raw token as a ``str``.

    Returns:
        The trimmed, lower-cased token; an empty string if the token
        contains no letters or digits at all.
    """
    # [\W_] == "not a letter and not a digit": \W leaves out the underscore,
    # so it is added back explicitly.
    return re.sub(r'^[\W_]+|[\W_]+$', '', word).lower()

In [8]:
# Build a list of (zipf_frequency, word) pairs for the nouns, most frequent first.
# Only the rows of the first subject are used to collect the words.
# Please also choose a task:
task = 'NR'
df = pd.read_csv(f'{EXTRACTED_DATA_PATH}extracted_data_{task}/data.tsv', sep='\t', keep_default_na=False)
is_first_subject = df['subject'] == get_subjects_list()[0]
is_noun = df['part_of_speech'].str.startswith('NN')  # part-of-speech tags beginning with 'NN'
words = pd.unique(df.loc[is_first_subject & is_noun, 'content'])
# the words are unique, so the tuples never tie and a descending sort is unambiguous
ordered_list = sorted(((zipf_frequency(w, 'en'), w) for w in words), reverse=True)
ordered_list

[(7.73, '(the'),
 (7.41, 'and,'),
 (7.36, 'A'),
 (7.27, 'in,'),
 (7.27, '(in'),
 (6.82, 'this.'),
 (6.57, 'My'),
 (6.47, 'one,'),
 (6.47, 'one'),
 (6.47, '(one'),
 (6.33, '"It\'s'),
 (6.3, 'her.'),
 (6.29, 'time.'),
 (6.29, 'time,'),
 (6.29, 'time'),
 (6.26, 'she,'),
 (6.25, 'people,'),
 (6.25, 'New'),
 (6.18, '(now'),
 (6.12, 'only,'),
 (6.12, 'Good'),
 (6.11, 'him.'),
 (6.11, 'First'),
 (6.04, 'back.'),
 (6.0, '(most'),
 (5.98, 'May'),
 (5.98, '(May'),
 (5.96, 'years;'),
 (5.96, 'years.'),
 (5.96, 'years)'),
 (5.96, 'years'),
 (5.96, 'year.'),
 (5.96, 'year,'),
 (5.96, 'year'),
 (5.96, 'work,'),
 (5.96, 'work'),
 (5.89, 'world.'),
 (5.89, 'world'),
 (5.89, 'life.'),
 (5.89, 'life,'),
 (5.89, 'World'),
 (5.89, 'Life.'),
 (5.88, 'Great'),
 (5.86, 'S.'),
 (5.86, "'S'"),
 (5.82, 'man'),
 (5.82, 'Man.'),
 (5.82, 'Man'),
 (5.82, 'Love.'),
 (5.81, 'home,'),
 (5.81, 'home'),
 (5.81, 'Long'),
 (5.78, 'part'),
 (5.78, 'State'),
 (5.78, '("three'),
 (5.75, 'help'),
 (5.72, 'thing'),
 (5.72, 'ga

In [9]:
# show the previous list in the opposite order (lowest zipf frequency first)
ordered_list[::-1]

[(0.0, 'Adalet'),
 (0.0, 'Ballyporeen,'),
 (0.0, 'Brandenburg-Kulmbach,'),
 (0.0, 'Bôcher'),
 (0.0, "Courant's"),
 (0.0, 'Crescentia'),
 (0.0, "Edsel's"),
 (0.0, 'Kalkinma'),
 (0.0, 'Kerouacs'),
 (0.0, 'Litogot'),
 (0.0, 'Ludecke,'),
 (0.0, 'Lévesques'),
 (0.0, 'Necmettin'),
 (0.0, "Onassis's"),
 (0.0, 'Overisel,'),
 (0.0, 'Petschek'),
 (0.0, 'Roncalio,'),
 (0.0, 'Selâmet'),
 (0.0, 'Springport,'),
 (0.0, "Struensee's"),
 (0.0, 'demeanours.'),
 (0.0, "greasin's"),
 (0.0, 'statcoulomb.'),
 (0.0, '–'),
 (1.01, 'Springwells'),
 (1.01, 'colleages'),
 (1.02, '1760s.'),
 (1.06, "Lawford's"),
 (1.08, 'Abiah'),
 (1.08, 'Desireless,'),
 (1.1, 'Codina,'),
 (1.11, 'Partisi'),
 (1.11, 'Partisi),'),
 (1.13, 'Garnica'),
 (1.17, "Nesbit's"),
 (1.18, 'DePaolo'),
 (1.18, 'Erbakan.'),
 (1.18, 'Williard'),
 (1.2, 'cornetist.'),
 (1.21, 'Quadricycle'),
 (1.21, 'Quadricycle,'),
 (1.27, 'Heslov,'),
 (1.3, 'Cockleshell'),
 (1.3, 'fallaciously)'),
 (1.32, 'Quesnay'),
 (1.37, 'SuperDraft,'),
 (1.38, 'Aquidneck'

In [16]:
# Plot the given EEG channels for the given words.
# Shorter appearances are padded with their own per-channel mean to match the longest appearance;
# finally all appearances are averaged and the selected channels are plotted and saved in the
# EXTRACTED_DATA_PATH/extracted_data_<task>/statistics folder (one PNG per word).
# Words without any usable EEG recording are reported and skipped instead of crashing the cell.
task = 'NR'
words = ['time', 'people', 'year', 'demeanours', 'statcoulomb', 'cornetist']
plot_channels = [51] # in device indices (a.k.a. 1-128)
df = pd.read_csv(f'{EXTRACTED_DATA_PATH}extracted_data_{task}/data.tsv', sep='\t', keep_default_na=False)

# normalize every token once instead of once per row per word
normalized_content = df['content'].apply(normalize_word)
is_selected = normalized_content.isin(words)
df = df.loc[is_selected, ['content', 'subject', 'sentence_id', 'word_idx']]
normalized_content = normalized_content[is_selected]

# create the output folder once, up front; no error if it already exists
statistics_dir = f'{EXTRACTED_DATA_PATH}extracted_data_{task}/statistics'
os.makedirs(statistics_dir, exist_ok=True)

for word in words:
    word_df = df[normalized_content == word]

    # collect the raw EEG of every appearance; get_raw_word_eeg may return None
    eeg_data = []
    for appearance in word_df.itertuples(index=False):
        segment = get_raw_word_eeg( (task, appearance.subject, appearance.sentence_id, appearance.word_idx) )
        # a zero-length segment has no mean to pad with and would turn the average into NaN
        if segment is None or segment.shape[0] == 0:
            continue
        eeg_data.append(segment)

    if not eeg_data:
        # averaging an empty list yields a 0-d NaN and the plotting code below would raise
        print(f'No EEG data found for word "{word}", skipping.')
        continue

    # pad every appearance (rows = samples, columns = channels) to the longest one
    maxlen = max(segment.shape[0] for segment in eeg_data)
    for i, segment in enumerate(eeg_data):
        padded = np.empty((maxlen, segment.shape[1]), dtype=np.float64)
        padded[:segment.shape[0], :] = segment
        padded[segment.shape[0]:, :] = np.mean(segment, axis=0)
        eeg_data[i] = padded
    word_data = np.mean( np.array(eeg_data), axis=0 )

    fig, ax = plt.subplots(figsize=(10, 6))
    for channel in plot_channels:
        # NOTE: the x axis is the sample index; the 'Time (ms)' label is only exact
        # if one sample corresponds to one millisecond. #TODO freq
        ax.plot(np.arange(word_data.shape[0]), word_data[:, device_channel_to_data_channel(channel)], label=f'E{channel}')
    ax.set_xlabel('Time (ms)')
    ax.set_ylabel('Amplitude (µV)')
    ax.set_title(f'EEG Data for word "{word}" (zipf_frequency={zipf_frequency(word, "en")})')
    ax.axvline(0, color='black', linestyle='--')  # Add vertical line at 0s (stimulus onset)
    ax.legend(loc='upper right')
    fig.savefig(f'{statistics_dir}/{word}_plot.png', format='png', dpi=1200)
    plt.close(fig)  # release the figure so a long word list does not pile up open figures
    print(f'Finished plotting word "{word}".')
print('Done')

Finished plotting word "time".
Finished plotting word "people".
Finished plotting word "year".
Finished plotting word "demeanours".
Finished plotting word "statcoulomb".
Finished plotting word "cornetist".
Done


In [None]:
# Same averaging as the per-word plots (shorter appearances are padded with their own
# per-channel mean, then all appearances are averaged), except that the words are plotted
# against each other in ONE figure using a single channel.
# The figure is saved as EXTRACTED_DATA_PATH/extracted_data_<task>/statistics/word_comparison_E<channel>_plot.png,
# so the per-word PNG files are not overwritten.
task = 'NR'
words = ['time', 'people', 'year', 'demeanours', 'statcoulomb', 'cornetist']
plot_channels = [51] # in device indices (a.k.a. 1-128)
channel = plot_channels[0]  # the comparison uses a single channel: the first selected one
df = pd.read_csv(f'{EXTRACTED_DATA_PATH}extracted_data_{task}/data.tsv', sep='\t', keep_default_na=False)

# normalize every token once instead of once per row per word
normalized_content = df['content'].apply(normalize_word)
is_selected = normalized_content.isin(words)
df = df.loc[is_selected, ['content', 'subject', 'sentence_id', 'word_idx']]
normalized_content = normalized_content[is_selected]

fig, ax = plt.subplots(figsize=(10, 6))
plotted_words = 0
for word in words:
    word_df = df[normalized_content == word]

    # collect the raw EEG of every appearance; get_raw_word_eeg may return None
    eeg_data = []
    for appearance in word_df.itertuples(index=False):
        segment = get_raw_word_eeg( (task, appearance.subject, appearance.sentence_id, appearance.word_idx) )
        # a zero-length segment has no mean to pad with and would turn the average into NaN
        if segment is None or segment.shape[0] == 0:
            continue
        eeg_data.append(segment)

    if not eeg_data:
        # nothing to average for this word; leave it out of the comparison
        print(f'No EEG data found for word "{word}", skipping.')
        continue

    # pad every appearance (rows = samples, columns = channels) to the longest one
    maxlen = max(segment.shape[0] for segment in eeg_data)
    for i, segment in enumerate(eeg_data):
        padded = np.empty((maxlen, segment.shape[1]), dtype=np.float64)
        padded[:segment.shape[0], :] = segment
        padded[segment.shape[0]:, :] = np.mean(segment, axis=0)
        eeg_data[i] = padded
    word_data = np.mean( np.array(eeg_data), axis=0 )

    # one line per word; words differ in length, so the lines end at different x positions
    # NOTE: the x axis is the sample index; the 'Time (ms)' label is only exact
    # if one sample corresponds to one millisecond. #TODO freq
    ax.plot(np.arange(word_data.shape[0]), word_data[:, device_channel_to_data_channel(channel)],
            label=f'{word} (zipf_frequency={zipf_frequency(word, "en")})')
    plotted_words += 1
    print(f'Finished plotting word "{word}".')

ax.set_xlabel('Time (ms)')
ax.set_ylabel('Amplitude (µV)')
ax.set_title(f'EEG Data for channel E{channel}')
ax.axvline(0, color='black', linestyle='--')  # Add vertical line at 0s (stimulus onset)
if plotted_words:
    ax.legend(loc='upper right')
    statistics_dir = f'{EXTRACTED_DATA_PATH}extracted_data_{task}/statistics'
    os.makedirs(statistics_dir, exist_ok=True)  # no error if the folder already exists
    fig.savefig(f'{statistics_dir}/word_comparison_E{channel}_plot.png', format='png', dpi=1200)
else:
    print('No EEG data found for any of the words, nothing was saved.')
plt.close(fig)
print('Done')