In [1]:
import numpy as np
import pandas as pd
import multiprocessing as mp
import matplotlib.pyplot as plt
import os
import mne
from wordfreq import word_frequency
from wordfreq import zipf_frequency
import re

from reader import EXTRACTED_DATA_PATH
from reader import EEG_CHANNEL_COUNT
from reader import MISSING_DATA_SYMBOL
from reader import EEG_FEATURES
from reader import ET_FEATURES

from reader import get_subjects_list

from eeg_plotter import MISSING_CHANNELS
from eeg_plotter import LANGUAGE_CHANNELS
from eeg_plotter import TRT_RANGES

from eeg_plotter import device_channel_to_data_channel
from eeg_plotter import channel_fill
from eeg_plotter import get_evoked_for_eeg_data
from eeg_plotter import get_raw_word_eeg
from eeg_plotter import get_raw_word_eeg_mean


def normalize_word(word):
    '''
    Strip leading and trailing non-alphanumeric characters from `word` and lower-case it.

    Characters inside the word are kept (e.g. the apostrophe in "don't" or the
    hyphen in "1826-1905"). Letters and digits of any script count as
    alphanumeric, so an accented token such as "Café." becomes "café" instead
    of being truncated to "caf".

    Returns the normalized string; it is empty if `word` holds no letter or digit.
    '''
    # \w is Unicode-aware for str patterns but also matches "_", hence [\W_]
    # to keep treating underscores as strippable punctuation.
    return re.sub(r'^[\W_]+|[\W_]+$', '', word).lower()

In [2]:
# Build the list of nouns for one task, most frequent first.
# Please also choose a task:
task = 'NR'
pos_tags_src = 'nltk'
df = pd.read_csv(
    f'{EXTRACTED_DATA_PATH}extracted_data_{task}/pos_tags/{pos_tags_src}.tsv',
    sep='\t',
    keep_default_na=False,
)
# Keep the words tagged NN / NNS, normalize them and drop duplicates
nn_df = pd.unique(
    df.loc[df['part_of_speech'].isin(['NN', 'NNS']), 'content'].apply(normalize_word)
)
# (zipf frequency, word) pairs in descending order
ordered_nouns = sorted(((zipf_frequency(w, 'en'), w) for w in nn_df), reverse=True)
ordered_nouns

[(7.73, 'the'),
 (7.41, 'and'),
 (7.27, 'in'),
 (6.82, 'this'),
 (6.47, 'one'),
 (6.3, 'her'),
 (6.29, 'time'),
 (6.26, 'she'),
 (6.25, 'people'),
 (6.12, 'only'),
 (6.11, 'him'),
 (6.04, 'back'),
 (5.98, 'may'),
 (5.96, 'years'),
 (5.96, 'year'),
 (5.96, 'work'),
 (5.89, 'world'),
 (5.89, 'life'),
 (5.82, 'man'),
 (5.81, 'home'),
 (5.78, 'part'),
 (5.75, 'help'),
 (5.72, 'thing'),
 (5.72, 'game'),
 (5.71, 'school'),
 (5.71, 'place'),
 (5.7, 'again'),
 (5.68, 'show'),
 (5.68, 'end'),
 (5.67, 'team'),
 (5.66, 'family'),
 (5.64, 'money'),
 (5.63, 'second'),
 (5.62, 'number'),
 (5.61, 'name'),
 (5.61, 'days'),
 (5.61, 'city'),
 (5.6, 'company'),
 (5.57, 'group'),
 (5.56, 'times'),
 (5.56, 'start'),
 (5.56, 'business'),
 (5.55, 'person'),
 (5.55, 'anything'),
 (5.54, 'point'),
 (5.54, 'change'),
 (5.52, 'states'),
 (5.52, 'power'),
 (5.52, 'music'),
 (5.52, 'including'),
 (5.51, 'men'),
 (5.51, 'head'),
 (5.5, 'side'),
 (5.5, 'job'),
 (5.49, 'service'),
 (5.49, 'later'),
 (5.48, 'season'),

In [3]:
# Show the same list the other way round (least frequent nouns first)
ordered_nouns[::-1]

[(0.0, 'demeanours'),
 (0.0, "greasin's"),
 (0.0, 'statcoulomb'),
 (1.01, 'colleages'),
 (1.02, '1760s'),
 (1.2, 'cornetist'),
 (1.21, 'quadricycle'),
 (1.3, 'fallaciously'),
 (1.62, 'confidantes'),
 (1.94, 'womaniser'),
 (2.03, 'busboy'),
 (2.06, 'quebecers'),
 (2.12, 'scoutmaster'),
 (2.36, 'reissues'),
 (2.45, 'astigmatism'),
 (2.45, 'botulism'),
 (2.47, 'brevet'),
 (2.49, 'bandleader'),
 (2.52, 'newscaster'),
 (2.52, 'snobby'),
 (2.7, 'trumpeter'),
 (2.72, 'stupor'),
 (2.73, 'boarder'),
 (2.75, 'machinist'),
 (2.76, 'appendicitis'),
 (2.77, 'sounders'),
 (2.8, 'crumbles'),
 (2.83, 'snarky'),
 (2.83, 'synapses'),
 (2.85, 'adequacy'),
 (2.85, 'handyman'),
 (2.88, 'bookkeeper'),
 (2.88, 'shopkeeper'),
 (2.89, 'beehive'),
 (2.93, 'mistresses'),
 (2.94, 'catchphrase'),
 (2.98, 'schoolteacher'),
 (3.0, 'heretic'),
 (3.0, 'semesters'),
 (3.0, 'yank'),
 (3.03, 'ticker'),
 (3.05, 'cremated'),
 (3.05, 'diffraction'),
 (3.06, '1826-1905'),
 (3.07, 'horseman'),
 (3.09, 'pane'),
 (3.1, 'pseudon

In [17]:
# Number of (zipf frequency, word) entries currently held in ordered_nouns
len(ordered_nouns)

703

In [6]:
# Plot one EEG channel for a selection of high- and low-frequency nouns.
# Appearances shorter than the longest one are padded with their own mean,
# then all appearances of a word are averaged. The selected channel is plotted
# and each figure is saved in the
# {EXTRACTED_DATA_PATH}extracted_data_{task}/wordfreq_statistics folder.
task = 'NR'
pos_tags_src = 'nltk'
plot_channel = 51  # in device indices (a.k.a. 1-128)
plot_count = 10    # maximum number of figures to produce

# One colour per word in a figure: warm tones for high-frequency words ...
highfreq_colors = [
    (1, 0, 0),     # red
    (1, 0.33, 0),  # orange
    (1, 1, 0),     # yellow
]
# ... and cool tones for low-frequency words
lowfreq_colors = [
    (0, 0, 0.75),     # blue
    (0, 0.75, 0),     # green
    (0.75, 0, 0.75),  # purple
]

def helper_get_word_eeg(word_df):
    '''
    Average the raw EEG of every recorded appearance of a single word.

    Takes in a dataframe containing the 'sentence_id' and 'word_idx' columns
    (the columns are looked up by name, so their order and any extra columns
    such as 'content' do not matter). Each row is one appearance of the word,
    perhaps across different sentences. For every row and every subject from
    get_subjects_list() the recording is read with get_raw_word_eeg, using the
    module-level `task`; combinations for which it returns None are skipped.

    Shorter recordings are padded along the first axis with their own
    per-column mean to match the longest one, and the element-wise mean over
    all recordings is returned as a float64 array of shape
    (longest recording length, number of columns of a recording).

    Raises ValueError if no recording is available at all, instead of
    returning a NaN scalar that fails later at the call site.
    '''
    # list() keeps the subjects reusable across rows even if an iterator is returned
    subjects = list(get_subjects_list())

    recordings = []
    for appearence in word_df.itertuples(index=False):
        for subject in subjects:
            recording = get_raw_word_eeg(
                (task, subject, appearence.sentence_id, appearence.word_idx)
            )
            if recording is not None:
                recordings.append(recording)

    if not recordings:
        raise ValueError(
            f'No EEG data found for any of the {len(word_df)} requested word appearance(s)'
        )

    maxlen = max(recording.shape[0] for recording in recordings)
    padded = np.empty((len(recordings), maxlen, recordings[0].shape[1]), dtype=np.float64)
    for i, recording in enumerate(recordings):
        length = recording.shape[0]
        padded[i, :length, :] = recording
        # fill the tail with this recording's own mean so padding does not shift its average
        padded[i, length:, :] = np.mean(recording, axis=0)
    return padded.mean(axis=0)

# Load the POS-tagged words of the task and keep only the nouns (NN / NNS)
df = pd.read_csv(f'{EXTRACTED_DATA_PATH}extracted_data_{task}/pos_tags/{pos_tags_src}.tsv', sep='\t', keep_default_na=False)
df = df[ df['part_of_speech'].isin(['NN', 'NNS']) ] [['content', 'sentence_id', 'word_idx']]

# Normalize every noun once; the same Series drives the per-word row lookup
# below instead of re-normalizing the whole column for each plotted word.
normalized_content = df['content'].apply(normalize_word)
nn_df = pd.unique(normalized_content)
ordered_nouns = sorted(((zipf_frequency(w, 'en'), w) for w in nn_df), reverse=True)

# Skip the 15 entries at each extreme and stop at the middle of the ranking,
# so the two lists never overlap.
# NOTE(review): presumably the skipped entries are mis-tagged function words
# and out-of-vocabulary tokens - confirm.
highfreq_nouns = [n for freq, n in ordered_nouns[15:len(ordered_nouns)//2]]
lowfreq_nouns = [n for freq, n in ordered_nouns[::-1][15:len(ordered_nouns)//2]]

# Loop-invariant set-up
output_dir = f'{EXTRACTED_DATA_PATH}extracted_data_{task}/wordfreq_statistics'
os.makedirs(output_dir, exist_ok=True)
data_channel = device_channel_to_data_channel(plot_channel)
words_per_figure = 3  # one entry of highfreq_colors / lowfreq_colors per word
pair_count = min(len(highfreq_nouns), len(lowfreq_nouns))

for nidx in range(0, pair_count, words_per_figure):
    # plot_count is the number of figures still to produce; a non-positive
    # value produces none (previously it was never equal to 0 and every batch was plotted)
    if plot_count <= 0:
        break
    fig, ax = plt.subplots(figsize=(10, 6))
    # the last batch may hold fewer than words_per_figure words; never index past the lists
    for j in range(min(words_per_figure, pair_count - nidx)):
        # high-frequency word first, then its low-frequency counterpart (keeps the legend order)
        for word, color in ((highfreq_nouns[nidx+j], highfreq_colors[j]),
                            (lowfreq_nouns[nidx+j], lowfreq_colors[j])):
            word_df = df[normalized_content == word]
            word_data = helper_get_word_eeg(word_df)
            ax.plot(np.arange(word_data.shape[0]), word_data[:, data_channel],
                    label=f'"{word}", zipf_freq={zipf_frequency(word, "en")}', color=color)  # TODO freq
    # NOTE(review): the x values are sample indices; the 'ms' label is only
    # right if the recordings hold one sample per millisecond - confirm.
    ax.set_xlabel('Time (ms)')
    ax.set_ylabel('Amplitude (µV)')
    ax.set_title(f'E{plot_channel} EEG data comparing high and low frequency words')
    ax.axvline(0, color='black', linestyle='--')  # Add vertical line at 0s (stimulus onset)
    ax.legend(loc='upper right')
    # plot_count counts down, so the first figure gets the highest number in its file name
    fig.savefig(f'{output_dir}/wordfreq_comparison_plot{plot_count}.png', format='png', dpi=1200)
    plt.close(fig)
    plot_count -= 1
    print('Finished plotting batch.')
print('Done')

Finished plotting batch.
Finished plotting batch.
Finished plotting batch.
Finished plotting batch.
Finished plotting batch.
Finished plotting batch.
Finished plotting batch.
Finished plotting batch.
Finished plotting batch.
Finished plotting batch.
Done
