In [1]:
import numpy as np
import pandas as pd
import multiprocessing as mp
import matplotlib.pyplot as plt
import os
import mne
from wordfreq import word_frequency
from wordfreq import zipf_frequency
import re

from commons import normalize_word

from reader import EXTRACTED_DATA_PATH
from reader import EEG_CHANNEL_COUNT
from reader import MISSING_DATA_SYMBOL
from reader import EEG_FEATURES
from reader import ET_FEATURES

from reader import get_subjects_list

from eeg_plotter import MISSING_CHANNELS
from eeg_plotter import LANGUAGE_CHANNELS
from eeg_plotter import TRT_RANGES

from eeg_plotter import device_channel_to_data_channel
from eeg_plotter import channel_fill
from eeg_plotter import eeg_data_pad_to_len
from eeg_plotter import get_evoked_for_eeg_data
from eeg_plotter import get_raw_word_eeg
from eeg_plotter import get_raw_word_eeg_mean

def helper_get_word_mean_eeg(word_df, padding = 'mean'):
    '''
    Average the raw EEG recordings of one word over all of its appearances and all subjects.

    word_df: dataframe whose first three columns are, in this order, 'task', 'sentence_id' and 'word_idx';
             every row is one appearance of the word (possibly across different sentences).
    padding: forwarded unchanged to eeg_data_pad_to_len; it selects how recordings shorter than the
             longest one are filled (the notebook uses 'mean').

    Returns the element-wise mean of all padded recordings, or None when no recording could be read.
    '''
    recordings = []
    for appearance in word_df.itertuples(index=False):
        for subject in get_subjects_list():
            # get_raw_word_eeg expects a (task, subject, sentence_id, word_idx) tuple
            recording = get_raw_word_eeg( (appearance[0], subject, appearance[1], appearance[2]) )
            if recording is not None:
                recordings.append(recording)
    if not recordings:
        return None
    # every recording is padded along its first axis up to the longest one before averaging
    longest = max(recording.shape[0] for recording in recordings)
    padded = [eeg_data_pad_to_len(recording, longest, padding) for recording in recordings]
    return np.mean( np.array(padded), axis=0 )

In [2]:
# Build the list of (zipf frequency, noun) pairs, ordered from most to least frequent
# Please also choose a task:
task = 'NR'
pos_tags_src = 'nltk'
pos_tags_path = f'{EXTRACTED_DATA_PATH}extracted_data_{task}/pos_tags/{pos_tags_src}.tsv'
df = pd.read_csv(pos_tags_path, sep='\t', keep_default_na=False)
# keep only the words tagged as common nouns (singular or plural), normalized and de-duplicated
is_common_noun = df['part_of_speech'].isin(['NN', 'NNS'])
nn_df = pd.unique( df.loc[is_common_noun, 'content'].apply(normalize_word) )
ordered_nouns = sorted( ((zipf_frequency(w, 'en'), w) for w in nn_df), reverse=True )
ordered_nouns

[(7.73, 'the'),
 (7.41, 'and'),
 (7.27, 'in'),
 (6.82, 'this'),
 (6.47, 'one'),
 (6.3, 'her'),
 (6.29, 'time'),
 (6.26, 'she'),
 (6.25, 'people'),
 (6.12, 'only'),
 (6.11, 'him'),
 (6.04, 'back'),
 (5.98, 'may'),
 (5.96, 'years'),
 (5.96, 'year'),
 (5.96, 'work'),
 (5.89, 'world'),
 (5.89, 'life'),
 (5.82, 'man'),
 (5.81, 'home'),
 (5.78, 'part'),
 (5.75, 'help'),
 (5.72, 'thing'),
 (5.72, 'game'),
 (5.71, 'school'),
 (5.71, 'place'),
 (5.7, 'again'),
 (5.68, 'show'),
 (5.68, 'end'),
 (5.67, 'team'),
 (5.66, 'family'),
 (5.64, 'money'),
 (5.63, 'second'),
 (5.62, 'number'),
 (5.61, 'name'),
 (5.61, 'days'),
 (5.61, 'city'),
 (5.6, 'company'),
 (5.57, 'group'),
 (5.56, 'times'),
 (5.56, 'start'),
 (5.56, 'business'),
 (5.55, 'person'),
 (5.55, 'anything'),
 (5.54, 'point'),
 (5.54, 'change'),
 (5.52, 'states'),
 (5.52, 'power'),
 (5.52, 'music'),
 (5.52, 'including'),
 (5.51, 'men'),
 (5.51, 'head'),
 (5.5, 'side'),
 (5.5, 'job'),
 (5.49, 'service'),
 (5.49, 'later'),
 (5.48, 'season'),

In [3]:
# show the previous list in the opposite order (least frequent nouns first)
ordered_nouns[::-1]

[(0.0, 'demeanours'),
 (0.0, "greasin's"),
 (0.0, 'statcoulomb'),
 (1.01, 'colleages'),
 (1.02, '1760s'),
 (1.2, 'cornetist'),
 (1.21, 'quadricycle'),
 (1.3, 'fallaciously'),
 (1.62, 'confidantes'),
 (1.94, 'womaniser'),
 (2.03, 'busboy'),
 (2.06, 'quebecers'),
 (2.12, 'scoutmaster'),
 (2.36, 'reissues'),
 (2.45, 'astigmatism'),
 (2.45, 'botulism'),
 (2.47, 'brevet'),
 (2.49, 'bandleader'),
 (2.52, 'newscaster'),
 (2.52, 'snobby'),
 (2.7, 'trumpeter'),
 (2.72, 'stupor'),
 (2.73, 'boarder'),
 (2.75, 'machinist'),
 (2.76, 'appendicitis'),
 (2.77, 'sounders'),
 (2.8, 'crumbles'),
 (2.83, 'snarky'),
 (2.83, 'synapses'),
 (2.85, 'adequacy'),
 (2.85, 'handyman'),
 (2.88, 'bookkeeper'),
 (2.88, 'shopkeeper'),
 (2.89, 'beehive'),
 (2.93, 'mistresses'),
 (2.94, 'catchphrase'),
 (2.98, 'schoolteacher'),
 (3.0, 'heretic'),
 (3.0, 'semesters'),
 (3.0, 'yank'),
 (3.03, 'ticker'),
 (3.05, 'cremated'),
 (3.05, 'diffraction'),
 (3.06, '1826-1905'),
 (3.07, 'horseman'),
 (3.09, 'pane'),
 (3.1, 'pseudon

In [17]:
# number of (zipf frequency, noun) entries currently held in ordered_nouns
len(ordered_nouns)

703

In [12]:
# -> generate up to plot_count plots as such:
# -> for each plot, 3 high frequency and 3 low frequency nouns are chosen
# -> for each noun, the EEG data associated with each appearance is extracted, the shorter appearances are padded
# with their respective means to match the longest appearance, then all EEG data is averaged and plotted
# -> nouns for which no EEG data can be read are reported and left out of the plot
# NOTE: currently, the 15 most frequent nouns are ignored due to nltk's high error rate in that range, as well as
# the least frequent 15, to match sample counts
# -> results are saved in pos_analysis/wordfreq_statistics
task = 'TSR'
plot_channel = 51 # in device indices (a.k.a. 1-128)
plot_count = 10
pos_tags_src = 'nltk'

highfreq_colors = [(1, 0, 0), # red
                   (1, 0.33, 0), # orange 
                   (1, 1, 0)] # yellow
lowfreq_colors = [(0, 0, 0.75), # blue
                  (0, 0.75, 0), # green
                  (0.75, 0, 0.75)] # purple

df = pd.read_csv(f'{EXTRACTED_DATA_PATH}extracted_data_{task}/pos_tags/{pos_tags_src}.tsv', sep='\t', keep_default_na=False)
df = df[ (df['part_of_speech'].isin(['NN', 'NNS'])) ] [['task', 'content', 'sentence_id', 'word_idx']]

# Normalize the nouns once; the per-word lookups below reuse this column instead of
# re-normalizing the whole dataframe for every plotted word.
normalized_content = df['content'].apply(normalize_word)
nn_df = pd.unique(normalized_content)
ordered_nouns = sorted( ((zipf_frequency(w, 'en'), w) for w in nn_df), reverse=True )

highfreq_nouns = [ n for freq, n in ordered_nouns[15:len(ordered_nouns)//2] ]
lowfreq_nouns = [ n for freq, n in ordered_nouns[::-1][15:len(ordered_nouns)//2] ]

wordfreq_dir = f'{EXTRACTED_DATA_PATH}extracted_data_{task}/pos_analysis/wordfreq_statistics'
os.makedirs(wordfreq_dir, exist_ok=True)
data_channel = device_channel_to_data_channel(plot_channel)

def helper_plot_word_mean_eeg(ax, word, color):
    '''
    Plots on ax the mean EEG signal of `word` on the selected channel, using the given color.
    Words for which no EEG data is available are reported and skipped instead of crashing the cell.
    '''
    word_df = df[ normalized_content == word ]
    word_data = helper_get_word_mean_eeg(word_df[ ['task', 'sentence_id', 'word_idx'] ])
    if word_data is None:
        print(f'No EEG data found for "{word}", skipping it.')
        return
    ax.plot(range(word_data.shape[0]), word_data[:, data_channel],
            label=f'"{word}", zipf_freq={zipf_frequency(word, "en")}', color=color) #TODO freq

# plot_count itself is left untouched; the countdown also numbers the files (plot_count, ..., 1)
plots_remaining = plot_count
for nidx in range(0, min(len(highfreq_nouns), len(lowfreq_nouns)), 3):
    if plots_remaining <= 0:
        break
    fig, ax = plt.subplots(figsize=(10, 6))
    # slicing (instead of indexing nidx+j) keeps a final, incomplete batch from raising an IndexError
    batch = zip(highfreq_nouns[nidx:nidx+3], lowfreq_nouns[nidx:nidx+3])
    for j, (highfreq_word, lowfreq_word) in enumerate(batch):
        helper_plot_word_mean_eeg(ax, highfreq_word, highfreq_colors[j])
        helper_plot_word_mean_eeg(ax, lowfreq_word, lowfreq_colors[j])
    ax.set_xlabel('Time (ms)')
    ax.set_ylabel('Amplitude (µV)')
    ax.set_title(f'E{plot_channel} EEG data comparing high and low frequency words')
    ax.axvline(0, color='black', linestyle='--')  # Add vertical line at 0s (stimulus onset)
    ax.legend(loc='upper right')
    fig.savefig(f'{wordfreq_dir}/wordfreq_comparison_plot{plots_remaining}.png', format='png', dpi=1200)
    plt.close(fig)
    plots_remaining -= 1
    print('Finished plotting batch.')
print('Done')

Finished plotting batch.
Finished plotting batch.
Finished plotting batch.
Finished plotting batch.
Finished plotting batch.
Finished plotting batch.
Finished plotting batch.
Finished plotting batch.
Finished plotting batch.
Finished plotting batch.
Done


In [9]:
# show the unique (normalized) words that were marked with one of the tags in `tags`
task = 'NR'
pos_tags_src = 'nltk'
tags = ['NN', 'NNS']
pos_tags_path = f'{EXTRACTED_DATA_PATH}extracted_data_{task}/pos_tags/{pos_tags_src}.tsv'
df = pd.read_csv(pos_tags_path, sep='\t', keep_default_na=False)
tagged_words = df.loc[df['part_of_speech'].isin(tags), 'content']
pd.unique(tagged_words.apply(normalize_word)).tolist()

['son',
 'philanthropic',
 'organization',
 'charter',
 'welfare',
 'success',
 'and',
 'investors',
 'interest',
 'race',
 'cars',
 'company',
 'period',
 'quadricycle',
 'victory',
 'driver',
 'favorite',
 'engineer',
 'promotion',
 'time',
 'money',
 'attention',
 'experiments',
 'combustion',
 'engines',
 'completion',
 'vehicle',
 'year',
 'farm',
 'city',
 'parents',
 '1826-1905',
 'immigrants',
 'years',
 'wars',
 'regime',
 'evidence',
 'backing',
 'politics',
 'part',
 'statements',
 'representative',
 'daughter-in-law',
 'funds',
 'movement',
 'sources',
 'concept',
 'development',
 'employees',
 'associate',
 'spot',
 'widow',
 'stock',
 'position',
 'home',
 'apprentice',
 'machinist',
 'actor',
 'brothers',
 'quarters',
 'quarter',
 'descent',
 'movies',
 'job',
 'busboy',
 'disco',
 'candidate',
 'office',
 'particular',
 'hosts',
 'show',
 'times',
 'appearances',
 'movie',
 'version',
 "children's",
 'series',
 'appearance',
 'scoutmaster',
 'pedophile',
 'shirt',
 'mou

In [None]:
# Maps a coarse part-of-speech category name to the list of 'part_of_speech' tag values that belong to it.
# NOTE(review): the tag names look like the Penn Treebank tags emitted by nltk — confirm against the tagger used.
tagmap = {
    'noun-common': ['NN', 'NNS'],
    'noun-proper': ['NNP', 'NNPS'],
    'adjective': ['JJ', 'JJR', 'JJS'],
    'pronoun': ['PRP', 'PRP$'],
    'adverb': ['RB', 'RBR', 'RBS'],
    'verb': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
}

In [4]:
# generate topomaps of the activations of each part of speech in tagmap
# only word readings (one subject reading one word instance) with TRT>=min_trt are accepted
# for each word instance, shorter readings are padded (according to padding) and averaged together
# afterwards the eeg means of the instances are padded to match the longest and averaged again
# part-of-speech categories without any accepted reading are reported and skipped
# Results are saved in the pos_analysis/topomaps folder

task = 'TSR'
pos_tags_src = 'nltk'
min_trt = 500
padding = 'mean'

tagmap = {
    'noun-common': ['NN', 'NNS'],
    'noun-proper': ['NNP', 'NNPS'],
    'adjective': ['JJ', 'JJR', 'JJS'],
    'pronoun': ['PRP', 'PRP$'],
    'adverb': ['RB', 'RBR', 'RBS'],
    'verb': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
}

topomap_dir = f'{EXTRACTED_DATA_PATH}extracted_data_{task}/pos_analysis/topomaps'
os.makedirs(topomap_dir, exist_ok=True)

pos_df = pd.read_csv(f'{EXTRACTED_DATA_PATH}extracted_data_{task}/pos_tags/{pos_tags_src}.tsv', sep='\t', keep_default_na=False, na_values=[''])
full_df = pd.read_csv(f'{EXTRACTED_DATA_PATH}extracted_data_{task}/data.tsv', sep='\t', keep_default_na=False, na_values=[''])
full_df['TRT'] = full_df['TRT'].astype(float)
# The TRT threshold does not depend on the word, so it is applied once instead of once per word.
topomap_readings_df = full_df[ full_df['TRT'] >= min_trt ]

def helper_process_pos_df_row(row):
    '''
    Takes a row of pos_df (one word instance) and returns the mean EEG data over the readings of that
    instance whose TRT is at least min_trt, or None if there is no such reading with EEG data.

    full_df holds one row per (subject, word instance), so only the subjects whose own TRT passed the
    threshold are loaded. Passing the filtered rows to helper_get_word_mean_eeg would instead load every
    subject once per passing row, which both ignores the per-subject threshold and repeats the reads.
    NOTE(review): assumes the 'subject' column holds the same identifiers get_raw_word_eeg expects
    (i.e. those returned by get_subjects_list) — confirm.
    '''
    readings = topomap_readings_df[ (topomap_readings_df['sentence_id']==row['sentence_id'])
                                    & (topomap_readings_df['word_idx']==row['word_idx']) ]
    eeg_data = []
    for reading in readings[['task', 'subject', 'sentence_id', 'word_idx']].itertuples(index=False):
        # get_raw_word_eeg expects a (task, subject, sentence_id, word_idx) tuple
        word_eeg = get_raw_word_eeg(tuple(reading))
        if word_eeg is not None:
            eeg_data.append(word_eeg)
    if len(eeg_data) == 0:
        return None
    maxlen = max(word_eeg.shape[0] for word_eeg in eeg_data)
    return np.mean( np.stack([eeg_data_pad_to_len(word_eeg, maxlen, padding) for word_eeg in eeg_data]), axis=0 )

for pos in tagmap.keys():
    print(f'Potting for {pos}...')
    curr_pos_df = pos_df[ pos_df['part_of_speech'].isin(tagmap[pos]) ]
    # a plain list avoids relying on how DataFrame.apply wraps array results (and on empty frames)
    instance_means = [helper_process_pos_df_row(row) for _, row in curr_pos_df.iterrows()]
    instance_means = [instance_mean for instance_mean in instance_means if instance_mean is not None]
    if len(instance_means) == 0:
        print(f'No EEG data with TRT>={min_trt} found for {pos}, skipping it')
        continue
    maxlen = max(len(instance_mean) for instance_mean in instance_means)
    padded_instance_means = np.stack([eeg_data_pad_to_len(instance_mean, maxlen, padding) for instance_mean in instance_means])
    pos_mean = np.mean(padded_instance_means, axis=0)
    evoked = get_evoked_for_eeg_data(pos_mean, frequency=1000) # TODO freq
    fig = evoked.plot_topomap(times='peaks', show=False)
    fig.savefig(f'{topomap_dir}/{pos}_topomap_{padding}.png', format='png', dpi=1200)
    plt.close(fig)  # close exactly the figure that was saved, not whichever figure is current
    print(f'Finished plotting for {pos}')
print('Done')

Potting for noun-common...
Finished plotting for noun-common
Potting for noun-proper...
Finished plotting for noun-proper
Potting for adjective...
Finished plotting for adjective
Potting for pronoun...
Finished plotting for pronoun
Potting for adverb...
Finished plotting for adverb
Potting for verb...
Finished plotting for verb
Done


In [2]:
# generate spectral density plots of the activations of each part of speech in tagmap
# only word readings (one subject reading one word instance) with TRT>=min_trt are accepted
# for each word instance, shorter readings are padded (according to padding) and averaged together
# afterwards the eeg means of the instances are padded to match the longest and averaged again
# part-of-speech categories without any accepted reading are reported and skipped
# Results are saved in the pos_analysis/spectrography/mintrt{min_trt} folder

task = 'TSR'
pos_tags_src = 'nltk'
min_trt = 200
padding = 'mean'

tagmap = {
    'noun-common': ['NN', 'NNS'],
    'noun-proper': ['NNP', 'NNPS'],
    'adjective': ['JJ', 'JJR', 'JJS'],
    'pronoun': ['PRP', 'PRP$'],
    'adverb': ['RB', 'RBR', 'RBS'],
    'verb': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
}

psd_dir = f'{EXTRACTED_DATA_PATH}extracted_data_{task}/pos_analysis/spectrography/mintrt{min_trt}'
os.makedirs(psd_dir, exist_ok=True)

pos_df = pd.read_csv(f'{EXTRACTED_DATA_PATH}extracted_data_{task}/pos_tags/{pos_tags_src}.tsv', sep='\t', keep_default_na=False, na_values=[''])
full_df = pd.read_csv(f'{EXTRACTED_DATA_PATH}extracted_data_{task}/data.tsv', sep='\t', keep_default_na=False, na_values=[''])
full_df['TRT'] = full_df['TRT'].astype(float)
# The TRT threshold does not depend on the word, so it is applied once instead of once per word.
psd_readings_df = full_df[ full_df['TRT'] >= min_trt ]

def helper_process_pos_df_row(row):
    '''
    Takes a row of pos_df (one word instance) and returns the mean EEG data over the readings of that
    instance whose TRT is at least min_trt, or None if there is no such reading with EEG data.

    full_df holds one row per (subject, word instance), so only the subjects whose own TRT passed the
    threshold are loaded. Passing the filtered rows to helper_get_word_mean_eeg would instead load every
    subject once per passing row, which both ignores the per-subject threshold and repeats the reads.
    NOTE(review): assumes the 'subject' column holds the same identifiers get_raw_word_eeg expects
    (i.e. those returned by get_subjects_list) — confirm.
    '''
    readings = psd_readings_df[ (psd_readings_df['sentence_id']==row['sentence_id'])
                                & (psd_readings_df['word_idx']==row['word_idx']) ]
    eeg_data = []
    for reading in readings[['task', 'subject', 'sentence_id', 'word_idx']].itertuples(index=False):
        # get_raw_word_eeg expects a (task, subject, sentence_id, word_idx) tuple
        word_eeg = get_raw_word_eeg(tuple(reading))
        if word_eeg is not None:
            eeg_data.append(word_eeg)
    if len(eeg_data) == 0:
        return None
    maxlen = max(word_eeg.shape[0] for word_eeg in eeg_data)
    return np.mean( np.stack([eeg_data_pad_to_len(word_eeg, maxlen, padding) for word_eeg in eeg_data]), axis=0 )

for pos in tagmap.keys():
    print(f'Potting for {pos}...')
    curr_pos_df = pos_df[ pos_df['part_of_speech'].isin(tagmap[pos]) ]
    # a plain list avoids relying on how DataFrame.apply wraps array results (and on empty frames)
    instance_means = [helper_process_pos_df_row(row) for _, row in curr_pos_df.iterrows()]
    instance_means = [instance_mean for instance_mean in instance_means if instance_mean is not None]
    if len(instance_means) == 0:
        print(f'No EEG data with TRT>={min_trt} found for {pos}, skipping it')
        continue
    maxlen = max(len(instance_mean) for instance_mean in instance_means)
    padded_instance_means = np.stack([eeg_data_pad_to_len(instance_mean, maxlen, padding) for instance_mean in instance_means])
    pos_mean = np.mean(padded_instance_means, axis=0)
    evoked = get_evoked_for_eeg_data(pos_mean, frequency=1000) # TODO freq
    spectrum = evoked.compute_psd(n_jobs=3)
    fig = spectrum.plot(exclude='bads', show=False)
    fig.savefig(f'{psd_dir}/{pos}_psd_{padding}.png', format='png', dpi=1200)
    plt.close(fig)  # close exactly the figure that was saved, not whichever figure is current
    print(f'Finished plotting for {pos}')
print('Done')

Potting for noun-common...
    Using multitaper spectrum estimation with 7 DPSS windows
Plotting power spectral density (dB=True).
Finished plotting for noun-common
Potting for noun-proper...
    Using multitaper spectrum estimation with 7 DPSS windows
Plotting power spectral density (dB=True).
Finished plotting for noun-proper
Potting for adjective...
    Using multitaper spectrum estimation with 7 DPSS windows
Plotting power spectral density (dB=True).
Finished plotting for adjective
Potting for pronoun...
    Using multitaper spectrum estimation with 7 DPSS windows
Plotting power spectral density (dB=True).
Finished plotting for pronoun
Potting for adverb...
    Using multitaper spectrum estimation with 7 DPSS windows
Plotting power spectral density (dB=True).
Finished plotting for adverb
Potting for verb...
    Using multitaper spectrum estimation with 7 DPSS windows
Plotting power spectral density (dB=True).
Finished plotting for verb
Done


In [3]:
# load the per-subject word data of the chosen task and show the largest total reading time (TRT)
task = 'TSR'
data_path = f'{EXTRACTED_DATA_PATH}extracted_data_{task}/data.tsv'
full_df = pd.read_csv(data_path, sep='\t', keep_default_na=False, na_values=['']).astype({'TRT': float})
full_df['TRT'].max()

np.float64(3837.0)

In [4]:
# total number of rows in full_df
len(full_df)

143831

In [6]:
# show every other row (positions 0, 2, 4, ...) of full_df;
# positional slicing works for any number of rows, unlike a boolean mask built for a hard-coded length
full_df.iloc[::2]

Unnamed: 0,sentence,sentence_id,subject,task,content,word_idx,GD,TRT,FFD,SFD,GPT,nFix,reading_order,first_fixation_time
0,He was also the unsuccessful Republican nomine...,0,YMD,TSR,He,0,,,,,,,,
2,He was also the unsuccessful Republican nomine...,0,YMD,TSR,also,2,,,,,,,,
4,He was also the unsuccessful Republican nomine...,0,YMD,TSR,unsuccessful,4,112.0,209.0,112.0,,112.0,2.0,2.0,179.0
6,He was also the unsuccessful Republican nomine...,0,YMD,TSR,nominee,6,251.0,251.0,106.0,,251.0,2.0,4.0,587.0
8,He was also the unsuccessful Republican nomine...,0,YMD,TSR,President,8,170.0,298.0,51.0,,170.0,3.0,5.0,778.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143822,In 1999 Bush cofounded a educational-software ...,389,YAK,TSR,types,16,,,,,,,,
143824,In 1999 Bush cofounded a educational-software ...,389,YAK,TSR,content,18,154.0,154.0,154.0,154.0,154.0,1.0,13.0,2604.0
143826,In 1999 Bush cofounded a educational-software ...,389,YAK,TSR,appeal,20,84.0,84.0,84.0,84.0,84.0,1.0,14.0,2745.0
143828,In 1999 Bush cofounded a educational-software ...,389,YAK,TSR,multiple,22,121.0,121.0,121.0,121.0,121.0,1.0,15.0,2899.0
