In [1]:
# Electric Boogaloo
import os

def get_subject(filename):
    """Return the subject code embedded in a results file name.

    Only the base name of ``filename`` is inspected. The segment right after
    the first ``"sults"`` marker is taken and cut at its first underscore,
    e.g. ``"resultsYHS_NR.mat"`` -> ``"YHS"``.

    Raises:
        IndexError: if the base name does not contain ``"sults"``.
    """
    base_name = os.path.basename(filename)
    after_marker = base_name.split("sults")[1]
    subject, _, _ = after_marker.partition("_")
    return subject

def get_task(filename):
    """Return the task label embedded in a results file name.

    Only the base name of ``filename`` is inspected. It is split on
    underscores, the second field is taken and every ``".mat"`` occurrence
    is removed from it, e.g. ``"resultsYHS_NR.mat"`` -> ``"NR"``.

    Raises:
        IndexError: if the base name contains no underscore.
    """
    name_fields = os.path.basename(filename).split("_")
    task_field = name_fields[1]
    return task_field.replace(".mat", "")


def get_files(rootdir = 'task1 - NR/Matlab files/', task = 'NR'):
    """Collect the paths of the result files for one task found in ``rootdir``.

    Args:
        rootdir: directory whose entries are listed.
        task: task label; an entry is kept when its name ends with
            ``task + ".mat"``.

    Returns:
        A list of ``os.path.join(rootdir, entry)`` paths, in the order
        ``os.listdir`` yields the entries, without the files of subject
        ``'YMH'``.
    """
    wanted_suffix = task + ".mat"
    matching_paths = (
        os.path.join(rootdir, entry)
        for entry in os.listdir(rootdir)
        if entry.endswith(wanted_suffix)
    )
    # exclude YMH due to incomplete data because of dyslexia
    return [path for path in matching_paths if get_subject(path) != 'YMH']

# Collect the result files using get_files' default arguments and display the list.
all_files = get_files()
all_files

['task1 - NR/Matlab files/resultsYHS_NR.mat',
 'task1 - NR/Matlab files/resultsYDR_NR.mat',
 'task1 - NR/Matlab files/resultsYMD_NR.mat',
 'task1 - NR/Matlab files/resultsYRH_NR.mat',
 'task1 - NR/Matlab files/resultsYRK_NR.mat',
 'task1 - NR/Matlab files/resultsYSD_NR.mat',
 'task1 - NR/Matlab files/resultsYFR_NR.mat',
 'task1 - NR/Matlab files/resultsYDG_NR.mat',
 'task1 - NR/Matlab files/resultsYIS_NR.mat',
 'task1 - NR/Matlab files/resultsYFS_NR.mat',
 'task1 - NR/Matlab files/resultsYMS_NR.mat',
 'task1 - NR/Matlab files/resultsYAG_NR.mat',
 'task1 - NR/Matlab files/resultsYLS_NR.mat',
 'task1 - NR/Matlab files/resultsYRP_NR.mat',
 'task1 - NR/Matlab files/resultsYSL_NR.mat',
 'task1 - NR/Matlab files/resultsYTL_NR.mat',
 'task1 - NR/Matlab files/resultsYAK_NR.mat',
 'task1 - NR/Matlab files/resultsYAC_NR.mat']

In [2]:
import numpy as np
import h5py
import pandas as pd
from tqdm import tqdm

import data_loading_helpers as dh

In [3]:
# Open the first collected recording. The mode is passed explicitly so the file
# is opened read-only regardless of the installed h5py version (releases before
# 3.0 did not default to read-only). The handle stays open on purpose: the
# following cells keep dereferencing objects through `f`.
f = h5py.File(all_files[0], 'r')
# Top-level 'sentenceData' entry and the per-sentence datasets used below.
sentence_data = f['sentenceData']
rawData = sentence_data['rawData']
contentData = sentence_data['content']
omissionR = sentence_data['omissionRate']
wordData = sentence_data['word']

In [4]:
list(sentence_data.keys())

['allFixations',
 'content',
 'mean_a1',
 'mean_a1_diff',
 'mean_a2',
 'mean_a2_diff',
 'mean_b1',
 'mean_b1_diff',
 'mean_b2',
 'mean_b2_diff',
 'mean_g1',
 'mean_g1_diff',
 'mean_g2',
 'mean_g2_diff',
 'mean_t1',
 'mean_t1_diff',
 'mean_t2',
 'mean_t2_diff',
 'omissionRate',
 'rawData',
 'word',
 'wordbounds']

In [5]:
allFixations = sentence_data['allFixations']
allFixations

<HDF5 dataset "allFixations": shape (349, 1), type "|O">

In [6]:
f[ allFixations[0][0] ]

<HDF5 group "/#refs#/#e/msx" (4 members)>

In [7]:
list(f[ allFixations[0][0] ].keys()) 

['duration', 'pupilsize', 'x', 'y']

In [8]:
f[ allFixations[0][0] ]['duration']

<HDF5 dataset "duration": shape (28, 1), type "<f8">

In [9]:
f[ allFixations[0][0] ]['pupilsize']

<HDF5 dataset "pupilsize": shape (28, 1), type "<f8">

In [10]:
f[ allFixations[0][0] ]['x']

<HDF5 dataset "x": shape (28, 1), type "<f8">

In [11]:
f[ allFixations[0][0] ]['y']

<HDF5 dataset "y": shape (28, 1), type "<f8">

In [12]:
f[ allFixations[0][0] ]['duration'] [:]

array([[ 71.],
       [112.],
       [ 56.],
       [ 80.],
       [ 79.],
       [102.],
       [134.],
       [ 92.],
       [121.],
       [ 89.],
       [ 87.],
       [103.],
       [ 97.],
       [131.],
       [ 64.],
       [ 95.],
       [114.],
       [ 51.],
       [106.],
       [124.],
       [154.],
       [ 94.],
       [ 94.],
       [114.],
       [ 71.],
       [151.],
       [ 76.],
       [194.]])

In [13]:
# Check whether the numbers in wordData -> fixPositions are indices into the arrays in allFixations.
# wordbounds has to be inspected first.

In [14]:
wordbounds = sentence_data['wordbounds']
wordbounds

<HDF5 dataset "wordbounds": shape (349, 1), type "|O">

In [15]:
f[ wordbounds[0][0] ]

<HDF5 dataset "Zxx": shape (4, 25), type "<f8">

In [16]:
(f[ wordbounds[0][0] ][0][0], f[ wordbounds[0][0] ][1][0], f[ wordbounds[0][0] ][2][0], f[ wordbounds[0][0] ][3][0])

(89.0, 166.0, 148.125, 184.0)

In [17]:
# Bounds of the second word (column 1). All four components must come from the
# same column; the last one previously read column 0.
(f[ wordbounds[0][0] ][0][1], f[ wordbounds[0][0] ][1][1], f[ wordbounds[0][0] ][2][1], f[ wordbounds[0][0] ][3][1])

(148.125, 165.0, 204.875, 184.0)

In [18]:
(f[ wordbounds[0][0] ][0][2], f[ wordbounds[0][0] ][1][2], f[ wordbounds[0][0] ][2][2], f[ wordbounds[0][0] ][3][2])

(204.875, 165.0, 253.0, 180.0)

In [19]:
# ^ The tuples above are (xmin, ymin, xmax, ymax), one wordbounds column per word.

In [20]:
# Function that checks whether the contents of wordData -> fixPositions can validly be interpreted as indices into allFixations.
# !!! It would seem that, for some ungodly reason, the wordData -> fixPositions contents assume indexing starts from 1.
def check_indices():
    """Check whether ``wordData -> fixPositions`` values index into ``allFixations``.

    Works on the notebook-level ``f``, ``wordData``, ``allFixations`` and
    ``wordbounds`` objects. Every stored value is treated as a 1-based
    fixation index: it is shifted to 0-based, and the (x, y) of that fixation
    is compared with the (xmin, ymin, xmax, ymax) bounds of the word that
    references it.

    Returns:
        ``True`` when every referenced fixation exists and lies inside the
        bounds of its word.
        ``(False, message)`` as soon as a non-integer index is met.
        ``(fails, out_of_range_debug)`` otherwise, where

        * ``fails`` holds ``((sentence_idx, word_idx), debug)`` entries;
          ``debug[0]`` is 1 (x outside), 2 (y outside) or 3 (both), followed
          by one ``(value, lower, upper)`` tuple per violated axis;
        * ``out_of_range_debug`` holds
          ``(sentence_idx, idx, 0, last_valid_idx)`` for 0-based indices that
          have no matching fixation.
    """
    def helper_is_in_interval(x, xmin, xmax):
        # Written as a negated rejection so the result matches two separate
        # "<" / ">" rejections for every input.
        return not (x < xmin or x > xmax)

    fails = []
    out_of_range_debug = []
    for sentence_idx in tqdm(range(len(wordData))):
        # Dereference the per-sentence objects once, not once per word.
        sentence_fixations = f[allFixations[sentence_idx][0]]
        allFixPositions = list(zip(np.squeeze(sentence_fixations['x'], axis=1),
                                   np.squeeze(sentence_fixations['y'], axis=1)))
        last_fix_idx = len(allFixPositions) - 1
        word_fix_refs = f[wordData[sentence_idx][0]]['fixPositions']

        # No nested progress bar here: one bar per sentence floods the cell
        # output and hides the returned result.
        for word_idx in range(len(word_fix_refs)):
            word_fix_indices = f[word_fix_refs[word_idx][0]]
            if len(word_fix_indices.shape) != 2:
                # Entries that are not 2-D are skipped.
                continue
            fixIndices = np.squeeze(word_fix_indices, axis=1)
            bounds = f[wordbounds[sentence_idx][0]][:, word_idx]
            for idx in fixIndices:
                if idx != int(idx):
                    return (False, 'Not all indices are integers!')
                idx = int(idx) - 1  # stored indices start at 1

                # The index points past the fixations recorded for this sentence.
                if not helper_is_in_interval(idx, 0, last_fix_idx):
                    out_of_range_debug.append((sentence_idx, idx, 0, last_fix_idx))
                    continue

                fix_x, fix_y = allFixPositions[idx]
                x_inside = helper_is_in_interval(fix_x, bounds[0], bounds[2])
                y_inside = helper_is_in_interval(fix_y, bounds[1], bounds[3])
                if not (x_inside and y_inside):
                    debug = [0]
                    if not x_inside:
                        debug[0] += 1
                        debug.append((fix_x, bounds[0], bounds[2]))
                    if not y_inside:
                        debug[0] += 2
                        debug.append((fix_y, bounds[1], bounds[3]))
                    fails.append(((sentence_idx, word_idx), debug))
    if len(fails) != 0 or len(out_of_range_debug) != 0:
        return (fails, out_of_range_debug)
    return True
check_indices()
                    

  0%|                                                   | 0/349 [00:00<?, ?it/s]
100%|█████████████████████████████████████████| 25/25 [00:00<00:00, 2159.70it/s][A

100%|█████████████████████████████████████████| 17/17 [00:00<00:00, 2051.59it/s][A

100%|█████████████████████████████████████████| 15/15 [00:00<00:00, 2291.47it/s][A

100%|█████████████████████████████████████████| 27/27 [00:00<00:00, 2020.56it/s][A

100%|█████████████████████████████████████████| 23/23 [00:00<00:00, 2525.83it/s][A

100%|█████████████████████████████████████████| 37/37 [00:00<00:00, 2361.26it/s][A

100%|█████████████████████████████████████████| 25/25 [00:00<00:00, 2051.16it/s][A

100%|█████████████████████████████████████████| 33/33 [00:00<00:00, 2256.47it/s][A
  2%|▉                                          | 8/349 [00:00<00:04, 71.23it/s]
100%|█████████████████████████████████████████| 13/13 [00:00<00:00, 2079.40it/s][A

100%|█████████████████████████████████████████| 22/22 [00:00<00:00, 2469.2

([((2, 2), [1, (253.6, 171.0, 248.5)]),
  ((2, 3), [3, (359.7, 248.5, 273.96875), (236.5, 165.0, 180.0)]),
  ((2, 10), [3, (524.6, 88.0, 180.3828125), (172.5, 229.0, 244.0)]),
  ((2, 12), [1, (172.1, 220.5, 284.125)]),
  ((2, 13), [1, (227.8, 284.125, 336.5234375)]),
  ((2, 14), [1, (309.3, 336.5234375, 432.0)]),
  ((6, 11), [1, (484.1, 197.0, 327.3984375)]),
  ((6, 14), [1, (222.6, 469.90625, 509.5)]),
  ((10, 1), [3, (305.4, 132.0, 175.0), (412.5, 169.0, 180.0)]),
  ((10, 3), [3, (182.5, 200.9765625, 248.4765625), (412.5, 167.0, 184.0)]),
  ((10, 6), [1, (214.3, 349.90625, 379.0)]),
  ((10, 8), [1, (172.6, 487.7578125, 540.5)]),
  ((10, 10), [3, (361.0, 89.0, 176.0), (172.5, 225.0, 243.0)]),
  ((10, 11), [3, (549.7, 176.0, 282.96875), (172.5, 226.0, 244.0)]),
  ((10, 12), [1, (218.1, 282.96875, 419.8828125)]),
  ((10, 12), [3, (520.1, 282.96875, 419.8828125), (172.5, 225.0, 244.0)]),
  ((10, 15), [1, (150.3, 488.5, 537.5)]),
  ((10, 15), [1, (172.0, 488.5, 537.5)]),
  ((10, 18), [1, 

In [21]:
f[ f[wordData[2][0]]['fixPositions'] [2][0] ] [:]

array([[9.]])

In [22]:
# fixPositions holds 9 for sentence 2 / word 2; with the 1-based offset that is fixation index 8.
(f[ allFixations[2][0] ]['x'][8][0], f[ allFixations[2][0] ]['y'][8][0])

(253.6, 172.5)

In [23]:
# (xmin, ymin, xmax, ymax) of sentence 2 / word 2, to compare with the fixation coordinates.
(f[ wordbounds[2][0] ][0][2], f[ wordbounds[2][0] ][1][2], f[ wordbounds[2][0] ][2][2], f[ wordbounds[2][0] ][3][2])

(171.0, 165.0, 248.5, 180.0)

In [24]:
def get_all_fixIndices_for_sentence(sentence_idx=0):
    """Count how often each fixation index is referenced by a sentence's words.

    Reads the notebook-level ``f``, ``allFixations`` and ``wordData`` objects.

    Args:
        sentence_idx: position of the sentence in ``wordData`` / ``allFixations``.

    Returns:
        ``(total, counts)`` where ``total`` is the first dimension of the
        sentence's ``allFixations['x']`` dataset and ``counts`` maps each
        value found in the words' ``fixPositions`` entries (as stored in the
        file) to its number of occurrences, ordered by key.
    """
    fixation_total = f[allFixations[sentence_idx][0]]['x'].shape[0]
    word_refs = f[wordData[sentence_idx][0]]['fixPositions']
    counts = {}
    for word_idx in tqdm(range(len(word_refs))):
        word_fix = f[word_refs[word_idx][0]]
        # Only 2-D entries are counted; anything else is skipped.
        if len(word_fix.shape) == 2:
            for fix_idx in np.squeeze(word_fix, axis=1):
                counts[fix_idx] = counts.get(fix_idx, 0) + 1
    return fixation_total, dict(sorted(counts.items()))
get_all_fixIndices_for_sentence(252)

100%|█████████████████████████████████████████| 13/13 [00:00<00:00, 2980.05it/s]


(5, {1.0: 1, 2.0: 1, 3.0: 1, 4.0: 1, 5.0: 1, 6.0: 1})

In [25]:
# Conclusion: the dataset has inconsistencies.
# Problems found so far (only the NR data of subject YHS was checked):
# 1. Words are associated with fixations outside their bounds, some of which are more than 50 pixels away from the border.
# 2. Two sentences (102 and 252) have words associated with non-existent fixations (indices beyond the end of allFixations).

In [26]:
rawData

<HDF5 dataset "rawData": shape (349, 1), type "|O">

In [27]:
rawData[0][0]

<HDF5 object reference>

In [28]:
f[ rawData[0][0] ]

<HDF5 dataset "Of": shape (3274, 105), type "<f8">

In [29]:
# rawData seems to contain the raw EEG data

In [30]:
len(f[ rawData[0][0] ])

3274

In [31]:
f[ allFixations[0][0] ]['duration'][:, 0]

array([ 71., 112.,  56.,  80.,  79., 102., 134.,  92., 121.,  89.,  87.,
       103.,  97., 131.,  64.,  95., 114.,  51., 106., 124., 154.,  94.,
        94., 114.,  71., 151.,  76., 194.])

In [32]:
np.sum( f[ allFixations[0][0] ]['duration'][:, 0] )

2856.0

In [33]:
f[ f[ wordData[0][0] ]['fixPositions'] [0][0] ] [:, 0]

array([3.])

In [34]:
f[ f[ wordData[0][0] ]['fixPositions'] [1][0] ] [:, 0]

array([2.])

In [35]:
# Gather the fixPositions array of every word of the first sentence, in word
# order, and pass the list to dh.extract_word_order_from_fixations (defined in
# data_loading_helpers, not shown in this notebook).
first_sentence_fix_refs = f[ wordData[0][0] ]['fixPositions']
tmp = [
    np.array(f[ first_sentence_fix_refs[widx][0] ])
    for widx in range(len(first_sentence_fix_refs))
]
dh.extract_word_order_from_fixations(tmp)

[22,
 1,
 0,
 4,
 5,
 6,
 10,
 14,
 15,
 16,
 20,
 22,
 23,
 11,
 3,
 6,
 4,
 5,
 9,
 11,
 15,
 16,
 20,
 22,
 23,
 12,
 11,
 4]

In [36]:
# Word-level data of the first sentence, as returned by
# dh.extract_word_level_data (defined in data_loading_helpers, not shown here).
tmp = dh.extract_word_level_data(f, f[wordData[0][0]])
# Pair every key with its 'reading_order' value, or None when the entry has none.
tmp2 = [
    (i, tmp[i]['reading_order'] if 'reading_order' in tmp[i] else None)
    for i in tmp.keys()
]
tmp2

[(0, 2),
 (1, 1),
 (2, None),
 (3, 13),
 (4, 3),
 (5, 4),
 (6, 5),
 (7, None),
 (8, None),
 (9, 14),
 (10, 6),
 (11, 12),
 (12, 15),
 (13, None),
 (14, 7),
 (15, 8),
 (16, 9),
 (17, None),
 (18, None),
 (19, None),
 (20, 10),
 (21, None),
 (22, 0),
 (23, 11),
 (24, None)]

In [37]:
# looking for FRP-relevant data

In [38]:
rawData

<HDF5 dataset "rawData": shape (349, 1), type "|O">

In [39]:
f[ rawData[0][0] ]

<HDF5 dataset "Of": shape (3274, 105), type "<f8">

In [40]:
f[ rawData[1][0] ]

<HDF5 dataset "Pf": shape (2495, 105), type "<f8">

In [41]:
contentData

<HDF5 dataset "content": shape (349, 1), type "|O">

In [42]:
f[ contentData[0][0] ]

<HDF5 dataset "b": shape (152, 1), type "<u2">

In [43]:
f[ wordData[0][0] ]

<HDF5 group "/#refs#/vNb" (96 members)>

In [44]:
list(f[ wordData[0][0] ].keys())

['FFD',
 'FFD_a1',
 'FFD_a1_diff',
 'FFD_a2',
 'FFD_a2_diff',
 'FFD_b1',
 'FFD_b1_diff',
 'FFD_b2',
 'FFD_b2_diff',
 'FFD_g1',
 'FFD_g1_diff',
 'FFD_g2',
 'FFD_g2_diff',
 'FFD_pupilsize',
 'FFD_t1',
 'FFD_t1_diff',
 'FFD_t2',
 'FFD_t2_diff',
 'GD',
 'GD_a1',
 'GD_a1_diff',
 'GD_a2',
 'GD_a2_diff',
 'GD_b1',
 'GD_b1_diff',
 'GD_b2',
 'GD_b2_diff',
 'GD_g1',
 'GD_g1_diff',
 'GD_g2',
 'GD_g2_diff',
 'GD_pupilsize',
 'GD_t1',
 'GD_t1_diff',
 'GD_t2',
 'GD_t2_diff',
 'GPT',
 'GPT_a1',
 'GPT_a1_diff',
 'GPT_a2',
 'GPT_a2_diff',
 'GPT_b1',
 'GPT_b1_diff',
 'GPT_b2',
 'GPT_b2_diff',
 'GPT_g1',
 'GPT_g1_diff',
 'GPT_g2',
 'GPT_g2_diff',
 'GPT_pupilsize',
 'GPT_t1',
 'GPT_t1_diff',
 'GPT_t2',
 'GPT_t2_diff',
 'SFD',
 'SFD_a1',
 'SFD_a1_diff',
 'SFD_a2',
 'SFD_a2_diff',
 'SFD_b1',
 'SFD_b1_diff',
 'SFD_b2',
 'SFD_b2_diff',
 'SFD_g1',
 'SFD_g1_diff',
 'SFD_g2',
 'SFD_g2_diff',
 'SFD_pupilsize',
 'SFD_t1',
 'SFD_t1_diff',
 'SFD_t2',
 'SFD_t2_diff',
 'TRT',
 'TRT_a1',
 'TRT_a1_diff',
 'TRT_a2',
 'TR

In [45]:
f[ wordData[0][0] ]['FFD_a1']

<HDF5 dataset "FFD_a1": shape (25, 1), type "|O">

In [46]:
f[ f[ wordData[0][0] ]['FFD_a1'] [0][0] ]

<HDF5 dataset "aSb": shape (105, 1), type "<f8">

In [47]:
f[ f[ wordData[0][0] ]['FFD_a1'] [0][0] ] [:]

array([[0.6874614 ],
       [0.5069532 ],
       [0.36162168],
       [0.62940453],
       [0.51723752],
       [0.18245342],
       [0.67716419],
       [1.12280817],
       [0.94857309],
       [0.60052726],
       [0.38780863],
       [0.54821176],
       [0.88552557],
       [1.09511367],
       [0.73449303],
       [0.76387177],
       [0.85963046],
       [1.7931953 ],
       [0.71892968],
       [0.74304526],
       [0.06263754],
       [0.13402293],
       [0.12736848],
       [0.1246332 ],
       [0.38647165],
       [0.708815  ],
       [0.88107522],
       [0.81382108],
       [0.79380677],
       [0.89229209],
       [0.17045424],
       [0.6829877 ],
       [1.13340495],
       [0.89974947],
       [0.90402   ],
       [0.46207011],
       [0.68716916],
       [1.18163884],
       [0.93100749],
       [0.99780443],
       [0.52189303],
       [0.89131296],
       [1.16184906],
       [1.3970904 ],
       [1.15322937],
       [0.72872543],
       [0.18451192],
       [0.756

In [48]:
f[ f[ wordData[0][0] ]['TRT_a1'] [0][0] ]

<HDF5 dataset "s2b": shape (105, 1), type "<f8">

In [50]:
f[ f[ wordData[0][0] ]['TRT_a1'] [0][0] ] [:]

array([[0.6874614 ],
       [0.5069532 ],
       [0.36162168],
       [0.62940453],
       [0.51723752],
       [0.18245342],
       [0.67716419],
       [1.12280817],
       [0.94857309],
       [0.60052726],
       [0.38780863],
       [0.54821176],
       [0.88552557],
       [1.09511367],
       [0.73449303],
       [0.76387177],
       [0.85963046],
       [1.7931953 ],
       [0.71892968],
       [0.74304526],
       [0.06263754],
       [0.13402293],
       [0.12736848],
       [0.1246332 ],
       [0.38647165],
       [0.708815  ],
       [0.88107522],
       [0.81382108],
       [0.79380677],
       [0.89229209],
       [0.17045424],
       [0.6829877 ],
       [1.13340495],
       [0.89974947],
       [0.90402   ],
       [0.46207011],
       [0.68716916],
       [1.18163884],
       [0.93100749],
       [0.99780443],
       [0.52189303],
       [0.89131296],
       [1.16184906],
       [1.3970904 ],
       [1.15322937],
       [0.72872543],
       [0.18451192],
       [0.756

In [51]:
f[ f[ wordData[0][0] ]['SFD_a1'] [0][0] ]

<HDF5 dataset "gYb": shape (105, 1), type "<f8">

In [52]:
f[ f[ wordData[0][0] ]['GD_a1'] [0][0] ]

<HDF5 dataset "I9b": shape (105, 1), type "<f8">

In [53]:
f[ f[ wordData[0][0] ]['GPT_a1'] [0][0] ]

<HDF5 dataset "Yfc": shape (105, 1), type "<f8">