## Applying deep learning methods to the ePodium dataset to predict dyslexia

#### Import Packages

In [1]:
import mne
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.utils import Sequence
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split

from functions import epodium
from models.dnn import fully_connected_model
from models.transformer import TransformerModel

import local_paths

2022-08-15 08:33:01.399019: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-15 08:33:01.399055: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


#### Check number of epochs in each experiment
Experiments with enough epochs are added to *clean_list*

In [2]:
# Minimum number of remaining epochs required for each event type, per condition.
standard_minimum = 180 # total of 360
deviant_minimum = 80 # total size of 120
firststandard_minimum = 80 # total size of 120

# Event ids of first-standard / standard / deviant events, one entry per condition.
firststandard_index = [1, 4, 7, 10]
standard_index = [2, 5, 8, 11]
deviant_index = [3, 6, 9, 12]

count_analyzed = 0
count_bad = 0
clean_list = []

events_dir = local_paths.ePod_processed_autoreject_events
for event_file in os.listdir(events_dir):
    # Only 8-character '.txt' file names are treated as experiment event files.
    if len(event_file) != 8 or not event_file.endswith('.txt'):
        continue

    count_analyzed += 1
    event = np.loadtxt(os.path.join(events_dir, event_file), dtype=int)
    event_ids = event[:, 2]  # third column is compared against the event ids above

    # An experiment is usable only if every condition has enough epochs
    # of all three event types; all() stops at the first failing condition.
    enough_epochs = all(
        np.count_nonzero(event_ids == standard_id) >= standard_minimum
        and np.count_nonzero(event_ids == deviant_id) >= deviant_minimum
        and np.count_nonzero(event_ids == firststandard_id) >= firststandard_minimum
        for standard_id, deviant_id, firststandard_id
        in zip(standard_index, deviant_index, firststandard_index)
    )

    if enough_epochs:
        clean_list.append(event_file)
    else:
        count_bad += 1

clean_list = sorted(clean_list)
print(f"Analyzed: {count_analyzed}, bad: {count_bad}")
print(f"{len(clean_list)} files have enough epochs for analysis.")

Analyzed: 188, bad: 37
151 files have enough epochs for analysis.


#### Split into train and test dataset
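The split is made on participants, so both recordings of one participant always end up in the same set. The train and test sets contain the same proportion of participants who completed only experiment a, only experiment b, or both experiments.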

In [3]:
# Split test/train on participant, so that the 'a' and 'b' experiments of one
# participant never end up on different sides of the split.
SPLIT_SEED = 42       # fixed seed: the split is reproducible after a kernel restart
TEST_FRACTION = 0.25  # fraction of participants (per group) held out for testing

experiments = [file.replace('.txt', '') for file in clean_list]

# Participant ids (experiment name without its trailing 'a'/'b') per experiment type.
experiments_a = [name[:-1] for name in experiments if name.endswith('a')]
experiments_b = [name[:-1] for name in experiments if name.endswith('b')]

# Split experiments into participants that did a, b, and both.
# Sets are used for the membership tests only; the lists keep the sorted order.
ids_a, ids_b = set(experiments_a), set(experiments_b)
experiments_a_and_b = [pid for pid in experiments_a if pid in ids_b]
experiments_a_only = [pid for pid in experiments_a if pid not in ids_b]
experiments_b_only = [pid for pid in experiments_b if pid not in ids_a]

participants = sorted(experiments_a_and_b + experiments_a_only + experiments_b_only)

# Split each group separately so train and test keep the same a/b/both proportions.
train_ab, test_ab = train_test_split(experiments_a_and_b, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
train_a, test_a = train_test_split(experiments_a_only, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
train_b, test_b = train_test_split(experiments_b_only, test_size=TEST_FRACTION, random_state=SPLIT_SEED)

# Rebuild the experiment names ('<participant>a' / '<participant>b') for each set.
train = [x + 'a' for x in train_ab] + [x + 'b' for x in train_ab] + \
        [x + 'a' for x in train_a] + [x + 'b' for x in train_b]
test = [x + 'a' for x in test_ab] + [x + 'b' for x in test_ab] + \
       [x + 'a' for x in test_a] + [x + 'b' for x in test_b]

# Guard against data leakage: no participant may appear in both sets.
assert not {name[:-1] for name in train} & {name[:-1] for name in test}, \
    "participant present in both train and test"

#### Create Iterator Sequence as input to feed the model
https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence


In [4]:
class EvokedIterator(Sequence):
    """Keras ``Sequence`` that builds batches of evoked responses (ERPs) on the fly.

    Every batch covers ``n_experiments`` experiments. For each experiment and
    each key in ``epodium.event_dictionary`` one sample is created by averaging
    ``n_trials_averaged`` randomly chosen trials from the matching ``.npy``
    file. The trials are drawn with ``np.random.choice``, so repeated requests
    for the same batch index generally return different samples.

    Each label is a vector of 5 binary values:
    (Sex is "F", Group_AccToParents is "At risk",
    key ends with "_FS", key ends with "_S", key ends with "_D").
    """

    def __init__(self, experiments, n_experiments = 8, n_trials_averaged = 60):
        """
        Args:
            experiments: list of experiment names; the first three characters
                of a name are used as the participant id.
            n_experiments: number of experiments per batch.
            n_trials_averaged: number of trials averaged into one sample.
        """
        # Newer Keras versions expect Sequence subclasses to call the parent
        # constructor; on older versions this call is a harmless no-op.
        super().__init__()

        self.experiments = experiments
        self.n_experiments = n_experiments
        self.n_trials_averaged = n_trials_averaged

        metadata_path = os.path.join(local_paths.ePod_metadata, "children.txt")
        self.metadata = pd.read_table(metadata_path)

        event_types = 12 # (FS/S/D in 4 conditions)
        self.n_files =  len(self.experiments) * event_types
        self.batch_size = self.n_experiments * event_types

    def __len__(self):
        """Return the number of batches in the Sequence (last one included, rounded up)."""
        return int(np.ceil(len(self.experiments) / self.n_experiments))

    def _participant_labels(self, participant_id):
        """Return the (is_female, at_risk) labels of one participant as floats.

        Raises:
            ValueError: if the metadata does not hold exactly one row for
                ``participant_id``.
        """
        rows = self.metadata.loc[self.metadata['ParticipantID'] == float(participant_id)]
        if len(rows) != 1:
            raise ValueError(
                f"Expected exactly one metadata row for participant "
                f"{participant_id}, found {len(rows)}")
        is_female = float(rows["Sex"].item() == "F")
        at_risk = float(rows["Group_AccToParents"].item() == "At risk")
        return is_female, at_risk

    def __getitem__(self, index):
        """Build batch number ``index``.

        Returns:
            Tuple ``(x, y)`` of numpy arrays holding one sample and one
            5-element label vector per (experiment, event key) combination.

        Raises:
            ValueError: from ``np.random.choice`` when a file holds fewer than
                ``n_trials_averaged`` trials (sampling is without replacement).
        """
        x_batch = []
        y_batch = []

        for i in range(self.n_experiments):
            # The modulo wraps around to the first experiments, so the final
            # batch is filled up to the full batch size as well.
            experiment_index = (index * self.n_experiments + i) % len(self.experiments)
            experiment = self.experiments[experiment_index]

            # Participant-level labels are identical for all event keys of an
            # experiment, so they are looked up once per experiment.
            is_female, at_risk = self._participant_labels(experiment[:3])

            for key in epodium.event_dictionary:

                # Get file
                npy_name = experiment + "_" + key + ".npy"
                npy_path = os.path.join(local_paths.ePod_processed_autoreject_epochs_split_downsampled, npy_name)
                npy = np.load(npy_path)

                # Create ERP from averaging 'n_trials_averaged' trials.
                trial_indexes = np.random.choice(npy.shape[0], self.n_trials_averaged, replace=False)
                evoked = np.mean(npy[trial_indexes,:,:], axis=0)
                x_batch.append(evoked)

                # Create labels
                y = np.zeros(5)
                y[0] = is_female
                y[1] = at_risk
                if key.endswith("_FS"):
                    y[2] = 1
                if key.endswith("_S"):
                    y[3] = 1
                if key.endswith("_D"):
                    y[4] = 1
                y_batch.append(y)

        return np.array(x_batch), np.array(y_batch)
    
# Batch iterators for the training and the validation data (default batch settings).
train_sequence, test_sequence = EvokedIterator(train), EvokedIterator(test)
# Quick manual check of one batch (uncomment to run):
# x, y = train_sequence[0]
# x.shape

#### Train model

The data is an *evoked* response (*ERP*) from a participant in the ePodium experiment. Each ERP is the average of 60 EEG epochs, each spanning from 0.2 seconds before to 0.8 seconds after the onset of an event. This is done for each of the 12 event types separately.

dimensions: 
+ x (batches, timesteps, channels)
+ y (batches, labels)

labels: 
+ (Sex, At risk of dyslexia, first standard, standard, deviant)


In [None]:
# fit network
LEARNING_RATE = 1e-4      # initial learning rate of the Adam optimizer
MIN_LEARNING_RATE = 1e-6  # lower bound for ReduceLROnPlateau

# Re-running this cell continues training the existing model; the model and
# its callbacks are only created when no model exists in the kernel yet.
try:
    print(f"{model} already loaded")
except NameError:  # only "model is not defined yet"; other errors must surface
    print("initialise model")
    model = TransformerModel()
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
                             loss=tf.keras.losses.BinaryCrossentropy(),
                             metrics=[tf.keras.metrics.Precision(),tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.Recall()])

    # NOTE(review): the checkpoint name refers to a fully connected model while a
    # TransformerModel is trained here. Left unchanged because other code may
    # load this exact file name -- confirm and rename.
    output_filename = 'fully_connecteed_model'
    output_file = os.path.join(local_paths.models, output_filename)
    checkpointer = ModelCheckpoint(filepath = output_file + ".hdf5", monitor='val_loss', verbose=1, save_best_only=True)
    # NOTE(review): both patience values below are larger than epochs=100, so
    # neither callback can trigger within a single fit call -- confirm intended.
    earlystopper = EarlyStopping(monitor='val_loss', patience=1200, verbose=1)
    # min_lr must be below the initial learning rate; when both are equal the
    # callback can never reduce the learning rate.
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=200, min_lr=MIN_LEARNING_RATE, verbose=1)

history = model.fit(x=train_sequence,
                    validation_data=test_sequence,
                    epochs=100, 
                    callbacks=[checkpointer, earlystopper, reduce_lr])

initialise model


2022-08-15 08:33:06.272643: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-08-15 08:33:06.272672: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-08-15 08:33:06.272693: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (floriscpu): /proc/driver/nvidia/version does not exist
2022-08-15 08:33:06.272849: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/100


2022-08-15 08:33:29.022625: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 805306368 exceeds 10% of free system memory.
2022-08-15 08:33:29.337401: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 805306368 exceeds 10% of free system memory.
2022-08-15 08:33:29.917128: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 805306368 exceeds 10% of free system memory.
2022-08-15 08:33:30.195133: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 805306368 exceeds 10% of free system memory.
2022-08-15 08:33:30.826165: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 805306368 exceeds 10% of free system memory.


