# Training Deep Learning Models.

In this notebook the Dutch Dyslexia Programme (DDP) and ePodium datasets are used to train a deep neural network model.
The model is trained to predict the age and risk of dyslexia.
The input data consists of averaged epochs of the EEG data.

+ In section 1. [Prepare Dataset](#1mt) ...
+ In section 2. [Deep Learning](#2mt) ...

It is recommended to run this notebook with CUDA enabled on a dedicated graphics card to speed up training.

#### Import Packages

In [1]:
import mne
import numpy as np
import os
import glob
import matplotlib.pyplot as plt
import pandas as pd

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy, MeanSquaredError
from tensorflow.keras.metrics import Precision, BinaryAccuracy, Recall
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

import local_paths
from functions import processing, display_helper, data_io
from functions.epodium import Epodium
from functions.ddp import DDP
#from functions.train_and_predict import EpodiumSequence, DDPSequence

from models import transformer
from models.dnn import fully_connected_model
from models.hfawaz import cnn, encoder

<br>



---
<a id='1mt'></a>
## 1. Prepare Dataset

__input dimensions__: 
+ x (batches, timesteps, channels)
+ y (batches, labels)

__labels__: 
+ Binary: At risk of dyslexia
+ Regressive: Age, Vocabulary



In [2]:
# Select the dataset used by the rest of the notebook.
# Defines `dataset`, `epochs_directory`, `event_directory` and the label table
# (`epod_labels` or `ddp_labels`) for the chosen dataset.
dataset_name = "ddp" # "epodium" "ddp"

if dataset_name == "epodium":
    dataset = Epodium()
    epochs_directory = local_paths.ePod_epochs
    event_directory = local_paths.ePod_epochs_events

    epod_labels = dataset.create_labels(local_paths.ePod_metadata)
    print(f"The available labels are:\n {list(epod_labels.columns)}")

elif dataset_name == "ddp":
    dataset = DDP()
    epochs_directory = local_paths.DDP_epochs
    event_directory = local_paths.DDP_epochs_events

    directory_age_metadata = os.path.join(local_paths.DDP_metadata, "ages")
    ddp_labels = dataset.create_labels(local_paths.DDP_dataset, directory_age_metadata)
    print(f"The available labels are:\n {list(ddp_labels.columns)}")

else:
    # Fail here instead of with a NameError in a later cell.
    raise ValueError(f"Unknown dataset_name '{dataset_name}'. Choose 'epodium' or 'ddp'.")

The available labels are:
 ['filename', 'participant', 'age_group', 'age_days']


#### Split processed epochs* into train and test sequence.

    *In the context of electroencephalography (EEG), *epochs* are EEG segments in which an event occurs. In the context of deep learning, *epochs* are iterations over the entire training dataset.

First choose which processed data to use

In [3]:
# Keep only the experiments that have enough epochs for analysis
# (thresholds are passed as min_standards / min_deviants).
experiment_list = processing.valid_experiments(
    dataset,
    event_directory,
    min_standards=180,
    min_deviants=80,
)

# Divide the remaining experiments into a train and a test set.
experiments_train, experiments_test = dataset.split_train_test_datasets(experiment_list)

Analyzed: 1057, bad: 238
819 experiments have enough epochs for analysis.
The dataset is split up into 618 train and 201 test experiments


#### Preparing data iterator (Sequence) as input to the deep learning models.
https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence

In [4]:
from tensorflow.keras.utils import Sequence
import random

class EpodiumSequence(Sequence):
    """
    An Iterator Sequence class as input to feed the model.
    The next value is given from the __getitem__ function.
    For more information on Sequences, go to:
    https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence

    self.labels contains:  ['Participant', 'Age_days_a', 'Age_days_b', 'Risk_of_dyslexia']

    Args:
        experiments: Indexable collection of experiment names. The first
            three characters of a name are read as the participant id and
            the last character ('a' or 'b') selects the age column.
        target_labels: DataFrame with the label columns listed above.
        epochs_directory: Directory containing the '<experiment>_epo.fif' files.
        n_experiments_batch: Number of experiments loaded per batch.
        n_trials_averaged: Number of trials averaged into one ERP.
        gaussian_noise: Stored on the instance; currently not applied to the data.
    """

    # One data instance is created for each of these conditions.
    CONDITIONS = ('GiepM', "GiepS", "GopM", "GopS")

    def __init__(self, experiments, target_labels, epochs_directory, n_experiments_batch=8, n_trials_averaged=30, gaussian_noise=0):
        self.experiments = experiments
        self.labels = target_labels
        self.epochs_directory = epochs_directory

        self.n_experiments_batch = n_experiments_batch
        self.n_trials_averaged = n_trials_averaged
        self.gaussian_noise = gaussian_noise

    # The number of batches needed to cover every experiment once.
    def __len__(self):
        return int(np.ceil(len(self.experiments)/self.n_experiments_batch))

    def _average_trials(self, trials):
        """Average 'n_trials_averaged' randomly chosen trials (first axis) into one ERP."""
        trial_indexes = np.random.choice(trials.shape[0], self.n_trials_averaged, replace=False)
        return np.mean(trials[trial_indexes,:,:], axis=0)

    def _age_in_days(self, experiment, participant_labels):
        """Return the age target for an experiment from the participant's label row."""
        session = str(experiment[-1])
        if session == "a":
            return int(participant_labels["Age_days_a"].item())
        if session == "b":
            try:
                return int(participant_labels["Age_days_b"].item())
            except (ValueError, KeyError, TypeError): # If age of 'b' experiment not in metadata
                return int(participant_labels["Age_days_a"].item()) + 120
        # Without this, the target would be undefined or silently reused
        # from the previous experiment.
        raise ValueError(f"Experiment '{experiment}' does not end in 'a' or 'b'.")

    def __getitem__(self, index, verbose=False):
        """
        Build batch number `index`.

        Returns:
            Tuple (x, y) of numpy arrays: one averaged standard ERP and one
            age in days per condition and experiment, shuffled within the batch.
        """
        x_batch = []
        y_batch = []

        for i in range(self.n_experiments_batch):

            # Set participant; the modulo wraps around to the first
            # experiments when the last batch runs past the end.
            experiment_index = (index * self.n_experiments_batch + i) % len(self.experiments)
            experiment = self.experiments[experiment_index]
            participant = experiment[:3]
            participant_labels = self.labels.loc[self.labels['Participant']==float(participant)]

            # Load .fif file from the directory given to this sequence
            # (not from a notebook global).
            path_epochs = os.path.join(self.epochs_directory, experiment + "_epo.fif")
            epochs = mne.read_epochs(path_epochs, verbose=0)

            if verbose:
                print(experiment)
                print(epochs)

            # The target is the same for every condition of an experiment.
            y = self._age_in_days(experiment, participant_labels)

            # A data instance is created for each condition
            for condition in self.CONDITIONS:
                standard_event = condition + '_S'
                npy_S = epochs[standard_event].get_data() # TODO: DDP channels/ePod

                # Create ERP from averaging 'n_trials_averaged' trials.
                # TODO: merge the deviant ERP (condition + '_D') along the channel
                # dimension and apply self.gaussian_noise; only the standard ERP
                # is used as input for now.
                x_batch.append(self._average_trials(npy_S))
                y_batch.append(y)

        # Shuffle batch
        shuffle_batch = list(zip(x_batch, y_batch))
        random.shuffle(shuffle_batch)
        x_batch, y_batch = zip(*shuffle_batch)

        return np.array(x_batch), np.array(y_batch)
    

In [5]:
# Build the train/test iterators that are fed to the model.
if dataset_name == "epodium":
    train_sequence = EpodiumSequence(experiments_train, epod_labels, epochs_directory, n_experiments_batch=1, gaussian_noise=1e-6)
    test_sequence = EpodiumSequence(experiments_test, epod_labels, epochs_directory, n_experiments_batch=1)
else:
    # Only the ePodium sequence is defined in this notebook. Stop here with a
    # clear message instead of a NameError on `test_sequence` further down.
    raise NotImplementedError(
        f"No data sequence is implemented for dataset '{dataset_name}'; "
        "set dataset_name = 'epodium' to continue."
    )

#### Visualise data instance

During processing, the epochs are cut to a length of 1 second, with the event occurring at 0.2 s.

In [6]:
# Load one batch from the test sequence; verbose=True prints the experiment that is loaded.
x, y = test_sequence.__getitem__(6, verbose=True)
print(f"The shape of one data instance is {x[0].shape}")

index = 3 # 4 data instances for each experiment
# Take the number of averaged trials from the sequence so the plot always
# matches how the data instance was created.
display_helper.plot_array_as_evoked(
    x[index],
    dataset.channel_names,
    frequency=2048,
    n_trials=test_sequence.n_trials_averaged,
)

NameError: name 'test_sequence' is not defined

<br>
<br>

---
<a id='2mt'></a>
## 2. Deep Learning

The data is an *evoked* or *ERP* from a participant in the ePodium experiment. 60 EEG signals were averaged from -0.2 to +0.8 seconds relative to the onset of an event. This is done for each of the 12 event types separately.




#### Train model

In [None]:
# Model and training configuration
model_name = "encoder_age_128_3"
model = encoder((64,128), 1)
epochs = 300
learning_rate = 1e-5

# Paths to save model info
base_path = os.path.join(local_paths.models, model_name)

path_history = os.path.join(base_path, "history.npy")
path_model = os.path.join(base_path, "model")
path_testset = os.path.join(base_path, "testset.txt")
path_weights = os.path.join(base_path, "weights.h5")

if os.path.exists(path_model):
    # Never overwrite a previously trained model.
    print(f"Model: '{model_name}' already exist. Delete the existing model first or rename this model.")
else:
    print(f"Create model: {model_name}")
    # makedirs also creates a missing parent models directory.
    os.makedirs(base_path, exist_ok=True)

    # Save validation-set for future testing
    with open(path_testset, 'w') as f:
        for experiment in experiments_test:
            f.write(experiment + '\n')

    model.compile(optimizer=Adam(learning_rate=learning_rate), loss=MeanSquaredError()) # , metrics=[Precision(), BinaryAccuracy(), Recall()]

    # Fit model; the checkpointer keeps the weights with the lowest validation loss.
    checkpointer = ModelCheckpoint(filepath=path_weights, monitor='val_loss', verbose=1, save_weights_only=True, save_best_only=True)
    # Not active: add `reduce_lr` to `callbacks` below to enable it.
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=10, factor=0.7, verbose=1)
    history = model.fit(x=train_sequence, validation_data=test_sequence, epochs=epochs, callbacks=[checkpointer])

    np.save(path_history, history.history)
    model.save(path_model)

#### Show Training History

In [None]:
# Plot the training and validation loss per training epoch.
epoch_axis = range(len(history.history['loss']))
display_helper.show_plot(x=epoch_axis, y=history.history['loss'], xlabel="epochs", ylabel="training loss", title=f"Loss during training ({model_name})")
display_helper.show_plot(x=epoch_axis, y=history.history['val_loss'], xlabel="epochs", ylabel="validation loss", title=f"Validation loss during training ({model_name})")