## Deep Learning

+ Create labels for processed data
+ Create generator because of limited memory
+ Load multiple deep learning models models
+ Train the EEG data and labels on the models
+ Load and save the models
+ Visualise the results and performance

Modified from: https://github.com/epodium/EEG_age_prediction


In [1]:
import os
import sys
import glob
import numpy as np
import pandas as pd
import mne
from IPython.display import clear_output

# PATH_MAIN = os.path.join(main_path, 'researchdrive', 'ePodium (Projectfolder)')
PATH_MAIN = os.path.join('D:', 'EEG Data', 'DDP Surfdrive')
PATH_METADATA = os.path.join(PATH_MAIN, 'metadata')
PATH_PROCESSED = os.path.join(PATH_MAIN, 'processed')
PATH_MODELS = os.path.join(PATH_MAIN, 'models')

dataframe = pd.read_excel(os.path.join(PATH_MAIN, 'DDP_age_labels.xlsx'), index_col=0) 

FileNotFoundError: [Errno 2] No such file or directory: 'D:/EEG Data/DDP Surfdrive/DDP_age_labels.xlsx'

#### Create and Load metadata file of processed data

In [2]:
excel_filepath = os.path.join(PATH_MAIN, 'DDP_age_labels_processed.xlsx')

if(os.path.exists(excel_filepath)):
    dataframe_processed = pd.read_excel(excel_filepath)
else:
    # Storing each column seperately, before concatinating as DataFrame
    code_list = []
    path_list = []
    file_list = []
    age_group_list = []
    age_days_list = []

    files_path = glob.glob(PATH_PROCESSED + '/*.npy')
    for i, file_path in enumerate(files_path):
        filename = os.path.basename(os.path.splitext(file_path)[0])  
        data = dataframe.loc[dataframe['cnt_file'] == filename]

        code_list.append(data['code'].values[0] )    
        path_list.append(file_path)
        file_list.append(filename)
        age_group_list.append( data['age_group'].values[0])
        age_days_list.append(data['age_days'].values[0])

    dataframe_processed = pd.DataFrame({"code": code_list, 'path': path_list, "file": file_list, 'age_group': age_group_list, 'age_days': age_days_list})
    dataframe_processed.to_excel(excel_filepath, index = True)

In [3]:
from sklearn.model_selection import train_test_split

subject_ids = sorted(list(set(dataframe_processed["code"].tolist())))
IDs_train, IDs_temp = train_test_split(subject_ids, test_size=0.3, random_state=42)
IDs_test, IDs_val = train_test_split(IDs_temp, test_size=0.5, random_state=42)

In [4]:
subject_ids = sorted(list(set(dataframe_processed["code"].tolist())))
print(f"{len(subject_ids)} unique subject id's")

304 unique subject id's


In [5]:
from dataset_generator import DataGenerator

In [6]:
train_generator_noise = DataGenerator(list_IDs = IDs_train,
                                      BASE_PATH = PATH_PROCESSED,
                                      metadata = dataframe_processed,
                                      n_average = 30,
                                      batch_size = 10,
                                      gaussian_noise=0.01,
                                      iter_per_epoch = 30,
                                      n_timepoints = 501, 
                                      n_channels=30, 
                                      shuffle=True)

val_generator = DataGenerator(list_IDs = IDs_val,
                              BASE_PATH = PATH_PROCESSED,
                              metadata = dataframe_processed,
                              n_average = 30,
                              batch_size = 10,
                              iter_per_epoch = 100,
                              n_timepoints = 501,
                              n_channels=30,
                              shuffle=True)

In [7]:
train_generator_noise.__getitem__(0)

D:EEG Data\DDP Surfdrive\processed\126_11_mr_mmn2_25_wk_mmn47_wk_mmn58_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\345_11_mc_mmn58_slp.cnt.npy
D:EEG Data\DDP Surfdrive\processed\334_23_mc_mmn36_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\472_35_jd_mmn25_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\466_29_md_mmn36_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\162_11_mr_mmn36.cnt.npy
D:EEG Data\DDP Surfdrive\processed\494_11_jd_mmn25_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\001_23_jc_mmn36_wk_mmn25_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\148_35_mr_mmn36_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\637-479-29m-mc-mmn36.cnt.npy


(array([[[ 2.12376484e-01,  3.33735747e-02,  1.83864336e-01, ...,
           3.63006617e-02,  2.19464798e-02,  2.83105898e-02],
         [ 2.31186848e-01,  6.34548754e-02,  2.05194632e-01, ...,
           1.51055446e-02,  4.55502839e-02, -4.09207915e-02],
         [ 2.24586755e-01,  5.29160757e-02,  2.02865850e-01, ...,
           2.13269079e-02,  1.99130268e-02, -6.34920408e-02],
         ...,
         [ 6.38247445e-02,  1.07084180e-01,  6.47480873e-01, ...,
          -1.15780907e-01, -2.23342328e-01, -3.39567968e-01],
         [ 7.37314609e-02,  1.08483372e-01,  6.45949453e-01, ...,
          -1.30336869e-01, -2.85129156e-01, -3.20048528e-01],
         [ 5.77828149e-02,  7.74539263e-02,  6.26372186e-01, ...,
          -1.42107016e-01, -3.07350385e-01, -3.18309913e-01]],
 
        [[ 2.51031182e-02, -8.77178173e-02, -8.16399840e-02, ...,
           4.27777225e-02,  1.81261858e-01,  1.88736271e-01],
         [ 3.15925440e-02, -9.07187258e-02, -1.11924316e-01, ...,
           3.36567883

In [8]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Input, Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dropout, BatchNormalization, Dense, Conv1D, LeakyReLU, AveragePooling1D, Flatten, Reshape, MaxPooling1D
from tensorflow.keras.optimizers import Adam, Adadelta, SGD
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError
import time

n_timesteps = 501
n_features = 30 
n_outputs = 1

input_shape = (n_timesteps, n_features)

In [9]:
def evaluate_model(model):
    """ Evaluates the model """
    model.evaluate(train_generator_noise)
    model.evaluate(val_generator)
    model.evaluate(test_generator) 
    
def print_few_predictions(model):
    """ Prints a few predictions, as a sanity check """
    x_test, y_test = test_generator.__getitem__(0)

    print(model.predict(x_test))
    print(y_test)
    
def plot_loss(history):
    """ Plots the MSE, RMSE, and MAE loss for the training and validation data over time """
    
    %matplotlib inline
    
    fig, (ax1, ax2, ax3) = plt.subplots(3, 1, sharex=True, figsize=(12,12), dpi=200)

    ax1.plot(history.history['loss'], label='training data')  
    min_loss = min(history.history['val_loss'])
    val_plot1 = ax1.plot(history.history['val_loss'], label='validation data')
    ax1.axhline(y = min_loss, color = val_plot1[0].get_color(), linestyle = '--') 
    x0,x1 = ax1.get_xlim()
    ax1.text(x1, min_loss, "{:.2f}".format(min_loss), ha='left', va='center')
    ax1.set_title('MSE loss')
    ax1.set_ylabel("MSE")
    ax1.set_xlabel("epochs")
    ax1.legend()

    ax2.plot(history.history['root_mean_squared_error'], label='training data')
    min_loss = min(history.history['val_root_mean_squared_error'])
    val_plot2 = ax2.plot(history.history['val_root_mean_squared_error'], label='validation data')
    ax2.axhline(y = min_loss, color = val_plot2[0].get_color(), linestyle = '--') 
    x0,x1 = ax2.get_xlim()
    ax2.text(x1, min_loss, "{:.2f}".format(min_loss), ha='left', va='center')
    ax2.set_title('RMSE loss')
    ax2.set_ylabel("RMSE")
    ax2.set_xlabel("epochs")
    ax2.legend()
    
    ax3.plot(history.history['mean_absolute_error'], label='training data')    
    min_loss = min(history.history['val_mean_absolute_error'])
    val_plot3 = ax3.plot(history.history['val_mean_absolute_error'], label='validation data')
    ax3.axhline(y = min_loss, color = val_plot3[0].get_color(), linestyle = '--') 
    x0,x1 = ax3.get_xlim()
    ax3.text(x1, min_loss, "{:.2f}".format(min_loss), ha='left', va='center')
    ax3.set_title('MAE loss')
    ax3.set_ylabel("MAE")
    ax3.set_xlabel("epochs")
    ax3.legend()

In [10]:
def fully_connected_model():
    """ Returns the fully connected model from Ismail Fawaz et al. (2019). """

    input_layer = keras.layers.Input(input_shape)

    input_layer_flattened = keras.layers.Flatten()(input_layer)

    layer_1 = keras.layers.Dropout(0.1)(input_layer_flattened)
    layer_1 = keras.layers.Dense(500, activation='relu')(layer_1)

    layer_2 = keras.layers.Dropout(0.2)(layer_1)
    layer_2 = keras.layers.Dense(500, activation='relu')(layer_2)

    layer_3 = keras.layers.Dropout(0.2)(layer_2)
    layer_3 = keras.layers.Dense(500, activation='relu')(layer_3)

    output_layer = keras.layers.Dropout(0.3)(layer_3)
    output_layer = keras.layers.Dense(1)(output_layer)

    model = keras.models.Model(inputs=input_layer, outputs=output_layer)

    return model

In [11]:
model = fully_connected_model()

optimizer = Adadelta(learning_rate=0.01)    
              
model.compile(loss='mean_squared_error', 
              optimizer=optimizer, 
              metrics=[RootMeanSquaredError(), MeanAbsoluteError()])

# 01 seems to be incorrect (makes too many predictions, changed model)
# Fully_connected_regressor_01: MSE, Adadelta, N_average=30, 5000 epochs, ES=1000, RLR=200, gaussian=0.01
# Fully_connected_regressor_02: MSE, Adadelta, N_average=30, 5000 epochs, ES=1000, RLR=200, gaussian=0.01
output_filename = 'Fully_connected_regressor_FINAL_LOCAL'
output_file = os.path.join(PATH_MODELS, output_filename)

checkpointer = ModelCheckpoint(filepath = output_file + ".hdf5", monitor='val_loss', verbose=1, save_best_only=True)
earlystopper = EarlyStopping(monitor='val_loss', patience=1200, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=200, min_lr=0.0001, verbose=1)

In [12]:
%%time

from dataset_generator import DataGenerator
epochs = 5000

# fit network
history = model.fit(x=train_generator_noise,
                    validation_data=val_generator,
                    epochs=epochs, 
                    callbacks=[checkpointer, earlystopper, reduce_lr])

D:EEG Data\DDP Surfdrive\processed\126_11_mr_mmn2_25_wk_mmn47_wk_mmn58_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\345_11_mc_mmn58_slp.cnt.npy
D:EEG Data\DDP Surfdrive\processed\334_23_mc_mmn36_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\472_35_jd_mmn25_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\466_29_md_mmn36_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\162_11_mr_mmn36.cnt.npy
D:EEG Data\DDP Surfdrive\processed\494_11_jd_mmn25_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\001_23_jc_mmn36_wk_mmn25_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\148_35_mr_mmn36_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\637-479-29m-mc-mmn36.cnt.npy
Epoch 1/5000
D:EEG Data\DDP Surfdrive\processed\015_35_jc_mmn36_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\147_35_jd_mmn36_2.cnt.npy
D:EEG Data\DDP Surfdrive\processed\173_35_jr_mmn36_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\118_29_mr_mmn36_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\141_17_jr_mmn58.cnt.npy
D:EEG Data\DDP Surfdrive\process

  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


D:EEG Data\DDP Surfdrive\processed\486_23_jd_mmn36_wk.cnt.npy
 33/144 [=====>........................] - ETA: 4:01 - loss: nan - root_mean_squared_error: nan - mean_absolute_error: nan                  D:EEG Data\DDP Surfdrive\processed\025_11_mc_mmn36_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\705-050-41m-jr-mmn36.cnt.npy
D:EEG Data\DDP Surfdrive\processed\181_11_jr_mmn36_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\610-185-29m-jc-mmn36-2-waak.cnt.npy
D:EEG Data\DDP Surfdrive\processed\129_35_jr_mmn36_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\123_23_jr_mmn25_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\448_41_jd_mmn39_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\306_17_mc_mmn25_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\430_23_md_mmn25_wk.cnt.npy
D:EEG Data\DDP Surfdrive\processed\616-240-29m-mc-mmn36.cnt.npy
D:EEG Data\DDP Surfdrive\processed\306_11_mc_mmn_2.cnt.npy
D:EEG Data\DDP Surfdrive\processed\034_11_mc_mmn47_slp.cnt.npy
D:EEG Data\DDP Surfdrive\processed\307_29_j