## Deep learning models for age prediction on EEG data

This notebook aims to experiment with different deep learning architectures for age predicting using EEG data. This notebook uses a processed dataset that was preprocessed differently (not like the preprocessing notebooks included in this project).

The first model is a similar model as in "EEG-Based Age and Gender Prediction Using Deep BLSTM-LSTM Network Model" by Kaushik et al (2019). In this paper, they predict (classification) gender and age using EEG data with this model. However, for age prediction they've used age group bins (e.g. 42-55 years old). We change this classification task to a regression task for our models, and also experiment with different architectures.

In [1]:
import sys
import os
import fnmatch
import csv

sys.path.insert(0, os.path.dirname(os.getcwd()))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Input
from tensorflow.keras.layers import Bidirectional, LSTM, Dropout, BatchNormalization, Dense, Conv1D, LeakyReLU, AveragePooling1D, Flatten, Reshape, MaxPooling1D
from tensorflow.keras.optimizers import Adam, Adadelta
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

from dataset_generator_old import DataGenerator

In [2]:
from config import PATH_DATA_PROCESSED_DL, PATH_MODELS

## Load preprocessed data

In [3]:
dirs = os.listdir(PATH_DATA_PROCESSED_DL) # The files from directories containing the processed data
dirs

['processed_metadata_602-115-17m-mc-mmn.csv',
 'processed_metadata_602-115-29m-mc-mmn36.csv',
 'processed_metadata_604-133-17m-jc-mmn36.csv',
 'processed_metadata_604-133-29m-jc-mmn36.csv',
 'processed_metadata_605-131-17m-jc-mmn.csv',
 'processed_metadata_605-131-29m-jc-mmn36.csv',
 'processed_metadata_607-000-17m-jc-mmn1_36.csv',
 'processed_metadata_607-128-29m-jc-mmn36.csv',
 'processed_metadata_608-170-17m-mc-mmn36_2.csv',
 'processed_metadata_608-170-29m-mc-mmn36.csv',
 'processed_metadata_609-158-17m-jc-mmn36.csv',
 'processed_metadata_609-158-29m-jc-mmn36.csv',
 'processed_metadata_610-185-29m-jc-mmn36-2-waak.csv',
 'processed_metadata_610_185_17m_jc_mmn36.csv',
 'processed_metadata_611_157_17m_mc_mmn36.csv',
 'processed_metadata_613-176-17m-mc-mmn36.csv',
 'processed_metadata_613-176-29m-mc-mmn36.csv',
 'processed_metadata_618-163-17m-j-c-mmn36.csv',
 'processed_metadata_618-163-29m-jc-mmn36.csv',
 'processed_metadata_619-247-17m-mc-mmn36.csv',
 'processed_metadata_620_313_17m

In [4]:
dir_path = PATH_DATA_PROCESSED_DL

In [5]:
files_npy = []
files_csv = []

# Extend the two lists above with all the file names of the data files (.npy and .csv files)
for file in dirs:
    #print(file)
#     dir_path = os.path.join(PATH_DATA_PROCESSED_DL, directory)
#     files_in_dir = os.listdir(dir_path)
    files_npy.extend([os.path.join(dir_path, file_name) for file_name in fnmatch.filter(dirs, "*.npy")])
    files_csv.extend([os.path.join(dir_path, file_name) for file_name in fnmatch.filter(dirs, "*.csv")])

In [6]:
files_npy

['C:/Projects\\EEG_explorer\\Data/data_processed_DL/processed_raw602-115-17m-mc-mmn.npy',
 'C:/Projects\\EEG_explorer\\Data/data_processed_DL/processed_raw602-115-29m-mc-mmn36.npy',
 'C:/Projects\\EEG_explorer\\Data/data_processed_DL/processed_raw604-133-17m-jc-mmn36.npy',
 'C:/Projects\\EEG_explorer\\Data/data_processed_DL/processed_raw604-133-29m-jc-mmn36.npy',
 'C:/Projects\\EEG_explorer\\Data/data_processed_DL/processed_raw605-131-17m-jc-mmn.npy',
 'C:/Projects\\EEG_explorer\\Data/data_processed_DL/processed_raw605-131-29m-jc-mmn36.npy',
 'C:/Projects\\EEG_explorer\\Data/data_processed_DL/processed_raw607-000-17m-jc-mmn1_36.npy',
 'C:/Projects\\EEG_explorer\\Data/data_processed_DL/processed_raw607-128-29m-jc-mmn36.npy',
 'C:/Projects\\EEG_explorer\\Data/data_processed_DL/processed_raw608-170-17m-mc-mmn36_2.npy',
 'C:/Projects\\EEG_explorer\\Data/data_processed_DL/processed_raw608-170-29m-mc-mmn36.npy',
 'C:/Projects\\EEG_explorer\\Data/data_processed_DL/processed_raw609-158-17m-jc-

In [7]:
len(files_csv), len(files_npy)

(32448, 32448)

### Count the (main) labels for all present files

In [8]:
def read_labels(filename, PATH):
    "Opens the file, reads the meta data and returns the first value"
    
    metadata = []
    filename = os.path.join(PATH, filename)
    
    with open(filename, 'r') as readFile:
        reader = csv.reader(readFile, delimiter=',')
        for row in reader:
            #if len(row) > 0:
            metadata.append(row)
            
    readFile.close()
    return metadata[0]

In [9]:
print(type(files_csv[1]))

<class 'str'>


In [10]:
# Slightly 'hacky' way to determine the age related to the data signal

import re

label_collection = []
label_counts = []

for filename in files_csv: # Look at all .csv files
    
#     age_in_months = re.findall(r'\d+', filename)[0] # Read the age in months from the file name
#     print(age_in_months)
    if '17m' in filename:
        # print(filename)
        age_in_months = 17
    if '29m' in filename:
        # print(filename)
        age_in_months = 29
    

    y_EEG = read_labels(filename, PATH_DATA_PROCESSED_DL)
    y_EEG = [float(age_in_months) for label in y_EEG] # Overwrite the label found in the data 
                                                      # with the age in months from the file name
    
    labels_unique = list(set(y_EEG)) # Determine unique labels
    label_collection.append(labels_unique)
    
    # Count instances for each unique label
    label_count = []
    
    for label in labels_unique:
        idx = np.where(np.array(y_EEG) == label)[0]
        label_count.append(len(idx))
    label_counts.append(label_count)

label_counts[:10], label_collection

([[7], [7], [7], [7], [7], [7], [7], [7], [7], [7]],
 [[17.0],
  [29.0],
  [17.0],
  [29.0],
  [17.0],
  [29.0],
  [17.0],
  [29.0],
  [17.0],
  [29.0],
  [17.0],
  [29.0],
  [29.0],
  [17.0],
  [17.0],
  [17.0],
  [29.0],
  [17.0],
  [29.0],
  [17.0],
  [17.0],
  [17.0],
  [29.0],
  [29.0],
  [17.0],
  [17.0],
  [17.0],
  [29.0],
  [17.0],
  [17.0],
  [29.0],
  [29.0],
  [17.0],
  [17.0],
  [29.0],
  [17.0],
  [29.0],
  [17.0],
  [29.0],
  [29.0],
  [17.0],
  [17.0],
  [17.0],
  [17.0],
  [29.0],
  [17.0],
  [17.0],
  [29.0],
  [17.0],
  [29.0],
  [17.0],
  [29.0],
  [17.0],
  [17.0],
  [29.0],
  [29.0],
  [17.0],
  [17.0],
  [17.0],
  [17.0],
  [29.0],
  [29.0],
  [17.0],
  [17.0],
  [29.0],
  [17.0],
  [17.0],
  [17.0],
  [29.0],
  [17.0],
  [17.0],
  [29.0],
  [17.0],
  [17.0],
  [29.0],
  [17.0],
  [17.0],
  [29.0],
  [17.0],
  [29.0],
  [17.0],
  [29.0],
  [17.0],
  [29.0],
  [17.0],
  [17.0],
  [29.0],
  [17.0],
  [29.0],
  [17.0],
  [29.0],
  [29.0],
  [17.0],
  [29.0],
  [17.0

In [11]:
main_labels = [label[0] for label in label_collection]
len(main_labels)

32448

In [12]:
main_labels.count(17), main_labels.count(23), main_labels.count(29), main_labels.count(35), main_labels.count(41)

(19344, 0, 13104, 0, 0)

## Import and initiate data generator function

In [13]:
files_csv[:5]

['C:/Projects\\EEG_explorer\\Data/data_processed_DL/processed_metadata_602-115-17m-mc-mmn.csv',
 'C:/Projects\\EEG_explorer\\Data/data_processed_DL/processed_metadata_602-115-29m-mc-mmn36.csv',
 'C:/Projects\\EEG_explorer\\Data/data_processed_DL/processed_metadata_604-133-17m-jc-mmn36.csv',
 'C:/Projects\\EEG_explorer\\Data/data_processed_DL/processed_metadata_604-133-29m-jc-mmn36.csv',
 'C:/Projects\\EEG_explorer\\Data/data_processed_DL/processed_metadata_605-131-17m-jc-mmn.csv']

### Split data set into train, validation and test sets

In [14]:
for label in list(set(main_labels)):
    print("Found datapoints for label", label, "--->", main_labels.count(label))

Found datapoints for label 17.0 ---> 19344
Found datapoints for label 29.0 ---> 13104


In [15]:
np.where(np.array(main_labels) == 17)[0].shape

(19344,)

In [16]:
np.random.seed(1098)
split_ratio = (0.7, 0.15, 0.15)

IDs_train = []
IDs_val = []
IDs_test = []

for label in list(set(main_labels)):
    idx = np.where(np.array(main_labels) == label)[0]
    N_label = len(idx)
    print("Found", N_label, "datapoints for label", label)
    
    N_train = int(split_ratio[0] * N_label)
    N_val = int(split_ratio[1] * N_label)
    N_test = N_label - N_train - N_val
    print("Split dataset for label", label, "into train/val/test fractions:", N_train, N_val, N_test)
    
    # Select training, validation, and test IDs:
    trainIDs = np.random.choice(idx, N_train, replace=False)
    valIDs = np.random.choice(list(set(idx) - set(trainIDs)), N_val, replace=False)
    testIDs = list(set(idx) - set(trainIDs) - set(valIDs))
    
    IDs_train.extend(list(trainIDs))
    IDs_val.extend(list(valIDs))
    IDs_test.extend(list(testIDs))

Found 19344 datapoints for label 17.0
Split dataset for label 17.0 into train/val/test fractions: 13540 2901 2903
Found 13104 datapoints for label 29.0
Split dataset for label 29.0 into train/val/test fractions: 9172 1965 1967


In [17]:
# print(IDs_test)

In [18]:
# print(IDs_train)

In [19]:
train_generator = DataGenerator(list_IDs = IDs_train,
                                 main_labels = main_labels,
                                 filenames = [x[:-4] for x in files_csv],
                                 to_fit=True,
                                 n_average = 40,
                                 batch_size = 10,
                                 iter_per_epoch = 30,
                                 up_sampling = True,
                                 n_timepoints = 501,
                                 n_channels=30, 
                                 shuffle=True,
                                 ignore_labels=[])

train_generator_noise = DataGenerator(list_IDs = IDs_train,
                                      main_labels = main_labels,
                                      filenames = [x[:-4] for x in files_csv],
                                      to_fit=True,
                                      gaussian_noise=0.01,
                                      n_average = 40,
                                      batch_size = 10,
                                      iter_per_epoch = 30,
                                      up_sampling = True,
                                      n_timepoints = 501,
                                      n_channels=30, 
                                      shuffle=True,
                                      ignore_labels=[])

val_generator = DataGenerator(list_IDs = IDs_val,
                                 main_labels = main_labels,
                                 filenames = [x[:-4] for x in files_csv],
                                 to_fit=True, 
                                 n_average = 40,
                                 batch_size = 10,
                                 iter_per_epoch = 30,
                                 up_sampling = True,
                                 n_timepoints = 501,
                                 n_channels=30, 
                                 shuffle=True,
                                 ignore_labels=[])

test_generator = DataGenerator(list_IDs = IDs_test,
                                 main_labels = main_labels,
                                 filenames = [x[:-4] for x in files_csv],
                                 to_fit=True, 
                                 n_average = 40,
                                 batch_size = 10,
                                 iter_per_epoch = 30,
                                 up_sampling = True,
                                 n_timepoints = 501,
                                 n_channels=30, 
                                 shuffle=True,
                                 ignore_labels=[])

In [20]:
X, y  = train_generator.__getitem__(1)

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Projects\\EEG_explorer\\Data/data_processed_DL/processed_metadata_757-487-17m-jr-mmn36.npy'

In [21]:
test_generator.__getitem__(50)[1]

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Projects\\EEG_explorer\\Data/data_processed_DL/processed_metadata_725-161-29m-jr-mmn36.npy'

In [22]:
X.shape, len(y) # Would expect it to be the batch size, but doesn't when it can't find enough samples with this stimilus

NameError: name 'X' is not defined

In [None]:
print(y[:11])

## Definition and testing of DL models

In [None]:
n_timesteps = 501
n_features = 30 # Due to differences in the number of electrodes between ages, less than 62+2 
n_outputs = 1

input_shape = (n_timesteps, n_features)

In [None]:
# Helper functions

def evaluate_model(model):
    """ Evaluates the model """
    model.evaluate(train_generator)
    model.evaluate(val_generator)
    X_test_eval, y_test_eval = test_generator.get_all_data() # Get data of all subjects in test set once
    model.evaluate(X_test_eval, y_test_eval)    
    
def print_few_predictions(model):
    """ Prints a few predictions, as a sanity check """
    x_test, y_test = test_generator.__getitem__(0)

    print(model.predict(x_test))
    print(y_test)
    
def plot_loss(history):
    """ Plots the MSE, RMSE, and MAE loss for the training and validation data over time """
    
    %matplotlib inline
    
    fig, (ax1, ax2, ax3) = plt.subplots(3, 1, sharex=True, figsize=(12,12), dpi=200)

    ax1.plot(history.history['loss'], label='training data')  
    min_loss = min(history.history['val_loss'])
    val_plot1 = ax1.plot(history.history['val_loss'], label='validation data')
    ax1.axhline(y = min_loss, color = val_plot1[0].get_color(), linestyle = '--') 
    x0,x1 = ax1.get_xlim()
    ax1.text(x1, min_loss, "{:.2f}".format(min_loss), ha='left', va='center')
    ax1.set_title('MSE loss')
    ax1.set_ylabel("MSE")
    ax1.set_xlabel("epochs")
    ax1.legend()

    ax2.plot(history.history['root_mean_squared_error'], label='training data')
    min_loss = min(history.history['val_root_mean_squared_error'])
    val_plot2 = ax2.plot(history.history['val_root_mean_squared_error'], label='validation data')
    ax2.axhline(y = min_loss, color = val_plot2[0].get_color(), linestyle = '--') 
    x0,x1 = ax2.get_xlim()
    ax2.text(x1, min_loss, "{:.2f}".format(min_loss), ha='left', va='center')
    ax2.set_title('RMSE loss')
    ax2.set_ylabel("RMSE")
    ax2.set_xlabel("epochs")
    ax2.legend()
    
    ax3.plot(history.history['mean_absolute_error'], label='training data')    
    min_loss = min(history.history['val_mean_absolute_error'])
    val_plot3 = ax3.plot(history.history['val_mean_absolute_error'], label='validation data')
    ax3.axhline(y = min_loss, color = val_plot3[0].get_color(), linestyle = '--') 
    x0,x1 = ax3.get_xlim()
    ax3.text(x1, min_loss, "{:.2f}".format(min_loss), ha='left', va='center')
    ax3.set_title('MAE loss')
    ax3.set_ylabel("MAE")
    ax3.set_xlabel("epochs")
    ax3.legend()

### Model 1 - BLSTM-LSTM, based on Kaushik et al.

In [None]:
def blstm_lstm_model():
    """ Returns the BLSTM-LSTM model from Kaushik et al. (2019). """
    
    # TODO: This model compresses too much in the last phase, check if possible to improve.
    
    model = keras.Sequential()
    
    # BLSTM layer
    # model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=input_shape))
    model.add(Bidirectional(LSTM(256, return_sequences=True), input_shape=input_shape, merge_mode='ave'))
    model.add(Dropout(.2))
    model.add(BatchNormalization())
    
    # LSTM layer
    model.add(LSTM(128, return_sequences=True))
    model.add(BatchNormalization())

    # LSTM layer
    model.add(LSTM(64, return_sequences=True))
    model.add(BatchNormalization())
    
#     # LSTM layer
#     model.add(LSTM(64))
#     model.add(BatchNormalization())
    
    # Fully connected layer
    model.add(Dense(32, activation='relu'))
    model.add(Dense(n_outputs))
    
    return model 

In [None]:
model = blstm_lstm_model()

optimizer = Adam(learning_rate=0.01, 
                 beta_1=0.9, 
                 beta_2=0.999, 
                 epsilon=1e-07, 
                 amsgrad=False,
                 name='Adam')

model.build((n_timesteps, n_features))
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=[tf.keras.metrics.RootMeanSquaredError(), tf.keras.metrics.MeanAbsoluteError()])
model.summary()

In [None]:
output_filename = 'BLSTM_EEG_classifier01'
output_file = os.path.join(PATH_MODELS, output_filename)

checkpointer = ModelCheckpoint(filepath = output_file + ".hdf5", monitor='val_loss', verbose=1, save_best_only=True)
earlystopper = EarlyStopping(monitor='val_loss', patience=200, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=20, min_lr=0.0001, verbose=1)

In [None]:
epochs = 1500

# fit network
history = model.fit(x=train_generator,
                    validation_data=val_generator,
                    epochs=epochs,
                    callbacks = [checkpointer, earlystopper, reduce_lr])


In [None]:
evaluate_model(model)

In [None]:
print_few_predictions(model)

In [None]:
plot_loss(history)

### Model 2 - CNN

In [None]:
def cnn_model():
    model = keras.Sequential()
    model.add(Conv1D(filters=48, kernel_size=20, input_shape=(n_timesteps,n_features)))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(AveragePooling1D(pool_size=2))

    model.add(Conv1D(filters=64, kernel_size=10)) #, activation='relu'))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(AveragePooling1D(pool_size=2))

    model.add(Conv1D(filters=96, kernel_size=5)) #, activation='relu'))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(AveragePooling1D(pool_size=2))

    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(60, activation='relu'))
    model.add(Dense(n_outputs))
    
    return model

In [None]:
model = cnn_model()

optimizer = Adam(learning_rate=0.01, 
                 beta_1=0.9, 
                 beta_2=0.999, 
                 epsilon=1e-07, 
                 amsgrad=False,
                 name='Adam')

model.build((n_timesteps, n_features))
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=[tf.keras.metrics.RootMeanSquaredError(), tf.keras.metrics.MeanAbsoluteError()])
model.summary()

In [None]:
output_filename = 'CNN_EEG_classifier01'
output_file = os.path.join(PATH_MODELS, output_filename)

checkpointer = ModelCheckpoint(filepath = output_file + ".hdf5", monitor='val_loss', verbose=1, save_best_only=True)
earlystopper = EarlyStopping(monitor='val_loss', patience=200, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=20, min_lr=0.0001, verbose=1)

In [None]:
epochs = 1500

# fit network
history = model.fit(x=train_generator,
                    validation_data=val_generator,
                    epochs=epochs,
                    callbacks = [checkpointer, earlystopper, reduce_lr])


In [None]:
evaluate_model(model)

In [None]:
print_few_predictions(model)

In [None]:
plot_loss(history)

### Model 2a - CNN - Add another dropout, gaussian noise

In [None]:
def cnn_model_2a():
    model = keras.Sequential()
    model.add(Conv1D(filters=48, kernel_size=20, input_shape=(n_timesteps,n_features)))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(AveragePooling1D(pool_size=2))

    model.add(Conv1D(filters=64, kernel_size=10)) #, activation='relu'))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(AveragePooling1D(pool_size=2))

    model.add(Conv1D(filters=96, kernel_size=5)) #, activation='relu'))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(AveragePooling1D(pool_size=2))

    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(60, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(n_outputs))
    
    return model

In [None]:
model = cnn_model_2a()

optimizer = Adam(learning_rate=0.01, 
                 beta_1=0.9, 
                 beta_2=0.999, 
                 epsilon=1e-07, 
                 amsgrad=False,
                 name='Adam')

model.build((n_timesteps, n_features))
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=[tf.keras.metrics.RootMeanSquaredError(), tf.keras.metrics.MeanAbsoluteError()])
model.summary()

In [None]:
output_filename = 'CNN_EEG_classifier01a'
output_file = os.path.join(PATH_MODELS, output_filename)

checkpointer = ModelCheckpoint(filepath = output_file + ".hdf5", monitor='val_loss', verbose=1, save_best_only=True)
earlystopper = EarlyStopping(monitor='val_loss', patience=50, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, min_lr=0.0001, verbose=1)

In [None]:
epochs = 1500

# fit network
history = model.fit(x=train_generator_noise,
                    validation_data=val_generator,
                    epochs=epochs,
                    callbacks = [checkpointer, earlystopper, reduce_lr])


In [None]:
evaluate_model(model)

In [None]:
print_few_predictions(model)

In [None]:
plot_loss(history)

### Model 3 - CNN and RNN (LSTM) combined

In [None]:
def cnn_lstm_model():

    model = keras.Sequential()
    model.add(Conv1D(filters=48, kernel_size=20, input_shape=(n_timesteps,n_features)))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(AveragePooling1D(pool_size=2))

    model.add(Conv1D(filters=64, kernel_size=10)) #, activation='relu'))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(AveragePooling1D(pool_size=2))

    model.add(Conv1D(filters=96, kernel_size=5)) #, activation='relu'))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(AveragePooling1D(pool_size=2))

    model.add(Reshape((-1, 96)))

    model.add(Bidirectional(LSTM(96, return_sequences=True), input_shape=input_shape, merge_mode='ave'))
    model.add(Dropout(.2))
    model.add(BatchNormalization())
    model.add(LSTM(48))
    model.add(BatchNormalization())
    model.add(Dense(32))
    model.add(Dense(n_outputs))

    return model

In [None]:
model = cnn_lstm_model()

optimizer = Adam(learning_rate=0.01, 
                 beta_1=0.9, 
                 beta_2=0.999, 
                 epsilon=1e-07, 
                 amsgrad=False,
                 name='Adam')

model.build((n_timesteps, n_features))
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=[tf.keras.metrics.RootMeanSquaredError(), tf.keras.metrics.MeanAbsoluteError()])
model.summary()

In [None]:
output_filename = 'COMBINED_EEG_classifier01'
output_file = os.path.join(PATH_MODELS, output_filename)

checkpointer = ModelCheckpoint(filepath = output_file + ".hdf5", monitor='val_loss', verbose=1, save_best_only=True)
earlystopper = EarlyStopping(monitor='val_loss', patience=200, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=20, min_lr=0.0001, verbose=1)

In [None]:
epochs = 1500

# fit network
history = model.fit(x=train_generator,
                    validation_data=val_generator,
                    epochs=epochs,
                    callbacks = [checkpointer, earlystopper, reduce_lr])

In [None]:
evaluate_model(model)

In [None]:
print_few_predictions(model)

In [None]:
plot_loss(history)

#### Model 4 - Deep CNN

In [None]:
def cnn_model_2():

    model = keras.Sequential()
    model.add(Conv1D(filters=48, kernel_size=20, input_shape=(n_timesteps,n_features)))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(AveragePooling1D(pool_size=2))

    model.add(Conv1D(filters=64, kernel_size=10)) #, activation='relu'))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(AveragePooling1D(pool_size=2))

    model.add(Conv1D(filters=96, kernel_size=5)) #, activation='relu'))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(AveragePooling1D(pool_size=2))

    model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(AveragePooling1D(pool_size=2))

    model.add(Conv1D(filters=164, kernel_size=2, activation='relu'))
    model.add(BatchNormalization())
    model.add(LeakyReLU())
    model.add(AveragePooling1D(pool_size=2))

    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    # model.add(Dropout(0.2))
    model.add(Dense(60, activation='relu'))
    model.add(Dense(n_outputs))
    
    return model

In [None]:
model = cnn_model_2()

optimizer = Adam(learning_rate=0.01, 
                 beta_1=0.9, 
                 beta_2=0.999, 
                 epsilon=1e-07, 
                 amsgrad=False,
                 name='Adam')

model.build((n_timesteps, n_features))
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=[tf.keras.metrics.RootMeanSquaredError(), tf.keras.metrics.MeanAbsoluteError()])
model.summary()

In [None]:
output_filename = 'CNN_EEG_classifier02'
output_file = os.path.join(PATH_MODELS, output_filename)

checkpointer = ModelCheckpoint(filepath = output_file + ".hdf5", monitor='val_loss', verbose=1, save_best_only=True)
earlystopper = EarlyStopping(monitor='val_loss', patience=200, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=20, min_lr=0.0001, verbose=1)

In [None]:
epochs = 1500

# fit network
history = model.fit(x=train_generator,
                    validation_data=val_generator,
                    epochs=epochs,
                    callbacks = [checkpointer, earlystopper, reduce_lr])


In [None]:
evaluate_model(model)

In [None]:
print_few_predictions(model)

In [None]:
plot_loss(history)

### Model 5 - InceptionTime model

In [None]:
from inception_time import Regressor_INCEPTION

In [None]:
output_filename = 'Inception_classifier01'
output_file = os.path.join(PATH_MODELS, output_filename)

checkpointer = ModelCheckpoint(filepath = output_file + ".hdf5", monitor='val_loss', verbose=1, save_best_only=True)
earlystopper = EarlyStopping(monitor='val_loss', patience=200, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=50, min_lr=0.0001, verbose=1)

In [None]:
model = Regressor_INCEPTION(PATH_MODELS, global_avg_pooling=True, input_shape, 1, verbose=True).model

In [None]:
epochs = 1500

# fit network
history = model.model.fit(x=train_generator,
                          validation_data=val_generator,
                          epochs=epochs,
                          callbacks = [checkpointer, earlystopper, reduce_lr])

In [None]:
evaluate_model(model)

In [None]:
print_few_predictions(model)

In [None]:
plot_loss(history)

### Model 6 - InceptionTime model, no global pooling

In [None]:
from inception_time import Regressor_INCEPTION

In [None]:
output_filename = 'Inception_classifier02'
output_file = os.path.join(PATH_MODELS, output_filename)

checkpointer = ModelCheckpoint(filepath = output_file + ".hdf5", monitor='val_loss', verbose=1, save_best_only=True)
earlystopper = EarlyStopping(monitor='val_loss', patience=200, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=50, min_lr=0.0001, verbose=1)

In [None]:
model = Regressor_INCEPTION(PATH_MODELS, input_shape, 1, verbose=True).model

In [None]:
epochs = 1500

# fit network
history = model.model.fit(x=train_generator,
              validation_data=val_generator,
              epochs=epochs,
              callbacks = [checkpointer, earlystopper, reduce_lr])

In [None]:
evaluate_model(model)

In [None]:
print_few_predictions(model)

In [None]:
plot_loss(history)

### Model 7 - InceptionTime model, no global pooling, gaussian noise

In [None]:
from inception_time import Regressor_INCEPTION

In [None]:
output_filename = 'Inception_classifier03'
output_file = os.path.join(PATH_MODELS, output_filename)

checkpointer = ModelCheckpoint(filepath = output_file + ".hdf5", monitor='val_loss', verbose=1, save_best_only=True)
earlystopper = EarlyStopping(monitor='val_loss', patience=200, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=50, min_lr=0.0001, verbose=1)

In [None]:
model = Regressor_INCEPTION(PATH_MODELS, input_shape, 1, verbose=True).model

In [None]:
epochs = 1500

# fit network
history = model.model.fit(x=train_generator_noise, # Use generator with noise
              validation_data=val_generator,
              epochs=epochs,
              callbacks = [checkpointer, earlystopper, reduce_lr])

In [None]:
evaluate_model(model)

In [None]:
print_few_predictions(model)

In [None]:
plot_loss(history)

### Load best model and explore best model (InceptionTime 3)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="whitegrid")

In [None]:
best_filename = 'Inception_classifier03'
best_file = os.path.join(PATH_MODELS, best_filename)

best_model_path = best_file + '.hdf5'

In [None]:
model = tf.keras.models.load_model(best_model_path)

In [None]:
def get_targets_and_predictions(model, X_test):
    predicted_values = model.predict(X_test)
        
    return np.array(predicted_values).flatten()

In [None]:
X_test, true_values = test_generator.get_all_data()

predicted_values = get_targets_and_predictions(model, X_test)
df = pd.DataFrame(list(zip(true_values, predicted_values)), columns =['true_values', 'predicted_values']) 

In [None]:
evaluate_model(model)

In [None]:
from scipy.stats import pearsonr

# calculate Pearson's correlation
corr, _ = pearsonr(true_values, predicted_values)
print('Pearsons correlation: %.3f' % corr)

In [None]:
from scipy.stats import gaussian_kde

def age_pred_age_scatterplot(df):
    %matplotlib inline
    
    fig, ax = plt.subplots(figsize=(8,8))
    ax.scatter(df['true_values'], df['predicted_values'])

    ax.set_xlim(4, 48)
    ax.set_ylim(4, 48)

    ax.set_xticks(np.arange(5, 48, 2.0))
    ax.set_yticks(np.arange(5, 48, 2.0))

    ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=(1,0.2,0,0.3))
    ax.grid()
    ax.set_axisbelow(True)

    ax.set_xlabel('Chronological age (months)')
    ax.set_ylabel('EEG-based age (months)')
    ax.set_title('Chronological age vs. predicted EEG-based age')

In [None]:
age_pred_age_scatterplot(df)

In [None]:
from scipy import stats

def describe_data(df):
    print("==== True values, descriptive statistics")
    print(stats.describe(df['true_values']))
    print()
    print("==== Predicted values, descriptive statistics")
    print(stats.describe(df['predicted_values']))

In [None]:
describe_data(df)

In [None]:
from scipy import stats

def plot_distribution_all(df):
    %matplotlib inline
    
    fig, ax = plt.subplots(3, 2, figsize=(15,17))
    ax=ax.flatten()

    ax[0].hist(df.loc[df['true_values'] == 17]['predicted_values'], density=True, bins=20, label="Predictions")
    min_y, max_y = ax[0].get_ylim()
    ax[0].vlines(17, min_y, max_y, colors=(0,0,0,0.5), linestyles='dashed', label="True age")
    min_x, max_x = ax[0].get_xlim()
    ax[0].set_xlim(min_x, max_x)
    kde_xs = np.linspace(min_x, max_x, 300)
    kde = stats.gaussian_kde(df.loc[df['true_values'] == 17]['predicted_values'])
    ax[0].plot(kde_xs, kde.pdf(kde_xs), label="PDF")
    ax[0].legend(loc="upper left")
    ax[0].set_xlabel('EEG-based age prediction (months)')
    ax[0].set_ylabel('Frequency')
    ax[0].set_title(f'Frequency histogram, chronological age = 17 months')
    
    ax[1].hist(df.loc[df['true_values'] == 23]['predicted_values'], density=True, bins=20, label="Predictions")
    min_y, max_y = ax[1].get_ylim()
    ax[1].vlines(23, min_y, max_y, colors=(0,0,0,0.5), linestyles='dashed', label="True age")
    min_x, max_x = ax[1].get_xlim()
    ax[1].set_xlim(min_x, max_x)
    kde_xs = np.linspace(min_x, max_x, 300)
    kde = stats.gaussian_kde(df.loc[df['true_values'] == 23]['predicted_values'])
    ax[1].plot(kde_xs, kde.pdf(kde_xs), label="PDF")
    ax[1].legend(loc="upper left")
    ax[1].set_xlabel('EEG-based age prediction (months)')
    ax[1].set_ylabel('Frequency')
    ax[1].set_title(f'Frequency histogram, chronological age = 23 months')
    
    ax[2].hist(df.loc[df['true_values'] == 29]['predicted_values'], density=True, bins=20, label="Predictions")
    min_y, max_y = ax[2].get_ylim()
    ax[2].vlines(29, min_y, max_y, colors=(0,0,0,0.5), linestyles='dashed', label="True age")
    min_x, max_x = ax[2].get_xlim()
    ax[2].set_xlim(min_x, max_x)
    kde_xs = np.linspace(min_x, max_x, 300)
    kde = stats.gaussian_kde(df.loc[df['true_values'] == 29]['predicted_values'])
    ax[2].plot(kde_xs, kde.pdf(kde_xs), label="PDF")
    ax[2].legend(loc="upper left")
    ax[2].set_xlabel('EEG-based age prediction (months)')
    ax[2].set_ylabel('Frequency')
    ax[2].set_title(f'Frequency histogram, chronological age = 29 months')
    
    ax[3].hist(df.loc[df['true_values'] == 35]['predicted_values'], density=True, bins=20, label="Predictions")
    min_y, max_y = ax[3].get_ylim()
    ax[3].vlines(35, min_y, max_y, colors=(0,0,0,0.5), linestyles='dashed', label="True age")
    min_x, max_x = ax[3].get_xlim()
    ax[3].set_xlim(min_x, max_x)
    kde_xs = np.linspace(min_x, max_x, 300)
    kde = stats.gaussian_kde(df.loc[df['true_values'] == 35]['predicted_values'])
    ax[3].plot(kde_xs, kde.pdf(kde_xs), label="PDF")
    ax[3].legend(loc="upper left")
    ax[3].set_xlabel('EEG-based age prediction (months)')
    ax[3].set_ylabel('Frequency')
    ax[3].set_title(f'Frequency histogram, chronological age = 35 months')
    
    ax[4].hist(df.loc[df['true_values'] == 41]['predicted_values'], density=True, bins=20, label="Predictions")
    min_y, max_y = ax[4].get_ylim()
    ax[4].vlines(41, min_y, max_y, colors=(0,0,0,0.5), linestyles='dashed', label="True age")
    min_x, max_x = ax[4].get_xlim()
    ax[4].set_xlim(min_x, max_x)
    kde_xs = np.linspace(min_x, max_x, 300)
    kde = stats.gaussian_kde(df.loc[df['true_values'] == 41]['predicted_values'])
    ax[4].plot(kde_xs, kde.pdf(kde_xs), label="PDF")
    ax[4].legend(loc="upper left")
    ax[4].set_xlabel('EEG-based age prediction (months)')
    ax[4].set_ylabel('Frequency')
    ax[4].set_title(f'Frequency histogram, chronological age = 41 months')
    
    ax[5].remove()

In [None]:
plot_distribution_all(df)

In [None]:
def plot_distributions_combined(df):
    %matplotlib inline
    
    bins = 20
    
    fig, ax = plt.subplots(figsize=(12,6))        
    ax.hist([df.loc[df['true_values'] == 17]['predicted_values'], 
             df.loc[df['true_values'] == 23]['predicted_values'], 
             df.loc[df['true_values'] == 29]['predicted_values'], 
             df.loc[df['true_values'] == 35]['predicted_values'],
             df.loc[df['true_values'] == 41]['predicted_values']], density=True, alpha=0.5, bins=bins, 
            label=["Age = 17", 
                   "Age = 23", 
                   "Age = 29", 
                   "Age = 35",
                   "Age = 41"])

    ax.set_xlabel('EEG-based age prediction (months)')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Frequency histogram for all ages')
    
    ax.legend(loc="upper left")

In [None]:
plot_distributions_combined(df)

In [None]:
fig, ax = plt.subplots(figsize=(14,8))
ax.set_yticks(np.arange(11, 48, 6.0))
sns.boxplot(ax=ax, x="true_values", y="predicted_values", data=df, showmeans=True, 
            meanprops={"marker":"o",
                       "markerfacecolor":"white", 
                       "markeredgecolor":"black",
                       "markersize":"6"})
ax.set_xlabel('Chronological age (months)')
ax.set_ylabel('EEG-based age (months)')
ax.set_title('Chronological age vs. predicted EEG-based age')

In [None]:
fig, ax = plt.subplots(figsize=(14,8))
ax.set_yticks(np.arange(11, 48, 6.0))
sns.swarmplot(ax=ax, x="true_values", y="predicted_values", data=df)
ax.set_xlabel('Chronological age (months)')
ax.set_ylabel('EEG-based age (months)')
ax.set_title('Chronological age vs. predicted EEG-based age')

In [None]:
fig, ax = plt.subplots(figsize=(14,8))
ax.set_yticks(np.arange(11, 48, 6.0))
sns.stripplot(ax=ax, x="true_values", y="predicted_values", data=df)
ax.set_xlabel('Chronological age (months)')
ax.set_ylabel('EEG-based age (months)')
ax.set_title('Chronological age vs. predicted EEG-based age')

In [None]:
fig, ax = plt.subplots(figsize=(14,8))
ax.set_yticks(np.arange(11, 48, 6.0))
sns.violinplot(ax=ax, x="true_values", y="predicted_values", data=df)
ax.set_xlabel('Chronological age (months)')
ax.set_ylabel('EEG-based age (months)')
ax.set_title('Chronological age vs. predicted EEG-based age')

In [None]:
fig, ax = plt.subplots(figsize=(14,8))
ax.set_yticks(np.arange(11, 48, 6.0))
sns.violinplot(ax=ax, x="true_values", y="predicted_values", data=df)
sns.swarmplot(ax=ax, x="true_values", y="predicted_values", data=df, color='k', alpha=0.5)
sns.boxplot(ax=ax, x="true_values", y="predicted_values", data=df, showmeans=True, 
            meanprops={"marker":"o",
                       "markerfacecolor":"white", 
                       "markeredgecolor":"black",
                       "markersize":"6"})
ax.set_xlabel('Chronological age (months)')
ax.set_ylabel('EEG-based age (months)')
ax.set_title('Chronological age vs. predicted EEG-based age')