# Model
The goal of this document is to set up a pipeline containing data preprocessing (feature selection and onwards), model training, and model testing.

## Importing dependencies

In [2]:
DEBUG = False

# `display` and `HTML` are part of the public `IPython.display` API; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Widen the notebook container so wide outputs use 80% of the page width.
display(HTML("<style>.container { width:80% !important; }</style>"))

In [3]:
import h5py

import json

from tensorflow.keras import backend as keras
from keras.utils import to_categorical

import matplotlib.pyplot as plt

import numpy as np

import os

import pandas as pd

import pickle

import random

from sklearn import ensemble, preprocessing, svm
from sklearn.dummy import DummyClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, recall_score 
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, ShuffleSplit
from sklearn.utils import shuffle, class_weight

import sqlite3

import tempfile

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.constraints import max_norm
from tensorflow.keras.layers import Dense, Dropout, Flatten, BatchNormalization, Activation
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, AveragePooling1D
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalMaxPooling2D, AveragePooling2D
from tensorflow.keras.layers import DepthwiseConv2D, AveragePooling2D, SeparableConv2D
from tensorflow.keras.layers import LSTM, GRU, RNN
from tensorflow.keras.losses import MAE
from tensorflow.keras.models import Sequential
from tensorflow.keras import optimizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1, l2

# from tensorflow.compat.v1.keras.backend import set_session as keras_set_session
from tensorflow.python.client import device_lib
# visible_devices = '1' #this is the GPU number, this is GPU0 ‘0’ or GPU1 ‘1’
# memory_fraction = 1.0 #This will allow 20% of the GPU memory to be allocated to your process, pick this number large enough for your script but also not too large so others can still do things.
# os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices
# gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=memory_fraction, allow_growth=True) 
# tf_session = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
# keras_set_session(tf_session)

import random

Using TensorFlow backend.


In [4]:
# `device_lib` is already imported in the dependency cell above, so it is not
# imported a second time here. List every device TensorFlow can see.
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 6833896541482878856,
 name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 733754835540681801
 physical_device_desc: "device: XLA_CPU device",
 name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 12636885107794194104
 physical_device_desc: "device: XLA_GPU device",
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 14648777152
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 3958190333107961514
 physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"]

In [5]:
# `tf.test.is_gpu_available()` is deprecated; ask for the list of physical GPUs
# instead and report whether it is non-empty.
len(tf.config.list_physical_devices('GPU')) > 0

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

## Read in data(base)
Load the SQLite databases (one per corpus: TUEP, TUAB and TUSZ) and fetch all tokens (recordings) together with their patient, session and file metadata.

In [6]:
#
#   CONNECT TO LOCAL DATABASE
#
def create_db_connection(db_file_name):
    """Open a SQLite connection to `db_file_name`.

    Returns the open `sqlite3.Connection`, or `None` when `sqlite3` reports an
    error; in that case the error is printed instead of being raised.
    """
    try:
        return sqlite3.connect(db_file_name)
    except sqlite3.Error as error:
        print(error)
        return None

In [7]:
def load_tokens_dataframe(db_file_name, query):
    """Run `query` against the SQLite file `db_file_name` and return the rows as a DataFrame.

    The connection is closed as soon as the result has been read, so no
    database handle stays open for the lifetime of the kernel.

    Raises:
        RuntimeError: if `create_db_connection` could not open the database.
    """
    conn = create_db_connection(db_file_name)
    if conn is None:
        # create_db_connection prints the sqlite3 error and returns None on failure.
        raise RuntimeError(f"Could not open database: {db_file_name}")
    try:
        return pd.read_sql(query, conn)
    finally:
        conn.close()


#TUEP
query = """SELECT patients.patient_id, sessions.session_id, tokens.token_id, patients.diagnosis, sessions.electrode_setup, tokens.recording_duration, tokens.sampling_freq, tokens.len_of_samples, tokens.file_name, tokens.file_path
           FROM patients 
           
           INNER JOIN sessions ON patients.patient_id == sessions.patient_id 
           INNER JOIN tokens ON sessions.patient_id == tokens.patient_id AND sessions.session_id == tokens.session_id"""
df_TUEP = load_tokens_dataframe('/mnt/disks/data/files/TUEP_files/eeg_recordings_TUEP.db', query)

#TUAB (additionally fetches the corpus' own train/test assignment per patient)
query = """SELECT patients.patient_id, sessions.session_id, tokens.token_id, patients.diagnosis, sessions.electrode_setup, tokens.recording_duration, tokens.sampling_freq, tokens.len_of_samples, tokens.file_name, tokens.file_path
           ,patients.patient_train_or_test FROM patients 
           INNER JOIN sessions ON patients.patient_id == sessions.patient_id 
           INNER JOIN tokens ON sessions.patient_id == tokens.patient_id AND sessions.session_id == tokens.session_id"""
df_TUAB = load_tokens_dataframe('/mnt/disks/data/files/TUAB_files/eeg_recordings_TUAB.db', query)

#TUSZ (diagnosis and number_of_windows are stored per token rather than per patient)
query = """SELECT patients.patient_id, sessions.session_id, tokens.token_id, tokens.number_of_windows, tokens.diagnosis, sessions.electrode_setup, tokens.recording_duration, tokens.sampling_freq, tokens.len_of_samples, tokens.file_name, tokens.file_path
            FROM patients 
           INNER JOIN sessions ON patients.patient_id == sessions.patient_id 
           INNER JOIN tokens ON sessions.patient_id == tokens.patient_id AND sessions.session_id == tokens.session_id"""
df_TUSZ = load_tokens_dataframe('/mnt/disks/data/files/TUSZ_files/eeg_recordings_TUSZ.db', query)

In [8]:
# Per-corpus lookup tables; every table is keyed by the corpus name.

# Token metadata as read from the SQLite databases.
df_dict = {
    "TUEP": df_TUEP,
    "TUAB": df_TUAB,
    "TUSZ": df_TUSZ,
}

# HDF5 file holding the raw windows of each corpus.
h5f_dict = {
    "TUEP": "/mnt/disks/data/files/TUEP_files/raw_data_TUEP.h5",
    "TUAB": "/mnt/disks/data/files/TUAB_files/raw_data_TUAB.h5",
    "TUSZ": "/mnt/disks/data/files/TUSZ_files/raw_data_TUSZ.h5",
}

# Whether '.edf' is appended to a file name to form its HDF5 key.
plus_edf_dict = {
    "TUEP": False,
    "TUAB": True,
    "TUSZ": True,
}

# Diagnosis string that is mapped to label 1.
class1_dict = {
    "TUEP": "epilepsy",
    "TUAB": "abnormal",
    "TUSZ": "seiz",
}

# Diagnosis string that is mapped to label 0 (only consulted for multi-label corpora).
class2_dict = {
    "TUSZ": "bckg",
}

# Whether one recording carries several space-separated diagnoses.
multiple_labels_dict = {
    "TUEP": False,
    "TUAB": False,
    "TUSZ": True,
}

# Model weight file names per corpus and model type.
model_dict = {
    "TUEP": {
        "EEG": "tuep_eeg_train_all.hdf5",
        "TVGG": "tuep_tvgg_train_all.hdf5",
    },
    "TUAB": {
        "EEG": "tuab_eeg_train_all.hdf5",
        "TVGG": "tuab_tvgg_train_all.hdf5",
    },
    "TUSZ": {
        "EEG": "tusz_eeg_train_all.hdf5",
        "TVGG": "tusz_tvgg_train_all.hdf5",
    },
}

In [9]:
# Avoid data leakage: patients that occur in both corpora are removed from the
# corpus that has the most patients, so the two corpora no longer share anyone.
dataset1 = "TUEP"
dataset2 = "TUSZ"

patient_ids1 = np.unique(df_dict[dataset1]["patient_id"])
patient_ids2 = np.unique(df_dict[dataset2]["patient_id"])
intersection = list(set(patient_ids1) & set(patient_ids2))

# Report: number of shared patients, then the patient count of each corpus.
print(len(intersection), len(patient_ids1), len(patient_ids2), sep="\n")

# The corpus with more patients gives up the shared ones.
dataset = dataset1 if len(patient_ids1) > len(patient_ids2) else dataset2

is_shared_patient = df_dict[dataset]["patient_id"].isin(intersection)
df_dict[dataset] = df_dict[dataset][~is_shared_patient]
print(len(np.unique(df_dict[dataset]["patient_id"])))

2
200
25
198


### Overarching function

In [10]:
def window_sum(l):
    """Add up every integer found in an iterable of space-separated count strings.

    Each element of `l` is split on single spaces; empty pieces (from leading,
    trailing or doubled spaces) are ignored. Returns 0 for an empty iterable.
    """
    return sum(
        int(piece)
        for entry in l
        for piece in entry.split(" ")
        if piece != ''
    )

In [11]:
def window_list(l):
    """Parse an iterable of space-separated count strings into lists of ints.

    Returns one list per element of `l`, in order; empty pieces produced by
    splitting on single spaces are dropped.
    """
    return [
        [int(piece) for piece in entry.split(" ") if piece != '']
        for entry in l
    ]

In [12]:
def total_windows(l):
    """Return the sum of all numbers in a list of lists (0 when there are none)."""
    return sum(count for counts in l for count in counts)

In [13]:
#
#   CALCULATE THE CUMULATIVE NUMBER OF TOKENS/SECONDS OF RAW DATA PER PATIENT
#
def get_tokens_cumsum(df, name):
    """Return the running total of tokens over the patients in `df`.

    Patients are visited in the order produced by `np.unique` on the
    `patient_id` column. A patient's token count is

    * the sum of all integers in their `number_of_windows` entries when
      `multiple_labels_dict[name]` is true, or
    * the sum of `floor(recording_duration / 10)` over their recordings
      otherwise (presumably 10-second windows; the unit is not visible here).

    Returns a NumPy array with one cumulative value per patient.
    """
    def tokens_of(patient_id):
        recordings = df[df["patient_id"] == patient_id]
        if multiple_labels_dict[name]:
            return int(window_sum(recordings['number_of_windows']))
        return np.floor(recordings["recording_duration"] / 10).sum()

    # (Optional TO DO: order patients to assure variation in age and sex)
    tokens_per_patient = [tokens_of(patient_id) for patient_id in np.unique(df["patient_id"])]

    return np.cumsum(np.asarray(tokens_per_patient))

In [14]:
#
#   EXTRACT FILE PATHS (VALUES) AND LABELS FROM THE TRAIN/VALIDATION/TEST DATAFRAMES
#
def data_to_values_and_labels(dfs, name):
    """Turn each dataframe in `dfs` into a list of file references and a list of labels.

    For every row the value is the string 'data' followed by the row's
    `file_path` and `file_name`. The label depends on `multiple_labels_dict[name]`:

    * single label: 1 when `diagnosis` equals `class1_dict[name]`, else 0;
    * multiple labels: `diagnosis` is split on spaces and each piece becomes
      1 (`class1_dict[name]`) or 0 (`class2_dict[name]`); any other piece,
      including empty ones, is skipped. The label is then a list.

    Returns:
        (dfs_values, dfs_labels): two lists with one entry per dataframe.
    """
    def label_of(row):
        if not multiple_labels_dict[name]:
            return 1 if row["diagnosis"] == class1_dict[name] else 0

        # TODO (from the original author): make sure this works
        segment_labels = []
        for diagnosis in row["diagnosis"].split(" "):
            if diagnosis == class1_dict[name]:
                segment_labels.append(1)
            elif diagnosis == class2_dict[name]:
                segment_labels.append(0)
        return segment_labels

    dfs_values, dfs_labels = [], []
    for df in dfs:
        values, labels = [], []
        for _, row in df.iterrows():
            file_name = row["file_name"]
            file_path = row["file_path"]
            values.append(f"{'data'}{file_path}{file_name}")
            labels.append(label_of(row))
        dfs_values.append(values)
        dfs_labels.append(labels)

    return dfs_values, dfs_labels

In [15]:
def get_data(dataset):
    """Build the train/validation/test file lists and labels for one corpus.

    Single-label corpora (`multiple_labels_dict[dataset]` is false): recordings
    with `recording_duration` <= 10 are dropped, the two diagnosis groups are
    split patient-wise with `train_val_test_split` (stratification disabled)
    and the per-group splits are concatenated.

    Multi-label corpora: the whole frame is split with `train_val_test_split2`,
    which also returns the down-sampling filters (`class1_filters`).

    Every resulting frame is shuffled row-wise before values and labels are
    extracted with `data_to_values_and_labels`. The shuffles are unseeded.

    Returns:
        list: for single-label corpora
            [train_all_values, train_all_labels, test_values, test_labels,
             train_values, train_labels, val_values, val_labels,
             val_values_stratified, val_labels_stratified];
        for multi-label corpora the last two entries are replaced by the
        single entry `class1_filters`.

    Raises:
        ValueError: if a single-label corpus has no diagnosis pair configured
            (previously this surfaced as an UnboundLocalError).
    """
    multi_label = multiple_labels_dict[dataset]
    df = df_dict[dataset]

    def to_array(items):
        # Per-file label lists of a multi-label corpus can differ in length.
        # NumPy >= 1.24 raises ValueError for such ragged input, so fall back
        # to a 1-D object array holding one list per file (which is what older
        # NumPy versions produced, with a deprecation warning).
        try:
            return np.asarray(items)
        except ValueError:
            ragged = np.empty(len(items), dtype=object)
            for position, item in enumerate(items):
                ragged[position] = item
            return ragged

    def shuffled(frame):
        return frame.sample(frac=1).reset_index(drop=True)

    if multi_label:
        df_train_all, df_test, df_train, df_val, class1_filters = train_val_test_split2(df, dataset)
        df_val_stratified = None
    else:
        # Filter out recordings too short to yield data
        df = df[df["recording_duration"] > 10]

        # Diagnosis strings of the two groups that are split separately.
        # NOTE(review): for TUAB the first group is "normal" although
        # class1_dict["TUAB"] is "abnormal"; this only affects concatenation
        # order because labels are derived later from class1_dict.
        diagnosis_pairs = {
            "TUEP": ("epilepsy", "no_epilepsy"),
            "TUAB": ("normal", "abnormal"),
        }
        if dataset not in diagnosis_pairs:
            raise ValueError(f"No diagnosis groups configured for dataset {dataset!r}")
        diagnosis1, diagnosis2 = diagnosis_pairs[dataset]
        df_class1 = df[df["diagnosis"] == diagnosis1]
        df_class2 = df[df["diagnosis"] == diagnosis2]

        # Split each group patient-wise in train, validation and test sets
        df_class1_train_all, df_class1_test, df_class1_train, df_class1_val, df_class1_val_stratified = train_val_test_split(df_class1, dataset, stratify_ref_df=df_class2, stratify = False)
        df_class2_train_all, df_class2_test, df_class2_train, df_class2_val, df_class2_val_stratified = train_val_test_split(df_class2, dataset, stratify_ref_df=df_class1, stratify = False)

        # Combine the two groups per split
        df_train_all = pd.concat([df_class1_train_all, df_class2_train_all])
        df_train = pd.concat([df_class1_train, df_class2_train])
        df_test = pd.concat([df_class1_test, df_class2_test])
        df_val = pd.concat([df_class1_val, df_class2_val])
        df_val_stratified = pd.concat([df_class1_val_stratified, df_class2_val_stratified])

    # Shuffle rows (order of the calls kept: train_all, train, test, val, val_stratified)
    df_train_all = shuffled(df_train_all)
    df_train = shuffled(df_train)
    df_test = shuffled(df_test)
    df_val = shuffled(df_val)
    if not multi_label:
        df_val_stratified = shuffled(df_val_stratified)

    ##TODO recording time necessary?

    # Extract data values and labels from the individual datasets
    dfs = [df_train_all, df_test, df_train, df_val]
    if not multi_label:
        dfs.append(df_val_stratified)
    dfs_values, dfs_labels = data_to_values_and_labels(dfs, dataset)

    # Interleave values and labels: [values_0, labels_0, values_1, labels_1, ...]
    result = []
    for values, labels in zip(dfs_values, dfs_labels):
        result.append(to_array(values))
        result.append(to_array(labels))

    if multi_label:
        result.append(class1_filters)
    return result
        

In [16]:
#
#   SPLIT DATASET FROM DATABASE IN TRAIN, VALIDATION AND TEST DATASETS BASED ON THE NUMBER OF TOKENS PER PATIENT
#
def train_val_test_split(df, dataset, stratify_ref_df = None, stratify=True):
    """Split `df` patient-wise into train-all / test / train / validation frames.

    Patients are taken in `np.unique` order together with their cumulative
    token counts from `get_tokens_cumsum`. The first patient whose cumulative
    count exceeds 80% of the total is the last "train-all" patient; all later
    patients form the test set. The same rule is applied again inside the
    train-all patients to separate training from validation patients
    (roughly 64% / 16% / 20% of the tokens for train / validation / test).

    When `stratify` is true, the corpus is single-label, a reference frame is
    given and that reference holds fewer tokens than `df`, the train-all,
    train and validation frames are cut down with `stratify_ensemble_split`
    to about the size of the corresponding reference splits.

    Returns:
        (df_train_all, df_test, df_train, df_val, df_val_stratified). Without
        stratification the last two entries are the same frame; with
        stratification `df_val` stays unreduced.
    """
    def split_position(tokens_cumsum):
        # One past the first patient whose running total exceeds 80% of the maximum.
        limit = 0.8 * np.amax(tokens_cumsum)
        return next(index for index, running_total in enumerate(tokens_cumsum) if running_total > limit) + 1

    # Patient ids and their cumulative number of tokens (same ordering)
    patient_ids = np.unique(df["patient_id"])
    patient_tokens_cumsum = get_tokens_cumsum(df, dataset)
    total_tokens = np.amax(patient_tokens_cumsum)

    # Split test patients off from all training patients
    test_split = split_position(patient_tokens_cumsum)
    train_all_patient_ids = patient_ids[:test_split]
    test_patient_ids = patient_ids[test_split:]

    # Split validation patients off from the training patients
    val_split = split_position(patient_tokens_cumsum[:test_split])
    train_patient_ids = train_all_patient_ids[:val_split]
    val_patient_ids = train_all_patient_ids[val_split:]

    # Split dataframe in train, validation and test sets based on the split made above
    df_train_all = df[df["patient_id"].isin(train_all_patient_ids)]
    df_test = df[df["patient_id"].isin(test_patient_ids)]
    df_train = df[df["patient_id"].isin(train_patient_ids)]
    df_val = df[df["patient_id"].isin(val_patient_ids)]

    # Stratify training datasets based on reference dataset
    if stratify and not multiple_labels_dict[dataset] and (stratify_ref_df is not None):
        ref_total_tokens = np.amax(get_tokens_cumsum(stratify_ref_df, dataset))

        # Only the larger of the two frames is reduced
        if ref_total_tokens < total_tokens:
            # Sizes (in tokens) of the corresponding reference splits
            ref_df_train_all, _, ref_df_train, ref_df_val, _ = train_val_test_split(stratify_ref_df, dataset)
            train_all_subset_size = np.amax(get_tokens_cumsum(ref_df_train_all, dataset))
            train_subset_size = np.amax(get_tokens_cumsum(ref_df_train, dataset))
            val_subset_size = np.amax(get_tokens_cumsum(ref_df_val, dataset))

            # Take subsets of the data to match the size of the reference splits
            df_train_all_stratified = stratify_ensemble_split(df_train_all, dataset, train_all_subset_size)
            df_train_stratified = stratify_ensemble_split(df_train, dataset, train_subset_size)
            df_val_stratified = stratify_ensemble_split(df_val, dataset, val_subset_size)

            return df_train_all_stratified, df_test, df_train_stratified, df_val, df_val_stratified

    return df_train_all, df_test, df_train, df_val, df_val

In [17]:
def train_val_test_split2(df, dataset):
    """Patient-wise split for multi-label corpora, plus down-sampling filters.

    The split rule is the one used by `train_val_test_split`: patients are
    taken in `np.unique` order; the first patient whose cumulative token count
    exceeds 80% of the total closes the train-all set, and the same rule
    inside train-all separates training from validation patients.

    For each of the train-all, train and validation frames (in that order)
    `number_class` is asked for the two window counts, and a random sample of
    indices in `range(second_count)` is drawn with as many entries as the
    first count, so the second class can be down-sampled to the size of the
    first. If the second class is already the smaller one, all of its indices
    are kept (previously `random.sample` raised a ValueError in that case).

    Returns:
        (df_train_all, df_test, df_train, df_val, class1_filters) where
        `class1_filters` is a list of three index lists.
    """
    def split_position(tokens_cumsum):
        # One past the first patient whose running total exceeds 80% of the maximum.
        limit = 0.8 * np.amax(tokens_cumsum)
        return next(index for index, running_total in enumerate(tokens_cumsum) if running_total > limit) + 1

    # Patient ids and their cumulative number of tokens (same ordering)
    patient_ids = np.unique(df["patient_id"])
    patient_tokens_cumsum = get_tokens_cumsum(df, dataset)

    # Split test patients off from all training patients
    test_split = split_position(patient_tokens_cumsum)
    train_all_patient_ids = patient_ids[:test_split]
    test_patient_ids = patient_ids[test_split:]

    # Split validation patients off from the training patients
    val_split = split_position(patient_tokens_cumsum[:test_split])
    train_patient_ids = train_all_patient_ids[:val_split]
    val_patient_ids = train_all_patient_ids[val_split:]

    # Split dataframe in train, validation and test sets based on the split made above
    df_train_all = df[df["patient_id"].isin(train_all_patient_ids)]
    df_test = df[df["patient_id"].isin(test_patient_ids)]
    df_train = df[df["patient_id"].isin(train_patient_ids)]
    df_val = df[df["patient_id"].isin(val_patient_ids)]

    # Draw one down-sampling filter per training/validation frame
    class1_filters = []
    for subset in (df_train_all, df_train, df_val):
        # Count how many windows of each class the frame holds
        number_class0, number_class1 = number_class(subset, dataset)

        # Never ask for more indices than exist
        sample_size = min(number_class0, number_class1)
        class1_filters.append(random.sample(range(number_class1), sample_size))

    return df_train_all, df_test, df_train, df_val, class1_filters

In [18]:
def number_class(df, dataset):
    """Count the windows per class in a multi-label dataframe.

    Each row holds a space-separated `diagnosis` string and a space-separated
    `number_of_windows` string; the i-th diagnosis belongs to the i-th window
    count. Empty pieces (from leading, trailing or doubled spaces) are ignored
    in both columns, matching `window_sum` and `window_list`. Previously
    exactly one empty piece was removed and a row without any raised a
    ValueError.

    Returns:
        (number_class0, number_class1): total windows whose diagnosis equals
        `class1_dict[dataset]` and `class2_dict[dataset]` respectively. Note
        the naming: the *first* count belongs to `class1_dict`.
    """
    number_class0 = 0
    number_class1 = 0

    class0 = class1_dict[dataset]
    class1 = class2_dict[dataset]
    for _, row in df.iterrows():
        diagnoses = [piece for piece in row["diagnosis"].split(" ") if piece != '']
        window_counts = [piece for piece in row["number_of_windows"].split(" ") if piece != '']

        for position, diagnosis in enumerate(diagnoses):
            windows = window_counts[position]
            if diagnosis == class0:
                number_class0 += int(windows)
            elif diagnosis == class1:
                number_class1 += int(windows)
    return number_class0, number_class1

In [19]:
#
#   TAKE A PATIENT-WISE SUBSET OF THE DATA OF APPROXIMATELY THE REQUESTED SIZE (IN TOKENS)
#
def stratify_ensemble_split(df, dataset, split_size):
    """Return the rows of the first patients of `df` that together hold about `split_size` tokens.

    Patients are taken in `np.unique` order up to and including the first one
    whose cumulative token count (from `get_tokens_cumsum`) exceeds
    `split_size`. If the whole frame holds no more than `split_size` tokens,
    all patients are kept (previously this raised StopIteration).

    The rows of the returned frame are in random order.

    NOTE(review): the row shuffle does not influence which patients are
    selected, because both `np.unique` and `get_tokens_cumsum` order patients
    by id. Confirm whether a random patient selection was intended.
    """
    # Shuffle rows randomly
    df = df.sample(frac=1).reset_index(drop=True)

    # Patient ids and their cumulative number of tokens (same ordering)
    patient_ids = np.unique(df["patient_id"])
    patient_tokens_cumsum = get_tokens_cumsum(df, dataset)

    # Index of the first patient that pushes the running total past split_size;
    # fall back to the last patient when the total never gets that large.
    crossing_index = next(
        (index for index, running_total in enumerate(patient_tokens_cumsum) if running_total > split_size),
        len(patient_tokens_cumsum) - 1,
    )
    stratified_ids = patient_ids[:crossing_index + 1]

    return df[df["patient_id"].isin(stratified_ids)]

In [20]:
class EEG_Data_Generator_Heterogeneous(keras.utils.Sequence):
    """Keras `Sequence` that serves label-stratified batches of windows read from an HDF5 file.

    On construction every window of every file is enumerated, and the full
    list of windows is cut into batches with one `StratifiedShuffleSplit`
    draw, so each batch mixes windows from many files ("heterogeneous") while
    following the overall label distribution. `__getitem__` reads the windows
    of one batch from the HDF5 file given by `h5f_dict[dataset]`.

    Args:
        file_paths: file references; only the part after the last '/' is used.
        file_labels: one label per file, or (multi-label corpora) one list of
            labels per file with one label per segment of that file.
        batch_size: number of windows per regular batch.
        dataset: corpus name, used as key into the module-level lookup tables.
        conv: if true, the first two axes of each window are swapped; otherwise
            a leading axis of length 1 is added.
        class_filter: for multi-label corpora, the 0-based running indices of
            the label-0 windows to keep; all other label-0 windows are dropped.
    """

    def __init__(self, file_paths, file_labels, batch_size, dataset, conv=False, class_filter = None):

        # Store file paths and corresponding labels
        self.file_paths = file_paths
        self.file_names = [file_path.split("/")[-1] for file_path in file_paths]
        self.file_labels = file_labels
        self.dataset = dataset
        df = df_dict[dataset]

        # Store class filter and class counters
        if class_filter is None:
            # NOTE(review): this default holds one index per *file*, so for a
            # multi-label corpus only the first len(file_labels) label-0
            # windows are kept when no filter is passed. Confirm this is
            # intended (e.g. for the test set) rather than "keep everything".
            class_filter = list(range(len(file_labels)))
        self.class_filter = class_filter
        self.class0_counter = 0
        self.class1_counter = 0

        # Store batch size and reshape mode
        self.batch_size = batch_size
        self.conv = conv

        # Metadata rows of the requested files, in the order of file_paths
        df_gen = df[df["file_name"].isin(self.file_names)]
        df_gen = df_gen.set_index("file_name")
        df_gen = df_gen.loc[self.file_names]

        # Enumerate every window as an index tuple plus its label
        if multiple_labels_dict[self.dataset]:
            self.number_of_windows = window_list(df_gen["number_of_windows"].tolist())
            self.total_windows = total_windows(self.number_of_windows)
            file_and_sample_index, file_and_sample_label = self._enumerate_segmented_windows()
        else:
            self.windows = np.floor((df_gen["recording_duration"] - 1) / 10).astype(int)
            self.windows_cumsum = self.windows.cumsum()
            self.total_windows = self.windows_cumsum.iloc[-1]
            file_and_sample_index, file_and_sample_label = self._enumerate_plain_windows()

        # Make stratified shuffle split of data: `splits` batches of exactly
        # batch_size windows plus one remainder batch (the split's "test" part).
        test_size = len(file_and_sample_index) % batch_size
        splits = len(file_and_sample_index) // batch_size

        # StratifiedShuffleSplit rejects a remainder of 0 and a remainder
        # smaller than the number of classes. In those cases move one regular
        # batch into the remainder so the split stays valid.
        n_classes = max(len(set(file_and_sample_label)), 1)
        if test_size < n_classes and splits > 1:
            test_size += batch_size
            splits -= 1

        index_array = np.asarray(file_and_sample_index)
        label_array = np.asarray(file_and_sample_label)
        sss = StratifiedShuffleSplit(splits, test_size=test_size)
        train_indices, remainder_indices = next(sss.split(index_array, label_array))

        #       Split training part in subsets of size batch_size and append the remainder
        split_indices = np.split(train_indices, splits)
        split_indices.append(remainder_indices)

        self.split_files_and_samples = [index_array[split_index] for split_index in split_indices]
        self.split_labels = [label_array[split_index] for split_index in split_indices]

    def _enumerate_segmented_windows(self):
        """List (file_index, segment_index, sample_index) and label for every kept window.

        Label-0 windows are numbered 0, 1, 2, ... in enumeration order and kept
        only when that number is in `self.class_filter`. Updates the class
        counters for the windows that are kept.
        """
        # A set makes the per-window membership test O(1).
        kept_class0 = set(self.class_filter)
        class0_seen = 0

        file_and_sample_index = []
        file_and_sample_label = []
        for file_index in range(len(self.file_paths)):
            segment_labels = self.file_labels[file_index]
            for segment_index in range(len(segment_labels)):
                label = segment_labels[segment_index]
                for sample_index in range(self.number_of_windows[file_index][segment_index]):
                    if label == 0:
                        # Test with the 0-based running number *before*
                        # advancing it, so it lines up with filter indices
                        # drawn from range(number of label-0 windows).
                        keep = class0_seen in kept_class0
                        class0_seen += 1
                        if not keep:
                            continue

                    file_and_sample_index.append((file_index, segment_index, sample_index))
                    file_and_sample_label.append(label)

                    if label == 0:
                        self.class0_counter += 1
                    elif label == 1:
                        self.class1_counter += 1

        return file_and_sample_index, file_and_sample_label

    def _enumerate_plain_windows(self):
        """List (file_index, sample_index) and the file's label for every window."""
        file_and_sample_index = []
        file_and_sample_label = []
        for file_index in range(len(self.file_paths)):
            file_label = self.file_labels[file_index]
            # self.windows is indexed by file name, so address it by position.
            for sample_index in range(self.windows.iloc[file_index]):
                file_and_sample_index.append((file_index, sample_index))
                file_and_sample_label.append(file_label)
        return file_and_sample_index, file_and_sample_label

    def __len__(self):
        """Number of batches per epoch: ceil(windows / batch_size) - 1, as a Python int."""
        if multiple_labels_dict[self.dataset]:
            window_count = self.class0_counter + self.class1_counter
        else:
            window_count = self.total_windows
        return int(np.ceil(window_count / float(self.batch_size))) - 1

    def _read_window(self, file_data_h5, file_path, sample_index, index=None):
        """Read one window from the open HDF5 file and bring it into model input layout."""
        # HDF5 key: file name without its last four characters, optionally
        # followed by '.edf' and, for segmented files, '_<segment index>'.
        file_name = file_path.split("/")[-1][:-4]
        if plus_edf_dict[self.dataset]:
            file_name = file_name + '.edf'

        if index is None:
            file_values = file_data_h5[file_name][sample_index]
        else:
            file_values = file_data_h5[file_name+"_"+str(index)][sample_index]

        if self.conv:
            return np.swapaxes(file_values, 0, 1)
        # The reshape below requires a 2-D window.
        return file_values.reshape((1, file_values.shape[0], file_values.shape[1]))

    def __getitem__(self, idx):
        """Return `(batch_values, batch_labels)` for batch `idx`, jointly permuted.

        The permutation that was applied is kept in `self.p`.
        """
        #       Select subset of data
        batch_files_and_samples = self.split_files_and_samples[idx]
        batch_labels = self.split_labels[idx]
        segmented = multiple_labels_dict[self.dataset]

        #       Read the values of every window in the batch; the context
        #       manager closes the raw data file even if a read fails.
        batch_values = []
        with h5py.File(h5f_dict[self.dataset], 'r') as h5f:
            for entry in batch_files_and_samples:
                if segmented:
                    file_path_index, index, sample_index = entry
                else:
                    file_path_index, sample_index = entry
                    index = None
                file_path = self.file_paths[file_path_index]
                batch_values.append(self._read_window(h5f, file_path, sample_index, index))

        self.p = np.random.permutation(len(batch_values))
        batch_values, batch_labels = np.asarray(batch_values)[self.p], np.asarray(batch_labels)[self.p]
        return batch_values, batch_labels

## Model performance functions

In [21]:
#
#   HELP VISUALIZE LEARNING PROGRESSION
#
def plot_history(history):
    """Plot training and validation curves of a Keras `History` in a 4x2 grid.

    Panels, in order: accuracy, loss, recall, negative recall, precision,
    negative precision, F1 and negative F1. Train curves are green, validation
    curves red. The accuracy panel has its y-axis fixed to [0, 1].

    The accuracy curves are read from the first of 'acc', 'accuracy_m' or
    'accuracy' for which both the train and the 'val_'-prefixed entry exist.
    (The earlier try/bare-except could draw the train curve twice when only
    one of the two entries of the first pair was present.)

    Raises:
        KeyError: if no accuracy pair or any of the other metrics is missing
            from `history.history`.
    """
    recorded = history.history

    accuracy_key = next(
        (key for key in ('acc', 'accuracy_m', 'accuracy')
         if key in recorded and 'val_' + key in recorded),
        None,
    )
    if accuracy_key is None:
        raise KeyError("history contains no train/validation accuracy pair")

    # (y-axis label, history key, name used in the legend)
    panels = [
        ('Accuracy', accuracy_key, 'accuracy'),
        ('Loss minimised by model', 'loss', 'loss'),
        ('Recall', 'recall_m', 'recall'),
        ('Negative recall', 'neg_recall_m', 'neg. recall'),
        ('Precision', 'precision_m', 'precision'),
        ('Negative precision', 'neg_precision_m', 'neg. precision'),
        ('F1', 'f1_m', 'F1'),
        ('Negative F1', 'neg_f1_m', 'neg. F1'),
    ]

    plt.figure(figsize = (12,16))
    for position, (ylabel, key, legend_name) in enumerate(panels, start=1):
        plt.subplot(4, 2, position)
        plt.xlabel('Epoch')
        plt.ylabel(ylabel)
        plt.plot(history.epoch, np.array(recorded[key]), 'g-',
                 label='Train ' + legend_name)
        plt.plot(history.epoch, np.array(recorded['val_' + key]), 'r-',
                 label='Validation ' + legend_name)
        if position == 1:
            # Accuracy is a fraction; keep the full range visible.
            plt.ylim([0.0,1.0])
        plt.legend()
    
    

In [22]:
#
#   CALCULATE RECALL, PRECISION, AND F1-SCORE PERFORMANCE MEASURES
#
#w = 3.2552225249772935
def w_binary_crossentropy(y_true, y_pred):
    """Class-weighted binary cross-entropy loss.

    Every element's cross-entropy is multiplied by
    ``y_true * class1_weight + (1 - y_true) * class0_weight`` and the weighted
    values are averaged into a single scalar.

    NOTE(review): ``class1_weight`` and ``class0_weight`` are read from the
    notebook's global namespace; they are not defined in this cell and must be
    set before the loss is evaluated.
    """
    K = keras.backend
    element_weights = y_true * class1_weight + (1. - y_true) * class0_weight
    element_losses = K.binary_crossentropy(y_true, y_pred)
    return K.mean(element_losses * element_weights)


def tp_m(y_true, y_pred):
    """True-positive count: sum of ``round(clip(y_true * y_pred, 0, 1))``.

    Assumes ``y_true`` holds 0/1 labels and ``y_pred`` values in [0, 1]
    -- TODO confirm against the data generator / model output.
    """
    K = keras.backend
    clipped = K.clip(y_true * y_pred, 0, 1)
    return K.sum(K.round(clipped))

def fp_m(y_true, y_pred):
    """False-positive count: sum of ``round(clip((1 - y_true) * y_pred, 0, 1))``.

    Assumes ``y_true`` holds 0/1 labels and ``y_pred`` values in [0, 1]
    -- TODO confirm against the data generator / model output.
    """
    K = keras.backend
    clipped = K.clip((1-y_true) * y_pred, 0, 1)
    return K.sum(K.round(clipped))

def fn_m(y_true, y_pred):
    """False-negative count: sum of ``round(clip(y_true * (1 - y_pred), 0, 1))``.

    Assumes ``y_true`` holds 0/1 labels and ``y_pred`` values in [0, 1]
    -- TODO confirm against the data generator / model output.
    """
    K = keras.backend
    clipped = K.clip(y_true * (1-y_pred), 0, 1)
    return K.sum(K.round(clipped))

def tn_m(y_true, y_pred):
    """True-negative count: sum of ``round(clip((1 - y_true) * (1 - y_pred), 0, 1))``.

    Assumes ``y_true`` holds 0/1 labels and ``y_pred`` values in [0, 1]
    -- TODO confirm against the data generator / model output.
    """
    K = keras.backend
    clipped = K.clip((1-y_true) * (1-y_pred), 0, 1)
    return K.sum(K.round(clipped))

def accuracy_m(y_true, y_pred):
    """Accuracy: ``(TP + TN) / (TP + FP + TN + FN + epsilon)``.

    The Keras epsilon in the denominator keeps the ratio finite when all
    four counts are zero.
    """
    tp = tp_m(y_true, y_pred)
    fp = fp_m(y_true, y_pred)
    tn = tn_m(y_true, y_pred)
    fn = fn_m(y_true, y_pred)
    n_counted = tp + fp + tn + fn + keras.backend.epsilon()
    return (tp + tn) / n_counted

def pos_true_m(y_true, y_pred):
    """Number of actual positives: ``TP + FN``."""
    true_pos = tp_m(y_true, y_pred)
    false_neg = fn_m(y_true, y_pred)
    return true_pos + false_neg

def pos_pred_m(y_true, y_pred):
    """Number of positive predictions: ``TP + FP``."""
    true_pos = tp_m(y_true, y_pred)
    false_pos = fp_m(y_true, y_pred)
    return true_pos + false_pos

def neg_true_m(y_true, y_pred):
    """Number of actual negatives: ``TN + FP``."""
    true_neg = tn_m(y_true, y_pred)
    false_pos = fp_m(y_true, y_pred)
    return true_neg + false_pos

def neg_pred_m(y_true, y_pred):
    """Number of negative predictions: ``TN + FN``."""
    true_neg = tn_m(y_true, y_pred)
    false_neg = fn_m(y_true, y_pred)
    return true_neg + false_neg

# Sensitivity
def recall_m(y_true, y_pred):
    """Recall (sensitivity): ``TP / (TP + FN + epsilon)``."""
    tp = tp_m(y_true, y_pred)
    fn = fn_m(y_true, y_pred)
    return tp / (tp + fn + keras.backend.epsilon())

# Specificity
def neg_recall_m(y_true, y_pred):
    """Recall of the negative class (specificity): ``TN / (TN + FP + epsilon)``."""
    tn = tn_m(y_true, y_pred)
    fp = fp_m(y_true, y_pred)
    return tn / (tn + fp + keras.backend.epsilon())

def precision_m(y_true, y_pred):
    """Precision: ``TP / (TP + FP + epsilon)``."""
    tp = tp_m(y_true, y_pred)
    fp = fp_m(y_true, y_pred)
    return tp / (tp + fp + keras.backend.epsilon())

# Negative predictive value
def neg_precision_m(y_true, y_pred):
    """Precision of the negative class (negative predictive value): ``TN / (TN + FN + epsilon)``."""
    tn = tn_m(y_true, y_pred)
    fn = fn_m(y_true, y_pred)
    return tn / (tn + fn + keras.backend.epsilon())

def f1_m(y_true, y_pred):
    """F1 score of the positive class.

    Computed as ``2 * (P * R) / (P + R + epsilon)`` with ``P = precision_m``
    and ``R = recall_m``.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p*r)/(p+r+keras.backend.epsilon())
    return 2*(harmonic_half)

def neg_f1_m(y_true, y_pred):
    """F1 score of the negative class.

    Computed as ``2 * (P * R) / (P + R + epsilon)`` with
    ``P = neg_precision_m`` and ``R = neg_recall_m``.
    """
    p = neg_precision_m(y_true, y_pred)
    r = neg_recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + keras.backend.epsilon())
    return 2 * (harmonic_half)



In [23]:
def perf_measure(y_actual, y_hat):
    """Print and return binary-classification performance measures.

    Inputs may be 1-D sequences of class ids or two-column arrays; a
    two-column input is reduced row by row to a class id with ``argmax``.
    Only values exactly equal to 0 or 1 enter the confusion matrix, so
    ``y_hat`` is presumably expected to hold hard (thresholded) predictions
    -- TODO confirm against callers.

    Parameters
    ----------
    y_actual : array-like
        Ground-truth labels.
    y_hat : array-like
        Predicted labels.

    Returns
    -------
    tuple
        ``(accuracy, recall, neg_recall, precision, neg_precision, f1,
        neg_f1, aupr, neg_aupr)``. A ratio whose denominator is zero (e.g.
        recall when there are no positive samples) is reported as 0 instead
        of raising ``ZeroDivisionError``.
    """
    y_actual = np.asarray(y_actual)
    y_hat = np.asarray(y_hat)

    def _ratio(numerator, denominator, note=None):
        """Return numerator / denominator, or 0 (printing ``note``) when the denominator is 0."""
        if denominator == 0:
            if note is not None:
                print(note)
            return 0
        return numerator / denominator

    # The array layout is the same for every sample, so decide once whether
    # rows have to be reduced with argmax.
    actual_two_col = y_actual.ndim == 2 and y_actual.shape[1] == 2
    pred_two_col = y_hat.ndim == 2 and y_hat.shape[1] == 2

    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in range(len(y_hat)):
        y_actual_ = np.argmax(y_actual[i]) if actual_two_col else y_actual[i]
        y_hat_ = np.argmax(y_hat[i]) if pred_two_col else y_hat[i]

        if y_actual_==y_hat_==1:
            TP += 1
        if y_hat_==1 and y_actual_!=y_hat_:
            FP += 1
        if y_actual_==y_hat_==0:
            TN += 1
        if y_hat_==0 and y_actual_!=y_hat_:
            FN += 1

    print(f"tp({TP}), fp({FP}), tn({TN}), fn({FN})")

    # Every ratio is guarded: previously only the precision/F1 ratios were,
    # so a split without positive (or negative) samples crashed on recall.
    accuracy = _ratio(TP + TN, TN + TP + FP + FN)
    recall = _ratio(TP, FN + TP, "No positive samples...")
    neg_recall = _ratio(TN, FP + TN, "No negative samples...")
    precision = _ratio(TP, FP + TP, "No positive predictions...")
    neg_precision = _ratio(TN, FN + TN, "No negative predictions...")
    f1 = _ratio(2 * (recall * precision), recall + precision)
    neg_f1 = _ratio(2 * (neg_recall * neg_precision), neg_recall + neg_precision)

    # Area under the precision-recall curve for the positive class and, with
    # labels and predictions flipped, for the negative class.
    aupr = average_precision_score(y_actual, y_hat)
    neg_aupr = average_precision_score(1 - y_actual, 1 - y_hat)

    print(f"acc={accuracy}, recall={recall}, neg_recall={neg_recall}, precision={precision}, neg_precision={neg_precision}, f1={f1}, neg_f1={neg_f1}, aupr={aupr}, neg_aupr={neg_aupr}")

    return accuracy, recall, neg_recall, precision, neg_precision, f1, neg_f1, aupr, neg_aupr

## Model definition, training and analysis

### T-VGG

#### TUEP TUAB

In [35]:
def get_tvgg(learning_rate=0.001, dropouts=[0,0,0,0,0.5], max_norms=[None,None,None,None], l2s=[0,0,0,0,0]):
    """Build and compile the Tiny-VGG (T-VGG) 1-D CNN binary classifier.

    The network takes inputs of shape (2560, 19): a LayerNormalization,
    three blocks of two Conv1D -> LayerNormalization -> ReLU units each
    followed by max pooling, then Flatten -> Dense(128) -> LayerNormalization
    -> ReLU -> Dense(1) -> sigmoid. It is compiled with binary cross-entropy,
    Adam and the custom confusion-matrix metrics defined in this notebook.

    Parameters
    ----------
    learning_rate : float
        Learning rate passed to the Adam optimizer.
    dropouts : list of float
        Dropout rates; only ``dropouts[0]`` (after block 0) and
        ``dropouts[2]`` (after block 2) are used.
    max_norms, l2s : list
        Converted to ``max_norm`` / ``l2`` objects but not attached to any
        layer in this function.

    Returns
    -------
    tuple
        ``(model, batch_size, epochs)`` with ``batch_size = 256`` and
        ``epochs = 10``.
    """
    # Constraint / regulariser objects are created from the arguments, but no
    # layer below receives them.
    max_norms = [None if mn is None else max_norm(mn) for mn in max_norms]
    l2s = [None if l2_ is None else l2(l2_) for l2_ in l2s]

    sample_shape = (2560, 19)
    model = Sequential()

    def add_conv_unit(n_filters, **conv_kwargs):
        # One Conv1D -> LayerNormalization -> ReLU unit.
        model.add(Conv1D(filters=n_filters, kernel_size=3, strides=1, padding="same", **conv_kwargs))
        model.add(keras.layers.LayerNormalization(axis = 1))
        model.add(Activation('relu'))

    def add_pooling():
        model.add(MaxPooling1D(pool_size=4, strides=4))

    # Input normalisation
    model.add(keras.layers.LayerNormalization(input_shape=sample_shape, axis = 1))

    #    Block 0
    add_conv_unit(16, input_shape=sample_shape)
    add_conv_unit(16)
    add_pooling()
    model.add(Dropout(dropouts[0]))

    #    Block 1 (no dropout layer)
    add_conv_unit(32)
    add_conv_unit(32)
    add_pooling()

    #    Block 2
    add_conv_unit(32)
    add_conv_unit(32)
    add_pooling()
    model.add(Dropout(dropouts[2]))

    #    Classification block
    model.add(Flatten())
    model.add(Dense(128))
    model.add(keras.layers.LayerNormalization(axis = 1))
    model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    # Set model layers to trainable for consistency reasons
    for tvgg_layer in model.layers:
        tvgg_layer.trainable = True

    tracked_metrics = [accuracy_m, recall_m, neg_recall_m, precision_m, neg_precision_m,
                       f1_m, neg_f1_m, pos_pred_m, neg_pred_m]
    model.compile(loss= 'binary_crossentropy',
                  optimizer=keras.optimizers.Adam(learning_rate),
                  metrics=tracked_metrics)

    # Default training hyper-parameters handed back alongside the model
    batch_size = 256
    epochs = 10
    return model, batch_size, epochs

In [1]:
#source model
# Build a T-VGG and load the baseline weights of the source dataset (TUEP);
# `target_model` names the dataset used by the data-loading cell that follows.
# NOTE(review): relies on `get_tvgg` and `model_dict` being defined by earlier
# cells -- the recorded output below is a NameError, i.e. this cell was last
# run before the definition cell.
basepath = "/home/jupyter/time_series_transfer_learning/transfer_learning/0-baseline/model_weights/"
source_model = "TUEP"
target_model = "TUAB"
my_model, batch_size, epochs = get_tvgg()
my_model.load_weights(basepath + model_dict[source_model]["TVGG"])

#We don't freeze layers

NameError: name 'get_tvgg' is not defined

In [37]:
# Print the layer-by-layer output shapes and parameter counts of `my_model`.
my_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_normalization_8 (Layer (None, 2560, 19)          5120      
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 2560, 16)          928       
_________________________________________________________________
layer_normalization_9 (Layer (None, 2560, 16)          5120      
_________________________________________________________________
activation_8 (Activation)    (None, 2560, 16)          0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 2560, 16)          784       
_________________________________________________________________
layer_normalization_10 (Laye (None, 2560, 16)          5120      
_________________________________________________________________
activation_9 (Activation)    (None, 2560, 16)         

In [40]:
# Load the target dataset and build the training generator for fine-tuning.
# NOTE(review): clear_session() runs after `my_model` was built and its source
# weights loaded in an earlier cell -- confirm the model stays usable
# afterwards on the TensorFlow version in use.
keras.backend.clear_session()
# NOTE(review): set_learning_phase(1) pins Keras to training mode globally,
# which presumably also affects the later evaluate/predict calls -- confirm
# this is intended.
keras.backend.set_learning_phase(1)
dataset = target_model

# get_tvgg() is called only for its batch_size / epochs defaults. The model it
# returns is discarded (`_`) in BOTH branches so that `my_model`, which holds
# the pretrained source weights, is the model that gets fine-tuned. Rebinding
# `my_model` here would silently replace it with a randomly initialised net.
if not multiple_labels_dict[dataset]:
    train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels, val_values_stratified, val_labels_stratified = get_data(dataset)

    _, batch_size, epochs = get_tvgg()
    train_generator = EEG_Data_Generator_Heterogeneous(train_all_values, train_all_labels, batch_size, dataset, conv = True)
    #val_generator = EEG_Data_Generator_Heterogeneous(val_values, val_labels, batch_size, dataset, conv = True)
else:
    train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels, class1_filters = get_data(dataset)
    train_all_filter = class1_filters[0]
    train_filter = class1_filters[1]
    val_filter = class1_filters[2]

    _, batch_size, epochs = get_tvgg()
    train_generator = EEG_Data_Generator_Heterogeneous(train_all_values, train_all_labels, batch_size, dataset, conv = True, class_filter = train_all_filter)
    #val_generator = EEG_Data_Generator_Heterogeneous(val_values, val_labels, batch_size, dataset, conv = True, class_filter = val_filter)

In [41]:
# Fine-tune `my_model` on the target training generator, saving the weights
# after every epoch and logging the per-epoch metrics to CSV.
epochs = 10
checkpoint_path = '/home/jupyter/time_series_transfer_learning/transfer_learning/1-transfer-learning/model_weights/TUEP_TUAB_tvgg_train_all.hdf5'
# NOTE(review): no validation data is passed to fit(), so 'val_f1_m' is never
# logged; because save_best_only is not set, the weights are written every
# epoch and the last epoch is what remains on disk.
cp_callback = keras.callbacks.ModelCheckpoint(checkpoint_path, 
                                                 monitor='val_f1_m',
                                                 save_weights_only=True,
                                                 verbose=1)
log_callback = keras.callbacks.CSVLogger(filename =  '/home/jupyter/time_series_transfer_learning/transfer_learning/1-transfer-learning/logbooks/logbook_TUEP_TUAB_tvgg_train_all.csv', separator=",", append=False)
active_callbacks = [cp_callback, log_callback]


# Model.fit accepts the generator directly; fit_generator is a deprecated
# alias that newer Keras releases no longer provide.
my_model_history = my_model.fit(x=train_generator,
                                steps_per_epoch=len(train_generator),
                                epochs=epochs,
                                shuffle=True,
                                #validation_data=val_generator,
                                #validation_steps=len(val_generator),
                                callbacks = active_callbacks)

Epoch 1/10
Epoch 00001: saving model to /home/jupyter/time_series_transfer_learning/transfer_learning/1-transfer-learning/model_weights/tusz_tuep_tvgg_train_all.hdf5
Epoch 2/10
Epoch 00002: saving model to /home/jupyter/time_series_transfer_learning/transfer_learning/1-transfer-learning/model_weights/tusz_tuep_tvgg_train_all.hdf5
Epoch 3/10
Epoch 00003: saving model to /home/jupyter/time_series_transfer_learning/transfer_learning/1-transfer-learning/model_weights/tusz_tuep_tvgg_train_all.hdf5
Epoch 4/10
Epoch 00004: saving model to /home/jupyter/time_series_transfer_learning/transfer_learning/1-transfer-learning/model_weights/tusz_tuep_tvgg_train_all.hdf5
Epoch 5/10
Epoch 00005: saving model to /home/jupyter/time_series_transfer_learning/transfer_learning/1-transfer-learning/model_weights/tusz_tuep_tvgg_train_all.hdf5
Epoch 6/10
Epoch 00006: saving model to /home/jupyter/time_series_transfer_learning/transfer_learning/1-transfer-learning/model_weights/tusz_tuep_tvgg_train_all.hdf5
Epoc

In [42]:
# Rebuild a fresh T-VGG, restore the weights stored at `checkpoint_path` (the
# file written by the checkpoint callback during training) and wrap the test
# split in a generator for evaluation.
my_model, batch_size, epochs = get_tvgg()
my_model.load_weights(checkpoint_path)
test_generator = EEG_Data_Generator_Heterogeneous(test_values, test_labels, batch_size, dataset, conv = True)

In [2]:
# Evaluate the model on the test generator: compiled metrics via evaluate(),
# raw scores via predict() for the area under the precision-recall curve.
test_dict = my_model.evaluate(test_generator, steps=len(test_generator), return_dict = True)
# Model.predict accepts the generator directly; predict_generator is a
# deprecated alias that newer Keras releases no longer provide.
test_preds = my_model.predict(test_generator, steps=len(test_generator))
# Labels are re-read batch by batch from the generator.
# NOTE(review): assumes indexing the generator yields batches in the same
# order predict() consumed them (no reshuffling in between) -- confirm.
test_labels_ = np.concatenate([test_generator[i][1] for i in range(len(test_generator))])

print("accuracy", test_dict['accuracy_m'])
print("aupr", average_precision_score(test_labels_, test_preds, average="micro"))
print("recall", test_dict['recall_m'], "precision", test_dict['precision_m'], "\n")

NameError: name 'my_model' is not defined

#### TUAB TUEP

In [None]:
#source model
# Build a T-VGG and load the baseline weights of the source dataset (TUAB);
# `target_model` names the dataset used by the data-loading cell that follows.
# Requires `get_tvgg` and `model_dict` from earlier cells.
basepath = "/home/jupyter/time_series_transfer_learning/transfer_learning/0-baseline/model_weights/"
source_model = "TUAB"
target_model = "TUEP"
my_model, batch_size, epochs = get_tvgg()
my_model.load_weights(basepath + model_dict[source_model]["TVGG"])

#We don't freeze layers

In [None]:
# Load the target dataset and build the training generator for fine-tuning.
# NOTE(review): clear_session() runs after `my_model` was built and its source
# weights loaded in an earlier cell -- confirm the model stays usable
# afterwards on the TensorFlow version in use.
keras.backend.clear_session()
# NOTE(review): set_learning_phase(1) pins Keras to training mode globally,
# which presumably also affects the later evaluate/predict calls -- confirm
# this is intended.
keras.backend.set_learning_phase(1)
dataset = target_model

# get_tvgg() is called only for its batch_size / epochs defaults. The model it
# returns is discarded (`_`) so that `my_model`, which holds the pretrained
# source weights, is the model that gets fine-tuned. Rebinding `my_model`
# here would replace it with a randomly initialised network and turn the
# transfer-learning run into training from scratch.
if not multiple_labels_dict[dataset]:
    train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels, val_values_stratified, val_labels_stratified = get_data(dataset)

    _, batch_size, epochs = get_tvgg()
    train_generator = EEG_Data_Generator_Heterogeneous(train_all_values, train_all_labels, batch_size, dataset, conv = True)
    #val_generator = EEG_Data_Generator_Heterogeneous(val_values, val_labels, batch_size, dataset, conv = True)
else:
    train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels, class1_filters = get_data(dataset)
    train_all_filter = class1_filters[0]
    train_filter = class1_filters[1]
    val_filter = class1_filters[2]

    _, batch_size, epochs = get_tvgg()
    train_generator = EEG_Data_Generator_Heterogeneous(train_all_values, train_all_labels, batch_size, dataset, conv = True, class_filter = train_all_filter)
    #val_generator = EEG_Data_Generator_Heterogeneous(val_values, val_labels, batch_size, dataset, conv = True, class_filter = val_filter)

In [None]:
# Fine-tune `my_model` on the target training generator, saving the weights
# after every epoch and logging the per-epoch metrics to CSV.
epochs = 10
checkpoint_path = '/home/jupyter/time_series_transfer_learning/transfer_learning/1-transfer-learning/model_weights/TUAB_TUEP_tvgg_train_all.hdf5'
# NOTE(review): no validation data is passed to fit(), so 'val_f1_m' is never
# logged; because save_best_only is not set, the weights are written every
# epoch and the last epoch is what remains on disk.
cp_callback = keras.callbacks.ModelCheckpoint(checkpoint_path, 
                                                 monitor='val_f1_m',
                                                 save_weights_only=True,
                                                 verbose=1)
log_callback = keras.callbacks.CSVLogger(filename =  '/home/jupyter/time_series_transfer_learning/transfer_learning/1-transfer-learning/logbooks/logbook_TUAB_TUEP_tvgg_train_all.csv', separator=",", append=False)
active_callbacks = [cp_callback, log_callback]


# Model.fit accepts the generator directly; fit_generator is a deprecated
# alias that newer Keras releases no longer provide.
my_model_history = my_model.fit(x=train_generator,
                                steps_per_epoch=len(train_generator),
                                epochs=epochs,
                                shuffle=True,
                                #validation_data=val_generator,
                                #validation_steps=len(val_generator),
                                callbacks = active_callbacks)

### EEGNET

In [98]:
def get_eegnet(input_shape, output_shape):
    """Build and compile the EEGNet-style 2-D CNN classifier.

    Layer sequence: Conv2D (4 filters, 1x128) -> BatchNorm -> DepthwiseConv2D
    (19x1, depth multiplier 2, max-norm 1) -> BatchNorm -> ELU -> average
    pooling (1x4) -> dropout 0.25 -> SeparableConv2D (8 filters, 1x16) ->
    BatchNorm -> ELU -> average pooling (1x8) -> dropout 0.25 -> Flatten ->
    Dense(output_shape, max-norm 0.25) -> sigmoid. Compiled with binary
    cross-entropy, Adam (learning rate 0.0001) and the custom metrics defined
    in this notebook.

    NOTE(review): the depthwise convolution and both pooling layers pass
    data_format='channels_first', while the Conv2D and SeparableConv2D layers
    do not pass data_format at all -- confirm the mixed layout is intended.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed to the first Conv2D layer.
    output_shape : int
        Number of units of the final Dense layer.

    Returns
    -------
    tuple
        ``(model, epochs, batch_size)`` with ``epochs = 5`` and
        ``batch_size = 128``. Note the order differs from ``get_tvgg``, which
        returns ``(model, batch_size, epochs)``.
    """
    n_temporal_filters = 4
    n_eeg_channels = 19
    depth_multiplier = 2
    n_separable_filters = depth_multiplier * n_temporal_filters

    dropout_rates = [0.25, 0.25]
    norm_limits = [1, 0.25]

    model = Sequential()
    add = model.add

    # Temporal convolution
    add(Conv2D(filters=n_temporal_filters, kernel_size=(1, 128), input_shape=input_shape, padding="same"))
    add(Activation("linear"))
    add(BatchNormalization(axis=1))

    # Depthwise (spatial) convolution
    add(DepthwiseConv2D(kernel_size=(n_eeg_channels, 1),
                        padding="valid",
                        depth_multiplier=depth_multiplier,
                        depthwise_constraint=keras.constraints.max_norm(norm_limits[0]),
                        data_format='channels_first'))
    add(Activation("linear"))
    add(BatchNormalization(axis=1))
    add(Activation("elu"))

    add(AveragePooling2D(pool_size=(1, 4), data_format='channels_first'))
    add(Dropout(dropout_rates[0]))

    # Separable convolution
    add(SeparableConv2D(filters=n_separable_filters, kernel_size=(1, 16), padding="same", use_bias=False))
    add(Activation("linear"))
    add(BatchNormalization(axis=1))
    add(Activation("elu"))

    add(AveragePooling2D(pool_size=(1, 8), data_format='channels_first'))
    add(Dropout(dropout_rates[1]))

    # Classification head
    add(Flatten())
    add(Dense(output_shape, kernel_constraint=keras.constraints.max_norm(norm_limits[1])))
    add(Activation('sigmoid'))

    learning_rate = 0.0001
    tracked_metrics = [accuracy_m, recall_m, neg_recall_m, precision_m, neg_precision_m,
                       f1_m, neg_f1_m, pos_pred_m, neg_pred_m]
    model.compile(loss= 'binary_crossentropy',
                  optimizer=keras.optimizers.Adam(learning_rate),
                  metrics=tracked_metrics)

    # Default training hyper-parameters handed back alongside the model
    epochs = 5
    batch_size = 128
    return model, epochs, batch_size

#### Train model and tune hyperparameters

#### TUEP TUAB

In [None]:
#source model
# Build an EEGNet and load the baseline weights of the source dataset (TUEP).
basepath = "/home/jupyter/time_series_transfer_learning/transfer_learning/0-baseline/model_weights/"
source_model = "TUEP"
target_model = "TUAB"
# get_eegnet returns (model, epochs, batch_size) -- unlike get_tvgg, which
# returns (model, batch_size, epochs) -- so unpack in that order; the previous
# order swapped the two values.
my_model, epochs, batch_size = get_eegnet((1, 19, 2560), output_shape = 1)
# NOTE(review): this loads the "TVGG" entry of model_dict into an EEGNet; the
# two architectures differ, so this looks like a copy-paste leftover. Confirm
# the key under which the EEGNet baseline weights are stored and use it here.
my_model.load_weights(basepath + model_dict[source_model]["TVGG"])

#We don't freeze layers

In [101]:
# Load the target dataset and build the training generator for fine-tuning.
# NOTE(review): clear_session() runs after `my_model` was built and its source
# weights loaded in an earlier cell -- confirm the model stays usable
# afterwards on the TensorFlow version in use.
keras.backend.clear_session()
dataset = target_model

# get_eegnet() is called only for its epochs / batch_size defaults. The model
# it returns is discarded (`_`) in BOTH branches so that `my_model`, which
# holds the pretrained source weights, is the model that gets fine-tuned.
if not multiple_labels_dict[dataset]:
    # NOTE(review): the T-VGG cells unpack ten values from get_data() in this
    # branch, this one unpacks eight -- confirm which matches get_data().
    train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels = get_data(dataset)
    _, epochs, batch_size = get_eegnet((1, 19, 2560), output_shape = 1)

    # NOTE(review): trains on train_values here, whereas the other branch and
    # the T-VGG cells train on train_all_values -- confirm which is intended.
    train_generator = EEG_Data_Generator_Heterogeneous(train_values, train_labels, batch_size, dataset)
    #val_generator = EEG_Data_Generator_Heterogeneous(val_values, val_labels, batch_size, dataset)

else:
    train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels, class1_filters = get_data(dataset)
    train_all_filter = class1_filters[0]
    train_filter = class1_filters[1]
    val_filter = class1_filters[2]

    _, epochs, batch_size = get_eegnet((1, 19, 2560), output_shape = 1)
    train_generator = EEG_Data_Generator_Heterogeneous(train_all_values, train_all_labels, batch_size, dataset, class_filter = train_all_filter)
    #val_generator = EEG_Data_Generator_Heterogeneous(val_values, val_labels, batch_size, dataset, class_filter = val_filter)

In [104]:
# Fine-tune `my_model` on the target training generator, saving the weights
# after every epoch and logging the per-epoch metrics to CSV.
epochs = 10
# Name the checkpoint and log after the actual source/target pair and keep
# them in the 1-transfer-learning folders, like the T-VGG runs. The previous
# hard-coded paths pointed into 0-baseline/ under a "TUEP_TUSZ" name, which
# matches neither this section's target dataset nor the transfer-learning
# layout, and risks clobbering baseline artefacts.
transfer_dir = '/home/jupyter/time_series_transfer_learning/transfer_learning/1-transfer-learning/'
run_name = f'{source_model}_{target_model}_eeg_train_all'
checkpoint_path = transfer_dir + 'model_weights/' + run_name + '.hdf5'
# NOTE(review): no validation data is passed to fit(), so 'val_f1_m' is never
# logged; because save_best_only is not set, the weights are written every
# epoch and the last epoch is what remains on disk.
cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, 
                                                 monitor='val_f1_m',
                                                 save_weights_only=True,
                                                 verbose=1)
log_callback = tf.keras.callbacks.CSVLogger(filename = transfer_dir + 'logbooks/logbook_' + run_name + '.csv', separator=",", append=False)
active_callbacks = [cp_callback, log_callback]


# Model.fit accepts the generator directly; fit_generator is a deprecated
# alias that newer Keras releases no longer provide.
my_model_history = my_model.fit(x=train_generator,
                                steps_per_epoch=len(train_generator),
                                epochs=epochs,
                                shuffle=True,
                                callbacks = active_callbacks)

Epoch 1/5
Epoch 00001: saving model to /home/jupyter/time_series_transfer_learning/transfer_learning/0-baseline/model_weights/tusz_eeg_train_all.hdf5
Epoch 2/5
Epoch 00002: saving model to /home/jupyter/time_series_transfer_learning/transfer_learning/0-baseline/model_weights/tusz_eeg_train_all.hdf5
Epoch 3/5
Epoch 00003: saving model to /home/jupyter/time_series_transfer_learning/transfer_learning/0-baseline/model_weights/tusz_eeg_train_all.hdf5
Epoch 4/5
Epoch 00004: saving model to /home/jupyter/time_series_transfer_learning/transfer_learning/0-baseline/model_weights/tusz_eeg_train_all.hdf5
Epoch 5/5
Epoch 00005: saving model to /home/jupyter/time_series_transfer_learning/transfer_learning/0-baseline/model_weights/tusz_eeg_train_all.hdf5


In [105]:
# Restore the weights stored at `checkpoint_path` into the current model and
# wrap the test split in a generator for evaluation.
my_model.load_weights(checkpoint_path)
test_generator = EEG_Data_Generator_Heterogeneous(test_values, test_labels, batch_size, dataset)

In [106]:
# Evaluate the model on the test generator: compiled metrics via evaluate(),
# raw scores via predict() for the area under the precision-recall curve.
test_dict = my_model.evaluate(test_generator, steps=len(test_generator), return_dict = True)
# Model.predict accepts the generator directly; predict_generator is a
# deprecated alias that newer Keras releases no longer provide.
test_preds = my_model.predict(test_generator, steps=len(test_generator))
# Labels are re-read batch by batch from the generator.
# NOTE(review): assumes indexing the generator yields batches in the same
# order predict() consumed them (no reshuffling in between) -- confirm.
test_labels_ = np.concatenate([test_generator[i][1] for i in range(len(test_generator))])

print("accuracy", test_dict['accuracy_m'])
print("aupr", average_precision_score(test_labels_, test_preds, average="micro"))
print("recall", test_dict['recall_m'], "precision", test_dict['precision_m'], "\n")

accuracy 0.3411458432674408
aupr 0.9678087810374134
recall 0.32824501395225525 precision 0.9625529646873474 



#### TUAB TUEP

In [None]:
#source model
# Build a model and load the baseline weights of the source dataset (TUAB).
# NOTE(review): this cell sits under the EEGNET section but builds a T-VGG via
# get_tvgg() and loads the "TVGG" weights entry -- it looks copied from the
# T-VGG section. Confirm whether get_eegnet(...) (which returns
# (model, epochs, batch_size)) and the EEGNet weights entry of model_dict
# were intended.
basepath = "/home/jupyter/time_series_transfer_learning/transfer_learning/0-baseline/model_weights/"
source_model = "TUAB"
target_model = "TUEP"
my_model, batch_size, epochs = get_tvgg()
my_model.load_weights(basepath + model_dict[source_model]["TVGG"])

#We don't freeze layers