In [1]:
import os
import random
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sn
import math
import gc
import nbimporter

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, LeaveOneOut
from sklearn.metrics import confusion_matrix

from imblearn.over_sampling import SMOTE

from collections import Counter

from scipy.fft import fft

from keras import backend as K
from keras.layers import Dense, Activation, Dropout, Flatten, Input
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers.merge import concatenate
from keras.utils import plot_model, to_categorical
from keras.models import Model, Sequential, load_model
from keras import optimizers, regularizers

from IPython.core.display import display, HTML
# Widen the notebook container to 90% of the window so wide outputs fit.
display(HTML("<style>.container { width:90% !important; }</style>"))

# List the GPUs TensorFlow can see, then fix TensorFlow's global random seed.
# NOTE(review): only TensorFlow is seeded in this cell; NumPy's global RNG is not.
print("Available GPU devices:", tf.config.list_physical_devices('GPU'))
tf.random.set_seed(3)

import models as models

Available GPU devices: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Importing Jupyter notebook from models.ipynb


## Helper functions

This notebook contains a number of helper functions useful for a variety of ML experiments (train-test, kFold cross validation, leave one subject out...). They are used by the corresponding experiment notebooks.

**clear_old_dirs():** The function below clears old logs and checkpoints used in previous runs.

In [2]:
def clear_old_dirs(which_data, which_experiment):
    """Reset the Keras session and empty the artefact folders of one experiment.

    For the given data source and experiment type, every entry inside
    ``model_checkpoints/<data>_<experiment>/``,
    ``model_training_logs/<data>_<experiment>/`` and
    ``results/<data>_<experiment>/`` is deleted. Entries are removed whether
    they are files or directories, because the experiments write both kinds
    (e.g. a ``best_model.hdf5`` file for spectrograms, per-iteration folders
    for k-fold). Missing folders are created so a first run does not fail.

    Args:
        which_data: "contact" or "radar".
        which_experiment: "train_test", "kfold", "loo" or "spectrograms".

    Returns:
        None. An unknown experiment or data name only prints a message.
    """
    tf.keras.backend.clear_session()

    # Experiment is validated before data, matching the order of the messages
    # callers have always seen.
    if which_experiment not in ("train_test", "kfold", "loo", "spectrograms"):
        print("Unknown experiment type!")
        return
    if which_data not in ("contact", "radar"):
        print("Unknown data!")
        return

    experiment_dir = which_data + "_" + which_experiment
    for root in ("model_checkpoints", "model_training_logs", "results"):
        target = os.path.join(root, experiment_dir)
        os.makedirs(target, exist_ok=True)
        for entry in os.listdir(target):
            entry_path = os.path.join(target, entry)
            # rmtree refuses plain files and symlinks; os.remove refuses directories.
            if os.path.isdir(entry_path) and not os.path.islink(entry_path):
                shutil.rmtree(entry_path)
            else:
                os.remove(entry_path)

**scheduler():** A learning-rate schedule. It returns the given learning rate unchanged for epochs 0–10 and afterwards scales it down by the inverse-time factor $1/(1 + \text{decay} \cdot \text{epoch})$.

In [3]:
def scheduler(epoch, lr=0.1, decay=0.001):
    """Learning-rate schedule with a flat start followed by inverse-time decay.

    Args:
        epoch: Index of the current epoch.
        lr: Learning rate handed in by the caller.
        decay: Decay coefficient used once the flat phase is over.

    Returns:
        ``lr`` itself while ``epoch <= 10``; ``lr / (1 + decay * epoch)`` afterwards.
    """
    # Flat phase: leave the rate untouched.
    if epoch <= 10:
        return lr

    damping = 1 + decay * epoch
    return lr / damping

**show_dist():** A function that shows the distribution of (class) values of an array.

In [4]:
def show_dist(input_array):
    """Display the relative frequency of every value in ``input_array``.

    The values are placed in a single-column frame named "class" and the
    normalised value counts are rendered with the notebook's ``display``.
    """
    class_frame = pd.DataFrame(input_array, columns=["class"])
    class_shares = class_frame.value_counts(normalize=True)
    display(class_shares)

**oversample_smote():** The function below oversamples the data using Synthetic Minority Oversampling Technique, or SMOTE for short. It achieves equal class distribution in the dataset. It returns the oversampled dataset.

In [5]:
def oversample_smote(X_all, Y_all):
    """Balance the class distribution of a multi-signal dataset with SMOTE.

    ``X_all`` is indexed as (instance, time step, signal). Each instance is
    flattened to one feature vector covering all signals and SMOTE is run
    once, so every synthetic instance is interpolated from the same pair of
    neighbours in all of its signals. Running SMOTE separately per signal
    would draw different neighbours and interpolation factors for each
    signal and stack unrelated synthetic signals into one instance.

    Args:
        X_all: 3-D array of inputs, shape (n_instances, n_steps, n_signals).
        Y_all: 1-D array of class labels, one per instance.

    Returns:
        Tuple ``(X_all, Y_all)`` with the oversampled inputs (same trailing
        dimensions as the input) and the matching labels.
    """
    n_instances, n_steps, n_signals = X_all.shape

    # C-order flatten/unflatten round-trips the (time step, signal) layout exactly.
    X_flat = X_all.reshape(n_instances, n_steps * n_signals)

    oversample = SMOTE(n_jobs=8)
    X_flat, Y_all = oversample.fit_resample(X_flat, Y_all)

    X_all = X_flat.reshape(-1, n_steps, n_signals)
    print("Shapes after oversampling:", X_all.shape, Y_all.shape)

    return X_all, Y_all

**shuffle_within_train_test():** The function below shuffles the train split and the test split separately, so that neighbouring instances never end up on opposite sides of the split. It returns the shuffled train and test data.

In [6]:
def shuffle_within_train_test(X_train, Y_train, X_test, Y_test):
    """Shuffle the train and the test split independently of each other.

    Inputs and labels of a split are reordered with the same random
    permutation, drawn from NumPy's global random state (train first, then
    test). No instance moves between the two splits.

    Returns:
        Tuple ``(X_train, Y_train, X_test, Y_test)`` in shuffled order.
    """
    print("Shuffling WITHIN train/test, NOT overall!")

    def _random_order(n_samples):
        # In-place shuffle of 0..n_samples-1 using the global NumPy RNG.
        order = np.arange(n_samples)
        np.random.shuffle(order)
        return order

    train_order = _random_order(Y_train.shape[0])
    test_order = _random_order(Y_test.shape[0])

    return (
        X_train[train_order],
        Y_train[train_order],
        X_test[test_order],
        Y_test[test_order],
    )

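**standardize_input():** Standardizes each signal with sklearn's `StandardScaler`. The scaler is fitted on the train split only and then applied to both the train and the test split.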

In [7]:
def standardize_input(X_train, X_test):
    """Standardize every signal using statistics of the train split only.

    For each index along the last axis, a ``StandardScaler`` is fitted on the
    train slice and applied to the train and the test slice. Both arrays are
    overwritten in place and returned.

    Returns:
        Tuple ``(X_train, X_test)`` holding the scaled values.
    """
    print("Scaling!")

    n_signals = X_train.shape[2]
    scaler = StandardScaler()
    for signal in range(n_signals):
        train_slice = X_train[:, :, signal]
        # fit() discards any previous statistics, so one scaler object can be reused.
        X_train[:, :, signal] = scaler.fit(train_slice).transform(train_slice)
        X_test[:, :, signal] = scaler.transform(X_test[:, :, signal])

    return X_train, X_test

**get_model():** The function below creates a selected deep learning ANN architecture model and returns it.

In [1]:
def get_model(which_data, which_model, input_X, input_Y, hyperparams, metrics):
    """Build the requested network architecture from the ``models`` notebook.

    Args:
        which_data: "contact" or "radar".
        which_model: "fully_connected_small", "1d_cnn" or "hybrid".
        input_X: Input data handed to the model factory as ``input_data``.
        input_Y: Labels; only used by "fully_connected_small", where the
            number of classes is taken from the width of their one-hot encoding.
        hyperparams: Dict of hyperparameters. "fully_connected_small" reads
            "LR" and "DROPOUT"; the other factories receive the whole dict.
        metrics: Metrics forwarded to the model factory.

    Returns:
        The model returned by the factory, or ``None`` (after printing a
        message) when the data source or model type is unknown.
    """
    if which_data not in ("contact", "radar"):
        print("Unknown data!")
        return None

    if which_model == "fully_connected_small":
        # Only this architecture has a separate factory per data source.
        if which_data == "contact":
            factory = models.create_contact_fully_connected_small
        else:
            factory = models.create_radar_fully_connected_small
        return factory(
            input_data = input_X,
            lr         = hyperparams["LR"],
            dropout    = hyperparams["DROPOUT"],
            n_classes  = to_categorical(input_Y).shape[1],
            metrics    = metrics
        )

    if which_model in ("1d_cnn", "hybrid"):
        # These factories are shared by both data sources.
        factory = models.cnn1d if which_model == "1d_cnn" else models.hybrid_model
        return factory(
            input_data = input_X,
            hyperparams= hyperparams,
            metrics    = metrics
        )

    print("Unknown model type!")
    return None

**evaluate_model():** The function below evaluates a given model on a given (separate) test dataset. It returns a table with results and saves a confusion matrix.

In [9]:
def evaluate_model(trained_model, X_test, Y_test, hyperparams, which_data, class_map, iteration, which_experiment, run_id=None):
    """Evaluate a trained model on held-out data and save its confusion matrix.

    Args:
        trained_model: Keras model with one input per signal.
        X_test: 3-D test inputs; the last axis indexes the signals.
        Y_test: Discrete test labels.
        hyperparams: Dict; "BATCH_SIZE" and "VERBOSE" are read.
        which_data: Data source name, used in the output path.
        class_map: Dict mapping class name -> label, used for the plot ticks.
        iteration: Fold/iteration number, used in the output path for
            "kfold" and "loo".
        which_experiment: Experiment name, used in the output path.
        run_id: Optional sub-folder placed between the experiment folder and
            the iteration folder ("kfold"/"loo" only). ``None`` omits it.

    Returns:
        One-row DataFrame with the columns loss, tp, fp, tn, fn, accuracy,
        precision, recall, auc, filled from ``trained_model.evaluate``.
    """
    # One array per signal. Squeeze only the signal axis so that a batch with
    # a single instance keeps its batch dimension.
    signal_inputs = [np.squeeze(x, axis=2) for x in np.split(X_test, X_test.shape[2], axis=2)]

    # Table of numeric results
    all_results = pd.DataFrame(columns=["loss", "tp", "fp", "tn", "fn", "accuracy", "precision", "recall", "auc"])
    results = trained_model.evaluate(
        x=signal_inputs,
        y=to_categorical(Y_test, num_classes=6), # one hot encode discrete labels
        batch_size=hyperparams["BATCH_SIZE"],
        verbose=hyperparams["VERBOSE"]
    )
    # Everything after the first returned value fills the nine result columns.
    all_results.loc[0] = results[1:]

    # Confusion matrix
    predictions = trained_model.predict(signal_inputs)
    Y_predicted = np.argmax(predictions, axis=1).astype('float')

    # Use the union of true and predicted labels for both the matrix and its
    # tick names; otherwise the two can disagree in size whenever the sets differ.
    labels = np.union1d(np.unique(Y_test), np.unique(Y_predicted))
    inv_class_map = {v: k for k, v in class_map.items()}
    # A predicted label may be missing from a reduced class_map; fall back to its number.
    plot_classes = [inv_class_map.get(int(val), str(int(val))) for val in labels]

    cm = confusion_matrix(y_true=Y_test, y_pred=Y_predicted, labels=labels)
    df_cm = pd.DataFrame(cm, index=plot_classes, columns=plot_classes)

    # Plot confusion matrix
    fig = plt.figure(figsize=(10,7))
    sn.heatmap(df_cm, annot=True)
    plt.xlabel("PREDICTED")
    plt.ylabel("TRUE")

    out_dir = os.path.join("results", which_data+"_"+which_experiment)
    if which_experiment == "kfold" or which_experiment == "loo":
        if run_id is not None:
            out_dir = os.path.join(out_dir, str(run_id))
        out_dir = os.path.join(out_dir, str(iteration))
    os.makedirs(out_dir, exist_ok=True)

    plt.savefig(os.path.join(out_dir, "confusion_matrix.png"), bbox_inches='tight')
    plt.close(fig)

    return all_results

**execute_single_fold():** The function below executes a single fold of the cross validation procedure, obtaining a set of results for the current iteration.

In [1]:
def execute_single_fold(which_data, which_model, train_index, test_index, X_all, Y_all, hyperparams, metrics, iteration, class_map):
    """Train and evaluate one fold of the k-fold cross validation.

    Args:
        which_data: "contact" or "radar".
        which_model: Model type understood by ``get_model``.
        train_index: Indices of the training instances in ``X_all``/``Y_all``.
        test_index: Indices of the test instances in ``X_all``/``Y_all``.
        X_all: 3-D inputs; the last axis indexes the signals.
        Y_all: Discrete labels.
        hyperparams: Dict; reads "SHUFFLE", "STANDARD", "RUN_ID",
            "BATCH_SIZE", "N_EPOCHS", "VERBOSE" and "VALIDATION" here.
        metrics: Metrics forwarded to ``get_model``.
        iteration: Fold number; names the per-fold output folders.
        class_map: Dict mapping class name -> label.

    Returns:
        The one-row results DataFrame produced by ``evaluate_model``.

    Raises:
        ValueError: If ``get_model`` cannot build the requested model.
    """
    print("TRAIN:", train_index, "TEST:", test_index)

    # Take n-1 folds for train and 1 for test
    X_train = X_all[train_index, :, :]
    X_test  = X_all[test_index, :, :]
    Y_train = Y_all[train_index]
    Y_test  = Y_all[test_index]
    print("Train shapes:", X_train.shape, Y_train.shape, "Test shapes:", X_test.shape, Y_test.shape)
    show_dist(Y_train)
    show_dist(Y_test)

    # Random shuffling within train test (no overall)
    if hyperparams["SHUFFLE"]:
        X_train, Y_train, X_test, Y_test = shuffle_within_train_test(X_train, Y_train, X_test, Y_test)

    # Standardize/scale the data
    if hyperparams["STANDARD"]:
        X_train, X_test = standardize_input(X_train, X_test)

    # Define fully-connected (contact or radar) signal model
    model = get_model(which_data, which_model, X_train, Y_train, hyperparams, metrics)
    if model is None:
        raise ValueError("Could not build model '" + str(which_model) + "' for data '" + str(which_data) + "'")

    # Per-fold output folders. exist_ok avoids a crash when only some of the
    # three folders are left over from an earlier run.
    fold_subdir = os.path.join(which_data+"_kfold", hyperparams["RUN_ID"], str(iteration))
    checkpoint_dir = os.path.join("model_checkpoints", fold_subdir)
    log_dir = os.path.join("model_training_logs", fold_subdir)
    results_dir = os.path.join("results", fold_subdir)
    for folder in (checkpoint_dir, log_dir, results_dir):
        os.makedirs(folder, exist_ok=True)
    checkpoint_path = os.path.join(checkpoint_dir, "best_model.hdf5")

    # Train the model
    model.fit(
        # One array per signal; squeeze only the signal axis so a single-instance batch keeps its batch dimension.
        x=[np.squeeze(x, axis=2) for x in np.split(X_train, X_train.shape[2], axis=2)],
        y=to_categorical(Y_train, num_classes=6), # one hot encode discrete labels
        batch_size=hyperparams["BATCH_SIZE"],
        epochs=hyperparams["N_EPOCHS"],
        verbose=hyperparams["VERBOSE"],
        validation_split=hyperparams["VALIDATION"],
        callbacks=[
            tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, monitor='val_loss', mode='min', save_best_only=True),
            tf.keras.callbacks.TensorBoard(log_dir=log_dir),
            tf.keras.callbacks.LearningRateScheduler(scheduler),
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)
        ]
    )

    # Evaluate the checkpoint with the lowest validation loss on the left-out test data
    trained_model = load_model(filepath=checkpoint_path)
    results = evaluate_model(trained_model, X_test, Y_test, hyperparams, which_data, class_map, iteration, which_experiment="kfold", run_id=hyperparams["RUN_ID"])

    # Cleanup for GPU memory
    del model
    del trained_model
    gc.collect()
    K.clear_session()
    tf.keras.backend.clear_session()
    tf.compat.v1.reset_default_graph()

    return results

**execute_single_loo_iteration():** The function below executes a single iteration of the leave-one-out procedure, obtaining a set of results for the current iteration.

In [18]:
def execute_single_loo_iteration(which_data, which_model, train_index, test_index, X_loo, Y_loo, hyperparams, metrics, iteration, class_map):
    """Train and evaluate one leave-one-out iteration.

    Args:
        which_data: "contact" or "radar".
        which_model: Model type understood by ``get_model``.
        train_index: Positions (in ``X_loo``/``Y_loo``) of the training entries.
        test_index: Positions of the left-out entries.
        X_loo: Sequence of input arrays, concatenated along axis 0.
        Y_loo: Sequence of label arrays, aligned with ``X_loo``.
        hyperparams: Dict; reads "OVERSAMPLE", "SHUFFLE", "STANDARD",
            "BATCH_SIZE", "N_EPOCHS", "VERBOSE" and "VALIDATION" here.
        metrics: Metrics forwarded to ``get_model``.
        iteration: Iteration number; names the per-iteration output folders.
        class_map: Dict mapping class name -> label.

    Returns:
        The one-row results DataFrame produced by ``evaluate_model``.

    Raises:
        ValueError: If ``get_model`` cannot build the requested model.
    """
    print("TRAIN:", train_index, "TEST:", test_index)

    # Take n-1 for train and 1 for test. Indexing the sequences directly avoids
    # building an object array, which changes shape when all entries happen to
    # have equal dimensions.
    X_train = np.concatenate([X_loo[i] for i in train_index], axis=0)
    X_test = np.concatenate([X_loo[i] for i in test_index], axis=0)
    Y_train = np.concatenate([Y_loo[i] for i in train_index], axis=0)
    Y_test = np.concatenate([Y_loo[i] for i in test_index], axis=0)
    print("Train shapes:", X_train.shape, Y_train.shape, "Test shapes:", X_test.shape, Y_test.shape)

    # Oversample within train and test (left-out subject) separately, since distribution is skewed in both
    # NOTE(review): when the test data has a single class, the train data is not oversampled either.
    if hyperparams["OVERSAMPLE"]:
        if np.unique(Y_test).size <= 1:
            print("Test data (left-out subject) has only 1 class:", np.unique(Y_test), " so it doesn't make sense to oversample!")
        else:
            # Oversample minority classes in a smart way using SMOTE
            print("Oversampling the train (n-1) subjects, with shapes:", X_train.shape, Y_train.shape)
            X_train, Y_train = oversample_smote(X_train, Y_train)
            print("Oversampling the test (left-out) subject with shapes:", X_test.shape, Y_test.shape)
            if X_test.shape[0] >= 10:
                X_test, Y_test = oversample_smote(X_test, Y_test)

    show_dist(Y_train)
    show_dist(Y_test)

    # Random shuffling within train test (no overall)
    if hyperparams["SHUFFLE"]:
        X_train, Y_train, X_test, Y_test = shuffle_within_train_test(X_train, Y_train, X_test, Y_test)

    # Standardize/scale the data
    if hyperparams["STANDARD"]:
        X_train, X_test = standardize_input(X_train, X_test)

    # Define fully-connected (contact or radar) signal model
    model = get_model(which_data, which_model, X_train, Y_train, hyperparams, metrics)
    if model is None:
        raise ValueError("Could not build model '" + str(which_model) + "' for data '" + str(which_data) + "'")

    # Per-iteration output folders. exist_ok avoids a crash when only some of
    # the three folders are left over from an earlier run.
    iteration_subdir = os.path.join(which_data+"_loo", str(iteration))
    checkpoint_dir = os.path.join("model_checkpoints", iteration_subdir)
    log_dir = os.path.join("model_training_logs", iteration_subdir)
    results_dir = os.path.join("results", iteration_subdir)
    for folder in (checkpoint_dir, log_dir, results_dir):
        os.makedirs(folder, exist_ok=True)
    checkpoint_path = os.path.join(checkpoint_dir, "best_model.hdf5")

    # Train the model
    # NOTE(review): labels are one-hot encoded to np.unique(Y_train).size columns here,
    # while evaluate_model encodes to 6 — confirm both match the model's output width.
    model.fit(
        # One array per signal; squeeze only the signal axis so a single-instance batch keeps its batch dimension.
        x=[np.squeeze(x, axis=2) for x in np.split(X_train, X_train.shape[2], axis=2)],
        y=to_categorical(Y_train, num_classes=np.unique(Y_train).size), # one hot encode discrete labels
        batch_size=hyperparams["BATCH_SIZE"],
        epochs=hyperparams["N_EPOCHS"],
        verbose=hyperparams["VERBOSE"],
        validation_split=hyperparams["VALIDATION"],
        callbacks=[
            tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, monitor='val_loss', mode='min', save_best_only=True),
            tf.keras.callbacks.TensorBoard(log_dir=log_dir),
            tf.keras.callbacks.LearningRateScheduler(scheduler),
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)
        ]
    )

    # Evaluate the checkpoint with the lowest validation loss on the left-out test data.
    # evaluate_model places run_id between the experiment folder and the
    # iteration folder; "." keeps the flat results/<data>_loo/<iteration>/
    # layout created above.
    trained_model = load_model(filepath=checkpoint_path)
    results = evaluate_model(trained_model, X_test, Y_test, hyperparams, which_data, class_map, iteration, which_experiment="loo", run_id=".")

    # Cleanup for GPU memory
    del model
    del trained_model
    gc.collect()
    K.clear_session()
    tf.keras.backend.clear_session()
    tf.compat.v1.reset_default_graph()

    return results

**plot_cv_indices():** The function below plots exact split of indices in the kFold CV, for sanity check.

In [12]:
def plot_cv_indices(cv, X, y, ax, n_splits, lw=10):
    """Draw which samples each cross-validation split uses for train and test.

    One horizontal band is drawn per split produced by ``cv.split``; a final
    band below them shows the class of every sample.

    Args:
        cv: Cross-validation object providing ``split(X=..., y=...)``.
        X: Samples; only its length is used for the x-axis.
        y: Class labels, used to colour the last band.
        ax: Matplotlib axes to draw on.
        n_splits: Number of splits, used for the y-axis ticks and limits.
        lw: Line width of the markers.

    Returns:
        The axes that were passed in.
    """
    n_samples = len(X)
    sample_positions = range(n_samples)

    # One band per split: 1 marks test samples, 0 marks train samples,
    # NaN is left for samples in neither set.
    for split_no, (train_ids, test_ids) in enumerate(cv.split(X=X, y=y)):
        membership = np.full(n_samples, np.nan)
        membership[test_ids] = 1
        membership[train_ids] = 0

        ax.scatter(
            sample_positions,
            [split_no + .5] * n_samples,
            c=membership,
            marker='_',
            lw=lw,
            cmap=plt.cm.coolwarm,
            vmin=-.2,
            vmax=1.2,
        )

    # Class band, placed one row below the last split.
    class_row = split_no + 1.5
    ax.scatter(sample_positions, [class_row] * n_samples,
               c=y, marker='_', lw=lw, cmap=plt.cm.Paired)

    # Axis cosmetics: one tick per split plus the class row, y-axis inverted.
    tick_positions = np.arange(n_splits + 1) + .5
    tick_names = list(range(n_splits)) + ['class']
    ax.set(
        yticks=tick_positions,
        yticklabels=tick_names,
        xlabel='Sample index',
        ylabel="CV iteration",
        ylim=[n_splits+2.2, -.2],
    )
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax

## Main train-test experiment

**train_test_experiment():** The function below calls all other functions to read the data, split it, define and train a model and evaluate it in train-test split experiment. Finally, it does some memory cleanup.

In [13]:
def train_test_experiment(which_data, which_model, hyperparams, metrics, class_map, selected_classes=None):
    """Run a single train/test split experiment end to end.

    Loads the data, optionally restricts it to selected classes and
    oversamples it, splits it by position, trains a model and evaluates the
    best checkpoint on the held-out part.

    Args:
        which_data: "contact" or "radar"; selects the input file and output folders.
        which_model: Model type understood by ``get_model``.
        hyperparams: Dict; reads "OVERSAMPLE", "TRAIN_AMNT", "SHUFFLE",
            "STANDARD", "BATCH_SIZE", "N_EPOCHS", "VERBOSE" and "VALIDATION" here.
        metrics: Metrics forwarded to ``get_model``.
        class_map: Dict mapping class name -> label.
        selected_classes: Optional list of class names to keep.

    Returns:
        The one-row results DataFrame produced by ``evaluate_model``.

    Raises:
        ValueError: If ``get_model`` cannot build the requested model.
    """
    # Clean old things from previous run
    clear_old_dirs(which_data, "train_test")

    # Read the full data
    X_all = np.load("model_inputs_and_targets/train_test/X_all_"+which_data+".npy")
    Y_all = np.load("model_inputs_and_targets/train_test/Y_all.npy")
    print("Original shape:", X_all.shape, Y_all.shape)

    if selected_classes:
        # Make subsets of data of selected classes (grouped in the order given)
        print("Taking subsets of data for classes:", selected_classes)
        X_new, Y_new = [], []
        for selected_class in selected_classes:
            idx = (Y_all == class_map[selected_class]).nonzero()
            X_new.append(X_all[idx[0], :, :])
            Y_new.append(Y_all[idx])

        X_all = np.concatenate(X_new, axis=0)
        Y_all = np.concatenate(Y_new, axis=0)
        print("Subset shapes:", X_all.shape, Y_all.shape)

        class_map = {selected_class: class_map[selected_class] for selected_class in selected_classes}

    if hyperparams["OVERSAMPLE"]:
        # Oversample minority classes in a smart way using SMOTE
        # NOTE(review): this runs before the positional split below, so
        # synthetic instances can land in the test part — confirm this is intended.
        X_all, Y_all = oversample_smote(X_all, Y_all)

    # Withhold test data: the first TRAIN_AMNT share is train, the rest is test
    split_idx = math.ceil(hyperparams["TRAIN_AMNT"]*Y_all.shape[0])
    X_train   = X_all[:split_idx,:,:]
    Y_train   = Y_all[:split_idx]
    X_test    = X_all[split_idx:,:,:]
    Y_test    = Y_all[split_idx:]

    # Random shuffling within train test (no overall)
    if hyperparams["SHUFFLE"]:
        X_train, Y_train, X_test, Y_test = shuffle_within_train_test(X_train, Y_train, X_test, Y_test)

    # Standardize/scale the data
    if hyperparams["STANDARD"]:
        X_train, X_test = standardize_input(X_train, X_test)

    # Define fully-connected (contact or radar) signal model
    model = get_model(which_data, which_model, X_train, Y_train, hyperparams, metrics)
    if model is None:
        raise ValueError("Could not build model '" + str(which_model) + "' for data '" + str(which_data) + "'")

    checkpoint_path = "model_checkpoints/"+which_data+"_train_test/best_model.hdf5"

    # Train the model
    model.fit(
        # One array per signal; squeeze only the signal axis so a single-instance batch keeps its batch dimension.
        x=[np.squeeze(x, axis=2) for x in np.split(X_train, X_train.shape[2], axis=2)],
        y=to_categorical(Y_train, num_classes=6), # one hot encode discrete labels
        batch_size=hyperparams["BATCH_SIZE"],
        epochs=hyperparams["N_EPOCHS"],
        verbose=hyperparams["VERBOSE"],
        validation_split=hyperparams["VALIDATION"],
        callbacks=[
            tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, monitor='val_loss', mode='min', save_best_only=True),
            tf.keras.callbacks.TensorBoard(log_dir="model_training_logs/"+which_data+"_train_test/"),
            tf.keras.callbacks.LearningRateScheduler(scheduler),
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)
        ]
    )

    # Evaluate the checkpoint with the lowest validation loss on the left-out test data.
    # run_id is not used for the "train_test" output path, but evaluate_model
    # needs the argument, so it is passed explicitly.
    trained_model = load_model(filepath=checkpoint_path)
    all_results = evaluate_model(trained_model, X_test, Y_test, hyperparams, which_data, class_map, iteration=-1, which_experiment="train_test", run_id=None)

    # Cleanup for GPU memory
    del model
    del trained_model
    gc.collect()
    K.clear_session()
    tf.keras.backend.clear_session()
    tf.compat.v1.reset_default_graph()

    return all_results

## Main k-fold CV experiment

**kfold_cv_experiment():** The function below calls all other functions to read the data, split it, define and train a model and evaluate it in k-fold cross validation experiment. Finally, it does some memory cleanup.

In [3]:
def _power_spectrum(X):
    """Return |FFT|^2 along axis 1 for every signal of ``X``, in ``X``'s shape and dtype."""
    spectrum = np.empty_like(X)
    # One signal at a time keeps the temporary complex array small.
    for signal in range(X.shape[2]):
        spectrum[:, :, signal] = np.abs(fft(X[:, :, signal], axis=1))**2
    return spectrum


def kfold_cv_experiment(which_data, which_model, hyperparams, metrics, class_map, n_folds=10, selected_classes=None):
    """Run a stratified k-fold cross validation experiment end to end.

    Loads the data, optionally restricts it to selected classes, oversamples
    it and replaces/extends it with its power spectrum, then trains and
    evaluates one model per fold via ``execute_single_fold``.

    Args:
        which_data: "contact" or "radar"; selects the input file and output folders.
        which_model: Model type understood by ``get_model``.
        hyperparams: Dict; reads "OVERSAMPLE", "FFT_ONLY" and "ADD_FFT" here
            and is forwarded to ``execute_single_fold``.
        metrics: Metrics forwarded to ``get_model``.
        class_map: Dict mapping class name -> label.
        n_folds: Number of stratified folds.
        selected_classes: Optional list of class names to keep.

    Returns:
        DataFrame with one row of results per fold (columns loss, tp, fp,
        tn, fn, accuracy, precision, recall, auc).
    """
    result_columns = ["loss", "tp", "fp", "tn", "fn", "accuracy", "precision", "recall", "auc"]

    # Clean old things from previous run
    #clear_old_dirs(which_data, "kfold")

    # Read the full data
    X_all = np.load("model_inputs_and_targets/kfold/X_all_"+which_data+".npy")
    Y_all = np.load("model_inputs_and_targets/kfold/Y_all.npy")
    print("Original shape:", X_all.shape, Y_all.shape)

    if selected_classes:
        # Make subsets of data of selected classes (grouped in the order given)
        print("Taking subsets of data for classes:", selected_classes)
        X_new, Y_new = [], []
        for selected_class in selected_classes:
            idx = (Y_all == class_map[selected_class]).nonzero()
            X_new.append(X_all[idx[0], :, :])
            Y_new.append(Y_all[idx])

        X_all = np.concatenate(X_new, axis=0)
        Y_all = np.concatenate(Y_new, axis=0)
        print("Subset shapes:", X_all.shape, Y_all.shape)

        class_map = {selected_class: class_map[selected_class] for selected_class in selected_classes}

    if hyperparams["OVERSAMPLE"]:
        # Oversample minority classes in a smart way using SMOTE
        X_all, Y_all = oversample_smote(X_all, Y_all)

    if hyperparams["FFT_ONLY"]:
        print("Transforming temporal data to 1D FFT (abs squared)...")
        X_all = _power_spectrum(X_all)

    if hyperparams["ADD_FFT"]:
        # Append the power spectrum of every signal as additional signals
        print("Adding 1D FFT (abs squared) to temporal data...")
        X_all = np.concatenate([X_all, _power_spectrum(X_all)], axis=2)
        print(X_all.shape)

    # Group_by class (for visualisation purposes)
    idx = np.argsort(Y_all)
    X_all = X_all[idx]
    Y_all = Y_all[idx]

    # Prepare kFold CV
    skf = StratifiedKFold(n_splits=n_folds)
    print("Number of splits:", skf.get_n_splits(X_all, Y_all))

    # Sanity plot of index splits
    fig, ax = plt.subplots()
    plot_cv_indices(skf, X_all, Y_all, ax, n_folds)
    plt.show()

    # Collect per-fold frames and concatenate once; DataFrame.append no
    # longer exists in pandas 2.x.
    fold_results = []
    for iteration, (train_index, test_index) in enumerate(skf.split(X_all, Y_all)):
        print("====================== FOLD:", iteration, "======================")

        # Sanity check
        print("Train:", train_index)
        print("Test:", test_index)
        print("Intersection:", list(set(train_index) & set(test_index)))

        results = execute_single_fold(which_data, which_model, train_index, test_index, X_all, Y_all, hyperparams, metrics, iteration, class_map)
        fold_results.append(results)

    if not fold_results:
        return pd.DataFrame(columns=result_columns)
    return pd.concat(fold_results)

## Main LOO experiment

**loo_experiment():** The function below calls other functions to read the data, split it, define and train a model, and evaluate it in a LOO experiment. Finally, it does some memory cleanup.

In [15]:
def loo_experiment(which_data, which_model, hyperparams, metrics, class_map, selected_classes=None, remove_incomplete_data=False):
    """Run a leave-one-out experiment end to end.

    Each entry of the loaded data is left out once as test data while a model
    is trained on all remaining entries (see ``execute_single_loo_iteration``).

    Args:
        which_data: "contact" or "radar"; selects the input file and output folders.
        which_model: Model type understood by ``get_model``.
        hyperparams: Dict forwarded to ``execute_single_loo_iteration``.
        metrics: Metrics forwarded to ``get_model``.
        class_map: Dict mapping class name -> label.
        selected_classes: Optional list of class names to keep. Entries with
            no instance of these classes are dropped.
        remove_incomplete_data: If true, drop entries whose labels contain
            fewer than 5 distinct classes.

    Returns:
        DataFrame with one row of results per iteration (columns loss, tp,
        fp, tn, fn, accuracy, precision, recall, auc).

    Raises:
        ValueError: If no data is left after filtering.
    """
    result_columns = ["loss", "tp", "fp", "tn", "fn", "accuracy", "precision", "recall", "auc"]

    # Clean old things from previous run
    clear_old_dirs(which_data, "loo")

    # Read full data. The input file follows which_data, like the other
    # experiments, so the inputs match the model being built.
    # NOTE: allow_pickle=True unpickles the file content; only load trusted files.
    X_loo = list(np.load("model_inputs_and_targets/leave_one_out/X_loo_"+which_data+".npy", allow_pickle=True)[()].values())
    Y_loo = list(np.load("model_inputs_and_targets/leave_one_out/Y_loo.npy", allow_pickle=True)[()].values())

    # Remove subjects with incomplete data. Build new lists instead of popping
    # while iterating, which skips entries and removes the wrong ones.
    if remove_incomplete_data:
        complete = [i for i, subject in enumerate(Y_loo) if len(Counter(subject).keys()) >= 5]
        X_loo = [X_loo[i] for i in complete]
        Y_loo = [Y_loo[i] for i in complete]

    if not X_loo:
        raise ValueError("No data left for the leave-one-out experiment")
    print("Full shapes:", np.concatenate(X_loo, axis=0).shape, np.concatenate(Y_loo, axis=0).shape)

    if selected_classes:
        # Make subsets of data of selected classes, entry by entry, so the
        # leave-one-out grouping is preserved.
        print("Taking subsets of data for classes:", selected_classes)
        wanted_labels = [class_map[selected_class] for selected_class in selected_classes]
        X_new, Y_new = [], []
        for X_subject, Y_subject in zip(X_loo, Y_loo):
            keep = np.isin(np.asarray(Y_subject), wanted_labels)
            if keep.any():
                X_new.append(np.asarray(X_subject)[keep])
                Y_new.append(np.asarray(Y_subject)[keep])

        X_loo, Y_loo = X_new, Y_new
        if not X_loo:
            raise ValueError("No data left after selecting classes " + str(selected_classes))
        print("Subset shapes:", np.concatenate(X_loo, axis=0).shape, np.concatenate(Y_loo, axis=0).shape)

        class_map = {selected_class: class_map[selected_class] for selected_class in selected_classes}

    # Prepare leave one out
    loo = LeaveOneOut()
    print("Number of splits:", loo.get_n_splits(X_loo), loo.get_n_splits(Y_loo), "\n")

    # Collect per-iteration frames and concatenate once; DataFrame.append no
    # longer exists in pandas 2.x.
    iteration_results = []
    for iteration, (train_index, test_index) in enumerate(loo.split(X_loo)):
        print("====================== ITERATION:", iteration, "======================")
        results = execute_single_loo_iteration(which_data, which_model, train_index, test_index, X_loo, Y_loo, hyperparams, metrics, iteration, class_map)
        iteration_results.append(results)

    if not iteration_results:
        return pd.DataFrame(columns=result_columns)
    return pd.concat(iteration_results)

## Main spectrograms experiment

**spectrograms_experiment():** The function below calls other functions to read the data, split it, define and train a model, and evaluate it in a k-fold experiment using spectrograms. Finally, it does some memory cleanup.

In [16]:
import keras

class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that loads batches of stored samples from disk.

    Every ID in ``list_IDs`` is a path relative to
    ``model_inputs_and_targets/spectrograms/<which_data>/`` and is read with
    ``np.load``. ``labels`` holds the class of each ID at the same position.
    """

    def __init__(self, list_IDs, labels, which_data, batch_size, dim, n_channels=6, n_classes=6, shuffle=False):
        """Store the configuration and prepare the first epoch's sample order."""
        self.list_IDs = list_IDs
        self.labels = labels
        self.which_data = which_data
        self.batch_size = batch_size
        self.dim = dim
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch; a trailing partial batch is not counted."""
        full_batches = np.floor(len(self.list_IDs) / self.batch_size)
        return int(full_batches)

    def __getitem__(self, index):
        """Return batch number ``index`` as an ``(inputs, one-hot labels)`` pair."""
        # Positions (into list_IDs / labels) that belong to this batch
        first = index*self.batch_size
        last = (index+1)*self.batch_size
        batch_positions = self.indexes[first:last]

        # File IDs at those positions
        batch_files = [self.list_IDs[position] for position in batch_positions]

        # Load the samples and encode the labels
        X, y = self.__data_generation(batch_files, batch_positions)
        return X, y

    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when ``shuffle`` is enabled."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp, indexes):
        """Load the given files and build one batch.

        Returns a list with one array per index of the last axis of the
        batch, and the labels one-hot encoded to ``n_classes`` columns.
        """
        # Buffers are sized for a full batch
        X = np.empty((self.batch_size, *self.dim))
        y = np.empty((self.batch_size), dtype=int)

        for slot, file_id in enumerate(list_IDs_temp):
            sample_path = "model_inputs_and_targets/spectrograms/" + self.which_data + "/" + file_id
            X[slot,:,:,:] = np.load(sample_path)
            y[slot] = self.labels[indexes[slot]]

        per_channel_inputs = [channel for channel in np.split(X, X.shape[3], axis=3)]
        one_hot_labels = keras.utils.to_categorical(y, num_classes=self.n_classes)
        return (per_channel_inputs, one_hot_labels)

In [16]:
def spectrograms_experiment(which_data, which_model, hyperparams, metrics, class_map, n_folds, selected_classes=None):
    """Train a 2-D CNN on stored spectrograms for the first stratified fold.

    The spectrogram files are listed per class folder, split with a
    stratified k-fold, and the training part of the first fold is divided
    70/30 into train and validation data. Only the first fold is trained, and
    no evaluation is performed, so the returned table is empty.

    Args:
        which_data: "contact" or "radar"; selects the input and output folders.
        which_model: Not used by this function.
        hyperparams: Dict; reads "BATCH_SIZE", "N_EPOCHS" and "VERBOSE".
        metrics: Not used by this function.
        class_map: Dict mapping class folder name -> label.
        n_folds: Number of stratified folds.
        selected_classes: Not used by this function.

    Returns:
        Empty DataFrame with the columns loss, tp, fp, tn, fn, accuracy,
        precision, recall, auc.
    """
    # Clean old things from previous run
    clear_old_dirs(which_data, "spectrograms")

    # Read full data: one folder per class. Listings are sorted so the folds
    # do not depend on the file-system order.
    base_path = "model_inputs_and_targets/spectrograms/"
    X_dict, Y_dict = {}, {}
    for class_name in sorted(os.listdir(base_path+which_data)):
        class_files = sorted(os.listdir(base_path+which_data+"/"+class_name))
        X_dict[class_name] = np.array([class_name+"/"+file for file in class_files])
        Y_dict[class_name] = np.array([class_map[class_name] for file in class_files])

    # Prepare kFold CV
    X_full = np.concatenate([x for x in X_dict.values()])
    Y_full = np.concatenate([y for y in Y_dict.values()])
    skf = StratifiedKFold(n_splits=n_folds)
    print("Number of splits:", skf.get_n_splits(X_full), skf.get_n_splits(Y_full), "\n")

    all_results = pd.DataFrame(columns=["loss", "tp", "fp", "tn", "fn", "accuracy", "precision", "recall", "auc"])
    for iteration, (train_idx, test_idx) in enumerate(skf.split(X_full, Y_full)):
        print("========== Iteration:", iteration, "==========")

        # The files are grouped by class and train_idx is ascending, so taking
        # the first 70% as train would leave the last classes out of training
        # entirely. Split the fold's training part with stratification instead.
        fit_idx, val_idx = train_test_split(
            train_idx,
            test_size=0.3,
            stratify=Y_full[train_idx],
            random_state=0
        )
        X_train_files, Y_train = X_full[fit_idx], Y_full[fit_idx]
        X_val_files, Y_val = X_full[val_idx], Y_full[val_idx]
        X_test_files, Y_test = X_full[test_idx], Y_full[test_idx]

        print(X_test_files)
        print(Y_test)

        # Make a model; one stored sample provides the input shape
        example_instance = np.load(base_path+which_data+"/"+X_train_files[0])
        model = models.create_2d_CNN_small(example_instance, kernel_size=(3,3), n_classes=6)

        # Generators that load the samples from disk batch by batch
        train_gen = DataGenerator(X_train_files, Y_train, which_data, hyperparams["BATCH_SIZE"], dim=example_instance.shape)
        val_gen = DataGenerator(X_val_files, Y_val, which_data, hyperparams["BATCH_SIZE"], dim=example_instance.shape)

        model.fit(x=train_gen,
                  validation_data=val_gen,
                  use_multiprocessing=True,
                  workers=8,
                  epochs=hyperparams["N_EPOCHS"],
                  verbose=hyperparams["VERBOSE"],
                  callbacks=[
                        tf.keras.callbacks.ModelCheckpoint(filepath="model_checkpoints/"+which_data+"_spectrograms/best_model.hdf5", monitor='val_loss', mode='min', save_best_only=True),
                        tf.keras.callbacks.TensorBoard(log_dir="model_training_logs/"+which_data+"_spectrograms/"),
                        tf.keras.callbacks.LearningRateScheduler(scheduler),
                        tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)
                  ])

        # Only the first fold is trained.
        break

    return all_results