# Model
This notebook is for majority voting and weighted voting experiments.

## Importing dependencies

In [1]:
DEBUG = False

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
import h5py

import json

from tensorflow.keras import backend as keras
from keras.utils import to_categorical

import matplotlib.pyplot as plt

import numpy as np

import os

import pandas as pd

import pickle

import random

from sklearn import ensemble, preprocessing, svm
from sklearn.dummy import DummyClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, recall_score 
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, ShuffleSplit
from sklearn.utils import shuffle, class_weight

import sqlite3

import tempfile

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.constraints import max_norm
from tensorflow.keras.layers import Dense, Dropout, Flatten, BatchNormalization, Activation
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, AveragePooling1D
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalMaxPooling2D, AveragePooling2D
from tensorflow.keras.layers import DepthwiseConv2D, AveragePooling2D, SeparableConv2D
from tensorflow.keras.layers import LSTM, GRU, RNN
from tensorflow.keras.losses import MAE
from tensorflow.keras.models import Sequential
from tensorflow.keras import optimizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1, l2

# from tensorflow.compat.v1.keras.backend import set_session as keras_set_session
from tensorflow.python.client import device_lib
# visible_devices = '1' #this is the GPU number, this is GPU0 ‘0’ or GPU1 ‘1’
# memory_fraction = 1.0 #This will allow 20% of the GPU memory to be allocated to your process, pick this number large enough for your script but also not too large so others can still do things.
# os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices
# gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=memory_fraction, allow_growth=True) 
# tf_session = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
# keras_set_session(tf_session)

import random

Using TensorFlow backend.


In [3]:
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 11249854408656244004,
 name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 3537972219279262217
 physical_device_desc: "device: XLA_CPU device",
 name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 1702918962599476935
 physical_device_desc: "device: XLA_GPU device",
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 14613293312
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 5435507454229705395
 physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"]

In [4]:
tf.test.is_gpu_available()

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

## Read in data(base)
Load in SQLite database and fetch all tokens (recordings) and the extracted features.

In [5]:
#
#   CONNECT TO LOCAL DATABASE
#
def create_db_connection(db_file_name):
    conn = None

    try:
        conn = sqlite3.connect(db_file_name)
    except sqlite3.Error as e:
        print(e)

    return conn

In [6]:
df_TUEP, df_TUAB, df_TUSZ = None, None, None

In [7]:
#TUEP
conn = create_db_connection('/mnt/disks/data/files/TUEP_files/eeg_recordings_TUEP.db')


query = """SELECT patients.patient_id, sessions.session_id, tokens.token_id, patients.diagnosis, sessions.electrode_setup, tokens.recording_duration, tokens.sampling_freq, tokens.len_of_samples, tokens.file_name, tokens.file_path
           FROM patients 
           
           INNER JOIN sessions ON patients.patient_id == sessions.patient_id 
           INNER JOIN tokens ON sessions.patient_id == tokens.patient_id AND sessions.session_id == tokens.session_id"""
cur = conn.cursor()
df_TUEP = pd.read_sql(query, conn)

#TUAB
conn = create_db_connection('/mnt/disks/data/files/TUAB_files/eeg_recordings_TUAB.db')


query = """SELECT patients.patient_id, sessions.session_id, tokens.token_id, patients.diagnosis, sessions.electrode_setup, tokens.recording_duration, tokens.sampling_freq, tokens.len_of_samples, tokens.file_name, tokens.file_path
           ,patients.patient_train_or_test FROM patients 
           INNER JOIN sessions ON patients.patient_id == sessions.patient_id 
           INNER JOIN tokens ON sessions.patient_id == tokens.patient_id AND sessions.session_id == tokens.session_id"""
cur = conn.cursor()
df_TUAB = pd.read_sql(query, conn)

#TUSZ
conn = create_db_connection('/mnt/disks/data/files/TUSZ_files/eeg_recordings_TUSZ.db')

query = """SELECT patients.patient_id, sessions.session_id, tokens.token_id, tokens.number_of_windows, tokens.diagnosis, sessions.electrode_setup, tokens.recording_duration, tokens.sampling_freq, tokens.len_of_samples, tokens.file_name, tokens.file_path
            FROM patients 
           INNER JOIN sessions ON patients.patient_id == sessions.patient_id 
           INNER JOIN tokens ON sessions.patient_id == tokens.patient_id AND sessions.session_id == tokens.session_id"""
cur = conn.cursor()
df_TUSZ = pd.read_sql(query, conn)

#TUSL
conn = create_db_connection('/mnt/disks/data/files/TUSL_files/eeg_recordings_TUSL.db')

query = """SELECT patients.patient_id, sessions.session_id, tokens.token_id, tokens.number_of_windows, tokens.diagnosis, sessions.electrode_setup, tokens.recording_duration, tokens.sampling_freq, tokens.len_of_samples, tokens.file_name, tokens.file_path
            FROM patients 
           INNER JOIN sessions ON patients.patient_id == sessions.patient_id 
           INNER JOIN tokens ON sessions.patient_id == tokens.patient_id AND sessions.session_id == tokens.session_id"""
cur = conn.cursor()
df_TUSL = pd.read_sql(query, conn)

In [8]:
df_dict = {"TUEP": df_TUEP, "TUAB": df_TUAB, "TUSZ": df_TUSZ, "TUSL": df_TUSL}
h5f_dict = {"TUEP": "/mnt/disks/data/files/TUEP_files/raw_data_TUEP.h5", "TUAB": "/mnt/disks/data/files/TUAB_files/raw_data_TUAB.h5", "TUSZ": "/mnt/disks/data/files/TUSZ_files/raw_data_TUSZ.h5", "TUSL": "/mnt/disks/data/files/TUSL_files/raw_data_TUSL.h5"}
plus_edf_dict = {"TUEP": False, "TUAB": True, "TUSZ": True, "TUSL": True}
class1_dict = {"TUEP": "epilepsy", "TUAB": "abnormal", "TUSZ": "seiz", "TUSL": "slow"}
class2_dict = {"TUEP":"no_epilepsy", "TUAB": "normal","TUSZ": "bckg", "TUSL": "bckg"}
multiple_labels_dict = {"TUEP": False, "TUAB": False, "TUSZ": True, "TUSL": True}

minirocket_dict = {"TUEP": "TUEP_weights.npy", "TUAB": "TUAB_weights.npy", "TUSZ": "TUSZ_weights.npy"}
classifier_dict = {"TUEP": {"TUEP": "TUEP_TUEP_weights", "TUAB":  "TUEP_TUAB_weights", "TUSZ":  "TUEP_TUSZ_weights"},
             "TUAB": {"TUEP": "TUAB_TUEP_weights", "TUAB":  "TUAB_TUAB_weights", "TUSZ":  "TUAB_TUSZ_weights"},
             "TUSZ": {"TUEP": "TUSZ_TUEP_weights", "TUAB":  "TUSZ_TUAB_weights", "TUSZ":  "TUSZ_TUSZ_weights"}}

In [9]:
#Avoid data leakage
dataset1 = "TUEP"
dataset2 = "TUAB"

patient_ids1 = np.unique(df_dict[dataset1]["patient_id"])
patient_ids2 = np.unique(df_dict[dataset2]["patient_id"])
intersection = list(set(patient_ids1) & set(patient_ids2))
print(len(intersection))
print(len(patient_ids1))
print(len(patient_ids2))
#df_TUAB = df_TUAB[df_TUAB["patient_id"].isin(intersection) == False]
#print(len(np.unique(df_TUAB["patient_id"])))
if len(patient_ids1) > len(patient_ids2):
    dataset = dataset1
else:
    dataset = dataset2

df_dict[dataset] = df_dict[dataset][df_dict[dataset]["patient_id"].isin(intersection) == False]
print(len(np.unique(df_dict[dataset]["patient_id"])))

121
200
1240
1119


### Overarching function

In [10]:
def window_sum(l):
    window_sum = 0
    for windows in l:
        windows = windows.split(" ")
        for window in windows:
            if not window == '':
                window_sum += int(window)
    return window_sum

In [11]:
def window_list(l):
    window_list = []
    for windows in l:
        window_element = []
        windows = windows.split(" ")
        for window in windows:
            if not window == '':
                window_element.append(int(window))
        window_list.append(window_element)
    return window_list

In [12]:
def total_windows(l):
    total = 0
    for window_element in l:
        for window in window_element:
            total += window
    return total

In [13]:
#
#   CALCULATE THE CUMULATIVE NUMBER OF TOKENS/SECONDS OF RAW DATA PER PATIENT
#
def get_tokens_cumsum(df, name, patient_ids):

    #       Iterate over patient_ids and keep track of number of tokens per patient
    patient_tokens = []
    for curr_patient_id in patient_ids:
        curr_patient_tokens = df[df["patient_id"] == curr_patient_id]
        if multiple_labels_dict[name]:
            curr_patient_windows = window_sum(df[df["patient_id"] == curr_patient_id]['number_of_windows'])
            patient_tokens.append(int(curr_patient_windows))
        else:
            curr_patient_windows = np.floor(df[df["patient_id"] == curr_patient_id]["recording_duration"] / 10)
            patient_tokens.append(curr_patient_windows.sum())        
        
    patient_tokens = np.asarray(patient_tokens)

    #       Calculate cumulative number of recordings 
    patient_tokens_cumsum = np.cumsum(patient_tokens)
    
    return patient_tokens_cumsum

In [14]:
#
#   EXTRACT VALUES AND LABELS FROM TRAIN/VALIDATION/TEST DATASET; TAKES FEATURES TO BE SELECTED AS ARGUMENTS TO FORWARD TO FEATURE SELECTION ALGORITHM
#
def data_to_values_and_labels(dfs, name):
    dfs_values, dfs_labels = [], []
        
    
    # Read in diagnosis and features/feature images from database
    for df in dfs:
        values = [] # (NR_OF_SAMPLE)
        labels = [] # (NR_OF_SAMPLES,)
            
        rows = df.iterrows()
        for row_index, row in rows:
            # Return list of filenames to read in with generator
            #       Extract file path and name from database
            file_name = row["file_name"]
            file_path = row["file_path"]

            #       Add file path and name to values array
            values.append(f"{'data'}{file_path}{file_name}")

            #       Add patient diagnosis as label to labels array
            if multiple_labels_dict[name]:#TODO: make sure this works
                label_list = []
                diagnoses = row["diagnosis"]
                diagnoses = diagnoses.split(" ")
                for diagnosis in diagnoses:
                    if diagnosis == class1_dict[name]:
                        label_list.append(1)
                    elif diagnosis == class2_dict[name]:
                        label_list.append(0)
                        
                labels.append(label_list)
            
            else:
                labels.append(1 if row["diagnosis"] == class1_dict[name] else 0)
              
        # Add extracted values and labels to arrays for returning 
        dfs_values.append(values)             
        dfs_labels.append(labels)
        
        
    return dfs_values, dfs_labels

In [15]:
def get_data(dataset):
    # Filter out empty arrays
    df = df_dict[dataset]
    if not multiple_labels_dict[dataset]:
        df = df[df["recording_duration"] > 10]        
    
    df_train_all, df_test, df_train, df_val = train_val_test_split2(df, dataset)       

    
    #SHUFFLE
    df_train_all = df_train_all.sample(frac=1).reset_index(drop=True)
    df_train = df_train.sample(frac=1).reset_index(drop=True)
    df_test = df_test.sample(frac=1).reset_index(drop=True)
    df_val = df_val.sample(frac=1).reset_index(drop=True)
    
    # Extract data values and labels from individual datasets
    dfs = [df_train_all, df_test, df_train, df_val]
    dfs_values, dfs_labels = data_to_values_and_labels(dfs, dataset) 

    #       Extract all values and labels from their respective datasets
    train_all_values, train_all_labels = dfs_values[0], dfs_labels[0]
    test_values, test_labels = dfs_values[1], dfs_labels[1]
    train_values, train_labels = dfs_values[2], dfs_labels[2]
    val_values, val_labels = dfs_values[3], dfs_labels[3]
    
    
    train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels = np.asarray(train_all_values), np.asarray(train_all_labels), np.asarray(test_values), np.asarray(test_labels), np.asarray(train_values), np.asarray(train_labels), np.asarray(val_values), np.asarray(val_labels)
    return [train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels]        

In [16]:
class EEG_Data_Generator_Heterogeneous(keras.utils.Sequence):
    
    def __init__(self, file_paths, file_labels, batch_size, dataset, conv=False, class_filter = None):  
        
        # Store file paths and corresponding labels
        self.file_paths = file_paths
        self.file_names = [file_path.split("/")[-1] for file_path in file_paths]
        self.file_labels = file_labels
        self.dataset = dataset
        df = df_dict[dataset]
        
        #Store class filter and class counters
        self.class_filter = class_filter
        self.class0_counter = 0
        self.class1_counter = 0
        
        # Store batch size
        self.batch_size = batch_size
        
        #initialize reshape stuff
        self.conv = conv
        
        # Calculate total lentgh of the data
        df_gen = df[df["file_name"].isin(self.file_names)]
        df_gen = df_gen.set_index("file_name")
        df_gen = df_gen.loc[self.file_names]
    
        if multiple_labels_dict[self.dataset]:
            self.number_of_windows = window_list(df_gen["number_of_windows"].tolist())
            self.total_windows = total_windows(self.number_of_windows)
        else:            
            self.windows = np.floor((df_gen["recording_duration"] - 1) / 10).astype(int)
            self.windows_cumsum = self.windows.cumsum()
            self.total_windows = self.windows_cumsum.iloc[-1]

        # Make heterogeneously randomized list of tuples of file index and sample index in that file
        if multiple_labels_dict[self.dataset]:
            j = 0
            file_and_sample_index = []
            file_and_sample_label = []
            for file_index in range(len(self.file_paths)):
                file_labels = self.file_labels[file_index]
                file_paths = []
                for i in range(len(file_labels)):
                    file_path = self.file_paths[file_index]+'_'+str(i)
                    file_paths.append(file_path)
                    for sample_index in range(self.number_of_windows[file_index][i]):
                        file_and_sample_index.append((file_index, i, sample_index))
                        file_and_sample_label.append(file_labels[i])
                        
                        if file_labels[i] == 0 and self.class0_counter < 10000:
                            self.class0_counter += 1
                        elif file_labels[i] == 1 and self.class1_counter < 10000:
                            self.class1_counter += 1
                        
        
        else:
            j = 0
            file_and_sample_index = []
            file_and_sample_label = []
            for file_index in range(len(self.file_paths)):
                file_path = self.file_paths[file_index]
                file_label = self.file_labels[file_index]
                for sample_index in range(self.windows[file_index]):
                    file_and_sample_index.append((file_index, sample_index))
                    file_and_sample_label.append(file_label)
                            
                    if file_label == 0 and self.class0_counter < 10000:
                        self.class0_counter += 1
                    elif file_label == 1 and self.class1_counter < 10000:
                        self.class1_counter += 1
                    else:
                        pass
                        #print("Error in init from EEG_Data_Generator_Heterogeneous")
                       
                    
                    
                    
        #Shuffle data
        split = list(zip(file_and_sample_index, file_and_sample_label))

        random.shuffle(split)

        file_and_sample_index, file_and_sample_label = list(zip(*split))

        self.split_files_and_samples = []
        self.split_labels = []
        length = int(np.floor(len(file_and_sample_label)/batch_size))
        
        for i in range(length):
            start_index = batch_size*i
            end_index = batch_size*(i+1)
            self.split_files_and_samples.append(np.asarray(file_and_sample_index[start_index:end_index]))
            self.split_labels.append(np.asarray(file_and_sample_label[start_index:end_index]))
            
       
        
    def __len__(self):
        return (np.ceil((self.class0_counter + self.class1_counter) / float(self.batch_size))-1).astype(np.int)           

    def get_class_weights(self):
        total = self.class0_counter + self.class1_counter
        class0_weight = self.class1_counter/total
        class1_weight = self.class0_counter/total
        return {0 : class0_weight, 1 : class1_weight}
    def __getitem__(self, idx):
        
        # Define function to append values and labels to the return values and labels
        def append_values_and_labels(batch_values, file_data_h5, file_path, sample_index, index=None):
            file_name = file_path.split("/")[-1][:-4]
            if plus_edf_dict[self.dataset]:
                file_name = file_name + '.edf'
            
            #Get file values
            if index is None:
                file_values = file_data_h5[file_name][sample_index]   
            else:
                file_values = file_data_h5[file_name+"_"+str(index)][sample_index]   
            
            if self.conv:
                file_values = np.swapaxes(file_values, 0, 1)
            else:
                file_values = file_values.reshape((1, file_values.shape[0], file_values.shape[1])).astype(float)
            batch_values.append(file_values)

            return batch_values
        
        # Read in values for all file paths and duplicate file labels according the the amount of values
        #       Select subset of data
        batch_files_and_samples = self.split_files_and_samples[idx]
        batch_labels = self.split_labels[idx]
        
        #       Open raw data file
        h5f = h5py.File(h5f_dict[self.dataset], 'r')

        #       Get values for filepaths in batch
        batch_values = []
        
        if  multiple_labels_dict[self.dataset]:
            for file_path_index, index, sample_index in batch_files_and_samples:
                file_path = self.file_paths[file_path_index]
                batch_values = append_values_and_labels(batch_values, file_data_h5=h5f, file_path=file_path, sample_index=sample_index, index = index)
        else:
            for file_path_index, sample_index in batch_files_and_samples:
                file_path = self.file_paths[file_path_index]
                batch_values = append_values_and_labels(batch_values, file_data_h5=h5f, file_path=file_path, sample_index=sample_index)
        
        h5f.close()
        self.p = np.random.permutation(len(batch_values))
        batch_values, batch_labels = np.asarray(batch_values)[self.p], np.asarray(batch_labels)[self.p]
        return batch_values, batch_labels

In [17]:
def train_val_test_split2(df, dataset):
     # Get patient ids
    patient_ids = np.unique(df["patient_id"])
    #if dataset == "TUEP":
     #   random.Random(5).shuffle(patient_ids)
        #patient_ids = np.concatenate((patient_ids[:38], patient_ids[111:]))
    # Get (cumulative) number of tokens per unique epilepsy patient
    patient_tokens_cumsum = get_tokens_cumsum(df, dataset, patient_ids)
    total_tokens = np.amax(patient_tokens_cumsum)

    # Split patient_ids based on number of recordings; 80% of all tokens should be training data
    # (of which 64% training data and 16% validation data) and 20% should be test data.
    #       Find patient_index corresponding to a cumulative 80% of the data to split training from test data      
    test_split = next(index for index, curr_patient_tokens_cumsum in enumerate(patient_tokens_cumsum) if curr_patient_tokens_cumsum > 0.8 * total_tokens) + 1
        
    
    #       Use split patient_id to split off 20% of the data for test data
    train_all_patient_ids = patient_ids[:test_split]
    train_all_patient_tokens = patient_tokens_cumsum[:test_split]
    
    test_patient_ids = patient_ids[test_split:]
    test_patient_tokens = patient_tokens_cumsum[test_split:]

    #       Find patient_id corresponding to a cumulative 80% of the training data to split training data from validation data
    total_train_tokens = np.amax(train_all_patient_tokens)
    val_split = next(index for index, curr_patient_tokens_cumsum in enumerate(train_all_patient_tokens) if curr_patient_tokens_cumsum > 0.8 * total_train_tokens) + 1

    #       Use split patient_id to split off 20% of the data for validation data
    train_patient_ids = train_all_patient_ids[:val_split]
    train_patient_tokens = train_all_patient_tokens[:val_split]

    val_patient_ids = train_all_patient_ids[val_split:]
    val_patient_tokens = train_all_patient_tokens[val_split:]
    
    # Split dataframe in train, validation and test sets based on the split made above
    df_train_all = df[df["patient_id"].isin(train_all_patient_ids)]
    df_test = df[df["patient_id"].isin(test_patient_ids)]
    
    df_train = df[df["patient_id"].isin(train_patient_ids)]
    df_val = df[df["patient_id"].isin(val_patient_ids)]
    
    return df_train_all, df_test, df_train, df_val   

In [18]:
def number_class(df, dataset):
    number_class0 = 0
    number_class1 = 0
    
    class0 = class1_dict[dataset]
    class1 = class2_dict[dataset]
    for index, row in df.iterrows():
        if not multiple_labels_dict[dataset]:
            if class1_dict[dataset] == row["diagnosis"]:
                number_class0 += np.floor((row["recording_duration"] - 1) / 10).astype(int)
            else:
                number_class1 += np.floor((row["recording_duration"] - 1) / 10).astype(int)
        else:
            diagnoses = row["diagnosis"]
            diagnoses = diagnoses.split(" ")
            diagnoses.remove('')

            number_of_windows = row["number_of_windows"]
            number_of_windows = number_of_windows.split(" ")
            number_of_windows.remove('')
            for i in range(len(diagnoses)):
                diagnosis = diagnoses[i]
                windows = number_of_windows[i]
                if diagnosis == class0:
                    number_class0 += int(windows)
                elif diagnosis == class1:
                    number_class1 += int(windows)
    return int(number_class0), int(number_class1)

In [19]:
#
#   SPLIT DATA IN ALMOST EQUAL PARTS
#
def stratify_ensemble_split(df, dataset, split_size):
    #shuffle randomly
    df = df.sample(frac=1).reset_index(drop=True)
    
    # Get patient ids
    patient_ids = np.unique(df["patient_id"])

    # Get (cumulative) number of tokens per unique patient
    patient_tokens_cumsum = get_tokens_cumsum(df, dataset)
    
    # take approx split size subset of df
    split_index = next(index for index, curr_patient_tokens_cumsum in enumerate(patient_tokens_cumsum) if curr_patient_tokens_cumsum > split_size) + 1
    stratified_ids = patient_ids[:split_index]
    df = df[df["patient_id"].isin(stratified_ids)]
    
    return df

## Model performance functions

In [20]:
#
#   HELP VISUALIZE LEARNING PROGRESSION
#
def plot_history(history):
    plt.figure(figsize = (12,16))
    plt.subplot(4,2,1)

    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    try:
        plt.plot(history.epoch, np.array(history.history['acc']),'g-',
               label='Train accuracy')
        plt.plot(history.epoch, np.array(history.history['val_acc']),'r-',
               label = 'Validation accuracy')
    except:
        plt.plot(history.epoch, np.array(history.history['accuracy_m']),'g-',
               label='Train accuracy')
        plt.plot(history.epoch, np.array(history.history['val_accuracy_m']),'r-',
               label = 'Validation accuracy')
    plt.ylim([0.0,1.0])
    plt.legend()

    plt.subplot(4,2,2)
    plt.xlabel('Epoch')
    plt.ylabel('Loss minimised by model')
    plt.plot(history.epoch, np.array(history.history['loss']),'g-',
           label='Train loss')
    plt.plot(history.epoch, np.array(history.history['val_loss']),'r-',
           label = 'Validation loss')
    plt.legend()
    
    plt.subplot(4,2,3)
    plt.xlabel('Epoch')
    plt.ylabel('Recall')
    plt.plot(history.epoch, np.array(history.history['recall_m']),'g-',
           label='Train recall')
    plt.plot(history.epoch, np.array(history.history['val_recall_m']),'r-',
           label = 'Validation recall')
    plt.legend()
    
    plt.subplot(4,2,4)
    plt.xlabel('Epoch')
    plt.ylabel('Negative recall')
    plt.plot(history.epoch, np.array(history.history['neg_recall_m']),'g-',
           label='Train neg. recall')
    plt.plot(history.epoch, np.array(history.history['val_neg_recall_m']),'r-',
           label = 'Validation neg. recall')
    plt.legend()
    
    plt.subplot(4,2,5)
    plt.xlabel('Epoch')
    plt.ylabel('Precision')
    plt.plot(history.epoch, np.array(history.history['precision_m']),'g-',
           label='Train precision')
    plt.plot(history.epoch, np.array(history.history['val_precision_m']),'r-',
           label = 'Validation precision')
    plt.legend()
    
    plt.subplot(4,2,6)
    plt.xlabel('Epoch')
    plt.ylabel('Negative precision')
    plt.plot(history.epoch, np.array(history.history['neg_precision_m']),'g-',
           label='Train neg. precision')
    plt.plot(history.epoch, np.array(history.history['val_neg_precision_m']),'r-',
           label = 'Validation neg. precision')
    plt.legend()
    
    plt.subplot(4,2,7)
    plt.xlabel('Epoch')
    plt.ylabel('F1')
    plt.plot(history.epoch, np.array(history.history['f1_m']),'g-',
           label='Train F1')
    plt.plot(history.epoch, np.array(history.history['val_f1_m']),'r-',
           label = 'Validation F1')
    plt.legend()
    
    plt.subplot(4,2,8)
    plt.xlabel('Epoch')
    plt.ylabel('Negative F1')
    plt.plot(history.epoch, np.array(history.history['neg_f1_m']),'g-',
           label='Train neg. F1')
    plt.plot(history.epoch, np.array(history.history['val_neg_f1_m']),'r-',
           label = 'Validation neg. F1')
    plt.legend()
    
    

In [21]:
#
#   CALCULATE RECALL, PRECISION, AND F1-SCORE PERFORMANCE MEASURES
#
#w = 3.2552225249772935
def w_binary_crossentropy(y_true, y_pred):
    weights = y_true * class1_weight + (1. - y_true) * class0_weight
    bce = keras.backend.binary_crossentropy(y_true, y_pred)
    weighted_bce = keras.backend.mean(bce * weights)
    return weighted_bce


def tp_m(y_true, y_pred):
    tp = keras.backend.sum(keras.backend.round(keras.backend.clip(y_true * y_pred, 0, 1)))
    return tp

def fp_m(y_true, y_pred):
    fp = keras.backend.sum(keras.backend.round(keras.backend.clip((1-y_true) * y_pred, 0, 1)))
    return fp

def fn_m(y_true, y_pred):
    fn = keras.backend.sum(keras.backend.round(keras.backend.clip(y_true * (1-y_pred), 0, 1)))
    return fn

def tn_m(y_true, y_pred):
    tn = keras.backend.sum(keras.backend.round(keras.backend.clip((1-y_true) * (1-y_pred), 0, 1)))
    return tn

def accuracy_m(y_true, y_pred):
    accuracy = (tp_m(y_true, y_pred) + tn_m(y_true, y_pred)) / (tp_m(y_true, y_pred) + fp_m(y_true, y_pred) + tn_m(y_true, y_pred) + fn_m(y_true, y_pred) + keras.backend.epsilon())
    return accuracy

def pos_true_m(y_true, y_pred):
    return tp_m(y_true, y_pred) + fn_m(y_true, y_pred) 

def pos_pred_m(y_true, y_pred):
    return tp_m(y_true, y_pred) + fp_m(y_true, y_pred)

def neg_true_m(y_true, y_pred):
    return tn_m(y_true, y_pred) + fp_m(y_true, y_pred)

def neg_pred_m(y_true, y_pred):
    return tn_m(y_true, y_pred) + fn_m(y_true, y_pred)

# Sensitivity
def recall_m(y_true, y_pred):
    recall = tp_m(y_true, y_pred) / (tp_m(y_true, y_pred) + fn_m(y_true, y_pred) + keras.backend.epsilon())
    return recall

# Specificity
def neg_recall_m(y_true, y_pred):
    neg_recall = tn_m(y_true, y_pred) / (tn_m(y_true, y_pred) + fp_m(y_true, y_pred) + keras.backend.epsilon())
    return neg_recall

def precision_m(y_true, y_pred):
    precision = tp_m(y_true, y_pred) / (tp_m(y_true, y_pred) + fp_m(y_true, y_pred) + keras.backend.epsilon())
    return precision

# Negative predictive value
def neg_precision_m(y_true, y_pred):
    neg_precision = tn_m(y_true, y_pred) / (tn_m(y_true, y_pred) + fn_m(y_true, y_pred) + keras.backend.epsilon())
    return neg_precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+keras.backend.epsilon()))

def neg_f1_m(y_true, y_pred):
    neg_precision = neg_precision_m(y_true, y_pred)
    neg_recall = neg_recall_m(y_true, y_pred)
    return 2 * ((neg_precision * neg_recall) / (neg_precision + neg_recall + keras.backend.epsilon()))

def balanced_acc_m(y_true, y_pred):
    return (recall_m(y_true, y_pred) + neg_recall_m(y_true, y_pred))/2

def balanced_acc(preds, labels):
    tp, tn, fp, fn = 0, 0, 0, 0
    for i in range(len(preds)):
        if preds[i] == 0 and labels[i] == 0:
            tn +=1
        elif preds[i] == 0 and labels[i] == 1:
            fn += 1
        elif preds[i] == 1 and labels[i] == 0:
            fp += 1
        elif preds[i] == 1 and labels[i] == 1:
            tp += 1
        else:
            return "Error other values than [0,1] are given" + str(preds[i]) + ", " + str(labels[i])
    if tp + fp == 0 or tn + fn == 0:
        return "Error devision by zero: " + str(tp) + " "  + str(tn) + " " + str(fp) + " " + str(fn)
    return (tp/(tp+fn) + tn/(tn+fp))/2, tp, tn, fp, fn

In [22]:
def perf_measure(y_actual, y_hat):
    y_actual = np.asarray(y_actual)
    y_hat = np.asarray(y_hat)
    
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    
    for i in range(len(y_hat)): 
        if len(y_actual.shape) == 2 and y_actual.shape[1] == 2:
            y_actual_ = np.argmax(y_actual[i])
        else:
            y_actual_ = y_actual[i]
        
        if len(y_hat.shape) == 2 and y_hat.shape[1] == 2:
            y_hat_ = np.argmax(y_hat[i])
        else:
            y_hat_ = y_hat[i]
            
        if y_actual_==y_hat_==1:
            TP += 1
        if y_hat_==1 and y_actual_!=y_hat_:
            FP += 1
        if y_actual_==y_hat_==0:
            TN += 1
        if y_hat_==0 and y_actual_!=y_hat_:
            FN += 1
    
    print(f"tp({TP}), fp({FP}), tn({TN}), fn({FN})")
    
    accuracy = (TP + TN) / (TN + TP + FP + FN)
    recall = TP / (FN + TP)
    neg_recall = TN / (FP + TN)
    
    try:
        precision = TP / (FP + TP)
    except:
        print("No positive predictions...")
        precision = 0
        
    try:
        neg_precision = TN / (FN + TN)
    except:
        print("No negative predictions...")
        neg_precision = 0
            
    try:
        f1 = 2 * (recall * precision) / (recall + precision)
    except:
        f1 = 0
        
    try:
        neg_f1 = 2 * (neg_recall * neg_precision) / (neg_recall + neg_precision)
    except:
        neg_f1 = 0

    aupr = average_precision_score(np.asarray(y_actual), np.asarray(y_hat))
    neg_aupr = average_precision_score(np.asarray([1 - y_a for y_a in y_actual]), np.asarray([1 - y_h for y_h in y_hat]))

    print(f"acc={accuracy}, recall={recall}, neg_recall={neg_recall}, precision={precision}, neg_precision={neg_precision}, f1={f1}, neg_f1={neg_f1}, aupr={aupr}, neg_aupr={neg_aupr}")
    
    return accuracy, recall, neg_recall, precision, neg_precision, f1, neg_f1, aupr, neg_aupr

## Class imbalance

In [23]:
def class_weights(dataset, file_paths, file_labels):
    #number of windows of label, number of windows of window 
    class0 = 0
    class1 = 0
    
    # Store file paths and corresponding labels
    file_names = [file_path.split("/")[-1] for file_path in file_paths]
    df = df_dict[dataset]
        
    # Calculate total lentgh of the data
    df_gen = df[df["file_name"].isin(file_names)]
    df_gen = df_gen.set_index("file_name")
    df_gen = df_gen.loc[file_names]
    
    #calc number of windows
    if multiple_labels_dict[dataset]:
        number_of_windows = window_list(df_gen["number_of_windows"].tolist())
    else:            
        windows = np.floor((df_gen["recording_duration"] - 1) / 10).astype(int)
        
   # Make heterogeneously randomized list of tuples of file index and sample index in that file
    if multiple_labels_dict[dataset]:
        for file_index in range(len(file_paths)):
            this_file_labels = file_labels[file_index]
            for i in range(len(this_file_labels)):
                if this_file_labels[i] == 0:
                    class0 += number_of_windows[file_index][i]
                else:
                    class1 += number_of_windows[file_index][i]
                        
        
    else:
        for file_index in range(len(file_paths)):
            file_label = file_labels[file_index]
            if file_label == 0:
                class0 += windows[file_index]
            else:
                class1 += windows[file_index]    
    
    total = class0 + class1
    class0_weight = class1/total
    class1_weight = class0/total
    return {0 : class0_weight, 1 : class1_weight}
    

# Import rocket stuff

In [24]:
run_tsfresh_check = False #@param {type:"boolean", run:"auto"}

import pickle
with open('run_tsfresh_check.pickle', 'wb') as f:
    pickle.dump(run_tsfresh_check, f)

In [25]:
import_errors = []

import scipy
if scipy.__version__ == '1.4.1':
    print(f'Upgrading package: "{"scipy"}"...')
    !pip install scipy>=1.5
    import_errors.append('scipy')

try:
    import tsai
except ImportError as e:
    print(f'Installing package: "{"tsai"}"...')
    #!pip install -Uqq tsai
    !pip install -Uqq git+https://github.com/timeseriesAI/tsai.git
    import_errors.append('tsai')

if import_errors:
    print(f'The following 3rd party packages had to be installed: {import_errors}.')
    print('Restarting runtime...')
    import os
    os.kill(os.getpid(), 9)
else:
    print('All packages are installed and up-to-date!')

All packages are installed and up-to-date!


In [26]:
#fix bug
import sys
import sklearn
sys.modules['sklearn.ensemble._base'] = sklearn.ensemble.base
# Import native packages
import json
import os
import requests

# Import auxillary packages
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import tabulate

# Import machine learning packages
import fastai

from lightgbm import LGBMClassifier

import pickle

from scipy import signal
from scipy.fft import fft, rfft

import seaborn as sn

from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import RidgeClassifierCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

from sktime.transformations.panel.rocket import MiniRocket as MiniRocket_sktime, MiniRocketMultivariate as MiniRocketMultivariate_sktime

import torch

#from tsai.all import *
#from tsai.all import MiniRocketClassifier as MiniRocket_fastai

import tsfresh
from tsfresh.feature_extraction.settings import ComprehensiveFCParameters, MinimalFCParameters, EfficientFCParameters

In [27]:
import h5py

import json

from tensorflow.keras import backend as keras
#from keras.utils import to_categorical

import matplotlib.pyplot as plt

import numpy as np

import os

import pandas as pd

import pickle

import random

from sklearn import ensemble, preprocessing, svm
from sklearn.dummy import DummyClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, recall_score 
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, ShuffleSplit
from sklearn.utils import shuffle, class_weight

import sqlite3

import tempfile

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.constraints import max_norm
from tensorflow.keras.layers import Dense, Dropout, Flatten, BatchNormalization, Activation
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, AveragePooling1D
from tensorflow.keras.layers import Conv2D, MaxPooling2D, GlobalMaxPooling2D, AveragePooling2D
from tensorflow.keras.layers import DepthwiseConv2D, AveragePooling2D, SeparableConv2D
from tensorflow.keras.layers import LSTM, GRU, RNN
from tensorflow.keras.losses import MAE
from tensorflow.keras.models import Sequential
from tensorflow.keras import optimizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1, l2

# from tensorflow.compat.v1.keras.backend import set_session as keras_set_session
from tensorflow.python.client import device_lib
# visible_devices = '1' #this is the GPU number, this is GPU0 ‘0’ or GPU1 ‘1’
# memory_fraction = 1.0 #This will allow 20% of the GPU memory to be allocated to your process, pick this number large enough for your script but also not too large so others can still do things.
# os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices
# gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=memory_fraction, allow_growth=True) 
# tf_session = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(gpu_options=gpu_options))
# keras_set_session(tf_session)

import random

In [28]:
class majority_voting():
    def __init__(self, minirockets, classifiers):
        self.minirocket0 = minirockets[0]
        self.minirocket1 = minirockets[1]
        self.minirocket2 = minirockets[2]
        
        self.classifier0 = classifiers[0]
        self.classifier1 = classifiers[1]
        self.classifier2 = classifiers[2]        
    
    def predict_generator(self, generator, mode, weights = [1/3,1/3,1/3]):
        if mode == 'hard':
            predictions0, predictions1, predictions2 = [], [], []
            gold = []
            for i in range(len(test_generator)):
                print(i*batch_size)
                X_test, y_test= test_generator[i]
                X_test = X_test.squeeze(axis = 1)
                X_0 = self.minirocket0.transform(X_test)
                r_test0 = self.classifier0.predict(X_0)
                X_1 = self.minirocket1.transform(X_test)
                r_test1 = self.classifier1.predict(X_1)
                X_2 = self.minirocket2.transform(X_test)
                r_test2 = self.classifier2.predict(X_2)
                #correct format
                for element in r_test0:
                    predictions0.append(np.float32(element))
                for element in r_test1:
                    predictions1.append(np.float32(element))
                for element in r_test2:
                    predictions2.append(np.float32(element))
                for element in y_test:
                    gold.append(np.float32(element))

            predictions0, predictions1, predictions2 = np.asarray(predictions0), np.asarray(predictions1), np.asarray(predictions2)
            gold = np.asarray(gold)
            
            print("balanced accuracy, my_model0: ", balanced_acc_m(predictions0, gold))
            print("balanced accuracy, my_model1: ", balanced_acc(predictions1, gold))
            print("balanced accuracy, my_model1: ", balanced_acc(predictions2, gold))
            
            test_preds = [np.int32(1) if (weights[0]*predictions0[i] + weights[1]*predictions1[i] + weights[2]*predictions2[i]) > 0.5 else np.int32(0) for i in range(len(predictions0))]
            print("balanced accuracy", balanced_acc(test_preds, gold))
        elif mode == 'soft':
            test_preds0, test_preds1, test_preds2 = [], [], []
            test_labels_ = []
            for i in range(len(generator)):#maybe use predict proba??
                X, y = generator[i]
                X = X.squeeze(axis = 1)
                X_0 = minirocket0.transform(X)
                X_1 = minirocket1.transform(X)
                X_2 = minirocket2.transform(X)                   
                test_preds0.extend(self.classifier0.predict_proba(X_0))
                test_preds1.extend(self.classifier1.predict_proba(X_1))
                test_preds2.extend(self.classifier2.predict_proba(X_2))
                test_labels_.extend(y)
            print(weights)
            test_preds = [np.int32(1) if (weights[0]*test_preds0[i] + weights[1]*test_preds1[i] + weights[2]*test_preds2[i]) > 0.5 else np.int32(0) for i in range(len(test_preds0))]
            #print("balanced accuracy, my_model0: ", balanced_acc(test_preds0, test_labels_))
            #print("balanced accuracy, my_model1: ", balanced_acc(test_preds1, test_labels_))
            #print("balanced accuracy, my_model1: ", balanced_acc(test_preds2, test_labels_))
            print("balanced accuracy", balanced_acc(test_preds, test_labels_))

In [30]:
#source model
rocket_path = "/home/jupyter/time_series_transfer_learning/transfer_learning/2-rocket/model_weights/"
target_model = "TUEP"
source_models = ["TUEP", "TUAB", "TUSZ"]
source_models.remove(target_model)

#Load in the data
dataset = target_model
batch_size = 512

train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels = get_data(dataset)

#Generator
test_generator = EEG_Data_Generator_Heterogeneous(test_values, test_labels, batch_size, dataset, class_filter = None)
train_generator = EEG_Data_Generator_Heterogeneous(train_values, train_labels, batch_size, dataset, class_filter = None)

# Define MINIROCKET transformer
# Must fit first redundant
minirocket0 = MiniRocketMultivariate_sktime()
X_train, y_train= train_generator[0]
X_train = X_train.squeeze(axis = 1)
minirocket0.fit(X_train, y_train)
minirocket0.parameters = tuple(np.load(rocket_path + minirocket_dict[target_model] , allow_pickle=True))

# Must fit first redundant
minirocket1 = MiniRocketMultivariate_sktime()
X_train, y_train= train_generator[0]
X_train = X_train.squeeze(axis = 1)
minirocket1.fit(X_train, y_train)
minirocket1.parameters = tuple(np.load(rocket_path + minirocket_dict[source_models[0]] , allow_pickle=True))

# Must fit first redundant
minirocket2 = MiniRocketMultivariate_sktime()
X_train, y_train= train_generator[0]
X_train = X_train.squeeze(axis = 1)
minirocket2.fit(X_train, y_train)
minirocket2.parameters = tuple(np.load(rocket_path + minirocket_dict[source_models[1]] , allow_pickle=True))

#Load in the classifiers (ridgeRegressionClassifierCV)
classifier0 = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
classifier0 = pickle.load(open(rocket_path + classifier_dict[target_model][target_model], 'rb'))

classifier1 = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
classifier1 = pickle.load(open(rocket_path + classifier_dict[source_models[0]][target_model], 'rb'))

classifier2 = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
classifier2 = pickle.load(open(rocket_path + classifier_dict[source_models[1]][target_model], 'rb'))

minirockets = [minirocket0, minirocket1, minirocket2]
classifiers = [classifier0, classifier1, classifier2]

In [31]:
v = majority_voting(minirockets, classifiers)

In [32]:
v.predict_generator(test_generator, 'hard')

0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
11264
11776
12288
12800
13312
13824
14336
14848
15360
15872
16384
16896
17408
balanced accuracy, my_model0:  tf.Tensor(0.59874415, shape=(), dtype=float32)
balanced accuracy, my_model1:  (0.68957747527979, 9217, 3321, 1659, 3723)
balanced accuracy, my_model1:  (0.5785393506017888, 9147, 2242, 2738, 3793)
balanced accuracy (0.6565524850561442, 9516, 2877, 2103, 3424)


In [33]:
#source model
rocket_path = "/home/jupyter/time_series_transfer_learning/transfer_learning/2-rocket/model_weights/"
target_model = "TUSZ"
source_models = ["TUEP", "TUAB", "TUSZ"]
source_models.remove(target_model)

#Load in the data
dataset = target_model
batch_size = 512

train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels = get_data(dataset)

#Generator
test_generator = EEG_Data_Generator_Heterogeneous(test_values, test_labels, batch_size, dataset, class_filter = None)
train_generator = EEG_Data_Generator_Heterogeneous(train_values, train_labels, batch_size, dataset, class_filter = None)

# Define MINIROCKET transformer
# Must fit first redundant
minirocket0 = MiniRocketMultivariate_sktime()
X_train, y_train= train_generator[0]
X_train = X_train.squeeze(axis = 1)
minirocket0.fit(X_train, y_train)
minirocket0.parameters = tuple(np.load(rocket_path + minirocket_dict[target_model] , allow_pickle=True))

# Must fit first redundant
minirocket1 = MiniRocketMultivariate_sktime()
X_train, y_train= train_generator[0]
X_train = X_train.squeeze(axis = 1)
minirocket1.fit(X_train, y_train)
minirocket1.parameters = tuple(np.load(rocket_path + minirocket_dict[source_models[0]] , allow_pickle=True))

# Must fit first redundant
minirocket2 = MiniRocketMultivariate_sktime()
X_train, y_train= train_generator[0]
X_train = X_train.squeeze(axis = 1)
minirocket2.fit(X_train, y_train)
minirocket2.parameters = tuple(np.load(rocket_path + minirocket_dict[source_models[1]] , allow_pickle=True))

#Load in the classifiers (ridgeRegressionClassifierCV)
classifier0 = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
classifier0 = pickle.load(open(rocket_path + classifier_dict[target_model][target_model], 'rb'))

classifier1 = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
classifier1 = pickle.load(open(rocket_path + classifier_dict[source_models[0]][target_model], 'rb'))

classifier2 = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
classifier2 = pickle.load(open(rocket_path + classifier_dict[source_models[1]][target_model], 'rb'))

minirockets = [minirocket0, minirocket1, minirocket2]
classifiers = [classifier0, classifier1, classifier2]

In [34]:
v = majority_voting(minirockets, classifiers)

In [35]:
v.predict_generator(test_generator, 'hard')

0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
balanced accuracy, my_model0:  tf.Tensor(0.6422349, shape=(), dtype=float32)
balanced accuracy, my_model1:  (0.5422945970794848, 68, 10442, 97, 657)
balanced accuracy, my_model1:  (0.5518247821719655, 92, 10294, 245, 633)
balanced accuracy (0.534277936465869, 55, 10462, 77, 670)


In [36]:
#source model
rocket_path = "/home/jupyter/time_series_transfer_learning/transfer_learning/2-rocket/model_weights/"
target_model = "TUAB"
source_models = ["TUEP", "TUAB", "TUSZ"]
source_models.remove(target_model)

#Load in the data
dataset = target_model
batch_size = 512

train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels = get_data(dataset)

#Generator
test_generator = EEG_Data_Generator_Heterogeneous(test_values, test_labels, batch_size, dataset, class_filter = None)
train_generator = EEG_Data_Generator_Heterogeneous(train_values, train_labels, batch_size, dataset, class_filter = None)

# Define MINIROCKET transformer
# Must fit first redundant
minirocket0 = MiniRocketMultivariate_sktime()
X_train, y_train= train_generator[0]
X_train = X_train.squeeze(axis = 1)
minirocket0.fit(X_train, y_train)
minirocket0.parameters = tuple(np.load(rocket_path + minirocket_dict[target_model] , allow_pickle=True))

# Must fit first redundant
minirocket1 = MiniRocketMultivariate_sktime()
X_train, y_train= train_generator[0]
X_train = X_train.squeeze(axis = 1)
minirocket1.fit(X_train, y_train)
minirocket1.parameters = tuple(np.load(rocket_path + minirocket_dict[source_models[0]] , allow_pickle=True))

# Must fit first redundant
minirocket2 = MiniRocketMultivariate_sktime()
X_train, y_train= train_generator[0]
X_train = X_train.squeeze(axis = 1)
minirocket2.fit(X_train, y_train)
minirocket2.parameters = tuple(np.load(rocket_path + minirocket_dict[source_models[1]] , allow_pickle=True))

#Load in the classifiers (ridgeRegressionClassifierCV)
classifier0 = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
classifier0 = pickle.load(open(rocket_path + classifier_dict[target_model][target_model], 'rb'))

classifier1 = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
classifier1 = pickle.load(open(rocket_path + classifier_dict[source_models[0]][target_model], 'rb'))

classifier2 = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
classifier2 = pickle.load(open(rocket_path + classifier_dict[source_models[1]][target_model], 'rb'))

minirockets = [minirocket0, minirocket1, minirocket2]
classifiers = [classifier0, classifier1, classifier2]

In [37]:
v = majority_voting(minirockets, classifiers)

In [38]:
v.predict_generator(test_generator, 'hard')

0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
11264
11776
12288
12800
13312
13824
balanced accuracy, my_model0:  tf.Tensor(0.5862753, shape=(), dtype=float32)
balanced accuracy, my_model1:  (0.7521070946296873, 8697, 1240, 260, 4139)
balanced accuracy, my_model1:  (0.6986156642775527, 10045, 922, 578, 2791)
balanced accuracy (0.7691819881583049, 9606, 1185, 315, 3230)


# Weighted voting: similarity

In [60]:
class EEG_Data_Generator_Heterogeneous(keras.utils.Sequence):
    
    def __init__(self, file_paths, file_labels, batch_size, dataset, conv=False, class_filter = None):  
        
        # Store file paths and corresponding labels
        self.file_paths = file_paths
        self.file_names = [file_path.split("/")[-1] for file_path in file_paths]
        self.file_labels = file_labels
        self.dataset = dataset
        df = df_dict[dataset]
        
        #Store class filter and class counters
        self.class_filter = class_filter
        self.class0_counter = 0
        self.class1_counter = 0
        
        # Store batch size
        self.batch_size = batch_size
        
        #initialize reshape stuff
        self.conv = conv
        
        # Calculate total lentgh of the data
        df_gen = df[df["file_name"].isin(self.file_names)]
        df_gen = df_gen.set_index("file_name")
        df_gen = df_gen.loc[self.file_names]
    
        if multiple_labels_dict[self.dataset]:
            self.number_of_windows = window_list(df_gen["number_of_windows"].tolist())
            self.total_windows = total_windows(self.number_of_windows)
        else:            
            self.windows = np.floor((df_gen["recording_duration"] - 1) / 10).astype(int)
            self.windows_cumsum = self.windows.cumsum()
            self.total_windows = self.windows_cumsum.iloc[-1]

        # Make heterogeneously randomized list of tuples of file index and sample index in that file
        if multiple_labels_dict[self.dataset]:
            j = 0
            file_and_sample_index = []
            file_and_sample_label = []
            for file_index in range(len(self.file_paths)):
                file_labels = self.file_labels[file_index]
                file_paths = []
                for i in range(len(file_labels)):
                    file_path = self.file_paths[file_index]+'_'+str(i)
                    file_paths.append(file_path)
                    for sample_index in range(self.number_of_windows[file_index][i]):
                        file_and_sample_index.append((file_index, i, sample_index))
                        file_and_sample_label.append(file_labels[i])
                        
                        if file_labels[i] == 0:
                            self.class0_counter += 1
                        elif file_labels[i] == 1:
                            self.class1_counter += 1
                        
        
        else:
            j = 0
            file_and_sample_index = []
            file_and_sample_label = []
            for file_index in range(len(self.file_paths)):
                file_path = self.file_paths[file_index]
                file_label = self.file_labels[file_index]
                for sample_index in range(self.windows[file_index]):
                    file_and_sample_index.append((file_index, sample_index))
                    file_and_sample_label.append(file_label)
                            
                    if file_label == 0: #and (self.class0_counter+self.class1_counter == 0 or self.class1_counter/(self.class0_counter+self.class1_counter) > 0.83):
                        self.class0_counter += 1
                    elif file_label == 1:# and (self.class0_counter+self.class1_counter == 0 or self.class1_counter/(self.class0_counter+self.class1_counter) < 0.83):
                        self.class1_counter += 1
                    else:
                        pass
                        #print("Error in init from EEG_Data_Generator_Heterogeneous")
                       
                    
                    
                    
        #Shuffle data
        split = list(zip(file_and_sample_index, file_and_sample_label))

        random.shuffle(split)

        file_and_sample_index, file_and_sample_label = list(zip(*split))

        self.split_files_and_samples = []
        self.split_labels = []
        length = int(np.floor(len(file_and_sample_label)/batch_size))
        
        for i in range(length):
            start_index = batch_size*i
            end_index = batch_size*(i+1)
            self.split_files_and_samples.append(np.asarray(file_and_sample_index[start_index:end_index]))
            self.split_labels.append(np.asarray(file_and_sample_label[start_index:end_index]))
            
       
        
    def __len__(self):
        return (np.ceil((self.class0_counter + self.class1_counter) / float(self.batch_size))-1).astype(np.int)           

    def get_class_weights(self):
        total = self.class0_counter + self.class1_counter
        class0_weight = self.class1_counter/total
        class1_weight = self.class0_counter/total
        return {0 : class0_weight, 1 : class1_weight}
    def __getitem__(self, idx):
        
        # Define function to append values and labels to the return values and labels
        def append_values_and_labels(batch_values, file_data_h5, file_path, sample_index, index=None):
            file_name = file_path.split("/")[-1][:-4]
            if plus_edf_dict[self.dataset]:
                file_name = file_name + '.edf'
            
            #Get file values
            if index is None:
                file_values = file_data_h5[file_name][sample_index]   
            else:
                file_values = file_data_h5[file_name+"_"+str(index)][sample_index]   
            
            if self.conv:
                file_values = np.swapaxes(file_values, 0, 1)
            else:
                #file_values = file_values.reshape((1, file_values.shape[0], file_values.shape[1])).astype(float)
                file_values = file_values.reshape((file_values.shape[1], file_values.shape[0]))
            batch_values.append(file_values)

            return batch_values
        
        # Read in values for all file paths and duplicate file labels according the the amount of values
        #       Select subset of data
        batch_files_and_samples = self.split_files_and_samples[idx]
        batch_labels = self.split_labels[idx]
        
        #       Open raw data file
        h5f = h5py.File(h5f_dict[self.dataset], 'r')

        #       Get values for filepaths in batch
        batch_values = []
        
        if  multiple_labels_dict[self.dataset]:
            for file_path_index, index, sample_index in batch_files_and_samples:
                file_path = self.file_paths[file_path_index]
                batch_values = append_values_and_labels(batch_values, file_data_h5=h5f, file_path=file_path, sample_index=sample_index, index = index)
        else:
            for file_path_index, sample_index in batch_files_and_samples:
                file_path = self.file_paths[file_path_index]
                batch_values = append_values_and_labels(batch_values, file_data_h5=h5f, file_path=file_path, sample_index=sample_index)
        
        h5f.close()
        self.p = np.random.permutation(len(batch_values))
        batch_values, batch_labels = np.asarray(batch_values)[self.p], np.asarray(batch_labels)[self.p]
        batch_values, batch_labels = batch_values.tolist(), batch_labels.tolist()
        return batch_values, batch_labels

In [61]:
from tslearn.metrics import cdist_dtw

In [70]:
file1 = open("log_sim.txt","w")

In [None]:
d1 = "TUEP"
d2 = "TUEP"
batch_size = 256

train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels = get_data(d1)
d1_gen = EEG_Data_Generator_Heterogeneous(train_all_values, train_all_labels, batch_size, d1, class_filter = None)

train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels = get_data(d2)
d2_gen = EEG_Data_Generator_Heterogeneous(train_all_values, train_all_labels, batch_size, d2, class_filter = None)

similarity_measure = []
for i in range(50):
    X1 = d1_gen[i][0]
    X2 = d2_gen[i][0]
    for j in range(batch_size):
        similarity_measure.append(cdist_dtw([X1[j]],[X2[j]]))
    print(str(i) + '/' + str(5))
#return macro average of sim measure
print(sum(similarity_measure)/len(similarity_measure))
file1.write(d1 + " " + d2 + ": " + str(sum(similarity_measure)/len(similarity_measure)))

0/5


In [None]:
d1 = "TUEP"
d2 = "TUAB"
batch_size = 256

train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels = get_data(d1)
d1_gen = EEG_Data_Generator_Heterogeneous(train_all_values, train_all_labels, batch_size, d1, class_filter = None)

train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels = get_data(d2)
d2_gen = EEG_Data_Generator_Heterogeneous(train_all_values, train_all_labels, batch_size, d2, class_filter = None)

similarity_measure = []
for i in range(50):
    X1 = d1_gen[i][0]
    X2 = d2_gen[i][0]
    for j in range(batch_size):
        similarity_measure.append(cdist_dtw([X1[j]],[X2[j]]))
    print(str(i) + '/' + str(5))
#return macro average of sim measure
print(sum(similarity_measure)/len(similarity_measure))
file1.write(d1 + " " + d2 + ": " + str(sum(similarity_measure)/len(similarity_measure)))

In [None]:
d1 = "TUEP"
d2 = "TUSZ"
batch_size = 256

train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels = get_data(d1)
d1_gen = EEG_Data_Generator_Heterogeneous(train_all_values, train_all_labels, batch_size, d1, class_filter = None)

train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels = get_data(d2)
d2_gen = EEG_Data_Generator_Heterogeneous(train_all_values, train_all_labels, batch_size, d2, class_filter = None)

similarity_measure = []
for i in range(50):
    X1 = d1_gen[i][0]
    X2 = d2_gen[i][0]
    for j in range(batch_size):
        similarity_measure.append(cdist_dtw([X1[j]],[X2[j]]))
    print(str(i) + '/' + str(5))
#return macro average of sim measure
print(sum(similarity_measure)/len(similarity_measure))
file1.write(d1 + " " + d2 + ": " + str(sum(similarity_measure)/len(similarity_measure)))

In [None]:
d1 = "TUAB"
d2 = "TUAB"
batch_size = 256

train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels = get_data(d1)
d1_gen = EEG_Data_Generator_Heterogeneous(train_all_values, train_all_labels, batch_size, d1, class_filter = None)

train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels = get_data(d2)
d2_gen = EEG_Data_Generator_Heterogeneous(train_all_values, train_all_labels, batch_size, d2, class_filter = None)

similarity_measure = []
for i in range(50):
    X1 = d1_gen[i][0]
    X2 = d2_gen[i][0]
    for j in range(batch_size):
        similarity_measure.append(cdist_dtw([X1[j]],[X2[j]]))
    print(str(i) + '/' + str(5))
#return macro average of sim measure
print(sum(similarity_measure)/len(similarity_measure))
file1.write(d1 + " " + d2 + ": " + str(sum(similarity_measure)/len(similarity_measure)))

In [None]:
d1 = "TUAB"
d2 = "TUSZ"
batch_size = 256

train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels = get_data(d1)
d1_gen = EEG_Data_Generator_Heterogeneous(train_all_values, train_all_labels, batch_size, d1, class_filter = None)

train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels = get_data(d2)
d2_gen = EEG_Data_Generator_Heterogeneous(train_all_values, train_all_labels, batch_size, d2, class_filter = None)

similarity_measure = []
for i in range(50):
    X1 = d1_gen[i][0]
    X2 = d2_gen[i][0]
    for j in range(batch_size):
        similarity_measure.append(cdist_dtw([X1[j]],[X2[j]]))
    print(str(i) + '/' + str(5))
#return macro average of sim measure
print(sum(similarity_measure)/len(similarity_measure))
file1.write(d1 + " " + d2 + ": " + str(sum(similarity_measure)/len(similarity_measure)))

In [None]:
d1 = "TUSZ"
d2 = "TUSZ"
batch_size = 256

train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels = get_data(d1)
d1_gen = EEG_Data_Generator_Heterogeneous(train_all_values, train_all_labels, batch_size, d1, class_filter = None)

train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels = get_data(d2)
d2_gen = EEG_Data_Generator_Heterogeneous(train_all_values, train_all_labels, batch_size, d2, class_filter = None)

similarity_measure = []
for i in range(50):
    X1 = d1_gen[i][0]
    X2 = d2_gen[i][0]
    for j in range(batch_size):
        similarity_measure.append(cdist_dtw([X1[j]],[X2[j]]))
    print(str(i) + '/' + str(5))
#return macro average of sim measure
print(sum(similarity_measure)/len(similarity_measure))
file1.write(d1 + " " + d2 + ": " + str(sum(similarity_measure)/len(similarity_measure)))

In [None]:
file1.close()

# Optimized voting

In [58]:
class predictions():
    def __init__(self, minirockets, classifiers):
        self.minirocket0 = minirockets[0]
        self.minirocket1 = minirockets[1]
        self.minirocket2 = minirockets[2]
        
        self.classifier0 = classifiers[0]
        self.classifier1 = classifiers[1]
        self.classifier2 = classifiers[2]       
    
    def predict_generator(self, generator):
        preds0, preds1, preds2 = [], [], []
        gold = []
        for i in range(len(generator)):
            print(i*batch_size)
            X_test, y_test= test_generator[i]
            X_test = X_test.squeeze(axis = 1)
            X_0 = self.minirocket0.transform(X_test)
            r_test0 = self.classifier0.predict(X_0)
            X_1 = self.minirocket1.transform(X_test)
            r_test1 = self.classifier1.predict(X_1)
            X_2 = self.minirocket2.transform(X_test)
            r_test2 = self.classifier2.predict(X_2)
            #correct format
            for element in r_test0:
                preds0.append(np.float32(element))
            for element in r_test1:
                preds1.append(np.float32(element))
            for element in r_test2:
                preds2.append(np.float32(element))
            for element in y_test:
                gold.append(np.float32(element))
        
        preds = [[preds0[i],preds1[i],preds2[i]] for i in range(len(preds0))]
        return preds, gold

In [59]:
#source model
rocket_path = "/home/jupyter/time_series_transfer_learning/transfer_learning/2-rocket/model_weights/"
target_model = "TUSZ"
source_models = ["TUEP", "TUAB", "TUSZ"]
source_models.remove(target_model)

#Load in the data
dataset = target_model
batch_size = 512

train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels = get_data(dataset)

#Generator
test_generator = EEG_Data_Generator_Heterogeneous(test_values, test_labels, batch_size, dataset, class_filter = None)
train_generator = EEG_Data_Generator_Heterogeneous(train_all_values, train_all_labels, batch_size, dataset, class_filter = None)

# Define MINIROCKET transformer
# Must fit first redundant
minirocket0 = MiniRocketMultivariate_sktime()
X_train, y_train= train_generator[0]
X_train = X_train.squeeze(axis = 1)
minirocket0.fit(X_train, y_train)
minirocket0.parameters = tuple(np.load(rocket_path + minirocket_dict[target_model] , allow_pickle=True))

# Must fit first redundant
minirocket1 = MiniRocketMultivariate_sktime()
X_train, y_train= train_generator[0]
X_train = X_train.squeeze(axis = 1)
minirocket1.fit(X_train, y_train)
minirocket1.parameters = tuple(np.load(rocket_path + minirocket_dict[source_models[0]] , allow_pickle=True))

# Must fit first redundant
minirocket2 = MiniRocketMultivariate_sktime()
X_train, y_train= train_generator[0]
X_train = X_train.squeeze(axis = 1)
minirocket2.fit(X_train, y_train)
minirocket2.parameters = tuple(np.load(rocket_path + minirocket_dict[source_models[1]] , allow_pickle=True))

#Load in the classifiers (ridgeRegressionClassifierCV)
classifier0 = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
classifier0 = pickle.load(open(rocket_path + classifier_dict[target_model][target_model], 'rb'))

classifier1 = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
classifier1 = pickle.load(open(rocket_path + classifier_dict[source_models[0]][target_model], 'rb'))

classifier2 = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
classifier2 = pickle.load(open(rocket_path + classifier_dict[source_models[1]][target_model], 'rb'))

minirockets = [minirocket0, minirocket1, minirocket2]
classifiers = [classifier0, classifier1, classifier2]

In [None]:
p = predictions(minirockets, classifiers)
input, labels = p.predict_generator(train_generator)

scores = []
for i1 in range(2):
    for i2 in range(2):
        for i3 in range(2):
            for i4 in range(2):
                for i5 in range(2):
                    for i6 in range(2):
                        dict = {(0,0,0): 0, (1,1,1): 1, (0,0,1): i1, (0,1,0): i2, (1,0,0): i3, (0,1,1): i4, (1,0,1): i5, (1,1,0): i6}
                        predictions = []
                        for i in range(len(input)):
                            predictions.append(dict[tuple(input[i])])
                        acc = balanced_acc(predictions, labels)
                        print(acc)
                        scores.append([acc[0], [i1,i2,i3,i4,i5,i6], acc])

0


In [None]:
scores = sorted(scores, key = lambda a: a[0])
print(scores[-1])

In [None]:
test_input, test_labels = p.predict_generator(test_generator)

In [52]:
#optimized algo with dict values calculated on train_all set, we always set (0,0,0) to 0 and (1,1,1) to 1 because obv
dict = {(0,0,0): 0, (1,1,1): 1, (0,0,1): 0, (0,1,0): 0, (1,0,0): 1, (0,1,1): 1, (1,0,1): 0, (1,1,0): 1}
predictions = []
for i in range(len(test_input)):
    predictions.append(dict[tuple(test_input[i])])
acc = balanced_acc(predictions, test_labels)
print(acc)

(0.7040391759348238, 9805, 3229, 1724, 3162)


In [54]:
class predictions():
    def __init__(self, minirockets, classifiers):
        self.minirocket0 = minirockets[0]
        self.minirocket1 = minirockets[1]
        self.minirocket2 = minirockets[2]
        
        self.classifier0 = classifiers[0]
        self.classifier1 = classifiers[1]
        self.classifier2 = classifiers[2]       
    
    def predict_generator(self, generator):
        preds0, preds1, preds2 = [], [], []
        gold = []
        for i in range(len(generator)):
            print(i*batch_size)
            X_test, y_test= test_generator[i]
            X_test = X_test.squeeze(axis = 1)
            X_0 = self.minirocket0.transform(X_test)
            r_test0 = self.classifier0.predict(X_0)
            X_1 = self.minirocket1.transform(X_test)
            r_test1 = self.classifier1.predict(X_1)
            X_2 = self.minirocket2.transform(X_test)
            r_test2 = self.classifier2.predict(X_2)
            #correct format
            for element in r_test0:
                preds0.append(np.float32(element))
            for element in r_test1:
                preds1.append(np.float32(element))
            for element in r_test2:
                preds2.append(np.float32(element))
            for element in y_test:
                gold.append(np.float32(element))
        
        preds = [[preds0[i],preds1[i],preds2[i]] for i in range(len(preds0))]
        return preds, gold

In [53]:
#source model
rocket_path = "/home/jupyter/time_series_transfer_learning/transfer_learning/2-rocket/model_weights/"
target_model = "TUAB"
source_models = ["TUEP", "TUAB", "TUSZ"]
source_models.remove(target_model)

#Load in the data
dataset = target_model
batch_size = 512

train_all_values, train_all_labels, test_values, test_labels, train_values, train_labels, val_values, val_labels = get_data(dataset)

#Generator
test_generator = EEG_Data_Generator_Heterogeneous(test_values, test_labels, batch_size, dataset, class_filter = None)
train_generator = EEG_Data_Generator_Heterogeneous(train_values, train_labels, batch_size, dataset, class_filter = None)

# Define MINIROCKET transformer
# Must fit first redundant
minirocket0 = MiniRocketMultivariate_sktime()
X_train, y_train= train_generator[0]
X_train = X_train.squeeze(axis = 1)
minirocket0.fit(X_train, y_train)
minirocket0.parameters = tuple(np.load(rocket_path + minirocket_dict[target_model] , allow_pickle=True))

# Must fit first redundant
minirocket1 = MiniRocketMultivariate_sktime()
X_train, y_train= train_generator[0]
X_train = X_train.squeeze(axis = 1)
minirocket1.fit(X_train, y_train)
minirocket1.parameters = tuple(np.load(rocket_path + minirocket_dict[source_models[0]] , allow_pickle=True))

# Must fit first redundant
minirocket2 = MiniRocketMultivariate_sktime()
X_train, y_train= train_generator[0]
X_train = X_train.squeeze(axis = 1)
minirocket2.fit(X_train, y_train)
minirocket2.parameters = tuple(np.load(rocket_path + minirocket_dict[source_models[1]] , allow_pickle=True))

#Load in the classifiers (ridgeRegressionClassifierCV)
classifier0 = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
classifier0 = pickle.load(open(rocket_path + classifier_dict[target_model][target_model], 'rb'))

classifier1 = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
classifier1 = pickle.load(open(rocket_path + classifier_dict[source_models[0]][target_model], 'rb'))

classifier2 = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
classifier2 = pickle.load(open(rocket_path + classifier_dict[source_models[1]][target_model], 'rb'))

minirockets = [minirocket0, minirocket1, minirocket2]
classifiers = [classifier0, classifier1, classifier2]

In [55]:
p = predictions(minirockets, classifiers)
input, labels = p.predict_generator(train_generator)

scores = []
for i1 in range(2):
    for i2 in range(2):
        for i3 in range(2):
            for i4 in range(2):
                for i5 in range(2):
                    for i6 in range(2):
                        dict = {(0,0,0): 0, (1,1,1): 1, (0,0,1): i1, (0,1,0): i2, (1,0,0): i3, (0,1,1): i4, (1,0,1): i5, (1,1,0): i6}
                        predictions = []
                        for i in range(len(input)):
                            predictions.append(dict[tuple(input[i])])
                        acc = balanced_acc(predictions, labels)
                        print(acc)
                        scores.append([acc[0], [i1,i2,i3,i4,i5,i6], acc])
scores = sorted(scores, key = lambda a: a[0])
print(scores[-1])

0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
11264
11776
12288
12800
13312
13824
14336
14848
15360
15872
16384
16896
17408
17920
18432
18944
19456
(0.7301137430952387, 9033, 1960, 90, 8885)
(0.7312511740476262, 9773, 1880, 170, 8145)
(0.7495727147248032, 10954, 1820, 230, 6964)
(0.7507101456771906, 11694, 1740, 310, 6224)
(0.7541105959669933, 10767, 1860, 190, 7151)
(0.7552480269193806, 11507, 1780, 270, 6411)
(0.7735695675965577, 12688, 1720, 330, 5230)
(0.7747069985489452, 13428, 1640, 410, 4490)
(0.6844851477870734, 10020, 1660, 390, 7898)
(0.6856225787394608, 10760, 1580, 470, 7158)
(0.7039441194166378, 11941, 1520, 530, 5977)
(0.7050815503690253, 12681, 1440, 610, 5237)
(0.7084820006588279, 11754, 1560, 490, 6164)
(0.7096194316112153, 12494, 1480, 570, 5424)
(0.7279409722883923, 13675, 1420, 630, 4243)
(0.7290784032407798, 14415, 1340, 710, 3503)
(0.728286012975098, 9658, 1881, 169, 8260)
(0.7294234439274854, 10398, 18

In [56]:
test_input, test_labels = p.predict_generator(test_generator)

0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
11264
11776
12288
12800
13312
13824


In [57]:

#optimized algo with dict values calculated on train_all set, we always set (0,0,0) to 0 and (1,1,1) to 1 because obv
dict = {(0,0,0): 0, (1,1,1): 1, (0,0,1): 0, (0,1,0): 0, (1,0,0): 0, (0,1,1): 1, (1,0,1): 1, (1,1,0): 1}
predictions = []
for i in range(len(test_input)):
    predictions.append(dict[tuple(test_input[i])])
acc = balanced_acc(predictions, test_labels)
print(acc)

(0.7733687514576368, 9667, 1150, 293, 3226)
