# Retrain the model using the complete dataset

Retrain, predict local test set.

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from keras import layers, regularizers
import keras_tuner as kt
import sys
import os

# Sets off SettingWithCopyWarning.
pd.set_option('mode.chained_assignment', None)


# ----------------------------------------
# Flags for working on my different machines.
flag_kaggle = True
# flag_FW = True
# flag_LN = True

try:
    if flag_kaggle:
        sys.path.insert(0, '/kaggle/input/hms-lib')
        base_dir = '/kaggle/input/hms-harmful-brain-activity-classification'
        devset_dir = '/kaggle/input/hms-cwt-scalograms-single-numpy-v1'
        output_dir = ''
except:
    pass 

try:
    if flag_FW:
        sys.path.insert(0, '../lib')
        base_dir = '../../kaggle_data/hms'
        devset_dir = '../data'
        output_dir = 'results/'
except:
    pass 

try:
    if flag_LN:
        sys.path.insert(0, '../lib')
        base_dir = '../../data/hms'
        devset_dir = '../data'
        output_dir = 'results/'
except:
    pass 
# ----------------------------------------

from KLmetric import score

path_train = f'{devset_dir}/05_single_cwt_v1_train.npy'
path_train_items = f'{devset_dir}/05_single_cwt_v1_train_items.npy'
path_val = f'{devset_dir}/05_single_cwt_v1_val.npy'
path_val_items = f'{devset_dir}/05_single_cwt_v1_val_items.npy'
path_test = f'{devset_dir}/05_single_cwt_v1_test.npy'
path_test_items = f'{devset_dir}/05_single_cwt_v1_test_items.npy'

2024-03-14 17:06:09.959098: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-14 17:06:09.959215: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-14 17:06:10.254081: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Merge training and validation sets.

In [2]:
#
# Data generator for training.
#
# coefficients of cwt's arrays
# 5 channels (LP, RP, LT, RP, C)
#

class RetrainDataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, path_to_train_items, path_to_train_data, path_to_val_items, path_to_val_data, batch_size=32, n_classes=6, shuffle=True):
        ''' Initialization
        item: [eeg_id, eeg_sub_id, idx in sgrams (1st index), target,
        seizure_vote, lpd_vote, gpd_vote, lrda_vote,
        grda_vote, other_vote]
        '''
        self.n_channels = 5
        # self.n_freqs = 40

        self.data = np.concatenate([np.load(path_to_train_data), np.load(path_to_val_data)])
        self.items = np.concatenate([np.load(path_to_train_items), np.load(path_to_val_items)])
        self.dim = (self.data.shape[1], self.data.shape[2])
        self.batch_size = batch_size
        self.len = self.data.shape[0]
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.ceil(self.len / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Generate data
        X, y = self.__data_generation(indexes)

        return X, y

    def get_dim(self):
        'Dimensions for the input layer.'
        return (self.dim[0], self.dim[1], self.n_channels)

    def get_num_observations(self):
        return self.data.shape[0]

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(self.len)
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, indexes):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        true_size = len(indexes)
        X = np.empty((true_size, *self.dim, self.n_channels))
        y = np.empty((true_size, self.n_classes), dtype=float)

        # Generate data
        for i, idx in enumerate(indexes):
            item = self.items[idx]
            # print(item)  # Uncomment for testing.
            X[i,:,:,:] = self.data[np.int32(item[2]), :, :, :]
            # Store solution
            y[i,:] = item[-6:]

        return X, y


In [3]:
# Parameters
params = {
    'batch_size': 32,
    'n_classes': 6,
    'shuffle': True
    }

retraining_generator = RetrainDataGenerator(path_train_items, path_train, path_val_items, path_val, **params)

print("Observations in training set:", retraining_generator.get_num_observations())


Observations in training set: 14334


In [4]:
original_file = '/kaggle/input/hms-model-keras-cwt-v1/checkpoint-12-1.model.keras'
!cp '/kaggle/input/hms-model-keras-cwt-v1/checkpoint-12-1.model.keras' '/kaggle/working/checkpoint-12-1.model.keras'
checkpoint_filepath = '/kaggle/working/checkpoint-12-1.model.keras'
model = keras.models.load_model(checkpoint_filepath)


In [5]:
history = model.fit(retraining_generator, epochs=4)


Epoch 1/4


  self._warn_if_super_not_called()


[1m  5/448[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m13s[0m 31ms/step - kl_divergence: 0.6640 - loss: 0.6640

I0000 00:00:1710436129.777527      77 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1710436129.797505      77 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 40ms/step - kl_divergence: 0.7042 - loss: 0.7042
Epoch 2/4
[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 26ms/step - kl_divergence: 0.5473 - loss: 0.5473
Epoch 3/4
[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 26ms/step - kl_divergence: 0.4503 - loss: 0.4503
Epoch 4/4
[1m448/448[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 26ms/step - kl_divergence: 0.3810 - loss: 0.3810


In [6]:
model_filename = f'{output_dir}hms-keras-12-cwt-final.keras'
model.save(model_filename)

In [7]:

#
# Test Data generator for predicting
# 

class TestDataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, path_to_items, path_to_data, batch_size=32, n_classes=6, shuffle=False):
        ''' Initialization
        item: [eeg_id, eeg_sub_id, idx in sgrams (1st index), target,
        seizure_vote, lpd_vote, gpd_vote, lrda_vote,
        grda_vote, other_vote]
        '''
        self.n_channels = 5
        self.data = np.load(path_to_data)
        self.items = np.load(path_to_items)
        self.dim = (self.data.shape[1], self.data.shape[2])
        self.batch_size = batch_size
        self.len = self.data.shape[0]
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.ceil(self.len / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Generate data
        X = self.__data_generation(indexes)

        return X

    def get_dim(self):
        'Dimensions for the input layer.'
        return (self.dim[0], self.dim[1], self.n_channels)

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(self.len)
        # pass 
        
    def __data_generation(self, indexes):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        true_size = len(indexes)
        X = np.empty((true_size, *self.dim, self.n_channels))

        # Generate data
        for i, idx in enumerate(indexes):
            item = self.items[idx]
            # print(item)  # Uncomment for testing.
            X[i,:,:,:] = self.data[np.int32(item[2]), :, :, :]

        return X


In [8]:
params = {
    'batch_size': 32,
    'n_classes': 6,
    }

test_generator = TestDataGenerator(path_test_items, path_test, **params)

y_pred = model.predict(test_generator)

[1m12/39[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m0s[0m 15ms/step

W0000 00:00:1710436195.175339      76 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step


In [9]:
TARGETS = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']

test_items = np.load(path_test_items)
df_test_items = pd.DataFrame(test_items)
df_test_items[0] = df_test_items[0].astype(int)

sub = pd.DataFrame({'eeg_id':df_test_items[0]})
sub[TARGETS] = np.round(y_pred,6)
# sub.to_csv('submission.csv',index=False)

df_test_scoring = df_test_items[[0,4,5,6,7,8,9]]
df_test_scoring.columns = sub.columns

score(df_test_scoring, sub, 'eeg_id')

0.5133006242859438