In [None]:
import os #, gc
import time
import json
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
from sklearn.model_selection import KFold, GroupKFold, StratifiedGroupKFold
import tensorflow.keras.backend as K, gc
import tensorflow as tf
import pandas as pd, numpy as np
import matplotlib.pyplot as plt


print('TensorFlow version =', tf.__version__)

# Choose the distribution strategy from the number of visible GPUs:
# more than one GPU -> mirrored training, otherwise a single fixed device.
gpus = tf.config.list_physical_devices('GPU')
if len(gpus) > 1:
    strategy = tf.distribute.MirroredStrategy()
    print(f'Using {len(gpus)} GPUs')
else:
    strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
    print(f'Using {len(gpus)} GPU')



In [None]:
# Weight source. None -> the training cells fit new models in this notebook;
# a directory path -> previously trained weights are loaded from that path.
LOAD_MODELS_FROM = '/kaggle/input/effnetb0-2-pop-model-train-twice-v1'

# Which spectrogram families are concatenated into the network input.
USE_KAGGLE_SPECTROGRAMS, USE_EEG_SPECTROGRAMS = True, True

# Enables TensorFlow's automatic mixed-precision graph rewrite.
MIXED_PRECISION = True

# Flags for re-reading the raw spectrogram / EEG spectrogram files.
# In the cells shown they are only recorded in `model_info`.
READ_SPEC_FILES, READ_EEG_SPEC_FILES = False, False

# When True, every cell guarded by `if not EVAL_ONLY` is skipped.
EVAL_ONLY = True

In [None]:
from pathlib import Path

# --- Model identity -------------------------------------------------------
VERSION = 1

MODEL_NAME = 'effnetB0_2_pop_twice_train_v{}'.format(VERSION)
MODEL_LOC = 'models/' + MODEL_NAME

# Free-text description stored alongside the model metadata.
NOTES = (
    "Split training data into 2 populations based on total vote sum.\n"
    "EffNetB0 efficientnet package with unique eeg_id and votes.\n"
    "Using GKF and no augmentation.\n"
)

# --- Reproducibility / data split -----------------------------------------
SEED = 2444
BATCH_SIZE = 32
FOLDS = 5  # number of cross-validation folds

# --- Learning-rate schedule (consumed by lrfn / lrfn2) ----------------------
LR_START = 1e-4
LR_MAX = 1e-4
LR_RAMPUP_EPOCHS = 0
LR_SUSTAIN_EPOCHS = 2
LR_STEP_DECAY = 0.5
LR_EVERY = 1

# --- Training length / early stopping -------------------------------------
EPOCHS = 5
PATIENCE = 2
START_FROM_EPOCH = 2

TIMESTAMP = pd.Timestamp.now('utc')

# Run metadata persisted next to the model weights. Key order is kept stable
# because the dict is serialised to JSON as-is.
model_info = dict(
    api=f'TensorFlow version = {tf.__version__}',
    datetime=TIMESTAMP.isoformat(),
    filename='efficientnet_tf_unique_vote.ipynb',
    folds=FOLDS,
    model=MODEL_NAME,
    notes=NOTES,
    path=MODEL_LOC,
    version=VERSION,
    SEED=SEED,
    BATCH_SIZE=BATCH_SIZE,
    EPOCHS=EPOCHS,
    FOLDS=FOLDS,
    PATIENCE=PATIENCE,
    USE_KAGGLE_SPECTROGRAMS=USE_KAGGLE_SPECTROGRAMS,
    USE_EEG_SPECTROGRAMS=USE_EEG_SPECTROGRAMS,
    MIXED_PRECISION=MIXED_PRECISION,
    READ_SPEC_FILES=READ_SPEC_FILES,
    READ_EEG_SPEC_FILES=READ_EEG_SPEC_FILES,
    LR_START=LR_START,
    LR_MAX=LR_MAX,
    LR_RAMPUP_EPOCHS=LR_RAMPUP_EPOCHS,
    LR_SUSTAIN_EPOCHS=LR_SUSTAIN_EPOCHS,
    LR_STEP_DECAY=LR_STEP_DECAY,
    LR_EVERY=LR_EVERY,
    START_FROM_EPOCH=START_FROM_EPOCH,
)

# Seed both TensorFlow and NumPy global generators.
tf.random.set_seed(SEED)
np.random.seed(SEED)

# Create the output directory on first use.
if not os.path.exists(MODEL_LOC):
    Path(MODEL_LOC).mkdir(parents=True, exist_ok=True)

# Write the metadata; later cells re-write this file after adding CV scores.
model_info_path = os.path.join(MODEL_LOC, 'model_info.json')
with open(model_info_path, 'w') as f:
    json.dump(model_info, f)

In [None]:
# Turn on the grappler auto-mixed-precision rewrite when requested.
if not MIXED_PRECISION:
    print('Using full precision')
else:
    tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
    print('Mixed precision enabled')

# Load Train Data

In [None]:
df = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/train.csv')

# The six target columns are taken positionally: the last six columns of train.csv.
TARGETS = df.columns[-6:]
print('Train shape:', df.shape )
print('Targets', list(TARGETS))

# Total number of votes cast for each labelled row.
vote_columns = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
df['total_evaluators'] = df[vote_columns].sum(axis=1)

# One row per distinct (eeg_id, vote vector) combination.
df_uniq = df.drop_duplicates(subset=['eeg_id', *TARGETS])
# df_uniq = df_uniq.sort_values('eeg_id', ascending=True)
print(f'There are {df.patient_id.nunique()} patients in the training data.')
print(f'There are {df.eeg_id.nunique()} EEG IDs in the training data.')
print(f'There are {df_uniq.shape[0]} unique eeg_id + votes in the training data.')

# How many EEG ids have 1, 2, 3, ... distinct vote vectors.
votes_per_eeg = df_uniq.eeg_id.value_counts().value_counts()
votes_per_eeg.plot(
    kind='bar',
    title=f'Distribution of Count of EEG w Unique Vote: {df_uniq.shape[0]} examples',
);

In [None]:
%%time
if not EVAL_ONLY:
    # The .npy file wraps a pickled Python object, hence allow_pickle=True and
    # .item() to unwrap it. NOTE(review): only load pickles from trusted datasets.
    spectrograms = np.load(
        '/kaggle/input/brain-spectrograms/specs.npy',
        allow_pickle=True,
    ).item()

In [None]:
%%time
if not EVAL_ONLY:
    # Pickled object keyed by label_id (see the `isin(all_eegs.keys())` filter
    # in the training-frame cell). NOTE(review): only load trusted pickles.
    all_eegs = np.load(
        '/kaggle/input/eeg-spectrogram-by-lead-id-unique/eeg_specs.npy',
        allow_pickle=True,
    ).item()

# Training DataFrame

In [None]:
if not EVAL_ONLY:
    # Keep only the labelled rows that have a precomputed EEG spectrogram.
    has_eeg_spec = df['label_id'].isin(all_eegs.keys())
    train = df[has_eeg_spec].copy()

    # Population 1: fewer than 10 total votes; population 2: everything else.
    pop_1_idx = train['total_evaluators'] < 10

    # Turn raw vote counts into per-row probability distributions.
    raw_votes = train[TARGETS].values
    y_data = raw_votes / raw_votes.sum(axis=1, keepdims=True)
    train[TARGETS] = y_data

    train['target'] = train['expert_consensus']

    # reset_index() keeps the previous index as an 'index' column.
    train_pop_1 = train.loc[pop_1_idx].copy().reset_index()
    train_pop_2 = train.loc[~pop_1_idx].copy().reset_index()
    # train = train.reset_index()
    print('Pop 1: train unique eeg_id + votes shape:', train_pop_1.shape )

    # Distribution of the vote totals across the filtered training rows.
    plt.figure(figsize=(10, 6))
    plt.hist(train['total_evaluators'], bins=10, color='blue', edgecolor='black')
    plt.title('Histogram of Total Evaluators')
    plt.xlabel('Total Evaluators')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

In [None]:
import albumentations as albu
# Class name -> class index, and the inverse lookup.
TARS = dict(Seizure=0, LPD=1, GPD=2, LRDA=3, GRDA=4, Other=5)
TARS2 = {class_idx: class_name for class_name, class_idx in TARS.items()}
 # 256
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that assembles ``(X, y)`` batches of spectrograms.

    Every sample is a ``128 x 256 x 8`` float32 array. Channels 0-3 are built
    from ``specs`` (clipped, log-scaled, standardised per image, cropped to 256
    time steps and vertically padded); channels 4-7 are copied from
    ``eeg_specs``. ``y`` holds the six ``TARGETS`` columns of each row and
    stays all-zero in ``'test'`` mode.

    Args:
        data: DataFrame with one row per sample.
        specs: mapping from spectrogram id to a 2-D array indexed
            ``[time_row, feature_column]`` (400 feature columns are read).
        eeg_specs: mapping from ``label_id`` (granular modes) or ``eeg_id``
            (all other modes) to an array assignable to a ``128 x 256 x 4`` slot.
        batch_size: samples per batch; the last batch may be smaller.
        shuffle: reshuffle the sample order at the end of every epoch.
        augment: apply a random horizontal flip to every sample.
        mode: selects which row columns locate the window.
            ``'test'`` reads from row 0 of ``specs[row.spec_id]``;
            ``'ensemble'`` / ``'granular_train'`` / ``'granular_valid'`` use
            ``spectrogram_label_offset_seconds // 2`` and ``spectrogram_id``;
            any other value uses ``(min + max) // 4`` and ``spec_id``.
    """

    # Modes whose rows are addressed by label_id / spectrogram_id.
    _GRANULAR_MODES = ('ensemble', 'granular_train', 'granular_valid')

    def __init__(self, data, specs, eeg_specs,
                 batch_size=32, shuffle=False, augment=False, mode='train'): 
        self.dim_1 = 256  # number of time steps kept per spectrogram
        self.data = data
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.augment = augment
        self.mode = mode
        self.specs = specs
        self.eeg_specs = eeg_specs
        self.on_epoch_end()
        
    def __len__(self):
        """Number of batches per epoch (the final partial batch counts)."""
        ct = int( np.ceil( len(self.data) / self.batch_size ) )
        return ct

    def __getitem__(self, index):
        """Build batch number ``index``, optionally augmented."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        X, y = self.__data_generation(indexes)
        if self.augment: X = self.__augment_batch(X) 
        return X, y

    def on_epoch_end(self):
        """Reset the sample order; reshuffle it when ``shuffle`` is set."""
        self.indexes = np.arange( len(self.data) )
        if self.shuffle: np.random.shuffle(self.indexes)
                        
    def __data_generation(self, indexes):
        """Assemble ``X`` and ``y`` for the given positional row indexes."""
        
        X = np.zeros((len(indexes),128,self.dim_1,8),dtype='float32')
        y = np.zeros((len(indexes),6),dtype='float32')
        
        for j,i in enumerate(indexes):
            row = self.data.iloc[i]
            # Locate the first spectrogram row of the 300-row window.
            if self.mode=='test': 
                r = 0
                spec_id = row.spec_id
            elif self.mode in self._GRANULAR_MODES:
                r = int(row['spectrogram_label_offset_seconds'] // 2)
                spec_id = row['spectrogram_id']
            else: 
                r = int( (row['min'] + row['max'])//4 )
                spec_id = row.spec_id

            for k in range(4):
                # EXTRACT 300 ROWS OF SPECTROGRAM (100 feature columns per group k)
                img = self.specs[spec_id][r:r+300,k*100:(k+1)*100].T
                
                # LOG TRANSFORM SPECTROGRAM
                img = np.clip(img,np.exp(-4),np.exp(8))
                img = np.log(img)
                
                # STANDARDIZE PER IMAGE
                ep = 1e-6
                m = np.nanmean(img.flatten())
                s = np.nanstd(img.flatten())
                img = (img-m)/(s+ep)
                img = np.nan_to_num(img, nan=0.0)
                
                # CROP TO 256 TIME STEPS; the 100 feature rows sit in rows 14..113
                X[j,14:-14,:,k] = img[:,22:-22] / 2.0
            
            if self.mode in self._GRANULAR_MODES:
                # granular modes use label_id as the unique identifier
                img = self.eeg_specs[row.label_id]
            else:
                # EEG SPECTROGRAMS
                img = self.eeg_specs[row.eeg_id]
            X[j,:,:,4:] = img

            if self.mode!='test':
                y[j,] = row[TARGETS]
            
        return X,y
    
    def __random_transform(self, img):
        """Flip one sample horizontally with probability 0.5."""
        composition = albu.Compose([
            albu.HorizontalFlip(p=0.5),
            #albu.CoarseDropout(max_holes=8,max_height=32,max_width=32,fill_value=0,p=0.5),
        ])
        return composition(image=img)['image']
            
    def __augment_batch(self, img_batch):
        """Apply ``__random_transform`` to every sample, in place."""
        for i in range(img_batch.shape[0]):
            img_batch[i, ] = self.__random_transform(img_batch[i, ])
        return img_batch

# Train Scheduler

In [None]:
def lrfn(epoch):
    """Learning rate for a zero-based ``epoch``: ramp-up, plateau, step decay.

    Linear ramp from ``LR_START`` to ``LR_MAX`` over ``LR_RAMPUP_EPOCHS``, then
    ``LR_MAX`` for ``LR_SUSTAIN_EPOCHS``, then multiplied by ``LR_STEP_DECAY``
    once every ``LR_EVERY`` epochs.
    """
    if epoch < LR_RAMPUP_EPOCHS:
        return (LR_MAX - LR_START) / LR_RAMPUP_EPOCHS * epoch + LR_START
    if epoch < LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS:
        return LR_MAX
    decay_steps = (epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS)//LR_EVERY
    return LR_MAX * LR_STEP_DECAY**decay_steps

# Preview the schedule over the configured number of epochs.
rng = list(range(EPOCHS))
y = list(map(lrfn, rng))
plt.figure(figsize=(10, 4))
plt.plot(rng, y, 'o-')
plt.xlabel('epoch', size=14)
plt.ylabel('learning rate', size=14)
plt.title('Step Training Schedule', size=16)
plt.show()

# Callback that applies lrfn at the start of every epoch.
LR = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=True)

In [None]:
def lrfn2(epoch):
    """Second learning-rate schedule; currently the same formula as ``lrfn``.

    Returns the rate for a zero-based ``epoch``: linear ramp-up, a plateau at
    ``LR_MAX``, then a geometric step decay.
    """
    ramp_end = LR_RAMPUP_EPOCHS
    plateau_end = LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS
    if epoch < ramp_end:
        rate = (LR_MAX - LR_START) / LR_RAMPUP_EPOCHS * epoch + LR_START
    elif epoch < plateau_end:
        rate = LR_MAX
    else:
        n_decays = (epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS)//LR_EVERY
        rate = LR_MAX * LR_STEP_DECAY**n_decays
    return rate

# Preview the second schedule over the configured number of epochs.
rng = list(range(EPOCHS))
y = list(map(lrfn2, rng))
plt.figure(figsize=(10, 4))
plt.plot(rng, y, 'o-')
plt.xlabel('epoch', size=14)
plt.ylabel('learning rate', size=14)
plt.title('Step Training Schedule', size=16)
plt.show()

# Callback that applies lrfn2 at the start of every epoch.
LR2 = tf.keras.callbacks.LearningRateScheduler(lrfn2, verbose=True)

## Build EfficientNet Model

In [None]:
!pip install --no-index --find-links=/kaggle/input/tf-efficientnet-whl-files /kaggle/input/tf-efficientnet-whl-files/efficientnet-1.1.1-py3-none-any.whl

In [None]:
import efficientnet.tfkeras as efn


def build_model():
    """Build and compile the EfficientNetB0 classifier.

    The network takes the ``128 x 256 x 8`` tensors produced by
    ``DataGenerator`` (channels 0-3: Kaggle spectrograms, channels 4-7: EEG
    spectrograms), tiles the selected channels into one single-channel image,
    repeats it three times to form an RGB-like input, and feeds it to an
    EfficientNetB0 backbone initialised from the noisy-student weights file.
    A global average pool and a 6-way softmax produce the class distribution.

    Returns:
        A compiled ``tf.keras.Model`` using KL-divergence loss and Adam.
    """
    
    # The input shape must match what DataGenerator emits (128, 256, 8);
    # a different static shape makes Keras reject the batches at fit/predict.
    inp = tf.keras.Input(shape=(128,256,8))
    base_model = efn.EfficientNetB0(include_top=False, weights=None, input_shape=None)
    base_model.load_weights('/kaggle/input/tf-efficientnet-noisy-student-weights/efficientnet-b0_noisy-student_notop.h5')
    
    # RESHAPE INPUT 128x256x8 => 512x512x3 MONOTONE IMAGE
    # KAGGLE SPECTROGRAMS: stack the 4 channels vertically -> 512 x 256 x 1
    x1 = [inp[:,:,:,i:i+1] for i in range(4)]
    x1 = tf.keras.layers.Concatenate(axis=1)(x1)
    # EEG SPECTROGRAMS: stack the 4 channels vertically -> 512 x 256 x 1
    x2 = [inp[:,:,:,i+4:i+5] for i in range(4)]
    x2 = tf.keras.layers.Concatenate(axis=1)(x2)
    # MAKE 512X512X3 (side by side when both sources are enabled)
    if USE_KAGGLE_SPECTROGRAMS and USE_EEG_SPECTROGRAMS:
        x = tf.keras.layers.Concatenate(axis=2)([x1,x2])
    elif USE_EEG_SPECTROGRAMS: 
        x = x2
    else: 
        x = x1
    # Repeat the single channel 3x because the backbone expects 3 channels.
    x = tf.keras.layers.Concatenate(axis=3)([x,x,x])

    # OUTPUT
    x = base_model(x)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
#   x = tf.keras.layers.Dense(1024, activation='relu')(x)
#   x = tf.keras.layers.Dropout(0.2)(x)
    # dtype='float32' keeps the softmax output in full precision.
    x = tf.keras.layers.Dense(6,activation='softmax', dtype='float32')(x)

    # COMPILE MODEL
    model = tf.keras.Model(inputs=inp, outputs=x)
    # The initial rate here is overridden whenever a LearningRateScheduler
    # callback is passed to fit().
    opt = tf.keras.optimizers.Adam(learning_rate = 1e-3)
#     opt = tf.keras.optimizers.legacy.Adam(learning_rate = 1e-3)
    loss = tf.keras.losses.KLDivergence()

    model.compile(loss=loss, optimizer = opt) 
        
    return model

# Train Model
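We train with Group K-Fold grouped on patient ID, separately for each of the two vote-count populations. The training cells run only when `EVAL_ONLY` is `False`. If `LOAD_MODELS_FROM` is `None`, new models are trained in this notebook version; otherwise saved weights are loaded from the path in `LOAD_MODELS_FROM`.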

In [None]:
import io
import itertools
import matplotlib as mpl
from tensorflow import keras
from sklearn import metrics

def plot_confusion_matrix(cm, class_names):
    """Draw a confusion matrix as a matplotlib figure and return the figure.

    The heat map shows the raw ``cm`` values; the number printed in each cell
    is the row-normalised value rounded to two decimals.

    Args:
        cm: square 2-D array of counts (rows: true class, columns: predicted).
        class_names: tick labels, one per class.
    """
    figure = plt.figure(figsize=(8, 8))
    plt.imshow(cm, interpolation='nearest', cmap=mpl.colormaps['Greens'])
    plt.title("Confusion matrix")
    plt.colorbar()

    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)

    # Normalise each row so the cell labels read as per-class rates.
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    cm = np.around(cm.astype('float') / row_totals, decimals=2)
    threshold = cm.max() / 2.

    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for i in range(n_rows):
        for j in range(n_cols):
            # White text on dark cells, black text on light cells.
            if cm[i, j] > threshold:
                color = "white"
            else:
                color = "black"
            plt.text(j, i, cm[i, j], horizontalalignment="center", color=color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    return figure

def plot_to_image(figure):
    """Render the current matplotlib figure to a PNG tensor and close ``figure``.

    Returns:
        A uint8 tensor of shape ``(1, height, width, 4)``.
    """
    png_buffer = io.BytesIO()
    # savefig writes matplotlib's *current* figure into the buffer.
    plt.savefig(png_buffer, format='png')
    plt.close(figure)
    png_buffer.seek(0)

    decoded = tf.image.decode_png(png_buffer.getvalue(), channels=4)
    # Add the leading batch dimension.
    return tf.expand_dims(decoded, 0)

In [None]:
import warnings

# Silence every Python warning for the rest of the session.
warnings.filterwarnings(action='ignore')

In [None]:
if not EVAL_ONLY:
    # Stage 1: grouped K-fold training / out-of-fold (OOF) prediction on
    # population 1 (the rows with total_evaluators < 10).
    import sys
    sys.path.append('/kaggle/input/kaggle-kl-div')
    from kaggle_kl_div import score

    EARLY_STOP = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        restore_best_weights=True,
        patience=PATIENCE,
        start_from_epoch=START_FROM_EPOCH,
        min_delta=0.0025
    )

    all_oof = []
    all_true = []
    all_histories = []
 
    # Folds are grouped on patient_id so a patient never spans train and valid.
    gkf = GroupKFold(n_splits=FOLDS)
    val_indices = {}
    val_label_ids = {}
    preds={}
    for fold_idx, (train_index, valid_index) in enumerate(gkf.split(train_pop_1, train_pop_1.target, train_pop_1.patient_id)):  
        val_indices[fold_idx] = [int(i) for i in valid_index]
        print('#'*25)
        print(f'### Fold {fold_idx+1}')
        train_valid = train_pop_1.iloc[valid_index]
        val_label_ids[fold_idx] = [int(i) for i in train_valid['label_id']]

        train_gen = DataGenerator(train_pop_1.iloc[train_index], specs=spectrograms, eeg_specs=all_eegs,
                                  shuffle=True, batch_size=BATCH_SIZE, augment=False, mode='granular_train')
        valid_gen = DataGenerator(train_valid, specs=spectrograms, eeg_specs=all_eegs,
                                  shuffle=False, batch_size=64, mode='granular_valid')

        print(f'### train size {len(train_index)}, valid size {len(valid_index)}')
        print('#'*25)

        # Fresh graph and fresh model for every fold.
        K.clear_session()
        with strategy.scope():
            model = build_model()
        h5_filename = f'EffNet_pop1_v{VERSION}_f{fold_idx}.h5'
        if LOAD_MODELS_FROM is None:
            # ModelCheckpoint keeps only the epoch with the lowest val_loss.
            h5_path = os.path.join(MODEL_LOC, h5_filename)
            CHECKPOINT = tf.keras.callbacks.ModelCheckpoint(
                h5_path,         # Path where to save the model
                save_best_only=True,     # Only save a model if `val_loss` has improved
                monitor='val_loss',      # Monitor 'val_loss' for improvement
                mode='min'               # The smaller the `val_loss`, the better
            )

            history = model.fit(train_gen, verbose=1,
                  validation_data = valid_gen,
                  epochs=EPOCHS, callbacks = [EARLY_STOP, LR, CHECKPOINT])
            all_histories.append(history)
            # Reload the best checkpoint before predicting.
            model.load_weights(h5_path)

        else:
            # load weights from pop 2 training to get oof 
            h5_filename = f'EffNet_pop2_v{VERSION}_f{fold_idx}.h5'
            model.load_weights(os.path.join(LOAD_MODELS_FROM, h5_filename))

        oof = model.predict(valid_gen, verbose=1)
        all_oof.append(oof)
        all_true.append(train_valid[TARGETS].values)

        preds[fold_idx] = oof

        del model, oof
        gc.collect()


    all_oof = np.concatenate(all_oof)
    all_true = np.concatenate(all_true)


    if LOAD_MODELS_FROM is None:
        # Persist training curves and the fold assignment of this run.
        history_dict = {}
        for fold, h in enumerate(all_histories):
            history_dict[fold] = str(h.history)

        with open('histories.json', 'w') as file:
            json.dump(history_dict, file, indent=4)

        with open('val_indices.json', 'w') as file:
            json.dump(val_indices, file, indent=4)

        with open('val_label_ids.json', 'w') as file:
            json.dump(val_label_ids, file, indent=4)

     
    # OOF predictions keyed by label_id, in fold order, for later ensembling.
    flattened_list = [item for sublist in val_label_ids.values() for item in sublist]
    sub = pd.DataFrame({'label_id':flattened_list})
    # np.vstack needs a real sequence; a dict view is rejected by newer NumPy.
    sub[TARGETS] = np.vstack(list(preds.values()))
    sub.to_csv(os.path.join(MODEL_LOC, 'ensemble_data1.csv'), index=False)
    
    oof = pd.DataFrame(all_oof.copy())
    oof['id'] = np.arange(len(oof))

    true = pd.DataFrame(all_true.copy())
    true['id'] = np.arange(len(true))

    cv = score(solution=true, submission=oof, row_id_column_name='id')
    print('CV Score KL-Div =',cv)
    model_info['CV Score KL-Div Pop 1'] = cv
    with open(model_info_path, 'w') as f:
        json.dump(model_info, f)

In [None]:
if not EVAL_ONLY:
    # Stage 2: grouped K-fold training / out-of-fold (OOF) prediction on
    # population 2 (the rows with total_evaluators >= 10). When training, each
    # fold starts from the population-1 weights of the same fold index.
    # NOTE(review): the population-1 and population-2 folds are split
    # independently, so a patient validated here may have contributed to the
    # population-1 weights loaded for this fold — confirm this is acceptable.
    import sys
    sys.path.append('/kaggle/input/kaggle-kl-div')
    from kaggle_kl_div import score
    
    EARLY_STOP = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        restore_best_weights=True,
        patience=PATIENCE,
        start_from_epoch=START_FROM_EPOCH,
        min_delta=0.0025
    )

    all_oof2 = []
    all_true2 = []
    all_histories2 = []
 
    # Folds are grouped on patient_id so a patient never spans train and valid.
    gkf = GroupKFold(n_splits=FOLDS)
    val_indices2 = {}
    val_label_ids2 = {}
    preds2={}
    for fold_idx, (train_index, valid_index) in enumerate(gkf.split(train_pop_2, train_pop_2.target, train_pop_2.patient_id)):  

        val_indices2[fold_idx] = [int(i) for i in valid_index]
        print('#'*25)
        print(f'### Fold {fold_idx+1}')
        train_valid = train_pop_2.iloc[valid_index]
        val_label_ids2[fold_idx] = [int(i) for i in train_valid['label_id']]

        train_gen = DataGenerator(train_pop_2.iloc[train_index], specs=spectrograms, eeg_specs=all_eegs,
                                  shuffle=True, batch_size=BATCH_SIZE, augment=False, mode='granular_train')
        valid_gen = DataGenerator(train_valid, specs=spectrograms, eeg_specs=all_eegs,
                                  shuffle=False, batch_size=64, mode='granular_valid')

        print(f'### train size {len(train_index)}, valid size {len(valid_index)}')
        print('#'*25)

        # Fresh graph and fresh model for every fold.
        K.clear_session()
        with strategy.scope():
            model = build_model()
        h5_filename_1 = f'EffNet_pop1_v{VERSION}_f{fold_idx}.h5'
        h5_filename_2 = f'EffNet_pop2_v{VERSION}_f{fold_idx}.h5'
        if LOAD_MODELS_FROM is None:
            # ModelCheckpoint keeps only the epoch with the lowest val_loss.
            h5_path = os.path.join(MODEL_LOC, h5_filename_2)
            CHECKPOINT = tf.keras.callbacks.ModelCheckpoint(
                h5_path,         # Path where to save the model
                save_best_only=True,     # Only save a model if `val_loss` has improved
                monitor='val_loss',      # Monitor 'val_loss' for improvement
                mode='min'               # The smaller the `val_loss`, the better
            )

            # Warm start from the population-1 weights of the same fold index.
            model.load_weights(os.path.join(MODEL_LOC, h5_filename_1))
            history = model.fit(train_gen, verbose=1,
                  validation_data = valid_gen,
                  epochs=EPOCHS, callbacks = [EARLY_STOP, LR, CHECKPOINT])
            all_histories2.append(history)
            # Reload the best checkpoint before predicting.
            model.load_weights(h5_path)

        else:
            # Load this fold's own population-2 weights. The previous code read
            # `h5_filename`, a name this cell never sets, so every fold picked
            # up whatever value the population-1 cell left behind.
            model.load_weights(os.path.join(LOAD_MODELS_FROM, h5_filename_2))

        oof = model.predict(valid_gen, verbose=1)
        all_oof2.append(oof)
        all_true2.append(train_valid[TARGETS].values)

        preds2[fold_idx] = oof

        del model, oof
        gc.collect()


    all_oof2 = np.concatenate(all_oof2)
    all_true2 = np.concatenate(all_true2)


    if LOAD_MODELS_FROM is None:
        # Persist this stage's own curves and fold assignment under "2"-suffixed
        # names so the population-1 files are not overwritten.
        history_dict = {}
        for fold, h in enumerate(all_histories2):
            history_dict[fold] = str(h.history)

        with open('histories2.json', 'w') as file:
            json.dump(history_dict, file, indent=4)

        with open('val_indices2.json', 'w') as file:
            json.dump(val_indices2, file, indent=4)

        with open('val_label_ids2.json', 'w') as file:
            json.dump(val_label_ids2, file, indent=4)

     
    # OOF predictions keyed by label_id, in fold order, for later ensembling.
    flattened_list = [item for sublist in val_label_ids2.values() for item in sublist]
    sub = pd.DataFrame({'label_id':flattened_list})
    # np.vstack needs a real sequence; a dict view is rejected by newer NumPy.
    sub[TARGETS] = np.vstack(list(preds2.values()))
    sub.to_csv(os.path.join(MODEL_LOC, 'ensemble_data2.csv'), index=False)
    
    oof = pd.DataFrame(all_oof2.copy())
    oof['id'] = np.arange(len(oof))

    true = pd.DataFrame(all_true2.copy())
    true['id'] = np.arange(len(true))

    cv = score(solution=true, submission=oof, row_id_column_name='id')
    print('CV Score KL-Div =',cv)
    model_info['CV Score KL-Div Pop 2'] = cv
    with open(model_info_path, 'w') as f:
        json.dump(model_info, f)

In [None]:
if not EVAL_ONLY:
    # Score both populations together: stack the OOF predictions and the
    # matching targets in the same (pop 1, pop 2) order and add a row id.
    stacked_pred = np.concatenate([all_oof, all_oof2])
    stacked_true = np.concatenate([all_true, all_true2])

    oof = pd.DataFrame(stacked_pred.copy())
    oof['id'] = np.arange(len(oof))

    true = pd.DataFrame(stacked_true.copy())
    true['id'] = np.arange(len(true))

    cv = score(solution=true, submission=oof, row_id_column_name='id')
    print('CV Score KL-Div =',cv)
    
    # ens = pd.read_csv(os.path.join(MODEL_LOC, 'ensemble_data1.csv'))
    # ens2 = pd.read_csv(os.path.join(MODEL_LOC, 'ensemble_data2.csv'))
    # pd.concat([ens, ens2], ignore_index=True).to_csv('ensemble_data.csv', index=False)

# Infer Test and Create Submission CSV
Below we load the 5 population-2 EfficientNet fold models, average their predictions on the test data, and write the result to `submission.csv`.

In [None]:
import pywt, librosa

# Wavelet name passed to denoise(); a falsy value disables denoising.
USE_WAVELET = None

# Display names of the four electrode chains, in the same order as FEATS.
NAMES = ['LL', 'LP', 'RP', 'RR']

# Electrode columns of each chain; adjacent pairs are differenced.
FEATS = [
    ['Fp1', 'F7', 'T3', 'T5', 'O1'],
    ['Fp1', 'F3', 'C3', 'P3', 'O1'],
    ['Fp2', 'F8', 'T4', 'T6', 'O2'],
    ['Fp2', 'F4', 'C4', 'P4', 'O2'],
]

# DENOISE FUNCTION
def maddest(d, axis=None):
    """Mean absolute deviation of ``d`` around its mean along ``axis``.

    Args:
        d: array-like of numbers.
        axis: axis to reduce over; ``None`` reduces over all elements.

    Returns:
        A scalar when ``axis`` is ``None``, otherwise an array with ``axis``
        removed.
    """
    # keepdims=True keeps the mean broadcastable against `d` for every axis;
    # without it, axis=1 on a 2-D input subtracts the wrong values or fails.
    centre = np.mean(d, axis, keepdims=True)
    return np.mean(np.absolute(d - centre), axis)

def denoise(x, wavelet='haar', level=1):
    """Wavelet-threshold denoising of a 1-D signal.

    The signal is decomposed with ``pywt.wavedec`` (periodic mode), every
    detail band is hard-thresholded, and the signal is reconstructed.

    Args:
        x: input signal.
        wavelet: wavelet name passed to PyWavelets.
        level: selects ``coeff[-level]`` as the band the noise scale is
            estimated from.

    Returns:
        The reconstructed signal from ``pywt.waverec``.
    """
    coeff = pywt.wavedec(x, wavelet, mode="per")

    # Noise scale from the selected detail band, then a sqrt(2 log n) threshold.
    sigma = (1/0.6745) * maddest(coeff[-level])
    uthresh = sigma * np.sqrt(2*np.log(len(x)))

    # coeff[0] (the approximation band) is left untouched.
    coeff[1:] = [pywt.threshold(band, value=uthresh, mode='hard') for band in coeff[1:]]

    return pywt.waverec(coeff, wavelet, mode='per')

def spectrogram_from_eeg(parquet_path, display=False, offset=None):
    """Convert one EEG parquet file into a 4-channel mel-spectrogram image.

    A 10,000-row window of the recording is selected. For each of the four
    electrode chains in ``FEATS`` the four adjacent-electrode differences are
    turned into mel spectrograms, rescaled, and averaged into one channel.

    Args:
        parquet_path: path of the EEG parquet file; it must contain the
            electrode columns named in ``FEATS``.
        display: when truthy, plot the four channels and some of the signals.
        offset: first row of the 10,000-row window; ``None`` selects the
            window centred in the recording.

    Returns:
        float32 array of shape ``(128, 256, 4)``, one channel per chain.
    """
    
    # LOAD MIDDLE 50 SECONDS OF EEG SERIES (10,000 rows; sr=200 is used below)
    eeg = pd.read_parquet(parquet_path)
#     print(eeg.shape)
    if offset is None:
        middle = (len(eeg)-10_000)//2
        eeg = eeg.iloc[middle:middle+10_000]
    else:
        eeg = eeg.iloc[offset:offset+10_000]
    
    # VARIABLE TO HOLD SPECTROGRAM
    img = np.zeros((128,256,4),dtype='float32')
    
    if display: plt.figure(figsize=(10,7))
    # One entry is appended per (chain, pair), i.e. 16 signals in total.
    signals = []
    for k in range(4):
        COLS = FEATS[k]
        
        for kk in range(4):
        
            # COMPUTE PAIR DIFFERENCES
            x = eeg[COLS[kk]].values - eeg[COLS[kk+1]].values

            # FILL NANS with the signal mean; an all-NaN signal becomes zeros
            m = np.nanmean(x)
            if np.isnan(x).mean() < 1: 
                x = np.nan_to_num(x,nan=m)
            else: x[:] = 0

            # DENOISE (skipped while USE_WAVELET is falsy)
            if USE_WAVELET:
                x = denoise(x, wavelet=USE_WAVELET)
            signals.append(x)

            # RAW SPECTROGRAM
            # hop_length is chosen so that a full window gives about 256 frames.
            mel_spec = librosa.feature.melspectrogram(y=x, sr=200, hop_length=len(x)//256, 
                  n_fft=1024, n_mels=128, fmin=0, fmax=20, win_length=128)

            # LOG TRANSFORM
            # Trim the frame count down to a multiple of 32.
            # NOTE(review): the += below needs width == 256, which presumably
            # holds only for a full 10,000-sample window — confirm for short files.
            width = (mel_spec.shape[1]//32)*32
            mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max).astype(np.float32)[:,:width]

            # STANDARDIZE TO -1 TO 1
            mel_spec_db = (mel_spec_db+40)/40 
            img[:,:,k] += mel_spec_db
                
        # AVERAGE THE 4 MONTAGE DIFFERENCES
        img[:,:,k] /= 4.0
        
        if display:
            plt.subplot(2,2,k+1)
            plt.imshow(img[:,:,k],aspect='auto',origin='lower')
#             plt.title(f'EEG {eeg_id} - Spectrogram {NAMES[k]}')
            
    if display: 
        plt.show()
        plt.figure(figsize=(10,5))
        # The `offset` argument is no longer needed here; the name is reused
        # as the vertical shift between the plotted traces.
        offset = 0
        # NOTE(review): `signals` holds 16 pair differences, but only
        # signals[0..3] (the first chain's pairs) are plotted, labelled with
        # chain names and shifted using signals[3-k] — looks inconsistent; confirm.
        for k in range(4):
            if k>0: offset -= signals[3-k].min()
            plt.plot(range(10_000),signals[k]+offset,label=NAMES[3-k])
            offset += signals[3-k].max()
        plt.legend()
#         plt.title(f'EEG {eeg_id} Signals')
        plt.show()
        print(); print('#'*25); print()
        
    return img

# Test Data

In [None]:
if not EVAL_ONLY:
    # Release the training-only spectrogram dictionaries before inference.
    del all_eegs, spectrograms
    gc.collect()

In [None]:
# Test metadata: one row per recording to predict.
test = pd.read_csv(
    '/kaggle/input/hms-harmful-brain-activity-classification/test.csv'
)
print('Test shape', test.shape)
test.head()

In [None]:
# Load every test spectrogram parquet into memory, keyed by its integer id.
PATH2 = '/kaggle/input/hms-harmful-brain-activity-classification/test_spectrograms/'
files2 = os.listdir(PATH2)
print(f'There are {len(files2)} test spectrogram parquets')

spectrograms2 = {}
for file_no, file_name in enumerate(files2):
    # Progress marker every 100 files.
    if file_no % 100 == 0:
        print(file_no, ', ', end='')
    spec_frame = pd.read_parquet(f'{PATH2}{file_name}')
    spec_key = int(file_name.split('.')[0])
    # Drop the first column and keep the remaining columns as a plain array.
    spectrograms2[spec_key] = spec_frame.iloc[:, 1:].values

# The data loader's 'test' mode reads `row.spec_id`.
test = test.rename(columns={'spectrogram_id': 'spec_id'})

In [None]:
# Build an EEG spectrogram for every distinct test eeg_id.
PATH2 = '/kaggle/input/hms-harmful-brain-activity-classification/test_eegs/'
DISPLAY = 1  # number of leading recordings to plot
EEG_IDS2 = test.eeg_id.unique()
all_eegs2 = {}

print('Converting Test EEG to Spectrograms...')
print()
for position, eeg_id in enumerate(EEG_IDS2):
    # Plots are drawn only for the first DISPLAY recordings.
    show_plots = position < DISPLAY
    all_eegs2[eeg_id] = spectrogram_from_eeg(f'{PATH2}{eeg_id}.parquet', show_plots)

In [None]:
# Predict the test set with each population-2 fold model and average the folds.
preds = []
model = build_model()
test_gen = DataGenerator(test, spectrograms2, all_eegs2, shuffle=False, batch_size=64, mode='test')

# Saved weights come from LOAD_MODELS_FROM when it is set, else from MODEL_LOC.
weights_dir = LOAD_MODELS_FROM if LOAD_MODELS_FROM else MODEL_LOC
for i in range(FOLDS):
    print(f'Fold {i+1}')
    model.load_weights(os.path.join(weights_dir, f'EffNet_pop2_v{VERSION}_f{i}.h5'))
    pred = model.predict(test_gen, verbose=1)
    preds.append(pred)

# Element-wise mean over the fold predictions.
pred = np.mean(preds, axis=0)
print()
print('Test preds shape', pred.shape)

In [None]:
# Assemble the submission: one row per test eeg_id plus the six class columns.
sub = pd.DataFrame(dict(eeg_id=test.eeg_id.values))
sub[TARGETS] = pred
sub.to_csv('submission.csv', index=False)
print('Submissionn shape', sub.shape)
sub.head()

In [None]:
# Sanity check: the last six columns of every row should sum to one.
sub.iloc[:, -6:].sum(axis='columns')

In [None]:
!touch submission.csv