# Split spectrograms into channels

Running time: Comparison between using only numpy and loading parquet files inside the data generators.

Reading parquet files in data generator: TOO SLOW, it does not use GPU.

In [41]:
import pathlib
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from keras import regularizers
import matplotlib.pyplot as plt

base_dir = '../../kaggle_data/hms'
# base_dir = '../../data/hms'
# base_dir = '/kaggle/input/hms-harmful-brain-activity-classification'

In [42]:
df_traincsv = pd.read_csv(f'{base_dir}/train.csv')

TARGETS = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']

df_traincsv = pd.read_csv(f'{base_dir}/train.csv')
df_traincsv.loc[df_traincsv.expert_consensus == 'Seizure', 'target'] = 0
df_traincsv.loc[df_traincsv.expert_consensus == 'LPD', 'target'] = 1
df_traincsv.loc[df_traincsv.expert_consensus == 'GPD', 'target'] = 2
df_traincsv.loc[df_traincsv.expert_consensus == 'LRDA', 'target'] = 3
df_traincsv.loc[df_traincsv.expert_consensus == 'GRDA', 'target'] = 4
df_traincsv.loc[df_traincsv.expert_consensus == 'Other', 'target'] = 5

idxs_wo_nan = np.load("../data/02_eegs_idxs_up_to_5_nan.npy")

# Removing observations where eeg has more than 5 NaN rows.
df = df_traincsv.iloc[idxs_wo_nan]

print("Loaded train.csv. Added target column.")
df.head()

Loaded train.csv. Added target column.


Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,target
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0,0.0
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0,0.0
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0,0.0
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0,0.0
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0,0.0


In [43]:
#
# Train/Val split (v1)
#

ptrain = 0.8

# Set the seed for reproducibility.
rng = np.random.default_rng(113)
# rng = np.random.default_rng(45163)
# rng = np.random.default_rng(3233)

ids = rng.permutation(np.unique(df['patient_id']))
cut = int(ptrain * len(ids))
idxs_train = df.loc[df['patient_id'].isin(ids[0:cut])].index
idxs_val = df.loc[df['patient_id'].isin(ids[cut:])].index

print("Train samples:", len(idxs_train))
print("Validation samples:", len(idxs_val))

#
# -----------------
#

Train samples: 84191
Validation samples: 22167


In [44]:
#
# Data generator to load parquet files for each batch.
#
# Original spectrograms.
# 50 seconds slice
# 4 channels
#

class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, df, path_to_data, batch_size=32, n_classes=6, shuffle=True):
        ''' Initialization
        df: dataframe extracted from train.csv with training set.
        path_to_data: folder with parquet files.
        '''
        self.n_channels = 4
        # self.max_freq = 30  # Max freq. is 19.92 Hz in given spectrograms.

        # 62 features: frequencies above 8 Hz (and less than 20 Hz)
        self.dim = (25, 62)

        self.path_to_data = path_to_data
        # self.idxs = idxs
        self.df = df
        self.batch_size = batch_size
        self.len = len(df)
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.ceil(self.len / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Generate data
        X, y = self.__data_generation(indexes)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(self.len)
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, indexes):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        true_size = len(indexes)
        X = np.empty((true_size, *self.dim, self.n_channels))
        y = np.empty((true_size), dtype=int)

        # Generate data
        for i, idx in enumerate(indexes):
            item = self.df.iloc[idx]
            df_spec = pd.read_parquet(f'{self.path_to_data}/{item.spectrogram_id}.parquet')
            offset = item.spectrogram_label_offset_seconds
            # 50 seconds centered in the 10 minutes slice.
            spectrogram = df_spec.loc[(df_spec.time>=(offset+275)) & (df_spec.time<(offset+325))].values
            #replace NaNs

            for c in np.arange(self.n_channels):
                cinitial = c * 100
                cfinal = cinitial + 62
                X[i,:,:,c] = spectrogram[:, cinitial:cfinal]
            # Store class
            y[i] = item.target

        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

In [45]:
# #
# # Data generator using numpy and no pandas.
# #
# # Original spectrograms.
# # 50 seconds slice
# # 4 channels
# #

# class DataGenerator(keras.utils.Sequence):
#     'Generates data for Keras'
#     def __init__(self, items, data, batch_size=32, n_classes=6, shuffle=True):
#         ''' Initialization
#         items: [eeg_id, eeg_sub_id, idx of offset, target]
#         '''
#         self.n_channels = 4
#         self.n_freqs = 40
#         self.dim = (25, self.n_freqs)

#         self.data = data
#         self.items = items
#         self.batch_size = batch_size
#         self.len = items.shape[0]
#         self.n_classes = n_classes
#         self.shuffle = shuffle
#         self.on_epoch_end()

#     def __len__(self):
#         'Denotes the number of batches per epoch'
#         return int(np.ceil(self.len / self.batch_size))

#     def __getitem__(self, index):
#         'Generate one batch of data'
#         # Generate indexes of the batch
#         indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

#         # Generate data
#         X, y = self.__data_generation(indexes)

#         return X, y

#     def on_epoch_end(self):
#         'Updates indexes after each epoch'
#         self.indexes = np.arange(self.len)
#         if self.shuffle == True:
#             np.random.shuffle(self.indexes)

#     def __data_generation(self, indexes):
#         'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
#         # Initialization
#         true_size = len(indexes)
#         X = np.empty((true_size, *self.dim, self.n_channels))
#         y = np.empty((true_size), dtype=int)

#         # Generate data
#         for i, idx in enumerate(indexes):
#             item = self.items[idx]
#             # print(item)  # Uncomment for testing.
#             # Sample is 50 second long, that's 25 rows.
#             initial = item[2] + 137
#             final = initial + 25
#             for c in np.arange(self.n_channels):
#                 cinitial = c * 100
#                 cfinal = cinitial + self.n_freqs
#                 X[i,:,:,c] = self.data[initial:final, cinitial:cfinal]
#             # Store class
#             y[i] = item[3]

#         return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

In [46]:

def make_model(input_shape, num_classes):
    input_layer = keras.layers.Input(input_shape)

    conv1 = keras.layers.Conv2D(filters=32,
                                kernel_size=3,
                                strides=1,
                                padding="same",
                                data_format="channels_last",
                                # kernel_regularizer=regularizers.l2(0.001),
                                # use_bias=True,
                                )(input_layer)
    conv1 = keras.layers.BatchNormalization()(conv1)
    # conv1 = keras.layers.MaxPooling2D(pool_size=8)(conv1)
    conv1 = keras.layers.ReLU()(conv1)
    
    conv2 = keras.layers.Conv2D(filters=32,
                                kernel_size=5,
                                padding="same",
                                data_format="channels_last",
                                )(conv1)
    #conv2 = keras.layers.BatchNormalization()(conv2)
    # conv2 = keras.layers.MaxPooling2D(pool_size=8)(conv2)
    conv2 = keras.layers.ReLU()(conv2)

    conv3 = keras.layers.Conv2D(filters=64,
                                kernel_size=7,
                                padding="same",
                                data_format="channels_last",
                                )(conv2)
    conv3 = keras.layers.ReLU()(conv3)
    # conv3 = keras.layers.MaxPooling2D(pool_size=2)(conv3)

    conv4 = keras.layers.Conv2D(filters=64,
                                kernel_size=3,
                                padding="same",
                                data_format="channels_last",
                                )(conv3)
    # conv4 = keras.layers.BatchNormalization()(conv4)
    conv4 = keras.layers.ReLU()(conv4)
    conv4 = keras.layers.MaxPooling2D(pool_size=2)(conv4)

    fltn  = keras.layers.Flatten()(conv4) 
    dense1 = keras.layers.Dense(256)(fltn)
    # dout1 = keras.layers.Dropout(rate=0.4)(dense1)
    
    # relu1 = keras.layers.Dense(256)(fltn)
    # relu1 = keras.layers.ReLU()(relu1)

    # relu2 = keras.layers.Dense(64)(relu1)
    # relu2 = keras.layers.ReLU(64)(relu2)

#     lin = keras.layers.Dense(2)(relu2)

    output_layer = keras.layers.Dense(num_classes, activation="softmax")(dense1)

    return keras.models.Model(inputs=input_layer, outputs=output_layer)


In [48]:
# Parameters
params = {
    'batch_size': 32,
    'n_classes': 6,
    'shuffle': True
    }

path_to_data = f'{base_dir}/train_spectrograms'

training_generator = DataGenerator(df.loc[idxs_train], path_to_data, **params)
validation_generator = DataGenerator(df.loc[idxs_val], path_to_data, **params)

model = make_model(input_shape=(25,62,4), num_classes=6)
model.compile(optimizer='sgd',
            loss='categorical_crossentropy',
            metrics=['accuracy'])

model.fit(training_generator, epochs=2, validation_data=validation_generator)

Epoch 1/2
 164/2631 [>.............................] - ETA: 28:55 - loss: 1.7906 - accuracy: 0.1879

KeyboardInterrupt: 

In [12]:
TARGETS = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']

#
# Test Data generator: for predicting.
#

class test_DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, ids, path_to_test_data, batch_size=32, n_classes=6):
        'Initialization'
        self.n_channels = 4
        self.dim = (25,40)
        self.n_freqs = 40

        self.path = path_to_test_data
        # self.files = os.listdir(path_to_test_data)
        self.ids = ids
        self.indexes = np.arange(len(self.ids))
        # self.columns = self.data.columns[2:]
        self.batch_size = batch_size
        self.n_classes = n_classes
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.ceil(len(self.ids) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        # items_temp = self.items.iloc[indexes]

        # Generate data
        X = self.__data_generation(indexes)

        return X

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        # self.indexes = np.arange(self.len)
        # if self.shuffle == True:
        #     np.random.shuffle(self.indexes)
        pass

    def __data_generation(self, indexes):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.empty((len(indexes), *self.dim, self.n_channels))

        # Generate data
        for i, idx in enumerate(indexes):
            # item = self.items.iloc[idx]
            test_spectrogram = pd.read_parquet(f'{self.path}{self.ids[idx]}.parquet')
            test_spectrogram.replace(np.nan, 0, inplace=True)

            initial = 137
            final = initial + 25
            for c in np.arange(self.n_channels):
                cinitial = c * 100 + 1
                cfinal = cinitial + self.n_freqs
                # X[i,:,:,c] = self.data[initial:final, cinitial:cfinal]
                X[i,:,:,c] = test_spectrogram.iloc[initial:final,cinitial:cfinal].to_numpy(copy=True)

        return X


# Parameters
params = {
    'batch_size': 32,
    'n_classes': 6,
    }


base_dir = "../toy_data"

path_to_test_data = f'{base_dir}/test_spectrograms/'
test = pd.read_csv(f'{base_dir}/test.csv')
ids = test['spectrogram_id'].values

test_generator = test_DataGenerator(ids, path_to_test_data, **params)

y_pred = model.predict(test_generator)

sub = pd.DataFrame({'eeg_id':test.eeg_id.values})
sub[TARGETS] = np.round(y_pred,6)
sub.to_csv('submission.csv',index=False)




In [13]:
sub

Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,704367771,0.176074,0.171767,0.154017,0.170628,0.152159,0.175355
1,4044323427,0.176056,0.17185,0.15402,0.170587,0.1522,0.175287
2,3403081220,0.176007,0.171779,0.153979,0.170692,0.152176,0.175367
3,2936526887,0.176003,0.171786,0.153965,0.170708,0.152171,0.175367
4,2918575845,0.176014,0.171784,0.153964,0.170664,0.152194,0.175381
5,615255846,0.176028,0.171788,0.15403,0.170622,0.152187,0.175345
6,2493948338,0.176003,0.171776,0.153956,0.17069,0.15219,0.175384
7,1405667816,0.176008,0.171774,0.153966,0.170692,0.15218,0.175379
8,983495114,0.176018,0.171787,0.153957,0.170683,0.152176,0.175378
9,2676914434,0.176008,0.171774,0.153966,0.170695,0.152181,0.175376


In [14]:
# On validation set

y_pred = model.predict(validation_generator)
sub = pd.DataFrame({'eeg_id':test.eeg_id.values})
sub[TARGETS] = np.round(y_pred,6)




ValueError: Length of values (1486) does not match length of index (20)

In [15]:
test

Unnamed: 0,spectrogram_id,eeg_id,patient_id
0,647711672,704367771,53544
1,1279379926,4044323427,56411
2,1309472604,3403081220,24450
3,1878092388,2936526887,61262
4,1896553221,2918575845,17948
5,536397604,615255846,49973
6,1987874958,2493948338,48272
7,272214389,1405667816,54199
8,1222625047,983495114,58327
9,1802995379,2676914434,59198


## Loading each parquet file inside the data generator.

In [None]:
#
# FIX THIS
# using train.csv
#

path_to_data = "../data/00_spectrograms_reduced.npy"
path_to_items = "../data/00_sub_spectrograms_idxs_reduced.npy"

data = np.load(path_to_data)
items = np.load(path_to_items)
n_total_samples = items.shape[0]

ptrain = 0.8

idx_train = np.array([], dtype=int)
idx_val = np.array([], dtype=int)
for i in np.arange(6):
    idx = np.where(items[:,3] == i)[0]
    idx = np.random.permutation(idx)
    cut = int(ptrain*idx.shape[0])
    idx_train = np.append(idx_train, idx[0:cut])
    idx_val = np.append(idx_val, idx[cut:])

items_train = items[idx_train]
items_val = items[idx_val]
print("Train samples:", len(items_train))
print("Validation samples:", len(items_val))


In [None]:
#
# TO BE COMPLETED
#

#
# Data generator pandas to load the parquet files into a numpy array.
#
# Original spectrograms.
# 50 seconds slice
# 4 channels
#

class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, items, data, batch_size=32, n_classes=6, shuffle=True):
        ''' Initialization
        items: [eeg_id, eeg_sub_id, idx of offset, target]
        '''
        self.n_channels = 4
        self.n_freqs = 40
        self.dim = (25, self.n_freqs)

        self.data = data
        self.items = items
        self.batch_size = batch_size
        self.len = items.shape[0]
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.ceil(self.len / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Generate data
        X, y = self.__data_generation(indexes)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(self.len)
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, indexes):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        true_size = len(indexes)
        X = np.empty((true_size, *self.dim, self.n_channels))
        y = np.empty((true_size), dtype=int)

        # Generate data
        for i, idx in enumerate(indexes):
            item = self.items[idx]
            # print(item)  # Uncomment for testing.
            # Sample is 50 second long, that's 25 rows.
            initial = item[2] + 137
            final = initial + 25
            for c in np.arange(self.n_channels):
                cinitial = c * 100
                cfinal = cinitial + self.n_freqs
                X[i,:,:,c] = self.data[initial:final, cinitial:cfinal]
            # Store class
            y[i] = item[3]

        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

In [None]:

def make_model(input_shape, num_classes):
    input_layer = keras.layers.Input(input_shape)

    conv1 = keras.layers.Conv2D(filters=32,
                                kernel_size=3,
                                strides=1,
                                padding="same",
                                data_format="channels_last",
                                # kernel_regularizer=regularizers.l2(0.001),
                                # use_bias=True,
                                )(input_layer)
    conv1 = keras.layers.BatchNormalization()(conv1)
    # conv1 = keras.layers.MaxPooling2D(pool_size=8)(conv1)
    conv1 = keras.layers.ReLU()(conv1)
    
    conv2 = keras.layers.Conv2D(filters=32,
                                kernel_size=5,
                                padding="same",
                                data_format="channels_last",
                                )(conv1)
    #conv2 = keras.layers.BatchNormalization()(conv2)
    # conv2 = keras.layers.MaxPooling2D(pool_size=8)(conv2)
    conv2 = keras.layers.ReLU()(conv2)

    conv3 = keras.layers.Conv2D(filters=64,
                                kernel_size=7,
                                padding="same",
                                data_format="channels_last",
                                )(conv2)
    conv3 = keras.layers.ReLU()(conv3)
    # conv3 = keras.layers.MaxPooling2D(pool_size=2)(conv3)

    conv4 = keras.layers.Conv2D(filters=64,
                                kernel_size=3,
                                padding="same",
                                data_format="channels_last",
                                )(conv3)
    # conv4 = keras.layers.BatchNormalization()(conv4)
    conv4 = keras.layers.ReLU()(conv4)
    conv4 = keras.layers.MaxPooling2D(pool_size=2)(conv4)

    fltn  = keras.layers.Flatten()(conv4) 
    dense1 = keras.layers.Dense(256)(fltn)
    # dout1 = keras.layers.Dropout(rate=0.4)(dense1)
    
    # relu1 = keras.layers.Dense(256)(fltn)
    # relu1 = keras.layers.ReLU()(relu1)

    # relu2 = keras.layers.Dense(64)(relu1)
    # relu2 = keras.layers.ReLU(64)(relu2)

#     lin = keras.layers.Dense(2)(relu2)

    output_layer = keras.layers.Dense(num_classes, activation="softmax")(dense1)

    return keras.models.Model(inputs=input_layer, outputs=output_layer)


In [None]:
# Parameters
params = {
    'batch_size': 32,
    'n_classes': 6,
    'shuffle': True
    }

training_generator = DataGenerator(items_train, data, **params)
validation_generator = DataGenerator(items_val, data, **params)

model = make_model(input_shape=(25,40,4), num_classes=6)
model.compile(optimizer='sgd',
            loss='categorical_crossentropy',
            metrics=[tf.keras.metrics.CategoricalCrossentropy()])

model.fit(training_generator, epochs=7, validation_data=validation_generator)

In [None]:
TARGETS = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']

#
# Test Data generator: for predicting.
#

class test_DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, ids, path_to_test_data, batch_size=32, n_classes=6):
        'Initialization'
        self.n_channels = 4
        self.dim = (25,40)
        self.n_freqs = 40

        self.path = path_to_test_data
        # self.files = os.listdir(path_to_test_data)
        self.ids = ids
        self.indexes = np.arange(len(self.ids))
        # self.columns = self.data.columns[2:]
        self.batch_size = batch_size
        self.n_classes = n_classes
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.ceil(len(self.ids) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        # items_temp = self.items.iloc[indexes]

        # Generate data
        X = self.__data_generation(indexes)

        return X

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        # self.indexes = np.arange(self.len)
        # if self.shuffle == True:
        #     np.random.shuffle(self.indexes)
        pass

    def __data_generation(self, indexes):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.empty((len(indexes), *self.dim, self.n_channels))

        # Generate data
        for i, idx in enumerate(indexes):
            # item = self.items.iloc[idx]
            test_spectrogram = pd.read_parquet(f'{self.path}{self.ids[idx]}.parquet')
            test_spectrogram.replace(np.nan, 0, inplace=True)

            initial = 137
            final = initial + 25
            for c in np.arange(self.n_channels):
                cinitial = c * 100 + 1
                cfinal = cinitial + self.n_freqs
                # X[i,:,:,c] = self.data[initial:final, cinitial:cfinal]
                X[i,:,:,c] = test_spectrogram.iloc[initial:final,cinitial:cfinal].to_numpy(copy=True)

        return X


# Parameters
params = {
    'batch_size': 32,
    'n_classes': 6,
    }


base_dir = "../toy_data"

path_to_test_data = f'{base_dir}/test_spectrograms/'
test = pd.read_csv(f'{base_dir}/test.csv')
ids = test['spectrogram_id'].values

test_generator = test_DataGenerator(ids, path_to_test_data, **params)

y_pred = model.predict(test_generator)

sub = pd.DataFrame({'eeg_id':test.eeg_id.values})
sub[TARGETS] = np.round(y_pred,6)
# sub.to_csv('submission.csv',index=False)
sub