# Measuring performance

Testing different approaches.

Wall time of `model.fit` (one epoch) for each approach:

| Desc       | Where       | Time  | Obs. |
| ---        | ----        | ----  | ---- |
| A - 1st try    | framework   |  1m 48s | |
| A - 1st try    | Kaggle CPU  | 3m 4s  | |
| A - 1st try    | Kaggle GPU  | 27.4s  | Get batch takes most of the time. |
| B - Only numpy | Kaggle CPU  | 4m 1s    | 186 batches x 32 |
| B - Only numpy | Kaggle GPU  | 11s    | 186 batches x 32 |


In [1]:
import pandas as pd
import numpy as np
import keras
import pathlib
import os 
import matplotlib.pyplot as plt

import timeit

# Seed NumPy's global RNG; the permutations and shuffles below draw from it.
np.random.seed(536)

# Folder of the reduced dataset, and the parquet file inside it that holds the spectrograms.
base_dir = pathlib.Path("../data/reduced_ds")
path_to_data = base_dir / "spectrograms_reduced_800.parquet"

2024-01-21 19:21:38.116318: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Presumably the per-class vote columns of train.csv; nothing in the cells shown here reads this list.
TARGETS = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']

df_traincsv = pd.read_csv(f'{base_dir}/train.csv')

# Encode the expert consensus label as a class id equal to its position in this tuple.
# Rows whose label is not listed are left without a value (NaN) in 'target'.
for class_id, label in enumerate(('Seizure', 'LPD', 'GPD', 'LRDA', 'GRDA', 'Other')):
    df_traincsv.loc[df_traincsv.expert_consensus == label, 'target'] = class_id

print("Loaded train.csv. Added target column.")

Loaded train.csv. Added target column.


In [3]:
#
# Train/Validation indexes in df_traincsv
#

ptrain = 0.8  # fraction of the rows that goes to the training split

n_total_samples = len(df_traincsv)
cut = int(ptrain*n_total_samples)

# Shuffle the row positions once: the first `cut` are training rows, the rest validation rows.
idx = np.random.permutation(n_total_samples)
idx_train, idx_val = idx[:cut], idx[cut:]

print("Train samples:", len(idx_train))
print("Validation samples:", len(idx_val))

Train samples: 6443
Validation samples: 1611


## Model definition

The same for all tests.

In [5]:

def _conv_block(x, filters, kernel_size, pool_size):
    """Apply Conv2D ("same" padding) -> MaxPooling2D -> ReLU to tensor `x` and return the result."""
    x = keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, padding="same")(x)
    x = keras.layers.MaxPooling2D(pool_size=pool_size)(x)
    return keras.layers.ReLU()(x)


def make_model(input_shape, num_classes):
    """Build the (uncompiled) convolutional classifier used by every test in this notebook.

    Architecture: three Conv2D/MaxPooling2D/ReLU blocks (32, 64 and 256 filters),
    Flatten, Dense(256)+ReLU, Dense(64)+ReLU, a linear Dense(2) layer, and a softmax
    output with `num_classes` units.

    Args:
        input_shape: shape of one input sample, passed to `keras.layers.Input`.
        num_classes: number of units of the softmax output layer.

    Returns:
        A `keras.models.Model` mapping the input layer to the softmax output.
    """
    input_layer = keras.layers.Input(input_shape)

    conv1 = _conv_block(input_layer, filters=32, kernel_size=3, pool_size=8)
    conv2 = _conv_block(conv1, filters=64, kernel_size=7, pool_size=8)
    conv3 = _conv_block(conv2, filters=256, kernel_size=7, pool_size=2)

    fltn = keras.layers.Flatten()(conv3)

    relu1 = keras.layers.Dense(256)(fltn)
    relu1 = keras.layers.ReLU()(relu1)

    relu2 = keras.layers.Dense(64)(relu1)
    # Plain ReLU. The first positional argument of keras.layers.ReLU is `max_value`,
    # so ReLU(64) would clip the activations at 64 rather than describe the layer width.
    relu2 = keras.layers.ReLU()(relu2)

    # NOTE(review): a 2-unit linear layer feeding the softmax is a very narrow bottleneck;
    # kept as in the original design -- confirm it is intentional.
    lin = keras.layers.Dense(2)(relu2)

    output_layer = keras.layers.Dense(num_classes, activation="softmax")(lin)

    return keras.models.Model(inputs=input_layer, outputs=output_layer)


## A - Same as first try

The data generator uses pandas to select the rows of each sample.

In [5]:
#
# Data generator
#

class DataGenerator(keras.utils.Sequence):
    """Generates batches for Keras from a parquet file held in a pandas DataFrame (variant A).

    One sample is the block of rows of one spectrogram whose `time` falls in
    [offset, offset + 600), reshaped to `(*dim, n_channels)`. Its label is the one-hot
    encoding of the item's `target`.

    NOTE: every sample is located with a boolean mask over the whole DataFrame. The
    timing table at the top of the notebook singles out batch loading as the dominant
    cost of this variant; the lookup is left as is because it is what is being measured.
    """
    def __init__(self, items, path_to_data, batch_size=32, dim=(300,400), n_channels=1,
                 n_classes=6, shuffle=True):
        """Load the spectrogram rows needed by `items` and prepare the first epoch.

        Args:
            items: DataFrame with one row per sample and the columns `spectrogram_id`,
                `spectrogram_label_offset_seconds` and `target`.
            path_to_data: parquet file with `spectrogram_id` and `time` columns.
            batch_size: maximum number of samples per batch.
            dim: shape of one sample, without the channel axis.
            n_channels: size of the trailing channel axis.
            n_classes: number of classes used for the one-hot labels.
            shuffle: whether the sample order is reshuffled for every epoch.
        """
        # Keras 3 expects Sequence subclasses to run the base constructor (it warns
        # otherwise); on older Keras versions this call does nothing.
        super().__init__()
        # Only read the rows of the spectrograms referenced by `items`.
        sel = [("spectrogram_id", "in", items['spectrogram_id'])]
        # Missing values are replaced by 0.
        self.data = pd.read_parquet(path_to_data, filters=sel).replace(np.nan, 0)
        # Every column from the third onward is used as sample data -- assumes the
        # first two are `spectrogram_id` and `time`; TODO confirm against the file.
        self.columns = self.data.columns[2:]
        self.dim = dim
        self.batch_size = batch_size
        self.items = items
        self.len = items.shape[0]
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (the last batch may be smaller)."""
        return int(np.ceil(self.len / self.batch_size))

    def __getitem__(self, index):
        """Return batch number `index` as a tuple `(X, y)`."""
        # Positions (into `self.items`) of the samples of this batch.
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        return self.__data_generation(indexes)

    def on_epoch_end(self):
        """Reset the sample order, shuffling it when `shuffle` is set."""
        self.indexes = np.arange(self.len)
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, indexes):
        """Build the arrays for the samples at `indexes`.

        Returns:
            X of shape `(len(indexes), *dim, n_channels)` and the one-hot labels of
            shape `(len(indexes), n_classes)`.
        """
        true_size = len(indexes)
        sample_shape = (*self.dim, self.n_channels)
        X = np.empty((true_size, *sample_shape))
        y = np.empty((true_size), dtype=int)

        for i, idx in enumerate(indexes):
            item = self.items.iloc[idx]
            start = item.spectrogram_label_offset_seconds
            # Rows of this spectrogram inside the window [start, start + 600).
            window = self.data.loc[(self.data.spectrogram_id == int(item.spectrogram_id))&
                                   (self.data.time >= start)&
                                   (self.data.time < start + 600), self.columns]
            # The reshape fails if the window does not hold exactly the expected number of values.
            X[i,] = window.to_numpy(copy=True).reshape(sample_shape)
            y[i] = int(item.target)

        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

Test this class and measure how long it takes to load a batch.

In [6]:
items = df_traincsv[['spectrogram_id','spectrogram_label_offset_seconds',
                      'target']].iloc[idx_train].reset_index(drop=True)

# Parameters
params = {
    'dim': (300,400),
    'batch_size': 32,
    'n_classes': 6,
    'n_channels': 1,
    'shuffle': True
    }

%time training_generator = DataGenerator(items, path_to_data, **params)


CPU times: user 3.23 s, sys: 3.05 s, total: 6.28 s
Wall time: 1.01 s


In [7]:
# Number of batches per epoch.
len(training_generator)

202

In [8]:
%time a = training_generator.__getitem__(2)

CPU times: user 75.5 ms, sys: 3.05 ms, total: 78.6 ms
Wall time: 77.7 ms


In [9]:
items_train = df_traincsv[['spectrogram_id','spectrogram_label_offset_seconds',
                    'target']].iloc[idx_train].reset_index(drop=True)
items_val = df_traincsv[['spectrogram_id','spectrogram_label_offset_seconds',
                    'target']].iloc[idx_val].reset_index(drop=True)

# Parameters
params = {
    'dim': (300,400),
    'batch_size': 32,
    'n_classes': 6,
    'n_channels': 1,
    'shuffle': True
    }

%time training_generator = DataGenerator(items_train, path_to_data, **params)
%time validation_generator = DataGenerator(items_val, path_to_data, **params)


model = make_model(input_shape=(*params['dim'],1), num_classes=6)
model.compile(loss='categorical_crossentropy', metrics=['accuracy'])

%time model.fit(training_generator, epochs=1, validation_data=validation_generator)


CPU times: user 3.48 s, sys: 2.99 s, total: 6.47 s
Wall time: 882 ms
CPU times: user 3.1 s, sys: 2.07 s, total: 5.17 s
Wall time: 662 ms
CPU times: user 20min 58s, sys: 37.5 s, total: 21min 36s
Wall time: 1min 48s


<keras.src.callbacks.History at 0x7fb0d4728190>

## B - Avoiding Pandas when using tensorflow.

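NumPy arrays are passed by reference: binding an array to another name or attribute does not copy it.

The exception is when the right-hand side is an operation: in `b = a + 1`, `b` is a new array (a copy).

For example, in an object of this class, `self.a` is a reference to `a`: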

In [10]:
class arr():
    """Tiny holder that keeps a reference to the array it is given (no copy)."""
    def __init__(self, a):
        # Plain attribute binding: `self.a` and the caller's array are the same object.
        self.a = a

    def dis(self):
        """Print the held array."""
        print(self.a)

a = np.arange(6)

print(a)
b = arr(a)
b.dis()
# Modify the original array in place; the holder shows the change too.
a[2] = 44
print(a)
b.dis()

[0 1 2 3 4 5]
[0 1 2 3 4 5]
[ 0  1 44  3  4  5]
[ 0  1 44  3  4  5]


In [36]:
#
# Data generator using numpy and no pandas.
#

class DataGenerator(keras.utils.Sequence):
    """Generates batches for Keras straight from NumPy arrays, without pandas (variant B).

    One sample is a run of `dim[0]` consecutive rows of `data`, starting at the row
    index stored in the item, reshaped to `(*dim, n_channels)`. Its label is the
    one-hot encoding of the item's target.

    Note: this class re-uses the name `DataGenerator`, so once this cell has run the
    pandas-based class defined earlier in the notebook is shadowed.
    """
    def __init__(self, items, data, batch_size=32, dim=(300,400), n_channels=1,
                 n_classes=6, shuffle=True):
        """Store the arrays (by reference, no copy) and prepare the first epoch.

        Args:
            items: array with one row per sample:
                [eeg_id, eeg_sub_id, idx of offset, target].
            data: array whose rows are sliced to build the samples.
            batch_size: maximum number of samples per batch.
            dim: shape of one sample, without the channel axis; `dim[0]` is the
                number of rows taken from `data` for each sample.
            n_channels: size of the trailing channel axis.
            n_classes: number of classes used for the one-hot labels.
            shuffle: whether the sample order is reshuffled for every epoch.
        """
        # Keras 3 expects Sequence subclasses to run the base constructor (it warns
        # otherwise); on older Keras versions this call does nothing.
        super().__init__()
        self.data = data
        self.items = items
        self.dim = dim
        self.batch_size = batch_size
        self.len = items.shape[0]
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (the last batch may be smaller)."""
        return int(np.ceil(self.len / self.batch_size))

    def __getitem__(self, index):
        """Return batch number `index` as a tuple `(X, y)`."""
        # Positions (into `self.items`) of the samples of this batch.
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        return self.__data_generation(indexes)

    def on_epoch_end(self):
        """Reset the sample order, shuffling it when `shuffle` is set."""
        self.indexes = np.arange(self.len)
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, indexes):
        """Build the arrays for the samples at `indexes`.

        Returns:
            X of shape `(len(indexes), *dim, n_channels)` and the one-hot labels of
            shape `(len(indexes), n_classes)`.
        """
        true_size = len(indexes)
        # Window length and sample shape follow `dim` / `n_channels` instead of the
        # literals 300 and 1, so non-default settings stay consistent with X.
        n_rows = self.dim[0]
        sample_shape = (*self.dim, self.n_channels)
        X = np.empty((true_size, *sample_shape))
        y = np.empty((true_size), dtype=int)

        for i, idx in enumerate(indexes):
            item = self.items[idx]
            start = item[2]  # row of `data` where this sample begins
            X[i,] = self.data[start:start + n_rows].reshape(sample_shape)
            y[i] = item[3]   # class id

        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

In [37]:
path_to_data = "../data/00_spectrograms_reduced.npy"
path_to_items = "../data/00_sub_spectrograms_idxs_reduced.npy"

data = np.load(path_to_data)
# Load the item rows and shuffle them (along the first axis) before splitting.
items = np.random.permutation(np.load(path_to_items))
n_total_samples = len(items)

ptrain = 0.8  # fraction of the items that goes to the training split
cut = int(ptrain*n_total_samples)
items_train, items_val = items[:cut], items[cut:]

print("Train samples:", len(items_train))
print("Validation samples:", len(items_val))


Train samples: 5933
Validation samples: 1484


In [38]:
# Parameters
params = {
    'dim': (300,400),
    'batch_size': 32,
    'n_classes': 6,
    'n_channels': 1,
    'shuffle': True
    }

%time training_generator = DataGenerator(items_train, data, **params)
%time validation_generator = DataGenerator(items_val, data, **params)

CPU times: user 154 µs, sys: 26 µs, total: 180 µs
Wall time: 170 µs
CPU times: user 46 µs, sys: 8 µs, total: 54 µs
Wall time: 52 µs


Testing this new class: check whether the items loaded in the batches are equal to the original spectrograms.

PASSED.

In [39]:
# Load the first batch and display the shape of its inputs.
x, y = training_generator[0]
x.shape

(32, 300, 400, 1)

In [40]:
# Presumably the per-class vote columns of train.csv; nothing in the cells shown here reads this list.
TARGETS = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']

kaggle_data = "../../data/hms"
df_traincsv = pd.read_csv(f'{kaggle_data}/train.csv')

# Encode the expert consensus label as a class id equal to its position in this tuple.
# Rows whose label is not listed are left without a value (NaN) in 'target'.
for class_id, label in enumerate(('Seizure', 'LPD', 'GPD', 'LRDA', 'GRDA', 'Other')):
    df_traincsv.loc[df_traincsv.expert_consensus == label, 'target'] = class_id

print("Loaded train.csv. Added target column.")

Loaded train.csv. Added target column.


In [41]:
# Compare one sample of a generated batch with the same window read from the
# original spectrogram file.

i = 5  # position, inside batch 0, of the sample to verify

# Fetch batch 0 here so the sample and the generator's current order always match.
batch_x, _ = training_generator[0]

# Batch 0 holds the items at positions indexes[0:batch_size], so sample i of the batch
# is items[indexes[i]]. Reading the ids from the generator, instead of hard-coding them,
# keeps the check valid whatever order the shuffle produced.
# Item layout per the DataGenerator docstring: [eeg_id, eeg_sub_id, idx of offset, target]
# -- TODO confirm against the code that wrote the .npy file.
item = training_generator.items[training_generator.indexes[i]]
eeg_id, eeg_sub_id = item[0], item[1]

item_train = df_traincsv.loc[(df_traincsv.eeg_id == eeg_id)&(df_traincsv.eeg_sub_id == eeg_sub_id)].iloc[0]

spectrogram_id = item_train.spectrogram_id
df = pd.read_parquet(f'{kaggle_data}/train_spectrograms/{spectrogram_id}.parquet')
df.replace(np.nan, 0, inplace=True)

# Columns of interest: everything after the first column.
columns = df.columns[1:]

# Rows of the original spectrogram inside the window [offset, offset + 600).
offset = item_train.spectrogram_label_offset_seconds
original_spec = df.loc[(df.time >= offset)&(df.time < (offset + 600)), columns].to_numpy(copy=True)

saved_spec = batch_x[i,:,:,0]
# array_equal also returns False (instead of failing) when the shapes differ.
np.array_equal(saved_spec, original_spec)


False

In [42]:

model = make_model(input_shape=(*params['dim'],1), num_classes=6)
model.compile(loss='categorical_crossentropy', metrics=['accuracy'])

%time model.fit(training_generator, epochs=1, validation_data=validation_generator)


CPU times: user 7min 55s, sys: 25.1 s, total: 8min 20s
Wall time: 3min 16s


<keras.src.callbacks.History at 0x7fbeb066bca0>