# Read data

In [19]:
# Run the mne installation below if mne is not installed (e.g. when working in VSCode)
#! pip install mne

In [20]:
import mne
import numpy as np
import pandas as pd
import pickle
from zipfile import ZipFile
import os

#import tensorflow as tf
from tensorflow import keras

# sys gives access to interpreter settings; unless the user passed explicit
# -W options on the command line, silence every warning for the whole notebook.
import sys
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

# All source files live under PATH (empty string = current working directory).
PATH = ''  # e.g. '/content/drive/MyDrive/.../'

# Folder for the best-model weights saved during training (meant for
# ModelCheckpoint in callbacks()); add_prediction() also loads the trained
# models from this folder.
PATH_FOR_MODEL = 'lstm_model'

In [21]:
def read_data(path_to_zip):
    """Load the data of the three subjects from the archive and epoch the training signal.

    For every subject (keys 1, 2 and 3) the function reads ``X_train_<k>.npy``,
    ``y_train_<k>.npy`` and ``X_test_dataset_<k>.pkl`` from ``path_to_zip``,
    wraps the training signal into an MNE ``RawArray`` and cuts it into epochs
    around every sample where the label changes.

    Parameters
    ----------
    path_to_zip : str or path-like
        Path to the zip archive containing the files listed above.

    Returns
    -------
    dict
        ``{subject: mount}`` where each ``mount`` dict holds the three
        ``path_*`` member names plus:

        * ``'X_train'`` - array used as ``(n_times, n_channels)``,
        * ``'y_train'`` - label array, one value per time sample,
        * ``'X_test_dataset'`` - object unpickled from the archive,
        * ``'X_train_nn'`` - epoched training data with axes
          ``(n_epochs, n_times, n_channels)``.
    """
    # presumably one sample every 33 ms (~30.3 Hz) - TODO confirm with the data description
    SFREQ = 1000.0 / 33

    mounts = {
        subject: {
            'path_X_train': f'X_train_{subject}.npy',
            'path_y_train': f'y_train_{subject}.npy',
            'path_X_test_dataset': f'X_test_dataset_{subject}.pkl',
        }
        for subject in (1, 2, 3)
    }

    # Open the archive once (instead of twice per subject) and make sure both
    # handles are closed again; np.load treats a zip file as an .npz container.
    with np.load(path_to_zip) as arrays, ZipFile(path_to_zip) as archive:
        for mount in mounts.values():
            mount['X_train'] = arrays[mount['path_X_train']]
            mount['y_train'] = arrays[mount['path_y_train']]
            with archive.open(mount['path_X_test_dataset']) as pkl_file:
                # NOTE(security): pickle.load executes arbitrary code from the
                # file - only use archives that come from a trusted source.
                mount['X_test_dataset'] = pickle.load(pkl_file)

    for mount in mounts.values():
        X_train = mount['X_train']
        y_train = mount['y_train']

        # Signal channels: MNE expects (n_channels, n_times), hence the transpose.
        raw = mne.io.RawArray(
            data=X_train.T,
            info=mne.create_info(
                ch_names=[str(ch) for ch in range(X_train.shape[1])],
                sfreq=SFREQ,
                ch_types='eeg'
            )
        )
        # The labels travel along as an extra 'misc' channel named 'y'.
        raw_y = mne.io.RawArray(
            data=y_train.reshape(1, -1),
            info=mne.create_info(
                ch_names=['y'],
                sfreq=SFREQ,
                ch_types='misc'
            )
        )
        raw = raw.add_channels([raw_y])

        # An event is every sample index after which the label changes.
        change_idx = np.where(np.abs(np.diff(y_train)) > 0)[0]

        # MNE events array: (sample, previous value, event id); the last two are zeros.
        events = np.stack([
            change_idx,
            np.zeros_like(change_idx),
            np.zeros_like(change_idx)
        ], axis=1)

        # Window from 1 s before to 2.5 s after each label change.
        epochs = mne.Epochs(
            raw,
            events=events,
            tmin=-1,
            tmax=1*2.5,
            preload=True,
            baseline=None,
            picks='all'
        )

        # Public accessor instead of the private ``_data`` attribute: keep only
        # the 'eeg' channels and move time in front of channels.
        mount['X_train_nn'] = epochs.get_data(picks='eeg').swapaxes(1, 2)

    return mounts

In [22]:
def read_y_test(path_to_zip):
    """Read ``sample_submission.csv`` from the archive and split its composite key.

    Parameters
    ----------
    path_to_zip : str or path-like
        Zip archive that contains ``sample_submission.csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV content with three extra integer columns ``subject_id``,
        ``sample`` and ``timestep`` parsed from the
        ``'subject_id-sample-timestep'`` column.
    """
    # Close both the archive and the member handle once the CSV is parsed.
    with ZipFile(path_to_zip) as myzip:
        with myzip.open('sample_submission.csv') as csv_file:
            y_test = pd.read_csv(csv_file)

    # ``n`` must be passed by keyword: pandas 2.x rejects it as a positional argument.
    y_test[['subject_id', 'sample', 'timestep']] = (
        y_test['subject_id-sample-timestep']
        .str.split('-', n=2, expand=True)
        .astype(int)
    )
    return y_test

In [23]:
# Function that computes the f1_score metric (built from Precision and Recall).
# Note: these metrics were removed from Keras core in version 2.0, so they are re-implemented here.
# https://stackoverflow.com/questions/66554207/calculating-micro-f-1-score-in-keras

def f1(y_true, y_pred):
    """Batch-wise F1 score built from Keras backend operations.

    Parameters
    ----------
    y_true, y_pred : tensors
        Ground truth and predictions; both are clipped to [0, 1] and rounded
        before counting, so they are treated as 0/1 indicators.

    Returns
    -------
    tensor
        ``2 * precision * recall / (precision + recall + epsilon)`` computed
        over the values passed in (i.e. per batch when used as a Keras metric).
    """
    # The notebook only imports ``keras``; bind the backend here so the metric
    # does not depend on a global ``K`` that is never defined.
    K = keras.backend

    def _recall(y_true, y_pred):
        """
        Recall metric.
        Only computes a batch-wise average of recall.
        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        # epsilon keeps the division finite when there are no positives
        return true_positives / (possible_positives + K.epsilon())

    def _precision(y_true, y_pred):
        """
        Precision metric.
        Only computes a batch-wise average of precision.
        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        # epsilon keeps the division finite when nothing is predicted positive
        return true_positives / (predicted_positives + K.epsilon())

    precision = _precision(y_true, y_pred)
    recall = _recall(y_true, y_pred)
    return 2*((precision*recall)/(precision + recall + K.epsilon()))

# Callbacks used while training the model
def callbacks(lr, num_train):
    """Build the list of Keras callbacks used during training.

    Parameters
    ----------
    lr : float
        Initial learning rate; the scheduler never goes below ``lr / 10000``.
    num_train : int or str
        Identifier of the training run, used in the checkpoint file name.

    Returns
    -------
    list
        ``[ModelCheckpoint, EarlyStopping, ReduceLROnPlateau]``, all of them
        monitoring ``'val_f1'`` in ``'max'`` mode.
        NOTE(review): ``'val_f1'`` presumably requires the model to be compiled
        with the ``f1`` metric and trained with validation data - confirm.
    """
    # Save only the best model seen so far into PATH_FOR_MODEL
    # (the previously referenced PATH_BEST_MODEL is not defined anywhere).
    checkpoint = keras.callbacks.ModelCheckpoint(
        os.path.join(PATH_FOR_MODEL, 'best_model_rnn_' + str(num_train) + '.hdf5'),
        monitor='val_f1',
        verbose=1,
        mode='max',
        save_best_only=True
    )

    # Stop after 150 epochs without improvement and roll back to the best weights.
    earlystop = keras.callbacks.EarlyStopping(
        monitor='val_f1',
        mode='max',
        patience=150,
        restore_best_weights=True
    )

    # Multiply the learning rate by 0.9 after 15 epochs without improvement.
    reduce_lr = keras.callbacks.ReduceLROnPlateau(
        monitor='val_f1',
        mode='max',
        factor=0.9,
        patience=15,  # 10 is also an option
        verbose=1,
        min_lr=lr/10000
    )

    return [checkpoint, earlystop, reduce_lr]

In [24]:
def add_prediction(mount, mount_name, y_test):
    """Predict the classes of one subject's test samples with its saved LSTM model.

    Parameters
    ----------
    mount : dict
        Data of one subject; must contain ``'X_test_dataset'``.
    mount_name : int
        Subject identifier; selects the model ``model_lstm_<mount_name>`` inside
        ``PATH_FOR_MODEL`` and the matching ``subject_id`` rows of ``y_test``.
    y_test : pandas.DataFrame
        Submission frame with ``subject_id`` and ``sample`` columns, used only
        to check the number of predicted samples.

    Returns
    -------
    dict
        The same ``mount`` object, updated in place with
        ``'y_pred_test_lstm'``: a list with one array of predicted class
        indices per test sample.

    Raises
    ------
    AssertionError
        If the number of predictions differs from the number of unique
        samples of this subject in ``y_test``.
    """
    X_test_dataset = mount['X_test_dataset']
    m_lstm = keras.models.load_model(
        os.path.join(PATH_FOR_MODEL, 'model_lstm_' + str(mount_name)),
        custom_objects={"f1": f1},
    )
    # The former prediction on the whole training set was dropped: its result
    # was discarded, so it only cost inference time.

    y_pred_test_lstm = []
    for test_sample in X_test_dataset:
        # Add a batch axis and swap the last two axes so each sample is fed as
        # (1, time, channels) - assumes samples are stored as (channels, time); TODO confirm.
        X_test_i = np.expand_dims(test_sample, axis=0).swapaxes(1, 2).astype(np.float64)
        # Keep only the index of the highest-scoring class along the last axis.
        y_pred_test_lstm.append(m_lstm.predict(X_test_i, verbose=0).argmax(axis=-1))

    print(len(y_pred_test_lstm))
    assert len(y_pred_test_lstm) == y_test.query("subject_id == @mount_name")['sample'].nunique()

    mount['y_pred_test_lstm'] = y_pred_test_lstm
    return mount

In [25]:
def make_embedding(mounts, y_test, output_path='./y_test_submit_rnn_LSTM_embeded.csv'):
    """Predict all subjects, assemble the submission table and write it to CSV.

    Parameters
    ----------
    mounts : dict
        ``{subject: mount}`` as returned by ``read_data``; every ``mount`` is
        updated in place by ``add_prediction``.
    y_test : pandas.DataFrame
        Frame returned by ``read_y_test``; it is not modified.
    output_path : str, optional
        Destination of the CSV file. Defaults to the previously hard-coded
        ``'./y_test_submit_rnn_LSTM_embeded.csv'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'subject_id-sample-timestep'`` and ``'class'``, the latter
        filled with the predicted classes.

    Raises
    ------
    AssertionError
        If the total number of predicted timesteps differs from ``len(y_test)``.
    """
    # add_prediction stores its result in mount['y_pred_test_lstm'].
    for mount_name, mount in mounts.items():
        add_prediction(mount, mount_name, y_test)

    per_sample_predictions = [
        prediction
        for mount in mounts.values()
        for prediction in mount['y_pred_test_lstm']
    ]
    # Every prediction carries a leading batch axis of size 1: join them along
    # the last axis and drop that batch axis.
    # NOTE(review): assumes the rows of y_test follow the same subject/sample/
    # timestep order as the predictions - confirm.
    y_pred_test_res = np.concatenate(per_sample_predictions, axis=-1)[0]

    assert y_pred_test_res.shape[0] == y_test.shape[0]

    # Work on an explicit copy so the assignment below neither triggers
    # SettingWithCopyWarning nor depends on pandas' view/copy semantics.
    y_test_submit = y_test[['subject_id-sample-timestep', 'class']].copy()
    y_test_submit['class'] = y_pred_test_res
    y_test_submit.to_csv(output_path, index=False)
    return y_test_submit

In [26]:
# End-to-end run: locate the competition archive under PATH, load and epoch
# the data of every subject, read the submission template, then predict with
# the saved models and write the submission CSV.
path_to_zip = os.path.join(PATH, 'data/motorica-advanced-gesture-classification.zip')
mounts = read_data(path_to_zip)
y_test = read_y_test(path_to_zip)
y_test_submit = make_embedding(mounts, y_test)
print('y_test_submit created')

Creating RawArray with float64 data, n_channels=50, n_times=24030
    Range : 0 ... 24029 =      0.000 ...   792.957 secs
Ready.
Creating RawArray with float64 data, n_channels=1, n_times=24030
    Range : 0 ... 24029 =      0.000 ...   792.957 secs
Ready.
Not setting metadata
277 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 277 events and 107 original time points ...
1 bad epochs dropped
Creating RawArray with float64 data, n_channels=50, n_times=23202
    Range : 0 ... 23201 =      0.000 ...   765.633 secs
Ready.
Creating RawArray with float64 data, n_channels=1, n_times=23202
    Range : 0 ... 23201 =      0.000 ...   765.633 secs
Ready.
Not setting metadata
264 matching events found
No baseline correction applied
0 projection items activated
Using data from preloaded Raw for 264 events and 107 original time points ...
1 bad epochs dropped
Creating RawArray with float64 data, n_channels=50, n_times=23177
    Rang