# Lesson3 系列データで分類・予測させてみよう（RNN, LSTM）

## Homework

RNNを用いてさらに高精度なECG5000の分類器を作ってみましょう。

ネットワークの形などは特に制限を設けませんし、今回のLessonで扱った内容以外の工夫も組み込んでもらって構いません。

上位者はリーダーボードに掲載させていただきます。（評価はaccuracyによって行います。）

### 目標値
Accuracy 95%

### ルール
- 訓練データは`x_train`, `y_train`, テストデータは`x_test`で与えられます.
- 予測ラベルは **one_hot表現ではなく0~4のクラスラベル** で表してください.
- 下のセルで指定されているx_train, y_train以外の学習データは使わないでください.

### 評価について

- テストデータ(x_test)に対する予測ラベルをcsvファイルで提出してください.
- ファイル名はsubmission.csvとしてください.
- 予測ラベルのy_testに対する精度 (F値)で評価します.
- 毎日24時にテストデータの一部に対する精度でLeader Boardを更新します.
- 最終的な評価はテストデータ全体に対する精度でおこないます.

### サンプルコード
**次のセルで指定されているx_train, y_trainのみを使って学習させてください.**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Input, add, concatenate, Dense, Activation, SimpleRNN, LSTM, CuDNNLSTM, Bidirectional, Conv1D, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

# Random seed passed as random_state to train_test_split and StratifiedKFold below.
SEED = 9999
# Per-label sample counts intended for augwave(); one entry for each of the
# labels 2, 3 and 4. Only referenced from commented-out calls at the moment.
AUG_NUM = [100,100,100]

def load_dataset(data_dir='/root/userspace/public/lesson3/data', standardize=True):
    """Load the training/test arrays and optionally standardize the signals.

    Args:
        data_dir: Directory containing x_train.npy, y_train.npy and x_test.npy.
        standardize: When true, fit a StandardScaler on the training signals
            and apply it to both the training and the test signals.

    Returns:
        Tuple (x_train, x_test, y_train); y_train is one-hot encoded with
        5 classes.
    """
    # Training data
    x_train = np.load(f'{data_dir}/x_train.npy')
    y_train = np.load(f'{data_dir}/y_train.npy')
    y_train = to_categorical(y_train[:, np.newaxis], num_classes=5)

    # Test data
    x_test = np.load(f'{data_dir}/x_test.npy')

    # Noise augmentation is intentionally not applied here; augwave() is the
    # helper for it and should only ever see the training split.

    if standardize:
        # Channel 0 is taken as a 2-D matrix for the scaler, then the result
        # is reshaped back to the original shape.
        # Assumes the arrays are (samples, timesteps, 1) - TODO confirm.
        # The scaler is fitted on the training data only and reused for test.
        scaler = StandardScaler()
        scaler.fit(x_train[:, :, 0])
        x_train = scaler.transform(x_train[:, :, 0]).reshape(x_train.shape)
        x_test = scaler.transform(x_test[:, :, 0]).reshape(x_test.shape)
        print("std: ", x_train.shape, x_test.shape)

    # Min-max scaling was judged unsuitable by the author because the true
    # maximum of the data is not known.
    return x_train, x_test, y_train

def augwave(x, y, n_list, aug_labels=(2, 3, 4), noise_width=0.8):
    """Oversample selected classes by appending noisy copies of their samples.

    For each (label, n) pair taken from ``aug_labels`` and ``n_list``, ``n``
    samples of that label are drawn with replacement, uniform noise in
    ``[-noise_width / 2, noise_width / 2)`` is added to them, and the noisy
    copies (with their labels) are appended after the original rows.

    Args:
        x: Sample array; the first axis indexes samples.
        y: One-hot label array aligned with ``x`` along the first axis.
        n_list: Number of noisy copies to create for each label.
        aug_labels: Class indices to augment, paired positionally with
            ``n_list``.
        noise_width: Total width of the uniform noise interval.

    Returns:
        Tuple (x, y) with the augmented rows stacked below the originals.
    """
    class_ids = np.argmax(y, axis=1)
    x_parts = [x]
    y_parts = [y]
    for label, n in zip(aug_labels, n_list):
        # Indices of the samples carrying the label to augment
        label_idx = np.where(class_ids == label)[0]
        if label_idx.size == 0:
            # np.random.choice raises on an empty pool; a class that is not
            # present simply cannot be augmented, so skip it.
            continue

        # Pick n source samples (with replacement) and perturb the copies
        indices = np.random.choice(label_idx, n, replace=True)
        tmp_x = x[indices]  # fancy indexing already yields a copy
        tmp_x += (np.random.rand(*tmp_x.shape) - 0.5) * noise_width

        x_parts.append(tmp_x)
        y_parts.append(y[indices])

    # Stack once at the end instead of re-copying the arrays per label
    x = np.vstack(x_parts)
    y = np.vstack(y_parts)
    print('augx: ',x.shape)
    print('augy: ',y.shape)
    return x, y

def fft(x, n_freq=100):
    """Return the one-sided amplitude spectrum of each periodically tiled signal.

    Channel 0 of every sample is repeated ``n_freq`` times along the time
    axis, transformed with the FFT, and the magnitudes are scaled by ``2 / N``
    where ``N`` is the tiled length. Every bin, including the DC bin, receives
    the same ``2 / N`` scaling.

    Args:
        x: Array indexed as (sample, time, channel); only channel 0 is used.
        n_freq: Number of times each signal is repeated before the transform.

    Returns:
        2-D array of shape (samples, N // 2 + 1) holding the scaled
        magnitudes of the non-negative frequency bins.

    Raises:
        ValueError: If ``n_freq`` is smaller than 1.
    """
    if n_freq < 1:
        raise ValueError('n_freq must be at least 1')

    # Repeat each signal side by side in one allocation
    tiled = np.tile(x[:, :, 0], (1, n_freq))
    N = tiled.shape[1]

    # FFT along the time axis; keep only the magnitude of the complex result
    F_abs = np.abs(np.fft.fft(tiled))

    # Scale magnitudes back to signal amplitude (divide by N, times 2)
    F_abs_amp = F_abs / N * 2
    return F_abs_amp[:, :N // 2 + 1]

# Load the (standardized) signals and one-hot labels used by every cell below.
x_train, x_test, y_train = load_dataset()

In [None]:
# Amplitude-spectrum features for the second model input; a trailing
# length-1 channel axis is appended so the arrays are 3-D like x_train.
fftx_train = np.expand_dims(fft(x_train), axis=-1)
fftx_test = np.expand_dims(fft(x_test), axis=-1)

In [None]:
# Sanity check: shapes of the raw and spectral inputs, plus the one-hot labels.
print(x_train.shape, x_test.shape, y_train.shape)
print(fftx_train.shape, fftx_test.shape)
print(y_train)

In [None]:
# Class distribution of the training labels, printed and drawn as a bar chart.
# Two axes are created but only ax[0] is drawn on; ax[1] stays empty.
fig, ax = plt.subplots(1,2, figsize=(10,5))
print(pd.Series(np.argmax(y_train, axis=1)).value_counts())
sns.countplot(pd.Series(np.argmax(y_train, axis=1)), ax=ax[0])

In [None]:
"""https://www.kaggle.com/rejpalcz/best-loss-function-for-f1-score-metric"""

def f1(y_true, y_pred):
    """Keras metric: mean F1 over the columns, using rounded predictions.

    Predictions are rounded to 0/1 first, then true positives, false
    positives and false negatives are summed along axis 0. The resulting F1
    values are averaged.

    Args:
        y_true: Ground-truth tensor (assumed one-hot, (batch, classes) -
            TODO confirm against the caller).
        y_pred: Predicted probability tensor of the same shape.

    Returns:
        Scalar tensor holding the averaged F1 value.
    """
    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true * y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1 - y_true) * y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true * (1 - y_pred), 'float'), axis=0)

    # K.epsilon() keeps the divisions finite when a column has no positives
    precision = tp / (tp + fp + K.epsilon())
    recall = tp / (tp + fn + K.epsilon())

    score = 2 * precision * recall / (precision + recall + K.epsilon())
    # Defensive: replace any NaN entries by 0 before averaging
    score = tf.where(tf.is_nan(score), tf.zeros_like(score), score)
    return K.mean(score)

def f1_loss(y_true, y_pred):
    """Differentiable F1-based loss: 1 minus the mean soft F1 over the columns.

    Unlike the ``f1`` metric, predictions are NOT rounded, so the
    true-positive / false-positive / false-negative sums along axis 0 are
    "soft" counts built from the raw probabilities.

    Args:
        y_true: Ground-truth tensor (assumed one-hot, (batch, classes) -
            TODO confirm against the caller).
        y_pred: Predicted probability tensor of the same shape.

    Returns:
        Scalar tensor ``1 - mean(F1)``; lower is better.
    """
    tp = K.sum(K.cast(y_true * y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1 - y_true) * y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true * (1 - y_pred), 'float'), axis=0)

    # K.epsilon() keeps the divisions finite when a column has no positives
    precision = tp / (tp + fp + K.epsilon())
    recall = tp / (tp + fn + K.epsilon())

    score = 2 * precision * recall / (precision + recall + K.epsilon())
    # Defensive: replace any NaN entries by 0 before averaging
    score = tf.where(tf.is_nan(score), tf.zeros_like(score), score)
    return 1 - K.mean(score)

def build_model():
    """Build and compile the two-input Conv1D + bidirectional LSTM classifier.

    One branch consumes the raw signal (shape taken from the module-level
    ``x_train``), the other the spectrum features (shape taken from the
    module-level ``fftx_train``). Each branch is a stack of
    Conv1D -> BatchNormalization -> ReLU blocks followed by a bidirectional
    CuDNNLSTM. The branch outputs are concatenated and fed to a softmax
    layer with ``y_train.shape[1]`` units.

    Returns:
        Model compiled with ``f1_loss``, the Adam optimizer and the metrics
        'accuracy' and ``f1``.
    """
    hid_dim = 32
    input1 = Input(x_train.shape[1:])
    input2 = Input(fftx_train.shape[1:])

    def cnn_lstm_layer(inputs, filter_size, kernel_size, layer_num=3):
        """Apply ``layer_num`` conv blocks and a bidirectional LSTM to ``inputs``."""
        # The layers are called on an existing tensor, so no input_shape is
        # passed: the shape comes from ``inputs`` for both branches.
        x = inputs
        for _ in range(layer_num):
            x = Conv1D(filter_size, kernel_size, padding='same', kernel_initializer='he_normal')(x)
            x = BatchNormalization()(x)
            x = Activation('relu')(x)
        return Bidirectional(CuDNNLSTM(hid_dim))(x)

    x1 = cnn_lstm_layer(input1, 32, 40)
    x2 = cnn_lstm_layer(input2, 32, 40)
    x = concatenate([x1, x2])
    out = Dense(y_train.shape[1], activation='softmax')(x)
    model = Model(inputs=[input1, input2], outputs=out)
    model.compile(loss=f1_loss, optimizer=Adam(), metrics=['accuracy', f1])
    return model

In [None]:
# 訓練に使用しないデータ x_val, y_val
# Hold-out split. The raw signals, spectrum features and labels are split in
# ONE call so the three arrays are guaranteed to stay row-aligned, instead of
# relying on two separate calls producing the same shuffle.
x_tra, x_val, fftx_tra, fftx_val, y_tra, y_val = train_test_split(
    x_train, fftx_train, y_train,
    test_size=0.33, random_state=SEED, stratify=y_train)
# Kept for backward compatibility: labels of the spectrum split are the same rows.
ffty_tra, ffty_val = y_tra, y_val
#x_tra, y_tra = augwave(x_tra, y_tra, AUG_NUM)
print(x_tra.shape, y_tra.shape)
print(fftx_tra.shape, fftx_val.shape)

In [None]:
# Class distribution of the training part of the hold-out split.
pd.Series(np.argmax(y_tra, axis=1)).value_counts()

In [None]:
# Train a single model on the hold-out split.
model = build_model()
# The best model (by the monitored metric) is written to this path.
modelpath = '/root/userspace/cnn_lstm_model.hdf5'
# NOTE(review): all three callbacks monitor 'val_acc'; this key must match the
# metric name logged by the installed Keras (some releases log 'val_accuracy') - confirm.
callbacks = [
        EarlyStopping(monitor='val_acc', patience=8, mode='max',min_delta=0.001),
        ReduceLROnPlateau(monitor='val_acc', factor=0.2, patience=4, mode='max', min_delta=0.001),
        ModelCheckpoint(filepath=modelpath, monitor='val_acc', save_best_only=True, mode='max')
    ]
# Both inputs (raw signal, spectrum) are passed as a list matching the model's two Input layers.
history = model.fit([x_tra, fftx_tra], y_tra, epochs=1000, batch_size=128, verbose=2,
                    validation_data=([x_val,fftx_val], y_val), callbacks=callbacks)
# Unused generator-based training variant, kept for reference only
# (it refers to names that are not defined in this cell).
# model.fit_generator(my_generator(x_tra, y_tra, batch_size),
#                     steps_per_epoch=len(tr_X)//batch_size,
#                     epochs=epochs,
#                     validation_data=my_generator(val_X, val_y, batch_size),
#                     validation_steps=len(val_X)//batch_size,
#                     validation_split=
#                     callbacks=callbacks,
#                     verbose=2)

# Unused single-model submission snippet, kept for reference only.
# NOTE(review): it passes only x_test, while the model above takes two inputs.
# y_pred = np.argmax(model.predict(x_test), 1)

# submission = pd.Series(y_pred, name='label')
# submission.to_csv('/root/userspace/submission.csv', header=True, index_label='id')

In [None]:
# 精度確認
y_pred = model.predict([x_val, fftx_val])
print('f1_macro: ', f1_score(np.argmax(y_val, axis=1), np.argmax(y_pred, axis=1), average='macro'))
print('acc: ', accuracy_score(np.argmax(y_val, axis=1), np.argmax(y_pred, axis=1)))
print(model.evaluate([x_train, fftx_train], y_train))

In [None]:
# 混同行列
# Confusion matrix of the validation predictions (rows: true class, columns: predicted class).
cm = confusion_matrix(np.argmax(y_val, axis=1), np.argmax(y_pred, axis=1))
sns.heatmap(cm, annot=True, cmap='Blues')

In [None]:
%%time

# cv
def run_cv(train, test, target, params=None, n_splits=5):
    """Run stratified k-fold training and collect per-fold test predictions.

    For every fold a fresh model is trained through ``run_model`` and its
    class predictions for the test set are stored in one column of the
    result. The spectrum features are taken from the module-level
    ``fftx_train`` using the same row indices as ``train``, so ``train`` is
    expected to be row-aligned with it.

    Args:
        train: Training signals; the first axis indexes samples.
        test: Test signals forwarded to ``run_model``.
        target: One-hot training labels; their argmax is used for stratification.
        params: Training settings forwarded to ``run_model`` (it reads
            'batch_size' and 'epochs'). The dict is copied, never mutated.
        n_splits: Number of folds.

    Returns:
        Array of shape (len(test), n_splits); column ``i`` holds the class
        labels predicted by the model of fold ``i``. Every column is a real
        prediction, so the array can be used directly for majority voting.
    """
    base_params = dict(params) if params else {}
    kf = StratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)
    acc_scores = []
    f1_scores = []
    # Exactly one column per fold: an extra all-zero column would count as a
    # spurious vote for class 0 when the columns are combined.
    results = np.zeros((test.shape[0], n_splits))

    fold_splits = kf.split(train, target.argmax(axis=1))
    for i, (tr_idx, val_idx) in enumerate(fold_splits):
        print(f'Start fold {i+1}/{n_splits}')
        tr_X = [train[tr_idx, :], fftx_train[tr_idx, :]]
        val_X = [train[val_idx, :], fftx_train[val_idx, :]]
        tr_y, val_y = target[tr_idx, :], target[val_idx, :]
        print(tr_X[0].shape, tr_X[1].shape)

        # Per-fold checkpoint path, set on a copy so the caller's dict is untouched
        fold_params = dict(base_params)
        fold_params['modelpath'] = f'/root/userspace/cnn_lstm_fft_cv{i}_model.hdf5'

        val_acc, val_f1, test_pred = run_model(tr_X, tr_y, val_X, val_y, test, fold_params)
        acc_scores.append(val_acc)
        f1_scores.append(val_f1)
        results[:, i] = test_pred
    print('mean acc: ', np.mean(acc_scores))
    print('mean F1 : ', np.mean(f1_scores))
    return results

# モデル予測実行
def run_model(tr_X, tr_y, val_X, val_y, test, params):
    """Train one model, report train/validation scores, and predict the test set.

    Args:
        tr_X: Training inputs passed straight to ``fit``.
        tr_y: One-hot training labels.
        val_X: Validation inputs.
        val_y: One-hot validation labels.
        test: Test signals; paired with the module-level ``fftx_test`` for prediction.
        params: Dict providing 'batch_size', 'epochs' and 'modelpath'.

    Returns:
        Tuple (validation accuracy, validation f1, predicted test class labels),
        all taken from the checkpoint reloaded from ``params['modelpath']``.
    """
    print('Train model')
    n_batch = params['batch_size']
    n_epochs = params['epochs']
    checkpoint_path = params['modelpath']

    # Training-data augmentation (disabled)
    #tr_X, tr_y = augwave(tr_X, tr_y, AUG_NUM)
    train_classes = np.argmax(tr_y, axis=1)
    print(pd.Series(train_classes).value_counts())

    net = build_model()
    # All callbacks watch the same quantity and treat larger values as better.
    watch = dict(monitor='val_acc', mode='max')
    fit_callbacks = [
        EarlyStopping(patience=8, min_delta=0.001, **watch),
        ReduceLROnPlateau(factor=0.2, patience=4, min_delta=0.001, **watch),
        ModelCheckpoint(filepath=checkpoint_path, save_best_only=True, **watch),
    ]
    net.fit(
        tr_X, tr_y,
        epochs=n_epochs,
        batch_size=n_batch,
        verbose=2,
        validation_data=(val_X, val_y),
        callbacks=fit_callbacks,
    )

    print('Pred 1/2')

    # Reload the checkpointed model; the custom loss/metric must be supplied by name
    best = load_model(checkpoint_path, custom_objects={'f1_loss': f1_loss, 'f1': f1})

    # Confusion matrix on the validation fold
    val_prob = best.predict(val_X)
    val_true_cls = np.argmax(val_y, axis=1)
    val_pred_cls = np.argmax(val_prob, axis=1)
    sns.heatmap(confusion_matrix(val_true_cls, val_pred_cls), annot=True, cmap='Blues')
    plt.show()

    tr_loss, tr_acc, tr_f1 = best.evaluate(tr_X, tr_y)
    val_loss, val_acc, val_f1 = best.evaluate(val_X, val_y)
    print(f'[Train] acc:{tr_acc}  loss:{tr_loss}  f1:{tr_f1}')
    print(f'[Val]   acc:{val_acc} loss:{val_loss} f1:{val_f1}')

    print('Pred 2/2')
    test_prob = best.predict([test, fftx_test])
    pred_test = np.argmax(test_prob, axis=1)
    return val_acc, val_f1, pred_test

# Training settings read by run_model(); run_cv() adds a per-fold 'modelpath' entry.
params = {'batch_size':128,
          'epochs':1000,}
# Cross-validated training; `results` holds the per-fold test predictions column-wise.
results = run_cv(x_train, x_test, y_train, params)

In [None]:
#print(results.shape)
submission = pd.DataFrame(results, dtype=int)
submission = submission.apply(lambda x: np.argmax(x.value_counts()), axis=1)
submission.head(20)

In [None]:
# Write the voted labels as a CSV with an 'id' index column and a 'label' column.
# NOTE(review): the task description above asks for a file named submission.csv;
# this writes cnn_lstm_fft.csv - rename before submitting if required.
submission.name = 'label'
submission.to_csv('/root/userspace/cnn_lstm_fft.csv', header=True, index_label='id')

In [None]:
# Distribution of the predicted labels in the final submission.
submission.value_counts()