# AIF_sprint17-rnn　RNN

RNN（リカレントニューラルネットワーク ）には時間（タイムステップ）の概念が導入されており、  
手前の時刻の出力が後の時刻の入力として使用される構造を持つ。これによって、時系列上の特徴を  
抽出することが可能となるため、文字列、音声、株価などの時系列データを扱うタスクに利用される。

以下では、IMDB映画レビューのデータセットを使用し、各レビューが肯定、否定のどちらであるかをRNNによって判定する。

## Kerasによる実装   
## simpleRNN

In [36]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Input, Dense, Embedding
from keras.models import Model
from keras.layers import SimpleRNN
from keras.datasets import imdb

max_features = 10000
maxlen = 40
batch_size = 32

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

inp = Input(shape=(maxlen,), dtype='int32', name='main_input')
x = Embedding(max_features, 128)(inp) # max_featuresを128次元に成形
simple_rnn_out = SimpleRNN(32)(x)
predictions = Dense(1, activation='sigmoid')(simple_rnn_out)
model = Model(inputs=inp, outputs=predictions)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=3,
          validation_data=(x_test, y_test))
loss, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test loss:', loss)
print('Test accuracy:', acc)

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test loss: 0.6505090194892883
Test accuracy: 0.76868


# GRU

In [37]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Input, Dense, Embedding
from keras.models import Model
from keras.layers import SimpleRNN,GRU
from keras.datasets import imdb

max_features = 10000
maxlen = 40
batch_size = 32

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

inp = Input(shape=(maxlen,), dtype='int32', name='main_input')
x = Embedding(max_features, 128)(inp) # max_featuresを128次元に成形
simple_rnn_out = GRU(32)(x)
predictions = Dense(1, activation='sigmoid')(simple_rnn_out)
model = Model(inputs=inp, outputs=predictions)

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=3,
          validation_data=(x_test, y_test))
loss, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test loss:', loss)
print('Test accuracy:', acc)

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test loss: 0.4630580368423462
Test accuracy: 0.80036


# LSTM

In [2]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Input, Dense, Embedding
from keras.models import Model
from keras.layers import SimpleRNN,LSTM
from keras.datasets import imdb

max_features = 10000
maxlen = 40
batch_size = 32

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

inp = Input(shape=(maxlen,), dtype='int32', name='main_input')
x = Embedding(max_features, 128)(inp) # max_featuresを128次元に成形
simple_rnn_out = LSTM(32)(x)
predictions = Dense(1, activation='sigmoid')(simple_rnn_out)
model = Model(inputs=inp, outputs=predictions)

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=3,
          validation_data=(x_test, y_test))
loss, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test loss:', loss)
print('Test accuracy:', acc)

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test loss: 0.4734947679901123
Test accuracy: 0.79388


## Kerasの中間層の出力を取得

In [3]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Input, Dense, Embedding
from keras.models import Model
from keras.layers import SimpleRNN,LSTM
from keras.datasets import imdb
from keras import backend as K


max_features = 10000
maxlen = 40
batch_size = 32

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

inp = Input(shape=(maxlen,), dtype='int32', name='main_input')
x = Embedding(max_features, 128)(inp) # max_featuresを128次元に成形
simple_rnn_out = LSTM(32)(x)
predictions = Dense(1, activation='sigmoid')(simple_rnn_out)
model = Model(inputs=inp, outputs=predictions)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
layers=[]
get_3rd_layer_output = K.function([model.layers[0].input],
                                  [model.layers[1].output])
layer_output = get_3rd_layer_output([x_train])[0]

In [71]:
model.layers[0].input

<tf.Tensor 'strided_slice_2:0' shape=() dtype=int32>

In [72]:
model.layers[1].output

<tf.Tensor 'embedding_8/GatherV2:0' shape=(?, 40, 128) dtype=float32>

In [66]:
layer_output

array([[[-0.03416888,  0.03783664,  0.00586157, ..., -0.01624478,
          0.00909306,  0.01922173],
        [-0.04722911,  0.04446899,  0.04739446, ..., -0.00340547,
         -0.04565128,  0.03332147],
        [-0.02866079, -0.02254005,  0.02716302, ...,  0.00278345,
          0.02685889,  0.01859263],
        ...,
        [ 0.03347677,  0.04959262,  0.00310228, ..., -0.00387122,
          0.01852157, -0.04137981],
        [-0.00931389,  0.00699956,  0.02615228, ...,  0.00101366,
         -0.04573927,  0.03486884],
        [ 0.01426223, -0.04787136,  0.03231819, ...,  0.03190733,
         -0.02121609,  0.0231463 ]],

       [[-0.00246792,  0.02797038,  0.03113807, ...,  0.02212362,
          0.02466423,  0.04999024],
        [ 0.02720055, -0.04736433,  0.01700575, ..., -0.0414017 ,
         -0.007948  ,  0.00824443],
        [ 0.03480778,  0.01760054,  0.01557399, ..., -0.01079752,
         -0.04466461, -0.03789822],
        ...,
        [-0.01962675, -0.01568997,  0.00439792, ...,  

In [58]:
layer_output[0,0,:]

array([-0.03416888,  0.03783664,  0.00586157, -0.02042237,  0.02980703,
        0.04513042, -0.02026241,  0.01383116, -0.00808274,  0.04402683,
        0.0141789 ,  0.02471377,  0.00486485, -0.02917048,  0.01988434,
       -0.00414156, -0.00351591,  0.00242301,  0.01064868,  0.02924028,
       -0.04600067,  0.00834206, -0.00471908,  0.02337712, -0.01399683,
        0.04838243,  0.04735166,  0.00734   ,  0.01291228,  0.04309101,
        0.03739926,  0.03233192, -0.0233871 ,  0.00374447,  0.00095449,
        0.02878969,  0.02251926, -0.00867373, -0.02138217, -0.01424947,
       -0.02910378, -0.03883444,  0.02741053, -0.0267715 , -0.01529577,
        0.00185541,  0.030097  ,  0.0069525 , -0.01310914, -0.01322677,
        0.00978652,  0.00373502,  0.01566556,  0.04557996,  0.02935301,
        0.02947242, -0.00891559, -0.00899905, -0.01780238,  0.02641508,
        0.01252932,  0.04723981, -0.01200227, -0.02359664, -0.00188351,
       -0.0159466 , -0.03588001, -0.02022603, -0.00789155,  0.00

# Chainer による実装

In [34]:
import argparse
import datetime
import json
import os

import chainer
from chainer import training
from chainer.training import extensions

import nets
from nlp_utils import convert_seq
import text_datasets

batchsize = 32
layer = 2
unit = 32
dropout = 0
epoch = 3
out = '/Users/h_t_mac_book_pro/NN/sprint17'

def main():
    current_datetime = '{}'.format(datetime.datetime.today())

    train, test, vocab = text_datasets.get_imdb()
    
    print('# train data: {}'.format(len(train)))
    print('# test  data: {}'.format(len(test)))
    print('# vocab: {}'.format(len(vocab)))
    n_class = len(set([int(d[1]) for d in train]))
    print('# class: {}'.format(n_class))

    train_iter = chainer.iterators.SerialIterator(train[:1000], batchsize)
    test_iter = chainer.iterators.SerialIterator(test[:1000], batchsize,
                                                 repeat=False, shuffle=False)
    # インスタンス生成
    Encoder = nets.RNNEncoder
    encoder = Encoder(n_layers=layer, n_vocab=len(vocab),
                      n_units=unit, dropout=dropout)
    model = nets.TextClassifier(encoder, n_class)

    # オプティマイザ
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

    # トレーナー
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer,
        converter=convert_seq)
    trainer = training.Trainer(updater, (epoch, 'epoch'), out=out)

    # 検証
    trainer.extend(extensions.Evaluator(
                   test_iter, model, converter=convert_seq))

    # ベストスコア記録
    record_trigger = training.triggers.MaxValueTrigger(
        'validation/main/accuracy', (1, 'epoch'))
    trainer.extend(extensions.snapshot_object(
        model, 'best_model.npz'),
        trigger=record_trigger)

    # loss , acc 
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss',
         'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))
    
    # 進捗バー表示
    trainer.extend(extensions.ProgressBar())

    # 訓練実施
    trainer.run()


if __name__ == '__main__':
    main()


read imdb
constract vocabulary based on frequency
# train data: 25000
# test  data: 25000
# vocab: 20000
# class: 2
epoch       main/loss   validation/main/loss  main/accuracy  validation/main/accuracy  elapsed_time
[J1           0.378264    0.0372162             0.993164       1                         178.308       
[J2           0.00765625  0.00132426            1              1                         341.392       
[J3           0.00112716  0.000803077           1              1                         502.959       
[J

# スクラッチによるsimpleRNNの実装

### データ生成関数

In [11]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Input, Dense, Embedding
from keras.models import Model
from keras.layers import SimpleRNN,LSTM
from keras.datasets import imdb
from keras import backend as K


def load_imdb(max_features = 10000, maxlen = 40,batch_size = 32):
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    inp = Input(shape=(maxlen,), dtype='int32', name='main_input')
    x = Embedding(max_features, 128)(inp) # max_featuresを128次元に成形
    simple_rnn_out = LSTM(32)(x)
    predictions = Dense(1, activation='sigmoid')(simple_rnn_out)
    model = Model(inputs=inp, outputs=predictions)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    layers=[]
    get_3rd_layer_output = K.function([model.layers[0].input],
                                      [model.layers[1].output])
    x_train = get_3rd_layer_output([x_train])[0]
    x_test = get_3rd_layer_output([x_test])[0]
    return x_train, y_train, x_test, y_test


### 勾配チェック関数

In [None]:
def grad_check(forward, backvalue):
    epsilon = 0.0001
    gradient_checker = (forward(x + epsilon) -
    forward(x - epsilon)) / (2 * epsilon)
    diff = np.abs(backvalue - gradient_checker)
    print(diff)

### 計算クラス

In [2]:
# 積
class MultiplyGate:
    def forward(self,W, x):
        return np.dot(W, x)

    def backward(self, W, x, dz):
        dW = np.asarray(np.dot(dz, x.T))
        dx = np.dot(np.transpose(W), dz)
        return dW, dx
    
# 和
class AddGate:
    def forward(self, x1, x2):
        return x1 + x2

    def backward(self, x1, x2, dz):
        dx1 = dz * np.ones_like(x1)
        dx2 = dz * np.ones_like(x2)
        return dx1, dx2
    
# アダマール
class AdaGate:
    def forward(self,x1, x2):
        return x1 * x2
    
    def backward(self, x1, x2, dz):
        dx1 = dz * x2
        dx2 = dz * x1
        return dx1, dx2

### Softmax(最終出力)

In [3]:
class Softmax:
    def predict(self, x):
        exp_scores = np.exp(x)
        return exp_scores / np.sum(exp_scores,axis=0)

    def loss(self, x, y):
        probs = self.predict(x)
        return -np.log(probs[y])

    def diff(self, x, y):
        probs = self.predict(x)
        probs[y] -= 1.0
        return probs

### Tanh(活性化関数)

In [None]:
class Tanh:
    def forward(self, x):
        return np.tanh(x)

    def backward(self, x, top_diff):
        output = self.forward(x)
        return (1.0 - np.square(output)) * top_diff

### simpleRNN layer

In [None]:
mulGate = MultiplyGate()
addGate = AddGate()
activation = Tanh()

class L:
    def forward(self, x, prev_s, U, W, V):
        self.mulu = mulGate.forward(U, x)
        self.mulw = mulGate.forward(W, prev_s)
        self.add = addGate.forward(self.mulw, self.mulu)
        self.s = activation.forward(self.add)  
        return self.add, self.s
    
    def backward(self, y, x, A, S, V, W, pred, num):
        
        diffs = pred - y
        # dV
        dV = np.dot(diffs , S[-1].T)
        # dSt
        pre_dSt = np.dot(V.T, diffs)
        dU = 0
        dW = 0
        start = x.shape[1]-1
        for i in range(start,start-num,-1):
            #dAi
            tanh = Tanh()
            dA = tanh.backward(A[i], pre_dSt)
            #dU
            dU += np.dot(dA, x[:,i,:])
            #dW
            dW += np.dot(dA, S[i-1].T)
            # dSi-1
            pre_dSt = np.dot(W.T, dA)      
            
        return dV, dU, dW        

### simple RNN model

In [29]:
from datetime import datetime
import sys

class RNNmodels:
    def __init__(self, word_dim, hidden_dim=100, truncate=4, batchsize=32, optimizer = 'sgd'):
        self.optimizer = optimizer
        self.batchsize = batchsize
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.truncate = truncate
        self.U = np.random.uniform(-np.sqrt(1. / word_dim), np.sqrt(1. / word_dim), \
                                   (hidden_dim, word_dim))
        self.W = np.random.uniform(-np.sqrt(1. / hidden_dim), np.sqrt(1. / hidden_dim),\
                                   (hidden_dim, hidden_dim))
        self.V = np.random.uniform(-np.sqrt(1. / hidden_dim), np.sqrt(1. / hidden_dim), \
                                   (2, hidden_dim))
        if self.optimizer == 'adam':
            self.m_U = np.zeros_like(self.U)
            self.v_U = np.zeros_like(self.U)
            self.m_W = np.zeros_like(self.W)
            self.v_W = np.zeros_like(self.W)
            self.m_V = np.zeros_like(self.V)
            self.v_V = np.zeros_like(self.V)
            self.beta1 = 0.88
            self.beta2 = 0.998
            self.adam_lr = 0.00095
        else:
            None
        self.optimizer = optimizer
        

    def forward_propagation(self, x):   
        T = x.shape[1]
        self.all_A = []
        self.all_S = []
        prev_s = np.zeros((hidden_dim, x.shape[0]))
        A = np.zeros((hidden_dim, x.shape[0]))
        S = np.zeros((hidden_dim, x.shape[0]))
        epsilon = 0.0001
        dU_c = 0
        dW_c = 0
        dV_c = 0
        for t in range(T):
            layer = L()
            A, S = layer.forward(x[:,t,:].T, prev_s, self.U, self.W, self.V)
            prev_s = S
            self.all_A.append(A)
            self.all_S.append(S)     
        return np.dot(self.V, S)
            
        
    def loss_and_acc(self, x_train, y_train, x_test, y_test):
        #forward
        train_out = self.forward_propagation(x_train)
        test_out = self.forward_propagation(x_test) 
        #predict
        Soft = Softmax()
        train_predict = Soft.predict(train_out)
        test_predict = Soft.predict(test_out)
        #acc
        train_predict2 = train_predict.copy()
        test_predict2 = test_predict.copy()
        train_predict2[train_predict2<0.5]=0
        test_predict2[test_predict2<0.5]=0
        train_predict2[0.5<=train_predict2]=1
        test_predict2[0.5<=test_predict2]=1
        self.train_acc = (train_predict2 * y_train).sum() / y_train.shape[1]
        self.test_acc = (test_predict2 * y_test).sum() / y_test.shape[1]
        #loss
        train_log = train_predict * y_train 
        self.train_loss = -np.log(train_log[np.where(1e-9<train_log)] +1e-8).sum() / y_train.shape[1]
        test_log = test_predict * y_test    
        self.test_loss = -np.log(test_log[np.where(1e-9<test_log)] +1e-8).sum() / y_test.shape[1]
    

    def update(self, dV, dU, dW, learning_rate):
        if self.optimizer == 'sgd':
            self.U -= learning_rate * dU 
            self.V -= learning_rate * dV 
            self.W -= learning_rate * dW 
            
        elif self.optimizer == 'adam':
            self.m_U = self.beta1 * self.m_U + (1- self.beta1) * dU
            self.v_U = self.beta2 * self.v_U + (1- self.beta2) * (dU * dU)
            m_hat_U = self.m_U / (1 - self.beta1)
            v_hat_U = self.v_U / (1 - self.beta2)
            self.U -= self.adam_lr * m_hat_U / (np.sqrt(v_hat_U) + 1e-8)
            
            self.m_V = self.beta1 * self.m_V + (1- self.beta1) * dV
            self.v_V = self.beta2 * self.v_V + (1- self.beta2) * (dV * dV)
            m_hat_V = self.m_V / (1 - self.beta1)
            v_hat_V = self.v_V / (1 - self.beta2)
            self.V -= self.adam_lr * m_hat_V / (np.sqrt(v_hat_V) + 1e-8)
            
            self.m_W = self.beta1 * self.m_W + (1- self.beta1) * dW
            self.v_W = self.beta2 * self.v_W + (1- self.beta2) * (dW * dW)
            m_hat_W = self.m_W / (1 - self.beta1)
            v_hat_W = self.v_W / (1 - self.beta2)
            self.W -= self.adam_lr * m_hat_W / (np.sqrt(v_hat_W) + 1e-8) 
        else:
            None          

            
    def trains(self, Xtrain, Ytrain, Xtest, Ytest, learning_rate=0.005, nepoch=100):
        iteration = len(Ytrain) // self.batchsize
        y_hot = np.zeros((2,self.batchsize))  
        pred1 = np.zeros((2,self.batchsize))
        #Yrain ,Ytestをone-hot化しておく
        Ytrain_hot = np.identity(2)[Ytrain].T
        Ytest_hot = np.identity(2)[Ytest].T
        Soft = Softmax()
        for epoch in range(nepoch):
            for itr in range(iteration):
                start = itr * batchsize
                x_batch = Xtrain[start:start + self.batchsize]
                y_batch = Ytrain[start:start + self.batchsize]
                y_hot = np.identity(2)[y_batch].T# (2, batchsize) 0行目が０ユニット用、1行目が１ユニット用
                # forward
                output = self.forward_propagation(x_batch)
                #predict
                pred1 = Soft.predict(output)
                layers = L()
                dV, dU, dW = layers.backward(y_hot, x_batch, self.all_A, self.all_S, self.V, \
                                             self.W, pred1, self.truncate)
                
                #update
                self.update(dV, dU, dW, learning_rate)
            
            # loss
            if (epoch==0) or ((epoch % 10) == 9):   
                self.loss_and_acc(Xtrain, Ytrain_hot, Xtest, Ytest_hot)
                print("*"*50)
                print("epoch",epoch + 1)
                print("train : loss {:.4} ,acc {:.4}".format(self.train_loss, self.train_acc))
                print("test : loss {:.4} ,acc {:.4}".format(self.test_loss, self.test_acc))

### 実施

In [31]:
from keras.datasets import imdb
import numpy as np
word_dim = 128
hidden_dim = 64
bptt_truncate = 4
batchsize = 32
train_size =7000
test_size=3000
learning_rate=0.002
nepoch=30
np.random.seed(15) #シード固定

# データ取得
x_train, y_train, x_test, y_test=load_imdb(max_features = 10000, maxlen = 40)
#モデルインスタンス生成
rnn = RNNmodels(word_dim, hidden_dim ,bptt_truncate, batchsize, optimizer='adam')
#訓練・検証実施
rnn.trains(x_train[:train_size], y_train[:train_size],x_test[:test_size], y_test[:test_size],\
           learning_rate, nepoch)

load dataset
padding
build model
**************************************************
epoch 1
train : loss 0.6628 ,acc 0.6151
test : loss 0.6677 ,acc 0.5893
**************************************************
epoch 10
train : loss 0.6028 ,acc 0.667
test : loss 0.6594 ,acc 0.6123
**************************************************
epoch 20
train : loss 0.5963 ,acc 0.6724
test : loss 0.6683 ,acc 0.6137
**************************************************
epoch 30
train : loss 0.595 ,acc 0.6737
test : loss 0.6737 ,acc 0.612
