# RNN



## 1. RNN Review

In [2]:
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True' # Keeps the notebook from exiting during execution

## 2. SimpleRNN

下面我们介绍如何使用SimpleRNN 来做情感预测

### Step 1: 读取IMDB数据

In [3]:
import glob
import os

from random import shuffle

def pre_process_data(filepath):
    """
    Load a labelled sentiment corpus from disk and return it in random order.

    ``filepath`` is expected to contain two sub-directories, ``pos`` and
    ``neg``; every ``*.txt`` file inside them is read as one sample.

    Args:
        filepath: Directory that holds the ``pos`` and ``neg`` folders.

    Returns:
        A shuffled list of ``(label, text)`` tuples, where ``label`` is 1 for
        files found under ``pos`` and 0 for files found under ``neg``.
    """
    labelled_dirs = (
        (1, os.path.join(filepath, 'pos')),
        (0, os.path.join(filepath, 'neg')),
    )

    dataset = []
    for label, directory in labelled_dirs:
        for filename in glob.glob(os.path.join(directory, '*.txt')):
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's default encoding.
            with open(filename, 'r', encoding='utf-8') as f:
                dataset.append((label, f.read()))

    # Mix positive and negative samples so a later positional split is not
    # ordered by label.
    shuffle(dataset)

    return dataset



In [4]:
# Directory handed to pre_process_data below.
# NOTE(review): this is an absolute path on one specific machine; adjust it before running elsewhere.
imdb_datasets = '/Users/chenwang/Workspace/datasets/IMDB/aclImdb/train'

In [5]:
# Shuffled list of (label, text) tuples read from the pos/neg folders.
dataset = pre_process_data(imdb_datasets)

### Step 2. Tokenization and vectorization

In [6]:
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors

In [7]:
# Load pretrained word2vec vectors from the binary file; `limit` caps how many vectors are read.
# NOTE(review): absolute, machine-specific path.
word_vectors = KeyedVectors.load_word2vec_format('/Users/chenwang/Workspace/datasets/GoogleNews-vectors-negative300.bin.gz', binary=True, limit=200000)



In [8]:
def tokenize_and_vectorize(dataset):
    """
    Turn the text of every sample into a list of word vectors.

    Args:
        dataset: Iterable of samples where ``sample[1]`` is the review text
            (``sample[0]``, the label, is ignored here).

    Returns:
        A list with one entry per sample. Each entry is the list of vectors
        looked up in the module-level ``word_vectors`` for that sample's
        tokens. Tokens that raise ``KeyError`` on lookup are skipped, so an
        entry can be shorter than the token count, or empty.
    """
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sample in dataset:
        # sample is a 2-tuple: sample[0] is the label, sample[1] is the review text
        tokens = tokenizer.tokenize(sample[1])

        sample_vecs = []
        for token in tokens:
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                # No matching token in the Google w2v vocab
                continue

        # Result is a list (samples) of lists (vectors)
        vectorized_data.append(sample_vecs)

    return vectorized_data

In [9]:
def collect_expected(dataset):
    """ Peel off the target value (first element) of every sample in the dataset """
    return [sample[0] for sample in dataset]

In [10]:
# Features: one list of word vectors per review. Targets: the matching labels, in the same order.
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

In [11]:
# Sanity check: number of vectorized samples
len(vectorized_data)

25000

In [12]:
# Sanity check: number of labels (should equal the number of samples)
len(expected)

25000

### Step 3. split training and testing sets

已经shuffle 过，所以直接取前80%elements 作为training set，剩余的作为testing set.

In [13]:
# Positional 80/20 split: everything before split_point trains, the rest tests.
split_point = int(len(vectorized_data)*.8)

x_train, x_test = vectorized_data[:split_point], vectorized_data[split_point:]
y_train, y_test = expected[:split_point], expected[split_point:]

### Step 4. Set up RNN parameters



这里我们选50个neuron 主要是为了reduce computation time.

In [52]:
maxlen = 400            # Number of token vectors every sample is padded or truncated to
batch_size = 32         # How many samples to show the net before backpropagating the error and updating the weights
embedding_dims = 300    # Length of the token vectors passed into the RNN
epochs = 2              # Number of passes over the training set
num_neurons = 50        # Number of units in the SimpleRNN layer


### Step 5. preprocessing (padding & truncate)

In [15]:
def pad_trunc(data, maxlen, vector_len=None):
    """
    For a given dataset pad with zero vectors or truncate to maxlen.

    The input is never modified: every returned sample is a new list.

    Args:
        data: Sequence of samples, each a sequence of word vectors.
        maxlen: Exact number of vectors every returned sample will contain.
        vector_len: Length of the zero vector used for padding. When omitted
            it is inferred from the first vector of the first non-empty
            sample; if every sample is empty it falls back to 0.

    Returns:
        A list of samples, each a list of exactly ``maxlen`` vectors. Longer
        samples keep their first ``maxlen`` vectors; shorter ones are
        right-padded. All padding slots refer to the same zero-vector object.
    """
    if vector_len is None:
        # Do not rely on data[0][0]: the dataset may be empty, or the first
        # sample may contain no known tokens at all.
        vector_len = next(
            (len(sample[0]) for sample in data if len(sample) > 0), 0
        )

    zero_vector = [0.0] * vector_len

    new_data = []
    for sample in data:
        # Slicing truncates and copies, so the caller's sample is untouched
        # when padding is appended below.
        temp = list(sample[:maxlen])
        temp.extend([zero_vector] * (maxlen - len(temp)))
        new_data.append(temp)
    return new_data

In [29]:
# Bring every sample to exactly maxlen vectors (note: rebinds x_train / x_test).
x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)



In [30]:
# Convert the nested lists to arrays of shape (n_samples, maxlen, embedding_dims)
# and the labels to 1-D arrays.
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))  # 20000 * 400 * 300
y_train = np.array(y_train)

x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)

In [51]:
x_train.shape  # 20000 reviews, 400 tokens per review, each token a vector of length 300

(20000, 400, 300)

### Step 6. 定义model

- input layer:
    - 300 个neurons，对应长度为300的word embedding

- SimpleRNN layer:
    - 50个neurons
    - `return_sequences=True`: 中间结果不丢掉

- dropout layer:

- flatten:
    - 最后的dense 层需要一个flat vector 作为输入，而SimpleRNN 的输出是一个400 * 50 的tensor

⚠️ 通常RNN 不需要每个输入都有相同的长度，但是在这个例子里需要这样做，原因是我们把所有中间的output 都要传给feedforward network，而feedforward network 需要一个uniformly sized input。因此flatten 之后接了一个dense 层，我们需要明确知道dense 层的维度。所以我们在之前还是做了padding & truncate。
    
- dense (fully connected) layer:
    - 1 个neuron，输出0/1

In [19]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, SimpleRNN

print('Build model...')
model = Sequential()

# return_sequences=True keeps the output of every time step, so each sample
# yields a (maxlen, num_neurons) tensor instead of only the last step's output.
model.add(SimpleRNN(num_neurons, return_sequences=True, input_shape=(maxlen, embedding_dims)))
model.add(Dropout(.2))

# Flatten the per-step outputs into one vector for the single sigmoid unit.
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile('rmsprop', 'binary_crossentropy',  metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_1 (SimpleRNN)     (None, 400, 50)           17550     
_________________________________________________________________
dropout_1 (Dropout)          (None, 400, 50)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 20000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 20001     
Total params: 37,551
Trainable params: 37,551
Non-trainable params: 0
_________________________________________________________________
None


#### 参数分析

我们可以看出，一共有37551 个参数：
- first time step of the first layer: 
    - input layer: 300 neurons (对应第一个单词的word embedding)
    - RNN layer：50 neurons 
    - 一共是15000个参数 + 50个bias = 15050（这些权重在所有time step 之间共享，已包含在下面的17550 中，不会重复计数）
- 从第2步到第400步 of the first layer:
    - 每个neuron：
        - input layer: 300 neurons (对应t时刻的word embedding)
        - RNN: 50 neurons (from t-1 时刻的输出)
        - bias: 1 neurons
        - 一共是351 个参数
    - 一共有50个neurons：
        - 50*351 = 17550
- output layer:
    - flatten 的输出：20000
    - bias: 1
    - 一共20001 个参数

### Step 7. Train the model

In [20]:
# Train the SimpleRNN model; the held-out x_test / y_test split is used as validation data.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1a3f6c5748>

### Step 8. Save the model


In [21]:
# Persist the architecture (JSON) and the trained weights (HDF5) as two separate files.
model_structure = model.to_json()
with open("simplernn_model1.json", "w") as json_file:
    json_file.write(model_structure)

model.save_weights("simplernn_weights1.h5")
print('Model saved.')

Model saved.


### Step 9. Load the model


In [None]:
# Left commented out: rebuilds the model from the JSON + weights files written in Step 8.
# from keras.models import model_from_json

# with open("simplernn_model1.json", "r") as json_file:
#     json_string = json_file.read()
# model = model_from_json(json_string)

# model.load_weights('simplernn_weights1.h5')

### Step 10. Prediction

In [23]:
sample_1 = "I'm hate that the dismal weather that had me down for so long, when will it break! Ugh, when does happiness return?  The sun is blinding and the puffy clouds are too thin.  I can't wait for the weekend."

# The helper expects (label, text) tuples because of how the training data was processed,
# so we pass a dummy label as the first element. It never reaches the network, so its value is irrelevant.
vec_list = tokenize_and_vectorize([(1, sample_1)])

# tokenize_and_vectorize returns a list of samples (length 1 here); pad/truncate it like the training data.
test_vec_list = pad_trunc(vec_list, maxlen)

# Shape (1, maxlen, embedding_dims), matching the model's input_shape.
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))


In [24]:
# Sequential.predict_classes is no longer available in recent Keras/TensorFlow releases.
# For a single sigmoid output, thresholding the predicted probability at 0.5 yields
# the same 0/1 class labels.
(model.predict(test_vec) > 0.5).astype('int32')


array([[0]], dtype=int32)

## 3. Improvement

### 3.1 Hyperparam tuning

- `maxlen`
    - 因为padding 和 truncate，带来了很多噪音
    - maxlen 可以看作调整信噪比

- `embedding_dims`: 这个是词向量的维度，我们不做更改

- `batch_size`: 
    - 越大训练越快，因为backpropagation 的次数减少
    - 增大有可能converge 到local minimum
 
- `epochs`
    - 可以先设定一个较小的值，结果如果还不太好，就再训练一个epoch，keras 允许继续训练
    - callback: EarlyStopping
        - 当validation accuracy 连续几个epoch 都没有提升时，提前停止训练
        
- `num_neurons`: RNN 的neuron 个数，可调

- `dropout`
    - 通常0.2 - 0.5 是一个safe range


### 3.2 Backwards RNN

SimpleRNN 中有一个`go_backwards` 参数，默认设置为`False`。如果设置成`True`，则会把所有序列倒过来，输入到RNN 中。

这对我们的预测效果会更好，因为RNN is more receptive to data at the end of the sample than at the beginning. 而我们很多documents 最后都加了padding，所以相当于是浪费了这个特性。

In [53]:
print('Build model...')
model_0 = Sequential()

# Same architecture as the first model, except go_backwards=True so the RNN
# processes each input sequence in reverse order.
model_0.add(SimpleRNN(num_neurons, return_sequences=True, go_backwards=True, input_shape=(maxlen, embedding_dims)))

model_0.add(Dropout(.2))

model_0.add(Flatten())
model_0.add(Dense(1, activation='sigmoid'))

model_0.compile('rmsprop', 'binary_crossentropy',  metrics=['accuracy'])


Build model...


In [54]:
# Train the backwards RNN on the same padded arrays and validation split as the first model.
model_0.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1a5a62b128>

从上面的结果可以看出，使用backwards RNN，2个epoch 之后出现了overfitting。

### 3.3 Bi-directional RNN

<img src="img/bi_directional_rnn.png" alt="drawing" width="700"/>



In [42]:


# Hyperparameters for the bi-directional model.
num_neurons_bi = 10   # Units per direction in the wrapped SimpleRNN
# NOTE(review): x_train / x_test were padded to maxlen (400), not to maxlen_bi (100) —
# confirm whether the data was meant to be re-padded for this experiment.
maxlen_bi = 100
embedding_dims = 300  # Re-assigned with the same value as in Step 4


In [43]:
from keras.layers.wrappers import Bidirectional

print('Build model...')
model_1 = Sequential()

# Bidirectional wraps the SimpleRNN so the sequence is processed in both directions.
# NOTE(review): input_shape is passed to the inner SimpleRNN rather than to the Bidirectional
# wrapper, and it uses maxlen_bi while the training arrays have maxlen time steps — confirm
# which input length is actually in effect.
model_1.add(Bidirectional(SimpleRNN(num_neurons_bi, return_sequences=True, input_shape=(maxlen_bi, embedding_dims))))
model_1.add(Dropout(.2))

model_1.add(Flatten())
model_1.add(Dense(1, activation='sigmoid'))

Build model...


In [44]:
# Same optimizer, loss and metric as the earlier models.
model_1.compile('rmsprop', 'binary_crossentropy',  metrics=['accuracy'])


In [45]:
# Train the bi-directional model on the same arrays as the earlier models.
# NOTE(review): these arrays are padded to maxlen (400), not maxlen_bi — confirm this is intended.
model_1.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x10d035320>

我们可以看出，双向RNN 的performance 要略好于SimpleRNN，但是因为正向RNN 加了很多噪音，所以结果仍然不是很理想。

### 3.4 不使用padding

to_check...