In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Record the interpreter and library versions this notebook was run with.
print(tf.__version__)
print(sys.version_info)
versioned_modules = (mpl, np, pd, sklearn, tf, keras)
for lib in versioned_modules:
    print(lib.__name__, lib.__version__)

In [None]:
# IMDB movie-review sentiment dataset bundled with Keras.
imdb = keras.datasets.imdb

# Vocabulary cap passed to load_data as num_words: only the most frequent
# words keep their own id.
# NOTE(review): per the Keras docs, rarer words are replaced by the
# out-of-vocabulary id rather than removed from the reviews - confirm.
vocab_size = 10000

# Offset added to every word id. The raw word ids start at 1, so shifting
# them by 3 frees ids 0-3 for the special tokens used in this notebook:
# 0: <PAD>, 1: <START>, 2: <UNK>, 3: <END>.
index_from = 3

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=vocab_size,
    index_from=index_from,
)

In [None]:
# Load the word -> id vocabulary that ships with the dataset.
word_index = imdb.get_word_index()
print(len(word_index))

# Show a single (word, id) entry as a sample. Ids in this raw mapping start
# at 1 (the: 1, and: 2, ...).
for word, word_id in word_index.items():
    print(word, word_id)
    break

In [None]:
# Look up the id of one specific word; nothing is printed if it is absent.
if "fawn" in word_index:
    print(word_index["fawn"])

In [None]:
# Shift every word id right by `index_from` so the vocabulary matches the ids
# produced by imdb.load_data(..., index_from=index_from).
# The shifted mapping is built from a fresh imdb.get_word_index() call rather
# than from `word_index` itself, so re-running this cell does not shift the
# ids a second time.
word_index = {
    word: word_id + index_from
    for word, word_id in imdb.get_word_index().items()
}

In [None]:
# Register the four special tokens in the slots freed by the id shift.
word_index.update({
    '<PAD>': 0,    # padding filler
    '<START>': 1,  # start-of-sequence marker
    '<UNK>': 2,    # unknown / out-of-vocabulary word
    '<END>': 3,    # end-of-sequence marker
})

# Invert the mapping (id -> word) so encoded reviews can be decoded.
reverse_word_index = {word_id: word for word, word_id in word_index.items()}


In [None]:

def decode_review(text_ids, id_to_word=None):
    """Turn a sequence of word ids back into a space-separated string.

    Args:
        text_ids: Iterable of integer word ids (one encoded review).
        id_to_word: Optional mapping from word id to word. When omitted, the
            notebook-level ``reverse_word_index`` is used.

    Returns:
        The decoded words joined by single spaces; ids missing from the
        mapping are rendered as "<UNK>".
    """
    if id_to_word is None:
        # Resolved at call time so the notebook's current mapping is used.
        id_to_word = reverse_word_index
    return ' '.join(id_to_word.get(word_id, "<UNK>") for word_id in text_ids)

In [None]:
# Decode the first training review back to text as a sanity check of the
# id -> word mapping; being the cell's last expression, the result is shown
# as the cell output.
decode_review(train_data[0])

In [None]:
# Spot-check two individual ids against the reversed vocabulary.
sample_ids = (14, 22)
print(*(reverse_word_index[word_id] for word_id in sample_ids))

In [None]:
# Reverse lookup: list every token whose id is 0.
zero_id_words = [word for word, word_id in word_index.items() if word_id == 0]
for word in zero_id_words:
    print(word)

In [None]:
# Every review is forced to exactly this many ids: longer ones are truncated,
# shorter ones are padded.
max_length = 500

# Settings shared by the train and test padding calls.
# NOTE(review): `truncating` is left at its Keras default, which per the docs
# cuts ids from the start of an over-long review - confirm this is intended.
pad_options = dict(
    value=word_index['<PAD>'],  # fill with the <PAD> id (0)
    padding='post',             # 'post' pads after the review, 'pre' before it
    maxlen=max_length,
)

# Both inputs are lists of variable-length id lists.
train_data = keras.preprocessing.sequence.pad_sequences(train_data, **pad_options)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, **pad_options)

print(train_data[0])

In [None]:
# Model 1: single-layer, unidirectional LSTM.
embedding_dim = 16  # each word id is embedded as a 16-dimensional vector
batch_size = 128

# Embedding shapes, step by step:
#   1. the layer holds a matrix of shape [vocab_size, embedding_dim];
#   2. one review [1, 2, 3, 4, ...] becomes max_length * embedding_dim;
#   3. one batch becomes batch_size * max_length * embedding_dim.
# Embedding walkthrough: https://www.jianshu.com/p/b2c33d7e56a5
#
# NOTE(review): the inputs are post-padded and the Embedding does not set
# mask_zero, so the LSTM presumably also steps over the trailing <PAD> ids -
# worth checking as a cause of the low accuracy of this model.
single_rnn_layers = [
    keras.layers.Embedding(vocab_size, embedding_dim,
                           input_length=max_length),
    # return_sequences=True would return the output of every unrolled RNN
    # step; return_sequences=False returns only the last output.
    keras.layers.LSTM(units=64, return_sequences=False),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid"),
]
single_rnn_model = keras.models.Sequential(single_rnn_layers)

single_rnn_model.summary()
# binary_crossentropy: cross-entropy loss for two-class classification.
single_rnn_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the single-layer model; 20% of the training data is held out for
# validation, and the per-epoch metrics are kept for plotting.
single_rnn_history = single_rnn_model.fit(
    x=train_data,
    y=train_labels,
    batch_size=batch_size,
    epochs=30,
    validation_split=0.2,
)

In [None]:
def plot_learning_curves(history, label, epochs, min_value, max_value):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value lists (here: the value returned by ``model.fit``).
        label: Metric name, e.g. 'accuracy' or 'loss'; the series stored
            under 'val_' + label is plotted alongside it.
        epochs: Upper limit of the x axis.
        min_value: Lower limit of the y axis.
        max_value: Upper limit of the y axis.
    """
    curves = {
        label: history.history[label],
        'val_' + label: history.history['val_' + label],
    }
    # DataFrame.plot returns the Axes it drew on; configure that Axes
    # explicitly instead of relying on pyplot's implicit "current axes".
    ax = pd.DataFrame(curves).plot(figsize=(8, 5))
    # Title and axis labels let the figure be read on its own.
    ax.set_title(label)
    ax.set_xlabel('epoch')
    ax.set_ylabel(label)
    ax.grid(True)
    ax.axis([0, epochs, min_value, max_value])
    plt.show()
    
# Accuracy and loss curves for the single-layer unidirectional model.
for metric in ('accuracy', 'loss'):
    plot_learning_curves(single_rnn_history, metric, 30, 0, 1)

- 对于二分类问题，这样的准确率太低，模型几乎没有学到有效信息
- 单层单向的RNN效果不理想（可能的原因：输入采用post填充且未使用mask，模型在最后一个时间步之前读入了大量PAD，有待验证）

In [None]:
# Score the single-layer unidirectional model on the test set.
single_rnn_model.evaluate(
    x=test_data,
    y=test_labels,
    batch_size=batch_size,
)

# 多层双向rnn

In [None]:
# Model 2: two stacked bidirectional LSTM layers.
embedding_dim = 16  # each word id is embedded as a 16-dimensional vector
batch_size = 128

# Embedding shapes, step by step:
#   1. the layer holds a matrix of shape [vocab_size, embedding_dim];
#   2. one review [1, 2, 3, 4, ...] becomes max_length * embedding_dim;
#   3. one batch becomes batch_size * max_length * embedding_dim.
# Embedding walkthrough: https://www.jianshu.com/p/b2c33d7e56a5
stacked_bi_rnn_layers = [
    keras.layers.Embedding(vocab_size, embedding_dim,
                           input_length=max_length),
    # return_sequences=True returns the output of every unrolled RNN step,
    # which the next recurrent layer consumes as its input sequence.
    keras.layers.Bidirectional(
        keras.layers.LSTM(units=64, return_sequences=True)),
    # return_sequences=False returns only the last output.
    keras.layers.Bidirectional(
        keras.layers.LSTM(units=64, return_sequences=False)),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid"),
]
model = keras.models.Sequential(stacked_bi_rnn_layers)

model.summary()
# binary_crossentropy: cross-entropy loss for two-class classification.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the stacked bidirectional model; 20% of the training data is held
# out for validation, and the per-epoch metrics are kept for plotting.
history = model.fit(
    x=train_data,
    y=train_labels,
    batch_size=batch_size,
    epochs=30,
    validation_split=0.2,
)

In [None]:
# plot_learning_curves is defined in an earlier cell of this notebook; it is
# reused here instead of being defined a second time, so there is only one
# copy of the plotting logic to maintain.
# Accuracy and loss curves for the stacked bidirectional model.
for metric in ('accuracy', 'loss'):
    plot_learning_curves(history, metric, 30, 0, 1)

### 双层双向RNN
- 存在严重的过拟合现象，可能是模型过于复杂

# 单层双向rnn

In [None]:
# Model 3: a single bidirectional LSTM layer.
embedding_dim = 16  # each word id is embedded as a 16-dimensional vector
batch_size = 128

# Embedding shapes, step by step:
#   1. the layer holds a matrix of shape [vocab_size, embedding_dim];
#   2. one review [1, 2, 3, 4, ...] becomes max_length * embedding_dim;
#   3. one batch becomes batch_size * max_length * embedding_dim.
# Embedding walkthrough: https://www.jianshu.com/p/b2c33d7e56a5
bi_rnn_layers = [
    keras.layers.Embedding(vocab_size, embedding_dim,
                           input_length=max_length),
    # Bidirectional recurrent layer. return_sequences=True would return the
    # output of every unrolled RNN step; return_sequences=False returns only
    # the last output.
    keras.layers.Bidirectional(
        keras.layers.LSTM(units=32, return_sequences=False)),
    # Fully connected layer.
    keras.layers.Dense(32, activation="relu"),
    # Output layer.
    keras.layers.Dense(1, activation="sigmoid"),
]
bi_rnn_model = keras.models.Sequential(bi_rnn_layers)

bi_rnn_model.summary()
# binary_crossentropy: cross-entropy loss for two-class classification.
bi_rnn_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the single bidirectional model; 20% of the training data is held out
# for validation, and the per-epoch metrics are kept for plotting.
bi_rnn_history = bi_rnn_model.fit(
    x=train_data,
    y=train_labels,
    batch_size=batch_size,
    epochs=30,
    validation_split=0.2,
)

In [None]:
# Accuracy and loss curves for the single bidirectional model.
for metric in ('accuracy', 'loss'):
    plot_learning_curves(bi_rnn_history, metric, 30, 0, 1)

- 在此模型中，大约从第5个epoch以后val_loss急剧上升，说明循环神经网络的过拟合非常明显；这是因为RNN的拟合能力相对于该数据集过强
- 缓解方法一：降低模型尺寸（两层 --> 单层）
- 缓解方法二：加入正则化项、dropout
- 使用更强大的RNN --> LSTM