情感分析

In [1]:
import tensorflow as tf
from tensorflow import keras
# Load the IMDb reviews bundled with Keras. The reviews are already
# preprocessed: each one is a sequence of integer word IDs (see the output of
# the next cell).
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()

In [2]:
# Peek at the first ten word IDs of the first training review.
X_train[0][:10]

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

上面的输出可以见得，文本已经经过预处理变成了numpy数组

预处理：删除了所有标点符号，单词转换成小写字母，用空格分隔，最后按词频建立索引；每个整数代表一个单词。

整数0、1、2表示填充令牌、序列开始令牌和未知单词

如果想要可视化单词，则可以进行解码（<font color='red'>解码方式不深究，复制就完事了</font>）


In [3]:
# Decode the first ten IDs of the first review back into words.
# The dataset IDs are the word-index values shifted by 3, because IDs 0, 1 and
# 2 are reserved for the padding, start-of-sequence and unknown-word tokens.
word_index = keras.datasets.imdb.get_word_index()
id_to_word = {}
for word, index in word_index.items():
    id_to_word[index + 3] = word
# Register the special tokens last so they always own IDs 0, 1 and 2.
id_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))
" ".join(id_to_word[token_id] for token_id in X_train[0][:10])

'<sos> this film was just brilliant casting location scenery story'

然而很多时候，数据并不是预先处理好的，必须自己分词、统一大小写、去除标点符号。

接下来使用 TensorFlow Datasets（tfds），以文本的形式加载原始的 IMDb 评论。

## 使用tensorflow提供的原始数据集

体验预处理的痛苦

In [4]:
import tensorflow_datasets as tfds
import numpy as np
# Load the raw-text IMDb reviews from TensorFlow Datasets.
# as_supervised=True makes each element a (review, label) pair; with_info=True
# additionally returns the dataset metadata (used below for the split sizes).
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)

In [5]:
print(datasets.keys())

# Number of examples in the train and test splits, read from the metadata.
splits = info.splits
train_size = splits["train"].num_examples
test_size = splits["test"].num_examples

# Show the first two raw reviews (truncated to 200 characters) with their
# labels. X_batch / y_batch stay as cell-level names because the cell that
# demonstrates preprocess() reuses them.
for X_batch, y_batch in datasets["train"].batch(2).take(1):
    reviews, labels = X_batch.numpy(), y_batch.numpy()
    for review, label in zip(reviews, labels):
        sentiment = "= Positive" if label else "= Negative"
        print("Review:", review.decode("utf-8")[:200], "...")
        print("Label:", label, sentiment)
        print()

dict_keys(['train', 'test', 'unsupervised'])
Review: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting  ...
Label: 0 = Negative

Review: I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However  ...
Label: 0 = Negative



In [6]:
def preprocess(X_batch, y_batch):
    """Tokenise a batch of raw reviews into a padded tensor of word tokens.

    Steps:
      1. Truncate each review to its first 300 bytes (``tf.strings.substr``
         counts bytes by default). The sentiment can usually be judged from
         the first sentence or two, so this should not hurt accuracy much.
      2. Replace ``<br />`` tags with a space, then replace every character
         that is not an ASCII letter or an apostrophe with a space.
      3. Split on whitespace and pad the shorter reviews with ``b"<pad>"`` so
         that every row of the batch has the same length.

    Args:
        X_batch: batch of raw review strings (string tensor).
        y_batch: the matching labels; passed through untouched.

    Returns:
        A ``(tokens, y_batch)`` tuple, where ``tokens`` is a dense string
        tensor of word tokens padded with ``b"<pad>"``.
    """
    truncated = tf.strings.substr(X_batch, 0, 300)
    without_breaks = tf.strings.regex_replace(truncated, rb"<br\s*/?>", b" ")
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    # Splitting yields rows of different lengths; to_tensor() pads them.
    ragged_tokens = tf.strings.split(letters_only)
    padded_tokens = ragged_tokens.to_tensor(default_value=b"<pad>")
    return padded_tokens, y_batch

In [7]:
# Try the preprocessing function on a sample batch. X_batch and y_batch are
# the variables left over from the loop in the previous cell.
preprocess(X_batch, y_batch)

(<tf.Tensor: shape=(2, 53), dtype=string, numpy=
 array([[b'This', b'was', b'an', b'absolutely', b'terrible', b'movie',
         b"Don't", b'be', b'lured', b'in', b'by', b'Christopher',
         b'Walken', b'or', b'Michael', b'Ironside', b'Both', b'are',
         b'great', b'actors', b'but', b'this', b'must', b'simply', b'be',
         b'their', b'worst', b'role', b'in', b'history', b'Even',
         b'their', b'great', b'acting', b'could', b'not', b'redeem',
         b'this', b"movie's", b'ridiculous', b'storyline', b'This',
         b'movie', b'is', b'an', b'early', b'nineties', b'US',
         b'propaganda', b'pi', b'<pad>', b'<pad>', b'<pad>'],
        [b'I', b'have', b'been', b'known', b'to', b'fall', b'asleep',
         b'during', b'films', b'but', b'this', b'is', b'usually', b'due',
         b'to', b'a', b'combination', b'of', b'things', b'including',
         b'really', b'tired', b'being', b'warm', b'and', b'comfortable',
         b'on', b'the', b'sette', b'and', b'having', b'j

接下来，需要构建词汇表：

        遍历整个训练集，应用preprocess()函数，并使用Counter对每个单词的次数进行计数

In [8]:
from collections import Counter
# Count how often every token occurs in the preprocessed training set.
# preprocess() pads with b"<pad>", so the padding token is counted as well.
vocabulary = Counter()
for X_batch, y_batch in datasets['train'].batch(32).map(preprocess):
    # Flatten the 2-D token array row by row and count the whole batch at once.
    vocabulary.update(X_batch.numpy().ravel().tolist())

In [9]:
# Show the three most frequent tokens together with their counts.
vocabulary.most_common(3)

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564)]

In [10]:
# The model probably does not need to know every word in the dictionary to
# perform well, so keep only the 10,000 most frequent tokens.
vocab_size = 10000
truncated_vocabulary = [
    token for token, _ in vocabulary.most_common(vocab_size)
]

In [11]:
# Each word now has to be replaced by its index in the vocabulary.
# Build a lookup table that maps every kept word to its position in
# truncated_vocabulary and sends all other words to one of 1000
# out-of-vocabulary (OOV) buckets.
num_oov_buckets = 1000
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
table = tf.lookup.StaticVocabularyTable(
    initializer=vocab_init,
    num_oov_buckets=num_oov_buckets,
)

In [12]:
# Use the table to look up the IDs of a few words.
# "This", "movie" and "was" are in the vocabulary, so their IDs are below the
# vocabulary size (10000). "faaaaaantastic" is not in the vocabulary, so it is
# mapped to one of the OOV buckets, whose IDs start at the vocabulary size;
# that is why its ID is 10053 in the recorded output.
table.lookup(tf.constant([b"This movie was faaaaaantastic".split()]))


<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10053]], dtype=int64)>

In [13]:
def encode_words(X_batch, y_batch):
    """Replace every token in the batch with its ID from the lookup table.

    Args:
        X_batch: string tensor of tokens, as produced by preprocess().
        y_batch: the matching labels; passed through untouched.

    Returns:
        A ``(word_ids, y_batch)`` tuple.
    """
    encoded_batch = table.lookup(X_batch)
    return encoded_batch, y_batch

# Training pipeline: batch -> tokenise and pad -> encode as IDs -> prefetch.
train_set = (
    datasets["train"]
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

# Sanity check: print the first encoded batch and its labels.
for X_batch, y_batch in train_set.take(1):
    print(X_batch)
    print(y_batch)

tf.Tensor(
[[  22   11   28 ...    0    0    0]
 [   6   21   70 ...    0    0    0]
 [4099 6881    1 ...    0    0    0]
 ...
 [  22   12  118 ...  331 1047    0]
 [1757 4101  451 ...    0    0    0]
 [3365 4392    6 ...    0    0    0]], shape=(32, 60), dtype=int64)
tf.Tensor([0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0], shape=(32,), dtype=int64)


In [14]:
# Build and train the baseline sentiment classifier.
embed_size = 128
model = keras.models.Sequential()
# Embedding layer: turns each word ID into an embedding vector. The embedding
# matrix has one row per word ID (vocabulary plus OOV buckets) and one column
# per embedding dimension; the number of dimensions is a hyperparameter.
# The model input is a 2-D tensor of shape [batch size, time steps]; the
# embedding output is a 3-D tensor of shape [batch size, time steps, embed size].
model.add(keras.layers.Embedding(
    input_dim=vocab_size + num_oov_buckets,
    output_dim=embed_size,
    input_shape=[None],
))
# First GRU returns its output at every time step to feed the next GRU.
model.add(keras.layers.GRU(128, return_sequences=True))
# Second GRU returns only the output of the last time step.
model.add(keras.layers.GRU(128))
# A single sigmoid unit outputs the estimated probability that the review
# expresses a positive sentiment.
model.add(keras.layers.Dense(1, activation="sigmoid"))
model.compile(
    loss="binary_crossentropy",
    optimizer='adam',
    metrics=['accuracy'],
)
history = model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [15]:
# Save the trained baseline model (built without masking) to an .h5 file.
model.save("sentiment_analysis.h5")

掩码屏蔽

就目前而言，模型必须自己学会忽略填充令牌；而我们事先已经知道这一点，不如直接告诉模型忽略填充令牌，让它专注于真正重要的数据

在创建嵌入时添加mask_zero=True，这意味着所有下游层都将忽略填充令牌

具体原理见p468

## 使用封装好的掩码超参数

In [16]:
# Same architecture as the baseline, but the Embedding layer is created with
# mask_zero=True so that the layers after it skip time steps whose ID is 0.
# ID 0 is the padding token here, because b"<pad>" is the most frequent entry
# in the vocabulary.
embed_size = 128
model = keras.models.Sequential()
model.add(keras.layers.Embedding(
    input_dim=vocab_size + num_oov_buckets,
    output_dim=embed_size,
    mask_zero=True,
    input_shape=[None],
))
model.add(keras.layers.GRU(128, return_sequences=True))
model.add(keras.layers.GRU(128))
model.add(keras.layers.Dense(1, activation="sigmoid"))
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
history = model.fit(train_set, epochs=5)
model.save("blackbox_mask.h5")


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## 手动构建掩码

In [17]:
# Functional-API version of the same network, with the mask built by hand and
# passed explicitly to both GRU layers.
K = keras.backend
embed_size = 128
inputs = keras.layers.Input(shape=[None])
# Boolean mask: True wherever the word ID is not 0 (0 is the padding ID).
mask = keras.layers.Lambda(lambda token_ids: K.not_equal(token_ids, 0))(inputs)
embedded = keras.layers.Embedding(
    input_dim=vocab_size + num_oov_buckets,
    output_dim=embed_size,
)(inputs)
sequence_states = keras.layers.GRU(128, return_sequences=True)(embedded, mask=mask)
z = keras.layers.GRU(128)(sequence_states, mask=mask)
outputs = keras.layers.Dense(1, activation="sigmoid")(z)
model = keras.models.Model(inputs=[inputs], outputs=[outputs])
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
history = model.fit(train_set, epochs=5)
model.save("manual_mask.h5")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
