# 简单的RNN实作

## 程式参考来源：
- https://keras.io/api/layers/core_layers/embedding/
- https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
- https://keras.io/guides/working_with_rnns/


## 载入相关套件

In [1]:
# 载入相关套件
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

## 嵌入层测试

In [4]:
# Build a model made of a single Embedding layer:
# vocabulary size 1000, every index is mapped to a 64-dimensional vector.
model = tf.keras.Sequential([
    layers.Embedding(input_dim=1000, output_dim=64),
])

# Random integer input: 32 samples of 10 indices each, drawn from [0, 1000)
input_array = np.random.randint(low=0, high=1000, size=(32, 10))

# Optimizer and loss function
model.compile(optimizer='rmsprop', loss='mse')

# Run the input through the layer, then show the output shape and first sample
output_array = model.predict(input_array)
print(output_array.shape)
output_array[0]

(32, 10, 64)


array([[-4.09067757e-02,  4.13169377e-02,  3.79419327e-03,
        -8.12249258e-03,  3.98785211e-02,  4.84695174e-02,
        -3.52774151e-02, -2.07844265e-02, -3.32484469e-02,
         1.15686059e-02, -2.05504298e-02,  4.01307456e-02,
        -3.33517343e-02,  4.53372933e-02, -1.14959478e-02,
        -3.42349410e-02, -2.31464747e-02,  4.93111499e-02,
         3.65070440e-02,  1.29793398e-02,  3.98182534e-02,
        -4.83712554e-02,  2.58997716e-02,  3.76032479e-02,
         4.48194407e-02, -3.18442471e-02,  1.50911510e-05,
         4.13540117e-02, -1.83008537e-02, -3.48059647e-02,
         4.89773043e-02, -2.05516815e-04,  6.68109581e-03,
         2.11245939e-03, -4.50933240e-02,  7.08359480e-03,
        -3.61134633e-02, -3.95359285e-02,  4.99451868e-02,
         4.33850288e-03, -2.35689413e-02,  4.22668122e-02,
         2.27580108e-02,  2.43217610e-02, -3.27639952e-02,
        -3.15042622e-02, -2.30477341e-02,  3.61427777e-02,
        -4.26368602e-02, -4.84775379e-03, -3.10918931e-0

## 使用真实的资料转换

In [8]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample sentences
docs = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
    'Weak',
    'Poor effort!',
    'not good',
    'poor work',
    'Could have done better.',
]

vocab_size = 50  # size of the index space passed to one_hot / Embedding
maxlen = 4       # number of tokens kept per sentence

# Encode every sentence as a list of integer word indices
# (Keras' `one_hot` returns indices, not one-hot vectors).
encoded_docs = [one_hot(sentence, vocab_size) for sentence in docs]

# Bring every sequence to length `maxlen`; short ones are padded at the end
padded_docs = pad_sequences(encoded_docs, padding='post', maxlen=maxlen)

# Model that contains only an Embedding layer
model = tf.keras.Sequential([
    layers.Embedding(vocab_size, 64, input_length=maxlen),
])
model.compile(optimizer='rmsprop', loss='mse')

# Embed the padded sentences and show the resulting shape
output_array = model.predict(padded_docs)
output_array.shape

(10, 4)


(10, 4, 64)

In [11]:
# Integer sequence produced by one_hot for the first sentence
print(encoded_docs[0])

# Shape of the input after padding
print(padded_docs.shape)

[34, 33]
(10, 4)


## 加上完全连接层(Dense)

In [15]:
# Sentiment label for each of the 10 sentences: positive (1) or negative (0)
labels = np.array([1,1,1,1,1,0,0,0,0,0])

# Encode and pad the sentences (same settings as the conversion cell)
vocab_size = 50
maxlen = 4
encoded_docs = [one_hot(d, vocab_size) for d in docs]
padded_docs = pad_sequences(encoded_docs, maxlen=maxlen, padding='post')

model = tf.keras.Sequential()
model.add(layers.Embedding(vocab_size, 8, input_length=maxlen))
# Flatten the per-token embeddings into one vector per sentence
model.add(layers.Flatten())

# Fully connected (Dense) output layer with a single sigmoid unit
model.add(layers.Dense(1, activation='sigmoid'))

# Optimizer, loss function and reported metric
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" after the table).
model.summary()

# Train the model
model.fit(padded_docs, labels, epochs=50, verbose=0)

# Evaluate on the training data
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 4, 8)              400       
_________________________________________________________________
flatten_1 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 89.999998


In [7]:
# Sigmoid output of the model for each padded sentence
model.predict(padded_docs)

array([[0.5838079 ],
       [0.5428731 ],
       [0.50959533],
       [0.52323276],
       [0.53539276],
       [0.50386965],
       [0.49095556],
       [0.49666357],
       [0.5119646 ],
       [0.41784462]], dtype=float32)

## 加上 RNN 神经层

In [24]:
model = tf.keras.Sequential()
model.add(layers.Embedding(vocab_size, 8, input_length=maxlen))

# SimpleRNN layer with 128 units
model.add(layers.SimpleRNN(128))

# Fully connected (Dense) output layer with a single sigmoid unit
model.add(layers.Dense(1, activation='sigmoid'))

# Optimizer, loss function and reported metric
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" after the table).
model.summary()

# Train the model
model.fit(padded_docs, labels, epochs=50, verbose=0)

# Evaluate on the training data
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 4, 8)              400       
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 128)               17536     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 129       
Total params: 18,065
Trainable params: 18,065
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 100.000000


In [25]:
# Sigmoid output of the RNN model for each padded sentence
model.predict(padded_docs)



array([[9.9992478e-01],
       [9.9990129e-01],
       [9.9997473e-01],
       [9.9999011e-01],
       [9.9997044e-01],
       [2.6669091e-05],
       [4.1497133e-05],
       [7.0826311e-05],
       [7.6519085e-05],
       [1.3496495e-05]], dtype=float32)

In [51]:
# The model ends in a single sigmoid unit, so predict() returns shape (n, 1).
# argmax over that last axis is always 0; threshold the probability at 0.5
# to get the 0/1 class of each sentence.
(model.predict(padded_docs) > 0.5).astype(int).reshape(-1).tolist()

[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]

## 使用预训练词向量(GloVe)

## 读取 GloVe 300维的词向量，产生字典资料型变数，方便搜寻

In [18]:
# Load the whole GloVe embedding file into memory as a dict that maps
# each word to its float32 coefficient vector.
embeddings_index = dict()
# `with` closes the file even if a line fails to parse.
with open('./GloVe/glove.6B.300d.txt', encoding='utf8') as f:
    for line in f:
        # Each line is: <word> <coef_1> <coef_2> ...
        values = line.split()
        word = values[0]
        coefs = np.array(values[1:], dtype='float32')
        embeddings_index[word] = coefs

## 分词

In [35]:
# Tokenize the sentences with a vocabulary learned from `docs`
from tensorflow.keras.preprocessing.text import Tokenizer

t = Tokenizer()
t.fit_on_texts(docs)

# Convert each sentence into its sequence of integer word indices
encoded_docs = t.texts_to_sequences(docs)

# One extra slot on top of the learned words, since 0 is used for padding
vocab_size = 1 + len(t.word_index)

# Pad with zeros at the end up to `maxlen` entries per sentence
padded_docs = pad_sequences(encoded_docs, padding='post', maxlen=maxlen)
padded_docs

array([[ 6,  2,  0,  0],
       [ 3,  1,  0,  0],
       [ 7,  4,  0,  0],
       [ 8,  1,  0,  0],
       [ 9,  0,  0,  0],
       [10,  0,  0,  0],
       [ 5,  4,  0,  0],
       [11,  3,  0,  0],
       [ 5,  1,  0,  0],
       [12, 13,  2, 14]])

## 转换为GloVe 300维的词向量

In [41]:
# Build the (vocab_size, 300) matrix of GloVe vectors, one row per word index.
# Rows start at zero and stay zero for words that GloVe does not contain.
embedding_matrix = np.zeros(shape=(vocab_size, 300))

# Copy the GloVe vector of every known word into its row
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

# Inspect one arbitrary row
embedding_matrix[2]

array([ 0.19205999,  0.16459   ,  0.060122  ,  0.17696001, -0.27405   ,
        0.079646  , -0.25292999, -0.11763   ,  0.17614   , -1.97870004,
        0.10707   , -0.028088  ,  0.093991  ,  0.48135   , -0.037581  ,
        0.0059231 , -0.11118   , -0.099847  , -0.22189   ,  0.0062044 ,
        0.17721   ,  0.25786   ,  0.42120999, -0.13085   , -0.32839   ,
        0.39208999, -0.050214  , -0.46766999, -0.063107  , -0.0023065 ,
        0.21005   ,  0.26982   , -0.22652   , -0.42958999, -0.89682001,
        0.21932   , -0.0020377 ,  0.1358    , -0.12661999, -0.058927  ,
        0.0049502 , -0.28457999, -0.29530999, -0.29295999, -0.24212   ,
        0.091915  ,  0.01977   ,  0.14503001,  0.26495999,  0.10817   ,
        0.029115  ,  0.075254  ,  0.16463999,  0.12097   , -0.37494001,
        0.52671999,  0.094318  , -0.054813  , -0.021008  ,  0.081353  ,
        0.18735   , -0.14458001, -0.031203  ,  0.31753999,  0.027703  ,
       -0.28657001,  0.34630999, -0.27772   ,  0.18669   , -0.11

## Embedding 设为不需训练，直接输入转换后的向量

In [42]:
model = tf.keras.Sequential()

# trainable=False: the embedding weights are not trained; the layer is
# initialized with the prepared embedding_matrix and used as-is.
model.add(layers.Embedding(vocab_size, 300, weights=[embedding_matrix],
                           input_length=maxlen, trainable=False))
model.add(layers.SimpleRNN(128))
model.add(layers.Dense(1, activation='sigmoid'))

# Optimizer, loss function and reported metric
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" after the table).
model.summary()

# Train the model
model.fit(padded_docs, labels, epochs=50, verbose=0)

# Evaluate on the training data
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 4, 300)            4500      
_________________________________________________________________
simple_rnn_5 (SimpleRNN)     (None, 128)               54912     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 129       
Total params: 59,541
Trainable params: 55,041
Non-trainable params: 4,500
_________________________________________________________________
None
Accuracy: 100.000000


In [50]:
# The model ends in a single sigmoid unit, so predict() returns shape (n, 1).
# argmax over that last axis is always 0; threshold the probability at 0.5
# to get the 0/1 class of each sentence.
(model.predict(padded_docs) > 0.5).astype(int).reshape(-1).tolist()



[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]

In [38]:
# Sigmoid output of the GloVe-based model for each padded sentence
model.predict(padded_docs)



array([[9.9977702e-01],
       [9.9972540e-01],
       [9.9990332e-01],
       [9.9989653e-01],
       [9.9989903e-01],
       [1.1761398e-04],
       [1.1944198e-04],
       [2.3762533e-04],
       [1.7523505e-04],
       [1.6099006e-04]], dtype=float32)