In [1]:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
import numpy as np
import zipfile
tf.__version__

'2.0.0'

In [2]:
def load_data_jay_lyrics():
    """Load the Jay Chou lyric data set (available in the Chinese book)."""
    with zipfile.ZipFile('../../data/jaychou_lyrics.txt.zip') as zin:
        with zin.open('jaychou_lyrics.txt') as f:
            corpus_chars = f.read().decode('utf-8')
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    corpus_chars = corpus_chars[0:10000]
    idx_to_char = list(set(corpus_chars))
    char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
    vocab_size = len(char_to_idx)
    corpus_indices = [char_to_idx[char] for char in corpus_chars]
    return corpus_indices, char_to_idx, idx_to_char, vocab_size

In [3]:
corpus_indices, char_to_idx, idx_to_char, vocab_size = load_data_jay_lyrics()
vocab_size

1027

## 简介

它引入了重置门（reset gate）和更新门（update gate）的概念，从而修改了循环神经网络中隐藏状态的计算方式。
### 1.重置门和更新门

门控循环单元中的重置门和更新门的输入均为当前时间步输入$\boldsymbol{X}_t$与上一时间步隐藏状态$\boldsymbol{H}_{t-1}$ ，输出由激活函数为sigmoid函数的全连接层计算得到。

具体来说，假设隐藏单元个数为$h$，给定时间步$t$的小批量输入$\boldsymbol{X}_t \in \mathbb{R}^{n \times d}$（样本数为$n$，输入个数为$d$）和上一时间步隐藏状态$\boldsymbol{H}_{t-1} \in \mathbb{R}^{n \times h}$。重置门$\boldsymbol{R}_t \in \mathbb{R}^{n \times h}$和更新门$\boldsymbol{Z}_t \in \mathbb{R}^{n \times h}$的计算如下：

$$\begin{split}\begin{aligned}
\boldsymbol{R}_t = \sigma(\boldsymbol{X}_t \boldsymbol{W}_{xr} + \boldsymbol{H}_{t-1} \boldsymbol{W}_{hr} + \boldsymbol{b}_r),\\
\boldsymbol{Z}_t = \sigma(\boldsymbol{X}_t \boldsymbol{W}_{xz} + \boldsymbol{H}_{t-1} \boldsymbol{W}_{hz} + \boldsymbol{b}_z),
\end{aligned}\end{split}$$

其中$\boldsymbol{W}_{xr}, \boldsymbol{W}_{xz} \in \mathbb{R}^{d \times h}$和$\boldsymbol{W}_{hr}, \boldsymbol{W}_{hz} \in \mathbb{R}^{h \times h}$是权重参数，$\boldsymbol{b}_r, \boldsymbol{b}_z \in \mathbb{R}^{1 \times h}$是偏差参数。“多层感知机”一节中介绍过，sigmoid函数可以将元素的值变换到0和1之间。因此，重置门$\boldsymbol{R}_t$和更新门$\boldsymbol{Z}_t$中每个元素的值域都是$[0, 1]$。

### 2.候选隐藏状态

接下来，门控循环单元将计算候选隐藏状态来辅助稍后的隐藏状态计算。我们将当前时间步重置门的输出与上一时间步隐藏状态做按元素乘法（符号为 ⊙ ）。如果重置门中元素值接近0，那么意味着重置对应隐藏状态元素为0，即丢弃上一时间步的隐藏状态。如果元素值接近1，那么表示保留上一时间步的隐藏状态。然后，将按元素乘法的结果与当前时间步的输入连结，再通过含激活函数tanh的全连接层计算出候选隐藏状态，其所有元素的值域为$[-1,1]$。
具体来说，时间步$t$的候选隐藏状态$\tilde{\boldsymbol{H}}_t \in \mathbb{R}^{n \times h}$的计算为:

$$\tilde{\boldsymbol{H}}_t = \text{tanh}(\boldsymbol{X}_t \boldsymbol{W}_{xh} + \left(\boldsymbol{R}_t \odot \boldsymbol{H}_{t-1}\right) \boldsymbol{W}_{hh} + \boldsymbol{b}_h),$$

其中$\boldsymbol{W}_{xh} \in \mathbb{R}^{d \times h}$和$\boldsymbol{W}_{hh} \in \mathbb{R}^{h \times h}$是权重参数，$\boldsymbol{b}_h \in \mathbb{R}^{1 \times h}$是偏差参数。从上面这个公式可以看出，重置门控制了上一时间步的隐藏状态如何流入当前时间步的候选隐藏状态。而上一时间步的隐藏状态可能包含了时间序列截至上一时间步的全部历史信息。因此，重置门可以用来丢弃与预测无关的历史信息。

### 3.隐藏状态

时间步$t$的隐藏状态$\boldsymbol{H}_t \in \mathbb{R}^{n \times h}$的计算使用当前时间步的更新门$\boldsymbol{Z}_t$来对上一时间步的隐藏状态$\boldsymbol{H}_{t-1}$和当前时间步的候选隐藏状态 $\tilde{\boldsymbol{H}}_t$做组合：

$$\boldsymbol{H}_t = \boldsymbol{Z}_t \odot \boldsymbol{H}_{t-1}  + (1 - \boldsymbol{Z}_t) \odot \tilde{\boldsymbol{H}}_t.$$

### 4.输出

$$\boldsymbol{O}_t = \boldsymbol{H}_t \boldsymbol{W}_{hq} + \boldsymbol{b}_q.$$


![image](http://zh.d2l.ai/_images/gru_3.svg)


我们对门控循环单元的设计稍作总结：

#### 重置门有助于捕捉时间序列里短期的依赖关系；
#### 更新门有助于捕捉时间序列里长期的依赖关系。


## 从零开始实现

In [4]:
## 初始化模型参数，num_hidden为隐藏层个数
def gru(inputs,states,params):
    w_xr,w_hr,b_r,w_xz,w_hz,b_z,w_xh,w_hh,b_h,w_hq,b_q = params
    H = states,
    outputs = []
    for x in inputs:
        r = keras.activations.sigmoid(tf.matmul(x,w_xr) + tf.matmul(H,w_hr) + b_r)
        z = keras.activations.sigmoid(tf.matmul(x,w_xz) + tf.matmul(H,w_hz) + b_z)
        h_hat = keras.activations.tanh(tf.matmul(x,w_xh) + tf.matmul(tf.multiply(H,r),w_hh) + b_h)
        H = tf.multiply(z,H) + tf.multiply((1 - z),h_hat)
        y = keras.activations.softmax(tf.matmul(H,w_hq)+b_q)
        outputs.append(y)
    return tf.squeeze(tf.stack(outputs)),tf.squeeze(H)

In [5]:
num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size

In [6]:
def init_params():
    def _three():
        return tf.random.normal(shape=(num_inputs,num_hiddens)),tf.random.normal(shape=(num_hiddens,num_hiddens)),tf.zeros(shape=(1,num_hiddens))
    
    w_xr,w_hr,b_r = _three()
    w_xz,w_hz,b_z = _three()
    w_xh,w_hh,b_h = _three()
    
    w_hq = tf.random.normal(shape=(num_hiddens,num_outputs))
    b_q = tf.zeros(shape=(1,num_outputs))
    
    return w_xr,w_hr,b_r,w_xz,w_hz,b_z,w_xh,w_hh,b_h,w_hq,b_q

In [7]:
def init_gru_state(batch_size, num_hiddens):
    return tf.zeros(shape=(batch_size, num_hiddens)) 

In [8]:
def to_onehot(X,depth):
    outs = [tf.one_hot(x,depth=depth) for x in tf.transpose(X)]
    return tf.stack(outs,axis=0)

In [9]:
x = np.array(range(10)).reshape((2,5))
inputs = to_onehot(x,vocab_size)

In [10]:
state = init_gru_state(x.shape[0],num_hiddens)
gru(inputs,state,init_params())

(<tf.Tensor: id=276, shape=(5, 2, 1027), dtype=float32, numpy=
 array([[[1.8270682e-11, 2.5593363e-05, 6.3220362e-08, ...,
          3.9930143e-07, 1.4222721e-02, 3.2587191e-06],
         [4.7124170e-13, 8.5673486e-09, 8.2448855e-07, ...,
          9.7057195e-03, 7.7105735e-09, 1.6129597e-09]],
 
        [[1.7154330e-18, 1.2036309e-15, 1.4542325e-13, ...,
          1.4132907e-18, 1.0188059e-03, 4.0110215e-18],
         [1.4956226e-21, 3.1594125e-16, 6.2469773e-15, ...,
          7.0781132e-09, 2.9770661e-15, 5.6619004e-17]],
 
        [[6.3793612e-18, 9.3383079e-12, 1.6710391e-14, ...,
          1.7759067e-14, 8.0174162e-14, 6.6339607e-20],
         [1.4138936e-19, 7.5977135e-18, 3.5574745e-25, ...,
          5.4918436e-22, 3.0829259e-11, 1.2664023e-13]],
 
        [[1.0444699e-08, 1.4076769e-12, 3.1676660e-12, ...,
          3.1688436e-20, 5.2080968e-15, 8.8241148e-10],
         [4.8150122e-07, 2.3504249e-15, 3.6923222e-24, ...,
          1.3702804e-14, 7.0515479e-13, 1.0034578e-18]],

### tensorflow 中的GRU

In [41]:
import jieba

In [48]:
# 加载数据
def load_data_jay_lyrics():
    """Load the Jay Chou lyric data set (available in the Chinese book)."""
    with zipfile.ZipFile('../../data/jaychou_lyrics.txt.zip') as zin:
        with zin.open('jaychou_lyrics.txt') as f:
            corpus_chars = f.read().decode('utf-8')
    corpus_chars = corpus_chars.replace('\n', '').replace('\r', '')
    corpus_chars = corpus_chars[0:10000]
    jieba_cut = jieba.cut(corpus_chars)
    chars = []
    for char in jieba_cut:
        chars.append(char)
    idx_to_char = list(set(chars))
    char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
    vocab_size = len(char_to_idx)
    corpus_indices = [char_to_idx[char] for char in chars]
    return corpus_indices, char_to_idx, idx_to_char, vocab_size

In [49]:
corpus_indices, char_to_idx, idx_to_char, vocab_size = load_data_jay_lyrics()

In [53]:
len(corpus_indices),len( char_to_idx),len( idx_to_char), vocab_size

(6379, 1258, 1258, 1258)

In [54]:
# 生成训练数据
seq_lenght = 100
dataX = []
dataY = []
for i in range(0,vocab_size-seq_lenght,1):
    seq_in = corpus_indices[i:i+seq_lenght+1]
    dataX.append(seq_in)

np.random.shuffle(dataX)
for i in range(len(dataX)):
    dataY.append([dataX[i][seq_lenght]])
    dataX[i] = dataX[i][:seq_lenght]


In [55]:
len(dataX),len(dataY)

(1158, 1158)

In [56]:
## 构建模型
model = keras.Sequential([
    layers.Embedding(vocab_size,64,input_length=seq_lenght),
    layers.GRU(64,activation="softmax",return_sequences=True),
    layers.GRU(32,activation="softmax",return_sequences=True),
    layers.GRU(16,activation="softmax"),
    layers.Dense(vocab_size,activation="softmax")
])


In [57]:
adam = tf.keras.optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(loss=keras.losses.sparse_categorical_crossentropy, optimizer=adam,metrics=["acc"])

In [58]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 64)           80512     
_________________________________________________________________
gru_10 (GRU)                 (None, 100, 64)           24960     
_________________________________________________________________
gru_11 (GRU)                 (None, 100, 32)           9408      
_________________________________________________________________
gru_12 (GRU)                 (None, 16)                2400      
_________________________________________________________________
dense_4 (Dense)              (None, 1258)              21386     
Total params: 138,666
Trainable params: 138,666
Non-trainable params: 0
_________________________________________________________________


In [60]:
model.fit(dataX,dataY,batch_size=32,epochs=5)

Train on 1158 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x20febbad470>

In [61]:
start = np.random.randint(0, len(dataX) - 1)
pattern = dataX[start]
print("Seed : ")
print(''.join([idx_to_char[value] for value in pattern]))
n_generation = 100  # 生成的长度
print('开始生成，生成长度为', n_generation)
finall_result = []
for i in range(n_generation):
    x = np.reshape(pattern, (1, len(pattern)))
    prediction = model.predict(x, verbose=0)[0]
    index = np.argmax(prediction)
    result = idx_to_char[index]
    # sys.stdout.write(result)
    finall_result.append(result)
    pattern.append(index)
    pattern = pattern[1:len(pattern)]

for i in range(len(finall_result)):
    if finall_result[i] != '。':
        print(finall_result[i], end='')
    else:
        print('。')

Seed : 
 配上几斤的牛肉我说店小二 三两银够不够景色入秋 漫天黄沙凉过塞北的客栈人多 牧草有没有 我马儿有些瘦天涯尽头 满脸风霜落寞 近乡情怯的我相思寄红豆 相思寄红豆无能为力的在人海中漂泊心伤透娘子她人在江南等我 泪不休 语沉默娘子她人在江南等我 泪不休 语沉默一壶好酒 再来一碗热粥 配上几斤的牛肉我说店小二 三两
开始生成，生成长度为 100
的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的的

In [62]:
start = np.random.randint(0, len(dataX) - 1)
pattern = dataX[start]
print(''.join([idx_to_char[value] for value in pattern]))
n_generation = 100  # 生成的长度
x = np.reshape(pattern, (1, len(pattern)))


看远方的星是否听的见手牵手一步两步三步四步望著天 看星星一颗两颗三颗四颗 连成线背著背默默许下心愿看远方的星如果听的见它一定实现它一定实现娘子 娘子却依旧每日 折一枝杨柳你在那里 在小村外的溪边河口默默等著我娘子依旧每日折一枝杨柳你在那里 在小村外的溪边 默默等待 娘子一壶好酒 再来一碗热粥 配上几斤的


In [63]:
model(x)

<tf.Tensor: id=67325, shape=(1, 1258), dtype=float32, numpy=
array([[1.5346650e-03, 1.7639837e-05, 1.6419698e-03, ..., 1.7816583e-05,
        1.8238619e-05, 1.6785410e-03]], dtype=float32)>

In [67]:
tf.argmax(tf.squeeze(model(x))).numpy()

966

In [65]:
tf.reduce_max(model(x))

<tf.Tensor: id=98441, shape=(), dtype=float32, numpy=0.051233962>

In [68]:
idx_to_char[966]

'的'