# LSTM

RNN容易出现梯度爆炸和梯度消失，因为相邻的时间步是连在一起的，反向传播时权重的偏导数会连乘，其值要么都小于1要么都大于1，因此难以训练，存在短时记忆的问题  
LSTM增加了一个状态C并利用了3个门实现了对信息的更精准的控制，并且能够有效解决信息的长期依赖问题，避免梯度消失或爆炸

遗忘门Forget Gate，决定了上一时刻单元状态$c_{t-1}$有多少保留到当前时刻$c_t$   
输入门Input Gate，决定了当前时刻网络的输入$x_t$有多少保留到单元状态$c_t$  
输出门Output Gate，控制单元状态$c_t$有多少输出到LSTM的当前输出值$h_t$  
比起RNN多了3个线性变换，所以LSTM的参数个数是RNN的4倍

![](./pics/LSTM_model.png)

参考：《Python深度学习基于PyTorch》

## 1 Pytorch实现

In [1]:
import torch
import torch.nn as nn

In [3]:
# Two stacked LSTM layers: 10 input features per time step, 20 hidden units per layer
lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=2)

LSTM各参数的第一维都变成了4x20=80，正好是RNN的4倍

In [4]:
# Parameter shapes of the first layer (suffix _l0):
# input-to-hidden weight, hidden-to-hidden weight, hidden-to-hidden bias
lstm.weight_ih_l0.shape, lstm.weight_hh_l0.shape, lstm.bias_hh_l0.shape

(torch.Size([80, 10]), torch.Size([80, 20]), torch.Size([80]))

In [5]:
# Parameter shapes of the second layer (suffix _l1); its input is the
# 20-dimensional hidden output of the first layer
lstm.weight_ih_l1.shape, lstm.weight_hh_l1.shape, lstm.bias_hh_l1.shape

(torch.Size([80, 20]), torch.Size([80, 20]), torch.Size([80]))

In [9]:
# Random input laid out as (seq_len=100, batch=32, input_size=10), since
# batch_first was left at its default when the LSTM was built.
# NOTE: the name `input` shadows the Python builtin of the same name.
input = torch.randn(100, 32, 10)
# Initial state: (num_layers=2, batch=32, hidden_size=20)
h_0 = torch.randn(2, 32, 20)
# nn.LSTM takes an (h_0, c_0) tuple; the same random tensor is reused for both
h0 = (h_0, h_0)

In [10]:
# Despite its name, h_n receives the (h_n, c_n) tuple of final hidden and cell states
output, h_n = lstm(input, h0)

In [12]:
output.size(), h_n[0].size(), h_n[1].size()

(torch.Size([100, 32, 20]), torch.Size([2, 32, 20]), torch.Size([2, 32, 20]))

一个LSTM单元：

In [13]:
class LSTMCell(nn.Module):
    """A single hand-written LSTM step followed by a log-softmax output head.

    The forget, input and output gates and the candidate state each get their
    own slice of one fused linear projection, so every gate has independent
    parameters.

    Args:
        input_size: number of features in the input at one time step.
        hidden_size: size of the hidden state ``h``.
        cell_size: size of the cell state ``c``; must equal ``hidden_size``
            because ``h = tanh(c) * o_gate`` has the cell's width.
        output_size: number of classes produced by the output head.

    Raises:
        ValueError: if ``cell_size`` differs from ``hidden_size``.
    """

    def __init__(self, input_size, hidden_size, cell_size, output_size):
        super().__init__()
        if cell_size != hidden_size:
            # The new hidden state has cell_size columns but is consumed by
            # layers sized for hidden_size, so a mismatch can never work.
            raise ValueError(
                "cell_size (%d) must equal hidden_size (%d)" % (cell_size, hidden_size)
            )
        self.hidden_size = hidden_size
        self.cell_size = cell_size
        # One fused projection yielding the four pre-activations
        # (forget, input, output, candidate), each cell_size wide.
        self.gate = nn.Linear(input_size + hidden_size, 4 * cell_size)
        self.output = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden, cell):
        """Advance the cell by one time step.

        Args:
            input: tensor of shape (batch, input_size).
            hidden: previous hidden state, shape (batch, hidden_size).
            cell: previous cell state, shape (batch, cell_size).

        Returns:
            Tuple ``(output, hidden, cell)``: log-probabilities of shape
            (batch, output_size), then the new hidden and cell states.
        """
        combined = torch.cat((input, hidden), 1)
        f_pre, i_pre, o_pre, z_pre = torch.chunk(self.gate(combined), 4, dim=1)
        f_gate = self.sigmoid(f_pre)  # forget gate: how much of the old cell state to keep
        i_gate = self.sigmoid(i_pre)  # input gate: how much of the candidate to write
        o_gate = self.sigmoid(o_pre)  # output gate: how much of the cell state to expose
        z_state = self.tanh(z_pre)    # candidate cell content
        cell = torch.add(torch.mul(cell, f_gate), torch.mul(z_state, i_gate))
        hidden = torch.mul(self.tanh(cell), o_gate)
        output = self.output(hidden)
        output = self.softmax(output)
        return output, hidden, cell

    def initHidden(self):
        """Return an all-zero hidden state for a batch of one."""
        return torch.zeros(1, self.hidden_size)

    def initCell(self):
        """Return an all-zero cell state for a batch of one."""
        return torch.zeros(1, self.cell_size)

In [17]:
# hidden_size and cell_size are both 20: the new hidden state has the cell's width
lstm_cell = LSTMCell(input_size=10, hidden_size=20, cell_size=20, output_size=10)

In [18]:
# A single time step for a batch of 32: (batch, input_size)
input = torch.randn(32, 10)
# Random initial state: (batch, hidden_size)
h_0 = torch.randn(32, 20)

In [20]:
# One step of the cell; h_0 is passed as both the hidden and the cell state
output, hn, cn = lstm_cell(input, h_0, h_0)

In [21]:
output.size(), hn.size(), cn.size()

(torch.Size([32, 10]), torch.Size([32, 20]), torch.Size([32, 20]))

## 2 LSTM词性预测

In [54]:
# Each training example is a (list of words, list of tags) pair of equal length
training_data = [
    ('The cat ate the fish'.split(), ['DET', 'NN', 'V', 'DET', 'NN']),
    ('They read that book'.split(), ['NN', 'V', 'DET', 'NN'])
]

In [55]:
# Unlabelled test sentences: a list holding one list of words
testing_data = [('They ate the fish'.split())]

In [56]:
# Vocabulary: every distinct training word gets the next free integer index
word_to_ix = {}
for words, _tags in training_data:
    for token in words:
        # Words are not lower-cased, so 'The' and 'the' get separate entries.
        # setdefault only inserts when the token has not been seen before.
        word_to_ix.setdefault(token, len(word_to_ix))
word_to_ix

{'The': 0,
 'cat': 1,
 'ate': 2,
 'the': 3,
 'fish': 4,
 'They': 5,
 'read': 6,
 'that': 7,
 'book': 8}

In [57]:
# Hand-written tag-to-index mapping: DET = determiner, NN = noun, V = verb
tag_to_ix = {'DET': 0, 'NN': 1, 'V': 2}

In [58]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [59]:
class LSTMTagger(nn.Module):
    """Part-of-speech tagger: embedding -> single-layer LSTM -> linear -> log-softmax.

    Every call to ``forward`` starts from a zero state, so sentences are
    scored independently and repeated calls on the same input agree.

    Args:
        embedding_dim: size of each word embedding vector.
        hidden_dim: size of the LSTM hidden state.
        vocab_size: number of distinct word indices.
        tagset_size: number of distinct tags to score.

    Attributes:
        hidden: ``(h, c)`` state left by the most recent forward pass,
            detached from the autograd graph; zeros before the first call.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)  # embedding layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a zero ``(h_0, c_0)`` pair of shape (1, 1, hidden_dim) each.

        The tensors are created from the embedding weight so they share the
        model's device and dtype.
        """
        reference = self.word_embeddings.weight
        return (reference.new_zeros(1, 1, self.hidden_dim),
                reference.new_zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):
        """Score every word of one sentence.

        Args:
            sentence: 1-D LongTensor of word indices.

        Returns:
            Tensor of shape (len(sentence), tagset_size) holding the
            log-probability of each tag for each word.
        """
        # Look up the embedding vector of every word
        embeds = self.word_embeddings(sentence)
        # Reshape to (seq_len, batch=1, embedding_dim) as nn.LSTM expects, and
        # start from a fresh zero state so nothing leaks from earlier calls
        lstm_out, final_state = self.lstm(
            embeds.view(len(sentence), 1, -1), self.init_hidden()
        )
        # Keep the final state for inspection only; detaching it stops the
        # stored state from holding on to this call's autograd graph
        self.hidden = tuple(state.detach() for state in final_state)
        # Flatten to (seq_len, hidden_dim) as input to the linear layer
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        # Log-probability of each tag for each word
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

把数据转换成模型要求的格式

In [60]:
def prepare_sequence(seq, to_ix):
    """Convert a sequence of tokens into a 1-D LongTensor of indices.

    Args:
        seq: iterable of tokens (words or tags).
        to_ix: mapping from token to integer index.

    Returns:
        LongTensor with one index per token, in the order of ``seq``.

    Raises:
        KeyError: if a token is missing from ``to_ix``.
    """
    indices = []
    for token in seq:
        indices.append(to_ix[token])
    return torch.LongTensor(indices)

训练网络

In [61]:
# Hyper-parameters of the tagger
EMBEDDING_DIM = 10
HIDDEN_DIM = 3 # set equal to the number of tags here
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
# The model emits log-probabilities (log_softmax), so NLLLoss is the matching loss
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

简单的运行一次

In [62]:
# Forward pass of the still-untrained model on the first training sentence
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)

In [63]:
training_data[0][0], inputs

(['The', 'cat', 'ate', 'the', 'fish'], tensor([0, 1, 2, 3, 4]))

In [64]:
tag_scores

tensor([[-1.4428, -0.9571, -0.9682],
        [-1.3651, -1.0276, -0.9499],
        [-1.4358, -0.9852, -0.9450],
        [-1.4128, -1.0616, -0.8900],
        [-1.5824, -1.0591, -0.8035]], grad_fn=<LogSoftmaxBackward>)

In [65]:
# Row-wise maximum: the best log-probability and the index of the predicted tag per word
torch.max(tag_scores, 1)

torch.return_types.max(
values=tensor([-0.9571, -0.9499, -0.9450, -0.8900, -0.8035], grad_fn=<MaxBackward0>),
indices=tensor([1, 2, 2, 2, 2]))

训练模型

In [66]:
for epoch in range(400): # train for 400 epochs
    for sentence, tags in training_data:
        model.zero_grad() # clear the gradients left by the previous step
        model.hidden = model.init_hidden() # reset the stored hidden/cell state to zeros
        # Convert the words and the gold tags into index tensors
        sentence_in = prepare_sequence(sentence, word_to_ix) 
        targets = prepare_sequence(tags, tag_to_ix)
        # Forward pass: per-word log-probabilities over the tags
        tag_scores = model(sentence_in)
        # Compute the loss, back-propagate, and update the parameters
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

In [68]:
# Inspect the result of training: re-score the first training sentence
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)

In [69]:
training_data[0][0], inputs

(['The', 'cat', 'ate', 'the', 'fish'], tensor([0, 1, 2, 3, 4]))

In [70]:
tag_scores

tensor([[-0.0794, -3.7229, -2.9528],
        [-4.9416, -0.0306, -3.7709],
        [-3.0128, -3.4794, -0.0834],
        [-0.0351, -6.1703, -3.4301],
        [-5.3156, -0.0481, -3.1684]], grad_fn=<LogSoftmaxBackward>)

In [71]:
torch.max(tag_scores, 1) # accuracy is 100%: the indices match the gold tags DET NN V DET NN

torch.return_types.max(
values=tensor([-0.0794, -0.0306, -0.0834, -0.0351, -0.0481], grad_fn=<MaxBackward0>),
indices=tensor([0, 1, 2, 0, 1]))

测试模型

In [73]:
# Score the held-out sentence; all of its words also occur in the training vocabulary
test_inputs = prepare_sequence(testing_data[0], word_to_ix)
tag_scores01 = model(test_inputs)

In [74]:
testing_data[0], test_inputs

(['They', 'ate', 'the', 'fish'], tensor([5, 2, 3, 4]))

In [75]:
tag_scores01

tensor([[-6.4252, -0.0239, -3.8151],
        [-3.1983, -3.6759, -0.0685],
        [-0.0358, -6.1995, -3.4067],
        [-5.3140, -0.0486, -3.1568]], grad_fn=<LogSoftmaxBackward>)

In [76]:
torch.max(tag_scores, 1) # 精确度为100%

torch.return_types.max(
values=tensor([-0.0794, -0.0306, -0.0834, -0.0351, -0.0481], grad_fn=<MaxBackward0>),
indices=tensor([0, 1, 2, 0, 1]))