# Pytorch中的LSTM
为了使用pytorch中的LSTM，那么先要认识LSTM的输入输出以及初始化参数
- input_size 等同于embedding层的维度
- hidden_size 隐藏层的维度，输出维度也一样，每一个时刻的output就是每一个时刻的hidden state
- num_layers 堆叠的层数，默认是1层，
- bais 内部的仿射运算是否带偏置，默认是True
- batch_first 该值默认是False，此时出入的形状是[seq_len, batch_size, input_size],但是通常设置为True
- dropout 默认为0，如果非0，那么在除了最后一层的每一层厚添加一个Dropout层
- bidirectional 默认为False,选择是否为双向LSTM

输入输出格式
- input:    [seq_len, batch_size, input_size]
- hidden_0: [num_layers$\times$num_directions, batch_size, hidden_size]
- c_0:      [num_layers$\times$num_directions, batch_size, hidden_size]
- output:   [seq_len, batch_size, hidden_size]
- hidden_n: [num_layers$\times$num_directions, batch_size, hidden_size]
- c_n:      [num_layers$\times$num_directions, batch_size, hidden_size]

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as func
import torch.optim as optim

In [15]:
# 下面的实验，先将一个长度为5的序列拆分，每一个元素分别作为输入。
lstm = nn.LSTM(3, 3)# input size is 3, output size is 3
inputs = [torch.randn(1, 3) for _ in range(5)] # 生成一个长度为5的序列
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))
for i in inputs:
    # 由于整个输入只有一个元素，那么输出也只有一个元素，并且同时输出该长度为1的序列最后时刻的隐藏状态
    out, hidden = lstm(i.view(1, 1, -1), hidden)

In [16]:
# 在实际模型训练的时候，肯定是不可能将一个序列拆开，这样就失去了LSTM原本的意义
inputs = torch.cat(inputs).view(len(inputs), 1, -1) # Tensor.Size(5, 1, 3)
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)

tensor([[[ 0.1739,  0.3531, -0.2200]],

        [[ 0.1449,  0.3306, -0.1245]],

        [[ 0.1511,  0.4159, -0.2037]],

        [[ 0.2128,  0.2146, -0.2572]],

        [[ 0.2415,  0.2041, -0.1552]]], grad_fn=<StackBackward>)
(tensor([[[ 0.2415,  0.2041, -0.1552]]], grad_fn=<StackBackward>), tensor([[[ 0.8400,  0.6427, -0.4190]]], grad_fn=<StackBackward>))


# 使用LSTM进行序列标注
模型输入的句子是$w_1, w_2, \ldots, w_m$，其中$w_i \in V$，标签的集合定义为T, $y_i$为$w_i$的标签，用$\hat{y_i}$表示对单词$w_i$的词性预测。
<br>这是一个结构预测模型，我们的输出$\hat{y_1}, \hat{y_2}, \ldots, \hat{y_m}$,其中$\hat{y_i} \in T$
<br>
模型的运算流程如下，将每一个单词embedding后输入到LSTM中，得到每一个词对应的隐状态$h_i$。同样的，需要对每一个标签进行标号。与word_to_ix字典类似。而$\hat{y_i}$的计算过程如下
$$\hat{y_i} = argmax_j\big(logSoftmax(Ah_i + b)\big)$$
### 准备数据

In [18]:
def prepare_sequence(sequence, to_ix):
    idxs = [to_ix[w] for w in sequence]
    return torch.tensor(idxs, dtype=torch.long)

train_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]

word_to_ix = dict()
for sent, labels in train_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}


### 构建模型

In [26]:
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, self.hidden_dim)
        self.hidden2tag = nn.Linear(self.hidden_dim, tagset_size)
        self.hidden = self.init_hidden()
        
    def init_hidden(self):
        return (
            torch.zeros(1, 1, self.hidden_dim),
            torch.zeros(1, 1, self.hidden_dim)
        )
    
    def forward(self, sentence):
        embeds = self.embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space= self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_score = func.log_softmax(tag_space, dim=1)
        return tag_score

### 训练模型

In [34]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_func = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

with torch.no_grad():
    inputs = prepare_sequence(train_data[0][0], word_to_ix)
    tag_score = model(inputs)
    print(tag_score)

# 迭代训练300次
for epoch in range(300):
    for sent, tags in train_data:
        model.zero_grad()
        model.hidden = model.init_hidden()
        inputs = prepare_sequence(sent, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)
        tag_score = model(inputs)
        loss = loss_func(tag_score, targets)
        loss.backward()
        optimizer.step()
        
with torch.no_grad():
    inputs = prepare_sequence(train_data[0][0], word_to_ix)
    tag_score = model(inputs)
    print(tag_score)

tensor([[-0.9993, -1.2668, -1.0495],
        [-1.0269, -1.2444, -1.0391],
        [-1.0469, -1.2104, -1.0473],
        [-1.0060, -1.1611, -1.1358],
        [-0.9603, -1.2339, -1.1207]])
tensor([[-1.0216, -1.1727, -1.1073],
        [-1.0585, -1.1451, -1.0942],
        [-1.0849, -1.1191, -1.0922],
        [-1.0385, -1.0590, -1.2067],
        [-0.9815, -1.1371, -1.1891]])
