### [序列模型和基于LSTM的循环神经网络](https://zhuanlan.zhihu.com/p/28448135)

In [1]:
import torch
import torch.autograd as autograd # torch中自动计算梯度模块
import torch.nn as nn             # 神经网络模块
import torch.nn.functional as F   # 神经网络模块中的常用功能 
import torch.optim as optim       # 模型优化器模块

torch.manual_seed(1)    

<torch._C.Generator at 0x7ffb081a3930>

In [3]:
# lstm单元输入和输出维度都是3
lstm = nn.LSTM(3, 3)  
# 生成一个长度为5，每一个元素为1*3的序列作为输入，这里的数字3对应于上句中第一个3
inputs = [autograd.Variable(torch.randn((1, 3))) for _ in range(5)]  

# 设置隐藏层维度，初始化隐藏层的数据
hidden = (autograd.Variable(torch.randn(1, 1, 3)),autograd.Variable(torch.randn((1, 1, 3))))

for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)
print(out)
print('-------------')
print(hidden)

Variable containing:
(0 ,.,.) = 
 -0.0935  0.1743  0.1439
[torch.FloatTensor of size 1x1x3]

-------------
(Variable containing:
(0 ,.,.) = 
 -0.0935  0.1743  0.1439
[torch.FloatTensor of size 1x1x3]
, Variable containing:
(0 ,.,.) = 
 -0.2359  0.4571  0.3482
[torch.FloatTensor of size 1x1x3]
)


In [4]:
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (autograd.Variable(torch.randn(1, 1, 3)), autograd.Variable(torch.randn((1, 1, 3))))  # clean out hidden state
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)

Variable containing:
(0 ,.,.) = 
 -0.3550  0.1404  0.4250

(1 ,.,.) = 
 -0.0541  0.0796  0.0990

(2 ,.,.) = 
 -0.0918  0.1193  0.0291

(3 ,.,.) = 
 -0.0955  0.1475 -0.2148

(4 ,.,.) = 
 -0.0807  0.1724  0.1495
[torch.FloatTensor of size 5x1x3]

(Variable containing:
(0 ,.,.) = 
 -0.0807  0.1724  0.1495
[torch.FloatTensor of size 1x1x3]
, Variable containing:
(0 ,.,.) = 
 -0.2022  0.4516  0.3600
[torch.FloatTensor of size 1x1x3]
)


In [5]:
training_data = [("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
                        ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])]
#"DET", "NN", "V"分别代表的是冠词、名词、和动词

In [6]:
word_to_ix = {} # 单词的索引字典
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)  #通过已有长度来标号 自动不断+1 这个可以
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2} # 手工设定词性标签数据字典

{'the': 3, 'read': 6, 'that': 7, 'dog': 1, 'apple': 4, 'ate': 2, 'Everybody': 5, 'The': 0, 'book': 8}


In [43]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
    
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        self.hidden_dim = hidden_dim
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size) #3*3
        self.hidden = self.init_hidden() # 2*(1*1*3)
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

    def init_hidden(self):
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        #print(embeds)
        #print(embeds.view(len(sentence), 1, -1)) 
        lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden) 
        #view函数将张量x变形成一维向量形式，总特征数不变  / 感觉类似于reshape
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space)
        return tag_scores


In [35]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    #print('idxs: ',idxs)
    tensor = torch.LongTensor(idxs)
    #print('tensor: ',tensor)
    return autograd.Variable(tensor)

In [45]:
EMBEDDING_DIM = 5
HIDDEN_DIM = 3

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
#print(model.word_embeddings)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)
print(training_data[0][0])
print(inputs)
print(tag_scores)

['The', 'dog', 'ate', 'the', 'apple']
Variable containing:
 0
 1
 2
 3
 4
[torch.LongTensor of size 5]

Variable containing:
-1.2643 -1.1219 -0.9367
-1.2868 -0.9703 -1.0646
-1.2397 -1.0354 -1.0344
-1.1445 -1.0413 -1.1129
-1.2212 -1.0483 -1.0368
[torch.FloatTensor of size 5x3]



In [46]:
for epoch in range(300):  # 我们要训练300次，可以根据任务量的大小酌情修改次数。
    for sentence, tags in training_data:
        # 清除网络先前的梯度值，梯度值是Pytorch的变量才有的数据，Pytorch张量没有
        model.zero_grad()
        # 重新初始化隐藏层数据，避免受之前运行代码的干扰
        model.hidden = model.init_hidden()
        # 准备网络可以接受的的输入数据和真实标签数据，这是一个监督式学习
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)
        # 运行我们的模型，直接将模型名作为方法名看待即可
        tag_scores = model(sentence_in)
        # 计算损失，反向传递梯度及更新模型参数
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# 来检验下模型训练的结果
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)
print(tag_scores)

Variable containing:
-0.1178 -3.2758 -2.6122
-4.7934 -0.0141 -5.1631
-3.4574 -3.6932 -0.0581
-0.0614 -3.7314 -3.3356
-4.2133 -0.0207 -5.1680
[torch.FloatTensor of size 5x3]



In [59]:
# practice
embedding = nn.Embedding(4,2)
 # a batch of 2 samples of 4 indices each
input1 = autograd.Variable(torch.LongTensor([0,1,2,3]))
embedding(input1)

Variable containing:
-1.0650 -0.2801
-0.6246 -0.3273
-0.0420  0.6035
-1.1232 -0.5985
[torch.FloatTensor of size 4x2]