# 语言模型

$$P(w1,w2,..wn) = P(w1)·P(w2|w1)·P(w3|w1,w2)·...·P(wn|w1,w2,...w_{n-1})$$

学习语言模型，以及如何训练一个语言模型

学习torchtext的基本使用方法

构建 vocabulary， word to index 和 index to word

学习torch.nn的一些基本模型
Linear RNN LSTM GRU

RNN的训练技巧

Gradient Clipping

如何保存和读取模型


In [0]:

import torchtext
from torchtext.vocab import Vectors
import torch
import numpy as np
import random

USE_CUDA = torch.cuda.is_available()

# 为了保证实验结果可以复现，我们经常会把各种random seed固定在某一个值

random.seed(53113)
np.random.seed(53113)
torch.manual_seed(53113)
if USE_CUDA:
    torch.cuda.manual_seed(53113)

BATCH_SIZE = 32
EMBEDDING_SIZE = 25
MAX_VOCAB_SIZE = 30000 #src=500000
HIDDEN_SIZE = 50


我们会继续使用上次的text8作为我们的训练，验证和测试数据

TorchText的一个重要概念是Field，它决定了你的数据会如何被处理。我们使用TEXT这个field来处理文本数据。我们的TEXT field有lower=True这个参数，所以所有的单词都会被lowercase。

torchtext提供了LanguageModelingDataset这个class来帮助我们处理语言模型数据集。

build_vocab可以根据我们提供的训练数据集来创建最高频单词的单词表，max_size帮助我们限定单词总量。

BPTTIterator可以连续地得到连贯的句子，BPTT的全程是back propagation through time。

In [0]:
TEXT = torchtext.data.Field(lower=True) #定义Field，名称为TEXT，用于预处理

#划分数据集，path="."表示当前文件夹
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(path=".", 
    train="text8.train.txt", validation="text8.dev.txt", test="text8.test.txt", text_field=TEXT)

TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE) #创建大小为MAX_VOCAB_SIZE的词库，
#实际上大小是MAX_VOCAB_SIZE+2，TorchText会增加了两个特殊的token，<unk>表示未知的单词，<pad>表示padding。


In [6]:
#pass
len(TEXT.vocab) #50002
TEXT.vocab.itos[:10] #indextostring ['<unk>', '<pad>', 'the', 'of', 'and', 'one', 'in', 'a', 'to', 'zero']
TEXT.vocab.stoi["apple"] #stringtoindex 1273


1273

In [0]:
device = torch.device("cuda" if USE_CUDA else "cpu")
# 定义迭代器
train_iter,val_iter,test_iter = torchtext.data.BPTTIterator.splits((train,val,test),batch_size=BATCH_SIZE, device=device, bptt_len=50,repeat=False,shuffle=True)

#bptt_len随时间反向传播的序列长度.hang seqLengt句子长度？

In [15]:
#pass
it = iter(train_iter)
batch = next(it)
batch #包含text和target，维度是【seqlength，batchsize】
#---batch
#[torchtext.data.batch.Batch of size 32]
#	[.text]:[torch.cuda.LongTensor of size 50x32 (GPU 0)]
#	[.target]:[torch.cuda.LongTensor of size 50x32 (GPU 0)]

#取第一维度的数据看下，用前面的单词去预测后一个单词
print(" ".join(TEXT.vocab.itos[i] for i in batch.text[:,0].data.cpu()))
print()
print(" ".join(TEXT.vocab.itos[i] for i in batch.target[:,0].data.cpu()))

anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans <unk> of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the

originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans <unk> of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization


## 定义模型

In [0]:
import torch.nn as nn
class RNNModel(nn.Module):
    def __init__(self,rnn_type,vocab_size,embed_size,hidden_size):
        #rnn_type:model类型，RNN，LSTM，GRU
        super(RNNModel,self).__init__()
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size,embed_size) #输出【batchsize，embeddingdim】
        self.lstm = nn.LSTM(embed_size,hidden_size,batch_first=False) #TODO
        self.linear = nn.Linear(hidden_size,vocab_size) #计算\haty,
    def forward(self,text,hidden):
        #text大小是【seqlength，batchsize】
        emb = self.embed(text) #输出是【seqlength，batchsize，embeddingdim】
        output, hidden = self.lstm(emb,hidden)
        #output输出大小【seq_len, batchsize, num_directions*hidden_size】
        #hidden输出大小(num_layers*num_directions, batch, hidden_size)
        
        """
        #因为线性变换的输入维度是2，而outpit的输出维度是3，要view成为2维的
        output = output.view(-1,outpit.shape[2]) # 输出是【seqlen*batchsize，hiddensize】
        out_vocab = self.linear(output) # 输出是【seqlenth*batchsize，vocabsize】

        #变成【seqlen，batchsize，vocabsize】
        out_vocab = out_vocab.view(output.shape[0],output.shape[1],out_vocab.shape[-1])
        #TODO：why要变回来
        """
        #nn.Linear()线性变化只针对最后一维，只要保证最后一维是输入就行
        out_vocab = self.linear(output) # 输出是【seqlenth,batchsize，vocabsize】
        
        #TODO:why 不加softmax 答：lossfn是CrossEntropyLoss

        return out_vocab, hidden
    
    def init_hidden(self,batchsize,requires_grad=True):
        #TODO:
        weight = next(self.parameters())
        return (weight.new_zeros((1,batchsize,self.hidden_size),requires_grad=True),
            weight.new_zeros((1,batchsize,self.hidden_size),requires_grad=True))
        # 返回hiddenstate初始状态h0和 cellstate初始状态c0
    

In [0]:
model = RNNModel(rnn_type="LSTM",
                 vocab_size=len(TEXT.vocab),
                 embed_size=EMBEDDING_SIZE,
                 hidden_size=HIDDEN_SIZE)
if USE_CUDA:
    model = model.to(device)

In [42]:
#pass
model

RNNModel(
  (embed): Embedding(30002, 25)
  (lstm): LSTM(25, 50)
  (linear): Linear(in_features=50, out_features=30002, bias=True)
)

In [0]:
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    #为了防止内存爆炸，将hidden的图节点断掉，只保留值，不保留计算图的信息
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

def evaluate(model, data):
    model.eval()
    total_loss = 0.
    total_count = 0.
    with torch.no_grad():
        it = iter(data)
        hidden = model.init_hidden(BATCH_SIZE,requires_grad=False)
        #注意：只有在语言模型中，不同batch之间是连续的（一篇文章分成了不同batch），
        #所以hidden适合一直传下去，但内存会爆炸
        for batchIndex,batch in enumerate(it):
            data, target = batch.text, batch.target
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)
            #croosentropy loss，希望haty是【样本数，vocabsize】，target是【样本数】，
            #原haty【seqlenth,batchsize，vocabsize】，原traget【seqlength，batchsize】，所以要view下
            loss = lossfn(output.view(-1,len(TEXT.vocab)),target.view(-1))
             
            #*表示解包，得到seqlength和batchszie，相乘得到样本数
            total_count = np.multiply(*data.size())
            total_loss = loss.item()*total_count#crossentropyloss默认返回均值
    model.train() #val集合上完了，继续再train集合上训练
    return total_loss /total_count


## train and save model

In [0]:
lossfn = nn.CrossEntropyLoss()
learning_rate=0.01
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5) #lr decay
GRAD_CLIP = 5.0
NUM_EPOCH = 5
valbestloss = 9999

for epochIndex in range(NUM_EPOCH):
    model.train()
    it = iter(train_iter)
    hidden = model.init_hidden(BATCH_SIZE)
    #注意：只有在语言模型中，不同batch之间是连续的（一篇文章分成了不同batch），
    #所以hidden适合一直传下去，但内存会爆炸
    for batchIndex,batch in enumerate(it):
        data, target = batch.text, batch.target
        hidden = repackage_hidden(hidden)
        output, hidden = model(data, hidden)
        #croosentropy loss，希望haty是【样本数，vocabsize】，target是【样本数】，
        #原haty【seqlenth,batchsize，vocabsize】，原traget【seqlength，batchsize】，所以要view下
        loss = lossfn(output.view(-1,len(TEXT.vocab)),target.view(-1))
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), GRAD_CLIP)
        optimizer.step()

        if batchIndex %1000:#每1000batch再验证集上计算valloss
            valloss = evaluate(model, val_iter)
            if valloss < valbestloss:
                valbestloss = valloss
                torch.save(model.state_dict(), "lm.pth") #save模型的参数，文件名为lm.pth
                print("model saved")
            else: #每次loss不降就降lr，也可以连续n次不下降，这里为了简单
                scheduler.step() #lr decay

            print("epochIndex:",epochIndex, loss.item(),valloss)



## 加载saved模型


In [90]:
#先初始化模型，然后加载参数
mysavedmodel = RNNModel(rnn_type="LSTM",
                 vocab_size=len(TEXT.vocab),
                 embed_size=EMBEDDING_SIZE,
                 hidden_size=HIDDEN_SIZE)
if USE_CUDA:
    mysavedmodel = mysavedmodel.to(device)
mysavedmodel.load_state_dict(torch.load("lm.pth"))

testloss = evaluate(mysavedmodel,test_iter)
print(testloss)

5.525225639343262


## 生成句子


In [132]:
hidden = mysavedmodel.init_hidden(1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

words = ["he"]
wordid = TEXT.vocab.stoi[words[0]]
input = torch.tensor(wordid).long().view(1,1).to(device) #model的forward输入是【seqlenth，batchsize】
# 先随机生成一个【1，1】大小的随机数，最大为vocabsize
# input = torch.randint(len(TEXT.vocab), (1, 1), dtype=torch.long).to(device)

for i in range(10):
    output,hidden = mysavedmodel(input,hidden)
    word_weights = output.squeeze().exp().cpu()# squeeze，去掉维度为1的
    word_id = torch.multinomial(word_weights,1)[0] #选择概率最大的word
    # print(input)
    input.fill_(word_id)#TODO:why fill not concat
    # print(input)
    word = TEXT.vocab.itos[word_id]
    words.append(word)

print(" ".join(words))



he says that the conservative he is incapable of both special


In [146]:
hidden = mysavedmodel.init_hidden(1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
words = ["he"]
wordslist = [TEXT.vocab.stoi[words[0]]]
input = torch.tensor(wordlist[0]).long().view(1,1).to(device) #model的forward输入是【seqlenth，batchsize】

for i in range(1,15):
    output,hidden = mysavedmodel(input,hidden)
    word_weights = output.squeeze().exp().cpu()
    word_id = torch.multinomial(word_weights,1)[0] #选择概率最大的word
    wordslist.append(word_id)
    input = torch.tensor(wordslist).long().view(i+1,1).to(device)
    word = TEXT.vocab.itos[word_id]
    words.append(word)

print(" ".join(words))


he was died placed became died in produced married later little had was one founded
