In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
import nltk
from copy import deepcopy
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list."""
    return [item for sublist in l for item in sublist]


# Seed every RNG source imported by this notebook, not only `random`, so that
# a fresh "Restart & Run All" starts from the same random state each time.
SEED = 1024
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Short aliases for the legacy typed tensor constructors.
FloatTensor = torch.FloatTensor
LongTensor = torch.LongTensor
ByteTensor = torch.ByteTensor

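# In this example (see the figure below) the earlier words of the text are used to predict the following words.   seq_length = max_length
# All words of the text are split into 16 (BATCH_SIZE) parts; there are `length` words in total. This gives a matrix (16, length/16).
# getBatch slices the (16, length/16) matrix along the length/16 direction into small (16, seq_length) matrices,
# giving x = (16, seq_length, v) = (b, max_length, v) ==> (m,b,v)
# y is the X matrix shifted by one position along the length/16 direction   (m,b,v)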



#      X :(m,b,v)
#hidden_state =  [num_layers(=1) * num_directions(=1), b, h]=(1,b,h)
# cell_state =   [num_layers(=1) * num_directions(=1), b, h]=(1,b,h)


# self.lstm = nn.LSTM(input_size=v, hidden_size=h)   #(v,h)   
# LSTM step:    outputs, (_, _) = self.lstm(input, (hidden_state, cell_state))             # self.lstm( (m,b,v), ((1,b,h),(1,b,h)) )

 # resulting outputs : [m, b, num_directions(=1) * h] = (m,b,h)


# The output is then multiplied by a matrix: (v,h) * (m,b,h) = (m,b,v)
# Softmax cross-entropy between (m,b,v) and the true y gives the loss

In [2]:
def prepare_sequence(seq, to_index):
    """Map each token of ``seq`` to its integer id and pack the ids in a LongTensor.

    A token whose lookup in ``to_index`` yields ``None`` (i.e. it is not in the
    mapping) is replaced by the id stored under ``"<unk>"``.
    """
    ids = []
    for token in seq:
        if to_index.get(token) is None:
            ids.append(to_index["<unk>"])
        else:
            ids.append(to_index[token])
    return LongTensor(ids)


def prepare_ptb_dataset(filename, word2index=None):
    """Read a whitespace-tokenised text file and encode it as one tensor of word ids.

    Every line is split on whitespace and terminated with the marker ``'</s>'``;
    the tokens of all lines are then concatenated into a single stream, so
    sentence boundaries survive only through that marker.

    Args:
        filename: path of the UTF-8 text file to read.
        word2index: optional existing token -> id mapping. When given it is used
            as is (unknown tokens fall back to ``'<unk>'`` inside
            ``prepare_sequence``). When ``None`` a new vocabulary is built from
            this file, with ``'<unk>'`` fixed at id 0.

    Returns:
        ``(ids, word2index)``: the encoded token stream and the mapping used.
    """
    # `with` closes the handle even if reading fails.
    with open(filename, 'r', encoding='utf-8') as corpus_file:
        lines = corpus_file.readlines()
    corpus = flatten([line.strip().split() + ['</s>'] for line in lines])

    if word2index is None:
        word2index = {'<unk>': 0}
        # Assign ids in order of first appearance. Iterating a set of strings is
        # not order-stable between interpreter sessions, which would make the
        # vocabulary (and any saved model) irreproducible.
        for word in corpus:
            if word not in word2index:
                word2index[word] = len(word2index)

    # Encode the corpus as the tensor of ids of its tokens.
    return prepare_sequence(corpus, word2index), word2index


#将batch_size*nbatch多余的部分去掉
def batchify(data, bsz):
    """Lay a token stream out as ``bsz`` parallel rows of equal length.

    Elements at the end of ``data`` that do not fill a complete column are
    dropped. For a 1-D input of length ``n`` the result has shape
    ``(bsz, n // bsz)``, e.g. 929589 tokens with ``bsz=16`` give ``(16, 58099)``.
    """
    # Number of whole columns the stream can fill.
    columns = data.size(0) // bsz
    usable = columns * bsz
    # Cut the remainder off, then fold the stream row by row.
    trimmed = data.narrow(0, 0, usable)
    rows = trimmed.view(bsz, -1)
    return rows.contiguous()


def getBatch(data, seq_length):
    """Yield ``(inputs, targets)`` windows for next-word prediction.

    ``data`` is indexed as a 2-D matrix ``(rows, columns)``. Windows of
    ``seq_length`` columns are taken left to right without overlap; ``targets``
    is the same window shifted one column to the right, i.e. the next word for
    every input position. Columns at the end that cannot provide a full window
    plus its shifted target are skipped.
    """
    last_start = data.size(1) - seq_length
    for start in range(0, last_start, seq_length):
        stop = start + seq_length
        window = data[:, start:stop]
        shifted = data[:, start + 1:stop + 1].contiguous()
        yield Variable(window), Variable(shifted)

In [16]:
# Directory holding the three PTB splits. It is kept in ONE place so that moving
# the dataset means editing a single line; the resulting paths are unchanged.
PTB_DIR = r'C:\workspace\python-work\nlp\NLP-cs224n\NLP-cs224n_ec\notebooks\dataset\ptb'

# The vocabulary is built from the training split only and reused for dev/test.
train_data, word2index = prepare_ptb_dataset(PTB_DIR + r'\ptb.train.txt')
dev_data , _ = prepare_ptb_dataset(PTB_DIR + r'\ptb.valid.txt', word2index)
test_data, _ = prepare_ptb_dataset(PTB_DIR + r'\ptb.test.txt', word2index)

print('vocab_size,=',len(word2index))
index2word = {v:k for k, v in word2index.items()}


EMBED_SIZE = 128           # d: embedding size
HIDDEN_SIZE = 256          # h: LSTM hidden size
NUM_LAYER = 1              # l: number of LSTM layers
LR = 0.01                  # initial learning rate
SEQ_LENGTH = 30            # max_length: window length for BPTT (Back Propagation Through Time)
BATCH_SIZE = 16            # b: number of parallel rows in the training matrix
EPOCH = 1
RESCHEDULED = False        # set to True by the training loop once the learning rate has been lowered

print(train_data.size())
# After batchify, train_data is a (batch_size, length/batch_size) matrix: the
# words of all lines are already merged and then split evenly into batch_size rows.
train_data = batchify(train_data, BATCH_SIZE)
print(train_data.size())
# torch.Size([929589])
# torch.Size([16, 58099])            16*58099 = 929584
# Dev and test use half the batch size; the evaluation code must initialise its
# hidden state with the same BATCH_SIZE//2.
dev_data = batchify(dev_data, BATCH_SIZE//2)
test_data = batchify(test_data, BATCH_SIZE//2)

vocab_size,= 10000
torch.Size([929589])
torch.Size([16, 58099])


In [36]:
class LanguageModel(nn.Module):
    """Word-level LSTM language model: embedding -> LSTM -> linear layer over the vocabulary.

    Shape letters used in the comments: b = batch size, m = sequence length,
    d = embedding size, h = hidden size, v = vocabulary size.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size, n_layers=1, dropout_p=0.5):
        """Create the layers.

        Args:
            vocab_size: number of distinct token ids (v).
            embedding_size: width of the token embeddings (d).
            hidden_size: width of the LSTM state (h).
            n_layers: number of stacked LSTM layers.
            dropout_p: dropout probability applied to the embeddings in training.
        """
        super(LanguageModel, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size, embedding_size)  # lookup table: v rows of width d
        # batch_first=True: the LSTM consumes and produces batch-major tensors.
        self.LSTM = nn.LSTM(embedding_size, hidden_size, n_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)  # projects h -> v
        self.dropout = nn.Dropout(dropout_p)

    def init_weight(self):
        """Xavier-initialise the embedding and output weights and zero the output bias."""
        # The trailing-underscore initialisers work in place; the un-suffixed
        # names are deprecated aliases that emit a warning on every call.
        nn.init.xavier_uniform_(self.embed.weight)
        nn.init.xavier_uniform_(self.linear.weight)
        self.linear.bias.data.fill_(0)

    def init_hidden(self, batch_size):
        """Return a zero ``(hidden_state, cell_state)`` pair, each ``(n_layers, batch_size, h)``.

        The tensors are created with the dtype and device of the model's own
        parameters, so the pair can be fed to the LSTM after ``.cuda()`` or
        ``.double()`` as well.
        """
        reference = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_size)
        return (reference.new_zeros(shape), reference.new_zeros(shape))

    def detach_hidden(self, hiddens):
        """Cut the given states off from the graph that produced them.

        Used between consecutive windows so that back-propagation stops at the
        window boundary (truncated BPTT) while the state values are carried over.
        """
        return tuple([hidden.detach() for hidden in hiddens])

    def forward(self, inputs, hidden, is_training=False):
        """Run one window of token ids through the model.

        Args:
            inputs: token ids of shape (b, m).
            hidden: ``(hidden_state, cell_state)`` pair as returned by
                ``init_hidden`` or by a previous call.
            is_training: when True, dropout is applied to the embeddings.

        Returns:
            ``(logits, hidden)``: un-normalised scores flattened to
            (b * m, v), and the updated state pair.
        """
        embeds = self.embed(inputs)  # (b, m) -> (b, m, d)
        if is_training:
            embeds = self.dropout(embeds)
        out, hidden = self.LSTM(embeds, hidden)  # out: (b, m, h)
        # Merge the batch and time axes so each row is one prediction.
        flat = out.contiguous().view(out.size(0) * out.size(1), -1)  # (b * m, h)
        return self.linear(flat), hidden
        

    


model = LanguageModel(len(word2index), EMBED_SIZE, HIDDEN_SIZE, NUM_LAYER, 0.5)
print(model)
model.init_weight()
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)


for epoch in range(EPOCH):
    losses = []
    # hidden is the (hidden_state, cell_state) pair; each tensor is (1, b, h).
    hidden = model.init_hidden(BATCH_SIZE)
    # train_data is a (batch_size, length/batch_size) matrix, consumed in
    # windows of SEQ_LENGTH columns.
    for i, batch in enumerate(getBatch(train_data, SEQ_LENGTH)):
        # inputs and targets both have shape (b, seq_length).
        inputs, targets = batch
        # Truncated BPTT: keep the state values from the previous window but
        # stop gradients from flowing back into it.
        hidden = model.detach_hidden(hidden)
        model.zero_grad()
        preds, hidden = model(inputs, hidden, True)
        # preds is (b * seq_length, v); targets are flattened to match.
        loss = loss_function(preds, targets.view(-1))
        # .item() extracts the Python float; indexing a 0-dim tensor with [0]
        # is an error in current PyTorch.
        losses.append(loss.item())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)  # gradient clipping
        optimizer.step()

        if i > 0 and i % 50 == 0:
            print("[%02d/%d] mean_loss : %0.2f, Perplexity : %0.2f" % (epoch,EPOCH, np.mean(losses), np.exp(np.mean(losses))))
            losses = []

    # Lower the learning rate tenfold once, after the middle epoch.
    # NOTE: this rewrites the notebook-level LR and RESCHEDULED, so re-running
    # this cell without re-running the configuration cell starts from the
    # already-lowered rate.
    if not RESCHEDULED and epoch == EPOCH//2:
        LR *= 0.1
        optimizer = optim.Adam(model.parameters(), lr=LR)
        RESCHEDULED = True

LanguageModel(
  (embed): Embedding(10000, 128)
  (LSTM): LSTM(128, 256, batch_first=True)
  (linear): Linear(in_features=256, out_features=10000, bias=True)
  (dropout): Dropout(p=0.5)
)


  del sys.path[0]
  




[00/1] mean_loss : 7.36, Perplexity : 1567.21


[00/1] mean_loss : 6.66, Perplexity : 781.53


[00/1] mean_loss : 6.58, Perplexity : 719.81


[00/1] mean_loss : 6.46, Perplexity : 639.85


[00/1] mean_loss : 6.58, Perplexity : 723.89


[00/1] mean_loss : 6.41, Perplexity : 610.24


[00/1] mean_loss : 6.40, Perplexity : 599.53


[00/1] mean_loss : 6.21, Perplexity : 499.29


[00/1] mean_loss : 6.23, Perplexity : 506.44


[00/1] mean_loss : 6.05, Perplexity : 424.70


[00/1] mean_loss : 6.09, Perplexity : 439.51


[00/1] mean_loss : 6.08, Perplexity : 435.39


[00/1] mean_loss : 5.89, Perplexity : 360.59


[00/1] mean_loss : 5.97, Perplexity : 389.65


[00/1] mean_loss : 5.94, Perplexity : 381.54


[00/1] mean_loss : 5.97, Perplexity : 392.72


[00/1] mean_loss : 5.87, Perplexity : 353.04


[00/1] mean_loss : 5.85, Perplexity : 348.02


[00/1] mean_loss : 5.81, Perplexity : 335.07


[00/1] mean_loss : 5.83, Perplexity : 341.57


[00/1] mean_loss : 5.69, Perplexity : 295.95


[00/1] mean_loss : 5.80, Perplexity : 330.38


[00/1] mean_loss : 5.70, Perplexity : 298.05


[00/1] mean_loss : 5.69, Perplexity : 297.33


[00/1] mean_loss : 5.63, Perplexity : 278.79


[00/1] mean_loss : 5.68, Perplexity : 292.61


[00/1] mean_loss : 5.65, Perplexity : 285.66


[00/1] mean_loss : 5.70, Perplexity : 297.77


In [35]:
# Evaluate perplexity on the test split.
# NOTE(review): the output recorded below this cell carries execution count 35,
# i.e. it was produced before the training cell (count 36) last ran. Re-run
# this cell after training to get a meaningful number.
total_loss = 0.0
evaluated_steps = 0
hidden = model.init_hidden(BATCH_SIZE//2)  # test_data was batchified with BATCH_SIZE//2 rows
# No gradients are needed for evaluation, so there is no graph to detach from
# and nothing to zero.
with torch.no_grad():
    for batch in getBatch(test_data, SEQ_LENGTH):
        inputs, targets = batch
        preds, hidden = model(inputs, hidden)
        # The loss is a mean over the window; weight it by the window length
        # so that the final division yields a per-step average.
        total_loss += inputs.size(1) * loss_function(preds, targets.view(-1)).item()
        evaluated_steps += inputs.size(1)

# Divide by the steps actually evaluated: getBatch skips the tail of the data,
# so dividing by test_data.size(1) would understate the loss.
total_loss = total_loss / evaluated_steps
print("Test Perpelexity : %5.2f" % (np.exp(total_loss)))

Test Perpelexity : 9812.14


  # Remove the CWD from sys.path while we load stuff.
