In [None]:
#!pip install paddlenlp

In [None]:
# 用的paddlenlp的word embeddings，具体看 https://github.com/PaddlePaddle/PaddleNLP/blob/develop/docs/model_zoo/embeddings.md

# 1. Data

In [1]:
import os
import sys
import math

from collections import Counter
import numpy as np
import random 

import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm

### 1.1 Load and Read

In [2]:
tr_in = open("couplet/train/in.txt",encoding='utf8').read()
tr_out = open("couplet/train/out.txt",encoding='utf8').read()
te_in = open("couplet/test/in.txt",encoding='utf8').read()
te_out = open("couplet/test/out.txt",encoding='utf8').read()

from paddlenlp.embeddings import TokenEmbedding
token_embedding = TokenEmbedding(embedding_name="w2v.baidu_encyclopedia.target.word-word.dim300")

[2021-11-28 22:41:20,517] [    INFO] - Loading token embedding...
[2021-11-28 22:41:31,392] [    INFO] - Finish loading embedding vector.
[2021-11-28 22:41:31,394] [    INFO] - Token Embedding info:             
Unknown index: 635963             
Unknown token: [UNK]             
Padding index: 635964             
Padding token: [PAD]             
Shape :[635965, 300]


### 1.2 Vocabulary

In [8]:
vocab = list(set(tr_in + tr_out + te_in + te_out))
vocab.insert(0,'<EOS>')
vocab.insert(0,'<BOS>')
vocab.insert(0,'<PAD>')

embeddings = dict()
embeddings[2] = np.random.rand(300,).astype('float32') *2 - 1 # range(-1:1)
embeddings[1] = np.random.rand(300,).astype('float32') *2 - 1 # range(-1:1)
embeddings[0] = np.zeros(300,)

for i,w in enumerate(vocab,start = 3):
    embeddings[i] = token_embedding.search(w).reshape(300,)
    
word2idx = {w:i for i,w in enumerate(vocab)}
idx2word = {i:w for i,w in enumerate(vocab)}

### 1.3 Word2idx

In [63]:
def convert(x, y, word2id):
    in_sentences = [[word2id[w] for w in sent.split()] for sent in x]
    out_sentences = [[word2id[w] for w in sent.split()] for sent in y]
    
    # 根据句子的长度排序
    #sorted_len = sorted([ (i,len(x)) for i,x in enumerate(tr_in.split('\n')[:-3000])], key=lambda x: x[1])
    
    #sorted_index = [x[0] for x in sorted_len]
    #in_sentences = [in_sentences[i] for i in sorted_index]
    #out_sentences = [out_sentences[i] for i in sorted_index]
    
    return in_sentences, out_sentences

train_x, train_y = convert(tr_in.split('\n')[:-3000], tr_out.split('\n')[:-3000], word2idx)
dev_x, dev_y = convert(tr_in.split('\n')[-3000:], tr_out.split('\n')[-3000:], word2idx)
test_x, test_y = convert(te_in.split('\n')[:-1], te_out.split('\n')[:-1], word2idx)

In [64]:
train_x[0]

[4163, 3696, 3597, 3805, 3805, 3928, 1823]

### 1.4 Batch

In [66]:
# 这个函数的作用是我们输入训练集的样本个数， batch_size大小， 就会返回多批 连续的batch_size个索引， 每一个索引代表一个样本
# 也就是可以根据这个索引去拿到一个个的batch
def get_minibatches(n, minibatch_size, shuffle=True):
    idx_list = np.arange(0, n, minibatch_size)
    if shuffle:
        np.random.shuffle(idx_list)
    minibatches = []
    for idx in idx_list:
        minibatches.append(np.arange(idx, min(idx+minibatch_size, n)))
    return minibatches      # 这个会返回多批连着的bath_size个索引  
#get_minibatches(len(train_en), 32)

# 这个函数是在做数据预处理， 由于每个句子都不是一样长， 所以通过这个函数就可以把句子进行补齐， 不够长的在句子后面添加0
def prepare_data(seqs):
    lengths = [len(seq) for seq in seqs]    # 得到每个句子的长度
    n_samples = len(seqs)       # 得到一共有多少个句子
    max_len = np.max(lengths)              # 找出最大的句子长度
    
    x = np.zeros((n_samples, max_len)).astype('int32')    # 按照最大句子长度生成全0矩阵
    x_lengths = np.array(lengths).astype('int32')
    for idx, seq in enumerate(seqs):        # 把有句子的位置填充进去
        x[idx, :lengths[idx]] = seq
    return x, x_lengths      # x_mask

def gen_examples(en_sentences, cn_sentences, batch_size):
    minibatches = get_minibatches(len(en_sentences), batch_size)   # 得到batch个索引
    all_ex = []
    for minibatch in minibatches:   # 每批数据的索引
        mb_en_sentences = [en_sentences[t] for t in minibatch]   # 取数据
        mb_cn_sentences = [cn_sentences[t] for t in minibatch]  # 取数据
        mb_x, mb_x_len = prepare_data(mb_en_sentences) # 填充成一样的长度， 但是要记录一下句子的真实长度， 这个在后面输入网络的时候得用
        mb_y, mb_y_len = prepare_data(mb_cn_sentences)
        all_ex.append((mb_x, mb_x_len, mb_y, mb_y_len))
    return all_ex

batch_size = 64
train_data = gen_examples(train_x, train_y, batch_size)   # 产生训练集
random.shuffle(train_data)
dev_data = gen_examples(dev_x, dev_y, batch_size)   # 产生验证集

In [73]:
train_data[0][0]

array([[8461,  937, 8009, ...,    0,    0,    0],
       [3253, 7172, 2463, ..., 2821, 6084, 4154],
       [6971,  453, 1519, ...,    0,    0,    0],
       ...,
       [5279,  209, 3087, ...,    0,    0,    0],
       [5393, 3700,  753, ...,    0,    0,    0],
       [1631, 5295, 3815, ...,    0,    0,    0]])

# 2. Model

### 2.1 Encoder(GRU)

In [34]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2):
        super(Encoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, enc_hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(enc_hidden_size*2, dec_hidden_size)
        
    def forward(self, x, lengths):
        sorted_len, sorted_idx = lengths.sort(0, descending=True)
        x_sorted = x[sorted_idx.long()]
        embedded = self.dropout(self.embed(x_sorted))   # [batch_size, seq_len, embed_size]
        
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_len.long().cpu().data.numpy(), batch_first=True)
        packed_out, hid = self.rnn(packed_embedded)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)  # [batch_size, seq_len, 2*enc_hidden_size]
        _, original_idx = sorted_idx.sort(0, descending=False)
        out = out[original_idx.long()].contiguous()   # [batch_size, seq_len, 2*enc_hidden_size]
        hid = hid[:, original_idx.long()].contiguous()   # [2, batch_size, enc_hidden_size]
        
        hid = torch.cat([hid[-2], hid[-1]], dim=1)   # 双向的GRU， 这里是最后一个状态， 联结起来  [batch_size, 2*enc_hidden_size]
        hid = torch.tanh(self.fc(hid)).unsqueeze(0)  # [1, batch_size, dec_hidden_size]
        
        return out, hid


### 2.2 Attention

In [35]:
class Attention(nn.Module):
    def __init__(self, enc_hidden_size, dec_hidden_size):
        super(Attention, self).__init__()
        
        self.enc_hidden_size = enc_hidden_size
        self.dec_hidden_size = dec_hidden_size
        
        self.linear_in = nn.Linear(enc_hidden_size*2, dec_hidden_size, bias=False)
        self.linear_out = nn.Linear(enc_hidden_size*2+dec_hidden_size, dec_hidden_size)
    
    def forward(self, output, encoder_output, mask):
        # output: [batch_size, seq_len_y-1, dec_hidden_size]  这个output 是decoder的每个时间步输出的隐藏状态
        # encoder_output: [batch_size, seq_len_x, 2*enc_hidden_size]
        batch_size = output.size(0)
        output_len = output.size(1)
        input_len = encoder_output.size(1)
        
        context_in = self.linear_in(encoder_output.view(batch_size*input_len, -1))  # [batch_size*seq_len_x,dec_hidden_size]
        context_in = context_in.view(batch_size, input_len, -1)  # [batch_size, seq_len_x, dec_hidden_size]
        context_in = context_in.transpose(1, 2)   # [batch_size, dec_hidden_size, seq_len_x]
        
        attn = torch.bmm(output, context_in)  # [batch_size, seq_len_y-1, seq_len_x]
        # 这个东西就是求得当前时间步的输出output和所有输入相似性关系的一个得分score , 下面就是通过softmax把这个得分转成权重
        attn = F.softmax(attn, dim=2)    # 此时第二维度的数字全都变成了0-1之间的数， 越大表示当前的输出output与哪个相关程度越大
        
        context = torch.bmm(attn, encoder_output)   # [batch_size, seq_len_y-1, 2*enc_hidden_size]
        
        output = torch.cat((context, output), dim=2)  # [batch_size, seq_len_y-1, 2*enc_hidden_size+dec_hidden_size]
        
        output = output.view(batch_size*output_len, -1)   # [batch_size*seq_len_y-1, 2*enc_hidden_size+dec_hidden_size]
        output = torch.tanh(self.linear_out(output))     # [batch_size*seq_len_y-1, dec_hidden_size]
        output = output.view(batch_size, output_len, -1)  # [batch_size, seq_len_y-1, dec_hidden_size]
        
        return output, attn


### 2.3 Decoder

In [36]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2):
        super(Decoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.attention = Attention(enc_hidden_size, dec_hidden_size)
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True)
        self.out = nn.Linear(dec_hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)
    
    def create_mask(self, x_len, y_len):
        # a mask of shape x_len*y_len
        x_mask = torch.arange(x_len.max(), device=x_len.device)[None, :] < x_len[:, None]
        y_mask = torch.arange(y_len.max(), device=x_len.device)[None, :] < y_len[:, None]
        
        x_mask = x_mask.float()
        y_mask = y_mask.float()
        mask = (1 - x_mask[:, :, None] * y_mask[:, None, :]).byte()
        return mask
    
    def forward(self, encoder_out, encoder_out_lengths, y, y_lengths, hid):
        sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
        y_sorted = y[sorted_idx.long()]   # 句子从长到短排序
        hid = hid[:, sorted_idx.long()]
        
        y_sorted = self.dropout(self.embed(y_sorted))     # [batch_size, output_length, embed_size]
        
        packed_seq = nn.utils.rnn.pack_padded_sequence(y_sorted, sorted_len.long().cpu().data.numpy(), batch_first=True)
        out, hid = self.rnn(packed_seq, hid)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        output_seq = unpacked[original_idx.long()].contiguous()   # [batch_size, seq_len_y-1, dec_hidden_size]
        hid = hid[:, original_idx.long()].contiguous()
        
        mask = self.create_mask(y_lengths, encoder_out_lengths)
        
        output, attn = self.attention(output_seq, encoder_out, mask)
        output = F.log_softmax(self.out(output), -1)
        
        return output, hid, attn


### 2.4 Wrapped Network

In [37]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, hid = self.encoder(x, x_lengths)
        output, hid, attn = self.decoder(encoder_out, x_lengths, y, y_lengths, hid)
        
        return output, attn
    
    def translate(self, x, x_lengths, y, max_length=100):
        encoder_out, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        for i in range(max_length):
            output, hid, attn = self.decoder(encoder_out, x_lengths, y, torch.ones(batch_size).long().to(y.device), hid)
            y = output.max(2)[1].view(batch_size, 1)
            preds.append(y)
            attns.append(attn)
        
        return torch.cat(preds, 1), torch.cat(attns, 1)


### 2.5 Loss Function

In [42]:
# masked cross entropy loss
class LanguageModelCriterion(nn.Module):
    def __init__(self):
        super(LanguageModelCriterion, self).__init__()
    
    def forward(self, input, target, mask):
        # input: [batch_size, seq_len, vocab_size]    每个单词的可能性
        input = input.contiguous().view(-1, input.size(2))   # [batch_size*seq_len-1, vocab_size]
        target = target.contiguous().view(-1, 1)    #  [batch_size*seq_len-1, 1]
        
        mask = mask.contiguous().view(-1, 1)   # [batch_size*seq_len-1, 1]
        output = -input.gather(1, target) * mask # 在每个vocab_size维度取正确单词的索引， 但是里面有很多是填充进去的， 所以mask去掉这些填充的
        # 这个其实在写一个NLloss ， 也就是sortmax的取负号
        output = torch.sum(output) / torch.sum(mask)
        
        return output  # [batch_size*seq_len-1, 1]


# 3. Training

### Config Model

In [44]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dropout = 0.2
hidden_size = 100
encoder = Encoder(vocab_size=len(word2id), enc_hidden_size=hidden_size, dec_hidden_size=hidden_size, embed_size = 300, dropout=dropout)
decoder = Decoder(vocab_size=len(word2id), enc_hidden_size=hidden_size, dec_hidden_size=hidden_size, embed_size = 300, dropout=dropout)

model = Seq2Seq(encoder, decoder)
model = model.to(device)
loss_fn = LanguageModelCriterion().to(device)
optimizer = torch.optim.Adam(model.parameters())

In [60]:
train_data[0]

(array([[3020, 1469, 2508, ...,    0,    0,    0],
        [3413, 1469, 4390, ...,    0,    0,    0],
        [7793, 1469, 1029, ...,    0,    0,    0],
        ...,
        [5923, 1469, 7262, ...,    0,    0,    0],
        [3062, 1469, 5898, ...,    0,    0,    0],
        [1494, 1469, 6855, ...,    0,    0,    0]]),
 array([26, 14, 24, 28, 24, 34,  4, 24, 14, 14, 14, 32, 14, 34, 12, 20, 38,
        10, 24, 14, 56, 24, 14, 18, 10, 26, 48, 24, 18, 14,  4, 18, 22, 14,
        14, 14, 14, 14, 14, 36, 10, 12, 14, 24, 24, 14, 36, 10, 18,  8, 14,
        12, 18, 26, 14, 14, 14, 14, 22, 14,  4, 46, 24, 18]),
 array([[3708, 1469, 3100, ...,    0,    0,    0],
        [ 937, 1469, 8173, ...,    0,    0,    0],
        [8642, 1469, 5316, ...,    0,    0,    0],
        ...,
        [5697, 1469, 2211, ...,    0,    0,    0],
        [8172, 1469, 5495, ...,    0,    0,    0],
        [3708, 1469, 5740, ...,    0,    0,    0]]),
 array([26, 14, 24, 28, 24, 34,  4, 24, 14, 14, 14, 32, 14, 34, 12, 

#### Train and evluate

In [62]:
# 定义训练和验证函数
def evaluate(model, data):
    model.eval()
    total_num_words = total_loss = 0.
    with torch.no_grad():
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()    # 这个是一个batch的英文句子 大小是[batch_size, seq_len]
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()    # 每个句子的长度
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()  # 解码器那边的输入， 输入一个单词去预测另外一个单词
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()   # 解码器那边的输出  [batch_size, seq_len-1]
            mb_y_len = torch.from_numpy(mb_y_len-1).to(device).long()  # 这个减去1， 因为没有了最后一个  [batch_size, seq_len-1]
            mb_y_len[mb_y_len<=0] =  1   # 这句话是为了以防出错
            
            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)
            
            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]  
            # [batch_size, mb_y_len.max()], 上面是bool类型， 下面是float类型， 只计算每个句子的有效部分， 填充的那部分去掉
            mb_out_mask = mb_out_mask.float()  # [batch_size, seq_len-1]  因为mb_y_len.max()就是seq_len-1
            
            loss = loss_fn(mb_pred, mb_output, mb_out_mask)
            
            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words
    print('Evaluation loss', total_loss / total_num_words)

def train(model, data, num_epochs=20):
    for epoch in range(num_epochs):
        model.train()
        total_num_words = total_loss = 0.
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in  enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()
            mb_y_len = torch.from_numpy(mb_y_len-1).to(device).long()
            mb_y_len[mb_y_len<=0] = 1
            
            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)
            
            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()
            
            loss = loss_fn(mb_pred, mb_output, mb_out_mask)
            
            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words
            
            # 更新
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.)     # 这里防止梯度爆炸， 这是和以往不太一样的地方
            optimizer.step()
            
            if it % 100 == 0:
                print('Epoch', epoch, 'iteration', it, 'loss', loss.item())

        print('Epoch', epoch, 'Training loss', total_loss / total_num_words)
        if epoch % 5 == 0:
            evaluate(model, dev_data)
        
# 训练
train(model, train_data, num_epochs=20)

Epoch 0 iteration 0 loss 2.2863216400146484
Epoch 0 iteration 100 loss 2.1728837490081787
Epoch 0 iteration 200 loss 2.1048381328582764
Epoch 0 iteration 300 loss 2.2445578575134277
Epoch 0 iteration 400 loss 2.1681671142578125
Epoch 0 iteration 500 loss 2.302433490753174
Epoch 0 iteration 600 loss 2.2872703075408936
Epoch 0 iteration 700 loss 2.2662642002105713
Epoch 0 iteration 800 loss 2.198660135269165
Epoch 0 iteration 900 loss 2.1695706844329834
Epoch 0 iteration 1000 loss 2.2012994289398193
Epoch 0 iteration 1100 loss 2.187025785446167
Epoch 0 iteration 1200 loss 2.2352516651153564
Epoch 0 iteration 1300 loss 2.2365846633911133
Epoch 0 iteration 1400 loss 2.2648298740386963
Epoch 0 iteration 1500 loss 2.2279813289642334
Epoch 0 iteration 1600 loss 2.256891965866089
Epoch 0 iteration 1700 loss 2.3395471572875977
Epoch 0 iteration 1800 loss 2.2788023948669434
Epoch 0 iteration 1900 loss 2.2199950218200684
Epoch 0 iteration 2000 loss 2.1266772747039795
Epoch 0 iteration 2100 loss 2

RuntimeError: Length of all samples has to be greater than 0, but found an element in 'lengths' that is <= 0

In [57]:
train_data[0][][0]

array([3020, 1469, 2508, 1469, 8798, 1469, 4261, 1469, 4261, 1469, 2340,
       1469, 6683, 1469, 1101, 1469, 3225, 1469, 3458, 1469, 5898, 1469,
       6747, 1469, 8078, 1469,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0])