In [75]:
import os
import sys
import math

from collections import Counter
import numpy as np
import random 

import torch
import torch.nn as nn
import torch.nn.functional as F

import nltk
#nltk.download()

import jieba

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 数据准备
## Tokenization

In [76]:
def load_data(in_file):
    cn = []
    en = []
    num_examples = 0
    with open(in_file, 'r') as f:
        for line in f:
            line = line.strip().split('\t')      # 每一行是英文+翻译的形式
            en.append(['BOS'] + nltk.word_tokenize(line[0].lower()) + ['EOS'])
            cn.append(['BOS'] +  list(jieba.cut(line[1]))+ ['EOS'])
    return en, cn

train_file = 'nmt/train.txt'
dev_file = 'nmt/dev.txt'
train_en, train_cn = load_data(train_file)
dev_en, dev_cn = load_data(dev_file)

## Numericalization

In [77]:
UNK_IDX = 0
PAD_IDX = 1
def build_dict(sentences, max_words=50000):
    word_count = Counter() 
    for sentence in sentences:
        for s in sentence:
            word_count[s] += 1  
    ls = word_count.most_common(max_words) 
    print(len(ls)) 
    total_words = len(ls) + 2
    word_dict = {w[0]: index+2 for index, w in enumerate(ls)}
    
    word_dict["UNK"] = UNK_IDX 
    word_dict["PAD"] = PAD_IDX 
    return word_dict, total_words

en_dict, en_total_words = build_dict(train_en) 
cn_dict, cn_total_words = build_dict(train_cn)

inv_en_dict = {v: k for k, v in en_dict.items()}
inv_cn_dict = {v: k for k, v in cn_dict.items()}

2014
3031


In [78]:
def encode(en_sentences, cn_sentences, en_dict, cn_dict, sort_by_len=True):
    out_en_sentences = [[en_dict.get(w, 0) for w in sent] for sent in en_sentences]
    out_cn_sentences = [[cn_dict.get(w, 0) for w in sent] for sent in cn_sentences]
    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))
    if sort_by_len:
        sorted_index = len_argsort(out_en_sentences)
        out_en_sentences = [out_en_sentences[i] for i in sorted_index]
        out_cn_sentences = [out_cn_sentences[i] for i in sorted_index]
    return out_en_sentences, out_cn_sentences

train_en, train_cn = encode(train_en, train_cn, en_dict, cn_dict)
dev_en, dev_cn = encode(dev_en, dev_cn, en_dict, cn_dict)

In [79]:
def get_batches(n, batch_size, shuffle=True):
    idx_list = np.arange(0, n, batch_size) 
    if shuffle:
        np.random.shuffle(idx_list)
    batches = []
    for idx in idx_list:
        batches.append(np.arange(idx, min(idx + batch_size, n)))
    return batches

def sent_padding(seqs):
    lengths = [len(seq) for seq in seqs]
    n_samples = len(seqs) 
    max_len = np.max(lengths) # 取出最长的的语句长度
    x = np.zeros((n_samples, max_len)).astype('int32')
    x_lengths = np.array(lengths).astype("int32")
    for idx, seq in enumerate(seqs):
        x[idx, :lengths[idx]] = seq
    return x, x_lengths 

def get_examples(en_sentences, cn_sentences, batch_size):
    batches = get_batches(len(en_sentences), batch_size)
    all_ex = []
    for batch in batches: 
        mb_en_sentences = [en_sentences[t] for t in batch]        
        mb_cn_sentences = [cn_sentences[t] for t in batch]
        # padding
        mb_x, mb_x_len = sent_padding(mb_en_sentences)
        mb_y, mb_y_len = sent_padding(mb_cn_sentences)
        
        all_ex.append((mb_x, mb_x_len, mb_y, mb_y_len))
        # （英文句子，英文句子长度，中文句子，中文句子长度） 
    return all_ex


batch_size = 64
train_data = get_examples(train_en, train_cn, batch_size)  # (mb_x, mb_x_len, mb_y, mb_y_len)
random.shuffle(train_data) 
dev_data = get_examples(dev_en, dev_cn, batch_size) 

# 模型架构
## Encoder

In [80]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2):
        super(Encoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, enc_hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(enc_hidden_size * 2, dec_hidden_size)

    def forward(self, x, lengths):
        sorted_len, sorted_idx = lengths.sort(0, descending=True)
        x_sorted = x[sorted_idx.long()]
        embedded = self.dropout(self.embed(x_sorted))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_len.long().cpu().data.numpy(), batch_first=True)
        packed_out, hid = self.rnn(packed_embedded)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        out = out[original_idx.long()].contiguous()
        hid = hid[:, original_idx.long()].contiguous()
        # hid: [2, batch_size, enc_hidden_size]
        hid = torch.cat([hid[-2], hid[-1]], dim=1) # 将最后一层的hid的双向拼接
        # hid: [batch_size, 2*enc_hidden_size]
        hid = torch.tanh(self.fc(hid)).unsqueeze(0)
        # hid: [1, batch_size, dec_hidden_size]
        # out: [batch_size, seq_len, 2*enc_hidden_size]
        return out, hid

## Decoder

In [81]:
class Attention(nn.Module):
    def __init__(self, enc_hidden_size, dec_hidden_size):
        # enc_hidden_size跟Encoder的一样
        super(Attention, self).__init__()
        self.enc_hidden_size = enc_hidden_size
        self.dec_hidden_size = dec_hidden_size

        self.linear_in = nn.Linear(enc_hidden_size*2, dec_hidden_size, bias=False)
        self.linear_out = nn.Linear(enc_hidden_size*2 + dec_hidden_size, dec_hidden_size)
        
    def forward(self, output, context, mask):
        batch_size = output.size(0)
        output_len = output.size(1)
        input_len = context.size(1) # input_len = context_len
        context_in = self.linear_in(context.view(batch_size*input_len, -1)).view(                
            batch_size, input_len, -1) # batch_size, context_len, dec_hidden_size
        attn = torch.bmm(output, context_in.transpose(1,2)) 
        attn = F.softmax(attn, dim=2) 
        context = torch.bmm(attn, context) 
        output = torch.cat((context, output), dim=2) 
        output = output.view(batch_size*output_len, -1)
        output = torch.tanh(self.linear_out(output)) 
        output = output.view(batch_size, output_len, -1)
        return output, attn

In [82]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2,hidden_size=100):
        super(Decoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.attention = Attention(enc_hidden_size, dec_hidden_size)
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True)
        self.out = nn.Linear(dec_hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def create_mask(self, x_len, y_len):
        device = x_len.device
        max_x_len = x_len.max()
        max_y_len = y_len.max()
        x_mask = torch.arange(max_x_len, device=device)[None, :] < x_len[:, None]
        y_mask = torch.arange(max_y_len, device=device)[None, :] < y_len[:, None]
        mask = ( ~ x_mask[:, :, None] * y_mask[:, None, :]).byte()
        return mask
    
    def forward(self, encoder_out, x_lengths, y, y_lengths, hid):
        sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
        y_sorted = y[sorted_idx.long()]
        hid = hid[:, sorted_idx.long()]
        y_sorted = self.dropout(self.embed(y_sorted)) # batch_size, output_length, embed_size
        packed_seq = nn.utils.rnn.pack_padded_sequence(y_sorted, sorted_len.long().cpu().data.numpy(), batch_first=True)
        out, hid = self.rnn(packed_seq, hid)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        output_seq = unpacked[original_idx.long()].contiguous()
        hid = hid[:, original_idx.long()].contiguous()
        mask = self.create_mask(y_lengths, x_lengths)
        output, attn = self.attention(output_seq, encoder_out, mask) 
        output = F.log_softmax(self.out(output), -1) 
        return output, hid, attn

## Model: Seq2seq

In [83]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, hid = self.encoder(x, x_lengths)
        # print(hid.shape)=torch.Size([1, batch_size, dec_hidden_size])
        # print(out.shape)=torch.Size([batch_size, seq_len, 2*enc_hidden_size])
        output, hid, attn = self.decoder(encoder_out=encoder_out, 
                    x_lengths=x_lengths,
                    y=y,
                    y_lengths=y_lengths,
                    hid=hid)
        # output =(batch_size, output_len, vocab_size)
        # hid.shape = (1, batch_size, dec_hidden_size)
        # attn.shape = (batch_size, output_len, context_len)
        return output, attn

    def translate(self, x, x_lengths, y, max_length=100):
        encoder_out, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        for i in range(max_length):
            output, hid, attn = self.decoder(encoder_out=encoder_out, 
                    x_lengths=x_lengths,
                    y=y,
                    y_lengths=torch.ones(batch_size).long().to(y.device),
                    hid=hid)
            y = output.max(2)[1].view(batch_size, 1)
            preds.append(y)
            attns.append(attn)
        return torch.cat(preds, 1), torch.cat(attns, 1)

## Loss

In [84]:
# masked cross entropy loss
class LanguageModelCriterion(nn.Module):
    def __init__(self):
        super(LanguageModelCriterion, self).__init__()

    def forward(self, input, target, mask):
        input = input.contiguous().view(-1, input.size(2))
        target = target.contiguous().view(-1, 1)
        mask = mask.contiguous().view(-1, 1)
        output = -input.gather(1, target) * mask
        output = torch.sum(output) / torch.sum(mask)
        return output

# 训练
## 参数选择

In [85]:
dropout = 0.2
embed_size = hidden_size = 100
encoder = Encoder(vocab_size=en_total_words,
                    embed_size=embed_size,
                    enc_hidden_size=hidden_size,
                    dec_hidden_size=hidden_size,
                    dropout=dropout)
decoder = Decoder(vocab_size=cn_total_words,
                    embed_size=embed_size,
                    enc_hidden_size=hidden_size,
                    dec_hidden_size=hidden_size,
                    dropout=dropout)
model = Seq2Seq(encoder, decoder)
model = model.to(device)
loss_fn = LanguageModelCriterion().to(device)
optimizer = torch.optim.Adam(model.parameters())

## 定义训练函数

In [86]:
def train(model, data, num_epochs=2):
    for epoch in range(num_epochs):
        model.train()
        total_num_words = total_loss = 0.
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            #（英文batch，英文长度，中文batch，中文长度）         
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()  
            # 前n-1个单词作为输入，后n-1个单词作为输出，因为输入的前一个单词要预测后一个单词
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()
            mb_y_len = torch.from_numpy(mb_y_len-1).to(device).long()
           
            mb_y_len[mb_y_len<=0] = 1
            
            optimizer.zero_grad()
            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)
            
            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()  # 下三角矩阵

            loss = loss_fn(mb_pred, mb_output, mb_out_mask)
            
            num_words = torch.sum(mb_y_len).item()  # 一个batch里多少个单词 
            total_loss += loss.item() * num_words 
            total_num_words += num_words
          
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.)
            #为了防止梯度过大，设置梯度的阈值 
            optimizer.step()
            
            if it % 100 == 0:
                print("Epoch", epoch, "iteration", it, "loss", loss.item())

        print("Epoch", epoch, "Training loss", total_loss/total_num_words)
        if epoch % 5 == 0:
            evaluate(model, dev_data) 

## 评估函数

In [87]:
def evaluate(model, data):
    model.eval()
    total_num_words = total_loss = 0.
    with torch.no_grad():#不需要更新模型，不需要梯度
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()
            mb_y_len = torch.from_numpy(mb_y_len-1).to(device).long()
            mb_y_len[mb_y_len<=0] = 1

            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)

            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()

            loss = loss_fn(mb_pred, mb_output, mb_out_mask)

            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words
    print("Evaluation loss", total_loss/total_num_words)

## 预测

In [88]:
def translate_dev(i):
    en_sent = " ".join([inv_en_dict[w] for w in dev_en[i]])
    print(en_sent)
    cn_sent = " ".join([inv_cn_dict[w] for w in dev_cn[i]])
    print("".join(cn_sent))

    mb_x = torch.from_numpy(np.array(dev_en[i]).reshape(1, -1)).long().to(device)   
    mb_x_len = torch.from_numpy(np.array([len(dev_en[i])])).long().to(device)
    bos = torch.Tensor([[cn_dict["BOS"]]]).long().to(device)

    translation, attn = model.translate(mb_x, mb_x_len, bos)
    # 这里传入bos作为首个单词的输入
    #translation=tensor([[ 8,  6, 11, 25, 22, 57, 10,  5,  6,  4]], device='cuda:0')
    translation = [inv_cn_dict[i] for i in translation.data.cpu().numpy().reshape(-1)]
     
    trans = []
    for word in translation:
        if word != "EOS": 
            trans.append(word) 
        else:
            break
    print("".join(trans))


In [89]:
train(model, train_data, num_epochs=30)

Epoch 0 iteration 0 loss 8.011500358581543
Epoch 0 Training loss 6.756094622533424
Evaluation loss 5.6999799464571765
Epoch 1 iteration 0 loss 5.798335075378418
Epoch 1 Training loss 5.466534294509326
Epoch 2 iteration 0 loss 5.529472351074219
Epoch 2 Training loss 5.285823825766366
Epoch 3 iteration 0 loss 5.407910346984863
Epoch 3 Training loss 5.166656795135513
Epoch 4 iteration 0 loss 5.3240885734558105
Epoch 4 Training loss 5.073005681552489
Epoch 5 iteration 0 loss 5.256463050842285
Epoch 5 Training loss 4.991418913231434
Evaluation loss 5.405315972189129
Epoch 6 iteration 0 loss 5.183313369750977
Epoch 6 Training loss 4.9111994279650535
Epoch 7 iteration 0 loss 5.100142478942871
Epoch 7 Training loss 4.8293622338364735
Epoch 8 iteration 0 loss 5.013706684112549
Epoch 8 Training loss 4.751505716461744
Epoch 9 iteration 0 loss 4.955442905426025
Epoch 9 Training loss 4.669564300295297
Epoch 10 iteration 0 loss 4.870354175567627
Epoch 10 Training loss 4.586660949672792
Evaluation lo

In [90]:
for i in range(100,120):
    translate_dev(i)
    print()

BOS you may go anywhere . EOS
BOS 你 可以 随便 去 哪儿 。 EOS
你可以任何你。

BOS do n't UNK me . EOS
BOS UNK UNK 我 。 EOS
請我。

BOS here 's your tea . EOS
BOS 这 是 你 的 UNK 。 EOS
你是你的。

BOS i 'm UNK UNK . EOS
BOS 我 真是 UNK 。 EOS
我的房子。

BOS tom could be UNK . EOS
BOS 汤姆 可能 是 UNK 。 EOS
汤姆是个好。

BOS what happened that night ? EOS
BOS 这个 晚上 发生 了 什么 ？ EOS
這是什麼？

BOS prices are going up . EOS
BOS UNK 了 。 EOS
这是个一个。

BOS i took a shower . EOS
BOS 我 洗 了 澡 。 EOS
我已經了。

BOS how is your wife ? EOS
BOS 你 太太 怎么样 ？ EOS
我可以了什麼？

BOS please UNK your name . EOS
BOS 请 UNK 一下 您 的 名字 。 EOS
你想你的東西。

BOS have you eaten lunch ? EOS
BOS 你 吃 UNK 飯 了 嗎 ？ EOS
你在哪里？

BOS it 's our pleasure . EOS
BOS 这是 我们 的 UNK 。 EOS
这是个好。

BOS i ca n't remember . EOS
BOS 我 UNK 來 。 EOS
我不喜欢。

BOS he traveled on business . EOS
BOS 他 旅行 UNK 。 EOS
他是个好了。

BOS is his UNK regular ? EOS
BOS 他 的 UNK UNK UNK 嗎 ？ EOS
他是什麼？

BOS he 's very UNK . EOS
BOS 他 说话 很 直接 。 EOS
他是个好。

BOS my UNK are UNK . EOS
BOS 我 的 UNK UNK 。 EOS
我的房子的房子。

BOS she works very hard . EO