In [15]:
import jieba
import pandas as pd
from tqdm.notebook import tqdm
from collections import Counter
from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 1.准备数据，建立vocab

In [8]:
df = pd.read_csv('cmn.txt', sep='\t', header=None, names=['en', 'zh'])
my_vocab = {}

# 使用torchtext.vocab.vocab建立中文词表，按字切分，不分词
counter = Counter()
for string_ in df['zh']:
    counter.update(list(string_))
my_vocab['zh'] = vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
my_vocab['zh'].set_default_index(my_vocab['zh']['<unk>'])

# 使用torchtext.vocab.vocab建立英文词表，用spacy进行分词
counter = Counter()
for string_ in df['en']:
    counter.update(jieba.cut(string_))
my_vocab['en'] = vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
my_vocab['en'].set_default_index(my_vocab['en']['<unk>'])

# 将中文的字和英文词分别转换成vocab中对应的index
def data_process(df):
    data = []
    for raw_zh, raw_en in zip(df['zh'], df['en']):
        zh_tensor_ = torch.LongTensor([my_vocab['zh'][token] for token in list(raw_zh)])
        en_tensor_ = torch.LongTensor([my_vocab['en'][token] for token in jieba.cut(raw_en)])
        data.append((zh_tensor_, en_tensor_))
    return data

train_data = data_process(df)

BATCH_SIZE = 256
PAD_IDX = my_vocab['zh']['<pad>']
BOS_IDX = my_vocab['zh']['<bos>']
EOS_IDX = my_vocab['zh']['<eos>']

# collate_fn，传给DataLoader，对于每一个batch，将其中的句子都pad成和最长的一样长，用PAD_IDX填充
def generate_batch(data_batch):
    zh_batch, en_batch = [], []
    for zh_item, en_item in data_batch:
        zh_batch.append(torch.cat([torch.tensor([BOS_IDX]), zh_item, torch.tensor([EOS_IDX])], dim=0))
        en_batch.append(torch.cat([torch.tensor([BOS_IDX]), en_item, torch.tensor([EOS_IDX])], dim=0))
    zh_batch = pad_sequence(zh_batch, padding_value=PAD_IDX)
    en_batch = pad_sequence(en_batch, padding_value=PAD_IDX)
    return zh_batch, en_batch

# 用DataLoader获取train数据迭代器
train_iter = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)

# 2.构建Encoder、Decoder和Seq2Seq

In [17]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, dropout=0.2):
        super().__init__()
        self.vocab_size = vocab_size  # encoder vocab size
        self.embed = nn.Embedding(vocab_size, embed_size)  # 将vocab size嵌入到embed size
        # GRU循环网络，输入[steps * batch_size * embde_size]，输出[steps * batch_size * hidden_size]
        self.rnn = nn.GRU(embed_size, hidden_size)  
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # 返回encoder的输出，大小为[steps*batch_size*hidden_size]
        # 返回encoder GRU隐层的最后一步
        embedded = self.dropout(self.embed(x))
        enc_output, enc_hidden = self.rnn(embedded)
        return enc_output, enc_hidden  


class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, dropout=0.2):
        super().__init__()
        self.vocab_size = vocab_size  # decoder vocab size
        self.embed = nn.Embedding(vocab_size, embed_size)  # 将vocab size嵌入到embed size
        # GRU循环网络，输入[steps*batch_size*embde_size]，输出[steps*batch_size*hidden_size]
        self.rnn = nn.GRU(embed_size, hidden_size)
        self.fc = nn.Linear(hidden_size, vocab_size)  # 全连接层，输出尺寸为decoder vocab size
        self.dropout = nn.Dropout(dropout)

    def forward(self, y, hidden):
        embedded = self.dropout(self.embed(y))
        dec_output, hidden = self.rnn(embedded, hidden)
        dec_output = self.fc(dec_output)
        return dec_output, hidden


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt):
        enc_output, hidden = self.encoder(src)  # 首先拿到encoder的output和最后一个时间步的隐状态
        max_len, batch_size = tgt.shape[0], tgt.shape[1]  
        # Seq2Seq output尺寸为[tgt_max_len*batch_size*tgt_vocab_size]
        output = torch.zeros(max_len, batch_size, self.decoder.vocab_size).to(device)
        # 先拿tgt的第一个时间步，即<bos>开始，输入到decoder中，第一个时刻的hidden为encoder的
        # 最后一个时间步的hidden
        y = tgt[0, :]  
        # 第二步开始，遍历tgt的每一个时间步，decoder输入为上一时刻的预测结果，已经上一时刻的hidden
        for t in range(1, max_len):  
            y.unsqueeze_(0)
            y, hidden = self.decoder(y, hidden)
            y.squeeze_(0)
            output[t] = y
            y = y.max(1)[1]
        return output
    
# 初始化encoder、decoder和Seq2Seq
enc = Encoder(vocab_size=len(my_vocab['zh']), embed_size=64, hidden_size=64)
dec = Decoder(vocab_size=len(my_vocab['en']), embed_size=64, hidden_size=64)
model = Seq2Seq(enc, dec).to(device)
model

Seq2Seq(
  (encoder): Encoder(
    (embed): Embedding(3441, 64)
    (rnn): GRU(64, 64)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (decoder): Decoder(
    (embed): Embedding(6923, 64)
    (rnn): GRU(64, 64)
    (fc): Linear(in_features=64, out_features=6923, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
)

# 3.模型参数初始化

In [24]:
optimizer = optim.Adam(model.parameters(), lr=1e-4)
# loss为交叉熵，忽略PAD_IDX
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX).to(device)

# 查看模型中有多少可学习的参数
def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,163,211 trainable parameters


# 4.训练模型

In [25]:
model.train()
for epoch in range(5):
    epoch_loss = 0
    for src, tgt in tqdm(train_iter):
        src = src.to(device)
        tgt = tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt)
        output = output[1:].view(-1, output.shape[-1])
        tgt = tgt[1:].view(-1)
        loss = criterion(output, tgt)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        epoch_loss += loss.item()
    print('epoch:', epoch + 1, ', loss:', epoch_loss / len(train_data))

  0%|          | 0/83 [00:00<?, ?it/s]

epoch: 1 , loss: 0.012823243544485155


  0%|          | 0/83 [00:00<?, ?it/s]

epoch: 2 , loss: 0.012761838645234119


  0%|          | 0/83 [00:00<?, ?it/s]

[E thread_pool.cpp:113] Exception in thread pool task: mutex lock failed: Invalid argument


KeyboardInterrupt: 

# 5.使用模型进行翻译

In [20]:
model.eval()
# 先讲中文输入到encoder中，拿到encoder的hidden，从<bos>依次输入到decoder中，
# 直到预测到<eos>停止，或者超过设定的max_len时停止
def translate(zh, max_len=10):
    zh_idx = [my_vocab['zh']['<bos>']] + my_vocab['zh'].lookup_indices(list(zh)) + [my_vocab['zh']['<eos>']]
    zh_idx = torch.tensor(zh_idx, dtype=torch.long, device=device).unsqueeze_(1)
    en_bos = my_vocab['en']['<bos>']
    enc_output, hidden = model.encoder(zh_idx)
    preds = []
    y = torch.tensor([en_bos], dtype=torch.long, device=device)
    for t in range(max_len):
        y.unsqueeze_(1)
        y, hidden = model.decoder(y, hidden)
        y.squeeze_(1)
        y = y.max(1)[1]
        if y.item() == my_vocab['en']['<eos>']:
            break
        preds.append(my_vocab['en'].get_itos()[y.item()])
    return ' '.join(preds)

In [21]:
print(translate('我是一个学生'))

I                  


In [22]:
for zh in df['zh'][0: 100]:
    print(zh, '   ==>   ', translate(zh, max_len=10))

嗨。    ==>    Tom                  
你好。    ==>    I                  
你用跑的。    ==>    I                  
等等！    ==>    Tom                  
你好。    ==>    I                  
让我来。    ==>    I                  
我赢了。    ==>    I                  
不会吧。    ==>    Tom                  
乾杯!    ==>    Tom                  
你懂了吗？    ==>    I                  
他跑了。    ==>    I                  
跳进来。    ==>    I                  
我迷失了。    ==>    I                  
我退出。    ==>    I                  
我沒事。    ==>    I                  
听着。    ==>    Tom                  
不可能！    ==>    Tom                  
没门！    ==>    Tom                  
你确定？    ==>    I                  
试试吧。    ==>    The                  
我们来试试。    ==>    I                  
为什么是我？    ==>    I                  
去问汤姆。    ==>    I                  
冷静点。    ==>    Tom                  
公平点。    ==>    Tom                  
友善点。    ==>    Tom                  
和气点。    ==>    Tom                  
联系我。    ==>    I               