In [1]:
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

import pandas as pd
from collections import Counter
from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [2]:
# Tab-separated corpus without a header row: the first column is read as
# 'en' and the second as 'zh'.
df = pd.read_csv('cmn.txt', sep='\t', header=None, names=['en', 'zh'])
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
my_vocab = {}

# Chinese side: every character of a sentence counts as one token.
counter = Counter()
for string_ in df['zh']:
    counter.update(string_)
my_vocab['zh'] = vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

# English side: tokens come from the spaCy tokenizer.
counter = Counter()
for string_ in df['en']:
    counter.update(en_tokenizer(string_))
my_vocab['en'] = vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

# Tokens missing from a vocabulary are mapped to that vocabulary's '<unk>'.
for lang_vocab in my_vocab.values():
    lang_vocab.set_default_index(lang_vocab['<unk>'])


def data_process(df):
    """Turn every sentence pair of ``df`` into a pair of index tensors.

    The ``'zh'`` column is split into single characters and the ``'en'`` column
    is split with ``en_tokenizer``; each token is looked up in the matching
    entry of ``my_vocab``.  No ``<bos>``/``<eos>`` markers are added here.

    Returns:
        A list of ``(zh_tensor, en_tensor)`` tuples of dtype ``torch.long``,
        one per row, in row order.
    """
    def encode(lang, tokens):
        # Map a token sequence to a 1-D LongTensor of vocabulary indices.
        table = my_vocab[lang]
        return torch.tensor([table[tok] for tok in tokens], dtype=torch.long)

    return [
        (encode('zh', list(zh_text)), encode('en', en_tokenizer(en_text)))
        for zh_text, en_text in zip(df['zh'], df['en'])
    ]


# The whole corpus is indexed; no held-out split is made here.
train_data = data_process(df)

BATCH_SIZE = 256
# Special-token ids, looked up in the Chinese vocabulary.
PAD_IDX, BOS_IDX, EOS_IDX = (my_vocab['zh'][tok] for tok in ('<pad>', '<bos>', '<eos>'))


def generate_batch(data_batch):
    """Collate ``(zh_tensor, en_tensor)`` pairs into two padded batch tensors.

    Every sequence is wrapped as ``[BOS_IDX] + tokens + [EOS_IDX]``.  The
    sequences of each language are then stacked by ``pad_sequence`` (called
    with its default layout), using ``PAD_IDX`` as filler.  The same three
    special ids are applied to both languages.

    Returns:
        ``(zh_batch, en_batch)``.
    """
    def wrap(seq):
        # Surround one index sequence with the begin/end markers.
        return torch.cat([torch.tensor([BOS_IDX]), seq, torch.tensor([EOS_IDX])], dim=0)

    zh_seqs, en_seqs = [], []
    for zh_item, en_item in data_batch:
        zh_seqs.append(wrap(zh_item))
        en_seqs.append(wrap(en_item))

    return (pad_sequence(zh_seqs, padding_value=PAD_IDX),
            pad_sequence(en_seqs, padding_value=PAD_IDX))


# Shuffled mini-batches of BATCH_SIZE pairs; generate_batch adds the
# begin/end markers and pads each batch.
train_iter = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)

In [3]:
class Encoder(nn.Module):
    """Embedding + dropout + single GRU over a sequence of source token ids."""

    def __init__(self, vocab_size, embed_size, hidden_size, dropout=0.2):
        """Build the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            embed_size: embedding width, also the GRU input size.
            hidden_size: GRU hidden-state width.
            dropout: dropout probability applied to the embeddings.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.rnn = nn.GRU(input_size=embed_size, hidden_size=hidden_size)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Encode token ids ``x``.

        The GRU is built with its default ``batch_first=False``, so ``x`` is
        expected as ``(seq_len, batch)``.

        Returns:
            The ``(output, final_hidden)`` tuple produced by the GRU.
        """
        return self.rnn(self.dropout(self.embed(x)))


class Decoder(nn.Module):
    """Embedding + dropout + single GRU + linear projection to vocabulary scores."""

    def __init__(self, vocab_size, embed_size, hidden_size, dropout=0.2):
        """Build the layers.

        Args:
            vocab_size: size of the embedding table and of the output layer.
            embed_size: embedding width, also the GRU input size.
            hidden_size: GRU hidden-state width.
            dropout: dropout probability applied to the embeddings.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.rnn = nn.GRU(input_size=embed_size, hidden_size=hidden_size)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, y, hidden):
        """Run the GRU over token ids ``y`` starting from ``hidden``.

        Returns:
            ``(scores, new_hidden)`` where ``scores`` is the GRU output passed
            through the linear layer (last dimension ``vocab_size``).
        """
        step_input = self.dropout(self.embed(y))
        rnn_out, new_hidden = self.rnn(step_input, hidden)
        return self.fc(rnn_out), new_hidden


class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time."""

    def __init__(self, encoder, decoder, device):
        """Store the two halves of the model.

        Args:
            encoder: module whose call returns ``(outputs, hidden)``.
            decoder: module exposing ``vocab_size`` and whose call
                ``decoder(y, hidden)`` returns ``(scores, hidden)``.
            device: accepted for backward compatibility only; ``forward``
                allocates its result on the device of ``tgt`` instead of
                relying on this argument or on a module-level global.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt, teacher_forcing_ratio=0.0):
        """Decode ``tgt.shape[0] - 1`` steps conditioned on ``src``.

        Args:
            src: source token ids, passed unchanged to the encoder.
            tgt: target token ids indexed as ``(step, batch)``; row 0 is fed
                to the decoder as the first input.
            teacher_forcing_ratio: probability, drawn once per step, of
                feeding the ground-truth token ``tgt[t]`` as the next input
                instead of the decoder's own arg-max prediction.  The default
                ``0.0`` always feeds back the prediction (the original
                behaviour) and draws no random numbers.

        Returns:
            Float tensor ``(max_len, batch, decoder.vocab_size)`` on
            ``tgt.device``.  Row 0 is left as zeros because decoding starts
            at step 1.
        """
        _, hidden = self.encoder(src)
        max_len, batch_size = tgt.shape[0], tgt.shape[1]
        # Allocate next to the targets rather than on a global `device`.
        output = torch.zeros(max_len, batch_size, self.decoder.vocab_size,
                             device=tgt.device)
        y = tgt[0, :]
        for t in range(1, max_len):
            # Out-of-place (un)squeeze: never touch views of `tgt` or tensors
            # that autograd still needs.
            scores, hidden = self.decoder(y.unsqueeze(0), hidden)
            scores = scores.squeeze(0)
            output[t] = scores
            use_truth = (teacher_forcing_ratio > 0
                         and torch.rand(1).item() < teacher_forcing_ratio)
            y = tgt[t] if use_truth else scores.max(1)[1]
        return output

In [4]:
# The encoder's embedding table is sized by the Chinese vocabulary, the
# decoder's embedding/output layers by the English one; both use 64-wide
# embeddings and 64 hidden units.
enc = Encoder(vocab_size=len(my_vocab['zh']), embed_size=64, hidden_size=64)
dec = Decoder(vocab_size=len(my_vocab['en']), embed_size=64, hidden_size=64)

# Wrap both halves and move the model to the selected device.
model = Seq2Seq(enc, dec, device).to(device)


def init_weights(m: nn.Module):
    """Re-initialise, in place, every parameter reachable from ``m``.

    Parameters whose name contains ``'weight'`` are drawn from a normal
    distribution with mean 0 and std 0.01; all others are set to 0.
    """
    for param_name, tensor in m.named_parameters():
        if 'weight' not in param_name:
            nn.init.constant_(tensor.data, 0)
            continue
        nn.init.normal_(tensor.data, mean=0, std=0.01)


# nn.Module.apply invokes init_weights on the model and on each of its submodules.
model.apply(init_weights)
# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())
# Target positions equal to PAD_IDX are ignored by the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX).to(device)


def count_parameters(model: nn.Module):
    """Return the total number of elements across parameters with ``requires_grad`` set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,162,824 trainable parameters


In [130]:
# Training: 100 passes over train_iter, printing the mean batch loss per epoch.
model.train()  # enable dropout
for epoch in range(100):
    epoch_loss = 0
    for src, tgt in train_iter:
        src = src.to(device)
        tgt = tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt)
        # Row 0 of `output` is never written by Seq2Seq.forward and row 0 of
        # `tgt` is the <bos> marker, so both are dropped before flattening
        # to (positions, vocab) scores and (positions,) labels.
        output = output[1:].view(-1, output.shape[-1])
        tgt = tgt[1:].view(-1)
        loss = criterion(output, tgt)
        loss.backward()
        # Cap the global gradient norm at 1 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        epoch_loss += loss.item()
    # len(train_iter) is the number of batches, so this is the mean batch loss.
    print('epoch:', epoch + 1, ', loss:', epoch_loss / len(train_iter))

epoch: 1 , loss: 2.142829372222165
epoch: 2 , loss: 2.138854519430413
epoch: 3 , loss: 2.1333219128918937
epoch: 4 , loss: 2.129233419177044
epoch: 5 , loss: 2.1305747218878874
epoch: 6 , loss: 2.124239278126912
epoch: 7 , loss: 2.116394963609167
epoch: 8 , loss: 2.1189611748040442
epoch: 9 , loss: 2.120151555681803
epoch: 10 , loss: 2.1185890321271965
epoch: 11 , loss: 2.1128992632210974
epoch: 12 , loss: 2.1098639591630683
epoch: 13 , loss: 2.1058094286056885
epoch: 14 , loss: 2.106322065893426
epoch: 15 , loss: 2.099632248821029
epoch: 16 , loss: 2.0954696227269
epoch: 17 , loss: 2.092597869505365
epoch: 18 , loss: 2.0933723837496285
epoch: 19 , loss: 2.090676277516836
epoch: 20 , loss: 2.087188844221184
epoch: 21 , loss: 2.0880146931452925
epoch: 22 , loss: 2.0820474610271225
epoch: 23 , loss: 2.0824243763843215
epoch: 24 , loss: 2.0796353587185044
epoch: 25 , loss: 2.0744919748191375
epoch: 26 , loss: 2.0732915128570006
epoch: 27 , loss: 2.069009046956717
epoch: 28 , loss: 2.06651

In [134]:
model.eval()


def translate(zh, max_len=10):
    """Greedily translate a Chinese string into space-joined English tokens.

    The input is split into characters, wrapped in ``<bos>``/``<eos>`` and
    encoded; the decoder is then fed its own arg-max prediction step by step,
    starting from the English ``<bos>``.

    The model is switched to eval mode for the duration of the call and its
    previous mode is restored afterwards, and decoding runs under
    ``torch.no_grad()``, so the result does not depend on which cell ran last
    and no autograd graph is built.

    Args:
        zh: Chinese source sentence.
        max_len: maximum number of tokens to generate.

    Returns:
        The predicted tokens joined by single spaces.  Generation stops early
        at the first ``<eos>``, which is not included in the result.
    """
    zh_vocab, en_vocab = my_vocab['zh'], my_vocab['en']
    zh_idx = [zh_vocab['<bos>']] + zh_vocab.lookup_indices(list(zh)) + [zh_vocab['<eos>']]
    # One column, i.e. a batch holding a single sentence.
    zh_idx = torch.tensor(zh_idx, dtype=torch.long, device=device).unsqueeze(1)
    en_eos = en_vocab['<eos>']
    # Build the index -> token table once rather than on every decoding step.
    itos = en_vocab.get_itos()

    preds = []
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            _, hidden = model.encoder(zh_idx)
            y = torch.tensor([[en_vocab['<bos>']]], dtype=torch.long, device=device)
            for _ in range(max_len):
                scores, hidden = model.decoder(y, hidden)
                best = scores.squeeze(1).max(1)[1]
                token_idx = best.item()
                if token_idx == en_eos:
                    break
                preds.append(itos[token_idx])
                y = best.unsqueeze(1)
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)
    return ' '.join(preds)

In [148]:
# Single-sentence smoke test of translate() with the default max_len.
print(translate('我是一个学生'))

I 'm a slight of student .


In [132]:
# Spot-check the first 100 corpus sentences. These are part of train_data,
# so this is not a held-out evaluation.
for zh in df['zh'][0: 100]:
    print(zh, '   ==>   ', translate(zh, max_len=10))

嗨。    ==>    Hi people
你好。    ==>    You you . .
你用跑的。    ==>    You made him .
等等！    ==>    Wait waiting party waiting
你好。    ==>    You you . .
让我来。    ==>    Let me . . .
我赢了。    ==>    I won won .
不会吧。    ==>    Why are not .
乾杯!    ==>    Cheers !
你懂了吗？    ==>    Got you ? ?
他跑了。    ==>    He turned it
跳进来。    ==>    Hop in once
我迷失了。    ==>    I lost lost person .
我退出。    ==>    I confessed the car .
我沒事。    ==>    I do n't serious . .
听着。    ==>    Answer the party .
不可能！    ==>    Stop ca understand !
没门！    ==>    Shut open the
你确定？    ==>    Really you . .
试试吧。    ==>    Try it .
我们来试试。    ==>    We 'll try another . .
为什么是我？    ==>    Are you coming ?
去问汤姆。    ==>    Tom Tom .
冷静点。    ==>    Be late cold
公平点。    ==>    Above of fair .
友善点。    ==>    Dogs a sharp
和气点。    ==>    Be a late
联系我。    ==>    I me advice
联系我们。    ==>    Answer us later
进来。    ==>    Wait and straight back
找到汤姆。    ==>    We Tom .
滾出去！    ==>    Get out the . .
出去！    ==>    Get to the .
走開！    ==> 