In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy
import numpy as np
import random
import math
import time

In [2]:
CUDA_IS_AVAILABLE = True if torch.cuda.is_available() else False
SEED = 1
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if CUDA_IS_AVAILABLE:
    torch.cuda.manual_seed(SEED)
    device = torch.device('cuda:0')
else:
    device = None

### 数据预处理

In [3]:
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [4]:
def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]
def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [5]:
SRC = Field(tokenize = tokenize_de, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

In [6]:
"""
Multi30k.splits(
    exts,
    fields,
    root='.data',
    train='train',
    validation='val',
    test='test2016',
    **kwargs,
)
"""

train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'),
                                                    root='.',
                                                    train='train',
                                                    validation='val',
                                                    test='test',
                                                    fields = (SRC, TRG))

In [7]:
train_data

<torchtext.legacy.datasets.translation.Multi30k at 0x1cbc88fc730>

In [8]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [9]:
print(vars(train_data.examples[0]))

{'src': ['.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}


In [10]:
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [11]:
train_data

<torchtext.legacy.datasets.translation.Multi30k at 0x1cbc88fc730>

In [12]:
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (de) vocabulary: 7853
Unique tokens in target (en) vocabulary: 5893


In [13]:
BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device
)

In [14]:
batch = next(iter(train_iterator))
batch


[torchtext.legacy.data.batch.Batch of size 128 from MULTI30K]
	[.src]:[torch.cuda.LongTensor of size 34x128 (GPU 0)]
	[.trg]:[torch.cuda.LongTensor of size 41x128 (GPU 0)]

In [15]:
print(batch.src)
print(batch.trg)

tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [   4,    4,    4,  ...,    4,    4,    4],
        [1365, 2921,    0,  ...,  129,   69,  215],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]], device='cuda:0')
tensor([[  2,   2,   2,  ...,   2,   2,   2],
        [  4,  16,   4,  ...,   4,   4,  16],
        [  9,  19,  53,  ...,   9, 762, 127],
        ...,
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1]], device='cuda:0')


In [16]:
text = " ".join(SRC.vocab.itos[i] for i in batch.src[:,0])
print(text)

<sos> . stift einen hält und sofa einem auf liegt tätowierungen mit mann ein <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


In [17]:
text = " ".join(TRG.vocab.itos[i] for i in batch.trg[:,0])
print(text)

<sos> a man with tattoos is lounging on a couch holding a pencil . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


### 模型搭建

#### Encoder

In [18]:
class Encoder(nn.Module):
    def __init__(self, embedding_size, src_vocab_size, hidden_size, n_layers, dropout):
        super().__init__()
        
        self.embedding_size = embedding_size
        self.src_vocab_size = src_vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, src):
        # src => [seq_len, batch_size]
        embedded = self.embedding(src) # embedded => [seq_len, batch_size, embedding_size]
        output, (hidden, cell) = self.lstm(embedded)
#         output => [seq_len, batch_size, hidden_size]
#         hidden => [n_layers, batch_size, hidden_size]
#         cell => [n_layers, batch_size, hidden_size]
        return hidden, cell

#### Decoder

In [19]:
class Decoder(nn.Module):
    def __init__(self, embedding_size, hidden_size, trg_vocab_size, n_layers, dropout):
        super().__init__()
        
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.trg_vocab_size = trg_vocab_size
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.embedd = nn.Embedding(trg_vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, dropout=dropout)
        self.linear = nn.Linear(hidden_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input, hidden, cell):
#         input => [batch_size]
#         hidden => [n_layers, batch_size, hidden_size]
#         cell => [n_layers, batch_size, hidden_size]
        input = input.unsqueeze(0) # input => [1, batch_size]
        embedded = self.embedd(input) # embedded => [1, batch_size, embedding_size]
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
#         output => [1, batch_size, hidden_size]
#         hidden => [n_layers, batch_size, hidden_size]
#         cell => [n_layers, batch_size, hidden_size]
        prediction = self.linear(output.squeeze(0)) # prediction => [batch_size, trg_vocab_size]
        
        return prediction, hidden, cell

#### Seq2Seq

In [20]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert self.encoder.hidden_size == self.decoder.hidden_size
        assert self.encoder.n_layers == self.decoder.n_layers
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
#         src => [src_seq_len, batch_size]
#         trg => [trg_seq_len, batch_size]
        batch_size = trg.shape[1]
        trg_seq_len = trg.shape[0]
        trg_vocab_size = self.decoder.trg_vocab_size
        
        outputs = torch.zeros(trg_seq_len, batch_size, trg_vocab_size).to(self.device)
        # outputs => [trg_len, batch_size, trg_vocab_size]
        
        hidden, cell = self.encoder(src) 
#         hidden => [seq_len, batch_size, hidden_size]
#         cell => [seq_len, batch_size, hidden_size]
        input = trg[0,:] # 每条句子中的第一个符号索引 input => [batch_size]
        
        for t in range(1, trg_seq_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
#             output => [batch_size, trg_vocab_size]
#             hidden => [n_layers, batch_size, hidden_size]
#             cell => [n_layers, batch_size, hidden_size]
            outputs[t] = output 
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1
            
        return outputs

### 训练模型

In [21]:
SRC_VOCAB_SIZE = len(SRC.vocab)
TRG_VOCAB_SIZE = len(TRG.vocab)
ENC_EMBEDDING_SIZE = 256
DEC_EMBEDDING_SIZE = 256
HIDDEN_SIZE = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

encoder = Encoder(ENC_EMBEDDING_SIZE, 
                  SRC_VOCAB_SIZE, 
                  HIDDEN_SIZE, 
                  N_LAYERS, 
                  ENC_DROPOUT)
decoder = Decoder(DEC_EMBEDDING_SIZE,
                  HIDDEN_SIZE,
                  TRG_VOCAB_SIZE,
                  N_LAYERS,
                  DEC_DROPOUT)
model = Seq2Seq(encoder, decoder, device).to(device)

In [22]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (lstm): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedd): Embedding(5893, 256)
    (lstm): LSTM(256, 512, num_layers=2, dropout=0.5)
    (linear): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [23]:
def count_parameters(m):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"The model has {count_parameters(model)} trainable_parameters.")

The model has 13898501 trainable_parameters.


#### 构建优化器

In [24]:
LEARNING_RATE = 0.005
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

#### 构建损失函数

In [25]:
TRG.pad_token

'<pad>'

In [26]:
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
loss_fn = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

#### 可视化网络架构

In [35]:
from torchviz import make_dot

batch = next(iter(train_iterator))
src, trg = batch.src, batch.trg
outputs = model(src, trg)
g = make_dot(outputs, params=dict(list(model.named_parameters())))  # 实例化 make_dot
g.view() # 可视化网络结构
print("模型结构保存完成...")

模型结构保存完成...


#### 训练函数

In [29]:
def train(model, iterator, optimizer, loss_fn, clip):
    model.train() # 将模型调到训练模式
    
    epoch_loss = 0
    for i, batch in enumerate(iterator):
        src, trg = batch.src, batch.trg
        optimizer.zero_grad() # 清空优化器的梯度信息
        outputs = model(src, trg)
        # outputs => [trg_len, batch_size, trg_vocab_size]
        # trg => [trg_len, batch_size]
        trg_vocab_size = outputs.shape[-1]
        loss = loss_fn(outputs[1:].view(-1, trg_vocab_size), trg[1:].view(-1))
        loss.backward() # 反向传播
        torch.nn.utils.clip_grad_norm(model.parameters(), clip) # 梯度裁剪
        optimizer.step() # 更新参数
        epoch_loss += loss.item() # loss => scalar
        
    return epoch_loss / len(iterator) 

#### 评估函数

In [30]:
def evaluate(model, iterator, loss_fn):
    model.eval() # 将模型调到测试模式
    epoch_loss = 0
    for i, batch in enumerate(iterator):
        src, trg = batch.src, batch.trg
        outputs = model(src, trg, 0)
        trg_vocab_size = outputs.shape[-1]
        loss = loss_fn(outputs[1:].view(-1, trg_vocab_size), trg[1:].view(-1))
        epoch_loss += loss.item()
    
    return epoch_loss / len(iterator)

#### compute every epoch time

In [31]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

#### 训练循环

In [32]:
EPOCHS = 10
CLIP = 1

best_valid_loss = float("inf") # 默认正无穷
for epoch in range(EPOCHS):
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, loss_fn, CLIP)
    valid_loss = evaluate(model, valid_iterator, loss_fn)
    
    end_time = time.time()
    
    elapsed_mins, elapsed_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-model.pth')
    
    print(f'Epoch: {epoch+1:02} | time: {elapsed_mins}min,{elapsed_secs}sec.')
    print(f'\t Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t   Val Loss: {valid_loss:.3f} |   Val PPL: {math.exp(valid_loss):7.3f}')

  torch.nn.utils.clip_grad_norm(model.parameters(), clip) # 梯度裁剪


Epoch: 01 | time: 0min,35sec.
	 Train Loss: 4.792 | Train PPL: 120.489
	   Val Loss: 5.071 |   Val PPL: 159.323
Epoch: 02 | time: 0min,35sec.
	 Train Loss: 4.330 | Train PPL:  75.967
	   Val Loss: 5.058 |   Val PPL: 157.280
Epoch: 03 | time: 0min,36sec.
	 Train Loss: 4.235 | Train PPL:  69.031
	   Val Loss: 5.056 |   Val PPL: 157.039
Epoch: 04 | time: 0min,36sec.
	 Train Loss: 4.114 | Train PPL:  61.220
	   Val Loss: 4.833 |   Val PPL: 125.528
Epoch: 05 | time: 0min,37sec.
	 Train Loss: 3.963 | Train PPL:  52.612
	   Val Loss: 4.681 |   Val PPL: 107.923
Epoch: 06 | time: 0min,37sec.
	 Train Loss: 3.798 | Train PPL:  44.610
	   Val Loss: 4.559 |   Val PPL:  95.470
Epoch: 07 | time: 0min,37sec.
	 Train Loss: 3.685 | Train PPL:  39.863
	   Val Loss: 4.422 |   Val PPL:  83.229
Epoch: 08 | time: 0min,37sec.
	 Train Loss: 3.616 | Train PPL:  37.181
	   Val Loss: 4.448 |   Val PPL:  85.414
Epoch: 09 | time: 0min,37sec.
	 Train Loss: 3.548 | Train PPL:  34.734
	   Val Loss: 4.437 |   Val PPL: 

#### 测试模型

In [33]:
model.load_state_dict(torch.load('best-model.pth'))
test_loss = evaluate(model, test_iterator, loss_fn)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 4.452 | Test PPL:  85.792 |
