# Importações

In [31]:
!pip install numpy==1.23.5 d2l==1.0.3 --quiet

In [32]:
import os
from d2l import torch as d2l
import torch
from torch import nn

# Download e Leitura dos dados

In [33]:
def read_data_nmt():
    """Load the English-French dataset (Tatoeba Project) as a single string.

    Registers the ``'fra-eng'`` entry in ``d2l.DATA_HUB`` (archive URL plus
    the accompanying hex string, presumably a checksum), lets
    ``d2l.download_extract`` fetch/unpack it, and reads ``fra.txt`` from the
    returned directory.

    Returns:
        str: the full contents of ``fra.txt`` decoded as UTF-8.
    """
    archive_url = d2l.DATA_URL + 'fra-eng.zip'
    archive_hash = '94646ad1522d915e7b0f9296181140edcf86a4f5'
    d2l.DATA_HUB['fra-eng'] = (archive_url, archive_hash)

    corpus_path = os.path.join(d2l.download_extract('fra-eng'), 'fra.txt')
    with open(corpus_path, 'r', encoding='utf-8') as corpus_file:
        raw_text = corpus_file.read()
    return raw_text

# Pré-processamento

In [34]:
def preprocess_nmt(text):
    """Normalize raw corpus text before tokenization.

    Steps, in order:
      1. Replace narrow no-break spaces (U+202F) and no-break spaces (U+00A0)
         with regular spaces.
      2. Lowercase everything.
      3. Insert a space in front of each of ``, . ! ?`` whenever the preceding
         character is not already a space, so that a later split on ``' '``
         yields the punctuation mark as its own token.

    Args:
        text (str): raw text.

    Returns:
        str: the normalized text. The first character is never prefixed.
    """
    # Built once per call instead of once per character.
    punctuation = frozenset(',.!?')

    text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()
    out = [' ' + char
           if i > 0 and char in punctuation and text[i - 1] != ' '
           else char
           for i, char in enumerate(text)]
    return ''.join(out)

# Tokenização

In [35]:
def tokenize_nmt(text, num_examples=None):
    """Split tab-separated sentence pairs into source/target token lists.

    Each line of ``text`` is expected to look like ``"<source>\\t<target>"``.
    Lines that do not contain exactly two tab-separated fields are skipped.
    Sentences are tokenized by splitting on a single space.

    Args:
        text (str): newline-separated corpus.
        num_examples (int | None): maximum number of *lines* to inspect,
            counted from the start of ``text``; skipped lines still count
            toward this limit. ``None`` means no limit. ``0`` yields no
            examples.

    Returns:
        tuple[list[list[str]], list[list[str]]]: ``(source, target)``, two
        parallel lists of token lists.
    """
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        # Compare against None explicitly so that num_examples=0 means
        # "zero lines" rather than being treated as "no limit".
        if num_examples is not None and i >= num_examples:
            break
        parts = line.split('\t')
        if len(parts) == 2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))
    return source, target

# Truncar ou preencher

In [36]:
def truncate_pad(line, num_steps, padding_token):
    """Return ``line`` cut or right-padded so it has ``num_steps`` items.

    Args:
        line (list): sequence to adjust (list concatenation is used for
            padding).
        num_steps (int): target length.
        padding_token: value appended when ``line`` is too short.

    Returns:
        list: a new list; the first ``num_steps`` items of ``line`` when it is
        too long, otherwise ``line`` followed by the required padding.
    """
    missing = num_steps - len(line)
    if missing >= 0:
        # Short enough (or exact): append `missing` padding tokens.
        return line + [padding_token] * missing
    return line[:num_steps]

# Transformar para array

In [37]:
def build_array_nmt(lines, vocab, num_steps):
    """Convert token sequences into a fixed-width index tensor.

    Every sequence is mapped through ``vocab``, gets the ``'<eos>'`` index
    appended, and is then truncated or padded with the ``'<pad>'`` index to
    ``num_steps`` entries via ``truncate_pad``. Because ``'<eos>'`` is added
    before truncation, sequences that get cut lose it.

    Args:
        lines: iterable of token sequences; ``vocab[sequence]`` must return a
            list of indices (presumably a ``d2l.Vocab`` -- confirm at caller).
        vocab: mapping supporting ``vocab['<eos>']`` and ``vocab['<pad>']``.
        num_steps (int): width of every row in the resulting tensor.

    Returns:
        tuple: ``(array, valid_len)`` where ``array`` is the index tensor with
        one row per sequence and ``valid_len`` holds, per row, the number of
        entries that differ from the padding index.
    """
    eos_id, pad_id = vocab['<eos>'], vocab['<pad>']

    rows = []
    for tokens in lines:
        ids = vocab[tokens] + [eos_id]
        rows.append(truncate_pad(ids, num_steps, pad_id))

    array = torch.tensor(rows)
    # Count non-padding positions along each row.
    valid_len = (array != pad_id).to(torch.int32).sum(dim=1)
    return array, valid_len

# Carregamento do dataset

In [38]:
def load_data_nmt(batch_size, num_steps, num_examples=600):
    """Build the translation data iterator and both vocabularies.

    Pipeline: ``read_data_nmt`` -> ``preprocess_nmt`` -> ``tokenize_nmt``,
    then one ``d2l.Vocab`` per language (tokens seen fewer than 2 times are
    dropped via ``min_freq=2``; ``<pad>``, ``<bos>``, ``<eos>`` are reserved)
    and one ``build_array_nmt`` call per language.

    Args:
        batch_size (int): forwarded to ``d2l.load_array``.
        num_steps (int): fixed sequence length passed to ``build_array_nmt``.
        num_examples (int): forwarded to ``tokenize_nmt``.

    Returns:
        tuple: ``(data_iter, src_vocab, tgt_vocab)``. ``data_iter`` is built
        from ``(src_array, src_valid_len, tgt_array, tgt_valid_len)``.
    """
    text = preprocess_nmt(read_data_nmt())
    source, target = tokenize_nmt(text, num_examples)

    vocabs, arrays = [], []
    for sentences in (source, target):
        vocab = d2l.Vocab(sentences, min_freq=2,
                          reserved_tokens=['<pad>', '<bos>', '<eos>'])
        # Contributes (array, valid_len) for this language, in that order.
        arrays.extend(build_array_nmt(sentences, vocab, num_steps))
        vocabs.append(vocab)

    src_vocab, tgt_vocab = vocabs
    data_iter = d2l.load_array(tuple(arrays), batch_size)
    return data_iter, src_vocab, tgt_vocab

# Teste para o exercício 9.5.7 - 1

In [39]:
# Globals reused by the model-construction and training cells further down:
# `device`, `train_iter`, `src_vocab` and `tgt_vocab`.
batch_size, num_steps = 32, 10
device = d2l.try_gpu()
# num_examples is not passed here, so load_data_nmt's default (600) applies.
train_iter, src_vocab, tgt_vocab = load_data_nmt(batch_size, num_steps)

# Exercise 9.5.7-1: report how the source/target vocabulary sizes change with
# the num_examples argument. The returned iterator is discarded; only the
# vocabularies are inspected.
for n in [100, 1000, 5000]:
    _, src_vocab_n, tgt_vocab_n = load_data_nmt(batch_size=2, num_steps=10, num_examples=n)
    print(f"{n} exemplos → vocabulário origem: {len(src_vocab_n)}, destino: {len(tgt_vocab_n)}")

100 exemplos → vocabulário origem: 40, destino: 39
1000 exemplos → vocabulário origem: 266, destino: 321
5000 exemplos → vocabulário origem: 875, destino: 1231


# Modelo Encoder-Decoder

In [40]:
class Seq2SeqEncoder(nn.Module):
    """Embedding layer followed by a batch-first (multi-layer) GRU.

    Args:
        vocab_size (int): number of rows in the embedding table.
        embed_size (int): embedding dimension (GRU input size).
        num_hiddens (int): GRU hidden size.
        num_layers (int): number of stacked GRU layers.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embed_size)
        self.rnn = nn.GRU(input_size=embed_size, hidden_size=num_hiddens,
                          num_layers=num_layers, batch_first=True)

    def forward(self, X):
        """Encode a batch of token indices.

        Args:
            X: integer index tensor; since the GRU is batch-first it is
                expected to be laid out as (batch, steps).

        Returns:
            tuple: ``(output, state)`` exactly as returned by the GRU -- the
            per-step outputs of the last layer and the final hidden state of
            every layer.
        """
        # nn.GRU already returns the (output, final_state) pair.
        return self.rnn(self.embedding(X))

class Seq2SeqDecoder(nn.Module):
    """Embedding -> batch-first GRU -> linear projection onto the vocabulary.

    Args:
        vocab_size (int): embedding table size and width of the output layer.
        embed_size (int): embedding dimension (GRU input size).
        num_hiddens (int): GRU hidden size.
        num_layers (int): number of stacked GRU layers.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embed_size)
        self.rnn = nn.GRU(input_size=embed_size, hidden_size=num_hiddens,
                          num_layers=num_layers, batch_first=True)
        self.dense = nn.Linear(in_features=num_hiddens,
                               out_features=vocab_size)

    def forward(self, X, state):
        """Run the decoder over ``X`` starting from ``state``.

        Args:
            X: integer index tensor of decoder inputs (batch-first layout
                expected by the GRU).
            state: initial GRU hidden state.

        Returns:
            tuple: ``(scores, new_state)`` -- the linear layer applied to
            every GRU output step, and the GRU's final hidden state.
        """
        hidden_seq, new_state = self.rnn(self.embedding(X), state)
        scores = self.dense(hidden_seq)
        return scores, new_state

class EncoderDecoder(nn.Module):
    """Glue module: feeds the encoder's state into the decoder.

    Args:
        encoder: module whose call returns an ``(outputs, state)`` pair.
        decoder: module called as ``decoder(dec_X, state)`` and returning an
            ``(outputs, state)`` pair.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder

    def forward(self, enc_X, dec_X):
        """Return the decoder outputs for ``dec_X`` conditioned on ``enc_X``.

        Only the encoder's state is passed on; the encoder's outputs and the
        decoder's final state are discarded.
        """
        _enc_outputs, context_state = self.encoder(enc_X)
        decoded, _dec_state = self.decoder(dec_X, context_state)
        return decoded

# Inicialização e Treino

In [41]:
# Model hyperparameters: embedding width, GRU hidden width, GRU depth.
embed_size, num_hiddens, num_layers = 32, 32, 2
encoder = Seq2SeqEncoder(len(src_vocab), embed_size, num_hiddens, num_layers)
decoder = Seq2SeqDecoder(len(tgt_vocab), embed_size, num_hiddens, num_layers)
model = EncoderDecoder(encoder, decoder).to(device)

def init_weights(m):
    """Xavier-uniform initialize the weight of Linear and Embedding modules.

    Intended for ``nn.Module.apply``; every other module type (including the
    GRUs) is left with its default initialization.
    """
    # isinstance is the idiomatic type test (comparing type(...) with == is
    # flagged by linters) and matches the same modules in this model.
    if isinstance(m, (nn.Linear, nn.Embedding)):
        nn.init.xavier_uniform_(m.weight)
model.apply(init_weights)

# Positions whose target is the '<pad>' index do not contribute to the loss.
loss = nn.CrossEntropyLoss(ignore_index=tgt_vocab['<pad>'])
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

# Loop de treinamento

In [42]:
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    # Running sum of the per-batch losses for this epoch (not an average).
    total_loss = 0
    for X, X_valid_len, Y, Y_valid_len in train_iter:
        # The valid-length tensors are unpacked but not used in this loop.
        X, Y = X.to(device), Y.to(device)

        # Teacher forcing: the decoder input is the target shifted right by
        # one position with a '<bos>' column prepended.
        bos = torch.full((Y.shape[0], 1), tgt_vocab['<bos>'],
                         dtype=torch.long, device=device)
        dec_input = torch.cat((bos, Y[:, :-1]), dim=1)

        optimizer.zero_grad()
        pred = model(X, dec_input)
        # Collapse batch and time so each position is one classification.
        l = loss(pred.reshape(-1, pred.size(-1)), Y.flatten())
        l.backward()
        optimizer.step()

        total_loss += l.item()
    print(f'Epoch {epoch + 1}, Loss {total_loss:.3f}')


Epoch 1, Loss 80.262
Epoch 2, Loss 58.052
Epoch 3, Loss 54.556
Epoch 4, Loss 51.738
Epoch 5, Loss 49.206
Epoch 6, Loss 46.916
Epoch 7, Loss 44.865
Epoch 8, Loss 43.304
Epoch 9, Loss 41.819
Epoch 10, Loss 40.591
