# Tradução Automática

Script que realiza a tradução de uma sentença do português para o inglês por meio de um modelo Seq2Seq.

Inspiração: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import random

Criação do corpus paralelo e customizado para tradução automática

In [None]:
class CustomDataset(Dataset):
    """Parallel corpus that yields fixed-length index tensors for sentence pairs.

    Every sentence is encoded as ``<sos> w1 ... wn <eos> <pad> ... <pad>`` and
    always has exactly ``max_len`` entries. Sentences with more than
    ``max_len - 2`` words are truncated so both markers are kept and every
    item has the same length (required for the default batch collation).

    Args:
        source_sentences: source-language sentences, tokenised on whitespace.
        target_sentences: translations aligned index-by-index with
            ``source_sentences``.
        source_vocab: word -> index mapping containing the keys
            ``'<pad>'``, ``'<sos>'``, ``'<eos>'`` and ``'<unk>'``.
        target_vocab: same kind of mapping for the target language.
        max_len: length of every returned tensor (at least 2).

    Raises:
        ValueError: if ``max_len`` is smaller than 2 or the two sentence
            collections do not have the same length.
    """

    def __init__(self, source_sentences, target_sentences, source_vocab, target_vocab,max_len):
        if max_len < 2:
            raise ValueError("max_len must be at least 2 to hold <sos> and <eos>")
        if len(source_sentences) != len(target_sentences):
            raise ValueError("source_sentences and target_sentences must have the same length")

        self.source_sentences = source_sentences
        self.target_sentences = target_sentences
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.source_sentences)

    def __getitem__(self, index):
        """Return the ``(source, target)`` pair at ``index`` as LongTensors of length ``max_len``."""
        source_tensor = self._encode(self.source_sentences[index], self.source_vocab)
        target_tensor = self._encode(self.target_sentences[index], self.target_vocab)
        return source_tensor, target_tensor

    def _encode(self, sentence, vocab):
        """Convert one sentence into a padded ``<sos> ... <eos>`` index tensor."""
        unk_index = vocab['<unk>']

        # Reserve two positions for the <sos>/<eos> markers.
        words = sentence.split()[:self.max_len - 2]

        indices = [vocab['<sos>']]
        indices.extend(vocab.get(word, unk_index) for word in words)
        indices.append(vocab['<eos>'])

        # Padding goes after <eos> so the real tokens stay contiguous.
        indices.extend([vocab['<pad>']] * (self.max_len - len(indices)))

        return torch.LongTensor(indices)

In [None]:
class Encoder(nn.Module):
    """Embeds a source sequence and runs it through an LSTM.

    Args:
        input_dim: number of entries in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden and cell states.
    """

    def __init__(self, input_dim,embedding_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim)

    def forward(self, x):
        """Return the LSTM's final ``(hidden, cell)`` state for ``x``.

        ``x`` holds token indices laid out as (seq_length, N), N being the
        batch size; the per-step LSTM outputs are discarded.
        """
        # (seq_length, N) -> (seq_length, N, embedding_dim)
        embedded = self.embedding(x)

        # Only the final state is needed as the context for the decoder.
        _, final_state = self.lstm(embedded)
        h_n, c_n = final_state

        return h_n, c_n

In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder followed by a linear projection.

    Args:
        input_dim: number of entries in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden and cell states.
        output_dim: number of scores produced per input token.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x, hidden, cell):
        """Decode one time step.

        ``x`` carries one token index per batch element (shape [batch_size]);
        ``hidden`` and ``cell`` are the LSTM state from the previous step.
        Returns ``(predictions, hidden, cell)`` where ``predictions`` has the
        leading time dimension removed again.
        """
        # Add a leading time dimension of length 1: [batch_size] -> [1, batch_size]
        step_input = torch.unsqueeze(x, 0)

        # (1, batch_size) -> (1, batch_size, embedding_dim)
        embedded = self.embedding(step_input)

        # (1, batch_size, embedding_dim) -> (1, batch_size, hidden_dim)
        lstm_out, next_state = self.lstm(embedded, (hidden, cell))
        hidden, cell = next_state

        # (1, batch_size, hidden_dim) -> (1, batch_size, output_dim)
        scores = self.fc(lstm_out)

        # Drop the time dimension that was added above.
        return torch.squeeze(scores, 0), hidden, cell

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step.

    Args:
        encoder: module mapping a (seq_len, batch) index tensor to the
            ``(hidden, cell)`` state that seeds the decoder.
        decoder: module called as ``decoder(x, hidden, cell)`` returning
            ``(scores, hidden, cell)``; it must expose its output layer as
            ``fc`` so the number of output classes can be read from it.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Run the encoder once and the decoder for ``target_len - 1`` steps.

        Args:
            source: (source_len, batch) tensor of source token indices.
            target: (target_len, batch) tensor of target token indices;
                ``target[0]`` is fed to the decoder as its first input.
            teacher_force_ratio: probability, per step, of feeding the
                ground-truth token ``target[t]`` as the next decoder input
                instead of the decoder's own best guess. ``1.0`` always uses
                the ground truth, ``0.0`` never does.

        Returns:
            (target_len, batch, vocab_size) tensor of decoder scores. Row 0 is
            left at zero because no prediction is made for the first token.
        """
        target_len, batch_size = target.shape[0], source.shape[1]

        # Read the vocabulary size from the decoder itself instead of relying
        # on a module-level variable being defined.
        target_vocab_size = self.decoder.fc.out_features

        # Allocate on the inputs' device so the model also works off-CPU.
        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )

        # The encoder's final state is the context vector for the decoder.
        hidden, cell = self.encoder(source)

        x = target[0]

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)

            # Scores for the token at position t.
            outputs[t] = output

            # Highest-scoring token according to the decoder.
            next_word = output.argmax(1)

            # Teacher forcing: sometimes feed the ground truth, otherwise let
            # the decoder consume its own prediction.
            if random.random() < teacher_force_ratio:
                x = target[t]
            else:
                x = next_word

        return outputs

Definição dos dados de treinamento

In [None]:
# Parallel corpus: each entry is (Portuguese sentence, English translation).
training_data = [
    (
        "eu gosto de python",
        "i like python too",
    ),
    (
        "eu nao sei",
        "i do not know",
    ),
    (
        "eu estou sem palavras",
        "i am speechless",
    ),
    (
        "sou eu quem faz o almoco",
        "it is me who cooks the lunch",
    ),
    (
        "eu posso montar o armario",
        "i can build the shelve",
    ),
    (
        "voce nao sabe como falar isso em ingles",
        "you do not know how to say that in english",
    ),
]

Criação do vocabulário como dicionários

In [None]:
# Both vocabularies start with the same four special tokens at ids 0-3.
source_vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}
target_vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}

# Give every new word the next free id, in order of first appearance.
for pt_br, eng in training_data:
    for word in pt_br.split():
        source_vocab.setdefault(word, len(source_vocab))
    for word in eng.split():
        target_vocab.setdefault(word, len(target_vocab))

# Reverse lookups (id -> word), used to turn predictions back into text.
source_idx2word = {index: word for word, index in source_vocab.items()}
target_idx2word = {index: word for word, index in target_vocab.items()}

Definição dos hiperparâmetros

In [None]:
# Sizes derived from the vocabularies.
input_dim_encoder, input_dim_decoder = len(source_vocab), len(target_vocab)
output_dim = len(target_vocab)

# Model sizes.
embedding_dim = 128
hidden_dim = 256

# Optimisation and data settings.
# NOTE(review): the logged loss only moves from ~3.45 to ~3.40 over the first
# epochs with this learning rate - confirm whether a larger value is intended.
learning_rate = 1e-5
num_epochs = 300
batch_size = 2
max_sentence_len = 16

Criar o dataloader a partir dos dados de treinamento

In [None]:
# Split the (source, target) pairs into two aligned lists for the dataset.
dataset = CustomDataset(
    source_sentences=[pt_br for pt_br, _ in training_data],
    target_sentences=[eng for _, eng in training_data],
    source_vocab=source_vocab,
    target_vocab=target_vocab,
    max_len=max_sentence_len,
)
train_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Build the two halves of the network and wire them into the Seq2Seq wrapper.
encoder = Encoder(
    input_dim=input_dim_encoder,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
)
decoder = Decoder(
    input_dim=input_dim_decoder,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)
model = Seq2Seq(encoder=encoder, decoder=decoder)

In [None]:
# Targets equal to 0 (the padding index) do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

Processo de treinamento

In [None]:
# Training loop: one pass over train_loader per epoch, printing the mean batch loss.
for epoch in range(num_epochs):

    total_loss = 0

    # Put the model in training mode.
    model.train()

    for inp_data, target in train_loader:

        # The DataLoader stacks the sentences into (batch, seq_len) tensors,
        # while the model indexes dim 0 as time and dim 1 as batch (nn.LSTM is
        # used with its default batch_first=False). Transpose so every
        # sentence is decoded over its full length.
        inp_data = inp_data.t()
        target = target.t()

        # Reset the gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass: scores for every position of every sentence in the batch.
        output = model(inp_data, target)

        # The output is (target_len, batch_size, output_dim), but the cross
        # entropy loss expects (positions, classes) scores and (positions,)
        # targets. Position 0 is dropped because the model makes no prediction
        # for the first token.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        loss = criterion(output, target)

        loss.backward()

        optimizer.step()

        total_loss += loss.item()

    print(f"[Epoch {epoch} / {num_epochs}] ------------------------------- {total_loss/len(train_loader):.4f}")

[Epoch 0 / 300] ------------------------------- 3.4528
[Epoch 1 / 300] ------------------------------- 3.4444
[Epoch 2 / 300] ------------------------------- 3.4417
[Epoch 3 / 300] ------------------------------- 3.4434
[Epoch 4 / 300] ------------------------------- 3.4409
[Epoch 5 / 300] ------------------------------- 3.4394
[Epoch 6 / 300] ------------------------------- 3.4354
[Epoch 7 / 300] ------------------------------- 3.4278
[Epoch 8 / 300] ------------------------------- 3.4239
[Epoch 9 / 300] ------------------------------- 3.4305
[Epoch 10 / 300] ------------------------------- 3.4137
[Epoch 11 / 300] ------------------------------- 3.4144
[Epoch 12 / 300] ------------------------------- 3.4226
[Epoch 13 / 300] ------------------------------- 3.4096
[Epoch 14 / 300] ------------------------------- 3.4112
[Epoch 15 / 300] ------------------------------- 3.4001
[Epoch 16 / 300] ------------------------------- 3.4054
[Epoch 17 / 300] ------------------------------- 3.3973
[E

In [None]:
# Greedy translation of a few held-out sentences with the trained model.
test_data = [
    ('voce nao pode falar nada', "you can not say anything"),
    ('existe um almoco no armario',"there is a meal in the shelve")
]

dataset = CustomDataset([pair[0] for pair in test_data], [pair[1] for pair in test_data], source_vocab, target_vocab,max_sentence_len)
# No shuffling: translations are printed in the same order as test_data.
test_loader = DataLoader(dataset, batch_size=2, shuffle=False)

sos_index = target_vocab['<sos>']
eos_index = target_vocab['<eos>']

# Evaluation mode, and no gradient tracking while decoding. torch.no_grad()
# only has an effect when used as a context manager.
model.eval()

with torch.no_grad():
    for test_sentences, _ in test_loader:

        # test_sentences is (batch, seq_len); translate one sentence at a time.
        for sentence in test_sentences:

            # The encoder expects (seq_len, batch): add a batch dimension of 1.
            hidden, cell = encoder(sentence.unsqueeze(1))

            # Decoding starts from the target-side <sos> token.
            x = torch.tensor([sos_index], dtype=torch.long, device=sentence.device)

            translated_sentence = []

            for _step in range(1, max_sentence_len):

                output, hidden, cell = decoder(x, hidden, cell)

                # Greedy choice: feed the best-scoring word back as next input.
                x = output.argmax(1)
                token = x.item()

                # Stop as soon as the model closes the sentence.
                if token == eos_index:
                    break

                translated_sentence.append(target_idx2word[token])

            print(' '.join(translated_sentence))

<sos><sos><sos><sos><sos>the<sos><sos>cooks<sos>cooks<sos>cooks<sos>cooks
<eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos><eos>
