In [61]:
import os
os.environ['CUDA_VISIBLE_DEVICES']='0'
"""
https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html"""
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.optim import AdamW

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [62]:
from torchtext.datasets import Multi30k
# Training split of Multi30k requested as ('de', 'en') language pairs.
# NOTE(review): despite its name this holds whatever `Multi30k(...)` returns,
# not a `torch.utils.data.DataLoader` — confirm before batching from it.
train_dataloader = Multi30k(split='train', language_pair=('de','en'))

In [63]:
# Materialize the whole training split in memory so it can be unzipped below.
train_data = list(train_dataloader)

In [64]:
# Transpose the list of pairs into two parallel tuples, one per language.
de,en = zip(*train_data)

In [65]:
class Tokenizer:
    """Whitespace tokenizer with a vocabulary built from a corpus of sentences.

    The vocabulary starts with the special tokens ``'<EOS>'`` (index 0) and
    ``'<SOS>'`` (index 1), followed by every distinct whitespace-separated
    word in ``sentences`` in order of first appearance.

    Args:
        sentences: Iterable of strings used to build the vocabulary.
        max_len: Maximum sequence length. It is only stored on the instance
            (``self.max_len``); nothing in this class enforces it.

    Attributes:
        i_to_t: Mapping from token index to token string.
        t_to_i: Mapping from token string to token index (inverse of ``i_to_t``).
        vocab_size: Number of distinct tokens, special tokens included.
    """

    def __init__(self, sentences, max_len):
        self.max_len = max_len

        # A dict keeps insertion order and drops repeats, so each word gets
        # exactly one index and the two lookup tables stay inverse to each other.
        vocab = dict.fromkeys(['<EOS>', '<SOS>'])
        for sentence in sentences:
            vocab.update(dict.fromkeys(sentence.strip().split()))

        self.i_to_t = dict(enumerate(vocab))
        self.t_to_i = {token: index for index, token in self.i_to_t.items()}
        self.vocab_size = len(self.i_to_t)

In [66]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Download the English-French configuration of the OPUS Books corpus.
books_raw = load_dataset("opus_books", "en-fr")
# Hold out 20% of the "train" split for testing. The seed is fixed so the split
# is the same on every Restart & Run All, and the raw dataset keeps its own name
# so the split is always derived from the untouched data.
books = books_raw["train"].train_test_split(test_size=0.2, seed=42)
# Only the pretrained tokenizer of this checkpoint is loaded here.
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [67]:
# Keys of the per-example "translation" dict to read the two sides from.
source_lang = "en"
target_lang = "fr"
# A T5-style task prefix is intentionally not applied:
# prefix = "translate English to French: "


def preprocess_function(examples):
    """Tokenize one batch of translation pairs.

    Args:
        examples: Batch mapping whose ``"translation"`` entry is a sequence of
            dicts keyed by language code.

    Returns:
        Whatever the module-level ``tokenizer`` returns when given the source
        sentences as inputs and the target sentences as ``text_target``, with
        both sides truncated to 64 tokens.
    """
    translation_pairs = examples["translation"]
    source_texts = []
    target_texts = []
    for pair in translation_pairs:
        source_texts.append(pair[source_lang])
        target_texts.append(pair[target_lang])
    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=64,
        truncation=True,
    )
    
# Apply the tokenization to the split dataset, passing examples in batches
# (`batched=True`) so `preprocess_function` receives lists of translations.
tokenized_books = books.map(preprocess_function, batched=True)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 101668/101668 [00:08<00:00, 12609.07 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25417/25417 [00:02<00:00, 12351.27 examples/s]


In [68]:
# Inspect the tokenized training split (its columns and row count).
tokenized_books['train']

Dataset({
    features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 101668
})

In [69]:
# `tokenized_books` is keyed by split name, so a column has to be read from a
# specific split. Slice a few rows first instead of dumping the whole column.
tokenized_books['train'][:3]['translation']

In [None]:
class EncoderRNN(nn.Module):
    """Embedding + dropout + single GRU encoder operating on batch-first input.

    Args:
        input_size: Number of rows in the embedding table (source vocabulary size).
        hidden_size: Width of both the embeddings and the GRU state.
        dropout_p: Dropout probability applied to the embedded tokens.
    """

    def __init__(self, input_size, hidden_size, dropout_p):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, x):
        """Encode token indices ``x``.

        Returns:
            The ``(output, hidden)`` pair produced by the GRU for the
            dropout-regularized embeddings of ``x``.
        """
        token_vectors = self.embedding(x)
        return self.gru(self.dropout(token_vectors))

In [None]:
class DecoderRNN(nn.Module):
    """GRU decoder that produces unnormalized vocabulary scores step by step.

    Decoding always starts from the notebook-level ``SOS_token``. When no
    target is supplied, the notebook-level ``MAX_LENGTH`` bounds the number of
    generated steps.

    Args:
        hidden_size: Width of the embeddings and of the GRU state.
        output_size: Target vocabulary size (rows of the embedding table and
            width of the output projection).
    """

    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Unroll the decoder.

        Args:
            encoder_outputs: Encoder outputs; only the batch size (dim 0) and
                the device are read from it.
            encoder_hidden: Initial GRU hidden state.
            target_tensor: Optional ``(batch, steps)`` tensor of target token
                indices. When given, teacher forcing is used and exactly
                ``steps`` decoding steps are run; otherwise the decoder feeds
                back its own greedy predictions for ``MAX_LENGTH`` steps.

        Returns:
            ``(decoder_outputs, decoder_hidden, None)`` where
            ``decoder_outputs`` stacks the per-step logits along dim 1. The
            trailing ``None`` stands in for attention weights so callers can
            unpack three values.
        """
        batch_size = encoder_outputs.size(0)
        # Build the start tokens on the same device as the encoder activations
        # rather than on a notebook-global device that may not match them.
        decoder_input = torch.full(
            (batch_size, 1), SOS_token, dtype=torch.long, device=encoder_outputs.device
        )
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        # With teacher forcing the target dictates the number of steps, which
        # keeps the logits aligned with the labels and avoids indexing past
        # the end of a target shorter than MAX_LENGTH.
        num_steps = MAX_LENGTH if target_tensor is None else target_tensor.size(1)

        for i in range(num_steps):
            decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: feed the ground-truth token as the next input.
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Greedy decoding: feed back the highest-scoring token,
                # detached so no gradient flows through the choice.
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        # Raw logits are returned (no log_softmax applied here).
        return decoder_outputs, decoder_hidden, None

    def forward_step(self, x, hidden):
        """Run one decoding step: embed, ReLU, GRU, then project to vocabulary logits."""
        output = self.embedding(x)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [None]:
# Keep the CPU fallback instead of hard-coding CUDA, so this cell also runs on
# machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): `input_lang` / `output_lang` are not defined in the visible
# cells; they are expected to expose the vocabulary size as `.n_words`.
input_size = input_lang.n_words
hidden_size = 64
dropout=0.0
output_size = output_lang.n_words
encoder = EncoderRNN(input_size, hidden_size,dropout_p=dropout).to(device)
decoder = DecoderRNN(hidden_size, output_size).to(device)

lr = 1e-2
# Optimize the encoder AND the decoder; passing only the encoder's parameters
# would leave the decoder at its random initialization.
# NOTE: the name `optim` shadows the `torch.optim` module imported at the top.
optim = AdamW(params=list(encoder.parameters()) + list(decoder.parameters()),lr=lr)

In [None]:
# Hand-written SGD over one epoch with teacher forcing.
# NOTE(review): `train_dataloader` must yield batches of (X, y) index tensors
# here; confirm it is not the raw text dataset bound to the same name earlier.

# There is no single `model` object in this notebook: the trainable weights
# live in the encoder and the decoder, so collect both.
trainable_params = [p for module in (encoder, decoder) for p in module.parameters()]

step = 0
for epoch in range(1):
    for X,y in train_dataloader:
        step +=1
        X = X.to(device)
        y = y.to(device)
        enc_outputs, enc_hidden = encoder(X)
        decoder_outputs, decoder_hidden, _ = decoder(enc_outputs, enc_hidden, target_tensor=y)
        # Flatten (batch, steps, vocab) logits and (batch, steps) labels so
        # cross-entropy is computed per token.
        loss = F.cross_entropy(decoder_outputs.reshape(-1, decoder_outputs.shape[-1]), y.reshape(-1))
        if step % 100 == 0:
            print(f'e{epoch}:s{step} | loss:{loss:.3f}')

        # Clear stale gradients; `.grad` must be a Tensor or None, never a float.
        for param in trainable_params:
            param.grad = None

        loss.backward()

        # Plain SGD step, done under no_grad so the update itself is not tracked.
        with torch.no_grad():
            for param in trainable_params:
                if param.grad is not None:
                    param -= lr * param.grad


In [134]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    """Greedily translate a single sentence.

    Args:
        encoder: Callable mapping the input tensor to ``(outputs, hidden)``.
        decoder: Callable mapping ``(outputs, hidden)`` to
            ``(decoder_outputs, decoder_hidden, decoder_attn)``.
        sentence: Source sentence, converted with the notebook-level
            ``tensorFromSentence(input_lang, sentence)``.
        input_lang: Source-language object handed to ``tensorFromSentence``.
        output_lang: Target-language object providing ``index2word``.

    Returns:
        ``(decoded_words, decoder_attn)``: the predicted words, ending with
        ``'<EOS>'`` if the end token was produced, and whatever the decoder
        returned as its third value.
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        # Drop only the top-k axis and the batch axis. A bare `squeeze()` would
        # also collapse a single decoding step into a 0-d tensor, which cannot
        # be iterated.
        decoded_ids = topi.squeeze(-1).squeeze(0)

        decoded_words = []
        for idx in decoded_ids:
            token_id = idx.item()
            if token_id == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[token_id])
    return decoded_words, decoder_attn

def evaluateRandomly(encoder, decoder, n=10):
    """Print ``n`` randomly drawn examples next to the model's translation.

    For each draw from the notebook-level ``pairs`` three lines are printed:
    ``>`` the source sentence, ``=`` the reference translation and ``<`` the
    words returned by ``evaluate``, followed by an empty line.
    """
    for _ in range(n):
        sample = random.choice(pairs)
        source_sentence, reference_sentence = sample[0], sample[1]
        print('>', source_sentence)
        print('=', reference_sentence)
        predicted_words, _attn = evaluate(
            encoder, decoder, source_sentence, input_lang, output_lang
        )
        print('<', ' '.join(predicted_words))
        print('')
# Qualitative check: print the default number (10) of random examples with
# their reference translation and the model's prediction.
evaluateRandomly(encoder,decoder)

> what is that thing ?
= que es esa cosa ?
< refractor refractor refractor atenas abotonarme ram solida hablanos aparentar escribia

> tom lived in a trailer
= tom vivia en una caravana
< pudiera pudiera pudiera caigo partiste ahuyentar darselo escribia mary rascacielos

> people sometimes make illogical decisions
= a veces las personas toman decisiones ilogicas
< la quedaban captura dejan tocarme prefieres aventuroso trasplante cuidar llamarlo

> i went into details
= entre en detalles
< te confundio evacuar chance peligros traerse salta saliera copenhague conseguira

> she went blind
= se quedo ciega
< ella atenas atenas viejas polonia escribia formalidad enigma alimenticios moderese

> he is soon to be a father
= dentro de poco sera padre
< el el el honestamente viejas copenhague nacimientos peligros apartame hablanos

> i got here a little early today
= llegue aqui un poco temprano hoy
< pensaste refractor atenas viejas polonia escribia tengamos copenhague nacimientos peligros

> w

In [None]:
eva