<a href="https://colab.research.google.com/github/gupta24789/seq2seq/blob/main/seq2seq_gru.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Machine Translation :  German to English

In [None]:
# !pip install evaluate
# !python -m spacy download de_core_news_sm
# !python -m spacy download en_core_web_sm

In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "1"

In [None]:
import shutil
import random
import itertools
import spacy
import pandas as pd
import numpy as np
from pprint import pprint
from tqdm import tqdm , tqdm_notebook

import evaluate
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl

tqdm.pandas()

## Set Seed

In [None]:
seed = 1234

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

## Read Data

In [None]:
train_df = pd.read_csv("https://raw.githubusercontent.com/gupta24789/seq2seq/main/data/train.csv")
val_df = pd.read_csv("https://raw.githubusercontent.com/gupta24789/seq2seq/main/data/val.csv")
test_df = pd.read_csv("https://raw.githubusercontent.com/gupta24789/seq2seq/main/data/test.csv")

train_df.shape, val_df.shape, test_df.shape

((29000, 2), (1014, 2), (1000, 2))

In [None]:
train_df.head(3)

Unnamed: 0,en,de
0,"Two young, White males are outside near many b...",Zwei junge weiße Männer sind im Freien in der ...
1,Several men in hard hats are operating a giant...,Mehrere Männer mit Schutzhelmen bedienen ein A...
2,A little girl climbing into a wooden playhouse.,Ein kleines Mädchen klettert in ein Spielhaus ...


## Tokenizer

In [None]:
nlp_en = spacy.load("en_core_web_sm")
nlp_de = spacy.load("de_core_news_sm")

In [None]:
string = "What a lovely day it is today!"

[token.text for token in nlp_en.tokenizer(string)]

['What', 'a', 'lovely', 'day', 'it', 'is', 'today', '!']

## Prepare data

In [None]:
def tokenized_text(text, nlp, max_length = 1000, is_lower = True):
    if is_lower:
        text = str(text).lower()
    tokens = [token.text for token in nlp.tokenizer(text)]
    return tokens

In [None]:
train_df['en_tokens'] = train_df.en.map(lambda x: tokenized_text(x, nlp_en))
train_df['de_tokens'] = train_df.de.map(lambda x: tokenized_text(x, nlp_de))

## val
val_df['en_tokens'] = val_df.en.map(lambda x: tokenized_text(x, nlp_en))
val_df['de_tokens'] = val_df.de.map(lambda x: tokenized_text(x, nlp_de))
## test
test_df['en_tokens'] = test_df.en.map(lambda x: tokenized_text(x, nlp_en))
test_df['de_tokens'] = test_df.de.map(lambda x: tokenized_text(x, nlp_de))

In [None]:
train_df.head(3)

Unnamed: 0,en,de,en_tokens,de_tokens
0,"Two young, White males are outside near many b...",Zwei junge weiße Männer sind im Freien in der ...,"[two, young, ,, white, males, are, outside, ne...","[zwei, junge, weiße, männer, sind, im, freien,..."
1,Several men in hard hats are operating a giant...,Mehrere Männer mit Schutzhelmen bedienen ein A...,"[several, men, in, hard, hats, are, operating,...","[mehrere, männer, mit, schutzhelmen, bedienen,..."
2,A little girl climbing into a wooden playhouse.,Ein kleines Mädchen klettert in ein Spielhaus ...,"[a, little, girl, climbing, into, a, wooden, p...","[ein, kleines, mädchen, klettert, in, ein, spi..."


## Build Vocab

In [None]:
special_words = ["<unk>","<pad>", "<sos>","<eos>"]
en_words = list(set(itertools.chain.from_iterable(train_df.en_tokens.tolist())))
de_words = list(set(itertools.chain.from_iterable(train_df.de_tokens.tolist())))

en_words = special_words + en_words
de_words = special_words + de_words

en_vocab = {w:i for i,w in enumerate(en_words)}
de_vocab = {w:i for i,w in enumerate(de_words)}

UNK_ID = en_vocab['<unk>']
PAD_ID = en_vocab['<pad>']
SOS_ID = en_vocab['<sos>']
EOS_ID = en_vocab['<eos>']

print(f"en_vocab : {len(en_vocab)}")
print(f"de_vocab : {len(de_vocab)}")

en_vocab : 9795
de_vocab : 18669


## Encode text

In [None]:
def encode_text(tokens, vocab, is_add_sos = True, is_add_eos = True):
    encoded = []
    for w in tokens:
        encoded.append(vocab.get(w, UNK_ID))

    if is_add_sos:
        encoded = [SOS_ID] + encoded
    if is_add_eos:
        encoded =  encoded + [EOS_ID]
    return encoded

In [None]:
train_df['en_encoded'] = train_df.en_tokens.apply(lambda x: encode_text(x, en_vocab))
train_df['de_encoded'] = train_df.de_tokens.apply(lambda x: encode_text(x, de_vocab))

## val
val_df['en_encoded'] = val_df.en_tokens.apply(lambda x: encode_text(x, en_vocab))
val_df['de_encoded'] = val_df.de_tokens.apply(lambda x: encode_text(x, de_vocab))
## test
test_df['en_encoded'] = test_df.en_tokens.apply(lambda x: encode_text(x, en_vocab))
test_df['de_encoded'] = test_df.de_tokens.apply(lambda x: encode_text(x, de_vocab))

In [None]:
train_df.head(3)

Unnamed: 0,en,de,en_tokens,de_tokens,en_encoded,de_encoded
0,"Two young, White males are outside near many b...",Zwei junge weiße Männer sind im Freien in der ...,"[two, young, ,, white, males, are, outside, ne...","[zwei, junge, weiße, männer, sind, im, freien,...","[2, 4525, 2494, 6023, 591, 3214, 5611, 2053, 2...","[2, 9631, 12850, 6087, 17567, 12552, 17569, 12..."
1,Several men in hard hats are operating a giant...,Mehrere Männer mit Schutzhelmen bedienen ein A...,"[several, men, in, hard, hats, are, operating,...","[mehrere, männer, mit, schutzhelmen, bedienen,...","[2, 6028, 9729, 8160, 3203, 2546, 5611, 9332, ...","[2, 2833, 17567, 11574, 5083, 18477, 9541, 239..."
2,A little girl climbing into a wooden playhouse.,Ein kleines Mädchen klettert in ein Spielhaus ...,"[a, little, girl, climbing, into, a, wooden, p...","[ein, kleines, mädchen, klettert, in, ein, spi...","[2, 7307, 2193, 4520, 6500, 7255, 7307, 1912, ...","[2, 9541, 17600, 7526, 10294, 15634, 9541, 144..."


## Data Loaders

In [None]:
train_data = train_df[['en_encoded','de_encoded']].to_dict('records')
val_data = val_df[['en_encoded','de_encoded']].to_dict('records')
test_data = test_df[['en_encoded','de_encoded']].to_dict('records')

In [None]:
pprint(train_data[:2], compact=True)

[{'de_encoded': [2, 9631, 12850, 6087, 17567, 12552, 17569, 12147, 15634, 17501,
                 10171, 17078, 9692, 3315, 3],
  'en_encoded': [2, 4525, 2494, 6023, 591, 3214, 5611, 2053, 243, 2871, 8517,
                 1705, 3]},
 {'de_encoded': [2, 2833, 17567, 11574, 5083, 18477, 9541, 2391, 3315, 3],
  'en_encoded': [2, 6028, 9729, 8160, 3203, 2546, 5611, 9332, 7307, 2805, 7408,
                 14, 1705, 3]}]


In [None]:
def custom_collate(batch):
    """
    Dynamic padding : find the max len in the batch and do the padding
    """
    en_batch = [torch.tensor(item['en_encoded']) for item in batch]
    de_batch = [torch.tensor(item['de_encoded']) for item in batch]

    padded_en = nn.utils.rnn.pad_sequence(en_batch, batch_first= True, padding_value= PAD_ID)
    padded_de = nn.utils.rnn.pad_sequence(de_batch, batch_first= True, padding_value= PAD_ID)

    return {"padded_en": padded_en, "padded_de":  padded_de}

In [None]:
batch_size = 3
train_dl = DataLoader(train_data , batch_size = batch_size, shuffle = False, collate_fn= custom_collate)
example = next(iter(train_dl))
padded_en, padded_de = example['padded_en'],example['padded_de']
print(padded_en.shape, padded_de.shape)

torch.Size([3, 14]) torch.Size([3, 15])


In [None]:
padded_en

tensor([[   2, 4525, 2494, 6023,  591, 3214, 5611, 2053,  243, 2871, 8517, 1705,
            3,    1],
        [   2, 6028, 9729, 8160, 3203, 2546, 5611, 9332, 7307, 2805, 7408,   14,
         1705,    3],
        [   2, 7307, 2193, 4520, 6500, 7255, 7307, 1912, 4069, 1705,    3,    1,
            1,    1]])

In [None]:
padded_de

tensor([[    2,  9631, 12850,  6087, 17567, 12552, 17569, 12147, 15634, 17501,
         10171, 17078,  9692,  3315,     3],
        [    2,  2833, 17567, 11574,  5083, 18477,  9541,  2391,  3315,     3,
             1,     1,     1,     1,     1],
        [    2,  9541, 17600,  7526, 10294, 15634,  9541, 14413, 15413, 10444,
          3315,     3,     1,     1,     1]])

In [None]:
## dataloaders
batch_size = 256
train_dl = DataLoader(train_data , batch_size = batch_size, shuffle = True, collate_fn= custom_collate)
val_dl = DataLoader(val_data , batch_size = batch_size, shuffle = False, collate_fn= custom_collate)
test_dl = DataLoader(test_data , batch_size = batch_size, shuffle = False, collate_fn= custom_collate)

## Building Model

In [None]:
class Encoder(nn.Module):
    """
    Encode German text
    """
    def __init__(self, vocab_size, emb_dim, hidden_dim, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim, hidden_dim, dropout=dropout, batch_first= True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, batch):
        # batch = [batch size, sent len]
        embedded = self.dropout(self.embedding(batch))
        # embedded = [batch size, sent len, embedding dim]
        # print(f"embedded : {embedded.shape}")
        outputs, hidden = self.rnn(embedded)
        # print(f"outputs : {outputs.shape}")
        # print(f"hidden : {hidden.shape}")
         # no cell state in GRU!
        # outputs = [batch size, sent len, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # outputs are always from the top hidden layer
        return hidden

In [None]:
## Enoder
print(f"Input : {padded_de.shape}")
encoder = Encoder(vocab_size = len(de_vocab), emb_dim = 100, hidden_dim=64,dropout=0.1)
hidden = encoder(padded_de)
print(hidden.shape)

Input : torch.Size([3, 15])
torch.Size([1, 3, 64])




In [None]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, dropout):
        super().__init__()
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim + hidden_dim, hidden_dim, dropout=dropout,  batch_first= True)
        self.fc_out = nn.Linear(emb_dim + hidden_dim * 2, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, context):

        # input = [batch size]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # context = [n layers * n directions, batch size, hidden dim]
        # n layers and n directions in the decoder will both always be 1, therefore:
        # hidden = [1, batch size, hidden dim]
        # context = [1, batch size, hidden dim]

        # print(f"Input : {input.shape}")
        # print(f'hidden : {hidden.shape}')
        # print(f'context : {context.shape}')

        input = input.unsqueeze(1)
        # print(f"Input -1 : {input.shape}")
        # input = [batch size, 1]

        embedded = self.dropout(self.embedding(input))
        # print("embedded : ", embedded.shape)
        # embedded = [batch size, 1, embedding dim]

        emb_con = torch.cat((embedded, context.permute(1,0,2)), dim=2)
        # print(f"emb con : {emb_con.shape}")
        # emb_con = [batch size, 1, embedding dim + hidden dim]

        output, hidden = self.rnn(emb_con, hidden)
        # print(f"output : {output.shape}")
        # print(f"hidden  : {hidden.shape}")
        # output = [batch size, seq length, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # seq len, n layers and n directions will always be 1 in this decoder, therefore:
        # output = [batch size, 1, hidden dim]
        # hidden = [1, batch size, hidden dim]

        output = torch.cat((embedded.permute(1,0,2).squeeze(0), hidden.squeeze(0), context.squeeze(0)), dim=1).squeeze(0)
        # print(output.shape)
        # output = [batch size, embedding dim + hidden dim * 2]
        prediction = self.fc_out(output)
        # prediction = [batch size, output dim]
        return prediction, hidden

In [None]:
## Decoder
## de[:,0] -> first input
decoder = Decoder(vocab_size = len(en_vocab), emb_dim = 100, hidden_dim=64, dropout=0.1)
prediction, hidden = decoder(padded_en[:,0], hidden, hidden)
prediction.shape, hidden.shape

(torch.Size([3, 9795]), torch.Size([1, 3, 64]))

In [None]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio):
        # src = [batch size, sent len]
        # trg = [batch size, sent len]
        # teacher_forcing_ratio is probability to use teacher forcing
        # e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        batch_size = trg.shape[0]
        trg_length = trg.shape[1]
        trg_vocab_size = self.decoder.vocab_size
        # tensor to store decoder outputs
        outputs = torch.zeros(batch_size,trg_length,trg_vocab_size).to(self.device)
        # last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden = self.encoder(src)
        context = hidden
        # hidden = [n layers * n directions, batch size, hidden dim]
        # context = [n layers * n directions, batch size, hidden dim]
        # first input to the decoder is the <sos> tokens
        input = trg[:,0]
        # input = [batch size]
        for t in range(1, trg_length):
            # insert input token embedding, previous hidden and previous cell states
            # receive output tensor (predictions) and new hidden and cell states
            output, hidden = self.decoder(input, hidden, context)
            # output = [batch size, output dim]
            # hidden = [n layers, batch size, hidden dim]
            # cell = [n layers, batch size, hidden dim]
            # place predictions in a tensor holding predictions for each token
            output = output.squeeze(dim = 1)
            outputs[:,t,:] = output
            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            # get the highest predicted token from our predictions
            top1 = output.argmax(1)
            # if teacher forcing, use actual next token as next input
            # if not, use predicted token
            input = trg[:,t] if teacher_force else top1
            # input = [batch size]
        return outputs

In [None]:
# model = Seq2Seq(encoder, decoder, device = "cpu")
# outputs = model(padded_de, padded_en, teacher_forcing_ratio = True)
# print(outputs.shape)

## Training

In [None]:
class Seq2SeqLightningModel(pl.LightningModule):

    def __init__(self, encode, decoder, learning_rate, device, teacher_forcing_ratio):
        super().__init__()
        self.learning_rate = learning_rate
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.encoder = encoder
        self.decoder = decoder
        self.seq2seq_model = Seq2Seq(self.encoder, self.decoder, device)
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_ID)
        self.init_weights()

        self.train_loss = []
        self.val_loss = []
        self.test_loss = []

    def init_weights(self):
        for name, param in self.named_parameters():
            nn.init.normal_(param.data, mean=0, std=0.01)

    def forward(self, de, en):
        logits = self.seq2seq_model(de, en, teacher_forcing_ratio = self.teacher_forcing_ratio)
        return logits

    def training_step(self, batch):
        src, trg = batch['padded_de'], batch['padded_en']
        output = self(src, trg)
        # output = [trg length, batch size, trg vocab size]
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        # output = [(trg length - 1) * batch size, trg vocab size]
        trg = trg[1:].view(-1)
        # trg = [(trg length - 1) * batch size]
        # print(output.shape, trg.shape)
        loss = self.loss_fn(output, trg)
        self.train_loss.append(loss.item())
        self.log_dict({"train_loss": loss}, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch):
        src, trg = batch['padded_de'], batch['padded_en']
        output = self(src, trg)
        # output = [trg length, batch size, trg vocab size]
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        # output = [(trg length - 1) * batch size, trg vocab size]
        trg = trg[1:].view(-1)
        # trg = [(trg length - 1) * batch size]
        loss = self.loss_fn(output, trg)
        self.val_loss.append(loss.item())
        self.log_dict({"val_loss": loss}, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def on_validation_epoch_end(self):
        print(f"Epoch : {self.current_epoch}  \
              Train Loss : {np.mean(self.train_loss)} \
              Val Loss : {np.mean(self.val_loss)} \
              Train PPL : {np.exp(np.mean(self.train_loss))} \
              Val PPL : {np.exp(np.mean(self.val_loss))} ")

        self.train_loss =[]
        self.val_loss =[]


    def test_step(self, batch):
        src, trg = batch['padded_de'], batch['padded_en']
        output = self(src, trg)
        # output = [trg length, batch size, trg vocab size]
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        # output = [(trg length - 1) * batch size, trg vocab size]
        trg = trg[1:].view(-1)
        # trg = [(trg length - 1) * batch size]
        loss = self.loss_fn(output, trg)
        self.test_loss.append(loss.item())
        return loss

    def on_test_epoch_end(self) -> None:
        print(f"Test Loss : {np.mean(self.test_loss)}  Test PPL : {np.exp(np.mean(self.test_loss))}")
        self.test_loss = []

    def configure_optimizers(self):
        opt = optim.Adam(self.parameters(), lr = self.learning_rate)
        return opt

In [None]:
## test architecture
encoder = Encoder(vocab_size = len(de_vocab), emb_dim = 256, hidden_dim=512, dropout=0.5)
decoder = Decoder(vocab_size = len(en_vocab), emb_dim = 256, hidden_dim=512, dropout=0.5)
model = Seq2SeqLightningModel(encoder, decoder, learning_rate= .001, device ="cpu", teacher_forcing_ratio=0.5)
outputs = model(padded_de, padded_en)
outputs.shape



torch.Size([3, 14, 9795])

In [None]:
## clear old checkpoints
checkpoints_dir = "checkpoints_logs"
if os.path.exists(checkpoints_dir):
    shutil.rmtree(checkpoints_dir)

## Model Training
encoder = Encoder(vocab_size = len(de_vocab), emb_dim = 300, hidden_dim=512, dropout=0.5)
decoder = Decoder(vocab_size = len(en_vocab), emb_dim = 300, hidden_dim=512, dropout=0.5)
model = Seq2SeqLightningModel(encoder, decoder, learning_rate= .001, device ="cuda", teacher_forcing_ratio=0.5)

callbacks = pl.callbacks.ModelCheckpoint(dirpath = checkpoints_dir,
                                         filename = '{epoch}-{val_loss:.2f}',
                                          mode = "min",
                                          monitor = "val_loss",
                                          save_last = True,
                                          save_top_k=-1)


trainer = pl.Trainer(accelerator= "gpu",
           max_epochs=10,
           check_val_every_n_epoch = 2,
           gradient_clip_val=1,
           gradient_clip_algorithm="value",
           callbacks = [callbacks])

trainer.fit(model, train_dl, val_dl)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/saurabh/anaconda3/envs/lighting/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]

  | Name          | Type             | Params
---------------------------------------------------
0 | encoder       | Encoder          | 6.9 M 
1 | decoder       | Decoder          | 18.0 M
2 | seq2seq_model | Seq2Seq          | 24.8 M
3 | loss_fn       |

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/saurabh/anaconda3/envs/lighting/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch : 0                Train Loss : nan               Val Loss : 9.18659782409668               Train PPL : nan               Val PPL : 9765.370889417609 


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
/home/saurabh/anaconda3/envs/lighting/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 1                Train Loss : 5.42063157600269               Val Loss : 5.022166967391968               Train PPL : 226.02182739506364               Val PPL : 151.73976290370982 


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 3                Train Loss : 4.790816967947441               Val Loss : 4.609658598899841               Train PPL : 120.39969118315156               Val PPL : 100.44985009243915 


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 5                Train Loss : 4.392380410119107               Val Loss : 4.313825607299805               Train PPL : 80.83260489973121               Val PPL : 74.72581444191289 


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 7                Train Loss : 3.9589707014853492               Val Loss : 3.9980509281158447               Train PPL : 52.40335947998493               Val PPL : 54.491837952586906 


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch : 9                Train Loss : 3.5317980302007577               Val Loss : 3.7198106050491333               Train PPL : 34.18537873207244               Val PPL : 41.25657958075547 


`Trainer.fit` stopped: `max_epochs=10` reached.


In [None]:
## if test dataset has loss near around val loss that means we are not overfitting
model = model.eval()
trainer.test(model, dataloaders= test_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]
/home/saurabh/anaconda3/envs/lighting/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

Test Loss : 3.7403244376182556  Test PPL : 42.11165055251135


[{}]

## Predict

In [None]:
en_vocab_lookup = {i:w for w,i in en_vocab.items()}

def encode_tokens(tokens, vocab):
    encoded = []
    for w in tokens:
        encoded.append(vocab.get(w, UNK_ID))
    return encoded


def decode_tokens(tokens, vocab_loopup):
    decoded = []
    for i in tokens:
        if i == 2 or i==3:
            continue
        decoded.append(vocab_loopup.get(i))
    return decoded


def translate_sentence(sentence, model, en_nlp, de_nlp, en_vocab, de_vocab, sos_token, eos_token, device, lower = True, max_output_length=25):
    ## model eval mode
    model.eval()

    model = model.to(device)

    ## encode german sent
    sent = str(sentence).lower() if lower else sentence
    tokens = [token.text for token in de_nlp.tokenizer(sent)]
    tokens = [sos_token] + tokens + [eos_token]
    ids = encode_tokens(tokens, de_vocab)

    ## encoder input
    tensor = torch.LongTensor(ids).unsqueeze(0).to(device)
    tensor = tensor.to(device)

    ## encoder
    hidden = model.encoder(tensor)

    ## input to decoder <sos>
    inputs = encode_tokens([sos_token], en_vocab)

    for _ in range(max_output_length):
        inputs_tensor = torch.LongTensor([inputs[-1]]).to(device)
        output, hidden = model.decoder(inputs_tensor, hidden, hidden)
        predicted_token = output.argmax(-1).item()
        inputs.append(predicted_token)
        if predicted_token == en_vocab[eos_token]:
            break

    tokens = decode_tokens(inputs, en_vocab_lookup)

    return tokens

In [None]:
sent = "Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt."
predictions = translate_sentence(sent, model, nlp_en, nlp_de, en_vocab, de_vocab, sos_token="<sos>", eos_token="<eos>", device = "cuda")
print(predictions)

['a', 'man', 'in', 'a', 'green', 'jacket', 'is']


## Blue Scroe

In [None]:
bleu = evaluate.load("bleu")

In [None]:
references = test_df.en.str.lower().tolist()
pprint(references[:2], compact=True)

['a man in an orange hat starring at something.',
 'a boston terrier is running on lush green grass in front of a white fence.']


In [None]:
prediction_list = []
prediction_list = [translate_sentence(sent, model, nlp_en, nlp_de, en_vocab, de_vocab, sos_token="<sos>", eos_token="<eos>", device = "cuda") for sent in tqdm_notebook(test_df.de.values)]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  prediction_list = [translate_sentence(sent, model, nlp_en, nlp_de, en_vocab, de_vocab, sos_token="<sos>", eos_token="<eos>", device = "cuda") for sent in tqdm_notebook(test_df.de.values)]


  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
pprint(prediction_list[:2], compact = True)

[['a', 'man', 'in', 'a', 'green', 'jacket', 'is'],
 ['a', 'brown', 'dog', 'runs', 'through', 'a', 'grassy', 'field', '.']]


In [None]:
def get_tokenizer_fn(nlp, lower):
    def tokenizer_fn(s):
        tokens = [token.text for token in nlp.tokenizer(s)]
        if lower:
            tokens = [token.lower() for token in tokens]
        return tokens

    return tokenizer_fn

tokenizer_fn = get_tokenizer_fn(nlp_en,  lower = True)

In [None]:
predictions = list(map(lambda x: " ".join(x),prediction_list))

In [None]:
results = bleu.compute(predictions=predictions, references=references, tokenizer=tokenizer_fn)
results

{'bleu': 0.07709181260649708,
 'precisions': [0.5672100165367352,
  0.1994374497723011,
  0.08304979894834519,
  0.03293084522502744],
 'brevity_penalty': 0.5812798123625764,
 'length_ratio': 0.6482885366413967,
 'translation_length': 8466,
 'reference_length': 13059}