<a href="https://colab.research.google.com/github/djm3622/bayesDL/blob/main/entofrv0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import/Setup

Mount Google Drive, download the Kaggle dataset, and import the necessary packages.

In [None]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! mkdir ~/.kaggle
! cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

# large dataset
# ! kaggle datasets download -d dhruvildave/en-fr-translation-dataset
# ! unzip /content/en-fr-translation-dataset.zip
# ! rm /content/en-fr-translation-dataset.zip

# small dataset
! kaggle datasets download -d devicharith/language-translation-englishfrench
! unzip /content/language-translation-englishfrench.zip
! rm /content/language-translation-englishfrench.zip

Downloading language-translation-englishfrench.zip to /content
  0% 0.00/3.51M [00:00<?, ?B/s]
100% 3.51M/3.51M [00:00<00:00, 191MB/s]
Archive:  /content/language-translation-englishfrench.zip
  inflating: eng_-french.csv         


In [None]:
import pandas as pd
import numpy as np
import torch
import random

In [None]:
data = pd.read_csv('/content/eng_-french.csv')

In [None]:
data.dropna(inplace=True)
small = False

In [None]:
data

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !
...,...,...
175616,"Top-down economics never works, said Obama. ""T...","« L'économie en partant du haut vers le bas, ç..."
175617,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
175618,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
175619,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...


# Byte Pair Encoding

Using the GPT-2 tokenizer, with separate tokenizers for the English and French text.

In [None]:
! pip install transformers --quiet
from transformers import GPT2TokenizerFast

en_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", unk_token='<unk>', bos_token='<bos>', eos_token='<eos>', pad_token='<pad>')
fr_tokenizer = GPT2TokenizerFast.from_pretrained("benjamin/gpt2-wechsel-french", unk_token='<unk>', bos_token='<bos>', eos_token='<eos>', pad_token='<pad>')

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/250 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/855k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/514k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.47M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [None]:
! pip3 install subword-nmt &> log
! wget https://raw.githubusercontent.com/yandexdataschool/nlp_course/2020/week04_seq2seq/vocab.py -O vocab.py

--2023-12-09 19:56:19--  https://raw.githubusercontent.com/yandexdataschool/nlp_course/2020/week04_seq2seq/vocab.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2879 (2.8K) [text/plain]
Saving to: ‘vocab.py’


2023-12-09 19:56:20 (55.8 MB/s) - ‘vocab.py’ saved [2879/2879]



In [None]:
import csv
from nltk.tokenize import WordPunctTokenizer
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE
tokenizer = WordPunctTokenizer()
def tokenize(x):
    """Lower-case `x`, split it with the module-level `tokenizer`, and join the pieces with single spaces."""
    pieces = tokenizer.tokenize(x.lower())
    return ' '.join(pieces)

# split and tokenize the data: column 0 goes to train.en, column 1 to train.fr,
# one tokenized sentence per line so the two files stay line-aligned
with open('train.en', 'w') as f_src,  open('train.fr', 'w') as f_dst:
  # newline='' leaves newline handling to the csv module, as its documentation requires
  with open('eng_-french.csv', 'r', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    header = next(csv_reader)  # consume the column-name row
    for line in csv_reader:
        # skip blank or malformed rows instead of failing with IndexError
        if len(line) < 2:
            continue
        src_line, dst_line = line[0], line[1]
        f_src.write(tokenize(src_line) + '\n')
        f_dst.write(tokenize(dst_line) + '\n')

# build and apply bpe vocs
bpe = {}
for lang in ['en', 'fr']:
    # Learn the merge rules. The context managers close (and therefore flush)
    # the rules file before it is read back just below.
    with open('./train.' + lang) as f_corpus, open('bpe_rules.' + lang, 'w') as f_rules:
        learn_bpe(f_corpus, f_rules, num_symbols=8000)

    # NOTE(review): assumes BPE() reads the whole rules file in its constructor,
    # so the handle can be closed right afterwards — confirm against subword-nmt.
    with open('./bpe_rules.' + lang) as f_rules:
        bpe[lang] = BPE(f_rules)

    # Re-encode every tokenized sentence with the learned rules, one per line.
    with open('train.' + lang) as f_in, open('train.bpe.' + lang, 'w') as f_out:
        for line in f_in:
            f_out.write(bpe[lang].process_line(line.strip()) + '\n')

100%|██████████| 8000/8000 [00:17<00:00, 446.80it/s]
100%|██████████| 8000/8000 [00:15<00:00, 513.75it/s] 


In [None]:
def _read_lines(path):
  """Return the lines of `path` without their newline characters.

  The file ends with a newline, so a plain split('\n') yields one extra empty
  string at the end; that trailing entry is dropped so it does not become a
  spurious empty sentence pair.
  """
  with open(path) as handle:
    lines = handle.read().split('\n')
  if lines and lines[-1] == '':
    lines.pop()
  return lines

# French BPE lines are the model input, English BPE lines the output.
data_inp = np.array(_read_lines('./train.bpe.fr'))
data_out = np.array(_read_lines('./train.bpe.en'))
assert len(data_inp) == len(data_out), 'source/target line counts differ'

from sklearn.model_selection import train_test_split
# 80/20 split with a fixed seed so the partition is reproducible.
train_inp, dev_inp, train_out, dev_out = train_test_split(data_inp, data_out, test_size=0.20, random_state=42)

In [None]:
from vocab import Vocab
inp_voc = Vocab.from_lines(train_inp)
out_voc = Vocab.from_lines(train_out)

In [None]:
batch_inp = inp_voc.to_matrix(train_inp[1:2])
batch_out = out_voc.to_matrix(train_out[1:2])

In [None]:
from torch.utils.data import Dataset

class EngFrDatasetAnton(Dataset):
  """Dataset over parallel BPE lines encoded with two vocabularies.

  Args:
    inp_voc: vocabulary used for `train_inp` (stored as `input_voc`).
    out_voc: vocabulary used for `train_out` (stored as `out_voc`).
    train_inp: indexable sequence of input-side lines.
    train_out: indexable sequence of output-side lines, aligned with `train_inp`.
    who_train: accepted for signature compatibility; not used.

  Each item is `((en, en.size()), (fr, fr.size()))`, where `en` comes from
  `train_out`/`out_voc` and `fr` from `train_inp`/`input_voc`.
  """
  def __init__(self, inp_voc, out_voc, train_inp, train_out, who_train='en'):
    self.input_voc, self.out_voc = inp_voc, out_voc
    self.train_inp, self.train_out = train_inp, train_out

  def __len__(self):
    return len(self.train_inp)

  def __getitem__(self, idex):
    # Read from the arrays this instance was built with. Indexing the notebook
    # globals instead would make every instance (including the validation one)
    # serve rows of the training split.
    # assumes to_matrix returns a (1, length) tensor for a single line — TODO confirm against vocab.py
    en = self.out_voc.to_matrix(self.train_out[idex:idex+1]).t().squeeze(dim=1)
    fr = self.input_voc.to_matrix(self.train_inp[idex:idex+1]).t().squeeze(dim=1)
    return (en, en.size()), (fr, fr.size())

In [None]:
train_dset = EngFrDatasetAnton(inp_voc, out_voc, train_inp, train_out)

In [None]:
valid_dset = EngFrDatasetAnton(inp_voc, out_voc, dev_inp, dev_out)

In [None]:
def binaryMatrix(l, value=1):
  """Return a nested list shaped like `l` holding 0 where a token equals `value` and 1 elsewhere."""
  return [[0 if token == value else 1 for token in seq] for seq in l]

In [None]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm.notebook import tqdm_notebook

def collate_fn(batch):
  """Pad a batch of `((enc, size), (dec, size))` items into time-major tensors.

  Returns a 6-tuple:
    padded encoder inputs, encoder lengths (1-D int64), encoder mask,
    padded decoder targets, decoder mask, padded decoder length.
  The masks are boolean tensors that are True on real tokens and False on padding.
  """
  enc, dec = zip(*batch)
  enc, _ = zip(*enc)
  dec, _ = zip(*dec)

  # The index one past the last vocabulary entry is used for padding. The masks
  # must be computed against this same value; comparing against any other index
  # leaves every position marked as a real token.
  enc_pad = len(train_dset.out_voc)
  dec_pad = len(train_dset.input_voc)

  packed_enc_ins = pad_sequence(enc, padding_value=enc_pad)
  packed_dec_outs = pad_sequence(dec, padding_value=dec_pad)

  # Built per sequence so the result stays 1-D even for a single-sample batch.
  enc_lens = torch.tensor([seq.size(0) for seq in enc], dtype=torch.long)

  enc_mask = packed_enc_ins != enc_pad
  dec_mask = packed_dec_outs != dec_pad

  return packed_enc_ins, enc_lens, enc_mask, packed_dec_outs, dec_mask, packed_dec_outs.shape[0]

In [None]:
train_dl = DataLoader(train_dset, batch_size=64, shuffle=True, collate_fn=collate_fn, num_workers=1)

In [None]:
valid_dl = DataLoader(valid_dset, batch_size=64, shuffle=True, collate_fn=collate_fn, num_workers=1)

# Device Setup

Determining what resources are available.

In [None]:
# Pick the compute backend: CUDA first, then MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
print(f"Using {device} device.")

Using cuda device.


# Dataset

Building the Dataset and DataLoader, and determining which language will be encoded and which will be decoded.

In [None]:
from torch.utils.data import Dataset

class EngFrDataset(Dataset):
  """Dataset of tokenized sentence pairs taken from the first two columns of `data`.

  Args:
    data: DataFrame whose column 0 holds English text and column 1 French text.
    en_tokenizer: tokenizer applied to the English column.
    fr_tokenizer: tokenizer applied to the French column.
    who_train: 'en' encodes English and decodes French; any other value swaps the roles.
    max_len: maximum number of token ids kept per sentence (default 100).

  Each item is `((source, source.size()), (target, target.size()))` with int64 tensors.
  """
  def __init__(self, data, en_tokenizer, fr_tokenizer, who_train='en', max_len=100):
    self.size = data.shape
    self.max_len = max_len
    english, french = data.iloc[:, 0].values, data.iloc[:, 1].values

    if who_train == 'en':
      self.enc_tokenizer, self.dec_tokenizer = en_tokenizer, fr_tokenizer
      self.enc, self.dec = english, french
    else:
      self.enc_tokenizer, self.dec_tokenizer = fr_tokenizer, en_tokenizer
      self.enc, self.dec = french, english

  def __len__(self):
    return self.size[0]

  def _encode(self, tokenizer, text):
    """Tokenize `text` followed by '<eos>' and cap the result at `max_len` ids."""
    ids = tokenizer(text + '<eos>')['input_ids']
    if len(ids) > self.max_len:
      # keep the first max_len - 1 ids and close the sequence with the EOS id
      ids = ids[:self.max_len - 1] + [tokenizer.eos_token_id]
    return torch.tensor(ids, dtype=torch.long)

  def __getitem__(self, idex):
    source = self._encode(self.enc_tokenizer, self.enc[idex])
    target = self._encode(self.dec_tokenizer, self.dec[idex])
    return (source, source.size()), (target, target.size())

In [None]:
train_data = data.sample(frac=0.80, random_state=42)
valid_data = data.drop(train_data.index)

In [None]:
train = EngFrDataset(train_data, en_tokenizer, fr_tokenizer)
valid = EngFrDataset(valid_data, en_tokenizer, fr_tokenizer)

In [None]:
train[0][0][0].shape

torch.Size([5])

*A custom collate function is needed with an RNN because the input sequences vary in length.*

In [None]:
def binaryMatrix(l, value=50260):
  """Map every token of every sequence in `l` to 0 if it equals `value`, else 1 (nested lists)."""
  rows = []
  for seq in l:
    rows.append([int(not (token == value)) for token in seq])
  return rows

In [None]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm.notebook import tqdm_notebook

def collate_fn(batch):
  """Pad a batch of `((enc, size), (dec, size))` items into time-major tensors.

  Returns a 6-tuple:
    padded encoder inputs, encoder lengths (1-D int64), encoder mask,
    padded decoder targets, decoder mask, padded decoder length.
  The masks are boolean tensors that are True on real tokens and False on padding.
  """
  # NOTE(review): presumably the id of the '<pad>' token added to both
  # tokenizers — confirm against `en_tokenizer.pad_token_id`.
  pad_id = 50260

  enc, dec = zip(*batch)
  enc, _ = zip(*enc)
  dec, _ = zip(*dec)

  packed_enc_ins = pad_sequence(enc, padding_value=pad_id)
  packed_dec_outs = pad_sequence(dec, padding_value=pad_id)

  # Built per sequence so the result stays 1-D even for a single-sample batch.
  enc_lens = torch.tensor([seq.size(0) for seq in enc], dtype=torch.long)

  # A tensor comparison gives the same boolean mask as a per-token Python loop.
  enc_mask = packed_enc_ins != pad_id
  dec_mask = packed_dec_outs != pad_id

  return packed_enc_ins, enc_lens, enc_mask, packed_dec_outs, dec_mask, packed_dec_outs.shape[0]

train_dl = DataLoader(train, batch_size=64, shuffle=True, collate_fn=collate_fn, num_workers=1)
valid_dl = DataLoader(valid, batch_size=64, shuffle=True, collate_fn=collate_fn, num_workers=1)

In [None]:
count = 1

for k in tqdm_notebook(train_dl):
  if count > 100: break
  else: count += 1

  0%|          | 0/2196 [00:00<?, ?it/s]

In [None]:
inst = train[10]

for idx, (token, tsizes) in enumerate(inst):
  print(token)
  if idx == 0:
    print(train_dl.dataset.enc_tokenizer.convert_ids_to_tokens(token))
    print(train_dl.dataset.enc_tokenizer.decode(token))
  else:
    print(train_dl.dataset.dec_tokenizer.convert_ids_to_tokens(token))
    print(train_dl.dataset.dec_tokenizer.decode(token))

tensor([ 3666,  2988,  4762,   326,  2687,   508,   714,   407,   787,   257,
         2877,   287,  2869,   373, 16931,    13, 50258])
['My', 'Ġfather', 'Ġbelieved', 'Ġthat', 'Ġanyone', 'Ġwho', 'Ġcould', 'Ġnot', 'Ġmake', 'Ġa', 'Ġliving', 'Ġin', 'ĠJapan', 'Ġwas', 'Ġlazy', '.', '<eos>']
My father believed that anyone who could not make a living in Japan was lazy.<eos>
tensor([ 5653,  2694, 13350,   334,  1388,   357,   412,  3632,  3852,   455,
          814,   313,  5536,   866,   286, 43662,    14, 50258])
['Mon', 'ĠpÃ¨re', 'Ġpensait', 'Ġque', 'Ġcelui', 'Ġqui', 'Ġne', 'Ġpouvait', 'Ġgagner', 'Ġsa', 'Ġvie', 'Ġau', 'ĠJapon', 'ĠÃ©tait', 'Ġun', 'Ġparesseux', '.', '<eos>']
Mon père pensait que celui qui ne pouvait gagner sa vie au Japon était un paresseux.<eos>


# Seq2Seq

With bidirectional encoding, global attention, and teacher forcing.

In [None]:
from torch import nn

class Encoder(nn.Module):
  """Embedding followed by a bidirectional multi-layer GRU over packed sequences."""
  def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
    super().__init__()

    self.hidden_dim = hidden_dim

    self.embedding = nn.Embedding(input_dim, emb_dim)
    # Registered as a submodule, but forward() below does not apply it.
    self.dropout = nn.Dropout(dropout)
    self.rnn = nn.GRU(emb_dim, hidden_dim, n_layers, dropout=dropout, bidirectional=True)

  def forward(self, seq, lens):
    """Encode padded token ids `seq` whose true lengths are given by `lens`.

    Returns `(outputs, hidden)`: the padded GRU outputs with the two direction
    halves summed, and the GRU's final hidden state unchanged.
    """
    embedded = self.embedding(seq)

    # pack so the GRU skips the padded steps; sequences need not be sorted by length
    packed_in = nn.utils.rnn.pack_padded_sequence(embedded, lens, enforce_sorted=False)
    packed_out, hidden = self.rnn(packed_in)
    padded_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out)

    # the last dimension holds both directions side by side; add the two halves
    first_half, second_half = padded_out.chunk(2, dim=-1)
    return first_half + second_half, hidden

In [None]:
class Attention(nn.Module):
  """Attention weights from a learned linear projection of the encoder outputs."""
  def __init__(self, hidden_size):
    super().__init__()

    self.hidden_size = hidden_size
    self.attn = nn.Linear(hidden_size, hidden_size)

  def general_score(self, hidden, encoder_output):
    """Score each encoder position: sum over the last axis of hidden * attn(encoder_output)."""
    projected = self.attn(encoder_output)  # seq_len x batch_size x hidden_size
    return (hidden * projected).sum(dim=2)  # seq_len x batch_size

  def forward(self, hidden, outputs, mask):
    """Return softmax-normalized weights of shape (batch, 1, seq_len)."""
    scores = self.general_score(hidden, outputs)

    # positions where the mask is False get a very low score, so the softmax
    # assigns them (almost) no weight
    scores = scores.masked_fill(mask.logical_not(), -1e10)

    # move batch to the front, normalize over the sequence axis, add a middle axis
    per_batch = scores.t()
    return nn.functional.softmax(per_batch, dim=1).unsqueeze(1)

In [None]:
class Decoder(nn.Module):
  """Single-step GRU decoder that mixes its output with an attention context."""
  def __init__(self, input_dim, output_dim, emb_dim, hidden_dim, n_layers, dropout):
    super().__init__()

    self.n_layers = n_layers

    self.embedding = nn.Embedding(input_dim, emb_dim)
    self.gru = nn.GRU(emb_dim, hidden_dim, n_layers, dropout=dropout)

    self.concat = nn.Linear(hidden_dim * 2, hidden_dim)
    self.out = nn.Linear(hidden_dim, output_dim)

    self.attn = Attention(hidden_dim)

  def forward(self, current, last_hidden, enc_outputs, enc_mask):
    """Run one decoding step.

    Returns `(output, hidden)`: softmax probabilities over the output
    vocabulary for every batch element, and the updated GRU hidden state.
    """
    embedded = self.embedding(current)
    rnn_out, hidden = self.gru(embedded, last_hidden)

    # attention weights of the current GRU output over the encoder outputs
    weights = self.attn(rnn_out, enc_outputs, enc_mask)

    # weighted sum of the encoder outputs (batch-first for bmm), then drop the
    # singleton middle axis
    context = torch.bmm(weights, enc_outputs.transpose(0, 1)).squeeze(1)

    # join the GRU output (leading step axis removed) with the context vector
    combined = torch.cat((rnn_out.squeeze(0), context), 1)
    mixed = torch.tanh(self.concat(combined))

    # project to the vocabulary and normalize to probabilities
    output = nn.functional.softmax(self.out(mixed), dim=1)

    return output, hidden

Teacher forcing feeds the ground-truth token, rather than the model's own previous prediction, as the decoder's next input during training.

In [None]:
def maskNLLLoss(inp, target, mask):
  """Mean negative log-likelihood of `target` under `inp`, over positions where `mask` is True.

  Args:
    inp: per-row probabilities; column `target[i]` of row `i` is gathered.
    target: index of the correct class for each row.
    mask: boolean tensor selecting the rows that contribute to the loss.

  Returns:
    (loss, n): the mean loss as a tensor on the same device as `inp`, and the
    number of selected rows as a Python number.
  """
  nTotal = mask.sum()
  target_probs = torch.gather(inp, 1, target.view(-1, 1)).squeeze(1)
  # Clamp before the log: a probability that underflowed to exactly 0 would
  # otherwise give an infinite loss.
  crossEntropy = -torch.log(target_probs.clamp_min(1e-12))
  loss = crossEntropy.masked_select(mask).mean()
  return loss, nTotal.item()

# compute the masked loss of one decoding step and fold it into the running totals
def calc_loss(decoder_output, target_variable, mask, loss, print_losses, n_totals):
  """Return `(loss + step loss, print_losses with the step's summed loss appended, n_totals + step count)`."""
  step_loss, step_count = maskNLLLoss(decoder_output, target_variable, mask)
  # store the step's summed (not mean) loss so the caller can average per token
  print_losses.append(step_loss.item() * step_count)
  updated_loss = loss + step_loss
  updated_count = n_totals + step_count
  return updated_loss, print_losses, updated_count

def train(enc, enc_lens, enc_mask, dec, dec_mask, max_dec_len, encoder, decoder, sos_idx, eos_idx, enc_optim, dec_optim, clip, teacher_forcing_ratio=1.0):
  """Run one optimization step on a single batch and return its average per-token loss.

  The batch tensors are moved to `device`, the encoder is run once, and the
  decoder is then stepped `max_dec_len` times starting from `sos_idx`. With
  probability `teacher_forcing_ratio` the whole batch is decoded with teacher
  forcing (the ground-truth token is fed as the next input); otherwise the
  decoder's own argmax prediction is fed back. `eos_idx` is accepted but unused.
  """
  enc_optim.zero_grad()
  dec_optim.zero_grad()

  enc, dec = enc.to(device), dec.to(device)
  enc_mask, dec_mask = enc_mask.to(device), dec_mask.to(device)

  loss, print_losses, n_totals = 0, [], 0

  encoder_outputs, encoder_hidden = encoder(enc, enc_lens)

  # one SOS token per sentence in the batch, shaped (1 x batch_size)
  decoder_input = torch.LongTensor([[sos_idx]*dec.shape[1]]).to(device)

  # seed the decoder with the first `n_layers` slices of the encoder's final hidden state
  # NOTE(review): for a bidirectional GRU the hidden state stacks every
  # layer/direction pair, so this slice may not be "the forward direction of
  # each layer" — confirm the intended layout.
  decoder_hidden = encoder_hidden[:decoder.n_layers]

  # decided once per batch, not per time step
  use_teacher_forcing = random.random() < teacher_forcing_ratio

  for t in range(max_dec_len):
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs, enc_mask)

    if use_teacher_forcing:
      # next input is the current ground-truth token, with a leading step axis
      decoder_input = dec[t].unsqueeze(0)
    else:
      # next input is the decoder's own most likely token
      decoder_input = decoder_output.argmax(1).unsqueeze(0).to(device)

    loss, print_losses, n_totals = calc_loss(decoder_output, dec[t], dec_mask[t], loss, print_losses, n_totals)

  loss.backward()

  # clip each model's gradient norm before the parameter update
  nn.utils.clip_grad_norm_(encoder.parameters(), clip)
  nn.utils.clip_grad_norm_(decoder.parameters(), clip)

  enc_optim.step()
  dec_optim.step()

  return sum(print_losses) / n_totals

In [None]:
len(train_dl.dataset.out_voc)+1

7602

In [None]:
import torch.optim as optim

def training_loop(epoches, steps, from_pretrained=None):
  """Build the encoder/decoder, train them on `train_dl`, save and return them.

  Args:
    epoches: number of passes over `train_dl`.
    steps: maximum number of batches processed per epoch.
    from_pretrained: optional `(encoder_path, decoder_path)` pair of state-dict
      files to load before training; None trains from scratch.

  Returns:
    (encoder, decoder) after training. Their state dicts are also written to
    'encoder.pth' and 'decoder.pth' in the working directory.
  """
  # NOTE(review): presumably the BOS/EOS indices of the Vocab class — confirm against vocab.py
  bos_idx, eos_idx = 0, 1
  # one extra index beyond each vocabulary
  enc_vocab_size = len(train_dl.dataset.out_voc)+1
  dec_vocab_size = len(train_dl.dataset.input_voc)+1

  encoder = Encoder(enc_vocab_size, 64, 128, 2, 0.1)
  decoder = Decoder(dec_vocab_size, dec_vocab_size, 64, 128, 2, 0.1)

  if from_pretrained is not None:
    encoder_path, decoder_path = from_pretrained
    # torch.load unpickles the file: only load checkpoints you trust.
    encoder.load_state_dict(torch.load(encoder_path, map_location='cpu'))
    decoder.load_state_dict(torch.load(decoder_path, map_location='cpu'))

  #wandb.watch(encoder, log_freq=100)
  #wandb.watch(decoder, log_freq=100)

  encoder, decoder = encoder.to(device), decoder.to(device)

  enc_optim, dec_optim = optim.Adam(encoder.parameters(), lr=1.0e-3), optim.Adam(decoder.parameters(), lr=1.0e-3)

  clip = 50.0

  for i in range(epoches):
    epoch_loss = 0
    batches_done = 0
    pbar = tqdm_notebook(train_dl, total=steps)
    for enc, enc_lens, enc_mask, dec, dec_mask, max_seq in pbar:
      if batches_done >= steps: break
      loss = train(enc, enc_lens, enc_mask, dec, dec_mask, max_seq, encoder, decoder, bos_idx, eos_idx, enc_optim, dec_optim, clip)
      pbar.set_description(f'Batch Loss: {round(loss, 4)} ')
      epoch_loss += loss
      batches_done += 1

    # average over the batches actually processed, which can be fewer than
    # `steps` when the loader is exhausted first
    print(f'Epoch Loss: {epoch_loss/max(batches_done, 1)} ')

  torch.save(encoder.state_dict(), 'encoder.pth')
  torch.save(decoder.state_dict(), 'decoder.pth')

  return encoder, decoder

In [None]:
epoches = 10
steps = len(train_dl)
pretrained = ['/content/drive/MyDrive/ColabNotebooks/enfrdata/encoder.pth', '/content/drive/MyDrive/ColabNotebooks/enfrdata/decoder.pth']

encoder, decoder = training_loop(epoches, steps, None)

  0%|          | 0/2196 [00:00<?, ?it/s]

Epoch Loss: 1.8388440699596453 


  0%|          | 0/2196 [00:00<?, ?it/s]

Epoch Loss: 1.2045869885697016 


  0%|          | 0/2196 [00:00<?, ?it/s]

Epoch Loss: 0.9938771019347727 


  0%|          | 0/2196 [00:00<?, ?it/s]

Epoch Loss: 0.8594231837008587 


  0%|          | 0/2196 [00:00<?, ?it/s]

Epoch Loss: 0.773511002404895 


  0%|          | 0/2196 [00:00<?, ?it/s]

Epoch Loss: 0.707133879475259 


  0%|          | 0/2196 [00:00<?, ?it/s]

Epoch Loss: 0.657708172345789 


  0%|          | 0/2196 [00:00<?, ?it/s]

Epoch Loss: 0.6174901548989689 


  0%|          | 0/2196 [00:00<?, ?it/s]

Epoch Loss: 0.5853496421968615 


  0%|          | 0/2196 [00:00<?, ?it/s]

Epoch Loss: 0.5567056011196951 


In [None]:
want_move = True

if want_move:
  ! mv decoder.pth /content/drive/MyDrive/ColabNotebooks/enfrdata/
  ! mv encocer.pth /content/drive/MyDrive/ColabNotebooks/enfrdata/

In [None]:
encoder

Encoder(
  (embedding): Embedding(7602, 64)
  (dropout): Dropout(p=0.1, inplace=False)
  (rnn): GRU(64, 128, num_layers=2, dropout=0.1, bidirectional=True)
)

In [None]:
decoder

Decoder(
  (embedding): Embedding(7902, 64)
  (gru): GRU(64, 128, num_layers=2, dropout=0.1)
  (concat): Linear(in_features=256, out_features=128, bias=True)
  (out): Linear(in_features=128, out_features=7902, bias=True)
  (attn): Attention(
    (attn): Linear(in_features=128, out_features=128, bias=True)
  )
)

In [None]:
def get_params(model):
  """Return the total number of elements across the parameters of `model` that require gradients.

  Uses `numel()`, so parameters of any rank (including 0-dim scalars) are counted.
  """
  return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [None]:
# Report the trainable parameter count of each model under its own label.
print(f'Encoder Trainable Params: {get_params(encoder)}' f'\nDecoder Trainable Params: {get_params(decoder)}')

Encoder Trainable Params: 931968
Encoder Trainable Params: 1748062


In [None]:
class GreedySearchDecoder(nn.Module):
  """Greedy decoding: at every step feed back the decoder's most likely token.

  Args:
    encoder: module called as `encoder(input_seq, input_length)`.
    decoder: module called as `decoder(input, hidden, encoder_outputs, enc_mask)`
      and exposing `n_layers`.
    sos_idx: token id used as the first decoder input.
    batch_size: kept for interface compatibility; forward() uses the batch size
      of the input it actually receives.
  """
  def __init__(self, encoder, decoder, sos_idx, batch_size):
    super().__init__()
    self.encoder, self.decoder = encoder, decoder
    self.sos_idx = sos_idx
    self.batch_size = batch_size

  def forward(self, input_seq, input_length, max_length, enc_mask):
    """Decode `max_length` steps for a (seq_len x batch) input.

    Returns `(all_tokens, all_scores)`, each of shape (max_length x batch):
    the chosen token ids and their probabilities.
    """
    # work on the device of the input; callers move `input_seq` to the model's device
    device = input_seq.device
    batch_size = input_seq.size(1)
    enc_mask = enc_mask.to(device)

    # get encoder out and hidden
    encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
    # use the wrapped decoder rather than any same-named global variable
    decoder_hidden = encoder_hidden[:self.decoder.n_layers]

    # init decoder input with sos_idx, one per sequence in the batch
    decoder_input = torch.full((1, batch_size), self.sos_idx, device=device, dtype=torch.long)

    # setup output predictions
    all_tokens = torch.zeros((0, batch_size), device=device, dtype=torch.long)
    all_scores = torch.zeros((0, batch_size), device=device)

    with torch.no_grad():
      for _ in range(max_length):
        decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs, enc_mask)

        # get most likely word token and its softmax score
        decoder_scores, decoder_input = torch.max(decoder_output, dim=1)

        # record token and score
        all_tokens = torch.cat((all_tokens, decoder_input.unsqueeze(dim=0)), dim=0)
        all_scores = torch.cat((all_scores, decoder_scores.unsqueeze(dim=0)), dim=0)

        # the chosen token becomes the next decoder input (add the step axis back)
        decoder_input = torch.unsqueeze(decoder_input, 0)

    return all_tokens, all_scores

In [None]:
def remove_specials(tensors):
  """Return one list of token ids per column of `tensors`, cut just before the first id equal to 1.

  Columns that contain no 1 are returned in full.
  """
  trimmed = []
  for column in tensors.t():
    hits = (column == 1).nonzero(as_tuple=True)[0]
    # no occurrence of 1: keep the whole column
    cutoff = hits[0] if hits.numel() else column.size(0)
    trimmed.append(column[:cutoff].tolist())

  return trimmed

In [None]:
from torchtext.data.metrics import bleu_score

def evaluate(valid, encoder, decoder, steps):
  """Greedy-decode up to `steps` batches from `valid` and return their mean BLEU score.

  Both models are switched to eval mode and are left in it. Decoded and
  reference sequences are cut at the first token id 1 and converted to text
  with `inp_voc`. The mean is taken over the batches actually scored; 0.0 is
  returned when no batch was scored.
  """
  encoder.eval()
  decoder.eval()

  searcher = GreedySearchDecoder(encoder, decoder, 0, 64).to(device)

  total_bleu = 0
  evaluated = 0

  pbar = tqdm_notebook(valid, total=steps)
  for enc, enc_lens, enc_mask, dec, dec_mask, max_seq in pbar:
    # the searcher above is built for batches of 64; stop at a short batch
    if enc.shape[1] < 64: break
    if evaluated >= steps: break

    all_tokens, all_scores = searcher(enc.to(device), enc_lens, max_seq, enc_mask)

    candidate = [inp_voc.to_lines([x])[0].split(' ') for x in remove_specials(all_tokens)]
    reference = [[inp_voc.to_lines([x])[0].split(' ')] for x in remove_specials(dec)]

    score = bleu_score(candidate, reference)
    pbar.set_description(f'Bleu: {round(score, 8)} ')

    total_bleu += score
    evaluated += 1

  # divide by the batches scored, which can be fewer than `steps` when the
  # loop stops early
  return total_bleu / evaluated if evaluated else 0.0

In [None]:
evaluate(valid_dl, encoder, decoder, 200)

  0%|          | 0/200 [00:00<?, ?it/s]

0.35670901023437923

In [None]:
# NOTE(review): `a` is not defined in any cell visible here — presumably a token
# tensor left over from an earlier interactive run; confirm or replace it before re-running.
inp_voc.to_lines([remove_specials(a)[0]])

["vous m ' y prie ici ."]

In [None]:
#f'Bleu: {evaluate(valid_dl, encoder, decoder, 200)}'

In [None]:
search = GreedySearchDecoder(encoder, decoder, 0, 64).to(device)

enc, enc_lens, enc_mask, dec, _, max_seq = next(iter(valid_dl))
all_tokens, all_scores = search(enc.to(device), enc_lens, max_seq, enc_mask)

In [None]:
pred_fr = remove_specials(all_tokens)
true_fr = remove_specials(dec)
true_en = remove_specials(enc)
inst = 8
assert inst < 64

print(f'Pred FR: {inp_voc.to_lines([pred_fr[inst]])}')
print(f'True FR: {inp_voc.to_lines([true_fr[inst]])}')
print(f'True EN: {out_voc.to_lines([true_en[inst]])}')

Pred FR: ['combien de fois par jour nourris - tu par ton chien ?']
True FR: ['combien de fois par jour nourri@@ ssez - vous votre chien ?']
True EN: ['how many times a day do you feed your dog ?']


In [None]:
out_voc.to_lines([true_en[inst]])

["it ' s hard to understand you ."]

In [None]:
data[data['English words/sentences'] == out_voc.to_lines([true_en[inst]])[0]]

Unnamed: 0,English words/sentences,French words/sentences
