<a href="https://colab.research.google.com/github/dude123studios/AdvancedDeepLearning/blob/main/Seq2SeqTranslation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
import numpy as np
import re
import tensorflow as tf
import os
import unicodedata

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
if not os.path.exists('./datasets'):
  os.mkdir('./datasets')

The book, Deep Learning with tensorflow 2 and keras by PACKT, did not have an up to date version. This is my own way to extract the dataset

In [2]:
!wget  -P ./datasets https://www.manythings.org/anki/fra-eng.zip
!unzip ./datasets/fra-eng.zip -d ./datasets

--2021-02-10 16:39:05--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.21.55.222, 172.67.173.198, 2606:4700:3036::ac43:adc6, ...
Connecting to www.manythings.org (www.manythings.org)|104.21.55.222|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6281268 (6.0M) [application/zip]
Saving to: ‘./datasets/fra-eng.zip’


2021-02-10 16:39:06 (9.22 MB/s) - ‘./datasets/fra-eng.zip’ saved [6281268/6281268]

Archive:  ./datasets/fra-eng.zip
  inflating: ./datasets/_about.txt   
  inflating: ./datasets/fra.txt      


In [3]:
def preprocess_sentence(sent):
  sent = ''.join([c for c in unicodedata.normalize('NFD', sent)])
  sent = re.sub(r'([!.?])', r' \1', sent)
  sent = re.sub(r'[^a-zA-Z!.?]+', r' ', sent)
  sent = re.sub(r'\s+', r' ', sent)
  sent = sent.lower()
  return sent

In [4]:
def download_and_read(num_pairs=50000):
    en_sents, fr_sents_in, fr_sents_out = [], [], []
    local_file = os.path.join("datasets", "fra.txt")
    with open(local_file, "r") as fin:
        for i, line in enumerate(fin):
            en_sent, fr_sent = line.strip().split('CC-BY')[0].strip().split('\t')
            en_sent = [w for w in preprocess_sentence(en_sent).split()]
            fr_sent = preprocess_sentence(fr_sent)
            fr_sent_in = [w for w in ("BOS " + fr_sent).split()]
            fr_sent_out = [w for w in (fr_sent + " EOS").split()]
            en_sents.append(en_sent)
            fr_sents_in.append(fr_sent_in)
            fr_sents_out.append(fr_sent_out)
            if i >= num_pairs-1:
              break
    return en_sents, fr_sents_in, fr_sents_out

In [5]:
sents_en, sents_fr_in, sents_fr_out = download_and_read()

tokenizer_en = tf.keras.preprocessing.text.Tokenizer(
    filters='',lower=False)
tokenizer_en.fit_on_texts(sents_en)
data_en = tokenizer_en.texts_to_sequences(sents_en)
data_en = tf.keras.preprocessing.sequence.pad_sequences(
    data_en, padding='post')

tokenizer_fr = tf.keras.preprocessing.text.Tokenizer(
    filters="", lower=False)
tokenizer_fr.fit_on_texts(sents_fr_in)
tokenizer_fr.fit_on_texts(sents_fr_out)
data_fr_in = tokenizer_fr.texts_to_sequences(sents_fr_in)
data_fr_in = tf.keras.preprocessing.sequence.pad_sequences(data_fr_in, padding="post")
data_fr_out = tokenizer_fr.texts_to_sequences(sents_fr_out)
data_fr_out = tf.keras.preprocessing.sequence.pad_sequences(data_fr_out, padding="post")

vocab_size_en = len(tokenizer_en.word_index)
vocab_size_fr = len(tokenizer_fr.word_index)

word2idx_en = tokenizer_en.word_index
idx2word_en = {v:k for k,v in word2idx_en.items()}

word2idx_fr = tokenizer_fr.word_index
idx2word_fr = {v:k for k,v in word2idx_fr.items()}

print('english vocabulary: ', str (vocab_size_en))
print('french vocabulary: ', str (vocab_size_fr))

maxlen_en = data_en.shape[1]
maxlen_fr = data_fr_out.shape[1]
print('The maximum english length is: {:d} '.format(maxlen_en) +   
  'and the maximum french length is: {:d}'.format(maxlen_fr))

english vocabulary:  5889
french vocabulary:  9017
The maximum english length is: 8 and the maximum french length is: 18


In [6]:
batch_size = 64
dataset = tf.data.Dataset.from_tensor_slices(
    (data_en, data_fr_in, data_fr_out))
dataset = dataset.shuffle(10000)
test_size = 5000 
test_dataset = dataset.take(test_size).batch(batch_size, drop_remainder=True)
train_dataset = dataset.skip(test_size).batch(batch_size, drop_remainder=True)

In [7]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, num_timesteps, 
            encoder_dim, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.encoder_dim = encoder_dim
        self.embedding = tf.keras.layers.Embedding(
            vocab_size, embedding_dim, input_length=num_timesteps)
        self.rnn = tf.keras.layers.GRU(
            encoder_dim, return_sequences=False, return_state=True)

    def call(self, x, state):
        x = self.embedding(x)
        x, state = self.rnn(x, initial_state=state)
        return x, state

    def init_state(self, batch_size):
        return tf.zeros((batch_size, self.encoder_dim))


class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, num_timesteps,
            decoder_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.decoder_dim = decoder_dim
        self.embedding = tf.keras.layers.Embedding(
            vocab_size, embedding_dim, input_length=num_timesteps)
        self.rnn = tf.keras.layers.GRU(
            decoder_dim, return_sequences=True, return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, x, state):
        x = self.embedding(x)
        x, state = self.rnn(x, state)
        x = self.dense(x)
        return x, state


embedding_dim = 128
encoder_dim, decoder_dim = 512, 512

In [8]:
def build_graph():
  encoder = Encoder(vocab_size_en+1,embedding_dim, maxlen_en, encoder_dim)
  decoder = Decoder(vocab_size_fr+1, embedding_dim, maxlen_fr, decoder_dim)
  return encoder, decoder

In [9]:
  def loss_fn(ytrue, ypred):
    scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    mask = tf.math.logical_not(tf.math.equal(ytrue, 0))
    mask = tf.cast(mask, dtype=tf.int64)
    loss = scce(ytrue, ypred, sample_weight=mask)
    return loss

In [10]:
optimizer = tf.keras.optimizers.Adam()


@tf.function
def train_step(encoder, decoder, encoder_in, decoder_in, decoder_out, encoder_state):
  with tf.GradientTape() as tape:
    encoder_out, encoder_state = encoder(encoder_in, encoder_state)
    decoder_state = encoder_state
    decoder_pred, decoder_state = decoder(decoder_in, decoder_state)
    loss = loss_fn(decoder_out, decoder_pred)
    
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))
  return loss

In [11]:
def predict(encoder, decoder, sents_en, data_en,
            sents_fr_out, word2idx_fr, idx2word_fr):
  random_id = np.random.choice(len(sents_en))
  print('input   : ', ' '.join(sents_en[random_id]))
  print('label   : ', ' '.join(sents_fr_out[random_id]))
  encoder_in = tf.expand_dims(data_en[random_id],axis=0)
  decoder_out = tf.expand_dims(sents_fr_out[random_id], axis=0)
  encoder_state = encoder.init_state(1)
  encoder_out, encoder_state = encoder(encoder_in, encoder_state)
  decoder_state = encoder_state
  decoder_in = tf.expand_dims(tf.constant([word2idx_fr['BOS']]), axis=0)
  pred_sent_fr = []
  while True:
    decoder_pred, decoder_state = decoder(decoder_in, decoder_state)
    decoder_pred = tf.argmax(decoder_pred, axis=-1)
    pred_word = idx2word_fr[decoder_pred.numpy()[0][0]]
    pred_sent_fr.append(pred_word)
    if pred_word == 'EOS':
      break
    decoder_in = decoder_pred
  
  print('predicted   : ', ' '.join(pred_sent_fr[0:-1]))

In [12]:
def evaluate_bleu_score(encoder, decoder, test_dataset, 
        word2idx_fr, idx2word_fr):

    bleu_scores = []
    smooth_fn = SmoothingFunction()
    for encoder_in, decoder_in, decoder_out in test_dataset:
        encoder_state = encoder.init_state(batch_size)
        encoder_out, encoder_state = encoder(encoder_in, encoder_state)
        decoder_state = encoder_state
        decoder_pred, decoder_state = decoder(decoder_in, decoder_state)

        # compute argmax
        decoder_out = decoder_out.numpy()
        decoder_pred = tf.argmax(decoder_pred, axis=-1).numpy()

        for i in range(decoder_out.shape[0]):
            ref_sent = [idx2word_fr[j] for j in decoder_out[i].tolist() if j > 0]
            hyp_sent = [idx2word_fr[j] for j in decoder_pred[i].tolist() if j > 0]
            # remove trailing EOS
            ref_sent = ref_sent[0:-1]
            hyp_sent = hyp_sent[0:-1]
            bleu_score = sentence_bleu([ref_sent], hyp_sent, 
                smoothing_function=smooth_fn.method1)
            bleu_scores.append(bleu_score)

    return np.mean(np.array(bleu_scores))

This will take up to an hour

In [13]:
encoder, decoder = build_graph()
if not os.path.exists('./checkpoints'):
  os.mkdir('./checkpoints')
num_epochs = 200
for e in range(1,num_epochs+1):
  encoder_state = encoder.init_state(batch_size)
  for batch, data in enumerate(train_dataset):
    encoder_in, decoder_in, decoder_out = data 
    loss = train_step(encoder, decoder, encoder_in, decoder_in, decoder_out, encoder_state)
  eval_score = evaluate_bleu_score(encoder, decoder,test_dataset,word2idx_fr, idx2word_fr)
  print('Epoch: {}/{}, Loss: {:.4f}, Eval Score: {:.3e}'.format(e,num_epochs, loss.numpy(),eval_score))
  if e % 50 == 0:
    print('SAVING CHECKPOINT {} ...'.format(e // 50))
    encoder.save_weights('encoder_ckpt_{}.h5'.format(str (e // 50)))
    decoder.save_weights('decoder_ckpt_{}.h5'.format(str (e // 50)))
    print('COMPLETED SAVE, PREDICTING TEXT \n')
    predict(encoder, decoder, sents_en, data_en, sents_fr_out,word2idx_fr, idx2word_fr)

Epoch: 1/200, Loss: 1.4331, Eval Score: 1.927e-02
Epoch: 2/200, Loss: 0.8677, Eval Score: 2.852e-02
Epoch: 3/200, Loss: 0.7526, Eval Score: 3.501e-02
Epoch: 4/200, Loss: 0.6137, Eval Score: 4.214e-02
Epoch: 5/200, Loss: 0.4833, Eval Score: 5.248e-02
Epoch: 6/200, Loss: 0.3719, Eval Score: 6.297e-02
Epoch: 7/200, Loss: 0.3488, Eval Score: 7.494e-02
Epoch: 8/200, Loss: 0.2760, Eval Score: 8.433e-02
Epoch: 9/200, Loss: 0.2142, Eval Score: 9.337e-02
Epoch: 10/200, Loss: 0.2087, Eval Score: 1.031e-01
Epoch: 11/200, Loss: 0.1705, Eval Score: 1.110e-01
Epoch: 12/200, Loss: 0.1683, Eval Score: 1.186e-01
Epoch: 13/200, Loss: 0.1550, Eval Score: 1.216e-01
Epoch: 14/200, Loss: 0.1591, Eval Score: 1.255e-01
Epoch: 15/200, Loss: 0.1526, Eval Score: 1.276e-01
Epoch: 16/200, Loss: 0.1511, Eval Score: 1.330e-01
Epoch: 17/200, Loss: 0.1455, Eval Score: 1.368e-01
Epoch: 18/200, Loss: 0.1145, Eval Score: 1.382e-01
Epoch: 19/200, Loss: 0.1113, Eval Score: 1.387e-01
Epoch: 20/200, Loss: 0.1210, Eval Score: