In [1]:
#Building a NLP NMT English to German using RNN technique as well as Attention mechanism

In [8]:
import tensorflow as tf
from pathlib import Path


url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file("spa-eng.zip", origin=url, cache_dir="datasets",
                               extract=True)
text = (Path(path).with_name("spa-eng")/"spa.txt").read_text()


# import requests
# import tensorflow as tf
# from pathlib import Path
# url = 'https://www.manythings.org/anki/deu-eng.zip'
# headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# r = requests.get(url, headers=headers)
# # save the file:
# # path = tf.keras.utils.get_file("deu-eng.zip", origin=url, extract=True)
# open('my_file.zip', 'wb').write(r.content)
# # text = (Path(path).with_name("deu-eng")/"deu.txt").read_text()


In [9]:
# open('my_file.zip')

In [10]:
import tensorflow as tf
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [11]:
#Now we shall remove some specific tokens present only in spanish language like inverted exclamation marks, etc
import numpy as np

# text = text.replace("¡", "").replace("¿", "")
pairs = [line.split("\t") for line in text.splitlines()]
np.random.seed(42)  # extra code – ensures reproducibility on CPU
np.random.shuffle(pairs)
sentences_en, sentences_es = zip(*pairs)  # separates the pairs into 2 lists

In [12]:
#Creating Textvectorization layets, one per language
vocab_size = 1000
max_length = 50
text_vec_layer_en = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length
)
text_vec_layer_es = tf.keras.layers.TextVectorization(
    vocab_size, output_sequence_length=max_length
)
text_vec_layer_en.adapt(sentences_en)
text_vec_layer_es.adapt([f"startofseq {s} endofseq" for s in sentences_es])

In [13]:
text_vec_layer_en.get_vocabulary()[:10]

['', '[UNK]', 'the', 'i', 'to', 'you', 'tom', 'a', 'is', 'he']

In [14]:
text_vec_layer_es.get_vocabulary()[:10]

['', '[UNK]', 'startofseq', 'endofseq', 'de', 'que', 'a', 'no', 'tom', 'la']

In [15]:
X_train = tf.constant(sentences_en[:100_000])
X_valid = tf.constant(sentences_en[100_000:])
X_train_dec = tf.constant([f"startofseq {s}" for s in sentences_es[:100_000]])
X_valid_dec = tf.constant([f"startofseq {s}" for s in sentences_es[100_000:]])
Y_train = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[:100_000]])
Y_valid = text_vec_layer_es([f"{s} endofseq" for s in sentences_es[100_000:]])

In [16]:
tf.random.set_seed(42)
encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)

In [17]:
#Encoding the sentences and embedding layer for each language
embed_size = 128
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)
decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True)

encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

In [18]:
encoder = tf.keras.layers.LSTM(512, return_state=True)
encoder_outputs, *encoder_state = encoder(encoder_embeddings)

In [19]:
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

In [20]:
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(decoder_outputs)

In [21]:
output_layer

<keras.layers.core.dense.Dense at 0x236c844ab60>

In [22]:
#Lets create a model now
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=[Y_proba])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
model.fit((X_train, X_train_dec), Y_train, epochs=5, validation_data=((X_valid, X_valid_dec), Y_valid))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x236d6335fc0>

In [23]:
#Now we can not use model.predict here as the decoder also expects inputs from previous predictions
#Now we shall make a fucntion to feed the decoder as well

def translate(sentence_en):
  translation = ""
  for word_idx in range(max_length):
    X = np.array([sentence_en]) #encoder input
    X_dec = np.array(["startofseq " + translation])
    y_proba = model.predict((X, X_dec))[0, word_idx]
    predicted_word_id = np.argmax(y_proba)
    predicted_word = text_vec_layer_es.get_vocabulary()[predicted_word_id]
    if predicted_word == "endofseq":
      break
    translation += " " + predicted_word
  return translation.strip()


In [24]:
translate("This is love.")



'esto es amor'

In [25]:
# extra code – a basic implementation of beam search

def beam_search(sentence_en, beam_width, verbose=False):
    X = np.array([sentence_en])  # encoder input
    X_dec = np.array(["startofseq"])  # decoder input
    y_proba = model.predict((X, X_dec))[0, 0]  # first token's probas
    top_k = tf.math.top_k(y_proba, k=beam_width)
    top_translations = [  # list of best (log_proba, translation)
        (np.log(word_proba), text_vec_layer_es.get_vocabulary()[word_id])
        for word_proba, word_id in zip(top_k.values, top_k.indices)
    ]
    
    # extra code – displays the top first words in verbose mode
    if verbose:
        print("Top first words:", top_translations)

    for idx in range(1, max_length):
        candidates = []
        for log_proba, translation in top_translations:
            if translation.endswith("endofseq"):
                candidates.append((log_proba, translation))
                continue  # translation is finished, so don't try to extend it
            X = np.array([sentence_en])  # encoder input
            X_dec = np.array(["startofseq " + translation])  # decoder input
            y_proba = model.predict((X, X_dec))[0, idx]  # last token's proba
            for word_id, word_proba in enumerate(y_proba):
                word = text_vec_layer_es.get_vocabulary()[word_id]
                candidates.append((log_proba + np.log(word_proba),
                                   f"{translation} {word}"))
        top_translations = sorted(candidates, reverse=True)[:beam_width]

        # extra code – displays the top translation so far in verbose mode
        if verbose:
            print("Top translations so far:", top_translations)

        if all([tr.endswith("endofseq") for _, tr in top_translations]):
            return top_translations[0][1].replace("endofseq", "").strip()



In [26]:
# extra code – shows how the model making an error
sentence_en = "I love cats and dogs"
translate(sentence_en)



'amo los gatos y gatos'

In [27]:

# extra code – shows how beam search can help
beam_search(sentence_en, beam_width=3, verbose=True)

Top first words: [(-1.1666902, 'amo'), (-1.3326418, 'me'), (-2.1916459, '[UNK]')]
Top translations so far: [(-1.4207265, 'amo los'), (-1.7773826, 'me encanta'), (-2.64506, '[UNK] los')]
Top translations so far: [(-1.7322148, 'amo los gatos'), (-2.4327912, 'me encanta el'), (-2.9497573, '[UNK] los gatos')]
Top translations so far: [(-2.229721, 'amo los gatos y'), (-3.1193464, 'me encanta el amor'), (-3.3642082, 'amo los gatos de')]
Top translations so far: [(-3.197341, 'amo los gatos y gatos'), (-3.4155998, 'me encanta el amor y'), (-3.830542, 'amo los gatos de amor')]
Top translations so far: [(-3.3064783, 'amo los gatos y gatos endofseq'), (-4.3385696, 'me encanta el amor y a'), (-4.806133, 'amo los gatos de amor a')]
Top translations so far: [(-3.3064783, 'amo los gatos y gatos endofseq'), (-4.732403, 'me encanta el amor y a los'), (-6.0797, 'amo los gatos de amor a los')]
Top translations so far: [(-3.3064783, 'amo los gatos y gatos endofseq'), (-5.32454, 'me encanta el amor y a los

'amo los gatos y gatos'

# Attention mechanism

In [28]:
tf.random.set_seed(42)  # extra code – ensures reproducibility on CPU
encoder = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(256, return_sequences=True, return_state=True))

In [29]:
# extra code – this part of the model is exactly the same as earlier
encoder_outputs, *encoder_state = encoder(encoder_embeddings)
encoder_state = [tf.concat(encoder_state[::2], axis=-1),  # short-term (0 & 2)
                 tf.concat(encoder_state[1::2], axis=-1)]  # long-term (1 & 3)
decoder = tf.keras.layers.LSTM(512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

In [30]:
attention_layer = tf.keras.layers.Attention()
attention_outputs = attention_layer([decoder_outputs, encoder_outputs])
output_layer = tf.keras.layers.Dense(vocab_size, activation="softmax")
Y_proba = output_layer(attention_outputs)

In [32]:
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=[Y_proba])
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
model.fit((X_train, X_train_dec), Y_train, epochs=5,
          validation_data=((X_valid, X_valid_dec), Y_valid))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x237b1e1f100>

In [33]:

translate("I like soccer and also going to the beach")



'me gusta el fútbol y también ir a la playa'

In [34]:
beam_search("I like soccer and also going to the beach", beam_width=3,
            verbose=True)

Top first words: [(-0.31683892, 'me'), (-2.3549259, 'yo'), (-2.8721812, 'prefiero')]
Top translations so far: [(-0.32778606, 'me gusta'), (-2.5895562, 'yo me'), (-3.2702641, 'prefiero fútbol')]
Top translations so far: [(-0.8119446, 'me gusta el'), (-2.6010826, 'yo me gusta'), (-2.7695754, 'me gusta la')]
Top translations so far: [(-0.8122299, 'me gusta el fútbol'), (-2.7739117, 'me gusta la fútbol'), (-3.1109324, 'yo me gusta el')]
Top translations so far: [(-0.81692344, 'me gusta el fútbol y'), (-2.7775795, 'me gusta la fútbol y'), (-3.111212, 'yo me gusta el fútbol')]
Top translations so far: [(-1.268604, 'me gusta el fútbol y también'), (-2.9715037, 'me gusta la fútbol y también'), (-3.1160407, 'yo me gusta el fútbol y')]
Top translations so far: [(-2.3784423, 'me gusta el fútbol y también ir'), (-3.5956879, 'me gusta el fútbol y también va'), (-3.6387482, 'yo me gusta el fútbol y también')]
Top translations so far: [(-2.4520779, 'me gusta el fútbol y también ir a'), (-3.6538622, '

'me gusta el fútbol y también ir a la playa'