# Neural Machine Translation using Encoders and Decoders

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM

# Upper bound on the number of sentence pairs read from the corpus;
# the loading cell takes at most this many lines from the file
num_samples = 20000 

## Reading the data

In [2]:
input_texts = []
target_texts = []

input_characters = set()
target_characters = set()

# The corpus is a plain-text file holding one tab-separated sentence pair per line
with open("deu-eng/deu.txt", mode = 'r', encoding = 'utf-8') as f:
    lines = f.read().split("\n")

# 'len(lines) - 1' leaves out the last element of the split, which is an empty
# string whenever the file ends with a newline
for line in lines[: min(num_samples, len(lines) - 1)]:
    # The first two columns are the input and the target sentence; any further
    # columns are ignored. Slicing (instead of unpacking exactly three values)
    # keeps the loader working when a line carries no third column.
    input_text, target_text = line.split("\t")[:2]

    # '\t' is used as the "start sequence" character for the targets,
    # and '\n' as the "end sequence" character
    target_text = "\t" + target_text + "\n"
    input_texts.append(input_text)
    target_texts.append(target_text)

    # Collect the set of unique characters of both languages;
    # set.update() adds every character of the string and ignores duplicates
    input_characters.update(input_text)
    target_characters.update(target_text)

In [3]:
# Sanity check: both lists must hold one entry per sentence pair
len(input_texts), len(target_texts)

(20000, 20000)

In [4]:
# Number of distinct characters seen on the input and on the target side
len(input_characters), len(target_characters)

(71, 89)

In [5]:
# Freeze both vocabularies into sorted lists so that every character
# gets a stable position
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)

# Vocabulary size on the encoder and on the decoder side
num_encoder_tokens, num_decoder_tokens = len(input_characters), len(target_characters)

# Longest input sentence [encoder] and longest target sentence [decoder],
# measured in characters
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

In [6]:
# Dataset summary: sample count, vocabulary sizes and maximum sentence lengths.
# One print call with a newline separator emits one line per entry.
print(
    f"Number of Training Samples:  {len(input_texts)}",
    f"Numer of unique Input [Encoder] Tokens:  {num_encoder_tokens}",
    f"Numer of unique target [Decoder] Tokens:  {num_decoder_tokens}",
    f"Max sequence length for Input [Encoder]:  {max_encoder_seq_length}",
    f"Max sequence length for target [Decoder]:  {max_decoder_seq_length}",
    sep = "\n",
)

Number of Training Samples:  20000
Numer of unique Input [Encoder] Tokens:  71
Numer of unique target [Decoder] Tokens:  89
Max sequence length for Input [Encoder]:  17
Max sequence length for target [Decoder]:  74


## Indexing the Tokens

In [7]:
# Map every character to its position in the sorted vocabulary;
# this position is the character's slot in the one-hot vectors
input_token_index = {char: position for position, char in enumerate(input_characters)}
target_token_index = {char: position for position, char in enumerate(target_characters)}

In [8]:
# Display the character -> index mapping used for the encoder input
input_token_index

{' ': 0,
 '!': 1,
 '"': 2,
 '$': 3,
 '%': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '0': 9,
 '1': 10,
 '2': 11,
 '3': 12,
 '4': 13,
 '5': 14,
 '6': 15,
 '7': 16,
 '8': 17,
 '9': 18,
 ':': 19,
 '?': 20,
 'A': 21,
 'B': 22,
 'C': 23,
 'D': 24,
 'E': 25,
 'F': 26,
 'G': 27,
 'H': 28,
 'I': 29,
 'J': 30,
 'K': 31,
 'L': 32,
 'M': 33,
 'N': 34,
 'O': 35,
 'P': 36,
 'Q': 37,
 'R': 38,
 'S': 39,
 'T': 40,
 'U': 41,
 'V': 42,
 'W': 43,
 'Y': 44,
 'a': 45,
 'b': 46,
 'c': 47,
 'd': 48,
 'e': 49,
 'f': 50,
 'g': 51,
 'h': 52,
 'i': 53,
 'j': 54,
 'k': 55,
 'l': 56,
 'm': 57,
 'n': 58,
 'o': 59,
 'p': 60,
 'q': 61,
 'r': 62,
 's': 63,
 't': 64,
 'u': 65,
 'v': 66,
 'w': 67,
 'x': 68,
 'y': 69,
 'z': 70}

In [9]:
# Display the character -> index mapping used for the decoder;
# it includes the '\t' start marker and the '\n' end marker
target_token_index

{'\t': 0,
 '\n': 1,
 ' ': 2,
 '!': 3,
 '"': 4,
 '$': 5,
 '%': 6,
 "'": 7,
 ',': 8,
 '-': 9,
 '.': 10,
 '0': 11,
 '1': 12,
 '2': 13,
 '3': 14,
 '4': 15,
 '5': 16,
 '6': 17,
 '7': 18,
 '8': 19,
 '9': 20,
 ':': 21,
 '?': 22,
 'A': 23,
 'B': 24,
 'C': 25,
 'D': 26,
 'E': 27,
 'F': 28,
 'G': 29,
 'H': 30,
 'I': 31,
 'J': 32,
 'K': 33,
 'L': 34,
 'M': 35,
 'N': 36,
 'O': 37,
 'P': 38,
 'Q': 39,
 'R': 40,
 'S': 41,
 'T': 42,
 'U': 43,
 'V': 44,
 'W': 45,
 'Y': 46,
 'Z': 47,
 'a': 48,
 'b': 49,
 'c': 50,
 'd': 51,
 'e': 52,
 'f': 53,
 'g': 54,
 'h': 55,
 'i': 56,
 'j': 57,
 'k': 58,
 'l': 59,
 'm': 60,
 'n': 61,
 'o': 62,
 'p': 63,
 'q': 64,
 'r': 65,
 's': 66,
 't': 67,
 'u': 68,
 'v': 69,
 'w': 70,
 'x': 71,
 'y': 72,
 'z': 73,
 '\xa0': 74,
 'Ä': 75,
 'Ö': 76,
 'Ü': 77,
 'ß': 78,
 'ä': 79,
 'é': 80,
 'ö': 81,
 'ü': 82,
 'ō': 83,
 '–': 84,
 '’': 85,
 '“': 86,
 '„': 87,
 '\u202f': 88}

## One-hot Representation
Each sentence pair is turned into one-hot vectors stored in three NumPy arrays: `encoder_input_data`, `decoder_input_data` and `decoder_target_data`.

In [10]:
"""'encoder_input_data' is a 3D array of shape - (num_pairs, max_english_sentence_length, num_english_characters), 
   containing a one-hot vectorization of the English sentences.
"""
encoder_input_data = np.zeros(
    shape = (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype = np.float32,
)

# 'decoder_input_data' is a 3D array of shape
# (num_pairs, max_german_sentence_length, num_german_characters),
# containing a one-hot vectorization of the German sentences.
decoder_input_data = np.zeros(
    shape = (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype = np.float32,
)

# 'decoder_target_data' is the same as 'decoder_input_data' but offset by one timestep:
# decoder_target_data[:, t, :] will be the same as decoder_input_data[:, t + 1, :].
decoder_target_data = np.zeros(
    shape = (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype = np.float32,
)

In [11]:
# Shapes are (samples, timesteps, vocabulary size) for each of the three arrays
encoder_input_data.shape, decoder_input_data.shape, decoder_target_data.shape

((20000, 17, 71), (20000, 74, 89), (20000, 74, 89))

In [12]:
# Fill the one-hot arrays sentence by sentence. Every position after the end of a
# sentence is padded with the one-hot vector of the space character.
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    # The padding offset comes from len() rather than from the loop variable 't':
    # for an empty sentence the loop body never runs, so 't' would be undefined or
    # still hold the value left over from the previous sample
    encoder_input_data[i, len(input_text):, input_token_index[' ']] = 1.

    for t, char in enumerate(target_text):
        # 'decoder_target_data' is ahead of 'decoder_input_data' by 1 time-step
        # Hence, it does not include the starting character
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.
    decoder_input_data[i, len(target_text):, target_token_index[' ']] = 1.
    # The target holds one character fewer than the input, so its padding starts one step earlier
    decoder_target_data[i, len(target_text) - 1:, target_token_index[' ']] = 1.

In [13]:
# A single encoded sentence has shape (timesteps, vocabulary size)
encoder_input_data[0].shape

(17, 71)

# Creating the Model

In [14]:
# Latent dimensionality of the encoding space (passed to both LSTM layers as their size)
latent_dim = 256

# Setting up the Encoder
# The timestep dimension is None, so the input is not tied to one fixed sequence length
encoder_inputs = Input(shape = (None, num_encoder_tokens))

# 'return_state = True' makes the layer return its final hidden state and cell state
# in addition to its output. 'return_sequences' is left at its default, so the output
# covers only the last timestep rather than the whole sequence.
encoder = LSTM(latent_dim, return_state = True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# Discard encoder_outputs by only keeping the states;
# they are used as the initial state of the decoder
encoder_states = [state_h, state_c]

In [15]:
# Setting up the Decoder
decoder_inputs = Input(shape = (None, num_decoder_tokens))

# 'return_sequences = True' makes the LSTM return an output for every timestep,
# so that a character can be predicted at each position of the target sentence.
# 'return_state = True' additionally returns the final hidden and cell states; they are
# discarded here, but the same layer is reused for inference, where they are needed.
decoder_lstm = LSTM(latent_dim, return_sequences = True, return_state = True)
# The decoder starts from the final states of the encoder
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, 
                                     initial_state = encoder_states)

# Turn the LSTM output of every timestep into a softmax distribution over the target characters
decoder_dense = Dense(num_decoder_tokens, activation = "softmax")
decoder_outputs = decoder_dense(decoder_outputs)

In [16]:
# Defining the model that will turn 'encoder_input_data' & 'decoder_input_data' into 'decoder_target_data'
# NOTE(review): 'decoder_outputs' must still be the training-graph tensor when this cell runs;
# check that no other cell has rebound the name before re-running this one
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, 71)]   0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None, 89)]   0           []                               
                                                                                                  
 lstm (LSTM)                    [(None, 256),        335872      ['input_1[0][0]']                
                                 (None, 256),                                                     
                                 (None, 256)]                                                     
                                                                                              

In [17]:
# Compiling the model
# Categorical cross-entropy fits the one-hot encoded targets and the softmax output layer;
# the reported accuracy is computed per character position
model.compile(optimizer = "adam", 
              loss = "categorical_crossentropy", 
              metrics = ["accuracy"])

In [18]:
# Train with teacher forcing: the decoder is fed the true target sequence and learns to
# predict the same sequence shifted by one timestep.
# NOTE(review): per the Keras docs, 'validation_split' takes the validation samples from the
# end of the arrays before shuffling - confirm this split is representative for this corpus.
# No random seed is set, so results will differ between runs.
model_history = model.fit([encoder_input_data, decoder_input_data], decoder_target_data, 
                         validation_split = 0.2, 
                         epochs = 100, 
                         batch_size = 32)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


## Inference

In [19]:
# Sampling mode
#
# For decoding a test sentence:
#   1. We encode the input sentence and retrieve the initial decoder state.
#   2. Then run one step of the decoder with this initial state and a
#      "start of sequence" token as input. The output is the next target character.
#   3. Repeat the process with the current target token and states.

# Encoder for inference: maps an input sentence to its final [hidden, cell] states
encoder_model = Model(encoder_inputs, encoder_states)

# The decoder states become explicit inputs so they can be fed back in at every step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# The trained 'decoder_lstm' and 'decoder_dense' layers are reused. The resulting tensors
# get inference-specific names: rebinding 'decoder_outputs', 'state_h' and 'state_c' here
# would overwrite the training-graph tensors, and re-running the cell that builds 'model'
# afterwards would then pick up the wrong tensors.
inference_outputs, inference_state_h, inference_state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [inference_state_h, inference_state_c]
inference_outputs = decoder_dense(inference_outputs)

# Decoder for inference: (current token, states) -> (next-token distribution, new states)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [inference_outputs] + decoder_states)

In [43]:
# Reverse-lookup tables (token index -> character) to decode sequences back into
# something readable. The '*_token_index' dicts map character -> index, so items()
# yields (char, index) pairs and the pair has to be flipped for both tables.
reverse_input_char_index = {index: char for char, index in input_token_index.items()}
reverse_target_char_index = {index: char for char, index in target_token_index.items()}

In [46]:
def decode_sequence(input_seq):
    """Greedily decode one encoded input sentence, character by character.

    Args:
        input_seq: Encoded input that is handed to `encoder_model.predict`.
            The sampling loop assumes a batch of size 1.

    Returns:
        The decoded string. It ends with the newline stop character when the
        decoder produced one; otherwise decoding is cut off as soon as the
        string is longer than `max_decoder_seq_length`.
    """
    def one_hot_step(token_index):
        # Target sequence of length 1 with a single token switched on.
        step = np.zeros((1, 1, num_decoder_tokens))
        step[0, 0, token_index] = 1.
        return step

    # The encoder states seed the decoder; the first decoder input is the start character.
    states_value = encoder_model.predict(input_seq)
    target_seq = one_hot_step(target_token_index['\t'])

    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy sampling: take the most probable token of the last timestep.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Stop on the end-of-sequence character or once the length limit is exceeded.
        reached_end = sampled_char == '\n'
        too_long = len(decoded_sentence) > max_decoder_seq_length
        if reached_end or too_long:
            break

        # Feed the sampled token and the new states into the next step.
        target_seq = one_hot_step(sampled_token_index)
        states_value = [h, c]

    return decoded_sentence

In [48]:
# Decode the first 50 sentences of the training set and show them next to their inputs
for seq_index in range(50):
    # Slicing (instead of indexing) keeps the leading batch dimension of size 1
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print(
        "-",
        f"Input Sentence:  {input_texts[seq_index]}",
        f"Decoded Sentence:  {decoded_sentence}",
        sep = "\n",
    )

-
Input Sentence:  Go.
Decoded Sentence:  Geh.

-
Input Sentence:  Hi.
Decoded Sentence:  Hallo!

-
Input Sentence:  Hi.
Decoded Sentence:  Hallo!

-
Input Sentence:  Run!
Decoded Sentence:  Lauf!

-
Input Sentence:  Run.
Decoded Sentence:  Lauf!

-
Input Sentence:  Wow!
Decoded Sentence:  Potzdonner!

-
Input Sentence:  Wow!
Decoded Sentence:  Potzdonner!

-
Input Sentence:  Duck!
Decoded Sentence:  Kopf runter!

-
Input Sentence:  Fire!
Decoded Sentence:  Feuer!

-
Input Sentence:  Help!
Decoded Sentence:  Zu Hülf!

-
Input Sentence:  Help!
Decoded Sentence:  Zu Hülf!

-
Input Sentence:  Stay.
Decoded Sentence:  Bleib!

-
Input Sentence:  Stop!
Decoded Sentence:  Anhalten!

-
Input Sentence:  Stop!
Decoded Sentence:  Anhalten!

-
Input Sentence:  Wait!
Decoded Sentence:  Warte!



-
Input Sentence:  Wait.
Decoded Sentence:  Warte.

-
Input Sentence:  Begin.
Decoded Sentence:  Fang an.

-
Input Sentence:  Do it.
Decoded Sentence:  Mache es!

-
Input Sentence:  Do it.
Decoded Sentence:  Mache es!

-
Input Sentence:  Go on.
Decoded Sentence:  Mach weiter!

-
Input Sentence:  Hello!
Decoded Sentence:  Hallo!

-
Input Sentence:  Hello!
Decoded Sentence:  Hallo!

-
Input Sentence:  Hurry!
Decoded Sentence:  Beeil dich!

-
Input Sentence:  Hurry!
Decoded Sentence:  Beeil dich!

-
Input Sentence:  I hid.
Decoded Sentence:  Ich versteckte mich.

-
Input Sentence:  I hid.
Decoded Sentence:  Ich versteckte mich.



-
Input Sentence:  I ran.
Decoded Sentence:  Ich rannte.

-
Input Sentence:  I see.
Decoded Sentence:  Ich verstehe.

-
Input Sentence:  I see.
Decoded Sentence:  Ich verstehe.

-
Input Sentence:  I try.
Decoded Sentence:  Ich versuche es.

-
Input Sentence:  I try.
Decoded Sentence:  Ich versuche es.

-
Input Sentence:  I won!
Decoded Sentence:  Ich hab gewonnen!

-
Input Sentence:  I won!
Decoded Sentence:  Ich hab gewonnen!

-
Input Sentence:  I won.
Decoded Sentence:  Ich habe gewonnen.

-
Input Sentence:  Oh no!
Decoded Sentence:  Oh, Nein!



-
Input Sentence:  Relax.
Decoded Sentence:  Entspann dich.

-
Input Sentence:  Shoot!
Decoded Sentence:  Schieß!

-
Input Sentence:  Shoot!
Decoded Sentence:  Schieß!

-
Input Sentence:  Smile.
Decoded Sentence:  Lächeln!

-
Input Sentence:  Sorry?
Decoded Sentence:  Entschuldigung?

-
Input Sentence:  Ask me.
Decoded Sentence:  Fragen Sie mich!

-
Input Sentence:  Ask me.
Decoded Sentence:  Fragen Sie mich!

-
Input Sentence:  Ask me.
Decoded Sentence:  Fragen Sie mich!

-
Input Sentence:  Attack!
Decoded Sentence:  Attacke!

-
Input Sentence:  Attack!
Decoded Sentence:  Attacke!

-
Input Sentence:  Buy it.
Decoded Sentence:  Kauf’s!



-
Input Sentence:  Cheers!
Decoded Sentence:  Zum Wohl!

-
Input Sentence:  Eat it.
Decoded Sentence:  Iss es.

-
Input Sentence:  Eat up.
Decoded Sentence:  Iss fertig.

-
Input Sentence:  Eat up.
Decoded Sentence:  Iss fertig.



In [49]:
# Reset the global Keras state once the experiment is finished
tf.keras.backend.clear_session()