In [17]:
import pickle
import numpy as np
import matplotlib.pyplot as plt

In [18]:
import tensorflow as tf
import tensorflow.keras as keras

In [19]:
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


Data from : https://www.manythings.org/anki/  
References:  Deep Learning: Advanced NLP and RNNs (Lazy Programmer), Keras and Tensorflow documentation

Constants:

In [2]:
BATCH_SIZE = 64
NUM_SAMPLES = 10000
MAX_SEQ_LEN = 100
#MAX_NUM_WORDS = 20000 # It's correlated with the number of samples
EMBEDDING_DIM = 100

# Preparing the Data

Loading and Cleaning the Data:

In [3]:
filepath = './por.txt'
with open(filepath, encoding='UTF-8') as f:
    eng2por = f.read()

In [4]:
eng2por = eng2por.split('\n')[:-1] # Breaking in lines first

Limit the number of samples:

In [13]:
np.random.seed(234)
eng2por = np.random.choice(eng2por, NUM_SAMPLES)

Getting English and Portuguese texts:

In [14]:
input_texts = [line.split('\t')[0] for line in eng2por]
translations = [line.split('\t')[1] for line in eng2por]

## Inputs (English)

Tokenizing the inputs:

In [20]:
tokenizer_inputs = Tokenizer()
tokenizer_inputs.fit_on_texts(input_texts)
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)

Getting input word to index map and maximum size (for padding):

In [27]:
word2idx_inputs = tokenizer_inputs.word_index
max_len_input = max(len(s) for s in input_sequences)

Padding inputs (paddings are 'pre' by default) :

In [28]:
encoder_inputs = pad_sequences(input_sequences, maxlen=max_len_input)

Saving the input dictionary to calculate the embedding matrix (secondary script):

In [29]:
idx2word = {idx:word for (word, idx) in word2idx_inputs.items()}

In [38]:
with open('./idx2word_encoder', 'wb') as f:
    pickle.dump(idx2word, f)

## Translations (Portuguese)

Adding tags to translations:

In [39]:
translations = ['<sos> '+line+' <eos>' for line in translations]

Tokenizing the outputs:

In [41]:
tokenizer_translations = Tokenizer(filters='')
tokenizer_translations.fit_on_texts(translations)
translations_sequences = tokenizer_translations.texts_to_sequences(translations)

Getting translations dictionary, number of words and maximum target lentgh (for padding):

In [42]:
word2idx_translations = tokenizer_translations.word_index
num_words_output = len(word2idx_translations) + 1 # To account for 0 (padding)

Separating output and input for translations (Forced Teaching):

In [43]:
trans_inputs = [sequence[:-1] for sequence in translations_sequences]
trans_outputs = [sequence[1:] for sequence in translations_sequences]

The targets are the trans_outputs:

In [47]:
max_len_target = max(len(s) for s in trans_outputs)

Padding data for the decoder:

In [48]:
decoder_inputs = pad_sequences(trans_inputs, padding='post',
                               maxlen=max_len_target)
targets = pad_sequences(trans_outputs, padding='post',
                            maxlen=max_len_target)

## Data Generator

We need to do this because we can't one hot encode the whole target decoder sequence:

In [49]:
class DataGenerator(keras.utils.Sequence):
    
    def __init__(self, encoder_inputs, decoder_inputs, targets,
                 batch_size, num_words_output, shuffle=True):
        
        self.batch_size = batch_size
        self.num_words_output = num_words_output
        
        # Data
        self.encoder_inputs = encoder_inputs
        self.decoder_inputs = decoder_inputs
        self.targets = targets
        
        self.shuffle = shuffle
        self.on_epoch_end() # Shuffle the dataset betweem epochs
        
    def __len__(self):
        # Number of batches per epoch
        return int(self.encoder_inputs.shape[0] / self.batch_size)
    
    
    def __getitem__(self, index):
        
        rows = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        
        enc_inp = self.encoder_inputs[rows, :]
        dec_inp = self.decoder_inputs[rows, :]
        dec_out_one_hot = to_categorical(self.targets[rows, :], 
                                         num_classes=self.num_words_output)
        
        return [enc_inp, dec_inp], dec_out_one_hot
    
    
    def on_epoch_end(self):
        self.indexes = np.arange(self.encoder_inputs.shape[0])
        if self.shuffle:
            np.random.shuffle(self.indexes)

# Encoder-Decoder Model with GloVe

In [50]:
LATENT_DIM = 256

Loading embedding matrix and preparing embedding layer:

In [51]:
embedding_matrix = np.load('./embedding_matrix_encoding.npy')

In [52]:
num_words = len(word2idx_inputs) + 1 # To account for padding
 
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights = [embedding_matrix],
    input_length = max_len_input

)

Encoder architecture:

In [53]:
encoder_inputs_placeholder = Input(shape=(max_len_input, ))
x = embedding_layer(encoder_inputs_placeholder)
encoder = LSTM(LATENT_DIM, return_state=True, dropout=0.5)
encoder_outputs, h, c = encoder(x)
# We only need the final encoder states
encoder_states = [h, c]

Decoder architecture:

In [54]:
# Input and embedding for decoder (not pre-trained in this case)
decoder_inputs_placeholder = Input(shape=(max_len_target,))
decoder_embedding = Embedding(num_words_output, LATENT_DIM)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

# Decoder LSTM
decoder_lstm = LSTM(LATENT_DIM, return_state=True, return_sequences=True,
                    dropout=0.5)
decoder_outputs, _, _ = decoder_lstm(
    decoder_inputs_x,
    initial_state = encoder_states
)

Dense layer:

In [55]:
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

Building model:

In [56]:
model = Model([encoder_inputs_placeholder, decoder_inputs_placeholder],
              decoder_outputs)

In [57]:
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])

Training

In [63]:
data_generator = DataGenerator(encoder_inputs, decoder_inputs, targets,
                               BATCH_SIZE, num_words_output)

In [65]:
model.fit_generator(generator=data_generator, epochs = 30, workers=3)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.callbacks.History at 0x1cfac10f908>

# Model for Inference

We use the encoder model from before to calculate the initial states:

In [66]:
encoder_model = Model(encoder_inputs_placeholder, encoder_states)

The decoder inputs will be different on inference (only one word instead of Forced Teaching):

In [67]:
# Hidden States
decoder_state_input_h = Input(shape=(LATENT_DIM, ))
decoder_state_input_c = Input(shape=(LATENT_DIM, ))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Initial Word
decoder_inputs_single = Input(shape=(1,)) # For this case it will be just a number

# Passing this word on the embedding
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

# Passing the embedded word on the decoder layer
decoder_outputs, h, c = decoder_lstm(decoder_inputs_single_x,
                                     initial_state=decoder_states_inputs)

decoder_states = [h, c]
decoder_outputs = decoder_dense(decoder_outputs)

The decoder model receives the word input and the states from the encoder and returns the output and its owe states (for the next word):

In [68]:
decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

Now the function to generate translations:

In [69]:
idx2word_translations = {idx:word for (word, idx) in word2idx_translations.items()}

In [70]:
def translate(input_seq):
    
    # Run the original sentence on the encoder
    states_values = encoder_model.predict(input_seq)
    
    # Start of sentence token
    decoder_input = np.zeros((1,1))
    decoder_input[0,0] = word2idx_translations['<sos>']
    
    # Token to end translation
    eos_token = word2idx_translations['<eos>']
    
    # Let's generate the translation
    translated_sentence = []
    for _ in range(max_len_target):
        
        # Pass the input through the decoder
        output_probs, h, c = decoder_model.predict(
           [decoder_input] + states_values
        )
        
        # Predicted next word id
        predicted_id = np.argmax(output_probs[0, 0, :])
        
        # If the predicted word is the eos token, break the loop
        if predicted_id == eos_token:
            break
        
        # If not, we add the word to the translated sentence
        word = ''
        if predicted_id > 0:
            word = idx2word_translations[predicted_id]
            translated_sentence.append(word)
            
        # Update the decoder_input and states
        decoder_input[0,0] = predicted_id
        states_values = [h, c]
    
    return ' ' .join(translated_sentence)

Checking some translations on the training set:

In [87]:
i = np.random.choice(len(input_texts))

print(input_texts[i])
translate(encoder_inputs[i:i+1])

Tom still hasn't taken down his Christmas tree.


'tom ainda não desmontou a sua árvore de natal dele.'

Generating new sentences (this is much harder, since our vocabulary is very limited in the baseline case):

In [101]:
def preprocess(phrase):
    
    tokens = tokenizer_inputs.texts_to_sequences(phrase)
    padded = pad_sequences(tokens, maxlen=max_len_input)
    
    return padded
    

In [122]:
phrase = ['I have to be honest here, Tom']
translate(preprocess(phrase))

'eu tenho que fazer isso com o tom.'