In [12]:
import pickle
import numpy as np
import matplotlib.pyplot as plt

In [13]:
import tensorflow as tf
import tensorflow.keras as keras

In [14]:
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


Data from : https://www.manythings.org/anki/

Constants:

In [15]:
# Number of samples per mini-batch handed to the data generator.
BATCH_SIZE = 64
# Width of the pre-computed word vectors used by the encoder embedding layer.
EMBEDDING_DIM = 100
# Sequence-length cap; not referenced by any cell visible in this notebook.
MAX_SEQ_LEN = 100

# Optional dataset / vocabulary caps, currently disabled:
#NUM_SAMPLES = 10000
#MAX_NUM_WORDS = 20000

# Preparing the Data

Loading and Cleaning the Data:

In [16]:
# Read the whole sentence-pair file into a single string.
filepath = './por.txt'
with open(filepath, encoding='UTF-8') as corpus_file:
    eng2por = corpus_file.read()

In [17]:
# Break the raw text into one entry per line. Blank lines are discarded
# instead of unconditionally dropping the last element: the final sentence
# pair is therefore kept even when the file does not end with a newline, and
# stray empty lines cannot break the tab-splitting done in the next cell.
eng2por = [line for line in eng2por.split('\n') if line.strip()]

Getting English and Portuguese texts:

In [18]:
# Split every line on tabs once: column 0 holds the English sentence and
# column 1 its Portuguese translation (any further columns are ignored).
split_lines = [line.split('\t') for line in eng2por]
input_texts = [columns[0] for columns in split_lines]
translations = [columns[1] for columns in split_lines]

## Inputs (English)

Tokenizing the inputs:

In [19]:
# Build the English vocabulary with a Tokenizer left at its default settings.
tokenizer_inputs = Tokenizer()
tokenizer_inputs.fit_on_texts(input_texts)
# Turn every English sentence into its list of integer word indices.
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)

Getting input word to index map and maximum size (for padding):

In [20]:
# Vocabulary lookup (word -> integer index) learned by the input tokenizer.
word2idx_inputs = tokenizer_inputs.word_index
# Length of the longest tokenized input sentence; used as the padded length.
max_len_input = max(map(len, input_sequences))

Padding inputs (padding is 'pre' by default):

In [21]:
# Pad every input sequence at the front ('pre', the pad_sequences default)
# so that all rows share the length of the longest sentence.
encoder_inputs = pad_sequences(
    input_sequences,
    maxlen=max_len_input,
    padding='pre',
)

Saving the input dictionary to calculate the embedding matrix (secondary script):

In [22]:
# Invert the input vocabulary: integer index -> word.
idx2word = dict(zip(word2idx_inputs.values(), word2idx_inputs.keys()))

In [23]:
# Persist the index -> word map so the embedding matrix can be built elsewhere.
with open('./idx2word_encoder', 'wb') as out_file:
    pickle.dump(idx2word, out_file)

## Translations (Portuguese)

Adding tags to translations:

In [24]:
# Wrap every translation in start/end-of-sentence tags.
# NOTE: this rebinds `translations` from itself, so running the cell a second
# time would add the tags twice.
translations = [''.join(('<sos> ', line, ' <eos>')) for line in translations]

Tokenizing the outputs:

In [25]:
# Tokenizer for the Portuguese side. `filters=''` turns off character
# filtering -- presumably so the '<sos>' / '<eos>' tags are kept intact
# (confirm against the Tokenizer default filter set).
tokenizer_translations = Tokenizer(filters='')
tokenizer_translations.fit_on_texts(translations)
# Turn every tagged translation into its list of integer word indices.
translations_sequences = tokenizer_translations.texts_to_sequences(translations)

Getting the translations dictionary and the number of output words (the maximum target length, needed for padding, is computed a few cells below):

In [26]:
# Vocabulary lookup (word -> integer index) learned by the output tokenizer.
word2idx_translations = tokenizer_translations.word_index
# Index 0 is reserved for padding, hence one extra slot in the output space.
num_words_output = 1 + len(word2idx_translations)

Separating output and input for translations (Teacher Forcing):

In [27]:
# The decoder is fed each sequence without its last token and is asked to
# predict the same sequence shifted one step to the left.
trans_inputs = []
trans_outputs = []
for token_ids in translations_sequences:
    trans_inputs.append(token_ids[:-1])
    trans_outputs.append(token_ids[1:])

The targets are the trans_outputs:

In [28]:
# Length of the longest target sequence; used as the padded decoder length.
max_len_target = max(map(len, trans_outputs))

Padding data for the decoder:

In [29]:
# Decoder inputs and targets are padded identically: zeros appended at the
# end ('post') up to the longest target length.
decoder_padding = {'padding': 'post', 'maxlen': max_len_target}
decoder_inputs = pad_sequences(trans_inputs, **decoder_padding)
targets = pad_sequences(trans_outputs, **decoder_padding)

## Data Generator

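We need to do this because we can't one-hot encode the whole target decoder sequence at once (the resulting array would be too large to hold in memory), so each batch is encoded on the fly: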

In [66]:
class DataGenerator(keras.utils.Sequence):
    """Serve seq2seq training batches, one-hot encoding targets on the fly.

    Each batch is ``([encoder_batch, decoder_batch], one_hot_targets)``. Only
    the rows of the current batch are one-hot encoded, so the full one-hot
    target array never has to exist in memory.

    Args:
        encoder_inputs: 2-D array of padded encoder token ids, one row per
            sample.
        decoder_inputs: 2-D array of padded decoder input token ids, with the
            same number of rows as ``encoder_inputs``.
        targets: 2-D array of padded integer target token ids, with the same
            number of rows as ``encoder_inputs``.
        batch_size: Maximum number of samples per batch (must be positive).
        num_words_output: Number of classes used for the one-hot encoding.
        shuffle: When true, the sample order is reshuffled every time
            ``on_epoch_end`` runs (including once at construction).

    Raises:
        ValueError: If ``batch_size`` is not positive or the three arrays do
            not hold the same number of samples.
    """

    def __init__(self, encoder_inputs, decoder_inputs, targets,
                 batch_size, num_words_output, shuffle=True):
        # Harmless on Sequence classes without their own __init__, required
        # by newer Keras versions whose base class keeps internal state.
        super().__init__()

        if batch_size <= 0:
            raise ValueError('batch_size must be a positive integer')

        num_samples = encoder_inputs.shape[0]
        if (decoder_inputs.shape[0] != num_samples
                or targets.shape[0] != num_samples):
            raise ValueError(
                'encoder_inputs, decoder_inputs and targets must contain '
                'the same number of samples'
            )

        self.batch_size = batch_size
        self.num_words_output = num_words_output

        # Data
        self.encoder_inputs = encoder_inputs
        self.decoder_inputs = decoder_inputs
        self.targets = targets

        self.shuffle = shuffle
        # Build the initial sample order (shuffled when requested).
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch.

        The count is rounded up so the trailing, possibly smaller, batch is
        served too; rounding down would silently skip those samples and give
        zero batches for a dataset smaller than ``batch_size``.
        """
        num_samples = self.encoder_inputs.shape[0]
        return (num_samples + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` as ``([enc, dec], one_hot_targets)``."""
        start = index * self.batch_size
        rows = self.indexes[start:start + self.batch_size]

        enc_inp = self.encoder_inputs[rows, :]
        dec_inp = self.decoder_inputs[rows, :]
        # One-hot encode only the rows of this batch.
        dec_out_one_hot = to_categorical(self.targets[rows, :],
                                         num_classes=self.num_words_output)

        return [enc_inp, dec_inp], dec_out_one_hot

    def on_epoch_end(self):
        """Reset the sample order, shuffling it between epochs if enabled."""
        self.indexes = np.arange(self.encoder_inputs.shape[0])
        if self.shuffle:
            np.random.shuffle(self.indexes)

# Encoder-Decoder Model with GloVe

In [57]:
# Number of units in the encoder and decoder LSTMs; also used as the output
# dimension of the decoder embedding.
LATENT_DIM = 256

Loading embedding matrix and preparing embedding layer:

In [58]:
# Pre-computed encoder embedding weights loaded from disk.
# NOTE(review): presumably produced by the secondary script from
# './idx2word_encoder' with shape (num_words, EMBEDDING_DIM) -- confirm.
embedding_matrix = np.load('./embedding_matrix_encoding.npy')

In [59]:
# Encoder vocabulary size; index 0 is the padding slot.
num_words = 1 + len(word2idx_inputs)

# Encoder embedding initialised from the pre-computed embedding matrix.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=max_len_input,
)

Encoder architecture:

In [60]:
# Encoder: padded token ids -> embeddings -> LSTM.
encoder_inputs_placeholder = Input(shape=(max_len_input, ))
x = embedding_layer(encoder_inputs_placeholder)
# return_state=True makes the layer hand back its final states in addition
# to its output.
encoder = LSTM(LATENT_DIM, return_state=True, dropout=0.5)
encoder_outputs, h, c = encoder(x)
# We only need the final encoder states; they seed the decoder LSTM.
encoder_states = [h, c]

Decoder architecture:

In [61]:
# Input and embedding for decoder (not pre-trained in this case)
# NOTE(review): the embedding is created without `mask_zero`, so the padded
# time steps are presumably included in the loss and accuracy -- confirm
# whether that is intended.
decoder_inputs_placeholder = Input(shape=(max_len_target,))
decoder_embedding = Embedding(num_words_output, LATENT_DIM)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

# Decoder LSTM: emits an output for every time step (return_sequences=True)
# and starts from the final encoder states.
decoder_lstm = LSTM(LATENT_DIM, return_state=True, return_sequences=True,
                    dropout=0.5)
# The decoder's own final states are not needed for training.
decoder_outputs, _, _ = decoder_lstm(
    decoder_inputs_x,
    initial_state = encoder_states
)

Dense layer:

In [62]:
# Softmax over the output vocabulary, applied to the decoder LSTM outputs.
decoder_dense = Dense(num_words_output, activation='softmax')
# `decoder_outputs` is rebound from the LSTM outputs to the word probabilities.
decoder_outputs = decoder_dense(decoder_outputs)

Building model:

In [63]:
# Training model: (encoder tokens, decoder tokens) -> per-step word probabilities.
model = Model(
    inputs=[encoder_inputs_placeholder, decoder_inputs_placeholder],
    outputs=decoder_outputs,
)

In [64]:
# Categorical cross-entropy matches the one-hot targets yielded by the generator.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

Training

In [67]:
# Batch generator over the full dataset (shuffling left at its default).
data_generator = DataGenerator(
    encoder_inputs=encoder_inputs,
    decoder_inputs=decoder_inputs,
    targets=targets,
    batch_size=BATCH_SIZE,
    num_words_output=num_words_output,
)

In [None]:
# Train for a single pass over the generator (one epoch is the default).
model.fit_generator(generator=data_generator, epochs=1)

Epoch 1/1
   7/2468 [..............................] - ETA: 2:56:48 - loss: 8.0436 - accuracy: 0.6803