# Setup

## Import libraries

In [3]:
import numpy as np
import re
import tensorflow
from tensorflow import keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model

import os
# Set the KMP_DUPLICATE_LIB_OK environment variable for this process.
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort
# seen on some installs; it may need to run before the numpy/tensorflow imports
# above to have any effect - confirm.
os.environ['KMP_DUPLICATE_LIB_OK']='True'



In [4]:
# One shared seed for every random number generator used below, so that
# weight initialisation and shuffling are repeatable between runs.
SEED = 12

np.random.seed(SEED)
tensorflow.random.set_seed(SEED)
keras.utils.set_random_seed(SEED)

## Import training data

In [5]:
data_path = "spanish-english.txt"

# import the data; the context manager closes the file handle as soon as the
# text has been read instead of leaving it open for the rest of the session
with open(data_path, 'r', encoding='utf-8') as data:
    # save each line from the txt file as an item in a list
    lines = data.read().split('\n')

# Preprocess Data

## Setup docs
Docs refer to the whole input / output of a model.

In [6]:
# empty lists to hold sentences
input_docs = []
target_docs = []

# only the first 100000 lines of the file are used
for line in lines[:100000]:
    # input and target sentences are separated by tabs; any further
    # tab-separated fields on the line are ignored
    fields = line.split('\t')
    if len(fields) < 2:
        # blank or malformed line (e.g. the empty string left behind by a
        # trailing newline): there is no sentence pair to unpack, so skip it
        continue
    input_doc, target_doc = fields[:2]

    # add each input sentence to the docs list
    input_docs.append(input_doc)

    # separate each token in the target doc by a space
    # e.g. 'Ve.' becomes 'Ve .'
    target_doc = " ".join(re.findall(r"[\w']+|[^\s\w]", target_doc))
    # append start and end tokens to the beginning and end of doc
    target_doc = '<START> ' + target_doc + ' <END>'
    target_docs.append(target_doc)

print("Example input sentence:", input_docs[9])
print("Example target sentence:", target_docs[9])

Example input sentence: Run!
Example target sentence: <START> ¡ Corred ! <END>


## Setup tokens
Tokens make up the vocabulary of a language model. Each token is a word or punctuation mark that the model expects to receive or output.

The max encoder and decoder sequence lengths tell us the largest number of tokens observed in the input and target docs respectively. This informs the maximum output length we want the translator to generate when making predictions.

In [7]:
# sets to collect the unique tokens: membership checks and inserts are O(1),
# whereas scanning a list for every token makes this loop needlessly slow
input_vocab = set()
target_vocab = set()

# max number of tokens in a doc
max_encoder_seq_length = 0
max_decoder_seq_length = 0

for input_doc in input_docs:
    # split the input sentence into words and individual punctuation marks
    tokens = re.findall(r"[\w']+|[^\s\w]", input_doc)

    # keep track of the longest input doc seen so far
    max_encoder_seq_length = max(max_encoder_seq_length, len(tokens))

    # add each word or punctuation mark to the vocabulary
    input_vocab.update(tokens)

# repeat for the target set; target docs were already space-separated when
# they were built, so a plain split() recovers their tokens
for target_doc in target_docs:
    tokens = target_doc.split()
    max_decoder_seq_length = max(max_decoder_seq_length, len(tokens))
    target_vocab.update(tokens)

# note: max_decoder_seq_length counts the <START> and <END> tokens that were
# added to every target doc

# alphabetically sort the tokens
input_tokens = sorted(input_vocab)
target_tokens = sorted(target_vocab)

print("Example input word:", input_tokens[2])
print("Example target word:", target_tokens[2])
print("Max encoder seq length:", max_encoder_seq_length)
print("Max decoder seq length:", max_decoder_seq_length)

Example input word: $
Example target word: $
Max encoder seq length: 14
Max decoder seq length: 23


## Create feature dictionaries

In [8]:
# token -> index lookups, used to one-hot encode the docs
input_features_dict = {token: index for index, token in enumerate(input_tokens)}
target_features_dict = {token: index for index, token in enumerate(target_tokens)}

# index -> token lookups, used to turn predicted indices back into text
reverse_input_features_dict = dict(enumerate(input_tokens))
reverse_target_features_dict = dict(enumerate(target_tokens))

print(input_features_dict)
print(reverse_input_features_dict)



Here we create three three-dimensional arrays of zeros: encoder_input_data, decoder_input_data and decoder_target_data.

The first dimension represents the number of docs (sentences) we have.
The second dimension represents the timestep, which has the same max value as the highest number of tokens (words) found across all of the samples.
The last dimension represents the number of individual tokens available. 

In [9]:
# vocabulary sizes: the width of every one-hot vector
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)

# (number of docs, timesteps, vocabulary size) for each side of the translation
encoder_shape = (len(input_docs), max_encoder_seq_length, num_encoder_tokens)
decoder_shape = (len(input_docs), max_decoder_seq_length, num_decoder_tokens)

# all-zero arrays; the 1s are filled in when the docs are one-hot encoded
encoder_input_data = np.zeros(encoder_shape, dtype='float32')
decoder_input_data = np.zeros(decoder_shape, dtype='float32')
decoder_target_data = np.zeros(decoder_shape, dtype='float32')

## One-hot encode data
Now we one-hot encode the data so that our model can process it.
We iterate over each document. In that document, for every token position, we set the value of the present token to 1.
So in this sentence: `We go shopping`, at the first index the token 'We' would get assigned a value of 1.

In [10]:
# walk the input and target docs in parallel so that each pair shares a row index
for doc_idx, (source_text, target_text) in enumerate(zip(input_docs, target_docs)):

    # encoder input: one 1 per timestep, in the column of the token found there
    source_tokens = re.findall(r"[\w']+|[^\s\w]", source_text)
    for position, source_token in enumerate(source_tokens):
        encoder_input_data[doc_idx, position, input_features_dict[source_token]] = 1.

    # look up the vocabulary index of every target token once
    target_ids = [target_features_dict[target_token] for target_token in target_text.split()]

    # decoder input: the full target doc, starting with <START>
    for position, token_id in enumerate(target_ids):
        decoder_input_data[doc_idx, position, token_id] = 1.

    # decoder target: the same doc shifted one timestep earlier, so it does not
    # contain <START> and at each step holds the token the decoder should predict next
    for position, token_id in enumerate(target_ids[1:]):
        decoder_target_data[doc_idx, position, token_id] = 1.


# Train Model

## Define hyperparameters

In [11]:
# Training configuration, in order:
#   latent_dim - size passed to both LSTM layers
#   batch_size - number of docs per gradient update during fit()
#   epochs     - number of passes over the training data during fit()
latent_dim, batch_size, epochs = 256, 96, 100

## Define the model

### Setup encoder and decoder

In [12]:
# define the shape of the encoder inputs: (timesteps, num_encoder_tokens), where
# None leaves the number of timesteps unspecified
encoder_inputs = Input(shape=(None, num_encoder_tokens))
# define the encoder LSTM layer; return_state=True makes it return its hidden
# and cell states in addition to its output
encoder_lstm = LSTM(latent_dim, return_state=True)
# connect the encoder inputs to the LSTM layer
encoder_outputs, state_hidden, state_cell = encoder_lstm(encoder_inputs)
# only the two states are kept: they are passed to the decoder as its initial state
encoder_states = [state_hidden, state_cell]

### Defining the shape of encoder inputs
When we define the shape of the encoder inputs, since the sentences can be of varying length the first param is 'None'. `num_encoder_tokens` represents the number of unique tokens.

### Defining the LSTM layer
latent_dim sets the dimensionality of the network. Usually, the higher this is the better the network can capture complex patterns but it then also requires more resources.

Setting the `return_state` to True ensures the LSTM returns its hidden and cell states.

### Connecting inputs to the LSTM layer
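From this we extract `encoder_outputs` - the output of the layer at the final timestep (the layer is not created with `return_sequences=True`, so it does not return an output for every timestep).
`state_hidden` and `state_cell` hold the final hidden and cell states.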


In [None]:
# decoder inputs: (timesteps, num_decoder_tokens) one-hot vectors, with an
# unspecified number of timesteps
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# return_sequences=True: one output per timestep (one prediction per target token);
# return_state=True: the states are also returned so they can be reused at inference
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
# the decoder starts from the encoder's final hidden and cell states
decoder_outputs, decoder_state_hidden, decoder_state_cell = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# setup the dense layer for predictions: a softmax over the target vocabulary
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
# store the final output in `decoder_outputs` (this rebinds the name from the
# LSTM output to the softmax output)
decoder_outputs = decoder_dense(decoder_outputs)

### Build the model

In [13]:
# Training model: maps the encoder and decoder inputs to the decoder's
# per-timestep token probabilities.
translator = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)

### Compile the model

In [14]:
# categorical cross-entropy matches the one-hot targets and softmax output
translator.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

### Summarise model

In [85]:
# print the layers, output shapes and parameter counts of the model
translator.summary()

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_9 (InputLayer)        [(None, None, 13415)]        0         []                            
                                                                                                  
 input_10 (InputLayer)       [(None, None, 26514)]        0         []                            
                                                                                                  
 lstm_4 (LSTM)               [(None, 256),                1400012   ['input_9[0][0]']             
                              (None, 256),                8                                       
                              (None, 256)]                                                        
                                                                                            

## Train Model

In [15]:
# Train on the one-hot encoded docs; the last 20% of the samples are held out
# for validation.
translator.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

: 

# Translation

## Reconstruct the model

Here we reconstruct the encoder and decoder using the weights of the trained model. We do the reconstruction because the model was trained as one; however, it's more efficient for us to have a separate encoder and decoder. This is because during inference the encoder is only run once to get an encoded representation (context vector) of the input, whereas the decoder must be run each time an output token is generated. If we used the model in its original trained form, then the encoder would be run unnecessarily each time a new output token was generated.

In [None]:
# Pull the encoder LSTM's outputs out of the trained model.
# NOTE(review): layers[2] is a positional lookup; it matches the encoder LSTM in
# the summary printed above - confirm again if the architecture changes.
# NOTE: this cell rebinds encoder_outputs, encoder_states, decoder_outputs,
# state_hidden and state_cell, which were first defined in the training cells.
encoder_outputs, state_h_enc, state_c_enc = translator.layers[2].output
encoder_states = [state_h_enc, state_c_enc]

# standalone encoder: input sentence -> [hidden state, cell state]
encoder_model = Model(encoder_inputs, encoder_states)

# at inference the decoder's starting states are fed in as explicit inputs
decoder_state_input_hidden = Input(shape=(latent_dim,))
decoder_state_input_cell = Input(shape=(latent_dim,))

decoder_states_inputs = [decoder_state_input_hidden, decoder_state_input_cell]
# reuse the trained decoder LSTM and dense layers (and therefore their weights)
decoder_outputs, state_hidden, state_cell = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_hidden, state_cell]
decoder_outputs = decoder_dense(decoder_outputs)
# standalone decoder: [tokens, hidden, cell] -> [token probabilities, hidden, cell]
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

## Define the inference process

In [None]:
def decode_sequence(test_input):
  """Greedily translate one one-hot encoded input sentence.

  The encoder is run once to obtain its hidden and cell states. The decoder is
  then run one token at a time, feeding each predicted token and the updated
  states back in, until it emits '<END>' or has generated
  max_decoder_seq_length tokens.

  Args:
    test_input: one-hot encoded input, passed straight to
      encoder_model.predict. Presumably a single sentence of shape
      (1, timesteps, num_encoder_tokens), since the decoder input built here
      has a batch size of 1.

  Returns:
    The decoded tokens joined into one string, each token preceded by a
    space. The '<END>' token is included when the decoder produced it.
  """
  # encode the input as state vectors.
  states_value = encoder_model.predict(test_input)

  # the first decoder input is the one-hot vector of the <START> token
  target_seq = np.zeros((1, 1, num_decoder_tokens))
  target_seq[0, 0, target_features_dict['<START>']] = 1.
  decoded_sentence = ''
  # number of tokens generated so far; the length limit is expressed in tokens,
  # so it must not be compared against the character length of the string
  num_decoded_tokens = 0

  # decoding loop
  stop_condition = False
  while not stop_condition:
    # get possible output tokens (with probabilities) and states
    output_tokens, hidden_state, cell_state = decoder_model.predict([target_seq] + states_value)

    # choose token with highest probability and add it to the decoded sentence
    sampled_token_index = np.argmax(output_tokens[0, -1, :])
    sampled_token = reverse_target_features_dict[sampled_token_index]
    decoded_sentence += " " + sampled_token
    num_decoded_tokens += 1

    # stop if you find stop token or reach max length (counted in tokens)
    if sampled_token == '<END>' or num_decoded_tokens >= max_decoder_seq_length:
      stop_condition = True

    # update the target sequence and state vals which will be used to predict in the next iteration (of the loop)
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, sampled_token_index] = 1.
    states_value = [hidden_state, cell_state]

  return decoded_sentence

## Translate sample phrases

In [None]:
# Translate the first 20 training sentences as a quick sanity check.
for sample_idx in range(20):
  # slice rather than index so the sample keeps its leading batch dimension
  one_hot_sample = encoder_input_data[sample_idx: sample_idx + 1]
  translation = decode_sequence(one_hot_sample)
  print('-')
  print('Input sentence:', input_docs[sample_idx])
  print('Decoded sentence:', translation)