# Introduction

In this project I use the Keras API to build a deep learning translator. I preprocess the data into one-hot encoded vectors, train the model using the LSTM and Dense layers from Keras, and finally test the model on various input strings.

In [414]:
#Import all libraries
import numpy as np
import re
from tensorflow import keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model, load_model

# Preprocessing

In [389]:
# Location of the tab-separated translation pairs
data_path = "fra.txt"

# Read the whole corpus at once, then break it into one entry per line
with open(data_path, mode='r', encoding='utf-8') as f:
  corpus_text = f.read()
lines = corpus_text.split('\n')



In [390]:
# Sentence containers: raw input sentences and their processed target sentences
input_docs, target_docs = [], []
# Vocabulary containers: every distinct token seen on each side
input_tokens, target_tokens = set(), set()

In [391]:
# Tokenizer shared by both sides: a token is either a run of word characters
# and apostrophes, or a single non-space, non-word character (punctuation).
# Compiled once here instead of being re-specified on every iteration.
token_pattern = re.compile(r"[\w']+|[^\s\w]")

for line in lines[:10000]:
  # Input and target sentences are separated by tabs; any columns after the
  # second one are ignored.
  fields = line.split('\t')
  if len(fields) < 2:
    # Blank or malformed line (for example the empty string left behind by the
    # final newline when the file holds fewer than 10000 lines). There is no
    # sentence pair to use, so skip it instead of failing on unpacking.
    continue
  input_doc, target_doc = fields[:2]

  # Input sentences are stored untouched; they are tokenized on demand later.
  input_docs.append(input_doc)

  # Tokenize the target sentence, re-join the tokens with single spaces and
  # wrap the result in start/end markers so the decoder knows where a
  # sentence begins and ends.
  target_doc = " ".join(token_pattern.findall(target_doc))
  target_doc = '<START> ' + target_doc + ' <END>'
  target_docs.append(target_doc)

  # Grow the vocabularies. Sets ignore duplicates, so no membership check is
  # needed. Target docs are already space-separated, hence the plain split().
  input_tokens.update(token_pattern.findall(input_doc))
  target_tokens.update(target_doc.split())

# Freeze the vocabularies into sorted lists so every token gets a stable index.
input_tokens = sorted(input_tokens)
target_tokens = sorted(target_tokens)

In [392]:
# Vocabulary sizes; these are the widths of the one-hot vectors.
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)

# Longest input sentence, measured in tokens produced by the same regex that
# is used when the input sentences are one-hot encoded.
max_encoder_seq_length = max([len(re.findall(r"[\w']+|[^\s\w]", input_doc)) for input_doc in input_docs])

# Longest target sentence, measured in whitespace-separated tokens. Target docs
# were already tokenized and space-joined, and they are encoded via split(), so
# split() is the matching way to count them. Re-running the regex here would
# break '<START>' and '<END>' into three tokens each and inflate the length.
max_decoder_seq_length = max(len(target_doc.split()) for target_doc in target_docs)

In [393]:
# Token -> index lookup tables for the input (English) and target (French)
# vocabularies. The index is the token's position in the sorted token list.
input_features_dict = {token: index for index, token in enumerate(input_tokens)}
target_features_dict = {token: index for index, token in enumerate(target_tokens)}

# Inverse tables (index -> token), used to turn a predicted index back into a word.
reverse_input_features_dict = {index: token for token, index in input_features_dict.items()}
reverse_target_features_dict = {index: token for token, index in target_features_dict.items()}

In [394]:
# Allocate all-zero arrays of shape (sentence, timestep, vocabulary entry).
# A later step writes a 1 at the position of each token, which yields the
# one-hot vectors the Keras model is fed with.
num_samples = len(input_docs)
encoder_shape = (num_samples, max_encoder_seq_length, num_encoder_tokens)
decoder_shape = (num_samples, max_decoder_seq_length, num_decoder_tokens)

encoder_input_data = np.zeros(encoder_shape, dtype='float32')
decoder_input_data = np.zeros(decoder_shape, dtype='float32')
decoder_target_data = np.zeros(decoder_shape, dtype='float32')

In [395]:
# Fill the one-hot arrays, one sentence pair (row) at a time.
for row, (source_text, target_text) in enumerate(zip(input_docs, target_docs)):
  # Encoder side: tokenize the input sentence and mark each token's
  # vocabulary index at its timestep.
  source_tokens = re.findall(r"[\w']+|[^\s\w]", source_text)
  for step, token in enumerate(source_tokens):
    encoder_input_data[row, step, input_features_dict[token]] = 1.

  # Decoder side: the target sentence is already space-separated.
  for step, token in enumerate(target_text.split()):
    token_index = target_features_dict[token]
    decoder_input_data[row, step, token_index] = 1.
    # The target is the decoder input shifted one step ahead, so the first
    # token has no slot in it.
    if step >= 1:
      decoder_target_data[row, step - 1, token_index] = 1.

In [396]:
# Peek at the first 50 tokens of the input vocabulary
print(list(input_features_dict.keys())[:50])

['!', '"', '$', '%', '&', ',', '-', '.', '17', '19', '2', '3', '30', '5', '50', '7', '8', '99', ':', '?', 'A', 'Abandon', 'Act', 'Add', 'After', 'Aim', "Ain't", 'Air', 'All', 'Allow', 'Am', 'American', 'Answer', 'Any', 'Anybody', 'Anyone', 'Anything', 'Apples', 'Arabs', 'Are', "Aren't", 'Arm', 'Asian', 'Ask', 'Attack', 'Autumn', 'Avoid', 'Awesome', 'B', 'Back']


In [397]:
# Look up which target token is stored at index 50
print(reverse_target_features_dict[50])

Annulez


In [398]:
# Size of the input vocabulary
print(len(input_tokens))

2269


# Model Training

In [399]:
# Number of units in the encoder and decoder LSTM layers, i.e. the size of the
# internal representation of the sequences.
latent_dim = 256
# Batch size: the number of samples worked through before the model's
# internal parameters are updated.
batch_size = 75
# Epochs: the number of times the training loop runs over the entire training data.
epochs = 50

In [400]:
# Encoder half of the seq2seq model.

# Encoder input layer with shape (None, num_encoder_tokens): None is the
# (variable) sequence length and num_encoder_tokens is the size of the
# encoder vocabulary, i.e. the width of each one-hot vector.
encoder_inputs = Input(shape=(None, num_encoder_tokens))

# LSTM layer with latent_dim units. return_state=True makes the layer return
# its hidden state and cell state in addition to its output. Those states
# carry the context of the input sequence and are what the decoder needs in
# order to generate the target sequence.
encoder_lstm = LSTM(latent_dim, return_state=True)

# Apply the LSTM to the encoder inputs and unpack the output and both states.
encoder_outputs, state_hidden, state_cell = encoder_lstm(encoder_inputs)

# Keep only the hidden and cell states; they are used as the initial state
# of the decoder.
encoder_states = [state_hidden, state_cell]

In [401]:
# Decoder half of the seq2seq model.


# Decoder input layer with shape (None, num_decoder_tokens): None means the
# sequence length is not fixed, and num_decoder_tokens is the size of the
# decoder vocabulary. A variable length matters because source and target
# sentences can differ a lot in length.
decoder_inputs = Input(shape=(None, num_decoder_tokens))

# LSTM layer with latent_dim units that returns the full output sequence
# (return_sequences=True) as well as its hidden and cell states
# (return_state=True).
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

# Apply the LSTM to the decoder inputs, starting from the encoder's final
# states, and unpack the outputs and states into separate variables.
decoder_outputs, decoder_state_hidden, decoder_state_cell = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# Dense layer with num_decoder_tokens units and a softmax activation.
# A dense layer connects every unit to every output of the previous layer:
# it multiplies its input by a weight matrix, adds a bias, and applies the
# activation. Softmax normalizes the result so the values sum to 1, giving a
# probability distribution over the decoder vocabulary; the entry with the
# highest probability is the predicted token.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')

# Apply the dense layer to the LSTM outputs to obtain the final decoder output.
decoder_outputs = decoder_dense(decoder_outputs)

In [402]:
# Build the training model. Its inputs are the encoder and decoder input
# sequences and its output is the decoder's softmax output. Once compiled it
# can be fit to data: the predictions are compared with the target sequences
# to compute the training loss and update the model's weights.
training_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [403]:
# Print a layer-by-layer summary of the training model, padded with blank lines
print("Model summary:\n")
training_model.summary()
print("\n\n")

Model summary:

Model: "model_29"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_43 (InputLayer)          [(None, None, 2269)  0           []                               
                                ]                                                                 
                                                                                                  
 input_44 (InputLayer)          [(None, None, 4488)  0           []                               
                                ]                                                                 
                                                                                                  
 lstm_26 (LSTM)                 [(None, 256),        2586624     ['input_43[0][0]']               
                                 (None, 256),                              

In [404]:
# Compile the model with the RMSprop optimizer and categorical cross-entropy
# loss, and track accuracy during training.
training_model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

# Categorical cross-entropy is a loss function for multi-class classification.
# It measures the difference between the predicted probability distribution
# and the true distribution of the target classes, and yields a scalar that
# summarizes the average discrepancy between the predicted class
# probabilities and the true class labels.

# RMSprop (Root Mean Square Propagation) is a gradient descent optimization
# algorithm that adapts the learning rate of each parameter using historical
# gradient information. It divides the learning rate of a parameter by a
# running average of the magnitudes of that parameter's recent gradients,
# which reduces the learning rate for parameters with consistently high
# gradients. This can help prevent oscillation or divergence during training
# and lead to faster convergence. RMSprop is often used with deep neural
# networks, especially recurrent and convolutional ones.

In [405]:
# Finally, fit the model: the encoder and decoder input arrays are the inputs
# and decoder_target_data is the expected output.
# NOTE: per the Keras documentation, validation_split takes the validation
# samples from the end of the arrays, before any shuffling, so the last 20% of
# the sentence pairs are used for validation only.
training_model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size = batch_size, 
epochs = epochs, validation_split = 0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x18812b134c0>

In [413]:
# Persist the trained model to an HDF5 file in the working directory
training_model.save('training_model.h5')

# Model Testing

In [415]:
# Reload the trained model from the HDF5 file written after training
training_model = load_model('training_model.h5')

In [406]:
# Pull the encoder's input and state tensors out of the trained model so that
# a standalone encoder can be built for inference.

# First input of the model: the source-language (encoder) input sequence.
encoder_inputs = training_model.input[0]

# layers[2] is taken to be the encoder LSTM (the model summary lists the two
# input layers first, then this LSTM). Its output unpacks into the layer
# output plus the hidden and cell states. encoder_outputs is not used any
# further; only the two states are passed on.
encoder_outputs, state_h_enc, state_c_enc = training_model.layers[2].output

# The hidden and cell states capture the context of the source sequence and
# are used to initialize the states of the decoder.
encoder_states = [state_h_enc, state_c_enc]

In [407]:
# Standalone inference encoder: maps an input sequence to its
# [hidden state, cell state] pair
encoder_model = Model(encoder_inputs, encoder_states)

In [408]:
# Build decoder_model, the inference-time decoder of the translation model.
#
# The decoder pieces are taken from training_model itself, the same way the
# encoder tensors are extracted from it, rather than from the layer objects
# created in the training cells. This keeps encoder and decoder tied to the
# same (possibly reloaded) model and does not require the training cells to
# have been run in the current session.
# Assumes the layer order input, input, encoder LSTM, decoder LSTM, Dense,
# consistent with the layers[2] lookup used for the encoder; confirm against
# training_model.summary() if the architecture changes.
inference_decoder_inputs = training_model.input[1]
inference_decoder_lstm = training_model.layers[3]
inference_decoder_dense = training_model.layers[4]

# Two input layers of shape (latent_dim,), one for the hidden state and one
# for the cell state the decoder starts from at each prediction step.
decoder_state_input_hidden = Input(shape=(latent_dim,))
decoder_state_input_cell = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_hidden, decoder_state_input_cell]

# Run the decoder LSTM from the supplied states. It returns the output
# sequence plus the updated hidden and cell states.
decoder_outputs, state_hidden, state_cell = inference_decoder_lstm(
  inference_decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_hidden, state_cell]

# Turn the LSTM output into a probability distribution over target tokens.
decoder_outputs = inference_decoder_dense(decoder_outputs)

# Inputs: the decoder token input plus the two state inputs.
# Outputs: the token probabilities plus the two updated states, so the caller
# can feed the states back in on the next step.
decoder_model = Model([inference_decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [409]:
def decode_sequence(test_input):
  """Greedily decode one encoded input sentence into a target sentence.

  The input is run through encoder_model to get the initial decoder states.
  decoder_model is then called one token at a time, starting from '<START>',
  always picking the most probable next token and feeding it back in.

  Args:
    test_input: one-hot encoded encoder input holding a single sentence
      (the caller passes a one-row slice of encoder_input_data).

  Returns:
    The decoded tokens as a single string in which every token is preceded
    by one space. The '<END>' marker is part of the string when the decoder
    produced it.
  """
  # Encode the input as state vectors.
  states_value = encoder_model.predict(test_input)

  # Target sequence of length 1 holding only the start token.
  target_seq = np.zeros((1, 1, num_decoder_tokens))
  target_seq[0, 0, target_features_dict['<START>']] = 1.

  # Sampling loop (batch of size 1). Tokens are collected in a list so the
  # length limit can be checked in tokens, the unit max_decoder_seq_length is
  # measured in, instead of in characters of the joined string.
  decoded_tokens = []
  while True:
    # Run the decoder to get token probabilities and the updated states.
    output_tokens, hidden_state, cell_state = decoder_model.predict(
      [target_seq] + states_value)

    # Choose the token with the highest probability.
    sampled_token_index = np.argmax(output_tokens[0, -1, :])
    sampled_token = reverse_target_features_dict[sampled_token_index]
    decoded_tokens.append(sampled_token)

    # Exit condition: either the stop token was produced or the sentence has
    # reached the maximum number of tokens.
    if sampled_token == '<END>' or len(decoded_tokens) >= max_decoder_seq_length:
      break

    # Feed the sampled token and the new states into the next step.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, sampled_token_index] = 1.
    states_value = [hidden_state, cell_state]

  # Same layout as before: each token prefixed with a single space.
  return ''.join(' ' + token for token in decoded_tokens)

In [412]:

# Translate a fixed window of 100 sentences and print each source sentence
# next to what the model decoded for it.
first_index, last_index = 5000, 5100
for seq_index in range(first_index, last_index):
  # A one-row slice (rather than a single index) keeps the batch dimension.
  sample = encoder_input_data[seq_index:seq_index + 1]
  translation = decode_sequence(sample)
  print('-')
  print('Input sentence:', input_docs[seq_index])
  print('Decoded sentence:', translation)

-
Input sentence: I like fruit.
Decoded sentence:  J'aime les les . <END>
-
Input sentence: I like girls.
Decoded sentence:  J'aime les . <END>
-
Input sentence: I like honey.
Decoded sentence:  J'aime les . <END>
-
Input sentence: I like music.
Decoded sentence:  J'aime les . <END>
-
Input sentence: I like opera.
Decoded sentence:  J'aime les . <END>
-
Input sentence: I like sugar.
Decoded sentence:  J'aime les . <END>
-
Input sentence: I like sushi.
Decoded sentence:  J'aime les . <END>
-
Input sentence: I like these.
Decoded sentence:  J'aime les . <END>
-
Input sentence: I like these.
Decoded sentence:  J'aime les . <END>
-
Input sentence: I like women.
Decoded sentence:  J'aime les . <END>
-
Input sentence: I like women.
Decoded sentence:  J'aime les . <END>
-
Input sentence: I liked that.
Decoded sentence:  Je l'ai ai . <END>
-
Input sentence: I live alone.
Decoded sentence:  Je suis . <END>
-
Input sentence: I live alone.
Decoded sentence:  Je suis . <END>
-
Input sentence: I lo