# Neural Machine Translation

- Input is a sentence (sequence) in English 
- Output is the corresponding sequence in German
- Encoder Decoder models with a Deep Bidirectional LSTM

# TODO

1. Implement attention
2. Comment CODE!!

TRAINING with BIGGER DATASET (so far trained only with 60 sentence and it was working)

## Import needed libraries

In [1]:
import tensorflow as tf
import numpy as np

import src.text_processing as text_processing
import src.dictionary as dictionary
import src.neural_network as neural_network

# Update python files
%load_ext autoreload
%autoreload 2

  from ._conv import register_converters as _register_converters


## Data processing

### Read dataset

In [2]:
# Read file containing english and german translations
data = text_processing.load_doc("./dataset/ENG_to_GER.txt")

# Split data into english and german
english_sentences, german_sentences = text_processing.prepare_data(data)

# Check and print number of sentences from one language to the other
assert(len(english_sentences) == len(german_sentences))
print(english_sentences.shape)

# Example of sentence with translation
print(english_sentences[55])
print(german_sentences[55])

(60,)
['be', 'nice']
['seien', 'sie', 'nett']


### Split dataset (training + validation)

In [7]:
# Split percentage of training and validation
split_percentage = 0.75

# Count how many samples into training dataset
total_dataset = len(english_sentences)
train_dataset = int(total_dataset * split_percentage)

# Set random seed to have always same training and validation split
np.random.seed(42)
train_indices = np.random.choice(total_dataset, train_dataset, replace=False)

# Get training data for the two languages
training_english = english_sentences[train_indices]
training_german = german_sentences[train_indices]

# Get validation data
validation_english = np.delete(english_sentences, train_indices)
validation_german = np.delete(german_sentences, train_indices)

print("Training samples: " + str(training_english.shape[0]))
print("Validation samples: " + str(validation_english.shape[0]))

# Reset seed for randomness
np.random.seed()

Training samples: 45
Validation samples: 15


### Create dictionaries for the two languages

In [8]:
# Calculate longest sentence in the two languages
english_max_length = text_processing.max_length_sentence(training_english)
german_max_length = text_processing.max_length_sentence(training_german) + 1  # Plus one because I add <START> at the beginning

print("Longest sentence in English has " + str(english_max_length) + " tokens.")
print("Longest sentence in German has " + str(german_max_length) + " tokens.")
print()

# Create dictionaries
english_dictionary = dictionary.LanguageDictionary(training_english, english_max_length)
german_dictionary = dictionary.LanguageDictionary(training_german, german_max_length)

# Calculate size of the dictionaries
english_dictionary_size = len(english_dictionary.index_to_word)
german_dictionary_size = len(german_dictionary.index_to_word)

print("English dictionary size: " + str(english_dictionary_size))
print("German dictionary size: " + str(german_dictionary_size))

# Save dictionaries
text_processing.save_dump(english_dictionary, "./dumps/eng_dict.pickle")
text_processing.save_dump(german_dictionary, "./dumps/ger_dict.pickle")

Longest sentence in English has 3 tokens.
Longest sentence in German has 6 tokens.

English dictionary size: 40
German dictionary size: 75


### Prepare sequences for the Neural Network

In [10]:
# Prepare sequences of training data
train_source_input, train_target_input, train_target_output = text_processing.prepare_sequences(
                                                                       training_english, 
                                                                       training_german, 
                                                                       english_dictionary, 
                                                                       german_dictionary)

# Prepare sequences of validation data
val_source_input, val_target_input, val_target_output = text_processing.prepare_sequences(
                                                                       validation_english, 
                                                                       validation_german, 
                                                                       english_dictionary, 
                                                                       german_dictionary)

# Check if same number of samples
assert(len(train_source_input) == len(train_target_input) == len(train_target_output))
assert(len(val_source_input) == len(val_target_input) == len(val_target_output))

# Print shapes data
print("Training samples : " + str(len(train_source_input)))
print(train_source_input.shape)
print(train_target_input.shape)
print(train_target_output.shape)

print("Validation samples : " + str(len(val_source_input)))
print(val_source_input.shape)
print(val_target_input.shape)
print(val_target_output.shape)

Training samples : 143
(143, 3)
(143, 6)
(143,)
Validation samples : 44
(44, 3)
(44, 6)
(44,)


### Print sample input data in English, German and next word to be predicted in German

In [11]:
sample_sentence_index = 49
print(train_source_input[sample_sentence_index])
print(train_target_input[sample_sentence_index])

print("SOURCE => " + english_dictionary.indices_to_text(train_source_input[sample_sentence_index]))
print("TARGET => " + german_dictionary.indices_to_text(train_target_input[sample_sentence_index]))
print("PREDICTED => " + german_dictionary.indices_to_text([train_target_output[sample_sentence_index]]))

[22  0  0]
[ 1 30  0  0  0  0]
SOURCE => freeze <PAD> <PAD>
TARGET => <START> stehenbleiben <PAD> <PAD> <PAD> <PAD>
PREDICTED => <END>


## Neural Network

### Parameters

In [14]:
epochs = 50
batch_size = 64
embedding_size = 300
lstm_hidden_units = 256
lr = 4e-4
depth_lstm_bidirectional_layers = 3
keep_dropout_prob = 0.7

### Create model encoder-decoder with LSTM

In [15]:
tf.reset_default_graph()

# Placeholders
input_sequence = tf.placeholder(tf.int32, (None, english_dictionary.max_length_sentence), 'inputs')
output_sequence = tf.placeholder(tf.int32, (None, german_dictionary.max_length_sentence), 'output')
target_labels = tf.placeholder(tf.int32, (None), 'targets')
input_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_input')
output_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_output')

# Create graph for the network
logits = neural_network.create_network(input_sequence, 
                                       output_sequence, 
                                       input_keep_prob,
                                       output_keep_prob,
                                       english_dictionary_size, 
                                       german_dictionary_size, 
                                       embedding_size,
                                       lstm_hidden_units,
                                       depth_lstm_bidirectional_layers,
                                       verbose=1)

Input sequence: [None, 3]
Encoder embedding: [None, 3, 300]
Encoder FW last_state: [None, 256]
Decoder concatenated output: [None, 12, 256]
Logits: [None, 75]


### Set the loss function, optimizer and other useful tensors

In [16]:
# Cross entropy loss after softmax of logits
ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=target_labels)
loss = tf.reduce_mean(ce)

# Using Adam (Adaptive learning rate + momentum) for the update of the weights of the network
optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

# Useful tensors
scores = tf.nn.softmax(logits)
predictions = tf.to_int32(tf.argmax(scores, axis=1))
correct_mask = tf.to_float(tf.equal(predictions, target_labels))
accuracy = tf.reduce_mean(tf.cast(correct_mask, tf.float32))

### Training of the network

In [17]:
# Training data variables
num_iterations_training = max(len(train_source_input) // batch_size, 1)
print("Training iterations per epoch: " + str(num_iterations_training))

# Validation data variables
max_val_acc = 0
val_batch_size = 64
num_iterations_validation = max(len(val_source_input) // val_batch_size, 1)

# Initializer for variables in the graph
init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Perform each epoch, shuffle training dataset
indices = list(range(len(train_source_input)))

with tf.Session() as sess:
    
    # Initialize variables in the graph
    sess.run(init)
    
    # Iterate over epochs
    for i in range(epochs):
        
        # Shuffle indices with a random seed
        np.random.seed(42)
        np.random.shuffle(indices)
        
        # Shuffle data to not feed the network with always same sequence of data
        train_source_input = train_source_input[indices]
        train_target_input = train_target_input[indices]
        train_target_output = train_target_output[indices]

        # Iterate over mini-batches
        for j in range(num_iterations_training):
            start_index = j * batch_size
            end_index = (j + 1) * batch_size

            # Forward and backpropagation on training data
            _, train_loss, train_acc = sess.run([optimizer, loss, accuracy], feed_dict={
                                                            input_sequence: train_source_input[start_index:end_index],
                                                            output_sequence: train_target_input[start_index:end_index],
                                                            target_labels: train_target_output[start_index:end_index],
                                                            input_keep_prob: keep_dropout_prob,
                                                            output_keep_prob: keep_dropout_prob })
            
            # Print training loss and accuracy
            if j % 100 == 0:
                print("Training loss: " + str(train_loss) + ", accuracy: " + str(train_acc))

                
            # Check accuracy on validation 
            if j % 250 == 0:
                
                # Accumulate loss and accuracy
                val_loss_arr, val_acc_arr = [], []
                
                # Iterate over validation mini-batches
                for k in range(num_iterations_validation):
                    start_index_val = k * val_batch_size
                    end_index_val = (k + 1) * val_batch_size
                    
                    val_loss, val_acc = sess.run([loss, accuracy], feed_dict={
                                            input_sequence: val_source_input[start_index_val:end_index_val],
                                            output_sequence: val_target_input[start_index_val:end_index_val],
                                            target_labels: val_target_output[start_index_val:end_index_val],
                                            input_keep_prob: 1.0,
                                            output_keep_prob: 1.0})
                    
                    val_loss_arr.append(val_loss)
                    val_acc_arr.append(val_acc)

                val_acc = np.mean(val_acc_arr)

                # Save model if validation accuracy better
                if val_acc > max_val_acc:
                    max_val_acc = val_acc
                    print("VALIDATION loss: " + str(np.mean(val_loss_arr)) + ", accuracy: " + str(val_acc))
                    save_path = saver.save(sess, "./checkpoints/model.ckpt")


Training iterations per epoch: 2
Training loss: 4.323658, accuracy: 0.0
VALIDATION loss: 4.2501116, accuracy: 0.3181818
Training loss: 4.2061777, accuracy: 0.265625
Training loss: 3.9930282, accuracy: 0.34375
Training loss: 3.684175, accuracy: 0.34375
Training loss: 3.3601947, accuracy: 0.34375
Training loss: 3.8282838, accuracy: 0.265625
Training loss: 3.4937925, accuracy: 0.28125
Training loss: 3.2583766, accuracy: 0.359375
VALIDATION loss: 3.979183, accuracy: 0.36363637
Training loss: 3.4002028, accuracy: 0.34375
Training loss: 3.2220721, accuracy: 0.390625
Training loss: 3.1680245, accuracy: 0.34375
Training loss: 3.1470382, accuracy: 0.328125
Training loss: 3.2125325, accuracy: 0.3125
Training loss: 3.568695, accuracy: 0.265625
Training loss: 3.1350296, accuracy: 0.34375
Training loss: 2.9641955, accuracy: 0.390625
Training loss: 2.9725895, accuracy: 0.34375
Training loss: 2.761108, accuracy: 0.390625
Training loss: 2.7690969, accuracy: 0.359375
Training loss: 2.7762113, accuracy:

## Testing network

### Rebuild graph quickly if want to run only this part of the notebook

In [19]:
# Parameters network
#embedding_size = 300
#lstm_neurons = 256

# Load dictionaries from pickle
english_dictionary = text_processing.load_dump("./dumps/eng_dict.pickle")
german_dictionary = text_processing.load_dump("./dumps/ger_dict.pickle")

tf.reset_default_graph()

# Placeholders
input_sequence = tf.placeholder(tf.int32, (None, english_dictionary.max_length_sentence), 'inputs')
output_sequence = tf.placeholder(tf.int32, (None, german_dictionary.max_length_sentence), 'output')
target_labels = tf.placeholder(tf.int32, (None), 'targets')
input_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_input')
output_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_output')

# Create graph for the network
logits = neural_network.create_network(input_sequence, 
                                       output_sequence, 
                                       input_keep_prob,
                                       output_keep_prob,
                                       len(english_dictionary.index_to_word), 
                                       len(german_dictionary.index_to_word), 
                                       embedding_size,
                                       lstm_hidden_units,
                                       depth_lstm_bidirectional_layers,
                                       verbose=1)
# Predictions
scores = tf.nn.softmax(logits)
max_score = tf.reduce_max(scores)
predictions = tf.to_int32(tf.argmax(scores, axis=1))

Input sequence: [None, 3]
Encoder embedding: [None, 3, 300]
Encoder FW last_state: [None, 256]
Decoder concatenated output: [None, 12, 256]
Logits: [None, 75]


### Perform test predictions

In [23]:
# TF variables
init = tf.global_variables_initializer()
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(init)
    saver.restore(sess, "./checkpoints/model.ckpt") 

    test_source_sentence = ["hi"]

    for source_sentence in test_source_sentence:

        # Normalize & tokenize (cut if longer than max_length_source)  
        source_preprocessed = text_processing.preprocess_sentence(source_sentence)[:english_dictionary.max_length_sentence]
       
        # Convert to numbers
        source_encoded = english_dictionary.text_to_indices(source_preprocessed)
        
        # Add padding
        source_input = text_processing.pad_sentence(source_encoded, english_dictionary.max_length_sentence)

        print(source_input)
        print(english_dictionary.indices_to_text(source_input))

        # Starting target sentence in German
        target_sentence = [["<START>"]]
        target_encoded = german_dictionary.text_to_indices(target_sentence[0])

        # Predict words and append to previous one until "<END>" is predicted
        for i in range(german_dictionary.max_length_sentence):    
            # Pad
            target_input = text_processing.pad_sentence(target_encoded, german_dictionary.max_length_sentence)
            
            # Perform prediction
            pred, prob_argmax = sess.run([predictions, max_score], feed_dict={ input_sequence: [source_input], 
                                                                            output_sequence: [target_input],
                                                                            input_keep_prob: 1.0,
                                                                            output_keep_prob: 1.0 })
            target_encoded.append(pred[0])
            print(german_dictionary.indices_to_text(pred) + " " + str(prob_argmax))

            if pred[0] == 2: # If <END>, stop
                break

        print(" ".join(source_preprocessed) + " => " + german_dictionary.indices_to_text(target_encoded))

INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt
[4 0 0]
hi <PAD> <PAD>
hallo 0.08588736
<END> 0.8976733
hi => <START> hallo <END>
