# Neural Machine Translation

- Input is a sentence (sequence) in English 
- Output is the corresponding sequence in German
- Encoder Decoder models with a Deep Bidirectional LSTM

# TODO

1. Implement attention
2. Comment CODE!!

TRAINING with BIGGER DATASET (so far trained only with 60 sentence and it was working)

## Import needed libraries

In [2]:
import tensorflow as tf
import numpy as np

import src.text_processing as text_processing
import src.dictionary as dictionary
import src.neural_network as neural_network

# Update python files
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data processing

### Read dataset

In [3]:
# Read file containing english and german translations
data = text_processing.load_doc("./dataset/ENG_to_GER.txt")

# Split data into english and german
english_sentences, german_sentences = text_processing.prepare_data(data)

# Check and print number of sentences from one language to the other
assert(len(english_sentences) == len(german_sentences))
print(english_sentences.shape)

# Example of sentence with translation
print(english_sentences[55])
print(german_sentences[55])

(70,)
['be', 'nice']
['seien', 'sie', 'nett']


### Split dataset (training + validation)

In [4]:
# Split percentage of training and validation
split_percentage = 0.8

# Count how many samples into training dataset
total_dataset = len(english_sentences)
train_dataset = int(total_dataset * split_percentage)

# Set random seed to have always same training and validation split
np.random.seed(42)
train_indices = np.random.choice(total_dataset, train_dataset, replace=False)

# Get training data for the two languages
training_english = english_sentences[train_indices]
training_german = german_sentences[train_indices]

# Get validation data
validation_english = np.delete(english_sentences, train_indices)
validation_german = np.delete(german_sentences, train_indices)

print("Training samples: " + str(training_english.shape[0]))
print("Validation samples: " + str(validation_english.shape[0]))

# Reset seed for randomness
np.random.seed()

Training samples: 56
Validation samples: 14


### Create dictionaries for the two languages

In [5]:
# Calculate longest sentence in the two languages
english_max_length = text_processing.max_length_sentence(training_english)
german_max_length = text_processing.max_length_sentence(training_german) + 1  # Plus one because I add <START> at the beginning

print("Longest sentence in English has " + str(english_max_length) + " tokens.")
print("Longest sentence in German has " + str(german_max_length) + " tokens.")
print()

# Create dictionaries
english_dictionary = dictionary.LanguageDictionary(training_english, english_max_length)
german_dictionary = dictionary.LanguageDictionary(training_german, german_max_length)

# Calculate size of the dictionaries
english_dictionary_size = len(english_dictionary.index_to_word)
german_dictionary_size = len(german_dictionary.index_to_word)

print("English dictionary size: " + str(english_dictionary_size))
print("German dictionary size: " + str(german_dictionary_size))

# Save dictionaries
text_processing.save_dump(english_dictionary, "./dumps/eng_dict.pickle")
text_processing.save_dump(german_dictionary, "./dumps/ger_dict.pickle")

Longest sentence in English has 3 tokens.
Longest sentence in German has 7 tokens.

English dictionary size: 40
German dictionary size: 90


### Prepare sequences for the Neural Network

In [6]:
# Prepare sequences of training data
train_source_input, train_target_input = text_processing.prepare_sequences(training_english, 
                                                                       training_german, 
                                                                       english_dictionary, 
                                                                       german_dictionary)

# Prepare sequences of validation data
val_source_input, val_target_input = text_processing.prepare_sequences(validation_english, 
                                                                       validation_german, 
                                                                       english_dictionary, 
                                                                       german_dictionary)

# Check if same number of samples
assert(len(train_source_input) == len(train_target_input))
assert(len(val_source_input) == len(val_target_input))

# Print shapes data
print("Training samples : " + str(len(train_source_input)))
print(train_source_input.shape)
print(train_target_input.shape)

print("Validation samples : " + str(len(val_source_input)))
print(val_source_input.shape)
print(val_target_input.shape)

Training samples : 56
(56, 3)
(56, 7)
Validation samples : 14
(14, 3)
(14, 7)


### Print sample input data in English, German and next word to be predicted in German

In [7]:
sample_sentence_index = 6
print(train_source_input[sample_sentence_index])
print(train_target_input[sample_sentence_index])

print("SOURCE => " + english_dictionary.indices_to_text(train_source_input[sample_sentence_index]))
print("TARGET => " + german_dictionary.indices_to_text(train_target_input[sample_sentence_index]))

[ 0  0 14]
[1 6 2 0 0 0 0]
SOURCE => <PAD> <PAD> hello
TARGET => <START> hallo <END> <PAD> <PAD> <PAD> <PAD>


## Neural Network

### Parameters

In [36]:
epochs = 100
batch_size = 64
embedding_size = 64
lstm_hidden_units = 128
lr = 1e-3
depth_lstm_bidirectional_layers = 1
keep_dropout_prob = 0.75

### Create model encoder-decoder with LSTM

In [37]:
tf.reset_default_graph()

# Placeholders
input_sequence = tf.placeholder(tf.int32, (None, english_dictionary.max_length_sentence), 'inputs')
output_sequence = tf.placeholder(tf.int32, (None, None), 'output')
target_labels = tf.placeholder(tf.int32, (None, None), 'targets')
input_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_input')
output_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_output')

# Create graph for the network
logits = neural_network.create_network(input_sequence, 
                                       output_sequence, 
                                       input_keep_prob,
                                       output_keep_prob,
                                       english_dictionary_size, 
                                       german_dictionary_size, 
                                       embedding_size,
                                       lstm_hidden_units,
                                       depth_lstm_bidirectional_layers,
                                       verbose=1)

Input sequence: [None, 3]
Encoder embedding: [None, 3, 64]
Encoder FW last_state: [None, 128]
Encoder BW last_state: [None, 128]
Decoder output: [None, None, 256]
Logits: [None, None, 90]


### Set the loss function, optimizer and other useful tensors

In [38]:
# Cross entropy loss after softmax of logits
ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=target_labels)
loss = tf.reduce_mean(ce)

# Using Adam (Adaptive learning rate + momentum) for the update of the weights of the network
optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

# Useful tensors
scores = tf.nn.softmax(logits)
predictions = tf.to_int32(tf.argmax(scores, axis=2))
correct_mask = tf.to_float(tf.equal(predictions, target_labels))
accuracy = tf.reduce_mean(tf.reduce_mean(tf.cast(correct_mask, tf.float32), axis=1))

In [39]:
print(scores)
print(predictions)
print(correct_mask)
print(accuracy)

Tensor("Softmax:0", shape=(?, ?, 90), dtype=float32)
Tensor("ToInt32:0", shape=(?, ?), dtype=int32)
Tensor("ToFloat:0", shape=(?, ?), dtype=float32)
Tensor("Mean_2:0", shape=(), dtype=float32)


### Training of the network

In [40]:
# Training data variables
num_iterations_training = max(len(train_source_input) // batch_size, 1)
print("Training iterations per epoch: " + str(num_iterations_training))

# Validation data variables
max_val_acc = 0
val_batch_size = 64
num_iterations_validation = max(len(val_source_input) // val_batch_size, 1)

# Initializer for variables in the graph
init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Perform each epoch, shuffle training dataset
indices = list(range(len(train_source_input)))

with tf.Session() as sess:
    
    # Initialize variables in the graph
    sess.run(init)
    
    # Iterate over epochs
    for i in range(epochs):
        
        # Shuffle indices with a random seed
        np.random.seed(42)
        np.random.shuffle(indices)
        
        # Shuffle data to not feed the network with always same sequence of data
        train_source_input = train_source_input[indices]
        train_target_input = train_target_input[indices]

        # Iterate over mini-batches
        for j in range(num_iterations_training):
            start_index = j * batch_size
            end_index = (j + 1) * batch_size

            # Forward and backpropagation on training data
            _, train_loss, train_acc = sess.run([optimizer, loss, accuracy], feed_dict={
                                                            input_sequence: train_source_input[start_index:end_index],
                                                            output_sequence: train_target_input[start_index:end_index, :-1],
                                                            target_labels: train_target_input[start_index:end_index, 1:],
                                                            input_keep_prob: keep_dropout_prob,
                                                            output_keep_prob: keep_dropout_prob })
            
            # Print training loss and accuracy
            if j % 250 == 0:
                print("Training loss: " + str(train_loss) + ", accuracy: " + str(train_acc))

                
            # Check accuracy on validation 
            if j % 750 == 0:
                
                # Accumulate loss and accuracy
                val_loss_arr, val_acc_arr = [], []
                
                # Iterate over validation mini-batches
                for k in range(num_iterations_validation):
                    start_index_val = k * val_batch_size
                    end_index_val = (k + 1) * val_batch_size
                    
                    val_loss, val_acc = sess.run([loss, accuracy], feed_dict={
                                            input_sequence: val_source_input[start_index_val:end_index_val],
                                            output_sequence: val_target_input[start_index_val:end_index_val, :-1],
                                            target_labels: val_target_input[start_index_val:end_index_val, 1:],
                                            input_keep_prob: 1.0,
                                            output_keep_prob: 1.0})
                    
                    val_loss_arr.append(val_loss)
                    val_acc_arr.append(val_acc)

                val_acc = np.mean(val_acc_arr)

                # Save model if validation accuracy better
                if val_acc > max_val_acc:
                    max_val_acc = val_acc
                    print("VALIDATION loss: " + str(np.mean(val_loss_arr)) + ", accuracy: " + str(val_acc))
                    save_path = saver.save(sess, "./checkpoints/model.ckpt")

Training iterations per epoch: 1
Training loss: 4.4771194, accuracy: 0.020833334
VALIDATION loss: 4.3400617, accuracy: 0.32142857
Training loss: 4.273194, accuracy: 0.37797618
VALIDATION loss: 4.1295815, accuracy: 0.46428576
Training loss: 4.057469, accuracy: 0.52678573
VALIDATION loss: 3.9003692, accuracy: 0.53571427
Training loss: 3.8359578, accuracy: 0.5119048
Training loss: 3.559524, accuracy: 0.5
Training loss: 3.2796872, accuracy: 0.48511907
Training loss: 2.9464107, accuracy: 0.47916666
Training loss: 2.637219, accuracy: 0.4761905
Training loss: 2.4188752, accuracy: 0.4761905
Training loss: 2.3009858, accuracy: 0.4761905
Training loss: 2.2858515, accuracy: 0.47619048
Training loss: 2.317586, accuracy: 0.47619048
Training loss: 2.2756832, accuracy: 0.47619042
Training loss: 2.2116084, accuracy: 0.47619048
Training loss: 2.0918448, accuracy: 0.49107143
Training loss: 2.0160213, accuracy: 0.51488096
VALIDATION loss: 1.948818, accuracy: 0.5833334
Training loss: 1.9952133, accuracy: 

## Testing network

### Rebuild graph quickly if want to run only this part of the notebook

In [41]:
# Parameters network
#embedding_size = 300
#lstm_neurons = 256

# Load dictionaries from pickle
english_dictionary = text_processing.load_dump("./dumps/eng_dict.pickle")
german_dictionary = text_processing.load_dump("./dumps/ger_dict.pickle")

tf.reset_default_graph()

# Placeholders
input_sequence = tf.placeholder(tf.int32, (None, english_dictionary.max_length_sentence), 'inputs')
output_sequence = tf.placeholder(tf.int32, (None, None), 'output')
target_labels = tf.placeholder(tf.int32, (None), 'targets')
input_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_input')
output_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_output')

# Create graph for the network
logits = neural_network.create_network(input_sequence, 
                                       output_sequence, 
                                       input_keep_prob,
                                       output_keep_prob,
                                       len(english_dictionary.index_to_word), 
                                       len(german_dictionary.index_to_word), 
                                       embedding_size,
                                       lstm_hidden_units,
                                       depth_lstm_bidirectional_layers,
                                       verbose=1)
# Predictions
scores = tf.nn.softmax(logits)
#max_score = tf.reduce_max(scores, axis=1)
predictions = tf.to_int32(tf.argmax(scores, axis=2))

Input sequence: [None, 3]
Encoder embedding: [None, 3, 64]
Encoder FW last_state: [None, 128]
Encoder BW last_state: [None, 128]
Decoder output: [None, None, 256]
Logits: [None, None, 90]


### Perform test predictions

In [48]:
# TF variables
init = tf.global_variables_initializer()
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(init)
    saver.restore(sess, "./checkpoints/model.ckpt") 

    test_source_sentence = ["HI"]

    for source_sentence in test_source_sentence:

        # Normalize & tokenize (cut if longer than max_length_source)  
        source_preprocessed = text_processing.preprocess_sentence(source_sentence)[:english_dictionary.max_length_sentence]
       
        # Convert to numbers
        source_encoded = english_dictionary.text_to_indices(source_preprocessed)
        
        # Add padding
        source_input = text_processing.pad_sentence(source_encoded, english_dictionary.max_length_sentence)
        print(source_input)
        
        # Starting target sentence in German
        target_sentence = [["<START>"]]
        target_encoded = german_dictionary.text_to_indices(target_sentence[0])
        
        i = 0
        word_predicted = 0
        while word_predicted != 2: # If <END> (index 2), stop
            # Perform prediction
            pred = sess.run(predictions, feed_dict={input_sequence: [source_input], 
                                                    output_sequence: [target_encoded],
                                                    input_keep_prob: 1.0,
                                                    output_keep_prob: 1.0 })
            # Accumulate
            target_encoded.append(pred[0][i])
            word_predicted = pred[0][i]
            i += 1

        print(english_dictionary.indices_to_text(source_input) + " => "
              + german_dictionary.indices_to_text(target_encoded))

INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt
[0 0 6]
<PAD> <PAD> hi => <START> hallo <END>
