# Neural Machine Translation

- Input is a sentence (sequence) in English 
- Output is the corresponding sequence in German
- Encoder Decoder model with a Bidirectional GRU Encoder, Attention and GRU Decoder

## Import needed libraries

In [2]:
import tensorflow as tf
import numpy as np

# Import local libraries
import src.text_processing as text_processing
import src.dictionary as dictionary
import src.neural_network as neural_network

# Update python files
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data processing

### Read dataset

In [3]:
# Read file containing english and german translations
data = text_processing.load_doc("./dataset/ENG_to_GER.txt")

# Split data into english and german
english_sentences, german_sentences = text_processing.prepare_data(data)

# Check and print number of sentences from one language to the other
assert(len(english_sentences) == len(german_sentences))
print(english_sentences.shape)

# Example of sentence with translation
print(english_sentences[20])
print(german_sentences[20])

(100000,)
['i', 'lost']
['ich', 'habe', 'verloren']


### Split dataset (training + validation)

In [4]:
# Split percentage of training and validation
split_percentage = 0.85

# Count how many samples into training dataset
total_dataset = len(english_sentences)
train_dataset = int(total_dataset * split_percentage)

# Set random seed to have always same training and validation split
np.random.seed(42)
train_indices = np.random.choice(total_dataset, train_dataset, replace=False)

# Get training data for the two languages
training_english = english_sentences[train_indices]
training_german = german_sentences[train_indices]

# Get validation data
validation_english = np.delete(english_sentences, train_indices)
validation_german = np.delete(german_sentences, train_indices)

print("Training samples: " + str(training_english.shape[0]))
print("Validation samples: " + str(validation_english.shape[0]))

Training samples: 85000
Validation samples: 15000


### Create dictionaries for the two languages

In [5]:
# Calculate longest sentence in the two languages
english_max_length = text_processing.max_length_sentence(training_english)
german_max_length = text_processing.max_length_sentence(training_german) + 2  # + 2 because of <START> and <END> the beginning

print("Longest sentence in English has " + str(english_max_length) + " tokens.")
print("Longest sentence in German has " + str(german_max_length) + " tokens.")
print()

# Create dictionaries
english_dictionary = dictionary.LanguageDictionary(training_english, english_max_length)
german_dictionary = dictionary.LanguageDictionary(training_german, german_max_length)

# Calculate size of the dictionaries
english_dictionary_size = len(english_dictionary.index_to_word)
german_dictionary_size = len(german_dictionary.index_to_word)

print("English dictionary size: " + str(english_dictionary_size))
print("German dictionary size: " + str(german_dictionary_size))

# Save dictionaries
text_processing.save_dump(english_dictionary, "./dumps/eng_dict.pickle")
text_processing.save_dump(german_dictionary, "./dumps/ger_dict.pickle")

Longest sentence in English has 15 tokens.
Longest sentence in German has 17 tokens.

English dictionary size: 11587
German dictionary size: 20897


### Prepare sequences for the Neural Network

In [6]:
# Prepare sequences of training data
train_source_input, train_target_input = text_processing.prepare_sequences(training_english, 
                                                                       training_german, 
                                                                       english_dictionary, 
                                                                       german_dictionary)

# Prepare sequences of validation data
val_source_input, val_target_input = text_processing.prepare_sequences(validation_english, 
                                                                       validation_german, 
                                                                       english_dictionary, 
                                                                       german_dictionary)

# Check if same number of samples
assert(len(train_source_input) == len(train_target_input))
assert(len(val_source_input) == len(val_target_input))

# Print shapes data
print("Training samples : " + str(len(train_source_input)))
print(train_source_input.shape)
print(train_target_input.shape)

print("Validation samples : " + str(len(val_source_input)))
print(val_source_input.shape)
print(val_target_input.shape)

Training samples : 85000
(85000, 15)
(85000, 17)
Validation samples : 15000
(15000, 15)
(15000, 17)


### Print sample input data in English, German and next word to be predicted in German

In [25]:
print(train_source_input[0])
print(train_target_input[0])

print("SOURCE => " + english_dictionary.indices_to_text(train_source_input[0]))
print("TARGET => " + german_dictionary.indices_to_text(train_target_input[0]))

[ 0  0  0  0  0  0  0  4  5  6  7  8  9 10 11]
[1 4 5 6 7 8 9 2 0 0 0 0 0 0 0 0 0]
SOURCE => <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> tom is parents have a lot of money
TARGET => <START> toms eltern haben jede menge geld <END> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


## Neural Network

### Parameters

In [26]:
epochs = 200
batch_size = 128
embedding_size = 256
lstm_hidden_units = 192
lr = 1e-3
keep_dropout_prob = 0.7

### Create Seq2seq neural network graph

In [27]:
tf.reset_default_graph()

# Placeholders
input_sequence = tf.placeholder(tf.int32, (None, english_dictionary.max_length_sentence), 'inputs')
output_sequence = tf.placeholder(tf.int32, (None, None), 'output')
target_labels = tf.placeholder(tf.int32, (None, None), 'targets')
keep_prob = tf.placeholder(tf.float32, (None), 'dropout_prob')
decoder_outputs_tensor = tf.placeholder(tf.float32, (None, german_dictionary.max_length_sentence - 1, 
                                                     lstm_hidden_units * 2), 'output')

# Create graph for the network
logits, dec_output, mask = neural_network.create_network(input_sequence, 
                                                         output_sequence, 
                                                         keep_prob,
                                                         decoder_outputs_tensor,
                                                         english_dictionary_size, 
                                                         german_dictionary_size, 
                                                         embedding_size,
                                                         lstm_hidden_units)

Previous decoder outputs:  Tensor("decoder/ExpandDims:0", shape=(?, 16, 1, 384), dtype=float32)
Bahdanau score:  Tensor("decoder/dense_2/BiasAdd:0", shape=(?, 16, 15, 1), dtype=float32)
Attention weights:  Tensor("decoder/transpose_1:0", shape=(?, 16, 15, 1), dtype=float32)
Context vector:  Tensor("decoder/Sum:0", shape=(?, 16, 384), dtype=float32)
Embedding layer:  Tensor("decoder/embedding_lookup/Identity:0", shape=(?, ?, 256), dtype=float32)
Decoder input:  Tensor("decoder/concat_2:0", shape=(?, 16, 640), dtype=float32)
Logits: Tensor("dense/BiasAdd:0", shape=(?, 16, 20897), dtype=float32)


### Set the loss function, optimizer and other useful tensors

In [50]:
# Cross entropy loss after softmax of logits
ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=target_labels) * mask
loss = tf.reduce_mean(ce)

# Using Adam optimizer for the update of the weights of the network with gradient clipping
optimizer = tf.train.AdamOptimizer(learning_rate=lr) #.minimize(loss)
gradients, variables = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
optimize = optimizer.apply_gradients(zip(gradients, variables))

# Useful tensors
scores = tf.nn.softmax(logits)
predictions = tf.to_int32(tf.argmax(scores, axis=2))
correct_mask = tf.to_float(tf.equal(predictions, target_labels))
accuracy = tf.contrib.metrics.accuracy(predictions, target_labels, weights=mask)

### Training of the network

In [69]:
# Training and validation data variables
training_overfit = False
best_val_accuracy = 0
consecutive_validation_without_saving = 0
indices = list(range(len(train_source_input)))
print("Number of iterations per epoch: " + str((len(train_source_input) // batch_size) + 1))

# Start session and initialize variables in the graph
with tf.Session() as sess:    
    
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    
    for i in range(epochs):
        
        # Vector accumulating accuracy and loss during one epoch
        total_accuracies, total_losses = [], []
        
        # Shuffle data to not train the network always with the same order
        np.random.shuffle(indices)
        train_source_input = train_source_input[indices]
        train_target_input = train_target_input[indices]        
        
        # Iterate over mini-batches
        for j in range(0, len(train_source_input), batch_size):

            dec_out_tmp = neural_network.get_decoder_outputs(sess, dec_output, input_sequence, output_sequence,
                        decoder_outputs_tensor, keep_prob, keep_dropout_prob, 
                        len(train_source_input[j:j+batch_size]), german_dictionary.max_length_sentence - 1, 
                        lstm_hidden_units, train_source_input[j:j+batch_size],
                        train_target_input[j:j+batch_size, :-1])
            
            _, avg_accuracy, avg_loss = sess.run([optimize, accuracy, loss], feed_dict={
                                                input_sequence: train_source_input[j:j+batch_size],
                                                output_sequence: train_target_input[j:j+batch_size, :-1],
                                                target_labels: train_target_input[j:j+batch_size, 1:],
                                                keep_prob: keep_dropout_prob,
                                                decoder_outputs_tensor: dec_out_tmp })
            
            # Add values for this mini-batch iterations
            total_losses.append(avg_loss) 
            total_accuracies.append(avg_accuracy)
            
            # Statistics on validation set
            if (j // batch_size + 1) % 250 == 0:

                # Accumulate validation statistics
                val_accuracies, val_losses = [], []
                for k in range(0, len(val_source_input), batch_size):

                    dec_out_tmp = neural_network.get_decoder_outputs(sess, dec_output, input_sequence,
                        output_sequence, decoder_outputs_tensor, keep_prob, 1.0,
                        len(val_source_input[k:k+batch_size]), german_dictionary.max_length_sentence - 1, 
                        lstm_hidden_units, val_source_input[k:k+batch_size], val_target_input[k:k+batch_size, :-1])
                    
                    avg_accuracy, avg_loss = sess.run([accuracy, loss], feed_dict={
                                            input_sequence: val_source_input[k:k+batch_size],
                                            output_sequence: val_target_input[k:k+batch_size, :-1],
                                            target_labels: val_target_input[k:k+batch_size, 1:],
                                            keep_prob: 1.0,
                                            decoder_outputs_tensor: dec_out_tmp })                    
                    
                    val_losses.append(avg_loss) 
                    val_accuracies.append(avg_accuracy)
            
                # Average validation accuracy over batches
                final_val_accuracy = np.mean(val_accuracies)
                
                # Save model if validation accuracy better
                if final_val_accuracy > best_val_accuracy:
                    consecutive_validation_without_saving = 0
                    best_val_accuracy = final_val_accuracy
                    print("VALIDATION loss: " + str(np.mean(val_losses)) + ", accuracy: " + str(final_val_accuracy))
                    save_path = saver.save(sess, "./checkpoints/model.ckpt")
                else:
                    # Count every time check validation accuracy
                    consecutive_validation_without_saving += 1
                
                # If checked validation time many consecutive times without having improvement in accuracy
                if consecutive_validation_without_saving >= 10:
                    training_overfit = True
                    break
        
        # Epoch statistics
        print("Epoch: " + str(i+1) + ", AVG loss: " + str(np.mean(np.array(total_losses))) + 
              ", AVG accuracy: " + str(np.mean(np.array(total_accuracies))) + "\n")
        
        if training_overfit:
            print("Early stopping")
            break

Number of iterations per epoch: 665
INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt
VALIDATION loss: 0.82717645, accuracy: 0.73835766
Epoch: 1, AVG loss: 0.39362738, AVG accuracy: 0.7968743

VALIDATION loss: 0.82658976, accuracy: 0.73905164
VALIDATION loss: 0.82674766, accuracy: 0.739707
Epoch: 2, AVG loss: 0.3816911, AVG accuracy: 0.80143833

VALIDATION loss: 0.8305219, accuracy: 0.7413954
Epoch: 3, AVG loss: 0.37092778, AVG accuracy: 0.8063655

VALIDATION loss: 0.8336492, accuracy: 0.741549
VALIDATION loss: 0.8280896, accuracy: 0.74203044
Epoch: 4, AVG loss: 0.36125374, AVG accuracy: 0.80996794

VALIDATION loss: 0.83230466, accuracy: 0.7422639
VALIDATION loss: 0.82923895, accuracy: 0.7441072
Epoch: 5, AVG loss: 0.3524709, AVG accuracy: 0.8137857

Epoch: 6, AVG loss: 0.34542027, AVG accuracy: 0.81550807

VALIDATION loss: 0.83227813, accuracy: 0.7454422
Epoch: 7, AVG loss: 0.33784592, AVG accuracy: 0.8193833

VALIDATION loss: 0.83780986, accuracy: 0.7458219
VALIDATIO

## Testing network

### Rebuild graph quickly if want to run only this part of the notebook

In [29]:
# Load dictionaries from pickle
english_dictionary = text_processing.load_dump("./dumps/eng_dict.pickle")
german_dictionary = text_processing.load_dump("./dumps/ger_dict.pickle")

tf.reset_default_graph()

embedding_size = 256
lstm_hidden_units = 192

# Placeholders
input_sequence = tf.placeholder(tf.int32, (None, english_dictionary.max_length_sentence), 'inputs')
output_sequence = tf.placeholder(tf.int32, (None, None), 'output')
target_labels = tf.placeholder(tf.int32, (None, None), 'targets')
keep_prob = tf.placeholder(tf.float32, (None), 'dropout_prob')
decoder_outputs_tensor = tf.placeholder(tf.float32, (None, german_dictionary.max_length_sentence - 1, 
                                                     lstm_hidden_units * 2), 'output')

# Create graph for the network
logits, dec_output, mask = neural_network.create_network(input_sequence, 
                                                         output_sequence, 
                                                         keep_prob,
                                                         decoder_outputs_tensor,
                                                         len(english_dictionary.index_to_word), 
                                                         len(german_dictionary.index_to_word), 
                                                         embedding_size,
                                                         lstm_hidden_units)
# Predictions
scores = tf.nn.softmax(logits)
predictions = tf.to_int32(tf.argmax(scores, axis=2))

Previous decoder outputs:  Tensor("decoder/ExpandDims:0", shape=(?, 16, 1, 384), dtype=float32)
Bahdanau score:  Tensor("decoder/dense_2/BiasAdd:0", shape=(?, 16, 15, 1), dtype=float32)
Attention weights:  Tensor("decoder/transpose_1:0", shape=(?, 16, 15, 1), dtype=float32)
Context vector:  Tensor("decoder/Sum:0", shape=(?, 16, 384), dtype=float32)
Embedding layer:  Tensor("decoder/embedding_lookup/Identity:0", shape=(?, ?, 256), dtype=float32)
Decoder input:  Tensor("decoder/concat_2:0", shape=(?, 16, 640), dtype=float32)
Logits: Tensor("dense/BiasAdd:0", shape=(?, 16, 20897), dtype=float32)


### Perform test predictions

In [33]:
with tf.Session() as sess:
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    saver.restore(sess, "./checkpoints/model.ckpt") 

    test_source_sentence = ["I am trying to translate some sentences"]

    for source_sentence in test_source_sentence:
        
        # Normalize & tokenize (cut if longer than max_length_source)  
        source_preprocessed = text_processing.preprocess_sentence(source_sentence)
        
        # Convert to numbers
        source_encoded = english_dictionary.text_to_indices(source_preprocessed)
        
        # Add padding
        source_input = text_processing.pad_sentence(source_encoded, english_dictionary.max_length_sentence)
        
        # Starting target sentence in German
        target_sentence = [["<START>"]]
        target_encoded = german_dictionary.text_to_indices(target_sentence[0])

        i = 0
        word_predicted = 0
        while word_predicted != 2: # If <END> (index 2), stop
            
            target_encoded_pad = text_processing.pad_sentence(target_encoded, 
                                                          german_dictionary.max_length_sentence - 1, 
                                                           pad_before=False)
            
            dec_out_tmp = neural_network.get_decoder_outputs(
                                                            sess,
                                                            dec_output,
                                                            input_sequence,
                                                            output_sequence,
                                                            decoder_outputs_tensor,
                                                            keep_prob,
                                                            1.0,
                                                            1, 
                                                            german_dictionary.max_length_sentence - 1, 
                                                            lstm_hidden_units,
                                                            [source_input],
                                                            [target_encoded_pad])        
            # Perform prediction
            pred = sess.run(predictions, feed_dict={ input_sequence: [source_input], 
                                                    output_sequence: [target_encoded_pad],
                                                    keep_prob: 1.0,
                                                    decoder_outputs_tensor: dec_out_tmp })
            
            # Accumulate
            target_encoded.append(pred[0][i])
            word_predicted = pred[0][i]
            
            if i > german_dictionary.max_length_sentence:
                break
            i += 1

        print(english_dictionary.indices_to_text(source_input) + " => "
              + german_dictionary.indices_to_text(target_encoded))

INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt
<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> i am trying to translate some sentences => <START> ich versuche nur ein paar unerwartete obst zu haben <END>
