# Neural Machine Translation

- Input is a sentence (sequence) in English 
- Output is the corresponding sequence in German
- Encoder-decoder model with a deep bidirectional LSTM

# TODO

1. Implement attention
2. Gradient clipping
3. Comment the code
4. Use a GRU cell

Train with a bigger dataset (so far the model has only been trained on 60 sentences, and it was working)

## Import needed libraries

In [1]:
import tensorflow as tf
import numpy as np

import src.text_processing as text_processing
import src.dictionary as dictionary
import src.neural_network as neural_network

# Update python files
%load_ext autoreload
%autoreload 2

  from ._conv import register_converters as _register_converters


## Data processing

### Read dataset

In [2]:
# Load the raw parallel corpus (English sentences with their German translations)
data = text_processing.load_doc("./dataset/ENG_to_GER.txt")

# Separate the corpus into two aligned collections, one per language
english_sentences, german_sentences = text_processing.prepare_data(data)

# Every English sentence must come with exactly one German counterpart
assert len(german_sentences) == len(english_sentences)
print(english_sentences.shape)

# Show one aligned pair (English first, then German) as a sanity check
for sentences in (english_sentences, german_sentences):
    print(sentences[18])

(500,)
['i', 'know']
['ich', 'weiß']


### Split dataset (training + validation)

In [3]:
# Fraction of the sentence pairs used for training; the rest is kept for validation
split_percentage = 0.8

# Size of the whole dataset and of its training part
total_dataset = len(english_sentences)
train_dataset = int(total_dataset * split_percentage)

# Seed the generator so that the same pairs end up in the training set on every run,
# then draw the training positions without repetition
np.random.seed(42)
train_indices = np.random.choice(total_dataset, train_dataset, replace=False)

# Training pairs: the drawn positions, in the drawn order
training_english = english_sentences[train_indices]
training_german = german_sentences[train_indices]

# Validation pairs: every position that was not drawn, in the original order
validation_mask = np.ones(total_dataset, dtype=bool)
validation_mask[train_indices] = False
validation_english = english_sentences[validation_mask]
validation_german = german_sentences[validation_mask]

print("Training samples: " + str(len(training_english)))
print("Validation samples: " + str(len(validation_english)))

# Go back to a non-deterministic seed for everything that follows
np.random.seed()

Training samples: 400
Validation samples: 100


### Create dictionaries for the two languages

In [4]:
# Longest sentence (in tokens) of each language in the training set;
# German gets two extra positions for the <START> and <END> markers
english_max_length = text_processing.max_length_sentence(training_english)
german_max_length = 2 + text_processing.max_length_sentence(training_german)

print("Longest sentence in English has " + str(english_max_length) + " tokens.")
print("Longest sentence in German has " + str(german_max_length) + " tokens.")
print()

# One dictionary per language, built from the training sentences only
english_dictionary = dictionary.LanguageDictionary(training_english, english_max_length)
german_dictionary = dictionary.LanguageDictionary(training_german, german_max_length)

# Vocabulary sizes (number of entries in the index -> word mapping)
english_dictionary_size = len(english_dictionary.index_to_word)
german_dictionary_size = len(german_dictionary.index_to_word)

print("English dictionary size: " + str(english_dictionary_size))
print("German dictionary size: " + str(german_dictionary_size))

# Persist both dictionaries so the test section can reload them later
for language_dictionary, dump_path in ((english_dictionary, "./dumps/eng_dict.pickle"),
                                       (german_dictionary, "./dumps/ger_dict.pickle")):
    text_processing.save_dump(language_dictionary, dump_path)

Longest sentence in English has 4 tokens.
Longest sentence in German has 9 tokens.

English dictionary size: 325
German dictionary size: 403


### Prepare sequences for the Neural Network

In [5]:
# Turn the training sentence pairs into index sequences for the network
train_source_input, train_target_input = text_processing.prepare_sequences(
    training_english, training_german, english_dictionary, german_dictionary)

# Same conversion for the validation sentence pairs
val_source_input, val_target_input = text_processing.prepare_sequences(
    validation_english, validation_german, english_dictionary, german_dictionary)

# Source and target sides must stay aligned sample by sample
assert len(train_target_input) == len(train_source_input)
assert len(val_target_input) == len(val_source_input)

# Report the number of samples and the array shapes of both splits
for split_label, source_sequences, target_sequences in (
        ("Training samples : ", train_source_input, train_target_input),
        ("Validation samples : ", val_source_input, val_target_input)):
    print(split_label + str(len(source_sequences)))
    print(source_sequences.shape)
    print(target_sequences.shape)

Training samples : 400
(400, 4)
(400, 9)
Validation samples : 100
(100, 4)
(100, 9)


### Print sample input data in English, German and next word to be predicted in German

In [6]:
# Pick one training pair and show it both as indices and as text
sample_sentence_index = 7
sample_source = train_source_input[sample_sentence_index]
sample_target = train_target_input[sample_sentence_index]

# Raw index sequences
print(sample_source)
print(sample_target)

# The same sequences decoded back into tokens
print("SOURCE => " + english_dictionary.indices_to_text(sample_source))
print("TARGET => " + german_dictionary.indices_to_text(sample_target))

[ 0 13 21 22]
[ 1 22 15 16 18  2  0  0  0]
SOURCE => <PAD> i am well
TARGET => <START> mir geht es gut <END> <PAD> <PAD> <PAD>


## Neural Network

### Parameters

In [7]:
# --- Optimisation schedule ---
# Number of passes over the training set
epochs = 150
# Number of sentence pairs per mini-batch
batch_size = 128
# Learning rate handed to the Adam optimizer
lr = 1e-3

# --- Network size ---
# Dimension of the word embeddings
embedding_size = 256
# Hidden units per LSTM direction (tensors built from both directions use twice this size)
lstm_hidden_units = 64
# Number of stacked bidirectional LSTM layers
depth_lstm_bidirectional_layers = 2

# --- Regularisation ---
# Keep probability fed to the dropout placeholders while training (1.0 is fed for validation)
keep_dropout_prob = 0.7

### Create Seq2seq neural network graph

In [17]:
# Start from an empty default graph so that re-running this cell does not
# stack a second copy of the network on top of the previous one
tf.reset_default_graph()

# Placeholders
# Source sentences as int32 indices; the second dimension is fixed to the padded English length
input_sequence = tf.placeholder(tf.int32, (None, english_dictionary.max_length_sentence), 'inputs')
# Decoder input: the training cell feeds the target sequences without their last token
output_sequence = tf.placeholder(tf.int32, (None, None), 'output')
# Labels: the training cell feeds the target sequences without their first token
target_labels = tf.placeholder(tf.int32, (None, None), 'targets')
# Dropout keep probabilities, fed with plain floats.
# `(None)` is just `None` (not a tuple), so the shape of these placeholders is left unspecified.
input_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_input')
output_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_output')

# Decoder outputs fed back into the network; filled in by get_decoder_outputs.
# The last dimension is twice the hidden size because of the bidirectional LSTM.
# NOTE(review): this placeholder requests the name 'output', which output_sequence already uses,
# so TensorFlow presumably de-duplicates it to something like 'output_1' - consider a distinct name.
decoder_outputs_tensor = tf.placeholder(tf.float32, (None, german_dictionary.max_length_sentence-1, 
                                                     lstm_hidden_units * 2), 'output')
# Create graph for the network
# Returns the vocabulary logits and the decoder output tensor that get_decoder_outputs evaluates
logits, dec_output = neural_network.create_network(input_sequence, 
                                                   output_sequence, 
                                                   input_keep_prob,
                                                   output_keep_prob,
                                                   decoder_outputs_tensor,
                                                   english_dictionary_size, 
                                                   german_dictionary_size, 
                                                   embedding_size,
                                                   lstm_hidden_units,
                                                   depth_lstm_bidirectional_layers)

Tensor("encoder/concat:0", shape=(?, 4, 128), dtype=float32)
Tensor("decoder/ExpandDims:0", shape=(?, 8, 1, 128), dtype=float32)
Tensor("decoder/dense/BiasAdd:0", shape=(?, 8, 4, 32), dtype=float32)
Tensor("decoder/dense_1/BiasAdd:0", shape=(?, 8, 1, 32), dtype=float32)
Tensor("decoder/Tanh:0", shape=(?, 8, 4, 32), dtype=float32)
Tensor("decoder/dense_2/BiasAdd:0", shape=(?, 8, 4, 1), dtype=float32)
Tensor("decoder/transpose_1:0", shape=(?, 8, 4, 1), dtype=float32)
Tensor("decoder/Sum:0", shape=(?, 8, 128), dtype=float32)
Tensor("decoder/embedding_lookup/Identity:0", shape=(?, ?, 256), dtype=float32)
Tensor("decoder/concat_2:0", shape=(?, 8, 384), dtype=float32)
Tensor("decoder/rnn/transpose_1:0", shape=(?, 8, 128), dtype=float32)
Tensor("dense/BiasAdd:0", shape=(?, 8, 403), dtype=float32)


### Set the loss function, optimizer and other useful tensors

In [24]:
# Per-token cross entropy computed from the raw logits (the softmax is applied inside the op)
ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=target_labels)
# NOTE(review): the mean runs over every target position, padding included (no mask is applied)
loss = tf.reduce_mean(ce)

# Adam (adaptive learning rate + momentum) with gradient clipping:
# gradients are rescaled so that their global norm never exceeds 5.0 before being applied.
# A single optimizer instance is created, so only one set of Adam slot variables exists.
adam = tf.train.AdamOptimizer(learning_rate=lr)
gradients, variables = zip(*adam.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
optimize = adam.apply_gradients(zip(gradients, variables))

# The training loop runs `optimizer`; point it at the clipped update op instead of
# building a second, unclipped `minimize` op that would silently bypass the clipping
optimizer = optimize

# Useful tensors
# Probability distribution over the German vocabulary for every target position
scores = tf.nn.softmax(logits)
# Most likely word index for every target position
predictions = tf.to_int32(tf.argmax(scores, axis=2))
# 1.0 where the predicted word matches the label, 0.0 otherwise (padding positions included)
correct_mask = tf.to_float(tf.equal(predictions, target_labels))
# Accuracy per sentence first, then averaged over the batch
accuracy = tf.reduce_mean(tf.reduce_mean(correct_mask, axis=1))

In [25]:
def get_decoder_outputs(batch_size, target_length, hidden_size, source, target_in, keep_prob=None):
    """Compute the decoder outputs that are fed back into the network, one step at a time.

    The network receives its own previous decoder outputs through
    `decoder_outputs_tensor`. They are built iteratively: the buffer starts
    as zeros, the graph is evaluated, and the output of step `i` is stored
    at position `i + 1` before the next evaluation.

    Relies on notebook-level names: the open session `sess`, the tensor
    `dec_output` and the placeholders `input_sequence`, `output_sequence`,
    `decoder_outputs_tensor`, `input_keep_prob` and `output_keep_prob`.

    Args:
        batch_size: number of samples in `source` / `target_in`.
        target_length: number of decoder positions.
        hidden_size: hidden units per LSTM direction.
        source: batch fed to `input_sequence`.
        target_in: batch fed to `output_sequence`.
        keep_prob: dropout keep probability fed to both dropout placeholders.
            Defaults to the notebook-level `keep_dropout_prob` (the training
            value). Pass 1.0 for validation and inference so that dropout
            is switched off.

    Returns:
        NumPy array of shape (batch_size, target_length, 2 * hidden_size);
        position 0 stays zero, position `i + 1` holds the output of step `i`.
    """
    if keep_prob is None:
        keep_prob = keep_dropout_prob

    # Feature size is doubled because of the bidirectional LSTM
    decoder_outputs = np.zeros((batch_size, target_length, 2 * hidden_size))
    for step in range(target_length - 1):

        step_outputs = sess.run(dec_output, feed_dict={
            input_sequence: source,
            output_sequence: target_in,
            decoder_outputs_tensor: decoder_outputs,
            input_keep_prob: keep_prob,
            output_keep_prob: keep_prob,
        })
        # The output produced at this step becomes the "previous output" of the next one
        decoder_outputs[:, step + 1] = step_outputs[:, step]

    return decoder_outputs

### Training of the network

In [None]:
# Training data variables
iterations_training = max((len(train_source_input) // batch_size), 1)
print("Training iterations per epoch: " + str(iterations_training))
training_overfit = False
consecutive_validation_without_saving = 0

# Validation data variables
max_val_acc = 0
iterations_validation = max((len(val_source_input) // batch_size), 1)

# How often (in mini-batches) training statistics are printed and validation is run
print_every = 1
validate_every = 1

# Decoder inputs and labels are the target sequences shifted by one token,
# so they are one position shorter than the padded German sentences
decoder_length = german_dictionary.max_length_sentence - 1

# Dedicated seeded generator (fixed seed for debugging): the shuffles are reproducible
# without reseeding the global NumPy generator on every epoch
shuffle_rng = np.random.RandomState(42)


def batch_bounds(batch_index, number_of_batches):
    """Return the (start, end) slice bounds of one mini-batch.

    The last batch gets an open end (None) so that it also takes the samples
    left over when the dataset size is not a multiple of `batch_size`.
    """
    start = batch_index * batch_size
    is_last_batch = batch_index == number_of_batches - 1
    end = None if is_last_batch else start + batch_size
    return start, end


with tf.Session() as sess:

    # Initialize variables in the graph
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    # Iterate over epochs
    for i in range(epochs):

        # Fresh shuffle of the training pairs for this epoch. The shuffled copies are local
        # to the epoch, so the notebook-level training arrays are never modified and
        # re-running this cell always starts from the same data.
        epoch_order = shuffle_rng.permutation(len(train_source_input))
        epoch_source = train_source_input[epoch_order]
        epoch_target = train_target_input[epoch_order]

        # Accuracy and loss of every mini-batch of this epoch
        total_accuracies, total_losses = [], []

        # Iterate over mini-batches
        for j in range(iterations_training):
            start_index, end_index = batch_bounds(j, iterations_training)
            batch_source = epoch_source[start_index:end_index]
            batch_target = epoch_target[start_index:end_index]

            # Decoder outputs that the network expects as an extra input
            dec_out_tmp = get_decoder_outputs(len(batch_source),
                                              decoder_length,
                                              lstm_hidden_units,
                                              batch_source,
                                              batch_target[:, :-1])

            # One optimisation step: decoder input drops the last token, labels drop the first
            _, avg_accuracy, avg_loss = sess.run([optimizer, accuracy, loss], feed_dict={
                                                input_sequence: batch_source,
                                                output_sequence: batch_target[:, :-1],
                                                target_labels: batch_target[:, 1:],
                                                input_keep_prob: keep_dropout_prob,
                                                output_keep_prob: keep_dropout_prob,
                                                decoder_outputs_tensor: dec_out_tmp })

            # Add values for this mini-batch iteration
            total_losses.append(avg_loss)
            total_accuracies.append(avg_accuracy)

            # Print loss and accuracy
            if (j + 1) % print_every == 0:
                print("Loss: " + str(avg_loss) + ", Accuracy: " + str(avg_accuracy))

            # Statistics on validation set
            if (j + 1) % validate_every == 0:

                # Accumulate validation statistics
                val_accuracies, val_losses = [], []

                # Iterate over validation mini-batches. The bounds depend on the validation
                # index `k` (not on the training index `j`), so the last validation batch
                # always includes the leftover samples.
                for k in range(iterations_validation):
                    val_start, val_end = batch_bounds(k, iterations_validation)
                    val_source = val_source_input[val_start:val_end]
                    val_target = val_target_input[val_start:val_end]

                    # NOTE(review): get_decoder_outputs feeds the training keep probability
                    # (keep_dropout_prob) on this call, so these decoder outputs are still
                    # computed with dropout active even though 1.0 is fed below.
                    dec_out_tmp = get_decoder_outputs(len(val_source),
                                                      decoder_length,
                                                      lstm_hidden_units,
                                                      val_source,
                                                      val_target[:, :-1])

                    # No optimisation step and no dropout for validation
                    val_batch_accuracy, val_batch_loss = sess.run([accuracy, loss], feed_dict={
                                            input_sequence: val_source,
                                            output_sequence: val_target[:, :-1],
                                            target_labels: val_target[:, 1:],
                                            input_keep_prob: 1.0,
                                            output_keep_prob: 1.0,
                                            decoder_outputs_tensor: dec_out_tmp })

                    # Statistics over the mini-batch
                    val_losses.append(val_batch_loss)
                    val_accuracies.append(val_batch_accuracy)

                # Average validation accuracy over batches
                final_val_accuracy = np.mean(val_accuracies)

                # Save model if validation accuracy better
                if final_val_accuracy > max_val_acc:
                    consecutive_validation_without_saving = 0
                    max_val_acc = final_val_accuracy
                    print("VALIDATION loss: " + str(np.mean(val_losses)) + ", accuracy: " + str(final_val_accuracy))
                    save_path = saver.save(sess, "./checkpoints/model.ckpt")
                else:
                    # Count every validation run that did not improve the accuracy
                    consecutive_validation_without_saving += 1

                # Many consecutive validation runs without improvement.
                # NOTE(review): early stopping is currently disabled - the flag below is commented
                # out, so `training_overfit` never becomes True and only a debug marker is printed.
                if consecutive_validation_without_saving >= 30:
                    #training_overfit = True
                    print(1)
        # Epoch statistics
        print("Training epoch: " + str(i+1) + ", AVG loss: " + str(np.mean(np.array(total_losses))) +
              ", AVG accuracy: " + str(np.mean(np.array(total_accuracies))) + "\n")

        if training_overfit:
            print("Early stopping training because it starts overfitting")
            break

Training iterations per epoch: 3
Loss: 6.037218, Accuracy: 0.001953125
VALIDATION loss: 5.704887, accuracy: 0.38875
Loss: 5.7036953, Accuracy: 0.30273438
VALIDATION loss: 5.386392, accuracy: 0.51125
Loss: 5.338003, Accuracy: 0.5503472
VALIDATION loss: 5.06067, accuracy: 0.5125
Training epoch: 1, AVG loss: 5.692972, AVG accuracy: 0.28501156

Loss: 5.0241504, Accuracy: 0.5361328
Loss: 4.614168, Accuracy: 0.5498047
Loss: 4.282479, Accuracy: 0.5486111
Training epoch: 2, AVG loss: 4.640266, AVG accuracy: 0.5448496

Loss: 3.9157202, Accuracy: 0.5449219
Loss: 3.659391, Accuracy: 0.53808594
Loss: 3.2987156, Accuracy: 0.5503472
Training epoch: 3, AVG loss: 3.624609, AVG accuracy: 0.54445165

Loss: 3.1242027, Accuracy: 0.5390625
Loss: 2.8814511, Accuracy: 0.5576172
Loss: 2.8445296, Accuracy: 0.5399306
Training epoch: 4, AVG loss: 2.950061, AVG accuracy: 0.54553676

Loss: 2.743403, Accuracy: 0.54296875
Loss: 2.6450417, Accuracy: 0.5546875
Loss: 2.6968482, Accuracy: 0.5381944
VALIDATION loss: 2.89

## Testing network

### Rebuild the graph quickly if you want to run only this part of the notebook

In [21]:
# NOTE(review): besides the two pickles, this cell still needs `embedding_size`,
# `lstm_hidden_units` and `depth_lstm_bidirectional_layers` from the parameters cell,
# and the prediction cell below needs `get_decoder_outputs` - run those cells first.

# Load dictionaries from pickle
english_dictionary = text_processing.load_dump("./dumps/eng_dict.pickle")
german_dictionary = text_processing.load_dump("./dumps/ger_dict.pickle")

# Drop whatever graph is currently loaded before rebuilding the network
tf.reset_default_graph()

# Placeholders
# Source sentences as int32 indices; the second dimension is fixed to the padded English length
input_sequence = tf.placeholder(tf.int32, (None, english_dictionary.max_length_sentence), 'inputs')
# Decoder input: the German sentence generated so far
output_sequence = tf.placeholder(tf.int32, (None, None), 'output')
# Labels placeholder; not fed by the prediction cell below
target_labels = tf.placeholder(tf.int32, (None, None), 'targets')
# Dropout keep probabilities, fed with plain floats.
# `(None)` is just `None` (not a tuple), so the shape of these placeholders is left unspecified.
input_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_input')
output_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_output')

# Decoder outputs fed back into the network; filled in by get_decoder_outputs.
# NOTE(review): requests the name 'output', which output_sequence already uses.
decoder_outputs_tensor = tf.placeholder(tf.float32, (None, german_dictionary.max_length_sentence-1, 
                                                     lstm_hidden_units * 2), 'output')

# Create graph for the network
# Vocabulary sizes are taken from the reloaded dictionaries
logits, dec_output = neural_network.create_network(input_sequence, 
                                                   output_sequence, 
                                                   input_keep_prob,
                                                   output_keep_prob,
                                                   decoder_outputs_tensor,
                                                   len(english_dictionary.index_to_word), 
                                                   len(german_dictionary.index_to_word), 
                                                   embedding_size,
                                                   lstm_hidden_units,
                                                   depth_lstm_bidirectional_layers)
# Predictions
# Probabilities over the German vocabulary and the most likely word index per position
scores = tf.nn.softmax(logits)
predictions = tf.to_int32(tf.argmax(scores, axis=2))

Tensor("encoder/concat:0", shape=(?, 4, 128), dtype=float32)
Tensor("decoder/ExpandDims:0", shape=(?, 8, 1, 128), dtype=float32)
Tensor("decoder/dense/BiasAdd:0", shape=(?, 8, 4, 32), dtype=float32)
Tensor("decoder/dense_1/BiasAdd:0", shape=(?, 8, 1, 32), dtype=float32)
Tensor("decoder/Tanh:0", shape=(?, 8, 4, 32), dtype=float32)
Tensor("decoder/dense_2/BiasAdd:0", shape=(?, 8, 4, 1), dtype=float32)
Tensor("decoder/transpose_1:0", shape=(?, 8, 4, 1), dtype=float32)
Tensor("decoder/Sum:0", shape=(?, 8, 128), dtype=float32)
Tensor("decoder/embedding_lookup/Identity:0", shape=(?, ?, 256), dtype=float32)
Tensor("decoder/concat_2:0", shape=(?, 8, 384), dtype=float32)
Tensor("decoder/rnn/transpose_1:0", shape=(?, 8, 128), dtype=float32)
Tensor("dense/BiasAdd:0", shape=(?, 8, 403), dtype=float32)


### Perform test predictions

In [22]:
# TF variables
init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Index of the <END> token in the German dictionary (the sample printed earlier in the
# notebook shows <PAD> = 0, <START> = 1, <END> = 2)
end_token_index = 2

# The network predicts max_length_sentence - 1 positions, so a translation can never be
# longer than that; decoding must stop there even if <END> was not produced
max_decoding_steps = german_dictionary.max_length_sentence - 1

with tf.Session() as sess:
    # The initial values are overwritten by the checkpoint restored right after
    sess.run(init)
    saver.restore(sess, "./checkpoints/model.ckpt")

    test_source_sentence = ["hello"]
    # DEBUG: translate the whole English validation set instead
    #test_source_sentence = validation_english

    for source_sentence in test_source_sentence:

        # ONLY IF VALIDATION ENGLISH DATASET USED (DEBUG)
        #source_sentence = " ".join(source_sentence)

        # Normalize & tokenize (cut if longer than max_length_source)
        source_preprocessed = text_processing.preprocess_sentence(source_sentence)

        # Convert to numbers
        source_encoded = english_dictionary.text_to_indices(source_preprocessed)

        # Add padding
        source_input = text_processing.pad_sentence(source_encoded, english_dictionary.max_length_sentence)

        # Starting target sentence in German
        target_sentence = [["<START>"]]
        target_encoded = german_dictionary.text_to_indices(target_sentence[0])

        # Greedy decoding: at step i the network sees the words generated so far and the word
        # predicted for position i is appended. The loop is bounded by the number of positions
        # the network outputs, so a sentence without <END> ends cleanly instead of indexing
        # past the end of the prediction.
        for i in range(max_decoding_steps):

            target_encoded_pad = text_processing.pad_sentence(target_encoded,
                                                              german_dictionary.max_length_sentence - 1,
                                                              pad_before=False)

            # NOTE(review): get_decoder_outputs feeds the training keep probability
            # (keep_dropout_prob) on this call, so dropout is still active for these outputs.
            dec_out_tmp = get_decoder_outputs(1,
                                              german_dictionary.max_length_sentence - 1,
                                              lstm_hidden_units,
                                              [source_input],
                                              [target_encoded_pad])

            # Perform prediction
            pred = sess.run(predictions, feed_dict={input_sequence: [source_input],
                                                    output_sequence: [target_encoded_pad],
                                                    input_keep_prob: 1.0,
                                                    output_keep_prob: 1.0,
                                                    decoder_outputs_tensor: dec_out_tmp })

            # Accumulate
            word_predicted = pred[0][i]
            target_encoded.append(word_predicted)

            # Stop as soon as the end-of-sentence token is generated
            if word_predicted == end_token_index:
                break

        print(english_dictionary.indices_to_text(source_input) + " => "
              + german_dictionary.indices_to_text(target_encoded))

INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt
<PAD> <PAD> <PAD> hello => <START> tom <END>
