# Neural Machine Translation

- Input is a sentence (sequence) in English 
- Output is the corresponding sequence in German
- Encoder Decoder models with a Deep Bidirectional LSTM

# TODO

1. Implement attention
2. Comment CODE!!

TRAINING with BIGGER DATASET (so far trained only with 60 sentence and it was working)

## Import needed libraries

In [1]:
import tensorflow as tf
import numpy as np

import src.text_processing as text_processing
import src.dictionary as dictionary
import src.neural_network as neural_network

# Update python files
%load_ext autoreload
%autoreload 2

  from ._conv import register_converters as _register_converters


## Data processing

### Read dataset

In [2]:
# Read file containing english and german translations
data = text_processing.load_doc("./dataset/ENG_to_GER.txt")

# Split data into english and german
english_sentences, german_sentences = text_processing.prepare_data(data)

# Check and print number of sentences from one language to the other
assert(len(english_sentences) == len(german_sentences))
print(english_sentences.shape)

# Example of sentence with translation
print(english_sentences[18])
print(german_sentences[18])

(55000,)
['i', 'know']
['ich', 'weiß']


### Split dataset (training + validation)

In [3]:
# Split percentage of training and validation
split_percentage = 0.8

# Count how many samples into training dataset
total_dataset = len(english_sentences)
train_dataset = int(total_dataset * split_percentage)

# Set random seed to have always same training and validation split
np.random.seed(42)
train_indices = np.random.choice(total_dataset, train_dataset, replace=False)

# Get training data for the two languages
training_english = english_sentences[train_indices]
training_german = german_sentences[train_indices]

# Get validation data
validation_english = np.delete(english_sentences, train_indices)
validation_german = np.delete(german_sentences, train_indices)

print("Training samples: " + str(training_english.shape[0]))
print("Validation samples: " + str(validation_english.shape[0]))

# Reset seed for randomness
np.random.seed()

Training samples: 44000
Validation samples: 11000


### Create dictionaries for the two languages

In [4]:
# Calculate longest sentence in the two languages
english_max_length = text_processing.max_length_sentence(training_english)
german_max_length = text_processing.max_length_sentence(training_german) + 2  # + 2 because of <START> and <END> the beginning

print("Longest sentence in English has " + str(english_max_length) + " tokens.")
print("Longest sentence in German has " + str(german_max_length) + " tokens.")
print()

# Create dictionaries
english_dictionary = dictionary.LanguageDictionary(training_english, english_max_length)
german_dictionary = dictionary.LanguageDictionary(training_german, german_max_length)

# Calculate size of the dictionaries
english_dictionary_size = len(english_dictionary.index_to_word)
german_dictionary_size = len(german_dictionary.index_to_word)

print("English dictionary size: " + str(english_dictionary_size))
print("German dictionary size: " + str(german_dictionary_size))

# Save dictionaries
text_processing.save_dump(english_dictionary, "./dumps/eng_dict.pickle")
text_processing.save_dump(german_dictionary, "./dumps/ger_dict.pickle")

Longest sentence in English has 11 tokens.
Longest sentence in German has 14 tokens.

English dictionary size: 7824
German dictionary size: 12472


### Prepare sequences for the Neural Network

In [5]:
# Prepare sequences of training data
train_source_input, train_target_input = text_processing.prepare_sequences(training_english, 
                                                                       training_german, 
                                                                       english_dictionary, 
                                                                       german_dictionary)

# Prepare sequences of validation data
val_source_input, val_target_input = text_processing.prepare_sequences(validation_english, 
                                                                       validation_german, 
                                                                       english_dictionary, 
                                                                       german_dictionary)

# Check if same number of samples
assert(len(train_source_input) == len(train_target_input))
assert(len(val_source_input) == len(val_target_input))

# Print shapes data
print("Training samples : " + str(len(train_source_input)))
print(train_source_input.shape)
print(train_target_input.shape)

print("Validation samples : " + str(len(val_source_input)))
print(val_source_input.shape)
print(val_target_input.shape)

Training samples : 44000
(44000, 11)
(44000, 14)
Validation samples : 11000
(11000, 11)
(11000, 14)


### Print sample input data in English, German and next word to be predicted in German

In [6]:
sample_sentence_index = 7
print(train_source_input[sample_sentence_index])
print(train_target_input[sample_sentence_index])

print("SOURCE => " + english_dictionary.indices_to_text(train_source_input[sample_sentence_index]))
print("TARGET => " + german_dictionary.indices_to_text(train_target_input[sample_sentence_index]))

[ 0  0  0  0  0  0 37 38 39 40 41]
[ 1 32 33 34 35 36  2  0  0  0  0  0  0  0]
SOURCE => <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> tom never tells me anything
TARGET => <START> tom erzählt mir nie etwas <END> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


## Neural Network

### Parameters

In [7]:
epochs = 150
batch_size = 128
embedding_size = 128
lstm_hidden_units = 128
lr = 1e-3
depth_lstm_bidirectional_layers = 2
keep_dropout_prob = 0.75

### Create Seq2seq neural network graph

In [8]:
tf.reset_default_graph()

# Placeholders
input_sequence = tf.placeholder(tf.int32, (None, english_dictionary.max_length_sentence), 'inputs')
output_sequence = tf.placeholder(tf.int32, (None, None), 'output')
target_labels = tf.placeholder(tf.int32, (None, None), 'targets')
input_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_input')
output_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_output')

# Create graph for the network
logits = neural_network.create_network(input_sequence, 
                                       output_sequence, 
                                       input_keep_prob,
                                       output_keep_prob,
                                       english_dictionary_size, 
                                       german_dictionary_size, 
                                       embedding_size,
                                       lstm_hidden_units,
                                       depth_lstm_bidirectional_layers,
                                       verbose=1)

Input sequence: [None, 11]
Encoder embedding: [None, 11, 128]
Encoder FW last_state: [None, 128]
Encoder BW last_state: [None, 128]
Decoder output: [None, None, 256]
Logits: [None, None, 12472]


### Set the loss function, optimizer and other useful tensors

In [9]:
# Cross entropy loss after softmax of logits
ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=target_labels)
loss = tf.reduce_mean(ce)

# Using Adam (Adaptive learning rate + momentum) for the update of the weights of the network
optimizer = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

# Useful tensors
scores = tf.nn.softmax(logits)
predictions = tf.to_int32(tf.argmax(scores, axis=2))
correct_mask = tf.to_float(tf.equal(predictions, target_labels))
accuracy = tf.reduce_mean(tf.reduce_mean(tf.cast(correct_mask, tf.float32), axis=1))



# MODIFY TENSORS FOR ACCURACY in case of teaching forcing methods
#correct_mask_int = tf.equal(predictions[:,-1], target_labels[:,-1])
#correct_mask = tf.to_float(tf.equal(predictions[:,-1], target_labels[:,-1]))
#accuracy = tf.reduce_mean(tf.cast(correct_mask, tf.float32))

### Training of the network

In [10]:
'''
    If NOT using teacher forcing method:
        - train_target_input[start_index:end_index, :-1],
        - target_labels: train_target_input[start_index:end_index, 1:],
'''

'''
    TEACHER FORCING ONLY 1 target output

    decoder_input = np.expand_dims(target_sentences[:, j], 1) 
    decoder_output = np.expand_dims(target_sentences[:, j+1], 1) 

    encoder_input = [source_sentences[i] for i in range(len(decoder_output)) if decoder_output[i] != 0]
    decoder_input = [decoder_input[i] for i in range(len(decoder_output)) if decoder_output[i] != 0]
    decoder_output = [decoder_output[i] for i in range(len(decoder_output)) if decoder_output[i] != 0]

    assert(len(encoder_input) == len(decoder_input) == len(decoder_output))
'''       

'\n    TEACHER FORCING ONLY 1 target output\n\n    decoder_input = np.expand_dims(target_sentences[:, j], 1) \n    decoder_output = np.expand_dims(target_sentences[:, j+1], 1) \n\n    encoder_input = [source_sentences[i] for i in range(len(decoder_output)) if decoder_output[i] != 0]\n    decoder_input = [decoder_input[i] for i in range(len(decoder_output)) if decoder_output[i] != 0]\n    decoder_output = [decoder_output[i] for i in range(len(decoder_output)) if decoder_output[i] != 0]\n\n    assert(len(encoder_input) == len(decoder_input) == len(decoder_output))\n'

In [11]:
def teacher_forcing_method(session, fetches, source_sentences, target_sentences, target_sentence_length, 
                           dropout_prob, tensor_input, tensor_output, tensor_label, tensor_input_prob,
                           tensor_output_prob):
    
    accuracies, losses = [], []
    total_samples = 0
    for j in range(target_sentence_length - 1):

        decoder_input = target_sentences[:, :j+1]
        decoder_output = target_sentences[:, j+1:j+2]        
        
        #I do not want to predict anything after <END> => choose where next target char is != "<PAD>" (0)
        encoder_input = [source_sentences[i] for i in range(len(decoder_output)) if decoder_output[i] != 0]
        decoder_input = [decoder_input[i] for i in range(len(decoder_output)) if decoder_output[i] != 0]
        decoder_output = [target_sentences[i, 1:j+2] for i in range(len(decoder_output)) if decoder_output[i] != 0]

        assert(len(encoder_input) == len(decoder_input) == len(decoder_output))
    
        # There could be only sentences shorter than the max => break
        if len(encoder_input) == 0:
            break
            
        #print([english_dictionary.indices_to_text(tmp) for tmp in encoder_input])
        #print([german_dictionary.indices_to_text(tmp) for tmp in decoder_input])
        #print([german_dictionary.indices_to_text(tmp) for tmp in decoder_output])
        #print()
        
        # Run TF graph
        _, value_accuracy, value_loss = sess.run(fetches, feed_dict={
                                                tensor_input: encoder_input,
                                                tensor_output: decoder_input,
                                                tensor_label: decoder_output,
                                                tensor_input_prob: dropout_prob,
                                                tensor_output_prob: dropout_prob })
        # Weighted avg of accuracy
        total_samples += len(decoder_output)
        accuracies.append(value_accuracy * float(len(decoder_output)))
        losses.append(value_loss * len(decoder_output))
    
    accuracies = np.array(accuracies)
    losses = np.array(losses)

    return np.sum(accuracies) / float(total_samples), np.sum(losses) / float(total_samples)

In [12]:
# Training data variables
iterations_training = max((len(train_source_input) // batch_size), 1)
print("Training iterations per epoch: " + str(iterations_training))
training_overfit = False
consecutive_validation_without_saving = 0

# Validation data variables
max_val_acc = 0
iterations_validation = max((len(val_source_input) // batch_size), 1)

# Initializer for variables in the graph
init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Perform each epoch, shuffle training dataset
indices = list(range(len(train_source_input)))

with tf.Session() as sess:
    
    # Initialize variables in the graph
    sess.run(init)
    
    # Iterate over epochs
    for i in range(epochs):
        
        # Shuffle data (with random seed for debug) to not train the network always with the same order
        np.random.seed(42)
        np.random.shuffle(indices)
        train_source_input = train_source_input[indices]
        train_target_input = train_target_input[indices]
        
        # Vector accumulating accuracy and loss during for one epoch
        total_accuracies, total_losses = [], []

        # Iterate over mini-batches
        for j in range(iterations_training):
            start_index = j * batch_size
            end_index = (j + 1) * batch_size 
            
            # If last batch, take also elements that are less than batch_size
            if j == (iterations_training - 1):
                end_index += (batch_size - 1)
                
            '''avg_accuracy, avg_loss = teacher_forcing_method(sess, [optimizer, accuracy, loss], 
                                                            train_source_input[start_index:end_index], 
                                                            train_target_input[start_index:end_index], 
                                                            german_dictionary.max_length_sentence, keep_dropout_prob,
                                                            input_sequence, output_sequence, target_labels, 
                                                            input_keep_prob, output_keep_prob)'''

            _, avg_accuracy, avg_loss = sess.run([optimizer, accuracy, loss], feed_dict={
                                            input_sequence: train_source_input[start_index:end_index],
                                            output_sequence: train_target_input[start_index:end_index, :-1],
                                            target_labels: train_target_input[start_index:end_index, 1:],
                                            input_keep_prob: keep_dropout_prob,
                                            output_keep_prob: keep_dropout_prob })
            
            # Add values for this mini-batch iterations
            total_losses.append(avg_loss) 
            total_accuracies.append(avg_accuracy)

            # Print loss and accuracy
            if (j + 1) % 250 == 0:
                print("Loss: " + str(avg_loss) + ", Accuracy: " + str(avg_accuracy))
                
            # Statistics on validation set
            if (j) % 1000 == 0:
                
                # Accumulate validation statistics
                val_accuracies, val_losses = [], []

                # Iterate over mini-batches
                for k in range(iterations_validation):
                    start_index = k * batch_size
                    end_index = (k + 1) * batch_size 
                    
                    if j == (iterations_validation - 1):
                        end_index += (batch_size - 1)

                    '''avg_accuracy, avg_loss = teacher_forcing_method(sess, [scores, accuracy, loss], 
                                                                    val_source_input[start_index:end_index], 
                                                                    val_target_input[start_index:end_index], 
                                                                    german_dictionary.max_length_sentence, 1.0,
                                                                    input_sequence, output_sequence, target_labels, 
                                                                    input_keep_prob, output_keep_prob)'''
                    
                    avg_accuracy, avg_loss = sess.run([accuracy, loss], feed_dict={
                                            input_sequence: val_source_input[start_index:end_index],
                                            output_sequence: val_target_input[start_index:end_index, :-1],
                                            target_labels: val_target_input[start_index:end_index, 1:],
                                            input_keep_prob: 1.0,
                                            output_keep_prob: 1.0 })
                    
                    # Statistics over the mini-batch
                    val_losses.append(avg_loss) 
                    val_accuracies.append(avg_accuracy)
            
                # Average validation accuracy over batches
                final_val_accuracy = np.mean(val_accuracies)
                
                # Save model if validation accuracy better
                if final_val_accuracy > max_val_acc:
                    consecutive_validation_without_saving = 0
                    max_val_acc = final_val_accuracy
                    print("VALIDATION loss: " + str(np.mean(val_losses)) + ", accuracy: " + str(final_val_accuracy))
                    save_path = saver.save(sess, "./checkpoints/model.ckpt")
                else:
                    # Count every time check validation accuracy
                    consecutive_validation_without_saving += 1
                
                # If checked validation time many consecutive times without having improvement in accuracy
                if consecutive_validation_without_saving >= 8:
                    training_overfit = True
            
        # Epoch statistics
        print("Training epoch: " + str(i+1) + ", AVG loss: " + str(np.mean(np.array(total_losses))) + 
              ", AVG accuracy: " + str(np.mean(np.array(total_accuracies))) + "\n")
        
        if training_overfit:
            print("Early stopping training because it starts overfitting")
            break

Training iterations per epoch: 343
VALIDATION loss: 9.314646, accuracy: 0.5230981
Loss: 2.2136345, Accuracy: 0.672476
Training epoch: 1, AVG loss: 2.701314, AVG accuracy: 0.6385796

VALIDATION loss: 2.2098331, accuracy: 0.6856618
Loss: 1.9288319, Accuracy: 0.719351
Training epoch: 2, AVG loss: 2.0662255, AVG accuracy: 0.70159847

VALIDATION loss: 1.9339939, accuracy: 0.720567
Loss: 1.7377322, Accuracy: 0.75120187
Training epoch: 3, AVG loss: 1.8388755, AVG accuracy: 0.72286934

VALIDATION loss: 1.7624063, accuracy: 0.73698384
Loss: 1.6116804, Accuracy: 0.747596
Training epoch: 4, AVG loss: 1.6769046, AVG accuracy: 0.73687994

VALIDATION loss: 1.6370198, accuracy: 0.75038874
Loss: 1.5861386, Accuracy: 0.7487981
Training epoch: 5, AVG loss: 1.54486, AVG accuracy: 0.74928784

VALIDATION loss: 1.5362573, accuracy: 0.761489
Loss: 1.3744733, Accuracy: 0.77463937
Training epoch: 6, AVG loss: 1.4338238, AVG accuracy: 0.7587253

VALIDATION loss: 1.4516664, accuracy: 0.7708781
Loss: 1.3545127, A

Loss: 0.23649451, Accuracy: 0.9344951
Training epoch: 57, AVG loss: 0.23659465, AVG accuracy: 0.935457

Loss: 0.25125268, Accuracy: 0.92788446
Training epoch: 58, AVG loss: 0.2338282, AVG accuracy: 0.936101

VALIDATION loss: 1.0240797, accuracy: 0.86271894
Loss: 0.21116418, Accuracy: 0.9381009
Training epoch: 59, AVG loss: 0.23000114, AVG accuracy: 0.9369295

Loss: 0.21366031, Accuracy: 0.9381009
Training epoch: 60, AVG loss: 0.22691005, AVG accuracy: 0.9377444

VALIDATION loss: 1.0246682, accuracy: 0.8630231
Loss: 0.2003677, Accuracy: 0.9441105
Training epoch: 61, AVG loss: 0.22286798, AVG accuracy: 0.9390447

VALIDATION loss: 1.0303345, accuracy: 0.8634968
Loss: 0.19871448, Accuracy: 0.9435095
Training epoch: 62, AVG loss: 0.21930826, AVG accuracy: 0.93951577

VALIDATION loss: 1.0281079, accuracy: 0.8637584
Loss: 0.19946466, Accuracy: 0.9435095
Training epoch: 63, AVG loss: 0.21640784, AVG accuracy: 0.94032204

Loss: 0.19614601, Accuracy: 0.9435095
Training epoch: 64, AVG loss: 0.213

KeyboardInterrupt: 

In [None]:
print([english_dictionary.indices_to_text(tmp) for tmp in train_source_input[1:4]])
print()
print([german_dictionary.indices_to_text(tmp) for tmp in train_target_input[1:4, :-1]])
print()
print([german_dictionary.indices_to_text(tmp) for tmp in train_target_input[1:4, 1:]])

## Testing network

### Rebuild graph quickly if want to run only this part of the notebook

In [None]:
# Parameters network
#embedding_size = 300
#lstm_neurons = 256

# Load dictionaries from pickle
english_dictionary = text_processing.load_dump("./dumps/eng_dict.pickle")
german_dictionary = text_processing.load_dump("./dumps/ger_dict.pickle")

tf.reset_default_graph()

# Placeholders
input_sequence = tf.placeholder(tf.int32, (None, english_dictionary.max_length_sentence), 'inputs')
output_sequence = tf.placeholder(tf.int32, (None, None), 'output')
target_labels = tf.placeholder(tf.int32, (None, None), 'targets')
input_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_input')
output_keep_prob = tf.placeholder(tf.float32, (None), 'dropout_output')

# Create graph for the network
logits = neural_network.create_network(input_sequence, 
                                       output_sequence, 
                                       input_keep_prob,
                                       output_keep_prob,
                                       len(english_dictionary.index_to_word), 
                                       len(german_dictionary.index_to_word), 
                                       embedding_size,
                                       lstm_hidden_units,
                                       depth_lstm_bidirectional_layers,
                                       verbose=0)
# Predictions
scores = tf.nn.softmax(logits)
#max_score = tf.reduce_max(scores, axis=1)
predictions = tf.to_int32(tf.argmax(scores, axis=2))

### Perform test predictions

In [26]:
# TF variables
init = tf.global_variables_initializer()
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(init)
    saver.restore(sess, "./checkpoints/model.ckpt") 

    test_source_sentence = ["If I had eyes, I would look at you"]
    #test_source_sentence = validation_english

    for source_sentence in test_source_sentence:

        # ONLY IF VALIDATION ENGLISH DATASET USED (DEBUG)
        #source_sentence = " ".join(source_sentence)

        # Normalize & tokenize (cut if longer than max_length_source)  
        source_preprocessed = text_processing.preprocess_sentence(source_sentence)
        
        # Convert to numbers
        source_encoded = english_dictionary.text_to_indices(source_preprocessed)
        
        # Add padding
        source_input = text_processing.pad_sentence(source_encoded, english_dictionary.max_length_sentence)
        #print(english_dictionary.indices_to_text(source_input))
        
        # Starting target sentence in German
        target_sentence = [["<START>"]]
        target_encoded = german_dictionary.text_to_indices(target_sentence[0])

        i = 0
        word_predicted = 0
        while word_predicted != 2: # If <END> (index 2), stop

            # Perform prediction
            pred = sess.run(predictions, feed_dict={input_sequence: [source_input], 
                                                    output_sequence: [target_encoded],
                                                    input_keep_prob: 1.0,
                                                    output_keep_prob: 1.0 })
            
            # Accumulate
            target_encoded.append(pred[0][i])
            word_predicted = pred[0][i]
            i += 1

        print(english_dictionary.indices_to_text(source_input) + " => "
              + german_dictionary.indices_to_text(target_encoded))

INFO:tensorflow:Restoring parameters from ./checkpoints/model.ckpt
<PAD> <PAD> if i had eyes i would look at you => <START> wenn ich schau würde ich dich sehen <END>
