In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import math
import os
import h5py

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# from tf.keras.models import Model  # This does not work!
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [None]:
# Hyper-parameters and settings used by the cells below.
num_words = 10000  # vocabulary size: passed to the tokenizers, the embedding layers and the output Dense layer
num_epochs = 10  # number of epochs passed to model_train.fit()
state_size = 512  # number of units in every GRU layer and size of the decoder's initial-state input
embedding_size = 128  # output dimension of the encoder and decoder embedding layers
num_layers = 3  # NOTE(review): not used by any active code in the visible part of the notebook
shutdown = False  # NOTE(review): not used in the visible part of the notebook
path_checkpoint = 'default_checkpoint.keras'  # file the model weights are saved to and reloaded from

In [3]:
tf.__version__

'1.5.0'

In [4]:
tf.keras.__version__

'2.1.2-tf'

Download data set from Europarl

In [5]:
import europarl

Using French to English dataset

In [6]:
language_code='fr'

Tokens for start and end of translation sequence

In [7]:
# Marker words for the start and end of every target-language text.
# The space is part of each marker so that it remains a separate word
# when joined to a text; the stripped form is used for vocabulary lookups.
# NOTE(review): presumably europarl.load_data() prepends `start` and
# appends `end` to each text -- confirm against the europarl module.
mark_start = 'ssss '
mark_end = ' eeee'

In [9]:
europarl.maybe_download_and_extract(language_code=language_code)

Data has apparently already been downloaded and unpacked.


Load data for source language

In [1]:
data_src = europarl.load_data(english=False,
                              language_code=language_code)

NameError: name 'europarl' is not defined

Load data for target language

In [11]:
data_dest = europarl.load_data(english=True,
                               language_code=language_code,
                               start=mark_start,
                               end=mark_end)

Tokenize the data: convert each text to a sequence of integer-tokens by assigning a unique integer to every word. The embedding layer later converts these integer-tokens into vectors of floating-point numbers. Only the `num_words` most frequent words in the data-set are used.

In [19]:
class TokenizerWrap(Tokenizer):
    """Wrap the Tokenizer-class from Keras with more functionality."""

    def __init__(self, texts, padding,
                 reverse=False, num_words=None):
        """
        Build the vocabulary from the texts and convert the texts to a
        padded / truncated matrix of integer-tokens.

        :param texts: List of strings. This is the data-set.
        :param padding: Either 'post' or 'pre' padding.
        :param reverse: Boolean whether to reverse token-lists.
        :param num_words: Max number of words to use.
        """

        super().__init__(num_words=num_words)

        # Remember the padding side so that text_to_tokens() pads new
        # texts on the same side as the data-set.
        self.padding = padding

        # Create the vocabulary from the texts.
        self.fit_on_texts(texts)

        # Create inverse lookup from integer-tokens to words.
        self.index_to_word = {index: word
                              for word, index in self.word_index.items()}

        # Convert all texts to lists of integer-tokens.
        # Note that the sequences may have different lengths.
        self.tokens = self.texts_to_sequences(texts)

        if reverse:
            # Reverse the token-sequences.
            self.tokens = [list(reversed(x)) for x in self.tokens]

            # Sequences that are too long should now be truncated
            # at the beginning, which corresponds to the end of
            # the original sequences.
            truncating = 'pre'
        else:
            # Sequences that are too long should be truncated
            # at the end.
            truncating = 'post'

        # The number of integer-tokens in each sequence.
        self.num_tokens = [len(x) for x in self.tokens]

        # Max number of tokens to use in all sequences:
        # the mean length plus two standard deviations.
        # We will pad / truncate all sequences to this length.
        # This is a compromise so we save a lot of memory and
        # only have to truncate maybe 5% of all the sequences.
        self.max_tokens = int(np.mean(self.num_tokens)
                              + 2 * np.std(self.num_tokens))

        # Pad / truncate all token-sequences to the given length.
        # This creates a 2-dim numpy matrix that is easier to use.
        self.tokens_padded = pad_sequences(self.tokens,
                                           maxlen=self.max_tokens,
                                           padding=padding,
                                           truncating=truncating)

    def token_to_word(self, token):
        """
        Lookup a single word from an integer-token.

        The token 0 (used for padding) gives a single space.
        """

        return " " if token == 0 else self.index_to_word[token]

    def tokens_to_string(self, tokens):
        """
        Convert a list of integer-tokens to a string.

        Tokens equal to 0 (padding) are skipped and the remaining
        words are joined with a single space between them.
        """

        words = [self.index_to_word[token]
                 for token in tokens
                 if token != 0]

        return " ".join(words)

    def text_to_tokens(self, text, reverse=False, padding=False):
        """
        Convert a single text-string to tokens with optional
        reversal and padding.

        :param text: The text-string to convert.
        :param reverse: Boolean whether to reverse the tokens.
        :param padding: Boolean whether to pad / truncate the tokens
            to self.max_tokens. The padding is added on the side
            ('pre' or 'post') that was given to the constructor.
        :return: 2-dim numpy array holding the single token-sequence.
        """

        # Convert to tokens. Note that we assume there is only
        # a single text-string so we wrap it in a list.
        tokens = self.texts_to_sequences([text])
        tokens = np.array(tokens)

        if reverse:
            # Reverse the tokens.
            tokens = np.flip(tokens, axis=1)

            # Sequences that are too long should now be truncated
            # at the beginning, which corresponds to the end of
            # the original sequences.
            truncating = 'pre'
        else:
            # Sequences that are too long should be truncated
            # at the end.
            truncating = 'post'

        if padding:
            # Pad and truncate the sequence to the same length and on
            # the same side as the data-set of this tokenizer.
            tokens = pad_sequences(tokens,
                                   maxlen=self.max_tokens,
                                   padding=self.padding,
                                   truncating=truncating)

        return tokens

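Create the tokenizers for the source and target languages. The source sequences are reversed and padded with zeros at the beginning; the target sequences keep their order and are padded with zeros at the end.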

In [20]:
%%time
# Tokenizer for the source language, built from the whole source data-set.
tokenizer_src = TokenizerWrap(texts=data_src,
                              padding='pre', # add the zero-padding at the start of each sequence
                              reverse=True,  # reverse the order of the tokens in each sequence
                              num_words=num_words)

Wall time: 1min 56s


In [21]:
%%time
# Tokenizer for the target language: the token order is kept and the
# zero-padding is added at the end of each sequence.
tokenizer_dest = TokenizerWrap(texts=data_dest,
                               padding='post',
                               reverse=False,
                               num_words=num_words)

Wall time: 1min 17s


In [22]:
tokens_src = tokenizer_src.tokens_padded
tokens_dest = tokenizer_dest.tokens_padded

(2007723, 55)
(2007723, 56)


In [23]:
token_start = tokenizer_dest.word_index[mark_start.strip()]

2

In [24]:
token_end = tokenizer_dest.word_index[mark_end.strip()]

3

In [32]:
encoder_input_data = tokens_src

In [33]:
# Decoder input: every target sequence without its last token.
decoder_input_data = tokens_dest[:, :-1]

(2007723, 55)

In [34]:
# Decoder target: every target sequence without its first token, so that
# position i of the target holds token i + 1 of the padded target sequence.
decoder_output_data = tokens_dest[:, 1:]

(2007723, 55)

In [40]:
encoder_input = Input(shape=(None, ), name='encoder_input')

This is the embedding-layer.

In [41]:
encoder_embedding = Embedding(input_dim=num_words,
                              output_dim=embedding_size,
                              name='encoder_embedding')

This creates the 3 GRU layers that will map from a sequence of embedding-vectors to a single "thought vector" which summarizes the contents of the input-text. Note that the last GRU-layer does not return a sequence.

In [42]:

# The three stacked GRU layers of the encoder. The first two return
# their whole output sequence; the last one returns only its final output.
encoder_gru1 = GRU(state_size, return_sequences=True, name='encoder_gru1')
encoder_gru2 = GRU(state_size, return_sequences=True, name='encoder_gru2')
encoder_gru3 = GRU(state_size, return_sequences=False, name='encoder_gru3')

In [43]:
def connect_encoder():
    """
    Wire the encoder layers together.

    The encoder's input-layer is connected to the embedding-layer,
    whose output is passed through the three GRU-layers in order.

    :return: Output tensor of the last GRU-layer.
    """

    # Input-layer followed by the embedding-layer.
    hidden = encoder_embedding(encoder_input)

    # Stack the GRU-layers on top of each other.
    for gru_layer in (encoder_gru1, encoder_gru2, encoder_gru3):
        hidden = gru_layer(hidden)

    # The output of the last GRU-layer is the output of the encoder.
    return hidden

In [44]:
encoder_output = connect_encoder()

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [45]:
decoder_initial_state = Input(shape=(state_size,),
                              name='decoder_initial_state')

The decoder also needs a sequence of integer-tokens as inputs. During training we will supply this with a full sequence of integer-tokens e.g. corresponding to the text "ssss once upon a time eeee". 

During inference when we are translating new input-texts, we will start by feeding a sequence with just one integer-token for "ssss" which marks the beginning of a text, and combined with the "thought vector" from the encoder, the decoder will hopefully be able to produce the correct next word e.g. "once".

In [46]:
decoder_input = Input(shape=(None, ), name='decoder_input')

In [47]:
decoder_embedding = Embedding(input_dim=num_words,
                              output_dim=embedding_size,
                              name='decoder_embedding')

In [48]:
# The three stacked GRU layers of the decoder.
# Every one of them returns its whole output sequence.
decoder_gru1 = GRU(state_size, return_sequences=True, name='decoder_gru1')
decoder_gru2 = GRU(state_size, return_sequences=True, name='decoder_gru2')
decoder_gru3 = GRU(state_size, return_sequences=True, name='decoder_gru3')

In [49]:
decoder_dense = Dense(num_words,
                      activation='linear',
                      name='decoder_output')

In [50]:
def connect_decoder(initial_state):
    """
    Wire the decoder layers together.

    The decoder's input-layer is connected to the embedding-layer,
    whose output is passed through the three GRU-layers in order
    and finally through the dense output-layer.

    :param initial_state: Tensor used as the initial state of
        every one of the three GRU-layers.
    :return: Output tensor of the dense output-layer.
    """

    # Input-layer followed by the embedding-layer.
    hidden = decoder_embedding(decoder_input)

    # Stack the GRU-layers; each of them starts from the same state.
    for gru_layer in (decoder_gru1, decoder_gru2, decoder_gru3):
        hidden = gru_layer(hidden, initial_state=initial_state)

    # The dense layer is the output of the decoder.
    return decoder_dense(hidden)

### Connect and Create the Models

In [51]:
# Training model: the decoder's GRU layers start from the encoder's
# output, so the model goes from the encoder input and the decoder
# input to the decoder's output.
decoder_output = connect_decoder(initial_state=encoder_output)

model_train = Model(inputs=[encoder_input, decoder_input],
                    outputs=[decoder_output])

In [52]:
model_encoder = Model(inputs=[encoder_input],
                      outputs=[encoder_output])

In [53]:
# Stand-alone decoder model: the initial state is fed in through its
# own input layer instead of coming from the encoder. connect_decoder()
# reuses the same decoder layer objects as for the training model.
# Note that this rebinds the global name `decoder_output`.
decoder_output = connect_decoder(initial_state=decoder_initial_state)

model_decoder = Model(inputs=[decoder_input, decoder_initial_state],
                      outputs=[decoder_output])

In [54]:
# model_train.compile(optimizer=optimizer,
#                     loss='sparse_categorical_crossentropy')

In [55]:
def sparse_cross_entropy(y_true, y_pred):
    """
    Mean sparse softmax cross-entropy between y_true and y_pred.

    :param y_true: The desired output as integer-tokens. Intended to
        be a 2-rank tensor of shape [batch_size, sequence_length].
    :param y_pred: The decoder's output, passed to the loss as logits.
        Intended to be a 3-rank tensor of shape
        [batch_size, sequence_length, num_words].
    :return: Scalar tensor with the mean loss over all elements.
    """

    # One loss-value for every integer-token in y_true.
    token_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=y_pred, labels=y_true)

    # Reduce to a single scalar ourselves instead of relying on
    # how Keras would reduce the loss across the batch.
    return tf.reduce_mean(token_losses)

### Compile the Training Model

We have used the Adam optimizer in many of the previous tutorials, but it seems to diverge in some of these experiments with Recurrent Neural Networks. RMSprop seems to work much better for these.

In [56]:
optimizer = RMSprop(lr=1e-3)

There seems to be another bug in Keras so it cannot automatically deduce the correct shape of the decoder's output data. We therefore need to manually create a placeholder variable for the decoder's output. The shape is set to `(None, None)` which means the batch can have an arbitrary number of sequences, which can have an arbitrary number of integer-tokens.

In [57]:
decoder_target = tf.placeholder(dtype='int32', shape=(None, None))

We can now compile the model using our custom loss-function.

In [58]:
model_train.compile(optimizer=optimizer,
                    loss=sparse_cross_entropy,
                    target_tensors=[decoder_target])

Instructions for updating:
keep_dims is deprecated, use keepdims instead


### Callback Functions

During training we want to save checkpoints and log the progress to TensorBoard so we create the appropriate callbacks for Keras.

This is the callback for writing checkpoints during training.

In [59]:
callback_checkpoint = ModelCheckpoint(filepath=path_checkpoint,
                                      monitor='val_loss',
                                      verbose=1,
                                      save_weights_only=True,
                                      save_best_only=True)

This is the callback for stopping the optimization when performance worsens on the validation-set.

In [60]:
callback_early_stopping = EarlyStopping(monitor='val_loss',
                                        patience=3, verbose=1)

This is the callback for writing the TensorBoard log during training.

In [61]:
callback_tensorboard = TensorBoard(log_dir='./21_logs/',
                                   histogram_freq=0,
                                   write_graph=False)

In [67]:
callbacks = [callback_early_stopping,
             callback_checkpoint,
             callback_tensorboard]

### Load Checkpoint

You can reload the last saved checkpoint so you don't have to train the model every time you want to use it.

In [68]:
# Best-effort reload of previously saved weights: if loading fails for
# any reason, the error is only printed and the notebook carries on
# with the weights the model currently has.
try:
    model_train.load_weights(path_checkpoint)
    print("loaded")
except Exception as error:
    print("Error trying to load checkpoint.")
    print(error)

loaded


### Train the Model

We wrap the data in named dicts so we are sure the data is assigned correctly to the inputs and outputs of the model.

In [69]:
# Input data for training, keyed by the names of the model's input-layers.
x_data = {
    'encoder_input': encoder_input_data,
    'decoder_input': decoder_input_data,
}

In [70]:
# Target data for training, keyed by the name of the model's output-layer.
y_data = {
    'decoder_output': decoder_output_data,
}

We want a validation-set of 10000 sequences but Keras needs this number as a fraction.

In [71]:
validation_split = 10000 / len(encoder_input_data)
validation_split

0.004980766769121039

Now we can train the model. One epoch of training took about 1 hour on a GTX 1070 GPU. You probably need to run 10 epochs or more during training. After 10 epochs the loss was about 1.10 on the training-set and about 1.15 on the validation-set.

Note the batch-size of 512 which was chosen because it kept the GPU running at nearly 100% while being within the memory limits of 8GB for this GPU.

In [67]:
model_train.fit(x=x_data,
                y=y_data,
                batch_size=512,
                epochs=num_epochs,
                validation_split=validation_split,
                callbacks=callbacks)

Train on 1997723 samples, validate on 10000 samples
Epoch 1/2

Epoch 2/2



<tensorflow.python.keras._impl.keras.callbacks.History at 0x2d474d3b240>

## Translate Texts

This function translates a text from the source-language to the destination-language and optionally prints a true translation.

In [1]:
import nltk

def translate(input_text, true_output_text=None, output_path=None):
    """
    Translate a single text-string and report the result.

    The input-text and the translated text are printed. If a true
    translation is given, it is printed as well together with the
    BLEU score of the translated text. If an output path is given,
    the same report is also written to that file.

    :param input_text: Text-string in the source-language.
    :param true_output_text: Optional true translation. The BLEU
        score is only calculated when this is given.
    :param output_path: Optional path of a UTF-8 text-file for the
        report. Existing contents of the file are replaced.
        Nothing is written when this is None.
    :return: None.
    """

    # Convert the input-text to integer-tokens.
    # Note the sequence of tokens has to be reversed.
    # Padding is probably not necessary.
    input_tokens = tokenizer_src.text_to_tokens(text=input_text,
                                                reverse=True,
                                                padding=True)

    # Get the output of the encoder's GRU which will be
    # used as the initial state in the decoder's GRU.
    initial_state = model_encoder.predict(input_tokens)

    # Max number of tokens / words in the output sequence.
    max_tokens = tokenizer_dest.max_tokens

    # Pre-allocate the 2-dim array used as input to the decoder.
    # This holds just a single sequence of integer-tokens,
    # but the decoder-model expects a batch of sequences.
    # The builtin int is used because the np.int alias no longer
    # exists in recent NumPy versions.
    decoder_input_data = np.zeros(shape=(1, max_tokens), dtype=int)

    # The first input-token is the special start-token for 'ssss '.
    token_int = token_start

    # Initialize an empty output-text.
    output_text = ''

    # Initialize the number of tokens we have processed.
    count_tokens = 0

    # While we haven't sampled the special end-token for ' eeee'
    # and we haven't processed the max number of tokens.
    while token_int != token_end and count_tokens < max_tokens:
        # Update the input-sequence to the decoder
        # with the last token that was sampled.
        # In the first iteration this will set the
        # first element to the start-token.
        decoder_input_data[0, count_tokens] = token_int

        # Wrap the input-data in a dict for clarity and safety,
        # so we are sure we input the data in the right order.
        x_data = {
            'decoder_initial_state': initial_state,
            'decoder_input': decoder_input_data
        }

        # Note that we input the entire sequence of tokens
        # to the decoder. This wastes a lot of computation
        # because we are only interested in the last input
        # and output, but it keeps the code simple.
        decoder_output = model_decoder.predict(x_data)

        # The decoder's output for the current position, with one
        # value for every word in the vocabulary.
        token_scores = decoder_output[0, count_tokens, :]

        # The integer-token with the highest value is the next word.
        token_int = np.argmax(token_scores)

        # Lookup the word corresponding to this integer-token
        # and append it to the output-text.
        sampled_word = tokenizer_dest.token_to_word(token_int)
        output_text += " " + sampled_word

        # Increment the token-counter.
        count_tokens += 1

    # The report consists of sections with a heading and a body.
    sections = [("Input text:", input_text),
                ("Translated text:", output_text)]

    # The BLEU score needs a true translation to compare against.
    bleu_line = None
    if true_output_text is not None:
        sections.append(("True output text:", true_output_text))

        ref = true_output_text.lower().split()
        pred = output_text.lower().split()

        # Only drop the last word if it is the end-marker. If the
        # loop stopped because max_tokens was reached, the last
        # word is a real word of the translation.
        if pred and token_int == token_end:
            pred.pop()

        smoothing = nltk.translate.bleu_score.SmoothingFunction().method5
        bleu_score = nltk.translate.bleu_score.sentence_bleu(
            [ref], pred, smoothing_function=smoothing)
        bleu_line = "BLEU Score: " + str(bleu_score)

    # Print the report, with an empty line after each section.
    for heading, body in sections:
        print(heading)
        print(body)
        print()

    if bleu_line is not None:
        print(bleu_line)

    # Optionally write the same report to a file.
    if output_path is not None:
        with open(output_path, 'w', encoding='utf-8') as file:
            for heading, body in sections:
                file.write(heading + "\n")
                file.write(body + "\n")

            if bleu_line is not None:
                file.write(bleu_line + "\n")

            file.write("*******************\n")
            

### Examples

Translate a text from the training-data. This translation is quite good. Note how it is not identical to the translation from the training-data, but the actual meaning is similar.

In [151]:
srcTrain_short_fr = [	# less than 10 words
'Les modifications n\'ont pas lieu d\'être',
'Il me semble que c\'est une bonne formule',
'Êtes-vous satisfait du rythme de ces progrès',
'C\'est grotesque et cela doit cesser',
'Le débat est clos',
'Il devrait être considéré séparément',
'J\'ai voté \"pour\"',
'Je voudrais évoquer quelques points spécifiques',
'Elles n\'avaient pas dégagé de majorité à l\'époque',
'Sans approvisionnement, leur santé et leurs vies sont en danger'
]

srcTrain_short_en = [	# less than 10 words
'There is no room for amendments',
'This seems to me to be a workable solution',
'Are you satisfied with this rate of progress',
'This is an obscenity and it must be stopped',
'The debate is closed',
'It should be taken separately',
'My vote is in favour',
'I should like to mention a few specific points',
'There was no majority for this at the time',
'Without supplies, their health and their lives can be at risk',
]

srcTrain_medium_fr = [ # 11-25 words
'Si votre décision est que je ne puis pas donner d\'explication de vote, je l\'accepte, mais avec certaines réserves',
'Il faut que les propositions qui en sortiront offrent un signal clair que l\'Europe doit être fondée sur ses nations et qu\'elle respectera leurs droits',
'Dans un domaine un peu différent, l\'Europe dispose d\'une législation stipulant ce que sont les produits régionaux',
'Dans le premier rapport du Parlement, nous avons déjà souligné quelques points',
'Monsieur le Président, je dois tout d\'abord vous rappeler que nous ne connaissons encore ni les causes exactes, ni le déroulement des événements',
'Mais il s\'agit effectivement d\'un problème auquel nombre de citoyens européens sont confrontés chaque jour',
'Nous disposons d\'assez d\'options dans le système actuel pour assurer l\'ouverture des marchés et la libre concurrence',
'Monsieur le Président, la concurrence est l\'âme et le moteur de la politique européenne en matière de marché intérieur',
'La question qui se pose maintenant est de savoir si l\'Union européenne participe aussi à cette lutte',
'Dans ce contexte, je soutiens les appels à renforcer les contrôles effectués par l\'État du port afin de garantir une inspection complète et efficace',
]

srcTrain_medium_en = [ # 11-25 words
'If your ruling is that I cannot give an explanation of vote, I accept that but with reservations',
'The proposals that result from it must give out a clear signal that Europe must be built on its nations and that it will respect their laws',
'In a slightly different area, there is legislation in Europe which defines precisely what regional products are',
'We have already underlined a number of these principles in Parliament\'s first report',
'Mr President, firstly I have to remind you that we do not yet know the exact causes and the series of events',
'But it is a problem which is affecting many European citizens on a daily basis',
'There are enough other options in the present system to safeguard open markets and free competition',
'Mr President, competition is at the heart of the European internal market policy and is also its driving force',
'But the question is now: is the European Union doing anything about it as well',
'In that context I support calls to strengthen port state control to ensure full and effective inspection',
]

srcTrain_long_fr = [	# 26 > words
'Sans prétendre à l\'exhaustivité, celle-ci garantirait au moins trois points: aucun cargo ou pétrolier ne pourra plus mouiller dans un port de l\'Union européenne s\'il est bon pour la casse; toutes les personnes concernées, y compris l\'affréteur du transport, sont responsables des dégâts qui peuvent survenir; et ces personnes doivent contracter des assurances suffisantes',
'Je voudrais, Madame la Commissaire, vous égrener rapidement puisqu\'ici le temps est très compté, les sept points qui nous paraissent, et vous en avez d\'ailleurs cités quelques-uns, essentiels à travailler: la double coque le plus tôt possible dans nos eaux; le contrôle par l\'État du port le plus sévère possible',
'Outre les dommages irréparables causés à l\'environnement, les dommages subis par l\'écosystème et les pertes pour le secteur touristique, la marée noire est un véritable coup de massue pour la pêche, pour la conservation des ressources dans le milieu marin, et il faudra attendre de nombreuses années avant que ces côtes ne se remettent de cette catastrophe',
'D\'une part, il convient de demander à la Commission de prendre, dans le cadre des aides prévues pour le secteur de la pêche, des mesures spéciales visant à pallier les effets de cette catastrophe sur le secteur productif des zones concernées et d\'exiger des organismes internationaux et d\'elle-même un contrôle extrême des bateaux battant pavillon de complaisance',
'Les dommages sociaux et économiques, dont on a déjà parlé ici aujourd\'hui, en termes tant de perte d\'emplois que de ressources marines et touristiques, sont d\'une ampleur telle qu\'ils justifient amplement une action décidée et marquante de la part des institutions communautaires',
'Nous devons tenir compte de ce fait. Je ne suis pas d\'accord avec ceux qui affirment sans nuance que l\'effet de serre est responsable de cette tempête mais il est relativement certain que si nous ne changeons pas rapidement de cap, nous devons craindre d\'autres catastrophes',
'Monsieur le Commissaire, l\'ignorance de certains États membres a plongé l\'Europe dans une crise alimentaire de grande ampleur et je suis une fois de plus irrité, malheureusement, de constater l\'absence totale et renouvelée du Conseil alors que vous présentez ce rapport intéressant',
'Je suppose que la question a trait à la question de savoir où la compétence de cette autorité commence et prend fin et où la compétence et l\'autorité des agences de sécurité alimentaire au sein des États membres commence et prend fin',
'Je demanderai conseil aux services juridiques, en gardant à l\'esprit, en particulier, que cela peut fournir une réponse rapide dans le cas où une législation communautaire, en attente d\'un arrêt du tribunal, n\'est pas respectée',
'Il se peut que cela fasse l\'objet de discussions ici et ailleurs. J\'écouterai toute suggestion mais ma première conclusion est que cette autorité devrait occuper une position centrale plutôt que périphérique'
]

srcTrain_long_en = [	# 26 > words
'No tanker or freighter fit only for the scrap heap must ever put into any harbour within the European Union again. All those involved, including the transport agent, are responsible for any ensuing damage, and these individuals are to provide satisfactory assurances',
'Commissioner, as time is very short, I would like to pick out the seven points which we feel, and you have already mentioned some of them, are essential to work on: double-hulled vessels in our waters as soon as possible and the strictest possible state control of ports. In particular, we must ensure that classification societies are compelled to make their reports public, as currently, we do not have access to them',
'Because, as well as the irreparable damage to the environment, the damage to the ecosystem and losses in the tourist industry, the oil slick dealt a huge blow to the fishing industry, to the conservation of marine resources, and the damaged coast lines will take years to recover',
'On the one hand, by asking the Commission, within the scope of aid to the fishing industry, to implement special measures to compensate for the effects of this catastrophe on the industry in the areas affected, and also by asking them to demand, of international bodies and of themselves, tight controls on ships which sail under flags of convenience',
'The economic and social damage, which we have spoken about today, in terms of the loss of jobs and fishing and tourist resources, is so great that they fully justify decisive and thorough action on the part of the Community institutions',
'We must be aware of this connection; I do not support those who make sweeping statements to the effect that the greenhouse effect is to blame for this storm, but one thing we can be relatively certain of is that we have reason to fear further catastrophes if we do not soon alter our course',
'Commissioner, the ignorance of certain Member States has landed us in a major food crisis here in Europe, and I am afraid that once more I have cause to be angry at the fact that again no Members of the Council are in attendance, and that on the day that you present your interesting report',
'I suspect that the question is focused on the issue of where the competence of the authority begins and ends and where the competence and authority of food safety agencies in Member States begin and end',
'It will require the advice of legal services and that will be sought, particularly bearing in mind that it may provide a speedy response to a failure to comply with Community law pending a court ruling',
'It may be a matter for discussion here and elsewhere and I will listen to any suggestions that are made but my preliminary conclusion is that this Authority should be located centrally rather than on the periphery',
]

Input text:
Il devrait aider les entreprises à prendre notre ressource et en faire autre chose pour les gens de chez nous

Translated text:
 it should help our businesses to invest and make our own people more attractive eeee

True output text:
It should help businesses to take our resource and develop other products from it for the good of our own people

BLEU Score: 0.18862964447124536
***********************
Input text:
Nous devons diversifier notre économie de manière à ne plus compter que sur une seule source de revenus tirée d'une même ressource naturelle

Translated text:
 we must diversify our economy on the basis of a spirit of income which is based on a resource based on a resource of a resource source eeee

True output text:
We have to diversify our economy so that we do not rely on one source of revenue coming from one natural resource

BLEU Score: 0.11919186667978673
***********************
Input text:
Comme vous le savez, lorsque les ressources deviennent limitées, des co

Now testing with sentences not from training data

In [157]:
srcTest_short_fr = [
# From books
# 'Souvent nos dimanches d\'hiver se passaient ainsi',
# 'Personne ne venait ouvrir à la visiteuse inconnue',
# 'Je t\'attendais pour te montrer',
# 'Mais ma mère n\'écoutait plus',
# 'Personne ne répondit',
# 'J\'hésitai une seconde',
# 'Alors ils crièrent',
# 'Je répondis bien vite',
# 'La conversation finit là',
# 'Mais il est trop tard',
# From canadian parliament
'Elles ont été déposées',
'Nous souffrons des compressions budgétaires aux aéroports',
'C\'était une bonne chose à mon avis',
'Nous approuvons d\'emblée certains aspects du projet de loi',
'Nous invitons le gouvernement à faire preuve de modération',
'Posez-vous cette question',
'Qu\'est-ce que les gens vont faire',
'C\'est inacceptable',
'Ils n\'auront pas le choix',
'Les problèmes sont toujours là'
]

# English reference translations for srcTest_short_fr, paired by position.
# Only the Canadian-parliament set is active; the book sentences are kept
# as a disabled alternative.
srcTest_short_en = [
    # --- From books (disabled) ---
    # "This is how our winter Sundays were often spent",
    # "No one came to let in the unknown visitor",
    # "I was waiting to show you",
    # "But Mother was no longer listening",
    # "No one answered",
    # "I hesitated for a moment",
    # "Then they shouted",
    # "I replied at once",
    # "The conversation ended there",
    # "But it is too late",
    # --- From Canadian parliament ---
    "They have been filed",
    "We suffer from the cuts to airports",
    "To me that was a good thing",
    "We are pleased with certain aspects of the bill",
    "We would encourage caution here",
    "Ask yourself that question",
    "What will people do",
    "This is unacceptable",
    "They will not have any choice",
    "The problems have not gone away",
]


# Medium-length French source sentences that are not part of the training
# data. Entries are paired by position with srcTest_medium_en. Only the
# Canadian-parliament set is active; the book/other sentences are kept as a
# disabled alternative.
srcTest_medium_fr = [
    # --- Books / misc (disabled) ---
    # "C'était un froid dimanche de novembre, le premier jour d'automne qui fît songer à l'hiver",
    # "Mon père avait pris la lampe et, sans attendre, il ouvrait la porte qu'on avait déjà fermée à clef",
    # "Puis, poussant la grille, s'avançant sur le bord des marches, il leva la lumière au-dessus de sa tête pour voir ce qui se passait",
    # "les détails en sont trop longs à répéter, et je les ai entendus raconter de tant de façons que je puis à peine dire quel est le récit exact",
    # "Vous n'irez jamais loin sans la coopération, la confiance et la camaraderie des autres hommes et femmes",
    # "Je n'arrive pas à croire que vous ne soyez pas tout au moins disposé à envisager d'autres possibilités",
    # "La culture joue un rôle actif dans la formation chez un individu du caractère, de l'attitude et du regard sur la vie",
    # "Nous les humains avons une grande disposition à tordre les faits pour qu'ils s'ajustent à notre conclusion dès lors que nous en avons formé une",
    # "Après avoir été diplômée de l'école, j'ai de nouveau emménagé chez moi et vécu avec mes parents durant trois ans",
    # "Il y avait de nombreuses choses que nous voulions faire, mais nous ne sommes pas parvenus à faire beaucoup d'entre elles",
    # --- Canadian parliament ---
    "Il devrait aider les entreprises à prendre notre ressource et en faire autre chose pour les gens de chez nous",
    "Nous devons diversifier notre économie de manière à ne plus compter que sur une seule source de revenus tirée d'une même ressource naturelle",
    "Comme vous le savez, lorsque les ressources deviennent limitées, des conflits éclatent s'il y a apparence de répartition inégale",
    "Le devoir de satisfaire à leurs besoins est d'une importance fondamentale pour les personnes handicapées, de même que pour des groupes comme les minorités religieuses",
    "Il y a beaucoup d'éléments positifs dans ce projet de loi, des éléments qui méritent que nous les appuyions au nom des gens qui nous ont élus",
    "C'est très décevant, surtout qu'il s'agit d'un projet de loi d'une si grande importance pour un si grand nombre de nos concitoyens",
    "Un examen plus poussé de la Loi sur les droits de la personne et du système de la Commission des droits de la personne est également nécessaire",
    "Toutefois, elle n'examine pas la même plainte venant de nombreuses autres personnes ailleurs au pays",
    "C'est ce qui explique les réserves que j'ai eues au départ à propos de la peine maximale de cinq ans",
    "Je pense que c'est une mesure extrêmement importante qui n'existait pas auparavant",
]

# English reference translations for the medium-length test sentences,
# paired by position with srcTest_medium_fr.
# NOTE: every entry must end with a comma. Adjacent string literals are
# implicitly concatenated by Python, which silently shortens the list and
# shifts every following reference onto the wrong source sentence.
srcTest_medium_en = [
# 'It was a cold Sunday of November, the first day of autumn to make one think of winter',
# 'Father had taken up the lamp and, without waiting, went to open the door which had already been locked',
# 'Then pushing open the gate, he walked to the edge of the steps and raised his light above his head to see what was happening',
# 'The circumstances are too long to repeat, and I have heard them related so many ways, that I can scarce be certain which is the right account',
# 'You will never get far without the co-operation, confidence and comradeship of other men and women',
# 'I can\'t believe that you aren\'t at least willing to consider the possibility of other alternatives',
# 'Culture plays a dynamic role in shaping an individual\'s character, attitude, and outlook on life',
# 'We humans have a great way of twisting facts to fit our conclusion as soon as we have made one',
# 'After I graduated from college, I moved back home and lived with my parents for three years',
# 'There were many things that we wanted to do, but we never got around to doing many of them',
# Canadian parliament
'It should help businesses to take our resource and develop other products from it for the good of our own people',
'We have to diversify our economy so that we do not rely on one source of revenue coming from one natural resource',
'As we know, when resources become tight, when there seems to be an unfair distribution, then conflict often develops',
'The duty to accommodate is of vital importance to persons with disabilities as well as to groups such as religious minorities',
'There is a lot in this bill that is right and that is worthy of our support as representatives of the people who elected us',
'It is very disappointing, especially since this bill is so important to so many of our fellow citizens',
'A further review of the human rights act and the human rights commission system is also needed',
'It does not, however, address the same complaint that many others may have across the country',
'That is why I had the initial reservations with respect to the maximum sentence being only five years',
'I believe this is something that is extremely important which did not exist previous to this legislation',
]

# Long French source sentences that are not part of the training data.
# Entries are paired by position with srcTest_long_en. Only the
# Canadian-parliament set is active; the book sentences are kept as a
# disabled alternative.
srcTest_long_fr = [
    # --- Books / misc (disabled) ---
    # "Cette bonne femme avait aussi une petite école qu'elle tenait pour enseigner aux enfants à lire et à coudre; et ayant, comme j'ai dit, autrefois vécu en bonne façon, elle élevait les enfants avec beaucoup d'art autant qu'avec beaucoup de soin",
    # "Mon avis est qu'on ne peut créer des personnages que lorsque l'on a beaucoup étudié les hommes, comme on ne peut parler une langue qu'a la condition de l'avoir sérieusement apprise",
    # "C'était là une magnifique collection, et pas un de ces mille objets, si nécessaires à la toilette d'une femme comme celle chez qui nous étions, n'était en autre métal qu'or ou argent",
    # "Le matin, elle avait manqué la messe; et jusqu'au sermon, assis dans le chœur avec les autres enfants, j'avais regardé anxieusement du côté des cloches, pour la voir entrer avec son chapeau neuf",
    # "En effet, à la porte de la salle à manger – la plus rapprochée des cinq portes vitrées qui donnaient sur la cour – une femme aux cheveux gris, penchée, cherchait à voir au travers des rideaux",
    # "Millie, sans doute, avait reçu le chapeau de La Gare, et sans rien entendre, au fond de la chambre rouge, devant un lit semé de vieux rubans et de plumes défrisées, elle cousait, décousait, rebâtissait sa médiocre coiffure",
    # "Je ne reconnaissais plus la femme aux cheveux gris, que j'avais vue courbée devant la porte, une minute auparavant, avec cet air suppliant et hagard de poule qui aurait perdu l'oiseau sauvage de sa couvée",
    # "Meaulnes ne disait rien; mais c'était pour lui qu'à chaque instant l'un des plus bavards s'avançait au milieu du groupe, et, prenant à témoin tour à tour chacun de ses compagnons, qui l'approuvaient bruyamment, racontait quelque longue histoire de maraude, que tous les autres suivaient, le bec ouvert, en riant silencieusement",
    # "Si une nouvelle motion est présentée et si je la déclare recevable et vous demande d'accepter que le Sénat donne son consentement, aucune autre motion ne pourra être déposée par la suite et ce, tant que le Sénat n'aura pas donné son consentement",
    # "Il y a encore une autre mesure dont nous devons nous occuper d'une manière ou d'une autre, sans quoi les policiers risquent de ne pas pouvoir perquisitionner dans les résidences pour recueillir des éléments de preuve",
    # --- Canadian parliament ---
    "Si on s'attend à ce qu'il y ait une rationalisation, comment va-t-on faire pour inviter les provinces à accepter de rationaliser leurs travailleurs dans les usines, s'il n'y a pas d'arrimage avec ce qui va se passer au niveau de la capture",
    "Franchement, que cela nous plaise ou non, n'eut été des mesures prises par le gouvernement, ce serait bien étonnant si nous avions une entente internationale sur les stocks de poisson",
    "À cet égard, il faut rappeler qu'à cette époque, le débat avait porté sur le caractère licite ou illicite de ces nouvelles dispositions législatives en regard du droit international",
    "D'une façon générale, nous avons toujours été en faveur des changements au système de justice criminelle pour les personnes handicapées et je pense que les handicapés du Canada seront heureux de cette mesure et en profiteront largement",
    "Le député va dans la bonne direction, c'est-à-dire qu'il a le coeur accroché à la bonne place, mais je crois que nous pourrions mettre l'argent dans un régime de pension plus progressif",
    "Nous devrions tout au moins chercher à créer un régime fiscal qui traite avec neutralité les familles où les conjoints tendent à consacrer le plus de temps possible à leurs enfants, ou au moins, comme quelqu'un le suggérait, à modifier le régime afin d'établir une discrimination en faveur de ces familles",
    "Ces programmes ne visent pas à améliorer la situation économique des parents autant qu'à régler les aspects sociaux et les rapports entre les parents et l'enfant",
    "Les victimes n'ont plus qu'à consacrer les précieuses années qu'il leur reste à vivre à se battre devant les tribunaux pour obtenir l'indemnisation à laquelle elles ont droit",
    "Une conservation et une gestion appropriées de ces stocks pourraient contribuer grandement à assurer la durabilité de cette importante source d'alimentation pour les générations futures",
    "J'aimerais beaucoup que le gouvernement en profite pour saisir la Chambre de projets de loi importants que les Canadiens réclament et dont ils ont grand besoin",
]

# English reference translations for the long test sentences, paired by
# position with srcTest_long_fr.
# NOTE: every entry must end with a comma. Adjacent string literals are
# implicitly concatenated by Python, which silently shortens the list and
# shifts every following reference onto the wrong source sentence.
srcTest_long_en = [
# 'This woman had also had a little school, which she kept to teach children to read and to work; and having, as I have said, lived before that in good fashion, she bred up the children she took with a great deal of art, as well as with a great deal of care',
# 'In my opinion, it is impossible to create characters until one has spent a long time in studying men, as it is impossible to speak a language until it has been seriously acquired',
# 'It was a magnificent collection, and there was not one of those thousand little things so necessary to the toilet of a woman of the kind which was not in gold or silver',
# 'That morning she missed Mass, and right up to the sermon, from my place in the choir with the other children, I looked anxiously towards the door to see her come to church wearing her new hat',
# 'In fact, there stood, outside the dining-room door - the nearest of the five glass doors opening on the playground - a grey-headed woman, leaning forward and trying to look through the curtains',
# 'Without any doubt Millie had received her hat from the station, and, hearing nothing, at the end of the red bedroom, before a bed bestrewed with old ribbons and uncurled feathers, she was stitching, undoing, and remaking her modest headgear',
# 'I could no longer recognise the grey-headed woman whom, only a minute ago, I had seen stooping in front of the door, with the piteous and haggard bearing of a hen who has lost the wildest chick in her brood',
# 'Meaulnes never said anything, but it was because of him that repeatedly one chatterbox or another, making of himself the centre of the group, and taking in turn each of his noisily approving friends as witness, would relate some long story of poaching, which the others followed with gaping mouths and inward laughter',
# 'If a new motion comes forward and I find that it is in order and I ask that you agree to grant leave, then there can be no further motion put after that until such time as leave is granted',
# 'We have a situation which could result in an inability on the part of the police forces in our country to search residences for evidence, unless we deal with it one way or the other',
# canadian parliament
'If rationalization is what they are hoping for, how are they going to persuade the provinces to go along with rationalizing their plant workers unless this is tied in with the catch',
'Frankly, whether we like it or not, were it not for the actions of this government I would be surprised if we had an international agreement that deals with fish stocks',
'It must be kept in mind that, at that time, the debate addressed whether or not these new legislative powers were legal according to international law',
'We have always been generally supportive of changes to the criminal justice system for persons with disabilities and I believe that persons with disabilities in Canada will embrace the legislation and benefit greatly from it',
'The member is going in the right direction in terms of his heart and is being very thoughtful, but again I think we could put the money into a more progressive pension system',
'We ought to seek at the very least to create a tax code which treats families that seek to maximize their time with their children neutrally or at the very least, as someone proposed, we ought to make amendments to the tax code to positively discriminate in favour of such families',
'These programs do not address the economic situation of the parents as much as they address the social aspect and the interplay between the parent and the child',
'What is left now for these victims is that they will have to spend precious years of what is left of their lives in court fighting for compensation which they should rightfully receive',
'Proper conservation and management of these stocks could make a significant contribution to ensuring the sustainability of this important food source for our future generations',
'I really wish that this government would take an occasion like this to bring forward some substantial bills that the people out there are crying for, that they are demanding and that we need so desperately',
]


In [161]:
# Translate every short out-of-training sentence and compare it with its
# English reference. zip() silently stops at the shorter list, so check the
# pairing first instead of dropping or misaligning references.
assert len(srcTest_short_fr) == len(srcTest_short_en), \
    "srcTest_short_fr and srcTest_short_en must have the same number of entries"
for fr, en in zip(srcTest_short_fr, srcTest_short_en):
    # The short set gets its own results file; the path previously pointed at
    # srcTest_medium.txt (copy-paste slip).
    # NOTE(review): presumably translate() writes its result to output_path - confirm.
    translate(input_text=fr, true_output_text=en, output_path="results/default/srcTest_short.txt")
    print("***********************")

Input text:
Elles ont été déposées

Translated text:
 they were tabled eeee

True output text:
They have been filed

BLEU Score: 0.0766091750078838
***********************
Input text:
Nous souffrons des compressions budgétaires aux aéroports

Translated text:
 we are spending budgetary airports eeee

True output text:
We suffer from the cuts to airports

BLEU Score: 0.07740189180437576
***********************
Input text:
C'était une bonne chose à mon avis

Translated text:
 i think that is a good thing eeee

True output text:
To me that was a good thing

BLEU Score: 0.2771752356259992
***********************
Input text:
Nous approuvons d'emblée certains aspects du projet de loi

Translated text:
 we are certain that we are now in some aspects of the draft law eeee

True output text:
We are pleased with certain aspects of the bill

BLEU Score: 0.1978116988049654
***********************
Input text:
Nous invitons le gouvernement à faire preuve de modération

Translated text:
 we call on t

In [76]:
# Translate one sentence pair from the loaded data set (index 3) and show it
# alongside its reference text.
idx = 3
translate(
    input_text=data_src[idx],
    true_output_text=data_dest[idx])

Input text:
Vous avez souhaité un débat à ce sujet dans les prochains jours, au cours de cette période de session.

Translated text:
 you wanted a debate on this issue during the coming months eeee

True output text:
ssss You have requested a debate on this subject in the course of the next few days, during this part-session. eeee



Here is another example. The translation conveys the gist of the request (a silence for the victims in the various countries of the European Union), but it is noticeably less fluent than the previous one and it drops the reference to the storms entirely.

In [77]:
# Translate one sentence pair from the loaded data set (index 4) and show it
# alongside its reference text.
idx = 4
translate(
    input_text=data_src[idx],
    true_output_text=data_dest[idx])

Input text:
En attendant, je souhaiterais, comme un certain nombre de collègues me l'ont demandé, que nous observions une minute de silence pour toutes les victimes, des tempêtes notamment, dans les différents pays de l'Union européenne qui ont été touchés.

Translated text:
 in the meantime i would like to ask a number of members to be here i am asking for a silence in particular for example of all the victims of the various countries which have suffered in particular the european union eeee

True output text:
ssss In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union. eeee



In this example we join two texts from the training set (they are concatenated directly, with no separator between them). The model first sends this combined text through the encoder, which produces a "thought-vector" that seems to summarize both texts reasonably well, so the decoder can produce a reasonable translation.

In [78]:
# Feed two consecutive sentences (idx followed by idx + 1) to the model as a
# single input; the reference texts are concatenated in the same order.
idx = 3
translate(
    input_text=data_src[idx] + data_src[idx + 1],
    true_output_text=data_dest[idx] + data_dest[idx + 1])

Input text:
Vous avez souhaité un débat à ce sujet dans les prochains jours, au cours de cette période de session.En attendant, je souhaiterais, comme un certain nombre de collègues me l'ont demandé, que nous observions une minute de silence pour toutes les victimes, des tempêtes notamment, dans les différents pays de l'Union européenne qui ont été touchés.

Translated text:
 you had a debate on this issue during the next part session in the debate on a number of occasions i would like to ask you to ensure that i am sure that one of the victims of the european union will be particularly a number of people in the various countries of the european union

True output text:
ssss You have requested a debate on this subject in the course of the next few days, during this part-session. eeeessss In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various c

If we reverse the order of these two texts then the meaning is not quite so clear for the latter text.

In [79]:
# Same two sentences as a single input, but in reverse order
# (idx + 1 first, then idx); the references are concatenated to match.
idx = 3
translate(
    input_text=data_src[idx + 1] + data_src[idx],
    true_output_text=data_dest[idx + 1] + data_dest[idx])

Input text:
En attendant, je souhaiterais, comme un certain nombre de collègues me l'ont demandé, que nous observions une minute de silence pour toutes les victimes, des tempêtes notamment, dans les différents pays de l'Union européenne qui ont été touchés.Vous avez souhaité un débat à ce sujet dans les prochains jours, au cours de cette période de session.

Translated text:
 in the meantime i would like to ask you to be a number of speakers here i have been here in the european union who are victims of the various countries in the european union for example in a number of years to be held in this debate in the course of the next part session

True output text:
ssss In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union. eeeessss You have requested a debate on this subject in the course of the next few days, dur