# Machine Translation With Attention

In [1]:
%load_ext autoreload
%aimport helper, tests
%autoreload 1

In [2]:
import collections

import helper
import numpy as np
import project_tests as tests
from attention_decoder import AttentionDecoder

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Verify GPU

In [3]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 6265193439509100797
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 10909243802
locality {
  bus_id: 1
  links {
    link {
      device_id: 1
      type: "StreamExecutor"
      strength: 1
    }
  }
}
incarnation: 8895724049681493900
physical_device_desc: "device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:03:00.0, compute capability: 6.1"
, name: "/device:GPU:1"
device_type: "GPU"
memory_limit: 8709937562
locality {
  bus_id: 1
  links {
    link {
      type: "StreamExecutor"
      strength: 1
    }
  }
}
incarnation: 10911508839903085276
physical_device_desc: "device: 1, name: GeForce GTX 1080 Ti, pci bus id: 0000:04:00.0, compute capability: 6.1"
]


### Load Data
The data is located in `data/small_vocab_en` and `data/small_vocab_fr`. The `small_vocab_en` file contains English sentences with their French translations in the `small_vocab_fr` file.  I'm reusing Udacity's dataset here, which has been preprocessed already.  The next step is to use a standard dataset such as [WMT](http://www.statmt.org/).

In [4]:
# Load English data
english_sentences = helper.load_data('data/small_vocab_en')
# Load French data
french_sentences = helper.load_data('data/small_vocab_fr')
print('Dataset Loaded')

Dataset Loaded


### Files
Each line in `small_vocab_en` contains an English sentence with the respective translation in each line of `small_vocab_fr`.  View the first two lines from each file.

In [5]:
for sample_i in range(2):
    print('small_vocab_en Line {}:  {}'.format(sample_i + 1, english_sentences[sample_i]))
    print('small_vocab_fr Line {}:  {}'.format(sample_i + 1, french_sentences[sample_i]))

small_vocab_en Line 1:  new jersey is sometimes quiet during autumn , and it is snowy in april .
small_vocab_fr Line 1:  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
small_vocab_en Line 2:  the united states is usually chilly during july , and it is usually freezing in november .
small_vocab_fr Line 2:  les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .


### Vocabulary
The complexity of the problem is determined by the complexity of the vocabulary.  A more complex vocabulary is a more complex problem.  This dataset has been specifically chosen to have a small vocabulary. For comparison, _Alice's Adventures in Wonderland_ contains 2,766 unique words of a total of 15,500 words.

In [6]:
english_words_counter = collections.Counter([word for sentence in english_sentences for word in sentence.split()])
french_words_counter = collections.Counter([word for sentence in french_sentences for word in sentence.split()])

print('{} English words.'.format(len([word for sentence in english_sentences for word in sentence.split()])))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
print('"' + '" "'.join(list(zip(*english_words_counter.most_common(10)))[0]) + '"')
print()
print('{} French words.'.format(len([word for sentence in french_sentences for word in sentence.split()])))
print('{} unique French words.'.format(len(french_words_counter)))
print('10 Most common words in the French dataset:')
print('"' + '" "'.join(list(zip(*french_words_counter.most_common(10)))[0]) + '"')

1823250 English words.
227 unique English words.
10 Most common words in the English dataset:
"is" "," "." "in" "it" "during" "the" "but" "and" "sometimes"

1961295 French words.
355 unique French words.
10 Most common words in the French dataset:
"est" "." "," "en" "il" "les" "mais" "et" "la" "parfois"


## Preprocess
Convert the text into sequences of integers using:
1. Tokenize the words into ids
2. Add padding to make all the sequences the same length.

### Tokenize (IMPLEMENTATION)
Each character or word can be turned into character and word ids (integers), respectively.  Character ids are used for character level models that generate text predictions for each character.  A word level model uses word ids that generate text predictions for each word.  Word level models tend to learn better, since they are lower in complexity.

Each sentence can be turned into a sequence of words ids using Keras's [`Tokenizer`](https://keras.io/preprocessing/text/#tokenizer) function. 

In [7]:
def tokenize(x):
    """
    Tokenize x
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """

    x_tk = Tokenizer(char_level=False)
    x_tk.fit_on_texts(x)

    return x_tk.texts_to_sequences(x), x_tk

tests.test_tokenize(tokenize)

# Tokenize Example output
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .']
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()
for sample_i, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(sent))
    print('  Output: {}'.format(token_sent))

{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}

Sequence 1 in x
  Input:  The quick brown fox jumps over the lazy dog .
  Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]
Sequence 2 in x
  Input:  By Jove , my quick study of lexicography won a prize .
  Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
Sequence 3 in x
  Input:  This is a short sentence .
  Output: [18, 19, 3, 20, 21]


### Padding 
When batching the sequence of word ids together, each sequence needs to be the same length.  Hence, padding.

All source language sentences should be of the same length and all target language sentences too.  No need for source and target sentences to be of the same length, though. 

In [9]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences

def pad(x, length=None):
    """
    Pad x
    :param x: List of sequences.
    :param length: Length to pad the sequence to.  If None, use length of longest sequence in x.
    :return: Padded numpy array of sequences
    """
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen=length, padding='post')

tests.test_pad(pad)

# Pad Tokenized output
test_pad = pad(text_tokenized)
for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))

Sequence 1 in x
  Input:  [1 2 4 5 6 7 1 8 9]
  Output: [1 2 4 5 6 7 1 8 9 0]
Sequence 2 in x
  Input:  [10 11 12  2 13 14 15 16  3 17]
  Output: [10 11 12  2 13 14 15 16  3 17]
Sequence 3 in x
  Input:  [18 19  3 20 21]
  Output: [18 19  3 20 21  0  0  0  0  0]


In [10]:
english_sentences[0]

'new jersey is sometimes quiet during autumn , and it is snowy in april .'

### Preprocess Pipeline
All the preprocessing steps put together.

In [11]:
def preprocess(x, y):
    """
    Preprocess x and y
    :param x: Feature List of sentences
    :param y: Label List of sentences
    :return: Tuple of (Preprocessed x, Preprocessed y, x tokenizer, y tokenizer)
    """
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)

    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)

    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer =\
    preprocess(english_sentences, french_sentences)
    
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 344


### Ids Back to Text
The neural network will be translating the input to words ids.  The function `logits_to_text` will bridge the gap between the logits from the neural network to the French translation.  

In [12]:
def logits_to_text(logits, tokenizer):
    """
    Turn logits from a neural network into text using the tokenizer
    :param logits: Logits from a neural network
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    """
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

print('`logits_to_text` function loaded.')

`logits_to_text` function loaded.


In [13]:
def decoder(sentence, tokenizer):
    '''decode a sentence, given the tokenizer'''
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'    
    return ' '.join([index_to_words[int(wordint)] for wordint in sentence])

### Define model architecture

Create a translation model (t_model) that incorporates embedding and a bidirectional rnn into one model.

In [14]:
from keras.layers import Dropout
from keras.models  import Sequential
def t_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and train a model that incorporates embedding, encoder-decoder, and bidirectional RNN on x and y
    :param input_shape: Tuple of input shape
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Keras model built, but not trained
    """
    embed_size = 200    
    learning_rate = 1e-3
    num_hidden = 256
    
    model = Sequential()
    # Encoder part
    model.add( Embedding(english_vocab_size, embed_size, input_length=input_shape[-1] ) )    
#     model.add( Bidirectional( LSTM(num_hidden, go_backwards=True, input_shape=input_shape[1:],  dropout = 0.5, recurrent_dropout = 0.5)) )  
    model.add( Bidirectional( GRU(num_hidden, go_backwards=True, dropout = 0.4, recurrent_dropout = 0.4 ) ) )
    model.add( RepeatVector(output_sequence_length) )

    # Decoder part
#     model.add( Bidirectional( LSTM(num_hidden, return_sequences=True, dropout = 0.5, recurrent_dropout = 0.5)) ) 
    model.add( Bidirectional( GRU(num_hidden, return_sequences=True, dropout = 0.4, recurrent_dropout = 0.4 ) ) )
    model.add( TimeDistributed( Dense(512) ) )
    model.add( Dropout(0.5))
    model.add( TimeDistributed( Dense(french_vocab_size) ) )
    model.add( Activation('softmax') )

    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model

tests.test_model_final(t_model)
print('Final Model Loaded')

Final Model Loaded




In [16]:
# Train the final model
tmp_x = pad( preproc_english_sentences, max_french_sequence_length )
tmp_x = tmp_x.reshape( (-1, preproc_french_sentences.shape[-2], 1) )

my_model = t_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)

print( my_model.summary() )

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1, 200)            39800     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 512)               701952    
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 21, 512)           0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 21, 512)           1181184   
_________________________________________________________________
time_distributed_3 (TimeDist (None, 21, 512)           262656    
_________________________________________________________________
dropout_2 (Dropout)          (None, 21, 512)           0         
_________________________________________________________________
time_distributed_4 (TimeDist (None, 21, 344)           176472    
__________

In [17]:
from keras.layers import Dropout
from keras.models  import Sequential
def att_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and train a model that incorporates embedding, encoder-decoder, and bidirectional RNN on x and y
    :param input_shape: Tuple of input shape
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Keras model built, but not trained
    """
    embed_size = 200    
    learning_rate = 1e-3
    num_hidden = 256
    
    model = Sequential()
    # Encoder part
    model.add( Embedding(english_vocab_size, embed_size, input_length=input_shape[-1] ) )    
#     model.add( Bidirectional( LSTM(num_hidden, go_backwards=True, input_shape=input_shape[1:],  dropout = 0.5, recurrent_dropout = 0.5)) )  
    model.add( Bidirectional( GRU(num_hidden, go_backwards=False, dropout = 0.4, recurrent_dropout = 0.4 ) ) )
    model.add( RepeatVector(output_sequence_length) )

    # Decoder part
#     model.add( Bidirectional( LSTM(num_hidden, return_sequences=True, dropout = 0.5, recurrent_dropout = 0.5)) ) 
#     model.add( Bidirectional( GRU(num_hidden, return_sequences=True, dropout = 0.4, recurrent_dropout = 0.4 ) ) )
#     model.add( TimeDistributed( Dense(512) ) )
#     model.add( Dropout(0.5))
#     model.add( TimeDistributed( Dense(french_vocab_size) ) )
#     model.add( Activation('softmax') )
    model.add( AttentionDecoder(num_hidden, french_vocab_size))
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model

tests.test_model_final(att_model)
print('Final Model Loaded')

inputs shape: (?, 21, 512)
Final Model Loaded




## Prediction (IMPLEMENTATION)

In [18]:
def final_predictions( x, y, x_tk, y_tk ):
    """
    Gets predictions using the final model
    :param x: Preprocessed English data
    :param y: Preprocessed French data
    :param x_tk: English tokenizer
    :param y_tk: French tokenizer
    """
    # note: x and y are already pre-processed
    # Train neural network using model_final
    model = t_model(x.shape, 
                    y.shape[1], 
                    len(x_tk.word_index), 
                    len(y_tk.word_index) )
    
    model.fit( x, y, batch_size=2048, epochs=10, validation_split=0.2 )
    
    y_id_to_word = { value: key for key, value in y_tk.word_index.items() }
    y_id_to_word[0] = '<PAD>'

    sentence = 'he saw a old yellow truck'
    sentence = [ x_tk.word_index[word] for word in sentence.split() ]
    sentence = pad_sequences( [sentence], maxlen=x.shape[-1], padding='post' )
    sentences = np.array( [sentence[0], x[0]] )
    predictions = model.predict( sentences, len(sentences ) )

    print('Sample 1:')
    print(' '.join( [y_id_to_word[np.argmax(x)] for x in predictions[0]] ) )
    print('Il a vu un vieux camion jaune')
    print(' '.join( [y_id_to_word[np.max(x)] for x in y[0]] ) )

    print('Sample 2:')
    print(' '.join( [y_id_to_word[np.argmax(x)] for x in predictions[1]] ) )
    print(' '.join( [y_id_to_word[np.max(x)] for x in y[1]] ) )
    return model

In [19]:
full_model=final_predictions(preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer)

Train on 110288 samples, validate on 27573 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Sample 1:
il a vu un camion camion camion <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Il a vu un vieux camion jaune
new jersey est parfois calme pendant l' automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Sample 2:
new jersey est parfois chaud pendant cours et il est est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
les états unis est généralement froid en juillet et il gèle habituellement en novembre <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [20]:
full_model.save( 'saved_models/full_model.h5' )

# Notes:

-  I have used `go_backwards=True` for the encoding RNNs in the Encoder-Decoder model as well as the final_model per suggestions in places such as __[this link](https://cntk.ai/pythondocs/CNTK_204_Sequence_To_Sequence.html)__ which advice to use `False` for the attention models and `True` otherwise. However, I found that the models here also work with the default `go_backwards=False` but just have lower accuracy by about 5%. 
-  References for encoder-decoder __[1](https://blog.keras.io/building-autoencoders-in-keras.html)__, __[2](https://github.com/keras-team/keras/issues/5203)__, __[3](https://www.jianshu.com/p/c294e4cb4070)__

## Proper test train split

This project focuses on learning various network architectures for machine translation, but we don't evaluate the models according to best practices by splitting the data into separate test & training sets -- so the model accuracy is overstated. Use the [`sklearn.model_selection.train_test_split()`](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) function to create separate training & test datasets, then retrain each of the models using only the training set and evaluate the prediction accuracy using the hold out test set. Does the "best" model change?

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split( english_sentences, french_sentences, test_size=0.1, random_state=42)

tt_preproc_english_sentences, tt_preproc_french_sentences, tt_english_tokenizer, tt_french_tokenizer =\
    preprocess( X_train, y_train )
    
tt_max_english_sequence_length = tt_preproc_english_sentences.shape[1]
tt_max_french_sequence_length = tt_preproc_french_sentences.shape[1]
tt_english_vocab_size = len(tt_english_tokenizer.word_index)
tt_french_vocab_size = len(tt_french_tokenizer.word_index)

tt_x_pad = pad( tt_preproc_english_sentences, tt_max_french_sequence_length )
tt_x = tt_x_pad.reshape( ( -1, tt_preproc_french_sentences.shape[-2], 1 ) )

test_preproc_english_sentences, test_preproc_french_sentences, _, _ = preprocess( X_test, y_test )
test_x_pad = pad( test_preproc_english_sentences, tt_preproc_french_sentences.shape[1] )
test_x = test_x_pad.reshape( (-1, tt_preproc_french_sentences.shape[1], 1 ) )
test_y = pad( test_preproc_french_sentences, tt_preproc_french_sentences.shape[1] )

print( 'Data Preprocessed')
print( "Max English sentence length:", tt_max_english_sequence_length )
print( "Max French sentence length:", tt_max_french_sequence_length )
print( "English vocabulary size:", tt_english_vocab_size )
print( "French vocabulary size:", tt_french_vocab_size )

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 344


# Accuracy on the test set
Implement test-train split

In [22]:
tt_model = t_model(
    tt_x_pad.shape,
    tt_max_french_sequence_length,
    tt_english_vocab_size,
    tt_french_vocab_size)
tt_model.fit( tt_x_pad, tt_preproc_french_sentences, batch_size=4096, epochs=10, validation_split=0.1 ) 

Train on 111666 samples, validate on 12408 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f78841e4828>

In [23]:
sentnum = 2
print('Prediction') 
print( logits_to_text( tt_model.predict( tt_x_pad[sentnum:sentnum+1] )[0], tt_french_tokenizer) )
print('Reference translation')
print( y_train[sentnum] )
print('Original English sentence')
print( X_train[sentnum] )

Prediction
la le est est fruit moins aimé la la mais est est moins aimé <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference translation
le pamplemousse est son moins aimé des fruits , mais la chaux est leur moins aimé .
Original English sentence
the grapefruit is her least liked fruit , but the lime is their least liked .


In [24]:
score, acc = tt_model.evaluate( tt_x_pad, tt_preproc_french_sentences, batch_size=2048 ) # train set
# print( 'Final model true score = {:4.2f}'.format(score) )
print( 'Final model acc on training data = {:4.2f}'.format(acc) )
score, acc = tt_model.evaluate( test_x_pad, test_y, batch_size=2048 ) # test set 
print( 'Final model acc on test data = {:4.2f}'.format(acc) )

Final model acc on training data = 0.68
Final model acc on test data = 0.54


In [25]:
tt_model.save( 'saved_models/tt_model.h5' )

# For model with attention

In [26]:
atten = att_model(
    tt_x_pad.shape,
    tt_max_french_sequence_length,
    tt_english_vocab_size,
    tt_french_vocab_size)
print( atten.summary() )

inputs shape: (?, 21, 512)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 21, 200)           39800     
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 512)               701952    
_________________________________________________________________
repeat_vector_6 (RepeatVecto (None, 21, 512)           0         
_________________________________________________________________
AttentionDecoder (AttentionD (None, 21, 344)           1565848   
Total params: 2,307,600
Trainable params: 2,307,600
Non-trainable params: 0
_________________________________________________________________
None


In [27]:
atten.fit( tt_x_pad, tt_preproc_french_sentences, batch_size=4096, epochs=10, validation_split=0.1 ) 

Train on 111666 samples, validate on 12408 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f783a130d68>

In [28]:
sentnum = 2
print('Prediction: ') 
print( logits_to_text( atten.predict( tt_x_pad[sentnum:sentnum+1] )[0], tt_french_tokenizer) )
print('Reference translation: ')
print( y_train[sentnum] )
print('Original English sentence: ')
print( X_train[sentnum] )

Prediction: 
la pamplemousse est son fruit aimé aimé fruits mais la est est moins moins aimé <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Reference translation: 
le pamplemousse est son moins aimé des fruits , mais la chaux est leur moins aimé .
Original English sentence: 
the grapefruit is her least liked fruit , but the lime is their least liked .


In [29]:
# score, acc = atten.evaluate( test_x_pad, test_y, batch_size=128 )
# print( 'Final model true score = {:4.2f}'.format(score) )
# print( 'Final model acc = {:4.2f}'.format(acc) )
score, acc = atten.evaluate( tt_x_pad, tt_preproc_french_sentences, batch_size=2048 ) # train set
# print( 'Final model true score = {:4.2f}'.format(score) )
print( 'Attention model acc on training data = {:4.2f}'.format(acc) )
score, acc = atten.evaluate( test_x_pad, test_y, batch_size=2048 ) # test set 
print( 'Attention model acc on test data = {:4.2f}'.format(acc) )

Attention model acc on training data = 0.73
Attention model acc on test data = 0.55


In [30]:
atten.save( 'saved_models/atten.h5' )