# Example-3 based on Keras tutorial on Seq2Seq [blog](https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html).

[dataset source (english-french)](http://www.manythings.org/anki/fra-eng.zip)

In this example we'll use words as tokens with an Encoder/Decoder model (different from the one used in example_2).

### data prep

In [1]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.layers import GRU, Input, LSTM, Dense, TimeDistributed
from keras.layers.embeddings import Embedding
from keras.losses import sparse_categorical_crossentropy
from keras.optimizers import Adam


Using TensorFlow backend.


In [2]:
def tokenize(x):
    """
    Fit a new Tokenizer on x and use it to convert x to sequences.
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    fitted_tokenizer = Tokenizer()
    # The vocabulary is learned from the very texts that are then encoded.
    fitted_tokenizer.fit_on_texts(x)
    return fitted_tokenizer.texts_to_sequences(x), fitted_tokenizer

In [3]:
def logits_to_text(logits, tokenizer):
    """
    Decode network output into a space-separated string of words.
    :param logits: Logits from a neural network; the arg-max along axis 1 is used as the word id
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits, with id 0 rendered as '<PAD>'
    """
    # Invert the tokenizer's word -> id table so predictions can be looked up by id.
    id_to_word = dict((idx, word) for word, idx in tokenizer.word_index.items())
    id_to_word[0] = '<PAD>'

    predicted_ids = np.argmax(logits, 1)
    words = []
    for predicted_id in predicted_ids:
        words.append(id_to_word[predicted_id])
    return ' '.join(words)

print('`logits_to_text` function loaded.')

`logits_to_text` function loaded.


In [26]:
from keras.preprocessing.sequence import pad_sequences

def pad(x, length=None):
    """
    Pad the sequences in x to a common length.
    :param x: List of sequences.
    :param length: Length to pad the sequence to.  If None, use length of longest sequence in x.
    :return: Padded numpy array of sequences
    """
    # padding='post' places the padding after each sequence rather than before it.
    return pad_sequences(x, padding='post', maxlen=length)

def unpad(x):
    """
    Remove the <PAD> markers added when padding text.

    Deleting a marker leaves behind the spaces that separated it from its
    neighbours, so whitespace is re-normalised afterwards: words end up
    separated by exactly one space, with no leading or trailing whitespace.

    :param x: String that may contain '<PAD>' markers
    :return: x without '<PAD>' markers and with whitespace collapsed
    """
    without_markers = x.replace('<PAD>', '')
    # split() with no argument drops empty fields, so an interior marker
    # does not leave a doubled space in the result.
    return ' '.join(without_markers.split())

In [5]:
def sequence_to_text(sequence, tokenizer):
    """
    Convert a sequence of token ids back into a space-separated string.
    :param sequence: Iterable of token ids; ids that are not greater than 0 are skipped
    :param tokenizer: Tokenizer whose word_index maps words to ids
    :return: The corresponding words joined by single spaces
    """
    # Invert word -> id into id -> word.
    word_for_id = {}
    for word, idx in tokenizer.word_index.items():
        word_for_id[idx] = word

    kept_ids = filter(lambda token: token > 0, sequence)
    return ' '.join(map(word_for_id.__getitem__, kept_ids))

In [84]:
filename = 'fra.txt'
input_texts = []
target_texts = []
num_samples = 20000

# Not used by the visible cells; kept so any later cell that references them still runs.
input_chars = set()
target_chars = set()

# Read with an explicit encoding and close the file deterministically.
# NOTE(review): assumes fra.txt is UTF-8 encoded -- confirm for your copy of the dataset.
with open(filename, encoding='utf-8') as corpus_file:
    # Drop blank lines (e.g. the empty string left after a trailing newline).
    lines = [line for line in corpus_file.read().split('\n') if line.strip()]

# process the lines
for line in lines[:num_samples]:
    # Only the first two tab-separated fields (input, target) are used;
    # any additional columns on the line are ignored.
    fields = line.split('\t')
    if len(fields) < 2:
        raise ValueError('expected "<input>\\t<target>" but got: %r' % (line,))
    input_text, target_text = fields[:2]
    input_texts.append(input_text)
    target_texts.append(target_text)

# Word-level tokenization, one tokenizer per language.
preproc_input_texts, input_texts_tk = tokenize(input_texts)
preproc_target_texts, target_texts_tk = tokenize(target_texts)

# Pad each side to the length of its longest sequence.
preproc_input_texts = pad(preproc_input_texts)
preproc_target_texts = pad(preproc_target_texts)

# +1 leaves room for id 0, which logits_to_text renders as '<PAD>'.
num_encoder_tokens = len(input_texts_tk.word_index) + 1
num_decoder_tokens = len(target_texts_tk.word_index) + 1
# After padding every row has the same length, so the column count is the max length.
max_encoder_seq_length = preproc_input_texts.shape[1]
max_decoder_seq_length = preproc_target_texts.shape[1]


print('number of samples: ', len(input_texts))
print('number of input  tokens:', num_encoder_tokens)
print('number of output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)


number of samples:  10000
number of input  tokens: 3373
number of output tokens: 5829
Max sequence length for inputs: 8
Max sequence length for outputs: 12


In [85]:
# Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
# Guarded on ndim so that re-running this cell does not keep appending trailing axes.
if preproc_target_texts.ndim == 2:
    preproc_target_texts = preproc_target_texts.reshape(*preproc_target_texts.shape, 1)

In [86]:
# model setup using GRU
from keras.layers import RepeatVector, Dropout, Bidirectional

latent_dim = 32  # Latent dimensionality of the encoding space.

learning_rate = 0.01

def encdec_model(input_shape, target_sequence_length, input_vocab_size, target_vocab_size):
    """
    Build and compile (but do not train) an encoder-decoder model.

    The encoder embeds the input ids and summarises them with a bidirectional
    GRU; that summary is repeated once per target position and decoded by a
    second bidirectional GRU followed by two Dense layers.

    Reads the module-level `latent_dim` (GRU units) and `learning_rate` (Adam).
    Prints the model summary as a side effect.

    :param input_shape: Tuple of input shape, batch dimension first; input_shape[1] is the input sequence length
    :param target_sequence_length: Length of output sequence
    :param input_vocab_size: Number of unique words in the input dataset
    :param target_vocab_size: Number of unique words in the target dataset
    :return: Keras model built and compiled, but not trained
    """
    # Take the sequence length from the argument rather than from a notebook
    # global, so the model always matches the shape it was asked to accept.
    source_sequence_length = input_shape[1]

    _input = Input(shape=input_shape[1:])
    _embedded_input = Embedding(input_dim=input_vocab_size,
                                input_length=source_sequence_length, output_dim=512)(_input)

    # Without return_sequences the encoder emits one vector per sample.
    _encoded = Bidirectional(GRU(latent_dim))(_embedded_input)

    # Feed that same vector to the decoder at every target time step.
    _decoded = RepeatVector(target_sequence_length)(_encoded)
    _decoded = Bidirectional(GRU(latent_dim, return_sequences=True))(_decoded)
    _output = Dense(256, activation='relu')(_decoded)
    _output = Dense(target_vocab_size, activation='softmax')(_output)

    model = Model(inputs=_input, outputs=_output)
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    model.summary()
    return model


In [87]:
# Sanity check: show the shape of the padded encoder input that will be fed to the model.
print(preproc_input_texts.shape)

(10000, 8)


In [88]:
from keras.callbacks import ModelCheckpoint

batch_size = 1024  # Batch size for training.
epochs = 10  # Number of epochs to train for.

# The padded id matrix is passed to the model unchanged (no reshape is applied here);
# encdec_model feeds it straight into an Embedding layer.
tmp_input = preproc_input_texts

print('reshaped input: ', tmp_input.shape)

# Build and compile the network (encdec_model also prints the summary)
model = encdec_model(
    tmp_input.shape,
    preproc_target_texts.shape[1],
    num_encoder_tokens,
    num_decoder_tokens)

# Checkpoint callback writing to 'seq2seq_weights_best_3.hdf5' with save_best_only=True.
# NOTE(review): no `monitor` is given; presumably Keras defaults to val_loss -- confirm.
checkpointer = ModelCheckpoint(filepath='seq2seq_weights_best_3.hdf5', 
                           verbose=1, save_best_only=True)


# Train, holding out 20% of the samples for validation.
# NOTE(review): validation_split presumably takes the *last* 20% of the arrays without
# shuffling; if the data file is ordered (e.g. by sentence length) the validation set
# will not be representative -- confirm against the Keras docs.
model.fit(tmp_input, preproc_target_texts, 
                     batch_size=batch_size, epochs=epochs, validation_split=0.2, callbacks=[checkpointer])

# save the model as it stands after the final epoch (a separate file from the checkpoint above)
model.save('seq2seq_3.h5')

reshaped input:  (10000, 8)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_15 (InputLayer)        (None, 8)                 0         
_________________________________________________________________
embedding_12 (Embedding)     (None, 8, 512)            1726976   
_________________________________________________________________
bidirectional_7 (Bidirection (None, 64)                104640    
_________________________________________________________________
repeat_vector_12 (RepeatVect (None, 12, 64)            0         
_________________________________________________________________
bidirectional_8 (Bidirection (None, 12, 64)            18624     
_________________________________________________________________
dense_23 (Dense)             (None, 12, 256)           16640     
_________________________________________________________________
dense_24 (Dense)             (None, 12, 5829)   

In [89]:
from keras.models import load_model

# Reload the model from 'seq2seq_3.h5' (the file written by model.save in the training cell);
# this rebinds the module-level `model`.
model = load_model('seq2seq_3.h5')

# Bare expression: the list of layers is displayed as the cell output.
model.layers

[<keras.engine.topology.InputLayer at 0x15fd60fd0>,
 <keras.layers.embeddings.Embedding at 0x15fd60f60>,
 <keras.layers.wrappers.Bidirectional at 0x13c8cabe0>,
 <keras.layers.core.RepeatVector at 0x15fd65630>,
 <keras.layers.wrappers.Bidirectional at 0x15f8a4b00>,
 <keras.layers.core.Dense at 0x15fd659e8>,
 <keras.layers.core.Dense at 0x15fd65b38>]

In [65]:
# testing
# Predict the whole slice in a single call instead of one model.predict call per
# sample, then print each input sentence next to its decoded prediction.
first_index, last_index = 500, 600
batch_predictions = model.predict(preproc_input_texts[first_index:last_index])

for input_text, decoded_seq in zip(input_texts[first_index:last_index], batch_predictions):
    print('++--------------++')
    print('input seq  : ', input_text)
    # decoded_seq holds the per-time-step scores for one sample; unpad drops the '<PAD>' markers.
    print('decoded seq: ', unpad(logits_to_text(decoded_seq, target_texts_tk)))

++--------------++
input seq  :  I can ski.
decoded seq:  je me le
++--------------++
input seq  :  I cringed.
decoded seq:  je me un
++--------------++
input seq  :  I cringed.
decoded seq:  je me un
++--------------++
input seq  :  I cringed.
decoded seq:  je me un
++--------------++
input seq  :  I give up.
decoded seq:  j'ai été
++--------------++
input seq  :  I got hot.
decoded seq:  je me suis pas
++--------------++
input seq  :  I got hot.
decoded seq:  je me suis pas
++--------------++
input seq  :  I had fun.
decoded seq:  je me suis pas
++--------------++
input seq  :  I had fun.
decoded seq:  je me suis pas
++--------------++
input seq  :  I had fun.
decoded seq:  je me suis pas
++--------------++
input seq  :  I had fun.
decoded seq:  je me suis pas
++--------------++
input seq  :  I hate it.
decoded seq:  je l'ai
++--------------++
input seq  :  I hope so.
decoded seq:  j'ai besoin
++--------------++
input seq  :  I knew it.
decoded seq:  je l'ai
++--------------++
input 