# Example-2 based on Keras tutorial on Seq2Seq [blog](https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html).

[dataset source (english-french)](http://www.manythings.org/anki/fra-eng.zip)

In this example we'll use words as tokens with Embedding and LSTM

### data prep

In [1]:
import numpy as np
from keras.preprocessing.text import text_to_word_sequence
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.layers.embeddings import Embedding
from keras.losses import sparse_categorical_crossentropy
from keras.optimizers import Adam


Using TensorFlow backend.


In [2]:
def sequence_to_text(sequence, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    return ' '.join([index_to_words[token] for token in sequence if token > 0])

In [3]:
filename = 'fra.txt'
input_texts_seq = []
target_texts_seq = []
lines = open(filename).read().split('\n')
num_samples = 10000
special_words = ['<PAD>', '<UNK>', '<GO>',  '<EOS>']

input_words = set(special_words)
target_words = set(special_words)

# process the lines
for line in lines[:min(num_samples, len(lines)-1)]:
    input_text, target_text = line.split('\t')
    # delimiter target_text with '<start>' for start word and '<end>' for end word
    input_text_seq = text_to_word_sequence(input_text)
    target_text_seq = text_to_word_sequence(target_text)
    target_text_seq = ['<GO>'] + target_text_seq + ['<EOS>']
    
    input_texts_seq.append(input_text_seq)
    target_texts_seq.append(target_text_seq)
    
    for w in input_text_seq:
        if w not in input_words:
            input_words.add(w)
    
    for w in target_text_seq:
        if w not in target_words:
            target_words.add(w)

input_words = sorted(list(input_words))
target_words = sorted(list(target_words))

num_encoder_tokens = len(input_words)
num_decoder_tokens = len(target_words)
max_encoder_seq_length = max([len(seq) for seq in input_texts_seq])
max_decoder_seq_length = max([len(seq) for seq in target_texts_seq])
num_samples = len(input_texts_seq)

print ('number of samples: ', num_samples)
print ('number of input  tokens:', num_encoder_tokens)
print ('number of output tokens:', num_decoder_tokens)
print ('Max sequence length for inputs:', max_encoder_seq_length)
print ('Max sequence length for outputs:', max_decoder_seq_length)


number of samples:  10000
number of input  tokens: 2226
number of output tokens: 4578
Max sequence length for inputs: 5
Max sequence length for outputs: 12


In [4]:
print(target_texts_seq[10])

['<GO>', 'attends', '<EOS>']


In [5]:
input_token_index  = dict([(w, i) for i, w in enumerate(input_words)])
input_index_token  = dict([(i, w) for i, w in enumerate(input_words)])
target_token_index = dict([(w, i) for i, w in enumerate(target_words)])
target_index_token = dict([(i, w) for i, w in enumerate(target_words)])


In [None]:
print(target_token_index)

In [6]:


# initialization of the tensors used for training
encoder_input_data  = np.zeros((num_samples, max_encoder_seq_length, num_encoder_tokens), dtype='float32')
decoder_input_data  = np.zeros((num_samples, max_decoder_seq_length, num_decoder_tokens), dtype='float32')
decoder_target_data = np.zeros((num_samples, max_decoder_seq_length, num_decoder_tokens), dtype='float32')

In [7]:
# setup the tensors from the input data for the model
for i, (input_text_seq, target_text_seq) in enumerate(zip(input_texts_seq, target_texts_seq)):
    for j, w in enumerate(input_text_seq):
        encoder_input_data[i, j, input_token_index[w]] = 1
    for j, w in enumerate(target_text_seq):
        # decoder_input_data is ahead of decoder_target_data by one timestep
        decoder_input_data[i, j, target_token_index[w]] = 1
        if j > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start token <GO>.
            decoder_target_data[i, j - 1, target_token_index[w]] = 1



In [8]:
# model setup using LSTM

latent_dim = 256  # Latent dimensionality of the encoding space.

# encoder
encoder_input = Input(shape=(None, num_encoder_tokens))
# embedded_encoder = Embedding(input_dim=num_encoder_tokens, 
#                              input_length=max_encoder_seq_length, output_dim=256)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True, name='encoder_lstm')
encoder_outputs, state_h, state_c = encoder_lstm(encoder_input)
# we'll only use the encoder state
encoder_states = [state_h, state_c]

# decoder
decoder_input = Input(shape=(None, num_decoder_tokens))
# embedded_decoder = Embedding(input_dim=num_decoder_tokens, 
#                              input_length=max_decoder_seq_length, output_dim=256)(decoder_inputs)
# decoder will return full output sequence and internal states
# internal states will be used during inference and not during training.
decoder_lstm = LSTM(latent_dim, return_state=True, return_sequences=True, name='decoder_lstm')
# Set up the decoder, using `encoder_states` as initial state.
decoder_outputs, _, _ = decoder_lstm(decoder_input, initial_state=encoder_states)

decoder_output = Dense(256, activation='relu', name='decoder_dense1')(decoder_outputs)
decoder_output = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense2')(decoder_output)

model = Model([encoder_input, decoder_input], decoder_output)


In [9]:
import json
model.summary()
# print(encoder_input_data.shape)
# print(encoder_input_data.shape[-1])
# model_as_json = json.loads(model.to_json())
# print(json.dumps(model_as_json, indent=2))

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, None, 2226)    0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, None, 4578)    0                                            
____________________________________________________________________________________________________
encoder_lstm (LSTM)              [(None, 256), (None,  2542592     input_1[0][0]                    
____________________________________________________________________________________________________
decoder_lstm (LSTM)              [(None, None, 256), ( 4951040     input_2[0][0]                    
                                                                   encoder_lstm[0][1]      

In [10]:
from keras.callbacks import ModelCheckpoint

# Run training

batch_size = 64  # Batch size for training.
epochs = 20  # Number of epochs to train for.
# learning_rate = 0.005

checkpointer = ModelCheckpoint(filepath='seq2seq_weights_best_2.hdf5', 
                           verbose=1, save_best_only=True)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                  metrics=['accuracy'])
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
         batch_size=batch_size, epochs=epochs, validation_split=0.2, callbacks=[checkpointer])
# save the model
model.save('seq2seq_2.h5')

Train on 8000 samples, validate on 2000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
from keras.models import load_model

# del model
model = load_model('seq2seq_2.h5')

model.layers

In [11]:
# Inference... testing the model
# Here's the drill from the Keras tutorial code:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
#    and a "start of sequence" token as target.
#    Output will be the next target token
# 3) Repeat with the current target token and current states

from keras.models import load_model

# model = load_model('seq2seq_2.h5')

# inference models
encoder_model = Model(encoder_input, encoder_states)
# encoder_model.summary()

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]
# decoder_inputs = Input(shape=(None,))
# embedded_decoder = Embedding(num_decoder_tokens, latent_dim)(decoder_inputs)
# decoder_lstm = LSTM(latent_dim, return_state=True, return_sequences=True)

decoder_output, state_h, state_c = decoder_lstm(decoder_input, initial_state=decoder_state_inputs)
decoder_states = [state_h, state_c]

decoder_output = Dense(256, activation='relu', name='decoder_dense1')(decoder_output)
decoder_output = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense2')(decoder_output)

decoder_model = Model([decoder_input]+decoder_state_inputs, [decoder_output]+decoder_states)

# save the model
decoder_model.save('seq2seq_inference_2.h5')

In [None]:
# model.summary()
# encoder_model.summary()
decoder_model.summary()


In [12]:
def decode_sequence(input_seq):
    # get encoded state vectors from input
    states = encoder_model.predict(input_seq)
    # define empty target sequence of length 1
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # set the first token of target sequence with the start token
    target_seq[0, 0, target_token_index['<GO>']] = 1
    
    # loop on the batch of sequences
    done = False
    decoded_sequence = []
    while not done:
        output_tokens, h, c = decoder_model.predict([target_seq]+states)
        decoded_token_index = np.argmax(output_tokens[0, -1, :])
        decoded_token = target_index_token[decoded_token_index]
        decoded_sequence.append(decoded_token)
        # we are done if we hit stop token or the sequence is at max length
        if (decoded_token == '<EOS>' or 
           len(decoded_sequence) > max_decoder_seq_length):
            done = True
            
        # update the target sequence of length 1
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, decoded_token_index] = 1
        # update states
        states = [h, c]
    
    return decoded_sequence

In [17]:
# testing
for seq_index in range(10):
    input_seq = encoder_input_data[seq_index: seq_index+1]
    decoded_sequence = decode_sequence(input_seq)
    print('++--------------++')
    print('input seq  : ', input_texts_seq[seq_index])
    #print('dseq: ', decoded_sequence)
    #print('decoded seq: ', logits_to_text(decoded_sequence[0], target_texts_tokenizer))
    print('decoded seq: ', decoded_sequence)

++--------------++
input seq  :  ['go']
decoded seq:  ['cuisiner', 'enseigner', 'goûtez', 'goûtez', 'goûtez', 'goûtez', 'goûtez', 'espionnes', 'espionnes', 'espionnes', 'espionnes', 'espionnes', 'goûtez']
++--------------++
input seq  :  ['run']
decoded seq:  ['fantôme', 'descendez', 'goûtez', 'goûtez', 'goûtez', 'goûtez', 'goûtez', 'goûtez', 'espionnes', 'espionnes', 'goûtez', 'goûtez', 'goûtez']
++--------------++
input seq  :  ['run']
decoded seq:  ['fantôme', 'descendez', 'goûtez', 'goûtez', 'goûtez', 'goûtez', 'goûtez', 'goûtez', 'espionnes', 'espionnes', 'goûtez', 'goûtez', 'goûtez']
++--------------++
input seq  :  ['wow']
decoded seq:  ['ri', 'garder', 'simule', 'goûtez', 'discutâmes', 'emballe', 'emballe', 'emballe', 'emballe', 'emballe', 'emballe', 'emballe', 'emballe']
++--------------++
input seq  :  ['fire']
decoded seq:  ['jaune', 'jaune', 'discutâmes', 'goûtez', 'celle', 'goûtez', 'goûtez', 'celle', 'emballe', 'emballe', 'emballe', 'emballe', 'emballe']
++--------------+