# Example-2 based on Keras tutorial on Seq2Seq [blog](https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html).

[dataset source (english-french)](http://www.manythings.org/anki/fra-eng.zip)

In this example we'll use words as tokens with Embedding and LSTM

### data prep

In [1]:
import numpy as np
from keras.preprocessing.text import text_to_word_sequence
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.layers.embeddings import Embedding
from keras.losses import sparse_categorical_crossentropy
from keras.optimizers import Adam


Using TensorFlow backend.


In [2]:
def sequence_to_text(sequence, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    return ' '.join([index_to_words[token] for token in sequence if token > 0])

In [3]:
filename = 'fra.txt'
input_texts_seq = []
target_texts_seq = []
lines = open(filename).read().split('\n')
num_samples = 10000
special_words = ['<PAD>', '<UNK>', '<GO>',  '<EOS>']

input_words = set(special_words)
target_words = set(special_words)

# process the lines
for line in lines[:min(num_samples, len(lines)-1)]:
    input_text, target_text = line.split('\t')
    # delimiter target_text with '<start>' for start word and '<end>' for end word
    input_text_seq = text_to_word_sequence(input_text)
    target_text_seq = text_to_word_sequence(target_text)
    target_text_seq = ['<GO>'] + target_text_seq + ['<EOS>']
    
    input_texts_seq.append(input_text_seq)
    target_texts_seq.append(target_text_seq)
    
    for w in input_text_seq:
        if w not in input_words:
            input_words.add(w)
    
    for w in target_text_seq:
        if w not in target_words:
            target_words.add(w)

input_words = sorted(list(input_words))
target_words = sorted(list(target_words))

num_encoder_tokens = len(input_words)
num_decoder_tokens = len(target_words)
max_encoder_seq_length = max([len(seq) for seq in input_texts_seq])
max_decoder_seq_length = max([len(seq) for seq in target_texts_seq])
num_samples = len(input_texts_seq)

print ('number of samples: ', num_samples)
print ('number of input  tokens:', num_encoder_tokens)
print ('number of output tokens:', num_decoder_tokens)
print ('Max sequence length for inputs:', max_encoder_seq_length)
print ('Max sequence length for outputs:', max_decoder_seq_length)


number of samples:  10000
number of input  tokens: 2226
number of output tokens: 4578
Max sequence length for inputs: 5
Max sequence length for outputs: 12


In [4]:
print(target_texts_seq[10])

['<GO>', 'attends', '<EOS>']


In [5]:
input_token_index  = dict([(w, i) for i, w in enumerate(input_words)])
input_index_token  = dict([(i, w) for i, w in enumerate(input_words)])
target_token_index = dict([(w, i) for i, w in enumerate(target_words)])
target_index_token = dict([(i, w) for i, w in enumerate(target_words)])


In [6]:
print(target_token_index)

{'truc': 4242, 'confectionné': 776, 'biscuits': 415, 'long': 2337, 'étudie': 4560, 'gagnez': 1706, 'rhume\u202f': 3560, 'lavé': 2292, 'bâtie': 518, 'accises': 27, 'reçu': 3558, 'baissé': 367, 'pressez': 3198, 'désolée': 1290, 'sûre': 4000, 'plombs': 3120, 'fiça': 1593, 'juste\u202f': 2139, "m'aimes": 2378, 'rembourser': 3455, 'conduire': 770, 'déciderons': 1211, 'voir': 4394, 'pousse': 3175, 'travaillent': 4196, 'camionnette': 545, 'talons': 4056, 'distinctement': 1124, 'prune': 3254, 'tête': 4269, 'géré': 1816, 'pouvons': 3183, 'légal': 2367, 'haïssaient': 1839, 'ski': 3834, 'ponctuels': 3139, 'démarré': 1245, 'fière': 1594, 'fin': 1581, 'prie': 3205, 'approuvent': 211, 'vivante': 4382, 'réussirons': 3643, "j'obtiendrai": 2065, 'dernières': 1049, 'mesurer': 2560, 'musique': 2665, 'têtu': 4270, 'attache': 274, "d'autre": 990, 'puissantes': 3302, 'suicidaire': 3944, 'frères': 1671, 'maman': 2456, 'tricote': 4215, 'revoilà': 3555, 'escroquerie': 1435, 'fusil': 1682, 'voleur': 4406, 'surv

In [53]:


# initialization of the tensors used for training
encoder_input_data  = np.zeros((num_samples, max_encoder_seq_length), dtype='float32')
encoder_input_data.fill(input_token_index['<PAD>'])
decoder_input_data  = np.zeros((num_samples, max_decoder_seq_length), dtype='float32')
decoder_input_data.fill(target_token_index['<PAD>'])
# decoder_target_data = np.zeros((num_samples, max_decoder_seq_length), dtype='float32')
# decoder_target_data.fill(target_token_index['<PAD>'])
decoder_target_data = np.zeros((num_samples, max_decoder_seq_length, num_decoder_tokens), dtype='float32')


In [54]:
# setup the tensors from the input data for the model
for i, (input_text_seq, target_text_seq) in enumerate(zip(input_texts_seq, target_texts_seq)):
    for j, w in enumerate(input_text_seq):
        encoder_input_data[i, j] = input_token_index[w]
    for j, w in enumerate(target_text_seq):
        # decoder_input_data is ahead of decoder_target_data by one timestep
        decoder_input_data[i, j] = target_token_index[w]
        if j > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start token <GO>.
            decoder_target_data[i, j - 1, target_token_index[w]] = 1



In [55]:
print(encoder_input_data[0])
print(encoder_input_data.shape)
print(decoder_input_data[0])
print(decoder_input_data.shape)
print(decoder_target_data[0])
print(decoder_target_data.shape)

[ 841.   13.   13.   13.   13.]
(10000, 5)
[    6.  4288.     5.     7.     7.     7.     7.     7.     7.     7.
     7.     7.]
(10000, 12)
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
(10000, 12, 4578)


In [56]:
# model setup using LSTM

latent_dim = 128  # Latent dimensionality of the encoding space.

# encoder
encoder_input = Input(shape=encoder_input_data.shape[1:])
print(encoder_input.shape)
embedded_encoder = Embedding(input_dim=num_encoder_tokens, 
                             input_length=max_encoder_seq_length, output_dim=256)(encoder_input)
print(embedded_encoder.shape)
encoder_lstm = LSTM(latent_dim, return_state=True, name='encoder_lstm')
encoder_outputs, state_h, state_c = encoder_lstm(embedded_encoder)
# we'll only use the encoder state
encoder_states = [state_h, state_c]

# decoder
decoder_input = Input(shape=decoder_input_data.shape[1:])
print(decoder_input.shape)
embedded_decoder = Embedding(input_dim=num_decoder_tokens, 
                             input_length=max_decoder_seq_length, output_dim=256)(decoder_input)
# decoder will return full output sequence and internal states
# internal states will be used during inference and not during training.
decoder_lstm = LSTM(latent_dim, return_state=True, return_sequences=True, name='decoder_lstm')
# Set up the decoder, using `encoder_states` as initial state.
decoder_output, _, _ = decoder_lstm(embedded_decoder, initial_state=encoder_states)

# decoder_output = Dense(256, activation='relu', name='decoder_dense1')(decoder_outputs)
# decoder_output = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense2')(decoder_output)
print(decoder_output.shape)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_output = decoder_dense(decoder_output)


model = Model([encoder_input, decoder_input], decoder_output)


(?, 5)
(?, 5, 256)
(?, 12)
(?, ?, 128)


In [57]:
import json
model.summary()
# print(encoder_input_data.shape)
# print(encoder_input_data.shape[-1])
# model_as_json = json.loads(model.to_json())
# print(json.dumps(model_as_json, indent=2))

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_22 (InputLayer)            (None, 5)             0                                            
____________________________________________________________________________________________________
input_23 (InputLayer)            (None, 12)            0                                            
____________________________________________________________________________________________________
embedding_22 (Embedding)         (None, 5, 256)        569856      input_22[0][0]                   
____________________________________________________________________________________________________
embedding_23 (Embedding)         (None, 12, 256)       1171968     input_23[0][0]                   
___________________________________________________________________________________________

In [58]:
from keras.callbacks import ModelCheckpoint
from keras.utils.np_utils import to_categorical

# Run training

batch_size = 200  # Batch size for training.
epochs = 10  # Number of epochs to train for.
# learning_rate = 0.005

checkpointer = ModelCheckpoint(filepath='seq2seq_weights_best_2.hdf5', 
                           verbose=1, save_best_only=True)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                  metrics=['accuracy'])
# reshape encoder_target
# tmp_target_data = decoder_target_data.reshape((-1, decoder_target_data.shape[1], 1))
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
         batch_size=batch_size, epochs=epochs, validation_split=0.2, callbacks=[checkpointer])
# save the model
model.save('seq2seq_2v2.h5')

Train on 8000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
from keras.models import load_model

# del model
model = load_model('seq2seq_2v2.h5')

model.layers

In [62]:
# Inference... testing the model
# Here's the drill from the Keras tutorial code:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
#    and a "start of sequence" token as target.
#    Output will be the next target token
# 3) Repeat with the current target token and current states

from keras.models import load_model

# model = load_model('seq2seq_2.h5')

# inference models
encoder_model = Model(encoder_input, encoder_states)
# encoder_model.summary()

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]
# decoder_inputs = Input(shape=(None,))
# embedded_decoder = Embedding(num_decoder_tokens, latent_dim)(decoder_inputs)
# decoder_lstm = LSTM(latent_dim, return_state=True, return_sequences=True)

decoder_output, state_h, state_c = decoder_lstm(embedded_decoder, initial_state=decoder_state_inputs)
decoder_states = [state_h, state_c]

# decoder_output = Dense(256, activation='relu', name='decoder_dense1')(decoder_output)
# decoder_output = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense2')(decoder_output)
decoder_output = decoder_dense(decoder_output)

decoder_model = Model([decoder_input]+decoder_state_inputs, [decoder_output]+decoder_states)

# save the model
decoder_model.save('seq2seq_inference_2v2.h5')

ValueError: Layer decoder_lstm expects 7 inputs, but it received 3 input tensors. Input received: [<tf.Tensor 'embedding_24/Gather:0' shape=(?, 12, 256) dtype=float32>, <tf.Tensor 'input_30:0' shape=(?, 128) dtype=float32>, <tf.Tensor 'input_31:0' shape=(?, 128) dtype=float32>]

In [16]:
# model.summary()
# encoder_model.summary()
decoder_model.summary()


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_4 (InputLayer)             (None, None, 4578)    0                                            
____________________________________________________________________________________________________
input_5 (InputLayer)             (None, 128)           0                                            
____________________________________________________________________________________________________
input_6 (InputLayer)             (None, 128)           0                                            
____________________________________________________________________________________________________
decoder_lstm (LSTM)              [(None, None, 128), ( 2409984     input_4[0][0]                    
                                                                   input_5[0][0]           

In [17]:
def decode_sequence(input_seq):
    # get encoded state vectors from input
    states = encoder_model.predict(input_seq)
    # define empty target sequence of length 1
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # set the first token of target sequence with the start token
    target_seq[0, 0, target_token_index['<GO>']] = 1
    
    # loop on the batch of sequences
    done = False
    decoded_sequence = []
    while not done:
        output_tokens, h, c = decoder_model.predict([target_seq]+states)
        decoded_token_index = np.argmax(output_tokens[0, -1, :])
        decoded_token = target_index_token[decoded_token_index]
        decoded_sequence.append(decoded_token)
        # we are done if we hit stop token or the sequence is at max length
        if (decoded_token == '<EOS>' or 
           len(decoded_sequence) > max_decoder_seq_length):
            done = True
            
        # update the target sequence of length 1
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, decoded_token_index] = 1
        # update states
        states = [h, c]
    
    return decoded_sequence

In [18]:
# testing
for seq_index in range(10):
    input_seq = encoder_input_data[seq_index: seq_index+1]
    decoded_sequence = decode_sequence(input_seq)
    print('++--------------++')
    print('input seq  : ', input_texts_seq[seq_index])
    #print('dseq: ', decoded_sequence)
    #print('decoded seq: ', logits_to_text(decoded_sequence[0], target_texts_tokenizer))
    print('decoded seq: ', decoded_sequence)

++--------------++
input seq  :  ['go']
decoded seq:  ['arrête', 'de', 'de', 'de', 'maison', '<EOS>']
++--------------++
input seq  :  ['run']
decoded seq:  ['arrête', '<EOS>']
++--------------++
input seq  :  ['run']
decoded seq:  ['arrête', '<EOS>']
++--------------++
input seq  :  ['wow']
decoded seq:  ["c'est", 'un', '<EOS>']
++--------------++
input seq  :  ['fire']
decoded seq:  ['arrête', 'de', 'de', 'de', 'de', 'maison', '<EOS>']
++--------------++
input seq  :  ['help']
decoded seq:  ['arrête', '<EOS>']
++--------------++
input seq  :  ['jump']
decoded seq:  ['arrête', 'de', 'de', 'de', 'de', 'maison', '<EOS>']
++--------------++
input seq  :  ['stop']
decoded seq:  ['arrête', 'de', 'de', 'de', 'de', 'maison', '<EOS>']
++--------------++
input seq  :  ['stop']
decoded seq:  ['arrête', 'de', 'de', 'de', 'de', 'maison', '<EOS>']
++--------------++
input seq  :  ['stop']
decoded seq:  ['arrête', 'de', 'de', 'de', 'de', 'maison', '<EOS>']
