# Example 1: based on the Keras tutorial on Seq2Seq ([blog](https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html)).

[dataset source (english-french)](http://www.manythings.org/anki/fra-eng.zip)

This version uses individual characters as tokens and LSTM layers for both the encoder and the decoder.

### data prep

In [1]:
import numpy as np
from keras.models import Model
from keras.layers import Input, LSTM, Dense

Using TensorFlow backend.


In [2]:
# Load tab-separated sentence pairs (source text, target text) and collect the
# character vocabularies that are later used to one-hot encode them.
filename = 'fra.txt'
num_samples = 10000  # upper bound on the number of sentence pairs to load

# Read the corpus with an explicit encoding (the target side contains non-ASCII
# characters) and let the context manager close the file handle.
with open(filename, encoding='utf-8') as corpus_file:
    # drop blank lines, e.g. the empty string left behind by a trailing newline
    lines = [line for line in corpus_file.read().split('\n') if line]

input_texts = []
target_texts = []
input_chars = set()
target_chars = set()

# process the lines
for line in lines[:num_samples]:
    # only the first two columns are used; extra tab-separated columns are ignored
    input_text, target_text = line.split('\t')[:2]
    # delimit target_text with '\t' as the start char and '\n' as the end char
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    # a set ignores duplicates, so no membership test is needed before adding
    input_chars.update(input_text)
    target_chars.update(target_text)

if not input_texts:
    # fail with a clear message instead of the opaque "max() arg is an empty sequence"
    raise ValueError('no sentence pairs were read from ' + filename)

# sorted lists give every character a stable index
input_chars = sorted(input_chars)
target_chars = sorted(target_chars)
num_encoder_tokens = len(input_chars)
num_decoder_tokens = len(target_chars)
# the longest sequences determine the time dimension of the training tensors
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)


print ('number of samples: ', len(input_texts))
print ('number of unique input  tokens:', len(input_chars))
print ('number of unique output tokens:', len(target_chars))
print ('Max sequence length for inputs:', max_encoder_seq_length)
print ('Max sequence length for outputs:', max_decoder_seq_length)


number of samples:  10000
number of unique input  tokens: 71
number of unique output tokens: 93
Max sequence length for inputs: 16
Max sequence length for outputs: 59


In [5]:
# Show both sorted character vocabularies, source side first.
for vocab_label, vocab in (('input_chars : ', input_chars),
                           ('target_chars: ', target_chars)):
    print(vocab_label, vocab)

input_chars :  [' ', '!', '$', '&', "'", ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '9', ':', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '’']
target_chars:  ['\t', '\n', ' ', '!', '$', '&', "'", '(', ')', ',', '-', '.', '0', '1', '5', '6', '9', ':', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xa0', '«', '»', 'À', 'Ç', 'É', 'Ê', 'à', 'â', 'ç', 'è', 'é', 'ê', 'ë', 'î', 'ï', 'ô', 'ù', 'û', 'œ', '\u2009', '‘', '’', '\u202f']


In [6]:
# Forward (character -> index) and reverse (index -> character) lookup tables
# for each vocabulary; the index is the character's position in its list.
input_token_index = {char: idx for idx, char in enumerate(input_chars)}
input_index_token = dict(enumerate(input_chars))
target_token_index = {char: idx for idx, char in enumerate(target_chars)}
target_index_token = dict(enumerate(target_chars))


In [7]:


# Allocate the all-zero training tensors, each indexed as
# (sample, time step, token); the one-hot entries are filled in afterwards.
num_pairs = len(input_texts)
encoder_shape = (num_pairs, max_encoder_seq_length, num_encoder_tokens)
decoder_shape = (num_pairs, max_decoder_seq_length, num_decoder_tokens)

encoder_input_data = np.zeros(encoder_shape, dtype='float32')
decoder_input_data = np.zeros(decoder_shape, dtype='float32')
decoder_target_data = np.zeros(decoder_shape, dtype='float32')


In [8]:
# Fill the one-hot training tensors from the text pairs.
for sample_idx, (source_text, delimited_target) in enumerate(zip(input_texts, target_texts)):
    for step, char in enumerate(source_text):
        encoder_input_data[sample_idx, step, input_token_index[char]] = 1
    for step, char in enumerate(delimited_target):
        token_id = target_token_index[char]
        # the decoder input holds the full target, start character included
        decoder_input_data[sample_idx, step, token_id] = 1
        if step >= 1:
            # the decoder target is the same sequence shifted one step earlier,
            # so it never contains the start character
            decoder_target_data[sample_idx, step - 1, token_id] = 1.


In [9]:
# model setup using LSTM
# Training-time encoder/decoder graph: the encoder LSTM reads the one-hot input
# characters and only its final states are kept; the decoder LSTM starts from
# those states, receives the target sequence as input, and a softmax layer
# scores every target character at every time step.

latent_dim = 256  # Latent dimensionality of the encoding space.

# encoder
# the time dimension is None, i.e. it is not fixed in the graph
encoder_inputs = Input(shape=(None, num_encoder_tokens))
# return_state=True makes the layer return its final h and c states as well
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# we'll only use the encoder state; encoder_outputs is not used again in this cell
encoder_states = [state_h, state_c]

# decoder
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# decoder will return full output sequence and internal states
# internal states will be used during inference and not during training.
decoder_lstm = LSTM(latent_dim, return_state=True, return_sequences=True)
# Set up the decoder, using `encoder_states` as initial state.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# project each decoder time step onto the target vocabulary
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# inputs: [encoder one-hot sequence, decoder one-hot sequence]; output: per-step softmax
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


In [10]:
# Run training
from keras.callbacks import ModelCheckpoint

batch_size = 64  # samples per gradient update
epochs = 20      # passes over the training data

# Checkpoint callback: with save_best_only=True only the best epoch so far
# (by the callback's monitored quantity) is written to the weights file.
checkpointer = ModelCheckpoint(
    filepath='seq2seq_weights_best_1.hdf5',
    verbose=1,
    save_best_only=True,
)

model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# The last 20% of the samples are held out for validation.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=[checkpointer],
)

# save the model as it stands after the final epoch
model.save('seq2seq_1.h5')

Train on 8000 samples, validate on 2000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [11]:
from keras.models import load_model

# Reload the model file written by the training cell and rebind `model` to it.
# NOTE(review): the inference models further down are built from the in-memory
# layer objects (encoder_inputs, decoder_lstm, ...), not from this reloaded
# model -- confirm that is intended.
saved_model_path = 'seq2seq_1.h5'
model = load_model(saved_model_path)

# show the layers of the reloaded model
model.layers

[<keras.engine.topology.InputLayer at 0x11e86dac8>,
 <keras.engine.topology.InputLayer at 0x11e86da90>,
 <keras.layers.recurrent.LSTM at 0x11e86dc18>,
 <keras.layers.recurrent.LSTM at 0x11e86db00>,
 <keras.layers.core.Dense at 0x11e86dc88>]

In [12]:
# Inference... testing the model
# Here's the drill from the Keras tutorial code:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
#    and a "start of sequence" token as target.
#    Output will be the next target token
# 3) Repeat with the current target token and current states

# inference models
# encoder_model maps an input sequence to the encoder's final [h, c] states
encoder_model = Model(encoder_inputs, encoder_states)

# At inference time the decoder states are fed in explicitly, so that each
# step can continue from the states returned by the previous step.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]
# decoder_lstm and decoder_dense are the same layer objects used to build the
# training graph; this call rebinds decoder_outputs, state_h and state_c.
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
# inputs: [target tokens, h, c]; outputs: [token probabilities, new h, new c]
decoder_model = Model([decoder_inputs]+decoder_state_inputs, [decoder_outputs]+decoder_states)



In [13]:
# Print the layer-by-layer summary of the full model, then of the encoder-only model.
for summarized_model in (model, encoder_model):
    summarized_model.summary()


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, None, 71)      0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, None, 93)      0                                            
____________________________________________________________________________________________________
lstm_1 (LSTM)                    [(None, 256), (None,  335872      input_1[0][0]                    
____________________________________________________________________________________________________
lstm_2 (LSTM)                    [(None, None, 256), ( 358400      input_2[0][0]                    
                                                                   lstm_1[0][1]            

In [14]:
def decode_sequence(input_seq):
    """Greedily decode one encoded input sequence into target text.

    The encoder model turns `input_seq` into the initial decoder states. The
    decoder model is then run one character at a time, starting from the tab
    start character and feeding each predicted character back in, until it
    emits the newline end character or the output grows longer than
    `max_decoder_seq_length`.

    Args:
        input_seq: one-hot encoded input; the caller in this notebook passes a
            one-sample slice of `encoder_input_data`.

    Returns:
        The decoded characters joined into a string. The end character is
        included when the decoder produced it.
    """
    def one_hot_step(token_index):
        # a batch of one sample with a single time step, one-hot at token_index
        step = np.zeros((1, 1, num_decoder_tokens))
        step[0, 0, token_index] = 1
        return step

    # encoded state vectors for the input
    states = encoder_model.predict(input_seq)
    # the first decoder input is the start character
    target_seq = one_hot_step(target_token_index['\t'])

    decoded_chars = []
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states)
        # greedy choice: most probable token at the last (only) time step
        next_index = np.argmax(output_tokens[0, -1, :])
        next_char = target_index_token[next_index]
        decoded_chars.append(next_char)

        # stop on the end character or once the output exceeds the max length
        if next_char == '\n' or len(decoded_chars) > max_decoder_seq_length:
            break

        # feed the prediction and the updated states into the next step
        target_seq = one_hot_step(next_index)
        states = [h, c]

    return ''.join(decoded_chars)

In [15]:
# testing
# Decode the first samples and print each result next to its source sentence.
# The count is capped by the number of loaded samples so that a small dataset
# does not index past the end of `input_texts`.
num_to_show = min(100, len(input_texts))
for seq_index in range(num_to_show):
    # slice (rather than index) to keep the leading batch axis: a batch of one
    input_seq = encoder_input_data[seq_index: seq_index+1]
    decoded_sequence = decode_sequence(input_seq)
    print('++--------------++')
    print('input seq  : ', input_texts[seq_index])
    print('decoded seq: ', decoded_sequence )

++--------------++
input seq  :  Go.
decoded seq:  Arrêtez de hurler.

++--------------++
input seq  :  Run!
decoded seq:  Arrêtez de hurler.

++--------------++
input seq  :  Run!
decoded seq:  Arrêtez de hurler.

++--------------++
input seq  :  Wow!
decoded seq:  Attends un coup !

++--------------++
input seq  :  Fire!
decoded seq:  Attendez un coup !

++--------------++
input seq  :  Help!
decoded seq:  Arrêtez de l'arder.

++--------------++
input seq  :  Jump.
decoded seq:  Attends un coup !

++--------------++
input seq  :  Stop!
decoded seq:  Arrête de te dis laiguer.

++--------------++
input seq  :  Stop!
decoded seq:  Arrête de te dis laiguer.

++--------------++
input seq  :  Stop!
decoded seq:  Arrête de te dis laiguer.

++--------------++
input seq  :  Wait!
decoded seq:  Attendez un coup !

++--------------++
input seq  :  Wait!
decoded seq:  Attendez un coup !

++--------------++
input seq  :  I see.
decoded seq:  Je l'ai fait prêt.

++--------------++
input seq  :  I 