# 10. Sequence-to-sequence models and attention

###  10.4.1 Preparing the corpus for training

In [1]:
import os
# from nlpia.loaders import get_data, DATA_PATH
# df = get_data(os.path.join(DATA_PATH, '..', 'book', 'data', 'dialog.txt'))
import pandas as pd
df = pd.read_csv("../../data/moviedialog.csv", index_col=0)
df.columns = 'statement reply'.split()
df = df.fillna(' ')
input_texts, target_texts = [], []  # <1>
input_vocabulary = set()  # <2>
output_vocabulary = set()
start_token = '\t'  # <3>
stop_token = '\n'
max_training_samples = min(25000, len(df) - 1)  # <4>

for input_text, target_text in zip(df.statement, df.reply):
    target_text = start_token + target_text \
        + stop_token  # <5>
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:  # <6>
        if char not in input_vocabulary:
            input_vocabulary.add(char)
    for char in target_text:
        if char not in output_vocabulary:
            output_vocabulary.add(char)

# <1> The arrays hold the input and target text read from the corpus file.
# <2> The sets hold the seen characters in the input and target text.
# <3> The target sequence is annotated with a start (first) and stop (last) token; the characters representing the tokens are defined here. These tokens can't be part of the normal sequence text and should be uniquely used as start and stop tokens.
# <4> `max_training_samples` defines how many lines are used for the training. It is the lower number of either a user-defined maximum or the total number of lines loaded from the file.
# <5> The `target_text` needs to be wrapped with the start and stop tokens.
# <6> Compile the vocabulary -- set of the unique characters seen in the input_texts

### 10.4.2 Building your character dictionary

In [2]:
input_vocabulary = sorted(input_vocabulary)  # <1>
output_vocabulary = sorted(output_vocabulary)

input_vocab_size = len(input_vocabulary)  # <2>
output_vocab_size = len(output_vocabulary)
max_encoder_seq_length = max(
    [len(txt) for txt in input_texts])  # <3>
max_decoder_seq_length = max(
    [len(txt) for txt in target_texts])

input_token_index = dict([(char, i) for i, char in enumerate(input_vocabulary)])  # <4>
target_token_index = dict(
    [(char, i) for i, char in enumerate(output_vocabulary)])
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())  # <5>
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

# <1> You convert the character sets into sorted lists of characters, which you then use to generate the dictionary.
# <2> For the input and target data, you determine the maximum number of unique characters, which you use to build the one-hot matrices.
# <3> For the input and target data, you also determine the maximum number of sequence tokens.
# <4> Loop over the `input_characetrs` and `output_vocabulary` to create the lookup dictionaries, which you use to generate the one-hot vectors.
# <5> Loop over the newly created dictionaries to create the reverse lookups.

### 10.4.3 Generate one-hot encoded training sets

In [3]:
import numpy as np  # <1> # noqa

encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, input_vocab_size),
    dtype='float32')  # <2>
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, output_vocab_size),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, output_vocab_size),
    dtype='float32')

for i, (input_text, target_text) in enumerate(
        zip(input_texts, target_texts)):  # <3>
    for t, char in enumerate(input_text):  # <4>
        encoder_input_data[
            i, t, input_token_index[char]] = 1.  # <5>
    for t, char in enumerate(target_text):  # <6>
        decoder_input_data[
            i, t, target_token_index[char]] = 1.
        if t > 0:
            decoder_target_data[i, t - 1, target_token_index[char]] = 1

# <1> You use numpy for the matrix manipulations.
# <2> The training tensors are initialized as zero tensors with the shape of number of samples (this number should be equal for the input and target samples) times the maximum number of sequence tokens times the number of possible characters.
# <3> Loop over the training samples; input and target texts need to match.
# <4> Loop over each character of each sample.
# <5> Set the index for the character at each time step to one; all other indices remain at zero. This creates the one-hot encoded representation of the training samples.
# <6> For the training data for the decoder, you create the `decoder_input_data` and `decoder_target_data` (which is one time step behind the _decoder_input_data_).


### 10.4.4 Train your sequence-sequence chatbot

In [4]:
from keras.models import Model  # noqa
from keras.layers import Input, LSTM, Dense  # noqa

batch_size = 16    # <1>
epochs = 1 #100       # <2>
num_neurons = 16 #256  # <3>

encoder_inputs = Input(shape=(None, input_vocab_size))
encoder = LSTM(num_neurons, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None, output_vocab_size))
decoder_lstm = LSTM(num_neurons, return_sequences=True,
                    return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(output_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['acc'])
model.fit([encoder_input_data, decoder_input_data],
          decoder_target_data, batch_size=batch_size, epochs=epochs,
          validation_split=0.1)  # <4>

# <1> In this example, you set the batch size to 64 samples. Increasing the batch size can speed up the training; it might also require more memory
# <2> Training a sequence-to-sequence network can be lengthy and easily require 100 epochs
# <3> In this example, you set the number of neuron dimensions to 256
# <4> You withold 10% of the samples for validation tests after each epoch

2022-12-15 15:03:00.089554: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-15 15:03:02.045923: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-15 15:03:02.046203: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-15 15:03:06.230484: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-



<keras.callbacks.History at 0x7f1f6043e650>

### 10.4.5 Assemble the model for sequence generation

In [6]:
encoder_model = Model(encoder_inputs, encoder_states)
thought_input = [
    Input(shape=(num_neurons,)), Input(shape=(num_neurons,))]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=thought_input)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    inputs=[decoder_inputs] + thought_input,
    outputs=[decoder_outputs] + decoder_states)

### 10.4.6 Predicting a sequence

In [7]:
def decode_sequence(input_seq):
    thought = encoder_model.predict(input_seq)  # <1>

    target_seq = np.zeros((1, 1, output_vocab_size))  # <2>
    target_seq[0, 0, target_token_index[stop_token]
        ] = 1.  # <3>
    stop_condition = False
    generated_sequence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + thought) # <4>

        generated_token_idx = np.argmax(output_tokens[0, -1, :])
        generated_char = reverse_target_char_index[generated_token_idx]
        generated_sequence += generated_char
        if (generated_char == stop_token or
                len(generated_sequence) > max_decoder_seq_length
                ):  # <5>
            stop_condition = True

        target_seq = np.zeros((1, 1, output_vocab_size))  # <6>
        target_seq[0, 0, generated_token_idx] = 1.
        thought = [h, c]  # <7>

    return generated_sequence

# <1> Generate the thought vector as the input to the ecoder
# <2> In contrast to the training, target_seq starts off as a zero tensor
# <3> The first input token to the decoder is the start token
# <4> Passing the already-genereated tokens and the latest state to the decoder to predict the next sequence element
# <5> Setting the stop_condition to True will stop the loop
# <6> Update the target sequence and use the last generated token as the input to the next generation step
# <7> Update th ethought vector state

### 10.4.7 Generating a response

In [8]:
def response(input_text):
    input_seq = np.zeros((1, max_encoder_seq_length, input_vocab_size),
        dtype="float32")
    for t, char in enumerate(input_text): # <1>
        input_seq[0, t, input_token_index[char]] = 1.
    decoded_sentence = decode_sequence(input_seq) # <2>
    print("bot Reply (Decoded sentence):", decoded_sentence)

# <1> Loop over each character of the input text to generate the one-hot tensor for encoder to generate the thought vector from
# <2> Use the decode_sequence function to call the trained model and generate the response sequence

### 10.4.8 Converse with your chatbot

In [9]:
response("what is the internet?")

bot Reply (Decoded sentence): i                                                                                                      


In [10]:
response("why?")

bot Reply (Decoded sentence): i                                                                                                      


In [11]:
response("do you like coffeee?")

bot Reply (Decoded sentence): i                                                                                                      


In [12]:
response("do you like football?")

bot Reply (Decoded sentence): i                                                                                                      
