# Building a Chatbot

In this project, we will build a chatbot using conversations from Cornell University's [Movie Dialogue Corpus](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html). The main features of our model are LSTM cells, a bidirectional dynamic RNN, and decoders with attention. 

The conversations will be cleaned rather extensively to help the model to produce better responses. As part of the cleaning process, punctuation will be removed, rare words will be replaced with "UNK" (our "unknown" token), longer sentences will not be used, and all letters will be in the lowercase. 

With a larger amount of data, it would be more practical to keep features, such as punctuation. However, I am using FloydHub's GPU services and I don't want to get carried away with too training for too long.

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import time
from enum import Enum

from corpus import Corpus

tf.__version__

'1.7.0'

Most of the code to load the data is courtesy of https://github.com/suriyadeepan/practical_seq2seq/blob/master/datasets/cornell_corpus/data.py.

### Inspect and Load the Data

In [2]:
cornell_corpus = Corpus("movie_lines.txt", "movie_conversations.txt", max_vocab=8100)
questions_int = cornell_corpus.prompts_int
answers_int = cornell_corpus.answers_int

vocab_dicts = (cornell_corpus.vocab2int, cornell_corpus.int2vocab)
(questions_vocab_to_int, questions_int_to_vocab) = vocab_dicts
(answers_vocab_to_int, answers_int_to_vocab) = vocab_dicts

eos = cornell_corpus.eos
unk = cornell_corpus.unk
pad = cornell_corpus.pad
go = cornell_corpus.go

In [3]:
# Sort questions and answers by the length of questions.
# This will reduce the amount of padding during training
# Which should speed up training and help to reduce the loss

max_source_line_length = max( [len(sentence) for sentence in questions_int])
max_targ_line_length = max([len(sentence) for sentence in answers_int])
max_line_length = max(max_source_line_length, max_targ_line_length)

sorted_questions = []
sorted_answers = []

for length in range(1, max_line_length+1):
    for i in enumerate(questions_int):
        if len(i[1]) == length:
            sorted_questions.append(questions_int[i[0]])
            sorted_answers.append(answers_int[i[0]])

print(len(sorted_questions))
print(len(sorted_answers))
print()
for i in range(3):
    print(sorted_questions[i])
    print(sorted_answers[i])
    print()

221414
221414

[5647]
[2, 111, 4, 5647, 1, 19, 52, 2, 3175, 11, 5, 2350, 8102, 5533, 11, 2157, 26, 598, 1, 72, 671, 304, 43, 97, 8103]

[60]
[8102, 2540, 43, 260, 3, 28, 98, 2323, 70, 43, 504, 500, 309, 84, 7, 24, 34, 36, 43, 48, 475, 11, 7, 79, 89, 8103]

[53]
[76, 8103]



In [4]:
#FIXME: This really should be something like "preprocess_targets"
def process_decoding_input(target_data, vocab_to_int, batch_size):
    '''Remove the last word id from each batch and concat the <GO> to the begining of each batch'''
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)
    return dec_input


In [5]:
class GraphMode(Enum):
    TRAIN = 0
    VALID = 1
    TEST = 2
    SERVE = 3
    
#def convert_graph_mode(value, dtype=None, as_)

In [6]:
def dropout_cell(rnn_size, keep_prob):
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    return tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob=keep_prob)

def multi_dropout_cell(rnn_size, keep_prob, num_layers):    
    return tf.contrib.rnn.MultiRNNCell( [dropout_cell(rnn_size, keep_prob) for _ in range(num_layers)] )

In [7]:
def encoding_layer(rnn_inputs, rnn_size, num_layers, keep_prob, sequence_lengths):
    """
    Create the encoding layer
    
    Returns a tuple `(outputs, output_states)` where
      outputs is a 2-tuple of vectors of dimensions [sequence_length, rnn_size] for the forward and backward passes
      output_states is a 2-tupe of the final hidden states of the forward and backward passes
    
    """
    forward_cell = multi_dropout_cell(rnn_size, keep_prob, num_layers)
    backward_cell = multi_dropout_cell(rnn_size, keep_prob, num_layers)
    outputs, states = tf.nn.bidirectional_dynamic_rnn(cell_fw = forward_cell,
                                                   cell_bw = backward_cell,
                                                   sequence_length = sequence_lengths,
                                                   inputs = rnn_inputs, 
                                                   dtype=tf.float32)
    return outputs, states

## Decoding

In [8]:
def decoding_layer(enc_state, enc_outputs, dec_embed_input, dec_embeddings, #Inputs
                        rnn_size, num_layers, output_layer, #Architecture
                        keep_prob, beam_width, #Hypeparameters
                        target_lengths, batch_size,
                        vocab_to_int): 
    
    with tf.variable_scope("decoding", reuse=tf.AUTO_REUSE) as decoding_scope:
        dec_cell = multi_dropout_cell(rnn_size, keep_prob, num_layers)
        init_dec_state_size = batch_size
        
        #TRAINING
        train_attn = tf.contrib.seq2seq.BahdanauAttention(num_units=dec_cell.output_size, memory=enc_outputs)
        train_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell, train_attn,
                                                    attention_layer_size=dec_cell.output_size)
        
        
        helper = tf.contrib.seq2seq.TrainingHelper(dec_embed_input, target_lengths, time_major=False)
        train_decoder = tf.contrib.seq2seq.BasicDecoder(train_cell, helper,
                            train_cell.zero_state(init_dec_state_size, tf.float32).clone(cell_state=enc_state),
                            output_layer = output_layer)
        outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(train_decoder, scope=decoding_scope)
        logits = outputs.rnn_output

        #INFERENCE
        #Tile inputs
        enc_state = tf.contrib.seq2seq.tile_batch(enc_state, beam_width)
        enc_outputs = tf.contrib.seq2seq.tile_batch(enc_outputs, beam_width)
        init_dec_state_size *= beam_width
        
        infer_attn = tf.contrib.seq2seq.BahdanauAttention(num_units=dec_cell.output_size, memory=enc_outputs)
        infer_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell, infer_attn,
                                                    attention_layer_size=dec_cell.output_size)
        
        
        decoder = tf.contrib.seq2seq.BeamSearchDecoder(cell = infer_cell,
            embedding = dec_embeddings,
            start_tokens = tf.tile([vocab_to_int["<GO>"]], [batch_size]), #Not by batch_size*beam_width, strangely
            end_token = vocab_to_int["<EOS>"],
            beam_width = beam_width,
            initial_state = infer_cell.zero_state(init_dec_state_size, tf.float32).clone(cell_state=enc_state),
            output_layer = output_layer
        )
            
        final_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, scope=decoding_scope)
        ids = final_decoder_output.predicted_ids
        beams = ids
                
    return logits, beams

In [9]:
def seq2seq_model(input_data, target_data, keep_prob, batch_size,
                  source_lengths, target_sequence_lengths,
                  answers_vocab_size, questions_vocab_size, enc_embedding_size, dec_embedding_size,
                  rnn_size, num_layers, beam_width, 
                  questions_vocab_to_int):
    
    
    augmented_questions_vocab_size = questions_vocab_size + 1 #Add an extra row for metatokens
    
    enc_embed_input = tf.contrib.layers.embed_sequence(input_data, 
                                                       augmented_questions_vocab_size, #questions_vocab_size, 
                                                       enc_embedding_size,
                                                       initializer = tf.random_uniform_initializer(0,1))
    
    enc_outputs, enc_states = encoding_layer(enc_embed_input, rnn_size, num_layers, keep_prob, source_lengths)    
    concatenated_enc_output = tf.concat(enc_outputs, -1)
    init_dec_state = enc_states[0]    
    
    
    augmented_answers_vocab_size = answers_vocab_size + 1
    dec_input = process_decoding_input(target_data, questions_vocab_to_int, batch_size)
    dec_embeddings = tf.Variable(tf.random_uniform([augmented_answers_vocab_size, dec_embedding_size], 0, 1))    
    dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, dec_input)
    
    output_layer = tf.layers.Dense(answers_vocab_size,bias_initializer=tf.zeros_initializer(),activation=tf.nn.relu)
    logits, beams = decoding_layer(init_dec_state,
                            concatenated_enc_output,
                            dec_embed_input,
                            dec_embeddings,
                            rnn_size, 
                            num_layers,
                            output_layer,
                            keep_prob,
                            beam_width,
                            target_sequence_lengths, 
                            batch_size,
                            answers_vocab_to_int,
                            )
    
    
    return logits, beams

In [10]:
# Set the Hyperparameters

#Network Architecture
rnn_size = 128
num_layers = 2
encoding_embedding_size = 1024
decoding_embedding_size = 1024

#Training
epochs = 100
batch_size = 128
learning_rate = 0.001
learning_rate_decay = 0.3
min_learning_rate = 0.00001
keep_probability = 0.75
vocab_size = len(answers_vocab_to_int)
#Decoding
beam_width = 10

In [11]:
def model_inputs(batch_size):
    '''Create palceholders for inputs to the model'''
    input_data = tf.placeholder(tf.int32, [batch_size, None], name='input')
    targets = tf.placeholder(tf.int32, [batch_size, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    return input_data, targets, lr, keep_prob

In [12]:
# Reset the graph to ensure that it is ready for training
tf.reset_default_graph()

device = "/cpu:0"

with tf.device(device):
    # Placeholders for feed_dict    
    input_data, targets, lr, keep_prob = model_inputs(batch_size)

    source_lengths = tf.placeholder(tf.int32, [batch_size], name="source_lengths")

    max_sequence_length_batch = tf.placeholder(tf.int32)
    input_shape = tf.shape(input_data)
    target_sequence_lengths = tf.fill([input_shape[0]], max_sequence_length_batch)

    def true_lengths(samples, pad_token):
        lengths = []
        for sample in sample:
            length = 1 #We need to include at least one <EOS>
            i = 0
            while sample[i] != pad_token:
                length += 1
                i += 1
            lengths.append(length)
        return lengths

    # Create the training and inference logits
    #FIXME: Change "batch_size" to input_shape[0]?
    train_logits, beams = \
    seq2seq_model(input_data,
                  #tf.reverse(input_data, [-1]),
                  targets, keep_prob, batch_size,
                  source_lengths,
        target_sequence_lengths, 
        len(answers_vocab_to_int), len(questions_vocab_to_int),
        encoding_embedding_size, decoding_embedding_size, rnn_size, num_layers, beam_width, questions_vocab_to_int)

    print(targets.shape)
    print(train_logits.shape)
    # Find the shape of the input data for sequence_loss
    with tf.name_scope("optimization"):

        cost = tf.losses.huber_loss(
                       train_logits,
                       tf.one_hot(targets,vocab_size,axis=-1),
                       delta=1.0,
                       scope=None,
                       loss_collection=tf.GraphKeys.LOSSES,
                       reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
        '''
        cost = tf.contrib.seq2seq.sequence_loss(
        train_logits,
        targets,
        tf.ones([batch_size, max_sequence_length_batch]) #FIXME: Weight the <PAD> tokens as 0
        )
        '''

       # cost=tf.reduce_mean(tf.nn.l2_loss(train_logits - tf.one_hot(targets,vocab_size,axis=-1)))
        #cost = tf.reduce_mean(tf.square(tf.subtract(train_logits,targets)))
        #cost = losses * tf.ones([batch_size, max_sequence_length_batch])
        optimizer = tf.train.AdamOptimizer(learning_rate)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)


Instructions for updating:
Use the retry module or similar alternatives.
(128, ?)
(128, ?, 8104)


In [13]:
def pad_sentence_batch(sentence_batch, vocab_to_int):
    """Pad sentences with <PAD> so that each sentence of a batch has the same length"""
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    
    #Increment every index so we can use index 0 for metatokens like padding
    return [ [index+1 for index in sentence] + [0] * (max_sentence - len(sentence)) for sentence in sentence_batch]
    
    #return [sentence + [vocab_to_int['<PAD>']] * (max_sentence - len(sentence)) for sentence in sentence_batch]

In [14]:
def batch_data(questions, answers, batch_size):
    """Batch questions and answers together"""
    for batch_i in range(0, len(questions)//batch_size):
        start_i = batch_i * batch_size
        questions_batch = questions[start_i:start_i + batch_size]
        answers_batch = answers[start_i:start_i + batch_size]
        
        source_lengths = np.array( [len(sentence) for sentence in questions_batch] )
        
        
        pad_questions_batch = np.array(pad_sentence_batch(questions_batch, questions_vocab_to_int))
        pad_answers_batch = np.array(pad_sentence_batch(answers_batch, answers_vocab_to_int))
        yield source_lengths, pad_questions_batch, pad_answers_batch

In [15]:
# Validate the training with 10% of the data
train_valid_split = int(len(sorted_questions)*0.15)

# Split the questions and answers into training and validating data
train_questions = sorted_questions[train_valid_split:]
train_answers = sorted_answers[train_valid_split:]

valid_questions = sorted_questions[:train_valid_split]
valid_answers = sorted_answers[:train_valid_split]

print(len(train_questions))
print(len(valid_questions))

188202
33212


In [16]:
#TRAINING
display_step = 1 # Check training loss after every 100 batches
total_train_loss = 0 # Record the training loss for each display step

#VALIDATION
stop_early = 0 
stop = 5 # If the validation loss does decrease in 5 consecutive checks, stop training
validation_check = 1 #((len(train_questions))//batch_size//2)-1 #Check validation loss every half-epoch
summary_valid_loss = [] # Record the validation loss for saving improvements in the model


checkpoint = "./checkpoints/best_model.ckpt" 

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(1, epochs+1):
        for batch_i, (q_lengths, questions_batch, answers_batch) in enumerate(
                batch_data(train_questions, train_answers, batch_size)):
            print(answers_batch)
            start_time = time.time()
            _, loss = sess.run(
                [train_op, cost],
                {input_data: questions_batch,
                 targets: answers_batch,
                 source_lengths: q_lengths, #NEW
                 lr: learning_rate,
                 max_sequence_length_batch: answers_batch.shape[1],
                 keep_prob: keep_probability})

            total_train_loss += loss
            end_time = time.time()
            batch_time = end_time - start_time
            
            total_train_loss += loss
            end_time = time.time()
            batch_time = end_time - start_time

            if batch_i % display_step == 0:
                print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>9.6f}, Seconds: {:>4.2f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              len(train_questions) // batch_size, 
                              total_train_loss / display_step, 
                              batch_time*display_step))
                total_train_loss = 0

            if batch_i % validation_check == 0 and batch_i > 0:
                total_valid_loss = 0
                start_time = time.time()
                for batch_ii, (q_lengths, questions_batch, answers_batch) in \
                        enumerate(batch_data(valid_questions, valid_answers, batch_size)):
                    valid_loss = sess.run(
                    cost, {input_data: questions_batch,
                           targets: answers_batch,
                           lr: learning_rate,
                           source_lengths: q_lengths, #NEW
                           max_sequence_length_batch: answers_batch.shape[1],
                           keep_prob: 1})
                    total_valid_loss += valid_loss
                end_time = time.time()
                batch_time = end_time - start_time
                avg_valid_loss = total_valid_loss / (len(valid_questions) / batch_size)
                print('Valid Loss: {:>9.6f}, Seconds: {:>5.2f}'.format(avg_valid_loss, batch_time))

                # Reduce learning rate, but not below its minimum value
                learning_rate *= learning_rate_decay
                if learning_rate < min_learning_rate:
                    learning_rate = min_learning_rate

                summary_valid_loss.append(avg_valid_loss)
                if avg_valid_loss <= min(summary_valid_loss):
                    print('New Record!') 
                    stop_early = 0
                    saver = tf.train.Saver() 
                    saver.save(sess, checkpoint)

                else:
                    print("No Improvement.")
                    stop_early += 1
                    if stop_early == stop:
                        break
    
        if stop_early == stop:
            print("Stopping Training.")
            break


[[  14   13    1 ...    0    0    0]
 [   2   95    3 ...    0    0    0]
 [   1   34    7 ...    0    0    0]
 ...
 [8103   11    2 ...    0    0    0]
 [   2   20  830 ...    0    0    0]
 [  85  180    9 ...    0    0    0]]
Epoch   1/100 Batch    0/1470 - Loss:  0.000365, Seconds: 3.56
[[  14   70 8104 ...    0    0    0]
 [   2   16  872 ...    0    0    0]
 [1705  282   22 ...    0    0    0]
 ...
 [  23    1    9 ...    0    0    0]
 [  22  264 8104 ...    0    0    0]
 [  28 8104    0 ...    0    0    0]]
Epoch   1/100 Batch    1/1470 - Loss:  0.000383, Seconds: 3.83


KeyboardInterrupt: 

In [18]:
def question_to_seq(question, vocab_to_int, int_to_vocab):
    '''Prepare the question for the model'''
        
    cleaned_question = [Corpus.clean_sequence(int_to_vocab[word]) for word in question]
    return [vocab_to_int.get(word, vocab_to_int[unk]) for word in cleaned_question]

In [None]:
# Create your own input question
#input_question = 'How are you?'

# Use a question from the data as your input
random = np.random.choice(len(sorted_questions))
input_question = sorted_questions[random]

# Prepare the question
input_question = question_to_seq(input_question, questions_vocab_to_int, questions_int_to_vocab)

# Pad the questions until it equals the max_line_length
input_question = input_question + [questions_vocab_to_int["<PAD>"]] * (max_line_length - len(input_question))
# Add empty questions so the the input_data is the correct shape
batch_shell = np.zeros((batch_size, max_line_length))
# Set the first question to be out input question
batch_shell[0] = input_question 

# Remove the padding from the Question and Answer
pad_q = questions_vocab_to_int["<PAD>"]
pad_a = answers_vocab_to_int["<EOS>"]    
print('Question')
print('  Word Ids:      {}'.format([i for i in input_question if i != pad_q]))
print('  Input Words: {}'.format([questions_int_to_vocab[i] for i in input_question if i != pad_q]))

saver = tf.train.Saver()
with tf.Session() as sess:
    # Run the model with the input question
    saver.restore(sess, checkpoint)
    beam_output = sess.run(beams, {input_data: batch_shell, 
                                                keep_prob: 1.0})[0]


for i in range(beam_width):
    beam = beam_output[:, i]
    print('\nAnswer', i)
    print('  Word Ids:      {}'.format([i for i in beam if i != pad_a]))
    print('  Response Words: {}'.format([answers_int_to_vocab[i] for i in beam if i != pad_a]))

Question
  Word Ids:      [0, 90, 235, 0, 90, 212]
  Input Words: ['you', 'look', 'uh', 'you', 'look', 'fine']
INFO:tensorflow:Restoring parameters from ./checkpoints/best_model.ckpt
