# LSTM Bot

## Project Overview

A chatbot that can converse with you at the command line. The chatbot uses a Sequence-to-Sequence text generation architecture with an LSTM/GRU as its memory unit.

---
Code based on Matthew Inkawhich's <https://github.com/MatthewInkawhich>

In [2]:
"""
Chatbot Tutorial
================
Code based on Matthew Inkawhich's <https://github.com/MatthewInkawhich>
"""

import torch
import torch.nn as nn
from torch import optim
import random
import os
from chatbot.prepare_data import load_prepare_data, train_dataset, trim_unfrequent_words, batch_to_train_data, MAX_SENTENCE_LENGTH, get_indexes_from_sentence, normalize_string
from chatbot.models import Seq2Seq
from chatbot.globals import SOS_TOKEN, device

# Load/assemble the vocabulary and the sentence pairs.
# `save_dir` is the root directory under which checkpoints are written and read.
save_dir = os.path.join("checkpoints")
vocab, pairs = load_prepare_data(train_dataset)
# Print the first ten pairs as a quick sanity check of the loaded data
print("\npairs:")
for pair in pairs[:10]:
    print(pair)

MIN_COUNT = 3    # Minimum word count threshold for trimming

# Trim the vocabulary and drop pairs accordingly; `pairs` is rebound to the
# result returned by trim_unfrequent_words.
pairs = trim_unfrequent_words(vocab, pairs, MIN_COUNT)

# Build one small batch from randomly drawn pairs (sampling with replacement)
# and print its components, purely to eyeball the batching output.
small_batch_size = 5
batches = batch_to_train_data(vocab, [random.choice(pairs) for _ in range(small_batch_size)])
# batch_to_train_data yields a 5-tuple; the same unpacking is used in train_loop
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)

######################################################################
# Training

def train_step(input_variable, lengths, target_variable, mask, max_target_len, seq2seq, encoder_optimizer,
               decoder_optimizer, batch_size, clip):
    """Run a single optimisation step on one batch and return its average loss.

    The batch fields (``input_variable``, ``lengths``, ``target_variable``,
    ``mask``, ``max_target_len``) are forwarded unchanged to
    ``seq2seq.forward_batch`` together with ``batch_size``.

    Args:
        seq2seq: model exposing ``forward_batch``, ``encoder`` and ``decoder``.
        encoder_optimizer: optimizer stepped for the encoder.
        decoder_optimizer: optimizer stepped for the decoder.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        ``sum(print_losses) / n_totals`` computed from the values returned by
        ``seq2seq.forward_batch``.

    Note:
        ``teacher_forcing_ratio`` is not a parameter; it is read from module
        scope at call time.
    """
    optimizers = (encoder_optimizer, decoder_optimizer)

    # Start from clean gradients on both optimizers
    for optimizer in optimizers:
        optimizer.zero_grad()

    batch_loss, token_count, step_losses = seq2seq.forward_batch(
        input_variable, target_variable, max_target_len, lengths, mask,
        teacher_forcing_ratio, batch_size)

    # Backpropagate through the whole batch loss
    batch_loss.backward()

    # Clip gradient norms in place, encoder first and then decoder
    for sub_model in (seq2seq.encoder, seq2seq.decoder):
        nn.utils.clip_grad_norm_(sub_model.parameters(), clip)

    # Apply the weight updates
    for optimizer in optimizers:
        optimizer.step()

    return sum(step_losses) / token_count

def train_loop(vocab, pairs, seq2seq, encoder_optimizer, decoder_optimizer, embedding, save_dir, epochs,
               batch_size, print_every, save_every, clip, load_filename):
    """Train ``seq2seq`` for up to ``epochs`` iterations, logging and checkpointing.

    Each iteration trains on one batch of ``batch_size`` pairs drawn at random
    (with replacement) from ``pairs``; despite the name, an "epoch" here is a
    single batch update, not a full pass over the data.

    Args:
        vocab: vocabulary object; its ``__dict__`` is stored in checkpoints.
        pairs: sequence of training pairs to sample from.
        seq2seq: model exposing ``encoder``, ``decoder`` and ``forward_batch``.
        encoder_optimizer: optimizer for the encoder.
        decoder_optimizer: optimizer for the decoder.
        embedding: embedding module whose state dict is checkpointed.
        save_dir: root directory for checkpoints.
        epochs: index of the last iteration to run (inclusive).
        batch_size: number of pairs per batch.
        print_every: log the running average loss every this many iterations.
        save_every: write a checkpoint every this many iterations.
        clip: gradient-norm clipping threshold forwarded to ``train_step``.
        load_filename: truthy when resuming; the start iteration is then taken
            from the module-level ``checkpoint`` dict.

    Note:
        Reads the module-level names ``checkpoint`` (only when resuming) and
        ``hidden_size`` (checkpoint sub-directory name).
    """
    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    steps_since_print = 0
    if load_filename:
        # Resume right after the iteration stored in the loaded checkpoint
        start_iteration = checkpoint['iteration'] + 1

    # Build batches only for the iterations that will actually run, so that a
    # resumed run does not assemble batches it then skips.
    remaining = max(0, epochs - start_iteration + 1)
    training_batches = [batch_to_train_data(vocab, [random.choice(pairs) for _ in range(batch_size)])
                        for _ in range(remaining)]

    # Training loop
    print("Training...")
    for epoch, training_batch in enumerate(training_batches, start=start_iteration):
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train_step(input_variable, lengths, target_variable, mask, max_target_len, seq2seq, encoder_optimizer,
                          decoder_optimizer, batch_size, clip)
        print_loss += loss
        steps_since_print += 1

        # Print progress; average over the steps actually accumulated, which
        # can be fewer than print_every in the first window after a resume.
        if epoch % print_every == 0:
            print_loss_avg = print_loss / steps_since_print
            print("Epoch: {}; Completion: {:.1f}%; Avg loss: {:.4f}".format(epoch, epoch / epochs * 100, print_loss_avg))
            print_loss = 0
            steps_since_print = 0

        # Save checkpoint
        if epoch % save_every == 0:
            directory = os.path.join(save_dir, '{}'.format(hidden_size))
            # exist_ok avoids the check-then-create race of a separate exists() test
            os.makedirs(directory, exist_ok=True)
            torch.save({
                'iteration': epoch,
                'vocab_dict': vocab.__dict__,
                # Save the sub-models of the model being trained rather than
                # whatever the module-level encoder/decoder names point at.
                'encoder': seq2seq.encoder.state_dict(),
                'encoder_optimizer': encoder_optimizer.state_dict(),
                'decoder': seq2seq.decoder.state_dict(),
                'decoder_optimizer': decoder_optimizer.state_dict(),
                'loss': loss,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(epoch, 'checkpoint')))


######################################################################
# Define Evaluation
# -----------------
#
# After training a model, we want to be able to talk to the bot ourselves.
# First, we must define how we want the model to decode the encoded input.
#
# Greedy decoding
# ~~~~~~~~~~~~~~~
#
# Greedy decoding is the decoding method that we use during training when
# we are **NOT** using teacher forcing. In other words, for each time
# step, we simply choose the word from ``decoder_output`` with the highest
# softmax value. This decoding method is optimal on a single time-step
# level.
#
# To facilitate the greedy decoding operation, we define a
# ``SearchDecoder`` class. When run, an object of this class takes
# an input sequence (``input_seq``) of shape *(input_seq length, 1)*, a
# scalar input length (``input_length``) tensor, and a ``max_length`` to
# bound the response sentence length. The input sentence is evaluated
# using the following computational graph:
#
# **Computation Graph:**
#
#    1) Forward input through encoder model.
#    2) Prepare encoder's final hidden layer to be first hidden input to the decoder.
#    3) Initialize decoder's first input as SOS_token.
#    4) Initialize tensors to append decoded words to.
#    5) Iteratively decode one word token at a time:
#        a) Forward pass through decoder.
#        b) Obtain most likely word token and its softmax score.
#        c) Record token and score.
#        d) Prepare current token to be next decoder input.
#    6) Return collections of word tokens and scores.
#

class SearchDecoder(nn.Module):
    """Greedy decoder: at every step, feed back the highest-scoring token."""

    def __init__(self, seq2seq):
        """Wrap a seq2seq model that exposes ``encoder`` and ``decoder``."""
        super(SearchDecoder, self).__init__()
        self.seq2seq = seq2seq

    def forward(self, input_seq, input_length, max_length):
        """Decode ``max_length`` tokens greedily for a single input sequence.

        Args:
            input_seq: index tensor passed to the encoder; the surrounding
                notes describe it as shape (input_seq length, 1).
            input_length: lengths tensor passed to the encoder alongside it.
            max_length: number of decoding steps to run (no early stop on EOS).

        Returns:
            ``(tokens, scores)``: 1-D tensors holding, per step, the selected
            token index and the maximum value of the decoder output.
        """
        decoder = self.seq2seq.decoder
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.seq2seq.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder.
        # Use the wrapped model's decoder, not a module-level name, so the
        # searcher works for whichever model it was constructed with.
        decoder_hidden = encoder_hidden[:decoder.num_layers]
        # Initialize decoder input with SOS_TOKEN
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_TOKEN
        # Initialize tensors to append decoded words to
        tokens = torch.zeros([0], device=device, dtype=torch.long)
        scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            tokens = torch.cat((tokens, decoder_input), dim=0)
            scores = torch.cat((scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return tokens, scores


######################################################################
# Evaluate my text
# ~~~~~~~~~~~~~~~~
#
# Now that we have our decoding method defined, we can write functions for
# evaluating a string input sentence. The ``evaluate`` function manages
# the low-level process of handling the input sentence. We first format
# the sentence as an input batch of word indexes with *batch_size==1*. We
# do this by converting the words of the sentence to their corresponding
# indexes, and transposing the dimensions to prepare the tensor for our
# models. We also create a ``lengths`` tensor which contains the length of
# our input sentence. In this case, ``lengths`` is scalar because we are
# only evaluating one sentence at a time (batch_size==1). Next, we obtain
# the decoded response sentence tensor using our ``SearchDecoder``
# object (``searcher``). Finally, we convert the response’s indexes to
# words and return the list of decoded words.
#
# ``eval_input`` acts as the user interface for our chatbot. When
# called, an input text field will spawn in which we can enter our query
# sentence. After typing our input sentence and pressing *Enter*, our text
# is normalized in the same way as our training data, and is ultimately
# fed to the ``evaluate`` function to obtain a decoded output sentence. We
# loop this process, so we can keep chatting with our bot until we enter
# either “q” or “quit”.
#
# Finally, if a sentence is entered that contains a word that is not in
# the vocabulary, we handle this gracefully by printing an error message
# and prompting the user to enter another sentence.
#

def evaluate(searcher, vocab, sentence, max_sentence_length=MAX_SENTENCE_LENGTH):
    """Decode a reply for one already-normalized sentence.

    Args:
        searcher: callable invoked as ``searcher(input_batch, lengths,
            max_length)`` and returning ``(tokens, scores)``.
        vocab: vocabulary used to index the sentence and to map decoded
            indexes back to words via ``vocab.index2word``.
        sentence: input sentence string.
        max_sentence_length: number of decoding steps requested from the
            searcher.

    Returns:
        List of decoded words, one per returned token.
    """
    ### Format input sentence as a batch
    # words -> indexes (a batch containing a single sentence)
    indexes_batch = [get_indexes_from_sentence(vocab, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to(device)
    # Decode sentence with searcher. This is pure inference, so skip autograd
    # bookkeeping instead of building a graph that is never backpropagated.
    with torch.no_grad():
        tokens, scores = searcher(input_batch, lengths, max_sentence_length)
    # indexes -> words
    decoded_words = [vocab.index2word[token.item()] for token in tokens]
    return decoded_words


def eval_input(searcher, vocab):
    """Interactive chat loop: read a line, decode a reply, print it.

    Runs until the user types ``q`` or ``quit``. Each line is passed through
    ``normalize_string`` and then ``evaluate``; ``EOS`` and ``PAD`` entries are
    dropped from the reply before printing. A ``KeyError`` raised anywhere in
    that processing is reported as an unknown word and the loop continues.
    """
    quit_commands = ('q', 'quit')
    while True:
        try:
            raw_line = input('You> ')
            if raw_line in quit_commands:
                break
            # Same normalization as applied to the training data
            cleaned_line = normalize_string(raw_line)
            reply_words = evaluate(searcher, vocab, cleaned_line)
            # Strip the special tokens before showing the answer
            visible_words = [word for word in reply_words if word != 'EOS' and word != 'PAD']
            print('Chatbot>', ' '.join(visible_words))
        except KeyError:
            print("ERROR: Unknown word.")


######################################################################
# Run Model
# ---------
#
# Finally, it is time to run our model!
#
# Regardless of whether we want to train or test the chatbot model, we
# must initialize the individual encoder and decoder models. In the
# following block, we set our desired configurations, choose to start from
# scratch or set a checkpoint to load from, and build and initialize the
# models. Feel free to play with different model configurations to
# optimize performance.
#

# Configure models
hidden_size = 500
batch_size = 64

# Set checkpoint to load from; set to None if starting from scratch.
# The path mirrors the layout written by train_loop:
#   <save_dir>/<hidden_size>/<iteration>_checkpoint.tar
checkpoint_iter = 4000
load_filename = os.path.join(save_dir, '{}'.format(hidden_size), '{}_checkpoint.tar'.format(checkpoint_iter))
# load_filename = None      # Uncomment to train from scratch; leave commented to load the saved checkpoint

# Load model if a load_filename is provided
if load_filename:
    print('Loading from ', load_filename)
    # Remap stored tensors onto the configured device so that a checkpoint
    # written on a GPU machine can also be restored on a CPU-only one.
    # NOTE(review): assumes `device` is a torch.device or device string — confirm in chatbot.globals.
    checkpoint = torch.load(load_filename, map_location=device)
    encoder_sd = checkpoint['encoder']
    decoder_sd = checkpoint['decoder']
    encoder_optimizer_sd = checkpoint['encoder_optimizer']
    decoder_optimizer_sd = checkpoint['decoder_optimizer']
    embedding_sd = checkpoint['embedding']
    # Restore the vocabulary exactly as it was when the checkpoint was saved
    vocab.__dict__ = checkpoint['vocab_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(vocab.num_words, hidden_size)
if load_filename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
seq2seq = Seq2Seq(hidden_size, hidden_size, vocab.num_words, embedding)
encoder = seq2seq.encoder
decoder = seq2seq.decoder
if load_filename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Ready!')


######################################################################
# Run Training
# ~~~~~~~~~~~~
#
# Run the following block if you want to train the model.
#
# First we set training parameters, then we initialize our optimizers, and
# finally we call the ``train_loop`` function to run our training
# iterations.
#

# Training hyper-parameters
clip = 50.0                      # gradient-norm clipping threshold
teacher_forcing_ratio = 0.5      # read from module scope by train_step
learning_rate = 0.0001           # encoder learning rate
decoder_learning_ratio = 5.0     # decoder learning rate = learning_rate * this
epochs = 4000                    # last training iteration (inclusive)

# Logging / checkpoint cadence, in iterations
print_every = 1
save_every = 500

# Put both sub-models in training mode
encoder.train()
decoder.train()

# One Adam optimizer per sub-model; the decoder uses a scaled learning rate
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if load_filename:
    # Resume the optimizer state stored in the checkpoint
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Kick off the training iterations
print("Starting Training!")
train_loop(
    vocab=vocab,
    pairs=pairs,
    seq2seq=seq2seq,
    encoder_optimizer=encoder_optimizer,
    decoder_optimizer=decoder_optimizer,
    embedding=embedding,
    save_dir=save_dir,
    epochs=epochs,
    batch_size=batch_size,
    print_every=print_every,
    save_every=save_every,
    clip=clip,
    load_filename=load_filename,
)


######################################################################
# Run Evaluation
# ~~~~~~~~~~~~~~
#
# To chat with your model, run the following block.
#

# Switch both sub-models to eval mode before chatting
encoder.eval()
decoder.eval()

# Initialize search module around the trained seq2seq model
searcher = SearchDecoder(seq2seq)

# Begin chatting: starts the interactive prompt, which runs until 'q' or 'quit' is entered
eval_input(searcher, vocab)


######################################################################
# Conclusion
# ----------
#
# That’s all for this one, folks. Congratulations, you now know the
# fundamentals to building a generative chatbot model! If you’re
# interested, you can try tailoring the chatbot’s behavior by tweaking the
# model and training parameters and customizing the data that you train
# the model on.
#
# Check out the other tutorials for more cool deep learning applications
# in PyTorch!
#

Start preparing training data ...
Read 11873 sentence pairs
Trimmed to 872 sentence pairs
Counting words...
Counted words: 3389

pairs:
['were the normans in normandy', 'th and th centuries']
['from countries did the norse originate', 'denmark iceland and norway']
['was the duke in the battle of hastings', 'william the conqueror']
['was dyrrachium located', 'the adriatic']
['did emma marry', 'king ethelred ii']
['was emma s brother', 'duke richard ii']
['kicked ethelred out', 'sweyn forkbeard']
['did harold ii die', 'battle of hastings']
['killed harold ii', 'william ii']
['was margaret s husband', 'king malcolm iii of scotland']
keep_words 642 / 3386 = 0.1896
Trimmed from 872 pairs to 25, 0.0287 of total
input_variable: tensor([[229, 252,  30, 118,  72],
        [ 15,  46,   4,  10,  15],
        [183,   4, 136, 513, 422],
        [106, 251, 243, 314,  30],
        [ 12, 252,  15,   5, 423],
        [ 22,   5, 244,   2,   2],
        [ 62,   4, 168,   0,   0],
        [  4, 161,   2, 

  loss = crossEntropy.masked_select(mask).mean()
  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch: 1; Completion: 2.5%; Avg loss: 6.4737
Epoch: 2; Completion: 5.0%; Avg loss: 6.3452
Epoch: 3; Completion: 7.5%; Avg loss: 6.2201
Epoch: 4; Completion: 10.0%; Avg loss: 5.9907
Epoch: 5; Completion: 12.5%; Avg loss: 5.7518
Epoch: 6; Completion: 15.0%; Avg loss: 5.2150
Epoch: 7; Completion: 17.5%; Avg loss: 5.0415
Epoch: 8; Completion: 20.0%; Avg loss: 5.0504
Epoch: 9; Completion: 22.5%; Avg loss: 4.7782
Epoch: 10; Completion: 25.0%; Avg loss: 4.6882
Epoch: 11; Completion: 27.5%; Avg loss: 4.1717
Epoch: 12; Completion: 30.0%; Avg loss: 4.0441
Epoch: 13; Completion: 32.5%; Avg loss: 3.7499
Epoch: 14; Completion: 35.0%; Avg loss: 3.7443
Epoch: 15; Completion: 37.5%; Avg loss: 3.6436
Epoch: 16; Completion: 40.0%; Avg loss: 3.5096
Epoch: 17; Completion: 42.5%; Avg loss: 3.4072
Epoch: 18; Completion: 45.0%; Avg loss: 3.4090
Epoch: 19; Completion: 47.5%; Avg loss: 3.1824
Epoch: 20; Completion: 50.0%; Avg loss: 3.2454
Epoch: 21; Completion: 52.5%; Avg loss: 3.3352
Epoch: 22; Completion: 55