# ChatBot

## Project Overview

A chatbot that can converse with you at the command line. The chatbot uses a sequence-to-sequence text generation architecture with an LSTM/GRU as its memory unit.

The dataset chosen is SQuAD v2 from `torchtext.datasets`. Check `prepare_data.py` for details.

In [1]:
"""
Code based on Matthew Inkawhich's <https://github.com/MatthewInkawhich>
"""

import torch
import torch.nn as nn
from torch import optim
import random
import os
from chatbot.prepare_data import load_prepare_data, trim_unfrequent_words, batch_to_train_data, MAX_SENTENCE_LENGTH, get_indexes_from_sentence, normalize_string
from chatbot.models import Seq2Seq, SearchDecoder
from chatbot.globals import SOS_TOKEN, device
from torchtext.datasets import SQuAD2

## Data wrangling

In [2]:
# NOTE(review): the variable is called `train_dataset` but the 'dev' split is what gets
# loaded here — confirm this is intentional (e.g. to keep the demo small).
train_dataset = SQuAD2(split='dev')

# Load/Assemble voc and pairs
# `save_dir` is the root folder under which checkpoints are written and looked up later.
save_dir = os.path.join("checkpoints")
vocab, pairs = load_prepare_data(train_dataset)

# Print first 10 pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)

MIN_COUNT = 3    # Min word count for trimming

# Trim voc and pairs
# `pairs` is rebound to the trimmed result, so everything below only sees the kept pairs.
pairs = trim_unfrequent_words(vocab, pairs, MIN_COUNT)

# Check sample data with a small batch size
# The sample is drawn with `random.choice`, i.e. with replacement, so a pair may repeat.
small_batch_size = 5
batches = batch_to_train_data(vocab, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

# Dump every component of the sample batch for a visual sanity check
print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)

Start preparing training data ...
Read 11873 sentence pairs
Trimmed to 872 sentence pairs
Counting words...
Counted words: 3389

pairs:
['were the normans in normandy', 'th and th centuries']
['from countries did the norse originate', 'denmark iceland and norway']
['was the duke in the battle of hastings', 'william the conqueror']
['was dyrrachium located', 'the adriatic']
['did emma marry', 'king ethelred ii']
['was emma s brother', 'duke richard ii']
['kicked ethelred out', 'sweyn forkbeard']
['did harold ii die', 'battle of hastings']
['killed harold ii', 'william ii']
['was margaret s husband', 'king malcolm iii of scotland']
keep_words 642 / 3386 = 0.1896
Trimmed from 872 pairs to 25, 0.0287 of total
input_variable: tensor([[ 46,  46, 186, 118,  72],
        [  4,   4,  12,  10,  15],
        [589, 101, 388, 513, 422],
        [ 15, 399,  85, 314,  30],
        [206,  97,   4,   5, 423],
        [516,   5, 387,   2,   2],
        [ 94, 632,   2,   0,   0],
        [  2,   2,   0, 

## Build Model

We define 3 main classes:
- `Encoder`
- `Decoder`
- `Seq2Seq`
Additionally, there are the `Attention` and `SearchDecoder` classes and a loss function, `mask_NLLLoss`.

Check `chatbot/models.py`

To either train or evaluate the model, we must first initialize the encoder and decoder (through `Seq2Seq`).


In [3]:
# Hyperparams
hidden_size = 500
batch_size = 64

# Checkpoint to resume from. The path follows the layout written by `train_loop`:
# <save_dir>/<hidden_size>/<iteration>_checkpoint.tar
checkpoint_iter = 4000
checkpoint_filename = os.path.join(save_dir, '{}'.format(hidden_size), '{}_checkpoint.tar'.format(checkpoint_iter))
checkpoint_filename = None      # Keep this line to train from scratch; comment it out to load the saved checkpoint

# Load model if a checkpoint filename is provided
if checkpoint_filename:
    print('Loading checkpoint from ', checkpoint_filename)
    # `map_location=device` remaps the stored tensors onto the device in use, so a
    # checkpoint trained on GPU can also be loaded on a CPU-only machine.
    # NOTE: `torch.load` unpickles the file — only load checkpoints you trust.
    checkpoint = torch.load(checkpoint_filename, map_location=device)
    encoder_sd = checkpoint['encoder']
    decoder_sd = checkpoint['decoder']
    encoder_optimizer_sd = checkpoint['encoder_optimizer']
    decoder_optimizer_sd = checkpoint['decoder_optimizer']
    embedding_sd = checkpoint['embedding']
    # Restore the vocabulary saved with the checkpoint before `vocab.num_words` is
    # used below to size the embedding.
    vocab.__dict__ = checkpoint['vocab_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(vocab.num_words, hidden_size)
if checkpoint_filename:
    embedding.load_state_dict(embedding_sd)

# Initialize Seq2seq (and encoder & decoder)
seq2seq = Seq2Seq(hidden_size, hidden_size, vocab.num_words, embedding)
encoder = seq2seq.encoder
decoder = seq2seq.decoder

if checkpoint_filename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Ready!')


Building encoder and decoder ...
Ready!


## Training

We have `train_step`, which runs a single training iteration on one batch, and `train_loop`, which runs many such iterations (called `epochs` in the code). The latter also saves a checkpoint every `save_every` iterations in the `checkpoints` folder.

In [4]:
def train_step(input_variable, lengths, target_variable, mask, max_target_len, seq2seq, encoder_optimizer,
               decoder_optimizer, batch_size, clip):
    """Run one optimisation step on a single batch and return its average loss.

    The batch components (`input_variable`, `lengths`, `target_variable`, `mask`,
    `max_target_len`) are forwarded unchanged to `seq2seq.forward_batch`, together
    with `batch_size` and the module-level `teacher_forcing_ratio`.

    Gradients of the encoder and of the decoder are each clipped to a maximum norm
    of `clip` before the two optimizers take their step.

    Returns:
        `sum(print_losses) / n_totals`, where both values come from `forward_batch`.
    """
    optimizers = (encoder_optimizer, decoder_optimizer)

    # Start from clean gradients on both optimizers
    for optimizer in optimizers:
        optimizer.zero_grad()

    # Forward pass; `teacher_forcing_ratio` is read from module scope
    loss, n_totals, print_losses = seq2seq.forward_batch(
        input_variable, target_variable, max_target_len, lengths, mask, teacher_forcing_ratio, batch_size)
    loss.backward()

    # Clip gradients in place, encoder first and then decoder
    for module in (seq2seq.encoder, seq2seq.decoder):
        nn.utils.clip_grad_norm_(module.parameters(), clip)

    # Apply the parameter updates
    for optimizer in optimizers:
        optimizer.step()

    return sum(print_losses) / n_totals

def train_loop(vocab, pairs, seq2seq, encoder_optimizer, decoder_optimizer, embedding, save_dir, epochs,
               batch_size, print_every, save_every, clip, checkpoint_filename):
    """Train `seq2seq` for `epochs` iterations, printing progress and saving checkpoints.

    Each "epoch" here is a single training iteration on one batch of `batch_size`
    pairs drawn at random (with replacement) from `pairs`.

    Args:
        vocab: vocabulary used to build batches; its `__dict__` is stored in checkpoints.
        pairs: sentence pairs to sample batches from.
        seq2seq: model to train; its `encoder` and `decoder` are what gets checkpointed.
        encoder_optimizer, decoder_optimizer: optimizers stepped by `train_step`.
        embedding: embedding module whose state is stored in checkpoints.
        save_dir: root folder for checkpoints.
        epochs: number of the last iteration to run (iterations are numbered from 1).
        batch_size: number of pairs per batch.
        print_every: print the running average loss every this many iterations.
        save_every: save a checkpoint every this many iterations.
        clip: maximum gradient norm, forwarded to `train_step`.
        checkpoint_filename: truthy when resuming; training then starts one past the
            iteration stored in the module-level `checkpoint` dict.

    Checkpoints are written to `<save_dir>/<hidden_size>/<iteration>_checkpoint.tar`,
    where `hidden_size` is read from module scope.
    """
    # Load batches for each iteration (all of them are drawn up front)
    training_batches = [batch_to_train_data(vocab, [random.choice(pairs) for _ in range(batch_size)])
                        for _ in range(epochs)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    steps_since_print = 0
    if checkpoint_filename:
        # `checkpoint` is the module-level dict loaded when the model was built
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for epoch in range(start_iteration, epochs + 1):
        # Extract from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batches[epoch - 1]

        # Run a training iteration with batch
        loss = train_step(input_variable, lengths, target_variable, mask, max_target_len, seq2seq, encoder_optimizer,
                          decoder_optimizer, batch_size, clip)
        print_loss += loss
        steps_since_print += 1

        # Print progress
        if epoch % print_every == 0:
            # Average over the iterations actually accumulated, so the figure stays
            # correct when a resumed run starts in the middle of a print window
            print_loss_avg = print_loss / steps_since_print
            print("Epoch: {}; Completion: {:.1f}%; Avg loss: {:.4f}".format(epoch, epoch / epochs * 100, print_loss_avg))
            print_loss = 0
            steps_since_print = 0

        # Save checkpoint
        if epoch % save_every == 0:
            directory = os.path.join(save_dir, '{}'.format(hidden_size))
            os.makedirs(directory, exist_ok=True)
            torch.save({
                'iteration': epoch,
                'vocab_dict': vocab.__dict__,
                # Save the model that was passed in rather than module-level globals
                'encoder': seq2seq.encoder.state_dict(),
                'encoder_optimizer': encoder_optimizer.state_dict(),
                'decoder': seq2seq.decoder.state_dict(),
                'decoder_optimizer': decoder_optimizer.state_dict(),
                'loss': loss,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(epoch, 'checkpoint')))


### Run training

In [5]:
# Configure training/optimization hyperparams.
clip = 50.0                     # max gradient norm used when clipping in train_step
teacher_forcing_ratio = 0.5     # read as a global by train_step and passed to forward_batch
learning_rate = 0.0001          # learning rate of the encoder optimizer
decoder_learning_ratio = 5.0    # decoder learning rate = learning_rate * decoder_learning_ratio
epochs = 4000                   # number of training iterations (train_loop runs one batch per "epoch")

print_every = 1                 # print the average loss every N iterations
save_every = 500                # write a checkpoint every N iterations

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
# When resuming, restore the optimizer states taken from the loaded checkpoint
if checkpoint_filename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Run training iterations
print("Start training...")
train_loop(vocab, pairs, seq2seq, encoder_optimizer, decoder_optimizer, embedding, save_dir, epochs, batch_size,
           print_every, save_every, clip, checkpoint_filename)


Start training...
Initializing ...
Training...


  loss = cross_entropy.masked_select(mask).mean()
  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch: 1; Completion: 2.5%; Avg loss: 6.4762
Epoch: 2; Completion: 5.0%; Avg loss: 6.3502
Epoch: 3; Completion: 7.5%; Avg loss: 6.1476
Epoch: 4; Completion: 10.0%; Avg loss: 5.7190
Epoch: 5; Completion: 12.5%; Avg loss: 5.6533
Epoch: 6; Completion: 15.0%; Avg loss: 5.2770
Epoch: 7; Completion: 17.5%; Avg loss: 5.0650
Epoch: 8; Completion: 20.0%; Avg loss: 4.6833
Epoch: 9; Completion: 22.5%; Avg loss: 4.5040
Epoch: 10; Completion: 25.0%; Avg loss: 4.4756
Epoch: 11; Completion: 27.5%; Avg loss: 4.0854
Epoch: 12; Completion: 30.0%; Avg loss: 3.8745
Epoch: 13; Completion: 32.5%; Avg loss: 3.6492
Epoch: 14; Completion: 35.0%; Avg loss: 3.6343
Epoch: 15; Completion: 37.5%; Avg loss: 3.5598
Epoch: 16; Completion: 40.0%; Avg loss: 3.3270
Epoch: 17; Completion: 42.5%; Avg loss: 3.4147
Epoch: 18; Completion: 45.0%; Avg loss: 3.2389
Epoch: 19; Completion: 47.5%; Avg loss: 3.2522
Epoch: 20; Completion: 50.0%; Avg loss: 3.1396
Epoch: 21; Completion: 52.5%; Avg loss: 3.0715
Epoch: 22; Completion: 55

## Evaluation

In [6]:
# Manages the low-level process of handling the input sentence
def evaluate(searcher, vocab, sentence, max_sentence_length=MAX_SENTENCE_LENGTH):
    """Decode a response to `sentence` and return it as a list of words.

    Args:
        searcher: callable invoked as `searcher(input_batch, lengths, max_sentence_length)`
            that returns a `(tokens, scores)` pair.
        vocab: vocabulary used to turn words into indexes and, through
            `vocab.index2word`, indexes back into words.
        sentence: input sentence (the caller normalizes it first).
        max_sentence_length: maximum length handed to the searcher.

    Returns:
        The decoded words, one per returned token.

    A failed vocabulary lookup propagates to the caller; `eval_input` catches
    `KeyError` and reports it as an unknown word.
    """
    ### Format input sentence as a batch
    # words -> indexes (a batch holding a single sentence)
    indexes_batch = [get_indexes_from_sentence(vocab, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to(device)
    # Decode sentence with searcher; no gradients are needed for inference, so
    # autograd tracking is disabled for the duration of the search
    with torch.no_grad():
        tokens, _ = searcher(input_batch, lengths, max_sentence_length)
    # indexes -> words
    return [vocab.index2word[token.item()] for token in tokens]


def eval_input(searcher, vocab):
    """Interactive chat loop: read a sentence, print the model's reply, repeat.

    Each line is read with the prompt 'You> '. Entering 'q' or 'quit' ends the
    loop. Any other line is normalized with `normalize_string`, decoded with
    `evaluate`, stripped of 'EOS' and 'PAD' tokens and printed after 'Chatbot>'.
    A `KeyError` raised while handling a line is reported as an unknown word and
    the loop continues.
    """
    quit_commands = ('q', 'quit')
    hidden_tokens = ('EOS', 'PAD')

    while True:
        try:
            user_text = input('You> ')
            # Leave the loop on an explicit quit command
            if user_text in quit_commands:
                break
            # Normalize, then decode a reply
            reply = evaluate(searcher, vocab, normalize_string(user_text))
            # Drop the special tokens before showing the reply
            visible_words = [word for word in reply if word not in hidden_tokens]
            print('Chatbot>', ' '.join(visible_words))
        except KeyError:
            print("ERROR: Unknown word.")


######################################################################
# Run Evaluation
# ~~~~~~~~~~~~~~
#
# To chat with your model, run the following block.
#

# Set dropout layers to eval mode
encoder.eval()
decoder.eval()

# Initialize search module
searcher = SearchDecoder(seq2seq)

# Begin chatting: this starts the interactive loop; type 'q' or 'quit' to leave it
eval_input(searcher, vocab)


Chatbot> the of
Chatbot> the of
Chatbot> the of
ERROR: Unknown word.
ERROR: Unknown word.
