# ChatBot

## Project Overview

A chatbot that can converse with you at the command line. The chatbot uses a sequence-to-sequence text generation architecture with an LSTM/GRU as its memory unit.

The dataset chosen is SQuAD v2 from `torchtext.datasets`. See `prepare_data.py` for details.

In [1]:
"""
Code based on Matthew Inkawhich's tutorial <https://github.com/MatthewInkawhich>
"""

import torch
import torch.nn as nn
from torch import optim
import random
import os
from prepare_data import load_prepare_data, trim_unfrequent_words, batch_to_train_data, get_indexes_from_sentence, normalize_string
from models import Seq2Seq, SearchDecoder
from globals import SOS_TOKEN, MIN_COUNT, MAX_SENTENCE_LENGTH, device
from torchtext.datasets import SQuAD2

## Data wrangling

In [2]:
# Load the training split of SQuAD v2 from torchtext
train_dataset = SQuAD2(split='train')

# Build the vocabulary and the list of [question, answer] pairs.
# `save_dir` is the root folder later used to read and write checkpoints.
save_dir = os.path.join("checkpoints")
vocab, pairs = load_prepare_data(train_dataset)

# Print the first 10 pairs as a sanity check
print("\nSample question/answer pairs:")
for pair in pairs[:10]:
    print(pair)

# Trim the vocabulary and pairs using the MIN_COUNT threshold
# (presumably drops words seen fewer than MIN_COUNT times and the pairs
# containing them -- confirm in prepare_data.py)
pairs = trim_unfrequent_words(vocab, pairs, MIN_COUNT)

# Build one small random batch to check the tensors produced for training
small_batch_size = 5
batches = batch_to_train_data(vocab, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

# Uncomment to inspect the sample batch:
# print("input_variable:", input_variable)
# print("lengths:", lengths)
# print("target_variable:", target_variable)
# print("mask:", mask)
# print("max_target_len:", max_target_len)

Preparing training data...
Trimmed to 10291 sentence pairs
Counted  18213  words in vocab

Sample question/answer pairs:
['where did beyonce get her name from', 'her mother s maiden name']
['what race was beyonce s father', 'african american']
['beyonce s mother worked in what industry', 'hairdresser and salon owner']
['which of her teachers discovered beyonce s musical talent', 'dance instructor darlette johnson']
['what choir did beyonce sing in for two years', 'st . john s united methodist church']
['who signed the girl group on october', 'dwayne wiggins s grass roots entertainment']
['what event caused beyonce s depression', 'split with luckett and rober']
['how long was beyonce depressed', 'a couple of years']
['who replaced luckett and roberson in destiny s child', 'farrah franklin and michelle williams .']
['who filed a lawsuit over survivor', 'luckett and roberson']
keep_words 5382 / 18210 = 0.2956
Trimmed from 10291 pairs to 2383, 0.2316 of total


## Build Model

We define 3 main classes:
- `Encoder`
- `Decoder`
- `Seq2Seq`
In addition, there are the `Attention` and `SearchDecoder` classes and a loss function, `mask_NLLLoss`.

Check `chatbot/models.py`

To either train or evaluate the model, we must first initialize the encoder and decoder (through `Seq2Seq`).


In [3]:
# Hyperparams
hidden_size = 1000
batch_size = 64

# Checkpoint to resume from. The path matches the layout written by `train_loop`:
# <save_dir>/<hidden_size>/<iteration>_checkpoint.tar
checkpoint_iter = 4000
checkpoint_filename = os.path.join(save_dir, '{}'.format(hidden_size), '{}_checkpoint.tar'.format(checkpoint_iter))
# checkpoint_filename = None      # Uncomment this to train from scratch; leave it commented to load the saved checkpoint

# Load the saved state if a checkpoint file is provided
if checkpoint_filename:
    print('Loading checkpoint from ', checkpoint_filename)
    # map_location=device remaps the stored tensors onto the device in use, so a
    # checkpoint trained on GPU also loads on a CPU-only machine (and vice versa).
    # NOTE: torch.load unpickles the file -- only load checkpoints you trust.
    checkpoint = torch.load(checkpoint_filename, map_location=device)
    encoder_sd = checkpoint['encoder']
    decoder_sd = checkpoint['decoder']
    encoder_optimizer_sd = checkpoint['encoder_optimizer']
    decoder_optimizer_sd = checkpoint['decoder_optimizer']
    embedding_sd = checkpoint['embedding']
    # Restore the vocabulary exactly as it was when the checkpoint was saved
    vocab.__dict__ = checkpoint['vocab_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings (one `hidden_size`-dimensional vector per vocabulary word)
embedding = nn.Embedding(vocab.num_words, hidden_size)
if checkpoint_filename:
    embedding.load_state_dict(embedding_sd)

# Initialize Seq2seq (and encoder & decoder)
seq2seq = Seq2Seq(hidden_size, hidden_size, vocab.num_words, embedding)
encoder = seq2seq.encoder
decoder = seq2seq.decoder

if checkpoint_filename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Ready!')


Building encoder and decoder ...
Ready!


## Training

We have `train_step`, which runs a single iteration, and `train_loop`, which runs several epochs. The latter also saves a checkpoint every `save_every` iterations in the `checkpoints` folder.

In [4]:
def train_step(input_variable, lengths, target_variable, mask, max_target_len, seq2seq, encoder_optimizer,
               decoder_optimizer, batch_size, clip):
    """Run one optimisation step on a single batch and return its average loss.

    The batch tensors (`input_variable`, `lengths`, `target_variable`, `mask`,
    `max_target_len`) are forwarded unchanged to `seq2seq.forward_batch`,
    together with `batch_size` and the module-level `teacher_forcing_ratio`
    (which must be defined before this function is called).

    `clip` is the maximum gradient norm, applied separately to the encoder
    and to the decoder parameters.

    Returns `sum(print_losses) / n_totals`, where both values come from
    `seq2seq.forward_batch`.
    """
    optimizers = (encoder_optimizer, decoder_optimizer)

    # Start from clean gradients
    for optimizer in optimizers:
        optimizer.zero_grad()

    batch_loss, token_total, step_losses = seq2seq.forward_batch(
        input_variable, target_variable, max_target_len, lengths, mask, teacher_forcing_ratio, batch_size)
    batch_loss.backward()

    # Clip gradients in place, encoder first and then decoder
    for module in (seq2seq.encoder, seq2seq.decoder):
        nn.utils.clip_grad_norm_(module.parameters(), clip)

    # Apply the updates
    for optimizer in optimizers:
        optimizer.step()

    return sum(step_losses) / token_total

def train_loop(vocab, pairs, seq2seq, encoder_optimizer, decoder_optimizer, embedding, save_dir, epochs,
               batch_size, print_every, save_every, clip, checkpoint_filename):
    """Train `seq2seq` for up to `epochs` iterations, printing progress and saving checkpoints.

    Each "epoch" here is a single training iteration on one random batch of
    `batch_size` pairs sampled (with replacement) from `pairs`.

    Args:
        vocab: vocabulary object; its `__dict__` is stored in every checkpoint.
        pairs: sentence pairs to sample the training batches from.
        seq2seq: model to train; its `encoder` and `decoder` are checkpointed.
        encoder_optimizer: optimizer for the encoder parameters.
        decoder_optimizer: optimizer for the decoder parameters.
        embedding: embedding module whose state is stored in every checkpoint.
        save_dir: root folder for checkpoints.
        epochs: index of the last iteration to run.
        batch_size: number of pairs per batch.
        print_every: print the average loss every this many iterations.
        save_every: save a checkpoint every this many iterations.
        clip: maximum gradient norm, forwarded to `train_step`.
        checkpoint_filename: if truthy, training resumes from the iteration
            after the one stored in the module-level `checkpoint` dict.

    Also reads the module-level `hidden_size` to name the checkpoint folder:
    `<save_dir>/<hidden_size>/<iteration>_checkpoint.tar`.
    """
    # Load batches for each iteration
    training_batches = [batch_to_train_data(vocab, [random.choice(pairs) for _ in range(batch_size)])
                        for _ in range(epochs)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if checkpoint_filename:
        # Resume right after the last iteration stored in the loaded checkpoint
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for epoch in range(start_iteration, epochs + 1):
        training_batch = training_batches[epoch - 1]
        # Extract from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train_step(input_variable, lengths, target_variable, mask, max_target_len, seq2seq, encoder_optimizer,
                          decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if epoch % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Epoch: {}; Completion: {:.1f}%; Avg loss: {:.4f}".format(epoch, epoch / epochs * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if epoch % save_every == 0:
            directory = os.path.join(save_dir, '{}'.format(hidden_size))
            # exist_ok avoids the check-then-create race of os.path.exists + makedirs
            os.makedirs(directory, exist_ok=True)
            torch.save({
                'iteration': epoch,
                'vocab_dict': vocab.__dict__,
                # Save the modules of the model actually being trained, not
                # whatever module-level `encoder`/`decoder` happen to exist.
                'encoder': seq2seq.encoder.state_dict(),
                'encoder_optimizer': encoder_optimizer.state_dict(),
                'decoder': seq2seq.decoder.state_dict(),
                'decoder_optimizer': decoder_optimizer.state_dict(),
                'loss': loss,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(epoch, 'checkpoint')))


### Run training

In [5]:
# Configure training/optimization hyperparams.
clip = 50.0                     # max gradient norm used by train_step
teacher_forcing_ratio = 0.5     # read as a global by train_step
learning_rate = 0.0001          # encoder learning rate
decoder_learning_ratio = 5.0    # decoder lr = learning_rate * decoder_learning_ratio
epochs = 4000                   # number of training iterations (one batch each)

print_every = 10                # print the average loss every N iterations
save_every = 500                # save a checkpoint every N iterations

# Put both modules in training mode
encoder.train()
decoder.train()

# Instantiate optimizers
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
# When resuming, restore the optimizer states saved in the checkpoint
if checkpoint_filename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Run training iterations
print("Start training...")
train_loop(vocab, pairs, seq2seq, encoder_optimizer, decoder_optimizer, embedding, save_dir, epochs, batch_size,
           print_every, save_every, clip, checkpoint_filename)


Start training...
Initializing ...
Training...


  loss = cross_entropy.masked_select(mask).mean()
  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch: 10; Completion: 0.2%; Avg loss: 8.0014
Epoch: 20; Completion: 0.5%; Avg loss: 6.5781
Epoch: 30; Completion: 0.8%; Avg loss: 6.2205
Epoch: 40; Completion: 1.0%; Avg loss: 6.1914
Epoch: 50; Completion: 1.2%; Avg loss: 6.1113
Epoch: 60; Completion: 1.5%; Avg loss: 6.0349
Epoch: 70; Completion: 1.8%; Avg loss: 5.9107
Epoch: 80; Completion: 2.0%; Avg loss: 5.9231
Epoch: 90; Completion: 2.2%; Avg loss: 5.9018
Epoch: 100; Completion: 2.5%; Avg loss: 5.8078
Epoch: 110; Completion: 2.8%; Avg loss: 5.8198
Epoch: 120; Completion: 3.0%; Avg loss: 5.8195
Epoch: 130; Completion: 3.2%; Avg loss: 5.7150
Epoch: 140; Completion: 3.5%; Avg loss: 5.6077
Epoch: 150; Completion: 3.8%; Avg loss: 5.6171
Epoch: 160; Completion: 4.0%; Avg loss: 5.5671
Epoch: 170; Completion: 4.2%; Avg loss: 5.4948
Epoch: 180; Completion: 4.5%; Avg loss: 5.3419
Epoch: 190; Completion: 4.8%; Avg loss: 5.1994
Epoch: 200; Completion: 5.0%; Avg loss: 5.2025
Epoch: 210; Completion: 5.2%; Avg loss: 4.9796
Epoch: 220; Completion

## Evaluation

In [6]:
# Manages the low-level process of handling the input sentence
def evaluate(searcher, vocab, sentence, max_sentence_length=MAX_SENTENCE_LENGTH):
    """Decode a reply for one input sentence and return it as a list of words.

    Args:
        searcher: callable invoked as `searcher(input_batch, lengths,
            max_sentence_length)`, returning `(tokens, scores)`.
        vocab: vocabulary used to map words to indexes and, through
            `vocab.index2word`, indexes back to words.
        sentence: input sentence (the caller normalizes it first).
        max_sentence_length: maximum length passed to the searcher.

    Returns:
        The decoded words, one per token returned by the searcher.

    Raises:
        KeyError: presumably when the sentence contains a word missing from
            `vocab` (the caller reports it as an unknown word) -- confirm in
            `get_indexes_from_sentence`.
    """
    # words to indexes (a batch holding a single sentence)
    indexes_batch = [get_indexes_from_sentence(vocab, sentence)]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    input_batch = input_batch.to(device)
    lengths = lengths.to(device)
    # Decode sentence with searcher; no gradients are needed at inference time,
    # so skip autograd tracking to save memory and compute
    with torch.no_grad():
        tokens, scores = searcher(input_batch, lengths, max_sentence_length)
    # indexes to words
    decoded_words = [vocab.index2word[token.item()] for token in tokens]
    return decoded_words


def eval_input(searcher, vocab):
    """Interactive chat loop: read a sentence, print the chatbot's reply, repeat.

    Typing `q` or `quit` ends the loop. Each input is echoed, normalized with
    `normalize_string` and decoded with `evaluate`; the `EOS` and `PAD` tokens
    are removed from the reply before it is printed. A `KeyError` raised while
    handling a sentence is reported as an unknown word and the loop continues.
    """
    quit_commands = ('q', 'quit')
    special_tokens = ('EOS', 'PAD')

    while True:
        try:
            input_sentence = input('You> ')
            # Echo the input so it shows up in the captured notebook output
            print('You> ', input_sentence, flush=True)
            if input_sentence in quit_commands:
                break
            cleaned = normalize_string(input_sentence)
            reply = [word for word in evaluate(searcher, vocab, cleaned) if word not in special_tokens]
            print('Chatbot>', ' '.join(reply), flush=True)
        except KeyError:
            print("ERROR: Unknown word.")

In [26]:
# Run Evaluation!
# Set models to eval mode
encoder.eval()
decoder.eval()

# Instantiate search module (wraps the trained seq2seq model for decoding)
searcher = SearchDecoder(seq2seq)

# Begin chatting with Chatbot (type 'q' or 'quit' to stop)
eval_input(searcher, vocab)


You>  where do you live
Chatbot> southern asian asia
You>  what do you like in asia
Chatbot> interactions with other resistance
You>  I also do politics
Chatbot> vinyl in some bc
You>  vinyl? are you speaking in code?
Chatbot> gather are some metals
You>  Digging gold in Asia I guess
ERROR: Unknown word.
You>  tell me more about that business
Chatbot> middle of the first dense
You>  ok, please let's talk by phone
ERROR: Unknown word.
You>  q
