# LSTM Chatbot
_ _ _
## Overview  

I am training a simple seq2seq architecture (encoder, attention decoder) using movie
scripts from the [Cornell Movie-Dialogs Corpus](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html). Note that, despite the title, the recurrent layers defined below are GRUs (`nn.GRU`). <br/> Let's start by downloading the [dataset](https://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip) first.  

## Imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
import itertools
import math
import json
from io import open
from torch import optim
from torch.jit import script, trace

## Data Preprocessing  

### File Formatting Functions

In [2]:
def load_data(fileName):
    """Read a JSON-lines utterance file into line and conversation tables.

    Every line of ``fileName`` is parsed as one JSON object with the keys
    ``id``, ``speaker``, ``text``, ``conversation_id`` and ``meta.movie_id``.

    Returns:
        A ``(lines, conversations)`` tuple. ``lines`` maps each line id to a
        dict with ``lineID``, ``characterID`` and ``text``. ``conversations``
        maps each conversation id to a dict with ``conversationID``,
        ``movieID`` and ``lines`` (a list of the line dicts above).
    """
    lines = {}
    conversations = {}
    with open(fileName, 'r', encoding='iso-8859-1') as handle:
        for raw in handle:
            record = json.loads(raw)
            utterance = {
                "lineID": record["id"],
                "characterID": record["speaker"],
                "text": record["text"],
            }
            lines[utterance["lineID"]] = utterance

            conv_id = record["conversation_id"]
            conversation = conversations.get(conv_id)
            if conversation is None:
                # First utterance seen for this conversation: create its entry.
                conversations[conv_id] = {
                    "conversationID": conv_id,
                    "movieID": record["meta"]["movie_id"],
                    "lines": [utterance],
                }
            else:
                # Later utterances are prepended, so the stored list is in the
                # reverse of file order (presumably the file lists the newest
                # utterance of a conversation first -- confirm against data).
                conversation["lines"].insert(0, utterance)
    return lines, conversations

def extract_pairs(conversations):
    """Build ``[input, target]`` sentence pairs from consecutive lines.

    For every conversation, each line is paired with the line that follows
    it. Texts are stripped of surrounding whitespace, and a pair is kept only
    when both sides are non-empty.
    """
    pairs = []
    for conversation in conversations.values():
        entries = conversation["lines"]
        # zip against the list shifted by one to walk adjacent lines.
        for current, following in zip(entries, entries[1:]):
            prompt = current["text"].strip()
            reply = following["text"].strip()
            if prompt and reply:
                pairs.append([prompt, reply])
    return pairs

Call the above functions to create a formatted file.




In [3]:
# Locations of the corpus directory and of the tab-separated pairs file that
# this cell writes.
corpus_name = "movie-corpus"
corpus = os.path.join("data", corpus_name)
datafile = os.path.join(corpus, "formatted_data.txt")
delimiter = '\t'
# Decode escape sequences in the delimiter; for the literal tab above this
# leaves the value unchanged.
delimiter = str(codecs.decode(delimiter, "unicode_escape"))
# Placeholders; both names are rebound by load_data() just below.
lines = {}
conversations = {}
# Load lines and conversations from the corpus' utterances.jsonl
print('Loading data...')
lines, conversations = load_data(os.path.join(corpus, "utterances.jsonl"))
# Write formatted_data.txt: one "input<TAB>target" pair per row
print('Writing formatted file...')
with open(datafile, 'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter, lineterminator='\n')
    for pair in extract_pairs(conversations):
        writer.writerow(pair)
print('Done!')

Loading data...
Writing formatted file...
Done!


## Vocabulary




In [4]:
PAD_token = 0  # Padding for short sentences
SOS_token = 1  # Start of sentence
EOS_token = 2  # End of sentence


class Voc:
    """Word <-> index vocabulary with per-word occurrence counts.

    Indices 0-2 are reserved for the PAD, SOS and EOS tokens; regular words
    are numbered from 3 upwards in the order they are first added.

    Attributes:
        name: label given to the vocabulary (the corpus name in this file).
        trimmed: True once ``trim`` has run; later ``trim`` calls are no-ops.
        word2index: word -> integer index.
        word2count: word -> number of times it was added.
        index2word: integer index -> word (includes the special tokens).
        num_words: total number of entries, special tokens included.
    """

    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self._reset_tables()

    def _reset_tables(self):
        """(Re)initialise the lookup tables so they hold only special tokens."""
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3

    def add_sentence(self, sentence):
        """Add every space-separated word of ``sentence`` to the vocabulary."""
        for word in sentence.split(' '):
            self.add_word(word)

    def add_word(self, word):
        """Register ``word``, or bump its count if it is already known."""
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        """Keep only words seen at least ``min_count`` times.

        Runs once per instance. The tables are rebuilt from the kept words,
        so indices are renumbered and every kept word's count restarts at 1.
        """
        if self.trimmed:
            return
        self.trimmed = True
        keep_words = [
            word for word, count in self.word2count.items() if count >= min_count
        ]
        total = len(self.word2index)
        # Guard the ratio: an empty vocabulary used to raise ZeroDivisionError.
        ratio = len(keep_words) / total if total else 0.0
        print('keep_words {} / {} = {:.4f}'.format(len(keep_words), total, ratio))
        # Reinitialize dictionaries
        self._reset_tables()
        for word in keep_words:
            self.add_word(word)

### Important parameters for building vocabulary

In [5]:
# Length cutoff in words: filter_pair() keeps a pair only when both of its
# sentences split into fewer than MAX_LENGTH space-separated tokens.
MAX_LENGTH = 10  # Maximum sentence length
# Frequency cutoff passed to trim_rare_words(): words counted fewer than
# MIN_RARE_COUNT times are removed from the vocabulary.
MIN_RARE_COUNT = 3 # Minimum count for eliminating rare words

- Convert the Unicode strings to ASCII
- Convert all letters to lowercase
- Trim all non-letter characters except for basic punctuation
- Filter out sentence pairs in which either sentence has ``MAX_LENGTH`` or more words





In [6]:
# https://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    """Return ``s`` with combining marks (accents) removed.

    The string is NFD-decomposed so accented letters split into a base
    character plus combining marks, then every character in Unicode category
    ``Mn`` is dropped. Other non-ASCII characters are left as they are.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def normalize_string(s):
    """Lowercase, de-accent and simplify ``s`` for vocabulary building.

    Steps, in order: lowercase and strip, remove accents with
    ``unicode_to_ascii``, put a space before each of ``. ! ?``, replace every
    run of other non-letter characters with one space, collapse whitespace
    and strip the result.
    """
    text = unicode_to_ascii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),        # detach sentence punctuation
        (r"[^a-zA-Z.!?]+", r" "),    # drop everything but letters and . ! ?
        (r"\s+", r" "),              # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def read_vocs(datafile, corpus_name):
    """Read the formatted pairs file and create an empty vocabulary.

    Args:
        datafile: path to a UTF-8 text file with one tab-separated sentence
            pair per line.
        corpus_name: name given to the new ``Voc``.

    Returns:
        ``(voc, pairs)`` where ``voc`` is a fresh, still empty ``Voc`` and
        ``pairs`` is a list with one entry per line, each a list of the
        line's tab-separated fields passed through ``normalize_string``.
    """
    # Use a context manager so the file handle is closed deterministically;
    # the previous bare open(...).read() never closed it.
    with open(datafile, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs

def filter_pair(p):
    """Return True when both sentences of pair ``p`` are under MAX_LENGTH words.

    Length is the number of pieces produced by splitting on a single space.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    return len(p[1].split(' ')) < MAX_LENGTH

def filter_pairs(pairs):
    """Return the sub-list of ``pairs`` accepted by ``filter_pair``, in order."""
    return list(filter(filter_pair, pairs))

def load_prepare_data(corpus, corpus_name, datafile):
    """Load sentence pairs, drop over-long ones and populate the vocabulary.

    Args:
        corpus: not used by this function; kept for the existing call
            signature.
        corpus_name: name for the vocabulary.
        datafile: path of the formatted, tab-separated pairs file.

    Returns:
        ``(voc, pairs)``: the vocabulary filled from the kept pairs, and the
        kept pairs themselves. Progress counts are printed along the way.
    """
    voc, all_pairs = read_vocs(datafile, corpus_name)
    print("Read {} sentence pairs".format(len(all_pairs)))
    kept_pairs = filter_pairs(all_pairs)
    print("Trimmed to {} sentence pairs".format(len(kept_pairs)))
    # Count the words of both sides of every surviving pair.
    for kept in kept_pairs:
        voc.add_sentence(kept[0])
        voc.add_sentence(kept[1])
    print("Counted words:", voc.num_words)
    return voc, kept_pairs

# Build the vocabulary and the length-filtered sentence pairs from the
# formatted pairs file written earlier in this notebook.
voc, pairs = load_prepare_data(corpus, corpus_name, datafile)

Read 221282 sentence pairs
Trimmed to 64313 sentence pairs
Counted words: 18082


### Trim rarely used words from vocabulary.




In [7]:
def trim_rare_words(voc, pairs, min_count):
    """Trim rare words from ``voc`` and drop pairs that use a trimmed word.

    Args:
        voc: vocabulary object; its ``trim(min_count)`` is called first and
            its ``word2index`` is then used for membership tests.
        pairs: list of ``[input_sentence, output_sentence]`` pairs.
        min_count: minimum occurrence count for a word to stay.

    Returns:
        The pairs in which every space-separated word of both sentences is
        still in the vocabulary. A summary line is printed.
    """
    voc.trim(min_count)
    known_words = voc.word2index

    def _all_known(sentence):
        # True when every word of the sentence survived the trim.
        return all(word in known_words for word in sentence.split(' '))

    keep_pairs = [
        pair for pair in pairs if _all_known(pair[0]) and _all_known(pair[1])
    ]
    # Guard the ratio: an empty ``pairs`` used to raise ZeroDivisionError.
    kept_ratio = len(keep_pairs) / len(pairs) if pairs else 0.0
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs),
        len(keep_pairs), kept_ratio))
    return keep_pairs

# Remove words seen fewer than MIN_RARE_COUNT times and every pair that
# contains one of them.
pairs = trim_rare_words(voc, pairs, MIN_RARE_COUNT)

keep_words 7833 / 18079 = 0.4333
Trimmed from 64313 pairs to 53131, 0.8261 of total


### Convert to Tensors



In [8]:
def indexes_from_sentence(voc, sentence):
    """Map a sentence to vocabulary indices followed by ``EOS_token``.

    Raises:
        KeyError: if a space-separated word is missing from ``voc.word2index``.
    """
    indexes = [voc.word2index[token] for token in sentence.split(' ')]
    indexes.append(EOS_token)
    return indexes

def zero_padding(l, fillvalue=PAD_token):
    """Transpose a batch of index lists, padding short ones with ``fillvalue``.

    ``l`` holds one sequence per batch item. The result has one tuple per
    time step, each with one entry per batch item.
    """
    columns = itertools.zip_longest(*l, fillvalue=fillvalue)
    return list(columns)

def binary_matrix(l, value=PAD_token):
    """Return a 0/1 matrix marking the non-padding entries of ``l``.

    Args:
        l: sequence of rows of token indices.
        value: the index treated as padding. It defaults to ``PAD_token``;
            the parameter used to be ignored in favour of the hard-coded
            constant, so the default behaviour is unchanged.

    Returns:
        A list of lists shaped like ``l`` with 0 where the token equals
        ``value`` and 1 everywhere else.
    """
    return [[0 if token == value else 1 for token in seq] for seq in l]

# Returns padded input sequence tensor and lengths
def input_var(l, voc):
    """Encode a batch of input sentences as a padded tensor plus lengths.

    Returns:
        ``(pad_var, lengths)``: a LongTensor built from the transposed,
        padded index lists (one row per time step), and a tensor holding
        each sentence's length including its EOS token.
    """
    encoded = [indexes_from_sentence(voc, sentence) for sentence in l]
    lengths = torch.tensor(list(map(len, encoded)))
    pad_var = torch.LongTensor(zero_padding(encoded))
    return pad_var, lengths

# Returns padded target sequence tensor, padding mask, and max target length
def output_var(l, voc):
    """Encode a batch of target sentences as a padded tensor and mask.

    Returns:
        ``(pad_var, mask, max_target_len)``: the padded LongTensor (one row
        per time step), a BoolTensor of the same shape that is False on
        padding positions, and the longest encoded target length (EOS
        included).
    """
    encoded = [indexes_from_sentence(voc, sentence) for sentence in l]
    max_target_len = max(map(len, encoded))
    padded_rows = zero_padding(encoded)
    mask = torch.BoolTensor(binary_matrix(padded_rows))
    pad_var = torch.LongTensor(padded_rows)
    return pad_var, mask, max_target_len

# Returns all items for a given batch of pairs
def batch_to_train_data(voc, pair_batch):
    """Turn a batch of sentence pairs into the tensors used for training.

    The pairs are ordered by decreasing input length (in words) before
    encoding, so the input lengths come out in descending order. The
    caller's list is left untouched: a sorted copy is used rather than the
    in-place sort this function used to perform.

    Returns:
        ``(inp, lengths, output, mask, max_target_len)`` as produced by
        ``input_var`` and ``output_var``.
    """
    ordered = sorted(pair_batch, key=lambda pair: len(pair[0].split(" ")), reverse=True)
    input_batch = [pair[0] for pair in ordered]
    output_batch = [pair[1] for pair in ordered]
    inp, lengths = input_var(input_batch, voc)
    output, mask, max_target_len = output_var(output_batch, voc)
    return inp, lengths, output, mask, max_target_len

# Example for validation
# Sanity check: build one small random batch and inspect its tensors
small_batch_size = 5
batches = batch_to_train_data(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

# Print all variables of the example batch
print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)

input_variable: tensor([[  11,  175,  287,   19,  162],
        [  83,    4,  288,   17,  665],
        [  11,   11,  210,   92,   14],
        [ 200,  257,   14,   10,    2],
        [ 512,   10,    2,    2,    0],
        [ 743,    2,    0,    0,    0],
        [ 483,    0,    0,    0,    0],
        [2498,    0,    0,    0,    0],
        [  14,    0,    0,    0,    0],
        [   2,    0,    0,    0,    0]])
lengths: tensor([10,  6,  5,  5,  4])
target_variable: tensor([[  33,   25,   24,  111,  112],
        [  36,  590,   64,   24,   10],
        [  17,   14,  555,  154,    2],
        [1758,    2,    7,  832,    0],
        [  14,    0,   72,   16,    0],
        [   2,    0,   14,   72,    0],
        [   0,    0,    2,   99,    0],
        [   0,    0,    0,   10,    0],
        [   0,    0,    0,    2,    0]])
mask: tensor([[ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  Tr

## Define Model  

### Encoder

In [9]:
class Encoder(nn.Module):
    """Bidirectional, multi-layer GRU encoder over embedded input tokens.

    The recurrent layer is stored in the attribute ``lstm`` although it is
    an ``nn.GRU``. Inter-layer dropout is only applied when ``n_layers`` is
    not 1.
    """

    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        rnn_dropout = dropout if n_layers != 1 else 0
        self.lstm = nn.GRU(hidden_size, hidden_size, n_layers,
                           dropout=rnn_dropout, bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded, time-major batch.

        Args:
            input_seq: token indices, one row per time step.
            input_lengths: true length of each sequence, passed unchanged to
                ``pack_padded_sequence``.
            hidden: optional initial hidden state for the GRU.

        Returns:
            ``(outputs, hidden)``: the padded GRU outputs with the forward
            and backward halves of the feature axis summed, and the final
            hidden state returned by the GRU.
        """
        embedded = self.embedding(input_seq)
        packed_in = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        packed_out, hidden = self.lstm(packed_in, hidden)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(packed_out)
        # Fold the two directions together so the feature size is hidden_size.
        forward_half = unpacked[:, :, :self.hidden_size]
        backward_half = unpacked[:, :, self.hidden_size:]
        return forward_half + backward_half, hidden

### Attention

In [10]:
# Luong attention layer
class Attn(nn.Module):
    """Attention layer with ``'dot'``, ``'general'`` or ``'concat'`` scoring.

    Args:
        method: which score function to use.
        hidden_size: feature size of the decoder state and encoder outputs.

    Raises:
        ValueError: if ``method`` is not one of the three supported names.
    """

    def __init__(self, method, hidden_size):
        super().__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # torch.FloatTensor(n) returns uninitialised memory, which may
            # hold NaN/inf; allocate and then initialise explicitly instead.
            self.v = nn.Parameter(torch.empty(hidden_size))
            bound = hidden_size ** -0.5 if hidden_size > 0 else 0.0
            nn.init.uniform_(self.v, -bound, bound)

    def dot_score(self, hidden, encoder_output):
        """Score = sum over features of ``hidden * encoder_output``."""
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        """Score = sum over features of ``hidden * Linear(encoder_output)``."""
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        """Score = sum of ``v * tanh(Linear([hidden; encoder_output]))``."""
        # Repeat the decoder state along the first axis to match the encoder.
        expanded = hidden.expand(encoder_output.size(0), -1, -1)
        energy = self.attn(torch.cat((expanded, encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights over the encoder positions.

        The raw scores are transposed, soft-maxed along dim 1 and given an
        extra axis at position 1, so the weights along the last axis of the
        result sum to 1.
        """
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)
        attn_energies = attn_energies.t()
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

### Attention Decoder

In [11]:
class Decoder(nn.Module):
    """Single-step GRU decoder with attention over the encoder outputs.

    The recurrent layer is stored in the attribute ``lstm`` although it is
    an ``nn.GRU``. ``forward`` handles one time step per call.
    """

    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super().__init__()

        # Plain configuration values
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Sub-modules (inter-layer GRU dropout is disabled for one layer)
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        rnn_dropout = dropout if n_layers != 1 else 0
        self.lstm = nn.GRU(hidden_size, hidden_size, n_layers, dropout=rnn_dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Decode one step.

        Args:
            input_step: indices of the current input tokens.
            last_hidden: previous GRU hidden state.
            encoder_outputs: outputs of the encoder to attend over.

        Returns:
            ``(output, hidden)``: softmax probabilities over the output
            vocabulary (normalised along dim 1) and the new hidden state.
        """
        embedded = self.embedding_dropout(self.embedding(input_step))
        rnn_output, hidden = self.lstm(embedded, last_hidden)
        # Attention-weighted sum of the encoder outputs.
        attn_weights = self.attn(rnn_output, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1)).squeeze(1)
        # Combine the GRU output with the context vector.
        features = torch.cat((rnn_output.squeeze(0), context), 1)
        combined = torch.tanh(self.concat(features))
        probabilities = F.softmax(self.out(combined), dim=1)
        return probabilities, hidden

### Sequence to sequence

In [12]:
class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper that computes the masked training loss.

    Args:
        encoder: module called as ``encoder(input_seq, input_length)``.
        decoder: module exposing ``n_layers`` and called one step at a time.
        device: device onto which decoder inputs are moved.
        ratio: probability of using teacher forcing for a whole batch.
    """

    def __init__(self, encoder, decoder, device, ratio):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.teacher_forcing_ratio = ratio

    def forward(self, input_seq, input_length, target, mask, max_target_len, batch_size):
        """Run one forward pass over a batch and accumulate the loss.

        Fixes over the earlier version: the decoder layer count and the
        teacher-forcing ratio are read from this instance
        (``self.decoder.n_layers``, ``self.teacher_forcing_ratio``) rather
        than from module-level globals named ``decoder`` and
        ``teacher_forcing_ratio``.

        Returns:
            ``(loss, n_totals, print_losses)``: the summed per-step masked
            loss, the total number of unmasked target tokens, and a list
            with each step's loss multiplied by its unmasked-token count.
        """
        loss = 0
        print_losses = []
        n_totals = 0

        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Seed the decoder with the first n_layers slices of the encoder state.
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        decoder_input = torch.LongTensor([[SOS_token] * batch_size]).to(self.device)

        # One draw per batch: feed either the ground truth or the decoder's
        # own best guess as the next input for every step.
        use_teacher_forcing = random.random() < self.teacher_forcing_ratio

        for t in range(max_target_len):
            decoder_output, decoder_hidden = self.decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            if use_teacher_forcing:
                decoder_input = target[t].view(1, -1)
            else:
                _, topi = decoder_output.topk(1)
                decoder_input = topi.detach().reshape(1, -1)
            decoder_input = decoder_input.to(self.device)
            mask_loss, nTotal = maskNLLLoss(decoder_output, target[t], mask[t], self.device)
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
        return loss, n_totals, print_losses

### Mask Loss

In [13]:
def maskNLLLoss(inp, target, mask, device):
    """Mean negative log-likelihood over the unmasked batch entries.

    Args:
        inp: per-item probabilities; the target's probability is gathered
            along dim 1.
        target: index of the correct class for each item.
        mask: boolean tensor selecting the items that count.
        device: device the resulting loss is moved to.

    Returns:
        ``(loss, n)``: the mean of ``-log(p_target)`` over the selected
        items, and the number of selected items as a Python number.
    """
    nTotal = mask.sum()
    target_probs = torch.gather(inp, 1, target.view(-1, 1)).squeeze(1)
    per_item_loss = -torch.log(target_probs)
    loss = per_item_loss.masked_select(mask).mean().to(device)
    return loss, nTotal.item()

### Training Functions

In [14]:
def train_step(input_variable, lengths, target_variable, mask, seq2seq,
          seq2seq_optimizer, max_target_len, batch_size, clip, device):
    """Run one optimisation step on a single batch.

    Gradients are zeroed, the batch is moved to ``device`` (the lengths go
    to the CPU), the model's loss is back-propagated, gradients are clipped
    to norm ``clip`` and the optimizer takes a step.

    Returns:
        The sum of the per-step weighted losses divided by the total number
        of target tokens reported by the model.
    """
    seq2seq_optimizer.zero_grad()

    # Batch tensors follow the model's device; the lengths are kept on the CPU.
    inputs_on_device = input_variable.to(device)
    targets_on_device = target_variable.to(device)
    mask_on_device = mask.to(device)
    cpu_lengths = lengths.to("cpu")

    loss, n_totals, print_losses = seq2seq(
        inputs_on_device, cpu_lengths, targets_on_device,
        mask_on_device, max_target_len, batch_size)
    loss.backward()

    # Clip gradients in place before the update
    nn.utils.clip_grad_norm_(seq2seq.parameters(), clip)
    seq2seq_optimizer.step()
    return sum(print_losses) / n_totals

def train(voc, pairs, seq2seq, seq2seq_optimizer, embedding,
          encoder_n_layers, decoder_n_layers, n_iteration,
          batch_size, print_every, clip, corpus_name, device):
    """Train ``seq2seq`` for ``n_iteration`` randomly sampled batches.

    All batches are sampled (with replacement) and converted to tensors up
    front, then fed to ``train_step`` one per iteration. Every
    ``print_every`` iterations the average loss since the last report is
    printed. ``embedding``, ``encoder_n_layers``, ``decoder_n_layers`` and
    ``corpus_name`` are accepted but not used in this function.
    """
    # Pre-sample every training batch before the loop starts.
    training_batches = [
        batch_to_train_data(voc, [random.choice(pairs) for _ in range(batch_size)])
        for _ in range(n_iteration)
    ]
    print('Initializing ...')
    running_loss = 0
    print("Training...")
    for iteration, training_batch in enumerate(training_batches, start=1):
        input_variable, lengths, target_variable, mask, max_target_len = training_batch
        running_loss += train_step(
            input_variable, lengths, target_variable, mask, seq2seq,
            seq2seq_optimizer, max_target_len, batch_size, clip, device)
        if iteration % print_every != 0:
            continue
        # Report the mean loss of the window and start a new one.
        print_loss_avg = running_loss / print_every
        print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
        running_loss = 0

### Important Training Parameters

In [21]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")
# Attention score function; the Attn layer also accepts 'general' and 'concat'.
attn_model = 'dot' #'general', 'concat'
# Size of the embeddings and of the GRU hidden state.
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
# Number of sentence pairs sampled per training batch.
batch_size = 64
# Maximum gradient norm passed to clip_grad_norm_ in train_step().
clip = 50.0
# Probability that a batch is decoded with teacher forcing.
teacher_forcing_ratio = 0.5
learning_rate = 0.0005
# Number of training batches, and how often (in iterations) to print the loss.
n_iteration = 20000
print_every = 500

### Initialize Model

In [16]:
# One embedding table is shared by the encoder and the decoder.
embedding = nn.Embedding(voc.num_words, hidden_size)
encoder = Encoder(hidden_size, embedding, encoder_n_layers, dropout)
decoder = Decoder(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
encoder = encoder.to(device)
decoder = decoder.to(device)
# Training mode enables dropout.
encoder.train()
decoder.train()
seq2seq = Seq2Seq(encoder, decoder, device, teacher_forcing_ratio)
seq2seq = seq2seq.to(device)
seq2seq.train()
seq2seq_optimizer = optim.Adam(seq2seq.parameters(), lr=learning_rate)

# Keep any optimizer state tensors on the same device as the model. Use
# .to(device) rather than a hard-coded .cuda(), which would fail on a
# CPU-only machine whenever the optimizer already holds state.
for state in seq2seq_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.to(device)

### Start training...

In [17]:
# Run the full training loop with the hyper-parameters and model objects
# defined in the previous cells.
train(voc, pairs, seq2seq, seq2seq_optimizer,
      embedding, encoder_n_layers, decoder_n_layers, n_iteration, batch_size,
      print_every, clip, corpus_name, device)

Initializing ...
Training...
Iteration: 500; Percent complete: 2.5%; Average loss: 4.6804
Iteration: 1000; Percent complete: 5.0%; Average loss: 4.3220
Iteration: 1500; Percent complete: 7.5%; Average loss: 4.1529
Iteration: 2000; Percent complete: 10.0%; Average loss: 4.0934
Iteration: 2500; Percent complete: 12.5%; Average loss: 3.9846
Iteration: 3000; Percent complete: 15.0%; Average loss: 3.9340
Iteration: 3500; Percent complete: 17.5%; Average loss: 3.7831
Iteration: 4000; Percent complete: 20.0%; Average loss: 3.6860
Iteration: 4500; Percent complete: 22.5%; Average loss: 3.5487
Iteration: 5000; Percent complete: 25.0%; Average loss: 3.4249
Iteration: 5500; Percent complete: 27.5%; Average loss: 3.3220
Iteration: 6000; Percent complete: 30.0%; Average loss: 3.1692
Iteration: 6500; Percent complete: 32.5%; Average loss: 3.0810
Iteration: 7000; Percent complete: 35.0%; Average loss: 2.8828
Iteration: 7500; Percent complete: 37.5%; Average loss: 2.7786
Iteration: 8000; Percent compl

### Evaluation



In [18]:
class GreedySearchDecoder(nn.Module):
    """Greedy decoder: at each step pick the highest-probability token."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        """Decode ``max_length`` tokens for a single input sequence.

        Fixes over the earlier version: the decoder layer count comes from
        ``self.decoder`` and the working device from ``input_seq`` instead of
        the module-level globals ``decoder`` and ``device``.

        Returns:
            ``(all_tokens, all_scores)``: 1-D tensors with the chosen token
            index and its score for every step. Decoding always runs for
            ``max_length`` steps; it does not stop at an EOS token.
        """
        device = input_seq.device
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Start from a single SOS token.
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        for _ in range(max_length):
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Restore the leading axis so the token can be fed back in.
            decoder_input = torch.unsqueeze(decoder_input, 0)
        return all_tokens, all_scores

In [19]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    """Generate a reply to one already-normalised sentence.

    Args:
        encoder, decoder: accepted for the existing call signature; the
            function only uses ``searcher``.
        searcher: callable taking ``(input_batch, lengths, max_length)`` and
            returning ``(tokens, scores)``.
        voc: vocabulary used to encode the sentence and decode the tokens.
        sentence: space-separated words, all present in ``voc``.
        max_length: number of tokens to generate.

    Returns:
        The list of decoded words, one per generated token.

    Raises:
        KeyError: if a word of ``sentence`` is not in the vocabulary.
    """
    indexes_batch = [indexes_from_sentence(voc, sentence)]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Batch of one, transposed so that time is the first axis.
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    input_batch = input_batch.to(device)
    lengths = lengths.to("cpu")
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        tokens, scores = searcher(input_batch, lengths, max_length)
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words

def evaluateInput(encoder, decoder, searcher, voc):
    """Interactive chat loop reading from standard input.

    Each entered line is normalised, answered via ``evaluate`` and printed
    with the ``EOS``/``PAD`` words removed. Typing ``q`` or ``quit`` ends
    the loop; a word missing from the vocabulary prints an error message and
    the loop continues.
    """
    input_sentence = ''
    while True:
        try:
            input_sentence = input('> ')
            if input_sentence in ('q', 'quit'):
                break
            input_sentence = normalize_string(input_sentence)
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Hide the special tokens from the printed reply.
            visible_words = [word for word in output_words if word not in ('EOS', 'PAD')]
            print('Bot:', ' '.join(visible_words))
        except KeyError:
            print("Error: Encountered unknown word.")

### Start evaluation...

In [20]:
# Switch both networks to evaluation mode (disables dropout), then start the
# interactive chat loop using greedy decoding.
encoder.eval()
decoder.eval()
searcher = GreedySearchDecoder(encoder, decoder)
evaluateInput(encoder, decoder, searcher, voc)

> hi
Bot: hi . d . your . .
> i am your dad
Bot: why is that guy ?
> so tell me what is your name
Bot: bob ! on me !
> bob are you cute
Bot: but !
> what
Bot: oh jesus . the baby the story ?
> baby story
Bot: you you you you you
> what
Bot: oh jesus . the baby the story ?
> shutup
Error: Encountered unknown word.
> please go away
Bot: i ll give you some .
> give me what
Bot: oh shut it !
> you shut it
Bot: no s it . it s . .
> Now you are loosing your mind
Error: Encountered unknown word.
> are you smart
Bot: i m trying to be fucked .
> I can fuck you
Bot: fuck you ! ! !
> Shut it bitch
Bot: fuck is it . .
> q


## References

1) Yuan-Kuei Wu’s pytorch-chatbot implementation:
   https://github.com/ywk991112/pytorch-chatbot

2) Sean Robertson’s practical-pytorch seq2seq-translation example:
   https://github.com/spro/practical-pytorch/tree/master/seq2seq-translation

3) FloydHub’s Cornell Movie Corpus preprocessing code:
   https://github.com/floydhub/textutil-preprocess-cornell-movie-corpus
