In [1]:
import re, math
import numpy as np

import nltk
nltk.download('wordnet') 
from nltk.corpus import wordnet

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F 
import import_ipynb
from MoveData import Options, csv2datatools
from Encoder import Encoder
from Decoder import Decoder

[nltk_data] Downloading package wordnet to /home/carson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


importing Jupyter notebook from MoveData.ipynb
importing Jupyter notebook from Encoder.ipynb
importing Jupyter notebook from Elements.ipynb
importing Jupyter notebook from Decoder.ipynb


The import cell for this notebook imports `torch` so you can use PyTorch. It also imports some Python code that I wrote in the folder *scripts*, which I will explain after I show you a toy example of how the whole code works together, using a chatbot that speaks in whatever style you want (cute, flirty, snide, etc.). `nltk` is the [Natural Language Toolkit](https://www.nltk.org/), which we will use for things such as synonym matching: that way, when you say "adore", Chloe knows it means the same thing as "like", even if "adore" is not in Chloe's vocabulary. To do this, nltk needs to download a folder called corpora; the `nltk.download('wordnet')` call in the import cell does that for you.


You have seen the cell below before. We will need the input and output vocabulary fields `infield, outfield` for our demonstration of how a sequence of words is represented by the transformer and the role that probability plays in the model's outputs. If the code is wrapped in triple quotes (''' code goes here '''), remove them, then run the cell below.

In [7]:
# Location of the chat-pair CSV, relative to this notebook.
csv_path = '../saved/chat_pairs.csv'
# Options comes from the project's MoveData notebook; batchsize=4 matches the
# 4-row batch printed further down in this notebook.
opt = Options(batchsize = 4)
# csv2datatools hands back a batch iterator, the input and output vocabulary
# fields, and the options object (rebound here, so it may have been updated).
# NOTE(review): 'en' is presumably a language code for tokenization — confirm in MoveData.
data_iter, infield, outfield, opt = csv2datatools(csv_path,'en', opt)

Uncomment the last line and run the cell below to see how we go from string words to integers and how, when we cannot find a word in our vocabulary, we try to find a synonym for that word that is in our vocabulary.

In [3]:
def get_synonym(word, field, explain=False):
    """Return the vocabulary index of the first WordNet lemma of `word` found in `field`.

    Synsets are visited in the order WordNet returns them, and within each
    synset the lemmas are visited in order; the first lemma whose index in
    `field.vocab.stoi` is non-zero wins. Index 0 is treated as "not in the
    vocabulary".

    Args:
        word: the out-of-vocabulary token to look up.
        field: object exposing `vocab.stoi`, a string -> integer mapping.
        explain: when True, print every synset and lemma that is inspected.

    Returns:
        The integer index of the first matching lemma, or 0 when none matches.
    """
    stoi = field.vocab.stoi
    for synset in wordnet.synsets(word):
        if explain:
            print('synonym:', synset.name())
        for lemma in synset.lemmas():
            candidate = lemma.name()
            if explain:
                print('-lemma:', candidate)
            token = stoi[candidate]
            if token == 0:
                # Lemma is not in the vocabulary; keep searching.
                continue
            if explain:
                print('found in vocab', candidate)
            return token
    # No lemma of any synset is in the vocabulary.
    return 0

#print('token = ', get_synonym("fine", infield, explain=True))

As we mentioned before, one ability (or limitation, depending on how you look at it) of Chloe is her fixed vocabulary: each word or symbol in her vocabulary is assigned an integer. For example, in the vocabulary printed in this notebook the word hi is assigned 22, the word robot is 26, and a word not in the vocabulary is 0. This integer is the `token` output of the `get_synonym` function.

The neural network sees every word as a vector. [A vector of 3 real numbers forms the coordinates in 3D space](https://youtu.be/fNk_zzaMoSs). We use many more dimensions than 3 in this example: if we use 512 dimensions, each word is a point in 512-dimensional space. The same concepts apply as in 3D space, in that the location of a word tells you its [meaning and meaning relative to other words](https://youtu.be/8rXD5-xhemo?t=1550).

<img src="../saved/images/wordvectors.png" height=400 width=400>

In the image you see that similar words are close to each other. Not only that, the direction in which they are separated from each other also carries meaning. In the image, there are 3 clusters of words, and the separation between them has something to do with age. If you stack all the vectors on top of each other row by row, you get a matrix. Remember how each word is represented by both a vector and an integer? Well, this integer is the index of a row in the matrix. The matrix is called the embedding matrix. You might say that we "embed" words into the matrix.

In [4]:
def string2tensor(string, inputfield, explain=False):
    """Convert a raw sentence into a 1 x N LongTensor of vocabulary indices.

    The sentence is tokenized with `inputfield.preprocess`. Each token is
    mapped through `inputfield.vocab.stoi`; a token that maps to 0 (unknown)
    is handed to `get_synonym`, which may still return 0.

    Args:
        string: the sentence to encode.
        inputfield: object exposing `preprocess(str)` and `vocab.stoi`.
        explain: when True, print the token list produced by `preprocess`.

    Returns:
        A `torch.LongTensor` holding a single row of token indices.
    """
    tokens = inputfield.preprocess(string)
    if explain:
        print(tokens)
    stoi = inputfield.vocab.stoi
    # `or` falls through to the synonym search exactly when the index is 0.
    indices = [stoi[tok] or get_synonym(tok, inputfield) for tok in tokens]
    return torch.LongTensor([indices])

In [9]:
# Deliberately messy input: the printed token list below shows that preprocess
# drops the stray '-' and '[' and splits "aren't" into 'are' + "n't"; tokens
# with no vocabulary entry or synonym (',' and "n't") come out as 0.
input_sequence = string2tensor("ok, aren't -you [a robot?", infield, explain=True)
print(input_sequence)

['ok', ',', 'are', "n't", 'you', 'a', 'robot', '?']
tensor([[10,  0,  7,  0,  5, 12, 26,  6]])


Let's take our first look inside the Transformer. Like older sequence-to-sequence models, Transformers encode the sentence into vector representations and pass those representations along to the decoder to generate the response/reply/output/translation/etc. The Encoder and Decoder have subcomponents that we will discuss later. For now, just know that we can use the different parts of the transformer separately, such as `model.encoder(arguments)`. Define the Transformer class, instantiate a model, and load the weights you trained in START_HERE into that model by running the next 2 cells. Note: for the time being, we are assuming that the vocabulary consists of all the words in the training set, nothing more, nothing less. So if you have added some lines of data since running START_HERE, simply retrain the model from START_HERE before moving on.

In [5]:
class Transformer(nn.Module):
    """Encoder-decoder model with a linear projection onto the output vocabulary.

    Sub-modules are exposed as `encoder`, `decoder` and `out`, so each part
    can be called on its own (e.g. `model.encoder(src, mask)`).
    """

    def __init__(self, in_vocab_size, out_vocab_size, emb_dim, n_layers, heads, dropout):
        """Build the encoder, the decoder and the output projection.

        Args:
            in_vocab_size: size of the input vocabulary (passed to Encoder).
            out_vocab_size: size of the output vocabulary (passed to Decoder
                and used as the width of the final linear layer).
            emb_dim, n_layers, heads, dropout: forwarded unchanged to both
                Encoder and Decoder; `emb_dim` is also the input width of
                the final linear layer.
        """
        super().__init__()
        # Same settings for both halves; construction order is encoder,
        # decoder, projection.
        shared = (emb_dim, n_layers, heads, dropout)
        self.encoder = Encoder(in_vocab_size, *shared)
        self.decoder = Decoder(out_vocab_size, *shared)
        self.out = nn.Linear(in_features=emb_dim, out_features=out_vocab_size)

    def forward(self, src_seq, trg_seq, src_mask, trg_mask):
        """Encode `src_seq`, decode `trg_seq` against it, and project to vocabulary scores.

        The returned values are raw linear outputs; no softmax is applied here.
        """
        memory = self.encoder(src_seq, src_mask)
        decoded = self.decoder(trg_seq, memory, src_mask, trg_mask)
        return self.out(decoded)

In [8]:
# Model hyperparameters. load_state_dict below needs the parameter shapes to
# agree with the saved checkpoint, so these should be the values the weights
# were trained with.
emb_dim, n_layers, heads, dropout = 64, 2, 8, 0.1
opt.save_path = '../saved/weights/model_weights'

model = Transformer(len(infield.vocab), len(outfield.vocab), emb_dim, n_layers, heads, dropout)

# opt.device == -1 is this notebook's convention for "run on CPU".
if opt.device != -1:
    model = model.cuda()

# map_location lets a checkpoint that was saved on a GPU be loaded on a
# CPU-only machine (and vice versa); without it torch.load tries to restore
# every tensor onto the device it was saved from.
model.load_state_dict(
    torch.load(opt.save_path, map_location='cuda' if opt.device != -1 else 'cpu'))

<All keys matched successfully>

In [47]:
# Display the input vocabulary's string -> index mapping. It is a defaultdict,
# and '<unk>' sits at index 0, which is the value the rest of this notebook
# treats as "not in vocabulary".
infield.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7f7c77a72630>>,
            {'<unk>': 0,
             '<pad>': 1,
             '<sos>': 2,
             '<eos>': 3,
             'chloe': 4,
             'you': 5,
             '?': 6,
             'are': 7,
             'bye': 8,
             'i': 9,
             'ok': 10,
             'later': 11,
             'a': 12,
             'alive': 13,
             'cya': 14,
             'do': 15,
             'dunno': 16,
             'go': 17,
             'goodbye': 18,
             'goodnight': 19,
             'got': 20,
             'hello': 21,
             'hi': 22,
             'how': 23,
             'ill': 24,
             'm': 25,
             'robot': 26,
             'see': 27,
             'ta': 28,
             'talk': 29,
             'think': 30,
             'to': 31,
             'true': 32,
             'ttyl': 33,
             'what': 34,
             'who': 35,
             'why':

In [51]:
# Pull one batch and swap its first two axes so that each row is one sentence
# (in the printed output every row starts with 2 = <sos> and ends with 3 = <eos>).
batch = next(iter(data_iter)).input_text.transpose(0,1)
# Also print the <pad> index, which the mask in the next cell compares against.
print(batch, batch.shape, infield.vocab.stoi['<pad>'])

tensor([[ 2, 14,  4,  3],
        [ 2, 18,  4,  3],
        [ 2,  8,  4,  3],
        [ 2,  9, 16,  3]], device='cuda:0') torch.Size([4, 4]) 1


In [52]:

init_tok = outfield.vocab.stoi['<sos>'] # start-of-sentence index (not used further in this cell)
input_mask = batch != infield.vocab.stoi['<pad>'] # boolean tensor shaped like batch: True wherever the token is not <pad>
print(input_mask.shape)
input_mask = input_mask.unsqueeze(-2) # insert a singleton axis before the last one, e.g. (4, 4) -> (4, 1, 4)
print(input_mask.shape)
# Run only the encoder half of the model on the batch.
e_output = model.encoder(batch, input_mask)

tensor([[True, True, True, True],
        [True, True, True, True],
        [True, True, True, True],
        [True, True, True, True]], device='cuda:0') torch.Size([4, 4])
tensor([[[True, True, True, True]],

        [[True, True, True, True]],

        [[True, True, True, True]],

        [[True, True, True, True]]], device='cuda:0') torch.Size([4, 1, 4])


In [35]:
# Scratch check of unsqueeze: -1 appends a trailing singleton axis, (2, 3) -> (2, 3, 1).
torch.CharTensor(2, 3).unsqueeze(-1).shape

torch.Size([2, 3, 1])

In [None]:
def init_vars(src, model, SRC, TRG, opt):
    """Run the first decoding step and set up the beam-search buffers.

    The source is encoded once, the decoder is fed a lone '<sos>' token, and
    the `opt.k` most probable first tokens seed the beams. Only the first
    element of the batch is used (`e_output[0]`, `probs[0]`, `ix[0]`), so
    `src` is expected to hold a single sentence.

    NOTE(review): `nopeak_mask` is not defined in the visible part of this
    notebook; it must be in scope before this function is called.

    Args:
        src: tensor of source token indices.
        model: object exposing `encoder`, `decoder` and `out`.
        SRC: input field; `SRC.vocab.stoi['<pad>']` is read.
        TRG: output field; `TRG.vocab.stoi['<sos>']` is read.
        opt: options with `device` (-1 means CPU), `k` (beam width) and
            `max_len` (length of the output buffer).

    Returns:
        outputs: LongTensor (opt.k, opt.max_len); column 0 is '<sos>',
            column 1 holds the k best first tokens, the rest is zero.
        e_outputs: float tensor (opt.k, src_len, emb_dim) holding the
            encoder output repeated once per beam.
        log_scores: CPU float tensor (1, opt.k) with the log-probability of
            each beam's first token.
    """
    use_cuda = opt.device != -1
    init_tok = TRG.vocab.stoi['<sos>']
    src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
    e_output = model.encoder(src, src_mask)

    # First decoder step: the target so far is just '<sos>'.
    outputs = torch.LongTensor([[init_tok]])
    if use_cuda:
        outputs = outputs.cuda()
    trg_mask = nopeak_mask(1, opt)
    out = model.out(model.decoder(outputs, e_output, src_mask, trg_mask))
    out = F.softmax(out, dim=-1)

    probs, ix = out[:, -1].detach().topk(opt.k)
    # torch.log maps a probability that underflowed to 0 onto -inf, whereas
    # math.log(0.0) raises ValueError. The result is kept as a CPU float
    # tensor, as before.
    log_scores = torch.log(probs[0].float().cpu()).unsqueeze(0)

    # Allocate the per-beam buffers and seed them.
    outputs = torch.zeros(opt.k, opt.max_len).long()
    e_outputs = torch.zeros(opt.k, e_output.size(-2), e_output.size(-1))
    if use_cuda:
        outputs = outputs.cuda()
        e_outputs = e_outputs.cuda()
    outputs[:, 0] = init_tok
    outputs[:, 1] = ix[0]
    e_outputs[:, :] = e_output[0]  # broadcast the single encoding to every beam
    return outputs, e_outputs, log_scores

In [None]:
def k_best_outputs(outputs, out, log_scores, i, k):
    """Advance beam search by one step, keeping the k best continuations.

    For every beam the k most probable next tokens are taken from the last
    position of `out`; each candidate's score is the beam's running
    log-score plus the log-probability of the token. The k best of the
    k * k candidates become the new beams.

    Args:
        outputs: LongTensor (k, max_len) of token indices; updated in place.
        out: probabilities of shape (k, steps, vocab); only `out[:, -1]`
            is read.
        log_scores: float tensor (1, k) with each beam's running log-score.
        i: column of `outputs` to fill in this step.
        k: beam width.

    Returns:
        (outputs, log_scores): the same `outputs` tensor with columns
        `:i + 1` rewritten for the surviving beams, and a new (1, k) tensor
        of their log-scores.
    """
    probs, ix = out[:, -1].detach().topk(k)
    # torch.log turns a zero probability into -inf; math.log(0.0) raised
    # ValueError. The result is moved to log_scores' dtype and device so the
    # sum below is computed where the running scores live.
    step_log_probs = torch.log(probs).reshape(k, -1).to(log_scores)
    log_probs = step_log_probs + log_scores.transpose(0, 1)
    k_probs, k_ix = log_probs.view(-1).topk(k)
    # Flat candidate index -> (source beam, rank within that beam's top-k).
    row = k_ix // k
    col = k_ix % k
    # Advanced indexing on the right-hand side copies, so reordering the
    # prefixes cannot alias the rows being overwritten.
    outputs[:, :i] = outputs[row, :i]
    outputs[:, i] = ix[row, col]
    log_scores = k_probs.unsqueeze(0)
    return outputs, log_scores

In [None]:
def beam_search(src, model, SRC, TRG, opt):
    """Decode a reply for `src` with beam search and return it as a string.

    Beams are extended one token at a time until every beam contains an
    '<eos>' token or `opt.max_len` is reached. When all beams have finished,
    the winner is the beam with the highest length-normalised score
    (log-score divided by length ** 0.7); otherwise the first beam is used.

    Changes from the earlier version: the per-beam length bookkeeping no
    longer calls `.cuda()` unconditionally (so CPU runs work), the inner loop
    no longer reuses the outer loop variable, the leftover debug prints are
    gone, and a first beam without '<eos>' is returned in full instead of
    raising IndexError.

    Args:
        src: tensor of source token indices.
        model: object exposing `encoder`, `decoder` and `out`.
        SRC: input field; `SRC.vocab.stoi['<pad>']` is read.
        TRG: output field; `TRG.vocab.stoi['<eos>']` and `TRG.vocab.itos`
            are read.
        opt: options with `k`, `max_len` and whatever `init_vars` and
            `nopeak_mask` need.

    Returns:
        The decoded tokens between '<sos>' and the first '<eos>', joined
        with single spaces.
    """
    outputs, e_outputs, log_scores = init_vars(src, model, SRC, TRG, opt)
    eos_tok = TRG.vocab.stoi['<eos>']
    src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
    ind = None
    for i in range(2, opt.max_len):
        trg_mask = nopeak_mask(i, opt)
        out = model.out(model.decoder(outputs[:, :i], e_outputs, src_mask, trg_mask))
        out = F.softmax(out, dim=-1)
        outputs, log_scores = k_best_outputs(outputs, out, log_scores, i, opt.k)

        # Position of the first '<eos>' in each beam; 0 means "none yet"
        # (column 0 always holds '<sos>', so 0 is never a real position).
        first_eos = [0] * len(outputs)
        for beam, pos in (outputs == eos_tok).nonzero().tolist():
            if first_eos[beam] == 0:
                first_eos[beam] = pos

        num_finished_sentences = sum(1 for pos in first_eos if pos > 0)
        if num_finished_sentences == opt.k:
            # Length normalisation, so longer replies are not penalised
            # merely for accumulating more log-probability terms.
            alpha = 0.7
            lengths = torch.tensor(first_eos, dtype=log_scores.dtype,
                                   device=log_scores.device)
            div = 1 / (lengths ** alpha)
            _, ind = torch.max(log_scores * div, 1)
            ind = int(ind[0])
            break

    # Fall back to the first beam when not every beam produced '<eos>'.
    best = outputs[0 if ind is None else ind]
    eos_positions = (best == eos_tok).nonzero()
    end = int(eos_positions[0]) if len(eos_positions) > 0 else len(best)
    return ' '.join([TRG.vocab.itos[tok] for tok in best[1:end].tolist()])

In [None]:
def talk_to_model(sentence, model, opt, SRC, TRG):
    """Generate the model's reply to a raw input sentence.

    The sentence is encoded with `string2tensor` (vocabulary lookup with a
    WordNet synonym fallback), decoded with `beam_search`, and the spaces
    that tokenization leaves around punctuation are removed.

    Args:
        sentence: the user's message as a plain string.
        model: the trained model; it is switched to eval mode here.
        opt: options; `opt.device != -1` means the input is moved to CUDA.
        SRC: input field used for preprocessing and vocabulary lookup.
        TRG: output field used to turn indices back into words.

    Returns:
        The reply string with punctuation re-attached.
    """
    model.eval()
    # Reuse the notebook's own encoder helper instead of repeating its loop;
    # it already returns a 1 x N LongTensor, so no Variable wrapper is needed.
    indexed = string2tensor(sentence, SRC)
    if opt.device != -1:
        indexed = indexed.cuda()
    # Inference only: skip autograd bookkeeping while decoding.
    with torch.no_grad():
        reply = beam_search(indexed, model, SRC, TRG, opt)
    return multiple_replace({' ?' : '?',' !':'!',' .':'.','\' ':'\'',' ,':','}, reply)

In [None]:
def multiple_replace(dict, text):
    """Replace every occurrence of each key of `dict` in `text` with its value.

    All keys are matched in a single left-to-right pass, so text produced by
    one replacement is never scanned again for further replacements.

    Args:
        dict: mapping of literal substrings to their replacements. (The
            parameter name shadows the built-in; it is kept so existing
            keyword callers keep working.)
        text: the string to rewrite.

    Returns:
        The rewritten string; `text` unchanged when `dict` is empty.
    """
    # An empty mapping would compile to "()", which matches the empty string
    # at every position and then fails the lookup with KeyError('').
    if not dict:
        return text
    # One alternation of the escaped keys, so they are matched literally.
    regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))
    # Look up each matched substring's replacement.
    return regex.sub(lambda mo: dict[mo.group(0)], text)