In [44]:
import re, math
import numpy as np

import nltk
nltk.download('wordnet') 
from nltk.corpus import wordnet

import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical
from torch.autograd import Variable
import torch.nn.functional as F 
import import_ipynb
from MoveData import Options, json2datatools, num_batches, nopeak_mask, create_masks
from EncoderDecoder import Encoder, Decoder

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/carsonlam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Neural Network as an Agent

In this section we will be putting everything together. We learned AI in order to make AI robots, so let's finally do that. 

If you are not using this notebook to learn, change the variable `teaching` below to `False` so that other notebooks can import the functions defined in this notebook without running all the examples. If you are here to learn and interact with the notebook, set it to `True`.

You have seen the cell below before. We will need the input and output vocabulary fields `infield, outfield` for our demonstration of how a sequence of words is represented by the transformer and the role that probability plays in the model's outputs. Note that, for the time being, we are assuming that the vocabulary consists of all the words in the training set, nothing more, nothing less. So if you have added some lines of data since running START_HERE, simply retrain the model from START_HERE before moving on.

In [8]:
# Set to False when another notebook imports this one, so the example cells are skipped.
teaching = True

# Run options: CPU device, batch size, training hyperparameters, beam width,
# maximum sequence length and the path of the trained weights.
# NOTE(review): the beam-search functions further down read `opt.k`, while only
# `beam_width` is passed here — confirm that Options exposes `k`.
opt = Options(batchsize=2, device = torch.device("cpu"), epochs=25, lr=0.01, 
              beam_width=3, max_len = 25, save_path = '../saved/weights/model_weights')

# Build the data iterator and the input/output vocabulary fields from the saved pairs.
# NOTE(review): json2datatools also returns `opt`; presumably it fills in values derived
# from the data — confirm in MoveData.
data_iter, infield, outfield, opt = json2datatools(path='../saved/pairs.json', opt=opt)

Until we build more capabilities into Chloe, we will have to rely on a few software tools. The first is a tool for expanding Chloe's vocabulary without having to learn new words from scratch.

nltk is the [Natural Language Toolkit](https://www.nltk.org/) that we will be using for things such as synonym matching, that way when you say "adore", Chloe knows it means the same thing as "like", even if "adore" is not in Chloe's vocabulary. 

Run the cell below to see how we go from string words to integers and how when we cannot find a word in our vocabulary, we try to find a synonym for that word that is in our vocabulary. 

In [10]:
def get_synonym(word, field, explain=False):
    '''
    Look for a WordNet synonym of `word` that already has a token in the vocabulary.

    input:
        word (str) the word to find a replacement for
        field, object whose `vocab.stoi` maps strings to integer tokens
        explain, set this to True to print every synset and lemma that is tried
    output:
        the integer token of the first lemma whose token is not 0,
        or 0 when no such lemma exists
    '''
    for synset in wordnet.synsets(word):
        if explain:
            print('synonym:', synset.name())
        for lemma in synset.lemmas():
            candidate = lemma.name()
            if explain:
                print('-lemma:', candidate)
            token = field.vocab.stoi[candidate]
            if token == 0:
                # token 0 is treated as "not in the vocabulary", keep searching
                continue
            if explain:
                print('found in vocab', candidate)
            return token
    return 0 # no lemma of any synset is in the vocabulary


In [11]:
# Example (only runs in teaching mode): print every synset and lemma WordNet offers
# for "fine" until one of them has a token in the input vocabulary.
if teaching:
    print('token = ', get_synonym("fine", infield, explain=True))

synonym: fine.n.01
-lemma: fine
-lemma: mulct
-lemma: amercement
synonym: ticket.v.01
-lemma: ticket
-lemma: fine
synonym: all_right.s.01
-lemma: all_right
-lemma: fine
-lemma: o.k.
-lemma: ok
found in vocab ok
token =  6


The next function takes your sentence in the form of text and converts it to a sequence of tokens within a torch tensor

In [12]:
def string2tensor(string, inputfield, explain=False):
    '''
    Turn a sentence into a batch holding one sequence of integer tokens.

    input:
        string (str) input sentence
        inputfield, field object providing `preprocess` and `vocab.stoi`
        explain, set this to True if you want to see how the sentence was split
    output:
        torch.LongTensor with a single row containing one integer per token
        returned by `inputfield.preprocess`
    '''
    tokens = inputfield.preprocess(string)
    if explain:
        print(tokens)

    def to_index(tok):
        index = inputfield.vocab.stoi[tok]
        # an index of 0 is treated as "not found": try a synonym instead
        return index if index != 0 else get_synonym(tok, inputfield)

    return torch.LongTensor([[to_index(tok) for tok in tokens]])

In [13]:
# Example: show how a raw sentence is split into tokens and mapped to integers.
# Guarded by `teaching` like the other examples, so that importing this notebook
# from another notebook does not run it.
if teaching:
    input_sequence = string2tensor("ok, aren't -you [a robot?", infield, explain=True)
    print(input_sequence)

['ok', ',', "aren't", 'you', 'a', 'robot', '?']
tensor([[ 6,  0,  0,  3,  9, 31,  2]])


Let's assemble the Encoder and Decoder into the Transformer. Like older sequence-to-sequence models, Transformers encode the sentence into vector representations and pass those representations along to the decoder to generate the response/reply/output/translation/etc. We can also use the Encoder and Decoder separately, for example `model.encoder(arguments)`. The very last part of the transformer maps each vector in the decoder output to logits, one for each token in the output vocabulary: `output = self.out(d_output)`. There are as many logits as there are tokens in the output vocabulary. In the conceptual diagram we pretend that each decoder output is a 4-dimensional vector and that the last linear layer maps this vector to an output vocabulary of only 5 tokens, including the end-of-sentence `<eos>` token. In a more realistic model, the decoder might output 512-dimensional vectors, and the last linear layer maps each vector to an output vocabulary that is a few thousand tokens wide, including words, punctuation marks, and special tokens such as end-of-sentence and unknown. The softmax applied later normalizes these logits so that they sum to 1.0, which lets you treat them as a probability distribution over the vocabulary from which the agent will draw/sample its next word. 

<img src="../saved/images/vec2vocab.png" height=500 width=600>

Define the Transformer class, instantiate a model and load the weights you trained in START_HERE into that model by running the next 2 cells. 

In [14]:
class Transformer(nn.Module):
    '''
    Encoder and Decoder followed by a linear layer that maps every decoder
    output vector to one logit per token of the output vocabulary.

    input (constructor):
        in_vocab_size, number of tokens in the input vocabulary
        out_vocab_size, number of tokens in the output vocabulary
        emb_dim, n_layers, heads, dropout are passed unchanged to both
        the Encoder and the Decoder
    '''
    def __init__(self, in_vocab_size, out_vocab_size, emb_dim, n_layers, heads, dropout):
        super(Transformer, self).__init__()
        shared_args = (emb_dim, n_layers, heads, dropout)
        self.encoder = Encoder(in_vocab_size, *shared_args)
        self.decoder = Decoder(out_vocab_size, *shared_args)
        self.out = nn.Linear(in_features=emb_dim, out_features=out_vocab_size)

    def forward(self, src_seq, trg_seq, src_mask, trg_mask):
        '''
        Encode `src_seq`, decode `trg_seq` against that encoding and return
        the logits produced by the final linear layer.
        '''
        memory = self.encoder(src_seq, src_mask)
        decoded = self.decoder(trg_seq, memory, src_mask, trg_mask)
        return self.out(decoded)

In [16]:
# Model hyperparameters.
# NOTE(review): presumably these must match the values used when the weights were
# trained in START_HERE, otherwise load_state_dict will reject the checkpoint — confirm.
emb_dim, n_layers, heads, dropout = 32, 3, 8, 0.01 
chloe = Transformer(len(infield.vocab), len(outfield.vocab), emb_dim, n_layers, heads, dropout)

# map_location remaps the stored tensors onto the configured device, so weights that
# were saved on a GPU can still be loaded on a CPU-only machine.
chloe.load_state_dict(torch.load(opt.save_path, map_location=opt.device))

<All keys matched successfully>

In [127]:
def talk(input_str, chloe, opt):
    '''
    Sample a reply to `input_str` one token at a time from the model's output
    distribution. Uses the notebook-level `infield` and `outfield` vocabularies.

    input:
        input_str (str) the sentence to reply to
        chloe, model exposing `eval`, `encoder`, `decoder` and `out`
        opt, options object; `opt.max_len` caps the number of sampled tokens
    output:
        decoder_input, LongTensor with one row: <sos> followed by the sampled tokens
        de_str (str), those tokens converted to strings and joined by spaces
        logprobs, tensor with one row and one column per sampled token, holding
                  the negated log-probability of that token
    '''
    src = string2tensor(input_str, infield)
    src_mask = (src != infield.vocab.stoi['<pad>']).unsqueeze(-2)
    chloe.eval()
    memory = chloe.encoder(src, src_mask)

    decoder_input = torch.LongTensor([[outfield.vocab.stoi['<sos>']]])
    logprobs = torch.Tensor([[]])
    itos = outfield.vocab.itos

    for step in range(1, opt.max_len + 1):
        trg_mask = nopeak_mask(size=step, opt=opt)
        logits = chloe.out(chloe.decoder(decoder_input, memory, src_mask, trg_mask))
        distr = Categorical(probs=F.softmax(logits, dim=-1))
        # a token is drawn for every position; only the newest position is kept
        action = distr.sample()[:, -1].unsqueeze(0)
        step_logprob = -distr.log_prob(action)[:, -1].unsqueeze(0)
        decoder_input = torch.cat((decoder_input, action), dim=1)
        logprobs = torch.cat((logprobs, step_logprob), dim=1)
        if itos[action] == '<eos>':
            break  # the reply is complete

    de_str = ' '.join([itos[tok] for tok in decoder_input[0]])
    return decoder_input, de_str, logprobs



#outputs = torch.zeros(1, opt.max_len).long()
#print(' '.join([outfield.vocab.itos[tok] for tok in out[0]]))
#outfield.vocab.itos[ix]
#outfield.vocab.stoi

In [129]:
# Example: sample one reply from the model. Guarded by `teaching` like the other
# examples, so that importing this notebook from another notebook does not run it.
if teaching:
    decoder_input, de_str, logprobs = talk("how?", chloe, opt)
    print(de_str)

<sos> meowci beaucoup <eos>


In [130]:
def init_vars(src, model, SRC, TRG, opt):
    '''
    Prepare the state for beam search: encode the source once and seed every
    beam with <sos> followed by one of the `opt.k` most probable first tokens.

    input:
        src, LongTensor holding one source sequence (one row)
        model, model exposing `encoder`, `decoder` and `out`
        SRC, TRG, input and output fields (their `vocab.stoi` is used)
        opt, options object; `opt.k` is the number of beams and
             `opt.max_len` the number of columns allocated per beam
    output:
        outputs, LongTensor (opt.k, opt.max_len); column 0 is <sos>, column 1
                 holds the top-k first tokens, the remaining columns are 0
        e_outputs, the encoder output repeated once per beam
        log_scores, tensor (1, opt.k) with the log-probability of each first token
    '''
    # Every new tensor is created on the device of `src`. The previous check
    # `opt.device != -1` is always true for a torch.device, so `.cuda()` was
    # called even when running on the CPU.
    device = src.device

    init_tok = TRG.vocab.stoi['<sos>']
    src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
    e_output = model.encoder(src, src_mask)

    first_input = torch.LongTensor([[init_tok]]).to(device)
    trg_mask = nopeak_mask(1, opt)
    out = model.out(model.decoder(first_input, e_output, src_mask, trg_mask))
    out = F.softmax(out, dim=-1)
    probs, ix = out[:, -1].data.topk(opt.k)

    # torch.log turns a zero probability into -inf instead of raising like math.log.
    # The scores stay on the CPU, as before, because k_best_outputs adds them to a
    # tensor it builds on the CPU.
    log_scores = torch.log(probs[:1]).float().cpu()

    outputs = torch.zeros(opt.k, opt.max_len, dtype=torch.long, device=device)
    outputs[:, 0] = init_tok
    outputs[:, 1] = ix[0]

    # one copy of the encoder output per beam
    e_outputs = torch.zeros(opt.k, e_output.size(-2), e_output.size(-1), device=device)
    e_outputs[:, :] = e_output[0]
    return outputs, e_outputs, log_scores

In [51]:
def k_best_outputs(outputs, out, log_scores, i, k):
    '''
    Advance beam search by one position: keep the k best continuations.

    input:
        outputs, LongTensor with one row per beam; columns before `i` hold the
                 tokens chosen so far (modified in place)
        out, probabilities produced by the model, one distribution per beam and
             position; only the last position is used
        log_scores, tensor (1, k) with the accumulated log-probability of each beam
        i, the column of `outputs` to fill
        k, number of beams
    output:
        outputs, the same tensor with rows re-ordered to the surviving beams and
                 column `i` filled with their new tokens
        log_scores, tensor (1, k) with the updated accumulated log-probabilities
    '''
    probs, ix = out[:, -1].data.topk(k)
    # torch.log maps a probability that underflowed to 0 onto -inf, where math.log
    # raised ValueError. The result is placed on the device of log_scores so the
    # addition works wherever the scores live.
    step_log_probs = torch.log(probs).float().to(log_scores.device).reshape(-1).view(k, -1)
    log_probs = step_log_probs + log_scores.transpose(0, 1)
    k_probs, k_ix = log_probs.view(-1).topk(k)
    # flat index -> (beam it extends, rank of the token within that beam's top-k)
    row = k_ix // k
    col = k_ix % k
    outputs[:, :i] = outputs[row, :i]
    outputs[:, i] = ix[row, col]
    log_scores = k_probs.unsqueeze(0)
    return outputs, log_scores

tensor([[ 2, 14,  4,  3],
        [ 2, 18,  4,  3],
        [ 2,  8,  4,  3],
        [ 2,  9, 16,  3]], device='cuda:0') torch.Size([4, 4]) 1


In [35]:
def beam_search(src, model, SRC, TRG, opt):
    '''
    Generate a reply with beam search and return it as a string.

    input:
        src, LongTensor holding one source sequence
        model, model exposing `encoder`, `decoder` and `out`
        SRC, TRG, input and output fields
        opt, options object; `opt.k` beams are expanded for at most
             `opt.max_len` positions
    output:
        (str) the tokens of the selected beam joined by spaces, without the leading
        <sos> and cut before the first <eos>. When every beam has produced <eos>, the
        beam with the best length-normalised score is selected; otherwise beam 0
        is used. If the selected beam contains no <eos>, all of its tokens
        after <sos> are returned.
    '''
    outputs, e_outputs, log_scores = init_vars(src, model, SRC, TRG, opt)
    eos_tok = TRG.vocab.stoi['<eos>']
    src_mask = (src != SRC.vocab.stoi['<pad>']).unsqueeze(-2)
    alpha = 0.7  # exponent of the length normalisation applied to finished beams
    ind = None
    for i in range(2, opt.max_len):
        trg_mask = nopeak_mask(i, opt)
        out = model.out(model.decoder(outputs[:, :i], e_outputs, src_mask, trg_mask))
        out = F.softmax(out, dim=-1)
        outputs, log_scores = k_best_outputs(outputs, out, log_scores, i, opt.k)

        # Position of the first <eos> in every beam (0 = not finished yet). Allocated
        # on the device of `outputs` instead of unconditionally on the GPU.
        sentence_lengths = torch.zeros(len(outputs), dtype=torch.long, device=outputs.device)
        for beam, pos in (outputs == eos_tok).nonzero().tolist():
            if sentence_lengths[beam] == 0:
                sentence_lengths[beam] = pos

        if int((sentence_lengths > 0).sum()) == opt.k:
            # every beam is finished: pick the best length-normalised score
            div = 1 / (sentence_lengths.type_as(log_scores) ** alpha)
            _, ind = torch.max(log_scores * div, 1)
            ind = ind.data[0]
            break

    best = outputs[0] if ind is None else outputs[ind]
    eos_positions = (best == eos_tok).nonzero()
    # Without an <eos> (max_len reached first) keep the whole beam instead of
    # failing on an empty index.
    length = int(eos_positions[0]) if eos_positions.numel() > 0 else best.size(0)
    return ' '.join([TRG.vocab.itos[tok] for tok in best[1:length].tolist()])

torch.Size([2, 3, 1])

In [None]:
def talk_to_model(sentence, model, opt, SRC, TRG):
    '''
    Reply to `sentence` using beam search and tidy up the spacing of the reply.

    input:
        sentence (str) the sentence to reply to
        model, the model to query (put in eval mode)
        opt, options object; `opt.device` decides where the input tensor is placed
        SRC, TRG, input and output fields
    output:
        (str) the reply, with the space before ? ! . , and the space after an
        apostrophe removed
    '''
    model.eval()
    # same tokenisation and synonym fallback as everywhere else in this notebook
    indexed = string2tensor(sentence, SRC)
    if isinstance(opt.device, torch.device):
        # A torch.device never equals -1, so the old `!= -1` check sent the input
        # to the GPU even when the CPU was configured.
        indexed = indexed.to(opt.device)
    elif opt.device != -1:
        # kept for option objects that follow the "-1 means CPU" convention of the old check
        indexed = indexed.cuda()
    reply = beam_search(indexed, model, SRC, TRG, opt)
    return  multiple_replace({' ?' : '?',' !':'!',' .':'.','\' ':'\'',' ,':','}, reply)

In [None]:
def multiple_replace(dict, text):
    '''
    Replace every occurrence of a key of `dict` in `text` with its value,
    in a single pass over the text.

    input:
        dict, mapping from the substring to look for to its replacement
        text (str) the text to process
    output:
        (str) the text with all replacements applied; returned unchanged when
        the mapping is empty
    '''
    # An empty mapping would compile to "()", which matches the empty string
    # everywhere and then fails on the lookup of ''.
    if not dict:
        return text
    # Create a regular expression from the dictionary keys (escaped, so they match literally)
    regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))
    # For each match, look up the corresponding value in the dictionary
    return regex.sub(lambda mo: dict[mo.group(0)], text)

In [52]:
# Scratch cell: look at the shape of the padding mask before and after unsqueeze(-2).
# NOTE(review): `batch` and `model` are not defined in any cell shown in this notebook —
# this cell appears to rely on leftover kernel state; confirm, or switch to names
# defined here before relying on it.
# The result of the next line is not displayed or stored.
torch.CharTensor(2, 3).unsqueeze(-1).shape
init_tok = outfield.vocab.stoi['<sos>'] # 2 (int) 
# True wherever the token is not the padding token
input_mask = batch != infield.vocab.stoi['<pad>'] # tensor([[True, True, True, True, True, True, True, True]])
print(input_mask.shape)
# add a singleton dimension in front of the sequence dimension
input_mask = input_mask.unsqueeze(-2)
print(input_mask.shape)
e_output = model.encoder(batch, input_mask)

tensor([[True, True, True, True],
        [True, True, True, True],
        [True, True, True, True],
        [True, True, True, True]], device='cuda:0') torch.Size([4, 4])
tensor([[[True, True, True, True]],

        [[True, True, True, True]],

        [[True, True, True, True]],

        [[True, True, True, True]]], device='cuda:0') torch.Size([4, 1, 4])
