In [14]:
import math, time, os, datetime, shutil, pickle

import numpy as np

import torch
from torch import nn
import torch.nn.functional as F

import import_ipynb
from MoveData import *
from EncoderDecoder import *
from Talk import *
from Trainer import *
from LearningDynamics import *

importing Jupyter notebook from LearningDynamics.ipynb


In [94]:
def talk_to_model(input_str, model, opt, infield, outfield):
    '''
    Generate a reply from the dialogue model by sampling one token at a time.

    input:
        input_str is a string, it is what you want to say to the dialogue model
        model is a Transformer model with encoder, decoder and a last layer linear transformation
        opt is an options object with the maximum length of the output sequence opt.max_len
        infield and outfield are the data.fields that store the vocabulary
    output:
        an output string response from the dialogue model, with the '<sos>' and
        '<eos>' markers stripped (also when opt.max_len is reached before '<eos>')
    Note: this version assumes we are evaluating the model on CPU and a batch size of 1.
    Side effects: the model is switched to eval mode and moved to the CPU.
    '''
    # Imported here so the function does not depend on a star import for this name.
    from torch.distributions import Categorical

    model.eval()
    model.cpu()
    itos = outfield.vocab.itos

    input_sequence = string2tensor(input_str, infield)  # string to tensor
    # positions holding the pad token are masked out for the encoder
    input_mask = (input_sequence != infield.vocab.stoi['<pad>']).unsqueeze(-2)
    init_tok = outfield.vocab.stoi['<sos>']  # integer id of the start token
    decoder_input = torch.LongTensor([[init_tok]])  # start token initiates the decoder

    # Inference only: no autograd graph is needed while decoding.
    with torch.no_grad():
        # memory-aware variant kept for reference:
        # encoding = model.encoder(input_sequence, input_mask, model.memory, model.mem_mask)
        encoding = model.encoder(input_sequence, input_mask)

        # keep asking for the next token until the decoder emits '<eos>' or max_len is hit
        for pos in range(opt.max_len):
            # target mask; pos+1 because pos starts at 0
            decoder_input_mask = nopeak_mask(size=pos+1, opt=opt)
            out = model.out(model.decoder(decoder_input, decoder_input_mask, encoding, input_mask))
            # Only the newest position decides the next token; Categorical applies
            # the softmax normalisation to these logits itself.
            distr = Categorical(logits=out[:, -1])
            action = distr.sample().unsqueeze(-1)
            # append the sampled token to the running output sequence
            decoder_input = torch.cat((decoder_input, action), dim=1)
            if itos[action.item()] == '<eos>':
                break

    # [0] because batch size is 1; [1:] drops the start token in every case
    reply_ids = decoder_input[0, 1:].tolist()
    if reply_ids and itos[reply_ids[-1]] == '<eos>':
        reply_ids = reply_ids[:-1]  # drop the end token as well
    return ' '.join(itos[tok] for tok in reply_ids)

In [95]:
class MemoryTransformer(nn.Module):
    """Encoder-decoder transformer with a gated memory-update step.

    `forward` runs the encoder, the decoder and the final linear projection.
    `update_memory` blends the current memory with the last encoder/decoder
    outputs, but the components it relies on (`memory`, `MHDPA`,
    `NormalizeMemory`, `z_gate`) are NOT created by this class; they have to be
    attached to the instance before `update_memory` is called.

    Args:
        in_vocab_size: size of the input vocabulary, passed to `Encoder`.
        out_vocab_size: size of the output vocabulary, passed to `Decoder`
            and used as the width of the output projection.
        emb_dim: embedding width shared by encoder, decoder and projection.
        n_layers: number of layers, passed to `Encoder` and `Decoder`.
        heads: number of attention heads, passed to `Encoder` and `Decoder`.
        mem_slots: stored on the instance; not otherwise used by this class.
        dropout: dropout rate, passed to `Encoder` and `Decoder`.
    """

    # Everything update_memory() reads from the instance.
    _MEMORY_REQUIREMENTS = ("memory", "e_output", "d_output",
                            "MHDPA", "NormalizeMemory", "z_gate")

    def __init__(self, in_vocab_size, out_vocab_size, emb_dim, n_layers, 
                 heads, mem_slots, dropout):
        super().__init__()
        
        self.batch_size = None
        self.mem_slots = mem_slots
        
        self.encoder = Encoder(in_vocab_size, emb_dim, n_layers, heads, dropout)
        self.decoder = Decoder(out_vocab_size, emb_dim, n_layers, heads, dropout)
        self.out = nn.Linear(emb_dim, out_vocab_size)

    def update_memory(self):
        """Gate the last exchange into `self.memory` and rebuild `self.mem_mask`.

        Raises:
            AttributeError: if any attribute this step reads has not been set
                (forward() provides `e_output`/`d_output`; the rest must be
                attached by the caller).
        """
        missing = [name for name in self._MEMORY_REQUIREMENTS if not hasattr(self, name)]
        if missing:
            raise AttributeError(
                "update_memory() needs these attributes to be set first: "
                + ", ".join(missing))

        # memory attends over itself plus the latest encoder and decoder outputs
        mem_dialogue = torch.cat([self.memory, self.e_output, self.d_output], dim=-2) 
        new_memory, _ = self.MHDPA(self.memory, mem_dialogue, mem_dialogue)
        new_mem_norm = self.NormalizeMemory(new_memory + self.memory)
        # z_t decides, element-wise, how much of the old memory is overwritten
        z_t = torch.sigmoid(self.z_gate(self.memory))
        self.memory = (1 - z_t)*self.memory + z_t*new_mem_norm
        # all-true mask over the memory slots, built directly in torch
        slot_flags = torch.ones(1, 1, self.memory.size(-2), dtype=torch.uint8,
                                device=self.memory.device)
        self.mem_mask = slot_flags == 1
        
    def forward(self, in_toks, in_mask, out_toks, out_mask):  
        """Return the output-vocabulary logits for `out_toks` given `in_toks`.

        The encoder and decoder outputs are kept on the instance so that
        update_memory() can use them afterwards.
        """
        self.in_encoded = self.encoder(in_toks, in_mask)
        # update_memory() reads the encoder output under this name
        self.e_output = self.in_encoded
        self.d_output = self.decoder(out_toks, out_mask, self.in_encoded, in_mask)
        output = self.out(self.d_output)
        return output

In [96]:
# Run configuration: single-example batches on the CPU.
opt = Options(
    batchsize=1,
    device=torch.device("cpu"),
    epochs=20,
    lr=0.005,
    max_len=25,
    save_path='../saved/weights/memory_weights',
)

# json2datatools hands back the data iterator, both vocabulary fields and the options object.
data_iter, infield, outfield, opt = json2datatools(path='../saved/memory.json', opt=opt)

# Model hyper-parameters.
emb_dim = 32
n_layers = 2
heads = 8
mem_slots = 1
dropout = 0.01

chloe = MemoryTransformer(
    len(infield.vocab),
    len(outfield.vocab),
    emb_dim,
    n_layers,
    heads,
    mem_slots,
    dropout,
)

#load_subset_weights(chloe, opt)
# Quick smoke test of the decoding helper.
print(talk_to_model("my name is fluffy", chloe, opt, infield, outfield))

hi fluffy


In [98]:

# Scripted exchanges used for training: each entry pairs what the model hears
# ("listen") with the reply it should learn to give ("reply").
conversation_list = [
    {"listen": heard, "reply": answer}
    for heard, answer in (
        ("my name is fluffy", "hey fluffy!"),
        ("what is my name?", "fluffy pillow"),
        ("my name is fluffy what is my name?", "fluffy pillow"),
        ("my name is snuggles", "hello snuggles!"),
        ("what is my name?", "snuggles the bunny"),
        ("my name is snuggles what is my name?", "snuggles the bunny"),
        ("my name is bobo", "hi bobo!"),
        ("what is my name?", "you are bobo"),
        ("my name is bobo what is my name?", "you are bobo"),
    )
]

# Adam over all of chloe's parameters, with the learning rate taken from the options.
optimizer = torch.optim.Adam(
    chloe.parameters(),
    lr=opt.lr,
    betas=(0.9, 0.98),
    eps=1e-9,
)
# Multiply the learning rate by 0.6 once the monitored loss has stopped improving for 3 epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.6, patience=3
)
#scheduler = CosineWithRestarts(optimizer, T_max=len(conversation_list))

# 1x1 tensors holding the start / end token ids, ready to be concatenated onto replies.
sos_tok, eos_tok = (
    torch.LongTensor([[outfield.vocab.stoi[marker]]]) for marker in ('<sos>', '<eos>')
)

# Train chloe on the scripted exchanges, one exchange per optimisation step.
chloe.train()
start = time.time()
# Start from +inf so the first epoch always produces a checkpoint, whatever the loss scale.
best_loss = float("inf")
for epoch in range(opt.epochs):
    total_loss = 0
    for exchange in conversation_list:
        listen_string = exchange["listen"]
        reply_string = exchange["reply"]
        listen_toks = string2tensor(listen_string, infield)
        reply_toks = string2tensor(reply_string, outfield)
        # Teacher forcing: the decoder sees <sos> + reply and must predict reply + <eos>.
        reply_start = torch.cat((sos_tok, reply_toks), dim=1)
        reply_labels = torch.cat((reply_toks, eos_tok), dim=1).contiguous().view(-1)

        listen_mask, reply_mask = create_masks(listen_toks, reply_start, opt)

        optimizer.zero_grad()
        logits = chloe(listen_toks, listen_mask, reply_start, reply_mask)

        #chloe.update_memory() # Update Memory

        # flatten to (positions, vocab) so each position is scored against its label
        flat_logits = logits.view(-1, logits.size(-1))
        batch_loss = F.cross_entropy(flat_logits, reply_labels, ignore_index=opt.trg_pad)

        batch_loss.backward() #batch_loss.backward(retain_graph=True) #
        torch.nn.utils.clip_grad_norm_(chloe.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += batch_loss.item()

    epoch_loss = total_loss/len(conversation_list)
    scheduler.step(epoch_loss)

    # keep only the weights of the best epoch seen so far
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        torch.save(chloe.state_dict(), opt.save_path)
    print("%dm: epoch %d loss = %.3f" %((time.time() - start)//60, 
                                        epoch, epoch_loss))

# Switch to inference mode and replay the training prompts to see what was learned.
chloe.eval()

test_list = [
    " my name is fluffy ",
    " what is my name? ",
    " my name is fluffy what is my name?",
    " my name is snuggles",
    " what is my name? ",
    " my name is snuggles what is my name? ",
    " my name is bobo ",
    " what is my name? ",
    " my name is bobo what is my name? ",
]

for prompt in test_list:
    response = talk_to_model(prompt, chloe, opt, infield, outfield)
    print(" > ", prompt, " > ", response)

0m: epoch 0 loss = 0.440
0m: epoch 1 loss = 0.394
0m: epoch 2 loss = 0.249
0m: epoch 3 loss = 0.230
0m: epoch 4 loss = 0.234
0m: epoch 5 loss = 0.280
0m: epoch 6 loss = 0.252
0m: epoch 7 loss = 0.233
0m: epoch 8 loss = 0.213
0m: epoch 9 loss = 0.215
0m: epoch 10 loss = 0.212
0m: epoch 11 loss = 0.210
0m: epoch 12 loss = 0.248
0m: epoch 13 loss = 0.203
0m: epoch 14 loss = 0.217
0m: epoch 15 loss = 0.197
0m: epoch 16 loss = 0.156
0m: epoch 17 loss = 0.159
0m: epoch 18 loss = 0.417
0m: epoch 19 loss = 0.152
 >   my name is fluffy   >  hey fluffy !
 >   what is my name?   >  fluffy pillow
 >   my name is fluffy what is my name?  >  fluffy pillow
 >   my name is snuggles  >  hello snuggles !
 >   what is my name?   >  snuggles the bunny
 >   my name is snuggles what is my name?   >  snuggles the bunny
 >   my name is bobo   >  hi bobo !
 >   what is my name?   >  snuggles the bunny
 >   my name is bobo what is my name?   >  snuggles the bunny


In [None]:
def trainer(model, data_iterator, options, optimizer, scheduler):
    """Train `model` on `data_iterator` and checkpoint the best epoch.

    Args:
        model: module called as model(src, src_mask, trg_input, trg_mask) that
            returns logits whose last dimension is the output vocabulary.
        data_iterator: iterable of batches exposing `.listen` and `.reply`.
        options: object providing `device`, `epochs`, `trg_pad` and `save_path`.
        optimizer: optimizer stepped once per batch.
        scheduler: scheduler stepped once per epoch with the mean batch loss.

    Returns:
        The trained model (the CUDA copy when training ran on the GPU).

    Side effects: prints progress and writes the state dict to
    `options.save_path` every time the epoch loss improves.
    """
    if torch.cuda.is_available() and options.device == torch.device("cuda:0"):
        print("a GPU was detected, model will be trained on GPU")
        model = model.cuda()
    else:
        print("training on cpu")

    model.train()
    start = time.time()
    # +inf rather than a fixed number, so the first epoch always checkpoints
    best_loss = float("inf")
    for epoch in range(options.epochs):
        total_loss = 0.0
        n_batches = 0
        for batch in data_iterator:
            # swap the first two axes of each field
            # NOTE(review): presumably (seq, batch) -> (batch, seq); confirm against the iterator
            src = batch.listen.transpose(0,1)
            trg = batch.reply.transpose(0,1)
            # decoder input drops the last token; the labels drop the first
            trg_input = trg[:, :-1]
            src_mask, trg_mask = create_masks(src, trg_input, options)
            preds = model(src, src_mask, trg_input, trg_mask)

            ys = trg[:, 1:].contiguous().view(-1)
            optimizer.zero_grad()
            batch_loss = F.cross_entropy(preds.view(-1, preds.size(-1)),
                                         ys, ignore_index = options.trg_pad)
            batch_loss.backward()
            optimizer.step()
            total_loss += batch_loss.item()
            n_batches += 1

        # Average over the batches actually processed in this epoch; max() only
        # protects against an empty iterator.
        epoch_loss = total_loss / max(n_batches, 1)
        scheduler.step(epoch_loss)

        if epoch_loss < best_loss:
            best_loss = epoch_loss
            torch.save(model.state_dict(), options.save_path)
        print("%dm: epoch %d loss = %.3f" %((time.time() - start)//60, epoch, epoch_loss))

    return model
# Retrain chloe with the generic trainer on data_iter, then probe whether a name
# given in one turn can be recalled in the next turn after a memory update.
#load_subset_weights(chloe, opt)
# Fresh optimizer/scheduler for this run; the scheduler decay (0.9) is gentler
# than the 0.6 used by the hand-written loop in the earlier cell.
optimizer = torch.optim.Adam(chloe.parameters(), lr=opt.lr, betas=(0.9, 0.98), eps=1e-9)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=3)

chloe = trainer(chloe, data_iter, opt, optimizer, scheduler)
# NOTE(review): talk_to_chloe is not defined in this notebook; presumably it is
# provided by one of the star imports (Talk?) - confirm, or use talk_to_model.
print(talk_to_chloe("my name is snuggles", chloe, opt, infield, outfield))
# NOTE(review): update_memory() reads memory, e_output, MHDPA, NormalizeMemory and
# z_gate, none of which the MemoryTransformer class in this notebook creates, so
# this call is expected to raise AttributeError until those are wired in.
chloe.update_memory() # Update Memory 
print(talk_to_chloe("what is my name?", chloe, opt, infield, outfield))

training on cpu
0m: epoch 0 loss = 0.263
0m: epoch 1 loss = 0.136
0m: epoch 2 loss = 0.173
0m: epoch 3 loss = 0.277
0m: epoch 4 loss = 0.193
0m: epoch 5 loss = 0.148
0m: epoch 6 loss = 0.152
0m: epoch 7 loss = 0.113
0m: epoch 8 loss = 0.131
0m: epoch 9 loss = 0.080
0m: epoch 10 loss = 0.064
0m: epoch 11 loss = 0.167
0m: epoch 12 loss = 0.122
0m: epoch 13 loss = 0.100
0m: epoch 14 loss = 0.159
0m: epoch 15 loss = 0.081
0m: epoch 16 loss = 0.087


0m: epoch 0 loss = 0.446
0m: epoch 1 loss = 0.719
0m: epoch 2 loss = 0.354
0m: epoch 3 loss = 0.475
0m: epoch 4 loss = 0.280
0m: epoch 5 loss = 0.335
0m: epoch 6 loss = 0.250
0m: epoch 7 loss = 0.215
0m: epoch 8 loss = 0.194
0m: epoch 9 loss = 0.190
0m: epoch 10 loss = 0.190
0m: epoch 11 loss = 0.193
0m: epoch 12 loss = 0.192
0m: epoch 13 loss = 0.193
0m: epoch 14 loss = 0.191
0m: epoch 15 loss = 0.191
0m: epoch 16 loss = 0.193
0m: epoch 17 loss = 0.191
0m: epoch 18 loss = 0.191
0m: epoch 19 loss = 0.190
> my name is fluffy > hello fluffy !
> what is my name? > hello bobo !
> my name is snuggles > hello snuggles !
> what is my name? > hello bobo !
> my name is bobo > its bobo silly
> what is my name? > its bobo silly


Next we need to train the memory. How do we do this? We need to talk to the model and let it accumulate at least one cycle of conversation, then teach it to respond correctly given the previous listen-reply exchange.

meowci beaucoup !


thank am hi
<unk> meowci <unk>
thank meowci hi
<unk> meowci <unk>
