In [193]:
import math, time, os, datetime, shutil, pickle

import numpy as np

import torch
from torch import nn
import torch.nn.functional as F

import import_ipynb
from MoveData import *
from EncoderDecoder import *
from Talk import *
from Trainer import *
from LearningDynamics import *

#from Beam import translate_sentence

In [204]:
class EncodeMem(nn.Module):
    '''
    Transformer encoder that remembers the source tokens of the previous call.

    On every forward pass the token ids received on the previous call (the
    "memory") are appended to the current source sequence before embedding,
    so the layers attend over [current tokens, previous tokens].

    self.memory is a plain attribute, not a registered buffer: it is not part
    of the state dict and is not moved by model.cpu() / model.to().
    '''
    def __init__(self, vocab_size, emb_dim, n_layers, heads, dropout):
        '''
        input:
            vocab_size and emb_dim are handed to Embedder
            emb_dim and dropout are handed to PositionalEncoder
            emb_dim, heads and dropout are handed to every EncoderLayer
            n_layers is the number of EncoderLayer clones that are stacked
        '''
        super().__init__()
        
        self.n_layers = n_layers
        self.embed = Embedder(vocab_size, emb_dim)
        self.pe = PositionalEncoder(emb_dim, dropout=dropout)
        self.layers = get_clones(EncoderLayer(emb_dim, heads, dropout), n_layers)
        self.norm = Norm(emb_dim)
        
        # token ids of the previous forward call, None until the first call
        self.memory = None
        
    def reset_memory(self):
        '''
        forget the stored previous sequence, so the next forward pass encodes
        only the tokens it is given (start of a fresh conversation)
        '''
        self.memory = None
        
    def forward(self, source_sequence, source_mask):
        '''
        input:
            source_sequence (sequence of source tokens) of shape (batch size, sequence length)
            source_mask is accepted for interface compatibility but NOT used: the mask
                is rebuilt below from the concatenated sequence
        output: 
            sequence of vectors after embedding, positional encoding, attention and normalization
            shape (batch size, sequence length + length of remembered sequence, embedding dimensions)
        side effects:
            self.source_sequence is set to the concatenated token ids that were encoded
            self.memory is replaced by the current source_sequence (without the old memory)
        '''
        previous = self.memory
        if isinstance(previous, torch.Tensor):
            # memory is not a buffer, so it stays on whatever device it was created on;
            # bring it to the device of the incoming tokens before concatenating
            previous = previous.to(source_sequence.device)
            self.source_sequence = torch.cat([source_sequence, previous], dim=-1)
        else:
            self.source_sequence = source_sequence
            
        # only the current utterance is remembered, memory never grows beyond one turn
        self.memory = source_sequence
        vector_sequence = self.embed(self.source_sequence)    
        vector_sequence = self.pe(vector_sequence)
        # NOTE(review): the mask compares against -1, which is not the '<pad>' index used by
        # callers in this notebook, so presumably every position is attended to - confirm intent
        source_mask = (self.source_sequence != -1).unsqueeze(-2)
        
        for i in range(self.n_layers):
            vector_sequence = self.layers[i](vector_sequence, source_mask)
            
        vector_sequence = self.norm(vector_sequence)
        
        return vector_sequence

In [205]:
class MemoryTransformer(nn.Module):
    '''
    Encoder-decoder model whose encoder (EncodeMem) carries the previous
    source sequence over to the next forward pass.
    '''
    def __init__(self, in_vocab_size, out_vocab_size, emb_dim, n_layers, 
                 heads, mem_slots, dropout):
        '''
        input:
            in_vocab_size, out_vocab_size are the vocabulary sizes handed to the
                encoder and to the decoder / output projection respectively
            emb_dim, n_layers, heads, dropout are handed to both encoder and decoder
            mem_slots is stored on the module but not otherwise used in this class
        '''
        super().__init__()
        
        self.emb_dim = emb_dim
        self.mem_slots = mem_slots
        
        self.encoder = EncodeMem(in_vocab_size, emb_dim, n_layers, heads, dropout)
        self.decoder = Decoder(out_vocab_size, emb_dim, n_layers, heads, dropout)
        self.out = nn.Linear(emb_dim, out_vocab_size)
        
    def repackage_hidden(self, h):
        '''
        input:
            h is a tensor, None, or an arbitrarily nested iterable of those
        output:
            the same structure with every tensor detached from its autograd graph
            (None stays None, iterables come back as tuples)
        '''
        if isinstance(h, torch.Tensor):
            return h.detach()
        elif h is None:
            return None
        else:
            return tuple(self.repackage_hidden(v) for v in h)
        
    def forward(self, in_toks, in_mask, out_toks, out_mask):
        '''
        input:
            in_toks, in_mask are passed to the encoder
            out_toks, out_mask are passed to the decoder
        output:
            the decoder output projected by self.out (one score per output vocabulary entry)
        side effects:
            self.d_output keeps the decoder output before the projection
            the encoder updates its memory and its source_sequence
        '''
        # detach() returns a new tensor, so the result has to be stored back,
        # otherwise the memory keeps referencing the previous step's graph
        self.encoder.memory = self.repackage_hidden(self.encoder.memory)
        in_encoded = self.encoder(in_toks, in_mask)
        # the encoder may have appended its memory to in_toks, so the mask for the decoder
        # is rebuilt from the sequence that was actually encoded
        in_mask = (self.encoder.source_sequence != -1).unsqueeze(-2)
        self.d_output = self.decoder(out_toks, out_mask, in_encoded, in_mask)
        output = self.out(self.d_output)
        return output

In [206]:
def talk_to_model(input_str, model, opt, infield, outfield):
    '''
    input:
        input_str is a string, it is what you want to say to the dialogue model
        model is a encoder, decoder and a last layer linear transformation
        opt is an options object with the maximum length of the output sequence opt.max_len
        infield and outfield are the data.fields that store the vocabulary
    output:
        an output string response from the dialogue model, without the '<sos>' and
        '<eos>' markers; if no '<eos>' is sampled within opt.max_len steps the
        tokens generated so far are returned
    side effects:
        the model is switched to eval mode and moved to the cpu
        calling model.encoder updates the encoder's memory with input_str's tokens
    '''
    model.eval()
    model.cpu()
    input_sequence = string2tensor(input_str, infield) # string to tensor 
    input_mask = (input_sequence != infield.vocab.stoi['<pad>']).unsqueeze(-2) #make input mask
    init_tok = outfield.vocab.stoi['<sos>'] # this is the integer for the start token
    decoder_input = torch.LongTensor([[init_tok]]) # use start token to initiate the decoder
    
    # pure inference: no autograd graph is needed for sampling a reply
    with torch.no_grad():
        encoding = model.encoder(input_sequence, input_mask)
        # the encoder may have appended its memory, so the mask for the decoder is built from
        # the sequence it actually encoded; it does not change while decoding
        input_mask = (model.encoder.source_sequence != -1).unsqueeze(-2)
        
        for pos in range(opt.max_len):
            decoder_input_mask = nopeak_mask(size=pos+1, opt=opt) # make target mask, pos+1 because pos starts at 0
            out = model.out(model.decoder(decoder_input, decoder_input_mask, encoding, input_mask))
            softout = F.softmax(out, dim=-1) 

            distr = Categorical(probs=softout)
            action = distr.sample()[:,-1].unsqueeze(0) # sample from that distribution to get next token
            decoder_input = torch.cat((decoder_input, action), dim=1) 

            if outfield.vocab.itos[action.item()] == '<eos>':
                # drop the leading '<sos>' and the trailing '<eos>'
                reply_ids = decoder_input[0][1:-1].tolist()
                return ' '.join(outfield.vocab.itos[tok] for tok in reply_ids)
        
    # max_len reached without '<eos>': still drop the leading '<sos>'
    reply_ids = decoder_input[0][1:].tolist()
    return ' '.join(outfield.vocab.itos[tok] for tok in reply_ids)

In [233]:
# Options for a single-example, cpu-only run; json2datatools hands back an updated opt
opt = Options(
    batchsize=1,
    device=torch.device("cpu"),
    epochs=20,
    lr=0.001,
    max_len=25,
    save_path='../saved/weights/memory_weights',
)

data_iter, infield, outfield, opt = json2datatools(path='../saved/memory.json', opt=opt)

# model hyperparameters
emb_dim = 8
n_layers = 2
heads = 4
mem_slots = 1
dropout = 0.01

chloe = MemoryTransformer(
    len(infield.vocab),
    len(outfield.vocab),
    emb_dim,
    n_layers,
    heads,
    mem_slots,
    dropout,
)

load_subset_weights(chloe, opt)

# two consecutive turns: the second one is encoded together with the first (encoder memory)
for greeting in ("my name is bobo", "what is my name?"):
    print(talk_to_model(greeting, chloe, opt, infield, outfield))

hi bobo !
you are fluffy pillow


In [236]:
# Reload the saved weights and replay a scripted conversation turn by turn.
# The encoder keeps the previous utterance, so the order of the turns matters.
load_subset_weights(chloe, opt)
chloe.eval()

test_list = [
    " ",
    " my name is chloe",
    " what is my name? ",
    " my name is fluffy ",
    " what is my name? ",
    " my name is snuggles",
    " what is my name? ",
    " my name is bobo ",
    " what is my name? ",
    " ",
]

# NOTE(review): opt.k is not read by any code visible in this notebook - confirm it is still needed
opt.k = 10

for utterance in test_list:
    reply = talk_to_model(utterance, chloe, opt, infield, outfield)
    print(" > ", utterance, " > ", reply)

 >     >  <unk>
 >   my name is chloe  >  hi chloe !
 >   what is my name?   >  you are chloe
 >   my name is fluffy   >  hey fluffy !
 >   what is my name?   >  fluffy pillow
 >   my name is snuggles  >  hello snuggles !
 >   what is my name?   >  snuggles the bunny
 >   my name is bobo   >  hi bobo !
 >   what is my name?   >  you are bobo
 >     >  <unk> ? are chloe


In [235]:

# Scripted conversation used as the training set; the exchanges are presented in this
# order every epoch, one exchange per optimizer step.
conversation_list = [
{"listen":" ", "reply":"so"},
{"listen":"my name is chloe", "reply":"hi chloe!"},
{"listen":"what is my name?", "reply":"you are chloe"},
{"listen":"my name is fluffy", "reply":"hey fluffy!"},
{"listen":"what is my name?", "reply":"fluffy pillow"},
{"listen":"my name is snuggles", "reply":"hello snuggles!"},
{"listen":"what is my name?", "reply":"snuggles the bunny"},
{"listen":"my name is bobo", "reply":"hi bobo!"},
{"listen":"what is my name?", "reply":"you are bobo"},
                    ]

optimizer = torch.optim.Adam(chloe.parameters(), lr=opt.lr, betas=(0.9, 0.98), eps=1e-9)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=5)

sos_tok = torch.LongTensor([[outfield.vocab.stoi['<sos>']]]) 
eos_tok = torch.LongTensor([[outfield.vocab.stoi['<eos>']]]) 

chloe.train()
start = time.time()
# start from +inf so the first epoch always counts as an improvement and gets saved
best_loss = float('inf')
opt.epochs = 50 
for epoch in range(opt.epochs):
    total_loss = 0
    for exchange in conversation_list:
        listen_string = exchange["listen"]
        reply_string = exchange["reply"]
        listen_toks = string2tensor(listen_string, infield)
        reply_toks = string2tensor(reply_string, outfield)
        # decoder input is <sos> + reply, the labels are reply + <eos> (shifted by one)
        reply_start = torch.cat((sos_tok,reply_toks), dim=1)
        reply_labels = torch.cat((reply_toks,eos_tok), dim=1).contiguous().view(-1)
        
        listen_mask, reply_mask = create_masks(listen_toks, reply_start, opt)
        
        optimizer.zero_grad()
        # the forward pass also updates the encoder memory with listen_toks
        logits = chloe(listen_toks, listen_mask, reply_start, reply_mask)
        
        flat_logits = logits.view(-1, logits.size(-1))
        batch_loss = F.cross_entropy(flat_logits, reply_labels, ignore_index = opt.trg_pad)

        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(chloe.parameters(), max_norm = 1.0) 
        optimizer.step()

        total_loss += batch_loss.item()

    epoch_loss = total_loss/len(conversation_list)
    scheduler.step(epoch_loss)

    # checkpoint and report only when the mean epoch loss improves
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        torch.save(chloe.state_dict(), opt.save_path)
        print("%dm: epoch %d loss = %.3f" %((time.time() - start)//60, 
                                        epoch, epoch_loss))
    
print("finished")

0m: epoch 0 loss = 1.321
0m: epoch 1 loss = 0.715
0m: epoch 2 loss = 0.684
0m: epoch 3 loss = 0.632
0m: epoch 4 loss = 0.541
0m: epoch 5 loss = 0.376
0m: epoch 8 loss = 0.362
0m: epoch 10 loss = 0.285
0m: epoch 11 loss = 0.232
0m: epoch 13 loss = 0.219
0m: epoch 16 loss = 0.183
0m: epoch 18 loss = 0.156
0m: epoch 20 loss = 0.140
0m: epoch 34 loss = 0.098
0m: epoch 36 loss = 0.063
0m: epoch 37 loss = 0.057
finished


Next we need to train the memory. How do we do this? We need to talk to the model and allow it to accumulate at least one cycle of conversation, then teach it to respond correctly given the previous listen-reply exchange.