In [1]:
import math, time, os, datetime, shutil, pickle

import numpy as np

import torch
from torch import nn
import torch.nn.functional as F

import import_ipynb
from MoveData import *
from Elements import * 
from Talk import *
from Trainer import *

importing Jupyter notebook from MoveData.ipynb
importing Jupyter notebook from Elements.ipynb
importing Jupyter notebook from Talk.ipynb
importing Jupyter notebook from EncoderDecoder.ipynb
importing Jupyter notebook from Trainer.ipynb


[nltk_data] Downloading package wordnet to /home/carson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [29]:
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, emb_dim, dim_k = None, dropout = 0.1):
        super().__init__()
        
        self.emb_dim = emb_dim
        self.dim_k = dim_k if dim_k else emb_dim // num_heads
        self.num_heads = num_heads
        self.q_linear = nn.Linear(emb_dim,self.dim_k*num_heads)
        self.k_linear = nn.Linear(emb_dim,self.dim_k*num_heads)
        self.v_linear = nn.Linear(emb_dim,self.dim_k*num_heads)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(self.dim_k*num_heads,emb_dim)
    
    def attention(self, q, k, v, dim_k, mask=None, dropout=None, explain=False):
        k = k.transpose(-2, -1)
        if explain: print('q, k', q.shape, k.shape)
        # matrix multiplication is done using the last two dimensions
        # (batch_size,num_heads,q_seq_len,dim_k)X(batch_size,num_heads,dim_k,k_seq_len)
        #(batch_size,num_heads,q_seq_len,k_seq_len)
        scores = torch.matmul(q, k) / math.sqrt(dim_k) 
        if explain: print('scores.shape', scores.shape)
        if mask is not None:
            mask = mask.unsqueeze(1)
            if explain: print('mask.shape', mask.shape)
            scores = scores.masked_fill(mask == 0, -1e9) 
        softscores = F.softmax(scores, dim=-1)
        if dropout is not None: softscores = dropout(softscores)
            
        #(batch_size,num_heads,seq_len,seq_len)X(batch_size,num_heads,seq_len,dim_k)
        output = torch.matmul(softscores, v)
        return output, scores #=(batch_size,num_heads,seq_len,dim_k)
    
    def forward(self, q, k, v, mask=None, explain=False):
        '''
        inputs:
            q has shape (batch size, q_sequence length, embedding dimensions)
            k,v are shape (batch size, kv_sequence length, embedding dimensions)
            source_mask of shape (batch size, 1, kv_sequence length)
        outputs: sequence of vectors, re-represented using attention
            shape (batch size, q_sequence length, embedding dimensions)
        use:
            The encoder layer places the same source vector sequence into q,k,v 
            and source_mask into mask.
            The decoder layer uses this twice, once with decoder inputs as q,k,v 
            and target mask as mask. then with decoder inputs as q, encoder outputs
            as k, v and source mask as mask
        '''
        # k,q,v are each shape (batch size, sequence length, dim_k * num_heads)
        batch_size = q.size(0)
        q = self.q_linear(q)
        k = self.k_linear(k)
        v = self.v_linear(v)
        if explain: print("(batch size, sequence length, dim_k * num_heads)", k.shape)
        # k,q,v are each shape (batch size, sequence length, num_heads, dim_k)
        k = k.view(batch_size,-1,self.num_heads,self.dim_k)
        q = q.view(batch_size,-1,self.num_heads,self.dim_k)
        v = v.view(batch_size,-1,self.num_heads,self.dim_k)
        # transpose to shape (batch_size, num_heads, sequence length, dim_k)
        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)
        if explain: print("(batch_size,num_heads,seq_length,dim_k)",k.shape)
        # calculate attention using function we will define next
        attn, scores = self.attention(q, k, v, self.dim_k, mask, self.dropout, explain)
        if explain: print("attn(batch_size,num_heads,seq_length,dim_k)", attn.shape)
        # concatenate heads and 
        concat=attn.transpose(1,2).contiguous().view(batch_size,-1,self.dim_k*self.num_heads)
        if explain: print("concat.shape", concat.shape)
        # put through final linear layer
        output = self.out(concat)
        if explain: print("MultiHeadAttention output.shape", output.shape)
        return output, scores

In [44]:
class ContextLayer(nn.Module):

    def __init__(self, emb_dim, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(emb_dim)
        self.norm_2 = Norm(emb_dim)
        self.norm_3 = Norm(emb_dim)
        
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
        
        self.attn_1 = MultiHeadAttention(heads, emb_dim, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, emb_dim, dropout=dropout)
        self.ff = FeedForward(emb_dim, dropout=dropout)

    def forward(self, fc_vecs, fc_mask, cn_vecs, cn_mask, explain = False):
        '''
        fc = focus, the sequence of vectors we are re-representing using the context
        cn = context, the sequence of vectors that forms the context
        inputs:
            fc_vecs (batch size, fc_seq_len, emb_dim)
            fc_mask (batch size, fc_seq_len, fc_seq_len)
            cn_vecs (batch size, cn_seq_len, emb_dim)
            cn_mask (batch size, 1, cn_sequence_len)
        ouputs:
            fc_vecs (batch size, fc_seq_len, emb_dim)
        '''
        fc_nrm = self.norm_1(fc_vecs)
        #Self Attention 
        fc_attn, fc_scores = self.attn_1(fc_nrm,fc_nrm,fc_nrm,fc_mask,explain)
        fc_vecs = fc_vecs + self.dropout_1(fc_attn)
        fc_nrm = self.norm_2(fc_vecs)
        #Context Attention 
        fc_attn, fc_scores = self.attn_2(fc_nrm,cn_vecs,cn_vecs,cn_mask,explain)
        fc_vecs = fc_vecs + self.dropout_2(fc_attn)
        fc_nrm = self.norm_3(fc_vecs)
        fc_vecs = fc_vecs + self.dropout_3(self.ff(fc_nrm))
        return fc_vecs

In [45]:
class Context(nn.Module):

    def __init__(self, vocab_size, emb_dim, n_layers, heads, dropout):
        super().__init__()
        self.n_layers = n_layers
        self.embed = Embedder(vocab_size, emb_dim)
        self.pe = PositionalEncoder(emb_dim, dropout=dropout)
        self.layers = get_clones(ContextLayer(emb_dim, heads, dropout), n_layers)
        self.norm = Norm(emb_dim)
    def forward(self,  fc_toks, fc_mask, cn_vecs, cn_mask, explain = False):
        '''
        fc = focus, the sequence of vectors we are re-representing using the context
        cn = context, the sequence of vectors that forms the context
        inputs:
            fc_toks (batch size, fc_seq_len)
            fc_mask (batch size, fc_seq_len, fc_seq_len)
            cn_vecs (batch size, cn_seq_len, emb_dim)
            cn_mask (batch size, 1, cn_sequence_len)
        ouputs:
            fc_vecs (batch size, fc_seq_len, emb_dim)
        '''
        x = self.embed(fc_toks)
        x = self.pe(x)
        for i in range(self.n_layers):
            x = self.layers[i](x, fc_mask, cn_vecs, cn_mask, explain)
        return self.norm(x)

In [46]:
class Memory(nn.Module):
    def __init__(self, in_vocab_size, out_vocab_size, emb_dim, 
                 n_layers, num_heads, mem_slots, dropout):
        
        super().__init__() 
        
        self.mem_slots = mem_slots
        self.emb_dim = emb_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.dim_k = self.emb_dim // self.num_heads
        self.batch_size = None 
        
        with torch.no_grad():
            self.memory = torch.eye(self.mem_slots)
        if self.emb_dim > self.mem_slots:
          difference = self.emb_dim - self.mem_slots
          pad = torch.zeros((self.mem_slots, difference))
          self.memory = torch.cat([self.memory, pad], -1)
        elif self.emb_dim < self.mem_slots:
          self.memory = self.memory[:, :self.emb_dim]
        
        mem_mask = np.ones((1,self.mem_slots)).astype('uint8')
        
        self.mem_mask =  torch.from_numpy(mem_mask) == 1
        
        self.in_mem = Context(in_vocab_size, emb_dim, 
                              n_layers, num_heads, dropout)
        
        self.out_en = Context(out_vocab_size, emb_dim, 
                               n_layers, num_heads, dropout)
        
        self.out = nn.Linear(emb_dim, out_vocab_size)
        
        self.mem_update = MultiHeadAttention(self.num_heads, self.emb_dim, self.dim_k,
                                             self.dropout)
        
    def batch_memory(self,src_seq):
        self.batch_size = src_seq.size(0)
        self.memory = torch.stack([self.memory for _ in range(self.batch_size)])
        self.mem_mask = torch.stack([self.mem_mask for _ in range(self.batch_size)])
        
    def forward(self, in_toks, in_mask, ou_toks, ou_mask, explain = False):
        '''
        in = input, the sequence we are encoding given the memory
        ou = output, the sequence we are predicting the next action for
             given the memory and input
        inputs:
            in_toks (batch size, in_seq_len)
            in_mask (batch size, in_seq_len, in_seq_len)
            ou_toks (batch size, ou_seq_len)
            ou_mask (batch size, 1, ou_sequence_len)
        ouputs:
            ou_toks (batch size, ou_seq_len, out_vocab_size)
        '''
        if self.batch_size == None: self.batch_memory(in_toks)

        en_vecs = self.in_mem(in_toks, in_mask, self.memory, self.mem_mask, explain)

        ou_vecs = self.out_en(ou_toks, ou_mask, en_vecs, in_mask, explain)
        
        logits = self.out(ou_vecs)
        
        mem_dialogue = torch.cat([self.memory, en_vecs, ou_vecs], dim=-2) 
        
        self.memory, scores = self.mem_update(self.memory,mem_dialogue,
                                              mem_dialogue)
        return logits

In [83]:
opt = Options(batchsize=1, device = torch.device("cpu"), epochs=1, lr=0.01, 
              max_len = 20, save_path = '../saved/weights/memory_weights')

data_iter, infield, outfield, opt = json2datatools(path='../saved/pairs.json', opt=opt)

emb_dim, n_layers, num_heads, mem_slots, dropout = 32, 1, 8, 4, 0.01 

chloe = Memory(len(infield.vocab), len(outfield.vocab), 
               emb_dim, n_layers, num_heads, mem_slots, dropout)

In [84]:
conversation_list = [
    {"listen":"my name is fluffy", "reply":"hello fluffy!"},
    {"listen":"what is my name?", "reply":"its fluffy silly"},
    {"listen":"my name is snuggles", "reply":"hello snuggles!"},
    {"listen":"what is my name?", "reply":"its snuggles silly"},
                    ]

In [85]:
optimizer = torch.optim.Adam(chloe.parameters(),lr=opt.lr,betas=(0.9, 0.98),eps=1e-9)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,'min',factor=0.9,patience=3)
sos_tok = torch.LongTensor([[outfield.vocab.stoi['<sos>']]]) 
eos_tok = torch.LongTensor([[outfield.vocab.stoi['<eos>']]]) 

model.train()
start = time.time()
best_loss = 100
for epoch in range(opt.epochs):
    total_loss = 0
    for i in range(len(conversation_list)):
        
        listen_toks = string2tensor(conversation_list[i]["listen"], infield)
        reply_toks = string2tensor(conversation_list[i]["reply"], infield)
        reply_start = torch.cat((sos_tok,reply_toks), dim=1)
        reply_labels = torch.cat((reply_toks,eos_tok), dim=1)
        listen_mask, reply_mask = create_masks(listen_toks, reply_start, opt)
        en_vecs = model.in_mem(listen_toks,listen_mask,model.memory,model.mem_mask)

        for pos in range(opt.max_len):
            
            decoder_input = reply_start[:,:pos+1]
            reply_labels_sofar = reply_labels[:,:pos+1].contiguous().view(-1)
            
            decoder_input_mask = nopeak_mask(size=pos+1, opt=opt) 
            
            ou_vecs = model.out_en(decoder_input, decoder_input_mask, en_vecs, listen_mask)

            mem_dialogue = torch.cat([model.memory, en_vecs, ou_vecs], dim=-2) 

            model.memory, scores = model.mem_update(model.memory,mem_dialogue,mem_dialogue)

            out = model.out(ou_vecs)

            flat_output = out.view(-1, out.size(-1))
            
            optimizer.zero_grad()
            batch_loss = F.cross_entropy(flat_output, reply_labels_sofar, 
                                         ignore_index = opt.trg_pad)
            batch_loss.backward(retain_graph=True)
            optimizer.step()
            
            total_loss += batch_loss.item()

            epoch_loss = total_loss/len(conversation_list)
            scheduler.step(epoch_loss)

            if epoch_loss < best_loss:
                best_loss = epoch_loss
                torch.save(chloe.state_dict(), opt.save_path)
            print("%dm: epoch %d loss = %.3f" %((time.time() - start)//60, 
                                                epoch, epoch_loss))
            total_loss = 0

0m: epoch 0 loss = 1.041
0m: epoch 0 loss = 1.055
0m: epoch 0 loss = 0.826
0m: epoch 0 loss = 0.855


In [86]:
def mem_talk(input_str, model, opt, infield, outfield):
    '''
    input:
        input_str is a string, it is what you want to say to the dialogue model
        model is a Transformer model with encoder, decoder and a last layer linear transformation
        opt is an options object with the maximum length of the output sequence opt.max_len
        infield and outfield are the data.fields that store the vocabulary
    output:
        an output string response from the dialogue model
    
    Note: this version assumes we are evaluating the model on CPU 
    '''
    model.eval()
    input_sequence = string2tensor(input_str, infield) # string to tensor 
    input_mask = (input_sequence != infield.vocab.stoi['<pad>']).unsqueeze(-2) #make input mask
    # use the encoder rerepresent the input
    en_vecs = model.in_mem(input_sequence, input_mask, model.memory, model.mem_mask)
    init_tok = outfield.vocab.stoi['<sos>'] # this is the integer for the start token
    decoder_input = torch.LongTensor([[init_tok]]) # use start token to initiate the decoder
    logprobs = torch.Tensor([[]])
    
    # continue obtaining the next decoder token until decoder outputs and end token or til max_len
    for pos in range(opt.max_len):
        decoder_input_mask = nopeak_mask(size=pos+1, opt=opt) # make target mask, pos+1 casue pos starts at 0
        # the out vector contains the logits that are rebalanced by the softmax
        ou_vecs = model.out_en(decoder_input, decoder_input_mask, en_vecs, input_mask)
        
        mem_dialogue = torch.cat([model.memory, en_vecs, ou_vecs], dim=-2) 
        
        model.memory, scores = model.mem_update(model.memory,mem_dialogue,mem_dialogue)
        
        out = model.out(ou_vecs)
        
        #softout is a categorical probability distribution over the output vocab
        softout = F.softmax(out, dim=-1)
        distr = Categorical(probs=softout)
        action = distr.sample()[:,-1].unsqueeze(0)
        logprob = -distr.log_prob(action)[:,-1].unsqueeze(0)
        # concatenate that token to our running list of output tokens 
        decoder_input = torch.cat((decoder_input, action), dim=1)
        logprobs = torch.cat((logprobs, logprob), dim=1)
        # if the model outputs an end of sentence token, it is done with this sentence
        if outfield.vocab.itos[action] == '<eos>':
            # [0] because we are assuming batch size of 1 
            # [1:-1] excludes the start and end token from the output string 
            de_str = ' '.join([outfield.vocab.itos[tok] for tok in decoder_input[0]])
            return decoder_input, de_str, logprobs

    de_str = ' '.join([outfield.vocab.itos[tok] for tok in decoder_input[0]])
    return decoder_input, de_str, logprobs

In [87]:
input_str = "my name is fluffy"
decoder_input, de_str, logprobs = mem_talk(input_str, chloe, opt, infield, outfield)
print(de_str)

<sos> hi name just <eos>
