## QA over unstructured data

Using Match LSTM, Pointer Networks, as mentioned in paper https://arxiv.org/pdf/1608.07905.pdf

We start with the pre-processing provided by https://github.com/MurtyShikhar/Question-Answering to clean up the data and make neat para, ques files.

In [34]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable

device = torch.device("cpu")

In [35]:
# Macros 
DATA_LOC = './data/squad/'
DEBUG = True

# nn Macros
QUES_LEN, PARA_LEN =  4, 9
VOCAB_SIZE = 3000                    # @TODO: get actual size
HIDDEN_DIM = 10
EMBEDDING_DIM = 30
BATCH_SIZE = 12


dummy_para = torch.randint(0,VOCAB_SIZE-1,(PARA_LEN*BATCH_SIZE,)).view(BATCH_SIZE,PARA_LEN).long()
print (dummy_para.shape)
dummy_question = torch.randint(0,VOCAB_SIZE-1,(QUES_LEN*BATCH_SIZE,)).view(BATCH_SIZE,QUES_LEN).long()
print (dummy_question.shape)

torch.Size([12, 9])
torch.Size([12, 4])


### Encoder 
Use a simple lstm class to have encoder for question and paragraph. 
The output of these will be used in the match lstm

$H^p = LSTM(P)$ 


$H^q = LSTM(Q)$

In [36]:
class Encoder(nn.Module):
    
    def __init__(self, inputlen, hiddendim, embeddingdim, vocablen):
        super(Encoder, self).__init__()
        
        # Catch dim
        self.inputlen, self.hiddendim, self.embeddingdim, self.vocablen = inputlen, hiddendim, embeddingdim, vocablen
        
        # Embedding Layer
        self.embedding = nn.Embedding(self.vocablen, self.embeddingdim)
       
        # LSTM Layer
        self.lstm = nn.LSTM(self.embeddingdim, self.hiddendim)
        
    def init_hidden(self):
        
        # Returns a new hidden layer var for LSTM
        return (torch.zeros(1, BATCH_SIZE, self.hiddendim), torch.zeros(1, BATCH_SIZE, self.hiddendim))
    
    def forward(self, x, h):
        
        # Input: x (1, batch, ) (current input)
        # Hidden: h (1, batch, hiddendim) (last hidden state)
        
        if DEBUG: print("x: ", x.shape)
        if DEBUG: print("h: ", h[0].shape, h[1].shape)
        
        x_emb = self.embedding(x)
        if DEBUG: print("x_embedded: ", x_emb.shape)
            
        ycap, h = self.lstm(x_emb.view(-1, BATCH_SIZE, self.embeddingdim), h)
        if DEBUG: print("ycap: ", ycap.shape)
        
        return ycap, h
    
    
# with torch.no_grad():
#     print ("Trying out question encoder LSTM")
#     model = Encoder(QUES_LEN, HIDDEN_DIM, EMBEDDING_DIM, VOCAB_SIZE)
#     dummy_x = torch.tensor([22,45,12], dtype=torch.long)
#     hidden = model.init_hidden()
#     ycap, h = model(dummy_x, hidden)
    
#     print(ycap.shape)
#     print(h[0].shape, h[1].shape)

with torch.no_grad():
    print("LSTM with batches")
    ques_model = Encoder(QUES_LEN, HIDDEN_DIM, EMBEDDING_DIM, VOCAB_SIZE)
    para_model = Encoder(QUES_LEN, HIDDEN_DIM, EMBEDDING_DIM, VOCAB_SIZE)
    ques_hidden = ques_model.init_hidden()
    para_hidden = para_model.init_hidden()
    ques_embedded,hidden_ques = ques_model(dummy_question,ques_hidden)
    para_embedded,hidden_para = para_model(dummy_para,para_hidden)

LSTM with batches
x:  torch.Size([12, 4])
h:  torch.Size([1, 12, 10]) torch.Size([1, 12, 10])
x_embedded:  torch.Size([12, 4, 30])
ycap:  torch.Size([4, 12, 10])
x:  torch.Size([12, 9])
h:  torch.Size([1, 12, 10]) torch.Size([1, 12, 10])
x_embedded:  torch.Size([12, 9, 30])
ycap:  torch.Size([9, 12, 10])


In [37]:
print (ques_embedded.shape) # question_length,batch,embedding_dim
print (para_embedded.shape) # para_length,batch,embedding_dim
print (hidden_para[0].shape,hidden_para[1].shape)

torch.Size([4, 12, 10])
torch.Size([9, 12, 10])
torch.Size([1, 12, 10]) torch.Size([1, 12, 10])


In [40]:
# QuesEncoder = Encoder(QUES_LEN, HIDDEN_DIM, EMBEDDING_DIM, VOCAB_SIZE)
# ParaEncoder = Encoder(PARA_LEN, HIDDEN_DIM, EMBEDDING_DIM, VOCAB_SIZE)

### Match LSTM

Use a match LSTM to compute a **summarized sequential vector** for the paragraph w.r.t the question.

Consider the summarized vector ($H^r$) as the output of a new decoder, where the inputs are $H^p, H^q$ computed above. 

1. Attend the para word $i$ with the entire question ($H^q$)
  
    1. $\vec{G}_i = tanh(W^qH^q + repeat(W^ph^p_i + W^r\vec{h^r_{i-1} + b^p}))$
    
    2. *Computing it*: Here, $\vec{G}_i$ is equivalent to `energy`, computed differently.
    
    3. Use a linear layer to compute the content within the $repeat$ fn.
    
    4. Add with another linear (without bias) with $H_q$
    
    5. $tanh$ the bloody thing
  
  
2. Softmax over it to get $\alpha$ weights.

    1. $\vec{\alpha_i} = softmax(w^t\vec{G}_i + repeat(b))$
    
3. Use the attention weight vector $\vec{\alpha_i}$ to obtain a weighted version of the question and combine it with the current token of the passage to form a vector $\vec{z_i}$

[@TODO]

In [42]:
class MatchLSTMEncoder(nn.Module):
    
    def __init__(self, hidden_dim, ques_len ):
        
        super(MatchLSTMEncoder, self).__init__()
        
        self.hidden_dim, self.ques_len = hidden_dim, ques_len
        
        # Catch lens and params
        self.lin_g_repeat = nn.Linear(2*self.hidden_dim, hidden_dim)
        self.lin_g_nobias = nn.Linear(self.hidden_dim, hidden_dim)
        
        self.alpha_i_w = nn.Parameter(torch.FloatTensor(self.hidden_dim, 1))
        self.alpha_i_b= nn.Parameter(torch.FloatTensor((1)))
        
        self.lstm_summary = nn.LSTM(self.hidden_dim*(self.ques_len+2), self.hidden_dim)
                                      
    
    def forward(self, h_pi, h_ri, H_q, hidden):
        
        # For h_r i
        # encoded para word: h_pi (1, batch, hidden_dim  )
        # encoded ques all: H_q (seqlen, batch, hidden_dim)
        # last hidden state: h_ri (1, batch, hidden_dim) (i-1th) 
        
        if DEBUG:
            print( "h_pi: \t\t  ", h_pi.shape)
            print( "h_ri: \t\t  ", h_ri.shape)
            print( "H_q: \t\t  ", H_q.shape)
        
        lin_repeat_input = torch.cat((h_pi, h_ri), dim=2)
        if DEBUG: print("lin_repeat_input: ", lin_repeat_input.shape)
        
        lin_g_input_b = self.lin_g_repeat(lin_repeat_input)
        if DEBUG: print("lin_g_input_b unrepeated: ", lin_g_input_b.shape)
            
        lin_g_input_b = lin_g_input_b.repeat(H_q.shape[0], 1, 1)
        if DEBUG: print("lin_g_input_b: \t", lin_g_input_b.shape)
            
        # lin_g_input_a = self.lin_g_nobias.matmul(H_q.view(-1, self.ques_len, self.hidden_dim)) #self.lin_g_nobias(H_q)
        lin_g_input_a =  self.lin_g_nobias(H_q)
        if DEBUG: print("lin_g_input_a: ", lin_g_input_a.shape)
            
        G_i = F.tanh(lin_g_input_a + lin_g_input_b)
        if DEBUG: print("G_i: ", G_i.shape)
        # Note; G_i should be a 1D vector over ques_len
        
        # Attention weights
        alpha_i_input_a = G_i.view(BATCH_SIZE, -1, self.hidden_dim).matmul(self.alpha_i_w).view(BATCH_SIZE, 1, -1)
        if DEBUG: print("alpha_i_input_a: ", alpha_i_input_a.shape)
            
        alpha_i_input = alpha_i_input_a.add_(self.alpha_i_b.view(-1,1,1).repeat(1,1,self.ques_len))
        if DEBUG: print("alpha_i_input: ", alpha_i_input.shape)
        
        # Softmax over alpha inputs
        alpha_i = F.softmax(alpha_i_input, dim=-1)
        if DEBUG: print("alpha_i: ", alpha_i.shape)
            
        # Weighted summary of question with alpha    
        z_i_input_b = (
                        H_q.view(BATCH_SIZE, QUES_LEN, -1) *
                       (alpha_i.view(BATCH_SIZE, self.ques_len, -1).repeat(1,1,self.hidden_dim))
                      ).view(self.ques_len,BATCH_SIZE,-1)
        if DEBUG: print("z_i_input_b: ", z_i_input_b.shape)
            
        z_i = torch.cat((h_pi, z_i_input_b), dim=0)
        if DEBUG: print("z_i: ", z_i.shape)
                        
        # Pass z_i, h_ri to the LSTM 
        lstm_input = torch.cat((z_i.view(1,BATCH_SIZE,-1), h_ri), dim=2)
        if DEBUG: print("lstm_input: ", lstm_input.shape)
        
        h_ri, hidden = self.lstm_summary(lstm_input, hidden)
        if DEBUG:
            print("h_ri new: ", h_ri.shape)
            print("hidden new: ", hidden[0].shape, hidden[1].shape)
        
        
        return h_ri, hidden
    
    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, BATCH_SIZE, self.hidden_dim),
                torch.zeros(1, BATCH_SIZE, self.hidden_dim))

# with torch.no_grad():
#     model = MatchLSTMEncoder(HIDDEN_DIM, QUES_LEN)
#     h_pi = torch.randn(1, BATCH_SIZE, HIDDEN_DIM)
#     h_ri = torch.randn(1, BATCH_SIZE, HIDDEN_DIM)
#     hidden = model.init_hidden()
#     H_q = torch.randn(QUES_LEN, BATCH_SIZE, HIDDEN_DIM)
    
#     op, hid = model(h_pi, h_ri, H_q, hidden)
    
#     print("\nDone:op", op.shape)
#     print("Done:hid", hid[0].shape, hid[1].shape)
    
with torch.no_grad():
    matchLSTMEncoder = MatchLSTMEncoder(HIDDEN_DIM, QUES_LEN)
    hidden = matchLSTMEncoder.init_hidden()
    h_ri = torch.randn(1, BATCH_SIZE, HIDDEN_DIM)
    if DEBUG:
        print ("init h_ri shape is: ", h_ri.shape)
        print ("the para length is ", len(para_embedded))
    for i in range(len(para_embedded)):
        h_ri, hidden =  matchLSTMEncoder(para_embedded[i].view(1,BATCH_SIZE,-1), h_ri, ques_embedded, hidden)
        para_embedded[i] = h_ri
        DEBUG = False
    DEBUG = True
                                                        

init h_ri shape is:  torch.Size([1, 12, 10])
the para length is  9
h_pi: 		   torch.Size([1, 12, 10])
h_ri: 		   torch.Size([1, 12, 10])
H_q: 		   torch.Size([4, 12, 10])
lin_repeat_input:  torch.Size([1, 12, 20])
lin_g_input_b unrepeated:  torch.Size([1, 12, 10])
lin_g_input_b: 	 torch.Size([4, 12, 10])
lin_g_input_a:  torch.Size([4, 12, 10])
G_i:  torch.Size([4, 12, 10])
alpha_i_input_a:  torch.Size([12, 1, 4])
alpha_i_input:  torch.Size([12, 1, 4])
alpha_i:  torch.Size([12, 1, 4])
z_i_input_b:  torch.Size([4, 12, 10])
z_i:  torch.Size([5, 12, 10])
lstm_input:  torch.Size([1, 12, 60])
h_ri new:  torch.Size([1, 12, 10])
hidden new:  torch.Size([1, 12, 10]) torch.Size([1, 12, 10])


In [39]:
print("para embedded dim are :",para_embedded.shape)

para embedded dim are : torch.Size([9, 12, 10])


### Pointer Network

Using a ptrnet over $H_r$ to unfold and get most probable spans.
We use the **boundry model** to do that (predict start and end of seq).

A simple energy -> softmax -> decoder. Where softmaxed energy is supervised.

In [47]:
class PointerDecoder(nn.Module):
    
    def __init__(self, hidden_dim):
        super(PointerDecoder, self).__init__()
        
        # Keep args
        self.hidden_dim = hidden_dim
        
        self.lin_f_repeat = nn.Linear(self.hidden_dim, self.hidden_dim)
        self.lin_f_nobias = nn.Linear(self.hidden_dim, self.hidden_dim, bias=False)
        
        self.beta_k_w = nn.Parameter(torch.FloatTensor(self.hidden_dim, 1))
        self.beta_k_b = nn.Parameter(torch.FloatTensor(1))
        
        self.lstm = nn.LSTM(self.hidden_dim*(PARA_LEN+1), self.hidden_dim)

    
    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))
    
    def forward(self, h_ak, H_r, hidden):
        
        # h_ak (current decoder's last op) (1,batch,hiddendim)
        # H_r (weighted summary of para) (P, batch, hiddendim)
        
        if DEBUG:
            print("h_ak: \t\t", h_ak.shape)
            print("H_r: \t\t", H_r.shape)
            print("hidden: \t", hidden[0].shape, hidden[1].shape)
            
        # Prepare inputs for the tanh used to compute energy
        f_input_b = self.lin_f_repeat(h_ak)
        if DEBUG: print("f_input_b unrepeated: ", f_input_b.shape)
        
        #H_r shape is ([PARA_LEN, BATCHSIZE, EmbeddingDIM])
        f_input_b = f_input_b.repeat(H_r.shape[0], 1, 1)
        if DEBUG: print("f_input_b repeated: ", f_input_b.shape)
            
        f_input_a = self.lin_f_nobias(H_r)
        if DEBUG: print("f_input_a: ", f_input_a.shape)
            
        # Send it off to tanh now
        F_k = F.tanh(f_input_a+f_input_b)
        if DEBUG: print("F_k:\t", F_k.shape) #PARA_LEN,BATCHSIZE,EmbeddingDim
            
        # Attention weights
        beta_k_input_a = F_k.view(BATCH_SIZE, -1, self.hidden_dim).matmul(self.beta_k_w).view(BATCH_SIZE, 1, -1)
        if DEBUG: print("beta_k_input_a: ", beta_k_input_a.shape)
            
        beta_k_input = beta_k_input_a.add_(self.beta_k_b.repeat(1,1,PARA_LEN))
        if DEBUG: print("beta_k_input: ", beta_k_input.shape)
            
        beta_k = F.softmax(beta_k_input, dim=-1)
        if DEBUG: print("beta_k: ", beta_k.shape)
            
        lstm_input_a = H_r.view(BATCH_SIZE, PARA_LEN, -1) * (beta_k.view(BATCH_SIZE, PARA_LEN, -1).repeat(1,1,self.hidden_dim))
        if DEBUG: print("lstm_input_a: ", lstm_input_a.shape)
            
        lstm_input = torch.cat((lstm_input_a.view(1, BATCH_SIZE,-1), h_ak.view(1, BATCH_SIZE, -1)), dim=2)
        if DEBUG: print("lstm_input: ", lstm_input.shape)
        
        h_ak, hidden = self.lstm(lstm_input, hidden)
        
        return h_ak, hidden, beta_k
        
        
        return "Poop"
    
    
with torch.no_grad():
    pointerDecoder = PointerDecoder(HIDDEN_DIM)
    h_ak = torch.randn(1,BATCH_SIZE,HIDDEN_DIM)
#     H_r = torch.randn(PARA_LEN, BATCH_SIZE, HIDDEN_DIM)
    pointerHidden = pointerDecoder.init_hidden()
    h_ak, hidden, beta_k = pointerDecoder(h_ak, para_embedded, hidden)

h_ak: 		 torch.Size([1, 12, 10])
H_r: 		 torch.Size([9, 12, 10])
hidden: 	 torch.Size([1, 12, 10]) torch.Size([1, 12, 10])
f_input_b unrepeated:  torch.Size([1, 12, 10])
f_input_b repeated:  torch.Size([9, 12, 10])
f_input_a:  torch.Size([9, 12, 10])
F_k:	 torch.Size([9, 12, 10])
beta_k_input_a:  torch.Size([12, 1, 9])
beta_k_input:  torch.Size([12, 1, 9])
beta_k:  torch.Size([12, 1, 9])
lstm_input_a:  torch.Size([12, 9, 10])
lstm_input:  torch.Size([1, 12, 100])


In [7]:
def create_dummy_data(batch_size,dimension,vocab_size,max_passage_length=50,max_question_length=10):
    '''
        Create dummy data of given batch size. If batch size is -1 then the function returns a pair of 
        passage and question
    '''
    #@TODO: Implement logic for batch != -1
    min_index = 1
    max_index = vocab_size
    
    if batch_size == -1:
        passage_length = torch.LongTensor(1).random_(30, max_passage_length).item()
        passage_node = torch.randint(min_index,max_index,(passage_length,))
        question_length = torch.LongTensor(1).random_(7, max_question_length).item()
        question_node = torch.randint(min_index,max_index,(question_length,))
        return passage_node,question_node

In [10]:
print (create_dummy_data(-1,10,10,max_passage_length=50,max_question_length=10))

(tensor([ 7.,  6.,  6.,  8.,  3.,  9.,  7.,  3.,  3.,  8.,  7.,  2.,
         6.,  2.,  6.,  6.,  5.,  9.,  4.,  3.,  1.,  3.,  4.,  3.,
         1.,  1.,  6.,  8.,  3.,  6.,  8.]), tensor([ 9.,  9.,  9.,  2.,  3.,  5.,  2.]))


In [18]:
QUES_LEN = 15
PARA_LEN = 50
EMBEDDING_DIM = 5
VOCAB_SIZE = 30

quesEncoder = Encoder(QUES_LEN, HIDDEN_DIM, EMBEDDING_DIM, VOCAB_SIZE)
paraEncoder = Encoder(PARA_LEN, HIDDEN_DIM, EMBEDDING_DIM, VOCAB_SIZE)
matchLSTMEncoder = MatchLSTMEncoder(HIDDEN_DIM, QUES_LEN)
pointerDecoder = PointerDecoder(HIDDEN_DIM)


dummy_question = torch.randint(0,VOCAB_SIZE-1,(QUES_LEN,)).view(1,1,QUES_LEN).long()
dummy_passage = torch.randint(0,VOCAB_SIZE-1,(PARA_LEN,)).view(1,1,PARA_LEN).long()


# Init hidden state of each model
quesEncoderHidden = quesEncoder.init_hidden()
paraEncoderHidden = quesEncoder.init_hidden()
pointerDecoderHidden = quesEncoder.init_hidden()
matchLSTMEncoderHidden = quesEncoder.init_hidden()




# Passing it through LSTM layer(pre-processing layer)
dummy_ques_encoded, quesEncoderHidden = quesEncoder(dummy_question, quesEncoderHidden)
dummy_passage_encoded, passageEncoderHidden = paraEncoder(dummy_passage, paraEncoderHidden)
# Tthe second  is just the most recent hidden state


print ("dummy question encoded shape is :", quesEncoderHidden[0].shape)
print ("dummy paragraph shape is :", passageEncoderHidden[0].shape)


for i in range(len(paraEncoderHidden[0])):
    h_pi = paraEncoderHidden[0][i]
    
    
# Passing it through MatchLSTM to get attended paragraph reperesentation.

# H_q = torch.randn(QUES_LEN, BATCH_SIZE, HIDDEN_DIM)
# h_pi = torch.randn(1, BATCH_SIZE, HIDDEN_DIM)
# h_ri = torch.randn(1, BATCH_SIZE, HIDDEN_DIM)
# op, hid = model(h_pi, h_ri, H_q, hidden)


# def Match_LSTM_Pointer():

#     data = create_dummy_data(-1,10,10,max_passage_length=50,max_question_length=10)

x:  torch.Size([1, 1, 15])
h:  torch.Size([1, 1, 10]) torch.Size([1, 1, 10])
x_embedded:  torch.Size([1, 1, 15, 5])
ycap:  torch.Size([15, 1, 10])
x:  torch.Size([1, 1, 50])
h:  torch.Size([1, 1, 10]) torch.Size([1, 1, 10])
x_embedded:  torch.Size([1, 1, 50, 5])
ycap:  torch.Size([50, 1, 10])
dummy question encoded shape is : torch.Size([1, 1, 10])
dummy paragraph shape is : torch.Size([1, 1, 10])
