## QA over unstructured data

Using Match LSTM, Pointer Networks, as mentioned in paper https://arxiv.org/pdf/1608.07905.pdf

We start with the pre-processing provided by https://github.com/MurtyShikhar/Question-Answering to clean up the data and make neat para, ques files.

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable

# Device handle for tensors/models; fixed to the CPU here.
device = torch.device("cpu")

In [2]:
# Notebook-wide settings
DATA_LOC = './data/squad/'   # directory holding the pre-processed SQuAD files
DEBUG = True                 # when truthy, the modules print intermediate shapes

# Sequence lengths and layer sizes used by the networks below
QUES_LEN = 4
PARA_LEN = 3
VOCAB_SIZE = 3000            # @TODO: get actual size
HIDDEN_DIM = 10
EMBEDDING_DIM = 300
BATCH_SIZE = 1

### Encoder 
Use a simple LSTM class as the encoder for both the question and the paragraph. 
The outputs of these encoders are then fed to the match LSTM.

$H^p = LSTM(P)$ 


$H^q = LSTM(Q)$

In [3]:
class Encoder(nn.Module):
    """Embed a sequence of token ids and run it through a single-layer LSTM.

    The same class is used for the question and the paragraph encoder.

    Args:
        inputlen: nominal sequence length. Only stored on the instance; no
            computation in this class reads it.
        hiddendim: size of the LSTM hidden/cell state.
        embeddingdim: size of each token embedding (LSTM input size).
        vocablen: number of rows in the embedding table.
    """

    def __init__(self, inputlen, hiddendim, embeddingdim, vocablen):
        super(Encoder, self).__init__()

        # Catch dim
        self.inputlen = inputlen
        self.hiddendim = hiddendim
        self.embeddingdim = embeddingdim
        self.vocablen = vocablen

        # Embedding Layer
        self.embedding = nn.Embedding(self.vocablen, self.embeddingdim)

        # LSTM Layer (sequence-first: input is (seq_len, batch, embeddingdim))
        self.lstm = nn.LSTM(self.embeddingdim, self.hiddendim)

    def init_hidden(self, batch_size=None):
        """Return a fresh all-zero ``(h_0, c_0)`` pair for the LSTM.

        Args:
            batch_size: number of sequences per batch. When omitted, the
                notebook-level ``BATCH_SIZE`` is used (the original behaviour).

        Returns:
            Tuple of two tensors, each of shape (1, batch_size, hiddendim),
            allocated with the same device and dtype as the module's weights.
        """
        if batch_size is None:
            batch_size = BATCH_SIZE

        # Follow the weights so the state stays usable after .to(...)/.double().
        ref = self.embedding.weight
        shape = (1, batch_size, self.hiddendim)
        return (torch.zeros(shape, dtype=ref.dtype, device=ref.device),
                torch.zeros(shape, dtype=ref.dtype, device=ref.device))

    def forward(self, x, h):
        """Encode the token ids ``x`` starting from hidden state ``h``.

        Args:
            x: integer tensor of token ids. After embedding it is reshaped to
                (seq_len, batch, embeddingdim), so a 1-D tensor is treated as
                one sequence when the batch size is 1.
            h: ``(h_0, c_0)`` tuple, each (1, batch, hiddendim). The batch
                size is read from this state rather than from a global.

        Returns:
            ``(ycap, h)``: the per-step outputs of shape
            (seq_len, batch, hiddendim) and the final ``(h_n, c_n)`` tuple.
        """
        if DEBUG: print("x: ", x.shape)
        if DEBUG: print("h: ", h[0].shape, h[1].shape)

        x_emb = self.embedding(x)
        if DEBUG: print("x_embedded: ", x_emb.shape)

        # The hidden state fixes the batch size the LSTM expects.
        batch_size = h[0].shape[1]
        ycap, h = self.lstm(x_emb.view(-1, batch_size, self.embeddingdim), h)
        if DEBUG: print("ycap: ", ycap.shape)

        return ycap, h
    
    
# Smoke test: push three token ids through a freshly built question encoder
# and print the resulting shapes. Nothing is trained, so gradients are off.
with torch.no_grad():
    print ("Trying out question encoder LSTM")
    model = Encoder(
        inputlen=QUES_LEN,
        hiddendim=HIDDEN_DIM,
        embeddingdim=EMBEDDING_DIM,
        vocablen=VOCAB_SIZE,
    )
    dummy_x = torch.tensor([22, 45, 12], dtype=torch.long)
    hidden = model.init_hidden()
    ycap, h = model(dummy_x, hidden)

    print(ycap.shape)
    print(*(state.shape for state in h))

Trying out question encoder LSTM
x:  torch.Size([3])
h:  torch.Size([1, 1, 10]) torch.Size([1, 1, 10])
x_embedded:  torch.Size([3, 300])
ycap:  torch.Size([3, 1, 10])
torch.Size([3, 1, 10])
torch.Size([1, 1, 10]) torch.Size([1, 1, 10])


In [4]:
# Two independent Encoder instances (each builds its own embedding and LSTM):
# one for the question, one for the paragraph. They differ only in the
# nominal input length passed as the first argument.
QuesEncoder = Encoder(QUES_LEN, HIDDEN_DIM, EMBEDDING_DIM, VOCAB_SIZE)
ParaEncoder = Encoder(PARA_LEN, HIDDEN_DIM, EMBEDDING_DIM, VOCAB_SIZE)

### Match LSTM

Use a match LSTM to compute a **summarized sequential vector** for the paragraph w.r.t the question.

Consider the summarized vector ($H^r$) as the output of a new decoder, where the inputs are $H^p, H^q$ computed above. 

1. Attend the para word $i$ with the entire question ($H^q$)
  
    1. $\vec{G}_i = \tanh(W^q H^q + repeat(W^p h^p_i + W^r \vec{h}^r_{i-1} + b^p))$
    
    2. *Computing it*: Here, $\vec{G}_i$ is equivalent to `energy`, computed differently.
    
    3. Use a linear layer to compute the content within the $repeat$ fn.
    
    4. Add the output of another linear layer (without bias) applied to $H^q$
    
    5. Apply $\tanh$ to the sum
  
  
2. Softmax over it to get $\alpha$ weights.

    1. $\vec{\alpha_i} = softmax(w^T\vec{G}_i + repeat(b))$
    
3. Use the attention weight vector $\vec{\alpha_i}$ to obtain a weighted version of the question and combine it with the current token of the passage to form a vector $\vec{z_i}$

[@TODO]

In [17]:
class MatchLSTM(nn.Module):
    """One match-LSTM step: attend over the question, then update ``h^r``.

    For a single paragraph position ``i`` the module

    1. builds ``G_i = tanh(W^q H^q + repeat(W^p h^p_i + W^r h^r_{i-1} + b^p))``,
    2. turns it into attention weights ``alpha_i = softmax(w^T G_i + b)``
       over the question positions,
    3. scales every question vector by its weight, concatenates the result
       with ``h^p_i`` and ``h^r_{i-1}``, and feeds that to an LSTM.

    NOTE(review): step 3 concatenates the individually weighted question
    vectors (hence the LSTM input width ``hidden_dim * (ques_len + 2)``)
    instead of summing them into one attention vector -- confirm that this
    is the intended reading of the Match-LSTM formulation.

    Args:
        hidden_dim: size of the encoder outputs and of this LSTM's state.
        ques_len: number of question positions; ``H_q`` passed to
            ``forward`` must have exactly this many steps.
    """

    def __init__(self, hidden_dim, ques_len ):

        super(MatchLSTM, self).__init__()

        self.hidden_dim, self.ques_len = hidden_dim, ques_len

        # W^p h^p_i + W^r h^r_{i-1} + b^p, as one linear layer applied to
        # the concatenation [h^p_i ; h^r_{i-1}].
        self.lin_g_repeat = nn.Linear(2 * self.hidden_dim, self.hidden_dim)
        # W^q H^q: the only bias of G_i lives in lin_g_repeat, so none here.
        self.lin_g_nobias = nn.Linear(self.hidden_dim, self.hidden_dim, bias=False)

        # Attention projection w and scalar bias b. torch.FloatTensor(size)
        # would hand back uninitialised memory, so initialise explicitly.
        self.alpha_i_w = nn.Parameter(torch.empty(self.hidden_dim, 1))
        nn.init.xavier_uniform_(self.alpha_i_w)
        self.alpha_i_b = nn.Parameter(torch.zeros(1))

        self.lstm_summary = nn.LSTM(self.hidden_dim * (self.ques_len + 2), self.hidden_dim)

    def forward(self, h_pi, h_ri, H_q, hidden):
        """Run one match-LSTM step.

        Args:
            h_pi: encoded paragraph word i, shape (1, batch, hidden_dim).
            h_ri: previous summary vector h^r_{i-1}, shape (1, batch, hidden_dim).
            H_q: all encoded question words, shape (ques_len, batch, hidden_dim).
            hidden: ``(h, c)`` state of the summary LSTM, each
                (1, batch, hidden_dim).

        Returns:
            ``(h_ri, hidden)``: the new summary vector of shape
            (1, batch, hidden_dim) and the updated LSTM state tuple.

        Raises:
            ValueError: if ``H_q`` does not contain ``ques_len`` steps.
        """
        if DEBUG:
            print( "h_pi: \t\t  ", h_pi.shape)
            print( "h_ri: \t\t  ", h_ri.shape)
            print( "H_q: \t\t  ", H_q.shape)

        if H_q.shape[0] != self.ques_len:
            raise ValueError(
                "H_q has %d steps but this MatchLSTM was built for ques_len=%d"
                % (H_q.shape[0], self.ques_len))

        # Batch size comes from the inputs, not from a notebook global.
        batch_size = h_pi.shape[1]

        lin_repeat_input = torch.cat((h_pi, h_ri), dim=2)
        if DEBUG: print("lin_repeat_input: ", lin_repeat_input.shape)

        lin_g_input_b = self.lin_g_repeat(lin_repeat_input)
        if DEBUG: print("lin_g_input_b unrepeated: ", lin_g_input_b.shape)

        # Same vector for every question position (expand avoids a copy).
        lin_g_input_b = lin_g_input_b.expand(H_q.shape[0], -1, -1)
        if DEBUG: print("lin_g_input_b: \t", lin_g_input_b.shape)

        lin_g_input_a = self.lin_g_nobias(H_q)
        if DEBUG: print("lin_g_input_a: ", lin_g_input_a.shape)

        G_i = torch.tanh(lin_g_input_a + lin_g_input_b)
        if DEBUG: print("G_i: ", G_i.shape)

        # Attention scores. transpose (not view) makes the tensor batch-first
        # without mixing data between batch elements:
        # (ques_len, batch, H) -> (batch, ques_len, H) -> (batch, ques_len, 1)
        # -> (batch, 1, ques_len)
        alpha_i_input_a = G_i.transpose(0, 1).matmul(self.alpha_i_w).transpose(1, 2)
        if DEBUG: print("alpha_i_input_a: ", alpha_i_input_a.shape)

        # The scalar bias broadcasts over all positions; out-of-place add.
        alpha_i_input = alpha_i_input_a + self.alpha_i_b
        if DEBUG: print("alpha_i_input: ", alpha_i_input.shape)

        # Softmax over the question positions
        alpha_i = F.softmax(alpha_i_input, dim=-1)
        if DEBUG: print("alpha_i: ", alpha_i.shape)

        # Scale each question vector by its attention weight:
        # (batch, ques_len, H) * (batch, ques_len, 1)
        z_i_input_b = H_q.transpose(0, 1) * alpha_i.transpose(1, 2)
        if DEBUG: print("z_i_input_b: ", z_i_input_b.shape)

        # Put h_pi in the same batch-first layout before stacking it on top.
        z_i = torch.cat((h_pi.transpose(0, 1), z_i_input_b), dim=1)
        if DEBUG: print("z_i: ", z_i.shape)

        # Flatten per batch element and append h_ri -> (1, batch, H*(ques_len+2))
        lstm_input = torch.cat((z_i.reshape(1, batch_size, -1), h_ri), dim=2)
        if DEBUG: print("lstm_input: ", lstm_input.shape)

        h_ri, hidden = self.lstm_summary(lstm_input, hidden)
        if DEBUG:
            print("h_ri new: ", h_ri.shape)
            print("hidden new: ", hidden[0].shape, hidden[1].shape)

        return h_ri, hidden

    def init_hidden(self, batch_size=1):
        """Return a fresh all-zero ``(h_0, c_0)`` pair for the summary LSTM.

        Args:
            batch_size: number of sequences per batch; defaults to 1, the
                value that was previously hard-coded.

        Returns:
            Tuple of two tensors shaped (num_layers=1, batch_size, hidden_dim),
            on the same device and dtype as the module's parameters.
        """
        ref = self.alpha_i_w
        shape = (1, batch_size, self.hidden_dim)
        return (torch.zeros(shape, dtype=ref.dtype, device=ref.device),
                torch.zeros(shape, dtype=ref.dtype, device=ref.device))

# Smoke test: run a single match-LSTM step on random inputs of the configured
# sizes and print the output shapes. Nothing is trained, so gradients are off.
with torch.no_grad():
    model = MatchLSTM(hidden_dim=HIDDEN_DIM, ques_len=QUES_LEN)

    # Random stand-ins for one encoded paragraph word and the previous summary.
    h_pi = torch.randn(1, BATCH_SIZE, HIDDEN_DIM)
    h_ri = torch.randn(1, BATCH_SIZE, HIDDEN_DIM)
    hidden = model.init_hidden()
    # Random stand-in for the fully encoded question.
    H_q = torch.randn(QUES_LEN, BATCH_SIZE, HIDDEN_DIM)

    op, hid = model(h_pi, h_ri, H_q, hidden)

    print("\nDone:op", op.shape)
    print("Done:hid", *(state.shape for state in hid))
                                                        

h_pi: 		   torch.Size([1, 1, 10])
h_ri: 		   torch.Size([1, 1, 10])
H_q: 		   torch.Size([4, 1, 10])
lin_repeat_input:  torch.Size([1, 1, 20])
lin_g_input_b unrepeated:  torch.Size([1, 1, 10])
lin_g_input_b: 		 torch.Size([4, 1, 10])
lin_g_input_a:  torch.Size([4, 1, 10])
G_i:  torch.Size([4, 1, 10])
alpha_i_input_a:  torch.Size([1, 1, 4])
alpha_i_input:  torch.Size([1, 1, 4])
alpha_i:  torch.Size([1, 1, 4])
z_i_input_b:  torch.Size([1, 4, 10])
z_i:  torch.Size([1, 5, 10])
lstm_input:  torch.Size([1, 1, 60])
h_ri new:  torch.Size([1, 1, 10])
hidden new:  torch.Size([1, 1, 10]) torch.Size([1, 1, 10])

Done:op torch.Size([1, 1, 10])
Done:hid torch.Size([1, 1, 10]) torch.Size([1, 1, 10])
