In [None]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import pickle
import os


# Pickled embedding matrix that is later handed to Word_Level_Encoder.
# NOTE: pickle.load can execute arbitrary code; only point these paths at trusted files.
with open(r"E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\glove_word_embeddings.pkl", "rb") as input_file:
    emb_matrix = pickle.load(input_file)

# One "<name>_word_index_padded.pkl" file is expected per entry, in this order.
names = ["validation_context","train_context","validation_question","train_question"]
data_dir = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\"

word_index_padded = [
    os.path.join(data_dir + split_name + "_word_index_padded.pkl")
    for split_name in names
]

# Padded word-index matrices, loaded in the same order as `names`.
with open(word_index_padded[0], "rb") as input_file:
    validation_context_word_index_padded = pickle.load(input_file)

with open(word_index_padded[1], "rb") as input_file:
    train_context_word_index_padded = pickle.load(input_file)

with open(word_index_padded[2], "rb") as input_file:
    validation_question_word_index_padded = pickle.load(input_file)

with open(word_index_padded[3], "rb") as input_file:
    train_question_word_index_padded = pickle.load(input_file)

# Masks are 1 wherever the stored index is non-zero and 0 elsewhere
# (index 0 is presumably the padding id - confirm against the preprocessing step).
validation_context_word_mask = validation_context_word_index_padded.ne(0).type(torch.int32)
train_context_word_mask = train_context_word_index_padded.ne(0).type(torch.int32)
validation_question_word_mask = validation_question_word_index_padded.ne(0).type(torch.int32)
train_question_word_mask = train_question_word_index_padded.ne(0).type(torch.int32)


def get_pretrained_embedding(embedding_matrix):
    """Wrap a NumPy embedding matrix in a frozen ``nn.Embedding`` layer.

    Args:
        embedding_matrix: NumPy array; its ``shape`` is unpacked straight into
            the ``nn.Embedding`` constructor.

    Returns:
        An ``nn.Embedding`` whose weight is the matrix converted to float32,
        with gradients disabled.
    """
    layer = nn.Embedding(*embedding_matrix.shape)
    pretrained_weights = torch.from_numpy(embedding_matrix).float()
    # Replace the randomly initialised weight and freeze it in one step.
    layer.weight = nn.Parameter(pretrained_weights, requires_grad=False)
    return layer


def init_lstm_forget_bias(lstm):
    """Reset every bias vector of ``lstm`` to zero, except the forget gate which is set to one.

    Each bias vector is treated as four equal chunks and the second chunk
    (``[n // 4, n // 2)``) is filled with ones. Both the input-hidden and the
    hidden-hidden bias are touched, because every parameter whose name starts
    with ``bias_`` is processed.

    Args:
        lstm: module exposing ``_all_weights`` (lists of parameter names) and
            the named parameters themselves as attributes, e.g. ``nn.LSTM``.
    """
    bias_names = (
        param_name
        for layer_param_names in lstm._all_weights
        for param_name in layer_param_names
        if param_name.startswith('bias_')
    )
    for param_name in bias_names:
        values = getattr(lstm, param_name).data
        size = values.size(0)
        values.fill_(0.)
        # second quarter of the vector = forget gate
        values[size // 4:size // 2].fill_(1.)

class Word_Level_Encoder(nn.Module):
    """Encode padded word-index sequences with frozen embeddings and an LSTM.

    After encoding, a learned sentinel vector is written right after the last
    real token of every sequence, so the output has one more time step than
    the LSTM output.
    """

    def __init__(self, hidden_dim, embedding_matrix, dropout_ratio):
        """
        Args:
            hidden_dim: size of the LSTM hidden state (and of the sentinel).
            embedding_matrix: NumPy matrix passed to ``get_pretrained_embedding``.
            dropout_ratio: dropout probability applied to the LSTM outputs.
        """
        super(Word_Level_Encoder, self).__init__()
        self.hidden_dim = hidden_dim

        self.embedding = get_pretrained_embedding(embedding_matrix)
        self.embedding_dim = self.embedding.embedding_dim

        # Single-layer, unidirectional, batch-first LSTM.
        # nn.LSTM's own `dropout` argument only acts between stacked layers, so
        # with one layer it does nothing except emit a warning; dropout is
        # applied explicitly through `dropout_emb` instead.
        self.encoder = nn.LSTM(self.embedding_dim, hidden_dim, 1, batch_first=True,
                               bidirectional=False)

        init_lstm_forget_bias(self.encoder)
        self.dropout_emb = nn.Dropout(p=dropout_ratio)

        # Learned sentinel vector of size hidden_dim (uniform random init).
        self.sentinel = nn.Parameter(torch.rand(hidden_dim,))

    def forward(self, word_sequence_indexes, word_sequence_mask):
        """Encode a batch and append the sentinel after each sequence.

        Args:
            word_sequence_indexes: 2-D tensor of word indexes, one row per
                sequence (padded).
            word_sequence_mask: 2-D tensor of the same layout; its row sums are
                used as sequence lengths. Every length must be at least 1
                because ``pack_padded_sequence`` rejects empty sequences.

        Returns:
            Tensor of shape ``(batch, T + 1, hidden_dim)`` where ``T`` is the
            longest sequence length in the batch. For each row, position
            ``length`` holds the sentinel and later positions are zero.
        """
        # Sequence lengths; cast to long so they can be used as a scatter index
        # whatever dtype the mask has.
        lengths = torch.sum(word_sequence_mask, 1).long()

        # Sort by decreasing length (required for packing) and remember how to
        # undo the permutation afterwards.
        lengths_sorted, sort_order = torch.sort(lengths, 0, True)
        _, unsort_order = torch.sort(sort_order, 0)

        indexes_sorted = torch.index_select(word_sequence_indexes, 0, sort_order)
        embeddings_sorted = self.embedding(indexes_sorted)

        # pack_padded_sequence expects the lengths on the CPU even when the
        # data lives on another device.
        packed_embeddings = pack_padded_sequence(
            embeddings_sorted, lengths_sorted.cpu(), batch_first=True)

        # Packed input -> packed output.
        packed_output, _ = self.encoder(packed_embeddings)

        # Back to a zero-padded dense tensor, then restore the original order.
        encoded, _ = pad_packed_sequence(packed_output, batch_first=True)
        encoded = encoded.contiguous()
        encoded = torch.index_select(encoded, 0, unsort_order)
        encoded = self.dropout_emb(encoded)

        batch_size = word_sequence_mask.size(0)

        # B x 1 x hidden_dim: the sentinel repeated for every batch row.
        sentinel_rows = self.sentinel.unsqueeze(0).expand(
            batch_size, self.hidden_dim).unsqueeze(1).contiguous()
        # B x 1 x hidden_dim: for every feature, the time index to write to.
        sentinel_positions = lengths.unsqueeze(1).expand(
            batch_size, self.hidden_dim).unsqueeze(1)

        # Extra (zero) time step so that the longest sequence has room for its
        # sentinel; new_zeros keeps device and dtype in sync with the encoding.
        extra_step = encoded.new_zeros(batch_size, 1, self.hidden_dim)
        encoded_with_sentinel = torch.cat([encoded, extra_step], 1)

        # Write the sentinel at index `length` of every sequence.
        encoded_with_sentinel = encoded_with_sentinel.scatter_(
            1, sentinel_positions, sentinel_rows)

        return encoded_with_sentinel
    
    
# Build the word-level encoder and run it once on the first ten validation contexts.
hidden_dim = 300
dropout_ratio = 0.2
encoder = Word_Level_Encoder(hidden_dim, emb_matrix, dropout_ratio)

e = encoder(
    validation_context_word_index_padded.long()[:10],
    validation_context_word_mask[:10],
)

In [None]:
class DynamicDecoder(nn.Module):
    """Iteratively re-estimate the answer start and end positions.

    Several plausible answer spans may exist in a document, each a local
    maximum. Alternating between predicting the start and the end for a fixed
    number of iterations lets the model move away from an initial wrong span.
    """

    def __init__(self, hidden_dim, maxout_pool_size, max_number_of_iterations, dropout_ratio):
        """
        Args:
            hidden_dim: LSTM hidden size ``l``; ``U`` is expected to carry ``2l`` features.
            maxout_pool_size: pool size forwarded to both highway maxout networks.
            max_number_of_iterations: number of start/end refinement steps (must be >= 1).
            dropout_ratio: forwarded to the highway maxout networks.
        """
        super(DynamicDecoder, self).__init__()
        self.max_number_of_iterations = max_number_of_iterations

        # Single-layer, unidirectional, batch-first LSTM fed with the
        # concatenated start/end representations (2l + 2l = 4l features).
        self.decoder = nn.LSTM(4 * hidden_dim, hidden_dim, 1, batch_first=True, bidirectional=False)
        init_lstm_forget_bias(self.decoder)

        self.maxout_start = Highway_Maxout_Network(hidden_dim, maxout_pool_size, dropout_ratio)
        self.maxout_end = Highway_Maxout_Network(hidden_dim, maxout_pool_size, dropout_ratio)

    def forward(self, U, document_word_sequence_mask, answer_start=None, answer_end=None):
        """Run the iterative start/end prediction.

        Args:
            U: document encoding, indexed as ``U[batch, position, :]``.
            document_word_sequence_mask: 2-D mask whose row sums are the
                document lengths; its width must match ``U.size(1)``.
            answer_start: gold start indexes, or ``None`` at inference time.
            answer_end: gold end indexes (used only when ``answer_start`` is given).

        Returns:
            ``(loss, index_start, index_end)``; ``loss`` is ``None`` when
            ``answer_start`` is ``None``. The index tensors have shape ``(batch,)``.
        """
        batch_size = U.size(0)
        has_targets = answer_start is not None

        curr_mask_start, curr_mask_end = None, None
        results_mask_start, results_start = [], []
        results_mask_end, results_end = [], []
        step_losses = []

        # Large negative score on padded positions so they never win the softmax.
        mask_matrix = (1.0 - document_word_sequence_mask.float()) * (-1e30)
        indices = torch.arange(batch_size, dtype=torch.long, device=U.device)

        # Initial guess: start at the first position, end at the last real token.
        start_i_minus_1 = torch.zeros(batch_size, dtype=torch.long, device=U.device)
        end_i_minus_1 = (torch.sum(document_word_sequence_mask, 1) - 1).long()

        # LSTM (h, c) state carried from one iteration to the next.
        hidden_and_current_state_i = None

        start_target = answer_start if has_targets else None
        end_target = answer_end if has_targets else None

        u_start_i_minus_1 = U[indices, start_i_minus_1, :]  # b x 2l

        for _ in range(self.max_number_of_iterations):
            u_end_i_minus_1 = U[indices, end_i_minus_1, :]  # b x 2l
            u_concatenated = torch.cat((u_start_i_minus_1, u_end_i_minus_1), 1)  # b x 4l

            _, hidden_and_current_state_i = self.decoder(
                u_concatenated.unsqueeze(1), hidden_and_current_state_i)
            h_i, _ = hidden_and_current_state_i

            start_i_minus_1, curr_mask_start, step_loss_start = self.maxout_start(
                h_i, U, curr_mask_start, start_i_minus_1, u_concatenated, mask_matrix, start_target)

            # Use the freshly predicted start when estimating the end.
            u_start_i_minus_1 = U[indices, start_i_minus_1, :]  # b x 2l
            u_concatenated = torch.cat((u_start_i_minus_1, u_end_i_minus_1), 1)  # b x 4l

            end_i_minus_1, curr_mask_end, step_loss_end = self.maxout_end(
                h_i, U, curr_mask_end, end_i_minus_1, u_concatenated, mask_matrix, end_target)

            if has_targets:
                step_losses.append(step_loss_start + step_loss_end)

            results_mask_start.append(curr_mask_start)
            results_start.append(start_i_minus_1)
            results_mask_end.append(curr_mask_end)
            results_end.append(end_i_minus_1)

        # For each example pick the prediction of the last iteration whose mask
        # was still set (number of set masks - 1).
        # squeeze(1) rather than squeeze() so a batch of one keeps shape (1,).
        result_pos_start = torch.sum(torch.stack(results_mask_start, 1), 1).long() - 1
        index_start = torch.gather(
            torch.stack(results_start, 1), 1, result_pos_start.unsqueeze(1)).squeeze(1)

        result_pos_end = torch.sum(torch.stack(results_mask_end, 1), 1).long() - 1
        index_end = torch.gather(
            torch.stack(results_end, 1), 1, result_pos_end.unsqueeze(1)).squeeze(1)

        loss = None
        if has_targets:
            # Average the per-step losses over iterations, then over the batch.
            sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
            batch_avg_loss = sum_losses / self.max_number_of_iterations
            loss = torch.mean(batch_avg_loss)

        return loss, index_start, index_end



In [None]:
class Highway_Maxout_Network(nn.Module):
    """Score every document position as a candidate start (or end) index.

    Three maxout layers are stacked, with a highway connection feeding the
    first layer's output into the last one together with the second layer's.
    """

    def __init__(self, hidden_dim, maxout_pool_size, dropout_ratio):
        """
        Args:
            hidden_dim: hidden size ``l``.
            maxout_pool_size: number of linear pieces ``p`` per maxout unit.
            dropout_ratio: accepted for interface compatibility; not used here.
        """
        super(Highway_Maxout_Network, self).__init__()
        self.hidden_dim = hidden_dim
        self.maxout_pool_size = maxout_pool_size

        # Projects [h_i ; u_start ; u_end] (5l) down to l.
        self.r = nn.Linear(5 * hidden_dim, hidden_dim, bias=False)

        self.max_out_layer1 = nn.Linear(3 * hidden_dim, hidden_dim * maxout_pool_size)
        self.max_out_layer2 = nn.Linear(hidden_dim, hidden_dim * maxout_pool_size)
        self.max_out_layer3 = nn.Linear(2 * hidden_dim, maxout_pool_size)

        # Per-example losses: the result is multiplied by a per-example mask in
        # forward(), which is only meaningful if the loss is not averaged over
        # the batch first.
        self.loss = nn.CrossEntropyLoss(reduction='none')

    def forward(self, h_i, U, curr_mask_matrix, index_i_minus_1, u_concatenated, mask_matrix, target=None):
        """Predict one index per example and update the change mask.

        Args:
            h_i: decoder hidden state; reshaped to ``(b, l)``.
            U: document encoding of shape ``(b, m, 2l)``.
            curr_mask_matrix: mask returned by the previous call, or ``None``
                on the first iteration.
            index_i_minus_1: indexes predicted by the previous iteration, ``(b,)``.
            u_concatenated: ``(b, 4l)`` concatenation of the current start and
                end representations.
            mask_matrix: ``(b, m)`` additive mask (large negative on padding).
            target: optional gold indexes ``(b,)``.

        Returns:
            ``(indexes, mask, step_loss)``: predicted indexes ``(b,)``, a mask
            that is set for examples whose prediction changed in this
            iteration, and the masked per-example loss (``None`` without
            ``target``).
        """
        batch_size, max_word_length, _ = list(U.size())

        # [h_i ; u_concatenated] is b x 5l; project it to b x l.
        r = torch.tanh(self.r(torch.cat((h_i.view(-1, self.hidden_dim), u_concatenated), 1)))

        # Repeat r for every document position: b x m x l.
        r_expanded = r.unsqueeze(1).expand(batch_size, max_word_length, self.hidden_dim).contiguous()

        m_t1_input = torch.cat((U, r_expanded), 2).view(-1, 3 * self.hidden_dim)  # b*m x 3l

        m_t1_output = self.max_out_layer1(m_t1_input)  # b*m x p*l
        m_t1_output_resized, _ = m_t1_output.view(-1, self.hidden_dim, self.maxout_pool_size).max(2)  # b*m x l

        m_t2_output = self.max_out_layer2(m_t1_output_resized)  # b*m x l*p
        m_t2_output_resized, _ = m_t2_output.view(-1, self.hidden_dim, self.maxout_pool_size).max(2)  # b*m x l

        # Highway connection: the last layer sees both earlier outputs.
        m_t3_input = torch.cat((m_t1_output_resized, m_t2_output_resized), 1)  # b*m x 2l
        alpha = self.max_out_layer3(m_t3_input)  # b*m x p
        alpha, _ = alpha.max(1)  # b*m
        alpha = alpha.view(-1, max_word_length)  # b x m

        alpha = alpha + mask_matrix  # b x m
        alpha = F.log_softmax(alpha, 1)  # b x m
        _, indexes_for_max_alpha = torch.max(alpha, dim=1)

        if curr_mask_matrix is None:
            # First iteration: every example counts as "changed".
            curr_mask_matrix = (indexes_for_max_alpha == indexes_for_max_alpha)
        else:
            # Examples that already converged are pinned to index 0 on both
            # sides, so they compare equal and stay converged.
            indexes_for_max_alpha = indexes_for_max_alpha * curr_mask_matrix.long()
            index_i_minus_1 = index_i_minus_1 * curr_mask_matrix.long()
            curr_mask_matrix = (indexes_for_max_alpha != index_i_minus_1)

        step_loss = None

        if target is not None:
            step_loss = self.loss(alpha, target)
            step_loss = step_loss * curr_mask_matrix.float()

        return indexes_for_max_alpha, curr_mask_matrix, step_loss

In [None]:
class Fusion_BiLSTM(nn.Module):
    """Bidirectional LSTM that fuses the coattention context over the document."""

    def __init__(self, hidden_dim, dropout_ratio):
        """
        Args:
            hidden_dim: hidden size ``l``; the LSTM reads ``3l`` features and
                emits ``2l`` (two directions).
            dropout_ratio: dropout probability applied to the LSTM outputs.
        """
        super(Fusion_BiLSTM, self).__init__()
        # Single-layer, bidirectional, batch-first LSTM.
        # nn.LSTM's own `dropout` argument only acts between stacked layers, so
        # with one layer it would only emit a warning; `self.dropout` is
        # applied to the outputs instead.
        self.fusion_bilstm = nn.LSTM(3 * hidden_dim, hidden_dim, 1, batch_first=True,
                                     bidirectional=True)
        init_lstm_forget_bias(self.fusion_bilstm)
        self.dropout = nn.Dropout(p=dropout_ratio)

    def forward(self, seq, mask):
        """Run the BiLSTM over the real (unpadded) part of every sequence.

        Args:
            seq: ``(batch, T, 3l)`` input features; ``T`` may exceed the mask
                width (e.g. by the sentinel step), extra steps are ignored.
            mask: 2-D mask whose row sums are the sequence lengths. Every
                length must be at least 1.

        Returns:
            Tensor of shape ``(batch, mask.size(1), 2l)``, zero beyond each
            sequence's length, in the original batch order.
        """
        lengths = torch.sum(mask, 1).long()

        # Sort by decreasing length (required for packing) and remember how to
        # undo the permutation afterwards.
        lengths_sorted, sort_order = torch.sort(lengths, 0, True)
        _, unsort_order = torch.sort(sort_order, 0)

        seq_sorted = torch.index_select(seq, 0, sort_order)

        # pack_padded_sequence expects the lengths on the CPU.
        packed_seq = pack_padded_sequence(seq_sorted, lengths_sorted.cpu(), batch_first=True)

        # Packed input -> packed output.
        packed_output, _ = self.fusion_bilstm(packed_seq)

        # Pad back to the mask width (not just the longest sequence in the
        # batch) so the result always lines up with `mask` downstream.
        fused, _ = pad_packed_sequence(packed_output, batch_first=True,
                                       total_length=mask.size(1))
        fused = fused.contiguous()

        # Restore the original batch order.
        fused = torch.index_select(fused, 0, unsort_order)
        fused = self.dropout(fused)

        return fused

In [None]:
class Coattention_Encoder(nn.Module):
    """Full model: word-level encoding, coattention, fusion BiLSTM and dynamic decoder."""

    def __init__(self, hidden_dim, maxout_pool_size, embedding_matrix, max_dec_steps, dropout_ratio):
        """
        Args:
            hidden_dim: hidden size ``l`` shared by all sub-modules.
            maxout_pool_size: pool size of the decoder's maxout networks.
            embedding_matrix: NumPy matrix of pre-trained word embeddings.
            max_dec_steps: number of decoder refinement iterations.
            dropout_ratio: dropout probability used throughout.
        """
        super(Coattention_Encoder, self).__init__()
        self.hidden_dim = hidden_dim

        # The same encoder is shared by the question and the document.
        self.encoder = Word_Level_Encoder(hidden_dim, embedding_matrix, dropout_ratio)

        # Extra non-linear projection applied to the question encoding only.
        self.question_proj = nn.Linear(hidden_dim, hidden_dim)

        self.fusion_bilstm = Fusion_BiLSTM(hidden_dim, dropout_ratio)
        self.decoder = DynamicDecoder(hidden_dim, maxout_pool_size, max_dec_steps, dropout_ratio)
        self.dropout = nn.Dropout(p=dropout_ratio)

    def forward(self, question_word_sequence_indexes, question_word_sequence_mask, document_word_sequence_indexes, document_word_sequence_mask, span=None):
        """Predict the answer span for a batch of (question, document) pairs.

        Args:
            question_word_sequence_indexes: padded question word indexes.
            question_word_sequence_mask: mask for the question indexes.
            document_word_sequence_indexes: padded document word indexes.
            document_word_sequence_mask: mask for the document indexes.
            span: optional gold answers. Assumed to be a ``(B, 2)`` integer
                tensor with the start index in column 0 and the end index in
                column 1 - TODO confirm against the data pipeline.

        Returns:
            ``(loss, index_start, index_end)`` when ``span`` is given,
            otherwise ``(index_start, index_end)``.
        """
        answer_start, answer_end = None, None
        if span is not None:
            answer_start = span[:, 0]
            answer_end = span[:, 1]

        # m = document length, n = question length (both get +1 for the sentinel).
        Q = self.encoder(question_word_sequence_indexes, question_word_sequence_mask)  # B x (n + 1) x l
        D = self.encoder(document_word_sequence_indexes, document_word_sequence_mask)  # B x (m + 1) x l

        # Non-linear projection of the question; nn.Linear acts on the last axis.
        Q = torch.tanh(self.question_proj(Q))  # B x (n + 1) x l

        # ---- Step 1: affinity matrix ----
        Q_transpose = torch.transpose(Q, 1, 2)  # B x l x (n + 1)
        L = torch.bmm(D, Q_transpose)  # B x (m + 1) x (n + 1)

        # ---- Step 2: attention weights A_Q and question-side summary C_Q ----
        A_Q = F.softmax(L, dim=2)  # B x (m + 1) x (n + 1)
        D_transpose = torch.transpose(D, 1, 2)  # B x l x (m + 1)
        C_Q = torch.bmm(D_transpose, A_Q)  # B x l x (n + 1)

        # ---- Step 3: attention weights A_D and document-side coattention context ----
        L_transpose = torch.transpose(L, 1, 2)
        A_D = F.softmax(L_transpose, dim=2)  # B x (n + 1) x (m + 1)

        # [Q ; C_Q] is B x 2l x (n + 1); multiplying by A_D gives B x 2l x (m + 1).
        C_D = torch.bmm(torch.cat((Q_transpose, C_Q), 1), A_D)
        C_D_transpose = torch.transpose(C_D, 1, 2)  # B x (m + 1) x 2l

        # ---- Step 4: fusion BiLSTM over [C_D ; D] ----
        bi_lstm_input = torch.cat((C_D_transpose, D), 2)  # B x (m + 1) x 3l
        bi_lstm_input = self.dropout(bi_lstm_input)

        # The document mask has no sentinel column, so the sentinel step is
        # dropped by the fusion layer.
        U = self.fusion_bilstm(bi_lstm_input, document_word_sequence_mask)  # B x m x 2l

        loss, index_start, index_end = self.decoder(U, document_word_sequence_mask, answer_start, answer_end)
        if span is not None:
            return loss, index_start, index_end
        return index_start, index_end