In [152]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import logging
import code
import gc
from torch.optim import Adam


There are two ways of letting the model know your intention i.e do you want to train the model or do you want to use the model to evaluate. In case of model.train() the model knows it has to learn the layers and when we use model.eval() it indicates the model that nothing new is to be learnt and the model is used for testing. model.eval() is also necessary because in pytorch if we are using batchnorm and during test if we want to just pass a single image, pytorch throws an error if model.eval() is not specified.

In [411]:
class Train_Model:

    def __init__(self, config, model):
        self.config = config
        self.model = model
        self.loss_function = torch.nn.CrossEntropyLoss()
        self.parameters_trainable = list(filter(lambda p: p.requires_grad, self.model.parameters()))
        self.optimizer = optim.Adam(self.parameters_trainable, lr=self.config.lr)

        self.glove_path = os.path.join(config.data_dir, "glove_word_embeddings.pkl")
        self.num_epochs = config.num_epochs
        self.data_dir = config.data_dir
        self.names = config.names
        self.batch_size = config.batch_size
        self.print_every = config.print_every
        self.max_context_length = config.max_context_length
        self.max_question_length = config.max_question_length

    def get_data(self, batch, is_train=True):
        
        question_word_index_batch = batch.question_word_index_batch

        context_word_index_batch = batch.context_word_index_batch
        
        span_tensor_batch = batch.span_tensor_batch

        if is_train:
            return context_word_index_batch, question_word_index_batch,span_tensor_batch
        else:
            return context_word_index_batch, question_word_index_batch
      
    def get_grad_norm(self, parameters, norm_type=2):
        parameters = list(filter(lambda p: p.grad is not None, parameters))
        total_norm = 0
        for p in parameters:
            param_norm = p.grad.data.norm(norm_type)
            total_norm += param_norm ** norm_type
        total_norm = total_norm ** (1. / norm_type)
        return total_norm

    def get_param_norm(self, parameters, norm_type=2):
        total_norm = 0
        for p in parameters:
            param_norm = p.data.norm(norm_type)
            total_norm += param_norm ** norm_type
        total_norm = total_norm ** (1. / norm_type)
        return total_norm
        
    def train_one_batch(self, batch, model, optimizer, parameters):
        model.train()
        optimizer.zero_grad()
        context_word_index_batch, question_word_index_batch,  span_tensor_batch = self.get_data(batch)
        

        
        context_word_index_padded_per_batch = pad_data(context_word_index_batch)
        question_word_index_padded_per_batch = pad_data(question_word_index_batch)
        

        context_ids = np.array(context_word_index_padded_per_batch) # shape (batch_size, context_len)
        context_mask_per_batch = (context_ids != 0).astype(np.int32) # shape (batch_size, context_len)
        context_word_mask_per_batch_new = torch.from_numpy(context_mask_per_batch)

        question_ids = np.array(question_word_index_padded_per_batch) # shape (batch_size, context_len)
        question_mask_per_batch = (question_ids != 0).astype(np.int32) # shape (batch_size, context_len)
        question_word_mask_per_batch_new = torch.from_numpy(question_mask_per_batch)


        loss, _, _ = model(context_word_index_padded_per_batch,context_word_mask_per_batch_new, question_word_index_padded_per_batch, question_word_mask_per_batch_new, span_tensor_batch)

        l2_reg = None
        for W in parameters:
            if l2_reg is None:
                l2_reg = W.norm(2)
            else:
                l2_reg = l2_reg + W.norm(2)
        loss = loss + config.reg_lambda * l2_reg
        
        print(loss.grad_fn)
        print(loss)

#         loss.backward()

        param_norm = self.get_param_norm(parameters)
        grad_norm = self.get_grad_norm(parameters)
        
        print("param_norm   " ,end = "")
        print(param_norm)

#         clip_grad_norm_(parameters, config.max_grad_norm)
        optimizer.step()

        return loss.item(), param_norm, grad_norm
    
    
    def train(self):


        model = self.model
        parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
        
        optimizer = Adam(parameters, lr=config.lr, amsgrad=True)

        num_parameters = sum(p.numel() for p in parameters)
        logging.info("Number of params: %d" % num_parameters)

        exp_loss, best_dev_f1, best_dev_em = None, None, None

        epoch = 0
        global_step = 0

        logging.info("Beginning training loop...")
        for epoch in range(1):
            epoch_tic = time.time()
            for batch in get_batch_generator(self.data_dir, self.names, self.batch_size, self.max_context_length, self.max_question_length):

                global_step += 1
                iter_tic = time.time()

                
                loss, param_norm, grad_norm = self.train_one_batch(batch, model, optimizer, parameters)

                print("loss for batch" + str(global_step) + " = " + str(loss))
    
                iter_toc = time.time()
                iter_time = iter_toc - iter_tic




            epoch_toc = time.time()
            logging.info("End of epoch %i. Time for epoch: %f" % (epoch, epoch_toc - epoch_tic))

        sys.stdout.flush()

In [418]:
hidden_dim = 100
dropout_ratio = 0.2
maxout_pool_size=16
max_number_of_iterations = 5
with open(r"E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\glove_word_embeddings.pkl", "rb") as input_file:
    embedding_matrix = pickle.load(input_file)
    
with autograd.set_detect_anomaly(True):
    model = DCN_Model(hidden_dim, embedding_matrix, dropout_ratio, maxout_pool_size, max_number_of_iterations)

    # model = model.cpu()
    train_model = Train_Model(config, model)

    train_model.train()

Embedding(106154, 100)


  "num_layers={}".format(dropout, num_layers))


Refilling batches...
Refilling batches took 0.00 seconds


100%|████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 2507.06it/s]


question_representation.(Output to Encoder Layer) ==  torch.Size([5, 18, 100])
context_representation. (Output to Encoder Layer)  ==  torch.Size([5, 304, 100])
size of U.(U is output of Co-attention encoder) ==  torch.Size([5, 303, 200])
<AddBackward0 object at 0x0000024071BD4D30>
tensor(4.5891, grad_fn=<AddBackward0>)
param_norm   tensor(65.6269)
loss for batch1 = 4.589073181152344


100%|████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 2754.80it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 11024.45it/s]


question_representation.(Output to Encoder Layer) ==  torch.Size([11, 21, 100])
context_representation. (Output to Encoder Layer)  ==  torch.Size([11, 304, 100])
size of U.(U is output of Co-attention encoder) ==  torch.Size([11, 303, 200])
<AddBackward0 object at 0x0000024038396B38>
tensor(3.6539, grad_fn=<AddBackward0>)
param_norm   tensor(65.6269)
loss for batch2 = 3.6538546085357666


In [272]:
import tqdm as tqdm
import torch
import random
# import nltk
import numpy as np
import pickle
import sys
import copy
import os.path
import tqdm as tqdm

datapath = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\"

def find_max_length(data):

    """ Finds the maximum sequence length for data 
        Args:
            data: The data from which sequences will be chosen
    """
    temp = 0
    index = 0
    for i, _ in enumerate(data):

        if (len(data[i]) > temp):
            temp = len(data[i])
            index = i
    return temp,index


def pad_data(data):

    """ Pad the data to max_length given
        Args: 
            data: Data that needs to be padded
            max_length : The length to be achieved with padding
        Returns:
            padded_data : Each sequence is padded to make it of length
                          max_length.
    """
    padded_data = []
    max_length,index =  find_max_length(data)

    for lines in tqdm.tqdm(data):
        if (len(lines) < max_length):
            temp = np.lib.pad(lines, (0,max_length - len(lines)),
                'constant', constant_values=0)
        else:
            temp = lines[:max_length]
        padded_data.append(temp)

    padded_data = torch.from_numpy(np.array(padded_data)).type(torch.int64)

    return padded_data


def index_files_using_word_to_index(filename, _dict, max_words):
    
    f = open(filename, "r", encoding="utf-8")

    lines = f.readlines()
    lines  = [l.lower() for l in lines]
    encoded_lines = []
    for l in lines:
        tokens = l.split()
        tokens = tokens[:max_words]
        temp = []
        for t in tokens:
            if t in _dict:
                temp.append(_dict[t])
            else:
                temp.append(1)

        encoded_lines.append(temp[:])

    return encoded_lines

    
with open(r"E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\dictionaries.pkl", "rb") as input_file:
    dictionaries = pickle.load(input_file)
word_to_index = dictionaries["word_to_index"]


In [265]:



from __future__ import absolute_import
from __future__ import division

import torch
import random
import re
import time
import os
import pickle
import tqdm as tqdm

import numpy as np
from six.moves import xrange

class Batch():
    """A class to hold the information needed for a training batch"""
    def __init__(self,names,context_word_index_batch,question_word_index_batch, span_tensor_batch):
        
        self.names = names
        self.context_word_index_batch = context_word_index_batch

        self.question_word_index_batch = question_word_index_batch
        self.span_tensor_batch = span_tensor_batch
        self.batch_size = len(self.context_word_index_batch)





def refill_batches(batches,batch_size,names, max_context_length, max_question_length,context_word_index,question_word_index,span_tensor):

    """

    Adds more batches into the "batches" list.
    Inputs:
      batches: list to add batches to

      names: list containing strings of file names ["train_context","train_question"] or ["validation_context","validation_question"]
      data_dir : paths to {train/dev}.{context/question/answer} data files
      batch_size: integer ==> how big to make the batches
      max_context_length, max_question_length: max length of context and question respectively

    """
    print ("Refilling batches...")
    tic = time.time()
    examples = [] 



        # add to examples
    examples.append((context_word_index, question_word_index, span_tensor))

        
        

    # Make into batches and append to the list batches
    for batch_start in xrange(0, len(examples[0][0]), batch_size):

        # Note: each of these is a list length batch_size of lists of ints (except on last iter when it might be less than batch_size)
        context_word_index_batch = examples[0][0][batch_start:batch_start+batch_size]
        question_word_index_batch = examples[0][1][batch_start:batch_start+batch_size]
        span_tensor_batch = examples[0][2][batch_start:batch_start+batch_size]

        
        batches.append((context_word_index_batch, question_word_index_batch,span_tensor_batch))


    

    # shuffle the batches
    random.shuffle(batches)

    toc = time.time()
    print ("Refilling batches took %.2f seconds" % (toc-tic))
    return batches


def get_batch_generator(data_dir, names, batch_size, max_context_length, max_question_length):
    """
    This function returns a generator object that yields batches.
    The last batch in the dataset will be a partial batch.
    Read this to understand generators and the yield keyword in Python: https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do
    Inputs:
      names: list containing strings of file names = ["train_context","train_question"] or ["validation_context","validation_question"]
      data_dir : paths to {train/dev}.{context/question/answer} data files
      batch_size: integer ==> how big to make the batches
      max_context_length, max_question_length: max length of context and question respectively

    """

    context_path_train = os.path.join(datapath, "train.context")
    question_path_train = os.path.join(datapath, "train.question")


    context_word_index_old = index_files_using_word_to_index(context_path_train, word_to_index, max_context_length)
    question_word_index_old = index_files_using_word_to_index(question_path_train, word_to_index, max_question_length)
    

    with open(data_dir + "//" + "answer_end_pkl.pkl", "rb") as input_file:
        answer_end_pkl = pickle.load(input_file)
    with open(data_dir + "//" + "answer_start_pkl.pkl", "rb") as input_file:
        answer_start_pkl = pickle.load(input_file)


    answer_end = torch.from_numpy(np.array([int(i) for i in answer_end_pkl])).long()
    answer_start = torch.from_numpy(np.array([int(i) for i in answer_start_pkl])).long()              
    answer_start = torch.unsqueeze(answer_start, 1)
    answer_end = torch.unsqueeze(answer_end, 1)

    span_tensor_old = torch.cat((answer_start, answer_end), 1)
    span_tensor = span_tensor_old[67:83]
    context_word_index = context_word_index_old[67:83]
    question_word_index = question_word_index_old[67:83]



    batches = []
    count = 0

    while (True):
        count = count + 1
        if len(batches) == 0: # add more batches
            if(count > 2):
                break
            batches = refill_batches(batches,batch_size,names, max_context_length, max_question_length,context_word_index,question_word_index,span_tensor)
        if len(batches) == 0:
            break

        # Get next batch. These are all lists length batch_size
        (context_word_index_batch, question_word_index_batch,span_tensor_batch) = batches.pop(0)
        

        if(len(context_word_index_batch) == 0):
            break
            


        # Make into a Batch object
        batch = Batch(names,context_word_index_batch, question_word_index_batch, span_tensor_batch)

        yield batch

    return

In [297]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import logging
import code
import pickle
import os
from torch import autograd
from torch.autograd import Variable
from torch.nn import Embedding
from argparse import ArgumentParser


class DCN_Model(nn.Module):

    def __init__(self, hidden_dim, embedding_matrix, dropout_ratio, maxout_pool_size, max_number_of_iterations):
        super(DCN_Model, self).__init__()

        self.encoder = Word_Level_Encoder(hidden_dim, emb_matrix, dropout_ratio)
        self.coattention_encoder = Coattention_Encoder(hidden_dim, maxout_pool_size, embedding_matrix, max_number_of_iterations, dropout_ratio)
        self.decoder = Dynamic_Decoder(hidden_dim, maxout_pool_size, max_number_of_iterations, dropout_ratio)

    def forward(self, context_word_indexes, context_word_mask, question_word_indexes, question_word_mask,span_tensor):
        passage_representation = self.encoder.forward(context_word_indexes, context_word_mask)

        question_representation = self.encoder.forward(question_word_indexes, question_word_mask)
       

        U_matrix = self.coattention_encoder.forward(question_representation, passage_representation,context_word_mask)

        loss, index_start, index_end = self.decoder.forward(U_matrix, context_word_mask, span_tensor)

        return loss, index_start, index_end

In [408]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import pickle
import os


with open(r"E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\glove_word_embeddings.pkl", "rb") as input_file:
    emb_matrix = pickle.load(input_file)
    
names = ["validation_context","train_context","validation_question","train_question"]
data_dir = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\"



def get_pretrained_embedding(embedding_matrix):
    embedding = nn.Embedding(*embedding_matrix.shape)
    embedding.weight = nn.Parameter(torch.from_numpy(embedding_matrix).float())
    embedding.weight.requires_grad = False
    return embedding


class Word_Level_Encoder(nn.Module):
    
    def __init__(self, hidden_dim, embedding_matrix, dropout_ratio):
        super(Word_Level_Encoder, self).__init__()
        self.hidden_dim = hidden_dim

        self.embedding = get_pretrained_embedding(embedding_matrix)
        self.embedding_dim = self.embedding.embedding_dim

        # batch_first = True
        # Input: has a dimension of B x m x embedding_dim
        # Function parameters: input_size, hidden_size, num_layers_of_LSTM = 1(here)
        self.encoder = nn.LSTM(self.embedding_dim, self.hidden_dim, 1, batch_first=True,
                              bidirectional=False, dropout=dropout_ratio) 
                                     
#         self.dropout_emb = nn.Dropout(p=dropout_ratio)
        
        # creates a random vector with size= hidden_dim
        self.sentinel = nn.Parameter(torch.rand(hidden_dim,))

    def forward(self, word_sequence_indexes, word_sequence_mask):
        
        # stores length of per instance for context/question
        # tensor of size = B
        length_per_instance = torch.sum(word_sequence_mask, 1)


        # returns the word_sequences_embeddings with the embeddings for each token/word from word_sequence_indexes
        # word_sequence_embeddings is a tensor of dimension of B x m x l
        word_sequence_embeddings = self.embedding(word_sequence_indexes)
        
        # All RNN modules accept packed sequences as inputs.
        # Input: word_sequence_embeddings has a dimension of B x m x l (l is the size of the glove_embedding/ pre-trained embedding/embedding_dim)
        packed_word_sequence_embeddings = pack_padded_sequence(word_sequence_embeddings,length_per_instance,batch_first=True,enforce_sorted=False)

        
        
        # nn.LSTM encoder gets an input of pack_padded_sequence of dimensions
        # since the input was a packed sequence, the output will also be a packed sequence
        output, _ = self.encoder(packed_word_sequence_embeddings)
       
        
        # Pads a packed batch of variable length sequences.
        # It is an inverse operation to pack_padded_sequence().
        # dimension:  B x m x l
        output_to_LSTM_padded, _ = pad_packed_sequence(output, batch_first=True)
        
        

        # list() creates a list of elements if an iterable is passed
        # batch_size is a scalar which stores the value of batch size. (batch_size = B)
        batch_size, _ = list(word_sequence_mask.size())
        
        
        # dimension of sentinel matrix =  B x 1 x l (replicates or expands along given dimension)
        length_per_instance_new_dim = length_per_instance.unsqueeze(1).expand(batch_size, self.hidden_dim).unsqueeze(1)
        

        # sentinel to be concatenated to the data
        # dimension of sentinel_zero =  B x 1 x l
        sentinel_zero = torch.zeros(batch_size, 1, self.hidden_dim)
        
        # copy sentinel vector at the end
        # dimension of output_to_LSTM_padded_with_sentinel =  B x (m + 1) x l
        output_to_LSTM_padded_with_sentinel = torch.cat([output_to_LSTM_padded, sentinel_zero], 1)  
        
        
        
        return output_to_LSTM_padded_with_sentinel
    
    


In [419]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import pickle
import os

class Highway_Maxout_Network(nn.Module):
    def __init__(self, hidden_dim, maxout_pool_size, dropout_ratio):
        super(Highway_Maxout_Network, self).__init__()
        self.hidden_dim = hidden_dim # l
        self.maxout_pool_size = maxout_pool_size # p

        # Affine mapping from 5l ==> l
        self.r = nn.Linear(5 * hidden_dim, hidden_dim, bias=False) 
       

        # Affine mapping from 3*l ==> l*p
        self.max_out_layer1 = nn.Linear(3 * hidden_dim, hidden_dim*maxout_pool_size)
        
        # Affine mapping from l ==> l*p
        self.max_out_layer2 = nn.Linear(hidden_dim, hidden_dim*maxout_pool_size)
       
        # Affine mapping from 2*l ==> p
        self.max_out_layer3 = nn.Linear(2 * hidden_dim, maxout_pool_size)

        self.loss = nn.CrossEntropyLoss()

    def forward(self, h_i, U, curr_mask_vector, index_i_minus_1, u_concatenated, mask_matrix, target=None):
        batch_size, max_word_length , _ = list(U.size())

        # concatenation of ( h_i of dimension = b x l ; u_concatenated of dimension = b x 4l ) along dimension 1 = gives b x 5l
        # self.r(b x 5l) ====> b x l (change of vector space)
        r = torch.tanh(self.r(torch.cat((h_i.view(-1, self.hidden_dim), u_concatenated), 1)))  # b x 5l => b x l
       

        # hidden_dim = l
        r_expanded = r.unsqueeze(1).expand(batch_size, max_word_length, self.hidden_dim).contiguous()  # b x m x l

        m_t1_input = torch.cat((U, r_expanded), 2).view(-1, 3*self.hidden_dim)  # b*m x 3l

        m_t1_output = self.max_out_layer1(m_t1_input)  # b*m x p*l
        
        m_t1_output_resized, _ = m_t1_output.view(-1, self.hidden_dim, self.maxout_pool_size).max(2) # b*m x l

        # m_t2_input =  m_t1_output_resized
        m_t2_output = self.max_out_layer2(m_t1_output_resized)  # b*m x l*p
        
        m_t2_output_resized, _ = m_t2_output.view(-1, self.hidden_dim, self.maxout_pool_size).max(2)  # b*m x l

        m_t3_input = torch.cat((m_t1_output_resized, m_t2_output_resized), 1)  # b*m x 2l
        alpha1 = self.max_out_layer3(m_t3_input)  # b * m x p
        alpha2, _ = alpha1.max(1)  # b*m
        alpha3 = alpha2.view(-1, max_word_length) # b x m

        alpha3 = alpha3 + mask_matrix  # b x m
        
        # alpha can be treated as probabilities that assign probability masses todifferent words in context. The word with
        # maximum weight(probability) becomes the index(start/end)
        alpha4 = F.softmax(alpha3, 1)  # b x m
        _, index_i = torch.max(alpha4, dim=1) # b

        if curr_mask_vector is None:
            curr_mask_vector = (index_i == index_i) # b
        else:
            index_i = index_i*curr_mask_vector.long()  # b
            index_i_minus_1 = index_i_minus_1*curr_mask_vector.long()  # b
            curr_mask_vector = (index_i != index_i_minus_1) # b

        step_loss = None
        
        

        target[target < 0] = 0
        
        
        ## loss is only calculated only on that the predicted index at i_th time-step which varies 
        ## from the predicted index at time-step (i-1)_th time-step
        if target is not None:
            step_loss = self.loss(alpha4, target)  # b
            step_loss1 = step_loss * curr_mask_vector.float() # b

        return index_i, curr_mask_vector, step_loss1 # all have dimension: b

class Dynamic_Decoder(nn.Module):
    def __init__(self, hidden_dim, maxout_pool_size, max_number_of_iterations, dropout_ratio):
        super(Dynamic_Decoder, self).__init__()
        self.max_number_of_iterations = max_number_of_iterations
        
        # batch_first = True
        # Input: has a dimension of B * m * embedding_dim
        # Function parameters: input_size, hidden_size, num_layers_of_LSTM = 1(here)
        self.decoder = nn.LSTM(4 * hidden_dim, hidden_dim, 1, batch_first=True, bidirectional=False)

        self.maxout_start = Highway_Maxout_Network(hidden_dim, maxout_pool_size, dropout_ratio)
        self.maxout_end = Highway_Maxout_Network(hidden_dim, maxout_pool_size, dropout_ratio)

    def forward(self, U, document_word_sequence_mask,span_tensor):
        batch_size, max_word_length, _ = list(U.size()) # U has dimension : B x m x 2l

        curr_mask_start,  curr_mask_end = None, None
        results_mask_start, results_start = [], []
        results_mask_end, results_end = [], []
        step_losses = []
        

        # dimension = B x m
        mask_matrix = (1.0 - document_word_sequence_mask.float()) * (-1e30)
        
        # dimension = B
        indices = torch.arange(0, batch_size)

        
        # initialize start_i_minus_1, end_i_minus_1: these are the initial values of start and end indices
        # start_i_minus_1 = the first index for the context/question 
        # end_i_minus_1 = the last index for the context/question 
        
        # dimension = B
        start_i_minus_1 = torch.zeros(batch_size).long()
        
        # dimension = B
        end_i_minus_1 = torch.sum(document_word_sequence_mask, 1) - 1

        

        # After every iteration the hidden and current state 
        # at t = length of the sequence (for the one-directional lstm) will
        # be returned by the lstm
        # the hidden_state_i(h_i) will serve as an input to next lstm
        hidden_and_current_state_i = None
        start_target = None
        end_target = None
        
        # this sets the start and end target (ie. the y_label) for an answer
        if span_tensor is not None:
            # Dimension = B
            start_target = span_tensor[:,0]
            
            
            # Dimension = B
            end_target = span_tensor[:,1]
            
            
        # this is just an initialization of u_start
        # u_start_i_minus_1 is essentially u_start_zero outside the loop
        u_start_i_minus_1 = U[indices, start_i_minus_1, :]  # b x 2l
        
        # Why do we need an iterative procedure to predict the start and end indices for na answer ? 
        # Solution: there may exist several intuitive answer spans within the document, each corresponding to a
        # local maxima. An iterative technique to select an answer span by alternating between
        # predicting the start point and predicting the end point. This iterative procedure allows the model to
        # recover from initial local maxima corresponding to incorrect answer spans.
        for _ in range(self.max_number_of_iterations):
            u_end_i_minus_1 = U[indices, end_i_minus_1, :]  # b x 2l
            
            # u_concatenated is fed to the lstm
            u_concatenated = torch.cat((u_start_i_minus_1, u_end_i_minus_1), 1)  # b x 4l

            # the hidden_and_current_state_i = h_i,c_i are essentially hidden and current cell states 
            # for t = length of the sequence (for the one-directional lstm) after every iteration
            lstm_output, hidden_and_current_state_i = self.decoder(u_concatenated.unsqueeze(1), hidden_and_current_state_i)
            h_i, c_i = hidden_and_current_state_i

            # Inputs to the Highway_Maxout_Network(to find start index) are: hidden_state_i(h_i), start_i_minus_1(index), u_concatenated ==>(u_start_i_minus_1;u_end_i_minus_1) 
            start_i_minus_1, curr_mask_start, step_loss_start = self.maxout_start(h_i, U, curr_mask_start, start_i_minus_1,
                                                                u_concatenated, mask_matrix, start_target)
            u_start_i_minus_1 = U[indices, start_i_minus_1, :]  # b x 2l

            u_concatenated = torch.cat((u_start_i_minus_1, u_end_i_minus_1), 1)  # b x 4l

            # Inputs to the Highway_Maxout_Network(to find end index) are: hidden_state_i(h_i), end_i_minus_1(index), u_concatenated ==>(u_start_i_minus_1;u_end_i_minus_1) 
            end_i_minus_1, curr_mask_end, step_loss_end = self.maxout_end(h_i, U, curr_mask_end, end_i_minus_1,
                                                              u_concatenated, mask_matrix, end_target)

            # we minimize the cumulative softmax cross entropy of the start and end points across all iterations.
            if span_tensor is not None:
                step_loss = step_loss_start + step_loss_end
                step_losses.append(step_loss)

            
            results_mask_start.append(curr_mask_start) # appends all the curr_mask_start ==> dimension: b x num_iterations
            results_start.append(start_i_minus_1) # appends all the start_indexes ==> dimension: b x num_iterations
            results_mask_end.append(curr_mask_end) # appends all the curr_mask_end ==> dimension: b x num_iterations
            results_end.append(end_i_minus_1) # appends all the end_indexes ==> dimension: b x num_iterations

        
        result_pos_start1 = torch.sum(torch.stack(results_mask_start, 1), 1).long()

        result_pos_start = result_pos_start1 - 1
        index_start = torch.gather(torch.stack(results_start, 1), 1, result_pos_start.unsqueeze(1)).squeeze()

        result_pos_end1 = torch.sum(torch.stack(results_mask_end, 1), 1).long()

        result_pos_end = result_pos_end1 - 1
        index_end = torch.gather(torch.stack(results_end, 1), 1, result_pos_end.unsqueeze(1)).squeeze()

        loss = None

        if span_tensor is not None:
            sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
            batch_avg_loss = sum_losses / self.max_number_of_iterations
            loss = torch.mean(batch_avg_loss)

            
        return loss, index_start, index_end



In [407]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import pickle
import os


class Coattention_Encoder(nn.Module):
    def __init__(self, hidden_dim, maxout_pool_size, embedding_matrix, max_number_of_iterations, dropout_ratio):
        super(Coattention_Encoder, self).__init__()
        self.hidden_dim = hidden_dim

        ## nn.Linear(input_dim, output_dim)
        # Affine mapping from l ==> l
        self.question_proj = nn.Linear(hidden_dim, hidden_dim)
        
        self.fusion_bilstm = Fusion_BiLSTM(hidden_dim, dropout_ratio)
#         self.dropout = nn.Dropout(p=dropout_ratio)

    def forward(self, question_representation, context_representation,document_word_sequence_mask):
        
        ############## m = max length of instances in one batch of document ;  n= max length of instances in one batch of question ############################33
        Q = question_representation # B x (n + 1) x l
        D = context_representation  # B x (m + 1) x l
        
        print("question_representation.(Output to Encoder Layer) ==  " + str(Q.size()))
        print("context_representation. (Output to Encoder Layer)  ==  " + str(D.size()))

        # view function is meant to reshape the tensor.(Similar to reshape function in numpy)
        # view( row_size = -1 ,means that number of rows are unknown, column_size)
        # pass the Q tensor through a non-linearity 
        Q2 = torch.tanh(self.question_proj(Q.view(-1, self.hidden_dim))).view(Q.size()) #B x (n + 1) x l

        ##################################   Co-Attention starts here  #######################################
        
        ########################################   Step - 1  ##################################################
        # transpose(tensor, first_dimension to be transposed, second_dimension to be transposed)
        Q_transpose = torch.transpose(Q2, 1, 2) #dimension: B x l x (n + 1)
        
        # Performs a batch matrix-matrix product of matrices stored in batch1 and batch2.
        # batch1 and batch2 must be 3-D tensors each containing the same number of matrices.
        L = torch.bmm(D, Q_transpose) # dimension of L : B x (m + 1) x (n + 1)

        ####################################### Step-2 ######################################################
        A_Q = F.softmax(L, dim=2) # B x (m + 1) x (n + 1)


        D_transpose = torch.transpose(D, 1, 2) #dimension: B x l x (m + 1)
        C_Q = torch.bmm(D_transpose, A_Q) # (B x l x (m + 1)) x (B x (m + 1) x (n + 1)) => B x l x (n + 1)

        ####################################### Step-3 #######################################################
        L_tranpose = torch.transpose(L,1,2)
        A_D = F.softmax(L_tranpose, dim=2)  # B x (n + 1) x (m + 1)
        
        
        # concatenation along dimension=1:(B x l x (n + 1) ; B x l x (n + 1)  -----> B x 2l x (n + 1) ) x (B x (n + 1) x (m + 1)) ====> B x 2l x (m + 1)
        C_D = torch.bmm(torch.cat((Q_transpose, C_Q), 1), A_D) # B x 2l x (m + 1)
        C_D_transpose = torch.transpose(C_D, 1, 2)  # B x (m + 1) x 2l

        
        #######################################  Step-4 ##########################################################
        #fusion BiLSTM
        # concatenation along dimension = 2:  (B x (m + 1) x 2l ; B x (m + 1) x l  -----> B x (m + 1) x 3l )
        bi_lstm_input = torch.cat((C_D_transpose, D), 2) # B x (m + 1) x 3l
       
        U = self.fusion_bilstm(bi_lstm_input, document_word_sequence_mask) # B x m x 2l
        
        print("size of U.(U is output of Co-attention encoder) ==  " + str(U.size()))
        
        return U


class Fusion_BiLSTM(nn.Module):
    def __init__(self, hidden_dim, dropout_ratio):
        super(Fusion_BiLSTM, self).__init__()
        
         # batch_first = True
        # Input: has a dimension of B * m * embedding_dim
        # Function parameters: input_size, hidden_size, num_layers_of_LSTM = 1(here)
        self.fusion_bilstm = nn.LSTM(3 * hidden_dim, hidden_dim, 1, batch_first=True,
                                     bidirectional=True, dropout=dropout_ratio)
        
#         self.dropout = nn.Dropout(p=dropout_ratio)

    def forward(self, word_sequence_embeddings, word_sequence_mask):
        
        # stores length of per instance for context/question
        length_per_instance = torch.sum(word_sequence_mask, 1)
        
      
        # All RNN modules accept packed sequences as inputs.
        # Input: word_sequence_embeddings has a dimension of B x m+1 x 3l (l is the size of the glove_embedding/ pre-trained embedding/embedding_dim)
        packed_word_sequence_embeddings = pack_padded_sequence(word_sequence_embeddings, length_per_instance, batch_first=True,enforce_sorted=False)
        
        # since the input was a packed sequence, the output will also be a packed sequence
        output, _ = self.fusion_bilstm(packed_word_sequence_embeddings)
        
        # Pads a packed batch of variable length sequences.
        # It is an inverse operation to pack_padded_sequence().
        # dimension:  B x m x 2l
        output_to_BiLSTM_padded, _ = pad_packed_sequence(output, batch_first=True)


        return output_to_BiLSTM_padded

In [293]:
import os

class Config(object):
    pass

config = Config()
config.data_dir = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\"
config.word_embedding_size = 100
config.hidden_dim = 300
config.dropout_ratio = 0.15
config.max_context_length = 600
config.max_question_length = 30


#vector with zeros for unknown words
config.num_iterations = 2
config.maxout_pool_size=16

config.lr = 0.001
config.dropout_ratio = 0.15

config.max_grad_norm = 5.0
config.batch_size = 11
config.num_epochs = 2

# config.print_every = 100
# config.save_every = 50000000
# config.eval_every = 1000

# config.model_type = 'co-attention'
config.reg_lambda = 0.00007
config.names = ["train_context","train_question"]
config.print_every = 100

In [97]:
with open(r"E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\train_word_index.context_pkl.pkl", "rb") as input_file:
    train_word_index = pickle.load(input_file)


In [98]:
train_word_index = (np.array(train_word_index))

In [72]:
len(train_word_index[81509])

136

In [73]:

train_word_index = np.sort(train_word_index)

In [74]:
len((train_word_index))

130319

In [99]:
def find_max_length(data):

    """ Finds the maximum sequence length for data 
        Args:
            data: The data from which sequences will be chosen

    """
    temp = 0
    index = 0
    for i, _ in enumerate(data):

        if (len(data[i]) > temp):
            temp = len(data[i])
            index = i

    return temp,index
find_max_length(train_word_index)

(700, 81510)

In [101]:
len(train_word_index[81510])

700

In [72]:
loss = nn.CrossEntropyLoss()
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
output = loss(input, target)
output.backward()

In [25]:
input.size()

torch.Size([3, 5])

In [26]:
target.size()

torch.Size([3])

In [30]:
loss = nn.CrossEntropyLoss()
output = loss(input, target)
output.backward()

In [31]:
output

tensor(1.5614, grad_fn=<NllLossBackward>)

In [33]:
input = 0
y_actual = 0

In [36]:
input

0