
There are two ways of letting the model know your intention i.e do you want to train the model or do you want to use the model to evaluate. In case of model.train() the model knows it has to learn the layers and when we use model.eval() it indicates the model that nothing new is to be learnt and the model is used for testing. model.eval() is also necessary because in pytorch if we are using batchnorm and during test if we want to just pass a single image, pytorch throws an error if model.eval() is not specified.

In [14]:
import os
import tqdm 
import numpy as np
import json
import spacy
import nltk


class Squad_preprocessor():
    def __init__(self,tokenizer,data_directory = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD"):
        self.data_directory = data_directory
        self.glove_directory = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\glove.6B"
        self.train_file = "train_v2.json"
        self.test_file = "test_v2.json"
        self.out_prefix = "train"
        self.tokenizer = tokenizer
        self.num_train_examples = 0
        self.context_lengths = None
        self.vocab = {}
        
    def load_data(self,filename = "train_v2.json"):
        full_path = os.path.join(self.data_directory,filename)
        
        with open(full_path) as datafile:
            self.data = json.load(datafile)
            
#         print(len(self.data["data"]))
            
    def break_file(self, prefix, filename = "train_v2.json", count_examples = False):
        self.load_data(filename)
        self.out_prefix = prefix
        
        ##### creating data directories for different parts of the data namely:
        ## 1) context
        ## 2) question
        ## 3) answer_text
        ## 4) answer_start
        ## 5) answer_end
      
        
        ###       the SQuAD dataset has the following layout:
        # "data" ---> "title", "paragraphs" 
        #                            |
        #                            -----> "context" , "qas"
        #                                                 |
        #                                                 -----> "answers", "id", "is_impossible", "question"
        #
        #    ie. one context has several questions and their respective answers     

        
#         counter = 0
        with open(os.path.join(self.data_directory, self.out_prefix +'.context'), 'w', encoding='utf-8') as context_file, \
             open(os.path.join(self.data_directory, self.out_prefix +'.question'), 'w', encoding='utf-8') as question_file, \
             open(os.path.join(self.data_directory, self.out_prefix + '.answer_text'), 'w', encoding= 'utf-8') as answer_text_file, \
             open(os.path.join(self.data_directory, self.out_prefix + '.answer_start'), 'w', encoding= 'utf-8') as answer_start_file, \
             open(os.path.join(self.data_directory, self.out_prefix + '.answer_end'), 'w', encoding= 'utf-8') as answer_end_file:
             
                   
                    for article_idx in range(len(self.data["data"])):
                        paragraphs = self.data["data"][article_idx]["paragraphs"] ## all the paragraphs in data directory

                        for paragraph_idx in range(len(paragraphs)):
                            context = paragraphs[paragraph_idx]["context"] ## each context in a given paragraph directory
                            context = context.lower()
                            context_tokens = self.tokenizer(context)
                            context_tokens.insert(0,"<sos>")
#                             print(context_tokens)
# 
                            ## each context has a range of "answers", "id", "is_impossible", "question" 

                            qas = paragraphs[paragraph_idx]["qas"] ##  "qas" referrring to a single "context"

                            for qas_idx in range(len(qas)):  ### disecting the "qas" into "answers", "id", "is_impossible", "question" 
                                question = qas[qas_idx]["question"]  
                                question = question.lower()
                                question_tokens = self.tokenizer(question)
                                question_tokens.insert(0,"<sos>")

                                ## we select the first answer id from the range of answers we are given for a particular question
                                
                                if(len(qas[qas_idx]["answers"]) == 0 ):
                                    
                                    answer_text_tokens = "<sos>"
                                    word_level_answer_start = 1
                                    word_level_answer_end = 1
                                    
                                    
                                    
                                else:
                                    
                                    answer_id = 0
                                    answer_text = qas[qas_idx]["answers"][answer_id]["text"]
#                                     print(answer_text)
                                    
                                    answer_text = answer_text.lower()
                                    answer_text_tokens = self.tokenizer(answer_text) ## we atke the first option as the answer

                                    char_level_answer_start = qas[qas_idx]["answers"][answer_id]["answer_start"]
                                    word_level_answer_start = len(context[:char_level_answer_start].split())+1
                                    word_level_answer_end = word_level_answer_start + len(answer_text.split()) 


                
                                       
                                    
        
                                    
        
                                    context_file.write(' '.join(token for token in context_tokens)+'\n')
                                    question_file.write(' '.join(token for token in question_tokens)+'\n')
                                    answer_text_file.write(' '.join(token for token in answer_text_tokens)+'\n')

                                    answer_start_file.write(str(word_level_answer_start)+ "\n")
                                    answer_end_file.write(str(word_level_answer_end) + "\n")

                                    
                                                
    
    def conduct_preprocess(self):
        self.break_file("train", self.train_file, True)
        self.break_file("test", self.test_file, False)
        
            
            
    
preprocess = Squad_preprocessor(nltk.word_tokenize,"E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD")
preprocess.conduct_preprocess()  

data_directory = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD"
context_file = open(os.path.join(data_directory, 'train.context'), 'r', encoding='utf-8').readlines() 
question_file = open(os.path.join(data_directory, 'train.question'), 'r', encoding='utf-8').readlines() 
answer_text_file = open(os.path.join(data_directory,  'train.answer_text'), 'r', encoding= 'utf-8').readlines()  
answer_start_file = open(os.path.join(data_directory,  'train.answer_start'), 'r', encoding= 'utf-8').readlines() 
answer_end_file = open(os.path.join(data_directory,  'train.answer_end'), 'r', encoding= 'utf-8').readlines() 
        
with open(os.path.join(data_directory, "validation.context" ), 'w', encoding='utf-8') as f:
    for item in context_file[8000:]:
        f.write("%s" % item)
with open(os.path.join(data_directory, "validation.question" ), 'w', encoding='utf-8') as f:
    for item in question_file[8000:]:
        f.write("%s" % item)
with open(os.path.join(data_directory, "validation.answer_text" ), 'w', encoding='utf-8') as f:
    for item in answer_text_file[8000:]:
        f.write("%s" % item)
with open(os.path.join(data_directory, "validation.answer_start" ), 'w', encoding='utf-8') as f:
    for item in answer_start_file[8000:]:
        f.write("%s" % item)
with open(os.path.join(data_directory, "validation.answer_end" ), 'w', encoding='utf-8') as f:
    for item in answer_end_file[8000:]:
        f.write("%s" % item)
with open(os.path.join(data_directory, "train.context"), 'w', encoding='utf-8') as f:
    for item in context_file[:8000]:
        f.write("%s" % item)
with open(os.path.join(data_directory, "train.question" ), 'w', encoding='utf-8') as f:
    for item in question_file[:8000]:
        f.write("%s" % item)
with open(os.path.join(data_directory, "train.answer_text" ), 'w', encoding='utf-8') as f:
    for item in answer_text_file[:8000]:
        f.write("%s" % item)
with open(os.path.join(data_directory, "train.answer_start"), 'w', encoding='utf-8') as f:
    for item in answer_start_file[:8000]:
        f.write("%s" % item)
with open(os.path.join(data_directory, "train.answer_end" ), 'w', encoding='utf-8') as f:
    for item in answer_end_file[:8000]:
        f.write("%s" % item)

In [16]:

import os.path
import operator
import pickle
from nltk.tokenize import WhitespaceTokenizer 
from gensim.models import Word2Vec, KeyedVectors
from collections import defaultdict
from math import sqrt
import numpy as np 
import codecs
import re
import string
import sys
import tqdm as tqdm
import os
from collections import Counter
# import Squad_processor
import spacy

import nltk
from argparse import ArgumentParser


class Vocabulary():
    
    def __init__(self, vocab_input_files = ["E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\train.context","E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\train.question"],
                        vocab_output_filename = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\vocab.dat"):
        """
        This function works the same as contructors and is used to initilaize the parameters used in the making the model
        
        """
        self.vocab = {}
        self.vocab_output_filename = vocab_output_filename
        self.vocab_input_files = vocab_input_files
        self.word_list = []
        self.word_to_index = {} # dictionary with keys as words and values as their corresponding index number
        self.char_to_index = {} # dictionary with keys as characters and values as their corresponding index number
        self.word_to_index["<pad>"] = 0
        self.word_to_index["<sos>"] = 1
        self.word_to_index["<unk>"] = 2
#         self.word_to_index["<SOS>"]
        ## self.index_to_word = # dictionary with values as words and keys as their corresponding index number
        ## self.index_to_char = # dictionary with values as characters and keys as their corresponding index number
        
    
    def normalize_answer(self,s):
        """Lower text and remove punctuation, articles and extra whitespace."""

        def remove_articles(text):
            return re.sub(r'\b(a|an|the)\b', ' ', text)

        def white_space_fix(text):
            return ' '.join(text.split())

        def remove_punc(text):
            exclude = set(string.punctuation)
            return ''.join(ch for ch in text if ch not in exclude)

        def lower(text):
            return text.lower()

        return white_space_fix(remove_articles(remove_punc(lower(s))))

    
    
    def create_vocabulary(self,vocab_freq = 0, vocab_size = 30000, data_path="E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD"):
        """
        This function creates dictionaries namely:
        1) word_to_index
        2) char_to_index
        3) index_to_word
        4) index_to_char
        
        and dumps them into pickle file namely: "dictionaries.pkl"
        """
        

        for filename in self.vocab_input_files:
            with open(filename,'r', encoding = 'utf-8') as file_input:
                
                for line in file_input:
                    words = self.normalize_answer(line).strip().split()
#                     print(words)
                    for word in words:
                        if not (word in self.vocab):
                            self.vocab[word] = 1
                        else:
                            self.vocab[word] +=1 

        if vocab_freq == 0:
            vocab_words = sorted(self.vocab,key=self.vocab.get,reverse=True)


#         print(vocab_words)
                    
        temp_index = 3
        for word in vocab_words:
            if temp_index < vocab_size and word not in self.word_to_index:
                self.word_to_index[word] = temp_index
                temp_index += 1
                
#         print(len(self.word_to_index))

        self.vocab_size = len(self.word_to_index)
        self.index_to_word =  {v: k for k, v in self.word_to_index.items()}


        characters = list(string.printable.lower())
        characters.remove(' ')

        char_ind = 1
        for c in characters:
            if c not in self.char_to_index:
                self.char_to_index[c] = char_ind
                char_ind += 1


        self.index_to_char = {v: k for k,v in self.char_to_index.items()}

        dict_all = {"word_to_index" : self.word_to_index, "char_to_index" : self.char_to_index,"index_to_word": self.index_to_word, "index_to_char": self.index_to_char}

        pickle.dump(dict_all, open(os.path.join(data_path, "dictionaries.pkl"), "wb")) ## creates dictionaries and stores in memory as pickle files




vocab = Vocabulary(["E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\train.context","E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\train.question"],
                        "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\vocab.dat")
vocab.create_vocabulary(0,30000, "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD")

In [17]:

import numpy as np
import os
import tqdm as tqdm
import pickle

import numpy as np
import os
import spacy
# import ujson as json
import urllib.request

# from args import get_setup_args
from codecs import open
from collections import Counter
from subprocess import run
from tqdm import tqdm
from zipfile import ZipFile



class Embedding_Matrix():
    
    def __init__(self,embedding_dir = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD"):
#         embedding_dir = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD"
        with open(r"E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\dictionaries.pkl", "rb") as input_file:
            dictionaries = pickle.load(input_file)
        self.word_to_index = dictionaries["word_to_index"]
        self.char_to_index = dictionaries["char_to_index"]
        self.index_to_word = dictionaries["index_to_word"]
        self.index_to_char = dictionaries["index_to_char"]
        
        
    def index_files_using_char_to_index(self, filename, _dict, max_words, max_chars):

        f = open(filename, "r", encoding="utf-8")
        lines = f.readlines()
        lines = [l.lower() for l in lines]
        encoded_lines = []
        for l in lines:
            tokens = l.split()
            tokens = tokens[:max_words]
            encoded_tokens = []
            for t in tokens:
                l = list(t)
                l = l[:max_chars] ## there is a max limit for the length of characters = max_chars
                encoded_chars = []
                for j in l:
                    if j in _dict:
                        encoded_chars.append(_dict[j])
                    else:
                        encoded_chars.append(0)  ## if the character id not in dictionary put '0' in its place
                encoded_tokens.append(encoded_chars)
            encoded_lines.append(encoded_tokens)

        return encoded_lines

    def index_files_using_word_to_index(self, filename, _dict, max_words):
        
        f = open(filename, "r", encoding="utf-8")

        lines = f.readlines()
        lines  = [l.lower() for l in lines]
        encoded_lines = []
        for l in lines:
            tokens = l.split()
            tokens = tokens[:max_words]
            temp = []
            for t in tokens:
                if t in _dict:
#                     print(_dict[t])
                    temp.append(_dict[t])
                else:
                    temp.append(_dict["<unk>"])

            encoded_lines.append(temp[:])
#         close(filename)
#             print("HEllo")

        return encoded_lines
    
    def index_files_to_char_level_and_word_level(self, datapath = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD", max_words=0, max_chars=0):
#         files = [".context", ".question", ".answer_text"]
        files = [".context",".question", ".answer_text"]


        for f in files:
            read_path_train = os.path.join(datapath, "train" + f)
            write_path_train_word = os.path.join(datapath, "train_word_index" + f + "_pkl.pkl")
#             write_path_train_char = os.path.join(datapath, "train_char_index" + f + "_pkl.pkl")

            read_path_valid = os.path.join(datapath, "validation" + f)
            write_path_valid_word = os.path.join(datapath, "validation_word_index" + f + "_pkl.pkl")
#             write_path_valid_char = os.path.join(datapath, "validation_char_index" + f + "_pkl.pkl")



            temp_train_word = self.index_files_using_word_to_index(read_path_train, self.word_to_index, max_words)
            temp_valid_word = self.index_files_using_word_to_index(read_path_valid, self.word_to_index, max_words)

#             temp_train_char = index_files_using_char_to_index(read_path_train, self.char_to_index, max_words,max_chars)
#             temp_valid_char = index_files_using_char_to_index(read_path_valid, self.char_to_index, max_words,max_chars)

            write_file_train_word = open(write_path_train_word, "wb")
            pickle.dump(temp_train_word, write_file_train_word)

#             write_file_train_char = open(write_path_train_char, "wb")
#             pickle.dump(temp_train_char, write_file_train_char)

            write_file_valid_word = open(write_path_valid_word, "wb")
            pickle.dump(temp_valid_word, write_file_valid_word)

#             write_file_valid_char = open(write_path_valid_char, "wb")
#             pickle.dump(temp_valid_char, write_file_valid_char)

    def get_glove_embeddings(self, word_embedding_size = 100, char_embedding_size = 20 , embedding_dir = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD" ):



        glove_embeddings = os.path.join(embedding_dir, "glove_embeddings100.txt")

        glove_embeddings = open(glove_embeddings,'r', encoding = 'utf-8')



        #     glove_embeddings = pickle.load(open(glove_embeddings))

        #####################  CHECK HOW GLOVE EMBEDDINGS WORK ##############
        temp_embeddings = []

        for word in self.word_to_index:

                if word in ['<pad>', '<sos>']:
                    temp_vector = np.zeros((word_embedding_size))
                elif word not in glove_embeddings:
                    temp_vector = np.random.uniform(-np.sqrt(3)/np.sqrt(word_embedding_size), np.sqrt(3)/np.sqrt(word_embedding_size), word_embedding_size)
                else:
                    temp_vector = glove_embeddings[word]

                temp_embeddings.append(temp_vector)

        temp_embeddings = np.asarray(temp_embeddings)
        temp_embeddings = temp_embeddings.astype(np.float32)
        self.word_embeddings = temp_embeddings


#         char_embeddings = []
# #         print (char_embedding_size)
#         char_embeddings.append(np.zeros((char_embedding_size)))

#         for i in range(len(self.char_to_index)):
#             temp_vector = np.random.uniform(-np.sqrt(3)/np.sqrt(char_embedding_size), np.sqrt(3)/np.sqrt(char_embedding_size), char_embedding_size)
#             char_embeddings.append(temp_vector)

#         char_embeddings = np.asarray(char_embeddings)
#         char_embeddings = char_embeddings.astype(np.float32)

#         self.char_embeddings = char_embeddings

#         pickle.dump(char_embeddings, open(os.path.join(embedding_dir, "char_embeddings" + ".pkl"), "wb")) 
        pickle.dump(temp_embeddings, open(os.path.join(embedding_dir, "glove_word_embeddings" + ".pkl"), "wb"))


#         return self.word_embeddings, self.char_embeddings

embedding = Embedding_Matrix(embedding_dir = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD")
embedding.get_glove_embeddings(word_embedding_size = 100, char_embedding_size = 20 , embedding_dir = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD" )
embedding.index_files_to_char_level_and_word_level(datapath = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD", max_words=700, max_chars=10)


In [38]:
class Train_Model:

    def __init__(self, config, model):
        self.config = config
        self.model = model
        self.loss_function = torch.nn.CrossEntropyLoss()
        self.parameters_trainable = list(filter(lambda p: p.requires_grad, self.model.parameters()))
        self.optimizer = optim.Adam(self.parameters_trainable, lr=self.config.lr)

        self.glove_path = os.path.join(config.data_dir, "glove_word_embeddings.pkl")
        self.num_epochs = config.num_epochs
        self.data_dir = config.data_dir
        self.names = config.names
        self.batch_size = config.batch_size
        self.print_every = config.print_every
        self.max_context_length = config.max_context_length
        self.max_question_length = config.max_question_length
        self.model_dir = config.model_dir
        self.early_stop = config.early_stop
        self.print_and_validate_every = config.print_and_validate_every
        
    def save_model(self, model, optimizer, loss, global_step, epoch ,prefix):
        # A state_dict is simply a Python dictionary object that maps each layer to its parameter tensor
        
        if(prefix == "best_model"):
            model_state = model.state_dict()
            model_state = {k: v for k, v in model_state.items() if 'embedding' not in k}

            state = {
                'global_step': global_step,
                'epoch': epoch,
                'model': model_state,
                'optimizer': optimizer.state_dict(),
                'current_loss': loss
            }
            model_save_path = os.path.join(self.model_dir, 'best_model' )
            torch.save(state, model_save_path)
            
        elif(prefix == "last_model"):
            model_state = model.state_dict()
            model_state = {k: v for k, v in model_state.items() if 'embedding' not in k}

            state = {
                'global_step': global_step,
                'epoch': epoch,
                'model': model_state,
                'optimizer': optimizer.state_dict(),
                'current_loss': loss
            }
            model_save_path = os.path.join(self.model_dir, 'last_model' )
            torch.save(state, model_save_path)
            
            
    def get_f1_em_score(self, prefix, num_samples=100):


        f1_total = 0.
        em_total = 0.
        example_num = 0

#         tic = time.time()

        for batch in get_batch_generator(self.data_dir, self.names, self.batch_size, self.max_context_length, self.max_question_length,prefix):

            _,start_pos_prediction, end_pos_prediction = self.test_one_batch(batch)

            start_pos_prediction = start_pos_prediction.tolist()
            end_pos_prediction = end_pos_prediction.tolist()

            for index, (pred_answer_start, pred_answer_end, true_answer_tokens) in enumerate(zip(start_pos_prediction, end_pos_prediction, batch.answer_tokens_batch)):
                    
                example_num += 1
                pred_answer_tokens = batch.context_tokens_batch[index][pred_answer_start : pred_answer_end + 1]
                pred_answer = " ".join(pred_answer_tokens)

                true_answer = " ".join(true_answer_tokens)

                f1 = f1_score(pred_answer, true_answer)
                em = exact_match_score(pred_answer, true_answer)
                f1_total += f1
                em_total += em

                

                if num_samples != 0 and example_num >= num_samples:
                    break

            if num_samples != 0 and example_num >= num_samples:
                break

        f1_total /= example_num
        em_total /= example_num

#         toc = time.time()
#         logging.info("Calculating F1/EM for %i examples in %s set took %.2f seconds" % (example_num, dataset, toc-tic))

        return f1_total, em_total
    def get_validation_loss(self,prefix):
#         logging.info("Calculating dev loss...")
#         tic = time.time()
#         loss_per_batch, batch_lengths = [], []
        total_validation_loss = 0.0
        validation_set_size = 0
        for batch in get_batch_generator(self.data_dir, self.names, self.batch_size, self.max_context_length, self.max_question_length,prefix):

            validation_batch_loss, _, _ = self.test_one_batch(batch)
            validation_set_size += batch.batch_size
            total_validation_loss += validation_batch_loss
#             batch_lengths.append(curr_batch_size)
#             i += 1
#             if i == 10:
#                 break
#         total_num_examples = sum(batch_lengths)
#         toc = time.time()
#         print(validation_set_size)

        validation_loss = total_validation_loss / validation_set_size
#         print "Computed validation loss = %f " % (validation_loss)

        return validation_loss

    def get_data(self, batch, is_train=True):
        
        question_word_index_batch = batch.question_word_index_batch

        context_word_index_batch = batch.context_word_index_batch
        
        span_tensor_batch = batch.span_tensor_batch

        if is_train:
            return context_word_index_batch, question_word_index_batch,span_tensor_batch
        else:
            return context_word_index_batch, question_word_index_batch
      
    def get_grad_norm(self, parameters, norm_type=2):
        parameters = list(filter(lambda p: p.grad is not None, parameters))
        total_norm = 0
        for p in parameters:
            param_norm = p.grad.data.norm(norm_type)
            total_norm += param_norm ** norm_type
        total_norm = total_norm ** (1. / norm_type)
        return total_norm

    def get_param_norm(self, parameters, norm_type=2):
        total_norm = 0
        for p in parameters:
            param_norm = p.data.norm(norm_type)
            total_norm += param_norm ** norm_type
        total_norm = total_norm ** (1. / norm_type)
        return total_norm
    
    def test_one_batch(self, batch):
        
        self.model.eval()

        context_word_index_batch, question_word_index_batch,  span_tensor_batch = self.get_data(batch)

#         print(context_word_index_batch)


        context_word_index_padded_per_batch = Variable(pad_data(context_word_index_batch))
#         print(context_word_index_padded_per_batch)
        context_word_index_padded_per_batch.requires_grad = False
        question_word_index_padded_per_batch = Variable(pad_data(question_word_index_batch))
        question_word_index_padded_per_batch.requires_grad = False


        context_ids = np.array(context_word_index_padded_per_batch) # shape (batch_size, context_len)
        context_mask_per_batch = (context_ids != 0).astype(np.int32) # shape (batch_size, context_len)
        context_word_mask_per_batch_new = Variable(torch.from_numpy(context_mask_per_batch))
        context_word_mask_per_batch_new.requires_grad = False

        question_ids = np.array(question_word_index_padded_per_batch) # shape (batch_size, context_len)
        question_mask_per_batch = (question_ids != 0).astype(np.int32) # shape (batch_size, context_len)
        question_word_mask_per_batch_new = Variable(torch.from_numpy(question_mask_per_batch))
        question_word_mask_per_batch_new.requires_grad = False

        span_tensor_batch = Variable(span_tensor_batch)
        
        span_tensor_batch.requires_grad = False

        start_index_prediction, end_index_prediction, loss = self.model(context_word_index_padded_per_batch,context_word_mask_per_batch_new, question_word_index_padded_per_batch, question_word_mask_per_batch_new, span_tensor_batch)
    
        self.model.train()

        return loss.item(),start_index_prediction, end_index_prediction
        
    def train_one_batch(self, batch):



        self.optimizer.zero_grad()
        context_word_index_batch, question_word_index_batch,  span_tensor_batch = self.get_data(batch)



        context_word_index_padded_per_batch = Variable(pad_data(context_word_index_batch))
        context_word_index_padded_per_batch.requires_grad = False
        question_word_index_padded_per_batch = Variable(pad_data(question_word_index_batch))
        question_word_index_padded_per_batch.requires_grad = False


        context_ids = np.array(context_word_index_padded_per_batch) # shape (batch_size, context_len)
        context_mask_per_batch = (context_ids != 0).astype(np.int32) # shape (batch_size, context_len)
        context_word_mask_per_batch_new = Variable(torch.from_numpy(context_mask_per_batch))
        context_word_mask_per_batch_new.requires_grad = False

        question_ids = np.array(question_word_index_padded_per_batch) # shape (batch_size, context_len)
        question_mask_per_batch = (question_ids != 0).astype(np.int32) # shape (batch_size, context_len)
        question_word_mask_per_batch_new = Variable(torch.from_numpy(question_mask_per_batch))
        question_word_mask_per_batch_new.requires_grad = False

        span_tensor_batch = Variable(span_tensor_batch)
        
        span_tensor_batch.requires_grad = False

        _, _, loss = self.model(context_word_index_padded_per_batch,context_word_mask_per_batch_new, question_word_index_padded_per_batch, question_word_mask_per_batch_new, span_tensor_batch)


        
        
        
#         l2_reg = None
#         for W in self.parameters:
            
#             if l2_reg is None:
#                 l2_reg = W.norm(2)
#             else:
#                 l2_reg = l2_reg + W.norm(2)
#         loss = loss + config.reg_lambda * l2_reg
        

        
        loss.backward()
        

        
        param_norm = self.get_param_norm(self.parameters_trainable)
        grad_norm = self.get_grad_norm(self.parameters_trainable)
        
        
#         clip_grad_norm_(parameters, config.max_grad_norm)
        self.optimizer.step()
    

        return loss.item(), param_norm, grad_norm
    
    
    def train(self):


        num_parameters = sum(p.numel() for p in self.parameters_trainable)

        best_validation_f1, best_validation_em = None, None
        best_validation_epoch = 0
        epoch = 0
        global_step = 0

        loss_array = []
        logging.info("Beginning training loop...")
        for epoch in range(200):
            total_loss = 0.0
            epoch_tic = time.time()
            for batch in get_batch_generator(self.data_dir, self.names, self.batch_size, self.max_context_length, self.max_question_length,"train"):
                
                global_step += 1
                iter_tic = time.time()
                
                train_batch_loss, param_norm, grad_norm = self.train_one_batch(batch)

#                 total_loss = total_loss + loss
#                 loss_array.append(total_loss)
    
                iter_toc = time.time()
                iter_time = iter_toc - iter_tic
            
                if global_step % self.print_and_validate_every == 0:
                    
#                     print(self.get_validation_loss("validation"))
                    validation_batch_loss = self.get_validation_loss("validation")
                    
#                     print()


#                 if global_step % config.save_every == 0:
#                     logging.info("Saving to %s..." % model_dir)
#                     self.save_model(model, optimizer, loss, global_step, epoch, model_dir)

#                 if global_step % config.validate_every == 0:
                    
#                     logging.info("Epoch %d, Iter %d, dev loss: %f" % (epoch, global_step, dev_loss))
#                     write_summary(dev_loss, "dev/loss", summary_writer, global_step)

                    train_batch_f1, train_batch_em = self.get_f1_em_score("train", num_samples=100)
#                     logging.info("Epoch %d, Iter %d, Train F1 score: %f, Train EM score: %f" % (
#                         epoch, global_step, train_f1, train_em))
#                     write_summary(train_f1, "train/F1", summary_writer, global_step)
#                     write_summary(train_em, "train/EM", summary_writer, global_step)

                    validation_batch_f1, validation_batch_em = self.get_f1_em_score("validation", num_samples=100)
#                     logging.info(
#                         "Epoch %d, Iter %d, Dev F1 score: %f, Dev EM score: %f" % (epoch, global_step, dev_f1, dev_em))
#                     write_summary(dev_f1, "dev/F1", summary_writer, global_step)
#                     write_summary(dev_em, "dev/EM", summary_writer, global_step)



                    if best_validation_f1 is None or validation_batch_f1 > best_validation_f1:
                        best_validation_f1 = validation_batch_f1

                    if best_validation_em is None or validation_batch_em > best_validation_em:
                        best_validation_em = validation_batch_em
                        best_validation_epoch = epoch+1
#                         logging.info("Saving to %s..." % bestmodel_dir)
                        self.save_model(self.model, self.optimizer, validation_batch_loss, global_step, epoch, "best_model")
                        
                    print ("Epoch : {} Step : {} Train_batch Loss : {} Validation_batch Loss :{} " .format(epoch+1, global_step, train_batch_loss, validation_batch_loss))
            
                    print("Train_batch F1:{} Train_batch EM:{} Validation_batch_F1: {} Best_validation_batch F1:{} Best_validation_batch EM :{} ".format(train_batch_f1,train_batch_em,validation_batch_f1,best_validation_f1,best_validation_em))

                
            if (epoch - best_validation_epoch > self.early_stop):
                break
            self.save_model(self.model, self.optimizer, train_batch_loss, global_step, epoch+1 ,"last_model")
#             self.save_model(model, optimizer, loss, global_step, epoch, bestmodel_dir)
#             torch.save(self.model.state_dict(), open(os.path.join(outdir, "last_model"), "wb"))
#             print("total loss for epoch number = " + str(epoch+1) + " = " + str(total_loss))

#             epoch_toc = time.time()
            print("End of epoch %i." % (epoch+1))

        sys.stdout.flush()

In [39]:
from torchviz import make_dot
hidden_dim = 100
dropout_ratio = 0.15
maxout_pool_size=16
max_number_of_iterations = 3
with open(r"E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\glove_word_embeddings.pkl", "rb") as input_file:
    embedding_matrix = pickle.load(input_file)
    
with autograd.set_detect_anomaly(True):
    model = DCN_Model(hidden_dim, embedding_matrix, dropout_ratio, maxout_pool_size, max_number_of_iterations)

    
    # model = model.cpu()
    train_model = Train_Model(config, model)

    train_model.train()


  "num_layers={}".format(dropout, num_layers))


Epoch : 1 Step : 2 Train_batch Loss : 10.03243350982666 Validation_batch Loss :0.4990212440490723 
Train_batch F1:0.057028985507246374 Train_batch EM:0.0 Validation_batch_F1: 0.1113545089448704 Best_validation_batch F1:0.1113545089448704 Best_validation_batch EM :0.0 
End of epoch 1.
Epoch : 2 Step : 4 Train_batch Loss : 10.027390480041504 Validation_batch Loss :0.49893956184387206 
Train_batch F1:0.057028985507246374 Train_batch EM:0.0 Validation_batch_F1: 0.08417220262152483 Best_validation_batch F1:0.1113545089448704 Best_validation_batch EM :0.0 
End of epoch 2.
Epoch : 3 Step : 6 Train_batch Loss : 10.017292976379395 Validation_batch Loss :0.49881811141967775 
Train_batch F1:0.11199974574116449 Train_batch EM:0.0 Validation_batch_F1: 0.026486555874212576 Best_validation_batch F1:0.1113545089448704 Best_validation_batch EM :0.0 
End of epoch 3.
Epoch : 4 Step : 8 Train_batch Loss : 9.998069763183594 Validation_batch Loss :0.4986070156097412 
Train_batch F1:0.11199974574116449 Train

KeyboardInterrupt: 

In [19]:
import tqdm as tqdm
import torch
import random
# import nltk
import numpy as np
import pickle
import sys
import copy
import os.path
# import tqdm as tqdm

datapath = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\"

def find_max_length(data):

    """ Finds the maximum sequence length for data 
        Args:
            data: The data from which sequences will be chosen
    """
    temp = 0
    index = 0
    for i, _ in enumerate(data):

        if (len(data[i]) > temp):
            temp = len(data[i])
            index = i
    return temp,index


def pad_data(data):

    """ Pad the data to max_length given
        Args: 
            data: Data that needs to be padded
            max_length : The length to be achieved with padding
        Returns:
            padded_data : Each sequence is padded to make it of length
                          max_length.
    """
    padded_data = []
    max_length,index =  find_max_length(data)

    for lines in data:
        if (len(lines) < max_length):
            temp = np.lib.pad(lines, (0,max_length - len(lines)),
                'constant', constant_values=0)
        else:
            temp = lines[:max_length]
        padded_data.append(temp)

    padded_data = torch.from_numpy(np.array(padded_data)).type(torch.int64)

    return padded_data


def index_files_using_word_to_index(filename, _dict, max_words):
    
    f = open(filename, "r", encoding="utf-8")

    lines = f.readlines()
    lines  = [l.lower() for l in lines]
    encoded_lines = []
    for l in lines:
        tokens = l.split()
        tokens = tokens[:max_words]
        temp = []
        for t in tokens:
            if t in _dict:
                temp.append(_dict[t])
            else:
                temp.append(1)

        encoded_lines.append(temp[:])

    return encoded_lines

from __future__ import print_function

import json
import re
import string
import sys
from collections import Counter


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth):
    return (normalize_answer(prediction) == normalize_answer(ground_truth))
    
with open(r"E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\dictionaries.pkl", "rb") as input_file:
    dictionaries = pickle.load(input_file)
word_to_index = dictionaries["word_to_index"]


In [20]:



from __future__ import absolute_import
from __future__ import division

import torch
import random
import re
import time
import os
import pickle
import tqdm as tqdm

import numpy as np
from six.moves import xrange

class Batch():
    """A class to hold the information needed for a training batch"""
    def __init__(self,names,context_word_index_batch,question_word_index_batch, span_tensor_batch,context_tokens_batch,questions_tokens_batch,answer_tokens_batch):
        
        self.names = names
        self.context_word_index_batch = context_word_index_batch

        self.question_word_index_batch = question_word_index_batch
        self.span_tensor_batch = span_tensor_batch
        self.context_tokens_batch = context_tokens_batch
        self.questions_tokens_batch = questions_tokens_batch
        self.answer_tokens_batch = answer_tokens_batch
        self.batch_size = len(self.context_word_index_batch)





def refill_batches(batches,batch_size,names, max_context_length, max_question_length,context_word_index,question_word_index,span_tensor,context_tokens,question_tokens,answer_tokens):

    """

    Adds more batches into the "batches" list.
    Inputs:
      batches: list to add batches to

      names: list containing strings of file names ["train_context","train_question"] or ["validation_context","validation_question"]
      data_dir : paths to {train/dev}.{context/question/answer} data files
      batch_size: integer ==> how big to make the batches
      max_context_length, max_question_length: max length of context and question respectively

    """
#     print ("Refilling batches...")
    tic = time.time()
    examples = [] 



        # add to examples
    examples.append((context_word_index, question_word_index, span_tensor,context_tokens,question_tokens,answer_tokens))

        
        

    # Make into batches and append to the list batches
    for batch_start in xrange(0, len(examples[0][0]), batch_size):

        # Note: each of these is a list length batch_size of lists of ints (except on last iter when it might be less than batch_size)
        context_word_index_batch = examples[0][0][batch_start:batch_start+batch_size]
        question_word_index_batch = examples[0][1][batch_start:batch_start+batch_size]
        span_tensor_batch = examples[0][2][batch_start:batch_start+batch_size]
        context_tokens_batch = examples[0][3][batch_start:batch_start+batch_size]
        questions_tokens_batch = examples[0][4][batch_start:batch_start+batch_size]
        answer_tokens_batch = examples[0][5][batch_start:batch_start+batch_size]

#         print("Batch " + str(batch_start + 1) + " loaded")
        
        batches.append((context_word_index_batch, question_word_index_batch,span_tensor_batch,context_tokens_batch,questions_tokens_batch,answer_tokens_batch))


    

    # shuffle the batches
    random.shuffle(batches)

    toc = time.time()
#     print ("Refilling batches took %.2f seconds" % (toc-tic))
    return batches


def get_batch_generator(data_dir,names, batch_size, max_context_length, max_question_length,prefix):
    """
    This function returns a generator object that yields batches.
    The last batch in the dataset will be a partial batch.
    Read this to understand generators and the yield keyword in Python: https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do
    Inputs:
      names: list containing strings of file names = ["train_context","train_question"] or ["validation_context","validation_question"]
      data_dir : paths to {train/dev}.{context/question/answer} data files
      batch_size: integer ==> how big to make the batches
      max_context_length, max_question_length: max length of context and question respectively

    """
    
#     if(prefix == "train"):

    context_path_train = os.path.join(data_dir, prefix + ".context")
    question_path_train = os.path.join(data_dir, prefix +  ".question")
    answer_path_train = os.path.join(data_dir, prefix +  ".answer_text")
#     print(os.path.join(data_dir, prefix + ".context"))

    context_tokens = open(context_path_train, "r", encoding="utf-8").readlines()
    question_tokens =  open(question_path_train, "r", encoding="utf-8").readlines()
    answer_tokens = open(answer_path_train, "r", encoding="utf-8").readlines()
#         print(question_tokens)

#     lines = f.readlines()
#     lines  = [l.lower() for l in lines]

    context_word_index_old = index_files_using_word_to_index(context_path_train, word_to_index, max_context_length)
    question_word_index_old = index_files_using_word_to_index(question_path_train, word_to_index, max_question_length)
    
    answer_start_path = os.path.join(data_dir  + prefix +  ".answer_start")
    answer_start_list = open(answer_start_path, "r", encoding="utf-8").readlines()
    
    answer_end_path = os.path.join(data_dir + "//" + prefix +  ".answer_end")
    answer_end_list = open(answer_end_path, "r", encoding="utf-8").readlines()

#     with open(data_dir + "//" + prefix +  "answer_end", "r") as input_file:
#         answer_end_pkl = pickle.load(input_file)
#         print("answer_end_pkl")
#         print(answer_end_pkl)
#     with open(data_dir + "//" + prefix +  "answer_start", "r") as input_file:
#         answer_start_pkl = pickle.load(input_file)
            
#     elif(prefix == "validation"):
#         context_path_train = os.path.join(datapath, "validation.context")
#         question_path_train = os.path.join(datapath, "validation.question")
#         answer_path_train = os.path.join(datapath, "validation.answer_text")

#         context_tokens = open(context_path_train, "r", encoding="utf-8").readlines()
#         question_tokens =  open(question_path_train, "r", encoding="utf-8").readlines()
#         answer_tokens = open(answer_path_train, "r", encoding="utf-8").readlines()


#         context_word_index_old = index_files_using_word_to_index(context_path_train, word_to_index, max_context_length)
#         question_word_index_old = index_files_using_word_to_index(question_path_train, word_to_index, max_question_length)


#         ### check for answers if they are validation or train
#         with open(data_dir + "//" + "answer_end_pkl.pkl", "rb") as input_file:
#             answer_end_pkl = pickle.load(input_file)
#         with open(data_dir + "//" + "answer_start_pkl.pkl", "rb") as input_file:
#             answer_start_pkl = pickle.load(input_file)


            
    context_tokens = context_tokens[30:50]
    question_tokens = question_tokens[30:50]
    answer_tokens = answer_tokens[30:50]

    answer_end = torch.from_numpy(np.array([int(i) for i in answer_end_list])).long()
    answer_start = torch.from_numpy(np.array([int(i) for i in answer_start_list])).long()              
    answer_start = torch.unsqueeze(answer_start, 1)
    answer_end = torch.unsqueeze(answer_end, 1)

    span_tensor_old = torch.cat((answer_start, answer_end), 1)
    span_tensor = span_tensor_old[30:50]
    context_word_index = context_word_index_old[30:50]
    question_word_index = question_word_index_old[30:50]



    batches = []
    count = 0

    while (True):
        count = count + 1
        if len(batches) == 0: # add more batches
            if(count > 2):
                break
            batches = refill_batches(batches,batch_size,names, max_context_length, max_question_length,context_word_index,question_word_index,span_tensor,context_tokens,question_tokens,answer_tokens)
        if len(batches) == 0:
            break

        # Get next batch. These are all lists length batch_size
        (context_word_index_batch, question_word_index_batch,span_tensor_batch,context_tokens,question_tokens,answer_tokens) = batches.pop(0)
        

        if(len(context_word_index_batch) == 0):
            break
            


        # Make into a Batch object
        batch = Batch(names,context_word_index_batch, question_word_index_batch, span_tensor_batch,context_tokens,question_tokens,answer_tokens)

        yield batch

    return

In [21]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import logging
import code
import pickle
import os
from torch import autograd
from torch.autograd import Variable
from torch.nn import Embedding
from argparse import ArgumentParser


class DCN_Model(nn.Module):

    def __init__(self, hidden_dim, embedding_matrix, dropout_ratio, maxout_pool_size, max_number_of_iterations):
        super(DCN_Model, self).__init__()

        self.encoder = Word_Level_Encoder(hidden_dim, emb_matrix, dropout_ratio)
        self.coattention_encoder = Coattention_Encoder(hidden_dim, maxout_pool_size, embedding_matrix, max_number_of_iterations, dropout_ratio)
        self.decoder = Dynamic_Decoder(hidden_dim, maxout_pool_size, max_number_of_iterations, dropout_ratio)

    def forward(self, context_word_indexes, context_word_mask, question_word_indexes, question_word_mask,span_tensor):
        passage_representation = self.encoder.forward(context_word_indexes, context_word_mask)

        question_representation = self.encoder.forward(question_word_indexes, question_word_mask)
       

        U_matrix = self.coattention_encoder.forward(question_representation, passage_representation,context_word_mask)

        loss, index_start, index_end = self.decoder.forward(U_matrix, context_word_mask, span_tensor)

        return index_start, index_end, loss

In [22]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import pickle
import os


with open(r"E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\glove_word_embeddings.pkl", "rb") as input_file:
    emb_matrix = pickle.load(input_file)
    
names = ["validation_context","train_context","validation_question","train_question"]
data_dir = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\"



def get_pretrained_embedding(embedding_matrix):
    embedding = nn.Embedding(*embedding_matrix.shape)
    embedding.weight = nn.Parameter(torch.from_numpy(embedding_matrix).float())
    embedding.weight.requires_grad = False
    return embedding


class Word_Level_Encoder(nn.Module):
    
    def __init__(self, hidden_dim, embedding_matrix, dropout_ratio):
        super(Word_Level_Encoder, self).__init__()
        self.hidden_dim = hidden_dim

        self.embedding = get_pretrained_embedding(embedding_matrix)
        self.embedding_dim = self.embedding.embedding_dim

        # batch_first = True
        # Input: has a dimension of B x m x embedding_dim
        # Function parameters: input_size, hidden_size, num_layers_of_LSTM = 1(here)
        self.encoder = nn.LSTM(self.embedding_dim, self.hidden_dim, 1, batch_first=True,
                              bidirectional=False, dropout=dropout_ratio) 
                                     
#         self.dropout_emb = nn.Dropout(p=dropout_ratio)
        
        # creates a random vector with size= hidden_dim
        self.sentinel = nn.Parameter(torch.rand(hidden_dim,))
        
    def initHidden(self,batch_size):
        h0 = Variable(torch.zeros(1, batch_size, self.hidden_dim), requires_grad = False) # Initial hidden state
        c0 = Variable(torch.zeros(1, batch_size, self.hidden_dim), requires_grad = False) # Initial cell state
        return h0, c0


    def forward(self, word_sequence_indexes, word_sequence_mask):
        
        # stores length of per instance for context/question
        # tensor of size = B
        length_per_instance = torch.sum(word_sequence_mask, 1)


        initial_hidden_states = self.initHidden(len(length_per_instance))
        # returns the word_sequences_embeddings with the embeddings for each token/word from word_sequence_indexes
        # word_sequence_embeddings is a tensor of dimension of B x m x l
        word_sequence_embeddings = self.embedding(word_sequence_indexes)
        
        # All RNN modules accept packed sequences as inputs.
        # Input: word_sequence_embeddings has a dimension of B x m x l (l is the size of the glove_embedding/ pre-trained embedding/embedding_dim)
        packed_word_sequence_embeddings = pack_padded_sequence(word_sequence_embeddings,length_per_instance,batch_first=True,enforce_sorted=False)

        
        
        # nn.LSTM encoder gets an input of pack_padded_sequence of dimensions
        # since the input was a packed sequence, the output will also be a packed sequence
        output, _ = self.encoder(packed_word_sequence_embeddings,initial_hidden_states)
       
        
        # Pads a packed batch of variable length sequences.
        # It is an inverse operation to pack_padded_sequence().
        # dimension:  B x m x l
        output_to_LSTM_padded, _ = pad_packed_sequence(output, batch_first=True)
        
        

        # list() creates a list of elements if an iterable is passed
        # batch_size is a scalar which stores the value of batch size. (batch_size = B)
        batch_size, _ = list(word_sequence_mask.size())
        
        
        # dimension of sentinel matrix =  B x 1 x l (replicates or expands along given dimension)
        length_per_instance_new_dim = length_per_instance.unsqueeze(1).expand(batch_size, self.hidden_dim).unsqueeze(1)
        

        # sentinel to be concatenated to the data
        # dimension of sentinel_zero =  B x 1 x l
        sentinel_zero = torch.zeros(batch_size, 1, self.hidden_dim)
        
        # copy sentinel vector at the end
        # dimension of output_to_LSTM_padded_with_sentinel =  B x (m + 1) x l
        output_to_LSTM_padded_with_sentinel = torch.cat([output_to_LSTM_padded, sentinel_zero], 1)  
        
        
        
        return output_to_LSTM_padded_with_sentinel
    
    


In [23]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import pickle
import os

class Highway_Maxout_Network(nn.Module):
    def __init__(self, hidden_dim, maxout_pool_size, dropout_ratio):
        super(Highway_Maxout_Network, self).__init__()
        self.hidden_dim = hidden_dim # l
        self.maxout_pool_size = maxout_pool_size # p

        # Affine mapping from 5l ==> l
        self.r = nn.Linear(5 * hidden_dim, hidden_dim, bias=False) 
       

        # Affine mapping from 3*l ==> l*p
        self.max_out_layer1 = nn.Linear(3 * hidden_dim, hidden_dim*maxout_pool_size)
        
        # Affine mapping from l ==> l*p
        self.max_out_layer2 = nn.Linear(hidden_dim, hidden_dim*maxout_pool_size)
       
        # Affine mapping from 2*l ==> p
        self.max_out_layer3 = nn.Linear(2 * hidden_dim, maxout_pool_size)

        self.loss = nn.CrossEntropyLoss()
        
    
    def forward(self, h_i, U, curr_mask_vector, index_i_minus_1, u_concatenated, mask_matrix, target=None):
        batch_size, max_word_length , _ = list(U.size())

        # concatenation of ( h_i of dimension = b x l ; u_concatenated of dimension = b x 4l ) along dimension 1 = gives b x 5l
        # self.r(b x 5l) ====> b x l (change of vector space)
        r = torch.tanh(self.r(torch.cat((h_i.view(-1, self.hidden_dim), u_concatenated), 1)))  # b x 5l => b x l
       

        # hidden_dim = l
        r_expanded = r.unsqueeze(1).expand(batch_size, max_word_length, self.hidden_dim).contiguous()  # b x m x l

        m_t1_input = torch.cat((U, r_expanded), 2).view(-1, 3*self.hidden_dim)  # b*m x 3l

        m_t1_output = self.max_out_layer1(m_t1_input)  # b*m x p*l
        
        m_t1_output_resized, _ = m_t1_output.view(-1, self.hidden_dim, self.maxout_pool_size).max(2) # b*m x l

        # m_t2_input =  m_t1_output_resized
        m_t2_output = self.max_out_layer2(m_t1_output_resized)  # b*m x l*p
        
        m_t2_output_resized, _ = m_t2_output.view(-1, self.hidden_dim, self.maxout_pool_size).max(2)  # b*m x l

        m_t3_input = torch.cat((m_t1_output_resized, m_t2_output_resized), 1)  # b*m x 2l
        alpha1 = self.max_out_layer3(m_t3_input)  # b * m x p
        alpha2, _ = alpha1.max(1)  # b*m
        alpha3 = alpha2.view(-1, max_word_length) # b x m

#         print("alpha " + str(alpha3.size()) )
#         print("mask matrix " + str(mask_matrix.size()))
        alpha3 = alpha3 + mask_matrix  # b x m
#         print("alpha3")
#         print(alpha3)
        
        # alpha can be treated as probabilities that assign probability masses todifferent words in context. The word with
        # maximum weight(probability) becomes the index(start/end)
        alpha4 = F.softmax(alpha3, 1)  # b x m
        _, index_i = torch.max(alpha4, dim=1) # b

        if curr_mask_vector is None:
            curr_mask_vector = (index_i == index_i) # b
        else:
            index_i = index_i*curr_mask_vector.long()  # b
            index_i_minus_1 = index_i_minus_1*curr_mask_vector.long()  # b
            curr_mask_vector = (index_i != index_i_minus_1) # b

        step_loss = None
        
        

#         target[target < 0] = 0
        
        
        ## loss is only calculated only on that the predicted index at i_th time-step which varies 
        ## from the predicted index at time-step (i-1)_th time-step
#         print(target)
        if target is not None:
            step_loss = self.loss(alpha3, target)  # b
#             print("step_loss")
#             print(step_loss)
# #             step_loss1 = step_loss * curr_mask_vector.float() # b
#             print("step_loss1")
#             print(step_loss1)

        return index_i, curr_mask_vector, step_loss # all have dimension: b

class Dynamic_Decoder(nn.Module):
    def __init__(self, hidden_dim, maxout_pool_size, max_number_of_iterations, dropout_ratio):
        super(Dynamic_Decoder, self).__init__()
        self.max_number_of_iterations = max_number_of_iterations
        
        self.hidden_dim = hidden_dim
        # batch_first = True
        # Input: has a dimension of B * m * embedding_dim
        # Function parameters: input_size, hidden_size, num_layers_of_LSTM = 1(here)
        self.decoder = nn.LSTM(4 * hidden_dim, hidden_dim, 1, batch_first=True, bidirectional=False)

        self.maxout_start = Highway_Maxout_Network(hidden_dim, maxout_pool_size, dropout_ratio)
        self.maxout_end = Highway_Maxout_Network(hidden_dim, maxout_pool_size, dropout_ratio)
        
    def initHidden(self,batch_size):
        h0 = Variable(torch.zeros(1, batch_size, self.hidden_dim), requires_grad = False) # Initial hidden state
        c0 = Variable(torch.zeros(1, batch_size, self.hidden_dim), requires_grad = False) # Initial cell state
        return h0, c0


    def forward(self, U, document_word_sequence_mask,span_tensor):
        batch_size, max_word_length, _ = list(U.size()) # U has dimension : B x m x 2l

        curr_mask_start,  curr_mask_end = None, None
        results_mask_start, results_start = [], []
        results_mask_end, results_end = [], []
        step_losses = []
        

        # dimension = B x m
        mask_matrix = (1.0 - document_word_sequence_mask.float()) * (-1e30)
        
        # dimension = B
        indices = torch.arange(0, batch_size)

        
        # initialize start_i_minus_1, end_i_minus_1: these are the initial values of start and end indices
        # start_i_minus_1 = the first index for the context/question 
        # end_i_minus_1 = the last index for the context/question 
        
        # dimension = B
        start_i_minus_1 = torch.zeros(batch_size).long()
        
        # dimension = B
        end_i_minus_1 = torch.sum(document_word_sequence_mask, 1) - 1

        

        # After every iteration the hidden and current state 
        # at t = length of the sequence (for the one-directional lstm) will
        # be returned by the lstm
        # the hidden_state_i(h_i) will serve as an input to next lstm
        hidden_and_current_state_i = self.initHidden(batch_size)
        start_target = None
        end_target = None
        
        # this sets the start and end target (ie. the y_label) for an answer
        if span_tensor is not None:
            # Dimension = B
            start_target = span_tensor[:,0]
            
            
            # Dimension = B
            end_target = span_tensor[:,1]
            
    
            
        # this is just an initialization of u_start_i_minus_1
        # u_start_i_minus_1 is essentially u_start_zero outside the loop
        u_start_i_minus_1 = U[indices, start_i_minus_1, :]  # B x 2l
        
        # Why do we need an iterative procedure to predict the start and end indices for an answer ? 
        # Solution: there may exist several intuitive answer spans within the document, each corresponding to a
        # local maxima. An iterative technique to select an answer span by alternating between
        # predicting the start point and predicting the end point. This iterative procedure allows the model to
        # recover from initial local maxima corresponding to incorrect answer spans.
        for _ in range(self.max_number_of_iterations):
            u_end_i_minus_1 = U[indices, end_i_minus_1, :]  # B x 2l
            
            # u_concatenated is fed to the lstm
            u_concatenated = torch.cat((u_start_i_minus_1, u_end_i_minus_1), 1)  # B x 4l
            
        

            # the hidden_and_current_state_i = h_i,c_i are essentially hidden and current cell states 
            # for t = length of the sequence (for the one-directional lstm) after every iteration
            # u_concatenated.unsqueeze(1) has a dimension : B x 1 x 4l
            lstm_output, hidden_and_current_state_i = self.decoder(u_concatenated.unsqueeze(1), hidden_and_current_state_i)
            
            # h_i has dimension = 1 x B x l
            # c_i has dimension = 1 x B x l
            h_i, c_i = hidden_and_current_state_i
            
            

            # Inputs to the Highway_Maxout_Network(to find start index) are: hidden_state_i(h_i), start_i_minus_1(index), u_concatenated ==>(u_start_i_minus_1;u_end_i_minus_1) 
            start_i_minus_1, curr_mask_start, step_loss_start = self.maxout_start(h_i, U, curr_mask_start, start_i_minus_1,
                                                                u_concatenated, mask_matrix, start_target)
            
            
            
            u_start_i_minus_1 = U[indices, start_i_minus_1, :]  # B x 2l

            u_concatenated = torch.cat((u_start_i_minus_1, u_end_i_minus_1), 1)  # b x 4l

            # Inputs to the Highway_Maxout_Network(to find end index) are: hidden_state_i(h_i), end_i_minus_1(index), u_concatenated ==>(u_start_i_minus_1;u_end_i_minus_1) 
            end_i_minus_1, curr_mask_end, step_loss_end = self.maxout_end(h_i, U, curr_mask_end, end_i_minus_1,
                                                              u_concatenated, mask_matrix, end_target)

            # we minimize the cumulative softmax cross entropy of the start and end points across all iterations.
            if span_tensor is not None:
                step_loss = step_loss_start + step_loss_end
#                 print(step_loss)
                step_losses.append(step_loss)
            
            results_mask_start.append(curr_mask_start) # appends all the curr_mask_start ==> dimension: num_iterations x B
            results_start.append(start_i_minus_1) # appends all the start_indexes ==> dimension: num_iterations x B
            results_mask_end.append(curr_mask_end) # appends all the curr_mask_end ==> dimension: num_iterations x B
            results_end.append(end_i_minus_1) # appends all the end_indexes ==> dimension: num_iterations x B

        
        
        # Dimension = B
        result_pos_start1 = torch.sum(torch.stack(results_mask_start, 1), 1).long()
        result_pos_start = result_pos_start1 - 1
        
        # Dimension = B
        index_start = torch.gather(torch.stack(results_start, 1), 1, result_pos_start.unsqueeze(1)).squeeze()

        # Dimension = B
        result_pos_end1 = torch.sum(torch.stack(results_mask_end, 1), 1).long()
        result_pos_end = result_pos_end1 - 1
        
        # Dimension = B
        index_end = torch.gather(torch.stack(results_end, 1), 1, result_pos_end.unsqueeze(1)).squeeze()

        loss = None

#         print("step_losses")
#         print(sum(step_losses))
        if span_tensor is not None:
            # step losses has dimension = num_iterations x B
            sum_losses = sum(step_losses)
            batch_avg_loss = sum_losses / self.max_number_of_iterations
            loss = batch_avg_loss

            
        return loss, index_start, index_end



In [24]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import pickle
import os


class Coattention_Encoder(nn.Module):
    def __init__(self, hidden_dim, maxout_pool_size, embedding_matrix, max_number_of_iterations, dropout_ratio):
        super(Coattention_Encoder, self).__init__()
        self.hidden_dim = hidden_dim

        ## nn.Linear(input_dim, output_dim)
        # Affine mapping from l ==> l
        self.question_proj = nn.Linear(hidden_dim, hidden_dim)
        
        self.fusion_bilstm = Fusion_BiLSTM(hidden_dim, dropout_ratio)
#         self.dropout = nn.Dropout(p=dropout_ratio)

    def forward(self, question_representation, context_representation,document_word_sequence_mask):
        
        ############## m = max length of instances in one batch of document ;  n= max length of instances in one batch of question ############################33
        Q = question_representation # B x (n + 1) x l
        D = context_representation  # B x (m + 1) x l
        
#         print("question_representation.(Output to Encoder Layer) ==  " + str(Q.size()))
#         print("context_representation. (Output to Encoder Layer)  ==  " + str(D.size()))

        # view function is meant to reshape the tensor.(Similar to reshape function in numpy)
        # view( row_size = -1 ,means that number of rows are unknown, column_size)
        # pass the Q tensor through a non-linearity 
        Q2 = torch.tanh(self.question_proj(Q.view(-1, self.hidden_dim))).view(Q.size()) #B x (n + 1) x l

        ##################################   Co-Attention starts here  #######################################
        
        ########################################   Step - 1  ##################################################
        # transpose(tensor, first_dimension to be transposed, second_dimension to be transposed)
        Q_transpose = torch.transpose(Q2, 1, 2) #dimension: B x l x (n + 1)
        
        # Performs a batch matrix-matrix product of matrices stored in batch1 and batch2.
        # batch1 and batch2 must be 3-D tensors each containing the same number of matrices.
        L = torch.bmm(D, Q_transpose) # dimension of L : B x (m + 1) x (n + 1)

        ####################################### Step-2 ######################################################
        A_Q = F.softmax(L, dim=2) # B x (m + 1) x (n + 1)


        D_transpose = torch.transpose(D, 1, 2) #dimension: B x l x (m + 1)
        C_Q = torch.bmm(D_transpose, A_Q) # (B x l x (m + 1)) x (B x (m + 1) x (n + 1)) => B x l x (n + 1)

        ####################################### Step-3 #######################################################
        L_tranpose = torch.transpose(L,1,2)
        A_D = F.softmax(L_tranpose, dim=2)  # B x (n + 1) x (m + 1)
        
        
        # concatenation along dimension=1:(B x l x (n + 1) ; B x l x (n + 1)  -----> B x 2l x (n + 1) ) x (B x (n + 1) x (m + 1)) ====> B x 2l x (m + 1)
        C_D = torch.bmm(torch.cat((Q_transpose, C_Q), 1), A_D) # B x 2l x (m + 1)
        C_D_transpose = torch.transpose(C_D, 1, 2)  # B x (m + 1) x 2l

        
        #######################################  Step-4 ##########################################################
        #fusion BiLSTM
        # concatenation along dimension = 2:  (B x (m + 1) x 2l ; B x (m + 1) x l  -----> B x (m + 1) x 3l )
        bi_lstm_input = torch.cat((C_D_transpose, D), 2) # B x (m + 1) x 3l
       
        U = self.fusion_bilstm(bi_lstm_input, document_word_sequence_mask) # B x m x 2l
        
#         print("size of U.(U is output of Co-attention encoder) ==  " + str(U.size()))
        
        return U


class Fusion_BiLSTM(nn.Module):
    def __init__(self, hidden_dim, dropout_ratio):
        super(Fusion_BiLSTM, self).__init__()
        
        self.hidden_dim = hidden_dim
         # batch_first = True
        # Input: has a dimension of B * m * embedding_dim
        # Function parameters: input_size, hidden_size, num_layers_of_LSTM = 1(here)
        self.fusion_bilstm = nn.LSTM(3 * hidden_dim, hidden_dim, 1, batch_first=True,
                                     bidirectional=True, dropout=dropout_ratio)
        
#         self.dropout = nn.Dropout(p=dropout_ratio)

    def initHidden(self,batch_size):
        h0 = Variable(torch.zeros(2, batch_size, self.hidden_dim), requires_grad = False) # Initial hidden state
        c0 = Variable(torch.zeros(2, batch_size, self.hidden_dim), requires_grad = False) # Initial cell state
        return h0, c0

    def forward(self, word_sequence_embeddings, word_sequence_mask):
        
        # stores length of per instance for context/question
        length_per_instance = torch.sum(word_sequence_mask, 1)
        
        initial_hidden_states = self.initHidden(len(length_per_instance))
      
        # All RNN modules accept packed sequences as inputs.
        # Input: word_sequence_embeddings has a dimension of B x m+1 x 3l (l is the size of the glove_embedding/ pre-trained embedding/embedding_dim)
        packed_word_sequence_embeddings = pack_padded_sequence(word_sequence_embeddings, length_per_instance, batch_first=True,enforce_sorted=False)
        
        # since the input was a packed sequence, the output will also be a packed sequence
        output, _ = self.fusion_bilstm(packed_word_sequence_embeddings,initial_hidden_states)
        
        # Pads a packed batch of variable length sequences.
        # It is an inverse operation to pack_padded_sequence().
        # dimension:  B x m x 2l
        output_to_BiLSTM_padded, _ = pad_packed_sequence(output, batch_first=True)


        return output_to_BiLSTM_padded

In [25]:
import os

class Config(object):
    pass

config = Config()
config.data_dir = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\"
config.word_embedding_size = 100
config.hidden_dim = 300
config.dropout_ratio = 0.15
config.max_context_length = 600
config.max_question_length = 30
config.print_and_validate_every = 2

#vector with zeros for unknown words
config.num_iterations = 2
config.maxout_pool_size=16

config.lr = 0.001
config.dropout_ratio = 0.15
config.early_stop = 10
config.vocab_size = 50000

config.max_grad_norm = 5.0
config.batch_size = 20
config.num_epochs = 2
config.model_dir = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dynamic_Coattention_Networks\\Models\\saved_models"

# config.print_every = 100
# config.save_every = 50000000
# config.eval_every = 1000

# config.model_type = 'co-attention'
config.reg_lambda = 0.00007
config.names = ["train_context","train_question"]
config.print_every = 100

In [None]:
FINAL TO_DOs
1)keep a cap on vocabulary(50k), initialize other words randomly
2)integrate the code to pickle vocab and answer/context/question tokens/indices in the end-to-end model
3) break train file into train and validation, and use validation file as test file