The dataset iterator file creates batches of training data and yields a batch: ("yields" returns a generator element where "generators" are iterables which can loop over its elements only once and then they are destroyed from memory.)

In [148]:

from __future__ import absolute_import
from __future__ import division

import torch
import random
import re
import time
import os
import pickle
import tqdm as tqdm

import numpy as np
from six.moves import xrange
# from data_util.vocab import PAD_ID, UNK_ID

"""This file contains code to read tokenized data from file,
truncate, pad and process it into batches ready for training"""




class Batch(object):
    """A class to hold the information needed for a training batch"""
# Batch(names,context_word_index_padded, context_word_mask, question_word_index_padded, question_word_mask, answer_start, answer_end)
    def __init__(self,names, context_word_index_padded, context_word_mask, question_word_index_padded, question_word_mask, answer_start, answer_end):
        """
        Inputs:
          {context/qn}_ids: Numpy arrays.
            Shape (batch_size, {context_len/question_len}). Contains padding.
          {context/qn}_mask: Numpy arrays, same shape as _ids.
            Contains 1s where there is real data, 0s where there is padding.
          {context/qn/ans}_tokens: Lists length batch_size, containing lists (unpadded) of tokens (strings)
          ans_span: numpy array, shape (batch_size, 2)
          uuid: a list (length batch_size) of strings.
            Not needed for training. Used by official_eval mode.
        """
        self.names = names
        self.context_word_index_padded = context_word_index_padded
        self.context_word_mask = context_word_mask
#         self.context_tokens = context_tokens

        self.question_word_index_padded = question_word_index_padded
        self.question_word_mask = question_word_mask
#         self.qn_tokens = qn_tokens

        self.answer_start = answer_start
        self.answer_end = answer_end

#         self.uuids = uuids

        self.batch_size = len(self.context_word_index_padded)




def refill_batches(batches,batch_size,names, max_context_length, max_question_length,context_word_index_padded,question_word_index_padded, answer_start, answer_end ):
#     refill_batches(batches, word_to_index, context_file, qn_file, ans_file, batch_size, context_len, question_len, discard_long):
    
    """
    
    Adds more batches into the "batches" list.
    Inputs:
      batches: list to add batches to
     
      names: list containing strings of file names ["train_context","train_question"] or ["validation_context","validation_question"]
      data_dir : paths to {train/dev}.{context/question/answer} data files
      batch_size: integer ==> how big to make the batches
      max_context_length, max_question_length: max length of context and question respectively
      
    """
    print ("Refilling batches...")
    tic = time.time()
    examples = [] # list of (qn_ids, context_ids, ans_span, ans_tokens) triples
    
   

    
    
    
#     context_line, qn_line, ans_line = context_file.readline(), qn_file.readline(), ans_file.readline() # read the next line from each

    while True: # while you haven't reached the end
        
        # add to examples
        examples.append((context_word_index_padded, question_word_index_padded, answer_start, answer_end))

        # stop refilling if you have 1 batch : change it later 
        ################## add number of batches you need ###########################33
        if len(examples) == batch_size * 1 :
            break



    # Make into batches and append to the list batches
    for batch_start in xrange(0, len(examples), batch_size):

        # Note: each of these is a list length batch_size of lists of ints (except on last iter when it might be less than batch_size)
        context_word_index_padded, question_word_index_padded, answer_start, answer_end = zip(*examples[batch_start:batch_start+batch_size])

        batches.append((context_word_index_padded, question_word_index_padded, answer_start, answer_end))

    # shuffle the batches
    random.shuffle(batches)

    toc = time.time()
    print ("Refilling batches took %.2f seconds" % (toc-tic))
    return


def get_batch_generator(data_dir, names, batch_size, max_context_length, max_question_length,context_word_index_padded,question_word_index_padded,answer_start, answer_end):
    """
    This function returns a generator object that yields batches.
    The last batch in the dataset will be a partial batch.
    Read this to understand generators and the yield keyword in Python: https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do
    Inputs:
      names: list containing strings of file names = ["train_context","train_question"] or ["validation_context","validation_question"]
      data_dir : paths to {train/dev}.{context/question/answer} data files
      batch_size: integer ==> how big to make the batches
      max_context_length, max_question_length: max length of context and question respectively
     
    """
    
#     with open(r"E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\glove_word_embeddings.pkl", "rb") as input_file:
#         emb_matrix = pickle.load(input_file)
#     with open(data_dir + "//" + "answer_end_pkl.pkl", "rb") as input_file:
#         answer_end_pkl = pickle.load(input_file)
#     with open(data_dir + "//" + "answer_start_pkl.pkl", "rb") as input_file:
#         answer_start_pkl = pickle.load(input_file)

#     answer_end = torch.from_numpy(np.array([int(i) for i in answer_end_pkl])).int()
#     answer_start = torch.from_numpy(np.array([int(i) for i in answer_start_pkl])).int()              
   
    
    batches = []

    while True:
        if len(batches) == 0: # add more batches
            refill_batches(batches,batch_size,names, max_context_length, max_question_length,context_word_index_padded,question_word_index_padded,answer_start, answer_end)
        if len(batches) == 0:
            break

        # Get next batch. These are all lists length batch_size
        (context_word_index_padded, question_word_index_padded, answer_start, answer_end) = batches.pop(0)

#         # Pad context_ids and qn_ids
#         qn_ids = padded(qn_ids, question_len) # pad questions to length question_len
#         context_ids = padded(context_ids, context_len) # pad contexts to length context_len

        # Make qn_ids into a np array and create qn_mask
#         qn_ids = np.array(qn_ids) # shape (batch_size, question_len)
#         qn_mask = (qn_ids != PAD_ID).astype(np.int32) # shape (batch_size, question_len)

#         # Make context_ids into a np array and create context_mask
#         context_ids = np.array(context_ids) # shape (batch_size, context_len)
#         context_mask = (context_ids != PAD_ID).astype(np.int32) # shape (batch_size, context_len)


        print(context_word_index_padded[0])
        print((context_word_index_padded[0] != 0).type(torch.int32) )
        context_word_mask = int(context_word_index_padded != 0)
#     .int()
#     .astype(torch.int32) 
        question_word_mask = int(question_word_index_padded != 0)
#     .int()
#     astype(torch.int32)
        # Make ans_span into a np array
#         ans_span = np.array(ans_span) # shape (batch_size, 2)

        # Make into a Batch object
        batch = Batch(names,context_word_index_padded, context_word_mask, question_word_index_padded, question_word_mask, answer_start, answer_end)

        yield batch

    return

In [149]:

answers = ["train.answer_start", "train.answer_end"]
answers_indexes =[os.path.join(data_dir + answer )  for answer in answers ]
with open(answers_indexes[0], "r" ,encoding="utf-8") as input_file:
    answer_start = input_file
    lines  = []
    for line in answer_start:
        lines.append(line)
    write_path_train_word = os.path.join(data_dir, "answer_start" + "_pkl.pkl")
    write_file_train_word = open(write_path_train_word, "wb")
    pickle.dump(lines, write_file_train_word)
with open(answers_indexes[1], "r" ,encoding="utf-8") as input_file:
    answer_start = input_file
    lines  = []
    for line in answer_start:
        lines.append(line)
    write_path_train_word = os.path.join(data_dir, "answer_end" + "_pkl.pkl")
    write_file_train_word = open(write_path_train_word, "wb")
    pickle.dump(lines, write_file_train_word)

In [151]:
word_index_padded =[os.path.join(data_dir + name + "_word_index_padded.pkl")  for name in names ]
with open(word_index_padded[0], "rb") as input_file:
    context_word_index_padded = pickle.load(input_file)
with open(word_index_padded[1], "rb") as input_file:
    question_word_index_padded = pickle.load(input_file)

with open(data_dir + "//" + "answer_end_pkl.pkl", "rb") as input_file:
    answer_end_pkl = pickle.load(input_file)
with open(data_dir + "//" + "answer_start_pkl.pkl", "rb") as input_file:
    answer_start_pkl = pickle.load(input_file)
                
answer_end = torch.from_numpy(np.array([int(i) for i in answer_end_pkl])).int()
answer_start = torch.from_numpy(np.array([int(i) for i in answer_start_pkl])).int()              

In [128]:
answer_end.size()

torch.Size([130319])

In [153]:
a = get_batch_generator(data_dir, names, batch_size, max_context_length, max_question_length,context_word_index_padded,question_word_index_padded,answer_start, answer_end)
for i in a:
    print(i.batch_size)

Refilling batches...
Refilling batches took 0.00 seconds
tensor([[  702, 26517, 38840,  ...,     0,     0,     0],
        [  702, 26517, 38840,  ...,     0,     0,     0],
        [  702, 26517, 38840,  ...,     0,     0,     0],
        ...,
        [    2,   213,    23,  ...,     0,     0,     0],
        [    2,   213,    23,  ...,     0,     0,     0],
        [    2,   213,    23,  ...,     0,     0,     0]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], dtype=torch.int32)
7
Refilling batches...
Refilling batches took 0.00 seconds
(tensor([[  702, 26517, 38840,  ...,     0,     0,     0],
        [  702, 26517, 38840,  ...,     0,     0,     0],
        [  702, 26517, 38840,  ...,     0,     0,     0],
        ...,
        [    2,   213,    23,  ...,     0,     0,     0],
        [    2,   213,    23,  ...,    

AttributeError: 'bool' object has no attribute 'type'

In [119]:
x = torch.Tensor([1, 2, 3])
r = torch.unsqueeze(x, 0)  
print(r.size())# Size: 1x3
r = torch.unsqueeze(x, 1)    
print(r.size())# Size: 1x3


torch.Size([1, 3])
torch.Size([3, 1])


In [117]:
torch.Tensor([1, 2, 3]).size()

torch.Size([3])

In [135]:
with open(data_dir + "//" + "answer_end_pkl.pkl", "rb") as input_file:
    answer_end_pkl = pickle.load(input_file)
with open(data_dir + "//" + "answer_start_pkl.pkl", "rb") as input_file:
    answer_start_pkl = pickle.load(input_file)
                
answer_end = torch.from_numpy(np.array([int(i) for i in answer_end_pkl])).int()
answer_start = torch.from_numpy(np.array([int(i) for i in answer_start_pkl])).int()              

In [136]:
answer_end = torch.unsqueeze(answer_end, 1)    
answer_start = torch.unsqueeze(answer_start, 1)
print(answer_start.size())
span = torch.cat((answer_start,answer_end),1)

torch.Size([130319, 1])


In [142]:
span[:,0]

tensor([39, 28, 83,  ..., -1, -1, -1], dtype=torch.int32)

In [204]:



from __future__ import absolute_import
from __future__ import division

import torch
import random
import re
import time
import os
import pickle
import tqdm as tqdm

import numpy as np
from six.moves import xrange

class Batch():
    """A class to hold the information needed for a training batch"""
    def __init__(self,names,context_word_index_padded, context_word_mask, question_word_index_padded, question_word_mask, span_tensor):
        
        self.names = names
        self.context_word_index_padded = context_word_index_padded
        self.context_word_mask = context_word_mask


        self.question_word_index_padded = question_word_index_padded
        self.question_word_mask = question_word_mask
        self.span_tensor = span_tensor
        self.batch_size = len(self.context_word_index_padded)





def refill_batches(batches,batch_size,names, max_context_length, max_question_length,context_word_index_padded,question_word_index_padded,span_tensor):

    """

    Adds more batches into the "batches" list.
    Inputs:
      batches: list to add batches to

      names: list containing strings of file names ["train_context","train_question"] or ["validation_context","validation_question"]
      data_dir : paths to {train/dev}.{context/question/answer} data files
      batch_size: integer ==> how big to make the batches
      max_context_length, max_question_length: max length of context and question respectively

    """
    print ("Refilling batches...")
    tic = time.time()
    examples = [] 



        # add to examples
    examples.append((context_word_index_padded, question_word_index_padded, span_tensor))

        
        

    # Make into batches and append to the list batches
    for batch_start in xrange(0, len(examples[0][0]), batch_size):

        # Note: each of these is a list length batch_size of lists of ints (except on last iter when it might be less than batch_size)
        context_word_index_padded = examples[0][0][batch_start:batch_start+batch_size]
        question_word_index_padded = examples[0][1][batch_start:batch_start+batch_size]
        span_tensor = examples[0][2][batch_start:batch_start+batch_size]

        batches.append((context_word_index_padded, question_word_index_padded,span_tensor))


    

    # shuffle the batches
    random.shuffle(batches)

    toc = time.time()
    print ("Refilling batches took %.2f seconds" % (toc-tic))
    return batches


def get_batch_generator(data_dir, names, batch_size, max_context_length, max_question_length):
    """
    This function returns a generator object that yields batches.
    The last batch in the dataset will be a partial batch.
    Read this to understand generators and the yield keyword in Python: https://stackoverflow.com/questions/231767/what-does-the-yield-keyword-do
    Inputs:
      names: list containing strings of file names = ["train_context","train_question"] or ["validation_context","validation_question"]
      data_dir : paths to {train/dev}.{context/question/answer} data files
      batch_size: integer ==> how big to make the batches
      max_context_length, max_question_length: max length of context and question respectively

    """
    word_index_padded =[os.path.join(data_dir + name + "_word_index_padded.pkl")  for name in names ]
    with open(word_index_padded[0], "rb") as input_file:
        context_word_index_padded = pickle.load(input_file)
    with open(word_index_padded[1], "rb") as input_file:
        question_word_index_padded = pickle.load(input_file)

    with open(data_dir + "//" + "answer_end_pkl.pkl", "rb") as input_file:
        answer_end_pkl = pickle.load(input_file)
    with open(data_dir + "//" + "answer_start_pkl.pkl", "rb") as input_file:
        answer_start_pkl = pickle.load(input_file)

    context_word_index_padded = context_word_index_padded[10:33]
    question_word_index_padded = question_word_index_padded[10:33]

    answer_end = torch.from_numpy(np.array([int(i) for i in answer_end_pkl])).long()
    answer_start = torch.from_numpy(np.array([int(i) for i in answer_start_pkl])).long()              
    answer_start = torch.unsqueeze(answer_start, 1)
    answer_end = torch.unsqueeze(answer_end, 1)

    span_tensor = torch.cat((answer_start, answer_end), 1)
    span_tensor = span_tensor[10:33]



    batches = []
    count = 0

    while (True):
        count = count + 1
        if len(batches) == 0: # add more batches
            if(count > 2):
                break
            batches = refill_batches(batches,batch_size,names, max_context_length, max_question_length,context_word_index_padded,question_word_index_padded,span_tensor)
        if len(batches) == 0:
            break

        # Get next batch. These are all lists length batch_size
        (context_word_index_padded_per_batch, question_word_index_padded_per_batch, span_tensor_per_batch) = batches.pop(0)

        if(len(context_word_index_padded_per_batch) == 0):
            break
            
        context_ids = np.array(context_word_index_padded_per_batch[0]) # shape (batch_size, context_len)
        context_mask = (context_ids != 0).astype(np.int32) # shape (batch_size, context_len)
        context_word_mask = torch.from_numpy(context_mask)

        question_ids = np.array(question_word_index_padded_per_batch[0]) # shape (batch_size, context_len)
        question_mask = (question_ids != 0).astype(np.int32) # shape (batch_size, context_len)
        question_word_mask = torch.from_numpy(question_mask)

        # Make into a Batch object
        batch = Batch(names,context_word_index_padded_per_batch, context_word_mask, question_word_index_padded_per_batch, question_word_mask, span_tensor_per_batch)

        yield batch

    return

In [205]:
names = ["train_context","train_question"]
data_dir = "E:\\Internships_19\\Internship(Summer_19)\\Q&A_Toolkit\\Dataset_analysis\\SQuAD\\"
max_context_length = 400
max_question_length = 30
batch_size = 5
# object = Batch(names)
for batch in get_batch_generator(data_dir, names, batch_size, max_context_length, max_question_length):
    print(batch.batch_size)
    print(batch.span_tensor)

Refilling batches...
Refilling batches took 0.00 seconds
5
tensor([[30, 30],
        [54, 55],
        [87, 87],
        [37, 37],
        [91, 91]])
5
tensor([[26, 26],
        [62, 63],
        [ 7,  8],
        [15, 15],
        [37, 37]])
3
tensor([[117, 117],
        [163, 163],
        [  3,   4]])
5
tensor([[79, 81],
        [83, 83],
        [92, 92],
        [44, 45],
        [79, 81]])
5
tensor([[ 54,  55],
        [ 71,  72],
        [ 14,  17],
        [154, 154],
        [163, 163]])
