In [1]:
from collections import Counter
import string
import re
import argparse
import json
import sys
import numpy as np
import nltk
import random
import math
import os
import pickle
from tqdm import tqdm

In [2]:
# Run configuration for the preprocessing notebook.
# TRAINING selects which split is processed: True -> train split (vocabularies
# and the GloVe matrix are built), False -> dev split (saved vocabularies are loaded).
TRAINING = False

# Directories for the input pickle and for everything this notebook writes.
in_pkl_path = "./"
out_pkl_path = "./"
word_vocab_pkl_name = "word_vocabulary.pkl"
char_vocab_pkl_name = "char_vocabulary.pkl"

if TRAINING:
    in_pkl_name = "dataset_formatted_train.pkl"
    out_pkl_name = "preprocessed_train.pkl"
    # The GloVe location is machine-specific: let the GLOVE_FILE_PATH environment
    # variable override it, keeping the original path as the default.
    GLOVE_FILE_PATH = os.environ.get('GLOVE_FILE_PATH',
                                     '/home/bhargav/data/glove/glove.840B.300d.txt')
    EMBEDDING_SIZE = 300
    GLOVE_STORE = './precomputed_glove.npy'
    use_fraction = 0.5
else:
    in_pkl_name = "dataset_formatted_dev.pkl"
    use_fraction = 1
    out_pkl_name = "preprocessed_dev.pkl"

# Special tokens used for padding and for out-of-vocabulary items.
pad_symbol = '<pad>'
unk_symbol = '<unk>'

In [3]:
def pickler(path,pkl_name,obj):
    """Pickle ``obj`` into the file ``pkl_name`` located in directory ``path``.

    The file is opened in binary write mode (an existing file is overwritten)
    and the highest available pickle protocol is used.
    """
    target_file = os.path.join(path, pkl_name)
    with open(target_file, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def unpickler(path,pkl_name):
    """Load and return the object pickled in file ``pkl_name`` inside ``path``.

    NOTE: ``pickle.load`` can execute arbitrary code while loading; only use
    this on files that come from a trusted source.
    """
    source_file = os.path.join(path, pkl_name)
    with open(source_file, 'rb') as handle:
        return pickle.load(handle)

In [4]:
class Vocabulary:
    """Frequency-based mapping between tokens and contiguous integer ids.

    Typical use: call ``fit`` one or more times to accumulate counts, call
    ``freeze_vocab`` once to assign ids, then use ``transform_sent`` /
    ``batch_transform`` to convert tokens to ids.

    Parameters
    ----------
    is_padded : bool
        True when the fitted text already contains the padding symbol. In that
        case the padding symbol is ranked by frequency like any other token;
        otherwise it is appended after the regular tokens.
    unk : str
        Symbol used for out-of-vocabulary tokens; it always receives the last id.
    pad : str
        Padding symbol.
    """

    def __init__(self, is_padded = False, unk = '<unk>', pad='<pad>'):
        self.vocab = Counter()
        self.word_to_id = {}
        self.id_to_word = {}
        self.min_word_count = 2
        self.unk = unk
        self.pad = pad
        self.is_padded = is_padded

    def fit(self,text):
        """Add the tokens of every sequence in ``text`` to the running counts."""
        for sent in text:
            self.vocab.update(sent)

    def _register(self, token):
        """Give ``token`` the next free id unless it already has one."""
        if token in self.word_to_id:
            return
        new_id = len(self.word_to_id)
        self.word_to_id[token] = new_id
        self.id_to_word[new_id] = token

    def freeze_vocab(self, min_word_count = 5):
        """Assign ids to every token seen at least ``min_word_count`` times.

        Ids follow descending frequency. The mappings are rebuilt from scratch
        on every call, and the resulting ids are always the contiguous range
        ``0 .. len(word_to_id) - 1`` with ``unk`` last.
        """
        self.min_word_count = min_word_count
        # Start clean so that a second call cannot leave stale entries behind.
        self.word_to_id = {}
        self.id_to_word = {}

        # Special symbols that are appended explicitly must not also be ranked
        # by frequency: re-assigning an existing token a new id would leave a
        # hole in the id range and push the new id past len(word_to_id) - 1.
        reserved = {self.unk}
        if not self.is_padded:
            reserved.add(self.pad)

        sorted_counts = sorted(self.vocab.items(), key=lambda x: x[1], reverse = True)
        for token, count in sorted_counts:
            if count >= self.min_word_count and token not in reserved:
                self._register(token)

        # Unpadded text never contains the pad symbol, and padded text may have
        # too few pads to pass the threshold; either way pad must get an id.
        self._register(self.pad)
        self._register(self.unk)

    def transform_sent(self, text):
        """Return the ids of the tokens in ``text``; unknown tokens map to the ``unk`` id."""
        return [self.word_to_id.get(item, self.word_to_id[self.unk]) for item in text]

    def batch_transform(self, text_list):
        """Apply ``transform_sent`` to every sequence in ``text_list``."""
        return [self.transform_sent(text) for text in text_list]
        

In [5]:
# Note: both helpers slice (copy) every list before padding it, so list inputs are not modified in place.

def pad_context(passages, max_passage_len, max_sent_len, pad_symbol = '<pad>'):
    """Bring every passage to ``max_passage_len`` sentences of ``max_sent_len`` tokens.

    Passages and sentences that are too long are truncated; short passages are
    completed with all-padding sentences and short sentences with ``pad_symbol``.

    Returns a pair ``(padded_passages, lengths)`` where ``lengths[i][j]`` is the
    number of tokens kept from sentence ``j`` of passage ``i``. The recorded
    length is 0 for an empty sentence and for any sentence whose first token is
    ``pad_symbol``.
    """
    padded_passages = []
    length_table = []
    for passage in passages:
        kept_sents = passage[:max_passage_len]
        # Complete short passages with sentences made only of padding.
        missing = max_passage_len - len(kept_sents)
        kept_sents = kept_sents + [[pad_symbol] * max_sent_len for _ in range(missing)]

        rows = []
        row_lengths = []
        for sent in kept_sents:
            clipped = sent[:max_sent_len]
            starts_with_pad = len(clipped) > 0 and clipped[0] == pad_symbol
            row_lengths.append(0 if starts_with_pad else len(clipped))
            rows.append(clipped + [pad_symbol] * (max_sent_len - len(clipped)))

        padded_passages.append(rows)
        length_table.append(row_lengths)
    return padded_passages, length_table
        

        
def pad_questions(questions, max_question_len, pad_symbol = '<pad>'):
    """Truncate or right-pad every sequence in ``questions`` to ``max_question_len`` items.

    Returns a new list of new lists; ``pad_symbol`` fills the missing positions.
    """
    def fit_to_length(question):
        clipped = question[:max_question_len]
        return clipped + [pad_symbol] * (max_question_len - len(clipped))

    return [fit_to_length(question) for question in questions]

In [6]:
# Load the formatted dataset pickle selected in the configuration cell.
dataset_original = unpickler(in_pkl_path,in_pkl_name)

In [7]:
# Number of entries stored under each key of the dataset.
# NOTE(review): the recorded output shows 7984 questions / answer_spans but only
# 7983 answer_sentences / data_points -- confirm these lists are aligned.
print(len(dataset_original['passages']))
print(len(dataset_original['questions']))
print(len(dataset_original['answer_spans']))
print(len(dataset_original['answer_sentences']))
print(len(dataset_original['data_points']))

500
7984
7984
7983
7983


In [8]:
# Length of every question as stored, i.e. before truncation/padding (not capped at max_question_len).
unpadded_question_lengths = [len(q) for q in dataset_original['questions']]

In [9]:
# Length of every answer span as stored, i.e. before truncation/padding (not capped at max_answer_len).
unpadded_answer_lengths = [len(q) for q in dataset_original['answer_spans']]

In [10]:
# Length statistics used to choose the padding sizes below:
# para_len holds the number of sentences in each passage,
# sent_len holds the length of every sentence of every passage.
para_len = np.array([len(passage) for passage in dataset_original['passages']])
sent_len = np.array([len(sentence)
                     for passage in dataset_original['passages']
                     for sentence in passage])

print("=====Paragraphs=====")
print("Avg para len:{}".format(para_len.mean()))
print("min para len:{}".format(para_len.min()))
print("max para len:{}".format(para_len.max()))

print("=====Sentences=====")
print("Avg sent len:{}".format(sent_len.mean()))
print("min sent len:{}".format(sent_len.min()))
print("max sent len:{}".format(sent_len.max()))

=====Paragraphs=====
Avg para len:15.548
min para len:3
max para len:88
=====Sentences=====
Avg sent len:19.311294057113454
min sent len:1
max sent len:137


In [11]:
# Maximum number of sentences kept per passage; the expression below displays
# the fraction of passages that are longer than this cap.
max_para_len = 25
np.sum(np.greater(para_len,max_para_len))/para_len.shape[0]

0.078

In [12]:
# Maximum sentence length kept; the expression below displays the fraction of
# sentences that are longer than this cap.
max_sent_len = 40
np.sum(np.greater(sent_len,max_sent_len))/sent_len.shape[0]

0.06959094417288397

In [13]:
# Question length statistics, used to choose max_question_len.
question_lens = np.array(list(map(len, dataset_original['questions'])))
print("Avg question len:{}".format(question_lens.mean()))
print("min question len:{}".format(question_lens.min()))
print("max question len:{}".format(question_lens.max()))

Avg question len:6.478081162324649
min question len:0
max question len:24


In [14]:
# Maximum question length kept; the expression below displays the fraction of
# questions that are longer than this cap.
max_question_len = 10
np.sum(np.greater(question_lens,max_question_len))/question_lens.shape[0]

0.06763527054108216

In [15]:
# Answer span length statistics, used to choose max_answer_len.
answer_lens = np.array(list(map(len, dataset_original['answer_spans'])))
print("Avg answer span len:{}".format(answer_lens.mean()))
print("min answer span len:{}".format(answer_lens.min()))
print("max answer span len:{}".format(answer_lens.max()))

Avg answer span len:10.05936873747495
min answer span len:0
max answer span len:337


In [16]:
# Maximum answer span length kept; the expression below displays the fraction
# of answer spans that are longer than this cap.
max_answer_len = 25
np.sum(np.greater(answer_lens,max_answer_len))/answer_lens.shape[0]

0.0499749498997996

In [17]:
# Truncate/pad every question to exactly max_question_len entries.
questions_padded = pad_questions(dataset_original['questions'], max_question_len=max_question_len)

In [18]:
# pad_questions is reused for answer spans: truncate/pad each to max_answer_len entries.
answers_padded = pad_questions(dataset_original['answer_spans'], max_question_len=max_answer_len)

In [19]:
# Pad passages to max_para_len sentences of max_sent_len tokens each; the second
# result holds, per passage, the kept length of every sentence (0 for padding sentences).
passages_padded, unpadded_paragraph_len  = pad_context(dataset_original['passages'], max_para_len, max_sent_len)

In [20]:
# Truncate/pad every 'answer_sentences' entry to max_para_len items, filling with 0.
# NOTE(review): presumably one marker per passage sentence -- confirm against the
# code that produced the input pickle.
answer_sentences_fixed = pad_questions(dataset_original['answer_sentences'], 
                                       max_question_len=max_para_len,
                                      pad_symbol = 0)

In [21]:
def char_pad_sentence(sent, max_word_len, pad_symbol='<pad>'):
    """Flatten a sentence into characters, using ``max_word_len`` slots per word.

    Each word is cut to ``max_word_len`` characters and right-padded with
    ``pad_symbol``; a word equal to ``pad_symbol`` becomes ``max_word_len``
    padding slots. The result is one flat list of
    ``len(sent) * max_word_len`` items.
    """
    flat_chars = []
    for word in sent:
        # A padding word contributes no characters, only padding slots.
        letters = [] if word == pad_symbol else list(word[:max_word_len])
        flat_chars.extend(letters)
        flat_chars.extend([pad_symbol] * (max_word_len - len(letters)))
    return flat_chars

def batch_char_pad_sentence(sent_batch_in, max_word_len, pad_symbol='<pad>'):
    """Apply ``char_pad_sentence`` to every sentence of ``sent_batch_in``.

    Returns a list with one flat character list per input sentence.
    """
    return [
        char_pad_sentence(sent, max_word_len=max_word_len, pad_symbol=pad_symbol)
        for sent in sent_batch_in
    ]

In [22]:
def align(list_of_rows, num_columns):
    """Regroup rows by position: column ``i`` collects item ``i`` of every row.

    Always returns ``num_columns`` lists. A row with more than ``num_columns``
    items raises ``IndexError``; a shorter row simply contributes nothing to
    the remaining columns.
    """
    columns = []
    for _ in range(num_columns):
        columns.append([])

    for row in list_of_rows:
        position = 0
        for item in row:
            columns[position].append(item)
            position += 1
    return columns

In [23]:
# Number of character slots per word in the character-level inputs.
max_word_len = 10

In [24]:
# Character-level version of the padded questions (max_word_len slots per word).
char_questions = batch_char_pad_sentence(questions_padded, max_word_len)

In [25]:
# Character-level version of the padded answer spans (max_word_len slots per word).
char_answers = batch_char_pad_sentence(answers_padded, max_word_len)

In [26]:
def make_char_level_paragraphs(paragraphs_padded,max_word_len):
    """Convert every padded paragraph to its character-level form.

    Each paragraph (a list of sentences) is passed through
    ``batch_char_pad_sentence``; progress is reported with ``tqdm``.
    """
    return [
        batch_char_pad_sentence(para, max_word_len=max_word_len)
        for para in tqdm(paragraphs_padded)
    ]

In [27]:
# Character-level version of the padded passages.
char_passages_padded = make_char_level_paragraphs(passages_padded,max_word_len)

100%|██████████| 500/500 [00:00<00:00, 1236.19it/s]


In [28]:
# Regroup by sentence position: element i holds the i-th sentence of every passage.
passages_padded_aligned = align(passages_padded, max_para_len)

In [29]:
# Same regrouping by sentence position for the character-level passages.
char_passages_padded_aligned = align(char_passages_padded, max_para_len)

## Create/load vocabulary

In [30]:
# Build the word and character vocabularies from the training split, or load
# the ones saved by a previous training run when processing another split.
if not TRAINING:
    print("Loading word vocab")
    word_vocab = unpickler(out_pkl_path, word_vocab_pkl_name)
    print("Size of word vocab: ", len(word_vocab.word_to_id))
    print("Loading char vocab")
    char_vocab = unpickler(out_pkl_path, char_vocab_pkl_name)
    print("Size of char vocab: ", len(char_vocab.word_to_id))
    print("Done")
else:
    # The text is already padded, so the pad symbol is counted like any other token.
    word_vocab = Vocabulary(is_padded=True)
    for sentence_column in tqdm(passages_padded_aligned):
        word_vocab.fit(sentence_column)
    for token_batch in (questions_padded, answers_padded):
        word_vocab.fit(token_batch)
    word_vocab.freeze_vocab(min_word_count = 3)

    char_vocab = Vocabulary(is_padded=True)
    for char_column in tqdm(char_passages_padded_aligned):
        char_vocab.fit(char_column)
    for char_batch in (char_questions, char_answers):
        char_vocab.fit(char_batch)
    # Keep every character that was seen at least once.
    char_vocab.freeze_vocab(min_word_count = 1)

    print("Size of word vocab after filtering: ", len(word_vocab.word_to_id))
    print("Saving word vocabulary")
    pickler(out_pkl_path, word_vocab_pkl_name, word_vocab)

    print("Size of char vocab after filtering: ", len(char_vocab.word_to_id))
    print("Saving char vocabulary")
    pickler(out_pkl_path, char_vocab_pkl_name, char_vocab)
    print("Done")

Loading word vocab
Size of word vocab:  37212
Loading char vocab
Size of char vocab:  592
Done


In [31]:
# Show the ids given to the unknown and padding symbols in both vocabularies.
# The symbols are looked up by their literal default spellings here.
print("word: index of unk:{} , pad:{}".format(word_vocab.word_to_id["<unk>"], 
                                              word_vocab.word_to_id["<pad>"]))

print("char: index of unk:{} , pad:{}".format(char_vocab.word_to_id["<unk>"], 
                                              char_vocab.word_to_id["<pad>"]))

word: index of unk:37211 , pad:0
char: index of unk:591 , pad:0


## Transform words and chars to IDs

In [32]:
# Padded questions as word ids (tokens missing from the vocabulary map to the unk id).
question_word_idx = word_vocab.batch_transform(questions_padded)

In [33]:
# Padded answer spans as word ids.
answer_word_idx = word_vocab.batch_transform(answers_padded)

In [34]:
# Character-level questions as character ids.
question_char_idx = char_vocab.batch_transform(char_questions)

In [35]:
# Character-level answer spans as character ids.
answer_char_idx = char_vocab.batch_transform(char_answers)

In [36]:
def convert_paras_to_idx(all_passages, vocab):
    """Convert every group of sequences in ``all_passages`` to ids.

    ``vocab.batch_transform`` is applied to each group in turn (progress is
    reported with ``tqdm``) and the converted groups are returned as a list in
    the same order.
    """
    return [vocab.batch_transform(sent_group) for sent_group in tqdm(all_passages)]

In [37]:
# Word ids of the passages, still grouped by sentence position.
passages_word_idx = convert_paras_to_idx(passages_padded_aligned, word_vocab)

100%|██████████| 25/25 [00:00<00:00, 97.42it/s]


In [38]:
# Character ids of the passages, still grouped by sentence position.
passages_char_idx = convert_paras_to_idx(char_passages_padded_aligned, char_vocab)

100%|██████████| 25/25 [00:02<00:00, 12.40it/s]


In [39]:
# Everything that is written to the output pickle.
# Questions and answer spans were truncated to max_question_len / max_answer_len
# before padding, so their stored lengths are capped at those sizes. This keeps
# every length within its padded sequence, matching what pad_context records for
# passage sentences ("unpadded_passage_lengths").
data_dict = {"passages_word":passages_word_idx , "passages_char":passages_char_idx,
            "questions_word":question_word_idx, "questions_char":question_char_idx,
             "answers_word":answer_word_idx, "answers_char":answer_char_idx,
             "supporting_facts":answer_sentences_fixed,
             "unpadded_question_lengths":[min(length, max_question_len)
                                          for length in unpadded_question_lengths],
             "unpadded_passage_lengths":unpadded_paragraph_len,
            "unpadded_answer_lengths":[min(length, max_answer_len)
                                       for length in unpadded_answer_lengths],
             "data_points":dataset_original['data_points']}

In [40]:
# Write the assembled dictionary to out_pkl_name inside out_pkl_path.
pickler(out_pkl_path, out_pkl_name, data_dict)

## Prepare GloVe

In [41]:
# Build the GloVe-initialised embedding matrix for the word vocabulary.
# Only done for the training run, which is also when the vocabulary is created.
if TRAINING:
    VOCAB_SIZE = len(word_vocab.word_to_id)
    embeddings_index = {}
    glove_line_count = 0
    malformed_line_count = 0
    # "with" guarantees the file is closed even if parsing fails part-way.
    with open(GLOVE_FILE_PATH, encoding='utf8') as f:
        for line in f:
            # Split from the right: the last EMBEDDING_SIZE fields are the
            # vector, whatever precedes them is the token. A plain split(' ')
            # breaks on tokens that themselves contain spaces.
            values = line.rstrip('\n').rsplit(' ', EMBEDDING_SIZE)
            if len(values) != EMBEDDING_SIZE + 1:
                malformed_line_count += 1
                continue
            glove_line_count += 1
            word = values[0]
            # Keep only vectors that are actually needed; holding every GloVe
            # vector in a dict costs memory for no benefit.
            if word in word_vocab.word_to_id:
                embeddings_index[word] = np.asarray(values[1:], dtype='float32')

    print("Read GloVe file")

    # NOTE: <unk> and <pad> are never taken from GloVe; they are handled separately below.

    # prepare embedding matrix
    print("Preparing embedding matrix")
    count_not_found = 0
    embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_SIZE))
    for word, i in word_vocab.word_to_id.items():
        if word in (word_vocab.unk, word_vocab.pad):
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            # Words missing from GloVe keep an all-zero row.
            count_not_found += 1

    # Initialize <unk> to the mean over every row of the matrix. The all-zero
    # rows (<pad>, <unk> itself, words missing from GloVe) are part of that mean.
    embedding_matrix[word_vocab.word_to_id[word_vocab.unk]] = embedding_matrix.mean(axis = 0)

    print("Embedding matrix shape: ",embedding_matrix.shape)
    print("Number of words not found in GloVe: ",count_not_found)
    # Counted per well-formed line of the file (assumes one line per distinct word).
    print("Number of words in GloVe: ", glove_line_count)
    if malformed_line_count:
        print("Number of malformed GloVe lines skipped: ", malformed_line_count)
    np.save(GLOVE_STORE, embedding_matrix)

print("Done")

Done
