# Preprocess HotpotQA for BERT
Notebook output: (question,context) pairs. Model should predict which sentences in the context are required to answer the question

In [1]:
from collections import Counter
import string
import re
import argparse
import json
import sys
import numpy as np
import nltk
import random
import math
import os
import pickle
from tqdm import tqdm, trange

In [2]:
from pytorch_pretrained_bert import BertTokenizer

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
def pickler(path,pkl_name,obj):
    with open(os.path.join(path, pkl_name), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def unpickler(path,pkl_name):
    with open(os.path.join(path, pkl_name) ,'rb') as f:
        obj = pickle.load(f)
    return obj

In [5]:
TRAINING = False

out_pkl_path = "./"

if(TRAINING):
    file_path = "/home/bhargav/data/hotpotqa/hotpot_train_v1.json"
    out_pkl_name = "preprocessed_train.pkl"
    problem_indices = [8437, 25197, 34122, 46031, 52955, 63867, 82250]
else:
    file_path = "/home/bhargav/data/hotpotqa/hotpot_dev_distractor_v1.json"
    out_pkl_name = "preprocessed_dev.pkl"
    problem_indices = [5059]

In [6]:
with open(file_path, encoding='utf8') as file:
    dataset = json.load(file)

In [7]:
def tokenize(text):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

In [8]:
questions = []
paragraphs = [] 
supporting_facts = []


for item_index, item in enumerate(tqdm(dataset)):
    if(item_index in problem_indices):
        continue
    question = tokenize(item["question"])
    questions.append(question)
    paragraph_names = []
    paragraph_text = []
    for i,para in enumerate(item["context"]):
        para_name = para[0]
        para_sents = para[1]
        paragraph_names.append(para_name)
        paragraph_text.append([tokenize(s) for s in para_sents])
    paragraphs.append(paragraph_text)
    supp_fact_list = []
    for sup_fact in item["supporting_facts"]:
        para_name = sup_fact[0]
        supporting_fact_index = sup_fact[1] 
        para_index = paragraph_names.index(para_name)
        supp_fact_list.append([para_index, supporting_fact_index])
    supporting_facts.append(supp_fact_list)

100%|██████████| 7405/7405 [03:52<00:00, 31.82it/s]


In [26]:
indices_of_supporting_facts = []
for datapoint in supporting_facts:
    for s_f in datapoint:
        indices_of_supporting_facts.append(s_f[1])
        
indices_of_supporting_facts = np.array(indices_of_supporting_facts)

print("Max index of supporting fact: {}".format(indices_of_supporting_facts.max()))
print("Avg index of supporting fact: {}".format(indices_of_supporting_facts.mean()))

Max index of supporting fact: 11
Avg index of supporting fact: 0.7631798233431476


In [34]:
max_s_f_index = 5
np.sum(np.greater(indices_of_supporting_facts,max_s_f_index))/indices_of_supporting_facts.shape[0]

0.005721904338647853

In [9]:
question_lengths = np.array([len(q) for q in questions])

print("Avg question len:{}".format(question_lengths.mean()))
print("min question len:{}".format(question_lengths.min()))
print("max question len:{}".format(question_lengths.max()))

Avg question len:19.590626688276608
min question len:7
max question len:65


In [10]:
document_lengths = []

for doc in paragraphs:
    doc_len = 0
    for para in doc:
        for sent in para:
            doc_len += len(sent)
    document_lengths.append(doc_len)

document_lengths = np.array(document_lengths)


print("Avg document len:{}".format(document_lengths.mean()))
print("min document len:{}".format(document_lengths.min()))
print("max document len:{}".format(document_lengths.max()))

Avg document len:1194.0179632631011
min document len:66
max document len:3222


In [11]:
question_doc_combined_len = question_lengths + document_lengths

print("Avg combined len:{}".format(question_doc_combined_len.mean()))
print("min combined len:{}".format(question_doc_combined_len.min()))
print("max combined len:{}".format(question_doc_combined_len.max()))

Avg combined len:1213.6085899513776
min combined len:80
max combined len:3231


In [12]:
supporting_fact_lengths = []

for i,doc in enumerate(paragraphs):
    supp_fact_len = 0
    for j, para in enumerate(doc):
        for k, sent in enumerate(para):
            if([j,k] in supporting_facts[i]):
                supp_fact_len += len(sent)
    supporting_fact_lengths.append(supp_fact_len)
    
supporting_fact_lengths +=  np.array(supporting_fact_lengths)

print("Avg supporting facts len:{}".format(supporting_fact_lengths.mean()))
print("min supporting facts len:{}".format(supporting_fact_lengths.min()))
print("max supporting facts len:{}".format(supporting_fact_lengths.max()))

Avg supporting facts len:159.5958941112912
min supporting facts len:32
max supporting facts len:666


In [13]:
question_plus_supp_fact_only = question_lengths + supporting_fact_lengths

print("Avg question_plus_supp_fact_onlys len:{}".format(question_plus_supp_fact_only.mean()))
print("min question_plus_supp_fact_only len:{}".format(question_plus_supp_fact_only.min()))
print("max question_plus_supp_fact_only len:{}".format(question_plus_supp_fact_only.max()))

Avg question_plus_supp_fact_onlys len:179.1865207995678
min question_plus_supp_fact_only len:44
max question_plus_supp_fact_only len:707


In [14]:
question_plus_gold_paragraphs_lengths = []

for i,doc in enumerate(paragraphs):
    supp_sentences_len = question_lengths[i]
    for j, para in enumerate(doc):
        if(j in [f[0] for f in supporting_facts[i]]):
            for k, sent in enumerate(para):
                supp_sentences_len += len(sent)
    question_plus_gold_paragraphs_lengths.append(supp_sentences_len)
    
question_plus_gold_paragraphs_lengths +=  np.array(question_plus_gold_paragraphs_lengths)

print("Avg question_plus_gold_paragraphs_lengths len:{}".format(question_plus_gold_paragraphs_lengths.mean()))
print("min question_plus_gold_paragraphs_lengths len:{}".format(question_plus_gold_paragraphs_lengths.min()))
print("max question_plus_gold_paragraphs_lengths len:{}".format(question_plus_gold_paragraphs_lengths.max()))

Avg question_plus_gold_paragraphs_lengths len:415.1766612641815
min question_plus_gold_paragraphs_lengths len:100
max question_plus_gold_paragraphs_lengths len:1356


In [15]:
max_len = 500
np.sum(np.greater(question_plus_gold_paragraphs_lengths,max_len))/question_plus_gold_paragraphs_lengths.shape[0]

0.2516207455429498

In [16]:
max_len = 300
np.sum(np.greater(question_plus_supp_fact_only,max_len))/question_plus_supp_fact_only.shape[0]

0.05172879524581307

In [17]:
num_sentences_per_para = []

for doc in paragraphs:
    for para in doc:
        num_sentences_per_para.append(len(para))

num_sentences_per_para = np.array(num_sentences_per_para)


print("Avg num_sentences_per_para:{}".format(num_sentences_per_para.mean()))
print("min num_sentences_per_para:{}".format(num_sentences_per_para.min()))
print("max num_sentences_per_para:{}".format(num_sentences_per_para.max()))

Avg num_sentences_per_para:4.1582711358393265
min num_sentences_per_para:1
max num_sentences_per_para:85


In [20]:
max_len = 7
np.sum(np.greater(num_sentences_per_para,max_len))/num_sentences_per_para.shape[0]

0.07671325824399511

In [35]:
question_plus_para_len = []

for i,doc in enumerate(paragraphs):
    for j, para in enumerate(doc):
        sentences_len = question_lengths[i]
        for k, sent in enumerate(para):
            sentences_len += len(sent)
        question_plus_para_len.append(sentences_len)
    
question_plus_para_len +=  np.array(question_plus_para_len)

print("Avg question_plus_para_len:{}".format(question_plus_para_len.mean()))
print("min question_plus_para_len:{}".format(question_plus_para_len.min()))
print("max question_plus_para_len:{}".format(question_plus_para_len.max()))

Avg question_plus_para_len:279.1744605780974
min question_plus_para_len:42
max question_plus_para_len:3742


In [40]:
max_len = 500
np.sum(np.greater(question_plus_para_len,max_len))/question_plus_para_len.shape[0]

0.06299362192970552