# Preprocessing HotpotQA
- Form paragraphs only out of the supporting facts

In [1]:
from collections import Counter
import string
import re
import argparse
import json
import sys
import numpy as np
import nltk
import random
import math
import os
import pickle
from tqdm import tqdm, trange

import pdb

from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
                                                  BertTokenizer,
                                                  whitespace_tokenize)

In [2]:
def pickler(path,pkl_name,obj):
    """Pickle ``obj`` into the file ``pkl_name`` inside directory ``path``.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten. The highest available pickle protocol is used.
    """
    destination = os.path.join(path, pkl_name)
    with open(destination, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def unpickler(path,pkl_name):
    """Return the object stored in the pickle file ``pkl_name`` under ``path``.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    source = os.path.join(path, pkl_name)
    with open(source, 'rb') as stream:
        return pickle.load(stream)

In [3]:
# Instantiate the BertTokenizer for the 'bert-base-uncased' pretrained model;
# it is used below for every token <-> id conversion.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
# Look up the vocabulary ids of the three special tokens in one call.
cls_id, sep_id, pad_id = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", "[PAD]"])

# Show the ids so they can be checked by eye.
print("[CLS] : {}".format(cls_id))
print("[SEP] : {}".format(sep_id))
print("[PAD] : {}".format(pad_id))

[CLS] : 101
[SEP] : 102
[PAD] : 0


In [5]:
# True  -> preprocess the training split using the gold supporting facts.
# False -> preprocess the dev split using pickled supporting-fact predictions.
TRAINING = True

# Directory the output pickles are written to.
out_pkl_path = "./"

# Per split: input JSON file, name of the full output pickle, name of the
# small output pickle, and how many examples the small pickle keeps.
if TRAINING:
    file_path, out_pkl_name, small_out_pkl_name, small_dataset_size = (
        "../../../hotpotqa/hotpot_train_v1.1.json",
        "preprocessed_train.pkl",
        "preprocessed_train_small.pkl",
        5000,
    )
else:
    file_path, out_pkl_name, small_out_pkl_name, small_dataset_size = (
        "../../../hotpotqa/hotpot_dev_distractor_v1.json",
        "preprocessed_dev_0.4.pkl",
        "preprocessed_dev_small.pkl",
        500,
    )

# Location of the pickled predictions (only read when TRAINING is False).
pred_pkl_path = '../pred_for_threshold/'
pred_pkl_name = 't_0.4'

# Total length of one [CLS] + question + [SEP] + passage sequence.
max_seq_len = 510
# NOTE(review): not referenced anywhere else in the visible notebook.
max_num_paragraphs = 10

In [6]:
# Dev mode only: load the pickled predictions from pred_pkl_path/pred_pkl_name.
# Training mode uses the gold supporting facts and never defines `predictions`.
if(not TRAINING):
    predictions = unpickler(pred_pkl_path, pred_pkl_name)

In [7]:
if(not TRAINING):
    predictions.keys()

In [8]:
if(not TRAINING):
    list(predictions['sp'].keys())[:5]

In [9]:
if(not TRAINING):
    type(predictions['sp']['5a8b57f25542995d1e6f1371'][0][1])

In [10]:
# Read the whole JSON file selected by file_path into memory; the result is
# iterated item by item in the preprocessing loop below.
with open(file_path, encoding='utf8') as file:
    dataset = json.load(file)

In [11]:
def tokenize(text, tokens_to_text_mapping, bert_tokenizer):
    """Convert ``text`` to a flat list of ids and record how to undo it.

    The text is split with ``whitespace_tokenize``; each resulting word is
    run through ``bert_tokenizer.tokenize`` and ``convert_tokens_to_ids``.

    Args:
        text: the string to convert.
        tokens_to_text_mapping: dict updated in place; the tuple of ids
            produced for a word is mapped to that word. A later word that
            yields the same ids overwrites the earlier entry.
        bert_tokenizer: object providing ``tokenize`` and
            ``convert_tokens_to_ids``.

    Returns:
        The ids of all words concatenated in order.
    """
    flat_ids = []
    for word in whitespace_tokenize(text):
        word_pieces = bert_tokenizer.tokenize(word)
        piece_ids = bert_tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_to_text_mapping[tuple(piece_ids)] = word
        flat_ids += piece_ids
    return flat_ids

def un_tokenize(ids, tokens_to_text_mapping, bert_tokenizer):
    """Rebuild whitespace-separated text from a flat sequence of ids.

    Greedy longest match: at each position the longest run of ids whose
    tuple is a key of ``tokens_to_text_mapping`` is replaced by the mapped
    word and the scan continues after that run. An id that does not start
    any known run is dropped.

    Args:
        ids: sliceable sequence of ids.
        tokens_to_text_mapping: dict mapping tuples of ids to words, as
            filled in by ``tokenize``.
        bert_tokenizer: unused; kept for interface compatibility.

    Returns:
        The matched words joined by single spaces ("" if nothing matched).
    """
    # NOTE: dropping the id 1000 (the '"' character, per the original author's
    # comment) before decoding was considered, to avoid confusing
    # `Book "shelf` with `Book" shelf`, but that filtering is not applied.

    # A slice longer than the longest key can never match, so the search
    # window is capped there instead of always running to the end of `ids`.
    longest_key = max((len(key) for key in tokens_to_text_mapping), default=0)

    words = []
    total = len(ids)
    start = 0
    while start < total:
        for stop in range(min(total, start + longest_key), start, -1):
            candidate = tuple(ids[start:stop])
            if candidate in tokens_to_text_mapping:
                words.append(tokens_to_text_mapping[candidate])
                start = stop
                break
        else:
            # Nothing starting at this id is known: skip it.
            start += 1
    return " ".join(words)

In [12]:
# Parallel per-question outputs: entry k of every list belongs to dataset[k].
question_ids = []          # item["_id"]
questions = []             # question as a list of ids
paragraphs = []            # per question: supporting paragraphs -> sentences -> ids
answers = []               # answer as a list of ids
answers_string = []        # raw answer string
question_indices = []      # position of the item in `dataset`
yes_no_span = []           # 0 = "yes", 1 = "no", 2 = any other answer
supporting_facts = []      # per question: per supporting paragraph, its sentence indices
ids_to_word_mappings = []  # per question: tuple(ids) -> original word

skipped = []

for item_index, item in enumerate(tqdm(dataset)):
    answer_str = item["answer"]
    answers_string.append(answer_str)
    id_to_word = {}
    paragraph_names = [para[0] for para in item["context"]]

    if(TRAINING):
        supporting_facts_for_this_question = item["supporting_facts"]
    else:
        # The predictions are keyed by 'sp' first and by question id second
        # (see the inspection cells above), not by question id directly.
        supporting_facts_for_this_question = predictions['sp'][item["_id"]]

    # Replace each paragraph title by the paragraph's position in the context.
    supp_fact_list = []
    for sup_fact in supporting_facts_for_this_question:
        para_name = sup_fact[0]
        supporting_fact_index = sup_fact[1]
        para_index = paragraph_names.index(para_name)
        supp_fact_list.append([para_index, supporting_fact_index])

    gold_paragraphs = []
    reorganized_sf_list = []
    for supporting_para in set(para_index for para_index, _ in supp_fact_list):
        para = item["context"][supporting_para]
        para_name = para[0]
        # Work on a copy: writing the title into para[1] itself would modify
        # `dataset`, so re-running this cell would prepend the title again.
        para_sents = list(para[1])
        if para_sents:
            para_sents[0] = para_name + ". " + para_sents[0]
        gold_paragraphs.append([tokenize(s, id_to_word, tokenizer) for s in para_sents])
        # Sentence indices of the supporting facts that fall in this paragraph.
        supporting_facts_in_this_para = [
            sf_index for p_index, sf_index in supp_fact_list
            if p_index == supporting_para
        ]
        reorganized_sf_list.append(supporting_facts_in_this_para)

    supporting_facts.append(reorganized_sf_list)
    paragraphs.append(gold_paragraphs)
    question_indices.append(item_index)
    question_ids.append(item["_id"])
    question = tokenize(item["question"], id_to_word, tokenizer)
    questions.append(question)
    if(answer_str == "yes"):
        yes_no_span.append(0)
    elif(answer_str == "no"):
        yes_no_span.append(1)
    else:
        yes_no_span.append(2)
    # The answer gets a throw-away mapping so it does not touch id_to_word.
    answer_tokenized = tokenize(answer_str, {}, tokenizer)
    answers.append(answer_tokenized)
    ids_to_word_mappings.append(id_to_word)

100%|██████████| 90447/90447 [05:26<00:00, 277.26it/s]


In [13]:
supporting_facts[0]

[[0], [0]]

In [14]:
print(len(paragraphs))
print(len(answers))
print(len(questions))

90447
90447
90447


In [15]:
print(len(answers_string)) #unfiltered list

90447


In [16]:
print("Skipped {} examples".format(len(skipped)))

Skipped 0 examples


In [17]:
set([len(p) for p in paragraphs])

{2}

In [18]:
supporting_facts[1]

[[0], [0]]

In [19]:
# Number of ids in every question, as an array for quick statistics.
question_lengths = np.array(list(map(len, questions)))
shortest_q, mean_q, longest_q = (question_lengths.min(),
                                 question_lengths.mean(),
                                 question_lengths.max())
print("Min question length: {}".format(shortest_q))
print("Avg question length: {}".format(mean_q))
print("Max question length: {}".format(longest_q))

Min question length: 4
Avg question length: 22.443585746348692
Max question length: 141


In [20]:
# Questions are later trimmed/padded to exactly this many ids.
max_question_len = 40
# Fraction of questions that are longer than max_question_len.
(question_lengths > max_question_len).sum() / question_lengths.shape[0]

0.07017369288091369

In [21]:
# Total number of ids over all sentences of all gold paragraphs, per question.
combined_gold_para_lengths = np.array([
    sum(len(sentence) for para in para_list for sentence in para)
    for para_list in paragraphs
])

print("Min combined_gold_para_lengths: {}".format(combined_gold_para_lengths.min()))
print("Avg combined_gold_para_lengths: {}".format(combined_gold_para_lengths.mean()))
print("Max combined_gold_para_lengths: {}".format(combined_gold_para_lengths.max()))

Min combined_gold_para_lengths: 38
Avg combined_gold_para_lengths: 197.25920152133293
Max combined_gold_para_lengths: 806


In [22]:
# Room left for the passage once the question slots and the two special
# tokens ([CLS] and [SEP]) are taken out of the sequence.
max_passage_length = max_seq_len - max_question_len - 2
print("max passage length: ",max_passage_length)
# Fraction of questions whose combined gold paragraphs exceed that length.
too_long = combined_gold_para_lengths > max_passage_length
print(too_long.sum()/combined_gold_para_lengths.shape[0])

max passage length:  468
0.0030957356241776953


In [23]:
def pad_trim(sequences, max_len, pad_symbol=0):
    """Return copies of ``sequences`` that are all exactly ``max_len`` long.

    Each sequence is cut to its first ``max_len`` items and, when shorter,
    filled up at the end with ``pad_symbol``. The inputs are not modified.
    """
    fixed = []
    for original in sequences:
        clipped = original[:max_len]
        shortfall = max_len - len(clipped)
        clipped += [pad_symbol] * shortfall
        fixed.append(clipped)
    return fixed

In [24]:
questions_fixed_len = pad_trim(questions, max_question_len, pad_symbol=0)

In [25]:
set([len(q) for q in questions_fixed_len])

{40}

In [26]:
def find_all_in_sentence(sequence, key):
    """Locate every occurrence of ``key`` as a contiguous run in ``sequence``.

    Overlapping occurrences are all reported. Slices of ``sequence`` are
    compared to ``key`` with ``==``, so both should be of the same type
    (e.g. both lists).

    Args:
        sequence: the sequence to search in.
        key: the run to search for.

    Returns:
        ``(start_indices, end_indices)``: two equally long lists holding the
        first and the last (inclusive) index of each occurrence. Both are
        empty when ``key`` is empty or does not occur.
    """
    key_len = len(key)
    # An empty key would compare equal to the empty slice at every position
    # and produce end indices smaller than their start indices.
    if key_len == 0:
        return [], []
    # A match cannot start after len(sequence) - key_len.
    start_indices = [
        i for i in range(len(sequence) - key_len + 1)
        if sequence[i:i + key_len] == key
    ]
    end_indices = [i + key_len - 1 for i in start_indices]
    return start_indices, end_indices

def find_answer_locations(passages, answers, yes_no_span):
    """Collect the answer positions for every passage.

    Args:
        passages: one sequence of ids per question.
        answers: one answer (sequence of ids) per question.
        yes_no_span: one flag per question; the value 2 means the answer
            has to be searched in the passage, any other value gets the
            placeholder position 0.

    Returns:
        ``(answer_start_indices, answer_end_indices)``: per question, the
        lists of start and of (inclusive) end indices of the answer.
    """
    assert(len(passages) == len(answers))
    all_starts, all_ends = [], []
    for idx, (passage, answer) in enumerate(zip(passages, answers)):
        if yes_no_span[idx] == 2:
            starts, ends = find_all_in_sentence(passage, answer)
            assert(len(starts) == len(ends))
        else:
            starts, ends = [0], [0]
        all_starts.append(starts)
        all_ends.append(ends)
    return all_starts, all_ends

In [27]:
def make_para_out_of_supporting_facts(passages, supporting_facts):
    """Build one flat passage per question out of its supporting sentences.

    Args:
        passages: per question, a list of paragraphs; each paragraph is a
            list of sentences and each sentence a list of items.
        supporting_facts: per question, one list of sentence indices for
            each paragraph of that question.

    Returns:
        ``(out_passages, skipped)``: ``out_passages[i]`` is the
        concatenation of the selected sentences of question ``i``, in
        paragraph order; ``skipped`` holds the question index once for every
        sentence index that is out of range for its paragraph (such
        sentences are left out).
    """
    assert(len(passages) == len(supporting_facts))
    skipped = []
    out_passages = []
    for i, (question_paras, question_sfs) in enumerate(zip(passages, supporting_facts)):
        new_passage = []
        for j, passage in enumerate(question_paras):
            for s_f in question_sfs[j]:
                try:
                    sentence = passage[s_f]
                except IndexError:
                    # Only an out-of-range sentence index is tolerated;
                    # any other error is a real problem and must surface.
                    skipped.append(i)
                else:
                    new_passage += sentence
        out_passages.append(new_passage)
    return out_passages, skipped

In [28]:
make_para_out_of_supporting_facts(passages= [[[["Para1", "Sentence1"],["Para1", "Sentence2"]],
                                  [["Para2", "Sentence1"],["Para2", "Sentence2"]]]], supporting_facts=[[[0],[1]]])

([['Para1', 'Sentence1', 'Para2', 'Sentence2']], [])

In [29]:
paragraphs_sf_only, skipped_sf_index_out_of_range = make_para_out_of_supporting_facts(passages=paragraphs, supporting_facts=supporting_facts)

In [30]:
len(skipped_sf_index_out_of_range)

22

In [31]:
len(paragraphs_sf_only)

90447

In [32]:
len(paragraphs_sf_only[2050])

48

In [33]:
fixed_length_context = pad_trim(paragraphs_sf_only, max_passage_length, pad_symbol=0)

In [34]:
answer_start_indices, answer_end_indices = find_answer_locations(fixed_length_context, answers=answers, yes_no_span=yes_no_span)

In [35]:
assert(len(answer_start_indices) == len(answer_end_indices))

In [36]:
len(paragraphs)

90447

In [37]:
len(answers)

90447

In [38]:
len(answer_start_indices)

90447

In [39]:
# For questions whose answer is neither "yes" nor "no" (yes_no_span == 2):
# count how often the answer occurs in the context, and remember the
# questions where it does not occur at all.
num_occurrences_of_answer = []
question_indices_without_answer = []
for i, found_starts in enumerate(answer_start_indices):
    if yes_no_span[i] != 2:
        continue
    if found_starts:
        num_occurrences_of_answer.append(len(found_starts))
    else:
        question_indices_without_answer.append(i)

num_occurrences_of_answer = np.array(num_occurrences_of_answer)

print("Number of span types where answer string is not found in context: {}".format(len(question_indices_without_answer)))

print("Min occurrences of answer in gold context: {}".format(num_occurrences_of_answer.min()))
print("Avg occurrences of answer in gold context: {}".format(num_occurrences_of_answer.mean()))
print("Max occurrences of answer in gold context: {}".format(num_occurrences_of_answer.max()))

Number of span types where answer string is not found in context: 98
Min occurrences of answer in gold context: 1
Avg occurrences of answer in gold context: 1.565525286327002
Max occurrences of answer in gold context: 16


In [40]:
i = 2
print(answer_start_indices[i])
print(answer_end_indices[i])

[104]
[106]


In [41]:
# Training only: drop the questions whose answer was not found in the context
# or that had a supporting-fact index out of range.
#
# The indices to drop refer to the unfiltered lists. `questions` is never
# filtered, so once question_ids is shorter than it the rows have already been
# removed; running the deletion again would remove other, valid rows.
if TRAINING and len(question_ids) == len(questions):
    indices_to_drop = set(question_indices_without_answer) | set(skipped_sf_index_out_of_range)
    # Walk backwards so a deletion never shifts an index still to be visited.
    for i in range(len(questions)-1, -1, -1):
        if i in indices_to_drop:
            del(question_ids[i])
            del(questions_fixed_len[i])
            del(fixed_length_context[i])
            del(question_indices[i])
            del(yes_no_span[i])
            del(answer_start_indices[i])
            del(answer_end_indices[i])
            del(ids_to_word_mappings[i])

In [42]:
# Segment 0 covers [CLS] plus the question slots, segment 1 covers [SEP] plus
# the passage slots.
segment_id = [0] * (1 + max_question_len) + [1] * (1 + max_passage_length)

In [43]:
assert(len(segment_id) == max_seq_len)

In [44]:
# One sequence per question: [CLS] question [SEP] context.
question_context_sequences = [
    [cls_id] + fixed_question + [sep_id] + fixed_length_context[idx]
    for idx, fixed_question in enumerate(questions_fixed_len)
]

In [45]:
for seq in question_context_sequences:
    assert(len(seq) == max_seq_len)

In [46]:
# Answer positions were found relative to the context. Shift them by
# max_question_len + 2 so they point into the full sequence, where the context
# comes after [CLS], the question slots and [SEP].
pointer_shift = max_question_len + 2

answer_start_indices_offset = []
answer_end_indices_offset = []

for i, context_ends in enumerate(answer_end_indices):
    context_starts = answer_start_indices[i]
    assert(len(context_ends) == len(context_starts))
    answer_start_indices_offset.append([s + pointer_shift for s in context_starts])
    answer_end_indices_offset.append([e + pointer_shift for e in context_ends])

In [47]:
i = 2
print(answer_start_indices_offset[i])
print(answer_end_indices_offset[i])

[146]
[148]


Objects to pickle (the unfiltered `answer_string` list is added to both dicts afterwards):
- question_context_sequences
- question_ids
- question_indices
- yes_no_span
- answer_start_indices_offset
- answer_end_indices_offset
- max_question_len
- max_seq_len
- segment_id
- ids_to_word_mappings

In [48]:
print("question_context_sequences: {}".format(len(question_context_sequences)))
print("question_ids: {}".format(len(question_ids)))
print("question_indices: {}".format(len(question_indices)))
print("yes_no_span: {}".format(len(yes_no_span)))
print("answer_start_indices_offset: {}".format(len(answer_start_indices_offset)))
print("answer_end_indices_offset: {}".format(len(answer_end_indices_offset)))

question_context_sequences: 90327
question_ids: 90327
question_indices: 90327
yes_no_span: 90327
answer_start_indices_offset: 90327
answer_end_indices_offset: 90327


In [49]:
# Everything the full output pickle contains (key order matters: it is
# compared with the key order of the small dict further down).
out_dict = dict(
    question_context_sequences=question_context_sequences,
    question_ids=question_ids,
    question_indices=question_indices,
    yes_no_span=yes_no_span,
    answer_start_indices_offset=answer_start_indices_offset,
    answer_end_indices_offset=answer_end_indices_offset,
    segment_id=segment_id,
    max_question_len=max_question_len,
    max_seq_len=max_seq_len,
    ids_to_word_mappings=ids_to_word_mappings,
)

In [50]:
# Small variant of out_dict: per-question entries are cut to the first
# small_dataset_size items, shared settings are copied unchanged. Keys are
# inserted in the same order as in out_dict.
small_out_dict = {}
for per_question_key in ("question_context_sequences",
                         "question_ids",
                         "question_indices",
                         "yes_no_span",
                         "answer_start_indices_offset",
                         "answer_end_indices_offset"):
    small_out_dict[per_question_key] = out_dict[per_question_key][:small_dataset_size]
for shared_key in ("segment_id", "max_question_len", "max_seq_len"):
    small_out_dict[shared_key] = out_dict[shared_key]
small_out_dict["ids_to_word_mappings"] = out_dict["ids_to_word_mappings"][:small_dataset_size]


In [51]:
out_dict['answer_string'] = answers_string

In [52]:
small_out_dict['answer_string'] = answers_string

In [53]:
assert(list(out_dict.keys()) == list(small_out_dict.keys()))

In [54]:
len(small_out_dict["question_context_sequences"][0])

510

In [55]:
len(small_out_dict["segment_id"])

510

In [56]:
pickler(out_pkl_path, small_out_pkl_name, small_out_dict)
print("done")

done


In [57]:
pickler(out_pkl_path, out_pkl_name, out_dict)
print("done")

done
