In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import csv
import ujson, os
import json
from tqdm import tqdm
import os.path
import pandas as pd
import time
from collections import Counter, defaultdict, OrderedDict
import random
import ast

In [2]:
prefix = "concurrentqa/" # FILL IN PATH TO REPOSITORY

# Load original data

In [3]:
st = time.time()
passages_path = f'{prefix}/datasets/hotpotqa/hotpot_index/wiki_id2doc.json'
with open(passages_path) as f:
    wiki_id2doc = json.load(f)
    passages = []
    for k, v in wiki_id2doc.items():
        v['id'] = k
        passages.append(v)
        
print(f"Loaded full set of documents in {time.time() - st}")
st = time.time()

st = time.time()
df = pd.DataFrame(passages)
print(f"Loaded full set of documents in {time.time() - st}")
st = time.time()
df.head(1)

Loaded full set of documents in 110.8920521736145


In [5]:
alldomaintitles = []
title2sent_map = {}
count = 0
for k, v in tqdm(wiki_id2doc.items()):
    title = v['title']
    sents = v['text']
    sents = sents.split(". ")
    
    alldomaintitles.append(title)
    title2sent_map[title] = sents
        
print(len(title2sent_map))

  0%|          | 17394/5233329 [00:00<00:30, 173827.02it/s]

['One Night Stand is a 1984 film directed by John Duigan.']

['Welty McCullogh (October 10, 1847 – August 31, 1889) was a Republican member of the U.S', 'House of Representatives from Pennsylvania.']

['The Neuropterida are a clade of holometabolous insects with over 5,700 described species, containing the orders Neuroptera (lacewings, antlions), Megaloptera (dobsonflies, alderflies), and Raphidioptera (snakeflies).']

['Bafia (beukpak) people inhabit the Mbam region in the centre province of Cameroon', 'Their origins are said to share many similarities with those of the Bamun and Tikar people', 'A division during migratory movements caused the two sets of groups to settle in different areas', 'Later, the islamisation of most of the Bamun territory further separated them', 'A yearly festival held in Fumban (Bamun territory) is considered by many to symbolize the recognition of their common heritage.']

['The Viti Levu giant pigeon ("Natunaornis gigoura") is an extinct flightless pigeon

100%|██████████| 5233329/5233329 [01:44<00:00, 49845.21it/s] 

5233329





In [None]:
with open(f'{prefix}/datasets/hotpotqa/hotpot/hotpot_qas_val.json') as f:
    qa_entries = []
    for line in f:
        entry = ast.literal_eval(line)
        qa_entries.append(entry)
print(f"QAs set has {len(qa_entries)} data points.")

### The following generates new private/public data splits and prepares everything for running retrieval on HotpotQA Dev Data

In [14]:
def process_item(item, domain=-1):
    item['domain'] = domain
    return item

def get_domain_splits(private_prop, alldomaintitles):
    title2domain = {}
    domain1titles = []
    domain2titles = []
    random.seed(0)
    
    # splits by title randomly 
    for title in tqdm(alldomaintitles):
        randnum = random.random()
        if randnum > private_prop:
            title2domain[title] = 0
            domain1titles.append(title)
        else:
            title2domain[title] = 1
            domain2titles.append(title)
            
    return title2domain, domain1titles, domain2titles

In [18]:
# we will do this for private splits of different sizes; private_prop 0.5 means equal 50-50 public-private splits
for private_prop in [0.5]:
    # path where you will save the generated data
    path = f"{prefix}/datasets/hotpotqa_pair/hotpot_privateprop_{private_prop}/"
    if not os.path.exists(path):
        os.makedirs(path)

    title2domain, domain1titles, domain2titles = get_domain_splits(private_prop, alldomaintitles)
            
    # Save the passages
    print(f"Num domain 1 titles: {len(domain1titles)}")
    print(f"Num domain 2 titles: {len(domain2titles)}\n")
    
    if not os.path.exists(path):
        print(f"Making dir at: {path}")
        os.makedirs(path)
    
    df['domain1'] = df['title'].apply(lambda x: title2domain[x]==0)
    df['domain2'] = df['title'].apply(lambda x: title2domain[x]==1)
    
    sub_df = df[df['domain1'] == True]
    dic = sub_df.to_dict('index')
    with open(f'{path}/domain0psgs.json', "w") as f:
        json.dump(dic, f)
    print("Saved domain 1 passages.")

    sub_df = df[df['domain2'] == True]
    dic = sub_df.to_dict('index')
    with open(f'{path}/domain1psgs.json', "w") as f:
        json.dump(dic, f)
    print("Saved domain 2 passages.")
        
    # determine private and public entities of those appearing in the queries 
    entity_df = pd.DataFrame(entitytitles.items(), columns=['entitytitle', 'domain'])
    entity_df['domain'] = entity_df['entitytitle'].apply(lambda x: title2domain[x]==0)
    entitytitle2domain_cache = {}
    for ind, row in entity_df.iterrows():
        entitytitle2domain_cache[row['entitytitle']] = row['domain']
    
    # Save the questions
    localquestions = []
    globalquestions = []
    not_in_corpus = 0
    for idx, item in tqdm(enumerate(qa_entries)):
        sps = item['sp']
        
        domains  = [entitytitle2domain_cache[sp['title']] for sp in sps]
        domain1_exists = any(d == True  for d in domains)
        domain2_exists = any(d == False for d in domains)
        neither_exists = not domain1_exists and not domain2_exists
        
        if domain1_exists and not domain2_exists:
            localquestions.append(process_item(item, domain=domains))
        if not domain1_exists and domain2_exists:
            globalquestions.append(process_item(item, domain=domains))
        if neither_exists:
            not_in_corpus += 1

    print(f"There are: {len(localquestions)} local questions")
    print(f"There are: {len(globalquestions)} global questions")
    print(f"For {not_in_corpus} questions, could not find documents in corpus.")
    
    if not os.path.exists(f'{path}/domain_0/'):
        os.makedirs(f'{path}/domain_0/')
    if not os.path.exists(f'{path}/domain_1/'):
        os.makedirs(f'{path}/domain_1/')
    
    with open(f'{path}/domain_0/hotpot_qas_val_domain0.json', 'w') as f:
        for question in localquestions:
            json.dump(question, f)
            f.write("\n")
    with open(f'{path}/domain_1/hotpot_qas_val_domain1.json', 'w') as f:
        for question in globalquestions:
            json.dump(question, f)
            f.write("\n")
            
    allquestions = localquestions.copy()
    allquestions.extend(globalquestions.copy())
    with open(f'{path}/hotpot_qas_val_all.json', 'w') as f:
        for question in allquestions:
            json.dump(question, f)
            f.write("\n")
            
    print(f"Saved data for private proportion {private_prop}.")
    

7405it [00:00, 156836.59it/s]

There are: 1837 local questions
There are: 1823 global questions
For 0 questions, could not find documents in corpus.
Fore 0 questions, contains private and public entities.




