In [1]:
import sh
import json
import os

from collections import defaultdict

In [2]:
# Root directories can be overridden through environment variables so the
# notebook is not tied to one user's scratch space; the defaults are the
# original hard-coded locations.
base_path = os.environ.get("DPR_DATA_DIR", "/scratch/ddr8143/repos/DPR/downloads/data")
ambigqa_path = f"{base_path}/ambigqa"
ambigqa_light_path = f"{base_path}/ambigqa_light"
nq_path = f"{base_path}/retriever"
qp_path = f"{base_path}/qampari"

# Root of the Wikipedia dump; segments (AA, AB, ...) are sub-directories of it.
wikipath = os.environ.get("ENWIKI_DUMP_DIR", "/scratch/ddr8143/wikipedia/enwiki_20220701")

## Inspect QAMPARI Data

In [3]:
# List the files in the QAMPARI data directory.
sh.ls(qp_path)

dev_data.jsonl	test_data.jsonl  train_data.jsonl

In [4]:
# Load the QAMPARI dev file: one JSON object per line (JSONL).
# The context manager closes the file handle, and the file is streamed line by
# line instead of being read fully with readlines(). Blank lines are skipped so
# a trailing newline at the end of the file cannot break json.loads.
with open(f"{qp_path}/dev_data.jsonl", encoding="utf-8") as qp_file:
    qp_data = [json.loads(line) for line in qp_file if line.strip()]

In [5]:
# Peek at the first loaded record to see its structure.
qp_data[0]

{'entities': [{'entity_url': 'https://en.wikipedia.org/wiki/Ryoichi_Ikegami',
   'entity_text': 'Ryoichi Ikegami',
   'aliases': ['Ryoichi Ikegami']}],
 'question_text': 'What manga was drawn by Ryoichi Ikegami?',
 'answer_list': [{'answer_text': 'Heat',
   'aid': '799__wikidata_simple__test__0',
   'aliases': ['Heat'],
   'answer_url': 'https://en.wikipedia.org/wiki/Heat_(manga)',
   'proof': [{'proof_text': ' is a seinen manga series written by buronson and illustrated by ryoichi ikegami.',
     'found_in_url': 'https://en.wikipedia.org/wiki/Heat_(manga)',
     'pid': '799__wikidata_simple__test__0__0'}]},
  {'answer_text': 'Mai, the Psychic Girl',
   'aid': '799__wikidata_simple__test__1',
   'aliases': ['Mai, the Psychic Girl'],
   'answer_url': 'https://en.wikipedia.org/wiki/Mai,_the_Psychic_Girl',
   'proof': [{'proof_text': 'mai, the psychic girl, known simply as in japan, is a manga written by kazuya kudō and illustrated by ryoichi ikegami.\nthe main character is mai kuju, a 14

## Inspect Wiki Data

In [10]:
#wiki_segment = f"{wikipath}/AA"

In [3]:
# Show the configured wiki dump root.
wikipath

'/scratch/ddr8143/wikipedia/enwiki_20220701'

In [4]:
def get_wikiseg_path(wikipath, segment):
    """Return the directory path of ``segment`` under the dump root ``wikipath``."""
    segment_dir = "{}/{}".format(wikipath, segment)
    return segment_dir

In [5]:
def get_metadata_path(wikipath, segment):
    """Return the path of the ``metadata.json`` file inside a segment directory."""
    segment_dir = get_wikiseg_path(wikipath, segment)
    return f"{segment_dir}/metadata.json"

In [11]:
# sh.ls(wiki_segment)
# os.listdir(wikisegment)

wiki_00  wiki_12  wiki_24  wiki_36  wiki_48  wiki_60  wiki_72  wiki_84	wiki_96
wiki_01  wiki_13  wiki_25  wiki_37  wiki_49  wiki_61  wiki_73  wiki_85	wiki_97
wiki_02  wiki_14  wiki_26  wiki_38  wiki_50  wiki_62  wiki_74  wiki_86	wiki_98
wiki_03  wiki_15  wiki_27  wiki_39  wiki_51  wiki_63  wiki_75  wiki_87	wiki_99
wiki_04  wiki_16  wiki_28  wiki_40  wiki_52  wiki_64  wiki_76  wiki_88
wiki_05  wiki_17  wiki_29  wiki_41  wiki_53  wiki_65  wiki_77  wiki_89
wiki_06  wiki_18  wiki_30  wiki_42  wiki_54  wiki_66  wiki_78  wiki_90
wiki_07  wiki_19  wiki_31  wiki_43  wiki_55  wiki_67  wiki_79  wiki_91
wiki_08  wiki_20  wiki_32  wiki_44  wiki_56  wiki_68  wiki_80  wiki_92
wiki_09  wiki_21  wiki_33  wiki_45  wiki_57  wiki_69  wiki_81  wiki_93
wiki_10  wiki_22  wiki_34  wiki_46  wiki_58  wiki_70  wiki_82  wiki_94
wiki_11  wiki_23  wiki_35  wiki_47  wiki_59  wiki_71  wiki_83  wiki_95

In [3]:
#title_to_info = defaultdict(list)

In [6]:
def get_segment_metadata(wikipath, segment, force=False):
    """Build and write the title index (``metadata.json``) for one wiki segment.

    Every file in the segment directory whose name does not contain
    "metadata" is read as JSON lines. For each record, its ``id``, ``url`` and
    a ``has_text`` flag (``text`` is non-empty) are grouped under the record's
    ``title``. Records with text are additionally collected, without the flag,
    in a second mapping.

    Args:
        wikipath: Root directory of the wiki dump.
        segment: Name of the segment sub-directory under ``wikipath``.
        force: When true, rebuild even if the metadata file already exists.

    Returns:
        None. The index is written to ``get_metadata_path(wikipath, segment)``.

    Raises:
        AssertionError: If any title occurs more than once within the segment.
    """
    mdpath = get_metadata_path(wikipath, segment)
    if not force and os.path.exists(mdpath):
        print(f">> Metadata exists: {mdpath}")
        return

    wiki_segment = get_wikiseg_path(wikipath, segment)
    seg_title_to_info = defaultdict(list)
    seg_title_to_info_wtext = defaultdict(list)
    for subseg in sorted(os.listdir(wiki_segment)):
        # Skip previously written metadata files (and their temp files).
        if "metadata" in subseg:
            continue
        subseg_path = f"{wiki_segment}/{subseg}"
        with open(subseg_path, encoding="utf-8") as f:
            for jl in f:
                record = json.loads(jl)
                try:
                    ltitle = record['title']
                    ldata = {
                        'id': record['id'],
                        'has_text': record['text'] != '',
                        'url': record['url'],
                    }
                    seg_title_to_info[ltitle].append(ldata)
                    if ldata['has_text']:
                        seg_title_to_info_wtext[ltitle].append(
                            {k: v for k, v in ldata.items() if k != 'has_text'}
                        )
                except (KeyError, TypeError):
                    # Best effort: report a record that lacks an expected field
                    # (or is not a JSON object) and keep going. Anything else,
                    # including KeyboardInterrupt, propagates.
                    print("Exception!!!")
                    print(record)

    # Validate results
    duplicate_titles = {k: v for k, v in seg_title_to_info.items() if len(v) > 1}
    assert len(duplicate_titles) == 0, f"Number duplicate titles: {len(duplicate_titles)} for wiki segment: {wiki_segment}"

    # Write metadata
    metadata = {
        "all_titles": dict(seg_title_to_info),
        "titles_with_text": dict(seg_title_to_info_wtext),
    }
    # Write to a temp file and rename it into place, so an interrupted dump
    # cannot leave a truncated metadata.json that the existence check above
    # would later treat as complete.
    tmp_mdpath = f"{mdpath}.tmp"
    with open(tmp_mdpath, 'w') as mdf:
        json.dump(metadata, mdf)
    os.replace(tmp_mdpath, mdpath)
    print(f">> Wrote metadata for {len(seg_title_to_info):6} titles ({len(seg_title_to_info_wtext):6} with text) to {mdpath}")

In [8]:
# Build (or reuse) the metadata index for every segment directory in the dump.
for segment in sorted(os.listdir(wikipath)):
    # Only directories are segments; a stray file at the top level would make
    # os.listdir fail inside get_segment_metadata.
    if not os.path.isdir(get_wikiseg_path(wikipath, segment)):
        continue
    get_segment_metadata(wikipath, segment, force=False)

>> Metadata exists: /scratch/ddr8143/wikipedia/enwiki_20220701/AA/metadata.json
>> Metadata exists: /scratch/ddr8143/wikipedia/enwiki_20220701/AB/metadata.json
>> Metadata exists: /scratch/ddr8143/wikipedia/enwiki_20220701/AC/metadata.json
>> Metadata exists: /scratch/ddr8143/wikipedia/enwiki_20220701/AD/metadata.json
>> Metadata exists: /scratch/ddr8143/wikipedia/enwiki_20220701/AE/metadata.json
>> Metadata exists: /scratch/ddr8143/wikipedia/enwiki_20220701/AF/metadata.json
>> Metadata exists: /scratch/ddr8143/wikipedia/enwiki_20220701/AG/metadata.json
>> Metadata exists: /scratch/ddr8143/wikipedia/enwiki_20220701/AH/metadata.json
>> Metadata exists: /scratch/ddr8143/wikipedia/enwiki_20220701/AI/metadata.json
>> Metadata exists: /scratch/ddr8143/wikipedia/enwiki_20220701/AJ/metadata.json
>> Metadata exists: /scratch/ddr8143/wikipedia/enwiki_20220701/AK/metadata.json
>> Metadata exists: /scratch/ddr8143/wikipedia/enwiki_20220701/AL/metadata.json
>> Metadata exists: /scratch/ddr8143/wik

In [55]:
# Load the metadata written for segment AE. The path comes from the same helper
# that wrote it, and the context manager closes the file handle.
with open(get_metadata_path(wikipath, "AE")) as ae_meta_file:
    AE_meta = json.load(ae_meta_file)

In [56]:
# Top-level keys of the loaded AE metadata.
AE_meta.keys()

dict_keys(['all_titles', 'titles_with_text'])

In [57]:
# Number of distinct titles recorded for segment AE.
len(AE_meta['all_titles'])

10498

In [58]:
# Number of AE titles that have at least one record with non-empty text.
len(AE_meta['titles_with_text'])

7248

In [None]:
# NOTE(review): '' is not one of the keys written by get_segment_metadata
# ('all_titles', 'titles_with_text'), so this lookup would raise KeyError.
# This looks like an unfinished, never-executed cell; fill in the key or remove it.
len(AE_meta[''])

In [25]:
# Print every title that maps to more than one record.
# NOTE(review): in the visible notebook, seg_title_to_info is only assigned as a
# local inside get_segment_metadata, so this cell presumably relies on stale
# kernel state and would fail on a fresh kernel. Confirm, then rewire or remove.
for k, v in seg_title_to_info.items():
    if len(v) > 1:
        print(k, v)

In [26]:
# NOTE(review): title_to_info only appears in a commented-out assignment in the
# visible notebook; the recorded output presumably comes from an earlier kernel
# session. Confirm, then rewire or remove.
len(title_to_info)

109838

In [2]:
# NOTE(review): this cell's recorded output is a NameError. title_to_info is not
# defined by any live cell in the visible notebook (only a commented-out one).
list(dict(title_to_info))[0]

NameError: name 'title_to_info' is not defined