In [2]:
import wikipedia
import os
import json
import tqdm
import argparse

import multiqa_utils.general_utils as gu
import multiqa_utils.qampari_utils as qu
import multiqa_utils.retrieval_utils as ru
import multiqa_utils.wikipedia_utils as wu
import multiqa_utils.distributed_utils as du

%load_ext autoreload
%autoreload 2

## First, build the ent_string to wikipage_title cache

In [552]:
path_args = gu.current_default_path_args()

In [4]:
all_titles = wu.build_gt_wikititle_set(path_args, force=False)

>> Index already exists: /scratch/ddr8143/wikipedia/tagme_dumps_qampari_wikipedia/postprocessed/gt_title_set.json


In [5]:
curr_cache = wu.get_initial_str2wikipage_cache(all_titles, path_args, force=False)

>> Loading the cache
>> Initial cache size: 6388942


In [None]:
# Without any flags set true (add_elq, add_gpt, add_wikitags) this just
#   goes through the current list and removes strings already in cache.
ru.aggregate_strs_to_add_to_cache(
    path_args,
    use_tqdm=True,
    curr_cache=curr_cache,
)

In [None]:
strs_to_add = json.load(open(path_args.strs_for_cache_path))

In [None]:
du.distributed_build_str2wikipage_cache(
    path_args,
    job_id=0,
    total_num_jobs=40000,
    all_strs_to_add=strs_to_add,
    use_tqdm=True,    
)

**After a Distributed Run, aggregate the results**

In [560]:
du.aggregate_checkpoint_dicts(path_args.cache_path, remove_processed=False, dry_run_remove=False)

>> Aggregating all 11 versions of: /scratch/ddr8143/wikipedia/tagme_dumps_qampari_wikipedia/postprocessed/str2wikipage_cache.json
>> Length of final dict: 7198496
>> Dumped: /scratch/ddr8143/wikipedia/tagme_dumps_qampari_wikipedia/postprocessed/str2wikipage_cache.json


In [561]:
du.aggregate_checkpoint_dicts(path_args.disambig_cache_path, remove_processed=False, dry_run_remove=False)

>> Aggregating all 11 versions of: /scratch/ddr8143/wikipedia/tagme_dumps_qampari_wikipedia/postprocessed/str2wikipage_disambig_cache.json
>> Length of final dict: 4968
>> Dumped: /scratch/ddr8143/wikipedia/tagme_dumps_qampari_wikipedia/postprocessed/str2wikipage_disambig_cache.json


In [562]:
curr_cache = wu.get_initial_str2wikipage_cache(all_titles, path_args, force=False)

>> Loading the cache
>> Initial cache size: 7198496


In [563]:
# Without any flags set true (add_elq, add_gpt, add_wikitags) this just
#   goes through the current list and removes strings already in cache.
ru.aggregate_strs_to_add_to_cache(
    path_args,
    use_tqdm=True,
    curr_cache=curr_cache,
)

>> Load existing string list: /scratch/ddr8143/wikipedia/tagme_dumps_qampari_wikipedia/postprocessed/strs_to_add_to_cache_v1.json
>> Initial string list length: 1459129
>> Removing strings already in cache
>> New string list length: 889318
>> Writing file
>> Dumped to: /scratch/ddr8143/wikipedia/tagme_dumps_qampari_wikipedia/postprocessed/strs_to_add_to_cache_v1.json


In [549]:
def remove_unneeded_lookup_strings(init_list, text_cache_set, notext_cache_set):
    init_len = len(init_list)
    print(">> Initial list length:", init_len)
    
    # Remove all cases that start with hashes
    new_list = [s for s in init_list if s[0] != '#']
    print(">> After removing start with hashes:", len(new_list))
    
    # Replace cases with hashes with just the part before the anchor
    new_list = [s if '#' not in s else s.split('#')[0] for s in new_list]
    new_list = [s for s in new_list if s not in text_cache_set]
    print(">> After removing anchors:", len(new_list))
    
    # Remove all that are in notext cache set (fixed by redirects)
    new_list = [s for s in new_list if s not in notext_cache_set]
    print(">> After removing redirects:", len(new_list))
    
    # Fix amps in both directions
    new_list = [s.replace('&amp;', '&') for s in new_list]
    new_list = [s for s in new_list if s not in notext_cache_set and s not in text_cache_set]
    new_list = [s.replace('&', '&amp;') for s in new_list]
    new_list = [s for s in new_list if s not in notext_cache_set and s not in text_cache_set]
    print(">> After fixing amps:", len(new_list))
    
    final_len = len(new_list)
    removed = init_len - final_len
    print(f"Removed {removed} strings: {removed * 100.0 / init_len:0.2f}% of the list")
    print("Final Length:", len(new_list))
    return new_list

In [550]:
strs_to_add = json.load(open(path_args.strs_for_cache_path))

In [554]:
new_strs_to_add = remove_unneeded_lookup_strings(strs_to_add, set(all_titles), no_text_page_titles)

>> Initial list length: 3668245
>> After removing start with hashes: 3640074
>> After removing anchors: 3329325
>> After removing redirects: 1461231
>> After fixing amps: 1461048
Removed 2207197 strings: 60.17% of the list
Final Length: 1461048


In [555]:
strs_to_add = json.load(open(path_args.strs_for_cache_path))

## Try to see if we can identify redirects ourselves

In [139]:
# Conclusion, the way we could do it is to call the online API
#import requests
def call_online_redirect_api(existing_page_name):
    S = requests.Session()

    URL = "https://en.wikipedia.org/w/api.php"

    PARAMS = {
        "action": "query",
        "format": "json",
        "titles": existing_page_name,
        "prop": "redirects"
    }

    R = S.get(url=URL, params=PARAMS)
    DATA = R.json()

    PAGES = DATA["query"]["pages"]


    redirects = []
    for k, v in PAGES.items():
        if 'redirects' in v:
            for re in v["redirects"]:
                redirects.append(re['title'])
                print(re["title"] + " redirect to " + v["title"])
    return redirects

In [181]:
def url_to_redirect_title(url_str, session=None):
    if session is None:
        session = requests.Session()
    response = session.get(url=url_str)
    title = [l for l in response.text.split('\n') if '<title>' in l]
    return title

In [186]:
ED_wiki_35 = gu.loadjsonl("/scratch/ddr8143/wikipedia/qampari_wikipedia/parsed_dumps/AA/wiki_35")

In [198]:
redirect_pages = []
for kdata in ED_wiki_35:
    kdata.keys()
    if kdata['text'] == '':
        print(kdata)
        redirect_pages.append(kdata)

{'id': '3619', 'revid': '9784415', 'url': 'https://en.wikipedia.org/wiki?curid=3619', 'title': 'Military of Botswana', 'text': ''}
{'id': '3622', 'revid': '9784415', 'url': 'https://en.wikipedia.org/wiki?curid=3622', 'title': 'Geography of Bouvet Island', 'text': ''}
{'id': '3623', 'revid': '9784415', 'url': 'https://en.wikipedia.org/wiki?curid=3623', 'title': 'Bouvet Island/People', 'text': ''}
{'id': '3624', 'revid': '9784415', 'url': 'https://en.wikipedia.org/wiki?curid=3624', 'title': 'Government of Bouvet Island', 'text': ''}
{'id': '3625', 'revid': '9784415', 'url': 'https://en.wikipedia.org/wiki?curid=3625', 'title': 'Economy of Bouvet Island', 'text': ''}
{'id': '3626', 'revid': '9784415', 'url': 'https://en.wikipedia.org/wiki?curid=3626', 'title': 'Communications in Bouvet Island', 'text': ''}
{'id': '3627', 'revid': '9784415', 'url': 'https://en.wikipedia.org/wiki?curid=3627', 'title': 'Bouvet Island/Transportation', 'text': ''}
{'id': '3628', 'revid': '9784415', 'url': 'http

## Answer the Question: How Many Answers are Wikipages?

In [206]:
def get_no_text_pages(subseg_path, verbose=False):
    no_text_pages = []
    for kdata in gu.loadjsonl(subseg_path):
        if kdata['clean_text'] == '':
            no_text_pages.append(kdata)
    return no_text_pages

In [207]:
path_args.gt_wiki_dir

'/scratch/ddr8143/wikipedia/tagme_dumps_qampari_wikipedia/'

In [217]:
len(all_no_text_pages_real)

9574158

In [218]:
no_text_page_titles = set()
for p in all_no_text_pages_real:
    no_text_page_titles.add(gu.normalize(p['title']))

In [221]:
qmp_dev = qu.load_dev_data()

In [510]:
def ans_normalize_and_split(ans):
    _RE_COMBINE_WHITESPACE = re.compile(r"\s+")
    a = ans.strip("[]\" *\n,")
    if "http" in a:
        a = a.split("http")[0]
    if "|" in a:
        a = a.split("|")[0]
    if "\n" in a:
        a = a.split("\n")[0]
    a = a.strip("[] *\n,")
    a = a.replace("’", "'").replace(":", '').replace("&", "&amp;")
    a = _RE_COMBINE_WHITESPACE.sub(" ", a).strip()
    a = gu.normalize(a)
    return a

In [439]:
def extract_answer_text(ans_dict):
    return ans_dict['answer_text']

In [426]:
def extract_answer_url(ans_dict):
    if 'answer_url' not in ans_dict:
        return None
    return ans_dict['answer_url'].split('wiki/')[-1]

In [395]:
def answers_from_dataset(dataset, answer_fxn):
    all_ans = set()
    for d in dataset:
        for a_dict in d['answer_list']:
            a = answer_fxn(a_dict)
            if a is None:
                continue
            all_ans.add(a)
    return all_ans

In [408]:
def normed_answers_to_answerinfo(dataset, answer_fxn, norm_fxn):
    answer_dict = {}
    for d in dataset:
        for a_dict in d['answer_list']:
            a = answer_fxn(a_dict)
            if a is None or a.strip() == '':
                continue
            #print(a)
            #print(norm_fxn(a))
            answer_dict[norm_fxn(a)] = {
                "question_text": d['question_text'],
                **a_dict,
            }
    return answer_dict

In [443]:
def get_answer_composition(dataset, no_text_titles, text_titles, answer_fxn, norm_fxn):
    all_ans = answers_from_dataset(dataset, answer_fxn)
    norm_all_ans = set([norm_fxn(a) for a in all_ans if norm_fxn(a) != ''])
    in_text = norm_all_ans & text_titles
    in_no_text = norm_all_ans & no_text_titles
    in_neither = norm_all_ans - in_text - in_no_text
    num_ans = len(all_ans)
    print(f"all_ans: {num_ans}| in text: {len(in_text)} | in notext: {len(in_no_text)} | in neither: {len(in_neither)} ({len(in_neither)*100.0 /num_ans:0.2f}%)")
    return in_neither

In [435]:
print("=== Dev Answer_URLs Missing In Page Titles (default normalize) ===\n")
in_neither_urls_default = get_answer_composition(qmp_dev, no_text_page_titles, set(all_titles), extract_answer_url, gu.normalize)
print()
answer_info_urls_default = normed_answers_to_answerinfo(qmp_dev, extract_answer_url, gu.normalize)
for a in in_neither_urls_default:
    print(f"{a:40}", answer_info_urls_default[a]['answer_url'])

=== Dev Answer_URLs Missing In Page Titles (default normalize) ===

all_ans: 10833| in text: 10808 | in notext: 1144 | in neither: 12

2001:_a_space_odyssey_(film)             https://en.wikipedia.org/wiki/2001:_A_Space_Odyssey_(film)
armageddon_time                          https://en.wikipedia.org/wiki/Armageddon_Time
hexen:_beyond_heretic                    https://en.wikipedia.org/wiki/Hexen:_Beyond_Heretic
sydney_trains_a_&_b_sets                 https://en.wikipedia.org/wiki/Sydney_Trains_A_&_B_sets
lincoln_zephyr_(china)                   https://en.wikipedia.org/wiki/Lincoln_Zephyr_(China)
enemy_territory:_quake_wars              https://en.wikipedia.org/wiki/Enemy_Territory:_Quake_Wars
bob_biswas                               https://en.wikipedia.org/wiki/Bob_Biswas
chumo_the_holy_of_goguryeo               https://en.wikipedia.org/wiki/Chumo_the_Holy_of_Goguryeo
star_trek:_discovery                     https://en.wikipedia.org/wiki/Star_Trek:_Discovery
henry_&_june            

In [437]:
print("=== Dev Answer_URLs Missing In Page Titles (aggressive normalize) ===\n")
in_neither_urls_better = get_answer_composition(qmp_dev, no_text_page_titles, set(all_titles), extract_answer_url, ans_normalize_and_split)
print()
answer_info_urls_better = normed_answers_to_answerinfo(qmp_dev, extract_answer_url, ans_normalize_and_split)
for a in in_neither_urls_better:
    print(f"{a:40}", answer_info_urls_better[a]['answer_url'])

=== Dev Answer_URLs Missing In Page Titles (aggressive normalize) ===

all_ans: 10833| in text: 10811 | in notext: 1146 | in neither: 7

armageddon_time                          https://en.wikipedia.org/wiki/Armageddon_Time
lincoln_zephyr_(china)                   https://en.wikipedia.org/wiki/Lincoln_Zephyr_(China)
mishima_a_life_in_four_chapters          https://en.wikipedia.org/wiki/Mishima:_A_Life_in_Four_Chapters
hexen_beyond_heretic                     https://en.wikipedia.org/wiki/Hexen:_Beyond_Heretic
bob_biswas                               https://en.wikipedia.org/wiki/Bob_Biswas
chumo_the_holy_of_goguryeo               https://en.wikipedia.org/wiki/Chumo_the_Holy_of_Goguryeo
2001_a_space_odyssey_(film)              https://en.wikipedia.org/wiki/2001:_A_Space_Odyssey_(film)


In [475]:
print("=== Dev Answer Texts Missing In Page Titles (default normalize) ===\n")
in_neither_text_default = get_answer_composition(qmp_dev, no_text_page_titles, set(all_titles), extract_answer_text, gu.normalize)
print()
answer_info_text_default = normed_answers_to_answerinfo(qmp_dev, extract_answer_text, gu.normalize)
for a in list(in_neither_text_default)[:30]:
    print(f"{a:100}   |{answer_info_text_default[a]['answer_text']}|")

=== Dev Answer Texts Missing In Page Titles (default normalize) ===

all_ans: 12462| in text: 10379 | in notext: 2682 | in neither: 899 (7.21%)

[[chilis]]                                                                                             |[[Chilis]]|
_viktor_gyökeres                                                                                      | Viktor Gyökeres|
pakistan_aeronautical_complex_(pac)                                                                    |Pakistan Aeronautical Complex (PAC)|
chesapeake_&_delaware_canal_bridge                                                                     |Chesapeake & Delaware Canal Bridge|
conning_towers_nautilus_park                                                                           |Conning Towers Nautilus Park|
[[soviet_aircraft_carrier_admiral_gorshkov|admiral_flota_sovetskogo_soyuza_gorshkov]]                  |[[Soviet aircraft carrier Admiral Gorshkov|Admiral Flota Sovetskogo Soyuza Gorshkov]]|
webcrawler*

In [511]:
print("=== Dev Answer Texts Missing In Page Titles (aggressive normalize) ===\n")
in_neither_text_better = get_answer_composition(qmp_dev, no_text_page_titles, set(all_titles), extract_answer_text, ans_normalize_and_split)
print()
answer_info_text_better = normed_answers_to_answerinfo(qmp_dev, extract_answer_text, ans_normalize_and_split)
for a in list(in_neither_text_better)[110:150]:
    print(f"{a:80}   {answer_info_text_better[a]['answer_text']}")

=== Dev Answer Texts Missing In Page Titles (aggressive normalize) ===

all_ans: 12462| in text: 10573 | in notext: 2759 | in neither: 651 (5.22%)

poems_and_stories                                                                  Poems and Stories
russian_ship_of_the_line_imperator_nikolai_i                                       Russian ship of the line Imperator Nikolai I
o_ōnomatsu                                                                         o Ōnomatsu
hermann_waldaestel                                                                 Hermann Waldaestel
semyon_budyonnyy                                                                   Semyon Budyonnyy
jen_kamerman                                                                       Jen Kamerman
micheon                                                                            Micheon
johnny_gravel                                                                      Johnny Gravel
ludlow_massacre_strike                    

In [None]:
# TODO: Try to directly match the links
# --> If we can do this it dramatically cuts down on the number of wikipedia api calls we'd use even
#   if we don't directly match ELQ results

In [None]:
# TODO: Try to directly match the ELQ results
# --> if we can do this then we can run ELQ on wikipedia as a whole instead of using the wikipedia api

## Previously Ran
But haven't retested after the refactor.