In [7]:
import wikipedia
import os
import json
import tqdm
import argparse
import pickle

import multiqa_utils.general_utils as gu
import multiqa_utils.qampari_utils as qu
import multiqa_utils.retrieval_utils as ru
import multiqa_utils.wikipedia_utils as wu
import multiqa_utils.distributed_utils as du

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Setup Everything

In [2]:
# Resolve the default path configuration object. Other cells in this notebook read
# attributes such as `no_text_pages`, `gpt_ans_path` and `elq_ans_path` from it.
path_args = gu.current_default_path_args()

In [None]:
# Build the title collection used to seed the page cache.
# NOTE(review): presumably the set of Wikipedia titles that have text, and `force=False`
# presumably reuses a previously built result instead of recomputing — confirm in wikipedia_utils.
text_titles = wu.build_gt_wikititle_set(path_args, force=False)

In [3]:
# Read the JSON file at `path_args.no_text_pages` (a list of dicts with a 'title' key)
# and keep only the normalized title of each entry.
# The file is opened in a `with` block so the handle is closed as soon as parsing is done,
# instead of being left open for the lifetime of the kernel.
with open(path_args.no_text_pages) as no_text_file:
    notext_titles = [gu.normalize(page['title']) for page in json.load(no_text_file)]

In [None]:
# Create the initial cache from the text titles and the path configuration.
# NOTE(review): the name suggests a string -> wiki page mapping, and `force=False`
# presumably loads an existing cache when one is available — confirm in wikipedia_utils.
curr_cache = wu.get_initial_str2wikipage_cache(text_titles, path_args, force=False)

In [19]:
# Load the dev examples. The result is indexable; each example is a dict with the keys
# 'entities', 'question_text', 'answer_list' and 'qid' (see the saved `qmp_ex.keys()` output).
qmp_dev = qu.load_dev_data()

In [None]:
# Parse the JSON file at `path_args.gpt_ans_path`.
# Using `with` guarantees the file handle is closed once the data has been loaded.
with open(path_args.gpt_ans_path) as gpt_ans_file:
    gpt_ans = json.load(gpt_ans_file)

In [None]:
# Load the records stored at `path_args.elq_ans_path` with the project's JSONL loader.
# NOTE(review): presumably one JSON object per line, returned as a list — confirm in general_utils.
elq_ans = gu.loadjsonl(path_args.elq_ans_path)

## Figure Out Redirects

In [4]:
# De-duplicate the normalized no-text titles so each title is counted once
# in the redirect statistics.
notext_title_set = set(notext_titles)

In [10]:
# Report how many distinct normalized titles are in the no-text set.
# NOTE(review): the label reads "Number to pages without titles", but the set is built from
# the titles listed in `path_args.no_text_pages`; the intended wording is probably
# "Number of pages without text" — confirm before changing the string.
print("Number to pages without titles:", len(notext_title_set))

Number to pages without titles: 9158265


In [8]:
# Load the pickled redirect table and report its size.
# SECURITY: unpickling can execute arbitrary code, so only load this file from a trusted source.
# The `with` block closes the file handle once the (large) object has been read.
with open('/scratch/ddr8143/wikipedia/old_redirects.pkl', 'rb') as redirects_file:
    redirects = pickle.load(redirects_file)
print("Number redirects:", len(redirects))

Number redirects: 7387458


In [15]:
# Split the de-duplicated no-text titles into two buckets, depending on whether the
# title is found in `redirects`. A single pass fills both lists.
notext_in_redircts, notext_not_in_redirects = [], []
for nt_title in notext_title_set:
    bucket = notext_in_redircts if nt_title in redirects else notext_not_in_redirects
    bucket.append(nt_title)

# Bucket sizes and the overall total feed the one-line summary below; the final
# "(total: a / b)" part is a sanity check that the two buckets cover every title.
num_ntTOT = len(notext_title_set)
num_ntINr, num_ntNOTINr = len(notext_in_redircts), len(notext_not_in_redirects)
print(f"{num_ntINr} ({num_ntINr * 100.0 / num_ntTOT:0.1f}%) in redirects, {num_ntNOTINr} ({num_ntNOTINr * 100.0 / num_ntTOT:0.1f}%) not in redirects (total: {num_ntINr + num_ntNOTINr} / {num_ntTOT})")


7083331 (77.3%) in redirects, 2074934 (22.7%) not in redirects (total: 9158265 / 9158265)


## Try Retrieval

In [20]:
# Use the first dev example as the worked example for this section
# and display its question text as the cell output.
qmp_ex = qmp_dev[0]
qmp_ex['question_text']

'What manga was drawn by Ryoichi Ikegami?'

In [21]:
# Show which fields an example dict carries.
qmp_ex.keys()

dict_keys(['entities', 'question_text', 'answer_list', 'qid'])

In [22]:
# Pretty-print the example: per the saved output this shows the question, its keywords,
# and each answer with its URL and proof text.
# NOTE(review): `answer_fxn` presumably controls how each answer entry is turned into
# display text — confirm in qampari_utils.
qu.print_answer_data(qmp_ex, answer_fxn=qu.extract_answer_text)

Type:                799__wikidata_simple__dev
Question:            What manga was drawn by Ryoichi Ikegami?
Question Keywords:   [31mWhat[0m, [31mmanga[0m, [31mdrawn[0m, [31mRyoichi[0m, [31mIkegami[0m
Answers:             [32mHeat[0m, [32mMai, the Psychic Girl[0m, [32mWounded Man[0m, [32mSanctuary[0m, [32mCrying Freeman[0m, [32mStrain[0m

Answer:  [32mHeat[0m
    Answer URL: https://en.wikipedia.org/wiki/Heat_(manga)
    Proofs:
    >> ([32mHeat[0m (manga))  is a seinen manga series written by buronson and illustrated by ryoichi
       ikegami.

Answer:  [32mMai, the Psychic Girl[0m
    Answer URL: https://en.wikipedia.org/wiki/Mai,_the_Psychic_Girl
    Proofs:
    >> ([32mMai, the Psychic Girl[0m) [32mmai, the psychic girl[0m, known simply as in japan, is a
       manga written by kazuya kudō and illustrated by ryoichi ikegami. the main character is mai kuju, a
       14-year-old japanese girl with powerful psychic abilities.

Answer:  [32mWounded Man

In [None]:
import pickle

In [None]:
# Load the pickled redirect table.
# SECURITY: unpickling can execute arbitrary code, so only load this file from a trusted source.
# The `with` block closes the file handle once the object has been read.
with open('/scratch/ddr8143/wikipedia/old_redirects.pkl', 'rb') as redirects_file:
    redirects = pickle.load(redirects_file)

In [None]:
# Inspect the container type of the loaded redirects (other cells call `.items()` on it).
type(redirects)

In [None]:
# Inspect the container type of `notext_titles`.
type(notext_titles)

In [None]:
# Peek at the first entry to see what a single element looks like.
notext_titles[0]

In [None]:
# Number of entries in the redirect table.
len(redirects)

In [None]:
# `notext_titles` already holds normalized title strings: it is built with
# gu.normalize(entry['title']) when the no-text pages file is loaded. Indexing each
# string with ['title'] again raises TypeError, so the list is reused as-is rather
# than re-normalized.
no_text_t_norm = notext_titles

# Titles of no-text pages that are not found in the redirect table.
no_text_t_not_in_redirects = [t for t in no_text_t_norm if t not in redirects]

In [None]:
# Titles of no-text pages that are found in the redirect table
# (the complement of `no_text_t_not_in_redirects`).
no_text_t_in_redirects = [t for t in no_text_t_norm if t in redirects]

In [None]:
# Summarize how the no-text titles split between "in redirects" and "not in redirects",
# as counts and as percentages of all no-text titles. The last line is a sanity check:
# the two counts should add up to the total printed on the first line.
# (A stray closing parenthesis on the "in redirects" line made this cell a SyntaxError.)
print(" No text pages:", len(no_text_t_norm))
print("---------------------------------")
print(f"{'No text not in redirects:':30}", len(no_text_t_not_in_redirects), f"{len(no_text_t_not_in_redirects)*100.0/len(no_text_t_norm):0.2f}%")
print("No text in redirects:", len(no_text_t_in_redirects), f"{len(no_text_t_in_redirects)*100.0/len(no_text_t_norm):0.2f}%")
print(" Which sums to:", len(no_text_t_not_in_redirects) + len(no_text_t_in_redirects))

In [None]:
# Materialize the redirect table as a list of (key, value) pairs so it can be sliced.
# Note that this copies every entry of the table into a new list.
redirects_l = list(redirects.items())

In [None]:
# Print the first ten (key, value) redirect pairs, one per line.
for r in redirects_l[:10]:
    print(r)

In [None]:
# Show the first 101 redirect entries in two columns. Each side is wrapped in '|' so
# leading/trailing whitespace in a title is visible, and the key column is padded to
# 100 characters to keep the values aligned.
for row_idx, (k, v) in enumerate(redirects.items()):
    print(f"{'|'+ k +'|':100} {'|'+v+'|'}")
    # Stop once rows 0..100 have been printed.
    if row_idx >= 100:
        break