In [1]:
import argparse
import jsonlines
import os

import elq.main_dense as main_dense

import multiqa_utils.qampari_utils as qu

%load_ext autoreload
%autoreload 2

## Question 1: Are all QAMPARI answers pages in Wikipedia?

From looking at examples, the answer seems to be yes. However, only some of the answers come with an "answer_url" in their answer-list dict. For the rest, the proof contains a "found_in_url", which is almost certainly a page that links to the entity we care about. So my conclusion is that yes, every answer is an entity, but aggregating the entities might be slightly harder.

In [2]:
# Load the dev data through the qampari_utils helper. Later cells index the
# result positionally (qmp_dev[i]) and read each item as a dict.
qmp_dev = qu.load_dev_data()

In [3]:
# Inspect which fields a single example carries.
qmp_dev[0].keys()

dict_keys(['entities', 'question_text', 'answer_list', 'qid'])

In [4]:
# For a few hand-picked examples, count the answers that carry an explicit
# 'answer_url' and print the proof entries of the answers that do not.
for example_idx in (0, 1, 3):
    example = qmp_dev[example_idx]
    print(">>", example_idx, example['question_text'])
    url_count = 0
    for answer in example['answer_list']:
        answer_text = answer['answer_text']
        if 'answer_url' not in answer:
            # No direct URL: show the proof so its 'found_in_url' can be checked.
            print(answer_text, answer['proof'])
            continue
        url_count += 1
    print(url_count, "------\n")

>> 0 What manga was drawn by Ryoichi Ikegami?
6 ------

>> 1 Harmony Korine was both screenwriter and director of what movie?
6 ------

>> 3 Who directed a film that had P. Balachandran as a screenwriter?
Kamal [{'proof_text': 'Ulladakkam () is 1991 indian malayalam-language psychological thriller film directed by kamal and written by p. Balachandran from a story by cheriyan kalpakavadi.', 'found_in_url': 'https://en.wikipedia.org/wiki/Ulladakkam', 'pid': '366__wikidata_comp__dev__0__0'}, {'proof_text': 'Ulladakkam () is 1991 indian malayalam-language psychological thriller film directed by kamal and written by p. Balachandran from a story by cheriyan kalpakavadi.', 'found_in_url': 'https://en.wikipedia.org/wiki/Ulladakkam', 'pid': '366__wikidata_comp__dev__0__1'}]
P. Balachandran [{'proof_text': 'Ivan megharoopan is a 2012 malayalam biographical film written and directed by p. Balachandran.', 'found_in_url': 'https://en.wikipedia.org/wiki/Ivan_Megharoopan', 'pid': '366__wikidata_comp_

In [5]:
## Let's Entity Link the QMP Dev Questions

In [6]:
# Directory holding the ELQ model files; every model-related entry below is
# resolved relative to it.
models_path = "/scratch/ddr8143/repos/BLINK/models/" # the path where you stored the ELQ models

# Options handed to ELQ's main_dense module (wrapped in a Namespace below).
config = dict(
    interactive=False,
    biencoder_model=os.path.join(models_path, "elq_wiki_large.bin"),
    biencoder_config=os.path.join(models_path, "elq_large_params.txt"),
    cand_token_ids_path=os.path.join(models_path, "entity_token_ids_128.t7"),
    entity_catalogue=os.path.join(models_path, "entity.jsonl"),
    entity_encoding=os.path.join(models_path, "all_entities_large.t7"),
    output_path="/scratch/ddr8143/repos/BLINK/logs/",  # logging directory
    faiss_index="hnsw",
    index_path=os.path.join(models_path, "faiss_hnsw_index.pkl"),
    num_cand_mentions=10,
    num_cand_entities=10,
    threshold_type="joint",
    threshold=-4.5,
    base_path="/scratch/ddr8143/repos/BLINK/",
)

# main_dense receives its options as attributes, so expose the dict as a Namespace.
args = argparse.Namespace(**config)

In [7]:
# Load the ELQ models described by `args` (no logger supplied). The result is
# unpacked positionally (*models) into main_dense.run in later cells.
models = main_dense.load_models(args, logger=None)

In [8]:
# Three hand-written probe questions for a quick check of the linker; each
# record gets a sequential integer id and lower-cased text.
data_to_link = [
    {"id": question_id, "text": question.lower()}
    for question_id, question in enumerate((
        "paris is capital of which country?",
        "paris is great granddaughter of whom?",
        "who discovered o in the periodic table?",
    ))
]

In [9]:
# Run ELQ over the probe questions. The second positional argument (the logger
# slot, cf. load_models(..., logger=None)) is None, and the loaded models are
# unpacked positionally.
predictions = main_dense.run(args, None, *models, test_data=data_to_link)

100%|██████████| 1/1 [00:04<00:00,  4.80s/it]

*--------*





In [10]:
# Display the raw predictions for manual inspection.
predictions

[{'id': 0,
  'text': 'paris is capital of which country?',
  'scores': [-0.9338257312774658, -3.9720420837402344],
  'pred_tuples_string': [['Paris', 'paris'], ['Capital city', 'capital']],
  'pred_triples': [('11245', 0, 1), ('100454', 2, 3)],
  'tokens': [3000, 2003, 3007, 1997, 2029, 2406, 1029]},
 {'id': 1,
  'text': 'paris is great granddaughter of whom?',
  'scores': [-3.798149824142456],
  'pred_tuples_string': [['Paris Hilton', 'paris']],
  'pred_triples': [('1610293', 0, 1)],
  'tokens': [3000, 2003, 2307, 12787, 1997, 3183, 1029]},
 {'id': 2,
  'text': 'who discovered o in the periodic table?',
  'scores': [-0.5392036437988281, -3.7034592628479004],
  'pred_tuples_string': [['Periodic table', 'periodic table'],
   ['Oxygen', 'o']],
  'pred_triples': [('11282', 5, 7), ('10935', 2, 3)],
  'tokens': [2040, 3603, 1051, 1999, 1996, 15861, 2795, 1029]}]

In [26]:
# Output file for the dev-question entity links. Despite the "outdir" name this
# is a .jsonl file path, not a directory.
# NOTE(review): the filename says "eql" where "elq" was probably intended; left
# unchanged because a file with this name may already exist. Confirm before renaming.
qmp_qs_outdir = "/scratch/ddr8143/multiqa/qampari_data/eql_default_tagging_v0_qmp_dev.jsonl"

In [39]:
def elq_tag_questions_and_dump(models, qlist, outdir, chunk_size=10, run_args=None):
    """Entity-link questions with ELQ in chunks and dump the predictions as JSON lines.

    Every question in ``qlist`` is sent to ``main_dense.run`` exactly once, in
    consecutive chunks of at most ``chunk_size`` questions, and each returned
    prediction is written as one record of the output file.

    Args:
        models: Sequence unpacked positionally into ``main_dense.run``
            (in this notebook, the value returned by ``main_dense.load_models``).
        qlist: Indexable sequence of dicts, each with a ``'qid'`` and a
            ``'question_text'`` entry.
        outdir: Path of the output ``.jsonl`` *file* (despite the name, not a
            directory). It must not exist yet.
        chunk_size: Maximum number of questions passed to one
            ``main_dense.run`` call. Must be at least 1.
        run_args: Options namespace forwarded to ``main_dense.run``. When
            omitted, the notebook-level ``args`` is used.

    Raises:
        AssertionError: If ``outdir`` already exists (results are never
            overwritten).
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # Validate before touching the filesystem so a bad call leaves no empty file behind.
    if chunk_size < 1:
        raise ValueError("chunk_size must be >= 1, got %r" % (chunk_size,))
    assert not os.path.exists(outdir), "Refusing to overwrite existing file: %s" % (outdir,)

    if run_args is None:
        # Fall back to the notebook-level configuration.
        run_args = args

    num_questions = len(qlist)
    with jsonlines.open(outdir, mode='w') as writer:
        # Step through the questions in strides of chunk_size so that no question
        # is skipped at a chunk boundary and no empty batch reaches the model.
        for start in range(0, num_questions, chunk_size):
            stop = min(start + chunk_size, num_questions)
            chunk = [
                {"id": qlist[i]['qid'], "text": qlist[i]['question_text']}
                for i in range(start, stop)
            ]
            preds = main_dense.run(run_args, None, *models, test_data=chunk)
            for p in preds:
                writer.write(p)
    print(">> Wrote all entity links to:", outdir)

In [40]:
# Tag every dev question and write the predictions to qmp_qs_outdir, sending up
# to 1000 questions per main_dense.run call.
elq_tag_questions_and_dump(models, qmp_dev, qmp_qs_outdir, chunk_size=1000)



  0%|          | 0/125 [00:00<?, ?it/s][A[A

  1%|          | 1/125 [00:12<25:58, 12.57s/it][A[A

  2%|▏         | 2/125 [00:24<25:10, 12.28s/it][A[A

  2%|▏         | 3/125 [00:36<24:53, 12.24s/it][A[A

  3%|▎         | 4/125 [00:49<25:10, 12.48s/it][A[A

  4%|▍         | 5/125 [01:02<25:35, 12.79s/it][A[A

  5%|▍         | 6/125 [01:14<24:30, 12.36s/it][A[A

  6%|▌         | 7/125 [01:26<24:26, 12.43s/it][A[A

  6%|▋         | 8/125 [01:40<24:40, 12.65s/it][A[A

  7%|▋         | 9/125 [01:51<24:04, 12.45s/it][A[A

  8%|▊         | 10/125 [02:04<23:56, 12.49s/it][A[A

  9%|▉         | 11/125 [02:17<24:01, 12.65s/it][A[A

 10%|▉         | 12/125 [02:30<23:50, 12.66s/it][A[A

 10%|█         | 13/125 [02:43<24:11, 12.96s/it][A[A

 11%|█         | 14/125 [02:59<25:09, 13.60s/it][A[A

 12%|█▏        | 15/125 [03:11<24:22, 13.29s/it][A[A

 13%|█▎        | 16/125 [03:24<23:57, 13.19s/it][A[A

 14%|█▎        | 17/125 [03:37<23:43, 13.18s/it][A[A

 14%|█▍ 

*--------*
>> Wrote all entity links to: /scratch/ddr8143/multiqa/qampari_data/eql_default_tagging_v0_qmp_dev.jsonl
