In [1]:
import argparse
import jsonlines
import os

import elq.main_dense as main_dense

import multiqa_utils.qampari_utils as qu
import multiqa_utils.entity_linking_utils as elu

%load_ext autoreload
%autoreload 2

## Remember: This requires the el4qa conda env, not my default one.

## Question 1: Are all QAMPARI answers Wikipedia pages?

From looking at examples, the answer seems to be yes. However, only some of the answers come with an `answer_url` in their `answer_list` entry. For the rest, each proof carries a `found_in_url`, which is almost certainly a page that links to the entity we care about. So my conclusion is yes, every answer is an entity, but aggregating the entities might be slightly harder.

In [2]:
# Load the QAMPARI dev split via the project helper; the cells below index
# into it and treat each item as a dict describing one question.
qmp_dev = qu.load_dev_data()

In [3]:
# Inspect which fields a single dev example carries.
qmp_dev[0].keys()

dict_keys(['entities', 'question_text', 'answer_list', 'qid'])

In [4]:
# For a few hand-picked dev questions, count how many answers carry an
# explicit 'answer_url'; for answers without one, print the answer text
# together with its proof entries so they can be inspected by eye.
for example_idx in (0, 1, 3):
    example = qmp_dev[example_idx]
    print(">>", example_idx, example['question_text'])
    answers_with_url = 0
    for answer in example['answer_list']:
        # Looked up before branching so a missing key surfaces for every answer.
        answer_text = answer['answer_text']
        if 'answer_url' not in answer:
            print(answer_text, answer['proof'])
            continue
        answers_with_url += 1
    print(answers_with_url, "------\n")

>> 0 What manga was drawn by Ryoichi Ikegami?
6 ------

>> 1 Harmony Korine was both screenwriter and director of what movie?
6 ------

>> 3 Who directed a film that had P. Balachandran as a screenwriter?
Kamal [{'proof_text': 'Ulladakkam () is 1991 indian malayalam-language psychological thriller film directed by kamal and written by p. Balachandran from a story by cheriyan kalpakavadi.', 'found_in_url': 'https://en.wikipedia.org/wiki/Ulladakkam', 'pid': '366__wikidata_comp__dev__0__0'}, {'proof_text': 'Ulladakkam () is 1991 indian malayalam-language psychological thriller film directed by kamal and written by p. Balachandran from a story by cheriyan kalpakavadi.', 'found_in_url': 'https://en.wikipedia.org/wiki/Ulladakkam', 'pid': '366__wikidata_comp__dev__0__1'}]
P. Balachandran [{'proof_text': 'Ivan megharoopan is a 2012 malayalam biographical film written and directed by p. Balachandran.', 'found_in_url': 'https://en.wikipedia.org/wiki/Ivan_Megharoopan', 'pid': '366__wikidata_comp_

## Let's Entity Link the QMP Dev Questions

**First, let's test the utils on a small set.**

In [2]:
# Load the default entity-linking config and models once; the message printed
# by this call warns that loading may take a few minutes.
config_namespace, models = elu.load_default_entity_linking_models(output_path="./logs/")

>> Loading models, may take a few minutes.


In [8]:
# Three small sample questions for a quick check of the entity linker.
# Each record gets a sequential integer id and its lowercased question text.
data_to_link = [
    {"id": idx, "text": question.lower()}
    for idx, question in enumerate(
        (
            "paris is capital of which country?",
            "paris is great granddaughter of whom?",
            "who discovered o in the periodic table?",
        )
    )
]

In [None]:
# Run the linker on the small sample. `models` is unpacked into positional
# arguments after the config; the second argument is passed as None.
# NOTE(review): presumably that slot is the logger — confirm against
# elq.main_dense.run.
predictions = main_dense.run(config_namespace, None, *models, test_data=data_to_link)

In [10]:
# Display the raw predictions for the three sample questions.
predictions

[{'id': 0,
  'text': 'paris is capital of which country?',
  'scores': [-0.9338257312774658, -3.9720420837402344],
  'pred_tuples_string': [['Paris', 'paris'], ['Capital city', 'capital']],
  'pred_triples': [('11245', 0, 1), ('100454', 2, 3)],
  'tokens': [3000, 2003, 3007, 1997, 2029, 2406, 1029]},
 {'id': 1,
  'text': 'paris is great granddaughter of whom?',
  'scores': [-3.798149824142456],
  'pred_tuples_string': [['Paris Hilton', 'paris']],
  'pred_triples': [('1610293', 0, 1)],
  'tokens': [3000, 2003, 2307, 12787, 1997, 3183, 1029]},
 {'id': 2,
  'text': 'who discovered o in the periodic table?',
  'scores': [-0.5392036437988281, -3.7034592628479004],
  'pred_tuples_string': [['Periodic table', 'periodic table'],
   ['Oxygen', 'o']],
  'pred_triples': [('11282', 5, 7), ('10935', 2, 3)],
  'tokens': [2040, 3603, 1051, 1999, 1996, 15861, 2795, 1029]}]

**Now, run on the full dev set!**

In [7]:
# Load the QAMPARI dev split (same helper call as in the Question 1 section).
qmp_dev = qu.load_dev_data()

In [8]:
# Destination JSONL for the entity-linked dev questions.
# NOTE(review): the filename says "eql" — possibly a typo for "elq"; left
# unchanged because the recorded run reports this file as already existing.
qmp_qs_outfile_dev = "/scratch/ddr8143/multiqa/qampari_data/eql_default_tagging_v0_qmp_dev.jsonl"

In [12]:
# Entity-link the dev questions and dump the results to `qmp_qs_outfile_dev`,
# using 'qid' as the id field and 'question_text' as the text field.
# Caveat: this logic was extracted into a util with small changes and has not
# been thoroughly tested, so results might be slightly off.
# NOTE(review): chunk_size presumably sets how many items are handled per
# dump — confirm in multiqa_utils.entity_linking_utils.
elu.elq_tag_data_and_dump(
    config_namespace=config_namespace,
    models=models,
    dlist=qmp_dev,
    outfile=qmp_qs_outfile_dev,
    id_key='qid',
    text_key='question_text',
    chunk_size=1000,
)

Entity link 1,000 items.
>> File already exists to loaded: 1,000 and remaining to extract: 0
>> All data already entity linked.


In [None]:
# Scratch arithmetic left over from exploration (evaluates to 8.0).
# NOTE(review): what 125 refers to is not recorded here — confirm or remove.
1000/125

In [3]:
# Load the QAMPARI train split via the project helper.
qmp_train = qu.load_train_data()

In [4]:
# Destination JSONL for the entity-linked train questions.
# NOTE(review): the filename says "eql" — possibly a typo for "elq"; left
# unchanged because the recorded run already wrote results to this path.
qmp_qs_outfile_train = "/scratch/ddr8143/multiqa/qampari_data/eql_default_tagging_v0_qmp_train.jsonl"

In [6]:
# Entity-link the train questions and dump the results to
# `qmp_qs_outfile_train`, using 'qid' as the id field and 'question_text' as
# the text field. The recorded output shows a rerun that found 61,000 items
# already in the outfile and only processed the remaining 911.
# Caveat: this logic was extracted into a util with small changes and has not
# been thoroughly tested, so results might be slightly off.
# NOTE(review): chunk_size presumably sets how many items are handled per
# dump — confirm in multiqa_utils.entity_linking_utils.
elu.elq_tag_data_and_dump(
    config_namespace=config_namespace,
    models=models,
    dlist=qmp_train,
    outfile=qmp_qs_outfile_train,
    id_key='qid',
    text_key='question_text',
    chunk_size=1000,
)

Entity link 61,911 items.
>> File already exists to loaded: 61,000 and remaining to extract: 911


100%|██████████| 114/114 [12:13<00:00,  6.20s/it]


*--------*
>> Wrote all entity links to: /scratch/ddr8143/multiqa/qampari_data/eql_default_tagging_v0_qmp_train.jsonl
