# Fetching evidence

As is the case in the original RoMQA experiments, we will create three versions of the data with different means of retrieving evidence.
Here, the text corpus is taken to be the entire set of TREX documents.
We will retrieve using three methods:

1. We'll find gold evidence from the database.
2. For the closed setting: we'll run BM25 document retrieval for each candidate entity, followed by DPR sentence retrieval using the question.
3. For the open setting: we'll run BM25 document retrieval for the question, followed by DPR sentence retrieval again using the question.

## Gold evidence retrieval

In [1]:
import db_utils as D
import bz2
import ujson as json
import sqlite3
import os
from tqdm.auto import tqdm

# Path to the SQLite database that holds the annotation tables queried in this notebook.
fdb = 'annotations/data.db'
# isolation_level=None opens the connection in autocommit mode (no implicit transactions).
db = sqlite3.connect(fdb, isolation_level=None)

In [2]:
# Read the dev split into memory as parsed JSON.
with open('dev.json', 'rt') as f:
    dev = json.loads(f.read())

In [10]:
# Lookup tables from URI to integer database id, one for entities and one
# for properties. Each query yields (uri, id) rows, which dict.update
# consumes directly as key/value pairs.
ent_uri2id, prop_uri2id = {}, {}
ent_uri2id.update(db.execute('SELECT uri, id FROM ents'))
prop_uri2id.update(db.execute('SELECT uri, id FROM props'))

In [17]:
# Map every document id to its full text; the (id, text) rows returned by
# the query are used directly as key/value pairs.
docs_id2text = dict(db.execute('SELECT id, text FROM docs'))

In [28]:
import functools


@functools.lru_cache(maxsize=1024*1024)
def retrieve_evidence_for_fact(ent, other_ent, prop, prop_dir):
    """Return the evidence spans stored for one (subject, property, object) fact.

    Args:
        ent: URI of the entity of interest (must be a key of ``ent_uri2id``).
        other_ent: URI of the other entity in the fact (key of ``ent_uri2id``).
        prop: URI of the property (key of ``prop_uri2id``).
        prop_dir: ``'subj'`` if ``ent`` is the subject of the fact, ``'obj'``
            if ``ent`` is the object.

    Returns:
        A list of dicts with keys ``doc_id``, ``start``, ``end`` and ``text``,
        where ``text`` is the slice ``[start:end]`` of the document text.
        Spans are de-duplicated on their text; when several spans share the
        same text the last one read from the database is kept.

    Raises:
        NotImplementedError: if ``prop_dir`` is neither ``'subj'`` nor ``'obj'``.
        KeyError: if a URI is missing from the lookup tables.

    Note:
        Results are memoised with ``lru_cache``, so repeated calls with the
        same arguments return the very same list object; callers should
        treat it as read-only.
    """
    if prop_dir not in ('subj', 'obj'):
        raise NotImplementedError
    subj, obj = (ent, other_ent) if prop_dir == 'subj' else (other_ent, ent)

    params = (ent_uri2id[subj], ent_uri2id[obj], prop_uri2id[prop])
    rows = db.execute('SELECT E.doc_id, E.start, E.end FROM evidence E, trips T WHERE T.id=E.trip_id AND T.subj_id=? AND T.obj_id=? AND T.prop_id=?', params)

    # Keyed by snippet text so that identical snippets collapse to one entry.
    spans_by_text = {}
    for doc_id, start, end in rows:
        snippet = docs_id2text[doc_id][start:end]
        spans_by_text[snippet] = dict(doc_id=doc_id, start=start, end=end)

    return [dict(span, text=snippet) for snippet, span in spans_by_text.items()]
    

def retrieve_evidence_for_candidate(candidate, constraints):
    """Gather the gold evidence for ``candidate`` across all ``constraints``.

    Args:
        candidate: dict with at least a ``'uri'`` key.
        constraints: iterable of dicts, each providing ``other_ent['uri']``,
            ``prop['uri']`` and ``prop_dir``.

    Returns:
        A flat list concatenating, in constraint order, the evidence dicts
        returned by ``retrieve_evidence_for_fact`` for each constraint.
    """
    return [
        span
        for constraint in constraints
        for span in retrieve_evidence_for_fact(
            candidate['uri'],
            constraint['other_ent']['uri'],
            constraint['prop']['uri'],
            constraint['prop_dir'],
        )
    ]
    
    
# Sanity check on the first dev example: show its question and first
# candidate, then display the gold evidence retrieved for that candidate.
first_example = dev[0]
print(first_example['question'])
first_candidate = first_example['candidates'][0]
print(first_candidate)
retrieve_evidence_for_candidate(first_candidate, first_example['constraints'])

Member of Dundee United F.C. born in Paisley. Not a citizen of Scotland
{'uri': 'http://www.wikidata.org/entity/Q4545973', 'is_answer': False, 'text': 'George Gordon, Lord Haddo', 'aliases': [], 'desc': 'Scottish Freemason and the eldest son of George Gordon, 3rd Earl of Aberdeen'}


[{'doc_id': 2015609,
  'start': 0,
  'end': 144,
  'text': 'George Gordon, Lord Haddo (28 January 1764 – 2 October 1791) was a Scottish Freemason and the eldest son of George Gordon, 3rd Earl of Aberdeen.'}]

In [29]:
!mkdir -p evidence/gold

In [31]:
import copy


# Build a copy of the dev split in which every candidate carries its gold
# evidence, then store it as bz2-compressed JSON.
dev_mapped = []
for ex in tqdm(dev):
    # Work on a deep copy so the examples held in `dev` stay unmodified.
    ex = copy.deepcopy(ex)
    for c in ex['candidates']:
        c.update(evidence=retrieve_evidence_for_candidate(c, ex['constraints']))
    dev_mapped.append(ex)

with bz2.open('evidence/gold/dev.json.bz2', 'wt') as f:
    f.write(json.dumps(dev_mapped))

  0%|          | 0/7068 [00:00<?, ?it/s]

In [34]:
# Load the test split (distributed without answers), attach gold evidence to
# every candidate, and store the result as bz2-compressed JSON.
with open('test.noanswer.json', 'rt') as f:
    test = json.loads(f.read())

test_mapped = []
for ex in tqdm(test):
    # Work on a deep copy so the examples held in `test` stay unmodified.
    ex = copy.deepcopy(ex)
    for c in ex['candidates']:
        c.update(evidence=retrieve_evidence_for_candidate(c, ex['constraints']))
    test_mapped.append(ex)

with bz2.open('evidence/gold/test.noanswer.json.bz2', 'wt') as f:
    f.write(json.dumps(test_mapped))

  0%|          | 0/10649 [00:00<?, ?it/s]

In [35]:
# Load the train split, attach gold evidence to every candidate, and store
# the result as bz2-compressed JSON.
with open('train.json', 'rt') as f:
    train = json.loads(f.read())

train_mapped = []
for ex in tqdm(train):
    # Work on a deep copy so the examples held in `train` stay unmodified.
    ex = copy.deepcopy(ex)
    for c in ex['candidates']:
        c.update(evidence=retrieve_evidence_for_candidate(c, ex['constraints']))
    train_mapped.append(ex)

with bz2.open('evidence/gold/train.json.bz2', 'wt') as f:
    f.write(json.dumps(train_mapped))

  0%|          | 0/11260 [00:00<?, ?it/s]

In [37]:
# Display the first five candidates of the first dev example, including their attached evidence.
dev_mapped[0]['candidates'][:5]

[{'uri': 'http://www.wikidata.org/entity/Q4545973',
  'is_answer': False,
  'text': 'George Gordon, Lord Haddo',
  'aliases': [],
  'desc': 'Scottish Freemason and the eldest son of George Gordon, 3rd Earl of Aberdeen',
  'evidence': [{'doc_id': 2015609,
    'start': 0,
    'end': 144,
    'text': 'George Gordon, Lord Haddo (28 January 1764 – 2 October 1791) was a Scottish Freemason and the eldest son of George Gordon, 3rd Earl of Aberdeen.'}]},
 {'uri': 'http://www.wikidata.org/entity/Q3607690',
  'is_answer': False,
  'text': 'Alan Combe',
  'aliases': [],
  'desc': 'Scottish footballer',
  'evidence': [{'doc_id': 1773034,
    'start': 89,
    'end': 193,
    'text': 'Born in Edinburgh, Combe played for Cowdenbeath, St Mirren, Dundee United, Bradford City and Kilmarnock.'}]},
 {'uri': 'http://www.wikidata.org/entity/Q6223506',
  'is_answer': False,
  'text': 'John Brown',
  'aliases': ['John Thomas Brown'],
  'desc': 'Scottish footballer (1935-2000)',
  'evidence': [{'doc_id': 320681

## BM25 + DPR Retrieval

First, build the BM25 index.

In [42]:
from rank_bm25 import BM25Okapi
import string
import pickle
from stop_words import STOP_WORDS


def bm25_tokenizer(text):
    """Tokenize ``text`` for BM25 indexing and querying.

    The text is lower-cased and split on whitespace; each piece then has
    leading and trailing ASCII punctuation stripped. Pieces that end up
    empty, or that appear in ``STOP_WORDS``, are dropped.

    Returns:
        The list of remaining tokens, in their original order.
    """
    stripped = (piece.strip(string.punctuation) for piece in text.lower().split())
    return [word for word in stripped if word and word not in STOP_WORDS]


# Build the BM25 index over the whole corpus, or reuse a previously saved one.
fbm25 = 'evidence/bm25_index.pkl'
# Sort documents by id so the corpus order is deterministic; the same ordered
# list is saved next to the index so a corpus position can be mapped back to
# a document.
sorted_docs = sorted(docs_id2text.items(), key=lambda tup: tup[0])


if not os.path.isfile(fbm25):
    tokenized_corpus = [
        bm25_tokenizer(text)
        for _, text in tqdm(sorted_docs, 'tokenizing docs')
    ]
    print('building bm25')
    bm25 = BM25Okapi(tokenized_corpus)
    print('saving bm25')
    with open(fbm25, 'wb') as f:
        pickle.dump(bm25, f)
    with bz2.open('evidence/sorted_docs.json.bz2', 'wt') as f:
        json.dump(sorted_docs, f)
else:
    # Load the cached index so that `bm25` is defined for the cells below
    # even when the index file already exists.
    # NOTE: pickle.load executes arbitrary code; only load an index file
    # that this notebook wrote itself.
    print('loading bm25')
    with open(fbm25, 'rb') as f:
        bm25 = pickle.load(f)

tokenizing docs:   0%|          | 0/3348807 [00:00<?, ?it/s]

building bm25
saving bm25


The remaining steps are fairly computationally involved, so they are split into standalone Python scripts.

Find top k documents for each entity

In [None]:
!python bm25_retrieve.py

In [48]:
import numpy as np


# Number of documents to keep per entity.
top_k = 5


# Collect the surface text of every candidate entity across all three splits.
all_entities = {
    c['text']
    for ex in train_mapped + dev_mapped + test_mapped
    for c in ex['candidates']
}


# Sort for a deterministic processing order.
all_entities = sorted(all_entities)
ent2evidence = {}
for ent in tqdm(all_entities):
    scores = bm25.get_scores(bm25_tokenizer(ent))
    # argpartition leaves the indices of the top_k largest scores in the last
    # top_k slots (in no particular order), avoiding a full sort.
    top_k_inds = np.argpartition(scores, -top_k)[-top_k:]
    # Cast numpy scalars to builtin types: np.int64 is not a Python int and
    # is not reliably JSON-serializable.
    ent2evidence[ent] = [
        dict(score=float(scores[i]), doc_index=int(i))
        for i in top_k_inds
    ]

with bz2.open('evidence/bm25_ent2evidence.json.bz2', 'wt') as f:
    json.dump(ent2evidence, f)

  0%|          | 0/605175 [00:00<?, ?it/s]


KeyboardInterrupt



Find top k documents for each question

In [None]:
# Number of documents to keep per question.
top_k = 5


# Collect every distinct question across all three splits.
all_questions = {
    ex['question']
    for ex in train_mapped + dev_mapped + test_mapped
}


# Sort for a deterministic processing order.
all_questions = sorted(all_questions)
question2evidence = {}
for question in tqdm(all_questions):
    scores = bm25.get_scores(bm25_tokenizer(question))
    # Partition the scores computed just above (the name `bm25_scores` is not
    # defined anywhere). argpartition leaves the indices of the top_k largest
    # scores in the last top_k slots, in no particular order.
    top_k_inds = np.argpartition(scores, -top_k)[-top_k:]
    # Cast numpy scalars to builtin types: np.int64 is not a Python int and
    # is not reliably JSON-serializable.
    question2evidence[question] = [
        dict(score=float(scores[i]), doc_index=int(i))
        for i in top_k_inds
    ]

with bz2.open('evidence/bm25_question2evidence.json.bz2', 'wt') as f:
    json.dump(question2evidence, f)

For the closed setting, for each question and each candidate, find the top k sentences using DPR. This is fairly involved, so we'll do it in a standalone Python script.

In [None]:
!python dpr_retrieve.py

For the open setting, for each question, find the top k sentences using DPR.

In [None]:
!python dpr_retrieve_open.py