# Linking

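This notebook contains the steps to recreate the RoMQA dataset by linking the released annotations against the RoMQA database: the entity and proposition URIs in each split are replaced with their full database records (text, aliases, and description), and the linked splits are written out as JSON.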

In [1]:
import db_utils as D
import bz2
import ujson as json
import sqlite3
import os
from tqdm.auto import tqdm

In [2]:
# Fetch and decompress the RoMQA SQLite database, but only when the
# decompressed file is not already present (so re-running the cell is cheap).
if not os.path.isfile('annotations/data.db'):
    # -nc: never overwrite an existing archive; -O: save to this exact path.
    !wget -nc https://s3.us-west-1.wasabisys.com/vzhong-public/RoMQA/data.db.bz2 -O annotations/data.db.bz2
    # Decompresses to annotations/data.db (bunzip2 removes the .bz2 by default).
    !bunzip2 annotations/data.db.bz2

In [3]:
# Open the RoMQA database and build an in-memory lookup table for entities.
fdb = 'annotations/data.db'
db = sqlite3.connect(fdb, isolation_level=None)

print('loading entities')
entity_rows = db.execute('SELECT uri, text, aliases, desc FROM ents')
# Keyed by URI; the `aliases` column holds JSON text and is decoded here.
entities = {
    uri: {'uri': uri, 'text': text, 'aliases': json.loads(aliases), 'desc': desc}
    for uri, text, aliases, desc in entity_rows
}

loading entities


In [4]:
# Build the same kind of URI-keyed lookup table for the `props` table.
print('loading propositions')
prop_rows = db.execute('SELECT uri, text, aliases, desc FROM props')
# The `aliases` column holds JSON text and is decoded here.
props = {
    uri: {'uri': uri, 'text': text, 'aliases': json.loads(aliases), 'desc': desc}
    for uri, text, aliases, desc in prop_rows
}

loading propositions


In [5]:
import copy

def load_split(fbzip):
    """Load one bzip2-compressed JSON annotation file.

    Args:
        fbzip: path to a ``.json.bz2`` file.

    Returns:
        The decoded JSON document.
    """
    # JSON text is UTF-8; pin the encoding so decoding does not depend on the
    # platform's locale default (which is not UTF-8 everywhere).
    with bz2.open(fbzip, 'rt', encoding='utf-8') as f:
        return json.load(f)

def map_split(data, map_answer=True):
    """Replace the URIs in every example with records from the lookup tables.

    Each example is deep-copied first, so ``data`` itself is left untouched.
    On the copy:
      * every candidate dict is updated with the ``entities`` record for its
        ``'uri'``;
      * every constraint has ``'prop'`` replaced by the matching ``props``
        record and ``'other_ent'`` replaced by the matching ``entities``
        record;
      * when ``map_answer`` is true, ``'complete_answer'`` (a list of URIs)
        becomes the list of matching ``entities`` records.

    Args:
        data: iterable of example dicts with ``'candidates'`` and
            ``'constraints'`` (and ``'complete_answer'`` when ``map_answer``).
        map_answer: whether to link ``'complete_answer'`` as well.

    Returns:
        A list of linked example dicts, in input order.

    Raises:
        KeyError: if a referenced URI is missing from the lookup tables.
    """
    def link_example(raw_example):
        linked = copy.deepcopy(raw_example)
        for candidate in linked['candidates']:
            candidate.update(entities[candidate['uri']])
        for constraint in linked['constraints']:
            constraint['prop'] = props[constraint['prop']]
            constraint['other_ent'] = entities[constraint['other_ent']]
        if map_answer:
            linked['complete_answer'] = [
                entities[answer_uri] for answer_uri in linked['complete_answer']
            ]
        return linked

    return [link_example(raw_example) for raw_example in tqdm(data)]
        

# Link the dev split (answers included) and write it out as plain JSON.
dev = load_split('annotations/dev.unaligned.json.bz2')
dev_mapped = map_split(dev, map_answer=True)
with open('dev.json', mode='wt') as dev_file:
    json.dump(dev_mapped, dev_file)

  0%|          | 0/7068 [00:00<?, ?it/s]

In [6]:
# Link the test split; this file is loaded without answer linking, so
# `complete_answer` is not touched.
test = load_split('annotations/test.unaligned.noanswer.json.bz2')
test_mapped = map_split(test, map_answer=False)
with open('test.noanswer.json', mode='wt') as test_file:
    json.dump(test_mapped, test_file)

  0%|          | 0/10649 [00:00<?, ?it/s]

In [7]:
# Link the train split (answers included) and write it out as plain JSON.
train = load_split('annotations/train.unaligned.json.bz2')
train_mapped = map_split(train, map_answer=True)
with open('train.json', mode='wt') as train_file:
    json.dump(train_mapped, train_file)

  0%|          | 0/11260 [00:00<?, ?it/s]