In [None]:
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.

# Build entity mapping database

_Note: this notebook is included to document the process and so will not run out of the box._

In order to provide an interface for querying the model, we build a full text searchable mapping from string to entity and relation ID. To do this, we:
 - Download a full list of entity and relation labels from Wikidata.
 - Use OGB's `data/ogbl_wikikg2/mapping/` metadata to filter entities of interest, and map them to contiguous OGB dataset IDs.
 - Build a SQLite database with FTS3 indices for efficient local retrieval.
 
This notebook also contains the command to build a faster-loading `.npz` file containing the ogbl-wikikg2 dataset.

In [None]:
import bz2
import json
import tqdm
import itertools as it
from pathlib import Path
import csv
import gzip
import sys

import kge_mapping
import kge_training

In [None]:
%%time

# Source:
# https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2

# Maps each entity's "id" field to its English label ("" when the entity has
# no "en" label).
labels = {}
path = Path("/localdata/research/scratch/douglaso/latest-all.json.bz2")
# Size of the compressed dump on disk. It does not change while we read, so
# look it up once instead of calling stat() on every progress update.
compressed_size = path.stat().st_size

# The dump is read as one JSON array with one element per line: an opening
# "[" line, then one JSON object per line (each followed by a comma), then a
# closing "]" line. Streaming line by line avoids parsing the whole array at
# once. Both handles are context-managed so they are closed even if parsing
# fails part-way through; BZ2File does not close a file object it was given.
with open(path, "rb") as f0, bz2.BZ2File(f0) as f:
    f.readline()  # opening "["
    tq = tqdm.tqdm(it.count())
    for n in tq:
        # Drop the trailing newline and the "," separating array elements.
        line = f.readline().decode().rstrip("\n ,")
        if line == "]":
            break
        e = json.loads(line)
        labels[e["id"]] = e["labels"].get("en", dict(value=""))["value"]
        if n % int(1e3) == 0:
            # f0 is the underlying compressed file, so tell() reports progress
            # in compressed bytes (approximate, as BZ2File reads ahead).
            compressed_pos = f0.tell()
            tq.set_description(f"{compressed_pos / compressed_size:.0%}, {compressed_pos / 2**30:.1f} GiB")

In [None]:
%%time

# Source:
# https://hay.toolforge.org/propbrowse/props.json

# Load the property list and index it as: property "id" -> property "label".
props_path = Path("/localdata/research/scratch/douglaso/props.json")
with props_path.open() as f:
    props = dict((entry["id"], entry["label"]) for entry in json.load(f))

In [None]:
%%time

def _mapping_records(csv_path, kind, idx_column, id_column, label_lookup):
    """Read one gzipped OGB mapping CSV and return a list of mapping records.

    Each record is a dict with keys `type` (the given `kind`), `idx` (the
    integer from `idx_column`), `wikidata_id` (the value of `id_column`) and
    `wikidata_label` (looked up in `label_lookup`, "" when absent).
    """
    with gzip.open(csv_path, "rt") as stream:
        return [
            {
                "type": kind,
                "idx": int(row[idx_column]),
                "wikidata_id": row[id_column],
                "wikidata_label": label_lookup.get(row[id_column], ""),
            }
            for row in csv.DictReader(stream)
        ]


def _missing_label_fraction(all_records, kind):
    """Return the fraction of records of the given `kind` with an empty label."""
    of_kind = [r for r in all_records if r["type"] == kind]
    n_missing = sum(not r["wikidata_label"] for r in of_kind)
    return n_missing / len(of_kind)


# Entities first, then relations, so the output file keeps that order.
records = []
records.extend(_mapping_records(
    "data/ogbl_wikikg2/mapping/nodeidx2entityid.csv.gz", "entity", "node idx", "entity id", labels
))
records.extend(_mapping_records(
    "data/ogbl_wikikg2/mapping/reltype2relid.csv.gz", "relation", "reltype", "rel id", props
))

# Report how many OGB IDs could not be matched to a label.
print(f"Missing entity labels: {_missing_label_fraction(records, 'entity'):.1%}")
print(f"Missing relation labels: {_missing_label_fraction(records, 'relation'):.1%}")

# Write the combined mapping as gzipped JSON Lines (one record per line).
with gzip.open("data/ogbl_wikikg2_mapping.jsonl.gz", "wt") as f:
    f.writelines(json.dumps(record) + "\n" for record in records)

In [None]:
%%time

# Build the mapping database file from the JSONL mapping written above.
# NOTE(review): the exact semantics of RawData.load / Database.build live in
# kge_mapping and are not visible here; the second argument to load is
# presumably the dataset root directory (confirm).
sqlite_path = Path("data/ogbl_wikikg2_mapping.sqlite")
raw_mapping = kge_mapping.RawData.load(
    Path("data/ogbl_wikikg2_mapping.jsonl.gz"),
    Path("data"),
)
kge_mapping.Database.build(sqlite_path, raw_mapping)

In [None]:
%%time

# Build the faster-loading `.npz` copy of ogbl-wikikg2.
# NOTE(review): the first argument is presumably the directory holding the
# raw OGB data and the second the output file; confirm against kge_training.
data_root = Path("data")
npz_path = Path("data/ogbl_wikikg2.npz")
kge_training.Dataset.build_wikikg2(data_root, npz_path, seed=1000)