Copyright (c) 2023 Graphcore Ltd. All rights reserved.

# OpenBioLink2020 HQ

## Triple pre-processing

Install dependencies

In [1]:
!pip install -q wheel
!pip install -q git+https://github.com/graphcore-research/bess-kge.git

In [2]:
import pickle
import zipfile
import requests
from io import BytesIO
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from besskge.dataset import KGDataset

Download the dataset archive (`HQ_DIR.zip`) from Zenodo and extract it into the local data directory.

In [3]:
# Local directory that receives the extracted OpenBioLink files.
data_path = Path("../data/openbiolink/")
data_path.mkdir(parents=True, exist_ok=True)

# Fetch the HQ_DIR archive and unpack it directly from memory.
res = requests.get(url="https://zenodo.org/record/3834052/files/HQ_DIR.zip")
# Fail loudly on an HTTP error instead of handing an error page to ZipFile.
res.raise_for_status()
with zipfile.ZipFile(BytesIO(res.content)) as zip_f:
    zip_f.extractall(path=data_path)

In [8]:
column_names = ["h_label", "r_label", "t_label", "quality", "TP/TN", "source"]


def _read_split(relative_csv):
    """Load one headerless, tab-separated split file located under ``data_path``.

    The resulting columns are labelled with ``column_names``.
    """
    return pd.read_csv(
        data_path.joinpath(relative_csv),
        sep="\t",
        names=column_names,
        header=None,
    )


train_triples = _read_split("HQ_DIR/train_test_data/train_sample.csv")
valid_triples = _read_split("HQ_DIR/train_test_data/val_sample.csv")
test_triples = _read_split("HQ_DIR/train_test_data/test_sample.csv")

In [4]:
# Group the three split dataframes under the keys "train", "valid" and "test".
df_triples = dict(train=train_triples, valid=valid_triples, test=test_triples)
df_triples["train"]

Unnamed: 0,h_label,r_label,t_label,quality,TP/TN,source
0,NCBIGENE:11200,GENE_PHENOTYPE,HP:0009919,,1,HPO
1,NCBIGENE:2649,GENE_EXPRESSED_ANATOMY,UBERON:0000059,gold quality,1,Bgee
2,NCBIGENE:534,GENE_EXPRESSED_ANATOMY,UBERON:0000467,gold quality,1,Bgee
3,NCBIGENE:2036,GENE_BINDING_GENE,NCBIGENE:5295,900,1,STRING
4,NCBIGENE:51195,GENE_UNDEREXPRESSED_ANATOMY,CL:0000738,high quality,1,Bgee
...,...,...,...,...,...,...
4191997,NCBIGENE:6701,GENE_PATHWAY,REACTOME:R-HSA-6809371,,1,CDT
4191998,NCBIGENE:3118,GENE_CATALYSIS_GENE,NCBIGENE:3119,926,1,STRING
4191999,NCBIGENE:1269,GENE_REACTION_GENE,NCBIGENE:6376,900,1,STRING
4192000,NCBIGENE:1785,GENE_EXPRESSED_ANATOMY,UBERON:0003729,gold quality,1,Bgee


In [5]:
# Distinct relation labels that occur in the training split.
train_relations = df_triples["train"]["r_label"]
train_relations.unique()

array(['GENE_PHENOTYPE', 'GENE_EXPRESSED_ANATOMY', 'GENE_BINDING_GENE',
       'GENE_UNDEREXPRESSED_ANATOMY', 'GENE_GENE', 'GENE_REACTION_GENE',
       'DRUG_REACTION_GENE', 'GENE_GO', 'GENE_PATHWAY',
       'GENE_OVEREXPRESSED_ANATOMY', 'GENE_DRUG', 'DRUG_CATALYSIS_GENE',
       'DRUG_BINDING_GENE', 'PART_OF', 'GENE_INHIBITION_GENE',
       'DRUG_INHIBITION_GENE', 'DRUG_PHENOTYPE', 'IS_A',
       'GENE_CATALYSIS_GENE', 'GENE_ACTIVATION_GENE', 'DIS_DRUG',
       'DRUG_ACTIVATION_GENE', 'DIS_PHENOTYPE', 'GENE_PTMOD_GENE',
       'DRUG_BINDINH_GENE', 'GENE_DIS', 'DRUG_BINDACT_GENE',
       'GENE_EXPRESSION_GENE'], dtype=object)

We can use `besskge.dataset.KGDataset.from_dataframe` directly to preprocess the triples and build the `KGDataset`.

In [6]:
# Names of the dataframe columns that hold head, relation and tail labels.
triple_columns = {
    "head_column": "h_label",
    "relation_column": "r_label",
    "tail_column": "t_label",
}
openbiolink = KGDataset.from_dataframe(df_triples, **triple_columns)

# Report the size of the resulting entity and relation vocabularies.
print(f"Number of entities: {openbiolink.n_entity:,}\n")
print(f"Number of relation types: {openbiolink.n_relation_type}\n")

Number of entities: 184,635

Number of relation types: 28



## Save

Save triples and dictionaries.

In [7]:
# Stack every array stored in openbiolink.triples into one array.

split_arrays = list(openbiolink.triples.values())
all_triples = np.concatenate(split_arrays, axis=0)
all_triples.shape

(4563405, 3)

In [16]:
# Lookup tables and offsets exposed by the dataset object.
ent_dict = openbiolink.entity_dict
rel_dict = openbiolink.relation_dict
type_offset = openbiolink.type_offsets

# Sanity check: map the IDs of one arbitrary triple back to their labels.

triple_id = 14500

head_id, relation_id, tail_id = all_triples[triple_id]
(ent_dict[head_id], rel_dict[relation_id], ent_dict[tail_id])

('NCBIGENE:83743', 'GENE_GENE', 'NCBIGENE:6059')

In [20]:
# Look up the sanity-check triple in the training dataframe by its head and
# tail labels.
head_label = ent_dict[all_triples[triple_id, 0]]
tail_label = ent_dict[all_triples[triple_id, 2]]
train_df = df_triples["train"]
part = train_df.loc[train_df["h_label"] == head_label]
part.loc[part["t_label"] == tail_label]

Unnamed: 0,h_label,r_label,t_label,quality,TP/TN,source
14500,NCBIGENE:83743,GENE_GENE,NCBIGENE:6059,702,1,STRING


In [21]:
# Persist the concatenated triples with torch.save.
torch.save(all_triples, data_path.joinpath("triples.pt"))

# Pickle each auxiliary object to its own file under data_path.
pickle_targets = {
    "entity_dict.pkl": ent_dict,
    "relation_dict.pkl": rel_dict,
    "type_offset.pkl": type_offset,
}
for pkl_name, payload in pickle_targets.items():
    with open(data_path.joinpath(pkl_name), "wb") as f:
        pickle.dump(payload, f)