Copyright (c) 2023 Graphcore Ltd. All rights reserved.

# Hetionet

## Triple pre-processing

Import dependencies

In [2]:
import urllib.request
from pathlib import Path
import pickle

import numpy as np
import pandas as pd
import torch

Download dataset from source

In [3]:
data_path = Path("../data/hetionet/")
data_path.mkdir(parents=True, exist_ok=True)

# Local file name -> remote source, for every raw Hetionet file used below.
hetionet_sources = {
    # Edges (triples)
    "edges.sif.gz": "https://github.com/hetio/hetionet/raw/master/hetnet/tsv/hetionet-v1.0-edges.sif.gz",
    # Nodes (entity metadata)
    "nodes.tsv": "https://github.com/hetio/hetionet/raw/master/hetnet/tsv/hetionet-v1.0-nodes.tsv",
    # Metaedges (relation metadata)
    "metaedges.tsv": "https://github.com/hetio/hetionet/raw/master/describe/edges/metaedges.tsv",
}

for file_name, url in hetionet_sources.items():
    target = data_path.joinpath(file_name)
    # Files already on disk are reused so that re-running the notebook does not
    # download everything again; delete a file to force a fresh download.
    if target.exists():
        continue
    # Download under a temporary name and rename once the transfer finished, so
    # an interrupted download is never mistaken for a complete file.
    partial = target.with_name(file_name + ".part")
    urllib.request.urlretrieve(url, filename=partial)
    partial.replace(target)

In [4]:
# Load the raw edge list from the gzipped, tab-separated file downloaded above.
edges_file = data_path / "edges.sif.gz"
df_triples = pd.read_csv(edges_file, sep="\t")
df_triples

Unnamed: 0,source,metaedge,target
0,Gene::9021,GpBP,Biological Process::GO:0071357
1,Gene::51676,GpBP,Biological Process::GO:0098780
2,Gene::19,GpBP,Biological Process::GO:0055088
3,Gene::3176,GpBP,Biological Process::GO:0010243
4,Gene::3039,GpBP,Biological Process::GO:0006898
...,...,...,...
2250192,Anatomy::UBERON:0000057,AeG,Gene::65009
2250193,Anatomy::UBERON:0000474,AeG,Gene::80279
2250194,Anatomy::UBERON:0002048,AeG,Gene::1211
2250195,Anatomy::UBERON:0002048,AeG,Gene::8843


According to the documentation, entity IDs have the form `[entity-type]::ID`. In particular, since IDs are sorted alphabetically, all entities of the same type already have contiguous IDs.

In [5]:
# Load the node metadata from the tab-separated file downloaded above.
nodes_file = data_path / "nodes.tsv"
df_nodes = pd.read_csv(nodes_file, sep="\t")
df_nodes

Unnamed: 0,id,name,kind
0,Anatomy::UBERON:0000002,uterine cervix,Anatomy
1,Anatomy::UBERON:0000004,nose,Anatomy
2,Anatomy::UBERON:0000006,islet of Langerhans,Anatomy
3,Anatomy::UBERON:0000007,pituitary gland,Anatomy
4,Anatomy::UBERON:0000010,peripheral nervous system,Anatomy
...,...,...,...
47026,Symptom::D064250,Hypertriglyceridemic Waist,Symptom
47027,Symptom::D065634,Cerebrospinal Fluid Leak,Symptom
47028,Symptom::D065635,Benign Paroxysmal Positional Vertigo,Symptom
47029,Symptom::D065906,Hyperlactatemia,Symptom


However, not all 47031 entities actually appear in the triples.

In [6]:
# Stack all head labels followed by all tail labels into a single Series.
ht_nodes = pd.concat([df_triples[column] for column in ("source", "target")])
# `node_used`: sorted unique labels; `node_ids`: for each entry of `ht_nodes`,
# the position of its label inside `node_used`.
node_used, node_ids = np.unique(ht_nodes.values, return_inverse=True)
print(f"Unique nodes appearing in triples: {node_used.shape[0]}")

Unique nodes appearing in triples: 45158


We therefore discard unused entities when defining the final entity -> ID mapping.

In [7]:
# One row per distinct node label found in the triples, ordered by label.
nodes_in_triples = pd.DataFrame(ht_nodes).groupby(0).first().reset_index()
# Attach the metadata of each used node; the left join keeps exactly the labels
# that occur in the triples.
df_nodes_used = nodes_in_triples.merge(
    df_nodes,
    how="left",
    left_on=0,
    right_on="id",
)[["id", "name", "kind"]]
# Row i must describe the entity that received integer ID i from np.unique.
assert np.all(df_nodes_used["id"].values == node_used)
df_nodes_used

Unnamed: 0,id,name,kind
0,Anatomy::UBERON:0000002,uterine cervix,Anatomy
1,Anatomy::UBERON:0000004,nose,Anatomy
2,Anatomy::UBERON:0000006,islet of Langerhans,Anatomy
3,Anatomy::UBERON:0000007,pituitary gland,Anatomy
4,Anatomy::UBERON:0000010,peripheral nervous system,Anatomy
...,...,...,...
45153,Symptom::D063766,Pediatric Obesity,Symptom
45154,Symptom::D063806,Myalgia,Symptom
45155,Symptom::D064250,Hypertriglyceridemic Waist,Symptom
45156,Symptom::D065634,Cerebrospinal Fluid Leak,Symptom


Let's create the entity dictionary by concatenating the ID (which contains the entity type label) and the explicit name for the entity.

In [8]:
# Human-readable label for every entity: "<id> (<name>)", in entity-ID order.
entity_id_col = df_nodes_used["id"]
entity_name_col = df_nodes_used["name"]
entity_labels = entity_id_col + " (" + entity_name_col + ")"
ent_dict = entity_labels.values.tolist()
ent_dict[:20]

['Anatomy::UBERON:0000002 (uterine cervix)',
 'Anatomy::UBERON:0000004 (nose)',
 'Anatomy::UBERON:0000006 (islet of Langerhans)',
 'Anatomy::UBERON:0000007 (pituitary gland)',
 'Anatomy::UBERON:0000010 (peripheral nervous system)',
 'Anatomy::UBERON:0000011 (parasympathetic nervous system)',
 'Anatomy::UBERON:0000013 (sympathetic nervous system)',
 'Anatomy::UBERON:0000020 (sense organ)',
 'Anatomy::UBERON:0000026 (appendage)',
 'Anatomy::UBERON:0000029 (lymph node)',
 'Anatomy::UBERON:0000033 (head)',
 'Anatomy::UBERON:0000038 (follicular fluid)',
 'Anatomy::UBERON:0000042 (serous membrane)',
 'Anatomy::UBERON:0000043 (tendon)',
 'Anatomy::UBERON:0000045 (ganglion)',
 'Anatomy::UBERON:0000053 (macula lutea)',
 'Anatomy::UBERON:0000054 (macula)',
 'Anatomy::UBERON:0000056 (ureter)',
 'Anatomy::UBERON:0000057 (urethra)',
 'Anatomy::UBERON:0000165 (mouth)']

In [9]:
# Number of used entities of each kind; groupby returns the kinds in sorted order.
kind_counts = df_nodes_used.groupby("kind")["kind"].count()
# The offsets below are only valid if the rows of `df_nodes_used` (and hence the
# entity IDs) are grouped by kind, in that same sorted order. Check it explicitly
# rather than relying on the ordering of the ID strings.
assert df_nodes_used[
    "kind"
].is_monotonic_increasing, "Entities are not grouped by kind in sorted order"
# Exclusive cumulative sum: the offset of a kind is the number of entities of
# all preceding kinds, so the first kind starts at 0.
type_offset = (kind_counts.cumsum() - kind_counts).astype("int64").to_dict()
type_offset

{'Anatomy': 0,
 'Biological Process': 400,
 'Cellular Component': 11781,
 'Compound': 13172,
 'Disease': 14710,
 'Gene': 14846,
 'Molecular Function': 33991,
 'Pathway': 36875,
 'Pharmacologic Class': 38697,
 'Side Effect': 39042,
 'Symptom': 44743}

Let's now extract relations.

In [10]:
# `rel_dict`: sorted unique relation abbreviations; `rel_id`: for every triple,
# the position of its abbreviation inside `rel_dict`.
metaedge_labels = df_triples["metaedge"].values
rel_dict, rel_id = np.unique(metaedge_labels, return_inverse=True)
rel_dict

array(['AdG', 'AeG', 'AuG', 'CbG', 'CcSE', 'CdG', 'CpD', 'CrC', 'CtD',
       'CuG', 'DaG', 'DdG', 'DlA', 'DpS', 'DrD', 'DuG', 'GcG', 'GiG',
       'GpBP', 'GpCC', 'GpMF', 'GpPW', 'Gr>G', 'PCiC'], dtype=object)

The meaning of these labels, together with metrics on relation types, can be found in the `metaedges.tsv` source file.

In [11]:
# Relation metadata, extended with each relation's share of all edges (percent,
# rounded to two decimals).
df_metaedge = pd.read_csv(data_path / "metaedges.tsv", sep="\t")
edge_counts = df_metaedge["edges"]
df_metaedge["edges_%"] = (100 * edge_counts / edge_counts.sum()).round(2)
df_metaedge

Unnamed: 0,metaedge,abbreviation,edges,source_nodes,target_nodes,unbiased,edges_%
0,Anatomy - downregulates - Gene,AdG,102240,36,15097,102240,4.54
1,Anatomy - expresses - Gene,AeG,526407,241,18094,453477,23.39
2,Anatomy - upregulates - Gene,AuG,97848,36,15929,97848,4.35
3,Compound - binds - Gene,CbG,11571,1389,1689,0,0.51
4,Compound - causes - Side Effect,CcSE,138944,1071,5701,0,6.17
5,Compound - downregulates - Gene,CdG,21102,734,2880,21102,0.94
6,Compound - palliates - Disease,CpD,390,221,50,0,0.02
7,Compound - resembles - Compound,CrC,6486,1042,1054,6486,0.29
8,Compound - treats - Disease,CtD,755,387,77,0,0.03
9,Compound - upregulates - Gene,CuG,18756,703,3247,18756,0.83


In [12]:
# `rel_id` indexes the sorted unique abbreviations found in the triples, so the
# descriptive labels must follow exactly that order. Align the metadata rows by
# abbreviation instead of relying on the row order of `metaedges.tsv`.
rel_abbreviations = np.unique(df_triples["metaedge"].values)
metaedge_names = df_metaedge.set_index("abbreviation")["metaedge"]
assert metaedge_names.index.is_unique, "Duplicate abbreviations in metaedges.tsv"
relations_without_metadata = set(rel_abbreviations) - set(metaedge_names.index)
assert (
    not relations_without_metadata
), f"Relations without metadata: {sorted(relations_without_metadata)}"
# Label format: "<abbreviation> (<full metaedge name>)".
rel_dict = [f"{abbr} ({metaedge_names[abbr]})" for abbr in rel_abbreviations]
rel_dict

['AdG (Anatomy - downregulates - Gene)',
 'AeG (Anatomy - expresses - Gene)',
 'AuG (Anatomy - upregulates - Gene)',
 'CbG (Compound - binds - Gene)',
 'CcSE (Compound - causes - Side Effect)',
 'CdG (Compound - downregulates - Gene)',
 'CpD (Compound - palliates - Disease)',
 'CrC (Compound - resembles - Compound)',
 'CtD (Compound - treats - Disease)',
 'CuG (Compound - upregulates - Gene)',
 'DaG (Disease - associates - Gene)',
 'DdG (Disease - downregulates - Gene)',
 'DlA (Disease - localizes - Anatomy)',
 'DpS (Disease - presents - Symptom)',
 'DrD (Disease - resembles - Disease)',
 'DuG (Disease - upregulates - Gene)',
 'GcG (Gene - covaries - Gene)',
 'GiG (Gene - interacts - Gene)',
 'GpBP (Gene - participates - Biological Process)',
 'GpCC (Gene - participates - Cellular Component)',
 'GpMF (Gene - participates - Molecular Function)',
 'GpPW (Gene - participates - Pathway)',
 'Gr>G (Gene > regulates > Gene)',
 'PCiC (Pharmacologic Class - includes - Compound)']

We can now finalize the array of triples.

In [13]:
# `node_ids` was computed on heads followed by tails, so its first half holds
# the head IDs and its second half the tail IDs.
n_triples = len(df_triples)
head_id, tail_id = node_ids[:n_triples], node_ids[n_triples:]

# One row per triple: (head ID, relation ID, tail ID).
triples = np.stack((head_id, rel_id, tail_id), axis=1)
triples, triples.shape

(array([[32771,    18,  9084],
        [25549,    18, 10082],
        [18772,    18,  7987],
        ...,
        [  279,     1, 16756],
        [  279,     1, 32574],
        [  311,     1, 19162]]),
 (2250197, 3))

Sanity check:

In [14]:
triple_id = 4321

# Decode one integer triple back into its entity / relation labels.
head_idx, rel_idx, tail_idx = triples[triple_id]
ent_dict[head_idx], rel_dict[rel_idx], ent_dict[tail_idx]

('Gene::80270 (HSD3B7)',
 'GpBP (Gene - participates - Biological Process)',
 'Biological Process::GO:0032787 (monocarboxylic acid metabolic process)')

which correctly coincides with

In [15]:
# Same row in the raw edge table, for comparison with the decoded triple.
df_triples.iloc[triple_id, :]

source                         Gene::80270
metaedge                              GpBP
target      Biological Process::GO:0032787
Name: 4321, dtype: object

## Save

Save triples and dictionaries.

In [24]:
# The triples array is written with torch.save; the dictionaries with pickle.
torch.save(triples, data_path / "triples.pt")

pickled_outputs = {
    "entity_dict.pkl": ent_dict,
    "relation_dict.pkl": rel_dict,
    "type_offset.pkl": type_offset,
}
for file_name, payload in pickled_outputs.items():
    with open(data_path / file_name, "wb") as f:
        pickle.dump(payload, f)