Copyright (c) 2023 Graphcore Ltd. All rights reserved.

# PharMeBINet

## Triple pre-processing

Download dataset from source

In [1]:
import pickle
import tarfile
import urllib.request
from pathlib import Path

import numpy as np
import pandas as pd
import torch

In [2]:
data_path = Path("../data/pharmebinet/")
data_path.mkdir(parents=True, exist_ok=True)
filename = data_path.joinpath("edges.tar.gz")

urllib.request.urlretrieve(
    "https://zenodo.org/record/7011027/files/pharmebinet_tsv_2022_08_19_v2.tar.gz",
    filename=filename,
)
with tarfile.open(filename, "r:gz") as tarf:
    tarf.extractall(path=data_path)

In [3]:
# Edges dataframe

df_triples = pd.read_csv(data_path.joinpath("edges.tsv"), sep="\t")
df_triples

Unnamed: 0,relationship_id,type,start_id,end_id,properties,resource,license,source,url
0,0,INVOLVED_IN_PiiBP,192000,304346,"{""date"": [""20210905""], ""with_from"": [""PANTHER:...",['GO'],CC BY 4.0,Gene Ontology,http://purl.obolibrary.org/obo/GO:0034613
1,1,INVOLVED_IN_PiiBP,192000,297825,"{""date"": [""20210623""], ""with_from"": [""PANTHER:...",['GO'],CC BY 4.0,Gene Ontology,http://purl.obolibrary.org/obo/GO:0007165
2,2,INVOLVED_IN_PiiBP,192000,297959,"{""date"": [""20210623""], ""with_from"": [""PANTHER:...",['GO'],CC BY 4.0,Gene Ontology,http://purl.obolibrary.org/obo/GO:0007346
3,3,INVOLVED_IN_PiiBP,192000,317503,"{""date"": [""20200226""], ""go"": ""yes"", ""pubMed_id...",['GO'],CC BY 4.0,Gene Ontology,http://purl.obolibrary.org/obo/GO:1901020
4,4,INVOLVED_IN_PiiBP,192000,295456,"{""date"": [""20211127""], ""with_from"": [""UniProtK...",['GO'],CC BY 4.0,Gene Ontology,http://purl.obolibrary.org/obo/GO:0001764
...,...,...,...,...,...,...,...,...,...
15883648,15884076,UPREGULATES_AuG,2853771,20550,"{""unbiased"": true, ""hetionet"": ""yes""}",['Hetionet'],,Bgee,
15883649,15884077,UPREGULATES_AuG,2853652,15012,"{""unbiased"": true, ""hetionet"": ""yes""}",['Hetionet'],,Bgee,
15883650,15884078,UPREGULATES_AuG,2853552,33011,"{""unbiased"": true, ""hetionet"": ""yes""}",['Hetionet'],,Bgee,
15883651,15884079,UPREGULATES_AuG,2853739,7999,"{""unbiased"": true, ""hetionet"": ""yes""}",['Hetionet'],,Bgee,


In [4]:
# Node dataframe

df_nodes = pd.read_csv(data_path.joinpath("nodes.tsv"), sep="\t")
df_nodes

  df_nodes = pd.read_csv(data_path.joinpath("nodes.tsv"), sep="\t")


Unnamed: 0,node_id,labels,properties,name,identifier,resource,license,source,url
0,0,Gene,"{""chromosome"": ""1"", ""go"": ""yes"", ""xrefs"": [""Ph...",IQ motif containing GTPase activating protein 3,128239,"['CTD', 'ClinVar', 'GO', 'Hetionet', 'NCBI', '...",CC0 1.0,Entrez Gene,http://identifiers.org/ncbigene/128239
1,1,Gene,"{""synonyms"": [""LONP"", ""LONPL"", ""PLON"", ""PSLON""...","lon peptidase 2, peroxisomal",83752,"['CTD', 'ClinVar', 'GO', 'Hetionet', 'NCBI', '...",CC0 1.0,Entrez Gene,http://identifiers.org/ncbigene/83752
2,2,Gene,"{""synonyms"": [""LRP1"", ""Rrp47"", ""SUN-CoR"", ""SUN...",C1D nuclear receptor corepressor,10438,"['CTD', 'ClinVar', 'GO', 'Hetionet', 'NCBI', '...",CC0 1.0,Entrez Gene,http://identifiers.org/ncbigene/10438
3,3,Gene,"{""synonyms"": [""JEAP""], ""chromosome"": ""11"", ""go...",angiomotin like 1,154810,"['CTD', 'ClinVar', 'GO', 'Hetionet', 'NCBI', '...",CC0 1.0,Entrez Gene,http://identifiers.org/ncbigene/154810
4,4,Gene,"{""synonyms"": [""BRNRS"", ""MAO-A""], ""chromosome"":...",monoamine oxidase A,4128,"['CTD', 'ClinVar', 'GO', 'Hetionet', 'NCBI', '...",CC0 1.0,Entrez Gene,http://identifiers.org/ncbigene/4128
...,...,...,...,...,...,...,...,...,...
2869402,2869545,Enzyme|Protein|Target,"{""protein_existence"": ""evidence at protein lev...",Uridine 5'-monophosphate synthase,P11172,"['DrugBank', 'GO', 'IID', 'Reactome', 'UniProt']",CC BY 4.0,UniProt,https://www.uniprot.org/uniprot/P11172
2869403,2869546,Enzyme|Protein|Target,"{""protein_existence"": ""evidence at protein lev...",Uridine phosphorylase 1,Q16831,"['DrugBank', 'GO', 'IID', 'Reactome', 'UniProt']",CC BY 4.0,UniProt,https://www.uniprot.org/uniprot/Q16831
2869404,2869547,Enzyme|Protein|Target,"{""protein_existence"": ""evidence at protein lev...",Urokinase-type plasminogen activator,P00749,"['DrugBank', 'GO', 'IID', 'Reactome', 'UniProt']",CC BY 4.0,UniProt,https://www.uniprot.org/uniprot/P00749
2869405,2869548,Enzyme|Protein|Target,"{""protein_existence"": ""evidence at protein lev...",Vitamin K epoxide reductase complex subunit 1,Q9BQB6,"['DrugBank', 'GO', 'IID', 'Reactome', 'UniProt']",CC BY 4.0,UniProt,https://www.uniprot.org/uniprot/Q9BQB6


Discard entities not appearing in any triple.

In [5]:
ht_nodes = pd.concat([df_triples["start_id"], df_triples["end_id"]], ignore_index=True)
node_used = np.unique(ht_nodes.values)
print(f"Unique nodes appearing in triples: {node_used.shape[0]}")

Unique nodes appearing in triples: 2653751


In [6]:
df_nodes_used = pd.merge(
    pd.DataFrame(ht_nodes).groupby(0).first().reset_index(),
    df_nodes,
    left_on=0,
    right_on="node_id",
    how="left",
)[["node_id", "identifier", "name", "labels"]]
assert np.all(df_nodes_used["node_id"].values == node_used)
df_nodes_used

Unnamed: 0,node_id,identifier,name,labels
0,0,128239,IQ motif containing GTPase activating protein 3,Gene
1,1,83752,"lon peptidase 2, peroxisomal",Gene
2,2,10438,C1D nuclear receptor corepressor,Gene
3,3,154810,angiomotin like 1,Gene
4,4,4128,monoamine oxidase A,Gene
...,...,...,...,...
2653746,2869545,P11172,Uridine 5'-monophosphate synthase,Enzyme|Protein|Target
2653747,2869546,Q16831,Uridine phosphorylase 1,Enzyme|Protein|Target
2653748,2869547,P00749,Urokinase-type plasminogen activator,Enzyme|Protein|Target
2653749,2869548,Q9BQB6,Vitamin K epoxide reductase complex subunit 1,Enzyme|Protein|Target


Reorder entities so that entities with same label are contiguous.

In [8]:
df_nodes_used_sorted = df_nodes_used.sort_values("labels")
df_nodes_used_sorted["name"] = (
    df_nodes_used_sorted["identifier"].astype(str)
    + " ("
    + df_nodes_used_sorted["name"]
    + ")"
)

In [9]:
ent_dict = df_nodes_used_sorted["name"].values.tolist()
ent_dict[:20]

['UBERON:0001679 (ethmoid bone)',
 'UBERON:0001684 (mandible)',
 'UBERON:0001103 (diaphragm)',
 'UBERON:0001193 (hepatic artery)',
 'UBERON:0002055 (zona reticularis of adrenal gland)',
 'UBERON:0001827 (secretion of lacrimal gland)',
 'UBERON:0002250 (popliteal artery)',
 'UBERON:0002018 (synovial membrane of synovial joint)',
 'UBERON:0001914 (colostrum)',
 'UBERON:0002130 (cerebellar nuclear complex)',
 'UBERON:0001759 (vagus nerve)',
 'UBERON:0001645 (trigeminal nerve)',
 'UBERON:0002299 (alveolus of lung)',
 'UBERON:0001225 (cortex of kidney)',
 'UBERON:0001132 (parathyroid gland)',
 'UBERON:0000178 (blood)',
 'UBERON:0001463 (manual digit 1)',
 'UBERON:0003889 (fallopian tube)',
 'UBERON:0001910 (medial forebrain bundle)',
 'UBERON:0001567 (cheek)']

We consider all different relation labels as independent types.

In [10]:
type_offset = df_nodes_used_sorted.groupby("labels")["labels"].count().cumsum().shift(1)
type_offset.iloc[0] = 0
type_offset

labels
Anatomy                                                        0.0
BiologicalProcess                                            400.0
BlackBoxEvent|ReactionLikeEvent                            28729.0
Carrier|Enzyme|Protein                                     30672.0
Carrier|Enzyme|Protein|Target                              30673.0
                                                           ...    
Reaction|ReactionLikeEvent                               2631237.0
Treatment                                                2641770.0
VariantAnnotation|VariantDrugAnnotation                  2642368.0
VariantAnnotation|VariantFunctionalAnalysisAnnotation    2646949.0
VariantAnnotation|VariantPhenotypeAnnotation             2648025.0
Name: labels, Length: 66, dtype: float64

In [100]:
type_offset = type_offset.astype("int64").to_dict()

Build inverse index to map entity IDs in `df_triples` to the new entity mapping.

In [101]:
rev_idx = -np.ones(df_nodes["node_id"].max() + 1, dtype=np.int64)
rev_idx[df_nodes_used_sorted["node_id"].values] = np.arange(
    df_nodes_used_sorted.shape[0]
)

heads = rev_idx[df_triples["start_id"]]
tails = rev_idx[df_triples["end_id"]]

Relations

In [102]:
rel_dict, rel_id = np.unique(df_triples["type"].values, return_inverse=True)
len(rel_dict), rel_dict[:20]

(208,
 array(['ACTS_UPSTREAM_OF_GauoBP',
        'ACTS_UPSTREAM_OF_NEGATIVE_EFFECT_GauoneBP',
        'ACTS_UPSTREAM_OF_NEGATIVE_EFFECT_PauoneBP',
        'ACTS_UPSTREAM_OF_OR_WITHIN_GauoowBP',
        'ACTS_UPSTREAM_OF_OR_WITHIN_NEGATIVE_EFFECT_GauoowneBP',
        'ACTS_UPSTREAM_OF_OR_WITHIN_NEGATIVE_EFFECT_PauoowneBP',
        'ACTS_UPSTREAM_OF_OR_WITHIN_POSITIVE_EFFECT_GauoowpeBP',
        'ACTS_UPSTREAM_OF_OR_WITHIN_POSITIVE_EFFECT_PauoowpeBP',
        'ACTS_UPSTREAM_OF_OR_WITHIN_PauoowBP',
        'ACTS_UPSTREAM_OF_POSITIVE_EFFECT_GauopeBP',
        'ACTS_UPSTREAM_OF_POSITIVE_EFFECT_PauopeBP',
        'ACTS_UPSTREAM_OF_PauoBP', 'AFFECTS_DEGENERATION_CHadP',
        'AFFECTS_DEGENERATION_GadCH', 'AFFECTS_DEGENERATION_PadCH',
        'ASSOCIATES_CAaCH', 'ASSOCIATES_CAaG', 'ASSOCIATES_CAaPC',
        'ASSOCIATES_CAaPT', 'ASSOCIATES_CAaV'], dtype=object))

Finalize the array of triples.

In [103]:
triples = np.stack([heads, rel_id, tails], axis=1)
triples, triples.shape

(array([[2630567,     112,    2839],
        [2630567,     112,   26915],
        [2630567,     112,   27049],
        ...,
        [     38,     202,  267178],
        [    369,     202,  286998],
        [     38,     202,  286354]]),
 (15883653, 3))

Sanity check:

In [106]:
triple_id = 14256

ent_dict[triples[triple_id, 0]], rel_dict[triples[triple_id, 1]], ent_dict[
    triples[triple_id, 2]
]

('myocardium', 'EXPRESSES_AeG', 'RNA polymerase I subunit E')

which correctly coincides with

In [113]:
print(
    f"head: {df_nodes[df_nodes['node_id'] == df_triples.iloc[triple_id]['start_id']]['name'].iloc[0]}\n",
    f"relation : {df_triples.iloc[triple_id]['type']}\n",
    f"tail: {df_nodes[df_nodes['node_id'] == df_triples.iloc[triple_id]['end_id']]['name'].iloc[0]}",
)

head: myocardium
 relation : EXPRESSES_AeG
 tail: RNA polymerase I subunit E


## Save

Save triples and dictionaries.

In [115]:
torch.save(triples, data_path.joinpath("triples.pt"))
with open(data_path.joinpath("entity_dict.pkl"), "wb") as f:
    pickle.dump(ent_dict, f)
with open(data_path.joinpath("relation_dict.pkl"), "wb") as f:
    pickle.dump(rel_dict, f)
with open(data_path.joinpath("type_offset.pkl"), "wb") as f:
    pickle.dump(type_offset, f)