Copyright (c) 2023 Graphcore Ltd. All rights reserved.

# Link common triples in Hetionet and PharMeBINet

In [None]:
import ast
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import torch

phbnet_data_path = Path("../datasets/data/pharmebinet/")
hetionet_data_path = Path("../datasets/data/hetionet/")

## Load original data

In [21]:
hetionet_triples = pd.read_csv(hetionet_data_path.joinpath("edges.sif.gz"), sep="\t")
hetionet_nodes = pd.read_csv(hetionet_data_path.joinpath("nodes.tsv"), sep="\t")
hetionet_rels = pd.read_csv(hetionet_data_path.joinpath("metaedges.tsv"), sep="\t")

In [22]:
hetionet_triples

Unnamed: 0,source,metaedge,target
0,Gene::9021,GpBP,Biological Process::GO:0071357
1,Gene::51676,GpBP,Biological Process::GO:0098780
2,Gene::19,GpBP,Biological Process::GO:0055088
3,Gene::3176,GpBP,Biological Process::GO:0010243
4,Gene::3039,GpBP,Biological Process::GO:0006898
...,...,...,...
2250192,Anatomy::UBERON:0000057,AeG,Gene::65009
2250193,Anatomy::UBERON:0000474,AeG,Gene::80279
2250194,Anatomy::UBERON:0002048,AeG,Gene::1211
2250195,Anatomy::UBERON:0002048,AeG,Gene::8843


In [23]:
hetionet_nodes

Unnamed: 0,id,name,kind
0,Anatomy::UBERON:0000002,uterine cervix,Anatomy
1,Anatomy::UBERON:0000004,nose,Anatomy
2,Anatomy::UBERON:0000006,islet of Langerhans,Anatomy
3,Anatomy::UBERON:0000007,pituitary gland,Anatomy
4,Anatomy::UBERON:0000010,peripheral nervous system,Anatomy
...,...,...,...
47026,Symptom::D064250,Hypertriglyceridemic Waist,Symptom
47027,Symptom::D065634,Cerebrospinal Fluid Leak,Symptom
47028,Symptom::D065635,Benign Paroxysmal Positional Vertigo,Symptom
47029,Symptom::D065906,Hyperlactatemia,Symptom


In [24]:
hetionet_rels

Unnamed: 0,metaedge,abbreviation,edges,source_nodes,target_nodes,unbiased
0,Anatomy - downregulates - Gene,AdG,102240,36,15097,102240
1,Anatomy - expresses - Gene,AeG,526407,241,18094,453477
2,Anatomy - upregulates - Gene,AuG,97848,36,15929,97848
3,Compound - binds - Gene,CbG,11571,1389,1689,0
4,Compound - causes - Side Effect,CcSE,138944,1071,5701,0
5,Compound - downregulates - Gene,CdG,21102,734,2880,21102
6,Compound - palliates - Disease,CpD,390,221,50,0
7,Compound - resembles - Compound,CrC,6486,1042,1054,6486
8,Compound - treats - Disease,CtD,755,387,77,0
9,Compound - upregulates - Gene,CuG,18756,703,3247,18756


In [25]:
phbnet_triples = pd.read_csv(phbnet_data_path.joinpath("edges.tsv"), sep="\t")
phbnet_nodes = pd.read_csv(phbnet_data_path.joinpath("nodes.tsv"), sep="\t")

  phbnet_nodes = pd.read_csv(phbnet_data_path.joinpath("nodes.tsv"), sep="\t")


In [26]:
phbnet_triples[phbnet_triples.duplicated(["start_id", "end_id", "type"])][
    "type"
].value_counts()

type
HAS_ChPR                                    5853
ASSOCIATES_CHaP                                8
OCCURS_IN_PWoiPW                               5
PARTICIPATES_IN_PWpiRLE                        5
ASSOCIATES_PaCH                                4
DOWNREGULATES_CHdP                             4
LEADS_TO_PWltD                                 4
IS_ACTIVE_IN_METABOLISM_CHiaimP                3
UPREGULATES_CHuP                               2
IS_ACTIVE_ON_DNA_OR_RNA_LEVEL_CHiaodorlP       2
Name: count, dtype: int64

In [27]:
phbnet_triples

Unnamed: 0,relationship_id,type,start_id,end_id,properties,resource,license,source,url
0,0,INVOLVED_IN_PiiBP,192000,304346,"{""date"": [""20210905""], ""with_from"": [""PANTHER:...",['GO'],CC BY 4.0,Gene Ontology,http://purl.obolibrary.org/obo/GO:0034613
1,1,INVOLVED_IN_PiiBP,192000,297825,"{""date"": [""20210623""], ""with_from"": [""PANTHER:...",['GO'],CC BY 4.0,Gene Ontology,http://purl.obolibrary.org/obo/GO:0007165
2,2,INVOLVED_IN_PiiBP,192000,297959,"{""date"": [""20210623""], ""with_from"": [""PANTHER:...",['GO'],CC BY 4.0,Gene Ontology,http://purl.obolibrary.org/obo/GO:0007346
3,3,INVOLVED_IN_PiiBP,192000,317503,"{""date"": [""20200226""], ""go"": ""yes"", ""pubMed_id...",['GO'],CC BY 4.0,Gene Ontology,http://purl.obolibrary.org/obo/GO:1901020
4,4,INVOLVED_IN_PiiBP,192000,295456,"{""date"": [""20211127""], ""with_from"": [""UniProtK...",['GO'],CC BY 4.0,Gene Ontology,http://purl.obolibrary.org/obo/GO:0001764
...,...,...,...,...,...,...,...,...,...
15883648,15884076,UPREGULATES_AuG,2853771,20550,"{""unbiased"": true, ""hetionet"": ""yes""}",['Hetionet'],,Bgee,
15883649,15884077,UPREGULATES_AuG,2853652,15012,"{""unbiased"": true, ""hetionet"": ""yes""}",['Hetionet'],,Bgee,
15883650,15884078,UPREGULATES_AuG,2853552,33011,"{""unbiased"": true, ""hetionet"": ""yes""}",['Hetionet'],,Bgee,
15883651,15884079,UPREGULATES_AuG,2853739,7999,"{""unbiased"": true, ""hetionet"": ""yes""}",['Hetionet'],,Bgee,


In [28]:
phbnet_nodes

Unnamed: 0,node_id,labels,properties,name,identifier,resource,license,source,url
0,0,Gene,"{""chromosome"": ""1"", ""go"": ""yes"", ""xrefs"": [""Ph...",IQ motif containing GTPase activating protein 3,128239,"['CTD', 'ClinVar', 'GO', 'Hetionet', 'NCBI', '...",CC0 1.0,Entrez Gene,http://identifiers.org/ncbigene/128239
1,1,Gene,"{""synonyms"": [""LONP"", ""LONPL"", ""PLON"", ""PSLON""...","lon peptidase 2, peroxisomal",83752,"['CTD', 'ClinVar', 'GO', 'Hetionet', 'NCBI', '...",CC0 1.0,Entrez Gene,http://identifiers.org/ncbigene/83752
2,2,Gene,"{""synonyms"": [""LRP1"", ""Rrp47"", ""SUN-CoR"", ""SUN...",C1D nuclear receptor corepressor,10438,"['CTD', 'ClinVar', 'GO', 'Hetionet', 'NCBI', '...",CC0 1.0,Entrez Gene,http://identifiers.org/ncbigene/10438
3,3,Gene,"{""synonyms"": [""JEAP""], ""chromosome"": ""11"", ""go...",angiomotin like 1,154810,"['CTD', 'ClinVar', 'GO', 'Hetionet', 'NCBI', '...",CC0 1.0,Entrez Gene,http://identifiers.org/ncbigene/154810
4,4,Gene,"{""synonyms"": [""BRNRS"", ""MAO-A""], ""chromosome"":...",monoamine oxidase A,4128,"['CTD', 'ClinVar', 'GO', 'Hetionet', 'NCBI', '...",CC0 1.0,Entrez Gene,http://identifiers.org/ncbigene/4128
...,...,...,...,...,...,...,...,...,...
2869402,2869545,Enzyme|Protein|Target,"{""protein_existence"": ""evidence at protein lev...",Uridine 5'-monophosphate synthase,P11172,"['DrugBank', 'GO', 'IID', 'Reactome', 'UniProt']",CC BY 4.0,UniProt,https://www.uniprot.org/uniprot/P11172
2869403,2869546,Enzyme|Protein|Target,"{""protein_existence"": ""evidence at protein lev...",Uridine phosphorylase 1,Q16831,"['DrugBank', 'GO', 'IID', 'Reactome', 'UniProt']",CC BY 4.0,UniProt,https://www.uniprot.org/uniprot/Q16831
2869404,2869547,Enzyme|Protein|Target,"{""protein_existence"": ""evidence at protein lev...",Urokinase-type plasminogen activator,P00749,"['DrugBank', 'GO', 'IID', 'Reactome', 'UniProt']",CC BY 4.0,UniProt,https://www.uniprot.org/uniprot/P00749
2869405,2869548,Enzyme|Protein|Target,"{""protein_existence"": ""evidence at protein lev...",Vitamin K epoxide reductase complex subunit 1,Q9BQB6,"['DrugBank', 'GO', 'IID', 'Reactome', 'UniProt']",CC BY 4.0,UniProt,https://www.uniprot.org/uniprot/Q9BQB6


## Bridge datasets

Every node in Hetionet has a label of the form `ENTITY_TYPE::IDENTIFIER`. In PharMeBINet, the `IDENTIFIER` appears in the column `identifier` of the DataFrame `phbnet_nodes`. This allows us to link the triples that appear in both datasets.

We consider as example one relation type common to both datasets.

In [86]:
hetionet_rel = "CbG"
phbnet_rel = "BINDS_CHbG"

In [87]:
hetioent_triples_rel = hetionet_triples[hetionet_triples["metaedge"] == hetionet_rel]
hetioent_triples_rel = pd.concat(
    [
        hetioent_triples_rel["source"]
        .str.split("::", expand=True)
        .rename(columns={0: "source_type", 1: "source_id"}),
        hetioent_triples_rel["target"]
        .str.split("::", expand=True)
        .rename(columns={0: "target_type", 1: "target_id"}),
    ],
    axis=1,
)
hetioent_triples_rel

Unnamed: 0,source_type,source_id,target_type,target_id
728491,Compound,DB00514,Gene,1136
728492,Compound,DB00686,Gene,2246
728493,Compound,DB00786,Gene,4317
728494,Compound,DB01209,Gene,4988
728495,Compound,DB01588,Gene,2560
...,...,...,...,...
740057,Compound,DB01656,Gene,5143
740058,Compound,DB01021,Gene,11238
740059,Compound,DB02546,Gene,79885
740060,Compound,DB04946,Gene,1576


In [88]:
def mondo_to_doid(df):
    # MONDO -> DOID IDs for disease nodes
    return df.apply(
        lambda x: [
            aid
            for aid in ast.literal_eval(x.properties)["xrefs"]
            if aid.split(":")[0] == "DOID"
        ][0],
        axis=1,
    )


phbnet_triples_rel = phbnet_triples[phbnet_triples["type"] == phbnet_rel]
h_type, t_type = hetioent_triples_rel.iloc[0][["source_type", "target_type"]]

if h_type == "Disease":
    phbnet_triples_rel["source_id"] = mondo_to_doid(
        phbnet_nodes.set_index("node_id").loc[phbnet_triples_rel.start_id]
    ).values
else:
    phbnet_triples_rel["source_id"] = (
        phbnet_nodes.set_index("node_id")["identifier"]
        .loc[phbnet_triples_rel.start_id]
        .values
    )

if t_type == "Disease":
    phbnet_triples_rel["target_id"] = mondo_to_doid(
        phbnet_nodes.set_index("node_id").loc[phbnet_triples_rel.end_id]
    ).values
else:
    phbnet_triples_rel["target_id"] = (
        phbnet_nodes.set_index("node_id")["identifier"]
        .loc[phbnet_triples_rel.end_id]
        .values
    )

phbnet_triples_rel

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phbnet_triples_rel["source_id"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phbnet_triples_rel["target_id"] = (


Unnamed: 0,relationship_id,type,start_id,end_id,properties,resource,license,source,url,source_id,target_id
9000,9000,BINDS_CHbG,23000,31195,"{""urls"": [""https://www.ebi.ac.uk/chembl/compou...",['Hetionet'],CC BY 4.0,DrugCentral (ChEMBL),,DB01148,27115
9001,9001,BINDS_CHbG,23000,20923,"{""urls"": [""https://www.ebi.ac.uk/chembl/compou...",['Hetionet'],CC BY 4.0,DrugCentral (ChEMBL),,DB01148,5150
9002,9002,BINDS_CHbG,23000,13344,"{""unbiased"": false, ""pubMed_ids"": [""10858873"",...",['Hetionet'],CC BY-NC 4.0,DrugBank (target),,DB01148,1129
9003,9003,BINDS_CHbG,23000,22459,"{""unbiased"": false, ""pubMed_ids"": [""17016423"",...",['Hetionet'],CC BY-NC 4.0,DrugBank (target),,DB01148,1128
9004,9004,BINDS_CHbG,23000,6113,"{""urls"": [""https://www.ebi.ac.uk/chembl/compou...",['Hetionet'],CC BY 4.0,DrugCentral (ChEMBL),,DB01148,5144
...,...,...,...,...,...,...,...,...,...,...,...
15849730,15849873,BINDS_CHbG,2723762,33672,"{""ctd"": ""yes"", ""url_ctd"": ""http://ctdbase.org/...",['CTD'],© 2002–2012 MDI Biological Laboratory. © 2012–...,CTD,http://ctdbase.org/detail.go?type=gene&acc=7157,C065087,7157
15849731,15849874,BINDS_CHbG,2713233,33672,"{""ctd"": ""yes"", ""url_ctd"": ""http://ctdbase.org/...",['CTD'],© 2002–2012 MDI Biological Laboratory. © 2012–...,CTD,http://ctdbase.org/detail.go?type=gene&acc=7157,C074702,7157
15849732,15849875,BINDS_CHbG,2693300,33672,"{""ctd"": ""yes"", ""url_ctd"": ""http://ctdbase.org/...",['CTD'],© 2002–2012 MDI Biological Laboratory. © 2012–...,CTD,http://ctdbase.org/detail.go?type=gene&acc=7157,D000171,7157
15849733,15849876,BINDS_CHbG,2688151,33672,"{""ctd"": ""yes"", ""url_ctd"": ""http://ctdbase.org/...",['CTD'],© 2002–2012 MDI Biological Laboratory. © 2012–...,CTD,http://ctdbase.org/detail.go?type=gene&acc=7157,D015123,7157


Correspondence of row indices (in the original dfs, or equivalently in the `triples.pt` numpy arrays saved at the end of our pre-processing notebooks) for the triples of the selected relation type appearing in both datasets: `index_hetio <-> index_phbnet`.

In [89]:
idx_bridge = pd.merge(
    hetioent_triples_rel[["source_id", "target_id"]].reset_index(),
    phbnet_triples_rel[["source_id", "target_id"]].reset_index(),
    on=["source_id", "target_id"],
    suffixes=["_hetio", "_phbnet"],
)
idx_bridge

Unnamed: 0,index_hetio,source_id,target_id,index_phbnet
0,728491,DB00514,1136,96205
1,728492,DB00686,2246,124500
2,728493,DB00786,4317,96682
3,728494,DB01209,4988,51074
4,728495,DB01588,2560,24771
...,...,...,...,...
11539,740057,DB01656,5143,38881
11540,740058,DB01021,11238,51916
11541,740059,DB02546,79885,124618
11542,740060,DB04946,1576,51630


## Sanity Check

Check on a random triple

In [90]:
# triple index in `idx_bridge``

idx = 500

Hetionet - original df

In [91]:
htr = hetionet_triples.loc[idx_bridge.loc[idx, "index_hetio"]]
hetionet_nodes[hetionet_nodes.id == htr["source"]], hetionet_nodes[
    hetionet_nodes.id == htr["target"]
]

(                      id                name      kind
 14048  Compound::DB01021  Trichlormethiazide  Compound,
               id  name  kind
 32057  Gene::771  CA12  Gene)

Hetionet - pre-processed

In [92]:
hetionet_entity_dict = pickle.load(
    open(hetionet_data_path.joinpath("entity_dict.pkl"), "rb")
)
hetionet_rel_dict = pickle.load(
    open(hetionet_data_path.joinpath("relation_dict.pkl"), "rb")
)
hetionet_saved_triples = torch.load(hetionet_data_path.joinpath("triples.pt"))

h, r, t = hetionet_saved_triples[idx_bridge.loc[idx, "index_hetio"]]
hetionet_entity_dict[h], hetionet_rel_dict[r], hetionet_entity_dict[t]

('Compound::DB01021 (Trichlormethiazide)',
 'CbG (Compound - binds - Gene)',
 'Gene::771 (CA12)')

PharMeBINet - original data

In [93]:
ptr = phbnet_triples.loc[idx_bridge.loc[idx, "index_phbnet"]]
phbnet_nodes[phbnet_nodes.node_id == ptr["start_id"]]["name"], phbnet_nodes[
    phbnet_nodes.node_id == ptr["end_id"]
]["name"]

(23469    Trichlormethiazide
 Name: name, dtype: object,
 29006    carbonic anhydrase 12
 Name: name, dtype: object)

PharMeBINet - pre-processed

In [94]:
phbnet_entity_dict = pickle.load(
    open(phbnet_data_path.joinpath("entity_dict.pkl"), "rb")
)
phbnet_rel_dict = pickle.load(
    open(phbnet_data_path.joinpath("relation_dict.pkl"), "rb")
)
phbnet_saved_triples = torch.load(phbnet_data_path.joinpath("triples.pt"))

h, r, t = phbnet_saved_triples[idx_bridge.loc[idx, "index_phbnet"]]
phbnet_entity_dict[h], phbnet_rel_dict[r], phbnet_entity_dict[t]

('Trichlormethiazide', 'BINDS_CHbG', 'carbonic anhydrase 12')

## Manual split

Take 10% of triples of the required relation type (same triples for both datasets) as test set; use everyhing else for training.

In [202]:
test_split = 0.1
seed = 123

# the test triples will be the rows in `idx_bridge` with the following indices
test_idx = np.random.default_rng(seed=seed).permutation(idx_bridge.index)[
    : int(test_split * idx_bridge.shape[0])
]
print(f"# test triples: {test_idx.shape[0]}")

# test triples: 760


In [203]:
hetionet_test_idx = idx_bridge.loc[test_idx]["index_hetio"].values
hetionet_other_idx = np.setdiff1d(
    np.arange(hetionet_saved_triples.shape[0]), hetionet_test_idx
)
hetionet_manual_triples = {
    "train": hetionet_saved_triples[hetionet_other_idx],
    "valid": hetionet_saved_triples[hetionet_other_idx][
        :1
    ],  # put 1 triple here for compatibility with train.py
    "test": hetionet_saved_triples[hetionet_test_idx],
}

with open(
    hetionet_data_path.joinpath(f"triples_manual_{hetionet_rel}_seed{seed}.pkl"), "wb"
) as fout:
    pickle.dump(hetionet_manual_triples, fout)

In [204]:
phbnet_test_idx = idx_bridge.loc[test_idx]["index_phbnet"].values
phbnet_other_idx = np.setdiff1d(
    np.arange(phbnet_saved_triples.shape[0]), phbnet_test_idx
)
phbnet_manual_triples = {
    "train": phbnet_saved_triples[phbnet_other_idx],
    "valid": phbnet_saved_triples[phbnet_other_idx][
        :1
    ],  # put 1 triple here for compatibility with train.py
    "test": phbnet_saved_triples[phbnet_test_idx],
}

with open(
    phbnet_data_path.joinpath(f"triples_manual_{phbnet_rel}_seed{seed}.pkl"), "wb"
) as fout:
    pickle.dump(phbnet_manual_triples, fout)

## Save candidate tails

Save the set of common tails, between the two datasets, used by the selected relation type. It is guaranteed that all test triples have tails in this set.

In [95]:
# index of one triple realizing each possible tail used by the relation type
tr_idx_hetio = hetioent_triples_rel.reset_index().groupby("target_id").first()["index"]
tr_idx_phbnet = phbnet_triples_rel.reset_index().groupby("target_id").first()["index"]

# tails that are common to both datasets
tr_idx_bridge = pd.merge(
    tr_idx_hetio,
    tr_idx_phbnet,
    left_index=True,
    right_index=True,
    suffixes=["_hetio", "_phbnet"],
)
print(f"# candidate tails: {tr_idx_bridge.shape[0]}")

# tail IDs in the two datasets
tails_hetio = hetionet_saved_triples[tr_idx_bridge.index_hetio, 2]
torch.save(
    tails_hetio, hetionet_data_path.joinpath(f"candidate_tails_{hetionet_rel}.pt")
)

tails_phbnet = phbnet_saved_triples[tr_idx_bridge.index_phbnet, 2]
torch.save(tails_phbnet, phbnet_data_path.joinpath(f"candidate_tails_{phbnet_rel}.pt"))

# candidate tails: 1689
