In [3]:
import pykeen
from pykeen.pipeline import pipeline
from pykeen.datasets import get_dataset

In [25]:
import pykeen
pykeen.env()


Key,Value
OS,posix
Platform,Linux
Release,6.5.0-35-generic
Time,Sun Jun 16 01:29:59 2024
Python,3.12.4
PyKEEN,1.10.2
PyKEEN Hash,UNHASHED
PyKEEN Branch,master
PyTorch,2.3.1+cu121
CUDA Available?,true


In [4]:
dataset = get_dataset(dataset="hetionet")

In [5]:
dataset.num_entities

45158

In [6]:
dataset.num_relations

24

In [112]:
# Run transe model with baseline parameters
def run_baseline(config: dict):

    pipeline_result = pipeline(
        dataset=config["dataset"],
        dataset_kwargs={
            "random_state": config["seed"],
            "create_inverse_triples": config["train"]["create_inverse"],
        },
        model=config["model"]["name"],
        model_kwargs={
            "embedding_dim": config["model"]["embedding_dim"],
            "random_seed": config["seed"],
        },
        training_loop="sLCWA",
        training_kwargs={
            "num_epochs": config["train"]["num_epoch"],
        },
        optimizer=config["optimizer"]["class"],
        optimizer_kwargs={"lr": config["optimizer"]["lr"]},
        negative_sampler_kwargs={
            "num_negs_per_pos": config["train"]["num_negative"],
        },
        stopper='early',
        random_seed=config["seed"],
        evaluator_kwargs={"filtered": True},
        use_testing_data=False,
    )
    
    pipeline_result.save_to_directory(config["save"]["path"])
    return pipeline_result


import yaml


def load_config(config_path: str) -> dict:
    """Load a YAML config file"""

    with open(config_path, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    return config

run_config: dict = load_config('transe.yaml')

In [None]:
result = run_baseline(run_config)

In [113]:
def print_metrics(result):
    print("MRR: {}".format(result.metric_results.get_metric("inverseharmonicmeanrank")))
    print("Hits@1: {}".format(result.metric_results.get_metric("hits_at_1")))
    print("Hits@3: {}".format(result.metric_results.get_metric("hits_at_3")))
    print("Hits@10: {}".format(result.metric_results.get_metric("hits_at_10")))

In [114]:
def print_metrics(metric_results):
    print("MRR: {}".format(metric_results.get_metric("inverseharmonicmeanrank")))
    print("Hits@1: {}".format(metric_results.get_metric("hits_at_1")))
    print("Hits@3: {}".format(metric_results.get_metric("hits_at_3")))
    print("Hits@10: {}".format(metric_results.get_metric("hits_at_10")))

In [None]:
result_ep100 = run_baseline(run_config)

In [116]:
print_metrics(result_ep100)

MRR: 0.06534771621227264
Hits@1: 0.027228690783041506
Hits@3: 0.06708070393742778
Hits@10: 0.14232068260599057


In [117]:
pack_ep100 = predict_triples(model=result_ep100.model, triples=dataset.testing)

df_ep100 = pack_ep100.process(factory=result_ep100.training).df

df_ep100.nlargest(n=30, columns="score")



Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label,score
214855,13774,Compound::DB00741,7,CrC,13659,Compound::DB00624,-4.139701
27758,13439,Compound::DB00390,7,CrC,14109,Compound::DB01092,-4.150275
195529,13491,Compound::DB00445,8,CtD,14788,Disease::DOID:2531,-4.154389
101763,14784,Disease::DOID:219,14,DrD,14774,Disease::DOID:1793,-4.170156
136918,13614,Compound::DB00578,7,CrC,14071,Compound::DB01053,-4.195703
122879,13750,Compound::DB00717,7,CrC,13659,Compound::DB00624,-4.390118
157352,14742,Disease::DOID:11934,14,DrD,14783,Disease::DOID:2174,-4.400328
139022,14806,Disease::DOID:3953,14,DrD,14772,Disease::DOID:1781,-4.405519
126803,13785,Compound::DB00752,7,CrC,13241,Compound::DB00182,-4.40939
54739,14087,Compound::DB01069,7,CrC,13480,Compound::DB00433,-4.452825


In [65]:
triples_factory = dataset.training

In [118]:
# The relation Compound - treats - Disease
ctd_id = torch.as_tensor(triples_factory.relations_to_ids(["CtD"]))

ctd_id

tensor([8])

In [119]:
# Filter test triples to include only those with the "CtD" relation
test_triples = dataset.testing.mapped_triples
ctd_triples = test_triples[test_triples[:, 1] == ctd_id]

In [71]:
# Make predictions for the filtered triples
from pykeen.evaluation import RankBasedEvaluator
evaluator = RankBasedEvaluator()
metrics = evaluator.evaluate(result_ep100.model, mapped_triples=ctd_triples)

# Print the metrics
print(metrics)

given. This means you probably forgot to pass (at least) the training triples. Try:

    additional_filter_triples=[dataset.training.mapped_triples]

Or if you want to use the Bordes et al. (2013) approach to filtering, do:

    additional_filter_triples=[
        dataset.training.mapped_triples,
        dataset.validation.mapped_triples,
    ]

Evaluating on cuda:0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71.0/71.0 [00:00<00:00, 470triple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 0.16s seconds


<pykeen.evaluation.rank_based_evaluator.RankBasedMetricResults object at 0x722587170530>


In [76]:
print_metrics(metrics)

MRR: 0.08314799517393112
Hits@1: 0.02112676056338028
Hits@3: 0.08450704225352113
Hits@10: 0.2112676056338028


In [79]:
pack_ctd = predict_triples(model=result_ep100.model, triples=ctd_triples)

df_ctd = pack_ctd.process(factory=result_ep100.training).df

df_ctd.nlargest(n=30, columns="score")



Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label,score
63,13491,Compound::DB00445,8,CtD,14788,Disease::DOID:2531,-4.154389
26,13600,Compound::DB00563,8,CtD,14826,Disease::DOID:7148,-4.750464
14,13654,Compound::DB00619,8,CtD,14788,Disease::DOID:2531,-4.78891
3,14458,Compound::DB04572,8,CtD,14788,Disease::DOID:2531,-4.832768
62,13487,Compound::DB00441,8,CtD,14757,Disease::DOID:1324,-4.902373
24,14194,Compound::DB01181,8,CtD,14788,Disease::DOID:2531,-5.158857
59,13607,Compound::DB00570,8,CtD,14757,Disease::DOID:1324,-5.21343
52,13491,Compound::DB00445,8,CtD,14774,Disease::DOID:1793,-5.238699
44,13882,Compound::DB00853,8,CtD,14755,Disease::DOID:1319,-5.285707
66,13424,Compound::DB00373,8,CtD,14802,Disease::DOID:3393,-5.294956


In [80]:
id_to_entity = {v: k for k, v in dataset.entity_to_id.items()}

In [81]:
id_to_entity[13491]

'Compound::DB00445'

In [106]:
import requests

# Define a cache dictionary for disease labels
disease_cache = {}

# Get disease label from by id from  Disease Ontology
def get_disease_label(disease_id):
    if disease_id.startswith("Disease::"):
        disease_id = disease_id.split("::")[1]

    # Check if the disease ID is already in the cache
    if disease_id in disease_cache:
        return disease_cache[disease_id]
    
    url = f"https://www.ebi.ac.uk/ols/api/ontologies/doid/terms?obo_id=DOID:{disease_id.split(':')[1]}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        if 'label' in data['_embedded']['terms'][0]:
            label = data['_embedded']['terms'][0]['label']
            # Store the label in the cache
            disease_cache[disease_id] = label
            return label
    
    # If the request fails or the label is not found, return the ID itself
    disease_cache[disease_id] = disease_id
    return disease_id

In [98]:
import pandas as pd

def create_drug_mapping(file_path):
    # Read the TSV file into a pandas DataFrame
    df = pd.read_csv(file_path, sep='\t')

    # Create a dictionary from the DataFrame
    drug_mapping = dict(zip(df['drugbankId'], df['name']))

    return drug_mapping

In [99]:
def get_drug_label(drug_id, drug_mapping):
    # Remove the prefix if present
    if drug_id.startswith("Compound::"):
        drug_id = drug_id.split("::")[1]
    
    # Return the label if present, otherwise return the ID itself
    return drug_mapping.get(drug_id, drug_id)

# Example usage
file_path = 'drug-mappings.tsv'
drug_mapping = create_drug_mapping(file_path)

In [107]:
get_drug_label("Compound::DB00445", drug_mapping)

'Epirubicin'

In [110]:
df_ctd_clean = pd.DataFrame(df_ctd)

# Replace DrugBank IDs with labels
df_ctd_clean['head_label'] = df_ctd_clean['head_label'].apply(lambda x: get_drug_label(x, drug_mapping))

# Replace Disease IDs with labels
df_ctd_clean['tail_label'] = df_ctd_clean['tail_label'].apply(get_disease_label)

# Drop the columns 'head_id', 'relation_id', 'tail_id'
df_ctd_clean = df_ctd_clean.drop(columns=['head_id', 'relation_id', 'tail_id'])

# Rename the columns for clarity
df_ctd_clean = df_ctd_clean.rename(columns={
    'head_label': 'Compound',
    'relation_label': 'Relation',
    'tail_label': 'Disease'
})

# Display the updated DataFrame
df_ctd_clean.nlargest(n=30, columns="score")

Unnamed: 0,Compound,Relation,Disease,score
63,Epirubicin,CtD,hematologic cancer,-4.154389
26,Methotrexate,CtD,rheumatoid arthritis,-4.750464
14,Imatinib,CtD,hematologic cancer,-4.78891
3,Thiotepa,CtD,hematologic cancer,-4.832768
62,Gemcitabine,CtD,lung cancer,-4.902373
24,Ifosfamide,CtD,hematologic cancer,-5.158857
59,Vinblastine,CtD,lung cancer,-5.21343
52,Epirubicin,CtD,pancreatic cancer,-5.238699
44,Temozolomide,CtD,brain cancer,-5.285707
66,Timolol,CtD,coronary artery disease,-5.294956
