In [10]:
import pykeen
from pykeen.pipeline import pipeline
from pykeen.datasets import get_dataset

In [1]:
import pykeen
pykeen.env()


Key,Value
OS,posix
Platform,Linux
Release,6.5.0-41-generic
Time,Wed Jun 19 21:14:10 2024
Python,3.12.4
PyKEEN,1.10.2
PyKEEN Hash,aa183023
PyKEEN Branch,master
PyTorch,2.3.1+cu121
CUDA Available?,true


In [12]:
dataset = get_dataset(dataset="hetionet")

In [13]:
dataset.num_entities

45158

In [130]:
dataset.num_relations

24

In [283]:
# Run transe model with baseline parameters
def run_baseline(config: dict):

    pipeline_result = pipeline(
        dataset=config["dataset"],
        dataset_kwargs={
            "random_state": config["seed"],
            "create_inverse_triples": config["train"]["create_inverse"],
        },
        model=config["model"]["name"],
        model_kwargs={
            "embedding_dim": config["model"]["embedding_dim"],
            "random_seed": config["seed"],
        },
        training_loop="sLCWA",
        training_kwargs={
            "num_epochs": config["train"]["num_epoch"],
        },
        optimizer=config["optimizer"]["class"],
        optimizer_kwargs={"lr": config["optimizer"]["lr"]},
        negative_sampler_kwargs={
            "num_negs_per_pos": config["train"]["num_negative"],
        },
        evaluation_relation_whitelist = set(ctd_config['train']['evaluation_relation_whitelist']),
        stopper='early',
        stopper_kwargs=dict(frequency=10, patience=3, relative_delta=0.002),
        random_seed=config["seed"],
        evaluator_kwargs={"filtered": True},
        use_testing_data=False,
        result_tracker='mlflow',
        result_tracker_kwargs=dict(
        tracking_uri='http://localhost:5000',
        experiment_name=config["experiment_name"],
    ),
    )
    
    pipeline_result.save_to_directory(config["save"]["path"])
    return pipeline_result


import yaml


def load_config(config_path: str) -> dict:
    """Load a YAML config file"""

    with open(config_path, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    return config

run_config: dict = load_config('config/transe.yaml')

In [15]:
def print_metrics(metric_results):
    print("MRR: {}".format(metric_results.get_metric("inverseharmonicmeanrank")))
    print("Hits@1: {}".format(metric_results.get_metric("hits_at_1")))
    print("Hits@3: {}".format(metric_results.get_metric("hits_at_3")))
    print("Hits@10: {}".format(metric_results.get_metric("hits_at_10")))

In [147]:
# result_ep100 = run_baseline(run_config)

model_ep100 = torch.load('results/baseline/transe_hetionet_ep100/trained_model.pkl')

In [None]:
# print_metrics(result_ep100.metric_results)

In [166]:
from pykeen.predict import predict_all
pack_ep100_all = predict_all(model=model_ep100, k=50)

                                                                                                                                                                                                                                                                                                           

In [173]:
df_ep100_all = pack_ep100_all.process(factory=dataset.training).df

df_ep100_all.nlargest(n=50, columns="score")

Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label,score
0,59,Anatomy::UBERON:0001016,14,DrD,10,Anatomy::UBERON:0000033,-3.015952
1,30068,Gene::7415,7,CrC,22231,Gene::3065,-3.092649
2,38131,Pathway::PC7_7457,7,CrC,37454,Pathway::PC7_4043,-3.134512
3,29,Anatomy::UBERON:0000948,14,DrD,279,Anatomy::UBERON:0002048,-3.15509
28,65,Anatomy::UBERON:0001064,7,CrC,65,Anatomy::UBERON:0001064,-3.163475
29,66,Anatomy::UBERON:0001067,7,CrC,66,Anatomy::UBERON:0001067,-3.163475
30,67,Anatomy::UBERON:0001070,7,CrC,67,Anatomy::UBERON:0001070,-3.163475
31,69,Anatomy::UBERON:0001088,7,CrC,69,Anatomy::UBERON:0001088,-3.163475
32,75,Anatomy::UBERON:0001111,7,CrC,75,Anatomy::UBERON:0001111,-3.163475
33,78,Anatomy::UBERON:0001135,7,CrC,78,Anatomy::UBERON:0001135,-3.163475


In [168]:
pack_ep100 = predict_triples(model=model_ep100, triples=dataset.testing)

df_ep100 = pack_ep100.process(factory=dataset.training).df

df_ep100.nlargest(n=30, columns="score")



Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label,score
214855,13774,Compound::DB00741,7,CrC,13659,Compound::DB00624,-4.139701
27758,13439,Compound::DB00390,7,CrC,14109,Compound::DB01092,-4.150275
195529,13491,Compound::DB00445,8,CtD,14788,Disease::DOID:2531,-4.154389
101763,14784,Disease::DOID:219,14,DrD,14774,Disease::DOID:1793,-4.170156
136918,13614,Compound::DB00578,7,CrC,14071,Compound::DB01053,-4.195703
122879,13750,Compound::DB00717,7,CrC,13659,Compound::DB00624,-4.390118
157352,14742,Disease::DOID:11934,14,DrD,14783,Disease::DOID:2174,-4.400328
139022,14806,Disease::DOID:3953,14,DrD,14772,Disease::DOID:1781,-4.405519
126803,13785,Compound::DB00752,7,CrC,13241,Compound::DB00182,-4.40939
54739,14087,Compound::DB01069,7,CrC,13480,Compound::DB00433,-4.452825


In [151]:
triples_factory = dataset.training

In [152]:
# The relation Compound - treats - Disease
ctd_id = torch.as_tensor(triples_factory.relations_to_ids(["CtD"]))

ctd_id

tensor([8])

In [174]:
# Filter test triples to include only those with the "CtD" relation
test_triples = dataset.testing.mapped_triples
ctd_triples = test_triples[test_triples[:, 1] == ctd_id]

In [189]:
# Make predictions for the filtered triples
from pykeen.evaluation import RankBasedEvaluator
evaluator = RankBasedEvaluator()
metrics = evaluator.evaluate(model_ep100, mapped_triples=ctd_triples)

given. This means you probably forgot to pass (at least) the training triples. Try:

    additional_filter_triples=[dataset.training.mapped_triples]

Or if you want to use the Bordes et al. (2013) approach to filtering, do:

    additional_filter_triples=[
        dataset.training.mapped_triples,
        dataset.validation.mapped_triples,
    ]

Evaluating on cuda:0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71.0/71.0 [00:00<00:00, 910triple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 0.09s seconds


In [190]:
print_metrics(metrics)

MRR: 0.08314799517393112
Hits@1: 0.02112676056338028
Hits@3: 0.08450704225352113
Hits@10: 0.2112676056338028


In [176]:
pack_ctd = predict_triples(model=model_ep100, triples=ctd_triples)

df_ctd = pack_ctd.process(factory=dataset.training).df

df_ctd.nlargest(n=30, columns="score")



Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label,score
63,13491,Compound::DB00445,8,CtD,14788,Disease::DOID:2531,-4.154389
26,13600,Compound::DB00563,8,CtD,14826,Disease::DOID:7148,-4.750464
14,13654,Compound::DB00619,8,CtD,14788,Disease::DOID:2531,-4.78891
3,14458,Compound::DB04572,8,CtD,14788,Disease::DOID:2531,-4.832768
62,13487,Compound::DB00441,8,CtD,14757,Disease::DOID:1324,-4.902373
24,14194,Compound::DB01181,8,CtD,14788,Disease::DOID:2531,-5.158857
59,13607,Compound::DB00570,8,CtD,14757,Disease::DOID:1324,-5.21343
52,13491,Compound::DB00445,8,CtD,14774,Disease::DOID:1793,-5.238699
44,13882,Compound::DB00853,8,CtD,14755,Disease::DOID:1319,-5.285707
66,13424,Compound::DB00373,8,CtD,14802,Disease::DOID:3393,-5.294956


In [177]:
id_to_entity = {v: k for k, v in dataset.entity_to_id.items()}

id_to_entity[13491]

'Compound::DB00445'

In [178]:
import requests

# Define a cache dictionary for disease labels
disease_cache = {}

# Get disease label from by id from  Disease Ontology
def get_disease_label(disease_id):
    if disease_id.startswith("Disease::"):
        disease_id = disease_id.split("::")[1]

    # Check if the disease ID is already in the cache
    if disease_id in disease_cache:
        return disease_cache[disease_id]
    
    url = f"https://www.ebi.ac.uk/ols/api/ontologies/doid/terms?obo_id=DOID:{disease_id.split(':')[1]}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        if 'label' in data['_embedded']['terms'][0]:
            label = data['_embedded']['terms'][0]['label']
            # Store the label in the cache
            disease_cache[disease_id] = label
            return label
    
    # If the request fails or the label is not found, return the ID itself
    disease_cache[disease_id] = disease_id
    return disease_id

In [46]:
import pandas as pd

def create_drug_mapping(file_path):
    # Read the TSV file into a pandas DataFrame
    df = pd.read_csv(file_path, sep='\t')

    # Create a dictionary from the DataFrame
    drug_mapping = dict(zip(df['drugbankId'], df['name']))

    return drug_mapping

In [47]:
def get_drug_label(drug_id, drug_mapping):
    # Remove the prefix if present
    if drug_id.startswith("Compound::"):
        drug_id = drug_id.split("::")[1]
    
    # Return the label if present, otherwise return the ID itself
    return drug_mapping.get(drug_id, drug_id)

# Example usage
file_path = 'drug-mappings.tsv'
drug_mapping = create_drug_mapping(file_path)

In [49]:
get_drug_label("Compound::DB00445", drug_mapping)

'Epirubicin'

In [182]:
df_ctd_clean = pd.DataFrame(df_ctd)

# Replace DrugBank IDs with labels
df_ctd_clean['head_label'] = df_ctd_clean['head_label'].apply(lambda x: get_drug_label(x, drug_mapping))

# Replace Disease IDs with labels
df_ctd_clean['tail_label'] = df_ctd_clean['tail_label'].apply(get_disease_label)

# Drop the columns 'head_id', 'relation_id', 'tail_id'
df_ctd_clean = df_ctd_clean.drop(columns=['head_id', 'relation_id', 'tail_id'])

# Rename the columns for clarity
df_ctd_clean = df_ctd_clean.rename(columns={
    'head_label': 'Compound',
    'relation_label': 'Relation',
    'tail_label': 'Disease'
})

# Display the updated DataFrame
df_ctd_clean.nlargest(n=30, columns="score")

Unnamed: 0,Compound,Relation,Disease,score
63,Epirubicin,CtD,hematologic cancer,-4.154389
26,Methotrexate,CtD,rheumatoid arthritis,-4.750464
14,Imatinib,CtD,hematologic cancer,-4.78891
3,Thiotepa,CtD,hematologic cancer,-4.832768
62,Gemcitabine,CtD,lung cancer,-4.902373
24,Ifosfamide,CtD,hematologic cancer,-5.158857
59,Vinblastine,CtD,lung cancer,-5.21343
52,Epirubicin,CtD,pancreatic cancer,-5.238699
44,Temozolomide,CtD,brain cancer,-5.285707
66,Timolol,CtD,coronary artery disease,-5.294956


In [266]:
run_config_best_hpo: dict = load_config('config/transe-best-hpo.yaml')

In [267]:
run_config_best_hpo


{'type': 'baseline',
 'dataset': 'Hetionet',
 'experiment_name': 'TransE best parameters',
 'model': {'name': 'TransE', 'embedding_dim': 300},
 'optimizer': {'class': 'Adagrad', 'lr': 0.02},
 'train': {'loss_function': 'MarginRankingLoss',
  'num_epoch': 500,
  'num_negative': 61,
  'create_inverse': False},
 'save': {'path': 'results/baseline/transe_hetionet_best_hpo'},
 'seed': 84}

In [None]:
result_best_hpo = run_baseline(run_config_best_hpo)

In [187]:
result_best_hpo

PipelineResult(random_seed=84, model=TransE(
  (loss): MarginRankingLoss(
    (margin_activation): ReLU()
  )
  (interaction): TransEInteraction()
  (entity_representations): ModuleList(
    (0): Embedding(
      (_embeddings): Embedding(45158, 300)
    )
  )
  (relation_representations): ModuleList(
    (0): Embedding(
      (_embeddings): Embedding(24, 300)
    )
  )
  (weight_regularizers): ModuleList()
), training=TriplesFactory(num_entities=45158, num_relations=24, create_inverse_triples=False, num_triples=1800157, path="/home/david/.data/pykeen/datasets/hetionet/hetionet-v1.0-edges.sif.gz"), training_loop=<pykeen.training.slcwa.SLCWATrainingLoop object at 0x722587ce4380>, losses=[0.20368533536253564, 0.0979249069792461, 0.08039730824360601, 0.0702952014757013, 0.06372962425864462, 0.05921052526513091, 0.055881498103370134, 0.05343031228453226, 0.05128351813425926, 0.04956635025931698, 0.048323689740356716, 0.04724220059275949, 0.04611819865927948, 0.04524698816647751, 0.044422573

In [188]:
print_metrics(result_best_hpo.metric_results)


MRR: 0.07458487898111343
Hits@1: 0.014005421740289752
Hits@3: 0.08435472402453115
Hits@10: 0.1997289129855124


In [191]:
metrics_best_hpo = evaluator.evaluate(result_best_hpo.model, mapped_triples=ctd_triples)

given. This means you probably forgot to pass (at least) the training triples. Try:

    additional_filter_triples=[dataset.training.mapped_triples]

Or if you want to use the Bordes et al. (2013) approach to filtering, do:

    additional_filter_triples=[
        dataset.training.mapped_triples,
        dataset.validation.mapped_triples,
    ]

Evaluating on cuda:0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71.0/71.0 [00:00<00:00, 471triple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 0.17s seconds


In [192]:
print_metrics(metrics_best_hpo)

MRR: 0.10783362388610841
Hits@1: 0.0
Hits@3: 0.1056338028169014
Hits@10: 0.30985915492957744


In [195]:
run_config_best_hpo2: dict = load_config('config/transe-best-hpo2.yaml')

In [196]:
run_config_best_hpo2

{'type': 'baseline',
 'dataset': 'Hetionet',
 'experiment_name': 'TransE best parameters Early stopper 0.002',
 'model': {'name': 'TransE', 'embedding_dim': 300},
 'optimizer': {'class': 'Adagrad', 'lr': 0.02},
 'train': {'loss_function': 'MarginRankingLoss',
  'num_epoch': 500,
  'num_negative': 61,
  'create_inverse': False},
 'save': {'path': 'results/baseline/transe_hetionet_best_hpo2'},
 'seed': 84}

In [None]:
result_best_hpo2 = run_baseline(run_config_best_hpo2)

In [198]:
print_metrics(result_best_hpo2.metric_results)

MRR: 0.07222168147563934
Hits@1: 0.009085859034752466
Hits@3: 0.08287485556839393
Hits@10: 0.2029019642698427


In [300]:
# Make predictions on test for best_hpo2 model

metrics = evaluator.evaluate(result_best_hpo2.model, mapped_triples=dataset.testing.mapped_triples)

given. This means you probably forgot to pass (at least) the training triples. Try:

    additional_filter_triples=[dataset.training.mapped_triples]

Or if you want to use the Bordes et al. (2013) approach to filtering, do:

    additional_filter_triples=[
        dataset.training.mapped_triples,
        dataset.validation.mapped_triples,
    ]

Evaluating on cuda:0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 225k/225k [05:15<00:00, 712triple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 316.19s seconds


In [301]:
print_metrics(metrics)

MRR: 0.06745758652687073
Hits@1: 0.007819304950671051
Hits@3: 0.06488089947560217
Hits@10: 0.1891631854946227


In [211]:
pack_best_hpo2 = predict_all(model=result_best_hpo2.model, k=100000)

                                                                                                                                                                                                                                                                                                           

In [220]:
pred_best_hpo2 = pack_best_hpo2.process(factory=dataset.training)


100000

In [221]:
len(pred_best_hpo2.df)

100000

In [222]:
pred_best_hpo2_add_training = pred_best_hpo2.add_membership_columns(training=result.training)

In [223]:
pred_best_hpo2_add_training_df = pred_best_hpo2_add_training.df
len(pred_best_hpo2_add_training_df)


100000

In [224]:
# Remove rows where head_id and tail_id are the same
pred_best_hpo2_add_training_filtered = pred_best_hpo2_add_training_df[pred_best_hpo2_add_training_df['head_id'] != pred_best_hpo2_add_training_df['tail_id']]

In [225]:
pred_best_hpo2_add_training_filtered

Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label,score,in_training
90316,35393,Molecular Function::GO:0016462,14,DrD,35510,Molecular Function::GO:0016818,-9.249692,False
90317,7288,Biological Process::GO:0048858,14,DrD,4703,Biological Process::GO:0032990,-9.353173,False
90318,12021,Cellular Component::GO:0005789,14,DrD,12669,Cellular Component::GO:0042175,-9.567289,False
90319,7793,Biological Process::GO:0051707,14,DrD,2510,Biological Process::GO:0009607,-9.605646,False
90320,35510,Molecular Function::GO:0016818,7,CrC,35393,Molecular Function::GO:0016462,-9.608790,False
...,...,...,...,...,...,...,...,...
91558,3288,Biological Process::GO:0016567,7,CrC,8883,Biological Process::GO:0070647,-12.337824,False
91559,1836,Biological Process::GO:0006812,7,CrC,3834,Biological Process::GO:0030001,-12.339102,False
91560,34169,Molecular Function::GO:0003690,14,DrD,36870,Molecular Function::GO:1990837,-12.339445,False
91561,7755,Biological Process::GO:0051603,14,DrD,6120,Biological Process::GO:0043632,-12.339492,False


In [228]:
pred_best_hpo2_add_training_filtered.nlargest(n=30, columns="score")

Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label,score,in_training
90316,35393,Molecular Function::GO:0016462,14,DrD,35510,Molecular Function::GO:0016818,-9.249692,False
90317,7288,Biological Process::GO:0048858,14,DrD,4703,Biological Process::GO:0032990,-9.353173,False
90318,12021,Cellular Component::GO:0005789,14,DrD,12669,Cellular Component::GO:0042175,-9.567289,False
90319,7793,Biological Process::GO:0051707,14,DrD,2510,Biological Process::GO:0009607,-9.605646,False
90320,35510,Molecular Function::GO:0016818,7,CrC,35393,Molecular Function::GO:0016462,-9.60879,False
90321,2510,Biological Process::GO:0009607,7,CrC,7793,Biological Process::GO:0051707,-9.609782,False
90322,250,Anatomy::UBERON:0001895,14,DrD,274,Anatomy::UBERON:0002028,-9.616606,False
90323,274,Anatomy::UBERON:0002028,14,DrD,250,Anatomy::UBERON:0001895,-9.666279,False
90324,250,Anatomy::UBERON:0001895,14,DrD,320,Anatomy::UBERON:0002298,-9.675369,False
90325,2101,Biological Process::GO:0007283,14,DrD,7073,Biological Process::GO:0048232,-9.736124,False


In [235]:
pred_best_hpo2_add_training_filtered.groupby(['relation_label']).size()

relation_label
CrC    466
DrD    781
dtype: int64

In [232]:
dataset.validation

TriplesFactory(num_entities=45158, num_relations=24, create_inverse_triples=False, num_triples=225020, path="/home/david/.data/pykeen/datasets/hetionet/hetionet-v1.0-edges.sif.gz")

In [39]:
from pykeen import predict
compound_treats_arthritis = predict.predict_target(
    model=best_hpo_model,
    relation="CtD",
    tail="Disease::DOID:7148",
    triples_factory=dataset.training,
)

In [40]:
compound_treats_arthritis_df = compound_treats_arthritis.df

In [41]:
compound_treats_arthritis_df

Unnamed: 0,head_id,score,head_label
14826,14826,-18.055531,Disease::DOID:7148
13826,13826,-18.602427,Compound::DB00795
13889,13889,-18.843971,Compound::DB00860
13198,13198,-18.893045,Compound::DB00136
13670,13670,-18.908836,Compound::DB00635
...,...,...,...
39556,39556,-27.169384,Side Effect::C0015397
44648,44648,-27.176758,Side Effect::C2830004
44523,44523,-27.192657,Side Effect::C1868771
42407,42407,-27.231775,Side Effect::C0392197


In [42]:
compound_treats_arthritis_df_filtered = compound_treats_arthritis_df[compound_treats_arthritis_df['head_label'].str.contains("Compound")]


In [43]:
compound_treats_arthritis_df_filtered

Unnamed: 0,head_id,score,head_label
13826,13826,-18.602427,Compound::DB00795
13889,13889,-18.843971,Compound::DB00860
13198,13198,-18.893045,Compound::DB00136
13670,13670,-18.908836,Compound::DB00635
13447,13447,-18.920061,Compound::DB00398
...,...,...,...
14365,14365,-24.540871,Compound::DB01423
14283,14283,-24.611282,Compound::DB01288
14428,14428,-24.784145,Compound::DB01612
14686,14686,-24.802370,Compound::DB08979


In [50]:
import pandas as pd
compound_treats_arthritis = pd.DataFrame(compound_treats_arthritis_df_filtered)
# Replace DrugBank IDs with labels
compound_treats_arthritis['head_label'] = compound_treats_arthritis['head_label'].apply(lambda x: get_drug_label(x, drug_mapping))

compound_treats_arthritis

In [52]:
compound_treats_arthritis.nlargest(n=10, columns="score")

Unnamed: 0,head_id,score,head_label
13826,13826,-18.602427,Sulfasalazine
13889,13889,-18.843971,Prednisolone
13198,13198,-18.893045,Calcitriol
13670,13670,-18.908836,Prednisone
13447,13447,-18.920061,Sorafenib
13175,13175,-19.030796,Cyclosporine
14245,14245,-19.051506,Dexamethasone
13805,13805,-19.130682,Etoposide
14662,14662,-19.137554,Dabrafenib
13600,13600,-19.251726,Methotrexate


In [53]:
compound_treats_arthritis.to_csv('transe-top-10-arthritis.csv', index=False)

In [284]:
evaluation_relation_whitelist = {'CtD'}

In [293]:
ctd_config: dict = load_config('config/transe-only-ctd-relation.yaml')
ctd_config

{'type': 'baseline',
 'dataset': 'Hetionet',
 'experiment_name': 'TransE trained optimizing only Compound Treats Disease',
 'model': {'name': 'TransE', 'embedding_dim': 300},
 'optimizer': {'class': 'Adagrad', 'lr': 0.02},
 'train': {'loss_function': 'MarginRankingLoss',
  'num_epoch': 500,
  'num_negative': 61,
  'create_inverse': False,
  'evaluation_relation_whitelist': ['CtD']},
 'save': {'path': 'results/baseline/transe_hetionet_only_ctd_relation'},
 'seed': 84}

In [294]:
set(ctd_config['train']['evaluation_relation_whitelist'])

{'CtD'}

In [None]:
result_ctd = run_baseline(ctd_config)

In [296]:
print_metrics(result_ctd.metric_results)

MRR: 0.0709473192691803
Hits@1: 0.0
Hits@3: 0.0625
Hits@10: 0.19444444444444445


In [304]:
dataset.training

TriplesFactory(num_entities=45158, num_relations=24, create_inverse_triples=False, num_triples=1800157, path="/home/david/.data/pykeen/datasets/hetionet/hetionet-v1.0-edges.sif.gz")

In [309]:
# Run model with config from checkpoint
def run_pipeline(config: dict, checkpoint_name):

    pipeline_result = pipeline(
        dataset=config["dataset"],
        dataset_kwargs={
            "random_state": config["seed"],
            "create_inverse_triples": config["train"]["create_inverse"],
        },
        model=config["model"]["name"],
        model_kwargs={
            "embedding_dim": config["model"]["embedding_dim"],
            "random_seed": config["seed"],
        },
        training_loop="sLCWA",
        training_kwargs={
            "num_epochs": config["train"]["num_epoch"],
            "checkpoint_name": checkpoint_name
        },
        optimizer=config["optimizer"]["class"],
        optimizer_kwargs={"lr": config["optimizer"]["lr"]},
        negative_sampler_kwargs={
            "num_negs_per_pos": config["train"]["num_negative"],
        },
        stopper='early',
        stopper_kwargs=dict(frequency=10, patience=3, relative_delta=0.002),
        random_seed=config["seed"],
        evaluator_kwargs={"filtered": True},
        use_testing_data=False,
        result_tracker='mlflow',
        result_tracker_kwargs=dict(
        tracking_uri='http://localhost:5000',
        experiment_name=config["experiment_name"],
    ),
    )
    
    pipeline_result.save_to_directory(config["save"]["path"])
    return pipeline_result

In [6]:
import torch
best_hpo_model = torch.load('results/baseline/transe_hetionet_best_hpo2/trained_model.pkl')

In [14]:
# Make predictions on the test triples
from pykeen.evaluation import RankBasedEvaluator
evaluator = RankBasedEvaluator()
metrics_testing = evaluator.evaluate(best_hpo_model, mapped_triples=dataset.testing.mapped_triples)

The filtered setting was enabled, but there were no `additional_filter_triples`
given. This means you probably forgot to pass (at least) the training triples. Try:

    additional_filter_triples=[dataset.training.mapped_triples]

Or if you want to use the Bordes et al. (2013) approach to filtering, do:

    additional_filter_triples=[
        dataset.training.mapped_triples,
        dataset.validation.mapped_triples,
    ]

Evaluating on cuda:0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 225k/225k [05:46<00:00, 650triple/s]


In [16]:
print_metrics(metrics_testing)

MRR: 0.06745758652687073
Hits@1: 0.007819304950671051
Hits@3: 0.06488089947560217
Hits@10: 0.1891631854946227


In [18]:
# Let's predict the top 100k predictions
from pykeen.predict import predict_all
pack_top_100k = predict_all(model=best_hpo_model, k=100000)


predict is an expensive operation, involving 48,941,879,136 score evaluations.
                                                                                                                                                                                                                                                                                                           

AttributeError: 'TransE' object has no attribute 'process'

In [21]:
predictions_top_100k = pack_top_100k.process(factory=dataset.training)

In [22]:
predictions_top_100k.df.nlargest(n=50, columns="score")

Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label,score
0,595,Biological Process::GO:0001579,14,DrD,595,Biological Process::GO:0001579,-7.733004
1,10767,Biological Process::GO:1903034,14,DrD,10767,Biological Process::GO:1903034,-7.733004
2,14708,Compound::DB09020,14,DrD,14708,Compound::DB09020,-7.733004
3,14983,Gene::100287178,14,DrD,14983,Gene::100287178,-7.733004
4,17412,Gene::1376,14,DrD,17412,Gene::1376,-7.733004
5,17726,Gene::1456,14,DrD,17726,Gene::1456,-7.733004
6,22560,Gene::338442,14,DrD,22560,Gene::338442,-7.733004
7,25641,Gene::5207,14,DrD,25641,Gene::5207,-7.733004
8,25815,Gene::53822,14,DrD,25815,Gene::53822,-7.733004
9,26044,Gene::54596,14,DrD,26044,Gene::54596,-7.733004


In [23]:
# Let's filter the triples from the train dataset

predictions_top_100k_filtered = predictions_top_100k.filter_triples(dataset.training)

In [25]:
len(predictions_top_100k_filtered.df)

99999

In [27]:
predictions_top_100k_filtered_df = predictions_top_100k_filtered.df

In [28]:
# Remove rows where head_id and tail_id are the same (reflexive)
predictions_top_100k_filtered_no_reflexive= predictions_top_100k_filtered_df[predictions_top_100k_filtered_df['head_id'] != predictions_top_100k_filtered_df['tail_id']]

In [29]:
predictions_top_100k_filtered_no_reflexive

Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label,score
90316,35393,Molecular Function::GO:0016462,14,DrD,35510,Molecular Function::GO:0016818,-9.249692
90317,7288,Biological Process::GO:0048858,14,DrD,4703,Biological Process::GO:0032990,-9.353173
90318,12021,Cellular Component::GO:0005789,14,DrD,12669,Cellular Component::GO:0042175,-9.567289
90319,7793,Biological Process::GO:0051707,14,DrD,2510,Biological Process::GO:0009607,-9.605646
90320,35510,Molecular Function::GO:0016818,7,CrC,35393,Molecular Function::GO:0016462,-9.608790
...,...,...,...,...,...,...,...
91558,3288,Biological Process::GO:0016567,7,CrC,8883,Biological Process::GO:0070647,-12.337824
91559,1836,Biological Process::GO:0006812,7,CrC,3834,Biological Process::GO:0030001,-12.339102
91560,34169,Molecular Function::GO:0003690,14,DrD,36870,Molecular Function::GO:1990837,-12.339445
91561,7755,Biological Process::GO:0051603,14,DrD,6120,Biological Process::GO:0043632,-12.339492


In [34]:
top30_filtered = predictions_top_100k_filtered_no_reflexive.nlargest(n=30, columns="score")

In [35]:
top30_filtered = top30_filtered.drop(['head_id', 'relation_id', 'tail_id'], axis=1)

In [36]:
top30_filtered

Unnamed: 0,head_label,relation_label,tail_label,score
90316,Molecular Function::GO:0016462,DrD,Molecular Function::GO:0016818,-9.249692
90317,Biological Process::GO:0048858,DrD,Biological Process::GO:0032990,-9.353173
90318,Cellular Component::GO:0005789,DrD,Cellular Component::GO:0042175,-9.567289
90319,Biological Process::GO:0051707,DrD,Biological Process::GO:0009607,-9.605646
90320,Molecular Function::GO:0016818,CrC,Molecular Function::GO:0016462,-9.60879
90321,Biological Process::GO:0009607,CrC,Biological Process::GO:0051707,-9.609782
90322,Anatomy::UBERON:0001895,DrD,Anatomy::UBERON:0002028,-9.616606
90323,Anatomy::UBERON:0002028,DrD,Anatomy::UBERON:0001895,-9.666279
90324,Anatomy::UBERON:0001895,DrD,Anatomy::UBERON:0002298,-9.675369
90325,Biological Process::GO:0007283,DrD,Biological Process::GO:0048232,-9.736124


In [37]:
top30_filtered.to_csv('transe-top-30-global.csv', index=False)