In [1]:
# Define Neo4j connections
from neo4j import GraphDatabase
import pandas as pd
from pykeen.triples import TriplesFactory
from pykeen.predict import predict_target
from pykeen.pipeline import pipeline


host = 'bolt://localhost:7687'
user = 'neo4j'
password = '12345678'
driver = GraphDatabase.driver(host,auth=(user, password))
                                         

def run_query(query, params={}):
    with driver.session() as session:
        result = session.run(query, params)
        return pd.DataFrame([r.values() for r in result], columns=result.keys())


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def diff_models(model_name,node,target_node,entity_name,links):
    """
    input: 
        model_name: name of the model
        node: name of the source node
        target_node: name of the target node
        entity_name: name of the entity
        links: name of the relation
    output:
        push the predicted links to the graph with the new relation named as predicted_{links}_{model_name}
    """
    data_query = f"""
    MATCH (g)-[r]->(d)
    RETURN toString(id(g)) as source, toString(id(d)) AS target, type(r) as type
    """
    #Getting the data from neo4j
    data  = run_query(data_query)
    #Creating the triples factory
    tf = TriplesFactory.from_labeled_triples(
    data[["source", "type", "target"]].values,
    create_inverse_triples=False,
    entity_to_id=None,
    relation_to_id=None,
    compact_id=False,
    filter_out_candidate_inverse_relations=True,
    metadata=None,
    )
    #Splitting the data
    training, testing, validation = tf.split([.8, .1, .1])
    #Running the pipeline
    result = pipeline(
    training=training,
    testing=testing,
    validation=validation,
    model=model_name,
    stopper='early',
    epochs=20,
    dimensions=512,
    random_seed=420
    )
    predicted_link = "predicted_"+links + "_"+model_name
    #Getting the id
    id_query = f"""
    MATCH (s:{node})
    WHERE s.name = '{entity_name}'
    RETURN toString(id(s)) as id
    """
    id = run_query(id_query)['id'][0]
    pred_transr_gd = predict_target(model = result.model,head=id,relation= links, triples_factory=result.training)
    df_pred_transr_gd = pred_transr_gd.df
    candidate_nodes = df_pred_transr_gd.head(5)['tail_label'].to_list()
    push_query = f"""
    MATCH (n)
    WHERE id(n) = toInteger($compound_id)
    UNWIND $candidates as ca
    MATCH (c)
    WHERE id(c) = toInteger(ca)
    MERGE (n)-[:{predicted_link}]->(c)
    """
    run_query(push_query, {'compound_id':id, 'candidates': candidate_nodes})
    return result,testing,training,validation, predicted_link,pred_transr_gd

In [3]:
def get_difference_entity(original_link,predicted_link,entity_name,source_node,target_node): 
    db_original_query = f"""
        MATCH (g:{source_node})-[:{original_link}]-(d:{target_node})
        WHERE g.name = '{entity_name}'
        RETURN toString(g.name) as source, toString(d.name) AS target, "{original_link}" as type
        """
    db_predicted_query = f"""
        MATCH (g:{source_node})-[:{predicted_link}]-(d:{target_node})
        WHERE g.name = '{entity_name}'
        RETURN toString(g.name) as source, toString(d.name) AS target, "{predicted_link}" as type
        """
    #Getting the data from neo4j
    original  = run_query(db_original_query)
    predicted = run_query(db_predicted_query)
    difference = predicted[~predicted['target'].isin(original['target'])]['target']
    return difference

In [4]:
result,testing,training,validation,predicted_link,pred_transr_gd = diff_models('RotatE','Compound','Gene','Methotrexate','binds')

using automatically assigned random_state=1003271739
Training epochs on cuda:0:  45%|████▌     | 9/20 [01:49<01:57, 10.73s/epoch, loss=0.141, prev_loss=0.154]INFO:pykeen.evaluation.evaluator:Starting batch_size search for evaluation now...
INFO:pykeen.evaluation.evaluator:Concluded batch_size search with batch_size=128.
INFO:pykeen.evaluation.evaluator:Evaluation took 47.56s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 10: 0.0826390372427544. Saved model weights to /home/jacky/.data/pykeen/checkpoints/best-model-weights-e4209e77-432f-404f-b9ab-d39e01ff0457.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 10.
Training epochs on cuda:0:  95%|█████████▌| 19/20 [04:30<00:11, 11.55s/epoch, loss=0.0788, prev_loss=0.0822]INFO:pykeen.evaluation.evaluator:Evaluation took 47.57s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 20: 0.09250160222174748. Saved model weights to /home/jacky/.data/pykeen/checkpoints/best-

In [5]:
pred_transr_gd.df

Unnamed: 0,tail_id,score,tail_label
2136,2136,-4.211718,12180
9223,9223,-4.493398,1946
2288,2288,-4.624982,1234
1789,1789,-4.652893,1184
17311,17311,-4.682837,7342
...,...,...,...
17625,17625,-8.746363,7664
7297,7297,-8.807863,17477
18264,18264,-8.846934,8306
15499,15499,-8.853883,5487


In [6]:
print(get_difference_entity('binds',predicted_link,'Methotrexate','Compound','Gene'))

1             CYP3A4
2             CYP2C9
3    CYP3A7-CYP3A51P
4            SLC22A1
Name: target, dtype: object


In [7]:
from pykeen.evaluation import RankBasedEvaluator

# Create an evaluator
evaluator = RankBasedEvaluator()

# Evaluate the model
metrics = evaluator.evaluate(result.model, testing.mapped_triples, additional_filter_triples=[training.mapped_triples, validation.mapped_triples])


INFO:pykeen.evaluation.evaluator:Starting batch_size search for evaluation now...
INFO:pykeen.evaluation.evaluator:Concluded batch_size search with batch_size=128.
Evaluating on cuda:0: 100%|██████████| 56.2k/56.2k [00:47<00:00, 1.18ktriple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 49.42s seconds


In [8]:
print(f"Hits@1: {metrics.get_metric('hits@1')}")
print(f"Hits@3: {metrics.get_metric('hits@3')}")
print(f"Hits@5: {metrics.get_metric('hits@5')}")
print(f"Hits@10: {metrics.get_metric('hits@10')}")
print(f"Mean Reciprocal Rank: {metrics.get_metric('mean_reciprocal_rank')}")

Hits@1: 0.02302748700420138
Hits@3: 0.048627430036317025
Hits@5: 0.06611835077974791
Hits@10: 0.0978156376842555
Mean Reciprocal Rank: 0.05043206736445427


In [10]:
result,testing,training,validation,predicted_link,pred_transr_gd =diff_models('TransE','Compound','Gene','Methotrexate','binds')

INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [430292, 56173, 56173]
INFO:pykeen.pipeline.api:Using device: None
INFO:pykeen.stoppers.early_stopping:Inferred checkpoint path for best model weights: /home/jacky/.data/pykeen/checkpoints/best-model-weights-67d40b0e-89ae-4e5f-b9ca-79a978683217.pt
Training epochs on cuda:0:  45%|████▌     | 9/20 [01:13<01:22,  7.53s/epoch, loss=0.165, prev_loss=0.171]INFO:pykeen.evaluation.evaluator:Starting batch_size search for evaluation now...
INFO:pykeen.evaluation.evaluator:Concluded batch_size search with batch_size=1024.
INFO:pykeen.evaluation.evaluator:Evaluation took 17.58s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 10: 0.032373204208427535. Saved model weights to /home/jacky/.data/pykeen/checkpoints/best-model-weights-67d40b0e-89ae-4e5f-b9ca-79a978683217.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 10.
Training epochs on cuda:0:  95%|█████████▌| 19/20 [02

In [11]:
from pykeen.evaluation import RankBasedEvaluator

# Create an evaluator
evaluator = RankBasedEvaluator()

# Evaluate the model
metrics = evaluator.evaluate(result.model, testing.mapped_triples, additional_filter_triples=[training.mapped_triples, validation.mapped_triples])

INFO:pykeen.evaluation.evaluator:Starting batch_size search for evaluation now...
INFO:pykeen.evaluation.evaluator:Concluded batch_size search with batch_size=1024.
Evaluating on cuda:0: 100%|██████████| 56.2k/56.2k [00:16<00:00, 3.34ktriple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 18.85s seconds


In [12]:
print(f"Hits@1: {metrics.get_metric('hits@1')}")
print(f"Hits@3: {metrics.get_metric('hits@3')}")
print(f"Hits@5: {metrics.get_metric('hits@5')}")
print(f"Hits@10: {metrics.get_metric('hits@10')}")
print(f"Mean Reciprocal Rank: {metrics.get_metric('mean_reciprocal_rank')}")

Hits@1: 0.0035604293877841667
Hits@3: 0.01387677353888879
Hits@5: 0.023089384579780322
Hits@10: 0.04090933366564008
Mean Reciprocal Rank: 0.019052209332585335


In [13]:
print(get_difference_entity('binds',predicted_link,'Methotrexate','Compound','Gene'))

0    CYP1A2
1    CYP3A5
2      BCHE
4    ADRA2A
5    CYP2C9
6    CYP1A1
7    GABRA2
8    CHRNA2
Name: target, dtype: object


In [14]:
result,testing,training,validation,predicted_link,pred_transr_gd =diff_models('SimplE','Compound','Gene','Methotrexate','binds')

INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [430296, 56173, 56173]
INFO:pykeen.pipeline.api:Using device: None
INFO:pykeen.stoppers.early_stopping:Inferred checkpoint path for best model weights: /home/jacky/.data/pykeen/checkpoints/best-model-weights-a9da4856-85db-48c3-ba35-3c2ed27d7722.pt
Training epochs on cuda:0:  45%|████▌     | 9/20 [02:06<02:23, 13.05s/epoch, loss=0.602, prev_loss=0.702]INFO:pykeen.evaluation.evaluator:Starting batch_size search for evaluation now...
INFO:pykeen.evaluation.evaluator:Concluded batch_size search with batch_size=256.
INFO:pykeen.evaluation.evaluator:Evaluation took 47.43s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 10: 0.002429993057162694. Saved model weights to /home/jacky/.data/pykeen/checkpoints/best-model-weights-a9da4856-85db-48c3-ba35-3c2ed27d7722.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 10.
Training epochs on cuda:0:  95%|█████████▌| 19/20 [05:

In [15]:
from pykeen.evaluation import RankBasedEvaluator

# Create an evaluator
evaluator = RankBasedEvaluator()

# Evaluate the model
metrics = evaluator.evaluate(result.model, testing.mapped_triples, additional_filter_triples=[training.mapped_triples, validation.mapped_triples])

INFO:pykeen.evaluation.evaluator:Starting batch_size search for evaluation now...
INFO:pykeen.evaluation.evaluator:Concluded batch_size search with batch_size=256.
Evaluating on cuda:0: 100%|██████████| 56.2k/56.2k [00:47<00:00, 1.19ktriple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 49.12s seconds


In [16]:
print(f"Hits@1: {metrics.get_metric('hits@1')}")
print(f"Hits@3: {metrics.get_metric('hits@3')}")
print(f"Hits@5: {metrics.get_metric('hits@5')}")
print(f"Hits@10: {metrics.get_metric('hits@10')}")
print(f"Mean Reciprocal Rank: {metrics.get_metric('mean_reciprocal_rank')}")

Hits@1: 0.0010592277428657895
Hits@3: 0.0036583411959482314
Hits@5: 0.005945917077599559
Hits@10: 0.010779199971516565
Mean Reciprocal Rank: 0.0058013666421175


In [17]:
print(get_difference_entity('binds',predicted_link,'Methotrexate','Compound','Gene'))

0     CHST3
1      NRAS
2     NT5C2
3    ZNF100
4      CHN1
Name: target, dtype: object


In [18]:
result,testing,training,validation,predicted_link,pred_transr_gd =diff_models('RotatE','Disease','Gene','colon cancer','associates')

INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [430299, 56173, 56174]
INFO:pykeen.pipeline.api:Using device: None
INFO:pykeen.stoppers.early_stopping:Inferred checkpoint path for best model weights: /home/jacky/.data/pykeen/checkpoints/best-model-weights-553f5a7c-e1a8-4aed-aa74-46eeedbc95a9.pt
Training epochs on cuda:0:  45%|████▌     | 9/20 [01:47<01:55, 10.46s/epoch, loss=0.14, prev_loss=0.154] INFO:pykeen.evaluation.evaluator:Starting batch_size search for evaluation now...
INFO:pykeen.evaluation.evaluator:Concluded batch_size search with batch_size=128.
INFO:pykeen.evaluation.evaluator:Evaluation took 46.98s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 10: 0.0834460782568448. Saved model weights to /home/jacky/.data/pykeen/checkpoints/best-model-weights-553f5a7c-e1a8-4aed-aa74-46eeedbc95a9.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 10.
Training epochs on cuda:0:  95%|█████████▌| 19/20 [04:24

In [19]:
from pykeen.evaluation import RankBasedEvaluator

# Create an evaluator
evaluator = RankBasedEvaluator()

# Evaluate the model
metrics = evaluator.evaluate(result.model, testing.mapped_triples, additional_filter_triples=[training.mapped_triples, validation.mapped_triples])

INFO:pykeen.evaluation.evaluator:Starting batch_size search for evaluation now...
INFO:pykeen.evaluation.evaluator:Concluded batch_size search with batch_size=128.
Evaluating on cuda:0: 100%|██████████| 56.2k/56.2k [00:47<00:00, 1.19ktriple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 49.24s seconds


In [20]:
print(f"Hits@1: {metrics.get_metric('hits@1')}")
print(f"Hits@3: {metrics.get_metric('hits@3')}")
print(f"Hits@5: {metrics.get_metric('hits@5')}")
print(f"Hits@10: {metrics.get_metric('hits@10')}")
print(f"Mean Reciprocal Rank: {metrics.get_metric('mean_reciprocal_rank')}")

Hits@1: 0.02240400192263187
Hits@3: 0.048938101935093375
Hits@5: 0.067122995033201
Hits@10: 0.10007476901714346
Mean Reciprocal Rank: 0.0505916066467762


In [21]:
print(get_difference_entity('associates',predicted_link,'colon cancer','Disease','Gene'))

2    BIRC5
3      FAS
4     RGS2
Name: target, dtype: object


In [22]:
result,testing,training,validation,predicted_link,pred_transr_gd =diff_models('TransE','Disease','Gene','colon cancer','associates')

INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [430304, 56174, 56174]
INFO:pykeen.pipeline.api:Using device: None
INFO:pykeen.stoppers.early_stopping:Inferred checkpoint path for best model weights: /home/jacky/.data/pykeen/checkpoints/best-model-weights-2c1657cf-4726-47ec-8930-e75d81a5125a.pt
Training epochs on cuda:0:  45%|████▌     | 9/20 [01:09<01:17,  7.05s/epoch, loss=0.165, prev_loss=0.17] INFO:pykeen.evaluation.evaluator:Starting batch_size search for evaluation now...
INFO:pykeen.evaluation.evaluator:Concluded batch_size search with batch_size=1024.
INFO:pykeen.evaluation.evaluator:Evaluation took 17.32s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 10: 0.03217680777583936. Saved model weights to /home/jacky/.data/pykeen/checkpoints/best-model-weights-2c1657cf-4726-47ec-8930-e75d81a5125a.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 10.
Training epochs on cuda:0:  95%|█████████▌| 19/20 [02:

In [23]:
from pykeen.evaluation import RankBasedEvaluator

# Create an evaluator
evaluator = RankBasedEvaluator()

# Evaluate the model
metrics = evaluator.evaluate(result.model, testing.mapped_triples, additional_filter_triples=[training.mapped_triples, validation.mapped_triples])

INFO:pykeen.evaluation.evaluator:Starting batch_size search for evaluation now...
INFO:pykeen.evaluation.evaluator:Concluded batch_size search with batch_size=1024.
Evaluating on cuda:0: 100%|██████████| 56.2k/56.2k [00:16<00:00, 3.49ktriple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 18.11s seconds


In [24]:
print(f"Hits@1: {metrics.get_metric('hits@1')}")
print(f"Hits@3: {metrics.get_metric('hits@3')}")
print(f"Hits@5: {metrics.get_metric('hits@5')}")
print(f"Hits@10: {metrics.get_metric('hits@10')}")
print(f"Mean Reciprocal Rank: {metrics.get_metric('mean_reciprocal_rank')}")

Hits@1: 0.0042368355466942
Hits@3: 0.014277067682557766
Hits@5: 0.02270623420087585
Hits@10: 0.04040125324883398
Mean Reciprocal Rank: 0.019445380195975304


In [25]:
print(get_difference_entity('associates',predicted_link,'colon cancer','Disease','Gene'))

0      APOE
1    NFKBIA
2       WT1
3       CRP
4       AGT
Name: target, dtype: object


In [26]:
result,testing,training,validation,predicted_link,pred_transr_gd =diff_models('SimplE','Disease','Gene','colon cancer','associates')

INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [430308, 56174, 56175]
INFO:pykeen.pipeline.api:Using device: None
INFO:pykeen.stoppers.early_stopping:Inferred checkpoint path for best model weights: /home/jacky/.data/pykeen/checkpoints/best-model-weights-4d550e7e-0875-4e60-9594-0f969b601331.pt
Training epochs on cuda:0:  45%|████▌     | 9/20 [02:00<02:08, 11.70s/epoch, loss=0.584, prev_loss=0.688]INFO:pykeen.evaluation.evaluator:Starting batch_size search for evaluation now...
INFO:pykeen.evaluation.evaluator:Concluded batch_size search with batch_size=256.
INFO:pykeen.evaluation.evaluator:Evaluation took 44.98s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 10: 0.0024388072986203826. Saved model weights to /home/jacky/.data/pykeen/checkpoints/best-model-weights-4d550e7e-0875-4e60-9594-0f969b601331.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 10.
Training epochs on cuda:0:  95%|█████████▌| 19/20 [04

In [27]:
from pykeen.evaluation import RankBasedEvaluator

# Create an evaluator
evaluator = RankBasedEvaluator()

# Evaluate the model
metrics = evaluator.evaluate(result.model, testing.mapped_triples, additional_filter_triples=[training.mapped_triples, validation.mapped_triples])

INFO:pykeen.evaluation.evaluator:Starting batch_size search for evaluation now...


INFO:pykeen.evaluation.evaluator:Concluded batch_size search with batch_size=256.
Evaluating on cuda:0: 100%|██████████| 56.2k/56.2k [00:47<00:00, 1.17ktriple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 50.00s seconds


In [28]:
print(f"Hits@1: {metrics.get_metric('hits@1')}")
print(f"Hits@3: {metrics.get_metric('hits@3')}")
print(f"Hits@5: {metrics.get_metric('hits@5')}")
print(f"Hits@10: {metrics.get_metric('hits@10')}")
print(f"Mean Reciprocal Rank: {metrics.get_metric('mean_reciprocal_rank')}")

Hits@1: 0.0010859116317157404
Hits@3: 0.003480257770498807
Hits@5: 0.005545270053761527
Hits@10: 0.010316160501299534
Mean Reciprocal Rank: 0.005759171210229398


In [29]:
print(get_difference_entity('associates',predicted_link,'colon cancer','Disease','Gene'))

0      ABHD3
1    ALDH3B2
2     COL1A1
3      SEL1L
4      TRAP1
Name: target, dtype: object
