In [1]:
import torch
import pandas as pd
import numpy as np
import os

# PyKEEN core
import pykeen
from pykeen.pipeline import pipeline
from pykeen.datasets import Nations
from pykeen.triples import TriplesFactory
from pykeen.evaluation import RankBasedEvaluator
import math
from pykeen import predict
device = "cuda" if torch.cuda.is_available() else "cpu"


from pykeen.models import TransE


  from .autonotebook import tqdm as notebook_tqdm


## Train Model

In [4]:

# 1. TRAIN THE TransE model 
triples_path = "../../abox_export.tsv"
tf = TriplesFactory.from_path(triples_path)
train_tf, test_tf = tf.split([0.8, 0.2])

# Train a TransE model
result = pipeline(
    model='TransE',
    training=train_tf,
    testing=test_tf,
    model_kwargs=dict(embedding_dim=256),
    training_kwargs=dict(num_epochs=25),
    negative_sampler_kwargs=dict(
        num_negs_per_pos=3,
    ),
    optimizer_kwargs=dict(lr=0.01),
    random_seed=42,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [19931, 7193]
INFO:pykeen.pipeline.api:Using device: cpu
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
Training epochs on cpu: 100%|██████████| 25/25 [00:40<00:00,  1.62s/epoch, loss=0.369, prev_loss=0.364]
Evaluating on cpu: 100%|██████████| 7.19k/7.19k [01:33<00:00, 76.6triple/s]
INFO:pykeen.evaluation.evaluator:Evaluation took 93.98s seconds


## Citing Paper and Author prediction

In [6]:
relation_to_id

{'<http://SDM.org/research/aboutTopic>': 0,
 '<http://SDM.org/research/cites>': 1,
 '<http://SDM.org/research/hasAbstract>': 2,
 '<http://SDM.org/research/hasAuthor>': 3,
 '<http://SDM.org/research/hasContent>': 4,
 '<http://SDM.org/research/hasCorrespondingAuthor>': 5,
 '<http://SDM.org/research/hasEdition>': 6,
 '<http://SDM.org/research/hasKeywords>': 7,
 '<http://SDM.org/research/hasNumber>': 8,
 '<http://SDM.org/research/hasProceedings>': 9,
 '<http://SDM.org/research/hasReview>': 10,
 '<http://SDM.org/research/hasVolume>': 11,
 '<http://SDM.org/research/heldIn>': 12,
 '<http://SDM.org/research/heldOn>': 13,
 '<http://SDM.org/research/includesPaper>': 14,
 '<http://SDM.org/research/performedBy>': 15,
 '<http://SDM.org/research/publishedIn>': 16,
 '<http://SDM.org/research/volumeYear>': 17,
 '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>': 18,
 '<http://www.w3.org/2000/01/rdf-schema#domain>': 19,
 '<http://www.w3.org/2000/01/rdf-schema#range>': 20,
 '<http://www.w3.org/2000/01/

In [13]:
import torch

model = result.model
entity_to_id = tf.entity_to_id
relation_to_id = tf.relation_to_id
id_to_entity = {v: k for k, v in entity_to_id.items()}

# Step 1: Extract all triples from the full triples factory (training + test)
all_triples = tf.triples

# Step 2: Build sets of paper and author entity IDs by looking at the relations
author_entities = set()
paper_entities = set()

for head, rel, tail in all_triples:
    if rel == '<http://SDM.org/research/hasAuthor>':
        # Head is paper, tail is author
        if head in entity_to_id:
            paper_entities.add(entity_to_id[head])
        if tail in entity_to_id:
            author_entities.add(entity_to_id[tail])
    elif rel == '<http://SDM.org/research/cites>':
        # Both head and tail are papers
        if head in entity_to_id:
            paper_entities.add(entity_to_id[head])
        if tail in entity_to_id:
            paper_entities.add(entity_to_id[tail])

# Step 3: Pick a test triple (paper, relation, entity)
test_triple = test_tf.triples[0]
paper_id = test_triple[0]
print("Test Paper:", paper_id)

device = next(model.parameters()).device
paper_idx = entity_to_id[paper_id]
cites_idx = relation_to_id['<http://SDM.org/research/cites>']
author_idx = relation_to_id['<http://SDM.org/research/hasAuthor>']

# Step 4: Get embeddings for the paper and relations
paper_emb = model.entity_representations[0](torch.tensor([paper_idx], device=device))
cites_emb = model.relation_representations[0](torch.tensor([cites_idx], device=device))
author_emb = model.relation_representations[0](torch.tensor([author_idx], device=device))

all_entities_emb = model.entity_representations[0]().detach()

# --- Predict cited paper (head + relation ≈ tail) ---
expected_cited = paper_emb + cites_emb
distances = torch.norm(all_entities_emb - expected_cited, dim=1)

# Filter out self and non-paper entities
distances[paper_idx] = float('inf')  # exclude self
for idx in range(len(distances)):
    if idx not in paper_entities:
        distances[idx] = float('inf')

predicted_cited_idx = torch.argmin(distances).item()
print("Predicted Cited Paper:", id_to_entity[predicted_cited_idx])

# --- Predict author (paper - relation ≈ author) ---
expected_author = paper_emb - author_emb
distances = torch.norm(all_entities_emb - expected_author, dim=1)

# Filter out non-author entities
for idx in range(len(distances)):
    if idx not in author_entities:
        distances[idx] = float('inf')

predicted_author_idx = torch.argmin(distances).item()
print("Predicted Author:", id_to_entity[predicted_author_idx])




Test Paper: <http://SDM.org/research/MAGIC_Microlensing_Analysis_Guided_by_Intelligent_Computation>
Predicted Cited Paper: <http://SDM.org/research/Identifying_multi-target_drugs_for_prostate_cancer_using_machine_learning-assisted_transcriptomic_analysis>
Predicted Author: <http://SDM.org/research/Hugo_Richard>
