### 1. Framework imports

In [2]:
import os
import time
import json
import argparse

import torch
import numpy as np

from pykeen.pipeline import pipeline
from pykeen.datasets import get_dataset


### 2. Defining function for training and testing

In [None]:
def train_and_measure(
    model_name: str,
    dataset_name: str, 
    epochs: int = 100,
    batch_size: int = 256,
    device: str = 'cuda',
    inference_batch_size: int = 1024,
    output_dir: str = "./pykeen_results"):
    os.makedirs(output_dir, exist_ok=True)

    print(f"Training model={model_name!r} on dataset={dataset_name!r} | epochs={epochs} | batch_size={batch_size} | device={device}")

    # 1) TREINO: tempo "wall-clock" medido externamente + pipeline() (que também retorna train_seconds)
    t0 = time.perf_counter()
    result = pipeline(
        model=model_name,
        dataset=dataset_name,
        epochs=epochs,  # atalho para training_kwargs['num_epochs']
        device=device,  # aceita 'cuda' / 'cpu' / torch.device
        training_kwargs=dict(batch_size=batch_size, use_tqdm_batch=False),  # use_tqdm_batch=False para logs mais limpos
    )
    t1 = time.perf_counter()
    wallclock_train_seconds = t1 - t0

    # pipeline returns PipelineResult; try extrair os tempos informados internamente
    train_seconds_reported = getattr(result, "train_seconds", None)
    evaluate_seconds_reported = getattr(result, "evaluat    e_seconds", None)

    print(f"Wall-clock total (pipeline call) = {wallclock_train_seconds:.3f} s")
    if train_seconds_reported is not None:
        print(f"PipelineResult.train_seconds = {train_seconds_reported:.3f} s")
    if evaluate_seconds_reported is not None:
        print(f"PipelineResult.evaluate_seconds = {evaluate_seconds_reported:.3f} s")
    
    # 2) PEGAR MODELO E DATASET (para inferência)
    model = result.model  # modelo já treinado
    # garantir que temos o dataset instanciado para acessar testing split
    dataset = get_dataset(dataset_name) if isinstance(dataset_name, str) else dataset_name
    if getattr(dataset, "testing", None) is None:
        raise RuntimeError("O dataset não expõe uma divisão 'testing' -- verifique o dataset ou passe training/testing separadamente.")

    testing_mapped = dataset.testing.mapped_triples  # numpy array shape (n_triples, 3)
    n_test = int(testing_mapped.shape[0])
    print(f"Test triples: {n_test}")

    # 3) INFERÊNCIA: medir tempo para pontuar TODOS os triples de teste em batches
    # mover os tensores para o mesmo device do modelo
    model_device = getattr(model, "device", torch.device("cpu"))
    if isinstance(model_device, str):
        model_device = torch.device(model_device)
    device_name = model_device

    # torch tensor com índices (long)
    triples_tensor = torch.tensor(testing_mapped, dtype=torch.long, device=device_name)

    # iterar em batches e chamar model.score_hrt(...) para forçar forward pass
    print(f"Measuring inference (scoring) on device {device_name} with batch_size={inference_batch_size} ...")
    inference_batch = inference_batch_size
    infer_t0 = time.perf_counter()
    with torch.no_grad():
        for i in range(0, n_test, inference_batch):
            batch_triples = triples_tensor[i : i + inference_batch]  # shape (b,3)
            scores = model.score_hrt(batch_triples)  # tensor (b,1) ou (b,)
            # materialize para forçar computação (move para CPU e numpy)
            _ = scores.cpu().numpy()
    infer_t1 = time.perf_counter()
    inference_seconds = infer_t1 - infer_t0
    throughput = n_test / inference_seconds if inference_seconds > 0 else float("inf")

    print(f"Inference total = {inference_seconds:.3f} s | avg per triple = {inference_seconds / n_test:.6f} s | throughput = {throughput:.2f} triples/s")

In [22]:
train_and_measure(
        model_name='ComplEx',
        dataset_name='FB15k-237',
        output_dir='/pykeen_results',
    )

INFO:pykeen.datasets.utils:Caching preprocessed dataset to file:///C:/Users/grkremer/.data/pykeen/datasets/fb15k237/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
INFO:pykeen.datasets.base:downloading data from https://download.microsoft.com/download/8/7/0/8700516A-AB3D-4850-B4BB-805C515AECE1/FB15K-237.2.zip to C:\Users\grkremer\.data\pykeen\datasets\fb15k237\FB15K-237.2.zip


Training model='ComplEx' on dataset='FB15k-237' | epochs=100 | batch_size=256 | device=cuda


Downloading FB15K-237.2.zip: 0.00B [00:00, ?B/s]

INFO:pykeen.triples.triples_factory:Stored TriplesFactory(num_entities=14505, num_relations=237, create_inverse_triples=False, num_triples=272115, path=Release\train.txt) to file:///C:/Users/grkremer/.data/pykeen/datasets/fb15k237/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/training
INFO:pykeen.datasets.base:Stored training factory to file:///C:/Users/grkremer/.data/pykeen/datasets/fb15k237/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/training
INFO:pykeen.triples.triples_factory:Stored TriplesFactory(num_entities=14505, num_relations=237, create_inverse_triples=False, num_triples=20438, path=Release\test.txt) to file:///C:/Users/grkremer/.data/pykeen/datasets/fb15k237/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/testing
INFO:pykeen.datasets.base:Stored testing factory to file:///C:/Users/grkremer/.data/pykeen/datasets/fb15k237/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/testing
INFO:pykeen.triples.triples_factory:Stored TriplesFactory(num_entities=14505, num_relations=237, create_inverse_triples=False, num_

Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

Evaluating on cuda:0:   0%|          | 0.00/20.4k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 157.16s seconds


Wall-clock total (pipeline call) = 1036.466 s
PipelineResult.train_seconds = 850.288 s


TypeError: get_dataset() takes 0 positional arguments but 1 was given

In [24]:
train_and_measure(
        model_name='TransE',
        dataset_name='Nations',
        output_dir='/pykeen_results',
    )

INFO:pykeen.datasets.utils:Loading cached preprocessed dataset from file:///C:/Users/grkremer/.data/pykeen/datasets/nations/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
INFO:pykeen.triples.triples_factory:Loading from file:///C:/Users/grkremer/.data/pykeen/datasets/nations/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/training
INFO:pykeen.triples.triples_factory:Loading from file:///C:/Users/grkremer/.data/pykeen/datasets/nations/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/testing
INFO:pykeen.triples.triples_factory:Loading from file:///C:/Users/grkremer/.data/pykeen/datasets/nations/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/validation
INFO:pykeen.pipeline.api:Using device: cuda
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()


Training model='TransE' on dataset='Nations' | epochs=100 | batch_size=256 | device=cuda


Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

Evaluating on cuda:0:   0%|          | 0.00/201 [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.05s seconds


Wall-clock total (pipeline call) = 15.548 s
PipelineResult.train_seconds = 15.474 s


TypeError: get_dataset() takes 0 positional arguments but 1 was given

In [1]:
import torch
print(torch.cuda.is_available())

True
