In [1]:
import numpy as np
import torch
import os
from collections import defaultdict
import random

# --- Configuration ---
# The dimensionality of your embeddings (e.g., 50, 100, 200)
EMBEDDING_DIM = 256
# The number of nearest neighbors to retrieve in each search step
K_NEIGHBORS = 25
K_RESULTS = 10

def rotate_proj(heads, relations):
    """
    Performs RotatE projection: heads ◦ relations, where ◦ is Hadamard product in the complex plane.
    Assumes the first half of the embedding dim is the real part and the second half is imaginary.
    """
    re_head, im_head = torch.chunk(heads, 2, dim=-1)

    #Make phases of relations uniformly distributed in [-pi, pi]
    pi = 3.14159265358979323846

    embedding_range = torch.nn.Parameter(
    torch.Tensor([(24.0 + 2.0) / EMBEDDING_DIM]), 
    requires_grad=False
    )

    phase_relation = relations/(embedding_range.item()/pi)

    re_relation = torch.cos(phase_relation)
    im_relation = torch.sin(phase_relation)
    
    # RotatE complex multiplication:
    # re(h*r) = re(h)*re(r) - im(h)*im(r)
    # im(h*r) = re(h)*im(r) + im(h)*re(r)
    re_projected = re_head * re_relation - im_head * im_relation
    im_projected = re_head * im_relation + im_head * re_relation
    
    projected_vec = torch.cat([re_projected, im_projected], dim=-1)
    return projected_vec  # Ensure it has shape (1, dim)

def transe_proj(heads, relations):
    return heads + relations

def find_neighbors_torch(query_vectors, entity_embeddings_torch, k):
    """
    Finds the k-nearest neighbors for one or more query vectors using PyTorch.
    This is the GPU-accelerated replacement for the scikit-learn model.

    Args:
        query_vectors (torch.Tensor): A tensor of shape (num_queries, dim)
        k (int): The number of neighbors to find.

    Returns:
        torch.Tensor: A tensor of shape (num_queries, k) containing neighbor indices.
    """
    # Calculate pairwise Euclidean distances between query vectors and all entity embeddings
    # `torch.cdist` is highly optimized for this operation on GPU.

    # query_vectors = torch.nn.functional.normalize(query_vectors, dim=1)
    distances = torch.cdist(query_vectors, entity_embeddings_torch)

    # Find the indices of the k smallest distances for each query vector.
    # We use `torch.topk` with largest=False to get the smallest values (nearest neighbors).
    _, indices = torch.topk(distances, k, dim=1, largest=False)

    return indices

def project_and_find_neighbors_torch(start_node_id, relation_id, k, entity_embeddings_torch, relation_embeddings_torch):
    """
    Projects a starting node by a relation and finds k-nearest neighbors using PyTorch.
    """
    start_node_vec = entity_embeddings_torch[start_node_id]
    relation_vec = relation_embeddings_torch[relation_id]

    # Project by adding the relation vector (TransE-style projection)
    projected_vec = transe_proj(start_node_vec, relation_vec)

    # Project RotatE style
    # projected_vec = rotate_proj(start_node_vec, relation_vec)

    # Find the k-nearest neighbors to the projected vector.
    # We add a dimension to make it a (1, dim) tensor for find_neighbors_torch.
    indices = find_neighbors_torch(projected_vec.unsqueeze(0), entity_embeddings_torch, k)

    # Return the IDs of the neighboring entities as a set
    return set(indices[0].cpu().numpy())

def execute_query_torch(query, entity_embeddings_torch, relation_embeddings_torch, entity_to_id, relation_to_id, id_to_entity, id_to_relation, device="cuda"):
    """
    Executes a multi-part query using PyTorch for all vector operations.
    """
    conjunction_parts = query["conjunctions"]
    final_relation = query["final_projection"]

    # --- Step 1: Process each part of the conjunction ---
    print("Step 1: Processing conjunctions...")
    intermediate_results = []
    for part in conjunction_parts:
        node_name = part["node"]
        relation_name = part["relation"]
        print(f"  - Finding neighbors for: {relation_name}({node_name})")
        neighbors = project_and_find_neighbors_torch(node_name, relation_name, K_NEIGHBORS, entity_embeddings_torch, relation_embeddings_torch)
        intermediate_results.append(neighbors)
        print(f"    -> Found {len(neighbors)} potential candidates.")

    # --- Step 2: Find the intersection of the results ---
    print("\nStep 2: Finding intersection of candidate sets...")
    if not intermediate_results:
        print("No conjunctions to process. Aborting.")
        return []

    intersection_ids = intermediate_results[0].copy()
    for other_set in intermediate_results[1:]:
        intersection_ids.intersection_update(other_set)

    if not intersection_ids:
        print("Intersection is empty. No results found.")
        return []

    print(f"  -> Found {len(intersection_ids)} entities in the intersection.")
    intersection_names = [idx for idx in intersection_ids]
    print(f"  -> Intersected nodes: {intersection_names}")

    # --- Step 3: Project the intersection set with the final relation ---
    print(f"\nStep 3: Projecting intersection with final relation '{final_relation}'...")
    relation_vec = relation_embeddings_torch[final_relation]

    # Get the embeddings of all nodes in the intersection
    intersection_indices = torch.tensor(list(intersection_ids), device=device, dtype=torch.long)
    intersection_embeddings = entity_embeddings_torch[intersection_indices]

    # Project all of them by the final relation vector
    final_projected_vecs = transe_proj(intersection_embeddings, relation_vec)
    # final_projected_vecs  = rotate_proj(intersection_embeddings, relation_vec)

    # --- Step 4: Find the nearest neighbors to the final projected vectors ---
    print(f"\nStep 4: Finding final results (nearest neighbors to final projection)...")
    indices = find_neighbors_torch(final_projected_vecs, entity_embeddings_torch, K_RESULTS)

    # Flatten the list of lists and get unique entity IDs
    final_result_ids = set(indices.flatten().cpu().numpy())

    # get only the intersection
    # uniques, counts = indices.unique(return_counts=True)
    # final_result_ids = uniques[counts > 1].cpu().numpy()

    # Convert IDs back to names for the final result
    # final_results = [id_to_entity[idx] for idx in final_result_ids]
    final_results = [idx for idx in final_result_ids]

    return final_results

In [2]:
def load_data(embeddings_folder='embeddings', dicts_folder='dictionaries'):
    """
    Loads embeddings and dictionary files from the specified folders.
    The i-th row in an embedding file corresponds to the entity/relation with key 'i' in the dict file.
    """
    print(f"Loading data from folders: '{embeddings_folder}' and '{dicts_folder}'...")

    # Load embedding matrices from the embeddings folder
    entity_embeddings_np = np.load(os.path.join(embeddings_folder, 'entity_embedding.npy'))
    relation_embeddings_np = np.load(os.path.join(embeddings_folder, 'relation_embedding.npy'))

    # Load entity dictionary and create a name -> id mapping
    entity_to_id = {}
    with open(os.path.join(dicts_folder, 'entities.dict'), 'r') as f:
        for line in f:
            idx, name = line.strip().split('\t')
            entity_to_id[name] = int(idx)

    # Load relation dictionary and create a name -> id mapping
    relation_to_id = {}
    with open(os.path.join(dicts_folder, 'relations.dict'), 'r') as f:
        for line in f:
            idx, name = line.strip().split('\t')
            relation_to_id[name] = int(idx)

    # Create a reverse mapping for easy lookup of names from IDs
    id_to_entity = {v: k for k, v in entity_to_id.items()}
    id_to_relation = {v: k for k, v in relation_to_id.items()}

    print("Embeddings and dictionaries loaded successfully.")
    return entity_embeddings_np, relation_embeddings_np, entity_to_id, relation_to_id, id_to_entity, id_to_relation

def find_interesting_query_from_triplets(triplets_file, entity_to_id, relation_to_id, k=10):
    triplets_by_id = []
    indexing_dict = {}

    with open(triplets_file, 'r') as f:
        for line in f:
            h_name, r_name, t_name = line.strip().split('\t')
            if h_name in entity_to_id and r_name in relation_to_id and t_name in entity_to_id:
                h_id = entity_to_id[h_name]
                r_id = relation_to_id[r_name]
                t_id = entity_to_id[t_name]
                triplets_by_id.append((h_id, r_id, t_id))

                if h_id not in indexing_dict:
                    indexing_dict[h_id] = {'in': np.empty((0, 2), dtype=np.int64),
                                        'out': np.empty((0, 2), dtype=np.int64),
                                        'count': 0}
                if t_id not in indexing_dict:
                    indexing_dict[t_id] = {'in': np.empty((0, 2), dtype=np.int64),
                                        'out': np.empty((0, 2), dtype=np.int64),
                                        'count': 0}

                indexing_dict[h_id]['out'] = np.vstack([indexing_dict[h_id]['out'], [r_id, t_id]])
                indexing_dict[h_id]['count'] += 1
                
                indexing_dict[t_id]['in']  = np.vstack([indexing_dict[t_id]['in'], [r_id, h_id]])
                indexing_dict[t_id]['count'] += 1

    tail_target_nodes = [eid for eid, data in indexing_dict.items() if data["count"] > 300]

    random.seed(42)
    random.shuffle(tail_target_nodes)

    queries = []
    targets = []

    for target in tail_target_nodes:
        relations =  np.unique(indexing_dict[target]['in'][:,0])
        projection_relation = np.random.choice(indexing_dict[target]['out'][:, 0]) if indexing_dict[target]['out'].shape[0] > 0 else None
        np.random.shuffle(relations)

        if relations.shape[0] < 2 or projection_relation is None:
            continue

        relation1 = relations[0]
        relation2 = relations[1]

        head1 = np.random.choice(indexing_dict[target]['in'][indexing_dict[target]['in'][:, 0] == relation1, 1])
        head2 = np.random.choice(indexing_dict[target]['in'][indexing_dict[target]['in'][:, 0] == relation2, 1])

        target_tails = indexing_dict[target]['out'][indexing_dict[target]['out'][:, 0] == projection_relation, 1]

        query_triplets = [
            (head1, relation1, -1),
            (head2, relation2, -1),
            (-1, projection_relation, -1)
        ]   

        queries.append(query_triplets)
        targets.append(target_tails)

    return queries, targets, triplets_by_id

def parse_triplets_to_query(triplets_by_id):
    """
    Converts a list of triplets into the structured query dictionary.
    """
    if not triplets_by_id:
        raise ValueError("Triplet array cannot be empty.")

    query = {"conjunctions": [], "final_projection": None}
    final_proj_triplet = triplets_by_id[-1]
    query["final_projection"] = final_proj_triplet[1]

    for head_id, rel_id, tail_id in triplets_by_id[:-1]:
        conjunction = {"node": head_id, "relation": rel_id}
        query["conjunctions"].append(conjunction)
        
    return query

In [3]:
# Define directories for your data
# EMBEDDINGS_DIR = '/home/cc/PHD/dglframework/KnowledgeGraphEmbedding/models/RotatE_FB15k_0'
EMBEDDINGS_DIR = '/home/cc/PHD/dglframework/KnowledgeGraphEmbedding/models/TransE_FB15k_1'
DICTS_DIR = '/home/cc/PHD/dglframework/KnowledgeGraphEmbedding/data/FB15k'

# Load the data as NumPy arrays first
entity_embeddings_np, relation_embeddings_np, entity_to_id, relation_to_id, id_to_entity, id_to_relation = load_data(
    embeddings_folder=EMBEDDINGS_DIR,
    dicts_folder=DICTS_DIR
)

# --- 3. Set up PyTorch and move data to GPU if available ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nUsing device: {device}")

# Convert numpy arrays to PyTorch tensors and move them to the selected device
entity_embeddings_torch = torch.from_numpy(entity_embeddings_np).to(device)
relation_embeddings_torch = torch.from_numpy(relation_embeddings_np).to(device)

entity_embeddings_torch = torch.nn.functional.normalize(entity_embeddings_torch, dim=1)
relation_embeddings_torch = torch.nn.functional.normalize(relation_embeddings_torch, dim=1)

print("Data moved to device.")

# --- Automatically find a query from the graph data ---
query_triplets_by_id, query_tails, _ = find_interesting_query_from_triplets(
    os.path.join(DICTS_DIR, 'merged.txt'), 
    entity_to_id, 
    relation_to_id
)

Loading data from folders: '/home/cc/PHD/dglframework/KnowledgeGraphEmbedding/models/TransE_FB15k_1' and '/home/cc/PHD/dglframework/KnowledgeGraphEmbedding/data/FB15k'...
Embeddings and dictionaries loaded successfully.

Using device: cuda
Data moved to device.


In [4]:
idx = 6

query_triplets = query_triplets_by_id[idx] if query_triplets_by_id else None

if query_triplets:
    # Parse the discovered triplets into the structured query format
    example_query = parse_triplets_to_query(query_triplets)

    print("\n--- Executing Discovered Query ---")
    print(f"Parsed Query: {example_query}")
    print("-" * 25)

    final_results = execute_query_torch(example_query, entity_embeddings_torch, relation_embeddings_torch, entity_to_id, relation_to_id, id_to_entity, id_to_relation)

    print("\n--- Query Finished ---")
    print(f"Final results: {final_results}")
    print(f"Found {len(final_results)} unique entities.")
else:
    print("\nCould not automatically discover a suitable query to run.")

print("-----------------------")

[q for q in query_tails[idx] if q in final_results ]


--- Executing Discovered Query ---
Parsed Query: {'conjunctions': [{'node': np.int64(5668), 'relation': np.int64(463)}, {'node': np.int64(4203), 'relation': np.int64(1303)}], 'final_projection': np.int64(795)}
-------------------------
Step 1: Processing conjunctions...
  - Finding neighbors for: 463(5668)
    -> Found 25 potential candidates.
  - Finding neighbors for: 1303(4203)
    -> Found 25 potential candidates.

Step 2: Finding intersection of candidate sets...
Intersection is empty. No results found.

--- Query Finished ---
Final results: []
Found 0 unique entities.
-----------------------


[]

In [5]:
np.unique(query_tails[idx])

array([  269,   311,   426,   958,  1040,  1074,  1242,  1322,  1595,
        1840,  1941,  1980,  2025,  2453,  2575,  3005,  3193,  3364,
        3805,  3818,  4133,  4502,  4790,  4988,  4998,  5233,  5433,
        5477,  5747,  5826,  5924,  6036,  6048,  6122,  6255,  6404,
        6514,  6519,  6613,  6779,  6850,  6874,  6932,  7105,  7128,
        7240,  7358,  7569,  7758,  7769,  8155,  8611,  8665,  8966,
        8990,  9085,  9261,  9482,  9916,  9920,  9942, 10054, 10149,
       10156, 10272, 10462, 10567, 10723, 11015, 11314, 11709, 11770,
       12262, 12279, 13364, 13673, 13918, 13941, 14076, 14202, 14264,
       14296, 14494, 14576, 14771, 14941])

In [6]:
# Define directories for your data
# EMBEDDINGS_DIR = '/home/cc/PHD/dglframework/KnowledgeGraphEmbedding/models/RotatE_FB15k_0'
EMBEDDINGS_DIR = '/home/cc/PHD/dglframework/KnowledgeGraphEmbedding/models/TransE_FB15k_1'
DICTS_DIR = '/home/cc/PHD/dglframework/KnowledgeGraphEmbedding/data/FB15k'

# Load the data as NumPy arrays first
entity_embeddings_np, relation_embeddings_np, entity_to_id, relation_to_id, id_to_entity, id_to_relation = load_data(
    embeddings_folder=EMBEDDINGS_DIR,
    dicts_folder=DICTS_DIR
)

# --- 3. Set up PyTorch and move data to GPU if available ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nUsing device: {device}")

# Convert numpy arrays to PyTorch tensors and move them to the selected device
entity_embeddings_torch = torch.from_numpy(entity_embeddings_np).to(device)
relation_embeddings_torch = torch.from_numpy(relation_embeddings_np).to(device)

# entity_embeddings_torch = torch.nn.functional.normalize(entity_embeddings_torch, dim=1)
# relation_embeddings_torch = torch.nn.functional.normalize(relation_embeddings_torch, dim=1)

print("Data moved to device.")

# --- Automatically find a query from the graph data ---
query_triplets_by_id, query_tails, triplets = find_interesting_query_from_triplets(
    os.path.join(DICTS_DIR, 'merged.txt'), 
    entity_to_id, 
    relation_to_id
)

Loading data from folders: '/home/cc/PHD/dglframework/KnowledgeGraphEmbedding/models/TransE_FB15k_1' and '/home/cc/PHD/dglframework/KnowledgeGraphEmbedding/data/FB15k'...
Embeddings and dictionaries loaded successfully.

Using device: cuda
Data moved to device.


In [7]:
#TODO: Implementare ranking al posto di knn per mantenere la stessa valutazione

head, rel, tail = triplets[1]

# query_vectors = rotate_proj(entity_embeddings_torch[head], relation_embeddings_torch[rel])
query_vectors = transe_proj(entity_embeddings_torch[head], relation_embeddings_torch[rel])

distances = torch.cdist(query_vectors.unsqueeze(0), entity_embeddings_torch)

# Find the indices of the k smallest distances for each query vector.
# We use `torch.topk` with largest=False to get the smallest values (nearest neighbors).
_, indices = torch.topk(distances, 100, dim=1, largest=False)

In [None]:
tails = [tail for _,_, tail in triplets]

head, rel, true_tail = triplets[11]

head = torch.index_select(
    entity_embeddings_torch, 
    dim=0, 
    index=torch.tensor(head, device="cuda")
).unsqueeze(1)

relation = torch.index_select(
    relation_embeddings_torch,
    dim=0,
    index=torch.tensor(rel, device="cuda")
).unsqueeze(1)

tail = torch.index_select(
    entity_embeddings_torch, 
    dim=0, 
    index=torch.tensor(tails, device="cuda").view(-1)
).view(1, len(tails), -1)

print(head.shape, relation.shape, tail.shape)

scores = (head + relation) - tail

scores = torch.norm(scores, p=1, dim=2)

argsort = torch.argsort(scores, dim = 1, descending=True)

(argsort == true_tail).nonzero()

torch.Size([1, 1, 256]) torch.Size([1, 1, 256]) torch.Size([1, 592213, 256])


tensor([[     0, 294298]], device='cuda:0')

In [12]:
heads.shape

torch.Size([151606528])

In [None]:
print(f"tail: {tail}")
indices[0]

In [16]:
tails = [tail for _,_, tail in triplets]

head, rel, tail = triplets[1]

entity_embeddings_torch[head].expand(-1, len(tails)).shape

RuntimeError: The expanded size of the tensor (592213) must match the existing size (256) at non-singleton dimension 1.  Target sizes: [-1, 592213].  Tensor sizes: [256]

In [21]:
entity_embeddings_torch[head].unsqueeze(0).expand(len(tails),-1).shape

torch.Size([592213, 256])