#### The following cell loads the Dice embedding models of Wikidata

In [2]:
import pandas as pd
import numpy as np


def _read_embedding_table(csv_path, key_column):
    """Read a header-less embedding CSV and split it into identifiers and vectors.

    Column 0 is labelled ``key_column``; every other column is cast to float32.

    Returns:
        (table, vectors, keys): the labelled DataFrame, the float32 matrix of
        the remaining columns, and the array of values from ``key_column``.
    """
    table = pd.read_csv(csv_path, header=None).rename(columns={0: key_column})
    vectors = table.drop(columns=[key_column]).astype(np.float32).values
    return table, vectors, table[key_column].values


# Identifier -> integer index lookup tables.
entity2id_df = pd.read_csv("5-Epochs/entity_to_idx.csv", header=0)
relation2id_df = pd.read_csv("5-Epochs/relation_to_idx.csv", header=0)

entity2id = dict(zip(entity2id_df["entity"], entity2id_df["index"]))
relation2id = dict(zip(relation2id_df["relation"], relation2id_df["index"]))

# NOTE(review): later cells index the embedding matrices with the ids above;
# this assumes the embedding CSV rows are stored in index order -- confirm.
print("Loading entity embeddings...")
entity_emb_df, entity_embeddings, entity_emb_uris = _read_embedding_table(
    "5-Epochs/Keci_entity_embeddings.csv", "entity"
)

print("Entity embeddings shape:", entity_embeddings.shape)


print("Loading relation embeddings...")
relation_emb_df, relation_embeddings, relation_emb_uris = _read_embedding_table(
    "5-Epochs/Keci_relation_embeddings.csv", "relation"
)

print("Relation embeddings shape:", relation_embeddings.shape)




Loading entity embeddings...
Entity embeddings shape: (10587165, 64)
Loading relation embeddings...
Relation embeddings shape: (1261, 64)


#### The next cell will do link prediction


In [5]:
import numpy as np

def score_triple_normalized(h_emb, r_emb, t_emb):
    """Score a (head, relation, tail) triple with a TransE-style distance.

    The Euclidean norm of ``h_emb + r_emb - t_emb`` is passed through
    ``1 / (1 + distance)``: a zero distance gives 1.0 and the score
    shrinks towards 0 as the distance grows.
    """
    translation_gap = h_emb + r_emb - t_emb
    return 1.0 / (1.0 + np.linalg.norm(translation_gap))


def score_path(h_emb, r_embs, t_emb):
    """Score a multi-hop path from head to tail, TransE style.

    The relation embeddings in ``r_embs`` are summed along axis 0 into a
    single translation vector; the score is ``1 / (1 + ||h + sum(r) - t||)``.
    """
    composed_relation = np.sum(r_embs, axis=0)
    path_gap = np.linalg.norm(h_emb + composed_relation - t_emb)
    return 1.0 / (path_gap + 1.0)


def predict_relation_normalized(triples, entity2id, relation2id, entity_embeddings, relation_embeddings, top_k=3):
    """Rank every known relation for each (head, tail) pair.

    Args:
        triples: iterable of ``(head_uri, tail_uri)`` pairs.
        entity2id / relation2id: mappings from URI to embedding row index.
        entity_embeddings / relation_embeddings: row-indexable embedding tables.
        top_k: number of best-scoring relations to keep per pair.

    Returns:
        One dict per pair with keys ``"head"``, ``"tail"`` and
        ``"predictions"`` (a list of ``(relation_uri, score)`` sorted by
        descending score). Pairs with an unknown head or tail get the
        placeholder prediction ``[(None, 0.0)]``.
    """
    predictions_per_pair = []

    for head_uri, tail_uri in triples:
        if head_uri in entity2id and tail_uri in entity2id:
            head_vec = entity_embeddings[entity2id[head_uri]]
            tail_vec = entity_embeddings[entity2id[tail_uri]]

            # Score the pair against every relation, then keep the best top_k.
            ranked = [
                (rel_uri, score_triple_normalized(head_vec, relation_embeddings[rel_idx], tail_vec))
                for rel_uri, rel_idx in relation2id.items()
            ]
            ranked.sort(key=lambda pair: pair[1], reverse=True)
            best = ranked[:top_k]
        else:
            # Unknown entity: emit a placeholder so the output stays aligned with the input.
            best = [(None, 0.0)]

        predictions_per_pair.append({"head": head_uri, "tail": tail_uri, "predictions": best})

    return predictions_per_pair


def evaluate_lists_normalized(lists, entity2id, relation2id, entity_embeddings, relation_embeddings):
    """Rank lists of (head, tail) pairs by their mean top-1 relation score.

    Each list is labelled ``"list<i>"`` (1-based position in ``lists``).
    Pairs with no prediction, or only the ``None`` placeholder, are left out
    of the mean; a list with no usable pair scores 0.0.

    Returns:
        A list of ``(label, mean_score)`` tuples sorted by descending score.
    """
    ranking = []
    for position, pair_list in enumerate(lists, start=1):
        predicted = predict_relation_normalized(
            pair_list, entity2id, relation2id, entity_embeddings, relation_embeddings, top_k=1
        )

        top1_scores = []
        for item in predicted:
            candidates = item["predictions"]
            if candidates and candidates[0][0] is not None:
                top1_scores.append(candidates[0][1])

        average = np.mean(top1_scores) if top1_scores else 0.0
        ranking.append((f"list{position}", average))

    ranking.sort(key=lambda entry: entry[1], reverse=True)
    return ranking


In [6]:
# Example candidate lists; each entry is a (head, tail) entity pair.
lists = [
    [
        ("<http://www.wikidata.org/entity/Q146>", "<http://www.wikidata.org/entity/Q5>"),
        ("<http://www.wikidata.org/entity/Q42>", "<http://www.wikidata.org/entity/Q571>"),
    ],
    [
        ("<http://www.wikidata.org/entity/Q937>", "<http://www.wikidata.org/entity/Q11629>"),
        ("<http://www.wikidata.org/entity/Q169>", "<http://www.wikidata.org/entity/Q30>"),
    ],
    [
        ("<http://www.wikidata.org/entity/Q1065>", "<http://www.wikidata.org/entity/Q17>"),
        ("<http://www.wikidata.org/entity/Q76>", "<http://www.wikidata.org/entity/Q6256>"),
    ],
]
# Keep the individual names available for later cells.
list1, list2, list3 = lists

# Rank the lists by the mean score of each pair's best relation.
ranking = evaluate_lists_normalized(
    lists,
    entity2id,
    relation2id,
    entity_embeddings,
    relation_embeddings,
)

print("Ranking of lists by mean top relation score:")
for name, score in ranking:
    print(f"{name}: {score:.4f}")


Ranking of lists by mean top relation score:
list2: 0.0869
list3: 0.0847
list1: 0.0818


#### The next cell shows an example of multi-hop scoring

In [15]:
# Multi-hop example: score the two-relation path P31 -> P279 between h and t.
h = "<http://www.wikidata.org/entity/Q146>"
t = "<http://www.wikidata.org/entity/Q5>"

# Nothing is printed when either entity has no embedding index.
if h in entity2id and t in entity2id:
    h_emb = entity_embeddings[entity2id[h]]
    t_emb = entity_embeddings[entity2id[t]]

    # Look up both hop relations in path order.
    r1, r2 = (
        relation_embeddings[relation2id[hop_uri]]
        for hop_uri in (
            "<http://www.wikidata.org/prop/direct/P31>",
            "<http://www.wikidata.org/prop/direct/P279>",
        )
    )

    score = score_path(h_emb, [r1, r2], t_emb)
    print(f"Multi-hop score (h -P31-> e -P279-> t): {score:.4f}")


Multi-hop score (h -P31-> e -P279-> t): 0.0675


#### The next cell defines the scoring function that we show in the paper

In [8]:
import numpy as np

# --- Basic scores ---

def score_lp(h_emb, r_emb, t_emb):
    """Link-prediction score: TransE-style distance mapped through 1 / (1 + d).

    ``d`` is the Euclidean norm of ``h_emb + r_emb - t_emb``; the result is
    1.0 when the distance is zero and approaches 0 for large distances.
    """
    residual = (h_emb + r_emb) - t_emb
    distance = np.linalg.norm(residual)
    return 1.0 / (distance + 1.0)

def score_approx(h_emb, r_emb, t_emb, extra_relations=None):
    """Approximate (multi-hop) plausibility score.

    When ``extra_relations`` is given, its embeddings are summed along axis 0
    and added to ``h_emb + r_emb`` before measuring the distance to ``t_emb``;
    the distance ``d`` is returned as ``1 / (1 + d)``.

    Returns 0.0 when ``extra_relations`` is None, i.e. no multi-hop evidence
    was supplied.
    """
    if extra_relations is not None:
        path_offset = np.sum(extra_relations, axis=0)
        gap = np.linalg.norm(h_emb + r_emb + path_offset - t_emb)
        return 1.0 / (1.0 + gap)
    return 0.0

# --- Confidence score for one triple ---

def confidence_score(h_uri, r_uri, t_uri, entity2id, relation2id,
                     entity_embeddings, relation_embeddings,
                     extra_relations=None,
                     lambda1=0.6, lambda2=0.4):
    """Confidence of a single triple (h, r, t).

    Combines the link-prediction score and the approximate multi-hop score
    as ``lambda1 * score_lp + lambda2 * score_approx``.

    Returns 0.0 when the head, tail or relation URI has no index in the
    corresponding lookup table.
    """
    all_known = h_uri in entity2id and t_uri in entity2id and r_uri in relation2id
    if not all_known:
        return 0.0

    head_vec = entity_embeddings[entity2id[h_uri]]
    rel_vec = relation_embeddings[relation2id[r_uri]]
    tail_vec = entity_embeddings[entity2id[t_uri]]

    link_term = lambda1 * score_lp(head_vec, rel_vec, tail_vec)
    approx_term = lambda2 * score_approx(head_vec, rel_vec, tail_vec, extra_relations)
    return link_term + approx_term

# --- Document-level score ---

def document_score(triples, entity2id, relation2id, entity_embeddings, relation_embeddings,
                   lambda1=0.6, lambda2=0.4):
    """Document-level score: mean confidence over a set of triples.

    ``triples`` is a list of ``(h, r, t)`` URI tuples. Each triple is scored
    with ``confidence_score`` using ``extra_relations=None``, so no multi-hop
    relations are passed to the approximate term.

    Returns 0.0 for an empty ``triples``.
    """
    per_triple = [
        confidence_score(head, relation, tail,
                         entity2id, relation2id,
                         entity_embeddings, relation_embeddings,
                         extra_relations=None,
                         lambda1=lambda1, lambda2=lambda2)
        for head, relation, tail in triples
    ]
    if not per_triple:
        return 0.0
    return np.mean(per_triple)


In [13]:
# Example predictions: three candidate outputs, each a list of (head, relation, tail) triples.
triples_pred_1 = [
    (
        "<http://www.wikidata.org/entity/Q42>",        # Douglas Adams
        "<http://www.wikidata.org/prop/direct/P106>",  # occupation
        "<http://www.wikidata.org/entity/Q36180>",     # science fiction writer
    ),
    (
        "<http://www.wikidata.org/entity/Q146>",       # Cat
        "<http://www.wikidata.org/prop/direct/P31>",   # instance of
        "<http://www.wikidata.org/entity/Q146>",       # Cat (self-loop, sometimes plausible)
    ),
    (
        "<http://www.wikidata.org/entity/Q5>",         # Human
        "<http://www.wikidata.org/prop/direct/P31>",   # instance of
        "<http://www.wikidata.org/entity/Q215627>",    # person
    ),
]


triples_pred_2 = [
    (
        "<http://www.wikidata.org/entity/Q42>",        # Douglas Adams
        "<http://www.wikidata.org/prop/direct/P31>",   # instance of
        "<http://www.wikidata.org/entity/Q146>",       # Cat (nonsense)
    ),
    (
        "<http://www.wikidata.org/entity/Q146>",       # Cat
        "<http://www.wikidata.org/prop/direct/P106>",  # occupation
        "<http://www.wikidata.org/entity/Q5>",         # Human (wrong)
    ),
    (
        "<http://www.wikidata.org/entity/Q5>",         # Human
        "<http://www.wikidata.org/prop/direct/P279>",  # subclass of
        "<http://www.wikidata.org/entity/Q146>",       # Cat (wrong way around)
    ),
]


triples_pred_3 = [
    (
        "<http://www.wikidata.org/entity/Q146>",       # Cat
        "<http://www.wikidata.org/prop/direct/P31>",   # instance of
        "<http://www.wikidata.org/entity/Q5>",         # Human (weird)
    ),
    (
        "<http://www.wikidata.org/entity/Q42>",        # Douglas Adams
        "<http://www.wikidata.org/prop/direct/P31>",   # instance of
        "<http://www.wikidata.org/entity/Q36180>",     # science fiction writer (not "instance of")
    ),
    (
        "<http://www.wikidata.org/entity/Q146>",       # Cat
        "<http://www.wikidata.org/prop/direct/P106>",  # occupation
        "<http://www.wikidata.org/entity/Q36180>",     # writer (nonsense)
    ),
]


##### Based on the example outputs above, the following cell chooses the best and the worst output

In [14]:
# Document-level score for each candidate output, keyed by its label.
scores = {
    label: document_score(doc_triples, entity2id, relation2id, entity_embeddings, relation_embeddings)
    for label, doc_triples in (
        ("output 1", triples_pred_1),
        ("output 2", triples_pred_2),
        ("output 3", triples_pred_3),
    )
}

for k, v in scores.items():
    print(f"{k}: {v:.4f}")

# Pick the labels with the highest and lowest score (first one wins on ties).
best_doc = max(scores, key=lambda label: scores[label])
worst_doc = min(scores, key=lambda label: scores[label])

print(f"\nBest output: {best_doc} (score={scores[best_doc]:.4f})")
print(f"Worst ouput: {worst_doc} (score={scores[worst_doc]:.4f})")


output 1: 0.1680
output 2: 0.0427
output 3: 0.0473

Best output: output 1 (score=0.1680)
Worst ouput: output 2 (score=0.0427)
