In [1]:
from functools import partial
from warnings import filterwarnings

import numpy as np
import pandas as pd
from tqdm import tqdm

import libs.embeddings as embeddings
from libs.utils import Timer
from libs.graph import KnowledgeGraph
from libs.taxonomy import Taxonomy
from libs.separability.evaluation import class_distance, evaluate
from libs.separability.data import get_centroids


# Build the knowledge graph from the "data/dbpedia" directory. The `kg` object is
# used by later cells to sample class instances and to compute class centroids.
kg = KnowledgeGraph.from_dir("data/dbpedia")

In [None]:
# Load classes to separate

def _read_tokens(path):
    """Return the whitespace-separated tokens stored in the text file at ``path``."""
    with open(path, "r") as handle:
        return handle.read().split()


fa = "data/separability/types_A.txt"
fb = "data/separability/types_B.txt"
missing_ids = "data/missing_ids.txt"

# Class names forming the two sides of every (a, b) pair.
As = _read_tokens(fa)
Bs = _read_tokens(fb)

# Integer ids that are later passed to the sampler as `exclude_ids`.
invalid_ids = set(map(int, _read_tokens(missing_ids)))


In [None]:
# Compute distances between classes

# Load the taxonomy identified by "full" (what that identifier selects is decided
# inside libs.taxonomy — not visible here).
T = Taxonomy.load("full")
# get_centroids returns a pair; `counts` is indexed by class name further down
# (counts[a]). Presumably `centroids` holds one centroid per class — TODO confirm.
centroids, counts = get_centroids(kg, T)
# Pre-bind the taxonomy and centroids so later calls only pass the two class names.
# NOTE: this rebinds (shadows) the `class_distance` imported at the top of the notebook.
class_distance = partial(class_distance, tax=T, centroids=centroids)


In [None]:
# Evaluate, for every embedding model, how separable each (a, b) class pair is.
#
# For each model the cell samples up to `size` instances of class `a` and class `b`,
# looks up their embeddings, runs `evaluate` on the labelled points and stores the
# metrics together with the class distance. One row per evaluated pair and model is
# written to results/separability/all.csv.

filterwarnings("ignore")  # Ignore ill-defined precision warnings

size = 300      # maximum number of instances sampled per class
min_size = 10   # pairs where either side has fewer sampled instances are skipped
verbose = False

MODELS = ["ComplEx", "DistMult", "RDF2Vec", "TransE", "TransH", "TransD"]

# One DataFrame per model; concatenated once after the loop instead of growing
# `df` incrementally.
model_frames = []

for model in MODELS:
    E = embeddings.load(model)

    # Pair results are model-specific, so the cache is reset for every model.
    # Sharing one dict across models would let the (b, a) shortcut below copy
    # metrics that were computed with a *different* embedding model.
    results = {}

    for a in tqdm(As, desc=model):
        if a == "owl:Thing":
            continue

        # Unrestricted sample of `a`, reused for every `b` that needs no exclusion.
        all_ias = kg.sample_instances(size, a, force_size=False, exclude_ids=invalid_ids)

        for b in Bs:
            if b == a or b == "owl:Thing":
                continue

            # The reversed pair was already evaluated for this model: reuse its
            # record as-is (its "a"/"b" fields keep the original order).
            if (b, a) in results:
                results[(a, b)] = results[(b, a)]
                continue

            # When one class is ordered below the other in the taxonomy
            # (presumably a sub-/super-class relation — TODO confirm the meaning
            # of `<` on taxonomy nodes), instances of the other type are excluded
            # from the sample via `except_type`.
            params = {}
            if T[b] < T[a]:
                params["except_type"] = a

            if T[a] < T[b]:
                ias = kg.sample_instances(
                    size, a, force_size=False, except_type=b, exclude_ids=invalid_ids
                )
            else:
                ias = all_ias
            ibs = kg.sample_instances(
                size, b, force_size=False, exclude_ids=invalid_ids, **params
            )

            na, nb = len(ias), len(ibs)
            if na < min_size or nb < min_size:
                continue

            # Label instances of `a` with 0 and instances of `b` with 1.
            indices = [*ias, *ibs]
            y = np.concatenate([np.zeros(na), np.ones(nb)])
            X = E[indices]

            res = evaluate(X, y)
            dist = class_distance(a, b)

            results[(a, b)] = {
                "a": a,
                "b": b,
                "ca": counts[a],
                "cb": counts[b],
                **dist,
                **res,
            }

    # Build the frame once per model, after all classes have been processed.
    # Doing this inside the `for a` loop would re-append every accumulated row
    # on each iteration and flood the output with duplicates.
    curr_df = pd.DataFrame(list(results.values()))
    curr_df["model"] = model
    model_frames.append(curr_df)

df = pd.concat(model_frames, ignore_index=True)
df.to_csv("results/separability/all.csv")
df.head()