In [1]:
from functools import partial
from warnings import filterwarnings

import numpy as np
import pandas as pd
from tqdm import tqdm

import libs.embeddings as embeddings
from libs.utils import Timer
from libs.graph import KnowledgeGraph
from libs.taxonomy import Taxonomy
from libs.separability.evaluation import class_distance, evaluate
from libs.separability.data import get_centroids


# Knowledge graph used for all instance sampling in this notebook,
# loaded from the data/dbpedia directory.
KG_DIR = "data/dbpedia"
kg = KnowledgeGraph.from_dir(KG_DIR)

In [None]:
# Load classes to separate

fa = "data/separability/types_A.txt"
fb = "data/separability/types_B.txt"
missing_ids = "data/missing_ids.txt"


def _read_tokens(path):
    """Return the whitespace-separated tokens stored in the text file at ``path``."""
    with open(path, "r") as handle:
        return handle.read().split()


# Class names of the two groups, one list per input file.
As = _read_tokens(fa)
Bs = _read_tokens(fb)

# Integer ids read from the missing-ids file; handed to the sampler as
# `exclude_ids` further down.
invalid_ids = {int(token) for token in _read_tokens(missing_ids)}


In [None]:
# Compute distances between classes

# Taxonomy and the per-class data returned by get_centroids; `counts` is
# later indexed by class name to fill the "ca"/"cb" result columns.
T = Taxonomy.load("full")
centroids, counts = get_centroids(kg, T)

# Pre-bind the taxonomy and centroids so later calls only pass the two
# class names. NOTE: this rebinds the imported `class_distance` name.
_bound_arguments = {"tax": T, "centroids": centroids}
class_distance = partial(class_distance, **_bound_arguments)


In [None]:
filterwarnings("ignore")  # Ignore ill-defined precision warnings

size = 300      # number of instances requested from the sampler per class
min_size = 10   # a pair is skipped when either class yields fewer instances
verbose = False

MODELS = ["ComplEx", "DistMult", "RDF2Vec", "TransE", "TransH", "TransD"]

# One result frame per embedding model; concatenated once after the loop.
model_frames = []

for model in MODELS:
    E = embeddings.load(model)

    # Results are keyed by (a, b) and must be reset for every model:
    # otherwise the mirrored-pair shortcut below could reuse scores that
    # were computed with a different model's embeddings.
    results = {}

    for a in tqdm(As, desc=model):
        if a == "owl:Thing":
            continue
        # Unrestricted sample of `a`, reused for every `b` that does not
        # require excluding instances of `b`.
        all_ias = kg.sample_instances(size, a, force_size=False, exclude_ids=invalid_ids)

        for b in Bs:
            if b == a or b == "owl:Thing":
                continue
            if (b, a) in results:
                # The mirrored pair was already evaluated for this model;
                # reuse that entry (its "a"/"b" labels keep the first order).
                results[(a, b)] = results[(b, a)]
                continue

            # NOTE(review): `T[x] < T[y]` presumably encodes a taxonomy
            # ordering between the two classes (e.g. subclass/superclass);
            # confirm against libs.taxonomy. When it holds, instances of the
            # other class are excluded from the sample via `except_type`.
            params = {}
            if T[b] < T[a]:
                params["except_type"] = a
            if T[a] < T[b]:
                ias = kg.sample_instances(
                    size, a, force_size=False, except_type=b, exclude_ids=invalid_ids
                )
            else:
                ias = all_ias
            ibs = kg.sample_instances(size, b, force_size=False, exclude_ids=invalid_ids, **params)

            na, nb = len(ias), len(ibs)
            if na < min_size or nb < min_size:
                continue

            # Label instances of `a` with 0 and instances of `b` with 1.
            indices = [*ias, *ibs]
            y = np.concatenate([np.zeros(na), np.ones(nb)])
            X = E[indices]
            res = evaluate(X, y)
            dist = class_distance(a, b)

            results[(a, b)] = {"a": a, "b": b, "ca": counts[a], "cb": counts[b], **dist, **res}

    # Build the frame once per model, after all classes have been processed.
    # Doing this inside the `for a` loop would append the whole cumulative
    # result set again for every class and fill `df` with duplicated rows.
    curr_df = pd.DataFrame(results.values())
    curr_df["model"] = model
    model_frames.append(curr_df)

# Single concatenation; ignore_index gives the combined frame a unique index.
df = pd.concat(model_frames, ignore_index=True)
df.to_csv("results/separability/all.csv")
df.head()


Here, we compute the lexical distance between classes. The word-embedding file must first be downloaded from [https://fasttext.cc/docs/en/english-vectors.html](https://fasttext.cc/docs/en/english-vectors.html) (use the Common Crawl, 600B-token version, `crawl-300d-2M.vec`) and placed in `data/word_embeddings/`.

In [None]:
import re 
import itertools as it

from scipy import stats
from scipy.spatial.distance import euclidean, cosine


# Step 1: load embedding vectors
# The file is read as text: a header line "<n_words> <dim>", then one line
# per word holding the token followed by its vector components.
# - encoding is explicit so the result does not depend on the platform default;
# - newline="\n" and split(" ") keep tokens that contain other whitespace
#   characters intact (a bare split() would break such lines apart).
with open("data/word_embeddings/crawl-300d-2M.vec", "r", encoding="utf-8", newline="\n") as f:
    n_words, dim = map(int, next(f).split())
    E = np.zeros((n_words, dim))  # row i holds the vector of the i-th word
    words = {}                    # token -> row index in E
    for i, line in enumerate(f):
        word, *vec = line.rstrip().split(" ")
        words[word] = i
        E[i] = np.array(vec, dtype=float)
        
        


In [4]:
def camel_case_split(s):
    """Split a CamelCase identifier into a list of lower-cased words.

    Each word starts at an uppercase letter and is either that letter followed
    by lowercase letters ("Region") or a run of uppercase letters that ends
    before the next capitalised word or at the end of the string ("HTTP" in
    "HTTPServer"). Characters that match neither form are dropped.
    """
    return [part.lower() for part in re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))', s)]

def extract_keywords(cls):
    """Return the lower-cased keywords of a class name such as "dbo:OfficeHolder".

    The "dbo:" and "owl:" prefixes are removed before the CamelCase split.
    """
    return camel_case_split(cls.replace("dbo:", "").replace("owl:", ""))

# extract_keywords returns a list, so the expected value must be a list too
# (a list never compares equal to a tuple).
assert extract_keywords("dbo:ClericalAdministrativeRegion") == ["clerical", "administrative", "region"]

# Step 2: retrieve all classes in the dataframe and split them into keywords
classes = set(df["a"].unique()) | set(df["b"].unique())
vocab = set(it.chain.from_iterable(extract_keywords(cls) for cls in classes))

# Every keyword must have a word vector. The lookup table is `words`
# (token -> row index); `word` is only the leftover loop variable of the loader.
missing_words = vocab - words.keys()
assert not missing_words, f"keywords without a word embedding: {sorted(missing_words)}"

# Step 3: average word embeddings to get the vector representation of each class
class_vector = {}
for c in classes:
    kw = extract_keywords(c)
    # Fail with a readable message instead of a division by zero below.
    assert kw, f"no keywords could be extracted from class name {c!r}"
    class_vector[c] = sum(E[words[k]] for k in kw) / len(kw)

# Add the root class explicitly, using the vector of the word "thing".
class_vector["owl:Thing"] = E[words["thing"]]

# Step 4: define distances over class names
def word_distance(l):
    """Euclidean distance between the class vectors of the two class names in ``l``."""
    first, second = l
    first_vector = class_vector[first]
    second_vector = class_vector[second]
    return euclidean(first_vector, second_vector)

def word_cdistance(l):
    """Cosine distance between the class vectors of the two class names in ``l``."""
    first, second = l
    first_vector = class_vector[first]
    second_vector = class_vector[second]
    return cosine(first_vector, second_vector)

def word_norm_distance(l):
    """Euclidean distance between the two class vectors after scaling each to unit norm."""
    first, second = l
    raw_vectors = (class_vector[first], class_vector[second])
    unit_first, unit_second = (vector / np.linalg.norm(vector) for vector in raw_vectors)
    return euclidean(unit_first, unit_second)

# Sanity check displayed as the cell output: this should be around 1.167.
word_norm_distance(("dbo:OfficeHolder", "dbo:Politician"))




In [4]:
# Harmonic and geometric mean of the two class sizes of each pair.
_size_columns = df[["ca", "cb"]]
df["hsize"] = _size_columns.apply(stats.hmean, axis=1)
df["gsize"] = _size_columns.apply(stats.gmean, axis=1)

# Lexical distances between the two class names of each row.
_name_columns = df[["a", "b"]]
for _column, _metric in (
    ("cos", word_cdistance),
    ("euc", word_distance),
    ("neuc", word_norm_distance),
):
    df[_column] = _name_columns.apply(_metric, axis=1)

# Combined scores: mean of the max-scaled "taxo" column and one max-scaled
# lexical distance.
_taxo_scaled = df["taxo"] / df["taxo"].max()
for _combined, _lexical in (("taxcos", "cos"), ("taxeuc", "euc"), ("taxneuc", "neuc")):
    df[_combined] = 0.5 * (_taxo_scaled + df[_lexical] / df[_lexical].max())

fname = "results/separability/all_sized.csv"
df.to_csv(fname)