# Reading files and preprocessing

In [1]:
import os, json
import numpy as np
base_path = !pwd
base_path = base_path[0]
list_files = [base_path+"/data/caligraph/"+f for f in os.listdir(base_path+"/data/caligraph/") if os.path.isfile(base_path+"/data/caligraph/"+f)]

In [2]:
list_files

['/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-provenance.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-instance-transitive-types.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-instance-to-dbpedia-mappings.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-instance-labels.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-ontology.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-instance-relations.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-instance-types.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-class-to-dbpedia.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-instance-provenance.nt']

In [3]:
with open(list_files[2]) as file:
    caligraph2dbpedia_mappings = file.readlines()

In [4]:
def get_map(mapping):
    x,_,y,_ = mapping.split()
    return x.strip('<>'), y.strip('<>')

In [5]:
caligraph2dbpedia_mappings = dict(map(lambda x: get_map(x), caligraph2dbpedia_mappings))

In [6]:
list(caligraph2dbpedia_mappings.items())[13]

('http://caligraph.org/resource/Cameroon_sheep',
 'http://dbpedia.org/resource/Cameroon_sheep')

In [7]:
#dbpedia2caligraph_mappings = {value: key for key,value in caligraph2dbpedia_mappings.items()}

In [8]:
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

In [9]:
word_vectors_caligraph = KeyedVectors.load("./Caligraph_Dbpedia/caligraph/caligraph-v211_500_4_sg_200_vectors.kv", mmap='r')

In [10]:
word_vectors_dbpedia = KeyedVectors.load("./Caligraph_Dbpedia/dbpedia/dbpedia.kv", mmap='r')

### There are mismatches between entity IRIs in 'caligraph2dbpedia_mappings' and those in the computed embeddings, see below. We will write a function that fixes it.

In [11]:
def repair_namespace(iri, kg='dbpedia'):
    if kg == 'dbpedia':
        if 'owl#' in iri:
            return iri
        iri = iri.replace('dbr:', 'http://dbpedia.org/resource/')
        return 'http://dbpedia.org/resource/' + iri.split('/')[-1]
    elif kg == 'caligraph':
        if 'owl#' in iri or 'ontology' in iri:
            return iri
        return 'http://caligraph.org/resource/' + iri.split('/')[-1]         

In [12]:
#Emb_keys_db = set(map(lambda t: repair_namespace(t), word_vectors_dbpedia.key_to_index.keys()))

In [13]:
#Emb_keys_cal = set(map(lambda t: repair_namespace(t, 'caligraph'), word_vectors_caligraph.key_to_index.keys()))

In [14]:
from tqdm import tqdm

### Creating entity to vector maps

In [15]:
entity2vec_db = {}
entity2vec_cal = {}

In [16]:
for ent in tqdm(word_vectors_dbpedia.key_to_index):
    try:
        entity2vec_db[repair_namespace(ent)] = np.array(word_vectors_dbpedia.get_vector(ent))
    except KeyError:
        if repair_namespace(ent) in entity2vec_db:
            entity2vec_db.pop(repair_namespace(ent))

100%|██████████| 15048578/15048578 [02:18<00:00, 108524.23it/s]


In [17]:
for ent in tqdm(word_vectors_caligraph.key_to_index):
    try:
        entity2vec_cal[repair_namespace(ent, 'caligraph')] = np.array(word_vectors_caligraph.get_vector(ent), )
    except KeyError:
        if repair_namespace(ent) in entity2vec_cal:
            entity2vec_cal.pop(repair_namespace(ent))

100%|██████████| 16429696/16429696 [02:19<00:00, 117376.20it/s]


In [18]:
del word_vectors_dbpedia, word_vectors_caligraph

In [19]:
new_aligned_entity_dict = dict()

for key, value in tqdm(caligraph2dbpedia_mappings.items()):
    if key in entity2vec_cal and value in entity2vec_db:
        new_aligned_entity_dict.update({key: value})
print('There are ', len(new_aligned_entity_dict), ' aligned entities with available embeddings')

100%|██████████| 8320865/8320865 [00:20<00:00, 402377.94it/s]

There are  3745232  aligned entities with available embeddings





In [None]:
#with open('Caligraph_Dbpedia/caligraph2dbpediaalignment.json', 'w') as file:
#    json.dump(new_aligned_entity_dict, file, indent=3)

# Computing aligned KG embeddings using Orthogonal Procrustes

In [20]:
from sklearn.model_selection import train_test_split
from scipy.spatial import procrustes
from scipy.linalg import orthogonal_procrustes
import time, gc

## Get the embedding matrices of aligned entities

In [21]:
def get_source_and_target_matrices(alignment_dict, entity2vec1, entity2vec2, test_size=0.1):
    """This function takes the dictionary of aligned entities between two KGs and their corresponding embeddings (as entity to vector dictionaries)
    and returns S, T, S_eval, T_eval, and R defined as follows:
    
    -- S: Normalized large subset of the source embeddings, i.e. the matrix of aligned entity embeddings in the first knowledge graph
    
    -- T: Normalized large subset of the matrix of aligned entity embeddings in the second knowledge graph
    
    -- S_eval and T_eval are portions of S and T sampled for evaluation if test_size > 0
    
    -- R: The rotation matrix that most closely maps S to T, i.e. ||A@S-T|| is minimized
    """
    if test_size > 0:
        train_ents, eval_ents = train_test_split(list(alignment_dict.keys()), test_size=test_size)
    else:
        train_ents = alignment_dict.keys()
    
    S = np.empty((len(train_ents), 200), )
    T = np.empty((len(train_ents), 200), )
    if test_size > 0:
        S_eval = np.empty((len(eval_ents), 200), )
        T_eval = np.empty((len(eval_ents), 200), )

    for i, key in tqdm(enumerate(train_ents), total=len(train_ents), desc='Computing S and T'):
        S[i] = entity2vec_cal[key]
        T[i] = entity2vec_db[new_aligned_entity_dict[key]]
        
    if test_size > 0:
        for i, key in tqdm(enumerate(eval_ents), total=len(eval_ents), desc='Computing S_eval and T_eval'):
            S_eval[i] = entity2vec_cal[key]
            T_eval[i] = entity2vec_db[new_aligned_entity_dict[key]]
        
    print('\nNow computing R...')
    
    t0 = time.time()
    R, scale = orthogonal_procrustes(S/np.sqrt((S**2).sum()), T/np.sqrt((T**2).sum()), check_finite=True)
    print('\nCompleted after '+str(time.time()-t0)+' seconds')
    print('scale: ', scale)
    
    if test_size > 0:
        return S/np.sqrt((S**2).sum()), T/np.sqrt((T**2).sum()), S_eval/np.sqrt((S_eval**2).sum()), T_eval/np.sqrt((T_eval**2).sum()), R
    else:
        return S/np.sqrt((S**2).sum()), T/np.sqrt((T**2).sum()), R
    

In [23]:
def get_non_aligned_entity_embedding_matrices(alignment_dict, entity2vec1, entity2vec2):
    """
    Inputs the dictionary of aligned entities between two KGs and their corresponding embeddings, and returns the normalized embedding matrices of 
    
    non-aligned entities
    """
    A_neg_S = np.empty((len(entity2vec1)-len(alignment_dict), 200))
    keys = sorted(set(entity2vec1.keys())-set(alignment_dict.keys()))
    for i, key in tqdm(enumerate(keys), total=A_neg_S.shape[0], desc='Computing A_neg_S...'):
        A_neg_S[i] = entity2vec1[key]
    
    B_neg_T = np.empty((len(entity2vec2)-len(alignment_dict), 200))
    keys = sorted(set(entity2vec2.keys())-set(alignment_dict.values()))
    for i, key in tqdm(enumerate(keys), total=B_neg_T.shape[0], desc='Computing B_neg_T...'):
        B_neg_T[i] = entity2vec2[key]
    
    return A_neg_S/np.sqrt((A_neg_S**2).sum()), B_neg_T/np.sqrt((B_neg_T**2).sum())
    

## Evaluating Procrustes alignment

In [22]:
S, T, S_eval, T_eval, R = get_source_and_target_matrices(new_aligned_entity_dict, entity2vec_cal, entity2vec_db, test_size=0.1)

Computing S and T: 100%|██████████| 3370708/3370708 [00:28<00:00, 118221.79it/s]
Computing S_eval and T_eval: 100%|██████████| 374524/374524 [00:02<00:00, 136812.51it/s]



Now computing R...

Completed after 18.258718013763428 seconds
scale:  0.43751756553668897


In [67]:
from sklearn.neighbors import NearestNeighbors
import random

In [68]:
def evaluate_alignment_knn(S_eval, T_eval, R):
    """The function takes the evaluation sets, i.e. correct alignments that were left out, and returns the accuracy computed as the proportion
    of correct alignment predictions
    
    """
    print('#'*50)
    print('Evaluation started...')
    print('#'*50)
    model = NearestNeighbors(n_neighbors=1, n_jobs=-1)
    print('Fitting...')
    model.fit(S_eval@R)
    print('Predicting...')
    preds = model.kneighbors(T_eval, 1, return_distance=False)
    preds = preds.reshape(-1,)
    acc = (np.array(preds) == np.arange(S_eval.shape[0])).astype(float).sum()
    #for i in tqdm(range(S_eval.shape[0])):
    #    row = S_eval[i][None, :]
    #    acc += ((T_eval - (row@R).squeeze())**2).sum(axis=1).argmin() == i
    #    #if i > 0 and i%100 == 0:
    #    #    print('acc:', acc/i)
    return acc / S_eval.shape[0]

In [73]:
def evaluate_alignment(S_eval, T_eval, R, num_candidates=10):
    """The function takes the evaluation sets, i.e. correct alignments that were left out, and returns the accuracy computed as the proportion
    of correct alignment predictions
    
    """
    print('#'*50)
    print('Evaluation started...')
    print('#'*50)
    acc = 0
    ids = list(range(S_eval.shape[0]))
    for i in tqdm(range(S_eval.shape[0])):
        s_i = S_eval[i][None, :]@R
        rand_ids = list(set(random.sample(ids, k=num_candidates))-{i})
        candidates = np.concatenate([T_eval[i][None, :], T_eval[rand_ids[:num_candidates-1]]], axis=0)
        acc += ((candidates-s_i)**2).sum(1).squeeze().argmin() == 1
    return acc / S_eval.shape[0]

In [74]:
acc = evaluate_alignment(S_eval.astype(np.float16), T_eval.astype(np.float16), R.astype(np.float16))
print('Accuracy on validation data: ', acc)

##################################################
Evaluation started...
##################################################


100%|██████████| 374524/374524 [05:20<00:00, 1168.86it/s]

Accuracy on validation data:  0.08155685616943106





In [75]:
acc = evaluate_alignment_knn(S_eval.astype(np.float16), T_eval.astype(np.float16), R.astype(np.float16))
print('Accuracy on validation data: ', acc)

##################################################
Evaluation started...
##################################################
Fitting...
Predicting...
Accuracy on validation data:  0.0003497773173414788


## Computing and storing universal embeddings

In [None]:
#del S, T, S_eval, T_eval, R
#gc.collect()

In [25]:
list_merged_entities = sorted(set(entity2vec_cal.keys())-set(new_aligned_entity_dict.keys())) +\
sorted(set(entity2vec_db.keys())-set(new_aligned_entity_dict.values())) + \
list(new_aligned_entity_dict.keys())
with open('Caligraph_Dbpedia/list_merged_entities_cal_db.txt', 'w') as file:
    file.write(','.join(list_merged_entities))
del list_merged_entities

In [26]:
gc.collect()

0

In [27]:
S, T, R = get_source_and_target_matrices(new_aligned_entity_dict, entity2vec_cal, entity2vec_db, test_size=0.0)

Computing S and T: 100%|██████████| 3745232/3745232 [00:29<00:00, 128254.38it/s]



Now computing R...

Completed after 16.62036395072937 seconds
scale:  0.437522492358588


In [28]:
A_neg_S, B_neg_T = get_non_aligned_entity_embedding_matrices(new_aligned_entity_dict, entity2vec_cal, entity2vec_db)

Computing A_neg_S...: 100%|██████████| 12670001/12670001 [00:37<00:00, 341252.82it/s]
Computing B_neg_T...: 100%|██████████| 6280733/6280733 [00:21<00:00, 291409.57it/s]


In [29]:
del entity2vec_cal, entity2vec_db
gc.collect()

0

In [30]:
# compute every s_i as (s_i@R+t_i)/2
S = (S@R + T)/2
del T
gc.collect()
Universal_Emb = np.concatenate([A_neg_S@R, B_neg_T, S], axis=0)

In [33]:
np.save('Caligraph_Dbpedia/Universal_Emb.npy', Universal_Emb)

In [32]:
Universal_Emb.shape

(22695966, 200)