# Dbpedia and Caligraph---Reading files and preprocessing

In [1]:
import os, json
import numpy as np
base_path = !pwd
base_path = base_path[0]
list_files = [base_path+"/data/caligraph/"+f for f in os.listdir(base_path+"/data/caligraph/") if os.path.isfile(base_path+"/data/caligraph/"+f)]

In [2]:
list_files

['/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-provenance.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-instance-transitive-types.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-instance-to-dbpedia-mappings.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-instance-labels.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-ontology.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-instance-relations.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-instance-types.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-class-to-dbpedia.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-instance-provenance.nt']

In [3]:
with open(list_files[2]) as file:
    caligraph2dbpedia_mappings = file.readlines()

In [4]:
def get_map(mapping):
    x,_,y,_ = mapping.split()
    return x.strip('<>'), y.strip('<>')

In [5]:
caligraph2dbpedia_mappings = dict(map(lambda x: get_map(x), caligraph2dbpedia_mappings))

In [6]:
list(caligraph2dbpedia_mappings.items())[13]

('http://caligraph.org/resource/Cameroon_sheep',
 'http://dbpedia.org/resource/Cameroon_sheep')

In [7]:
#dbpedia2caligraph_mappings = {value: key for key,value in caligraph2dbpedia_mappings.items()}

In [8]:
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

In [9]:
word_vectors_caligraph = KeyedVectors.load("./Caligraph_Dbpedia/caligraph/caligraph-v211_500_4_sg_200_vectors.kv", mmap='r')

In [10]:
word_vectors_dbpedia = KeyedVectors.load("./Caligraph_Dbpedia/dbpedia/dbpedia.kv", mmap='r')

### There are mismatches between entity IRIs in 'caligraph2dbpedia_mappings' and those in the computed embeddings, see below. We will write a function that fixes it.

In [11]:
def repair_namespace(iri, kg='dbpedia'):
    if kg == 'dbpedia':
        if 'owl#' in iri:
            return iri
        iri = iri.replace('dbr:', 'http://dbpedia.org/resource/')
        return 'http://dbpedia.org/resource/' + iri.split('/')[-1]
    elif kg == 'caligraph':
        if 'owl#' in iri or 'ontology' in iri:
            return iri
        return 'http://caligraph.org/resource/' + iri.split('/')[-1]         

In [12]:
#Emb_keys_db = set(map(lambda t: repair_namespace(t), word_vectors_dbpedia.key_to_index.keys()))

In [13]:
#Emb_keys_cal = set(map(lambda t: repair_namespace(t, 'caligraph'), word_vectors_caligraph.key_to_index.keys()))

In [14]:
from tqdm import tqdm

### Creating entity to vector maps

In [15]:
entity2vec_db = {}
entity2vec_cal = {}

In [16]:
for ent in tqdm(word_vectors_dbpedia.key_to_index):
    try:
        entity2vec_db[repair_namespace(ent)] = np.array(word_vectors_dbpedia.get_vector(ent))
    except KeyError:
        if repair_namespace(ent) in entity2vec_db:
            entity2vec_db.pop(repair_namespace(ent))

100%|██████████| 15048578/15048578 [02:26<00:00, 102954.58it/s]


In [17]:
for ent in tqdm(word_vectors_caligraph.key_to_index):
    try:
        entity2vec_cal[repair_namespace(ent, 'caligraph')] = np.array(word_vectors_caligraph.get_vector(ent), )
    except KeyError:
        if repair_namespace(ent) in entity2vec_cal:
            entity2vec_cal.pop(repair_namespace(ent))

100%|██████████| 16429696/16429696 [02:50<00:00, 96382.36it/s] 


In [18]:
del word_vectors_dbpedia, word_vectors_caligraph

In [19]:
new_aligned_entity_dict = dict()

for key, value in tqdm(caligraph2dbpedia_mappings.items()):
    if key in entity2vec_cal and value in entity2vec_db:
        new_aligned_entity_dict.update({key: value})
print('There are ', len(new_aligned_entity_dict), ' aligned entities with available embeddings')

100%|██████████| 8320865/8320865 [00:25<00:00, 331921.01it/s]

There are  3745232  aligned entities with available embeddings





In [None]:
#with open('Caligraph_Dbpedia/caligraph2dbpediaalignment.json', 'w') as file:
#    json.dump(new_aligned_entity_dict, file, indent=3)

# Computing aligned KG embeddings using Orthogonal Procrustes

In [20]:
from sklearn.model_selection import train_test_split
from scipy.spatial import procrustes
from scipy.linalg import orthogonal_procrustes
import time, gc

## Get the embedding matrices of aligned an non-aligned entities

In [1]:
def get_source_and_target_matrices(alignment_dict, entity2vec1, entity2vec2, emb_dim=200, test_size=0.1):
    """This function takes the dictionary of aligned entities between two KGs and their corresponding embeddings (as entity to vector dictionaries)
    and returns S, T, S_eval, T_eval, and R defined as follows:
    
    -- S: Normalized large subset of the source embeddings, i.e. the matrix of aligned entity embeddings in the first knowledge graph
    
    -- T: Normalized large subset of the matrix of aligned entity embeddings in the second knowledge graph
    
    -- S_eval and T_eval are portions of S and T sampled for evaluation if test_size > 0
    
    -- R: The rotation matrix that most closely maps S to T, i.e. ||A@S-T|| is minimized
    """
    if test_size > 0:
        train_ents, eval_ents = train_test_split(list(alignment_dict.keys()), test_size=test_size, random_state=42)
    else:
        train_ents = alignment_dict.keys()
    
    S = np.empty((len(train_ents), emb_dim))
    T = np.empty((len(train_ents), emb_dim))
    if test_size > 0:
        S_eval = np.empty((len(eval_ents), emb_dim))
        T_eval = np.empty((len(eval_ents), emb_dim))

    for i, key in tqdm(enumerate(train_ents), total=len(train_ents), desc='Computing S and T'):
        S[i] = entity2vec1[key] if isinstance(entity2vec1, dict) else entity2vec1.loc[key].values
        T[i] = entity2vec2[alignment_dict[key]] if isinstance(entity2vec2, dict) else entity2vec2.loc[alignment_dict[key]].values
        
    if test_size > 0:
        for i, key in tqdm(enumerate(eval_ents), total=len(eval_ents), desc='Computing S_eval and T_eval'):
            S_eval[i] = entity2vec1[key] if isinstance(entity2vec1, dict) else entity2vec1.loc[key].values
            T_eval[i] = entity2vec2[alignment_dict[key]] if isinstance(entity2vec2, dict) else entity2vec2.loc[alignment_dict[key]].values
        
    print('\nNow computing R...')
    
    t0 = time.time()
    R, scale = orthogonal_procrustes(S/np.sqrt((S**2).sum()), T/np.sqrt((T**2).sum()), check_finite=True)
    print('\nCompleted after '+str(time.time()-t0)+' seconds')
    print('scale: ', scale)
    
    if test_size > 0:
        return S/np.sqrt((S**2).sum()), T/np.sqrt((T**2).sum()), S_eval/np.sqrt((S_eval**2).sum()), T_eval/np.sqrt((T_eval**2).sum()), R
    else:
        return S/np.sqrt((S**2).sum()), T/np.sqrt((T**2).sum()), R
    

In [2]:
def get_non_aligned_entity_embedding_matrices(alignment_dict, entity2vec1, entity2vec2, emb_dim=200):
    """
    Inputs the dictionary of aligned entities between two KGs and their corresponding embeddings, and returns the normalized embedding matrices of 
    
    non-aligned entities
    """
    A_neg_S = np.empty((len(entity2vec1)-len(alignment_dict), emb_dim))
    keys = sorted(set(entity2vec1.keys() if isinstance(entity2vec1, dict) else entity2vec1.index)-set(alignment_dict.keys()))
    for i, key in tqdm(enumerate(keys), total=A_neg_S.shape[0], desc='Computing A_neg_S...'):
        A_neg_S[i] = entity2vec1[key] if isinstance(entity2vec1, dict) else entity2vec1.loc[key].values
    
    B_neg_T = np.empty((len(entity2vec2)-len(alignment_dict), emb_dim))
    keys = sorted(set(entity2vec2.keys() if isinstance(entity2vec2, dict) else entity2vec2.index)-set(alignment_dict.values()))
    for i, key in tqdm(enumerate(keys), total=B_neg_T.shape[0], desc='Computing B_neg_T...'):
        B_neg_T[i] = entity2vec2[key] if isinstance(entity2vec2, dict) else entity2vec2.loc[key].values
    
    return A_neg_S/np.sqrt((A_neg_S**2).sum()), B_neg_T/np.sqrt((B_neg_T**2).sum())
    

## Evaluating Procrustes alignment

In [23]:
S, T, S_eval, T_eval, R = get_source_and_target_matrices(new_aligned_entity_dict, entity2vec_cal, entity2vec_db, test_size=0.1)

Computing S and T: 100%|██████████| 3370708/3370708 [00:32<00:00, 105310.01it/s]
Computing S_eval and T_eval: 100%|██████████| 374524/374524 [00:02<00:00, 131021.31it/s]



Now computing R...

Completed after 22.48607349395752 seconds
scale:  0.4375346570394851


In [24]:
from sklearn.neighbors import NearestNeighbors
import random

In [3]:
def evaluate_alignment_knn(S_eval, T_eval, R):
    """The function takes the evaluation sets, i.e. correct alignments that were left out, and returns the accuracy computed as the proportion
    of correct alignment predictions
    
    """
    print('#'*50)
    print('Evaluation started...')
    print('#'*50)
    model = NearestNeighbors(n_neighbors=1, n_jobs=-1)
    print('Fitting...')
    model.fit(S_eval@R)
    print('Predicting...')
    preds = model.kneighbors(T_eval, 1, return_distance=False)
    preds = preds.reshape(-1,)
    acc = (preds == np.arange(S_eval.shape[0])).astype(float).sum()
    return acc / S_eval.shape[0]

In [5]:
def evaluate_alignment(S_eval, T_eval, R, num_candidates=10):
    """The function takes the evaluation sets, i.e. correct alignments that were left out, and returns the accuracy computed as the proportion
    of correct alignment predictions
    
    """
    print('#'*50)
    print('Evaluation started...')
    print('#'*50)
    acc = 0
    ids = list(range(S_eval.shape[0]))
    for i in tqdm(range(S_eval.shape[0])):
        s_i = S_eval[i][None, :]@R
        rand_ids = list(set(random.sample(ids, k=num_candidates))-{i})
        candidates = np.concatenate([T_eval[i][None, :], T_eval[rand_ids[:num_candidates-1]]], axis=0)
        acc += ((candidates-s_i)**2).sum(1).squeeze().argmin() == 1
    return acc / S_eval.shape[0]

In [None]:
acc = evaluate_alignment(S_eval.astype(np.float16), T_eval.astype(np.float16), R.astype(np.float16))
print('Accuracy on validation data: ', acc)

In [27]:
acc = evaluate_alignment_knn(S_eval.astype(np.float16), T_eval.astype(np.float16), R.astype(np.float16))
print('Accuracy on validation data: ', acc)

##################################################
Evaluation started...
##################################################
Fitting...
Predicting...
Accuracy on validation data:  1.0


## Computing and storing universal embeddings

In [None]:
#del S, T, S_eval, T_eval, R
#gc.collect()

In [None]:
list_merged_entities = sorted(set(entity2vec_cal.keys())-set(new_aligned_entity_dict.keys())) +\
sorted(set(entity2vec_db.keys())-set(new_aligned_entity_dict.values())) + \
list(new_aligned_entity_dict.keys())
with open('Caligraph_Dbpedia/list_merged_entities_cal_db.txt', 'w') as file:
    file.write(','.join(list_merged_entities))
del list_merged_entities

In [None]:
gc.collect()

In [None]:
S, T, R = get_source_and_target_matrices(new_aligned_entity_dict, entity2vec_cal, entity2vec_db, test_size=0.0)

In [None]:
A_neg_S, B_neg_T = get_non_aligned_entity_embedding_matrices(new_aligned_entity_dict, entity2vec_cal, entity2vec_db)

In [None]:
del entity2vec_cal, entity2vec_db
gc.collect()

In [None]:
# compute every s_i as (s_i@R+t_i)/2
S = (S@R + T)/2
del T
gc.collect()
Universal_Emb = np.concatenate([A_neg_S@R, B_neg_T, S], axis=0)

In [None]:
np.save('Caligraph_Dbpedia/Universal_Emb.npy', Universal_Emb)

In [None]:
Universal_Emb.shape

# French and English Dbpedia

In [6]:
import torch, pandas as pd
import json
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.spatial import procrustes
from scipy.linalg import orthogonal_procrustes
import time, gc
from sklearn.neighbors import NearestNeighbors
import random
from tqdm import tqdm

In [7]:
random.seed(42)

In [8]:
def load_embeddings(full_embedding_path, entity_id_map):
    print('Loading embeddings...')
    model = torch.load(full_embedding_path, map_location='cpu')
    with open(entity_id_map) as file:
        entity_id_map = json.load(file)
    ent_emb = pd.DataFrame(model.entity_embeddings._embeddings.weight.data.tolist(), index=list(entity_id_map.keys()))
    return ent_emb

In [9]:
fr_dbpedia_emb = load_embeddings('Fr_En_Dbpedia/Fr/embeddings/TransE/trained_model.pkl', 'Fr_En_Dbpedia/Fr/embeddings/TransE/entity_to_ids.json')
eng_dbpedia_emb = load_embeddings('Fr_En_Dbpedia/En/embeddings/TransE/trained_model.pkl', 'Fr_En_Dbpedia/En/embeddings/TransE/entity_to_ids.json')

Loading embeddings...
Loading embeddings...


In [7]:
fr_dbpedia_emb.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-0.009833,-0.012834,-0.115936,-0.040705,0.076594,-0.03435,-0.020555,0.042299,0.027486,-0.074162,...,0.050411,-0.071838,0.001301,-0.057446,0.001407,0.041489,0.122214,0.017487,0.109715,-0.065048
1,-0.021399,-0.031363,-0.058264,0.04501,0.084431,-0.033651,-0.048864,0.000954,0.034992,-0.049416,...,-0.083037,-0.06391,-0.010611,-0.039704,-0.009805,0.044853,-0.029519,0.105641,0.093731,-0.015521
10,0.086678,0.013267,-0.06266,0.040125,0.051002,-0.063431,-0.060855,-0.007261,0.036402,-0.072729,...,-0.077436,-0.072368,-0.044481,-0.0727,-0.018492,0.030871,-0.049264,0.066449,0.063988,-0.029479


In [8]:
eng_dbpedia_emb.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
10500,-0.006989,0.000777,-0.113241,0.046147,0.030751,-0.080907,-0.059351,-0.129297,0.073091,-0.028684,...,-0.147443,-0.089282,-0.004109,-0.091981,0.008184,-0.005688,-0.051136,-0.028319,-0.025681,-0.146335
10501,-0.011741,0.06958,-0.141341,0.075984,0.083039,-0.02997,-0.032043,-0.016951,0.021821,0.04054,...,-0.127512,-0.049333,-0.035813,-0.0597,0.032912,0.008938,-0.022301,0.016085,0.077748,-0.144073
10502,0.019575,0.049873,-0.130476,0.071612,0.063691,-0.046625,-0.059835,-0.05737,0.06284,0.0157,...,-0.126775,-0.073543,-0.017385,-0.066693,0.02184,-0.00418,-0.035554,0.007543,0.059197,-0.19877


In [41]:
fr_dbpedia_emb.shape

(19661, 200)

In [42]:
eng_dbpedia_emb.shape

(19993, 200)

In [10]:
with open('Fr_En_Dbpedia/ref_ent_ids') as file:
    mapping = file.readlines()

In [11]:
fr_to_eng_ids = dict(list(map(lambda x: x.strip('\n').split('\t'), mapping)))

In [12]:
S, T, S_eval, T_eval, R = get_source_and_target_matrices(fr_to_eng_ids, fr_dbpedia_emb, eng_dbpedia_emb, test_size=0.1)

Computing S and T: 100%|██████████| 13500/13500 [00:02<00:00, 5499.36it/s]
Computing S_eval and T_eval: 100%|██████████| 1500/1500 [00:00<00:00, 5187.21it/s]



Now computing R...

Completed after 0.10346341133117676 seconds
scale:  0.9272847214147476


### Evaluate entity alignment

In [13]:
acc = evaluate_alignment(S_eval.astype(np.float32), T_eval.astype(np.float32), R.astype(np.float32))
print('Accuracy on validation data: ', acc)

##################################################
Evaluation started...
##################################################


100%|██████████| 1500/1500 [00:00<00:00, 11039.19it/s]

Accuracy on validation data:  0.052





In [14]:
acc = evaluate_alignment_knn(S_eval.astype(np.float32), T_eval.astype(np.float32), R.astype(np.float32))
print('Accuracy on validation data: ', acc)

##################################################
Evaluation started...
##################################################
Fitting...
Predicting...
Accuracy on validation data:  0.018666666666666668


## Compute and store universal embeddings

In [57]:
## We now want true entity IRIs. We only have their key ids
with open('Fr_En_Dbpedia/ent_ids_1') as file:
    entity_names_map_fr = file.readlines()

with open('Fr_En_Dbpedia/ent_ids_2') as file:
    entity_names_map_eng = file.readlines()
    
id_to_name_fr = dict(list(map(lambda x: x.strip('\n').split('\t'), entity_names_map_fr)))
id_to_name_eng = dict(list(map(lambda x: x.strip('\n').split('\t'), entity_names_map_eng)))

true_merged_entity_names = list(map(id_to_name_fr.get, sorted(set(fr_dbpedia_emb.index)-set(fr_to_eng_ids.keys())))) + \
                           list(map(id_to_name_eng.get, sorted(set(eng_dbpedia_emb.index)-set(fr_to_eng_ids.values())))) + \
                           list(map(id_to_name_fr.get, list(fr_to_eng_ids.keys())))
print(f'Total number of merged entities: {len(true_merged_entity_names)}')
with open('Fr_En_Dbpedia/list_merged_entities_Fr_Eng_dbpedia.txt', 'w') as file:
    file.write(','.join(true_merged_entity_names))

Total number of merged entities: 24654


In [70]:
Fr_to_Eng_entity_names = dict(zip(list(map(id_to_name_fr.get, fr_to_eng_ids.keys())),\
                                 list(map(id_to_name_eng.get, fr_to_eng_ids.values()))))

Eng_to_Fr_entity_names = {value:key for key,value in Fr_to_Eng_entity_names.items()}

with open('Fr_En_Dbpedia/Fr_to_Eng_entity_names.json', 'w') as file:
    json.dump(Fr_to_Eng_entity_names, file, ensure_ascii=False)
    
with open('Fr_En_Dbpedia/Eng_to_Fr_entity_names.json', 'w') as file:
    json.dump(Eng_to_Fr_entity_names, file, ensure_ascii=False)

In [59]:
S, T, R = get_source_and_target_matrices(fr_to_eng_ids, fr_dbpedia_emb, eng_dbpedia_emb, test_size=0.0)

Computing S and T: 100%|██████████| 15000/15000 [00:02<00:00, 5226.19it/s]



Now computing R...

Completed after 0.11648726463317871 seconds
scale:  0.9272167303554676


In [62]:
A_neg_S, B_neg_T = get_non_aligned_entity_embedding_matrices(fr_to_eng_ids, fr_dbpedia_emb, eng_dbpedia_emb)

Computing A_neg_S...: 100%|██████████| 4661/4661 [00:00<00:00, 10899.23it/s]
Computing B_neg_T...: 100%|██████████| 4993/4993 [00:00<00:00, 10936.88it/s]


In [63]:
# compute every s_i as (s_i@R+t_i)/2
S = (S@R + T)/2
gc.collect()
Universal_Emb = np.concatenate([A_neg_S@R, B_neg_T, S], axis=0)

In [64]:
np.save('Fr_En_Dbpedia/Universal_Emb.npy', Universal_Emb)