# Dbpedia and Caligraph---Reading files and preprocessing

In [1]:
import os, json
import numpy as np
import time, gc
gc.enable()
base_path = !pwd
base_path = base_path[0]
list_files = [base_path+"/data/caligraph/"+f for f in os.listdir(base_path+"/data/caligraph/") if os.path.isfile(base_path+"/data/caligraph/"+f)]

In [2]:
list_files

['/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-provenance.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-instance-transitive-types.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-instance-to-dbpedia-mappings.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-instance-labels.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-ontology.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-instance-relations.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-instance-types.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-class-to-dbpedia.nt',
 '/home/nkouagou/Documents/Universal_Embeddings/data/caligraph/caligraph-instance-provenance.nt']

In [3]:
with open(list_files[2]) as file:
    caligraph2dbpedia_mappings = file.readlines()

In [4]:
def get_map(mapping):
    x,_,y,_ = mapping.split()
    return x.strip('<>'), y.strip('<>')

In [5]:
caligraph2dbpedia_mappings = dict(map(lambda x: get_map(x), caligraph2dbpedia_mappings))

In [6]:
list(caligraph2dbpedia_mappings.items())[13]

('http://caligraph.org/resource/Cameroon_sheep',
 'http://dbpedia.org/resource/Cameroon_sheep')

In [7]:
#dbpedia2caligraph_mappings = {value: key for key,value in caligraph2dbpedia_mappings.items()}

In [8]:
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

In [9]:
word_vectors_caligraph = KeyedVectors.load("./Caligraph_Dbpedia/caligraph/caligraph-v211_500_4_sg_200_vectors.kv", mmap='r')

In [10]:
word_vectors_dbpedia = KeyedVectors.load("./Caligraph_Dbpedia/dbpedia/dbpedia.kv", mmap='r')

### There are mismatches between entity IRIs in 'caligraph2dbpedia_mappings' and those in the computed embeddings, see below. We will write a function that fixes it.

In [11]:
def repair_namespace(iri, kg='dbpedia'):
    if kg == 'dbpedia':
        if 'owl#' in iri:
            return iri
        iri = iri.replace('dbr:', 'http://dbpedia.org/resource/')
        return 'http://dbpedia.org/resource/' + iri.split('/')[-1]
    elif kg == 'caligraph':
        if 'owl#' in iri or 'ontology' in iri:
            return iri
        return 'http://caligraph.org/resource/' + iri.split('/')[-1]         

In [12]:
#Emb_keys_db = set(map(lambda t: repair_namespace(t), word_vectors_dbpedia.key_to_index.keys()))

In [13]:
#Emb_keys_cal = set(map(lambda t: repair_namespace(t, 'caligraph'), word_vectors_caligraph.key_to_index.keys()))

In [12]:
from tqdm import tqdm

### Creating entity to vector maps

In [13]:
entity2vec_db = {}
entity2vec_cal = {}

In [14]:
for ent in tqdm(word_vectors_dbpedia.key_to_index):
    try:
        entity2vec_db[repair_namespace(ent)] = np.array(word_vectors_dbpedia.get_vector(ent)).astype(np.float16)
    except KeyError:
        if repair_namespace(ent) in entity2vec_db:
            entity2vec_db.pop(repair_namespace(ent))

100%|██████████| 15048578/15048578 [02:47<00:00, 89993.95it/s] 


In [15]:
for ent in tqdm(word_vectors_caligraph.key_to_index):
    try:
        entity2vec_cal[repair_namespace(ent, 'caligraph')] = np.array(word_vectors_caligraph.get_vector(ent)).astype(np.float16)
    except KeyError:
        if repair_namespace(ent) in entity2vec_cal:
            entity2vec_cal.pop(repair_namespace(ent))

100%|██████████| 16429696/16429696 [03:04<00:00, 88928.70it/s]


In [16]:
del word_vectors_dbpedia, word_vectors_caligraph
gc.collect()

0

In [23]:
#new_aligned_entity_dict = dict()
#
#for key, value in tqdm(caligraph2dbpedia_mappings.items()):
#    if key in entity2vec_cal and value in entity2vec_db:
#        new_aligned_entity_dict.update({key: value})
#
#
#print('There are ', len(new_aligned_entity_dict), ' aligned entities with available embeddings')

with open('Caligraph_Dbpedia/caligraph2dbpediaalignment.json', 'r') as file:
    new_aligned_entity_dict = json.load(file)

# Computing aligned KG embeddings with a simple neural network

In [20]:
import torch, torch.nn as nn
from torch.optim import Adam

In [26]:
class AlignmentModel(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.emb_dim = emb_dim
        self.R = torch.nn.Parameter(
                         torch.tensor(np.random.uniform(-1, 1, (emb_dim, emb_dim)),
                         dtype=torch.float, requires_grad=True))
        
    def forward(self, s):
        return torch.mm(s, self.R)

In [24]:
def get_batch(S, T, batch_size=128):
    for i in range(0, S.shape[0]-batch_size+1, batch_size):
        yield S[i:i+batch_size], T[i:i+batch_size]


def train(model, lr, epochs, S, T, batch_size=128):
    optimizer = Adam(model.parameters(), lr=lr)
    Loss = nn.MSELoss()
    for epoch in range(epochs):
        loss_epoch = 0
        for S_batch, T_batch in tqdm(get_batch(S, T, batch_size=batch_size), total=S.shape[0]//batch_size):
            proj = model(S_batch)
            loss = Loss(proj, T_batch)
            loss_epoch += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(f'Loss epoch {epoch} : {loss_epoch}')

In [30]:
#with open('Caligraph_Dbpedia/caligraph2dbpediaalignment.json', 'w') as file:
#    json.dump(new_aligned_entity_dict, file, indent=3)

# Computing aligned KG embeddings using Orthogonal Procrustes

In [17]:
from sklearn.model_selection import train_test_split
from scipy.spatial import procrustes
from scipy.linalg import orthogonal_procrustes

## Get the embedding matrices of aligned an non-aligned entities

In [18]:
def get_source_and_target_matrices(alignment_dict, entity2vec1, entity2vec2, given_test_set=None, emb_dim=200, test_size=0.1):
    """This function takes the dictionary of aligned entities between two KGs and their corresponding embeddings (as entity to vector dictionaries)
    and returns S, T, S_eval, T_eval, and R defined as follows:
    
    -- S: Normalized and scaled large subset of the source embeddings, i.e. the matrix of aligned entity embeddings in the first knowledge graph
    
    -- T: Normalized and scaled large subset of the matrix of aligned entity embeddings in the second knowledge graph
    
    -- S_eval and T_eval are portions of S and T sampled for evaluation if test_size > 0
    
    -- R: The rotation matrix that most closely maps S to T, i.e. ||A@S-T|| is minimized
    
    The mean and standard deviation of S, T are also returned
    """
    if test_size > 0:
        if given_test_set is None:
            train_ents, eval_ents = train_test_split(list(alignment_dict.keys()), test_size=test_size, random_state=42)
        else:
            eval_ents = given_test_set
            train_ents = list(set(alignment_dict.keys())-set(eval_ents))
    else:
        train_ents = alignment_dict.keys()
        
    S = np.empty((len(train_ents), emb_dim))
    T = np.empty((len(train_ents), emb_dim))
    if test_size > 0:
        S_eval = np.empty((len(eval_ents), emb_dim))
        T_eval = np.empty((len(eval_ents), emb_dim))

    for i, key in tqdm(enumerate(train_ents), total=len(train_ents), desc='Computing S and T'):
        S[i] = entity2vec1[key] if isinstance(entity2vec1, dict) else entity2vec1.loc[key].values
        T[i] = entity2vec2[alignment_dict[key]] if isinstance(entity2vec2, dict) else entity2vec2.loc[alignment_dict[key]].values
        
    if test_size > 0:
        for i, key in tqdm(enumerate(eval_ents), total=len(eval_ents), desc='Computing S_eval and T_eval'):
            S_eval[i] = entity2vec1[key] if isinstance(entity2vec1, dict) else entity2vec1.loc[key].values
            T_eval[i] = entity2vec2[alignment_dict[key]] if isinstance(entity2vec2, dict) else entity2vec2.loc[alignment_dict[key]].values
        
    print('\nNow computing R...')
    # Center and scale data
    mean_S = S.mean(axis=0)
    mean_T = T.mean(axis=0)
    scale_S = np.sqrt(((S-mean_S)**2).sum()/S.shape[0]) # scale, see https://en.wikipedia.org/wiki/Procrustes_analysis
    scale_T = np.sqrt(((T-mean_T)**2).sum()/T.shape[0])
    print('Scale S: ', scale_S)
    
    t0 = time.time()
    R, loss = orthogonal_procrustes((S-mean_S)/scale_S, (T-mean_T)/scale_T, check_finite=True)
    print('\nCompleted after '+str(time.time()-t0)+' seconds')
    print('Alignment loss: ', loss)
    if test_size > 0:
        return scale_S, scale_T, mean_S, mean_T, (S-mean_S)/scale_S, (T-mean_T)/scale_T, (S_eval-mean_S)/scale_S, (T_eval-mean_T)/scale_T, R
        #return scale_S, scale_T, mean_S, mean_T, S, T, S_eval, T_eval, R
    else:
        return scale_S, scale_T, mean_S, mean_T, (S-mean_S)/scale_S, (T-mean_T)/scale_T, R


In [19]:
def get_non_aligned_entity_embedding_matrices(alignment_dict, entity2vec1, entity2vec2, scale_S, scale_T, mean_S, mean_T, emb_dim=200):
    """
    Inputs the dictionary of aligned entities between two KGs and their corresponding embeddings, and returns the normalized embedding matrices of 
    
    non-aligned entities
    """
    A_neg_S = np.empty((len(entity2vec1)-len(alignment_dict), emb_dim))
    keys = sorted(set(entity2vec1.keys() if isinstance(entity2vec1, dict) else entity2vec1.index)-set(alignment_dict.keys()))
    for i, key in tqdm(enumerate(keys), total=A_neg_S.shape[0], desc='Computing A_neg_S...'):
        A_neg_S[i] = entity2vec1[key] if isinstance(entity2vec1, dict) else entity2vec1.loc[key].values
    
    B_neg_T = np.empty((len(entity2vec2)-len(alignment_dict), emb_dim))
    keys = sorted(set(entity2vec2.keys() if isinstance(entity2vec2, dict) else entity2vec2.index)-set(alignment_dict.values()))
    for i, key in tqdm(enumerate(keys), total=B_neg_T.shape[0], desc='Computing B_neg_T...'):
        B_neg_T[i] = entity2vec2[key] if isinstance(entity2vec2, dict) else entity2vec2.loc[key].values
        
    return (A_neg_S-mean_S)/scale_S, (B_neg_T-mean_T)/scale_T
    

In [20]:
from sklearn.neighbors import NearestNeighbors
import random

In [21]:
def evaluate_alignment_knn(S_eval, T_eval, R, hit_values = [1, 3, 10]):
    """The function takes the evaluation sets, i.e. correct alignments that were left out, and returns the hits@ and MRR results w.r.t. correct alignments
    
    """
    print('#'*50)
    print('Evaluation started...')
    print('#'*50)
    model = NearestNeighbors(n_neighbors=S_eval.shape[0], n_jobs=-1)
    print('Fitting...')
    model.fit(T_eval)
    print('Predicting...')
    preds = model.kneighbors((S_eval@R+T_eval)/2, n_neighbors=S_eval.shape[0], return_distance=False)
    Hits = np.zeros(len(hit_values))
    MRR = 0.0
    for i in tqdm(range(S_eval.shape[0]), total=S_eval.shape[0]):
        pred_idx = (preds[i]==i).nonzero()[0][0] # if i in preds[i] else S_eval.shape[0]
        MRR += (1./(pred_idx+1))
        for j in range(len(Hits)):
            if pred_idx < hit_values[j]:
                Hits[j] += 1.0/S_eval.shape[0]
    MRR = MRR/S_eval.shape[0]
    print()
    print(', '.join([f'Hits@{hit_values[it]}: {Hits[it]}' for it in range(len(Hits))]+[f'MRR: {MRR}']))

In [24]:
def evaluate_alignment(S_eval, T_eval, R, num_candidates=10):
    """The function takes the evaluation sets, i.e. correct alignments that were left out, and returns the accuracy computed as the proportion
    of correct alignment predictions among num_candidates candidates
    
    """
    print('#'*50)
    print('Evaluation started...')
    print('#'*50)
    acc = 0
    ids = list(range(S_eval.shape[0]))
    for i in tqdm(range(S_eval.shape[0])):
        s_i = S_eval[i][None, :]@R
        rand_ids = list(set(random.sample(ids, k=num_candidates))-{i})
        candidates = np.concatenate([T_eval[i][None, :], T_eval[rand_ids[:num_candidates-1]]], axis=0)
        acc += ((candidates-s_i)**2).sum(1).squeeze().argmin() == 1
    return acc / S_eval.shape[0]

## Evaluate, compute and store universal embeddings

In [32]:
_, _, _, _, _, _, S_eval, T_eval, R = get_source_and_target_matrices(new_aligned_entity_dict,\
                                                                                           entity2vec_cal, entity2vec_db, test_size=0.1)

Computing S and T: 100%|██████████| 3370708/3370708 [00:33<00:00, 99186.78it/s] 
Computing S_eval and T_eval: 100%|██████████| 374524/374524 [00:03<00:00, 100404.60it/s]



Now computing R...
Scale S:  3.398963744580538

Completed after 18.295594930648804 seconds
Alignment loss:  684426.5036357479


### Evaluation on validation data

In [None]:
#evaluate_alignment_knn(S_eval.astype(np.float16), T_eval.astype(np.float16), R.astype(np.float16), hit_values=[1, 3, 5, 10])

##################################################
Evaluation started...
##################################################
Fitting...
Predicting...


In [31]:
#list_merged_entities = sorted(set(entity2vec_cal.keys())-set(new_aligned_entity_dict.keys())) +\
#sorted(set(entity2vec_db.keys())-set(new_aligned_entity_dict.values())) + \
#list(new_aligned_entity_dict.keys())
#with open('Caligraph_Dbpedia/list_merged_entities_cal_db.txt', 'w') as file:
#    file.write(','.join(list_merged_entities))
#del list_merged_entities

In [24]:
scale_S, scale_T, mean_S, mean_T, S, T, R = get_source_and_target_matrices(new_aligned_entity_dict,\
                                                                           entity2vec_cal, entity2vec_db, test_size=0.0)

Computing S and T: 100%|██████████| 3745232/3745232 [00:38<00:00, 97640.99it/s] 



Now computing R...
Scale S:  3.399197955650588

Completed after 21.183332681655884 seconds
Alignment loss:  760397.9543931824


### Simple neural network

In [55]:
#model = AlignmentModel(200)
#lr = 0.01
#epochs = 100
#batch_size = 512

In [9]:
#train(model, lr, epochs, torch.Tensor(S), torch.Tensor(T), batch_size)

### Evaluation on training data

In [None]:
#evaluate_alignment_knn(S, T, R, hit_values=[1, 3, 5, 10])

In [25]:
A_neg_S, B_neg_T = get_non_aligned_entity_embedding_matrices(new_aligned_entity_dict, entity2vec_cal, \
                                                             entity2vec_db, scale_S, scale_T, mean_S, mean_T)

Computing A_neg_S...: 100%|██████████| 12670001/12670001 [00:47<00:00, 264532.78it/s]
Computing B_neg_T...: 100%|██████████| 6280733/6280733 [00:24<00:00, 256525.47it/s]


In [26]:
del entity2vec_cal, entity2vec_db
gc.collect()

0

In [27]:
# compute every s_i as (s_i@R+t_i)/2
S = (S@R + T)/2
del T
gc.collect()
Universal_Emb = np.concatenate([A_neg_S@R, B_neg_T, S], axis=0)

In [32]:
Universal_Emb.shape

(22695966, 200)

In [None]:
np.save('Caligraph_Dbpedia/Universal_Emb.npy', Universal_Emb)

In [None]:
Universal_Emb.shape

In [None]:
del A_neg_S, B_neg_T, S, T, R, Universal_Emb
gc.collect()

## Shallom embeddings for Fr-En Dbpedia

In [1]:
import torch, pandas as pd
import json
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.spatial import procrustes
from scipy.linalg import orthogonal_procrustes
import time, gc
from sklearn.neighbors import NearestNeighbors
import random
from tqdm import tqdm
gc.enable()
random.seed(42)

In [2]:
def get_source_and_target_matrices(alignment_dict, entity2vec1, entity2vec2, given_test_set=None, emb_dim=200, test_size=0.1):
    """This function takes the dictionary of aligned entities between two KGs and their corresponding embeddings (as entity to vector dictionaries)
    and returns S, T, S_eval, T_eval, and R defined as follows:
    
    -- S: Normalized and scaled large subset of the source embeddings, i.e. the matrix of aligned entity embeddings in the first knowledge graph
    
    -- T: Normalized and scaled large subset of the matrix of aligned entity embeddings in the second knowledge graph
    
    -- S_eval and T_eval are portions of S and T sampled for evaluation if test_size > 0
    
    -- R: The rotation matrix that most closely maps S to T, i.e. ||A@S-T|| is minimized
    
    The mean and standard deviation of S, T are also returned
    """
    if test_size > 0:
        if given_test_set is None:
            train_ents, eval_ents = train_test_split(list(alignment_dict.keys()), test_size=test_size, random_state=42)
        else:
            eval_ents = given_test_set
            train_ents = list(set(alignment_dict.keys())-set(eval_ents))
    else:
        train_ents = alignment_dict.keys()
    
    S = np.empty((len(train_ents), emb_dim))
    T = np.empty((len(train_ents), emb_dim))
    if test_size > 0:
        S_eval = np.empty((len(eval_ents), emb_dim))
        T_eval = np.empty((len(eval_ents), emb_dim))

    for i, key in tqdm(enumerate(train_ents), total=len(train_ents), desc='Computing S and T'):
        S[i] = entity2vec1[key] if isinstance(entity2vec1, dict) else entity2vec1.loc[key].values
        T[i] = entity2vec2[alignment_dict[key]] if isinstance(entity2vec2, dict) else entity2vec2.loc[alignment_dict[key]].values
        
    if test_size > 0:
        for i, key in tqdm(enumerate(eval_ents), total=len(eval_ents), desc='Computing S_eval and T_eval'):
            S_eval[i] = entity2vec1[key] if isinstance(entity2vec1, dict) else entity2vec1.loc[key].values
            T_eval[i] = entity2vec2[alignment_dict[key]] if isinstance(entity2vec2, dict) else entity2vec2.loc[alignment_dict[key]].values
        
    print('\nNow computing R...')
    # Center and scale data
    mean_S = S.mean(axis=0)
    mean_T = T.mean(axis=0)
    scale_S = np.sqrt(((S-mean_S)**2).sum()/S.shape[0]) # scale, see https://en.wikipedia.org/wiki/Procrustes_analysis
    scale_T = np.sqrt(((T-mean_T)**2).sum()/T.shape[0])
    print('Scale S: ', scale_S)
    
    t0 = time.time()
    R, loss = orthogonal_procrustes((S-mean_S)/scale_S, (T-mean_T)/scale_T, check_finite=True)
    print('\nCompleted after '+str(time.time()-t0)+' seconds')
    print('Alignment loss: ', loss)
    
    if test_size > 0:
        return scale_S, scale_T, mean_S, mean_T, (S-mean_S)/scale_S, (T-mean_T)/scale_T, (S_eval-mean_S)/scale_S, (T_eval-mean_T)/scale_T, R
        #return scale_S, scale_T, mean_S, mean_T, S, T, S_eval, T_eval, R
    else:
        return scale_S, scale_T, mean_S, mean_T, (S-mean_S)/scale_S, (T-mean_T)/scale_T, R
    

In [3]:
def get_non_aligned_entity_embedding_matrices(alignment_dict, entity2vec1, entity2vec2, scale_S, scale_T, mean_S, mean_T, emb_dim=200):
    """
    Inputs the dictionary of aligned entities between two KGs and their corresponding embeddings, and returns the normalized embedding matrices of 
    
    non-aligned entities
    """
    A_neg_S = np.empty((len(entity2vec1)-len(alignment_dict), emb_dim))
    keys = sorted(set(entity2vec1.keys() if isinstance(entity2vec1, dict) else entity2vec1.index)-set(alignment_dict.keys()))
    for i, key in tqdm(enumerate(keys), total=A_neg_S.shape[0], desc='Computing A_neg_S...'):
        A_neg_S[i] = entity2vec1[key] if isinstance(entity2vec1, dict) else entity2vec1.loc[key].values
    
    B_neg_T = np.empty((len(entity2vec2)-len(alignment_dict), emb_dim))
    keys = sorted(set(entity2vec2.keys() if isinstance(entity2vec2, dict) else entity2vec2.index)-set(alignment_dict.values()))
    for i, key in tqdm(enumerate(keys), total=B_neg_T.shape[0], desc='Computing B_neg_T...'):
        B_neg_T[i] = entity2vec2[key] if isinstance(entity2vec2, dict) else entity2vec2.loc[key].values
        
    return (A_neg_S-mean_S)/scale_S, (B_neg_T-mean_T)/scale_T
    

In [4]:
def evaluate_alignment_knn(S_eval, T_eval, R, hit_values = [1, 3, 10]):
    """The function takes the evaluation sets, i.e. correct alignments that were left out, and returns the hits@ and MRR results w.r.t. correct alignments
    
    """
    print('#'*50)
    print('Evaluation started...')
    print('#'*50)
    model = NearestNeighbors(n_neighbors=S_eval.shape[0], n_jobs=-1)
    print('Fitting...')
    model.fit(T_eval)
    print('Predicting...')
    preds = model.kneighbors((S_eval@R+T_eval)/2, n_neighbors=S_eval.shape[0], return_distance=False)
    Hits = np.zeros(len(hit_values))
    MRR = 0.0
    for i in tqdm(range(S_eval.shape[0]), total=S_eval.shape[0]):
        pred_idx = (preds[i]==i).nonzero()[0][0] # if i in preds[i] else S_eval.shape[0]
        MRR += (1./(pred_idx+1))
        for j in range(len(Hits)):
            if pred_idx < hit_values[j]:
                Hits[j] += 1.0/S_eval.shape[0]
    MRR = MRR/S_eval.shape[0]
    print()
    print(', '.join([f'Hits@{hit_values[it]}: {Hits[it]}' for it in range(len(Hits))]+[f'MRR: {MRR}']))

In [5]:
EnFr_shallom_embs_v1 = pd.read_csv('Shallom_EnFr_15K_V1/Shallom_entity_embeddings.csv')

In [6]:
Fr_shallom_embs_v1 = EnFr_shallom_embs_v1[EnFr_shallom_embs_v1['Unnamed: 0'].apply(lambda x: 'fr.dbpedia.org' in x)]

In [7]:
En_shallom_embs_v1 = EnFr_shallom_embs_v1.iloc[np.setdiff1d(np.arange(EnFr_shallom_embs_v1.shape[0]),\
                                                            np.array(Fr_shallom_embs_v1.index))].set_index('Unnamed: 0')

In [8]:
Fr_shallom_embs_v1 = Fr_shallom_embs_v1.set_index('Unnamed: 0')

In [9]:
En_shallom_embs_v1.head(3)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
http://dbpedia.org/resource/Jean_Raoux_(soldier),0.21293,-0.456497,-0.804834,-0.044535,0.363582,-0.178872,0.385574,-0.230661,0.05241,-0.681095,...,0.315135,0.18749,0.04747,0.199877,-0.045452,-0.219743,0.397735,-0.400206,-0.561423,-0.075315
http://dbpedia.org/resource/Casually_Dressed_&_Deep_in_Conversation,-0.040372,0.139974,0.110328,-0.021151,0.499682,-0.227622,-0.223775,0.232769,0.114076,0.240751,...,0.085454,-0.053321,0.221335,-0.010698,0.285245,-0.098031,-0.010921,-0.01219,-0.076274,0.084391
http://dbpedia.org/resource/Resolve_(song),0.093407,0.785593,0.767466,0.389821,0.748004,-0.499349,0.325547,0.308278,0.554449,0.535334,...,0.245889,0.659989,0.651668,-0.110942,0.624555,-0.312246,0.011697,-0.722449,-0.116795,0.297614


In [11]:
with open('OpenEA_dataset_v1.1/EN_FR_15K_V1/ent_links') as file:
    en_to_fr_ents_v1 = file.read().strip().split('\n')
en_to_fr_ents_v1 = dict([line.split('\t') for line in en_to_fr_ents_v1])

In [12]:
with open('OpenEA_dataset_v1.1/EN_FR_15K_V1/721_5fold/1/test_links') as file:
    test_set = file.read().strip().split('\n')
test_set = [line.split('\t')[0] for line in test_set]

In [13]:
len(test_set)

10500

In [14]:
_, _, _, _, _, _, S_eval, T_eval, R = get_source_and_target_matrices(en_to_fr_ents_v1,\
                                                En_shallom_embs_v1, Fr_shallom_embs_v1, given_test_set=test_set, emb_dim=300, test_size=0.1)

Computing S and T: 100%|██████████| 4500/4500 [00:00<00:00, 5095.39it/s]
Computing S_eval and T_eval: 100%|██████████| 10500/10500 [00:02<00:00, 5103.32it/s]



Now computing R...
Scale S:  7.206529678496194

Completed after 0.146575927734375 seconds
Alignment loss:  1746.1492506830389


In [15]:
evaluate_alignment_knn(S_eval, T_eval, R, hit_values=[1, 3, 5, 10])

##################################################
Evaluation started...
##################################################
Fitting...
Predicting...


100%|██████████| 10500/10500 [00:00<00:00, 44094.12it/s]


Hits@1: 0.7917142857144354, Hits@3: 0.8587619047620917, Hits@5: 0.8838095238097247, Hits@10: 0.9107619047621206, MRR: 0.8333291456075708





In [16]:
scale_S, scale_T, mean_S, mean_T, S, T, R = get_source_and_target_matrices(en_to_fr_ents_v1,\
                                                                                    En_shallom_embs_v1, Fr_shallom_embs_v1, emb_dim=300, test_size=0.0)

Computing S and T: 100%|██████████| 15000/15000 [00:02<00:00, 5174.04it/s]



Now computing R...
Scale S:  7.246380617560009

Completed after 0.1604175567626953 seconds
Alignment loss:  5129.786850424131


In [17]:
evaluate_alignment_knn(S, T, R, hit_values=[1, 3, 5, 10])

##################################################
Evaluation started...
##################################################
Fitting...
Predicting...


100%|██████████| 15000/15000 [00:00<00:00, 34964.48it/s]


Hits@1: 0.8333999999999278, Hits@3: 0.8867999999999219, Hits@5: 0.9061999999999197, Hits@10: 0.928066666666584, MRR: 0.8668758944345571





### Get non aligned entity embeddings

In [18]:
A_neg_S, B_neg_T = get_non_aligned_entity_embedding_matrices(en_to_fr_ents_v1, En_shallom_embs_v1, \
                                                             Fr_shallom_embs_v1, scale_S, scale_T, mean_S, mean_T, emb_dim=300)

Computing A_neg_S...: 0it [00:00, ?it/s]
Computing B_neg_T...: 0it [00:00, ?it/s]


In [19]:
# compute every s_i as (s_i@R+t_i)/2
S = (S@R + T)/2
del T
gc.collect()
Universal_Emb = np.concatenate([A_neg_S@R, B_neg_T, S], axis=0)

In [20]:
Universal_Emb.shape

(15000, 300)

In [21]:
np.save('Shallom_EnFr_15K_V1/Universal_Emb.npy', Universal_Emb)

In [22]:
list_merged_entities = sorted(set(En_shallom_embs_v1.index)-set(en_to_fr_ents_v1.keys())) +\
sorted(set(Fr_shallom_embs_v1.index)-set(en_to_fr_ents_v1.values())) + \
list(en_to_fr_ents_v1.keys())
print('Total: ', len(list_merged_entities))
with open('Shallom_EnFr_15K_V1/list_merged_entities_db_fr_en.txt', 'w') as file:
    file.write(','.join(list_merged_entities))
del list_merged_entities

Total:  15000


In [23]:
with open('Shallom_EnFr_15K_V1/english2french.txt', 'w') as file:
    json.dump(en_to_fr_ents_v1, file, ensure_ascii=False)

In [24]:
with open('Shallom_EnFr_15K_V1/french2english.txt', 'w') as file:
    json.dump({value:key for key,value in en_to_fr_ents_v1.items()}, file, ensure_ascii=False)

## Fr-En Dbpedia 100K

In [25]:
EnFr_shallom_embs_v1 = pd.read_csv('Shallom_EnFr_100K_V1/Shallom_entity_embeddings.csv')

In [26]:
Fr_shallom_embs_v1 = EnFr_shallom_embs_v1[EnFr_shallom_embs_v1['Unnamed: 0'].apply(lambda x: 'fr.dbpedia.org' in x)]

In [27]:
En_shallom_embs_v1 = EnFr_shallom_embs_v1.iloc[np.setdiff1d(np.arange(EnFr_shallom_embs_v1.shape[0]),\
                                                            np.array(Fr_shallom_embs_v1.index))].set_index('Unnamed: 0')

In [28]:
Fr_shallom_embs_v1 = Fr_shallom_embs_v1.set_index('Unnamed: 0')

In [29]:
En_shallom_embs_v1.head(3)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
http://dbpedia.org/resource/Early_Edition,-0.04222,-0.05832,-0.049045,0.066309,0.066571,-0.092764,-0.059072,0.050914,-0.081084,-0.0637,...,-0.055581,0.055496,-0.055208,-0.073004,-0.013545,0.04807,-0.022145,-0.052797,-0.065483,-0.066909
http://dbpedia.org/resource/Above_the_Law_(group),-0.046235,-0.123812,0.069596,0.065059,-0.047143,-0.042722,0.072884,0.062155,0.034932,0.113771,...,0.136137,-0.150952,-0.066169,0.040957,0.165238,0.015526,0.054035,0.07712,0.077142,0.072234
http://dbpedia.org/resource/Belmont-Broye,0.101312,0.129636,0.071503,-0.042299,0.2389,0.037403,0.302014,-0.131812,-0.002979,0.089553,...,-0.06941,0.276707,-0.216835,0.039518,-0.2537,0.036126,0.033697,0.118371,0.281929,0.09877


In [30]:
with open('OpenEA_dataset_v1.1/EN_FR_100K_V1/ent_links') as file:
    en_to_fr_ents_v1 = file.read().strip().split('\n')
en_to_fr_ents_v1 = dict([line.split('\t') for line in en_to_fr_ents_v1])

In [31]:
with open('OpenEA_dataset_v1.1/EN_FR_100K_V1/721_5fold/1/test_links') as file:
    test_set = file.read().strip().split('\n')
test_set = [line.split('\t')[0] for line in test_set]

In [33]:
_, _, _, _, _, _, S_eval, T_eval, R = get_source_and_target_matrices(en_to_fr_ents_v1,\
                                                En_shallom_embs_v1, Fr_shallom_embs_v1, given_test_set=test_set, emb_dim=25, test_size=0.1)

Computing S and T: 100%|██████████| 30000/30000 [00:06<00:00, 4926.36it/s]
Computing S_eval and T_eval: 100%|██████████| 70000/70000 [00:14<00:00, 4895.11it/s]



Now computing R...
Scale S:  0.6159915591570903

Completed after 0.01683950424194336 seconds
Alignment loss:  6808.175185166385


In [34]:
evaluate_alignment_knn(S_eval, T_eval, R, hit_values=[1, 3, 5, 10])

##################################################
Evaluation started...
##################################################
Fitting...
Predicting...


100%|██████████| 70000/70000 [00:08<00:00, 8300.73it/s] 


Hits@1: 0.11948571428572816, Hits@3: 0.19792857142859843, Hits@5: 0.23975714285717686, Hits@10: 0.30319999999994124, MRR: 0.18192219256295464





In [None]:
scale_S, scale_T, mean_S, mean_T, S, T, R = get_source_and_target_matrices(en_to_fr_ents_v1,\
                                                                                    En_shallom_embs_v1, Fr_shallom_embs_v1, emb_dim=300, test_size=0.0)

In [None]:
evaluate_alignment_knn(S, T, R, hit_values=[1, 3, 5, 10])

In [None]:
A_neg_S, B_neg_T = get_non_aligned_entity_embedding_matrices(en_to_fr_ents_v1, En_shallom_embs_v1, \
                                                             Fr_shallom_embs_v1, scale_S, scale_T, mean_S, mean_T, emb_dim=300)

In [None]:
# compute every s_i as (s_i@R+t_i)/2
S = (S@R + T)/2
del T
gc.collect()
Universal_Emb = np.concatenate([A_neg_S@R, B_neg_T, S], axis=0)

In [None]:
Universal_Emb.shape

In [None]:
np.save('Shallom_EnFr_100K_V1/Universal_Emb.npy', Universal_Emb)

In [None]:
with open('Shallom_EnFr_100K_V1/english2french.txt', 'w') as file:
    json.dump(en_to_fr_ents_v1, file, ensure_ascii=False)

In [None]:
with open('Shallom_EnFr_100K_V1/french2english.txt', 'w') as file:
    json.dump({value:key for key,value in en_to_fr_ents_v1.items()}, file, ensure_ascii=False)