## OpenEA datasets. We consider RotatE and TransE embedding approaches

In [2]:
import torch, pandas as pd
import json
import numpy as np
from sklearn.model_selection import train_test_split
#from scipy.spatial import procrustes
from scipy.linalg import orthogonal_procrustes
import time, gc
from sklearn.neighbors import NearestNeighbors
import random
from tqdm import tqdm
gc.enable()
random.seed(42)

In [3]:
def get_source_and_target_matrices(alignment_dict, entity2vec1, entity2vec2, given_test_set=None, emb_dim=50, test_size=0.1, rescale=True, shift=True):
    """This function takes the dictionary of aligned entities between two KGs and their corresponding embeddings (as entity to vector dictionaries)
    and returns S, T, S_test, T_test, and R defined as follows:
    
    -- S: Normalized and scaled large subset of the source embeddings, i.e. the matrix of aligned entity embeddings in the first knowledge graph
    
    -- T: Normalized and scaled large subset of the matrix of aligned entity embeddings in the second knowledge graph
    
    -- S_test and T_test are portions of the full matrices sampled for evaluation if test_size > 0. If 'given_test_set' is given, it takes priority, i.e., no sampling is performed
    
    -- R: The rotation matrix that most closely maps S to T, i.e. ||A@S-T|| is minimized
    
    The mean and standard deviation of S, T are also returned
    """
    t0 = time.time()
    if test_size > 0 or given_test_set:
        if given_test_set is None:
            train_ents, test_ents = train_test_split(list(alignment_dict.keys()), test_size=test_size, random_state=42)
        else:
            test_ents = given_test_set
            train_ents = list(set(alignment_dict.keys())-set(test_ents))
    else:
        train_ents = alignment_dict.keys()
    
    S = entity2vec1.loc[train_ents].values
    T = entity2vec2.loc[list(map(alignment_dict.get, train_ents))].values
    
    S_test = entity2vec1.loc[test_ents].values
    T_test = entity2vec2.loc[list(map(alignment_dict.get, test_ents))].values
        
    print('\nNow computing R...')
    # Center and scale data
    mean_S = S.mean(axis=0)
    mean_T = T.mean(axis=0)
    scale_S = np.sqrt(((S-mean_S)**2).sum()/S.shape[0]) # scale, see https://en.wikipedia.org/wiki/Procrustes_analysis
    scale_T = np.sqrt(((T-mean_T)**2).sum()/T.shape[0])
    print('Scale S: ', scale_S)
    
    if shift and rescale:
        R, loss = orthogonal_procrustes((S-mean_S)/scale_S, (T-mean_T)/scale_T, check_finite=False)
        print('\nCompleted after '+str(time.time()-t0)+' seconds')
    elif shift:
        R, loss = orthogonal_procrustes((S-mean_S), (T-mean_T), check_finite=False)
        print('\nCompleted after '+str(time.time()-t0)+' seconds')
    elif rescale:
        R, loss = orthogonal_procrustes(S/scale_S, T/scale_T, check_finite=False)
        print('\nCompleted after '+str(time.time()-t0)+' seconds')
    else:
        R, loss = orthogonal_procrustes(S, T, check_finite=False)
        print('\nCompleted after '+str(time.time()-t0)+' seconds')
        
    print('Alignment loss: ', loss)
    if test_size > 0 or given_test_set:
        if shift and rescale:
            return scale_S, scale_T, mean_S, mean_T, (S-mean_S)/scale_S, (T-mean_T)/scale_T, (S_test-mean_S)/scale_S, (T_test-mean_T)/scale_T, R
        elif shift:
            return scale_S, scale_T, mean_S, mean_T, S-mean_S, T-mean_T, S_test-mean_S, T_test-mean_T, R
        elif rescale:
            return scale_S, scale_T, mean_S, mean_T, S/scale_S, T/scale_T, S_test/scale_S, T_test/scale_T, R
        else:
            return scale_S, scale_T, mean_S, mean_T, S, T, S_test, T_test, R
    else:
        if shift and rescale:
            return scale_S, scale_T, mean_S, mean_T, (S-mean_S)/scale_S, (T-mean_T)/scale_T, R
        elif shift:
            return scale_S, scale_T, mean_S, mean_T, S-mean_S, T-mean_T, R
        elif rescale:
            return scale_S, scale_T, mean_S, mean_T, S/scale_S, T/scale_T, R
        else:
            return scale_S, scale_T, mean_S, mean_T, S, T, R
        
    

In [4]:
def get_non_aligned_entity_embedding_matrices(alignment_dict, entity2vec1, entity2vec2, scale_S, scale_T, mean_S, mean_T, emb_dim=200):
    """
    Inputs the dictionary of aligned entities between two KGs and their corresponding embeddings, and returns the normalized embedding matrices of 
    
    non-aligned entities
    """
    A_neg_S = np.empty((len(entity2vec1)-len(alignment_dict), emb_dim))
    keys = sorted(set(entity2vec1.index)-set(alignment_dict.keys()))
    for i, key in tqdm(enumerate(keys), total=A_neg_S.shape[0], desc='Computing A_neg_S...'):
        A_neg_S[i] = entity2vec1.loc[key].values
    
    B_neg_T = np.empty((len(entity2vec2)-len(alignment_dict), emb_dim))
    keys = sorted(set(entity2vec2.index)-set(alignment_dict.values()))
    for i, key in tqdm(enumerate(keys), total=B_neg_T.shape[0], desc='Computing B_neg_T...'):
        B_neg_T[i] = entity2vec2.loc[key].values
        
    return (A_neg_S-mean_S)/scale_S, (B_neg_T-mean_T)/scale_T
    

In [5]:
def evaluate_alignment_knn(S_test, T_test, R, assume_known=False, hit_values = [1, 3, 10]):
    """The function takes the evaluation sets, i.e. correct alignments that were left out, and returns the hits@ and MRR results w.r.t. correct alignments
    
    --assume_known. A boolean variable. When set to True, the alignment results are computed using the fact that the test links are known
    
    """
    print('#'*50)
    print('Evaluation started...')
    print('#'*50)
    model = NearestNeighbors(n_neighbors=S_test.shape[0], n_jobs=-1)
    print('Fitting 1...')
    model.fit(T_test)
    print('Predicting 1...')
    if assume_known:
        preds = model.kneighbors((S_test@R+T_test)/2, n_neighbors=S_test.shape[0], return_distance=False)
    else:
        preds = model.kneighbors(S_test, n_neighbors=S_test.shape[0], return_distance=False)
    Hits1 = np.zeros(len(hit_values))
    MRR1 = 0.0
    for i in tqdm(range(S_test.shape[0]), total=S_test.shape[0]):
        pred_idx = (preds[i]==i).nonzero()[0][0]
        MRR1 += (1./(pred_idx+1))
        for j in range(len(Hits1)):
            if pred_idx < hit_values[j]:
                Hits1[j] = Hits1[j] + 1.0
    Hits1 = Hits1/S_test.shape[0]
    MRR1 = MRR1/S_test.shape[0]
    
    model = NearestNeighbors(n_neighbors=S_test.shape[0], n_jobs=-1)
    print('\nFitting 2...')
    if assume_known:
        model.fit((S_test@R+T_test)/2)
    else:
        model.fit(S_test)
    print('Predicting 2...')
    preds = model.kneighbors(T_test, n_neighbors=S_test.shape[0], return_distance=False)
    Hits2 = np.zeros(len(hit_values))
    MRR2 = 0.0
    for i in tqdm(range(S_test.shape[0]), total=S_test.shape[0]):
        pred_idx = (preds[i]==i).nonzero()[0][0] # if i in preds[i] else S_test.shape[0]
        MRR2 += (1./(pred_idx+1))
        for j in range(len(Hits2)):
            if pred_idx < hit_values[j]:
                Hits2[j] = Hits2[j] + 1.0
    Hits2 = Hits2/S_test.shape[0]
    MRR2 = MRR2/S_test.shape[0]
    
    Hits = (Hits1+Hits2)/2
    MRR = (MRR1+MRR2)/2
    print()
    print(', '.join([f'Hits@{hit_values[it]}: {Hits[it]}' for it in range(len(Hits))]+[f'MRR: {MRR}']))

## Define functions to load embeddings

In [6]:
def build_alignment_dict(kg_name):
    with open(f'OpenEA_dataset_v2.0/{kg_name}/ent_links') as file:
        kg1_to_kg2 = file.read().strip().split('\n')
    kg1_to_kg2 = dict([line.split('\t') for line in kg1_to_kg2])
    return kg1_to_kg2

In [7]:
def get_test_entities(kg_name, fold):
    with open(f'OpenEA_dataset_v2.0/{kg_name}/721_5fold/{fold}/test_links') as file:
        test_links = file.read().strip().split('\n')
    test_links = [line.split('\t')[0] for line in test_links]
    return test_links

In [8]:
def get_embeddings(kg_name, emb_model="TransE"):
    if emb_model == "TransE":
        model1 = torch.load(f'OpenEA_dataset_v2.0/{kg_name}/KG1_TransE/trained_model.pkl', map_location='cpu').eval()
        model2 = torch.load(f'OpenEA_dataset_v2.0/{kg_name}/KG2_TransE/trained_model.pkl', map_location='cpu').eval()
        with open(f'OpenEA_dataset_v2.0/{kg_name}/KG1_TransE/entity_to_ids.json') as file:
            ent_ids1 = json.load(file)
        with open(f'OpenEA_dataset_v2.0/{kg_name}/KG2_TransE/entity_to_ids.json') as file:
            ent_ids2 = json.load(file)
        emb1 = pd.DataFrame(model1.entity_representations[0](torch.tensor(list(ent_ids1.values())).long()).tolist(), index=list(ent_ids1.keys()))
        emb2 = pd.DataFrame(model2.entity_representations[0](torch.tensor(list(ent_ids2.values())).long()).tolist(), index=list(ent_ids2.keys()))
    else:
        model1 = torch.load(f'OpenEA_dataset_v2.0/{kg_name}/KG1_RotatE/trained_model.pkl', map_location='cpu')
        model2 = torch.load(f'OpenEA_dataset_v2.0/{kg_name}/KG2_RotatE/trained_model.pkl', map_location='cpu')
        with open(f'OpenEA_dataset_v2.0/{kg_name}/KG1_RotatE/entity_to_ids.json') as file:
            ent_ids1 = json.load(file)
        with open(f'OpenEA_dataset_v2.0/{kg_name}/KG2_RotatE/entity_to_ids.json') as file:
            ent_ids2 = json.load(file)
            
        emb1_real = model1.entity_representations[0](torch.tensor(list(ent_ids1.values())).long()).real
        emb1_img = model1.entity_representations[0](torch.tensor(list(ent_ids1.values())).long()).imag
        emb1 = torch.cat([emb1_real, emb1_img], axis=-1)
        
        emb2_real = model2.entity_representations[0](torch.tensor(list(ent_ids2.values())).long()).real
        emb2_img = model2.entity_representations[0](torch.tensor(list(ent_ids2.values())).long()).imag
        emb2 = torch.cat([emb2_real, emb2_img], axis=-1)
        
        emb1 = pd.DataFrame(emb1.tolist(), index=list(ent_ids1.keys()))
        emb2 = pd.DataFrame(emb2.tolist(), index=list(ent_ids2.keys()))
    return emb1, emb2

In [33]:
alignment = build_alignment_dict("EN_FR_15K_V1")
test_entities = get_test_entities("EN_FR_15K_V1", 1)
emb1, emb2 = get_embeddings("EN_FR_15K_V1")

In [22]:
emb1.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
,0.010982,-0.029167,0.050871,-0.08629,-0.006217,-0.006347,0.032373,-0.008046,-0.009607,-0.089063,...,-0.087316,-0.031542,0.031592,0.072944,0.042461,-0.089679,-0.05686,0.020393,0.085451,0.038364
(Live),0.001938,-0.010675,0.05612,-0.409321,0.19267,-0.287279,0.033135,-0.01002,0.000702,-0.279517,...,-0.096294,-0.027402,0.356884,0.091503,-0.018329,-0.076187,-0.052801,0.026452,0.089548,0.046538


In [23]:
emb2.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
# (US Mainstream Rock),0.177219,-0.192868,-0.252988,-0.357779,-0.307341,0.047987,-0.455507,0.02786,0.191631,-0.109535,...,-0.160491,0.168667,0.087189,0.068281,-0.118001,-0.071306,-0.064774,-0.055225,-0.010741,-0.006808
# (US Modern Rock),0.160445,-0.179548,-0.263566,-0.34525,-0.318889,0.047559,-0.463212,0.0289,0.190284,-0.102479,...,-0.168589,0.156612,0.087659,0.054049,-0.118439,-0.069497,-0.081031,-0.067076,0.003138,-0.006222


In [34]:
scale_S, scale_T, mean_S, mean_T, S, T, S_test, T_test, R = get_source_and_target_matrices(alignment,\
                                                emb1, emb2, given_test_set=None, emb_dim=50, rescale=False, test_size=0.1)


Now computing R...
Scale S:  0.8341125927861606

Completed after 0.09527587890625 seconds
Alignment loss:  3285.2662817510895


In [35]:
S_test.shape

(1500, 50)

In [36]:
evaluate_alignment_knn(S_test, T_test, R, assume_known=False, hit_values=[5, 10, 50])

##################################################
Evaluation started...
##################################################
Fitting 1...
Predicting 1...


100%|██████████| 1500/1500 [00:00<00:00, 110461.69it/s]


Fitting 2...
Predicting 2...



100%|██████████| 1500/1500 [00:00<00:00, 108969.38it/s]


Hits@5: 0.0033333333333333335, Hits@10: 0.008, Hits@50: 0.039, MRR: 0.005619881159665557





## With RotatE embeddings

In [44]:
emb1, emb2 = get_embeddings("EN_FR_15K_V1", "RotatE")

In [45]:
emb1.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
,-0.001671,-0.001787,-0.002695,-0.00046,-3.8e-05,0.003914,0.001309,-0.003109,-0.001952,0.000562,...,-0.006454,-0.001332,0.000671,0.008175,-0.005879,0.003883,-0.00343,-0.003362,0.002849,0.00302
(Live),-0.007722,0.009663,0.003048,0.002181,-0.000333,0.004548,0.002331,-0.004285,0.001333,-0.010752,...,0.005537,0.00485,-0.011252,0.009744,0.00038,-0.003004,-0.003118,0.011277,-0.009715,-0.000115
(UK),0.003493,-0.000377,0.001211,0.003132,0.001576,0.001537,0.002182,-0.00183,0.005519,0.006237,...,-0.005256,0.000204,-0.000745,0.005057,-0.001026,0.00558,0.005439,0.000691,0.002482,0.003244


In [46]:
scale_S, scale_T, mean_S, mean_T, S, T, S_test, T_test, R = get_source_and_target_matrices(alignment,\
                                                emb1, emb2, given_test_set=None, emb_dim=400, rescale=False, test_size=0.1)


Now computing R...
Scale S:  1.3987306919266569

Completed after 0.36179161071777344 seconds
Alignment loss:  15137.798050691987


In [47]:
evaluate_alignment_knn(S_test, T_test, R, assume_known=False, hit_values=[5, 10, 50])

##################################################
Evaluation started...
##################################################
Fitting 1...
Predicting 1...


100%|██████████| 1500/1500 [00:00<00:00, 100705.19it/s]



Fitting 2...
Predicting 2...


100%|██████████| 1500/1500 [00:00<00:00, 111542.73it/s]


Hits@5: 0.003, Hits@10: 0.006333333333333333, Hits@50: 0.033, MRR: 0.005582873518672749





## Another dataset: EN_FR_100K_V1

In [48]:
alignment = build_alignment_dict("EN_FR_100K_V1")
test_entities = get_test_entities("EN_FR_100K_V1", 1)
emb1, emb2 = get_embeddings("EN_FR_100K_V1")

In [49]:
scale_S, scale_T, mean_S, mean_T, S, T, S_test, T_test, R = get_source_and_target_matrices(alignment,\
                                                emb1, emb2, given_test_set=None, emb_dim=50, rescale=False, test_size=0.1)


Now computing R...
Scale S:  0.5238434249154881

Completed after 0.6284322738647461 seconds
Alignment loss:  17580.037182179432


In [50]:
evaluate_alignment_knn(S_test, T_test, R, assume_known=False, hit_values=[5, 10, 50, 1000])

##################################################
Evaluation started...
##################################################
Fitting 1...
Predicting 1...


100%|██████████| 10000/10000 [00:00<00:00, 45222.39it/s]



Fitting 2...
Predicting 2...


100%|██████████| 10000/10000 [00:00<00:00, 46706.29it/s]


Hits@5: 0.0007, Hits@10: 0.0010999999999999998, Hits@50: 0.0056, Hits@1000: 0.10805000000000001, MRR: 0.0010776550452930855



