# DRKG Relation Similarity Analysis based on link recommendations

This notebook performs a similarity analysis of the different link types in the DRKG based on their recommendation outcomes. Specifically, for a given node we predict the K top-scoring neighbors under a given link type, and we repeat this prediction for every link type. Link types whose predicted neighbors overlap significantly are considered more similar.

In [3]:
import pandas as pd
import numpy as np
import os
import csv
import sys
import torch as th
# Put ../utils (relative to the working directory) on the module search path
# so that the project-local `utils` module imported below can be found.
sys.path.insert(1, '../utils')
from utils import download_and_extract
# NOTE(review): presumably downloads and unpacks the DRKG files that the later
# cells read from ../data/drkg — confirm against utils.py.
download_and_extract()

Define the function used for scoring the edges. This should coincide with the function used to learn the embeddings.

In [4]:
def transE_l2(head, rel, tail, gamma=12.0):
    """Score triples with the TransE (L2) function ``gamma - ||head + rel - tail||_2``.

    Args:
        head: Tensor of head-entity embeddings; the embedding axis is last.
        rel: Tensor of relation embeddings, broadcastable against ``head``.
        tail: Tensor of tail-entity embeddings, broadcastable against
            ``head + rel``.
        gamma: Margin from which the L2 distance is subtracted. Defaults to
            12.0, the value that used to be hard-coded here.
            NOTE(review): this should presumably match the margin used when
            the embeddings were trained — confirm.

    Returns:
        Tensor of scores with the last (embedding) axis reduced. A smaller
        translation distance gives a higher score.
    """
    translation_error = head + rel - tail
    return gamma - th.norm(translation_error, p=2, dim=-1)

## Loading Mapping files

Load the mapping files that give, for each DRKG id, the corresponding id used by the embedding models. Load the entity and relation embeddings as well. Adjust the input and embedding file paths below if your files live elsewhere.

In [6]:
# Folders holding the id-mapping files and the pretrained embeddings.
folder_with_training_part='../data/drkg/embed/'
folder_with_embeddings='../data/drkg/embed/'

ids = []  # never filled in this cell; kept so the name stays defined


def load_id_mapping(tsv_path):
    """Read a headerless two-column TSV (name, integer id) into a ``{name: id}`` dict.

    Args:
        tsv_path: Path of the tab-separated file to read.

    Returns:
        Dict mapping the first column (string) to the second column as ``int``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If an id column value is not a valid integer.
    """
    mapping = {}
    with open(tsv_path, newline='', encoding='utf-8') as tsv_file:
        reader = csv.DictReader(tsv_file, delimiter='\t', fieldnames=['entity', 'id'])
        for row in reader:
            # Read the id straight from the row instead of binding it to a
            # variable called `id`, which would shadow the builtin in every
            # later cell of the notebook.
            mapping[row['entity']] = int(row['id'])
    return mapping


# Entity name -> embedding row index.
entity2id = load_id_mapping(folder_with_training_part + "entities.tsv")
print(len(entity2id))

# Relation name -> embedding row index.
rel2id = load_id_mapping(folder_with_training_part + "relations.tsv")
print(len(rel2id))

node_emb = np.load(folder_with_embeddings+'DRKG_TransE_l2_entity.npy')
rel_emb = np.load(folder_with_embeddings+'DRKG_TransE_l2_relation.npy')

97238
107


## Loading triplets

Load the DRKG triplets and map their head, relation and tail names to the ids used by the embedding models.

In [None]:
# Per-triple embedding ids, in file order, plus the raw (head, rel, tail) names.
head_ids = []
rel_ids = []
tail_ids = []
p0_rows = []
with open("../data/drkg/drkg.tsv", newline='', encoding='utf-8') as triplet_file:
    reader = csv.DictReader(triplet_file, delimiter='\t', fieldnames=['head', 'rel', 'tail'])
    for row in reader:
        head, rel, tail = row['head'], row['rel'], row['tail']

        # Translate names to embedding ids; a name missing from the mapping
        # files raises KeyError here.
        head_ids.append(entity2id[head])
        rel_ids.append(rel2id[rel])
        tail_ids.append(entity2id[tail])
        p0_rows.append((head, rel, tail))

head_ids = np.array(head_ids)
rel_ids = np.array(rel_ids)
tail_ids = np.array(tail_ids)
# Running index 0..n_triples-1, one entry per loaded triple.
triple_ids = np.arange(head_ids.shape[0])

## Link prediction

Specify number of seed nodes to select for link prediction

In [6]:
scores={}
L=100    # number of seed head nodes to sample
SEED=0   # fixes the seed sample so that re-running the notebook picks the same nodes
device = th.device('cpu')
with th.no_grad():
    # as_tensor accepts both the NumPy arrays from the loading cells and
    # tensors left over from a previous run of this cell, so re-running is safe.
    node_emb = th.as_tensor(node_emb).to(device)
    rel_emb = th.as_tensor(rel_emb).to(device)
    head_ids = th.as_tensor(head_ids).to(device)
    rel_ids = th.as_tensor(rel_ids).to(device)
    tail_ids = th.as_tensor(tail_ids).to(device)

    # The per-triple head/relation/tail embedding gathers that used to sit
    # here were never read afterwards (the scoring cell builds its own
    # relation embedding) and allocated one embedding row per triple,
    # so they have been dropped.

    # Select L random heads: sample L triples through a dedicated, seeded
    # generator (the global RNG state is left alone) and take their heads.
    generator = th.Generator().manual_seed(SEED)
    perm = th.randperm(head_ids.shape[0], generator=generator)
    seeds = head_ids[perm[:L]]
    seed_heads = node_emb[seeds]

Predict the scores per link type among the selected seed nodes and all other nodes

In [7]:
# For every relation, score each seed head against all nodes as candidate
# tails. scores[rel] is a list with one score vector (length = number of
# nodes) per seed.
for rel, rel_id in rel2id.items():
    rel_vector = rel_emb[rel_id]
    # Broadcasting the single head and relation vectors against node_emb gives
    # the same scores as repeating them once per node, without the copies.
    scores[rel] = [
        transE_l2(seed_heads[i], rel_vector, node_emb)
        for i in range(seed_heads.shape[0])
    ]
    

### Top K link prediction
Specify the number of top-scoring neighbors used to evaluate the overlap of link predictions.

In [8]:
K = 10  # how many of the best-scoring candidate nodes to keep per seed

# For each relation and each seed, rank all candidate nodes by descending
# score and keep the indices of the first K.
top_neighbors = {
    rel_name: [
        th.argsort(seed_scores, descending=True)[:K]
        for seed_scores in per_seed_scores
    ]
    for rel_name, per_seed_scores in scores.items()
}

## Overlap among predicted neighbors
Calculate the overlap of predicted neighboring nodes for each pair of relation types, measured as the Jaccard similarity of the two top-K sets averaged over the seed nodes.

In [9]:
overlap_of_predicted_neighbors=[]
keys=list(scores.keys())

# Turn each top-K index tensor into a Python set once per relation, instead of
# converting it again for every relation pair inside the nested loop below.
neighbor_sets = {
    rel_name: [set(top_k.tolist()) for top_k in top_neighbors[rel_name]]
    for rel_name in keys
}

# Visit every unordered pair of relations exactly once.
for i in range(len(keys)):
    e1 = keys[i]
    sets_1 = neighbor_sets[e1]
    for j in range(i+1, len(keys)):
        e2 = keys[j]
        sets_2 = neighbor_sets[e2]
        # Mean Jaccard similarity of the two relations' top-K sets, taken
        # seed by seed (position l in both lists refers to the same seed).
        jaccard = 0
        for seed_set_1, seed_set_2 in zip(sets_1, sets_2):
            jaccard += len(seed_set_1 & seed_set_2) / len(seed_set_1 | seed_set_2)
        jaccard = jaccard / len(sets_1)
        overlap_of_predicted_neighbors.append([e1, e2, jaccard])

Store sorted overlap file

In [10]:
# Order the relation pairs from highest to lowest overlap: sort ascending on
# the overlap value, then flip the list.
overlap_of_predicted_neighbors_sort = sorted(overlap_of_predicted_neighbors, key=lambda x: float(x[2]))
overlap_of_predicted_neighbors_sort.reverse()

# One header line followed by one tab-separated line per relation pair.
overlap_of_predicted_neighbors_store = ["edge_type1\tedge_type2\tpercentage of overlapping predicted edges\n"]
for pair in overlap_of_predicted_neighbors_sort:
    overlap_of_predicted_neighbors_store.append("{}\t{}\t{}\n".format(pair[0], pair[1], pair[2]))

# The output file name embeds K, the number of top neighbors compared.
entity_file = "percentage_of_overlapping_predicted_edges_per_edge_pair"+str(K)+"v1.tsv"
with open(entity_file, 'w+') as out_file:
    out_file.writelines(overlap_of_predicted_neighbors_store)