In [7]:
import os
from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher
from gensim.models import KeyedVectors
import tempfile
import pandas as pd
import numpy as np
import h5py, torch
from torchbiggraph.model import ComplexDiagonalDynamicOperator, DotComparator, CosComparator
import json

vector_dimension = 100
orientation = "objects"
label = "P19"

In [63]:
import csv

# make subset of embeddings with P31 Qnodes
all_p31 = set()
tsv_file = open(f"/out/output/allConstraintsAnalysis_Final/instances/instances.{orientation}.{label}.tsv")
read_tsv = csv.reader(tsv_file, delimiter="\t")
next(read_tsv)
for line in read_tsv:
    all_p31.add(line[1])
print("loaded all P31s")
print(all_p31)

with open(f"/out/embeddings/embeddings.{label}.{orientation}.tsv", 'w') as file:
    writer = csv.writer(file, delimiter='\t')
    tsv_file = open("/out/embeddings/entities_output.tsv")
    read_tsv = csv.reader(tsv_file, delimiter="\t")
    for line in read_tsv:
        if line[0] in all_p31:
            writer.writerow(line)
            
convert_kgtk_to_w2v(f"/out/embeddings/embeddings.{label}.{orientation}.tsv", f"/out/embeddings/embeddings.{label}.{orientation}.w2v")

loaded all P31s
{'Q476028', 'Q34442', 'Q1569871', 'Q2989398', 'Q5150550', 'Q17297633', 'Q6936383', 'Q1750854', 'Q57821', 'Q25', 'Q3336843', 'Q832778', 'Q6857854', 'Q674546', 'Q190967', 'Q94730503', 'Q687121', 'Q15239622', 'Q189898', 'Q902104', 'Q618779', 'Q1258093', 'Q558116', 'Q79337953', 'Q9679135', 'Q4632675', 'Q991812', 'Q2824648', 'Q13414753', 'Q2424752', 'Q44613', 'Q27495502', 'Q202199', 'Q184358', 'Q4897819', 'Q7543008', 'Q21074597', 'Q8561195', 'Q5327160', 'Q215380', 'Q13410520', 'Q1381899', 'Q75338598', 'Q659103', 'Q958822', 'Q1849719', 'Q719487', 'Q577', 'Q2368508', 'Q608843', 'Q2232001', 'Q236036', 'Q1180262', 'Q161387', 'Q3192808', 'Q15066925', 'Q2168991', 'Q691960', 'Q3802482', 'Q56580032', 'Q34770', 'Q315023', 'Q991683', 'Q10943', 'Q3927259', 'Q17201685', 'Q18602451', 'Q3511697', 'Q18670606', 'Q70208', 'Q1074523', 'Q63099748', 'Q14192234', 'Q10373548', 'Q55190325', 'Q3924474', 'Q842412', 'Q2306813', 'Q19765902', 'Q21503295', 'Q3090263', 'Q13218391', 'Q3488975', 'Q11774097

In [5]:
# convert from kgtk format to word2vec format
def convert_kgtk_to_w2v(input_path, output_path):
    """
    Convert a KGTK file (node1/label/node2) that contains embeddings to the w2v format
    """
    vector_count = 0

    # Read the file once to count the lines as we need to put them at the top of the w2v file
    with open(input_path, "r") as kgtk_file:
        for line in kgtk_file:
            if line != "\n":
                vector_count += 1
        kgtk_file.close()

    with open(output_path, "w") as w2v_file:
        w2v_file.write("{} {}\n".format(vector_count, vector_dimension))
        with open(input_path, "r") as kgtk_file:
            for line in kgtk_file:
                items = line.split("\t")
                if items != ['\n']:
                    qnode = items[0]
                    vector = ""
                    for i in range(1,len(items)):
                        vector += items[i]
                        if i != len(items) - 1:
                            vector += " "
                    w2v_file.write(qnode + " " + vector)
            kgtk_file.close()
        w2v_file.close()

In [9]:
relation_names_list = json.load(open("/out/embeddings/dynamic_rel_names.json"))
entity_names_list = json.load(open("/out/embeddings/entity_names_all_0.json"))
print("loaded entities and rels")
prop_count = len(relation_names_list)

# operators
operator_lhs = ComplexDiagonalDynamicOperator(vector_dimension, prop_count)
operator_rhs = ComplexDiagonalDynamicOperator(vector_dimension, prop_count)
comparator = DotComparator()
cos_comparator = CosComparator()
print("loading model")
with h5py.File("/out/embeddings/model.v600.h5", "r") as hf:
    operator_state_dict_lhs = {
        "real": torch.from_numpy(hf["model/relations/0/operator/lhs/real"][...]),
        "imag": torch.from_numpy(hf["model/relations/0/operator/lhs/imag"][...]),
    }
    operator_state_dict_rhs = {
        "real": torch.from_numpy(hf["model/relations/0/operator/rhs/real"][...]),
        "imag": torch.from_numpy(hf["model/relations/0/operator/rhs/imag"][...]),
    }
print("loaded model")
    
operator_lhs.load_state_dict(operator_state_dict_lhs)
operator_rhs.load_state_dict(operator_state_dict_rhs)
print("loaded state dict")

entity_to_index = {}
for i, entity in enumerate(entity_names_list):
    entity_to_index[entity] = i
    

rel_index = {}
for i, rel in enumerate(relation_names_list):
    rel_index[rel] = i

loaded entities and rels
loading model
loaded model
loaded state dict


In [10]:
# Load the embeddings
#with h5py.File("/out/embeddings/embeddings_all_0.v600.h5", "r") as hf:
hf = np.memmap("/out/embeddings/wikidata-20210215-dwd-v2-similarity-embed.2021-10-03T12_14.complex.np.mmap", mode='r',
              shape=(212010680,100))
arnold_embedding = torch.from_numpy(hf)
print("loaded embeddings")
print(np.shape(arnold_embedding))
print(arnold_embedding[0])

loaded embeddings
torch.Size([212010680, 100])
tensor([ 72, 167, 196, 188, 212,  93,  94, 191, 159, 240, 128,  61,  17, 222,
          8, 190,  57,  44,  98,  61, 237, 146, 204, 189, 225, 160, 170,  62,
         85, 136, 135, 190,  15, 233,  60,  61,  87, 130,   1, 190,  99, 186,
        251,  61, 177, 243, 204,  60, 187,  24, 205, 190,   2, 215, 113, 190,
         97,  37, 128, 190, 144,  44,  37,  62, 249, 145, 120,  62, 178, 155,
        156, 190, 183, 200,  15, 190,  96, 202, 190,  60,  44, 130,  31, 190,
         59, 101,  78, 190,   0, 122, 173,  61,  62,  69,  27, 190, 102, 166,
        146,  61], dtype=torch.uint8)


  arnold_embedding = torch.from_numpy(hf)


In [11]:
def get_embed(head, relation=None):
    ''' This function generate the embeddings for the tail entities:
            Head entities: Obtained from the model
            Head + relation: Obtained using torch
        :param head: subject Qnode
        :param relation: optional property
    '''
    if relation is None:
        return arnold_embedding[entity_to_index[head], :].detach().numpy()
    return  operator_lhs(
                arnold_embedding[entity_to_index[head], :].view(1, vector_dimension),
                torch.tensor([rel_index[relation]])
            ).detach().numpy()[0]

In [12]:
def kgtk_most_similar(
    vectors,
    positive,
    relation_label="similarity_score",
    add_label_description=False,
    output_path=None,
    topn=25,
):
    """
    find topn similar Qnodes, add label and decription for the Qnodes
    
    :param vectors: vector space loaded into gensim KeyedVectors model
    :param positive: vector(s) or Qnode(s) to find similar entities for
    :param relation_label: name of the property to be used for the output file
    :param add_label_description: boolean parameter to add label and description for matched entities
    :param output_path: path to store the output file
    :param topn: desirednumber of similar entities
    """
    result = []
    if add_label_description:
        fp = tempfile.NamedTemporaryFile(
            mode="w", suffix=".tsv", delete=False, encoding="utf-8"
        )
        fp.write("node1\tlabel\tnode2\n")
        for (qnode, similarity) in vectors.most_similar(positive=positive, topn=topn):
            fp.write("{}\t{}\t{}\n".format(qnode, relation_label, similarity))
        filename = fp.name
        fp.close()
        
        os.environ["_temp_file"] = filename

        result = !$kypher -i label -i description -i "$_temp_file" --as sim \
--match 'sim: (n1)-[]->(similarity), label: (n1)-[]->(lab), description: (n1)-[]->(des)' \
--return 'distinct n1 as node1, similarity as node2, "similarity" as label, lab as `node1;label`, des as `node1;description`' \
--order-by 'cast(similarity, float) desc' 
        
        os.remove(filename)
        
    else:
        result.append("node1\tlabel\tnode2\n")
        for (qnode, similarity) in vectors.most_similar(positive=positive, topn=topn):
            result.append("{}\t{}\t{}\n".format(qnode, relation_label, similarity))

    if output_path:
        handle = open(output_path, "w")
        for line in result:
            handle.write(line)
            handle.write("\n")
        handle.close()
    else:
        columns = result[0].split("\t")
        data = []
        for line in result[1:]:
            data.append(line.split("\t"))
        return pd.DataFrame(data, columns=columns)

In [13]:
#ge_vectors = KeyedVectors.load_word2vec_format(f"/out/embeddings/embeddings.{label}.{orientation}.w2v", binary=False)
ge_vectors = KeyedVectors.load_word2vec_format(f"/out/embeddings/embeddings.P166.objects.w2v", binary=False)
print(ge_vectors[0])
print(ge_vectors[1])

[ 0.2643581   0.21156222  0.5788594   0.36401397 -0.7291234  -0.9513682
 -0.0573146   0.21806757 -0.38710174  1.0672288  -0.14235587 -0.4050787
  0.11165585 -0.26143607 -0.2416543   0.2743238  -0.87956345 -0.06023194
  0.00392611  0.3661835   0.34108764  0.12415829  0.43348312  0.21580559
  0.06566405 -0.6339239   1.4066758   0.32884046  0.19940382 -0.6690317
 -0.20738026 -0.16703208  0.81144345 -1.0962225   0.43661723 -0.27497283
 -1.3284373  -0.2518356   0.21857326  0.679327   -0.25119677  0.01217394
 -0.05831678  0.70708734  0.50010157 -0.05189103  0.27469996 -0.25614184
  1.4016455   0.68699235  0.7174062   0.15688139 -0.8468862  -0.24911048
 -0.8273213   0.04136847 -0.67413527  0.7202095   0.39384627  0.2784951
  0.18528341 -0.35719046 -0.50167567 -0.24029404 -0.10396154  0.36590058
 -0.5166115  -0.0693279  -0.46011594 -0.24058881  0.09629997  0.44318908
  0.47394228  1.128845    0.0400658  -0.4829106   0.20886903  0.853481
 -0.32131562  0.20796661 -1.0610319  -0.6526282   0.17811

In [14]:
_vector = get_embed('Q2263', 'P166')
print(kgtk_most_similar(ge_vectors, positive=[_vector], topn=10))

       node1             label                node2\n
0   Q1339438  similarity_score   0.3176147937774658\n
1    Q165503  similarity_score   0.3113202154636383\n
2  Q15090084  similarity_score   0.3110175132751465\n
3   Q1706722  similarity_score   0.2859508693218231\n
4   Q1895274  similarity_score   0.2754642963409424\n
5    Q974780  similarity_score    0.275143563747406\n
6    Q931830  similarity_score  0.27366748452186584\n
7  Q21450566  similarity_score   0.2706688642501831\n
8    Q397610  similarity_score  0.26408103108406067\n
9   Q1205214  similarity_score  0.26364681124687195\n


In [8]:
# import csv

# # make subset of embeddings with P166 nodes
# all_p166_objects = set()
# tsv_file = open(f"/out/data/propertiesSplit_final/claims.P166.tsv")
# read_tsv = csv.reader(tsv_file, delimiter="\t")
# next(read_tsv)
# for line in read_tsv:
#     all_p166_objects.add(line[2])
# print("loaded all p166 objects")
# print(len(all_p166_objects))

# with open(f"/out/embeddings/embeddings.P166.objects.tsv", 'w') as file:
#     writer = csv.writer(file, delimiter='\t')
#     tsv_file = open("/out/embeddings/entities_output.tsv")
#     read_tsv = csv.reader(tsv_file, delimiter="\t")
#     for line in read_tsv:
#         if line[0] in all_p166_objects:
#             writer.writerow(line)
# print("wrote to tsv format")
            
convert_kgtk_to_w2v(f"/out/embeddings/embeddings.P166.objects.tsv", f"/out/embeddings/embeddings.P166.objects.w2v")