In [None]:
!pip install faiss-gpu
!pip install sentence_transformers

Uncomment it for first run

In [None]:
#!wget http://74.82.28.99:9000/cskg/data.tgz
#!tar -zxvf data.tgz

In [None]:

!ls data

If you want to re-index embeddings. run build_index_db()

In [None]:
from functools import partial
import pickle
import re
from secrets import randbelow
from typing import Callable, List, Tuple
import csv

import faiss
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
class Vocab:
    def __init__(self, words) -> None:
        self.idx_to_word = words
        self.word_to_idx = {word: idx for idx, word in enumerate(words)}

In [87]:
class CSKG_EMBEDDINGS():
    
    def __init__(self) -> None:
        print("init")
        #self.embedding_file = 'cskg_embeddings.csv'
        self.model = SentenceTransformer('all-mpnet-base-v2')
    
    def read_embedding_file(self, file) -> Tuple[Vocab, np.ndarray]:
        with open(file, 'r') as f:
            vector_dim = len(next(f).split(",\"")[1].split(','))
            file_len = len(f.readlines())
            shape = tuple([file_len + 1,vector_dim])
            print(shape)
            embeddings = np.zeros(shape, dtype=np.float32)
        with open(self.embedding_file, 'r') as d:
            words = []
            for i, line in tqdm(enumerate(d), total=shape[0]):
                embedding = line.split("text_embedding,")[1].split("\"")[1].split(',')
                word = line.split("text_embedding,")[0].split(",")[0]
                embedding = np.array([float(x) for x in embedding])
                words.append(word)
                embeddings[i] = embedding
        self.words = words
        vocab = Vocab(words)
        
        return (vocab, embeddings)


    def build_index_db(self, metric: str, embeddings: np.ndarray, vocab):

        if metric == 'cosine':
            index = faiss.IndexFlatIP(embeddings.shape[-1])
        elif metric == 'l2':
            index = faiss.IndexFlatL2(embeddings.shape[-1])
        else:
            raise ValueError(f'Bad metric: {metric}')
        index.add(embeddings)
        faiss.write_index(index,"data/vector.index")  # save the index to disk
        self.vocab = vocab
        self.index = index

        with open("data/vocab.pic", 'wb') as f:
                pickle.dump(vocab, f, protocol=4)
        csv.field_size_limit(512000)
        rel = []
        with open("data/cskg_sentences1.tsv") as file:
            tsv_file = csv.reader(file, delimiter="\t")
            for line in tsv_file:
                rel.append(line[3])
        with open("data/relations.pic", 'wb') as f:
            pickle.dump(rel, f, protocol=4)
    
    def load_index_db(self):
        with open("data/vocab.pic", 'rb') as f:
            self.vocab = pickle.load(f)
        with open("data/relations.pic", 'rb') as f:
            self.relations = pickle.load(f)
        self.index = faiss.read_index("data/vector.index")
        # print("VOCAB LOADED, size: ", len(self.vocab.))
        # print("RELATIONS LOADED, size: ", len(self.relations))
        print("INDEX Loaded")
    
    def parse_relations(self, rel):
        isa_relations = []
        description_relations = []
        property_values_relations = []
        all_relations = {}
        for  relations in rel.split('+'):
            if 'isa' in relations:
                isa_relations = relations.split('->')[0].split('isa(')
            if 'description(' in relations:
                description_relations = relations.split('->')[0].split('description(')
            if 'property_values(' in relations:
                property_values_relations = relations.split('->')[0].split('property_values(')
            #print(relations)
            if (len(isa_relations) > 1):
                for isa in isa_relations[1].split(','):
                    #print("----------->ISA ",isa)
                    if len(isa.split(' ')) >= 2:
                        rel = isa.split(' ')[0]
                        node = isa.split(' ')[1]
                        old_node = []
                        if rel in all_relations:
                            old_node = all_relations[rel]
                            
                        old_node.append(node)
                        all_relations[rel] = old_node
            if (len(description_relations) > 1):
                for desc in description_relations[1].split(','):
                    #print("------------>DR ",desc)
                    if len(desc.split(' ')) >= 2:
                        rel = desc.split(' ')[0]
                        node = desc.split(' ')[1]
                        old_node = []
                        if rel in all_relations:
                            old_node = all_relations[rel]

                        old_node.append(node)
                        all_relations[rel] = old_node
            if (len(property_values_relations) > 1):
                
                for pv in property_values_relations[1].split(','):
                    #print("----------->PVR ",pv)
                    if len(pv.split(' ')) >= 2:
                        rel = pv.split(' ')[0]
                        node = pv.split(' ')[1]
                        old_node = []
                        if rel in all_relations:
                            old_node = all_relations[rel]
                            
                        old_node.append(node)
                        all_relations[rel] = old_node
        return(all_relations)
        #input()
    
    def query_for_events(self, rel, query, topk):
        if rel == '':
            query = "at:" + query.replace(" ", "_")    
        else:
            query = "at:" + rel + " at:" + query.replace(" ", "_")
        print("Query: ",query)
        query=np.array([self.model.encode(query)])
        #faiss.normalize_L2(query)
        scores, candidate_ids = self.index.search(query, topk)
        scores = scores.flatten()
        candidate_ids = candidate_ids.flatten()
        top_k_indices = np.argsort(scores)[:topk]
        scores = scores[top_k_indices]
        candidate_ids = candidate_ids[top_k_indices]
        all_events = []
        all_candidates = {}
        for candidate_id, score in zip(np.nditer(candidate_ids), np.nditer(scores)):
            candidate = self.vocab.idx_to_word[candidate_id]
            relation = self.relations[candidate_id]
            relation = relation.replace('\\\'','')
            kg_path = self.parse_relations(relation)
            for kg_rel in kg_path:
                for nbr in kg_path[kg_rel]:
                    #print(kg_rel, '->', nbr)
                    all_events.append({'neighbor': nbr, 'kg_relation': kg_rel, 'candidate': candidate, 'score': score})
                    all_candidates[candidate] = {'candidate': candidate, 'score': score, 'kg_path': kg_path}
        #print(all_events)
        return(all_events, all_candidates)

    def query_for_concepts(self, query, topk):
        query = "It is a /c/en/" + query.replace(" ", " /c/en/")
        print(query)
        query=np.array([self.model.encode(query)])
        #faiss.normalize_L2(query)
        scores, candidate_ids = self.index.search(query, topk)
        scores = scores.flatten()
        candidate_ids = candidate_ids.flatten()
        top_k_indices = np.argsort(scores)[:topk]
        scores = scores[top_k_indices]
        candidate_ids = candidate_ids[top_k_indices]
        
        for candidate_id, score in zip(np.nditer(candidate_ids), np.nditer(scores)):
            candidate = self.vocab.idx_to_word[candidate_id]
            relation = self.relations[candidate_id]
            relation = relation.replace('\\\'','')
            self.parse_relations(relation)

In [88]:
cskg_emb = CSKG_EMBEDDINGS()
cskg_emb.load_index_db()

init
INDEX Loaded


In [80]:
#query1 = 'men in tuxedo presents ring'
#query2 = 'women wears white dress'


query1 = 'suspicious man in night street'
query2 = 'man breaks the car window'
query3 = 'man wearing black drerss'
all_rel_right, c1 = cskg_emb.query_for_events('xNeed', query2, 5)
all_rel_middle, c2 = cskg_emb.query_for_events('xNeed', query3, 5)
all_rel_left, c3 = cskg_emb.query_for_events('xNeed', query1, 5)
#print("R->", all_rel_right)
#print("L-> ", all_rel_left)
# rel_right = []
# rel_left = []
left_nodes = []
right_nodes = []
middle_nodes = []
for n in all_rel_left:
    left_nodes.append(n['neighbor'])
for n in all_rel_right:
    right_nodes.append(n['neighbor'])
for n in all_rel_middle:
    middle_nodes.append(n['neighbor'])
#all_nodes = list(dict.fromkeys(all_nodes))
embeddings_left = cskg_emb.model.encode(left_nodes, convert_to_tensor=True)
embeddings_right = cskg_emb.model.encode(right_nodes, convert_to_tensor=True)
#Compute cosine-similarities for each sentence with each other sentence
# # cosine_scores = util.cos_sim(embeddings_left, embeddings_right)

# # #Find the pairs with the highest cosine similarity scores
# # pairs = []
# # for i in range(len(cosine_scores)-1):
# #     for j in range(i+1, len(cosine_scores)):
# #         pairs.append({'index': [i, j], 'score': cosine_scores[i][j]})

# # #Sort scores in decreasing order
# # pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)

# # for pair in pairs[0:50]:
# #     i, j = pair['index']
# #     print("{} \t\t {} \t\t Score: {:.4f}".format(left_nodes[i], right_nodes[j], pair['score']))
# for cand in all_rel_right.values():
#     #print("Right: ",cand)
#     for path in cand['kg_path'].values():
#         #print(path)
#         for val in path:
#             rel_right.append(val)
# for cand in all_rel_left.values():
#     #print("Left: ",cand)
#     for path in cand['kg_path'].values():
#         #print(path)
#         for val in path:
#             rel_left.append(val)
# print("Left-> ",rel_left)
# print("Right-> ", rel_right)
vectors_number = min(len(left_nodes), len(right_nodes))
#embeddings1 = cskg_emb.model.encode(rel_right, convert_to_tensor=True)
#embeddings2 = cskg_emb.model.encode(rel_left, convert_to_tensor=True)

# cosine_scores = util.cos_sim(embeddings_left, embeddings_right)

# all = []


# for i in range(vectors_number):
#     all.append(( cosine_scores[i][i].item(), left_nodes[i] + '->' + right_nodes[i], i))
#     #print("{} \t\t {} \t\t Score: {:.4f}".format(rel_right[i], rel_left[i], cosine_scores[i][i]))
# all.sort(reverse=True)
# for pair in all:
#     print("pair:-> ", pair)
#     print("candidate L:-> ", all_rel_left[pair[2]]['candidate'])
#     print("candidate R:-> ", all_rel_right[pair[2]]['candidate'])
#print(all[:30])
sentences = left_nodes + right_nodes + middle_nodes
sentences = list(dict.fromkeys(sentences))
embeddings = cskg_emb.model.encode(sentences, convert_to_tensor=True)
cosine_scores = util.cos_sim(embeddings, embeddings)

#Find the pairs with the highest cosine similarity scores
pairs = []
for i in range(len(cosine_scores)-1):
    for j in range(i+1, len(cosine_scores)):
        pairs.append({'index': [i, j], 'score': cosine_scores[i][j]})

#Sort scores in decreasing order
pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)



Query:  at:xNeed at:man_breaks_the_car_window
Query:  at:xNeed at:man_wearing_black_drerss
Query:  at:xNeed at:suspicious_man_in_night_street


In [122]:
#query1 = 'suspicious man in night street'
#query2 = 'man breaks the car window'
#query3 = 'man wearing black drerss'
#triplets = ['man has baseball cap', 'green grass on field', 'tribune full of people', 'people play game']
triplets = ['dark street', 'car in park lot', 'man wearing dark', 'man has black sunglasses', 'man is criminal', 'man breaks down a car door']
all_cand = []
for triple1 in triplets:
    for triple2 in triplets:
        if triple2 != triple1:
            #print(triple1, "->", triple2)
            a, event1 = cskg_emb.query_for_events('', triple1, 40)
            b, event2 = cskg_emb.query_for_events('', triple2, 40)
            #print(event1.keys(), "->", event2.keys())
            embeddings1 = cskg_emb.model.encode(list(event1.keys()), convert_to_tensor=True)
            embeddings2 = cskg_emb.model.encode(list(event2.keys()), convert_to_tensor=True)
            cosine_scores = util.cos_sim(embeddings1, embeddings2)
            all = []

            vectors_number = min(len(list(event1.keys())), len(list(event2.keys())))
            for i in range(vectors_number):
                if  cosine_scores[i][i].item() > 0.65:
                    all.append(( cosine_scores[i][i].item(), list(event1.keys())[i] + '->' + list(event2.keys())[i], i))
                #print("{} \t\t {} \t\t Score: {:.4f}".format(rel_right[i], rel_left[i], cosine_scores[i][i]))
            all.sort(reverse=True)
            #for pair in all:
                #print("pair:-> ", pair)
                #print("candidate L:-> ", all_rel_left[pair[2]]['candidate'])
                #print("candidate R:-> ", all_rel_right[pair[2]]['candidate'])
            #print(all[:10])
            all_cand.append(all)
print(all_cand)          
#     #for cand in c.keys():
#     all_cand.append(c.keys())
    
# # all_rel_right, c1 = cskg_emb.query_for_events('xEffect', query2, 15)
# # all_rel_middle, c2 = cskg_emb.query_for_events('xEffect', query3, 15)
# # all_rel_left, c3 = cskg_emb.query_for_events('xEffect', query1, 15)
# #all_cand = list(c1.keys()) +list(c2.keys()) + list(c3.keys())
# sentences = list(dict.fromkeys(all_cand))
# embeddings = cskg_emb.model.encode(sentences, convert_to_tensor=True)
# cosine_scores = util.cos_sim(embeddings, embeddings)

# #Find the pairs with the highest cosine similarity scores
# pairs = []
# for i in range(len(cosine_scores)-1):
#     for j in range(i+1, len(cosine_scores)):
#         pairs.append({'index': [i, j], 'score': cosine_scores[i][j]})

# #Sort scores in decreasing order
# pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)
# for pair in pairs:
    # i, j = pair['index']
    # print("{} \t\t {} \t\t Score: {:.4f}".format(sentences[i], sentences[j], pair['score']))

Query:  at:dark_street
Query:  at:car_in_park_lot
Query:  at:dark_street
Query:  at:man_wearing_dark
Query:  at:dark_street
Query:  at:man_has_black_sunglasses
Query:  at:dark_street
Query:  at:man_is_criminal
Query:  at:dark_street
Query:  at:man_breaks_down_a_car_door
Query:  at:car_in_park_lot
Query:  at:dark_street
Query:  at:car_in_park_lot
Query:  at:man_wearing_dark
Query:  at:car_in_park_lot
Query:  at:man_has_black_sunglasses
Query:  at:car_in_park_lot
Query:  at:man_is_criminal
Query:  at:car_in_park_lot
Query:  at:man_breaks_down_a_car_door
Query:  at:man_wearing_dark
Query:  at:dark_street
Query:  at:man_wearing_dark
Query:  at:car_in_park_lot
Query:  at:man_wearing_dark
Query:  at:man_has_black_sunglasses
Query:  at:man_wearing_dark
Query:  at:man_is_criminal
Query:  at:man_wearing_dark
Query:  at:man_breaks_down_a_car_door
Query:  at:man_has_black_sunglasses
Query:  at:dark_street
Query:  at:man_has_black_sunglasses
Query:  at:car_in_park_lot
Query:  at:man_has_black_sung

In [81]:
all = []
for pair in pairs:
    i, j = pair['index']
    #print("{} \t\t {} \t\t Score: {:.4f}".format(sentences[i], sentences[j], pair['score']))
    all.append(sentences[i])
    all.append(sentences[j])
all =  list(dict.fromkeys(all))
cands = []
for i in all_rel_right:
    if i['neighbor'] in all:
        cands.append(i['candidate'])
for i in all_rel_left:
    if i['neighbor'] in all:
        cands.append(i['candidate'])
for i in all_rel_middle:
    if i['neighbor'] in all:
        cands.append(i['candidate'])
cands = list(dict.fromkeys(cands))
print(cands)

['at:personx_exchanges_insurance_information', "at:personx_hits_personx's_car", 'at:personx_assesses_the_damage', 'at:personx_pops_the_hood', 'at:personx_causes_an_accident', 'at:personx_reports_the_____to_the_police', 'at:personx_keeps_track_of_persony', 'at:personx_finds_the_culprit', "at:personx_retraces_persony's_steps", 'at:personx_asks_____what_happened', 'at:personx_immediately_went', 'at:personx_calls_the_man', 'at:personx_waits_until_the_next_day', 'at:personx_comes_within_the_scope', 'at:personx_helps_the_man']


Run "Events" Query

In [None]:
def walk_by_timeline(direction, query):  
    
    if direction == 'forward':
        category = 'xWant'
    elif direction == 'backward':
        category = 'xNeed'
    else:
        print("No such direction: ", direction)
    #print("Next q: ",category, query)
    all_rel = cskg_emb.query_for_events(category, query, 7)
    #print("All: ", all_rel)
    outputs = []
    for alr in all_rel:
        #print (alr)
        for x in all_rel[alr]['kg_path']:
            if x == 'at:'+category:
                for xneed in all_rel[alr]['kg_path'][x]:
                    outputs.append(xneed)
    return(outputs)

In [None]:
forwards = walk_by_timeline('forward','men in tuxedo')
print(forwards)
all_forwards = []
all_forwards.append(forwards)
for topk in range(3):
    if (len(forwards) > 0):
        for forward in forwards:
            #print("Next: ",forward)
            forwards = walk_by_timeline('forward', forward.replace('at:', '').replace('_',' '))
            all_forwards.append(forwards)
print(all_forwards)

In [None]:
backwards = walk_by_timeline('backward','women in white dress')
#print(forwards)
all_backwards = []
all_backwards.append(backwards)
for topk in range(3):
    if (len(backwards) > 0):
        for backward in backwards:
            #print("Next: ",forward)
            backwards = walk_by_timeline('backward', forward.replace('at:', '').replace('_',' '))
            all_backwards.append(backwards)
#print(all_forwards)

In [None]:

forwards = []
for ff in all_forwards:
    for f in ff:
        forwards.append(f)
print(len(forwards))
backwards = []
for ff in all_backwards:
    for f in ff:
        backwards.append(f)
print(len(backwards))


In [None]:
embeddings1 = cskg_emb.model.encode(backwards, convert_to_tensor=True)
embeddings2 = cskg_emb.model.encode(forwards, convert_to_tensor=True)
from sentence_transformers import util
cosine_scores = util.cos_sim(embeddings1, embeddings2)

#Output the pairs with their score
for i in range(len(backwards)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(backwards[i], forwards[i], cosine_scores[i][i]))


In [None]:
print(cskg_emb.query_for_concepts('men car night street', 10))

Run "concepts" query 

In [None]:
print(cskg_emb.query_for_concepts('george_washington', 10))