In [12]:
!pip install faiss-gpu
!pip install sentence_transformers
!wget https://github.com/NEBULA3PR0JECT/cskg_data/raw/main/data.tgz
!tar -zxvf data.tgz

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0mhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0mhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
--2022-07-17 10:24:33--  https://github.com/NEBULA3PR0JECT/cskg_data/raw/main/data.tgz
Resolving github.com (github.com)... 140.82.112.4
Connec

In [2]:
from functools import partial
import pickle
import re
from secrets import randbelow
from typing import Callable, List, Tuple
import csv

import faiss
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

class Vocab:
    def __init__(self, words) -> None:
        self.idx_to_word = words
        self.word_to_idx = {word: idx for idx, word in enumerate(words)}

In [6]:
class CSKG_EMBEDDINGS():
    
    def __init__(self) -> None:
        print("init")
        #self.embedding_file = 'cskg_embeddings.csv'
        self.model = SentenceTransformer('all-mpnet-base-v2')
    
    def read_embedding_file(self, file) -> Tuple[Vocab, np.ndarray]:
        with open(file, 'r') as f:
            vector_dim = len(next(f).split(",\"")[1].split(','))
            file_len = len(f.readlines())
            shape = tuple([file_len + 1,vector_dim])
            print(shape)
            embeddings = np.zeros(shape, dtype=np.float32)
        with open(self.embedding_file, 'r') as d:
            words = []
            for i, line in tqdm(enumerate(d), total=shape[0]):
                embedding = line.split("text_embedding,")[1].split("\"")[1].split(',')
                word = line.split("text_embedding,")[0].split(",")[0]
                embedding = np.array([float(x) for x in embedding])
                words.append(word)
                embeddings[i] = embedding
        self.words = words
        vocab = Vocab(words)
        
        return (vocab, embeddings)


    def build_index_db(self, metric: str, embeddings: np.ndarray, vocab):

        if metric == 'cosine':
            index = faiss.IndexFlatIP(embeddings.shape[-1])
        elif metric == 'l2':
            index = faiss.IndexFlatL2(embeddings.shape[-1])
        else:
            raise ValueError(f'Bad metric: {metric}')
        index.add(embeddings)
        faiss.write_index(index,"data/vector.index")  # save the index to disk
        self.vocab = vocab
        self.index = index

        with open("data/vocab.pic", 'wb') as f:
                pickle.dump(vocab, f, protocol=4)
        csv.field_size_limit(512000)
        rel = []
        with open("data/cskg_sentences1.tsv") as file:
            tsv_file = csv.reader(file, delimiter="\t")
            for line in tsv_file:
                rel.append(line[3])
        with open("data/relations.pic", 'wb') as f:
            pickle.dump(rel, f, protocol=4)
    
    def load_index_db(self):
        with open("data/vocab.pic", 'rb') as f:
            self.vocab = pickle.load(f)
        with open("data/relations.pic", 'rb') as f:
            self.relations = pickle.load(f)
        self.index = faiss.read_index("data/vector.index")
        # print("VOCAB LOADED, size: ", len(self.vocab.))
        # print("RELATIONS LOADED, size: ", len(self.relations))
        print("INDEX Loaded")
    
    def parse_relations(self, rel):
        isa_relations = []
        description_relations = []
        property_values_relations = []
        for  relations in rel.split('+'):
            if 'isa' in relations:
                isa_relations = relations.split('->')[0].split('isa(')
            if 'description(' in relations:
                description_relations = relations.split('->')[0].split('description(')
            if 'property_values(' in relations:
                property_values_relations = relations.split('->')[0].split('property_values(')
            #print(relations)
            if (len(isa_relations) > 0):
                for isa in isa_relations[1].split(','):
                    print("----------->ISA ",isa)
            if (len(description_relations) > 0):
                for desc in description_relations[1].split(','):
                    print("------------>DR ",desc)
            if (len(property_values_relations) > 0):
                for pv in property_values_relations[1].split(','):
                    print("----------->PVR ",pv)
        #input()
    
    def query_for_events(self, query):
        query = "at:" + query.replace(" ", "_")
        print(query)
        query=np.array([self.model.encode(query)])
        #faiss.normalize_L2(query)
        scores, candidate_ids = self.index.search(query, 5)
        scores = scores.flatten()
        candidate_ids = candidate_ids.flatten()
        top_k_indices = np.argsort(scores)[:5]
        scores = scores[top_k_indices]
        candidate_ids = candidate_ids[top_k_indices]
        
        for candidate_id, score in zip(np.nditer(candidate_ids), np.nditer(scores)):
            candidate = self.vocab.idx_to_word[candidate_id]
            relation = self.relations[candidate_id]
            # print(candidate, relation)
            # print(candidate[0:5])
            # if qtype == 'at' and candidate[0:3] == 'at:':
            #     print("Candidate: ",candidate, score)
            #     relation = relation.replace('\\\'','')
            #     cskg_emb.parse_relations(relation)
            #if qtype == 'cn' and candidate[0:5] == '/c/en':
            print("Candidate: ",candidate, score)
            relation = relation.replace('\\\'','')
            self.parse_relations(relation)

    def query_for_concepts(self, query):
        query = "/c/en/" + query.replace(" ", "_")
        print(query)
        query=np.array([self.model.encode(query)])
        #faiss.normalize_L2(query)
        scores, candidate_ids = self.index.search(query, 5)
        scores = scores.flatten()
        candidate_ids = candidate_ids.flatten()
        top_k_indices = np.argsort(scores)[:5]
        scores = scores[top_k_indices]
        candidate_ids = candidate_ids[top_k_indices]
        
        for candidate_id, score in zip(np.nditer(candidate_ids), np.nditer(scores)):
            candidate = self.vocab.idx_to_word[candidate_id]
            relation = self.relations[candidate_id]
            # print(candidate, relation)
            # print(candidate[0:5])
            # if qtype == 'at' and candidate[0:3] == 'at:':
            #     print("Candidate: ",candidate, score)
            #     relation = relation.replace('\\\'','')
            #     cskg_emb.parse_relations(relation)
            #if qtype == 'cn' and candidate[0:5] == '/c/en':
            print("Candidate: ",candidate, score)
            relation = relation.replace('\\\'','')
            self.parse_relations(relation)

In [9]:
cskg_emb = CSKG_EMBEDDINGS()
cskg_emb.load_index_db()

init
INDEX Loaded


In [11]:
print(cskg_emb.query_for_events('at:boat_in_the_sea'))

at:at:boat_in_the_sea
Candidate:  at:personx_takes_personx's_boat 0.63855773
----------->PVR  at:xAttr /c/en/adventurous
----------->PVR  at:xAttr /c/en/boater
----------->PVR  at:xAttr /c/en/brave
----------->PVR  at:xAttr /c/en/marine
----------->PVR  at:xAttr /c/en/sailor
----------->PVR  at:xEffect at:gets_exercise_if_the_boat_has_no_motor_installed
----------->PVR  at:xEffect at:gets_wet
----------->PVR  at:xIntent /c/en/go_boating
----------->PVR  at:xIntent at:to_go_fishing
----------->PVR  at:xNeed at:find_the_boat
----------->PVR  at:xNeed at:make_sure_the_boat_runs
----------->PVR  at:xNeed at:to_gas_the_boat_up
----------->PVR  at:xNeed at:to_get_permission
----------->PVR  at:xNeed at:to_get_the_key
----------->PVR  at:xNeed at:went_to_the_boating_place
----------->PVR  at:xReact /c/en/relaxed
----------->PVR  at:xReact /c/en/satisfied
----------->PVR  at:xWant at:to_cruise_around_in_the_water
----------->PVR  at:xWant at:to_go_fishing
----------->PVR  at:xWant at:to_go_som

In [10]:
print(cskg_emb.query_for_concepts('/c/en/boat'))

/c/en//c/en/boat
Candidate:  /c/en/tugboat/n 0.60683084
----------->ISA  /c/en/boat/n
----------->ISA  /c/en/motorboat/n
Candidate:  /c/en/speedboat/n 0.6068311
----------->ISA  /c/en/boat/n
----------->ISA  /c/en/motorboat/n
Candidate:  /c/en/runabout/n 0.6068311
----------->ISA  /c/en/boat/n
----------->ISA  /c/en/motorboat/n
Candidate:  /c/en/houseboat/n 0.61075974
----------->ISA  /c/en/boat/n
----------->ISA  /c/en/fuel_powered_device/n
----------->ISA  /c/en/home/n
Candidate:  /c/en/kayak/v/wn/sport 0.622153
----------->ISA  /c/en/boat/v/wn/navigation
None
