In [2]:
from sentence_transformers import SentenceTransformer
import subprocess
from docContentManager import DocContentManager
from keybert import KeyBERT
from itertools import chain
import hnswlib
from sklearn.metrics.pairwise import cosine_similarity
sentence_transformer_model = SentenceTransformer('all-MiniLM-L6-v2')
kw_model = KeyBERT(model='all-MiniLM-L6-v2')

dimension = 384 
index = hnswlib.Index(space='cosine', dim=dimension)
index.load_index('hnsw_index_all.bin')
d = DocContentManager()


# KeyBERT - Query Expansion and Keyword Generation

In [3]:
def expandQueryByKeyBert(query, passage_list, k=5):
    all_passages = " ".join(passage_list)
    keywords = kw_model.extract_keywords(all_passages, stop_words=query.split() ,top_n=k)
    new_user_query =  " ".join([term for term, _ in keywords]) + query
    return new_user_query

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# Function to extract keywords from a single query
def extract_keywords_from_query(query, lemmatize=False):
    # Tokenize the query
    words = word_tokenize(query)
    # Convert to lower case
    words = [word.lower() for word in words]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    
    if lemmatize:
        keywords_singular = [lemmatizer.lemmatize(word) for word in words]
    else:
        keywords_singular = words
    keywords = [word for word in keywords_singular if word not in stop_words and word.isalnum()]
    return keywords

In [5]:
def findKeywords(query, k=5):
    keywords = kw_model.extract_keywords(query, top_n=k)
    new_user_query =  " ".join([term for term, _ in keywords])
    return new_user_query


In [6]:
def executeBM25(query):

    relevant_docs = []

    # query = extract_keywords_from_query(query)
    # query = " ".join(query) 
    # print("Cleaning up query :: ", query)
    while True:
        cpp_executable_path = './bm25' 
        command = [cpp_executable_path, query]
        process = subprocess.run(command, capture_output=True, text=True)
        
        if process.returncode == 0:
            output_lines = process.stdout.strip().split('\n')

            # Process each line of output
            for line in output_lines:
                if line == "Word not found in index map." or line == "":
                    # print("New word - not found in corpus")
                    continue
                else:
                    pid = int(line)
                    if pid < 7999999: 
                        relevant_docs.append(pid)
        else:
            print("Error:", process.stderr)
        if len(relevant_docs) == 0 and len(query)>0:
            query = findKeywords(query)
        else:
            return relevant_docs
    

In [7]:
def get_embedding(inp_question):
    return sentence_transformer_model.encode(inp_question, convert_to_tensor=True)

In [34]:
def hnsw(input_question, k=100):
    # print(input_question, " in semantic_search()")

    # Encode input and get results from HNSW
    embeddings = sentence_transformer_model.encode(input_question, convert_to_tensor=True)
    labels, distances = index.knn_query(embeddings, k=k)
    labels = list(chain(*labels))
    # print(labels)

    # Get relevant words from HNSW returned documents 
    relevant_embeddings = ""
    for doc in labels:
        relevant_embeddings = relevant_embeddings + " " + d.fetchPassageContent(doc)
    relevant_embeddings = extract_keywords_from_query(relevant_embeddings)
    relevant_embeddings = " ".join(relevant_embeddings)
    new_query = findKeywords(relevant_embeddings, 5)


    # Execute BM25 on new query
    print("Post Pseudo Relevance Feedback Query:: ", new_query)
    relevant_docs = executeBM25(input_question)
    # print(relevant_docs)
    
    return relevant_docs

In [64]:
import time
start = time.time()
query = "panthera leo"
print("Given Query ::", query)
result = hnsw(query, 100)
end = time.time()

print("\n\n\nTook", end-start, " seconds to search")
print("Results\n")
for pid in result:
    print(str(pid)+" : ", d.fetchPassageContent(pid))
    print()



Given Query :: panthera leo
Post Pseudo Relevance Feedback Query::  lion panther lionesses panthera felines


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)





Took 1.388951063156128  seconds to search
Results

6141792 :  Classification. Lions belong to the genus Panthera which contains well known animals such as the tiger, leopard, and jaguar. Within the genus Panthera, the lion is further classifed as the species Panthera leo.The full scientific classification is as follows: 1  Kingdom: Animalia. 2  Phylum: Chordata. 3  Class: Mammalia. 4  Order: Carnivora. 5  Family: Felidae. 6  Genus: Panthera.ithin the genus Panthera, the lion is further classifed as the species Panthera leo. The full scientific classification is as follows: 1  Kingdom: Animalia. 2  Phylum: Chordata. 3  Class: Mammalia. 4  Order: Carnivora. 5  Family: Felidae. 6  Genus: Panthera.

1074234 :  Humans, of course, are Homo sapiens. The full classification for a lion would be: Kingdom, Animalia (animals); Phylum, Chordata (vertebrate animals); Class, Mammalia (mammals); Order, Carnivora (meat eaters); Family, Felidae (all cats); Genus, Panthera (great cats); Species, leo (

In [10]:
def query_passage_similarity(query, pid):
    embedding_i = get_embedding(query)
    embedding_j = get_embedding(d.fetchPassageContent(pid))
    similarity = cosine_similarity(embedding_i.reshape(1, -1), embedding_j.reshape(1, -1))[0, 0]
    return similarity

In [16]:
similarity_score = query_passage_similarity(query, result[0])
print("Similarity Score", similarity_score)

Similarity Score 0.23799196
