In [1]:
from sentence_transformers import SentenceTransformer
import subprocess
from docContentManager import DocContentManager
from keybert import KeyBERT
from itertools import chain
import hnswlib
# Sentence encoder used by get_embedding() to embed text for the HNSW lookup.
sentence_transformer_model = SentenceTransformer('all-MiniLM-L6-v2')
# KeyBERT keyword extractor configured with the same model name; used by
# expandQueryByKeyBert() and findKeywords().
kw_model = KeyBERT(model='all-MiniLM-L6-v2')

# Vector size the HNSW index is created with.
# NOTE(review): presumably the output size of 'all-MiniLM-L6-v2' and of the
# vectors stored in the index file -- confirm.
dimension = 384 
# Cosine-space HNSW index; its contents are loaded from a file given by a
# relative path.
index = hnswlib.Index(space='cosine', dim=dimension)
index.load_index('hnsw_index_all.bin')
# Passage store; the notebook calls d.fetchPassageContent(pid) to get the text
# of a passage.
d = DocContentManager()


# KeyBERT - Query Expansion and Keyword Generation

In [2]:
def expandQueryByKeyBert(query, passage_list, k=5):
    """Expand ``query`` with the top KeyBERT keywords of the given passages.

    Args:
        query: The user's query string.
        passage_list: Iterable of passage strings to mine for expansion terms.
        k: Maximum number of keywords to request from KeyBERT.

    Returns:
        A single space-separated string: the extracted keywords followed by
        the original query. When no keywords are extracted the query is
        returned unchanged.
    """
    all_passages = " ".join(passage_list)

    # The query's own words are excluded so the expansion only adds new terms.
    # They are lower-cased because the vectorizer KeyBERT builds lower-cases
    # tokens before filtering stop words; a capitalised query word would
    # otherwise never match.
    query_terms = [term.lower() for term in query.split()]
    keywords = kw_model.extract_keywords(all_passages, stop_words=query_terms, top_n=k)

    # Join everything with single spaces; the previous `join(...) + query`
    # glued the last keyword onto the first query word.
    parts = [term for term, _ in keywords]
    parts.append(query)
    return " ".join(part for part in parts if part)

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Shared WordNet lemmatizer; extract_keywords_from_query() uses it when called
# with lemmatize=True.
lemmatizer = WordNetLemmatizer()
# Function to extract keywords from a single query
def extract_keywords_from_query(query, lemmatize=False):
    """Return the keyword tokens of ``query``.

    The query is tokenized and lower-cased; tokens are optionally lemmatized;
    then English stopwords and tokens that are not purely alphanumeric are
    dropped. Token order is preserved.

    Args:
        query: Query string to process.
        lemmatize: When true, each token is passed through the shared
            ``lemmatizer`` before filtering.

    Returns:
        List of keyword strings.
    """
    lowered_tokens = [token.lower() for token in word_tokenize(query)]
    english_stopwords = set(stopwords.words('english'))

    candidates = lowered_tokens
    if lemmatize:
        candidates = list(map(lemmatizer.lemmatize, lowered_tokens))

    def is_keyword(token):
        # Keep only alphanumeric tokens that are not stopwords.
        return token not in english_stopwords and token.isalnum()

    return list(filter(is_keyword, candidates))

In [4]:
def findKeywords(query, k=5):
    """Reduce ``query`` to its top KeyBERT keywords.

    Args:
        query: Text to extract keywords from.
        k: Maximum number of keywords to request.

    Returns:
        The extracted keywords joined by single spaces, in the order KeyBERT
        returned them (an empty string when nothing is extracted).
    """
    terms = []
    for term, _score in kw_model.extract_keywords(query, top_n=k):
        terms.append(term)
    return " ".join(terms)


In [5]:
# Passage ids at or above this bound are dropped from the BM25 results.
# NOTE(review): presumably ids beyond the indexed passage range -- confirm.
_BM25_PID_UPPER_BOUND = 7999999

# Hard cap on BM25 invocations per call, so query reformulation always ends.
_BM25_MAX_ATTEMPTS = 5


def _run_bm25(query, executable='./bm25'):
    """Run the BM25 binary once and return the passage ids it printed."""
    process = subprocess.run([executable, query], capture_output=True, text=True)
    if process.returncode != 0:
        print("Error:", process.stderr)
        return []

    passage_ids = []
    for line in process.stdout.splitlines():
        try:
            pid = int(line)
        except ValueError:
            # Blank lines and diagnostics such as "Word not found in index map."
            # are not passage ids.
            continue
        if pid < _BM25_PID_UPPER_BOUND:
            passage_ids.append(pid)
    return passage_ids


def executeBM25(query):
    """Return passage ids that the external ``./bm25`` program finds for ``query``.

    The query is first reduced to its keywords. If BM25 returns nothing for a
    non-empty query, the query is narrowed with ``findKeywords`` and retried.
    Retrying stops as soon as a reformulation repeats an already-tried query
    or the attempt cap is reached; the previous unbounded loop never returned
    when ``findKeywords`` gave back the same query.

    Args:
        query: Raw user query string.

    Returns:
        List of integer passage ids (possibly empty).
    """
    current_query = " ".join(extract_keywords_from_query(query))
    print("Cleaning up query :: ", current_query)

    attempted = set()
    for _ in range(_BM25_MAX_ATTEMPTS):
        relevant_docs = _run_bm25(current_query)
        if relevant_docs or not current_query:
            return relevant_docs

        attempted.add(current_query)
        current_query = findKeywords(current_query)
        if current_query in attempted:
            # Reformulation made no progress; searching again cannot help.
            return []
    return []
    

In [6]:
def get_embedding(inp_question):
    """Encode ``inp_question`` with the shared sentence-transformer model.

    Returns whatever ``sentence_transformer_model.encode`` produces when it is
    called with ``convert_to_tensor=True``.
    """
    encode_options = {"convert_to_tensor": True}
    return sentence_transformer_model.encode(inp_question, **encode_options)

In [10]:
from itertools import chain
def hnsw(input_question, k=100):
    """Return the ``k`` nearest passage labels in the HNSW index for a question.

    BM25 first selects candidate passages; their texts are joined into one
    string, embedded, and used as the query vector for the index. When BM25
    returns no passages the question itself is embedded instead, so the index
    is never queried with the embedding of an empty string.

    Args:
        input_question: The user's question.
        k: Number of neighbours to request from the index.

    Returns:
        The first row of labels returned by ``index.knn_query``.
    """
    print(input_question, " in semantic_search()")
    relevant_docs = executeBM25(input_question)

    if relevant_docs:
        # A single join replaces the repeated string concatenation in a loop.
        search_text = " ".join(d.fetchPassageContent(doc) for doc in relevant_docs)
    else:
        search_text = input_question

    labels, _distances = index.knn_query(get_embedding(search_text), k=k)
    return labels[0]
    

In [14]:
# Demo: retrieve passages for a sample query and print each id with its text,
# followed by a blank line.
result = hnsw("panthera lion")
for pid in result:
    print(f"{pid} : ", d.fetchPassageContent(pid), end="\n\n")



panthera lion  in semantic_search()
Cleaning up query ::  panthera lion


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


343230 :  Extraordinary sightings of the Timbavati White Lions at Kings Camp have been enjoyed by all. It looks like the Kubasa Pride have moved in and it looks like they are here to stay. The two older Machaton lionesses, (Djuma & Sengela) had an ill-fated run-in with this pride close to the camp and came out second best.

2291465 :  Lions are the dominant carnivores in their habitat and will drive away competitors or even kill them. The Lionâs head and body can be up to eight feet, two inches, and tail up to three feet, five inches. Its weight can be up to 550 pounds. Lions are primarily ground-dwellers, but occasionally jump up tree branches. Most Lions will remain in the same territory all year long, however some are nomadic and follow the seasonal prey.

952710 :  African Lion. Apart from the humans squeezing African lions off their lands, the King of Beasts has no natural enemies. Of the big cats, only the tiger is larger, although not by much. Adult male lions can weigh 330 to