# Load Modules

In [106]:
import pickle
from sentence_transformers import SentenceTransformer, util
import subprocess
import torch
from docContentManager import DocContentManager
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from keybert import KeyBERT
sentence_transformer_model = SentenceTransformer('all-MiniLM-L6-v2')
kw_model = KeyBERT(model='all-MiniLM-L6-v2')


combined_corpus_embeddings_path = "embeddings/combined_corpus_embeddings"

# Load Embeddings

In [2]:
with open(combined_corpus_embeddings_path, 'rb') as file:
    corpus_embeddings = pickle.load(file)

In [5]:
corpus_embeddings = torch.tensor(corpus_embeddings)

  corpus_embeddings = torch.tensor(corpus_embeddings)


In [6]:
d = DocContentManager()

In [4]:
def generateEmbeddings(docs):
    query_embeddings = []
    for i in docs:
        if i<7999999:
            query_embeddings.append(corpus_embeddings[i])
    return query_embeddings

# KeyBERT - Query Expansion and Keyword Generation

In [107]:
def expandQueryByKeyBert(query, passage_list, k=5):
    all_passages = " ".join(passage_list)
    keywords = kw_model.extract_keywords(all_passages, stop_words=query.split() ,top_n=k)
    new_user_query =  " ".join([term for term, _ in keywords]) + query
    return new_user_query

In [60]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Function to extract keywords from a single query
def extract_keywords_from_query(query):
    # Tokenize the query
    words = word_tokenize(query)
    # Convert to lower case
    words = [word.lower() for word in words]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    keywords = [word for word in words if word not in stop_words and word.isalnum()]
    return keywords

In [76]:
def findKeywords(query):
    k = len(query.split()) - 1
    keywords = kw_model.extract_keywords(query, top_n=k)
    new_user_query =  " ".join([term for term, _ in keywords])
    return new_user_query


# BM25

In [88]:
def executeBM25(query):

    relevant_docs = []

    query = extract_keywords_from_query(query)
    query = " ".join(query) 
    while True:
        cpp_executable_path = './bm25' 
        command = [cpp_executable_path, query]
        process = subprocess.run(command, capture_output=True, text=True)
        
        if process.returncode == 0:
            output_lines = process.stdout.strip().split('\n')

            # Process each line of output
            for line in output_lines:
                if line == "Word not found in index map." or line == "":
                    # print("New word - not found in corpus")
                    continue
                else:
                    pid = int(line)
                    if pid < 7999999: 
                        relevant_docs.append(pid)
        else:
            print("Error:", process.stderr)
        if len(relevant_docs) == 0 and len(query)>0:
            query = findKeywords(query)
        else:
            return relevant_docs
    

# Semantic Search

In [104]:
def semantic_search(input_question, k=100):
    final_passages = []
    # print(input_question, " in semantic_search()")
    relevant_docs = executeBM25(input_question)
    # print(relevant_docs)
    relevant_embeddings = generateEmbeddings(relevant_docs)
    hits = util.semantic_search(relevant_embeddings, corpus_embeddings, top_k=k)
    for hit in hits[0]:
        final_passages.append(int(hit['corpus_id']))
    
    return final_passages

In [None]:
result = semantic_search("animals cages")
for pid in result:
    print(str(pid)+" : ")
    print(d.fetchPassageContent(pid))
    print()

# Nearest Neighbour Search

In [9]:
nn_model = NearestNeighbors(n_neighbors=100, metric='euclidean')
nn_model.fit(corpus_embeddings)

In [12]:
def nearestNeighbourSearch(input_question, k=100):
    relevant_docs = executeBM25(input_question)
    relevant_embeddings = []
    for doc in relevant_docs:
        content = d.fetchPassageContent(doc)
        user_query_embedding = sentence_transformer_model.encode([content])[0]
        relevant_embeddings.append(user_query_embedding)
    distances, indices = nn_model.kneighbors(relevant_embeddings, n_neighbors=k)
    return indices[0]


In [None]:
result = nearestNeighbourSearch("animals cages")
for pid in result:
    print(str(pid)+" : ")
    print(d.fetchPassageContent(pid))
    print()

# Evaluating Semantic Search

In [91]:
import pandas as pd

# Paths to MSMARCO Dev files
queries_path = 'validation/msmarco-test2019-queries.tsv'
top100_path = 'validation/msmarco-passagetest2019-top1000.tsv'
qrels_path = 'validation/2019qrels-pass.txt'


queries = pd.read_csv(queries_path, delimiter='\t', header=None, names=['qid', 'query'])
top100 = pd.read_csv(top100_path, delimiter='\t', header=None, names=['qid', 'pid', 'query', 'passage'])
qrels = pd.read_csv(qrels_path, delimiter=' ', header=None, names=['qid', 'Q0', 'docid', 'rating'])


In [89]:
def evaluate_semantic_search():
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for qid, group in top100.groupby('qid'):
        query = group['query'].tolist()[0]
        # print(query)

        relevant_documents = {x + 1 for x in set(group['pid'].tolist()) if x + 1 < 7999999}

        retrived_docs = semantic_search(query, 1000)

        retrieved_documents = set(retrived_docs)
        # print(relevant_documents)
        # print(retrieved_documents)

        diversity = diversity_at_k(relevant_documents, retrived_docs[:10])
        print(diversity)

        true_positive = len(retrieved_documents.intersection(relevant_documents))
        false_positive = len(retrieved_documents - relevant_documents)
        false_negative = len(relevant_documents - retrieved_documents)

        precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
        recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        # print(precision, recall, f1)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
        # print(true_positive, false_positive, false_negative)

    average_precision = sum(precision_scores) / len(precision_scores)
    average_recall = sum(recall_scores) / len(recall_scores)
    average_f1 = sum(f1_scores) / len(f1_scores)

    print("Average Precision:", average_precision)
    print("Average Recall:", average_recall)
    print("Average F1 Score:", average_f1)

evaluate_semantic_search()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[3489331]
0.9816580528563298


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[3683653]
0.9817519825296163


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[1321512, 4659697, 3251676, 603543, 1529172, 3860801, 2360368, 267237]
0.9823684008060837


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[7718878, 4028080, 4614776, 5885963, 5904089, 2258265, 4882896, 538771, 2113123, 6343571, 1572233, 5801963, 5053471, 5747217, 2864832, 244155, 4168348, 4473623, 5807758, 6440972, 5446855, 7377685, 7031807, 99735, 5640832]
0.9826845511632353


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[1249073, 5462144, 2627883, 41814, 1471240, 3808877, 2627881, 965329, 597905, 965327, 2276243, 2208668, 2627882, 3919066, 6752471, 6184945, 3665696, 5995701, 6598467, 6752472]
0.9826704450241162


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[1868437]
0.981872064954101


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[1400488, 6855920, 7745546, 2263531]
0.9823565020321995


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[4292611, 4289596, 2033966, 7647373, 6373390, 4289600, 4291626, 1527093, 4289597, 1787104, 1803403, 6447025, 7301139, 3620540, 5857161, 1787100, 6014619, 1312851, 5857162, 1419394, 5857163, 7389159, 5900792]
0.9826704450241162


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[6973593, 6973590, 6973592]
0.9817415042754182


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[7666831, 2986227, 123547, 6484575, 6774305, 4786305, 5197130, 7112968, 7112964, 7057928, 742425, 6969187, 1059517, 3914970, 6685092, 3216315, 5758954, 2307979, 4689523]


KeyboardInterrupt: 

# Diversity@K

In [99]:
def diversity_at_k(top_k_documents):
    num_documents = len(top_k_documents)
    diversity_scores = []

    for i in range(num_documents):
        for j in range(i + 1, num_documents):
            embedding_i = corpus_embeddings[top_k_documents[i]]
            embedding_j = corpus_embeddings[top_k_documents[j]]
            similarity = cosine_similarity(embedding_i.reshape(1, -1), embedding_j.reshape(1, -1))[0, 0]
            diversity_scores.append(1 - similarity)

    diversity_at_k = sum(diversity_scores) / len(diversity_scores)
    return diversity_at_k



In [105]:
diversity_sum = 0

for index, query in queries.iterrows():
    print(query['query'])
    print()
    search_result = semantic_search(query['query'])
    diversity_score = diversity_at_k(search_result)
    diversity_sum+=diversity_score
    print(query, diversity_score)

print("Diversity@K average ", diversity_sum/len(queries))

what slows down the flow of blood



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


qid                                1108939
query    what slows down the flow of blood
Name: 0, dtype: object 0.3796188066824518
what is the county for grand rapids, mn



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


qid                                      1112389
query    what is the county for grand rapids, mn
Name: 1, dtype: object 0.2956010374276325
what is ruclip



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

KeyboardInterrupt: 