# Load Modules

In [1]:
import pickle
from sentence_transformers import SentenceTransformer, util
import subprocess
import torch
from docContentManager import DocContentManager
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from keybert import KeyBERT
# The sentence encoder and the KeyBERT keyword model are built from the same checkpoint name.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
sentence_transformer_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
kw_model = KeyBERT(model=EMBEDDING_MODEL_NAME)


# Input files, given as relative paths.
combined_corpus_embeddings_path = "embeddings/combined_corpus_embeddings"
queries_path = 'validation/msmarco-test2019-queries.tsv'
# NOTE(review): the variable is called "top100" but the file name says top1000.
top100_path = 'validation/msmarco-passagetest2019-top1000.tsv'
qrels_path = 'validation/2019qrels-pass.txt'

# Load Embeddings

In [2]:
# SECURITY: pickle.load can execute arbitrary code while deserialising,
# so only point this at an embeddings file you produced yourself.
with open(combined_corpus_embeddings_path, 'rb') as embeddings_file:
    corpus_embeddings = pickle.load(embeddings_file)

In [3]:
# torch.as_tensor returns an existing tensor as-is and avoids the unconditional copy
# that torch.tensor makes, so this self-referential cell is cheap to re-run.
corpus_embeddings = torch.as_tensor(corpus_embeddings)

  corpus_embeddings = torch.tensor(corpus_embeddings)


In [4]:
# Passage store: later cells call d.fetchPassageContent(pid) to look up the content for a passage id.
d = DocContentManager()

In [5]:
def generateEmbeddings(docs):
    """Look up the stored corpus embedding for each passage id in ``docs``.

    Ids of 7999999 or above are dropped. The remaining rows of
    ``corpus_embeddings`` are returned as a plain list, in input order.
    """
    return [corpus_embeddings[doc_id] for doc_id in docs if doc_id < 7999999]

In [6]:
# Function to convert a query into an embedding
def get_embedding(inp_question):
    """Encode ``inp_question`` with the sentence-transformer model, requesting a tensor result."""
    embedding = sentence_transformer_model.encode(inp_question, convert_to_tensor=True)
    return embedding

# KeyBERT - Query Expansion and Keyword Generation

In [7]:
def expandQueryByKeyBert(query, passage_list, k=5):
    """Expand ``query`` with up to ``k`` keywords extracted from ``passage_list``.

    The passages are joined into one document and passed to KeyBERT, with the
    query's own words supplied as stop words so they are not proposed again.

    Args:
        query: the user query string.
        passage_list: list of passage strings to mine for keywords.
        k: number of keywords to request.

    Returns:
        The extracted keywords followed by the original query, all separated
        by single spaces. If no keywords are extracted, the query is returned
        unchanged.
    """
    all_passages = " ".join(passage_list)
    keywords = kw_model.extract_keywords(all_passages, stop_words=query.split(), top_n=k)
    expansion_terms = [term for term, _ in keywords]
    # Join keywords and query together so the last keyword is not fused with
    # the first word of the query (previously "...keyword" + "query").
    return " ".join(expansion_terms + [query])

In [8]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Function to extract keywords from a single query
def extract_keywords_from_query(query):
    """Return the lower-cased, alphanumeric, non-stopword tokens of ``query``, in order."""
    lowered_tokens = [token.lower() for token in word_tokenize(query)]
    english_stopwords = set(stopwords.words('english'))
    return [
        token
        for token in lowered_tokens
        if token.isalnum() and token not in english_stopwords
    ]

In [9]:
def findKeywords(query):
    """Reduce ``query`` to a strictly shorter keyword query using KeyBERT.

    One keyword fewer than the number of whitespace-separated words is
    requested, so each call is meant to shrink the query.

    Args:
        query: space-separated query string.

    Returns:
        The selected keywords joined by spaces, or an empty string when the
        query has at most one word and so cannot be shortened.
    """
    k = len(query.split()) - 1
    if k <= 0:
        # Asking for zero (or -1) keywords is not a meaningful request.
        # NOTE(review): some KeyBERT versions appear to slice with [-top_n:],
        # which for 0 would return every candidate — confirm against the installed version.
        return ""
    keywords = kw_model.extract_keywords(query, top_n=k)
    # Cap at k so the result can never contain as many words as the input.
    return " ".join([term for term, _ in keywords[:k]])


# BM25

In [10]:
def executeBM25(query, bm25_executable='./bm25', max_pid=7999999):
    """Run the external BM25 binary and collect the passage ids it prints.

    The query is first reduced to its keywords. If a run yields no passage
    ids, the query is shortened with ``findKeywords`` and the binary is run
    again, until ids are found or the query cannot be shortened any further.

    Args:
        query: raw user query string.
        bm25_executable: path of the BM25 binary; it is invoked with the
            query as its single argument.
        max_pid: exclusive upper bound on accepted passage ids.

    Returns:
        List of integer passage ids below ``max_pid``, in the order printed
        by the binary. Empty if nothing was found.

    Raises:
        ValueError: if the binary prints a line that is neither blank, the
            "Word not found in index map." message, nor an integer.
    """
    query = " ".join(extract_keywords_from_query(query))
    relevant_docs = []

    while True:
        # Argument-list form: the query is passed as one argv entry, never through a shell.
        process = subprocess.run([bm25_executable, query], capture_output=True, text=True)

        if process.returncode == 0:
            for raw_line in process.stdout.strip().split('\n'):
                line = raw_line.strip()
                if line == "Word not found in index map." or line == "":
                    continue
                pid = int(line)
                if pid < max_pid:
                    relevant_docs.append(pid)
        else:
            print("Error:", process.stderr)

        if relevant_docs or not query:
            return relevant_docs

        refined_query = findKeywords(query)
        # Stop when the refinement did not make the query shorter; without
        # this check the loop would rerun the same failing query forever.
        if len(refined_query.split()) >= len(query.split()):
            return relevant_docs
        query = refined_query
    

# Semantic Search

In [11]:
def semantic_search(input_question, k=100):
    """Return up to ``k`` corpus passage ids for ``input_question``.

    BM25 supplies candidate passages for the question. The stored embedding of
    the first candidate is then used as the query vector for
    ``util.semantic_search`` over the whole of ``corpus_embeddings``.

    NOTE(review): the question text itself is never embedded here; the
    results are the corpus neighbours of the first BM25 hit. Confirm this is
    the intended design rather than re-ranking the BM25 hits against the
    question embedding.

    Args:
        input_question: the user query string.
        k: maximum number of passage ids to return.

    Returns:
        List of integer corpus ids in the order returned by
        ``util.semantic_search``; empty when BM25 yields no usable passage.
    """
    relevant_docs = executeBM25(input_question)
    relevant_embeddings = generateEmbeddings(relevant_docs)
    if len(relevant_embeddings) == 0:
        # Nothing to search with; util.semantic_search cannot handle an empty query list.
        return []

    # Only the hits for the first query row were ever read, so score just that
    # row instead of comparing every BM25 hit against the entire corpus.
    hits = util.semantic_search(relevant_embeddings[:1], corpus_embeddings, top_k=k)
    return [int(hit['corpus_id']) for hit in hits[0]]

In [20]:
import time
# Time one end-to-end semantic search, then print every returned passage id with its content.
query = "panthera leo"
start = time.time()
result = semantic_search(query)
end = time.time()
elapsed_seconds = end - start

print("Query :: ", query)
print("Took ", elapsed_seconds, "s for the query")

for pid in result:
    print(f"{pid} : ", d.fetchPassageContent(pid))
    print()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Query ::  panthera leo
Took  40.57480001449585 s for the query
6141792 :  Classification. Lions belong to the genus Panthera which contains well known animals such as the tiger, leopard, and jaguar. Within the genus Panthera, the lion is further classifed as the species Panthera leo.The full scientific classification is as follows: 1  Kingdom: Animalia. 2  Phylum: Chordata. 3  Class: Mammalia. 4  Order: Carnivora. 5  Family: Felidae. 6  Genus: Panthera.ithin the genus Panthera, the lion is further classifed as the species Panthera leo. The full scientific classification is as follows: 1  Kingdom: Animalia. 2  Phylum: Chordata. 3  Class: Mammalia. 4  Order: Carnivora. 5  Family: Felidae. 6  Genus: Panthera.

6141794 :  Two subspecies are currently recognised: the African lion (Panthera leo leo) and the Asiatic lion (Panthera leo persica) . The Asiatic lion is slightly smaller than its African cousin, and has a shorter, thinner mane and a fold of skin running the length of the belly that

# Nearest Neighbour Search

In [21]:
# Fit a nearest-neighbour index over all corpus embeddings using Euclidean distance;
# 100 neighbours is the default when kneighbors is called without n_neighbors.
# NOTE(review): semantic_search ranks with util.semantic_search rather than this metric — confirm the difference is intended.
nn_model = NearestNeighbors(n_neighbors=100, metric='euclidean')
nn_model.fit(corpus_embeddings)

In [None]:
def nearestNeighbourSearch(input_question, k=100):
    """Return the indices of the ``k`` nearest corpus neighbours of the first BM25 hit.

    BM25 supplies candidate passages for the question. The content of the
    first candidate is fetched and encoded, and ``nn_model`` is queried with
    that embedding.

    NOTE(review): as in the original, only the first BM25 hit drives the
    result and the question text is never embedded — confirm this is intended.

    Args:
        input_question: the user query string.
        k: number of neighbours to request.

    Returns:
        The first row of the index array returned by ``nn_model.kneighbors``,
        or an empty list when BM25 finds no passages.
    """
    relevant_docs = executeBM25(input_question)
    if len(relevant_docs) == 0:
        # kneighbors cannot be queried with zero samples.
        return []

    # Only the neighbours of the first hit were ever returned, so encode and
    # search for that one passage instead of every BM25 hit.
    seed_content = d.fetchPassageContent(relevant_docs[0])
    seed_embedding = sentence_transformer_model.encode([seed_content])[0]
    _, indices = nn_model.kneighbors([seed_embedding], n_neighbors=k)
    return indices[0]


In [None]:
# Try the nearest-neighbour search on a sample query and print each returned passage.
result = nearestNeighbourSearch("animals cages")
for pid in result:
    print(f"{pid} : ")
    print(d.fetchPassageContent(pid))
    print()

# Evaluating Semantic Search

In [None]:
import pandas as pd

# Tab-separated files without a header row; column names are supplied here.
queries = pd.read_csv(queries_path, sep='\t', header=None, names=['qid', 'query'])
top100 = pd.read_csv(top100_path, sep='\t', header=None, names=['qid', 'pid', 'query', 'passage'])
# The qrels file is space-separated.
qrels = pd.read_csv(qrels_path, sep=' ', header=None, names=['qid', 'Q0', 'docid', 'rating'])


In [None]:
def evaluate_semantic_search():
    """Print macro-averaged precision, recall and F1 of ``semantic_search``.

    For every query group in ``top100``, the group's passage ids (shifted by
    one and restricted to values below 7999999) are treated as the relevant
    set and compared with the ids returned by ``semantic_search(query, 1000)``.

    NOTE(review): relevance is taken from the candidate file, not from
    ``qrels``, and the ``pid + 1`` shift presumably aligns the file's ids
    with corpus indices — confirm both.

    Returns:
        None. The three averages are printed; when ``top100`` has no query
        groups a short notice is printed instead.
    """
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for _qid, group in top100.groupby('qid'):
        # Every row of a group is read as carrying the same query text; use the first.
        query = group['query'].iloc[0]

        relevant_documents = {pid + 1 for pid in group['pid'].tolist() if pid + 1 < 7999999}
        retrieved_documents = set(semantic_search(query, 1000))

        true_positive = len(retrieved_documents & relevant_documents)
        false_positive = len(retrieved_documents - relevant_documents)
        false_negative = len(relevant_documents - retrieved_documents)

        retrieved_total = true_positive + false_positive
        relevant_total = true_positive + false_negative
        precision = true_positive / retrieved_total if retrieved_total > 0 else 0
        recall = true_positive / relevant_total if relevant_total > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    if not precision_scores:
        # No query groups: averaging would divide by zero.
        print("No queries to evaluate.")
        return

    average_precision = sum(precision_scores) / len(precision_scores)
    average_recall = sum(recall_scores) / len(recall_scores)
    average_f1 = sum(f1_scores) / len(f1_scores)

    print("Average Precision:", average_precision)
    print("Average Recall:", average_recall)
    print("Average F1 Score:", average_f1)

# Run the evaluation over every query group in top100 (one semantic_search call per group) and print the averages.
evaluate_semantic_search()


# Diversity@K

In [None]:
def diversity_at_k(top_k_documents):
    """Mean pairwise cosine distance between the embeddings of ``top_k_documents``.

    For every unordered pair of documents the score is ``1 - cosine
    similarity`` of their rows in ``corpus_embeddings``; the result is the
    average over all pairs.

    Args:
        top_k_documents: sequence of integer row indices into
            ``corpus_embeddings``.

    Returns:
        The average pairwise diversity as a float, or ``0.0`` when fewer than
        two documents are given (there are no pairs to compare).
    """
    num_documents = len(top_k_documents)
    if num_documents < 2:
        # No pairs exist; the previous implementation divided by zero here.
        return 0.0

    # One similarity-matrix computation replaces one sklearn call per pair.
    selected_embeddings = corpus_embeddings[list(top_k_documents)]
    similarities = cosine_similarity(selected_embeddings)

    # The matrix is symmetric with self-similarities on the diagonal, so the
    # sum over unordered pairs is half of the off-diagonal total.
    pair_count = num_documents * (num_documents - 1) / 2
    pairwise_similarity_sum = (similarities.sum() - similarities.trace()) / 2
    return float(1 - pairwise_similarity_sum / pair_count)



In [None]:
def query_passage_similarity(query, pid):
    """Cosine similarity between the embedding of ``query`` and row ``pid`` of ``corpus_embeddings``."""
    query_vector = get_embedding(query)
    passage_vector = corpus_embeddings[pid]
    # cosine_similarity works on 2-D inputs, so each vector becomes a single-row matrix.
    score_matrix = cosine_similarity(query_vector.reshape(1, -1), passage_vector.reshape(1, -1))
    return score_matrix[0, 0]

In [1]:
# Running total of the per-query diversity scores.
diversity_sum = 0
# Per-query result records are appended to this list.
diversity_data = []
# NOTE(review): iterrows() returns a one-shot iterator; once a loop has consumed q,
# iterating it again yields nothing until this cell is re-run.
q = queries.iterrows()

NameError: name 'queries' is not defined

In [None]:


# Reset the accumulators and iterate the frame directly, so re-running this
# cell neither double counts nor loops over an already-exhausted iterator.
diversity_sum = 0
diversity_data = []

for index, query in queries.iterrows():
    search_result = semantic_search(query['query'])
    if len(search_result) == 0:
        # Nothing retrieved: there is no top passage to compare the query with.
        print(query['qid'], query['query'], "-> no passages retrieved, skipped")
        continue
    diversity_score = diversity_at_k(search_result)
    query_similarity = query_passage_similarity(query['query'], search_result[0])
    diversity_data.append({"qid": query['qid'], "query": query['query'],
                            "diversity_score": diversity_score, "query_similarity": query_similarity})
    diversity_sum += diversity_score
    # Print the id and text, not the whole pandas row.
    print(query['qid'], query['query'], diversity_score, query_similarity)

In [None]:
# Display the accumulated per-query records.
diversity_data

In [None]:
# Mean diversity: the summed score divided by the number of rows in queries.
print("Diversity@K average ", diversity_sum/len(queries))