In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from rank_bm25 import BM25Okapi
import numpy as np
from huggingface_hub import login
import pandas as pd
from typing import List, Dict
import pickle
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


Evaluation Metrics

In [25]:
def calculate_ndcg(relevant_docs: List[str], retrieved_docs: List[str], k: int = 3) -> float:
  """Calculate NDCG (Normalized Discounted Cumulative Gain) NDCG@k
    
    Measures the quality of ranking
    Takes into account both relevance and position of results
    Ranges from 0 to 1 (1 being perfect ranking)
  
  """
  relevance = []
  
  for doc in retrieved_docs[:k]:
      # Calculate relevance score based on similarity with ground truth
      print("retrieved_docs:", doc)
      similarity = 1 if doc in relevant_docs else 0
      print("source_content:", relevant_docs)
      relevance.append(similarity)
      print("relevance:", relevance)
          
  dcg = sum([(2**rel - 1) / np.log2(i + 2) for i, rel in enumerate(relevance)])
  print("dcg:", dcg)
  ideal_relevance = sorted(relevance, reverse=True)
  print("ideal_relevance:", ideal_relevance)
  idcg = sum([(2**rel - 1) / np.log2(i + 2) for i, rel in enumerate(ideal_relevance)])
  print("idcg:", idcg)

  return dcg / idcg if idcg > 0 else 0

In [3]:
def calculate_map(relevant_docs: List[str], retrieved_docs: List[str], k: int = 3) -> float:
  """Calculate MAP (Mean Average Precision) MAP@k
  
    Measures precision at each relevant document position
    Takes into account order of relevant documents
    Averages precision across multiple queries
  
  """
  precision_sum = 0
  num_relevant = 0
  
  for i, doc in enumerate(retrieved_docs[:k]):
      if doc in relevant_docs:
          num_relevant += 1
          precision_sum += num_relevant / (i + 1)
          
  return precision_sum / len(relevant_docs) if len(relevant_docs) > 0 else 0

Text-based BM25 Retriever

In [4]:
class BM25Retriever:
  def __init__(self, texts: List[str]):
      self.texts = texts
      tokenized_texts = [text.split() for text in texts]
      self.bm25 = BM25Okapi(tokenized_texts)
  
  def get_relevant_documents(self, query: str, k: int = 3) -> List[str]:
      tokenized_query = query.split()
      doc_scores = self.bm25.get_scores(tokenized_query)
      top_k_indices = np.argsort(doc_scores)[-k:][::-1]
      return [self.texts[i] for i in top_k_indices]

Semantic Retriever using LangChain

In [5]:
class SemanticRetriever:
  def __init__(self, texts: List[str]):
      self.embeddings = HuggingFaceEmbeddings(
          model_name="sentence-transformers/all-mpnet-base-v2"
      )
      text_splitter = RecursiveCharacterTextSplitter(
          chunk_size=500,
          chunk_overlap=50
      )
      documents = text_splitter.create_documents(texts)
      self.vectorstore = FAISS.from_documents(documents, self.embeddings)
  
  def get_relevant_documents(self, query: str, k: int = 3) -> List[str]:
      docs = self.vectorstore.similarity_search(query, k=k)
      return [doc.page_content for doc in docs]

Hybrid Retriever

In [6]:
class HybridRetriever:
  def __init__(self, texts: List[str]):
      self.bm25_retriever = BM25Retriever(texts)
      self.semantic_retriever = SemanticRetriever(texts)
      
  def reciprocal_rank_fusion(self, rankings: List[List[str]], k: float = 60) -> Dict[str, float]:
      scores = {}
      for rank_list in rankings:
          for rank, doc in enumerate(rank_list):
              if doc not in scores:
                  scores[doc] = 0
              scores[doc] += 1 / (rank + k)
      return scores
  
  def get_relevant_documents(self, query: str, k: int = 3) -> List[str]:
      bm25_docs = self.bm25_retriever.get_relevant_documents(query, k)
      semantic_docs = self.semantic_retriever.get_relevant_documents(query, k)
      
      fusion_scores = self.reciprocal_rank_fusion([bm25_docs, semantic_docs])
      sorted_docs = sorted(fusion_scores.items(), key=lambda x: x[1], reverse=True)
      return [doc for doc, _ in sorted_docs[:k]]

Main Pipeline

In [7]:
def process_dataframe(df: pd.DataFrame) -> pd.DataFrame:
  # Initialize retrievers
  texts = df['context'].tolist()
  bm25_retriever = BM25Retriever(texts)
  semantic_retriever = SemanticRetriever(texts)
  hybrid_retriever = HybridRetriever(texts)
  
  # Lists to store results
  results = {
      'bm25_chunks': [],
      'semantic_chunks': [],
      'hybrid_chunks': [],
      'bm25_ndcg': [],
      'bm25_map': [],
      'semantic_ndcg': [],
      'semantic_map': [],
      'hybrid_ndcg': [],
      'hybrid_map': []
  }
  
  # Process each query
  for idx, row in df.iterrows():
      query = row['question']
      relevant_doc = [row['context']]
      
      # Get retrieved documents from each retriever
      bm25_docs = bm25_retriever.get_relevant_documents(query)
      semantic_docs = semantic_retriever.get_relevant_documents(query)
      hybrid_docs = hybrid_retriever.get_relevant_documents(query)
      
      # Store retrieved chunks
      results['bm25_chunks'].append(bm25_docs)
      results['semantic_chunks'].append(semantic_docs)
      results['hybrid_chunks'].append(hybrid_docs)
      
      # Calculate metrics
      results['bm25_ndcg'].append(calculate_ndcg(relevant_doc, bm25_docs))
      results['bm25_map'].append(calculate_map(relevant_doc, bm25_docs))
      results['semantic_ndcg'].append(calculate_ndcg(relevant_doc, semantic_docs))
      results['semantic_map'].append(calculate_map(relevant_doc, semantic_docs))
      results['hybrid_ndcg'].append(calculate_ndcg(relevant_doc, hybrid_docs))
      results['hybrid_map'].append(calculate_map(relevant_doc, hybrid_docs))
  
  # Add results to dataframe
  for key, value in results.items():
      df[key] = value
  
  return df

Usage example

In [10]:
def main():
  
  # Load Huggingface API
  login(token="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")

  # Load your dataframe
  df = pd.read_parquet("hf://datasets/PatronusAI/HaluBench/data/test-00000-of-00001.parquet")
  test_df= df.iloc[0:50, 0:5]
  test_df.rename(columns={'passage': 'context'}, inplace=True)
  
  # Process the dataframe
  processed_df = process_dataframe(test_df)
  
  # Save results
  # processed_df.to_csv('retrieval_results.csv', index=False)
  
  # Print average metrics
  print("\nAverage Metrics:")
  print("BM25:")
  print(f"NDCG: {processed_df['bm25_ndcg'].mean():.3f}")
  print(f"MAP: {processed_df['bm25_map'].mean():.3f}")
  
  print("\nSemantic:")
  print(f"NDCG: {processed_df['semantic_ndcg'].mean():.3f}")
  print(f"MAP: {processed_df['semantic_map'].mean():.3f}")
  
  print("\nHybrid:")
  print(f"NDCG: {processed_df['hybrid_ndcg'].mean():.3f}")
  print(f"MAP: {processed_df['hybrid_map'].mean():.3f}")

  return processed_df

Average Metrics:<br>
BM25:<br>
NDCG: 0.617<br>
MAP: 0.963

Semantic:<br>
NDCG: 0.125<br>
MAP: 0.132

Hybrid:<br>
NDCG: 0.603<br>
MAP: 0.595

In [13]:
if __name__ == "__main__":
  result_df = main() 


Average Metrics:
BM25:
NDCG: 0.342
MAP: 0.597

Semantic:
NDCG: 0.020
MAP: 0.020

Hybrid:
NDCG: 0.296
MAP: 0.273


In [20]:
result_df.iloc[1,1]

'As of the census of 2000, there were 218,590 people, 79,667 households, and 60,387 families residing in the county.  The population density was 496 people per square mile (192/km²). There were 83,146 housing units at an average density of 189 per square mile (73/km²). The racial makeup of the county was 86.77% Race (United States Census), 9.27% Race (United States Census), 0.23% Race (United States Census), 1.52% Race (United States Census), 0.06% Race (United States Census), 0.69% from Race (United States Census), and 1.47% from two or more races.  1.91% of the population were Race (United States Census) or Race (United States Census) of any race. 22.5% were of German people, 13.1% Irish people, 9.8% Italian people, 9.2% English, 8.1% "American" and 6.0% Polish ancestry.'

In [26]:
test_ndcg = calculate_ndcg(result_df.iloc[1,1], result_df.iloc[1,5])
test_ndcg

retrieved_docs: 1564: The city of Ryazan posad was burned.:47 1571: Russo-Crimean War 1572: Battle of Molodi 1591: Raid reaches Moscow. :116 1591: Artillery stops a raid at Kolomenskoy on the Bank Line. :52 1592: Suburbs of Moscow burned.  Russian troops were away fighting Sweden.:17 1598: Crimeans stopped by Bank Line, withdraw and sue for peace.:46 1614: Nogai raids within sight of Moscow. During the Time of Troubles so many captives were taken that the price of a slave at Kaffa dropped to fifteen or twenty gold pieces.:66 1618: Nogais release 15,000 captives in peace treaty with Moscow. 1632: Force from Livny ambushed by Tatars and Janissaries. 300 killed and the rest enslaved.:67 1632: 20,000 Tatars raid the south, as troops were shifted north for the Smolensk War.:76 1633: 30,000 Tatars cross Abatis and Bank lines. Thousands were captured from Oka region.:76 This was the last deep raid into Muscovy. :26 1635: Many small war parties invaded Russia south of Ryazan.:79 1637,41-43: Se

0