# Evaluation
In this notebook, we will evaluate the performance of our Retrieval-Augmented Generation (RAG) system using the Qdrant vector database populated with Earth Observation documents.

## Setup Qdrant Vector Database

Before proceeding, you'll need to:

1. **Create a Qdrant account**: Sign up for a free account at [Qdrant Cloud](https://cloud.qdrant.io/)
2. **Create a cluster**: Follow the setup wizard to create a new Qdrant cluster
3. **Get your credentials**:
   - **Cluster URL**: Found in your cluster dashboard (e.g., `https://xyz-example.eu-central.aws.cloud.qdrant.io`)
   - **API Key**: Generate an API key from the cluster settings
4. **Configure your environment**:
   - Copy `.env.example` to `.env` (if you haven't already)
   - Update `QDRANT_URL` and `QDRANT_API_KEY` in your `.env` file with your actual credentials

The credentials will be automatically loaded from your `.env` file in the next cell.

In [None]:
# Import env variables
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '..'))

from dotenv import load_dotenv
load_dotenv()
# Configure your Qdrant connection
# These credentials are loaded from the .env file
# To set up:
# 1. Copy .env.example to .env (if you haven't already)
# 2. Sign up at https://cloud.qdrant.io/
# 3. Create a cluster and get your credentials
# 4. Update the QDRANT_URL and QDRANT_API_KEY in your .env file

# Read the Qdrant connection settings from the process environment
# (load_dotenv() was called above); os.getenv returns None for unset variables.
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

# Fail early with an actionable message when either value is missing or empty,
# instead of failing later when the values are used to connect.
if not QDRANT_URL or not QDRANT_API_KEY:
    raise ValueError(
        "Qdrant credentials not found!\n"
        "Please set QDRANT_URL and QDRANT_API_KEY in your .env file.\n"
        "See .env.example for the expected format."
    )

# Snapshot configuration
snapshot_file = "../data/sample_collection.snapshot"
# Derive the collection name from the file name only: drop the directory part and
# the ".snapshot" extension, then keep the text before the first '-' so that
# names of the form "<collection>-<id>-<timestamp>.snapshot" also resolve to
# "<collection>". Splitting the raw path on '-' returned the whole relative path
# whenever the file name contained no dash.
collection_name = os.path.splitext(os.path.basename(snapshot_file))[0].split('-')[0]  # "sample_collection"

print(f"✓ Qdrant URL: {QDRANT_URL}")
print(f"✓ Collection name: {collection_name}")

## Connect to Qdrant and Upload Snapshot

This cell will:
1. Connect to your Qdrant Cloud instance using the credentials you provided
2. Check if the collection already exists
3. If not, upload the snapshot file to create the collection with pre-populated data

The snapshot contains Earth Observation documents that will be used for hallucination detection.

In [None]:
import os
import requests
from qdrant_client import QdrantClient

import time


def _print_collection_info(name, info):
    """Print a short, framed summary of a Qdrant collection."""
    separator = '=' * 60
    print(f"\n{separator}")
    print(f"COLLECTION INFO")
    print(separator)
    print(f"Name: {name}")
    print(f"Points count: {info.points_count}")
    # NOTE(review): vectors_count is marked deprecated by Qdrant and may be
    # missing from newer client models; fall back to None instead of raising.
    print(f"Vectors count: {getattr(info, 'vectors_count', None)}")
    print(f"Status: {info.status}")
    print(separator)


# Check if snapshot file exists
if not os.path.exists(snapshot_file):
    raise FileNotFoundError(
        f"Snapshot file not found: {snapshot_file}\n"
        f"Please ensure the snapshot file is in the current directory."
    )

print(f"✓ Snapshot file found: {snapshot_file}")
print(f"\nConnecting to Qdrant Cloud...")

# Connect to your Qdrant Cloud instance
try:
    qdrant_client = QdrantClient(
        url=QDRANT_URL,
        api_key=QDRANT_API_KEY,
    )
    # Listing the collections doubles as a connectivity/credentials test
    collections = qdrant_client.get_collections()
    print(f"✓ Successfully connected to Qdrant Cloud!")
    print(f"  Existing collections: {[col.name for col in collections.collections]}")
except Exception as e:
    # Chain explicitly so the underlying client error stays in the traceback
    raise Exception(
        f"Failed to connect to Qdrant Cloud. Please check your credentials.\n"
        f"Error: {e}"
    ) from e

# Check if collection already exists (reuse the listing fetched above rather
# than issuing a second identical request)
existing_collections = [col.name for col in collections.collections]

if collection_name in existing_collections:
    print(f"\n✓ Collection '{collection_name}' already exists")
    info = qdrant_client.get_collection(collection_name)
    _print_collection_info(collection_name, info)
else:
    print(f"\nCollection '{collection_name}' not found. Uploading snapshot...")

    # Build the REST base URL: drop an explicit :6333 / :443 port and any
    # trailing slash so the path below is not joined with a double slash.
    # NOTE(review): stripping :6333 assumes the server is also reachable on the
    # scheme's default port (true for Qdrant Cloud per the setup notes above;
    # confirm before pointing this at a local instance).
    base_url = QDRANT_URL.replace(':6333', '').replace(':443', '').rstrip('/')

    # Upload snapshot using HTTP API
    headers = {'api-key': QDRANT_API_KEY} if QDRANT_API_KEY else {}
    with open(snapshot_file, 'rb') as f:
        response = requests.post(
            f"{base_url}/collections/{collection_name}/snapshots/upload",
            files={'snapshot': f},
            headers=headers
        )

    if response.status_code != 200:
        raise Exception(
            f"Failed to upload snapshot: {response.status_code}\n"
            f"Response: {response.text}"
        )

    print("✓ Snapshot uploaded successfully!")

    # Wait a moment for the collection to be created
    time.sleep(2)

    # Verify collection was created
    info = qdrant_client.get_collection(collection_name)
    _print_collection_info(collection_name, info)

print("\n✓ Vector database is ready for use!")

In [None]:
# Create keyword index for file_path field to enable filtering
from qdrant_client import models

# Request a keyword payload index on 'file_path' so that chunks can be filtered
# by their source document. Any failure is reported as a note instead of
# stopping the notebook.
index_request = dict(
    collection_name=collection_name,
    field_name="file_path",
    field_schema=models.PayloadSchemaType.KEYWORD,
)

try:
    qdrant_client.create_payload_index(**index_request)
except Exception as e:
    print(f"Note: Index may already exist or error occurred: {e}")
else:
    print(f"✓ Created/updated keyword index for 'file_path' field")

print(f"✓ Collection '{collection_name}' is ready with required indexes")

In [None]:
from sentence_transformers import SentenceTransformer

# Initialize the embedding model
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Example: Query the vector database to get relevant documents for a question
def get_relevant_docs(question: str, k: int = 5, score_threshold: float = 0.3) -> str:
    """
    Retrieve relevant documents from the vector database for a given question.

    Uses the notebook-level `embedder`, `qdrant_client` and `collection_name`.

    Args:
        question: The question to search for
        k: Maximum number of documents to retrieve
        score_threshold: Minimum similarity score passed to the Qdrant query
            (default 0.3, the value that was previously hard-coded)

    Returns:
        Concatenated string of relevant document contents, one
        "Document <i>:" section per hit (empty string when nothing is returned)
    """
    # Generate query embedding
    query_embedding = embedder.encode([question])[0].tolist()

    # Query the Qdrant server directly
    results = qdrant_client.query_points(
        collection_name=collection_name,
        query=query_embedding,
        limit=k,
        score_threshold=score_threshold
    )

    sections = []
    for i, point in enumerate(results.points):
        # A point may carry no payload at all; treat that like an empty payload
        # instead of failing on `None.get`.
        payload = point.payload or {}
        # Try the known content field names, falling back to the raw payload
        content = payload.get('content', '') or payload.get('text', '') or str(payload)
        sections.append(f'Document {i}:\n{content}\n\n')

    return ''.join(sections)

# Test with the sample question
relevant_docs = get_relevant_docs("hyperspectral", k=5)

# Print relevant docs
print('Context: \n', relevant_docs)

## Generate evaluation dataset

To evaluate the RAG system we will create two different evaluation datasets:
1. **Extractive**: passages extracted from the documents retrieved from Qdrant and used as queries to the retrieval system
2. **Generative QA dataset**: following the [Chroma](https://research.trychroma.com/evaluating-chunking) approach, we use an LLM to generate a question for each document and ask it to also extract the supporting passages

In [None]:
import torch
from transformers import pipeline

# ============================================================================
# OPTION 1: Use a local model (loads model weights locally)
# ============================================================================
# model_id = "microsoft/Phi-3.5-mini-instruct"
# model = pipeline(
#     "text-generation",
#     model=model_id,
#     torch_dtype=torch.bfloat16,
#     device_map="auto",
# )

# ============================================================================
# OPTION 2: Use an API-based model (requires API key)
# ============================================================================
# The OpenAI-compatible client below is the active option in this notebook.
# To use the local model instead, uncomment OPTION 1 above and comment out this section.
# Make sure to set your API key in the .env file

# --- OpenAI API (also supports OpenAI-compatible providers) ---
from openai import OpenAI
# Initialize OpenAI client
# This works with OpenAI and any OpenAI-compatible provider (LocalAI, Ollama, vLLM, etc.)
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")  # Defaults to OpenAI if not set
)

def model(messages, max_new_tokens=1500):
    """
    Call the chat-completions API and return the result in the same shape as
    the local text-generation pipeline: a one-element list whose
    'generated_text' is the input conversation plus the assistant reply.
    """
    completion = client.chat.completions.create(
        model="gpt-4o-mini",  # or "gpt-4", "gpt-3.5-turbo", or local model name for compatible providers
        messages=messages,
        max_tokens=max_new_tokens,
        temperature=0.1
    )
    assistant_turn = {
        'role': 'assistant',
        'content': completion.choices[0].message.content
    }
    conversation = messages + [assistant_turn]
    return [{'generated_text': conversation}]

# print(f"✓ Model loaded successfully!")

In [None]:
qa_generation="""You are an agent that generates questions from a provided research paper. Your job is to generate one specific question and provide the relevant sections from the paper as references.

Instructions:

Generate a question that can be answered solely by the facts in the provided paper.

Extract up to 5 significant sections from the paper that answer the question. These must be *exact copies* from the text and should be whole sentences where possible.

Focus on the most relevant information; avoid background or unrelated sections.

Format the response in JSON with three fields:

"oath": "I will not use the word 'and' in the question unless it is part of a proper noun. I will also make sure the question is concise."

"question": A concise question directly answerable using the references.

"references": A list of the extracted sections from the paper.

Notes:

Make the question specific; do not ask about multiple topics.

DO NOT USE THE WORD 'and' IN THE QUESTION UNLESS IT IS PART OF A PROPER NOUN.

Do not repeat a question that has already been used.

When the paper is long, scan all sections but only pick the most relevant ones to answer the question.

Example:

Paper Text:
"Section 1: Introduction: Climate change has accelerated glacier melt in the Himalayas, affecting water resources downstream.

Section 2: Methodology: Remote sensing data from 2000–2020 were analyzed to quantify changes in glacier area.

Section 3: Results: Glacier area decreased by 12% over 20 years, with the highest retreat in the eastern Himalayas. Streamflow measurements confirmed increased seasonal variability.

Section 4: Discussion: The retreat impacts hydropower generation and agriculture. Communities relying on glacier-fed rivers experience water stress during summer months.

Section 5: Conclusion: Urgent adaptation strategies are needed to mitigate the socioeconomic impact of glacier retreat."


Example Output:
{
  "oath": "I will not use the word 'and' in the question unless it is part of a proper noun. I will also make sure the question is concise.",
  "question": "How has glacier retreat affected downstream water resources in the Himalayas?",
  "references": [
    "Section 3: Results: Glacier area decreased by 12% over 20 years, with the highest retreat in the eastern Himalayas. Streamflow measurements confirmed increased seasonal variability.",
    "Section 4: Discussion: The retreat impacts hydropower generation and agriculture. Communities relying on glacier-fed rivers experience water stress during summer months."
  ]
}

Please provide your answer in the following JSON format:
{format_instructions}
"""

In [None]:
from pydantic import BaseModel, Field
from langchain_core.output_parsers import PydanticOutputParser
from typing import List


# Schema of one LLM-generated evaluation item; it is handed to the output parser
# to build the format instructions and to validate the model's JSON reply.
# Documented with comments rather than a class docstring on purpose.
# NOTE(review): a docstring may be picked up as the schema description and
# would then change the prompt text; confirm before adding one.
class GeneratedQA(BaseModel):
    # Echo of the "oath" field requested by the generation prompt.
    oath: str = Field(..., description="The oath taken by the agent regarding the use of the word 'and'")
    # The generated question.
    question: str = Field(..., description="A concise question directly answerable using the references")
    # Text excerpts from the source document that support the question.
    references: List[str] = Field(..., description="A list of extracted sections from the paper that answer the question")

def get_structured_output(model, prompt: str, schema_class: BaseModel):
    """
    Helper function to get structured output from a language model.

    Args:
        model: Callable taking (messages, max_new_tokens=...) and returning a
            list whose first item has a 'generated_text' conversation
        prompt: The formatted prompt string (can contain {format_instructions} placeholder)
        schema_class: The Pydantic schema class for output parsing

    Returns:
        Parsed output according to the schema, or None if the model call or
        the parsing fails (the error and the raw response are printed)
    """
    parser = PydanticOutputParser(pydantic_object=schema_class)

    # Use string replacement instead of .format() to avoid issues with curly braces in the content
    if '{format_instructions}' in prompt:
        full_prompt = prompt.replace('{format_instructions}', parser.get_format_instructions())
    else:
        full_prompt = prompt

    messages = [{"role": "system", "content": full_prompt}]

    # Bind the name before the try block: if the model call itself raises, the
    # handler below would otherwise fail with UnboundLocalError and hide the
    # original error.
    response = None
    try:
        response = model(messages, max_new_tokens=1500)[0]['generated_text'][-1]['content']
        return parser.parse(response)
    except Exception as e:
        print(f"Error parsing response: {e}")
        print(f"Raw response: {response}")
        return None

Let's generate the dataset by sampling documents from our knowledge base. In `documents.jsonl` each line corresponds to a document with its content and metadata including the file path.

In [None]:
# Sample documents from documents.jsonl
import json
import random

def sample_documents_from_jsonl(jsonl_path: str, n_samples: int = 10, seed=None):
    """
    Sample documents from a JSONL file.

    Only records with a non-empty 'content' and a non-empty
    metadata['file_path'] are eligible for sampling.

    Args:
        jsonl_path: Path to the JSONL file
        n_samples: Number of documents to sample (capped at the number of
            eligible documents; negative values are treated as 0)
        seed: Optional seed for reproducible sampling. When None (default),
            the module-level random generator is used as before.

    Returns:
        List of tuples (content, file_path)
    """
    # Read all documents from JSONL
    all_docs = []
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not JSON records
                continue
            doc = json.loads(line)
            content = doc.get('content', '')
            # `metadata` may be present but null; treat it like a missing dict
            file_path = (doc.get('metadata') or {}).get('file_path', '')
            if content and file_path:
                all_docs.append((content, file_path))

    print(f"✓ Loaded {len(all_docs)} documents from {jsonl_path}")

    # Randomly sample documents
    sample_size = max(0, min(n_samples, len(all_docs)))
    rng = random if seed is None else random.Random(seed)
    sampled = rng.sample(all_docs, sample_size)

    print(f"✓ Sampled {len(sampled)} documents")

    return sampled

# Sample from documents.jsonl
jsonl_path = "../data/documents.jsonl"
sampled_docs_with_paths = sample_documents_from_jsonl(jsonl_path, n_samples=5)

In [None]:
# Display sampled documents count
print(f"✓ Ready to generate Q&A for {len(sampled_docs_with_paths)} documents")

Once we have the sampled documents, we also want to retrieve all chunks corresponding to each document from Qdrant. This will allow us to map the references generated by the LLM back to the chunk numbers in the vector database and compute passage-level metrics.

In [None]:
def get_document_chunks_from_qdrant(client: QdrantClient, collection_name: str, file_path: str):
    """
    Collect every chunk stored in Qdrant for one source document.

    Args:
        client: Qdrant client instance
        collection_name: Name of the collection
        file_path: Value of the 'file_path' payload field to match

    Returns:
        List of tuples (chunk_number, content), where chunk_number is the
        0-based position in which the chunk was returned by the scroll
    """
    from qdrant_client import models

    # Exact match on the 'file_path' payload field
    path_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="file_path",
                match=models.MatchValue(value=file_path)
            )
        ]
    )

    contents = []
    next_offset = None

    # Page through the results until the server reports no further offset
    while True:
        page, next_offset = client.scroll(
            collection_name=collection_name,
            scroll_filter=path_filter,
            limit=100,
            offset=next_offset,
            with_payload=True,
            with_vectors=False
        )

        contents.extend(
            rec.payload.get('content', '') or rec.payload.get('text', '')
            for rec in page
        )

        if next_offset is None:
            break

    # Number chunks sequentially (0-indexed)
    return list(enumerate(contents))

import string

# NOTE: `normalize_text` and `is_reference_present_fuzzy` are also defined in the
# "Evaluation Metrics" cell further down. They are defined here as well because
# this cell and the Q&A generation cell call the matcher earlier in the notebook;
# without them a fresh "Restart & Run All" hits a NameError.

def normalize_text(text: str) -> str:
    """Normalize text by lowercasing, removing punctuation, and normalizing whitespace."""
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    return " ".join(text.split())

def is_reference_present_fuzzy(reference: str, document: str, threshold: float = 0.8) -> bool:
    """
    Returns True if enough of the reference tokens appear in the document.

    Args:
        reference: Reference text to find
        document: Document to search in
        threshold: Fraction of reference tokens that must appear (default: 0.8)

    Returns:
        True if the fraction of matched tokens is at least `threshold`;
        False for a reference with no tokens
    """
    ref_tokens = normalize_text(reference).split()
    if not ref_tokens:
        return False
    # Set membership keeps the check linear in the document length
    doc_vocabulary = set(normalize_text(document).split())
    matched_tokens = sum(1 for token in ref_tokens if token in doc_vocabulary)
    return matched_tokens / len(ref_tokens) >= threshold

def find_reference_chunk_numbers(references: list, numbered_chunks: list, threshold: float = 0.8):
    """
    Find which chunk number each reference appears in.

    Args:
        references: List of reference texts
        numbered_chunks: List of (chunk_number, content) tuples
        threshold: Fuzzy matching threshold

    Returns:
        List of tuples (reference, [chunk_numbers]); the chunk list is empty
        when the reference matches no chunk
    """
    reference_chunk_mapping = []

    for ref in references:
        matching_chunks = [
            chunk_num
            for chunk_num, chunk_content in numbered_chunks
            if is_reference_present_fuzzy(ref, chunk_content, threshold)
        ]
        reference_chunk_mapping.append((ref, matching_chunks))

    return reference_chunk_mapping

print("✓ Helper functions for chunk mapping loaded")

## Q&A Generation with Reference Validation

Now we generate Q&A pairs by:
1. Providing the entire document content to the LLM
2. Asking it to generate a question with supporting references (exact text extracts)
3. **Validating each reference**: We check that each generated reference actually exists in at least one Qdrant chunk
4. **Filtering invalid references**: References not found in any chunk are removed (they may have been filtered during ingestion)
5. **Quality control**: Q&A pairs with no valid references are skipped

This validation step is crucial because:
- Documents might be processed differently during chunking (e.g., cleaning, filtering)
- Some text might be removed during the ingestion pipeline
- We need ground truth references that actually exist in the retrieval system

Each validated reference is mapped to its corresponding chunk numbers in Qdrant for later evaluation.

In [None]:
# Generate Q&A pairs for all sampled documents
import json
from tqdm.auto import tqdm

# Fail fast when the fuzzy matcher is not defined yet. The loop below catches
# every Exception per document, so a missing helper would otherwise be reported
# as one "error" per document and silently yield an empty dataset.
if "is_reference_present_fuzzy" not in globals():
    raise NameError(
        "is_reference_present_fuzzy is not defined. "
        "Run the cell that defines the evaluation metric helpers before generating Q&A pairs."
    )

qa_dataset = []
total_refs_generated = 0
total_refs_valid = 0
total_refs_removed = 0
qa_pairs_skipped = 0

print(f"Generating Q&A pairs for {len(sampled_docs_with_paths)} documents...")
print("This may take several minutes depending on your model and hardware.")
print("Validating that all references exist in Qdrant chunks...\n")

for idx, (doc_content, file_path) in enumerate(tqdm(sampled_docs_with_paths, desc="Generating Q&A")):
    input_text = f"Context:\n{doc_content}\n\nInstructions:\n{qa_generation}"

    # Generate structured output
    try:
        qa_output = get_structured_output(model, input_text, GeneratedQA)

        if not qa_output:
            print(f"\n⚠ Failed to generate Q&A for document {idx}")
            qa_pairs_skipped += 1
            continue

        # Get all chunks for this document from Qdrant
        print(f"\nFetching chunks for document {idx}: {file_path}")
        numbered_chunks = get_document_chunks_from_qdrant(qdrant_client, collection_name, file_path)
        print(f"Found {len(numbered_chunks)} chunks")

        if len(numbered_chunks) == 0:
            print(f"⚠ No chunks found in Qdrant for this document - skipping")
            qa_pairs_skipped += 1
            continue

        # VALIDATE REFERENCES: map each reference to the chunks it appears in,
        # using the shared helper instead of repeating its matching loop here
        validated_references = []
        refs_generated = len(qa_output.references)
        total_refs_generated += refs_generated

        reference_chunk_mapping = find_reference_chunk_numbers(
            qa_output.references, numbered_chunks, threshold=0.8
        )

        for ref_text, found_in_chunks in reference_chunk_mapping:
            if found_in_chunks:
                # Reference is valid - it exists in at least one chunk
                validated_references.append({
                    "text": ref_text,
                    "chunk_numbers": found_in_chunks
                })
                total_refs_valid += 1
            else:
                # Reference not found in any chunk - remove it
                print(f"  ⚠ Reference not found in any chunk (removed): '{ref_text[:60]}...'")
                total_refs_removed += 1

        # Only keep Q&A pair if at least one valid reference remains
        if len(validated_references) > 0:
            qa_entry = {
                "question": qa_output.question,
                "file_path": file_path,
                "references": validated_references,
                "source_document": doc_content[:500] + "..." if len(doc_content) > 500 else doc_content,
                "refs_generated": refs_generated,
                "refs_valid": len(validated_references),
                "refs_removed": refs_generated - len(validated_references)
            }
            qa_dataset.append(qa_entry)

            print(f"  ✓ Valid references: {len(validated_references)}/{refs_generated}")

            # Print progress every 10 documents
            if (idx + 1) % 10 == 0:
                print(f"\n✓ Generated {len(qa_dataset)} Q&A pairs so far...")
        else:
            print(f"  ✗ No valid references found - skipping Q&A pair")
            qa_pairs_skipped += 1

    except Exception as e:
        # Best effort per document: report the failure and move on
        print(f"\n⚠ Error processing document {idx}: {e}")
        qa_pairs_skipped += 1
        continue

# Guard the ratios below against an empty sample / no generated references
success_rate = (len(qa_dataset) / len(sampled_docs_with_paths) * 100) if sampled_docs_with_paths else 0
validity_rate = (total_refs_valid / total_refs_generated * 100) if total_refs_generated > 0 else 0

print(f"\n{'='*60}")
print(f"Q&A GENERATION COMPLETE")
print(f"{'='*60}")
print(f"Total Q&A pairs generated: {len(qa_dataset)}")
print(f"Q&A pairs skipped: {qa_pairs_skipped}")
print(f"Success rate: {success_rate:.1f}%")
print(f"\nReference Statistics:")
print(f"  Total references generated: {total_refs_generated}")
print(f"  Valid references (found in chunks): {total_refs_valid}")
print(f"  Removed references (not in chunks): {total_refs_removed}")
print(f"  Validity rate: {validity_rate:.1f}%")
print(f"{'='*60}")

# Save the dataset to a JSON file
output_file = "qa_evaluation_dataset.json"
with open(output_file, 'w') as f:
    json.dump(qa_dataset, f, indent=2)

print(f"\n✓ Dataset saved to: {output_file}")

# Display a sample Q&A pair
if qa_dataset:
    print(f"\nSample Q&A pair:")
    print(f"{'='*60}")
    sample = qa_dataset[0]
    print(f"Question: {sample['question']}")
    print(f"File Path: {sample['file_path']}")
    print(f"References: {sample['refs_valid']}/{sample['refs_generated']} valid")
    print(f"\nFirst Reference:")
    if sample['references']:
        ref_info = sample['references'][0]
        ref_text = ref_info['text']
        chunk_nums = ref_info['chunk_numbers']
        print(f"  Text: {ref_text[:100]}...")
        print(f"  Chunks: {chunk_nums}")
    print(f"{'='*60}")

In [None]:
# Display Q&A dataset structure and validation statistics.
# Nothing is printed when qa_dataset is empty.
if qa_dataset:
    # Use the first entry as a representative example of the record layout
    sample = qa_dataset[0]
    print(f"Q&A Dataset Structure:")
    print(f"{'='*60}")
    print(f"Question: {sample['question']}")
    print(f"File Path: {sample['file_path']}")
    print(f"Number of References: {sample['refs_valid']} (out of {sample['refs_generated']} generated)")
    
    # Only mention removals when at least one reference was dropped for this entry
    if sample['refs_removed'] > 0:
        print(f"⚠ Removed {sample['refs_removed']} reference(s) not found in Qdrant chunks")
    
    print(f"\nFirst Reference:")
    if sample['references']:
        ref = sample['references'][0]
        # Text is truncated to 150 characters for display only
        print(f"  Text: {ref['text'][:150]}...")
        print(f"  Chunk Numbers: {ref['chunk_numbers']}")
        print(f"  ✓ Validated: Present in {len(ref['chunk_numbers'])} chunk(s)")
    print(f"{'='*60}")
    
    # Show overall validation statistics
    # These totals cover only the entries kept in qa_dataset.
    print(f"\n\nOverall Dataset Statistics:")
    print(f"{'='*60}")
    total_generated = sum(qa['refs_generated'] for qa in qa_dataset)
    total_valid = sum(qa['refs_valid'] for qa in qa_dataset)
    total_removed = sum(qa['refs_removed'] for qa in qa_dataset)
    
    # NOTE(review): the percentages divide by total_generated; this assumes
    # every stored entry has refs_generated > 0 - confirm if qa_dataset can be
    # loaded from elsewhere.
    print(f"Total Q&A pairs: {len(qa_dataset)}")
    print(f"References generated: {total_generated}")
    print(f"References validated: {total_valid} ({total_valid/total_generated*100:.1f}%)")
    print(f"References removed: {total_removed} ({total_removed/total_generated*100:.1f}%)")
    print(f"{'='*60}")

## Evaluation Metrics

We evaluate the RAG system using three distinct levels of metrics:

### 1. Token-Level Metrics
Measures overlap between reference tokens and retrieved document tokens:
- **IoU (Intersection over Union)**: Overlap of token sets
- **Precision**: Fraction of retrieved tokens that are relevant
- **Recall**: Fraction of relevant tokens that are retrieved
- **F1 Score**: Harmonic mean of precision and recall

### 2. Passage-Level Metrics
Treats each reference as a discrete unit (passage) and measures retrieval effectiveness:
- **Coverage**: Fraction of reference passages found in retrieved chunks
- **Accuracy**: Binary metric (1 if all references found, 0 otherwise)
- **Precision**: Fraction of retrieved chunks containing at least one reference
- **Recall**: Same as coverage (fraction of references found)
- **F1 Score**: Harmonic mean of precision and recall

### 3. Document-Level Metrics
Measures whether the source document appears in the retrieval results:
- **Coverage**: Binary metric (1 if source document retrieved, 0 otherwise)
- **Accuracy**: Same as coverage
- **Precision**: Fraction of retrieved chunks from the source document
- **Recall**: Binary metric (same as coverage)
- **Chunks Retrieved**: Count of source document chunks in top-K

In [None]:
import pandas as pd
from collections import Counter
import string
from typing import List, Dict

def normalize_text(text: str) -> str:
    """Lowercase the text, drop ASCII punctuation, and collapse whitespace runs to single spaces."""
    lowered = text.lower()
    without_punctuation = "".join(ch for ch in lowered if ch not in string.punctuation)
    words = without_punctuation.split()
    return " ".join(words)

def is_reference_present_fuzzy(reference: str, document: str, threshold: float = 0.8) -> bool:
    """
    Returns True if enough of the reference tokens appear in the document.

    Matching is order-insensitive: each normalized reference token only has to
    occur somewhere in the normalized document.

    Args:
        reference: Reference text to find
        document: Document to search in
        threshold: Fraction of tokens that must match (default: 0.8)

    Returns:
        True if the reference is found with sufficient token overlap;
        False when the reference has no tokens
    """
    ref_tokens = normalize_text(reference).split()
    if not ref_tokens:
        return False
    # A set gives constant-time membership tests; the result is identical to
    # scanning the token list but avoids quadratic work on long chunks.
    doc_vocabulary = set(normalize_text(document).split())
    matched_tokens = sum(1 for t in ref_tokens if t in doc_vocabulary)
    fraction_matched = matched_tokens / len(ref_tokens)
    return fraction_matched >= threshold

# ============================================================================
# TOKEN-LEVEL METRICS
# ============================================================================

def compute_token_metrics(references: List[str], retrieved_texts: List[str], threshold: float = 0.8) -> Dict[str, float]:
    """
    Compute token-level IoU, precision, recall, and F1 score.

    Retrieved tokens are only taken into account when at least one reference
    is (fuzzily) present in some retrieved chunk; otherwise the retrieved side
    is treated as empty and all scores are 0.

    Args:
        references: List of reference texts that should be found
        retrieved_texts: List of retrieved document chunks
        threshold: Threshold for fuzzy matching (default: 0.8)

    Returns:
        Dictionary with 'iou', 'precision', 'recall', 'f1' scores
    """
    all_ref_tokens = []
    any_reference_found = False

    for ref in references:
        all_ref_tokens.extend(normalize_text(ref).split())
        if not any_reference_found:
            any_reference_found = any(
                is_reference_present_fuzzy(ref, doc, threshold) for doc in retrieved_texts
            )

    if not all_ref_tokens:
        return {"iou": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0}

    # Count the retrieved tokens exactly once. Appending them once per found
    # reference multiplied the retrieved token counts by the number of matched
    # references and distorted precision and IoU.
    all_doc_tokens = []
    if any_reference_found:
        for doc in retrieved_texts:
            all_doc_tokens.extend(normalize_text(doc).split())

    ref_counter = Counter(all_ref_tokens)
    doc_counter = Counter(all_doc_tokens)

    # Multiset intersection: per-token minimum of the two counts
    intersection_tokens = ref_counter & doc_counter
    intersection_count = sum(intersection_tokens.values())

    ref_count = sum(ref_counter.values())
    doc_count = sum(doc_counter.values())

    union_count = ref_count + doc_count - intersection_count

    iou = intersection_count / union_count if union_count > 0 else 0.0
    precision = intersection_count / doc_count if doc_count > 0 else 0.0
    recall = intersection_count / ref_count if ref_count > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return {"iou": iou, "precision": precision, "recall": recall, "f1": f1}

# ============================================================================
# PASSAGE-LEVEL METRICS
# ============================================================================

def compute_passage_metrics(references: List[str], retrieved_texts: List[str], threshold: float = 0.8) -> Dict[str, float]:
    """
    Compute passage-level coverage, accuracy, precision, recall, and F1 score.

    Each reference is one passage; it counts as found when it fuzzily matches
    at least one retrieved chunk.

    Args:
        references: List of reference texts (passages) that should be found
        retrieved_texts: List of retrieved document chunks
        threshold: Threshold for fuzzy matching (default: 0.8)

    Returns:
        Dictionary with 'coverage', 'accuracy', 'precision', 'recall', 'f1' scores
        (all 0.0 when either input list is empty)
    """
    if not references or not retrieved_texts:
        return {"coverage": 0.0, "accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0}

    # match_grid[r][d] is True when reference r is present in retrieved chunk d
    match_grid = [
        [is_reference_present_fuzzy(ref, doc, threshold) for doc in retrieved_texts]
        for ref in references
    ]

    # Rows with any hit = references found; columns with any hit = relevant chunks
    found_references = sum(1 for row in match_grid if any(row))
    relevant_retrieved = sum(1 for column in zip(*match_grid) if any(column))

    n_references = len(references)
    n_retrieved = len(retrieved_texts)

    recall = found_references / n_references
    coverage = found_references / n_references
    precision = relevant_retrieved / n_retrieved

    if found_references == n_references:
        accuracy = 1.0
    else:
        accuracy = 0.0

    denominator = precision + recall
    if denominator > 0:
        f1 = 2 * (precision * recall) / denominator
    else:
        f1 = 0.0

    return {
        "coverage": coverage,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# ============================================================================
# DOCUMENT-LEVEL METRICS
# ============================================================================

def compute_document_metrics(source_file_path: str, retrieved_file_paths: List[str]) -> Dict[str, float]:
    """
    Compute document-level coverage, accuracy, precision, and recall.

    Document-level checks if chunks from the source document appear in the retrieved results.
    A retrieved chunk counts as coming from the source document when its file path is
    exactly equal to ``source_file_path``.

    Args:
        source_file_path: File path of the source document
        retrieved_file_paths: List of file paths of retrieved chunks (one entry per chunk)

    Returns:
        Dictionary with 'coverage', 'accuracy', 'precision', 'recall' scores and
        'source_chunks_count' (how many retrieved chunks came from the source document).
        The same set of keys is returned when nothing was retrieved, with every value zero,
        so callers can index the result without special-casing empty retrievals.
    """
    if not retrieved_file_paths:
        # Keep the key set identical to the normal return below; omitting
        # 'source_chunks_count' here would make callers that index it raise KeyError.
        return {
            "coverage": 0.0,
            "accuracy": 0.0,
            "precision": 0.0,
            "recall": 0.0,
            "source_chunks_count": 0
        }

    # Count how many retrieved chunks are from the source document
    source_chunks_retrieved = sum(1 for fp in retrieved_file_paths if fp == source_file_path)

    # Coverage/Accuracy: binary - did we retrieve at least one chunk from source document?
    coverage = 1.0 if source_chunks_retrieved > 0 else 0.0
    accuracy = coverage  # Same as coverage for document-level

    # Precision: fraction of retrieved chunks that are from the source document
    precision = source_chunks_retrieved / len(retrieved_file_paths)

    # Recall: For this metric, we define it as binary (did we find the source doc?)
    # Could also be computed as fraction of source chunks retrieved, but that requires knowing total chunks
    recall = coverage

    return {
        "coverage": coverage,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "source_chunks_count": source_chunks_retrieved
    }

# Status message shown once the metric function definitions preceding it have executed.
print("✓ Token-level, Passage-level, and Document-level metric functions loaded")

## Evaluate the RAG System at different K values

Let's compute the evaluation metrics for our Q&A dataset by retrieving documents from Qdrant and calculating the defined metrics at multiple K values.

In [None]:
# Evaluate the Q&A dataset by retrieving documents and computing metrics
import time

# Configuration
K_VALUES = [3, 5, 10, 15]  # Compute metrics at these K values
K_MAX = max(K_VALUES)  # Retrieve once at the largest cutoff; smaller K values reuse a prefix of that result
THRESHOLD = 0.8  # Threshold for fuzzy matching
SCORE_THRESHOLD = 0.3  # Minimum similarity score requested from Qdrant (passed as score_threshold)

# Store results for each K value
results_by_k = {k: [] for k in K_VALUES}

print(f"Evaluating {len(qa_dataset)} Q&A pairs...")
print(f"Retrieving top-{K_MAX} documents for each question")
print(f"Computing metrics at K = {K_VALUES}")
print(f"Computing Token-level, Passage-level, and Document-level metrics\n")

for idx, qa_pair in enumerate(tqdm(qa_dataset, desc="Evaluating Q&A pairs")):
    question = qa_pair["question"]
    file_path = qa_pair["file_path"]
    references_info = qa_pair["references"]  # List of dicts with 'text' and 'chunk_numbers'

    # Extract reference texts for evaluation
    reference_texts = [ref_info["text"] for ref_info in references_info]

    # Generate query embedding
    query_embedding = embedder.encode([question])[0].tolist()

    # Measure retrieval time (the Qdrant query only; embedding the question is excluded).
    # perf_counter is a monotonic high-resolution clock intended for timing short intervals,
    # unlike time.time(), which follows the adjustable wall clock.
    start_time = time.perf_counter()

    # Retrieve documents from Qdrant (retrieve K_MAX).
    # With score_threshold set, the query may return fewer than K_MAX points.
    search_results = qdrant_client.query_points(
        collection_name=collection_name,
        query=query_embedding,
        limit=K_MAX,
        score_threshold=SCORE_THRESHOLD
    )

    retrieval_time = time.perf_counter() - start_time

    # Extract ALL retrieved document contents and their file paths
    all_retrieved_texts = []
    all_retrieved_scores = []
    all_retrieved_file_paths = []
    for point in search_results.points:
        # Prefer the 'content' field, then 'text'; fall back to the stringified payload
        content = point.payload.get('content', '') or point.payload.get('text', '') or str(point.payload)
        all_retrieved_texts.append(content)
        all_retrieved_scores.append(point.score)
        all_retrieved_file_paths.append(point.payload.get('file_path', ''))

    # Compute metrics at different K values
    for k in K_VALUES:
        # Take only top-K results (slicing yields fewer than k items when fewer were returned)
        retrieved_texts = all_retrieved_texts[:k]
        retrieved_scores = all_retrieved_scores[:k]
        retrieved_file_paths = all_retrieved_file_paths[:k]

        # ========================================================================
        # COMPUTE TOKEN-LEVEL METRICS
        # ========================================================================
        token_metrics = compute_token_metrics(reference_texts, retrieved_texts, threshold=THRESHOLD)

        # ========================================================================
        # COMPUTE PASSAGE-LEVEL METRICS
        # ========================================================================
        passage_metrics = compute_passage_metrics(reference_texts, retrieved_texts, threshold=THRESHOLD)

        # ========================================================================
        # COMPUTE DOCUMENT-LEVEL METRICS
        # ========================================================================
        document_metrics = compute_document_metrics(file_path, retrieved_file_paths)

        # Store results with all three metric levels for this K value.
        # retrieval_time is measured once per question, so the same value is stored for every K.
        results_by_k[k].append({
            "question": question,
            "file_path": file_path,
            "k": k,
            "num_references": len(reference_texts),
            "num_retrieved": len(retrieved_texts),
            "retrieval_time": retrieval_time,
            "avg_score": sum(retrieved_scores) / len(retrieved_scores) if retrieved_scores else 0,

            # Token-level metrics
            "token_iou": token_metrics["iou"],
            "token_precision": token_metrics["precision"],
            "token_recall": token_metrics["recall"],
            "token_f1": token_metrics["f1"],

            # Passage-level metrics
            "passage_coverage": passage_metrics["coverage"],
            "passage_accuracy": passage_metrics["accuracy"],
            "passage_precision": passage_metrics["precision"],
            "passage_recall": passage_metrics["recall"],
            "passage_f1": passage_metrics["f1"],

            # Document-level metrics
            "doc_coverage": document_metrics["coverage"],
            "doc_accuracy": document_metrics["accuracy"],
            "doc_precision": document_metrics["precision"],
            "doc_recall": document_metrics["recall"],
            # Default to 0 so a question with no retrieved points cannot abort the run on a missing key
            "doc_chunks_retrieved": document_metrics.get("source_chunks_count", 0)
        })

# Convert to DataFrames
dfs_by_k = {k: pd.DataFrame(results) for k, results in results_by_k.items()}

print(f"\n✓ Evaluation complete: {len(qa_dataset)} Q&A pairs evaluated at K = {K_VALUES}")

## Evaluation at Multiple K Values

For each question we retrieve the top 15 documents once, then compute metrics at different cutoff points (K=3, 5, 10, 15) on that same ranked list to understand:
- How performance changes with the number of retrieved documents
- The optimal K value for this RAG system
- Trade-offs between retrieval quality and computational cost

For each K value, we compute:
- **Token-level metrics**: Fine-grained text overlap
- **Passage-level metrics**: Reference passage retrieval effectiveness
- **Document-level metrics**: Source document retrieval success

In [None]:
# Report the per-K averages of every metric, then build and persist a compact comparison table

heavy_rule = "=" * 80
light_rule = "─" * 80

# Columns whose mean is reported, either in the printed breakdown or in the summary table
reported_columns = (
    "token_iou", "token_precision", "token_recall", "token_f1",
    "passage_coverage", "passage_accuracy", "passage_precision", "passage_recall", "passage_f1",
    "doc_coverage", "doc_accuracy", "doc_precision", "doc_recall", "doc_chunks_retrieved",
    "retrieval_time",
)

print("\n" + heavy_rule)
print("AVERAGE EVALUATION METRICS AT DIFFERENT K VALUES")
print(heavy_rule)

# One row per K value for the comparison table
summary_data = []

for k in K_VALUES:
    df = dfs_by_k[k]
    # Average each reported column once; the values feed both the printout and the summary row
    averages = {column: df[column].mean() for column in reported_columns}

    print("\n" + light_rule)
    print(f"K = {k}")
    print(light_rule)

    # Token-level averages
    print("\n  [TOKEN-LEVEL]")
    print(
        f"    IoU: {averages['token_iou']:.4f}  |  "
        f"Precision: {averages['token_precision']:.4f}  |  "
        f"Recall: {averages['token_recall']:.4f}  |  "
        f"F1: {averages['token_f1']:.4f}"
    )

    # Passage-level averages
    print("\n  [PASSAGE-LEVEL]")
    print(
        f"    Coverage: {averages['passage_coverage']:.4f}  |  "
        f"Accuracy: {averages['passage_accuracy']:.4f}  |  "
        f"Precision: {averages['passage_precision']:.4f}  |  "
        f"Recall: {averages['passage_recall']:.4f}  |  "
        f"F1: {averages['passage_f1']:.4f}"
    )

    # Document-level averages
    print("\n  [DOCUMENT-LEVEL]")
    print(
        f"    Coverage: {averages['doc_coverage']:.4f}  |  "
        f"Accuracy: {averages['doc_accuracy']:.4f}  |  "
        f"Precision: {averages['doc_precision']:.4f}  |  "
        f"Recall: {averages['doc_recall']:.4f}  |  "
        f"Avg Chunks: {averages['doc_chunks_retrieved']:.2f}"
    )

    # Row for the comparison table (key order fixes the column order of the table and CSV)
    summary_data.append({
        'K': k,
        'Token_F1': averages['token_f1'],
        'Passage_Coverage': averages['passage_coverage'],
        'Passage_F1': averages['passage_f1'],
        'Doc_Coverage': averages['doc_coverage'],
        'Doc_Precision': averages['doc_precision'],
        'Retrieval_Time': averages['retrieval_time']
    })

print("\n" + heavy_rule)

# Assemble the comparison table and print it between two rules
df_summary = pd.DataFrame(summary_data)
print("\nSUMMARY COMPARISON TABLE")
print(heavy_rule, df_summary.to_string(index=False), heavy_rule, sep="\n")


# Persist the comparison table next to the notebook
summary_file = "evaluation_summary.csv"
df_summary.to_csv(summary_file, index=False)
print(f"✓ Summary saved to: {summary_file}")

In [None]:
# Bare expression as the last line of the cell, so the notebook displays the summary DataFrame.
df_summary