In [None]:
import os
import sys
from hybrid_retriever import Retriever, chunk_text_by_section, generate_embeddings_batch

## Step 1: Load and Chunk Document

Load the report.md file and split it into sections for indexing.

In [None]:
# Load the report and split it into section chunks.
# Defines `report_text` and `chunks` only when the file exists; otherwise a
# message is printed and nothing is defined.
report_path = "report.md"
if not os.path.exists(report_path):
    print(f"❌ {report_path} not found")
else:
    # Read the whole report as UTF-8 text
    with open(report_path, 'r', encoding='utf-8') as f:
        report_text = f.read()

    # Chunk by sections
    chunks = chunk_text_by_section(report_text)
    print(f"✅ Loaded and chunked document into {len(chunks)} sections")

    # Show first chunk -- guarded because a report with no sections may
    # yield no chunks, and chunks[0] would then raise IndexError
    if chunks:
        print("\nFirst chunk preview:")
        print(chunks[0][:300] + "...\n")

## Step 2: Generate Embeddings

Create embeddings for semantic search.

In [None]:
# Generate embeddings for all chunks
embeddings = generate_embeddings_batch(chunks)

# Use an explicit length check rather than truthiness: it works whether the
# batch comes back as a list or as an array-like object.
has_embeddings = len(embeddings) > 0

print(f"✅ Generated {len(embeddings)} embeddings")
print(f"   Embedding dimension: {len(embeddings[0]) if has_embeddings else 0}")

# Preview the first three components only when there is something to show;
# indexing embeddings[0] on an empty batch would raise IndexError.
if has_embeddings:
    preview = [f"{x:.4f}" for x in embeddings[0][:3]]
    print(f"   Example: {preview}...")

## Step 3: Build Hybrid Index

Create a Retriever that combines semantic and lexical search.

In [None]:
# Initialize hybrid retriever
retriever = Retriever()

# Add documents to both indexes.
# zip() stops at the shorter of the two sequences, so count what is actually
# added instead of assuming every chunk received an embedding.
indexed_count = 0
for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
    metadata = {
        'id': i,
        'content': chunk,
        # First line of the chunk serves as its section title; fall back to a
        # positional label for an empty chunk
        'section': chunk.split('\n')[0] if chunk else f"Section {i}",
    }
    retriever.add_document(chunk, embedding, metadata)
    indexed_count += 1

print(f"✅ Built hybrid retriever with {indexed_count} documents")
print("   - Vector Index: Ready for semantic search")
print("   - BM25 Index: Ready for lexical search")
print("   - Retriever: Ready for merged RRF search")

## Step 4: Test Semantic Search Only

See what happens with only vector search (the original problem).

In [None]:
from hybrid_retriever import generate_embeddings_batch

# Create query and embed it with the same batch helper used for the chunks
# (the helper takes a list, so wrap the query and take the single result)
query = "What happened with incident 2023 Q4 011"
query_embedding = generate_embeddings_batch([query])[0]

# Semantic search only
separator = "=" * 80
print(f"🔍 Query: '{query}'\n")
print(separator)
print("SEMANTIC SEARCH ONLY (Original Problem)")
print(separator + "\n")

semantic_results = retriever.vector_index.search(query_embedding, top_k=3)

# Each result is unpacked as a (metadata, distance) pair
for i, (metadata, distance) in enumerate(semantic_results, 1):
    # NOTE(review): 1 - distance is a similarity only if the vector index
    # reports cosine distance -- confirm against VectorIndex
    similarity = 1 - distance
    print(f"Result {i}: {metadata['section']}")
    print(f"  Distance: {distance:.4f}, Similarity: {similarity:.4f}")
    print(f"  Content: {metadata['content'][:200]}...\n")

## Step 5: Test Lexical Search (BM25)

See how BM25 keyword matching performs.

In [None]:
# Section banner for the lexical-search demo
rule = "=" * 80
print(rule)
print("LEXICAL SEARCH (BM25)")
print(rule + "\n")

# Keyword search over the same `query` string used for the semantic demo
bm25_results = retriever.bm25_index.search(query, top_k=3)

# Results are unpacked as (metadata, distance) pairs; the distance is negated
# for display so the BM25 score prints as a positive number
for rank, (hit, hit_distance) in enumerate(bm25_results, start=1):
    section_title = hit['section']
    snippet = hit['content'][:200]
    print(f"Result {rank}: {section_title}")
    print(f"  BM25 Score: {-hit_distance:.4f}")
    print(f"  Content: {snippet}...\n")

## Step 6: Test Hybrid Search (Reciprocal Rank Fusion)

Merge both results using RRF for improved ranking.

In [None]:
# Section banner for the hybrid-search demo
rule = "=" * 80
print(rule)
print("HYBRID SEARCH (RRF Fusion)")
print(rule + "\n")

# The retriever takes both the raw query text and its embedding
hybrid_results = retriever.search(query, query_embedding, top_k=3)

# Results are unpacked as (metadata, score) pairs; the score is negated for
# display, mirroring how the BM25 demo prints its scores
for rank, (hit, fused_score) in enumerate(hybrid_results, start=1):
    section_title = hit['section']
    snippet = hit['content'][:200]
    print(f"Result {rank}: {section_title}")
    print(f"  RRF Score: {-fused_score:.4f}")
    print(f"  Content: {snippet}...\n")

## Understanding Reciprocal Rank Fusion (RRF)

RRF combines rankings from multiple search systems:

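**Formula:** `RRF_score(d) = Σ 1/(k + rank_i(d))`, summed over every ranking system `i` that returns document `d`. Here `rank_i(d)` is the 1-based position of `d` in system `i`, and `k` is a smoothing constant (60 in the example below).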

**Example:**
- Semantic search returns: [Section A, Section B, Section C]
- BM25 returns: [Section C, Section A, Section B]

**RRF Calculation:**
- Section A: 1/(60+1) + 1/(60+2) = 0.0164 + 0.0161 = 0.0325 ⭐ Best
- Section B: 1/(60+2) + 1/(60+3) = 0.0161 + 0.0159 = 0.0320
- Section C: 1/(60+3) + 1/(60+1) = 0.0159 + 0.0164 = 0.0323

**Result:** Section A wins because it ranked well in both systems!

## Comparison Summary

| Aspect | Semantic | Lexical (BM25) | Hybrid (RRF) |
|--------|----------|----------------|---------------|
| **Strengths** | Understanding context & meaning | Exact keyword matching | Both! |
| **Weaknesses** | Misses exact keywords | No semantic understanding | Requires building and querying both indexes |
| **Problem Query** | Returns irrelevant sections | May miss nuance | Returns best of both |
| **Architecture** | VectorIndex | BM25Index | Retriever + RRF |

### Key Insight

By combining semantic search (embeddings) with lexical search (BM25), we get:
- ✅ Semantic understanding
- ✅ Exact keyword matching  
- ✅ Robust ranking via RRF

This is the foundation of production RAG systems!

## Why This Design is Extensible

The Retriever pattern makes it easy to add more search methods:

```python
class MyCustomIndex:
    """Template for a custom search index that can sit alongside the others.

    Only the two methods below are needed; both are placeholders to fill in.
    """

    def add_document(self, text, metadata):
        """Store one document's text together with its metadata dict."""
        # Your implementation
        pass

    def search(self, query, top_k):
        """Return up to ``top_k`` results as ``(metadata, distance)`` tuples.

        This is the same pair shape that the demo cells unpack from the
        vector and BM25 index searches.
        """
        # Your implementation
        return [(metadata, distance), ...]

# Just add it to Retriever!
retriever.custom_index = MyCustomIndex()
# Search exactly as in Step 6: the retriever needs the query text, its
# embedding, and the number of results to return
retriever.search(query, query_embedding, top_k=3)  # Automatically includes custom results via RRF
```

As long as each index has `add_document()` and `search()`, it works with RRF!