# Module 2: RAG Systems

## Applied AI Scientist Field Notes - Expanded Edition

---


## Module 2: Retrieval-Augmented Generation (RAG) - Production Implementation

### Topics Covered
1. Advanced chunking strategies
2. Vector embeddings and similarity search
3. Hybrid search (BM25 + Dense)
4. Security: RBAC and audit logging
5. Reranking and fusion
6. Production optimization

---

In [None]:
%pip install -q chromadb sentence-transformers faiss-cpu rank-bm25
%pip install -q pandas numpy

print('Dependencies installed!')

### Section 1: Document Chunking Strategies

Chunking is critical for RAG quality. Common strategies (only the sentence-aware strategy is implemented in the code cell below):
- **Fixed-size**: Simple but breaks semantics
- **Sentence-aware**: Respects boundaries
- **Semantic**: Based on topic shifts
- **Recursive**: Multi-level splitting
- **Structure-aware**: Preserves document structure (headers, lists)

In [None]:
from dataclasses import dataclass
from typing import List, Dict, Any
import hashlib

@dataclass
class Chunk:
    '''One chunk of a document together with its position and bookkeeping data.'''
    text: str  # chunk content (sentences joined with single spaces by the chunker)
    start_idx: int  # character offset where the chunk starts, as computed by the chunker
    end_idx: int  # character offset where the chunk ends, as computed by the chunker
    chunk_id: str  # identifier assigned by the chunker, e.g. '<doc_id>_c<n>'
    metadata: Dict[str, Any]  # free-form annotations; the chunker stores 'method' and 'doc_id'

class AdvancedChunker:
    '''Production-grade document chunking.

    Groups whole sentences into chunks and can repeat the last sentence of a
    chunk at the start of the next one so context is not lost at boundaries.
    '''

    def __init__(self, chunk_size=512, overlap=50):
        '''Configure the chunker.

        Args:
            chunk_size: Budget for the summed length of the sentences in one
                chunk (the joining spaces are not counted). Sentences are
                never split, so a sentence longer than the budget still
                becomes part of a chunk on its own terms.
            overlap: Any value greater than zero enables a one-sentence
                overlap between consecutive chunks; the magnitude itself is
                not used.
        '''
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_by_sentences(self, text: str, doc_id: str) -> List['Chunk']:
        '''Chunk respecting sentence boundaries.

        Args:
            text: Document text. Sentences are split after '.', '!' or '?'
                followed by whitespace.
            doc_id: Identifier used to build chunk ids and stored in metadata.

        Returns:
            Chunks in document order. ``start_idx``/``end_idx`` are offsets
            into ``text`` spanning the first through last sentence of the
            chunk. Empty or whitespace-only input returns an empty list.
        '''
        import re

        # The capturing group keeps the separators in the result, so the true
        # offset of every sentence can be tracked even when sentences are
        # separated by more than one whitespace character.
        parts = re.split(r'((?<=[.!?])\s+)', text)

        sentences = []  # (offset in text, sentence)
        offset = 0
        for position, part in enumerate(parts):
            # Even positions are sentences, odd positions are separators.
            # Blank "sentences" (empty input, trailing whitespace) are dropped
            # so they never produce an empty chunk.
            if position % 2 == 0 and part.strip():
                sentences.append((offset, part))
            offset += len(part)

        chunks = []
        current = []  # (offset, sentence) pairs of the chunk being built
        current_len = 0

        for item in sentences:
            sent_len = len(item[1])

            if current and current_len + sent_len > self.chunk_size:
                chunks.append(self._build_chunk(current, doc_id, len(chunks)))

                # Overlap: carry the last sentence into the next chunk, but
                # only if the chunk had more than one sentence (otherwise the
                # next chunk would just repeat this one).
                if self.overlap > 0 and len(current) > 1:
                    current = current[-1:]
                    current_len = len(current[0][1])
                else:
                    current = []
                    current_len = 0

            current.append(item)
            current_len += sent_len

        if current:
            chunks.append(self._build_chunk(current, doc_id, len(chunks)))

        return chunks

    @staticmethod
    def _build_chunk(sentences, doc_id, index):
        '''Build a Chunk from (offset, sentence) pairs; ``index`` numbers the chunk id.'''
        first_offset = sentences[0][0]
        last_offset, last_sentence = sentences[-1]
        return Chunk(
            text=' '.join(sentence for _, sentence in sentences),
            start_idx=first_offset,
            end_idx=last_offset + len(last_sentence),
            chunk_id=f'{doc_id}_c{index}',
            metadata={'method': 'sentence', 'doc_id': doc_id}
        )

# Smoke test: chunk a short four-sentence sample. The sample is shorter than
# chunk_size=200, so a single chunk is expected.
chunker = AdvancedChunker(chunk_size=200, overlap=30)
sample = 'AI agents are software systems. They use LLMs for reasoning. They can use tools. They maintain state across interactions.'
chunks = chunker.chunk_by_sentences(sample, 'doc1')

# Report the chunk count, then each chunk's length and its first 60 characters.
print(f'Created {len(chunks)} chunks:')
for i, c in enumerate(chunks):
    print(f'  Chunk {i+1}: {len(c.text)} chars - {c.text[:60]}...')

### Section 2: Production RAG System with Security

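Key features of a production RAG system (the example below implements RBAC, audit logging, and role-based metadata filtering; citation tracking and cost monitoring are not implemented in it):
- Role-based access control (RBAC)
- Audit logging
- Metadata filtering
- Citation tracking
- Cost monitoring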

In [None]:
import chromadb
from sentence_transformers import SentenceTransformer
from datetime import datetime
import uuid

class SecureRAG:
    '''Production RAG with RBAC and observability.

    Documents are chunked, embedded and stored in a Chroma collection together
    with the roles allowed to read them. Retrieval filters results by role and
    every ingest/retrieve call is recorded in an in-memory audit log.
    '''

    def __init__(self, embedding_model='all-MiniLM-L6-v2'):
        '''Load the embedding model and open the 'secure_docs' collection.'''
        self.embedding_model = SentenceTransformer(embedding_model)
        self.client = chromadb.Client()
        self.collection = self.client.get_or_create_collection('secure_docs')
        self.audit_log = []

    def ingest(self, text: str, doc_id: str, allowed_roles: set, metadata: dict = None):
        '''Ingest document with access control.

        Args:
            text: Document text to chunk and embed.
            doc_id: Identifier stored on every chunk and used in chunk ids.
            allowed_roles: Roles permitted to retrieve the document. A single
                role given as a plain string is treated as one role.
            metadata: Optional extra metadata copied onto every chunk. It
                cannot override the system-managed keys 'doc_id',
                'allowed_roles' and 'ingested_at'.

        Returns:
            The ids of the stored chunks (empty if the text has no content).
        '''
        # A bare string would otherwise be joined character by character.
        if isinstance(allowed_roles, str):
            allowed_roles = {allowed_roles}

        chunker = AdvancedChunker(400, 50)
        chunks = chunker.chunk_by_sentences(text, doc_id)

        # Never embed or store blank chunks.
        texts = [c.text for c in chunks if c.text.strip()]
        if not texts:
            self._log('INGEST', doc_id=doc_id, chunks=0)
            return []

        embeddings = self.embedding_model.encode(texts).tolist()

        ids = [
            f'{doc_id}_{i}_{hashlib.md5(chunk_text.encode()).hexdigest()[:8]}'
            for i, chunk_text in enumerate(texts)
        ]

        # System-managed keys are applied last so caller-supplied metadata can
        # never replace the access-control list. Roles are sorted so the
        # stored string is deterministic.
        reserved = {
            'doc_id': doc_id,
            'allowed_roles': ','.join(sorted(allowed_roles)),
            'ingested_at': datetime.utcnow().isoformat()
        }
        metas = [{**(metadata or {}), **reserved} for _ in texts]

        self.collection.add(ids=ids, embeddings=embeddings, documents=texts, metadatas=metas)
        self._log('INGEST', doc_id=doc_id, chunks=len(ids))
        return ids

    def retrieve(self, query: str, user_role: str, top_k=5):
        '''Retrieve with RBAC enforcement.

        Fetches up to ``top_k * 2`` nearest chunks and keeps at most ``top_k``
        whose allowed roles contain ``user_role`` or 'public'. Because the
        role filter runs after the vector search, fewer than ``top_k`` results
        may be returned even when more permitted chunks exist.

        Returns:
            A list of dicts with 'text', 'metadata' and 'similarity'.
        '''
        filtered = []

        # Clamp the request to what the collection holds; asking for zero or
        # for more results than exist is not meaningful.
        available = self.collection.count() if top_k > 0 else 0
        if available > 0:
            query_emb = self.embedding_model.encode([query])[0].tolist()
            results = self.collection.query(
                query_embeddings=[query_emb],
                n_results=min(top_k * 2, available)
            )

            hits = zip(results['documents'][0], results['metadatas'][0], results['distances'][0])
            for document, meta, distance in hits:
                # Drop empty entries so a chunk without roles never matches an
                # empty user_role.
                allowed = {role for role in (meta or {}).get('allowed_roles', '').split(',') if role}

                if user_role in allowed or 'public' in allowed:
                    filtered.append({
                        'text': document,
                        'metadata': meta,
                        # NOTE(review): 1 - distance is a cosine similarity only
                        # if the collection uses a cosine space - confirm the
                        # collection's distance setting.
                        'similarity': 1 - distance
                    })
                    if len(filtered) >= top_k:
                        break

        self._log('RETRIEVE', query=query[:50], role=user_role, results=len(filtered))
        return filtered

    def _log(self, action, **kwargs):
        '''Append an audit entry; extra fields cannot override the system fields.'''
        self.audit_log.append({
            **kwargs,
            'timestamp': datetime.utcnow().isoformat(),
            'action': action,
            'log_id': str(uuid.uuid4())
        })

    def get_audit_log(self, last_n=10):
        '''Return the most recent ``last_n`` audit entries (oldest first).'''
        # A slice of [-0:] would return the whole log, so handle it explicitly.
        if last_n <= 0:
            return []
        return self.audit_log[-last_n:]

# Example usage: build a SecureRAG instance (loads the embedding model and
# opens the Chroma collection).
print('Initializing Secure RAG...')
rag = SecureRAG()

# Ingest three documents with different access levels
rag.ingest('Company holidays: Jan 1, Jul 4, Dec 25', 'holidays', {'public', 'employee'})
rag.ingest('Leave policy: 15 days after 1 year tenure', 'leave', {'employee', 'hr'})
rag.ingest('L4 salary: $150K-$180K', 'compensation', {'hr'})

# Each entry is (query text, role of the requesting user).
print('\nTesting RBAC retrieval...')
queries = [
    ('holidays', 'public'),
    ('leave policy', 'employee'),
    ('L4 salary', 'employee'),  # compensation doc is hr-only; other permitted chunks may still be returned
    ('L4 salary', 'hr')  # hr may read the compensation doc
]

# Print only the number of results per query; inspect the returned chunks to
# see which documents were actually served.
for q, role in queries:
    results = rag.retrieve(q, role, top_k=1)
    print(f'Query: {q:20} | Role: {role:10} | Results: {len(results)}')

## Interview Questions: RAG Systems - Part 1 (Chunking & Retrieval)

### For Experienced Professionals

Understanding production RAG requires deep knowledge of chunking strategies, retrieval methods, and performance optimization.


In [None]:
interview_questions_rag_part1 = [
    {
        "level": "Senior",
        "question": "Your RAG system has 92% retrieval recall but users complain answers are 'incomplete' or 'miss important details.' The retrieved chunks are semantically relevant. What's likely wrong and how do you fix it?",
        "answer": """
**Root Cause Analysis:**

The problem is likely **chunk boundary issues** - semantically relevant chunks are retrieved, but critical context is split across chunk boundaries.

**Example Scenario:**
```
Chunk 1 (retrieved): "...the refund policy applies to purchases within 30 days."
Chunk 2 (not retrieved): "However, electronics have a different policy requiring return within 14 days with original packaging."
```

User asks: "What's the electronics refund policy?"
System retrieves Chunk 1 (mentions "refund"), but misses the specific exception for electronics.

**Diagnostic Steps:**

1. **Analyze Failed Cases:**
   ```python
   def diagnose_incomplete_answers(query: str, retrieved_chunks: List[str], ground_truth: str) -> dict:
       # Check if answer spans multiple chunks
       all_chunks = get_all_chunks_for_doc(doc_id)
       
       # Find which chunks contain ground truth info
       relevant_chunk_ids = []
       for i, chunk in enumerate(all_chunks):
           if contains_answer_info(chunk, ground_truth):
               relevant_chunk_ids.append(i)
       
       # Check if relevant chunks are adjacent but not all retrieved
       retrieved_ids = [c.id for c in retrieved_chunks]
       missing_adjacent = [
           id for id in relevant_chunk_ids 
           if id not in retrieved_ids and (id-1 in retrieved_ids or id+1 in retrieved_ids)
       ]
       
       return {
           "total_relevant": len(relevant_chunk_ids),
           "retrieved_relevant": len([id for id in relevant_chunk_ids if id in retrieved_ids]),
           "missing_adjacent": len(missing_adjacent),
           "boundary_issue": len(missing_adjacent) > 0
       }
   ```

2. **Check Chunk Size Distribution:**
   - Too small: Fragments concepts
   - Too large: Dilutes relevance scores

**Solutions:**

**1. Sentence Window Retrieval (Immediate, ~40% improvement):**
```python
class SentenceWindowRetriever:
    '''Retrieve small chunks but return with context window.'''
    
    def __init__(self, chunk_size=256, window_sentences=3):
        self.chunk_size = chunk_size
        self.window_sentences = window_sentences
    
    def retrieve_with_context(self, query: str, top_k: int = 5) -> List[dict]:
        # Step 1: Retrieve small, focused chunks
        small_chunks = self.retrieve_small_chunks(query, top_k)
        
        # Step 2: Expand each chunk with surrounding sentences
        expanded_chunks = []
        for chunk in small_chunks:
            expanded = self.expand_chunk(
                chunk,
                before=self.window_sentences,
                after=self.window_sentences
            )
            expanded_chunks.append(expanded)
        
        return expanded_chunks
    
    def expand_chunk(self, chunk: dict, before: int, after: int) -> dict:
        '''Add context from surrounding sentences.'''
        doc_id = chunk['metadata']['doc_id']
        chunk_idx = chunk['metadata']['chunk_idx']
        
        # Get original document
        doc = self.get_document(doc_id)
        all_chunks = self.chunk_document(doc)
        
        # Expand window
        start_idx = max(0, chunk_idx - before)
        end_idx = min(len(all_chunks), chunk_idx + after + 1)
        
        expanded_text = ' '.join([
            all_chunks[i].text for i in range(start_idx, end_idx)
        ])
        
        return {
            'text': expanded_text,
            'original_chunk': chunk['text'],
            'context_added': True,
            'metadata': chunk['metadata']
        }
```

**Benefits:**
- Retrieve precise matches (small chunks)
- Return complete context (window expansion)
- 20-40% improvement in answer completeness

**2. Hierarchical Chunking (Medium-term, ~50% improvement):**
```python
class HierarchicalChunker:
    '''Multi-level chunking: paragraph -> section -> document.'''
    
    def chunk_hierarchical(self, doc: str, doc_id: str) -> List[dict]:
        chunks = []
        
        # Level 1: Paragraph chunks (for retrieval)
        paragraphs = doc.split('\\n\\n')
        
        # Level 2: Section chunks (for context)
        sections = self.split_by_headers(doc)
        
        for i, para in enumerate(paragraphs):
            # Find parent section
            parent_section = self.find_parent_section(para, sections)
            
            chunks.append({
                'chunk_id': f'{doc_id}_p{i}',
                'text': para,  # Small chunk for embedding
                'parent_text': parent_section,  # Large context for LLM
                'level': 'paragraph',
                'metadata': {
                    'doc_id': doc_id,
                    'section_title': parent_section.split('\\n')[0]
                }
            })
        
        return chunks
    
    def retrieve_hierarchical(self, query: str) -> List[dict]:
        # Retrieve small paragraph chunks
        para_chunks = self.retrieve_paragraphs(query, top_k=10)
        
        # Return parent section for each chunk
        return [
            {
                'retrieval_chunk': chunk['text'],
                'context_for_llm': chunk['parent_text'],  # Full section
                'metadata': chunk['metadata']
            }
            for chunk in para_chunks
        ]
```

**Benefits:**
- Retrieve granular (paragraph-level)
- Provide comprehensive context (section-level)
- Preserves document structure

**3. Overlapping with Semantic Similarity (Long-term, ~60% improvement):**
```python
class SmartOverlapChunker:
    '''Intelligent overlap based on semantic coherence.'''
    
    def chunk_with_smart_overlap(self, doc: str) -> List[dict]:
        sentences = self.split_sentences(doc)
        chunks = []
        
        i = 0
        while i < len(sentences):
            # Build chunk up to max size
            chunk_sentences = []
            chunk_len = 0
            
            while i < len(sentences) and chunk_len < self.max_chunk_size:
                chunk_sentences.append(sentences[i])
                chunk_len += len(sentences[i])
                i += 1
            
            # Look ahead: should we include more for completeness?
            if i < len(sentences):
                next_sentence = sentences[i]
                
                # Check semantic coherence
                if self.should_extend_chunk(chunk_sentences, next_sentence):
                    # Critical info continues, extend chunk
                    chunk_sentences.append(next_sentence)
                    i += 1
            
            chunks.append({
                'text': ' '.join(chunk_sentences),
                'sentences': chunk_sentences
            })
            
            # Smart backtrack for overlap
            # Include last 2-3 sentences in next chunk if semantically connected
            if i < len(sentences):
                overlap_sentences = self.determine_overlap(chunk_sentences, sentences[i:i+3])
                i -= len(overlap_sentences)
        
        return chunks
    
    def should_extend_chunk(self, current: List[str], next_sent: str) -> bool:
        '''Check if next sentence completes the concept.'''
        # Check for continuation markers
        continuation_markers = ['however', 'additionally', 'furthermore', 'except']
        if any(marker in next_sent.lower() for marker in continuation_markers):
            return True
        
        # Check semantic similarity
        current_text = ' '.join(current[-2:])  # Last 2 sentences
        similarity = self.compute_similarity(current_text, next_sent)
        
        return similarity > 0.8  # High similarity = same concept
```

**Metrics to Track:**
```python
metrics = {
    "retrieval_recall": 0.92,  # Chunks retrieved / relevant chunks
    "answer_completeness": 0.65,  # Before fix (user complaint)
    "answer_completeness_target": 0.90,  # Target
    
    # After sentence window retrieval
    "answer_completeness_v2": 0.85,  # +20%
    
    # After hierarchical chunking
    "answer_completeness_v3": 0.92,  # +27%
}
```

**Evaluation:**
- Manual review of 100 failed cases
- Measure: Does answer cover all relevant details in ground truth?
- A/B test: Old chunking vs new strategy
- Track: User satisfaction, follow-up question rate (should decrease)

**Key Insight:**
High retrieval recall ≠ good answers. The problem isn't **which** chunks you retrieve, but **how much context** each chunk contains.
        """,
    },
    {
        "level": "Senior",
        "question": "You're building a RAG system for a legal document corpus (10K documents, avg 50 pages each). Your vector DB (FAISS) retrieval takes 800ms at P95. The business requirement is <200ms P95. Walk through your optimization strategy.",
        "answer": """
**Performance Baseline:**
- Corpus: 10K docs × 50 pages × ~3 chunks/page = ~1.5M chunks
- Current P95 latency: 800ms
- Target P95: 200ms (4x improvement needed)

**Root Cause Analysis:**

1. **Profile the Pipeline:**
```python
import time

def profile_retrieval(query: str):
    timings = {}
    
    start = time.time()
    query_embedding = embed_model.encode([query])[0]
    timings['embedding'] = time.time() - start
    
    start = time.time()
    results = faiss_index.search(query_embedding, k=50)
    timings['vector_search'] = time.time() - start
    
    start = time.time()
    metadata = [get_metadata(id) for id in results.ids]
    timings['metadata_fetch'] = time.time() - start
    
    start = time.time()
    reranked = reranker.rank(query, [r.text for r in results])
    timings['reranking'] = time.time() - start
    
    return timings

# Run on 1000 queries
timings = [profile_retrieval(q) for q in test_queries]
p95 = {k: np.percentile([t[k] for t in timings], 95) for k in timings[0].keys()}

print(p95)
# {'embedding': 50ms, 'vector_search': 600ms, 'metadata_fetch': 100ms, 'reranking': 50ms}
```

**Bottleneck Identified: Vector search (600ms)**

**Optimization Strategy:**

**Phase 1: Index Optimization (Expected: 800ms → 400ms)**

```python
import faiss

class OptimizedFAISSIndex:
    def __init__(self, dimension: int, num_vectors: int):
        self.dimension = dimension
        self.num_vectors = num_vectors
        
        # Strategy 1: Use IVF (Inverted File Index) for fast approximate search
        # Instead of flat L2 search (exhaustive), use clustering
        
        n_clusters = int(np.sqrt(num_vectors))  # ~1,225 clusters for 1.5M vectors
        
        # Quantizer: coarse clustering
        quantizer = faiss.IndexFlatL2(dimension)
        
        # IVF index: search only nearby clusters
        self.index = faiss.IndexIVFFlat(
            quantizer, 
            dimension, 
            n_clusters,
            faiss.METRIC_L2
        )
        
        # Train index on representative sample
        print("Training index...")
        # training_vectors = sample_embeddings(100K)  # 100K samples sufficient
        # self.index.train(training_vectors)
    
    def optimize_search_params(self):
        '''Tune nprobe for latency/accuracy tradeoff'''
        # nprobe: number of clusters to search
        # Higher nprobe = more accurate but slower
        
        self.index.nprobe = 32  # Search 32 clusters (vs all 1,225)
        # Accuracy: ~95% vs exhaustive search
        # Speed: ~20x faster
    
    def add_with_ids(self, embeddings, ids):
        '''Add vectors to index'''
        self.index.add_with_ids(embeddings, ids)

# Replace flat index with IVF
# Before: IndexFlatL2 (exhaustive search, 600ms)
# After: IndexIVFFlat with nprobe=32 (~30ms)
```

**Expected Improvement: 600ms → 30ms for vector search**

**Phase 2: Metadata Co-location (Expected: 400ms → 300ms)**

```python
class ColocatedIndex:
    '''Store metadata with vectors to avoid separate fetch'''
    
    def __init__(self):
        self.index = OptimizedFAISSIndex(768, 1_500_000)
        # Store metadata in memory-mapped file for fast access
        self.metadata_store = {}  # In production: RocksDB or similar
    
    def add_documents(self, chunks: List[dict]):
        embeddings = []
        ids = []
        
        for chunk in chunks:
            embeddings.append(chunk['embedding'])
            ids.append(chunk['id'])
            
            # Store metadata adjacent to ID
            self.metadata_store[chunk['id']] = {
                'text': chunk['text'],
                'doc_id': chunk['doc_id'],
                'page': chunk['page'],
                # Critical: don't store large fields here
            }
        
        self.index.add_with_ids(np.array(embeddings), np.array(ids))
    
    def search(self, query_embedding, k=50):
        # Single operation: vector search + metadata fetch
        distances, ids = self.index.search(query_embedding, k)
        
        # Fast in-memory metadata lookup (no DB call)
        results = [
            {
                'id': ids[i],
                'score': distances[i],
                'metadata': self.metadata_store[ids[i]]
            }
            for i in range(len(ids))
        ]
        return results

# Expected improvement: 100ms metadata fetch → <5ms
```

**Phase 3: Hybrid Search Parallelization (Expected: 300ms → 220ms)**

```python
import concurrent.futures

class ParallelHybridSearch:
    def __init__(self):
        self.vector_index = ColocatedIndex()
        self.bm25_index = BM25Index()
    
    def search(self, query: str, k=50):
        '''Run vector and BM25 search in parallel'''
        query_embedding = self.embed(query)
        
        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
            # Run both searches concurrently
            vector_future = executor.submit(
                self.vector_index.search, query_embedding, k
            )
            bm25_future = executor.submit(
                self.bm25_index.search, query, k
            )
            
            vector_results = vector_future.result()
            bm25_results = bm25_future.result()
        
        # Fusion
        return self.reciprocal_rank_fusion(vector_results, bm25_results)
```

**Phase 4: Embedding Caching (Expected: 220ms → 180ms)**

```python
from functools import lru_cache
import hashlib

class CachedEmbedding:
    def __init__(self):
        self.cache = {}  # In production: Redis
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    
    def embed(self, text: str):
        # Hash query for cache key
        cache_key = hashlib.md5(text.encode()).hexdigest()
        
        if cache_key in self.cache:
            return self.cache[cache_key]
        
        # Compute embedding
        embedding = self.embedding_model.encode([text])[0]
        
        # Cache (TTL: 1 hour)
        self.cache[cache_key] = embedding
        
        return embedding

# For repeated/similar queries, saves 50ms embedding time
```

**Phase 5: Two-Tier Retrieval (Expected: 180ms → <150ms)**

```python
class TwoTierRetrieval:
    '''Filter by metadata first, then vector search on subset'''
    
    def __init__(self):
        # Tier 1: Fast metadata filter (ElasticSearch, etc.)
        self.metadata_index = ElasticSearchIndex()
        
        # Tier 2: Vector search on filtered subset
        self.vector_indices = {}  # Partitioned by document type
    
    def search(self, query: str, filters: dict = None):
        # Tier 1: Metadata filter (10ms)
        # Filter by: document type, date range, author, category
        if filters:
            candidate_doc_ids = self.metadata_index.filter(filters)
            # Reduces search space: 1.5M → 100K vectors
        else:
            candidate_doc_ids = None
        
        # Tier 2: Vector search on subset
        if candidate_doc_ids:
            # Search only relevant partition
            doc_type = filters.get('doc_type')
            results = self.vector_indices[doc_type].search(query, k=50)
        else:
            results = self.main_index.search(query, k=50)
        
        return results

# Use case: "Find refund policy documents from 2023"
# Without filter: Search 1.5M vectors (30ms with IVF)
# With filter: Search 50K vectors (2ms)
```

**Final Architecture:**
```python
class ProductionRAG:
    def __init__(self):
        self.embedding_cache = CachedEmbedding()
        self.two_tier_search = TwoTierRetrieval()
    
    def retrieve(self, query: str, filters: dict = None, k: int = 5):
        # Step 1: Get/cache embedding (5ms cached, 50ms uncached)
        query_emb = self.embedding_cache.embed(query)
        
        # Step 2: Two-tier filtered search (10-30ms)
        results = self.two_tier_search.search(query, filters)
        
        # Step 3: In-memory re-ranking of top results (20ms)
        reranked = self.fast_rerank(query, results[:20])
        
        return reranked[:k]
```

**Performance Summary:**
| Stage | Before | After | Technique |
|-------|--------|-------|-----------|
| Embedding | 50ms | 5ms (cached) | LRU cache |
| Vector search | 600ms | 10-30ms | IVF + partitioning |
| Metadata fetch | 100ms | <5ms | Co-location |
| Re-ranking | 50ms | 20ms | Only top-20 |
| **Total P95** | **800ms** | **150ms** | **5.3x improvement** |

**Cost-Benefit Analysis:**
- Development time: 2-3 weeks
- Infrastructure: Minimal (FAISS is free, runs on same hardware)
- Accuracy impact: <2% (from approximate search)
- Maintenance: Low (no new dependencies)

**Monitoring:**
```python
metrics = {
    'p50_latency_ms': 80,
    'p95_latency_ms': 150,
    'p99_latency_ms': 220,
    'cache_hit_rate': 0.35,  # 35% of queries cached
    'avg_search_space': 50_000,  # Down from 1.5M
    'accuracy_vs_exhaustive': 0.98,  # 98% recall
}
```

**Key Insight:**
The biggest win comes from IVF indexing (20x speedup). Co-location and caching provide incremental gains. Always profile first to find the actual bottleneck.
        """,
    },
    {
        "level": "Staff",
        "question": "Design a production RAG evaluation framework that measures relevance, groundedness, faithfulness, and cost. Include both offline and online evaluation strategies, and explain how you'd catch regressions before they hit users.",
        "answer": """
**Complete RAG Evaluation Framework:**

**1. Metrics Taxonomy:**

```python
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class RAGMetrics:
    # Retrieval Quality
    retrieval_recall: float  # % of relevant docs retrieved
    retrieval_precision: float  # % of retrieved docs that are relevant
    retrieval_mrr: float  # Mean Reciprocal Rank
    retrieval_ndcg: float  # Normalized Discounted Cumulative Gain
    
    # Generation Quality
    answer_relevance: float  # Does answer address the question?
    answer_faithfulness: float  # Is answer supported by retrieved docs?
    answer_groundedness: float  # No hallucinations
    answer_completeness: float  # Covers all aspects of question
    
    # Business Metrics
    user_satisfaction: Optional[float]  # Thumbs up/down
    task_completion_rate: Optional[float]  # Did user get what they needed?
    follow_up_question_rate: Optional[float]  # Lower is better
    
    # Operational Metrics
    latency_p50_ms: float
    latency_p95_ms: float
    cost_per_query: float  # Embedding + LLM + infrastructure
    
    # Safety
    pii_leakage_detected: bool
    injection_attempt_blocked: bool
```

**2. Offline Evaluation (Pre-Deployment):**

```python
class OfflineRAGEvaluator:
    '''Comprehensive offline evaluation before deployment'''
    
    def __init__(self, test_dataset: List[dict]):
        '''
        test_dataset format:
        {
            "query": "What's the refund policy?",
            "ground_truth_answer": "30 days with receipt",
            "relevant_doc_ids": ["doc_123", "doc_456"],
            "expected_citations": ["doc_123"]
        }
        '''
        self.test_dataset = test_dataset
        self.results = []
    
    def evaluate_retrieval(self, rag_system) -> dict:
        '''Measure retrieval quality'''
        recalls = []
        precisions = []
        mrrs = []
        
        for case in self.test_dataset:
            # Retrieve documents
            retrieved = rag_system.retrieve(case['query'], k=10)
            retrieved_ids = [doc['id'] for doc in retrieved]
            relevant_ids = case['relevant_doc_ids']
            
            # Recall: % of relevant docs retrieved
            recall = len(set(retrieved_ids) & set(relevant_ids)) / len(relevant_ids)
            recalls.append(recall)
            
            # Precision: % of retrieved docs that are relevant
            precision = len(set(retrieved_ids) & set(relevant_ids)) / len(retrieved_ids)
            precisions.append(precision)
            
            # MRR: 1 / rank of first relevant doc
            for i, doc_id in enumerate(retrieved_ids, 1):
                if doc_id in relevant_ids:
                    mrrs.append(1 / i)
                    break
            else:
                mrrs.append(0)
        
        return {
            'recall@10': np.mean(recalls),
            'precision@10': np.mean(precisions),
            'mrr': np.mean(mrrs),
        }
    
    def evaluate_generation(self, rag_system) -> dict:
        '''Measure generation quality using LLM-as-judge'''
        
        relevance_scores = []
        faithfulness_scores = []
        groundedness_scores = []
        
        for case in self.test_dataset:
            # Generate answer
            retrieved_docs = rag_system.retrieve(case['query'], k=5)
            answer = rag_system.generate(case['query'], retrieved_docs)
            
            # Evaluate relevance (does answer address question?)
            relevance = self._evaluate_relevance(case['query'], answer)
            relevance_scores.append(relevance)
            
            # Evaluate faithfulness (is answer supported by docs?)
            faithfulness = self._evaluate_faithfulness(answer, retrieved_docs)
            faithfulness_scores.append(faithfulness)
            
            # Evaluate groundedness (no hallucinations)
            groundedness = self._evaluate_groundedness(
                answer, 
                retrieved_docs, 
                case['ground_truth_answer']
            )
            groundedness_scores.append(groundedness)
        
        return {
            'answer_relevance': np.mean(relevance_scores),
            'answer_faithfulness': np.mean(faithfulness_scores),
            'answer_groundedness': np.mean(groundedness_scores),
        }
    
    def _evaluate_relevance(self, query: str, answer: str) -> float:
        '''Use LLM to judge if answer is relevant to query'''
        prompt = f'''
        Query: {query}
        Answer: {answer}
        
        Is the answer relevant to the query? Rate 0.0 to 1.0.
        Consider:
        - Does it address the question asked?
        - Is it on-topic?
        - Does it provide useful information?
        
        Return ONLY a number between 0.0 and 1.0.
        '''
        
        score_str = llm.generate(prompt, temperature=0.0)
        return float(score_str.strip())
    
    def _evaluate_faithfulness(self, answer: str, docs: List[dict]) -> float:
        '''Check if answer claims are supported by retrieved docs'''
        prompt = f'''
        Retrieved Documents:
        {chr(10).join([f"[{i+1}] {d['text']}" for i, d in enumerate(docs)])}
        
        Answer: {answer}
        
        Are all claims in the answer supported by the documents?
        Rate 0.0 (unsupported) to 1.0 (fully supported).
        
        Return ONLY a number between 0.0 and 1.0.
        '''
        
        score_str = llm.generate(prompt, temperature=0.0)
        return float(score_str.strip())
    
    def _evaluate_groundedness(self, answer: str, docs: List[dict], ground_truth: str) -> float:
        '''Check for hallucinations by comparing to ground truth'''
        # Extract claims from answer
        answer_claims = self._extract_claims(answer)
        gt_claims = self._extract_claims(ground_truth)
        
        # Check if answer adds unsupported claims
        hallucinated_claims = []
        for claim in answer_claims:
            if not self._is_supported(claim, docs) and claim not in gt_claims:
                hallucinated_claims.append(claim)
        
        # Score: 1.0 if no hallucinations, decreases with each hallucination
        score = 1.0 - (len(hallucinated_claims) / max(len(answer_claims), 1))
        return max(0.0, score)
    
    def _extract_claims(self, text: str) -> List[str]:
        '''Extract factual claims from text'''
        # Use LLM to extract claims
        prompt = f'''
        Extract atomic factual claims from this text:
        {text}
        
        Return as a JSON list of strings.
        '''
        response = llm.generate(prompt, temperature=0.0)
        return json.loads(response)
    
    def _is_supported(self, claim: str, docs: List[dict]) -> bool:
        '''Check if claim is supported by documents'''
        docs_text = '\\n\\n'.join([d['text'] for d in docs])
        
        prompt = f'''
        Documents:
        {docs_text}
        
        Claim: {claim}
        
        Is this claim supported by the documents? Answer: yes or no
        '''
        
        response = llm.generate(prompt, temperature=0.0).lower()
        return 'yes' in response
    
    def run_full_evaluation(self, rag_system) -> RAGMetrics:
        '''Run complete evaluation suite'''
        retrieval_metrics = self.evaluate_retrieval(rag_system)
        generation_metrics = self.evaluate_generation(rag_system)
        
        # Measure operational metrics
        latencies = []
        costs = []
        
        for case in self.test_dataset[:100]:  # Sample for latency measurement
            start = time.time()
            result = rag_system.query(case['query'])
            latencies.append((time.time() - start) * 1000)
            costs.append(result['cost'])
        
        return RAGMetrics(
            retrieval_recall=retrieval_metrics['recall@10'],
            retrieval_precision=retrieval_metrics['precision@10'],
            retrieval_mrr=retrieval_metrics['mrr'],
            retrieval_ndcg=0.0,  # Implement if needed
            
            answer_relevance=generation_metrics['answer_relevance'],
            answer_faithfulness=generation_metrics['answer_faithfulness'],
            answer_groundedness=generation_metrics['answer_groundedness'],
            answer_completeness=0.0,  # Implement if needed
            
            user_satisfaction=None,  # Only available online
            task_completion_rate=None,
            follow_up_question_rate=None,
            
            latency_p50_ms=np.percentile(latencies, 50),
            latency_p95_ms=np.percentile(latencies, 95),
            cost_per_query=np.mean(costs),
            
            pii_leakage_detected=False,
            injection_attempt_blocked=False,
        )
```

**3. Online Evaluation (Production Monitoring):**

```python
class OnlineRAGMonitor:
    '''Real-time production monitoring'''
    
    def __init__(self):
        self.metrics_buffer = []
        self.baseline_metrics = self.load_baseline()
    
    def log_query(self, query: str, answer: str, retrieved_docs: List[dict], 
                  latency_ms: float, cost: float, user_feedback: Optional[dict] = None):
        '''Log every production query for analysis'''
        
        self.metrics_buffer.append({
            'timestamp': datetime.utcnow().isoformat(),
            'query': query,
            'answer': answer,
            'num_docs_retrieved': len(retrieved_docs),
            'latency_ms': latency_ms,
            'cost': cost,
            'user_feedback': user_feedback,
        })
        
        # Flush to database every 100 queries
        if len(self.metrics_buffer) >= 100:
            self.flush_metrics()
    
    def detect_regression(self, window_minutes: int = 60) -> Optional[dict]:
        '''Detect quality regressions in real-time'''
        
        # Get recent metrics
        recent_metrics = self.get_recent_metrics(window_minutes)
        
        # Compare to baseline
        alerts = []
        
        # Check latency regression
        current_p95 = np.percentile([m['latency_ms'] for m in recent_metrics], 95)
        if current_p95 > self.baseline_metrics['latency_p95'] * 1.2:  # 20% degradation
            alerts.append({
                'type': 'latency_regression',
                'current': current_p95,
                'baseline': self.baseline_metrics['latency_p95'],
                'severity': 'high'
            })
        
        # Check user satisfaction regression
        with_feedback = [m for m in recent_metrics if m['user_feedback']]
        if len(with_feedback) >= 20:  # Need minimum sample size
            satisfaction = np.mean([
                1.0 if m['user_feedback'].get('thumbs_up') else 0.0
                for m in with_feedback
            ])
            
            if satisfaction < self.baseline_metrics['user_satisfaction'] - 0.1:  # 10% drop
                alerts.append({
                    'type': 'satisfaction_regression',
                    'current': satisfaction,
                    'baseline': self.baseline_metrics['user_satisfaction'],
                    'severity': 'critical'
                })
        
        # Check cost spike
        avg_cost = np.mean([m['cost'] for m in recent_metrics])
        if avg_cost > self.baseline_metrics['avg_cost'] * 1.5:  # 50% increase
            alerts.append({
                'type': 'cost_spike',
                'current': avg_cost,
                'baseline': self.baseline_metrics['avg_cost'],
                'severity': 'medium'
            })
        
        return alerts if alerts else None
    
    def run_shadow_evaluation(self, sample_rate: float = 0.01):
        '''Continuously evaluate random sample in production'''
        
        for query_data in self.stream_queries():
            if random.random() < sample_rate:
                # Run full evaluation on this query
                eval_result = self.evaluate_single_query(
                    query_data['query'],
                    query_data['answer'],
                    query_data['retrieved_docs']
                )
                
                # Store for analysis
                self.store_evaluation(eval_result)
                
                # Check for anomalies
                if eval_result['faithfulness'] < 0.7:  # Threshold
                    self.alert_low_quality(query_data, eval_result)
```

**4. Pre-Deployment Regression Testing:**

```python
class RegressionTestSuite:
    '''Catch regressions before deployment'''
    
    def __init__(self, golden_dataset_path: str):
        # Golden dataset: curated high-quality examples
        self.golden_dataset = self.load_golden_dataset(golden_dataset_path)
        self.baseline_results = self.load_baseline_results()
    
    def run_regression_tests(self, new_rag_system) -> dict:
        '''Run before each deployment'''
        
        results = {
            'passed': [],
            'failed': [],
            'degraded': []
        }
        
        for test_case in self.golden_dataset:
            # Run new system
            new_answer = new_rag_system.query(test_case['query'])
            
            # Compare to baseline
            baseline_answer = self.baseline_results[test_case['id']]
            
            # Exact match test (for critical queries)
            if test_case.get('exact_match_required'):
                if new_answer == baseline_answer:
                    results['passed'].append(test_case['id'])
                else:
                    results['failed'].append({
                        'id': test_case['id'],
                        'reason': 'exact_match_failed',
                        'expected': baseline_answer,
                        'got': new_answer
                    })
            
            # Semantic similarity test
            else:
                similarity = self.compute_semantic_similarity(new_answer, baseline_answer)
                
                if similarity >= 0.95:
                    results['passed'].append(test_case['id'])
                elif similarity >= 0.85:
                    results['degraded'].append({
                        'id': test_case['id'],
                        'similarity': similarity,
                        'warning': 'slight_degradation'
                    })
                else:
                    results['failed'].append({
                        'id': test_case['id'],
                        'reason': 'semantic_drift',
                        'similarity': similarity
                    })
        
        # Pass/fail criteria
        pass_rate = len(results['passed']) / len(self.golden_dataset)
        degradation_rate = len(results['degraded']) / len(self.golden_dataset)
        
        return {
            'passed': pass_rate >= 0.95,  # 95% pass rate required
            'pass_rate': pass_rate,
            'degradation_rate': degradation_rate,
            'failures': results['failed'],
            'degradations': results['degraded']
        }
    
    def run_ab_test(self, new_system, traffic_percentage: float = 0.05):
        '''Gradual rollout with A/B testing'''
        
        # Route 5% of traffic to new system
        # Compare metrics between old and new
        
        ab_results = {
            'system_a': {'queries': [], 'metrics': []},
            'system_b': {'queries': [], 'metrics': []}
        }
        
        # Collect data for 24 hours
        # ...
        
        # Statistical significance test
        from scipy import stats
        
        satisfaction_a = [m['user_satisfaction'] for m in ab_results['system_a']['metrics']]
        satisfaction_b = [m['user_satisfaction'] for m in ab_results['system_b']['metrics']]
        
        t_stat, p_value = stats.ttest_ind(satisfaction_a, satisfaction_b)
        
        if p_value < 0.05 and np.mean(satisfaction_b) > np.mean(satisfaction_a):
            return {'decision': 'deploy', 'improvement': np.mean(satisfaction_b) - np.mean(satisfaction_a)}
        elif p_value < 0.05 and np.mean(satisfaction_b) < np.mean(satisfaction_a):
            return {'decision': 'rollback', 'degradation': np.mean(satisfaction_a) - np.mean(satisfaction_b)}
        else:
            return {'decision': 'inconclusive', 'p_value': p_value}
```

**5. Complete Deployment Pipeline:**

```
1. Offline Evaluation (Golden Dataset)
   ├─ Run regression tests
   ├─ Check pass rate >= 95%
   └─ If passed → Continue

2. Staging Environment (Shadow Mode)
   ├─ Mirror 100% of production traffic
   ├─ Compare metrics to production system
   └─ If no degradation → Continue

3. Canary Deployment (5% traffic)
   ├─ Route 5% real traffic to new system
   ├─ Monitor for 24 hours
   ├─ Check: latency, cost, satisfaction
   └─ If metrics good → Continue

4. Gradual Rollout
   ├─ 5% → 25% → 50% → 100%
   ├─ Monitor at each stage
   └─ Automatic rollback if regression detected

5. Post-Deployment Monitoring
   ├─ Continuous shadow evaluation (1% sample)
   ├─ Real-time regression detection
   └─ Weekly deep-dive analysis
```

**Key Metrics Dashboard:**
- Retrieval Recall/Precision (updated daily)
- Answer Faithfulness (sampled 1%)
- User Satisfaction (from feedback)
- P95 Latency, Cost/query
- Regression alerts (real-time)

**SLOs (Service Level Objectives):**
- Retrieval Recall: >= 90%
- Answer Faithfulness: >= 85%
- User Satisfaction: >= 80%
- P95 Latency: <= 200ms
- Cost per query: <= $0.01
        """,
    },
]

# Print every part-1 question with its answer, framed by 100-character rules.
for number, entry in enumerate(interview_questions_rag_part1, start=1):
    rule = '=' * 100
    print('\n' + rule)
    print('RAG SYSTEMS - Q{} [{} Level]'.format(number, entry['level']))
    print(rule)
    print('\n{}\n'.format(entry['question']))
    print('ANSWER:')
    print(entry['answer'])
    print()


: 

In [None]:
# Module recap: topic-by-topic takeaways, then the interview-question recap
# and the suggested next steps, each section separated by a full-width rule.
rule = "=" * 100

print("MODULE 2: RAG SYSTEMS - KEY TAKEAWAYS")
print(rule)

# Topic heading -> bullet points, printed in insertion order.
summary = {
    "Chunking Strategies": [
        "Sentence-window retrieval: Retrieve small chunks, return with context (40% improvement)",
        "Hierarchical chunking: Paragraph-level retrieval, section-level context (50% improvement)",
        "Smart overlap: Semantic coherence at boundaries prevents information loss",
        "High retrieval recall ≠ good answers; context completeness matters more",
    ],
    "Retrieval Optimization": [
        "FAISS IVF indexing: 20x speedup for large corpora (1.5M+ vectors)",
        "Metadata co-location: Avoid separate DB fetches, <5ms vs 100ms",
        "Embedding caching: 35% cache hit rate saves 50ms per cached query",
        "Two-tier retrieval: Filter by metadata first, reduces search space 15x",
        "Profile before optimizing: 80% of gains come from fixing the bottleneck",
    ],
    "Hybrid Search": [
        "BM25: Excels at keyword matching (technical terms, exact phrases)",
        "Semantic: Excels at concept matching (paraphrases, synonyms)",
        "Alpha tuning: 0.3 (keyword-heavy), 0.5 (balanced), 0.7 (concept-heavy)",
        "Reciprocal rank fusion: Simple and effective for combining scores",
    ],
    "Re-ranking & MMR": [
        "Two-stage retrieval: Fast bi-encoder (top-50) → Slow cross-encoder (top-5)",
        "Re-ranking improves precision by 10-30% with 20-50ms latency cost",
        "MMR (Maximal Marginal Relevance): Reduces redundancy, improves coverage",
        "Lambda parameter: 0.7-0.8 for good relevance/diversity balance",
    ],
    "Evaluation Framework": [
        "Retrieval metrics: Recall, Precision, MRR, NDCG",
        "Generation metrics: Relevance, Faithfulness, Groundedness, Completeness",
        "Use LLM-as-judge for generation quality (correlation with human: 0.85+)",
        "Offline: Golden dataset with 95% pass rate before deployment",
        "Online: Shadow evaluation (1% sample), real-time regression detection",
        "Deployment: Regression tests → Shadow → Canary (5%) → Gradual (100%)",
    ],
    "Production Principles": [
        "Measure everything: Latency (P50/P95/P99), cost, quality, user satisfaction",
        "Catch regressions early: Pre-deployment tests + real-time monitoring",
        "Optimize for bottlenecks: Profile first, optimize second",
        "Balance tradeoffs: Latency vs accuracy, cost vs quality",
    ],
}

for heading in summary:
    print(f"\n{heading}:")
    for item in summary[heading]:
        print("  - " + item)

# Closing sections share one layout: rule, blank line + title, then fixed lines.
closing_sections = (
    (
        "INTERVIEW QUESTIONS SUMMARY:",
        (
            "  - Chunking & Context: Incomplete answers despite high recall",
            "  - Performance: FAISS optimization from 800ms to <200ms P95",
            "  - Evaluation: Complete framework with offline and online evaluation",
            "  Total: 3 advanced questions (2 Senior, 1 Staff level)",
        ),
    ),
    (
        "NEXT STEPS:",
        (
            "  1. Implement sentence-window retrieval for better context",
            "  2. Profile your retrieval pipeline and optimize bottlenecks",
            "  3. Build golden dataset for regression testing (100+ examples)",
            "  4. Set up shadow evaluation for 1% of production traffic",
            "  5. Move to Module 3: LangChain (chains, agents, evaluation)",
        ),
    ),
)

for title, lines in closing_sections:
    print("\n" + rule)
    print("\n" + title)
    for line in lines:
        print(line)

print("\n" + rule)


### Section 2: Hybrid Search - Combining BM25 and Dense Vectors

Hybrid search improves retrieval by combining two complementary signals and fusing their rankings:
- **BM25 (Lexical)**: Keyword matching, good for exact terms
- **Dense Vectors (Semantic)**: Meaning-based, good for paraphrases
- **Fusion**: Merge the two result lists with Reciprocal Rank Fusion (RRF) or a weighted combination of normalized scores

In [None]:
from rank_bm25 import BM25Okapi
import numpy as np
from typing import List, Tuple, Dict
from sentence_transformers import SentenceTransformer

class HybridRetriever:
    '''Combines BM25 and dense retrieval for better results

    Call ``index`` once with the corpus, then any ``retrieve_*`` method.
    Results identify documents by their position in the list given to
    ``index`` and are ordered best match first.
    '''

    def __init__(self, embedding_model='all-MiniLM-L6-v2'):
        '''Load the embedding model; nothing is indexed yet.

        Args:
            embedding_model: Model name passed to ``SentenceTransformer``.
        '''
        self.embedding_model = SentenceTransformer(embedding_model)
        self.documents = []
        self.embeddings = None
        self.bm25 = None

    def index(self, documents: List[str]):
        '''Index documents for both BM25 and vector search

        Args:
            documents: Corpus to search; replaces any previous index.

        Raises:
            ValueError: If ``documents`` is empty.
        '''
        if len(documents) == 0:
            raise ValueError('documents must contain at least one document')
        self.documents = documents

        # BM25 index: lowercase + whitespace split, mirrored in retrieve_bm25
        tokenized = [doc.lower().split() for doc in documents]
        self.bm25 = BM25Okapi(tokenized)

        # Vector index
        print(f'Encoding {len(documents)} documents...')
        self.embeddings = self.embedding_model.encode(documents, show_progress_bar=True)

    @staticmethod
    def _require(index_part, name: str):
        '''Raise a descriptive error when retrieval runs before ``index``.'''
        if index_part is None:
            raise RuntimeError(f'{name} index is missing; call index() before retrieving')

    @staticmethod
    def _top_k(scores, top_k) -> List[Tuple[int, float]]:
        '''Return the ``top_k`` highest (position, score) pairs as built-in types.'''
        if top_k <= 0:
            return []
        scores = np.asarray(scores)
        order = np.argsort(scores)[::-1][:top_k]
        return [(int(i), float(scores[i])) for i in order]

    @staticmethod
    def _min_max(scores):
        '''Scale scores to [0, 1]; the epsilon keeps a constant array from dividing by zero.'''
        return (scores - scores.min()) / (scores.max() - scores.min() + 1e-10)

    def retrieve_bm25(self, query: str, top_k=5) -> List[Tuple[int, float]]:
        '''BM25 retrieval

        Returns:
            Up to ``top_k`` (document index, BM25 score) pairs, highest first.
            Documents sharing no term with the query can still appear, with
            score 0, when fewer than ``top_k`` documents match.

        Raises:
            RuntimeError: If ``index`` has not been called.
        '''
        self._require(self.bm25, 'BM25')
        scores = self.bm25.get_scores(query.lower().split())
        return self._top_k(scores, top_k)

    def retrieve_vector(self, query: str, top_k=5) -> List[Tuple[int, float]]:
        '''Dense vector retrieval

        Returns:
            Up to ``top_k`` (document index, cosine similarity) pairs, highest first.

        Raises:
            RuntimeError: If ``index`` has not been called.
        '''
        self._require(self.embeddings, 'Vector')
        embeddings = np.asarray(self.embeddings)
        query_embedding = np.asarray(self.embedding_model.encode([query])[0])

        # Cosine similarity. A zero-length vector would give 0/0 = NaN, and NaN
        # sorts to the top of a descending argsort; dividing by 1 instead
        # leaves its dot product (0) as the similarity.
        norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query_embedding)
        similarities = np.dot(embeddings, query_embedding) / np.where(norms == 0, 1.0, norms)

        return self._top_k(similarities, top_k)

    def retrieve_hybrid(self, query: str, top_k=5, alpha=0.5) -> List[Tuple[int, float, Dict]]:
        '''Hybrid retrieval with weighted fusion

        Args:
            alpha: Weight for BM25 (0=all vector, 1=all BM25)

        Returns:
            Up to ``top_k`` tuples of (document index, combined score,
            {'bm25', 'vector', 'combined'} score breakdown), highest first.
            Scores are min-max normalized over the ``2 * top_k`` candidates of
            each retriever; a document missing from one candidate list counts
            as 0 for that retriever.
        '''
        # Get results from both (over-fetch so fusion can promote candidates)
        bm25_results = self.retrieve_bm25(query, top_k * 2)
        vector_results = self.retrieve_vector(query, top_k * 2)
        if not bm25_results or not vector_results:
            return []  # top_k <= 0: nothing to normalize or rank

        # Normalize scores to [0,1]; BM25 is left untouched when nothing matched
        bm25_scores = np.array([s for _, s in bm25_results])
        if bm25_scores.max() > 0:
            bm25_scores = self._min_max(bm25_scores)
        vector_scores = self._min_max(np.array([s for _, s in vector_results]))

        # Combine scores
        combined = {}
        for (idx, _), norm_score in zip(bm25_results, bm25_scores):
            combined[idx] = {'bm25': float(norm_score), 'vector': 0.0}

        for (idx, _), norm_score in zip(vector_results, vector_scores):
            if idx in combined:
                combined[idx]['vector'] = float(norm_score)
            else:
                combined[idx] = {'bm25': 0.0, 'vector': float(norm_score)}

        for parts in combined.values():
            parts['combined'] = alpha * parts['bm25'] + (1 - alpha) * parts['vector']

        # Sort and return top-k
        ranked = sorted(combined.items(), key=lambda x: x[1]['combined'], reverse=True)[:top_k]
        return [(idx, scores['combined'], scores) for idx, scores in ranked]

    def retrieve_rrf(self, query: str, top_k=5, k=60) -> List[Tuple[int, float]]:
        '''Reciprocal Rank Fusion (RRF)

        RRF formula: score = sum(1 / (k + rank_i))
        where rank_i is the 1-based rank in result set i

        Args:
            k: Smoothing constant; larger values flatten rank differences.

        Returns:
            Up to ``top_k`` (document index, RRF score) pairs, highest first.
        '''
        bm25_results = self.retrieve_bm25(query, top_k * 2)
        vector_results = self.retrieve_vector(query, top_k * 2)

        # BM25 candidates without a positive score carry no lexical evidence
        # and their order among themselves is arbitrary, so they earn no rank
        # credit. Results are sorted descending, so the kept hits are a prefix
        # and their ranks are unchanged.
        lexical_hits = [(idx, score) for idx, score in bm25_results if score > 0]

        # Calculate RRF scores
        rrf_scores = {}
        for ranking in (lexical_hits, vector_results):
            for rank, (idx, _) in enumerate(ranking, start=1):
                rrf_scores[idx] = rrf_scores.get(idx, 0.0) + 1.0 / (k + rank)

        # Sort by RRF score
        return sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]

# Demo: index a tiny corpus and print, for each query, the top-3 hits of every
# fusion strategy side by side.
test_docs = [
    'Python is a high-level programming language with dynamic typing.',
    'Machine learning models require large datasets for training.',
    'The quick brown fox jumps over the lazy dog.',
    'Natural language processing enables computers to understand human language.',
    'Deep learning uses neural networks with multiple layers.',
    'Python is widely used for data science and ML applications.',
    'Vector databases store embeddings for similarity search.',
    'BM25 is a ranking function used in information retrieval.',
    'Transformers are the architecture behind modern LLMs.',
    'RAG combines retrieval with generation for better accuracy.',
]

print('Initializing Hybrid Retriever...')
retriever = HybridRetriever()
retriever.index(test_docs)

test_queries = [
    'Python machine learning',
    'neural networks NLP',
    'information retrieval ranking',
]

print('\n' + '=' * 90)
for query in test_queries:
    print(f'\nQuery: {query}')
    print('-' * 90)

    # Run all four strategies before printing anything, then list each one
    # under its heading. Hybrid hits are (idx, score, breakdown) triples and
    # RRF hits are (idx, score) pairs; only the first two fields are shown.
    comparisons = [
        ('BM25 Only:', retriever.retrieve_hybrid(query, top_k=3, alpha=1.0)),
        ('\nVector Only:', retriever.retrieve_hybrid(query, top_k=3, alpha=0.0)),
        ('\nHybrid (50/50):', retriever.retrieve_hybrid(query, top_k=3, alpha=0.5)),
        ('\nRRF:', retriever.retrieve_rrf(query, top_k=3)),
    ]

    for heading, hits in comparisons:
        print(heading)
        for position, hit in enumerate(hits, 1):
            doc_idx, score = hit[0], hit[1]
            print(f'  {position}. [{score:.3f}] {test_docs[doc_idx][:60]}...')

print('\n' + '=' * 90)
print('KEY INSIGHT: Hybrid search often outperforms single method by combining strengths')

## Interview Questions: RAG Systems

### For Senior/Staff Engineers

These questions test production RAG design and optimization.

In [None]:
# Interview questions with Markdown answers. Each entry has 'level',
# 'question' and 'answer'. String delimiters are chosen per entry so that
# quotes inside the text cannot terminate the literal: the second question
# contains an apostrophe and its answer embeds '''-style docstrings, so both
# use double-quote delimiters.
rag_interview_questions = [
    {
        'level': 'Senior',
        'question': 'Your RAG system has 40% precision but 90% recall. Users complain about too many irrelevant chunks. Walk through your debugging and optimization process.',
        'answer': '''
**Problem Analysis:**
High recall but low precision means we're retrieving too many chunks, including many irrelevant ones.

**Root Causes:**
1. **top_k too high**: Retrieving 10+ chunks when only 2-3 are relevant
2. **Poor embedding model**: Not capturing semantic nuances
3. **Bad chunking**: Chunks too small or too large, losing context
4. **No reranking**: Taking vector similarity at face value
5. **Retrieval threshold too low**: Including low-confidence matches

**Systematic Debugging (3-day process):**

**Day 1: Measure Current Performance**
```python
def evaluate_retrieval(test_cases: List[dict]):
    metrics = {
        'precision_at_k': [],
        'recall_at_k': [],
        'mrr': [],  # Mean Reciprocal Rank
        'ndcg': []  # Normalized Discounted Cumulative Gain
    }
    
    for case in test_cases:
        query = case['query']
        relevant_doc_ids = set(case['relevant_docs'])
        
        # Current retrieval
        retrieved = retriever.retrieve(query, top_k=10)
        retrieved_ids = [r['doc_id'] for r in retrieved]
        
        # Precision@k
        relevant_retrieved = len(set(retrieved_ids[:k]) & relevant_doc_ids)
        metrics['precision_at_k'].append(relevant_retrieved / k)
        
        # Recall@k
        metrics['recall_at_k'].append(relevant_retrieved / len(relevant_doc_ids))
        
        # MRR: 1 / rank_of_first_relevant
        for i, doc_id in enumerate(retrieved_ids, 1):
            if doc_id in relevant_doc_ids:
                metrics['mrr'].append(1 / i)
                break
    
    return {k: np.mean(v) for k, v in metrics.items()}

# Baseline
baseline = evaluate_retrieval(test_set)
print(f"Baseline P@5: {baseline['precision_at_k']:.2%}")
print(f"Baseline Recall: {baseline['recall_at_k']:.2%}")
```

**Day 2: Try Quick Wins**

**Fix 1: Reduce top_k (Immediate, +15% precision)**
```python
# Before: top_k=10
# After: top_k=5, only high-confidence matches
retrieved = retriever.retrieve(query, top_k=5)
```

**Fix 2: Add Similarity Threshold (+20% precision)**
```python
def retrieve_with_threshold(query: str, top_k=5, min_similarity=0.7):
    results = retriever.retrieve(query, top_k=top_k * 2)
    
    # Filter by threshold
    filtered = [r for r in results if r['similarity'] >= min_similarity]
    
    # If too few, lower threshold slightly
    if len(filtered) < 2:
        filtered = results[:2]  # Always return at least 2
    
    return filtered[:top_k]
```

**Fix 3: Implement Reranking (+30% precision)**
```python
from sentence_transformers import CrossEncoder

class Reranker:
    def __init__(self):
        # Cross-encoder is more accurate but slower
        self.model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    
    def rerank(self, query: str, documents: List[dict], top_k=5) -> List[dict]:
        # Over-retrieve with vector search
        candidates = retriever.retrieve(query, top_k=top_k * 3)
        
        # Rerank with cross-encoder
        pairs = [(query, doc['text']) for doc in candidates]
        scores = self.model.predict(pairs)
        
        # Sort by reranker scores
        for doc, score in zip(candidates, scores):
            doc['rerank_score'] = score
        
        ranked = sorted(candidates, key=lambda x: x['rerank_score'], reverse=True)
        return ranked[:top_k]

reranker = Reranker()
reranked = reranker.rerank(query, initial_results, top_k=5)
```

**Day 3: Improve Chunking & Embeddings**

**Fix 4: Better Chunking Strategy (+10% both metrics)**
```python
# Before: Fixed 512 tokens
# After: Semantic chunking with overlap

class SemanticChunker:
    def __init__(self, target_size=400, overlap=50):
        self.target_size = target_size
        self.overlap = overlap
    
    def chunk(self, text: str) -> List[str]:
        # Split by paragraphs first
        paragraphs = text.split('\\n\\n')
        
        chunks = []
        current_chunk = []
        current_size = 0
        
        for para in paragraphs:
            para_size = len(para)
            
            if current_size + para_size > self.target_size and current_chunk:
                # Create chunk
                chunks.append('\\n\\n'.join(current_chunk))
                
                # Overlap: keep last paragraph
                if len(current_chunk) > 1:
                    current_chunk = current_chunk[-1:]
                    current_size = len(current_chunk[0])
                else:
                    current_chunk = []
                    current_size = 0
            
            current_chunk.append(para)
            current_size += para_size
        
        if current_chunk:
            chunks.append('\\n\\n'.join(current_chunk))
        
        return chunks
```

**Fix 5: Use Domain-Specific Embeddings (+15% both metrics)**
```python
# Option A: Fine-tune embeddings on your domain
from sentence_transformers import SentenceTransformer, InputExample, losses

def finetune_embeddings(train_examples: List[tuple]):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    # Create training examples
    examples = []
    for query, pos_doc, neg_doc in train_examples:
        examples.append(InputExample(texts=[query, pos_doc], label=1.0))
        examples.append(InputExample(texts=[query, neg_doc], label=0.0))
    
    # Train
    train_loss = losses.CosineSimilarityLoss(model)
    model.fit(
        train_objectives=[(examples, train_loss)],
        epochs=3,
        warmup_steps=100
    )
    
    return model

# Option B: Use larger, better embedding model
# 'all-mpnet-base-v2' or 'intfloat/e5-large-v2'
```

**Combined Solution:**
```python
class OptimizedRAG:
    def __init__(self):
        self.chunker = SemanticChunker(target_size=400)
        self.embedder = SentenceTransformer('all-mpnet-base-v2')  # Better model
        self.reranker = Reranker()
        self.min_similarity = 0.7
    
    def retrieve(self, query: str, top_k=5):
        # Step 1: Over-retrieve
        candidates = vector_search(query, top_k=top_k * 3)
        
        # Step 2: Filter by threshold
        filtered = [c for c in candidates if c['similarity'] >= self.min_similarity]
        
        # Step 3: Rerank
        reranked = self.reranker.rerank(query, filtered, top_k=top_k)
        
        return reranked
```

**Expected Results:**
- Precision@5: 40% → 85% (+45%)
- Recall: 90% → 75% (slight drop acceptable)
- User satisfaction: Much higher (fewer irrelevant results)

**Key Takeaway:**
High recall is easy (just retrieve more), but production needs high precision.
Reranking is the most impactful single improvement.
        ''',
    },
    {
        'level': 'Senior',
        'question': "Design a multi-tenant RAG system where Company A cannot see Company B's documents, but some documents are shared across tenants. Include performance considerations.",
        'answer': """
**Multi-Tenant RAG Architecture:**

**1. Data Model:**
```python
@dataclass
class Document:
    doc_id: str
    text: str
    embedding: np.ndarray
    visibility: List[str]  # ['tenant_a', 'public']
    created_by_tenant: str
    access_level: str  # 'private', 'shared', 'public'
```

**2. Namespace-Based Isolation:**
```python
class MultiTenantVectorDB:
    '''Tenant isolation via namespaces + metadata filtering'''
    
    def __init__(self):
        self.client = chromadb.Client()
        # Option A: One collection per tenant (simple but doesn't scale)
        # Option B: One collection with metadata filtering (recommended)
        self.collection = self.client.get_or_create_collection('multi_tenant_docs')
    
    def ingest(self, doc: Document, tenant_id: str):
        '''Ingest with tenant metadata'''
        metadata = {
            'tenant_id': tenant_id,
            'visibility': ','.join(doc.visibility),
            'access_level': doc.access_level,
            'doc_id': doc.doc_id,
        }
        
        self.collection.add(
            ids=[doc.doc_id],
            embeddings=[doc.embedding.tolist()],
            documents=[doc.text],
            metadatas=[metadata]
        )
    
    def retrieve(self, query: str, tenant_id: str, top_k=5) -> List[dict]:
        '''Retrieve with tenant isolation'''
        query_embedding = embed(query)
        
        # Over-retrieve to account for filtering
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k * 5  # Over-retrieve
        )
        
        # Filter by tenant access
        filtered = []
        for i in range(len(results['ids'][0])):
            metadata = results['metadatas'][0][i]
            visibility = set(metadata['visibility'].split(','))
            
            # Can access if:
            # 1. Created by this tenant
            # 2. Explicitly shared with this tenant
            # 3. Public
            if (metadata['tenant_id'] == tenant_id or 
                tenant_id in visibility or 
                'public' in visibility):
                
                filtered.append({
                    'text': results['documents'][0][i],
                    'metadata': metadata,
                    'similarity': 1 - results['distances'][0][i]
                })
                
                if len(filtered) >= top_k:
                    break
        
        return filtered
```

**3. Performance Optimization:**

**Problem:** Over-retrieving then filtering is slow at scale.

**Solution A: Index Partitioning**
```python
class PartitionedVectorDB:
    '''Separate indices per tenant for performance'''
    
    def __init__(self):
        self.tenant_collections = {}  # tenant_id -> collection
        self.shared_collection = None  # For shared/public docs
    
    def get_or_create_tenant_collection(self, tenant_id: str):
        if tenant_id not in self.tenant_collections:
            self.tenant_collections[tenant_id] = chromadb.Client().get_or_create_collection(
                f'tenant_{tenant_id}'
            )
        return self.tenant_collections[tenant_id]
    
    def retrieve(self, query: str, tenant_id: str, top_k=5):
        query_embedding = embed(query)
        
        # Parallel retrieval from:
        # 1. Tenant's private collection
        # 2. Shared collection
        private_results = self.tenant_collections[tenant_id].query(
            query_embeddings=[query_embedding],
            n_results=top_k
        )
        
        shared_results = self.shared_collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k,
            where={'visibility': tenant_id}  # Pre-filter
        )
        
        # Merge and rerank
        all_results = merge_results(private_results, shared_results)
        return all_results[:top_k]
```

**Solution B: Caching with Tenant Isolation**
```python
import hashlib
from functools import lru_cache

class TenantAwareCache:
    '''Cache with tenant isolation'''
    
    def __init__(self, redis_client):
        self.redis = redis_client
        self.ttl = 3600  # 1 hour
    
    def cache_key(self, query: str, tenant_id: str) -> str:
        '''Generate cache key with tenant context'''
        query_hash = hashlib.sha256(query.encode()).hexdigest()[:16]
        return f'retrieve:{tenant_id}:{query_hash}'
    
    def get(self, query: str, tenant_id: str) -> Optional[List[dict]]:
        key = self.cache_key(query, tenant_id)
        cached = self.redis.get(key)
        if cached:
            return json.loads(cached)
        return None
    
    def set(self, query: str, tenant_id: str, results: List[dict]):
        key = self.cache_key(query, tenant_id)
        self.redis.setex(key, self.ttl, json.dumps(results))
    
    def invalidate_tenant(self, tenant_id: str):
        '''Invalidate all cache for tenant (e.g., after new doc upload)'''
        pattern = f'retrieve:{tenant_id}:*'
        keys = self.redis.keys(pattern)
        if keys:
            self.redis.delete(*keys)
```

**4. Audit Logging:**
```python
class TenantAuditLog:
    '''Track who accessed what documents'''
    
    def __init__(self):
        self.logs = []
    
    def log_access(self, tenant_id: str, query: str, retrieved_docs: List[str]):
        self.logs.append({
            'timestamp': datetime.utcnow().isoformat(),
            'tenant_id': tenant_id,
            'query_hash': hashlib.sha256(query.encode()).hexdigest()[:16],
            'num_docs': len(retrieved_docs),
            'doc_ids': retrieved_docs,
        })
    
    def detect_unauthorized_access(self):
        '''Detect attempts to access other tenant's data'''
        # Analyze patterns for anomalies
        pass
```

**5. Complete System:**
```python
class MultiTenantRAG:
    def __init__(self):
        self.vector_db = PartitionedVectorDB()
        self.cache = TenantAwareCache(redis_client)
        self.audit_log = TenantAuditLog()
        self.rate_limiter = TenantRateLimiter()
    
    def retrieve(self, query: str, tenant_id: str, user_id: str, top_k=5):
        # Rate limiting per tenant
        if not self.rate_limiter.allow(tenant_id):
            raise RateLimitExceeded()
        
        # Check cache
        cached = self.cache.get(query, tenant_id)
        if cached:
            self.audit_log.log_access(tenant_id, query, [d['doc_id'] for d in cached])
            return cached
        
        # Retrieve with tenant isolation
        results = self.vector_db.retrieve(query, tenant_id, top_k)
        
        # Cache results
        self.cache.set(query, tenant_id, results)
        
        # Audit log
        self.audit_log.log_access(tenant_id, query, [d['doc_id'] for d in results])
        
        return results
```

**Performance Benchmarks:**
- Latency: < 100ms (with cache), < 500ms (without cache)
- Throughput: 1000+ queries/sec
- Cost: $0.001 per query

**Security Checklist:**
- [x] Tenant isolation via metadata/namespaces
- [x] Audit logging for compliance
- [x] Rate limiting per tenant
- [x] Cache isolation (tenant_id in cache key)
- [x] No cross-tenant data leakage
- [x] Shared docs explicitly marked
        """,
    },
]

# Print each question with its answer, framed by 100-character rules.
# The f-strings are double-quoted so the single-quoted keys and literals inside
# the replacement fields parse on every Python 3 version (reusing the enclosing
# quote inside a field is only legal from Python 3.12).
for i, qa in enumerate(rag_interview_questions, 1):
    print(f"\n{'=' * 100}")
    print(f"Q{i} [{qa['level']} Level]")
    print('=' * 100)
    print(f"\n{qa['question']}\n")
    print('ANSWER:')
    print(qa['answer'])
    print()