# RAG in Action - Part 2: Production Qdrant Demo

Building production-grade RAG with Docker Qdrant, metadata filtering, and persistent vector storage

## Step 0: Start Qdrant Server

In [None]:
import os
# Runs in the first cell, before the embedding model is created further down.
# NOTE(review): presumably silences the HuggingFace tokenizers parallelism/fork
# warning inside notebooks — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
import subprocess
import time
import requests

def start_qdrant_server():
    """Start Qdrant with Docker Compose and wait until its HTTP endpoint answers.

    Runs ``docker-compose up -d`` in the current working directory, then polls
    ``http://localhost:6333`` up to 10 times, pausing 2 seconds between tries.

    Returns:
        bool: True as soon as the server answers with HTTP 200. False when
        docker-compose is missing, exits with a non-zero code, or the server
        never answers within the polling window.
    """
    print("🚀 Starting Qdrant server...")

    try:
        # Run Docker Compose
        result = subprocess.run(
            ["docker-compose", "up", "-d"],
            capture_output=True,
            text=True
        )
    except FileNotFoundError:
        # The executable is not installed or not on PATH.
        print("⚠️ docker-compose command not found.")
        print("❌ Server startup failed")
        return False

    if result.returncode != 0:
        # Show why Compose failed instead of discarding the captured stderr.
        print(f"   Output: {result.stderr}")
        print("❌ Server startup failed")
        return False

    print("✅ Qdrant server started!")

    url = "http://localhost:6333"
    attempts = 10
    for attempt in range(1, attempts + 1):
        try:
            # The timeout keeps a half-open socket from hanging the cell.
            response = requests.get(url, timeout=5)
            if response.status_code == 200:
                print(f"🌐 Qdrant server connected successfully! ({url})")
                return True
        except requests.exceptions.RequestException:
            # Expected while the container is still booting; retry below.
            pass

        print(f"⏳ Waiting for connection... ({attempt}/{attempts})")
        # Wait after a refused connection and after a non-200 answer alike;
        # skip the pause once no further attempt will follow.
        if attempt < attempts:
            time.sleep(2)

    print("❌ Server startup failed")
    return False

# Start Qdrant server; server_ready is True only if the HTTP endpoint answered
# with status 200 during polling, otherwise False.
server_ready = start_qdrant_server()

## Step 1: Reproduce Part 1 Limitations

In [None]:
# First, let's set up basic components like Part 1
import sys
from pathlib import Path

# Add parent directory to path
# Path() is the current working directory, so this appends the cwd's parent;
# the `data` package import on the next line depends on that entry.
sys.path.append(str(Path().absolute().parent))
from data.download_data import download_apple_10k

# Download Apple 10-K report (reuse from Part 1)
# NOTE(review): download_apple_10k() presumably returns the local path of the
# PDF — the value is handed to PyPDFLoader in the next cell; confirm.
file_path = download_apple_10k()
print(f"Downloaded: {file_path}")

In [None]:
# Load and process documents (same as Part 1)
import torch
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Check device availability
# NOTE(review): only MPS is probed here; a CUDA GPU is never selected and the
# code falls through to 'cpu' on such machines.
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
print(f"Using device: {device}")

# Load PDF
# The length of the list returned by load() is reported as the page count.
loader = PyPDFLoader(file_path)
documents = loader.load()
print(f"Loaded {len(documents)} pages")

# Split documents into chunks
# NOTE(review): chunk_size / chunk_overlap are presumably measured in
# characters — confirm against the splitter's documentation.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
# `chunks` is consumed by the in-memory vector store cell that follows.
chunks = text_splitter.split_documents(documents)
print(f"Created {len(chunks)} chunks")

In [None]:
print("🚫 Reproducing Part 1 Limitations")
print("=" * 50)

from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Qdrant

# Embedding model; `device` comes from the previous cell.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': device},
    encode_kwargs={'normalize_embeddings': True}
)

# Part 1 approach: In-memory vector store
# Embeds `chunks` and stores them in a collection named "part1_memory".
memory_store = Qdrant.from_documents(
    chunks,
    embeddings,
    location=":memory:",  # ← This data will be lost!
    collection_name="part1_memory"
)

print("✅ In-memory vector store created")
print(f"📊 Current vector count: {memory_store.client.count(collection_name='part1_memory').count}")

# Display memory usage
# psutil.Process() with no argument refers to the current (notebook kernel)
# process, so the figure covers the whole kernel, not only the vector store.
import psutil
process = psutil.Process()
# rss is divided by 1024 twice and reported with an "MB" label.
memory_usage = process.memory_info().rss / 1024 / 1024
print(f"💾 Current memory usage: {memory_usage:.1f} MB")

# Instruct the reader to restart the kernel before running the next cell.
print("\n" + "="*50)
print("⚠️ ACTION REQUIRED: Please restart the kernel now!")
print("   (Jupyter menu: Kernel → Restart)")
print("   Then, run the cell below.")
print("="*50)

### ⚠️ Kernel Restart Required

**Perform the following steps now:**

1. Click `Kernel → Restart` in the Jupyter menu
2. After the kernel restarts, **skip the cells above** and run only the cell below
3. Verify whether the data is truly lost!

> 💡 **Key Point**: The `:memory:` approach stores data only in Python process memory, so all data is completely lost when the kernel restarts (process terminates).

In [None]:
print("🔍 Verifying Data Persistence After Kernel Restart")
print("=" * 50)

from qdrant_client import QdrantClient

# Each QdrantClient(location=":memory:") owns a private store that starts
# empty. The store built by the earlier cell lived only inside the previous
# kernel process, so a new client has nothing to attach to.
client_after_restart = QdrantClient(location=":memory:")

# The lookup below fails on a fresh in-memory client whether or not the kernel
# was restarted, so it cannot detect a skipped restart. The survival of the
# `memory_store` variable from the earlier cell can.
if "memory_store" in globals():
    print("💡 'memory_store' is still defined: You didn't restart the kernel, or you re-ran the cells above.")

try:
    # Attempt to retrieve previously created collection info
    collection_info = client_after_restart.get_collection(collection_name="part1_memory")
    print(f"✅ Collection found! Vector count: {collection_info.points_count}")

except Exception as e:
    # Broad on purpose: this demo cell only reports the failure type.
    print(f"❌ Error occurred: Cannot access collection 'part1_memory'.")
    print(f"   Reason: In-memory database was completely reset after kernel restart.")
    print(f"   Error type: {type(e).__name__}")
    print("💡 This is proof that the data is not persistent!")

## Step 2: Enhanced Document Processing with Metadata

In [None]:
# Repeats the setup from Step 1 so this cell works in a freshly restarted kernel.
import sys
from pathlib import Path

# Path() is the current working directory; its parent is appended so that the
# `data` package import below resolves.
sys.path.append(str(Path().absolute().parent))
from data.download_data import download_apple_10k

# NOTE(review): presumably returns the local PDF path — it is passed to
# enrich_documents_with_metadata() in the next cell; confirm.
file_path = download_apple_10k()
print(f"Downloaded: {file_path}")

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
def _classify_section(text):
    """Return a coarse section label for *text* based on keyword matches."""
    lowered = text.lower()
    # Order matters: the first rule with any matching keyword wins. Matching is
    # plain substring search, so e.g. 'risk' also matches inside longer words.
    rules = (
        ("Financial Performance", ('revenue', 'sales', 'net income')),
        ("Risk Factors", ('risk', 'uncertainty', 'factor')),
        ("Business Overview", ('business', 'product', 'services')),
    )
    for label, keywords in rules:
        if any(keyword in lowered for keyword in keywords):
            return label
    return "Other"


def _relative_source(source, base_dir):
    """Return *source* relative to *base_dir*, or unchanged if it is not inside it."""
    try:
        return str(Path(source).relative_to(base_dir))
    except ValueError:
        # relative_to() raises when `source` does not live under `base_dir`
        # (e.g. a relative path or a file stored elsewhere); keep it as-is.
        return str(source)


def enrich_documents_with_metadata(pdf_path, *, doc_type='10-K', year=2023,
                                   company='Apple Inc.'):
    """Load a PDF, split it into chunks, and attach filterable metadata.

    Args:
        pdf_path: Path of the PDF to load.
        doc_type: Value stored under the ``doc_type`` metadata key.
        year: Value stored under the ``year`` metadata key.
        company: Value stored under the ``company`` metadata key.

    Returns:
        list: The chunks produced by the text splitter. Each chunk's metadata
        gains ``section`` (keyword heuristic), ``page_number`` (copied from the
        loader's ``page`` key, 0 when absent), ``doc_type``, ``year``,
        ``company`` and ``file_name``.
    """
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()

    print(f"📚 Loaded pages: {len(documents)}")

    # `source` is rewritten relative to the directory two levels above the
    # current working directory.
    # NOTE(review): presumably done to avoid storing machine-specific absolute
    # paths — confirm.
    base_dir = Path().absolute().parent.parent

    for doc in documents:
        source = doc.metadata.get('source')
        if source is not None:
            doc.metadata['source'] = _relative_source(source, base_dir)

    # Split into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    )
    chunks = text_splitter.split_documents(documents)

    # Enrich metadata (in place; the same chunk objects are returned)
    file_name = Path(pdf_path).name
    for chunk in chunks:
        chunk.metadata.update({
            'section': _classify_section(chunk.page_content),
            'page_number': chunk.metadata.get('page', 0),
            'doc_type': doc_type,
            'year': year,
            'company': company,
            'file_name': file_name
        })

    return chunks

# Generate metadata-enriched chunks
enriched_chunks = enrich_documents_with_metadata(file_path)
print(f"✅ Metadata enrichment complete: {len(enriched_chunks)} chunks")

from collections import Counter

# Extract 'section' metadata from each chunk into a list
sections = [chunk.metadata['section'] for chunk in enriched_chunks]

# Count each section using Counter
section_counts = Counter(sections)

print("\n📊 Chunk count by section (debugging):")
for section, count in section_counts.items():
    print(f"   - {section}: {count}")

# Check metadata example
# Show the 11th chunk when there is one; otherwise fall back to the last chunk
# so short documents do not raise IndexError. Skipped when there are no chunks.
if enriched_chunks:
    sample_chunk = enriched_chunks[min(10, len(enriched_chunks) - 1)]
    print(f"\n📋 Metadata example:")
    for key, value in sample_chunk.metadata.items():
        print(f"   {key}: {value}")

## Step 3: Create Production Qdrant Collection

In [None]:
# Set up persistent storage Qdrant collection
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

# Connect to Qdrant client
client = QdrantClient(host="localhost", port=6333)
# Constructing the client does not by itself guarantee the server is reachable;
# issue one real request so a dead server fails here, not further down.
client.get_collections()
print("🔗 Qdrant server connection successful!")

# Create production-grade collection
collection_name = "apple_10k_production"

# Delete existing collection if exists (for demo). Checking first means the
# "deleted" message is only printed when something was actually removed, and
# real errors (e.g. connection problems) are no longer silently swallowed.
if client.collection_exists(collection_name):
    client.delete_collection(collection_name)
    print(f"🗑️ Existing collection '{collection_name}' deleted")

client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(
        size=384,  # all-MiniLM-L6-v2 dimension
        distance=Distance.COSINE,  # Cosine similarity
        # HNSW optimization parameters (Part 2 core settings)
        hnsw_config={
            "m": 16,  # Number of connections
            "ef_construct": 100,  # Search depth during construction
            "full_scan_threshold": 10000,  # Full scan threshold
            "on_disk": False  # In-memory index (faster search)
        }
    ),
    # Segment optimizer settings. These tune how vector segments are stored
    # and indexed; they do not create payload (metadata) indexes.
    optimizers_config={
        "deleted_threshold": 0.2,
        "vacuum_min_vector_number": 1000,
        "default_segment_number": 0,
        "max_segment_size": None,
        "memmap_threshold": 1000000,
        "indexing_threshold": 20000,
        "flush_interval_sec": 5,
        "max_optimization_threads": None
    }
)

print(f"✅ Production collection '{collection_name}' created!")

# Check collection info
collection_info = client.get_collection(collection_name)
print(f"📊 Vector dimension: {collection_info.config.params.vectors.size}")
print(f"📏 Distance metric: {collection_info.config.params.vectors.distance}")

## Step 4: Index Documents to Qdrant with Metadata

In [None]:
from qdrant_client.models import PointStruct
from tqdm import tqdm
import uuid
from langchain_huggingface import HuggingFaceEmbeddings
import torch

def index_documents_to_qdrant(client, collection_name, chunks, embeddings):
    """Embed chunks in batches and upsert them, with metadata, into Qdrant.

    Args:
        client: Qdrant client used for ``upsert`` and ``get_collection``.
        collection_name: Name of the target collection.
        chunks: Chunk documents exposing ``page_content`` and ``metadata``.
        embeddings: Embedding model exposing ``embed_documents``.

    Returns:
        The collection's ``points_count`` as reported by the server after the
        upload.
    """
    print("🔄 Starting document vectorization and Qdrant indexing...")

    chunks = list(chunks)
    batch_size = 50
    # Metadata keys copied verbatim into each point's payload.
    payload_keys = ("section", "page_number", "doc_type", "year", "company", "file_name")

    for start in tqdm(range(0, len(chunks), batch_size), desc="Vectorization progress"):
        batch = chunks[start:start + batch_size]

        # Embed the whole batch in one call with the document-side API rather
        # than one embed_query() call per chunk.
        vectors = embeddings.embed_documents([chunk.page_content for chunk in batch])

        # Create Qdrant Points (ID, vector, metadata)
        points = [
            PointStruct(
                id=str(uuid.uuid4()),  # Unique ID
                vector=vector,
                payload={
                    "page_content": chunk.page_content,
                    **{key: chunk.metadata.get(key) for key in payload_keys},
                }
            )
            for chunk, vector in zip(batch, vectors)
        ]

        # Upload one batch per request
        client.upsert(collection_name=collection_name, points=points)

    print(f"✅ {len(chunks)} documents indexed!")

    # Check collection statistics
    collection_info = client.get_collection(collection_name)
    print(f"📊 Stored vector count: {collection_info.points_count}")
    return collection_info.points_count

# NOTE(review): only MPS is probed here; a CUDA GPU is never selected and the
# code falls through to 'cpu' on such machines.
device = 'mps' if torch.backends.mps.is_available() else 'cpu'

# Re-created here so the cell works after a kernel restart; the same model
# name and kwargs are used as in the Part 1 reproduction cell.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': device},
    encode_kwargs={'normalize_embeddings': True}
)

# Execute actual indexing
# indexed_count is the server-reported points_count returned by the function.
indexed_count = index_documents_to_qdrant(
    client, collection_name, enriched_chunks, embeddings
)

print(f"\n🎉 Part 2 production vector database construction complete!")
print(f"📦 Total {indexed_count} vectors permanently stored")
print("🔄 Data will persist even after notebook restart!")

## Step 5: Compare Search Performance (Part 1 vs Part 2)

In [None]:
import time

# Part 1 style search function
def simple_vector_search(query, top_k=5):
    """Return the ``top_k`` nearest points for ``query`` without any filter.

    Part 1 style search: the query text is embedded with the module-level
    ``embeddings`` and looked up in ``collection_name`` via the module-level
    ``client``. Payloads are included in the returned points.
    """
    dense_query = embeddings.embed_query(query)
    search_kwargs = {
        "collection_name": collection_name,
        "query": dense_query,
        "limit": top_k,
        "with_payload": True,
    }
    return client.query_points(**search_kwargs).points

# Part 2 style search function
def filtered_vector_search(query, section=None, page_range=None, top_k=5,
                           score_threshold=0.4):
    """Part 2 style: vector search restricted by payload metadata.

    Args:
        query: Natural-language query text; embedded with the module-level
            ``embeddings``.
        section: When truthy, only points whose ``section`` payload equals
            this value are considered.
        page_range: When truthy, a ``(first, last)`` pair; only points whose
            ``page_number`` lies in that inclusive range are considered.
        top_k: Maximum number of points to return.
        score_threshold: Value passed to Qdrant as ``score_threshold``.
            Defaults to the previously hard-coded 0.4; pass ``None`` to
            disable it.

    Returns:
        The ``points`` list of the ``query_points`` response.
    """
    from qdrant_client.models import Filter, FieldCondition, MatchValue, Range

    query_vector = embeddings.embed_query(query)

    # Build filter conditions; every condition must hold (logical AND).
    filter_conditions = []
    if section:
        filter_conditions.append(
            FieldCondition(key="section", match=MatchValue(value=section))
        )
    if page_range:
        first_page, last_page = page_range
        filter_conditions.append(
            FieldCondition(key="page_number",
                           range=Range(gte=first_page, lte=last_page))
        )

    # No conditions means no filter at all rather than an empty `must` list.
    query_filter = Filter(must=filter_conditions) if filter_conditions else None

    result = client.query_points(
        collection_name=collection_name,
        query=query_vector,
        query_filter=query_filter,
        limit=top_k,
        score_threshold=score_threshold,
        with_payload=True
    )
    return result.points

# Run comparison test
def _print_hits(results):
    """Print one summary line (section, page, score) per search hit."""
    for rank, hit in enumerate(results, 1):
        section = hit.payload.get('section', 'Unknown')
        page = hit.payload.get('page_number', 'Unknown')
        print(f"   {rank}. Section:{section}, Page:{page}, Score:{hit.score:.3f}")


test_query = "What was Apple's total revenue in 2023?"

print("🔍 Search Method Comparison Test")
print("=" * 60)
print(f"Question: {test_query}")

# Part 1 style search
# perf_counter() is a monotonic clock intended for measuring intervals.
# Each timing includes embedding the query as well as the Qdrant request.
print("\n❌ Part 1 Style: Simple Vector Search")
start_time = time.perf_counter()
simple_results = simple_vector_search(test_query)
simple_time = time.perf_counter() - start_time

print(f"⏱️ Search time: {simple_time:.3f}s")
print(f"📊 Result count: {len(simple_results)}")
_print_hits(simple_results)

# Part 2 style search
print("\n✅ Part 2 Style: Metadata Filtered Search")
start_time = time.perf_counter()
filtered_results = filtered_vector_search(
    test_query,
    section="Financial Performance",  # Search only financial section
)
filtered_time = time.perf_counter() - start_time

print(f"⏱️ Search time: {filtered_time:.3f}s")
print(f"📊 Result count: {len(filtered_results)}")
_print_hits(filtered_results)

print(f"\n📈 Performance Comparison:")
# A positive percentage means the filtered search took less time.
# Guard against a zero baseline so the cell cannot raise ZeroDivisionError.
if simple_time > 0:
    print(f"   Speed: {((simple_time - filtered_time) / simple_time * 100):+.1f}% change with filtering")
print(f"   Accuracy: Filtering yields more relevant results!")

## Step 6: Complete RAG Pipeline with Filtering

In [None]:
import ollama

def ask_rag_with_filter(query, section=None, page_range=None):
    """Answer ``query`` with an LLM grounded on metadata-filtered search hits.

    Returns a ``(answer, sources)`` pair. ``sources`` holds one dict per hit
    with ``section``, ``page`` and ``score`` keys. When the search yields
    nothing, a fixed message and an empty list are returned and the LLM is
    not called.
    """
    hits = filtered_vector_search(query, section, page_range)
    if not hits:
        return "No relevant documents found.", []

    # Text that goes into the prompt, and the provenance reported to the caller.
    contexts = [hit.payload.get('page_content', '') for hit in hits]
    sources = [
        {
            'section': hit.payload.get('section'),
            'page': hit.payload.get('page_number'),
            'score': hit.score
        }
        for hit in hits
    ]

    context_text = "\n\n".join(contexts)
    prompt = f"""Answer the question accurately based on the following Apple 2023 10-K report information:

Reference materials:
{context_text}

Question: {query}

Answer: Please provide an answer with specific numbers based on the provided materials."""

    user_message = {'role': 'user', 'content': prompt}
    response = ollama.chat(model='llama3.1:8b', messages=[user_message])

    return response['message']['content'], sources

# Test with various questions
test_cases = [
    {
        "query": "What was Apple's total revenue in 2023?",
        "section": "Financial Performance",
        "page_range": (20, 60)
    },
    {
        "query": "What are the main risk factors Apple faces?",
        "section": "Risk Factors",
        "page_range": (5, 20)
    },
    {
        "query": "What are Apple's main products and services?",
        "section": "Business Overview",
        "page_range": (1, 30)
    }
]

print("🎯 Part 2 Production RAG System Test")
print("=" * 70)

for i, test_case in enumerate(test_cases, 1):
    print(f"\n{i}. Question: {test_case['query']}")
    print(f"   Filter: Section={test_case['section']}, Pages={test_case['page_range']}")
    
    start_time = time.time()
    answer, sources = ask_rag_with_filter(**test_case)
    response_time = time.time() - start_time
    
    print(f"   ⏱️ Response time: {response_time:.2f}s")
    print(f"   📄 Referenced documents: {len(sources)}")
    print(f"   📝 Answer: {answer[:200]}...")
    
    # Display source information
    print("   📚 Reference sources:")
    for j, source in enumerate(sources[:2], 1):
        print(f"      {j}. Section:{source['section']}, Page:{source['page']}, Score:{source['score']:.3f}")

print("\n✅ All tests complete! Part 2 production RAG system is functioning properly.")

## Step 7: Data Persistence Verification

In [None]:
def verify_data_persistence():
    """Report whether the production collection exists and is searchable.

    Looks up the module-level ``collection_name`` on the module-level
    ``client``. Returns False when the collection is absent; otherwise prints
    its point count, runs a sample search, shows the first hit's payload
    (minus ``page_content``) and returns True.
    """
    print("🔍 Data Persistence Verification")
    print("=" * 40)

    # Guard clause: bail out early if the collection is missing.
    known_names = {entry.name for entry in client.get_collections().collections}
    if collection_name not in known_names:
        print(f"❌ Collection '{collection_name}' not found")
        return False

    print(f"✅ Collection '{collection_name}' exists")

    info = client.get_collection(collection_name)
    print(f"📊 Stored vector count: {info.points_count}")

    hits = simple_vector_search("Apple revenue", top_k=3)
    print(f"🔍 Sample search results: {len(hits)}")

    if hits:
        print("📋 Sample metadata:")
        for key, value in hits[0].payload.items():
            # The chunk text itself is skipped; only metadata is shown.
            if key == 'page_content':
                continue
            print(f"   {key}: {value}")

    print("\n🎉 Data persistence verification complete!")
    print("💡 This data will persist even after notebook kernel restart.")
    return True

# Run verification
# persistence_ok is True only when the collection was found on the server.
persistence_ok = verify_data_persistence()

# The achievements banner is shown only after a successful verification.
if persistence_ok:
    print("\n" + "="*60)
    print("🏆 Part 2 Achievements Summary")
    print("="*60)
    print("✅ Docker Qdrant server deployment")
    print("✅ Metadata-rich document indexing")
    print("✅ Production-grade vector collection creation")
    print("✅ Advanced metadata filtering search")
    print("✅ Persistent storage data retention")
    print("✅ Performance improvement and accuracy enhancement")

## Step 8: Identify Part 2 Limitations and Part 3 Preview

In [None]:
print("🚨 Testing Part 2 System Limitations")
print("=" * 50)

# Questions that show limitations of dense search
limitation_queries = [
    "Find content related to AAPL stock symbol",  # Exact keyword matching needed
    "2023 fiscal year Q4 quarterly results",        # Exact terms such as "Q4" and "2023" matter
    "Tim Cook CEO leadership",                       # Proper noun search
    "iPhone 15 Pro Max specifications"               # New product info (may not be in 10K)
]

print("\n❌ Cases where dense search alone has limitations:")

for i, query in enumerate(limitation_queries, 1):
    print(f"\n{i}. Question: \"{query}\"")

    # Search with current system
    results = simple_vector_search(query, top_k=3)

    # Results come back best-first; treat "no results" as a score of 0 so the
    # report below never indexes into an empty list.
    score = results[0].score if results else 0
    print(f"   Result: {len(results)} found (best score: {score:.3f})")

    # 0.7 is the cut-off used here for calling a hit "appropriate".
    if score > 0.7:
        print("   ✅ Dense search found appropriate results")
    else:
        print("   ❌ Dense search struggles to find highly relevant results")
        print("   💡 Keyword-based search (BM25) might be more appropriate")

# Static preview text for Part 3; nothing below depends on the search results.
print("\n" + "="*60)
print("🔮 Problems to Solve in Part 3")
print("="*60)
print("🎯 Hybrid Search (Dense + Sparse)")
print("   - Combine semantic search + exact keyword matching")
print("   - Generate sparse vectors with SPLADE model")
print("   - Combine BM25 and vector search scores")
print()
print("🎯 Query Expansion and Rewriting")
print("   - Transform user questions to be more search-friendly")
print("   - Expand synonyms and abbreviations")
print("   - Multiple query strategies")
print()
print("🎯 Re-ranking")
print("   - More sophisticated re-ranking of initial search results")
print("   - Utilize cross-encoder models")
print("   - Comprehensive evaluation of metadata and content")
print()
print("💫 From Part 2 to Part 3...")
print("   Now that we've built a solid foundation with dense search,")
print("   let's combine it with sparse search to create the perfect RAG!")

print(f"\n🎊 Part 2 complete! See you in Part 3!")

## Optional: Server Cleanup

In [None]:
import subprocess

def stop_qdrant_server():
    """Bring the Docker Compose stack down and report the outcome on stdout.

    Runs ``docker-compose down`` in the current directory. Returns nothing.
    A missing executable (``FileNotFoundError``) and any other ``Exception``
    raised inside the ``try`` block are caught and printed.
    """
    print("🛑 Stopping Qdrant server...")

    compose_down = ["docker-compose", "down"]
    try:
        outcome = subprocess.run(
            compose_down,
            capture_output=True,
            text=True,
            cwd="."
        )

        if outcome.returncode != 0:
            # Non-zero exit: show stderr so the user can judge the failure.
            print("⚠️ Some errors occurred during server shutdown (may be normal)")
            print(f"   Output: {outcome.stderr}")
        else:
            for line in (
                "✅ Qdrant server stopped!",
                "📦 Docker containers have been stopped.",
                "\n💾 Data Preservation Status:",
                "   ✅ Named volume data is preserved",
                "   ✅ Data can be recovered on next 'docker-compose up -d' run",
            ):
                print(line)

    except FileNotFoundError:
        for line in (
            "⚠️ docker-compose command not found.",
            "   Please run 'docker-compose down' manually.",
        ):
            print(line)
    except Exception as e:
        print(f"❌ Unexpected error: {e}")

def show_cleanup_options():
    """Print the three post-exercise cleanup choices and a recommendation.

    Purely informational: writes a fixed guide to stdout and returns nothing.
    """
    guide = (
        "🧹 Cleanup Options After Practice",
        "=" * 40,
        "",
        "1️⃣ Stop server only (preserve data):",
        "   stop_qdrant_server()  # ← Run this function",
        "   📋 Effect: Only stop Docker container, keep data",
        "",
        "2️⃣ Complete cleanup (delete data):",
        "   !docker-compose down -v",
        "   📋 Effect: Completely delete all data and volumes",
        "",
        "3️⃣ Leave as is:",
        "   📋 Effect: Server continues running (can do other experiments)",
        "   💡 Using port 6333 in background",
        "",
        "🎯 Recommendations:",
        "   - Learning purpose: Option 1️⃣ (preserve data, stop server only)",
        "   - Complete cleanup: Option 2️⃣ (free disk space)",
        "   - Continue experiments: Option 3️⃣ (test other questions)",
    )
    # One print per entry keeps the output line-for-line the same as before.
    for line in guide:
        print(line)

# Show cleanup options
# Prints guidance only; nothing is stopped or deleted by this call.
show_cleanup_options()

print("\n" + "="*50)
print("💡 To stop the server, uncomment the cell below and run it")
print("="*50)

In [None]:
# Server shutdown execution (optional)
# Uncomment and run the line below to stop the Qdrant server
# As written, this cell only prints the two messages below.

# stop_qdrant_server()

print("Uncomment and run the function above to stop the server.")
print("💾 Data is safely preserved in Docker named volume!")

## Summary

🎉 **Part 2 Complete! Production RAG System Successfully Built!**

### Key Improvements Compared to Part 1:

| Aspect | Part 1 | Part 2 |
|--------|---------|--------|
| **Data Storage** | Memory (volatile) | Docker persistent storage |
| **Search Method** | Simple vector search | Metadata filtering + vector |
| **Scalability** | Memory constrained | Production environment ready |
| **Data Persistence** | Lost on restart | Retained through server restart |
| **Search Accuracy** | Basic level | Greatly improved with filtering |

### Core Achievements:
- ✅ Docker-based production-grade Qdrant server deployment
- ✅ Metadata-rich document processing and indexing  
- ✅ HNSW optimization parameters applied
- ✅ Sophisticated filtering search by section and page
- ✅ Data persistence secured with permanent storage
- ✅ Significantly improved performance and accuracy over Part 1

### Next Steps (Part 3 Preview):
**One Step Further with Hybrid Search!**
- Dense search (Part 2) + Sparse search (BM25)
- Perfect combination of semantic search and exact keyword matching
- More sophisticated search results and re-ranking techniques

Now that we've built a solid foundation in Part 2, we'll implement more sophisticated hybrid search in Part 3! 🚀