# Custom RAG Baseline Implementation

This notebook implements a traditional RAG system for comparison with GFS.

**Components**:
- **Embeddings**: sentence-transformers (all-MiniLM-L6-v2)
- **Vector DB**: ChromaDB
- **LLM**: Gemini (same as GFS)

**Objectives**:
1. Index documents with custom chunking
2. Run the same test queries as the GFS experiments
3. Measure performance metrics
4. Compare with GFS results

In [None]:
# Standard-library imports used throughout the notebook.
import sys
from pathlib import Path
import json
import time

# NOTE(review): assumes the notebook is executed from a directory one level
# below the project root (e.g. a notebooks/ folder) — confirm before running
# from elsewhere, since every path below is derived from this value.
project_root = Path.cwd().parent
# Put <project_root>/src on the import path so the project-local imports below
# can be resolved (presumably that is where they live — verify).
sys.path.append(str(project_root / "src"))

# Project-local helpers.
from custom_rag import CustomRAG
from data_loader import scan_documents
from utils import load_api_key

# Numerics and plotting.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style to the plots drawn later in the notebook.
sns.set_style("whitegrid")

print("Imports successful")

## 1. Initialize Custom RAG

In [None]:
# Read the GOOGLE_API_KEY value from the project's .env file.
env_file = project_root / ".env"
api_key = load_api_key("GOOGLE_API_KEY", str(env_file))

# Directory handed to CustomRAG as its persist directory; created up front
# (including any missing parents) so it exists before the RAG object is built.
persist_dir = project_root.joinpath("models", "custom_rag", "chroma_db")
persist_dir.mkdir(parents=True, exist_ok=True)

# Build the RAG pipeline with the embedding and LLM model names used for this
# baseline run.
rag = CustomRAG(
    persist_directory=persist_dir,
    llm_model="gemini-2.0-flash-exp",
    embedding_model="all-MiniLM-L6-v2",
    api_key=api_key,
)

print("Custom RAG initialized")
print(f"Embedding dimension: {rag.embedding_dim}")

## 2. Create Collection and Index Documents

In [None]:
# Create the collection that the indexing and query cells operate on.
# NOTE(review): recreate=True presumably discards any existing collection with
# the same name so each run starts from an empty index — confirm against
# CustomRAG.create_collection before relying on it.
collection_name = "rag_comparison_docs"
rag.create_collection(collection_name, recreate=True)

print(f"Collection created: {collection_name}")

In [None]:
# Scan the raw-data directory, index every plain-text document into the
# collection, and save a per-file report (chunk count, duration, status) as
# JSON next to the vector store.
data_dir = project_root / "data" / "raw"
df = scan_documents(data_dir)

print(f"Documents found: {len(df)}")

if len(df) > 0:
    # Only these plain-text formats are indexed; any other file returned by
    # the scan is skipped.
    text_extensions = {".txt", ".md"}
    indexing_results = []

    # `i` is the file's position in the full scan (skipped files included),
    # so the printed numbers may have gaps.
    for i, row in enumerate(df.iter_rows(named=True)):
        file_path = Path(row["file_path"])

        if file_path.suffix.lower() not in text_extensions:
            continue

        print(f"\nIndexing {i+1}: {file_path.name}")

        try:
            # perf_counter() is monotonic and high-resolution, so the recorded
            # duration cannot be skewed by system clock adjustments the way a
            # time.time() difference can.
            start_time = time.perf_counter()

            num_chunks = rag.index_document(
                file_path=file_path,
                chunk_size=512,
                overlap=50,
                metadata={"file_size_mb": row["size_mb"]},
            )

            elapsed = time.perf_counter() - start_time
        except Exception as e:
            # Best effort: record the failure and continue with the next file
            # so one bad document does not abort the whole indexing run.
            indexing_results.append({
                "file_name": file_path.name,
                "status": "failed",
                "error": str(e),
            })
            print(f"  ✗ Failed: {e}")
        else:
            indexing_results.append({
                "file_name": file_path.name,
                "num_chunks": num_chunks,
                "indexing_time": elapsed,
                "status": "success",
            })
            print(f"  ✓ Indexed {num_chunks} chunks ({elapsed:.2f}s)")

    # Save indexing results
    results_path = project_root / "models" / "custom_rag" / "indexing_results.json"
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(indexing_results, f, indent=2)

    print(f"\nIndexing results saved to: {results_path}")
else:
    print("No documents to index")

In [None]:
# Fetch the collection statistics and print the three fields of interest.
# `stats` is reused by the query and save cells.
stats = rag.get_stats()

print("\nCollection Statistics:")
for label, key in (
    ("Collection", "collection_name"),
    ("Total chunks", "total_chunks"),
    ("Embedding dimension", "embedding_dimension"),
):
    # .get() prints None for a missing field instead of raising.
    print(f"  {label}: {stats.get(key)}")

## 3. Run Test Queries

In [None]:
# Test queries for this run, defined inline (intended to match the set used in
# the GFS experiments). Each spec is (id, query text, category).
_QUERY_SPECS = (
    ("q1", "What are the main topics covered in the documents?", "overview"),
    ("q2", "Summarize the key findings or conclusions.", "synthesis"),
    ("q3", "What specific data or statistics are mentioned?", "factual"),
    ("q4", "Are there any recommendations or best practices?", "actionable"),
    ("q5", "What is the weather forecast for tomorrow?", "out_of_domain"),
)

# Expand the specs into the dict form consumed by the query loop.
test_queries = [
    {"id": query_id, "query": text, "category": category}
    for query_id, text, category in _QUERY_SPECS
]

print(f"Test queries: {len(test_queries)}")

In [None]:
# Run every test query through the RAG pipeline and record per-query latency
# metrics. `results` gets one dict per query, whether it succeeded or failed.
results = []

# `total_chunks` may be absent or None when nothing has been indexed; treat
# both as zero rather than raising TypeError on `None > 0`.
if (stats.get("total_chunks") or 0) > 0:
    for i, test in enumerate(test_queries, start=1):
        print(f"\n[{i}/{len(test_queries)}] {test['id']}: {test['query'][:50]}...")

        try:
            result_obj = rag.query(
                query=test["query"],
                top_k=5,
                temperature=0.0,
            )

            metrics = result_obj["metrics"]
            distances = result_obj["distances"]
            # Explicit emptiness check: plain truthiness raises for a
            # multi-element NumPy array, which would misreport the query as
            # failed.
            has_distances = distances is not None and len(distances) > 0

            result = {
                "query_id": test["id"],
                "category": test["category"],
                "total_latency_seconds": metrics["total_time"],
                "retrieval_time": metrics["retrieval_time"],
                "generation_time": metrics["generation_time"],
                "num_chunks_retrieved": metrics["num_chunks_retrieved"],
                "response_length": len(result_obj["answer"]),
                # Cast to a built-in float so the value is JSON-serializable
                # whatever NumPy dtype the mean comes back as.
                "avg_distance": float(np.mean(distances)) if has_distances else None,
                "status": "success",
            }

            print(f"  ✓ Latency: {metrics['total_time']:.2f}s (retrieval: {metrics['retrieval_time']:.2f}s, generation: {metrics['generation_time']:.2f}s)")

        except Exception as e:
            # Best effort: record the failure and move on so one bad query
            # does not abort the whole run.
            result = {
                "query_id": test["id"],
                "category": test["category"],
                "status": "failed",
                "error": str(e),
            }
            print(f"  ✗ Failed: {e}")

        results.append(result)
        # Short pause between queries (presumably to stay under the LLM API's
        # rate limit — confirm).
        time.sleep(1)

    print(f"\nQueries completed: {len(results)}")
else:
    print("No documents indexed. Skipping queries.")

## 4. Analyze Performance

In [None]:
# Summarise latency for the queries that succeeded. The three timing lists are
# also read by the plotting cell.
successful_results = [entry for entry in results if entry["status"] == "success"]

if successful_results:
    # One list per timing field, in the same order as successful_results.
    latencies, retrieval_times, generation_times = (
        [entry[field] for entry in successful_results]
        for field in ("total_latency_seconds", "retrieval_time", "generation_time")
    )

    # End-to-end latency: central tendency plus tail percentiles.
    total_latency_rows = (
        ("Mean", np.mean(latencies)),
        ("Median (P50)", np.median(latencies)),
        ("P95", np.percentile(latencies, 95)),
        ("P99", np.percentile(latencies, 99)),
    )
    print("Total Latency Statistics:")
    for label, value in total_latency_rows:
        print(f"  {label}: {value:.3f}s")

    # Retrieval and generation phases: mean and median only.
    for heading, samples in (
        ("Retrieval Time", retrieval_times),
        ("Generation Time", generation_times),
    ):
        print(f"\n{heading}:")
        print(f"  Mean: {np.mean(samples):.3f}s")
        print(f"  Median: {np.median(samples):.3f}s")

In [None]:
# Draw two panels side by side: a per-query stacked latency breakdown and a
# histogram of total latency. Reads successful_results, latencies,
# retrieval_times and generation_times from the analysis cell.
if successful_results:
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

    # Left panel: generation time stacked on top of retrieval time per query.
    query_ids = [entry["query_id"] for entry in successful_results]
    ax1.bar(query_ids, retrieval_times, label="Retrieval")
    ax1.bar(query_ids, generation_times, bottom=retrieval_times, label="Generation")
    ax1.set(
        xlabel="Query ID",
        ylabel="Time (seconds)",
        title="Custom RAG Latency Breakdown",
    )
    ax1.legend()

    # Right panel: distribution of total latency with the median marked.
    median_latency = np.median(latencies)
    ax2.hist(latencies, bins=10, edgecolor="black", alpha=0.7)
    ax2.axvline(median_latency, color="red", linestyle="--", label="Median")
    ax2.set(
        xlabel="Total Latency (seconds)",
        ylabel="Count",
        title="Total Latency Distribution",
    )
    ax2.legend()

    plt.tight_layout()
    plt.show()

## 5. Save Results

In [None]:
# Write the experiment configuration, the raw per-query results and a latency
# summary to JSON so a later comparison notebook can read them.
results_path = project_root / "models" / "custom_rag" / "experiment_results.json"
# Make sure the output directory exists even if this cell runs on its own.
results_path.parent.mkdir(parents=True, exist_ok=True)

# Recompute the successful subset from `results` here rather than reusing
# lists built in the analysis cell: those can be stale (or undefined) when the
# queries were re-run without re-running the analysis, which would make the
# summary disagree with the `results` saved alongside it.
succeeded = [r for r in results if r["status"] == "success"]


def _aggregate(reducer, field):
    """Apply *reducer* to *field* across successful results.

    Returns None when no query succeeded. Otherwise returns a built-in float
    so the value is JSON-serializable whatever NumPy scalar type the reducer
    produces.
    """
    if not succeeded:
        return None
    return float(reducer([r[field] for r in succeeded]))


output = {
    "collection_name": collection_name,
    "embedding_model": "all-MiniLM-L6-v2",
    "llm_model": "gemini-2.0-flash-exp",
    "collection_stats": stats,
    "queries": test_queries,
    "results": results,
    "summary": {
        "total_queries": len(results),
        "successful": len(succeeded),
        "failed": len(results) - len(succeeded),
        "mean_total_latency": _aggregate(np.mean, "total_latency_seconds"),
        "median_total_latency": _aggregate(np.median, "total_latency_seconds"),
        "mean_retrieval_time": _aggregate(np.mean, "retrieval_time"),
        "mean_generation_time": _aggregate(np.mean, "generation_time"),
    },
}

with open(results_path, "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2)

print(f"Results saved to: {results_path}")

## Summary

**Custom RAG Baseline Completed**:
- Indexed documents with ChromaDB
- Ran the same test queries as GFS
- Measured latency (retrieval + generation)
- Saved results for comparison

**Next Steps**:
- Compare GFS vs Custom RAG in `05_comparison_analysis.ipynb`
- Analyze trade-offs (latency, cost, quality)