# Local Brain RAG - Exploration Notebook

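Use this notebook to experiment with and test the Local Brain RAG pipeline: document loading, embedding generation on CUDA, similarity search, the Ollama-backed RAG chain, and VRAM monitoring.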

**GPU**: NVIDIA RTX 4060 (8GB VRAM)

## 1. Setup and Imports

In [None]:
import sys
sys.path.insert(0, '../')

import torch
from src.utils import load_config, check_cuda_available
from src.ingest import DocumentLoader
from src.vectorstore import VectorManager
from src.rag import RAGEngine

# Report the installed PyTorch build and whether a CUDA device is usable.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")

# Device details are only queried when CUDA is present; device index 0 is used.
if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU: {gpu_name}")
    # Bytes are converted with 1e9, i.e. decimal gigabytes.
    print(f"VRAM: {gpu_total_bytes / 1e9:.2f} GB")

## 2. Load Configuration

In [None]:
# Path is relative to the notebook's working directory.
config_path = '../config/config.yaml'

# `config` is passed to the loader, vector manager and RAG engine in later cells.
config = load_config(config_path)
print("Configuration loaded successfully")

## 3. Test Document Loading

In [None]:
# Directory (relative to the notebook) that holds the raw input files.
raw_data_dir = '../data/raw'

# Build the loader from the shared config and run it over the whole directory.
loader = DocumentLoader(config)
documents = loader.load_and_split(raw_data_dir, is_directory=True)

# Ask the loader for its own summary of the result and show it.
stats = loader.get_document_stats(documents)
print(f"Loaded documents: {stats}")

## 4. Test Embedding Generation (CUDA)

In [None]:
# Create the vector manager and initialize its store with reset=True.
# NOTE(review): the notebook expects embeddings to run on CUDA; that is decided
# inside VectorManager/config, not in this cell.
vector_manager = VectorManager(config)
vector_manager.initialize_vectorstore(reset=True)

# Evaluate CUDA availability once and reuse it for both measurement points.
track_vram = torch.cuda.is_available()

if track_vram:
    # Reset the peak counter so the peak reported below covers only this cell.
    torch.cuda.reset_peak_memory_stats()
    start_mem = torch.cuda.memory_allocated() / 1e9
    print(f"VRAM before embedding: {start_mem:.2f} GB")

# Add the chunks loaded earlier; the returned ids are counted below.
ids = vector_manager.add_documents(documents)

if track_vram:
    # memory_allocated() is the current figure, max_memory_allocated() the peak
    # since the reset above; both are converted to decimal GB.
    end_mem = torch.cuda.memory_allocated() / 1e9
    peak_mem = torch.cuda.max_memory_allocated() / 1e9
    print(f"VRAM after embedding: {end_mem:.2f} GB")
    print(f"Peak VRAM usage: {peak_mem:.2f} GB")

chunk_count = len(ids)
print(f"\nAdded {chunk_count} document chunks to vector store")

## 5. Test Similarity Search

In [None]:
# Query used here and reused by the RAG pipeline cell.
test_query = "What is this document about?"

# Fetch the top 3 matches; each item is unpacked below as a (document, score) pair.
results = vector_manager.similarity_search_with_score(test_query, k=3)

print(f"Query: {test_query}\n")
for rank, hit in enumerate(results, start=1):
    doc, score = hit
    print(f"Result {rank} (score: {score:.4f}):")
    # Fall back to 'Unknown' when the metadata has no 'source' entry.
    origin = doc.metadata.get('source', 'Unknown')
    print(f"Source: {origin}")
    # Show only the first 200 characters of the chunk as a preview.
    preview = doc.page_content[:200]
    print(f"Content: {preview}...\n")

## 6. Test RAG Pipeline (Requires Ollama)

In [None]:
# Prerequisites (outside this notebook):
#   - Ollama must be running:        ollama serve
#   - The model must be downloaded:  ollama pull llama3

# Wire the RAG engine to the shared config and the populated vector manager.
rag_engine = RAGEngine(config, vector_manager)
rag_engine.initialize_chain()

# Run the same query that was used for the similarity search.
response = rag_engine.query(test_query)

print("Answer:")
answer_text = response['answer']
print(answer_text)

print("\nSources:")
# 'source_documents' is treated as optional; a missing key yields no source lines.
cited_docs = response.get('source_documents', [])
for cited in cited_docs:
    origin = cited.metadata.get('source', 'Unknown')
    print(f"  - {origin}")

## 7. Benchmark Embedding Speed on RTX 4060

In [None]:
import time

# Build 100 short, distinct sentences to feed the embedding benchmark.
text_template = "This is sample document number {} for testing."
sample_texts = [text_template.format(n) for n in range(100)]

# Run the vector manager's own benchmark helper on the sample texts.
# NOTE: this rebinds `results`, replacing the similarity-search hits stored
# under the same name by the earlier search cell.
results = vector_manager.benchmark_embedding_speed(sample_texts)

print("Benchmark Results:")
throughput = results['texts_per_second']
print(f"Throughput: {throughput:.2f} texts/second")
vram_used_mb = results['vram_mb']
print(f"VRAM usage: {vram_used_mb:.2f} MB")

## 8. VRAM Monitoring

In [None]:
# Print a snapshot of GPU memory for device 0, then release PyTorch's cached blocks.
# The figures are taken BEFORE the cache is cleared.
if torch.cuda.is_available():
    # Memory tracked by this process's PyTorch allocator (decimal GB).
    allocated = torch.cuda.memory_allocated() / 1e9
    reserved = torch.cuda.memory_reserved() / 1e9
    total = torch.cuda.get_device_properties(0).total_memory / 1e9

    # Ask the driver for device-wide free memory instead of deriving it as
    # `total - reserved`: that subtraction only knows about this process and
    # over-reports free VRAM whenever another process shares the GPU
    # (for example a locally running LLM server).
    free_bytes, _device_total_bytes = torch.cuda.mem_get_info(0)
    free = free_bytes / 1e9

    print("VRAM Status (RTX 4060):")
    print(f"  Total: {total:.2f} GB")
    print(f"  Allocated: {allocated:.2f} GB")
    print(f"  Reserved: {reserved:.2f} GB")
    print(f"  Free: {free:.2f} GB")

    # Return cached-but-unused blocks held by PyTorch's allocator to the driver.
    torch.cuda.empty_cache()
    print("\nCache cleared")