# RAG Document Assistant - Example Usage

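This notebook demonstrates how to use the RAG Document Assistant programmatically. All paths are resolved relative to the current working directory (the project root is taken to be its parent, and data is read from `../data/`), so start the kernel from a directory directly under the project root.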

## 1. Setup and Configuration

In [None]:
import sys
from pathlib import Path
import time

# Make the project importable so the `src.*` imports below resolve.
# The entry added is the parent of the current working directory, so this
# assumes the kernel is started from a directory directly under the project root.
sys.path.append(str(Path.cwd().parent))

from src.config import Config
from src.document_processor import DocumentProcessor
from src.vector_store import VectorStore, HybridRetriever
from src.rag_chain import RAGChain
from src.evaluator import RAGEvaluator, TestQuestionGenerator

## 2. Initialize System

In [None]:
# Build the objects that the later cells share: the vector store for the demo
# collection, the RAG chain constructed on top of it, and the evaluator.
COLLECTION_NAME = "demo_collection"

vector_store = VectorStore(collection_name=COLLECTION_NAME)
rag_chain = RAGChain(vector_store)
evaluator = RAGEvaluator()

print("✅ System initialized!")

## 3. Process and Index Documents

In [None]:
# Chunk the sample PDF and push the chunks into the vector store.
processor = DocumentProcessor()

# Location of the example paper, relative to the working directory.
pdf_path = Path("../data/uploads/sample_paper.pdf")

if not pdf_path.exists():
    # Nothing to index; tell the reader where the file is expected.
    print("⚠️ Sample PDF not found. Please add a PDF to data/uploads/")
else:
    documents = processor.process_pdf(pdf_path)
    chunk_count = len(documents)
    print(f"Processed {chunk_count} chunks from {pdf_path.name}")

    # Index the freshly processed chunks.
    vector_store.add_documents(documents)
    print("✅ Documents indexed!")

## 4. Query the System

In [None]:
# Ask one question in hybrid search mode and show the answer, its confidence,
# the elapsed time, and the citations attached to the response.
question = "What are the main findings of this research?"

print(f"Question: {question}\n")

# Use perf_counter() for the interval: it is a monotonic, high-resolution
# clock, whereas time.time() follows the wall clock and can jump if the
# system clock is adjusted while the query is running.
start_time = time.perf_counter()
response = rag_chain.query(question, search_mode="hybrid")
response_time = time.perf_counter() - start_time

print(f"Answer: {response.answer}\n")
print(f"Confidence: {response.confidence:.2%}")
print(f"Response Time: {response_time:.2f}s\n")

# Citations are numbered from 1 for display.
print("Citations:")
for i, citation in enumerate(response.citations, 1):
    print(f"{i}. Source: {citation.source}, Page: {citation.page}")

## 5. Test Different Search Modes

In [None]:
# Run the same question through every search mode and print a short summary
# (answer length, citation count, confidence, elapsed time) for each.
search_modes = ["vector", "keyword", "hybrid"]
question = "What methodology was used in the study?"

# Per-mode summary, keyed by the search mode name.
results = {}

for mode in search_modes:
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be distorted by wall-clock adjustments (unlike time.time()).
    start = time.perf_counter()
    response = rag_chain.query(question, search_mode=mode)
    elapsed = time.perf_counter() - start

    results[mode] = {
        'answer_length': len(response.answer),
        'num_citations': len(response.citations),
        'confidence': response.confidence,
        'time': elapsed
    }

    print(f"\n{mode.upper()} Search:")
    print(f"  Answer Length: {results[mode]['answer_length']} chars")
    print(f"  Citations: {results[mode]['num_citations']}")
    print(f"  Confidence: {results[mode]['confidence']:.2%}")
    print(f"  Time: {results[mode]['time']:.2f}s")

## 6. Evaluate Response Quality

In [None]:
# Query once with the default search mode, then score the response with the
# evaluator and print each metric.
question = "What are the limitations of the study?"

# perf_counter() is the monotonic clock intended for measuring intervals;
# time.time() can jump if the system clock changes mid-query.
start_time = time.perf_counter()
response = rag_chain.query(question)
response_time = time.perf_counter() - start_time

# Show at most the first 200 characters, and only add an ellipsis when the
# answer was actually cut off.
answer_preview = response.answer[:200]
if len(response.answer) > 200:
    answer_preview += "..."
print(f"Answer: {answer_preview}\n")

# Run evaluation; the measured response time is passed in alongside the
# question and response.
metrics = evaluator.evaluate_response(question, response, response_time)

print("\nEvaluation Metrics:")
print(f"  Answer Relevance: {metrics.answer_relevance:.2%}")
print(f"  Citation Accuracy: {metrics.citation_accuracy:.2%}")
print(f"  Faithfulness: {metrics.faithfulness:.2%}")
print(f"  Retrieval Precision: {metrics.retrieval_precision:.2%}")
print(f"  Response Time: {metrics.response_time:.2f}s")

## 7. Compare Multiple Documents

In [None]:
# Compare two indexed documents. Requires at least two documents in the
# collection, each carrying a 'doc_id' in its metadata.
stats = vector_store.get_collection_stats()
print(f"Total documents: {stats['unique_documents']}")

if stats['unique_documents'] >= 2:
    # Collect the distinct, non-empty document IDs in first-seen order.
    # dict.fromkeys() de-duplicates while keeping that order, so the pair
    # chosen below is reproducible across kernel restarts (a set's iteration
    # order is not, because string hashing is randomized per process).
    all_docs = vector_store.get_all_documents()
    candidate_ids = (doc.metadata.get('doc_id') for doc in all_docs)
    doc_ids = list(dict.fromkeys(doc_id for doc_id in candidate_ids if doc_id))

    if len(doc_ids) >= 2:
        question = "Compare the methodologies used in these studies"
        response = rag_chain.compare_documents(question, doc_ids[:2])

        print(f"Comparison Answer:\n{response.answer}")
    else:
        # The collection reports enough documents, but too few of them expose
        # a usable ID; say so instead of finishing silently.
        print("⚠️ Fewer than 2 indexed documents have a 'doc_id' in their metadata; skipping comparison.")
else:
    print("Need at least 2 documents for comparison. Upload more documents first.")

## 8. Generate Test Questions

In [None]:
# Generate test questions from the text of the first stored document.
generator = TestQuestionGenerator()

# Get a sample document
sample_docs = vector_store.get_all_documents()
if sample_docs:
    sample_text = sample_docs[0].page_content

    questions = generator.generate_questions_from_document(sample_text, num_questions=5)

    # Questions are numbered from 1 for display.
    print("Generated Test Questions:")
    for i, q in enumerate(questions, 1):
        print(f"{i}. {q}")
else:
    # Report the empty collection rather than producing no output at all,
    # matching the warnings printed by the indexing and comparison cells.
    print("⚠️ No documents indexed. Add a PDF to data/uploads/ and run section 3 first.")

## 9. Batch Evaluation

In [None]:
# Evaluate several questions in a row, print a one-line summary for each,
# then print the evaluator's averaged metrics.
test_questions = [
    "What is the main hypothesis?",
    "What data was collected?",
    "What are the key conclusions?"
]

# Individual results are collected so they remain available after the loop.
all_metrics = []

for question in test_questions:
    # perf_counter() is monotonic and high-resolution; time.time() follows
    # the wall clock and can skew the interval if the clock is adjusted.
    start_time = time.perf_counter()
    response = rag_chain.query(question)
    response_time = time.perf_counter() - start_time

    metrics = evaluator.evaluate_response(question, response, response_time)
    all_metrics.append(metrics)

    print(f"\nQ: {question}")
    print(f"Relevance: {metrics.answer_relevance:.2%} | "
          f"Citations: {metrics.citation_accuracy:.2%} | "
          f"Time: {metrics.response_time:.2f}s")

# Get average metrics
# NOTE(review): get_average_metrics() takes no arguments, so it presumably
# averages everything this evaluator has recorded so far, including the
# evaluation from section 6 - confirm against RAGEvaluator.
avg_metrics = evaluator.get_average_metrics()
print("\n=== Average Metrics ===")
for key, value in avg_metrics.items():
    # Keys containing 'time' are printed as seconds; all others as percentages.
    if 'time' in key:
        print(f"{key}: {value:.2f}s")
    else:
        print(f"{key}: {value:.2%}")

## 10. Export Evaluation Report

In [None]:
# Save the evaluator's report as JSON under the data directory.
report_path = "../data/evaluation_report.json"

# Make sure the target directory exists so the export also works on a fresh
# checkout; this is a no-op when the directory is already there.
Path(report_path).parent.mkdir(parents=True, exist_ok=True)

evaluator.save_evaluation_report(report_path)

print(f"✅ Evaluation report saved to {report_path}")

## 11. View Collection Statistics

In [None]:
# Print the collection's name, chunk count, and document count.
stats = vector_store.get_collection_stats()

print("=== Collection Statistics ===")
# (display label, key in the stats mapping), in output order.
for label, key in (
    ("Collection Name", 'collection_name'),
    ("Total Chunks", 'total_chunks'),
    ("Unique Documents", 'unique_documents'),
):
    print(f"{label}: {stats[key]}")

## 12. Clean Up (Optional)

In [None]:
# Uncomment to clear the collection
# vector_store.clear_collection()
# print("✅ Collection cleared!")