# Document Ingestion and Indexing

This notebook demonstrates how to:
1. Load documents from various formats
2. Chunk documents into manageable pieces
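3. Index documents for retrieval
4. Query the index to test retrieval
5. Combine the steps into a single ingestion pipeline function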

In [None]:
# Setup
import sys
sys.path.insert(0, '../')

from src.document_processing import DocumentLoader, DocumentChunker, DocumentIndexer
from src.config.settings import settings

## 1. Load a Document

In [None]:
# Path of the document to ingest (replace with your own file)
file_path = "../data/uploads/sample.pdf"

# Create the loader and read the file, unpacking its two results
loader = DocumentLoader()
text, metadata = loader.load(file_path)

# Metadata summary followed by a preview of the text, one item per line
print(
    f"Loaded document: {metadata.filename}",
    f"Document ID: {metadata.document_id}",
    f"Format: {metadata.format}",
    f"File size: {metadata.file_size} bytes",
    f"\nFirst 500 characters:\n{text[:500]}...",
    sep="\n",
)

## 2. Chunk the Document

In [None]:
# Initialize chunker
chunker = DocumentChunker()

# Chunk the text loaded in the previous step, passing its metadata along
chunks = chunker.chunk_text(text, metadata)

print(f"Created {len(chunks)} chunks")

# Preview the first chunk only when there is one: if chunk_text returned no
# chunks, chunks[0] would raise IndexError and stop the notebook at this cell.
if len(chunks) > 0:
    first_chunk = chunks[0]
    print("\nFirst chunk:")
    print(f"Chunk ID: {first_chunk.chunk_id}")
    print(f"Text: {first_chunk.text[:200]}...")
else:
    print("\nNo chunks were produced; nothing to preview.")

## 3. Index the Chunks

In [None]:
# Initialize indexer
indexer = DocumentIndexer()

# Index the chunks from the chunking step, passing the document metadata along.
# NOTE(review): re-running this cell calls index_chunks again with the same
# chunks; whether that creates duplicate entries depends on DocumentIndexer — confirm.
indexer.index_chunks(chunks, metadata)

print(f"Indexed {len(chunks)} chunks")
print(f"\nCollection stats:")
# The stats object is iterated with .items(), so a dict-like mapping is expected here.
stats = indexer.get_collection_stats()
for key, value in stats.items():
    print(f"{key}: {value}")

## 4. Test Retrieval

In [None]:
# Ask the index for the three best matches to a sample question
query = "What is the main topic?"
results = indexer.query(query, top_k=3)

print(f"Query: {query}")
print(f"\nTop {len(results)} results:")
for i, result in enumerate(results, start=1):
    # Turn the distance into a score with 1 / (1 + distance): for non-negative
    # distances, a smaller distance gives a larger score (1.0 at distance 0).
    score = 1 / (1 + result['distance'])
    print(f"\n{i}. Score: {score:.3f}")

    # Show at most the first 200 characters of the matched text
    preview = result['text'][:200]
    print(f"Text: {preview}...")

## Complete Pipeline Example

In [None]:
def ingest_document(
    file_path: str,
    *,
    loader: "DocumentLoader | None" = None,
    chunker: "DocumentChunker | None" = None,
    indexer: "DocumentIndexer | None" = None,
) -> str:
    """Run the complete ingestion pipeline (load -> chunk -> index) for one file.

    Args:
        file_path: Path of the document to ingest.
        loader: Optional object with a ``load(file_path)`` method returning
            ``(text, metadata)``. A new ``DocumentLoader()`` is created when omitted.
        chunker: Optional object with a ``chunk_text(text, metadata)`` method.
            A new ``DocumentChunker()`` is created when omitted.
        indexer: Optional object with an ``index_chunks(chunks, metadata)``
            method. A new ``DocumentIndexer()`` is created when omitted.

    Returns:
        The ``document_id`` attribute of the metadata returned by the loader.

    Passing the components in lets a caller ingesting many files share one
    loader/chunker/indexer instead of constructing three new objects per call.
    Calling ``ingest_document(path)`` with no extra arguments behaves as before.
    """
    # Load -- the default component is created only when this stage is reached,
    # matching the construction order of the original pipeline.
    if loader is None:
        loader = DocumentLoader()
    text, metadata = loader.load(file_path)

    # Chunk
    if chunker is None:
        chunker = DocumentChunker()
    chunks = chunker.chunk_text(text, metadata)

    # Index
    if indexer is None:
        indexer = DocumentIndexer()
    indexer.index_chunks(chunks, metadata)

    return metadata.document_id

# Ingest the sample PDF end to end and report the ID returned by the pipeline
doc_id = ingest_document(file_path="../data/uploads/sample.pdf")
print(f"Document ingested with ID: {doc_id}")