# RAG Chunk Explorer

Interactive notebook for exploring indexed chunks and testing queries.

**Note:** All functionality here is also available via:
- API endpoints (`/search`, `/chunk/{chunk_id}`, `/terminology/{term_id}`, etc.)
- CLI scripts (`scripts/validate_chunks.py`, `scripts/run_evaluation.py`)

In [None]:
import sys
sys.path.insert(0, '..')

import yaml
from src.database.connection import get_db_connection
from src.database.operations import get_chunk_by_id, get_terminology_by_id, get_terminology_by_uri
from src.embedding.factory import get_embedder
from src.search.retrieval_tool import RetrievalTool, get_collection_stats, list_available_documents

In [None]:
# Load configuration. The encoding is passed explicitly so the YAML file is
# decoded the same way on every platform instead of using the locale default.
with open('../config.yaml', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Connect to database (this connection is shared by the cells below)
conn = get_db_connection(config)

# Initialize embedder from the same configuration
embedder = get_embedder(config)

# Create the retrieval tool from the connection, embedder and configuration
tool = RetrievalTool(conn, embedder, config)

## Collection Statistics

Equivalent to the `GET /stats` endpoint and `scripts/validate_chunks.py --stats`.

In [None]:
# Fetch aggregate counts for the indexed collection
stats = get_collection_stats(conn)

# Overall totals
print(f"Total chunks: {stats['total_chunks']}")
print(f"Total documents: {stats['total_documents']}")
print(f"Total terminology: {stats['total_terminology']}")

# Chunk counts broken down by document type
print("\nChunks by type:")
type_counts = stats['chunks_by_type']
for doc_type, count in type_counts.items():
    print(f"  {doc_type}: {count}")

# Terminology counts per vocabulary; only the first 10 entries are shown
print("\nTerminology by vocabulary:")
vocab_counts = list(stats['terminology_by_vocabulary'].items())
for vocab, count in vocab_counts[:10]:
    print(f"  {vocab}: {count}")

## List Available Documents

Equivalent to the `GET /documents` endpoint.

In [None]:
# Retrieve the list of indexed documents and report how many there are
docs = list_available_documents(conn)
print(f"Found {len(docs)} documents\n")

# Show one summary line per document, limited to the first 20 entries
listed = docs[:20]
for doc in listed:
    print(f"{doc['document_id']}: {doc['document_title']} ({doc['document_type']}) - {doc['chunk_count']} chunks")

## Test Queries

Equivalent to the `POST /search` endpoint and `scripts/run_evaluation.py -q "query"`.

In [None]:
def test_query(query: str, max_results: int = 5):
    """Search with the shared RetrievalTool and print a readable report.

    Calls ``tool.search()`` (RetrievalTool from src/search/retrieval_tool.py),
    then prints the detected query type, confidence, latency, one entry per
    returned chunk (with the first 200 characters of its content followed by
    "...") and, if there are any, the terminology matches.

    Args:
        query: Query text handed to the search.
        max_results: Passed to the search as ``max_chunks``.

    Returns:
        The result object returned by ``tool.search()``.
    """
    outcome = tool.search(query, max_chunks=max_results)

    # Summary header
    print(f"Query: {query}")
    print(f"Type detected: {outcome.query_type_detected}")
    print(f"Confidence: {outcome.confidence:.3f}")
    print(f"Latency: {outcome.latency_ms}ms")

    # Returned chunks, numbered from 1
    hits = outcome.chunks
    print(f"\nResults ({len(hits)}):")
    for rank, hit in enumerate(hits, start=1):
        print(f"\n{rank}. {hit.document_id} ({hit.score:.3f})")
        print(f"   Type: {hit.document_type}")
        print(f"   Section: {hit.section_header}")
        print(f"   Preview: {hit.content[:200]}...")

    # The terminology section is printed only when there is something to list
    matches = outcome.terminology_matches
    if not matches:
        return outcome

    print("\nTerminology matches:")
    for match in matches:
        print(f"  - {match.pref_label} ({match.score:.3f})")

    return outcome

In [None]:
# Example: a full natural-language question
result = test_query(query="What is the decision on using CIM as domain language?")

In [None]:
# Example: a single term, meant as a terminology query
result = test_query(query="Contingency")

In [None]:
# Example: a document identifier, meant as an exact match query
result = test_query(query="ADR-0000")

In [None]:
# Example: a question phrased in Dutch
result = test_query(query="Wat zijn de data governance principes?")

## Inspect Specific Chunk

Equivalent to the `GET /chunk/{chunk_id}` endpoint. Uses `get_chunk_by_id()` from `src/database/operations.py`.

In [None]:
def inspect_chunk(chunk_id: int):
    """Look up one chunk by ID and print all of its fields.

    The lookup is done with get_chunk_by_id() from src/database/operations.py.
    When the lookup yields nothing, a "not found" message is printed instead.

    Args:
        chunk_id: ID of the chunk to look up.

    Returns:
        Whatever get_chunk_by_id() returned (falsy when nothing was found).
    """
    record = get_chunk_by_id(conn, chunk_id)

    # Guard clause: nothing to display for a missing chunk
    if not record:
        print(f"Chunk {chunk_id} not found")
        return record

    print(f"Chunk ID: {record['id']}")
    print(f"Document: {record['document_id']} - {record['document_title']}")
    print(f"Type: {record['document_type']}")
    print(f"Section: {record['section_header']}")
    print(f"Source: {record['source_file']}")
    print(f"Owner: {record['owner_team']}")
    print(f"Metadata: {record['metadata']}")
    print(f"\nContent:\n{record['content']}")

    return record

In [None]:
# Show the chunk with ID 1 (edit the ID to look at a different chunk)
chunk = inspect_chunk(chunk_id=1)

## Inspect Terminology

Equivalent to the `GET /terminology/{term_id}` endpoint. Uses `get_terminology_by_id()` from `src/database/operations.py`.

In [None]:
def inspect_terminology(term_id: int):
    """Look up one terminology concept by ID and print all of its fields.

    The lookup is done with get_terminology_by_id() from
    src/database/operations.py. When the lookup yields nothing, a
    "not found" message is printed instead.

    Args:
        term_id: ID of the terminology concept to look up.

    Returns:
        Whatever get_terminology_by_id() returned (falsy when nothing was found).
    """
    concept = get_terminology_by_id(conn, term_id)

    # Guard clause: nothing to display for a missing concept
    if not concept:
        print(f"Terminology {term_id} not found")
        return concept

    print(f"Term ID: {concept['id']}")
    print(f"URI: {concept['concept_uri']}")
    print(f"Label (EN): {concept['pref_label_en']}")
    print(f"Label (NL): {concept['pref_label_nl']}")
    print(f"Alt Labels: {concept['alt_labels']}")
    print(f"Vocabulary: {concept['vocabulary_name']}")
    print(f"\nDefinition:\n{concept['definition']}")

    return concept

In [None]:
# Show the terminology concept with ID 1 (edit the ID to look at a different concept)
term = inspect_terminology(term_id=1)

## Cleanup

In [None]:
# Close the database connection opened in the setup cell; run this last,
# since the earlier cells all use `conn`.
conn.close()
print("Connection closed")