# Simple RAG with Golden Chunks — Real Papers

This notebook demonstrates a minimal RAG evaluation workflow using the **actual ingested scientific papers** and the **real golden dataset** with ground-truth chunk IDs.

**Evaluation approach:**
1. Load the real FAISS vectorstore and BM25 index (from `ingest.py`)
2. Load the golden dataset with `expected_chunk_ids` (ground-truth evidence)
3. For each query, retrieve chunks and compare against golden chunks
4. Compute chunk-level Precision@K, Recall@K, and MRR, plus source-level MRR and Recall@K
5. Run generation with Ollama and display each generated answer alongside the golden answer and the number of golden chunks retrieved

**Prerequisites:** Run `python ingest.py` first and ensure Ollama is running.

In [None]:
import sys
import json
import hashlib
from pathlib import Path

import pandas as pd

# Locate the repo root (the directory holding config.py) and make it importable.
# Only the current working directory and its immediate parent are considered.
_cwd = Path.cwd().resolve()
REPO_ROOT = _cwd if (_cwd / "config.py").exists() else _cwd.parent
assert (REPO_ROOT / "config.py").exists(), f"Cannot find repo root (tried {REPO_ROOT})"

# Prepend once so re-running the cell does not keep growing sys.path.
_repo_root_entry = str(REPO_ROOT)
if _repo_root_entry not in sys.path:
    sys.path.insert(0, _repo_root_entry)

print(f"Repo root: {REPO_ROOT}")

In [None]:
# Load golden dataset
golden_path = REPO_ROOT / "eval" / "golden_dataset.json"
# Decode explicitly as UTF-8: read_text() otherwise falls back to the platform
# default encoding, which can mis-decode non-ASCII characters in the JSON.
golden_dataset = json.loads(golden_path.read_text(encoding="utf-8"))

# Fail with a clear message instead of an IndexError on golden_dataset[0] below.
assert golden_dataset, f"Golden dataset is empty: {golden_path}"

print(f"Golden dataset: {len(golden_dataset)} queries")
print(f"Fields per entry: {list(golden_dataset[0].keys())}")

# Tabular preview of the question and its ground-truth sources / chunk IDs.
df_golden = pd.DataFrame(golden_dataset)
df_golden[["question", "expected_sources", "expected_chunk_ids"]].head()

## 1) Load Real Vectorstore and Build Retriever

Load the FAISS index, BM25 index, and cross-encoder — the same components used by `query.py`.

In [None]:
from query import load_vectorstore, build_retriever

vs = load_vectorstore()
retriever = build_retriever(vs)

# Summarise index sizes and the retriever settings in effect for this run.
n_vectors = vs.index.ntotal
_stats_report = "\n".join(
    [
        f"FAISS vectors: {n_vectors}",
        f"BM25 docs: {len(retriever.bm25_docs)}",
        f"TOP_K = {retriever.k}, TOP_K_CANDIDATES = {retriever.k_candidates}",
        f"Weights: dense={retriever.dense_weight}, bm25={retriever.bm25_weight}",
    ]
)
print(_stats_report)

## 2) Chunk-Level Retrieval Evaluation

For each golden query, retrieve chunks and compare against `expected_chunk_ids`. This is the core evaluation — **if retrieval is bad, generation cannot be good.**

In [None]:
from eval.evaluate import (
    doc_chunk_id,
    extract_retrieved_chunk_ids,
    reciprocal_rank_chunks,
    precision_at_k_chunks,
    recall_at_k_chunks,
    reciprocal_rank,
    recall_at_k,
)

K = retriever.k


def _evaluate_query(entry: dict, k: int) -> dict:
    """Retrieve chunks for one golden entry and score them against its ground truth.

    Uses the notebook-level ``retriever`` and the metric helpers imported from
    ``eval.evaluate``.

    Args:
        entry: One golden-dataset record. Must contain "question";
            "expected_chunk_ids" and "expected_sources" default to empty
            lists when absent.
        k: Cut-off passed to the @K metric helpers and embedded in the
            corresponding column names.

    Returns:
        One result row: the question truncated to 70 characters, the
        chunk-level MRR / precision@k / recall@k, the source-level
        MRR / recall@k, and the first three retrieved and expected chunk IDs
        (kept short for display).
    """
    question = entry["question"]
    expected_chunks = entry.get("expected_chunk_ids", [])
    expected_sources = entry.get("expected_sources", [])

    docs = retriever.invoke(question)
    retrieved_ids = extract_retrieved_chunk_ids(docs)

    return {
        "question": question[:70],
        "chunk_mrr": reciprocal_rank_chunks(expected_chunks, docs),
        f"chunk_prec@{k}": precision_at_k_chunks(expected_chunks, docs, k),
        f"chunk_recall@{k}": recall_at_k_chunks(expected_chunks, docs, k),
        "src_mrr": reciprocal_rank(expected_sources, docs),
        f"src_recall@{k}": recall_at_k(expected_sources, docs, k),
        "retrieved_chunks": retrieved_ids[:3],
        "expected_chunks": expected_chunks[:3],
    }


# One row per golden query; per-query temporaries stay inside the helper
# instead of lingering in the kernel namespace.
rows = [_evaluate_query(entry, K) for entry in golden_dataset]

df_eval = pd.DataFrame(rows)
df_eval

## 3) Aggregate Metrics Summary

In [None]:
# Display label -> df_eval column whose mean is reported under that label.
_metric_columns = {
    "Chunk MRR": "chunk_mrr",
    f"Chunk Precision@{K}": f"chunk_prec@{K}",
    f"Chunk Recall@{K}": f"chunk_recall@{K}",
    "Source MRR": "src_mrr",
    f"Source Recall@{K}": f"src_recall@{K}",
}
metrics = {
    label: df_eval[column].mean() for label, column in _metric_columns.items()
}

# Fixed-width two-column table: metric name, then value to three decimals.
print(f"{'Metric':<25} {'Value':>8}")
print("-" * 35)
for label, value in metrics.items():
    print(f"{label:<25} {value:>8.3f}")

# Highlight queries where chunk retrieval completely missed (chunk MRR of 0).
misses = df_eval.loc[df_eval["chunk_mrr"] == 0.0]
if not misses.empty:
    print(f"\nChunk-level misses: {len(misses)}/{len(df_eval)}")
    for missed_question in misses["question"]:
        print(f"  - {missed_question}")

## 4) Detailed Chunk Comparison for a Single Query

Pick one query and inspect exactly which chunks were retrieved vs. expected — useful for debugging retrieval misses.

In [None]:
QUERY_IDX = 0  # Change this to inspect different queries

q = golden_dataset[QUERY_IDX]
docs = retriever.invoke(q["question"])
# Look the golden IDs up once; they are reused for every retrieved chunk below.
_golden_ids = q.get("expected_chunk_ids", [])

print(f"Question: {q['question']}\n")
print("Expected chunk IDs:")
for _golden_id in _golden_ids:
    print(f"  {_golden_id}")

print("\nRetrieved chunk IDs:")
for doc in docs:
    cid = doc_chunk_id(doc)
    # Five-character tag so matched and unmatched rows stay aligned.
    _tag = "MATCH" if cid in _golden_ids else " " * 5
    _preview = doc.page_content[:120]
    print(f"  [{_tag}] {cid}")
    print(f"          {_preview}...")
    print()

## 5) End-to-End Generation with Ollama

Run the full pipeline: retrieve chunks, then generate an answer using the local LLM. Compare against the golden `ideal_answer`.

In [None]:
from langchain_ollama import ChatOllama
from config import LLM_MODEL, OLLAMA_BASE_URL

# Chat model client shared by the generation code in this cell; model name and
# server URL come from config.py.
llm = ChatOllama(
    model=LLM_MODEL,
    base_url=OLLAMA_BASE_URL,
    temperature=0,
)

def generate_answer(question: str, docs) -> str:
    """Answer ``question`` from the retrieved ``docs`` with a single stuffed prompt.

    The ``page_content`` of every document is joined with blank lines into one
    context section, followed by the question. The prompt is sent to the
    notebook-level ``llm`` and the ``content`` of its response is returned.
    """
    chunk_texts = [doc.page_content for doc in docs]
    context = "\n\n".join(chunk_texts)

    instructions = (
        "Answer the question using only the provided context. "
        "Be precise and cite specific numbers."
    )
    prompt = f"{instructions}\n\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer:"

    response = llm.invoke(prompt)
    return response.content

# Run on a few sample queries: retrieve, generate, and print the generated
# answer next to the golden answer with a count of golden chunks retrieved.
SAMPLE_N = 5
for q in golden_dataset[:SAMPLE_N]:
    docs = retriever.invoke(q["question"])
    answer = generate_answer(q["question"], docs)
    retrieved_ids = extract_retrieved_chunk_ids(docs)
    expected_ids = set(q.get("expected_chunk_ids", []))
    # Count distinct golden chunks that were retrieved. Counting retrieved
    # entries instead would let a repeated chunk ID push the numerator above
    # the number of golden chunks (e.g. "3/2").
    chunk_hits = len(expected_ids.intersection(retrieved_ids))

    # Answers are truncated to 150 characters to keep the output scannable.
    print(f"Q: {q['question']}")
    print(f"Golden: {q['ideal_answer'][:150]}")
    print(f"Generated: {answer[:150]}")
    print(f"Chunks: {chunk_hits}/{len(expected_ids)} golden chunks retrieved")
    print("-" * 80)