# Minimal RAG Experiments Notebook\n\nThis notebook implements a compact end-to-end RAG workflow:\n1. Load minimal documents (`text`, `source`).\n2. Chunk into fixed sentence windows (default: 5 sentences).\n3. Build a golden dataset with `question`, `answer`, `relevant_chunks`.\n4. Generate embeddings for chunks and build a FAISS index.\n5. Initialize a minimal RAG architecture (tokenizer + retriever + sequence model).\n6. Run queries, retrieve chunks, generate answers, and score retrieval with Recall@k / Precision@k.\n7. Sweep chunk sizes and embedding models at a fixed retrieval depth `k` to compare configurations; change `K` and re-run to explore other retrieval settings.

In [None]:
from pathlib import Path
import os
import sys

import pandas as pd

# Locate the project root so that the `experiments` package is importable and
# the relative data paths used further down resolve correctly.
#
# On a fresh kernel the notebook's working directory is expected to sit one
# level below the project root, so the root is the parent directory. This cell
# also changes the working directory to the root, which means a naive
# "always take the parent" rule climbs one level higher on every re-run and
# breaks the import below. To keep the cell idempotent, the working directory
# is used as-is when it already contains `experiments/minimal_rag_pipeline`.
_cwd = Path.cwd().resolve()
_package_dir = _cwd / "experiments"
_already_at_root = (
    (_package_dir / "minimal_rag_pipeline.py").is_file()
    or (_package_dir / "minimal_rag_pipeline").is_dir()
)
PROJECT_ROOT = _cwd if _already_at_root else _cwd.parent

# Put the root first on sys.path (once) so the local package is importable.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Relative paths such as "experiments/data" are resolved against the root.
os.chdir(PROJECT_ROOT)

from experiments.minimal_rag_pipeline import (
    load_json,
    save_json,
    chunk_documents,
    build_golden_dataset,
    build_chunk_index,
    evaluate_retrieval,
    FaissRetriever,
    MinimalRAGSequence,
    run_experiment_grid,
)

In [None]:
# Input and output locations. The paths are relative, so they are resolved
# against the current working directory.
DATA_DIR = Path("experiments/data")
DOCS_PATH = DATA_DIR / "minimal_documents.json"
QUESTIONS_PATH = DATA_DIR / "minimal_golden_questions.json"
CHUNKED_PATH = DATA_DIR / "minimal_documents_chunked.json"
GOLDEN_PATH = DATA_DIR / "minimal_golden_dataset.json"

# Number of sentences per chunk window; named so the baseline chunking can be
# tuned in one place instead of editing the call below.
SENTENCES_PER_CHUNK = 5

# Load the raw inputs.
documents = load_json(DOCS_PATH)
golden_questions = load_json(QUESTIONS_PATH)

# Chunk the documents, then derive the golden dataset from the questions and
# the resulting chunks.
chunked_documents, chunks = chunk_documents(
    documents,
    sentences_per_chunk=SENTENCES_PER_CHUNK,
)
golden_dataset = build_golden_dataset(golden_questions, chunks)

# Persist both derived artifacts next to the inputs.
save_json(CHUNKED_PATH, chunked_documents)
save_json(GOLDEN_PATH, golden_dataset)

print(f"Documents: {len(documents)}")
print(f"Chunks: {len(chunks)}")
print(f"Golden rows: {len(golden_dataset)}")
print(f"Chunked dataset saved to: {CHUNKED_PATH}")
print(f"Golden dataset saved to: {GOLDEN_PATH}")

In [None]:
# Baseline retrieval configuration: the embedding model used to index the
# chunks and the `k` passed to the retrieval evaluation.
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
K = 3

# Index the chunks, then score retrieval against the golden dataset.
chunk_index = build_chunk_index(chunks, EMBED_MODEL)
metrics = evaluate_retrieval(golden_dataset, chunk_index, k=K)

# Headline numbers, one per line.
print(
    f"Embedding model: {EMBED_MODEL}",
    f"Mean Precision@{K}: {metrics['mean_precision']:.3f}",
    f"Mean Recall@{K}: {metrics['mean_recall']:.3f}",
    sep="\n",
)

# Detailed rows reported by the evaluation, shown as the cell's rich output.
metric_rows = pd.DataFrame(metrics['rows'])
metric_rows

In [None]:
# Generator settings, named so they can be tuned in one place rather than
# being buried inside the constructor call.
GENERATOR_MODEL = "google/flan-t5-small"
MAX_NEW_TOKENS = 80

# Wire the retriever (backed by the chunk index) into the RAG sequence.
retriever = FaissRetriever(chunk_index, k=K)
rag = MinimalRAGSequence(
    retriever=retriever,
    generator_model_name=GENERATOR_MODEL,
    max_new_tokens=MAX_NEW_TOKENS,
)

# Qualitative check: for every golden question, print the generated answer
# and the retrieved chunk ids next to the chunk ids marked as relevant.
for item in golden_dataset:
    result = rag.generate(item['question'], k=K)
    retrieved_ids = [r['chunk_id'] for r in result['retrieved']]
    print(f"Q: {item['question']}")
    print(f"A: {result['answer']}")
    print(f"Retrieved: {retrieved_ids}")
    print(f"Expected relevant: {item['relevant_chunks']}")
    print("-" * 80)

In [None]:
# Grid to sweep: chunk window sizes (in sentences) and embedding models.
chunk_sizes = [3, 5, 7]
embedding_models = [
    "sentence-transformers/all-MiniLM-L6-v2",
    "sentence-transformers/paraphrase-MiniLM-L3-v2",
]

# Run every combination at the retrieval `k` configured earlier.
experiment_results = run_experiment_grid(
    documents=documents,
    golden_questions=golden_questions,
    chunk_sizes=chunk_sizes,
    embedding_models=embedding_models,
    k=K,
)

# The metric column names embed `k`; build them once and reuse them for both
# the column selection and the sort order.
precision_col = f'precision@{K}'
recall_col = f'recall@{K}'
summary_columns = [
    'chunk_size_sentences',
    'embedding_model',
    precision_col,
    recall_col,
    'num_chunks',
]

summary = pd.DataFrame(experiment_results)[summary_columns]

# Best configurations first: highest recall, ties broken by precision.
summary.sort_values(by=[recall_col, precision_col], ascending=False)