# RAG Pipeline (TF-IDF + Cosine)

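This notebook implements a CPU-friendly RAG pipeline using TF-IDF retrieval and evaluates its accuracy on a small sample of physics QA questions. The answer-generation step is a deterministic stub (no LLM is called), so the reported accuracy reflects the stub's keyword heuristics rather than a real model.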

In [None]:
# Imports (CPU-friendly)
import os, json, pathlib
import numpy as np
import pandas as pd
from typing import List, Dict, Any

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt

# Notebook-wide matplotlib default: every figure is created with figsize (7, 4).
plt.rcParams.update({"figure.figsize": (7, 4)})
# Printed last so the banner confirms the whole import/config cell ran.
print("\u2713 Libraries loaded")

## 1) Load / create physics corpus

In [None]:
# Load the corpus; if the file is missing, create a tiny demo corpus first so
# the notebook still runs end-to-end on a fresh checkout.
CORPUS_PATH = pathlib.Path("../data/corpus/physics_abstracts.json")
CORPUS_PATH.parent.mkdir(parents=True, exist_ok=True)

if not CORPUS_PATH.exists():
    demo_corpus = [
        {"title":"Simple Pendulum Basics","content":"The period of a simple pendulum for small angles is T = 2π√(L/g). It depends on length L and gravity g."},
        {"title":"Coulomb's Law","content":"The electric force between two charges is F = k_e q1 q2 / r^2. The electric field relates to force per unit charge."},
        {"title":"Heisenberg Principle","content":"The uncertainty principle states Δx Δp ≥ ħ/2, affecting simultaneous measurement of position and momentum."}
    ]
    # Explicit UTF-8: without it open() uses the platform's locale encoding,
    # which differs between machines.
    with open(CORPUS_PATH, "w", encoding="utf-8") as f:
        json.dump(demo_corpus, f, indent=2)
    print("Demo corpus created at", CORPUS_PATH)
else:
    print("Using existing corpus:", CORPUS_PATH)

# Read as UTF-8 so an existing corpus containing symbols such as π, Δ or ħ
# loads the same way on every platform.
with open(CORPUS_PATH, "r", encoding="utf-8") as f:
    corpus: List[Dict[str,str]] = json.load(f)

# Fail here with a clear message instead of an IndexError on corpus[0] below.
if not corpus:
    raise ValueError(f"Corpus at {CORPUS_PATH} contains no documents")

print(f"Loaded {len(corpus)} documents")
print("Sample doc:\n", json.dumps(corpus[0], indent=2))

## 2) TF-IDF Retriever (Cosine similarity)

In [None]:
class TfidfPhysicsRetriever:
    """TF-IDF based retriever for physics documents (CPU-friendly).

    The vectorizer (English stop words removed, vocabulary capped at 20,000
    features) is fitted once, at construction time, on the ``"content"`` field
    of every document. Queries are then ranked against that matrix by cosine
    similarity.
    """
    def __init__(self, documents: List[Dict[str,str]]):
        """Fit the TF-IDF model on ``documents``.

        Args:
            documents: Dicts that each provide a ``"content"`` key. The list is
                stored as-is and its items are what ``search`` returns.

        Raises:
            KeyError: If a document has no ``"content"`` key.
        """
        self.documents = documents
        self.contents = [doc["content"] for doc in documents]
        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=20000)
        self.doc_mat = self.vectorizer.fit_transform(self.contents)
        print(f"Initialized TF-IDF retriever with {len(documents)} docs ")

    def search(self, query: str, k: int = 5) -> List[Dict[str,str]]:
        """Return top-k docs by cosine similarity; skip zero-similarity docs.

        Args:
            query: Free-text query. A blank or whitespace-only query yields ``[]``.
            k: Maximum number of documents to return. ``k <= 0`` yields ``[]``.

        Returns:
            At most ``k`` documents, most similar first. Documents whose
            similarity to the query is 0 are dropped, so the list may be
            shorter than ``k`` or empty.
        """
        # Guard k explicitly: `sims.argsort()[-k:]` with k == 0 is `[-0:]`,
        # i.e. the whole array, which would return every matching document
        # instead of none (and a negative k would select an arbitrary slice).
        if k <= 0 or not query.strip():
            return []
        q_vec = self.vectorizer.transform([query])
        sims = cosine_similarity(q_vec, self.doc_mat).ravel()
        # argsort is ascending: keep the last k indices, then reverse them so
        # the best match comes first.
        top_idx = sims.argsort()[-k:][::-1]
        return [self.documents[i] for i in top_idx if sims[i] > 0]

# Build the retriever over the loaded corpus; the TF-IDF fit runs inside the constructor.
retriever = TfidfPhysicsRetriever(documents=corpus)
print("\u2713 Retriever ready")

## 3) Quick retrieval sanity-check

In [None]:
# One probe query per demo-corpus topic; each should surface its matching document.
test_queries = [
    "pendulum period calculation",
    "Coulomb electric field",
    "Heisenberg uncertainty principle",
]

for query in test_queries:
    print("\nQuery:", query)
    top_docs = retriever.search(query, k=2)
    if top_docs:
        # Show rank, title, and the first 90 characters of each hit.
        for rank, hit in enumerate(top_docs, start=1):
            snippet = hit["content"][:90]
            print(f"  {rank}. {hit['title']} -> {snippet}...")
    else:
        print("  (no relevant documents)")

## 4) RAG Model (uses retriever context + simple reasoning stub)

*Note:* In production, this step would compose a prompt from the retrieved context and send it to your central LLM. Here the answer logic is kept minimal and deterministic (keyword heuristics) for demonstration.

In [None]:
class RAGPhysicsModel:
    """Physics QA with TF-IDF retrieval. (LLM call stubbed for demo.)

    Retrieval is real; answer generation is a deterministic keyword heuristic
    that returns canned strings, standing in for an LLM call.
    """
    def __init__(self, retriever: "TfidfPhysicsRetriever"):
        """Store the retriever; only its ``search(question, k=...)`` method is used."""
        self.retriever = retriever
        self.name = "TF-IDF RAG (LLM-stub)"

    def answer(self, question: str) -> Dict[str, Any]:
        """Answer ``question`` using up to three retrieved documents.

        Args:
            question: Natural-language physics question.

        Returns:
            Dict with keys:
              - ``"answer"``: the answer text;
              - ``"context_used"``: number of retrieved documents;
              - ``"sources"``: titles of the retrieved documents.
            When retrieval finds nothing, a fixed fallback message is returned
            with ``context_used == 0`` and no sources.
        """
        docs = self.retriever.search(question, k=3)
        if not docs:
            return {"answer": "No relevant sources found; providing a general physics explanation.",
                    "context_used": 0, "sources": []}

        # Compose minimal context string (in practice you'd format a prompt)
        context = "\n".join(d["content"] for d in docs)

        # Very small heuristic to show improvement with context.
        # Both question and context are lower-cased so keyword checks do not
        # depend on how the source documents capitalise words (e.g. a document
        # that starts with "Uncertainty ..." must still match).
        qlow = question.lower()
        clow = context.lower()
        if "pendulum" in qlow:
            ans = "T = 2π√(L/g). For L=2 m, g=9.81 m/s^2, T ≈ 2π√(2/9.81) ≈ 2.84 s"
        elif "newton" in qlow:
            ans = "Newton's second law: F = m a"
        elif "planck" in qlow and ("ħ" in clow or "uncertainty" in clow or "principle" in clow):
            ans = "Dimensions of Planck's constant: [M L^2 T^-1]"
        else:
            ans = f"Based on {len(docs)} retrieved sources, the relevant physics relation is provided."

        return {"answer": ans, "context_used": len(docs), "sources": [d["title"] for d in docs]}

# Wire the stub model to the fitted retriever and report which model is active.
rag_model = RAGPhysicsModel(retriever=retriever)
print("Model:", rag_model.name)

## 5) Load / create evaluation set (physics QA)

In [None]:
# Load the evaluation set; if the file is missing, write a five-question demo
# set first so the notebook still runs end-to-end on a fresh checkout.
EVAL_PATH = pathlib.Path("../data/evaluation/physics_qa_dataset.json")
EVAL_PATH.parent.mkdir(parents=True, exist_ok=True)

if not EVAL_PATH.exists():
    demo_eval = {
        "physics_qa_dataset": [
            {"question":"What is the period of a simple pendulum of length 2 m on Earth?", "answer":"2.84 s"},
            {"question":"State Newton's second law.", "answer":"F = m a"},
            {"question":"What are the dimensions of Planck's constant?", "answer":"[M L^2 T^-1]"},
            {"question":"Compute kinetic energy for m=5 kg, v=10 m/s.", "answer":"250 J"},
            {"question":"What is the relation between photon energy and frequency?", "answer":"E = h f"}
        ]
    }
    # Explicit UTF-8: without it open() uses the platform's locale encoding,
    # which differs between machines.
    with open(EVAL_PATH, "w", encoding="utf-8") as f:
        json.dump(demo_eval, f, indent=2)
    print("Demo eval set created at", EVAL_PATH)
else:
    print("Using existing eval set:", EVAL_PATH)

# Read as UTF-8 so an existing dataset with non-ASCII symbols loads the same
# way on every platform. The questions live under the top-level
# "physics_qa_dataset" key.
with open(EVAL_PATH, "r", encoding="utf-8") as f:
    eval_data = json.load(f)["physics_qa_dataset"]

print(f"Loaded {len(eval_data)} evaluation questions")

## 6) Evaluate RAG accuracy on a small sample (N questions)

In [None]:
def evaluate_rag(model: "RAGPhysicsModel", questions: List[Dict[str,str]], n: int = 5):
    """Score ``model`` on the first ``n`` questions.

    An answer counts as correct when the expected answer appears as a
    case-insensitive substring of the model's answer.

    Args:
        model: Object whose ``answer(question)`` returns a dict with an
            ``"answer"`` key and, optionally, a ``"sources"`` key.
        questions: Dicts with ``"question"`` and ``"answer"`` keys.
        n: Maximum number of questions to evaluate, taken from the front of
            ``questions``. Values <= 0 evaluate nothing.

    Returns:
        ``(df, acc)`` where ``df`` is a DataFrame with one row per evaluated
        question (question, expected, got, sources, correct) and ``acc`` is
        the fraction correct (0.0 when nothing was evaluated).
    """
    # Clamp at 0: with a negative n, `questions[:n]` would still evaluate all
    # but the last |n| questions while the denominator below collapsed to 1,
    # so the reported "accuracy" could exceed 1.0.
    N = max(0, min(n, len(questions)))
    rows = []
    for q in questions[:N]:
        out = model.answer(q["question"])
        got = out["answer"]
        exp = q["answer"]
        rows.append({
            "question": q["question"],
            "expected": exp,
            "got": got,
            "sources": out.get("sources", []),
            "correct": exp.lower() in got.lower()
        })
    correct = sum(1 for row in rows if row["correct"])
    # max(1, N) avoids a division by zero when no question was evaluated.
    acc = correct / max(1, N)
    return pd.DataFrame(rows), acc

# Score the stub model on the first five evaluation questions, then print the
# per-question table followed by the overall accuracy as a percentage.
df_rag, rag_acc = evaluate_rag(model=rag_model, questions=eval_data, n=5)
print(df_rag.to_string(index=False))
print(f"\nRAG Accuracy (small sample): {rag_acc*100:.1f}%")

## 7) Plot accuracy (optionally compare vs baseline if you have it)

In [None]:
# If you have a real baseline accuracy number, set it here; otherwise only plot RAG.
BASELINE_ACC = None  # e.g., 0.423 if available

# Start with the RAG bar alone; prepend the baseline bar only when a numeric
# baseline has been supplied above.
labels, values = ["RAG"], [rag_acc]
if isinstance(BASELINE_ACC, (int, float)):
    labels = ["Baseline", "RAG"]
    values = [BASELINE_ACC, rag_acc]

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
ax.bar(labels, values)
# Accuracy is a fraction in [0, 1]; the extra 0.1 of headroom keeps the
# percentage label of a ~100% bar inside the axes instead of on top of the title.
ax.set_ylim(0, 1.1)
ax.set_ylabel("Accuracy (fraction correct)")
ax.set_title("Accuracy (small sample)")
# Annotate each bar with its value as a percentage, just above the bar top.
for i, v in enumerate(values):
    ax.text(i, v + 0.02, f"{v*100:.1f}%", ha='center')
plt.show()