# Step 2: Build Naive RAG

In [1]:
!pip -q install datasets sentence-transformers faiss-cpu transformers accelerate pymilvus==2.4.4 pandas numpy



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.0/196.0 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m126.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.4/57.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.4/48.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
grpcio-status 1.71.2 requires grpcio>=1.71.2, but you have grpcio 1.63.0 which is incompatible.[0m[

In [2]:
import os, json, logging
from dataclasses import dataclass, asdict
import numpy as np
import pandas as pd
from datasets import load_dataset

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger("rag_step2")

@dataclass
class Config:
    dataset_name: str = "rag-datasets/rag-mini-wikipedia"
    split: str = "test"
    n_docs: int = 2000    # number of passages to embed
    backend: str = "faiss"
    emb_model: str = "sentence-transformers/all-MiniLM-L6-v2"
    gen_model: str = "facebook/bart-large"
    normalize: bool = True
    top_k: int = 3
    max_gen_length: int = 64
    batch_size: int = 64
    milvus_uri: str = "milvus_lite.db"
    milvus_collection: str = "rag_corpus_step2"

cfg = Config()
print(json.dumps(asdict(cfg), indent=2))


{
  "dataset_name": "rag-datasets/rag-mini-wikipedia",
  "split": "test",
  "n_docs": 2000,
  "backend": "faiss",
  "emb_model": "sentence-transformers/all-MiniLM-L6-v2",
  "gen_model": "facebook/bart-large",
  "normalize": true,
  "top_k": 3,
  "max_gen_length": 64,
  "batch_size": 64,
  "milvus_uri": "milvus_lite.db",
  "milvus_collection": "rag_corpus_step2"
}


In [3]:
# Corpus: text-corpus
ds_corpus = load_dataset(cfg.dataset_name, "text-corpus")
corpus = ds_corpus["passages"]
corpus = corpus.select(range(min(cfg.n_docs, len(corpus))))

# QA dataset: question-answer
ds_qa = load_dataset(cfg.dataset_name, "question-answer")
qa = ds_qa[cfg.split]

print("Corpus size:", len(corpus))
print("QA size:", len(qa))

# Extract raw texts from corpus
raw_passages = [row.get("text") or row.get("passage") or "" for row in corpus]
print("Sample passage:", raw_passages[0][:300], "...")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/719 [00:00<?, ?B/s]

data/passages.parquet/part.0.parquet:   0%|          | 0.00/797k [00:00<?, ?B/s]

Generating passages split:   0%|          | 0/3200 [00:00<?, ? examples/s]

data/test.parquet/part.0.parquet:   0%|          | 0.00/54.4k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/918 [00:00<?, ? examples/s]

Corpus size: 2000
QA size: 918
Sample passage: Uruguay (official full name in  ; pron.  , Eastern Republic of  Uruguay) is a country located in the southeastern part of South America.  It is home to 3.3 million people, of which 1.7 million live in the capital Montevideo and its metropolitan area. ...


In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode text passages
raw_passages = [row.get("text") or row.get("passage") or "" for row in corpus]
embeddings = embedding_model.encode(raw_passages, batch_size=64, convert_to_numpy=True, show_progress_bar=True)

print("Embeddings shape:", embeddings.shape)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Embeddings shape: (2000, 384)


In [20]:
import faiss

In [34]:
import numpy as np
import faiss

# Convert embeddings to float32
emb_array = np.array(embeddings, dtype="float32")

# Normalize embeddings for cosine similarity
faiss.normalize_L2(emb_array)

# Create FAISS index (Inner Product = cosine similarity with normalized vectors)
d = emb_array.shape[1]  # embedding dimension
faiss_index = faiss.IndexFlatIP(d)

# Add embeddings to index
faiss_index.add(emb_array)

print("FAISS index size:", faiss_index.ntotal)



FAISS index size: 2000


In [35]:
# Encode and normalize query
query_emb = embedding_model.encode([query], convert_to_numpy=True).astype("float32")
faiss.normalize_L2(query_emb)

# Top-k search
D, I = faiss_index.search(query_emb, k=3)

# Fetch contexts
contexts = [raw_passages[idx] for idx in I[0]]


In [22]:
# Example single query
query = qa[0]["question"]
query_embedding = embedding_model.encode([query], convert_to_numpy=True)

print("Query:", query)
print("Embedding shape:", query_embedding.shape)


Query: Was Abraham Lincoln the sixteenth President of the United States?
Embedding shape: (1, 384)


In [36]:
def faiss_search(query: str, k: int = 5):
    qv = embedding_model.encode([query], convert_to_numpy=True).astype("float32").reshape(1, -1)
    D, I = index.search(qv, k)
    results = [(raw_passages[i], float(D[0][j])) for j, i in enumerate(I[0])]
    return results


In [37]:

results = faiss_search(query, k=3)   # k=3 = top-3 results

print("Search output:")
for passage, score in results:
    print(f"Score: {score:.4f}, Passage: {passage[:200]}...")



Search output:
Score: 0.7095, Passage: Young Abraham Lincoln...
Score: 0.5840, Passage: Abraham Lincoln (February 12, 1809 â April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination. As an outspoken opponent of the expansion o...
Score: 0.5569, Passage: Sixteen months before his death, his son, John Quincy Adams, became the sixth President of the United States (1825 1829), the only son of a former President to hold the office until George W. Bush in ...


In [38]:
system_prompt = "You are a helpful assistant that answers based only on the provided context."

results = faiss_search(query, k=3)
contexts = [passage for passage, score in results]
context = " ".join(contexts)

prompt = f"""{system_prompt}\nContext: {context}\nQuestion: {query}"""
print(prompt)


You are a helpful assistant that answers based only on the provided context.
Context: Young Abraham Lincoln Abraham Lincoln (February 12, 1809 â April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination. As an outspoken opponent of the expansion of slavery in the United States, "[I]n his short autobiography written for the 1860 presidential campaign, Lincoln would describe his protest in the Illinois legislature as one that 'briefly defined his position on the slavery question, and so far as it goes, it was then the same that it is now." This was in reference to the anti-expansion sentiments he had then expressed. Doris Kearns Goodwin, Team of Rivals: The Political Genius of Abraham Lincoln (2005) p. 91.  Holzer pg. 232.  Writing of the Cooper Union  speech, Holzer notes, "Cooper Union proved a unique confluence of political culture, rhetorical opportunity, technological innovation, and human genius, and it brought Abraham Li

In [39]:
# Top-1 result (FAISS returns list of (passage, score) tuples)
top_hit = results[0][0]   # first query, first hit
context = top_hit[0]      # passage text is at index 0, score at index 1

print("Retrieved context:", context[:200], "...")


Retrieved context: Y ...


In [30]:
from transformers import pipeline

generator = pipeline("text2text-generation", model="facebook/bart-large", device_map="auto")

response = generator(prompt, max_length=64, truncation=True)
print("Generated answer:", response[0]["generated_text"])


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=64) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generated answer: You are a helpful assistant that answers based only on the provided context.Context: Young Abraham Lincoln Abraham Lincoln (February 12, 1809 â April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination. As an outspoken opponent of the expansion of slavery, "[I]n his short autobiography written for the 1860 presidential campaign, Lincoln would describe his protest in the Illinois legislature as one that 'briefly defined his position on the slavery question, and so far as it goes, it was then the same that it is now." This was in reference to the anti-expansion sentiments he had then expressed.  Holzer pg. 232.  Writing of the Cooper Union  speech, Holzer notes, "Cooper Union proved a unique confluence of political culture, rhetorical opportunity, technological innovation, and human genius, and it brought Abraham Lincoln to the center stage of American politics at precisely the right time and place, and with p

# Step 3: Evaluation Phase I

In [31]:
!pip install -q evaluate


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [42]:
import evaluate
import pandas as pd
import numpy as np
import random
from tqdm import tqdm

# Load squad metric
metric = evaluate.load("squad")

# --- Prompt strategies ---
def build_prompt(strategy, context, question):
    if strategy == "instruction":
        return f"Answer the question based only on the context.\nContext: {context}\nQuestion: {question}"
    elif strategy == "cot":
        return f"Let's reason step by step.\nContext: {context}\nQuestion: {question}\nAnswer:"
    elif strategy == "persona":
        return f"You are a knowledgeable historian specializing in US Presidents. Using the context provided, answer the question.\nContext: {context}\nQuestion: {question}"
    else:
        return f"Context: {context}\nQuestion: {question}"

# --- Bootstrap confidence intervals ---
def bootstrap_ci(metric_func, preds, refs, n_bootstraps=200, alpha=0.05):
    scores = []
    for _ in range(n_bootstraps):
        idxs = [random.randint(0, len(preds)-1) for _ in range(len(preds))]
        p_sample = [preds[i] for i in idxs]
        r_sample = [refs[i] for i in idxs]
        score = metric_func.compute(predictions=p_sample, references=r_sample)
        scores.append(score["f1"])
    lower = np.percentile(scores, alpha/2*100)
    upper = np.percentile(scores, (1-alpha/2)*100)
    return lower, upper

# --- Failure Analysis ---
def categorize_errors(preds, refs):
    errors = {"context_mismatch":0, "hallucination":0, "incomplete":0, "yesno_flip":0}
    for pred, ref in zip(preds, refs):
        gold = ref["answers"]["text"][0].lower()
        guess = pred["prediction_text"].lower()

        if gold not in guess:
            errors["context_mismatch"] += 1
        elif gold in guess and len(guess) < len(gold):
            errors["incomplete"] += 1
        elif gold in ["yes","no"] and guess.strip() in ["yes","no"] and gold!=guess:
            errors["yesno_flip"] += 1
        else:
            errors["hallucination"] += 1
    return errors

# --- Evaluation function ---
def evaluate_strategy(strategy, dataset, max_samples=120):
    preds, refs = [], []

    for row in tqdm(dataset.select(range(max_samples)), desc=f"Running {strategy}"):
        q = row["question"]
        q_emb = embedding_model.encode([q], convert_to_numpy=True).astype("float32")
        faiss.normalize_L2(q_emb)

        # retrieve top-3 passages
        D, I = faiss_index.search(q_emb.reshape(1, -1), k=3)
        contexts = [raw_passages[idx] for idx in I[0] if idx < len(raw_passages)]
        context = " ".join(contexts)

        # build prompt + generate
        prompt = build_prompt(strategy, context, q)
        gen = generator(prompt, max_new_tokens=64, truncation=True)[0]["generated_text"]

        preds.append({"id": str(row["id"]), "prediction_text": gen})
        refs.append({"id": str(row["id"]), "answers": {"text": [row["answer"]], "answer_start": [0]}})

    # metrics
    results = metric.compute(predictions=preds, references=refs)
    ci_low, ci_high = bootstrap_ci(metric, preds, refs)
    errors = categorize_errors(preds, refs)

    return results, ci_low, ci_high, errors, preds, refs

# --- Run multiple strategies ---
strategies = ["instruction", "cot", "persona"]
all_results = []

for strat in strategies:
    results, ci_low, ci_high, errors, preds, refs = evaluate_strategy(strat, qa, max_samples=150)  # 100–200 queries

    print(f"\nStrategy: {strat}")
    print(f"F1: {results['f1']:.2f} (95% CI: {ci_low:.2f} – {ci_high:.2f})")
    print(f"Exact Match: {results['exact_match']:.2f}")
    print("Failure breakdown:", errors)

    all_results.append({
        "strategy": strat,
        "f1": results["f1"],
        "exact_match": results["exact_match"],
        "f1_ci_low": ci_low,
        "f1_ci_high": ci_high,
        "errors": errors
    })

    # Save predictions per strategy
    pd.DataFrame(preds).to_csv(f"preds_{strat}.csv", index=False)

# Save summary table
df_results = pd.DataFrame(all_results)
df_results.to_csv("strategy_metrics.csv", index=False)
print("\nSaved summary metrics to strategy_metrics.csv")
print(df_results)


Running instruction: 100%|██████████| 150/150 [02:16<00:00,  1.10it/s]



Strategy: instruction
F1: 2.65 (95% CI: 1.90 – 3.65)
Exact Match: 0.00
Failure breakdown: {'context_mismatch': 127, 'hallucination': 23, 'incomplete': 0, 'yesno_flip': 0}


Running cot: 100%|██████████| 150/150 [02:17<00:00,  1.09it/s]



Strategy: cot
F1: 2.81 (95% CI: 1.98 – 3.81)
Exact Match: 0.00
Failure breakdown: {'context_mismatch': 123, 'hallucination': 27, 'incomplete': 0, 'yesno_flip': 0}


Running persona: 100%|██████████| 150/150 [02:17<00:00,  1.09it/s]



Strategy: persona
F1: 2.72 (95% CI: 1.72 – 3.43)
Exact Match: 0.00
Failure breakdown: {'context_mismatch': 122, 'hallucination': 28, 'incomplete': 0, 'yesno_flip': 0}

Saved summary metrics to strategy_metrics.csv
      strategy        f1  exact_match  f1_ci_low  f1_ci_high  \
0  instruction  2.645235          0.0   1.896833    3.647449   
1          cot  2.811932          0.0   1.984794    3.808152   
2      persona  2.724774          0.0   1.724258    3.429596   

                                              errors  
0  {'context_mismatch': 127, 'hallucination': 23,...  
1  {'context_mismatch': 123, 'hallucination': 27,...  
2  {'context_mismatch': 122, 'hallucination': 28,...  


# Step 4 - Experimentation

In [45]:
import faiss
import numpy as np
import pandas as pd
from tqdm import tqdm
import evaluate, random
from sentence_transformers import SentenceTransformer

metric = evaluate.load("squad")

# --- Function to normalize embeddings ---
def build_faiss_index(embeddings, dim):
    emb_array = np.array(embeddings, dtype="float32")
    faiss.normalize_L2(emb_array)   # normalize for cosine similarity
    index = faiss.IndexFlatIP(dim)  # inner product = cosine similarity
    index.add(emb_array)
    return index

# --- Bootstrap CI ---
def bootstrap_ci(metric_func, preds, refs, n_bootstraps=200, alpha=0.05):
    scores = []
    for _ in range(n_bootstraps):
        idxs = [random.randint(0, len(preds)-1) for _ in range(len(preds))]
        p_sample = [preds[i] for i in idxs]
        r_sample = [refs[i] for i in idxs]
        score = metric_func.compute(predictions=p_sample, references=r_sample)
        scores.append(score["f1"])
    lower = np.percentile(scores, alpha/2*100)
    upper = np.percentile(scores, (1-alpha/2)*100)
    return lower, upper

# --- Failure Analysis ---
def categorize_errors(preds, refs):
    errors = {"context_mismatch":0, "hallucination":0, "incomplete":0, "yesno_flip":0}
    for pred, ref in zip(preds, refs):
        gold = ref["answers"]["text"][0].lower()
        guess = pred["prediction_text"].lower()
        if gold not in guess:
            errors["context_mismatch"] += 1
        elif gold in guess and len(guess) < len(gold):
            errors["incomplete"] += 1
        elif gold in ["yes","no"] and guess.strip() in ["yes","no"] and gold!=guess:
            errors["yesno_flip"] += 1
        else:
            errors["hallucination"] += 1
    return errors

# --- Evaluation for given embedding + retrieval k ---
def evaluate_combo(embed_size, k, dataset, max_samples=120):
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    model = SentenceTransformer(model_name)
    raw_embs = model.encode(raw_passages, convert_to_numpy=True, normalize_embeddings=True)

    projector = None
    if raw_embs.shape[1] != embed_size:
        projector = faiss.PCAMatrix(raw_embs.shape[1], embed_size)
        projector.train(raw_embs)
        raw_embs = projector.apply_py(raw_embs)

    # 2. Build FAISS index
    faiss_index = build_faiss_index(raw_embs, raw_embs.shape[1])

    preds, refs = [], []
    for row in tqdm(dataset.select(range(max_samples)), desc=f"Embed {embed_size}, top-{k}"):
        q = row["question"]
        q_emb = model.encode([q], convert_to_numpy=True, normalize_embeddings=True).astype("float32")

        if projector is not None:   # ✅ FIX: only apply if projector exists
            q_emb = projector.apply_py(q_emb)

        # Sanity check
        assert q_emb.shape[1] == faiss_index.d, f"Dim mismatch: query={q_emb.shape[1]}, index={faiss_index.d}"

        # Search top-k
        D, I = faiss_index.search(q_emb.reshape(1, -1), k)
        contexts = [raw_passages[idx] for idx in I[0] if idx < len(raw_passages)]
        context = " ".join(contexts)

        prompt = f"Answer the question based only on the context.\nContext: {context}\nQuestion: {q}"
        gen = generator(prompt, max_new_tokens=64, truncation=True)[0]["generated_text"]

        preds.append({"id": str(row["id"]), "prediction_text": gen})
        refs.append({"id": str(row["id"]), "answers": {"text": [row["answer"]], "answer_start": [0]}})

    results = metric.compute(predictions=preds, references=refs)
    ci_low, ci_high = bootstrap_ci(metric, preds, refs)
    errors = categorize_errors(preds, refs)
    return results, ci_low, ci_high, errors

# --- Run experiments ---
embed_sizes = [256, 384]
retrieval_ks = [3, 5, 10]

all_results = []
for embed_size in embed_sizes:
    for k in retrieval_ks:
        results, ci_low, ci_high, errors = evaluate_combo(embed_size, k, qa, max_samples=150)
        print(f"\nEmbed={embed_size}, top-{k}")
        print(f"F1={results['f1']:.2f} (95% CI: {ci_low:.2f}-{ci_high:.2f}), EM={results['exact_match']:.2f}")
        print("Errors:", errors)

        all_results.append({
            "embedding_dim": embed_size,
            "top_k": k,
            "f1": results["f1"],
            "exact_match": results["exact_match"],
            "f1_ci_low": ci_low,
            "f1_ci_high": ci_high,
            "errors": errors
        })

# Save summary
df_results = pd.DataFrame(all_results)
df_results.to_csv("step4_experiments.csv", index=False)
print("\nSaved results to step4_experiments.csv")
print(df_results)


Embed 256, top-3: 100%|██████████| 150/150 [02:17<00:00,  1.09it/s]



Embed=256, top-3
F1=2.62 (95% CI: 1.83-3.54), EM=0.00
Errors: {'context_mismatch': 127, 'hallucination': 23, 'incomplete': 0, 'yesno_flip': 0}


Embed 256, top-5: 100%|██████████| 150/150 [02:20<00:00,  1.07it/s]



Embed=256, top-5
F1=2.73 (95% CI: 1.80-3.70), EM=0.00
Errors: {'context_mismatch': 125, 'hallucination': 25, 'incomplete': 0, 'yesno_flip': 0}


Embed 256, top-10: 100%|██████████| 150/150 [02:24<00:00,  1.04it/s]



Embed=256, top-10
F1=2.70 (95% CI: 1.90-3.70), EM=0.00
Errors: {'context_mismatch': 128, 'hallucination': 22, 'incomplete': 0, 'yesno_flip': 0}


Embed 384, top-3: 100%|██████████| 150/150 [02:17<00:00,  1.09it/s]



Embed=384, top-3
F1=2.65 (95% CI: 1.85-3.56), EM=0.00
Errors: {'context_mismatch': 127, 'hallucination': 23, 'incomplete': 0, 'yesno_flip': 0}


Embed 384, top-5: 100%|██████████| 150/150 [02:19<00:00,  1.08it/s]



Embed=384, top-5
F1=2.74 (95% CI: 1.94-3.71), EM=0.00
Errors: {'context_mismatch': 124, 'hallucination': 26, 'incomplete': 0, 'yesno_flip': 0}


Embed 384, top-10: 100%|██████████| 150/150 [02:24<00:00,  1.04it/s]



Embed=384, top-10
F1=2.45 (95% CI: 1.68-3.22), EM=0.00
Errors: {'context_mismatch': 128, 'hallucination': 22, 'incomplete': 0, 'yesno_flip': 0}

Saved results to step4_experiments.csv
   embedding_dim  top_k        f1  exact_match  f1_ci_low  f1_ci_high  \
0            256      3  2.621181          0.0   1.832261    3.541514   
1            256      5  2.731383          0.0   1.801604    3.700805   
2            256     10  2.696525          0.0   1.903727    3.701019   
3            384      3  2.645235          0.0   1.853535    3.555116   
4            384      5  2.744406          0.0   1.942912    3.705791   
5            384     10  2.445345          0.0   1.678442    3.221874   

                                              errors  
0  {'context_mismatch': 127, 'hallucination': 23,...  
1  {'context_mismatch': 125, 'hallucination': 25,...  
2  {'context_mismatch': 128, 'hallucination': 22,...  
3  {'context_mismatch': 127, 'hallucination': 23,...  
4  {'context_mismatch': 124,

#Step 5 - Add Two Advanced RAG Features

In [50]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import pipeline

# Base embedding model
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Cross-encoder reranker (BERT-based)
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# Generator
generator = pipeline("text-generation", model="distilgpt2")


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


In [51]:

embeddings = embed_model.encode(raw_passages, convert_to_numpy=True, normalize_embeddings=True)

d = embeddings.shape[1]
faiss_index = faiss.IndexFlatIP(d)
faiss.normalize_L2(embeddings)
faiss_index.add(embeddings)

print("FAISS index size:", faiss_index.ntotal)


FAISS index size: 2000


In [46]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

rewrite_model = AutoModelForSeq2SeqLM.from_pretrained("castorini/t5-base-canard")
rewrite_tokenizer = AutoTokenizer.from_pretrained("castorini/t5-base-canard")

def rewrite_query(query: str) -> str:
    inputs = rewrite_tokenizer(query, return_tensors="pt")
    outputs = rewrite_model.generate(**inputs, max_new_tokens=32)
    return rewrite_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test
print("Original:", "Who was the 16th president of the US?")
print("Rewritten:", rewrite_query("Who was the 16th president of the US?"))


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Original: Who was the 16th president of the US?
Rewritten: Who was the 16th president of the US ?


In [47]:
def retrieve_with_reranking(query: str, k=10, top_rerank=3):
    #  Rewrite query
    rewritten = rewrite_query(query)

    #  Embed and retrieve top-k
    q_emb = embed_model.encode([rewritten], convert_to_numpy=True, normalize_embeddings=True).astype("float32")
    D, I = faiss_index.search(q_emb.reshape(1, -1), k)
    candidate_passages = [raw_passages[idx] for idx in I[0]]

    #  Rerank using cross-encoder
    pairs = [(rewritten, passage) for passage in candidate_passages]
    scores = reranker.predict(pairs)

    ranked = sorted(zip(candidate_passages, scores), key=lambda x: x[1], reverse=True)
    top_passages = [p for p, _ in ranked[:top_rerank]]

    return " ".join(top_passages)


In [66]:
def answer_query(query, k=3, return_contexts=False):
    # Encode query
    q_emb = embedding_model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype("float32")

    # Retrieve top-k passages from FAISS
    D, I = faiss_index.search(q_emb.reshape(1, -1), k)
    contexts = [raw_passages[idx] for idx in I[0] if idx < len(raw_passages)]
    context = " ".join(contexts)

    # Build prompt
    prompt = f"Answer the question based only on the context.\nContext: {context}\nQuestion: {query}"

    # Generate
    gen = generator(prompt, max_new_tokens=128, truncation=True)[0]["generated_text"]

    # Return answer and optionally the retrieved contexts
    if return_contexts:
        return gen, contexts
    return gen



In [67]:
print(answer_query("Who was the 16th president of the United States?"))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Answer the question based only on the context.
Context: Sixteen months before his death, his son, John Quincy Adams, became the sixth President of the United States (1825 1829), the only son of a former President to hold the office until George W. Bush in 2001. The second president from Ohio, Grant was the 18th President of the United States and served two terms from March 4, 1869, to March 4, 1877. In the 1872 election he won by a landslide against the breakaway Liberal Republican party that nominated Horace Greeley. Grover Cleveland
Question: Who was the 16th president of the United States?
Context: Sixteen months before his death, his son, John Quincy Adams, became the sixth President of the United States and served two terms from March 4, 1877 to March 4, 1877. In the 1872 election he won by a landslide against the breakaway Liberal Republican party that nominated Horace Greeley. Grover Cleveland
Question: Who was the 16th President of the United States?
Context: Sixteen months bef

In [68]:
ans, ctxs = answer_query("Who was the 16th president of the United States?", return_contexts=True)
print("Answer:", ans)
print("Contexts:", ctxs)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Answer: Answer the question based only on the context.
Context: Sixteen months before his death, his son, John Quincy Adams, became the sixth President of the United States (1825 1829), the only son of a former President to hold the office until George W. Bush in 2001. The second president from Ohio, Grant was the 18th President of the United States and served two terms from March 4, 1869, to March 4, 1877. In the 1872 election he won by a landslide against the breakaway Liberal Republican party that nominated Horace Greeley. Grover Cleveland
Question: Who was the 16th president of the United States? Was the 16th president of the United States by an electoral margin?
Context: Sixteen months before his death, his son, John Quincy Adams, became the sixth President of the United States (1825 1829), the only son of a former President to hold the office until George W. Bush in 2001. The second president from Ohio, Grant was the 18th President of the United States and served two terms from M

In [57]:
import evaluate
metric = evaluate.load("squad")

preds, refs = [], []
subset = qa.select(range(100))  # speed up with 100 queries

for row in subset:
    query = row["question"]
    gen = answer_query(query)
    preds.append({"id": str(row["id"]), "prediction_text": gen})
    refs.append({"id": str(row["id"]), "answers": {"text": [row["answer"]], "answer_start": [0]}})

results = metric.compute(predictions=preds, references=refs)
print("Enhanced RAG Results:", results)


Enhanced RAG Results: {'exact_match': 0.0, 'f1': 0.8378776484349595}


#Step 6: Advanced Evaluation with RAGAs

In [58]:
!pip install -q ragas

from ragas.metrics import faithfulness, context_precision, context_recall, answer_relevancy
from ragas.evaluation import evaluate
from datasets import Dataset


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/303.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.6/303.6 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/45.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/155.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.5/155.5 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m100.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [74]:
import os
os.environ["OPENAI_API_KEY"] = "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"


In [75]:
import evaluate
from ragas.metrics import faithfulness, context_precision, context_recall, answer_relevancy
from ragas import evaluate as ragas_eval
from ragas.dataset import Dataset as RagasDataset
import pandas as pd

# Load SQuAD metric
squad_metric = evaluate.load("squad")

# --- Query function with context return ---
def answer_query(query, k=3, return_contexts=False):
    # Encode query
    q_emb = embedding_model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype("float32")

    # Retrieve top-k passages
    D, I = faiss_index.search(q_emb.reshape(1, -1), k)
    contexts = [raw_passages[idx] for idx in I[0] if idx < len(raw_passages)]
    context = " ".join(contexts)

    # Prompt
    prompt = f"Answer the question based only on the context.\nContext: {context}\nQuestion: {query}"
    gen = generator(
    prompt,
    max_new_tokens=128,
    truncation=True,
    pad_token_id=generator.model.config.eos_token_id
)[0]["generated_text"]

    if return_contexts:
        return gen, contexts
    return gen

# --- Evaluation Loop ---
preds, refs = [], []
ragas_records = []

subset = qa.select(range(100))   # 100 queries for faster eval

for row in subset:
    query = row["question"]

    # Get answer + contexts
    gen, contexts = answer_query(query, return_contexts=True)

    # For SQuAD metric
    preds.append({"id": str(row["id"]), "prediction_text": gen})
    refs.append({"id": str(row["id"]), "answers": {"text": [row["answer"]], "answer_start": [0]}})

    # For RAGAs

    ragas_records.append({
        "question": query,
        "answer": gen,
        "contexts": contexts,
        "ground_truth": row["answer"]
    })

    ragas_dict = {
        "question": [r["question"] for r in ragas_records],
        "answer": [r["answer"] for r in ragas_records],
        "contexts": [r["contexts"] for r in ragas_records],
        "ground_truth": [r["ground_truth"] for r in ragas_records]
    }

# --- Compute SQuAD results ---
squad_results = squad_metric.compute(predictions=preds, references=refs)
print("📊 SQuAD Results:", squad_results)

# --- Convert to RAGAs Dataset ---
from datasets import Dataset
ragas_ds = Dataset.from_dict(ragas_dict)

# --- Run RAGAs metrics ---
ragas_results = ragas_eval(
    ragas_ds,
    metrics=[faithfulness, context_precision, context_recall, answer_relevancy]
)
print("📊 RAGAs Results:")
print(ragas_results)

# --- Save outputs ---
pd.DataFrame(ragas_records).to_csv("enhanced_rag_outputs.csv", index=False)
ragas_results.to_pandas().to_csv("enhanced_rag_ragas.csv", index=False)


📊 SQuAD Results: {'exact_match': 0.0, 'f1': 0.8145865063618623}


Evaluating:   0%|          | 0/400 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[0]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[1]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[2]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[4]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[5]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[3]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[6]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[9]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[11]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[12]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[13]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[14]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[15]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[7]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[8]: TimeoutError()
ERROR:ragas.executor:Exception rais

KeyboardInterrupt: 