# Question 4 - Retrieval-Augmented Generation (RAG) and Knowledge-Grounded Text Synthesis

Here, in order to construct a RAG system, we have chosen:
* The retriever BM25 (sparse)
* The retriever all-MiniLM-L6-v2 (sentcence-bert, dense)
* The generator FLAN-T5 (small)
* Applied to HotpotQA



In [21]:
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from bert_score import score as bert_score_func
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu
import nltk
from tqdm import tqdm

# Set device
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

# Download NLTK data for tokenizer
nltk.download('punkt', quiet=True)

Using device: cuda


True

### a-b ) Construct RAG system with BM25, S-BERT, FLAN-T5, applied to HotpotQA

In [22]:
# --- CONFIGURATION ---
MODEL_NAME = "google/flan-t5-small"  # Efficient instruction-tuned model
SUBSET_SIZE = 5000  
TOP_K = 3          # Number of documents to retrieve

In [23]:
# DATA PREPARATION

print("Loading HotpotQA dataset (distractor subset)...")
# We use the 'distractor' set which contains the question and supporting paragraphs
dataset = load_dataset("hotpot_qa", "distractor", split=f"validation[:{SUBSET_SIZE}]")

# We need to flatten the dataset to create a "Knowledge Base" (Corpus)
# HotpotQA structure: 'context' is a list of [title, sentences]
corpus = []
doc_ids = []
queries = []
ground_truth_answers = []
ground_truth_titles = []  # To check retrieval accuracy

print("Processing corpus...")
for item in dataset:
    # 1. Extract Queries and Answers
    queries.append(item["question"])
    ground_truth_answers.append(item["answer"])
    
    # 2. Extract Ground Truth Titles (for Retrieval Evaluation)
    # In the HF version, supporting_facts is often a dict:
    #   {'title': [...], 'sent_id': [...]}
    sf = item["supporting_facts"]
    
    if isinstance(sf, dict) and "title" in sf:
        # Newer HF format
        titles = sf["title"]
    else:
        # Fallback: older format as list of [title, sent_id]
        titles = [fact[0] for fact in sf]
    
    gt_titles_set = set(titles)
    ground_truth_titles.append(gt_titles_set)
    
    # 3. Build Corpus from 'context'
    # Each item has multiple paragraphs. We treat each paragraph as a document.
    for title, sentences in zip(item["context"]["title"], item["context"]["sentences"]):
        text = " ".join(sentences)
        corpus.append(text)
        doc_ids.append(title)  # Use title as the doc ID

print(f"Processed {len(queries)} queries against {len(corpus)} documents.")


Loading HotpotQA dataset (distractor subset)...
Processing corpus...
Processed 5000 queries against 49774 documents.


In [24]:
# RETRIEVERS

class SparseRetriever:
    def __init__(self, corpus):
        print("Initializing BM25 (Sparse) Retriever...")
        # Tokenize corpus for BM25
        tokenized_corpus = [nltk.word_tokenize(doc.lower()) for doc in corpus]
        self.bm25 = BM25Okapi(tokenized_corpus)
        self.corpus = corpus
        
    def retrieve(self, query, top_k=3):
        tokenized_query = nltk.word_tokenize(query.lower())
        # Get top-k indices
        scores = self.bm25.get_scores(tokenized_query)
        top_n_indices = np.argsort(scores)[::-1][:top_k]
        return [self.corpus[i] for i in top_n_indices], [doc_ids[i] for i in top_n_indices]

class DenseRetriever:
    def __init__(self, corpus):
        print("Initializing SBERT (Dense) Retriever...")
        self.model = SentenceTransformer('all-MiniLM-L6-v2', device=DEVICE)
        self.corpus = corpus
        # Pre-compute corpus embeddings
        self.corpus_embeddings = self.model.encode(corpus, convert_to_tensor=True, show_progress_bar=True)
        
    def retrieve(self, query, top_k=3):
        query_embedding = self.model.encode(query, convert_to_tensor=True)
        # Cosine Similarity
        hits = util.semantic_search(query_embedding, self.corpus_embeddings, top_k=top_k)[0]
        # hits is list of {corpus_id, score}
        indices = [hit['corpus_id'] for hit in hits]
        return [self.corpus[i] for i in indices], [doc_ids[i] for i in indices]

In [25]:
# Initialize Retrievers
sparse_retriever = SparseRetriever(corpus)
dense_retriever = DenseRetriever(corpus) # Optional Task b

Initializing BM25 (Sparse) Retriever...
Initializing SBERT (Dense) Retriever...


Batches: 100%|██████████| 1556/1556 [00:29<00:00, 52.03it/s] 


In [26]:
# Generation Module

class RAGGenerator:
    def __init__(self):
        print(f"Loading Generator ({MODEL_NAME})...")
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)

    def generate(self, query, context_docs):
        # Concatenate retrieved docs into a context block
        context_text = " ".join(context_docs)
        
        # FLAN-T5 Prompt Template
        prompt = f"Use the context below to answer the question.\n\nContext: {context_text}\n\nQuestion: {query}\n\nAnswer:"
        
        inputs = self.tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(DEVICE)
        
        outputs = self.model.generate(
            **inputs, 
            max_new_tokens=50, 
            num_beams=2, 
            early_stopping=True
        )
        
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

In [27]:
generator = RAGGenerator()

Loading Generator (google/flan-t5-small)...


### c) Evaluate retrieval and generation components separately and jointly using Precision@k, Recall@k, BLEU, ROUGE-L, and BERTScore.

In [28]:
def calculate_metrics(retrieval_results, generation_results, gt_titles_list, gt_answers_list):
    # 1. Retrieval Metrics
    recall_at_k = 0
    precision_scores = []
    
    for retrieved_ids, true_ids in zip(retrieval_results['ids'], gt_titles_list):
        retrieved_set = set(retrieved_ids)
        true_set = set(true_ids)
        
        # Intersection between retrieved titles and ground-truth titles
        intersection = retrieved_set.intersection(true_set)
        
        # Recall@K (relaxed: did we retrieve at least one relevant doc?)
        if len(intersection) > 0:
            recall_at_k += 1
        
        # Precision@K = (# relevant retrieved) / (# retrieved)
        if len(retrieved_ids) > 0:
            precision_scores.append(len(intersection) / len(retrieved_ids))
        else:
            precision_scores.append(0.0)
            
    avg_recall = recall_at_k / len(gt_titles_list)
    avg_precision = np.mean(precision_scores)
    
    # 2. Generation Metrics (BLEU, ROUGE)
    rouge = Rouge()
    bleu_scores = []
    rouge_l_scores = []
    
    preds_for_bert = generation_results
    refs_for_bert = gt_answers_list
    
    for pred, ref in zip(generation_results, gt_answers_list):
        if len(pred.strip()) == 0:
            pred = "empty"  # Prevent crash

        bleu = sentence_bleu(
            [nltk.word_tokenize(ref)],
            nltk.word_tokenize(pred)
        )
        bleu_scores.append(bleu)
        
        try:
            r_scores = rouge.get_scores(pred, ref)[0]
            rouge_l_scores.append(r_scores['rouge-l']['f'])
        except:
            rouge_l_scores.append(0.0)

    try:
        P, R, F1 = bert_score_func(preds_for_bert, refs_for_bert,
                                   lang="en", verbose=False,
                                   device=DEVICE.type)
        bert_score_avg = F1.mean().item()
    except Exception as e:
        print(f"BERTScore failed (likely memory): {e}")
        bert_score_avg = 0.0

    return {
        "Recall@K": avg_recall,
        "Precision@K": avg_precision,
        "BLEU": np.mean(bleu_scores),
        "ROUGE-L": np.mean(rouge_l_scores),
        "BERTScore": bert_score_avg
    }


In [29]:
# EXECUTION LOOP
results_log = {'sparse': {'docs': [], 'ids': [], 'ans': []}, 
               'dense': {'docs': [], 'ids': [], 'ans': []}}

print("\n--- Running RAG Pipeline ---")
for i in tqdm(range(len(queries))):
    q = queries[i]
    
    # 1. Sparse Path
    s_docs, s_ids = sparse_retriever.retrieve(q, TOP_K)
    s_ans = generator.generate(q, s_docs)
    results_log['sparse']['docs'].append(s_docs)
    results_log['sparse']['ids'].append(s_ids)
    results_log['sparse']['ans'].append(s_ans)
    
    # 2. Dense Path
    d_docs, d_ids = dense_retriever.retrieve(q, TOP_K)
    d_ans = generator.generate(q, d_docs)
    results_log['dense']['docs'].append(d_docs)
    results_log['dense']['ids'].append(d_ids)
    results_log['dense']['ans'].append(d_ans)

# Calculate Metrics
metrics_sparse = calculate_metrics(results_log['sparse'], results_log['sparse']['ans'], ground_truth_titles, ground_truth_answers)
metrics_dense = calculate_metrics(results_log['dense'], results_log['dense']['ans'], ground_truth_titles, ground_truth_answers)

# Display Results
results_df = pd.DataFrame([metrics_sparse, metrics_dense], index=['BM25 (Sparse)', 'SBERT (Dense)'])
print("\n=== RAG Performance Comparison ===")
print(results_df)


--- Running RAG Pipeline ---


100%|██████████| 5000/5000 [41:49<00:00,  1.99it/s]
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferenc


=== RAG Performance Comparison ===
               Recall@K  Precision@K      BLEU   ROUGE-L  BERTScore
BM25 (Sparse)    0.8046     0.337200  0.026541  0.315914   0.884432
SBERT (Dense)    0.8692     0.378733  0.025308  0.292598   0.880790


In [32]:
#QUALITATIVE EXAMPLES (Task d)
print("\n=== Qualitative Analysis: Faithful vs. Hallucinated ===")

# Find a good example (high ROUGE) and a bad example (low ROUGE) from Dense results
rouge = Rouge()
best_score = -1
worst_score = 100
best_idx = 0
worst_idx = 0

for i, (pred, ref) in enumerate(zip(results_log['dense']['ans'], ground_truth_answers)):
    try:
        score = rouge.get_scores(pred, ref)[0]['rouge-l']['f']
        if score > best_score:
            best_score = score
            best_idx = i
        if score < worst_score:
            worst_score = score
            worst_idx = i
    except: pass

def print_example(idx, label):
    print(f"\n--- {label} GENERATION ---")
    print(f"Question: {queries[idx]}")
    print(f"Ground Truth: {ground_truth_answers[idx]}")
    print(f"Retrieved Context (Top 1): {results_log['dense']['docs'][idx][0][:200]}...")
    print(f"RAG Prediction: {results_log['dense']['ans'][idx]}")

print_example(best_idx, "FAITHFUL / SUCCESSFUL")
print_example(worst_idx, "HALLUCINATED / FAILED")


=== Qualitative Analysis: Faithful vs. Hallucinated ===

--- FAITHFUL / SUCCESSFUL GENERATION ---
Question: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?
Ground Truth: no
Retrieved Context (Top 1): The Esma Sultan Mansion (Turkish: "Esma Sultan Yalısı" ), a historical yalı (English: waterside mansion ) located at Bosphorus in Ortaköy neighborhood of Istanbul, Turkey and named after its original ...
RAG Prediction: no

--- HALLUCINATED / FAILED GENERATION ---
Question: Were Scott Derrickson and Ed Wood of the same nationality?
Ground Truth: yes
Retrieved Context (Top 1): Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton, and starring Johnny Depp as cult filmmaker Ed Wood.  The film concerns the period in Wood's life w...
RAG Prediction: no
