In [1]:
# ==========================================
# STEP 1: SETUP & LIBRARIES
# ==========================================
from google.colab import drive
import os
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm

# 1. Mount Drive
drive.mount('/content/drive')
work_dir = '/content/drive/My Drive/CENG543_Midterm_Q4'
if not os.path.exists(work_dir):
    os.makedirs(work_dir)

# 2. Install Requirements
# 'rank_bm25': Retriever
# 'transformers': Generator
# 'bert_score': Evaluation Metric
# 'datasets': To load SQuAD (Wikipedia based QA dataset)
!pip install -q rank_bm25 transformers datasets bert_score torchmetrics nltk

import nltk
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from torchmetrics.text.rouge import ROUGEScore
from torchmetrics.text.bert import BERTScore
from nltk.translate.bleu_score import sentence_bleu

# Download NLTK data for tokenization
nltk.download('punkt')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

Mounted at /content/drive
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25h

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Device: cpu


In [2]:
# ==========================================
# STEP 2: DATASET & RETRIEVER (BM25)
# ==========================================

# 1. Load SQuAD Dataset (Small subset for speed)
# SQuAD contains: 'context' (Knowledge), 'question', 'answers'
print("Loading SQuAD dataset (Validation split)...")
dataset = load_dataset("squad", split="validation[:200]") # First 200 examples

# 2. Build Knowledge Base (Corpus)
# We will use the 'context' passages from SQuAD as our "Wikipedia Corpus"
corpus = list(set(dataset['context'])) # Unique contexts
print(f"Knowledge Base Size: {len(corpus)} unique documents.")

# 3. Initialize Tokenizer & BM25
def simple_tokenize(text):
    return text.lower().split()

print("Indexing Corpus with BM25...")
tokenized_corpus = [simple_tokenize(doc) for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

# 4. Initialize Generator (FLAN-T5)
print("Loading Generator (FLAN-T5)...")
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

print("RAG System Ready.")

Loading SQuAD dataset (Validation split)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Knowledge Base Size: 10 unique documents.
Indexing Corpus with BM25...
Loading Generator (FLAN-T5)...


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

RAG System Ready.


In [4]:
# ==========================================
# STEP 3: RAG PIPELINE & METRICS (FIXED)
# ==========================================

# --- RAG COMPONENTS ---
def retrieve_docs(query, top_k=3):
    """Retrieves top_k documents using BM25"""
    tokenized_query = simple_tokenize(query)
    # get_top_n returns the documents themselves
    docs = bm25.get_top_n(tokenized_query, corpus, n=top_k)
    return docs

def generate_answer(query, context=None):
    """Generates answer using FLAN-T5"""
    if context:
        input_text = f"question: {query} context: {context}"
    else:
        input_text = f"question: {query}"

    input_ids = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).input_ids.to(device)

    outputs = model.generate(input_ids, max_length=64, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# --- EVALUATION METRICS ---
rouge_scorer = ROUGEScore()

# FIX: Changed 'model_type' to 'model_name_or_path' for torchmetrics
bert_scorer = BERTScore(model_name_or_path="distilbert-base-uncased")

def calculate_generation_metrics(prediction, references):
    """Calculates BLEU, ROUGE-L, BERTScore"""
    # 1. BLEU (NLTK)
    # References needs to be list of list of tokens
    ref_tokens = [r.split() for r in references]
    pred_tokens = prediction.split()
    # Smoothing function added to avoid 0 score for short sequences
    bleu = sentence_bleu(ref_tokens, pred_tokens, weights=(0.5, 0.5))

    # 2. ROUGE (TorchMetrics)
    rouge = rouge_scorer(prediction, references)['rougeL_fmeasure'].item()

    # 3. BERTScore
    # Returns precision, recall, f1. We take F1.
    # BERTScore in torchmetrics expects lists
    score = bert_scorer([prediction], [references])
    bert_f1 = score['f1'].mean().item()

    return bleu, rouge, bert_f1

def calculate_retrieval_metrics(retrieved_docs, ground_truth_context):
    """Calculates Recall@k (Is the correct document in the retrieved list?)"""
    # In SQuAD, we check if the retrieved text matches the ground truth context
    is_hit = 0
    for doc in retrieved_docs:
        if doc == ground_truth_context:
            is_hit = 1
            break
    return is_hit

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
# ==========================================
# STEP 4: EXPERIMENT & EVALUATION LOOP
# ==========================================

results = []
print(f"Running Evaluation on {len(dataset)} examples...")

# 1. Quantitative Evaluation (Task c)
for i, example in enumerate(tqdm(dataset)):
    question = example['question']
    # SQuAD has multiple valid answers, we take the list
    ground_truth_answers = example['answers']['text']
    ground_truth_context = example['context']

    # A) Retrieval
    retrieved_docs = retrieve_docs(question, top_k=1) # Retrieve top 1 for RAG generation
    best_context = retrieved_docs[0]

    # Eval Retrieval (Recall@1)
    recall_at_1 = calculate_retrieval_metrics(retrieved_docs, ground_truth_context)

    # B) Generation (With RAG)
    prediction = generate_answer(question, context=best_context)

    # Eval Generation
    bleu, rouge, bert_s = calculate_generation_metrics(prediction, ground_truth_answers)

    results.append({
        'Question': question,
        'Prediction': prediction,
        'Recall@1': recall_at_1,
        'BLEU': bleu,
        'ROUGE-L': rouge,
        'BERTScore': bert_s
    })

# Create DataFrame
df_results = pd.DataFrame(results)
print("\n" + "="*40)
print("QUANTITATIVE RESULTS (Task c)")
print("="*40)
print(f"Average Recall@1: {df_results['Recall@1'].mean():.4f}")
print(f"Average BLEU:     {df_results['BLEU'].mean():.4f}")
print(f"Average ROUGE-L:  {df_results['ROUGE-L'].mean():.4f}")
print(f"Average BERTScore:{df_results['BERTScore'].mean():.4f}")

# Save to CSV
df_results.to_csv(f'{work_dir}/q4_rag_metrics.csv', index=False)


# 2. Qualitative Analysis (Task d - Faithful vs Hallucinated)
print("\n" + "="*40)
print("QUALITATIVE ANALYSIS (Task d)")
print("="*40)

# Custom fake fact to test hallucination
fake_context = "Zubizubia is a purple fruit found on Mars that tastes like spicy chocolate."
custom_queries = [
    ("What is Zubizubia?", None),          # No Context -> Expect Hallucination or "I don't know"
    ("What is Zubizubia?", fake_context),  # With Context -> Expect Faithful generation
    ("Who founded the Republic of Turkey?", "Ataturk founded the Republic of Turkey in 1923.")
]

print(f"{'QUERY':<35} | {'CONTEXT':<10} | {'ANSWER'}")
print("-" * 80)
for q, ctx in custom_queries:
    ans = generate_answer(q, context=ctx)
    ctx_status = "Given" if ctx else "None"
    print(f"{q:<35} | {ctx_status:<10} | {ans}")

Running Evaluation on 200 examples...


  0%|          | 0/200 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

The following layers were not sharded: transformer.layer.*.attention.q_lin.bias, transformer.layer.*.attention.v_lin.bias, transformer.layer.*.output_layer_norm.weight, transformer.layer.*.attention.out_lin.bias, embeddings.word_embeddings.weight, transformer.layer.*.attention.v_lin.weight, transformer.layer.*.attention.out_lin.weight, transformer.layer.*.ffn.lin*.weight, transformer.layer.*.ffn.lin*.bias, transformer.layer.*.attention.k_lin.weight, transformer.layer.*.sa_layer_norm.bias, transformer.layer.*.sa_layer_norm.weight, transformer.layer.*.attention.q_lin.weight, embeddings.position_embeddings.weight, embeddings.LayerNorm.bias, embeddings.LayerNorm.weight, transformer.layer.*.output_layer_norm.bias, transformer.layer.*.attention.k_lin.bias
  0%|          | 1/200 [00:03<13:09,  3.97s/it]The following layers were not sharded: transformer.layer.*.attention.q_lin.bias, transformer.layer.*.attention.v_lin.bias, transformer.layer.*.output_layer_norm.weight, transformer.layer.*.atte


QUANTITATIVE RESULTS (Task c)
Average Recall@1: 0.5150
Average BLEU:     0.2283
Average ROUGE-L:  0.4518
Average BERTScore:0.7528

QUALITATIVE ANALYSIS (Task d)
QUERY                               | CONTEXT    | ANSWER
--------------------------------------------------------------------------------
What is Zubizubia?                  | None       | Zubizubia
What is Zubizubia?                  | Given      | purple fruit
Who founded the Republic of Turkey? | Given      | Ataturk
