In [1]:
# Move the kernel's working directory into the project folder, then print it to confirm.
# NOTE(review): the %cd target is a relative path, so this cell depends on the directory
# the kernel was started in and will fail if it is run a second time in the same session
# (the kernel would already be inside the folder) — confirm before using "Run All" twice.
%cd graph-enhanced-retrieval-qa
!pwd

/home/sslab/24m0786/graph-enhanced-retrieval-qa
/home/sslab/24m0786/graph-enhanced-retrieval-qa


In [3]:
import sys
sys.path.append('..')

import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import numpy as np

from src.data_loader import load_dataset, process_sample
from src.models.baselines import run_bm25, run_dpr
from src.evaluate import calculate_mrr, calculate_f1

# ---------------------------------------------------------------------------
# Baseline evaluation: score BM25 and DPR retrieval on the dev set and report
# the mean MRR and F1 of each over the samples that were actually scored.
# ---------------------------------------------------------------------------

dev_dataset = load_dataset('data/raw/dev.json')
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): machine-specific absolute path to a local bge-m3 snapshot; the
# cell will not run on another machine without editing this value.
model_path = '/home/sslab/24m0786/.cache/huggingface/hub/models--BAAI--bge-m3/snapshots/5617a9f61b028005a4858fdac845db406aefb181'
print(f"Loading DPR model from local path: {model_path}")
dpr_model = SentenceTransformer(model_path, device=device)
print("Model loaded.")

# Per-sample scores; each list gets exactly one entry per evaluated sample.
bm25_mrrs, bm25_f1s = [], []
dpr_mrrs, dpr_f1s = [], []

# The first SAMPLES_TO_DEBUG evaluated samples emit step-by-step trace output.
debug_counter = 0
SAMPLES_TO_DEBUG = 3

# Samples dropped because they had no passages; tracked so the summary can
# report the true number of samples behind the averages.
skipped_samples = 0


def _average(scores):
    """Return the mean of ``scores`` as a float, or NaN when ``scores`` is empty.

    Guarding the empty case avoids the RuntimeWarning that ``np.mean`` raises
    on an empty sequence while still printing ``nan`` in the summary.
    """
    return float(np.mean(scores)) if scores else float('nan')


for sample in tqdm(dev_dataset, desc="Evaluating Baselines"):
    processed = process_sample(sample)
    question = processed['question']
    passages = processed['passages']
    ground_truth = processed['ground_truth_titles']

    # Nothing to rank for this sample: leave it out of both baselines.
    if not passages:
        skipped_samples += 1
        continue

    # Evaluate the debug condition once per sample instead of before every print.
    debug = debug_counter < SAMPLES_TO_DEBUG

    if debug:
        print(f"\n--- [DEBUG] Processing Sample {debug_counter + 1} ---")
        print(f"Question: {question[:80]}...")  # Print first 80 chars of the question
        print("[DEBUG] --> Activating BM25...")

    bm25_ranked = run_bm25(question, passages)
    if debug:
        print("[DEBUG] <-- BM25 complete.")

    bm25_mrrs.append(calculate_mrr(bm25_ranked, ground_truth))
    bm25_f1s.append(calculate_f1(bm25_ranked, ground_truth))

    if debug:
        print("[DEBUG] --> Activating DPR...")
    dpr_ranked = run_dpr(question, passages, dpr_model)
    if debug:
        print("[DEBUG] <-- DPR complete.")

    dpr_mrrs.append(calculate_mrr(dpr_ranked, ground_truth))
    dpr_f1s.append(calculate_f1(dpr_ranked, ground_truth))

    if debug:
        print(f"--- [DEBUG] Finished Sample {debug_counter + 1} ---")

    # Counts evaluated samples only; skipped samples never reach this line.
    debug_counter += 1

# Report the number of samples that contributed to the averages, not the raw
# dataset size, so the count stays correct when some samples were skipped.
evaluated_samples = len(bm25_mrrs)

print("\n--- BASELINE PERFORMANCE ---")
print(f"Evaluated on {evaluated_samples} samples.")
if skipped_samples:
    print(f"Skipped {skipped_samples} samples with no passages.")
print("\nBM25:")
print(f"  - Average MRR: {_average(bm25_mrrs):.4f}")
print(f"  - Average F1:  {_average(bm25_f1s):.4f}")
print("\nDPR (bge-m3 from local path):")
print(f"  - Average MRR: {_average(dpr_mrrs):.4f}")
print(f"  - Average F1:  {_average(dpr_f1s):.4f}")

Loading DPR model from local path: /home/sslab/24m0786/.cache/huggingface/hub/models--BAAI--bge-m3/snapshots/5617a9f61b028005a4858fdac845db406aefb181
Model loaded.


Evaluating Baselines:   0%|          | 0/12576 [00:00<?, ?it/s]


--- [DEBUG] Processing Sample 1 ---
Question: Who is the mother of the director of film Polish-Russian War (Film)?...
[DEBUG] --> Activating BM25...
[DEBUG] <-- BM25 complete.
[DEBUG] --> Activating DPR...


  return forward_call(*args, **kwargs)
Evaluating Baselines:   0%|          | 3/12576 [00:00<32:07,  6.52it/s]  

[DEBUG] <-- DPR complete.
--- [DEBUG] Finished Sample 1 ---

--- [DEBUG] Processing Sample 2 ---
Question: Which film came out first, Blind Shaft or The Mask Of Fu Manchu?...
[DEBUG] --> Activating BM25...
[DEBUG] <-- BM25 complete.
[DEBUG] --> Activating DPR...
[DEBUG] <-- DPR complete.
--- [DEBUG] Finished Sample 2 ---

--- [DEBUG] Processing Sample 3 ---
Question: When did John V, Prince Of Anhalt-Zerbst's father die?...
[DEBUG] --> Activating BM25...
[DEBUG] <-- BM25 complete.
[DEBUG] --> Activating DPR...
[DEBUG] <-- DPR complete.
--- [DEBUG] Finished Sample 3 ---


Evaluating Baselines: 100%|██████████| 12576/12576 [16:18<00:00, 12.85it/s]


--- BASELINE PERFORMANCE ---
Evaluated on 12576 samples.

BM25:
  - Average MRR: 0.7501
  - Average F1:  0.4633

DPR (bge-m3 from local path):
  - Average MRR: 0.9804
  - Average F1:  0.7307



