In [None]:
import os
from datasets import load_dataset

# Read the PubMed training CSV through the generic "csv" loader; the rows are
# accessed via the "train" split further down.
pubmed_ds = load_dataset(
    "csv",
    data_files="../data/raw/PubMed/train.csv",
)

def process_pubmed_row(examples):
    """Convert one column-oriented batch of PubMed rows into retrieval chunks.

    Args:
        examples: Mapping with the columns ``abstract_id``, ``target``,
            ``abstract_text`` and ``line_number``; each value is indexable by
            row position.

    Returns:
        ``{"processed_data": [...]}`` with one ``{"content", "metadata"}``
        dict per row, in row order.
    """

    def build_chunk(row):
        # Gather the fields of a single row and wrap them in a chunk record.
        abstract_id = examples['abstract_id'][row]
        section = examples['target'][row]
        sentence = examples['abstract_text'][row]
        return {
            "content": f"In PubMed abstract {abstract_id}, the {section} section states: {sentence}",
            "metadata": {
                "source": "PubMed",
                "abstract_id": abstract_id,
                "target": section,
                "line_number": examples['line_number'][row],
            },
        }

    row_count = len(examples['abstract_text'])
    return {"processed_data": [build_chunk(row) for row in range(row_count)]}

# Run the row formatter batch-wise and drop every original CSV column so that
# only "processed_data" remains; then pull the train split out as a plain list.
pubmed_processed = pubmed_ds.map(
    process_pubmed_row,
    batched=True,
    remove_columns=pubmed_ds['train'].column_names,
)
pubmed_chunks = list(pubmed_processed['train']['processed_data'])

# Paragraph-based chunking with a word-aligned overlap between chunks
def _overlap_tail(chunk_text, overlap):
    """Return at most ``overlap`` trailing characters of ``chunk_text``, aligned to a word start."""
    if overlap <= 0:
        # chunk_text[-0:] would be the whole string rather than an empty tail.
        return ""
    start = len(chunk_text) - overlap
    if start <= 0:
        # The chunk is shorter than the overlap: carry all of it.
        return chunk_text.strip()
    tail = chunk_text[start:]
    if not chunk_text[start - 1].isspace() and not tail[0].isspace():
        # The cut landed inside a word: drop the partial word if there is
        # whitespace to align to, otherwise keep the raw tail.
        cut = next((pos for pos, ch in enumerate(tail) if ch.isspace()), -1)
        if cut != -1:
            tail = tail[cut:]
    return tail.strip()


def txt_chunking(text, chunk_size=600, overlap=100, metadata=None):
    """Split ``text`` into chunks made of whole paragraphs.

    Paragraphs (separated by a blank line) are accumulated until adding the
    next one would exceed ``chunk_size`` characters. The accumulated text is
    then emitted as a chunk and the next chunk starts with a word-aligned tail
    of the previous one. A single paragraph longer than ``chunk_size`` is kept
    whole in its own chunk.

    Args:
        text: Raw document text.
        chunk_size: Soft upper bound on the characters accumulated per chunk.
        overlap: Number of trailing characters carried into the next chunk;
            ``0`` or less disables the overlap.
        metadata: Optional dict attached to every chunk. Each chunk receives
            its own shallow copy (``{}`` when omitted).

    Returns:
        List of ``{"content": str, "metadata": dict}`` dicts; empty when the
        text contains nothing but whitespace.
    """

    def make_chunk(raw_text):
        return {
            "content": raw_text.strip(),
            "metadata": metadata.copy() if metadata else {},
        }

    chunks = []
    current_chunk = ""

    for p in text.split('\n\n'):
        if len(current_chunk) + len(p) <= chunk_size:
            current_chunk += p + "\n\n"
            continue

        # The paragraph does not fit: flush what has been accumulated so far
        # (skipping whitespace-only buffers) and seed the next chunk.
        if current_chunk.strip():
            chunks.append(make_chunk(current_chunk))
        carried = _overlap_tail(current_chunk.strip(), overlap)
        current_chunk = (carried + "\n\n" if carried else "") + p + "\n\n"

    if current_chunk.strip():
        chunks.append(make_chunk(current_chunk))

    return chunks

txt_chunks = []


def load_txt_folder(folder_path, source_name):
    """Chunk every ``.txt`` file directly inside ``folder_path`` into ``txt_chunks``.

    Args:
        folder_path: Directory to scan (not recursive).
        source_name: Value stored under ``"source"`` in each chunk's metadata.

    Side effects:
        Extends the module-level ``txt_chunks`` list; nothing is returned.
    """
    # os.listdir gives names in arbitrary order. Sorting keeps the chunk order
    # identical across runs and machines, which matters when list positions
    # are later used as ids.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # The file is already closed here; chunking only needs the text.
        meta = {
            "source": source_name,
            "file_name": filename
        }
        txt_chunks.extend(txt_chunking(content, metadata=meta))


# Chunk both documentation folders into the shared txt_chunks list.
load_txt_folder("../data/raw/libc", "libc")
load_txt_folder("../data/raw/pytorch", "pytorch")

# Text-file chunks come first, PubMed chunks after them.
final_chunks = txt_chunks + pubmed_chunks

# Group the PubMed chunks by abstract id so that neighbouring lines of the
# same abstract can be looked up later.
pubmed_map = {}
for chunk in final_chunks:
    if chunk['metadata'].get('source') != 'PubMed':
        continue
    abs_id = chunk['metadata']['abstract_id']
    pubmed_map.setdefault(abs_id, []).append(chunk)

def expand_context(retrieved_chunks, pubmed_map, window_size=1):
    """Widen PubMed hits with the neighbouring lines of the same abstract.

    Args:
        retrieved_chunks: Chunks (``{"content", "metadata"}`` dicts) to expand.
        pubmed_map: Mapping of abstract id -> list of that abstract's chunks.
        window_size: Maximum ``line_number`` distance from the hit for a line
            to count as a neighbour.

    Returns:
        One string per input chunk, in input order. Non-PubMed chunks are
        returned unchanged. PubMed chunks become the space-joined contents of
        all lines within the window, ordered by ``line_number``. If the map
        holds no matching line, the hit's own content is used.
    """
    expanded_results = []
    for hit in retrieved_chunks:
        meta = hit['metadata']
        if meta.get('source') != 'PubMed':
            expanded_results.append(hit['content'])
            continue

        curr_line = meta['line_number']
        neighbors = [
            c for c in pubmed_map.get(meta['abstract_id'], [])
            if abs(c['metadata']['line_number'] - curr_line) <= window_size
        ]

        if not neighbors:
            # Abstract unknown to the map: keep the hit rather than emitting
            # an empty context string.
            expanded_results.append(hit['content'])
            continue

        # Join in reading order regardless of how the map was populated.
        neighbors.sort(key=lambda c: c['metadata']['line_number'])
        expanded_results.append(" ".join(c['content'] for c in neighbors))
    return expanded_results

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Local embedding model used to compute semantic similarity.
test_embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Checkpoint used for answer generation.
model_id = "Qwen/Qwen2.5-1.5B-Instruct"

# Load the weights in 4-bit NF4 with double quantisation; bfloat16 is
# requested as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# device_map="auto" leaves device placement of the weights to the loader.
model_llm = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
import torch
import faiss
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder

# Dense side: embed every chunk once and store the float32 vectors in an
# HNSW index.
text_list = [c['content'] for c in final_chunks]
embeddings = np.array(test_embeddings.embed_documents(text_list)).astype('float32')
d = embeddings.shape[1]  # embedding dimensionality
M = 32  # second argument of IndexHNSWFlat
index = faiss.IndexHNSWFlat(d, M)
index.hnsw.efConstruction = 200
index.add(embeddings)

# Sparse side: BM25 over lower-cased, whitespace-split chunk texts, in the
# same order as the dense index.
tokenized_corpus = [text.lower().split() for text in text_list]
bm25 = BM25Okapi(tokenized_corpus)


# RRF Fusion Algorithm
def rrf_fusion(vector_results, bm25_results, k=60):
    """Merge two ranked id lists with Reciprocal Rank Fusion.

    Every id receives ``1 / (k + rank)`` from each list it appears in, where
    ``rank`` is its 1-based position in that list; the contributions are
    summed.

    Args:
        vector_results: Ids ordered best-first by the dense retriever.
        bm25_results: Ids ordered best-first by BM25.
        k: Smoothing constant of the RRF formula (non-negative).

    Returns:
        All ids that occur in either list, sorted by descending fused score.
        Ties keep first-seen order (vector list first).
    """
    scores = {}
    for ranking in (vector_results, bm25_results):
        # Ranks start at 1 as in the RRF definition; this also keeps the
        # denominator non-zero when k == 0.
        for rank, idx in enumerate(ranking, start=1):
            scores[idx] = scores.get(idx, 0) + 1 / (k + rank)

    return sorted(scores, key=scores.get, reverse=True)

# Cross-encoder that rescores (query, passage) pairs; pinned to the CPU.
reranker_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device='cpu')

# Tag mapping for PubMed data: target label -> positions in final_chunks.
# Chunks whose metadata has no (or an empty) "target" are skipped.
target_to_indices = {}
for idx, chunk in enumerate(final_chunks):
    t = chunk['metadata'].get('target')
    if not t:
        continue
    target_to_indices.setdefault(t, []).append(idx)


# Hybrid Search
def query_rag_v3(user_query, k, target_filter=None, rerank_top_n=5, rerank_candidates=10):
    """Retrieve context for ``user_query`` and build the generation prompt.

    Pipeline: dense (FAISS HNSW) and sparse (BM25) retrieval of ``k * 2``
    candidates each, RRF fusion, cross-encoder reranking of the best fused
    candidates, then PubMed context expansion.

    Args:
        user_query: Natural-language question.
        k: Retrieval depth; each retriever returns up to ``k * 2`` ids.
        target_filter: Optional PubMed ``target`` label. When it is a key of
            ``target_to_indices`` only chunks with that label are eligible.
            An unknown label falls back to an unfiltered search.
        rerank_top_n: Number of reranked chunks kept for the prompt.
        rerank_candidates: Number of fused candidates passed to the reranker.

    Returns:
        ``(full_prompt, expanded_texts)``: the prompt string and the list of
        context strings embedded in it.
    """
    query_vec = np.array([test_embeddings.embed_query(user_query)]).astype('float32')

    # Hard filter
    allowed_indices = target_to_indices.get(target_filter) if target_filter else None

    # FAISS retrieval
    index.hnsw.efSearch = 128
    if allowed_indices is not None:
        selector = faiss.IDSelectorBatch(np.asarray(allowed_indices, dtype='int64'))
        # An HNSW index requires its own parameter class, and the per-call
        # efSearch replaces index.hnsw.efSearch, so it is set here as well.
        params = faiss.SearchParametersHNSW(sel=selector, efSearch=128)
        _, I_vector = index.search(query_vec, k * 2, params=params)
    else:
        _, I_vector = index.search(query_vec, k * 2)

    # FAISS pads missing results with -1. Used as a list index that would
    # silently select the last chunk, so padding is removed.
    vector_indices = [int(i) for i in I_vector[0] if i >= 0]

    # BM25 retrieval
    tokenized_query = user_query.lower().split()
    bm25_scores = np.asarray(bm25.get_scores(tokenized_query), dtype='float64')

    if allowed_indices is not None:
        # Exclude disallowed chunks outright instead of zeroing their score,
        # otherwise they tie with allowed chunks that did not match.
        masked = np.full_like(bm25_scores, -np.inf)
        masked[allowed_indices] = bm25_scores[allowed_indices]
        bm25_scores = masked

    bm25_order = np.argsort(bm25_scores)[::-1][:k * 2]
    # Chunks with no lexical match (score <= 0) carry no ranking signal and
    # are kept out of the fusion.
    bm25_indices = [int(i) for i in bm25_order if bm25_scores[i] > 0]

    # RRF
    combined_indices = rrf_fusion(vector_indices, bm25_indices)

    # Reranking
    candidate_chunks = [final_chunks[i] for i in combined_indices[:rerank_candidates]]
    if candidate_chunks:
        pairs = [[user_query, c['content']] for c in candidate_chunks]
        rerank_scores = reranker_model.predict(pairs)
        order = sorted(
            range(len(candidate_chunks)),
            key=lambda pos: rerank_scores[pos],
            reverse=True,
        )
        reranked_chunks = [candidate_chunks[pos] for pos in order]
    else:
        # Nothing retrieved: skip the reranker instead of scoring an empty batch.
        reranked_chunks = []

    top_chunks = reranked_chunks[:rerank_top_n]
    # Context expansion
    expanded_texts = expand_context(top_chunks, pubmed_map, window_size=1)
    context_str = "\n\n ".join(expanded_texts)
    full_prompt = f"You are a helpful agent, answer the question based on provided contexts.\nContext:\n{context_str}\n\nQuestion: {user_query}\nAnswer:"

    return full_prompt, expanded_texts

def query_rag_final_v3(user_query, k=30):
    """Answer ``user_query`` with the retrieval pipeline and the loaded LLM.

    Args:
        user_query: Natural-language question.
        k: Retrieval depth forwarded to ``query_rag_v3``.

    Returns:
        ``(answer, expanded_texts)``: the decoded completion (prompt tokens
        removed) and the context strings that were placed in the prompt.
    """
    prompt, retrieved_contexts = query_rag_v3(user_query, k)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    encoded = tokenizer(prompt, return_tensors="pt").to(model_llm.device)
    prompt_length = encoded.input_ids.shape[1]

    with torch.no_grad():
        generated = model_llm.generate(
            **encoded,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
        )

    # The output sequence starts with the prompt; decode only what follows.
    completion_ids = generated[0][prompt_length:]
    answer = tokenizer.decode(completion_ids, skip_special_tokens=True)

    return answer, retrieved_contexts

In [None]:
import torch
import json
import csv
from tqdm.auto import tqdm


# Load the evaluation questions. Despite the .txt extension the file is parsed
# as CSV with a header row.
questions = []
# newline='' is what the csv module expects, so that line breaks inside quoted
# fields are parsed correctly.
with open('../data/raw/questions.txt', 'r', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f)
    for row in reader:
        # Header names may carry stray spaces (e.g. 'Question '), so columns
        # are looked up by their stripped name. Non-string keys (surplus
        # fields of an over-long row) are ignored.
        fields = {key.strip(): value for key, value in row.items() if isinstance(key, str)}
        # A missing column or a short row yields None; treat it as empty
        # instead of failing on None.strip().
        q = (fields.get('Question') or '').strip()
        gt = (fields.get('Ground Truth') or fields.get('ground_truth') or '').strip()
        if q:
            questions.append({"question": q, "ground_truth": gt})

# Create the output folder before generating anything: a missing directory
# would otherwise only surface after the whole (slow) loop has finished and
# the answers would be lost.
results_path = "../data/eval/results_v3.json"
os.makedirs(os.path.dirname(results_path), exist_ok=True)

# Generate an answer for every question and keep the contexts it was given.
results = []
for item in tqdm(questions):
    q = item['question']
    gt = item['ground_truth']

    ans, contexts = query_rag_final_v3(q)

    results.append({
        "question": q,
        "answer": ans,
        "contexts": contexts,
        "ground_truth": gt
    })


with open(results_path, "w") as f:
    json.dump(results, f)

# Generation is finished: drop the LLM reference and release cached GPU memory.
# After this the generation functions cannot run again without reloading it.
del model_llm
torch.cuda.empty_cache()

In [None]:
import json
import numpy as np
from tqdm.auto import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def embed(texts):
    """Embed one string or a list of strings with ``test_embeddings``.

    A bare string is wrapped into a one-element list first, so the result of
    ``embed_documents`` is returned for a list in both cases.
    """
    batch = [texts] if isinstance(texts, str) else texts
    return test_embeddings.embed_documents(batch)

def recall_at_k(contexts, gt, threshold=0.75):
    """Check whether any retrieved context is close to the ground truth.

    Args:
        contexts: Retrieved context strings.
        gt: Ground-truth text.
        threshold: Cosine similarity the best context must exceed for a hit.

    Returns:
        ``(hit, best_rank, best_similarity)``: whether the highest similarity
        is above ``threshold``, the 1-based position of the most similar
        context, and that similarity as a float.
    """
    gt_vector = embed(gt)[0]
    context_vectors = embed(contexts)
    sims = cosine_similarity([gt_vector], context_vectors)[0]

    top_sim = np.max(sims)
    top_position = int(np.argmax(sims))
    return top_sim > threshold, top_position + 1, float(top_sim)

def answer_gt_similarity(ans, gt):
    """Return the cosine similarity between the answer and the ground truth as a float."""
    vectors = embed([ans, gt])
    answer_vector, gt_vector = vectors[0], vectors[1]
    return float(cosine_similarity([answer_vector], [gt_vector])[0][0])
    
def answer_context_similarity(ans, contexts):
    """Return the highest cosine similarity between the answer and any context, as a float."""
    answer_vector = embed(ans)[0]
    context_vectors = embed(contexts)
    per_context = cosine_similarity([answer_vector], context_vectors)[0]
    return float(np.max(per_context))

def hallucination_flag(ans_gt_sim, ans_ctx_sim,
                       gt_th=0.6, ctx_th=0.6):
    """Flag an answer that is far from both the ground truth and its contexts.

    Returns ``True`` only when ``ans_gt_sim`` is below ``gt_th`` and
    ``ans_ctx_sim`` is below ``ctx_th``; otherwise ``False``.
    """
    # Close enough to the ground truth: never flagged, whatever the contexts say.
    if not ans_gt_sim < gt_th:
        return False
    return True if ans_ctx_sim < ctx_th else False


# Score the answers written by this notebook's generation step, which saves
# them as results_v3.json. Reading a different version here would evaluate
# stale data or fail on a fresh run.
with open("../data/eval/results_v3.json") as f:
    results = json.load(f)


eval_results = []

for item in tqdm(results):
    q = item["question"]
    ans = item["answer"]
    contexts = item["contexts"]
    gt = item["ground_truth"]

    hit, rank, ctx_gt_sim = recall_at_k(contexts, gt)
    ans_gt_sim = answer_gt_similarity(ans, gt)
    ans_ctx_sim = answer_context_similarity(ans, contexts)
    hallucinated = hallucination_flag(ans_gt_sim, ans_ctx_sim)

    eval_results.append({
        "question": q,
        "hit": hit,
        "best_rank": rank,
        "ctx_gt_sim": ctx_gt_sim,
        "ans_gt_sim": ans_gt_sim,
        "ans_ctx_sim": ans_ctx_sim,
        "hallucinated": hallucinated
    })


import pandas as pd

df = pd.DataFrame(eval_results)

# best_rank is filled in even when no context passed the similarity threshold,
# so the reciprocal rank is counted only for hits; a miss contributes 0.
reciprocal_rank = df["hit"].astype(float) / df["best_rank"]

# Plain floats keep the summary JSON-serialisable whatever the column dtypes are.
summary = {
    "Recall@k": float(df["hit"].mean()),
    "MRR": float(reciprocal_rank.mean()),
    "Avg Context-GT Sim": float(df["ctx_gt_sim"].mean()),
    "Avg Answer-GT Sim": float(df["ans_gt_sim"].mean()),
    "Avg Answer-Context Sim": float(df["ans_ctx_sim"].mean()),
    "Hallucination Rate": float(df["hallucinated"].mean())
}

print(summary)

# Same version tag as the results file the numbers were computed from.
with open("../data/eval/summary_v3.json", "w") as f:
    json.dump(summary, f)