In [1]:
import os
from datasets import load_dataset

pubmed_ds = load_dataset("csv", data_files="../data/raw/PubMed/train.csv")

def process_pubmed_row(examples):
    processed_chunks = []
    for i in range(len(examples['abstract_text'])):

        text_content = f"In PubMed abstract {examples['abstract_id'][i]}, the {examples['target'][i]} section states: {examples['abstract_text'][i]}"
        
        metadata = {
            "source": "PubMed",
            "abstract_id": examples['abstract_id'][i],
            "target": examples['target'][i],
            "line_number": examples['line_number'][i]
        }
        
        processed_chunks.append({
            "content": text_content,
            "metadata": metadata
        })
    
    return {"processed_data": processed_chunks}

pubmed_processed = pubmed_ds.map(process_pubmed_row, batched=True, remove_columns=pubmed_ds['train'].column_names)
pubmed_chunks = list(pubmed_processed['train']['processed_data'])

# Recursive Character Splitting
def txt_chunking(text, chunk_size=600, metadata=None):
    paragraphs = text.split('\n\n')
    chunks = []
    current_chunk = ""
    
    for p in paragraphs:
        if len(current_chunk) + len(p) <= chunk_size:
            current_chunk += p + "\n\n"
        else:
            if current_chunk:
                chunks.append({
                    "content": current_chunk.strip(),
                    "metadata": metadata.copy() if metadata else {}
                })
            current_chunk = p + "\n\n"
            
    if current_chunk:
        chunks.append({
            "content": current_chunk.strip(),
            "metadata": metadata.copy() if metadata else {}
        })
    return chunks

txt_chunks = [] 
def load_txt_folder(folder_path, source_name):
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
    
                meta = {
                    "source": source_name,
                    "file_name": filename
                }
                
                file_chunks = txt_chunking(content, metadata=meta)
                txt_chunks.extend(file_chunks)


load_txt_folder("../data/raw/libc", "libc")
load_txt_folder("../data/raw/pytorch", "pytorch")

final_chunks = txt_chunks + pubmed_chunks

In [2]:
from langchain_huggingface import HuggingFaceEmbeddings

# Use a local embedding model to calculate semantic similarity for metrics like Answer Relevance.
test_embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")

In [3]:
import faiss
import numpy as np

dimension = 384 
index = faiss.IndexFlatL2(dimension)
text_list = [c['content'] for c in final_chunks]
embeddings = test_embeddings.embed_documents(text_list) 
embeddings = np.array(embeddings).astype('float32')
index.add(embeddings)

def query_rag_v1(user_query, k):
    query_vec = test_embeddings.embed_query(user_query)
    query_vec = np.array([query_vec]).astype('float32')
    
    _, I_vector = index.search(query_vec, k)
    vector_indices = I_vector[0].tolist()
    
    retrieved_texts = [final_chunks[i]['content'] for i in vector_indices if i != -1]
    
    context_str = "\n\n".join(retrieved_texts)
    full_prompt = f"Context:\n{context_str}\n\nQuestion: {user_query}\nAnswer:"
    
    return full_prompt, retrieved_texts

def query_rag_final_v1(user_query, k=10):
    full_prompt, contexts = query_rag_v1(user_query, k)
    
    inputs = tokenizer(full_prompt, return_tensors="pt").to(model_llm.device)
    with torch.no_grad():
        outputs = model_llm.generate(**inputs, max_new_tokens=512)
    
    answer = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return answer, contexts

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_id = "Qwen/Qwen2.5-1.5B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model_llm = AutoModelForCausalLM.from_pretrained(
    model_id, 
    quantization_config=bnb_config,
    device_map="auto" 
)

In [5]:
import torch
import json
import pandas as pd
from tqdm.auto import tqdm
from datasets import Dataset
from langchain_openai import ChatOpenAI

with open('../data/raw/questions.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

data = []
for line in lines[1:]:
    parts = line.strip().split(',', 2) 
    if len(parts) == 3:
        data.append(parts)

df_test = pd.DataFrame(data, columns=['category', 'question', 'ground_truth'])

df_test['ground_truth'] = df_test['ground_truth'].str.strip('"')

results = []

for _, row in tqdm(df_test.iterrows(), total=len(df_test)):
    q = row['question']
    gt = row['ground_truth']
    
    ans, contexts = query_rag_final_v1(q)
    
    results.append({
        "question": q,
        "answer": ans,
        "contexts": [doc.page_content if hasattr(doc, 'page_content') else str(doc) for doc in contexts],  
        "ground_truth": gt
    })

eval_dataset = Dataset.from_list(results) 

with open("../data/eval/results_v1.json", "w") as f:
    json.dump(results, f)

del model_llm
torch.cuda.empty_cache()

  0%|          | 0/98 [00:00<?, ?it/s]

In [6]:
import json
import numpy as np
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def embed(texts):
    if isinstance(texts, str):
        texts = [texts]
    return test_embeddings.embed_documents(texts)

def recall_at_k(contexts, gt, threshold=0.75):
    gt_emb = embed(gt)[0]
    ctx_embs = embed(contexts)
    sims = cosine_similarity([gt_emb], ctx_embs)[0]
    hit = np.max(sims) > threshold
    best_rank = int(np.argmax(sims)) + 1
    return hit, best_rank, float(np.max(sims))

def answer_gt_similarity(ans, gt):
    emb = embed([ans, gt])
    sim = cosine_similarity([emb[0]], [emb[1]])[0][0]
    return float(sim)
    
def answer_context_similarity(ans, contexts):
    ans_emb = embed(ans)[0]
    ctx_embs = embed(contexts)
    sims = cosine_similarity([ans_emb], ctx_embs)[0]
    return float(np.max(sims))

def hallucination_flag(ans_gt_sim, ans_ctx_sim,
                       gt_th=0.6, ctx_th=0.6):
    if ans_gt_sim < gt_th and ans_ctx_sim < ctx_th:
        return True
    return False


with open("../data/eval/results_v1.json") as f:
   results = json.load(f)


eval_results = []

for item in tqdm(results):
    q = item["question"]
    ans = item["answer"]
    contexts = item["contexts"]
    gt = item["ground_truth"]

    hit, rank, ctx_gt_sim = recall_at_k(contexts, gt)
    ans_gt_sim = answer_gt_similarity(ans, gt)
    ans_ctx_sim = answer_context_similarity(ans, contexts)
    hallucinated = hallucination_flag(ans_gt_sim, ans_ctx_sim)

    eval_results.append({
        "question": q,
        "hit": hit,
        "best_rank": rank,
        "ctx_gt_sim": ctx_gt_sim,
        "ans_gt_sim": ans_gt_sim,
        "ans_ctx_sim": ans_ctx_sim,
        "hallucinated": hallucinated
    })


import pandas as pd

df = pd.DataFrame(eval_results)

summary = {
    "Recall@k": df["hit"].mean(),
    "MRR": (1 / df["best_rank"]).mean(),
    "Avg Context-GT Sim": df["ctx_gt_sim"].mean(),
    "Avg Answer-GT Sim": df["ans_gt_sim"].mean(),
    "Avg Answer-Context Sim": df["ans_ctx_sim"].mean(),
    "Hallucination Rate": df["hallucinated"].mean()
}

print(summary)

with open("../data/eval/summary_v1.json", "w") as f:
    json.dump(summary, f)

  0%|          | 0/98 [00:00<?, ?it/s]

{'Recall@k': np.float64(0.5612244897959183), 'MRR': np.float64(0.5948250728862974), 'Avg Context-GT Sim': np.float64(0.7944092929327524), 'Avg Answer-GT Sim': np.float64(0.7496613243003292), 'Avg Answer-Context Sim': np.float64(0.858734216109723), 'Hallucination Rate': np.float64(0.01020408163265306)}
