In [1]:
import os
import re
from datasets import load_dataset

pubmed_ds = load_dataset("csv", data_files="../data/raw/PubMed/train.csv")

def process_pubmed_row(examples):
    processed_chunks = []
    for i in range(len(examples['abstract_text'])):

        text_content = f"In PubMed abstract {examples['abstract_id'][i]}, the {examples['target'][i]} section states: {examples['abstract_text'][i]}"
        
        metadata = {
            "source": "PubMed",
            "abstract_id": examples['abstract_id'][i],
            "target": examples['target'][i],
            "line_number": examples['line_number'][i]
        }
        
        processed_chunks.append({
            "content": text_content,
            "metadata": metadata
        })
    
    return {"processed_data": processed_chunks}

pubmed_processed = pubmed_ds.map(process_pubmed_row, batched=True, remove_columns=pubmed_ds['train'].column_names)
pubmed_chunks = list(pubmed_processed['train']['processed_data'])


def libc_semantic_chunking(text, metadata):
    header_pattern = re.compile(r'\n([A-Z][A-Z\s]+)\n')
    
    headers = list(header_pattern.finditer(text))
    chunks = []
    
    for i in range(len(headers)):
        start_idx = headers[i].start()
        end_idx = headers[i+1].start() if i + 1 < len(headers) else len(text)
        
        section_title = headers[i].group(1).strip()
        section_content = text[start_idx:end_idx].strip()
        
        meta = metadata.copy()
        meta["section"] = section_title
        
        chunks.append({
            "content": section_content,
            "metadata": meta
        })
    return chunks


def recursive_smart_chunking(text, chunk_size=600, overlap=100, metadata=None):
    separators = ["\n\n", "\n", " ", ""]
    
    def split_text(text, separators):
        if len(text) <= chunk_size:
            return [text]
        
        sep = separators[0]
        for s in separators:
            if s in text:
                sep = s
                break
        
        final_chunks = []
        parts = text.split(sep)
        current_doc = ""
        
        for p in parts:
            if len(current_doc) + len(p) + len(sep) <= chunk_size:
                current_doc += (sep if current_doc else "") + p
            else:
                if current_doc:
                    final_chunks.append(current_doc)
                current_doc = current_doc[-overlap:] + (sep if current_doc else "") + p
                
        if current_doc:
            final_chunks.append(current_doc)
        return final_chunks

    raw_chunks = split_text(text, separators)
    return [{"content": c, "metadata": metadata.copy() if metadata else {}} for c in raw_chunks]


def load_and_process(folder_path, source_name):
    txt_chunks = []
    source_map = {} 

    for filename in os.listdir(folder_path):
        if not filename.endswith(".txt"):
            continue
        
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            meta = {"source": source_name, "file_name": filename}
            
            if source_name == "libc":
                file_chunks = libc_semantic_chunking(content, meta)
            else:
                file_chunks = recursive_smart_chunking(content, metadata=meta)
                for i, chunk in enumerate(file_chunks):
                    chunk['metadata']['chunk_index'] = i
            
            txt_chunks.extend(file_chunks)
            source_map[filename] = file_chunks
            
    return txt_chunks, source_map


libc_chunks, libc_map_raw = load_and_process("../data/raw/libc", "libc")
pytorch_chunks, pytorch_map = load_and_process("../data/raw/pytorch", "pytorch")
final_chunks = libc_chunks + pytorch_chunks + pubmed_chunks

libc_map = {}
for f_name, chunks in libc_map_raw.items():
    libc_map[f_name] = {c['metadata']['section']: c['content'] for c in chunks}

pubmed_map = {}
for chunk in pubmed_chunks:
    abs_id = chunk['metadata']['abstract_id']             
    if abs_id not in pubmed_map:
        pubmed_map[abs_id] = []
    pubmed_map[abs_id].append(chunk)

global_maps = {
    'pubmed': pubmed_map,   
    'pytorch': pytorch_map, 
    'libc': libc_map        
}

def get_expanded_content(hit, global_maps, window_size=1):
    meta = hit['metadata']
    source = meta.get('source')
    
    if source == 'PubMed':
        abs_id = meta['abstract_id']
        curr_line = meta['line_number']
        all_lines = global_maps['pubmed'].get(abs_id, [])
        neighbors = [
            c['content'] for c in all_lines 
            if abs(c['metadata']['line_number'] - curr_line) <= window_size
        ]
        return " ".join(neighbors)
    
    elif source == 'pytorch':
        f_name = meta['file_name']
        idx = meta['chunk_index']
        all_chunks = global_maps['pytorch'].get(f_name, [])
        neighbors = [
            c['content'] for c in all_chunks 
            if abs(c['metadata']['chunk_index'] - idx) <= window_size
        ]
        return "\n\n...[CONTEXT OVERLAP]...\n\n".join(neighbors)
    
    elif source == 'libc':
        f_name = meta['file_name']
        sec = meta['section']
        return global_maps['libc'].get(f_name, {}).get(sec, hit['content'])
    
    return hit['content']

In [2]:
from langchain_huggingface import HuggingFaceEmbeddings

# Use a local embedding model to calculate semantic similarity for metrics like Answer Relevance.
test_embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "Qwen/Qwen2.5-1.5B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model_llm = AutoModelForCausalLM.from_pretrained(
    model_id, 
    quantization_config=bnb_config,
    device_map="auto" 
)

In [None]:
import faiss
import pickle
index = faiss.read_index("pubmed_pytorch_libc.index")

with open("final_chunks_metadata.pkl", "rb") as f:
    final_chunks = pickle.load(f)

In [4]:
import torch
import faiss
import pickle
import numpy as np
import gc
from tqdm import tqdm
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder

batch_size = 512
text_list = [c['content'] for c in final_chunks]
d = 384
M = 32
index = faiss.IndexHNSWFlat(d, M)
index.hnsw.efConstruction = 200

for i in tqdm(range(0, len(text_list), batch_size), desc="Building FAISS Index"):
    batch_texts = text_list[i : i + batch_size]
    batch_embeddings = test_embeddings.embed_documents(batch_texts)
    batch_embeddings_np = np.array(batch_embeddings).astype('float32')
    index.add(batch_embeddings_np)
    
    del batch_embeddings, batch_embeddings_np
    gc.collect() 

faiss.write_index(index, "pubmed_pytorch_libc.index")
print("FAISS Index saved")

with open("final_chunks_metadata.pkl", "wb") as f:
    pickle.dump(final_chunks, f)
print("Metadata saved.")

def technical_tokenizer(text):
    tokens = re.sub(r'[^a-zA-Z0-9._\s]', ' ', text.lower()).split()
    return tokens

def get_tokenized_corpus(chunks):
    for chunk in chunks:
        yield technical_tokenizer(chunk['content'])

bm25 = BM25Okapi(get_tokenized_corpus(final_chunks))
gc.collect()


# RRF Fusion Algorithm
def rrf_fusion(vector_results, bm25_results, k=60):
    scores = {}
    for rank, idx in enumerate(vector_results):
        scores[idx] = scores.get(idx, 0) + 1 / (k + rank)
    for rank, idx in enumerate(bm25_results):
        scores[idx] = scores.get(idx, 0) + 1 / (k + rank)
    
    sorted_indices = sorted(scores.keys(), key=lambda x: scores[x], reverse=True)
    return sorted_indices

reranker_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device='cpu')

# Hard filter
source_to_indices = {}
for idx, chunk in enumerate(final_chunks):
    s = chunk['metadata'].get('source')
    if s:
        if s not in source_to_indices:
            source_to_indices[s] = []
        source_to_indices[s].append(idx)


# Hybrid Search
def query_rag_v4(user_query, k, rerank_top_n, source_filter=None):
    query_vec = test_embeddings.embed_query(user_query)
    query_vec = np.array([query_vec]).astype('float32')
    
    # Hard Filter
    allowed_indices = None
    if source_filter:
        allowed_indices = source_to_indices.get(source_filter)
    
    # FAISS Retrieval
    index.hnsw.efSearch = 128 

    search_k = k * 2
    if allowed_indices is not None:
        search_k = min(search_k, len(allowed_indices))
    
    if allowed_indices is not None and search_k > 0:
        selector = faiss.IDSelectorBatch(allowed_indices)
        params = faiss.SearchParameters(sel=selector)
        _, I_vector = index.search(query_vec, search_k, params=params)
    else:
        if search_k > 0:
            _, I_vector = index.search(query_vec, search_k)
        else:
            I_vector = [[]]
    
    vector_indices = I_vector[0].tolist()

    # BM25 Retrieval
    tokenized_query = technical_tokenizer(user_query)
    bm25_scores = bm25.get_scores(tokenized_query)
    
    if allowed_indices is not None:
        mask = np.zeros_like(bm25_scores)
        mask[allowed_indices] = 1
        bm25_scores = bm25_scores * mask

    bm25_indices = np.argsort(bm25_scores)[::-1][:k * 2].tolist()

    # RRF
    combined_indices = rrf_fusion(vector_indices, bm25_indices)

    # Reranking
    candidate_chunks = [final_chunks[i] for i in combined_indices[:10]]
    pairs = [[user_query, c['content']] for c in candidate_chunks]
    rerank_scores = reranker_model.predict(pairs)
    reranked_chunks = [c for _, c in sorted(zip(rerank_scores, candidate_chunks), key=lambda x: x[0], reverse=True)]
    
    # context expansion
    top_chunks = reranked_chunks[:rerank_top_n]
    expanded_texts = [get_expanded_content(hit, global_maps, window_size=1) for hit in top_chunks]
    context_str = "\n\n ".join(expanded_texts)
    full_prompt = f"You are a helpful agent, answer the question based on provided contexts. Please prioritize the most relevant information, ignore any noise, and answer the question accurately.\n\nContext:\n{context_str}\n\nQuestion: {user_query}\nAnswer:"

    return full_prompt, expanded_texts

def query_rag_final_v4(user_query, k=30):
    full_prompt, expanded_texts = query_rag_v4(user_query, k, rerank_top_n=3) 

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    inputs = tokenizer(full_prompt, return_tensors="pt").to(model_llm.device)
    
    with torch.no_grad():
        outputs = model_llm.generate(**inputs, max_new_tokens=512, do_sample=True, temperature=0.7)
    
    answer = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    
    return answer, expanded_texts

Building FAISS Index: 100%|██████████| 4370/4370 [1:47:38<00:00,  1.48s/it]


FAISS Index saved
Metadata saved.


In [5]:
import torch
import json
import csv
import pandas as pd
from tqdm.auto import tqdm
from datasets import Dataset
#from langchain_openai import ChatOpenAI
#from langchain_huggingface import HuggingFaceEmbeddings

questions = []
with open('../data/raw/questions.txt', 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    reader.fieldnames = [name.strip() for name in reader.fieldnames]
    
    for row in reader:
        q = row.get('Question', '').strip()
        gt = row.get('Ground Truth', '').strip()
        
        if q:
            questions.append({"question": q, "ground_truth": gt})

results = []
for item in tqdm(questions):
    q = item['question']
    gt = item['ground_truth']
    
    ans, contexts = query_rag_final_v4(q)
    
    results.append({
        "question": q,
        "answer": ans,
        "contexts": contexts,  
        "ground_truth": gt
    })


with open("../data/eval/results_v4.json", "w") as f:
    json.dump(results, f)

del model_llm
torch.cuda.empty_cache()

实际识别到的列名有: ['Source File', 'Question', 'Ground Truth ']


  0%|          | 0/98 [00:00<?, ?it/s]

In [6]:
import json
import numpy as np
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def embed(texts):
    if isinstance(texts, str):
        texts = [texts]
    return test_embeddings.embed_documents(texts)

def recall_at_k(contexts, gt, threshold=0.75):
    gt_emb = embed(gt)[0]
    ctx_embs = embed(contexts)
    sims = cosine_similarity([gt_emb], ctx_embs)[0]
    hit = np.max(sims) > threshold
    best_rank = int(np.argmax(sims)) + 1
    return hit, best_rank, float(np.max(sims))

def answer_gt_similarity(ans, gt):
    emb = embed([ans, gt])
    sim = cosine_similarity([emb[0]], [emb[1]])[0][0]
    return float(sim)
    
def answer_context_similarity(ans, contexts):
    ans_emb = embed(ans)[0]
    ctx_embs = embed(contexts)
    sims = cosine_similarity([ans_emb], ctx_embs)[0]
    return float(np.max(sims))

def hallucination_flag(ans_gt_sim, ans_ctx_sim,
                       gt_th=0.6, ctx_th=0.6):
    if ans_gt_sim < gt_th and ans_ctx_sim < ctx_th:
        return True
    return False


with open("../data/eval/results_v4.json") as f:
    results = json.load(f)


eval_results = []

for item in tqdm(results):
    q = item["question"]
    ans = item["answer"]
    contexts = item["contexts"]
    gt = item["ground_truth"]

    hit, rank, ctx_gt_sim = recall_at_k(contexts, gt)
    ans_gt_sim = answer_gt_similarity(ans, gt)
    ans_ctx_sim = answer_context_similarity(ans, contexts)
    hallucinated = hallucination_flag(ans_gt_sim, ans_ctx_sim)

    eval_results.append({
        "question": q,
        "hit": hit,
        "best_rank": rank,
        "ctx_gt_sim": ctx_gt_sim,
        "ans_gt_sim": ans_gt_sim,
        "ans_ctx_sim": ans_ctx_sim,
        "hallucinated": hallucinated
    })


import pandas as pd

df = pd.DataFrame(eval_results)

summary = {
    "Recall@k": df["hit"].mean(),
    "MRR": (1 / df["best_rank"]).mean(),
    "Avg Context-GT Sim": df["ctx_gt_sim"].mean(),
    "Avg Answer-GT Sim": df["ans_gt_sim"].mean(),
    "Avg Answer-Context Sim": df["ans_ctx_sim"].mean(),
    "Hallucination Rate": df["hallucinated"].mean()
}

print(summary)

with open("../data/eval/summary_v4.json", "w") as f:
    json.dump(summary, f)

  0%|          | 0/98 [00:00<?, ?it/s]

{'Recall@k': np.float64(0.3469387755102041), 'MRR': np.float64(0.7534013605442177), 'Avg Context-GT Sim': np.float64(0.7199309381486957), 'Avg Answer-GT Sim': np.float64(0.7702295959114209), 'Avg Answer-Context Sim': np.float64(0.8472616561740863), 'Hallucination Rate': np.float64(0.0)}
