## RAG PIPELINE

### Set up


In [None]:
import os
import torch
import pandas as pd
import chromadb
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from openai import AzureOpenAI
from datasets import load_dataset, Dataset as HFDataset
from bert_score import score as bert_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from ragas.metrics import faithfulness, answer_relevancy, context_recall, context_precision
from ragas import evaluate

AZURE_OPENAI_API_KEY = "ae9587f69088409992009cb7bcf61436"
AZURE_OPENAI_ENDPOINT = "https://genai-openai-eus.openai.azure.com/"
AZURE_OPENAI_API_VERSION = "2024-05-01-preview"
AZURE_OPENAI_DEPLOYMENT = "genai-GPT4o-mini"

client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION
)



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

MODEL_NAME = "microsoft/biogpt"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()
print("BioGPT model loaded for embeddings.")

Using device: cuda
BioGPT model loaded for embeddings.


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Exception ignored in: <function tqdm.__del__ at 0x00000278DB54D300>
Traceback (most recent call last):
  File "c:\Users\Jasper\anaconda3\envs\biogpt_gpu\Lib\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
  File "c:\Users\Jasper\anaconda3\envs\biogpt_gpu\Lib\site-packages\tqdm\notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm' object has no attribute 'disp'


### Build Chroma Vector Store

In [2]:
train_df = pd.read_csv("PubMedQA_artificial_RAG.csv")
train_df = train_df.dropna(subset=["context", "question", "long_answer"])
print(f"Loaded training data: {len(train_df)} rows")

Loaded training data: 211269 rows


In [5]:
chroma_client = chromadb.Client()
collection_name = "pubmedqa_biogpt"
try:
    chroma_client.delete_collection(name=collection_name)
except:
    pass
collection = chroma_client.create_collection(name=collection_name)

context_texts = train_df["context"].astype(str).tolist()
print("Embedding contexts...")

for i in tqdm(range(0, len(context_texts), 512), desc="Indexing batches"):
    batch = context_texts[i:i+512]
    emb = embed_texts_biogpt(batch)
    ids = [f"doc_{i+j}" for j in range(len(batch))]
    collection.add(documents=batch, embeddings=emb.tolist(), ids=ids)

print(f"Indexed {len(context_texts)} documents into ChromaDB.")

Embedding contexts...


Indexing batches: 100%|██████████| 413/413 [1:10:08<00:00, 10.19s/it]

Indexed 211269 documents into ChromaDB.





### Embedding

In [4]:
def embed_texts_biogpt(texts, batch_size=8):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        with torch.no_grad():
            inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=256).to(device)
            outputs = model(**inputs, output_hidden_states=True)
            emb = outputs.last_hidden_state.mean(dim=1).cpu()
            embeddings.append(emb)
    return torch.cat(embeddings, dim=0)

### Retrieval from Chroma

In [6]:
def retrieve_context(question, k=5):
    q_emb = embed_texts_biogpt([question])[0].tolist()
    results = collection.query(query_embeddings=[q_emb], n_results=k)
    retrieved_docs = results["documents"][0]
    return "\n\n".join(retrieved_docs)



In [7]:
def generate_answer(question, context):
    prompt = f"""You are a biomedical assistant. 
Use the context below to answer concisely.

Question: {question}
Context: {context}
Answer:"""
    completion = client.chat.completions.create(
        model=AZURE_OPENAI_DEPLOYMENT,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
        max_tokens=256
    )
    return completion.choices[0].message.content.strip()

### Evaluation

In [8]:
dataset = load_dataset("qiaojin/PubMedQA", "pqa_labeled")["train"]
questions = dataset["question"][:1000]
gold_answers = dataset["long_answer"][:1000]

pred_answers, contexts = [], []

for q in tqdm(questions, desc="Generating answers"):
    try:
        ctx = retrieve_context(q)
        ans = generate_answer(q, ctx)
        pred_answers.append(ans)
        contexts.append(ctx)
    except Exception as e:
        print(f"Skipped one: {e}")

min_len = min(len(questions), len(pred_answers), len(contexts), len(gold_answers))
questions, pred_answers, contexts, gold_answers = [
    lst[:min_len] for lst in [questions, pred_answers, contexts, gold_answers]
]
print(f" Using {min_len} valid samples for evaluation.")

Generating answers: 100%|██████████| 1000/1000 [34:22<00:00,  2.06s/it]

 Using 1000 valid samples for evaluation.





In [9]:
def sanitize_list(lst):
    return [str(x) for x in lst if x not in [None, float("nan"), ...]]

min_len = min(len(questions), len(pred_answers), len(contexts), len(gold_answers))
questions, pred_answers, contexts, gold_answers = [
    sanitize_list(lst[:min_len]) for lst in [questions, pred_answers, contexts, gold_answers]
]
print(f"Using {min_len} samples for evaluation.")

Using 1000 samples for evaluation.


### Compute BERTscore

In [12]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from bert_score import score as bert_score
import pandas as pd

print("\nEvaluating BERTScore...")
P, R, F1 = bert_score(pred_answers, gold_answers, lang="en", verbose=True)
mean_bert_f1 = F1.mean().item()
print(f"Average BERTScore (semantic F1): {mean_bert_f1:.4f}")

threshold = 1
binary_preds = [1 if f >= threshold else 0 for f in F1]
binary_labels = [1] * len(binary_preds)

accuracy = accuracy_score(binary_labels, binary_preds)
precision = precision_score(binary_labels, binary_preds, zero_division=0)
recall = recall_score(binary_labels, binary_preds, zero_division=0)
f1_cls = f1_score(binary_labels, binary_preds, zero_division=0)

print(f"\nThreshold = {threshold}")
print(f"Accuracy  : {accuracy:.4f}")
print(f"Precision : {precision:.4f}")
print(f"Recall    : {recall:.4f}")
print(f"F1-score  : {f1_cls:.4f}")

# Save results
eval_df = pd.DataFrame({
    "question": questions,
    "context": contexts,
    "pred_answer": pred_answers,
    "gold_answer": gold_answers,
    "bert_f1": F1.tolist(),
    "is_correct": binary_preds
})
eval_df.to_csv("RAG_Evaluation_Results.csv", index=False)
eval_df.to_excel("RAG_Evaluation_Results.xlsx", index=False)
print("Saved: RAG_Evaluation_Results.csv")


Evaluating BERTScore...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/32 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/16 [00:00<?, ?it/s]

done in 8.01 seconds, 124.88 sentences/sec
Average BERTScore (semantic F1): 0.8692

Threshold = 1
Accuracy  : 0.0000
Precision : 0.0000
Recall    : 0.0000
F1-score  : 0.0000
Saved: RAG_Evaluation_Results.csv


### RAGAS Metrics

In [None]:
import os, asyncio, pandas as pd
from datasets import load_dataset
from openai import AzureOpenAI
from tqdm import tqdm

# ============================================================
# 1. Azure OpenAI Setup
# ============================================================
AZURE_OPENAI_API_KEY = "ae9587f69088409992009cb7bcf61436"
AZURE_OPENAI_ENDPOINT = "https://genai-openai-eus.openai.azure.com/"
AZURE_OPENAI_API_VERSION = "2024-05-01-preview"
AZURE_OPENAI_DEPLOYMENT = "genai-GPT4o-mini"

client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION
)

async def llm_score(prompt: str) -> float:
    """LLM judge that returns a normalized score between 0–1."""
    try:
        response = client.chat.completions.create(
            model=AZURE_OPENAI_DEPLOYMENT,
            temperature=0,
            messages=[
                {"role": "system", "content": "You are an expert evaluator who outputs only a number between 0 and 1."},
                {"role": "user", "content": prompt}
            ],
        )
        text = response.choices[0].message.content.strip()
        # Extract numeric portion
        for token in text.split():
            try:
                val = float(token)
                return min(max(val, 0.0), 1.0)
            except ValueError:
                continue
    except Exception as e:
        print("Error:", e)
    return 0.0

# ============================================================
# 2. Metric Prompts
# ============================================================
def make_prompts(question, context, answer):
    return {
        "faithfulness": f"""
You are evaluating the factual consistency of an answer given the context.

Question: {question}
Context: {context}
Answer: {answer}

Rate how factually supported the answer is by the context (0 = not supported, 1 = fully supported).
Respond with a single number between 0 and 1.""",

        "answer_relevancy": f"""
You are evaluating how relevant an answer is to a given question.

Question: {question}
Answer: {answer}

Rate how directly and correctly the answer addresses the question (0 = unrelated, 1 = fully relevant).
Respond with only a number between 0 and 1.""",

        "context_precision": f"""
You are evaluating the relevance of the retrieved context.

Question: {question}
Context: {context}

Rate how much of the context is relevant to answering the question (0 = mostly irrelevant, 1 = fully relevant).
Respond with only a number between 0 and 1.""",

        "context_recall": f"""
You are evaluating how completely the context covers the information required to answer the question.

Question: {question}
Context: {context}

Rate how much of the necessary information is contained in the context (0 = missing key info, 1 = fully complete).
Respond with only a number between 0 and 1."""
    }

# ============================================================
# 3. Load Unseen Dataset
# ============================================================
dataset = load_dataset("qiaojin/PubMedQA", "pqa_labeled")["train"].select(range(1000))
questions = [ex["question"] for ex in dataset]
contexts = [" ".join(ex["context"]["contexts"]) for ex in dataset]
pred_answers = ["Generated placeholder answer" for _ in dataset]  # <-- plug in your model outputs

# ============================================================
# 4. Evaluate Each Metric via GPT-4o
# ============================================================
async def evaluate_llm_metrics():
    results = []
    for i, (q, c, a) in enumerate(zip(questions, contexts, pred_answers), start=1):
        print(f"\nEvaluating sample {i}/{len(questions)}")
        prompts = make_prompts(q, c, a)
        f = await llm_score(prompts["faithfulness"])
        r = await llm_score(prompts["answer_relevancy"])
        p = await llm_score(prompts["context_precision"])
        rec = await llm_score(prompts["context_recall"])

        results.append({
            "question": q,
            "answer": a,
            "faithfulness": f,
            "answer_relevancy": r,
            "context_precision": p,
            "context_recall": rec
        })
    return results

# ============================================================
# 5. Run Evaluation + Compute Averages
# ============================================================
async def main():
    results = await evaluate_llm_metrics()
    df = pd.DataFrame(results)
    df.to_csv("RAGAS_Eval_Scores.csv", index=False)
    print("\nSaved LLM_Eval_Scores.csv")

    # Compute averages
    avg_scores = df[["faithfulness", "answer_relevancy", "context_precision", "context_recall"]].mean()
    print("\n=== Average LLM Evaluation Scores ===")
    print(avg_scores)

asyncio.run(main())
