## TT04: RAG Model Evaluation

In [4]:
#import libraries
from haystack import Pipeline
from haystack.components.builders import ChatPromptBuilder
from haystack.dataclasses import ChatMessage
from haystack.components.generators.chat import OpenAIChatGenerator
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.components.joiners import DocumentJoiner
import sentence_transformers
from haystack.components.writers import DocumentWriter
from haystack.components.converters import PyPDFToDocument
from haystack.components.converters import TextFileToDocument
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.routers import FileTypeRouter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.builders.answer_builder import AnswerBuilder

import os
from getpass import getpass

## TT02 & TT03 Recap
---
### Preprocessing Pipeline

In [3]:
# Indexing pipeline: route by file type -> convert -> join -> clean -> split -> embed -> write.

#create the document store that the retriever will query later
document_store = InMemoryDocumentStore()

#indicate the types of files to expect
file_type_router = FileTypeRouter(mime_types=["text/plain", "application/pdf"])

#we need converters for the types of documents we are getting
text_file_converter = TextFileToDocument()
pdf_converter = PyPDFToDocument()

#this will join all our documents so they can be fed through the pipeline together
document_joiner = DocumentJoiner()

#add cleaning and preprocessing functions
document_cleaner = DocumentCleaner() #standardizes and removes extra whitespace
document_splitter = DocumentSplitter(split_by="word", split_length=150, split_overlap=50) #chunks the text with overlap for context, like we did with spaCy

#create document embeddings
# NOTE(review): the retrieval cell further down uses InMemoryBM25Retriever, which
# ranks by keyword match; the embeddings computed here (~23 min in the recorded run)
# look unused unless an embedding retriever is planned -- confirm before keeping this step.
document_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2") #use prebuilt model
document_embedder.warm_up() #download the embedding model

#write the processed documents to the InMemoryDocumentStore
document_writer = DocumentWriter(document_store)

preprocessing_pipeline = Pipeline()
preprocessing_pipeline.add_component(instance=file_type_router, name="file_type_router")
preprocessing_pipeline.add_component(instance=text_file_converter, name="text_file_converter")
preprocessing_pipeline.add_component(instance=pdf_converter, name="pypdf_converter")
preprocessing_pipeline.add_component(instance=document_joiner, name="document_joiner")
preprocessing_pipeline.add_component(instance=document_cleaner, name="document_cleaner")
preprocessing_pipeline.add_component(instance=document_splitter, name="document_splitter")
preprocessing_pipeline.add_component(instance=document_embedder, name="document_embedder")
preprocessing_pipeline.add_component(instance=document_writer, name="document_writer")

# Each MIME type goes to its own converter; both converters feed the same joiner.
preprocessing_pipeline.connect("file_type_router.text/plain", "text_file_converter.sources")
preprocessing_pipeline.connect("file_type_router.application/pdf", "pypdf_converter.sources")
preprocessing_pipeline.connect("text_file_converter", "document_joiner")
preprocessing_pipeline.connect("pypdf_converter", "document_joiner")
preprocessing_pipeline.connect("document_joiner", "document_cleaner")
preprocessing_pipeline.connect("document_cleaner", "document_splitter")
preprocessing_pipeline.connect("document_splitter", "document_embedder")
preprocessing_pipeline.connect("document_embedder", "document_writer")

from pathlib import Path

# get all our documents
file = "./TT04_Document_Library/"
# rglob("*") yields sub-directories as well as files; only regular files are
# valid sources for the router, so directories are filtered out here.
files = [path for path in Path(file).rglob("*") if path.is_file()]

# run the pipeline
preprocessing_pipeline.run({"file_type_router": {"sources": files}})

Batches: 100%|██████████| 757/757 [22:50<00:00,  1.81s/it]


{'document_writer': {'documents_written': 24212}}

### Load OpenAI Key

In [5]:
# Only prompt for the OpenAI API key when the environment does not already define it,
# so the secret never has to be written into the notebook.
if os.environ.get("OPENAI_API_KEY") is None:
    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")

### Prompt Engineering

In [6]:
# Prompt text for the triage assistant. It is a Jinja-style template: the
# `documents` loop and the `question` placeholder are filled in by the
# prompt builder when the pipeline runs.
triage_prompt_text = """
You are an AI assistant specializing in emergency medical triage. 
Your task is to assess patient cases, determine the urgency level, and provide the best immediate recommendations.
If you receive a question without a patient case you may answer or ask follow-up questions, but any query with a 
patient case should receive urgency level and immediate recommendations.

Guidelines:
- Red (Critical): Immediate life-threatening condition (e.g., heart attack, stroke).
- Yellow (Urgent): Needs prompt medical attention but not critical (e.g., moderate breathing difficulty).
- Green (Non-Urgent): Minor conditions (e.g., mild fever, minor injuries).

### Contextual Information:
{% for document in documents %}
    - {{ document.content }}
{% endfor %}

### Patient Query:
{{ question }}

### Triage Decision & Rationale:
"""

# The chat prompt is a list of messages; here it is a single user message.
template = [ChatMessage.from_user(triage_prompt_text)]

### RAG Model Pipeline

In [7]:
# Build the individual components of the question-answering pipeline.
retriever = InMemoryBM25Retriever(document_store, top_k=5)  # top 5 BM25 matches from the store
document_joiner = DocumentJoiner()
prompt_builder = ChatPromptBuilder(template=template)
chat_generator = OpenAIChatGenerator(model="gpt-4o-mini")
answer_builder = AnswerBuilder()

# Register every component under the name used in the connections below.
rag_pipeline = Pipeline()
for component_name, component in (
    ("retriever", retriever),
    ("document_joiner", document_joiner),
    ("prompt_builder", prompt_builder),
    ("llm", chat_generator),
    ("answer_builder", answer_builder),
):
    rag_pipeline.add_component(component_name, component)

# Wire the graph: retrieved documents feed both the prompt and the answer
# builder; the LLM replies are attached to the final answer.
for sender, receiver in (
    ("retriever.documents", "document_joiner.documents"),
    ("document_joiner.documents", "prompt_builder.documents"),
    ("prompt_builder.prompt", "llm.messages"),
    ("document_joiner.documents", "answer_builder.documents"),
    ("llm.replies", "answer_builder"),
):
    rag_pipeline.connect(sender, receiver)

# Bare expression so the notebook displays the assembled pipeline.
rag_pipeline

<haystack.core.pipeline.pipeline.Pipeline object at 0x00000268540DDE10>
🚅 Components
  - retriever: InMemoryBM25Retriever
  - document_joiner: DocumentJoiner
  - prompt_builder: ChatPromptBuilder
  - llm: OpenAIChatGenerator
  - answer_builder: AnswerBuilder
🛤️ Connections
  - retriever.documents -> document_joiner.documents (List[Document])
  - document_joiner.documents -> prompt_builder.documents (List[Document])
  - document_joiner.documents -> answer_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.messages (List[ChatMessage])
  - llm.replies -> answer_builder.replies (List[ChatMessage])

## TT04: Model Evaluation
---

In [8]:
# Read the sample patient case from disk; its full text is used as the query.
with open("medical_case_1.txt", "r") as case_file:
    query = case_file.read()

# Show the case so the reader can compare it with the model's answer.
print(query)

Medical Specialty:
Emergency Room Reports

Sample Name: Asthma in a 5-year-old

Description: Mother states he has been wheezing and coughing.
(Medical Transcription Sample Report)
CHIEF COMPLAINT: This 5-year-old male presents to Children's Hospital Emergency Department by the mother with "have asthma." Mother states he has been wheezing and coughing. They saw their primary medical doctor. He was evaluated at the clinic, given the breathing treatment and discharged home, was not having asthma, prescribed prednisone and an antibiotic. They told to go to the ER if he got worse. He has had some vomiting and some abdominal pain. His peak flows on the morning are normal at 150, but in the morning, they were down to 100 and subsequently decreased to 75 over the course of the day.

PAST MEDICAL HISTORY: Asthma with his last admission in 07/2007. Also inclusive of frequent pneumonia by report.

IMMUNIZATIONS: Up-to-date.

ALLERGIES: Denied.

MEDICATIONS: Advair, Nasonex, Xopenex, Zicam, Zithro

In [9]:
# The same case text is passed to three components: the retriever searches with it,
# the prompt builder inserts it as `question`, and the answer builder records it.
pipeline_inputs = {
    "retriever": {"query": query},
    "prompt_builder": {"question": query},
    "answer_builder": {"query": query},
}

# Run pipeline
response = rag_pipeline.run(pipeline_inputs)

## Model Evaluation

In [10]:
from ragas import evaluate
from ragas.dataset_schema import EvaluationDataset

# The answer builder returns a list of answers; this run has a single query,
# so the first answer is the one to evaluate.
generated_answer = response["answer_builder"]["answers"][0]

# Extract the generated answer
answer_text = generated_answer.data

# Extract original query
query = generated_answer.query

# Extract retrieved document texts
retrieved_docs = generated_answer.documents
document_texts = [doc.content for doc in retrieved_docs]

# set up the evaluation dataset (one single-turn sample)
eval_dict = {
    "user_input": query,  # User's original query
    "response": answer_text,  # AI-generated answer
    "retrieved_contexts": document_texts  # Retrieved documents
}

# The samples are a list of dicts, so use the list-oriented constructor
# (from_dict is declared for a mapping, not a list).
evaluation_dataset = EvaluationDataset.from_list([eval_dict])

In [11]:
# Show the generated triage recommendation in full.
print("**AI Triage Recommendation:**\n")
print(answer_text)

# List a short preview of each retrieved chunk, numbered from 1.
print("\n**Retrieved Documents:**\n")
for position, text in enumerate(document_texts, start=1):
    excerpt = text[:50]  # first 50 characters only
    print(f"Document {position} Excerpt: {excerpt}...\n")

**AI Triage Recommendation:**

**Urgency Level: Yellow (Urgent)**

**Rationale:** The patient is a 5-year-old boy with a history of asthma showing wheezing, coughing, signs of respiratory distress, and low oxygen saturation (91% on room air). Although he is alert and cooperating, he has had a significant drop in peak flow and now has continued audible wheezing, which indicates that his asthma is not well controlled. His vital signs (respiratory rate of 28 and heart rate of 105) indicate some level of distress, and he is showing signs of respiratory compromise with low oxygen saturation.

**Immediate Recommendations:**
1. **Administer Supplemental Oxygen:** Increase oxygen delivery to improve saturation immediately while continuing monitoring.
  
2. **Continue High-Dose Albuterol Therapy:** Ensure the child receives continuous nebulizer treatments to manage bronchospasm.

3. **Administer Corticosteroids:** Decadron (dexamethasone) should continue as it helps reduce airway inflammation.


In [12]:
from ragas.metrics import AnswerRelevancy, Faithfulness, LLMContextPrecisionWithoutReference, AspectCritic
from ragas.llms import LangchainLLMWrapper
# NOTE(review): the recorded output of this cell shows a warning pointing at the
# ChatOpenAI(...) line below; this import path appears to be deprecated in
# LangChain -- confirm and migrate to the currently recommended package.
from langchain.chat_models.openai import ChatOpenAI

# Initialize LLM Wrapper
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0) #temperature = 0 for more consistent results
evaluator_llm = LangchainLLMWrapper(llm)

# Every LLM-backed metric is given the same temperature-0 evaluator so that all
# scores come from one consistent judge (AnswerRelevancy was previously
# constructed without it, unlike the other three metrics).
evaluation_metrics = [
    AnswerRelevancy(llm=evaluator_llm),
    AspectCritic(name="patient triaged",
                 definition="Does the response include a triage level of Red, Yellow, or Green?",
                 llm=evaluator_llm),
    Faithfulness(llm=evaluator_llm),
    LLMContextPrecisionWithoutReference(llm=evaluator_llm),
]

# Run Ragas Evaluation
ragas_evaluator = evaluate(
    dataset=evaluation_dataset,
    metrics=evaluation_metrics,
)

print(ragas_evaluator)

  llm = ChatOpenAI(model="gpt-4o-mini", temperature=0) #temperature = 0 for more consistent results
Evaluating: 100%|██████████| 4/4 [00:26<00:00,  6.58s/it]


{'answer_relevancy': 0.8609, 'patient triaged': 1.0000, 'faithfulness': 0.4783, 'llm_context_precision_without_reference': 1.0000}


## References

---

- Haystack - Ragas. (2025). Ragas.io. https://docs.ragas.io/en/stable/howtos/integrations/haystack/
- RagasEvaluator. (2025). Haystack Documentation. https://docs.haystack.deepset.ai/docs/ragasevaluator
- RAG Pipeline Evaluation Using RAGAS | Haystack. (2024). Haystack. https://haystack.deepset.ai/cookbook/rag_eval_ragas