# Using Ragas to Evaluate a RAG Application

In [1]:
# Standard Library Imports
import copy
import os
import time
from getpass import getpass
from uuid import uuid4

# Third-Party Imports
# LangChain Core
from langchain.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever

# LangChain Community
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader

# LangChain Integrations
from langchain_cohere import CohereRerank
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore

# LangGraph
from langgraph.graph import START, StateGraph

# Qdrant
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

# RAGAS
from ragas import EvaluationDataset, RunConfig, evaluate
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (
    ContextEntityRecall,
    Faithfulness,
    FactualCorrectness,
    LLMContextRecall,
    NoiseSensitivity,
    ResponseRelevancy,
)
from ragas.testset import TestsetGenerator

# Typing
from typing_extensions import List, TypedDict

# Local Application Imports
# (none yet)

In [2]:
os.environ["OPENAI_API_KEY"] = getpass("Please enter your OpenAI API key!")
os.environ["COHERE_API_KEY"] = getpass("Please enter your Cohere API key!")
os.environ["LANGSMITH_API_KEY"] = getpass("LangSmith API Key:")
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_PROJECT"] = f"AIM - RAGAS EVALS - {uuid4().hex[0:8]}"

In [3]:
BASELINE_PROMPT = """\
You are a helpful assistant who answers questions based on provided context. You must only use the provided context, and cannot use your own knowledge.

### Question
{question}

### Context
{context}
"""

In [4]:
from pathlib import Path

project_root = Path.cwd().parent  # Go up one level from notebooks/ to project root
data_path = project_root / "data"

print(f"Project root: {project_root}")
print(f"Data path: {data_path}")
print(f"Data path exists: {data_path.exists()}")

# Load documents
loader = DirectoryLoader(str(data_path), glob="*.pdf", loader_cls=PyMuPDFLoader)
docs = loader.load()
print(f"Loaded {len(docs)} documents")

ragas_docs = docs
retriever_docs = docs

Project root: /home/donbr/don-aie-cohort8/aie8-s09-adv-retrieval
Data path: /home/donbr/don-aie-cohort8/aie8-s09-adv-retrieval/data
Data path exists: True
Loaded 64 documents


In [5]:
# RAGAS
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-mini"))

# langchain_openai
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
llm = ChatOpenAI(model="gpt-4.1-nano")

In [6]:
generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)

In [7]:
golden_testset = generator.generate_with_langchain_docs(ragas_docs, testset_size=10)
golden_testset.to_pandas()

Applying HeadlinesExtractor:   0%|          | 0/21 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/64 [00:00<?, ?it/s]

unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to ap

Applying SummaryExtractor:   0%|          | 0/38 [00:00<?, ?it/s]

Property 'summary' already exists in node '176f7e'. Skipping!
Property 'summary' already exists in node '09c993'. Skipping!
Property 'summary' already exists in node 'dffb12'. Skipping!
Property 'summary' already exists in node 'd06d7f'. Skipping!
Property 'summary' already exists in node 'fa06fc'. Skipping!
Property 'summary' already exists in node '4d8553'. Skipping!
Property 'summary' already exists in node '4325ad'. Skipping!
Property 'summary' already exists in node 'ff45c8'. Skipping!
Property 'summary' already exists in node '37f24a'. Skipping!
Property 'summary' already exists in node '040138'. Skipping!
Property 'summary' already exists in node '1013a5'. Skipping!
Property 'summary' already exists in node '84d7eb'. Skipping!
Property 'summary' already exists in node 'cd0b56'. Skipping!
Property 'summary' already exists in node 'c7f581'. Skipping!
Property 'summary' already exists in node 'e5dd26'. Skipping!
Property 'summary' already exists in node '79fc98'. Skipping!
Property

Applying CustomNodeFilter:   0%|          | 0/8 [00:00<?, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/48 [00:00<?, ?it/s]

Property 'summary_embedding' already exists in node 'ff45c8'. Skipping!
Property 'summary_embedding' already exists in node 'fa06fc'. Skipping!
Property 'summary_embedding' already exists in node '176f7e'. Skipping!
Property 'summary_embedding' already exists in node 'dffb12'. Skipping!
Property 'summary_embedding' already exists in node '4325ad'. Skipping!
Property 'summary_embedding' already exists in node '09c993'. Skipping!
Property 'summary_embedding' already exists in node '1013a5'. Skipping!
Property 'summary_embedding' already exists in node '4d8553'. Skipping!
Property 'summary_embedding' already exists in node '37f24a'. Skipping!
Property 'summary_embedding' already exists in node 'd06d7f'. Skipping!
Property 'summary_embedding' already exists in node '040138'. Skipping!
Property 'summary_embedding' already exists in node '84d7eb'. Skipping!
Property 'summary_embedding' already exists in node 'cd0b56'. Skipping!
Property 'summary_embedding' already exists in node '79fc98'. Sk

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,"When was ChatGPT launched, and what significan...",[Introduction ChatGPT launched in November 202...,"ChatGPT was launched in November 2022, marking...",single_hop_specifc_query_synthesizer
1,cud u tel me how many total mesages were there...,[Month Non-Work (M) (%) Work (M) (%) Total Mes...,"In Jun 2025, there were 2,627 total messages, ...",single_hop_specifc_query_synthesizer
2,What details are provided in Section 3 regardi...,[Table 1: ChatGPT daily message counts (millio...,Section 3 contains details about the exclusion...,single_hop_specifc_query_synthesizer
3,What occupation category does SOC2 code 19 rep...,[Variation by Occupation Figure 23 presents va...,"In the context of ChatGPT usage analysis, SOC2...",single_hop_specifc_query_synthesizer
4,How has the growth in ChatGPT message volume a...,[<1-hop>\n\nVariation by Occupation Figure 23 ...,The growth in ChatGPT message volume has been ...,multi_hop_abstract_query_synthesizer
5,How does ChatGPT adoption and usage differ by ...,[<1-hop>\n\nIntroduction ChatGPT launched in N...,ChatGPT adoption and usage show significant va...,multi_hop_abstract_query_synthesizer
6,What evidence from internal ChatGPT message da...,[<1-hop>\n\nIntroduction ChatGPT launched in N...,Internal ChatGPT message data shows unpreceden...,multi_hop_abstract_query_synthesizer
7,"Based on the message volume statistics, how di...",[<1-hop>\n\nMonth Non-Work (M) (%) Work (M) (%...,"Between June 2024 and June 2025, ChatGPT's tot...",multi_hop_abstract_query_synthesizer
8,How did the total number of ChatGPT messages a...,[<1-hop>\n\nMonth Non-Work (M) (%) Work (M) (%...,"In June 2025, there were 2,627 million total m...",multi_hop_specific_query_synthesizer
9,how many messages people send in jun 2025 and ...,[<1-hop>\n\nMonth Non-Work (M) (%) Work (M) (%...,"in jun 2025, people send 2,627 million (2.627 ...",multi_hop_specific_query_synthesizer


In [8]:
baseline_dataset = copy.deepcopy(golden_testset)
rerank_dataset = copy.deepcopy(golden_testset)

## LangGraph RAG

### First Graph - baseline

In [9]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=30)
split_documents = text_splitter.split_documents(docs)
len(split_documents)

275

In [10]:
rag_prompt = ChatPromptTemplate.from_template(BASELINE_PROMPT)

In [11]:
baseline_client = QdrantClient(":memory:")

baseline_client.create_collection(
    collection_name="use_case_data",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

baseline_vector_store = QdrantVectorStore(
    client=baseline_client,
    collection_name="use_case_data",
    embedding=embeddings,
)

In [12]:
_ = baseline_vector_store.add_documents(documents=split_documents)

retriever = baseline_vector_store.as_retriever(search_kwargs={"k": 3})

In [13]:
def retrieve(state):
  retrieved_docs = retriever.invoke(state["question"])
  return {"context" : retrieved_docs}

In [14]:
def generate(state):
  docs_content = "\n\n".join(doc.page_content for doc in state["context"])
  messages = rag_prompt.format_messages(question=state["question"], context=docs_content)
  response = llm.invoke(messages)
  return {"response" : response.content}

In [15]:
class State(TypedDict):
  question: str
  context: List[Document]
  response: str

In [16]:
baseline_graph_builder = StateGraph(State).add_sequence([retrieve, generate])
baseline_graph_builder.add_edge(START, "retrieve")
baseline_graph = baseline_graph_builder.compile()

### Second Graph - Reranker

In [17]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=30)
split_documents = text_splitter.split_documents(retriever_docs)
len(split_documents)

275

In [18]:
rerank_client = QdrantClient(":memory:")

rerank_client.create_collection(
    collection_name="use_case_data_new_chunks",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

rerank_vector_store = QdrantVectorStore(
    client=rerank_client,
    collection_name="use_case_data_new_chunks",
    embedding=embeddings,
)

In [19]:
_ = rerank_vector_store.add_documents(documents=split_documents)

baseline_retriever = rerank_vector_store.as_retriever(search_kwargs={"k": 20})

In [20]:
def retrieve_reranked(state):
  compressor = CohereRerank(model="rerank-v3.5")
  compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=baseline_retriever, search_kwargs={"k": 5}
  )
  retrieved_docs = compression_retriever.invoke(state["question"])
  return {"context" : retrieved_docs}

In [21]:
class AdjustedState(TypedDict):
  question: str
  context: List[Document]
  response: str

rerank_graph_builder = StateGraph(AdjustedState).add_sequence([retrieve_reranked, generate])
rerank_graph_builder.add_edge(START, "retrieve_reranked")
rerank_graph = rerank_graph_builder.compile()

## RAGAS

In [22]:
custom_run_config = RunConfig(timeout=360)

### Raw Evaluation Data

In [23]:
for test_row in baseline_dataset:
  response = baseline_graph.invoke({"question" : test_row.eval_sample.user_input})
  test_row.eval_sample.response = response["response"]
  test_row.eval_sample.retrieved_contexts = [context.page_content for context in response["context"]]

In [24]:
baseline_evaluation_dataset = EvaluationDataset.from_pandas(baseline_dataset.to_pandas())

In [25]:
baseline_evaluation_dataset.to_csv("baseline_evaluation_dataset.csv")

In [26]:
for test_row in rerank_dataset:
  response = rerank_graph.invoke({"question" : test_row.eval_sample.user_input})
  test_row.eval_sample.response = response["response"]
  test_row.eval_sample.retrieved_contexts = [context.page_content for context in response["context"]]
  time.sleep(2) # To try to avoid rate limiting.

In [27]:
rerank_evaluation_dataset = EvaluationDataset.from_pandas(rerank_dataset.to_pandas())

In [28]:
rerank_evaluation_dataset.to_csv("rerank_evaluation_dataset.csv")

### Evaluation Results

In [29]:
baseline_result = evaluate(
    dataset=baseline_evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    run_config=custom_run_config
)

Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]

Exception raised in Job[2]: AttributeError('StringIO' object has no attribute 'statements')
Exception raised in Job[32]: AttributeError('StringIO' object has no attribute 'statements')


In [30]:
rerank_evaluation_result = evaluate(
    dataset=rerank_evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    run_config=custom_run_config
)

Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]

## Compare Evaluation Results

In [31]:
baseline_result

{'context_recall': 0.6633, 'faithfulness': 0.9171, 'factual_correctness': 0.5763, 'answer_relevancy': 0.8547, 'context_entity_recall': 0.2941, 'noise_sensitivity_relevant': 0.3456}

In [32]:
rerank_evaluation_result

{'context_recall': 0.6842, 'faithfulness': 0.9341, 'factual_correctness': 0.6270, 'answer_relevancy': 0.7693, 'context_entity_recall': 0.4160, 'noise_sensitivity_relevant': 0.2441}

## Python Library Versions - from uv.lock

```
requires-dist = [
    { name = "claude-agent-sdk", specifier = ">=0.1.0" },
    { name = "cohere", specifier = ">=5.12.0,<5.13.0" },
    { name = "ipykernel", specifier = ">=6.30.1" },
    { name = "jupyter", specifier = ">=1.1.1" },
    { name = "langchain", specifier = ">=0.3.14" },
    { name = "langchain-cohere", specifier = "==0.4.4" },
    { name = "langchain-community", specifier = ">=0.3.29" },
    { name = "langchain-openai", specifier = ">=0.3.33" },
    { name = "langchain-qdrant", specifier = ">=0.2.1" },
    { name = "langgraph", specifier = "==0.6.7" },
    { name = "pymupdf", specifier = ">=1.24.0" },
    { name = "pyppeteer", specifier = ">=2.0.0" },
    { name = "qdrant-client", specifier = ">=1.7.0" },
    { name = "ragas", specifier = "==0.2.10" },
    { name = "rapidfuzz", specifier = ">=3.0.0" },
]
```