### Nest Asyncio

In [1]:
!pip install arxiv==2.1.3 accelerate>=1.3.0 beautifulsoup4>=4.13.3 datasets>=3.2.0 faiss-cpu>=1.10.0 ipykernel>=6.29.5 fsspec==2025.3.2

In [5]:
!pip install ipywidgets>=8.1.5 langchain>=0.3.18 langchain-community>=0.3.17 langchain-core>=0.3.34 langchain-huggingface>=0.1.2 langchain-openai>=0.3.4 langchain-text-splitters>=0.3.6

In [6]:
!pip install lxml>=5.3.1 nltk==3.9.1 pyarrow>=19.0.0 pymupdf>=1.25.3 python-pptx==1.0.2 sentence-transformers>=3.4.1 'transformers[torch]>=4.48.3' wandb>=0.19.6

In [7]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-5.5.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.5.0-py3-none-any.whl (303 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.4/303.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.5.0


In [8]:
import nest_asyncio

nest_asyncio.apply()

In [9]:
import os
import getpass

os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter Your OpenAI API Key: ")

Enter Your OpenAI API Key: ··········


In [2]:
!mkdir data

In [3]:
!curl https://arxiv.org/html/2407.11005v2/ -o data/2407.11005v2.html

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  325k  100  325k    0     0  1693k      0 --:--:-- --:--:-- --:--:-- 1702k


In [4]:
!curl https://www.datacamp.com/blog/llm-evaluation/ -o data/llm-evaluation.html

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  7007  100  7007    0     0  59074      0 --:--:-- --:--:-- --:--:-- 59381


In [10]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import BSHTMLLoader

path = "data/"
text_loader = DirectoryLoader(path, glob="*.html", loader_cls=BSHTMLLoader)

Next, we'll set up a classic naive chunking strategy as we only care that the documents get parsed into chunks that we can generate synthetic questions about.

In [11]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap  = 50,
    length_function = len
)

In [15]:
training_documents = text_splitter.split_documents(text_loader.load())

In [16]:
len(training_documents)

225

In [17]:
import uuid

id_set = set()

for document in training_documents:
  id = str(uuid.uuid4())
  while id in id_set:
    id = uuid.uuid4()
  id_set.add(id)
  document.metadata["id"] = id

In [18]:
training_split_documents = training_documents[:len(training_documents) - 28]
val_split_documents = training_documents[len(training_documents) - 28:225-14]
test_split_documents = training_documents[225-14:]

In [19]:
from langchain_openai import ChatOpenAI

qa_chat_model = ChatOpenAI(
    model="gpt-4.1-mini",
    temperature=0
)

In [20]:
from langchain_core.prompts import ChatPromptTemplate

qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)

In [21]:
question_generation_chain = qa_prompt_template | qa_chat_model

In [22]:
import tqdm
import asyncio

"""
Sample Usage of TQDM:

for i in tqdm.tqdm(range(10)):
  time.sleep(1)
"""

async def create_questions(documents, n_questions):

    questions = {}
    relevant_docs = {}

    for doc in tqdm.tqdm(documents, desc="Generating questions"):
        # Prepare the input for the chain
        input_context = doc.page_content
        doc_id = doc.metadata["id"]

        # Call the question generation chain
        response = await question_generation_chain.ainvoke({"context": input_context, "n_questions": n_questions})

        # Extract questions
        generated_questions = response.content.split("\n")
        generated_questions = [q.strip() for q in generated_questions if q.strip()]

        # Some outputs might be numbered like "1. What is ...?", so clean numbering
        cleaned_questions = []
        for q in generated_questions:
            if q[0].isdigit() and q[1] == '.':
                cleaned_questions.append(q[2:].strip())
            elif q[0].isdigit() and q[1] == ' ':
                cleaned_questions.append(q[1:].strip())
            else:
                cleaned_questions.append(q)

        # Now save each question
        for q in cleaned_questions:
            question_id = str(uuid.uuid4())
            questions[question_id] = q
            relevant_docs[question_id] = [doc_id]

    return questions, relevant_docs

In [23]:
training_questions, training_relevant_contexts = await create_questions(training_split_documents, 2)

Generating questions: 100%|██████████| 197/197 [03:58<00:00,  1.21s/it]


We'll use the function to generate training, validation, and test data.

In [24]:
val_questions, val_relevant_contexts = await create_questions(val_split_documents, 2)

Generating questions: 100%|██████████| 14/14 [00:18<00:00,  1.29s/it]


In [25]:
test_questions, test_relevant_contexts = await create_questions(test_split_documents, 2)

Generating questions: 100%|██████████| 14/14 [00:18<00:00,  1.29s/it]


In [26]:
import json

training_corpus = {train_item.metadata["id"] : train_item.page_content for train_item in training_split_documents}

train_dataset = {
    "questions" : training_questions,
    "relevant_contexts" : training_relevant_contexts,
    "corpus" : training_corpus
}

with open("training_dataset_evaluation_assistant.jsonl", "w") as f:
  json.dump(train_dataset, f)

In [27]:
val_corpus = {val_item.metadata["id"] : val_item.page_content for val_item in val_split_documents}

val_dataset = {
    "questions" : val_questions,
    "relevant_contexts" : val_relevant_contexts,
    "corpus" : val_corpus
}

with open("val_dataset_evaluation_assistant.jsonl", "w") as f:
  json.dump(val_dataset, f)

In [28]:
train_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_split_documents}

test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : train_corpus
}

with open("test_dataset_evaluation_assistant.jsonl", "w") as f:
  json.dump(test_dataset, f)

In [29]:
from sentence_transformers import SentenceTransformer

model_id = "Snowflake/snowflake-arctic-embed-l"
model = SentenceTransformer(model_id)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/85.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/107 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [30]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample

In [31]:
BATCH_SIZE = 5

Let's move our dataset into the expected format for training.

In [32]:
corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

examples = []
for query_id, query in queries.items():
    doc_id = relevant_docs[query_id][0]
    text = corpus[doc_id]
    example = InputExample(texts=[query, text])
    examples.append(example)

Now we can create a `torch` `DataLoader`!

In [33]:
loader = DataLoader(
    examples, batch_size=BATCH_SIZE
)

In [34]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

matryoshka_dimensions = [768, 512, 256, 128, 64]
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [35]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

corpus = val_dataset['corpus']
queries = val_dataset['questions']
relevant_docs = val_dataset['relevant_contexts']

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

We'll train this model for 5 epochs, though you could increase this number if we had a significant amount more data.

In [36]:
EPOCHS = 10

In [37]:
import wandb
wandb.init(mode="disabled")

> NOTE: You may not see direct improvement during the training cycles - this is absolutely expected. We will verify performance later in the notebook.

In [38]:
warmup_steps = int(len(loader) * EPOCHS * 0.1)

model.fit(
    train_objectives=[(loader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='finetuned_arctic_ft',
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50
)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100
50,No log,No log,0.892857,0.964286,1.0,1.0,0.892857,0.321429,0.2,0.1,0.892857,0.964286,1.0,1.0,0.95174,0.935714,0.935714
80,No log,No log,0.928571,0.964286,1.0,1.0,0.928571,0.321429,0.2,0.1,0.928571,0.964286,1.0,1.0,0.964921,0.953571,0.953571
100,No log,No log,0.928571,0.964286,1.0,1.0,0.928571,0.321429,0.2,0.1,0.928571,0.964286,1.0,1.0,0.964921,0.953571,0.953571
150,No log,No log,0.892857,0.964286,1.0,1.0,0.892857,0.321429,0.2,0.1,0.892857,0.964286,1.0,1.0,0.95174,0.935714,0.935714
160,No log,No log,0.892857,0.964286,1.0,1.0,0.892857,0.321429,0.2,0.1,0.892857,0.964286,1.0,1.0,0.95174,0.935714,0.935714
200,No log,No log,0.857143,0.964286,1.0,1.0,0.857143,0.321429,0.2,0.1,0.857143,0.964286,1.0,1.0,0.938559,0.917857,0.917857
240,No log,No log,0.857143,0.964286,1.0,1.0,0.857143,0.321429,0.2,0.1,0.857143,0.964286,1.0,1.0,0.938559,0.917857,0.917857
250,No log,No log,0.892857,0.964286,1.0,1.0,0.892857,0.321429,0.2,0.1,0.892857,0.964286,1.0,1.0,0.95174,0.935714,0.935714
300,No log,No log,0.857143,0.964286,1.0,1.0,0.857143,0.321429,0.2,0.1,0.857143,0.964286,1.0,1.0,0.938559,0.917857,0.917857
320,No log,No log,0.892857,0.964286,1.0,1.0,0.892857,0.321429,0.2,0.1,0.892857,0.964286,1.0,1.0,0.95174,0.935714,0.935714


In [39]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [40]:
hf_username = "chelleboyer"

In [41]:
import uuid

model.push_to_hub(f"{hf_username}/llm-evals-2-{uuid.uuid4()}")

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

'https://huggingface.co/chelleboyer/llm-evals-2-79b954ef-4798-4994-be72-a88d46b8ecca/commit/3123007efcc5948f8397e200576bb0332b7dced6'

## Task 5: Evaluating our Retriever

Now that we have fine-tuned our retriever - let's see if it's worthwhile!

We'll start with some basic imports.

In [42]:
import pandas as pd

from langchain_community.vectorstores import FAISS
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document

Now we'll define a function that will help us evaluate our retrieval process.

> NOTE: We're assuming 1 correct document in a "hit".

In [43]:
def evaluate_openai(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
  corpus = dataset['corpus']
  questions = dataset['questions']
  relevant_docs = dataset['relevant_contexts']
  documents = [Document(page_content=content, metadata={"id": doc_id}) for doc_id, content in corpus.items()]
  vectorstore = FAISS.from_documents(documents, embed_model)

  retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})

  eval_results = []
  for id, question in tqdm.tqdm(questions.items()):
    retrieved_nodes = retriever.invoke(question)
    retrieved_ids = [node.metadata["id"] for node in retrieved_nodes]
    expected_id = relevant_docs[id][0]
    is_hit = expected_id in retrieved_ids
    eval_results.append({"id": id, "question": question, "expected_id": expected_id, "is_hit": is_hit})

  return eval_results

All that's left to do is evaluate, we'll evaluate our model against:

1. OpenAI's closed source `text-embedding-3-small`
2. The base non-fine-tuned version of `Snowflake/snowflake-arctic-embed-l`.

Let's see how it stacks up!

### `text-embedding-3-small`

In [44]:
te3_openai = OpenAIEmbeddings(model="text-embedding-3-small")
te3_results = evaluate_openai(test_dataset, te3_openai)

100%|██████████| 28/28 [00:10<00:00,  2.77it/s]


In [45]:
te3_results_df = pd.DataFrame(te3_results)

In [46]:
te3_hit_rate = te3_results_df["is_hit"].mean()
te3_hit_rate

np.float64(1.0)

### `Snowflake/snowflake-arctic-embed-l` (base)

In [47]:
from langchain_huggingface import HuggingFaceEmbeddings

huggingface_embeddings = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-l")
arctic_embed_m_results = evaluate_openai(test_dataset, huggingface_embeddings)

100%|██████████| 28/28 [00:00<00:00, 45.04it/s]


In [48]:
arctic_embed_m_results_df = pd.DataFrame(arctic_embed_m_results)

In [49]:
arctic_embed_m_hit_rate = arctic_embed_m_results_df["is_hit"].mean()
arctic_embed_m_hit_rate

np.float64(0.75)

### `Snowflake/snowflake-arctic-embed-l` (fine-tuned)

In [50]:
finetune_embeddings = HuggingFaceEmbeddings(model_name="finetuned_arctic_ft")
finetune_results = evaluate_openai(test_dataset, finetune_embeddings)

Some weights of BertModel were not initialized from the model checkpoint at finetuned_arctic_ft and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 28/28 [00:00<00:00, 43.08it/s]


In [51]:
finetune_results_df = pd.DataFrame(finetune_results)

In [52]:
finetune_hit_rate = finetune_results_df["is_hit"].mean()
finetune_hit_rate

np.float64(1.0)

## Task 1: Vibe Checking the RAG Pipeline

We're going to use our RAG pipeline to vibe check on some common phrases now that we've modified it!

### Creating New Chunks

In order to try and evaluate our system more fairly, let's create new chunks that we will use to create our Vector Store.

In [53]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap  = 50,
    length_function = len
)

training_documents = text_splitter.split_documents(text_loader.load())

### Base Chain

We'll start by constructing our base chain, which will use the untrained retrieval model.

#### R - Retrieval

In [54]:
from langchain_community.vectorstores import FAISS

base_vectorstore = FAISS.from_documents(training_documents, huggingface_embeddings)
base_retriever = base_vectorstore.as_retriever(search_kwargs={"k": 6})

#### A - Augmented

In [55]:
from langchain_core.prompts import ChatPromptTemplate

RAG_PROMPT = """\
Given a provided context and a question, you must answer the question. If you do not know the answer, you must state that you do not know.

Context:
{context}

Question:
{question}

Answer:
"""

rag_prompt_template = ChatPromptTemplate.from_template(RAG_PROMPT)

#### G - Generation

In [56]:
rag_llm =  ChatOpenAI(
    model="gpt-4.1-nano",
    temperature=0
)

#### RAG - LCEL RAG Pipeline

In [57]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

base_rag_chain = (
    {"context": itemgetter("question") | base_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt_template | rag_llm | StrOutputParser(), "context": itemgetter("context")}
)

In [58]:
base_rag_chain.invoke({"question" : "What is the goal of evaluating a language model?"})["response"]

"The goal of evaluating a language model is to assess its performance in generating accurate, relevant, and faithful responses based on the provided context or input. This involves measuring how well the model can produce correct information, maintain relevance to the query, and faithfully represent the underlying data or knowledge, thereby ensuring the model's effectiveness and reliability in knowledge-intensive tasks."

In [63]:
base_rag_chain.invoke({"question" : "How does TRACe assess RAG systems?"})["response"]

'TRACe assesses RAG systems by evaluating their performance using specific metrics, such as the TRACe metrics, which are applied to each of the 32 resulting RAG systems. This evaluation involves using an LLM annotation prompt to analyze and provide granular, actionable insights into the overall performance of the RAG systems, including their ability to generate accurate and reliable outputs.'

In [64]:
base_rag_chain.invoke({"question" : "How does RAGBench contribute to the evaluation of RAG systems across different domains?"})["response"]

'RAGBench contributes to the evaluation of RAG systems across different domains by providing an explainable benchmark that assesses overall performance and offers granular, actionable insights. It enables comparison of various RAG approaches, addresses the lack of established benchmarks, and utilizes specific metrics like TRACe to evaluate system correctness and faithfulness across diverse datasets and contexts.'

In [65]:
base_rag_chain.invoke({"question" : "Why is perplexity considered a fundamental metric in evaluating LLMs, and what are its limitations?"})["response"]

'I do not know.'

### Fine-tuned Embedding Model

Now let's rebuild our RAG chain with the Fine-tuned model - the only component we need to change is our `FAISS` vectorstore!

In [66]:
finetune_vectorstore = FAISS.from_documents(training_documents, finetune_embeddings)
finetune_retriever = finetune_vectorstore.as_retriever(search_kwargs={"k": 6})

In [67]:
finetune_rag_chain = (
    {"context": itemgetter("question") | finetune_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt_template | rag_llm | StrOutputParser(), "context": itemgetter("context")}
)

In [68]:
finetune_rag_chain.invoke({"question" : "What is the goal of evaluating a language model?"})["response"]

"The goal of evaluating a language model is to assess its performance in tasks such as correctness, relevance, faithfulness, robustness, and noise rejection. This involves determining how accurately and reliably the model can generate or retrieve information, especially in knowledge-intensive tasks, and how well it can handle noisy or irrelevant context. Evaluation helps to understand the model's strengths and limitations, guiding improvements and ensuring its outputs are trustworthy and useful."

In [69]:
finetune_rag_chain.invoke({"question" : "How does TRACe assess RAG systems?"})["response"]

'TRACe assesses RAG systems by using four metrics: Utilization, Relevance, Adherence, and Completeness. These metrics provide a standardized and systematic way to evaluate different aspects of RAG system performance. Utilization, Adherence, and Completeness measure the quality of the generator, with Adherence focusing on how well the output aligns with factual information from sources. Relevance evaluates the relevance of the generated content to the query or context.'

In [70]:
finetune_rag_chain.invoke({"question" : "How does RAGBench contribute to the evaluation of RAG systems across different domains?"})["response"]

'RAGBench contributes to the evaluation of RAG systems across different domains by providing a comprehensive, large-scale dataset sourced from multiple industry-specific domains, such as user manuals and other industry corpora. It introduces a standardized evaluation framework called TRACe, which includes multiple explainable metrics—such as context relevance, context utilization, answer completeness, and adherence—that enable systematic and holistic assessment of RAG system performance across various task types and domains. This approach facilitates consistent benchmarking, actionable feedback, and the development of more effective evaluation models tailored to diverse industry applications.'

In [71]:
finetune_rag_chain.invoke({"question" : "Why is perplexity considered a fundamental metric in evaluating LLMs, and what are its limitations?"})["response"]

'I do not know.'

## Task 2: RAGAS Evaluation

It's great to have some idea of how our system is doing based on vibe-checks, but let's use RAGAS to provide more insight info. on how things are improving!

> NOTE: Please recreate *exactly* the RAGAS process we used to evaluate RAG, baselining with the default retriever, and then comparing the new retriever. The includes the Synthetic Data Generation steps.

In [72]:
!pip install -qU ragas datasets rapidfuzz

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/190.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m76.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.[0m[31m


In [73]:
#from langchain_community.document_loaders import DirectoryLoader
#from langchain_community.document_loaders import BSHTMLLoader

#path = "data/"
#text_loader = DirectoryLoader(path, glob="*.html", loader_cls=BSHTMLLoader)

docs = text_loader.load()

I need to refactor the original code in functon calls so i don't have to duplicate code here.

In [74]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from ragas.testset import TestsetGenerator
from ragas import evaluate, RunConfig
from ragas import EvaluationDataset
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity
from ragas.llms import LangchainLLMWrapper

def generate_questions(gen_model, embed_model, n_questions=10):

  generator_llm = LangchainLLMWrapper(gen_model)
  generator_embeddings = LangchainEmbeddingsWrapper(embed_model)

  generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
  dataset = generator.generate_with_langchain_docs(docs, testset_size=n_questions)

  return dataset

def run_response_chain(dataset, chain):
  from tqdm import tqdm

  for test_row in dataset:
    response = chain.invoke({"question" : test_row.eval_sample.user_input})
    test_row.eval_sample.response = response["response"]
    test_row.eval_sample.retrieved_contexts = [context.page_content for context in response["context"]]

  return dataset

def run_evaluation(dataset, llm):
  evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())
  evaluator_llm = LangchainLLMWrapper(llm)

  custom_run_config = RunConfig(timeout=360)

  result = evaluate(
    dataset=evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    run_config=custom_run_config
  )

  return result

#### Baseline Evaluation

In [75]:
try:
    base_test_dataset = generate_questions(rag_llm,huggingface_embeddings)
except Exception as exc:
    print(repr(exc.errors()[0]['type']))
    #> 'missing'


Applying HeadlinesExtractor:   0%|          | 0/1 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/2 [00:00<?, ?it/s]

ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'headlines' property not found in this node


Applying SummaryExtractor:   0%|          | 0/1 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/12 [00:00<?, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/11 [00:00<?, ?it/s]

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/10 [00:00<?, ?it/s]

In [76]:
base_eval_dataset = run_response_chain(base_test_dataset,base_rag_chain)

In [77]:
base_eval_result = run_evaluation(base_eval_dataset,rag_llm)

Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[1]: OutputParserException(Invalid json output: The answer is "I do not know."
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE )


In [78]:
import os
from getpass import getpass
os.environ["RAGAS_APP_TOKEN"] = getpass("Please enter your RAGAS Token!")

Please enter your RAGAS Token!··········


In [79]:
base_test_dataset.upload()

Testset uploaded! View at https://app.ragas.io/dashboard/alignment/testset/b0009f7a-3106-42fa-862b-4897d56d5cea


'https://app.ragas.io/dashboard/alignment/testset/b0009f7a-3106-42fa-862b-4897d56d5cea'

In [80]:
base_eval_result

{'context_recall': 0.7000, 'faithfulness': 0.8305, 'factual_correctness(mode=f1)': 0.3540, 'answer_relevancy': 0.4684, 'context_entity_recall': 0.0875, 'noise_sensitivity(mode=relevant)': 0.2366}

####Fine-tuned Evaluation

In [81]:
finetune_test_dataset = generate_questions(rag_llm,finetune_embeddings)

Applying HeadlinesExtractor:   0%|          | 0/1 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/2 [00:00<?, ?it/s]

ERROR:ragas.testset.transforms.engine:unable to apply transformation: 'headlines' property not found in this node


Applying SummaryExtractor:   0%|          | 0/1 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/12 [00:00<?, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/7 [00:00<?, ?it/s]

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/12 [00:00<?, ?it/s]

In [82]:
finetune_test_dataset.upload()

Testset uploaded! View at https://app.ragas.io/dashboard/alignment/testset/22b0b9df-1f4d-410c-bb75-92ff7a4b73df


'https://app.ragas.io/dashboard/alignment/testset/22b0b9df-1f4d-410c-bb75-92ff7a4b73df'

In [83]:
finetune_eval_dataset = run_response_chain(finetune_test_dataset,finetune_rag_chain)

In [84]:
finetune_eval_result = run_evaluation(finetune_eval_dataset,rag_llm)

Evaluating:   0%|          | 0/72 [00:00<?, ?it/s]

In [85]:
finetune_eval_result

{'context_recall': 0.9833, 'faithfulness': 0.8269, 'factual_correctness(mode=f1)': 0.4408, 'answer_relevancy': 0.6212, 'context_entity_recall': 0.1927, 'noise_sensitivity(mode=relevant)': 0.2293}

#### Interpretation

| **Metric**                         | **Base Model** | **Fine-Tuned Model** | **Change**         | **Interpretation**                                                                 |
|-----------------------------------|----------------|----------------------|--------------------|------------------------------------------------------------------------------------|
| **Context Recall**                | 0.7000         | 0.9833               | **+0.2833**        | Strong improvement — the fine-tuned model retrieves far more relevant context.     |
| **Faithfulness**                  | 0.8305         | 0.8269               | −0.0036            | Slight decrease, but nearly the same — still very faithful to the source.         |
| **Factual Correctness (F1)**      | 0.3540         | 0.4408               | **+0.0868**        | Moderate gain — more correct facts in the answers after fine-tuning.              |
| **Answer Relevancy**              | 0.4684         | 0.6212               | **+0.1528**        | Significant improvement — answers are more on-topic and aligned with queries.     |
| **Context Entity Recall**         | 0.0875         | 0.1927               | **+0.1052**        | Major improvement — better at retrieving named entities from context.             |
| **Noise Sensitivity (Relevant)**  | 0.2366         | 0.2293               | −0.0073            | Slight decrease — model is a little more robust to irrelevant input noise.        |
