# Finetuning on restaurant reviews
This notebook finetunes a custom model on restaurant reviews. The goal is to build an agentic RAG application which can better understand and retrieve queries about food. 


### Dependencies and Boilerplate

In [1]:
import nest_asyncio
nest_asyncio.apply()

In [2]:
import random
import os
import numpy as np
import torch
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [3]:
seed_everything(42)

In [4]:
from dotenv import load_dotenv
_ = load_dotenv()


In [5]:
# Not needed unless on colab
!pip install -qU langchain_openai langchain_huggingface langchain_core langchain langchain_community langchain-text-splitters ragas==0.2.10 rapidfuzz jq


In [6]:
!pip install -qU faiss-cpu python-pptx==1.0.2 nltk==3.9.1 pymupdf beautifulsoup4 lxml


In [7]:
!pip install jq



In [8]:

import os
from langchain_community.document_loaders import HuggingFaceDatasetLoader, JSONLoader, CSVLoader
DATA_ROOT = "../../data/10000_restaurant_reviews/restaurant_reviews.csv"
dataloader = CSVLoader(DATA_ROOT)

In [9]:
documents = dataloader.load()

In [10]:
#shuffle the documents
import numpy as np
documents = np.random.permutation(documents)

In [11]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap  = 500,
    length_function = len
)

In [12]:
documents = text_splitter.split_documents(documents)


In [13]:
len(documents)

10882

In [14]:
# Take a smaller subset of documents for training
training_documents = documents[:1000]

In [14]:
import uuid

id_set = set()

for document in training_documents:
  id = str(uuid.uuid4())
  while id in id_set:
    id = uuid.uuid4()
  id_set.add(id)
  document.metadata["id"] = id

In [14]:
# Create documentation splits
training_split_documents = training_documents[:len(training_documents) - 200]
val_split_documents = training_documents[len(training_documents) - 200:len(training_documents)-100]
test_split_documents = training_documents[len(training_documents)-100:]

## Construct Fine-tuning Dataset

In [19]:
from langchain_openai import ChatOpenAI

qa_chat_model = ChatOpenAI(
    model="gpt-4.1-mini",
    temperature=0
)

In [20]:
from langchain_core.prompts import ChatPromptTemplate

qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)

In [21]:
question_generation_chain = qa_prompt_template | qa_chat_model


In [86]:
import tqdm
import asyncio

"""
Sample Usage of TQDM:

for i in tqdm.tqdm(range(10)):
  time.sleep(1)
"""

async def create_questions(documents, n_questions):

    questions = {}
    relevant_docs = {}
    question_pattern = r"^\d+\.\s(.*)$"
    id_set = set()
    ### YOUR CODE HERE
    for i, doc in tqdm.tqdm(enumerate(documents)):
        r = question_generation_chain.invoke({'n_questions': n_questions, "context": doc.page_content})
        import re
        import uuid
        question_list = re.findall(question_pattern, r.content, re.MULTILINE)
        for question in question_list:
            id = uuid.uuid4()
            while id in id_set:
                id = uuid()
            id_set.add(id)
            questions[str(id)] = question
            if id not in relevant_docs:
                relevant_docs[str(id)] = [str(doc.metadata["id"])]
            else:
                relevant_docs[id].append(str(doc.metadata["id"]))

    return questions, relevant_docs

In [87]:
training_questions, training_relevant_contexts = await create_questions(training_split_documents, 2)


800it [15:46,  1.18s/it]


In [88]:
val_questions, val_relevant_contexts = await create_questions(val_split_documents, 2)


100it [01:41,  1.02s/it]


In [89]:
test_questions, test_relevant_contexts = await create_questions(test_split_documents, 2)


100it [01:39,  1.01it/s]


In [94]:
import json

training_corpus = {train_item.metadata["id"] : train_item.page_content for train_item in training_split_documents}

train_dataset = {
    "questions" : training_questions,
    "relevant_contexts" : training_relevant_contexts,
    "corpus" : training_corpus
}

with open("training_dataset.jsonl", "w") as f:
  json.dump(train_dataset, f)

In [95]:
val_corpus = {val_item.metadata["id"] : val_item.page_content for val_item in val_split_documents}

val_dataset = {
    "questions" : val_questions,
    "relevant_contexts" : val_relevant_contexts,
    "corpus" : val_corpus
}

with open("val_dataset.jsonl", "w") as f:
  json.dump(val_dataset, f)

In [96]:
test_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_split_documents}

test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : test_corpus
}

with open("test_dataset.jsonl", "w") as f:
  json.dump(test_dataset, f)

In [15]:
import json
with open("training_dataset.jsonl", "r") as f:
  train_dataset = json.load(f)

with open("val_dataset.jsonl", "r") as f:
  val_dataset = json.load(f)

with open("test_dataset.jsonl", "r") as f:
  test_dataset = json.load(f)

### Fine-tuning `snowflake-arctic-embed-l`

In [None]:
!pip install -qU sentence_transformers datasets pyarrow


In [99]:
from sentence_transformers import SentenceTransformer

model_id = "Snowflake/snowflake-arctic-embed-l"
model = SentenceTransformer(model_id)

In [100]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample

In [101]:
BATCH_SIZE = 10


In [102]:
corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

examples = []
for query_id, query in queries.items():
    doc_id = relevant_docs[query_id][0]
    text = corpus[doc_id]
    example = InputExample(texts=[query, text])
    examples.append(example)

In [103]:
loader = DataLoader(
    examples, batch_size=BATCH_SIZE
)

In [104]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

matryoshka_dimensions = [768, 512, 256, 128, 64]
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [105]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

corpus = val_dataset['corpus']
queries = val_dataset['questions']
relevant_docs = val_dataset['relevant_contexts']

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

In [106]:
EPOCHS = 10


In [107]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [108]:
warmup_steps = int(len(loader) * EPOCHS * 0.1)

model.fit(
    train_objectives=[(loader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='finetuned_arctic_ft',
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100
50,No log,No log,0.865,0.935,0.95,0.98,0.865,0.311667,0.19,0.098,0.865,0.935,0.95,0.98,0.923887,0.905857,0.907032
100,No log,No log,0.885,0.935,0.955,0.985,0.885,0.311667,0.191,0.0985,0.885,0.935,0.955,0.985,0.931342,0.914635,0.915857
150,No log,No log,0.88,0.95,0.96,0.975,0.88,0.316667,0.192,0.0975,0.88,0.95,0.96,0.975,0.930701,0.916139,0.918109
160,No log,No log,0.885,0.945,0.955,0.97,0.885,0.315,0.191,0.097,0.885,0.945,0.955,0.97,0.930119,0.917042,0.919404
200,No log,No log,0.88,0.955,0.975,0.99,0.88,0.318333,0.195,0.099,0.88,0.955,0.975,0.99,0.938205,0.921298,0.922109
250,No log,No log,0.9,0.955,0.98,0.99,0.9,0.318333,0.196,0.099,0.9,0.955,0.98,0.99,0.945418,0.931,0.931871
300,No log,No log,0.9,0.975,0.985,0.99,0.9,0.325,0.197,0.099,0.9,0.975,0.985,0.99,0.950134,0.936667,0.937415
320,No log,No log,0.91,0.975,0.98,0.99,0.91,0.325,0.196,0.099,0.91,0.975,0.98,0.99,0.953234,0.941,0.941871
350,No log,No log,0.895,0.98,0.985,0.995,0.895,0.326667,0.197,0.0995,0.895,0.98,0.985,0.995,0.950081,0.935083,0.935346
400,No log,No log,0.9,0.985,0.985,0.995,0.9,0.328333,0.197,0.0995,0.9,0.985,0.985,0.995,0.955882,0.9425,0.94275


In [109]:
hf_username = "deman539"
import uuid

model.push_to_hub(f"{hf_username}/food-review-ft-snowflake-l-{uuid.uuid4()}")

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

'https://huggingface.co/deman539/food-review-ft-snowflake-l-f18eeff6-7504-48c7-af10-1d2d85ca8caa/commit/b8baf52b6a8fb5105ec17c9384eb54be2efea980'

### Evaluation

In [110]:
# Creating a new set of documents to do a quick evaluation on
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 600,
    chunk_overlap  = 100,
    length_function = len
)

training_documents = text_splitter.split_documents(documents)

In [111]:
import pandas as pd

from langchain_community.vectorstores import FAISS
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document

In [112]:
def evaluate_openai(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
  corpus = dataset['corpus']
  questions = dataset['questions']
  relevant_docs = dataset['relevant_contexts']
  documents = [Document(page_content=content, metadata={"id": doc_id}) for doc_id, content in corpus.items()]
  vectorstore = FAISS.from_documents(documents, embed_model)

  retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})

  eval_results = []
  for id, question in tqdm.tqdm(questions.items()):
    retrieved_nodes = retriever.invoke(question)
    retrieved_ids = [node.metadata["id"] for node in retrieved_nodes]
    expected_id = relevant_docs[id][0]
    is_hit = expected_id in retrieved_ids
    eval_results.append({"id": id, "question": question, "expected_id": expected_id, "is_hit": is_hit})

  return eval_results

#### OpenAI Evaluation

In [113]:
te3_openai = OpenAIEmbeddings(model="text-embedding-3-small")
te3_results = evaluate_openai(test_dataset, te3_openai)

100%|██████████| 200/200 [01:27<00:00,  2.29it/s]


In [114]:
te3_results_df = pd.DataFrame(te3_results)


In [117]:
te3_hit_rate = te3_results_df["is_hit"].mean()
print(f"Hit rate for openai embeddings: {te3_hit_rate*100:.2f}%")

Hit rate for openai embeddings: 96.50%


#### Base Snowflake-Arctic-l Evaluation

In [118]:
from langchain_huggingface import HuggingFaceEmbeddings

huggingface_embeddings = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-l")
arctic_embed_m_results = evaluate_openai(test_dataset, huggingface_embeddings)

100%|██████████| 200/200 [00:20<00:00,  9.54it/s]


In [119]:
arctic_embed_m_results_df = pd.DataFrame(arctic_embed_m_results)


In [120]:
arctic_embed_m_hit_rate = arctic_embed_m_results_df["is_hit"].mean()
print(f"Hit rate for arctic-embed-m embeddings: {arctic_embed_m_hit_rate*100:.2f}%")

Hit rate for arctic-embed-m embeddings: 78.00%


#### Finetuned Snowflake-Arctic-L Evaluation

In [121]:
finetune_embeddings = HuggingFaceEmbeddings(model_name="finetuned_arctic_ft")
finetune_results = evaluate_openai(test_dataset, finetune_embeddings)

Some weights of BertModel were not initialized from the model checkpoint at finetuned_arctic_ft and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 200/200 [00:11<00:00, 18.14it/s]


In [122]:
finetune_results_df = pd.DataFrame(finetune_results)


In [123]:
finetune_hit_rate = finetune_results_df["is_hit"].mean()
print(f"Hit rate for finetuned embeddings: {finetune_hit_rate*100:.2f}%")

Hit rate for finetuned embeddings: 97.50%


Overall, the finetuned embeddings (on this very basic test) perform better than OpenAI Embeddings

### RAGAS Evaluation

In [126]:
!pip install rapidfuzz==3.13.0 ragas==0.2.10

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting ragas==0.2.10
  Downloading ragas-0.2.10-py3-none-any.whl.metadata (8.2 kB)
Collecting appdirs (from ragas==0.2.10)
  Using cached appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting pysbd>=0.3.4 (from ragas==0.2.10)
  Using cached pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting diskcache>=5.6.3 (from ragas==0.2.10)
  Using cached diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading ragas-0.2.10-py3-none-any.whl (175 kB)
Using cached diskcache-5.6.3-py3-none-any.whl (45 kB)
Using cached pysbd-0.3.4-py3-none-any.whl (71 kB)
Using cached appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Installing collected packages: appdirs, pysbd, diskcache, ragas
Successfully installed appdirs-1.4.4 diskcache-5.6.3 pysbd-0.3.4 ragas-0.2.10


In [16]:
# Use a different subset of the documents for RAGAS evaluation
docs = documents[1000:2000]
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap  = 50,
    length_function = len
)

training_documents = text_splitter.split_documents(docs)

In [17]:
from ragas.testset import TestsetGenerator

from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings

generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-nano"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(training_documents, testset_size=100)

Applying SummaryExtractor:   0%|          | 0/476 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/1396 [00:00<?, ?it/s]

Node a97861f2-dae3-4389-9e81-ebe2cd28550b does not have a summary. Skipping filtering.
Node 5aee0ac8-33b4-4fe0-b225-38a6658714dc does not have a summary. Skipping filtering.
Node 505a42f3-e24c-4ef9-8904-071488431b9c does not have a summary. Skipping filtering.
Node 699a2fda-9744-47a3-b33f-a71015997e80 does not have a summary. Skipping filtering.
Node 2225b800-11df-4481-b297-60030ab60363 does not have a summary. Skipping filtering.
Node f8803499-b645-44a7-ab30-a8066ae59603 does not have a summary. Skipping filtering.
Node 2849d6a7-f380-4282-a677-d85b9d2eff0f does not have a summary. Skipping filtering.
Node 2cda4fbc-429f-4ff8-9d95-0c4ab8aba3db does not have a summary. Skipping filtering.
Node 2e1eb5df-2d75-413c-a10c-6c06b131c68f does not have a summary. Skipping filtering.
Node 558f8c7b-0bad-4791-a549-81eff088c399 does not have a summary. Skipping filtering.
Node a46935de-e4c0-4ba0-8b3a-3207fa3db4ad does not have a summary. Skipping filtering.
Node 2c15f2d6-fc1f-436a-9019-82b298ce4c87 d

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/3199 [00:00<?, ?it/s]

Applying OverlapScoreBuilder:   0%|          | 0/1 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/100 [00:00<?, ?it/s]

In [18]:
dataset.to_pandas()


Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,Who is Debasish Choudhury in the context of th...,[Restaurant: Kritunga Restaurant\nReviewer: De...,Debasish Choudhury is the reviewer of Kritunga...,single_hop_specifc_query_synthesizer
1,What did Akhil Ralla say about Delhi-39?,[Restaurant: Delhi-39\nReviewer: Akhil Ralla\n...,Akhil Ralla's review of Delhi-39 states that i...,single_hop_specifc_query_synthesizer
2,What are some popular nightlife spots near Gac...,[Restaurant: Diners Pavilion\nReviewer: Vinay ...,The review mentions Diners Pavilion as a good ...,single_hop_specifc_query_synthesizer
3,Who is Shaily Naik?,[Restaurant: Triptify\nReviewer: Shaily Naik\n...,Shaily Naik is a reviewer for Triptify who gav...,single_hop_specifc_query_synthesizer
4,Who is Anmol Bansal in the review?,[Restaurant: Over The Moon Brew Company\nRevie...,Anmol Bansal is the reviewer who shared their ...,single_hop_specifc_query_synthesizer
...,...,...,...,...
95,Based on the review mentioning apple desserts ...,[<1-hop>\n\nMy review is only on vegetarian fo...,"The inclusion of apple-themed items, such as a...",multi_hop_specific_query_synthesizer
96,"Based on the reviews, how does AB's - Absolute...",[<1-hop>\n\nRestaurant: AB's - Absolute Barbec...,AB's - Absolute Barbecues excels in providing ...,multi_hop_specific_query_synthesizer
97,"Ashraf and Ashraful helpin in service, chicken...",[<1-hop>\n\nchicken wings deserves a special m...,"Yes, in the reviews, Ashraf and Ashraful are m...",multi_hop_specific_query_synthesizer
98,Wht is DLF food place bad for eat?,[<1-hop>\n\nRestaurant: Marsala Food Company\n...,The review of Marsala Food Company at DLF ment...,multi_hop_specific_query_synthesizer


In [19]:
dataset.to_pandas().to_csv("ragas_golden_eval_dataset.csv", index=False)

In [41]:
# Base embedding model
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

huggingface_embeddings = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-l")
base_vectorstore = FAISS.from_documents(training_documents, huggingface_embeddings)
base_retriever = base_vectorstore.as_retriever(search_kwargs={"k": 6})

In [42]:
from langchain_core.prompts import ChatPromptTemplate

RAG_PROMPT = """\
Given a provided context and a question, you must answer the question. If you do not know the answer, you must state that you do not know.

Context:
{context}

Question:
{question}

Answer:
"""

rag_prompt_template = ChatPromptTemplate.from_template(RAG_PROMPT)

In [44]:
rag_llm =  ChatOpenAI(
    model="gpt-4.1-nano",
    temperature=0
)

In [45]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

base_rag_chain = (
    {"context": itemgetter("question") | base_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt_template | rag_llm | StrOutputParser(), "context": itemgetter("context")}
)

In [46]:
finetune_embeddings = HuggingFaceEmbeddings(model_name="deman539/food-review-ft-snowflake-l-f18eeff6-7504-48c7-af10-1d2d85ca8caa")
finetune_vectorstore = FAISS.from_documents(training_documents, finetune_embeddings)
finetune_retriever = finetune_vectorstore.as_retriever(search_kwargs={"k": 6})

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/26.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/584 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at deman539/food-review-ft-snowflake-l-f18eeff6-7504-48c7-af10-1d2d85ca8caa and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [51]:
finetune_rag_chain = (
    {"context": itemgetter("question") | finetune_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt_template | rag_llm | StrOutputParser(), "context": itemgetter("context")}
)

In [48]:
for test_row in dataset:
  response = base_rag_chain.invoke({"question" : test_row.eval_sample.user_input})
  test_row.eval_sample.response = response["response"]
  test_row.eval_sample.retrieved_contexts = [context.page_content for context in response["context"]]

In [49]:
from ragas import EvaluationDataset

evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())

In [50]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1"))

In [52]:
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity
from ragas import evaluate, RunConfig

custom_run_config = RunConfig(timeout=360)

base_result = evaluate(
    dataset=evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    run_config=custom_run_config
)
base_result

Evaluating:   0%|          | 0/600 [00:00<?, ?it/s]

Exception raised in Job[308]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Exception raised in Job[368]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Exception raised in Job[392]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')


{'context_recall': 0.2998, 'faithfulness': 0.7608, 'factual_correctness': 0.3091, 'answer_relevancy': 0.3666, 'context_entity_recall': 0.2814, 'noise_sensitivity_relevant': 0.1437}

In [53]:
for test_row in dataset:
  response = finetune_rag_chain.invoke({"question" : test_row.eval_sample.user_input})
  test_row.eval_sample.response = response["response"]
  test_row.eval_sample.retrieved_contexts = [context.page_content for context in response["context"]]

In [54]:
from ragas import EvaluationDataset

evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())

In [55]:
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity
from ragas import evaluate, RunConfig

custom_run_config = RunConfig(timeout=360)

finetune_result = evaluate(
    dataset=evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    run_config=custom_run_config
)
finetune_result

Evaluating:   0%|          | 0/600 [00:00<?, ?it/s]

Exception raised in Job[152]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Exception raised in Job[356]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Exception raised in Job[392]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Exception raised in Job[398]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Exception raised in Job[494]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')


{'context_recall': 0.6383, 'faithfulness': 0.7685, 'factual_correctness': 0.4484, 'answer_relevancy': 0.6256, 'context_entity_recall': 0.5006, 'noise_sensitivity_relevant': 0.2411}

In [61]:
comparison_df = pd.DataFrame(
    [
     {'model': 'base','context_recall': 0.2998, 'faithfulness': 0.7608, 'factual_correctness': 0.3091, 'answer_relevancy': 0.3666, 'context_entity_recall': 0.2814, 'noise_sensitivity_relevant': 0.1437},
     {'model': 'finetuned', 'context_recall': 0.6383, 'faithfulness': 0.7685, 'factual_correctness': 0.4484, 'answer_relevancy': 0.6256, 'context_entity_recall': 0.5006, 'noise_sensitivity_relevant': 0.2411},

    ]
)

In [62]:
comparison_df

Unnamed: 0,model,context_recall,faithfulness,factual_correctness,answer_relevancy,context_entity_recall,noise_sensitivity_relevant
0,base,0.2998,0.7608,0.3091,0.3666,0.2814,0.1437
1,finetuned,0.6383,0.7685,0.4484,0.6256,0.5006,0.2411


It is very clear that the finetuned model performs much better on food related query retrieval.