# Fine-tuning Embeddings for RAG on Specific Data




### Nest Asyncio

In [None]:
import nest_asyncio

nest_asyncio.apply()

### Install Dependencies



In [None]:
!pip install -qU langchain_openai langchain_huggingface langchain_core langchain langchain_community langchain-text-splitters pypdf

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/54.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.9/54.9 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m83.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m66.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13

In [None]:
!pip install -qU faiss-cpu python-pptx==1.0.2 nltk==3.9.1 pymupdf beautifulsoup4 lxml

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.8/472.8 kB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m68.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m92.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.1/165.1 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25h

###  OpenAI API Key and RAGAS API Key

In [None]:
import os
import getpass

os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter Your OpenAI API Key: ")

Enter Your OpenAI API Key: ··········


In [None]:
os.environ["RAGAS_APP_TOKEN"] = getpass.getpass("Please enter your Ragas API key!")

Please enter your Ragas API key!··········


### Loading Data


In [None]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader

path = "data/"
text_loader = DirectoryLoader(path, glob="*.pdf", loader_cls=PyPDFLoader)

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 600,
    chunk_overlap  = 200,
    length_function = len
)

In [None]:
import re
def remove_references(doc):
    """Strip the reference section and citation noise from one loaded page.

    The page text is cut at the first occurrence of whichever reference-section
    header is found first (headers are tried in the order listed below). URLs,
    ``doi:`` identifiers and bracketed numeric citations such as ``[12]`` are
    then removed, runs of blank lines are collapsed to a single newline, and
    surrounding whitespace is trimmed.

    Args:
        doc: Object exposing a mutable ``page_content`` string attribute.

    Returns:
        The same ``doc`` object, with ``page_content`` rewritten in place.
    """
    body = doc.page_content

    # Headers are tried in this order; only the first one present is used.
    section_headers = ("References", "Bibliography", "Cited Works", "Literature Cited")
    first_header = next((header for header in section_headers if header in body), None)
    if first_header is not None:
        # Everything from the header onwards is dropped.
        body = body.partition(first_header)[0]

    # (pattern, replacement) pairs, applied in sequence.
    cleanup_rules = (
        (r"https?://\S+|doi:\S+", ""),  # links and DOI identifiers
        (r"\[\d+\]", ""),               # numeric citation markers
        (r"\n{2,}", "\n"),              # collapse blank-line runs
    )
    for pattern, replacement in cleanup_rules:
        body = re.sub(pattern, replacement, body)

    doc.page_content = body.strip()
    return doc

# Apply reference filtering
filtered_documents = [remove_references(doc) for doc in text_loader.load()]

In [None]:
training_documents = text_splitter.split_documents(filtered_documents)

In [None]:
len(training_documents)

513

In [None]:
import uuid

# Tag every chunk with a unique string ID; the question/context mappings and
# the JSON dumps further down are keyed by it.
id_set = set()

for document in training_documents:
  doc_id = str(uuid.uuid4())
  # Retry on the (extremely unlikely) collision. The retry must also be a
  # string: a raw UUID object would never match the strings stored in the set
  # and would not be JSON-serialisable when the datasets are saved.
  while doc_id in id_set:
    doc_id = str(uuid.uuid4())
  id_set.add(doc_id)
  document.metadata["id"] = doc_id

In [None]:
# Hold out the last 113 chunks: 50 for validation followed by the final 63 for
# testing. Boundaries are derived from the corpus size so the split stays
# correct if the number of chunks changes.
VAL_SIZE = 50
TEST_SIZE = 63

n_documents = len(training_documents)
assert n_documents > VAL_SIZE + TEST_SIZE, "Not enough chunks to carve out validation and test splits"

val_start = n_documents - (VAL_SIZE + TEST_SIZE)
test_start = n_documents - TEST_SIZE

training_split_documents = training_documents[:val_start]
val_split_documents = training_documents[val_start:test_start]
test_split_documents = training_documents[test_start:]

In [None]:
from langchain_openai import ChatOpenAI

qa_chat_model = ChatOpenAI(
    model="gpt-4o",
    temperature=0
)

In [None]:
from langchain_core.prompts import ChatPromptTemplate

qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)

In [None]:
question_generation_chain = qa_prompt_template | qa_chat_model

In [None]:
import asyncio
import uuid
from tqdm.asyncio import tqdm

async def process_document(document, n_questions):
    """Generate questions for a single document chunk.

    The synchronous ``question_generation_chain.invoke`` call is pushed to a
    worker thread so many chunks can be processed concurrently.

    Args:
        document: Chunk exposing ``page_content`` and ``metadata["id"]``.
        n_questions: Number of questions requested from the model.

    Returns:
        A ``(questions, relevant_docs)`` tuple of dicts keyed by a fresh
        question UUID: ``questions`` maps to the question text and
        ``relevant_docs`` maps to a one-element list holding the chunk ID.
    """
    # Run the synchronous invoke function in a separate thread
    questions_generated = await asyncio.to_thread(
        question_generation_chain.invoke,
        {"context": document.page_content, "n_questions": n_questions}
    )

    local_questions = {}
    local_relevant_docs = {}

    for line in questions_generated.content.splitlines():
        # Accept only numbered list items ("1. ..." or "1) ..."). Blank lines
        # and any preamble are skipped instead of becoming empty questions,
        # and only the numbering prefix is removed so periods inside the
        # question text (abbreviations, decimals) are preserved.
        numbered = re.match(r"\s*\d+[.)]\s*(\S.*)", line)
        if numbered is None:
            continue

        question_id = str(uuid.uuid4())
        local_questions[question_id] = numbered.group(1).strip()
        local_relevant_docs[question_id] = [document.metadata["id"]]

    return local_questions, local_relevant_docs

async def create_questions(documents, n_questions):
    """Generate questions for every document and merge the per-document results.

    One ``process_document`` coroutine is created per document; they are
    awaited as they finish, with a tqdm progress bar tracking completion.

    Args:
        documents: Iterable of chunks accepted by ``process_document``.
        n_questions: Number of questions to request per chunk.

    Returns:
        ``(questions, relevant_docs)``: the union of all per-document dicts.
    """
    pending = [process_document(doc, n_questions) for doc in documents]

    progress = tqdm(
        asyncio.as_completed(pending),
        total=len(pending),
        desc="Processing Documents",
    )

    merged_questions, merged_relevant_docs = {}, {}
    for next_finished in progress:
        doc_questions, doc_relevant_docs = await next_finished
        # Keys are fresh UUIDs per question, so merging never overwrites.
        merged_questions.update(doc_questions)
        merged_relevant_docs.update(doc_relevant_docs)

    return merged_questions, merged_relevant_docs

In [None]:
training_questions, training_relevant_contexts = await create_questions(training_split_documents, 2)

Processing Documents: 100%|██████████| 400/400 [00:41<00:00,  9.62it/s]


In [None]:
val_questions, val_relevant_contexts = await create_questions(val_split_documents, 2)

Processing Documents: 100%|██████████| 50/50 [00:06<00:00,  7.90it/s]


In [None]:
test_questions, test_relevant_contexts = await create_questions(test_split_documents, 2)

Processing Documents: 100%|██████████| 63/63 [00:08<00:00,  7.58it/s]


### Reformatting and Saving Datasets



In [None]:
import json

# Map each training chunk's ID to its text so a question can be joined back to
# the chunk it was generated from.
training_corpus = {train_item.metadata["id"] : train_item.page_content for train_item in training_split_documents}

# Bundle the three lookups: question id -> question text, question id ->
# [chunk id], and chunk id -> chunk text.
train_dataset = {
    "questions" : training_questions,
    "relevant_contexts" : training_relevant_contexts,
    "corpus" : training_corpus
}

# NOTE(review): json.dump writes the whole dict as one JSON object, so the file
# is plain JSON rather than JSON Lines despite its .jsonl extension.
with open("training_dataset.jsonl", "w") as f:
  json.dump(train_dataset, f)

In [None]:
# Map each validation chunk's ID to its text.
val_corpus = {val_item.metadata["id"] : val_item.page_content for val_item in val_split_documents}

# Same three-part layout as the training dataset: questions, question -> [chunk id]
# links, and the chunk corpus.
val_dataset = {
    "questions" : val_questions,
    "relevant_contexts" : val_relevant_contexts,
    "corpus" : val_corpus
}

# NOTE(review): written as a single JSON object, not JSON Lines, despite the
# .jsonl extension.
with open("val_dataset.jsonl", "w") as f:
  json.dump(val_dataset, f)

In [None]:
# Map each held-out test chunk's ID to its text. The variable is named
# test_corpus (not train_corpus) so it cannot be mistaken for training data.
test_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_split_documents}

# Same three-part layout as the training and validation datasets.
test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : test_corpus
}

# NOTE(review): written as a single JSON object, not JSON Lines, despite the
# .jsonl extension.
with open("test_dataset.jsonl", "w") as f:
  json.dump(test_dataset, f)

### Fine-tuning `snowflake-arctic-embed-l`


In [None]:
!pip install -qU sentence_transformers datasets pyarrow

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.12.0 requires pyarrow<19.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 19.0.1 which is incompatible.
pylibcudf-cu12 24.12.0 requires pyarrow<19.0.0a0,>=14.0.0; platform_machine 

In [None]:
from sentence_transformers import SentenceTransformer

model_id = "Snowflake/snowflake-arctic-embed-l"
model = SentenceTransformer(model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/85.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/107 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [None]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample

In [None]:
BATCH_SIZE = 10

In [None]:
# Pull the three lookups out of the training dataset.
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']
corpus = train_dataset['corpus']

# One positive (question, source chunk) pair per generated question; each
# question's first relevant chunk ID is used to look up the chunk text.
examples = [
    InputExample(texts=[question_text, corpus[relevant_docs[question_id][0]]])
    for question_id, question_text in queries.items()
]

In [None]:
# Shuffle the pairs every epoch. The examples are stored with all questions
# generated from one chunk next to each other, and MultipleNegativesRankingLoss
# uses the other passages in a batch as negatives; without shuffling, a
# question would routinely be contrasted against a passage identical to its own
# positive.
loader = DataLoader(
    examples, batch_size=BATCH_SIZE, shuffle=True
)

In [None]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

# Start the Matryoshka schedule at the model's own output width so the
# full-size embedding (the one used by the retrieval evaluations) is optimised
# too; the fixed list alone tops out at 768, below the width of larger models.
# NOTE(review): snowflake-arctic-embed-l presumably reports 1024 here - confirm.
full_dimension = model.get_sentence_embedding_dimension()
matryoshka_dimensions = [full_dimension] + [
    dim for dim in (768, 512, 256, 128, 64) if dim < full_dimension
]

# In-batch-negatives ranking loss, applied at every truncated width above.
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [None]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

corpus = val_dataset['corpus']
queries = val_dataset['questions']
relevant_docs = val_dataset['relevant_contexts']

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

In [None]:
EPOCHS = 10

In [None]:
import wandb
wandb.init(mode="disabled")

In [None]:
# Warm up over the first 10% of all training steps
# (batches per epoch x number of epochs).
warmup_steps = int(len(loader) * EPOCHS * 0.1)

# Fine-tune with the Matryoshka-wrapped loss. The validation evaluator is run
# every 50 steps and the model is saved to "finetuned_arctic_ft", the path the
# later evaluation cells load the fine-tuned embeddings from.
model.fit(
    train_objectives=[(loader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='finetuned_arctic_ft',
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50
)

Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100
50,No log,No log,0.61,0.89,0.93,0.99,0.61,0.296667,0.186,0.099,0.61,0.89,0.93,0.99,0.813776,0.755635,0.756404
81,No log,No log,0.61,0.87,0.94,1.0,0.61,0.29,0.188,0.1,0.61,0.87,0.94,1.0,0.815845,0.755718,0.755718
100,No log,No log,0.59,0.87,0.92,0.98,0.59,0.29,0.184,0.098,0.59,0.87,0.92,0.98,0.793242,0.732512,0.73419
150,No log,No log,0.59,0.86,0.91,0.99,0.59,0.286667,0.182,0.099,0.59,0.86,0.91,0.99,0.7989,0.737036,0.737661
162,No log,No log,0.56,0.87,0.9,0.98,0.56,0.29,0.18,0.098,0.56,0.87,0.9,0.98,0.786564,0.723151,0.724365
200,No log,No log,0.59,0.87,0.92,0.99,0.59,0.29,0.184,0.099,0.59,0.87,0.92,0.99,0.801244,0.739659,0.740185
243,No log,No log,0.55,0.84,0.91,0.99,0.55,0.28,0.182,0.099,0.55,0.84,0.91,0.99,0.780325,0.71204,0.712595
250,No log,No log,0.6,0.83,0.91,0.99,0.6,0.276667,0.182,0.099,0.6,0.83,0.91,0.99,0.796896,0.734651,0.735239
300,No log,No log,0.61,0.84,0.93,0.99,0.61,0.28,0.186,0.099,0.61,0.84,0.93,0.99,0.803043,0.743056,0.743825
324,No log,No log,0.62,0.84,0.91,0.96,0.62,0.28,0.182,0.096,0.62,0.84,0.91,0.96,0.799341,0.746845,0.750088


In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
hf_username = "Gonalb"

In [None]:
model.push_to_hub(f"{hf_username}/flucold-ft-v2")

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

'https://huggingface.co/Gonalb/flucold-ft-v2/commit/f876541f85b0937365c46cd59a9073daae51b8b6'

## Evaluating the Retriever


In [None]:
import pandas as pd

from langchain_community.vectorstores import FAISS
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document

In [None]:
def evaluate_openai(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
  """Compute per-question retrieval hits for an embedding model.

  Every corpus entry is indexed in a FAISS vector store built with
  ``embed_model``. Each question is then used as a query, and counts as a hit
  when the ID of its first relevant chunk appears among the ``top_k`` results.
  Despite the name, the function is also called with non-OpenAI embeddings in
  this notebook.

  Args:
      dataset: Dict with ``corpus`` (chunk id -> text), ``questions``
          (question id -> text) and ``relevant_contexts`` (question id ->
          list of chunk ids).
      embed_model: Embeddings object passed to ``FAISS.from_documents``.
      top_k: Number of chunks retrieved per question.
      verbose: Accepted for interface compatibility; currently unused.

  Returns:
      A list with one dict per question containing ``id``, ``question``,
      ``expected_id`` and the boolean ``is_hit``.
  """
  chunk_texts = dataset['corpus']
  question_texts = dataset['questions']
  expected_by_question = dataset['relevant_contexts']

  indexed_docs = []
  for doc_id, content in chunk_texts.items():
    indexed_docs.append(Document(page_content=content, metadata={"id": doc_id}))

  store = FAISS.from_documents(indexed_docs, embed_model)
  retriever = store.as_retriever(search_kwargs={"k": top_k})

  def score_question(question_id, question):
    # Only the first relevant chunk is treated as the expected answer.
    found_ids = [hit.metadata["id"] for hit in retriever.invoke(question)]
    expected_id = expected_by_question[question_id][0]
    return {
        "id": question_id,
        "question": question,
        "expected_id": expected_id,
        "is_hit": expected_id in found_ids,
    }

  return [
      score_question(question_id, question)
      for question_id, question in tqdm(question_texts.items())
  ]

### `text-embedding-3-small`

In [None]:
te3_openai = OpenAIEmbeddings(model="text-embedding-3-small")
te3_results = evaluate_openai(test_dataset, te3_openai)

100%|██████████| 126/126 [00:57<00:00,  2.17it/s]


In [None]:
te3_results_df = pd.DataFrame(te3_results)

In [None]:
te3_hit_rate = te3_results_df["is_hit"].mean()
te3_hit_rate

0.8968253968253969

### `Snowflake/snowflake-arctic-embed-l` (base)

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

huggingface_embeddings = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-l")
arctic_embed_m_results = evaluate_openai(test_dataset, huggingface_embeddings)

100%|██████████| 126/126 [00:02<00:00, 45.57it/s]


In [None]:
arctic_embed_m_results_df = pd.DataFrame(arctic_embed_m_results)

In [None]:
arctic_embed_m_hit_rate = arctic_embed_m_results_df["is_hit"].mean()
arctic_embed_m_hit_rate

0.6031746031746031

### `Snowflake/snowflake-arctic-embed-l` (fine-tuned)

In [None]:
finetune_embeddings = HuggingFaceEmbeddings(model_name="finetuned_arctic_ft")
finetune_results = evaluate_openai(test_dataset, finetune_embeddings)

Some weights of BertModel were not initialized from the model checkpoint at finetuned_arctic_ft and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 126/126 [00:02<00:00, 44.19it/s]


In [None]:
finetune_results_df = pd.DataFrame(finetune_results)

In [None]:
finetune_hit_rate = finetune_results_df["is_hit"].mean()
finetune_hit_rate

0.9920634920634921

### RAGAS Evaluation


In [None]:
!pip install ragas rapidfuzz -q

In [None]:
# Same chunking parameters as the splitter used for the fine-tuning data, so
# the RAG index is chunked identically.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 600,
    chunk_overlap  = 200,
    length_function = len
)

# The PDFs are loaded, reference-filtered and split in the following cells.
# Splitting the unfiltered load here as well only parsed the files a second
# time to produce a `training_documents` that was overwritten before any use.

In [None]:
def remove_references(doc):
    """Strip the reference section and citation noise from one loaded page.

    This redefinition must clean text exactly like the definition used to
    build the fine-tuning and test chunks; otherwise the RAG index is built
    from differently preprocessed text than the data it is evaluated against.

    The page text is cut at the first occurrence of whichever reference header
    is found first (headers are tried in list order). URLs, ``doi:``
    identifiers and bracketed numeric citations are then removed, and runs of
    blank lines are collapsed.

    Args:
        doc: Object exposing a mutable ``page_content`` string attribute.

    Returns:
        The same ``doc`` object, with ``page_content`` rewritten in place.
    """
    text = doc.page_content

    # Common headers for reference sections
    reference_markers = ["References", "Bibliography", "Cited Works", "Literature Cited"]

    for marker in reference_markers:
        if marker in text:
            text = text.split(marker)[0]  # Keep only the content before references
            break  # Stop checking after the first match

    text = re.sub(r"https?://\S+|doi:\S+", "", text)  # links and DOI identifiers
    text = re.sub(r"\[\d+\]", "", text)               # numeric citation markers
    text = re.sub(r"\n{2,}", "\n", text)              # collapse blank-line runs

    doc.page_content = text.strip()  # Update document content
    return doc

# Apply reference filtering
filtered_documents = [remove_references(doc) for doc in text_loader.load()]

In [None]:
training_documents = text_splitter.split_documents(filtered_documents)

In [None]:
from langchain_community.vectorstores import FAISS

base_vectorstore = FAISS.from_documents(training_documents, huggingface_embeddings)
base_retriever = base_vectorstore.as_retriever(search_kwargs={"k": 6})

In [None]:
from langchain_core.prompts import ChatPromptTemplate

RAG_PROMPT = """\
Given a provided context and a question, you must answer the question. If you do not know the answer, you must state that you do not know.

Context:
{context}

Question:
{question}

Answer:
"""

rag_prompt_template = ChatPromptTemplate.from_template(RAG_PROMPT)

In [None]:
rag_llm =  ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

In [None]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

In [None]:
from ragas.testset import TestsetGenerator
import copy
generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(test_split_documents, testset_size=20)

Applying SummaryExtractor:   0%|          | 0/60 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/63 [00:00<?, ?it/s]



Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/186 [00:00<?, ?it/s]

Applying OverlapScoreBuilder:   0%|          | 0/1 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
base_flue = copy.deepcopy(dataset)

In [None]:
finetune_flue = copy.deepcopy(dataset)

### Base chain

In [None]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

# RAG chain backed by the base-embedding retriever. Given {"question": ...} it
# returns {"response": <answer string>, "context": <retrieved documents>}.
base_rag_chain = (
    # "context" is the retriever's result for the question; "question" passes through.
    {"context": itemgetter("question") | base_retriever, "question": itemgetter("question")}
    # NOTE(review): re-assigns "context" to its own value - appears to be a no-op; confirm before removing.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Produce the answer while also forwarding the retrieved documents for evaluation.
    | {"response": rag_prompt_template | rag_llm | StrOutputParser(), "context": itemgetter("context")}
)

In [None]:
for test_row in base_flue:
  response = base_rag_chain.invoke({"question" : test_row.eval_sample.user_input})
  test_row.eval_sample.response = response["response"]
  test_row.eval_sample.retrieved_contexts = [context.page_content for context in response["context"]]

In [None]:
base_flue.to_pandas()

Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,synthesizer_name
0,What is the effect of Ipratropium on rhinorrhe...,[dren. Cochrane Database Syst Rev . 2018; (4):...,[286 American Family Physician www.aafp.org/af...,I do not know.,Intranasal ipratropium may decrease rhinorrhea...,single_hop_specifc_query_synthesizer
1,What is the effectiveness of P. sidoides in tr...,[for coughing children and their parents. Pedi...,"[The use of antihistamines, either alone or in...",I do not know.,P. sidoides may help with symptoms of acute br...,single_hop_specifc_query_synthesizer
2,What Cochrane say about treatments for cold sy...,[dren. Cochrane Database Syst Rev . 2018; (4):...,"[bronchitis in children, but it has not been s...",The Cochrane Database of Systematic Reviews in...,A Cochrane review of 6 low- to moderate-qualit...,single_hop_specifc_query_synthesizer
3,What is the recommended usage of intranasal ip...,[dren. Cochrane Database Syst Rev . 2018; (4):...,[placebo41\nAnalgesics 0 to 18 years Acetamino...,I do not know.,"For children aged 5 to 11 years, intranasal ip...",single_hop_specifc_query_synthesizer
4,What RCTs say about nasal treatments for kids?,[dren. Cochrane Database Syst Rev . 2018; (4):...,[6 to 11 years 5 mL Once\n12 to 18 years 10 mL...,The provided context does not contain specific...,"A Cochrane review of 7 RCTs with 2,144 partici...",single_hop_specifc_query_synthesizer
5,Does Vitamin C help in reducing the duration o...,[no treatment (P < .05 for all)64\n6 to 11 yea...,[no treatment (P < .05 for all)64\n6 to 11 yea...,The context does not provide specific informat...,A Cochrane review of 29 trials found that Vita...,single_hop_specifc_query_synthesizer
6,What does the Cochrane review say about the ef...,"[a double-blind, placebo-controlled survey. Ad...","[September 1, 2019 ◆ Volume 100, Number 5 www....",I do not know.,A Cochrane review that previously reported ben...,single_hop_specifc_query_synthesizer
7,What is the effectiveness of oral corticostero...,[causes harm.40\nIntranasal and Oral Corticost...,[Bronchodilators. In a randomized controlled t...,Oral corticosteroids have not been studied for...,Intranasal and oral corticosteroids do not red...,single_hop_specifc_query_synthesizer
8,What Fashner say about treatin' common cold in...,[Syst Rev. 2015;(9): CD001726.\n 74. Taylor J...,[causes harm.40\nIntranasal and Oral Corticost...,"Fashner, Ericson, and Werner discuss the treat...",The article updates previous articles on this ...,single_hop_specifc_query_synthesizer
9,What information can be found in the TRIP data...,[controlled trials. Altern Ther Health Med . 2...,[port the use of vitamin D 18 \nor echinacea 4...,I do not know.,The TRIP database is one of the data sources u...,single_hop_specifc_query_synthesizer


In [None]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas import EvaluationDataset

base_evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

base_evaluation_dataset = EvaluationDataset.from_pandas(base_flue.to_pandas())

In [None]:
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity
from ragas import evaluate, RunConfig

custom_run_config = RunConfig(timeout=360)

base_result = evaluate(
    dataset=base_evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=base_evaluator_llm,
    run_config=custom_run_config
)
base_result

Evaluating:   0%|          | 0/120 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[2]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
ERROR:ragas.executor:Exception raised in Job[8]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
ERROR:ragas.executor:Exception raised in Job[20]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
ERROR:ragas.executor:Exception raised in Job[38]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
ERROR:ragas.executor:Exception raised in Job[56]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be 

{'context_recall': 0.3000, 'faithfulness': 0.4256, 'factual_correctness': 0.4373, 'answer_relevancy': 0.2882, 'context_entity_recall': 0.1229, 'noise_sensitivity_relevant': 0.1275}

### Finetuned chain

In [None]:
finetune_vectorstore = FAISS.from_documents(training_documents, finetune_embeddings)
finetune_retriever = finetune_vectorstore.as_retriever(search_kwargs={"k": 6})

In [None]:
# RAG chain backed by the fine-tuned-embedding retriever. Given
# {"question": ...} it returns {"response": <answer string>, "context": <retrieved documents>}.
finetune_rag_chain = (
    # "context" is the retriever's result for the question; "question" passes through.
    {"context": itemgetter("question") | finetune_retriever, "question": itemgetter("question")}
    # NOTE(review): re-assigns "context" to its own value - appears to be a no-op; confirm before removing.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Produce the answer while also forwarding the retrieved documents for evaluation.
    | {"response": rag_prompt_template | rag_llm | StrOutputParser(), "context": itemgetter("context")}
)

In [None]:
for test_row in finetune_flue:
  response = finetune_rag_chain.invoke({"question" : test_row.eval_sample.user_input})
  test_row.eval_sample.response = response["response"]
  test_row.eval_sample.retrieved_contexts = [context.page_content for context in response["context"]]

In [None]:
finet_evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

finet_evaluation_dataset = EvaluationDataset.from_pandas(finetune_flue.to_pandas())

In [None]:
finet_result = evaluate(
    dataset=finet_evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=finet_evaluator_llm,
    run_config=custom_run_config
)
finet_result

Evaluating:   0%|          | 0/120 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[56]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')


{'context_recall': 0.8958, 'faithfulness': 0.7651, 'factual_correctness': 0.5389, 'answer_relevancy': 0.7093, 'context_entity_recall': 0.3161, 'noise_sensitivity_relevant': 0.1983}