## Finetune embedding

In [1]:
import nest_asyncio

nest_asyncio.apply()

#### Dependencies

In [2]:
!pip install -qU langchain_openai langchain_huggingface langchain_core langchain langchain_community langchain-text-splitters

In [3]:
!pip install -qU faiss-cpu python-pptx==1.0.2 nltk==3.9.1 pymupdf beautifulsoup4 lxml

In [4]:
!pip install pypdf 



In [5]:
!pip install wandb 'accelerate>=0.26.0'



In [6]:
!pip install --upgrade torch transformers



In [7]:
import os
import getpass

os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter Your OpenAI API Key: ")

In [8]:
from langchain_community.document_loaders import DirectoryLoader, BSHTMLLoader, PyPDFLoader

path = "data/"

# Load HTML files
html_loader = DirectoryLoader(path, glob="*.html", loader_cls=BSHTMLLoader)
html_docs = html_loader.load()

# Load PDF files
pdf_loader = DirectoryLoader(path, glob="*.pdf", loader_cls=PyPDFLoader)
pdf_docs = pdf_loader.load()

# Combine both document lists
docs = html_docs + pdf_docs

print(f"Loaded {len(docs)} documents")

Loaded 13 documents


In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 850,
    chunk_overlap  = 50,
    length_function = len
)

In [10]:
training_documents = text_splitter.split_documents(docs)
len(training_documents)

73

In [11]:
import uuid

id_set = set()

for document in training_documents:
  id = str(uuid.uuid4())
  while id in id_set:
    id = uuid.uuid4()
  id_set.add(id)
  document.metadata["id"] = id

In [12]:
training_split_documents = training_documents[:len(training_documents) - 24]
val_split_documents = training_documents[len(training_documents) - 24:73-12]
test_split_documents = training_documents[73-12:]

In [13]:
from langchain_openai import ChatOpenAI

qa_chat_model = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

In [14]:
from langchain_core.prompts import ChatPromptTemplate

qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)

In [15]:
question_generation_chain = qa_prompt_template | qa_chat_model

In [16]:
import asyncio
import uuid
from tqdm import tqdm

async def process_document(document, n_questions):
    questions_generated = await question_generation_chain.ainvoke({"context": document.page_content, "n_questions": n_questions})

    doc_questions = {}
    doc_relevant_docs = {}

    for question in questions_generated.content.split("\n"):
        question_id = str(uuid.uuid4())
        doc_questions[question_id] = "".join(question.split(".")[1:]).strip()
        doc_relevant_docs[question_id] = [document.metadata["id"]]

    return doc_questions, doc_relevant_docs


async def create_questions(documents, n_questions):
    tasks = [process_document(doc, n_questions) for doc in documents]

    questions = {}
    relevant_docs = {}

    for task in tqdm(asyncio.as_completed(tasks), total=len(documents), desc="Processing documents"):
        doc_questions, doc_relevant_docs = await task
        questions.update(doc_questions)
        relevant_docs.update(doc_relevant_docs)

    return questions, relevant_docs

In [17]:
training_questions, training_relevant_contexts = await create_questions(training_split_documents, 2)
val_questions, val_relevant_contexts = await create_questions(val_split_documents, 2)
test_questions, test_relevant_contexts = await create_questions(test_split_documents, 2)

Processing documents: 100%|██████████| 49/49 [00:03<00:00, 13.85it/s]
Processing documents: 100%|██████████| 12/12 [00:02<00:00,  5.09it/s]
Processing documents: 100%|██████████| 12/12 [00:08<00:00,  1.49it/s]


In [18]:
import json

training_corpus = {train_item.metadata["id"] : train_item.page_content for train_item in training_split_documents}

train_dataset = {
    "questions" : training_questions,
    "relevant_contexts" : training_relevant_contexts,
    "corpus" : training_corpus
}

with open("training_dataset.jsonl", "w") as f:
  json.dump(train_dataset, f)

In [19]:
val_corpus = {val_item.metadata["id"] : val_item.page_content for val_item in val_split_documents}

val_dataset = {
    "questions" : val_questions,
    "relevant_contexts" : val_relevant_contexts,
    "corpus" : val_corpus
}

with open("val_dataset.jsonl", "w") as f:
  json.dump(val_dataset, f)

In [20]:
train_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_split_documents}

test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : train_corpus
}

with open("test_dataset.jsonl", "w") as f:
  json.dump(test_dataset, f)

In [21]:
!pip install -qU sentence_transformers datasets pyarrow

In [22]:
from sentence_transformers import SentenceTransformer

model_id = "Snowflake/snowflake-arctic-embed-l"
model = SentenceTransformer(model_id)

In [23]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample

In [24]:
BATCH_SIZE = 10

In [25]:
corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

examples = []
for query_id, query in queries.items():
    doc_id = relevant_docs[query_id][0]
    text = corpus[doc_id]
    example = InputExample(texts=[query, text])
    examples.append(example)

In [26]:
loader = DataLoader(
    examples, batch_size=BATCH_SIZE
)

In [27]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

matryoshka_dimensions = [768, 512, 256, 128, 64]
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [28]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

corpus = val_dataset['corpus']
queries = val_dataset['questions']
relevant_docs = val_dataset['relevant_contexts']

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

In [29]:
EPOCHS = 10

In [30]:
import wandb
wandb.init(mode="disabled")

In [31]:
import accelerate
warmup_steps = int(len(loader) * EPOCHS * 0.1)

model.fit(
    train_objectives=[(loader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='finetuned_caregiver_ft',
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50
)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100
10,No log,No log,0.875,0.958333,1.0,1.0,0.875,0.319444,0.2,0.1,0.875,0.958333,1.0,1.0,0.938241,0.918056,0.918056
20,No log,No log,0.875,1.0,1.0,1.0,0.875,0.333333,0.2,0.1,0.875,1.0,1.0,1.0,0.953866,0.9375,0.9375
30,No log,No log,0.875,1.0,1.0,1.0,0.875,0.333333,0.2,0.1,0.875,1.0,1.0,1.0,0.948411,0.930556,0.930556
40,No log,No log,0.875,1.0,1.0,1.0,0.875,0.333333,0.2,0.1,0.875,1.0,1.0,1.0,0.948411,0.930556,0.930556
50,No log,No log,0.916667,1.0,1.0,1.0,0.916667,0.333333,0.2,0.1,0.916667,1.0,1.0,1.0,0.963789,0.951389,0.951389
60,No log,No log,0.916667,1.0,1.0,1.0,0.916667,0.333333,0.2,0.1,0.916667,1.0,1.0,1.0,0.963789,0.951389,0.951389
70,No log,No log,0.916667,1.0,1.0,1.0,0.916667,0.333333,0.2,0.1,0.916667,1.0,1.0,1.0,0.963789,0.951389,0.951389
80,No log,No log,0.916667,1.0,1.0,1.0,0.916667,0.333333,0.2,0.1,0.916667,1.0,1.0,1.0,0.963789,0.951389,0.951389
90,No log,No log,0.916667,1.0,1.0,1.0,0.916667,0.333333,0.2,0.1,0.916667,1.0,1.0,1.0,0.963789,0.951389,0.951389
100,No log,No log,0.916667,1.0,1.0,1.0,0.916667,0.333333,0.2,0.1,0.916667,1.0,1.0,1.0,0.963789,0.951389,0.951389


In [32]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [33]:
hf_username = "ernestobs7"

In [34]:
model.push_to_hub(f"{hf_username}/caregiver-ft-v1")

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

'https://huggingface.co/ernestobs7/caregiver-ft-v1/commit/af0231a63af7eef35447a9bf7030362093090d10'

In [35]:
import pandas as pd

from langchain_community.vectorstores import FAISS
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document

In [36]:
def evaluate_openai(
    dataset,
    embed_model,
    top_k=5,
    verbose=True,
):
  corpus = dataset['corpus']
  questions = dataset['questions']
  relevant_docs = dataset['relevant_contexts']
  documents = [Document(page_content=content, metadata={"id": doc_id}) for doc_id, content in corpus.items()]
  vectorstore = FAISS.from_documents(documents, embed_model)

  retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})

  eval_results = []
  for id, question in tqdm(questions.items()):
    retrieved_nodes = retriever.invoke(question)
    retrieved_ids = [node.metadata["id"] for node in retrieved_nodes]
    expected_id = relevant_docs[id][0]
    is_hit = expected_id in retrieved_ids
    eval_results.append({"id": id, "question": question, "expected_id": expected_id, "is_hit": is_hit})

  return eval_results

In [37]:
te3_openai = OpenAIEmbeddings(model="text-embedding-3-small")
te3_results = evaluate_openai(test_dataset, te3_openai)

100%|██████████| 24/24 [00:11<00:00,  2.06it/s]


In [38]:
te3_results_df = pd.DataFrame(te3_results)

In [39]:
te3_hit_rate = te3_results_df["is_hit"].mean()
te3_hit_rate

1.0

In [40]:
from langchain_huggingface import HuggingFaceEmbeddings

huggingface_embeddings = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-l")
arctic_embed_m_results = evaluate_openai(test_dataset, huggingface_embeddings)

100%|██████████| 24/24 [00:01<00:00, 12.85it/s]


In [41]:
arctic_embed_m_results_df = pd.DataFrame(arctic_embed_m_results)

In [42]:
arctic_embed_m_hit_rate = arctic_embed_m_results_df["is_hit"].mean()
arctic_embed_m_hit_rate

0.9583333333333334

In [43]:
finetune_embeddings = HuggingFaceEmbeddings(model_name="finetuned_caregiver_ft")
finetune_results = evaluate_openai(test_dataset, finetune_embeddings)

Some weights of BertModel were not initialized from the model checkpoint at finetuned_caregiver_ft and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 24/24 [00:01<00:00, 12.07it/s]


In [44]:
finetune_results_df = pd.DataFrame(finetune_results)

In [45]:
finetune_hit_rate = finetune_results_df["is_hit"].mean()
finetune_hit_rate

1.0

In [46]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 850,
    chunk_overlap  = 50,
    length_function = len
)

In [47]:
training_documents = text_splitter.split_documents(docs)

In [48]:
from langchain_community.vectorstores import FAISS

finetune_vectorstore = FAISS.from_documents(training_documents, finetune_embeddings)
finetune_retriever = finetune_vectorstore.as_retriever(search_kwargs={"k": 6})

In [49]:
from langchain_community.document_loaders import DirectoryLoader, BSHTMLLoader, PyPDFLoader

path = "data/"

# Load HTML files
html_loader = DirectoryLoader(path, glob="*.html", loader_cls=BSHTMLLoader)
html_docs = html_loader.load()

# Load PDF files
pdf_loader = DirectoryLoader(path, glob="*.pdf", loader_cls=PyPDFLoader)
pdf_docs = pdf_loader.load()

# Combine both document lists
docs = html_docs + pdf_docs

In [50]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from ragas import EvaluationDataset
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity

generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

from ragas.testset import TestsetGenerator

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(docs, testset_size=10)
dataset.to_pandas()

Applying HeadlinesExtractor:   0%|          | 0/5 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/13 [00:00<?, ?it/s]

unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node


Applying SummaryExtractor:   0%|          | 0/9 [00:00<?, ?it/s]

Property 'summary' already exists in node 'd74e55'. Skipping!
Property 'summary' already exists in node 'cab783'. Skipping!
Property 'summary' already exists in node '93a86a'. Skipping!
Property 'summary' already exists in node '0d586c'. Skipping!


Applying CustomNodeFilter:   0%|          | 0/5 [00:00<?, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/17 [00:00<?, ?it/s]

Property 'summary_embedding' already exists in node '93a86a'. Skipping!
Property 'summary_embedding' already exists in node 'cab783'. Skipping!
Property 'summary_embedding' already exists in node '0d586c'. Skipping!
Property 'summary_embedding' already exists in node 'd74e55'. Skipping!


Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/12 [00:00<?, ?it/s]

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What role does the National Institute of Neuro...,[Amyotrophic Lateral Sclerosis (ALS) | Nationa...,The National Institute of Neurological Disorde...,single_hop_specifc_query_synthesizer
1,What role does the U.S. Food and Drug Administ...,[How is amyotrophic lateral sclerosis (ALS) di...,The U.S. Food and Drug Administration (FDA) ap...,single_hop_specifc_query_synthesizer
2,Are Whites more likely to develop ALS?,[Who is more likely to get amyotrophic lateral...,"Yes, Whites and non-Hispanics are most likely ...",single_hop_specifc_query_synthesizer
3,What role does the Agency for Toxic Substances...,[What are the latest updates on amyotrophic la...,The Agency for Toxic Substances and Disease Re...,single_hop_specifc_query_synthesizer
4,How is the diagnosis of Amyotrophic Lateral Sc...,[<1-hop>\n\nAmyotrophic Lateral Sclerosis (ALS...,The diagnosis of Amyotrophic Lateral Sclerosis...,multi_hop_abstract_query_synthesizer
5,What are the diagnostic methods for Amyotrophi...,[<1-hop>\n\nAmyotrophic Lateral Sclerosis (ALS...,Diagnosing Amyotrophic Lateral Sclerosis (ALS)...,multi_hop_abstract_query_synthesizer
6,What are the current diagnostic and treatment ...,[<1-hop>\n\nAmyotrophic Lateral Sclerosis (ALS...,Amyotrophic Lateral Sclerosis (ALS) is diagnos...,multi_hop_abstract_query_synthesizer
7,How is the diagnosis of Amyotrophic Lateral Sc...,[<1-hop>\n\nAmyotrophic Lateral Sclerosis (ALS...,The diagnosis of Amyotrophic Lateral Sclerosis...,multi_hop_abstract_query_synthesizer
8,What are the risk factors for developing amyot...,[<1-hop>\n\nWho is more likely to get amyotrop...,Risk factors for developing amyotrophic latera...,multi_hop_specific_query_synthesizer
9,How is amyotrophic lateral sclerosis (ALS) dia...,[<1-hop>\n\nHow is amyotrophic lateral scleros...,Amyotrophic lateral sclerosis (ALS) is diagnos...,multi_hop_specific_query_synthesizer


In [51]:
# Define retriever (Use the fine-tuned retriever or baseline retriever)
retriever = finetune_retriever  # or base_rag_retriever for baseline

# Generate synthetic dataset
generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(docs, testset_size=10)

# Convert dataset to Pandas
df = dataset.to_pandas()

# Rename columns to match RAGAS expectations
df.rename(columns={"user_input": "question", "reference_contexts": "retrieved_contexts", "reference": "response"}, inplace=True)

# Ensure retrieved_contexts is populated
if "retrieved_contexts" not in df.columns or df["retrieved_contexts"].isnull().all():
    df["retrieved_contexts"] = df["question"].apply(lambda q: [doc.page_content for doc in retriever.invoke(q)])

# Ensure required columns exist
required_columns = {"question", "response", "retrieved_contexts"}
if not required_columns.issubset(df.columns):
    raise ValueError(f"Dataset is missing required columns: {required_columns - set(df.columns)}")

# Convert to EvaluationDataset
evaluation_dataset = EvaluationDataset.from_pandas(df)

# Print sample to verify
print(df.head())

Applying HeadlinesExtractor:   0%|          | 0/5 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/13 [00:00<?, ?it/s]

unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node


Applying SummaryExtractor:   0%|          | 0/9 [00:00<?, ?it/s]

Property 'summary' already exists in node '32cf22'. Skipping!
Property 'summary' already exists in node '4dec39'. Skipping!
Property 'summary' already exists in node '6676fa'. Skipping!
Property 'summary' already exists in node '67bd49'. Skipping!


Applying CustomNodeFilter:   0%|          | 0/5 [00:00<?, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/17 [00:00<?, ?it/s]

Property 'summary_embedding' already exists in node '32cf22'. Skipping!
Property 'summary_embedding' already exists in node '6676fa'. Skipping!
Property 'summary_embedding' already exists in node '4dec39'. Skipping!
Property 'summary_embedding' already exists in node '67bd49'. Skipping!


Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/12 [00:00<?, ?it/s]

                                            question  \
0  What National Institute of Neurological Disord...   
1  How is amyotrophic lateral sclerosis diagnosed...   
2  What is NIV and how does it help individuals w...   
3  What are the current efforts and initiatives i...   
4  How does the progression of dementia in FTD-AL...   

                                  retrieved_contexts  \
0  [Amyotrophic Lateral Sclerosis (ALS) | Nationa...   
1  [How is amyotrophic lateral sclerosis (ALS) di...   
2  [Who is more likely to get amyotrophic lateral...   
3  [What are the latest updates on amyotrophic la...   
4  [<1-hop>\n\nAmyotrophic Lateral Sclerosis (ALS...   

                                            response  \
0  The National Institute of Neurological Disorde...   
1  Diagnosing ALS involves a comprehensive approa...   
2  Noninvasive ventilation (NIV) is a type of bre...   
3  In the United States, the National Institute o...   
4  In individuals with FTD-ALS, a form of deme

In [52]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas import EvaluationDataset
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity
from ragas import evaluate, RunConfig
from langchain_openai import ChatOpenAI

generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

# Restore column names to match RAGAS expectations
df["user_input"] = df["question"]  # Restore 'user_input' column
df["reference"] = df["response"]   # Restore 'reference' column

# Convert to EvaluationDataset
evaluation_dataset = EvaluationDataset.from_pandas(df)

# Define evaluation config
custom_run_config = RunConfig(timeout=360)

# Run RAGAS evaluation
result = evaluate(
    dataset=evaluation_dataset,
    metrics=[
        LLMContextRecall(),
        Faithfulness(),
        FactualCorrectness(),
        ResponseRelevancy(),
        ContextEntityRecall(),
        NoiseSensitivity()
    ],
    llm=generator_llm,
    run_config=custom_run_config
)

print(result)

Evaluating:   0%|          | 0/72 [00:00<?, ?it/s]

Exception raised in Job[47]: TimeoutError()


{'context_recall': 0.9729, 'faithfulness': 0.9353, 'factual_correctness': 0.9925, 'answer_relevancy': 0.9466, 'context_entity_recall': 0.3952, 'noise_sensitivity_relevant': 0.0000}
