In [19]:
# !pip install nest_asyncio \
#     langchain_openai langchain_huggingface langchain_core langchain langchain_community langchain-text-splitters \
#     python-pptx==1.0.2 nltk==3.9.1 pymupdf lxml \
#     sentence-transformers IProgress \
#     huggingface_hub ipywidgets \
#     qdrant-client langchain_experimental

# !pip install sentence_transformers datasets pyarrow
# !pip install torch
# !pip install accelerate>=0.26.0
# !pip install transformers
# !pip install wandb
# !pip install ragas



In [1]:

import nest_asyncio

nest_asyncio.apply()

In [3]:
import os
import getpass

os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter Your OpenAI API Key: ")
os.environ["RAGAS_APP_TOKEN"] = getpass.getpass("Please enter your Ragas API key!")

In [4]:
hf_username = getpass.getpass("Enter Your Hugging Face Username: ")


In [5]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Token has not been saved to git credential helper.


In [6]:
from huggingface_hub import whoami
print(whoami())


{'type': 'user', 'id': '67624d1b57e77fe6e0c87ae5', 'name': 'drewgenai', 'fullname': 'Drew DeMarco', 'email': 'drewgenai@gmail.com', 'emailVerified': True, 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/no-auth/L6eLaZmCK4jqW3ZTLYIAR.png', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'newotken', 'role': 'write', 'createdAt': '2025-02-12T04:11:04.130Z'}}}


In [7]:
!mkdir example_files
!mkdir output

mkdir: cannot create directory ‘example_files’: File exists
mkdir: cannot create directory ‘output’: File exists


In [8]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyMuPDFLoader

path = "example_files/"
text_loader = DirectoryLoader(path, glob="*.pdf", loader_cls=PyMuPDFLoader)

1️⃣ Header-Based Chunking (Title-Based Splitter)
Uses document structure to split on headings, section titles, or patterns.
Works well for structured documents with named assessments, numbered lists, or headers.
Example: If it detects Chronic Pain Adjustment Index (CPAI-10), it groups everything under that title.
2️⃣ Semantic Chunking (Text-Meaning Splitter)
Uses embeddings or sentence similarity to decide where to break chunks.
Prevents splitting mid-context if sentences are closely related.
Example: Groups all related pain-assessment questions into one chunk.

###testingbelow


In [78]:


from langchain_experimental.text_splitter import SemanticChunker

from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings

from langchain.embeddings import HuggingFaceEmbeddings
model_id = "Snowflake/snowflake-arctic-embed-m"
embedding_model = HuggingFaceEmbeddings(model_name=model_id)

semantic_splitter = SemanticChunker(embedding_model)

all_documents = text_loader.load()
documents_with_metadata = []



In [10]:
from langchain.schema import Document

for doc in all_documents:
    source_name = doc.metadata.get("source", "unknown")  # Get document source

    # Use SemanticChunker to intelligently split text
    chunks = semantic_splitter.split_text(doc.page_content)

    # Convert chunks into LangChain Document format with metadata
    for chunk in chunks:
        doc_chunk = Document(page_content=chunk, metadata={"source": source_name})
        documents_with_metadata.append(doc_chunk)

##########################new testing below

In [75]:
#training_documents = text_loader.load()
### keeping documents_with_metadata and training_documents separate for now


from langchain.schema import Document

training_documents = []


for doc in all_documents:
    source_name = doc.metadata.get("source", "unknown")  # Get document source

    # Use SemanticChunker to intelligently split text
    chunks = semantic_splitter.split_text(doc.page_content)

    # Convert chunks into LangChain Document format with metadata
    for chunk in chunks:
        doc_chunk = Document(page_content=chunk, metadata={"source": source_name})
        training_documents.append(doc_chunk)





In [76]:
import uuid

id_set = set()

for document in training_documents:
  id = str(uuid.uuid4())
  while id in id_set:
    id = uuid.uuid4()
  id_set.add(id)
  document.metadata["id"] = id

In [77]:
# Define split percentages
train_ratio = 0.7  # 70% training
val_ratio = 0.2    # 20% validation
test_ratio = 0.1   # 10% test

# Calculate index breakpoints
total_docs = len(training_documents)
train_size = int(total_docs * train_ratio)
val_size = int(total_docs * val_ratio)

# Perform the splits
training_split_documents = training_documents[:train_size]
val_split_documents = training_documents[train_size:train_size + val_size]
test_split_documents = training_documents[train_size + val_size:]

# Print sizes to verify
print(f"Training set: {len(training_split_documents)} docs")
print(f"Validation set: {len(val_split_documents)} docs")
print(f"Test set: {len(test_split_documents)} docs")




Training set: 9 docs
Validation set: 2 docs
Test set: 3 docs


In [44]:
from langchain_openai import ChatOpenAI

qa_chat_model = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

In [45]:
from langchain_core.prompts import ChatPromptTemplate

qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)

In [46]:
question_generation_chain = qa_prompt_template | qa_chat_model

In [47]:
import asyncio
import uuid
from tqdm import tqdm

async def process_document(document, n_questions):
    questions_generated = await question_generation_chain.ainvoke({"context": document.page_content, "n_questions": n_questions})

    doc_questions = {}
    doc_relevant_docs = {}

    for question in questions_generated.content.split("\n"):
        question_id = str(uuid.uuid4())
        doc_questions[question_id] = "".join(question.split(".")[1:]).strip()
        doc_relevant_docs[question_id] = [document.metadata["id"]]

    return doc_questions, doc_relevant_docs

async def create_questions(documents, n_questions):
    tasks = [process_document(doc, n_questions) for doc in documents]

    questions = {}
    relevant_docs = {}

    for task in tqdm(asyncio.as_completed(tasks), total=len(documents), desc="Processing documents"):
        doc_questions, doc_relevant_docs = await task
        questions.update(doc_questions)
        relevant_docs.update(doc_relevant_docs)

    return questions, relevant_docs

In [48]:
training_questions, training_relevant_contexts = await create_questions(training_split_documents, 2)
val_questions, val_relevant_contexts = await create_questions(val_split_documents, 2)
test_questions, test_relevant_contexts = await create_questions(test_split_documents, 2)

Processing documents: 100%|██████████| 9/9 [00:02<00:00,  4.44it/s]
Processing documents: 100%|██████████| 2/2 [00:01<00:00,  1.74it/s]
Processing documents: 100%|██████████| 3/3 [00:02<00:00,  1.50it/s]


In [49]:
import json

training_corpus = {train_item.metadata["id"] : train_item.page_content for train_item in training_split_documents}

train_dataset = {
    "questions" : training_questions,
    "relevant_contexts" : training_relevant_contexts,
    "corpus" : training_corpus
}

with open("training_dataset.jsonl", "w") as f:
  json.dump(train_dataset, f)


val_corpus = {val_item.metadata["id"] : val_item.page_content for val_item in val_split_documents}

val_dataset = {
    "questions" : val_questions,
    "relevant_contexts" : val_relevant_contexts,
    "corpus" : val_corpus
}

with open("val_dataset.jsonl", "w") as f:
  json.dump(val_dataset, f)


train_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_split_documents}

test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : train_corpus
}

with open("test_dataset.jsonl", "w") as f:
  json.dump(test_dataset, f)

In [50]:
# !pip install -qU sentence_transformers datasets pyarrow

In [51]:
from sentence_transformers import SentenceTransformer

model_id = "Snowflake/snowflake-arctic-embed-m"
model = SentenceTransformer(model_id)

In [52]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample

In [53]:
BATCH_SIZE = 10

In [54]:
corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

examples = []
for query_id, query in queries.items():
    doc_id = relevant_docs[query_id][0]
    text = corpus[doc_id]
    example = InputExample(texts=[query, text])
    examples.append(example)

In [55]:
loader = DataLoader(
    examples, batch_size=BATCH_SIZE
)

In [56]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

matryoshka_dimensions = [768, 512, 256, 128, 64]
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [57]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

corpus = val_dataset['corpus']
queries = val_dataset['questions']
relevant_docs = val_dataset['relevant_contexts']

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

In [58]:
EPOCHS = 5

In [59]:
#!pip install wandb

import wandb
wandb.init(mode="disabled")

In [69]:
#commented out for now as want to run whole notebook but not retrain
# warmup_steps = int(len(loader) * EPOCHS * 0.1)

# model.fit(
#     train_objectives=[(loader, train_loss)],
#     epochs=EPOCHS,
#     warmup_steps=warmup_steps,
#     output_path='models/midterm-compare-arctic-embed-m-ft',
#     show_progress_bar=True,
#     evaluator=evaluator,
#     evaluation_steps=50
# )

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100
2,No log,No log,0.75,1.0,1.0,1.0,0.75,0.333333,0.2,0.1,0.75,1.0,1.0,1.0,0.907732,0.875,0.875
4,No log,No log,0.75,1.0,1.0,1.0,0.75,0.333333,0.2,0.1,0.75,1.0,1.0,1.0,0.907732,0.875,0.875
6,No log,No log,0.75,1.0,1.0,1.0,0.75,0.333333,0.2,0.1,0.75,1.0,1.0,1.0,0.907732,0.875,0.875
8,No log,No log,0.75,1.0,1.0,1.0,0.75,0.333333,0.2,0.1,0.75,1.0,1.0,1.0,0.907732,0.875,0.875
10,No log,No log,0.75,1.0,1.0,1.0,0.75,0.333333,0.2,0.1,0.75,1.0,1.0,1.0,0.907732,0.875,0.875


In [61]:
#commented out for now as want to run whole notebook but not sending to hub
#model.push_to_hub(f"{hf_username}/midterm-compare-arctic-embed-m-ft")

In [62]:
finetune_embeddings = HuggingFaceEmbeddings(model_name=f"{hf_username}/midterm-compare-arctic-embed-m-ft")

Some weights of BertModel were not initialized from the model checkpoint at drewgenai/midterm-compare-arctic-embed-m-ft and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


###testingabove

In [93]:
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import Qdrant
from langchain.embeddings import HuggingFaceEmbeddings


# Load the SentenceTransformer model
#model_id = "Snowflake/snowflake-arctic-embed-m"
model_id = f"{hf_username}/midterm-compare-arctic-embed-m-ft"  
embedding_model = HuggingFaceEmbeddings(model_name=model_id)
# model_id = "Snowflake/snowflake-arctic-embed-m"
# embedding_model = HuggingFaceEmbeddings(model_name=model_id)
# model_id = "Snowflake/snowflake-arctic-embed-m-v2.0"
# embedding_model = HuggingFaceEmbeddings(model_name=model_id, model_kwargs={"trust_remote_code": True})


# Load documents into Qdrant
qdrant_vectorstore = Qdrant.from_documents(
    documents_with_metadata,
    embedding_model,
    location=":memory:",  # In-memory for testing
    collection_name="document_comparison",
)

# Create a retriever
qdrant_retriever = qdrant_vectorstore.as_retriever()

Some weights of BertModel were not initialized from the model checkpoint at drewgenai/midterm-compare-arctic-embed-m-ft and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


IndexError: list index out of range

In [64]:
from langchain_core.prompts import ChatPromptTemplate
RAG_PROMPT = """
CONTEXT:
{context}

QUERY:
{question}

You are a helpful assistant. Use the available context to answer the question.

Return the response in **valid JSON format** with the following structure:

[
    {{
        "Derived Description": "A short name for the matched concept",
        "Protocol_1_Name": "Protocol 1 - Matching Element",
        "Protocol_2_Name": "Protocol 2 - Matching Element"
    }},
    ...
]

### Rules:
1. Only output **valid JSON** with no explanations, summaries, or markdown formatting.
2. Ensure each entry in the JSON list represents a single matched data element from the two protocols.
3. If no matching element is found in a protocol, leave it empty ("").
4. **Do NOT include headers, explanations, or additional formatting**—only return the raw JSON list.
5. It should include all the elements in the two protocols.
6. If it cannot match the element, create the row and include the protocol it did find and put "could not match" in the other protocol column.
"""

rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

from langchain_openai import ChatOpenAI

#openai_chat_model = ChatOpenAI(model="gpt-4o")
openai_chat_model = ChatOpenAI(model="gpt-4o-mini")

from operator import itemgetter
from langchain.schema.output_parser import StrOutputParser

rag_chain = (
    {"context": itemgetter("question") | qdrant_retriever, "question": itemgetter("question")}
    | rag_prompt | openai_chat_model | StrOutputParser()
)

In [65]:
question_text = """Between these two files containing protocols, can you find the data elements in each that most likely match the element in the other and output a CSV that lists three columns:

The questions within elements will be similar between the two documents and can be used to match the elements.

1. Derived description from the two documents describing the index/measure/scale.
2. A column for each standard.
3. In the column for each name/version, the data element used to capture that description.

There should only be one row for each scale/index/etc.
The description should not be one of the questions but a name that best describes the similar data elements."""

response_text = rag_chain.invoke({"question": question_text})
# response = rag_chain.invoke({"question": question_text})

In [66]:
import json
import pandas as pd

def parse_rag_output(response_text):
    """Extract structured JSON data from the RAG response."""
    try:
        structured_data = json.loads(response_text)

        # Ensure similarity score is always included
        for item in structured_data:
            item.setdefault("Similarity Score", "N/A")  # Default if missing

        return structured_data
    except json.JSONDecodeError:
        print("Error: Response is not valid JSON.")
        return None

def save_to_csv(data, directory="./output", filename="matching_data_elements.csv"):
    """Save structured data to CSV."""
    if not data:
        print("No data to save.")
        return

    file_path = os.path.join(directory, filename)
    df = pd.DataFrame(data, columns=["Derived Description", "Protocol_1_Name", "Protocol_2_Name"])  # Ensure correct columns
    df.to_csv(file_path, index=False)
    print(f"✅ CSV file saved: {filename}")

# Run the pipeline
structured_output = parse_rag_output(response_text)
save_to_csv(structured_output)


✅ CSV file saved: matching_data_elements.csv


In [67]:
# rag_chain.invoke({"question" : "Based on the types of questions asked under each heading.  can you identify the headings in one document that most closely match the second document.  list them e.g   paincoping/doc1  painstrategy/doc2"})

In [68]:
# rag_chain.invoke({"question" : "Based on the types of questions asked under each heading.  can you identify the headings in one document that most closely match the second document.  list them e.g   paincoping/doc1  painstrategy/doc2. these are example headings not the ones in the actual documents.  just list the matches not the rational.  Can you list multiple matches?"})

In [96]:
### ragas testing below
#docs = documents_with_metadata
docs = text_loader.load()

In [91]:
from langchain_core.prompts import ChatPromptTemplate

RAG_PROMPT = """\
Given a provided context and a question, you must answer the question. If you do not know the answer, you must state that you do not know.

Context:
{context}

Question:
{question}

Answer:
"""

rag_prompt_template = ChatPromptTemplate.from_template(RAG_PROMPT)

In [92]:
rag_llm =  ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

In [113]:
base_model_id = f"Snowflake/snowflake-arctic-embed-m"  
base_embedding_model = HuggingFaceEmbeddings(model_name=base_model_id)

finetune_model_id = f"{hf_username}/midterm-compare-arctic-embed-m-ft"  
finetune_embedding_model = HuggingFaceEmbeddings(model_name=finetune_model_id)

openai_model_id = "text-embedding-3-small"
openai_embedding_model = OpenAIEmbeddings(model=openai_model_id)


Some weights of BertModel were not initialized from the model checkpoint at drewgenai/midterm-compare-arctic-embed-m-ft and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [114]:
#from langchain_community.vectorstores import FAISS

### try qdrant?

qdrant_vectorstore_base = Qdrant.from_documents(
    docs,
    base_embedding_model,
    location=":memory:",  # In-memory for testing
    collection_name="document_comparison",
)


base_retriever = qdrant_vectorstore_base.as_retriever(search_kwargs={"k": 6})

qdrant_vectorstore_finetune = Qdrant.from_documents(
    docs,
    finetune_embedding_model,
    location=":memory:",  # In-memory for testing
    collection_name="document_comparison",
)


finetune_retriever = qdrant_vectorstore_finetune.as_retriever(search_kwargs={"k": 6})



qdrant_vectorstore_openai = Qdrant.from_documents(
    docs,
    openai_embedding_model,
    location=":memory:",  # In-memory for testing
    collection_name="document_comparison",
)


openai_retriever = qdrant_vectorstore_openai.as_retriever(search_kwargs={"k": 6})


In [None]:

# # Create a retriever
# qdrant_retriever = qdrant_vectorstore.as_retriever()





# ###

# base_vectorstore = FAISS.from_documents(training_documents, base_embedding_model)
# base_retriever = base_vectorstore.as_retriever(search_kwargs={"k": 6})

In [100]:
from langchain.schema.runnable import RunnablePassthrough

base_rag_chain = (
    {"context": itemgetter("question") | base_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt_template | rag_llm | StrOutputParser(), "context": itemgetter("context")}
)

In [102]:
finetune_rag_chain = (
    {"context": itemgetter("question") | finetune_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt_template | rag_llm | StrOutputParser(), "context": itemgetter("context")}
)

In [115]:
from langchain.schema.runnable import RunnablePassthrough

openai_rag_chain = (
    {"context": itemgetter("question") | openai_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt_template | rag_llm | StrOutputParser(), "context": itemgetter("context")}
)

In [103]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

In [104]:
from ragas.testset import TestsetGenerator

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(docs, testset_size=10)

Applying SummaryExtractor:   0%|          | 0/6 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/7 [00:00<?, ?it/s]

Node 77fa3fd5-0ec7-4864-8a9f-fb6df33f64ec does not have a summary. Skipping filtering.


Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/20 [00:00<?, ?it/s]

Applying OverlapScoreBuilder:   0%|          | 0/1 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/10 [00:00<?, ?it/s]

In [105]:
dataset.to_pandas()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,How does the Pain Coping Strategy Scale (PCSS-...,[Linked Psychological & Physical Assessment\nP...,The Pain Coping Strategy Scale (PCSS-9) measur...,single_hop_specifc_query_synthesizer
1,Cud yu pleese explane wut the Pain Coping Stra...,[Linked Psychological & Physical Assessment\nP...,The Pain Coping Strategy Scale (PCSS-9) measur...,single_hop_specifc_query_synthesizer
2,Wht is the ERI-9 and how does it relate to emo...,[Financial Stress Index (FSI-6)\nThe FSI-6 eva...,The Emotional Regulation Index (ERI-9) is ment...,single_hop_specifc_query_synthesizer
3,what cognitive load management scale do,[Financial Stress Index (FSI-6)\nThe FSI-6 eva...,The Cognitive Load Management Scale (CLMS-7) m...,single_hop_specifc_query_synthesizer
4,What does the MRI-6 assessment evaluate?,[The ERI-9 assesses an individual's ability to...,The MRI-6 evaluates short-term and long-term m...,single_hop_specifc_query_synthesizer
5,what scm-6 do for social confidence and public...,[The ERI-9 assesses an individual's ability to...,The SCM-6 evaluates levels of confidence in so...,single_hop_specifc_query_synthesizer
6,What does the RDMT-6 assess in terms of cognit...,[Linked Psychological & Physical Assessment\nC...,The RDMT-6 evaluates logical reasoning and dec...,single_hop_specifc_query_synthesizer
7,What does the CPAI-10 assess in individuals wi...,[Linked Psychological & Physical Assessment\nC...,The CPAI-10 evaluates the strategies people us...,single_hop_specifc_query_synthesizer
8,What does the CWT-7 assessment measure in term...,[I feel confident when making important decisi...,The CWT-7 evaluates an individual's ability to...,single_hop_specifc_query_synthesizer
9,What does the Stamina and Endurance Index (SEI...,[I feel confident when making important decisi...,The Stamina and Endurance Index (SEI-8) measur...,single_hop_specifc_query_synthesizer


Eval with base model

In [106]:
for test_row in dataset:
  response = base_rag_chain.invoke({"question" : test_row.eval_sample.user_input})
  test_row.eval_sample.response = response["response"]
  test_row.eval_sample.retrieved_contexts = [context.page_content for context in response["context"]]

In [107]:
from ragas.llms import LangchainLLMWrapper

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

In [108]:
from ragas import EvaluationDataset

evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())

In [109]:
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity
from ragas import evaluate, RunConfig

custom_run_config = RunConfig(timeout=360)

result = evaluate(
    dataset=evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    run_config=custom_run_config
)
result

Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]

Exception raised in Job[13]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28698, Requested 2725. Please try again in 2.846s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Exception raised in Job[22]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29211, Requested 2254. Please try again in 2.93s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Exception raised in Job[19]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29563, Requested

{'context_recall': 1.0000, 'faithfulness': 1.0000, 'factual_correctness': 0.7540, 'answer_relevancy': 0.9481, 'context_entity_recall': 0.8095, 'noise_sensitivity_relevant': 0.1973}

Evaluate the Fine tuned.


In [110]:
for test_row in dataset:
  response = finetune_rag_chain.invoke({"question" : test_row.eval_sample.user_input})
  test_row.eval_sample.response = response["response"]
  test_row.eval_sample.retrieved_contexts = [context.page_content for context in response["context"]]

In [111]:
evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())

In [112]:
result = evaluate(
    dataset=evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    run_config=custom_run_config
)
result

Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]

Exception raised in Job[22]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28950, Requested 2254. Please try again in 2.408s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Exception raised in Job[16]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28949, Requested 2254. Please try again in 2.406s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Exception raised in Job[19]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28567, Requeste

{'context_recall': 1.0000, 'faithfulness': 0.8500, 'factual_correctness': 0.7220, 'answer_relevancy': 0.9481, 'context_entity_recall': 0.7917, 'noise_sensitivity_relevant': 0.1111}

Evaluate the openai model

In [116]:
for test_row in dataset:
  response = openai_rag_chain.invoke({"question" : test_row.eval_sample.user_input})
  test_row.eval_sample.response = response["response"]
  test_row.eval_sample.retrieved_contexts = [context.page_content for context in response["context"]]

In [117]:
evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())

In [118]:
result = evaluate(
    dataset=evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    run_config=custom_run_config
)
result

Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]

Exception raised in Job[30]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28587, Requested 2574. Please try again in 2.322s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Exception raised in Job[25]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29460, Requested 2782. Please try again in 4.484s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Exception raised in Job[1]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29365, Requested

{'context_recall': 1.0000, 'faithfulness': 1.0000, 'factual_correctness': 0.7540, 'answer_relevancy': 0.9463, 'context_entity_recall': 0.8095, 'noise_sensitivity_relevant': 0.3095}


Base model evaluation
{'context_recall': 1.0000, 'faithfulness': 1.0000, 'factual_correctness': 0.7540, 'answer_relevancy': 0.9481, 'context_entity_recall': 0.8095, 'noise_sensitivity_relevant': 0.1973}

Finetuned model
{'context_recall': 1.0000, 'faithfulness': 0.8500, 'factual_correctness': 0.7220, 'answer_relevancy': 0.9481, 'context_entity_recall': 0.7917, 'noise_sensitivity_relevant': 0.1111}


Openai model
{'context_recall': 1.0000, 'faithfulness': 1.0000, 'factual_correctness': 0.7540, 'answer_relevancy': 0.9463, 'context_entity_recall': 0.8095, 'noise_sensitivity_relevant': 0.3095}



Base snowflake model and OpenAI are very similar with the openai model performing slightly better for noise sensitivity.
The finetuned snowflak model perform does not perform better in most case though it reduces noise sensitivity.