<a target="_blank" href="https://colab.research.google.com/github/gox6/colab-demos/blob/main/rags/evaluate-rags-rigorously-or-perish.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

<html>
<body>
    <div style="display: flex;">
        <a href="https://towardsdatascience.com/evaluate-rags-rigorously-or-perish-54f790557357" style="width: 45em; height: 15em; background-color: white; padding-left: 1em; display: flex; flex-direction: column; justify-content: center; color: inherit; text-decoration: none;">
            <p style="margin: 0; font-size: 1.5em; color: darkblue; font-weight: bold;">This notebook contains the code for the Medium article</p>
            <p style="margin: 0; font-weight: bold; font-size: 1.5em; color: #3D4849;">Evaluate RAGs Rigorously or Perish</p>
        </a>
    </div>
</body>
</html>

# Project Setup

In [1]:
# Installing Python packages & hiding

OPENAI_MODE = False
llm_model = "mistralai/Mistral-7B-Instruct-v0.3"
llm_model = "microsoft/Phi-3.5-mini-instruct"
embeddings_model = "e5-base-v2"

!pip install --quiet \
  chromadb \
  datasets \
  langchain \
  langchain_chroma \
  optuna \
  plotly \
  polars \
  ragas \
  transformers \
  sentence-transformers \
  1> /dev/null

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.
tensorflow 2.17.1 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 5.28.3 which is incompatible.
tensorflow-metadata 1.13.1 requires protobuf<5,>=3.20.3, but you have protobuf 5.28.3 which is incompatible.[0m[31m
[0m

In [2]:
# Importing the packages
from functools import reduce
import json
import os
import requests
import warnings

if OPENAI_MODE:
  from langchain_openai import ChatOpenAI, OpenAIEmbeddings
else:
  from langchain_core.embeddings import Embeddings
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
  from sentence_transformers import SentenceTransformer

  # Local LLM abstraction
  def LocalLLM(model: str = "phi-3", gguf_file: str = None):
      """Creates a text generation pipeline for a local LLM.

      Args:
          model: Model identifier or path handed to ``from_pretrained``.
          gguf_file: Optional GGUF file name; forwarded to ``from_pretrained``
              only when it is given.

      Returns:
          A ``transformers`` "text-generation" pipeline.

      NOTE(review): the result is a raw transformers pipeline, while other
      cells pipe it into a LangChain chain and pass it to ragas. Confirm it is
      accepted there, or wrap it in a LangChain LLM adapter.
      """
      load_kwargs = {} if gguf_file is None else {"gguf_file": gguf_file}
      tokenizer = AutoTokenizer.from_pretrained(model, **load_kwargs)
      # Dedicated name: `model` stays the identifier string instead of being
      # rebound to the loaded weights.
      causal_lm = AutoModelForCausalLM.from_pretrained(model, **load_kwargs)
      return pipeline("text-generation", model=causal_lm, tokenizer=tokenizer)

  class _SentenceTransformerEmbeddings(Embeddings):
      """LangChain ``Embeddings`` adapter around a sentence-transformers model.

      Vector stores call ``embed_documents`` / ``embed_query``; a plain
      function (what ``LocalEmbeddings`` used to return) has neither.
      """

      def __init__(self, model: str):
          self._encoder = SentenceTransformer(model)

      def embed_documents(self, texts):
          """Embeds several texts; returns one list of floats per text."""
          return self._encoder.encode(list(texts)).tolist()

      def embed_query(self, text):
          """Embeds a single text; returns a list of floats."""
          return self._encoder.encode(text).tolist()

      def __call__(self, text):
          # Backward compatible with the former lambda: raw encode() output.
          return self._encoder.encode(text)

  # Local Embeddings abstraction
  def LocalEmbeddings(model: str = "all-MiniLM-L6-v2"):
      """Creates an embedding object using a sentence transformer model.

      Args:
          model: Name or path of the sentence-transformers model to load.

      Returns:
          An object that implements the LangChain embeddings interface and is
          also callable like the previously returned function
          (``embeddings(text)`` -> ``encode(text)``).
      """
      return _SentenceTransformerEmbeddings(model)

import chromadb
from chromadb.api.models.Collection import Collection as ChromaCollection
from datasets import load_dataset, Dataset
from getpass import getpass
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.runnables.base import RunnableSequence
from langchain_community.document_loaders import WebBaseLoader, PolarsDataFrameLoader
from langchain_text_splitters import CharacterTextSplitter
from operator import itemgetter
import optuna
import pandas as pd
import plotly.express as px
import polars as pl
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_correctness
)
from ragas.testset import TestsetGenerator
from ragas.testset.synthesizers import SingleHopSpecificQuerySynthesizer, MultiHopSpecificQuerySynthesizer, MultiHopAbstractQuerySynthesizer



In [3]:
# Managing secrets
# - If using Colab please use Colab Secrets
# - If running outside Colab provide OPENAI_API_KEY as an environment variable,
#   otherwise you will be prompted for it
COLAB = os.getenv("COLAB_RELEASE_TAG") is not None

if COLAB:
  # Imported whenever the notebook runs in Colab, not only in OPENAI_MODE:
  # later cells call data_table.DataTable(...) guarded by `if COLAB:` alone.
  from google.colab import userdata, data_table
  # Enabling Colab's data formatter for pandas
  data_table.enable_dataframe_formatter()

if OPENAI_MODE:
  # Secrets
  if COLAB:
    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
  else:
    # Reuse a key that is already exported; only prompt when it is missing.
    OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OPENAI_API_KEY")
  os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

runtime_info = "Colab runtime" if COLAB else "Non Colab runtime"
print(runtime_info)

Colab runtime


# Exploring Different Types of Question Evolution in RAGAs




In [4]:
# Loading an example document (the Wikipedia page on large language models) with WebBaseLoader.
# Nothing is written to a vector database in this cell.
urls = ["https://en.wikipedia.org/wiki/Large_language_model"]

wikis_loader = WebBaseLoader(urls)
wikis = wikis_loader.load()
# Displaying the first loaded document
wikis[0]

Document(metadata={'source': 'https://en.wikipedia.org/wiki/Large_language_model', 'title': 'Large language model - Wikipedia', 'language': 'en'}, page_content='\n\n\n\nLarge language model - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload file\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAppearance\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDonate\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\nDonate Create account Log in\n\n\n\n\n\n\t\tPages for logged out editors learn more\n\n\n\nContributionsTalk\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nContents\nmove to sidebar\nhide\n\n\n\n\n(Top)\n\n\n\

In [None]:
# Examining the question (query) types available in the ragas library
if OPENAI_MODE:
  llm = ChatOpenAI(model="gpt-3.5-turbo")
  embeddings = OpenAIEmbeddings()
else:
  llm = LocalLLM(model=llm_model)
  embeddings = LocalEmbeddings(model=embeddings_model)
generator_llm = llm
# Kept for readers of the article; the synthesizer-based ragas API takes a single LLM.
critic_llm = llm

# The notebook imports ragas.testset.synthesizers, i.e. it targets the ragas
# API where from_langchain takes `llm` and `embedding_model`.
example_generator = TestsetGenerator.from_langchain(llm=generator_llm, embedding_model=embeddings)

# One single-synthesizer query distribution per question type, so that each
# generation run yields exactly that type. The former evolution names
# (simple, reasoning, multi_context, conditional) were never defined in this
# notebook and have no counterpart among the imported synthesizers.
list_of_distributions = [
    [(SingleHopSpecificQuerySynthesizer(llm=example_generator.llm), 1.0)],
    [(MultiHopSpecificQuerySynthesizer(llm=example_generator.llm), 1.0)],
    [(MultiHopAbstractQuerySynthesizer(llm=example_generator.llm), 1.0)],
]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# This step COSTS $$$ ...
# Generating the example evolutions
avoid_costs = True

if not avoid_costs:
  # Running ragas to get one example question per query distribution.
  # The distribution is passed by keyword: the third positional parameter of
  # generate_with_langchain_docs in the synthesizer-based API is `transforms`.
  question_evolution_types = [
      example_generator.generate_with_langchain_docs(wikis, 1, query_distribution=distribution)
      for distribution in list_of_distributions
  ]
  question_evolution_types_pd = (
      pd.concat([testset.to_pandas() for testset in question_evolution_types], axis=0)
      # Map the synthesizer-era column names onto the ones used by the cached
      # CSV below; names that are absent are simply ignored by rename().
      .rename(columns={"synthesizer_name": "evolution_type",
                       "user_input": "question",
                       "reference": "ground_truth"})
      .loc[:, ["evolution_type", "question", "ground_truth"]]
  )
else:
  # Downloading examples for question evolutions discussed in the article:
  question_evolution_types_pd  = pl.read_csv(
    "https://gist.github.com/gox6/bfd422a6f203ba73f081b08c9bb25e66/raw/example-question-evolution-types-in-ragas.csv",
    separator=",",
).drop("index").to_pandas()


In [None]:
# Colab: interactive, paginated table; elsewhere: the plain DataFrame.
display(
    data_table.DataTable(question_evolution_types_pd, include_index=False, num_rows_per_page=5)
    if COLAB
    else question_evolution_types_pd
)

# Getting data: CNN and Daily Mail news articles


In [None]:
# Loading a small sample of articles from the CNN and Daily Mail news dataset on HF: https://huggingface.co/datasets/cnn_dailymail
# To save time, a gist with a tiny extract of the HF dataset is used by default
# - Not loaded directly via LangChain's HuggingFaceDatasetLoader class because it doesn't have a split argument
save_time = True

if save_time:
  # Fast path: tiny CSV extract, mirrored into a HF Dataset
  news_pl = pl.read_csv(
      "https://gist.github.com/gox6/ef0aabc16dab6811e9b3da1e6694a84e/raw/cnn_daily_mail_tiny_extract.csv",
      separator=",",
  )
  news_hf = Dataset(news_pl.to_arrow())
else:
  # Slow path: first 100 training articles from the Hub, plus a word_count column
  news_hf = load_dataset(path="cnn_dailymail", name='1.0.0', split='train[:100]')
  news_pl = pl.from_arrow(news_hf.data.table).with_columns(
      [pl.col("article").str.split(' ').list.len().alias("word_count")]
  )

news_pd = news_pl.to_pandas()

# LangChain documents whose page content is the article text
loader = PolarsDataFrameLoader(news_pl, page_content_column="article")
news = loader.load()


In [None]:
# Distribution of articles by word count
fig = px.histogram(news_pl, x="word_count", marginal="rug").update_layout(
    title_text="Distribution of articles by word count",  # title of plot
    xaxis_title_text='Word Count',  # xaxis label
    yaxis_title_text='# Articles',  # yaxis label
)
fig.show()

In [None]:
# Seeing news data: paginated table in Colab, first five rows elsewhere
display(
    data_table.DataTable(news_pd, include_index=False, num_rows_per_page=5)
    if COLAB
    else news_pd.head(5)
)

# Generating Synthetic Evaluation Set

In [None]:
# Building the test set generator for the synthetic evaluation set
llm = ChatOpenAI(model="gpt-3.5-turbo") if OPENAI_MODE else LocalLLM(model=llm_model)
generator_llm = llm
# Kept for readers of the article; the synthesizer-based ragas API takes a single LLM.
critic_llm = llm
embeddings = OpenAIEmbeddings(model="text-embedding-3-small") if OPENAI_MODE else LocalEmbeddings(model=embeddings_model)
# Keyword arguments on purpose: in the synthesizer-based API the third
# positional parameter of from_langchain is the knowledge graph, not embeddings.
generator = TestsetGenerator.from_langchain(
    llm=generator_llm,
    embedding_model=embeddings,
)

# Set question type distribution as (synthesizer, weight) pairs; weights sum to 1.
# The former evolution names (simple, reasoning, multi_context, conditional)
# were never defined in this notebook.
distributions = [
    (SingleHopSpecificQuerySynthesizer(llm=generator.llm), 0.5),
    (MultiHopSpecificQuerySynthesizer(llm=generator.llm), 0.25),
    (MultiHopAbstractQuerySynthesizer(llm=generator.llm), 0.25),
]

In [None]:
# This costs some real $$$
avoid_costs = True
save_time = True

if not (avoid_costs or save_time):
  # Generate evaluation set (20 questions). Documents and size are positional;
  # the distribution goes through the `query_distribution` keyword of the
  # synthesizer-based ragas API.
  synthetic_evaluation_set_pd = (
      generator.generate_with_langchain_docs(news, 20, query_distribution=distributions)
      .to_pandas()
      # Downstream cells read "question" and "ground_truth"; names that are
      # absent are simply ignored by rename().
      .rename(columns={"synthesizer_name": "evolution_type",
                       "user_input": "question",
                       "reference": "ground_truth"})
  )
  synthetic_evaluation_set_pl = pl.from_pandas(synthetic_evaluation_set_pd)
  synthetic_evaluation_set_hf = Dataset.from_pandas(synthetic_evaluation_set_pd)
else:
  # Download the pre-computed evaluation set
  synthetic_evaluation_set_url = "https://gist.github.com/gox6/0858a1ae2d6e3642aa132674650f9c76/raw/synthetic-evaluation-set-cnn-daily-mail.csv"
  synthetic_evaluation_set_pl = pl.read_csv(synthetic_evaluation_set_url, separator=",").drop("index")
  synthetic_evaluation_set_pd = synthetic_evaluation_set_pl.to_pandas()
  synthetic_evaluation_set_hf = Dataset(synthetic_evaluation_set_pl.to_arrow())



In [None]:
# Seeing the synthetic evaluation set: paginated table in Colab, first five rows elsewhere
display(
    data_table.DataTable(synthetic_evaluation_set_pd, include_index=False, num_rows_per_page=3)
    if COLAB
    else synthetic_evaluation_set_pd.head(5)
)

# Setting up a vector database: ChromaDB

In [None]:
# Setting up an ephemeral ChromaDB client
chroma_client = chromadb.EphemeralClient()

# Listing existing document collections in Chroma DB
chroma_client.list_collections()


In [None]:
# Defining a function to get a document collection from the vector db with given hyperparameters
# The function embeds the documents only if the collection is still empty
# This is a development version; for production one would rather implement a document level check


def get_vectordb_collection(chroma_client,
                            documents,
                            embedding_model="text-embedding-ada-002",
                            chunk_size=None, overlap_size=0) -> Chroma:
    """Returns a LangChain Chroma store for the given chunking configuration.

    Args:
        chroma_client: ChromaDB client that owns the collections.
        documents: LangChain documents to index.
        embedding_model: OpenAI embedding model name. It is always part of the
            collection name, but only used for embedding when OPENAI_MODE is
            True; otherwise the notebook-level ``embeddings_model`` is used.
        chunk_size: Maximum chunk length passed to the text splitter; ``None``
            indexes the documents unsplit in the "full_text" collection.
        overlap_size: Chunk overlap passed to the text splitter.

    Returns:
        The LangChain ``Chroma`` vector store bound to ``chroma_client`` and
        the configuration's collection.
    """
    if chunk_size is None:
        collection_name = "full_text"
        docs_pp = documents
    else:
        collection_name = f"{embedding_model}_chunk{chunk_size}_overlap{overlap_size}"

        text_splitter = CharacterTextSplitter(
            separator=".",
            chunk_size=chunk_size,
            chunk_overlap=overlap_size,
            length_function=len,
            is_separator_regex=False,
        )

        docs_pp = text_splitter.transform_documents(documents)

    embedding = OpenAIEmbeddings(model=embedding_model) if OPENAI_MODE else LocalEmbeddings(model=embeddings_model)

    vector_store = Chroma(client=chroma_client,
                          collection_name=collection_name,
                          embedding_function=embedding,
                          )

    # The collection is expected to exist once the store above is constructed
    # (the original code relied on this as well). Documents are added through
    # the store itself so they land in `chroma_client`'s collection; the
    # classmethod `from_documents` used before builds a separate store and was
    # called without the client.
    if docs_pp and chroma_client.get_collection(collection_name).count() == 0:
        vector_store.add_documents(list(docs_pp))
    return vector_store

# Simple RAG in LangChain

In [None]:
# Defining a function to get a simple RAG as a LangChain chain with given hyperparameters
# The RAG also returns the retrieved context documents for evaluation purposes in RAGAs

def get_chain(chroma_client,
              documents,
              embedding_model="text-embedding-ada-002",
              llm_model="gpt-3.5-turbo",
              chunk_size=None,
              overlap_size=0,
              top_k=4,
              lambda_mult=0.25) -> RunnableSequence:
    """Builds a retrieval-augmented generation chain.

    Args:
        chroma_client: ChromaDB client that owns the collections.
        documents: LangChain documents to index and retrieve from.
        embedding_model: Embedding model name forwarded to
            ``get_vectordb_collection``.
        llm_model: Model name for ``ChatOpenAI`` (OPENAI_MODE) or ``LocalLLM``.
            NOTE(review): the default is an OpenAI model name and is also what
            ``LocalLLM`` receives when OPENAI_MODE is False. Confirm callers
            pass a local model in that mode.
        chunk_size: Chunk size forwarded to ``get_vectordb_collection``.
        overlap_size: Chunk overlap forwarded to ``get_vectordb_collection``.
        top_k: Number of documents the retriever returns.
        lambda_mult: Diversity/relevance trade-off of the MMR search.

    Returns:
        A chain that maps ``{"question", "ground_truth"}`` to a dict with the
        keys ``context``, ``question``, ``ground_truth`` and ``answer``.
    """
    vectordb_collection = get_vectordb_collection(chroma_client=chroma_client,
                                                  documents=documents,
                                                  embedding_model=embedding_model,
                                                  chunk_size=chunk_size,
                                                  overlap_size=overlap_size)

    # Search parameters must go through `search_kwargs`; passed as top-level
    # keyword arguments (as before) they never reach the search call, so the
    # tuned `top_k` had no effect. `lambda_mult` is an MMR parameter, hence
    # search_type="mmr".
    retriever = vectordb_collection.as_retriever(
        search_type="mmr",
        search_kwargs={"k": top_k, "lambda_mult": lambda_mult},
    )

    template = """Answer the question based only on the following context.
    If the context doesn't contain entities present in the question say you don't know.

    {context}

    Question: {question}
    """
    prompt = ChatPromptTemplate.from_template(template)
    llm = ChatOpenAI(model=llm_model) if OPENAI_MODE else LocalLLM(model=llm_model)

    def format_docs(docs):
        """Joins the page contents of the retrieved documents into one prompt context."""
        return "\n\n".join([doc.page_content for doc in docs])

    chain_from_docs = (
      RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
      | prompt
      | llm
      | StrOutputParser()
    )

    # Question and ground truth are passed through unchanged so that every
    # output row is self-contained for the RAGAs evaluation.
    chain_with_context_and_ground_truth = RunnableParallel(
      context=itemgetter("question") | retriever,
      question=itemgetter("question"),
      ground_truth=itemgetter("ground_truth"),
    ).assign(answer=chain_from_docs)

    return chain_with_context_and_ground_truth

In [None]:
# Testing the RAG prototype (chunk_size=1000, overlap_size=200)

with warnings.catch_warnings():
  # catch_warnings() on its own only saves and restores the warning filters;
  # the filter below is what actually silences warnings while the chain is built.
  warnings.simplefilter("ignore")
  rag_prototype = get_chain(chroma_client=chroma_client, documents=news, chunk_size=1000, overlap_size=200)

rag_prototype.invoke( {'question': 'What happened in Minneapolis to the bridge?',
                       "ground_truth": "x"})["answer"]

# Evaluation of RAG

In [None]:
# We create the helper function to generate the RAG answers together with Ground Truth based on the synthetic evaluation set
# The dataset for RAGAS evaluation should contain the columns: question, answer, ground_truth, contexts
# RAGAs expects the data in Huggingface Dataset format

def generate_rag_answers_for_synthetic_questions(chain,
                                                 synthetic_evaluation_set) -> pl.DataFrame:
  """Runs the chain on every evaluation question and collects the outputs.

  Args:
    chain: Runnable invoked with ``{"question", "ground_truth"}`` whose output
      is a mapping that contains a ``context`` list of documents.
    synthetic_evaluation_set: Polars DataFrame with ``question`` and
      ``ground_truth`` columns.

  Returns:
    One row per question holding the chain output, where ``context`` is
    replaced by ``contexts`` (the page contents of the retrieved documents).
    An empty DataFrame when the evaluation set has no rows.
  """
  records = []

  for row in synthetic_evaluation_set.iter_rows(named=True):
    rag_output = chain.invoke({"question": row["question"], "ground_truth": row["ground_truth"]})
    # Same column order as before: all output keys except "context", then "contexts".
    record = {key: value for key, value in rag_output.items() if key != "context"}
    record["contexts"] = [doc.page_content for doc in rag_output["context"]]
    records.append(record)

  # Building the frame once avoids the quadratic row-by-row concat and lets
  # polars infer one schema across all rows.
  return pl.DataFrame(records) if records else pl.DataFrame()

avoid_costs = True
save_time = True

if not (avoid_costs or save_time):

  rag_prototype_answers = generate_rag_answers_for_synthetic_questions(rag_prototype, synthetic_evaluation_set_pl)

else:
  # Downloading the stored prototype answers instead of calling the LLM
  url = "https://gist.github.com/gox6/73927c9e273dc0ed48525d89bf9f36dd/raw/rag_prototype_answers_with_ground_truth.json"
  response = requests.get(url, timeout=60)
  # Fail with the HTTP error instead of a confusing JSON parsing error
  response.raise_for_status()
  rag_prototype_answers = pl.from_dicts(json.loads(response.text))


In [None]:
# The prototype answers as pandas, polars and Hugging Face Dataset objects
rag_prototype_answers_pd = rag_prototype_answers.to_pandas()
rag_prototype_answers_pl = pl.from_pandas(rag_prototype_answers_pd)
rag_prototype_answers_hf = Dataset.from_pandas(rag_prototype_answers_pd)

# Paginated table in Colab, first five rows elsewhere
display(
    data_table.DataTable(rag_prototype_answers_pd, include_index=False, num_rows_per_page=3)
    if COLAB
    else rag_prototype_answers_pd.head(5)
)

In [None]:
# Scoring the prototype answers with the answer_correctness metric.
# NOTE(review): no llm/embeddings are passed to evaluate(), so ragas presumably
# falls back to its default judge models - confirm this works when OPENAI_MODE is False.
prototype_result = evaluate(rag_prototype_answers_hf, metrics=[answer_correctness])

print(prototype_result)

# Optimising RAG using RAGAs and Optuna

In [None]:
# Train test split
# We need at least 2 sets: train and test for RAG optimization.

shuffled = synthetic_evaluation_set_pl.sample(fraction=1,
                                              shuffle=True,
                                              seed=6)
test_fraction = 0.5

test_n = round(len(synthetic_evaluation_set_pl) * test_fraction)
train_n = len(shuffled) - test_n
# Disjoint split: train is the first train_n shuffled rows, test the last
# test_n. The previous head(-test_n) / head(test_n) pair both started from the
# top of the frame, so test rows were also part of train.
train, test = (shuffled.head(train_n),
               shuffled.tail(test_n))
assert len(train) + len(test) == len(shuffled)


In [None]:
def objective(trial):
  """Optuna objective: answer_correctness of a RAG chain on the train set.

  Suggests the embedding model, chunk size, chunk overlap and top_k, builds
  the corresponding chain, answers the train questions and scores the answers.

  Args:
    trial: Optuna trial used to suggest the hyper-parameters.

  Returns:
    The answer_correctness score as a single float.
  """
  embedding_model = trial.suggest_categorical(name="embedding_model",
                                              choices=["text-embedding-ada-002", 'text-embedding-3-small'])

  chunk_size = trial.suggest_int(name="chunk_size",
                                 low=500,
                                 high=1000,
                                 step=100)

  overlap_size = trial.suggest_int(name="overlap_size",
                                   low=100,
                                   high=400,
                                   step=50)

  top_k = trial.suggest_int(name="top_k",
                            low=1,
                            high=10,
                            step=1)

  challenger_chain = get_chain(chroma_client,
                               news,
                               embedding_model=embedding_model,
                               llm_model="gpt-3.5-turbo",
                               chunk_size=chunk_size,
                               overlap_size=overlap_size,
                               top_k=top_k,
                               lambda_mult=0.25)

  challenger_answers_pl = generate_rag_answers_for_synthetic_questions(challenger_chain, train)
  challenger_answers_hf = Dataset.from_pandas(challenger_answers_pl.to_pandas())

  challenger_result = evaluate(challenger_answers_hf,
                               metrics=[answer_correctness],
                               )

  score = challenger_result['answer_correctness']
  # NOTE(review): depending on the ragas version this item appears to be either
  # the aggregate score or the list of per-question scores - confirm. Optuna
  # needs one float, so a sequence is averaged (NaN entries are skipped).
  if isinstance(score, (list, tuple)):
    score = pd.Series(score, dtype="float64").mean()
  return float(score)



In [None]:
# TPE sampler with a fixed seed, used by a study that maximizes the objective
sampler = optuna.samplers.TPESampler(seed=6)
study = optuna.create_study(study_name="RAG Optimisation",
                            direction="maximize",
                            sampler=sampler)
study.set_metric_names(['answer_correctness'])

# Hand-picked configuration queued ahead of the sampler's own suggestions;
# the keys match the parameter names suggested inside objective()
educated_guess = {"embedding_model": "text-embedding-3-small",
                  "chunk_size": 1000,
                  "overlap_size": 200,
                  "top_k": 3}


study.enqueue_trial(educated_guess)

print(f"Sampler is {study.sampler.__class__.__name__}")
# NOTE(review): timeout=180 is presumably in seconds - confirm it is long enough for more than one trial
study.optimize(objective, timeout=180)

In [None]:
# Reporting the best objective value of the study and the hyper-parameters of that trial
print("Best trial with answer_correctness:", study.best_trial.value)
print("Hyper-parameters for the best trial:", study.best_trial.params)

In [None]:
# Evaluation of the best trial parameters on the test set
challenger_chain = get_chain(chroma_client, news, **study.best_trial.params)
challenger_answers_pl = generate_rag_answers_for_synthetic_questions(challenger_chain , test)
challenger_answers_hf = Dataset.from_pandas(challenger_answers_pl.to_pandas())

challenger_result = evaluate(challenger_answers_hf, metrics=[answer_correctness])
challenger_result

<html>
<body>
    <div style="display: flex;">
            <a href="https://medium.com/@jgrygolec" style="width: 45em; height: 15em; background-color: white; padding-left: 1em; display: flex; flex-direction: column; justify-content: center; color: inherit; text-decoration: none;">
            <p style="margin: 0; font-size: 1.5em; color: #3D4849;">Thank you for your attention!</p>
            <p style="margin: 0; font-size: 1.5em; color: #3D4849; font-weight: bold;">Click to see more of my articles on Medium.</p>
            <p style="margin: 0; font-size: 1.5em; color: #3D4849;">BR,</p>
            <p style="margin: 0; font-size: 1.5em; color: #3D4849;">Jarek Grygolec</p>
        </a>
        <a href="https://medium.com/@jgrygolec" style="width: 30em; height: 15em; background-repeat: no-repeat; background-size: cover; background-position: center;">
                    <img src="https://miro.medium.com/v2/resize:fit:3992/0*WCQwsoZC0FA2-haq" style="width: 100%; height: 100%; object-fit: cover;"></a>
    </div>
</body>
</html>