More information on generating RAG testsets with Ragas can be found at the link below:
https://github.com/explodinggradients/ragas/blob/main/docs/getstarted/rag_testset_generation.md

# Setup

In [1]:
# Uncomment the line below to install the dependencies listed in requirements.txt.
# %pip install -r requirements.txt

# Environment Variables

In [2]:
# Make the settings stored in the local .env file available through os.environ.
# Later cells read PROJECT_ID, GOOGLE_LLM_MODEL_NAME and GOOGLE_EMBEDDING_MODEL_NAME from it
# when USE_GOOGLE is enabled.
# NOTE(review): the OpenAI path presumably needs its API key here as well; confirm.

from dotenv import load_dotenv

# Load environment variables from the .env file (the recorded cell output shows it returned True).
load_dotenv()

True

# Utils

In [3]:
# Create a custom is_finished_parser to capture Gemini generation completion signals
# https://docs.ragas.io/en/stable/howtos/customizations/customize_models/#google-vertex

from langchain_core.outputs import LLMResult

def gemini_is_finished_parser(response: "LLMResult") -> bool:
	"""Report whether every generation in ``response`` ended with an accepted finish reason.

	For each flattened result the first generation is inspected. The reason is
	read from ``generation_info["finish_reason"]`` when present; otherwise from
	the chat message's ``response_metadata`` (``finish_reason`` first, then
	``stop_reason``). ``"STOP"`` and ``"MAX_TOKENS"`` count as finished.
	Generations that expose no reason at all are treated as finished.

	Args:
		response: The LLM result to inspect. Only ``flatten()`` and the
			attributes named above are accessed.

	Returns:
		True when no inspected generation reports a reason other than
		``"STOP"`` or ``"MAX_TOKENS"`` (including when none reports a reason),
		False otherwise.
	"""
	finished_reasons = ("STOP", "MAX_TOKENS")
	is_finished_list = []

	for g in response.flatten():
		resp = g.generations[0][0]

		# Preferred source: generation_info attached to the generation itself.
		generation_info = getattr(resp, "generation_info", None)
		if generation_info is not None:
			finish_reason = generation_info.get("finish_reason")
			if finish_reason is not None:
				is_finished_list.append(finish_reason in finished_reasons)
				continue

		# Fallback: metadata on the chat message. Chat generations are recognised
		# by their `message` attribute, so no extra class import is required.
		message = getattr(resp, "message", None)
		if message is not None:
			metadata = getattr(message, "response_metadata", None) or {}
			reason = metadata.get("finish_reason") or metadata.get("stop_reason")
			if reason:
				is_finished_list.append(reason in finished_reasons)

		# A generation with no reason contributes nothing; all() of the
		# remaining entries (or of an empty list) then defaults to True.

	return all(is_finished_list)

# Generate Unittest Testset

In [4]:
from langchain_community.document_loaders import DirectoryLoader

# Match every HTML file below docs/10k/html and report progress while loading.
loader = DirectoryLoader(
	"docs/10k/html",
	glob="**/*.html",
	show_progress=True,
)

# Read the matched files into the document list used by the generator cell.
docs = loader.load()

100%|██████████| 2/2 [00:02<00:00,  1.41s/it]


In [None]:
# https://docs.ragas.io/en/stable/howtos/customizations/customize_models/#google-vertex 
# https://docs.ragas.io/en/v0.1.21/howtos/customisations/run_config.html

import os

# Import the submodule explicitly: a bare `import google` only binds the namespace
# package and does not by itself guarantee that `google.auth` has been loaded.
import google.auth

from langchain_google_vertexai import ChatVertexAI, VertexAIEmbeddings
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.run_config import RunConfig

# Provider switch: True selects Google Vertex AI, False selects OpenAI.
USE_GOOGLE = False

if USE_GOOGLE:
	# Default Google credentials, with the quota project taken from PROJECT_ID.
	creds, _ = google.auth.default(quota_project_id=os.environ["PROJECT_ID"])

	# Model names are read from the environment (see the dotenv cell).
	llm = ChatVertexAI(credentials=creds, model_name=os.environ["GOOGLE_LLM_MODEL_NAME"],)
	embeddings = VertexAIEmbeddings(credentials=creds, model_name=os.environ["GOOGLE_EMBEDDING_MODEL_NAME"])

	# Wrap for ragas; the custom parser defined in the Utils cell decides
	# whether a Gemini generation counts as finished.
	llm = LangchainLLMWrapper(llm, is_finished_parser=gemini_is_finished_parser)
	embeddings = LangchainEmbeddingsWrapper(embeddings)

	# The Google path runs with 2 workers.
	run_config = RunConfig(max_workers=2)
else:
	llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
	embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

	# The OpenAI path runs with 16 workers.
	run_config = RunConfig(max_workers=16)

In [7]:
import time

from ragas.testset import TestsetGenerator
from ragas.testset.synthesizers import default_query_distribution

# Small sample count: this cell is the quick smoke-test run.
TESTSET_SIZE = 5

# Default mix of query synthesizers, driven by the configured LLM.
query_distribution = default_query_distribution(llm)

# Build the generator from the wrapped LLM and embedding model.
generator = TestsetGenerator(
	llm=llm,
	embedding_model=embeddings,
)

# Produce the testset from the loaded documents.
testset = generator.generate_with_langchain_docs(
	docs,
	testset_size=TESTSET_SIZE,
	query_distribution=query_distribution,
	run_config=run_config,
)

# Persist the testset as CSV inside the testsets/ folder.
file_name = "unittest_testset.csv"
testset.to_csv(f"testsets/{file_name}")

# Render the testset as a DataFrame in the cell output.
testset.to_pandas()

Applying HeadlinesExtractor:   0%|          | 0/2 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/2 [00:00<?, ?it/s]

Applying SummaryExtractor:   0%|          | 0/2 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/162 [00:00<?, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/234 [00:00<?, ?it/s]

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,Wut is the Annual Report on Form 10-K?,[PART II Item 5. Market for Registrant’s Commo...,The Annual Report on Form 10-K contains forwar...,single_hop_specifc_query_synthesizer
1,How does Uber utilize technology to enhance it...,"[statements, such information may be limited o...","Uber utilizes a massive network, leading techn...",single_hop_specifc_query_synthesizer
2,How does revenue recognition for mobility serv...,[<1-hop>\n\ndollar value of transactions invoi...,Revenue recognition for mobility services is p...,multi_hop_abstract_query_synthesizer
3,What are the key factors contributing to reven...,[<1-hop>\n\ndollar value of transactions invoi...,The key factors contributing to revenue recogn...,multi_hop_abstract_query_synthesizer
4,How do economic conditions in Egypt affect the...,[<1-hop>\n\nthereto had an adverse impact on o...,Economic conditions in Egypt can significantly...,multi_hop_specific_query_synthesizer
5,What significant events related to security in...,"[<1-hop>\n\n2022, we settled our UK VAT disput...","In November 2021, Drizly obtained final court ...",multi_hop_specific_query_synthesizer


# Generate Full Testset

In [None]:
from langchain_community.document_loaders import DirectoryLoader

# Folder containing the 10-K filings in HTML form.
path = "docs/10k/html"

# Match every HTML file below `path` and report progress while loading.
loader = DirectoryLoader(
	path=path,
	glob="**/*.html",
	show_progress=True,
)

# Read the matched files into the document list used by the generator cell.
docs = loader.load()

In [None]:
import time

from ragas.testset import TestsetGenerator
from ragas.testset.synthesizers import default_query_distribution

TESTSET_SIZE = 50

# `llm`, `embeddings` and `run_config` are created in the model-setup cell;
# `docs` comes from the loader cell directly above.
query_distribution = default_query_distribution(llm)

# Create the generator
generator = TestsetGenerator(llm=llm, embedding_model=embeddings)

# Generate the testset, using the same run configuration as the unit-test run
testset = generator.generate_with_langchain_docs(
	docs,
	testset_size=TESTSET_SIZE,
	query_distribution=query_distribution,
	run_config=run_config,
)

# Write the testset to disk; the timestamp keeps earlier runs from being overwritten
timestr = time.strftime("%Y%m%d-%H%M%S")
file_name = f"10k_testset_{timestr}.csv"
testset.to_csv(f"testsets/{file_name}")

# Display the testset
testset.to_pandas()