-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Closed
Labels
answered🤖 The question has been answered. Will be closed automatically if no new comments🤖 The question has been answered. Will be closed automatically if no new commentsbugSomething isn't workingSomething isn't workingmodule-testsetgenModule testset generationModule testset generation
Description
Describe the bug
The generated testset has no 'metadata' column, so I cannot tell which source document each question relates to.
Ragas version:
Python version:
3.12
Code to Reproduce
Share code to reproduce the issue
import os
import logging
from pathlib import Path
from typing import Optional, List
import sys
from llama_index.core import SimpleDirectoryReader, Document
from ragas.testset import TestsetGenerator
from llama_index.llms.litellm import LiteLLM
from llama_index.embeddings.litellm import LiteLLMEmbedding
from dotenv import load_dotenv
import pandas as pd
# Route log records to both a log file and standard output.
logging.basicConfig(
    handlers=[
        logging.FileHandler('testset_generator.log'),
        logging.StreamHandler(sys.stdout),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger shared by the classes and functions in this script.
logger = logging.getLogger(__name__)
class TestsetGeneratorConfig:
    """Configuration for testset generation, read from environment variables.

    Values are read once in ``__init__`` after ``load_dotenv()`` has been
    called. Call ``validate()`` before using the configuration.
    """

    def __init__(self):
        load_dotenv()
        # Connection settings for the LLM / embedding endpoint.
        self.api_key = os.getenv("LITELLM_API_KEY", "")
        self.api_base = os.getenv("LITELLM_API", "")
        self.model_name = os.getenv("MODEL_NAME", "")
        self.embedding_model = os.getenv("EMBEDDING_MODEL", "")
        # Input and output locations, with local defaults.
        self.documents_dir = os.getenv("DOCUMENTS_DIR", "./abc")
        self.output_dir = os.getenv("OUTPUT_DIR", "./output")
        # A malformed TESTSET_SIZE must not crash construction with a bare
        # ValueError: record it as 0 so validate() reports it like any other
        # configuration problem.
        raw_testset_size = os.getenv("TESTSET_SIZE", "10")
        try:
            self.testset_size = int(raw_testset_size)
        except ValueError:
            logger.error(
                f"Invalid TESTSET_SIZE value {raw_testset_size!r}; expected an integer"
            )
            self.testset_size = 0

    def validate(self) -> bool:
        """Check that the configuration is complete and usable.

        Returns:
            True when every required environment variable is non-empty and
            ``testset_size`` is a positive integer; False otherwise. Each
            problem found is logged as an error.
        """
        required_fields = [
            ("LITELLM_API_KEY", self.api_key),
            ("LITELLM_API", self.api_base),
            ("MODEL_NAME", self.model_name),
            ("EMBEDDING_MODEL", self.embedding_model),
        ]
        missing_fields = [field for field, value in required_fields if not value]
        if missing_fields:
            logger.error(f"Missing required environment variables: {missing_fields}")
            return False
        # A testset needs at least one sample; 0 also marks an unparsable value.
        if self.testset_size <= 0:
            logger.error(
                f"TESTSET_SIZE must be a positive integer, got {self.testset_size}"
            )
            return False
        return True
class DocumentLoader:
    """Reads source documents from a directory on disk."""

    @staticmethod
    def load_documents(directory_path: str) -> Optional[List[Document]]:
        """
        Read every document found under ``directory_path``.

        Args:
            directory_path: Directory that holds the source documents

        Returns:
            The loaded documents, or None when the directory is missing,
            is empty, yields no documents, or loading raises an error
        """
        try:
            # Guard clauses: reject a missing or empty directory up front.
            if not os.path.exists(directory_path):
                logger.error(f"Directory does not exist: {directory_path}")
                return None

            if not os.listdir(directory_path):
                logger.error(f"Directory is empty: {directory_path}")
                return None

            loaded = SimpleDirectoryReader(directory_path).load_data()
            if not loaded:
                logger.error(f"No valid documents found in directory: {directory_path}")
                return None

            logger.info(f"Successfully loaded {len(loaded)} documents from {directory_path}")
            return loaded
        except Exception as err:
            # Any failure is logged and reported to the caller as None.
            logger.error(f"Error loading documents from {directory_path}: {err!s}")
            return None
class TestsetGeneratorService:
    """Creates the models, generates a testset from documents and saves it."""

    def __init__(self, config: TestsetGeneratorConfig):
        self.config = config
        # Populated by initialize_models(); None until then.
        self.llm = self.embeddings = self.generator = None

    def initialize_models(self) -> bool:
        """
        Create the LLM, the embedding model and the testset generator.

        Returns:
            True when all three objects were created, False if any step raised
        """
        try:
            settings = self.config
            self.llm = LiteLLM(
                model=settings.model_name,
                api_key=settings.api_key,
                api_base=settings.api_base,
            )
            self.embeddings = LiteLLMEmbedding(
                api_key=settings.api_key,
                api_base=settings.api_base,
                model_name=settings.embedding_model,
            )
            self.generator = TestsetGenerator.from_llama_index(
                llm=self.llm,
                embedding_model=self.embeddings,
            )
        except Exception as err:
            logger.error(f"Error initializing models: {err!s}")
            return False
        else:
            logger.info("Successfully initialized models")
            return True

    def generate_testset(self, documents: List[Document]) -> Optional[pd.DataFrame]:
        """
        Produce a testset DataFrame from the given documents.

        Args:
            documents: Documents the testset is generated from

        Returns:
            DataFrame restricted to the renamed target columns, or None when
            no documents were given, no target column is present, or an
            error occurs
        """
        try:
            if not documents:
                logger.error("No documents provided for testset generation")
                return None

            size = self.config.testset_size
            logger.info(f"Generating testset with size {size}")
            frame = self.generator.generate_with_llamaindex_docs(
                documents,
                testset_size=size,
            ).to_pandas()

            # Generator column name -> output column name.
            renames = {
                'user_input': 'input',
                'reference_contexts': 'context',
                'reference': 'reference_output',
            }

            # Warn about expected source columns the generator did not produce.
            absent = [source for source in renames if source not in frame.columns]
            if absent:
                logger.warning(f"Missing columns in testset: {absent}")

            frame = frame.rename(columns=renames)

            # Keep only the output columns that actually exist after renaming.
            kept = [target for target in renames.values() if target in frame.columns]
            if not kept:
                logger.error("No target columns found in the testset")
                return None

            frame = frame[kept]
            logger.info(f"Successfully generated testset with {len(frame)} entries")
            return frame
        except Exception as err:
            logger.error(f"Error generating testset: {err!s}")
            return None

    def save_testset(self, df: pd.DataFrame, filename: str = "testset_output.csv") -> bool:
        """
        Write the testset as CSV into the configured output directory.

        Args:
            df: DataFrame to write
            filename: Name of the CSV file inside the output directory

        Returns:
            True when the file was written, False if an error occurred
        """
        try:
            # Make sure the output directory exists before writing into it.
            target_dir = Path(self.config.output_dir)
            target_dir.mkdir(parents=True, exist_ok=True)

            destination = target_dir / filename
            df.to_csv(destination, index=False)
        except Exception as err:
            logger.error(f"Error saving testset: {err!s}")
            return False
        else:
            logger.info(f"Successfully saved testset to {destination}")
            return True
def main():
    """Run the pipeline: configure, load documents, init models, generate, save."""

    def _abort(reason):
        # Every failure path logs its reason and exits with status 1.
        logger.error(reason)
        sys.exit(1)

    logger.info("Starting testset generation process")

    # Step 1: configuration
    config = TestsetGeneratorConfig()
    if not config.validate():
        _abort("Configuration validation failed")

    # Step 2: source documents
    documents = DocumentLoader.load_documents(config.documents_dir)
    if documents is None:
        _abort("Failed to load documents")

    # Step 3: models and generator
    service = TestsetGeneratorService(config)
    if not service.initialize_models():
        _abort("Failed to initialize models")

    # Step 4: testset generation
    testset_df = service.generate_testset(documents)
    if testset_df is None:
        _abort("Failed to generate testset")

    # Step 5: persist the result
    if not service.save_testset(testset_df):
        _abort("Failed to save testset")

    logger.info("Testset generation completed successfully")


# Run the pipeline only when executed as a script.
if __name__ == "__main__":
    main()
Error trace
No Error
Expected behavior
The output should contain a metadata column that includes the source file name.
Additional context
Add any other context about the problem here.
dosubot
Metadata
Metadata
Assignees
Labels
answered🤖 The question has been answered. Will be closed automatically if no new comments🤖 The question has been answered. Will be closed automatically if no new commentsbugSomething isn't workingSomething isn't workingmodule-testsetgenModule testset generationModule testset generation