1 change: 1 addition & 0 deletions .gitignore
@@ -202,3 +202,4 @@ text2sql_logs
# MLflow artifacts
mlartifacts
mlflow.db
plan
1 change: 1 addition & 0 deletions CLAUDE.md
@@ -218,3 +218,4 @@ analytics_logger.addHandler(console_handler)
- **Minimal setup**: `[project.optional-dependencies].dev-minimal` for fast development (79 packages)
- **Full setup**: `[dependency-groups].dev` for comprehensive development (383 packages)
- Use `make install-minimal` for most development tasks, `make install` for full ML stack work
- if the user asks you to save a plan, save it into the plan/ directory with an appropriate file name.
18 changes: 9 additions & 9 deletions Makefile
@@ -52,11 +52,11 @@ install: ## Install full dependencies with uv sync (backward compatible - modern
format: ## Format and lint all code
@echo "Formatting and linting all code..."
@echo "(ruff format) Formatting ragas..."
-$(Q)uv run --active ruff format src tests docs --config pyproject.toml
+$(Q)uv run --active ruff format src tests docs --exclude src/ragas/_version.py --config pyproject.toml
@echo "(ruff) Auto-fixing ragas (includes import sorting and unused imports)..."
-$(Q)uv run --active ruff check src tests docs --fix-only --config pyproject.toml
+$(Q)uv run --active ruff check src tests docs --exclude src/ragas/_version.py --fix-only --config pyproject.toml
@echo "(ruff) Final linting check for ragas..."
-$(Q)uv run --active ruff check src tests docs --config pyproject.toml
+$(Q)uv run --active ruff check src tests docs --exclude src/ragas/_version.py --config pyproject.toml

type: ## Type check all code
@echo "Type checking all code..."
@@ -93,8 +93,8 @@ benchmarks-test: ## Run benchmarks for ragas unit tests
run-ci: ## Run complete CI pipeline (mirrors GitHub CI exactly)
@echo "Running complete CI pipeline..."
@echo "Format check..."
-$(Q)uv run --active ruff format --check src tests docs --config pyproject.toml
-$(Q)uv run --active ruff check src tests docs --config pyproject.toml
+$(Q)uv run --active ruff format --check src tests docs --exclude src/ragas/_version.py --config pyproject.toml
+$(Q)uv run --active ruff check src tests docs --exclude src/ragas/_version.py --config pyproject.toml
@echo "Type check..."
$(Q)$(MAKE) type
@echo "Unit tests..."
@@ -104,8 +104,8 @@ run-ci: ## Run complete CI pipeline (mirrors GitHub CI exactly)
run-ci-format-check: ## Run format check in dry-run mode (like GitHub CI)
@echo "Running format check (dry-run, like GitHub CI)..."
@echo "Checking ragas formatting..."
-$(Q)uv run --active ruff format --check src tests docs --config pyproject.toml
-$(Q)uv run --active ruff check src docs tests --config pyproject.toml
+$(Q)uv run --active ruff format --check src tests docs --exclude src/ragas/_version.py --config pyproject.toml
+$(Q)uv run --active ruff check src docs tests --exclude src/ragas/_version.py --config pyproject.toml

run-ci-type: ## Run type checking (matches GitHub CI)
@echo "Running type checking (matches GitHub CI)..."
@@ -118,8 +118,8 @@ run-ci-tests: ## Run all tests with CI options
run-ci-fast: ## Fast CI check for quick local validation (2-3 minutes)
@echo "Running fast CI check for quick feedback..."
@echo "Format check..."
-$(Q)uv run --active ruff format --check src tests docs --config pyproject.toml
-$(Q)uv run --active ruff check src docs tests --config pyproject.toml
+$(Q)uv run --active ruff format --check src tests docs --exclude src/ragas/_version.py --config pyproject.toml
+$(Q)uv run --active ruff check src docs tests --exclude src/ragas/_version.py --config pyproject.toml
@echo "Core unit tests (no nbmake for speed)..."
$(Q)uv run --active pytest tests/unit --dist loadfile -n auto -x
@echo "Fast CI check completed!"
197 changes: 197 additions & 0 deletions tests/e2e/metrics_migration/base_migration_test.py
@@ -0,0 +1,197 @@
"""Base test class for metrics migration E2E tests."""

from typing import Any, Callable, Dict, List, Optional

import pytest

from .test_utils import (
assert_score_types,
compare_scores_with_tolerance,
create_legacy_sample,
print_score_comparison,
print_test_header,
print_test_success,
)


class BaseMigrationTest:
"""Base class for metrics migration E2E tests.

Provides common functionality for testing compatibility between legacy and v2 implementations.
Subclasses should implement metric-specific test data and configurations.
"""

@pytest.mark.asyncio
async def run_e2e_compatibility_test(
self,
sample_data: List[Dict[str, Any]],
legacy_metric_factory: Callable,
v2_metric_factory: Callable,
v2_score_method_name: str = "ascore",
legacy_components: Optional[Dict[str, Any]] = None,
v2_components: Optional[Dict[str, Any]] = None,
tolerance: float = 0.3,
metric_name: str = "Metric",
additional_info_keys: Optional[List[str]] = None,
) -> None:
"""Run E2E compatibility test between legacy and v2 implementations.

Args:
sample_data: List of test cases, each as a dictionary
legacy_metric_factory: Function to create legacy metric instance
v2_metric_factory: Function to create v2 metric instance
v2_score_method_name: Name of the scoring method on v2 metric
legacy_components: Components for legacy metric (llm, embeddings, etc.)
v2_components: Components for v2 metric (llm, embeddings, etc.)
tolerance: Maximum allowed score difference
metric_name: Name of the metric for display
additional_info_keys: Keys from data dict to display in test output
"""
# Check if required components are available
if legacy_components:
if any(component is None for component in legacy_components.values()):
pytest.skip("Required components not available for E2E testing")

if v2_components:
if any(component is None for component in v2_components.values()):
pytest.skip("Required components not available for E2E testing")

# Create metric instances
legacy_metric = (
legacy_metric_factory(**legacy_components)
if legacy_components
else legacy_metric_factory()
)
v2_metric = (
v2_metric_factory(**v2_components) if v2_components else v2_metric_factory()
)

# Run tests for each sample
for i, data in enumerate(sample_data):
description = data.get("description", "No description")

# Prepare additional info for display
additional_info = {}
if additional_info_keys:
for key in additional_info_keys:
if key in data:
additional_info[key.replace("_", " ").title()] = str(data[key])

print_test_header(metric_name, i + 1, description, additional_info)

# Score with legacy implementation
legacy_sample = create_legacy_sample(data)
legacy_score = await legacy_metric._single_turn_ascore(legacy_sample, None)

# Score with v2 implementation
# Extract parameters for v2 scoring (exclude metadata keys)
v2_params = {k: v for k, v in data.items() if k != "description"}
v2_score_method = getattr(v2_metric, v2_score_method_name)
v2_result = await v2_score_method(**v2_params)

# Compare scores
print_score_comparison(legacy_score, v2_result.value)

# Assert scores are within tolerance
compare_scores_with_tolerance(
legacy_score,
v2_result.value,
tolerance,
description,
i + 1,
)

# Assert types and ranges
assert_score_types(legacy_score, v2_result)

print_test_success()

@pytest.mark.asyncio
async def run_metric_specific_test(
self,
test_cases: List[Dict[str, Any]],
legacy_metric_factory: Callable,
v2_metric_factory: Callable,
legacy_components: Optional[Dict[str, Any]] = None,
v2_components: Optional[Dict[str, Any]] = None,
test_name: str = "Metric Specific Test",
assertion_fn: Optional[Callable] = None,
) -> None:
"""Run a metric-specific test with custom assertions.

Args:
test_cases: List of test cases
legacy_metric_factory: Function to create legacy metric instance
v2_metric_factory: Function to create v2 metric instance
legacy_components: Components for legacy metric
v2_components: Components for v2 metric
test_name: Name of the test for display
assertion_fn: Optional custom assertion function that takes (case, legacy_score, v2_result)
"""
# Check if required components are available
if legacy_components:
if any(component is None for component in legacy_components.values()):
pytest.skip("Required components not available for testing")

if v2_components:
if any(component is None for component in v2_components.values()):
pytest.skip("Required components not available for testing")

# Create metric instances
legacy_metric = (
legacy_metric_factory(**legacy_components)
if legacy_components
else legacy_metric_factory()
)
v2_metric = (
v2_metric_factory(**v2_components) if v2_components else v2_metric_factory()
)

# Run tests for each case
for case in test_cases:
description = case.get("description", "No description")
print(f"\n🎯 Testing {test_name}: {description}")

# Score with legacy implementation
legacy_sample = create_legacy_sample(case)
legacy_score = await legacy_metric._single_turn_ascore(legacy_sample, None)

# Score with v2 implementation
v2_params = {
k: v
for k, v in case.items()
if k not in ["description", "expected_high", "expected_low"]
}
v2_result = await v2_metric.ascore(**v2_params)

# Print scores
print_score_comparison(legacy_score, v2_result.value)

# Run custom assertions if provided
if assertion_fn:
assertion_fn(case, legacy_score, v2_result)
else:
# Default: just verify types
assert_score_types(legacy_score, v2_result)

def create_requirements_documentation(
self,
metric_name: str,
requirements: Dict[str, str],
test_file_name: str,
) -> None:
"""Print documentation about E2E test requirements.

Args:
metric_name: Name of the metric
requirements: Dictionary of requirements
test_file_name: Name of the test file
"""
print(f"\nπŸ“‹ {metric_name} E2E Test Requirements:")
for key, value in requirements.items():
print(f" {key.capitalize()}: {value}")

print("\nπŸš€ To enable full E2E testing:")
print(" 1. Configure required providers (e.g., export OPENAI_API_KEY=...)")
print(" 2. Remove @pytest.mark.skip decorators")
print(f" 3. Run: pytest tests/e2e/metrics_migration/{test_file_name} -v -s")
69 changes: 69 additions & 0 deletions tests/e2e/metrics_migration/conftest.py
@@ -0,0 +1,69 @@
"""Common fixtures for metrics migration E2E tests.

This module provides pytest fixtures that wrap the shared utility functions
from tests.utils.llm_setup for use in E2E migration tests.
"""

import pytest

from tests.utils import (
create_legacy_embeddings,
create_legacy_llm,
create_modern_embeddings,
create_modern_llm,
)


@pytest.fixture
def legacy_llm():
"""Create a test LLM for legacy metric evaluation.

Uses legacy llm_factory for legacy implementation.
Skips if LLM factory is not available or API key is missing.
"""
try:
return create_legacy_llm("gpt-3.5-turbo")
except Exception as e:
pytest.skip(str(e))


@pytest.fixture
def modern_llm():
"""Create a modern instructor LLM for v2 implementation.

Uses instructor_llm_factory with OpenAI client.
Skips if instructor LLM factory is not available or API key is missing.
"""
try:
return create_modern_llm("openai", model="gpt-3.5-turbo")
except Exception as e:
pytest.skip(str(e))


@pytest.fixture
def legacy_embeddings():
"""Create legacy embeddings for legacy implementation.

Uses legacy embedding_factory interface.
Skips if embedding factory is not available or API key is missing.
"""
try:
return create_legacy_embeddings("text-embedding-ada-002")
except Exception as e:
pytest.skip(str(e))


@pytest.fixture
def modern_embeddings():
"""Create modern embeddings for v2 implementation.

Uses modern interface with explicit provider and client.
Skips if OpenAI or embedding factory is not available or API key is missing.
"""
try:
return create_modern_embeddings(
provider="openai",
model="text-embedding-ada-002",
)
except Exception as e:
pytest.skip(str(e))
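
As an illustration of how these fixtures plug into the base class, a directional check using run_metric_specific_test with a custom assertion_fn might look like the sketch below. The metric classes, the v2 import path, and the 0.5 threshold are assumptions for illustration, not part of this change.

# Hypothetical sketch -- not part of this PR. Shows run_metric_specific_test
# with a custom assertion_fn; the v2 import path is a placeholder.

import pytest

from ragas.metrics import AnswerRelevancy  # legacy implementation

from .base_migration_test import BaseMigrationTest


class TestAnswerRelevancyMigration(BaseMigrationTest):
    @pytest.mark.asyncio
    async def test_directional_agreement(
        self, legacy_llm, modern_llm, legacy_embeddings, modern_embeddings
    ):
        from ragas.metrics.collections import AnswerRelevancy as AnswerRelevancyV2  # assumed v2 path

        test_cases = [
            {
                "description": "On-topic answer should score high",
                "user_input": "How do I reset my password?",
                "response": "Open the login page and click 'Forgot password'.",
                "expected_high": True,
            },
            {
                "description": "Off-topic answer should score low",
                "user_input": "How do I reset my password?",
                "response": "Our office is closed on public holidays.",
                "expected_low": True,
            },
        ]

        def assert_direction(case, legacy_score, v2_result):
            # Both implementations should land on the same side of 0.5.
            if case.get("expected_high"):
                assert legacy_score > 0.5 and v2_result.value > 0.5
            if case.get("expected_low"):
                assert legacy_score < 0.5 and v2_result.value < 0.5

        await self.run_metric_specific_test(
            test_cases=test_cases,
            legacy_metric_factory=lambda llm, embeddings: AnswerRelevancy(
                llm=llm, embeddings=embeddings
            ),
            v2_metric_factory=lambda llm, embeddings: AnswerRelevancyV2(
                llm=llm, embeddings=embeddings
            ),
            legacy_components={"llm": legacy_llm, "embeddings": legacy_embeddings},
            v2_components={"llm": modern_llm, "embeddings": modern_embeddings},
            test_name="Answer Relevancy",
            assertion_fn=assert_direction,
        )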