diff --git a/.gitignore b/.gitignore index bf49b53bf..f817be47d 100644 --- a/.gitignore +++ b/.gitignore @@ -202,3 +202,4 @@ text2sql_logs # MLflow artifacts mlartifacts mlflow.db +plan diff --git a/CLAUDE.md b/CLAUDE.md index 41f0f2c50..8d913a88f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -218,3 +218,4 @@ analytics_logger.addHandler(console_handler) - **Minimal setup**: `[project.optional-dependencies].dev-minimal` for fast development (79 packages) - **Full setup**: `[dependency-groups].dev` for comprehensive development (383 packages) - Use `make install-minimal` for most development tasks, `make install` for full ML stack work +- If the user asks you to save a plan, save it in the `plan/` directory with an appropriate file name. diff --git a/Makefile b/Makefile index 9150843db..3e4b4fb19 100644 --- a/Makefile +++ b/Makefile @@ -52,11 +52,11 @@ install: ## Install full dependencies with uv sync (backward compatible - modern format: ## Format and lint all code @echo "Formatting and linting all code..." @echo "(ruff format) Formatting ragas..." - $(Q)uv run --active ruff format src tests docs --config pyproject.toml + $(Q)uv run --active ruff format src tests docs --exclude src/ragas/_version.py --config pyproject.toml @echo "(ruff) Auto-fixing ragas (includes import sorting and unused imports)..." - $(Q)uv run --active ruff check src tests docs --fix-only --config pyproject.toml + $(Q)uv run --active ruff check src tests docs --exclude src/ragas/_version.py --fix-only --config pyproject.toml @echo "(ruff) Final linting check for ragas..." - $(Q)uv run --active ruff check src tests docs --config pyproject.toml + $(Q)uv run --active ruff check src tests docs --exclude src/ragas/_version.py --config pyproject.toml type: ## Type check all code @echo "Type checking all code..." @@ -93,8 +93,8 @@ benchmarks-test: ## Run benchmarks for ragas unit tests run-ci: ## Run complete CI pipeline (mirrors GitHub CI exactly) @echo "Running complete CI pipeline..." 
@echo "Format check..." - $(Q)uv run --active ruff format --check src tests docs --config pyproject.toml - $(Q)uv run --active ruff check src tests docs --config pyproject.toml + $(Q)uv run --active ruff format --check src tests docs --exclude src/ragas/_version.py --config pyproject.toml + $(Q)uv run --active ruff check src tests docs --exclude src/ragas/_version.py --config pyproject.toml @echo "Type check..." $(Q)$(MAKE) type @echo "Unit tests..." @@ -104,8 +104,8 @@ run-ci: ## Run complete CI pipeline (mirrors GitHub CI exactly) run-ci-format-check: ## Run format check in dry-run mode (like GitHub CI) @echo "Running format check (dry-run, like GitHub CI)..." @echo "Checking ragas formatting..." - $(Q)uv run --active ruff format --check src tests docs --config pyproject.toml - $(Q)uv run --active ruff check src docs tests --config pyproject.toml + $(Q)uv run --active ruff format --check src tests docs --exclude src/ragas/_version.py --config pyproject.toml + $(Q)uv run --active ruff check src docs tests --exclude src/ragas/_version.py --config pyproject.toml run-ci-type: ## Run type checking (matches GitHub CI) @echo "Running type checking (matches GitHub CI)..." @@ -118,8 +118,8 @@ run-ci-tests: ## Run all tests with CI options run-ci-fast: ## Fast CI check for quick local validation (2-3 minutes) @echo "Running fast CI check for quick feedback..." @echo "Format check..." - $(Q)uv run --active ruff format --check src tests docs --config pyproject.toml - $(Q)uv run --active ruff check src docs tests --config pyproject.toml + $(Q)uv run --active ruff format --check src tests docs --exclude src/ragas/_version.py --config pyproject.toml + $(Q)uv run --active ruff check src docs tests --exclude src/ragas/_version.py --config pyproject.toml @echo "Core unit tests (no nbmake for speed)..." $(Q)uv run --active pytest tests/unit --dist loadfile -n auto -x @echo "Fast CI check completed!" 
diff --git a/tests/e2e/metrics_migration/base_migration_test.py b/tests/e2e/metrics_migration/base_migration_test.py new file mode 100644 index 000000000..d8f1fcb4d --- /dev/null +++ b/tests/e2e/metrics_migration/base_migration_test.py @@ -0,0 +1,197 @@ +"""Base test class for metrics migration E2E tests.""" + +from typing import Any, Callable, Dict, List, Optional + +import pytest + +from .test_utils import ( + assert_score_types, + compare_scores_with_tolerance, + create_legacy_sample, + print_score_comparison, + print_test_header, + print_test_success, +) + + +class BaseMigrationTest: + """Base class for metrics migration E2E tests. + + Provides common functionality for testing compatibility between legacy and v2 implementations. + Subclasses should implement metric-specific test data and configurations. + """ + + @pytest.mark.asyncio + async def run_e2e_compatibility_test( + self, + sample_data: List[Dict[str, Any]], + legacy_metric_factory: Callable, + v2_metric_factory: Callable, + v2_score_method_name: str = "ascore", + legacy_components: Optional[Dict[str, Any]] = None, + v2_components: Optional[Dict[str, Any]] = None, + tolerance: float = 0.3, + metric_name: str = "Metric", + additional_info_keys: Optional[List[str]] = None, + ) -> None: + """Run E2E compatibility test between legacy and v2 implementations. + + Args: + sample_data: List of test cases, each as a dictionary + legacy_metric_factory: Function to create legacy metric instance + v2_metric_factory: Function to create v2 metric instance + v2_score_method_name: Name of the scoring method on v2 metric + legacy_components: Components for legacy metric (llm, embeddings, etc.) + v2_components: Components for v2 metric (llm, embeddings, etc.) 
+ tolerance: Maximum allowed score difference + metric_name: Name of the metric for display + additional_info_keys: Keys from data dict to display in test output + """ + # Check if required components are available + if legacy_components: + if any(component is None for component in legacy_components.values()): + pytest.skip("Required components not available for E2E testing") + + if v2_components: + if any(component is None for component in v2_components.values()): + pytest.skip("Required components not available for E2E testing") + + # Create metric instances + legacy_metric = ( + legacy_metric_factory(**legacy_components) + if legacy_components + else legacy_metric_factory() + ) + v2_metric = ( + v2_metric_factory(**v2_components) if v2_components else v2_metric_factory() + ) + + # Run tests for each sample + for i, data in enumerate(sample_data): + description = data.get("description", "No description") + + # Prepare additional info for display + additional_info = {} + if additional_info_keys: + for key in additional_info_keys: + if key in data: + additional_info[key.replace("_", " ").title()] = str(data[key]) + + print_test_header(metric_name, i + 1, description, additional_info) + + # Score with legacy implementation + legacy_sample = create_legacy_sample(data) + legacy_score = await legacy_metric._single_turn_ascore(legacy_sample, None) + + # Score with v2 implementation + # Extract parameters for v2 scoring (exclude metadata keys) + v2_params = {k: v for k, v in data.items() if k != "description"} + v2_score_method = getattr(v2_metric, v2_score_method_name) + v2_result = await v2_score_method(**v2_params) + + # Compare scores + print_score_comparison(legacy_score, v2_result.value) + + # Assert scores are within tolerance + compare_scores_with_tolerance( + legacy_score, + v2_result.value, + tolerance, + description, + i + 1, + ) + + # Assert types and ranges + assert_score_types(legacy_score, v2_result) + + print_test_success() + + @pytest.mark.asyncio + 
async def run_metric_specific_test( + self, + test_cases: List[Dict[str, Any]], + legacy_metric_factory: Callable, + v2_metric_factory: Callable, + legacy_components: Optional[Dict[str, Any]] = None, + v2_components: Optional[Dict[str, Any]] = None, + test_name: str = "Metric Specific Test", + assertion_fn: Optional[Callable] = None, + ) -> None: + """Run a metric-specific test with custom assertions. + + Args: + test_cases: List of test cases + legacy_metric_factory: Function to create legacy metric instance + v2_metric_factory: Function to create v2 metric instance + legacy_components: Components for legacy metric + v2_components: Components for v2 metric + test_name: Name of the test for display + assertion_fn: Optional custom assertion function that takes (case, legacy_score, v2_result) + """ + # Check if required components are available + if legacy_components: + if any(component is None for component in legacy_components.values()): + pytest.skip("Required components not available for testing") + + if v2_components: + if any(component is None for component in v2_components.values()): + pytest.skip("Required components not available for testing") + + # Create metric instances + legacy_metric = ( + legacy_metric_factory(**legacy_components) + if legacy_components + else legacy_metric_factory() + ) + v2_metric = ( + v2_metric_factory(**v2_components) if v2_components else v2_metric_factory() + ) + + # Run tests for each case + for case in test_cases: + description = case.get("description", "No description") + print(f"\n🎯 Testing {test_name}: {description}") + + # Score with legacy implementation + legacy_sample = create_legacy_sample(case) + legacy_score = await legacy_metric._single_turn_ascore(legacy_sample, None) + + # Score with v2 implementation + v2_params = { + k: v + for k, v in case.items() + if k not in ["description", "expected_high", "expected_low"] + } + v2_result = await v2_metric.ascore(**v2_params) + + # Print scores + 
print_score_comparison(legacy_score, v2_result.value) + + # Run custom assertions if provided + if assertion_fn: + assertion_fn(case, legacy_score, v2_result) + else: + # Default: just verify types + assert_score_types(legacy_score, v2_result) + + def create_requirements_documentation( + self, + metric_name: str, + requirements: Dict[str, str], + test_file_name: str, + ) -> None: + """Print documentation about E2E test requirements. + + Args: + metric_name: Name of the metric + requirements: Dictionary of requirements + test_file_name: Name of the test file + """ + print(f"\n📋 {metric_name} E2E Test Requirements:") + for key, value in requirements.items(): + print(f" {key.capitalize()}: {value}") + + print("\n🚀 To enable full E2E testing:") + print(" 1. Configure required providers (e.g., export OPENAI_API_KEY=...)") + print(" 2. Remove @pytest.mark.skip decorators") + print(f" 3. Run: pytest tests/e2e/metrics_migration/{test_file_name} -v -s") diff --git a/tests/e2e/metrics_migration/conftest.py b/tests/e2e/metrics_migration/conftest.py new file mode 100644 index 000000000..22dd7ead1 --- /dev/null +++ b/tests/e2e/metrics_migration/conftest.py @@ -0,0 +1,69 @@ +"""Common fixtures for metrics migration E2E tests. + +This module provides pytest fixtures that wrap the shared utility functions +from tests.utils.llm_setup for use in E2E migration tests. +""" + +import pytest + +from tests.utils import ( + create_legacy_embeddings, + create_legacy_llm, + create_modern_embeddings, + create_modern_llm, +) + + +@pytest.fixture +def legacy_llm(): + """Create a test LLM for legacy metric evaluation. + + Uses legacy llm_factory for legacy implementation. + Skips if LLM factory is not available or API key is missing. + """ + try: + return create_legacy_llm("gpt-3.5-turbo") + except Exception as e: + pytest.skip(str(e)) + + +@pytest.fixture +def modern_llm(): + """Create a modern instructor LLM for v2 implementation. + + Uses instructor_llm_factory with OpenAI client. 
+ Skips if instructor LLM factory is not available or API key is missing. + """ + try: + return create_modern_llm("openai", model="gpt-3.5-turbo") + except Exception as e: + pytest.skip(str(e)) + + +@pytest.fixture +def legacy_embeddings(): + """Create legacy embeddings for legacy implementation. + + Uses legacy embedding_factory interface. + Skips if embedding factory is not available or API key is missing. + """ + try: + return create_legacy_embeddings("text-embedding-ada-002") + except Exception as e: + pytest.skip(str(e)) + + +@pytest.fixture +def modern_embeddings(): + """Create modern embeddings for v2 implementation. + + Uses modern interface with explicit provider and client. + Skips if OpenAI or embedding factory is not available or API key is missing. + """ + try: + return create_modern_embeddings( + provider="openai", + model="text-embedding-ada-002", + ) + except Exception as e: + pytest.skip(str(e)) diff --git a/tests/e2e/metrics_migration/metric_score_diff.ipynb b/tests/e2e/metrics_migration/metric_score_diff.ipynb new file mode 100644 index 000000000..8ec16c5c5 --- /dev/null +++ b/tests/e2e/metrics_migration/metric_score_diff.ipynb @@ -0,0 +1,1293 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Metrics Migration Testing Notebook (General Purpose)\n", + "\n", + "This notebook provides a **generalized, reusable approach** for comparing legacy and modern metric implementations.\n", + "\n", + "## Quick Start\n", + "1. **Edit the Configuration Cell** (cell 2) with your metric details\n", + "2. Run all cells - no other modifications needed!\n", + "3. Works for ANY metric type: LLM-based, embeddings-based, or deterministic\n", + "\n", + "## Purpose\n", + "- **PRIMARY**: Validate migration on real-world datasets (amnesty_qa, fiqa)\n", + "- **SECONDARY**: Test specific edge cases and behaviors\n", + "- **FLEXIBLE**: Works with any metric configuration\n", + "\n", + "## Structure\n", + "1. 
Configuration (specify your metrics and requirements)\n", + "2. Setup and component creation\n", + "3. Dataset-based comparison (Amnesty QA)\n", + "4. FIQA dataset testing (domain generalization)\n", + "5. Optional: Different LLMs, edge cases\n", + "\n", + "Based on: `tests/e2e/plan-for-metrics-migration.md`" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "# Ragas imports" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Configuration loaded - Edit above for your metric\n" + ] + } + ], + "source": [ + "## ⚠️ CONFIGURATION CELL - EDIT THIS FOR YOUR METRIC ⚠️\n", + "\n", + "# Metric Configuration - Update these values for any metric\n", + "METRIC_CONFIG = {\n", + " # ===== METRIC IMPORTS =====\n", + " \"legacy_import\": {\n", + " \"module\": \"ragas.metrics._answer_relevance\", # e.g., \"ragas.metrics._context_recall\"\n", + " \"class_name\": \"AnswerRelevancy\", # e.g., \"ContextRecall\"\n", + " },\n", + " \"modern_import\": {\n", + " \"module\": \"ragas.metrics.collections\",\n", + " \"class_name\": \"AnswerRelevancy\",\n", + " },\n", + " # ===== COMPONENT REQUIREMENTS =====\n", + " # Set to False if your metric doesn't need this component\n", + " \"needs_llm\": True,\n", + " \"needs_embeddings\": True,\n", + " # ===== DATASET FIELD MAPPING =====\n", + " # Which fields does your metric require from the dataset?\n", + " # Choose ONE based on your metric type:\n", + " # OPTION 1: Answer-based metrics (AnswerRelevancy, AnswerSimilarity, etc.)\n", + " \"dataset_fields\": [\"user_input\", \"response\"],\n", + " # OPTION 2: Context-based metrics (ContextRecall, ContextPrecision, etc.)\n", + " # \"dataset_fields\": [\"user_input\", \"retrieved_contexts\", \"reference\"],\n", + " # OPTION 3: Deterministic metrics (NonLLMContextRecall, etc.)\n", + " # 
\"dataset_fields\": [\"retrieved_contexts\", \"reference_contexts\"],\n", + " # \"needs_llm\": False,\n", + " # \"needs_embeddings\": False,\n", + "}\n", + "\n", + "# ===== QUICK REFERENCE =====\n", + "# AnswerRelevancy: dataset_fields = [\"user_input\", \"response\"], needs_llm = True, needs_embeddings = True\n", + "# ContextRecall: dataset_fields = [\"user_input\", \"retrieved_contexts\", \"reference\"], needs_llm = True, needs_embeddings = False\n", + "# NonLLMContextRecall: dataset_fields = [\"retrieved_contexts\", \"reference_contexts\"], needs_llm = False, needs_embeddings = False\n", + "# ContextPrecision: dataset_fields = [\"user_input\", \"retrieved_contexts\", \"reference\"], needs_llm = True, needs_embeddings = False\n", + "\n", + "print(\"✓ Configuration loaded - Edit above for your metric\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "METRIC_CONFIG = {\n", + " # ===== METRIC IMPORTS =====\n", + " \"legacy_import\": {\n", + " \"module\": \"ragas.metrics._context_precision\",\n", + " \"class_name\": \"LLMContextPrecisionWithReference\",\n", + " },\n", + " \"modern_import\": {\n", + " \"module\": \"ragas.metrics.collections\",\n", + " \"class_name\": \"ContextPrecision\",\n", + " },\n", + " # ===== COMPONENT REQUIREMENTS =====\n", + " \"needs_llm\": True,\n", + " \"needs_embeddings\": False,\n", + " # ===== DATASET FIELD MAPPING =====\n", + " # Context-based metric using user_input, retrieved_contexts, and reference\n", + " \"dataset_fields\": [\"user_input\", \"retrieved_contexts\", \"reference\"],\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "Make sure you have your OpenAI API key set as an environment variable before running this notebook." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import importlib\n", + "import sys\n", + "from pathlib import Path\n", + "\n", + "# Add project root to path\n", + "project_root = Path.cwd().parent.parent.parent\n", + "sys.path.insert(0, str(project_root))\n", + "\n", + "from tests.utils import check_api_key # noqa: E402\n", + "\n", + "# Check for OpenAI API key\n", + "check_api_key(\"openai\")\n", + "print(\"✓ Setup complete\")\n", + "\n", + "\n", + "# ===== DYNAMIC METRIC LOADING =====\n", + "def load_metric_class(import_config):\n", + " \"\"\"Dynamically load a metric class from module and class name.\"\"\"\n", + " try:\n", + " module = importlib.import_module(import_config[\"module\"])\n", + " return getattr(module, import_config[\"class_name\"])\n", + " except (ImportError, AttributeError) as e:\n", + " raise ValueError(\n", + " f\"Failed to load {import_config['class_name']} from {import_config['module']}: {e}\"\n", + " )\n", + "\n", + "\n", + "# Load metric classes from config\n", + "LegacyMetric = load_metric_class(METRIC_CONFIG[\"legacy_import\"])\n", + "ModernMetric = load_metric_class(METRIC_CONFIG[\"modern_import\"])\n", + "\n", + "print(\"✓ Metric classes loaded:\")\n", + "print(\n", + " f\" Legacy: {METRIC_CONFIG['legacy_import']['class_name']} from {METRIC_CONFIG['legacy_import']['module']}\"\n", + ")\n", + "print(\n", + " f\" Modern: {METRIC_CONFIG['modern_import']['class_name']} from {METRIC_CONFIG['modern_import']['module']}\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import Comparison Utilities\n", + "\n", + "The `compare_metrics` function is imported from `tests.utils` and provides:\n", + "- Concurrent processing for better performance\n", + "- Parallel or sequential metric execution\n", + "- Built-in result aggregation and statistics" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { 
+ "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Comparison utilities loaded\n" + ] + } + ], + "source": [ + "from tests.utils import compare_metrics\n", + "\n", + "print(\"✓ Comparison utilities loaded\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create LLM and Embeddings Components\n", + "\n", + "Use shared test utilities to create legacy and modern components based on configuration." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ LLM components created\n", + "✓ All required components created\n" + ] + } + ], + "source": [ + "from tests.utils import (\n", + " create_legacy_embeddings,\n", + " create_legacy_llm,\n", + " create_modern_embeddings,\n", + " create_modern_llm,\n", + ")\n", + "\n", + "# ===== CREATE COMPONENTS BASED ON CONFIGURATION =====\n", + "components_config = {\n", + " \"legacy_llm\": None,\n", + " \"legacy_embeddings\": None,\n", + " \"modern_llm\": None,\n", + " \"modern_embeddings\": None,\n", + "}\n", + "\n", + "if METRIC_CONFIG[\"needs_llm\"]:\n", + " components_config[\"legacy_llm\"] = create_legacy_llm(model=\"gpt-4o-mini\")\n", + " components_config[\"modern_llm\"] = create_modern_llm(\n", + " provider=\"openai\", model=\"gpt-4o-mini\"\n", + " )\n", + " print(\"✓ LLM components created\")\n", + "\n", + "if METRIC_CONFIG[\"needs_embeddings\"]:\n", + " components_config[\"legacy_embeddings\"] = create_legacy_embeddings(\n", + " model=\"text-embedding-ada-002\"\n", + " )\n", + " components_config[\"modern_embeddings\"] = create_modern_embeddings(\n", + " provider=\"openai\", model=\"text-embedding-ada-002\"\n", + " )\n", + " print(\"✓ Embeddings components created\")\n", + "\n", + "print(\"✓ All required components created\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize Metrics\n", + "\n", + "Uses the dynamically loaded 
metric classes and configured components." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Metrics initialized:\n", + " Legacy: llm_context_precision_with_reference\n", + " Modern: context_precision\n", + " Dataset fields required: ['user_input', 'retrieved_contexts', 'reference']\n" + ] + } + ], + "source": [ + "# ===== INITIALIZE METRICS DYNAMICALLY =====\n", + "def init_metric(metric_class, components_config, is_legacy=True):\n", + " \"\"\"Initialize a metric with available components.\"\"\"\n", + " prefix = \"legacy_\" if is_legacy else \"modern_\"\n", + "\n", + " # Build kwargs from available components\n", + " kwargs = {}\n", + " if components_config[f\"{prefix}llm\"]:\n", + " kwargs[\"llm\"] = components_config[f\"{prefix}llm\"]\n", + " if components_config[f\"{prefix}embeddings\"]:\n", + " kwargs[\"embeddings\"] = components_config[f\"{prefix}embeddings\"]\n", + "\n", + " return metric_class(**kwargs)\n", + "\n", + "\n", + "# Initialize metrics\n", + "legacy_metric = init_metric(LegacyMetric, components_config, is_legacy=True)\n", + "modern_metric = init_metric(ModernMetric, components_config, is_legacy=False)\n", + "\n", + "# Display initialized metrics\n", + "legacy_name = getattr(legacy_metric, \"name\", legacy_metric.__class__.__name__)\n", + "modern_name = getattr(modern_metric, \"name\", modern_metric.__class__.__name__)\n", + "\n", + "print(\"✓ Metrics initialized:\")\n", + "print(f\" Legacy: {legacy_name}\")\n", + "print(f\" Modern: {modern_name}\")\n", + "print(f\" Dataset fields required: {METRIC_CONFIG['dataset_fields']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## PRIMARY: Dataset-Based Testing\n", + "\n", + "### Load Amnesty QA Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "Loading amnesty_qa dataset...\n", + "✓ Loaded 20 samples from amnesty_qa\n", + "✓ Prepared 20 samples for testing\n", + "\n", + "First sample fields:\n", + " user_input: What are the global implications of the USA Supreme Court ruling on abortion?...\n", + " retrieved_contexts: 3 item(s)\n", + " reference: The global implications of the USA Supreme Court ruling on abortion are signific...\n" + ] + } + ], + "source": [ + "from tests.e2e.test_dataset_utils import load_amnesty_dataset_safe\n", + "\n", + "print(\"Loading amnesty_qa dataset...\")\n", + "amnesty_dataset = load_amnesty_dataset_safe(\"english_v3\")\n", + "print(f\"✓ Loaded {len(amnesty_dataset)} samples from amnesty_qa\")\n", + "\n", + "# Convert to format expected by metric using configured fields\n", + "amnesty_test_data = []\n", + "for i, sample in enumerate(amnesty_dataset):\n", + " if i >= 20: # Start with 20 samples, adjust as needed\n", + " break\n", + "\n", + " # Extract only configured fields\n", + " test_sample = {}\n", + " for field in METRIC_CONFIG[\"dataset_fields\"]:\n", + " if field == \"reference_contexts\" and field not in sample:\n", + " # Handle transform case: split retrieved_contexts\n", + " retrieved_contexts = sample.get(\"retrieved_contexts\", [])\n", + " if retrieved_contexts and len(retrieved_contexts) > 1:\n", + " mid = len(retrieved_contexts) // 2\n", + " test_sample[field] = retrieved_contexts[mid:]\n", + " elif field in sample:\n", + " test_sample[field] = sample[field]\n", + " elif field == \"response\":\n", + " # Default for response if not in sample\n", + " test_sample[field] = sample.get(\"response\", \"\")\n", + " elif field == \"reference\":\n", + " # Rename reference_contexts to reference if needed\n", + " test_sample[field] = sample.get(\n", + " \"reference_contexts\", sample.get(\"reference\", \"\")\n", + " )\n", + "\n", + " if test_sample: # Only add if we have data\n", + " amnesty_test_data.append(test_sample)\n", + "\n", + 
"print(f\"✓ Prepared {len(amnesty_test_data)} samples for testing\")\n", + "if amnesty_test_data:\n", + " print(\"\\nFirst sample fields:\")\n", + " first_sample = amnesty_test_data[0]\n", + " for key, value in first_sample.items():\n", + " if isinstance(value, list):\n", + " print(f\" {key}: {len(value)} item(s)\")\n", + " elif isinstance(value, str):\n", + " print(f\" {key}: {value[:80]}...\")\n", + " else:\n", + " print(f\" {key}: {value}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Compare on Amnesty QA (Optimized & Parallel)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "======================================================================\n", + "AMNESTY QA DATASET COMPARISON\n", + "======================================================================\n", + "Dataset: 20 samples\n", + "Mode: Concurrent processing + Parallel metrics\n", + "======================================================================\n", + "Running both metrics in parallel on 20 samples (max 10 concurrent)...\n", + "============================================================\n", + "METRIC COMPARISON SUMMARY\n", + "============================================================\n", + "\n", + "Score Statistics:\n", + " Old Metric Mean: 0.8583\n", + " New Metric Mean: 0.8292\n", + "\n", + "Difference Statistics (new - old):\n", + " Mean Diff: -0.0292\n", + " Max Diff: 0.4167\n", + " Min Diff: -0.5000\n", + " Std Dev: 0.1565\n", + "\n", + "Execution Time:\n", + " Old Metric: 10.74s\n", + " New Metric: 10.18s\n", + " Speedup: 1.06x\n", + "============================================================\n" + ] + } + ], + "source": [ + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"AMNESTY QA DATASET COMPARISON\")\n", + "print(\"=\" * 70)\n", + "print(f\"Dataset: {len(amnesty_test_data)} samples\")\n", + "print(\"Mode: Concurrent 
processing + Parallel metrics\")\n", + "print(\"=\" * 70)\n", + "\n", + "amnesty_result = await compare_metrics(\n", + " old_metric=legacy_metric,\n", + " new_metric=modern_metric,\n", + " dataset=amnesty_test_data,\n", + " old_metric_type=\"old\",\n", + " new_metric_type=\"new\",\n", + " max_concurrent=10,\n", + " parallel_metrics=True,\n", + ")\n", + "\n", + "amnesty_result.print_summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Analyze Amnesty QA Results in Detail\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "======================================================================\n", + "DETAILED STATISTICAL ANALYSIS\n", + "======================================================================\n", + "\n", + "Dataset: amnesty_qa (20 samples)\n", + "\n", + "Score Statistics:\n", + " Legacy Mean: 0.8583\n", + " New Mean: 0.8292\n", + " Score Shift: -0.0292\n", + "\n", + "Difference Statistics:\n", + " Mean |Diff|: 0.0708\n", + " Std Dev: 0.1565\n", + " Max Diff: 0.4167\n", + " Min Diff: -0.5000\n", + " Median Diff: 0.0000\n", + "\n", + "Tolerance Analysis:\n", + " < 0.10: 15/20 ( 75.0%)\n", + " < 0.15: 15/20 ( 75.0%)\n", + " < 0.20: 18/20 ( 90.0%)\n", + " < 0.25: 18/20 ( 90.0%)\n", + " < 0.30: 18/20 ( 90.0%)\n", + "\n", + "======================================================================\n", + "TOP 10 LARGEST DIFFERENCES\n", + "======================================================================\n", + "\n", + "#4: What action did Amnesty International urge its supporters to...\n", + " Legacy: 1.0000 | New: 0.5000 | Diff: 0.5000\n", + "\n", + "#20: When did the government of Qatar start repealing restriction...\n", + " Legacy: 0.5833 | New: 1.0000 | Diff: 0.4167\n", + "\n", + "#7: Which right guarantees access to comprehensive information a...\n", + " Legacy: 1.0000 | New: 0.8333 | Diff: 0.1667\n", 
+ "\n", + "#12: What conditions designate wetlands as Ramsar sites?...\n", + " Legacy: 1.0000 | New: 0.8333 | Diff: 0.1667\n", + "\n", + "#19: What labor abuses were documented by Amnesty International i...\n", + " Legacy: 1.0000 | New: 0.8333 | Diff: 0.1667\n", + "\n", + "#10: When does the prosecution consider statements contrary to th...\n", + " Legacy: 1.0000 | New: 1.0000 | Diff: 0.0000\n", + "\n", + "#1: What are the global implications of the USA Supreme Court ru...\n", + " Legacy: 1.0000 | New: 1.0000 | Diff: 0.0000\n", + "\n", + "#2: Which companies are the main contributors to GHG emissions a...\n", + " Legacy: 1.0000 | New: 1.0000 | Diff: 0.0000\n", + "\n", + "#3: Which private companies in the Americas are the largest GHG ...\n", + " Legacy: 0.8333 | New: 0.8333 | Diff: 0.0000\n", + "\n", + "#5: What are the recommendations made by Amnesty International t...\n", + " Legacy: 0.5833 | New: 0.5833 | Diff: 0.0000\n" + ] + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Get detailed DataFrame\n", + "df_amnesty = amnesty_result.to_dataframe()\n", + "df_amnesty[\"sample_idx\"] = range(len(df_amnesty))\n", + "\n", + "\n", + "# Create description from first available string field in your test data\n", + "def get_description(sample):\n", + " \"\"\"Extract a short description from sample data.\"\"\"\n", + " for key in [\"user_input\", \"response\", \"reference\", \"question\"]:\n", + " if key in sample and isinstance(sample[key], str):\n", + " return sample[key][:60] + \"...\"\n", + " return f\"Sample with {len(sample)} fields\"\n", + "\n", + "\n", + "df_amnesty[\"description\"] = [get_description(s) for s in amnesty_test_data]\n", + "\n", + "# Statistical Analysis\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"DETAILED STATISTICAL ANALYSIS\")\n", + "print(\"=\" * 70)\n", + "print(f\"\\nDataset: amnesty_qa ({len(df_amnesty)} samples)\")\n", + "print(\"\\nScore Statistics:\")\n", + "print(f\" Legacy Mean: 
{amnesty_result.old_mean:.4f}\")\n", + "print(f\" New Mean: {amnesty_result.new_mean:.4f}\")\n", + "print(f\" Score Shift: {amnesty_result.mean_diff:+.4f}\")\n", + "\n", + "print(\"\\nDifference Statistics:\")\n", + "print(f\" Mean |Diff|: {df_amnesty['abs_diff'].mean():.4f}\")\n", + "print(f\" Std Dev: {amnesty_result.std_diff:.4f}\")\n", + "print(f\" Max Diff: {amnesty_result.max_diff:.4f}\")\n", + "print(f\" Min Diff: {amnesty_result.min_diff:.4f}\")\n", + "print(f\" Median Diff: {df_amnesty['abs_diff'].median():.4f}\")\n", + "\n", + "# Tolerance Analysis (adjust for your metric type)\n", + "# For LLM-based metrics: use [0.1, 0.15, 0.2, 0.25, 0.3]\n", + "# For deterministic metrics: use [1e-10, 1e-8, 1e-6, 1e-4, 0.01]\n", + "tolerance_levels = [0.1, 0.15, 0.2, 0.25, 0.3]\n", + "print(\"\\nTolerance Analysis:\")\n", + "for tol in tolerance_levels:\n", + " within = (df_amnesty[\"abs_diff\"] < tol).sum()\n", + " pct = within / len(df_amnesty) * 100\n", + " print(f\" < {tol:.2f}: {within:3d}/{len(df_amnesty)} ({pct:5.1f}%)\")\n", + "\n", + "# Identify problematic cases\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"TOP 10 LARGEST DIFFERENCES\")\n", + "print(\"=\" * 70)\n", + "top_diffs = df_amnesty.nlargest(10, \"abs_diff\")\n", + "for idx, row in top_diffs.iterrows():\n", + " print(f\"\\n#{row['sample_idx'] + 1}: {row['description']}\")\n", + " print(\n", + " f\" Legacy: {row['old_score']:.4f} | New: {row['new_score']:.4f} | Diff: {row['abs_diff']:.4f}\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/2y/02fp70k56p75ldrkgtx7z10r0000gn/T/ipykernel_39797/1485780648.py:59: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.\n", + " ax5.boxplot([df_amnesty[\"old_score\"], df_amnesty[\"new_score\"]], labels=['Legacy', 
'New'])\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "### Visualize Amnesty QA Results\n", + "\n", + "# Comprehensive Visualization\n", + "fig = plt.figure(figsize=(16, 12))\n", + "gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)\n", + "\n", + "# 1. Scatter: Legacy vs New scores\n", + "ax1 = fig.add_subplot(gs[0, 0])\n", + "ax1.scatter(df_amnesty[\"old_score\"], df_amnesty[\"new_score\"], alpha=0.5, s=30)\n", + "ax1.plot([0, 1], [0, 1], \"r--\", label=\"Perfect match\", linewidth=2)\n", + "ax1.set_xlabel(\"Legacy Score\", fontsize=10)\n", + "ax1.set_ylabel(\"New Score\", fontsize=10)\n", + "ax1.set_title(\"Score Correlation\", fontsize=12, fontweight=\"bold\")\n", + "ax1.legend()\n", + "ax1.grid(True, alpha=0.3)\n", + "ax1.set_xlim(-0.05, 1.05)\n", + "ax1.set_ylim(-0.05, 1.05)\n", + "\n", + "# 2. Histogram: Difference distribution\n", + "ax2 = fig.add_subplot(gs[0, 1])\n", + "ax2.hist(df_amnesty[\"diff\"], bins=40, alpha=0.7, edgecolor=\"black\")\n", + "ax2.axvline(x=0, color=\"r\", linestyle=\"--\", linewidth=2, label=\"Zero diff\")\n", + "ax2.axvline(\n", + " x=df_amnesty[\"diff\"].mean(),\n", + " color=\"g\",\n", + " linestyle=\"--\",\n", + " linewidth=2,\n", + " label=f\"Mean: {df_amnesty['diff'].mean():.3f}\",\n", + ")\n", + "ax2.set_xlabel(\"Difference (New - Legacy)\", fontsize=10)\n", + "ax2.set_ylabel(\"Frequency\", fontsize=10)\n", + "ax2.set_title(\"Difference Distribution\", fontsize=12, fontweight=\"bold\")\n", + "ax2.legend()\n", + "ax2.grid(True, alpha=0.3)\n", + "\n", + "# 3. 
Histogram: Absolute difference (log scale for deterministic metrics)\n", + "ax3 = fig.add_subplot(gs[0, 2])\n", + "non_zero_diffs = df_amnesty[df_amnesty[\"abs_diff\"] > 0][\"abs_diff\"]\n", + "if len(non_zero_diffs) > 0:\n", + " ax3.hist(\n", + " np.log10(non_zero_diffs), bins=40, alpha=0.7, color=\"orange\", edgecolor=\"black\"\n", + " )\n", + " ax3.axvline(x=-10, color=\"r\", linestyle=\"--\", linewidth=2, label=\"1e-10 tolerance\")\n", + " ax3.set_xlabel(\"Log10(Absolute Difference)\", fontsize=10)\n", + "else:\n", + " ax3.text(\n", + " 0.5, 0.5, \"All differences are zero!\", ha=\"center\", va=\"center\", fontsize=12\n", + " )\n", + "ax3.set_ylabel(\"Frequency\", fontsize=10)\n", + "ax3.set_title(\"Absolute Difference Distribution (Log)\", fontsize=12, fontweight=\"bold\")\n", + "ax3.legend()\n", + "ax3.grid(True, alpha=0.3)\n", + "\n", + "# 4. Line plot: Score trends\n", + "ax4 = fig.add_subplot(gs[1, :])\n", + "x = df_amnesty[\"sample_idx\"]\n", + "ax4.plot(x, df_amnesty[\"old_score\"], \"o-\", label=\"Legacy\", alpha=0.6, markersize=4)\n", + "ax4.plot(x, df_amnesty[\"new_score\"], \"s-\", label=\"New\", alpha=0.6, markersize=4)\n", + "ax4.set_xlabel(\"Sample Index\", fontsize=10)\n", + "ax4.set_ylabel(\"Score\", fontsize=10)\n", + "ax4.set_title(\"Score Trends Across Dataset\", fontsize=12, fontweight=\"bold\")\n", + "ax4.legend()\n", + "ax4.grid(True, alpha=0.3)\n", + "ax4.set_ylim(-0.05, 1.05)\n", + "\n", + "# 5. Box plots: Score distributions\n", + "ax5 = fig.add_subplot(gs[2, 0])\n", + "ax5.boxplot(\n", + " [df_amnesty[\"old_score\"], df_amnesty[\"new_score\"]], labels=[\"Legacy\", \"New\"]\n", + ")\n", + "ax5.set_ylabel(\"Score\", fontsize=10)\n", + "ax5.set_title(\"Score Distribution Comparison\", fontsize=12, fontweight=\"bold\")\n", + "ax5.grid(True, alpha=0.3, axis=\"y\")\n", + "\n", + "# 6. 
Cumulative distribution of absolute differences\n", + "ax6 = fig.add_subplot(gs[2, 1])\n", + "sorted_diffs = np.sort(df_amnesty[\"abs_diff\"])\n", + "cumulative = np.arange(1, len(sorted_diffs) + 1) / len(sorted_diffs) * 100\n", + "ax6.plot(sorted_diffs, cumulative, linewidth=2)\n", + "ax6.axvline(x=0.2, color=\"r\", linestyle=\"--\", linewidth=2, label=\"0.2 tolerance\")\n", + "ax6.axhline(y=90, color=\"g\", linestyle=\"--\", linewidth=1, alpha=0.5, label=\"90%\")\n", + "ax6.set_xlabel(\"Absolute Difference\", fontsize=10)\n", + "ax6.set_ylabel(\"Cumulative Percentage\", fontsize=10)\n", + "ax6.set_title(\"Cumulative Distribution\", fontsize=12, fontweight=\"bold\")\n", + "ax6.set_xscale(\"log\")\n", + "ax6.legend()\n", + "ax6.grid(True, alpha=0.3)\n", + "\n", + "# 7. Scatter: Difference vs Legacy score\n", + "ax7 = fig.add_subplot(gs[2, 2])\n", + "ax7.scatter(df_amnesty[\"old_score\"], df_amnesty[\"abs_diff\"], alpha=0.5, s=30)\n", + "ax7.axhline(y=0.2, color=\"r\", linestyle=\"--\", linewidth=2, label=\"0.2 tolerance\")\n", + "ax7.set_xlabel(\"Legacy Score\", fontsize=10)\n", + "ax7.set_ylabel(\"Absolute Difference\", fontsize=10)\n", + "ax7.set_title(\"Difference vs Score\", fontsize=12, fontweight=\"bold\")\n", + "ax7.set_yscale(\"log\")\n", + "ax7.legend()\n", + "ax7.grid(True, alpha=0.3)\n", + "\n", + "plt.suptitle(\n", + " f\"Amnesty QA Migration Analysis ({len(df_amnesty)} samples)\",\n", + " fontsize=14,\n", + " fontweight=\"bold\",\n", + " y=0.995,\n", + ")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐ŸŽฏ AMNESTY QA VALIDATION COMPLETE\n", + "======================================================================\n", + " Mean |Diff|: 0.0708\n", + " Within 0.2: 18/20 (90.0%)\n", + " Within 0.3: 18/20 (90.0%)\n", + "\n", + "๐Ÿ“Š Validation Criteria (LLM-based metrics):\n", + " โœ… Mean |diff| < 0.15: 0.0708\n", + " โš ๏ธ 
>90% within 0.2: 90.0%\n", + " โš ๏ธ >95% within 0.3: 90.0%\n", + " โœ… No systematic bias (|mean diff| < 0.05): 0.0292\n", + "\n", + "๐Ÿ’ก For deterministic metrics, use stricter criteria:\n", + " - Mean |diff| < 1e-10\n", + " - 100% within 1e-10\n" + ] + } + ], + "source": [ + "### Validate Amnesty QA Results\n", + "\n", + "print(\"๐ŸŽฏ AMNESTY QA VALIDATION COMPLETE\")\n", + "print(\"=\" * 70)\n", + "print(f\" Mean |Diff|: {df_amnesty['abs_diff'].mean():.4f}\")\n", + "print(\n", + " f\" Within 0.2: {(df_amnesty['abs_diff'] < 0.2).sum()}/{len(df_amnesty)} \"\n", + " f\"({(df_amnesty['abs_diff'] < 0.2).sum() / len(df_amnesty) * 100:.1f}%)\"\n", + ")\n", + "print(\n", + " f\" Within 0.3: {(df_amnesty['abs_diff'] < 0.3).sum()}/{len(df_amnesty)} \"\n", + " f\"({(df_amnesty['abs_diff'] < 0.3).sum() / len(df_amnesty) * 100:.1f}%)\"\n", + ")\n", + "\n", + "# Validation criteria for LLM-based metrics\n", + "# For deterministic metrics, use stricter tolerances (1e-10, 1e-6)\n", + "mean_abs_diff = df_amnesty[\"abs_diff\"].mean()\n", + "pct_within_02 = (df_amnesty[\"abs_diff\"] < 0.2).sum() / len(df_amnesty) * 100\n", + "pct_within_03 = (df_amnesty[\"abs_diff\"] < 0.3).sum() / len(df_amnesty) * 100\n", + "\n", + "print(\"\\n๐Ÿ“Š Validation Criteria (LLM-based metrics):\")\n", + "print(\n", + " f\" {'โœ…' if mean_abs_diff < 0.15 else 'โŒ'} Mean |diff| < 0.15: {mean_abs_diff:.4f}\"\n", + ")\n", + "print(f\" {'โœ…' if pct_within_02 > 90 else 'โš ๏ธ'} >90% within 0.2: {pct_within_02:.1f}%\")\n", + "print(f\" {'โœ…' if pct_within_03 > 95 else 'โš ๏ธ'} >95% within 0.3: {pct_within_03:.1f}%\")\n", + "print(\n", + " f\" {'โœ…' if abs(amnesty_result.mean_diff) < 0.05 else 'โš ๏ธ'} \"\n", + " f\"No systematic bias (|mean diff| < 0.05): {abs(amnesty_result.mean_diff):.4f}\"\n", + ")\n", + "\n", + "print(\"\\n๐Ÿ’ก For deterministic metrics, use stricter criteria:\")\n", + "print(\" - Mean |diff| < 1e-10\")\n", + "print(\" - 100% within 1e-10\")" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## FIQA Dataset Testing (Domain Generalization)\n", + "\n", + "Test on financial Q&A dataset to validate metric works across different domains." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "======================================================================\n", + "FIQA DATASET COMPARISON\n", + "======================================================================\n", + "Testing on financial Q&A dataset for domain generalization...\n", + "โœ“ Loaded 30 samples from fiqa\n", + "โœ“ Prepared 30 samples for testing\n", + "\n", + "First sample fields:\n", + " user_input: How to deposit a cheque issued to an associate in my business into my business a...\n", + " retrieved_contexts: 1 item(s)\n", + " reference: [\"Have the check reissued to the proper payee.Just have the associate sign the b...\n" + ] + } + ], + "source": [ + "### Load FIQA Dataset\n", + "\n", + "from tests.e2e.test_dataset_utils import load_fiqa_dataset_safe\n", + "\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"FIQA DATASET COMPARISON\")\n", + "print(\"=\" * 70)\n", + "print(\"Testing on financial Q&A dataset for domain generalization...\")\n", + "\n", + "fiqa_dataset = load_fiqa_dataset_safe(\"ragas_eval_v3\")\n", + "print(f\"โœ“ Loaded {len(fiqa_dataset)} samples from fiqa\")\n", + "\n", + "# Convert to format expected by metric using configured fields\n", + "fiqa_test_data = []\n", + "for i, sample in enumerate(fiqa_dataset):\n", + " if i >= 30: # Use up to 30 samples from ragas_eval_v3\n", + " break\n", + "\n", + " # Extract only configured fields (same logic as Amnesty QA)\n", + " test_sample = {}\n", + " for field in METRIC_CONFIG[\"dataset_fields\"]:\n", + " if field == \"reference_contexts\" and field not in sample:\n", + " # Handle transform case: split retrieved_contexts\n", + " retrieved_contexts = 
sample.get(\"retrieved_contexts\", [])\n", + " if retrieved_contexts and len(retrieved_contexts) > 1:\n", + " mid = len(retrieved_contexts) // 2\n", + " test_sample[field] = retrieved_contexts[mid:]\n", + " elif retrieved_contexts:\n", + " test_sample[field] = retrieved_contexts\n", + " elif field in sample:\n", + " test_sample[field] = sample[field]\n", + " elif field == \"response\":\n", + " test_sample[field] = sample.get(\"response\", \"\")\n", + " elif field == \"reference\":\n", + " test_sample[field] = sample.get(\n", + " \"reference_contexts\", sample.get(\"reference\", \"\")\n", + " )\n", + "\n", + " if test_sample: # Only add if we have data\n", + " fiqa_test_data.append(test_sample)\n", + "\n", + "print(f\"โœ“ Prepared {len(fiqa_test_data)} samples for testing\")\n", + "if fiqa_test_data:\n", + " print(\"\\nFirst sample fields:\")\n", + " first_sample = fiqa_test_data[0]\n", + " for key, value in first_sample.items():\n", + " if isinstance(value, list):\n", + " print(f\" {key}: {len(value)} item(s)\")\n", + " elif isinstance(value, str):\n", + " print(f\" {key}: {value[:80]}...\")\n", + " else:\n", + " print(f\" {key}: {value}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "======================================================================\n", + "Dataset: 30 samples\n", + "Mode: Concurrent processing + Parallel metrics\n", + "======================================================================\n", + "Running both metrics in parallel on 30 samples (max 10 concurrent)...\n", + "============================================================\n", + "METRIC COMPARISON SUMMARY\n", + "============================================================\n", + "\n", + "Score Statistics:\n", + " Old Metric Mean: 0.8667\n", + " New Metric Mean: 0.8667\n", + "\n", + "Difference Statistics (new - old):\n", + " Mean Diff: 0.0000\n", + " Max Diff: 
1.0000\n", + " Min Diff: -1.0000\n", + " Std Dev: 0.2582\n", + "\n", + "Execution Time:\n", + " Old Metric: 5.70s\n", + " New Metric: 6.35s\n", + " Speedup: 0.90x\n", + "============================================================\n" + ] + } + ], + "source": [ + "### Compare on FIQA (Optimized & Parallel)\n", + "\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(f\"Dataset: {len(fiqa_test_data)} samples\")\n", + "print(\"Mode: Concurrent processing + Parallel metrics\")\n", + "print(\"=\" * 70)\n", + "\n", + "fiqa_result = await compare_metrics(\n", + " old_metric=legacy_metric,\n", + " new_metric=modern_metric,\n", + " dataset=fiqa_test_data,\n", + " old_metric_type=\"old\",\n", + " new_metric_type=\"new\",\n", + " max_concurrent=10,\n", + " parallel_metrics=True,\n", + ")\n", + "\n", + "fiqa_result.print_summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "======================================================================\n", + "DETAILED STATISTICAL ANALYSIS\n", + "======================================================================\n", + "\n", + "Dataset: fiqa (30 samples)\n", + "\n", + "Score Statistics:\n", + " Legacy Mean: 0.8667\n", + " New Mean: 0.8667\n", + " Score Shift: +0.0000\n", + "\n", + "Difference Statistics:\n", + " Mean |Diff|: 0.0667\n", + " Std Dev: 0.2582\n", + " Max Diff: 1.0000\n", + " Min Diff: -1.0000\n", + " Median Diff: 0.0000\n", + "\n", + "Tolerance Analysis:\n", + " < 0.10: 28/30 ( 93.3%)\n", + " < 0.15: 28/30 ( 93.3%)\n", + " < 0.20: 28/30 ( 93.3%)\n", + " < 0.25: 28/30 ( 93.3%)\n", + " < 0.30: 28/30 ( 93.3%)\n", + "\n", + "======================================================================\n", + "TOP 10 LARGEST DIFFERENCES\n", + "======================================================================\n", + "\n", + "#5: 401k Transfer After Business Closure...\n", + " Legacy: 1.0000 | New: 0.0000 | Diff: 
1.0000\n", + "\n", + "#24: Employer options when setting up 401k for employees...\n", + " Legacy: 0.0000 | New: 1.0000 | Diff: 1.0000\n", + "\n", + "#1: How to deposit a cheque issued to an associate in my busines...\n", + " Legacy: 1.0000 | New: 1.0000 | Diff: 0.0000\n", + "\n", + "#2: Can I send a money order from USPS as a business?...\n", + " Legacy: 1.0000 | New: 1.0000 | Diff: 0.0000\n", + "\n", + "#3: 1 EIN doing business under multiple business names...\n", + " Legacy: 1.0000 | New: 1.0000 | Diff: 0.0000\n", + "\n", + "#4: Applying for and receiving business credit...\n", + " Legacy: 1.0000 | New: 1.0000 | Diff: 0.0000\n", + "\n", + "#6: What are the ins/outs of writing equipment purchases off as ...\n", + " Legacy: 1.0000 | New: 1.0000 | Diff: 0.0000\n", + "\n", + "#7: Can a entrepreneur hire a self-employed business owner?...\n", + " Legacy: 1.0000 | New: 1.0000 | Diff: 0.0000\n", + "\n", + "#8: Intentions of Deductible Amount for Small Business...\n", + " Legacy: 0.0000 | New: 0.0000 | Diff: 0.0000\n", + "\n", + "#9: How can I deposit a check made out to my business into my pe...\n", + " Legacy: 1.0000 | New: 1.0000 | Diff: 0.0000\n" + ] + } + ], + "source": [ + "### Analyze FIQA Results in Detail\n", + "\n", + "# Get detailed DataFrame\n", + "df_fiqa = fiqa_result.to_dataframe()\n", + "df_fiqa[\"sample_idx\"] = range(len(df_fiqa))\n", + "df_fiqa[\"description\"] = [get_description(s) for s in fiqa_test_data]\n", + "\n", + "# Statistical Analysis\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"DETAILED STATISTICAL ANALYSIS\")\n", + "print(\"=\" * 70)\n", + "print(f\"\\nDataset: fiqa ({len(df_fiqa)} samples)\")\n", + "print(\"\\nScore Statistics:\")\n", + "print(f\" Legacy Mean: {fiqa_result.old_mean:.4f}\")\n", + "print(f\" New Mean: {fiqa_result.new_mean:.4f}\")\n", + "print(f\" Score Shift: {fiqa_result.mean_diff:+.4f}\")\n", + "\n", + "print(\"\\nDifference Statistics:\")\n", + "print(f\" Mean |Diff|: {df_fiqa['abs_diff'].mean():.4f}\")\n", + 
"print(f\" Std Dev: {fiqa_result.std_diff:.4f}\")\n", + "print(f\" Max Diff: {fiqa_result.max_diff:.4f}\")\n", + "print(f\" Min Diff: {fiqa_result.min_diff:.4f}\")\n", + "print(f\" Median Diff: {df_fiqa['abs_diff'].median():.4f}\")\n", + "\n", + "# Tolerance Analysis (adjust for your metric type)\n", + "# For LLM-based metrics: use [0.1, 0.15, 0.2, 0.25, 0.3]\n", + "# For deterministic metrics: use [1e-10, 1e-8, 1e-6, 1e-4, 0.01]\n", + "tolerance_levels = [0.1, 0.15, 0.2, 0.25, 0.3]\n", + "print(\"\\nTolerance Analysis:\")\n", + "for tol in tolerance_levels:\n", + " within = (df_fiqa[\"abs_diff\"] < tol).sum()\n", + " pct = within / len(df_fiqa) * 100\n", + " print(f\" < {tol:.2f}: {within:3d}/{len(df_fiqa)} ({pct:5.1f}%)\")\n", + "\n", + "# Identify problematic cases\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"TOP 10 LARGEST DIFFERENCES\")\n", + "print(\"=\" * 70)\n", + "top_diffs = df_fiqa.nlargest(10, \"abs_diff\")\n", + "for idx, row in top_diffs.iterrows():\n", + " print(f\"\\n#{row['sample_idx'] + 1}: {row['description']}\")\n", + " print(\n", + " f\" Legacy: {row['old_score']:.4f} | New: {row['new_score']:.4f} | Diff: {row['abs_diff']:.4f}\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/2y/02fp70k56p75ldrkgtx7z10r0000gn/T/ipykernel_39797/2878535787.py:59: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.\n", + " ax5.boxplot([df_fiqa[\"old_score\"], df_fiqa[\"new_score\"]], labels=['Legacy', 'New'])\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "### Visualize FIQA Results\n", + "\n", + "# Comprehensive Visualization\n", + "fig = plt.figure(figsize=(16, 12))\n", + "gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)\n", + "\n", + "# 1. Scatter: Legacy vs New scores\n", + "ax1 = fig.add_subplot(gs[0, 0])\n", + "ax1.scatter(df_fiqa[\"old_score\"], df_fiqa[\"new_score\"], alpha=0.5, s=30)\n", + "ax1.plot([0, 1], [0, 1], \"r--\", label=\"Perfect match\", linewidth=2)\n", + "ax1.set_xlabel(\"Legacy Score\", fontsize=10)\n", + "ax1.set_ylabel(\"New Score\", fontsize=10)\n", + "ax1.set_title(\"Score Correlation\", fontsize=12, fontweight=\"bold\")\n", + "ax1.legend()\n", + "ax1.grid(True, alpha=0.3)\n", + "ax1.set_xlim(-0.05, 1.05)\n", + "ax1.set_ylim(-0.05, 1.05)\n", + "\n", + "# 2. Histogram: Difference distribution\n", + "ax2 = fig.add_subplot(gs[0, 1])\n", + "ax2.hist(df_fiqa[\"diff\"], bins=40, alpha=0.7, edgecolor=\"black\")\n", + "ax2.axvline(x=0, color=\"r\", linestyle=\"--\", linewidth=2, label=\"Zero diff\")\n", + "ax2.axvline(\n", + " x=df_fiqa[\"diff\"].mean(),\n", + " color=\"g\",\n", + " linestyle=\"--\",\n", + " linewidth=2,\n", + " label=f\"Mean: {df_fiqa['diff'].mean():.3f}\",\n", + ")\n", + "ax2.set_xlabel(\"Difference (New - Legacy)\", fontsize=10)\n", + "ax2.set_ylabel(\"Frequency\", fontsize=10)\n", + "ax2.set_title(\"Difference Distribution\", fontsize=12, fontweight=\"bold\")\n", + "ax2.legend()\n", + "ax2.grid(True, alpha=0.3)\n", + "\n", + "# 3. 
Histogram: Absolute difference (log scale for deterministic metrics)\n", + "ax3 = fig.add_subplot(gs[0, 2])\n", + "non_zero_diffs = df_fiqa[df_fiqa[\"abs_diff\"] > 0][\"abs_diff\"]\n", + "if len(non_zero_diffs) > 0:\n", + " ax3.hist(\n", + " np.log10(non_zero_diffs), bins=40, alpha=0.7, color=\"orange\", edgecolor=\"black\"\n", + " )\n", + " ax3.axvline(x=-10, color=\"r\", linestyle=\"--\", linewidth=2, label=\"1e-10 tolerance\")\n", + " ax3.set_xlabel(\"Log10(Absolute Difference)\", fontsize=10)\n", + "else:\n", + " ax3.text(\n", + " 0.5, 0.5, \"All differences are zero!\", ha=\"center\", va=\"center\", fontsize=12\n", + " )\n", + "ax3.set_ylabel(\"Frequency\", fontsize=10)\n", + "ax3.set_title(\"Absolute Difference Distribution (Log)\", fontsize=12, fontweight=\"bold\")\n", + "ax3.legend()\n", + "ax3.grid(True, alpha=0.3)\n", + "\n", + "# 4. Line plot: Score trends\n", + "ax4 = fig.add_subplot(gs[1, :])\n", + "x = df_fiqa[\"sample_idx\"]\n", + "ax4.plot(x, df_fiqa[\"old_score\"], \"o-\", label=\"Legacy\", alpha=0.6, markersize=4)\n", + "ax4.plot(x, df_fiqa[\"new_score\"], \"s-\", label=\"New\", alpha=0.6, markersize=4)\n", + "ax4.set_xlabel(\"Sample Index\", fontsize=10)\n", + "ax4.set_ylabel(\"Score\", fontsize=10)\n", + "ax4.set_title(\"Score Trends Across Dataset\", fontsize=12, fontweight=\"bold\")\n", + "ax4.legend()\n", + "ax4.grid(True, alpha=0.3)\n", + "ax4.set_ylim(-0.05, 1.05)\n", + "\n", + "# 5. Box plots: Score distributions\n", + "ax5 = fig.add_subplot(gs[2, 0])\n", + "ax5.boxplot([df_fiqa[\"old_score\"], df_fiqa[\"new_score\"]], labels=[\"Legacy\", \"New\"])\n", + "ax5.set_ylabel(\"Score\", fontsize=10)\n", + "ax5.set_title(\"Score Distribution Comparison\", fontsize=12, fontweight=\"bold\")\n", + "ax5.grid(True, alpha=0.3, axis=\"y\")\n", + "\n", + "# 6. 
Cumulative distribution of absolute differences\n", + "ax6 = fig.add_subplot(gs[2, 1])\n", + "sorted_diffs = np.sort(df_fiqa[\"abs_diff\"])\n", + "cumulative = np.arange(1, len(sorted_diffs) + 1) / len(sorted_diffs) * 100\n", + "ax6.plot(sorted_diffs, cumulative, linewidth=2)\n", + "ax6.axvline(x=0.2, color=\"r\", linestyle=\"--\", linewidth=2, label=\"0.2 tolerance\")\n", + "ax6.axhline(y=90, color=\"g\", linestyle=\"--\", linewidth=1, alpha=0.5, label=\"90%\")\n", + "ax6.set_xlabel(\"Absolute Difference\", fontsize=10)\n", + "ax6.set_ylabel(\"Cumulative Percentage\", fontsize=10)\n", + "ax6.set_title(\"Cumulative Distribution\", fontsize=12, fontweight=\"bold\")\n", + "ax6.set_xscale(\"log\")\n", + "ax6.legend()\n", + "ax6.grid(True, alpha=0.3)\n", + "\n", + "# 7. Scatter: Difference vs Legacy score\n", + "ax7 = fig.add_subplot(gs[2, 2])\n", + "ax7.scatter(df_fiqa[\"old_score\"], df_fiqa[\"abs_diff\"], alpha=0.5, s=30)\n", + "ax7.axhline(y=0.2, color=\"r\", linestyle=\"--\", linewidth=2, label=\"0.2 tolerance\")\n", + "ax7.set_xlabel(\"Legacy Score\", fontsize=10)\n", + "ax7.set_ylabel(\"Absolute Difference\", fontsize=10)\n", + "ax7.set_title(\"Difference vs Score\", fontsize=12, fontweight=\"bold\")\n", + "ax7.set_yscale(\"log\")\n", + "ax7.legend()\n", + "ax7.grid(True, alpha=0.3)\n", + "\n", + "plt.suptitle(\n", + " f\"FIQA Migration Analysis ({len(df_fiqa)} samples)\",\n", + " fontsize=14,\n", + " fontweight=\"bold\",\n", + " y=0.995,\n", + ")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "๐ŸŽฏ FIQA VALIDATION COMPLETE\n", + "======================================================================\n", + " Mean |Diff|: 0.0667\n", + " Within 0.2: 28/30 (93.3%)\n", + " Within 0.3: 28/30 (93.3%)\n", + "\n", + "๐Ÿ“Š Validation Criteria (LLM-based metrics):\n", + " โœ… Mean |diff| < 0.15: 0.0667\n", + " โœ… >90% within 0.2: 93.3%\n", + 
" โš ๏ธ >95% within 0.3: 93.3%\n", + " โœ… No systematic bias (|mean diff| < 0.05): 0.0000\n", + "\n", + "๐Ÿ’ก Domain Generalization Check:\n", + " โœ… Amnesty QA Mean |Diff|: 0.0708\n", + " โœ… FIQA Mean |Diff|: 0.0667\n", + " โœ… Consistent across domains\n" + ] + } + ], + "source": [ + "### Validate FIQA Results\n", + "\n", + "print(\"๐ŸŽฏ FIQA VALIDATION COMPLETE\")\n", + "print(\"=\" * 70)\n", + "print(f\" Mean |Diff|: {df_fiqa['abs_diff'].mean():.4f}\")\n", + "print(\n", + " f\" Within 0.2: {(df_fiqa['abs_diff'] < 0.2).sum()}/{len(df_fiqa)} \"\n", + " f\"({(df_fiqa['abs_diff'] < 0.2).sum() / len(df_fiqa) * 100:.1f}%)\"\n", + ")\n", + "print(\n", + " f\" Within 0.3: {(df_fiqa['abs_diff'] < 0.3).sum()}/{len(df_fiqa)} \"\n", + " f\"({(df_fiqa['abs_diff'] < 0.3).sum() / len(df_fiqa) * 100:.1f}%)\"\n", + ")\n", + "\n", + "# Validation criteria for LLM-based metrics\n", + "mean_abs_diff = df_fiqa[\"abs_diff\"].mean()\n", + "pct_within_02 = (df_fiqa[\"abs_diff\"] < 0.2).sum() / len(df_fiqa) * 100\n", + "pct_within_03 = (df_fiqa[\"abs_diff\"] < 0.3).sum() / len(df_fiqa) * 100\n", + "\n", + "print(\"\\n๐Ÿ“Š Validation Criteria (LLM-based metrics):\")\n", + "print(\n", + " f\" {'โœ…' if mean_abs_diff < 0.15 else 'โŒ'} Mean |diff| < 0.15: {mean_abs_diff:.4f}\"\n", + ")\n", + "print(f\" {'โœ…' if pct_within_02 > 90 else 'โš ๏ธ'} >90% within 0.2: {pct_within_02:.1f}%\")\n", + "print(f\" {'โœ…' if pct_within_03 > 95 else 'โš ๏ธ'} >95% within 0.3: {pct_within_03:.1f}%\")\n", + "print(\n", + " f\" {'โœ…' if abs(fiqa_result.mean_diff) < 0.05 else 'โš ๏ธ'} \"\n", + " f\"No systematic bias (|mean diff| < 0.05): {abs(fiqa_result.mean_diff):.4f}\"\n", + ")\n", + "\n", + "print(\"\\n๐Ÿ’ก Domain Generalization Check:\")\n", + "print(f\" โœ… Amnesty QA Mean |Diff|: {df_amnesty['abs_diff'].mean():.4f}\")\n", + "print(f\" โœ… FIQA Mean |Diff|: {df_fiqa['abs_diff'].mean():.4f}\")\n", + "print(\n", + " f\" {'โœ…' if abs(df_amnesty['abs_diff'].mean() - df_fiqa['abs_diff'].mean()) < 
0.1 else 'โš ๏ธ'} \"\n", + " f\"Consistent across domains\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/tests/e2e/metrics_migration/plan-for-metrics-migration.md b/tests/e2e/metrics_migration/plan-for-metrics-migration.md new file mode 100644 index 000000000..d5103b2e9 --- /dev/null +++ b/tests/e2e/metrics_migration/plan-for-metrics-migration.md @@ -0,0 +1,798 @@ +# Comprehensive Generalizable Metrics Migration Plan + +## Overview +This document provides a complete, step-by-step plan for migrating any metric from legacy implementation to the modern collections pattern, incorporating all learnings from Context Recall migration, test infrastructure refactoring, and notebook-based testing approaches. + +--- + +## Phase 0: Pre-Migration Study & Planning + +### Study Existing Migrated Metrics + +**Metrics to analyze**: +1. Answer Relevancy (LLM + Embeddings based) +2. Answer Similarity (Embeddings only) +3. BLEU/ROUGE (No LLM/embeddings) +4. String metrics (Simple comparison) +5. Context Recall (LLM with statement classification) + +**What to look for in legacy metrics** (`src/ragas/metrics/_*.py`): +- [ ] **Core algorithm logic**: How is the score calculated? +- [ ] **LLM/Embeddings usage**: Which components are required? +- [ ] **Prompt structure**: PydanticPrompt classes and examples +- [ ] **Input parameters**: What data does it need? +- [ ] **Edge cases**: How are empty inputs, errors handled? 
+- [ ] **Ensembling**: Does it run multiple times and aggregate? +- [ ] **Deprecated methods**: Old APIs to maintain compatibility with +- [ ] **Output format**: Float score vs structured output + +**Important patterns from legacy**: +1. `_single_turn_ascore()` is the main method to replicate +2. `MetricWithLLM`, `MetricWithEmbeddings` mixins show dependencies +3. `PydanticPrompt` examples become inline examples in new prompts +4. Score normalization and range validation (0.0-1.0) +5. Error handling and nan score returns + +--- + +## Phase 1: Implement New Metric + +### 1.1 Create Prompt Function +**File**: `src/ragas/prompts/metrics/{metric_name}.py` + +**Structure**: +```python +"""Prompt for {MetricName} evaluation.""" + +import json + +def {metric_name}_prompt(param1: str, param2: str, ...) -> str: + """ + Generate prompt for {metric_name} evaluation. + + Args: + param1: Description + param2: Description + + Returns: + Formatted prompt string for LLM + """ + # Use json.dumps() for safe string escaping + safe_param1 = json.dumps(param1) + safe_param2 = json.dumps(param2) + + return f"""Task description here. + +--------EXAMPLES----------- +Example 1 +Input: {{ + "param1": "example value", + "param2": "example value" +}} +Output: {{ + "result": "expected output format" +}} + +Example 2 +[Add 2-3 examples covering different scenarios] +----------------------------- + +Now perform the same with the following input +Input: {{ + "param1": {safe_param1}, + "param2": {safe_param2} +}} +Output: """ +``` + +**Key points**: +- Use `json.dumps()` for escaping user inputs +- Include 2-3 examples showing different cases +- Clear output format specification +- Match the logic from legacy PydanticPrompt + +### 1.2 Define Output Models +**File**: `src/ragas/metrics/collections/_{metric_name}.py` + +```python +from pydantic import BaseModel +import typing as t + +class {MetricName}Item(BaseModel): + """Single classification/item result.""" + field1: str + field2: int + # ... 
based on legacy output model + +class {MetricName}Output(BaseModel): + """Complete structured output.""" + items: t.List[{MetricName}Item] + # or whatever structure the LLM returns +``` + +**Guidelines**: +- Match field names from legacy output models +- Use appropriate types (str, int, float, List, etc.) +- Add docstrings for clarity + +### 1.3 Implement Metric Class +**File**: `src/ragas/metrics/collections/_{metric_name}.py` + +```python +"""MetricName v2 - Modern implementation with instructor LLMs.""" + +import typing as t +import numpy as np +from ragas.metrics.collections.base import BaseMetric +from ragas.metrics.result import MetricResult +from ragas.prompts.metrics.{metric_name} import {metric_name}_prompt + +if t.TYPE_CHECKING: + from ragas.llms.base import InstructorBaseRagasLLM + from ragas.embeddings.base import BaseRagasEmbeddings + +class {MetricName}(BaseMetric): + """ + {Metric description - what it measures}. + + This implementation uses modern instructor LLMs with structured output. + Only supports modern components - legacy wrappers are rejected with clear errors. 
+ + Usage: + >>> from openai import AsyncOpenAI + >>> from ragas.llms.base import instructor_llm_factory + >>> from ragas.metrics.collections import {MetricName} + >>> + >>> client = AsyncOpenAI() + >>> llm = instructor_llm_factory("openai", client=client, model="gpt-4o-mini") + >>> + >>> metric = {MetricName}(llm=llm) + >>> result = await metric.ascore(param1="value1", param2="value2") + >>> print(f"Score: {result.value}") + + Attributes: + llm: Modern instructor-based LLM (if needed) + embeddings: Modern embeddings (if needed) + name: Metric name + allowed_values: Score range (0.0 to 1.0) + """ + + # Type hints for components + llm: "InstructorBaseRagasLLM" # If LLM-based + embeddings: "BaseRagasEmbeddings" # If embeddings-based + + def __init__( + self, + llm: t.Optional["InstructorBaseRagasLLM"] = None, + embeddings: t.Optional["BaseRagasEmbeddings"] = None, + name: str = "{metric_name}", + **kwargs, + ): + """Initialize metric with required components.""" + # Set attributes before super() for validation + if llm: + self.llm = llm + if embeddings: + self.embeddings = embeddings + + # BaseMetric validates components are modern (not legacy wrappers) + super().__init__(name=name, **kwargs) + + async def ascore( + self, + param1: str, + param2: str, + # ... other parameters based on metric needs + ) -> MetricResult: + """ + Calculate score asynchronously. + + Args: + param1: Description + param2: Description + + Returns: + MetricResult with score (0.0-1.0) + """ + # 1. Validate inputs (handle empty/None cases) + if not param1 or not param2: + return MetricResult(value=0.0) + + # 2. For LLM-based metrics: Generate prompt and get structured output + prompt = {metric_name}_prompt(param1=param1, param2=param2) + output = await self.llm.agenerate(prompt, {MetricName}Output) + + # 3. 
For embeddings-based metrics: Get embeddings and compute similarity + # embedding1 = await self.embeddings.embed_text(param1) + # embedding2 = await self.embeddings.embed_text(param2) + # score = cosine_similarity(embedding1, embedding2) + + # 4. Calculate score from output (match legacy logic exactly) + score = self._calculate_score(output) + + # 5. Return MetricResult + return MetricResult(value=float(score)) + + def _calculate_score(self, output: {MetricName}Output) -> float: + """Calculate final score from LLM output.""" + # Implement exact logic from legacy _single_turn_ascore + # This is where the core algorithm lives + pass +``` + +**Key patterns**: +- `__init__` sets attributes before calling `super().__init__()` so `BaseMetric` can validate them +- `ascore()` is the main public method (not `_single_turn_ascore`) +- Return a `MetricResult`, not a raw float +- Match legacy calculation logic exactly +- Handle edge cases (empty inputs, None values) +- Import type-hint-only dependencies under `TYPE_CHECKING` to avoid circular imports + +### 1.4 Update Exports +**File**: `src/ragas/metrics/collections/__init__.py` + +```python +from ._metric_name import MetricName + +__all__ = [ + # ... existing exports + "MetricName", +] +``` + +--- + +## Phase 2: Manual Testing with General-Purpose Notebook + +### 2.1 Use General-Purpose Testing Notebook + +**File**: `tests/notebooks/metric_score_diff.ipynb` (already exists - reusable for all metrics) + +**Purpose**: Validate migration on real-world datasets (PRIMARY) and test edge cases (SECONDARY) + +**Testing Priority**: +1. **PRIMARY**: Large-scale dataset testing (amnesty_qa, fiqa) - proves migration quality +2. **SECONDARY**: Hand-crafted edge cases - validates specific behaviors + +**Key Advantage**: This notebook is configuration-driven. You only need to edit ONE cell (Cell 2) with your metric configuration, then run all cells without any other modifications! 
+ +**What the notebook provides**: +- Automatic component creation (LLM/embeddings) based on your needs +- Dynamic metric loading from your configuration +- Dataset-based testing (Amnesty QA + FIQA) +- Comprehensive statistical analysis and visualizations +- Validation criteria checking +- Optional edge case testing + +--- + +### 2.2 Generate Metric Configuration + +Generate the `METRIC_CONFIG` dictionary for Cell 2 of the notebook. Print it to console for easy copy-pasting. Use the template below based on your metric type: + +#### Configuration Template + +```python +METRIC_CONFIG = { + # ===== METRIC IMPORTS ===== + "legacy_import": { + "module": "ragas.metrics._{legacy_module_name}", # e.g., "ragas.metrics._answer_relevance" + "class_name": "{LegacyMetricClassName}", # e.g., "AnswerRelevancy" + }, + "modern_import": { + "module": "ragas.metrics.collections", + "class_name": "{ModernMetricClassName}", # e.g., "AnswerRelevancy" + }, + + # ===== COMPONENT REQUIREMENTS ===== + # Set to False if your metric doesn't need this component + "needs_llm": True, # Does your metric use an LLM? + "needs_embeddings": True, # Does your metric use embeddings? + + # ===== DATASET FIELD MAPPING ===== + # Choose ONE option based on your metric type (uncomment the appropriate one) + + # OPTION 1: Answer-based metrics (AnswerRelevancy, AnswerSimilarity, AnswerCorrectness, etc.) + "dataset_fields": ["user_input", "response"], + + # OPTION 2: Context-based metrics (ContextRecall, ContextPrecision, Faithfulness, etc.) + # "dataset_fields": ["user_input", "retrieved_contexts", "reference"], + + # OPTION 3: Deterministic/Non-LLM metrics (NonLLMContextRecall, etc.) 
+ # "dataset_fields": ["retrieved_contexts", "reference_contexts"], +} +``` + +#### Configuration Examples + +**Example 1: AnswerRelevancy (LLM + Embeddings)** +```python +METRIC_CONFIG = { + "legacy_import": { + "module": "ragas.metrics._answer_relevance", + "class_name": "AnswerRelevancy", + }, + "modern_import": { + "module": "ragas.metrics.collections", + "class_name": "AnswerRelevancy", + }, + "needs_llm": True, + "needs_embeddings": True, + "dataset_fields": ["user_input", "response"], +} +``` + +**Example 2: ContextRecall (LLM only)** +```python +METRIC_CONFIG = { + "legacy_import": { + "module": "ragas.metrics._context_recall", + "class_name": "ContextRecall", + }, + "modern_import": { + "module": "ragas.metrics.collections", + "class_name": "ContextRecall", + }, + "needs_llm": True, + "needs_embeddings": False, + "dataset_fields": ["user_input", "retrieved_contexts", "reference"], +} +``` + +**Example 3: NonLLMContextRecall (No LLM/Embeddings)** +```python +METRIC_CONFIG = { + "legacy_import": { + "module": "ragas.metrics._context_recall", + "class_name": "NonLLMContextRecall", + }, + "modern_import": { + "module": "ragas.metrics.collections", + "class_name": "NonLLMContextRecall", + }, + "needs_llm": False, + "needs_embeddings": False, + "dataset_fields": ["retrieved_contexts", "reference_contexts"], +} +``` + +**Example 4: ContextPrecision (LLM only)** +```python +METRIC_CONFIG = { + "legacy_import": { + "module": "ragas.metrics._context_precision", + "class_name": "ContextPrecision", + }, + "modern_import": { + "module": "ragas.metrics.collections", + "class_name": "ContextPrecision", + }, + "needs_llm": True, + "needs_embeddings": False, + "dataset_fields": ["user_input", "retrieved_contexts", "reference"], +} +``` + +#### How to Choose `dataset_fields` + +The `dataset_fields` list tells the notebook which fields to extract from the test datasets (Amnesty QA, FIQA) for your metric: + +1. 
**Answer-based metrics**: Use `["user_input", "response"]` + - Metrics that evaluate the quality of generated answers + - Examples: AnswerRelevancy, AnswerSimilarity, AnswerCorrectness + +2. **Context-based metrics**: Use `["user_input", "retrieved_contexts", "reference"]` + - Metrics that evaluate retrieved context quality + - Examples: ContextRecall, ContextPrecision, Faithfulness + +3. **Deterministic metrics**: Use `["retrieved_contexts", "reference_contexts"]` + - Metrics that don't use LLMs and compare contexts directly + - Examples: NonLLMContextRecall + - Note: The notebook will automatically split `retrieved_contexts` to create `reference_contexts` if needed + +**Available dataset fields**: +- **Amnesty QA**: `user_input`, `response`, `retrieved_contexts`, `reference_contexts` +- **FIQA**: `user_input`, `response`, `retrieved_contexts`, `reference` + +--- + +### 2.3 Run Notebook and Analyze Results + +**Steps**: + +1. **Open the notebook**: `tests/notebooks/metric_score_diff.ipynb` + +2. **Edit Cell 2**: Replace the `METRIC_CONFIG` dictionary with your generated configuration from Section 2.2 + +3. **Run all cells**: The notebook handles everything automatically: + - Loads your metric classes dynamically + - Creates only the required components (LLM/embeddings) + - Initializes both legacy and modern metrics + - Loads and transforms datasets based on your `dataset_fields` + - Runs concurrent comparisons on Amnesty QA and FIQA + - Generates comprehensive statistical analysis + - Creates 7-plot visualizations for each dataset + - Validates results against migration criteria + +4. **Review results**: The notebook displays inline: + - Score comparison statistics (mean, std dev, differences) + - Tolerance analysis (% of samples within various thresholds) + - Top 10 largest differences with descriptions + - Comprehensive visualizations (scatter, histograms, trends, distributions) + - Validation criteria checkmarks (โœ…/โŒ) + +5. 
**Iterate if needed**: + - If scores don't match well, review the problematic cases + - Adjust your metric implementation + - Re-run the notebook to verify improvements + +6. **Document findings**: Print a migration summary with the following information: + - Mean absolute difference + - Percentage of samples within tolerance + - Recommended tolerance level + - Any patterns or anomalies observed + - Edge cases that need special handling + - Key implementation details and algorithm differences + +**Output approach**: Print the METRIC_CONFIG and migration summary directly to console/output instead of creating files. This allows for easy copy-pasting without cluttering the repository. + +--- + +--- + +### 2.4 Migration Validation Criteria + +After running the notebook, the migration is considered successful if: + +**Amnesty QA Dataset** (PRIMARY criterion): +- ✅ Mean absolute difference < 0.15 (stricter than per-case tolerance) +- ✅ >90% of samples within 0.2 tolerance for LLM-based metrics +- ✅ >95% of samples within 1e-6 tolerance for deterministic metrics +- ✅ No systematic bias (mean diff close to 0, ideally < 0.05) +- ✅ Similar score distributions (check box plots and histograms) + +**FIQA Dataset** (if available): +- ✅ Similar criteria as amnesty_qa +- ✅ Validates generalization across different domains + +**Edge Cases** (SECONDARY criterion): +- ✅ All edge cases are handled gracefully (no crashes) +- ✅ Empty inputs return 0.0 or are handled appropriately +- ✅ Special characters don't break the metric + +**Performance**: +- ✅ New implementation not significantly slower (< 2x) +- ✅ Concurrent processing works correctly + +**Documentation**: +For the migration, review and document in the notebook: +- Dataset comparison statistics (displayed inline) +- Top 10 largest differences with analysis (displayed inline) +- Visual analysis with 7 comprehensive plots (displayed inline) +- Any patterns or anomalies observed +- Recommended tolerance for E2E tests 
+ +**This becomes the proof that migration works correctly!** + +**Note**: All results are displayed inline in the notebook - no CSV or PNG files are saved. + +--- + +## Phase 3: Write E2E Migration Tests + +### 3.1 Create Test File +**File**: `tests/e2e/metrics_migration/test_{metric_name}_migration.py` + +**Structure**: +```python +"""E2E tests for {MetricName} migration from v1 to v2.""" + +import pytest + +from ragas.metrics import {LegacyMetricName} +from ragas.metrics.collections import {MetricName} + +from .base_migration_test import BaseMigrationTest + +class Test{MetricName}E2EMigration(BaseMigrationTest): + """E2E compatibility tests between legacy and v2 implementations.""" + + @pytest.fixture + def sample_data(self): + """Test cases for {metric_name} evaluation. + + Based on dataset testing in notebook: tests/notebooks/metric_score_diff.ipynb + + Dataset validation results: + - Amnesty QA: Mean |diff|={mean_diff:.4f}, {pct_within_tolerance}% within tolerance + - FIQA: Mean |diff|={mean_diff:.4f}, {pct_within_tolerance}% within tolerance (if tested) + + These test cases focus on edge cases and specific behaviors not fully covered by datasets. + The primary validation comes from the dataset comparisons documented in the notebook. 
+ """ + return [ + # Edge cases from notebook testing + # Cases with interesting/problematic behavior from dataset analysis + # Specific scenarios requiring validation + { + "param1": "value1", + "param2": "value2", + "description": "Test case description", + }, + ] + + @pytest.mark.asyncio + async def test_legacy_vs_v2_e2e_compatibility( + self, + sample_data, + legacy_llm, # from conftest.py + modern_llm, # from conftest.py + legacy_embeddings, # if needed + modern_embeddings, # if needed + ): + """E2E test that legacy and v2 produce similar scores.""" + await self.run_e2e_compatibility_test( + sample_data=sample_data, + legacy_metric_factory={LegacyMetricName}, + v2_metric_factory={MetricName}, + legacy_components={"llm": legacy_llm, "embeddings": legacy_embeddings}, + v2_components={"llm": modern_llm, "embeddings": modern_embeddings}, + tolerance=0.2, # Adjust based on notebook findings + metric_name="{MetricName}", + additional_info_keys=["param1", "param2"], # For debug output + ) + + @pytest.mark.asyncio + async def test_{metric_specific_behavior}( + self, + legacy_llm, + modern_llm, + ): + """Test metric-specific behavior.""" + + test_cases = [ + { + "param1": "specific case", + "param2": "for testing", + "expected_high": True, # or other expected behavior + "description": "Specific behavior description", + }, + # Add 2-3 cases testing specific behaviors + ] + + def assertion_fn(case, legacy_score, v2_result): + """Custom assertions for metric-specific behavior.""" + if case.get("expected_high"): + assert legacy_score > 0.8 + assert v2_result.value > 0.8 + print(" โœ… High score as expected") + # Add other assertions based on metric logic + + await self.run_metric_specific_test( + test_cases=test_cases, + legacy_metric_factory={LegacyMetricName}, + v2_metric_factory={MetricName}, + legacy_components={"llm": legacy_llm}, + v2_components={"llm": modern_llm}, + test_name="{specific behavior}", + assertion_fn=assertion_fn, + ) + + def 
test_migration_requirements_documented(self): + """Document requirements for running E2E tests.""" + requirements = { + "llm": "OpenAI GPT or compatible LLM", + "embeddings": "OpenAI embeddings (if needed)", + "environment": "API keys configured", + "purpose": "Verify v2 produces similar scores to legacy", + } + + self.create_requirements_documentation( + metric_name="{MetricName}", + requirements=requirements, + test_file_name="test_{metric_name}_migration.py", + ) + + assert True +``` + +**Key points**: +- Inherit from `BaseMigrationTest` for reusable test methods +- Use fixtures from `conftest.py` (no local fixture definitions) +- `sample_data` comes from notebook testing (working cases) +- Tolerance based on notebook findings +- Add metric-specific behavior tests +- Document requirements + +### 3.2 Run Tests +```bash +# Run the new tests +uv run pytest tests/e2e/metrics_migration/test_{metric_name}_migration.py -v -s + +# Check they collect properly +uv run pytest tests/e2e/metrics_migration/test_{metric_name}_migration.py --collect-only +``` + +--- + +## Phase 4: Code Quality & Finalization + +### 4.1 Run Linting & Formatting +```bash +# Format code +make format + +# Type check +make type + +# Quick health check +make check +``` + +### 4.2 Run All Tests +```bash +# Unit tests +make test + +# E2E tests +make test-e2e + +# Or run specific test +uv run pytest tests/e2e/metrics_migration/ -v +``` + +### 4.3 Update Documentation +**File**: `docs/howtos/migrations/{metric_name}.md` (if needed) + +Document: +- Migration rationale +- API changes +- Usage examples (before/after) +- Breaking changes (if any) + +### 4.4 Create PR Checklist +- [ ] New metric implementation complete +- [ ] Prompt function with examples +- [ ] E2E migration tests passing +- [ ] Notebook testing completed +- [ ] Code formatted and linted +- [ ] Type checking passes +- [ ] Documentation updated +- [ ] Exports added to `__init__.py` + +--- + +## Key Learnings & Best Practices + +### From 
Context Recall Migration +1. **Components validation**: Base class rejects legacy wrappers automatically +2. **Structured output**: Use Pydantic models with instructor LLMs +3. **Prompt format**: Inline examples with json.dumps() escaping +4. **Score calculation**: Extract to separate method for clarity +5. **Edge cases**: Handle empty inputs gracefully + +### From Test Infrastructure +1. **Use shared fixtures**: `conftest.py` provides llm/embeddings +2. **Base test class**: `BaseMigrationTest` eliminates boilerplate +3. **Test utilities**: `test_utils.py` for common operations +4. **Consistent patterns**: All tests follow same structure +5. **Proper skipping**: Tests skip gracefully without API keys + +### From Notebook Testing +1. **Manual testing first**: Catches issues before E2E tests +2. **User modifications matter**: Inform final test design +3. **Performance tools**: Use optimized `compare_metrics` function +4. **Diverse test cases**: Cover normal, edge, high/low score scenarios +5. 
**Iteration speed**: Faster to debug in notebook than pytest + +### Tolerance Guidelines +- **LLM-based metrics**: 0.2-0.3 (accounts for randomness) +- **Embeddings-based**: 1e-6 to 1e-10 (deterministic) +- **String/rule-based**: 1e-10 (exact match expected) +- **Adjust based on**: Notebook findings and metric nature + +--- + +## Complete Checklist + +### Pre-Migration +- [ ] Study legacy metric implementation thoroughly +- [ ] Identify required components (LLM/embeddings/neither) +- [ ] Document core algorithm logic +- [ ] Note edge cases and special handling +- [ ] Review existing migrated metrics for patterns + +### Implementation +- [ ] Create prompt function with examples +- [ ] Define Pydantic output models +- [ ] Implement metric class inheriting from BaseMetric +- [ ] Match legacy calculation logic exactly +- [ ] Handle edge cases (empty, None, errors) +- [ ] Update `__init__.py` exports + +### Manual Testing (Notebook) +- [ ] Open general-purpose notebook: `tests/notebooks/metric_score_diff.ipynb` +- [ ] Generate `METRIC_CONFIG` for your metric (Section 2.2) +- [ ] Edit Cell 2 with your configuration +- [ ] Run all cells (no other modifications needed) +- [ ] Review Amnesty QA and FIQA comparison results +- [ ] Iterate on implementation until scores match +- [ ] Document findings (mean |diff|, tolerance, patterns) + +### E2E Testing +- [ ] Create test file inheriting from BaseMigrationTest +- [ ] Use fixtures from conftest.py +- [ ] Copy working test cases from notebook +- [ ] Set appropriate tolerance +- [ ] Add metric-specific behavior tests +- [ ] Document requirements +- [ ] Run tests and verify they pass + +### Quality & Finalization +- [ ] Run `make format` +- [ ] Run `make type` +- [ ] Run `make check` +- [ ] Run `make test` +- [ ] Run `make test-e2e` +- [ ] Update documentation if needed +- [ ] Create PR with checklist + +--- + +## File Structure Reference + +``` +ragas/ +├── src/ragas/ +│ ├── prompts/metrics/ +│ │ └── 
{metric_name}.py # NEW: Prompt function +│ └── metrics/ +│ ├── collections/ +│ │ ├── _{metric_name}.py # NEW: V2 implementation +│ │ └── __init__.py # MODIFIED: Add export +│ └── _{metric_name}.py # EXISTING: Legacy implementation +├── tests/ +│ ├── utils/ # EXISTING: Shared utilities +│ │ ├── __init__.py +│ │ └── llm_setup.py +│ ├── notebooks/ +│ │ └── metric_score_diff.ipynb # EXISTING: General-purpose testing notebook +│ └── e2e/metrics_migration/ +│ ├── conftest.py # EXISTING: Shared fixtures +│ ├── test_utils.py # EXISTING: Test utilities +│ ├── base_migration_test.py # EXISTING: Base test class +│ └── test_{metric_name}_migration.py # NEW: E2E tests +└── docs/ + └── howtos/migrations/ + └── {metric_name}.md # OPTIONAL: Migration guide +``` + +--- + +## Success Criteria + +✅ **Implementation**: +- New metric produces similar scores to legacy (within tolerance) +- Works only with modern components (rejects legacy wrappers) +- Handles all edge cases properly +- Code is clean, typed, and documented + +✅ **Testing**: +- E2E tests pass +- Manual notebook testing completed +- User satisfied with score matching +- All code quality checks pass + +✅ **Documentation**: +- Usage examples clear +- Requirements documented +- Migration path explained (if needed) + +✅ **Integration**: +- Exports added +- No regressions in existing tests +- Ready for PR and review + +--- + +This plan provides a complete, battle-tested workflow for migrating any metric from legacy to modern implementation, incorporating all learnings from previous migrations and leveraging the full testing infrastructure. 
def create_legacy_sample(
    data: Dict[str, Any],
    user_input_key: str = "user_input",
    response_key: str = "response",
    reference_key: Optional[str] = "reference",
    retrieved_contexts_key: Optional[str] = "retrieved_contexts",
) -> "SingleTurnSample":
    """Build a ``SingleTurnSample`` for a legacy metric from a plain dict.

    ``user_input`` is always populated; when ``user_input_key`` is missing
    from ``data`` the placeholder string ``"dummy"`` is used. The remaining
    fields are copied only when their key is non-empty and present in
    ``data``.

    Args:
        data: Dictionary containing sample data.
        user_input_key: Key for the user input in ``data``.
        response_key: Key for the response in ``data``.
        reference_key: Key for the reference in ``data`` (optional).
        retrieved_contexts_key: Key for the retrieved contexts in ``data``
            (optional).

    Returns:
        A ``SingleTurnSample`` built from the selected fields.
    """
    # Values are heterogeneous (strings and context lists), hence ``Any``.
    kwargs: Dict[str, Any] = {"user_input": data.get(user_input_key, "dummy")}

    optional_fields = (
        ("response", response_key),
        ("reference", reference_key),
        ("retrieved_contexts", retrieved_contexts_key),
    )
    for field_name, source_key in optional_fields:
        # A falsy key (None or "") disables the field entirely.
        if source_key and source_key in data:
            kwargs[field_name] = data[source_key]

    return SingleTurnSample(**kwargs)


def compare_scores_with_tolerance(
    legacy_score: float,
    v2_score: float,
    tolerance: float,
    case_description: str,
    case_num: int,
) -> None:
    """Assert that two scores differ by strictly less than ``tolerance``.

    Args:
        legacy_score: Score from the legacy implementation.
        v2_score: Score from the v2 implementation.
        tolerance: Exclusive upper bound on the absolute difference.
        case_description: Description of the test case (used in the message).
        case_num: Test case number (used in the message).

    Raises:
        AssertionError: If the absolute difference is not below ``tolerance``.
            A NaN score also fails, because NaN never compares as smaller.
    """
    score_diff = abs(legacy_score - v2_score)
    assert score_diff < tolerance, (
        f"Case {case_num} ({case_description}): "
        f"Large difference: {legacy_score} vs {v2_score} (diff: {score_diff})"
    )


def assert_score_types(legacy_score: Any, v2_result: "MetricResult") -> None:
    """Assert that both results have the expected type and lie in [0.0, 1.0].

    Args:
        legacy_score: Score from the legacy implementation; must be a float.
        v2_result: Result from the v2 implementation; must be a
            ``MetricResult`` whose ``value`` is within range.

    Raises:
        AssertionError: If a type is wrong or a score is outside [0.0, 1.0].
    """
    # Type checks run first so the range checks below compare real numbers.
    assert isinstance(legacy_score, float), (
        f"Legacy score should be float, got {type(legacy_score)}"
    )
    assert isinstance(v2_result, MetricResult), (
        f"V2 result should be MetricResult, got {type(v2_result)}"
    )
    assert 0.0 <= legacy_score <= 1.0, f"Legacy score out of range: {legacy_score}"
    assert 0.0 <= v2_result.value <= 1.0, f"V2 score out of range: {v2_result.value}"


def print_test_header(
    metric_name: str,
    case_num: int,
    description: str,
    additional_info: Optional[Dict[str, Any]] = None,
) -> None:
    """Print a standardized test case header.

    Args:
        metric_name: Name of the metric being tested.
        case_num: Test case number.
        description: Description of the test case.
        additional_info: Optional mapping of extra fields to print. Values
            may be of any type; each is shown as text, truncated to 100
            characters followed by ``...``.
    """
    print(f"\n🧪 Testing {metric_name} - Case {case_num}: {description}")
    if not additional_info:
        return

    for key, value in additional_info.items():
        # Values are not guaranteed to be strings (e.g. context lists, numbers,
        # None). Stringify first so len()/slicing/concatenation cannot raise
        # and long non-string values are truncated as well.
        text = value if isinstance(value, str) else str(value)
        display_value = text[:100] + "..." if len(text) > 100 else text
        print(f" {key}: {display_value}")


def print_score_comparison(
    legacy_score: float,
    v2_score: float,
    precision: int = 6,
) -> None:
    """Print both scores and their absolute difference.

    Args:
        legacy_score: Score from the legacy implementation.
        v2_score: Score from the v2 implementation.
        precision: Number of decimal places to display.
    """
    score_diff = abs(legacy_score - v2_score)
    print(f" Legacy: {legacy_score:.{precision}f}")
    print(f" V2 Class: {v2_score:.{precision}f}")
    print(f" Diff: {score_diff:.{precision}f}")


def print_test_success(message: str = "Scores within tolerance!") -> None:
    """Print a standardized success message.

    Args:
        message: Success message to display.
    """
    print(f" ✅ {message}")


def print_metric_specific_info(metric_name: str, description: str) -> None:
    """Print a header for a metric-specific behavior test.

    Args:
        metric_name: Name of the metric.
        description: Description of what is being tested.
    """
    print(f"\n🎯 Testing {metric_name}: {description}")
I wouldn't be surprised if it has a longer hold period and, as always, you don't get the money if the cheque doesn't clear.", + "response": "The best way to deposit a cheque issued to an associate in your business account is to have the associate sign the back of the cheque and deposit it as a third party cheque.", + "retrieved_contexts": [ + "Just have the associate sign the back and then deposit it. It's called a third party cheque and is perfectly legal.", + "I wouldn't be surprised if it has a longer hold period and, as always, you don't get the money if the cheque doesn't clear.", + ], + }, + { + "user_input": "What is the difference between a mutual fund and an ETF?", + "reference": "Mutual funds are actively managed investment vehicles that pool money from multiple investors. ETFs are passively managed and trade on exchanges like stocks. ETFs typically have lower fees and can be bought and sold throughout the trading day.", + "response": "A mutual fund pools money from investors and is actively managed, while an ETF trades like a stock and typically tracks an index with lower fees.", + "retrieved_contexts": [ + "Mutual funds pool money from multiple investors and are actively managed by professional fund managers.", + "ETFs trade on exchanges like stocks and can be bought and sold throughout the trading day.", + "ETFs typically have lower expense ratios compared to mutual funds.", + ], + }, + { + "user_input": "Should I pay off my mortgage early or invest the money?", + "reference": "It depends on your mortgage interest rate and expected investment returns. If your mortgage rate is low and you expect higher returns from investments, investing may be better. 
def load_fiqa_dataset_safe(config: str = "ragas_eval_v3"):
    """
    Load the fiqa dataset, using bundled sample rows when the remote load fails.

    The remote attempt reads the "baseline" split of "explodinggradients/fiqa".
    Any exception raised during that attempt is logged as a warning and the
    function falls back to a dataset built from SAMPLE_FIQA_DATA.

    Args:
        config: Dataset configuration name (default: "ragas_eval_v3" - recommended)

    Returns:
        Dataset: The remote split when available, otherwise the local fallback
    """
    try:
        logger.info(f"Attempting to load fiqa dataset with config '{config}'")
        remote_split = load_dataset("explodinggradients/fiqa", config)["baseline"]
        logger.info(f"Successfully loaded dataset with {len(remote_split)} samples")
    except Exception as e:
        # Best-effort by design: offline or rate-limited runs still get data.
        logger.warning(f"Failed to load remote dataset: {e}")
        logger.info("Using local sample data as fallback")
    else:
        return remote_split

    # Reached only after a failed remote load.
    fallback = Dataset.from_list(SAMPLE_FIQA_DATA)
    logger.info(f"Created local dataset with {len(fallback)} samples")
    return fallback
"source": [ - "# Metric Score Diff Checker\n", - "\n", - "This notebook provides utilities to compare scores between different metric versions, algorithms, LLMs, or datasets.\n", - "\n", - "## Dataset\n", - "This notebook uses the amnesty_qa dataset which contains human rights related Q&A pairs. It will attempt to load from HuggingFace and fallback to local samples if unavailable." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "from dataclasses import dataclass\n", - "from typing import Any, Dict, List, Tuple\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "# Ragas imports\n", - "from ragas.dataset_schema import SingleTurnSample" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup\n", - "\n", - "Make sure you have your OpenAI API key set as an environment variable before running this notebook." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "# Check for OpenAI API key\n", - "if not os.getenv(\"OPENAI_API_KEY\"):\n", - " raise ValueError(\n", - " \"OPENAI_API_KEY environment variable not set. 
\"\n", - " \"Please set it before running this notebook:\\n\"\n", - " \" export OPENAI_API_KEY='your-api-key-here'\"\n", - " )\n", - "\n", - "print(\"โœ“ OpenAI API key found\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Utility Functions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@dataclass\n", - "class MetricDiffResult:\n", - " \"\"\"Container for metric comparison results.\"\"\"\n", - "\n", - " old_scores: List[float]\n", - " new_scores: List[float]\n", - " diffs: List[float]\n", - " mean_diff: float\n", - " max_diff: float\n", - " min_diff: float\n", - " std_diff: float\n", - " old_mean: float\n", - " new_mean: float\n", - " old_time: float\n", - " new_time: float\n", - "\n", - " def to_dataframe(self) -> pd.DataFrame:\n", - " \"\"\"Convert results to a pandas DataFrame.\"\"\"\n", - " return pd.DataFrame(\n", - " {\n", - " \"old_score\": self.old_scores,\n", - " \"new_score\": self.new_scores,\n", - " \"diff\": self.diffs,\n", - " \"abs_diff\": [abs(d) for d in self.diffs],\n", - " }\n", - " )\n", - "\n", - " def print_summary(self):\n", - " \"\"\"Print a summary of the comparison.\"\"\"\n", - " print(\"=\" * 60)\n", - " print(\"METRIC COMPARISON SUMMARY\")\n", - " print(\"=\" * 60)\n", - " print(\"\\nScore Statistics:\")\n", - " print(f\" Old Metric Mean: {self.old_mean:.4f}\")\n", - " print(f\" New Metric Mean: {self.new_mean:.4f}\")\n", - " print(\"\\nDifference Statistics (new - old):\")\n", - " print(f\" Mean Diff: {self.mean_diff:.4f}\")\n", - " print(f\" Max Diff: {self.max_diff:.4f}\")\n", - " print(f\" Min Diff: {self.min_diff:.4f}\")\n", - " print(f\" Std Dev: {self.std_diff:.4f}\")\n", - " print(\"\\nExecution Time:\")\n", - " print(f\" Old Metric: {self.old_time:.2f}s\")\n", - " print(f\" New Metric: {self.new_time:.2f}s\")\n", - " print(\n", - " f\" Speedup: {self.old_time / self.new_time:.2f}x\"\n", - " if self.new_time > 0\n", - " else \" 
N/A\"\n", - " )\n", - " print(\"=\" * 60)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "async def run_metric_on_dataset(\n", - " metric: Any, dataset: List[Dict[str, Any]], metric_type: str = \"old\"\n", - ") -> Tuple[List[float], float]:\n", - " \"\"\"\n", - " Run a metric on a dataset and return scores with execution time.\n", - "\n", - " Args:\n", - " metric: The metric instance (either old or new style)\n", - " dataset: List of dictionaries containing the data samples\n", - " metric_type: \"old\" for legacy metrics, \"new\" for collections metrics\n", - "\n", - " Returns:\n", - " Tuple of (scores list, execution time in seconds)\n", - " \"\"\"\n", - " scores = []\n", - " start_time = time.time()\n", - "\n", - " for sample_dict in dataset:\n", - " try:\n", - " if metric_type == \"old\":\n", - " # Old metrics use SingleTurnSample\n", - " sample = SingleTurnSample(**sample_dict)\n", - " score = await metric._single_turn_ascore(sample, callbacks=None)\n", - " else:\n", - " # New metrics use direct kwargs\n", - " result = await metric.ascore(**sample_dict)\n", - " score = result.value\n", - "\n", - " scores.append(float(score))\n", - " except Exception as e:\n", - " print(f\"Error processing sample: {e}\")\n", - " scores.append(np.nan)\n", - "\n", - " execution_time = time.time() - start_time\n", - " return scores, execution_time" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "async def compare_metrics(\n", - " old_metric: Any,\n", - " new_metric: Any,\n", - " dataset: List[Dict[str, Any]],\n", - " old_metric_type: str = \"old\",\n", - " new_metric_type: str = \"new\",\n", - ") -> MetricDiffResult:\n", - " \"\"\"\n", - " Compare two metrics on the same dataset.\n", - "\n", - " Args:\n", - " old_metric: The baseline/old metric instance\n", - " new_metric: The new/updated metric instance\n", - " dataset: List of dictionaries 
containing the data samples\n", - " old_metric_type: Type identifier for old metric (\"old\" or \"new\")\n", - " new_metric_type: Type identifier for new metric (\"old\" or \"new\")\n", - "\n", - " Returns:\n", - " MetricDiffResult containing comparison statistics\n", - " \"\"\"\n", - " print(f\"Running old metric on {len(dataset)} samples...\")\n", - " old_scores, old_time = await run_metric_on_dataset(\n", - " old_metric, dataset, old_metric_type\n", - " )\n", - "\n", - " print(f\"Running new metric on {len(dataset)} samples...\")\n", - " new_scores, new_time = await run_metric_on_dataset(\n", - " new_metric, dataset, new_metric_type\n", - " )\n", - "\n", - " # Calculate differences\n", - " diffs = [new - old for old, new in zip(old_scores, new_scores)]\n", - "\n", - " return MetricDiffResult(\n", - " old_scores=old_scores,\n", - " new_scores=new_scores,\n", - " diffs=diffs,\n", - " mean_diff=float(np.mean(diffs)),\n", - " max_diff=float(np.max(diffs)),\n", - " min_diff=float(np.min(diffs)),\n", - " std_diff=float(np.std(diffs)),\n", - " old_mean=float(np.mean(old_scores)),\n", - " new_mean=float(np.mean(new_scores)),\n", - " old_time=old_time,\n", - " new_time=new_time,\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Example 1: Compare Answer Relevancy (Old vs New Implementation)\n", - "\n", - "Compare the legacy `AnswerRelevancy` from `ragas.metrics` with the new `AnswerRelevancy` from `ragas.metrics.collections`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": "# Setup LLMs and Embeddings\nfrom langchain_openai import ChatOpenAI, OpenAIEmbeddings\nfrom openai import AsyncOpenAI\n\nfrom ragas.embeddings.base import LangchainEmbeddingsWrapper, embedding_factory\nfrom ragas.llms.base import LangchainLLMWrapper, instructor_llm_factory\n\n# For old metric (legacy) - wrap langchain components\nlangchain_llm = ChatOpenAI(model=\"gpt-4o-mini\")\nlegacy_llm = LangchainLLMWrapper(langchain_llm)\n\nlangchain_embeddings = OpenAIEmbeddings(model=\"text-embedding-ada-002\")\nlegacy_embeddings = LangchainEmbeddingsWrapper(langchain_embeddings)\n\n# For new metric (modern)\nclient = AsyncOpenAI()\nmodern_llm = instructor_llm_factory(\"openai\", client=client, model=\"gpt-4o-mini\")\nmodern_embeddings = embedding_factory(\n \"openai\", model=\"text-embedding-ada-002\", client=client, interface=\"modern\"\n)" - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Import metrics\n", - "from ragas.metrics._answer_relevance import AnswerRelevancy as OldAnswerRelevancy\n", - "from ragas.metrics.collections._answer_relevancy import (\n", - " AnswerRelevancy as NewAnswerRelevancy,\n", - ")\n", - "\n", - "# Initialize metrics\n", - "old_metric = OldAnswerRelevancy(\n", - " llm=legacy_llm, embeddings=legacy_embeddings, strictness=3\n", - ")\n", - "\n", - "new_metric = NewAnswerRelevancy(\n", - " llm=modern_llm, embeddings=modern_embeddings, strictness=3\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": "# Load amnesty dataset\nimport sys\nfrom pathlib import Path\n\nfrom tests.e2e.test_dataset_utils import load_amnesty_dataset_safe\n\n# Add tests directory to path\ntests_dir = Path.cwd().parent if Path.cwd().name == \"notebooks\" else Path.cwd()\nif tests_dir.name == \"tests\":\n sys.path.insert(0, 
str(tests_dir.parent))\nelse:\n sys.path.insert(0, str(tests_dir))\n\n# Load the dataset (will use HuggingFace or fallback to local samples)\namnesty_dataset = load_amnesty_dataset_safe(\"english_v3\")\n\n# Convert to list of dicts for our utility functions\n# We'll use a subset for faster testing\ntest_dataset = []\nfor i, sample in enumerate(amnesty_dataset):\n if i >= 5: # Limit to 5 samples for faster testing\n break\n test_dataset.append(\n {\"user_input\": sample[\"user_input\"], \"response\": sample[\"response\"]}\n )\n\nprint(f\"Test dataset contains {len(test_dataset)} samples from amnesty_qa\")\nprint(\"\\nFirst sample:\")\nprint(f\"Question: {test_dataset[0]['user_input']}\")\nprint(f\"Response: {test_dataset[0]['response'][:100]}...\")" - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Run comparison\n", - "result = await compare_metrics(\n", - " old_metric=old_metric,\n", - " new_metric=new_metric,\n", - " dataset=test_dataset,\n", - " old_metric_type=\"old\",\n", - " new_metric_type=\"new\",\n", - ")\n", - "\n", - "# Print summary\n", - "result.print_summary()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# View detailed results\n", - "df = result.to_dataframe()\n", - "df[\"user_input\"] = [s[\"user_input\"] for s in test_dataset]\n", - "df = df[[\"user_input\", \"old_score\", \"new_score\", \"diff\", \"abs_diff\"]]\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Visualize the differences\n", - "import matplotlib.pyplot as plt\n", - "\n", - "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n", - "\n", - "# Plot 1: Score comparison\n", - "x = range(len(result.old_scores))\n", - "axes[0].plot(x, result.old_scores, \"o-\", label=\"Old Metric\", linewidth=2)\n", - "axes[0].plot(x, result.new_scores, \"s-\", label=\"New Metric\", linewidth=2)\n", - 
"axes[0].set_xlabel(\"Sample Index\")\n", - "axes[0].set_ylabel(\"Score\")\n", - "axes[0].set_title(\"Metric Scores Comparison\")\n", - "axes[0].legend()\n", - "axes[0].grid(True, alpha=0.3)\n", - "\n", - "# Plot 2: Difference distribution\n", - "axes[1].bar(x, result.diffs, alpha=0.7)\n", - "axes[1].axhline(y=0, color=\"r\", linestyle=\"--\", linewidth=1)\n", - "axes[1].set_xlabel(\"Sample Index\")\n", - "axes[1].set_ylabel(\"Difference (New - Old)\")\n", - "axes[1].set_title(\"Score Differences\")\n", - "axes[1].grid(True, alpha=0.3)\n", - "\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Example 2: Compare Same Metric with Different LLMs\n", - "\n", - "Compare how the same metric performs with different LLM models." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create two instances with different LLMs\n", - "client = AsyncOpenAI()\n", - "\n", - "llm_gpt4_mini = instructor_llm_factory(\"openai\", client=client, model=\"gpt-4o-mini\")\n", - "llm_gpt4 = instructor_llm_factory(\"openai\", client=client, model=\"gpt-4o\")\n", - "\n", - "embeddings = embedding_factory(\n", - " \"openai\", model=\"text-embedding-ada-002\", client=client, interface=\"modern\"\n", - ")\n", - "\n", - "metric_gpt4_mini = NewAnswerRelevancy(\n", - " llm=llm_gpt4_mini, embeddings=embeddings, strictness=3\n", - ")\n", - "\n", - "metric_gpt4 = NewAnswerRelevancy(llm=llm_gpt4, embeddings=embeddings, strictness=3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Compare LLMs\n", - "result_llm = await compare_metrics(\n", - " old_metric=metric_gpt4_mini,\n", - " new_metric=metric_gpt4,\n", - " dataset=test_dataset,\n", - " old_metric_type=\"new\",\n", - " new_metric_type=\"new\",\n", - ")\n", - "\n", - "result_llm.print_summary()" - ] - }, - { - "cell_type": "markdown", - "metadata": 
{}, - "source": [ - "## Example 3: Compare with Different Datasets\n", - "\n", - "Load different datasets to test metric consistency." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Use different subsets of amnesty dataset\n", - "# First 2 samples\n", - "dataset_subset_1 = test_dataset[:2]\n", - "\n", - "# Next 2 samples (if available)\n", - "dataset_subset_2 = test_dataset[2:4] if len(test_dataset) >= 4 else test_dataset[:2]\n", - "\n", - "print(f\"Subset 1: {len(dataset_subset_1)} samples\")\n", - "print(f\"Subset 2: {len(dataset_subset_2)} samples\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Compare same metric on different dataset subsets\n", - "print(\"\\n=== Dataset Subset 1 ===\")\n", - "scores_subset_1, time_subset_1 = await run_metric_on_dataset(\n", - " new_metric, dataset_subset_1, \"new\"\n", - ")\n", - "print(f\"Mean score: {np.mean(scores_subset_1):.4f}\")\n", - "print(f\"Execution time: {time_subset_1:.2f}s\")\n", - "\n", - "print(\"\\n=== Dataset Subset 2 ===\")\n", - "scores_subset_2, time_subset_2 = await run_metric_on_dataset(\n", - " new_metric, dataset_subset_2, \"new\"\n", - ")\n", - "print(f\"Mean score: {np.mean(scores_subset_2):.4f}\")\n", - "print(f\"Execution time: {time_subset_2:.2f}s\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Utility: Export Results to CSV" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def export_comparison_results(\n", - " result: MetricDiffResult,\n", - " dataset: List[Dict[str, Any]],\n", - " filename: str = \"metric_comparison_results.csv\",\n", - "):\n", - " \"\"\"Export comparison results to CSV file.\"\"\"\n", - " df = result.to_dataframe()\n", - "\n", - " # Add dataset information\n", - " for key in dataset[0].keys():\n", - " df[key] = [sample[key] for sample in 
dataset]\n", - "\n", - " # Add summary statistics as a separate row\n", - " summary = pd.DataFrame(\n", - " [\n", - " {\n", - " \"user_input\": \"SUMMARY\",\n", - " \"old_score\": result.old_mean,\n", - " \"new_score\": result.new_mean,\n", - " \"diff\": result.mean_diff,\n", - " \"abs_diff\": np.mean([abs(d) for d in result.diffs]),\n", - " }\n", - " ]\n", - " )\n", - "\n", - " df = pd.concat([df, summary], ignore_index=True)\n", - " df.to_csv(filename, index=False)\n", - " print(f\"Results exported to {filename}\")\n", - "\n", - "\n", - "# Example usage\n", - "export_comparison_results(result, test_dataset, \"answer_relevancy_comparison.csv\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py new file mode 100644 index 000000000..7d7733305 --- /dev/null +++ b/tests/utils/__init__.py @@ -0,0 +1,35 @@ +"""Shared test utilities for Ragas tests. + +This module provides reusable utilities for both pytest tests and Jupyter notebooks, +including LLM setup, embeddings configuration, and common test helpers. 
"""Factory functions for creating LLMs and embeddings for testing.

This module provides reusable functions for creating both legacy and modern
LLM and embedding instances. These can be used in both pytest tests (via fixtures)
and Jupyter notebooks (directly).
"""

import os
from typing import Any, Optional

# Environment variable that must hold the API key for each known provider.
_API_KEY_ENV_VARS = {
    "openai": "OPENAI_API_KEY",
    "anthropic": "ANTHROPIC_API_KEY",
}

# The only provider for which a default async client can be built automatically.
_AUTO_CLIENT_PROVIDER = "openai"


def _require_resolvable_client(provider: str, client: Optional[Any]) -> None:
    """Raise ``ValueError`` when no client was given and none can be auto-created."""
    if client is None and provider != _AUTO_CLIENT_PROVIDER:
        raise ValueError(f"Auto-client creation not supported for {provider}")


def check_api_key(provider: str = "openai") -> bool:
    """Check if required API key is set.

    Args:
        provider: The provider to check for (default: "openai"). Matching is
            case-insensitive.

    Returns:
        True if API key is set

    Raises:
        ValueError: If the provider is unknown or its API key is not set
    """
    env_var = _API_KEY_ENV_VARS.get(provider.lower())
    if not env_var:
        raise ValueError(f"Unknown provider: {provider}")

    # An empty value is treated the same as an unset variable.
    if not os.getenv(env_var):
        raise ValueError(
            f"{env_var} environment variable not set. "
            f"Please set it before running:\n"
            f" export {env_var}='your-api-key-here'"
        )

    return True


def create_legacy_llm(model: str = "gpt-3.5-turbo", **kwargs):
    """Create a legacy LLM instance for old-style metrics.

    Args:
        model: The model name to use
        **kwargs: Additional arguments to pass to llm_factory

    Returns:
        Legacy LLM instance

    Raises:
        ImportError: If llm_factory is not available
        Exception: If LLM creation fails (e.g., missing API key)
    """
    try:
        # Imported lazily so this module can be imported without ragas installed.
        from ragas.llms.base import llm_factory

        return llm_factory(model, **kwargs)
    except ImportError as e:
        raise ImportError(f"LLM factory not available: {e}") from e
    except Exception as e:
        raise Exception(f"Could not create LLM (API key may be missing): {e}") from e


def create_modern_llm(
    provider: str = "openai",
    model: str = "gpt-3.5-turbo",
    client: Optional[Any] = None,
    **kwargs,
):
    """Create a modern instructor LLM instance for v2 metrics.

    Args:
        provider: The LLM provider (e.g., "openai", "anthropic")
        model: The model name to use
        client: Optional async client instance. If None, one is created
            (only supported for the "openai" provider).
        **kwargs: Additional arguments to pass to instructor_llm_factory

    Returns:
        Modern instructor LLM instance

    Raises:
        ValueError: If no client is given and the provider has no auto-client
        ImportError: If required libraries are not available
        Exception: If LLM creation fails
    """
    # Checked outside the try block so this usage error is not relabelled as
    # a possible missing-API-key failure by the generic handler below.
    _require_resolvable_client(provider, client)

    try:
        from ragas.llms.base import instructor_llm_factory

        # Create client if not provided
        if client is None:
            import openai

            client = openai.AsyncOpenAI()

        return instructor_llm_factory(provider, model=model, client=client, **kwargs)
    except ImportError as e:
        raise ImportError(f"Instructor LLM factory not available: {e}") from e
    except Exception as e:
        raise Exception(
            f"Could not create modern LLM (API key may be missing): {e}"
        ) from e


def create_legacy_embeddings(model: str = "text-embedding-ada-002", **kwargs):
    """Create legacy embeddings for old-style metrics.

    Args:
        model: The embedding model name to use
        **kwargs: Additional arguments to pass to embedding_factory

    Returns:
        Legacy embeddings instance

    Raises:
        ImportError: If embedding_factory is not available
        Exception: If embeddings creation fails
    """
    try:
        from ragas.embeddings.base import embedding_factory

        return embedding_factory(model, **kwargs)
    except ImportError as e:
        raise ImportError(f"Embedding factory not available: {e}") from e
    except Exception as e:
        raise Exception(
            f"Could not create legacy embeddings (API key may be missing): {e}"
        ) from e


def create_modern_embeddings(
    provider: str = "openai",
    model: str = "text-embedding-ada-002",
    client: Optional[Any] = None,
    interface: str = "modern",
    **kwargs,
):
    """Create modern embeddings for v2 metrics.

    Args:
        provider: The embeddings provider (e.g., "openai")
        model: The embedding model name to use
        client: Optional async client instance. If None, one is created
            (only supported for the "openai" provider).
        interface: Interface type (default: "modern")
        **kwargs: Additional arguments to pass to embedding_factory

    Returns:
        Modern embeddings instance

    Raises:
        ValueError: If no client is given and the provider has no auto-client
        ImportError: If required libraries are not available
        Exception: If embeddings creation fails
    """
    # Same reasoning as create_modern_llm: surface the usage error unchanged.
    _require_resolvable_client(provider, client)

    try:
        from ragas.embeddings.base import embedding_factory

        # Create client if not provided
        if client is None:
            import openai

            client = openai.AsyncOpenAI()

        return embedding_factory(
            provider=provider,
            model=model,
            client=client,
            interface=interface,
            **kwargs,
        )
    except ImportError as e:
        raise ImportError(f"OpenAI or embedding factory not available: {e}") from e
    except Exception as e:
        raise Exception(
            f"Could not create modern embeddings (API key may be missing): {e}"
        ) from e


# Legacy-style factory functions for backward compatibility with langchain wrappers
def create_legacy_llm_with_langchain(model: str = "gpt-4o-mini", **kwargs):
    """Create a legacy LLM using Langchain wrapper.

    This is for compatibility with older code that uses Langchain wrappers.

    Args:
        model: The model name to use
        **kwargs: Additional arguments forwarded to ChatOpenAI

    Returns:
        LangchainLLMWrapper instance

    Raises:
        ImportError: If langchain_openai or the ragas wrapper cannot be imported
    """
    try:
        from langchain_openai import ChatOpenAI

        from ragas.llms.base import LangchainLLMWrapper

        langchain_llm = ChatOpenAI(model=model, **kwargs)
        return LangchainLLMWrapper(langchain_llm)
    except ImportError as e:
        raise ImportError(
            f"Langchain or LangchainLLMWrapper not available: {e}"
        ) from e


def create_legacy_embeddings_with_langchain(
    model: str = "text-embedding-ada-002", **kwargs
):
    """Create legacy embeddings using Langchain wrapper.

    This is for compatibility with older code that uses Langchain wrappers.

    Args:
        model: The embedding model name to use
        **kwargs: Additional arguments forwarded to OpenAIEmbeddings

    Returns:
        LangchainEmbeddingsWrapper instance

    Raises:
        ImportError: If langchain_openai or the ragas wrapper cannot be imported
    """
    try:
        from langchain_openai import OpenAIEmbeddings

        from ragas.embeddings.base import LangchainEmbeddingsWrapper

        langchain_embeddings = OpenAIEmbeddings(model=model, **kwargs)
        return LangchainEmbeddingsWrapper(langchain_embeddings)
    except ImportError as e:
        raise ImportError(
            f"Langchain or LangchainEmbeddingsWrapper not available: {e}"
        ) from e
"""Utilities for comparing metrics across different implementations.

This module provides tools for comparing legacy and modern metric implementations,
including concurrent execution, statistical analysis, and result export capabilities.
"""

import asyncio
import time
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Tuple

import numpy as np
import pandas as pd


def _nan_safe(reducer: Callable[[np.ndarray], Any], values: Any) -> float:
    """Apply *reducer* to the non-NaN entries of *values*; NaN if none remain.

    Failed samples are recorded as NaN, so plain ``np.mean``/``np.max`` would
    turn every aggregate into NaN (or raise on empty input).
    """
    arr = np.asarray(values, dtype=float)
    valid = arr[~np.isnan(arr)]
    return float(reducer(valid)) if valid.size else float("nan")


@dataclass
class MetricDiffResult:
    """Container for metric comparison results.

    Attributes:
        old_scores: List of scores from the baseline/old metric
        new_scores: List of scores from the new metric
        diffs: List of differences (new - old)
        mean_diff: Mean of differences
        max_diff: Maximum difference
        min_diff: Minimum difference
        std_diff: Standard deviation of differences
        old_mean: Mean of old metric scores
        new_mean: Mean of new metric scores
        old_time: Execution time for old metric (seconds)
        new_time: Execution time for new metric (seconds)
    """

    old_scores: List[float]
    new_scores: List[float]
    diffs: List[float]
    mean_diff: float
    max_diff: float
    min_diff: float
    std_diff: float
    old_mean: float
    new_mean: float
    old_time: float
    new_time: float

    def to_dataframe(self) -> pd.DataFrame:
        """Convert results to a pandas DataFrame.

        Returns:
            DataFrame with columns: old_score, new_score, diff, abs_diff
        """
        return pd.DataFrame(
            {
                "old_score": self.old_scores,
                "new_score": self.new_scores,
                "diff": self.diffs,
                "abs_diff": [abs(d) for d in self.diffs],
            }
        )

    def print_summary(self) -> None:
        """Print a formatted summary of the comparison results."""
        print("=" * 60)
        print("METRIC COMPARISON SUMMARY")
        print("=" * 60)
        print("\nScore Statistics:")
        print(f" Old Metric Mean: {self.old_mean:.4f}")
        print(f" New Metric Mean: {self.new_mean:.4f}")
        print("\nDifference Statistics (new - old):")
        print(f" Mean Diff: {self.mean_diff:.4f}")
        print(f" Max Diff: {self.max_diff:.4f}")
        print(f" Min Diff: {self.min_diff:.4f}")
        print(f" Std Dev: {self.std_diff:.4f}")
        print("\nExecution Time:")
        print(f" Old Metric: {self.old_time:.2f}s")
        print(f" New Metric: {self.new_time:.2f}s")
        # Guard against division by zero when the new metric took no time.
        print(
            f" Speedup: {self.old_time / self.new_time:.2f}x"
            if self.new_time > 0
            else " N/A"
        )
        print("=" * 60)


async def run_metric_on_dataset(
    metric: Any,
    dataset: List[Dict[str, Any]],
    metric_type: str = "old",
    max_concurrent: int = 10,
) -> Tuple[List[float], float]:
    """
    Run a metric on a dataset with concurrent processing for better performance.

    All samples are scheduled at once; a semaphore caps how many are being
    scored at the same time. A sample that raises is reported on stdout and
    recorded as NaN so the returned list stays aligned with ``dataset``.

    Args:
        metric: The metric instance (either old or new style)
        dataset: List of dictionaries containing the data samples
        metric_type: "old" for legacy metrics, "new" for collections metrics
        max_concurrent: Maximum number of concurrent requests (default: 10)

    Returns:
        Tuple of (scores list, execution time in seconds)

    Raises:
        ValueError: If max_concurrent is smaller than 1

    Example:
        >>> scores, time = await run_metric_on_dataset(
        ...     metric=my_metric,
        ...     dataset=[{"user_input": "q1", "response": "a1"}],
        ...     metric_type="new",
        ...     max_concurrent=5,
        ... )
    """
    # A semaphore of 0 would block every task forever, so reject it up front.
    if max_concurrent < 1:
        raise ValueError("max_concurrent must be at least 1")

    if metric_type == "old":
        # Imported lazily: only the legacy path needs the ragas sample schema.
        from ragas.dataset_schema import SingleTurnSample

        async def invoke(sample_dict: Dict[str, Any]) -> Any:
            # Old metrics use SingleTurnSample
            sample = SingleTurnSample(**sample_dict)
            return await metric._single_turn_ascore(sample, callbacks=None)

    else:

        async def invoke(sample_dict: Dict[str, Any]) -> Any:
            # New metrics use direct kwargs
            result = await metric.ascore(**sample_dict)
            return result.value

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()
    semaphore = asyncio.Semaphore(max_concurrent)

    async def score_with_limit(sample_dict: Dict[str, Any]) -> float:
        """Score one sample under the concurrency limit; NaN on failure."""
        async with semaphore:
            try:
                return float(await invoke(sample_dict))
            except Exception as e:
                print(f"Error processing sample: {e}")
                return float("nan")

    # gather preserves input order, so scores[i] belongs to dataset[i].
    scores = await asyncio.gather(*(score_with_limit(s) for s in dataset))

    execution_time = time.perf_counter() - start_time
    return list(scores), execution_time


async def compare_metrics(
    old_metric: Any,
    new_metric: Any,
    dataset: List[Dict[str, Any]],
    old_metric_type: str = "old",
    new_metric_type: str = "new",
    max_concurrent: int = 10,
    parallel_metrics: bool = True,
) -> MetricDiffResult:
    """
    Compare two metrics on the same dataset with optional parallel execution.

    Both metrics are run on the dataset and comparison statistics are computed.
    Samples that failed for either metric (NaN score) are excluded from the
    aggregate statistics but kept in the per-sample lists.

    Args:
        old_metric: The baseline/old metric instance
        new_metric: The new/updated metric instance
        dataset: List of dictionaries containing the data samples
        old_metric_type: Type identifier for old metric ("old" or "new")
        new_metric_type: Type identifier for new metric ("old" or "new")
        max_concurrent: Maximum number of concurrent requests per metric (default: 10)
        parallel_metrics: If True, run both metrics in parallel. If False, run
            sequentially for more accurate individual timing (default: True)

    Returns:
        MetricDiffResult containing detailed comparison statistics. Aggregates
        are NaN when no sample produced a usable value.

    Example:
        >>> result = await compare_metrics(
        ...     old_metric=legacy_metric,
        ...     new_metric=modern_metric,
        ...     dataset=test_data,
        ...     max_concurrent=5,
        ...     parallel_metrics=True,
        ... )
        >>> result.print_summary()
    """
    if parallel_metrics:
        print(
            f"Running both metrics in parallel on {len(dataset)} samples (max {max_concurrent} concurrent)..."
        )

        # Run both metrics concurrently using asyncio.gather
        (old_scores, old_time), (new_scores, new_time) = await asyncio.gather(
            run_metric_on_dataset(old_metric, dataset, old_metric_type, max_concurrent),
            run_metric_on_dataset(new_metric, dataset, new_metric_type, max_concurrent),
        )
    else:
        # Sequential execution for more accurate individual timing
        print(
            f"Running old metric on {len(dataset)} samples (max {max_concurrent} concurrent)..."
        )
        old_scores, old_time = await run_metric_on_dataset(
            old_metric, dataset, old_metric_type, max_concurrent
        )

        print(
            f"Running new metric on {len(dataset)} samples (max {max_concurrent} concurrent)..."
        )
        new_scores, new_time = await run_metric_on_dataset(
            new_metric, dataset, new_metric_type, max_concurrent
        )

    # Calculate differences (NaN wherever either metric failed)
    diffs = [new - old for old, new in zip(old_scores, new_scores)]

    return MetricDiffResult(
        old_scores=old_scores,
        new_scores=new_scores,
        diffs=diffs,
        mean_diff=_nan_safe(np.mean, diffs),
        max_diff=_nan_safe(np.max, diffs),
        min_diff=_nan_safe(np.min, diffs),
        std_diff=_nan_safe(np.std, diffs),
        old_mean=_nan_safe(np.mean, old_scores),
        new_mean=_nan_safe(np.mean, new_scores),
        old_time=old_time,
        new_time=new_time,
    )


async def run_metric_on_dataset_with_batching(
    metric: Any,
    dataset: List[Dict[str, Any]],
    metric_type: str = "new",
    batch_size: int = 5,
) -> Tuple[List[float], float]:
    """
    Run metric using batch processing if available (for better performance).

    Uses the metric's ``abatch_score`` method when it exists. If it does not,
    or ``metric_type`` is "old", the call is delegated to
    ``run_metric_on_dataset``. A batch that fails is retried sample by sample.

    Args:
        metric: The metric instance
        dataset: List of dictionaries containing the data samples
        metric_type: "old" or "new" - old metrics don't support batching
        batch_size: Number of samples per batch (default: 5)

    Returns:
        Tuple of (scores list, execution time in seconds)

    Raises:
        ValueError: If batching is used and batch_size is smaller than 1

    Example:
        >>> scores, time = await run_metric_on_dataset_with_batching(
        ...     metric=my_metric,
        ...     dataset=test_data,
        ...     metric_type="new",
        ...     batch_size=10,
        ... )
    """
    # Check if metric supports batching
    has_batch = hasattr(metric, "abatch_score")

    if not has_batch or metric_type == "old":
        # Fall back to concurrent processing
        print(" Batching not available, using concurrent processing...")
        return await run_metric_on_dataset(metric, dataset, metric_type)

    # Validated only on the batching path, where batch_size is actually used.
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    start_time = time.perf_counter()
    all_scores: List[float] = []

    # Ceiling division: a final partial batch still counts as a batch.
    num_batches = (len(dataset) + batch_size - 1) // batch_size
    print(
        f" Processing {len(dataset)} samples in {num_batches} batches of {batch_size}..."
    )

    for i in range(0, len(dataset), batch_size):
        batch = dataset[i : i + batch_size]
        try:
            results = await metric.abatch_score(batch)
            scores = [float(r.value) for r in results]
            # A short or long result would shift every later score out of
            # alignment with its sample, so treat it as a batch failure.
            if len(scores) != len(batch):
                raise ValueError(
                    f"expected {len(batch)} results, got {len(scores)}"
                )
            all_scores.extend(scores)
        except Exception as e:
            print(
                f" Warning: Batch {i // batch_size + 1} failed ({e}), falling back to individual processing..."
            )
            # Fall back to individual processing for this batch
            for sample in batch:
                try:
                    result = await metric.ascore(**sample)
                    all_scores.append(float(result.value))
                except Exception as e2:
                    print(f" Error processing sample: {e2}")
                    all_scores.append(float("nan"))

    execution_time = time.perf_counter() - start_time
    return all_scores, execution_time


def export_comparison_results(
    result: MetricDiffResult,
    dataset: List[Dict[str, Any]],
    filename: str = "metric_comparison_results.csv",
) -> None:
    """
    Export comparison results to CSV file.

    The CSV includes all scores, differences, and the original dataset fields,
    plus a summary row with aggregate statistics. The dataset columns are taken
    from the keys of the first sample; samples lacking a key get an empty cell.

    Args:
        result: MetricDiffResult object containing comparison data
        dataset: Original dataset (to include context in export)
        filename: Output CSV filename (default: "metric_comparison_results.csv")

    Example:
        >>> export_comparison_results(
        ...     result=comparison_result,
        ...     dataset=test_data,
        ...     filename="context_recall_results.csv",
        ... )
    """
    df = result.to_dataframe()

    # An empty dataset simply contributes no extra columns.
    keys = list(dataset[0].keys()) if dataset else []

    # Add dataset information
    for key in keys:
        df[key] = [sample.get(key, "") for sample in dataset]

    # Summary row: label it in the first dataset column, blank the others.
    summary_row: Dict[str, Any] = {
        key: "SUMMARY" if i == 0 else "" for i, key in enumerate(keys)
    }
    summary_row.update(
        {
            "old_score": result.old_mean,
            "new_score": result.new_mean,
            "diff": result.mean_diff,
            "abs_diff": _nan_safe(np.mean, np.abs(np.asarray(result.diffs, dtype=float))),
        }
    )

    df = pd.concat([df, pd.DataFrame([summary_row])], ignore_index=True)
    df.to_csv(filename, index=False)
    print(f"Results exported to {filename}")