# Metric Score Diff Checker

This notebook provides utilities to compare scores between different metric versions, algorithms, LLMs, or datasets.

## Dataset
This notebook uses the amnesty_qa dataset, which contains human-rights-related Q&A pairs. It will attempt to load the dataset from HuggingFace and fall back to local samples if it is unavailable.

In [None]:
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Tuple

import numpy as np
import pandas as pd

# Ragas imports
from ragas.dataset_schema import SingleTurnSample

## Setup

Make sure you have your OpenAI API key set as an environment variable before running this notebook.

In [None]:
import os

# Verify the OpenAI credentials up front: an unset or empty OPENAI_API_KEY
# stops the notebook here with setup instructions.
if os.environ.get("OPENAI_API_KEY"):
    print("✓ OpenAI API key found")
else:
    raise ValueError(
        "OPENAI_API_KEY environment variable not set. "
        "Please set it before running this notebook:\n"
        "  export OPENAI_API_KEY='your-api-key-here'"
    )

## Utility Functions

In [None]:
@dataclass
class MetricDiffResult:
    """Container for the outcome of comparing two metrics on one dataset.

    ``to_dataframe`` places ``old_scores``, ``new_scores`` and ``diffs`` side
    by side as columns, so the three lists are expected to have the same
    length, one entry per sample.
    """

    # Per-sample scores of the baseline ("old") metric.
    old_scores: List[float]
    # Per-sample scores of the candidate ("new") metric.
    new_scores: List[float]
    # Per-sample differences, reported as (new - old).
    diffs: List[float]
    # Aggregate statistics over ``diffs``.
    mean_diff: float
    max_diff: float
    min_diff: float
    std_diff: float
    # Mean score of each metric.
    old_mean: float
    new_mean: float
    # Execution time of each metric run, in seconds.
    old_time: float
    new_time: float

    def to_dataframe(self) -> pd.DataFrame:
        """Return the per-sample results as a DataFrame.

        Returns:
            A DataFrame with one row per sample and the columns
            ``old_score``, ``new_score``, ``diff`` and ``abs_diff``
            (the absolute value of ``diff``).
        """
        return pd.DataFrame(
            {
                "old_score": self.old_scores,
                "new_score": self.new_scores,
                "diff": self.diffs,
                "abs_diff": [abs(d) for d in self.diffs],
            }
        )

    def print_summary(self) -> None:
        """Print a human-readable summary of the comparison to stdout."""
        print("=" * 60)
        print("METRIC COMPARISON SUMMARY")
        print("=" * 60)
        print("\nScore Statistics:")
        print(f"  Old Metric Mean: {self.old_mean:.4f}")
        print(f"  New Metric Mean: {self.new_mean:.4f}")
        print("\nDifference Statistics (new - old):")
        print(f"  Mean Diff:   {self.mean_diff:.4f}")
        print(f"  Max Diff:    {self.max_diff:.4f}")
        print(f"  Min Diff:    {self.min_diff:.4f}")
        print(f"  Std Dev:     {self.std_diff:.4f}")
        print("\nExecution Time:")
        print(f"  Old Metric:  {self.old_time:.2f}s")
        print(f"  New Metric:  {self.new_time:.2f}s")
        if self.new_time > 0:
            speedup = f"{self.old_time / self.new_time:.2f}x"
        else:
            # A non-positive new time makes the ratio undefined; keep the
            # "Speedup:" label so the row stays aligned with the others.
            speedup = "N/A"
        print(f"  Speedup:     {speedup}")
        print("=" * 60)

In [None]:
async def run_metric_on_dataset(
    metric: Any, dataset: List[Dict[str, Any]], metric_type: str = "old"
) -> Tuple[List[float], float]:
    """
    Run a metric on a dataset and return scores with execution time.

    Samples are scored one after another. A sample that raises is reported on
    stdout and recorded as NaN, so the returned list always has one entry per
    input sample, in input order.

    Args:
        metric: The metric instance (either old or new style)
        dataset: List of dictionaries containing the data samples
        metric_type: "old" for legacy metrics (scored through
            ``_single_turn_ascore`` on a ``SingleTurnSample``); any other
            value is treated as a collections metric (scored through
            ``ascore(**sample)``, reading ``.value`` from the result)

    Returns:
        Tuple of (scores list, execution time in seconds)
    """
    scores: List[float] = []
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()

    for sample_dict in dataset:
        try:
            if metric_type == "old":
                # Old metrics use SingleTurnSample
                sample = SingleTurnSample(**sample_dict)
                score = await metric._single_turn_ascore(sample, callbacks=None)
            else:
                # New metrics use direct kwargs
                result = await metric.ascore(**sample_dict)
                score = result.value

            scores.append(float(score))
        except Exception as e:
            # Deliberate best effort: one failing sample must not abort the
            # run; NaN keeps the score list aligned with the dataset.
            print(f"Error processing sample: {e}")
            scores.append(np.nan)

    execution_time = time.perf_counter() - start_time
    return scores, execution_time

In [None]:
def _nan_safe_stat(reducer: Any, values: List[float]) -> float:
    """Apply ``reducer`` to the non-NaN entries of ``values``; NaN if none remain."""
    arr = np.asarray(values, dtype=float)
    valid = arr[~np.isnan(arr)]
    if valid.size == 0:
        return float("nan")
    return float(reducer(valid))


async def compare_metrics(
    old_metric: Any,
    new_metric: Any,
    dataset: List[Dict[str, Any]],
    old_metric_type: str = "old",
    new_metric_type: str = "new",
) -> MetricDiffResult:
    """
    Compare two metrics on the same dataset.

    The old metric is run over the whole dataset first, then the new one.
    Samples that failed to score are recorded as NaN by
    ``run_metric_on_dataset``; they stay in the per-sample lists but are
    skipped when computing the summary statistics, so a single failure does
    not turn every aggregate into NaN. A statistic with no valid input
    (for example an empty dataset) is reported as NaN.

    Args:
        old_metric: The baseline/old metric instance
        new_metric: The new/updated metric instance
        dataset: List of dictionaries containing the data samples
        old_metric_type: Type identifier for old metric ("old" or "new")
        new_metric_type: Type identifier for new metric ("old" or "new")

    Returns:
        MetricDiffResult containing comparison statistics. The diff
        statistics cover samples scored by both metrics; ``old_mean`` and
        ``new_mean`` each cover the samples that metric scored.
    """
    print(f"Running old metric on {len(dataset)} samples...")
    old_scores, old_time = await run_metric_on_dataset(
        old_metric, dataset, old_metric_type
    )

    print(f"Running new metric on {len(dataset)} samples...")
    new_scores, new_time = await run_metric_on_dataset(
        new_metric, dataset, new_metric_type
    )

    # Calculate differences (NaN wherever either metric failed on the sample)
    diffs = [new - old for old, new in zip(old_scores, new_scores)]

    return MetricDiffResult(
        old_scores=old_scores,
        new_scores=new_scores,
        diffs=diffs,
        mean_diff=_nan_safe_stat(np.mean, diffs),
        max_diff=_nan_safe_stat(np.max, diffs),
        min_diff=_nan_safe_stat(np.min, diffs),
        std_diff=_nan_safe_stat(np.std, diffs),
        old_mean=_nan_safe_stat(np.mean, old_scores),
        new_mean=_nan_safe_stat(np.mean, new_scores),
        old_time=old_time,
        new_time=new_time,
    )

## Example 1: Compare Answer Relevancy (Old vs New Implementation)

Compare the legacy `AnswerRelevancy` from `ragas.metrics` with the new `AnswerRelevancy` from `ragas.metrics.collections`.

In [None]:
# Setup LLMs and Embeddings
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from openai import AsyncOpenAI

from ragas.embeddings.base import LangchainEmbeddingsWrapper, embedding_factory
from ragas.llms.base import LangchainLLMWrapper, instructor_llm_factory

# For old metric (legacy) - wrap langchain components
# The legacy metric is given ragas wrapper objects built around the
# LangChain chat model and embeddings.
langchain_llm = ChatOpenAI(model="gpt-4o-mini")
legacy_llm = LangchainLLMWrapper(langchain_llm)

langchain_embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
legacy_embeddings = LangchainEmbeddingsWrapper(langchain_embeddings)

# For new metric (modern)
# Built from a single AsyncOpenAI client. The model names match the legacy
# stack above ("gpt-4o-mini" and "text-embedding-ada-002"), so both metric
# implementations are configured with the same models.
client = AsyncOpenAI()
modern_llm = instructor_llm_factory("openai", client=client, model="gpt-4o-mini")
modern_embeddings = embedding_factory(
    "openai", model="text-embedding-ada-002", client=client, interface="modern"
)

In [None]:
# Import metrics
from ragas.metrics._answer_relevance import AnswerRelevancy as OldAnswerRelevancy
from ragas.metrics.collections._answer_relevancy import (
    AnswerRelevancy as NewAnswerRelevancy,
)

# Initialize metrics
# Both implementations receive the same strictness value (3); the legacy one
# gets the wrapped LangChain components, the new one the modern LLM/embeddings.
old_metric = OldAnswerRelevancy(
    llm=legacy_llm, embeddings=legacy_embeddings, strictness=3
)

new_metric = NewAnswerRelevancy(
    llm=modern_llm, embeddings=modern_embeddings, strictness=3
)

In [None]:
# Load amnesty dataset
import sys
from pathlib import Path

# Add tests directory to path
# This must happen BEFORE importing from the ``tests`` package below: the
# import can only succeed once the directory that contains ``tests`` is on
# sys.path.
tests_dir = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
import_root = str(tests_dir.parent if tests_dir.name == "tests" else tests_dir)
if import_root not in sys.path:
    # Guarded so that re-running the cell does not pile up duplicate entries.
    sys.path.insert(0, import_root)

from tests.e2e.test_dataset_utils import load_amnesty_dataset_safe  # noqa: E402

# Load the dataset (will use HuggingFace or fallback to local samples)
amnesty_dataset = load_amnesty_dataset_safe("english_v3")

# Convert to list of dicts for our utility functions
# We'll use a subset for faster testing
test_dataset = []
for i, sample in enumerate(amnesty_dataset):
    if i >= 5:  # Limit to 5 samples for faster testing
        break
    # Keep only the two fields that are passed to the metrics below.
    test_dataset.append(
        {"user_input": sample["user_input"], "response": sample["response"]}
    )

print(f"Test dataset contains {len(test_dataset)} samples from amnesty_qa")
if test_dataset:
    # Only preview a sample when there is one; indexing an empty list would
    # raise IndexError.
    print("\nFirst sample:")
    print(f"Question: {test_dataset[0]['user_input']}")
    print(f"Response: {test_dataset[0]['response'][:100]}...")

In [None]:
# Run comparison
# Top-level ``await`` relies on the notebook's running event loop.
# The type flags tell the helper which calling convention each metric uses:
# "old" for the legacy metric, "new" for the collections metric.
result = await compare_metrics(
    old_metric=old_metric,
    new_metric=new_metric,
    dataset=test_dataset,
    old_metric_type="old",
    new_metric_type="new",
)

# Print summary
result.print_summary()

In [None]:
# Per-sample table: attach each question to its scores and show the question
# as the first column. The trailing bare ``df`` renders the table as the
# cell output.
df = result.to_dataframe().assign(
    user_input=[sample["user_input"] for sample in test_dataset]
)[["user_input", "old_score", "new_score", "diff", "abs_diff"]]
df

In [None]:
# Visualize the differences
import matplotlib.pyplot as plt

# One figure with two side-by-side panels: raw scores on the left,
# per-sample differences on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Score comparison
# Both metrics are drawn against the sample index so that the two scores of
# the same sample line up vertically.
x = range(len(result.old_scores))
axes[0].plot(x, result.old_scores, "o-", label="Old Metric", linewidth=2)
axes[0].plot(x, result.new_scores, "s-", label="New Metric", linewidth=2)
axes[0].set_xlabel("Sample Index")
axes[0].set_ylabel("Score")
axes[0].set_title("Metric Scores Comparison")
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Plot 2: Difference distribution
# One bar per sample showing (new - old); the dashed red line marks zero,
# i.e. where both metrics agree.
axes[1].bar(x, result.diffs, alpha=0.7)
axes[1].axhline(y=0, color="r", linestyle="--", linewidth=1)
axes[1].set_xlabel("Sample Index")
axes[1].set_ylabel("Difference (New - Old)")
axes[1].set_title("Score Differences")
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Example 2: Compare Same Metric with Different LLMs

Compare how the same metric performs with different LLM models.

In [None]:
# Create two instances with different LLMs
# NOTE: this rebinds the notebook-level ``client`` name to a fresh
# AsyncOpenAI instance.
client = AsyncOpenAI()

# The two LLMs differ only in the model name passed to the factory.
llm_gpt4_mini = instructor_llm_factory("openai", client=client, model="gpt-4o-mini")
llm_gpt4 = instructor_llm_factory("openai", client=client, model="gpt-4o")

# A single embeddings object is shared by both metric instances below.
embeddings = embedding_factory(
    "openai", model="text-embedding-ada-002", client=client, interface="modern"
)

# Same metric class, same embeddings, same strictness - only the LLM varies.
metric_gpt4_mini = NewAnswerRelevancy(
    llm=llm_gpt4_mini, embeddings=embeddings, strictness=3
)

metric_gpt4 = NewAnswerRelevancy(llm=llm_gpt4, embeddings=embeddings, strictness=3)

In [None]:
# Compare LLMs
# Both instances are collections-style metrics, so both type flags are "new".
# In the printed summary, "Old" refers to the gpt-4o-mini instance (passed as
# old_metric) and "New" to the gpt-4o instance (passed as new_metric).
result_llm = await compare_metrics(
    old_metric=metric_gpt4_mini,
    new_metric=metric_gpt4,
    dataset=test_dataset,
    old_metric_type="new",
    new_metric_type="new",
)

result_llm.print_summary()

## Example 3: Compare with Different Datasets

Run the same metric on different datasets (here, two subsets of the amnesty_qa samples loaded above) to test metric consistency.

In [None]:
# Split the loaded amnesty samples into two small evaluation subsets.
# Subset 1 is always the first two samples.
dataset_subset_1 = test_dataset[:2]

if len(test_dataset) < 4:
    # Too few samples for a distinct second pair: fall back to the first two,
    # which makes both subsets identical.
    dataset_subset_2 = test_dataset[:2]
else:
    # Samples at index 2 and 3.
    dataset_subset_2 = test_dataset[2:4]

print(f"Subset 1: {len(dataset_subset_1)} samples")
print(f"Subset 2: {len(dataset_subset_2)} samples")

In [None]:
# Compare same metric on different dataset subsets
print("\n=== Dataset Subset 1 ===")
scores_subset_1, time_subset_1 = await run_metric_on_dataset(
    new_metric, dataset_subset_1, "new"
)
print(f"Mean score: {np.mean(scores_subset_1):.4f}")
print(f"Execution time: {time_subset_1:.2f}s")

print("\n=== Dataset Subset 2 ===")
scores_subset_2, time_subset_2 = await run_metric_on_dataset(
    new_metric, dataset_subset_2, "new"
)
print(f"Mean score: {np.mean(scores_subset_2):.4f}")
print(f"Execution time: {time_subset_2:.2f}s")

## Utility: Export Results to CSV

In [None]:
def export_comparison_results(
    result: MetricDiffResult,
    dataset: List[Dict[str, Any]],
    filename: str = "metric_comparison_results.csv",
) -> None:
    """Export comparison results to CSV file.

    The file contains one row per sample (scores, diffs and the sample's own
    fields) followed by one final row labelled "SUMMARY" in the
    ``user_input`` column that carries the aggregate values.

    Args:
        result: Comparison result to export.
        dataset: The samples the result was computed on, in the same order.
            The keys of the first sample decide which columns are added, so
            every sample is expected to provide those keys. May be empty, in
            which case no sample columns are added.
        filename: Path of the CSV file to write.
    """
    df = result.to_dataframe()

    # Add dataset information
    if dataset:
        # Guarded: an empty dataset has no first sample to take keys from.
        for key in dataset[0]:
            df[key] = [sample[key] for sample in dataset]

    # NaN diffs mark samples that failed to score; leave them out so that a
    # single failure does not turn the summary value into NaN.
    valid_abs_diffs = [abs(d) for d in result.diffs if not np.isnan(d)]
    mean_abs_diff = (
        float(np.mean(valid_abs_diffs)) if valid_abs_diffs else float("nan")
    )

    # Add summary statistics as a separate row
    summary = pd.DataFrame(
        [
            {
                "user_input": "SUMMARY",
                "old_score": result.old_mean,
                "new_score": result.new_mean,
                "diff": result.mean_diff,
                "abs_diff": mean_abs_diff,
            }
        ]
    )

    df = pd.concat([df, summary], ignore_index=True)
    df.to_csv(filename, index=False)
    print(f"Results exported to {filename}")


# Example usage
# Writes the Example 1 comparison (``result``) together with the samples it
# was computed on to "answer_relevancy_comparison.csv".
export_comparison_results(result, test_dataset, "answer_relevancy_comparison.csv")