From ee408796801036f2ca885facb261aaaa6b28876d Mon Sep 17 00:00:00 2001 From: Rahul Bhatnagar Date: Tue, 4 Nov 2025 23:26:18 -0500 Subject: [PATCH] Migrate factual correctness --- src/ragas/metrics/collections/__init__.py | 2 + .../collections/_factual_correctness.py | 359 ++++++++++++++++++ .../test_factual_correctness_migration.py | 249 ++++++++++++ .../test_summary_score_migration.py | 4 +- 4 files changed, 612 insertions(+), 2 deletions(-) create mode 100644 src/ragas/metrics/collections/_factual_correctness.py create mode 100644 tests/e2e/metrics_migration/test_factual_correctness_migration.py diff --git a/src/ragas/metrics/collections/__init__.py b/src/ragas/metrics/collections/__init__.py index c935ca496..ae7fd93e7 100644 --- a/src/ragas/metrics/collections/__init__.py +++ b/src/ragas/metrics/collections/__init__.py @@ -7,6 +7,7 @@ from ragas.metrics.collections._bleu_score import BleuScore from ragas.metrics.collections._context_entity_recall import ContextEntityRecall from ragas.metrics.collections._context_relevance import ContextRelevance +from ragas.metrics.collections._factual_correctness import FactualCorrectness from ragas.metrics.collections._faithfulness import Faithfulness from ragas.metrics.collections._noise_sensitivity import NoiseSensitivity from ragas.metrics.collections._rouge_score import RougeScore @@ -31,6 +32,7 @@ "ContextRelevance", "DistanceMeasure", "ExactMatch", + "FactualCorrectness", "Faithfulness", "NoiseSensitivity", "NonLLMStringSimilarity", diff --git a/src/ragas/metrics/collections/_factual_correctness.py b/src/ragas/metrics/collections/_factual_correctness.py new file mode 100644 index 000000000..b12ba580b --- /dev/null +++ b/src/ragas/metrics/collections/_factual_correctness.py @@ -0,0 +1,359 @@ +"""Factual Correctness metric v2 - Modern implementation with multi-modal scoring.""" + +import typing as t +from typing import List + +import numpy as np +from pydantic import BaseModel + +from ragas.metrics.collections.base import BaseMetric +from ragas.metrics.result import MetricResult +from ragas.metrics.utils import fbeta_score +from ragas.prompt.metrics.common import nli_statement_prompt + +if t.TYPE_CHECKING: + from ragas.llms.base import InstructorBaseRagasLLM + + +class ClaimDecompositionOutput(BaseModel): + """Structured output for claim decomposition.""" + + claims: List[str] + + +class StatementFaithfulnessAnswer(BaseModel): + """Individual statement with reason and verdict for NLI evaluation.""" + + statement: str + reason: str + verdict: int + + +class NLIStatementOutput(BaseModel): + """Structured output for NLI statement evaluation.""" + + statements: List[StatementFaithfulnessAnswer] + + +def claim_decomposition_prompt( + response: str, atomicity: str = "low", coverage: str = "low" +) -> str: + """ + V1-identical claim decomposition prompt with configurable atomicity/coverage. + + Args: + response: The response text to break down into claims + atomicity: Level of atomicity ("low" or "high") + coverage: Level of coverage ("low" or "high") + + Returns: + V1-identical prompt string for the LLM + """ + import json + + safe_response = json.dumps(response) + + # Select examples based on atomicity and coverage configuration + if atomicity == "low" and coverage == "low": + examples = [ + { + "input": { + "response": "Charles Babbage was a French mathematician, philosopher, and food critic." 
+ }, + "output": { + "claims": ["Charles Babbage was a mathematician and philosopher."] + }, + }, + { + "input": { + "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics." + }, + "output": { + "claims": [ + "Albert Einstein was a German physicist.", + "Albert Einstein developed relativity and contributed to quantum mechanics.", + ] + }, + }, + ] + elif atomicity == "low" and coverage == "high": + examples = [ + { + "input": { + "response": "Charles Babbage was a French mathematician, philosopher, and food critic." + }, + "output": { + "claims": [ + "Charles Babbage was a French mathematician, philosopher, and food critic." + ] + }, + }, + { + "input": { + "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics." + }, + "output": { + "claims": [ + "Albert Einstein was a German theoretical physicist.", + "Albert Einstein developed the theory of relativity and also contributed to the development of quantum mechanics.", + ] + }, + }, + ] + elif atomicity == "high" and coverage == "low": + examples = [ + { + "input": { + "response": "Charles Babbage was a French mathematician, philosopher, and food critic." + }, + "output": { + "claims": [ + "Charles Babbage was a mathematician.", + "Charles Babbage was a philosopher.", + ] + }, + }, + { + "input": { + "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics." + }, + "output": { + "claims": [ + "Albert Einstein was a German theoretical physicist.", + "Albert Einstein developed the theory of relativity.", + ] + }, + }, + ] + else: # high atomicity, high coverage + examples = [ + { + "input": { + "response": "Charles Babbage was a French mathematician, philosopher, and food critic." + }, + "output": { + "claims": [ + "Charles Babbage was a mathematician.", + "Charles Babbage was a philosopher.", + "Charles Babbage was a food critic.", + "Charles Babbage was French.", + ] + }, + }, + { + "input": { + "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics." + }, + "output": { + "claims": [ + "Albert Einstein was a German theoretical physicist.", + "Albert Einstein developed the theory of relativity.", + "Albert Einstein contributed to the development of quantum mechanics.", + ] + }, + }, + ] + + # Build examples string + examples_str = "\n".join( + [ + f"""Example {i + 1} +Input: {json.dumps(ex["input"], indent=4)} +Output: {json.dumps(ex["output"], indent=4)}""" + for i, ex in enumerate(examples) + ] + ) + + return f"""Decompose and break down each of the input sentences into one or more standalone statements. Each statement should be a standalone claim that can be independently verified. +Follow the level of atomicity and coverage as shown in the examples. +Please return the output in a JSON format that complies with the following schema as specified in JSON Schema: +{{"properties": {{"claims": {{"description": "Decomposed Claims", "items": {{"type": "string"}}, "title": "Claims", "type": "array"}}}}, "required": ["claims"], "title": "ClaimDecompositionOutput", "type": "object"}}Do not use single quotes in your response but double quotes,properly escaped with a backslash. 
+ +--------EXAMPLES----------- +{examples_str} +----------------------------- + +Now perform the same with the following input +input: {{ + "response": {safe_response} +}} +Output: """ + + +class FactualCorrectness(BaseMetric): + """ + Modern v2 implementation of factual correctness evaluation. + + Evaluates the factual correctness of responses by comparing claims made in the response + against a reference text. Uses claim decomposition and natural language inference (NLI) + to verify claims in both directions. + + The metric supports three evaluation modes: + - Precision: What fraction of response claims are supported by reference + - Recall: What fraction of reference claims are covered by response + - F1: Harmonic mean of precision and recall (with configurable beta) + + The metric also supports configurable claim decomposition: + - Atomicity: "low" (fewer, broader claims) vs "high" (more, atomic claims) + - Coverage: "low" (partial coverage) vs "high" (comprehensive coverage) + + Usage: + >>> import instructor + >>> from openai import AsyncOpenAI + >>> from ragas.llms.base import llm_factory + >>> from ragas.metrics.collections import FactualCorrectness + >>> + >>> # Setup dependencies + >>> client = AsyncOpenAI() + >>> llm = llm_factory("gpt-4o-mini", client=client) + >>> + >>> # Create metric instance + >>> metric = FactualCorrectness(llm=llm, mode="f1", beta=1.0) + >>> + >>> # Single evaluation + >>> result = await metric.ascore( + ... response="Einstein was born in Germany in 1879.", + ... reference="Albert Einstein was born in Ulm, Germany on March 14, 1879." + ... ) + >>> print(f"Factual Correctness: {result.value}") + + Attributes: + llm: Modern instructor-based LLM for claim decomposition and NLI evaluation + mode: Evaluation mode ("precision", "recall", or "f1") + beta: Beta parameter for F1 score (>1 favors recall, <1 favors precision) + atomicity: Claim decomposition atomicity ("low" or "high") + coverage: Claim decomposition coverage ("low" or "high") + name: The metric name + allowed_values: Score range (0.0 to 1.0, higher is better) + """ + + # Type hints for linter (attributes are set in __init__) + llm: "InstructorBaseRagasLLM" + + def __init__( + self, + llm: "InstructorBaseRagasLLM", + mode: t.Literal["precision", "recall", "f1"] = "f1", + beta: float = 1.0, + atomicity: t.Literal["low", "high"] = "low", + coverage: t.Literal["low", "high"] = "low", + name: str = "factual_correctness", + **kwargs, + ): + """ + Initialize FactualCorrectness metric with required components. + + Args: + llm: Modern instructor-based LLM for claim decomposition and NLI evaluation + mode: Evaluation mode ("precision", "recall", or "f1") + beta: Beta parameter for F1 score (>1 favors recall, <1 favors precision) + atomicity: Claim decomposition atomicity ("low" or "high") + coverage: Claim decomposition coverage ("low" or "high") + name: The metric name + """ + # Set attributes explicitly before calling super() + self.llm = llm + self.mode = mode + self.beta = beta + self.atomicity = atomicity + self.coverage = coverage + + # Validate beta parameter + if not isinstance(beta, (int, float)): + raise ValueError( + "Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision." + ) + + # Call super() for validation (without passing llm in kwargs) + super().__init__(name=name, **kwargs) + + async def ascore(self, response: str, reference: str) -> MetricResult: + """ + Calculate factual correctness score. 
+
+        Args:
+            response: The response to evaluate for factual correctness
+            reference: The reference text to check claims against
+
+        Returns:
+            MetricResult with factual correctness score (0.0-1.0, higher is better)
+        """
+        # Input validation
+        if not response:
+            raise ValueError(
+                "response is missing. Please add response to the test sample."
+            )
+        if not reference:
+            raise ValueError(
+                "reference is missing. Please add reference to the test sample."
+            )
+
+        # Step 1: Get claim verifications based on mode
+        if self.mode != "recall":
+            # For precision and f1: response claims → reference verification
+            response_verified = await self._decompose_and_verify_claims(
+                response, reference
+            )
+        else:
+            response_verified = np.array([], dtype=bool)
+
+        if self.mode != "precision":
+            # For recall and f1: reference claims → response verification
+            reference_verified = await self._decompose_and_verify_claims(
+                reference, response
+            )
+        else:
+            reference_verified = np.array([], dtype=bool)
+
+        # Step 2: Compute TP, FP, FN
+        if self.mode != "recall":
+            tp = int(np.sum(response_verified))
+            fp = int(np.sum(~response_verified))
+        else:
+            tp = int(np.sum(reference_verified))
+            fp = 0
+
+        if self.mode != "precision":
+            fn = int(np.sum(~reference_verified))
+        else:
+            fn = 0
+
+        # Step 3: Compute final score based on mode
+        if self.mode == "precision":
+            score = tp / (tp + fp + 1e-8)
+        elif self.mode == "recall":
+            score = tp / (tp + fn + 1e-8)
+        else:  # f1
+            score = fbeta_score(tp, fp, fn, self.beta)
+
+        return MetricResult(value=float(np.round(score, 2)))
+
+    async def _decompose_claims(self, response: str) -> List[str]:
+        """Break response into claims using configurable decomposition."""
+        prompt = claim_decomposition_prompt(
+            response, atomicity=self.atomicity, coverage=self.coverage
+        )
+        result = await self.llm.agenerate(prompt, ClaimDecompositionOutput)
+        return result.claims
+
+    async def _verify_claims(
+        self, claims: List[str], reference: str
+    ) -> NLIStatementOutput:
+        """Verify claims against reference using NLI."""
+        prompt = nli_statement_prompt(reference, claims)
+        result = await self.llm.agenerate(prompt, NLIStatementOutput)
+        return result
+
+    async def _decompose_and_verify_claims(
+        self, text_to_decompose: str, reference_text: str
+    ) -> np.ndarray:
+        """Decompose text into claims and verify against reference."""
+        claims = await self._decompose_claims(text_to_decompose)
+        if not claims:
+            return np.array([], dtype=bool)
+
+        verdicts = await self._verify_claims(claims, reference_text)
+        if not verdicts.statements:
+            return np.array([], dtype=bool)
+
+        return np.array([bool(stmt.verdict) for stmt in verdicts.statements])
diff --git a/tests/e2e/metrics_migration/test_factual_correctness_migration.py b/tests/e2e/metrics_migration/test_factual_correctness_migration.py
new file mode 100644
index 000000000..14daed502
--- /dev/null
+++ b/tests/e2e/metrics_migration/test_factual_correctness_migration.py
@@ -0,0 +1,249 @@
+"""E2E tests for FactualCorrectness metric migration from v1 to v2."""
+
+import numpy as np
+import pytest
+
+from ragas.dataset_schema import SingleTurnSample
+from ragas.metrics._factual_correctness import (
+    FactualCorrectness as LegacyFactualCorrectness,
+)
+from ragas.metrics.collections import FactualCorrectness
+
+
+class TestFactualCorrectnessE2EMigration:
+    """E2E test compatibility between legacy FactualCorrectness and new V2 FactualCorrectness with modern components."""
+
+    @pytest.fixture
+    def sample_data(self):
+        """Real-world test cases for factual 
correctness evaluation.""" + return [ + { + "response": "Einstein was born in Germany on 14th March 1879.", + "reference": "Albert Einstein was born in Ulm, Germany on March 14, 1879.", + "description": "High factual correctness - consistent facts", + }, + { + "response": "Einstein was born in France on 14th March 1879.", + "reference": "Albert Einstein was born in Ulm, Germany on March 14, 1879.", + "description": "Low factual correctness - wrong country", + }, + { + "response": "The first superbowl was held on Jan 15, 1967.", + "reference": "The First AFL–NFL World Championship Game was played on January 15, 1967.", + "description": "Perfect factual correctness - exact match", + }, + { + "response": "Photosynthesis converts sunlight into energy and produces oxygen.", + "reference": "Photosynthesis is the process by which plants convert sunlight into energy and produce oxygen as a byproduct.", + "description": "High factual correctness - covers key facts", + }, + { + "response": "Newton discovered gravity when an apple fell on his head.", + "reference": "Newton developed his theory of universal gravitation, though the apple story is likely apocryphal.", + "description": "Mixed factual correctness - partially correct", + }, + ] + + @pytest.fixture + def test_llm(self): + """Create a LangChain LLM for legacy factual correctness evaluation.""" + try: + from langchain_openai import ChatOpenAI + + from ragas.llms import LangchainLLMWrapper + + langchain_llm = ChatOpenAI(model="gpt-4o", temperature=0.01) + return LangchainLLMWrapper(langchain_llm) + except ImportError as e: + pytest.skip(f"LangChain LLM not available: {e}") + except Exception as e: + pytest.skip(f"Could not create LangChain LLM (API key may be missing): {e}") + + @pytest.fixture + def test_modern_llm(self): + """Create a modern instructor LLM for v2 implementation.""" + try: + import openai + + from ragas.llms.base import llm_factory + + client = openai.AsyncOpenAI() + return llm_factory("gpt-4o", client=client) + except ImportError as e: + pytest.skip(f"LLM factory not available: {e}") + except Exception as e: + pytest.skip(f"Could not create modern LLM (API key may be missing): {e}") + + @pytest.mark.asyncio + async def test_legacy_factual_correctness_vs_v2_factual_correctness_e2e_compatibility( + self, sample_data, test_llm, test_modern_llm + ): + """E2E test that legacy and v2 implementations produce similar scores.""" + + if test_llm is None or test_modern_llm is None: + pytest.skip("LLM required for E2E testing") + + # Test different modes and configurations + test_configs = [ + {"mode": "f1", "atomicity": "low", "coverage": "low"}, + {"mode": "precision", "atomicity": "high", "coverage": "high"}, + {"mode": "recall", "atomicity": "low", "coverage": "high"}, + ] + + for config in test_configs: + print(f"\n🧪 Testing FactualCorrectness - Config: {config}") + + for i, data in enumerate(sample_data): + print(f"\n Case {i + 1}: {data['description']}") + print(f" Response: {data['response'][:80]}...") + print(f" Reference: {data['reference'][:80]}...") + + # Legacy implementation + legacy_correctness = LegacyFactualCorrectness( + llm=test_llm, + mode=config["mode"], # type: ignore[arg-type] + atomicity=config["atomicity"], # type: ignore[arg-type] + coverage=config["coverage"], # type: ignore[arg-type] + ) + legacy_sample = SingleTurnSample( + response=data["response"], + reference=data["reference"], + ) + legacy_score = await legacy_correctness._single_turn_ascore( + legacy_sample, None + ) + + # V2 implementation + 
v2_correctness = FactualCorrectness( + llm=test_modern_llm, + mode=config["mode"], # type: ignore[arg-type] + atomicity=config["atomicity"], # type: ignore[arg-type] + coverage=config["coverage"], # type: ignore[arg-type] + ) + v2_result = await v2_correctness.ascore( + response=data["response"], + reference=data["reference"], + ) + + score_diff = abs(legacy_score - v2_result.value) + print(f" Legacy: {legacy_score:.6f}") + print(f" V2: {v2_result.value:.6f}") + print(f" Diff: {score_diff:.6f}") + + # Ensure implementations give reasonably similar scores + # Factual correctness may have more variation due to claim decomposition and different LLM behavior + assert score_diff < 0.35, ( + f"Legacy and V2 scores should be similar: Legacy={legacy_score:.6f}, " + f"V2={v2_result.value:.6f}, Diff={score_diff:.6f} (tolerance: 0.35)" + ) + print(" ✅ Both implementations give consistent scores") + + # Validate score ranges (both should be 0-1 or NaN) + if not np.isnan(legacy_score): + assert 0.0 <= legacy_score <= 1.0 + if not np.isnan(v2_result.value): + assert 0.0 <= v2_result.value <= 1.0 + + @pytest.mark.asyncio + async def test_factual_correctness_edge_cases(self, test_modern_llm): + """Test edge cases like empty responses and references.""" + + if test_modern_llm is None: + pytest.skip("Modern LLM required for edge case testing") + + metric = FactualCorrectness(llm=test_modern_llm) + + # Test empty response + with pytest.raises(ValueError, match="response is missing"): + await metric.ascore( + response="", + reference="Einstein was born in Germany.", + ) + + # Test empty reference + with pytest.raises(ValueError, match="reference is missing"): + await metric.ascore( + response="Einstein was born in Germany.", + reference="", + ) + + @pytest.mark.asyncio + async def test_factual_correctness_different_modes(self, test_modern_llm): + """Test that different modes (precision, recall, f1) produce different scores.""" + + if test_modern_llm is None: + pytest.skip("Modern LLM required for mode testing") + + response = "Einstein was a physicist born in Germany." + reference = "Albert Einstein was a German-born theoretical physicist who developed the theory of relativity." + + # Test different modes + precision_metric = FactualCorrectness(llm=test_modern_llm, mode="precision") + recall_metric = FactualCorrectness(llm=test_modern_llm, mode="recall") + f1_metric = FactualCorrectness(llm=test_modern_llm, mode="f1") + + precision_result = await precision_metric.ascore( + response=response, reference=reference + ) + recall_result = await recall_metric.ascore( + response=response, reference=reference + ) + f1_result = await f1_metric.ascore(response=response, reference=reference) + + print(f"Precision score: {precision_result.value:.3f}") + print(f"Recall score: {recall_result.value:.3f}") + print(f"F1 score: {f1_result.value:.3f}") + + # Validate ranges + assert 0.0 <= precision_result.value <= 1.0 + assert 0.0 <= recall_result.value <= 1.0 + assert 0.0 <= f1_result.value <= 1.0 + + @pytest.mark.asyncio + async def test_factual_correctness_atomicity_coverage_configurations( + self, test_modern_llm + ): + """Test that different atomicity/coverage configurations work.""" + + if test_modern_llm is None: + pytest.skip("Modern LLM required for configuration testing") + + response = "Einstein was a German physicist who developed relativity theory." + reference = ( + "Albert Einstein was born in Germany and created the theory of relativity." 
+ ) + + configs = [ + {"atomicity": "low", "coverage": "low"}, + {"atomicity": "low", "coverage": "high"}, + {"atomicity": "high", "coverage": "low"}, + {"atomicity": "high", "coverage": "high"}, + ] + + for config in configs: + metric = FactualCorrectness( + llm=test_modern_llm, + atomicity=config["atomicity"], # type: ignore[arg-type] + coverage=config["coverage"], # type: ignore[arg-type] + ) + result = await metric.ascore(response=response, reference=reference) + + print(f"Config {config}: {result.value:.3f}") + + # Validate score range + assert 0.0 <= result.value <= 1.0, f"Invalid score for config {config}" + + def test_factual_correctness_migration_requirements_documented(self): + """Test that migration requirements are properly documented.""" + + # V2 implementation should not accept legacy components + with pytest.raises((TypeError, ValueError, AttributeError)): + FactualCorrectness(llm="invalid_llm_type") # type: ignore[arg-type] # Should reject string + + # V2 should only accept InstructorBaseRagasLLM + with pytest.raises((TypeError, ValueError, AttributeError)): + FactualCorrectness(llm=None) # type: ignore[arg-type] # Should reject None + + # Test beta validation + with pytest.raises(ValueError, match="Beta must be a float"): + FactualCorrectness(llm=None, beta="invalid") # type: ignore[arg-type] # Should reject non-numeric beta diff --git a/tests/e2e/metrics_migration/test_summary_score_migration.py b/tests/e2e/metrics_migration/test_summary_score_migration.py index df092d35e..1eed0ef06 100644 --- a/tests/e2e/metrics_migration/test_summary_score_migration.py +++ b/tests/e2e/metrics_migration/test_summary_score_migration.py @@ -177,8 +177,8 @@ def test_summary_score_migration_requirements_documented(self): # V2 implementation should not accept legacy components with pytest.raises((TypeError, ValueError, AttributeError)): - SummaryScore(llm="invalid_llm_type") # Should reject string + SummaryScore(llm="invalid_llm_type") # type: ignore[arg-type] # Should reject string # V2 should only accept InstructorBaseRagasLLM with pytest.raises((TypeError, ValueError, AttributeError)): - SummaryScore(llm=None) # Should reject None + SummaryScore(llm=None) # type: ignore[arg-type] # Should reject None
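
Usage sketch for the new collections-based metric, following the FactualCorrectness docstring above; the model name "gpt-4o-mini" and an OPENAI_API_KEY in the environment are illustrative assumptions.

import asyncio

from openai import AsyncOpenAI

from ragas.llms.base import llm_factory
from ragas.metrics.collections import FactualCorrectness


async def main() -> None:
    # Instructor-based LLM via llm_factory; the model choice is illustrative only.
    llm = llm_factory("gpt-4o-mini", client=AsyncOpenAI())

    # F1 mode with beta=1.0 balances precision and recall, as in the class docstring.
    metric = FactualCorrectness(llm=llm, mode="f1", beta=1.0)

    result = await metric.ascore(
        response="Einstein was born in Germany in 1879.",
        reference="Albert Einstein was born in Ulm, Germany on March 14, 1879.",
    )
    print(f"Factual Correctness: {result.value}")


asyncio.run(main())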