From ee408796801036f2ca885facb261aaaa6b28876d Mon Sep 17 00:00:00 2001 From: Rahul Bhatnagar Date: Tue, 4 Nov 2025 23:26:18 -0500 Subject: [PATCH] Migrate factual correctness --- src/ragas/metrics/collections/__init__.py | 2 + .../collections/_factual_correctness.py | 359 ++++++++++++++++++ .../test_factual_correctness_migration.py | 249 ++++++++++++ .../test_summary_score_migration.py | 4 +- 4 files changed, 612 insertions(+), 2 deletions(-) create mode 100644 src/ragas/metrics/collections/_factual_correctness.py create mode 100644 tests/e2e/metrics_migration/test_factual_correctness_migration.py diff --git a/src/ragas/metrics/collections/__init__.py b/src/ragas/metrics/collections/__init__.py index c935ca496..ae7fd93e7 100644 --- a/src/ragas/metrics/collections/__init__.py +++ b/src/ragas/metrics/collections/__init__.py @@ -7,6 +7,7 @@ from ragas.metrics.collections._bleu_score import BleuScore from ragas.metrics.collections._context_entity_recall import ContextEntityRecall from ragas.metrics.collections._context_relevance import ContextRelevance +from ragas.metrics.collections._factual_correctness import FactualCorrectness from ragas.metrics.collections._faithfulness import Faithfulness from ragas.metrics.collections._noise_sensitivity import NoiseSensitivity from ragas.metrics.collections._rouge_score import RougeScore @@ -31,6 +32,7 @@ "ContextRelevance", "DistanceMeasure", "ExactMatch", + "FactualCorrectness", "Faithfulness", "NoiseSensitivity", "NonLLMStringSimilarity", diff --git a/src/ragas/metrics/collections/_factual_correctness.py b/src/ragas/metrics/collections/_factual_correctness.py new file mode 100644 index 000000000..b12ba580b --- /dev/null +++ b/src/ragas/metrics/collections/_factual_correctness.py @@ -0,0 +1,359 @@ +"""Factual Correctness metric v2 - Modern implementation with multi-modal scoring.""" + +import typing as t +from typing import List + +import numpy as np +from pydantic import BaseModel + +from ragas.metrics.collections.base import BaseMetric +from ragas.metrics.result import MetricResult +from ragas.metrics.utils import fbeta_score +from ragas.prompt.metrics.common import nli_statement_prompt + +if t.TYPE_CHECKING: + from ragas.llms.base import InstructorBaseRagasLLM + + +class ClaimDecompositionOutput(BaseModel): + """Structured output for claim decomposition.""" + + claims: List[str] + + +class StatementFaithfulnessAnswer(BaseModel): + """Individual statement with reason and verdict for NLI evaluation.""" + + statement: str + reason: str + verdict: int + + +class NLIStatementOutput(BaseModel): + """Structured output for NLI statement evaluation.""" + + statements: List[StatementFaithfulnessAnswer] + + +def claim_decomposition_prompt( + response: str, atomicity: str = "low", coverage: str = "low" +) -> str: + """ + V1-identical claim decomposition prompt with configurable atomicity/coverage. + + Args: + response: The response text to break down into claims + atomicity: Level of atomicity ("low" or "high") + coverage: Level of coverage ("low" or "high") + + Returns: + V1-identical prompt string for the LLM + """ + import json + + safe_response = json.dumps(response) + + # Select examples based on atomicity and coverage configuration + if atomicity == "low" and coverage == "low": + examples = [ + { + "input": { + "response": "Charles Babbage was a French mathematician, philosopher, and food critic." 
+ }, + "output": { + "claims": ["Charles Babbage was a mathematician and philosopher."] + }, + }, + { + "input": { + "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics." + }, + "output": { + "claims": [ + "Albert Einstein was a German physicist.", + "Albert Einstein developed relativity and contributed to quantum mechanics.", + ] + }, + }, + ] + elif atomicity == "low" and coverage == "high": + examples = [ + { + "input": { + "response": "Charles Babbage was a French mathematician, philosopher, and food critic." + }, + "output": { + "claims": [ + "Charles Babbage was a French mathematician, philosopher, and food critic." + ] + }, + }, + { + "input": { + "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics." + }, + "output": { + "claims": [ + "Albert Einstein was a German theoretical physicist.", + "Albert Einstein developed the theory of relativity and also contributed to the development of quantum mechanics.", + ] + }, + }, + ] + elif atomicity == "high" and coverage == "low": + examples = [ + { + "input": { + "response": "Charles Babbage was a French mathematician, philosopher, and food critic." + }, + "output": { + "claims": [ + "Charles Babbage was a mathematician.", + "Charles Babbage was a philosopher.", + ] + }, + }, + { + "input": { + "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics." + }, + "output": { + "claims": [ + "Albert Einstein was a German theoretical physicist.", + "Albert Einstein developed the theory of relativity.", + ] + }, + }, + ] + else: # high atomicity, high coverage + examples = [ + { + "input": { + "response": "Charles Babbage was a French mathematician, philosopher, and food critic." + }, + "output": { + "claims": [ + "Charles Babbage was a mathematician.", + "Charles Babbage was a philosopher.", + "Charles Babbage was a food critic.", + "Charles Babbage was French.", + ] + }, + }, + { + "input": { + "response": "Albert Einstein was a German theoretical physicist. He developed the theory of relativity and also contributed to the development of quantum mechanics." + }, + "output": { + "claims": [ + "Albert Einstein was a German theoretical physicist.", + "Albert Einstein developed the theory of relativity.", + "Albert Einstein contributed to the development of quantum mechanics.", + ] + }, + }, + ] + + # Build examples string + examples_str = "\n".join( + [ + f"""Example {i + 1} +Input: {json.dumps(ex["input"], indent=4)} +Output: {json.dumps(ex["output"], indent=4)}""" + for i, ex in enumerate(examples) + ] + ) + + return f"""Decompose and break down each of the input sentences into one or more standalone statements. Each statement should be a standalone claim that can be independently verified. +Follow the level of atomicity and coverage as shown in the examples. +Please return the output in a JSON format that complies with the following schema as specified in JSON Schema: +{{"properties": {{"claims": {{"description": "Decomposed Claims", "items": {{"type": "string"}}, "title": "Claims", "type": "array"}}}}, "required": ["claims"], "title": "ClaimDecompositionOutput", "type": "object"}}Do not use single quotes in your response but double quotes,properly escaped with a backslash. 
+ +--------EXAMPLES----------- +{examples_str} +----------------------------- + +Now perform the same with the following input +input: {{ + "response": {safe_response} +}} +Output: """ + + +class FactualCorrectness(BaseMetric): + """ + Modern v2 implementation of factual correctness evaluation. + + Evaluates the factual correctness of responses by comparing claims made in the response + against a reference text. Uses claim decomposition and natural language inference (NLI) + to verify claims in both directions. + + The metric supports three evaluation modes: + - Precision: What fraction of response claims are supported by reference + - Recall: What fraction of reference claims are covered by response + - F1: Harmonic mean of precision and recall (with configurable beta) + + The metric also supports configurable claim decomposition: + - Atomicity: "low" (fewer, broader claims) vs "high" (more, atomic claims) + - Coverage: "low" (partial coverage) vs "high" (comprehensive coverage) + + Usage: + >>> import instructor + >>> from openai import AsyncOpenAI + >>> from ragas.llms.base import llm_factory + >>> from ragas.metrics.collections import FactualCorrectness + >>> + >>> # Setup dependencies + >>> client = AsyncOpenAI() + >>> llm = llm_factory("gpt-4o-mini", client=client) + >>> + >>> # Create metric instance + >>> metric = FactualCorrectness(llm=llm, mode="f1", beta=1.0) + >>> + >>> # Single evaluation + >>> result = await metric.ascore( + ... response="Einstein was born in Germany in 1879.", + ... reference="Albert Einstein was born in Ulm, Germany on March 14, 1879." + ... ) + >>> print(f"Factual Correctness: {result.value}") + + Attributes: + llm: Modern instructor-based LLM for claim decomposition and NLI evaluation + mode: Evaluation mode ("precision", "recall", or "f1") + beta: Beta parameter for F1 score (>1 favors recall, <1 favors precision) + atomicity: Claim decomposition atomicity ("low" or "high") + coverage: Claim decomposition coverage ("low" or "high") + name: The metric name + allowed_values: Score range (0.0 to 1.0, higher is better) + """ + + # Type hints for linter (attributes are set in __init__) + llm: "InstructorBaseRagasLLM" + + def __init__( + self, + llm: "InstructorBaseRagasLLM", + mode: t.Literal["precision", "recall", "f1"] = "f1", + beta: float = 1.0, + atomicity: t.Literal["low", "high"] = "low", + coverage: t.Literal["low", "high"] = "low", + name: str = "factual_correctness", + **kwargs, + ): + """ + Initialize FactualCorrectness metric with required components. + + Args: + llm: Modern instructor-based LLM for claim decomposition and NLI evaluation + mode: Evaluation mode ("precision", "recall", or "f1") + beta: Beta parameter for F1 score (>1 favors recall, <1 favors precision) + atomicity: Claim decomposition atomicity ("low" or "high") + coverage: Claim decomposition coverage ("low" or "high") + name: The metric name + """ + # Set attributes explicitly before calling super() + self.llm = llm + self.mode = mode + self.beta = beta + self.atomicity = atomicity + self.coverage = coverage + + # Validate beta parameter + if not isinstance(beta, (int, float)): + raise ValueError( + "Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision." + ) + + # Call super() for validation (without passing llm in kwargs) + super().__init__(name=name, **kwargs) + + async def ascore(self, response: str, reference: str) -> MetricResult: + """ + Calculate factual correctness score. 
+
+        Args:
+            response: The response to evaluate for factual correctness
+            reference: The reference text to check claims against
+
+        Returns:
+            MetricResult with factual correctness score (0.0-1.0, higher is better)
+        """
+        # Input validation
+        if not response:
+            raise ValueError(
+                "response is missing. Please add response to the test sample."
+            )
+        if not reference:
+            raise ValueError(
+                "reference is missing. Please add reference to the test sample."
+            )
+
+        # Step 1: Get claim verifications based on mode
+        if self.mode != "recall":
+            # For precision and f1: response claims → reference verification
+            response_verified = await self._decompose_and_verify_claims(
+                response, reference
+            )
+        else:
+            response_verified = np.array([], dtype=bool)
+
+        if self.mode != "precision":
+            # For recall and f1: reference claims → response verification
+            reference_verified = await self._decompose_and_verify_claims(
+                reference, response
+            )
+        else:
+            reference_verified = np.array([], dtype=bool)
+
+        # Step 2: Compute TP, FP, FN
+        if self.mode != "recall":
+            tp = int(np.sum(response_verified))
+            fp = int(np.sum(~response_verified))
+        else:
+            tp = int(np.sum(reference_verified))
+            fp = 0
+
+        if self.mode != "precision":
+            fn = int(np.sum(~reference_verified))
+        else:
+            fn = 0
+
+        # Step 3: Compute final score based on mode
+        if self.mode == "precision":
+            score = tp / (tp + fp + 1e-8)
+        elif self.mode == "recall":
+            score = tp / (tp + fn + 1e-8)
+        else:  # f1
+            score = fbeta_score(tp, fp, fn, self.beta)
+
+        return MetricResult(value=float(np.round(score, 2)))
+
+    async def _decompose_claims(self, response: str) -> List[str]:
+        """Break response into claims using configurable decomposition."""
+        prompt = claim_decomposition_prompt(
+            response, atomicity=self.atomicity, coverage=self.coverage
+        )
+        result = await self.llm.agenerate(prompt, ClaimDecompositionOutput)
+        return result.claims
+
+    async def _verify_claims(
+        self, claims: List[str], reference: str
+    ) -> NLIStatementOutput:
+        """Verify claims against reference using NLI."""
+        prompt = nli_statement_prompt(reference, claims)
+        result = await self.llm.agenerate(prompt, NLIStatementOutput)
+        return result
+
+    async def _decompose_and_verify_claims(
+        self, text_to_decompose: str, reference_text: str
+    ) -> np.ndarray:
+        """Decompose text into claims and verify against reference."""
+        claims = await self._decompose_claims(text_to_decompose)
+        if not claims:
+            return np.array([], dtype=bool)
+
+        verdicts = await self._verify_claims(claims, reference_text)
+        if not verdicts.statements:
+            return np.array([], dtype=bool)
+
+        return np.array([bool(stmt.verdict) for stmt in verdicts.statements])
diff --git a/tests/e2e/metrics_migration/test_factual_correctness_migration.py b/tests/e2e/metrics_migration/test_factual_correctness_migration.py
new file mode 100644
index 000000000..14daed502
--- /dev/null
+++ b/tests/e2e/metrics_migration/test_factual_correctness_migration.py
@@ -0,0 +1,249 @@
+"""E2E tests for FactualCorrectness metric migration from v1 to v2."""
+
+import numpy as np
+import pytest
+
+from ragas.dataset_schema import SingleTurnSample
+from ragas.metrics._factual_correctness import (
+    FactualCorrectness as LegacyFactualCorrectness,
+)
+from ragas.metrics.collections import FactualCorrectness
+
+
+class TestFactualCorrectnessE2EMigration:
+    """E2E test compatibility between legacy FactualCorrectness and new V2 FactualCorrectness with modern components."""
+
+    @pytest.fixture
+    def sample_data(self):
+        """Real-world test cases for factual 
correctness evaluation.""" + return [ + { + "response": "Einstein was born in Germany on 14th March 1879.", + "reference": "Albert Einstein was born in Ulm, Germany on March 14, 1879.", + "description": "High factual correctness - consistent facts", + }, + { + "response": "Einstein was born in France on 14th March 1879.", + "reference": "Albert Einstein was born in Ulm, Germany on March 14, 1879.", + "description": "Low factual correctness - wrong country", + }, + { + "response": "The first superbowl was held on Jan 15, 1967.", + "reference": "The First AFL–NFL World Championship Game was played on January 15, 1967.", + "description": "Perfect factual correctness - exact match", + }, + { + "response": "Photosynthesis converts sunlight into energy and produces oxygen.", + "reference": "Photosynthesis is the process by which plants convert sunlight into energy and produce oxygen as a byproduct.", + "description": "High factual correctness - covers key facts", + }, + { + "response": "Newton discovered gravity when an apple fell on his head.", + "reference": "Newton developed his theory of universal gravitation, though the apple story is likely apocryphal.", + "description": "Mixed factual correctness - partially correct", + }, + ] + + @pytest.fixture + def test_llm(self): + """Create a LangChain LLM for legacy factual correctness evaluation.""" + try: + from langchain_openai import ChatOpenAI + + from ragas.llms import LangchainLLMWrapper + + langchain_llm = ChatOpenAI(model="gpt-4o", temperature=0.01) + return LangchainLLMWrapper(langchain_llm) + except ImportError as e: + pytest.skip(f"LangChain LLM not available: {e}") + except Exception as e: + pytest.skip(f"Could not create LangChain LLM (API key may be missing): {e}") + + @pytest.fixture + def test_modern_llm(self): + """Create a modern instructor LLM for v2 implementation.""" + try: + import openai + + from ragas.llms.base import llm_factory + + client = openai.AsyncOpenAI() + return llm_factory("gpt-4o", client=client) + except ImportError as e: + pytest.skip(f"LLM factory not available: {e}") + except Exception as e: + pytest.skip(f"Could not create modern LLM (API key may be missing): {e}") + + @pytest.mark.asyncio + async def test_legacy_factual_correctness_vs_v2_factual_correctness_e2e_compatibility( + self, sample_data, test_llm, test_modern_llm + ): + """E2E test that legacy and v2 implementations produce similar scores.""" + + if test_llm is None or test_modern_llm is None: + pytest.skip("LLM required for E2E testing") + + # Test different modes and configurations + test_configs = [ + {"mode": "f1", "atomicity": "low", "coverage": "low"}, + {"mode": "precision", "atomicity": "high", "coverage": "high"}, + {"mode": "recall", "atomicity": "low", "coverage": "high"}, + ] + + for config in test_configs: + print(f"\n🧪 Testing FactualCorrectness - Config: {config}") + + for i, data in enumerate(sample_data): + print(f"\n Case {i + 1}: {data['description']}") + print(f" Response: {data['response'][:80]}...") + print(f" Reference: {data['reference'][:80]}...") + + # Legacy implementation + legacy_correctness = LegacyFactualCorrectness( + llm=test_llm, + mode=config["mode"], # type: ignore[arg-type] + atomicity=config["atomicity"], # type: ignore[arg-type] + coverage=config["coverage"], # type: ignore[arg-type] + ) + legacy_sample = SingleTurnSample( + response=data["response"], + reference=data["reference"], + ) + legacy_score = await legacy_correctness._single_turn_ascore( + legacy_sample, None + ) + + # V2 implementation + 
v2_correctness = FactualCorrectness( + llm=test_modern_llm, + mode=config["mode"], # type: ignore[arg-type] + atomicity=config["atomicity"], # type: ignore[arg-type] + coverage=config["coverage"], # type: ignore[arg-type] + ) + v2_result = await v2_correctness.ascore( + response=data["response"], + reference=data["reference"], + ) + + score_diff = abs(legacy_score - v2_result.value) + print(f" Legacy: {legacy_score:.6f}") + print(f" V2: {v2_result.value:.6f}") + print(f" Diff: {score_diff:.6f}") + + # Ensure implementations give reasonably similar scores + # Factual correctness may have more variation due to claim decomposition and different LLM behavior + assert score_diff < 0.35, ( + f"Legacy and V2 scores should be similar: Legacy={legacy_score:.6f}, " + f"V2={v2_result.value:.6f}, Diff={score_diff:.6f} (tolerance: 0.35)" + ) + print(" ✅ Both implementations give consistent scores") + + # Validate score ranges (both should be 0-1 or NaN) + if not np.isnan(legacy_score): + assert 0.0 <= legacy_score <= 1.0 + if not np.isnan(v2_result.value): + assert 0.0 <= v2_result.value <= 1.0 + + @pytest.mark.asyncio + async def test_factual_correctness_edge_cases(self, test_modern_llm): + """Test edge cases like empty responses and references.""" + + if test_modern_llm is None: + pytest.skip("Modern LLM required for edge case testing") + + metric = FactualCorrectness(llm=test_modern_llm) + + # Test empty response + with pytest.raises(ValueError, match="response is missing"): + await metric.ascore( + response="", + reference="Einstein was born in Germany.", + ) + + # Test empty reference + with pytest.raises(ValueError, match="reference is missing"): + await metric.ascore( + response="Einstein was born in Germany.", + reference="", + ) + + @pytest.mark.asyncio + async def test_factual_correctness_different_modes(self, test_modern_llm): + """Test that different modes (precision, recall, f1) produce different scores.""" + + if test_modern_llm is None: + pytest.skip("Modern LLM required for mode testing") + + response = "Einstein was a physicist born in Germany." + reference = "Albert Einstein was a German-born theoretical physicist who developed the theory of relativity." + + # Test different modes + precision_metric = FactualCorrectness(llm=test_modern_llm, mode="precision") + recall_metric = FactualCorrectness(llm=test_modern_llm, mode="recall") + f1_metric = FactualCorrectness(llm=test_modern_llm, mode="f1") + + precision_result = await precision_metric.ascore( + response=response, reference=reference + ) + recall_result = await recall_metric.ascore( + response=response, reference=reference + ) + f1_result = await f1_metric.ascore(response=response, reference=reference) + + print(f"Precision score: {precision_result.value:.3f}") + print(f"Recall score: {recall_result.value:.3f}") + print(f"F1 score: {f1_result.value:.3f}") + + # Validate ranges + assert 0.0 <= precision_result.value <= 1.0 + assert 0.0 <= recall_result.value <= 1.0 + assert 0.0 <= f1_result.value <= 1.0 + + @pytest.mark.asyncio + async def test_factual_correctness_atomicity_coverage_configurations( + self, test_modern_llm + ): + """Test that different atomicity/coverage configurations work.""" + + if test_modern_llm is None: + pytest.skip("Modern LLM required for configuration testing") + + response = "Einstein was a German physicist who developed relativity theory." + reference = ( + "Albert Einstein was born in Germany and created the theory of relativity." 
+ ) + + configs = [ + {"atomicity": "low", "coverage": "low"}, + {"atomicity": "low", "coverage": "high"}, + {"atomicity": "high", "coverage": "low"}, + {"atomicity": "high", "coverage": "high"}, + ] + + for config in configs: + metric = FactualCorrectness( + llm=test_modern_llm, + atomicity=config["atomicity"], # type: ignore[arg-type] + coverage=config["coverage"], # type: ignore[arg-type] + ) + result = await metric.ascore(response=response, reference=reference) + + print(f"Config {config}: {result.value:.3f}") + + # Validate score range + assert 0.0 <= result.value <= 1.0, f"Invalid score for config {config}" + + def test_factual_correctness_migration_requirements_documented(self): + """Test that migration requirements are properly documented.""" + + # V2 implementation should not accept legacy components + with pytest.raises((TypeError, ValueError, AttributeError)): + FactualCorrectness(llm="invalid_llm_type") # type: ignore[arg-type] # Should reject string + + # V2 should only accept InstructorBaseRagasLLM + with pytest.raises((TypeError, ValueError, AttributeError)): + FactualCorrectness(llm=None) # type: ignore[arg-type] # Should reject None + + # Test beta validation + with pytest.raises(ValueError, match="Beta must be a float"): + FactualCorrectness(llm=None, beta="invalid") # type: ignore[arg-type] # Should reject non-numeric beta diff --git a/tests/e2e/metrics_migration/test_summary_score_migration.py b/tests/e2e/metrics_migration/test_summary_score_migration.py index df092d35e..1eed0ef06 100644 --- a/tests/e2e/metrics_migration/test_summary_score_migration.py +++ b/tests/e2e/metrics_migration/test_summary_score_migration.py @@ -177,8 +177,8 @@ def test_summary_score_migration_requirements_documented(self): # V2 implementation should not accept legacy components with pytest.raises((TypeError, ValueError, AttributeError)): - SummaryScore(llm="invalid_llm_type") # Should reject string + SummaryScore(llm="invalid_llm_type") # type: ignore[arg-type] # Should reject string # V2 should only accept InstructorBaseRagasLLM with pytest.raises((TypeError, ValueError, AttributeError)): - SummaryScore(llm=None) # Should reject None + SummaryScore(llm=None) # type: ignore[arg-type] # Should reject None
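
Usage sketch for the new collections-based metric, following the FactualCorrectness docstring above; the model name "gpt-4o-mini" and an OPENAI_API_KEY in the environment are illustrative assumptions.

import asyncio

from openai import AsyncOpenAI

from ragas.llms.base import llm_factory
from ragas.metrics.collections import FactualCorrectness


async def main() -> None:
    # Instructor-based LLM via llm_factory; the model choice is illustrative only.
    llm = llm_factory("gpt-4o-mini", client=AsyncOpenAI())

    # F1 mode with beta=1.0 balances precision and recall, as in the class docstring.
    metric = FactualCorrectness(llm=llm, mode="f1", beta=1.0)

    result = await metric.ascore(
        response="Einstein was born in Germany in 1879.",
        reference="Albert Einstein was born in Ulm, Germany on March 14, 1879.",
    )
    print(f"Factual Correctness: {result.value}")


asyncio.run(main())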