diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py
index 2f164e980..e559d085c 100644
--- a/src/ragas/metrics/__init__.py
+++ b/src/ragas/metrics/__init__.py
@@ -61,6 +61,7 @@
 from ragas.metrics._topic_adherence import TopicAdherenceScore
 from ragas.metrics.base import (
     Metric,
+    MetricOutputType,
     MetricType,
     MetricWithEmbeddings,
     MetricWithLLM,
@@ -76,6 +77,7 @@
     "MetricWithLLM",
     "SingleTurnMetric",
     "MultiTurnMetric",
+    "MetricOutputType",
     # specific metrics
     "AnswerCorrectness",
     "answer_correctness",
diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py
index 3eb853b83..a5c10e206 100644
--- a/src/ragas/metrics/_answer_correctness.py
+++ b/src/ragas/metrics/_answer_correctness.py
@@ -15,6 +15,7 @@
     LongFormAnswerPrompt,
 )
 from ragas.metrics.base import (
+    MetricOutputType,
     MetricType,
     MetricWithEmbeddings,
     MetricWithLLM,
@@ -163,6 +164,7 @@ class AnswerCorrectness(MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric):
             MetricType.SINGLE_TURN: {"user_input", "response", "reference"}
         }
     )
+    output_type = MetricOutputType.CONTINUOUS
     correctness_prompt: PydanticPrompt = field(default_factory=CorrectnessClassifier)
     long_form_answer_prompt: PydanticPrompt = field(
         default_factory=LongFormAnswerPrompt
diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py
index e26d0f4c4..93471dabc 100644
--- a/src/ragas/metrics/_answer_relevance.py
+++ b/src/ragas/metrics/_answer_relevance.py
@@ -9,6 +9,7 @@
 
 from ragas.dataset_schema import SingleTurnSample
 from ragas.metrics.base import (
+    MetricOutputType,
     MetricType,
     MetricWithEmbeddings,
     MetricWithLLM,
@@ -87,6 +88,8 @@ class ResponseRelevancy(MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric):
             }
         }
     )
+    output_type = MetricOutputType.CONTINUOUS
+
     question_generation: PydanticPrompt = ResponseRelevancePrompt()
     strictness: int = 3
 
diff --git a/src/ragas/metrics/_answer_similarity.py b/src/ragas/metrics/_answer_similarity.py
index 6a6f1ba0e..67bd2c546 100644
--- a/src/ragas/metrics/_answer_similarity.py
+++ b/src/ragas/metrics/_answer_similarity.py
@@ -8,7 +8,12 @@
 
 from ragas.dataset_schema import SingleTurnSample
 from ragas.embeddings.base import HuggingfaceEmbeddings
-from ragas.metrics.base import MetricType, MetricWithEmbeddings, SingleTurnMetric
+from ragas.metrics.base import (
+    MetricOutputType,
+    MetricType,
+    MetricWithEmbeddings,
+    SingleTurnMetric,
+)
 
 if t.TYPE_CHECKING:
     from langchain_core.callbacks.base import Callbacks
@@ -41,6 +46,7 @@ class SemanticSimilarity(MetricWithEmbeddings, SingleTurnMetric):
     _required_columns: t.Dict[MetricType, t.Set[str]] = field(
         default_factory=lambda: {MetricType.SINGLE_TURN: {"reference", "response"}}
     )
+    output_type = MetricOutputType.CONTINUOUS
     is_cross_encoder: bool = False
     threshold: t.Optional[float] = None
 
diff --git a/src/ragas/metrics/_aspect_critic.py b/src/ragas/metrics/_aspect_critic.py
index c4d224104..88dbc0f6b 100644
--- a/src/ragas/metrics/_aspect_critic.py
+++ b/src/ragas/metrics/_aspect_critic.py
@@ -8,6 +8,7 @@
 
 from ragas.dataset_schema import MultiTurnSample, SingleTurnSample
 from ragas.metrics.base import (
+    MetricOutputType,
     MetricType,
     MetricWithLLM,
     MultiTurnMetric,
@@ -94,6 +95,7 @@ def __init__(
         definition: str,
         llm: t.Optional[BaseRagasLLM] = None,
         required_columns: t.Optional[t.Dict[MetricType, t.Set[str]]] = None,
+        output_type: t.Optional[MetricOutputType] = MetricOutputType.BINARY,
         single_turn_prompt: t.Optional[PydanticPrompt] = None,
         multi_turn_prompt: t.Optional[PydanticPrompt] = None,
         strictness: int = 1,
@@ -116,6 +118,7 @@ def __init__(
             name=name,
             _required_columns=self._required_columns,
             llm=llm,
+            output_type=output_type,
         )
 
         self._definition = definition
diff --git a/src/ragas/metrics/_context_entities_recall.py b/src/ragas/metrics/_context_entities_recall.py
index 2a40c2cf8..55f68a773 100644
--- a/src/ragas/metrics/_context_entities_recall.py
+++ b/src/ragas/metrics/_context_entities_recall.py
@@ -8,7 +8,12 @@
 from pydantic import BaseModel
 
 from ragas.dataset_schema import SingleTurnSample
-from ragas.metrics.base import MetricType, MetricWithLLM, SingleTurnMetric
+from ragas.metrics.base import (
+    MetricOutputType,
+    MetricType,
+    MetricWithLLM,
+    SingleTurnMetric,
+)
 from ragas.prompt import PydanticPrompt, StringIO
 
 if t.TYPE_CHECKING:
@@ -113,6 +118,7 @@ class ContextEntityRecall(MetricWithLLM, SingleTurnMetric):
             MetricType.SINGLE_TURN: {"reference", "retrieved_contexts"}
         }
     )
+    output_type = MetricOutputType.CONTINUOUS
     context_entity_recall_prompt: PydanticPrompt = field(
         default_factory=ExtractEntitiesPrompt
     )
diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py
index e935e8f02..b10e61cfc 100644
--- a/src/ragas/metrics/_context_precision.py
+++ b/src/ragas/metrics/_context_precision.py
@@ -9,7 +9,13 @@
 
 from ragas.dataset_schema import SingleTurnSample
 from ragas.metrics._string import NonLLMStringSimilarity
-from ragas.metrics.base import MetricType, MetricWithLLM, SingleTurnMetric, ensembler
+from ragas.metrics.base import (
+    MetricOutputType,
+    MetricType,
+    MetricWithLLM,
+    SingleTurnMetric,
+    ensembler,
+)
 from ragas.prompt import PydanticPrompt
 from ragas.run_config import RunConfig
 from ragas.utils import deprecated
@@ -98,6 +104,7 @@ class LLMContextPrecisionWithReference(MetricWithLLM, SingleTurnMetric):
             }
         }
     )
+    output_type = MetricOutputType.CONTINUOUS
     context_precision_prompt: PydanticPrompt = field(
         default_factory=ContextPrecisionPrompt
     )
diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py
index e655957e4..ec19203fe 100644
--- a/src/ragas/metrics/_context_recall.py
+++ b/src/ragas/metrics/_context_recall.py
@@ -9,7 +9,13 @@
 
 from ragas.dataset_schema import SingleTurnSample
 from ragas.metrics._string import NonLLMStringSimilarity
-from ragas.metrics.base import MetricType, MetricWithLLM, SingleTurnMetric, ensembler
+from ragas.metrics.base import (
+    MetricOutputType,
+    MetricType,
+    MetricWithLLM,
+    SingleTurnMetric,
+    ensembler,
+)
 from ragas.prompt import PydanticPrompt
 from ragas.run_config import RunConfig
 from ragas.utils import deprecated
@@ -102,6 +108,7 @@ class LLMContextRecall(MetricWithLLM, SingleTurnMetric):
             }
         }
     )
+    output_type: t.Optional[MetricOutputType] = MetricOutputType.CONTINUOUS
     context_recall_prompt: PydanticPrompt = field(
         default_factory=ContextRecallClassificationPrompt
     )
@@ -202,6 +209,7 @@ class NonLLMContextRecall(SingleTurnMetric):
             }
         }
     )
+    output_type: MetricOutputType = MetricOutputType.CONTINUOUS
     distance_measure: SingleTurnMetric = field(
         default_factory=lambda: NonLLMStringSimilarity()
     )
diff --git a/src/ragas/metrics/_domain_specific_rubrics.py b/src/ragas/metrics/_domain_specific_rubrics.py
index 5bde24a7d..864367595 100644
--- a/src/ragas/metrics/_domain_specific_rubrics.py
+++ b/src/ragas/metrics/_domain_specific_rubrics.py
@@ -7,6 +7,7 @@
 
 from ragas.dataset_schema import MultiTurnSample, SingleTurnSample
 from ragas.metrics.base import (
+    MetricOutputType,
     MetricType,
     MetricWithLLM,
     MultiTurnMetric,
@@ -88,6 +89,7 @@ def __init__(
         rubrics: t.Dict[str, str] = DEFAULT_REFERENCE_FREE_RUBRICS,
         llm: t.Optional[BaseRagasLLM] = None,
         required_columns: t.Optional[t.Dict[MetricType, t.Set[str]]] = None,
+        output_type: t.Optional[MetricOutputType] = MetricOutputType.DISCRETE,
         single_turn_prompt: t.Optional[PydanticPrompt] = None,
         multi_turn_prompt: t.Optional[PydanticPrompt] = None,
         max_retries: int = 1,
@@ -109,7 +111,12 @@ def __init__(
                 "reference:optional",
             },
         }
-        super().__init__(name=name, llm=llm, _required_columns=self._required_columns)
+        super().__init__(
+            name=name,
+            llm=llm,
+            _required_columns=self._required_columns,
+            output_type=output_type,
+        )
 
     def __repr__(self) -> str:
         return f"{self.name}(required_columns={self.required_columns}, llm={self.llm}), rubrics={self.rubrics}"
diff --git a/src/ragas/metrics/_factual_correctness.py b/src/ragas/metrics/_factual_correctness.py
index 61e86b548..f5b8b70e9 100644
--- a/src/ragas/metrics/_factual_correctness.py
+++ b/src/ragas/metrics/_factual_correctness.py
@@ -15,6 +15,7 @@
     NLIStatementPrompt,
 )
 from ragas.metrics.base import (
+    MetricOutputType,
     MetricType,
     MetricWithLLM,
     SingleTurnMetric,
@@ -210,6 +211,7 @@ class FactualCorrectness(MetricWithLLM, SingleTurnMetric):
     _required_columns: t.Dict[MetricType, t.Set[str]] = field(
         default_factory=lambda: {MetricType.SINGLE_TURN: {"response", "reference"}}
     )
+    output_type: t.Optional[MetricOutputType] = MetricOutputType.CONTINUOUS
     mode: t.Literal["precision", "recall", "f1"] = "f1"
     beta: float = 1.0
     atomicity: t.Literal["low", "high"] = "low"
diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py
index 6fae49055..47678f544 100644
--- a/src/ragas/metrics/_faithfulness.py
+++ b/src/ragas/metrics/_faithfulness.py
@@ -9,6 +9,7 @@
 
 from ragas.dataset_schema import SingleTurnSample
 from ragas.metrics.base import (
+    MetricOutputType,
     MetricType,
     MetricWithLLM,
     SingleTurnMetric,
@@ -172,6 +173,7 @@ class Faithfulness(MetricWithLLM, SingleTurnMetric):
             }
         }
     )
+    output_type: t.Optional[MetricOutputType] = MetricOutputType.CONTINUOUS
     nli_statements_message: PydanticPrompt = field(default_factory=NLIStatementPrompt)
     statement_prompt: PydanticPrompt = field(default_factory=LongFormAnswerPrompt)
     sentence_segmenter: t.Optional[HasSegmentMethod] = None
diff --git a/src/ragas/metrics/_goal_accuracy.py b/src/ragas/metrics/_goal_accuracy.py
index 4a1831543..2a3dde932 100644
--- a/src/ragas/metrics/_goal_accuracy.py
+++ b/src/ragas/metrics/_goal_accuracy.py
@@ -6,7 +6,12 @@
 from pydantic import BaseModel, Field
 
 from ragas.dataset_schema import MultiTurnSample
-from ragas.metrics.base import MetricType, MetricWithLLM, MultiTurnMetric
+from ragas.metrics.base import (
+    MetricOutputType,
+    MetricType,
+    MetricWithLLM,
+    MultiTurnMetric,
+)
 from ragas.prompt import PydanticPrompt
 
 if t.TYPE_CHECKING:
@@ -106,6 +111,7 @@ class AgentGoalAccuracyWithReference(MetricWithLLM, MultiTurnMetric):
             }
         }
     )
+    output_type: t.Optional[MetricOutputType] = MetricOutputType.BINARY
     workflow_prompt: PydanticPrompt = field(
         default_factory=lambda: InferGoalOutcomePrompt()
     )
diff --git a/src/ragas/metrics/_instance_specific_rubrics.py b/src/ragas/metrics/_instance_specific_rubrics.py
index d893fd23b..6ed36f0e2 100644
--- a/src/ragas/metrics/_instance_specific_rubrics.py
+++ b/src/ragas/metrics/_instance_specific_rubrics.py
@@ -11,6 +11,7 @@
     SingleTurnInputWithoutRubric,
 )
 from ragas.metrics.base import (
+    MetricOutputType,
     MetricType,
     MetricWithLLM,
     MultiTurnMetric,
@@ -54,6 +55,7 @@ def __init__(
         name: str = "instance_rubrics",
         llm: t.Optional[BaseRagasLLM] = None,
         required_columns: t.Optional[t.Dict[MetricType, t.Set[str]]] = None,
+        output_type: t.Optional[MetricOutputType] = MetricOutputType.DISCRETE,
         single_turn_prompt: t.Optional[PydanticPrompt] = None,
         multi_turn_prompt: t.Optional[PydanticPrompt] = None,
         max_retries: int = 1,
@@ -73,6 +75,7 @@ def __init__(
                 "reference:optional",
             },
         }
+        self.output_type = output_type
         super().__init__(name=name, llm=llm, _required_columns=self._required_columns)
 
         self.single_turn_prompt = single_turn_prompt or SingleTurnPrompt()
diff --git a/src/ragas/metrics/_multi_modal_faithfulness.py b/src/ragas/metrics/_multi_modal_faithfulness.py
index eb36c0d16..8b816dadb 100644
--- a/src/ragas/metrics/_multi_modal_faithfulness.py
+++ b/src/ragas/metrics/_multi_modal_faithfulness.py
@@ -7,7 +7,12 @@
 from pydantic import BaseModel, Field
 
 from ragas.dataset_schema import SingleTurnSample
-from ragas.metrics.base import MetricType, MetricWithLLM, SingleTurnMetric
+from ragas.metrics.base import (
+    MetricOutputType,
+    MetricType,
+    MetricWithLLM,
+    SingleTurnMetric,
+)
 from ragas.prompt import ImageTextPrompt
 
 if t.TYPE_CHECKING:
@@ -74,6 +79,7 @@ class MultiModalFaithfulness(MetricWithLLM, SingleTurnMetric):
             }
         }
     )
+    output_type: t.Optional[MetricOutputType] = MetricOutputType.CONTINUOUS
     faithfulness_prompt: ImageTextPrompt = MultiModalFaithfulnessPrompt()
 
     async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
diff --git a/src/ragas/metrics/_multi_modal_relevance.py b/src/ragas/metrics/_multi_modal_relevance.py
index 310481a93..5f74a8d1c 100644
--- a/src/ragas/metrics/_multi_modal_relevance.py
+++ b/src/ragas/metrics/_multi_modal_relevance.py
@@ -7,7 +7,12 @@
 from pydantic import BaseModel, Field
 
 from ragas.dataset_schema import SingleTurnSample
-from ragas.metrics.base import MetricType, MetricWithLLM, SingleTurnMetric
+from ragas.metrics.base import (
+    MetricOutputType,
+    MetricType,
+    MetricWithLLM,
+    SingleTurnMetric,
+)
 from ragas.prompt import ImageTextPrompt
 
 if t.TYPE_CHECKING:
@@ -80,6 +85,8 @@ class MultiModalRelevance(MetricWithLLM, SingleTurnMetric):
             }
         }
     )
+    output_type: t.Optional[MetricOutputType] = MetricOutputType.CONTINUOUS
+
     relevance_prompt: ImageTextPrompt = MultiModalRelevancePrompt()
 
     async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
diff --git a/src/ragas/metrics/_noise_sensitivity.py b/src/ragas/metrics/_noise_sensitivity.py
index b1dcef58a..c7026de16 100644
--- a/src/ragas/metrics/_noise_sensitivity.py
+++ b/src/ragas/metrics/_noise_sensitivity.py
@@ -15,6 +15,7 @@
     NLIStatementPrompt,
 )
 from ragas.metrics.base import (
+    MetricOutputType,
     MetricType,
     MetricWithLLM,
     SingleTurnMetric,
@@ -43,6 +44,7 @@ class NoiseSensitivity(MetricWithLLM, SingleTurnMetric):
             }
         }
     )
+    output_type: t.Optional[MetricOutputType] = MetricOutputType.CONTINUOUS
     nli_statements_message: PydanticPrompt = field(default_factory=NLIStatementPrompt)
     statement_prompt: PydanticPrompt = field(default_factory=LongFormAnswerPrompt)
     sentence_segmenter: t.Optional[HasSegmentMethod] = None
diff --git a/src/ragas/metrics/_simple_criteria.py b/src/ragas/metrics/_simple_criteria.py
index f6fe401c9..141415228 100644
--- a/src/ragas/metrics/_simple_criteria.py
+++ b/src/ragas/metrics/_simple_criteria.py
@@ -8,6 +8,7 @@
 
 from ragas.dataset_schema import MultiTurnSample, SingleTurnSample
 from ragas.metrics.base import (
+    MetricOutputType,
     MetricType,
     MetricWithLLM,
     MultiTurnMetric,
@@ -94,6 +95,7 @@ def __init__(
         definition: str,
         llm: t.Optional[BaseRagasLLM] = None,
         required_columns: t.Optional[t.Dict[MetricType, t.Set[str]]] = None,
+        output_type: t.Optional[MetricOutputType] = MetricOutputType.DISCRETE,
         single_turn_prompt: t.Optional[PydanticPrompt] = None,
         multi_turn_prompt: t.Optional[PydanticPrompt] = None,
         strictness: int = 1,
@@ -116,6 +118,7 @@ def __init__(
             name=name,
             llm=llm,
             _required_columns=required_columns,
+            output_type=output_type,
         )
 
         self._definition = definition
diff --git a/src/ragas/metrics/_sql_semantic_equivalence.py b/src/ragas/metrics/_sql_semantic_equivalence.py
index 9da028a37..9e2039387 100644
--- a/src/ragas/metrics/_sql_semantic_equivalence.py
+++ b/src/ragas/metrics/_sql_semantic_equivalence.py
@@ -7,7 +7,12 @@
 from pydantic import BaseModel, Field
 
 from ragas.dataset_schema import SingleTurnSample
-from ragas.metrics.base import MetricType, MetricWithLLM, SingleTurnMetric
+from ragas.metrics.base import (
+    MetricOutputType,
+    MetricType,
+    MetricWithLLM,
+    SingleTurnMetric,
+)
 from ragas.prompt import PydanticPrompt
 
 if t.TYPE_CHECKING:
@@ -70,6 +75,7 @@ class LLMSQLEquivalence(MetricWithLLM, SingleTurnMetric):
             MetricType.SINGLE_TURN: {"response", "reference", "reference_contexts"}
         }
     )
+    output_type: t.Optional[MetricOutputType] = MetricOutputType.BINARY
     equivalence_prompt: PydanticPrompt = EquivalencePrompt()
 
     async def _single_turn_ascore(
diff --git a/src/ragas/metrics/_summarization.py b/src/ragas/metrics/_summarization.py
index 0cdb09d63..601a588cc 100644
--- a/src/ragas/metrics/_summarization.py
+++ b/src/ragas/metrics/_summarization.py
@@ -8,7 +8,12 @@
 from pydantic import BaseModel
 
 from ragas.dataset_schema import SingleTurnSample
-from ragas.metrics.base import MetricType, MetricWithLLM, SingleTurnMetric
+from ragas.metrics.base import (
+    MetricOutputType,
+    MetricType,
+    MetricWithLLM,
+    SingleTurnMetric,
+)
 from ragas.prompt import PydanticPrompt, StringIO
 
 if t.TYPE_CHECKING:
@@ -154,6 +159,7 @@ class SummarizationScore(MetricWithLLM, SingleTurnMetric):
             }
         }
     )
+    output_type: t.Optional[MetricOutputType] = MetricOutputType.CONTINUOUS
     coeff: float = 0.5
     question_generation_prompt: PydanticPrompt = field(
         default_factory=GenerateQuestionsPrompt
diff --git a/src/ragas/metrics/_topic_adherence.py b/src/ragas/metrics/_topic_adherence.py
index 1737f7f5a..580498294 100644
--- a/src/ragas/metrics/_topic_adherence.py
+++ b/src/ragas/metrics/_topic_adherence.py
@@ -8,7 +8,12 @@
 from pydantic import BaseModel, Field
 
 from ragas.dataset_schema import MultiTurnSample
-from ragas.metrics.base import MetricType, MetricWithLLM, MultiTurnMetric
+from ragas.metrics.base import (
+    MetricOutputType,
+    MetricType,
+    MetricWithLLM,
+    MultiTurnMetric,
+)
 from ragas.prompt import PydanticPrompt
 
 if t.TYPE_CHECKING:
@@ -142,6 +147,7 @@ class TopicAdherenceScore(MetricWithLLM, MultiTurnMetric):
             }
         }
     )
+    output_type: t.Optional[MetricOutputType] = MetricOutputType.CONTINUOUS
     mode: t.Literal["precision", "recall", "f1"] = "f1"
     topic_extraction_prompt: PydanticPrompt = TopicExtractionPrompt()
     topic_classification_prompt: PydanticPrompt = TopicClassificationPrompt()
diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py
index 681bc1a41..ad8d2eef8 100644
--- a/src/ragas/metrics/base.py
+++ b/src/ragas/metrics/base.py
@@ -57,6 +57,13 @@ class MetricType(Enum):
     MULTI_TURN = "multi_turn"
 
 
+class MetricOutputType(Enum):
+    BINARY = "binary"
+    DISCRETE = "discrete"
+    CONTINUOUS = "continuous"
+    RANKING = "ranking"
+
+
 @dataclass
 class Metric(ABC):
     """
@@ -211,6 +218,7 @@ class MetricWithLLM(Metric, PromptMixin):
     """
 
     llm: t.Optional[BaseRagasLLM] = None
+    output_type: t.Optional[MetricOutputType] = None
 
     def init(self, run_config: RunConfig):
         if self.llm is None:
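
Usage note (not part of the patch): a minimal sketch of how the MetricOutputType enum and the output_type attribute introduced above could be inspected. `faithfulness` and `AspectCritic` are existing ragas.metrics exports; the assertions below are illustrative only, not taken from the PR.

from ragas.metrics import AspectCritic, MetricOutputType, faithfulness

# Dataclass-style metrics declare output_type as a class-level attribute,
# so the prebuilt faithfulness instance reports a continuous score range.
assert faithfulness.output_type == MetricOutputType.CONTINUOUS

# __init__-based metrics such as AspectCritic accept output_type as a keyword
# argument (defaulting to BINARY per this patch) and forward it to
# MetricWithLLM via super().__init__().
harmfulness = AspectCritic(
    name="harmfulness",
    definition="Does the submission cause or risk harm to the user?",
)
assert harmfulness.output_type == MetricOutputType.BINARY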