diff --git a/docs/concepts/metrics/available_metrics/context_precision.md b/docs/concepts/metrics/available_metrics/context_precision.md
index 449dadcea..4334efd8f 100644
--- a/docs/concepts/metrics/available_metrics/context_precision.md
+++ b/docs/concepts/metrics/available_metrics/context_precision.md
@@ -92,4 +92,37 @@ await context_precision.single_turn_ascore(sample)
 Output
 ```
 0.9999999999
-```
\ No newline at end of file
+```
+
+## ID Based Context Precision
+
+IDBasedContextPrecision provides a direct and efficient way to measure precision by comparing the IDs of retrieved contexts with reference context IDs. This metric is particularly useful when you have a unique ID system for your documents and want to evaluate retrieval performance without comparing the actual content.
+
+The metric computes precision using retrieved_context_ids and reference_context_ids, with values ranging between 0 and 1. Higher values indicate better performance. It works with both string and integer IDs.
+
+The formula for calculating ID-based context precision is as follows:
+
+$$ \text{ID-Based Context Precision} = \frac{\text{Number of retrieved context IDs found in reference context IDs}}{\text{Total number of retrieved context IDs}} $$
+
+### Example
+
+```python
+from ragas import SingleTurnSample
+from ragas.metrics import IDBasedContextPrecision
+
+sample = SingleTurnSample(
+    retrieved_context_ids=["doc_1", "doc_2", "doc_3", "doc_4"],
+    reference_context_ids=["doc_1", "doc_4", "doc_5", "doc_6"]
+)
+
+id_precision = IDBasedContextPrecision()
+await id_precision.single_turn_ascore(sample)
+
+```
+
+Output
+```
+0.5
+```
+
+In this example, out of the 4 retrieved context IDs, only 2 ("doc_1" and "doc_4") are found in the reference context IDs, resulting in a precision score of 0.5 or 50%.
\ No newline at end of file
diff --git a/docs/concepts/metrics/available_metrics/context_recall.md b/docs/concepts/metrics/available_metrics/context_recall.md
index 7b986a8ff..a0c91b4dc 100644
--- a/docs/concepts/metrics/available_metrics/context_recall.md
+++ b/docs/concepts/metrics/available_metrics/context_recall.md
@@ -69,4 +69,35 @@ await context_recall.single_turn_ascore(sample)
 Output
 ```
 0.5
-```
\ No newline at end of file
+```
+
+## ID Based Context Recall
+
+IDBasedContextRecall provides a direct and efficient way to measure recall by comparing the IDs of retrieved contexts with reference context IDs. This metric is particularly useful when you have a unique ID system for your documents and want to evaluate retrieval performance without comparing the actual content.
+
+The metric computes recall using retrieved_context_ids and reference_context_ids, with values ranging between 0 and 1. Higher values indicate better performance. It works with both string and integer IDs.
+
+The formula for calculating ID-based context recall is as follows:
+
+$$ \text{ID-Based Context Recall} = \frac{\text{Number of reference context IDs found in retrieved context IDs}}{\text{Total number of reference context IDs}} $$
+
+### Example
+
+```python
+
+from ragas.dataset_schema import SingleTurnSample
+from ragas.metrics import IDBasedContextRecall
+
+sample = SingleTurnSample(
+    retrieved_context_ids=["doc_1", "doc_2", "doc_3"],
+    reference_context_ids=["doc_1", "doc_4", "doc_5", "doc_6"]
+)
+
+id_recall = IDBasedContextRecall()
+await id_recall.single_turn_ascore(sample)
+```
+
+Output
+```
+0.25
+```
\ No newline at end of file
diff --git a/src/ragas/dataset_schema.py b/src/ragas/dataset_schema.py
index f59782e63..939a469b4 100644
--- a/src/ragas/dataset_schema.py
+++ b/src/ragas/dataset_schema.py
@@ -76,6 +76,10 @@ class SingleTurnSample(BaseSample):
         List of contexts retrieved for the query.
     reference_contexts : Optional[List[str]]
         List of reference contexts for the query.
+    retrieved_context_ids : Optional[List[Union[str, int]]]
+        List of IDs for retrieved contexts.
+    reference_context_ids : Optional[List[Union[str, int]]]
+        List of IDs for reference contexts.
     response : Optional[str]
         The generated response for the query.
     multi_responses : Optional[List[str]]
@@ -89,6 +93,8 @@ class SingleTurnSample(BaseSample):
     user_input: t.Optional[str] = None
     retrieved_contexts: t.Optional[t.List[str]] = None
    reference_contexts: t.Optional[t.List[str]] = None
+    retrieved_context_ids: t.Optional[t.List[t.Union[str, int]]] = None
+    reference_context_ids: t.Optional[t.List[t.Union[str, int]]] = None
     response: t.Optional[str] = None
     multi_responses: t.Optional[t.List[str]] = None
     reference: t.Optional[str] = None
diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py
index 381203031..c94c4294c 100644
--- a/src/ragas/metrics/__init__.py
+++ b/src/ragas/metrics/__init__.py
@@ -21,12 +21,14 @@
     LLMContextPrecisionWithoutReference,
     LLMContextPrecisionWithReference,
     NonLLMContextPrecisionWithReference,
+    IDBasedContextPrecision,
     context_precision,
 )
 from ragas.metrics._context_recall import (
     ContextRecall,
     LLMContextRecall,
     NonLLMContextRecall,
+    IDBasedContextRecall,
     context_recall,
 )
 from ragas.metrics._datacompy_score import DataCompyScore
@@ -113,8 +115,10 @@
     "LLMContextPrecisionWithoutReference",
     "NonLLMContextPrecisionWithReference",
     "LLMContextPrecisionWithoutReference",
+    "IDBasedContextPrecision",
     "LLMContextRecall",
     "NonLLMContextRecall",
+    "IDBasedContextRecall",
     "FactualCorrectness",
     "InstanceRubrics",
     "NonLLMStringSimilarity",
diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py
index f32b3c4b2..b12bd9dc7 100644
--- a/src/ragas/metrics/_context_precision.py
+++ b/src/ragas/metrics/_context_precision.py
@@ -250,6 +250,62 @@ def _calculate_average_precision(self, verdict_list: t.List[int]) -> float:
         return score
 
 
+@dataclass
+class IDBasedContextPrecision(SingleTurnMetric):
+    """
+    Calculates context precision by directly comparing retrieved context IDs with reference context IDs.
+    The score represents what proportion of the retrieved context IDs are actually relevant (present in reference).
+
+    This metric works with both string and integer IDs.
+
+    Attributes
+    ----------
+    name : str
+        Name of the metric
+    """
+
+    name: str = "id_based_context_precision"
+    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
+        default_factory=lambda: {
+            MetricType.SINGLE_TURN: {
+                "retrieved_context_ids",
+                "reference_context_ids",
+            }
+        }
+    )
+    output_type: MetricOutputType = MetricOutputType.CONTINUOUS
+
+    def init(self, run_config: RunConfig) -> None: ...
+
+    async def _single_turn_ascore(
+        self, sample: SingleTurnSample, callbacks: Callbacks
+    ) -> float:
+        retrieved_context_ids = sample.retrieved_context_ids
+        reference_context_ids = sample.reference_context_ids
+        assert retrieved_context_ids is not None, "retrieved_context_ids is empty"
+        assert reference_context_ids is not None, "reference_context_ids is empty"
+
+        # Convert all IDs to strings to ensure consistent comparison
+        retrieved_ids_set = set(str(id) for id in retrieved_context_ids)
+        reference_ids_set = set(str(id) for id in reference_context_ids)
+
+        # Calculate precision score
+        total_retrieved = len(retrieved_ids_set)
+        if total_retrieved == 0:
+            logger.warning("No retrieved context IDs provided, cannot calculate precision.")
+            return np.nan
+
+        # Count how many retrieved IDs match reference IDs
+        hits = sum(1 for ret_id in retrieved_ids_set if str(ret_id) in reference_ids_set)
+
+        # For precision, we calculate: relevant retrieved / total retrieved
+        score = hits / total_retrieved
+        return score
+
+    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
+        return await self._single_turn_ascore(SingleTurnSample(**row), callbacks)
+
+
 @dataclass
 class ContextPrecision(LLMContextPrecisionWithReference):
     name: str = "context_precision"
diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py
index 7af9915f9..c8b7941db 100644
--- a/src/ragas/metrics/_context_recall.py
+++ b/src/ragas/metrics/_context_recall.py
@@ -237,4 +237,59 @@ def _compute_score(self, verdict_list: t.List[float]) -> float:
         return score
 
 
+@dataclass
+class IDBasedContextRecall(SingleTurnMetric):
+    """
+    Calculates context recall by directly comparing retrieved context IDs with reference context IDs.
+    The score represents what proportion of the reference IDs were successfully retrieved.
+
+    This metric works with both string and integer IDs.
+
+    Attributes
+    ----------
+    name : str
+        Name of the metric
+    """
+
+    name: str = "id_based_context_recall"
+    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
+        default_factory=lambda: {
+            MetricType.SINGLE_TURN: {
+                "retrieved_context_ids",
+                "reference_context_ids",
+            }
+        }
+    )
+    output_type: MetricOutputType = MetricOutputType.CONTINUOUS
+
+    def init(self, run_config: RunConfig) -> None: ...
+
+    async def _single_turn_ascore(
+        self, sample: SingleTurnSample, callbacks: Callbacks
+    ) -> float:
+        retrieved_context_ids = sample.retrieved_context_ids
+        reference_context_ids = sample.reference_context_ids
+        assert retrieved_context_ids is not None, "retrieved_context_ids is empty"
+        assert reference_context_ids is not None, "reference_context_ids is empty"
+
+        # Convert all IDs to strings to ensure consistent comparison
+        retrieved_ids_set = set(str(id) for id in retrieved_context_ids)
+        reference_ids_set = set(str(id) for id in reference_context_ids)
+
+        # Calculate how many reference IDs appear in retrieved IDs
+        hits = sum(1 for ref_id in reference_ids_set if str(ref_id) in retrieved_ids_set)
+
+        # Calculate recall score
+        total_refs = len(reference_ids_set)
+        score = hits / total_refs if total_refs > 0 else np.nan
+
+        if np.isnan(score):
+            logger.warning("No reference context IDs provided, cannot calculate recall.")
+
+        return score
+
+    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
+        return await self._single_turn_ascore(SingleTurnSample(**row), callbacks)
+
+
 context_recall = ContextRecall()
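
A quick way to sanity-check the two metrics added by this patch is to score one sample with both of them and compare the results. The sketch below is illustrative and not part of the patch: it wraps the calls in `asyncio.run` (the docs examples above assume a notebook with top-level `await`), and the F1 combination at the end is a hypothetical helper, not a ragas API.

```python
# Illustrative sanity check for IDBasedContextPrecision and IDBasedContextRecall.
# Assumes this patch is installed; the F1 helper below is not part of ragas.
import asyncio

from ragas import SingleTurnSample
from ragas.metrics import IDBasedContextPrecision, IDBasedContextRecall


async def main() -> None:
    sample = SingleTurnSample(
        retrieved_context_ids=["doc_1", "doc_2", "doc_3", "doc_4"],
        reference_context_ids=["doc_1", "doc_4", "doc_5", "doc_6"],
    )

    # Both metrics compare IDs only, so no LLM or embeddings are needed.
    precision = await IDBasedContextPrecision().single_turn_ascore(sample)
    recall = await IDBasedContextRecall().single_turn_ascore(sample)

    # Harmonic mean of the two ID-based scores (hypothetical, for illustration).
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    # With these IDs: 2 of 4 retrieved are relevant and 2 of 4 references are
    # retrieved, so precision = recall = f1 = 0.5.
    print(f"precision={precision:.2f} recall={recall:.2f} f1={f1:.2f}")


asyncio.run(main())
```

Because both metrics convert the IDs to sets before scoring, duplicate retrieved or reference IDs are collapsed before the ratio is taken.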