35 changes: 34 additions & 1 deletion docs/concepts/metrics/available_metrics/context_precision.md
@@ -92,4 +92,37 @@ await context_precision.single_turn_ascore(sample)
Output
```
0.9999999999
```

## ID-Based Context Precision

`IDBasedContextPrecision` provides a direct and efficient way to measure precision by comparing the IDs of retrieved contexts with reference context IDs. This metric is particularly useful when you have a unique ID system for your documents and want to evaluate retrieval performance without comparing the actual content.

The metric computes precision using `retrieved_context_ids` and `reference_context_ids`, with values ranging between 0 and 1. Higher values indicate better performance. It works with both string and integer IDs.

The formula for calculating ID-based context precision is as follows:

$$ \text{ID-Based Context Precision} = \frac{\text{Number of retrieved context IDs found in reference context IDs}}{\text{Total number of retrieved context IDs}} $$

### Example

```python
from ragas import SingleTurnSample
from ragas.metrics import IDBasedContextPrecision

sample = SingleTurnSample(
retrieved_context_ids=["doc_1", "doc_2", "doc_3", "doc_4"],
reference_context_ids=["doc_1", "doc_4", "doc_5", "doc_6"]
)

id_precision = IDBasedContextPrecision()
await id_precision.single_turn_ascore(sample)
```

Output
```
0.5
```

In this example, only 2 of the 4 retrieved context IDs ("doc_1" and "doc_4") are found in the reference context IDs, resulting in a precision score of 0.5, or 50%.
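
For intuition, the same arithmetic can be reproduced with plain Python set operations. This is a minimal sketch of the calculation only; the variable names are illustrative and it does not use the ragas metric itself:

```python
# Illustrative sketch of the ID-based precision arithmetic, not the ragas internals.
retrieved_ids = ["doc_1", "doc_2", "doc_3", "doc_4"]
reference_ids = ["doc_1", "doc_4", "doc_5", "doc_6"]

# Convert IDs to strings so string and integer IDs compare consistently.
retrieved = {str(i) for i in retrieved_ids}
reference = {str(i) for i in reference_ids}

precision = len(retrieved & reference) / len(retrieved) if retrieved else float("nan")
print(precision)  # 0.5
```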
32 changes: 32 additions & 0 deletions docs/concepts/metrics/available_metrics/context_recall.md
@@ -69,4 +69,36 @@ await context_recall.single_turn_ascore(sample)
Output
```
0.5
```

## ID-Based Context Recall

`IDBasedContextRecall` provides a direct and efficient way to measure recall by comparing the IDs of retrieved contexts with reference context IDs. This metric is particularly useful when you have a unique ID system for your documents and want to evaluate retrieval performance without comparing the actual content.

The metric computes recall using `retrieved_context_ids` and `reference_context_ids`, with values ranging between 0 and 1. Higher values indicate better performance. It works with both string and integer IDs.

The formula for calculating ID-based context recall is as follows:

$$ \text{ID-Based Context Recall} = \frac{\text{Number of reference context IDs found in retrieved context IDs}}{\text{Total number of reference context IDs}} $$

### Example

```python
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import IDBasedContextRecall

sample = SingleTurnSample(
retrieved_context_ids=["doc_1", "doc_2", "doc_3"],
reference_context_ids=["doc_1", "doc_4", "doc_5", "doc_6"]
)

id_recall = IDBasedContextRecall()
await id_recall.single_turn_ascore(sample)
```

Output
```
0.25
```
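
In this example, only 1 of the 4 reference context IDs ("doc_1") appears in the retrieved context IDs, resulting in a recall score of 0.25, or 25%.

The recall arithmetic mirrors the precision sketch above, with the denominator switched to the number of reference IDs. A minimal illustrative sketch, again not the ragas internals:

```python
# Illustrative sketch of the ID-based recall arithmetic.
retrieved = {str(i) for i in ["doc_1", "doc_2", "doc_3"]}
reference = {str(i) for i in ["doc_1", "doc_4", "doc_5", "doc_6"]}

recall = len(reference & retrieved) / len(reference) if reference else float("nan")
print(recall)  # 0.25
```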
6 changes: 6 additions & 0 deletions src/ragas/dataset_schema.py
@@ -76,6 +76,10 @@ class SingleTurnSample(BaseSample):
List of contexts retrieved for the query.
reference_contexts : Optional[List[str]]
List of reference contexts for the query.
retrieved_context_ids : Optional[List[Union[str, int]]]
List of IDs for retrieved contexts.
reference_context_ids : Optional[List[Union[str, int]]]
List of IDs for reference contexts.
response : Optional[str]
The generated response for the query.
multi_responses : Optional[List[str]]
@@ -89,6 +93,8 @@ class SingleTurnSample(BaseSample):
user_input: t.Optional[str] = None
retrieved_contexts: t.Optional[t.List[str]] = None
reference_contexts: t.Optional[t.List[str]] = None
retrieved_context_ids: t.Optional[t.List[t.Union[str, int]]] = None
reference_context_ids: t.Optional[t.List[t.Union[str, int]]] = None
response: t.Optional[str] = None
multi_responses: t.Optional[t.List[str]] = None
reference: t.Optional[str] = None
4 changes: 4 additions & 0 deletions src/ragas/metrics/__init__.py
@@ -21,12 +21,14 @@
LLMContextPrecisionWithoutReference,
LLMContextPrecisionWithReference,
NonLLMContextPrecisionWithReference,
IDBasedContextPrecision,
context_precision,
)
from ragas.metrics._context_recall import (
ContextRecall,
LLMContextRecall,
NonLLMContextRecall,
IDBasedContextRecall,
context_recall,
)
from ragas.metrics._datacompy_score import DataCompyScore
@@ -113,8 +115,10 @@
"LLMContextPrecisionWithoutReference",
"NonLLMContextPrecisionWithReference",
"LLMContextPrecisionWithoutReference",
"IDBasedContextPrecision",
"LLMContextRecall",
"NonLLMContextRecall",
"IDBasedContextRecall",
"FactualCorrectness",
"InstanceRubrics",
"NonLLMStringSimilarity",
56 changes: 56 additions & 0 deletions src/ragas/metrics/_context_precision.py
@@ -250,6 +250,62 @@ def _calculate_average_precision(self, verdict_list: t.List[int]) -> float:
return score


@dataclass
class IDBasedContextPrecision(SingleTurnMetric):
"""
Calculates context precision by directly comparing retrieved context IDs with reference context IDs.
The score represents what proportion of the retrieved context IDs are actually relevant (present in reference).

This metric works with both string and integer IDs.

Attributes
----------
name : str
Name of the metric
"""

name: str = "id_based_context_precision"
_required_columns: t.Dict[MetricType, t.Set[str]] = field(
default_factory=lambda: {
MetricType.SINGLE_TURN: {
"retrieved_context_ids",
"reference_context_ids",
}
}
)
output_type: MetricOutputType = MetricOutputType.CONTINUOUS

def init(self, run_config: RunConfig) -> None: ...

async def _single_turn_ascore(
self, sample: SingleTurnSample, callbacks: Callbacks
) -> float:
retrieved_context_ids = sample.retrieved_context_ids
reference_context_ids = sample.reference_context_ids
assert retrieved_context_ids is not None, "retrieved_context_ids is empty"
assert reference_context_ids is not None, "reference_context_ids is empty"

# Convert all IDs to strings to ensure consistent comparison
retrieved_ids_set = set(str(id) for id in retrieved_context_ids)
reference_ids_set = set(str(id) for id in reference_context_ids)

# Calculate precision score
total_retrieved = len(retrieved_ids_set)
if total_retrieved == 0:
logger.warning("No retrieved context IDs provided, cannot calculate precision.")
return np.nan

# Count how many retrieved IDs match reference IDs
hits = sum(1 for ret_id in retrieved_ids_set if str(ret_id) in reference_ids_set)

# For precision, we calculate: relevant retrieved / total retrieved
score = hits / total_retrieved
return score

async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
return await self._single_turn_ascore(SingleTurnSample(**row), callbacks)


@dataclass
class ContextPrecision(LLMContextPrecisionWithReference):
name: str = "context_precision"
55 changes: 55 additions & 0 deletions src/ragas/metrics/_context_recall.py
@@ -237,4 +237,59 @@ def _compute_score(self, verdict_list: t.List[float]) -> float:
return score


@dataclass
class IDBasedContextRecall(SingleTurnMetric):
"""
Calculates context recall by directly comparing retrieved context IDs with reference context IDs.
The score represents what proportion of the reference IDs were successfully retrieved.

This metric works with both string and integer IDs.

Attributes
----------
name : str
Name of the metric
"""

name: str = "id_based_context_recall"
_required_columns: t.Dict[MetricType, t.Set[str]] = field(
default_factory=lambda: {
MetricType.SINGLE_TURN: {
"retrieved_context_ids",
"reference_context_ids",
}
}
)
output_type: MetricOutputType = MetricOutputType.CONTINUOUS

def init(self, run_config: RunConfig) -> None: ...

async def _single_turn_ascore(
self, sample: SingleTurnSample, callbacks: Callbacks
) -> float:
retrieved_context_ids = sample.retrieved_context_ids
reference_context_ids = sample.reference_context_ids
assert retrieved_context_ids is not None, "retrieved_context_ids is empty"
assert reference_context_ids is not None, "reference_context_ids is empty"

# Convert all IDs to strings to ensure consistent comparison
retrieved_ids_set = set(str(id) for id in retrieved_context_ids)
reference_ids_set = set(str(id) for id in reference_context_ids)

# Calculate how many reference IDs appear in retrieved IDs
hits = sum(1 for ref_id in reference_ids_set if str(ref_id) in retrieved_ids_set)

# Calculate recall score
total_refs = len(reference_ids_set)
score = hits / total_refs if total_refs > 0 else np.nan

if np.isnan(score):
logger.warning("No reference context IDs provided, cannot calculate recall.")

return score

async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
return await self._single_turn_ascore(SingleTurnSample(**row), callbacks)


context_recall = ContextRecall()