35 changes: 34 additions & 1 deletion docs/concepts/metrics/available_metrics/context_precision.md
@@ -92,4 +92,37 @@ await context_precision.single_turn_ascore(sample)
Output
```
0.9999999999
```

## ID-Based Context Precision

`IDBasedContextPrecision` provides a direct and efficient way to measure precision by comparing the IDs of retrieved contexts with reference context IDs. This metric is particularly useful when you have a unique ID system for your documents and want to evaluate retrieval performance without comparing the actual content.

The metric computes precision using `retrieved_context_ids` and `reference_context_ids`, with values ranging between 0 and 1. Higher values indicate better performance. It works with both string and integer IDs.

The formula for calculating ID-based context precision is as follows:

$$ \text{ID-Based Context Precision} = \frac{\text{Number of retrieved context IDs found in reference context IDs}}{\text{Total number of retrieved context IDs}} $$

### Example

```python
from ragas import SingleTurnSample
from ragas.metrics import IDBasedContextPrecision

sample = SingleTurnSample(
retrieved_context_ids=["doc_1", "doc_2", "doc_3", "doc_4"],
reference_context_ids=["doc_1", "doc_4", "doc_5", "doc_6"]
)

id_precision = IDBasedContextPrecision()
await id_precision.single_turn_ascore(sample)
```

Output
```
0.5
```

In this example, only 2 of the 4 retrieved context IDs ("doc_1" and "doc_4") are found in the reference context IDs, resulting in a precision score of 0.5, or 50%.
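
For intuition, the same arithmetic can be reproduced with plain Python set operations. This is a minimal sketch of the calculation only; the variable names are illustrative and it does not use the ragas metric itself:

```python
# Illustrative sketch of the ID-based precision arithmetic, not the ragas internals.
retrieved_ids = ["doc_1", "doc_2", "doc_3", "doc_4"]
reference_ids = ["doc_1", "doc_4", "doc_5", "doc_6"]

# Convert IDs to strings so string and integer IDs compare consistently.
retrieved = {str(i) for i in retrieved_ids}
reference = {str(i) for i in reference_ids}

precision = len(retrieved & reference) / len(retrieved) if retrieved else float("nan")
print(precision)  # 0.5
```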
32 changes: 32 additions & 0 deletions docs/concepts/metrics/available_metrics/context_recall.md
@@ -69,4 +69,36 @@ await context_recall.single_turn_ascore(sample)
Output
```
0.5
```

## ID-Based Context Recall

`IDBasedContextRecall` provides a direct and efficient way to measure recall by comparing the IDs of retrieved contexts with reference context IDs. This metric is particularly useful when you have a unique ID system for your documents and want to evaluate retrieval performance without comparing the actual content.

The metric computes recall using `retrieved_context_ids` and `reference_context_ids`, with values ranging between 0 and 1. Higher values indicate better performance. It works with both string and integer IDs.

The formula for calculating ID-based context recall is as follows:

$$ \text{ID-Based Context Recall} = \frac{\text{Number of reference context IDs found in retrieved context IDs}}{\text{Total number of reference context IDs}} $$

### Example

```python
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import IDBasedContextRecall

sample = SingleTurnSample(
retrieved_context_ids=["doc_1", "doc_2", "doc_3"],
reference_context_ids=["doc_1", "doc_4", "doc_5", "doc_6"]
)

id_recall = IDBasedContextRecall()
await id_recall.single_turn_ascore(sample)
```

Output
```
0.25
```
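
In this example, only 1 of the 4 reference context IDs ("doc_1") appears in the retrieved context IDs, resulting in a recall score of 0.25, or 25%.

The recall arithmetic mirrors the precision sketch above, with the denominator switched to the number of reference IDs. A minimal illustrative sketch, again not the ragas internals:

```python
# Illustrative sketch of the ID-based recall arithmetic.
retrieved = {str(i) for i in ["doc_1", "doc_2", "doc_3"]}
reference = {str(i) for i in ["doc_1", "doc_4", "doc_5", "doc_6"]}

recall = len(reference & retrieved) / len(reference) if reference else float("nan")
print(recall)  # 0.25
```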
6 changes: 6 additions & 0 deletions src/ragas/dataset_schema.py
@@ -76,6 +76,10 @@ class SingleTurnSample(BaseSample):
List of contexts retrieved for the query.
reference_contexts : Optional[List[str]]
List of reference contexts for the query.
retrieved_context_ids : Optional[List[Union[str, int]]]
List of IDs for retrieved contexts.
reference_context_ids : Optional[List[Union[str, int]]]
List of IDs for reference contexts.
response : Optional[str]
The generated response for the query.
multi_responses : Optional[List[str]]
@@ -89,6 +93,8 @@ class SingleTurnSample(BaseSample):
user_input: t.Optional[str] = None
retrieved_contexts: t.Optional[t.List[str]] = None
reference_contexts: t.Optional[t.List[str]] = None
retrieved_context_ids: t.Optional[t.List[t.Union[str, int]]] = None
reference_context_ids: t.Optional[t.List[t.Union[str, int]]] = None
response: t.Optional[str] = None
multi_responses: t.Optional[t.List[str]] = None
reference: t.Optional[str] = None
4 changes: 4 additions & 0 deletions src/ragas/metrics/__init__.py
@@ -21,12 +21,14 @@
LLMContextPrecisionWithoutReference,
LLMContextPrecisionWithReference,
NonLLMContextPrecisionWithReference,
IDBasedContextPrecision,
context_precision,
)
from ragas.metrics._context_recall import (
ContextRecall,
LLMContextRecall,
NonLLMContextRecall,
IDBasedContextRecall,
context_recall,
)
from ragas.metrics._datacompy_score import DataCompyScore
@@ -113,8 +115,10 @@
"LLMContextPrecisionWithoutReference",
"NonLLMContextPrecisionWithReference",
"LLMContextPrecisionWithoutReference",
"IDBasedContextPrecision",
"LLMContextRecall",
"NonLLMContextRecall",
"IDBasedContextRecall",
"FactualCorrectness",
"InstanceRubrics",
"NonLLMStringSimilarity",
56 changes: 56 additions & 0 deletions src/ragas/metrics/_context_precision.py
@@ -250,6 +250,62 @@ def _calculate_average_precision(self, verdict_list: t.List[int]) -> float:
return score


@dataclass
class IDBasedContextPrecision(SingleTurnMetric):
"""
Calculates context precision by directly comparing retrieved context IDs with reference context IDs.
The score represents what proportion of the retrieved context IDs are actually relevant (present in reference).

This metric works with both string and integer IDs.

Attributes
----------
name : str
Name of the metric
"""

name: str = "id_based_context_precision"
_required_columns: t.Dict[MetricType, t.Set[str]] = field(
default_factory=lambda: {
MetricType.SINGLE_TURN: {
"retrieved_context_ids",
"reference_context_ids",
}
}
)
output_type: MetricOutputType = MetricOutputType.CONTINUOUS

def init(self, run_config: RunConfig) -> None: ...

async def _single_turn_ascore(
self, sample: SingleTurnSample, callbacks: Callbacks
) -> float:
retrieved_context_ids = sample.retrieved_context_ids
reference_context_ids = sample.reference_context_ids
assert retrieved_context_ids is not None, "retrieved_context_ids is empty"
assert reference_context_ids is not None, "reference_context_ids is empty"

# Convert all IDs to strings to ensure consistent comparison
retrieved_ids_set = set(str(id) for id in retrieved_context_ids)
reference_ids_set = set(str(id) for id in reference_context_ids)

# Calculate precision score
total_retrieved = len(retrieved_ids_set)
if total_retrieved == 0:
logger.warning("No retrieved context IDs provided, cannot calculate precision.")
return np.nan

# Count how many retrieved IDs match reference IDs
hits = sum(1 for ret_id in retrieved_ids_set if str(ret_id) in reference_ids_set)

# For precision, we calculate: relevant retrieved / total retrieved
score = hits / total_retrieved
return score

async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
return await self._single_turn_ascore(SingleTurnSample(**row), callbacks)


@dataclass
class ContextPrecision(LLMContextPrecisionWithReference):
name: str = "context_precision"
55 changes: 55 additions & 0 deletions src/ragas/metrics/_context_recall.py
@@ -237,4 +237,59 @@ def _compute_score(self, verdict_list: t.List[float]) -> float:
return score


@dataclass
class IDBasedContextRecall(SingleTurnMetric):
"""
Calculates context recall by directly comparing retrieved context IDs with reference context IDs.
The score represents what proportion of the reference IDs were successfully retrieved.

This metric works with both string and integer IDs.

Attributes
----------
name : str
Name of the metric
"""

name: str = "id_based_context_recall"
_required_columns: t.Dict[MetricType, t.Set[str]] = field(
default_factory=lambda: {
MetricType.SINGLE_TURN: {
"retrieved_context_ids",
"reference_context_ids",
}
}
)
output_type: MetricOutputType = MetricOutputType.CONTINUOUS

def init(self, run_config: RunConfig) -> None: ...

async def _single_turn_ascore(
self, sample: SingleTurnSample, callbacks: Callbacks
) -> float:
retrieved_context_ids = sample.retrieved_context_ids
reference_context_ids = sample.reference_context_ids
assert retrieved_context_ids is not None, "retrieved_context_ids is empty"
assert reference_context_ids is not None, "reference_context_ids is empty"

# Convert all IDs to strings to ensure consistent comparison
retrieved_ids_set = set(str(id) for id in retrieved_context_ids)
reference_ids_set = set(str(id) for id in reference_context_ids)

# Calculate how many reference IDs appear in retrieved IDs
hits = sum(1 for ref_id in reference_ids_set if str(ref_id) in retrieved_ids_set)

# Calculate recall score
total_refs = len(reference_ids_set)
score = hits / total_refs if total_refs > 0 else np.nan

if np.isnan(score):
logger.warning("No reference context IDs provided, cannot calculate recall.")

return score

async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
return await self._single_turn_ascore(SingleTurnSample(**row), callbacks)


context_recall = ContextRecall()