diff --git a/src/ragas/integrations/langchain.py b/src/ragas/integrations/langchain.py
index 064c9a7f5..aa5b92e52 100644
--- a/src/ragas/integrations/langchain.py
+++ b/src/ragas/integrations/langchain.py
@@ -206,7 +206,7 @@ def evaluate_run(
         if example.outputs is None or "ground_truth" not in example.outputs:
             raise ValueError("expected `ground_truth` in example outputs.")
         chain_eval["ground_truth"] = example.outputs["ground_truth"]
-        eval_output = self(chain_eval, include_run_info=True)
+        eval_output = self.invoke(chain_eval, include_run_info=True)

         evaluation_result = EvaluationResult(
             key=self.metric.name, score=eval_output[self.metric.name]
diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py
index 77c0d3922..f2ba99f42 100644
--- a/src/ragas/metrics/base.py
+++ b/src/ragas/metrics/base.py
@@ -4,6 +4,7 @@
 C - contexts: context used for generation
 G - ground_truth: ground truth answer
 """
+
 from __future__ import annotations

 import asyncio
@@ -25,17 +26,37 @@
 EvaluationMode = Enum("EvaluationMode", "qac qa qc gc ga qga qcg")


+def get_required_columns(
+    eval_mod: EvaluationMode, ignore_columns: t.Optional[t.List[str]] = None
+) -> t.List[str]:
+    if eval_mod == EvaluationMode.qac:
+        keys = ["question", "answer", "contexts"]
+    elif eval_mod == EvaluationMode.qa:
+        keys = ["question", "answer"]
+    elif eval_mod == EvaluationMode.qc:
+        keys = ["question", "contexts"]
+    elif eval_mod == EvaluationMode.gc:
+        keys = ["contexts", "ground_truth"]
+    elif eval_mod == EvaluationMode.ga:
+        keys = ["answer", "ground_truth"]
+    elif eval_mod == EvaluationMode.qga:
+        keys = ["question", "contexts", "answer", "ground_truth"]
+    elif eval_mod == EvaluationMode.qcg:
+        keys = ["question", "contexts", "ground_truth"]
+    ignore_columns = ignore_columns or []
+
+    return [k for k in keys if k not in ignore_columns]
+
+
 @dataclass
 class Metric(ABC):
     @property
     @abstractmethod
-    def name(self) -> str:
-        ...
+    def name(self) -> str: ...

     @property
     @abstractmethod
-    def evaluation_mode(self) -> EvaluationMode:
-        ...
+    def evaluation_mode(self) -> EvaluationMode: ...

     @abstractmethod
     def init(self, run_config: RunConfig):
@@ -97,8 +118,9 @@
         return score

     @abstractmethod
-    async def _ascore(self, row: t.Dict, callbacks: Callbacks, is_async: bool) -> float:
-        ...
+    async def _ascore(
+        self, row: t.Dict, callbacks: Callbacks, is_async: bool
+    ) -> float: ...


 @dataclass
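
A minimal usage sketch of the new get_required_columns helper added in src/ragas/metrics/base.py; the expected return values below follow from the branch taken for each EvaluationMode in the patch:

from ragas.metrics.base import EvaluationMode, get_required_columns

# qac mode requires all three generation inputs.
get_required_columns(EvaluationMode.qac)
# -> ['question', 'answer', 'contexts']

# ignore_columns drops keys from the required set, e.g. for a dataset
# that carries no retrieved contexts.
get_required_columns(EvaluationMode.qac, ignore_columns=["contexts"])
# -> ['question', 'answer']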