From 913c45c9766f696aadfb9bbb785ba2a836183460 Mon Sep 17 00:00:00 2001
From: yuukidach
Date: Mon, 25 Dec 2023 20:07:09 +0800
Subject: [PATCH 1/4] feat(metrics): move evaluationMode2Col into metrics.base

---
 src/ragas/metrics/base.py | 10 ++++++++++
 src/ragas/validation.py   | 13 +------------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py
index 097058733..aee6eb050 100644
--- a/src/ragas/metrics/base.py
+++ b/src/ragas/metrics/base.py
@@ -42,6 +42,16 @@ def make_batches(total_size: int, batch_size: int) -> list[range]:
 
 EvaluationMode = Enum("EvaluationMode", "qac qa qc gc ga qga qcg")
 
+EVALMODE_TO_COLUMNS = {
+    EvaluationMode.qac: ["question", "answer", "contexts"],
+    EvaluationMode.qa: ["question", "answer"],
+    EvaluationMode.qc: ["question", "contexts"],
+    EvaluationMode.gc: ["ground_truths", "contexts"],
+    EvaluationMode.ga: ["ground_truths", "answer"],
+    EvaluationMode.qga: ["question", "ground_truths", "answer"],
+    EvaluationMode.qcg: ["question", "contexts", "ground_truths"],
+}
+
 
 @dataclass
 class Metric(ABC):
diff --git a/src/ragas/validation.py b/src/ragas/validation.py
index fa1c4471d..5d7c2862d 100644
--- a/src/ragas/validation.py
+++ b/src/ragas/validation.py
@@ -3,7 +3,7 @@
 from datasets import Dataset, Sequence
 
 from ragas.metrics._context_precision import ContextPrecision
-from ragas.metrics.base import EvaluationMode, Metric
+from ragas.metrics.base import Metric, EVALMODE_TO_COLUMNS
 
 
 def remap_column_names(dataset: Dataset, column_map: dict[str, str]) -> Dataset:
@@ -35,17 +35,6 @@ def validate_column_dtypes(ds: Dataset):
             )
 
 
-EVALMODE_TO_COLUMNS = {
-    EvaluationMode.qac: ["question", "answer", "contexts"],
-    EvaluationMode.qa: ["question", "answer"],
-    EvaluationMode.qc: ["question", "contexts"],
-    EvaluationMode.gc: ["ground_truths", "contexts"],
-    EvaluationMode.ga: ["ground_truths", "answer"],
-    EvaluationMode.qga: ["question", "ground_truths", "answer"],
-    EvaluationMode.qcg: ["question", "contexts", "ground_truths"],
-}
-
-
 def validate_evaluation_modes(ds: Dataset, metrics: list[Metric]):
     """
     validates the dataset and returns the evaluation type

From 2e8572eda6ea544b8675fedb26637d17327646b3 Mon Sep 17 00:00:00 2001
From: yuukidach
Date: Tue, 26 Dec 2023 11:16:53 +0800
Subject: [PATCH 2/4] feat(metrics): skip embedding process when the
 corresponding weight is zero

---
 src/ragas/metrics/_answer_correctness.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py
index b5b22275e..f11b1e32d 100644
--- a/src/ragas/metrics/_answer_correctness.py
+++ b/src/ragas/metrics/_answer_correctness.py
@@ -56,7 +56,7 @@ class AnswerCorrectness(MetricWithLLM):
 
     """
     Measures answer correctness compared to ground truth as a combination of
-    semantic similarity and factuality
+    factuality and semantic similarity.
 
     Attributes
     ----------
@@ -78,6 +78,9 @@ class AnswerCorrectness(MetricWithLLM):
     answer_similarity: AnswerSimilarity | None = None
 
     def __post_init__(self: t.Self):
+        if len(self.weights) != 2:
+            raise ValueError("Expects a list of two weights. First for factuality, second for semantic similarity")
+
         if self.answer_similarity is None:
             self.answer_similarity = AnswerSimilarity(
                 llm=self.llm, batch_size=self.batch_size
@@ -113,7 +116,7 @@ def _score_batch(
             "FP": "statements present in the answer but not found in the ground truth",
             "FN": "relevant statements found in the ground truth but omitted in the answer",  # noqa: E501
         }
-
+
         f1_score = []
         for prediction in outputs:
            prediction = json_loader.safe_load(prediction[0].text, self.llm)
@@ -131,10 +134,13 @@ def _score_batch(
                 score = tp / (tp + 0.5 * (fp + fn))
             else:
                 score = np.nan
-
+
             f1_score.append(score)
-
-        similarity_scores = self.answer_similarity._score_batch(dataset, callbacks=batch_group)  # type: ignore
+
+        if self.weights[1] == 0:
+            similarity_scores = np.zeros(len(f1_score))
+        else:
+            similarity_scores = self.answer_similarity._score_batch(dataset, callbacks=batch_group)  # type: ignore
         scores_stacked = np.vstack([f1_score, similarity_scores])
         scores = np.average(
             scores_stacked,

From e1453ac5bab455df077810c2a2475a22206eff22 Mon Sep 17 00:00:00 2001
From: yuukidach
Date: Tue, 26 Dec 2023 11:37:45 +0800
Subject: [PATCH 3/4] feat(metrics): revert base

---
 src/ragas/metrics/_answer_correctness.py |  4 +++-
 src/ragas/metrics/base.py                | 10 ----------
 src/ragas/validation.py                  | 13 ++++++++++++-
 3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py
index f11b1e32d..b9985b220 100644
--- a/src/ragas/metrics/_answer_correctness.py
+++ b/src/ragas/metrics/_answer_correctness.py
@@ -80,8 +80,10 @@ def __post_init__(self: t.Self):
         if len(self.weights) != 2:
             raise ValueError("Expects a list of two weights. First for factuality, second for semantic similarity")
+        if all([w == 0 for w in self.weights]):
+            raise ValueError("At least one weight must be non-zero")
 
-        if self.answer_similarity is None:
+        if self.answer_similarity is None and self.weights[1] != 0:
             self.answer_similarity = AnswerSimilarity(
                 llm=self.llm, batch_size=self.batch_size
             )
diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py
index aee6eb050..097058733 100644
--- a/src/ragas/metrics/base.py
+++ b/src/ragas/metrics/base.py
@@ -42,16 +42,6 @@ def make_batches(total_size: int, batch_size: int) -> list[range]:
 
 EvaluationMode = Enum("EvaluationMode", "qac qa qc gc ga qga qcg")
 
-EVALMODE_TO_COLUMNS = {
-    EvaluationMode.qac: ["question", "answer", "contexts"],
-    EvaluationMode.qa: ["question", "answer"],
-    EvaluationMode.qc: ["question", "contexts"],
-    EvaluationMode.gc: ["ground_truths", "contexts"],
-    EvaluationMode.ga: ["ground_truths", "answer"],
-    EvaluationMode.qga: ["question", "ground_truths", "answer"],
-    EvaluationMode.qcg: ["question", "contexts", "ground_truths"],
-}
-
 
 @dataclass
 class Metric(ABC):
diff --git a/src/ragas/validation.py b/src/ragas/validation.py
index 5d7c2862d..fa1c4471d 100644
--- a/src/ragas/validation.py
+++ b/src/ragas/validation.py
@@ -3,7 +3,7 @@
 from datasets import Dataset, Sequence
 
 from ragas.metrics._context_precision import ContextPrecision
-from ragas.metrics.base import Metric, EVALMODE_TO_COLUMNS
+from ragas.metrics.base import EvaluationMode, Metric
 
 
 def remap_column_names(dataset: Dataset, column_map: dict[str, str]) -> Dataset:
@@ -35,6 +35,17 @@ def validate_column_dtypes(ds: Dataset):
             )
 
 
+EVALMODE_TO_COLUMNS = {
+    EvaluationMode.qac: ["question", "answer", "contexts"],
+    EvaluationMode.qa: ["question", "answer"],
+    EvaluationMode.qc: ["question", "contexts"],
["question", "contexts"], + EvaluationMode.gc: ["ground_truths", "contexts"], + EvaluationMode.ga: ["ground_truths", "answer"], + EvaluationMode.qga: ["question", "ground_truths", "answer"], + EvaluationMode.qcg: ["question", "contexts", "ground_truths"], +} + + def validate_evaluation_modes(ds: Dataset, metrics: list[Metric]): """ validates the dataset and returns the evaluation type From 99e186e2fc607b623763a1e72ea2b5bb59eaf3d4 Mon Sep 17 00:00:00 2001 From: yuukidach Date: Tue, 26 Dec 2023 11:42:47 +0800 Subject: [PATCH 4/4] feat(metrics): verify weights --- src/ragas/metrics/_answer_correctness.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index b9985b220..8686057d6 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -82,6 +82,8 @@ def __post_init__(self: t.Self): raise ValueError("Expects a list of two weights. First for factuality, second for semantic similarity") if all([w == 0 for w in self.weights]): raise ValueError("At least one weight must be non-zero") + if not all([w >= 0 for w in self.weights]): + raise ValueError("Weights must be non-negative") if self.answer_similarity is None and self.weights[1] != 0: self.answer_similarity = AnswerSimilarity(