From 913c45c9766f696aadfb9bbb785ba2a836183460 Mon Sep 17 00:00:00 2001
From: yuukidach
Date: Mon, 25 Dec 2023 20:07:09 +0800
Subject: [PATCH 1/4] feat(metrics): move evaluationMode2Col into metrics.base

---
 src/ragas/metrics/base.py | 10 ++++++++++
 src/ragas/validation.py   | 13 +------------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py
index 097058733..aee6eb050 100644
--- a/src/ragas/metrics/base.py
+++ b/src/ragas/metrics/base.py
@@ -42,6 +42,16 @@ def make_batches(total_size: int, batch_size: int) -> list[range]:
 
 EvaluationMode = Enum("EvaluationMode", "qac qa qc gc ga qga qcg")
 
+EVALMODE_TO_COLUMNS = {
+    EvaluationMode.qac: ["question", "answer", "contexts"],
+    EvaluationMode.qa: ["question", "answer"],
+    EvaluationMode.qc: ["question", "contexts"],
+    EvaluationMode.gc: ["ground_truths", "contexts"],
+    EvaluationMode.ga: ["ground_truths", "answer"],
+    EvaluationMode.qga: ["question", "ground_truths", "answer"],
+    EvaluationMode.qcg: ["question", "contexts", "ground_truths"],
+}
+
 
 @dataclass
 class Metric(ABC):
diff --git a/src/ragas/validation.py b/src/ragas/validation.py
index fa1c4471d..5d7c2862d 100644
--- a/src/ragas/validation.py
+++ b/src/ragas/validation.py
@@ -3,7 +3,7 @@
 from datasets import Dataset, Sequence
 
 from ragas.metrics._context_precision import ContextPrecision
-from ragas.metrics.base import EvaluationMode, Metric
+from ragas.metrics.base import Metric, EVALMODE_TO_COLUMNS
 
 
 def remap_column_names(dataset: Dataset, column_map: dict[str, str]) -> Dataset:
@@ -35,17 +35,6 @@ def validate_column_dtypes(ds: Dataset):
             )
 
 
-EVALMODE_TO_COLUMNS = {
-    EvaluationMode.qac: ["question", "answer", "contexts"],
-    EvaluationMode.qa: ["question", "answer"],
-    EvaluationMode.qc: ["question", "contexts"],
-    EvaluationMode.gc: ["ground_truths", "contexts"],
-    EvaluationMode.ga: ["ground_truths", "answer"],
-    EvaluationMode.qga: ["question", "ground_truths", "answer"],
-    EvaluationMode.qcg: ["question", "contexts", "ground_truths"],
-}
-
-
 def validate_evaluation_modes(ds: Dataset, metrics: list[Metric]):
     """
     validates the dataset and returns the evaluation type

From 2e8572eda6ea544b8675fedb26637d17327646b3 Mon Sep 17 00:00:00 2001
From: yuukidach
Date: Tue, 26 Dec 2023 11:16:53 +0800
Subject: [PATCH 2/4] feat(metrics): skip embedding process when the
 corresponding weight is zero

---
 src/ragas/metrics/_answer_correctness.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py
index b5b22275e..f11b1e32d 100644
--- a/src/ragas/metrics/_answer_correctness.py
+++ b/src/ragas/metrics/_answer_correctness.py
@@ -56,7 +56,7 @@ class AnswerCorrectness(MetricWithLLM):
 
     """
     Measures answer correctness compared to ground truth as a combination of
-    semantic similarity and factuality
+    factuality and semantic similarity.
 
     Attributes
     ----------
@@ -78,6 +78,9 @@ class AnswerCorrectness(MetricWithLLM):
     answer_similarity: AnswerSimilarity | None = None
 
     def __post_init__(self: t.Self):
+        if len(self.weights) != 2:
+            raise ValueError("Expects a list of two weights. First for factuality, second for semantic similarity")
+
         if self.answer_similarity is None:
             self.answer_similarity = AnswerSimilarity(
                 llm=self.llm, batch_size=self.batch_size
@@ -113,7 +116,7 @@ def _score_batch(
             "FP": "statements present in the answer but not found in the ground truth",
             "FN": "relevant statements found in the ground truth but omitted in the answer",  # noqa: E501
         }
-
+
         f1_score = []
         for prediction in outputs:
            prediction = json_loader.safe_load(prediction[0].text, self.llm)
@@ -131,10 +134,13 @@ def _score_batch(
                 score = tp / (tp + 0.5 * (fp + fn))
             else:
                 score = np.nan
-
+
             f1_score.append(score)
-
-        similarity_scores = self.answer_similarity._score_batch(dataset, callbacks=batch_group)  # type: ignore
+
+        if self.weights[1] == 0:
+            similarity_scores = np.zeros(len(f1_score))
+        else:
+            similarity_scores = self.answer_similarity._score_batch(dataset, callbacks=batch_group)  # type: ignore
         scores_stacked = np.vstack([f1_score, similarity_scores])
         scores = np.average(
             scores_stacked,

From e1453ac5bab455df077810c2a2475a22206eff22 Mon Sep 17 00:00:00 2001
From: yuukidach
Date: Tue, 26 Dec 2023 11:37:45 +0800
Subject: [PATCH 3/4] feat(metrics): revert base

---
 src/ragas/metrics/_answer_correctness.py |  4 +++-
 src/ragas/metrics/base.py                | 10 ----------
 src/ragas/validation.py                  | 13 ++++++++++++-
 3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py
index f11b1e32d..b9985b220 100644
--- a/src/ragas/metrics/_answer_correctness.py
+++ b/src/ragas/metrics/_answer_correctness.py
@@ -80,8 +80,10 @@ def __post_init__(self: t.Self):
         if len(self.weights) != 2:
             raise ValueError("Expects a list of two weights. First for factuality, second for semantic similarity")
+        if all([w == 0 for w in self.weights]):
+            raise ValueError("At least one weight must be non-zero")
 
-        if self.answer_similarity is None:
+        if self.answer_similarity is None and self.weights[1] != 0:
             self.answer_similarity = AnswerSimilarity(
                 llm=self.llm, batch_size=self.batch_size
             )
diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py
index aee6eb050..097058733 100644
--- a/src/ragas/metrics/base.py
+++ b/src/ragas/metrics/base.py
@@ -42,16 +42,6 @@ def make_batches(total_size: int, batch_size: int) -> list[range]:
 
 EvaluationMode = Enum("EvaluationMode", "qac qa qc gc ga qga qcg")
 
-EVALMODE_TO_COLUMNS = {
-    EvaluationMode.qac: ["question", "answer", "contexts"],
-    EvaluationMode.qa: ["question", "answer"],
-    EvaluationMode.qc: ["question", "contexts"],
-    EvaluationMode.gc: ["ground_truths", "contexts"],
-    EvaluationMode.ga: ["ground_truths", "answer"],
-    EvaluationMode.qga: ["question", "ground_truths", "answer"],
-    EvaluationMode.qcg: ["question", "contexts", "ground_truths"],
-}
-
 
 @dataclass
 class Metric(ABC):
diff --git a/src/ragas/validation.py b/src/ragas/validation.py
index 5d7c2862d..fa1c4471d 100644
--- a/src/ragas/validation.py
+++ b/src/ragas/validation.py
@@ -3,7 +3,7 @@
 from datasets import Dataset, Sequence
 
 from ragas.metrics._context_precision import ContextPrecision
-from ragas.metrics.base import Metric, EVALMODE_TO_COLUMNS
+from ragas.metrics.base import EvaluationMode, Metric
 
 
 def remap_column_names(dataset: Dataset, column_map: dict[str, str]) -> Dataset:
@@ -35,6 +35,17 @@ def validate_column_dtypes(ds: Dataset):
             )
 
 
+EVALMODE_TO_COLUMNS = {
+    EvaluationMode.qac: ["question", "answer", "contexts"],
+    EvaluationMode.qa: ["question", "answer"],
+    EvaluationMode.qc: ["question", "contexts"],
["question", "contexts"], + EvaluationMode.gc: ["ground_truths", "contexts"], + EvaluationMode.ga: ["ground_truths", "answer"], + EvaluationMode.qga: ["question", "ground_truths", "answer"], + EvaluationMode.qcg: ["question", "contexts", "ground_truths"], +} + + def validate_evaluation_modes(ds: Dataset, metrics: list[Metric]): """ validates the dataset and returns the evaluation type From 99e186e2fc607b623763a1e72ea2b5bb59eaf3d4 Mon Sep 17 00:00:00 2001 From: yuukidach Date: Tue, 26 Dec 2023 11:42:47 +0800 Subject: [PATCH 4/4] feat(metrics): verify weights --- src/ragas/metrics/_answer_correctness.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index b9985b220..8686057d6 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -82,6 +82,8 @@ def __post_init__(self: t.Self): raise ValueError("Expects a list of two weights. First for factuality, second for semantic similarity") if all([w == 0 for w in self.weights]): raise ValueError("At least one weight must be non-zero") + if not all([w >= 0 for w in self.weights]): + raise ValueError("Weights must be non-negative") if self.answer_similarity is None and self.weights[1] != 0: self.answer_similarity = AnswerSimilarity(