From 401d5d35d12de3891fe9fdd4fdcb25bc4988ce03 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Sun, 4 Feb 2024 18:33:16 -0800 Subject: [PATCH] raise warning for invalid JSON --- src/ragas/metrics/_answer_correctness.py | 3 +++ src/ragas/metrics/_answer_relevance.py | 10 ++++++++-- src/ragas/metrics/_context_precision.py | 4 ++++ src/ragas/metrics/_context_recall.py | 3 +++ src/ragas/metrics/_faithfulness.py | 3 +++ 5 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index 80c63a0f6..793fba3ec 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -130,6 +130,9 @@ def _compute_statement_presence(self, prediction: t.Any) -> float: ] if any([np.isnan(i) for i in [tp, fp, fn]]): score = np.nan + logger.warning( + "Invalid prediction format. Expected a list of dictionaries with keys 'TP', 'FP', 'FN'" + ) else: score = tp / (tp + 0.5 * (fp + fn)) if tp > 0 else 0 else: diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index b87ff9075..14683278b 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ b/src/ragas/metrics/_answer_relevance.py @@ -102,8 +102,14 @@ def _calculate_score(self, response: t.Sequence[t.Any], row: t.Dict) -> float: if isinstance(item, dict) ] ) - cosine_sim = self.calculate_similarity(question, gen_questions) - score = cosine_sim.mean() * int(not committal) + if all(q == "" for q in gen_questions): + logger.warning( + "Invalid JSON response. Expected dictionary with key 'question'" + ) + score = np.nan + else: + cosine_sim = self.calculate_similarity(question, gen_questions) + score = cosine_sim.mean() * int(not committal) return score diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py index 753ce64b1..ce2fdd71a 100644 --- a/src/ragas/metrics/_context_precision.py +++ b/src/ragas/metrics/_context_precision.py @@ -109,6 +109,10 @@ def _calculate_average_precision(self, json_responses: t.List[t.Dict]) -> float: ] ) score = numerator / denominator + if np.isnan(score): + logger.warning( + "Invalid response format. Expected a list of dictionaries with keys 'verdict'" + ) return score async def _ascore( diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index e5512cdf6..783a49137 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -101,6 +101,9 @@ def _compute_score(self, response: t.Any) -> float: numerator = sum(response) return numerator / denom else: + logger.warning( + "Invalid JSON response. Expected dictionary with key 'Attributed'" + ) return np.nan async def _ascore(self, row: t.Dict, callbacks: Callbacks, is_async: bool) -> float: diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index c4cc51b6e..0a38f10ad 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -162,6 +162,9 @@ def _compute_score(self, output: t.Any): if num_statements: score = faithful_statements / num_statements else: + logger.warning( + "Invalid JSON response. Expected dictionary with key 'verdict'" + ) score = np.nan return score