From 38747ff7a33a74f4959c0582667f861f7e9c687a Mon Sep 17 00:00:00 2001
From: "David S. Batista"
Date: Thu, 23 May 2024 17:41:29 +0200
Subject: [PATCH] fix: failsafe for non-valid json and failed LLM calls (#7723)

* wip

* initial import

* adding tests

* adding params

* adding safeguards for nan in evaluators

* adding docstrings

* fixing tests

* removing unused imports

* adding tests to context and faithfullness evaluators

* fixing docstrings

* nit

* removing unused imports

* adding release notes

* attending PR comments

* fixing tests

* fixing tests

* adding types

* removing unused imports

* Update haystack/components/evaluators/context_relevance.py

Co-authored-by: Madeesh Kannan

* Update haystack/components/evaluators/faithfulness.py

Co-authored-by: Madeesh Kannan

* attending PR comments

---------

Co-authored-by: Madeesh Kannan
---
 .../evaluators/context_relevance.py           | 10 ++-
 .../components/evaluators/faithfulness.py     |  9 ++-
 .../components/evaluators/llm_evaluator.py    | 71 +++++++++++++++----
 ...LLM-based-evaluators-34cdc183ab545315.yaml |  5 ++
 .../test_context_relevance_evaluator.py       | 41 +++++++++++
 .../evaluators/test_faithfulness_evaluator.py | 45 ++++++++++++
 .../evaluators/test_llm_evaluator.py          | 36 +++++++++-
 7 files changed, 199 insertions(+), 18 deletions(-)
 create mode 100644 releasenotes/notes/add-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml

diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py
index 9bd299bbc..3629db2ba 100644
--- a/haystack/components/evaluators/context_relevance.py
+++ b/haystack/components/evaluators/context_relevance.py
@@ -70,6 +70,7 @@ def __init__(
         progress_bar: bool = True,
         api: str = "openai",
         api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
+        raise_on_failure: bool = True,
     ):
         """
         Creates an instance of ContextRelevanceEvaluator.
@@ -97,6 +98,9 @@ def __init__(
             Supported APIs: "openai".
         :param api_key:
             The API key.
+        :param raise_on_failure:
+            Whether to raise an exception if the API call fails.
+
         """
         self.instructions = (
             "Your task is to judge how relevant the provided context is for answering a question. "
@@ -117,6 +121,7 @@ def __init__(
             examples=self.examples,
             api=self.api,
             api_key=self.api_key,
+            raise_on_failure=raise_on_failure,
             progress_bar=progress_bar,
         )
 
@@ -138,7 +143,10 @@ def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]
         result = super().run(questions=questions, contexts=contexts)
 
         # calculate average statement relevance score per query
-        for res in result["results"]:
+        for idx, res in enumerate(result["results"]):
+            if res is None:
+                result["results"][idx] = {"statements": [], "statement_scores": [], "score": float("nan")}
+                continue
             if not res["statements"]:
                 res["score"] = 0
             else:
diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py
index 1e561f669..ee9acfed4 100644
--- a/haystack/components/evaluators/faithfulness.py
+++ b/haystack/components/evaluators/faithfulness.py
@@ -84,6 +84,7 @@ def __init__(
         progress_bar: bool = True,
         api: str = "openai",
         api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"),
+        raise_on_failure: bool = True,
     ):
         """
         Creates an instance of FaithfulnessEvaluator.
@@ -112,6 +113,8 @@ def __init__(
             Supported APIs: "openai".
         :param api_key:
             The API key.
+        :param raise_on_failure:
+            Whether to raise an exception if the API call fails.
""" self.instructions = ( @@ -134,6 +137,7 @@ def __init__( examples=self.examples, api=self.api, api_key=self.api_key, + raise_on_failure=raise_on_failure, progress_bar=progress_bar, ) @@ -157,7 +161,10 @@ def run(self, questions: List[str], contexts: List[List[str]], predicted_answers result = super().run(questions=questions, contexts=contexts, predicted_answers=predicted_answers) # calculate average statement faithfulness score per query - for res in result["results"]: + for idx, res in enumerate(result["results"]): + if res is None: + result["results"][idx] = {"statements": [], "statement_scores": [], "score": float("nan")} + continue if not res["statements"]: res["score"] = 0 else: diff --git a/haystack/components/evaluators/llm_evaluator.py b/haystack/components/evaluators/llm_evaluator.py index 9766f236a..fdfe49ffd 100644 --- a/haystack/components/evaluators/llm_evaluator.py +++ b/haystack/components/evaluators/llm_evaluator.py @@ -3,7 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 import json -from typing import Any, Dict, List, Tuple, Type +from typing import Any, Dict, List, Optional, Tuple, Type +from warnings import warn from tqdm import tqdm @@ -54,6 +55,7 @@ def __init__( examples: List[Dict[str, Any]], progress_bar: bool = True, *, + raise_on_failure: bool = True, api: str = "openai", api_key: Secret = Secret.from_env_var("OPENAI_API_KEY"), ): @@ -73,6 +75,8 @@ def __init__( `outputs` parameters. Each example is a dictionary with keys "inputs" and "outputs" They contain the input and output as dictionaries respectively. + :param raise_on_failure: + If True, the component will raise an exception on an unsuccessful API call. :param progress_bar: Whether to show a progress bar during the evaluation. :param api: @@ -83,6 +87,7 @@ def __init__( """ self.validate_init_parameters(inputs, outputs, examples) + self.raise_on_failure = raise_on_failure self.instructions = instructions self.inputs = inputs self.outputs = outputs @@ -168,7 +173,11 @@ def run(self, **inputs) -> Dict[str, Any]: :returns: A dictionary with a single `results` entry that contains a list of results. Each result is a dictionary containing the keys as defined in the `outputs` parameter of the LLMEvaluator - and the evaluation results as the values. + and the evaluation results as the values. If an exception occurs for a particular input value, the result + will be `None` for that entry. + :raises ValueError: + Only in the case that `raise_on_failure` is set to True and the received inputs are not lists or have + different lengths, or if the output is not a valid JSON or doesn't contain the expected keys. """ self.validate_input_parameters(dict(self.inputs), inputs) @@ -177,14 +186,31 @@ def run(self, **inputs) -> Dict[str, Any]: input_names, values = inputs.keys(), list(zip(*inputs.values())) list_of_input_names_to_values = [dict(zip(input_names, v)) for v in values] - results = [] + results: List[Optional[Dict[str, Any]]] = [] + errors = 0 for input_names_to_values in tqdm(list_of_input_names_to_values, disable=not self.progress_bar): prompt = self.builder.run(**input_names_to_values) - result = self.generator.run(prompt=prompt["prompt"]) - - self.validate_outputs(expected=self.outputs, received=result["replies"][0]) - parsed_result = json.loads(result["replies"][0]) - results.append(parsed_result) + try: + result = self.generator.run(prompt=prompt["prompt"]) + except Exception as e: + msg = f"Error while generating response for prompt: {prompt}. 
Error: {e}" + if self.raise_on_failure: + raise ValueError(msg) + warn(msg) + results.append(None) + errors += 1 + continue + + if self.is_valid_json_and_has_expected_keys(expected=self.outputs, received=result["replies"][0]): + parsed_result = json.loads(result["replies"][0]) + results.append(parsed_result) + else: + results.append(None) + errors += 1 + + if errors > 0: + msg = f"LLM evaluator failed for {errors} out of {len(list_of_input_names_to_values)} inputs." + warn(msg) return {"results": results} @@ -299,10 +325,9 @@ def validate_input_parameters(expected: Dict[str, Any], received: Dict[str, Any] ) raise ValueError(msg) - @staticmethod - def validate_outputs(expected: List[str], received: str) -> None: + def is_valid_json_and_has_expected_keys(self, expected: List[str], received: str) -> bool: """ - Validate the output. + Output must be a valid JSON with the expected keys. :param expected: Names of expected outputs @@ -310,9 +335,27 @@ def validate_outputs(expected: List[str], received: str) -> None: Names of received outputs :raises ValueError: - If not all expected outputs are present in the received outputs + If the output is not a valid JSON with the expected keys: + - with `raise_on_failure` set to True a ValueError is raised. + - with `raise_on_failure` set to False a warning is issued and False is returned. + + :returns: + True if the received output is a valid JSON with the expected keys, False otherwise. """ - parsed_output = json.loads(received) + try: + parsed_output = json.loads(received) + except json.JSONDecodeError: + msg = "Response from LLM evaluator is not a valid JSON." + if self.raise_on_failure: + raise ValueError(msg) + warn(msg) + return False + if not all(output in parsed_output for output in expected): msg = f"Expected response from LLM evaluator to be JSON with keys {expected}, got {received}." - raise ValueError(msg) + if self.raise_on_failure: + raise ValueError(msg) + warn(msg) + return False + + return True diff --git a/releasenotes/notes/add-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml b/releasenotes/notes/add-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml new file mode 100644 index 000000000..a97d33c8a --- /dev/null +++ b/releasenotes/notes/add-failsafe-for-LLM-based-evaluators-34cdc183ab545315.yaml @@ -0,0 +1,5 @@ +--- +enhancements: + - | + If an LLM-based evaluator (e.g., `Faithfulness` or `ContextRelevance`) is initialised with `raise_on_failure=False`, and if a call to an LLM fails or an LLM outputs an invalid JSON, the score of the sample is set to `NaN` instead of raising an exception. + The user is notified with a warning indicating the number of requests that failed. 
diff --git a/test/components/evaluators/test_context_relevance_evaluator.py b/test/components/evaluators/test_context_relevance_evaluator.py
index ecbc215d0..2db69004d 100644
--- a/test/components/evaluators/test_context_relevance_evaluator.py
+++ b/test/components/evaluators/test_context_relevance_evaluator.py
@@ -4,6 +4,8 @@
 import os
 from typing import List
 
+import math
+
 import pytest
 
 from haystack.components.evaluators import ContextRelevanceEvaluator
@@ -159,6 +161,45 @@ def test_run_missing_parameters(self, monkeypatch):
         with pytest.raises(TypeError, match="missing 2 required positional arguments"):
             component.run()
 
+    def test_run_returns_nan_raise_on_failure_false(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = ContextRelevanceEvaluator(raise_on_failure=False)
+
+        def generator_run(self, *args, **kwargs):
+            if "Python" in kwargs["prompt"]:
+                raise Exception("OpenAI API request failed.")
+            else:
+                return {"replies": ['{"statements": ["c", "d"], "statement_scores": [1, 1]}']}
+
+        monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
+
+        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
+        contexts = [
+            [
+                "The popularity of sports can be measured in various ways, including TV viewership, social media "
+                "presence, number of participants, and economic impact. Football is undoubtedly the world's most "
+                "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
+                "Messi, drawing a followership of more than 4 billion people."
+            ],
+            [
+                "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming "
+                "language. Its design philosophy emphasizes code readability, and its language constructs aim to help "
+                "programmers write clear, logical code for both small and large-scale software projects."
+            ],
+        ]
+        results = component.run(questions=questions, contexts=contexts)
+
+        assert math.isnan(results["score"])
+
+        assert results["individual_scores"][0] == 1.0
+        assert math.isnan(results["individual_scores"][1])
+
+        assert results["results"][0] == {"statements": ["c", "d"], "statement_scores": [1, 1], "score": 1.0}
+
+        assert results["results"][1]["statements"] == []
+        assert results["results"][1]["statement_scores"] == []
+        assert math.isnan(results["results"][1]["score"])
+
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
         reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py
index e493b709e..5c32f8c06 100644
--- a/test/components/evaluators/test_faithfulness_evaluator.py
+++ b/test/components/evaluators/test_faithfulness_evaluator.py
@@ -2,8 +2,10 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 import os
+import math
 from typing import List
 
+import numpy as np
 import pytest
 
 from haystack.components.evaluators import FaithfulnessEvaluator
@@ -191,6 +193,49 @@ def test_run_missing_parameters(self, monkeypatch):
         with pytest.raises(TypeError, match="missing 3 required positional arguments"):
             component.run()
 
+    def test_run_returns_nan_raise_on_failure_false(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = FaithfulnessEvaluator(raise_on_failure=False)
+
+        def generator_run(self, *args, **kwargs):
+            if "Python" in kwargs["prompt"]:
+                raise Exception("OpenAI API request failed.")
+            else:
+                return {"replies": ['{"statements": ["c", "d"], "statement_scores": [1, 1]}']}
+
+        monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)
+
+        questions = ["Which is the most popular global sport?", "Who created the Python language?"]
+        contexts = [
+            [
+                "The popularity of sports can be measured in various ways, including TV viewership, social media "
+                "presence, number of participants, and economic impact. Football is undoubtedly the world's most "
+                "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
+                "Messi, drawing a followership of more than 4 billion people."
+            ],
+            [
+                "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming "
+                "language. Its design philosophy emphasizes code readability, and its language constructs aim to help "
+                "programmers write clear, logical code for both small and large-scale software projects."
+            ],
+        ]
+        predicted_answers = [
+            "Football is the most popular sport with around 4 billion followers worldwide.",
+            "Guido van Rossum.",
+        ]
+        results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
+
+        assert math.isnan(results["score"])
+
+        assert results["individual_scores"][0] == 1.0
+        assert math.isnan(results["individual_scores"][1])
+
+        assert results["results"][0] == {"statements": ["c", "d"], "statement_scores": [1, 1], "score": 1.0}
+
+        assert results["results"][1]["statements"] == []
+        assert results["results"][1]["statement_scores"] == []
+        assert math.isnan(results["results"][1]["score"])
+
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
         reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
diff --git a/test/components/evaluators/test_llm_evaluator.py b/test/components/evaluators/test_llm_evaluator.py
index 1b28dab84..793f1861b 100644
--- a/test/components/evaluators/test_llm_evaluator.py
+++ b/test/components/evaluators/test_llm_evaluator.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 from typing import List
 
+import numpy as np
 import pytest
 
 from haystack.components.evaluators import LLMEvaluator
@@ -379,10 +380,41 @@ def test_invalid_outputs(self, monkeypatch):
             ],
         )
         with pytest.raises(ValueError):
-            component.validate_outputs(expected=["score", "another_expected_output"], received='{"score": 1.0}')
+            component.is_valid_json_and_has_expected_keys(
+                expected=["score", "another_expected_output"], received='{"score": 1.0}'
+            )
+
+        with pytest.raises(ValueError):
+            component.is_valid_json_and_has_expected_keys(expected=["score"], received='{"wrong_name": 1.0}')
+
+    def test_output_invalid_json_raise_on_failure_false(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = LLMEvaluator(
+            instructions="test-instruction",
+            inputs=[("predicted_answers", List[str])],
+            outputs=["score"],
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
+            raise_on_failure=False,
+        )
+        assert (
+            component.is_valid_json_and_has_expected_keys(expected=["score"], received="some_invalid_json_output")
+            is False
+        )
 
+    def test_output_invalid_json_raise_on_failure_true(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = LLMEvaluator(
+            instructions="test-instruction",
+            inputs=[("predicted_answers", List[str])],
+            outputs=["score"],
+            examples=[
+                {"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
+            ],
+        )
         with pytest.raises(ValueError):
-            component.validate_outputs(expected=["score"], received='{"wrong_name": 1.0}')
+            component.is_valid_json_and_has_expected_keys(expected=["score"], received="some_invalid_json_output")
 
     def test_unsupported_api(self):
         with pytest.raises(ValueError):