From 7c31d5f418526ee0795461dfcff71cbf78977196 Mon Sep 17 00:00:00 2001
From: Massimiliano Pippi
Date: Wed, 19 Jun 2024 11:49:41 +0200
Subject: [PATCH] add docstrings for EvaluationRunResult (#7885)

---
 docs/pydoc/config/evaluation_api.yml   |  3 +-
 haystack/evaluation/__init__.py        |  3 +-
 haystack/evaluation/base.py            | 49 ++++++++++++++++++
 haystack/evaluation/eval_run_result.py | 72 ++++++++++----------------
 4 files changed, 81 insertions(+), 46 deletions(-)
 create mode 100644 haystack/evaluation/base.py

diff --git a/docs/pydoc/config/evaluation_api.yml b/docs/pydoc/config/evaluation_api.yml
index fe972c22a8..e445e9a568 100644
--- a/docs/pydoc/config/evaluation_api.yml
+++ b/docs/pydoc/config/evaluation_api.yml
@@ -3,7 +3,8 @@ loaders:
     search_path: [../../../haystack/evaluation]
     modules:
       [
-        "eval_run_result"
+        "base",
+        "eval_run_result",
       ]
     ignore_when_discovered: ["__init__"]
 processors:
diff --git a/haystack/evaluation/__init__.py b/haystack/evaluation/__init__.py
index d85c43081b..734699a03f 100644
--- a/haystack/evaluation/__init__.py
+++ b/haystack/evaluation/__init__.py
@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0

-from .eval_run_result import BaseEvaluationRunResult, EvaluationRunResult
+from .base import BaseEvaluationRunResult
+from .eval_run_result import EvaluationRunResult

 __all__ = ["BaseEvaluationRunResult", "EvaluationRunResult"]
diff --git a/haystack/evaluation/base.py b/haystack/evaluation/base.py
new file mode 100644
index 0000000000..617fa17638
--- /dev/null
+++ b/haystack/evaluation/base.py
@@ -0,0 +1,49 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from abc import ABC, abstractmethod
+from typing import List, Optional
+
+from pandas import DataFrame
+
+
+class BaseEvaluationRunResult(ABC):
+    """
+    Represents the results of an evaluation run.
+    """
+
+    @abstractmethod
+    def to_pandas(self) -> "DataFrame":
+        """
+        Creates a Pandas DataFrame containing the scores of each metric for every input sample.
+
+        :returns:
+            Pandas DataFrame with the scores.
+        """
+
+    @abstractmethod
+    def score_report(self) -> "DataFrame":
+        """
+        Transforms the results into a Pandas DataFrame with the aggregated scores for each metric.
+
+        :returns:
+            Pandas DataFrame with the aggregated scores.
+        """
+
+    @abstractmethod
+    def comparative_individual_scores_report(
+        self, other: "BaseEvaluationRunResult", keep_columns: Optional[List[str]] = None
+    ) -> "DataFrame":
+        """
+        Creates a Pandas DataFrame with the scores for each metric in the results of two different evaluation runs.
+
+        The inputs to both evaluation runs are assumed to be the same.
+
+        :param other:
+            Results of another evaluation run to compare with.
+        :param keep_columns:
+            List of common column names to keep from the inputs of the evaluation runs to compare.
+        :returns:
+            Pandas DataFrame with the score comparison.
+        """
diff --git a/haystack/evaluation/eval_run_result.py b/haystack/evaluation/eval_run_result.py
index d57ee87ed0..907e159f01 100644
--- a/haystack/evaluation/eval_run_result.py
+++ b/haystack/evaluation/eval_run_result.py
@@ -2,7 +2,6 @@
 #
 # SPDX-License-Identifier: Apache-2.0

-from abc import ABC, abstractmethod
 from copy import deepcopy
 from typing import Any, Dict, List, Optional
 from warnings import warn
@@ -10,46 +9,7 @@
 from pandas import DataFrame
 from pandas import concat as pd_concat

-
-class BaseEvaluationRunResult(ABC):
-    """
-    Represents the results of an evaluation run.
-    """
-
-    @abstractmethod
-    def to_pandas(self) -> "DataFrame":
-        """
-        Creates a Pandas DataFrame containing the scores of each metric for every input sample.
-
-        :returns:
-            Pandas DataFrame with the scores.
-        """
-
-    @abstractmethod
-    def score_report(self) -> "DataFrame":
-        """
-        Transforms the results into a Pandas DataFrame with the aggregated scores for each metric.
-
-        :returns:
-            Pandas DataFrame with the aggregated scores.
-        """
-
-    @abstractmethod
-    def comparative_individual_scores_report(
-        self, other: "BaseEvaluationRunResult", keep_columns: Optional[List[str]] = None
-    ) -> "DataFrame":
-        """
-        Creates a Pandas DataFrame with the scores for each metric in the results of two different evaluation runs.
-
-        The inputs to both evaluation runs is assumed to be the same.
-
-        :param other:
-            Results of another evaluation run to compare with.
-        :param keep_columns:
-            List of common column names to keep from the inputs of the evaluation runs to compare.
-        :returns:
-            Pandas DataFrame with the score comparison.
-        """
+from .base import BaseEvaluationRunResult


 class EvaluationRunResult(BaseEvaluationRunResult):
@@ -99,13 +59,25 @@ def __init__(self, run_name: str, inputs: Dict[str, List[Any]], results: Dict[st
                     f"Got {len(outputs['individual_scores'])} but expected {expected_len}."
                 )

-    def score_report(self) -> DataFrame:  # noqa: D102
+    def score_report(self) -> DataFrame:
+        """
+        Transforms the results into a Pandas DataFrame with the aggregated scores for each metric.
+
+        :returns:
+            Pandas DataFrame with the aggregated scores.
+        """
         results = {k: v["score"] for k, v in self.results.items()}
         df = DataFrame.from_dict(results, orient="index", columns=["score"]).reset_index()
         df.columns = ["metrics", "score"]
         return df

-    def to_pandas(self) -> DataFrame:  # noqa: D102
+    def to_pandas(self) -> DataFrame:
+        """
+        Creates a Pandas DataFrame containing the scores of each metric for every input sample.
+
+        :returns:
+            Pandas DataFrame with the scores.
+        """
         inputs_columns = list(self.inputs.keys())
         inputs_values = list(self.inputs.values())
         inputs_values = list(map(list, zip(*inputs_values)))  # transpose the values
@@ -118,9 +90,21 @@ def to_pandas(self) -> DataFrame:  # noqa: D102

         return df_inputs.join(df_scores)

-    def comparative_individual_scores_report(  # noqa: D102
+    def comparative_individual_scores_report(
         self, other: "BaseEvaluationRunResult", keep_columns: Optional[List[str]] = None
     ) -> DataFrame:
+        """
+        Creates a Pandas DataFrame with the scores for each metric in the results of two different evaluation runs.
+
+        The inputs to both evaluation runs are assumed to be the same.
+
+        :param other:
+            Results of another evaluation run to compare with.
+        :param keep_columns:
+            List of common column names to keep from the inputs of the evaluation runs to compare.
+        :returns:
+            Pandas DataFrame with the score comparison.
+        """
         if not isinstance(other, EvaluationRunResult):
             raise ValueError("Comparative scores can only be computed between EvaluationRunResults.")