add docstrings for EvaluationRunResult (#7885)
masci committed Jun 19, 2024
1 parent 28902c4 commit 7c31d5f
Showing 4 changed files with 81 additions and 46 deletions.
docs/pydoc/config/evaluation_api.yml (2 additions, 1 deletion)
@@ -3,7 +3,8 @@ loaders:
search_path: [../../../haystack/evaluation]
modules:
[
"eval_run_result"
"base",
"eval_run_result",
]
ignore_when_discovered: ["__init__"]
processors:
haystack/evaluation/__init__.py (2 additions, 1 deletion)
@@ -2,6 +2,7 @@
#
# SPDX-License-Identifier: Apache-2.0

from .eval_run_result import BaseEvaluationRunResult, EvaluationRunResult
from .base import BaseEvaluationRunResult
from .eval_run_result import EvaluationRunResult

__all__ = ["BaseEvaluationRunResult", "EvaluationRunResult"]
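
The public import path is unchanged by the split: BaseEvaluationRunResult now lives in haystack/evaluation/base.py but is still re-exported from the package. A minimal sketch (assuming a Haystack build that includes this commit):

from haystack.evaluation import BaseEvaluationRunResult, EvaluationRunResult

# The concrete result class still derives from the re-exported base class.
assert issubclass(EvaluationRunResult, BaseEvaluationRunResult)
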
haystack/evaluation/base.py (new file, 49 additions)
@@ -0,0 +1,49 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from abc import ABC, abstractmethod
from typing import List, Optional

from pandas import DataFrame


class BaseEvaluationRunResult(ABC):
"""
Represents the results of an evaluation run.
"""

@abstractmethod
def to_pandas(self) -> "DataFrame":
"""
Creates a Pandas DataFrame containing the scores of each metric for every input sample.
:returns:
Pandas DataFrame with the scores.
"""

@abstractmethod
def score_report(self) -> "DataFrame":
"""
Transforms the results into a Pandas DataFrame with the aggregated scores for each metric.
:returns:
Pandas DataFrame with the aggregated scores.
"""

@abstractmethod
def comparative_individual_scores_report(
self, other: "BaseEvaluationRunResult", keep_columns: Optional[List[str]] = None
) -> "DataFrame":
"""
Creates a Pandas DataFrame with the scores for each metric in the results of two different evaluation runs.
The inputs to both evaluation runs are assumed to be the same.
:param other:
Results of another evaluation run to compare with.
:param keep_columns:
List of common column names to keep from the inputs of the evaluation runs to compare.
:returns:
Pandas DataFrame with the score comparison.
"""
haystack/evaluation/eval_run_result.py (28 additions, 44 deletions)
@@ -2,54 +2,14 @@
#
# SPDX-License-Identifier: Apache-2.0

from abc import ABC, abstractmethod
from copy import deepcopy
from typing import Any, Dict, List, Optional
from warnings import warn

from pandas import DataFrame
from pandas import concat as pd_concat


class BaseEvaluationRunResult(ABC):
"""
Represents the results of an evaluation run.
"""

@abstractmethod
def to_pandas(self) -> "DataFrame":
"""
Creates a Pandas DataFrame containing the scores of each metric for every input sample.
:returns:
Pandas DataFrame with the scores.
"""

@abstractmethod
def score_report(self) -> "DataFrame":
"""
Transforms the results into a Pandas DataFrame with the aggregated scores for each metric.
:returns:
Pandas DataFrame with the aggregated scores.
"""

@abstractmethod
def comparative_individual_scores_report(
self, other: "BaseEvaluationRunResult", keep_columns: Optional[List[str]] = None
) -> "DataFrame":
"""
Creates a Pandas DataFrame with the scores for each metric in the results of two different evaluation runs.
The inputs to both evaluation runs is assumed to be the same.
:param other:
Results of another evaluation run to compare with.
:param keep_columns:
List of common column names to keep from the inputs of the evaluation runs to compare.
:returns:
Pandas DataFrame with the score comparison.
"""
from .base import BaseEvaluationRunResult


class EvaluationRunResult(BaseEvaluationRunResult):
@@ -99,13 +59,25 @@ def __init__(self, run_name: str, inputs: Dict[str, List[Any]], results: Dict[st
f"Got {len(outputs['individual_scores'])} but expected {expected_len}."
)

def score_report(self) -> DataFrame: # noqa: D102
def score_report(self) -> DataFrame:
"""
Transforms the results into a Pandas DataFrame with the aggregated scores for each metric.
:returns:
Pandas DataFrame with the aggregated scores.
"""
results = {k: v["score"] for k, v in self.results.items()}
df = DataFrame.from_dict(results, orient="index", columns=["score"]).reset_index()
df.columns = ["metrics", "score"]
return df

def to_pandas(self) -> DataFrame: # noqa: D102
def to_pandas(self) -> DataFrame:
"""
Creates a Pandas DataFrame containing the scores of each metric for every input sample.
:returns:
Pandas DataFrame with the scores.
"""
inputs_columns = list(self.inputs.keys())
inputs_values = list(self.inputs.values())
inputs_values = list(map(list, zip(*inputs_values))) # transpose the values
@@ -118,9 +90,21 @@ def to_pandas(self) -> DataFrame:  # noqa: D102

return df_inputs.join(df_scores)

def comparative_individual_scores_report( # noqa: D102
def comparative_individual_scores_report(
self, other: "BaseEvaluationRunResult", keep_columns: Optional[List[str]] = None
) -> DataFrame:
"""
Creates a Pandas DataFrame with the scores for each metric in the results of two different evaluation runs.
The inputs to both evaluation runs are assumed to be the same.
:param other:
Results of another evaluation run to compare with.
:param keep_columns:
List of common column names to keep from the inputs of the evaluation runs to compare.
:returns:
Pandas DataFrame with the score comparison.
"""
if not isinstance(other, EvaluationRunResult):
raise ValueError("Comparative scores can only be computed between EvaluationRunResults.")

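
A short usage sketch of the concrete class, with invented inputs and scores (assuming a Haystack release containing this commit; each results entry needs a "score" and an "individual_scores" key, as the constructor check above expects):

from haystack.evaluation import EvaluationRunResult

inputs = {
    "question": ["What is the capital of France?", "Who wrote Hamlet?"],
    "answer": ["Paris", "William Shakespeare"],
}
baseline_results = {
    "exact_match": {"score": 0.5, "individual_scores": [1.0, 0.0]},
    "faithfulness": {"score": 0.75, "individual_scores": [0.7, 0.8]},
}
candidate_results = {
    "exact_match": {"score": 1.0, "individual_scores": [1.0, 1.0]},
    "faithfulness": {"score": 0.85, "individual_scores": [0.9, 0.8]},
}

baseline = EvaluationRunResult("baseline", inputs=inputs, results=baseline_results)
candidate = EvaluationRunResult("candidate", inputs=inputs, results=candidate_results)

print(baseline.score_report())   # one aggregated score per metric
print(baseline.to_pandas())      # per-sample scores joined with the inputs
print(baseline.comparative_individual_scores_report(candidate, keep_columns=["answer"]))
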
