add docstrings for EvaluationRunResult (#7885)
masci committed Jun 19, 2024
1 parent 28902c4 commit 7c31d5f
Showing 4 changed files with 81 additions and 46 deletions.
docs/pydoc/config/evaluation_api.yml (2 additions, 1 deletion)
@@ -3,7 +3,8 @@ loaders:
search_path: [../../../haystack/evaluation]
modules:
[
"eval_run_result"
"base",
"eval_run_result",
]
ignore_when_discovered: ["__init__"]
processors:
haystack/evaluation/__init__.py (2 additions, 1 deletion)
@@ -2,6 +2,7 @@
#
# SPDX-License-Identifier: Apache-2.0

from .eval_run_result import BaseEvaluationRunResult, EvaluationRunResult
from .base import BaseEvaluationRunResult
from .eval_run_result import EvaluationRunResult

__all__ = ["BaseEvaluationRunResult", "EvaluationRunResult"]
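
The public import path is unchanged by the split: BaseEvaluationRunResult now lives in haystack/evaluation/base.py but is still re-exported from the package. A minimal sketch (assuming a Haystack build that includes this commit):

from haystack.evaluation import BaseEvaluationRunResult, EvaluationRunResult

# The concrete result class still derives from the re-exported base class.
assert issubclass(EvaluationRunResult, BaseEvaluationRunResult)
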
haystack/evaluation/base.py (new file, 49 additions)
@@ -0,0 +1,49 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from abc import ABC, abstractmethod
from typing import List, Optional

from pandas import DataFrame


class BaseEvaluationRunResult(ABC):
"""
Represents the results of an evaluation run.
"""

@abstractmethod
def to_pandas(self) -> "DataFrame":
"""
Creates a Pandas DataFrame containing the scores of each metric for every input sample.
:returns:
Pandas DataFrame with the scores.
"""

@abstractmethod
def score_report(self) -> "DataFrame":
"""
Transforms the results into a Pandas DataFrame with the aggregated scores for each metric.
:returns:
Pandas DataFrame with the aggregated scores.
"""

@abstractmethod
def comparative_individual_scores_report(
self, other: "BaseEvaluationRunResult", keep_columns: Optional[List[str]] = None
) -> "DataFrame":
"""
Creates a Pandas DataFrame with the scores for each metric in the results of two different evaluation runs.
The inputs to both evaluation runs are assumed to be the same.
:param other:
Results of another evaluation run to compare with.
:param keep_columns:
List of common column names to keep from the inputs of the evaluation runs to compare.
:returns:
Pandas DataFrame with the score comparison.
"""
haystack/evaluation/eval_run_result.py (28 additions, 44 deletions)
@@ -2,54 +2,14 @@
#
# SPDX-License-Identifier: Apache-2.0

from abc import ABC, abstractmethod
from copy import deepcopy
from typing import Any, Dict, List, Optional
from warnings import warn

from pandas import DataFrame
from pandas import concat as pd_concat


class BaseEvaluationRunResult(ABC):
"""
Represents the results of an evaluation run.
"""

@abstractmethod
def to_pandas(self) -> "DataFrame":
"""
Creates a Pandas DataFrame containing the scores of each metric for every input sample.
:returns:
Pandas DataFrame with the scores.
"""

@abstractmethod
def score_report(self) -> "DataFrame":
"""
Transforms the results into a Pandas DataFrame with the aggregated scores for each metric.
:returns:
Pandas DataFrame with the aggregated scores.
"""

@abstractmethod
def comparative_individual_scores_report(
self, other: "BaseEvaluationRunResult", keep_columns: Optional[List[str]] = None
) -> "DataFrame":
"""
Creates a Pandas DataFrame with the scores for each metric in the results of two different evaluation runs.
The inputs to both evaluation runs is assumed to be the same.
:param other:
Results of another evaluation run to compare with.
:param keep_columns:
List of common column names to keep from the inputs of the evaluation runs to compare.
:returns:
Pandas DataFrame with the score comparison.
"""
from .base import BaseEvaluationRunResult


class EvaluationRunResult(BaseEvaluationRunResult):
@@ -99,13 +59,25 @@ def __init__(self, run_name: str, inputs: Dict[str, List[Any]], results: Dict[st
f"Got {len(outputs['individual_scores'])} but expected {expected_len}."
)

def score_report(self) -> DataFrame: # noqa: D102
def score_report(self) -> DataFrame:
"""
Transforms the results into a Pandas DataFrame with the aggregated scores for each metric.
:returns:
Pandas DataFrame with the aggregated scores.
"""
results = {k: v["score"] for k, v in self.results.items()}
df = DataFrame.from_dict(results, orient="index", columns=["score"]).reset_index()
df.columns = ["metrics", "score"]
return df

def to_pandas(self) -> DataFrame: # noqa: D102
def to_pandas(self) -> DataFrame:
"""
Creates a Pandas DataFrame containing the scores of each metric for every input sample.
:returns:
Pandas DataFrame with the scores.
"""
inputs_columns = list(self.inputs.keys())
inputs_values = list(self.inputs.values())
inputs_values = list(map(list, zip(*inputs_values))) # transpose the values
@@ -118,9 +90,21 @@ def to_pandas(self) -> DataFrame:  # noqa: D102

return df_inputs.join(df_scores)

def comparative_individual_scores_report( # noqa: D102
def comparative_individual_scores_report(
self, other: "BaseEvaluationRunResult", keep_columns: Optional[List[str]] = None
) -> DataFrame:
"""
Creates a Pandas DataFrame with the scores for each metric in the results of two different evaluation runs.
The inputs to both evaluation runs are assumed to be the same.
:param other:
Results of another evaluation run to compare with.
:param keep_columns:
List of common column names to keep from the inputs of the evaluation runs to compare.
:returns:
Pandas DataFrame with the score comparison.
"""
if not isinstance(other, EvaluationRunResult):
raise ValueError("Comparative scores can only be computed between EvaluationRunResults.")

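
A short usage sketch of the concrete class, with invented inputs and scores (assuming a Haystack release containing this commit; each results entry needs a "score" and an "individual_scores" key, as the constructor check above expects):

from haystack.evaluation import EvaluationRunResult

inputs = {
    "question": ["What is the capital of France?", "Who wrote Hamlet?"],
    "answer": ["Paris", "William Shakespeare"],
}
baseline_results = {
    "exact_match": {"score": 0.5, "individual_scores": [1.0, 0.0]},
    "faithfulness": {"score": 0.75, "individual_scores": [0.7, 0.8]},
}
candidate_results = {
    "exact_match": {"score": 1.0, "individual_scores": [1.0, 1.0]},
    "faithfulness": {"score": 0.85, "individual_scores": [0.9, 0.8]},
}

baseline = EvaluationRunResult("baseline", inputs=inputs, results=baseline_results)
candidate = EvaluationRunResult("candidate", inputs=inputs, results=candidate_results)

print(baseline.score_report())   # one aggregated score per metric
print(baseline.to_pandas())      # per-sample scores joined with the inputs
print(baseline.comparative_individual_scores_report(candidate, keep_columns=["answer"]))
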
