diff --git a/haystack/evaluation/eval_run_result.py b/haystack/evaluation/eval_run_result.py
index 3b9c0e82aa..6149ad1b9c 100644
--- a/haystack/evaluation/eval_run_result.py
+++ b/haystack/evaluation/eval_run_result.py
@@ -131,7 +131,8 @@ def aggregated_report(
             JSON or DataFrame with aggregated scores, in case the output is set to a CSV file, a message
             confirming the successful write or an error message.
         """
-        results = {k: v["score"] for k, v in self.results.items()}
+
+        results = {k: str(v["score"]) for k, v in self.results.items()}
         data = {"metrics": list(results.keys()), "score": list(results.values())}
         return self._handle_output(data, output_format, csv_file)
diff --git a/test/evaluation/test_eval_run_result.py b/test/evaluation/test_eval_run_result.py
index 055b1d7186..a171bed14d 100644
--- a/test/evaluation/test_eval_run_result.py
+++ b/test/evaluation/test_eval_run_result.py
@@ -100,7 +100,7 @@ def test_score_report():
                 "faithfulness",
                 "semantic_answer_similarity",
             ],
-            "score": [0.476932, 0.75, 0.46428375, 0.58177975, 0.40585375, 0.53757075],
+            "score": ["0.476932", "0.75", "0.46428375", "0.58177975", "0.40585375", "0.53757075"],
         }
     )
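
For context, a minimal standalone sketch of the behavioural change in `aggregated_report`: scores are now cast to `str` before the report payload is built. The `results_store` dict below is a made-up stand-in for `self.results` (values borrowed from the expected DataFrame in `test_score_report`); it is not part of the patch.

```python
# Hypothetical stand-in for self.results; metric names and values are taken
# from the test's expected DataFrame, not from a real pipeline run.
results_store = {
    "reciprocal_rank": {"score": 0.476932},
    "single_hit": {"score": 0.75},
}

# Before this patch: scores stay floats.
before = {k: v["score"] for k, v in results_store.items()}

# After this patch: scores are cast to str before building the report payload.
after = {k: str(v["score"]) for k, v in results_store.items()}

data = {"metrics": list(after.keys()), "score": list(after.values())}
print(data)  # {'metrics': ['reciprocal_rank', 'single_hit'], 'score': ['0.476932', '0.75']}
```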