Merged
26 changes: 25 additions & 1 deletion src/ragas/evaluation.py
@@ -48,6 +48,30 @@ def evaluate(
     metrics : list[Metric], optional
         List of metrics to use for evaluation. If not provided then ragas will run the
         evaluation on the best set of metrics to give a complete view.
+    llm : BaseRagasLLM, optional
+        The language model to use for the metrics. If not provided then ragas will use
+        the default language model. This can be overridden by the llm specified at
+        the metric level with `metric.llm`.
+    embeddings : BaseRagasEmbeddings, optional
+        The embeddings to use for the metrics. If not provided then ragas will use
+        the default embeddings. This can be overridden by the embeddings specified at
+        the metric level with `metric.embeddings`.
+    callbacks : Callbacks, optional
+        Lifecycle LangChain Callbacks to run during evaluation. Check the
+        [langchain documentation](https://python.langchain.com/docs/modules/callbacks/)
+        for more information.
+    is_async : bool, optional
+        Whether to run the evaluation in async mode or not. If set to True then the
+        evaluation is run by calling the `metric.ascore` method. If the llm or
+        embeddings do not support async then the evaluation can be run in sync mode
+        with `is_async=False`. Default is False.
+    max_workers : int, optional
+        The number of workers to use for the evaluation. This is used by the
+        `ThreadPoolExecutor` to run the evaluation in sync mode.
+    raise_exceptions : bool, optional
+        Whether to raise exceptions or not. If set to True then the evaluation will
+        raise an exception if any of the metrics fail. If set to False then the
+        evaluation will return `np.nan` for the row that failed. Default is True.
Comment on lines +51 to +74 (Member, Author): the new capabilities have been documented
     column_map : dict[str, str], optional
         The column names of the dataset to use for evaluation. If the column names of
         the dataset are different from the default ones then you can provide the
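
Taken together, the parameters documented above suggest a call along the following lines. A minimal sketch, assuming ragas' default column names; the dataset values and the `faithfulness` import are illustrative, not part of this PR:

```python
from datasets import Dataset

from ragas import evaluate
from ragas.metrics import faithfulness

# Toy dataset using ragas' default column names (values are illustrative).
data = Dataset.from_dict(
    {
        "question": ["When was the first Super Bowl played?"],
        "answer": ["The first Super Bowl was played on January 15, 1967."],
        "contexts": [["The First AFL-NFL World Championship Game was played in 1967."]],
    }
)

result = evaluate(
    data,
    metrics=[faithfulness],
    is_async=False,          # sync mode: scoring runs in a ThreadPoolExecutor
    max_workers=2,           # thread count used in sync mode
    raise_exceptions=False,  # failed rows get np.nan instead of aborting the run
)
print(result)
```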
@@ -178,7 +202,7 @@ def evaluate(
             evaluation_rm.on_chain_error(e)
 
         raise e
-    finally:
+    else:
         result = Result(
             scores=Dataset.from_list(scores),
             dataset=dataset,
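
The `finally:` to `else:` switch just above is behavioral, not cosmetic: a `finally:` block runs even while the exception re-raised in the `except:` branch is propagating, so the old code attempted to build a `Result` even after a failure, whereas `else:` runs only when the `try:` block succeeded. A minimal standalone sketch of the difference (names are illustrative):

```python
def score_with(fn):
    try:
        score = fn()
    except Exception:
        score = float("nan")
    else:
        # Runs only when the try block succeeded. A `finally:` here would
        # also run after a failure, reporting a score that was never computed.
        print("scored:", score)
    return score

score_with(lambda: 1.0)    # prints "scored: 1.0" and returns 1.0
score_with(lambda: 1 / 0)  # prints nothing and returns nan
```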
8 changes: 4 additions & 4 deletions src/ragas/executor.py
@@ -3,6 +3,7 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass, field
 
+import numpy as np
 from tqdm.auto import tqdm
 

@@ -74,7 +75,7 @@ async def _aresults(self) -> t.List[t.Any]:
             desc=self.desc,
             total=len(self.futures),
         ):
-            r = (-1, None)
+            r = (-1, np.nan)
             try:
                 r = await future
             except Exception as e:
@@ -109,18 +110,17 @@ def results(self) -> t.List[t.Any]:
                 desc=self.desc,
                 total=len(self.futures),
             ):
-                r = (-1, None)
+                r = (-1, np.nan)
                 try:
                     r = future.result()
                 except Exception as e:
-                    r = (-1, None)
+                    r = (-1, np.nan)
                     if self.raise_exceptions:
                         raise e
                 finally:
                     results.append(r)
         finally:
             self.executor.shutdown(wait=False)
 
-        print(results)
         sorted_results = sorted(results, key=lambda x: x[0])
         return [r[1] for r in sorted_results]
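
As a standalone illustration of the pattern this file converges on: recording `(index, np.nan)` for a failed job so every row keeps its slot, and sorting by index at the end so output order matches submission order. A self-contained sketch that mirrors the diff's structure but is not the ragas `Executor` itself:

```python
import typing as t
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np


def run_jobs(
    jobs: t.List[t.Callable[[], float]], raise_exceptions: bool = True
) -> t.List[float]:
    """Run jobs in a thread pool; a failed job yields np.nan at its index."""
    results: t.List[t.Tuple[int, float]] = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(job): i for i, job in enumerate(jobs)}
        for future in as_completed(futures):
            i = futures[future]
            r = (i, np.nan)  # placeholder that stays if the job failed
            try:
                r = (i, future.result())
            except Exception as e:
                if raise_exceptions:
                    raise e
            finally:
                results.append(r)
    # Sort by original index so output order matches submission order.
    return [score for _, score in sorted(results, key=lambda x: x[0])]


print(run_jobs([lambda: 1.0, lambda: 1 / 0], raise_exceptions=False))
# -> [1.0, nan]
```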