.github/workflows/ci.yaml (2 changes: 1 addition & 1 deletion)
@@ -97,7 +97,7 @@ jobs:
OPTS=(--dist loadfile -n auto)
fi
# Now run the unit tests
-pytest tests/unit "${OPTS[@]}"
+pytest --nbmake tests/unit "${OPTS[@]}"
env:
__RAGAS_DEBUG_TRACKING: true
RAGAS_DO_NOT_TRACK: true
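A note on the CI change above: nbmake is a pytest plugin that collects .ipynb files and executes them as tests, so adding --nbmake makes the unit-test step also run the notebooks under tests/unit (e.g. tests/unit/test_executor_in_jupyter.ipynb below). A minimal sketch of driving the same invocation from Python, assuming nbmake is installed (the script itself is illustrative and not part of the repo):

import pytest

# Run the unit tests and, via the nbmake plugin's --nbmake flag,
# also execute any Jupyter notebooks found under tests/unit.
raise SystemExit(pytest.main(["--nbmake", "tests/unit"]))
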
src/ragas/metrics/base.py (3 changes: 2 additions & 1 deletion)
@@ -96,7 +96,8 @@ def score(self: t.Self, row: t.Dict, callbacks: Callbacks = None) -> float:
callbacks = callbacks or []
rm, group_cm = new_group(self.name, inputs=row, callbacks=callbacks)
try:
-score = asyncio.run(self._ascore(row=row, callbacks=group_cm))
+loop = asyncio.get_event_loop()
+score = loop.run_until_complete(self._ascore(row=row, callbacks=group_cm))
Member: this will still fail in jupyter notebook envs right?

Member: sorry my bad!

except Exception as e:
if not group_cm.ended:
rm.on_chain_error(e)
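A note on the review thread above: the reviewer's concern is real in principle. In a plain script, asyncio.get_event_loop().run_until_complete(...) typically works (though calling get_event_loop() with no running loop is deprecated on newer Python versions), but inside an already-running event loop, as in a Jupyter notebook cell, both the removed asyncio.run(...) pattern and the added run_until_complete(...) pattern raise RuntimeError. A minimal runnable sketch with hypothetical names (not the ragas code) illustrating the failure mode:

import asyncio

async def _ascore() -> float:
    # stand-in for Metric._ascore
    return 0.0

def score_with_run() -> float:
    # the pattern removed in this diff
    return asyncio.run(_ascore())

def score_with_loop() -> float:
    # the pattern added in this diff
    loop = asyncio.get_event_loop()
    return loop.run_until_complete(_ascore())

async def simulated_notebook_cell() -> None:
    # Jupyter runs cells with an event loop already active, so both
    # synchronous wrappers raise RuntimeError when called from here.
    for fn in (score_with_run, score_with_loop):
        try:
            fn()
        except RuntimeError as err:
            print(f"{fn.__name__}: {err}")

asyncio.run(simulated_notebook_cell())

A common workaround in notebook environments is nest_asyncio.apply(), which patches the running loop so run_until_complete() can re-enter it; whether this PR relies on that is not visible in this diff.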
tests/e2e/test_evaluation_in_jupyter.ipynb (7 changes: 0 additions & 7 deletions)
@@ -103,13 +103,6 @@
"\n",
"result"
]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": []
}
],
"metadata": {
tests/unit/test_executor_in_jupyter.ipynb (65 changes: 48 additions & 17 deletions)
@@ -1,24 +1,21 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
"# Test Executor "
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b78a418208b84895b03c93c54f1d1d61",
"model_id": "ebb0705d6a05459a89f4ae87cbbbfd84",
"version_major": 2,
"version_minor": 0
},
@@ -36,14 +33,14 @@
"\n",
"exec = Executor(raise_exceptions=True)\n",
"for i in range(10):\n",
" exec.submit(sleep, i)\n",
" exec.submit(sleep, i/10)\n",
"\n",
"assert exec.results(), \"didn't get anything from results\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@@ -54,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
@@ -83,13 +80,13 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9bb608f8b2de42628fb525581d496d3a",
"model_id": "985b8a189c9047c29d6ccebf7c5a938b",
"version_major": 2,
"version_minor": 0
},
@@ -103,7 +100,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "128ca1d600b3457c863ddf376d24c44e",
"model_id": "ff3097e24dc249fbab6e610e59ccc9b6",
"version_major": 2,
"version_minor": 0
},
@@ -118,22 +115,56 @@
"source": [
"exec = Executor(raise_exceptions=True)\n",
"for i in range(1000):\n",
" exec.submit(sleep, 1)\n",
" exec.submit(sleep, 0.01)\n",
"\n",
"assert exec.results(), \"didn't get anything from results\"\n",
"\n",
"for i in range(1000):\n",
" exec.submit(sleep, 1)\n",
" exec.submit(sleep, 0.01)\n",
"\n",
"assert exec.results(), \"didn't get anything from results\""
]
},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"# Test Metric"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 23,
+"metadata": {},
+"outputs": [],
+"source": [
+"from ragas.metrics.base import Metric, EvaluationMode\n",
+"\n",
+"class FakeMetric(Metric):\n",
+" name = \"fake_metric\"\n",
+" evaluation_mode = EvaluationMode.qa\n",
+"\n",
+" def init(self):\n",
+" pass\n",
+"\n",
+" async def _ascore(self, row, callbacks)->float:\n",
+" return 0\n",
+"\n",
+"fm = FakeMetric()"
+]
+},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": []
"source": [
"score = fm.score(\n",
" row={\"question\": [\"q\"], \"answer\": [\"a\"]},\n",
" callbacks=None,\n",
")\n",
"assert score == 0"
]
}
],
"metadata": {
tests/unit/test_metric.py (17 changes: 17 additions & 0 deletions)
@@ -27,3 +27,20 @@ def test_get_available_metrics():
for metric in get_available_metrics(ds)
]
), "All metrics should have evaluation mode qa"


+def test_metric():
+    from ragas.metrics.base import Metric
+
+    class FakeMetric(Metric):
+        name = "fake_metric"  # type: ignore
+        evaluation_mode = EvaluationMode.qa  # type: ignore
+
+        def init(self, run_config):
+            pass
+
+        async def _ascore(self, row, callbacks) -> float:
+            return 0
+
+    fm = FakeMetric()
+    assert fm.score({"question": "a", "answer": "b"}) == 0