diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 4802cb8ac..89db6533b 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -97,7 +97,7 @@ jobs:
             OPTS=(--dist loadfile -n auto)
           fi
           # Now run the unit tests
-          pytest tests/unit "${OPTS[@]}"
+          pytest --nbmake tests/unit "${OPTS[@]}"
         env:
           __RAGAS_DEBUG_TRACKING: true
           RAGAS_DO_NOT_TRACK: true
diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py
index bd6a12fc5..5587a3656 100644
--- a/src/ragas/metrics/base.py
+++ b/src/ragas/metrics/base.py
@@ -96,7 +96,8 @@ def score(self: t.Self, row: t.Dict, callbacks: Callbacks = None) -> float:
         callbacks = callbacks or []
         rm, group_cm = new_group(self.name, inputs=row, callbacks=callbacks)
         try:
-            score = asyncio.run(self._ascore(row=row, callbacks=group_cm))
+            loop = asyncio.get_event_loop()
+            score = loop.run_until_complete(self._ascore(row=row, callbacks=group_cm))
         except Exception as e:
             if not group_cm.ended:
                 rm.on_chain_error(e)
diff --git a/tests/e2e/test_evaluation_in_jupyter.ipynb b/tests/e2e/test_evaluation_in_jupyter.ipynb
index 32078f43e..6b018a8d2 100644
--- a/tests/e2e/test_evaluation_in_jupyter.ipynb
+++ b/tests/e2e/test_evaluation_in_jupyter.ipynb
@@ -103,13 +103,6 @@
     "\n",
     "result"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/tests/unit/test_executor_in_jupyter.ipynb b/tests/unit/test_executor_in_jupyter.ipynb
index f1dba1e42..7fc0dca67 100644
--- a/tests/unit/test_executor_in_jupyter.ipynb
+++ b/tests/unit/test_executor_in_jupyter.ipynb
@@ -1,24 +1,21 @@
 {
  "cells": [
   {
-   "cell_type": "code",
-   "execution_count": 1,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "%load_ext autoreload\n",
-    "%autoreload 2"
+    "# Test Executor "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "b78a418208b84895b03c93c54f1d1d61",
+       "model_id": "ebb0705d6a05459a89f4ae87cbbbfd84",
        "version_major": 2,
        "version_minor": 0
       },
@@ -36,14 +33,14 @@
     "\n",
     "exec = Executor(raise_exceptions=True)\n",
     "for i in range(10):\n",
-    "    exec.submit(sleep, i)\n",
+    "    exec.submit(sleep, i/10)\n",
     "\n",
     "assert exec.results(), \"didn't get anything from results\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -54,7 +51,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -83,13 +80,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "9bb608f8b2de42628fb525581d496d3a",
+       "model_id": "985b8a189c9047c29d6ccebf7c5a938b",
        "version_major": 2,
        "version_minor": 0
       },
@@ -103,7 +100,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "128ca1d600b3457c863ddf376d24c44e",
+       "model_id": "ff3097e24dc249fbab6e610e59ccc9b6",
        "version_major": 2,
        "version_minor": 0
       },
@@ -118,22 +115,56 @@
    "source": [
     "exec = Executor(raise_exceptions=True)\n",
     "for i in range(1000):\n",
-    "    exec.submit(sleep, 1)\n",
+    "    exec.submit(sleep, 0.01)\n",
     "\n",
     "assert exec.results(), \"didn't get anything from results\"\n",
     "\n",
     "for i in range(1000):\n",
-    "    exec.submit(sleep, 1)\n",
+    "    exec.submit(sleep, 0.01)\n",
     "\n",
     "assert exec.results(), \"didn't get anything from results\""
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Test Metric"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from ragas.metrics.base import Metric, EvaluationMode\n",
+    "\n",
+    "class FakeMetric(Metric):\n",
+    "    name = \"fake_metric\"\n",
+    "    evaluation_mode = EvaluationMode.qa\n",
+    "\n",
+    "    def init(self):\n",
+    "        pass\n",
+    "\n",
+    "    async def _ascore(self, row, callbacks)->float:\n",
+    "        return 0\n",
+    "\n",
+    "fm = FakeMetric()"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "score = fm.score(\n",
+    "    row={\"question\": [\"q\"], \"answer\": [\"a\"]},\n",
+    "    callbacks=None,\n",
+    ")\n",
+    "assert score == 0"
+   ]
   }
  ],
  "metadata": {
diff --git a/tests/unit/test_metric.py b/tests/unit/test_metric.py
index 6f1fcf516..311914972 100644
--- a/tests/unit/test_metric.py
+++ b/tests/unit/test_metric.py
@@ -27,3 +27,20 @@ def test_get_available_metrics():
             for metric in get_available_metrics(ds)
         ]
     ), "All metrics should have evaluation mode qa"
+
+
+def test_metric():
+    from ragas.metrics.base import Metric
+
+    class FakeMetric(Metric):
+        name = "fake_metric"  # type: ignore
+        evaluation_mode = EvaluationMode.qa  # type: ignore
+
+        def init(self, run_config):
+            pass
+
+        async def _ascore(self, row, callbacks) -> float:
+            return 0
+
+    fm = FakeMetric()
+    assert fm.score({"question": "a", "answer": "b"}) == 0
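
For context, not part of the diff itself: the src/ragas/metrics/base.py hunk swaps asyncio.run() for reuse of the current event loop, presumably so Metric.score() can run in environments that already manage a loop, such as the Jupyter notebooks now executed by pytest --nbmake. Below is a minimal standard-library sketch of the behavior this relies on; the compute coroutine is a stand-in for illustration, not a ragas API.

    import asyncio

    async def compute() -> int:
        # Stand-in for a metric's _ascore coroutine.
        return 42

    # asyncio.run() always creates (and later closes) a fresh event loop and
    # raises RuntimeError if the calling thread already has a running loop,
    # which is the situation inside a Jupyter cell. Fetching the current loop
    # and driving it with run_until_complete avoids creating a second loop.
    loop = asyncio.get_event_loop()
    result = loop.run_until_complete(compute())
    assert result == 42

    # Caveat: run_until_complete() also refuses a loop that is already
    # running, so live notebook cells typically still need a re-entrancy
    # shim such as nest_asyncio for this pattern to succeed.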