diff --git a/nbs/metric/base.ipynb b/nbs/metric/base.ipynb new file mode 100644 index 0000000..c6d6e24 --- /dev/null +++ b/nbs/metric/base.ipynb @@ -0,0 +1,142 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "00ef8db1", + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp metric.base" + ] + }, + { + "cell_type": "markdown", + "id": "2eb8f806", + "metadata": {}, + "source": [ + "# BaseMetric\n", + "> base class for all type of metrics in ragas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8ccff58", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/Caskroom/miniforge/base/envs/random/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "#| export\n", + "\n", + "from abc import ABC, abstractmethod\n", + "import asyncio\n", + "from dataclasses import dataclass, field\n", + "from pydantic import BaseModel\n", + "import typing as t\n", + "from ragas_annotator.metric import MetricResult\n", + "from ragas_annotator.metric import LLM\n", + "\n", + "@dataclass\n", + "class Metric(ABC):\n", + " \"\"\"Base class for all metrics in the LLM evaluation library.\"\"\"\n", + " name: str\n", + " prompt: str\n", + " llm: LLM\n", + " _response_models: t.Dict[bool, t.Type[BaseModel]] = field(\n", + " default_factory=dict, init=False, repr=False\n", + " )\n", + " \n", + " @abstractmethod\n", + " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", + " \"\"\"Get the appropriate response model.\"\"\"\n", + " pass\n", + "\n", + " @abstractmethod\n", + " def _ensemble(self, results: t.List[MetricResult]) -> MetricResult:\n", + " pass\n", + " \n", + " \n", + " def score(self, reasoning: bool = True, n: int = 1, **kwargs) -> t.Any:\n", + " responses = []\n", + " prompt_input = self.prompt.format(**kwargs)\n", + " for _ in range(n):\n", + " response = self.llm.generate(prompt_input, response_model = self._get_response_model(reasoning)) \n", + " response = MetricResult(**response.model_dump())\n", + " responses.append(response)\n", + " return self._ensemble(responses)\n", + "\n", + "\n", + " async def ascore(self, reasoning: bool = True, n: int = 1, **kwargs) -> MetricResult:\n", + " responses = [] # Added missing initialization\n", + " prompt_input = self.prompt.format(**kwargs)\n", + " for _ in range(n):\n", + " response = await self.llm.agenerate(prompt_input, response_model = self._get_response_model(reasoning))\n", + " response = MetricResult(**response.model_dump()) # Fixed missing parentheses\n", + " responses.append(response)\n", + " return self._ensemble(responses)\n", + " \n", + " def batch_score(self, inputs: t.List[t.Dict[str, t.Any]], reasoning: bool = True, n: int = 1) -> t.List[t.Any]:\n", + " return [self.score(reasoning, n, **input_dict) for input_dict in inputs]\n", + " \n", + " async def abatch_score(self, inputs: t.List[t.Dict[str, t.Any]], reasoning: bool = True, n: int = 1) -> t.List[MetricResult]:\n", + " async_tasks = []\n", + " for input_dict in inputs:\n", + " # Add reasoning and n to the input parameters\n", + " async_tasks.append(self.ascore(reasoning=reasoning, n=n, **input_dict))\n", + " \n", + " # Run all tasks concurrently and return results\n", + " return await asyncio.gather(*async_tasks)" + ] + }, + { + 
"cell_type": "markdown", + "id": "fc4b7458", + "metadata": {}, + "source": [ + "### Example\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcf208fa", + "metadata": {}, + "outputs": [], + "source": [ + "#| eval: false\n", + "\n", + "@dataclass\n", + "class CustomMetric(Metric):\n", + " values: t.List[str] = field(default_factory=lambda: [\"pass\", \"fail\"])\n", + " \n", + " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", + " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", + " \n", + " class mymodel(BaseModel):\n", + " result: int\n", + " reason: t.Optional[str] = None\n", + " \n", + " return mymodel \n", + "\n", + " def _ensemble(self,results:t.List[MetricResult]) -> MetricResult:\n", + " \n", + " return results[0] # Placeholder for ensemble logic\n", + "\n", + "my_metric = CustomMetric(name=\"example\", prompt=\"What is the result of {input}?\", llm=LLM())\n", + "my_metric.score(input=\"test\")" + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs/metric/decorator.ipynb b/nbs/metric/decorator.ipynb new file mode 100644 index 0000000..70131f0 --- /dev/null +++ b/nbs/metric/decorator.ipynb @@ -0,0 +1,215 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp metric.decorator" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# decorator factory for metrics\n", + "> decorator factory for creating custom metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "\n", + "import typing as t\n", + "import inspect\n", + "import asyncio\n", + "from dataclasses import dataclass\n", + "from ragas_annotator.metric import MetricResult\n", + "\n", + "\n", + "\n", + "\n", + "def create_metric_decorator(metric_class):\n", + " \"\"\"\n", + " Factory function that creates decorator factories for different metric types.\n", + " \n", + " Args:\n", + " metric_class: The metric class to use (DiscreteMetrics, NumericMetrics, etc.)\n", + " \n", + " Returns:\n", + " A decorator factory function for the specified metric type\n", + " \"\"\"\n", + " def decorator_factory(llm, prompt, name: t.Optional[str] = None, **metric_params):\n", + " \"\"\"\n", + " Creates a decorator that wraps a function into a metric instance.\n", + " \n", + " Args:\n", + " llm: The language model instance to use\n", + " prompt: The prompt template\n", + " name: Optional name for the metric (defaults to function name)\n", + " **metric_params: Additional parameters specific to the metric type\n", + " (values for DiscreteMetrics, range for NumericMetrics, etc.)\n", + " \n", + " Returns:\n", + " A decorator function\n", + " \"\"\"\n", + " def decorator(func):\n", + " # Get metric name and check if function is async\n", + " metric_name = name or func.__name__\n", + " is_async = inspect.iscoroutinefunction(func)\n", + " \n", + " @dataclass\n", + " class CustomMetric(metric_class):\n", + " def _extract_result(self, result, reasoning: bool):\n", + " \"\"\"Extract score and reason from the result.\"\"\"\n", + " if isinstance(result, tuple) and len(result) == 2:\n", + " score, reason = result\n", + " else:\n", + " score, reason = result, None\n", + " \n", + " # Use \"result\" instead of \"score\" for the new MetricResult implementation\n", + " return MetricResult(result=score, reason=reason if reasoning else None)\n", + " \n", + " def 
_run_sync_in_async(self, func, *args, **kwargs):\n", + " \"\"\"Run a synchronous function in an async context.\"\"\"\n", + " # For sync functions, just run them normally\n", + " return func(*args, **kwargs)\n", + " \n", + " def _execute_metric(self, is_async_execution, reasoning, **kwargs):\n", + " \"\"\"Execute the metric function with proper async handling.\"\"\"\n", + " try:\n", + " if is_async:\n", + " # Async function implementation\n", + " if is_async_execution:\n", + " # In async context, await the function directly\n", + " result = func(self.llm, self.prompt, **kwargs)\n", + " else:\n", + " # In sync context, run the async function in an event loop\n", + " try:\n", + " loop = asyncio.get_event_loop()\n", + " except RuntimeError:\n", + " loop = asyncio.new_event_loop()\n", + " asyncio.set_event_loop(loop)\n", + " result = loop.run_until_complete(func(self.llm, self.prompt, **kwargs))\n", + " else:\n", + " # Sync function implementation\n", + " result = func(self.llm, self.prompt, **kwargs)\n", + " \n", + " return self._extract_result(result, reasoning)\n", + " except Exception as e:\n", + " # Handle errors gracefully\n", + " error_msg = f\"Error executing metric {self.name}: {str(e)}\"\n", + " return MetricResult(result=None, reason=error_msg)\n", + " \n", + " def score(self, reasoning: bool = True, n: int = 1, **kwargs):\n", + " \"\"\"Synchronous scoring method.\"\"\"\n", + " return self._execute_metric(is_async_execution=False, reasoning=reasoning, **kwargs)\n", + " \n", + " async def ascore(self, reasoning: bool = True, n: int = 1, **kwargs):\n", + " \"\"\"Asynchronous scoring method.\"\"\"\n", + " if is_async:\n", + " # For async functions, await the result\n", + " result = await func(self.llm, self.prompt, **kwargs)\n", + " return self._extract_result(result, reasoning)\n", + " else:\n", + " # For sync functions, run normally\n", + " result = self._run_sync_in_async(func, self.llm, self.prompt, **kwargs)\n", + " return self._extract_result(result, reasoning)\n", + " \n", + " # Create the metric instance with all parameters\n", + " metric_instance = CustomMetric(\n", + " name=metric_name,\n", + " prompt=prompt,\n", + " llm=llm,\n", + " **metric_params\n", + " )\n", + " \n", + " # Preserve metadata\n", + " metric_instance.__name__ = metric_name\n", + " metric_instance.__doc__ = func.__doc__\n", + " \n", + " return metric_instance\n", + " \n", + " return decorator\n", + " \n", + " return decorator_factory\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example usage\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "high\n", + "reason\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "\n", + "\n", + "from ragas_annotator.metric import DiscreteMetric\n", + "from ragas_annotator.metric.llm import LLM\n", + "from pydantic import BaseModel\n", + "\n", + "discrete_metric = create_metric_decorator(DiscreteMetric)\n", + "\n", + "@discrete_metric(llm=LLM(),\n", + " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", + " name='new_metric',values=[\"low\",\"med\",\"high\"])\n", + "def my_metric(llm,prompt,**kwargs):\n", + "\n", + " class response_model(BaseModel):\n", + " output: t.List[bool]\n", + " reason: str\n", + " \n", + " response = llm.generate(prompt.format(**kwargs),response_model=response_model)\n", + " total = sum(response.output)\n", + " if total < 1:\n", + " score = 'low'\n", + " else:\n", + " 
score = 'high'\n", + " return score,\"reason\"\n", + "\n", + "result = my_metric.score(response='my response') # result\n", + "print(result)\n", + "print(result.reason)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs/metric/discrete.ipynb b/nbs/metric/discrete.ipynb new file mode 100644 index 0000000..c27815c --- /dev/null +++ b/nbs/metric/discrete.ipynb @@ -0,0 +1,179 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp metric.discrete" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DiscreteMetric\n", + "> Base class from which all discrete metrics should inherit." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "import typing as t\n", + "from dataclasses import dataclass, field\n", + "from pydantic import BaseModel, create_model\n", + "from collections import Counter\n", + "from ragas_annotator.metric import Metric, MetricResult\n", + "from ragas_annotator.metric.decorator import create_metric_decorator\n", + "\n", + "\n", + "@dataclass\n", + "class DiscreteMetric(Metric):\n", + " values: t.List[str] = field(default_factory=lambda: [\"pass\", \"fail\"])\n", + " \n", + " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", + " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", + " \n", + " if with_reasoning in self._response_models:\n", + " return self._response_models[with_reasoning]\n", + " \n", + " model_name = 'response_model'\n", + " values = tuple(self.values)\n", + " fields = {\"result\": (t.Literal[values], ...)}\n", + " \n", + " if with_reasoning:\n", + " fields[\"reason\"] = (str, ...) 
# type: ignore\n", + " \n", + " model = create_model(model_name, **fields) # type: ignore\n", + " self._response_models[with_reasoning] = model\n", + " return model \n", + "\n", + " def _ensemble(self,results:t.List[MetricResult]) -> MetricResult:\n", + "\n", + "\n", + " if len(results)==1:\n", + " return results[0]\n", + " \n", + " candidates = [candidate.result for candidate in results]\n", + " counter = Counter(candidates)\n", + " max_count = max(counter.values())\n", + " for candidate in results:\n", + " if counter[candidate.result] == max_count:\n", + " result = candidate.result \n", + " reason = candidate.reason\n", + " return MetricResult(result=result, reason=reason)\n", + " \n", + " return results[0]\n", + "\n", + "\n", + "discrete_metric = create_metric_decorator(DiscreteMetric)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example usage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "low\n", + "No context or content was provided for evaluation.\n" + ] + } + ], + "source": [ + "\n", + "#| eval: false\n", + "\n", + "from ragas_annotator.metric.llm import LLM\n", + "\n", + "my_metric = DiscreteMetric(\n", + " llm=LLM(),\n", + " name='helpfulness',\n", + " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", + " values=[\"low\",\"med\",\"high\"],\n", + ")\n", + "\n", + "result = my_metric.score(response=\"this is my response\")\n", + "print(result) #gives \"low\"\n", + "print(result.reason) #gives reasoning from llm\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write custom discrete metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "low\n", + "reason\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "@discrete_metric(llm=LLM(),\n", + " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", + " name='new_metric',values=[\"low\",\"med\",\"high\"])\n", + "def my_metric(llm,prompt,**kwargs):\n", + "\n", + " class response_model(BaseModel):\n", + " output: t.List[bool]\n", + " reason: str\n", + " \n", + " response = llm.generate(prompt.format(**kwargs),response_model=response_model)\n", + " total = sum(response.output)\n", + " if total < 1:\n", + " score = 'low'\n", + " else:\n", + " score = 'high'\n", + " return score,\"reason\"\n", + "\n", + "result = my_metric.score(response='my response') # result\n", + "print(result)\n", + "print(result.reason)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs/metric/llm.ipynb b/nbs/metric/llm.ipynb new file mode 100644 index 0000000..6ceca63 --- /dev/null +++ b/nbs/metric/llm.ipynb @@ -0,0 +1,61 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp metric.llm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "\n", + "import openai\n", + "import instructor\n", + "from dataclasses import dataclass\n", + "\n", + "@dataclass\n", + "class LLM:\n", + "\n", + " def __post_init__(self):\n", + " self.aclient = 
instructor.from_openai(openai.AsyncOpenAI())\n", + " self.client = instructor.from_openai(openai.OpenAI())\n", + "\n", + " \n", + " def generate(self,prompt,response_model):\n", + " return self.client.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": prompt},\n", + " ],\n", + " response_model=response_model,\n", + " )\n", + "\n", + " async def agenerate(self,prompt,response_model):\n", + " return await self.aclient.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": prompt},\n", + " ],\n", + " response_model=response_model,\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs/metric/numeric.ipynb b/nbs/metric/numeric.ipynb new file mode 100644 index 0000000..e3b08b0 --- /dev/null +++ b/nbs/metric/numeric.ipynb @@ -0,0 +1,181 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Numeric Metric\n", + "> Base class for all numeric metrics\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp metric.numeric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "\n", + "import typing as t\n", + "from dataclasses import dataclass, field\n", + "from pydantic import BaseModel, create_model\n", + "from ragas_annotator.metric import Metric, MetricResult\n", + "from ragas_annotator.metric.decorator import create_metric_decorator\n", + "\n", + "@dataclass\n", + "class NumericMetric(Metric):\n", + " range: t.Tuple[float,float]\n", + " \n", + " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", + " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", + " \n", + " if with_reasoning in self._response_models:\n", + " return self._response_models[with_reasoning]\n", + " \n", + " model_name = 'response_model'\n", + " fields = {\"result\": (float,...)}\n", + " \n", + " if with_reasoning:\n", + " fields[\"reason\"] = (str, ...) 
#type: ignore\n", + " \n", + " model = create_model(model_name, **fields)\n", + " self._response_models[with_reasoning] = model\n", + " return model \n", + "\n", + " def _ensemble(self,results:t.List[MetricResult]) -> MetricResult:\n", + "\n", + " if len(results)==1:\n", + " return results[0]\n", + " \n", + " candidates = [candidate.result for candidate in results]\n", + " result = sum(candidates)/len(candidates)\n", + " reason = results[0].reason\n", + " \n", + " return MetricResult(result=result,reason=reason)\n", + " \n", + " \n", + "numeric_metric = create_metric_decorator(NumericMetric)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example usage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'The response does not provide any context or information that can be evaluated as helpful.'" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| eval: false\n", + "\n", + "from ragas_annotator.metric.llm import LLM\n", + "\n", + "my_metric = NumericMetric(\n", + " name='helpfulness',\n", + " llm=LLM(),\n", + " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", + " range=(0,10),\n", + ")\n", + "\n", + "result = my_metric.score(response=\"this is my response\")\n", + "result #gives \"low\"\n", + "result.reason #gives reasoning from llm\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write custom numeric metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "20" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "#| eval: false\n", + "\n", + "@numeric_metric(llm=LLM(),\n", + " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", + " name='new_metric',range=(0,10))\n", + "def my_metric(llm,prompt,**kwargs):\n", + "\n", + " class response_model(BaseModel):\n", + " output: int\n", + " reason: str\n", + " \n", + " response = llm.generate(prompt.format(**kwargs),response_model=response_model)\n", + " total = response.output\n", + " if total < 1:\n", + " score = 0\n", + " else:\n", + " score = 10\n", + " return score,\"reason\"\n", + "\n", + "result = my_metric.score(response='my response') # result\n", + "result # 10\n", + "result.reason # the reason for the answer\n", + "\n", + "result1 = my_metric.score(response='my response 1') # result\n", + "result2 = my_metric.score(response='my response 2') # result\n", + "\n", + "result1 + result2 # should be addable and behave like a float\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs/metric/ranking.ipynb b/nbs/metric/ranking.ipynb new file mode 100644 index 0000000..48e2aa3 --- /dev/null +++ b/nbs/metric/ranking.ipynb @@ -0,0 +1,217 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RankingMetric\n", + "> Base class for ranking metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp metric.ranking" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [ + "#| export\n", + "\n", + "import typing as t\n", + "from dataclasses import dataclass\n", + "from pydantic import BaseModel, Field\n", + "from ragas_annotator.metric import Metric, MetricResult\n", + "from ragas_annotator.metric.decorator import create_metric_decorator\n", + "\n", + "@dataclass\n", + "class RankingMetric(Metric):\n", + " num_ranks: int\n", + " \n", + " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", + " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", + " \n", + " if with_reasoning in self._response_models:\n", + " return self._response_models[with_reasoning]\n", + " \n", + " # Store values needed for validation\n", + " num_ranks = self.num_ranks\n", + " \n", + " # Create explicit model classes instead of using create_model\n", + " if with_reasoning:\n", + " # Model with result and reason\n", + " class ResponseModelWithReason(BaseModel):\n", + " result: t.List[int] = Field(...)\n", + " reason: str = Field(...)\n", + " \n", + " def model_post_init(self, __context):\n", + " expected = set(range(num_ranks))\n", + " if set(self.result) != expected:\n", + " raise ValueError(\n", + " f\"'result' must contain exactly the numbers {sorted(expected)} without repetition.\"\n", + " )\n", + " \n", + " self._response_models[with_reasoning] = ResponseModelWithReason\n", + " return ResponseModelWithReason\n", + " else:\n", + " # Model with just result\n", + " class ResponseModel(BaseModel):\n", + " result: t.List[int] = Field(...)\n", + " \n", + " def model_post_init(self, __context):\n", + " expected = set(range(num_ranks))\n", + " if set(self.result) != expected:\n", + " raise ValueError(\n", + " f\"'result' must contain exactly the numbers {sorted(expected)} without repetition.\"\n", + " )\n", + " \n", + " self._response_models[with_reasoning] = ResponseModel\n", + " return ResponseModel\n", + "\n", + " def _ensemble(self, results: t.List[MetricResult]) -> MetricResult:\n", + " if len(results) == 1:\n", + " return results[0]\n", + "\n", + " n_items = self.num_ranks # Use the class attribute instead of len(results)\n", + " borda_scores = [0] * n_items\n", + "\n", + " for result in results:\n", + " for position_idx, item_idx in enumerate(result.result):\n", + " borda_scores[item_idx] += (n_items - position_idx) # Fixed the formula\n", + "\n", + " indexed_scores = [(score, i) for i, score in enumerate(borda_scores)] \n", + " indexed_scores.sort(key=lambda x: (-x[0], x[1])) \n", + " final_ranking = [pos for _, pos in indexed_scores]\n", + "\n", + " if any(r.reason for r in results):\n", + " reason = \"Ensemble ranking based on multiple evaluations.\\n\" + '\\n'.join([r.reason for r in results if r.reason])\n", + " else:\n", + " reason = None\n", + " \n", + " return MetricResult(result=final_ranking, reason=reason)\n", + " \n", + "\n", + "ranking_metric = create_metric_decorator(RankingMetric)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example usage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0, 1, 2]\n", + "Ensemble ranking based on multiple evaluations.\n", + "The ranking is based on the length and detail of the responses, with 'short answer.' being the least detailed (rank 0), 'a bit more detailed.' being moderate (rank 1), and 'the longest and most detailed answer.' 
being the most comprehensive (rank 2).\n", + "The ranking is based on the length and detail of the responses. The shortest response is ranked the lowest (0), the moderately detailed response is ranked higher (1), and the longest and most detailed response is ranked the highest (2).\n", + "Ranking is based on length and detail; the longest answer (2) is most detailed, followed by a bit more detailed (1), and the shortest answer (0) is the least detailed.\n" + ] + } + ], + "source": [ + "\n", + "#| eval: false\n", + "\n", + "from ragas_annotator.metric.llm import LLM\n", + "\n", + "my_ranking_metric = RankingMetric(\n", + " name='response_ranking',\n", + " llm=LLM(), # Your language model instance\n", + " prompt=\"Rank the following responses:\\n{candidates}\",\n", + " num_ranks=3,\n", + ")\n", + "\n", + "# To score a single input (ranking candidate responses)\n", + "result = my_ranking_metric.score(candidates=[\n", + " \"short answer.\",\n", + " \"a bit more detailed.\",\n", + " \"the longest and most detailed answer.\"\n", + "],n=3)\n", + "print(result) # Might output something like: [1, 0, 2]\n", + "print(result.reason) # Provides the reasoning behind the ranking\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Custom ranking metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1, 0, 2]\n", + "Ranked based on response clarity and detail.\n" + ] + } + ], + "source": [ + "#| eval: false\n", + "\n", + "\n", + "@ranking_metric(\n", + " llm=LLM(), # Your language model instance\n", + " prompt=\"Rank the following responses:\\n{candidates}\",\n", + " name='new_ranking_metric',\n", + " num_ranks=3\n", + ")\n", + "def my_ranking_metric(llm, prompt, **kwargs):\n", + " # Your custom logic that calls the LLM and returns a tuple of (ranking, reason)\n", + " # For example, process the prompt (formatted with candidates) and produce a ranking.\n", + " ranking = [1, 0, 2] # Dummy ranking: second candidate is best, then first, then third.\n", + " reason = \"Ranked based on response clarity and detail.\"\n", + " return ranking, reason\n", + "\n", + "# Using the decorator-based ranking metric:\n", + "result = my_ranking_metric.score(candidates=[\n", + " \"Response A: short answer.\",\n", + " \"Response B: a bit more detailed.\",\n", + " \"Response C: the longest and most detailed answer.\"\n", + "])\n", + "print(result) # E.g., [1, 0, 2]\n", + "print(result.reason) # E.g., \"Ranked based on response clarity and detail.\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs/metric/result.ipynb b/nbs/metric/result.ipynb new file mode 100644 index 0000000..cba95c7 --- /dev/null +++ b/nbs/metric/result.ipynb @@ -0,0 +1,264 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "215f57b4", + "metadata": {}, + "source": [ + "# MetricResult\n", + "> MetricResult object to store the result of a metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "164726f3", + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp metric.result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f1c801a-6568-4ba4-8bbe-30bf154174fe", + "metadata": {}, + "outputs": [], + 
"source": [ + "#| export\n", + "\n", + "import typing as t\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "class MetricResult:\n", + " \"\"\"Class to hold the result of a metric evaluation.\n", + " \n", + " This class behaves like its underlying result value but still provides access\n", + " to additional metadata like reasoning.\n", + " \n", + " Works with:\n", + " - DiscreteMetrics (string results)\n", + " - NumericMetrics (float/int results)\n", + " - RankingMetrics (list results)\n", + " \"\"\"\n", + " \n", + " def __init__(self, result: t.Any, reason: t.Optional[str] = None):\n", + " self._result = result\n", + " self.reason = reason\n", + " \n", + " def __repr__(self):\n", + " return repr(self._result)\n", + " \n", + " # Access to underlying result\n", + " @property\n", + " def result(self):\n", + " \"\"\"Get the raw result value.\"\"\"\n", + " return self._result\n", + " \n", + " \n", + " # String conversion - works for all types\n", + " def __str__(self):\n", + " return str(self._result)\n", + " \n", + " # Container-like behaviors for list results (RankingMetric)\n", + " def __getitem__(self, key):\n", + " if not hasattr(self._result, \"__getitem__\"):\n", + " raise TypeError(f\"{type(self._result).__name__} object is not subscriptable\")\n", + " return self._result[key]\n", + " \n", + " def __iter__(self):\n", + " if not hasattr(self._result, \"__iter__\"):\n", + " raise TypeError(f\"{type(self._result).__name__} object is not iterable\")\n", + " return iter(self._result)\n", + " \n", + " def __len__(self):\n", + " if not hasattr(self._result, \"__len__\"):\n", + " raise TypeError(f\"{type(self._result).__name__} has no len()\")\n", + " return len(self._result)\n", + " \n", + " # Numeric operations for numeric results (NumericMetric)\n", + " def __float__(self):\n", + " if isinstance(self._result, (int, float)):\n", + " return float(self._result)\n", + " raise TypeError(f\"Cannot convert {type(self._result).__name__} to float\")\n", + " \n", + " def __int__(self):\n", + " if isinstance(self._result, (int, float)):\n", + " return int(self._result)\n", + " raise TypeError(f\"Cannot convert {type(self._result).__name__} to int\")\n", + " \n", + " def __add__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot add {type(self._result).__name__} objects\")\n", + " if isinstance(other, MetricResult):\n", + " return self._result + other._result\n", + " return self._result + other\n", + " \n", + " def __radd__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot add {type(self._result).__name__} objects\")\n", + " return other + self._result\n", + " \n", + " def __sub__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot subtract {type(self._result).__name__} objects\")\n", + " if isinstance(other, MetricResult):\n", + " return self._result - other._result\n", + " return self._result - other\n", + " \n", + " def __rsub__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot subtract {type(self._result).__name__} objects\")\n", + " return other - self._result\n", + " \n", + " def __mul__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot multiply {type(self._result).__name__} objects\")\n", + " if isinstance(other, MetricResult):\n", + " return self._result * other._result\n", + " return self._result * other\n", + " \n", + " def __rmul__(self, 
other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot multiply {type(self._result).__name__} objects\")\n", + " return other * self._result\n", + " \n", + " def __truediv__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot divide {type(self._result).__name__} objects\")\n", + " if isinstance(other, MetricResult):\n", + " return self._result / other._result\n", + " return self._result / other\n", + " \n", + " def __rtruediv__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot divide {type(self._result).__name__} objects\")\n", + " return other / self._result\n", + " \n", + " # Comparison operations - work for all types with same-type comparisons\n", + " def __eq__(self, other):\n", + " if isinstance(other, MetricResult):\n", + " return self._result == other._result\n", + " return self._result == other\n", + " \n", + " def __lt__(self, other):\n", + " if isinstance(other, MetricResult):\n", + " return self._result < other._result\n", + " return self._result < other\n", + " \n", + " def __le__(self, other):\n", + " if isinstance(other, MetricResult):\n", + " return self._result <= other._result\n", + " return self._result <= other\n", + " \n", + " def __gt__(self, other):\n", + " if isinstance(other, MetricResult):\n", + " return self._result > other._result\n", + " return self._result > other\n", + " \n", + " def __ge__(self, other):\n", + " if isinstance(other, MetricResult):\n", + " return self._result >= other._result\n", + " return self._result >= other\n", + " \n", + " # Method forwarding for type-specific behaviors\n", + " def __getattr__(self, name):\n", + " \"\"\"Forward attribute access to the result object if it has that attribute.\n", + " \n", + " This allows calling string methods on discrete results, \n", + " numeric methods on numeric results, and list methods on ranking results.\n", + " \"\"\"\n", + " if hasattr(self._result, name):\n", + " attr = getattr(self._result, name)\n", + " if callable(attr):\n", + " # If it's a method, wrap it to return MetricResult when appropriate\n", + " def wrapper(*args, **kwargs):\n", + " result = attr(*args, **kwargs)\n", + " # If the result is of the same type as self._result, wrap it\n", + " if isinstance(result, type(self._result)):\n", + " return MetricResult(result=result, reason=self.reason)\n", + " return result\n", + " return wrapper\n", + " return attr\n", + " raise AttributeError(f\"{type(self).__name__} has no attribute '{name}'\")\n", + " \n", + " # JSON/dict serialization\n", + " def to_dict(self):\n", + " \"\"\"Convert the result to a dictionary.\"\"\"\n", + " return {\n", + " \"result\": self._result,\n", + " \"reason\": self.reason\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "490cdd2f", + "metadata": {}, + "source": [ + "### Example Usage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24589401", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "42\n", + "This is a test\n", + "8.0\n", + "LOW\n", + "[2, 3]\n" + ] + } + ], + "source": [ + "\n", + "\n", + "metric_result = MetricResult(result=42, reason=\"This is a test\")\n", + "print(metric_result)\n", + "print(metric_result.reason)\n", + "\n", + "### Example with Numeric Operations\n", + "num_result1 = MetricResult(result=5.0)\n", + "num_result2 = MetricResult(result=3.0)\n", + "print(num_result1 + num_result2) # 8.0\n", + "\n", + "\n", + "### 
Example with String Operations\n", + "str_result = MetricResult(result=\"low\")\n", + "print(str_result.upper()) # \"LOW\"\n", + "\n", + "## Example with List Operations\n", + "list_result = MetricResult(result=[1, 2, 3])\n", + "print(list_result[1:]) # 2\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a984dde9", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/sidebar.yml b/nbs/sidebar.yml index f065330..c6a1ec2 100644 --- a/nbs/sidebar.yml +++ b/nbs/sidebar.yml @@ -4,17 +4,35 @@ website: - index.ipynb - dataset.ipynb - experiment.ipynb + - init_module.ipynb - section: backends contents: + - backends/factory.ipynb + - backends/mock_notion_client.ipynb - backends/notion.ipynb + - section: metric + contents: + - metric/base.ipynb + - metric/decorator.ipynb + - metric/discrete.ipynb + - metric/llm.ipynb + - metric/numeric.ipynb + - metric/ranking.ipynb + - metric/result.ipynb + - metric/test_base.ipynb - section: model contents: - model/notion_model.ipynb - model/notion_types.ipynb - section: project contents: - - project/experiment.ipynb - - project/project.ipynb + - project/comparison.ipynb + - project/core.ipynb + - project/experiments.ipynb + - project/naming.ipynb + - section: tracing + contents: + - tracing/langfuse.ipynb - section: utils contents: - utils/exceptions.ipynb diff --git a/ragas_annotator/_modidx.py b/ragas_annotator/_modidx.py index 58458aa..c055929 100644 --- a/ragas_annotator/_modidx.py +++ b/ragas_annotator/_modidx.py @@ -131,6 +131,97 @@ 'ragas_annotator/experiment.py'), 'ragas_annotator.experiment.Experiment.__str__': ( 'experiment.html#experiment.__str__', 'ragas_annotator/experiment.py')}, + 'ragas_annotator.metric.base': { 'ragas_annotator.metric.base.Metric': ( 'metric/base.html#metric', + 'ragas_annotator/metric/base.py'), + 'ragas_annotator.metric.base.Metric._ensemble': ( 'metric/base.html#metric._ensemble', + 'ragas_annotator/metric/base.py'), + 'ragas_annotator.metric.base.Metric._get_response_model': ( 'metric/base.html#metric._get_response_model', + 'ragas_annotator/metric/base.py'), + 'ragas_annotator.metric.base.Metric.abatch_score': ( 'metric/base.html#metric.abatch_score', + 'ragas_annotator/metric/base.py'), + 'ragas_annotator.metric.base.Metric.ascore': ( 'metric/base.html#metric.ascore', + 'ragas_annotator/metric/base.py'), + 'ragas_annotator.metric.base.Metric.batch_score': ( 'metric/base.html#metric.batch_score', + 'ragas_annotator/metric/base.py'), + 'ragas_annotator.metric.base.Metric.score': ( 'metric/base.html#metric.score', + 'ragas_annotator/metric/base.py')}, + 'ragas_annotator.metric.decorator': { 'ragas_annotator.metric.decorator.create_metric_decorator': ( 'metric/decorator.html#create_metric_decorator', + 'ragas_annotator/metric/decorator.py')}, + 'ragas_annotator.metric.discrete': { 'ragas_annotator.metric.discrete.DiscreteMetric': ( 'metric/discrete.html#discretemetric', + 'ragas_annotator/metric/discrete.py'), + 'ragas_annotator.metric.discrete.DiscreteMetric._ensemble': ( 'metric/discrete.html#discretemetric._ensemble', + 'ragas_annotator/metric/discrete.py'), + 'ragas_annotator.metric.discrete.DiscreteMetric._get_response_model': ( 'metric/discrete.html#discretemetric._get_response_model', + 'ragas_annotator/metric/discrete.py')}, + 'ragas_annotator.metric.llm': { 'ragas_annotator.metric.llm.LLM': ('metric/llm.html#llm', 
'ragas_annotator/metric/llm.py'), + 'ragas_annotator.metric.llm.LLM.__post_init__': ( 'metric/llm.html#llm.__post_init__', + 'ragas_annotator/metric/llm.py'), + 'ragas_annotator.metric.llm.LLM.agenerate': ( 'metric/llm.html#llm.agenerate', + 'ragas_annotator/metric/llm.py'), + 'ragas_annotator.metric.llm.LLM.generate': ( 'metric/llm.html#llm.generate', + 'ragas_annotator/metric/llm.py')}, + 'ragas_annotator.metric.numeric': { 'ragas_annotator.metric.numeric.NumericMetric': ( 'metric/numeric.html#numericmetric', + 'ragas_annotator/metric/numeric.py'), + 'ragas_annotator.metric.numeric.NumericMetric._ensemble': ( 'metric/numeric.html#numericmetric._ensemble', + 'ragas_annotator/metric/numeric.py'), + 'ragas_annotator.metric.numeric.NumericMetric._get_response_model': ( 'metric/numeric.html#numericmetric._get_response_model', + 'ragas_annotator/metric/numeric.py')}, + 'ragas_annotator.metric.ranking': { 'ragas_annotator.metric.ranking.RankingMetric': ( 'metric/ranking.html#rankingmetric', + 'ragas_annotator/metric/ranking.py'), + 'ragas_annotator.metric.ranking.RankingMetric._ensemble': ( 'metric/ranking.html#rankingmetric._ensemble', + 'ragas_annotator/metric/ranking.py'), + 'ragas_annotator.metric.ranking.RankingMetric._get_response_model': ( 'metric/ranking.html#rankingmetric._get_response_model', + 'ragas_annotator/metric/ranking.py')}, + 'ragas_annotator.metric.result': { 'ragas_annotator.metric.result.MetricResult': ( 'metric/result.html#metricresult', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__add__': ( 'metric/result.html#metricresult.__add__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__eq__': ( 'metric/result.html#metricresult.__eq__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__float__': ( 'metric/result.html#metricresult.__float__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__ge__': ( 'metric/result.html#metricresult.__ge__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__getattr__': ( 'metric/result.html#metricresult.__getattr__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__getitem__': ( 'metric/result.html#metricresult.__getitem__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__gt__': ( 'metric/result.html#metricresult.__gt__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__init__': ( 'metric/result.html#metricresult.__init__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__int__': ( 'metric/result.html#metricresult.__int__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__iter__': ( 'metric/result.html#metricresult.__iter__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__le__': ( 'metric/result.html#metricresult.__le__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__len__': ( 'metric/result.html#metricresult.__len__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__lt__': ( 'metric/result.html#metricresult.__lt__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__mul__': ( 'metric/result.html#metricresult.__mul__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__radd__': ( 
'metric/result.html#metricresult.__radd__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__repr__': ( 'metric/result.html#metricresult.__repr__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__rmul__': ( 'metric/result.html#metricresult.__rmul__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__rsub__': ( 'metric/result.html#metricresult.__rsub__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__rtruediv__': ( 'metric/result.html#metricresult.__rtruediv__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__str__': ( 'metric/result.html#metricresult.__str__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__sub__': ( 'metric/result.html#metricresult.__sub__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__truediv__': ( 'metric/result.html#metricresult.__truediv__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.result': ( 'metric/result.html#metricresult.result', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.to_dict': ( 'metric/result.html#metricresult.to_dict', + 'ragas_annotator/metric/result.py')}, 'ragas_annotator.model.notion_model': { 'ragas_annotator.model.notion_model.NotionModel': ( 'model/notion_model.html#notionmodel', 'ragas_annotator/model/notion_model.py'), 'ragas_annotator.model.notion_model.NotionModel.__getattr__': ( 'model/notion_model.html#notionmodel.__getattr__', diff --git a/ragas_annotator/metric/__init__.py b/ragas_annotator/metric/__init__.py new file mode 100644 index 0000000..57a31d3 --- /dev/null +++ b/ragas_annotator/metric/__init__.py @@ -0,0 +1,14 @@ +from ragas_annotator.metric.result import MetricResult +from ragas_annotator.metric.llm import LLM +from ragas_annotator.metric.base import Metric +from ragas_annotator.metric.discrete import DiscreteMetric +from ragas_annotator.metric.numeric import NumericMetric +from ragas_annotator.metric.ranking import RankingMetric + +__all__ = ['MetricResult', + 'LLM', + 'Metric', + 'DiscreteMetric', + 'NumericMetric', + 'RankingMetric', + ] diff --git a/ragas_annotator/metric/base.py b/ragas_annotator/metric/base.py new file mode 100644 index 0000000..d37b9c5 --- /dev/null +++ b/ragas_annotator/metric/base.py @@ -0,0 +1,66 @@ +"""base class for all type of metrics in ragas""" + +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/metric/base.ipynb. + +# %% auto 0 +__all__ = ['Metric'] + +# %% ../../nbs/metric/base.ipynb 2 +from abc import ABC, abstractmethod +import asyncio +from dataclasses import dataclass, field +from pydantic import BaseModel +import typing as t +from . import MetricResult +from . 
import LLM + +@dataclass +class Metric(ABC): + """Base class for all metrics in the LLM evaluation library.""" + name: str + prompt: str + llm: LLM + _response_models: t.Dict[bool, t.Type[BaseModel]] = field( + default_factory=dict, init=False, repr=False + ) + + @abstractmethod + def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]: + """Get the appropriate response model.""" + pass + + @abstractmethod + def _ensemble(self, results: t.List[MetricResult]) -> MetricResult: + pass + + + def score(self, reasoning: bool = True, n: int = 1, **kwargs) -> t.Any: + responses = [] + prompt_input = self.prompt.format(**kwargs) + for _ in range(n): + response = self.llm.generate(prompt_input, response_model = self._get_response_model(reasoning)) + response = MetricResult(**response.model_dump()) + responses.append(response) + return self._ensemble(responses) + + + async def ascore(self, reasoning: bool = True, n: int = 1, **kwargs) -> MetricResult: + responses = [] # Added missing initialization + prompt_input = self.prompt.format(**kwargs) + for _ in range(n): + response = await self.llm.agenerate(prompt_input, response_model = self._get_response_model(reasoning)) + response = MetricResult(**response.model_dump()) # Fixed missing parentheses + responses.append(response) + return self._ensemble(responses) + + def batch_score(self, inputs: t.List[t.Dict[str, t.Any]], reasoning: bool = True, n: int = 1) -> t.List[t.Any]: + return [self.score(reasoning, n, **input_dict) for input_dict in inputs] + + async def abatch_score(self, inputs: t.List[t.Dict[str, t.Any]], reasoning: bool = True, n: int = 1) -> t.List[MetricResult]: + async_tasks = [] + for input_dict in inputs: + # Add reasoning and n to the input parameters + async_tasks.append(self.ascore(reasoning=reasoning, n=n, **input_dict)) + + # Run all tasks concurrently and return results + return await asyncio.gather(*async_tasks) diff --git a/ragas_annotator/metric/decorator.py b/ragas_annotator/metric/decorator.py new file mode 100644 index 0000000..016773a --- /dev/null +++ b/ragas_annotator/metric/decorator.py @@ -0,0 +1,124 @@ +"""decorator factory for creating custom metrics""" + +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/metric/decorator.ipynb. + +# %% auto 0 +__all__ = ['create_metric_decorator'] + +# %% ../../nbs/metric/decorator.ipynb 2 +import typing as t +import inspect +import asyncio +from dataclasses import dataclass +from . import MetricResult + + + + +def create_metric_decorator(metric_class): + """ + Factory function that creates decorator factories for different metric types. + + Args: + metric_class: The metric class to use (DiscreteMetrics, NumericMetrics, etc.) + + Returns: + A decorator factory function for the specified metric type + """ + def decorator_factory(llm, prompt, name: t.Optional[str] = None, **metric_params): + """ + Creates a decorator that wraps a function into a metric instance. + + Args: + llm: The language model instance to use + prompt: The prompt template + name: Optional name for the metric (defaults to function name) + **metric_params: Additional parameters specific to the metric type + (values for DiscreteMetrics, range for NumericMetrics, etc.) 
+ + Returns: + A decorator function + """ + def decorator(func): + # Get metric name and check if function is async + metric_name = name or func.__name__ + is_async = inspect.iscoroutinefunction(func) + + @dataclass + class CustomMetric(metric_class): + def _extract_result(self, result, reasoning: bool): + """Extract score and reason from the result.""" + if isinstance(result, tuple) and len(result) == 2: + score, reason = result + else: + score, reason = result, None + + # Use "result" instead of "score" for the new MetricResult implementation + return MetricResult(result=score, reason=reason if reasoning else None) + + def _run_sync_in_async(self, func, *args, **kwargs): + """Run a synchronous function in an async context.""" + # For sync functions, just run them normally + return func(*args, **kwargs) + + def _execute_metric(self, is_async_execution, reasoning, **kwargs): + """Execute the metric function with proper async handling.""" + try: + if is_async: + # Async function implementation + if is_async_execution: + # In async context, await the function directly + result = func(self.llm, self.prompt, **kwargs) + else: + # In sync context, run the async function in an event loop + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + result = loop.run_until_complete(func(self.llm, self.prompt, **kwargs)) + else: + # Sync function implementation + result = func(self.llm, self.prompt, **kwargs) + + return self._extract_result(result, reasoning) + except Exception as e: + # Handle errors gracefully + error_msg = f"Error executing metric {self.name}: {str(e)}" + return MetricResult(result=None, reason=error_msg) + + def score(self, reasoning: bool = True, n: int = 1, **kwargs): + """Synchronous scoring method.""" + return self._execute_metric(is_async_execution=False, reasoning=reasoning, **kwargs) + + async def ascore(self, reasoning: bool = True, n: int = 1, **kwargs): + """Asynchronous scoring method.""" + if is_async: + # For async functions, await the result + result = await func(self.llm, self.prompt, **kwargs) + return self._extract_result(result, reasoning) + else: + # For sync functions, run normally + result = self._run_sync_in_async(func, self.llm, self.prompt, **kwargs) + return self._extract_result(result, reasoning) + + # Create the metric instance with all parameters + metric_instance = CustomMetric( + name=metric_name, + prompt=prompt, + llm=llm, + **metric_params + ) + + # Preserve metadata + metric_instance.__name__ = metric_name + metric_instance.__doc__ = func.__doc__ + + return metric_instance + + return decorator + + return decorator_factory + + + diff --git a/ragas_annotator/metric/discrete.py b/ragas_annotator/metric/discrete.py new file mode 100644 index 0000000..d4f77f7 --- /dev/null +++ b/ragas_annotator/metric/discrete.py @@ -0,0 +1,56 @@ +"""Base class from which all discrete metrics should inherit.""" + +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/metric/discrete.ipynb. + +# %% auto 0 +__all__ = ['discrete_metric', 'DiscreteMetric'] + +# %% ../../nbs/metric/discrete.ipynb 2 +import typing as t +from dataclasses import dataclass, field +from pydantic import BaseModel, create_model +from collections import Counter +from . 
import Metric, MetricResult +from .decorator import create_metric_decorator + + +@dataclass +class DiscreteMetric(Metric): + values: t.List[str] = field(default_factory=lambda: ["pass", "fail"]) + + def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]: + """Get or create a response model based on reasoning parameter.""" + + if with_reasoning in self._response_models: + return self._response_models[with_reasoning] + + model_name = 'response_model' + values = tuple(self.values) + fields = {"result": (t.Literal[values], ...)} + + if with_reasoning: + fields["reason"] = (str, ...) # type: ignore + + model = create_model(model_name, **fields) # type: ignore + self._response_models[with_reasoning] = model + return model + + def _ensemble(self,results:t.List[MetricResult]) -> MetricResult: + + + if len(results)==1: + return results[0] + + candidates = [candidate.result for candidate in results] + counter = Counter(candidates) + max_count = max(counter.values()) + for candidate in results: + if counter[candidate.result] == max_count: + result = candidate.result + reason = candidate.reason + return MetricResult(result=result, reason=reason) + + return results[0] + + +discrete_metric = create_metric_decorator(DiscreteMetric) diff --git a/ragas_annotator/metric/llm.py b/ragas_annotator/metric/llm.py new file mode 100644 index 0000000..c602e53 --- /dev/null +++ b/ragas_annotator/metric/llm.py @@ -0,0 +1,35 @@ +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/metric/llm.ipynb. + +# %% auto 0 +__all__ = ['LLM'] + +# %% ../../nbs/metric/llm.ipynb 1 +import openai +import instructor +from dataclasses import dataclass + +@dataclass +class LLM: + + def __post_init__(self): + self.aclient = instructor.from_openai(openai.AsyncOpenAI()) + self.client = instructor.from_openai(openai.OpenAI()) + + + def generate(self,prompt,response_model): + return self.client.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "user", "content": prompt}, + ], + response_model=response_model, + ) + + async def agenerate(self,prompt,response_model): + return await self.aclient.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "user", "content": prompt}, + ], + response_model=response_model, + ) diff --git a/ragas_annotator/metric/numeric.py b/ragas_annotator/metric/numeric.py new file mode 100644 index 0000000..bc39a1f --- /dev/null +++ b/ragas_annotator/metric/numeric.py @@ -0,0 +1,48 @@ +"""Base class for all numeric metrics""" + +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/metric/numeric.ipynb. + +# %% auto 0 +__all__ = ['numeric_metric', 'NumericMetric'] + +# %% ../../nbs/metric/numeric.ipynb 2 +import typing as t +from dataclasses import dataclass, field +from pydantic import BaseModel, create_model +from . import Metric, MetricResult +from .decorator import create_metric_decorator + +@dataclass +class NumericMetric(Metric): + range: t.Tuple[float,float] + + def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]: + """Get or create a response model based on reasoning parameter.""" + + if with_reasoning in self._response_models: + return self._response_models[with_reasoning] + + model_name = 'response_model' + fields = {"result": (float,...)} + + if with_reasoning: + fields["reason"] = (str, ...) 
#type: ignore + + model = create_model(model_name, **fields) + self._response_models[with_reasoning] = model + return model + + def _ensemble(self,results:t.List[MetricResult]) -> MetricResult: + + if len(results)==1: + return results[0] + + candidates = [candidate.result for candidate in results] + result = sum(candidates)/len(candidates) + reason = results[0].reason + + return MetricResult(result=result,reason=reason) + + +numeric_metric = create_metric_decorator(NumericMetric) + diff --git a/ragas_annotator/metric/ranking.py b/ragas_annotator/metric/ranking.py new file mode 100644 index 0000000..fb883d5 --- /dev/null +++ b/ragas_annotator/metric/ranking.py @@ -0,0 +1,82 @@ +"""Base class for ranking metrics""" + +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/metric/ranking.ipynb. + +# %% auto 0 +__all__ = ['ranking_metric', 'RankingMetric'] + +# %% ../../nbs/metric/ranking.ipynb 2 +import typing as t +from dataclasses import dataclass +from pydantic import BaseModel, Field +from . import Metric, MetricResult +from .decorator import create_metric_decorator + +@dataclass +class RankingMetric(Metric): + num_ranks: int + + def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]: + """Get or create a response model based on reasoning parameter.""" + + if with_reasoning in self._response_models: + return self._response_models[with_reasoning] + + # Store values needed for validation + num_ranks = self.num_ranks + + # Create explicit model classes instead of using create_model + if with_reasoning: + # Model with result and reason + class ResponseModelWithReason(BaseModel): + result: t.List[int] = Field(...) + reason: str = Field(...) + + def model_post_init(self, __context): + expected = set(range(num_ranks)) + if set(self.result) != expected: + raise ValueError( + f"'result' must contain exactly the numbers {sorted(expected)} without repetition." + ) + + self._response_models[with_reasoning] = ResponseModelWithReason + return ResponseModelWithReason + else: + # Model with just result + class ResponseModel(BaseModel): + result: t.List[int] = Field(...) + + def model_post_init(self, __context): + expected = set(range(num_ranks)) + if set(self.result) != expected: + raise ValueError( + f"'result' must contain exactly the numbers {sorted(expected)} without repetition." + ) + + self._response_models[with_reasoning] = ResponseModel + return ResponseModel + + def _ensemble(self, results: t.List[MetricResult]) -> MetricResult: + if len(results) == 1: + return results[0] + + n_items = self.num_ranks # Use the class attribute instead of len(results) + borda_scores = [0] * n_items + + for result in results: + for position_idx, item_idx in enumerate(result.result): + borda_scores[item_idx] += (n_items - position_idx) # Fixed the formula + + indexed_scores = [(score, i) for i, score in enumerate(borda_scores)] + indexed_scores.sort(key=lambda x: (-x[0], x[1])) + final_ranking = [pos for _, pos in indexed_scores] + + if any(r.reason for r in results): + reason = "Ensemble ranking based on multiple evaluations.\n" + '\n'.join([r.reason for r in results if r.reason]) + else: + reason = None + + return MetricResult(result=final_ranking, reason=reason) + + +ranking_metric = create_metric_decorator(RankingMetric) diff --git a/ragas_annotator/metric/result.py b/ragas_annotator/metric/result.py new file mode 100644 index 0000000..c4636c7 --- /dev/null +++ b/ragas_annotator/metric/result.py @@ -0,0 +1,173 @@ +"""MetricResult object to store the result of a metric""" + +# AUTOGENERATED! 
diff --git a/ragas_annotator/metric/result.py b/ragas_annotator/metric/result.py
new file mode 100644
index 0000000..c4636c7
--- /dev/null
+++ b/ragas_annotator/metric/result.py
@@ -0,0 +1,173 @@
+"""MetricResult object to store the result of a metric"""
+
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/metric/result.ipynb.
+
+# %% auto 0
+__all__ = ['MetricResult']
+
+# %% ../../nbs/metric/result.ipynb 2
+import typing as t
+
+
+
+
+
+class MetricResult:
+    """Class to hold the result of a metric evaluation.
+
+    This class behaves like its underlying result value but still provides access
+    to additional metadata like reasoning.
+
+    Works with:
+    - DiscreteMetrics (string results)
+    - NumericMetrics (float/int results)
+    - RankingMetrics (list results)
+    """
+
+    def __init__(self, result: t.Any, reason: t.Optional[str] = None):
+        self._result = result
+        self.reason = reason
+
+    def __repr__(self):
+        return repr(self._result)
+
+    # Access to underlying result
+    @property
+    def result(self):
+        """Get the raw result value."""
+        return self._result
+
+
+    # String conversion - works for all types
+    def __str__(self):
+        return str(self._result)
+
+    # Container-like behaviors for list results (RankingMetric)
+    def __getitem__(self, key):
+        if not hasattr(self._result, "__getitem__"):
+            raise TypeError(f"{type(self._result).__name__} object is not subscriptable")
+        return self._result[key]
+
+    def __iter__(self):
+        if not hasattr(self._result, "__iter__"):
+            raise TypeError(f"{type(self._result).__name__} object is not iterable")
+        return iter(self._result)
+
+    def __len__(self):
+        if not hasattr(self._result, "__len__"):
+            raise TypeError(f"{type(self._result).__name__} has no len()")
+        return len(self._result)
+
+    # Numeric operations for numeric results (NumericMetric)
+    def __float__(self):
+        if isinstance(self._result, (int, float)):
+            return float(self._result)
+        raise TypeError(f"Cannot convert {type(self._result).__name__} to float")
+
+    def __int__(self):
+        if isinstance(self._result, (int, float)):
+            return int(self._result)
+        raise TypeError(f"Cannot convert {type(self._result).__name__} to int")
+
+    def __add__(self, other):
+        if not isinstance(self._result, (int, float)):
+            raise TypeError(f"Cannot add {type(self._result).__name__} objects")
+        if isinstance(other, MetricResult):
+            return self._result + other._result
+        return self._result + other
+
+    def __radd__(self, other):
+        if not isinstance(self._result, (int, float)):
+            raise TypeError(f"Cannot add {type(self._result).__name__} objects")
+        return other + self._result
+
+    def __sub__(self, other):
+        if not isinstance(self._result, (int, float)):
+            raise TypeError(f"Cannot subtract {type(self._result).__name__} objects")
+        if isinstance(other, MetricResult):
+            return self._result - other._result
+        return self._result - other
+
+    def __rsub__(self, other):
+        if not isinstance(self._result, (int, float)):
+            raise TypeError(f"Cannot subtract {type(self._result).__name__} objects")
+        return other - self._result
+
+    def __mul__(self, other):
+        if not isinstance(self._result, (int, float)):
+            raise TypeError(f"Cannot multiply {type(self._result).__name__} objects")
+        if isinstance(other, MetricResult):
+            return self._result * other._result
+        return self._result * other
+
+    def __rmul__(self, other):
+        if not isinstance(self._result, (int, float)):
+            raise TypeError(f"Cannot multiply {type(self._result).__name__} objects")
+        return other * self._result
+
+    def __truediv__(self, other):
+        if not isinstance(self._result, (int, float)):
+            raise TypeError(f"Cannot divide {type(self._result).__name__} objects")
+        if isinstance(other, MetricResult):
+            return self._result / other._result
+        return self._result / other
+
+    def __rtruediv__(self, other):
+        if not isinstance(self._result, (int, float)):
+            raise TypeError(f"Cannot divide {type(self._result).__name__} objects")
+        return other / self._result
+
+    # Comparison operations - work for all types with same-type comparisons
+    def __eq__(self, other):
+        if isinstance(other, MetricResult):
+            return self._result == other._result
+        return self._result == other
+
+    def __lt__(self, other):
+        if isinstance(other, MetricResult):
+            return self._result < other._result
+        return self._result < other
+
+    def __le__(self, other):
+        if isinstance(other, MetricResult):
+            return self._result <= other._result
+        return self._result <= other
+
+    def __gt__(self, other):
+        if isinstance(other, MetricResult):
+            return self._result > other._result
+        return self._result > other
+
+    def __ge__(self, other):
+        if isinstance(other, MetricResult):
+            return self._result >= other._result
+        return self._result >= other
+
+    # Method forwarding for type-specific behaviors
+    def __getattr__(self, name):
+        """Forward attribute access to the result object if it has that attribute.
+
+        This allows calling string methods on discrete results,
+        numeric methods on numeric results, and list methods on ranking results.
+        """
+        if hasattr(self._result, name):
+            attr = getattr(self._result, name)
+            if callable(attr):
+                # If it's a method, wrap it to return MetricResult when appropriate
+                def wrapper(*args, **kwargs):
+                    result = attr(*args, **kwargs)
+                    # If the result is of the same type as self._result, wrap it
+                    if isinstance(result, type(self._result)):
+                        return MetricResult(result=result, reason=self.reason)
+                    return result
+                return wrapper
+            return attr
+        raise AttributeError(f"{type(self).__name__} has no attribute '{name}'")
+
+    # JSON/dict serialization
+    def to_dict(self):
+        """Convert the result to a dictionary."""
+        return {
+            "result": self._result,
+            "reason": self.reason
+        }
diff --git a/settings.ini b/settings.ini
index 07ddb41..0215d37 100644
--- a/settings.ini
+++ b/settings.ini
@@ -38,7 +38,7 @@ status = 3
 user = explodinggradients
 
 ### Dependencies ###
-requirements = notion-client fastcore tqdm langfuse
+requirements = notion-client fastcore tqdm langfuse openai instructor pydantic
 dev_requirements = pytest
 # console_scripts = 
 # conda_user = 
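Taken together, `result.py` makes a `MetricResult` behave like the value it wraps while keeping the `reason` metadata attached. A small behavioural sketch, assuming the module is importable as `ragas_annotator.metric.result` once this patch is applied:

```python
# Quick behavioural sketch of MetricResult's proxying (not part of the patch).
from ragas_annotator.metric.result import MetricResult  # path assumed from this diff

num = MetricResult(result=0.8, reason="high overlap")
text = MetricResult(result="pass", reason="meets rubric")
rank = MetricResult(result=[1, 0, 2])

print(num * 2)             # 1.6 -- arithmetic delegates to the underlying float
print(text.upper())        # PASS -- unknown attributes are forwarded and re-wrapped
print(rank[0], len(rank))  # 1 3 -- list results are subscriptable and sized
print(num.to_dict())       # {'result': 0.8, 'reason': 'high overlap'}
```

The `settings.ini` change at the end simply adds `openai`, `instructor`, and `pydantic` to the package requirements, matching the imports introduced by the new metric modules.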