From b0f437fc95286a8e02579fdc45173db634f51419 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 20 Mar 2025 18:13:36 -0700 Subject: [PATCH 01/17] notebook for llm as judge --- nbs/metric/base.ipynb | 1099 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1099 insertions(+) create mode 100644 nbs/metric/base.ipynb diff --git a/nbs/metric/base.ipynb b/nbs/metric/base.ipynb new file mode 100644 index 0000000..e76d171 --- /dev/null +++ b/nbs/metric/base.ipynb @@ -0,0 +1,1099 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c48aac0f-c63c-4bfb-95b0-a1239f41ccb3", + "metadata": {}, + "source": [ + "# Base\n", + "> baseclass for metric" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d903a59c-3ed8-4b7d-bb9d-39180df72950", + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp metric.base" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5d14fc66-b8af-4a75-b761-20b8e9ce19f8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| hide\n", + "from dotenv import load_dotenv\n", + "load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b05c525-e153-49ab-b768-5069f624f215", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "import typing as t\n", + "from typing import Any, Callable, Dict, List, Optional, Union\n", + "from abc import ABC, abstractmethod\n", + "import asyncio\n", + "from dataclasses import dataclass\n", + "from pydantic import BaseModel\n", + "import openai\n", + "import instructor\n", + "from dataclasses import dataclass, field\n", + "from pydantic import BaseModel, create_model\n", + "import typing as t\n", + "import inspect" + ] + }, + { + "cell_type": "markdown", + "id": "b5140197-08a5-46de-b9bd-818e9be8c951", + "metadata": {}, + "source": [ + "### MetricResult\n", + "> Class to hold the result metric" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "56dbaeb3-e105-4daf-a5b2-a91cb1fb976f", + "metadata": {}, + "outputs": [], + "source": [ + "import typing as t\n", + "from typing import Any, Callable, Dict, List, Optional, Union\n", + "\n", + "class MetricResult:\n", + " \"\"\"Class to hold the result of a metric evaluation.\n", + " \n", + " This class behaves like its underlying result value but still provides access\n", + " to additional metadata like reasoning.\n", + " \n", + " Works with:\n", + " - DiscreteMetrics (string results)\n", + " - NumericMetrics (float/int results)\n", + " - RankingMetrics (list results)\n", + " \"\"\"\n", + " \n", + " def __init__(self, result: Any, reason: t.Optional[str] = None):\n", + " self._result = result\n", + " self.reason = reason\n", + " \n", + " def __repr__(self):\n", + " return repr(self._result)\n", + " \n", + " # Access to underlying result\n", + " @property\n", + " def result(self):\n", + " \"\"\"Get the raw result value.\"\"\"\n", + " return self._result\n", + " \n", + " \n", + " # String conversion - works for all types\n", + " def __str__(self):\n", + " return str(self._result)\n", + " \n", + " # Container-like behaviors for list results (RankingMetric)\n", + " def __getitem__(self, key):\n", + " if not hasattr(self._result, \"__getitem__\"):\n", + " raise TypeError(f\"{type(self._result).__name__} object is not subscriptable\")\n", + " return self._result[key]\n", + " \n", + " def __iter__(self):\n", + " if 
not hasattr(self._result, \"__iter__\"):\n", + " raise TypeError(f\"{type(self._result).__name__} object is not iterable\")\n", + " return iter(self._result)\n", + " \n", + " def __len__(self):\n", + " if not hasattr(self._result, \"__len__\"):\n", + " raise TypeError(f\"{type(self._result).__name__} has no len()\")\n", + " return len(self._result)\n", + " \n", + " # Numeric operations for numeric results (NumericMetric)\n", + " def __float__(self):\n", + " if isinstance(self._result, (int, float)):\n", + " return float(self._result)\n", + " raise TypeError(f\"Cannot convert {type(self._result).__name__} to float\")\n", + " \n", + " def __int__(self):\n", + " if isinstance(self._result, (int, float)):\n", + " return int(self._result)\n", + " raise TypeError(f\"Cannot convert {type(self._result).__name__} to int\")\n", + " \n", + " def __add__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot add {type(self._result).__name__} objects\")\n", + " if isinstance(other, MetricResult):\n", + " return self._result + other._result\n", + " return self._result + other\n", + " \n", + " def __radd__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot add {type(self._result).__name__} objects\")\n", + " return other + self._result\n", + " \n", + " def __sub__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot subtract {type(self._result).__name__} objects\")\n", + " if isinstance(other, MetricResult):\n", + " return self._result - other._result\n", + " return self._result - other\n", + " \n", + " def __rsub__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot subtract {type(self._result).__name__} objects\")\n", + " return other - self._result\n", + " \n", + " def __mul__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot multiply {type(self._result).__name__} objects\")\n", + " if isinstance(other, MetricResult):\n", + " return self._result * other._result\n", + " return self._result * other\n", + " \n", + " def __rmul__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot multiply {type(self._result).__name__} objects\")\n", + " return other * self._result\n", + " \n", + " def __truediv__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot divide {type(self._result).__name__} objects\")\n", + " if isinstance(other, MetricResult):\n", + " return self._result / other._result\n", + " return self._result / other\n", + " \n", + " def __rtruediv__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot divide {type(self._result).__name__} objects\")\n", + " return other / self._result\n", + " \n", + " # Comparison operations - work for all types with same-type comparisons\n", + " def __eq__(self, other):\n", + " if isinstance(other, MetricResult):\n", + " return self._result == other._result\n", + " return self._result == other\n", + " \n", + " def __lt__(self, other):\n", + " if isinstance(other, MetricResult):\n", + " return self._result < other._result\n", + " return self._result < other\n", + " \n", + " def __le__(self, other):\n", + " if isinstance(other, MetricResult):\n", + " return self._result <= other._result\n", + " return self._result <= other\n", + " \n", + " def __gt__(self, other):\n", + " if 
isinstance(other, MetricResult):\n", + " return self._result > other._result\n", + " return self._result > other\n", + " \n", + " def __ge__(self, other):\n", + " if isinstance(other, MetricResult):\n", + " return self._result >= other._result\n", + " return self._result >= other\n", + " \n", + " # Method forwarding for type-specific behaviors\n", + " def __getattr__(self, name):\n", + " \"\"\"Forward attribute access to the result object if it has that attribute.\n", + " \n", + " This allows calling string methods on discrete results, \n", + " numeric methods on numeric results, and list methods on ranking results.\n", + " \"\"\"\n", + " if hasattr(self._result, name):\n", + " attr = getattr(self._result, name)\n", + " if callable(attr):\n", + " # If it's a method, wrap it to return MetricResult when appropriate\n", + " def wrapper(*args, **kwargs):\n", + " result = attr(*args, **kwargs)\n", + " # If the result is of the same type as self._result, wrap it\n", + " if isinstance(result, type(self._result)):\n", + " return MetricResult(result=result, reason=self.reason)\n", + " return result\n", + " return wrapper\n", + " return attr\n", + " raise AttributeError(f\"{type(self).__name__} has no attribute '{name}'\")\n", + " \n", + " # JSON/dict serialization\n", + " def to_dict(self):\n", + " \"\"\"Convert the result to a dictionary.\"\"\"\n", + " return {\n", + " \"result\": self._result,\n", + " \"reason\": self.reason\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "89555bee-23a3-4129-86a5-0a2ffeed00c7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'low'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = MetricResult(result='low',reason=\"my reason\")\n", + "result\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f21b7a50-3142-40e9-9612-0603f5b8654d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'my reason'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result.reason" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be7f588f-2a13-4c10-9775-00101a03e0a5", + "metadata": {}, + "outputs": [], + "source": [ + "@dataclass\n", + "class LLM:\n", + "\n", + " def __post_init__(self):\n", + " self.aclient = instructor.from_openai(openai.AsyncOpenAI())\n", + " self.client = instructor.from_openai(openai.OpenAI())\n", + "\n", + " \n", + " def generate(self,prompt,response_model):\n", + " return self.client.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": prompt},\n", + " ],\n", + " response_model=response_model,\n", + " )\n", + "\n", + " async def agenerate(self,prompt,response_model):\n", + " return await self.aclient.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": prompt},\n", + " ],\n", + " response_model=response_model,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "1bec7946-c61c-4631-9880-fff575974e39", + "metadata": {}, + "source": [ + "### Metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c6fecbe-4fdd-464e-961e-48a528bc3278", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c5e19478-d947-405b-a229-4a1e7daa2fd3", + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 
'dataclass' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[12], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m#| export\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;129m@dataclass\u001b[39m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mMetric\u001b[39;00m(ABC):\n\u001b[1;32m 5\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Base class for all metrics in the LLM evaluation library.\"\"\"\u001b[39;00m\n\u001b[1;32m 6\u001b[0m name:\u001b[38;5;28mstr\u001b[39m\n", + "\u001b[0;31mNameError\u001b[0m: name 'dataclass' is not defined" + ] + } + ], + "source": [ + "#| export\n", + "\n", + "@dataclass\n", + "class Metric(ABC):\n", + " \"\"\"Base class for all metrics in the LLM evaluation library.\"\"\"\n", + " name:str\n", + " prompt:str\n", + " llm:LLM\n", + " _response_models: Dict[bool, Type[BaseModel]] = field(\n", + " default_factory=dict, init=False, repr=False\n", + " )\n", + " \n", + " @abstractmethod\n", + " def _get_response_model(self, with_reasoning: bool) -> Type[BaseModel]:\n", + " \"\"\"Get the appropriate response model.\"\"\"\n", + " pass\n", + " \n", + " def score(self, reasoning=True, n=1, **kwargs) -> Any:\n", + " \n", + " prompt_input = self.prompt.format(**kwargs)\n", + " response = self.llm.generate(prompt_input, response_model = self._get_response_model(reasoning))\n", + " return MetricResult(**response.model_dump())\n", + "\n", + " async def ascore(self, reasoning=True, n=1, **kwargs):\n", + " prompt_input = self.prompt.format(**kwargs)\n", + " response = await self.llm.agenerate(prompt_input, response_model = self._get_response_model(reasoning))\n", + " return MetricResult(**response.model_dump())\n", + " \n", + " def batch_score(self, inputs: List[Dict[str, Any]], reasoning:bool=True, n:int=1) -> List[Any]:\n", + " \n", + " return [self.score(**input_dict) for input_dict in inputs]\n", + " \n", + " async def abatch_score(self, inputs: List[Dict[str, Any]], reasoning: bool = True, n: int = 1) -> List[MetricResult]:\n", + " \n", + " async_tasks = []\n", + " for input_dict in inputs:\n", + " # Add reasoning and n to the input parameters\n", + " async_tasks.append(self.ascore(reasoning=reasoning, n=n, **input_dict))\n", + " \n", + " # Run all tasks concurrently and return results\n", + " return await asyncio.gather(*async_tasks)\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "4af79bb4-4c3e-4004-b7f3-ec36e50b4ca5", + "metadata": {}, + "source": [ + "### DiscreteMetric \n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f09683e0-5ec3-4e60-a8c4-1657e2fe60b9", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "\n", + "@dataclass\n", + "class DiscreteMetrics(Metric):\n", + " values: t.List[str] = field(default_factory=lambda: [\"pass\", \"fail\"])\n", + " _response_models: t.Dict[bool, t.Type[BaseModel]] = field(default_factory=dict, init=False, repr=False)\n", + " \n", + " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", + " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", + " \n", + " if with_reasoning in self._response_models:\n", + " return self._response_models[with_reasoning]\n", + " \n", + " model_name = 'response_model'\n", + " fields = {\"score\": 
(t.Literal[tuple(self.values)], ...)}\n", + " \n", + " if with_reasoning:\n", + " fields[\"reason\"] = (str, ...)\n", + " \n", + " model = create_model(model_name, **fields)\n", + " self._response_models[with_reasoning] = model\n", + " return model \n", + "\n", + "\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "f33ef89f-4ccc-4307-9944-4f372ce77830", + "metadata": {}, + "source": [ + "### decorator factory for discrete_metric" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "99d5afd6-72bd-42d7-bff0-effce9cf8cd9", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "\n", + "\n", + "def discrete_metric(llm, prompt, values:t.List[str],name:t.Optional[str]=None):\n", + "\n", + " def decorator(metric):\n", + " metric_name = name or metric.__name__ \n", + " is_async = inspect.iscoroutinefunction(metric)\n", + "\n", + " @dataclass\n", + " class CustomDiscreteMetric(DiscreteMetrics):\n", + "\n", + " def score(self,reasoning:bool=True, n:int=1,**kwargs):\n", + "\n", + " if is_async:\n", + " # For async functions, we need to run them in an event loop\n", + " import asyncio\n", + " \n", + " # Get or create an event loop\n", + " try:\n", + " loop = asyncio.get_event_loop()\n", + " except RuntimeError:\n", + " loop = asyncio.new_event_loop()\n", + " asyncio.set_event_loop(loop)\n", + " \n", + " # Run the async function and get the result\n", + " result = loop.run_until_complete(metric(self.llm,self.prompt,**kwargs))\n", + " else:\n", + " # For sync functions, just call directly\n", + " result = metric(self.llm,self.prompt,**kwargs)\n", + " \n", + " if isinstance(result, tuple) and len(result) == 2:\n", + " score, reason = result\n", + " else:\n", + " score = result\n", + " reason = None\n", + " \n", + " return MetricResult(score=score, reason=reason if reasoning else None)\n", + " \n", + " async def ascore(self, reasoning=True, n=1, **kwargs):\n", + " if is_async:\n", + " # For async functions, await them directly\n", + " result = await metric(self.llm,self.prompt,**kwargs)\n", + " else:\n", + " # For sync functions, run them normally\n", + " result = metric(self.llm,self.prompt,**kwargs)\n", + "\n", + " if isinstance(result, tuple) and len(result) == 2:\n", + " score, reason = result\n", + " else:\n", + " score = result\n", + " reason = None\n", + " \n", + " metric_instance = CustomDiscreteMetric(\n", + " name=metric_name,\n", + " prompt=prompt,\n", + " llm=llm,\n", + " values=values,\n", + " \n", + " )\n", + " metric_instance.__name__ = name\n", + " metric_instance.__doc__ = metric.__doc__\n", + " \n", + " return metric_instance\n", + " \n", + " return decorator\n", + " \n", + " \n", + "\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "07f49ad3-4476-4bcc-ac34-a87fb7a8652a", + "metadata": {}, + "source": [ + "### Usage pattern" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "aeae8fe5-e81a-44ac-9ad7-a240655a0f06", + "metadata": {}, + "outputs": [], + "source": [ + "my_metric = DiscreteMetrics(\n", + " name='helpfulness',\n", + " llm=LLM(),\n", + " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", + " values=[\"low\",\"med\",\"high\"],\n", + ")\n", + "\n", + "result = my_metric.score(response=\"this is my response\")\n", + "result #gives \"low\"\n", + "result.reason #gives reasoning from llm\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "b5e499d8-8258-46ce-b719-0389d3cfd8db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MetricResult(score=low, 
reason=None)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## score without reasoning to save reasoning tokens cost\n", + "result = my_metric.score(response=\"this is my response\",reasoning=False)\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "a9a5d6c6-4cfc-4f45-8b19-996315a95370", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'reason'" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@discrete_metric(llm=LLM(),\n", + " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", + " name='new_metric',values=[\"low\",\"med\",\"high\"])\n", + "def my_metric(llm,prompt,**kwargs):\n", + "\n", + " class response_model(BaseModel):\n", + " output: t.List[bool]\n", + " reason: str\n", + " \n", + " response = llm.generate(prompt.format(**kwargs),response_model=response_model)\n", + " total = sum(response.output)\n", + " if total < 1:\n", + " score = 'low'\n", + " else:\n", + " score = 'high'\n", + " return score,\"reason\"\n", + "\n", + "result = my_metric.score(response='my response') # result\n", + "result.score\n", + "result.reason" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "2b4809ca-d921-4084-bb9e-fe3a72f438a1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MetricResult(score=high, reason=reason)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result" + ] + }, + { + "cell_type": "markdown", + "id": "05f60c70-fc32-41f8-aa7c-c8685d77398a", + "metadata": {}, + "source": [ + "## Numeric Metric" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "6a1c66fb-3c1c-4bc6-9996-0b5beb304b9c", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "\n", + "@dataclass\n", + "class NumericMetrics(Metric):\n", + " range: t.Tuple[float,float]\n", + " _response_models: t.Dict[bool, t.Type[BaseModel]] = field(default_factory=dict, init=False, repr=False)\n", + " \n", + " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", + " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", + " \n", + " if with_reasoning in self._response_models:\n", + " return self._response_models[with_reasoning]\n", + " \n", + " model_name = 'response_model'\n", + " fields = {\"score\": (float,...)}\n", + " \n", + " if with_reasoning:\n", + " fields[\"reason\"] = (str, ...)\n", + " \n", + " model = create_model(model_name, **fields)\n", + " self._response_models[with_reasoning] = model\n", + " return model \n", + " \n", + " def score(self, reasoning=True, n=1, **kwargs) -> Any:\n", + " \n", + " prompt_input = self.prompt.format(**kwargs)\n", + " response = self.llm.generate(prompt_input, response_model = self._get_response_model(reasoning))\n", + " return MetricResult(**response.model_dump())\n", + "\n", + " async def ascore(self, reasoning=True, n=1, **kwargs):\n", + " prompt_input = self.prompt.format(**kwargs)\n", + " response = await self.llm.agenerate(prompt_input, response_model = self._get_response_model(reasoning))\n", + " return MetricResult(**response.model_dump())\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "251cdea8-fc71-46bd-8a00-fb8e33e10350", + "metadata": {}, + "outputs": [], + "source": [ + "my_metric = NumericMetrics(\n", + " name='helpfulness',\n", + " llm=LLM(),\n", + " prompt=\"Evaluate if 
given answer is helpful\\n\\n{response}\",\n", + " range=(0,10),\n", + ")\n", + "\n", + "result = my_metric.score(response=\"this is my response\")\n", + "result #gives \"low\"\n", + "result.reason #gives reasoning from llm\n", + "\n", + "result = my_metric.batch_score(inputs=[{\"response\":\"this is my response\"}])\n" + ] + }, + { + "cell_type": "markdown", + "id": "c96520ae-294b-4868-8b2f-22a30ebd5f25", + "metadata": {}, + "source": [ + "### decorator factory" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "265af384-ed35-4262-acfe-6847b22d3089", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "\n", + "\n", + "def numeric_metric(llm, prompt, range: t.Tuple[float,float],name:t.Optional[str]=None):\n", + "\n", + " def decorator(metric):\n", + " metric_name = name or metric.__name__ \n", + " is_async = inspect.iscoroutinefunction(metric)\n", + "\n", + " @dataclass\n", + " class CustomNumericMetric(NumericMetrics):\n", + "\n", + " def score(self,reasoning:bool=True, n:int=1,**kwargs):\n", + "\n", + " if is_async:\n", + " # For async functions, we need to run them in an event loop\n", + " import asyncio\n", + " \n", + " # Get or create an event loop\n", + " try:\n", + " loop = asyncio.get_event_loop()\n", + " except RuntimeError:\n", + " loop = asyncio.new_event_loop()\n", + " asyncio.set_event_loop(loop)\n", + " \n", + " # Run the async function and get the result\n", + " result = loop.run_until_complete(metric(self.llm,self.prompt,**kwargs))\n", + " else:\n", + " # For sync functions, just call directly\n", + " result = metric(self.llm,self.prompt,**kwargs)\n", + " \n", + " if isinstance(result, tuple) and len(result) == 2:\n", + " score, reason = result\n", + " else:\n", + " score = result\n", + " reason = None\n", + " \n", + " return MetricResult(score=score, reason=reason if reasoning else None)\n", + " \n", + " async def ascore(self, reasoning=True, n=1, **kwargs):\n", + " if is_async:\n", + " # For async functions, await them directly\n", + " result = await metric(self.llm,self.prompt,**kwargs)\n", + " else:\n", + " # For sync functions, run them normally\n", + " result = metric(self.llm,self.prompt,**kwargs)\n", + "\n", + " if isinstance(result, tuple) and len(result) == 2:\n", + " score, reason = result\n", + " else:\n", + " score = result\n", + " reason = None\n", + " \n", + " metric_instance = CustomNumericMetric(\n", + " name=metric_name,\n", + " prompt=prompt,\n", + " llm=llm,\n", + " range=range,\n", + " \n", + " )\n", + " metric_instance.__name__ = name\n", + " metric_instance.__doc__ = metric.__doc__\n", + " \n", + " return metric_instance\n", + " \n", + " return decorator\n", + " \n", + " \n", + "\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "009c1944-bda7-41b1-9235-dcda5acbed55", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'reason'" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@numeric_metric(llm=LLM(),\n", + " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", + " name='new_metric',range=(0,10))\n", + "def my_metric(llm,prompt,**kwargs):\n", + "\n", + " class response_model(BaseModel):\n", + " output: int\n", + " reason: str\n", + " \n", + " response = llm.generate(prompt.format(**kwargs),response_model=response_model)\n", + " total = response.output\n", + " if total < 1:\n", + " score = 0\n", + " else:\n", + " score = 10\n", + " return score,\"reason\"\n", + "\n", + "result = 
my_metric.score(response='my response') # result\n", + "result # 10\n", + "result.reason # the reason for the answer\n", + "\n", + "result1 = my_metric.score(response='my response 1') # result\n", + "result2 = my_metric.score(response='my response 2') # result\n", + "\n", + "result1 + result2 # should be addable and behave like a float\n" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "ff32e972-0900-4ff1-94db-1378302f8d97", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result.score\n" + ] + }, + { + "cell_type": "markdown", + "id": "90794704-5e45-4dd5-8862-b4fb9694a5b5", + "metadata": {}, + "source": [ + "### Ranking metric" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "0b1bbf8f-c7fa-4004-9165-fa388f7ba15d", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "\n", + "@dataclass\n", + "class RankingMetrics(Metric):\n", + " num_ranks: int\n", + " _response_models: t.Dict[bool, t.Type[BaseModel]] = field(default_factory=dict, init=False, repr=False)\n", + " \n", + " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", + " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", + " \n", + " if with_reasoning in self._response_models:\n", + " return self._response_models[with_reasoning]\n", + " \n", + " model_name = 'response_model'\n", + " fields = {\"score\": (t.List[int],...)}\n", + " \n", + " if with_reasoning:\n", + " fields[\"reason\"] = (str, ...)\n", + " \n", + " model = create_model(model_name, **fields)\n", + " self._response_models[with_reasoning] = model\n", + " return model \n", + " \n", + " def score(self, reasoning=True, n=1, **kwargs) -> Any:\n", + " \n", + " prompt_input = self.prompt.format(**kwargs)\n", + " response = self.llm.generate(prompt_input, response_model = self._get_response_model(reasoning))\n", + " return MetricResult(**response.model_dump())\n", + "\n", + " async def ascore(self, reasoning=True, n=1, **kwargs):\n", + " prompt_input = self.prompt.format(**kwargs)\n", + " response = await self.llm.agenerate(prompt_input, response_model = self._get_response_model(reasoning))\n", + " return MetricResult(**response.model_dump())\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "716881a1-0a93-46b3-b41b-aee0f987a1a6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1, 2, 3]\n", + "The responses are ranked based on their length and detail, with the longest and most detailed answer receiving the highest rank.\n" + ] + } + ], + "source": [ + "# User instantiates a ranking metric by providing a name, an LLM, a prompt template, and the number of rankings desired.\n", + "my_ranking_metric = RankingMetrics(\n", + " name='response_ranking',\n", + " llm=LLM(), # Your language model instance\n", + " prompt=\"Rank the following responses:\\n{candidates}\",\n", + " num_ranks=3\n", + ")\n", + "\n", + "# To score a single input (ranking candidate responses)\n", + "result = my_ranking_metric.score(candidates=[\n", + " \"short answer.\",\n", + " \"a bit more detailed.\",\n", + " \"the longest and most detailed answer.\"\n", + "])\n", + "print(result.score) # Might output something like: [1, 0, 2]\n", + "print(result.reason) # Provides the reasoning behind the ranking\n" + ] + }, + { + "cell_type": "markdown", + "id": "5b53bd5e-06c9-4430-9c06-f2225ddd7bd5", + 
"metadata": {}, + "source": [ + "### decorator factory for ranking metric" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "4c4e9170-67b9-4841-9df2-6afc490b89dd", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "\n", + "def ranking_metric(llm, prompt, num_ranks: int, name: t.Optional[str] = None):\n", + " def decorator(metric):\n", + " metric_name = name or metric.__name__\n", + " is_async = inspect.iscoroutinefunction(metric)\n", + "\n", + " @dataclass\n", + " class CustomRankingMetric(RankingMetrics):\n", + " # Inherits: name, prompt, llm, num_ranks from RankingMetrics.\n", + " # No extra fields are needed.\n", + " def score(self, reasoning: bool = True, n: int = 1, **kwargs):\n", + " if is_async:\n", + " # For async functions, run in an event loop.\n", + " import asyncio\n", + " try:\n", + " loop = asyncio.get_event_loop()\n", + " except RuntimeError:\n", + " loop = asyncio.new_event_loop()\n", + " asyncio.set_event_loop(loop)\n", + " result = loop.run_until_complete(metric(self.llm, self.prompt, **kwargs))\n", + " else:\n", + " result = metric(self.llm, self.prompt, **kwargs)\n", + "\n", + " if isinstance(result, tuple) and len(result) == 2:\n", + " ranking, reason = result\n", + " else:\n", + " ranking = result\n", + " reason = None\n", + "\n", + " return MetricResult(score=ranking, reason=reason if reasoning else None)\n", + "\n", + " async def ascore(self, reasoning: bool = True, n: int = 1, **kwargs):\n", + " if is_async:\n", + " result = await metric(self.llm, self.prompt, **kwargs)\n", + " else:\n", + " result = metric(self.llm, self.prompt, **kwargs)\n", + "\n", + " if isinstance(result, tuple) and len(result) == 2:\n", + " ranking, reason = result\n", + " else:\n", + " ranking = result\n", + " reason = None\n", + "\n", + " return MetricResult(score=ranking, reason=reason if reasoning else None)\n", + "\n", + " metric_instance = CustomRankingMetric(\n", + " name=metric_name,\n", + " prompt=prompt,\n", + " llm=llm,\n", + " num_ranks=num_ranks\n", + " )\n", + " metric_instance.__name__ = metric_name\n", + " metric_instance.__doc__ = metric.__doc__\n", + "\n", + " return metric_instance\n", + " return decorator" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "cbb1729b-8b25-48d8-a472-c03dd1e0d861", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1, 0, 2]\n", + "Ranked based on response clarity and detail.\n" + ] + } + ], + "source": [ + "@ranking_metric(\n", + " llm=LLM(), # Your language model instance\n", + " prompt=\"Rank the following responses:\\n{candidates}\",\n", + " name='new_ranking_metric',\n", + " num_ranks=3\n", + ")\n", + "def my_ranking_metric(llm, prompt, **kwargs):\n", + " # Your custom logic that calls the LLM and returns a tuple of (ranking, reason)\n", + " # For example, process the prompt (formatted with candidates) and produce a ranking.\n", + " ranking = [1, 0, 2] # Dummy ranking: second candidate is best, then first, then third.\n", + " reason = \"Ranked based on response clarity and detail.\"\n", + " return ranking, reason\n", + "\n", + "# Using the decorator-based ranking metric:\n", + "result = my_ranking_metric.score(candidates=[\n", + " \"Response A: short answer.\",\n", + " \"Response B: a bit more detailed.\",\n", + " \"Response C: the longest and most detailed answer.\"\n", + "])\n", + "print(result.score) # E.g., [1, 0, 2]\n", + "print(result.reason) # E.g., \"Ranked based on response clarity and detail.\"\n" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "87e6646a-c65a-4317-994c-43aae31d65e2", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (random)", + "language": "python", + "name": "random" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 64bef1a212f608da2478e4cf3618c7e616aef0c4 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 20 Mar 2025 23:25:39 -0700 Subject: [PATCH 02/17] finish implementation --- nbs/metric/base.ipynb | 645 ++++++++++++++++++++++-------------------- 1 file changed, 331 insertions(+), 314 deletions(-) diff --git a/nbs/metric/base.ipynb b/nbs/metric/base.ipynb index e76d171..6133f6a 100644 --- a/nbs/metric/base.ipynb +++ b/nbs/metric/base.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 13, "id": "5d14fc66-b8af-4a75-b761-20b8e9ce19f8", "metadata": {}, "outputs": [ @@ -31,7 +31,7 @@ "True" ] }, - "execution_count": 1, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -44,10 +44,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "0b05c525-e153-49ab-b768-5069f624f215", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/Caskroom/miniforge/base/envs/random/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "#| export\n", "import typing as t\n", @@ -75,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 15, "id": "56dbaeb3-e105-4daf-a5b2-a91cb1fb976f", "metadata": {}, "outputs": [], @@ -246,7 +255,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 16, "id": "89555bee-23a3-4129-86a5-0a2ffeed00c7", "metadata": {}, "outputs": [ @@ -256,7 +265,7 @@ "'low'" ] }, - "execution_count": 9, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -268,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 17, "id": "f21b7a50-3142-40e9-9612-0603f5b8654d", "metadata": {}, "outputs": [ @@ -278,7 +287,7 @@ "'my reason'" ] }, - "execution_count": 11, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -289,7 +298,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "be7f588f-2a13-4c10-9775-00101a03e0a5", "metadata": {}, "outputs": [], @@ -339,22 +348,10 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 108, "id": "c5e19478-d947-405b-a229-4a1e7daa2fd3", "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'dataclass' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[12], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m#| export\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;129m@dataclass\u001b[39m\n\u001b[1;32m 4\u001b[0m 
\u001b[38;5;28;01mclass\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mMetric\u001b[39;00m(ABC):\n\u001b[1;32m 5\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Base class for all metrics in the LLM evaluation library.\"\"\"\u001b[39;00m\n\u001b[1;32m 6\u001b[0m name:\u001b[38;5;28mstr\u001b[39m\n", - "\u001b[0;31mNameError\u001b[0m: name 'dataclass' is not defined" - ] - } - ], + "outputs": [], "source": [ "#| export\n", "\n", @@ -364,29 +361,43 @@ " name:str\n", " prompt:str\n", " llm:LLM\n", - " _response_models: Dict[bool, Type[BaseModel]] = field(\n", + " _response_models: t.Dict[bool, t.Type[BaseModel]] = field(\n", " default_factory=dict, init=False, repr=False\n", " )\n", " \n", " @abstractmethod\n", - " def _get_response_model(self, with_reasoning: bool) -> Type[BaseModel]:\n", + " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", " \"\"\"Get the appropriate response model.\"\"\"\n", " pass\n", + "\n", + " @abstractmethod\n", + " def _ensemble(self,results:t.List[MetricResult]) -> MetricResult:\n", + " pass\n", + " \n", " \n", " def score(self, reasoning=True, n=1, **kwargs) -> Any:\n", - " \n", + "\n", + " responses = []\n", " prompt_input = self.prompt.format(**kwargs)\n", - " response = self.llm.generate(prompt_input, response_model = self._get_response_model(reasoning))\n", - " return MetricResult(**response.model_dump())\n", + " for _ in range(n):\n", + " response = self.llm.generate(prompt_input, response_model = self._get_response_model(reasoning)) \n", + " response = MetricResult(**response.model_dump())\n", + " responses.append(response)\n", + " return self._ensemble(responses)\n", + "\n", "\n", " async def ascore(self, reasoning=True, n=1, **kwargs):\n", + " \n", " prompt_input = self.prompt.format(**kwargs)\n", - " response = await self.llm.agenerate(prompt_input, response_model = self._get_response_model(reasoning))\n", - " return MetricResult(**response.model_dump())\n", + " for _ in range(n):\n", + " response = await self.llm.agenerate(prompt_input, response_model = self._get_response_model(reasoning))\n", + " response = MetricResult(**response.model_dump)\n", + " responses.append(response)\n", + " return self._ensemble(responses)\n", " \n", " def batch_score(self, inputs: List[Dict[str, Any]], reasoning:bool=True, n:int=1) -> List[Any]:\n", " \n", - " return [self.score(**input_dict) for input_dict in inputs]\n", + " return [self.score(reasoning,n,**input_dict) for input_dict in inputs]\n", " \n", " async def abatch_score(self, inputs: List[Dict[str, Any]], reasoning: bool = True, n: int = 1) -> List[MetricResult]:\n", " \n", @@ -412,17 +423,17 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 109, "id": "f09683e0-5ec3-4e60-a8c4-1657e2fe60b9", "metadata": {}, "outputs": [], "source": [ "#| export\n", + "from collections import Counter\n", "\n", "@dataclass\n", "class DiscreteMetrics(Metric):\n", " values: t.List[str] = field(default_factory=lambda: [\"pass\", \"fail\"])\n", - " _response_models: t.Dict[bool, t.Type[BaseModel]] = field(default_factory=dict, init=False, repr=False)\n", " \n", " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", @@ -431,7 +442,7 @@ " return self._response_models[with_reasoning]\n", " \n", " model_name = 'response_model'\n", - " fields = {\"score\": (t.Literal[tuple(self.values)], ...)}\n", + " fields = {\"result\": (t.Literal[tuple(self.values)], ...)}\n", " 
\n", " if with_reasoning:\n", " fields[\"reason\"] = (str, ...)\n", @@ -440,6 +451,23 @@ " self._response_models[with_reasoning] = model\n", " return model \n", "\n", + " def _ensemble(self,results:t.List[MetricResult]) -> MetricResult:\n", + "\n", + "\n", + " if len(results)==1:\n", + " return results[0]\n", + " \n", + " candidates = [candidate.result for candidate in results]\n", + " counter = Counter(candidates)\n", + " max_count = max(counter.values())\n", + " for candidate in results:\n", + " if counter[candidate.result] == max_count:\n", + " result = candidate.result \n", + " reason = candidate.reason\n", + " break\n", + " \n", + " return MetricResult(result=result,reason=reason)\n", + "\n", "\n", " " ] @@ -454,81 +482,125 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 110, "id": "99d5afd6-72bd-42d7-bff0-effce9cf8cd9", "metadata": {}, "outputs": [], "source": [ - "#| export\n", - "\n", - "\n", - "def discrete_metric(llm, prompt, values:t.List[str],name:t.Optional[str]=None):\n", - "\n", - " def decorator(metric):\n", - " metric_name = name or metric.__name__ \n", - " is_async = inspect.iscoroutinefunction(metric)\n", - "\n", - " @dataclass\n", - " class CustomDiscreteMetric(DiscreteMetrics):\n", - "\n", - " def score(self,reasoning:bool=True, n:int=1,**kwargs):\n", + "import typing as t\n", + "from typing import Any, Callable, Dict, List, Optional, Type, Union\n", + "import inspect\n", + "import asyncio\n", + "from dataclasses import dataclass\n", + "from abc import ABC\n", "\n", - " if is_async:\n", - " # For async functions, we need to run them in an event loop\n", - " import asyncio\n", + "def create_metric_decorator(metric_class):\n", + " \"\"\"\n", + " Factory function that creates decorator factories for different metric types.\n", + " \n", + " Args:\n", + " metric_class: The metric class to use (DiscreteMetrics, NumericMetrics, etc.)\n", + " \n", + " Returns:\n", + " A decorator factory function for the specified metric type\n", + " \"\"\"\n", + " def decorator_factory(llm, prompt, name: t.Optional[str] = None, **metric_params):\n", + " \"\"\"\n", + " Creates a decorator that wraps a function into a metric instance.\n", + " \n", + " Args:\n", + " llm: The language model instance to use\n", + " prompt: The prompt template\n", + " name: Optional name for the metric (defaults to function name)\n", + " **metric_params: Additional parameters specific to the metric type\n", + " (values for DiscreteMetrics, range for NumericMetrics, etc.)\n", + " \n", + " Returns:\n", + " A decorator function\n", + " \"\"\"\n", + " def decorator(func):\n", + " # Get metric name and check if function is async\n", + " metric_name = name or func.__name__\n", + " is_async = inspect.iscoroutinefunction(func)\n", + " \n", + " @dataclass\n", + " class CustomMetric(metric_class):\n", + " def _extract_result(self, result, reasoning: bool):\n", + " \"\"\"Extract score and reason from the result.\"\"\"\n", + " if isinstance(result, tuple) and len(result) == 2:\n", + " score, reason = result\n", + " else:\n", + " score, reason = result, None\n", " \n", - " # Get or create an event loop\n", + " # Use \"result\" instead of \"score\" for the new MetricResult implementation\n", + " return MetricResult(result=score, reason=reason if reasoning else None)\n", + " \n", + " def _run_sync_in_async(self, func, *args, **kwargs):\n", + " \"\"\"Run a synchronous function in an async context.\"\"\"\n", + " # For sync functions, just run them normally\n", + " return func(*args, **kwargs)\n", + " \n", 
+ " def _execute_metric(self, is_async_execution, reasoning, **kwargs):\n", + " \"\"\"Execute the metric function with proper async handling.\"\"\"\n", " try:\n", - " loop = asyncio.get_event_loop()\n", - " except RuntimeError:\n", - " loop = asyncio.new_event_loop()\n", - " asyncio.set_event_loop(loop)\n", - " \n", - " # Run the async function and get the result\n", - " result = loop.run_until_complete(metric(self.llm,self.prompt,**kwargs))\n", - " else:\n", - " # For sync functions, just call directly\n", - " result = metric(self.llm,self.prompt,**kwargs)\n", - " \n", - " if isinstance(result, tuple) and len(result) == 2:\n", - " score, reason = result\n", - " else:\n", - " score = result\n", - " reason = None\n", - " \n", - " return MetricResult(score=score, reason=reason if reasoning else None)\n", + " if is_async:\n", + " # Async function implementation\n", + " if is_async_execution:\n", + " # In async context, await the function directly\n", + " result = func(self.llm, self.prompt, **kwargs)\n", + " else:\n", + " # In sync context, run the async function in an event loop\n", + " try:\n", + " loop = asyncio.get_event_loop()\n", + " except RuntimeError:\n", + " loop = asyncio.new_event_loop()\n", + " asyncio.set_event_loop(loop)\n", + " result = loop.run_until_complete(func(self.llm, self.prompt, **kwargs))\n", + " else:\n", + " # Sync function implementation\n", + " result = func(self.llm, self.prompt, **kwargs)\n", + " \n", + " return self._extract_result(result, reasoning)\n", + " except Exception as e:\n", + " # Handle errors gracefully\n", + " error_msg = f\"Error executing metric {self.name}: {str(e)}\"\n", + " return MetricResult(result=None, reason=error_msg)\n", + " \n", + " def score(self, reasoning: bool = True, n: int = 1, **kwargs):\n", + " \"\"\"Synchronous scoring method.\"\"\"\n", + " return self._execute_metric(is_async_execution=False, reasoning=reasoning, **kwargs)\n", + " \n", + " async def ascore(self, reasoning: bool = True, n: int = 1, **kwargs):\n", + " \"\"\"Asynchronous scoring method.\"\"\"\n", + " if is_async:\n", + " # For async functions, await the result\n", + " result = await func(self.llm, self.prompt, **kwargs)\n", + " return self._extract_result(result, reasoning)\n", + " else:\n", + " # For sync functions, run normally\n", + " result = self._run_sync_in_async(func, self.llm, self.prompt, **kwargs)\n", + " return self._extract_result(result, reasoning)\n", " \n", - " async def ascore(self, reasoning=True, n=1, **kwargs):\n", - " if is_async:\n", - " # For async functions, await them directly\n", - " result = await metric(self.llm,self.prompt,**kwargs)\n", - " else:\n", - " # For sync functions, run them normally\n", - " result = metric(self.llm,self.prompt,**kwargs)\n", - "\n", - " if isinstance(result, tuple) and len(result) == 2:\n", - " score, reason = result\n", - " else:\n", - " score = result\n", - " reason = None\n", - " \n", - " metric_instance = CustomDiscreteMetric(\n", - " name=metric_name,\n", - " prompt=prompt,\n", - " llm=llm,\n", - " values=values,\n", + " # Create the metric instance with all parameters\n", + " metric_instance = CustomMetric(\n", + " name=metric_name,\n", + " prompt=prompt,\n", + " llm=llm,\n", + " **metric_params\n", + " )\n", " \n", - " )\n", - " metric_instance.__name__ = name\n", - " metric_instance.__doc__ = metric.__doc__\n", - " \n", - " return metric_instance\n", - " \n", - " return decorator\n", + " # Preserve metadata\n", + " metric_instance.__name__ = metric_name\n", + " metric_instance.__doc__ = 
func.__doc__\n", " \n", + " return metric_instance\n", + " \n", + " return decorator\n", " \n", + " return decorator_factory\n", "\n", - " " + "# Create specific decorator factories for each metric type\n", + "discrete_metric = create_metric_decorator(DiscreteMetrics)\n" ] }, { @@ -541,11 +613,23 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 113, "id": "aeae8fe5-e81a-44ac-9ad7-a240655a0f06", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "\"The answer provided lacks specific context or detail needed to evaluate its helpfulness fully. Without more information, it's difficult to determine its applicability.\"" + ] + }, + "execution_count": 113, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "\n", "my_metric = DiscreteMetrics(\n", " name='helpfulness',\n", " llm=LLM(),\n", @@ -561,30 +645,30 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 114, "id": "b5e499d8-8258-46ce-b719-0389d3cfd8db", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "MetricResult(score=low, reason=None)" + "'low'" ] }, - "execution_count": 18, + "execution_count": 114, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## score without reasoning to save reasoning tokens cost\n", - "result = my_metric.score(response=\"this is my response\",reasoning=False)\n", + "result = my_metric.score(response=\"this is my response\",reasoning=False,n=3)\n", "result" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 115, "id": "a9a5d6c6-4cfc-4f45-8b19-996315a95370", "metadata": {}, "outputs": [ @@ -594,7 +678,7 @@ "'reason'" ] }, - "execution_count": 24, + "execution_count": 115, "metadata": {}, "output_type": "execute_result" } @@ -618,31 +702,10 @@ " return score,\"reason\"\n", "\n", "result = my_metric.score(response='my response') # result\n", - "result.score\n", + "result\n", "result.reason" ] }, - { - "cell_type": "code", - "execution_count": 25, - "id": "2b4809ca-d921-4084-bb9e-fe3a72f438a1", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "MetricResult(score=high, reason=reason)" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "result" - ] - }, { "cell_type": "markdown", "id": "05f60c70-fc32-41f8-aa7c-c8685d77398a", @@ -653,7 +716,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 124, "id": "6a1c66fb-3c1c-4bc6-9996-0b5beb304b9c", "metadata": {}, "outputs": [], @@ -663,7 +726,6 @@ "@dataclass\n", "class NumericMetrics(Metric):\n", " range: t.Tuple[float,float]\n", - " _response_models: t.Dict[bool, t.Type[BaseModel]] = field(default_factory=dict, init=False, repr=False)\n", " \n", " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", @@ -672,7 +734,7 @@ " return self._response_models[with_reasoning]\n", " \n", " model_name = 'response_model'\n", - " fields = {\"score\": (float,...)}\n", + " fields = {\"result\": (float,...)}\n", " \n", " if with_reasoning:\n", " fields[\"reason\"] = (str, ...)\n", @@ -680,26 +742,37 @@ " model = create_model(model_name, **fields)\n", " self._response_models[with_reasoning] = model\n", " return model \n", - " \n", - " def score(self, reasoning=True, n=1, **kwargs) -> Any:\n", - " \n", - " prompt_input = self.prompt.format(**kwargs)\n", - " response = self.llm.generate(prompt_input, response_model = 
self._get_response_model(reasoning))\n", - " return MetricResult(**response.model_dump())\n", "\n", - " async def ascore(self, reasoning=True, n=1, **kwargs):\n", - " prompt_input = self.prompt.format(**kwargs)\n", - " response = await self.llm.agenerate(prompt_input, response_model = self._get_response_model(reasoning))\n", - " return MetricResult(**response.model_dump())\n", - "\n" + " def _ensemble(self,results:t.List[MetricResult]) -> MetricResult:\n", + "\n", + " if len(results)==1:\n", + " return results[0]\n", + " \n", + " candidates = [candidate.result for candidate in results]\n", + " result = sum(candidates)/len(candidates)\n", + " reason = results[0].reason\n", + " \n", + " return MetricResult(result=result,reason=reason)\n", + " \n" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 129, "id": "251cdea8-fc71-46bd-8a00-fb8e33e10350", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'The response lacks sufficient information or context to be considered helpful. It does not address any specific question or provide any useful insights.'" + ] + }, + "execution_count": 129, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "my_metric = NumericMetrics(\n", " name='helpfulness',\n", @@ -711,8 +784,28 @@ "result = my_metric.score(response=\"this is my response\")\n", "result #gives \"low\"\n", "result.reason #gives reasoning from llm\n", - "\n", - "result = my_metric.batch_score(inputs=[{\"response\":\"this is my response\"}])\n" + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "id": "b0994c80-c6db-4f3b-9ed9-1b32d61428c6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0.0]" + ] + }, + "execution_count": 130, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "my_metric.batch_score(inputs=[{\"response\":\"this is my response\"}])\n" ] }, { @@ -725,96 +818,28 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 120, "id": "265af384-ed35-4262-acfe-6847b22d3089", "metadata": {}, "outputs": [], "source": [ "#| export\n", - "\n", - "\n", - "def numeric_metric(llm, prompt, range: t.Tuple[float,float],name:t.Optional[str]=None):\n", - "\n", - " def decorator(metric):\n", - " metric_name = name or metric.__name__ \n", - " is_async = inspect.iscoroutinefunction(metric)\n", - "\n", - " @dataclass\n", - " class CustomNumericMetric(NumericMetrics):\n", - "\n", - " def score(self,reasoning:bool=True, n:int=1,**kwargs):\n", - "\n", - " if is_async:\n", - " # For async functions, we need to run them in an event loop\n", - " import asyncio\n", - " \n", - " # Get or create an event loop\n", - " try:\n", - " loop = asyncio.get_event_loop()\n", - " except RuntimeError:\n", - " loop = asyncio.new_event_loop()\n", - " asyncio.set_event_loop(loop)\n", - " \n", - " # Run the async function and get the result\n", - " result = loop.run_until_complete(metric(self.llm,self.prompt,**kwargs))\n", - " else:\n", - " # For sync functions, just call directly\n", - " result = metric(self.llm,self.prompt,**kwargs)\n", - " \n", - " if isinstance(result, tuple) and len(result) == 2:\n", - " score, reason = result\n", - " else:\n", - " score = result\n", - " reason = None\n", - " \n", - " return MetricResult(score=score, reason=reason if reasoning else None)\n", - " \n", - " async def ascore(self, reasoning=True, n=1, **kwargs):\n", - " if is_async:\n", - " # For async functions, await them directly\n", - " result = await 
metric(self.llm,self.prompt,**kwargs)\n", - " else:\n", - " # For sync functions, run them normally\n", - " result = metric(self.llm,self.prompt,**kwargs)\n", - "\n", - " if isinstance(result, tuple) and len(result) == 2:\n", - " score, reason = result\n", - " else:\n", - " score = result\n", - " reason = None\n", - " \n", - " metric_instance = CustomNumericMetric(\n", - " name=metric_name,\n", - " prompt=prompt,\n", - " llm=llm,\n", - " range=range,\n", - " \n", - " )\n", - " metric_instance.__name__ = name\n", - " metric_instance.__doc__ = metric.__doc__\n", - " \n", - " return metric_instance\n", - " \n", - " return decorator\n", - " \n", - " \n", - "\n", - " " + "numeric_metric = create_metric_decorator(NumericMetrics)" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 132, "id": "009c1944-bda7-41b1-9235-dcda5acbed55", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'reason'" + "20" ] }, - "execution_count": 38, + "execution_count": 132, "metadata": {}, "output_type": "execute_result" } @@ -847,38 +872,38 @@ "result1 + result2 # should be addable and behave like a float\n" ] }, + { + "cell_type": "markdown", + "id": "90794704-5e45-4dd5-8862-b4fb9694a5b5", + "metadata": {}, + "source": [ + "### Ranking metric" + ] + }, { "cell_type": "code", - "execution_count": 39, - "id": "ff32e972-0900-4ff1-94db-1378302f8d97", + "execution_count": 145, + "id": "9e2bb718-ba9a-4965-a952-462ac0159766", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0" + "typing.Literal[[0, 1, 2]]" ] }, - "execution_count": 39, + "execution_count": 145, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "result.score\n" - ] - }, - { - "cell_type": "markdown", - "id": "90794704-5e45-4dd5-8862-b4fb9694a5b5", - "metadata": {}, - "source": [ - "### Ranking metric" + "t.Literal[[i for i in range(3)]]" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 159, "id": "0b1bbf8f-c7fa-4004-9165-fa388f7ba15d", "metadata": {}, "outputs": [], @@ -888,7 +913,6 @@ "@dataclass\n", "class RankingMetrics(Metric):\n", " num_ranks: int\n", - " _response_models: t.Dict[bool, t.Type[BaseModel]] = field(default_factory=dict, init=False, repr=False)\n", " \n", " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", @@ -897,40 +921,80 @@ " return self._response_models[with_reasoning]\n", " \n", " model_name = 'response_model'\n", - " fields = {\"score\": (t.List[int],...)}\n", - " \n", + "\n", + " # Custom validator to ensure 'result' is a permutation of 0 .. 
num_ranks-1\n", + " def validate_result(cls, v):\n", + " expected = set(range(self.num_ranks))\n", + " if set(v) != expected:\n", + " raise ValueError(\n", + " f\"'result' must contain exactly the numbers {sorted(expected)} without repetition.\"\n", + " )\n", + " return v\n", + "\n", + " # Define the fields dynamically\n", + " fields = {\"result\": (List[int], ...)}\n", " if with_reasoning:\n", " fields[\"reason\"] = (str, ...)\n", " \n", - " model = create_model(model_name, **fields)\n", + " # Create the dynamic model with the custom validator attached\n", + " model = create_model(\n", + " model_name,\n", + " **fields,\n", + " __validators__={\n", + " 'result_validator': validator('result', allow_reuse=True)(validate_result)\n", + " }\n", + " )\n", " self._response_models[with_reasoning] = model\n", " return model \n", - " \n", - " def score(self, reasoning=True, n=1, **kwargs) -> Any:\n", - " \n", - " prompt_input = self.prompt.format(**kwargs)\n", - " response = self.llm.generate(prompt_input, response_model = self._get_response_model(reasoning))\n", - " return MetricResult(**response.model_dump())\n", "\n", - " async def ascore(self, reasoning=True, n=1, **kwargs):\n", - " prompt_input = self.prompt.format(**kwargs)\n", - " response = await self.llm.agenerate(prompt_input, response_model = self._get_response_model(reasoning))\n", - " return MetricResult(**response.model_dump())\n", - "\n" + " def _ensemble(self,results:t.List[MetricResult]) -> MetricResult:\n", + "\n", + " if len(results)==1:\n", + " return results[0]\n", + "\n", + " n_items = len(results)\n", + " borda_scores = [0] * n_items\n", + "\n", + " for result in results:\n", + " for position_idx,item_idx in enumerate(result.result):\n", + " borda_scores[item_idx] += (n_items - (position_idx-1))\n", + "\n", + " indexed_scores = [(score, i) for i, score in enumerate(borda_scores)] \n", + " indexed_scores.sort(key=lambda x: (-x[0], x[1])) \n", + " final_ranking = [pos for _, pos in indexed_scores]\n", + "\n", + " if any(r.reason for r in results):\n", + " reason = \"Ensemble ranking based on multiple evaluations.\\n\" + '\\n'.join([r.reason for r in results if r.reason])\n", + " else:\n", + " reason = None\n", + " \n", + " \n", + " return MetricResult(result=result,reason=reason)" ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 161, "id": "716881a1-0a93-46b3-b41b-aee0f987a1a6", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/ww/sk5dkfhn673234cmy5w7008r0000gn/T/ipykernel_95467/972172485.py:40: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. 
See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/\n", + " 'result_validator': validator('result', allow_reuse=True)(validate_result)\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "[1, 2, 3]\n", - "The responses are ranked based on their length and detail, with the longest and most detailed answer receiving the highest rank.\n" + "[0, 1, 2]\n", + "Ensemble ranking based on multiple evaluations.\n", + "The responses are ranked from the shortest to the longest and most detailed.\n", + "The responses are ranked from shortest to longest and most detailed.\n", + "Responses ranked from shortest to longest.\n" ] } ], @@ -940,7 +1004,7 @@ " name='response_ranking',\n", " llm=LLM(), # Your language model instance\n", " prompt=\"Rank the following responses:\\n{candidates}\",\n", - " num_ranks=3\n", + " num_ranks=3,\n", ")\n", "\n", "# To score a single input (ranking candidate responses)\n", @@ -948,11 +1012,19 @@ " \"short answer.\",\n", " \"a bit more detailed.\",\n", " \"the longest and most detailed answer.\"\n", - "])\n", - "print(result.score) # Might output something like: [1, 0, 2]\n", + "],n=3)\n", + "print(result) # Might output something like: [1, 0, 2]\n", "print(result.reason) # Provides the reasoning behind the ranking\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e198d7d-fbab-448e-aab1-f10f4234dff6", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "id": "5b53bd5e-06c9-4430-9c06-f2225ddd7bd5", @@ -963,73 +1035,18 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 46, "id": "4c4e9170-67b9-4841-9df2-6afc490b89dd", "metadata": {}, "outputs": [], "source": [ "#| export\n", - "\n", - "def ranking_metric(llm, prompt, num_ranks: int, name: t.Optional[str] = None):\n", - " def decorator(metric):\n", - " metric_name = name or metric.__name__\n", - " is_async = inspect.iscoroutinefunction(metric)\n", - "\n", - " @dataclass\n", - " class CustomRankingMetric(RankingMetrics):\n", - " # Inherits: name, prompt, llm, num_ranks from RankingMetrics.\n", - " # No extra fields are needed.\n", - " def score(self, reasoning: bool = True, n: int = 1, **kwargs):\n", - " if is_async:\n", - " # For async functions, run in an event loop.\n", - " import asyncio\n", - " try:\n", - " loop = asyncio.get_event_loop()\n", - " except RuntimeError:\n", - " loop = asyncio.new_event_loop()\n", - " asyncio.set_event_loop(loop)\n", - " result = loop.run_until_complete(metric(self.llm, self.prompt, **kwargs))\n", - " else:\n", - " result = metric(self.llm, self.prompt, **kwargs)\n", - "\n", - " if isinstance(result, tuple) and len(result) == 2:\n", - " ranking, reason = result\n", - " else:\n", - " ranking = result\n", - " reason = None\n", - "\n", - " return MetricResult(score=ranking, reason=reason if reasoning else None)\n", - "\n", - " async def ascore(self, reasoning: bool = True, n: int = 1, **kwargs):\n", - " if is_async:\n", - " result = await metric(self.llm, self.prompt, **kwargs)\n", - " else:\n", - " result = metric(self.llm, self.prompt, **kwargs)\n", - "\n", - " if isinstance(result, tuple) and len(result) == 2:\n", - " ranking, reason = result\n", - " else:\n", - " ranking = result\n", - " reason = None\n", - "\n", - " return MetricResult(score=ranking, reason=reason if reasoning else None)\n", - "\n", - " metric_instance = CustomRankingMetric(\n", - " name=metric_name,\n", - " prompt=prompt,\n", - " llm=llm,\n", - " num_ranks=num_ranks\n", - " )\n", - " 
metric_instance.__name__ = metric_name\n", - " metric_instance.__doc__ = metric.__doc__\n", - "\n", - " return metric_instance\n", - " return decorator" + "ranking_metric = create_metric_decorator(RankingMetrics)" ] }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 48, "id": "cbb1729b-8b25-48d8-a472-c03dd1e0d861", "metadata": {}, "outputs": [ @@ -1062,14 +1079,14 @@ " \"Response B: a bit more detailed.\",\n", " \"Response C: the longest and most detailed answer.\"\n", "])\n", - "print(result.score) # E.g., [1, 0, 2]\n", + "print(result) # E.g., [1, 0, 2]\n", "print(result.reason) # E.g., \"Ranked based on response clarity and detail.\"\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "87e6646a-c65a-4317-994c-43aae31d65e2", + "id": "23e38ce5-aac9-489b-96c0-947011dbbdf7", "metadata": {}, "outputs": [], "source": [] From 935ef1f323112ad504fa0c82c43721e39d52d349 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Fri, 21 Mar 2025 13:38:22 -0700 Subject: [PATCH 03/17] nb files for metric --- nbs/metric/base.ipynb | 1029 ++-------------------------------- nbs/metric/decorator.ipynb | 240 ++++++++ nbs/metric/discrete.ipynb | 203 +++++++ nbs/metric/llm.ipynb | 59 ++ nbs/metric/numeric.ipynb | 189 +++++++ nbs/metric/ranking.ipynb | 224 ++++++++ nbs/metric/result.ipynb | 268 +++++++++ nbs/metric/test_base.ipynb | 1076 ++++++++++++++++++++++++++++++++++++ 8 files changed, 2311 insertions(+), 977 deletions(-) create mode 100644 nbs/metric/decorator.ipynb create mode 100644 nbs/metric/discrete.ipynb create mode 100644 nbs/metric/llm.ipynb create mode 100644 nbs/metric/numeric.ipynb create mode 100644 nbs/metric/ranking.ipynb create mode 100644 nbs/metric/result.ipynb create mode 100644 nbs/metric/test_base.ipynb diff --git a/nbs/metric/base.ipynb b/nbs/metric/base.ipynb index 6133f6a..dd9d599 100644 --- a/nbs/metric/base.ipynb +++ b/nbs/metric/base.ipynb @@ -1,18 +1,9 @@ { "cells": [ - { - "cell_type": "markdown", - "id": "c48aac0f-c63c-4bfb-95b0-a1239f41ccb3", - "metadata": {}, - "source": [ - "# Base\n", - "> baseclass for metric" - ] - }, { "cell_type": "code", - "execution_count": 10, - "id": "d903a59c-3ed8-4b7d-bb9d-39180df72950", + "execution_count": null, + "id": "00ef8db1", "metadata": {}, "outputs": [], "source": [ @@ -21,8 +12,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "id": "5d14fc66-b8af-4a75-b761-20b8e9ce19f8", + "execution_count": 4, + "id": "125fcb9a", "metadata": {}, "outputs": [ { @@ -31,7 +22,7 @@ "True" ] }, - "execution_count": 13, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -42,10 +33,19 @@ "load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')" ] }, + { + "cell_type": "markdown", + "id": "2eb8f806", + "metadata": {}, + "source": [ + "# BaseMetric\n", + "> base class for all type of metrics in ragas" + ] + }, { "cell_type": "code", - "execution_count": 14, - "id": "0b05c525-e153-49ab-b768-5069f624f215", + "execution_count": 1, + "id": "e8ccff58", "metadata": {}, "outputs": [ { @@ -59,308 +59,21 @@ ], "source": [ "#| export\n", - "import typing as t\n", - "from typing import Any, Callable, Dict, List, Optional, Union\n", + "\n", "from abc import ABC, abstractmethod\n", "import asyncio\n", - "from dataclasses import dataclass\n", - "from pydantic import BaseModel\n", - "import openai\n", - "import instructor\n", "from dataclasses import dataclass, field\n", - "from pydantic import BaseModel, create_model\n", - "import typing as t\n", - "import inspect" - ] - }, - { - "cell_type": "markdown", 
- "id": "b5140197-08a5-46de-b9bd-818e9be8c951", - "metadata": {}, - "source": [ - "### MetricResult\n", - "> Class to hold the result metric" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "56dbaeb3-e105-4daf-a5b2-a91cb1fb976f", - "metadata": {}, - "outputs": [], - "source": [ + "from pydantic import BaseModel\n", "import typing as t\n", - "from typing import Any, Callable, Dict, List, Optional, Union\n", - "\n", - "class MetricResult:\n", - " \"\"\"Class to hold the result of a metric evaluation.\n", - " \n", - " This class behaves like its underlying result value but still provides access\n", - " to additional metadata like reasoning.\n", - " \n", - " Works with:\n", - " - DiscreteMetrics (string results)\n", - " - NumericMetrics (float/int results)\n", - " - RankingMetrics (list results)\n", - " \"\"\"\n", - " \n", - " def __init__(self, result: Any, reason: t.Optional[str] = None):\n", - " self._result = result\n", - " self.reason = reason\n", - " \n", - " def __repr__(self):\n", - " return repr(self._result)\n", - " \n", - " # Access to underlying result\n", - " @property\n", - " def result(self):\n", - " \"\"\"Get the raw result value.\"\"\"\n", - " return self._result\n", - " \n", - " \n", - " # String conversion - works for all types\n", - " def __str__(self):\n", - " return str(self._result)\n", - " \n", - " # Container-like behaviors for list results (RankingMetric)\n", - " def __getitem__(self, key):\n", - " if not hasattr(self._result, \"__getitem__\"):\n", - " raise TypeError(f\"{type(self._result).__name__} object is not subscriptable\")\n", - " return self._result[key]\n", - " \n", - " def __iter__(self):\n", - " if not hasattr(self._result, \"__iter__\"):\n", - " raise TypeError(f\"{type(self._result).__name__} object is not iterable\")\n", - " return iter(self._result)\n", - " \n", - " def __len__(self):\n", - " if not hasattr(self._result, \"__len__\"):\n", - " raise TypeError(f\"{type(self._result).__name__} has no len()\")\n", - " return len(self._result)\n", - " \n", - " # Numeric operations for numeric results (NumericMetric)\n", - " def __float__(self):\n", - " if isinstance(self._result, (int, float)):\n", - " return float(self._result)\n", - " raise TypeError(f\"Cannot convert {type(self._result).__name__} to float\")\n", - " \n", - " def __int__(self):\n", - " if isinstance(self._result, (int, float)):\n", - " return int(self._result)\n", - " raise TypeError(f\"Cannot convert {type(self._result).__name__} to int\")\n", - " \n", - " def __add__(self, other):\n", - " if not isinstance(self._result, (int, float)):\n", - " raise TypeError(f\"Cannot add {type(self._result).__name__} objects\")\n", - " if isinstance(other, MetricResult):\n", - " return self._result + other._result\n", - " return self._result + other\n", - " \n", - " def __radd__(self, other):\n", - " if not isinstance(self._result, (int, float)):\n", - " raise TypeError(f\"Cannot add {type(self._result).__name__} objects\")\n", - " return other + self._result\n", - " \n", - " def __sub__(self, other):\n", - " if not isinstance(self._result, (int, float)):\n", - " raise TypeError(f\"Cannot subtract {type(self._result).__name__} objects\")\n", - " if isinstance(other, MetricResult):\n", - " return self._result - other._result\n", - " return self._result - other\n", - " \n", - " def __rsub__(self, other):\n", - " if not isinstance(self._result, (int, float)):\n", - " raise TypeError(f\"Cannot subtract {type(self._result).__name__} objects\")\n", - " return other - self._result\n", - " 
\n", - " def __mul__(self, other):\n", - " if not isinstance(self._result, (int, float)):\n", - " raise TypeError(f\"Cannot multiply {type(self._result).__name__} objects\")\n", - " if isinstance(other, MetricResult):\n", - " return self._result * other._result\n", - " return self._result * other\n", - " \n", - " def __rmul__(self, other):\n", - " if not isinstance(self._result, (int, float)):\n", - " raise TypeError(f\"Cannot multiply {type(self._result).__name__} objects\")\n", - " return other * self._result\n", - " \n", - " def __truediv__(self, other):\n", - " if not isinstance(self._result, (int, float)):\n", - " raise TypeError(f\"Cannot divide {type(self._result).__name__} objects\")\n", - " if isinstance(other, MetricResult):\n", - " return self._result / other._result\n", - " return self._result / other\n", - " \n", - " def __rtruediv__(self, other):\n", - " if not isinstance(self._result, (int, float)):\n", - " raise TypeError(f\"Cannot divide {type(self._result).__name__} objects\")\n", - " return other / self._result\n", - " \n", - " # Comparison operations - work for all types with same-type comparisons\n", - " def __eq__(self, other):\n", - " if isinstance(other, MetricResult):\n", - " return self._result == other._result\n", - " return self._result == other\n", - " \n", - " def __lt__(self, other):\n", - " if isinstance(other, MetricResult):\n", - " return self._result < other._result\n", - " return self._result < other\n", - " \n", - " def __le__(self, other):\n", - " if isinstance(other, MetricResult):\n", - " return self._result <= other._result\n", - " return self._result <= other\n", - " \n", - " def __gt__(self, other):\n", - " if isinstance(other, MetricResult):\n", - " return self._result > other._result\n", - " return self._result > other\n", - " \n", - " def __ge__(self, other):\n", - " if isinstance(other, MetricResult):\n", - " return self._result >= other._result\n", - " return self._result >= other\n", - " \n", - " # Method forwarding for type-specific behaviors\n", - " def __getattr__(self, name):\n", - " \"\"\"Forward attribute access to the result object if it has that attribute.\n", - " \n", - " This allows calling string methods on discrete results, \n", - " numeric methods on numeric results, and list methods on ranking results.\n", - " \"\"\"\n", - " if hasattr(self._result, name):\n", - " attr = getattr(self._result, name)\n", - " if callable(attr):\n", - " # If it's a method, wrap it to return MetricResult when appropriate\n", - " def wrapper(*args, **kwargs):\n", - " result = attr(*args, **kwargs)\n", - " # If the result is of the same type as self._result, wrap it\n", - " if isinstance(result, type(self._result)):\n", - " return MetricResult(result=result, reason=self.reason)\n", - " return result\n", - " return wrapper\n", - " return attr\n", - " raise AttributeError(f\"{type(self).__name__} has no attribute '{name}'\")\n", - " \n", - " # JSON/dict serialization\n", - " def to_dict(self):\n", - " \"\"\"Convert the result to a dictionary.\"\"\"\n", - " return {\n", - " \"result\": self._result,\n", - " \"reason\": self.reason\n", - " }" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "89555bee-23a3-4129-86a5-0a2ffeed00c7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'low'" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "result = MetricResult(result='low',reason=\"my reason\")\n", - "result\n" - ] - }, - { - "cell_type": "code", - 
"execution_count": 17, - "id": "f21b7a50-3142-40e9-9612-0603f5b8654d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'my reason'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "result.reason" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "be7f588f-2a13-4c10-9775-00101a03e0a5", - "metadata": {}, - "outputs": [], - "source": [ - "@dataclass\n", - "class LLM:\n", - "\n", - " def __post_init__(self):\n", - " self.aclient = instructor.from_openai(openai.AsyncOpenAI())\n", - " self.client = instructor.from_openai(openai.OpenAI())\n", - "\n", - " \n", - " def generate(self,prompt,response_model):\n", - " return self.client.chat.completions.create(\n", - " model=\"gpt-4o-mini\",\n", - " messages=[\n", - " {\"role\": \"user\", \"content\": prompt},\n", - " ],\n", - " response_model=response_model,\n", - " )\n", - "\n", - " async def agenerate(self,prompt,response_model):\n", - " return await self.aclient.chat.completions.create(\n", - " model=\"gpt-4o-mini\",\n", - " messages=[\n", - " {\"role\": \"user\", \"content\": prompt},\n", - " ],\n", - " response_model=response_model,\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "1bec7946-c61c-4631-9880-fff575974e39", - "metadata": {}, - "source": [ - "### Metric" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7c6fecbe-4fdd-464e-961e-48a528bc3278", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 108, - "id": "c5e19478-d947-405b-a229-4a1e7daa2fd3", - "metadata": {}, - "outputs": [], - "source": [ - "#| export\n", + "from ragas_annotator.metric import MetricResult\n", + "from ragas_annotator.metric import LLM\n", "\n", "@dataclass\n", "class Metric(ABC):\n", " \"\"\"Base class for all metrics in the LLM evaluation library.\"\"\"\n", - " name:str\n", - " prompt:str\n", - " llm:LLM\n", + " name: str\n", + " prompt: str\n", + " llm: LLM\n", " _response_models: t.Dict[bool, t.Type[BaseModel]] = field(\n", " default_factory=dict, init=False, repr=False\n", " )\n", @@ -371,12 +84,11 @@ " pass\n", "\n", " @abstractmethod\n", - " def _ensemble(self,results:t.List[MetricResult]) -> MetricResult:\n", + " def _ensemble(self, results: t.List[MetricResult]) -> MetricResult:\n", " pass\n", " \n", " \n", - " def score(self, reasoning=True, n=1, **kwargs) -> Any:\n", - "\n", + " def score(self, reasoning: bool = True, n: int = 1, **kwargs) -> t.Any:\n", " responses = []\n", " prompt_input = self.prompt.format(**kwargs)\n", " for _ in range(n):\n", @@ -386,731 +98,94 @@ " return self._ensemble(responses)\n", "\n", "\n", - " async def ascore(self, reasoning=True, n=1, **kwargs):\n", - " \n", + " async def ascore(self, reasoning: bool = True, n: int = 1, **kwargs) -> MetricResult:\n", + " responses = [] # Added missing initialization\n", " prompt_input = self.prompt.format(**kwargs)\n", " for _ in range(n):\n", " response = await self.llm.agenerate(prompt_input, response_model = self._get_response_model(reasoning))\n", - " response = MetricResult(**response.model_dump)\n", + " response = MetricResult(**response.model_dump()) # Fixed missing parentheses\n", " responses.append(response)\n", " return self._ensemble(responses)\n", " \n", - " def batch_score(self, inputs: List[Dict[str, Any]], reasoning:bool=True, n:int=1) -> List[Any]:\n", - " \n", - " return [self.score(reasoning,n,**input_dict) for input_dict in inputs]\n", + " def batch_score(self, inputs: t.List[t.Dict[str, 
t.Any]], reasoning: bool = True, n: int = 1) -> t.List[t.Any]:\n", + " return [self.score(reasoning, n, **input_dict) for input_dict in inputs]\n", " \n", - " async def abatch_score(self, inputs: List[Dict[str, Any]], reasoning: bool = True, n: int = 1) -> List[MetricResult]:\n", - " \n", + " async def abatch_score(self, inputs: t.List[t.Dict[str, t.Any]], reasoning: bool = True, n: int = 1) -> t.List[MetricResult]:\n", " async_tasks = []\n", " for input_dict in inputs:\n", " # Add reasoning and n to the input parameters\n", " async_tasks.append(self.ascore(reasoning=reasoning, n=n, **input_dict))\n", " \n", " # Run all tasks concurrently and return results\n", - " return await asyncio.gather(*async_tasks)\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "4af79bb4-4c3e-4004-b7f3-ec36e50b4ca5", - "metadata": {}, - "source": [ - "### DiscreteMetric \n" + " return await asyncio.gather(*async_tasks)" ] }, { "cell_type": "code", - "execution_count": 109, - "id": "f09683e0-5ec3-4e60-a8c4-1657e2fe60b9", + "execution_count": 2, + "id": "fcf208fa", "metadata": {}, "outputs": [], "source": [ - "#| export\n", - "from collections import Counter\n", + "## Example of a concrete implementation of the Metric class\n", + "\n", "\n", "@dataclass\n", - "class DiscreteMetrics(Metric):\n", + "class NumericMetric(Metric):\n", " values: t.List[str] = field(default_factory=lambda: [\"pass\", \"fail\"])\n", " \n", " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", " \n", - " if with_reasoning in self._response_models:\n", - " return self._response_models[with_reasoning]\n", - " \n", - " model_name = 'response_model'\n", - " fields = {\"result\": (t.Literal[tuple(self.values)], ...)}\n", - " \n", - " if with_reasoning:\n", - " fields[\"reason\"] = (str, ...)\n", - " \n", - " model = create_model(model_name, **fields)\n", - " self._response_models[with_reasoning] = model\n", - " return model \n", - "\n", - " def _ensemble(self,results:t.List[MetricResult]) -> MetricResult:\n", - "\n", - "\n", - " if len(results)==1:\n", - " return results[0]\n", - " \n", - " candidates = [candidate.result for candidate in results]\n", - " counter = Counter(candidates)\n", - " max_count = max(counter.values())\n", - " for candidate in results:\n", - " if counter[candidate.result] == max_count:\n", - " result = candidate.result \n", - " reason = candidate.reason\n", - " break\n", - " \n", - " return MetricResult(result=result,reason=reason)\n", - "\n", - "\n", - " " - ] - }, - { - "cell_type": "markdown", - "id": "f33ef89f-4ccc-4307-9944-4f372ce77830", - "metadata": {}, - "source": [ - "### decorator factory for discrete_metric" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "id": "99d5afd6-72bd-42d7-bff0-effce9cf8cd9", - "metadata": {}, - "outputs": [], - "source": [ - "import typing as t\n", - "from typing import Any, Callable, Dict, List, Optional, Type, Union\n", - "import inspect\n", - "import asyncio\n", - "from dataclasses import dataclass\n", - "from abc import ABC\n", - "\n", - "def create_metric_decorator(metric_class):\n", - " \"\"\"\n", - " Factory function that creates decorator factories for different metric types.\n", - " \n", - " Args:\n", - " metric_class: The metric class to use (DiscreteMetrics, NumericMetrics, etc.)\n", - " \n", - " Returns:\n", - " A decorator factory function for the specified metric type\n", - " \"\"\"\n", - " def decorator_factory(llm, prompt, 
name: t.Optional[str] = None, **metric_params):\n", - " \"\"\"\n", - " Creates a decorator that wraps a function into a metric instance.\n", - " \n", - " Args:\n", - " llm: The language model instance to use\n", - " prompt: The prompt template\n", - " name: Optional name for the metric (defaults to function name)\n", - " **metric_params: Additional parameters specific to the metric type\n", - " (values for DiscreteMetrics, range for NumericMetrics, etc.)\n", - " \n", - " Returns:\n", - " A decorator function\n", - " \"\"\"\n", - " def decorator(func):\n", - " # Get metric name and check if function is async\n", - " metric_name = name or func.__name__\n", - " is_async = inspect.iscoroutinefunction(func)\n", - " \n", - " @dataclass\n", - " class CustomMetric(metric_class):\n", - " def _extract_result(self, result, reasoning: bool):\n", - " \"\"\"Extract score and reason from the result.\"\"\"\n", - " if isinstance(result, tuple) and len(result) == 2:\n", - " score, reason = result\n", - " else:\n", - " score, reason = result, None\n", - " \n", - " # Use \"result\" instead of \"score\" for the new MetricResult implementation\n", - " return MetricResult(result=score, reason=reason if reasoning else None)\n", - " \n", - " def _run_sync_in_async(self, func, *args, **kwargs):\n", - " \"\"\"Run a synchronous function in an async context.\"\"\"\n", - " # For sync functions, just run them normally\n", - " return func(*args, **kwargs)\n", - " \n", - " def _execute_metric(self, is_async_execution, reasoning, **kwargs):\n", - " \"\"\"Execute the metric function with proper async handling.\"\"\"\n", - " try:\n", - " if is_async:\n", - " # Async function implementation\n", - " if is_async_execution:\n", - " # In async context, await the function directly\n", - " result = func(self.llm, self.prompt, **kwargs)\n", - " else:\n", - " # In sync context, run the async function in an event loop\n", - " try:\n", - " loop = asyncio.get_event_loop()\n", - " except RuntimeError:\n", - " loop = asyncio.new_event_loop()\n", - " asyncio.set_event_loop(loop)\n", - " result = loop.run_until_complete(func(self.llm, self.prompt, **kwargs))\n", - " else:\n", - " # Sync function implementation\n", - " result = func(self.llm, self.prompt, **kwargs)\n", - " \n", - " return self._extract_result(result, reasoning)\n", - " except Exception as e:\n", - " # Handle errors gracefully\n", - " error_msg = f\"Error executing metric {self.name}: {str(e)}\"\n", - " return MetricResult(result=None, reason=error_msg)\n", - " \n", - " def score(self, reasoning: bool = True, n: int = 1, **kwargs):\n", - " \"\"\"Synchronous scoring method.\"\"\"\n", - " return self._execute_metric(is_async_execution=False, reasoning=reasoning, **kwargs)\n", - " \n", - " async def ascore(self, reasoning: bool = True, n: int = 1, **kwargs):\n", - " \"\"\"Asynchronous scoring method.\"\"\"\n", - " if is_async:\n", - " # For async functions, await the result\n", - " result = await func(self.llm, self.prompt, **kwargs)\n", - " return self._extract_result(result, reasoning)\n", - " else:\n", - " # For sync functions, run normally\n", - " result = self._run_sync_in_async(func, self.llm, self.prompt, **kwargs)\n", - " return self._extract_result(result, reasoning)\n", - " \n", - " # Create the metric instance with all parameters\n", - " metric_instance = CustomMetric(\n", - " name=metric_name,\n", - " prompt=prompt,\n", - " llm=llm,\n", - " **metric_params\n", - " )\n", + " class mymodel(BaseModel):\n", + " result: int\n", + " reason: t.Optional[str] = None\n", " 
\n", - " # Preserve metadata\n", - " metric_instance.__name__ = metric_name\n", - " metric_instance.__doc__ = func.__doc__\n", - " \n", - " return metric_instance\n", - " \n", - " return decorator\n", - " \n", - " return decorator_factory\n", - "\n", - "# Create specific decorator factories for each metric type\n", - "discrete_metric = create_metric_decorator(DiscreteMetrics)\n" - ] - }, - { - "cell_type": "markdown", - "id": "07f49ad3-4476-4bcc-ac34-a87fb7a8652a", - "metadata": {}, - "source": [ - "### Usage pattern" - ] - }, - { - "cell_type": "code", - "execution_count": 113, - "id": "aeae8fe5-e81a-44ac-9ad7-a240655a0f06", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\"The answer provided lacks specific context or detail needed to evaluate its helpfulness fully. Without more information, it's difficult to determine its applicability.\"" - ] - }, - "execution_count": 113, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\n", - "my_metric = DiscreteMetrics(\n", - " name='helpfulness',\n", - " llm=LLM(),\n", - " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", - " values=[\"low\",\"med\",\"high\"],\n", - ")\n", - "\n", - "result = my_metric.score(response=\"this is my response\")\n", - "result #gives \"low\"\n", - "result.reason #gives reasoning from llm\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 114, - "id": "b5e499d8-8258-46ce-b719-0389d3cfd8db", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'low'" - ] - }, - "execution_count": 114, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "## score without reasoning to save reasoning tokens cost\n", - "result = my_metric.score(response=\"this is my response\",reasoning=False,n=3)\n", - "result" - ] - }, - { - "cell_type": "code", - "execution_count": 115, - "id": "a9a5d6c6-4cfc-4f45-8b19-996315a95370", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'reason'" - ] - }, - "execution_count": 115, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "@discrete_metric(llm=LLM(),\n", - " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", - " name='new_metric',values=[\"low\",\"med\",\"high\"])\n", - "def my_metric(llm,prompt,**kwargs):\n", - "\n", - " class response_model(BaseModel):\n", - " output: t.List[bool]\n", - " reason: str\n", - " \n", - " response = llm.generate(prompt.format(**kwargs),response_model=response_model)\n", - " total = sum(response.output)\n", - " if total < 1:\n", - " score = 'low'\n", - " else:\n", - " score = 'high'\n", - " return score,\"reason\"\n", - "\n", - "result = my_metric.score(response='my response') # result\n", - "result\n", - "result.reason" - ] - }, - { - "cell_type": "markdown", - "id": "05f60c70-fc32-41f8-aa7c-c8685d77398a", - "metadata": {}, - "source": [ - "## Numeric Metric" - ] - }, - { - "cell_type": "code", - "execution_count": 124, - "id": "6a1c66fb-3c1c-4bc6-9996-0b5beb304b9c", - "metadata": {}, - "outputs": [], - "source": [ - "#| export\n", - "\n", - "@dataclass\n", - "class NumericMetrics(Metric):\n", - " range: t.Tuple[float,float]\n", - " \n", - " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", - " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", - " \n", - " if with_reasoning in self._response_models:\n", - " return self._response_models[with_reasoning]\n", - " \n", - " model_name = 'response_model'\n", - " fields = {\"result\": 
(float,...)}\n", - " \n", - " if with_reasoning:\n", - " fields[\"reason\"] = (str, ...)\n", - " \n", - " model = create_model(model_name, **fields)\n", - " self._response_models[with_reasoning] = model\n", - " return model \n", + " return mymodel \n", "\n", " def _ensemble(self,results:t.List[MetricResult]) -> MetricResult:\n", - "\n", - " if len(results)==1:\n", - " return results[0]\n", - " \n", - " candidates = [candidate.result for candidate in results]\n", - " result = sum(candidates)/len(candidates)\n", - " reason = results[0].reason\n", - " \n", - " return MetricResult(result=result,reason=reason)\n", - " \n" - ] - }, - { - "cell_type": "code", - "execution_count": 129, - "id": "251cdea8-fc71-46bd-8a00-fb8e33e10350", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'The response lacks sufficient information or context to be considered helpful. It does not address any specific question or provide any useful insights.'" - ] - }, - "execution_count": 129, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_metric = NumericMetrics(\n", - " name='helpfulness',\n", - " llm=LLM(),\n", - " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", - " range=(0,10),\n", - ")\n", - "\n", - "result = my_metric.score(response=\"this is my response\")\n", - "result #gives \"low\"\n", - "result.reason #gives reasoning from llm\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 130, - "id": "b0994c80-c6db-4f3b-9ed9-1b32d61428c6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[0.0]" - ] - }, - "execution_count": 130, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_metric.batch_score(inputs=[{\"response\":\"this is my response\"}])\n" - ] - }, - { - "cell_type": "markdown", - "id": "c96520ae-294b-4868-8b2f-22a30ebd5f25", - "metadata": {}, - "source": [ - "### decorator factory" - ] - }, - { - "cell_type": "code", - "execution_count": 120, - "id": "265af384-ed35-4262-acfe-6847b22d3089", - "metadata": {}, - "outputs": [], - "source": [ - "#| export\n", - "numeric_metric = create_metric_decorator(NumericMetrics)" - ] - }, - { - "cell_type": "code", - "execution_count": 132, - "id": "009c1944-bda7-41b1-9235-dcda5acbed55", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "20" - ] - }, - "execution_count": 132, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "@numeric_metric(llm=LLM(),\n", - " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", - " name='new_metric',range=(0,10))\n", - "def my_metric(llm,prompt,**kwargs):\n", - "\n", - " class response_model(BaseModel):\n", - " output: int\n", - " reason: str\n", " \n", - " response = llm.generate(prompt.format(**kwargs),response_model=response_model)\n", - " total = response.output\n", - " if total < 1:\n", - " score = 0\n", - " else:\n", - " score = 10\n", - " return score,\"reason\"\n", + " return results[0] # Placeholder for ensemble logic\n", "\n", - "result = my_metric.score(response='my response') # result\n", - "result # 10\n", - "result.reason # the reason for the answer\n", - "\n", - "result1 = my_metric.score(response='my response 1') # result\n", - "result2 = my_metric.score(response='my response 2') # result\n", - "\n", - "result1 + result2 # should be addable and behave like a float\n" - ] - }, - { - "cell_type": "markdown", - "id": "90794704-5e45-4dd5-8862-b4fb9694a5b5", - "metadata": {}, - "source": [ - "### Ranking metric" + " " ] }, { 
"cell_type": "code", - "execution_count": 145, - "id": "9e2bb718-ba9a-4965-a952-462ac0159766", + "execution_count": 6, + "id": "9ba99094", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "typing.Literal[[0, 1, 2]]" + "0" ] }, - "execution_count": 145, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "t.Literal[[i for i in range(3)]]" - ] - }, - { - "cell_type": "code", - "execution_count": 159, - "id": "0b1bbf8f-c7fa-4004-9165-fa388f7ba15d", - "metadata": {}, - "outputs": [], - "source": [ - "#| export\n", - "\n", - "@dataclass\n", - "class RankingMetrics(Metric):\n", - " num_ranks: int\n", - " \n", - " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", - " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", - " \n", - " if with_reasoning in self._response_models:\n", - " return self._response_models[with_reasoning]\n", - " \n", - " model_name = 'response_model'\n", - "\n", - " # Custom validator to ensure 'result' is a permutation of 0 .. num_ranks-1\n", - " def validate_result(cls, v):\n", - " expected = set(range(self.num_ranks))\n", - " if set(v) != expected:\n", - " raise ValueError(\n", - " f\"'result' must contain exactly the numbers {sorted(expected)} without repetition.\"\n", - " )\n", - " return v\n", - "\n", - " # Define the fields dynamically\n", - " fields = {\"result\": (List[int], ...)}\n", - " if with_reasoning:\n", - " fields[\"reason\"] = (str, ...)\n", - " \n", - " # Create the dynamic model with the custom validator attached\n", - " model = create_model(\n", - " model_name,\n", - " **fields,\n", - " __validators__={\n", - " 'result_validator': validator('result', allow_reuse=True)(validate_result)\n", - " }\n", - " )\n", - " self._response_models[with_reasoning] = model\n", - " return model \n", - "\n", - " def _ensemble(self,results:t.List[MetricResult]) -> MetricResult:\n", - "\n", - " if len(results)==1:\n", - " return results[0]\n", - "\n", - " n_items = len(results)\n", - " borda_scores = [0] * n_items\n", - "\n", - " for result in results:\n", - " for position_idx,item_idx in enumerate(result.result):\n", - " borda_scores[item_idx] += (n_items - (position_idx-1))\n", - "\n", - " indexed_scores = [(score, i) for i, score in enumerate(borda_scores)] \n", - " indexed_scores.sort(key=lambda x: (-x[0], x[1])) \n", - " final_ranking = [pos for _, pos in indexed_scores]\n", - "\n", - " if any(r.reason for r in results):\n", - " reason = \"Ensemble ranking based on multiple evaluations.\\n\" + '\\n'.join([r.reason for r in results if r.reason])\n", - " else:\n", - " reason = None\n", - " \n", - " \n", - " return MetricResult(result=result,reason=reason)" - ] - }, - { - "cell_type": "code", - "execution_count": 161, - "id": "716881a1-0a93-46b3-b41b-aee0f987a1a6", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/var/folders/ww/sk5dkfhn673234cmy5w7008r0000gn/T/ipykernel_95467/972172485.py:40: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. 
See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/\n", - " 'result_validator': validator('result', allow_reuse=True)(validate_result)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[0, 1, 2]\n", - "Ensemble ranking based on multiple evaluations.\n", - "The responses are ranked from the shortest to the longest and most detailed.\n", - "The responses are ranked from shortest to longest and most detailed.\n", - "Responses ranked from shortest to longest.\n" - ] - } - ], - "source": [ - "# User instantiates a ranking metric by providing a name, an LLM, a prompt template, and the number of rankings desired.\n", - "my_ranking_metric = RankingMetrics(\n", - " name='response_ranking',\n", - " llm=LLM(), # Your language model instance\n", - " prompt=\"Rank the following responses:\\n{candidates}\",\n", - " num_ranks=3,\n", - ")\n", - "\n", - "# To score a single input (ranking candidate responses)\n", - "result = my_ranking_metric.score(candidates=[\n", - " \"short answer.\",\n", - " \"a bit more detailed.\",\n", - " \"the longest and most detailed answer.\"\n", - "],n=3)\n", - "print(result) # Might output something like: [1, 0, 2]\n", - "print(result.reason) # Provides the reasoning behind the ranking\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9e198d7d-fbab-448e-aab1-f10f4234dff6", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "5b53bd5e-06c9-4430-9c06-f2225ddd7bd5", - "metadata": {}, - "source": [ - "### decorator factory for ranking metric" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "4c4e9170-67b9-4841-9df2-6afc490b89dd", - "metadata": {}, - "outputs": [], - "source": [ - "#| export\n", - "ranking_metric = create_metric_decorator(RankingMetrics)" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "cbb1729b-8b25-48d8-a472-c03dd1e0d861", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[1, 0, 2]\n", - "Ranked based on response clarity and detail.\n" - ] - } - ], - "source": [ - "@ranking_metric(\n", - " llm=LLM(), # Your language model instance\n", - " prompt=\"Rank the following responses:\\n{candidates}\",\n", - " name='new_ranking_metric',\n", - " num_ranks=3\n", - ")\n", - "def my_ranking_metric(llm, prompt, **kwargs):\n", - " # Your custom logic that calls the LLM and returns a tuple of (ranking, reason)\n", - " # For example, process the prompt (formatted with candidates) and produce a ranking.\n", - " ranking = [1, 0, 2] # Dummy ranking: second candidate is best, then first, then third.\n", - " reason = \"Ranked based on response clarity and detail.\"\n", - " return ranking, reason\n", - "\n", - "# Using the decorator-based ranking metric:\n", - "result = my_ranking_metric.score(candidates=[\n", - " \"Response A: short answer.\",\n", - " \"Response B: a bit more detailed.\",\n", - " \"Response C: the longest and most detailed answer.\"\n", - "])\n", - "print(result) # E.g., [1, 0, 2]\n", - "print(result.reason) # E.g., \"Ranked based on response clarity and detail.\"\n" + "my_metric = NumericMetric(name=\"example\", prompt=\"What is the result of {input}?\", llm=LLM())\n", + "my_metric.score(input=\"test\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "23e38ce5-aac9-489b-96c0-947011dbbdf7", + "id": "1327f250", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { - "kernelspec": { - "display_name": "Python (random)", - "language": 
"python", - "name": "random" - }, "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.16" + "name": "python" } }, "nbformat": 4, - "nbformat_minor": 5 + "nbformat_minor": 2 } diff --git a/nbs/metric/decorator.ipynb b/nbs/metric/decorator.ipynb new file mode 100644 index 0000000..f60e4d9 --- /dev/null +++ b/nbs/metric/decorator.ipynb @@ -0,0 +1,240 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp metric.decorator" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# decorator factory for metrics\n", + "> decorator factory for creating custom metrics" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| hide\n", + "from dotenv import load_dotenv\n", + "load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "\n", + "import typing as t\n", + "import inspect\n", + "import asyncio\n", + "from dataclasses import dataclass\n", + "from ragas_annotator.metric import MetricResult\n", + "\n", + "\n", + "\n", + "\n", + "def create_metric_decorator(metric_class):\n", + " \"\"\"\n", + " Factory function that creates decorator factories for different metric types.\n", + " \n", + " Args:\n", + " metric_class: The metric class to use (DiscreteMetrics, NumericMetrics, etc.)\n", + " \n", + " Returns:\n", + " A decorator factory function for the specified metric type\n", + " \"\"\"\n", + " def decorator_factory(llm, prompt, name: t.Optional[str] = None, **metric_params):\n", + " \"\"\"\n", + " Creates a decorator that wraps a function into a metric instance.\n", + " \n", + " Args:\n", + " llm: The language model instance to use\n", + " prompt: The prompt template\n", + " name: Optional name for the metric (defaults to function name)\n", + " **metric_params: Additional parameters specific to the metric type\n", + " (values for DiscreteMetrics, range for NumericMetrics, etc.)\n", + " \n", + " Returns:\n", + " A decorator function\n", + " \"\"\"\n", + " def decorator(func):\n", + " # Get metric name and check if function is async\n", + " metric_name = name or func.__name__\n", + " is_async = inspect.iscoroutinefunction(func)\n", + " \n", + " @dataclass\n", + " class CustomMetric(metric_class):\n", + " def _extract_result(self, result, reasoning: bool):\n", + " \"\"\"Extract score and reason from the result.\"\"\"\n", + " if isinstance(result, tuple) and len(result) == 2:\n", + " score, reason = result\n", + " else:\n", + " score, reason = result, None\n", + " \n", + " # Use \"result\" instead of \"score\" for the new MetricResult implementation\n", + " return MetricResult(result=score, reason=reason if reasoning else None)\n", + " \n", + " def _run_sync_in_async(self, func, *args, **kwargs):\n", + " \"\"\"Run a synchronous function in an async context.\"\"\"\n", + " # For sync functions, just run them normally\n", + " return func(*args, **kwargs)\n", + " \n", + " def _execute_metric(self, is_async_execution, reasoning, **kwargs):\n", + " \"\"\"Execute the metric function with 
proper async handling.\"\"\"\n", + " try:\n", + " if is_async:\n", + " # Async function implementation\n", + " if is_async_execution:\n", + " # In async context, await the function directly\n", + " result = func(self.llm, self.prompt, **kwargs)\n", + " else:\n", + " # In sync context, run the async function in an event loop\n", + " try:\n", + " loop = asyncio.get_event_loop()\n", + " except RuntimeError:\n", + " loop = asyncio.new_event_loop()\n", + " asyncio.set_event_loop(loop)\n", + " result = loop.run_until_complete(func(self.llm, self.prompt, **kwargs))\n", + " else:\n", + " # Sync function implementation\n", + " result = func(self.llm, self.prompt, **kwargs)\n", + " \n", + " return self._extract_result(result, reasoning)\n", + " except Exception as e:\n", + " # Handle errors gracefully\n", + " error_msg = f\"Error executing metric {self.name}: {str(e)}\"\n", + " return MetricResult(result=None, reason=error_msg)\n", + " \n", + " def score(self, reasoning: bool = True, n: int = 1, **kwargs):\n", + " \"\"\"Synchronous scoring method.\"\"\"\n", + " return self._execute_metric(is_async_execution=False, reasoning=reasoning, **kwargs)\n", + " \n", + " async def ascore(self, reasoning: bool = True, n: int = 1, **kwargs):\n", + " \"\"\"Asynchronous scoring method.\"\"\"\n", + " if is_async:\n", + " # For async functions, await the result\n", + " result = await func(self.llm, self.prompt, **kwargs)\n", + " return self._extract_result(result, reasoning)\n", + " else:\n", + " # For sync functions, run normally\n", + " result = self._run_sync_in_async(func, self.llm, self.prompt, **kwargs)\n", + " return self._extract_result(result, reasoning)\n", + " \n", + " # Create the metric instance with all parameters\n", + " metric_instance = CustomMetric(\n", + " name=metric_name,\n", + " prompt=prompt,\n", + " llm=llm,\n", + " **metric_params\n", + " )\n", + " \n", + " # Preserve metadata\n", + " metric_instance.__name__ = metric_name\n", + " metric_instance.__doc__ = func.__doc__\n", + " \n", + " return metric_instance\n", + " \n", + " return decorator\n", + " \n", + " return decorator_factory\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "low\n", + "reason\n" + ] + } + ], + "source": [ + "### Example usage\n", + "from ragas_annotator.metric import DiscreteMetric\n", + "from ragas_annotator.metric.llm import LLM\n", + "from pydantic import BaseModel\n", + "\n", + "discrete_metric = create_metric_decorator(DiscreteMetric)\n", + "\n", + "@discrete_metric(llm=LLM(),\n", + " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", + " name='new_metric',values=[\"low\",\"med\",\"high\"])\n", + "def my_metric(llm,prompt,**kwargs):\n", + "\n", + " class response_model(BaseModel):\n", + " output: t.List[bool]\n", + " reason: str\n", + " \n", + " response = llm.generate(prompt.format(**kwargs),response_model=response_model)\n", + " total = sum(response.output)\n", + " if total < 1:\n", + " score = 'low'\n", + " else:\n", + " score = 'high'\n", + " return score,\"reason\"\n", + "\n", + "result = my_metric.score(response='my response') # result\n", + "print(result)\n", + "print(result.reason)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "random", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + 
"name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs/metric/discrete.ipynb b/nbs/metric/discrete.ipynb new file mode 100644 index 0000000..af2d94a --- /dev/null +++ b/nbs/metric/discrete.ipynb @@ -0,0 +1,203 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp metric.discrete" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| hide\n", + "from dotenv import load_dotenv\n", + "load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DiscreteMetric\n", + "> Base class from which all discrete metrics should inherit." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "import typing as t\n", + "from dataclasses import dataclass, field\n", + "from pydantic import BaseModel, create_model\n", + "from collections import Counter\n", + "from ragas_annotator.metric import Metric, MetricResult\n", + "from ragas_annotator.metric.decorator import create_metric_decorator\n", + "\n", + "\n", + "@dataclass\n", + "class DiscreteMetric(Metric):\n", + " values: t.List[str] = field(default_factory=lambda: [\"pass\", \"fail\"])\n", + " \n", + " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", + " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", + " \n", + " if with_reasoning in self._response_models:\n", + " return self._response_models[with_reasoning]\n", + " \n", + " model_name = 'response_model'\n", + " values = tuple(self.values)\n", + " fields = {\"result\": (t.Literal[values], ...)}\n", + " \n", + " if with_reasoning:\n", + " fields[\"reason\"] = (str, ...) 
# type: ignore\n", + " \n", + " model = create_model(model_name, **fields) # type: ignore\n", + " self._response_models[with_reasoning] = model\n", + " return model \n", + "\n", + " def _ensemble(self,results:t.List[MetricResult]) -> MetricResult:\n", + "\n", + "\n", + " if len(results)==1:\n", + " return results[0]\n", + " \n", + " candidates = [candidate.result for candidate in results]\n", + " counter = Counter(candidates)\n", + " max_count = max(counter.values())\n", + " for candidate in results:\n", + " if counter[candidate.result] == max_count:\n", + " result = candidate.result \n", + " reason = candidate.reason\n", + " break\n", + " \n", + " return results[0]\n", + "\n", + "\n", + "discrete_metric = create_metric_decorator(DiscreteMetric)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "low\n", + "No context or content was provided for evaluation.\n" + ] + } + ], + "source": [ + "## Example usage\n", + "from ragas_annotator.metric.llm import LLM\n", + "\n", + "my_metric = DiscreteMetric(\n", + " llm=LLM(),\n", + " name='helpfulness',\n", + " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", + " values=[\"low\",\"med\",\"high\"],\n", + ")\n", + "\n", + "result = my_metric.score(response=\"this is my response\")\n", + "print(result) #gives \"low\"\n", + "print(result.reason) #gives reasoning from llm\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write custom discrete metric" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "low\n", + "reason\n" + ] + } + ], + "source": [ + "@discrete_metric(llm=LLM(),\n", + " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", + " name='new_metric',values=[\"low\",\"med\",\"high\"])\n", + "def my_metric(llm,prompt,**kwargs):\n", + "\n", + " class response_model(BaseModel):\n", + " output: t.List[bool]\n", + " reason: str\n", + " \n", + " response = llm.generate(prompt.format(**kwargs),response_model=response_model)\n", + " total = sum(response.output)\n", + " if total < 1:\n", + " score = 'low'\n", + " else:\n", + " score = 'high'\n", + " return score,\"reason\"\n", + "\n", + "result = my_metric.score(response='my response') # result\n", + "print(result)\n", + "print(result.reason)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "random", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs/metric/llm.ipynb b/nbs/metric/llm.ipynb new file mode 100644 index 0000000..9a3ab54 --- /dev/null +++ b/nbs/metric/llm.ipynb @@ -0,0 +1,59 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp metric.llm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "\n", + "import openai\n", + "import instructor\n", + "from dataclasses import dataclass\n", + "\n", + "@dataclass\n", + 
"class LLM:\n", + "\n", + " def __post_init__(self):\n", + " self.aclient = instructor.from_openai(openai.AsyncOpenAI())\n", + " self.client = instructor.from_openai(openai.OpenAI())\n", + "\n", + " \n", + " def generate(self,prompt,response_model):\n", + " return self.client.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": prompt},\n", + " ],\n", + " response_model=response_model,\n", + " )\n", + "\n", + " async def agenerate(self,prompt,response_model):\n", + " return await self.aclient.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": prompt},\n", + " ],\n", + " response_model=response_model,\n", + " )" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs/metric/numeric.ipynb b/nbs/metric/numeric.ipynb new file mode 100644 index 0000000..73372cf --- /dev/null +++ b/nbs/metric/numeric.ipynb @@ -0,0 +1,189 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Numeric Metric\n", + "> Base class for all numeric metrics\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp metric.numeric" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "\n", + "import typing as t\n", + "from dataclasses import dataclass, field\n", + "from pydantic import BaseModel, create_model\n", + "from ragas_annotator.metric import Metric, MetricResult\n", + "from ragas_annotator.metric.decorator import create_metric_decorator\n", + "\n", + "@dataclass\n", + "class NumericMetrics(Metric):\n", + " range: t.Tuple[float,float]\n", + " \n", + " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", + " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", + " \n", + " if with_reasoning in self._response_models:\n", + " return self._response_models[with_reasoning]\n", + " \n", + " model_name = 'response_model'\n", + " fields = {\"result\": (float,...)}\n", + " \n", + " if with_reasoning:\n", + " fields[\"reason\"] = (str, ...) 
#type: ignore\n", + " \n", + " model = create_model(model_name, **fields)\n", + " self._response_models[with_reasoning] = model\n", + " return model \n", + "\n", + " def _ensemble(self,results:t.List[MetricResult]) -> MetricResult:\n", + "\n", + " if len(results)==1:\n", + " return results[0]\n", + " \n", + " candidates = [candidate.result for candidate in results]\n", + " result = sum(candidates)/len(candidates)\n", + " reason = results[0].reason\n", + " \n", + " return MetricResult(result=result,reason=reason)\n", + " \n", + " \n", + "numeric_metric = create_metric_decorator(NumericMetrics)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'The response does not provide any context or information that can be evaluated as helpful.'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## Example usage\n", + "\n", + "from dotenv import load_dotenv\n", + "from ragas_annotator.metric.llm import LLM\n", + "\n", + "\n", + "\n", + "load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')\n", + "\n", + "my_metric = NumericMetrics(\n", + " name='helpfulness',\n", + " llm=LLM(),\n", + " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", + " range=(0,10),\n", + ")\n", + "\n", + "result = my_metric.score(response=\"this is my response\")\n", + "result #gives \"low\"\n", + "result.reason #gives reasoning from llm\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write custom numeric metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "20" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "@numeric_metric(llm=LLM(),\n", + " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", + " name='new_metric',range=(0,10))\n", + "def my_metric(llm,prompt,**kwargs):\n", + "\n", + " class response_model(BaseModel):\n", + " output: int\n", + " reason: str\n", + " \n", + " response = llm.generate(prompt.format(**kwargs),response_model=response_model)\n", + " total = response.output\n", + " if total < 1:\n", + " score = 0\n", + " else:\n", + " score = 10\n", + " return score,\"reason\"\n", + "\n", + "result = my_metric.score(response='my response') # result\n", + "result # 10\n", + "result.reason # the reason for the answer\n", + "\n", + "result1 = my_metric.score(response='my response 1') # result\n", + "result2 = my_metric.score(response='my response 2') # result\n", + "\n", + "result1 + result2 # should be addable and behave like a float\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "random", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs/metric/ranking.ipynb b/nbs/metric/ranking.ipynb new file mode 100644 index 0000000..ace4336 --- /dev/null +++ b/nbs/metric/ranking.ipynb @@ -0,0 +1,224 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RankingMetric\n", + "> Base 
class for ranking metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp metric.ranking" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "\n", + "import typing as t\n", + "from dataclasses import dataclass\n", + "from pydantic import BaseModel, Field\n", + "from ragas_annotator.metric import Metric, MetricResult\n", + "from ragas_annotator.metric.decorator import create_metric_decorator\n", + "\n", + "@dataclass\n", + "class RankingMetric(Metric):\n", + " num_ranks: int\n", + " \n", + " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", + " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", + " \n", + " if with_reasoning in self._response_models:\n", + " return self._response_models[with_reasoning]\n", + " \n", + " # Store values needed for validation\n", + " num_ranks = self.num_ranks\n", + " \n", + " # Create explicit model classes instead of using create_model\n", + " if with_reasoning:\n", + " # Model with result and reason\n", + " class ResponseModelWithReason(BaseModel):\n", + " result: t.List[int] = Field(...)\n", + " reason: str = Field(...)\n", + " \n", + " def model_post_init(self, __context):\n", + " expected = set(range(num_ranks))\n", + " if set(self.result) != expected:\n", + " raise ValueError(\n", + " f\"'result' must contain exactly the numbers {sorted(expected)} without repetition.\"\n", + " )\n", + " \n", + " self._response_models[with_reasoning] = ResponseModelWithReason\n", + " return ResponseModelWithReason\n", + " else:\n", + " # Model with just result\n", + " class ResponseModel(BaseModel):\n", + " result: t.List[int] = Field(...)\n", + " \n", + " def model_post_init(self, __context):\n", + " expected = set(range(num_ranks))\n", + " if set(self.result) != expected:\n", + " raise ValueError(\n", + " f\"'result' must contain exactly the numbers {sorted(expected)} without repetition.\"\n", + " )\n", + " \n", + " self._response_models[with_reasoning] = ResponseModel\n", + " return ResponseModel\n", + "\n", + " def _ensemble(self, results: t.List[MetricResult]) -> MetricResult:\n", + " if len(results) == 1:\n", + " return results[0]\n", + "\n", + " n_items = self.num_ranks # Use the class attribute instead of len(results)\n", + " borda_scores = [0] * n_items\n", + "\n", + " for result in results:\n", + " for position_idx, item_idx in enumerate(result.result):\n", + " borda_scores[item_idx] += (n_items - position_idx) # Fixed the formula\n", + "\n", + " indexed_scores = [(score, i) for i, score in enumerate(borda_scores)] \n", + " indexed_scores.sort(key=lambda x: (-x[0], x[1])) \n", + " final_ranking = [pos for _, pos in indexed_scores]\n", + "\n", + " if any(r.reason for r in results):\n", + " reason = \"Ensemble ranking based on multiple evaluations.\\n\" + '\\n'.join([r.reason for r in results if r.reason])\n", + " else:\n", + " reason = None\n", + " \n", + " return MetricResult(result=final_ranking, reason=reason)\n", + " \n", + "\n", + "ranking_metric = create_metric_decorator(RankingMetric)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0, 1, 2]\n", + "Ensemble ranking based on multiple evaluations.\n", + "The ranking is based on the length and detail of the responses, with 'short answer.' 
being the least detailed (rank 0), 'a bit more detailed.' being moderate (rank 1), and 'the longest and most detailed answer.' being the most comprehensive (rank 2).\n", + "The ranking is based on the length and detail of the responses. The shortest response is ranked the lowest (0), the moderately detailed response is ranked higher (1), and the longest and most detailed response is ranked the highest (2).\n", + "Ranking is based on length and detail; the longest answer (2) is most detailed, followed by a bit more detailed (1), and the shortest answer (0) is the least detailed.\n" + ] + } + ], + "source": [ + "## Example usage\n", + "\n", + "from dotenv import load_dotenv\n", + "from ragas_annotator.metric.llm import LLM\n", + "\n", + "\n", + "\n", + "load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')\n", + "\n", + "# User instantiates a ranking metric by providing a name, an LLM, a prompt template, and the number of rankings desired.\n", + "my_ranking_metric = RankingMetric(\n", + " name='response_ranking',\n", + " llm=LLM(), # Your language model instance\n", + " prompt=\"Rank the following responses:\\n{candidates}\",\n", + " num_ranks=3,\n", + ")\n", + "\n", + "# To score a single input (ranking candidate responses)\n", + "result = my_ranking_metric.score(candidates=[\n", + " \"short answer.\",\n", + " \"a bit more detailed.\",\n", + " \"the longest and most detailed answer.\"\n", + "],n=3)\n", + "print(result) # Might output something like: [1, 0, 2]\n", + "print(result.reason) # Provides the reasoning behind the ranking\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Custom ranking metric" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1, 0, 2]\n", + "Ranked based on response clarity and detail.\n" + ] + } + ], + "source": [ + "@ranking_metric(\n", + " llm=LLM(), # Your language model instance\n", + " prompt=\"Rank the following responses:\\n{candidates}\",\n", + " name='new_ranking_metric',\n", + " num_ranks=3\n", + ")\n", + "def my_ranking_metric(llm, prompt, **kwargs):\n", + " # Your custom logic that calls the LLM and returns a tuple of (ranking, reason)\n", + " # For example, process the prompt (formatted with candidates) and produce a ranking.\n", + " ranking = [1, 0, 2] # Dummy ranking: second candidate is best, then first, then third.\n", + " reason = \"Ranked based on response clarity and detail.\"\n", + " return ranking, reason\n", + "\n", + "# Using the decorator-based ranking metric:\n", + "result = my_ranking_metric.score(candidates=[\n", + " \"Response A: short answer.\",\n", + " \"Response B: a bit more detailed.\",\n", + " \"Response C: the longest and most detailed answer.\"\n", + "])\n", + "print(result) # E.g., [1, 0, 2]\n", + "print(result.reason) # E.g., \"Ranked based on response clarity and detail.\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "random", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs/metric/result.ipynb b/nbs/metric/result.ipynb new 
file mode 100644 index 0000000..c10e80e --- /dev/null +++ b/nbs/metric/result.ipynb @@ -0,0 +1,268 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "215f57b4", + "metadata": {}, + "source": [ + "# MetricResult\n", + "> MetricResult object to store the result of a metric" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "164726f3", + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp metric.result" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0f1c801a-6568-4ba4-8bbe-30bf154174fe", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "\n", + "import typing as t\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "class MetricResult:\n", + " \"\"\"Class to hold the result of a metric evaluation.\n", + " \n", + " This class behaves like its underlying result value but still provides access\n", + " to additional metadata like reasoning.\n", + " \n", + " Works with:\n", + " - DiscreteMetrics (string results)\n", + " - NumericMetrics (float/int results)\n", + " - RankingMetrics (list results)\n", + " \"\"\"\n", + " \n", + " def __init__(self, result: t.Any, reason: t.Optional[str] = None):\n", + " self._result = result\n", + " self.reason = reason\n", + " \n", + " def __repr__(self):\n", + " return repr(self._result)\n", + " \n", + " # Access to underlying result\n", + " @property\n", + " def result(self):\n", + " \"\"\"Get the raw result value.\"\"\"\n", + " return self._result\n", + " \n", + " \n", + " # String conversion - works for all types\n", + " def __str__(self):\n", + " return str(self._result)\n", + " \n", + " # Container-like behaviors for list results (RankingMetric)\n", + " def __getitem__(self, key):\n", + " if not hasattr(self._result, \"__getitem__\"):\n", + " raise TypeError(f\"{type(self._result).__name__} object is not subscriptable\")\n", + " return self._result[key]\n", + " \n", + " def __iter__(self):\n", + " if not hasattr(self._result, \"__iter__\"):\n", + " raise TypeError(f\"{type(self._result).__name__} object is not iterable\")\n", + " return iter(self._result)\n", + " \n", + " def __len__(self):\n", + " if not hasattr(self._result, \"__len__\"):\n", + " raise TypeError(f\"{type(self._result).__name__} has no len()\")\n", + " return len(self._result)\n", + " \n", + " # Numeric operations for numeric results (NumericMetric)\n", + " def __float__(self):\n", + " if isinstance(self._result, (int, float)):\n", + " return float(self._result)\n", + " raise TypeError(f\"Cannot convert {type(self._result).__name__} to float\")\n", + " \n", + " def __int__(self):\n", + " if isinstance(self._result, (int, float)):\n", + " return int(self._result)\n", + " raise TypeError(f\"Cannot convert {type(self._result).__name__} to int\")\n", + " \n", + " def __add__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot add {type(self._result).__name__} objects\")\n", + " if isinstance(other, MetricResult):\n", + " return self._result + other._result\n", + " return self._result + other\n", + " \n", + " def __radd__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot add {type(self._result).__name__} objects\")\n", + " return other + self._result\n", + " \n", + " def __sub__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot subtract {type(self._result).__name__} objects\")\n", + " if isinstance(other, MetricResult):\n", + " return self._result - other._result\n", + " 
return self._result - other\n", + " \n", + " def __rsub__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot subtract {type(self._result).__name__} objects\")\n", + " return other - self._result\n", + " \n", + " def __mul__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot multiply {type(self._result).__name__} objects\")\n", + " if isinstance(other, MetricResult):\n", + " return self._result * other._result\n", + " return self._result * other\n", + " \n", + " def __rmul__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot multiply {type(self._result).__name__} objects\")\n", + " return other * self._result\n", + " \n", + " def __truediv__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot divide {type(self._result).__name__} objects\")\n", + " if isinstance(other, MetricResult):\n", + " return self._result / other._result\n", + " return self._result / other\n", + " \n", + " def __rtruediv__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot divide {type(self._result).__name__} objects\")\n", + " return other / self._result\n", + " \n", + " # Comparison operations - work for all types with same-type comparisons\n", + " def __eq__(self, other):\n", + " if isinstance(other, MetricResult):\n", + " return self._result == other._result\n", + " return self._result == other\n", + " \n", + " def __lt__(self, other):\n", + " if isinstance(other, MetricResult):\n", + " return self._result < other._result\n", + " return self._result < other\n", + " \n", + " def __le__(self, other):\n", + " if isinstance(other, MetricResult):\n", + " return self._result <= other._result\n", + " return self._result <= other\n", + " \n", + " def __gt__(self, other):\n", + " if isinstance(other, MetricResult):\n", + " return self._result > other._result\n", + " return self._result > other\n", + " \n", + " def __ge__(self, other):\n", + " if isinstance(other, MetricResult):\n", + " return self._result >= other._result\n", + " return self._result >= other\n", + " \n", + " # Method forwarding for type-specific behaviors\n", + " def __getattr__(self, name):\n", + " \"\"\"Forward attribute access to the result object if it has that attribute.\n", + " \n", + " This allows calling string methods on discrete results, \n", + " numeric methods on numeric results, and list methods on ranking results.\n", + " \"\"\"\n", + " if hasattr(self._result, name):\n", + " attr = getattr(self._result, name)\n", + " if callable(attr):\n", + " # If it's a method, wrap it to return MetricResult when appropriate\n", + " def wrapper(*args, **kwargs):\n", + " result = attr(*args, **kwargs)\n", + " # If the result is of the same type as self._result, wrap it\n", + " if isinstance(result, type(self._result)):\n", + " return MetricResult(result=result, reason=self.reason)\n", + " return result\n", + " return wrapper\n", + " return attr\n", + " raise AttributeError(f\"{type(self).__name__} has no attribute '{name}'\")\n", + " \n", + " # JSON/dict serialization\n", + " def to_dict(self):\n", + " \"\"\"Convert the result to a dictionary.\"\"\"\n", + " return {\n", + " \"result\": self._result,\n", + " \"reason\": self.reason\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "24589401", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", 
+ "text": [ + "42\n", + "This is a test\n", + "8.0\n", + "LOW\n", + "[2, 3]\n" + ] + } + ], + "source": [ + "### Example Usage\n", + "\n", + "metric_result = MetricResult(result=42, reason=\"This is a test\")\n", + "print(metric_result)\n", + "print(metric_result.reason)\n", + "\n", + "### Example with Numeric Operations\n", + "num_result1 = MetricResult(result=5.0)\n", + "num_result2 = MetricResult(result=3.0)\n", + "print(num_result1 + num_result2) # 8.0\n", + "\n", + "\n", + "### Example with String Operations\n", + "str_result = MetricResult(result=\"low\")\n", + "print(str_result.upper()) # \"LOW\"\n", + "\n", + "## Example with List Operations\n", + "list_result = MetricResult(result=[1, 2, 3])\n", + "print(list_result[1:]) # 2\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a984dde9", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "random", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/metric/test_base.ipynb b/nbs/metric/test_base.ipynb new file mode 100644 index 0000000..02965d9 --- /dev/null +++ b/nbs/metric/test_base.ipynb @@ -0,0 +1,1076 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c48aac0f-c63c-4bfb-95b0-a1239f41ccb3", + "metadata": {}, + "source": [ + "# Test Base\n", + "\n", + "### Do not export" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "5d14fc66-b8af-4a75-b761-20b8e9ce19f8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#| hide\n", + "from dotenv import load_dotenv\n", + "load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "0b05c525-e153-49ab-b768-5069f624f215", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/Caskroom/miniforge/base/envs/random/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "#| export\n", + "import typing as t\n", + "from typing import Any, Callable, Dict, List, Optional, Union\n", + "from abc import ABC, abstractmethod\n", + "import asyncio\n", + "from dataclasses import dataclass\n", + "from pydantic import BaseModel\n", + "import openai\n", + "import instructor\n", + "from dataclasses import dataclass, field\n", + "from pydantic import BaseModel, create_model\n", + "import typing as t\n", + "import inspect" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "be7f588f-2a13-4c10-9775-00101a03e0a5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/Caskroom/miniforge/base/envs/random/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import openai\n", + "import instructor\n", + "from dataclasses import dataclass\n", + "\n", + "@dataclass\n", + "class LLM:\n", + "\n", + " def __post_init__(self):\n", + " self.aclient = instructor.from_openai(openai.AsyncOpenAI())\n", + " self.client = instructor.from_openai(openai.OpenAI())\n", + "\n", + " \n", + " def generate(self,prompt,response_model):\n", + " return self.client.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": prompt},\n", + " ],\n", + " response_model=response_model,\n", + " )\n", + "\n", + " async def agenerate(self,prompt,response_model):\n", + " return await self.aclient.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=[\n", + " {\"role\": \"user\", \"content\": prompt},\n", + " ],\n", + " response_model=response_model,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "2dfed36e", + "metadata": {}, + "source": [ + "## MetricResult" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca623dd2", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "\n", + "import typing as t\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "class MetricResult:\n", + " \"\"\"Class to hold the result of a metric evaluation.\n", + " \n", + " This class behaves like its underlying result value but still provides access\n", + " to additional metadata like reasoning.\n", + " \n", + " Works with:\n", + " - DiscreteMetrics (string results)\n", + " - NumericMetrics (float/int results)\n", + " - RankingMetrics (list results)\n", + " \"\"\"\n", + " \n", + " def __init__(self, result: t.Any, reason: t.Optional[str] = None):\n", + " self._result = result\n", + " self.reason = reason\n", + " \n", + " def __repr__(self):\n", + " return repr(self._result)\n", + " \n", + " # Access to underlying result\n", + " @property\n", + " def result(self):\n", + " \"\"\"Get the raw result value.\"\"\"\n", + " return self._result\n", + " \n", + " \n", + " # String conversion - works for all types\n", + " def __str__(self):\n", + " return str(self._result)\n", + " \n", + " # Container-like behaviors for list results (RankingMetric)\n", + " def __getitem__(self, key):\n", + " if not hasattr(self._result, \"__getitem__\"):\n", + " raise TypeError(f\"{type(self._result).__name__} object is not subscriptable\")\n", + " return self._result[key]\n", + " \n", + " def __iter__(self):\n", + " if not hasattr(self._result, \"__iter__\"):\n", + " raise TypeError(f\"{type(self._result).__name__} object is not iterable\")\n", + " return iter(self._result)\n", + " \n", + " def __len__(self):\n", + " if not hasattr(self._result, \"__len__\"):\n", + " raise TypeError(f\"{type(self._result).__name__} has no len()\")\n", + " return len(self._result)\n", + " \n", + " # Numeric operations for numeric results (NumericMetric)\n", + " def __float__(self):\n", + " if isinstance(self._result, (int, float)):\n", + " return float(self._result)\n", + " raise TypeError(f\"Cannot convert {type(self._result).__name__} to float\")\n", + " \n", + " def __int__(self):\n", + " if isinstance(self._result, (int, float)):\n", + " return int(self._result)\n", + " raise TypeError(f\"Cannot convert {type(self._result).__name__} to int\")\n", + " \n", + " def __add__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot 
add {type(self._result).__name__} objects\")\n", + " if isinstance(other, MetricResult):\n", + " return self._result + other._result\n", + " return self._result + other\n", + " \n", + " def __radd__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot add {type(self._result).__name__} objects\")\n", + " return other + self._result\n", + " \n", + " def __sub__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot subtract {type(self._result).__name__} objects\")\n", + " if isinstance(other, MetricResult):\n", + " return self._result - other._result\n", + " return self._result - other\n", + " \n", + " def __rsub__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot subtract {type(self._result).__name__} objects\")\n", + " return other - self._result\n", + " \n", + " def __mul__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot multiply {type(self._result).__name__} objects\")\n", + " if isinstance(other, MetricResult):\n", + " return self._result * other._result\n", + " return self._result * other\n", + " \n", + " def __rmul__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot multiply {type(self._result).__name__} objects\")\n", + " return other * self._result\n", + " \n", + " def __truediv__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot divide {type(self._result).__name__} objects\")\n", + " if isinstance(other, MetricResult):\n", + " return self._result / other._result\n", + " return self._result / other\n", + " \n", + " def __rtruediv__(self, other):\n", + " if not isinstance(self._result, (int, float)):\n", + " raise TypeError(f\"Cannot divide {type(self._result).__name__} objects\")\n", + " return other / self._result\n", + " \n", + " # Comparison operations - work for all types with same-type comparisons\n", + " def __eq__(self, other):\n", + " if isinstance(other, MetricResult):\n", + " return self._result == other._result\n", + " return self._result == other\n", + " \n", + " def __lt__(self, other):\n", + " if isinstance(other, MetricResult):\n", + " return self._result < other._result\n", + " return self._result < other\n", + " \n", + " def __le__(self, other):\n", + " if isinstance(other, MetricResult):\n", + " return self._result <= other._result\n", + " return self._result <= other\n", + " \n", + " def __gt__(self, other):\n", + " if isinstance(other, MetricResult):\n", + " return self._result > other._result\n", + " return self._result > other\n", + " \n", + " def __ge__(self, other):\n", + " if isinstance(other, MetricResult):\n", + " return self._result >= other._result\n", + " return self._result >= other\n", + " \n", + " # Method forwarding for type-specific behaviors\n", + " def __getattr__(self, name):\n", + " \"\"\"Forward attribute access to the result object if it has that attribute.\n", + " \n", + " This allows calling string methods on discrete results, \n", + " numeric methods on numeric results, and list methods on ranking results.\n", + " \"\"\"\n", + " if hasattr(self._result, name):\n", + " attr = getattr(self._result, name)\n", + " if callable(attr):\n", + " # If it's a method, wrap it to return MetricResult when appropriate\n", + " def wrapper(*args, **kwargs):\n", + " result = attr(*args, **kwargs)\n", + " # If the result is of the same type as 
self._result, wrap it\n", + " if isinstance(result, type(self._result)):\n", + " return MetricResult(result=result, reason=self.reason)\n", + " return result\n", + " return wrapper\n", + " return attr\n", + " raise AttributeError(f\"{type(self).__name__} has no attribute '{name}'\")\n", + " \n", + " # JSON/dict serialization\n", + " def to_dict(self):\n", + " \"\"\"Convert the result to a dictionary.\"\"\"\n", + " return {\n", + " \"result\": self._result,\n", + " \"reason\": self.reason\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "1bec7946-c61c-4631-9880-fff575974e39", + "metadata": {}, + "source": [ + "### Metric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5e19478-d947-405b-a229-4a1e7daa2fd3", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "\n", + "from abc import ABC, abstractmethod\n", + "import asyncio\n", + "from dataclasses import dataclass, field\n", + "from pydantic import BaseModel\n", + "import typing as t\n", + "from ragas_annotator.metric import MetricResult\n", + "\n", + "@dataclass\n", + "class Metric(ABC):\n", + " \"\"\"Base class for all metrics in the LLM evaluation library.\"\"\"\n", + " name: str\n", + " prompt: str\n", + " llm: 'LLM' # Forward reference with quotes\n", + " _response_models: t.Dict[bool, t.Type[BaseModel]] = field(\n", + " default_factory=dict, init=False, repr=False\n", + " )\n", + " \n", + " @abstractmethod\n", + " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", + " \"\"\"Get the appropriate response model.\"\"\"\n", + " pass\n", + "\n", + " @abstractmethod\n", + " def _ensemble(self, results: t.List[MetricResult]) -> MetricResult:\n", + " pass\n", + " \n", + " \n", + " def score(self, reasoning: bool = True, n: int = 1, **kwargs) -> t.Any:\n", + " responses = []\n", + " prompt_input = self.prompt.format(**kwargs)\n", + " for _ in range(n):\n", + " response = self.llm.generate(prompt_input, response_model = self._get_response_model(reasoning)) \n", + " response = MetricResult(**response.model_dump())\n", + " responses.append(response)\n", + " return self._ensemble(responses)\n", + "\n", + "\n", + " async def ascore(self, reasoning: bool = True, n: int = 1, **kwargs) -> MetricResult:\n", + " responses = [] # Added missing initialization\n", + " prompt_input = self.prompt.format(**kwargs)\n", + " for _ in range(n):\n", + " response = await self.llm.agenerate(prompt_input, response_model = self._get_response_model(reasoning))\n", + " response = MetricResult(**response.model_dump()) # Fixed missing parentheses\n", + " responses.append(response)\n", + " return self._ensemble(responses)\n", + " \n", + " def batch_score(self, inputs: t.List[t.Dict[str, t.Any]], reasoning: bool = True, n: int = 1) -> t.List[t.Any]:\n", + " return [self.score(reasoning, n, **input_dict) for input_dict in inputs]\n", + " \n", + " async def abatch_score(self, inputs: t.List[t.Dict[str, t.Any]], reasoning: bool = True, n: int = 1) -> t.List[MetricResult]:\n", + " async_tasks = []\n", + " for input_dict in inputs:\n", + " # Add reasoning and n to the input parameters\n", + " async_tasks.append(self.ascore(reasoning=reasoning, n=n, **input_dict))\n", + " \n", + " # Run all tasks concurrently and return results\n", + " return await asyncio.gather(*async_tasks)" + ] + }, + { + "cell_type": "markdown", + "id": "4af79bb4-4c3e-4004-b7f3-ec36e50b4ca5", + "metadata": {}, + "source": [ + "### DiscreteMetric \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"f09683e0-5ec3-4e60-a8c4-1657e2fe60b9", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "from collections import Counter\n", + "\n", + "@dataclass\n", + "class DiscreteMetrics(Metric):\n", + " values: t.List[str] = field(default_factory=lambda: [\"pass\", \"fail\"])\n", + " \n", + " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", + " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", + " \n", + " if with_reasoning in self._response_models:\n", + " return self._response_models[with_reasoning]\n", + " \n", + " model_name = 'response_model'\n", + " values = tuple(self.values)\n", + " fields = {\"result\": (t.Literal[values], ...)}\n", + " \n", + " if with_reasoning:\n", + " fields[\"reason\"] = (str, ...)\n", + " \n", + " model = create_model(model_name, **fields)\n", + " self._response_models[with_reasoning] = model\n", + " return model \n", + "\n", + " def _ensemble(self,results:t.List[MetricResult]) -> MetricResult:\n", + "\n", + "\n", + " if len(results)==1:\n", + " return results[0]\n", + " \n", + " candidates = [candidate.result for candidate in results]\n", + " counter = Counter(candidates)\n", + " max_count = max(counter.values())\n", + " for candidate in results:\n", + " if counter[candidate.result] == max_count:\n", + " result = candidate.result \n", + " reason = candidate.reason\n", + " break\n", + " \n", + " return MetricResult(result=result,reason=reason)\n", + "\n", + "\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "f33ef89f-4ccc-4307-9944-4f372ce77830", + "metadata": {}, + "source": [ + "### decorator factory for discrete_metric" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "id": "99d5afd6-72bd-42d7-bff0-effce9cf8cd9", + "metadata": {}, + "outputs": [], + "source": [ + "import typing as t\n", + "from typing import Any, Callable, Dict, List, Optional, Type, Union\n", + "import inspect\n", + "import asyncio\n", + "from dataclasses import dataclass\n", + "from abc import ABC\n", + "\n", + "def create_metric_decorator(metric_class):\n", + " \"\"\"\n", + " Factory function that creates decorator factories for different metric types.\n", + " \n", + " Args:\n", + " metric_class: The metric class to use (DiscreteMetrics, NumericMetrics, etc.)\n", + " \n", + " Returns:\n", + " A decorator factory function for the specified metric type\n", + " \"\"\"\n", + " def decorator_factory(llm, prompt, name: t.Optional[str] = None, **metric_params):\n", + " \"\"\"\n", + " Creates a decorator that wraps a function into a metric instance.\n", + " \n", + " Args:\n", + " llm: The language model instance to use\n", + " prompt: The prompt template\n", + " name: Optional name for the metric (defaults to function name)\n", + " **metric_params: Additional parameters specific to the metric type\n", + " (values for DiscreteMetrics, range for NumericMetrics, etc.)\n", + " \n", + " Returns:\n", + " A decorator function\n", + " \"\"\"\n", + " def decorator(func):\n", + " # Get metric name and check if function is async\n", + " metric_name = name or func.__name__\n", + " is_async = inspect.iscoroutinefunction(func)\n", + " \n", + " @dataclass\n", + " class CustomMetric(metric_class):\n", + " def _extract_result(self, result, reasoning: bool):\n", + " \"\"\"Extract score and reason from the result.\"\"\"\n", + " if isinstance(result, tuple) and len(result) == 2:\n", + " score, reason = result\n", + " else:\n", + " score, reason = result, None\n", + " \n", + " # Use \"result\" instead of \"score\" for the new 
MetricResult implementation\n", + " return MetricResult(result=score, reason=reason if reasoning else None)\n", + " \n", + " def _run_sync_in_async(self, func, *args, **kwargs):\n", + " \"\"\"Run a synchronous function in an async context.\"\"\"\n", + " # For sync functions, just run them normally\n", + " return func(*args, **kwargs)\n", + " \n", + " def _execute_metric(self, is_async_execution, reasoning, **kwargs):\n", + " \"\"\"Execute the metric function with proper async handling.\"\"\"\n", + " try:\n", + " if is_async:\n", + " # Async function implementation\n", + " if is_async_execution:\n", + " # In async context, await the function directly\n", + " result = func(self.llm, self.prompt, **kwargs)\n", + " else:\n", + " # In sync context, run the async function in an event loop\n", + " try:\n", + " loop = asyncio.get_event_loop()\n", + " except RuntimeError:\n", + " loop = asyncio.new_event_loop()\n", + " asyncio.set_event_loop(loop)\n", + " result = loop.run_until_complete(func(self.llm, self.prompt, **kwargs))\n", + " else:\n", + " # Sync function implementation\n", + " result = func(self.llm, self.prompt, **kwargs)\n", + " \n", + " return self._extract_result(result, reasoning)\n", + " except Exception as e:\n", + " # Handle errors gracefully\n", + " error_msg = f\"Error executing metric {self.name}: {str(e)}\"\n", + " return MetricResult(result=None, reason=error_msg)\n", + " \n", + " def score(self, reasoning: bool = True, n: int = 1, **kwargs):\n", + " \"\"\"Synchronous scoring method.\"\"\"\n", + " return self._execute_metric(is_async_execution=False, reasoning=reasoning, **kwargs)\n", + " \n", + " async def ascore(self, reasoning: bool = True, n: int = 1, **kwargs):\n", + " \"\"\"Asynchronous scoring method.\"\"\"\n", + " if is_async:\n", + " # For async functions, await the result\n", + " result = await func(self.llm, self.prompt, **kwargs)\n", + " return self._extract_result(result, reasoning)\n", + " else:\n", + " # For sync functions, run normally\n", + " result = self._run_sync_in_async(func, self.llm, self.prompt, **kwargs)\n", + " return self._extract_result(result, reasoning)\n", + " \n", + " # Create the metric instance with all parameters\n", + " metric_instance = CustomMetric(\n", + " name=metric_name,\n", + " prompt=prompt,\n", + " llm=llm,\n", + " **metric_params\n", + " )\n", + " \n", + " # Preserve metadata\n", + " metric_instance.__name__ = metric_name\n", + " metric_instance.__doc__ = func.__doc__\n", + " \n", + " return metric_instance\n", + " \n", + " return decorator\n", + " \n", + " return decorator_factory\n", + "\n", + "# Create specific decorator factories for each metric type\n", + "discrete_metric = create_metric_decorator(DiscreteMetrics)\n" + ] + }, + { + "cell_type": "markdown", + "id": "07f49ad3-4476-4bcc-ac34-a87fb7a8652a", + "metadata": {}, + "source": [ + "### Usage pattern" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "id": "aeae8fe5-e81a-44ac-9ad7-a240655a0f06", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"The answer provided lacks specific context or detail needed to evaluate its helpfulness fully. 
Without more information, it's difficult to determine its applicability.\"" + ] + }, + "execution_count": 113, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "my_metric = DiscreteMetrics(\n", + " name='helpfulness',\n", + " llm=LLM(),\n", + " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", + " values=[\"low\",\"med\",\"high\"],\n", + ")\n", + "\n", + "result = my_metric.score(response=\"this is my response\")\n", + "result #gives \"low\"\n", + "result.reason #gives reasoning from llm\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "id": "b5e499d8-8258-46ce-b719-0389d3cfd8db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'low'" + ] + }, + "execution_count": 114, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## score without reasoning to save reasoning tokens cost\n", + "result = my_metric.score(response=\"this is my response\",reasoning=False,n=3)\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "id": "a9a5d6c6-4cfc-4f45-8b19-996315a95370", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'reason'" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@discrete_metric(llm=LLM(),\n", + " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", + " name='new_metric',values=[\"low\",\"med\",\"high\"])\n", + "def my_metric(llm,prompt,**kwargs):\n", + "\n", + " class response_model(BaseModel):\n", + " output: t.List[bool]\n", + " reason: str\n", + " \n", + " response = llm.generate(prompt.format(**kwargs),response_model=response_model)\n", + " total = sum(response.output)\n", + " if total < 1:\n", + " score = 'low'\n", + " else:\n", + " score = 'high'\n", + " return score,\"reason\"\n", + "\n", + "result = my_metric.score(response='my response') # result\n", + "result\n", + "result.reason" + ] + }, + { + "cell_type": "markdown", + "id": "05f60c70-fc32-41f8-aa7c-c8685d77398a", + "metadata": {}, + "source": [ + "## Numeric Metric" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "id": "6a1c66fb-3c1c-4bc6-9996-0b5beb304b9c", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "\n", + "@dataclass\n", + "class NumericMetrics(Metric):\n", + " range: t.Tuple[float,float]\n", + " \n", + " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", + " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", + " \n", + " if with_reasoning in self._response_models:\n", + " return self._response_models[with_reasoning]\n", + " \n", + " model_name = 'response_model'\n", + " fields = {\"result\": (float,...)}\n", + " \n", + " if with_reasoning:\n", + " fields[\"reason\"] = (str, ...)\n", + " \n", + " model = create_model(model_name, **fields)\n", + " self._response_models[with_reasoning] = model\n", + " return model \n", + "\n", + " def _ensemble(self,results:t.List[MetricResult]) -> MetricResult:\n", + "\n", + " if len(results)==1:\n", + " return results[0]\n", + " \n", + " candidates = [candidate.result for candidate in results]\n", + " result = sum(candidates)/len(candidates)\n", + " reason = results[0].reason\n", + " \n", + " return MetricResult(result=result,reason=reason)\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "id": "251cdea8-fc71-46bd-8a00-fb8e33e10350", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'The response 
lacks sufficient information or context to be considered helpful. It does not address any specific question or provide any useful insights.'" + ] + }, + "execution_count": 129, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "my_metric = NumericMetrics(\n", + " name='helpfulness',\n", + " llm=LLM(),\n", + " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", + " range=(0,10),\n", + ")\n", + "\n", + "result = my_metric.score(response=\"this is my response\")\n", + "result #gives \"low\"\n", + "result.reason #gives reasoning from llm\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "id": "b0994c80-c6db-4f3b-9ed9-1b32d61428c6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0.0]" + ] + }, + "execution_count": 130, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "my_metric.batch_score(inputs=[{\"response\":\"this is my response\"}])\n" + ] + }, + { + "cell_type": "markdown", + "id": "c96520ae-294b-4868-8b2f-22a30ebd5f25", + "metadata": {}, + "source": [ + "### decorator factory" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "id": "265af384-ed35-4262-acfe-6847b22d3089", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "numeric_metric = create_metric_decorator(NumericMetrics)" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "id": "009c1944-bda7-41b1-9235-dcda5acbed55", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "20" + ] + }, + "execution_count": 132, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@numeric_metric(llm=LLM(),\n", + " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", + " name='new_metric',range=(0,10))\n", + "def my_metric(llm,prompt,**kwargs):\n", + "\n", + " class response_model(BaseModel):\n", + " output: int\n", + " reason: str\n", + " \n", + " response = llm.generate(prompt.format(**kwargs),response_model=response_model)\n", + " total = response.output\n", + " if total < 1:\n", + " score = 0\n", + " else:\n", + " score = 10\n", + " return score,\"reason\"\n", + "\n", + "result = my_metric.score(response='my response') # result\n", + "result # 10\n", + "result.reason # the reason for the answer\n", + "\n", + "result1 = my_metric.score(response='my response 1') # result\n", + "result2 = my_metric.score(response='my response 2') # result\n", + "\n", + "result1 + result2 # should be addable and behave like a float\n" + ] + }, + { + "cell_type": "markdown", + "id": "90794704-5e45-4dd5-8862-b4fb9694a5b5", + "metadata": {}, + "source": [ + "### Ranking metric" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "id": "9e2bb718-ba9a-4965-a952-462ac0159766", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "typing.Literal[[0, 1, 2]]" + ] + }, + "execution_count": 145, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t.Literal[[i for i in range(3)]]" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "id": "0b1bbf8f-c7fa-4004-9165-fa388f7ba15d", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "\n", + "@dataclass\n", + "class RankingMetrics(Metric):\n", + " num_ranks: int\n", + " \n", + " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", + " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", + " \n", + " if with_reasoning in self._response_models:\n", + " return 
self._response_models[with_reasoning]\n", + " \n", + " model_name = 'response_model'\n", + "\n", + " # Custom validator to ensure 'result' is a permutation of 0 .. num_ranks-1\n", + " def validate_result(cls, v):\n", + " expected = set(range(self.num_ranks))\n", + " if set(v) != expected:\n", + " raise ValueError(\n", + " f\"'result' must contain exactly the numbers {sorted(expected)} without repetition.\"\n", + " )\n", + " return v\n", + "\n", + " # Define the fields dynamically\n", + " fields = {\"result\": (List[int], ...)}\n", + " if with_reasoning:\n", + " fields[\"reason\"] = (str, ...)\n", + " \n", + " # Create the dynamic model with the custom validator attached\n", + " model = create_model(\n", + " model_name,\n", + " **fields,\n", + " __validators__={\n", + " 'result_validator': validator('result', allow_reuse=True)(validate_result)\n", + " }\n", + " )\n", + " self._response_models[with_reasoning] = model\n", + " return model \n", + "\n", + " def _ensemble(self,results:t.List[MetricResult]) -> MetricResult:\n", + "\n", + " if len(results)==1:\n", + " return results[0]\n", + "\n", + " n_items = self.num_ranks # rank positions, not number of ensemble runs\n", + " borda_scores = [0] * n_items\n", + "\n", + " for result in results:\n", + " for position_idx,item_idx in enumerate(result.result):\n", + " borda_scores[item_idx] += (n_items - position_idx)\n", + "\n", + " indexed_scores = [(score, i) for i, score in enumerate(borda_scores)] \n", + " indexed_scores.sort(key=lambda x: (-x[0], x[1])) \n", + " final_ranking = [pos for _, pos in indexed_scores]\n", + "\n", + " if any(r.reason for r in results):\n", + " reason = \"Ensemble ranking based on multiple evaluations.\\n\" + '\\n'.join([r.reason for r in results if r.reason])\n", + " else:\n", + " reason = None\n", + " \n", + " \n", + " return MetricResult(result=final_ranking,reason=reason)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "716881a1-0a93-46b3-b41b-aee0f987a1a6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/ww/sk5dkfhn673234cmy5w7008r0000gn/T/ipykernel_95467/972172485.py:40: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. 
See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/\n", + " 'result_validator': validator('result', allow_reuse=True)(validate_result)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0, 1, 2]\n", + "Ensemble ranking based on multiple evaluations.\n", + "The responses are ranked from the shortest to the longest and most detailed.\n", + "The responses are ranked from shortest to longest and most detailed.\n", + "Responses ranked from shortest to longest.\n" + ] + } + ], + "source": [ + "# User instantiates a ranking metric by providing a name, an LLM, a prompt template, and the number of rankings desired.\n", + "my_ranking_metric = RankingMetrics(\n", + " name='response_ranking',\n", + " llm=LLM(), # Your language model instance\n", + " prompt=\"Rank the following responses:\\n{candidates}\",\n", + " num_ranks=3,\n", + ")\n", + "\n", + "# To score a single input (ranking candidate responses)\n", + "result = my_ranking_metric.score(candidates=[\n", + " \"short answer.\",\n", + " \"a bit more detailed.\",\n", + " \"the longest and most detailed answer.\"\n", + "],n=3)\n", + "print(result) # Might output something like: [1, 0, 2]\n", + "print(result.reason) # Provides the reasoning behind the ranking\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e198d7d-fbab-448e-aab1-f10f4234dff6", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "5b53bd5e-06c9-4430-9c06-f2225ddd7bd5", + "metadata": {}, + "source": [ + "### decorator factory for ranking metric" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "4c4e9170-67b9-4841-9df2-6afc490b89dd", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "ranking_metric = create_metric_decorator(RankingMetrics)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "cbb1729b-8b25-48d8-a472-c03dd1e0d861", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1, 0, 2]\n", + "Ranked based on response clarity and detail.\n" + ] + } + ], + "source": [ + "@ranking_metric(\n", + " llm=LLM(), # Your language model instance\n", + " prompt=\"Rank the following responses:\\n{candidates}\",\n", + " name='new_ranking_metric',\n", + " num_ranks=3\n", + ")\n", + "def my_ranking_metric(llm, prompt, **kwargs):\n", + " # Your custom logic that calls the LLM and returns a tuple of (ranking, reason)\n", + " # For example, process the prompt (formatted with candidates) and produce a ranking.\n", + " ranking = [1, 0, 2] # Dummy ranking: second candidate is best, then first, then third.\n", + " reason = \"Ranked based on response clarity and detail.\"\n", + " return ranking, reason\n", + "\n", + "# Using the decorator-based ranking metric:\n", + "result = my_ranking_metric.score(candidates=[\n", + " \"Response A: short answer.\",\n", + " \"Response B: a bit more detailed.\",\n", + " \"Response C: the longest and most detailed answer.\"\n", + "])\n", + "print(result) # E.g., [1, 0, 2]\n", + "print(result.reason) # E.g., \"Ranked based on response clarity and detail.\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23e38ce5-aac9-489b-96c0-947011dbbdf7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "random", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": 
".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From a739014ef973b49480e0a124bd42c12160d9e8c0 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Fri, 21 Mar 2025 13:41:06 -0700 Subject: [PATCH 04/17] make naming consistent --- nbs/metric/numeric.ipynb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/nbs/metric/numeric.ipynb b/nbs/metric/numeric.ipynb index 73372cf..847ad0a 100644 --- a/nbs/metric/numeric.ipynb +++ b/nbs/metric/numeric.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -32,7 +32,7 @@ "from ragas_annotator.metric.decorator import create_metric_decorator\n", "\n", "@dataclass\n", - "class NumericMetrics(Metric):\n", + "class NumericMetric(Metric):\n", " range: t.Tuple[float,float]\n", " \n", " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", @@ -63,12 +63,12 @@ " return MetricResult(result=result,reason=reason)\n", " \n", " \n", - "numeric_metric = create_metric_decorator(NumericMetrics)\n" + "numeric_metric = create_metric_decorator(NumericMetric)\n" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -92,7 +92,7 @@ "\n", "load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')\n", "\n", - "my_metric = NumericMetrics(\n", + "my_metric = NumericMetric(\n", " name='helpfulness',\n", " llm=LLM(),\n", " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", From 793b20f27305c93927187de8ee0822b63aa12c46 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Fri, 21 Mar 2025 13:41:50 -0700 Subject: [PATCH 05/17] metric implementation --- ragas_annotator/metric/__init__.py | 14 +++ ragas_annotator/metric/base.py | 66 +++++++++++ ragas_annotator/metric/decorator.py | 124 ++++++++++++++++++++ ragas_annotator/metric/discrete.py | 56 +++++++++ ragas_annotator/metric/llm.py | 35 ++++++ ragas_annotator/metric/numeric.py | 48 ++++++++ ragas_annotator/metric/ranking.py | 82 +++++++++++++ ragas_annotator/metric/result.py | 173 ++++++++++++++++++++++++++++ 8 files changed, 598 insertions(+) create mode 100644 ragas_annotator/metric/__init__.py create mode 100644 ragas_annotator/metric/base.py create mode 100644 ragas_annotator/metric/decorator.py create mode 100644 ragas_annotator/metric/discrete.py create mode 100644 ragas_annotator/metric/llm.py create mode 100644 ragas_annotator/metric/numeric.py create mode 100644 ragas_annotator/metric/ranking.py create mode 100644 ragas_annotator/metric/result.py diff --git a/ragas_annotator/metric/__init__.py b/ragas_annotator/metric/__init__.py new file mode 100644 index 0000000..57a31d3 --- /dev/null +++ b/ragas_annotator/metric/__init__.py @@ -0,0 +1,14 @@ +from ragas_annotator.metric.result import MetricResult +from ragas_annotator.metric.llm import LLM +from ragas_annotator.metric.base import Metric +from ragas_annotator.metric.discrete import DiscreteMetric +from ragas_annotator.metric.numeric import NumericMetric +from ragas_annotator.metric.ranking import RankingMetric + +__all__ = ['MetricResult', + 'LLM', + 'Metric', + 'DiscreteMetric', + 'NumericMetric', + 'RankingMetric', + ] diff --git a/ragas_annotator/metric/base.py b/ragas_annotator/metric/base.py new file mode 100644 index 0000000..a6349be --- /dev/null +++ b/ragas_annotator/metric/base.py @@ -0,0 +1,66 @@ 
+"""base class for all type of metrics in ragas""" + +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/metric/base.ipynb. + +# %% auto 0 +__all__ = ['Metric'] + +# %% ../../nbs/metric/base.ipynb 3 +from abc import ABC, abstractmethod +import asyncio +from dataclasses import dataclass, field +from pydantic import BaseModel +import typing as t +from . import MetricResult +from . import LLM + +@dataclass +class Metric(ABC): + """Base class for all metrics in the LLM evaluation library.""" + name: str + prompt: str + llm: LLM + _response_models: t.Dict[bool, t.Type[BaseModel]] = field( + default_factory=dict, init=False, repr=False + ) + + @abstractmethod + def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]: + """Get the appropriate response model.""" + pass + + @abstractmethod + def _ensemble(self, results: t.List[MetricResult]) -> MetricResult: + pass + + + def score(self, reasoning: bool = True, n: int = 1, **kwargs) -> t.Any: + responses = [] + prompt_input = self.prompt.format(**kwargs) + for _ in range(n): + response = self.llm.generate(prompt_input, response_model = self._get_response_model(reasoning)) + response = MetricResult(**response.model_dump()) + responses.append(response) + return self._ensemble(responses) + + + async def ascore(self, reasoning: bool = True, n: int = 1, **kwargs) -> MetricResult: + responses = [] # Added missing initialization + prompt_input = self.prompt.format(**kwargs) + for _ in range(n): + response = await self.llm.agenerate(prompt_input, response_model = self._get_response_model(reasoning)) + response = MetricResult(**response.model_dump()) # Fixed missing parentheses + responses.append(response) + return self._ensemble(responses) + + def batch_score(self, inputs: t.List[t.Dict[str, t.Any]], reasoning: bool = True, n: int = 1) -> t.List[t.Any]: + return [self.score(reasoning, n, **input_dict) for input_dict in inputs] + + async def abatch_score(self, inputs: t.List[t.Dict[str, t.Any]], reasoning: bool = True, n: int = 1) -> t.List[MetricResult]: + async_tasks = [] + for input_dict in inputs: + # Add reasoning and n to the input parameters + async_tasks.append(self.ascore(reasoning=reasoning, n=n, **input_dict)) + + # Run all tasks concurrently and return results + return await asyncio.gather(*async_tasks) diff --git a/ragas_annotator/metric/decorator.py b/ragas_annotator/metric/decorator.py new file mode 100644 index 0000000..378a2fd --- /dev/null +++ b/ragas_annotator/metric/decorator.py @@ -0,0 +1,124 @@ +"""decorator factory for creating custom metrics""" + +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/metric/decorator.ipynb. + +# %% auto 0 +__all__ = ['create_metric_decorator'] + +# %% ../../nbs/metric/decorator.ipynb 3 +import typing as t +import inspect +import asyncio +from dataclasses import dataclass +from . import MetricResult + + + + +def create_metric_decorator(metric_class): + """ + Factory function that creates decorator factories for different metric types. + + Args: + metric_class: The metric class to use (DiscreteMetrics, NumericMetrics, etc.) + + Returns: + A decorator factory function for the specified metric type + """ + def decorator_factory(llm, prompt, name: t.Optional[str] = None, **metric_params): + """ + Creates a decorator that wraps a function into a metric instance. 
+ + Args: + llm: The language model instance to use + prompt: The prompt template + name: Optional name for the metric (defaults to function name) + **metric_params: Additional parameters specific to the metric type + (values for DiscreteMetrics, range for NumericMetrics, etc.) + + Returns: + A decorator function + """ + def decorator(func): + # Get metric name and check if function is async + metric_name = name or func.__name__ + is_async = inspect.iscoroutinefunction(func) + + @dataclass + class CustomMetric(metric_class): + def _extract_result(self, result, reasoning: bool): + """Extract score and reason from the result.""" + if isinstance(result, tuple) and len(result) == 2: + score, reason = result + else: + score, reason = result, None + + # Use "result" instead of "score" for the new MetricResult implementation + return MetricResult(result=score, reason=reason if reasoning else None) + + def _run_sync_in_async(self, func, *args, **kwargs): + """Run a synchronous function in an async context.""" + # For sync functions, just run them normally + return func(*args, **kwargs) + + def _execute_metric(self, is_async_execution, reasoning, **kwargs): + """Execute the metric function with proper async handling.""" + try: + if is_async: + # Async function implementation + if is_async_execution: + # In async context, await the function directly + result = func(self.llm, self.prompt, **kwargs) + else: + # In sync context, run the async function in an event loop + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + result = loop.run_until_complete(func(self.llm, self.prompt, **kwargs)) + else: + # Sync function implementation + result = func(self.llm, self.prompt, **kwargs) + + return self._extract_result(result, reasoning) + except Exception as e: + # Handle errors gracefully + error_msg = f"Error executing metric {self.name}: {str(e)}" + return MetricResult(result=None, reason=error_msg) + + def score(self, reasoning: bool = True, n: int = 1, **kwargs): + """Synchronous scoring method.""" + return self._execute_metric(is_async_execution=False, reasoning=reasoning, **kwargs) + + async def ascore(self, reasoning: bool = True, n: int = 1, **kwargs): + """Asynchronous scoring method.""" + if is_async: + # For async functions, await the result + result = await func(self.llm, self.prompt, **kwargs) + return self._extract_result(result, reasoning) + else: + # For sync functions, run normally + result = self._run_sync_in_async(func, self.llm, self.prompt, **kwargs) + return self._extract_result(result, reasoning) + + # Create the metric instance with all parameters + metric_instance = CustomMetric( + name=metric_name, + prompt=prompt, + llm=llm, + **metric_params + ) + + # Preserve metadata + metric_instance.__name__ = metric_name + metric_instance.__doc__ = func.__doc__ + + return metric_instance + + return decorator + + return decorator_factory + + + diff --git a/ragas_annotator/metric/discrete.py b/ragas_annotator/metric/discrete.py new file mode 100644 index 0000000..63c4965 --- /dev/null +++ b/ragas_annotator/metric/discrete.py @@ -0,0 +1,56 @@ +"""Base class from which all discrete metrics should inherit.""" + +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/metric/discrete.ipynb. 
+
+# %% auto 0
+__all__ = ['discrete_metric', 'DiscreteMetric']
+
+# %% ../../nbs/metric/discrete.ipynb 3
+import typing as t
+from dataclasses import dataclass, field
+from pydantic import BaseModel, create_model
+from collections import Counter
+from . import Metric, MetricResult
+from .decorator import create_metric_decorator
+
+
+@dataclass
+class DiscreteMetric(Metric):
+    values: t.List[str] = field(default_factory=lambda: ["pass", "fail"])
+
+    def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:
+        """Get or create a response model based on reasoning parameter."""
+
+        if with_reasoning in self._response_models:
+            return self._response_models[with_reasoning]
+
+        model_name = 'response_model'
+        values = tuple(self.values)
+        fields = {"result": (t.Literal[values], ...)}
+
+        if with_reasoning:
+            fields["reason"] = (str, ...)  # type: ignore
+
+        model = create_model(model_name, **fields)  # type: ignore
+        self._response_models[with_reasoning] = model
+        return model
+
+    def _ensemble(self, results: t.List[MetricResult]) -> MetricResult:
+
+
+        if len(results) == 1:
+            return results[0]
+
+        candidates = [candidate.result for candidate in results]
+        counter = Counter(candidates)
+        max_count = max(counter.values())
+        for candidate in results:
+            if counter[candidate.result] == max_count:
+                result = candidate.result
+                reason = candidate.reason
+                break
+
+        return MetricResult(result=result, reason=reason)
+
+
+discrete_metric = create_metric_decorator(DiscreteMetric)
diff --git a/ragas_annotator/metric/llm.py b/ragas_annotator/metric/llm.py
new file mode 100644
index 0000000..c602e53
--- /dev/null
+++ b/ragas_annotator/metric/llm.py
@@ -0,0 +1,35 @@
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/metric/llm.ipynb.
+
+# %% auto 0
+__all__ = ['LLM']
+
+# %% ../../nbs/metric/llm.ipynb 1
+import openai
+import instructor
+from dataclasses import dataclass
+
+@dataclass
+class LLM:
+
+    def __post_init__(self):
+        self.aclient = instructor.from_openai(openai.AsyncOpenAI())
+        self.client = instructor.from_openai(openai.OpenAI())
+
+
+    def generate(self, prompt, response_model):
+        return self.client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {"role": "user", "content": prompt},
+            ],
+            response_model=response_model,
+        )
+
+    async def agenerate(self, prompt, response_model):
+        return await self.aclient.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {"role": "user", "content": prompt},
+            ],
+            response_model=response_model,
+        )
diff --git a/ragas_annotator/metric/numeric.py b/ragas_annotator/metric/numeric.py
new file mode 100644
index 0000000..bc39a1f
--- /dev/null
+++ b/ragas_annotator/metric/numeric.py
@@ -0,0 +1,48 @@
+"""Base class for all numeric metrics"""
+
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/metric/numeric.ipynb.
+
+# %% auto 0
+__all__ = ['numeric_metric', 'NumericMetric']
+
+# %% ../../nbs/metric/numeric.ipynb 2
+import typing as t
+from dataclasses import dataclass, field
+from pydantic import BaseModel, create_model
+from . import Metric, MetricResult
+from .decorator import create_metric_decorator
+
+@dataclass
+class NumericMetric(Metric):
+    range: t.Tuple[float, float]
+
+    def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:
+        """Get or create a response model based on reasoning parameter."""
+
+        if with_reasoning in self._response_models:
+            return self._response_models[with_reasoning]
+
+        model_name = 'response_model'
+        fields = {"result": (float, ...)}
+
+        if with_reasoning:
+            fields["reason"] = (str, ...)
#type: ignore + + model = create_model(model_name, **fields) + self._response_models[with_reasoning] = model + return model + + def _ensemble(self,results:t.List[MetricResult]) -> MetricResult: + + if len(results)==1: + return results[0] + + candidates = [candidate.result for candidate in results] + result = sum(candidates)/len(candidates) + reason = results[0].reason + + return MetricResult(result=result,reason=reason) + + +numeric_metric = create_metric_decorator(NumericMetric) + diff --git a/ragas_annotator/metric/ranking.py b/ragas_annotator/metric/ranking.py new file mode 100644 index 0000000..fb883d5 --- /dev/null +++ b/ragas_annotator/metric/ranking.py @@ -0,0 +1,82 @@ +"""Base class for ranking metrics""" + +# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/metric/ranking.ipynb. + +# %% auto 0 +__all__ = ['ranking_metric', 'RankingMetric'] + +# %% ../../nbs/metric/ranking.ipynb 2 +import typing as t +from dataclasses import dataclass +from pydantic import BaseModel, Field +from . import Metric, MetricResult +from .decorator import create_metric_decorator + +@dataclass +class RankingMetric(Metric): + num_ranks: int + + def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]: + """Get or create a response model based on reasoning parameter.""" + + if with_reasoning in self._response_models: + return self._response_models[with_reasoning] + + # Store values needed for validation + num_ranks = self.num_ranks + + # Create explicit model classes instead of using create_model + if with_reasoning: + # Model with result and reason + class ResponseModelWithReason(BaseModel): + result: t.List[int] = Field(...) + reason: str = Field(...) + + def model_post_init(self, __context): + expected = set(range(num_ranks)) + if set(self.result) != expected: + raise ValueError( + f"'result' must contain exactly the numbers {sorted(expected)} without repetition." + ) + + self._response_models[with_reasoning] = ResponseModelWithReason + return ResponseModelWithReason + else: + # Model with just result + class ResponseModel(BaseModel): + result: t.List[int] = Field(...) + + def model_post_init(self, __context): + expected = set(range(num_ranks)) + if set(self.result) != expected: + raise ValueError( + f"'result' must contain exactly the numbers {sorted(expected)} without repetition." + ) + + self._response_models[with_reasoning] = ResponseModel + return ResponseModel + + def _ensemble(self, results: t.List[MetricResult]) -> MetricResult: + if len(results) == 1: + return results[0] + + n_items = self.num_ranks # Use the class attribute instead of len(results) + borda_scores = [0] * n_items + + for result in results: + for position_idx, item_idx in enumerate(result.result): + borda_scores[item_idx] += (n_items - position_idx) # Fixed the formula + + indexed_scores = [(score, i) for i, score in enumerate(borda_scores)] + indexed_scores.sort(key=lambda x: (-x[0], x[1])) + final_ranking = [pos for _, pos in indexed_scores] + + if any(r.reason for r in results): + reason = "Ensemble ranking based on multiple evaluations.\n" + '\n'.join([r.reason for r in results if r.reason]) + else: + reason = None + + return MetricResult(result=final_ranking, reason=reason) + + +ranking_metric = create_metric_decorator(RankingMetric) diff --git a/ragas_annotator/metric/result.py b/ragas_annotator/metric/result.py new file mode 100644 index 0000000..c4636c7 --- /dev/null +++ b/ragas_annotator/metric/result.py @@ -0,0 +1,173 @@ +"""MetricResult object to store the result of a metric""" + +# AUTOGENERATED! 
DO NOT EDIT! File to edit: ../../nbs/metric/result.ipynb. + +# %% auto 0 +__all__ = ['MetricResult'] + +# %% ../../nbs/metric/result.ipynb 2 +import typing as t + + + + + +class MetricResult: + """Class to hold the result of a metric evaluation. + + This class behaves like its underlying result value but still provides access + to additional metadata like reasoning. + + Works with: + - DiscreteMetrics (string results) + - NumericMetrics (float/int results) + - RankingMetrics (list results) + """ + + def __init__(self, result: t.Any, reason: t.Optional[str] = None): + self._result = result + self.reason = reason + + def __repr__(self): + return repr(self._result) + + # Access to underlying result + @property + def result(self): + """Get the raw result value.""" + return self._result + + + # String conversion - works for all types + def __str__(self): + return str(self._result) + + # Container-like behaviors for list results (RankingMetric) + def __getitem__(self, key): + if not hasattr(self._result, "__getitem__"): + raise TypeError(f"{type(self._result).__name__} object is not subscriptable") + return self._result[key] + + def __iter__(self): + if not hasattr(self._result, "__iter__"): + raise TypeError(f"{type(self._result).__name__} object is not iterable") + return iter(self._result) + + def __len__(self): + if not hasattr(self._result, "__len__"): + raise TypeError(f"{type(self._result).__name__} has no len()") + return len(self._result) + + # Numeric operations for numeric results (NumericMetric) + def __float__(self): + if isinstance(self._result, (int, float)): + return float(self._result) + raise TypeError(f"Cannot convert {type(self._result).__name__} to float") + + def __int__(self): + if isinstance(self._result, (int, float)): + return int(self._result) + raise TypeError(f"Cannot convert {type(self._result).__name__} to int") + + def __add__(self, other): + if not isinstance(self._result, (int, float)): + raise TypeError(f"Cannot add {type(self._result).__name__} objects") + if isinstance(other, MetricResult): + return self._result + other._result + return self._result + other + + def __radd__(self, other): + if not isinstance(self._result, (int, float)): + raise TypeError(f"Cannot add {type(self._result).__name__} objects") + return other + self._result + + def __sub__(self, other): + if not isinstance(self._result, (int, float)): + raise TypeError(f"Cannot subtract {type(self._result).__name__} objects") + if isinstance(other, MetricResult): + return self._result - other._result + return self._result - other + + def __rsub__(self, other): + if not isinstance(self._result, (int, float)): + raise TypeError(f"Cannot subtract {type(self._result).__name__} objects") + return other - self._result + + def __mul__(self, other): + if not isinstance(self._result, (int, float)): + raise TypeError(f"Cannot multiply {type(self._result).__name__} objects") + if isinstance(other, MetricResult): + return self._result * other._result + return self._result * other + + def __rmul__(self, other): + if not isinstance(self._result, (int, float)): + raise TypeError(f"Cannot multiply {type(self._result).__name__} objects") + return other * self._result + + def __truediv__(self, other): + if not isinstance(self._result, (int, float)): + raise TypeError(f"Cannot divide {type(self._result).__name__} objects") + if isinstance(other, MetricResult): + return self._result / other._result + return self._result / other + + def __rtruediv__(self, other): + if not isinstance(self._result, (int, float)): + raise 
TypeError(f"Cannot divide {type(self._result).__name__} objects") + return other / self._result + + # Comparison operations - work for all types with same-type comparisons + def __eq__(self, other): + if isinstance(other, MetricResult): + return self._result == other._result + return self._result == other + + def __lt__(self, other): + if isinstance(other, MetricResult): + return self._result < other._result + return self._result < other + + def __le__(self, other): + if isinstance(other, MetricResult): + return self._result <= other._result + return self._result <= other + + def __gt__(self, other): + if isinstance(other, MetricResult): + return self._result > other._result + return self._result > other + + def __ge__(self, other): + if isinstance(other, MetricResult): + return self._result >= other._result + return self._result >= other + + # Method forwarding for type-specific behaviors + def __getattr__(self, name): + """Forward attribute access to the result object if it has that attribute. + + This allows calling string methods on discrete results, + numeric methods on numeric results, and list methods on ranking results. + """ + if hasattr(self._result, name): + attr = getattr(self._result, name) + if callable(attr): + # If it's a method, wrap it to return MetricResult when appropriate + def wrapper(*args, **kwargs): + result = attr(*args, **kwargs) + # If the result is of the same type as self._result, wrap it + if isinstance(result, type(self._result)): + return MetricResult(result=result, reason=self.reason) + return result + return wrapper + return attr + raise AttributeError(f"{type(self).__name__} has no attribute '{name}'") + + # JSON/dict serialization + def to_dict(self): + """Convert the result to a dictionary.""" + return { + "result": self._result, + "reason": self.reason + } From abe6f7d8d2ce54a2e0b8c8aaf6ac8d05d7cd2fd0 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Fri, 21 Mar 2025 16:10:38 -0700 Subject: [PATCH 06/17] add example usage --- nbs/metric/base.ipynb | 17 ++++++++++++----- nbs/metric/decorator.ipynb | 12 +++++++++--- nbs/metric/numeric.ipynb | 8 +++++++- nbs/metric/ranking.ipynb | 11 +++++++++-- nbs/metric/result.ipynb | 12 ++++++++++-- nbs/metric/test_base.ipynb | 26 +++++++++++++------------- 6 files changed, 60 insertions(+), 26 deletions(-) diff --git a/nbs/metric/base.ipynb b/nbs/metric/base.ipynb index dd9d599..53c79f7 100644 --- a/nbs/metric/base.ipynb +++ b/nbs/metric/base.ipynb @@ -120,18 +120,25 @@ " return await asyncio.gather(*async_tasks)" ] }, + { + "cell_type": "markdown", + "id": "fc4b7458", + "metadata": {}, + "source": [ + "### Example\n" + ] + }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "fcf208fa", "metadata": {}, "outputs": [], "source": [ - "## Example of a concrete implementation of the Metric class\n", "\n", "\n", "@dataclass\n", - "class NumericMetric(Metric):\n", + "class CustomMetric(Metric):\n", " values: t.List[str] = field(default_factory=lambda: [\"pass\", \"fail\"])\n", " \n", " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", @@ -152,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "9ba99094", "metadata": {}, "outputs": [ @@ -168,7 +175,7 @@ } ], "source": [ - "my_metric = NumericMetric(name=\"example\", prompt=\"What is the result of {input}?\", llm=LLM())\n", + "my_metric = CustomMetric(name=\"example\", prompt=\"What is the result of {input}?\", llm=LLM())\n", "my_metric.score(input=\"test\")" ] }, diff --git 
a/nbs/metric/decorator.ipynb b/nbs/metric/decorator.ipynb
index f60e4d9..b30f6dc 100644
--- a/nbs/metric/decorator.ipynb
+++ b/nbs/metric/decorator.ipynb
@@ -164,9 +164,16 @@
     "\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Example usage\n"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -179,7 +186,6 @@
     }
    ],
    "source": [
-    "### Example usage\n",
     "from ragas_annotator.metric import DiscreteMetric\n",
     "from ragas_annotator.metric.llm import LLM\n",
     "from pydantic import BaseModel\n",
@@ -205,7 +211,7 @@
     "\n",
     "result = my_metric.score(response='my response') # result\n",
     "print(result)\n",
-    "print(result.reason)"
+    "print(result.reason)"
    ]
   },
   {
diff --git a/nbs/metric/numeric.ipynb b/nbs/metric/numeric.ipynb
index 847ad0a..033e9ba 100644
--- a/nbs/metric/numeric.ipynb
+++ b/nbs/metric/numeric.ipynb
@@ -66,6 +66,13 @@
     "numeric_metric = create_metric_decorator(NumericMetric)\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Example usage"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -83,7 +90,6 @@
     }
    ],
    "source": [
-    "## Example usage\n",
     "\n",
     "from dotenv import load_dotenv\n",
     "from ragas_annotator.metric.llm import LLM\n",
diff --git a/nbs/metric/ranking.ipynb b/nbs/metric/ranking.ipynb
index ace4336..96f8275 100644
--- a/nbs/metric/ranking.ipynb
+++ b/nbs/metric/ranking.ipynb
@@ -101,9 +101,16 @@
     "ranking_metric = create_metric_decorator(RankingMetric)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Example usage"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -119,7 +126,7 @@
     }
    ],
    "source": [
-    "## Example usage\n",
+    "\n",
     "\n",
     "from dotenv import load_dotenv\n",
     "from ragas_annotator.metric.llm import LLM\n",
diff --git a/nbs/metric/result.ipynb b/nbs/metric/result.ipynb
index c10e80e..167962c 100644
--- a/nbs/metric/result.ipynb
+++ b/nbs/metric/result.ipynb
@@ -195,9 +195,17 @@
     "    }"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "490cdd2f",
+   "metadata": {},
+   "source": [
+    "### Example Usage"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "id": "24589401",
    "metadata": {},
    "outputs": [
     {
@@ -214,7 +222,7 @@
     }
    ],
    "source": [
-    "### Example Usage\n",
+    "\n",
     "\n",
     "metric_result = MetricResult(result=42, reason=\"This is a test\")\n",
     "print(metric_result)\n",
diff --git a/nbs/metric/test_base.ipynb b/nbs/metric/test_base.ipynb
index 02965d9..6ed8247 100644
--- a/nbs/metric/test_base.ipynb
+++ b/nbs/metric/test_base.ipynb
@@ -35,7 +35,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
    "id": "0b05c525-e153-49ab-b768-5069f624f215",
    "metadata": {},
    "outputs": [
     {
@@ -49,7 +49,7 @@
     }
    ],
    "source": [
-    "#| export\n",
+    "\n",
     "import typing as t\n",
     "from typing import Any, Callable, Dict, List, Optional, Union\n",
     "from abc import ABC, abstractmethod\n",
@@ -126,7 +126,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#| export\n",
+    "\n",
     "\n",
     "import typing as t\n",
     "\n",
@@ -310,7 +310,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#| export\n",
+    "\n",
     "\n",
     "from abc import ABC, abstractmethod\n",
     "import asyncio\n",
@@ -386,7 +386,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#| export\n",
+    "\n",
     "from collections import Counter\n",
     "\n",
     "@dataclass\n",
@@ -675,12 +675,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 124,
+   "execution_count": null,
    "id": 
"6a1c66fb-3c1c-4bc6-9996-0b5beb304b9c", "metadata": {}, "outputs": [], "source": [ - "#| export\n", + "\n", "\n", "@dataclass\n", "class NumericMetrics(Metric):\n", @@ -777,12 +777,12 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": null, "id": "265af384-ed35-4262-acfe-6847b22d3089", "metadata": {}, "outputs": [], "source": [ - "#| export\n", + "\n", "numeric_metric = create_metric_decorator(NumericMetrics)" ] }, @@ -862,12 +862,12 @@ }, { "cell_type": "code", - "execution_count": 159, + "execution_count": null, "id": "0b1bbf8f-c7fa-4004-9165-fa388f7ba15d", "metadata": {}, "outputs": [], "source": [ - "#| export\n", + "\n", "\n", "@dataclass\n", "class RankingMetrics(Metric):\n", @@ -995,12 +995,12 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "id": "4c4e9170-67b9-4841-9df2-6afc490b89dd", "metadata": {}, "outputs": [], "source": [ - "#| export\n", + "\n", "ranking_metric = create_metric_decorator(RankingMetrics)" ] }, From e287488905d2b7601d93ae1b79f1fa61a939dad1 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Fri, 21 Mar 2025 16:16:16 -0700 Subject: [PATCH 07/17] remove test base --- nbs/metric/test_base.ipynb | 1076 ------------------------------------ 1 file changed, 1076 deletions(-) delete mode 100644 nbs/metric/test_base.ipynb diff --git a/nbs/metric/test_base.ipynb b/nbs/metric/test_base.ipynb deleted file mode 100644 index 6ed8247..0000000 --- a/nbs/metric/test_base.ipynb +++ /dev/null @@ -1,1076 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "c48aac0f-c63c-4bfb-95b0-a1239f41ccb3", - "metadata": {}, - "source": [ - "# Test Base\n", - "\n", - "### Do not export" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "5d14fc66-b8af-4a75-b761-20b8e9ce19f8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#| hide\n", - "from dotenv import load_dotenv\n", - "load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b05c525-e153-49ab-b768-5069f624f215", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/homebrew/Caskroom/miniforge/base/envs/random/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "\n", - "import typing as t\n", - "from typing import Any, Callable, Dict, List, Optional, Union\n", - "from abc import ABC, abstractmethod\n", - "import asyncio\n", - "from dataclasses import dataclass\n", - "from pydantic import BaseModel\n", - "import openai\n", - "import instructor\n", - "from dataclasses import dataclass, field\n", - "from pydantic import BaseModel, create_model\n", - "import typing as t\n", - "import inspect" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "be7f588f-2a13-4c10-9775-00101a03e0a5", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/homebrew/Caskroom/miniforge/base/envs/random/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "import openai\n", - "import instructor\n", - "from dataclasses import dataclass\n", - "\n", - "@dataclass\n", - "class LLM:\n", - "\n", - " def __post_init__(self):\n", - " self.aclient = instructor.from_openai(openai.AsyncOpenAI())\n", - " self.client = instructor.from_openai(openai.OpenAI())\n", - "\n", - " \n", - " def generate(self,prompt,response_model):\n", - " return self.client.chat.completions.create(\n", - " model=\"gpt-4o-mini\",\n", - " messages=[\n", - " {\"role\": \"user\", \"content\": prompt},\n", - " ],\n", - " response_model=response_model,\n", - " )\n", - "\n", - " async def agenerate(self,prompt,response_model):\n", - " return await self.aclient.chat.completions.create(\n", - " model=\"gpt-4o-mini\",\n", - " messages=[\n", - " {\"role\": \"user\", \"content\": prompt},\n", - " ],\n", - " response_model=response_model,\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "2dfed36e", - "metadata": {}, - "source": [ - "## MetricResult" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ca623dd2", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "\n", - "import typing as t\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "class MetricResult:\n", - " \"\"\"Class to hold the result of a metric evaluation.\n", - " \n", - " This class behaves like its underlying result value but still provides access\n", - " to additional metadata like reasoning.\n", - " \n", - " Works with:\n", - " - DiscreteMetrics (string results)\n", - " - NumericMetrics (float/int results)\n", - " - RankingMetrics (list results)\n", - " \"\"\"\n", - " \n", - " def __init__(self, result: t.Any, reason: t.Optional[str] = None):\n", - " self._result = result\n", - " self.reason = reason\n", - " \n", - " def __repr__(self):\n", - " return repr(self._result)\n", - " \n", - " # Access to underlying result\n", - " @property\n", - " def result(self):\n", - " \"\"\"Get the raw result value.\"\"\"\n", - " return self._result\n", - " \n", - " \n", - " # String conversion - works for all types\n", - " def __str__(self):\n", - " return str(self._result)\n", - " \n", - " # Container-like behaviors for list results (RankingMetric)\n", - " def __getitem__(self, key):\n", - " if not hasattr(self._result, \"__getitem__\"):\n", - " raise TypeError(f\"{type(self._result).__name__} object is not subscriptable\")\n", - " return self._result[key]\n", - " \n", - " def __iter__(self):\n", - " if not hasattr(self._result, \"__iter__\"):\n", - " raise TypeError(f\"{type(self._result).__name__} object is not iterable\")\n", - " return iter(self._result)\n", - " \n", - " def __len__(self):\n", - " if not hasattr(self._result, \"__len__\"):\n", - " raise TypeError(f\"{type(self._result).__name__} has no len()\")\n", - " return len(self._result)\n", - " \n", - " # Numeric operations for numeric results (NumericMetric)\n", - " def __float__(self):\n", - " if isinstance(self._result, (int, float)):\n", - " return float(self._result)\n", - " raise TypeError(f\"Cannot convert {type(self._result).__name__} to float\")\n", - " \n", - " def __int__(self):\n", - " if isinstance(self._result, (int, float)):\n", - " return int(self._result)\n", - " raise TypeError(f\"Cannot convert {type(self._result).__name__} to int\")\n", - " \n", - " def __add__(self, other):\n", - " if not isinstance(self._result, (int, float)):\n", - " raise TypeError(f\"Cannot add 
{type(self._result).__name__} objects\")\n", - " if isinstance(other, MetricResult):\n", - " return self._result + other._result\n", - " return self._result + other\n", - " \n", - " def __radd__(self, other):\n", - " if not isinstance(self._result, (int, float)):\n", - " raise TypeError(f\"Cannot add {type(self._result).__name__} objects\")\n", - " return other + self._result\n", - " \n", - " def __sub__(self, other):\n", - " if not isinstance(self._result, (int, float)):\n", - " raise TypeError(f\"Cannot subtract {type(self._result).__name__} objects\")\n", - " if isinstance(other, MetricResult):\n", - " return self._result - other._result\n", - " return self._result - other\n", - " \n", - " def __rsub__(self, other):\n", - " if not isinstance(self._result, (int, float)):\n", - " raise TypeError(f\"Cannot subtract {type(self._result).__name__} objects\")\n", - " return other - self._result\n", - " \n", - " def __mul__(self, other):\n", - " if not isinstance(self._result, (int, float)):\n", - " raise TypeError(f\"Cannot multiply {type(self._result).__name__} objects\")\n", - " if isinstance(other, MetricResult):\n", - " return self._result * other._result\n", - " return self._result * other\n", - " \n", - " def __rmul__(self, other):\n", - " if not isinstance(self._result, (int, float)):\n", - " raise TypeError(f\"Cannot multiply {type(self._result).__name__} objects\")\n", - " return other * self._result\n", - " \n", - " def __truediv__(self, other):\n", - " if not isinstance(self._result, (int, float)):\n", - " raise TypeError(f\"Cannot divide {type(self._result).__name__} objects\")\n", - " if isinstance(other, MetricResult):\n", - " return self._result / other._result\n", - " return self._result / other\n", - " \n", - " def __rtruediv__(self, other):\n", - " if not isinstance(self._result, (int, float)):\n", - " raise TypeError(f\"Cannot divide {type(self._result).__name__} objects\")\n", - " return other / self._result\n", - " \n", - " # Comparison operations - work for all types with same-type comparisons\n", - " def __eq__(self, other):\n", - " if isinstance(other, MetricResult):\n", - " return self._result == other._result\n", - " return self._result == other\n", - " \n", - " def __lt__(self, other):\n", - " if isinstance(other, MetricResult):\n", - " return self._result < other._result\n", - " return self._result < other\n", - " \n", - " def __le__(self, other):\n", - " if isinstance(other, MetricResult):\n", - " return self._result <= other._result\n", - " return self._result <= other\n", - " \n", - " def __gt__(self, other):\n", - " if isinstance(other, MetricResult):\n", - " return self._result > other._result\n", - " return self._result > other\n", - " \n", - " def __ge__(self, other):\n", - " if isinstance(other, MetricResult):\n", - " return self._result >= other._result\n", - " return self._result >= other\n", - " \n", - " # Method forwarding for type-specific behaviors\n", - " def __getattr__(self, name):\n", - " \"\"\"Forward attribute access to the result object if it has that attribute.\n", - " \n", - " This allows calling string methods on discrete results, \n", - " numeric methods on numeric results, and list methods on ranking results.\n", - " \"\"\"\n", - " if hasattr(self._result, name):\n", - " attr = getattr(self._result, name)\n", - " if callable(attr):\n", - " # If it's a method, wrap it to return MetricResult when appropriate\n", - " def wrapper(*args, **kwargs):\n", - " result = attr(*args, **kwargs)\n", - " # If the result is of the same type as 
self._result, wrap it\n", - " if isinstance(result, type(self._result)):\n", - " return MetricResult(result=result, reason=self.reason)\n", - " return result\n", - " return wrapper\n", - " return attr\n", - " raise AttributeError(f\"{type(self).__name__} has no attribute '{name}'\")\n", - " \n", - " # JSON/dict serialization\n", - " def to_dict(self):\n", - " \"\"\"Convert the result to a dictionary.\"\"\"\n", - " return {\n", - " \"result\": self._result,\n", - " \"reason\": self.reason\n", - " }" - ] - }, - { - "cell_type": "markdown", - "id": "1bec7946-c61c-4631-9880-fff575974e39", - "metadata": {}, - "source": [ - "### Metric" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c5e19478-d947-405b-a229-4a1e7daa2fd3", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "\n", - "from abc import ABC, abstractmethod\n", - "import asyncio\n", - "from dataclasses import dataclass, field\n", - "from pydantic import BaseModel\n", - "import typing as t\n", - "from ragas_annotator.metric import MetricResult\n", - "\n", - "@dataclass\n", - "class Metric(ABC):\n", - " \"\"\"Base class for all metrics in the LLM evaluation library.\"\"\"\n", - " name: str\n", - " prompt: str\n", - " llm: 'LLM' # Forward reference with quotes\n", - " _response_models: t.Dict[bool, t.Type[BaseModel]] = field(\n", - " default_factory=dict, init=False, repr=False\n", - " )\n", - " \n", - " @abstractmethod\n", - " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", - " \"\"\"Get the appropriate response model.\"\"\"\n", - " pass\n", - "\n", - " @abstractmethod\n", - " def _ensemble(self, results: t.List[MetricResult]) -> MetricResult:\n", - " pass\n", - " \n", - " \n", - " def score(self, reasoning: bool = True, n: int = 1, **kwargs) -> t.Any:\n", - " responses = []\n", - " prompt_input = self.prompt.format(**kwargs)\n", - " for _ in range(n):\n", - " response = self.llm.generate(prompt_input, response_model = self._get_response_model(reasoning)) \n", - " response = MetricResult(**response.model_dump())\n", - " responses.append(response)\n", - " return self._ensemble(responses)\n", - "\n", - "\n", - " async def ascore(self, reasoning: bool = True, n: int = 1, **kwargs) -> MetricResult:\n", - " responses = [] # Added missing initialization\n", - " prompt_input = self.prompt.format(**kwargs)\n", - " for _ in range(n):\n", - " response = await self.llm.agenerate(prompt_input, response_model = self._get_response_model(reasoning))\n", - " response = MetricResult(**response.model_dump()) # Fixed missing parentheses\n", - " responses.append(response)\n", - " return self._ensemble(responses)\n", - " \n", - " def batch_score(self, inputs: t.List[t.Dict[str, t.Any]], reasoning: bool = True, n: int = 1) -> t.List[t.Any]:\n", - " return [self.score(reasoning, n, **input_dict) for input_dict in inputs]\n", - " \n", - " async def abatch_score(self, inputs: t.List[t.Dict[str, t.Any]], reasoning: bool = True, n: int = 1) -> t.List[MetricResult]:\n", - " async_tasks = []\n", - " for input_dict in inputs:\n", - " # Add reasoning and n to the input parameters\n", - " async_tasks.append(self.ascore(reasoning=reasoning, n=n, **input_dict))\n", - " \n", - " # Run all tasks concurrently and return results\n", - " return await asyncio.gather(*async_tasks)" - ] - }, - { - "cell_type": "markdown", - "id": "4af79bb4-4c3e-4004-b7f3-ec36e50b4ca5", - "metadata": {}, - "source": [ - "### DiscreteMetric \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"f09683e0-5ec3-4e60-a8c4-1657e2fe60b9", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "from collections import Counter\n", - "\n", - "@dataclass\n", - "class DiscreteMetrics(Metric):\n", - " values: t.List[str] = field(default_factory=lambda: [\"pass\", \"fail\"])\n", - " \n", - " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", - " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", - " \n", - " if with_reasoning in self._response_models:\n", - " return self._response_models[with_reasoning]\n", - " \n", - " model_name = 'response_model'\n", - " values = tuple(self.values)\n", - " fields = {\"result\": (t.Literal[values], ...)}\n", - " \n", - " if with_reasoning:\n", - " fields[\"reason\"] = (str, ...)\n", - " \n", - " model = create_model(model_name, **fields)\n", - " self._response_models[with_reasoning] = model\n", - " return model \n", - "\n", - " def _ensemble(self,results:t.List[MetricResult]) -> MetricResult:\n", - "\n", - "\n", - " if len(results)==1:\n", - " return results[0]\n", - " \n", - " candidates = [candidate.result for candidate in results]\n", - " counter = Counter(candidates)\n", - " max_count = max(counter.values())\n", - " for candidate in results:\n", - " if counter[candidate.result] == max_count:\n", - " result = candidate.result \n", - " reason = candidate.reason\n", - " break\n", - " \n", - " return MetricResult(result=result,reason=reason)\n", - "\n", - "\n", - " " - ] - }, - { - "cell_type": "markdown", - "id": "f33ef89f-4ccc-4307-9944-4f372ce77830", - "metadata": {}, - "source": [ - "### decorator factory for discrete_metric" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "id": "99d5afd6-72bd-42d7-bff0-effce9cf8cd9", - "metadata": {}, - "outputs": [], - "source": [ - "import typing as t\n", - "from typing import Any, Callable, Dict, List, Optional, Type, Union\n", - "import inspect\n", - "import asyncio\n", - "from dataclasses import dataclass\n", - "from abc import ABC\n", - "\n", - "def create_metric_decorator(metric_class):\n", - " \"\"\"\n", - " Factory function that creates decorator factories for different metric types.\n", - " \n", - " Args:\n", - " metric_class: The metric class to use (DiscreteMetrics, NumericMetrics, etc.)\n", - " \n", - " Returns:\n", - " A decorator factory function for the specified metric type\n", - " \"\"\"\n", - " def decorator_factory(llm, prompt, name: t.Optional[str] = None, **metric_params):\n", - " \"\"\"\n", - " Creates a decorator that wraps a function into a metric instance.\n", - " \n", - " Args:\n", - " llm: The language model instance to use\n", - " prompt: The prompt template\n", - " name: Optional name for the metric (defaults to function name)\n", - " **metric_params: Additional parameters specific to the metric type\n", - " (values for DiscreteMetrics, range for NumericMetrics, etc.)\n", - " \n", - " Returns:\n", - " A decorator function\n", - " \"\"\"\n", - " def decorator(func):\n", - " # Get metric name and check if function is async\n", - " metric_name = name or func.__name__\n", - " is_async = inspect.iscoroutinefunction(func)\n", - " \n", - " @dataclass\n", - " class CustomMetric(metric_class):\n", - " def _extract_result(self, result, reasoning: bool):\n", - " \"\"\"Extract score and reason from the result.\"\"\"\n", - " if isinstance(result, tuple) and len(result) == 2:\n", - " score, reason = result\n", - " else:\n", - " score, reason = result, None\n", - " \n", - " # Use \"result\" instead of \"score\" for the new 
MetricResult implementation\n", - " return MetricResult(result=score, reason=reason if reasoning else None)\n", - " \n", - " def _run_sync_in_async(self, func, *args, **kwargs):\n", - " \"\"\"Run a synchronous function in an async context.\"\"\"\n", - " # For sync functions, just run them normally\n", - " return func(*args, **kwargs)\n", - " \n", - " def _execute_metric(self, is_async_execution, reasoning, **kwargs):\n", - " \"\"\"Execute the metric function with proper async handling.\"\"\"\n", - " try:\n", - " if is_async:\n", - " # Async function implementation\n", - " if is_async_execution:\n", - " # In async context, await the function directly\n", - " result = func(self.llm, self.prompt, **kwargs)\n", - " else:\n", - " # In sync context, run the async function in an event loop\n", - " try:\n", - " loop = asyncio.get_event_loop()\n", - " except RuntimeError:\n", - " loop = asyncio.new_event_loop()\n", - " asyncio.set_event_loop(loop)\n", - " result = loop.run_until_complete(func(self.llm, self.prompt, **kwargs))\n", - " else:\n", - " # Sync function implementation\n", - " result = func(self.llm, self.prompt, **kwargs)\n", - " \n", - " return self._extract_result(result, reasoning)\n", - " except Exception as e:\n", - " # Handle errors gracefully\n", - " error_msg = f\"Error executing metric {self.name}: {str(e)}\"\n", - " return MetricResult(result=None, reason=error_msg)\n", - " \n", - " def score(self, reasoning: bool = True, n: int = 1, **kwargs):\n", - " \"\"\"Synchronous scoring method.\"\"\"\n", - " return self._execute_metric(is_async_execution=False, reasoning=reasoning, **kwargs)\n", - " \n", - " async def ascore(self, reasoning: bool = True, n: int = 1, **kwargs):\n", - " \"\"\"Asynchronous scoring method.\"\"\"\n", - " if is_async:\n", - " # For async functions, await the result\n", - " result = await func(self.llm, self.prompt, **kwargs)\n", - " return self._extract_result(result, reasoning)\n", - " else:\n", - " # For sync functions, run normally\n", - " result = self._run_sync_in_async(func, self.llm, self.prompt, **kwargs)\n", - " return self._extract_result(result, reasoning)\n", - " \n", - " # Create the metric instance with all parameters\n", - " metric_instance = CustomMetric(\n", - " name=metric_name,\n", - " prompt=prompt,\n", - " llm=llm,\n", - " **metric_params\n", - " )\n", - " \n", - " # Preserve metadata\n", - " metric_instance.__name__ = metric_name\n", - " metric_instance.__doc__ = func.__doc__\n", - " \n", - " return metric_instance\n", - " \n", - " return decorator\n", - " \n", - " return decorator_factory\n", - "\n", - "# Create specific decorator factories for each metric type\n", - "discrete_metric = create_metric_decorator(DiscreteMetrics)\n" - ] - }, - { - "cell_type": "markdown", - "id": "07f49ad3-4476-4bcc-ac34-a87fb7a8652a", - "metadata": {}, - "source": [ - "### Usage pattern" - ] - }, - { - "cell_type": "code", - "execution_count": 113, - "id": "aeae8fe5-e81a-44ac-9ad7-a240655a0f06", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\"The answer provided lacks specific context or detail needed to evaluate its helpfulness fully. 
Without more information, it's difficult to determine its applicability.\"" - ] - }, - "execution_count": 113, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\n", - "my_metric = DiscreteMetrics(\n", - " name='helpfulness',\n", - " llm=LLM(),\n", - " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", - " values=[\"low\",\"med\",\"high\"],\n", - ")\n", - "\n", - "result = my_metric.score(response=\"this is my response\")\n", - "result #gives \"low\"\n", - "result.reason #gives reasoning from llm\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 114, - "id": "b5e499d8-8258-46ce-b719-0389d3cfd8db", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'low'" - ] - }, - "execution_count": 114, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "## score without reasoning to save reasoning tokens cost\n", - "result = my_metric.score(response=\"this is my response\",reasoning=False,n=3)\n", - "result" - ] - }, - { - "cell_type": "code", - "execution_count": 115, - "id": "a9a5d6c6-4cfc-4f45-8b19-996315a95370", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'reason'" - ] - }, - "execution_count": 115, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "@discrete_metric(llm=LLM(),\n", - " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", - " name='new_metric',values=[\"low\",\"med\",\"high\"])\n", - "def my_metric(llm,prompt,**kwargs):\n", - "\n", - " class response_model(BaseModel):\n", - " output: t.List[bool]\n", - " reason: str\n", - " \n", - " response = llm.generate(prompt.format(**kwargs),response_model=response_model)\n", - " total = sum(response.output)\n", - " if total < 1:\n", - " score = 'low'\n", - " else:\n", - " score = 'high'\n", - " return score,\"reason\"\n", - "\n", - "result = my_metric.score(response='my response') # result\n", - "result\n", - "result.reason" - ] - }, - { - "cell_type": "markdown", - "id": "05f60c70-fc32-41f8-aa7c-c8685d77398a", - "metadata": {}, - "source": [ - "## Numeric Metric" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6a1c66fb-3c1c-4bc6-9996-0b5beb304b9c", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "\n", - "@dataclass\n", - "class NumericMetrics(Metric):\n", - " range: t.Tuple[float,float]\n", - " \n", - " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", - " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", - " \n", - " if with_reasoning in self._response_models:\n", - " return self._response_models[with_reasoning]\n", - " \n", - " model_name = 'response_model'\n", - " fields = {\"result\": (float,...)}\n", - " \n", - " if with_reasoning:\n", - " fields[\"reason\"] = (str, ...)\n", - " \n", - " model = create_model(model_name, **fields)\n", - " self._response_models[with_reasoning] = model\n", - " return model \n", - "\n", - " def _ensemble(self,results:t.List[MetricResult]) -> MetricResult:\n", - "\n", - " if len(results)==1:\n", - " return results[0]\n", - " \n", - " candidates = [candidate.result for candidate in results]\n", - " result = sum(candidates)/len(candidates)\n", - " reason = results[0].reason\n", - " \n", - " return MetricResult(result=result,reason=reason)\n", - " \n" - ] - }, - { - "cell_type": "code", - "execution_count": 129, - "id": "251cdea8-fc71-46bd-8a00-fb8e33e10350", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'The response lacks 
sufficient information or context to be considered helpful. It does not address any specific question or provide any useful insights.'" - ] - }, - "execution_count": 129, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_metric = NumericMetrics(\n", - " name='helpfulness',\n", - " llm=LLM(),\n", - " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", - " range=(0,10),\n", - ")\n", - "\n", - "result = my_metric.score(response=\"this is my response\")\n", - "result #gives \"low\"\n", - "result.reason #gives reasoning from llm\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 130, - "id": "b0994c80-c6db-4f3b-9ed9-1b32d61428c6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[0.0]" - ] - }, - "execution_count": 130, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "my_metric.batch_score(inputs=[{\"response\":\"this is my response\"}])\n" - ] - }, - { - "cell_type": "markdown", - "id": "c96520ae-294b-4868-8b2f-22a30ebd5f25", - "metadata": {}, - "source": [ - "### decorator factory" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "265af384-ed35-4262-acfe-6847b22d3089", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "numeric_metric = create_metric_decorator(NumericMetrics)" - ] - }, - { - "cell_type": "code", - "execution_count": 132, - "id": "009c1944-bda7-41b1-9235-dcda5acbed55", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "20" - ] - }, - "execution_count": 132, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "@numeric_metric(llm=LLM(),\n", - " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", - " name='new_metric',range=(0,10))\n", - "def my_metric(llm,prompt,**kwargs):\n", - "\n", - " class response_model(BaseModel):\n", - " output: int\n", - " reason: str\n", - " \n", - " response = llm.generate(prompt.format(**kwargs),response_model=response_model)\n", - " total = response.output\n", - " if total < 1:\n", - " score = 0\n", - " else:\n", - " score = 10\n", - " return score,\"reason\"\n", - "\n", - "result = my_metric.score(response='my response') # result\n", - "result # 10\n", - "result.reason # the reason for the answer\n", - "\n", - "result1 = my_metric.score(response='my response 1') # result\n", - "result2 = my_metric.score(response='my response 2') # result\n", - "\n", - "result1 + result2 # should be addable and behave like a float\n" - ] - }, - { - "cell_type": "markdown", - "id": "90794704-5e45-4dd5-8862-b4fb9694a5b5", - "metadata": {}, - "source": [ - "### Ranking metric" - ] - }, - { - "cell_type": "code", - "execution_count": 145, - "id": "9e2bb718-ba9a-4965-a952-462ac0159766", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "typing.Literal[[0, 1, 2]]" - ] - }, - "execution_count": 145, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "t.Literal[[i for i in range(3)]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b1bbf8f-c7fa-4004-9165-fa388f7ba15d", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "\n", - "@dataclass\n", - "class RankingMetrics(Metric):\n", - " num_ranks: int\n", - " \n", - " def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:\n", - " \"\"\"Get or create a response model based on reasoning parameter.\"\"\"\n", - " \n", - " if with_reasoning in self._response_models:\n", - " return self._response_models[with_reasoning]\n", - " \n", - " 
model_name = 'response_model'\n", - "\n", - " # Custom validator to ensure 'result' is a permutation of 0 .. num_ranks-1\n", - " def validate_result(cls, v):\n", - " expected = set(range(self.num_ranks))\n", - " if set(v) != expected:\n", - " raise ValueError(\n", - " f\"'result' must contain exactly the numbers {sorted(expected)} without repetition.\"\n", - " )\n", - " return v\n", - "\n", - " # Define the fields dynamically\n", - " fields = {\"result\": (List[int], ...)}\n", - " if with_reasoning:\n", - " fields[\"reason\"] = (str, ...)\n", - " \n", - " # Create the dynamic model with the custom validator attached\n", - " model = create_model(\n", - " model_name,\n", - " **fields,\n", - " __validators__={\n", - " 'result_validator': validator('result', allow_reuse=True)(validate_result)\n", - " }\n", - " )\n", - " self._response_models[with_reasoning] = model\n", - " return model \n", - "\n", - " def _ensemble(self,results:t.List[MetricResult]) -> MetricResult:\n", - "\n", - " if len(results)==1:\n", - " return results[0]\n", - "\n", - " n_items = len(results)\n", - " borda_scores = [0] * n_items\n", - "\n", - " for result in results:\n", - " for position_idx,item_idx in enumerate(result.result):\n", - " borda_scores[item_idx] += (n_items - (position_idx-1))\n", - "\n", - " indexed_scores = [(score, i) for i, score in enumerate(borda_scores)] \n", - " indexed_scores.sort(key=lambda x: (-x[0], x[1])) \n", - " final_ranking = [pos for _, pos in indexed_scores]\n", - "\n", - " if any(r.reason for r in results):\n", - " reason = \"Ensemble ranking based on multiple evaluations.\\n\" + '\\n'.join([r.reason for r in results if r.reason])\n", - " else:\n", - " reason = None\n", - " \n", - " \n", - " return MetricResult(result=result,reason=reason)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "716881a1-0a93-46b3-b41b-aee0f987a1a6", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/var/folders/ww/sk5dkfhn673234cmy5w7008r0000gn/T/ipykernel_95467/972172485.py:40: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. 
See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/\n", - " 'result_validator': validator('result', allow_reuse=True)(validate_result)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[0, 1, 2]\n", - "Ensemble ranking based on multiple evaluations.\n", - "The responses are ranked from the shortest to the longest and most detailed.\n", - "The responses are ranked from shortest to longest and most detailed.\n", - "Responses ranked from shortest to longest.\n" - ] - } - ], - "source": [ - "# User instantiates a ranking metric by providing a name, an LLM, a prompt template, and the number of rankings desired.\n", - "my_ranking_metric = RankingMetrics(\n", - " name='response_ranking',\n", - " llm=LLM(), # Your language model instance\n", - " prompt=\"Rank the following responses:\\n{candidates}\",\n", - " num_ranks=3,\n", - ")\n", - "\n", - "# To score a single input (ranking candidate responses)\n", - "result = my_ranking_metric.score(candidates=[\n", - " \"short answer.\",\n", - " \"a bit more detailed.\",\n", - " \"the longest and most detailed answer.\"\n", - "],n=3)\n", - "print(result) # Might output something like: [1, 0, 2]\n", - "print(result.reason) # Provides the reasoning behind the ranking\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9e198d7d-fbab-448e-aab1-f10f4234dff6", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "5b53bd5e-06c9-4430-9c06-f2225ddd7bd5", - "metadata": {}, - "source": [ - "### decorator factory for ranking metric" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c4e9170-67b9-4841-9df2-6afc490b89dd", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "ranking_metric = create_metric_decorator(RankingMetrics)" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "cbb1729b-8b25-48d8-a472-c03dd1e0d861", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[1, 0, 2]\n", - "Ranked based on response clarity and detail.\n" - ] - } - ], - "source": [ - "@ranking_metric(\n", - " llm=LLM(), # Your language model instance\n", - " prompt=\"Rank the following responses:\\n{candidates}\",\n", - " name='new_ranking_metric',\n", - " num_ranks=3\n", - ")\n", - "def my_ranking_metric(llm, prompt, **kwargs):\n", - " # Your custom logic that calls the LLM and returns a tuple of (ranking, reason)\n", - " # For example, process the prompt (formatted with candidates) and produce a ranking.\n", - " ranking = [1, 0, 2] # Dummy ranking: second candidate is best, then first, then third.\n", - " reason = \"Ranked based on response clarity and detail.\"\n", - " return ranking, reason\n", - "\n", - "# Using the decorator-based ranking metric:\n", - "result = my_ranking_metric.score(candidates=[\n", - " \"Response A: short answer.\",\n", - " \"Response B: a bit more detailed.\",\n", - " \"Response C: the longest and most detailed answer.\"\n", - "])\n", - "print(result) # E.g., [1, 0, 2]\n", - "print(result.reason) # E.g., \"Ranked based on response clarity and detail.\"\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "23e38ce5-aac9-489b-96c0-947011dbbdf7", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "random", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - 
"mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.16" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From a7e335c090272997d6c8917457e0b2876e26972c Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Fri, 21 Mar 2025 16:21:15 -0700 Subject: [PATCH 08/17] clean notebooks --- nbs/metric/base.ipynb | 14 +++++--------- nbs/metric/decorator.ipynb | 20 ++++---------------- nbs/metric/discrete.ipynb | 24 ++++++------------------ nbs/metric/llm.ipynb | 6 +----- nbs/metric/numeric.ipynb | 20 ++++---------------- nbs/metric/ranking.ipynb | 18 +++--------------- nbs/metric/result.ipynb | 18 +++--------------- 7 files changed, 26 insertions(+), 94 deletions(-) diff --git a/nbs/metric/base.ipynb b/nbs/metric/base.ipynb index 53c79f7..57d1fe4 100644 --- a/nbs/metric/base.ipynb +++ b/nbs/metric/base.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "125fcb9a", "metadata": {}, "outputs": [ @@ -22,7 +22,7 @@ "True" ] }, - "execution_count": 4, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "e8ccff58", "metadata": {}, "outputs": [ @@ -169,7 +169,7 @@ "0" ] }, - "execution_count": 6, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -188,11 +188,7 @@ "source": [] } ], - "metadata": { - "language_info": { - "name": "python" - } - }, + "metadata": {}, "nbformat": 4, "nbformat_minor": 2 } diff --git a/nbs/metric/decorator.ipynb b/nbs/metric/decorator.ipynb index b30f6dc..e706128 100644 --- a/nbs/metric/decorator.ipynb +++ b/nbs/metric/decorator.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -28,7 +28,7 @@ "True" ] }, - "execution_count": 4, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -224,21 +224,9 @@ ], "metadata": { "kernelspec": { - "display_name": "random", + "display_name": "python3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.16" } }, "nbformat": 4, diff --git a/nbs/metric/discrete.ipynb b/nbs/metric/discrete.ipynb index af2d94a..cd3346f 100644 --- a/nbs/metric/discrete.ipynb +++ b/nbs/metric/discrete.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -20,7 +20,7 @@ "True" ] }, - "execution_count": 1, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -98,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -136,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -181,21 +181,9 @@ ], "metadata": { "kernelspec": { - "display_name": "random", + "display_name": "python3", "language": "python", "name": "python3" - }, - 
"language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.16" } }, "nbformat": 4, diff --git a/nbs/metric/llm.ipynb b/nbs/metric/llm.ipynb index 9a3ab54..b2cc66a 100644 --- a/nbs/metric/llm.ipynb +++ b/nbs/metric/llm.ipynb @@ -49,11 +49,7 @@ ] } ], - "metadata": { - "language_info": { - "name": "python" - } - }, + "metadata": {}, "nbformat": 4, "nbformat_minor": 2 } diff --git a/nbs/metric/numeric.ipynb b/nbs/metric/numeric.ipynb index 033e9ba..c0cf2e4 100644 --- a/nbs/metric/numeric.ipynb +++ b/nbs/metric/numeric.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -84,7 +84,7 @@ "'The response does not provide any context or information that can be evaluated as helpful.'" ] }, - "execution_count": 7, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -129,7 +129,7 @@ "20" ] }, - "execution_count": 8, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -173,21 +173,9 @@ ], "metadata": { "kernelspec": { - "display_name": "random", + "display_name": "python3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.16" } }, "nbformat": 4, diff --git a/nbs/metric/ranking.ipynb b/nbs/metric/ranking.ipynb index 96f8275..0106b3a 100644 --- a/nbs/metric/ranking.ipynb +++ b/nbs/metric/ranking.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -163,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -209,21 +209,9 @@ ], "metadata": { "kernelspec": { - "display_name": "random", + "display_name": "python3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.16" } }, "nbformat": 4, diff --git a/nbs/metric/result.ipynb b/nbs/metric/result.ipynb index 167962c..cba95c7 100644 --- a/nbs/metric/result.ipynb +++ b/nbs/metric/result.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "164726f3", "metadata": {}, "outputs": [], @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "0f1c801a-6568-4ba4-8bbe-30bf154174fe", "metadata": {}, "outputs": [], @@ -254,21 +254,9 @@ ], "metadata": { "kernelspec": { - "display_name": "random", + "display_name": "python3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.16" } }, "nbformat": 4, From c2d090d1a3ce9af3dd86f0cb5c5c46fe5aaac3c3 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Fri, 21 Mar 2025 16:24:04 -0700 Subject: [PATCH 09/17] nbs utils --- 
nbs/sidebar.yml | 22 ++++++++- ragas_annotator/_modidx.py | 91 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 2 deletions(-) diff --git a/nbs/sidebar.yml b/nbs/sidebar.yml index f065330..c6a1ec2 100644 --- a/nbs/sidebar.yml +++ b/nbs/sidebar.yml @@ -4,17 +4,35 @@ website: - index.ipynb - dataset.ipynb - experiment.ipynb + - init_module.ipynb - section: backends contents: + - backends/factory.ipynb + - backends/mock_notion_client.ipynb - backends/notion.ipynb + - section: metric + contents: + - metric/base.ipynb + - metric/decorator.ipynb + - metric/discrete.ipynb + - metric/llm.ipynb + - metric/numeric.ipynb + - metric/ranking.ipynb + - metric/result.ipynb + - metric/test_base.ipynb - section: model contents: - model/notion_model.ipynb - model/notion_types.ipynb - section: project contents: - - project/experiment.ipynb - - project/project.ipynb + - project/comparison.ipynb + - project/core.ipynb + - project/experiments.ipynb + - project/naming.ipynb + - section: tracing + contents: + - tracing/langfuse.ipynb - section: utils contents: - utils/exceptions.ipynb diff --git a/ragas_annotator/_modidx.py b/ragas_annotator/_modidx.py index 58458aa..c055929 100644 --- a/ragas_annotator/_modidx.py +++ b/ragas_annotator/_modidx.py @@ -131,6 +131,97 @@ 'ragas_annotator/experiment.py'), 'ragas_annotator.experiment.Experiment.__str__': ( 'experiment.html#experiment.__str__', 'ragas_annotator/experiment.py')}, + 'ragas_annotator.metric.base': { 'ragas_annotator.metric.base.Metric': ( 'metric/base.html#metric', + 'ragas_annotator/metric/base.py'), + 'ragas_annotator.metric.base.Metric._ensemble': ( 'metric/base.html#metric._ensemble', + 'ragas_annotator/metric/base.py'), + 'ragas_annotator.metric.base.Metric._get_response_model': ( 'metric/base.html#metric._get_response_model', + 'ragas_annotator/metric/base.py'), + 'ragas_annotator.metric.base.Metric.abatch_score': ( 'metric/base.html#metric.abatch_score', + 'ragas_annotator/metric/base.py'), + 'ragas_annotator.metric.base.Metric.ascore': ( 'metric/base.html#metric.ascore', + 'ragas_annotator/metric/base.py'), + 'ragas_annotator.metric.base.Metric.batch_score': ( 'metric/base.html#metric.batch_score', + 'ragas_annotator/metric/base.py'), + 'ragas_annotator.metric.base.Metric.score': ( 'metric/base.html#metric.score', + 'ragas_annotator/metric/base.py')}, + 'ragas_annotator.metric.decorator': { 'ragas_annotator.metric.decorator.create_metric_decorator': ( 'metric/decorator.html#create_metric_decorator', + 'ragas_annotator/metric/decorator.py')}, + 'ragas_annotator.metric.discrete': { 'ragas_annotator.metric.discrete.DiscreteMetric': ( 'metric/discrete.html#discretemetric', + 'ragas_annotator/metric/discrete.py'), + 'ragas_annotator.metric.discrete.DiscreteMetric._ensemble': ( 'metric/discrete.html#discretemetric._ensemble', + 'ragas_annotator/metric/discrete.py'), + 'ragas_annotator.metric.discrete.DiscreteMetric._get_response_model': ( 'metric/discrete.html#discretemetric._get_response_model', + 'ragas_annotator/metric/discrete.py')}, + 'ragas_annotator.metric.llm': { 'ragas_annotator.metric.llm.LLM': ('metric/llm.html#llm', 'ragas_annotator/metric/llm.py'), + 'ragas_annotator.metric.llm.LLM.__post_init__': ( 'metric/llm.html#llm.__post_init__', + 'ragas_annotator/metric/llm.py'), + 'ragas_annotator.metric.llm.LLM.agenerate': ( 'metric/llm.html#llm.agenerate', + 'ragas_annotator/metric/llm.py'), + 'ragas_annotator.metric.llm.LLM.generate': ( 'metric/llm.html#llm.generate', + 'ragas_annotator/metric/llm.py')}, + 
'ragas_annotator.metric.numeric': { 'ragas_annotator.metric.numeric.NumericMetric': ( 'metric/numeric.html#numericmetric', + 'ragas_annotator/metric/numeric.py'), + 'ragas_annotator.metric.numeric.NumericMetric._ensemble': ( 'metric/numeric.html#numericmetric._ensemble', + 'ragas_annotator/metric/numeric.py'), + 'ragas_annotator.metric.numeric.NumericMetric._get_response_model': ( 'metric/numeric.html#numericmetric._get_response_model', + 'ragas_annotator/metric/numeric.py')}, + 'ragas_annotator.metric.ranking': { 'ragas_annotator.metric.ranking.RankingMetric': ( 'metric/ranking.html#rankingmetric', + 'ragas_annotator/metric/ranking.py'), + 'ragas_annotator.metric.ranking.RankingMetric._ensemble': ( 'metric/ranking.html#rankingmetric._ensemble', + 'ragas_annotator/metric/ranking.py'), + 'ragas_annotator.metric.ranking.RankingMetric._get_response_model': ( 'metric/ranking.html#rankingmetric._get_response_model', + 'ragas_annotator/metric/ranking.py')}, + 'ragas_annotator.metric.result': { 'ragas_annotator.metric.result.MetricResult': ( 'metric/result.html#metricresult', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__add__': ( 'metric/result.html#metricresult.__add__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__eq__': ( 'metric/result.html#metricresult.__eq__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__float__': ( 'metric/result.html#metricresult.__float__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__ge__': ( 'metric/result.html#metricresult.__ge__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__getattr__': ( 'metric/result.html#metricresult.__getattr__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__getitem__': ( 'metric/result.html#metricresult.__getitem__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__gt__': ( 'metric/result.html#metricresult.__gt__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__init__': ( 'metric/result.html#metricresult.__init__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__int__': ( 'metric/result.html#metricresult.__int__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__iter__': ( 'metric/result.html#metricresult.__iter__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__le__': ( 'metric/result.html#metricresult.__le__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__len__': ( 'metric/result.html#metricresult.__len__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__lt__': ( 'metric/result.html#metricresult.__lt__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__mul__': ( 'metric/result.html#metricresult.__mul__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__radd__': ( 'metric/result.html#metricresult.__radd__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__repr__': ( 'metric/result.html#metricresult.__repr__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__rmul__': ( 'metric/result.html#metricresult.__rmul__', + 'ragas_annotator/metric/result.py'), + 
'ragas_annotator.metric.result.MetricResult.__rsub__': ( 'metric/result.html#metricresult.__rsub__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__rtruediv__': ( 'metric/result.html#metricresult.__rtruediv__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__str__': ( 'metric/result.html#metricresult.__str__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__sub__': ( 'metric/result.html#metricresult.__sub__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.__truediv__': ( 'metric/result.html#metricresult.__truediv__', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.result': ( 'metric/result.html#metricresult.result', + 'ragas_annotator/metric/result.py'), + 'ragas_annotator.metric.result.MetricResult.to_dict': ( 'metric/result.html#metricresult.to_dict', + 'ragas_annotator/metric/result.py')}, 'ragas_annotator.model.notion_model': { 'ragas_annotator.model.notion_model.NotionModel': ( 'model/notion_model.html#notionmodel', 'ragas_annotator/model/notion_model.py'), 'ragas_annotator.model.notion_model.NotionModel.__getattr__': ( 'model/notion_model.html#notionmodel.__getattr__', From 9a3ee21d0312f504eddf227954b259ad75b330b7 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Fri, 21 Mar 2025 17:05:19 -0700 Subject: [PATCH 10/17] add openai --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 5936f99..9d4a726 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,4 +5,6 @@ build-backend = "setuptools.build_meta" [dependency-groups] dev = [ "nbdev>=2.3.35", + "openai", + ] From af379d4a59065eb3ef0e563e15724595e77d3693 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Fri, 21 Mar 2025 17:12:41 -0700 Subject: [PATCH 11/17] add openai --- nbs/metric/discrete.ipynb | 22 ---------------------- nbs/metric/llm.ipynb | 12 +++++++++++- nbs/metric/numeric.ipynb | 5 ----- nbs/metric/ranking.ipynb | 7 ------- pyproject.toml | 2 -- ragas_annotator/metric/discrete.py | 2 +- settings.ini | 2 +- 7 files changed, 13 insertions(+), 39 deletions(-) diff --git a/nbs/metric/discrete.ipynb b/nbs/metric/discrete.ipynb index cd3346f..1ea80b9 100644 --- a/nbs/metric/discrete.ipynb +++ b/nbs/metric/discrete.ipynb @@ -9,28 +9,6 @@ "#| default_exp metric.discrete" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#| hide\n", - "from dotenv import load_dotenv\n", - "load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')" - ] - }, { "cell_type": "markdown", "metadata": {}, diff --git a/nbs/metric/llm.ipynb b/nbs/metric/llm.ipynb index b2cc66a..7914ee0 100644 --- a/nbs/metric/llm.ipynb +++ b/nbs/metric/llm.ipynb @@ -49,7 +49,17 @@ ] } ], - "metadata": {}, + "metadata": { + "kernelspec": { + "display_name": "random", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.16" + } + }, "nbformat": 4, "nbformat_minor": 2 } diff --git a/nbs/metric/numeric.ipynb b/nbs/metric/numeric.ipynb index c0cf2e4..b78d203 100644 --- a/nbs/metric/numeric.ipynb +++ b/nbs/metric/numeric.ipynb @@ -91,13 +91,8 @@ ], "source": [ "\n", - "from dotenv import load_dotenv\n", "from ragas_annotator.metric.llm import LLM\n", "\n", - "\n", - "\n", - 
"load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')\n", - "\n", "my_metric = NumericMetric(\n", " name='helpfulness',\n", " llm=LLM(),\n", diff --git a/nbs/metric/ranking.ipynb b/nbs/metric/ranking.ipynb index 0106b3a..a5cd3ba 100644 --- a/nbs/metric/ranking.ipynb +++ b/nbs/metric/ranking.ipynb @@ -127,15 +127,8 @@ ], "source": [ "\n", - "\n", - "from dotenv import load_dotenv\n", "from ragas_annotator.metric.llm import LLM\n", "\n", - "\n", - "\n", - "load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')\n", - "\n", - "# User instantiates a ranking metric by providing a name, an LLM, a prompt template, and the number of rankings desired.\n", "my_ranking_metric = RankingMetric(\n", " name='response_ranking',\n", " llm=LLM(), # Your language model instance\n", diff --git a/pyproject.toml b/pyproject.toml index 9d4a726..5936f99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,4 @@ build-backend = "setuptools.build_meta" [dependency-groups] dev = [ "nbdev>=2.3.35", - "openai", - ] diff --git a/ragas_annotator/metric/discrete.py b/ragas_annotator/metric/discrete.py index 63c4965..c35c2db 100644 --- a/ragas_annotator/metric/discrete.py +++ b/ragas_annotator/metric/discrete.py @@ -5,7 +5,7 @@ # %% auto 0 __all__ = ['discrete_metric', 'DiscreteMetric'] -# %% ../../nbs/metric/discrete.ipynb 3 +# %% ../../nbs/metric/discrete.ipynb 2 import typing as t from dataclasses import dataclass, field from pydantic import BaseModel, create_model diff --git a/settings.ini b/settings.ini index 07ddb41..655c1c3 100644 --- a/settings.ini +++ b/settings.ini @@ -38,7 +38,7 @@ status = 3 user = explodinggradients ### Dependencies ### -requirements = notion-client fastcore tqdm langfuse +requirements = notion-client fastcore tqdm langfuse openai dev_requirements = pytest # console_scripts = # conda_user = From bd33547ecb201a3c3acf380b481e970f1d5e7fcb Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Fri, 21 Mar 2025 17:14:15 -0700 Subject: [PATCH 12/17] clean --- nbs/metric/llm.ipynb | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/nbs/metric/llm.ipynb b/nbs/metric/llm.ipynb index 7914ee0..6ceca63 100644 --- a/nbs/metric/llm.ipynb +++ b/nbs/metric/llm.ipynb @@ -51,13 +51,9 @@ ], "metadata": { "kernelspec": { - "display_name": "random", + "display_name": "python3", "language": "python", "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.10.16" } }, "nbformat": 4, From fdd9289e818cd142eaff10051482a32bc30406f4 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Fri, 21 Mar 2025 17:21:44 -0700 Subject: [PATCH 13/17] add dependencies --- settings.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/settings.ini b/settings.ini index 655c1c3..0215d37 100644 --- a/settings.ini +++ b/settings.ini @@ -38,7 +38,7 @@ status = 3 user = explodinggradients ### Dependencies ### -requirements = notion-client fastcore tqdm langfuse openai +requirements = notion-client fastcore tqdm langfuse openai instructor pydantic dev_requirements = pytest # console_scripts = # conda_user = From 7977bfedbab8e8be2188cd0a83e2bff4dc7ce8bc Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Fri, 21 Mar 2025 17:26:15 -0700 Subject: [PATCH 14/17] clean notebooks --- nbs/metric/base.ipynb | 23 ----------------------- nbs/metric/decorator.ipynb | 26 ++------------------------ ragas_annotator/metric/base.py | 2 +- ragas_annotator/metric/decorator.py | 2 +- 4 files changed, 4 insertions(+), 49 deletions(-) diff --git a/nbs/metric/base.ipynb 
b/nbs/metric/base.ipynb index 57d1fe4..61fcc04 100644 --- a/nbs/metric/base.ipynb +++ b/nbs/metric/base.ipynb @@ -10,29 +10,6 @@ "#| default_exp metric.base" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "125fcb9a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#| hide\n", - "from dotenv import load_dotenv\n", - "load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')" - ] - }, { "cell_type": "markdown", "id": "2eb8f806", diff --git a/nbs/metric/decorator.ipynb b/nbs/metric/decorator.ipynb index e706128..a3114e1 100644 --- a/nbs/metric/decorator.ipynb +++ b/nbs/metric/decorator.ipynb @@ -17,28 +17,6 @@ "> decorator factory for creating custom metrics" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#| hide\n", - "from dotenv import load_dotenv\n", - "load_dotenv('/Users/shahules/Myprojects/ragas_annotator/.envrc')" - ] - }, { "cell_type": "code", "execution_count": null, @@ -180,7 +158,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "low\n", + "high\n", "reason\n" ] } @@ -211,7 +189,7 @@ "\n", "result = my_metric.score(response='my response') # result\n", "print(result)\n", - "print(result.reason" + "print(result.reason)" ] }, { diff --git a/ragas_annotator/metric/base.py b/ragas_annotator/metric/base.py index a6349be..d37b9c5 100644 --- a/ragas_annotator/metric/base.py +++ b/ragas_annotator/metric/base.py @@ -5,7 +5,7 @@ # %% auto 0 __all__ = ['Metric'] -# %% ../../nbs/metric/base.ipynb 3 +# %% ../../nbs/metric/base.ipynb 2 from abc import ABC, abstractmethod import asyncio from dataclasses import dataclass, field diff --git a/ragas_annotator/metric/decorator.py b/ragas_annotator/metric/decorator.py index 378a2fd..016773a 100644 --- a/ragas_annotator/metric/decorator.py +++ b/ragas_annotator/metric/decorator.py @@ -5,7 +5,7 @@ # %% auto 0 __all__ = ['create_metric_decorator'] -# %% ../../nbs/metric/decorator.ipynb 3 +# %% ../../nbs/metric/decorator.ipynb 2 import typing as t import inspect import asyncio From c3c779d701bd7b02f364c33749eba33b6c59eb0f Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Fri, 21 Mar 2025 17:31:05 -0700 Subject: [PATCH 15/17] ignore tests --- nbs/metric/base.ipynb | 2 +- nbs/metric/decorator.ipynb | 3 +++ nbs/metric/discrete.ipynb | 11 ++++++++++- nbs/metric/numeric.ipynb | 3 +++ nbs/metric/ranking.ipynb | 5 +++++ 5 files changed, 22 insertions(+), 2 deletions(-) diff --git a/nbs/metric/base.ipynb b/nbs/metric/base.ipynb index 61fcc04..7f0cbeb 100644 --- a/nbs/metric/base.ipynb +++ b/nbs/metric/base.ipynb @@ -112,7 +112,7 @@ "metadata": {}, "outputs": [], "source": [ - "\n", + "#| eval: false\n", "\n", "@dataclass\n", "class CustomMetric(Metric):\n", diff --git a/nbs/metric/decorator.ipynb b/nbs/metric/decorator.ipynb index a3114e1..70131f0 100644 --- a/nbs/metric/decorator.ipynb +++ b/nbs/metric/decorator.ipynb @@ -164,6 +164,9 @@ } ], "source": [ + "#| eval: false\n", + "\n", + "\n", "from ragas_annotator.metric import DiscreteMetric\n", "from ragas_annotator.metric.llm import LLM\n", "from pydantic import BaseModel\n", diff --git a/nbs/metric/discrete.ipynb b/nbs/metric/discrete.ipynb index 1ea80b9..46f9f50 100644 --- a/nbs/metric/discrete.ipynb +++ b/nbs/metric/discrete.ipynb @@ 
-74,6 +74,13 @@ "discrete_metric = create_metric_decorator(DiscreteMetric)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example usage" + ] + }, { "cell_type": "code", "execution_count": null, @@ -89,7 +96,9 @@ } ], "source": [ - "## Example usage\n", + "\n", + "#| eval: false\n", + "\n", "from ragas_annotator.metric.llm import LLM\n", "\n", "my_metric = DiscreteMetric(\n", diff --git a/nbs/metric/numeric.ipynb b/nbs/metric/numeric.ipynb index b78d203..e3b08b0 100644 --- a/nbs/metric/numeric.ipynb +++ b/nbs/metric/numeric.ipynb @@ -90,6 +90,7 @@ } ], "source": [ + "#| eval: false\n", "\n", "from ragas_annotator.metric.llm import LLM\n", "\n", @@ -130,6 +131,8 @@ } ], "source": [ + "\n", + "#| eval: false\n", "\n", "@numeric_metric(llm=LLM(),\n", " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", diff --git a/nbs/metric/ranking.ipynb b/nbs/metric/ranking.ipynb index a5cd3ba..48e2aa3 100644 --- a/nbs/metric/ranking.ipynb +++ b/nbs/metric/ranking.ipynb @@ -126,6 +126,8 @@ } ], "source": [ + "\n", + "#| eval: false\n", "\n", "from ragas_annotator.metric.llm import LLM\n", "\n", @@ -169,6 +171,9 @@ } ], "source": [ + "#| eval: false\n", + "\n", + "\n", "@ranking_metric(\n", " llm=LLM(), # Your language model instance\n", " prompt=\"Rank the following responses:\\n{candidates}\",\n", From c5f614bbcbc8fb28d6c443a976546d1b08f3af21 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Fri, 21 Mar 2025 17:33:13 -0700 Subject: [PATCH 16/17] ignore tests --- nbs/metric/base.ipynb | 29 ----------------------------- nbs/metric/discrete.ipynb | 1 + 2 files changed, 1 insertion(+), 29 deletions(-) diff --git a/nbs/metric/base.ipynb b/nbs/metric/base.ipynb index 7f0cbeb..c6d6e24 100644 --- a/nbs/metric/base.ipynb +++ b/nbs/metric/base.ipynb @@ -131,38 +131,9 @@ " \n", " return results[0] # Placeholder for ensemble logic\n", "\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9ba99094", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ "my_metric = CustomMetric(name=\"example\", prompt=\"What is the result of {input}?\", llm=LLM())\n", "my_metric.score(input=\"test\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1327f250", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {}, diff --git a/nbs/metric/discrete.ipynb b/nbs/metric/discrete.ipynb index 46f9f50..c99cdb1 100644 --- a/nbs/metric/discrete.ipynb +++ b/nbs/metric/discrete.ipynb @@ -136,6 +136,7 @@ } ], "source": [ + "#| eval: false\n", "@discrete_metric(llm=LLM(),\n", " prompt=\"Evaluate if given answer is helpful\\n\\n{response}\",\n", " name='new_metric',values=[\"low\",\"med\",\"high\"])\n", From 334ec8d95114124473dc80122360a51453afcb4a Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Fri, 21 Mar 2025 18:08:32 -0700 Subject: [PATCH 17/17] fix discrete --- nbs/metric/discrete.ipynb | 2 +- ragas_annotator/metric/discrete.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nbs/metric/discrete.ipynb b/nbs/metric/discrete.ipynb index c99cdb1..c27815c 100644 --- a/nbs/metric/discrete.ipynb +++ b/nbs/metric/discrete.ipynb @@ -66,7 +66,7 @@ " if counter[candidate.result] == max_count:\n", " result = candidate.result \n", " reason = candidate.reason\n", - " break\n", + " return MetricResult(result=result, reason=reason)\n", " \n", " return results[0]\n", "\n", diff --git 
a/ragas_annotator/metric/discrete.py b/ragas_annotator/metric/discrete.py index c35c2db..d4f77f7 100644 --- a/ragas_annotator/metric/discrete.py +++ b/ragas_annotator/metric/discrete.py @@ -48,7 +48,7 @@ def _ensemble(self,results:t.List[MetricResult]) -> MetricResult: if counter[candidate.result] == max_count: result = candidate.result reason = candidate.reason - break + return MetricResult(result=result, reason=reason) return results[0]
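
The last patch above (PATCH 17/17) changes `_ensemble` in `ragas_annotator/metric/discrete.py` so that the majority-vote branch returns the winning `MetricResult` directly instead of `break`ing and falling through to `return results[0]`, which silently dropped the winner's reason. A minimal, standalone sketch of the corrected voting logic follows; it uses a simplified dataclass stand-in for `MetricResult` rather than the library class, so names and behavior outside the voting loop are assumptions, not the package's actual implementation.

```python
from collections import Counter
from dataclasses import dataclass
import typing as t


@dataclass
class MetricResult:
    # Simplified stand-in for ragas_annotator.metric.result.MetricResult,
    # used only to make this sketch self-contained.
    result: t.Any
    reason: t.Optional[str] = None


def ensemble(results: t.List[MetricResult]) -> MetricResult:
    """Majority vote over discrete results, keeping the reason of a winning candidate."""
    if len(results) == 1:
        return results[0]

    # Count how often each discrete value (e.g. "low"/"med"/"high") appears.
    counter = Counter(r.result for r in results)
    max_count = max(counter.values())

    # Return the first candidate whose value has the winning count, so its
    # reason travels with the aggregated result. This is the behavior the
    # patch restores: with `break`, the code fell through to results[0] and
    # discarded the matched candidate's reason.
    for candidate in results:
        if counter[candidate.result] == max_count:
            return MetricResult(result=candidate.result, reason=candidate.reason)

    return results[0]


if __name__ == "__main__":
    votes = [
        MetricResult("high", "clear and complete"),
        MetricResult("low", "missed the question"),
        MetricResult("high", "covers all points"),
    ]
    print(ensemble(votes))
    # MetricResult(result='high', reason='clear and complete')
```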