From cbc0d14b68503b646f05a52c245d2d049893fe0a Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 25 Nov 2024 16:41:09 +0530 Subject: [PATCH 01/36] add optimizer class --- src/ragas/optimizers/base.py | 37 ++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 src/ragas/optimizers/base.py diff --git a/src/ragas/optimizers/base.py b/src/ragas/optimizers/base.py new file mode 100644 index 000000000..b8e2b1ad1 --- /dev/null +++ b/src/ragas/optimizers/base.py @@ -0,0 +1,37 @@ +import typing as t +from abc import ABC, abstractmethod + +from ragas.config import InstructionConfig +from ragas.metrics.base import MetricWithLLM + + +class Optimizer(ABC): + """ + Abstract base class for all optimizers. + """ + + @abstractmethod + def optimize( + self, + metric: MetricWithLLM, + train_data: t.Any, + config: InstructionConfig, + ) -> MetricWithLLM: + """ + Optimizes the prompts for the given metric. + + Parameters + ---------- + metric : MetricWithLLM + The metric to optimize. + train_data : Any + The training data. + config : InstructionConfig + The training configuration. + + Returns + ------- + MetricWithLLM + The optimized metric. + """ + pass From bba3004bc22aa2d4db7a9e0f8e1cf121702885db Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 25 Nov 2024 16:41:21 +0530 Subject: [PATCH 02/36] add train method --- src/ragas/metrics/base.py | 16 ++++++++++++++++ src/ragas/optimizers/__init__.py | 0 2 files changed, 16 insertions(+) create mode 100644 src/ragas/optimizers/__init__.py diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index 978954ec7..0ef7fc437 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import json import logging import typing as t from abc import ABC, abstractmethod @@ -11,6 +12,7 @@ from pysbd import Segmenter from ragas.callbacks import ChainType, new_group +from ragas.config import DemonstrationConfig, InstructionConfig from ragas.dataset_schema import MultiTurnSample, SingleTurnSample from ragas.executor import is_event_loop_running from ragas.prompt import PromptMixin @@ -213,6 +215,20 @@ def init(self, run_config: RunConfig): ) self.llm.set_run_config(run_config) + def train( + self, + path: str, + demonstration_config: DemonstrationConfig, + instruction_config: InstructionConfig, + callbacks: Callbacks, + ) -> None: + + if not path.endswith(".json"): + raise ValueError("Train data must be in json format") + + _ = json.load(open(path)) + return + @dataclass class MetricWithEmbeddings(Metric): diff --git a/src/ragas/optimizers/__init__.py b/src/ragas/optimizers/__init__.py new file mode 100644 index 000000000..e69de29bb From 1ee5e069ac2ea6f5e78083b412156ddf5505a40b Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 25 Nov 2024 16:41:33 +0530 Subject: [PATCH 03/36] train configs --- src/ragas/config.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 src/ragas/config.py diff --git a/src/ragas/config.py b/src/ragas/config.py new file mode 100644 index 000000000..58b52d06d --- /dev/null +++ b/src/ragas/config.py @@ -0,0 +1,23 @@ +import typing as t + +from pydantic import BaseModel, Field + +# from ragas.embeddings import BaseRagasEmbeddings +from ragas.llms import BaseRagasLLM + +DEFAULT_OPTIMIZER_CONFIG = {"max_steps": 100} + + +class DemonstrationConfig(BaseModel): + enabled: bool = True + top_k: int = 3 + technique: t.Literal["random", "similarity"] = "similarity" + # embedding: BaseRagasEmbeddings + + 
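Read together with the Metric.train() hook added in PATCH 02, these config classes suggest a training entry point roughly like the sketch below. The sketch is illustrative only and is not part of any patch: the metric, the wrapped LLM, and the annotation file path are assumed placeholders, and the exact fields shift in later patches (InstructionConfig's model is later renamed to llm, and optimizer/loss fields are added).

    # Hypothetical usage of the train() API introduced in this series (assumptions noted above).
    from langchain_openai import ChatOpenAI

    from ragas.config import DemonstrationConfig, InstructionConfig
    from ragas.llms import LangchainLLMWrapper
    from ragas.metrics import AspectCritic  # any MetricWithLLM subclass would do

    llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))  # placeholder LLM choice
    metric = AspectCritic(name="clarity", definition="Is the answer clear and direct?", llm=llm)

    demo_cfg = DemonstrationConfig(enabled=True, top_k=3, technique="similarity")
    inst_cfg = InstructionConfig(enabled=True, model=llm)  # renamed to llm=... in a later patch

    # Annotated samples exported as JSON, keyed by metric name (see the train() added in PATCH 02).
    metric.train(
        path="annotations.json",
        demonstration_config=demo_cfg,
        instruction_config=inst_cfg,
        callbacks=None,
    )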
+class InstructionConfig(BaseModel): + enabled: bool = True + optimizer_config: t.Dict[str, t.Any] = Field( + default_factory=lambda: DEFAULT_OPTIMIZER_CONFIG + ) + model: BaseRagasLLM From 35d3d2156dfa9e06fa242727d0c0cfaee81577ec Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 25 Nov 2024 17:16:35 +0530 Subject: [PATCH 04/36] add core schema --- src/ragas/embeddings/base.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/ragas/embeddings/base.py b/src/ragas/embeddings/base.py index 9981058cf..ca07be914 100644 --- a/src/ragas/embeddings/base.py +++ b/src/ragas/embeddings/base.py @@ -10,11 +10,14 @@ from langchain_core.embeddings import Embeddings from langchain_openai.embeddings import OpenAIEmbeddings from pydantic.dataclasses import dataclass +from pydantic_core import CoreSchema, core_schema from ragas.run_config import RunConfig, add_async_retry, add_retry if t.TYPE_CHECKING: from llama_index.core.base.embeddings.base import BaseEmbedding + from pydantic import GetCoreSchemaHandler + DEFAULT_MODEL_NAME = "BAAI/bge-small-en-v1.5" @@ -64,6 +67,17 @@ def set_run_config(self, run_config: RunConfig): """ self.run_config = run_config + @classmethod + def __get_pydantic_core_schema__( + cls, source_type: t.Any, handler: GetCoreSchemaHandler + ) -> CoreSchema: + """ + Define how Pydantic generates a schema for BaseRagasEmbeddings. + """ + return core_schema.no_info_after_validator_function( + cls, core_schema.is_instance_schema(cls) # The validator function + ) + class LangchainEmbeddingsWrapper(BaseRagasEmbeddings): """ From e4e362caeca6bcfcda3a47038bb3a3fbdc6d28ee Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 25 Nov 2024 17:16:49 +0530 Subject: [PATCH 05/36] add embedding --- src/ragas/config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ragas/config.py b/src/ragas/config.py index 58b52d06d..440cc0763 100644 --- a/src/ragas/config.py +++ b/src/ragas/config.py @@ -2,7 +2,7 @@ from pydantic import BaseModel, Field -# from ragas.embeddings import BaseRagasEmbeddings +from ragas.embeddings import BaseRagasEmbeddings from ragas.llms import BaseRagasLLM DEFAULT_OPTIMIZER_CONFIG = {"max_steps": 100} @@ -12,7 +12,7 @@ class DemonstrationConfig(BaseModel): enabled: bool = True top_k: int = 3 technique: t.Literal["random", "similarity"] = "similarity" - # embedding: BaseRagasEmbeddings + embedding: t.Optional[BaseRagasEmbeddings] = None class InstructionConfig(BaseModel): @@ -20,4 +20,4 @@ class InstructionConfig(BaseModel): optimizer_config: t.Dict[str, t.Any] = Field( default_factory=lambda: DEFAULT_OPTIMIZER_CONFIG ) - model: BaseRagasLLM + model: t.Optional[BaseRagasLLM] = None From 0e7ff2124bfed40ad8e390c19f2b2c8e56fc4641 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 25 Nov 2024 19:00:53 +0530 Subject: [PATCH 06/36] add type annotation --- src/ragas/config.py | 2 ++ src/ragas/metrics/base.py | 3 ++- src/ragas/optimizers/base.py | 8 ++++++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/ragas/config.py b/src/ragas/config.py index 440cc0763..05fb9086f 100644 --- a/src/ragas/config.py +++ b/src/ragas/config.py @@ -4,6 +4,7 @@ from ragas.embeddings import BaseRagasEmbeddings from ragas.llms import BaseRagasLLM +from ragas.optimizers.base import Optimizer DEFAULT_OPTIMIZER_CONFIG = {"max_steps": 100} @@ -17,6 +18,7 @@ class DemonstrationConfig(BaseModel): class InstructionConfig(BaseModel): enabled: bool = True + optimizer: Optimizer optimizer_config: t.Dict[str, t.Any] = Field( default_factory=lambda: 
DEFAULT_OPTIMIZER_CONFIG ) diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index 0ef7fc437..51e8eb4d0 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -12,7 +12,6 @@ from pysbd import Segmenter from ragas.callbacks import ChainType, new_group -from ragas.config import DemonstrationConfig, InstructionConfig from ragas.dataset_schema import MultiTurnSample, SingleTurnSample from ragas.executor import is_event_loop_running from ragas.prompt import PromptMixin @@ -22,8 +21,10 @@ if t.TYPE_CHECKING: from langchain_core.callbacks import Callbacks + from ragas.config import DemonstrationConfig, InstructionConfig from ragas.embeddings import BaseRagasEmbeddings from ragas.llms import BaseRagasLLM + logger = logging.getLogger(__name__) diff --git a/src/ragas/optimizers/base.py b/src/ragas/optimizers/base.py index b8e2b1ad1..a2875edec 100644 --- a/src/ragas/optimizers/base.py +++ b/src/ragas/optimizers/base.py @@ -1,7 +1,8 @@ import typing as t from abc import ABC, abstractmethod -from ragas.config import InstructionConfig +from ragas.embeddings.base import BaseRagasEmbeddings +from ragas.llms.base import BaseRagasLLM from ragas.metrics.base import MetricWithLLM @@ -10,12 +11,15 @@ class Optimizer(ABC): Abstract base class for all optimizers. """ + llm: BaseRagasLLM + embedding: t.Optional[BaseRagasEmbeddings] = None + @abstractmethod def optimize( self, metric: MetricWithLLM, train_data: t.Any, - config: InstructionConfig, + config: t.Dict[t.Any, t.Any], ) -> MetricWithLLM: """ Optimizes the prompts for the given metric. From b39be13edb340de9bc440447519de8b6e30d9569 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Tue, 26 Nov 2024 11:09:41 +0530 Subject: [PATCH 07/36] add train --- src/ragas/config.py | 6 +++--- src/ragas/metrics/base.py | 18 +++++++++++++++++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/ragas/config.py b/src/ragas/config.py index 05fb9086f..1af3b2553 100644 --- a/src/ragas/config.py +++ b/src/ragas/config.py @@ -4,7 +4,7 @@ from ragas.embeddings import BaseRagasEmbeddings from ragas.llms import BaseRagasLLM -from ragas.optimizers.base import Optimizer +from ragas.optimizers import GeneticOptimizer, Optimizer DEFAULT_OPTIMIZER_CONFIG = {"max_steps": 100} @@ -18,8 +18,8 @@ class DemonstrationConfig(BaseModel): class InstructionConfig(BaseModel): enabled: bool = True - optimizer: Optimizer + optimizer: Optimizer = GeneticOptimizer() optimizer_config: t.Dict[str, t.Any] = Field( default_factory=lambda: DEFAULT_OPTIMIZER_CONFIG ) - model: t.Optional[BaseRagasLLM] = None + llm: t.Optional[BaseRagasLLM] = None diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index 51e8eb4d0..645f61aed 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -227,7 +227,23 @@ def train( if not path.endswith(".json"): raise ValueError("Train data must be in json format") - _ = json.load(open(path)) + data = json.load(open(path)) + data = data.get(self.name) + if data is None: + raise ValueError(f"Metric '{self.name}' not found in train data") + + optimizer = instruction_config.optimizer + llm = instruction_config.llm or self.llm + if llm is None: + raise ValueError( + f"Metric '{self.name}' has no valid LLM provided (self.llm is None). Please initantiate a the metric with an LLM to run." 
# noqa + ) + if optimizer.llm is None: + optimizer.llm = llm + + optimizer_config = instruction_config.optimizer_config or {} + optimizer.optimize(self, data, optimizer_config, callbacks) + return From 6ccd177bbccf8680e140e46255813fe1386c5a45 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Tue, 26 Nov 2024 11:09:57 +0530 Subject: [PATCH 08/36] add optimizer genetic based --- src/ragas/optimizers/__init__.py | 7 +++++++ src/ragas/optimizers/base.py | 9 ++++++--- src/ragas/optimizers/genetic.py | 24 ++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 3 deletions(-) create mode 100644 src/ragas/optimizers/genetic.py diff --git a/src/ragas/optimizers/__init__.py b/src/ragas/optimizers/__init__.py index e69de29bb..2780da08f 100644 --- a/src/ragas/optimizers/__init__.py +++ b/src/ragas/optimizers/__init__.py @@ -0,0 +1,7 @@ +from ragas.optimizers.base import Optimizer +from ragas.optimizers.genetic import GeneticOptimizer + +__all__ = [ + "Optimizer", + "GeneticOptimizer", +] diff --git a/src/ragas/optimizers/base.py b/src/ragas/optimizers/base.py index a2875edec..c3bf4819c 100644 --- a/src/ragas/optimizers/base.py +++ b/src/ragas/optimizers/base.py @@ -1,18 +1,20 @@ import typing as t from abc import ABC, abstractmethod +from dataclasses import dataclass + +from langchain_core.callbacks import Callbacks -from ragas.embeddings.base import BaseRagasEmbeddings from ragas.llms.base import BaseRagasLLM from ragas.metrics.base import MetricWithLLM +@dataclass class Optimizer(ABC): """ Abstract base class for all optimizers. """ - llm: BaseRagasLLM - embedding: t.Optional[BaseRagasEmbeddings] = None + llm: t.Optional[BaseRagasLLM] = None @abstractmethod def optimize( @@ -20,6 +22,7 @@ def optimize( metric: MetricWithLLM, train_data: t.Any, config: t.Dict[t.Any, t.Any], + callbacks: Callbacks, ) -> MetricWithLLM: """ Optimizes the prompts for the given metric. diff --git a/src/ragas/optimizers/genetic.py b/src/ragas/optimizers/genetic.py new file mode 100644 index 000000000..86ec16a81 --- /dev/null +++ b/src/ragas/optimizers/genetic.py @@ -0,0 +1,24 @@ +import typing as t + +from langchain_core.callbacks import Callbacks + +from ragas.metrics.base import MetricWithLLM +from ragas.optimizers.base import Optimizer + + +class GeneticOptimizer(Optimizer): + """ + A genetic algorithm optimizer that balances exploration and exploitation. 
+ """ + + def optimize( + self, + metric: MetricWithLLM, + train_data: t.Any, + config: t.Dict[t.Any, t.Any], + callbacks: Callbacks, + ) -> MetricWithLLM: + + # max_steps = config.get("max_steps", 100) + + return metric From 0444af8db6dd7a471951c8c3c9aad0927cd65faf Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Tue, 26 Nov 2024 12:05:07 +0530 Subject: [PATCH 09/36] added prompts --- src/ragas/optimizers/genetic.py | 103 ++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/src/ragas/optimizers/genetic.py b/src/ragas/optimizers/genetic.py index 86ec16a81..582fdc09e 100644 --- a/src/ragas/optimizers/genetic.py +++ b/src/ragas/optimizers/genetic.py @@ -1,9 +1,112 @@ import typing as t from langchain_core.callbacks import Callbacks +from pydantic import BaseModel from ragas.metrics.base import MetricWithLLM from ragas.optimizers.base import Optimizer +from ragas.prompt import PydanticPrompt + + +class FormattedExamples(BaseModel): + examples: t.List[t.Tuple[str, t.Any]] + + @classmethod + def from_examples( + cls, examples: t.List[t.Dict[t.Dict[str, t.Any], t.Dict[str, t.Any]]] + ) -> "FormattedExamples": + + formated_examples = [] + for example in examples: + input_, output = list(example.items())[0] + input_ = "".join(f"\n{key}:\n\t{val}\n" for key, val in input_.items()) + formated_examples.append((input_, output)) + + return cls(examples=formated_examples) + + +class OutputInstruction(BaseModel): + instruction: str + + +class ReverseEngineerPrompt(PydanticPrompt[FormattedExamples, OutputInstruction]): + name: str = "reverse_engineer" + instruction: str = ( + "Given a set of (input containing (user_input, response, reference, etc), expected output) pairs that were manually annotated, guess and generate the instruction given to the annotator." + ) + input_model = FormattedExamples + output_model = OutputInstruction + + +class ParentPrompts(BaseModel): + parent_1: str + parent_2: str + + +class CrossOverPrompt(PydanticPrompt[ParentPrompts, OutputInstruction]): + name: str = "crossover" + instruction: str = ( + "You are a mutator who is familiar with the concept of cross-over in genetic algorithm, namely " + "combining the genetic information of two parents to generate new offspring. Given two parent " + "prompts, you will perform a cross-over to generate an offspring prompt that covers the same " + "semantic meaning as both parents." + ) + input_model = ParentPrompts + output_model = OutputInstruction + examples = [ + ( + ParentPrompts( + parent_1="Now you are a categorizer, your mission is to ascertain the sentiment of the provided text, either favorable or unfavorable.", + parent_2="Assign a sentiment label to the given sentence from [’negative’, ’positive’] and return only the label without any other text.", + ), + OutputInstruction( + instruction="Your mission is to ascertain the sentiment of the provided text and assign a sentiment label from [’negative’, ’positive’].", + ), + ) + ] + + +class FeedbackExample(BaseModel): + input: str + output: t.Dict[str, t.Any] + expected_output: t.Dict[str, t.Any] + + +class FeedbackMutationInput(BaseModel): + instruction: str + examples: t.List[FeedbackExample] + + +class FeedbackMutationOutput(BaseModel): + feedbacks: t.List[str] + + +class FeedbackMutationPrompt( + PydanticPrompt[FeedbackMutationInput, FeedbackMutationOutput] +): + name: str = "feedback_mutation" + instruction: str = ( + "You're an expert reviewer. 
Given an instruction and a set of (input containing (user_input, response, reference), output, expected_output) examples, give maximum 3 feedbacks on how the instruction can be improved to correct the mistakes in incorrect outputs and reach expected output." + "Do not provide the feedback to add examples with the instruction." + ) + input_model = FeedbackMutationInput + output_model = FeedbackMutationOutput + + +class FeedbackMutationPromptInput(BaseModel): + instruction: str + feedbacks: t.List[str] + + +class FeedbackMutationPromptGeneration( + PydanticPrompt[FeedbackMutationPromptInput, OutputInstruction] +): + name: str = "feedback_mutation_generation" + instruction: str = ( + "You are a mutator. Given an instruction and a set of feedbacks on how the instruction can be improved generate a new instruction that incorporates the feedback." + ) + input_model = FeedbackMutationPromptInput + output_model = OutputInstruction class GeneticOptimizer(Optimizer): From cc60cd2824a6536360d861fcb40c808608c706bf Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Tue, 26 Nov 2024 16:40:52 +0530 Subject: [PATCH 10/36] add loss and output type --- src/ragas/losses.py | 99 +++++++++++++++++++++++++++++++++++++++ src/ragas/metrics/base.py | 7 +++ 2 files changed, 106 insertions(+) create mode 100644 src/ragas/losses.py diff --git a/src/ragas/losses.py b/src/ragas/losses.py new file mode 100644 index 000000000..52661bd87 --- /dev/null +++ b/src/ragas/losses.py @@ -0,0 +1,99 @@ +import typing as t +from abc import ABC, abstractmethod + + +class Loss(ABC): + """ + Abstract base class for all loss functions. + """ + + @abstractmethod + def __call__(self, predicted: t.List, actual: t.List) -> float: + raise NotImplementedError + + +class MSELoss(Loss): + """ + Mean Squared Error loss function. + """ + + reduction: t.Literal["mean", "sum"] = "mean" + + def __call__(self, predicted: t.List[float], actual: t.List[float]) -> float: + + errors = [(p - a) ** 2 for p, a in zip(predicted, actual)] + if self.reduction == "mean": + return sum(errors) / len(errors) + elif self.reduction == "sum": + return sum(errors) + else: + raise ValueError(f"Invalid reduction method: {self.reduction}") + + +class BinaryMetricLoss(Loss): + """ + Computes the loss for binary metrics. + Supports accuracy and F1-score. + """ + + metric: t.Literal["accuracy", "f1_score"] = "accuracy" + + def __call__(self, predicted: t.List[int], actual: t.List[int]) -> float: + """ + Computes the loss using the specified reduction. + + Parameters + ---------- + predicted : list[int] + List of predicted binary values (0 or 1). + actual : list[int] + List of actual binary values (0 or 1). + + Returns + ------- + float + The computed loss based on the reduction type. + """ + if len(predicted) != len(actual): + raise ValueError("Predicted and actual lists must have the same length.") + + if self.metric == "accuracy": + return self._accuracy(predicted, actual) + elif self.metric == "f1_score": + return self._f1_score(predicted, actual) + else: + raise ValueError(f"Unsupported reduction type: {self.metric}") + + def _accuracy(self, predicted: list[int], actual: t.List[int]) -> float: + """ + Computes accuracy as the reduction operation. + + Returns + ------- + float + Accuracy (proportion of correct predictions). + """ + correct = sum(p == a for p, a in zip(predicted, actual)) + return correct / len(actual) + + def _f1_score(self, predicted: t.List[int], actual: t.List[int]) -> float: + """ + Computes F1-score as the reduction operation. 
+ + Returns + ------- + float + The F1-score. + """ + tp = sum(p == 1 and a == 1 for p, a in zip(predicted, actual)) + fp = sum(p == 1 and a == 0 for p, a in zip(predicted, actual)) + fn = sum(p == 0 and a == 1 for p, a in zip(predicted, actual)) + + precision = tp / (tp + fp) if tp + fp > 0 else 0 + recall = tp / (tp + fn) if tp + fn > 0 else 0 + f1 = ( + (2 * precision * recall) / (precision + recall) + if precision + recall > 0 + else 0 + ) + return f1 diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index 645f61aed..ca28ae228 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -54,6 +54,12 @@ class MetricType(Enum): MULTI_TURN = "multi_turn" +class MetricOutputType(Enum): + BINARY = "binary" + SCORING = "rational" + RANKING = "ranking" + + @dataclass class Metric(ABC): """ @@ -70,6 +76,7 @@ class Metric(ABC): _required_columns: t.Dict[MetricType, t.Set[str]] = field(default_factory=dict) name: str = field(default="", repr=True) + output_type: MetricOutputType = MetricOutputType.SCORING # TODO: remove default and make it required, add corresponding value to every metric def __post_init__(self): if self.name == "": From adfda6700a9ad06a39608d4d512f65071b9215b4 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Tue, 26 Nov 2024 17:11:56 +0530 Subject: [PATCH 11/36] add loss --- src/ragas/config.py | 2 ++ src/ragas/metrics/base.py | 11 +++++++++++ src/ragas/optimizers/base.py | 2 ++ src/ragas/optimizers/genetic.py | 6 ++++++ 4 files changed, 21 insertions(+) diff --git a/src/ragas/config.py b/src/ragas/config.py index 1af3b2553..062eb76f4 100644 --- a/src/ragas/config.py +++ b/src/ragas/config.py @@ -5,6 +5,7 @@ from ragas.embeddings import BaseRagasEmbeddings from ragas.llms import BaseRagasLLM from ragas.optimizers import GeneticOptimizer, Optimizer +from ragas.losses import Loss DEFAULT_OPTIMIZER_CONFIG = {"max_steps": 100} @@ -18,6 +19,7 @@ class DemonstrationConfig(BaseModel): class InstructionConfig(BaseModel): enabled: bool = True + loss: t.Optional[Loss] = None optimizer: Optimizer = GeneticOptimizer() optimizer_config: t.Dict[str, t.Any] = Field( default_factory=lambda: DEFAULT_OPTIMIZER_CONFIG diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index ca28ae228..9227ed64c 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -14,6 +14,7 @@ from ragas.callbacks import ChainType, new_group from ragas.dataset_schema import MultiTurnSample, SingleTurnSample from ragas.executor import is_event_loop_running +from ragas.losses import BinaryMetricLoss, MSELoss from ragas.prompt import PromptMixin from ragas.run_config import RunConfig from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES, camel_to_snake, deprecated @@ -247,6 +248,16 @@ def train( ) if optimizer.llm is None: optimizer.llm = llm + + if instruction_config.loss is None: + if self.output_type == MetricOutputType.BINARY: + loss_fun = BinaryMetricLoss() + elif self.output_type == MetricOutputType.SCORING: + loss_fun = MSELoss() + else: + raise NotImplementedError(f"Output type '{self.output_type}' not implemented") + else: + loss_fun = instruction_config.loss optimizer_config = instruction_config.optimizer_config or {} optimizer.optimize(self, data, optimizer_config, callbacks) diff --git a/src/ragas/optimizers/base.py b/src/ragas/optimizers/base.py index c3bf4819c..9eaf510b4 100644 --- a/src/ragas/optimizers/base.py +++ b/src/ragas/optimizers/base.py @@ -6,6 +6,7 @@ from ragas.llms.base import BaseRagasLLM from ragas.metrics.base import MetricWithLLM 
+from ragas.losses import Loss @dataclass @@ -21,6 +22,7 @@ def optimize( self, metric: MetricWithLLM, train_data: t.Any, + loss: Loss, config: t.Dict[t.Any, t.Any], callbacks: Callbacks, ) -> MetricWithLLM: diff --git a/src/ragas/optimizers/genetic.py b/src/ragas/optimizers/genetic.py index 582fdc09e..58c132433 100644 --- a/src/ragas/optimizers/genetic.py +++ b/src/ragas/optimizers/genetic.py @@ -6,6 +6,7 @@ from ragas.metrics.base import MetricWithLLM from ragas.optimizers.base import Optimizer from ragas.prompt import PydanticPrompt +from ragas.losses import Loss class FormattedExamples(BaseModel): @@ -118,6 +119,7 @@ def optimize( self, metric: MetricWithLLM, train_data: t.Any, + loss: Loss, config: t.Dict[t.Any, t.Any], callbacks: Callbacks, ) -> MetricWithLLM: @@ -125,3 +127,7 @@ def optimize( # max_steps = config.get("max_steps", 100) return metric + + + def _reverse_engineer_instruction(self, examples: t.List[t.Dict[t.Dict[str, t.Any]]]) -> str: + return "instruction" From 919e5d4e652052e08ce0e461bdf1eccd330b62f1 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 27 Nov 2024 17:54:03 +0530 Subject: [PATCH 12/36] dataset and optimizer conf --- src/ragas/config.py | 2 +- src/ragas/dataset_schema.py | 18 ++++++++++ src/ragas/loaders.py | 61 +++++++++++++++++++++++++++++++++ src/ragas/metrics/base.py | 22 ++++++++---- src/ragas/optimizers/base.py | 4 +-- src/ragas/optimizers/genetic.py | 28 +++++++++++---- 6 files changed, 119 insertions(+), 16 deletions(-) create mode 100644 src/ragas/loaders.py diff --git a/src/ragas/config.py b/src/ragas/config.py index 062eb76f4..b12e9b2a2 100644 --- a/src/ragas/config.py +++ b/src/ragas/config.py @@ -4,8 +4,8 @@ from ragas.embeddings import BaseRagasEmbeddings from ragas.llms import BaseRagasLLM -from ragas.optimizers import GeneticOptimizer, Optimizer from ragas.losses import Loss +from ragas.optimizers import GeneticOptimizer, Optimizer DEFAULT_OPTIMIZER_CONFIG = {"max_steps": 100} diff --git a/src/ragas/dataset_schema.py b/src/ragas/dataset_schema.py index b3e07edc4..ae7fc3038 100644 --- a/src/ragas/dataset_schema.py +++ b/src/ragas/dataset_schema.py @@ -526,3 +526,21 @@ def upload(self, base_url: str = RAGAS_API_URL, verbose: bool = True) -> str: if verbose: print(f"Evaluation results uploaded! 
View at {evaluation_endpoint}") return evaluation_endpoint + + +class PromptAnnotation(BaseModel): + prompt_input: t.Dict[str, t.Any] + prompt_output: t.Dict[str, t.Any] + is_accepted: bool + is_edited: t.Union[t.Dict[str, t.Any], None] + + +class SampleAnnotation(BaseModel): + metric_input: t.Dict[str, t.Any] + metric_output: float + prompts: t.Dict[str, t.List[PromptAnnotation]] + is_accepted: bool + + +class MetricAnnotation(BaseModel): + root: t.Dict[str, t.List[SampleAnnotation]] diff --git a/src/ragas/loaders.py b/src/ragas/loaders.py new file mode 100644 index 000000000..f28711946 --- /dev/null +++ b/src/ragas/loaders.py @@ -0,0 +1,61 @@ +import json +import typing as t + +import numpy as np +from pydantic import BaseModel + + +class PromptAnnotation(BaseModel): + prompt_input: t.Dict[str, t.Any] + prompt_output: t.Dict[str, t.Any] + is_accepted: bool + edited_output: t.Union[t.Dict[str, t.Any], None] + + +class SampleAnnotation(BaseModel): + metric_input: t.Dict[str, t.Any] + metric_output: float + prompts: t.Dict[str, PromptAnnotation] + is_accepted: bool + + +class MetricAnnotation(BaseModel): + + root: t.Dict[str, t.List[SampleAnnotation]] + + @classmethod + def from_json(cls, path) -> "MetricAnnotation": + + dataset = json.load(open(path)) + return cls( + root={ + key: [SampleAnnotation(**sample) for sample in value] + for key, value in dataset.items() + } + ) + + def train_test_split( + self, + test_size: float = 0.2, + random_state: t.Optional[np.random.RandomState] = None, + stratify: t.Optional[t.List[t.Any]] = None, + ): + """ + Split the dataset into training and testing sets. + + Parameters: + test_size (float): The proportion of the dataset to include in the test split. + seed (int): Random seed for reproducibility. + stratify (list): The column values to stratify the split on. + """ + pass + + def batch(self, batch_size: int, stratiy: t.Optional[str] = None): + """ + Create a batch iterator. + + Parameters: + batch_size (int): The number of samples in each batch. + stratify (str): The column to stratify the batches on. 
+ """ + pass diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index 9227ed64c..475783aa0 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -57,7 +57,8 @@ class MetricType(Enum): class MetricOutputType(Enum): BINARY = "binary" - SCORING = "rational" + DISCRETE = "discrete" + CONTINUOUS = "continuous" RANKING = "ranking" @@ -77,7 +78,9 @@ class Metric(ABC): _required_columns: t.Dict[MetricType, t.Set[str]] = field(default_factory=dict) name: str = field(default="", repr=True) - output_type: MetricOutputType = MetricOutputType.SCORING # TODO: remove default and make it required, add corresponding value to every metric + output_type: MetricOutputType = ( + MetricOutputType.DISCRETE + ) # TODO: remove default and make it required, add corresponding value to every metric def __post_init__(self): if self.name == "": @@ -248,19 +251,26 @@ def train( ) if optimizer.llm is None: optimizer.llm = llm - + if instruction_config.loss is None: if self.output_type == MetricOutputType.BINARY: loss_fun = BinaryMetricLoss() - elif self.output_type == MetricOutputType.SCORING: + elif ( + self.output_type == MetricOutputType.CONTINUOUS + or self.output_type == MetricOutputType.DISCRETE + ): loss_fun = MSELoss() else: - raise NotImplementedError(f"Output type '{self.output_type}' not implemented") + raise NotImplementedError( + f"Output type '{self.output_type}' not implemented" + ) else: loss_fun = instruction_config.loss + optimizer.metric = self + optimizer_config = instruction_config.optimizer_config or {} - optimizer.optimize(self, data, optimizer_config, callbacks) + optimizer.optimize(data, loss_fun, optimizer_config, callbacks) return diff --git a/src/ragas/optimizers/base.py b/src/ragas/optimizers/base.py index 9eaf510b4..3d3543516 100644 --- a/src/ragas/optimizers/base.py +++ b/src/ragas/optimizers/base.py @@ -5,8 +5,8 @@ from langchain_core.callbacks import Callbacks from ragas.llms.base import BaseRagasLLM -from ragas.metrics.base import MetricWithLLM from ragas.losses import Loss +from ragas.metrics.base import MetricWithLLM @dataclass @@ -15,12 +15,12 @@ class Optimizer(ABC): Abstract base class for all optimizers. 
""" + metric: t.Optional[MetricWithLLM] = None llm: t.Optional[BaseRagasLLM] = None @abstractmethod def optimize( self, - metric: MetricWithLLM, train_data: t.Any, loss: Loss, config: t.Dict[t.Any, t.Any], diff --git a/src/ragas/optimizers/genetic.py b/src/ragas/optimizers/genetic.py index 58c132433..b0312d23a 100644 --- a/src/ragas/optimizers/genetic.py +++ b/src/ragas/optimizers/genetic.py @@ -3,10 +3,10 @@ from langchain_core.callbacks import Callbacks from pydantic import BaseModel +from ragas.losses import Loss from ragas.metrics.base import MetricWithLLM from ragas.optimizers.base import Optimizer from ragas.prompt import PydanticPrompt -from ragas.losses import Loss class FormattedExamples(BaseModel): @@ -117,17 +117,31 @@ class GeneticOptimizer(Optimizer): def optimize( self, - metric: MetricWithLLM, train_data: t.Any, loss: Loss, config: t.Dict[t.Any, t.Any], callbacks: Callbacks, ) -> MetricWithLLM: - # max_steps = config.get("max_steps", 100) + if self.metric is None: + raise ValueError("No metric provided for optimization.") + + if self.llm is None: + raise ValueError("No llm provided for optimization.") + + # max_steps = config.get("max_steps", 100 + return self.metric + + def _initialize_population( + self, dataset: t.List[t.Dict[t.Dict[str, t.Any]]] + ) -> t.List[str]: + + return ["instruction"] + + def _reverse_engineer_instruction( + self, dataset: t.List[t.Dict[t.Dict[str, t.Any]]] + ) -> str: + return "instruction" - return metric - - - def _reverse_engineer_instruction(self, examples: t.List[t.Dict[t.Dict[str, t.Any]]]) -> str: + def _cross_over(self, parent_1: str, parent_2: str) -> str: return "instruction" From 8f65e4784089b3ab02d2976b28ebdb800f0dce69 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 27 Nov 2024 19:53:05 +0530 Subject: [PATCH 13/36] add filter and load --- src/ragas/dataset_schema.py | 18 ----------------- src/ragas/loaders.py | 39 ++++++++++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 21 deletions(-) diff --git a/src/ragas/dataset_schema.py b/src/ragas/dataset_schema.py index ae7fc3038..b3e07edc4 100644 --- a/src/ragas/dataset_schema.py +++ b/src/ragas/dataset_schema.py @@ -526,21 +526,3 @@ def upload(self, base_url: str = RAGAS_API_URL, verbose: bool = True) -> str: if verbose: print(f"Evaluation results uploaded! 
View at {evaluation_endpoint}") return evaluation_endpoint - - -class PromptAnnotation(BaseModel): - prompt_input: t.Dict[str, t.Any] - prompt_output: t.Dict[str, t.Any] - is_accepted: bool - is_edited: t.Union[t.Dict[str, t.Any], None] - - -class SampleAnnotation(BaseModel): - metric_input: t.Dict[str, t.Any] - metric_output: float - prompts: t.Dict[str, t.List[PromptAnnotation]] - is_accepted: bool - - -class MetricAnnotation(BaseModel): - root: t.Dict[str, t.List[SampleAnnotation]] diff --git a/src/ragas/loaders.py b/src/ragas/loaders.py index f28711946..9606bfb64 100644 --- a/src/ragas/loaders.py +++ b/src/ragas/loaders.py @@ -11,6 +11,9 @@ class PromptAnnotation(BaseModel): is_accepted: bool edited_output: t.Union[t.Dict[str, t.Any], None] + def __getitem__(self, key): + return getattr(self, key) + class SampleAnnotation(BaseModel): metric_input: t.Dict[str, t.Any] @@ -18,22 +21,47 @@ class SampleAnnotation(BaseModel): prompts: t.Dict[str, PromptAnnotation] is_accepted: bool + def __getitem__(self, key): + return getattr(self, key) + class MetricAnnotation(BaseModel): root: t.Dict[str, t.List[SampleAnnotation]] + def __getitem__(self, key): + return self.root[key] + @classmethod - def from_json(cls, path) -> "MetricAnnotation": + def from_json(cls, path, metric_name: t.Optional[str]) -> "MetricAnnotation": dataset = json.load(open(path)) + if metric_name is not None and metric_name not in dataset: + raise ValueError(f"Split {metric_name} not found in the dataset.") + return cls( root={ key: [SampleAnnotation(**sample) for sample in value] for key, value in dataset.items() + if metric_name is None or key == metric_name + } + ) + + def filter(self, function: t.Optional[t.Callable] = None): + + if function is None: + function = lambda x: True # noqa: E731 + + return MetricAnnotation( + root={ + key: [sample for sample in value if function(sample)] + for key, value in self.root.items() } ) + def __len__(self): + return sum(len(value) for value in self.root.values()) + def train_test_split( self, test_size: float = 0.2, @@ -50,12 +78,17 @@ def train_test_split( """ pass - def batch(self, batch_size: int, stratiy: t.Optional[str] = None): + def batch( + self, + batch_size: int, + stratify: t.Optional[str] = None, + drop_last_batch: bool = False, + ): """ Create a batch iterator. Parameters: batch_size (int): The number of samples in each batch. stratify (str): The column to stratify the batches on. + drop_last_batch (bool): Whether to drop the last batch if it is smaller than the specified batch size. 
""" - pass From 681ea725c9a24fd9a0480920b54a971484c7043b Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 27 Nov 2024 20:55:27 +0530 Subject: [PATCH 14/36] add batching --- src/ragas/loaders.py | 118 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 110 insertions(+), 8 deletions(-) diff --git a/src/ragas/loaders.py b/src/ragas/loaders.py index 9606bfb64..22cb87a17 100644 --- a/src/ragas/loaders.py +++ b/src/ragas/loaders.py @@ -1,5 +1,7 @@ import json +import random import typing as t +from collections import defaultdict import numpy as np from pydantic import BaseModel @@ -30,7 +32,7 @@ class MetricAnnotation(BaseModel): root: t.Dict[str, t.List[SampleAnnotation]] def __getitem__(self, key): - return self.root[key] + return SingleMetricAnnotation(name=key, samples=self.root[key]) @classmethod def from_json(cls, path, metric_name: t.Optional[str]) -> "MetricAnnotation": @@ -47,20 +49,39 @@ def from_json(cls, path, metric_name: t.Optional[str]) -> "MetricAnnotation": } ) + def __len__(self): + return sum(len(value) for value in self.root.values()) + + +class SingleMetricAnnotation(BaseModel): + name: str + samples: t.List[SampleAnnotation] + + def __getitem__(self, key): + return getattr(self, key) + + @classmethod + def from_json(cls, path) -> "SingleMetricAnnotation": + + dataset = json.load(open(path)) + + return cls( + name=dataset["name"], + samples=[SampleAnnotation(**sample) for sample in dataset["samples"]], + ) + def filter(self, function: t.Optional[t.Callable] = None): if function is None: function = lambda x: True # noqa: E731 - return MetricAnnotation( - root={ - key: [sample for sample in value if function(sample)] - for key, value in self.root.items() - } + return SingleMetricAnnotation( + name=self.name, + samples=[sample for sample in self.samples if function(sample)], ) def __len__(self): - return sum(len(value) for value in self.root.values()) + return len(self.samples) def train_test_split( self, @@ -81,7 +102,6 @@ def train_test_split( def batch( self, batch_size: int, - stratify: t.Optional[str] = None, drop_last_batch: bool = False, ): """ @@ -92,3 +112,85 @@ def batch( stratify (str): The column to stratify the batches on. drop_last_batch (bool): Whether to drop the last batch if it is smaller than the specified batch size. """ + + samples = self.samples[:] + random.shuffle(samples) + + all_batches = [ + samples[i : i + batch_size] + for i in range(0, len(samples), batch_size) + if len(samples[i : i + batch_size]) == batch_size or not drop_last_batch + ] + + return all_batches + + def stratified_batches( + self, + batch_size: int, + stratify_key: str, + drop_last_batch: bool = False, + replace: bool = False, + ) -> t.List[t.List[SampleAnnotation]]: + """ + Create stratified batches based on a specified key, ensuring proportional representation. + + Parameters: + batch_size (int): Number of samples per batch. + stratify_key (str): Key in `metric_input` used for stratification (e.g., class labels). + drop_last_batch (bool): If True, drops the last batch if it has fewer samples than `batch_size`. + replace (bool): If True, allows reusing samples from the same class to fill a batch if necessary. + + Returns: + List[List[SampleAnnotation]]: A list of stratified batches, each batch being a list of SampleAnnotation objects. 
+ """ + # Group samples based on the stratification key + class_groups = defaultdict(list) + for sample in self.samples: + key = sample.metric_input.get(stratify_key) + if key is None: + raise ValueError( + f"Stratify key '{stratify_key}' not found in metric_input." + ) + class_groups[key].append(sample) + + # Shuffle each class group for randomness + for group in class_groups.values(): + random.shuffle(group) + + # Determine the number of batches required + total_samples = len(self.samples) + num_batches = ( + total_samples // batch_size + if drop_last_batch + else (total_samples + batch_size - 1) // batch_size + ) + samples_per_class_per_batch = { + cls: max(1, len(samples) // num_batches) + for cls, samples in class_groups.items() + } + + # Create stratified batches + all_batches = [] + while len(all_batches) < num_batches: + batch = [] + for cls, samples in list(class_groups.items()): + # Determine the number of samples to take from this class + count = min( + samples_per_class_per_batch[cls], + len(samples), + batch_size - len(batch), + ) + if count > 0: + # Add samples from the current class + batch.extend(samples[:count]) + class_groups[cls] = samples[count:] # Remove used samples + elif replace and len(batch) < batch_size: + # Reuse samples if `replace` is True + batch.extend(random.choices(samples, k=batch_size - len(batch))) + + # Shuffle the batch to mix classes + random.shuffle(batch) + if len(batch) == batch_size or not drop_last_batch: + all_batches.append(batch) + + return all_batches From 3b613be83451dc1a9aa8137bc15438fa75c5d8b6 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 28 Nov 2024 12:40:13 +0530 Subject: [PATCH 15/36] add batching and reverse engineering --- src/ragas/loaders.py | 14 +++------ src/ragas/metrics/base.py | 9 ++---- src/ragas/optimizers/base.py | 3 +- src/ragas/optimizers/genetic.py | 54 ++++++++++++++++++++++++++++----- 4 files changed, 56 insertions(+), 24 deletions(-) diff --git a/src/ragas/loaders.py b/src/ragas/loaders.py index 22cb87a17..675cee2d2 100644 --- a/src/ragas/loaders.py +++ b/src/ragas/loaders.py @@ -57,8 +57,8 @@ class SingleMetricAnnotation(BaseModel): name: str samples: t.List[SampleAnnotation] - def __getitem__(self, key): - return getattr(self, key) + def __getitem__(self, idx): + return self.samples[idx] @classmethod def from_json(cls, path) -> "SingleMetricAnnotation": @@ -146,11 +146,7 @@ def stratified_batches( # Group samples based on the stratification key class_groups = defaultdict(list) for sample in self.samples: - key = sample.metric_input.get(stratify_key) - if key is None: - raise ValueError( - f"Stratify key '{stratify_key}' not found in metric_input." 
- ) + key = sample[stratify_key] class_groups[key].append(sample) # Shuffle each class group for randomness @@ -160,9 +156,9 @@ def stratified_batches( # Determine the number of batches required total_samples = len(self.samples) num_batches = ( - total_samples // batch_size + np.ceil(total_samples / batch_size).astype(int) if drop_last_batch - else (total_samples + batch_size - 1) // batch_size + else np.floor(total_samples / batch_size).astype(int) ) samples_per_class_per_batch = { cls: max(1, len(samples) // num_batches) diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index 475783aa0..af5606674 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -1,7 +1,6 @@ from __future__ import annotations import asyncio -import json import logging import typing as t from abc import ABC, abstractmethod @@ -14,6 +13,7 @@ from ragas.callbacks import ChainType, new_group from ragas.dataset_schema import MultiTurnSample, SingleTurnSample from ragas.executor import is_event_loop_running +from ragas.loaders import MetricAnnotation from ragas.losses import BinaryMetricLoss, MSELoss from ragas.prompt import PromptMixin from ragas.run_config import RunConfig @@ -238,10 +238,7 @@ def train( if not path.endswith(".json"): raise ValueError("Train data must be in json format") - data = json.load(open(path)) - data = data.get(self.name) - if data is None: - raise ValueError(f"Metric '{self.name}' not found in train data") + dataset = MetricAnnotation.from_json(path, metric_name=self.name) optimizer = instruction_config.optimizer llm = instruction_config.llm or self.llm @@ -270,7 +267,7 @@ def train( optimizer.metric = self optimizer_config = instruction_config.optimizer_config or {} - optimizer.optimize(data, loss_fun, optimizer_config, callbacks) + optimizer.optimize(dataset[self.name], loss_fun, optimizer_config, callbacks) return diff --git a/src/ragas/optimizers/base.py b/src/ragas/optimizers/base.py index 3d3543516..1a23d4d65 100644 --- a/src/ragas/optimizers/base.py +++ b/src/ragas/optimizers/base.py @@ -5,6 +5,7 @@ from langchain_core.callbacks import Callbacks from ragas.llms.base import BaseRagasLLM +from ragas.loaders import SingleMetricAnnotation from ragas.losses import Loss from ragas.metrics.base import MetricWithLLM @@ -21,7 +22,7 @@ class Optimizer(ABC): @abstractmethod def optimize( self, - train_data: t.Any, + dataset: SingleMetricAnnotation, loss: Loss, config: t.Dict[t.Any, t.Any], callbacks: Callbacks, diff --git a/src/ragas/optimizers/genetic.py b/src/ragas/optimizers/genetic.py index b0312d23a..54bb1ea09 100644 --- a/src/ragas/optimizers/genetic.py +++ b/src/ragas/optimizers/genetic.py @@ -3,6 +3,7 @@ from langchain_core.callbacks import Callbacks from pydantic import BaseModel +from ragas.loaders import SampleAnnotation, SingleMetricAnnotation from ragas.losses import Loss from ragas.metrics.base import MetricWithLLM from ragas.optimizers.base import Optimizer @@ -115,9 +116,11 @@ class GeneticOptimizer(Optimizer): A genetic algorithm optimizer that balances exploration and exploitation. 
""" + reverse_engineer_prompt = ReverseEngineerPrompt() + def optimize( self, - train_data: t.Any, + dataset: SingleMetricAnnotation, loss: Loss, config: t.Dict[t.Any, t.Any], callbacks: Callbacks, @@ -133,15 +136,50 @@ def optimize( return self.metric def _initialize_population( - self, dataset: t.List[t.Dict[t.Dict[str, t.Any]]] + self, dataset: SingleMetricAnnotation, population_size: int ) -> t.List[str]: - return ["instruction"] - - def _reverse_engineer_instruction( - self, dataset: t.List[t.Dict[t.Dict[str, t.Any]]] - ) -> str: - return "instruction" + candidates = [] + dataset = dataset.filter(lambda x: x["is_accepted"]) + batches = dataset.stratified_batches( + batch_size=population_size, + stratify_key="metric_output", + replace=False, + drop_last_batch=False, + ) + for batch in batches: + candidate = self._reverse_engineer_instruction(batch) + candidates.append(candidate) + + async def _reverse_engineer_instruction( + self, batch: t.List[SampleAnnotation] + ) -> t.Dict[str, str]: + + prompt_annotations = {key: [] for key in batch[0]["prompts"].keys()} + candidates = {} + for sample in batch: + input_ouputs = sample["prompts"] + for name, example in input_ouputs.items(): + input_ = { + key: val + for key, val in example["prompt_input"].items() + if val is not None + } + output = ( + example["edited_output"] + if example["edited_output"] + else example["prompt_output"] + ) + prompt_annotations[name].append({"input": input_, "output": output}) + + for prompt_name, examples in prompt_annotations.items(): + formatted_examples = FormattedExamples.from_examples(examples) + instruction = await self.reverse_engineer_prompt.generate( + data=formatted_examples, llm=self.llm + ) + candidates[prompt_name] = instruction + + return candidates def _cross_over(self, parent_1: str, parent_2: str) -> str: return "instruction" From 8c3e64aa75083b5594ce7254ffd40299ee3d4377 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 28 Nov 2024 17:02:39 +0530 Subject: [PATCH 16/36] add executors and population size --- src/ragas/metrics/base.py | 6 +- src/ragas/optimizers/base.py | 7 ++- src/ragas/optimizers/genetic.py | 104 ++++++++++++++++++++++++++++---- 3 files changed, 102 insertions(+), 15 deletions(-) diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index af5606674..905783204 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -232,7 +232,7 @@ def train( path: str, demonstration_config: DemonstrationConfig, instruction_config: InstructionConfig, - callbacks: Callbacks, + callbacks: Callbacks = None, ) -> None: if not path.endswith(".json"): @@ -267,7 +267,9 @@ def train( optimizer.metric = self optimizer_config = instruction_config.optimizer_config or {} - optimizer.optimize(dataset[self.name], loss_fun, optimizer_config, callbacks) + optimizer.optimize( + dataset[self.name], loss_fun, optimizer_config, callbacks=callbacks + ) return diff --git a/src/ragas/optimizers/base.py b/src/ragas/optimizers/base.py index 1a23d4d65..657aad438 100644 --- a/src/ragas/optimizers/base.py +++ b/src/ragas/optimizers/base.py @@ -8,6 +8,7 @@ from ragas.loaders import SingleMetricAnnotation from ragas.losses import Loss from ragas.metrics.base import MetricWithLLM +from ragas.run_config import RunConfig @dataclass @@ -25,7 +26,11 @@ def optimize( dataset: SingleMetricAnnotation, loss: Loss, config: t.Dict[t.Any, t.Any], - callbacks: Callbacks, + run_config: t.Optional[RunConfig] = None, + batch_size: t.Optional[int] = None, + callbacks: t.Optional[Callbacks] = None, + 
with_debugging_logs=False, + raise_exceptions: bool = True, ) -> MetricWithLLM: """ Optimizes the prompts for the given metric. diff --git a/src/ragas/optimizers/genetic.py b/src/ragas/optimizers/genetic.py index 54bb1ea09..a3dc2966c 100644 --- a/src/ragas/optimizers/genetic.py +++ b/src/ragas/optimizers/genetic.py @@ -3,11 +3,16 @@ from langchain_core.callbacks import Callbacks from pydantic import BaseModel +from ragas.callbacks import new_group +from ragas.executor import Executor from ragas.loaders import SampleAnnotation, SingleMetricAnnotation from ragas.losses import Loss from ragas.metrics.base import MetricWithLLM from ragas.optimizers.base import Optimizer from ragas.prompt import PydanticPrompt +from ragas.run_config import RunConfig + +RAGAS_OPTIMIZATION_GROUP = "ragas_optimization" class FormattedExamples(BaseModel): @@ -20,7 +25,7 @@ def from_examples( formated_examples = [] for example in examples: - input_, output = list(example.items())[0] + input_, output = example.values() input_ = "".join(f"\n{key}:\n\t{val}\n" for key, val in input_.items()) formated_examples.append((input_, output)) @@ -117,44 +122,109 @@ class GeneticOptimizer(Optimizer): """ reverse_engineer_prompt = ReverseEngineerPrompt() + cross_over_prompt = CrossOverPrompt() def optimize( self, dataset: SingleMetricAnnotation, loss: Loss, config: t.Dict[t.Any, t.Any], - callbacks: Callbacks, + run_config: t.Optional[RunConfig] = None, + batch_size: t.Optional[int] = None, + callbacks: t.Optional[Callbacks] = None, + with_debugging_logs=False, + raise_exceptions: bool = True, ) -> MetricWithLLM: + callbacks = callbacks or [] + if self.metric is None: raise ValueError("No metric provided for optimization.") if self.llm is None: raise ValueError("No llm provided for optimization.") + population_size = config.get("population_size", 3) + num_demonstrations = config.get("num_demonstrations", 3) + + # new group for optimization + optimization_generation_rm, optimization_generation_grp = new_group( + name=RAGAS_OPTIMIZATION_GROUP, + inputs={"metric": self.metric.name}, + callbacks=callbacks, + ) + + initial_population = self._initialize_population( + dataset, + population_size, + num_demonstrations, + run_config, + batch_size, + optimization_generation_grp, + raise_exceptions, + ) # max_steps = config.get("max_steps", 100 return self.metric def _initialize_population( - self, dataset: SingleMetricAnnotation, population_size: int - ) -> t.List[str]: + self, + dataset: SingleMetricAnnotation, + population_size: int, + num_demonstrations: int = 3, + run_config: t.Optional[RunConfig] = None, + batch_size: t.Optional[int] = None, + callbacks: t.Optional[Callbacks] = None, + raise_exceptions: bool = True, + ) -> t.List[t.Dict[str, str]]: + + initialize_population_rm, initialize_population_grp = new_group( + name="initialize_population", + inputs={"population_size": population_size}, + callbacks=callbacks, + ) + + exec = Executor( + desc="Intialiize Population", + raise_exceptions=raise_exceptions, + run_config=run_config, + keep_progress_bar=False, + batch_size=batch_size, + ) candidates = [] dataset = dataset.filter(lambda x: x["is_accepted"]) batches = dataset.stratified_batches( - batch_size=population_size, + batch_size=num_demonstrations, stratify_key="metric_output", replace=False, drop_last_batch=False, ) - for batch in batches: - candidate = self._reverse_engineer_instruction(batch) - candidates.append(candidate) + for batch in batches[:population_size]: + exec.submit( + self._reverse_engineer_instruction, + 
batch=batch, + callbacks=initialize_population_grp, + ) + + try: + candidates = exec.results() + except Exception as e: + initialize_population_rm.on_chain_error(e) + raise e + else: + initialize_population_rm.on_chain_end( + outputs={"initial_population": candidates} + ) + + return candidates async def _reverse_engineer_instruction( - self, batch: t.List[SampleAnnotation] + self, batch: t.List[SampleAnnotation], callbacks: Callbacks = None ) -> t.Dict[str, str]: + if self.llm is None: + raise ValueError("No llm provided for optimization.") + prompt_annotations = {key: [] for key in batch[0]["prompts"].keys()} candidates = {} for sample in batch: @@ -175,11 +245,21 @@ async def _reverse_engineer_instruction( for prompt_name, examples in prompt_annotations.items(): formatted_examples = FormattedExamples.from_examples(examples) instruction = await self.reverse_engineer_prompt.generate( - data=formatted_examples, llm=self.llm + data=formatted_examples, llm=self.llm, callbacks=callbacks ) candidates[prompt_name] = instruction return candidates - def _cross_over(self, parent_1: str, parent_2: str) -> str: - return "instruction" + async def _cross_over( + self, parent_1: str, parent_2: str, callbacks: Callbacks = None + ) -> str: + + if self.llm is None: + raise ValueError("No llm provided for optimization.") + + parents = ParentPrompts(parent_1=parent_1, parent_2=parent_2) + offspring = await self.cross_over_prompt.generate( + data=parents, llm=self.llm, callbacks=callbacks + ) + return offspring.instruction From c931601386161e4b98e0cd2f30d67e38eab298cd Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 28 Nov 2024 18:13:46 +0530 Subject: [PATCH 17/36] remove cross over --- src/ragas/optimizers/genetic.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/ragas/optimizers/genetic.py b/src/ragas/optimizers/genetic.py index a3dc2966c..6f2edbac0 100644 --- a/src/ragas/optimizers/genetic.py +++ b/src/ragas/optimizers/genetic.py @@ -163,8 +163,12 @@ def optimize( optimization_generation_grp, raise_exceptions, ) + # TODO: replace with metric.get_prompts('function_name') + seed_prompts = {key: val.instruction for key, val in self.metric.get_prompts().items() if key in initial_population[0].keys()} + initial_population.append(seed_prompts) + # max_steps = config.get("max_steps", 100 - return self.metric + return initial_population def _initialize_population( self, @@ -224,6 +228,9 @@ async def _reverse_engineer_instruction( if self.llm is None: raise ValueError("No llm provided for optimization.") + + if self.metric is None: + raise ValueError("No metric provided for optimization.") prompt_annotations = {key: [] for key in batch[0]["prompts"].keys()} candidates = {} @@ -247,7 +254,7 @@ async def _reverse_engineer_instruction( instruction = await self.reverse_engineer_prompt.generate( data=formatted_examples, llm=self.llm, callbacks=callbacks ) - candidates[prompt_name] = instruction + candidates[prompt_name] = instruction.instruction return candidates From 703dedede02f059bb08bd7c39cb389976b4f4309 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 28 Nov 2024 23:27:26 +0530 Subject: [PATCH 18/36] allow evaluating fitness --- src/ragas/loaders.py | 20 +++++++- src/ragas/optimizers/genetic.py | 88 +++++++++++++++++++++++++++++++-- 2 files changed, 101 insertions(+), 7 deletions(-) diff --git a/src/ragas/loaders.py b/src/ragas/loaders.py index 675cee2d2..7f08df64c 100644 --- a/src/ragas/loaders.py +++ b/src/ragas/loaders.py @@ -6,6 +6,8 @@ import numpy as np from 
pydantic import BaseModel +from ragas.dataset_schema import EvaluationDataset + class PromptAnnotation(BaseModel): prompt_input: t.Dict[str, t.Any] @@ -22,6 +24,7 @@ class SampleAnnotation(BaseModel): metric_output: float prompts: t.Dict[str, PromptAnnotation] is_accepted: bool + target: t.Optional[float] = None def __getitem__(self, key): return getattr(self, key) @@ -57,9 +60,22 @@ class SingleMetricAnnotation(BaseModel): name: str samples: t.List[SampleAnnotation] + def to_evaluation_dataset(self) -> EvaluationDataset: + samples = [sample.metric_input for sample in self.samples] + return EvaluationDataset.from_list(samples) + def __getitem__(self, idx): return self.samples[idx] + def __iter__(self) -> t.Iterator[SampleAnnotation]: # type: ignore + return iter(self.samples) + + def select(self, indices: t.List[int]) -> "SingleMetricAnnotation": + return SingleMetricAnnotation( + name=self.name, + samples=[self.samples[idx] for idx in indices], + ) + @classmethod def from_json(cls, path) -> "SingleMetricAnnotation": @@ -86,9 +102,9 @@ def __len__(self): def train_test_split( self, test_size: float = 0.2, - random_state: t.Optional[np.random.RandomState] = None, + seed: int = 42, stratify: t.Optional[t.List[t.Any]] = None, - ): + ) -> t.Tuple["SingleMetricAnnotation", "SingleMetricAnnotation"]: """ Split the dataset into training and testing sets. diff --git a/src/ragas/optimizers/genetic.py b/src/ragas/optimizers/genetic.py index 6f2edbac0..502d1059e 100644 --- a/src/ragas/optimizers/genetic.py +++ b/src/ragas/optimizers/genetic.py @@ -4,6 +4,7 @@ from pydantic import BaseModel from ragas.callbacks import new_group +from ragas.evaluation import evaluate from ragas.executor import Executor from ragas.loaders import SampleAnnotation, SingleMetricAnnotation from ragas.losses import Loss @@ -164,11 +165,25 @@ def optimize( raise_exceptions, ) # TODO: replace with metric.get_prompts('function_name') - seed_prompts = {key: val.instruction for key, val in self.metric.get_prompts().items() if key in initial_population[0].keys()} + seed_prompts = { + key: val.instruction + for key, val in self.metric.get_prompts().items() + if key in initial_population[0].keys() + } initial_population.append(seed_prompts) + fitness_scores = self.evaluate_fitness( + initial_population, + dataset, + loss, + run_config, + batch_size, + optimization_generation_grp, + raise_exceptions, + ) + # max_steps = config.get("max_steps", 100 - return initial_population + return fitness_scores def _initialize_population( self, @@ -182,13 +197,13 @@ def _initialize_population( ) -> t.List[t.Dict[str, str]]: initialize_population_rm, initialize_population_grp = new_group( - name="initialize_population", + name="Initializing Population", inputs={"population_size": population_size}, callbacks=callbacks, ) exec = Executor( - desc="Intialiize Population", + desc="Initializing Population", raise_exceptions=raise_exceptions, run_config=run_config, keep_progress_bar=False, @@ -228,7 +243,7 @@ async def _reverse_engineer_instruction( if self.llm is None: raise ValueError("No llm provided for optimization.") - + if self.metric is None: raise ValueError("No metric provided for optimization.") @@ -270,3 +285,66 @@ async def _cross_over( data=parents, llm=self.llm, callbacks=callbacks ) return offspring.instruction + + def _set_instructions(self, candidates: t.Dict[str, str]): + if self.metric is None: + raise ValueError("No metric provided for optimization.") + prompts = self.metric.get_prompts() + for key, val in candidates.items(): + 
prompts[key].instruction = val + self.metric.set_prompts(**prompts) + + def evaluate_fitness( + self, + candidates: t.List[t.Dict[str, str]], + dataset: SingleMetricAnnotation, + loss_fn: Loss, + run_config: t.Optional[RunConfig] = None, + batch_size: t.Optional[int] = None, + callbacks: t.Optional[Callbacks] = None, + raise_exceptions: bool = True, + ) -> t.List[float]: + + if self.metric is None: + raise ValueError("No metric provided for optimization.") + + losses = [] + training_ids = [] + y_true = [] + for idx, sample in enumerate(dataset): + if sample["is_accepted"]: + training_ids.append(idx) + y_true.append(sample.metric_output) + elif not sample["is_accepted"] and self.metric.output_type.name == "BINARY": + training_ids.append(idx) + y_true.append(int(not sample.metric_output)) + + dataset = dataset.select(training_ids) + eval_dataset = dataset.to_evaluation_dataset() + for idx, candidate in enumerate(candidates): + + initialize_population_rm, initialize_population_grp = new_group( + name=f"Validating Candidate - {idx}", + inputs={"candidate": candidate}, + callbacks=callbacks, + ) + + self._set_instructions(candidate) + results = evaluate( + eval_dataset, + metrics=[self.metric], + llm=self.llm, + run_config=run_config, + batch_size=batch_size, + callbacks=initialize_population_grp, + raise_exceptions=raise_exceptions, + ) + y_pred = results.to_pandas()[self.metric.name].values.tolist() + loss = loss_fn(y_true, y_pred) + losses.append(loss) + + initialize_population_rm.on_chain_end( + outputs={"loss": loss} + ) + + return losses From 6a0577a294662eba169092b5358c853160737167 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Fri, 29 Nov 2024 09:36:01 +0530 Subject: [PATCH 19/36] make runconfig optional --- src/ragas/evaluation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index fd53925b3..52203639b 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -60,7 +60,7 @@ def evaluate( embeddings: t.Optional[BaseRagasEmbeddings | LangchainEmbeddings] = None, callbacks: Callbacks = None, in_ci: bool = False, - run_config: RunConfig = RunConfig(), + run_config: t.Optional[RunConfig] = None, token_usage_parser: t.Optional[TokenUsageParser] = None, raise_exceptions: bool = False, column_map: t.Optional[t.Dict[str, str]] = None, @@ -147,6 +147,7 @@ def evaluate( """ column_map = column_map or {} callbacks = callbacks or [] + run_config = run_config or RunConfig() if helicone_config.is_enabled: import uuid From 2fb0daf5d21ab2cd0075480412cb5bf021f99511 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Fri, 29 Nov 2024 12:42:57 +0530 Subject: [PATCH 20/36] fix fitness validation --- src/ragas/callbacks.py | 3 ++- src/ragas/dataset_schema.py | 4 +++- src/ragas/evaluation.py | 3 +++ src/ragas/optimizers/genetic.py | 28 ++++++++++++++-------------- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/ragas/callbacks.py b/src/ragas/callbacks.py index 7b16059f5..6866470e2 100644 --- a/src/ragas/callbacks.py +++ b/src/ragas/callbacks.py @@ -133,11 +133,12 @@ def __str__(self): def parse_run_traces( traces: t.Dict[str, ChainRun], + parent_run_id: t.Optional[uuid.UUID] = None, ) -> t.List[t.Dict[str, t.Any]]: root_traces = [ chain_trace for chain_trace in traces.values() - if chain_trace.parent_run_id is None + if chain_trace.parent_run_id == str(parent_run_id) ] if len(root_traces) > 1: raise ValueError( diff --git a/src/ragas/dataset_schema.py b/src/ragas/dataset_schema.py index b3e07edc4..3b29db02b 
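# Illustrative sketch (not ragas code): how evaluate_fitness above derives its
# training targets from annotations. Accepted samples keep the annotated score;
# for binary metrics a rejected sample contributes the flipped label, so the
# optimizer is pushed toward the reviewer's correction. Plain dicts stand in
# for SampleAnnotation objects here.
annotations = [
    {"metric_output": 1.0, "is_accepted": True},
    {"metric_output": 1.0, "is_accepted": False},  # reviewer rejected this score
    {"metric_output": 0.0, "is_accepted": True},
]

y_true = []
for sample in annotations:
    if sample["is_accepted"]:
        y_true.append(sample["metric_output"])
    else:  # binary case only
        y_true.append(float(not sample["metric_output"]))

assert y_true == [1.0, 0.0, 0.0]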
100644 --- a/src/ragas/dataset_schema.py +++ b/src/ragas/dataset_schema.py @@ -4,6 +4,7 @@ import typing as t from abc import ABC, abstractmethod from dataclasses import dataclass, field +from uuid import UUID from datasets import Dataset as HFDataset from pydantic import BaseModel, field_validator @@ -375,6 +376,7 @@ class EvaluationResult: cost_cb: t.Optional[CostCallbackHandler] = None traces: t.List[t.Dict[str, t.Any]] = field(default_factory=list) ragas_traces: t.Dict[str, ChainRun] = field(default_factory=dict, repr=False) + run_id: t.Optional[UUID] = None def __post_init__(self): # transform scores from list of dicts to dict of lists @@ -392,7 +394,7 @@ def __post_init__(self): values.append(value + 1e-10) # parse the traces - self.traces = parse_run_traces(self.ragas_traces) + self.traces = parse_run_traces(self.ragas_traces, self.run_id) def __repr__(self) -> str: score_strs = [f"'{k}': {v:0.4f}" for k, v in self._repr_dict.items()] diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index 52203639b..5cdfd0ca7 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -1,6 +1,7 @@ from __future__ import annotations import typing as t +from uuid import UUID import numpy as np from datasets import Dataset @@ -66,6 +67,7 @@ def evaluate( column_map: t.Optional[t.Dict[str, str]] = None, show_progress: bool = True, batch_size: t.Optional[int] = None, + run_id: t.Optional[UUID] = None, ) -> EvaluationResult: """ Run the evaluation on the dataset with different metrics @@ -335,6 +337,7 @@ def evaluate( cost_cb, ), ragas_traces=tracer.traces, + run_id=run_id, ) if not evaluation_group_cm.ended: evaluation_rm.on_chain_end({"scores": result.scores}) diff --git a/src/ragas/optimizers/genetic.py b/src/ragas/optimizers/genetic.py index 502d1059e..be4ee56b7 100644 --- a/src/ragas/optimizers/genetic.py +++ b/src/ragas/optimizers/genetic.py @@ -171,7 +171,7 @@ def optimize( if key in initial_population[0].keys() } initial_population.append(seed_prompts) - + fitness_scores = self.evaluate_fitness( initial_population, dataset, @@ -285,7 +285,7 @@ async def _cross_over( data=parents, llm=self.llm, callbacks=callbacks ) return offspring.instruction - + def _set_instructions(self, candidates: t.Dict[str, str]): if self.metric is None: raise ValueError("No metric provided for optimization.") @@ -321,14 +321,15 @@ def evaluate_fitness( dataset = dataset.select(training_ids) eval_dataset = dataset.to_evaluation_dataset() - for idx, candidate in enumerate(candidates): - - initialize_population_rm, initialize_population_grp = new_group( - name=f"Validating Candidate - {idx}", - inputs={"candidate": candidate}, - callbacks=callbacks, - ) - + + initialize_population_rm, initialize_population_grp = new_group( + name="Evaluating candidate fitness", + inputs={"candidates": candidates}, + callbacks=callbacks, + ) + run_id = initialize_population_rm.run_id + for candidate in candidates: + self._set_instructions(candidate) results = evaluate( eval_dataset, @@ -338,13 +339,12 @@ def evaluate_fitness( batch_size=batch_size, callbacks=initialize_population_grp, raise_exceptions=raise_exceptions, + run_id=run_id, ) y_pred = results.to_pandas()[self.metric.name].values.tolist() loss = loss_fn(y_true, y_pred) losses.append(loss) - - initialize_population_rm.on_chain_end( - outputs={"loss": loss} - ) + + initialize_population_rm.on_chain_end(outputs={"losses": losses}) return losses From d10b445f16c82e8be3dd915be6aacf2308f4bbd4 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Sat, 30 Nov 2024 
16:44:19 +0530 Subject: [PATCH 21/36] implement feedback mutation --- src/ragas/optimizers/genetic.py | 204 +++++++++++++++++++++++++++++++- 1 file changed, 198 insertions(+), 6 deletions(-) diff --git a/src/ragas/optimizers/genetic.py b/src/ragas/optimizers/genetic.py index be4ee56b7..225dded70 100644 --- a/src/ragas/optimizers/genetic.py +++ b/src/ragas/optimizers/genetic.py @@ -4,6 +4,7 @@ from pydantic import BaseModel from ragas.callbacks import new_group +from ragas.dataset_schema import EvaluationResult from ragas.evaluation import evaluate from ragas.executor import Executor from ragas.loaders import SampleAnnotation, SingleMetricAnnotation @@ -12,6 +13,10 @@ from ragas.optimizers.base import Optimizer from ragas.prompt import PydanticPrompt from ragas.run_config import RunConfig +from ragas.dataset_schema import EvaluationDataset +import logging + +logger = logging.getLogger(__name__) RAGAS_OPTIMIZATION_GROUP = "ragas_optimization" @@ -94,7 +99,7 @@ class FeedbackMutationPrompt( ): name: str = "feedback_mutation" instruction: str = ( - "You're an expert reviewer. Given an instruction and a set of (input containing (user_input, response, reference), output, expected_output) examples, give maximum 3 feedbacks on how the instruction can be improved to correct the mistakes in incorrect outputs and reach expected output." + "You're an expert reviewer. Given an instruction and a set of (input containing (user_input, response, reference, etc), output, expected_output) examples, give maximum 3 feedbacks on how the instruction can be improved to correct the mistakes in incorrect outputs and reach expected output." "Do not provide the feedback to add examples with the instruction." ) input_model = FeedbackMutationInput @@ -124,6 +129,8 @@ class GeneticOptimizer(Optimizer): reverse_engineer_prompt = ReverseEngineerPrompt() cross_over_prompt = CrossOverPrompt() + feedback_generation_prompt = FeedbackMutationPrompt() + feedback_mutation_prompt = FeedbackMutationPromptGeneration() def optimize( self, @@ -147,6 +154,7 @@ def optimize( population_size = config.get("population_size", 3) num_demonstrations = config.get("num_demonstrations", 3) + # elitism_rate = config.get("elitism_rate", 0.5) # new group for optimization optimization_generation_rm, optimization_generation_grp = new_group( @@ -183,7 +191,16 @@ def optimize( ) # max_steps = config.get("max_steps", 100 - return fitness_scores + improved_prompts = self.feedback_mutation( + initial_population, + dataset, + sample_size=10, + run_config=run_config, + batch_size=batch_size, + callbacks=optimization_generation_grp, + raise_exceptions=raise_exceptions, + ) + return improved_prompts def _initialize_population( self, @@ -294,21 +311,177 @@ def _set_instructions(self, candidates: t.Dict[str, str]): prompts[key].instruction = val self.metric.set_prompts(**prompts) - def evaluate_fitness( + def feedback_mutation( self, candidates: t.List[t.Dict[str, str]], dataset: SingleMetricAnnotation, - loss_fn: Loss, + sample_size: int, run_config: t.Optional[RunConfig] = None, batch_size: t.Optional[int] = None, callbacks: t.Optional[Callbacks] = None, raise_exceptions: bool = True, - ) -> t.List[float]: + ) -> t.List[str]: + + if self.metric is None: + raise ValueError("No metric provided for optimization.") + + feedback_rm, feedback_grp = new_group( + name="Feedback mutation", + inputs={"candidates": candidates}, + callbacks=callbacks, + ) + improved_candidates = [] + dataset = dataset.filter(lambda x: x["is_accepted"]) + for candidate in candidates: + 
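# Illustrative sketch (not ragas code) of the feedback-mutation idea wired up
# above: mistakes made with the current instruction are shown to a reviewer
# prompt, and a second prompt rewrites the instruction using that feedback.
# llm_call is a stub standing in for the FeedbackMutationPrompt /
# FeedbackMutationPromptGeneration round trips.
import typing as t

def llm_call(prompt: str) -> str:
    return "rewritten instruction (placeholder)"

def feedback_mutate(instruction: str,
                    mistakes: t.List[t.Tuple[str, str, str]]) -> str:
    if not mistakes:
        # Mirrors the warning path: nothing to improve on, keep the prompt as-is.
        return instruction
    examples = "\n".join(
        f"input: {i}\noutput: {o}\nexpected: {e}" for i, o, e in mistakes
    )
    feedback = llm_call(f"Give up to 3 feedbacks on:\n{instruction}\n{examples}")
    return llm_call(f"Rewrite the instruction using this feedback:\n{feedback}")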
candidate_rm, candidate_grp = new_group( + name="Candidate feedback mutation", + inputs={"candidate": candidate}, + callbacks=feedback_grp, + ) + dataset_sample = dataset.sample(sample_size, stratify_key="metric_output") + batch, target = self._get_evaluation_dataset(dataset_sample) + self._set_instructions(candidate) + results = evaluate( + batch, + metrics=[self.metric], + llm=self.llm, + run_config=run_config, + batch_size=batch_size, + callbacks=candidate_grp, + raise_exceptions=raise_exceptions, + run_id=candidate_rm.run_id, + ) + + exec = Executor( + desc="Getting feedbacks", + raise_exceptions=raise_exceptions, + run_config=run_config, + keep_progress_bar=False, + batch_size=batch_size, + ) + exec.submit( + self._feedback_mutation, + candidate=candidate, + dataset=dataset_sample, + results=results, + target=target, + callbacks=candidate_grp, + ) + try: + improved_candidate = exec.results() + improved_candidates.append(improved_candidate) + except Exception as e: + feedback_rm.on_chain_error(e) + raise e + else: + feedback_rm.on_chain_end( + outputs={"improved_candidates": improved_candidates} + ) + + return improved_candidates + + async def _feedback_mutation( + self, + candidate: t.Dict[str, str], + dataset: SampleAnnotation, + results: EvaluationResult, + target: t.List[float], + callbacks: Callbacks = None, + ) -> t.Dict[str, str]: + + if self.llm is None: + raise ValueError("No llm provided for optimization.") + + if self.metric is None: + raise ValueError("No metric provided for optimization.") + + feedback_candidates = await self._get_feedbacks( + candidate, dataset, results, target, callbacks + ) + improved_candidates = await self._implement_feedbacks( + candidate, feedback_candidates, callbacks + ) + return improved_candidates + + async def _implement_feedbacks( + self, + candidate: t.Dict[str, str], + feedbacks: t.Dict[str, t.List[str]], + callbacks: Callbacks = None, + ) -> t.Dict[str, str]: + + if self.llm is None: + raise ValueError("No llm provided for optimization.") + + improved_candidate = {} + for key in candidate.keys(): + feedback = feedbacks[key] + if feedbacks: + feedback_input = FeedbackMutationPromptInput( + instruction=candidate[key], feedbacks=feedback + ) + improved_candidate[key] = await self.feedback_mutation_prompt.generate( + data=feedback_input, llm=self.llm, callbacks=callbacks + ) + else: + improved_candidate[key] = candidate[key] + logger.warning(f"No feedbacks found for the prompt {key}. 
Returning the original prompt.") + + return improved_candidate + + async def _get_feedbacks( + self, + candidate: t.Dict[str, str], + dataset: SampleAnnotation, + results: EvaluationResult, + target: t.List[float], + callbacks: Callbacks = None, + ) -> t.Dict[str, t.List[str]]: + + + def dict_to_str(dict: t.Dict[str, t.Any]) -> str: + return "".join(f"\n{key}:\n\t{val}\n" for key, val in dict.items()) + + if self.llm is None: + raise ValueError("No llm provided for optimization.") + + if self.metric is None: + raise ValueError("No metric provided for optimization.") + + prediction = results.to_pandas()[self.metric.name].values.tolist() + indices = [idx for idx in range(len(target)) if target[idx] != prediction[idx]] + traces = [trace[self.metric.name] for trace in results.traces] + if indices: + feedback_candidates = {} + for key in candidate.keys(): + feedback_data = [ + FeedbackExample( + input=dict_to_str(traces[idx][key]["input"].model_dump(exclude_none=True)), + output=traces[idx][key]["output"][0].model_dump(exclude_none=True), + expected_output=dataset[idx]["prompts"][key]["prompt_output"], + ) + for idx in indices + ] + prompt_input = FeedbackMutationInput( + instruction=candidate[key], examples=feedback_data + ) + feedbacks = await self.feedback_generation_prompt.generate( + data=prompt_input, llm=self.llm, callbacks=callbacks + ) + feedback_candidates[key] = feedbacks.feedbacks + else: + logger.warning("No samples found for the feedback generation.") + feedback_candidates = {key: [] for key in candidate.keys()} + + return feedback_candidates + + def _get_evaluation_dataset( + self, dataset: SingleMetricAnnotation + ) -> t.Tuple[EvaluationDataset, t.List[float]]: if self.metric is None: raise ValueError("No metric provided for optimization.") - losses = [] training_ids = [] y_true = [] for idx, sample in enumerate(dataset): @@ -321,6 +494,25 @@ def evaluate_fitness( dataset = dataset.select(training_ids) eval_dataset = dataset.to_evaluation_dataset() + return eval_dataset, y_true + + def evaluate_fitness( + self, + candidates: t.List[t.Dict[str, str]], + dataset: SingleMetricAnnotation, + loss_fn: Loss, + run_config: t.Optional[RunConfig] = None, + batch_size: t.Optional[int] = None, + callbacks: t.Optional[Callbacks] = None, + raise_exceptions: bool = True, + ) -> t.List[float]: + + if self.metric is None: + raise ValueError("No metric provided for optimization.") + + losses = [] + + eval_dataset, y_true = self._get_evaluation_dataset(dataset) initialize_population_rm, initialize_population_grp = new_group( name="Evaluating candidate fitness", From a4771f874c7652726ba831fe09d429949b51fe97 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Sat, 30 Nov 2024 16:44:29 +0530 Subject: [PATCH 22/36] add sample fun --- src/ragas/loaders.py | 55 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/src/ragas/loaders.py b/src/ragas/loaders.py index 7f08df64c..868987dbb 100644 --- a/src/ragas/loaders.py +++ b/src/ragas/loaders.py @@ -115,6 +115,61 @@ def train_test_split( """ pass + def sample( + self, n: int, stratify_key: t.Optional[str] = None + ) -> "SingleMetricAnnotation": + """ + Create a subset of the dataset. + + Parameters: + n (int): The number of samples to include in the subset. + stratify_key (str): The column to stratify the subset on. + + Returns: + SingleMetricAnnotation: A subset of the dataset with `n` samples. + """ + if n > len(self.samples): + raise ValueError( + "Requested sample size exceeds the number of available samples." 
+ ) + + if stratify_key is None: + # Simple random sampling + sampled_indices = random.sample(range(len(self.samples)), n) + sampled_samples = [self.samples[i] for i in sampled_indices] + else: + # Stratified sampling + class_groups = defaultdict(list) + for idx, sample in enumerate(self.samples): + key = sample.metric_input.get(stratify_key) + class_groups[key].append(idx) + + # Determine the proportion of samples to take from each class + total_samples = sum(len(indices) for indices in class_groups.values()) + proportions = { + cls: len(indices) / total_samples + for cls, indices in class_groups.items() + } + + sampled_indices = [] + for cls, indices in class_groups.items(): + cls_sample_count = int(np.round(proportions[cls] * n)) + cls_sample_count = min( + cls_sample_count, len(indices) + ) # Don't oversample + sampled_indices.extend(random.sample(indices, cls_sample_count)) + + # Handle any rounding discrepancies to ensure exactly `n` samples + while len(sampled_indices) < n: + remaining_indices = set(range(len(self.samples))) - set(sampled_indices) + if not remaining_indices: + break + sampled_indices.append(random.choice(list(remaining_indices))) + + sampled_samples = [self.samples[i] for i in sampled_indices] + + return SingleMetricAnnotation(name=self.name, samples=sampled_samples) + def batch( self, batch_size: int, From 80c194dae999d6baadaeb9804651528092cf7383 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Sat, 30 Nov 2024 16:44:44 +0530 Subject: [PATCH 23/36] make run_id optional --- src/ragas/callbacks.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/ragas/callbacks.py b/src/ragas/callbacks.py index 6866470e2..ae5ea8b2e 100644 --- a/src/ragas/callbacks.py +++ b/src/ragas/callbacks.py @@ -133,13 +133,15 @@ def __str__(self): def parse_run_traces( traces: t.Dict[str, ChainRun], - parent_run_id: t.Optional[uuid.UUID] = None, + parent_run_id: t.Optional[str] = None, ) -> t.List[t.Dict[str, t.Any]]: + root_traces = [ chain_trace for chain_trace in traces.values() - if chain_trace.parent_run_id == str(parent_run_id) + if chain_trace.parent_run_id == parent_run_id ] + if len(root_traces) > 1: raise ValueError( "Multiple root traces found! 
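# Illustrative sketch (not ragas code): the proportional stratified sampling
# used by sample() above, written out over plain dicts. Group sizes decide each
# stratum's share of the n picks, and rounding gaps are topped up from the
# remaining items so exactly n samples come back.
import random
from collections import defaultdict

def stratified_sample(items, n, key, seed=0):
    rng = random.Random(seed)
    groups = defaultdict(list)
    for idx, item in enumerate(items):
        groups[item[key]].append(idx)
    picked = []
    for indices in groups.values():
        share = min(round(len(indices) / len(items) * n), len(indices))
        picked.extend(rng.sample(indices, share))
    # Top up (or trim) so exactly n indices survive despite rounding.
    leftovers = [i for i in range(len(items)) if i not in picked]
    rng.shuffle(leftovers)
    picked = (picked + leftovers)[:n]
    return [items[i] for i in picked]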
This is a bug on our end, please file an issue and we will fix it ASAP :)" @@ -160,7 +162,7 @@ def parse_run_traces( prompt_traces = {} for i, prompt_uuid in enumerate(metric_trace.children): prompt_trace = traces[prompt_uuid] - prompt_traces[f"{i}_{prompt_trace.name}"] = { + prompt_traces[f"{prompt_trace.name}"] = { "input": prompt_trace.inputs.get("data", {}), "output": prompt_trace.outputs.get("output", {}), } From 2bc5834a5887257470bc9194168f7e6a7d40b2db Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 2 Dec 2024 17:38:25 +0530 Subject: [PATCH 24/36] organize evaluate --- src/ragas/optimizers/genetic.py | 85 ++++++++++++++++++++++----------- 1 file changed, 57 insertions(+), 28 deletions(-) diff --git a/src/ragas/optimizers/genetic.py b/src/ragas/optimizers/genetic.py index 225dded70..a995720cc 100644 --- a/src/ragas/optimizers/genetic.py +++ b/src/ragas/optimizers/genetic.py @@ -1,10 +1,12 @@ +import logging import typing as t +from uuid import UUID from langchain_core.callbacks import Callbacks from pydantic import BaseModel from ragas.callbacks import new_group -from ragas.dataset_schema import EvaluationResult +from ragas.dataset_schema import EvaluationDataset, EvaluationResult from ragas.evaluation import evaluate from ragas.executor import Executor from ragas.loaders import SampleAnnotation, SingleMetricAnnotation @@ -13,8 +15,6 @@ from ragas.optimizers.base import Optimizer from ragas.prompt import PydanticPrompt from ragas.run_config import RunConfig -from ragas.dataset_schema import EvaluationDataset -import logging logger = logging.getLogger(__name__) @@ -180,15 +180,15 @@ def optimize( } initial_population.append(seed_prompts) - fitness_scores = self.evaluate_fitness( - initial_population, - dataset, - loss, - run_config, - batch_size, - optimization_generation_grp, - raise_exceptions, - ) + # fitness_scores = self.evaluate_fitness( + # initial_population, + # dataset, + # loss, + # run_config, + # batch_size, + # optimization_generation_grp, + # raise_exceptions, + # ) # max_steps = config.get("max_steps", 100 improved_prompts = self.feedback_mutation( @@ -200,6 +200,7 @@ def optimize( callbacks=optimization_generation_grp, raise_exceptions=raise_exceptions, ) + return improved_prompts def _initialize_population( @@ -290,7 +291,7 @@ async def _reverse_engineer_instruction( return candidates - async def _cross_over( + async def _cross_over_prompts( self, parent_1: str, parent_2: str, callbacks: Callbacks = None ) -> str: @@ -340,11 +341,9 @@ def feedback_mutation( ) dataset_sample = dataset.sample(sample_size, stratify_key="metric_output") batch, target = self._get_evaluation_dataset(dataset_sample) - self._set_instructions(candidate) - results = evaluate( - batch, - metrics=[self.metric], - llm=self.llm, + results = self.evaluate_candidate( + candidate=candidate, + eval_dataset=batch, run_config=run_config, batch_size=batch_size, callbacks=candidate_grp, @@ -425,7 +424,9 @@ async def _implement_feedbacks( ) else: improved_candidate[key] = candidate[key] - logger.warning(f"No feedbacks found for the prompt {key}. Returning the original prompt.") + logger.warning( + f"No feedbacks found for the prompt {key}. Returning the original prompt." 
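# Illustrative sketch (not ragas code) of the parent_run_id filtering added to
# parse_run_traces above: when several evaluate() calls run inside one
# optimization, only traces whose parent matches the requested run id are
# treated as root traces. FakeChainRun is a minimal stand-in for ChainRun.
import typing as t
from dataclasses import dataclass

@dataclass
class FakeChainRun:
    name: str
    parent_run_id: t.Optional[str]

traces = {
    "a": FakeChainRun("ragas evaluation", parent_run_id="run-1"),
    "b": FakeChainRun("ragas evaluation", parent_run_id="run-2"),
}

def root_traces(traces, parent_run_id):
    return [tr for tr in traces.values() if tr.parent_run_id == parent_run_id]

assert len(root_traces(traces, "run-1")) == 1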
+ ) return improved_candidate @@ -437,8 +438,7 @@ async def _get_feedbacks( target: t.List[float], callbacks: Callbacks = None, ) -> t.Dict[str, t.List[str]]: - - + def dict_to_str(dict: t.Dict[str, t.Any]) -> str: return "".join(f"\n{key}:\n\t{val}\n" for key, val in dict.items()) @@ -456,8 +456,12 @@ def dict_to_str(dict: t.Dict[str, t.Any]) -> str: for key in candidate.keys(): feedback_data = [ FeedbackExample( - input=dict_to_str(traces[idx][key]["input"].model_dump(exclude_none=True)), - output=traces[idx][key]["output"][0].model_dump(exclude_none=True), + input=dict_to_str( + traces[idx][key]["input"].model_dump(exclude_none=True) + ), + output=traces[idx][key]["output"][0].model_dump( + exclude_none=True + ), expected_output=dataset[idx]["prompts"][key]["prompt_output"], ) for idx in indices @@ -496,6 +500,33 @@ def _get_evaluation_dataset( eval_dataset = dataset.to_evaluation_dataset() return eval_dataset, y_true + def evaluate_candidate( + self, + candidate: t.Dict[str, str], + eval_dataset: EvaluationDataset, + run_config: t.Optional[RunConfig] = None, + batch_size: t.Optional[int] = None, + callbacks: t.Optional[Callbacks] = None, + raise_exceptions: bool = True, + run_id: t.Optional[UUID] = None, + ) -> EvaluationResult: + + if self.metric is None: + raise ValueError("No metric provided for optimization.") + + self._set_instructions(candidate) + results = evaluate( + eval_dataset, + metrics=[self.metric], + llm=self.llm, + run_config=run_config, + batch_size=batch_size, + callbacks=callbacks, + raise_exceptions=raise_exceptions, + run_id=run_id, + ) + return results + def evaluate_fitness( self, candidates: t.List[t.Dict[str, str]], @@ -522,11 +553,9 @@ def evaluate_fitness( run_id = initialize_population_rm.run_id for candidate in candidates: - self._set_instructions(candidate) - results = evaluate( - eval_dataset, - metrics=[self.metric], - llm=self.llm, + results = self.evaluate_candidate( + candidate=candidate, + eval_dataset=eval_dataset, run_config=run_config, batch_size=batch_size, callbacks=initialize_population_grp, From bab448fd978bf61b73ba4515f5ab6118a8287cad Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 2 Dec 2024 18:43:47 +0530 Subject: [PATCH 25/36] add cross over mutation --- src/ragas/optimizers/genetic.py | 138 ++++++++++++++++++++++++++++---- 1 file changed, 122 insertions(+), 16 deletions(-) diff --git a/src/ragas/optimizers/genetic.py b/src/ragas/optimizers/genetic.py index a995720cc..8929dab11 100644 --- a/src/ragas/optimizers/genetic.py +++ b/src/ragas/optimizers/genetic.py @@ -2,6 +2,7 @@ import typing as t from uuid import UUID +import numpy as np from langchain_core.callbacks import Callbacks from pydantic import BaseModel @@ -11,8 +12,8 @@ from ragas.executor import Executor from ragas.loaders import SampleAnnotation, SingleMetricAnnotation from ragas.losses import Loss -from ragas.metrics.base import MetricWithLLM from ragas.optimizers.base import Optimizer +from ragas.optimizers.utils import hamming_distance from ragas.prompt import PydanticPrompt from ragas.run_config import RunConfig @@ -142,7 +143,7 @@ def optimize( callbacks: t.Optional[Callbacks] = None, with_debugging_logs=False, raise_exceptions: bool = True, - ) -> MetricWithLLM: + ) -> t.Dict[str, str]: callbacks = callbacks or [] @@ -154,7 +155,6 @@ def optimize( population_size = config.get("population_size", 3) num_demonstrations = config.get("num_demonstrations", 3) - # elitism_rate = config.get("elitism_rate", 0.5) # new group for optimization optimization_generation_rm, 
optimization_generation_grp = new_group( @@ -180,17 +180,6 @@ def optimize( } initial_population.append(seed_prompts) - # fitness_scores = self.evaluate_fitness( - # initial_population, - # dataset, - # loss, - # run_config, - # batch_size, - # optimization_generation_grp, - # raise_exceptions, - # ) - - # max_steps = config.get("max_steps", 100 improved_prompts = self.feedback_mutation( initial_population, dataset, @@ -201,7 +190,31 @@ def optimize( raise_exceptions=raise_exceptions, ) - return improved_prompts + improved_prompts = self.cross_over_mutation( + improved_prompts, + dataset, + run_config=run_config, + batch_size=batch_size, + callbacks=optimization_generation_grp, + raise_exceptions=raise_exceptions, + ) + + fitness_scores = self.evaluate_fitness( + improved_prompts, + dataset, + loss, + run_config=run_config, + batch_size=batch_size, + callbacks=optimization_generation_grp, + raise_exceptions=raise_exceptions, + ) + best_candidate_idx = improved_prompts[np.argmax(fitness_scores)] + + optimization_generation_rm.on_chain_end( + outputs={"best_candidate": improved_prompts[best_candidate_idx]} + ) + + return improved_prompts[best_candidate_idx] def _initialize_population( self, @@ -321,7 +334,7 @@ def feedback_mutation( batch_size: t.Optional[int] = None, callbacks: t.Optional[Callbacks] = None, raise_exceptions: bool = True, - ) -> t.List[str]: + ) -> t.List[t.Dict[str, str]]: if self.metric is None: raise ValueError("No metric provided for optimization.") @@ -569,3 +582,96 @@ def evaluate_fitness( initialize_population_rm.on_chain_end(outputs={"losses": losses}) return losses + + async def _cross_over_chain( + self, + parent_x: t.Dict[str, str], + parent_y: t.Dict[str, str], + callbacks: Callbacks, + ): + + if parent_x.keys() != parent_y.keys(): + raise ValueError("The parents must have the same prompt names.") + + chain_offsprings = {} + for key in parent_x.keys(): + offspring = await self._cross_over_prompts( + parent_x[key], parent_y[key], callbacks + ) + chain_offsprings[key] = offspring + + return chain_offsprings + + def cross_over_mutation( + self, + candidates: t.List[t.Dict[str, str]], + dataset: SingleMetricAnnotation, + run_config: t.Optional[RunConfig] = None, + batch_size: t.Optional[int] = None, + callbacks: t.Optional[Callbacks] = None, + raise_exceptions: bool = True, + ): + + if self.metric is None: + raise ValueError("No metric provided for optimization.") + + if self.llm is None: + raise ValueError("No llm provided for optimization.") + + eval_dataset, y_true = self._get_evaluation_dataset(dataset) + + cross_over_rm, cross_over_grp = new_group( + name="Cross-over mutation", + inputs={"candidates": candidates}, + callbacks=callbacks, + ) + run_id = cross_over_rm.run_id + prediction_vectors = [] + for candidate in candidates: + + results = self.evaluate_candidate( + candidate=candidate, + eval_dataset=eval_dataset, + run_config=run_config, + batch_size=batch_size, + callbacks=cross_over_grp, + raise_exceptions=raise_exceptions, + run_id=run_id, + ) + y_pred = results.to_pandas()[self.metric.name].values.tolist() + prediction = [int(pred == true) for pred, true in zip(y_pred, y_true)] + prediction_vectors.append(prediction) + + prediction_vectors = np.array(prediction_vectors) + distance_matrix = hamming_distance(prediction_vectors) + + exec = Executor( + desc="Mutating candidates", + raise_exceptions=raise_exceptions, + run_config=run_config, + keep_progress_bar=False, + batch_size=batch_size, + ) + + offspring_candidates = [] + for idx, candidate in 
enumerate(candidates): + parent_x = candidates[idx] + parent_y = candidates[np.argmin(distance_matrix[idx])] + exec.submit( + self._cross_over_chain, + parent_x=parent_x, + parent_y=parent_y, + callbacks=cross_over_grp, + ) + + try: + offspring_candidates = exec.results() + except Exception as e: + cross_over_rm.on_chain_error(e) + raise e + else: + cross_over_rm.on_chain_end( + outputs={"offspring_candidates": offspring_candidates} + ) + + return offspring_candidates From 1085b47ab4479b1ca8fd6a06456efb6ef1765b67 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 2 Dec 2024 18:43:57 +0530 Subject: [PATCH 26/36] add hamming distance --- src/ragas/optimizers/utils.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 src/ragas/optimizers/utils.py diff --git a/src/ragas/optimizers/utils.py b/src/ragas/optimizers/utils.py new file mode 100644 index 000000000..a061fb554 --- /dev/null +++ b/src/ragas/optimizers/utils.py @@ -0,0 +1,28 @@ +import numpy as np +from numpy.typing import NDArray + + +def hamming_distance(vectors: NDArray[np.int_]) -> NDArray[np.int_]: + """ + Calculate the Hamming distance between pairs of vectors in a list of lists. + + Args: + vectors (list of lists): A list where each inner list is a vector. + + Returns: + list of tuples: A list of tuples containing the pair indices and their Hamming distance. + """ + + # Validate that all vectors have the same dimension + length = len(vectors[0]) + if any(len(v) != length for v in vectors): + raise ValueError("All vectors must have the same dimensions.") + + # Calculate Hamming distances for all pairs + distances = np.zeros((len(vectors), len(vectors)), dtype=int) + for i in range(len(vectors)): + for j in range(i + 1, len(vectors)): + distance = np.sum(vectors[i] != vectors[j]) + distances[i][j] = distance + + return distances From b63757db376e5c67450dd69384b8c6aa20033047 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 2 Dec 2024 18:44:16 +0530 Subject: [PATCH 27/36] change optimzer return type --- src/ragas/optimizers/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ragas/optimizers/base.py b/src/ragas/optimizers/base.py index 657aad438..fe09015c7 100644 --- a/src/ragas/optimizers/base.py +++ b/src/ragas/optimizers/base.py @@ -31,7 +31,7 @@ def optimize( callbacks: t.Optional[Callbacks] = None, with_debugging_logs=False, raise_exceptions: bool = True, - ) -> MetricWithLLM: + ) -> t.Dict[str, str]: """ Optimizes the prompts for the given metric. @@ -46,7 +46,7 @@ def optimize( Returns ------- - MetricWithLLM - The optimized metric. + Dict[str, str] + The optimized prompts for given chain. """ pass From d7aaa0da93c676acb44bdea35b7e8f1f94958bcc Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 2 Dec 2024 18:44:31 +0530 Subject: [PATCH 28/36] add notimplemented error for train/test split --- src/ragas/loaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragas/loaders.py b/src/ragas/loaders.py index 868987dbb..a51bbdd44 100644 --- a/src/ragas/loaders.py +++ b/src/ragas/loaders.py @@ -113,7 +113,7 @@ def train_test_split( seed (int): Random seed for reproducibility. stratify (list): The column values to stratify the split on. 
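# Illustrative sketch (not ragas code): the hamming_distance helper above fills
# only the upper triangle of the matrix with a double loop; the same pairwise
# counts can be computed symmetrically with NumPy broadcasting, assuming
# integer (0/1) prediction vectors.
import numpy as np

def pairwise_hamming(vectors: np.ndarray) -> np.ndarray:
    # Count the coordinates on which each pair of vectors disagrees.
    return (vectors[:, None, :] != vectors[None, :, :]).sum(axis=-1)

preds = np.array([[1, 0, 1], [1, 1, 1], [0, 0, 1]])
print(pairwise_hamming(preds))
# [[0 1 1]
#  [1 0 2]
#  [1 2 0]]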
""" - pass + raise NotImplementedError def sample( self, n: int, stratify_key: t.Optional[str] = None From bc2803cc17fb2dcc2f5c1017562997637b0bdbc3 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 2 Dec 2024 19:01:47 +0530 Subject: [PATCH 29/36] fix uuid --- src/ragas/dataset_schema.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/ragas/dataset_schema.py b/src/ragas/dataset_schema.py index 3b29db02b..75e7c6465 100644 --- a/src/ragas/dataset_schema.py +++ b/src/ragas/dataset_schema.py @@ -41,6 +41,13 @@ def get_features(self) -> t.List[str]: """ return list(self.to_dict().keys()) + def to_string(self) -> str: + """ + Get the string representation of the sample. + """ + sample_dict = self.to_dict() + return "".join(f"\n{key}:\n\t{val}\n" for key, val in sample_dict.items()) + class SingleTurnSample(BaseSample): """ @@ -394,7 +401,8 @@ def __post_init__(self): values.append(value + 1e-10) # parse the traces - self.traces = parse_run_traces(self.ragas_traces, self.run_id) + run_id = str(self.run_id) if self.run_id is not None else None + self.traces = parse_run_traces(self.ragas_traces, run_id) def __repr__(self) -> str: score_strs = [f"'{k}': {v:0.4f}" for k, v in self._repr_dict.items()] From 4b2c6a93bac1d1c2bad3a974ca0aa1b51b5bc45c Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Tue, 3 Dec 2024 10:23:49 +0530 Subject: [PATCH 30/36] fix parsing --- src/ragas/optimizers/genetic.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/ragas/optimizers/genetic.py b/src/ragas/optimizers/genetic.py index 8929dab11..2a6d1960e 100644 --- a/src/ragas/optimizers/genetic.py +++ b/src/ragas/optimizers/genetic.py @@ -208,13 +208,13 @@ def optimize( callbacks=optimization_generation_grp, raise_exceptions=raise_exceptions, ) - best_candidate_idx = improved_prompts[np.argmax(fitness_scores)] + best_candidate = improved_prompts[np.argmax(fitness_scores)] optimization_generation_rm.on_chain_end( - outputs={"best_candidate": improved_prompts[best_candidate_idx]} + outputs={"best_candidate": best_candidate} ) - return improved_prompts[best_candidate_idx] + return best_candidate def _initialize_population( self, @@ -381,14 +381,17 @@ def feedback_mutation( ) try: improved_candidate = exec.results() - improved_candidates.append(improved_candidate) + improved_candidates.append(improved_candidate[0]) except Exception as e: - feedback_rm.on_chain_error(e) + candidate_rm.on_chain_error(e) raise e else: - feedback_rm.on_chain_end( - outputs={"improved_candidates": improved_candidates} + candidate_rm.on_chain_end( + outputs={"improved_candidate": improved_candidate[0]} ) + feedback_rm.on_chain_end( + outputs={"improved candidates": improved_candidates} + ) return improved_candidates @@ -432,9 +435,10 @@ async def _implement_feedbacks( feedback_input = FeedbackMutationPromptInput( instruction=candidate[key], feedbacks=feedback ) - improved_candidate[key] = await self.feedback_mutation_prompt.generate( + output = await self.feedback_mutation_prompt.generate( data=feedback_input, llm=self.llm, callbacks=callbacks ) + improved_candidate[key] = output.instruction else: improved_candidate[key] = candidate[key] logger.warning( From 607a95014e8bbdc3fc930d43afafc2c43a155f26 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Tue, 3 Dec 2024 23:05:11 +0530 Subject: [PATCH 31/36] merge changes --- src/ragas/dataset_schema.py | 1 - src/ragas/loaders.py | 263 -------------------------------- src/ragas/metrics/base.py | 21 +-- src/ragas/optimizers/base.py | 
2 +- src/ragas/optimizers/genetic.py | 15 +- 5 files changed, 17 insertions(+), 285 deletions(-) delete mode 100644 src/ragas/loaders.py diff --git a/src/ragas/dataset_schema.py b/src/ragas/dataset_schema.py index 2000d8aeb..704144ee9 100644 --- a/src/ragas/dataset_schema.py +++ b/src/ragas/dataset_schema.py @@ -541,7 +541,6 @@ def upload(self, base_url: str = RAGAS_API_URL, verbose: bool = True) -> str: return evaluation_endpoint - class PromptAnnotation(BaseModel): prompt_input: t.Dict[str, t.Any] prompt_output: t.Dict[str, t.Any] diff --git a/src/ragas/loaders.py b/src/ragas/loaders.py deleted file mode 100644 index a51bbdd44..000000000 --- a/src/ragas/loaders.py +++ /dev/null @@ -1,263 +0,0 @@ -import json -import random -import typing as t -from collections import defaultdict - -import numpy as np -from pydantic import BaseModel - -from ragas.dataset_schema import EvaluationDataset - - -class PromptAnnotation(BaseModel): - prompt_input: t.Dict[str, t.Any] - prompt_output: t.Dict[str, t.Any] - is_accepted: bool - edited_output: t.Union[t.Dict[str, t.Any], None] - - def __getitem__(self, key): - return getattr(self, key) - - -class SampleAnnotation(BaseModel): - metric_input: t.Dict[str, t.Any] - metric_output: float - prompts: t.Dict[str, PromptAnnotation] - is_accepted: bool - target: t.Optional[float] = None - - def __getitem__(self, key): - return getattr(self, key) - - -class MetricAnnotation(BaseModel): - - root: t.Dict[str, t.List[SampleAnnotation]] - - def __getitem__(self, key): - return SingleMetricAnnotation(name=key, samples=self.root[key]) - - @classmethod - def from_json(cls, path, metric_name: t.Optional[str]) -> "MetricAnnotation": - - dataset = json.load(open(path)) - if metric_name is not None and metric_name not in dataset: - raise ValueError(f"Split {metric_name} not found in the dataset.") - - return cls( - root={ - key: [SampleAnnotation(**sample) for sample in value] - for key, value in dataset.items() - if metric_name is None or key == metric_name - } - ) - - def __len__(self): - return sum(len(value) for value in self.root.values()) - - -class SingleMetricAnnotation(BaseModel): - name: str - samples: t.List[SampleAnnotation] - - def to_evaluation_dataset(self) -> EvaluationDataset: - samples = [sample.metric_input for sample in self.samples] - return EvaluationDataset.from_list(samples) - - def __getitem__(self, idx): - return self.samples[idx] - - def __iter__(self) -> t.Iterator[SampleAnnotation]: # type: ignore - return iter(self.samples) - - def select(self, indices: t.List[int]) -> "SingleMetricAnnotation": - return SingleMetricAnnotation( - name=self.name, - samples=[self.samples[idx] for idx in indices], - ) - - @classmethod - def from_json(cls, path) -> "SingleMetricAnnotation": - - dataset = json.load(open(path)) - - return cls( - name=dataset["name"], - samples=[SampleAnnotation(**sample) for sample in dataset["samples"]], - ) - - def filter(self, function: t.Optional[t.Callable] = None): - - if function is None: - function = lambda x: True # noqa: E731 - - return SingleMetricAnnotation( - name=self.name, - samples=[sample for sample in self.samples if function(sample)], - ) - - def __len__(self): - return len(self.samples) - - def train_test_split( - self, - test_size: float = 0.2, - seed: int = 42, - stratify: t.Optional[t.List[t.Any]] = None, - ) -> t.Tuple["SingleMetricAnnotation", "SingleMetricAnnotation"]: - """ - Split the dataset into training and testing sets. 
- - Parameters: - test_size (float): The proportion of the dataset to include in the test split. - seed (int): Random seed for reproducibility. - stratify (list): The column values to stratify the split on. - """ - raise NotImplementedError - - def sample( - self, n: int, stratify_key: t.Optional[str] = None - ) -> "SingleMetricAnnotation": - """ - Create a subset of the dataset. - - Parameters: - n (int): The number of samples to include in the subset. - stratify_key (str): The column to stratify the subset on. - - Returns: - SingleMetricAnnotation: A subset of the dataset with `n` samples. - """ - if n > len(self.samples): - raise ValueError( - "Requested sample size exceeds the number of available samples." - ) - - if stratify_key is None: - # Simple random sampling - sampled_indices = random.sample(range(len(self.samples)), n) - sampled_samples = [self.samples[i] for i in sampled_indices] - else: - # Stratified sampling - class_groups = defaultdict(list) - for idx, sample in enumerate(self.samples): - key = sample.metric_input.get(stratify_key) - class_groups[key].append(idx) - - # Determine the proportion of samples to take from each class - total_samples = sum(len(indices) for indices in class_groups.values()) - proportions = { - cls: len(indices) / total_samples - for cls, indices in class_groups.items() - } - - sampled_indices = [] - for cls, indices in class_groups.items(): - cls_sample_count = int(np.round(proportions[cls] * n)) - cls_sample_count = min( - cls_sample_count, len(indices) - ) # Don't oversample - sampled_indices.extend(random.sample(indices, cls_sample_count)) - - # Handle any rounding discrepancies to ensure exactly `n` samples - while len(sampled_indices) < n: - remaining_indices = set(range(len(self.samples))) - set(sampled_indices) - if not remaining_indices: - break - sampled_indices.append(random.choice(list(remaining_indices))) - - sampled_samples = [self.samples[i] for i in sampled_indices] - - return SingleMetricAnnotation(name=self.name, samples=sampled_samples) - - def batch( - self, - batch_size: int, - drop_last_batch: bool = False, - ): - """ - Create a batch iterator. - - Parameters: - batch_size (int): The number of samples in each batch. - stratify (str): The column to stratify the batches on. - drop_last_batch (bool): Whether to drop the last batch if it is smaller than the specified batch size. - """ - - samples = self.samples[:] - random.shuffle(samples) - - all_batches = [ - samples[i : i + batch_size] - for i in range(0, len(samples), batch_size) - if len(samples[i : i + batch_size]) == batch_size or not drop_last_batch - ] - - return all_batches - - def stratified_batches( - self, - batch_size: int, - stratify_key: str, - drop_last_batch: bool = False, - replace: bool = False, - ) -> t.List[t.List[SampleAnnotation]]: - """ - Create stratified batches based on a specified key, ensuring proportional representation. - - Parameters: - batch_size (int): Number of samples per batch. - stratify_key (str): Key in `metric_input` used for stratification (e.g., class labels). - drop_last_batch (bool): If True, drops the last batch if it has fewer samples than `batch_size`. - replace (bool): If True, allows reusing samples from the same class to fill a batch if necessary. - - Returns: - List[List[SampleAnnotation]]: A list of stratified batches, each batch being a list of SampleAnnotation objects. 
- """ - # Group samples based on the stratification key - class_groups = defaultdict(list) - for sample in self.samples: - key = sample[stratify_key] - class_groups[key].append(sample) - - # Shuffle each class group for randomness - for group in class_groups.values(): - random.shuffle(group) - - # Determine the number of batches required - total_samples = len(self.samples) - num_batches = ( - np.ceil(total_samples / batch_size).astype(int) - if drop_last_batch - else np.floor(total_samples / batch_size).astype(int) - ) - samples_per_class_per_batch = { - cls: max(1, len(samples) // num_batches) - for cls, samples in class_groups.items() - } - - # Create stratified batches - all_batches = [] - while len(all_batches) < num_batches: - batch = [] - for cls, samples in list(class_groups.items()): - # Determine the number of samples to take from this class - count = min( - samples_per_class_per_batch[cls], - len(samples), - batch_size - len(batch), - ) - if count > 0: - # Add samples from the current class - batch.extend(samples[:count]) - class_groups[cls] = samples[count:] # Remove used samples - elif replace and len(batch) < batch_size: - # Reuse samples if `replace` is True - batch.extend(random.choices(samples, k=batch_size - len(batch))) - - # Shuffle the batch to mix classes - random.shuffle(batch) - if len(batch) == batch_size or not drop_last_batch: - all_batches.append(batch) - - return all_batches diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index e1e2097e5..fc21ff680 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -12,9 +12,8 @@ from ragas._analytics import EvaluationEvent, _analytics_batcher from ragas.callbacks import ChainType, new_group -from ragas.dataset_schema import MultiTurnSample, SingleTurnSample +from ragas.dataset_schema import MetricAnnotation, MultiTurnSample, SingleTurnSample from ragas.executor import is_event_loop_running -from ragas.loaders import MetricAnnotation from ragas.losses import BinaryMetricLoss, MSELoss from ragas.prompt import PromptMixin from ragas.run_config import RunConfig @@ -28,7 +27,6 @@ if t.TYPE_CHECKING: from langchain_core.callbacks import Callbacks - from ragas.config import DemonstrationConfig, InstructionConfig from ragas.config import DemonstrationConfig, InstructionConfig from ragas.embeddings import BaseRagasEmbeddings from ragas.llms import BaseRagasLLM @@ -69,13 +67,6 @@ class MetricOutputType(Enum): RANKING = "ranking" -class MetricOutputType(Enum): - BINARY = "binary" - DISCRETE = "discrete" - CONTINUOUS = "continuous" - RANKING = "ranking" - - @dataclass class Metric(ABC): """ @@ -92,9 +83,6 @@ class Metric(ABC): _required_columns: t.Dict[MetricType, t.Set[str]] = field(default_factory=dict) name: str = field(default="", repr=True) - output_type: MetricOutputType = ( - MetricOutputType.DISCRETE - ) # TODO: remove default and make it required, add corresponding value to every metric def __post_init__(self): if self.name == "": @@ -282,10 +270,13 @@ def train( optimizer.metric = self optimizer_config = instruction_config.optimizer_config or {} - optimizer.optimize( + optimized_prompts = optimizer.optimize( dataset[self.name], loss_fun, optimizer_config, callbacks=callbacks ) - + prompts = self.get_prompts() + for key, val in optimized_prompts.items(): + prompts[key].instruction = val + self.set_prompts(**prompts) return diff --git a/src/ragas/optimizers/base.py b/src/ragas/optimizers/base.py index 974e2d4bd..2ba582a52 100644 --- a/src/ragas/optimizers/base.py +++ 
b/src/ragas/optimizers/base.py @@ -4,8 +4,8 @@ from langchain_core.callbacks import Callbacks +from ragas.dataset_schema import SingleMetricAnnotation from ragas.llms.base import BaseRagasLLM -from ragas.loaders import SingleMetricAnnotation from ragas.losses import Loss from ragas.metrics.base import MetricWithLLM from ragas.run_config import RunConfig diff --git a/src/ragas/optimizers/genetic.py b/src/ragas/optimizers/genetic.py index 2a6d1960e..d491bf719 100644 --- a/src/ragas/optimizers/genetic.py +++ b/src/ragas/optimizers/genetic.py @@ -7,10 +7,14 @@ from pydantic import BaseModel from ragas.callbacks import new_group -from ragas.dataset_schema import EvaluationDataset, EvaluationResult +from ragas.dataset_schema import ( + EvaluationDataset, + EvaluationResult, + SampleAnnotation, + SingleMetricAnnotation, +) from ragas.evaluation import evaluate from ragas.executor import Executor -from ragas.loaders import SampleAnnotation, SingleMetricAnnotation from ragas.losses import Loss from ragas.optimizers.base import Optimizer from ragas.optimizers.utils import hamming_distance @@ -389,9 +393,7 @@ def feedback_mutation( candidate_rm.on_chain_end( outputs={"improved_candidate": improved_candidate[0]} ) - feedback_rm.on_chain_end( - outputs={"improved candidates": improved_candidates} - ) + feedback_rm.on_chain_end(outputs={"improved candidates": improved_candidates}) return improved_candidates @@ -503,6 +505,9 @@ def _get_evaluation_dataset( if self.metric is None: raise ValueError("No metric provided for optimization.") + if self.metric.output_type is None: + raise ValueError("No output type provided for the metric.") + training_ids = [] y_true = [] for idx, sample in enumerate(dataset): From 27767eb66bdbc20dcfa48b2deedc3ed795f4709f Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Tue, 3 Dec 2024 23:27:33 +0530 Subject: [PATCH 32/36] add defaults --- src/ragas/losses.py | 14 ++++++++++++++ src/ragas/metrics/base.py | 14 ++++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/ragas/losses.py b/src/ragas/losses.py index 52661bd87..0efc15eaa 100644 --- a/src/ragas/losses.py +++ b/src/ragas/losses.py @@ -1,6 +1,9 @@ import typing as t from abc import ABC, abstractmethod +from pydantic import GetCoreSchemaHandler +from pydantic_core import CoreSchema, core_schema + class Loss(ABC): """ @@ -11,6 +14,17 @@ class Loss(ABC): def __call__(self, predicted: t.List, actual: t.List) -> float: raise NotImplementedError + @classmethod + def __get_pydantic_core_schema__( + cls, source_type: t.Any, handler: GetCoreSchemaHandler + ) -> CoreSchema: + """ + Define how Pydantic generates a schema for BaseRagasEmbeddings. 
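# Illustrative sketch (not ragas code): the __get_pydantic_core_schema__ hook
# above is what lets a plain Python class (here, a loss object) be declared as
# a field on a pydantic v2 model. A minimal variant validates the field with an
# isinstance check; MyLoss and TrainConfig are made-up names.
import typing as t

from pydantic import BaseModel, GetCoreSchemaHandler
from pydantic_core import CoreSchema, core_schema

class MyLoss:
    def __call__(self, predicted: t.List[float], actual: t.List[float]) -> float:
        return 0.0  # placeholder

    @classmethod
    def __get_pydantic_core_schema__(
        cls, source_type: t.Any, handler: GetCoreSchemaHandler
    ) -> CoreSchema:
        return core_schema.is_instance_schema(cls)

class TrainConfig(BaseModel):
    loss: MyLoss

TrainConfig(loss=MyLoss())        # accepted
# TrainConfig(loss=3)             # would raise a ValidationError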
+ """ + return core_schema.no_info_after_validator_function( + cls, core_schema.is_instance_schema(cls) # The validator function + ) + class MSELoss(Loss): """ diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index fc21ff680..ce77f0be5 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -233,14 +233,24 @@ def init(self, run_config: RunConfig): def train( self, path: str, - demonstration_config: DemonstrationConfig, - instruction_config: InstructionConfig, + demonstration_config: t.Optional[DemonstrationConfig] = None, + instruction_config: t.Optional[InstructionConfig] = None, callbacks: Callbacks = None, ) -> None: if not path.endswith(".json"): raise ValueError("Train data must be in json format") + if instruction_config is None: + from ragas.config import InstructionConfig + + instruction_config = InstructionConfig() + + if demonstration_config is None: + from ragas.config import DemonstrationConfig + + demonstration_config = DemonstrationConfig() + dataset = MetricAnnotation.from_json(path, metric_name=self.name) optimizer = instruction_config.optimizer From 75cc309683de9abf9f6f42c0cf660b2132104c14 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Tue, 3 Dec 2024 23:58:34 +0530 Subject: [PATCH 33/36] fix key mapping isssues --- src/ragas/optimizers/genetic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/ragas/optimizers/genetic.py b/src/ragas/optimizers/genetic.py index d491bf719..245758b8f 100644 --- a/src/ragas/optimizers/genetic.py +++ b/src/ragas/optimizers/genetic.py @@ -547,6 +547,12 @@ def evaluate_candidate( raise_exceptions=raise_exceptions, run_id=run_id, ) + # remap the traces to the original prompt names + remap_traces = {val.name: key for key, val in self.metric.get_prompts().items()} + for trace in results.traces: + for key in remap_traces: + if key in trace[self.metric.name]: + trace[self.metric.name][remap_traces[key]] = trace[self.metric.name].pop(key) return results def evaluate_fitness( From e6374e6b913e9db47dc7ad63f0d05cf0b221cc4c Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 5 Dec 2024 14:06:10 +0530 Subject: [PATCH 34/36] fixed progress bar --- src/ragas/evaluation.py | 3 + src/ragas/executor.py | 38 +++++---- src/ragas/optimizers/genetic.py | 131 +++++++++++++++++++++----------- 3 files changed, 113 insertions(+), 59 deletions(-) diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index 6850fc7ad..5ba83e905 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -7,6 +7,7 @@ from langchain_core.callbacks import BaseCallbackHandler, BaseCallbackManager from langchain_core.embeddings import Embeddings as LangchainEmbeddings from langchain_core.language_models import BaseLanguageModel as LangchainLLM +from tqdm.auto import tqdm from ragas._analytics import track_was_completed from ragas.callbacks import ChainType, RagasTracer, new_group @@ -67,6 +68,7 @@ def evaluate( show_progress: bool = True, batch_size: t.Optional[int] = None, run_id: t.Optional[UUID] = None, + pbar: t.Optional[tqdm] = None, ) -> EvaluationResult: """ Run the evaluation on the dataset with different metrics @@ -229,6 +231,7 @@ def evaluate( run_config=run_config, show_progress=show_progress, batch_size=batch_size, + pbar=pbar, ) # Ragas Callbacks diff --git a/src/ragas/executor.py b/src/ragas/executor.py index 2ead05e17..9d90f0ca8 100644 --- a/src/ragas/executor.py +++ b/src/ragas/executor.py @@ -81,6 +81,7 @@ class Executor: batch_size: t.Optional[int] = None run_config: t.Optional[RunConfig] = 
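# Illustrative usage sketch (file name, metric name and values are made up):
# train() expects a .json file in the MetricAnnotation layout, i.e. a mapping
# of metric name -> list of annotated samples with metric_input, metric_output,
# prompts and is_accepted, as defined earlier in this series. With PATCH 32's
# defaults the instruction/demonstration configs can be omitted; the metric is
# assumed to already have an LLM configured.
import json

annotations = {
    "my_metric": [
        {
            "metric_input": {"user_input": "...", "response": "...", "reference": "..."},
            "metric_output": 1.0,
            "prompts": {},      # per-prompt input/output annotations go here
            "is_accepted": True,
        }
    ]
}
with open("edited_annotations.json", "w") as f:
    json.dump(annotations, f)

# my_metric.train("edited_annotations.json")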
field(default=None, repr=False) _nest_asyncio_applied: bool = field(default=False, repr=False) + pbar: t.Optional[tqdm] = None def wrap_callable_with_index( self, callable: t.Callable, counter: int @@ -127,21 +128,22 @@ async def _process_jobs(self) -> t.List[t.Any]: results = [] if not self.batch_size: - with tqdm( - total=len(self.jobs), - desc=self.desc, - disable=not self.show_progress, - ) as pbar: - # Create coroutines - coroutines = [ - afunc(*args, **kwargs) for afunc, args, kwargs, _ in self.jobs - ] - for future in await as_completed(coroutines, max_workers): - result = await future - results.append(result) - pbar.update(1) + # Use external progress bar if provided, otherwise create one + if self.pbar is None: + with tqdm( + total=len(self.jobs), + desc=self.desc, + disable=not self.show_progress, + ) as internal_pbar: + await self._process_coroutines( + self.jobs, internal_pbar, results, max_workers + ) + else: + await self._process_coroutines( + self.jobs, self.pbar, results, max_workers + ) - return results + return results # With batching, show nested progress bars batches = batched(self.jobs, self.batch_size) # generator of job tuples @@ -179,6 +181,14 @@ async def _process_jobs(self) -> t.List[t.Any]: return results + async def _process_coroutines(self, jobs, pbar, results, max_workers): + """Helper function to process coroutines and update the progress bar.""" + coroutines = [afunc(*args, **kwargs) for afunc, args, kwargs, _ in jobs] + for future in await as_completed(coroutines, max_workers): + result = await future + results.append(result) + pbar.update(1) + def results(self) -> t.List[t.Any]: """ Execute all submitted jobs and return their results. The results are returned in the order of job submission. diff --git a/src/ragas/optimizers/genetic.py b/src/ragas/optimizers/genetic.py index 245758b8f..243b061f6 100644 --- a/src/ragas/optimizers/genetic.py +++ b/src/ragas/optimizers/genetic.py @@ -5,6 +5,7 @@ import numpy as np from langchain_core.callbacks import Callbacks from pydantic import BaseModel +from tqdm.auto import tqdm from ragas.callbacks import new_group from ragas.dataset_schema import ( @@ -159,6 +160,7 @@ def optimize( population_size = config.get("population_size", 3) num_demonstrations = config.get("num_demonstrations", 3) + sample_size = config.get("sample_size", 10) # new group for optimization optimization_generation_rm, optimization_generation_grp = new_group( @@ -167,51 +169,76 @@ def optimize( callbacks=callbacks, ) - initial_population = self._initialize_population( - dataset, - population_size, - num_demonstrations, - run_config, - batch_size, - optimization_generation_grp, - raise_exceptions, - ) - # TODO: replace with metric.get_prompts('function_name') - seed_prompts = { - key: val.instruction - for key, val in self.metric.get_prompts().items() - if key in initial_population[0].keys() - } - initial_population.append(seed_prompts) - - improved_prompts = self.feedback_mutation( - initial_population, - dataset, - sample_size=10, - run_config=run_config, - batch_size=batch_size, - callbacks=optimization_generation_grp, - raise_exceptions=raise_exceptions, - ) + stages = [ + {"name": "Initializing Population", "steps": population_size - 1}, + { + "name": "Feedback Mutation", + "steps": population_size * sample_size + population_size, + }, + { + "name": "Cross-over Mutation", + "steps": population_size * len(dataset) + population_size, + }, + {"name": "Fitness Evaluation", "steps": population_size * len(dataset)}, + ] + total_steps = 
sum([stage["steps"] for stage in stages]) + with tqdm( + total=total_steps, desc="Overall Progress", dynamic_ncols=True + ) as pbar: + + pbar.set_description(f"{stages[0]['name']} Step 1/{len(stages)}") + initial_population = self.initialize_population( + dataset, + population_size - 1, + num_demonstrations, + run_config, + batch_size, + optimization_generation_grp, + raise_exceptions, + pbar, + ) + # TODO: replace with metric.get_prompts('function_name') + seed_prompts = { + key: val.instruction + for key, val in self.metric.get_prompts().items() + if key in initial_population[0].keys() + } + initial_population.append(seed_prompts) + + pbar.set_description(f"{stages[1]['name']} Step 2/{len(stages)}") + improved_prompts = self.feedback_mutation( + initial_population, + dataset, + sample_size=sample_size, + run_config=run_config, + batch_size=batch_size, + callbacks=optimization_generation_grp, + raise_exceptions=raise_exceptions, + pbar=pbar, + ) - improved_prompts = self.cross_over_mutation( - improved_prompts, - dataset, - run_config=run_config, - batch_size=batch_size, - callbacks=optimization_generation_grp, - raise_exceptions=raise_exceptions, - ) + pbar.set_description(f"{stages[2]['name']} Step 3/{len(stages)}") + improved_prompts = self.cross_over_mutation( + improved_prompts, + dataset, + run_config=run_config, + batch_size=batch_size, + callbacks=optimization_generation_grp, + raise_exceptions=raise_exceptions, + pbar=pbar, + ) - fitness_scores = self.evaluate_fitness( - improved_prompts, - dataset, - loss, - run_config=run_config, - batch_size=batch_size, - callbacks=optimization_generation_grp, - raise_exceptions=raise_exceptions, - ) + pbar.set_description(f"{stages[3]['name']} Step 4/{len(stages)}") + fitness_scores = self.evaluate_fitness( + improved_prompts, + dataset, + loss, + run_config=run_config, + batch_size=batch_size, + callbacks=optimization_generation_grp, + raise_exceptions=raise_exceptions, + pbar=pbar, + ) best_candidate = improved_prompts[np.argmax(fitness_scores)] optimization_generation_rm.on_chain_end( @@ -220,7 +247,7 @@ def optimize( return best_candidate - def _initialize_population( + def initialize_population( self, dataset: SingleMetricAnnotation, population_size: int, @@ -229,6 +256,7 @@ def _initialize_population( batch_size: t.Optional[int] = None, callbacks: t.Optional[Callbacks] = None, raise_exceptions: bool = True, + pbar: t.Optional[tqdm] = None, ) -> t.List[t.Dict[str, str]]: initialize_population_rm, initialize_population_grp = new_group( @@ -243,6 +271,7 @@ def _initialize_population( run_config=run_config, keep_progress_bar=False, batch_size=batch_size, + pbar=pbar, ) candidates = [] @@ -338,6 +367,7 @@ def feedback_mutation( batch_size: t.Optional[int] = None, callbacks: t.Optional[Callbacks] = None, raise_exceptions: bool = True, + pbar: t.Optional[tqdm] = None, ) -> t.List[t.Dict[str, str]]: if self.metric is None: @@ -366,6 +396,7 @@ def feedback_mutation( callbacks=candidate_grp, raise_exceptions=raise_exceptions, run_id=candidate_rm.run_id, + pbar=pbar, ) exec = Executor( @@ -374,6 +405,7 @@ def feedback_mutation( run_config=run_config, keep_progress_bar=False, batch_size=batch_size, + pbar=pbar, ) exec.submit( self._feedback_mutation, @@ -531,6 +563,7 @@ def evaluate_candidate( callbacks: t.Optional[Callbacks] = None, raise_exceptions: bool = True, run_id: t.Optional[UUID] = None, + pbar: t.Optional[tqdm] = None, ) -> EvaluationResult: if self.metric is None: @@ -546,13 +579,16 @@ def evaluate_candidate( callbacks=callbacks, 
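# Illustrative sketch (not ragas code) of the shared progress-bar pattern the
# pbar plumbing above enables: a single outer tqdm is created with the total
# step count, and each stage only relabels and advances it instead of opening
# its own nested bar (assumes tqdm is installed).
from tqdm.auto import tqdm

stages = [("Initializing Population", 2), ("Feedback Mutation", 8)]
total_steps = sum(steps for _, steps in stages)

with tqdm(total=total_steps, desc="Overall Progress", dynamic_ncols=True) as pbar:
    for step, (name, steps) in enumerate(stages, start=1):
        pbar.set_description(f"{name} Step {step}/{len(stages)}")
        for _ in range(steps):
            pbar.update(1)  # in the optimizer this happens inside Executor jobs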
raise_exceptions=raise_exceptions, run_id=run_id, + pbar=pbar, ) # remap the traces to the original prompt names remap_traces = {val.name: key for key, val in self.metric.get_prompts().items()} for trace in results.traces: for key in remap_traces: if key in trace[self.metric.name]: - trace[self.metric.name][remap_traces[key]] = trace[self.metric.name].pop(key) + trace[self.metric.name][remap_traces[key]] = trace[ + self.metric.name + ].pop(key) return results def evaluate_fitness( @@ -564,6 +600,7 @@ def evaluate_fitness( batch_size: t.Optional[int] = None, callbacks: t.Optional[Callbacks] = None, raise_exceptions: bool = True, + pbar: t.Optional[tqdm] = None, ) -> t.List[float]: if self.metric is None: @@ -589,6 +626,7 @@ def evaluate_fitness( callbacks=initialize_population_grp, raise_exceptions=raise_exceptions, run_id=run_id, + pbar=pbar, ) y_pred = results.to_pandas()[self.metric.name].values.tolist() loss = loss_fn(y_true, y_pred) @@ -625,6 +663,7 @@ def cross_over_mutation( batch_size: t.Optional[int] = None, callbacks: t.Optional[Callbacks] = None, raise_exceptions: bool = True, + pbar: t.Optional[tqdm] = None, ): if self.metric is None: @@ -652,6 +691,7 @@ def cross_over_mutation( callbacks=cross_over_grp, raise_exceptions=raise_exceptions, run_id=run_id, + pbar=pbar, ) y_pred = results.to_pandas()[self.metric.name].values.tolist() prediction = [int(pred == true) for pred, true in zip(y_pred, y_true)] @@ -666,6 +706,7 @@ def cross_over_mutation( run_config=run_config, keep_progress_bar=False, batch_size=batch_size, + pbar=pbar, ) offspring_candidates = [] From 1801ea5e79dfe5e4c236789a9bea91bd74da3300 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Sat, 7 Dec 2024 13:28:13 +0530 Subject: [PATCH 35/36] added suggested improvements --- src/ragas/evaluation.py | 8 +- src/ragas/metrics/base.py | 28 ++++- src/ragas/optimizers/genetic.py | 201 +++++++++++++++++--------------- 3 files changed, 136 insertions(+), 101 deletions(-) diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index 5ba83e905..9fa1e0116 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -67,8 +67,8 @@ def evaluate( column_map: t.Optional[t.Dict[str, str]] = None, show_progress: bool = True, batch_size: t.Optional[int] = None, - run_id: t.Optional[UUID] = None, - pbar: t.Optional[tqdm] = None, + _run_id: t.Optional[UUID] = None, + _pbar: t.Optional[tqdm] = None, ) -> EvaluationResult: """ Run the evaluation on the dataset with different metrics @@ -231,7 +231,7 @@ def evaluate( run_config=run_config, show_progress=show_progress, batch_size=batch_size, - pbar=pbar, + pbar=_pbar, ) # Ragas Callbacks @@ -339,7 +339,7 @@ def evaluate( cost_cb, ), ragas_traces=tracer.traces, - run_id=run_id, + run_id=_run_id, ) if not evaluation_group_cm.ended: evaluation_rm.on_chain_end({"scores": result.scores}) diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index ce77f0be5..52838b0b2 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -235,7 +235,11 @@ def train( path: str, demonstration_config: t.Optional[DemonstrationConfig] = None, instruction_config: t.Optional[InstructionConfig] = None, - callbacks: Callbacks = None, + callbacks: t.Optional[Callbacks] = None, + run_config: t.Optional[RunConfig] = None, + batch_size: t.Optional[int] = None, + with_debugging_logs=False, + raise_exceptions: bool = True, ) -> None: if not path.endswith(".json"): @@ -263,16 +267,21 @@ def train( optimizer.llm = llm if instruction_config.loss is None: - if self.output_type == 
MetricOutputType.BINARY: + if self.output_type is None: + raise ValueError( + f"Output type for metric '{self.name}' is not defined. Please set the output type in the metric or in the instruction config." + ) + + if self.output_type.name == MetricOutputType.BINARY.name: loss_fun = BinaryMetricLoss() elif ( - self.output_type == MetricOutputType.CONTINUOUS - or self.output_type == MetricOutputType.DISCRETE + self.output_type.name == MetricOutputType.CONTINUOUS.name + or self.output_type.name == MetricOutputType.DISCRETE.name ): loss_fun = MSELoss() else: raise NotImplementedError( - f"Output type '{self.output_type}' not implemented" + f"Output type '{self.output_type.name}' not implemented" ) else: loss_fun = instruction_config.loss @@ -281,7 +290,14 @@ def train( optimizer_config = instruction_config.optimizer_config or {} optimized_prompts = optimizer.optimize( - dataset[self.name], loss_fun, optimizer_config, callbacks=callbacks + dataset[self.name], + loss_fun, + optimizer_config, + callbacks=callbacks, + run_config=run_config, + batch_size=batch_size, + with_debugging_logs=with_debugging_logs, + raise_exceptions=raise_exceptions, ) prompts = self.get_prompts() for key, val in optimized_prompts.items(): diff --git a/src/ragas/optimizers/genetic.py b/src/ragas/optimizers/genetic.py index 243b061f6..cb35875e1 100644 --- a/src/ragas/optimizers/genetic.py +++ b/src/ragas/optimizers/genetic.py @@ -26,14 +26,16 @@ RAGAS_OPTIMIZATION_GROUP = "ragas_optimization" +example_type = t.TypeVar( + "example_type", bound=t.Dict[t.Dict[str, t.Any], t.Dict[str, t.Any]] +) + class FormattedExamples(BaseModel): examples: t.List[t.Tuple[str, t.Any]] @classmethod - def from_examples( - cls, examples: t.List[t.Dict[t.Dict[str, t.Any], t.Dict[str, t.Any]]] - ) -> "FormattedExamples": + def from_examples(cls, examples: t.List[example_type]) -> "FormattedExamples": formated_examples = [] for example in examples: @@ -184,20 +186,21 @@ def optimize( total_steps = sum([stage["steps"] for stage in stages]) with tqdm( total=total_steps, desc="Overall Progress", dynamic_ncols=True - ) as pbar: + ) as parent_pbar: - pbar.set_description(f"{stages[0]['name']} Step 1/{len(stages)}") + parent_pbar.set_description(f"{stages[0]['name']} Step 1/{len(stages)}") initial_population = self.initialize_population( - dataset, - population_size - 1, - num_demonstrations, - run_config, - batch_size, - optimization_generation_grp, - raise_exceptions, - pbar, + dataset=dataset, + population_size=population_size - 1, + num_demonstrations=num_demonstrations, + run_config=run_config, + batch_size=batch_size, + callbacks=optimization_generation_grp, + raise_exceptions=raise_exceptions, + parent_pbar=parent_pbar, ) - # TODO: replace with metric.get_prompts('function_name') + + # get the default prompt used in the metric as seed prompt seed_prompts = { key: val.instruction for key, val in self.metric.get_prompts().items() @@ -205,7 +208,7 @@ def optimize( } initial_population.append(seed_prompts) - pbar.set_description(f"{stages[1]['name']} Step 2/{len(stages)}") + parent_pbar.set_description(f"{stages[1]['name']} Step 2/{len(stages)}") improved_prompts = self.feedback_mutation( initial_population, dataset, @@ -214,30 +217,30 @@ def optimize( batch_size=batch_size, callbacks=optimization_generation_grp, raise_exceptions=raise_exceptions, - pbar=pbar, + parent_pbar=parent_pbar, ) - pbar.set_description(f"{stages[2]['name']} Step 3/{len(stages)}") + parent_pbar.set_description(f"{stages[2]['name']} Step 3/{len(stages)}") improved_prompts = 
self.cross_over_mutation( - improved_prompts, - dataset, + candidates=improved_prompts, + dataset=dataset, run_config=run_config, batch_size=batch_size, callbacks=optimization_generation_grp, raise_exceptions=raise_exceptions, - pbar=pbar, + parent_pbar=parent_pbar, ) - pbar.set_description(f"{stages[3]['name']} Step 4/{len(stages)}") + parent_pbar.set_description(f"{stages[3]['name']} Step 4/{len(stages)}") fitness_scores = self.evaluate_fitness( - improved_prompts, - dataset, - loss, + candidates=improved_prompts, + dataset=dataset, + loss_fn=loss, run_config=run_config, batch_size=batch_size, callbacks=optimization_generation_grp, raise_exceptions=raise_exceptions, - pbar=pbar, + parent_pbar=parent_pbar, ) best_candidate = improved_prompts[np.argmax(fitness_scores)] @@ -249,6 +252,7 @@ def optimize( def initialize_population( self, + *, dataset: SingleMetricAnnotation, population_size: int, num_demonstrations: int = 3, @@ -256,7 +260,7 @@ def initialize_population( batch_size: t.Optional[int] = None, callbacks: t.Optional[Callbacks] = None, raise_exceptions: bool = True, - pbar: t.Optional[tqdm] = None, + parent_pbar: t.Optional[tqdm] = None, ) -> t.List[t.Dict[str, str]]: initialize_population_rm, initialize_population_grp = new_group( @@ -271,7 +275,7 @@ def initialize_population( run_config=run_config, keep_progress_bar=False, batch_size=batch_size, - pbar=pbar, + pbar=parent_pbar, ) candidates = [] @@ -367,7 +371,7 @@ def feedback_mutation( batch_size: t.Optional[int] = None, callbacks: t.Optional[Callbacks] = None, raise_exceptions: bool = True, - pbar: t.Optional[tqdm] = None, + parent_pbar: t.Optional[tqdm] = None, ) -> t.List[t.Dict[str, str]]: if self.metric is None: @@ -380,51 +384,38 @@ def feedback_mutation( ) improved_candidates = [] dataset = dataset.filter(lambda x: x["is_accepted"]) + + exec = Executor( + desc="Feedback Mutation", + raise_exceptions=raise_exceptions, + run_config=run_config, + keep_progress_bar=False, + batch_size=batch_size, + pbar=parent_pbar, + ) + for candidate in candidates: - candidate_rm, candidate_grp = new_group( - name="Candidate feedback mutation", - inputs={"candidate": candidate}, - callbacks=feedback_grp, - ) dataset_sample = dataset.sample(sample_size, stratify_key="metric_output") - batch, target = self._get_evaluation_dataset(dataset_sample) - results = self.evaluate_candidate( - candidate=candidate, - eval_dataset=batch, - run_config=run_config, - batch_size=batch_size, - callbacks=candidate_grp, - raise_exceptions=raise_exceptions, - run_id=candidate_rm.run_id, - pbar=pbar, - ) - - exec = Executor( - desc="Getting feedbacks", - raise_exceptions=raise_exceptions, - run_config=run_config, - keep_progress_bar=False, - batch_size=batch_size, - pbar=pbar, - ) exec.submit( self._feedback_mutation, candidate=candidate, dataset=dataset_sample, - results=results, - target=target, - callbacks=candidate_grp, + callbacks=feedback_grp, + raise_exceptions=raise_exceptions, + batch_size=batch_size, + run_config=run_config, + parent_pbar=parent_pbar, + ) + + try: + improved_candidates = exec.results() + except Exception as e: + feedback_rm.on_chain_error(e) + raise e + else: + feedback_rm.on_chain_end( + outputs={"improved_candidate": improved_candidates} ) - try: - improved_candidate = exec.results() - improved_candidates.append(improved_candidate[0]) - except Exception as e: - candidate_rm.on_chain_error(e) - raise e - else: - candidate_rm.on_chain_end( - outputs={"improved_candidate": improved_candidate[0]} - ) 
feedback_rm.on_chain_end(outputs={"improved candidates": improved_candidates}) return improved_candidates @@ -432,10 +423,12 @@ def feedback_mutation( async def _feedback_mutation( self, candidate: t.Dict[str, str], - dataset: SampleAnnotation, - results: EvaluationResult, - target: t.List[float], - callbacks: Callbacks = None, + dataset: SingleMetricAnnotation, + run_config: t.Optional[RunConfig] = None, + batch_size: t.Optional[int] = None, + callbacks: t.Optional[Callbacks] = None, + raise_exceptions: bool = True, + parent_pbar: t.Optional[tqdm] = None, ) -> t.Dict[str, str]: if self.llm is None: @@ -444,13 +437,32 @@ async def _feedback_mutation( if self.metric is None: raise ValueError("No metric provided for optimization.") - feedback_candidates = await self._get_feedbacks( - candidate, dataset, results, target, callbacks + candidate_rm, candidate_grp = new_group( + name="Candidate feedback mutation", + inputs={"candidate": candidate}, + callbacks=callbacks, ) - improved_candidates = await self._implement_feedbacks( - candidate, feedback_candidates, callbacks + batch, target = self._get_evaluation_dataset(dataset) + results = self.evaluate_candidate( + candidate=candidate, + eval_dataset=batch, + run_config=run_config, + batch_size=batch_size, + callbacks=candidate_grp, + raise_exceptions=raise_exceptions, + run_id=candidate_rm.run_id, + parent_pbar=parent_pbar, + ) + + feedback_candidate = await self._get_feedbacks( + candidate, dataset, results, target, candidate_grp + ) + improved_candidate = await self._implement_feedbacks( + candidate, feedback_candidate, candidate_grp ) - return improved_candidates + + candidate_rm.on_chain_end(outputs={"improved_candidate": improved_candidate}) + return improved_candidate async def _implement_feedbacks( self, @@ -484,7 +496,7 @@ async def _implement_feedbacks( async def _get_feedbacks( self, candidate: t.Dict[str, str], - dataset: SampleAnnotation, + dataset: SingleMetricAnnotation, results: EvaluationResult, target: t.List[float], callbacks: Callbacks = None, @@ -504,29 +516,33 @@ def dict_to_str(dict: t.Dict[str, t.Any]) -> str: traces = [trace[self.metric.name] for trace in results.traces] if indices: feedback_candidates = {} - for key in candidate.keys(): + for prompt_name in candidate.keys(): feedback_data = [ FeedbackExample( input=dict_to_str( - traces[idx][key]["input"].model_dump(exclude_none=True) + traces[idx][prompt_name]["input"].model_dump( + exclude_none=True + ) ), - output=traces[idx][key]["output"][0].model_dump( + output=traces[idx][prompt_name]["output"][0].model_dump( exclude_none=True ), - expected_output=dataset[idx]["prompts"][key]["prompt_output"], + expected_output=dataset[idx]["prompts"][prompt_name][ + "prompt_output" + ], ) for idx in indices ] prompt_input = FeedbackMutationInput( - instruction=candidate[key], examples=feedback_data + instruction=candidate[prompt_name], examples=feedback_data ) feedbacks = await self.feedback_generation_prompt.generate( data=prompt_input, llm=self.llm, callbacks=callbacks ) - feedback_candidates[key] = feedbacks.feedbacks + feedback_candidates[prompt_name] = feedbacks.feedbacks else: logger.warning("No samples found for the feedback generation.") - feedback_candidates = {key: [] for key in candidate.keys()} + feedback_candidates = {prompt_name: [] for prompt_name in candidate.keys()} return feedback_candidates @@ -556,6 +572,7 @@ def _get_evaluation_dataset( def evaluate_candidate( self, + *, candidate: t.Dict[str, str], eval_dataset: EvaluationDataset, run_config: 
t.Optional[RunConfig] = None, @@ -563,7 +580,7 @@ def evaluate_candidate( callbacks: t.Optional[Callbacks] = None, raise_exceptions: bool = True, run_id: t.Optional[UUID] = None, - pbar: t.Optional[tqdm] = None, + parent_pbar: t.Optional[tqdm] = None, ) -> EvaluationResult: if self.metric is None: @@ -578,8 +595,8 @@ def evaluate_candidate( batch_size=batch_size, callbacks=callbacks, raise_exceptions=raise_exceptions, - run_id=run_id, - pbar=pbar, + _run_id=run_id, + _pbar=parent_pbar, ) # remap the traces to the original prompt names remap_traces = {val.name: key for key, val in self.metric.get_prompts().items()} @@ -593,6 +610,7 @@ def evaluate_candidate( def evaluate_fitness( self, + *, candidates: t.List[t.Dict[str, str]], dataset: SingleMetricAnnotation, loss_fn: Loss, @@ -600,7 +618,7 @@ def evaluate_fitness( batch_size: t.Optional[int] = None, callbacks: t.Optional[Callbacks] = None, raise_exceptions: bool = True, - pbar: t.Optional[tqdm] = None, + parent_pbar: t.Optional[tqdm] = None, ) -> t.List[float]: if self.metric is None: @@ -626,7 +644,7 @@ def evaluate_fitness( callbacks=initialize_population_grp, raise_exceptions=raise_exceptions, run_id=run_id, - pbar=pbar, + parent_pbar=parent_pbar, ) y_pred = results.to_pandas()[self.metric.name].values.tolist() loss = loss_fn(y_true, y_pred) @@ -657,13 +675,14 @@ async def _cross_over_chain( def cross_over_mutation( self, + *, candidates: t.List[t.Dict[str, str]], dataset: SingleMetricAnnotation, run_config: t.Optional[RunConfig] = None, batch_size: t.Optional[int] = None, callbacks: t.Optional[Callbacks] = None, raise_exceptions: bool = True, - pbar: t.Optional[tqdm] = None, + parent_pbar: t.Optional[tqdm] = None, ): if self.metric is None: @@ -691,7 +710,7 @@ def cross_over_mutation( callbacks=cross_over_grp, raise_exceptions=raise_exceptions, run_id=run_id, - pbar=pbar, + parent_pbar=parent_pbar, ) y_pred = results.to_pandas()[self.metric.name].values.tolist() prediction = [int(pred == true) for pred, true in zip(y_pred, y_true)] @@ -706,7 +725,7 @@ def cross_over_mutation( run_config=run_config, keep_progress_bar=False, batch_size=batch_size, - pbar=pbar, + pbar=parent_pbar, ) offspring_candidates = [] From fc2b40e4dabe98c29f71576a1f1317f616f35d81 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Sat, 7 Dec 2024 16:24:54 +0530 Subject: [PATCH 36/36] check for empty feedback --- src/ragas/optimizers/genetic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragas/optimizers/genetic.py b/src/ragas/optimizers/genetic.py index cb35875e1..9dd7cf538 100644 --- a/src/ragas/optimizers/genetic.py +++ b/src/ragas/optimizers/genetic.py @@ -477,7 +477,7 @@ async def _implement_feedbacks( improved_candidate = {} for key in candidate.keys(): feedback = feedbacks[key] - if feedbacks: + if feedback: feedback_input = FeedbackMutationPromptInput( instruction=candidate[key], feedbacks=feedback )
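
Usage sketch (illustrative, not part of the patch series): how a caller could drive several Executor runs from a single outer progress bar via the new pbar field, mirroring the way the genetic optimizer threads parent_pbar through its stages. The job function, job counts, and the import path for Executor are assumptions; only the desc/raise_exceptions/keep_progress_bar/pbar keywords and the submit()/results() calls are taken from the hunks above.

import asyncio

from tqdm.auto import tqdm

from ragas.executor import Executor  # assumed location of the Executor patched above


async def dummy_job(x: int) -> int:
    # Placeholder for real async work (e.g. an LLM call).
    await asyncio.sleep(0)
    return x * x


total_jobs = 8
with tqdm(total=total_jobs, desc="Overall Progress", dynamic_ncols=True) as parent_pbar:
    for chunk in ([0, 1, 2, 3], [4, 5, 6, 7]):
        executor = Executor(
            desc="Inner stage",
            raise_exceptions=True,
            keep_progress_bar=False,
            pbar=parent_pbar,  # external bar: each finished job advances it by one step
        )
        for i in chunk:
            executor.submit(dummy_job, i)
        chunk_results = executor.results()  # runs the jobs; no nested bar is created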
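
And a rough end-to-end sketch of the train() entry point whose signature PATCH 35 extends. Everything outside the train() keyword list is an assumption: the concrete metric class, the evaluator LLM wrapper, the JSON file name, and the exact InstructionConfig fields (the patches only show that the instruction config supplies the optimizer's LLM, loss, and optimizer_config, and that path must end in .json). demonstration_config and callbacks are optional and omitted here.

from langchain_openai import ChatOpenAI

from ragas.config import InstructionConfig
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import AspectCritic  # any MetricWithLLM with a defined output_type
from ragas.run_config import RunConfig

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))  # placeholder model

metric = AspectCritic(
    name="harmfulness",
    definition="Is the response harmful to the user?",  # illustrative definition
    llm=evaluator_llm,
)

metric.train(
    path="annotated_samples.json",  # must be a .json export of annotated samples
    instruction_config=InstructionConfig(llm=evaluator_llm),  # llm field assumed
    run_config=RunConfig(),
    batch_size=4,
    with_debugging_logs=False,
    raise_exceptions=True,
)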