diff --git a/.gitignore b/.gitignore index 11efa9cab..90a821e14 100644 --- a/.gitignore +++ b/.gitignore @@ -161,11 +161,7 @@ cython_debug/ # Ragas specific ragas/_version.py -experiments/**/data -experiments/**/storage +experiments/ **/fil-result/ -experiments/baselines/fiqa/datasets src/ragas/_version.py .python-version -experiments/retriever-benchmarks/datasets -experiments/tmp diff --git a/docs/conf.py b/docs/conf.py index dab9b26ca..b4715c4ca 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,4 +1,3 @@ -import os from dataclasses import asdict from sphinxawesome_theme import ThemeOptions diff --git a/docs/howtos/customisations/embeddings.ipynb b/docs/howtos/customisations/embeddings.ipynb index 7cc4f9abe..8c31c816f 100644 --- a/docs/howtos/customisations/embeddings.ipynb +++ b/docs/howtos/customisations/embeddings.ipynb @@ -169,7 +169,7 @@ "\n", "result = evaluate(\n", " fiqa_eval[\"baseline\"].select(range(5)), # showing only 5 for demonstration\n", - " metrics=[answer_similarity]\n", + " metrics=[answer_similarity],\n", ")\n", "\n", "result" diff --git a/docs/howtos/customisations/gcp-vertexai.ipynb b/docs/howtos/customisations/gcp-vertexai.ipynb index b3b5e24c2..9623b84d3 100644 --- a/docs/howtos/customisations/gcp-vertexai.ipynb +++ b/docs/howtos/customisations/gcp-vertexai.ipynb @@ -98,7 +98,7 @@ "source": [ "from ragas.metrics import (\n", " context_precision,\n", - " answer_relevancy, # AnswerRelevancy\n", + " answer_relevancy, # AnswerRelevancy\n", " faithfulness,\n", " context_recall,\n", ")\n", @@ -110,7 +110,7 @@ " answer_relevancy,\n", " context_recall,\n", " context_precision,\n", - " harmfulness\n", + " harmfulness,\n", "]" ] }, @@ -137,7 +137,6 @@ "from langchain.embeddings import VertexAIEmbeddings\n", "\n", "\n", - "\n", "config = {\n", " \"project_id\": \"tmp-project-404003\",\n", "}\n", @@ -170,7 +169,7 @@ "for m in metrics:\n", " # change LLM for metric\n", " m.__setattr__(\"llm\", ragas_vertexai_llm)\n", - " \n", + "\n", " # check if this metric needs embeddings\n", " if hasattr(m, \"embeddings\"):\n", " # if so change with VertexAI Embeddings\n", @@ -276,13 +275,15 @@ ], "source": [ "from ragas import evaluate\n", - "import nest_asyncio # CHECK NOTES\n", + "import nest_asyncio # CHECK NOTES\n", "\n", - "# NOTES: Only used when running on a jupyter notebook, otherwise comment or remove this function. 
\n", - "nest_asyncio.apply() \n", + "# NOTES: Only used when running on a jupyter notebook, otherwise comment or remove this function.\n", + "nest_asyncio.apply()\n", "\n", "result = evaluate(\n", - " fiqa_eval[\"baseline\"].select(range(1)), # using 1 as example due to quota constrains\n", + " fiqa_eval[\"baseline\"].select(\n", + " range(1)\n", + " ), # using 1 as example due to quota constrains\n", " metrics=metrics,\n", ")\n", "\n", diff --git a/docs/howtos/integrations/zeno.ipynb b/docs/howtos/integrations/zeno.ipynb index 8e00f56b3..1f1891638 100644 --- a/docs/howtos/integrations/zeno.ipynb +++ b/docs/howtos/integrations/zeno.ipynb @@ -186,7 +186,7 @@ " ]\n", "].copy()\n", "\n", - "output_df['output'] = df.apply(\n", + "output_df[\"output\"] = df.apply(\n", " lambda x: {\"answer\": x[\"answer\"], \"ground_truths\": list(x[\"ground_truths\"])}, axis=1\n", ")\n", "output_df[\"id\"] = output_df.index\n", diff --git a/pyproject.toml b/pyproject.toml index 3dd632d6b..30d4ff605 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,9 @@ package-dir = {"" = "src"} [tool.setuptools.dynamic] readme = {file = ["README.md"], content-type = "text/plain"} +[tool.ruff.lint] +ignore = ["E501"] + [build-system] requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"] build-backend = "setuptools.build_meta" diff --git a/src/ragas/callbacks.py b/src/ragas/callbacks.py new file mode 100644 index 000000000..87e286a51 --- /dev/null +++ b/src/ragas/callbacks.py @@ -0,0 +1,58 @@ +import typing as t + +from langchain_core.callbacks import ( + AsyncCallbackManager, + AsyncCallbackManagerForChainGroup, + AsyncCallbackManagerForChainRun, + CallbackManager, + CallbackManagerForChainGroup, + CallbackManagerForChainRun, + Callbacks, +) + + +def new_group( + name: str, inputs: t.Dict, callbacks: Callbacks, is_async=False +) -> t.Tuple[CallbackManagerForChainRun, CallbackManagerForChainGroup]: + # start evaluation chain + if isinstance(callbacks, list): + cm = CallbackManager.configure(inheritable_callbacks=callbacks) + else: + cm = t.cast(CallbackManager, callbacks) + rm = cm.on_chain_start({"name": name}, inputs) + child_cm = rm.get_child() + group_cm = CallbackManagerForChainGroup( + child_cm.handlers, + child_cm.inheritable_handlers, + child_cm.parent_run_id, + parent_run_manager=rm, + tags=child_cm.tags, + inheritable_tags=child_cm.inheritable_tags, + metadata=child_cm.metadata, + inheritable_metadata=child_cm.inheritable_metadata, + ) + + return rm, group_cm + + +async def new_async_group( + name: str, inputs: t.Dict, callbacks: Callbacks +) -> t.Tuple[AsyncCallbackManagerForChainRun, AsyncCallbackManagerForChainGroup]: + # start evaluation chain + if isinstance(callbacks, list): + cm = AsyncCallbackManager.configure(inheritable_callbacks=callbacks) + else: + cm = t.cast(AsyncCallbackManager, callbacks) + rm = await cm.on_chain_start({"name": name}, inputs) + child_cm = rm.get_child() + group_cm = AsyncCallbackManagerForChainGroup( + child_cm.handlers, + child_cm.inheritable_handlers, + child_cm.parent_run_id, + parent_run_manager=rm, + tags=child_cm.tags, + inheritable_tags=child_cm.inheritable_tags, + metadata=child_cm.metadata, + inheritable_metadata=child_cm.inheritable_metadata, + ) + return rm, group_cm diff --git a/src/ragas/embeddings/__init__.py b/src/ragas/embeddings/__init__.py index 9e0cfda92..1de1ff06d 100644 --- a/src/ragas/embeddings/__init__.py +++ b/src/ragas/embeddings/__init__.py @@ -1,15 +1,15 @@ from ragas.embeddings.base import ( AzureOpenAIEmbeddings, + BaseRagasEmbeddings, 
FastEmbedEmbeddings, HuggingfaceEmbeddings, OpenAIEmbeddings, - RagasEmbeddings, ) __all__ = [ "HuggingfaceEmbeddings", "OpenAIEmbeddings", "AzureOpenAIEmbeddings", - "RagasEmbeddings", + "BaseRagasEmbeddings", "FastEmbedEmbeddings", ] diff --git a/src/ragas/embeddings/base.py b/src/ragas/embeddings/base.py index 355697161..8dc27e5e5 100644 --- a/src/ragas/embeddings/base.py +++ b/src/ragas/embeddings/base.py @@ -18,15 +18,11 @@ DEFAULT_MODEL_NAME = "BAAI/bge-small-en-v1.5" -class RagasEmbeddings(Embeddings): - def validate_api_key(self): - """ - Validates that the api key is set for the Embeddings - """ - pass +class BaseRagasEmbeddings(Embeddings): + ... -class OpenAIEmbeddings(BaseOpenAIEmbeddings, RagasEmbeddings): +class OpenAIEmbeddings(BaseOpenAIEmbeddings, BaseRagasEmbeddings): api_key: str = NO_KEY def __init__(self, api_key: str = NO_KEY): @@ -48,7 +44,7 @@ def validate_api_key(self): raise OpenAIKeyNotFound -class FastEmbedEmbeddings(BaseFastEmbedEmbeddings, RagasEmbeddings): +class FastEmbedEmbeddings(BaseFastEmbedEmbeddings, BaseRagasEmbeddings): """ Find the list of supported models at: https://qdrant.github.io/fastembed/examples/Supported_Models/ @@ -66,7 +62,7 @@ def validate_api_key(self): pass -class AzureOpenAIEmbeddings(BaseAzureOpenAIEmbeddings, RagasEmbeddings): +class AzureOpenAIEmbeddings(BaseAzureOpenAIEmbeddings, BaseRagasEmbeddings): azure_endpoint: t.Optional[str] = None deployment: t.Optional[str] = None api_version: t.Optional[str] = None @@ -104,7 +100,7 @@ def validate_api_key(self): @dataclass -class HuggingfaceEmbeddings(RagasEmbeddings): +class HuggingfaceEmbeddings(BaseRagasEmbeddings): model_name: str = DEFAULT_MODEL_NAME """Model name to use.""" cache_folder: t.Optional[str] = None @@ -178,6 +174,6 @@ def predict(self, texts: List[List[str]]) -> List[List[float]]: return predictions.tolist() -def embedding_factory() -> RagasEmbeddings: +def embedding_factory() -> BaseRagasEmbeddings: openai_embeddings = OpenAIEmbeddings() return openai_embeddings diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index c19ca4630..5ff48be70 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -5,21 +5,36 @@ import numpy as np from datasets import Dataset, concatenate_datasets +from langchain_core.language_models import BaseLanguageModel from ragas._analytics import EvaluationEvent, track -from ragas.metrics.base import Metric -from ragas.metrics.critique import AspectCritique +from ragas.callbacks import new_group +from ragas.embeddings.base import BaseRagasEmbeddings +from ragas.executor import Executor +from ragas.llms.base import BaseRagasLLM, LangchainLLMWrapper +from ragas.metrics.base import Metric, MetricWithLLM + +# from ragas.metrics.critique import AspectCritique from ragas.validation import ( remap_column_names, validate_column_dtypes, validate_evaluation_modes, ) +if t.TYPE_CHECKING: + from langchain_core.callbacks import Callbacks + def evaluate( dataset: Dataset, metrics: list[Metric] | None = None, - column_map: dict[str, str] = {}, + llm: t.Optional[BaseRagasLLM] = None, + embeddings: t.Optional[BaseRagasEmbeddings] = None, + callbacks: Callbacks = [], + is_async: bool = False, + max_workers: t.Optional[int] = None, + raise_exceptions: bool = True, + column_map: t.Dict[str, str] = {}, ) -> Result: """ Run the evaluation on the dataset with different metrics @@ -81,6 +96,17 @@ def evaluate( ) metrics = [answer_relevancy, context_precision, faithfulness, context_recall] + # set the llm and embeddings + if llm is None: + from 
ragas.llms import llm_factory + + llm = llm_factory() + elif isinstance(llm, BaseLanguageModel): + llm = LangchainLLMWrapper(llm) + if embeddings is None: + from ragas.embeddings.base import embedding_factory + + embeddings = embedding_factory() # remap column names from the dataset dataset = remap_column_names(dataset, column_map) @@ -88,17 +114,69 @@ def evaluate( validate_evaluation_modes(dataset, metrics) validate_column_dtypes(dataset) - # run the evaluation on dataset with different metrics + binary_metrics = [] + for metric in metrics: + # if isinstance(metric, AspectCritique): + # binary_metrics.append(metric.name) + if isinstance(metric, MetricWithLLM): + if metric.llm is None: + metric.llm = llm + # initialize all the models in the metrics [m.init_model() for m in metrics] + executor = Executor( + is_async=is_async, max_workers=max_workers, raise_exceptions=raise_exceptions + ) + # new evaluation chain + row_run_managers = [] + evaluation_rm, evaluation_group_cm = new_group( + name="ragas evaluation", inputs={}, callbacks=callbacks, is_async=is_async + ) + for i, row in enumerate(dataset): + row = t.cast(t.Dict[str, t.Any], row) + row_rm, row_group_cm = new_group( + name=f"row {i}", + inputs=row, + callbacks=evaluation_group_cm, + is_async=is_async, + ) + row_run_managers.append((row_rm, row_group_cm)) + + if is_async: + [executor.submit(metric.ascore, row, row_group_cm) for metric in metrics] + else: + [executor.submit(metric.score, row, row_group_cm) for metric in metrics] + scores = [] - binary_metrics = [] - for metric in metrics: - if isinstance(metric, AspectCritique): - binary_metrics.append(metric.name) - print(f"evaluating with [{metric.name}]") - scores.append(metric.score(dataset).select_columns(metric.name)) + try: + # get the results + results = executor.results() + # convert results to dataset_like + for i, _ in enumerate(dataset): + s = {} + for j, m in enumerate(metrics): + s[m.name] = results[len(metrics) * i + j] + scores.append(s) + # close the row chain + row_rm, row_group_cm = row_run_managers[i] + if not row_group_cm.ended: + row_rm.on_chain_end(s) + + # run evaluation task + except Exception as e: + if not evaluation_group_cm.ended: + evaluation_rm.on_chain_error(e) + + raise e + finally: + result = Result( + scores=Dataset.from_list(scores), + dataset=dataset, + binary_columns=binary_metrics, + ) + if not evaluation_group_cm.ended: + evaluation_rm.on_chain_end(result) # log the evaluation event metrics_names = [m.name for m in metrics] @@ -110,23 +188,18 @@ def evaluate( num_rows=dataset.shape[0], ) ) - - return Result( - scores=concatenate_datasets(scores, axis=1), - dataset=dataset, - binary_columns=binary_metrics, - ) + return result @dataclass class Result(dict): scores: Dataset - dataset: Dataset | None = None - binary_columns: list[str] = field(default_factory=list) + dataset: t.Optional[Dataset] = None + binary_columns: t.List[str] = field(default_factory=list) def __post_init__(self): values = [] - for cn in self.scores.column_names: + for cn in self.scores[0].keys(): value = np.nanmean(self.scores[cn]) self[cn] = value if cn not in self.binary_columns: diff --git a/src/ragas/executor.py b/src/ragas/executor.py new file mode 100644 index 000000000..16fc105ff --- /dev/null +++ b/src/ragas/executor.py @@ -0,0 +1,121 @@ +import asyncio +import typing as t +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass, field + +import numpy as np +from tqdm.auto import tqdm + + +@dataclass +class Executor: + 
is_async: bool = True + max_workers: t.Optional[int] = None + futures: t.List[t.Any] = field(default_factory=list, repr=False) + raise_exceptions: bool = False + _is_new_eventloop: bool = False + + def __post_init__(self): + if self.is_async: + try: + self.executor = asyncio.get_running_loop() + except RuntimeError: + self.executor = asyncio.new_event_loop() + self._is_new_eventloop = True + else: + self.executor = ThreadPoolExecutor(max_workers=self.max_workers) + + def _validation_for_mode(self): + if self.is_async and self.max_workers is not None: + raise ValueError( + "Cannot evaluate with both async and threads. Either set is_async=False or max_workers=None." # noqa + ) + + def wrap_callable_with_index(self, callable: t.Callable, counter): + def wrapped_callable(*args, **kwargs): + return counter, callable(*args, **kwargs) + + async def wrapped_callable_async(*args, **kwargs): + return counter, await callable(*args, **kwargs) + + if self.is_async: + return wrapped_callable_async + else: + return wrapped_callable + + def submit(self, callable: t.Callable, *args, **kwargs): + if self.is_async: + self.executor = t.cast(asyncio.AbstractEventLoop, self.executor) + callable_with_index = self.wrap_callable_with_index( + callable, len(self.futures) + ) + # is type correct? + callable_with_index = t.cast(t.Callable, callable_with_index) + self.futures.append( + self.executor.create_task(callable_with_index(*args, **kwargs)) + ) + else: + self.executor = t.cast(ThreadPoolExecutor, self.executor) + callable_with_index = self.wrap_callable_with_index( + callable, len(self.futures) + ) + self.futures.append( + self.executor.submit(callable_with_index, *args, **kwargs) + ) + + async def _aresults(self) -> t.List[t.Any]: + results = [] + for future in tqdm( + asyncio.as_completed(self.futures), + desc="Evaluating", + total=len(self.futures), + ): + r = np.nan + try: + r = await future + except Exception as e: + if self.raise_exceptions: + raise e + results.append(r) + + return results + + def results(self) -> t.List[t.Any]: + results = [] + if self.is_async: + self.executor = t.cast(asyncio.AbstractEventLoop, self.executor) + try: + if self._is_new_eventloop: + results = self.executor.run_until_complete(self._aresults()) + + # event loop is running use nested_asyncio to hijack the event loop + else: + import nest_asyncio + + nest_asyncio.apply() + results = self.executor.run_until_complete(self._aresults()) + finally: + [f.cancel() for f in self.futures] + + else: + self.executor = t.cast(ThreadPoolExecutor, self.executor) + try: + for future in tqdm( + as_completed(self.futures), + desc="Evaluating", + total=len(self.futures), + ): + r = np.nan + try: + r = future.result() + except Exception as e: + r = np.nan + if self.raise_exceptions: + raise e + finally: + results.append(r) + finally: + self.executor.shutdown(wait=False) + + sorted_results = sorted(results, key=lambda x: x[0]) + return [r[1] for r in sorted_results] diff --git a/src/ragas/langchain/__init__.py b/src/ragas/langchain/__init__.py deleted file mode 100644 index 039bc0cc8..000000000 --- a/src/ragas/langchain/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from ragas.langchain.evalchain import RagasEvaluatorChain - -__all__ = ["RagasEvaluatorChain"] diff --git a/src/ragas/langchain/evalchain.py b/src/ragas/langchain/evalchain.py deleted file mode 100644 index 43d4ad3c9..000000000 --- a/src/ragas/langchain/evalchain.py +++ /dev/null @@ -1,197 +0,0 @@ -from __future__ import annotations - -import typing as t -from collections import 
defaultdict - -from datasets import Dataset -from langchain.callbacks.manager import CallbackManagerForChainRun -from langchain.chains.base import Chain -from langchain.schema import RUN_KEY -from langsmith.evaluation import EvaluationResult, RunEvaluator -from langsmith.schemas import Example, Run - -from ragas.metrics.base import EvaluationMode, Metric -from ragas.validation import EVALMODE_TO_COLUMNS - -if t.TYPE_CHECKING: - from langchain.callbacks.base import Callbacks - - -class RagasEvaluatorChain(Chain, RunEvaluator): - """ - Wrapper around ragas Metrics to use them with langsmith. - """ - - metric: Metric - - def __init__(self, **kwargs: t.Any): - super().__init__(**kwargs) - self.metric.init_model() - - @property - def input_keys(self) -> list[str]: - keys = ["query", "result"] - if self.metric.evaluation_mode in [EvaluationMode.qac, EvaluationMode.qc]: - keys += ["source_documents"] - if self.metric.evaluation_mode in [EvaluationMode.gc]: - keys += ["ground_truths"] - return keys - - @property - def output_keys(self) -> list[str]: - return [f"{self.metric.name}_score"] - - def _call( - self, - inputs: dict[str, t.Any], - run_manager: t.Optional[CallbackManagerForChainRun] = None, - ) -> dict[str, t.Any]: - """ - Call the evaluation chain. - """ - self._validate(inputs) - contexts = [] - - _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager() - callbacks = _run_manager.get_child() - - if "source_documents" in inputs: - for document in inputs["source_documents"]: - if isinstance(document, dict): - contexts.append(document["page_content"]) - else: - contexts.append(document.page_content) - ground_truths = [] - if "ground_truths" in inputs: - ground_truths = inputs["ground_truths"] - - question = inputs["query"] - answer = inputs["result"] - score = self.metric.score_single( - { - "question": question, - "answer": answer, - "contexts": contexts, - "ground_truths": ground_truths, - }, - callbacks=callbacks, - ) - return {f"{self.metric.name}_score": score} - - def _validate( - self, - input: dict[str, t.Any], - question_key: str = "query", - prediction_key: str = "result", - context_key: str = "source_documents", - ) -> None: - ... - # validate each example - required_columns = EVALMODE_TO_COLUMNS[self.metric.evaluation_mode] - if "question" in required_columns and question_key not in input: - raise ValueError( - f'"{question_key}" is required in each example' - f"for the metric[{self.metric.name}] you have chosen." - ) - if "answer" in required_columns and prediction_key not in input: - raise ValueError( - f'"{prediction_key}" is required in each prediction' - f"for the metric[{self.metric.name}] you have chosen." - ) - if "contexts" in required_columns and context_key not in input: - raise ValueError( - f'"{context_key}" is required in each prediction for the ' - f"metric[{self.metric.name}] you have chosen." - ) - if "ground_truths" in required_columns and "ground_truths" not in input: - raise ValueError( - f'"ground_truths" is required in each prediction for the ' - f"metric[{self.metric.name}] you have chosen." 
- ) - - def evaluate( - self, - examples: t.Sequence[dict], - predictions: t.Sequence[dict], - question_key: str = "query", - prediction_key: str = "result", - context_key: str = "source_documents", - ground_truths_key: str = "ground_truths", - *, - callbacks: Callbacks = None, - ) -> list[dict]: - """Evaluate question answering examples and predictions.""" - dataset_dict = defaultdict(list) - - # validation - if len(examples) != len(predictions): - raise ValueError( - "number of examples and predictions must be same. Got " - f"len(examples)={len(examples)} and len(predictions)={len(predictions)}" - ) - - for i, example in enumerate(examples): - self._validate( - {**example, **predictions[i]}, question_key, prediction_key, context_key - ) - # transform into Dataset that is supported by ragas - if self.metric.evaluation_mode in [ - EvaluationMode.qac, - EvaluationMode.qc, - EvaluationMode.qa, - ]: - dataset_dict["question"].append(example[question_key]) - - if self.metric.evaluation_mode in [EvaluationMode.qac, EvaluationMode.qa]: - dataset_dict["answer"].append(predictions[i][prediction_key]) - - if self.metric.evaluation_mode in [ - EvaluationMode.qac, - EvaluationMode.qc, - EvaluationMode.gc, - ]: - dataset_dict["contexts"].append( - [d.page_content for d in predictions[i][context_key]] - ) - - if self.metric.evaluation_mode == EvaluationMode.gc: - if isinstance(example["ground_truths"], list): - dataset_dict["ground_truths"].append(example["ground_truths"]) - else: - dataset_dict["ground_truths"].append([example["ground_truths"]]) - - dataset = Dataset.from_dict(dataset_dict) - - # evaluate - dataset_with_scores = self.metric.score(dataset, callbacks=callbacks) - scores = [ - {f"{self.metric.name}_score": score} - for score in dataset_with_scores[self.metric.name] - ] - return scores - - def evaluate_run( - self, run: Run, example: t.Optional[Example] = None - ) -> EvaluationResult: - """ - Evaluate a langsmith run - """ - if run.outputs is None: - raise ValueError("The chain should return results and service_document.") - if example is None: - raise ValueError("Examples have to be provided.") - chain_eval = run.outputs - chain_eval["query"] = run.inputs["query"] - if self.metric.evaluation_mode == EvaluationMode.gc: - if example.outputs is None or "ground_truths" not in example.outputs: - raise ValueError("expected `ground_truths` in example outputs.") - chain_eval["ground_truths"] = example.outputs["ground_truths"] - eval_output = self(chain_eval, include_run_info=True) - - score_name = f"{self.metric.name}_score" - evaluation_result = EvaluationResult( - key=f"{self.metric.name}_score", score=eval_output[score_name] - ) - if RUN_KEY in eval_output: - evaluation_result.evaluator_info[RUN_KEY] = eval_output[RUN_KEY] - return evaluation_result diff --git a/src/ragas/langsmith/evaluator.py b/src/ragas/langsmith/evaluator.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/ragas/llama_index/__init__.py b/src/ragas/llama_index/__init__.py deleted file mode 100644 index c6f647b3d..000000000 --- a/src/ragas/llama_index/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from ragas.llama_index.evaluation import evaluate - -__all__ = ["evaluate"] diff --git a/src/ragas/llama_index/evaluation.py b/src/ragas/llama_index/evaluation.py deleted file mode 100644 index b30738d29..000000000 --- a/src/ragas/llama_index/evaluation.py +++ /dev/null @@ -1,100 +0,0 @@ -from __future__ import annotations - -import typing as t - -from datasets import Dataset - -from ragas import evaluate as 
ragas_evaluate -from ragas.evaluation import Result -from ragas.metrics.base import Metric - -if t.TYPE_CHECKING: - from llama_index.indices.query.base import BaseQueryEngine - - -def evaluate( - query_engine: BaseQueryEngine, - metrics: list[Metric], - questions: list[str], - ground_truths: t.Optional[list[str]] = None, -) -> Result: - """ - Run evaluation of llama_index QueryEngine with different metrics - - Parameters - ---------- - query_engine : BaseQueryEngine - The QueryEngine that is to be evaluated - metrics : list[Metric] - The ragas metrics to use for evaluation. - questions : list[str] - List of questions to evaluate on - ground_truths : list[str], optional - List of ground_truths answer to the question to evaluate on. - - Returns - ------- - Result - Result object containing the scores of each metric. You can use this do analysis - later. - - Raises - ------ - ValueError - if validation fails because the columns required for the metrics are missing or - if the columns are of the wrong format. - - Examples - -------- - Once you have a llama_index QueryEngine created you can use it to evaluate on a list - of questions. - - Import everything you need: - - >>> from ragas.metrics import faithfulness, answer_relevancy, context_precision - >>> from ragas.metrics.critique import harmfulness - >>> from ragas.llama_index import evaluate - - init the query engine, get the questions and choose the metrics you want to use: - - >>> query_engine = # from llamaindex - >>> questions: list[str] = [] # from somewhere - >>> metrics = [faithfulness, answer_relevancy, context_precision, harmfulness] - - Run the evaluation: - - >>> r = evaluate(query_engine, metrics, questions) - - analysis the result: - - >>> print(r) # prints the scores of each metric - >>> r.to_pandas() # returns a pandas dataframe if you want to do further analysis - """ - - try: - from llama_index.async_utils import run_async_tasks - except ImportError: - raise ImportError( - "llama_index must be installed to use this function. " - "Please, install it with `pip install llama_index`." 
- ) - - # TODO: rate limit, error handling, retries - responses = run_async_tasks([query_engine.aquery(q) for q in questions]) - - answers = [] - contexts = [] - for r in responses: - answers.append(r.response) - contexts.append([c.node.get_content() for c in r.source_nodes]) - dataset_dict = { - "question": questions, - "answer": answers, - "contexts": contexts, - } - if ground_truths is not None: - dataset_dict["ground_truths"] = ground_truths - ds = Dataset.from_dict(dataset_dict) - result = ragas_evaluate(ds, metrics) - - return result diff --git a/src/ragas/llms/__init__.py b/src/ragas/llms/__init__.py index 6f48ae530..9d6285b6e 100644 --- a/src/ragas/llms/__init__.py +++ b/src/ragas/llms/__init__.py @@ -1,10 +1,12 @@ -from ragas.llms.base import RagasLLM -from ragas.llms.langchain import LangchainLLM -from ragas.llms.llamaindex import LlamaIndexLLM -from ragas.llms.openai import OpenAI +from langchain.chat_models import ChatOpenAI -__all__ = ["RagasLLM", "LangchainLLM", "LlamaIndexLLM", "llm_factory", "OpenAI"] +from ragas.llms.base import BaseRagasLLM, LangchainLLMWrapper +__all__ = [ + "BaseRagasLLM", + "llm_factory", +] -def llm_factory(model="gpt-3.5-turbo-16k") -> RagasLLM: - return OpenAI(model=model) + +def llm_factory(model="gpt-3.5-turbo-16k") -> BaseRagasLLM: + return LangchainLLMWrapper(ChatOpenAI(model=model)) diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py index eec5569de..3d2e117e2 100644 --- a/src/ragas/llms/base.py +++ b/src/ragas/llms/base.py @@ -2,54 +2,139 @@ import typing as t from abc import ABC, abstractmethod +from dataclasses import dataclass -from langchain.schema import LLMResult +from langchain.chat_models import AzureChatOpenAI, ChatOpenAI, ChatVertexAI +from langchain.llms import AzureOpenAI, OpenAI, VertexAI +from langchain_core.language_models import BaseLanguageModel +from langchain_core.outputs import LLMResult if t.TYPE_CHECKING: - from langchain.callbacks.base import Callbacks - from langchain.prompts import ChatPromptTemplate + from langchain_core.callbacks import Callbacks + from langchain_core.prompts import ChatPromptTemplate + from ragas.llms.prompt import PromptValue -class RagasLLM(ABC): - """ - BaseLLM is the base class for all LLMs. It provides a consistent interface for other - classes that interact with LLMs like Langchains, LlamaIndex, LiteLLM etc. Handles - multiple_completions even if not supported by the LLM. +MULTIPLE_COMPLETION_SUPPORTED = [ + OpenAI, + ChatOpenAI, + AzureOpenAI, + AzureChatOpenAI, + ChatVertexAI, + VertexAI, +] - It currently takes in ChatPromptTemplates and returns LLMResults which are Langchain - primitives. - """ - # supports multiple compeletions for the given prompt - n_completions_supported: bool = False +def is_multiple_completion_supported(llm: BaseLanguageModel) -> bool: + """Return whether the given LLM supports n-completion.""" + for llm_type in MULTIPLE_COMPLETION_SUPPORTED: + if isinstance(llm, llm_type): + return True + return False - @property + +@dataclass +class BaseRagasLLM(ABC): @abstractmethod - def llm(self) -> t.Any: + def generate_text( + self, + prompt: PromptValue, + n: int = 1, + temperature: float = 1e-8, + stop: t.Optional[t.List[str]] = None, + callbacks: Callbacks = [], + ) -> LLMResult: ... 
- def validate_api_key(self): - """ - Validates that the api key is set for the LLM - """ - pass - @abstractmethod - def generate( + async def agenerate_text( self, - prompts: list[ChatPromptTemplate], + prompt: PromptValue, n: int = 1, temperature: float = 1e-8, - callbacks: t.Optional[Callbacks] = None, + stop: t.Optional[t.List[str]] = None, + callbacks: Callbacks = [], ) -> LLMResult: ... - @abstractmethod - async def agenerate( + # TODO: remove after testset generator is refactored + def generate_text_with_hmpt( self, - prompt: ChatPromptTemplate, + prompts: t.List[ChatPromptTemplate], n: int = 1, temperature: float = 1e-8, + stop: t.Optional[t.List[str]] = None, + callbacks: Callbacks = [], + ) -> LLMResult: + prompt = PromptValue(prompt_str=prompts[0].format()) + return self.generate_text(prompt, n, temperature, stop, callbacks) + + +@dataclass +class LangchainLLMWrapper(BaseRagasLLM): + """ + A simple base class for RagasLLMs that is based on Langchain's BaseLanguageModel + interface. it implements 2 functions: + - generate_text: for generating text from a given PromptValue + - agenerate_text: for generating text from a given PromptValue asynchronously + """ + + langchain_llm: BaseLanguageModel + + def generate_text( + self, + prompt: PromptValue, + n: int = 1, + temperature: float = 1e-8, + stop: t.Optional[t.List[str]] = None, callbacks: t.Optional[Callbacks] = None, ) -> LLMResult: - ... + if is_multiple_completion_supported(self.langchain_llm): + return self.langchain_llm.generate_prompt( + prompts=[prompt], + n=n, + temperature=temperature, + stop=stop, + callbacks=callbacks, + ) + else: + result = self.langchain_llm.generate_prompt( + prompts=[prompt] * n, + temperature=temperature, + stop=stop, + callbacks=callbacks, + ) + # make LLMResult.generation appear as if it was n_completions + # note that LLMResult.runs is still a list that represents each run + generations = [[g[0] for g in result.generations]] + result.generations = generations + return result + + async def agenerate_text( + self, + prompt: PromptValue, + n: int = 1, + temperature: float = 1e-8, + stop: t.Optional[t.List[str]] = None, + callbacks: t.Optional[Callbacks] = None, + ) -> LLMResult: + if is_multiple_completion_supported(self.langchain_llm): + return await self.langchain_llm.agenerate_prompt( + prompts=[prompt], + n=n, + temperature=temperature, + stop=stop, + callbacks=callbacks, + ) + else: + result = await self.langchain_llm.agenerate_prompt( + prompts=[prompt] * n, + temperature=temperature, + stop=stop, + callbacks=callbacks, + ) + # make LLMResult.generation appear as if it was n_completions + # note that LLMResult.runs is still a list that represents each run + generations = [[g[0] for g in result.generations]] + result.generations = generations + return result diff --git a/src/ragas/llms/json_load.py b/src/ragas/llms/json_load.py new file mode 100644 index 000000000..bebfdc80f --- /dev/null +++ b/src/ragas/llms/json_load.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +import json +import logging +import typing as t +from dataclasses import dataclass + +logger = logging.getLogger(__name__) + +if t.TYPE_CHECKING: + from langchain_core.callbacks import Callbacks + + from ragas.llms.base import BaseRagasLLM + + +def load_as_json(text) -> t.Dict: + """ + validate and return given text as json + """ + + try: + return json.loads(text) + except ValueError as e: + logger.warn(f"Invalid json: {e}") + return {} + + +# not migrating to Prompt format to avoid circular imports +JSON_PROMPT = """\ 
+Rewrite the input into valid json + +Input: +{{ + "name": "John Doe", + "age": 30, + "isStudent": false + "address": {{ + "street": "123 Main St", + "city": "Anytown", + "state": "CA", + }} + "hobbies": ["reading", "swimming", "cycling"] +}} +Output: +{{ + "name": "John Doe", + "age": 30, + "isStudent": false, + "address": {{ + "street": "123 Main St", + "city": "Anytown", + "state": "CA" + }}, + "hobbies": ["reading", "swimming", "cycling"] +}} + + +Input: +{{ + "statement": "The Earth is also known as "Terra" " +}} +Output: +{{ + "statement": "The Earth is also known as 'Terra'" +}} + +Input: +{input} + +Output: +""" + + +@dataclass +class JsonLoader: + max_retries: int = 2 + + def safe_load(self, text: str, llm: BaseRagasLLM, callbacks: Callbacks = None): + retry = 0 + while retry <= self.max_retries: + try: + start, end = self._find_outermost_json(text) + return json.loads(text[start:end]) + except ValueError: + text = self._fix_to_json(text, llm, callbacks) + retry += 1 + + return {} + + def _fix_to_json(self, text: str, llm: BaseRagasLLM, callbacks: Callbacks): + from ragas.llms.prompt import PromptValue + + # TODO (executor) + results = llm.generate_text( + PromptValue(prompt_str=JSON_PROMPT.format(input=text)), + n=1, + callbacks=callbacks, + ) + return results.generations[0][0].text + + def _find_outermost_json(self, text): + stack = [] + start_index = -1 + + for i, char in enumerate(text): + if char in "{[": + if len(stack) == 0: + start_index = i + stack.append(char) + + elif char in "}]": + if len(stack) > 0: + last = stack.pop() + if (char == "}" and last != "{") or (char == "]" and last != "["): + # Mismatched closing brace/bracket, invalid JSON + break + + if len(stack) == 0 and start_index != -1: + # Found a valid outermost JSON + return ( + start_index, + i + 1, + ) # Add 1 to include the closing brace/bracket in the range + + return -1, -1 # No valid JSON found + + +json_loader = JsonLoader() diff --git a/src/ragas/llms/langchain.py b/src/ragas/llms/langchain.py deleted file mode 100644 index f8ef9efee..000000000 --- a/src/ragas/llms/langchain.py +++ /dev/null @@ -1,224 +0,0 @@ -from __future__ import annotations - -import typing as t - -from langchain.chat_models import AzureChatOpenAI, BedrockChat, ChatOpenAI, ChatVertexAI -from langchain.chat_models.base import BaseChatModel -from langchain.llms import AmazonAPIGateway, AzureOpenAI, Bedrock, OpenAI, VertexAI -from langchain.llms.base import BaseLLM -from langchain.schema import LLMResult - -from ragas.async_utils import run_async_tasks -from ragas.exceptions import AzureOpenAIKeyNotFound, OpenAIKeyNotFound -from ragas.llms.base import RagasLLM -from ragas.utils import NO_KEY - -if t.TYPE_CHECKING: - from langchain.callbacks.base import Callbacks - from langchain.prompts import ChatPromptTemplate - - -def isOpenAI(llm: BaseLLM | BaseChatModel) -> bool: - return isinstance(llm, OpenAI) or isinstance(llm, ChatOpenAI) - - -def isBedrock(llm: BaseLLM | BaseChatModel) -> bool: - return isinstance(llm, Bedrock) or isinstance(llm, BedrockChat) - - -def isAmazonAPIGateway(llm: BaseLLM | BaseChatModel) -> bool: - return isinstance(llm, AmazonAPIGateway) - - -# have to specify it twice for runtime and static checks -MULTIPLE_COMPLETION_SUPPORTED = [ - OpenAI, - ChatOpenAI, - AzureOpenAI, - AzureChatOpenAI, - ChatVertexAI, - VertexAI, -] -MultipleCompletionSupportedLLM = t.Union[ - OpenAI, ChatOpenAI, AzureOpenAI, AzureChatOpenAI, ChatVertexAI, VertexAI -] - - -def _compute_token_usage_langchain(list_llmresults: 
t.List[LLMResult]) -> t.Dict: - # compute total token usage by adding individual token usage - llm_output = list_llmresults[0].llm_output - if llm_output is None: - return {} - if (llm_output is not None) and ("token_usage" in llm_output): - sum_prompt_tokens = 0 - sum_completion_tokens = 0 - sum_total_tokens = 0 - for result in list_llmresults: - if result.llm_output is None: - continue - token_usage = result.llm_output["token_usage"] - sum_prompt_tokens += token_usage["prompt_tokens"] - sum_completion_tokens += token_usage["completion_tokens"] - sum_total_tokens += token_usage["total_tokens"] - - llm_output["token_usage"] = { - "prompt_tokens": sum_prompt_tokens, - "completion_tokens": sum_completion_tokens, - "sum_total_tokens": sum_total_tokens, - } - - return llm_output - - -class LangchainLLM(RagasLLM): - n_completions_supported: bool = True - - def __init__(self, llm: BaseLLM | BaseChatModel): - self.langchain_llm = llm - - @property - def llm(self) -> BaseLLM | BaseChatModel: - return self.langchain_llm - - def validate_api_key(self): - # if langchain OpenAI or ChatOpenAI - if isinstance(self.llm, ChatOpenAI) or isinstance(self.llm, OpenAI): - # make sure the type is LangchainLLM with ChatOpenAI - self.langchain_llm = t.cast(ChatOpenAI, self.langchain_llm) - # raise error if no api key - if self.langchain_llm.openai_api_key == NO_KEY: - raise OpenAIKeyNotFound - - # if langchain AzureOpenAI or ChatAzurerOpenAI - elif isinstance(self.llm, AzureChatOpenAI) or isinstance(self.llm, AzureOpenAI): - self.langchain_llm = t.cast(AzureChatOpenAI, self.langchain_llm) - # raise error if no api key - if self.langchain_llm.openai_api_key == NO_KEY: - raise AzureOpenAIKeyNotFound - - @staticmethod - def llm_supports_completions(llm): - for llm_type in MULTIPLE_COMPLETION_SUPPORTED: - if isinstance(llm, llm_type): - return True - - def _generate_multiple_completions( - self, - prompts: list[ChatPromptTemplate], - n: int = 1, - callbacks: t.Optional[Callbacks] = None, - ) -> LLMResult: - self.langchain_llm = t.cast(MultipleCompletionSupportedLLM, self.langchain_llm) - old_n = self.langchain_llm.n - self.langchain_llm.n = n - - if isinstance(self.llm, BaseLLM): - ps = [p.format() for p in prompts] - result = self.llm.generate(ps, callbacks=callbacks) - else: # if BaseChatModel - ps = [p.format_messages() for p in prompts] - result = self.llm.generate(ps, callbacks=callbacks) - self.llm.n = old_n - - return result - - async def generate_completions( - self, - prompts: list[ChatPromptTemplate], - callbacks: t.Optional[Callbacks] = None, - ) -> LLMResult: - if isinstance(self.llm, BaseLLM): - ps = [p.format() for p in prompts] - result = await self.llm.agenerate(ps, callbacks=callbacks) - else: # if BaseChatModel - ps = [p.format_messages() for p in prompts] - result = await self.llm.agenerate(ps, callbacks=callbacks) - - return result - - async def agenerate( - self, - prompt: ChatPromptTemplate, - n: int = 1, - temperature: float = 1e-8, - callbacks: t.Optional[Callbacks] = None, - ) -> LLMResult: - temperature = 0.2 if n > 1 else 0 - if isBedrock(self.llm) and ("model_kwargs" in self.llm.__dict__): - self.llm.model_kwargs = {"temperature": temperature} - else: - self.llm.temperature = temperature - - if self.llm_supports_completions(self.llm): - self.langchain_llm = t.cast( - MultipleCompletionSupportedLLM, self.langchain_llm - ) - old_n = self.langchain_llm.n - self.langchain_llm.n = n - if isinstance(self.llm, BaseLLM): - result = await self.llm.agenerate( - [prompt.format()], callbacks=callbacks 
- ) - else: # if BaseChatModel - result = await self.llm.agenerate( - [prompt.format_messages()], callbacks=callbacks - ) - self.langchain_llm.n = old_n - else: - if isinstance(self.llm, BaseLLM): - list_llmresults: list[LLMResult] = run_async_tasks( - [ - self.llm.agenerate([prompt.format()], callbacks=callbacks) - for _ in range(n) - ] - ) - else: - list_llmresults: list[LLMResult] = run_async_tasks( - [ - self.llm.agenerate( - [prompt.format_messages()], callbacks=callbacks - ) - for _ in range(n) - ] - ) - - # fill results as if the LLM supported multiple completions - generations = [r.generations[0][0] for r in list_llmresults] - llm_output = _compute_token_usage_langchain(list_llmresults) - result = LLMResult(generations=[generations], llm_output=llm_output) - - return result - - def generate( - self, - prompts: list[ChatPromptTemplate], - n: int = 1, - temperature: float = 1e-8, - callbacks: t.Optional[Callbacks] = None, - ) -> LLMResult: - # set temperature to 0.2 for multiple completions - temperature = 0.2 if n > 1 else 1e-8 - if isBedrock(self.llm) and ("model_kwargs" in self.llm.__dict__): - self.llm.model_kwargs = {"temperature": temperature} - elif isAmazonAPIGateway(self.llm) and ("model_kwargs" in self.llm.__dict__): - self.llm.model_kwargs = {"temperature": temperature} - else: - self.llm.temperature = temperature - - if self.llm_supports_completions(self.llm): - return self._generate_multiple_completions(prompts, n, callbacks) - else: # call generate_completions n times to mimic multiple completions - list_llmresults = run_async_tasks( - [self.generate_completions(prompts, callbacks) for _ in range(n)] - ) - - # fill results as if the LLM supported multiple completions - generations = [] - for i in range(len(prompts)): - completions = [] - for result in list_llmresults: - completions.append(result.generations[i][0]) - generations.append(completions) - - llm_output = _compute_token_usage_langchain(list_llmresults) - return LLMResult(generations=generations, llm_output=llm_output) diff --git a/src/ragas/llms/llamaindex.py b/src/ragas/llms/llamaindex.py deleted file mode 100644 index d93afacfd..000000000 --- a/src/ragas/llms/llamaindex.py +++ /dev/null @@ -1,54 +0,0 @@ -from __future__ import annotations - -import typing as t - -from langchain.schema.output import Generation, LLMResult - -from ragas.async_utils import run_async_tasks -from ragas.llms.base import RagasLLM - -if t.TYPE_CHECKING: - from langchain.callbacks.base import Callbacks - from langchain.prompts import ChatPromptTemplate - - try: - from llama_index.llms.base import LLM as LiLLM - except ImportError: - raise ImportError( - "llama_index must be installed to use this function. " - "Please, install it with `pip install llama_index`." 
- ) - - -class LlamaIndexLLM(RagasLLM): - def __init__(self, llm: LiLLM) -> None: - self.llama_index_llm = llm - - @property - def llm(self) -> LiLLM: - return self.llama_index_llm - - def generate( - self, - prompts: list[ChatPromptTemplate], - n: int = 1, - temperature: float = 0, - callbacks: t.Optional[Callbacks] = None, - ) -> LLMResult: - # set temperature to 0.2 for multiple completions - temperature = 0.2 if n > 1 else 0 - self.llm.temperature = temperature - - # get task coroutines - tasks = [] - for p in prompts: - tasks.extend([self.llm.acomplete(p.format()) for _ in range(n)]) - - # process results to LLMResult - # token usage is note included for now - results = run_async_tasks(tasks) - results2D = [results[i : i + n] for i in range(0, len(results), n)] - generations = [ - [Generation(text=r.text) for r in result] for result in results2D - ] - return LLMResult(generations=generations) diff --git a/src/ragas/llms/openai.py b/src/ragas/llms/openai.py deleted file mode 100644 index a4b1892e4..000000000 --- a/src/ragas/llms/openai.py +++ /dev/null @@ -1,232 +0,0 @@ -from __future__ import annotations - -import asyncio -import logging -import os -import typing as t -from abc import abstractmethod -from dataclasses import dataclass, field - -import openai -from langchain.adapters.openai import convert_message_to_dict -from langchain.callbacks.manager import ( - AsyncCallbackManagerForLLMRun, - CallbackManagerForLLMRun, -) -from langchain.schema import Generation, LLMResult -from openai import AsyncAzureOpenAI, AsyncClient, AsyncOpenAI -from tenacity import ( - RetryCallState, - before_sleep_log, - retry, - retry_base, - retry_if_exception_type, - stop_after_attempt, - wait_exponential, -) - -from ragas.async_utils import run_async_tasks -from ragas.exceptions import AzureOpenAIKeyNotFound, OpenAIKeyNotFound -from ragas.llms.base import RagasLLM -from ragas.llms.langchain import _compute_token_usage_langchain -from ragas.utils import NO_KEY, get_debug_mode - -if t.TYPE_CHECKING: - from langchain.callbacks.base import Callbacks - from langchain.prompts import ChatPromptTemplate - -logger = logging.getLogger(__name__) - -errors = [ - openai.APITimeoutError, - openai.APIConnectionError, - openai.RateLimitError, - openai.APIConnectionError, - openai.InternalServerError, -] - - -def create_base_retry_decorator( - error_types: t.List[t.Type[BaseException]], - max_retries: int = 1, - run_manager: t.Optional[ - t.Union[AsyncCallbackManagerForLLMRun, CallbackManagerForLLMRun] - ] = None, -) -> t.Callable[[t.Any], t.Any]: - """Create a retry decorator for a given LLM and provided list of error types.""" - - log_level = logging.WARNING if get_debug_mode() else logging.DEBUG - _logging = before_sleep_log(logger, log_level) - - def _before_sleep(retry_state: RetryCallState) -> None: - _logging(retry_state) - if run_manager: - if isinstance(run_manager, AsyncCallbackManagerForLLMRun): - coro = run_manager.on_retry(retry_state) - try: - loop = asyncio.get_event_loop() - if loop.is_running(): - loop.create_task(coro) - else: - asyncio.run(coro) - except Exception as e: - logger.error(f"Error in on_retry: {e}") - else: - run_manager.on_retry(retry_state) - return None - - min_seconds = 4 - max_seconds = 10 - # Wait 2^x * 1 second between each retry starting with - # 4 seconds, then up to 10 seconds, then 10 seconds afterwards - retry_instance: "retry_base" = retry_if_exception_type(error_types[0]) - for error in error_types[1:]: - retry_instance = retry_instance | retry_if_exception_type(error) - 
return retry( - reraise=True, - stop=stop_after_attempt(max_retries), - wait=wait_exponential(multiplier=1, min=min_seconds, max=max_seconds), - retry=retry_instance, - before_sleep=_before_sleep, - ) - - -retry_decorator = create_base_retry_decorator(errors, max_retries=4) - - -class OpenAIBase(RagasLLM): - def __init__(self, model: str, _api_key_env_var: str, timeout: int = 60) -> None: - self.model = model - self._api_key_env_var = _api_key_env_var - self.timeout = timeout - - # api key - key_from_env = os.getenv(self._api_key_env_var, NO_KEY) - if key_from_env != NO_KEY: - self.api_key = key_from_env - else: - self.api_key = self.api_key - self._client: AsyncClient - - @abstractmethod - def _client_init(self): - ... - - @property - def llm(self): - return self - - def create_llm_result(self, response) -> LLMResult: - """Create the LLMResult from the choices and prompts.""" - if not isinstance(response, dict): - response = response.model_dump() - - # token Usage - token_usage = response.get("usage", {}) - llm_output = { - "token_usage": token_usage, - "model_name": None, - "system_fingerprint": response.get("system_fingerprint", ""), - } - - choices = response["choices"] - generations = [ - Generation( - text=choice["message"]["content"], - generation_info=dict( - finish_reason=choice.get("finish_reason"), - logprobs=choice.get("logprobs"), - ), - ) - for choice in choices - ] - llm_output = {"token_usage": token_usage, "model_name": self.model} - return LLMResult(generations=[generations], llm_output=llm_output) - - def generate( - self, - prompts: list[ChatPromptTemplate], - n: int = 1, - temperature: float = 0, - callbacks: t.Optional[Callbacks] = None, - ) -> t.Any: # TODO: LLMResult - llm_results = run_async_tasks( - [self.agenerate(p, n, temperature, callbacks) for p in prompts] - ) - - generations = [r.generations[0] for r in llm_results] - llm_output = _compute_token_usage_langchain(llm_results) - return LLMResult(generations=generations, llm_output=llm_output) - - @retry_decorator - async def agenerate( - self, - prompt: ChatPromptTemplate, - n: int = 1, - temperature: float = 0, - callbacks: t.Optional[Callbacks] = None, - ) -> LLMResult: - # TODO: use callbacks for llm generate - completion = await self._client.chat.completions.create( - model=self.model, - messages=[convert_message_to_dict(m) for m in prompt.format_messages()], # type: ignore - temperature=temperature, - n=n, - ) - - return self.create_llm_result(completion) - - -@dataclass -class OpenAI(OpenAIBase): - model: str = "gpt-3.5-turbo-16k" - api_key: str = field(default=NO_KEY, repr=False) - _api_key_env_var: str = "OPENAI_API_KEY" - - def __post_init__(self): - super().__init__(model=self.model, _api_key_env_var=self._api_key_env_var) - self._client_init() - - def _client_init(self): - self._client = AsyncOpenAI(api_key=self.api_key, timeout=self.timeout) - - def validate_api_key(self): - # before validating, check if the api key is already set - api_key = os.getenv(self._api_key_env_var, NO_KEY) - if api_key != NO_KEY: - self._client.api_key = api_key - if self.llm.api_key == NO_KEY: - os_env_key = os.getenv(self._api_key_env_var, NO_KEY) - if os_env_key != NO_KEY: - self.api_key = os_env_key - else: - raise OpenAIKeyNotFound - - -@dataclass -class AzureOpenAI(OpenAIBase): - azure_endpoint: str - deployment: str - api_version: str - api_key: str = field(default=NO_KEY, repr=False) - _api_key_env_var: str = "AZURE_OPENAI_API_KEY" - - def __post_init__(self): - super().__init__(model=self.deployment, 
_api_key_env_var=self._api_key_env_var) - self._client_init() - - def _client_init(self): - self._client = AsyncAzureOpenAI( - api_version=self.api_version, - azure_endpoint=self.azure_endpoint, - api_key=self.api_key, - timeout=self.timeout, - ) - - def validate_api_key(self): - if self.llm.api_key == NO_KEY: - os_env_key = os.getenv(self._api_key_env_var, NO_KEY) - if os_env_key != NO_KEY: - self.api_key = os_env_key - else: - raise AzureOpenAIKeyNotFound diff --git a/src/ragas/llms/prompt.py b/src/ragas/llms/prompt.py index 70c6d312a..0d9aaeb9e 100644 --- a/src/ragas/llms/prompt.py +++ b/src/ragas/llms/prompt.py @@ -5,16 +5,29 @@ import os import typing as t -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate from langchain_core.messages import BaseMessage, HumanMessage -from langchain_core.prompt_values import PromptValue -from langchain_core.pydantic_v1 import root_validator +from langchain_core.prompt_values import PromptValue as BasePromptValue +from langchain_core.pydantic_v1 import BaseModel, root_validator -from ragas.llms import RagasLLM -from ragas.utils import RAGAS_CACHE_HOME, json_loader +from ragas.llms import BaseRagasLLM +from ragas.llms.json_load import json_loader +from ragas.utils import get_cache_dir +Example = t.Dict[str, t.Any] -class Prompt(PromptValue): + +class PromptValue(BasePromptValue): + prompt_str: str + + def to_messages(self) -> t.List[BaseMessage]: + """Return prompt as a list of Messages.""" + return [HumanMessage(content=self.to_string())] + + def to_string(self) -> str: + return self.prompt_str + + +class Prompt(BaseModel): """ Prompt is a class that represents a prompt for the ragas metrics. @@ -32,7 +45,7 @@ class Prompt(PromptValue): name: str instruction: str - examples: t.List[t.Dict[str, t.Any]] = [] + examples: t.List[Example] = [] input_keys: t.List[str] output_key: str output_type: str = "json" @@ -80,27 +93,28 @@ def to_string(self) -> str: """ prompt_str = self.instruction + "\n" - # Format the examples to match the Langchain prompt template - for example in self.examples: - for key, value in example.items(): - value = json.dumps(value, ensure_ascii=False).encode("utf8").decode() - value = ( - value.replace("{", "{{").replace("}", "}}") - if self.output_type.lower() == "json" - else value - ) - prompt_str += f"\n{key}: {value}" - prompt_str += "\n" + if self.examples: + # Format the examples to match the Langchain prompt template + for example in self.examples: + for key, value in example.items(): + value = ( + json.dumps(value, ensure_ascii=False).encode("utf8").decode() + ) + value = ( + value.replace("{", "{{").replace("}", "}}") + if self.output_type.lower() == "json" + else value + ) + prompt_str += f"\n{key}: {value}" + prompt_str += "\n" - prompt_str += "".join(f"\n{key}: {{{key}}}" for key in self.input_keys) - prompt_str += f"\n{self.output_key}: \n" + if self.input_keys: + prompt_str += "".join(f"\n{key}: {{{key}}}" for key in self.input_keys) + if self.output_key: + prompt_str += f"\n{self.output_key}: \n" return prompt_str - def to_messages(self) -> t.List[BaseMessage]: - """Return prompt as a list of Messages.""" - return [HumanMessage(content=self.to_string())] - def get_example_str(self, example_no: int) -> str: """ Get the example string from the example number. 
@@ -119,7 +133,7 @@ def get_example_str(self, example_no: int) -> str: example_str += f"\n{key}: {value}" return example_str - def format(self, **kwargs: t.Any) -> ChatPromptTemplate: + def format(self, **kwargs: t.Any) -> PromptValue: """ Format the Prompt object into a ChatPromptTemplate object to be used in metrics. """ @@ -128,14 +142,13 @@ def format(self, **kwargs: t.Any) -> ChatPromptTemplate: f"Input variables {self.input_keys} do not match with the given parameters {list(kwargs.keys())}" ) prompt = self.to_string() - human_prompt = HumanMessagePromptTemplate.from_template(prompt) - return ChatPromptTemplate.from_messages([human_prompt.format(**kwargs)]) + return PromptValue(prompt_str=prompt.format(**kwargs)) def adapt( - self, language: str, llm: RagasLLM, cache_dir: t.Optional[str] = None + self, language: str, llm: BaseRagasLLM, cache_dir: t.Optional[str] = None ) -> Prompt: # TODO: Add callbacks - cache_dir = cache_dir if cache_dir else RAGAS_CACHE_HOME + cache_dir = cache_dir if cache_dir else get_cache_dir() if os.path.exists(os.path.join(cache_dir, language, f"{self.name}.json")): return self._load(language, self.name, cache_dir) @@ -159,7 +172,10 @@ def adapt( ) ) - results = [result[0].text for result in llm.generate(prompts).generations] + # NOTE: this is a slow loop, consider Executor to fasten this + results = [] + for p in prompts: + results.append(llm.generate_text(p).generations[0][0].text) per_example_items = len(self.input_keys) + 1 grouped_results = [ results[i : i + per_example_items] @@ -188,14 +204,14 @@ def adapt( return self def save(self, cache_dir: t.Optional[str] = None) -> None: - cache_dir = cache_dir if cache_dir else RAGAS_CACHE_HOME + cache_dir = cache_dir if cache_dir else get_cache_dir() cache_dir = os.path.join(cache_dir, self.language) if not os.path.exists(cache_dir): os.makedirs(cache_dir) cache_path = os.path.join(cache_dir, f"{self.name}.json") with open(cache_path, "w") as file: - json.dump(self.to_json(), file, indent=4) + json.dump(self.dict(), file, indent=4) @classmethod def _load(cls, language: str, name: str, cache_dir: str) -> Prompt: diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index 1d60c15f4..f18bd57b9 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -12,30 +12,22 @@ from ragas.metrics._faithfulness import Faithfulness, faithfulness from ragas.metrics.critique import AspectCritique -DEFAULT_METRICS = [ - answer_relevancy, - context_precision, - faithfulness, - context_recall, - context_relevancy, -] - __all__ = [ + "AnswerCorrectness", + "answer_correctness", "Faithfulness", "faithfulness", - "AnswerRelevancy", - "answer_relevancy", "AnswerSimilarity", "answer_similarity", - "AnswerCorrectness", - "answer_correctness", - "ContextRelevancy", - "context_relevancy", "ContextPrecision", "context_precision", - "AspectCritique", - "ContextRecall", - "context_recall", "ContextUtilization", "context_utilization", + "ContextRecall", + "context_recall", + "AspectCritique", + "context_relevancy", + "ContextRelevancy", + "AnswerRelevancy", + "answer_relevancy", ] diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index ef26754ac..fb323e116 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -5,18 +5,17 @@ from dataclasses import dataclass, field import numpy as np -from datasets import Dataset -from langchain.callbacks.manager import CallbackManager, trace_as_chain_group +from 
ragas.llms.json_load import json_loader from ragas.llms.prompt import Prompt from ragas.metrics._answer_similarity import AnswerSimilarity from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader logger = logging.getLogger(__name__) if t.TYPE_CHECKING: - from langchain.callbacks.base import Callbacks + from langchain_core.callbacks import Callbacks + from langchain_core.outputs import LLMResult CORRECTNESS_PROMPT = Prompt( name="answer_correctness", @@ -111,7 +110,86 @@ def __post_init__(self: t.Self): llm=self.llm, batch_size=self.batch_size ) + def _compute_statement_presence(self, result: LLMResult) -> float: + assert self.llm is not None, "LLM must be set" + + key_map = { + "TP": "statements that are present in both the answer and the ground truth", + "FP": "statements present in the answer but not found in the ground truth", + "FN": "relevant statements found in the ground truth but omitted in the answer", # noqa: E501 + } + outputs = result.generations[0] + + prediction = json_loader.safe_load(outputs[0].text, self.llm) + prediction = prediction if isinstance(prediction, list) else [prediction] + if prediction: + prediction = [ + item.get(key_map[k], np.nan) + for item in prediction + for k in key_map.keys() + ] + tp, fp, fn = [ + len(item) if isinstance(item, list) else np.nan for item in prediction + ] + if any([np.isnan(i) for i in [tp, fp, fn]]): + score = np.nan + else: + score = tp / (tp + 0.5 * (fp + fn)) if tp > 0 else 0 + else: + score = np.nan + + return score + + def _score(self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "LLM must be set" + q, a, g = row["question"], row["answer"], row["ground_truths"][0] + p_value = self.correctness_prompt.format(question=q, ground_truth=g, answer=a) + is_statement_present = self.llm.generate_text(p_value, callbacks=callbacks) + + f1_score = self._compute_statement_presence(is_statement_present) + + if self.weights[1] == 0: + similarity_score = 0 + else: + similarity_score = self.answer_similarity.score(row, callbacks=callbacks) # type: ignore + + score = np.average( + [f1_score, similarity_score], + weights=self.weights, + ) + + return float(score) + + async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "LLM must be set" + + q, a, g = row["question"], row["answer"], row["ground_truths"][0] + p_value = self.correctness_prompt.format(question=q, ground_truth=g, answer=a) + is_statement_present = await self.llm.agenerate_text( + p_value, callbacks=callbacks + ) + + f1_score = self._compute_statement_presence(is_statement_present) + + if self.weights[1] == 0: + similarity_score = 0 + else: + assert self.answer_similarity is not None, "AnswerSimilarity must be set" + + similarity_score = await self.answer_similarity.ascore( + row, callbacks=callbacks + ) + + score = np.average( + [f1_score, similarity_score], + weights=self.weights, + ) + + return float(score) + def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: + assert self.llm is not None, "llm must be set to compute score" + logger.info(f"Adapting AnswerCorrectness metric to {language}") self.correctness_prompt = self.correctness_prompt.adapt( language, self.llm, cache_dir @@ -120,77 +198,5 @@ def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: def save(self, cache_dir: t.Optional[str] = None) -> None: self.correctness_prompt.save(cache_dir) - def _score_batch( - self: t.Self, - dataset: Dataset, - callbacks: 
t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list[float]: - question, answer, ground_truths = ( - dataset["question"], - dataset["answer"], - dataset["ground_truths"], - ) - prompts = [] - - cb = CallbackManager.configure(inheritable_callbacks=callbacks) - with trace_as_chain_group( - callback_group_name, callback_manager=cb - ) as batch_group: - for q, a, g in zip(question, answer, ground_truths): - prompts.append( - self.correctness_prompt.format( - question=q, ground_truth=g[0], answer=a - ) - ) - - result = self.llm.generate(prompts, callbacks=batch_group) - outputs = result.generations - key_map = { - "TP": "statements that are present in both the answer and the ground truth", - "FP": "statements present in the answer but not found in the ground truth", - "FN": "relevant statements found in the ground truth but omitted in the answer", # noqa: E501 - } - - f1_score = [] - for prediction in outputs: - prediction = json_loader.safe_load(prediction[0].text, self.llm) - prediction = ( - prediction if isinstance(prediction, list) else [prediction] - ) - - if prediction: - prediction = [ - item.get(key_map[k], np.nan) - for item in prediction - for k in key_map.keys() - ] - tp, fp, fn = [ - len(item) if isinstance(item, list) else np.nan - for item in prediction - ] - - if any([np.isnan(i) for i in [tp, fp, fn]]): - score = np.nan - else: - score = tp / (tp + 0.5 * (fp + fn)) if tp > 0 else 0 - else: - score = np.nan - - f1_score.append(score) - - if self.weights[1] == 0: - similarity_scores = np.zeros(len(f1_score)) - else: - similarity_scores = self.answer_similarity._score_batch(dataset, callbacks=batch_group) # type: ignore - scores_stacked = np.vstack([f1_score, similarity_scores]) - scores = np.average( - scores_stacked, - axis=0, - weights=self.weights, - ) - - return scores.tolist() - answer_correctness = AnswerCorrectness() diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index 6b736aabd..1a49aff9f 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ b/src/ragas/metrics/_answer_relevance.py @@ -5,22 +5,21 @@ from dataclasses import dataclass, field import numpy as np -from datasets import Dataset -from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from langchain.embeddings import OpenAIEmbeddings from ragas.embeddings.base import embedding_factory from ragas.exceptions import OpenAIKeyNotFound +from ragas.llms.json_load import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader logger = logging.getLogger(__name__) if t.TYPE_CHECKING: - from langchain.callbacks.base import Callbacks + from langchain_core.callbacks import Callbacks - from ragas.embeddings.base import RagasEmbeddings + from ragas.embeddings.base import BaseRagasEmbeddings + from ragas.llms.prompt import PromptValue QUESTION_GEN = Prompt( name="question_generation", @@ -79,7 +78,7 @@ class AnswerRelevancy(MetricWithLLM): question_generation: Prompt = field(default_factory=lambda: QUESTION_GEN) batch_size: int = 15 strictness: int = 3 - embeddings: RagasEmbeddings = field(default_factory=embedding_factory) + embeddings: BaseRagasEmbeddings = field(default_factory=embedding_factory) def init_model(self): super().init_model() @@ -88,55 +87,6 @@ def init_model(self): if self.embeddings.openai_api_key == "no-key": raise OpenAIKeyNotFound - def adapt(self, language: str, cache_dir: str | None = None) -> None: - 
logger.info(f"Adapting AnswerRelevancy metric to {language}") - self.question_generation = self.question_generation.adapt( - language, self.llm, cache_dir - ) - - def save(self, cache_dir: str | None = None) -> None: - self.question_generation.save(cache_dir) - - def _score_batch( - self: t.Self, - dataset: Dataset, - callbacks: t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list[float]: - questions, answers, contexts = ( - dataset["question"], - dataset["answer"], - dataset["contexts"], - ) - - cb = CallbackManager.configure(inheritable_callbacks=callbacks) - with trace_as_chain_group( - callback_group_name, callback_manager=cb - ) as batch_group: - prompts = [] - for ans, ctx in zip(answers, contexts): - prompts.append( - self.question_generation.format(answer=ans, context="\n".join(ctx)) - ) - - results = self.llm.generate( - prompts, - n=self.strictness, - callbacks=batch_group, - ) - results = [ - [json_loader.safe_load(i.text, self.llm) for i in r] - for r in results.generations - ] - scores = [] - for question, result in zip(questions, results): - gen_questions = [item.get("question", "") for item in result] - committal = np.any([item.get("noncommittal", False) for item in result]) - cosine_sim = self.calculate_similarity(question, gen_questions) - scores.append(cosine_sim.mean() * int(not committal)) - - return scores - def calculate_similarity( self: t.Self, question: str, generated_questions: list[str] ): @@ -155,5 +105,59 @@ def calculate_similarity( / norm ) + def _calculate_score(self, response: t.Sequence[t.Any], row: t.Dict) -> float: + question = row["question"] + gen_questions = [item.get("question", "") for item in response] + committal = np.any([item.get("noncommittal", False) for item in response]) + cosine_sim = self.calculate_similarity(question, gen_questions) + score = cosine_sim.mean() * int(not committal) + + return score + + def _create_question_gen_prompt(self, row: t.Dict) -> PromptValue: + ans, ctx = row["answer"], row["contexts"] + return self.question_generation.format(answer=ans, context="\n".join(ctx)) + + def _score(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "LLM is not set" + + prompt = self._create_question_gen_prompt(row) + result = self.llm.generate_text( + prompt, + n=self.strictness, + callbacks=callbacks, + ) + response = [ + json_loader.safe_load(r.text, self.llm) for r in result.generations[0] + ] + + return self._calculate_score(response, row) + + async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "LLM is not set" + + prompt = self._create_question_gen_prompt(row) + result = await self.llm.agenerate_text( + prompt, + n=self.strictness, + callbacks=callbacks, + ) + response = [ + json_loader.safe_load(r.text, self.llm) for r in result.generations[0] + ] + + return self._calculate_score(response, row) + + def adapt(self, language: str, cache_dir: str | None = None) -> None: + assert self.llm is not None, "LLM is not set" + + logger.info(f"Adapting AnswerRelevancy metric to {language}") + self.question_generation = self.question_generation.adapt( + language, self.llm, cache_dir + ) + + def save(self, cache_dir: str | None = None) -> None: + self.question_generation.save(cache_dir) + answer_relevancy = AnswerRelevancy() diff --git a/src/ragas/metrics/_answer_similarity.py b/src/ragas/metrics/_answer_similarity.py index 350c44c70..09b13de9b 100644 --- a/src/ragas/metrics/_answer_similarity.py +++ 
b/src/ragas/metrics/_answer_similarity.py @@ -5,20 +5,14 @@ from dataclasses import dataclass, field import numpy as np -from datasets import Dataset - -from ragas.embeddings.base import ( - HuggingfaceEmbeddings, - OpenAIEmbeddings, - embedding_factory, -) -from ragas.exceptions import OpenAIKeyNotFound + +from ragas.embeddings.base import HuggingfaceEmbeddings, embedding_factory from ragas.metrics.base import EvaluationMode, MetricWithLLM if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks - from ragas.embeddings.base import RagasEmbeddings + from ragas.embeddings.base import BaseRagasEmbeddings logger = logging.getLogger(__name__) @@ -48,7 +42,7 @@ class AnswerSimilarity(MetricWithLLM): name: str = "answer_similarity" # type: ignore evaluation_mode: EvaluationMode = EvaluationMode.ga # type: ignore batch_size: int = 15 - embeddings: RagasEmbeddings = field(default_factory=embedding_factory) + embeddings: BaseRagasEmbeddings = field(default_factory=embedding_factory) is_cross_encoder: bool = False threshold: t.Optional[float] = None @@ -64,23 +58,14 @@ def __post_init__(self: t.Self): def init_model(self): super().init_model() - if isinstance(self.embeddings, OpenAIEmbeddings): - if self.embeddings.openai_api_key == "no-key": - raise OpenAIKeyNotFound - - def _score_batch( - self: t.Self, - dataset: Dataset, - callbacks: t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list[float]: - ground_truths, answers = dataset["ground_truths"], dataset["answer"] + def _score(self, row: t.Dict, callbacks: Callbacks) -> float: + ground_truths, answers = row["ground_truths"], row["answer"] ground_truths = [item[0] for item in ground_truths] - if self.is_cross_encoder: - assert isinstance(self.embeddings, HuggingfaceEmbeddings) - inputs = [list(item) for item in list(zip(ground_truths, answers))] - scores = np.array(self.embeddings.predict(inputs)) + if self.is_cross_encoder and isinstance(self.embeddings, HuggingfaceEmbeddings): + raise NotImplementedError( + "async score [ascore()] not implemented for HuggingFace embeddings" + ) else: embeddings_1 = np.array(self.embeddings.embed_documents(ground_truths)) embeddings_2 = np.array(self.embeddings.embed_documents(answers)) @@ -91,7 +76,29 @@ def _score_batch( if self.threshold: scores = scores >= self.threshold # type: ignore - return scores.tolist() + return scores.tolist()[0] + + async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks = []) -> float: + ground_truths, answers = row["ground_truths"], row["answer"] + ground_truths = [item[0] for item in ground_truths] + + if self.is_cross_encoder and isinstance(self.embeddings, HuggingfaceEmbeddings): + raise NotImplementedError( + "async score [ascore()] not implemented for HuggingFace embeddings" + ) + else: + embeddings_1 = np.array( + await self.embeddings.aembed_documents(ground_truths) + ) + embeddings_2 = np.array(await self.embeddings.aembed_documents(answers)) + similarity = embeddings_1 @ embeddings_2.T + scores = np.diagonal(similarity) + + assert isinstance(scores, np.ndarray), "Expects ndarray" + if self.threshold: + scores = scores >= self.threshold # type: ignore + + return scores.tolist()[0] answer_similarity = AnswerSimilarity() diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py index 46643cfe5..872254a2a 100644 --- a/src/ragas/metrics/_context_precision.py +++ b/src/ragas/metrics/_context_precision.py @@ -6,14 +6,15 @@ import numpy as np from datasets import Dataset -from 
langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from ragas.llms.prompt import Prompt +from ragas.llms.json_load import json_loader +from ragas.llms.prompt import Prompt, PromptValue from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader if t.TYPE_CHECKING: - from langchain.callbacks.base import Callbacks + from langchain_core.callbacks import Callbacks + +logger = logging.getLogger(__name__) CONTEXT_PRECISION = Prompt( name="context_precision", @@ -62,8 +63,8 @@ class ContextPrecision(MetricWithLLM): Attributes ---------- name : str - batch_size : int - Batch size for openai completion. + evaluation_mode: EvaluationMode + context_precision_prompt: Prompt """ name: str = "context_precision" # type: ignore @@ -71,84 +72,91 @@ class ContextPrecision(MetricWithLLM): context_precision_prompt: Prompt = field(default_factory=lambda: CONTEXT_PRECISION) batch_size: int = 15 - def adapt(self, language: str, cache_dir: str | None = None) -> None: - logging.info(f"Adapting Context Precision to {language}") - self.context_precision_prompt = self.context_precision_prompt.adapt( - language, self.llm, cache_dir - ) - - def save(self, cache_dir: str | None = None) -> None: - self.context_precision_prompt.save(cache_dir) - - def get_dataset_attributes(self, dataset: Dataset): + def _get_row_attributes(self, row: t.Dict) -> t.Tuple[str, t.List[str], t.Any]: answer = "ground_truths" - if answer not in dataset.features.keys(): - logging.warning( + if answer not in row.keys(): + logger.warning( "Using 'context_precision' without ground truth will soon be deprecated. Use 'context_utilization' instead" ) answer = "answer" - return dataset["question"], dataset["contexts"], dataset[answer] + return row["question"], row["contexts"], row[answer] + + def _context_precision_prompt(self, row: t.Dict) -> t.List[PromptValue]: + question, contexts, answer = self._get_row_attributes(row) + return [ + self.context_precision_prompt.format( + question=question, context=c, answer=answer + ) + for c in contexts + ] + + def _calculate_average_precision(self, json_responses: t.List[t.Dict]) -> float: + score = np.nan + verdict_list = [ + int("1" == resp.get("verdict", "0").strip()) + if resp.get("verdict") + else np.nan + for resp in json_responses + ] + # average precision: mean of precision@k over the positions judged relevant + denominator = sum(verdict_list) + 1e-10 + numerator = sum( + [ + (sum(verdict_list[: i + 1]) / (i + 1)) * verdict_list[i] + for i in range(len(verdict_list)) + ] + ) + score = numerator / denominator + return score + + def _score(self, row: t.Dict, callbacks: Callbacks = []) -> float: + assert self.llm is not None, "LLM is not set" - def _score_batch( + human_prompts = self._context_precision_prompt(row) + responses: t.List[str] = [] + for hp in human_prompts: + result = self.llm.generate_text( + hp, + n=1, + callbacks=callbacks, + ) + responses.append(result.generations[0][0].text) + + json_responses = [json_loader.safe_load(item, self.llm) for item in responses] + score = self._calculate_average_precision(json_responses) + return score + + async def _ascore( self: t.Self, - dataset: Dataset, - callbacks: t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list: - prompts = [] - questions, contexts, answers = self.get_dataset_attributes(dataset) - - cb = CallbackManager.configure(inheritable_callbacks=callbacks) - with trace_as_chain_group( - callback_group_name, callback_manager=cb ) as batch_group: - for qstn, ctx, answer in zip(questions, contexts, answers): - human_prompts = [ -
self.context_precision_prompt.format( - question=qstn, context=c, answer=answer - ) - for c in ctx - ] - - prompts.extend(human_prompts) - - responses: list[list[str]] = [] - results = self.llm.generate( - prompts, + row: t.Dict, + callbacks: Callbacks = [], + ) -> float: + assert self.llm is not None, "LLM is not set" + + human_prompts = self._context_precision_prompt(row) + responses: t.List[str] = [] + for hp in human_prompts: + result = await self.llm.agenerate_text( + hp, n=1, - callbacks=batch_group, + callbacks=callbacks, ) - responses = [[i.text for i in r] for r in results.generations] - context_lens = [len(ctx) for ctx in contexts] - context_lens.insert(0, 0) - context_lens = np.cumsum(context_lens) - grouped_responses = [ - responses[start:end] - for start, end in zip(context_lens[:-1], context_lens[1:]) - ] - scores = [] - - for response in grouped_responses: - response = [ - json_loader.safe_load(item, self.llm) for item in sum(response, []) - ] - response = [ - int("1" == resp.get("verdict", "0").strip()) - if resp.get("verdict") - else np.nan - for resp in response - ] - denominator = sum(response) + 1e-10 - numerator = sum( - [ - (sum(response[: i + 1]) / (i + 1)) * response[i] - for i in range(len(response)) - ] - ) - scores.append(numerator / denominator) - - return scores + responses.append(result.generations[0][0].text) + + json_responses = [json_loader.safe_load(item, self.llm) for item in responses] + score = self._calculate_average_precision(json_responses) + return score + + def adapt(self, language: str, cache_dir: str | None = None) -> None: + assert self.llm is not None, "LLM is not set" + + logging.info(f"Adapting Context Precision to {language}") + self.context_precision_prompt = self.context_precision_prompt.adapt( + language, self.llm, cache_dir + ) + + def save(self, cache_dir: str | None = None) -> None: + self.context_precision_prompt.save(cache_dir) @dataclass diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index 3dd0fcc4d..d3abb430d 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -5,15 +5,15 @@ from dataclasses import dataclass, field import numpy as np -from datasets import Dataset -from langchain.callbacks.manager import CallbackManager, trace_as_chain_group +from ragas.llms.json_load import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader if t.TYPE_CHECKING: - from langchain.callbacks.base import Callbacks + from langchain_core.callbacks import Callbacks + + from ragas.llms.prompt import PromptValue logger = logging.getLogger(__name__) @@ -86,7 +86,50 @@ class ContextRecall(MetricWithLLM): context_recall_prompt: Prompt = field(default_factory=lambda: CONTEXT_RECALL_RA) batch_size: int = 15 + def _create_context_recall_prompt(self, row: t.Dict) -> PromptValue: + qstn, ctx, gt = row["question"], row["contexts"], row["ground_truths"] + gt = "\n".join(gt) if isinstance(gt, list) else gt + ctx = "\n".join(ctx) if isinstance(ctx, list) else ctx + + return self.context_recall_prompt.format(question=qstn, context=ctx, answer=gt) + + def _compute_score(self, response: t.Any) -> float: + if response: + response = [ + int(item.get("Attributed", "0").strip() == "1") + if item.get("Attributed") + else np.nan + for item in response + ] + denom = len(response) + numerator = sum(response) + return numerator / denom + else: + return np.nan + + def _score(self: t.Self, row: t.Dict, 
callbacks: Callbacks) -> float: + assert self.llm is not None, "set LLM before use" + + result = self.llm.generate_text( + self._create_context_recall_prompt(row), callbacks=callbacks + ) + response = json_loader.safe_load(result.generations[0][0].text, self.llm) + + return self._compute_score(response) + + async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "set LLM before use" + + result = await self.llm.agenerate_text( + self._create_context_recall_prompt(row), callbacks=callbacks + ) + response = json_loader.safe_load(result.generations[0][0].text, self.llm) + + return self._compute_score(response) + def adapt(self, language: str, cache_dir: str | None = None) -> None: + assert self.llm is not None, "set LLM before use" + logger.info(f"Adapting Context Recall to {language}") self.context_recall_prompt = self.context_recall_prompt.adapt( language, self.llm, cache_dir @@ -95,56 +138,5 @@ def adapt(self, language: str, cache_dir: str | None = None) -> None: def save(self, cache_dir: str | None = None) -> None: self.context_recall_prompt.save(cache_dir) - def _score_batch( - self: t.Self, - dataset: Dataset, - callbacks: t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list: - prompts = [] - question, ground_truths, contexts = ( - dataset["question"], - dataset["ground_truths"], - dataset["contexts"], - ) - - cb = CallbackManager.configure(inheritable_callbacks=callbacks) - with trace_as_chain_group( - callback_group_name, callback_manager=cb - ) as batch_group: - for qstn, gt, ctx in zip(question, ground_truths, contexts): - gt = "\n".join(gt) if isinstance(gt, list) else gt - ctx = "\n".join(ctx) if isinstance(ctx, list) else ctx - prompts.append( - self.context_recall_prompt.format( - question=qstn, context=ctx, answer=gt - ) - ) - - responses: list[list[str]] = [] - results = self.llm.generate( - prompts, - n=1, - callbacks=batch_group, - ) - responses = [[i.text for i in r] for r in results.generations] - scores = [] - for response in responses: - response = json_loader.safe_load(response[0], self.llm) - if response: - response = [ - int(item.get("Attributed", "0").strip() == "1") - if item.get("Attributed") - else np.nan - for item in response - ] - denom = len(response) - numerator = sum(response) - scores.append(numerator / denom) - else: - scores.append(np.nan) - - return scores - context_recall = ContextRecall() diff --git a/src/ragas/metrics/_context_relevancy.py b/src/ragas/metrics/_context_relevancy.py index a43fa4454..a7031adf4 100644 --- a/src/ragas/metrics/_context_relevancy.py +++ b/src/ragas/metrics/_context_relevancy.py @@ -7,8 +7,6 @@ import numpy as np import pysbd -from datasets import Dataset -from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM @@ -58,66 +56,65 @@ class ContextRelevancy(MetricWithLLM): batch_size: int = 15 show_deprecation_warning: bool = False - def adapt(self, language: str, cache_dir: str | None = None) -> None: - logger.info(f"Adapting Context Relevancy to {language}") - self.context_relevancy_prompt = self.context_relevancy_prompt.adapt( - language, self.llm, cache_dir + def _compute_score(self, response: str, row: t.Dict) -> float: + context = "\n".join(row["contexts"]) + context_sents = sent_tokenize(context) + indices = ( + sent_tokenize(response.strip()) + if response.lower() != "insufficient information." 
+ else [] ) + # print(len(indices)) + if len(context_sents) == 0: + return 0 + else: + return min(len(indices) / len(context_sents), 1) - def save(self, cache_dir: str | None = None) -> None: - self.context_relevancy_prompt.save(cache_dir) + def _score(self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "LLM is not initialized" - def _score_batch( - self: t.Self, - dataset: Dataset, - callbacks: t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list[float]: if self.show_deprecation_warning: logger.warning( "The 'context_relevancy' metric is going to be deprecated soon! Please use the 'context_precision' metric instead. It is a drop-in replacement just a simple search and replace should work." # noqa ) - prompts = [] - questions, contexts = dataset["question"], dataset["contexts"] - - cb = CallbackManager.configure(inheritable_callbacks=callbacks) - with trace_as_chain_group( - callback_group_name, callback_manager=cb - ) as batch_group: - for q, c in zip(questions, contexts): - prompts.append( - self.context_relevancy_prompt.format( - question=q, context="\n".join(c) - ) - ) - - responses: list[list[str]] = [] - results = self.llm.generate( - prompts, - n=1, - callbacks=batch_group, + + question, contexts = row["question"], row["contexts"] + result = self.llm.generate_text( + self.context_relevancy_prompt.format( + question=question, context="\n".join(contexts) + ), + callbacks=callbacks, + ) + + return self._compute_score(result.generations[0][0].text, row) + + async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "LLM is not initialized" + + if self.show_deprecation_warning: + logger.warning( + "The 'context_relevancy' metric is going to be deprecated soon! Please use the 'context_precision' metric instead. It is a drop-in replacement just a simple search and replace should work." # noqa ) - responses = [[i.text for i in r] for r in results.generations] - - scores = [] - for context, n_response in zip(contexts, responses): - context = "\n".join(context) - overlap_scores = [] - context_sents = sent_tokenize(context) - for output in n_response: - indices = ( - sent_tokenize(output.strip()) - if output.lower() != "insufficient information." 
- else [] - ) - if len(context_sents) == 0: - score = 0 - else: - score = min(len(indices) / len(context_sents), 1) - overlap_scores.append(score) - scores.append(np.mean(overlap_scores)) - - return scores + + question, contexts = row["question"], row["contexts"] + result = await self.llm.agenerate_text( + self.context_relevancy_prompt.format( + question=question, context="\n".join(contexts) + ), + callbacks=callbacks, + ) + return self._compute_score(result.generations[0][0].text, row) + + def adapt(self, language: str, cache_dir: str | None = None) -> None: + assert self.llm is not None, "set LLM before use" + + logger.info(f"Adapting Context Relevancy to {language}") + self.context_relevancy_prompt = self.context_relevancy_prompt.adapt( + language, self.llm, cache_dir + ) + + def save(self, cache_dir: str | None = None) -> None: + self.context_relevancy_prompt.save(cache_dir) context_relevancy = ContextRelevancy() diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 9c24cf142..53154704a 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -5,15 +5,16 @@ from dataclasses import dataclass, field import numpy as np -from langchain.callbacks.manager import CallbackManager, trace_as_chain_group +from ragas.llms.json_load import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader if t.TYPE_CHECKING: - from datasets import Dataset - from langchain.callbacks.base import Callbacks + from langchain_core.callbacks import Callbacks + from langchain_core.outputs import LLMResult + + from ragas.llms.prompt import PromptValue logger = logging.getLogger(__name__) @@ -119,7 +120,79 @@ class Faithfulness(MetricWithLLM): ) batch_size: int = 15 + def _create_answer_prompt(self, row: t.Dict) -> PromptValue: + question, answer = row["question"], row["answer"] + + # extract statements from answer given the question + prompt_value = LONG_FORM_ANSWER_PROMPT.format(question=question, answer=answer) + return prompt_value + + def _create_nli_prompt(self, row: t.Dict, answer_result: LLMResult) -> PromptValue: + assert self.llm is not None, "llm must be set to compute score" + + contexts = row["contexts"] + # check if the statements are support in the contexts + contexts_str: str = "\n".join(contexts) + statements = json_loader.safe_load( + answer_result.generations[0][0].text, self.llm + ).get("statements", []) + statements = statements if statements != [] else ["Nil"] + statements_str: str = "\n".join( + [f"statement_{i+1}: {st}" for i, st in enumerate(statements)] + ) + prompt_value = NLI_STATEMENTS_MESSAGE.format( + context=contexts_str, statements=statements_str + ) + return prompt_value + + def _compute_score(self, result: LLMResult): + assert self.llm is not None, "llm must be set to compute score" + + # check the verdicts and compute the score + output = result.generations[0][0] + verdict_score_map = {"1": 1, "0": 0, "null": np.nan} + output = json_loader.safe_load(output.text, self.llm) + output = output if isinstance(output, list) else [output] + faithful_statements = sum( + verdict_score_map.get( + statement_with_validation.get("verdict", "").lower(), np.nan + ) + for statement_with_validation in output + ) + num_statements = len(output) + if num_statements: + score = faithful_statements / num_statements + else: + score = np.nan + + return score + + async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: + """ + 
returns the NLI score for each (q, c, a) pair + """ + assert self.llm is not None, "LLM is not set" + p = self._create_answer_prompt(row) + result = await self.llm.agenerate_text(p, callbacks=callbacks) + + p = self._create_nli_prompt(row, result) + result = await self.llm.agenerate_text(p, callbacks=callbacks) + + return self._compute_score(result) + + def _score(self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "LLM is not set" + p = self._create_answer_prompt(row) + result = self.llm.generate_text(p, callbacks=callbacks) + + p = self._create_nli_prompt(row, result) + result = self.llm.generate_text(p, callbacks=callbacks) + + return self._compute_score(result) + def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: + assert self.llm is not None, "LLM is not set" + logger.info(f"Adapting Faithfulness metric to {language}") self.long_form_answer_prompt = self.long_form_answer_prompt.adapt( language, self.llm, cache_dir @@ -132,67 +205,5 @@ def save(self, cache_dir: t.Optional[str] = None) -> None: self.long_form_answer_prompt.save(cache_dir) self.nli_statements_message.save(cache_dir) - def _score_batch( - self: t.Self, - dataset: Dataset, - callbacks: t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list[float]: - """ - returns the NLI score for each (q, c, a) pair - """ - - question, answer, contexts = ( - dataset["question"], - dataset["answer"], - dataset["contexts"], - ) - prompts = [] - - cb = CallbackManager.configure(inheritable_callbacks=callbacks) - with trace_as_chain_group( - callback_group_name, callback_manager=cb - ) as batch_group: - for q, a in zip(question, answer): - human_prompt = LONG_FORM_ANSWER_PROMPT.format(question=q, answer=a) - prompts.append(human_prompt) - - result = self.llm.generate(prompts, callbacks=batch_group) - - prompts = [] - for context, output in zip(contexts, result.generations): - statements = json_loader.safe_load(output[0].text, self.llm).get( - "statements", [] - ) - statements = statements if statements != [] else ["Nil"] - statements_str: str = "\n".join( - [f"statement_{i+1}: {st}" for i, st in enumerate(statements)] - ) - contexts_str: str = "\n".join(context) - human_prompt = self.nli_statements_message.format( - context=contexts_str, statements=statements_str - ) - prompts.append(human_prompt) - - result = self.llm.generate(prompts, callbacks=batch_group) - outputs = result.generations - verdict_score_map = {"1": 1, "0": 0, "null": np.nan} - scores = [] - for output in outputs: - output = json_loader.safe_load(output[0].text, self.llm) - output = output if isinstance(output, list) else [output] - faithful_statements = sum( - verdict_score_map.get(str(dict.get("verdict", "")).lower(), np.nan) - for dict in output - ) - num_statements = len(output) - if num_statements: - score = faithful_statements / num_statements - else: - score = np.nan - scores.append(score) - - return scores - faithfulness = Faithfulness() diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index cfe7225c9..79e832ab7 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -8,34 +8,15 @@ import typing as t from abc import ABC, abstractmethod -from dataclasses import dataclass, field +from dataclasses import dataclass from enum import Enum -from math import floor -from datasets import Dataset -from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from tqdm import tqdm - -from ragas.embeddings.base import RagasEmbeddings -from ragas.llms 
import RagasLLM, llm_factory +from ragas.callbacks import new_group if t.TYPE_CHECKING: - from langchain.callbacks.base import Callbacks - + from langchain_core.callbacks import Callbacks -def make_batches(total_size: int, batch_size: int) -> list[range]: - """ - Take a total size and batch size and return a list of ranges for the batches - """ - tail = total_size % batch_size - num_batches = floor(total_size / batch_size) - batches = [ - range(i, i + batch_size) for i in range(0, batch_size * num_batches, batch_size) - ] - if tail != 0: - batches.append(range(batch_size * num_batches, batch_size * num_batches + tail)) - - return batches + from ragas.llms import BaseRagasLLM EvaluationMode = Enum("EvaluationMode", "qac qa qc gc ga qga qcg") @@ -62,67 +43,68 @@ def init_model(self): """ ... - # @abstractmethod def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: """ Adapt the metric to a different language. """ - pass + raise NotImplementedError( + "adapt() is not implemented for {} metric".format(self.name) + ) - # @abstractmethod def save(self, cache_dir: t.Optional[str] = None) -> None: """ Save the metric to a path. """ - pass + raise NotImplementedError( + "save() is not implemented for {} metric".format(self.name) + ) def score( self: t.Self, - dataset: Dataset, - callbacks: t.Optional[Callbacks] = None, - ) -> Dataset: - scores = [] - cm = CallbackManager.configure(inheritable_callbacks=callbacks) - with trace_as_chain_group(f"ragas_{self.name}", callback_manager=cm) as group: - for batch in tqdm(self.get_batches(len(dataset))): - score = self._score_batch(dataset.select(batch), callbacks=group) - scores.extend(score) - - return dataset.add_column(f"{self.name}", scores) # type: ignore + row: t.Dict, + callbacks: Callbacks = [], + ) -> float: + rm, group_cm = new_group( + self.name, inputs=row, callbacks=callbacks, is_async=False + ) + try: + score = self._score(row=row, callbacks=group_cm) + except Exception as e: + if not group_cm.ended: + rm.on_chain_error(e) + raise e + else: + if not group_cm.ended: + rm.on_chain_end({"output": score}) + return score @abstractmethod - def _score_batch( - selfself: t.Self, - dataset: Dataset, - callbacks: t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list: + def _score(self, row: t.Dict, callbacks: Callbacks) -> float: ... - def score_single( - self: t.Self, - ds_row: dict, - callbacks: t.Optional[Callbacks] = None, - ) -> float: - """ - Score for a single row of dataset - """ - # TODO: validation check if they are string - - ds = Dataset.from_dict({k: [v] for k, v in ds_row.items()}) - score = self._score_batch( - ds, callback_group_name=self.name, callbacks=callbacks + async def ascore(self: t.Self, row: t.Dict, callbacks: Callbacks = []) -> float: + rm, group_cm = new_group( + self.name, inputs=row, callbacks=callbacks, is_async=True ) + try: + score = await self._ascore(row=row, callbacks=group_cm) + except Exception as e: + if not group_cm.ended: + rm.on_chain_error(e) + raise e + else: + if not group_cm.ended: + rm.on_chain_end({"output": score}) + return score - return score[0] - - def get_batches(self, dataset_size: int) -> list[range]: - return make_batches(dataset_size, self.batch_size) + @abstractmethod + async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: + ...
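+# Usage sketch (illustrative; `q`, `a`, `ctxs` and the metric object are placeholders):
+# with `metric.llm` set to a BaseRagasLLM wrapper, a metric is now scored one row at a time,
+#     metric.score({"question": q, "answer": a, "contexts": ctxs})
+# or `await metric.ascore(row)` from async code; both wrap _score/_ascore in a
+# callback group created via new_group() and return a single float.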
@dataclass class MetricWithLLM(Metric): - llm: RagasLLM = field(default_factory=llm_factory) + llm: t.Optional[BaseRagasLLM] = None def init_model(self): """ @@ -130,10 +112,7 @@ def init_model(self): to load all the models Also check if the api key is valid for OpenAI and AzureOpenAI """ - if hasattr(self.llm, "validate_api_key"): - self.llm.validate_api_key() - if hasattr(self, "embeddings"): - # since we are using Langchain Embeddings directly, we need to check this - if hasattr(self.embeddings, "validate_api_key"): - self.embeddings = t.cast(RagasEmbeddings, self.embeddings) - self.embeddings.validate_api_key() + if self.llm is None: + raise ValueError( + f"Metric '{self.name}' has no valid LLM provided (self.llm is None). Please instantiate the metric with an LLM to run." # noqa + ) diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index 69105a3ed..213fcd51a 100644 --- a/src/ragas/metrics/critique.py +++ b/src/ragas/metrics/critique.py @@ -6,18 +6,15 @@ from dataclasses import dataclass, field import numpy as np -from datasets import Dataset -from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from ragas.llms import llm_factory +from ragas.llms.json_load import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks - from ragas.llms import RagasLLM + from ragas.llms import BaseRagasLLM logger = logging.getLogger(__name__) @@ -69,8 +66,8 @@ class AspectCritique(MetricWithLLM): definition: str = field(default="", repr=True) strictness: int = field(default=1, repr=False) batch_size: int = field(default=15, repr=False) - llm: RagasLLM = field( - default_factory=llm_factory, + llm: BaseRagasLLM | None = field( + default=None, repr=False, ) @@ -85,13 +82,6 @@ def __post_init__(self: t.Self): self.strictness if self.strictness % 2 != 0 else self.strictness + 1 ) - def adapt(self, language: str, cache_dir: str | None = None) -> None: - logger.info(f"Adapting Critic to {language}") - self.critic_prompt.adapt(language, self.llm, cache_dir) - - def save(self, cache_dir: str | None = None) -> None: - self.critic_prompt.save(cache_dir) - def prompt_format( self: t.Self, question: str, @@ -106,57 +96,58 @@ def prompt_format( input=question, submission=answer, criteria=self.definition ) - def _score_batch( - self: t.Self, - dataset: Dataset, - callbacks: t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list[int]: - questions, contexts, answers = [ - dataset[key] if key in dataset.features else None - for key in ("question", "context", "answer") - ] - assert isinstance(questions, list) - assert isinstance(answers, list) - if contexts is None: - contexts = [None] * len(questions) - - prompts = [] - - cb = CallbackManager.configure(inheritable_callbacks=callbacks) - with trace_as_chain_group( - callback_group_name, callback_manager=cb - ) as batch_group: - for question, context, answer in zip(questions, contexts, answers): - human_prompt = self.prompt_format(question, answer, context) - prompts.append(human_prompt) - - results = self.llm.generate( - prompts, - n=self.strictness, - callbacks=batch_group, + def _compute_score(self, safe_loaded_responses): + ANSWER_DICT = {"1": 1, "0": 0} + if self.strictness > 1: + score = Counter( + [ + ANSWER_DICT.get(item.get("verdict", np.nan), np.nan) + for item in safe_loaded_responses + ] + ).most_common(1)[0][0] + else: + score
= ANSWER_DICT.get( + safe_loaded_responses[0].get("verdict", np.nan), np.nan ) - responses: list[list[str]] = [ - [i.text for i in r] for r in results.generations - ] - - scores = [] - answer_dict = {"1": 1, "0": 0} - for response in responses: - response = [json_loader.safe_load(item, self.llm) for item in response] - if self.strictness > 1: - score = Counter( - [ - answer_dict.get(item.get("verdict", np.nan), np.nan) - for item in response - ] - ).most_common(1)[0][0] - else: - score = answer_dict.get(response[0].get("verdict", np.nan), np.nan) - - scores.append(score) - - return scores + + return score + + def _score(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "set LLM before use" + + q, c, a = row["question"], row["contexts"], row["answer"] + + result = self.llm.generate_text( + self.prompt_format(q, a, c), callbacks=callbacks + ) + + responses = [r.text for r in result.generations[0]] + safe_loaded_responses = [json_loader.safe_load(r, self.llm) for r in responses] + + return self._compute_score(safe_loaded_responses) + + async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "set LLM before use" + + q, c, a = row["question"], row["contexts"], row["answer"] + + result = await self.llm.agenerate_text( + self.prompt_format(q, a, c), callbacks=callbacks + ) + + responses = [r.text for r in result.generations[0]] + safe_loaded_responses = [json_loader.safe_load(r, self.llm) for r in responses] + + return self._compute_score(safe_loaded_responses) + + def adapt(self, language: str, cache_dir: str | None = None) -> None: + assert self.llm is not None, "set LLM before use" + + logger.info(f"Adapting Critic to {language}") + self.critic_prompt.adapt(language, self.llm, cache_dir) + + def save(self, cache_dir: str | None = None) -> None: + self.critic_prompt.save(cache_dir) harmfulness = AspectCritique( diff --git a/src/ragas/testset/testset_generator.py b/src/ragas/testset/testset_generator.py index f53206ba9..259b8316d 100644 --- a/src/ragas/testset/testset_generator.py +++ b/src/ragas/testset/testset_generator.py @@ -32,6 +32,7 @@ from tqdm import tqdm from ragas.llms import llm_factory +from ragas.llms.json_load import load_as_json from ragas.testset.prompts import ( ANSWER_FORMULATE, COMPRESS_QUESTION, @@ -45,10 +46,9 @@ SEED_QUESTION, ) from ragas.testset.utils import load_as_score -from ragas.utils import load_as_json if t.TYPE_CHECKING: - from ragas.llms.base import RagasLLM + from ragas.llms.base import BaseRagasLLM DEFAULT_TEST_DISTRIBUTION = { @@ -127,8 +127,8 @@ class TestsetGenerator: def __init__( self, - generator_llm: RagasLLM, - critic_llm: RagasLLM, + generator_llm: BaseRagasLLM, + critic_llm: BaseRagasLLM, embeddings_model: Embeddings, testset_distribution: t.Optional[t.Dict[str, float]] = None, chat_qa: float = 0.0, @@ -198,7 +198,7 @@ def _filter_context(self, context: str) -> bool: """ human_prompt = SCORE_CONTEXT.format(context=context) prompt = ChatPromptTemplate.from_messages([human_prompt]) - results = self.critic_llm.generate(prompts=[prompt]) + results = self.critic_llm.generate_text_with_hmpt(prompts=[prompt]) output = results.generations[0][0].text.strip() score = load_as_score(output) return score >= self.threshold @@ -206,14 +206,14 @@ def _filter_context(self, context: str) -> bool: def _seed_question(self, context: str) -> str: human_prompt = SEED_QUESTION.format(context=context) prompt = ChatPromptTemplate.from_messages([human_prompt]) - results = 
self.generator_llm.generate(prompts=[prompt]) + results = self.generator_llm.generate_text_with_hmpt(prompts=[prompt]) return results.generations[0][0].text.strip() def _filter_question(self, question: str) -> bool: human_prompt = FILTER_QUESTION.format(question=question) prompt = ChatPromptTemplate.from_messages([human_prompt]) - results = self.critic_llm.generate(prompts=[prompt]) + results = self.critic_llm.generate_text_with_hmpt(prompts=[prompt]) results = results.generations[0][0].text.strip() json_results = load_as_json(results) return json_results.get("verdict") != "No" @@ -231,7 +231,7 @@ def _multicontext_question( question=question, context1=context1, context2=context2 ) prompt = ChatPromptTemplate.from_messages([human_prompt]) - results = self.generator_llm.generate(prompts=[prompt]) + results = self.generator_llm.generate_text_with_hmpt(prompts=[prompt]) return results.generations[0][0].text.strip() def _compress_question(self, question: str) -> str: @@ -243,13 +243,13 @@ def _conversational_question(self, question: str) -> str: def _question_transformation(self, prompt, question: str) -> str: human_prompt = prompt.format(question=question) prompt = ChatPromptTemplate.from_messages([human_prompt]) - results = self.generator_llm.generate(prompts=[prompt]) + results = self.generator_llm.generate_text_with_hmpt(prompts=[prompt]) return results.generations[0][0].text.strip() def _qc_template(self, prompt, question, context) -> str: human_prompt = prompt.format(question=question, context=context) prompt = ChatPromptTemplate.from_messages([human_prompt]) - results = self.generator_llm.generate(prompts=[prompt]) + results = self.generator_llm.generate_text_with_hmpt(prompts=[prompt]) return results.generations[0][0].text.strip() def _generate_answer(self, question: str, context: t.List[str]) -> t.List[str]: diff --git a/src/ragas/utils.py b/src/ragas/utils.py index 00cc57d5f..49a1b3423 100644 --- a/src/ragas/utils.py +++ b/src/ragas/utils.py @@ -1,27 +1,20 @@ from __future__ import annotations -import json import os -import typing as t -import warnings -from dataclasses import dataclass from functools import lru_cache -from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate - -if t.TYPE_CHECKING: - from ragas.llms import RagasLLM - DEBUG_ENV_VAR = "RAGAS_DEBUG" # constant to tell us that there is no key passed to the llm/embeddings NO_KEY = "no-key" -# Cache location -DEFAULT_XDG_CACHE_HOME = "~/.cache" -XDG_CACHE_HOME = os.getenv("XDG_CACHE_HOME", DEFAULT_XDG_CACHE_HOME) -DEFAULT_RAGAS_CACHE_HOME = os.path.join(XDG_CACHE_HOME, "ragas") -RAGAS_CACHE_HOME = os.path.expanduser(os.getenv("RAGAS_HOME", DEFAULT_RAGAS_CACHE_HOME)) + +@lru_cache(maxsize=1) +def get_cache_dir() -> str: + "get cache location" + DEFAULT_XDG_CACHE_HOME = "~/.cache" + xdg_cache = os.getenv("XDG_CACHE_HOME", DEFAULT_XDG_CACHE_HOME) + default_ragas_cache = os.path.join(xdg_cache, "ragas") + return os.path.expanduser(os.getenv("RAGAS_CACHE_HOME", default_ragas_cache)) @lru_cache(maxsize=1) @@ -30,133 +23,3 @@ def get_debug_mode() -> bool: return True else: return False - - -def load_as_json(text): - """ - validate and return given text as json - """ - - try: - return json.loads(text) - except ValueError as e: - warnings.warn(f"Invalid json: {e}") - - return {} - - -# not migrating to Prompt format to avoid circular imports -JSON_PROMPT = HumanMessagePromptTemplate.from_template( - """ - -Rewrite the input into valid 
json - - -Input: -{{ - "name": "John Doe", - "age": 30, - "isStudent": false - "address": {{ - "street": "123 Main St", - "city": "Anytown", - "state": "CA", - }} - "hobbies": ["reading", "swimming", "cycling"] -}} -Output: -{{ - "name": "John Doe", - "age": 30, - "isStudent": false, - "address": {{ - "street": "123 Main St", - "city": "Anytown", - "state": "CA" - }}, - "hobbies": ["reading", "swimming", "cycling"] -}} - - -Input: -{{ - "statement": "The Earth is also known as "Terra" " -}} -Output: -{{ - "statement": "The Earth is also known as 'Terra'" -}} - -Input: -{input} - -Output: -""" -) - - -@dataclass -class JsonLoader: - max_retries: int = 2 - - def safe_load(self, text: str, llm: RagasLLM): - retry = 0 - while retry <= self.max_retries: - try: - start, end = self._find_outermost_json(text) - return json.loads(text[start:end]) - except ValueError: - text = self._fix_to_json(text, llm) - retry += 1 - - return {} - - def _fix_to_json( - self, - text, - llm, - callbacks: t.Optional[CallbackManager] = None, - callback_group_name: str = "batch", - ): - # TODO (executor) - with trace_as_chain_group( - callback_group_name, callback_manager=callbacks - ) as batch_group: - human_prompt = ChatPromptTemplate.from_messages( - [JSON_PROMPT.format(input=text)] - ) - results = llm.generate( - [human_prompt], - n=1, - callbacks=batch_group, - ) - return results.generations[0][0].text - - def _find_outermost_json(self, text): - stack = [] - start_index = -1 - - for i, char in enumerate(text): - if char in "{[": - if len(stack) == 0: - start_index = i - stack.append(char) - - elif char in "}]": - if len(stack) > 0: - last = stack.pop() - if (char == "}" and last != "{") or (char == "]" and last != "["): - # Mismatched closing brace/bracket, invalid JSON - break - - if len(stack) == 0 and start_index != -1: - # Found a valid outermost JSON - return ( - start_index, - i + 1, - ) # Add 1 to include the closing brace/bracket in the range - - return -1, -1 # No valid JSON found - - -json_loader = JsonLoader() diff --git a/tests/benchmarks/benchmark_eval.py b/tests/benchmarks/benchmark_eval.py index 1edb8c4f4..106fae92e 100644 --- a/tests/benchmarks/benchmark_eval.py +++ b/tests/benchmarks/benchmark_eval.py @@ -4,9 +4,13 @@ from ragas import evaluate from ragas.metrics import ( + answer_correctness, answer_relevancy, + answer_similarity, context_precision, context_recall, + context_relevancy, + context_utilization, faithfulness, ) from ragas.metrics.critique import harmfulness @@ -16,16 +20,37 @@ assert isinstance(ds, DatasetDict) fiqa = ds["baseline"] +# metrics +metrics = [ + faithfulness, + context_recall, + answer_relevancy, + answer_correctness, + harmfulness, + context_relevancy, + context_precision, + context_utilization, + answer_similarity, +] + if __name__ == "__main__": + # asyncio + start = time.time() + print("ignored") + # _ = evaluate( + # fiqa, + # metrics=[ + # faithfulness, + # ], + # is_async=True, + # ) + print(f"Time taken [Asyncio]: {time.time() - start:.2f}s") + + # Threads start = time.time() _ = evaluate( fiqa, - metrics=[ - answer_relevancy, - context_precision, - faithfulness, - harmfulness, - context_recall, - ], + metrics=metrics, + is_async=False, ) - print(f"Time taken: {time.time() - start:.2f}s") + print(f"Time taken [Threads]: {time.time() - start:.2f}s") diff --git a/tests/unit/llms/test_llm.py b/tests/unit/llms/test_llm.py new file mode 100644 index 000000000..09b6b0f03 --- /dev/null +++ b/tests/unit/llms/test_llm.py @@ -0,0 +1,26 @@ +from __future__ import 
annotations + +import typing as t + +from langchain.schema import Generation, LLMResult + +from ragas.llms.base import BaseRagasLLM + +if t.TYPE_CHECKING: + from ragas.llms.prompt import PromptValue + + +class TestLLM(BaseRagasLLM): + def llm(self): + return self + + def generate_text( + self, prompt: PromptValue, n=1, temperature=1e-8, stop=None, callbacks=[] + ): + generations = [[Generation(text=prompt.prompt_str)] * n] + return LLMResult(generations=generations) + + async def agenerate_text( + self, prompt: PromptValue, n=1, temperature=1e-8, stop=None, callbacks=[] + ): + return self.generate_text(prompt, n, temperature, stop, callbacks) diff --git a/tests/unit/test_prompt.py b/tests/unit/llms/test_prompt.py similarity index 100% rename from tests/unit/test_prompt.py rename to tests/unit/llms/test_prompt.py diff --git a/src/ragas/langsmith/__init__.py b/tests/unit/test_executor.py similarity index 100% rename from src/ragas/langsmith/__init__.py rename to tests/unit/test_executor.py diff --git a/tests/unit/test_llm.py b/tests/unit/test_llm.py deleted file mode 100644 index 903fe23f0..000000000 --- a/tests/unit/test_llm.py +++ /dev/null @@ -1,146 +0,0 @@ -from __future__ import annotations - -import os - -import pytest -from langchain.prompts.chat import ChatPromptTemplate -from langchain.schema import Generation, LLMResult - -from ragas.embeddings import AzureOpenAIEmbeddings, OpenAIEmbeddings -from ragas.llms.base import RagasLLM -from ragas.llms.openai import ( - AzureOpenAI, - AzureOpenAIKeyNotFound, - OpenAI, - OpenAIKeyNotFound, -) -from ragas.utils import NO_KEY - - -class TestLLM(RagasLLM): - def llm(self): - return self - - def generate( - self, prompts: list[ChatPromptTemplate], n=1, temperature=0, callbacks=None - ): - prompt_strs = [p.format() for p in prompts] - generations = [[Generation(text=prompt_str)] * n for prompt_str in prompt_strs] - return LLMResult(generations=generations) - - async def agenerate( - self, prompt: ChatPromptTemplate, n=1, temperature=0, callbacks=None - ): - return self.generate([prompt], n, temperature, callbacks) - - def validate_api_key(self): - if os.getenv("FAKELLM_API_KEY", NO_KEY) == NO_KEY: - raise ValueError("FAKELLM_API_KEY not found in environment variables.") - - -def test_validate_api_key(): - llm = TestLLM() - with pytest.raises(ValueError): - llm.validate_api_key() - os.environ["FAKELLM_API_KEY"] = "random-key-102848595" - # just check if no error is raised - assert llm.validate_api_key() is None - - -def openai_llm_factory(with_api_key): - if with_api_key: - api_key = "random-key-102848595" - return OpenAI(api_key=api_key), api_key - else: - return OpenAI() - - -def openai_embedding_factory(with_api_key): - if with_api_key: - api_key = "random-key-102848595" - return OpenAIEmbeddings(api_key=api_key), api_key - else: - return OpenAIEmbeddings() - - -def azure_llm_factory(with_api_key): - if with_api_key: - api_key = "random-key-102848595" - return ( - AzureOpenAI( - api_version="2020-09-03", - api_key=api_key, - azure_endpoint="https://api.labs.cognitive.microsofttranslator.com", - deployment="en-fr", - ), - api_key, - ) - else: - return AzureOpenAI( - azure_endpoint="https://api.labs.cognitive.microsofttranslator.com", - deployment="en-fr", - api_version="2020-09-03", - ) - - -def azure_embed_factory(with_api_key): - if with_api_key: - api_key = "random-key-102848595" - return ( - AzureOpenAIEmbeddings( - api_version="2020-09-03", - api_key=api_key, - azure_endpoint="https://api.labs.cognitive.microsofttranslator.com", - 
deployment="en-fr", - ), - api_key, - ) - else: - return AzureOpenAIEmbeddings( - azure_endpoint="https://api.labs.cognitive.microsofttranslator.com", - deployment="en-fr", - api_version="2020-09-03", - ) - - -@pytest.mark.parametrize( - "factory, key_not_found_exception, environ_key", - [ - (openai_llm_factory, OpenAIKeyNotFound, "OPENAI_API_KEY"), - (azure_llm_factory, AzureOpenAIKeyNotFound, "AZURE_OPENAI_API_KEY"), - (openai_embedding_factory, OpenAIKeyNotFound, "OPENAI_API_KEY"), - (azure_embed_factory, AzureOpenAIKeyNotFound, "AZURE_OPENAI_API_KEY"), - ], -) -def test_validate_api_key_for_different_llms( - factory, key_not_found_exception, environ_key -): - # load key from environment variables - if environ_key in os.environ: - os.environ.pop(environ_key) - obj = factory(with_api_key=False) - with pytest.raises(key_not_found_exception): - obj.validate_api_key() - os.environ[environ_key] = "random-key-102848595" - obj = factory(with_api_key=False) - assert obj.validate_api_key() is None - - # load key which is passed as argument - if environ_key in os.environ: - os.environ.pop(environ_key) - obj, _ = factory(with_api_key=True) - assert obj.validate_api_key() is None - - # assert order of precedence - os.environ[environ_key] = "random-key-102848595" - obj, api_key = factory(with_api_key=True) - assert obj.validate_api_key - assert obj.api_key == api_key - - # assert loading key from environment variables after instantiation - if environ_key in os.environ: - os.environ.pop(environ_key) - obj = factory(with_api_key=False) - os.environ[environ_key] = "random-key-102848595" - assert obj.validate_api_key() is None - assert obj.api_key == "random-key-102848595" diff --git a/tests/unit/test_metric.py b/tests/unit/test_metric.py index 056524952..e69de29bb 100644 --- a/tests/unit/test_metric.py +++ b/tests/unit/test_metric.py @@ -1,11 +0,0 @@ -import pytest - -from ragas.metrics.base import make_batches - - -@pytest.mark.parametrize( - "batch_size, total_size, len_expected", [(5, 10, 2), (5, 11, 3), (5, 9, 2)] -) -def test_make_batches(batch_size, total_size, len_expected): - batches = make_batches(total_size, batch_size) - assert len(batches) == len_expected