From 41a3c67fe757c00908c81cf153ef1c0b207c72ae Mon Sep 17 00:00:00 2001 From: jjmachan Date: Wed, 6 Mar 2024 12:41:32 -0800 Subject: [PATCH 1/7] fixed EvaluatorChain --- src/ragas/integrations/langchain.py | 186 ++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 src/ragas/integrations/langchain.py diff --git a/src/ragas/integrations/langchain.py b/src/ragas/integrations/langchain.py new file mode 100644 index 000000000..081eb8d8b --- /dev/null +++ b/src/ragas/integrations/langchain.py @@ -0,0 +1,186 @@ +from __future__ import annotations + +import typing as t + +from langchain.callbacks.manager import CallbackManagerForChainRun +from langchain.chains.base import Chain +from langchain.schema import RUN_KEY +from langsmith.evaluation import EvaluationResult, RunEvaluator +from langsmith.schemas import Example, Run +from langchain_openai.chat_models import ChatOpenAI +from langchain_openai.embeddings import OpenAIEmbeddings + +from ragas.metrics.base import ( + EvaluationMode, + Metric, + MetricWithLLM, + MetricWithEmbeddings, +) +from ragas.validation import EVALMODE_TO_COLUMNS +from ragas.llms import LangchainLLMWrapper +from ragas.embeddings import LangchainEmbeddingsWrapper +from ragas.run_config import RunConfig + +__all__ = ["EvaluatorChain"] + + +class EvaluatorChain(Chain, RunEvaluator): + """ + Wrapper around ragas Metrics to use them with langsmith. + """ + + metric: Metric + + def __init__(self, metric: Metric, **kwargs: t.Any): + kwargs["metric"] = metric + super().__init__(**kwargs) + if "run_config" in kwargs: + run_config = kwargs["run_config"] + else: + run_config = RunConfig() + if isinstance(self.metric, MetricWithLLM): + llm = kwargs.get("llm", ChatOpenAI()) + t.cast(MetricWithLLM, self.metric).llm = LangchainLLMWrapper(llm) + if isinstance(self.metric, MetricWithEmbeddings): + embeddings = kwargs.get("embeddings", OpenAIEmbeddings()) + t.cast( + MetricWithEmbeddings, self.metric + ).embeddings = LangchainEmbeddingsWrapper(embeddings) + self.metric.init(run_config) + + @property + def input_keys(self) -> list[str]: + keys = ["question", "answer"] + if self.metric.evaluation_mode in [EvaluationMode.qac, EvaluationMode.qc]: + keys += ["contexts"] + if self.metric.evaluation_mode in [EvaluationMode.gc]: + keys += ["ground_truth"] + return keys + + @property + def output_keys(self) -> list[str]: + return [self.metric.name] + + def _call( + self, + inputs: dict[str, t.Any], + run_manager: t.Optional[CallbackManagerForChainRun] = None, + ) -> dict[str, t.Any]: + """ + Call the evaluation chain. + """ + self._validate(inputs) + _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager() + callbacks = _run_manager.get_child() + + c = inputs.get("contexts", [""]) + g = inputs.get("ground_truth", "") + q = inputs.get("question", "") + a = inputs.get("answer", "") + score = self.metric.score( + { + "question": q, + "answer": a, + "contexts": c, + "ground_truth": g, + }, + callbacks=callbacks, + ) + return {self.metric.name: score} + + def _validate( + self, + input: dict[str, t.Any], + question_key: str = "question", + prediction_key: str = "answer", + context_key: str = "contexts", + ) -> None: + # validate each example + required_columns = EVALMODE_TO_COLUMNS[self.metric.evaluation_mode] + if "question" in required_columns and question_key not in input: + raise ValueError( + f'"{question_key}" is required in each example' + f"for the metric[{self.metric.name}] you have chosen." 
+ ) + if "answer" in required_columns and prediction_key not in input: + raise ValueError( + f'"{prediction_key}" is required in each prediction' + f"for the metric[{self.metric.name}] you have chosen." + ) + if "contexts" in required_columns and context_key not in input: + raise ValueError( + f'"{context_key}" is required in each prediction for the ' + f"metric[{self.metric.name}] you have chosen." + ) + if "ground_truth" in required_columns and "ground_truth" not in input: + raise ValueError( + f'"ground_truth" is required in each prediction for the ' + f"metric[{self.metric.name}] you have chosen." + ) + + @staticmethod + def _keys_are_present(keys_to_check: list, dict_to_check: dict) -> list[str]: + return [k for k in keys_to_check if k not in dict_to_check] + + def _validate_langsmith_eval(self, run: Run, example: t.Optional[Example]) -> None: + if example is None: + raise ValueError( + "expected example to be provided. Please check langsmith dataset and ensure valid dataset is uploaded." + ) + if example.inputs is None: + raise ValueError( + "expected example.inputs to be provided. Please check langsmith dataset and ensure valid dataset is uploaded." + ) + if example.outputs is None: + raise ValueError( + "expected example.inputs to be provided. Please check langsmith dataset and ensure valid dataset is uploaded." + ) + if "question" not in example.inputs or "ground_truth" not in example.outputs: + raise ValueError( + "Expected 'question' and 'ground_truth' in example." + f"Got: {[k for k in example.inputs.keys()]}" + ) + assert ( + run.outputs is not None + ), "the current run has no outputs. The chain should output 'answer' and 'contexts' keys." + missing_keys = self._keys_are_present(["answer", "contexts"], run.outputs) + if missing_keys: + raise ValueError( + "Expected 'answer' and 'contexts' in run.outputs." 
+ f"Got: {[k for k in run.outputs.keys()]}" + ) + + def evaluate_run( + self, run: Run, example: t.Optional[Example] = None + ) -> EvaluationResult: + """ + Evaluate a langsmith run + """ + self._validate_langsmith_eval(run, example) + + # this is just to suppress the type checker error + # actual check and error message is in the _validate_langsmith_eval + assert run.outputs is not None + assert example is not None + assert example.inputs is not None + assert example.outputs is not None + + chain_eval = run.outputs + chain_eval["question"] = example.inputs["question"] + if self.metric.evaluation_mode in [ + EvaluationMode.gc, + EvaluationMode.ga, + EvaluationMode.qcg, + EvaluationMode.qga, + ]: + if example.outputs is None or "ground_truth" not in example.outputs: + raise ValueError("expected `ground_truth` in example outputs.") + chain_eval["ground_truth"] = example.outputs["ground_truth"] + eval_output = self(chain_eval, include_run_info=True) + + evaluation_result = EvaluationResult( + key=self.metric.name, score=eval_output[self.metric.name] + ) + if RUN_KEY in eval_output: + evaluation_result.evaluator_info[RUN_KEY] = eval_output[RUN_KEY] + return evaluation_result From 593c2125ccfb8e8bae7a4396ce626eb7cba9b70e Mon Sep 17 00:00:00 2001 From: jjmachan Date: Wed, 6 Mar 2024 20:17:24 -0800 Subject: [PATCH 2/7] async invoke --- src/ragas/integrations/langchain.py | 65 ++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 5 deletions(-) diff --git a/src/ragas/integrations/langchain.py b/src/ragas/integrations/langchain.py index 081eb8d8b..b449d9af0 100644 --- a/src/ragas/integrations/langchain.py +++ b/src/ragas/integrations/langchain.py @@ -2,7 +2,6 @@ import typing as t -from langchain.callbacks.manager import CallbackManagerForChainRun from langchain.chains.base import Chain from langchain.schema import RUN_KEY from langsmith.evaluation import EvaluationResult, RunEvaluator @@ -21,7 +20,11 @@ from ragas.embeddings import LangchainEmbeddingsWrapper from ragas.run_config import RunConfig -__all__ = ["EvaluatorChain"] +if t.TYPE_CHECKING: + from langchain.callbacks.manager import ( + CallbackManagerForChainRun, + AsyncCallbackManagerForChainRun, + ) class EvaluatorChain(Chain, RunEvaluator): @@ -50,10 +53,34 @@ def __init__(self, metric: Metric, **kwargs: t.Any): @property def input_keys(self) -> list[str]: - keys = ["question", "answer"] - if self.metric.evaluation_mode in [EvaluationMode.qac, EvaluationMode.qc]: + keys = [] + if self.metric.evaluation_mode in [ + EvaluationMode.qac, + EvaluationMode.qa, + EvaluationMode.qc, + EvaluationMode.qga, + EvaluationMode.qcg, + ]: + keys += ["question"] + if self.metric.evaluation_mode in [ + EvaluationMode.qac, + EvaluationMode.qa, + EvaluationMode.ga, + EvaluationMode.qga, + ]: + keys += ["answer"] + if self.metric.evaluation_mode in [ + EvaluationMode.qac, + EvaluationMode.gc, + EvaluationMode.gc, + EvaluationMode.qcg, + ]: keys += ["contexts"] - if self.metric.evaluation_mode in [EvaluationMode.gc]: + if self.metric.evaluation_mode in [ + EvaluationMode.gc, + EvaluationMode.qga, + EvaluationMode.qcg, + ]: keys += ["ground_truth"] return keys @@ -88,6 +115,34 @@ def _call( ) return {self.metric.name: score} + async def _acall( + self, + inputs: t.Dict[str, t.Any], + run_manager: t.Optional[AsyncCallbackManagerForChainRun] = None, + ) -> t.Dict[str, t.Any]: + """ + Call the evaluation chain. 
+ """ + self._validate(inputs) + _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager() + # TODO: currently AsyncCallbacks are not supported in ragas + callbacks = _run_manager.get_child() + + c = inputs.get("contexts", [""]) + g = inputs.get("ground_truth", "") + q = inputs.get("question", "") + a = inputs.get("answer", "") + score = await self.metric.ascore( + { + "question": q, + "answer": a, + "contexts": c, + "ground_truth": g, + }, + callbacks=[], + ) + return {self.metric.name: score} + def _validate( self, input: dict[str, t.Any], From 844cf05a00eea0c9842afa59889e2d1b428535d1 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Mon, 11 Mar 2024 20:16:14 -0700 Subject: [PATCH 3/7] langmsmith fixes --- src/ragas/integrations/__init__.py | 0 src/ragas/integrations/langchain.py | 37 ++++--------------- src/ragas/integrations/langsmith.py | 55 +++++++++++++++++++++++++++++ src/ragas/metrics/base.py | 21 +++++++++++ 4 files changed, 82 insertions(+), 31 deletions(-) create mode 100644 src/ragas/integrations/__init__.py create mode 100644 src/ragas/integrations/langsmith.py diff --git a/src/ragas/integrations/__init__.py b/src/ragas/integrations/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/ragas/integrations/langchain.py b/src/ragas/integrations/langchain.py index b449d9af0..ae8add433 100644 --- a/src/ragas/integrations/langchain.py +++ b/src/ragas/integrations/langchain.py @@ -14,6 +14,7 @@ Metric, MetricWithLLM, MetricWithEmbeddings, + get_required_columns, ) from ragas.validation import EVALMODE_TO_COLUMNS from ragas.llms import LangchainLLMWrapper @@ -53,36 +54,7 @@ def __init__(self, metric: Metric, **kwargs: t.Any): @property def input_keys(self) -> list[str]: - keys = [] - if self.metric.evaluation_mode in [ - EvaluationMode.qac, - EvaluationMode.qa, - EvaluationMode.qc, - EvaluationMode.qga, - EvaluationMode.qcg, - ]: - keys += ["question"] - if self.metric.evaluation_mode in [ - EvaluationMode.qac, - EvaluationMode.qa, - EvaluationMode.ga, - EvaluationMode.qga, - ]: - keys += ["answer"] - if self.metric.evaluation_mode in [ - EvaluationMode.qac, - EvaluationMode.gc, - EvaluationMode.gc, - EvaluationMode.qcg, - ]: - keys += ["contexts"] - if self.metric.evaluation_mode in [ - EvaluationMode.gc, - EvaluationMode.qga, - EvaluationMode.qcg, - ]: - keys += ["ground_truth"] - return keys + return get_required_columns(self.metric.evaluation_mode) @property def output_keys(self) -> list[str]: @@ -198,7 +170,10 @@ def _validate_langsmith_eval(self, run: Run, example: t.Optional[Example]) -> No assert ( run.outputs is not None ), "the current run has no outputs. The chain should output 'answer' and 'contexts' keys." - missing_keys = self._keys_are_present(["answer", "contexts"], run.outputs) + output_keys = get_required_columns( + self.metric.evaluation_mode, ["question", "ground_truth"] + ) + missing_keys = self._keys_are_present(output_keys, run.outputs) if missing_keys: raise ValueError( "Expected 'answer' and 'contexts' in run.outputs." 
diff --git a/src/ragas/integrations/langsmith.py b/src/ragas/integrations/langsmith.py new file mode 100644 index 000000000..edea4ebb0 --- /dev/null +++ b/src/ragas/integrations/langsmith.py @@ -0,0 +1,55 @@ +import typing as t + +from langchain.smith import RunEvalConfig +from langsmith import Client +from langsmith.utils import LangSmithNotFoundError + + +from ragas.integrations.langchain import EvaluatorChain + + +def evaluate( + dataset_name: str, + llm_or_chain_factory: t.Any, + run_name: str = "", + metrics: t.Optional[list] = None, + verbose: bool = False, +) -> t.Dict[str, t.Any]: + # get sensible run name + if not run_name: + run_name = llm_or_chain_factory.get_name() + # init client and validate dataset + client = Client() + try: + _ = client.read_dataset(dataset_name=dataset_name) + except LangSmithNotFoundError: + raise ValueError( + f"Dataset {dataset_name} not found in langsmith, make sure it exists in langsmith" + ) + + # make config + if metrics is None: + from ragas.metrics import ( + answer_relevancy, + context_precision, + context_recall, + faithfulness, + ) + + metrics = [answer_relevancy, context_precision, faithfulness, context_recall] + + metrics = [EvaluatorChain(m) for m in metrics] + eval_config = RunEvalConfig( + custom_evaluators=metrics, + ) + + # run evaluation with langsmith + run = client.run_on_dataset( + dataset_name=dataset_name, + llm_or_chain_factory=llm_or_chain_factory, + evaluation=eval_config, + verbose=verbose, + # Any experiment metadata can be specified here + ) + + return run diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index e0ed3fb7a..f4d05bf9e 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -25,6 +25,27 @@ EvaluationMode = Enum("EvaluationMode", "qac qa qc gc ga qga qcg") +def get_required_columns( + eval_mod: EvaluationMode, ignore_columns: t.List[str] = [] +) -> t.List[str]: + if eval_mod == EvaluationMode.qac: + keys = ["question", "answer", "contexts"] + elif eval_mod == EvaluationMode.qa: + keys = ["question", "answer"] + elif eval_mod == EvaluationMode.qc: + keys = ["question", "contexts"] + elif eval_mod == EvaluationMode.gc: + keys = ["contexts", "ground_truth"] + elif eval_mod == EvaluationMode.ga: + keys = ["answer", "ground_truth"] + elif eval_mod == EvaluationMode.qga: + keys = ["question", "contexts", "answer", "ground_truth"] + elif eval_mod == EvaluationMode.qcg: + keys = ["question", "contexts", "ground_truth"] + + return [k for k in keys if k not in ignore_columns] + + @dataclass class Metric(ABC): @property From 4871ad451908a6da8a91ba2dc56e81d5b5a76627 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Wed, 20 Mar 2024 07:49:19 -0700 Subject: [PATCH 4/7] upload dataset --- src/ragas/integrations/langsmith.py | 138 ++++++++++++++++++++++++++-- 1 file changed, 131 insertions(+), 7 deletions(-) diff --git a/src/ragas/integrations/langsmith.py b/src/ragas/integrations/langsmith.py index edea4ebb0..933ebeb41 100644 --- a/src/ragas/integrations/langsmith.py +++ b/src/ragas/integrations/langsmith.py @@ -1,23 +1,146 @@ +from __future__ import annotations + import typing as t from langchain.smith import RunEvalConfig -from langsmith import Client -from langsmith.utils import LangSmithNotFoundError - from ragas.integrations.langchain import EvaluatorChain +if t.TYPE_CHECKING: + from ragas.testset.generator import TestDataset + from langsmith.schemas import Dataset as LangsmithDataset + +try: + from langsmith import Client + from langsmith.utils import LangSmithNotFoundError +except 
ImportError: + raise ImportError( + "Please install langsmith to use this feature. You can install it via pip install langsmith" + ) + + +def upload_dataset( + dataset: TestDataset, dataset_name: str, dataset_desc: str = "" +) -> LangsmithDataset: + """ + Uploads a new dataset to LangSmith, converting it from a TestDataset object to a + pandas DataFrame before upload. If a dataset with the specified name already + exists, the function raises an error. + + Parameters + ---------- + dataset : TestDataset + The dataset to be uploaded. + dataset_name : str + The name for the new dataset in LangSmith. + dataset_desc : str, optional + A description for the new dataset. The default is an empty string. + + Returns + ------- + LangsmithDataset + The dataset object as stored in LangSmith after upload. + + Raises + ------ + ValueError + If a dataset with the specified name already exists in LangSmith. + + Notes + ----- + The function attempts to read a dataset by the given name to check its existence. + If not found, it proceeds to upload the dataset after converting it to a pandas + DataFrame. This involves specifying input and output keys for the dataset being + uploaded. + """ + client = Client() + try: + # check if dataset exists + dataset = client.read_dataset(dataset_name=dataset_name) + raise ValueError( + f"Dataset {dataset_name} already exists in langsmith. [{dataset}]" + ) + except LangSmithNotFoundError: + # if not create a new one with the generated query examples + dataset = client.upload_dataframe( + df=dataset.to_pandas(), + name=dataset_name, + input_keys=["question"], + output_keys=["ground_truth"], + description=dataset_desc, + ) + + print( + f"Created a new dataset '{dataset.name}'. Dataset is accessible at {dataset.url}" + ) + return dataset + def evaluate( dataset_name: str, llm_or_chain_factory: t.Any, - run_name: str = "", + experiment_name: t.Optional[str] = None, metrics: t.Optional[list] = None, verbose: bool = False, ) -> t.Dict[str, t.Any]: - # get sensible run name - if not run_name: - run_name = llm_or_chain_factory.get_name() + """ + Evaluates a language model or a chain factory on a specified dataset using + LangSmith, with the option to customize metrics and verbosity. + + Parameters + ---------- + dataset_name : str + The name of the dataset to use for evaluation. This dataset must exist in + LangSmith. + llm_or_chain_factory : Any + The language model or chain factory to be evaluated. This parameter is + flexible and can accept a variety of objects depending on the implementation. + experiment_name : Optional[str], optional + The name of the experiment. This can be used to categorize or identify the + evaluation run within LangSmith. The default is None. + metrics : Optional[list], optional + A list of custom metrics (functions or evaluators) to be used for the + evaluation. If None, a default set of metrics (answer relevancy, context + precision, context recall, and faithfulness) are used. + The default is None. + verbose : bool, optional + If True, detailed progress and results will be printed during the evaluation + process. + The default is False. + + Returns + ------- + Dict[str, Any] + A dictionary containing the results of the evaluation. + + Raises + ------ + ValueError + If the specified dataset does not exist in LangSmith. + + See Also + -------- + Client.read_dataset : Method to read an existing dataset. + Client.run_on_dataset : Method to run the evaluation on the specified dataset. + + Examples + -------- + >>> results = evaluate( + ... 
dataset_name="MyDataset", + ... llm_or_chain_factory=my_llm, + ... experiment_name="experiment_1_with_vanila_rag", + ... verbose=True + ... ) + >>> print(results) + {'evaluation_result': ...} + + Notes + ----- + The function initializes a client to interact with LangSmith, validates the existence + of the specified dataset, prepares evaluation metrics, and runs the evaluation, + returning the results. Custom evaluation metrics can be specified, or a default set + will be used if none are provided. + """ # init client and validate dataset client = Client() try: @@ -50,6 +173,7 @@ def evaluate( evaluation=eval_config, verbose=verbose, # Any experiment metadata can be specified here + project_name=experiment_name, ) return run From 5b42ed51a63f9c91f82fee5c32e2ec9aaf4f3791 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 28 Mar 2024 08:43:24 -0700 Subject: [PATCH 5/7] fix linting and fmt --- docs/alfred.py | 11 ++++++----- src/ragas/embeddings/base.py | 4 +--- src/ragas/evaluation.py | 2 +- src/ragas/executor.py | 13 ++++++++----- src/ragas/integrations/langchain.py | 16 ++++++++-------- src/ragas/integrations/langsmith.py | 3 ++- src/ragas/llms/base.py | 7 ++----- src/ragas/testset/docstore.py | 2 +- src/ragas/testset/evolutions.py | 12 ++---------- src/ragas/testset/generator.py | 6 +++--- 10 files changed, 34 insertions(+), 42 deletions(-) diff --git a/docs/alfred.py b/docs/alfred.py index da966d2ab..c7f50e38d 100644 --- a/docs/alfred.py +++ b/docs/alfred.py @@ -1,14 +1,15 @@ from __future__ import annotations -import os -from collections import namedtuple import argparse import asyncio -from tqdm.asyncio import tqdm +import os import typing as t -from langchain_openai.chat_models import ChatOpenAI -from langchain_core.language_models.chat_models import BaseChatModel +from collections import namedtuple + from langchain.prompts import ChatPromptTemplate +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_openai.chat_models import ChatOpenAI +from tqdm.asyncio import tqdm File = namedtuple("File", "name content") diff --git a/src/ragas/embeddings/base.py b/src/ragas/embeddings/base.py index cd6e83d9c..38d5efd72 100644 --- a/src/ragas/embeddings/base.py +++ b/src/ragas/embeddings/base.py @@ -44,9 +44,7 @@ def set_run_config(self, run_config: RunConfig): class LangchainEmbeddingsWrapper(BaseRagasEmbeddings): def __init__( - self, - embeddings: Embeddings, - run_config: t.Optional[RunConfig] = None + self, embeddings: Embeddings, run_config: t.Optional[RunConfig] = None ): self.embeddings = embeddings if run_config is None: diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index 9529c9984..37f2d2486 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -15,9 +15,9 @@ LangchainEmbeddingsWrapper, embedding_factory, ) -from ragas.llms import llm_factory from ragas.exceptions import ExceptionInRunner from ragas.executor import Executor +from ragas.llms import llm_factory from ragas.llms.base import BaseRagasLLM, LangchainLLMWrapper from ragas.metrics._answer_correctness import AnswerCorrectness from ragas.metrics.base import Metric, MetricWithEmbeddings, MetricWithLLM diff --git a/src/ragas/executor.py b/src/ragas/executor.py index 3d034a967..70c6e0a00 100644 --- a/src/ragas/executor.py +++ b/src/ragas/executor.py @@ -1,8 +1,8 @@ from __future__ import annotations -import sys import asyncio import logging +import sys import threading import typing as t from dataclasses import dataclass, field @@ -24,20 +24,23 @@ def 
runner_exception_hook(args: threading.ExceptHookArgs): # set a custom exception hook # threading.excepthook = runner_exception_hook + def as_completed(loop, coros, max_workers): loop_arg_dict = {"loop": loop} if sys.version_info[:2] < (3, 10) else {} if max_workers == -1: return asyncio.as_completed(coros, **loop_arg_dict) - + # loop argument is removed since Python 3.10 semaphore = asyncio.Semaphore(max_workers, **loop_arg_dict) + async def sema_coro(coro): async with semaphore: return await coro - + sema_coros = [sema_coro(c) for c in coros] return asyncio.as_completed(sema_coros, **loop_arg_dict) + class Runner(threading.Thread): def __init__( self, @@ -45,7 +48,7 @@ def __init__( desc: str, keep_progress_bar: bool = True, raise_exceptions: bool = True, - run_config: t.Optional[RunConfig] = None + run_config: t.Optional[RunConfig] = None, ): super().__init__() self.jobs = jobs @@ -59,7 +62,7 @@ def __init__( self.futures = as_completed( loop=self.loop, coros=[coro for coro, _ in self.jobs], - max_workers=self.run_config.max_workers + max_workers=self.run_config.max_workers, ) async def _aresults(self) -> t.List[t.Any]: diff --git a/src/ragas/integrations/langchain.py b/src/ragas/integrations/langchain.py index ae8add433..064c9a7f5 100644 --- a/src/ragas/integrations/langchain.py +++ b/src/ragas/integrations/langchain.py @@ -4,27 +4,27 @@ from langchain.chains.base import Chain from langchain.schema import RUN_KEY -from langsmith.evaluation import EvaluationResult, RunEvaluator -from langsmith.schemas import Example, Run from langchain_openai.chat_models import ChatOpenAI from langchain_openai.embeddings import OpenAIEmbeddings +from langsmith.evaluation import EvaluationResult, RunEvaluator +from langsmith.schemas import Example, Run +from ragas.embeddings import LangchainEmbeddingsWrapper +from ragas.llms import LangchainLLMWrapper from ragas.metrics.base import ( EvaluationMode, Metric, - MetricWithLLM, MetricWithEmbeddings, + MetricWithLLM, get_required_columns, ) -from ragas.validation import EVALMODE_TO_COLUMNS -from ragas.llms import LangchainLLMWrapper -from ragas.embeddings import LangchainEmbeddingsWrapper from ragas.run_config import RunConfig +from ragas.validation import EVALMODE_TO_COLUMNS if t.TYPE_CHECKING: from langchain.callbacks.manager import ( - CallbackManagerForChainRun, AsyncCallbackManagerForChainRun, + CallbackManagerForChainRun, ) @@ -98,7 +98,7 @@ async def _acall( self._validate(inputs) _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager() # TODO: currently AsyncCallbacks are not supported in ragas - callbacks = _run_manager.get_child() + _run_manager.get_child() c = inputs.get("contexts", [""]) g = inputs.get("ground_truth", "") diff --git a/src/ragas/integrations/langsmith.py b/src/ragas/integrations/langsmith.py index 933ebeb41..32762878e 100644 --- a/src/ragas/integrations/langsmith.py +++ b/src/ragas/integrations/langsmith.py @@ -7,9 +7,10 @@ from ragas.integrations.langchain import EvaluatorChain if t.TYPE_CHECKING: - from ragas.testset.generator import TestDataset from langsmith.schemas import Dataset as LangsmithDataset + from ragas.testset.generator import TestDataset + try: from langsmith import Client from langsmith.utils import LangSmithNotFoundError diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py index f2e0d7820..5979d336b 100644 --- a/src/ragas/llms/base.py +++ b/src/ragas/llms/base.py @@ -119,9 +119,7 @@ class LangchainLLMWrapper(BaseRagasLLM): """ def __init__( - self, - langchain_llm: BaseLanguageModel, 
- run_config: t.Optional[RunConfig] = None + self, langchain_llm: BaseLanguageModel, run_config: t.Optional[RunConfig] = None ): self.langchain_llm = langchain_llm if run_config is None: @@ -206,8 +204,7 @@ def set_run_config(self, run_config: RunConfig): def llm_factory( - model: str = "gpt-3.5-turbo-16k", - run_config: t.Optional[RunConfig] = None + model: str = "gpt-3.5-turbo-16k", run_config: t.Optional[RunConfig] = None ) -> BaseRagasLLM: timeout = None if run_config is not None: diff --git a/src/ragas/testset/docstore.py b/src/ragas/testset/docstore.py index 7ab3b3916..456ec6c40 100644 --- a/src/ragas/testset/docstore.py +++ b/src/ragas/testset/docstore.py @@ -78,7 +78,7 @@ class Direction(str, Enum): PREV = "prev" UP = "up" DOWN = "down" - + class Node(Document): keyphrases: t.List[str] = Field(default_factory=list, repr=False) diff --git a/src/ragas/testset/evolutions.py b/src/ragas/testset/evolutions.py index 8167b3bb2..61e22b465 100644 --- a/src/ragas/testset/evolutions.py +++ b/src/ragas/testset/evolutions.py @@ -88,11 +88,7 @@ def merge_nodes(nodes: CurrentNodes) -> Node: new_node.embedding = np.average(node_embeddings, axis=0) return new_node - def init( - self, - is_async: bool = True, - run_config: t.Optional[RunConfig] = None - ): + def init(self, is_async: bool = True, run_config: t.Optional[RunConfig] = None): self.is_async = is_async if run_config is None: run_config = RunConfig() @@ -339,11 +335,7 @@ class ComplexEvolution(Evolution): default_factory=lambda: compress_question_prompt ) - def init( - self, - is_async: bool = True, - run_config: t.Optional[RunConfig] = None - ): + def init(self, is_async: bool = True, run_config: t.Optional[RunConfig] = None): if run_config is None: run_config = RunConfig() super().init(is_async=is_async, run_config=run_config) diff --git a/src/ragas/testset/generator.py b/src/ragas/testset/generator.py index 4febcc5e1..ebf7648a1 100644 --- a/src/ragas/testset/generator.py +++ b/src/ragas/testset/generator.py @@ -120,7 +120,7 @@ def generate_with_llamaindex_docs( with_debugging_logs=False, is_async: bool = True, raise_exceptions: bool = True, - run_config: t.Optional[RunConfig] = None + run_config: t.Optional[RunConfig] = None, ): # chunk documents and add to docstore self.docstore.add_documents( @@ -146,7 +146,7 @@ def generate_with_langchain_docs( with_debugging_logs=False, is_async: bool = True, raise_exceptions: bool = True, - run_config: t.Optional[RunConfig] = None + run_config: t.Optional[RunConfig] = None, ): # chunk documents and add to docstore self.docstore.add_documents( @@ -184,7 +184,7 @@ def generate( with_debugging_logs=False, is_async: bool = True, raise_exceptions: bool = True, - run_config: t.Optional[RunConfig] = None + run_config: t.Optional[RunConfig] = None, ): # validate distributions if not check_if_sum_is_close(list(distributions.values()), 1.0, 3): From 3eb6ae66130a948d7da5dbd9df15cff850d805eb Mon Sep 17 00:00:00 2001 From: Jithin James Date: Thu, 28 Mar 2024 08:50:01 -0700 Subject: [PATCH 6/7] Update src/ragas/metrics/base.py Co-authored-by: Massimiliano Pronesti --- src/ragas/metrics/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index f4d05bf9e..dd02204bf 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -26,7 +26,7 @@ def get_required_columns( - eval_mod: EvaluationMode, ignore_columns: t.List[str] = [] + eval_mod: EvaluationMode, ignore_columns: t.Oprional[t.List[str]] = None ) -> t.List[str]: if eval_mod 
== EvaluationMode.qac: keys = ["question", "answer", "contexts"] From aae083c419badee6322bf9eaa88f0d731e48413a Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 28 Mar 2024 08:56:53 -0700 Subject: [PATCH 7/7] complete change --- src/ragas/metrics/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index dd02204bf..d7cd39aac 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -4,6 +4,7 @@ C - contexts: context used for generation G - ground_truth: ground truth answer """ + from __future__ import annotations import asyncio @@ -26,7 +27,7 @@ def get_required_columns( - eval_mod: EvaluationMode, ignore_columns: t.Oprional[t.List[str]] = None + eval_mod: EvaluationMode, ignore_columns: t.Optional[t.List[str]] = None ) -> t.List[str]: if eval_mod == EvaluationMode.qac: keys = ["question", "answer", "contexts"] @@ -42,6 +43,7 @@ def get_required_columns( keys = ["question", "contexts", "answer", "ground_truth"] elif eval_mod == EvaluationMode.qcg: keys = ["question", "contexts", "ground_truth"] + ignore_columns = ignore_columns or [] return [k for k in keys if k not in ignore_columns]
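Usage sketch for the EvaluatorChain introduced in PATCH 1/7 and 2/7 (illustrative only, not part of the diffs above): it wraps a ragas Metric so it can be invoked like a LangChain Chain or used as a LangSmith RunEvaluator. The sample row below is made up, and an OpenAI key is assumed because the chain falls back to ChatOpenAI / OpenAIEmbeddings when no llm or embeddings kwargs are given.

# Sketch only: wrap a ragas metric as a LangChain chain.
# Assumes OPENAI_API_KEY is set, since EvaluatorChain defaults to
# ChatOpenAI / OpenAIEmbeddings when no llm/embeddings kwargs are passed.
from ragas.integrations.langchain import EvaluatorChain
from ragas.metrics import faithfulness

faithfulness_chain = EvaluatorChain(metric=faithfulness)

# Required input keys follow the metric's evaluation mode
# (question/answer/contexts[/ground_truth]); the values here are invented.
row = {
    "question": "Where is the Eiffel Tower?",
    "answer": "The Eiffel Tower is in Paris.",
    "contexts": ["The Eiffel Tower is located in Paris, France."],
    "ground_truth": "Paris",
}

result = faithfulness_chain.invoke(row)
print(result)  # the output dict carries a "faithfulness" score keyed by the metric name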
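Usage sketch for the LangSmith helpers introduced in PATCH 3/7 and 4/7 (illustrative only): upload_dataset pushes a generated TestDataset to LangSmith with "question" as the input key and "ground_truth" as the output key, and evaluate runs a chain factory over that dataset with ragas metrics wrapped as EvaluatorChain evaluators. The testset variable, the dataset name, and the build_rag_chain factory are placeholders, and LangSmith plus OpenAI credentials are assumed to be configured in the environment.

# Sketch only: end-to-end flow with the helpers added in
# src/ragas/integrations/langsmith.py.
# `testset` (a ragas TestDataset) and `build_rag_chain` are hypothetical
# placeholders; LangSmith and OpenAI credentials are assumed to be set.
from ragas.integrations.langsmith import evaluate, upload_dataset
from ragas.metrics import answer_relevancy, faithfulness

# One-time: push the generated test set to LangSmith as a dataset.
upload_dataset(
    testset,
    dataset_name="basic_rag_eval",
    dataset_desc="ragas generated test set",
)

# Run the chain over the dataset and score it with ragas metrics; when
# metrics is omitted, the defaults are answer_relevancy, context_precision,
# faithfulness and context_recall.
results = evaluate(
    dataset_name="basic_rag_eval",
    llm_or_chain_factory=build_rag_chain,  # factory for a chain that outputs "answer" and "contexts"
    experiment_name="experiment_1_with_vanilla_rag",
    metrics=[faithfulness, answer_relevancy],
    verbose=True,
)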