From 4b068182ce4aad286da0414f12ae5b8f9ecaa332 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 14 Dec 2023 15:53:40 +0530 Subject: [PATCH 01/34] changed name RagasLLM -> BaseRagasLLM --- .gitignore | 6 +--- src/ragas/llms/__init__.py | 6 ++-- src/ragas/llms/base.py | 5 +-- src/ragas/llms/langchain.py | 22 ++++++------ src/ragas/llms/llamaindex.py | 4 +-- src/ragas/llms/openai.py | 4 +-- src/ragas/metrics/base.py | 4 +-- src/ragas/metrics/critique.py | 4 +-- src/ragas/testset/evolutions.py | 60 +++++++++++++++++++++++++++++++++ tests/unit/test_llm.py | 4 +-- 10 files changed, 88 insertions(+), 31 deletions(-) create mode 100644 src/ragas/testset/evolutions.py diff --git a/.gitignore b/.gitignore index 11efa9cab..90a821e14 100644 --- a/.gitignore +++ b/.gitignore @@ -161,11 +161,7 @@ cython_debug/ # Ragas specific ragas/_version.py -experiments/**/data -experiments/**/storage +experiments/ **/fil-result/ -experiments/baselines/fiqa/datasets src/ragas/_version.py .python-version -experiments/retriever-benchmarks/datasets -experiments/tmp diff --git a/src/ragas/llms/__init__.py b/src/ragas/llms/__init__.py index 6f48ae530..62ea40111 100644 --- a/src/ragas/llms/__init__.py +++ b/src/ragas/llms/__init__.py @@ -1,10 +1,10 @@ -from ragas.llms.base import RagasLLM +from ragas.llms.base import BaseRagasLLM from ragas.llms.langchain import LangchainLLM from ragas.llms.llamaindex import LlamaIndexLLM from ragas.llms.openai import OpenAI -__all__ = ["RagasLLM", "LangchainLLM", "LlamaIndexLLM", "llm_factory", "OpenAI"] +__all__ = ["BaseRagasLLM", "LangchainLLM", "LlamaIndexLLM", "llm_factory", "OpenAI"] -def llm_factory(model="gpt-3.5-turbo-16k") -> RagasLLM: +def llm_factory(model="gpt-3.5-turbo-16k") -> BaseRagasLLM: return OpenAI(model=model) diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py index eec5569de..c4c4f894d 100644 --- a/src/ragas/llms/base.py +++ b/src/ragas/llms/base.py @@ -6,11 +6,12 @@ from langchain.schema import LLMResult if t.TYPE_CHECKING: - from langchain.callbacks.base import Callbacks from langchain.prompts import ChatPromptTemplate + from langchain_core.callbacks import Callbacks + from langchain_core.prompt_values import PromptValue -class RagasLLM(ABC): +class BaseRagasLLM(ABC): """ BaseLLM is the base class for all LLMs. It provides a consistent interface for other classes that interact with LLMs like Langchains, LlamaIndex, LiteLLM etc. 
Handles diff --git a/src/ragas/llms/langchain.py b/src/ragas/llms/langchain.py index f8ef9efee..f01817185 100644 --- a/src/ragas/llms/langchain.py +++ b/src/ragas/llms/langchain.py @@ -10,7 +10,7 @@ from ragas.async_utils import run_async_tasks from ragas.exceptions import AzureOpenAIKeyNotFound, OpenAIKeyNotFound -from ragas.llms.base import RagasLLM +from ragas.llms.base import BaseRagasLLM from ragas.utils import NO_KEY if t.TYPE_CHECKING: @@ -18,15 +18,15 @@ from langchain.prompts import ChatPromptTemplate -def isOpenAI(llm: BaseLLM | BaseChatModel) -> bool: +def isOpenAI(llm: BaseRagasLLM | BaseChatModel) -> bool: return isinstance(llm, OpenAI) or isinstance(llm, ChatOpenAI) -def isBedrock(llm: BaseLLM | BaseChatModel) -> bool: +def isBedrock(llm: BaseRagasLLM | BaseChatModel) -> bool: return isinstance(llm, Bedrock) or isinstance(llm, BedrockChat) -def isAmazonAPIGateway(llm: BaseLLM | BaseChatModel) -> bool: +def isAmazonAPIGateway(llm: BaseRagasLLM | BaseChatModel) -> bool: return isinstance(llm, AmazonAPIGateway) @@ -70,14 +70,14 @@ def _compute_token_usage_langchain(list_llmresults: t.List[LLMResult]) -> t.Dict return llm_output -class LangchainLLM(RagasLLM): +class LangchainLLM(BaseRagasLLM): n_completions_supported: bool = True - def __init__(self, llm: BaseLLM | BaseChatModel): + def __init__(self, llm: BaseRagasLLM | BaseChatModel): self.langchain_llm = llm @property - def llm(self) -> BaseLLM | BaseChatModel: + def llm(self) -> BaseRagasLLM | BaseChatModel: return self.langchain_llm def validate_api_key(self): @@ -112,7 +112,7 @@ def _generate_multiple_completions( old_n = self.langchain_llm.n self.langchain_llm.n = n - if isinstance(self.llm, BaseLLM): + if isinstance(self.llm, BaseRagasLLM): ps = [p.format() for p in prompts] result = self.llm.generate(ps, callbacks=callbacks) else: # if BaseChatModel @@ -127,7 +127,7 @@ async def generate_completions( prompts: list[ChatPromptTemplate], callbacks: t.Optional[Callbacks] = None, ) -> LLMResult: - if isinstance(self.llm, BaseLLM): + if isinstance(self.llm, BaseRagasLLM): ps = [p.format() for p in prompts] result = await self.llm.agenerate(ps, callbacks=callbacks) else: # if BaseChatModel @@ -155,7 +155,7 @@ async def agenerate( ) old_n = self.langchain_llm.n self.langchain_llm.n = n - if isinstance(self.llm, BaseLLM): + if isinstance(self.llm, BaseRagasLLM): result = await self.llm.agenerate( [prompt.format()], callbacks=callbacks ) @@ -165,7 +165,7 @@ async def agenerate( ) self.langchain_llm.n = old_n else: - if isinstance(self.llm, BaseLLM): + if isinstance(self.llm, BaseRagasLLM): list_llmresults: list[LLMResult] = run_async_tasks( [ self.llm.agenerate([prompt.format()], callbacks=callbacks) diff --git a/src/ragas/llms/llamaindex.py b/src/ragas/llms/llamaindex.py index d93afacfd..24e77fd8c 100644 --- a/src/ragas/llms/llamaindex.py +++ b/src/ragas/llms/llamaindex.py @@ -5,7 +5,7 @@ from langchain.schema.output import Generation, LLMResult from ragas.async_utils import run_async_tasks -from ragas.llms.base import RagasLLM +from ragas.llms.base import BaseRagasLLM if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks @@ -20,7 +20,7 @@ ) -class LlamaIndexLLM(RagasLLM): +class LlamaIndexLLM(BaseRagasLLM): def __init__(self, llm: LiLLM) -> None: self.llama_index_llm = llm diff --git a/src/ragas/llms/openai.py b/src/ragas/llms/openai.py index a4b1892e4..b75374d2a 100644 --- a/src/ragas/llms/openai.py +++ b/src/ragas/llms/openai.py @@ -27,7 +27,7 @@ from ragas.async_utils import run_async_tasks from ragas.exceptions 
import AzureOpenAIKeyNotFound, OpenAIKeyNotFound -from ragas.llms.base import RagasLLM +from ragas.llms.base import BaseRagasLLM from ragas.llms.langchain import _compute_token_usage_langchain from ragas.utils import NO_KEY, get_debug_mode @@ -94,7 +94,7 @@ def _before_sleep(retry_state: RetryCallState) -> None: retry_decorator = create_base_retry_decorator(errors, max_retries=4) -class OpenAIBase(RagasLLM): +class OpenAIBase(BaseRagasLLM): def __init__(self, model: str, _api_key_env_var: str, timeout: int = 60) -> None: self.model = model self._api_key_env_var = _api_key_env_var diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index 097058733..dd7a05c8c 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -22,7 +22,7 @@ if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks - from ragas.llms import RagasLLM + from ragas.llms import BaseRagasLLM def make_batches(total_size: int, batch_size: int) -> list[range]: @@ -110,7 +110,7 @@ def get_batches(self, dataset_size: int) -> list[range]: @dataclass class MetricWithLLM(Metric): - llm: RagasLLM = field(default_factory=llm_factory) + llm: BaseRagasLLM = field(default_factory=llm_factory) def init_model(self): """ diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index 4075acb34..8e577ed2f 100644 --- a/src/ragas/metrics/critique.py +++ b/src/ragas/metrics/critique.py @@ -14,7 +14,7 @@ if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks - from ragas.llms import RagasLLM + from ragas.llms import BaseRagasLLM CRITIQUE_PROMPT = HumanMessagePromptTemplate.from_template( """Given a input and submission. Evaluate the submission only using the given criteria. @@ -60,7 +60,7 @@ class AspectCritique(MetricWithLLM): definition: str = field(default="", repr=True) strictness: int = field(default=1, repr=False) batch_size: int = field(default=15, repr=False) - llm: RagasLLM = field( + llm: BaseRagasLLM = field( default_factory=llm_factory, repr=False, ) diff --git a/src/ragas/testset/evolutions.py b/src/ragas/testset/evolutions.py new file mode 100644 index 000000000..17cbfb449 --- /dev/null +++ b/src/ragas/testset/evolutions.py @@ -0,0 +1,60 @@ +from langchain.prompts import ChatPromptTemplate + +from ragas.llms import BaseRagasLLM +from ragas.testset.docstore import Document, DocumentStore +from ragas.testset.prompts import ( + FILTER_QUESTION, + MULTICONTEXT_QUESTION, + SCORE_CONTEXT, + SEED_QUESTION, +) +from ragas.testset.testset_generator import load_as_score +from ragas.utils import load_as_json + + +def filter_context(llm: BaseRagasLLM, context: str, threshold: float = 7.5) -> bool: + """ + context: str + The input context + + Checks if the context is has information worthy of framing a question + """ + human_prompt = SCORE_CONTEXT.format(context=context) + prompt = ChatPromptTemplate.from_messages([human_prompt]) + results = llm.generate(prompts=[prompt]) + output = results.generations[0][0].text.strip() + score = load_as_score(output) + return score >= threshold + + +def filter_question(llm: BaseRagasLLM, question: str) -> bool: + human_prompt = FILTER_QUESTION.format(question=question) + prompt = ChatPromptTemplate.from_messages([human_prompt]) + + results = llm.generate(prompts=[prompt]) + results = results.generations[0][0].text.strip() + json_results = load_as_json(results) + return json_results.get("verdict") != "No" + + +def simple_evolution(llm: BaseRagasLLM, seed_doc: Document): + human_prompt = SEED_QUESTION.format(context=seed_doc.page_content) + 
prompt = ChatPromptTemplate.from_messages([human_prompt]) + results = llm.generate(prompts=[prompt]) + question = results.generations[0][0].text.strip() + return question + + +def multi_context_evolution( + llm: BaseRagasLLM, seed_doc: Document, doc_store: DocumentStore +): + question = simple_evolution(llm, seed_doc) + print(question) + similar_context = doc_store.get_similar(seed_doc)[0] + human_prompt = MULTICONTEXT_QUESTION.format( + question=question, context1=seed_doc.page_content, context2=similar_context + ) + prompt = ChatPromptTemplate.from_messages([human_prompt]) + results = llm.generate(prompts=[prompt]) + question = results.generations[0][0].text.strip() + return question diff --git a/tests/unit/test_llm.py b/tests/unit/test_llm.py index 903fe23f0..b3af7a6d0 100644 --- a/tests/unit/test_llm.py +++ b/tests/unit/test_llm.py @@ -7,7 +7,7 @@ from langchain.schema import Generation, LLMResult from ragas.embeddings import AzureOpenAIEmbeddings, OpenAIEmbeddings -from ragas.llms.base import RagasLLM +from ragas.llms.base import BaseRagasLLM from ragas.llms.openai import ( AzureOpenAI, AzureOpenAIKeyNotFound, @@ -17,7 +17,7 @@ from ragas.utils import NO_KEY -class TestLLM(RagasLLM): +class TestLLM(BaseRagasLLM): def llm(self): return self From f30cf57b80ae1ae12c4e84548aaeb0874af3848d Mon Sep 17 00:00:00 2001 From: jjmachan Date: Sat, 16 Dec 2023 11:25:33 +0530 Subject: [PATCH 02/34] added langchain_core llm --- src/ragas/evaluation.py | 8 +- src/ragas/llms/base.py | 100 +++++++++++----- src/ragas/llms/langchain.py | 224 ---------------------------------- src/ragas/llms/openai.py | 232 ------------------------------------ src/ragas/llms/prompt.py | 18 +++ 5 files changed, 96 insertions(+), 486 deletions(-) delete mode 100644 src/ragas/llms/langchain.py delete mode 100644 src/ragas/llms/openai.py create mode 100644 src/ragas/llms/prompt.py diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index 587b125cd..ea258bfb4 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -15,11 +15,15 @@ validate_evaluation_modes, ) +if t.TYPE_CHECKING: + from langchain_core.callbacks import Callbacks + def evaluate( dataset: Dataset, metrics: list[Metric] | None = None, column_map: dict[str, str] = {}, + callbacks: Callbacks = [], ) -> Result: """ Run the evaluation on the dataset with different metrics @@ -98,7 +102,9 @@ def evaluate( if isinstance(metric, AspectCritique): binary_metrics.append(metric.name) print(f"evaluating with [{metric.name}]") - scores.append(metric.score(dataset).select_columns(metric.name)) + scores.append( + metric.score(dataset, callbacks=callbacks).select_columns(metric.name) + ) # log the evaluation event metrics_names = [m.name for m in metrics] diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py index c4c4f894d..36105e986 100644 --- a/src/ragas/llms/base.py +++ b/src/ragas/llms/base.py @@ -3,54 +3,96 @@ import typing as t from abc import ABC, abstractmethod -from langchain.schema import LLMResult +from langchain.chat_models import AzureChatOpenAI, BedrockChat, ChatOpenAI, ChatVertexAI +from langchain.llms import AmazonAPIGateway, AzureOpenAI, Bedrock, OpenAI, VertexAI +from langchain_core.language_models import BaseLanguageModel +from langchain_core.outputs import LLMResult if t.TYPE_CHECKING: from langchain.prompts import ChatPromptTemplate from langchain_core.callbacks import Callbacks from langchain_core.prompt_values import PromptValue +MULTIPLE_COMPLETION_SUPPORTED = [ + OpenAI, + ChatOpenAI, + AzureOpenAI, + AzureChatOpenAI, + 
ChatVertexAI, + VertexAI, +] -class BaseRagasLLM(ABC): - """ - BaseLLM is the base class for all LLMs. It provides a consistent interface for other - classes that interact with LLMs like Langchains, LlamaIndex, LiteLLM etc. Handles - multiple_completions even if not supported by the LLM. - - It currently takes in ChatPromptTemplates and returns LLMResults which are Langchain - primitives. - """ - # supports multiple compeletions for the given prompt - n_completions_supported: bool = False +def is_multiple_completion_supported(llm: BaseRagasLLM) -> bool: + """Return whether the given LLM supports n-completion.""" + for llm_type in MULTIPLE_COMPLETION_SUPPORTED: + if isinstance(llm, llm_type): + return True + return False - @property - @abstractmethod - def llm(self) -> t.Any: - ... - def validate_api_key(self): - """ - Validates that the api key is set for the LLM - """ - pass +class BaseRagasLLM(BaseLanguageModel): + """ + A simple base class for RagasLLMs that is based on Langchain's BaseLanguageModel + interface. it implements 2 functions: + - generate_text: for generating text from a given PromptValue + - agenerate_text: for generating text from a given PromptValue asynchronously + """ - @abstractmethod - def generate( + def generate_text( self, - prompts: list[ChatPromptTemplate], + prompt: PromptValue, n: int = 1, temperature: float = 1e-8, + stop: t.Optional[t.List[str]] = None, callbacks: t.Optional[Callbacks] = None, ) -> LLMResult: - ... + if is_multiple_completion_supported(self): + return self.generate_prompt( + prompts=[prompt], + n=n, + temperature=temperature, + stop=stop, + callbacks=callbacks, + ) + else: + result = self.generate_prompt( + prompts=[prompt] * n, + temperature=temperature, + stop=stop, + callbacks=callbacks, + ) + # make LLMREsult.generation appear as if it was n_completions + # note that LLMResult.runs is still a list that represents each run + generations = [[g[0] for g in result.generations]] + result.generations = generations + return result - @abstractmethod - async def agenerate( + async def agenerate_text( self, - prompt: ChatPromptTemplate, + prompt: PromptValue, n: int = 1, temperature: float = 1e-8, + stop: t.Optional[t.List[str]] = None, callbacks: t.Optional[Callbacks] = None, ) -> LLMResult: - ... 
+ if is_multiple_completion_supported(self): + return await self.agenerate_prompt( + prompts=[prompt], + n=n, + temperature=temperature, + stop=stop, + callbacks=callbacks, + ) + else: + result = await self.agenerate_prompt( + prompts=[prompt] * n, + temperature=temperature, + stop=stop, + callbacks=callbacks, + ) + # make LLMREsult.generation appear as if it was n_completions + # note that LLMResult.runs is still a list that represents each run + generations = [[g[0] for g in result.generations]] + result.generations = generations + return result diff --git a/src/ragas/llms/langchain.py b/src/ragas/llms/langchain.py deleted file mode 100644 index f01817185..000000000 --- a/src/ragas/llms/langchain.py +++ /dev/null @@ -1,224 +0,0 @@ -from __future__ import annotations - -import typing as t - -from langchain.chat_models import AzureChatOpenAI, BedrockChat, ChatOpenAI, ChatVertexAI -from langchain.chat_models.base import BaseChatModel -from langchain.llms import AmazonAPIGateway, AzureOpenAI, Bedrock, OpenAI, VertexAI -from langchain.llms.base import BaseLLM -from langchain.schema import LLMResult - -from ragas.async_utils import run_async_tasks -from ragas.exceptions import AzureOpenAIKeyNotFound, OpenAIKeyNotFound -from ragas.llms.base import BaseRagasLLM -from ragas.utils import NO_KEY - -if t.TYPE_CHECKING: - from langchain.callbacks.base import Callbacks - from langchain.prompts import ChatPromptTemplate - - -def isOpenAI(llm: BaseRagasLLM | BaseChatModel) -> bool: - return isinstance(llm, OpenAI) or isinstance(llm, ChatOpenAI) - - -def isBedrock(llm: BaseRagasLLM | BaseChatModel) -> bool: - return isinstance(llm, Bedrock) or isinstance(llm, BedrockChat) - - -def isAmazonAPIGateway(llm: BaseRagasLLM | BaseChatModel) -> bool: - return isinstance(llm, AmazonAPIGateway) - - -# have to specify it twice for runtime and static checks -MULTIPLE_COMPLETION_SUPPORTED = [ - OpenAI, - ChatOpenAI, - AzureOpenAI, - AzureChatOpenAI, - ChatVertexAI, - VertexAI, -] -MultipleCompletionSupportedLLM = t.Union[ - OpenAI, ChatOpenAI, AzureOpenAI, AzureChatOpenAI, ChatVertexAI, VertexAI -] - - -def _compute_token_usage_langchain(list_llmresults: t.List[LLMResult]) -> t.Dict: - # compute total token usage by adding individual token usage - llm_output = list_llmresults[0].llm_output - if llm_output is None: - return {} - if (llm_output is not None) and ("token_usage" in llm_output): - sum_prompt_tokens = 0 - sum_completion_tokens = 0 - sum_total_tokens = 0 - for result in list_llmresults: - if result.llm_output is None: - continue - token_usage = result.llm_output["token_usage"] - sum_prompt_tokens += token_usage["prompt_tokens"] - sum_completion_tokens += token_usage["completion_tokens"] - sum_total_tokens += token_usage["total_tokens"] - - llm_output["token_usage"] = { - "prompt_tokens": sum_prompt_tokens, - "completion_tokens": sum_completion_tokens, - "sum_total_tokens": sum_total_tokens, - } - - return llm_output - - -class LangchainLLM(BaseRagasLLM): - n_completions_supported: bool = True - - def __init__(self, llm: BaseRagasLLM | BaseChatModel): - self.langchain_llm = llm - - @property - def llm(self) -> BaseRagasLLM | BaseChatModel: - return self.langchain_llm - - def validate_api_key(self): - # if langchain OpenAI or ChatOpenAI - if isinstance(self.llm, ChatOpenAI) or isinstance(self.llm, OpenAI): - # make sure the type is LangchainLLM with ChatOpenAI - self.langchain_llm = t.cast(ChatOpenAI, self.langchain_llm) - # raise error if no api key - if self.langchain_llm.openai_api_key == NO_KEY: - raise 
OpenAIKeyNotFound - - # if langchain AzureOpenAI or ChatAzurerOpenAI - elif isinstance(self.llm, AzureChatOpenAI) or isinstance(self.llm, AzureOpenAI): - self.langchain_llm = t.cast(AzureChatOpenAI, self.langchain_llm) - # raise error if no api key - if self.langchain_llm.openai_api_key == NO_KEY: - raise AzureOpenAIKeyNotFound - - @staticmethod - def llm_supports_completions(llm): - for llm_type in MULTIPLE_COMPLETION_SUPPORTED: - if isinstance(llm, llm_type): - return True - - def _generate_multiple_completions( - self, - prompts: list[ChatPromptTemplate], - n: int = 1, - callbacks: t.Optional[Callbacks] = None, - ) -> LLMResult: - self.langchain_llm = t.cast(MultipleCompletionSupportedLLM, self.langchain_llm) - old_n = self.langchain_llm.n - self.langchain_llm.n = n - - if isinstance(self.llm, BaseRagasLLM): - ps = [p.format() for p in prompts] - result = self.llm.generate(ps, callbacks=callbacks) - else: # if BaseChatModel - ps = [p.format_messages() for p in prompts] - result = self.llm.generate(ps, callbacks=callbacks) - self.llm.n = old_n - - return result - - async def generate_completions( - self, - prompts: list[ChatPromptTemplate], - callbacks: t.Optional[Callbacks] = None, - ) -> LLMResult: - if isinstance(self.llm, BaseRagasLLM): - ps = [p.format() for p in prompts] - result = await self.llm.agenerate(ps, callbacks=callbacks) - else: # if BaseChatModel - ps = [p.format_messages() for p in prompts] - result = await self.llm.agenerate(ps, callbacks=callbacks) - - return result - - async def agenerate( - self, - prompt: ChatPromptTemplate, - n: int = 1, - temperature: float = 1e-8, - callbacks: t.Optional[Callbacks] = None, - ) -> LLMResult: - temperature = 0.2 if n > 1 else 0 - if isBedrock(self.llm) and ("model_kwargs" in self.llm.__dict__): - self.llm.model_kwargs = {"temperature": temperature} - else: - self.llm.temperature = temperature - - if self.llm_supports_completions(self.llm): - self.langchain_llm = t.cast( - MultipleCompletionSupportedLLM, self.langchain_llm - ) - old_n = self.langchain_llm.n - self.langchain_llm.n = n - if isinstance(self.llm, BaseRagasLLM): - result = await self.llm.agenerate( - [prompt.format()], callbacks=callbacks - ) - else: # if BaseChatModel - result = await self.llm.agenerate( - [prompt.format_messages()], callbacks=callbacks - ) - self.langchain_llm.n = old_n - else: - if isinstance(self.llm, BaseRagasLLM): - list_llmresults: list[LLMResult] = run_async_tasks( - [ - self.llm.agenerate([prompt.format()], callbacks=callbacks) - for _ in range(n) - ] - ) - else: - list_llmresults: list[LLMResult] = run_async_tasks( - [ - self.llm.agenerate( - [prompt.format_messages()], callbacks=callbacks - ) - for _ in range(n) - ] - ) - - # fill results as if the LLM supported multiple completions - generations = [r.generations[0][0] for r in list_llmresults] - llm_output = _compute_token_usage_langchain(list_llmresults) - result = LLMResult(generations=[generations], llm_output=llm_output) - - return result - - def generate( - self, - prompts: list[ChatPromptTemplate], - n: int = 1, - temperature: float = 1e-8, - callbacks: t.Optional[Callbacks] = None, - ) -> LLMResult: - # set temperature to 0.2 for multiple completions - temperature = 0.2 if n > 1 else 1e-8 - if isBedrock(self.llm) and ("model_kwargs" in self.llm.__dict__): - self.llm.model_kwargs = {"temperature": temperature} - elif isAmazonAPIGateway(self.llm) and ("model_kwargs" in self.llm.__dict__): - self.llm.model_kwargs = {"temperature": temperature} - else: - self.llm.temperature = temperature 
- - if self.llm_supports_completions(self.llm): - return self._generate_multiple_completions(prompts, n, callbacks) - else: # call generate_completions n times to mimic multiple completions - list_llmresults = run_async_tasks( - [self.generate_completions(prompts, callbacks) for _ in range(n)] - ) - - # fill results as if the LLM supported multiple completions - generations = [] - for i in range(len(prompts)): - completions = [] - for result in list_llmresults: - completions.append(result.generations[i][0]) - generations.append(completions) - - llm_output = _compute_token_usage_langchain(list_llmresults) - return LLMResult(generations=generations, llm_output=llm_output) diff --git a/src/ragas/llms/openai.py b/src/ragas/llms/openai.py deleted file mode 100644 index b75374d2a..000000000 --- a/src/ragas/llms/openai.py +++ /dev/null @@ -1,232 +0,0 @@ -from __future__ import annotations - -import asyncio -import logging -import os -import typing as t -from abc import abstractmethod -from dataclasses import dataclass, field - -import openai -from langchain.adapters.openai import convert_message_to_dict -from langchain.callbacks.manager import ( - AsyncCallbackManagerForLLMRun, - CallbackManagerForLLMRun, -) -from langchain.schema import Generation, LLMResult -from openai import AsyncAzureOpenAI, AsyncClient, AsyncOpenAI -from tenacity import ( - RetryCallState, - before_sleep_log, - retry, - retry_base, - retry_if_exception_type, - stop_after_attempt, - wait_exponential, -) - -from ragas.async_utils import run_async_tasks -from ragas.exceptions import AzureOpenAIKeyNotFound, OpenAIKeyNotFound -from ragas.llms.base import BaseRagasLLM -from ragas.llms.langchain import _compute_token_usage_langchain -from ragas.utils import NO_KEY, get_debug_mode - -if t.TYPE_CHECKING: - from langchain.callbacks.base import Callbacks - from langchain.prompts import ChatPromptTemplate - -logger = logging.getLogger(__name__) - -errors = [ - openai.APITimeoutError, - openai.APIConnectionError, - openai.RateLimitError, - openai.APIConnectionError, - openai.InternalServerError, -] - - -def create_base_retry_decorator( - error_types: t.List[t.Type[BaseException]], - max_retries: int = 1, - run_manager: t.Optional[ - t.Union[AsyncCallbackManagerForLLMRun, CallbackManagerForLLMRun] - ] = None, -) -> t.Callable[[t.Any], t.Any]: - """Create a retry decorator for a given LLM and provided list of error types.""" - - log_level = logging.WARNING if get_debug_mode() else logging.DEBUG - _logging = before_sleep_log(logger, log_level) - - def _before_sleep(retry_state: RetryCallState) -> None: - _logging(retry_state) - if run_manager: - if isinstance(run_manager, AsyncCallbackManagerForLLMRun): - coro = run_manager.on_retry(retry_state) - try: - loop = asyncio.get_event_loop() - if loop.is_running(): - loop.create_task(coro) - else: - asyncio.run(coro) - except Exception as e: - logger.error(f"Error in on_retry: {e}") - else: - run_manager.on_retry(retry_state) - return None - - min_seconds = 4 - max_seconds = 10 - # Wait 2^x * 1 second between each retry starting with - # 4 seconds, then up to 10 seconds, then 10 seconds afterwards - retry_instance: "retry_base" = retry_if_exception_type(error_types[0]) - for error in error_types[1:]: - retry_instance = retry_instance | retry_if_exception_type(error) - return retry( - reraise=True, - stop=stop_after_attempt(max_retries), - wait=wait_exponential(multiplier=1, min=min_seconds, max=max_seconds), - retry=retry_instance, - before_sleep=_before_sleep, - ) - - -retry_decorator = 
create_base_retry_decorator(errors, max_retries=4) - - -class OpenAIBase(BaseRagasLLM): - def __init__(self, model: str, _api_key_env_var: str, timeout: int = 60) -> None: - self.model = model - self._api_key_env_var = _api_key_env_var - self.timeout = timeout - - # api key - key_from_env = os.getenv(self._api_key_env_var, NO_KEY) - if key_from_env != NO_KEY: - self.api_key = key_from_env - else: - self.api_key = self.api_key - self._client: AsyncClient - - @abstractmethod - def _client_init(self): - ... - - @property - def llm(self): - return self - - def create_llm_result(self, response) -> LLMResult: - """Create the LLMResult from the choices and prompts.""" - if not isinstance(response, dict): - response = response.model_dump() - - # token Usage - token_usage = response.get("usage", {}) - llm_output = { - "token_usage": token_usage, - "model_name": None, - "system_fingerprint": response.get("system_fingerprint", ""), - } - - choices = response["choices"] - generations = [ - Generation( - text=choice["message"]["content"], - generation_info=dict( - finish_reason=choice.get("finish_reason"), - logprobs=choice.get("logprobs"), - ), - ) - for choice in choices - ] - llm_output = {"token_usage": token_usage, "model_name": self.model} - return LLMResult(generations=[generations], llm_output=llm_output) - - def generate( - self, - prompts: list[ChatPromptTemplate], - n: int = 1, - temperature: float = 0, - callbacks: t.Optional[Callbacks] = None, - ) -> t.Any: # TODO: LLMResult - llm_results = run_async_tasks( - [self.agenerate(p, n, temperature, callbacks) for p in prompts] - ) - - generations = [r.generations[0] for r in llm_results] - llm_output = _compute_token_usage_langchain(llm_results) - return LLMResult(generations=generations, llm_output=llm_output) - - @retry_decorator - async def agenerate( - self, - prompt: ChatPromptTemplate, - n: int = 1, - temperature: float = 0, - callbacks: t.Optional[Callbacks] = None, - ) -> LLMResult: - # TODO: use callbacks for llm generate - completion = await self._client.chat.completions.create( - model=self.model, - messages=[convert_message_to_dict(m) for m in prompt.format_messages()], # type: ignore - temperature=temperature, - n=n, - ) - - return self.create_llm_result(completion) - - -@dataclass -class OpenAI(OpenAIBase): - model: str = "gpt-3.5-turbo-16k" - api_key: str = field(default=NO_KEY, repr=False) - _api_key_env_var: str = "OPENAI_API_KEY" - - def __post_init__(self): - super().__init__(model=self.model, _api_key_env_var=self._api_key_env_var) - self._client_init() - - def _client_init(self): - self._client = AsyncOpenAI(api_key=self.api_key, timeout=self.timeout) - - def validate_api_key(self): - # before validating, check if the api key is already set - api_key = os.getenv(self._api_key_env_var, NO_KEY) - if api_key != NO_KEY: - self._client.api_key = api_key - if self.llm.api_key == NO_KEY: - os_env_key = os.getenv(self._api_key_env_var, NO_KEY) - if os_env_key != NO_KEY: - self.api_key = os_env_key - else: - raise OpenAIKeyNotFound - - -@dataclass -class AzureOpenAI(OpenAIBase): - azure_endpoint: str - deployment: str - api_version: str - api_key: str = field(default=NO_KEY, repr=False) - _api_key_env_var: str = "AZURE_OPENAI_API_KEY" - - def __post_init__(self): - super().__init__(model=self.deployment, _api_key_env_var=self._api_key_env_var) - self._client_init() - - def _client_init(self): - self._client = AsyncAzureOpenAI( - api_version=self.api_version, - azure_endpoint=self.azure_endpoint, - api_key=self.api_key, - 
timeout=self.timeout, - ) - - def validate_api_key(self): - if self.llm.api_key == NO_KEY: - os_env_key = os.getenv(self._api_key_env_var, NO_KEY) - if os_env_key != NO_KEY: - self.api_key = os_env_key - else: - raise AzureOpenAIKeyNotFound diff --git a/src/ragas/llms/prompt.py b/src/ragas/llms/prompt.py new file mode 100644 index 000000000..76a622662 --- /dev/null +++ b/src/ragas/llms/prompt.py @@ -0,0 +1,18 @@ +import typing as t + +from langchain_core.messages import BaseMessage, HumanMessage +from langchain_core.prompt_values import PromptValue +from pydantic import Field + + +class Prompt(PromptValue): + instruction: str + examples: t.List[t.Dict[str, t.Any]] = Field(default_factory=list, repr=False) + input_keys: t.List[str] = Field(default_factory=list, repr=False) + output_keys: t.List[str] = Field(default_factory=list, repr=False) + + def to_string(self) -> str: + return self.instruction + + def to_messages(self) -> t.List[BaseMessage]: + return [HumanMessage(content=self.instruction)] From 1fcea66de2a4860a1d375a2a54358e6e403b9895 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Sat, 16 Dec 2023 12:00:52 +0530 Subject: [PATCH 03/34] changed faithfulness to single metric --- src/ragas/llms/base.py | 6 +- src/ragas/llms/prompt.py | 10 ++- src/ragas/metrics/_faithfulness.py | 102 ++++++++++++++--------------- src/ragas/metrics/base.py | 46 +++---------- 4 files changed, 70 insertions(+), 94 deletions(-) diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py index 36105e986..ac7ee69d7 100644 --- a/src/ragas/llms/base.py +++ b/src/ragas/llms/base.py @@ -13,6 +13,8 @@ from langchain_core.callbacks import Callbacks from langchain_core.prompt_values import PromptValue + from ragas.llms.prompt import Prompt + MULTIPLE_COMPLETION_SUPPORTED = [ OpenAI, ChatOpenAI, @@ -41,7 +43,7 @@ class BaseRagasLLM(BaseLanguageModel): def generate_text( self, - prompt: PromptValue, + prompt: Prompt, n: int = 1, temperature: float = 1e-8, stop: t.Optional[t.List[str]] = None, @@ -70,7 +72,7 @@ def generate_text( async def agenerate_text( self, - prompt: PromptValue, + prompt: Prompt, n: int = 1, temperature: float = 1e-8, stop: t.Optional[t.List[str]] = None, diff --git a/src/ragas/llms/prompt.py b/src/ragas/llms/prompt.py index 76a622662..c7c5727c0 100644 --- a/src/ragas/llms/prompt.py +++ b/src/ragas/llms/prompt.py @@ -4,15 +4,19 @@ from langchain_core.prompt_values import PromptValue from pydantic import Field +if t.TYPE_CHECKING: + from langchain.prompts import ChatPromptTemplate + class Prompt(PromptValue): - instruction: str + chat_prompt_template: ChatPromptTemplate + instruction: t.Optional[str] = None examples: t.List[t.Dict[str, t.Any]] = Field(default_factory=list, repr=False) input_keys: t.List[str] = Field(default_factory=list, repr=False) output_keys: t.List[str] = Field(default_factory=list, repr=False) def to_string(self) -> str: - return self.instruction + return self.chat_prompt_template.format() def to_messages(self) -> t.List[BaseMessage]: - return [HumanMessage(content=self.instruction)] + return self.chat_prompt_template.format_messages() diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 3cd5c1e7c..1c7e89c08 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -7,6 +7,7 @@ from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate +from ragas.llms.prompt import Prompt from ragas.metrics.base import 
EvaluationMode, MetricWithLLM from ragas.utils import json_loader @@ -125,67 +126,62 @@ class Faithfulness(MetricWithLLM): evaluation_mode: EvaluationMode = EvaluationMode.qac # type: ignore batch_size: int = 15 - def _score_batch( + def ascore( self: t.Self, - dataset: Dataset, + data_row: t.Dict, callbacks: t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list[float]: + ) -> float: """ returns the NLI score for each (q, c, a) pair """ + assert self.llm is not None, "LLM is not set" question, answer, contexts = ( - dataset["question"], - dataset["answer"], - dataset["contexts"], + data_row["question"], + data_row["answer"], + data_row["contexts"], ) - prompts = [] - - cb = CallbackManager.configure(inheritable_callbacks=callbacks) - with trace_as_chain_group( - callback_group_name, callback_manager=cb - ) as batch_group: - for q, a in zip(question, answer): - human_prompt = LONG_FORM_ANSWER_PROMPT.format(question=q, answer=a) - prompts.append(ChatPromptTemplate.from_messages([human_prompt])) - - result = self.llm.generate(prompts, callbacks=batch_group) - - prompts = [] - for context, output in zip(contexts, result.generations): - statements = json_loader.safe_load(output[0].text, self.llm).get( - "statements", [] - ) - statements = statements if statements != [] else ["Nil"] - statements_str: str = "\n".join( - [f"statement_{i+1}: {st}" for i, st in enumerate(statements)] - ) - contexts_str: str = "\n".join(context) - human_prompt = NLI_STATEMENTS_MESSAGE.format( - context=contexts_str, statements=statements_str - ) - prompts.append(ChatPromptTemplate.from_messages([human_prompt])) - - result = self.llm.generate(prompts, callbacks=batch_group) - outputs = result.generations - verdict_score_map = {"yes": 1, "no": 0, "null": np.nan} - scores = [] - for output in outputs: - output = json_loader.safe_load(output[0].text, self.llm) - output = output if isinstance(output, list) else [] - faithful_statements = sum( - verdict_score_map.get(dict.get("verdict", "").lower(), np.nan) - for dict in output - ) - num_statements = len(output) - if num_statements: - score = faithful_statements / num_statements - else: - score = np.nan - scores.append(score) - - return scores + + # extract statements from answer given the question + human_prompt = LONG_FORM_ANSWER_PROMPT.format(question=question, answer=answer) + p = Prompt( + chat_prompt_template=ChatPromptTemplate.from_messages([human_prompt]) + ) + result = self.llm.generate_text(p, callbacks=callbacks) + + # check if the statements are support in the contexts + contexts_str: str = "\n".join(contexts) + statements = json_loader.safe_load(result.generations[0][0].text, self.llm).get( + "statements", [] + ) + statements = statements if statements != [] else ["Nil"] + statements_str: str = "\n".join( + [f"statement_{i+1}: {st}" for i, st in enumerate(statements)] + ) + human_prompt = NLI_STATEMENTS_MESSAGE.format( + context=contexts_str, statements=statements_str + ) + p = Prompt( + chat_prompt_template=ChatPromptTemplate.from_messages([human_prompt]) + ) + result = self.llm.generate_text(p, callbacks=callbacks) + + # check the verdicts and compute the score + output = result.generations[0][0] + verdict_score_map = {"yes": 1, "no": 0, "null": np.nan} + output = json_loader.safe_load(output.text, self.llm) + output = output if isinstance(output, list) else [] + faithful_statements = sum( + verdict_score_map.get(dict.get("verdict", "").lower(), np.nan) + for dict in output + ) + num_statements = len(output) + if num_statements: + 
score = faithful_statements / num_statements + else: + score = np.nan + + return score faithfulness = Faithfulness() diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index dd7a05c8c..f89fd83aa 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -68,49 +68,19 @@ def score( self: t.Self, dataset: Dataset, callbacks: t.Optional[Callbacks] = None, - ) -> Dataset: - scores = [] - cm = CallbackManager.configure(inheritable_callbacks=callbacks) - with trace_as_chain_group(f"ragas_{self.name}", callback_manager=cm) as group: - for batch in tqdm(self.get_batches(len(dataset))): - score = self._score_batch(dataset.select(batch), callbacks=group) - scores.extend(score) - - return dataset.add_column(f"{self.name}", scores) # type: ignore + ) -> float: + raise NotImplemented @abstractmethod - def _score_batch( - selfself: t.Self, - dataset: Dataset, - callbacks: t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list: - ... - - def score_single( - self: t.Self, - ds_row: dict, - callbacks: t.Optional[Callbacks] = None, + def ascore( + self: t.Self, dataset: Dataset, callbacks: t.Optional[Callbacks] = None ) -> float: - """ - Score for a single row of dataset - """ - # TODO: validation check if they are string - - ds = Dataset.from_dict({k: [v] for k, v in ds_row.items()}) - score = self._score_batch( - ds, callback_group_name=self.name, callbacks=callbacks - ) - - return score[0] - - def get_batches(self, dataset_size: int) -> list[range]: - return make_batches(dataset_size, self.batch_size) + ... @dataclass class MetricWithLLM(Metric): - llm: BaseRagasLLM = field(default_factory=llm_factory) + llm: t.Optional[BaseRagasLLM] = None def init_model(self): """ @@ -118,6 +88,10 @@ def init_model(self): to load all the models Also check if the api key is valid for OpenAI and AzureOpenAI """ + if self.llm is None: + raise ValueError( + f"Metric '{self.name}' has no valid LLM provided. Please initantiate a the metric with an LLM to run." 
# noqa + ) if hasattr(self.llm, "validate_api_key"): self.llm.validate_api_key() if hasattr(self, "embeddings"): From 84d9cb2d485af44d8546a1bed25a1b6a6dc98acc Mon Sep 17 00:00:00 2001 From: jjmachan Date: Sun, 17 Dec 2023 21:08:12 +0530 Subject: [PATCH 04/34] fixing llm for executor --- src/ragas/llms/__init__.py | 12 ++++++++---- src/ragas/llms/base.py | 21 ++++++++++++--------- src/ragas/llms/prompt.py | 4 +--- tests/unit/test_executor.py | 0 4 files changed, 21 insertions(+), 16 deletions(-) create mode 100644 tests/unit/test_executor.py diff --git a/src/ragas/llms/__init__.py b/src/ragas/llms/__init__.py index 62ea40111..bb55627f1 100644 --- a/src/ragas/llms/__init__.py +++ b/src/ragas/llms/__init__.py @@ -1,10 +1,14 @@ +from langchain.chat_models import ChatOpenAI + from ragas.llms.base import BaseRagasLLM -from ragas.llms.langchain import LangchainLLM from ragas.llms.llamaindex import LlamaIndexLLM -from ragas.llms.openai import OpenAI -__all__ = ["BaseRagasLLM", "LangchainLLM", "LlamaIndexLLM", "llm_factory", "OpenAI"] +__all__ = [ + "BaseRagasLLM", + "LlamaIndexLLM", + "llm_factory", +] def llm_factory(model="gpt-3.5-turbo-16k") -> BaseRagasLLM: - return OpenAI(model=model) + return BaseRagasLLM(ChatOpenAI(model=model)) diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py index ac7ee69d7..8027e83ef 100644 --- a/src/ragas/llms/base.py +++ b/src/ragas/llms/base.py @@ -1,7 +1,7 @@ from __future__ import annotations import typing as t -from abc import ABC, abstractmethod +from dataclasses import dataclass from langchain.chat_models import AzureChatOpenAI, BedrockChat, ChatOpenAI, ChatVertexAI from langchain.llms import AmazonAPIGateway, AzureOpenAI, Bedrock, OpenAI, VertexAI @@ -25,7 +25,7 @@ ] -def is_multiple_completion_supported(llm: BaseRagasLLM) -> bool: +def is_multiple_completion_supported(llm: BaseLanguageModel) -> bool: """Return whether the given LLM supports n-completion.""" for llm_type in MULTIPLE_COMPLETION_SUPPORTED: if isinstance(llm, llm_type): @@ -33,7 +33,8 @@ def is_multiple_completion_supported(llm: BaseRagasLLM) -> bool: return False -class BaseRagasLLM(BaseLanguageModel): +@dataclass +class BaseRagasLLM: """ A simple base class for RagasLLMs that is based on Langchain's BaseLanguageModel interface. 
it implements 2 functions: @@ -41,6 +42,8 @@ class BaseRagasLLM(BaseLanguageModel): - agenerate_text: for generating text from a given PromptValue asynchronously """ + langchain_llm: BaseLanguageModel + def generate_text( self, prompt: Prompt, @@ -49,8 +52,8 @@ def generate_text( stop: t.Optional[t.List[str]] = None, callbacks: t.Optional[Callbacks] = None, ) -> LLMResult: - if is_multiple_completion_supported(self): - return self.generate_prompt( + if is_multiple_completion_supported(self.langchain_llm): + return self.langchain_llm.generate_prompt( prompts=[prompt], n=n, temperature=temperature, @@ -58,7 +61,7 @@ def generate_text( callbacks=callbacks, ) else: - result = self.generate_prompt( + result = self.langchain_llm.generate_prompt( prompts=[prompt] * n, temperature=temperature, stop=stop, @@ -78,8 +81,8 @@ async def agenerate_text( stop: t.Optional[t.List[str]] = None, callbacks: t.Optional[Callbacks] = None, ) -> LLMResult: - if is_multiple_completion_supported(self): - return await self.agenerate_prompt( + if is_multiple_completion_supported(self.langchain_llm): + return await self.langchain_llm.agenerate_prompt( prompts=[prompt], n=n, temperature=temperature, @@ -87,7 +90,7 @@ async def agenerate_text( callbacks=callbacks, ) else: - result = await self.agenerate_prompt( + result = await self.langchain_llm.agenerate_prompt( prompts=[prompt] * n, temperature=temperature, stop=stop, diff --git a/src/ragas/llms/prompt.py b/src/ragas/llms/prompt.py index c7c5727c0..275900de6 100644 --- a/src/ragas/llms/prompt.py +++ b/src/ragas/llms/prompt.py @@ -1,12 +1,10 @@ import typing as t +from langchain.prompts import ChatPromptTemplate from langchain_core.messages import BaseMessage, HumanMessage from langchain_core.prompt_values import PromptValue from pydantic import Field -if t.TYPE_CHECKING: - from langchain.prompts import ChatPromptTemplate - class Prompt(PromptValue): chat_prompt_template: ChatPromptTemplate diff --git a/tests/unit/test_executor.py b/tests/unit/test_executor.py new file mode 100644 index 000000000..e69de29bb From 8a41af7d4d19b306f8b08000b5f139a5468e5483 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Mon, 18 Dec 2023 00:15:28 +0530 Subject: [PATCH 05/34] added callbacks --- src/ragas/callbacks.py | 32 +++++++++++++++++ src/ragas/evaluation.py | 53 ++++++++++++++++++++++----- src/ragas/metrics/_faithfulness.py | 10 ++---- src/ragas/metrics/base.py | 58 ++++++++++++++++++------------ 4 files changed, 116 insertions(+), 37 deletions(-) create mode 100644 src/ragas/callbacks.py diff --git a/src/ragas/callbacks.py b/src/ragas/callbacks.py new file mode 100644 index 000000000..b2a9d6cb2 --- /dev/null +++ b/src/ragas/callbacks.py @@ -0,0 +1,32 @@ +import typing as t + +from langchain_core.callbacks import ( + CallbackManager, + CallbackManagerForChainGroup, + CallbackManagerForChainRun, + Callbacks, +) + + +def new_group( + name: str, inputs: t.Dict, callbacks: Callbacks, is_async=True +) -> t.Tuple[CallbackManagerForChainRun, CallbackManager]: + # start evaluation chain + if isinstance(callbacks, list): + cm = CallbackManager.configure(inheritable_callbacks=callbacks) + else: + cm = t.cast(CallbackManager, callbacks) + rm = cm.on_chain_start({"name": name}, inputs) + child_cm = rm.get_child() + group_cm = CallbackManagerForChainGroup( + child_cm.handlers, + child_cm.inheritable_handlers, + child_cm.parent_run_id, + parent_run_manager=rm, + tags=child_cm.tags, + inheritable_tags=child_cm.inheritable_tags, + metadata=child_cm.metadata, + 
inheritable_metadata=child_cm.inheritable_metadata, + ) + + return rm, group_cm diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index ea258bfb4..c82fb95df 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -7,8 +7,11 @@ from datasets import Dataset, concatenate_datasets from ragas._analytics import EvaluationEvent, track +from ragas.async_utils import run_async_tasks +from ragas.callbacks import new_group from ragas.metrics.base import Metric -from ragas.metrics.critique import AspectCritique + +# from ragas.metrics.critique import AspectCritique from ragas.validation import ( remap_column_names, validate_column_dtypes, @@ -22,8 +25,9 @@ def evaluate( dataset: Dataset, metrics: list[Metric] | None = None, - column_map: dict[str, str] = {}, callbacks: Callbacks = [], + is_async: bool = True, + column_map: dict[str, str] = {}, ) -> Result: """ Run the evaluation on the dataset with different metrics @@ -96,15 +100,48 @@ def evaluate( # initialize all the models in the metrics [m.init_model() for m in metrics] - scores = [] + # new evaluation chain + evaluation_rm, evaluation_group_cm = new_group( + name="ragas evaluation", inputs={}, callbacks=callbacks, is_async=is_async + ) + # list of chains for each row + row_chains = [] + + scoring_tasks = [] binary_metrics = [] for metric in metrics: - if isinstance(metric, AspectCritique): - binary_metrics.append(metric.name) - print(f"evaluating with [{metric.name}]") - scores.append( - metric.score(dataset, callbacks=callbacks).select_columns(metric.name) + # if isinstance(metric, AspectCritique): + # binary_metrics.append(metric.name) + ... + for i, row in enumerate(dataset): + row_rm, row_group_cm = new_group( + name=f"row {i}", + inputs=row, + callbacks=evaluation_group_cm, + is_async=is_async, + ) + scoring_tasks.extend( + [metric.ascore(data_row=row, callbacks=row_group_cm) for metric in metrics] + ) + row_chains.append(row_rm) + # run the evaluation tasks + try: + results = run_async_tasks( + scoring_tasks, show_progress=True, progress_bar_desc="evaluating dataset" ) + # TODO: closing row chains here. 
handle errors here too + [chain.on_chain_end({}) for chain in row_chains] + + # run evaluation task + except Exception as e: + if not evaluation_group_cm.ended: + evaluation_rm.on_chain_error(e) + raise e + else: + if not evaluation_group_cm.ended: + evaluation_rm.on_chain_end({}) + + return results # log the evaluation event metrics_names = [m.name for m in metrics] diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 1c7e89c08..2ad1f45ec 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -126,11 +126,7 @@ class Faithfulness(MetricWithLLM): evaluation_mode: EvaluationMode = EvaluationMode.qac # type: ignore batch_size: int = 15 - def ascore( - self: t.Self, - data_row: t.Dict, - callbacks: t.Optional[Callbacks] = None, - ) -> float: + async def _ascore(self: t.Self, data_row: t.Dict, callbacks: Callbacks) -> float: """ returns the NLI score for each (q, c, a) pair """ @@ -147,7 +143,7 @@ def ascore( p = Prompt( chat_prompt_template=ChatPromptTemplate.from_messages([human_prompt]) ) - result = self.llm.generate_text(p, callbacks=callbacks) + result = await self.llm.agenerate_text(p, callbacks=callbacks) # check if the statements are support in the contexts contexts_str: str = "\n".join(contexts) @@ -164,7 +160,7 @@ def ascore( p = Prompt( chat_prompt_template=ChatPromptTemplate.from_messages([human_prompt]) ) - result = self.llm.generate_text(p, callbacks=callbacks) + result = await self.llm.agenerate_text(p, callbacks=callbacks) # check the verdicts and compute the score output = result.generations[0][0] diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index f89fd83aa..fa1ad7872 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -13,33 +13,18 @@ from math import floor from datasets import Dataset -from langchain.callbacks.manager import CallbackManager, trace_as_chain_group +from langchain_core.callbacks import CallbackManager, CallbackManagerForChainGroup from tqdm import tqdm from ragas.embeddings.base import RagasEmbeddings from ragas.llms import llm_factory if t.TYPE_CHECKING: - from langchain.callbacks.base import Callbacks + from langchain_core.callbacks import Callbacks from ragas.llms import BaseRagasLLM -def make_batches(total_size: int, batch_size: int) -> list[range]: - """ - Take a total size and batch size and return a list of ranges for the batches - """ - tail = total_size % batch_size - num_batches = floor(total_size / batch_size) - batches = [ - range(i, i + batch_size) for i in range(0, batch_size * num_batches, batch_size) - ] - if tail != 0: - batches.append(range(batch_size * num_batches, batch_size * num_batches + tail)) - - return batches - - EvaluationMode = Enum("EvaluationMode", "qac qa qc gc ga qga qcg") @@ -66,15 +51,44 @@ def init_model(self): def score( self: t.Self, - dataset: Dataset, + data_row: t.Dict, callbacks: t.Optional[Callbacks] = None, ) -> float: raise NotImplemented - @abstractmethod - def ascore( - self: t.Self, dataset: Dataset, callbacks: t.Optional[Callbacks] = None + async def ascore( + self: t.Self, data_row: t.Dict, callbacks: Callbacks = [] ) -> float: + if isinstance(callbacks, list): + cm = CallbackManager.configure(inheritable_callbacks=callbacks) + else: + cm = t.cast(CallbackManager, callbacks) + + rm = cm.on_chain_start({"name": self.name}, data_row) + child_cm = rm.get_child() + group_cm = CallbackManagerForChainGroup( + child_cm.handlers, + child_cm.inheritable_handlers, + child_cm.parent_run_id, + 
parent_run_manager=rm, + tags=child_cm.tags, + inheritable_tags=child_cm.inheritable_tags, + metadata=child_cm.metadata, + inheritable_metadata=child_cm.inheritable_metadata, + ) + try: + score = await self._ascore(data_row=data_row, callbacks=group_cm) + except Exception as e: + if not group_cm.ended: + rm.on_chain_error(e) + raise e + else: + if not group_cm.ended: + rm.on_chain_end({"output": score}) + return score + + @abstractmethod + async def _ascore(self, data_row: t.Dict, callbacks: Callbacks = []) -> float: ... @@ -90,7 +104,7 @@ def init_model(self): """ if self.llm is None: raise ValueError( - f"Metric '{self.name}' has no valid LLM provided. Please initantiate a the metric with an LLM to run." # noqa + f"Metric '{self.name}' has no valid LLM provided (self.llm is None). Please initantiate a the metric with an LLM to run." # noqa ) if hasattr(self.llm, "validate_api_key"): self.llm.validate_api_key() From 0c61475d0d7cbca6af519886674eff6656a00c15 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Mon, 18 Dec 2023 22:15:42 +0530 Subject: [PATCH 06/34] ported a couple of metrics --- src/ragas/evaluation.py | 23 +++++-- src/ragas/metrics/_answer_similarity.py | 30 ++++---- src/ragas/metrics/_context_precision.py | 92 ++++++++++--------------- src/ragas/metrics/_context_recall.py | 87 +++++++++++------------ src/ragas/metrics/_faithfulness.py | 8 +-- src/ragas/metrics/base.py | 12 ++-- 6 files changed, 117 insertions(+), 135 deletions(-) diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index c82fb95df..b5d0cdad7 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -9,7 +9,8 @@ from ragas._analytics import EvaluationEvent, track from ragas.async_utils import run_async_tasks from ragas.callbacks import new_group -from ragas.metrics.base import Metric +from ragas.llms.base import BaseRagasLLM +from ragas.metrics.base import Metric, MetricWithLLM # from ragas.metrics.critique import AspectCritique from ragas.validation import ( @@ -25,6 +26,7 @@ def evaluate( dataset: Dataset, metrics: list[Metric] | None = None, + llm: t.Optional[BaseRagasLLM] = None, callbacks: Callbacks = [], is_async: bool = True, column_map: dict[str, str] = {}, @@ -90,16 +92,17 @@ def evaluate( metrics = [answer_relevancy, context_precision, faithfulness, context_recall] + if llm is None: + from ragas.llms import llm_factory + + llm = llm_factory() + # remap column names from the dataset dataset = remap_column_names(dataset, column_map) # validation validate_evaluation_modes(dataset, metrics) validate_column_dtypes(dataset) - # run the evaluation on dataset with different metrics - # initialize all the models in the metrics - [m.init_model() for m in metrics] - # new evaluation chain evaluation_rm, evaluation_group_cm = new_group( name="ragas evaluation", inputs={}, callbacks=callbacks, is_async=is_async @@ -112,7 +115,13 @@ def evaluate( for metric in metrics: # if isinstance(metric, AspectCritique): # binary_metrics.append(metric.name) - ... 
+ if isinstance(metric, MetricWithLLM): + if metric.llm is None: + metric.llm = llm + + # initialize all the models in the metrics + [m.init_model() for m in metrics] + for i, row in enumerate(dataset): row_rm, row_group_cm = new_group( name=f"row {i}", @@ -121,7 +130,7 @@ def evaluate( is_async=is_async, ) scoring_tasks.extend( - [metric.ascore(data_row=row, callbacks=row_group_cm) for metric in metrics] + [metric.ascore(row=row, callbacks=row_group_cm) for metric in metrics] ) row_chains.append(row_rm) # run the evaluation tasks diff --git a/src/ragas/metrics/_answer_similarity.py b/src/ragas/metrics/_answer_similarity.py index 860e58e4d..803458774 100644 --- a/src/ragas/metrics/_answer_similarity.py +++ b/src/ragas/metrics/_answer_similarity.py @@ -60,26 +60,20 @@ def __post_init__(self: t.Self): def init_model(self): super().init_model() - if isinstance(self.embeddings, OpenAIEmbeddings): - if self.embeddings.openai_api_key == "no-key": - raise OpenAIKeyNotFound - - def _score_batch( - self: t.Self, - dataset: Dataset, - callbacks: t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list[float]: - ground_truths, answers = dataset["ground_truths"], dataset["answer"] + async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks = []) -> float: + ground_truths, answers = row["ground_truths"], row["answer"] + # why? ground_truths = [item[0] for item in ground_truths] - if self.is_cross_encoder: - assert isinstance(self.embeddings, HuggingfaceEmbeddings) - inputs = [list(item) for item in list(zip(ground_truths, answers))] - scores = np.array(self.embeddings.predict(inputs)) + if self.is_cross_encoder and isinstance(self.embeddings, HuggingfaceEmbeddings): + raise NotImplementedError( + "async score [ascore()] not implemented for HuggingFace embeddings" + ) else: - embeddings_1 = np.array(self.embeddings.embed_documents(ground_truths)) - embeddings_2 = np.array(self.embeddings.embed_documents(answers)) + embeddings_1 = np.array( + await self.embeddings.aembed_documents(ground_truths) + ) + embeddings_2 = np.array(await self.embeddings.aembed_documents(answers)) similarity = embeddings_1 @ embeddings_2.T scores = np.diagonal(similarity) @@ -87,7 +81,7 @@ def _score_batch( if self.threshold: scores = scores >= self.threshold # type: ignore - return scores.tolist() + return scores.tolist()[0] answer_similarity = AnswerSimilarity() diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py index 328623338..5f98e9438 100644 --- a/src/ragas/metrics/_context_precision.py +++ b/src/ragas/metrics/_context_precision.py @@ -8,6 +8,7 @@ from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate +from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM from ragas.utils import json_loader @@ -54,65 +55,48 @@ class ContextPrecision(MetricWithLLM): evaluation_mode: EvaluationMode = EvaluationMode.qc # type: ignore batch_size: int = 15 - def _score_batch( + async def _ascore( self: t.Self, - dataset: Dataset, + row: t.Dict, callbacks: t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list: - prompts = [] - questions, contexts = dataset["question"], dataset["contexts"] - - cb = CallbackManager.configure(inheritable_callbacks=callbacks) - with trace_as_chain_group( - callback_group_name, callback_manager=cb - ) as batch_group: - for qstn, ctx in zip(questions, contexts): - human_prompts 
= [ - ChatPromptTemplate.from_messages( - [CONTEXT_PRECISION.format(question=qstn, context=c)] - ) - for c in ctx - ] - - prompts.extend(human_prompts) - - responses: list[list[str]] = [] - results = self.llm.generate( - prompts, + ) -> float: + assert self.llm is not None, "LLM is not set" + question, contexts = row["question"], row["contexts"] + + human_prompts = [ + ChatPromptTemplate.from_messages( + [CONTEXT_PRECISION.format(question=question, context=c)] + ) + for c in contexts + ] + + responses: list[str] = [] + for hp in human_prompts: + result = await self.llm.agenerate_text( + Prompt(chat_prompt_template=hp), n=1, - callbacks=batch_group, + callbacks=callbacks, ) - responses = [[i.text for i in r] for r in results.generations] - context_lens = [len(ctx) for ctx in contexts] - context_lens.insert(0, 0) - context_lens = np.cumsum(context_lens) - grouped_responses = [ - responses[start:end] - for start, end in zip(context_lens[:-1], context_lens[1:]) + responses.append(result.generations[0][0].text) + + score = np.nan + response = [json_loader.safe_load(item, self.llm) for item in responses] + response = [ + int("yes" in resp.get("verdict", " ").lower()) + if resp.get("verdict") + else np.nan + for resp in response + ] + denominator = sum(response) + 1e-10 + numerator = sum( + [ + (sum(response[: i + 1]) / (i + 1)) * response[i] + for i in range(len(response)) ] - scores = [] - - for response in grouped_responses: - response = [ - json_loader.safe_load(item, self.llm) for item in sum(response, []) - ] - response = [ - int("yes" in resp.get("verdict", " ").lower()) - if resp.get("verdict") - else np.nan - for resp in response - ] - denominator = sum(response) + 1e-10 - numerator = sum( - [ - (sum(response[: i + 1]) / (i + 1)) * response[i] - for i in range(len(response)) - ] - ) - scores.append(numerator / denominator) - - return scores + ) + score = numerator / denominator + + return score context_precision = ContextPrecision() diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index 79f684d57..613ea62f9 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -8,6 +8,7 @@ from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate +from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM from ragas.utils import json_loader @@ -84,55 +85,51 @@ class ContextRecall(MetricWithLLM): evaluation_mode: EvaluationMode = EvaluationMode.qcg # type: ignore batch_size: int = 15 - def _score_batch( + async def _ascore( self: t.Self, - dataset: Dataset, + row: t.Dict, callbacks: t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list: - prompts = [] - question, ground_truths, contexts = ( - dataset["question"], - dataset["ground_truths"], - dataset["contexts"], + ) -> float: + assert self.llm is not None, "LLM is not set" + + question, ground_truth, contexts = ( + row["question"], + row["ground_truths"], + row["contexts"], + ) + + ground_truth = ( + "\n".join(ground_truth) if isinstance(ground_truth, list) else ground_truth + ) + contexts = "\n".join(contexts) if isinstance(contexts, list) else contexts + human_prompt = CONTEXT_RECALL_RA.format( + question=question, context=contexts, answer=ground_truth + ) + p = Prompt( + chat_prompt_template=ChatPromptTemplate.from_messages([human_prompt]) ) - cb = CallbackManager.configure(inheritable_callbacks=callbacks) - 
with trace_as_chain_group( - callback_group_name, callback_manager=cb - ) as batch_group: - for qstn, gt, ctx in zip(question, ground_truths, contexts): - gt = "\n".join(gt) if isinstance(gt, list) else gt - ctx = "\n".join(ctx) if isinstance(ctx, list) else ctx - human_prompt = CONTEXT_RECALL_RA.format( - question=qstn, context=ctx, answer=gt - ) - prompts.append(ChatPromptTemplate.from_messages([human_prompt])) - - responses: list[list[str]] = [] - results = self.llm.generate( - prompts, - n=1, - callbacks=batch_group, - ) - responses = [[i.text for i in r] for r in results.generations] - scores = [] - for response in responses: - response = json_loader.safe_load(response[0], self.llm) - if response: - response = [ - int(item.get("Attributed", "").lower() == "yes") - if item.get("Attributed") - else np.nan - for item in response - ] - denom = len(response) - numerator = sum(response) - scores.append(numerator / denom) - else: - scores.append(np.nan) - - return scores + results = await self.llm.agenerate_text( + p, + n=1, + callbacks=callbacks, + ) + response = results.generations[0][0].text + response = json_loader.safe_load(response, self.llm) + if response: + response = [ + int(item.get("Attributed", "").lower() == "yes") + if item.get("Attributed") + else np.nan + for item in response + ] + denom = len(response) + numerator = sum(response) + score = numerator / denom + else: + score = np.nan + + return score context_recall = ContextRecall() diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 2ad1f45ec..8a703acaf 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -126,16 +126,16 @@ class Faithfulness(MetricWithLLM): evaluation_mode: EvaluationMode = EvaluationMode.qac # type: ignore batch_size: int = 15 - async def _ascore(self: t.Self, data_row: t.Dict, callbacks: Callbacks) -> float: + async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: """ returns the NLI score for each (q, c, a) pair """ assert self.llm is not None, "LLM is not set" question, answer, contexts = ( - data_row["question"], - data_row["answer"], - data_row["contexts"], + row["question"], + row["answer"], + row["contexts"], ) # extract statements from answer given the question diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index fa1ad7872..116c4fe38 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -51,20 +51,18 @@ def init_model(self): def score( self: t.Self, - data_row: t.Dict, + row: t.Dict, callbacks: t.Optional[Callbacks] = None, ) -> float: raise NotImplemented - async def ascore( - self: t.Self, data_row: t.Dict, callbacks: Callbacks = [] - ) -> float: + async def ascore(self: t.Self, row: t.Dict, callbacks: Callbacks = []) -> float: if isinstance(callbacks, list): cm = CallbackManager.configure(inheritable_callbacks=callbacks) else: cm = t.cast(CallbackManager, callbacks) - rm = cm.on_chain_start({"name": self.name}, data_row) + rm = cm.on_chain_start({"name": self.name}, row) child_cm = rm.get_child() group_cm = CallbackManagerForChainGroup( child_cm.handlers, @@ -77,7 +75,7 @@ async def ascore( inheritable_metadata=child_cm.inheritable_metadata, ) try: - score = await self._ascore(data_row=data_row, callbacks=group_cm) + score = await self._ascore(row=row, callbacks=group_cm) except Exception as e: if not group_cm.ended: rm.on_chain_error(e) @@ -88,7 +86,7 @@ async def ascore( return score @abstractmethod - async def _ascore(self, data_row: t.Dict, callbacks: 
Callbacks = []) -> float: + async def _ascore(self, row: t.Dict, callbacks: Callbacks = []) -> float: ... From 8cf84fc3370e3c2b2b0b598048bce9746d9530f2 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Tue, 19 Dec 2023 11:41:15 +0530 Subject: [PATCH 07/34] executor tested --- src/ragas/evaluation.py | 82 ++++++++++++++++++++---------- src/ragas/metrics/_faithfulness.py | 52 ++++++++++++------- src/ragas/metrics/base.py | 37 +++++++------- 3 files changed, 108 insertions(+), 63 deletions(-) diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index b5d0cdad7..07c7c7d43 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -1,14 +1,18 @@ from __future__ import annotations import typing as t +from asyncio import Task +from concurrent.futures import ALL_COMPLETED, ThreadPoolExecutor, wait from dataclasses import dataclass, field import numpy as np from datasets import Dataset, concatenate_datasets +from tqdm import tqdm from ragas._analytics import EvaluationEvent, track from ragas.async_utils import run_async_tasks from ragas.callbacks import new_group +from ragas.embeddings.base import RagasEmbeddings from ragas.llms.base import BaseRagasLLM from ragas.metrics.base import Metric, MetricWithLLM @@ -27,9 +31,11 @@ def evaluate( dataset: Dataset, metrics: list[Metric] | None = None, llm: t.Optional[BaseRagasLLM] = None, + embeddings: t.Optional[RagasEmbeddings] = None, callbacks: Callbacks = [], is_async: bool = True, - column_map: dict[str, str] = {}, + max_workers: t.Optional[int] = None, + column_map: t.Dict[str, str] = {}, ) -> Result: """ Run the evaluation on the dataset with different metrics @@ -122,33 +128,55 @@ def evaluate( # initialize all the models in the metrics [m.init_model() for m in metrics] - for i, row in enumerate(dataset): - row_rm, row_group_cm = new_group( - name=f"row {i}", - inputs=row, - callbacks=evaluation_group_cm, - is_async=is_async, - ) - scoring_tasks.extend( - [metric.ascore(row=row, callbacks=row_group_cm) for metric in metrics] - ) - row_chains.append(row_rm) - # run the evaluation tasks - try: - results = run_async_tasks( - scoring_tasks, show_progress=True, progress_bar_desc="evaluating dataset" - ) - # TODO: closing row chains here. handle errors here too - [chain.on_chain_end({}) for chain in row_chains] - - # run evaluation task - except Exception as e: - if not evaluation_group_cm.ended: - evaluation_rm.on_chain_error(e) - raise e + if is_async: + for i, row in enumerate(dataset): + row_rm, row_group_cm = new_group( + name=f"row {i}", + inputs=row, + callbacks=evaluation_group_cm, + is_async=is_async, + ) + scoring_tasks.extend( + [metric.ascore(row=row, callbacks=row_group_cm) for metric in metrics] + ) + row_chains.append(row_rm) + # run the evaluation tasks + try: + results = run_async_tasks( + scoring_tasks, + show_progress=True, + progress_bar_desc="evaluating dataset", + ) + # TODO: closing row chains here. 
handle errors here too + [chain.on_chain_end({}) for chain in row_chains] + + # run evaluation task + except Exception as e: + if not evaluation_group_cm.ended: + evaluation_rm.on_chain_error(e) + raise e + else: + if not evaluation_group_cm.ended: + evaluation_rm.on_chain_end({}) + results = [] else: - if not evaluation_group_cm.ended: - evaluation_rm.on_chain_end({}) + with ThreadPoolExecutor(max_workers=max_workers) as executor: + for i, row in enumerate(dataset): + row_rm, row_group_cm = new_group( + name=f"row {i}", + inputs=row, + callbacks=evaluation_group_cm, + is_async=is_async, + ) + for metric in metrics: + future_result = executor.submit(metric.score, row, row_group_cm) + scoring_tasks.append(future_result) + row_chains.append(row_rm) + + # wait for results + results = [] + for future_result in tqdm(scoring_tasks): + results.append(future_result.result()) return results diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 8a703acaf..99fff99c5 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -13,7 +13,8 @@ if t.TYPE_CHECKING: from datasets import Dataset - from langchain.callbacks.base import Callbacks + from langchain_core.callbacks import Callbacks + from langchain_core.outputs import LLMResult LONG_FORM_ANSWER_PROMPT = HumanMessagePromptTemplate.from_template( @@ -126,30 +127,23 @@ class Faithfulness(MetricWithLLM): evaluation_mode: EvaluationMode = EvaluationMode.qac # type: ignore batch_size: int = 15 - async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: - """ - returns the NLI score for each (q, c, a) pair - """ - assert self.llm is not None, "LLM is not set" - - question, answer, contexts = ( - row["question"], - row["answer"], - row["contexts"], - ) + def _create_answer_prompt(self, row: t.Dict) -> Prompt: + question, answer = row["question"], row["answer"] # extract statements from answer given the question human_prompt = LONG_FORM_ANSWER_PROMPT.format(question=question, answer=answer) p = Prompt( chat_prompt_template=ChatPromptTemplate.from_messages([human_prompt]) ) - result = await self.llm.agenerate_text(p, callbacks=callbacks) + return p + def _create_nli_prompt(self, row: t.Dict, answer_result: LLMResult) -> Prompt: + contexts = row["contexts"] # check if the statements are support in the contexts contexts_str: str = "\n".join(contexts) - statements = json_loader.safe_load(result.generations[0][0].text, self.llm).get( - "statements", [] - ) + statements = json_loader.safe_load( + answer_result.generations[0][0].text, self.llm + ).get("statements", []) statements = statements if statements != [] else ["Nil"] statements_str: str = "\n".join( [f"statement_{i+1}: {st}" for i, st in enumerate(statements)] @@ -160,8 +154,9 @@ async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: p = Prompt( chat_prompt_template=ChatPromptTemplate.from_messages([human_prompt]) ) - result = await self.llm.agenerate_text(p, callbacks=callbacks) + return p + def _compute_score(self, result: LLMResult): # check the verdicts and compute the score output = result.generations[0][0] verdict_score_map = {"yes": 1, "no": 0, "null": np.nan} @@ -179,5 +174,28 @@ async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: return score + async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: + """ + returns the NLI score for each (q, c, a) pair + """ + assert self.llm is not None, "LLM is not set" + p = self._create_answer_prompt(row) + 
result = await self.llm.agenerate_text(p, callbacks=callbacks) + + p = self._create_nli_prompt(row, result) + result = await self.llm.agenerate_text(p, callbacks=callbacks) + + return self._compute_score(result) + + def _score(self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "LLM is not set" + p = self._create_answer_prompt(row) + result = self.llm.generate_text(p, callbacks=callbacks) + + p = self._create_nli_prompt(row, result) + result = self.llm.generate_text(p, callbacks=callbacks) + + return self._compute_score(result) + faithfulness = Faithfulness() diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index 116c4fe38..f539ef8f6 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -16,6 +16,7 @@ from langchain_core.callbacks import CallbackManager, CallbackManagerForChainGroup from tqdm import tqdm +from ragas.callbacks import new_group from ragas.embeddings.base import RagasEmbeddings from ragas.llms import llm_factory @@ -52,28 +53,26 @@ def init_model(self): def score( self: t.Self, row: t.Dict, - callbacks: t.Optional[Callbacks] = None, + callbacks: Callbacks = [], ) -> float: - raise NotImplemented + rm, group_cm = new_group(self.name, row, callbacks, is_async=False) + try: + score = self._score(row=row, callbacks=group_cm) + except Exception as e: + if not group_cm.ended: + rm.on_chain_error(e) + raise e + else: + if not group_cm.ended: + rm.on_chain_end({"output": score}) + return score + + # @abstractmethod + def _score(self, row: t.Dict, callbacks: Callbacks = []) -> float: + ... async def ascore(self: t.Self, row: t.Dict, callbacks: Callbacks = []) -> float: - if isinstance(callbacks, list): - cm = CallbackManager.configure(inheritable_callbacks=callbacks) - else: - cm = t.cast(CallbackManager, callbacks) - - rm = cm.on_chain_start({"name": self.name}, row) - child_cm = rm.get_child() - group_cm = CallbackManagerForChainGroup( - child_cm.handlers, - child_cm.inheritable_handlers, - child_cm.parent_run_id, - parent_run_manager=rm, - tags=child_cm.tags, - inheritable_tags=child_cm.inheritable_tags, - metadata=child_cm.metadata, - inheritable_metadata=child_cm.inheritable_metadata, - ) + rm, group_cm = new_group(self.name, row, callbacks, is_async=True) try: score = await self._ascore(row=row, callbacks=group_cm) except Exception as e: From 21c1456c52d38d6fad25a26ab646a946deaff929 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Tue, 19 Dec 2023 15:50:15 +0530 Subject: [PATCH 08/34] executor object created --- src/ragas/evaluation.py | 100 ++++++++++++++++++---------------------- src/ragas/executor.py | 47 +++++++++++++++++++ 2 files changed, 91 insertions(+), 56 deletions(-) create mode 100644 src/ragas/executor.py diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index 07c7c7d43..452387077 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -1,7 +1,7 @@ from __future__ import annotations +import asyncio import typing as t -from asyncio import Task from concurrent.futures import ALL_COMPLETED, ThreadPoolExecutor, wait from dataclasses import dataclass, field @@ -13,6 +13,7 @@ from ragas.async_utils import run_async_tasks from ragas.callbacks import new_group from ragas.embeddings.base import RagasEmbeddings +from ragas.executor import Executor from ragas.llms.base import BaseRagasLLM from ragas.metrics.base import Metric, MetricWithLLM @@ -102,6 +103,10 @@ def evaluate( from ragas.llms import llm_factory llm = llm_factory() + if embeddings is None: + from ragas.embeddings.base 
import embedding_factory + + embeddings = embedding_factory() # remap column names from the dataset dataset = remap_column_names(dataset, column_map) @@ -109,14 +114,6 @@ def evaluate( validate_evaluation_modes(dataset, metrics) validate_column_dtypes(dataset) - # new evaluation chain - evaluation_rm, evaluation_group_cm = new_group( - name="ragas evaluation", inputs={}, callbacks=callbacks, is_async=is_async - ) - # list of chains for each row - row_chains = [] - - scoring_tasks = [] binary_metrics = [] for metric in metrics: # if isinstance(metric, AspectCritique): @@ -128,55 +125,46 @@ def evaluate( # initialize all the models in the metrics [m.init_model() for m in metrics] - if is_async: - for i, row in enumerate(dataset): - row_rm, row_group_cm = new_group( - name=f"row {i}", - inputs=row, - callbacks=evaluation_group_cm, - is_async=is_async, - ) - scoring_tasks.extend( - [metric.ascore(row=row, callbacks=row_group_cm) for metric in metrics] - ) - row_chains.append(row_rm) - # run the evaluation tasks - try: - results = run_async_tasks( - scoring_tasks, - show_progress=True, - progress_bar_desc="evaluating dataset", - ) - # TODO: closing row chains here. handle errors here too - [chain.on_chain_end({}) for chain in row_chains] - - # run evaluation task - except Exception as e: - if not evaluation_group_cm.ended: - evaluation_rm.on_chain_error(e) - raise e + executor = Executor(in_async_mode=is_async, max_workers=max_workers) + # new evaluation chain + row_chains = [] + evaluation_rm, evaluation_group_cm = new_group( + name="ragas evaluation", inputs={}, callbacks=callbacks, is_async=is_async + ) + for i, row in enumerate(dataset): + row_rm, row_group_cm = new_group( + name=f"row {i}", + inputs=row, + callbacks=evaluation_group_cm, + is_async=is_async, + ) + row_chains.append(row_rm) + + if is_async: + [executor.submit(metric.ascore, row, row_group_cm) for metric in metrics] else: - if not evaluation_group_cm.ended: - evaluation_rm.on_chain_end({}) - results = [] + [executor.submit(metric.score, row, row_group_cm) for metric in metrics] + + try: + # get the results + if is_async: + # TODO: watch out for nested async loop error + results = asyncio.run(executor.aresults()) + else: + results = executor.results() + + # TODO: closing row chains here. 
handle errors here too + # and parse results so that its easier to view + [chain.on_chain_end({}) for chain in row_chains] + + # run evaluation task + except Exception as e: + if not evaluation_group_cm.ended: + evaluation_rm.on_chain_error(e) + raise e else: - with ThreadPoolExecutor(max_workers=max_workers) as executor: - for i, row in enumerate(dataset): - row_rm, row_group_cm = new_group( - name=f"row {i}", - inputs=row, - callbacks=evaluation_group_cm, - is_async=is_async, - ) - for metric in metrics: - future_result = executor.submit(metric.score, row, row_group_cm) - scoring_tasks.append(future_result) - row_chains.append(row_rm) - - # wait for results - results = [] - for future_result in tqdm(scoring_tasks): - results.append(future_result.result()) + if not evaluation_group_cm.ended: + evaluation_rm.on_chain_end({}) return results diff --git a/src/ragas/executor.py b/src/ragas/executor.py new file mode 100644 index 000000000..e16c4f888 --- /dev/null +++ b/src/ragas/executor.py @@ -0,0 +1,47 @@ +import asyncio +import typing as t +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass, field + +from tqdm.auto import tqdm + + +@dataclass +class Executor: + in_async_mode: bool = True + max_workers: t.Optional[int] = None + futures: t.List[t.Any] = field(default_factory=list, repr=False) + raise_exceptions: bool = False + + def __post_init__(self): + if self.in_async_mode: + self.executor = asyncio.get_event_loop() + else: + self.executor = ThreadPoolExecutor(max_workers=self.max_workers) + + def _validation_for_mode(self): + if self.in_async_mode and self.max_workers is not None: + raise ValueError( + "Cannot evaluate with both async and threads. Either set is_async=False or max_workers=None." # noqa + ) + + def submit(self, callable: t.Callable, *args, **kwargs): + if self.in_async_mode: + self.executor = t.cast(asyncio.AbstractEventLoop, self.executor) + self.futures.append(self.executor.create_task(callable(*args, **kwargs))) + else: + self.executor = t.cast(ThreadPoolExecutor, self.executor) + self.futures.append(self.executor.submit(callable, *args, **kwargs)) + + async def aresults(self) -> t.List[t.Any]: + results = [] + for future in tqdm(self.futures, desc="Evaluating"): + results.append(await future) + + return results + + def results(self) -> t.List[t.Any]: + results = [] + for future in tqdm(self.futures, desc="Evaluating"): + results.append(future.result()) + return results From a779aeadd2c9ebc19d29a2177e85eb398bdc83f0 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Tue, 19 Dec 2023 18:53:26 +0530 Subject: [PATCH 09/34] error handling for both --- src/ragas/evaluation.py | 12 ++++----- src/ragas/executor.py | 59 ++++++++++++++++++++++++++++++++++------- 2 files changed, 56 insertions(+), 15 deletions(-) diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index 452387077..2812d1e8c 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -36,6 +36,7 @@ def evaluate( callbacks: Callbacks = [], is_async: bool = True, max_workers: t.Optional[int] = None, + raise_exceptions: bool = True, column_map: t.Dict[str, str] = {}, ) -> Result: """ @@ -125,7 +126,9 @@ def evaluate( # initialize all the models in the metrics [m.init_model() for m in metrics] - executor = Executor(in_async_mode=is_async, max_workers=max_workers) + executor = Executor( + is_async=is_async, max_workers=max_workers, raise_exceptions=raise_exceptions + ) # new evaluation chain row_chains = [] evaluation_rm, evaluation_group_cm = new_group( @@ -147,11 +150,7 
@@ def evaluate( try: # get the results - if is_async: - # TODO: watch out for nested async loop error - results = asyncio.run(executor.aresults()) - else: - results = executor.results() + results = executor.results() # TODO: closing row chains here. handle errors here too # and parse results so that its easier to view @@ -161,6 +160,7 @@ def evaluate( except Exception as e: if not evaluation_group_cm.ended: evaluation_rm.on_chain_error(e) + raise e else: if not evaluation_group_cm.ended: diff --git a/src/ragas/executor.py b/src/ragas/executor.py index e16c4f888..cf8d0e422 100644 --- a/src/ragas/executor.py +++ b/src/ragas/executor.py @@ -3,45 +3,86 @@ from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass, field +import numpy as np +from sqlalchemy import except_ from tqdm.auto import tqdm @dataclass class Executor: - in_async_mode: bool = True + is_async: bool = True max_workers: t.Optional[int] = None futures: t.List[t.Any] = field(default_factory=list, repr=False) raise_exceptions: bool = False + _is_new_eventloop: bool = False def __post_init__(self): - if self.in_async_mode: - self.executor = asyncio.get_event_loop() + if self.is_async: + try: + self.executor = asyncio.get_running_loop() + except RuntimeError: + self.executor = asyncio.new_event_loop() + self._is_new_eventloop = True else: self.executor = ThreadPoolExecutor(max_workers=self.max_workers) def _validation_for_mode(self): - if self.in_async_mode and self.max_workers is not None: + if self.is_async and self.max_workers is not None: raise ValueError( "Cannot evaluate with both async and threads. Either set is_async=False or max_workers=None." # noqa ) def submit(self, callable: t.Callable, *args, **kwargs): - if self.in_async_mode: + if self.is_async: self.executor = t.cast(asyncio.AbstractEventLoop, self.executor) self.futures.append(self.executor.create_task(callable(*args, **kwargs))) else: self.executor = t.cast(ThreadPoolExecutor, self.executor) self.futures.append(self.executor.submit(callable, *args, **kwargs)) - async def aresults(self) -> t.List[t.Any]: + async def _aresults(self) -> t.List[t.Any]: results = [] for future in tqdm(self.futures, desc="Evaluating"): - results.append(await future) + r = np.nan + try: + r = await future + except Exception as e: + if self.raise_exceptions: + raise e + results.append(r) return results def results(self) -> t.List[t.Any]: results = [] - for future in tqdm(self.futures, desc="Evaluating"): - results.append(future.result()) + if self.is_async: + self.executor = t.cast(asyncio.AbstractEventLoop, self.executor) + try: + if self._is_new_eventloop: + results = self.executor.run_until_complete(self._aresults()) + + # event loop is running use nested_asyncio to hijack the event loop + else: + import nest_asyncio + + nest_asyncio.apply() + results = self.executor.run_until_complete(self._aresults()) + finally: + [f.cancel() for f in self.futures] + + else: + self.executor = t.cast(ThreadPoolExecutor, self.executor) + try: + for future in tqdm(self.futures, desc="Evaluating"): + r = np.nan + try: + r = future.result() + except Exception as e: + r = np.nan + if self.raise_exceptions: + raise e + finally: + results.append(r) + finally: + self.executor.shutdown(wait=False) return results From 44558d9b7bea07f28f4d442f970cf1c5d6bf6982 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Tue, 19 Dec 2023 19:51:52 +0530 Subject: [PATCH 10/34] fixed up BaseRagasLLM and BaseRagasEmbeddings --- src/ragas/embeddings/__init__.py | 4 +- src/ragas/embeddings/base.py | 16 
+++----- src/ragas/evaluation.py | 4 +- src/ragas/llms/base.py | 28 ++++++++++++- src/ragas/metrics/_answer_relevance.py | 4 +- src/ragas/metrics/_answer_similarity.py | 25 ++++++++++-- src/ragas/metrics/_context_precision.py | 53 ++++++++++++++++++------- 7 files changed, 99 insertions(+), 35 deletions(-) diff --git a/src/ragas/embeddings/__init__.py b/src/ragas/embeddings/__init__.py index 1eba70cdc..3c2d5f083 100644 --- a/src/ragas/embeddings/__init__.py +++ b/src/ragas/embeddings/__init__.py @@ -1,13 +1,13 @@ from ragas.embeddings.base import ( AzureOpenAIEmbeddings, + BaseRagasEmbeddings, HuggingfaceEmbeddings, OpenAIEmbeddings, - RagasEmbeddings, ) __all__ = [ "HuggingfaceEmbeddings", "OpenAIEmbeddings", "AzureOpenAIEmbeddings", - "RagasEmbeddings", + "BaseRagasEmbeddings", ] diff --git a/src/ragas/embeddings/base.py b/src/ragas/embeddings/base.py index 19ee8c38c..136a02398 100644 --- a/src/ragas/embeddings/base.py +++ b/src/ragas/embeddings/base.py @@ -17,15 +17,11 @@ DEFAULT_MODEL_NAME = "BAAI/bge-small-en-v1.5" -class RagasEmbeddings(Embeddings): - def validate_api_key(self): - """ - Validates that the api key is set for the Embeddings - """ - pass +class BaseRagasEmbeddings(Embeddings): + ... -class OpenAIEmbeddings(BaseOpenAIEmbeddings, RagasEmbeddings): +class OpenAIEmbeddings(BaseOpenAIEmbeddings, BaseRagasEmbeddings): api_key: str = NO_KEY def __init__(self, api_key: str = NO_KEY): @@ -47,7 +43,7 @@ def validate_api_key(self): raise OpenAIKeyNotFound -class AzureOpenAIEmbeddings(BaseAzureOpenAIEmbeddings, RagasEmbeddings): +class AzureOpenAIEmbeddings(BaseAzureOpenAIEmbeddings, BaseRagasEmbeddings): azure_endpoint: t.Optional[str] = None deployment: t.Optional[str] = None api_version: t.Optional[str] = None @@ -85,7 +81,7 @@ def validate_api_key(self): @dataclass -class HuggingfaceEmbeddings(RagasEmbeddings): +class HuggingfaceEmbeddings(BaseRagasEmbeddings): model_name: str = DEFAULT_MODEL_NAME """Model name to use.""" cache_folder: t.Optional[str] = None @@ -159,6 +155,6 @@ def predict(self, texts: List[List[str]]) -> List[List[float]]: return predictions.tolist() -def embedding_factory() -> RagasEmbeddings: +def embedding_factory() -> BaseRagasEmbeddings: openai_embeddings = OpenAIEmbeddings() return openai_embeddings diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index 2812d1e8c..8434f921e 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -12,7 +12,7 @@ from ragas._analytics import EvaluationEvent, track from ragas.async_utils import run_async_tasks from ragas.callbacks import new_group -from ragas.embeddings.base import RagasEmbeddings +from ragas.embeddings.base import BaseRagasEmbeddings from ragas.executor import Executor from ragas.llms.base import BaseRagasLLM from ragas.metrics.base import Metric, MetricWithLLM @@ -32,7 +32,7 @@ def evaluate( dataset: Dataset, metrics: list[Metric] | None = None, llm: t.Optional[BaseRagasLLM] = None, - embeddings: t.Optional[RagasEmbeddings] = None, + embeddings: t.Optional[BaseRagasEmbeddings] = None, callbacks: Callbacks = [], is_async: bool = True, max_workers: t.Optional[int] = None, diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py index 8027e83ef..aedf5e7d4 100644 --- a/src/ragas/llms/base.py +++ b/src/ragas/llms/base.py @@ -1,6 +1,7 @@ from __future__ import annotations import typing as t +from abc import ABC, abstractmethod from dataclasses import dataclass from langchain.chat_models import AzureChatOpenAI, BedrockChat, ChatOpenAI, ChatVertexAI @@ -34,7 +35,32 @@ def 
is_multiple_completion_supported(llm: BaseLanguageModel) -> bool: @dataclass -class BaseRagasLLM: +class BaseRagasLLM(ABC): + @abstractmethod + def generate_text( + self, + prompt: Prompt, + n: int = 1, + temperature: float = 1e-8, + stop: t.Optional[t.List[str]] = None, + callbacks: t.Optional[Callbacks] = None, + ) -> LLMResult: + ... + + @abstractmethod + async def agenerate_text( + self, + prompt: Prompt, + n: int = 1, + temperature: float = 1e-8, + stop: t.Optional[t.List[str]] = None, + callbacks: t.Optional[Callbacks] = None, + ) -> LLMResult: + ... + + +@dataclass +class LangchainLLMWrapper: """ A simple base class for RagasLLMs that is based on Langchain's BaseLanguageModel interface. it implements 2 functions: diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index 9b19ac0c1..6711eecf2 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ b/src/ragas/metrics/_answer_relevance.py @@ -17,7 +17,7 @@ if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks - from ragas.embeddings.base import RagasEmbeddings + from ragas.embeddings.base import BaseRagasEmbeddings QUESTION_GEN = HumanMessagePromptTemplate.from_template( @@ -90,7 +90,7 @@ class AnswerRelevancy(MetricWithLLM): evaluation_mode: EvaluationMode = EvaluationMode.qac # type: ignore batch_size: int = 15 strictness: int = 3 - embeddings: RagasEmbeddings = field(default_factory=embedding_factory) + embeddings: BaseRagasEmbeddings = field(default_factory=embedding_factory) def init_model(self): super().init_model() diff --git a/src/ragas/metrics/_answer_similarity.py b/src/ragas/metrics/_answer_similarity.py index 803458774..dd3cf3b0c 100644 --- a/src/ragas/metrics/_answer_similarity.py +++ b/src/ragas/metrics/_answer_similarity.py @@ -17,7 +17,7 @@ if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks - from ragas.embeddings.base import RagasEmbeddings + from ragas.embeddings.base import BaseRagasEmbeddings @dataclass @@ -45,7 +45,7 @@ class AnswerSimilarity(MetricWithLLM): name: str = "answer_similarity" # type: ignore evaluation_mode: EvaluationMode = EvaluationMode.ga # type: ignore batch_size: int = 15 - embeddings: RagasEmbeddings = field(default_factory=embedding_factory) + embeddings: BaseRagasEmbeddings = field(default_factory=embedding_factory) is_cross_encoder: bool = False threshold: t.Optional[float] = None @@ -60,9 +60,28 @@ def __post_init__(self: t.Self): def init_model(self): super().init_model() + def _score(self, row: t.Dict, callbacks: Callbacks = ...) -> float: + ground_truths, answers = row["ground_truths"], row["answer"] + ground_truths = [item[0] for item in ground_truths] + + if self.is_cross_encoder and isinstance(self.embeddings, HuggingfaceEmbeddings): + raise NotImplementedError( + "async score [ascore()] not implemented for HuggingFace embeddings" + ) + else: + embeddings_1 = np.array(self.embeddings.embed_documents(ground_truths)) + embeddings_2 = np.array(self.embeddings.embed_documents(answers)) + similarity = embeddings_1 @ embeddings_2.T + scores = np.diagonal(similarity) + + assert isinstance(scores, np.ndarray), "Expects ndarray" + if self.threshold: + scores = scores >= self.threshold # type: ignore + + return scores.tolist()[0] + async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks = []) -> float: ground_truths, answers = row["ground_truths"], row["answer"] - # why? 
ground_truths = [item[0] for item in ground_truths] if self.is_cross_encoder and isinstance(self.embeddings, HuggingfaceEmbeddings): diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py index 5f98e9438..b08b6f67a 100644 --- a/src/ragas/metrics/_context_precision.py +++ b/src/ragas/metrics/_context_precision.py @@ -55,12 +55,7 @@ class ContextPrecision(MetricWithLLM): evaluation_mode: EvaluationMode = EvaluationMode.qc # type: ignore batch_size: int = 15 - async def _ascore( - self: t.Self, - row: t.Dict, - callbacks: t.Optional[Callbacks] = None, - ) -> float: - assert self.llm is not None, "LLM is not set" + def _context_precision_prompt(self, row: t.Dict) -> t.List[Prompt]: question, contexts = row["question"], row["contexts"] human_prompts = [ @@ -69,16 +64,9 @@ async def _ascore( ) for c in contexts ] + return [Prompt(chat_prompt_template=hp) for hp in human_prompts] - responses: list[str] = [] - for hp in human_prompts: - result = await self.llm.agenerate_text( - Prompt(chat_prompt_template=hp), - n=1, - callbacks=callbacks, - ) - responses.append(result.generations[0][0].text) - + def _calculate_average_precision(self, responses: t.List[str]) -> float: score = np.nan response = [json_loader.safe_load(item, self.llm) for item in responses] response = [ @@ -95,7 +83,42 @@ async def _ascore( ] ) score = numerator / denominator + return score + + async def _ascore( + self: t.Self, + row: t.Dict, + callbacks: Callbacks = [], + ) -> float: + assert self.llm is not None, "LLM is not set" + + human_prompts = self._context_precision_prompt(row) + responses: t.List[str] = [] + for hp in human_prompts: + result = await self.llm.agenerate_text( + hp, + n=1, + callbacks=callbacks, + ) + responses.append(result.generations[0][0].text) + + score = self._calculate_average_precision(responses) + return score + + def _score(self, row: t.Dict, callbacks: Callbacks = []) -> float: + assert self.llm is not None, "LLM is not set" + + human_prompts = self._context_precision_prompt(row) + responses: t.List[str] = [] + for hp in human_prompts: + result = self.llm.generate_text( + hp, + n=1, + callbacks=callbacks, + ) + responses.append(result.generations[0][0].text) + score = self._calculate_average_precision(responses) return score From b4e080d0a101f3ddd3bae84e155b108d8a069157 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Tue, 19 Dec 2023 23:27:45 +0530 Subject: [PATCH 11/34] as_complete functionality --- src/ragas/evaluation.py | 21 +++++++++++------ src/ragas/executor.py | 47 ++++++++++++++++++++++++++++++++++----- src/ragas/metrics/base.py | 4 ++-- 3 files changed, 57 insertions(+), 15 deletions(-) diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index 8434f921e..9674a05ee 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -166,7 +166,13 @@ def evaluate( if not evaluation_group_cm.ended: evaluation_rm.on_chain_end({}) - return results + # convert results to dataset_like + scores = [] + for i, _ in enumerate(dataset): + s = {} + for j, m in enumerate(metrics): + s[m.name] = results[len(metrics) * i + j] + scores.append(s) # log the evaluation event metrics_names = [m.name for m in metrics] @@ -179,22 +185,23 @@ def evaluate( ) ) + return scores return Result( - scores=concatenate_datasets(scores, axis=1), - dataset=dataset, + scores=scores, + dataset=dataset.to_dict(), binary_columns=binary_metrics, ) @dataclass class Result(dict): - scores: Dataset - dataset: Dataset | None = None - binary_columns: list[str] = 
field(default_factory=list) + scores: t.List[t.Dict] + dataset: t.List[t.Dict] | None = None + binary_columns: t.List[str] = field(default_factory=list) def __post_init__(self): values = [] - for cn in self.scores.column_names: + for cn in self.scores[0].keys(): value = np.nanmean(self.scores[cn]) self[cn] = value if cn not in self.binary_columns: diff --git a/src/ragas/executor.py b/src/ragas/executor.py index cf8d0e422..422ea2d59 100644 --- a/src/ragas/executor.py +++ b/src/ragas/executor.py @@ -1,9 +1,10 @@ import asyncio import typing as t -from concurrent.futures import ThreadPoolExecutor +from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass, field import numpy as np +from nltk.classify.textcat import re from sqlalchemy import except_ from tqdm.auto import tqdm @@ -32,17 +33,45 @@ def _validation_for_mode(self): "Cannot evaluate with both async and threads. Either set is_async=False or max_workers=None." # noqa ) + def wrap_callable_with_index(self, callable: t.Callable, counter): + def wrapped_callable(*args, **kwargs): + return counter, callable(*args, **kwargs) + + async def wrapped_callable_async(*args, **kwargs): + return counter, await callable(*args, **kwargs) + + if self.is_async: + return wrapped_callable_async + else: + return wrapped_callable + def submit(self, callable: t.Callable, *args, **kwargs): if self.is_async: self.executor = t.cast(asyncio.AbstractEventLoop, self.executor) - self.futures.append(self.executor.create_task(callable(*args, **kwargs))) + callable_with_index = self.wrap_callable_with_index( + callable, len(self.futures) + ) + # is type correct? + callable_with_index = t.cast(t.Callable, callable_with_index) + self.futures.append( + self.executor.create_task(callable_with_index(*args, **kwargs)) + ) else: self.executor = t.cast(ThreadPoolExecutor, self.executor) - self.futures.append(self.executor.submit(callable, *args, **kwargs)) + callable_with_index = self.wrap_callable_with_index( + callable, len(self.futures) + ) + self.futures.append( + self.executor.submit(callable_with_index, *args, **kwargs) + ) async def _aresults(self) -> t.List[t.Any]: results = [] - for future in tqdm(self.futures, desc="Evaluating"): + for future in tqdm( + asyncio.as_completed(self.futures), + desc="Evaluating", + total=len(self.futures), + ): r = np.nan try: r = await future @@ -73,7 +102,11 @@ def results(self) -> t.List[t.Any]: else: self.executor = t.cast(ThreadPoolExecutor, self.executor) try: - for future in tqdm(self.futures, desc="Evaluating"): + for future in tqdm( + as_completed(self.futures), + desc="Evaluating", + total=len(self.futures), + ): r = np.nan try: r = future.result() @@ -85,4 +118,6 @@ def results(self) -> t.List[t.Any]: results.append(r) finally: self.executor.shutdown(wait=False) - return results + + sorted_results = sorted(results, key=lambda x: x[0]) + return [r[1] for r in sorted_results] diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index f539ef8f6..4837ff049 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -17,7 +17,7 @@ from tqdm import tqdm from ragas.callbacks import new_group -from ragas.embeddings.base import RagasEmbeddings +from ragas.embeddings.base import BaseRagasEmbeddings from ragas.llms import llm_factory if t.TYPE_CHECKING: @@ -108,5 +108,5 @@ def init_model(self): if hasattr(self, "embeddings"): # since we are using Langchain Embeddings directly, we need to check this if hasattr(self.embeddings, "validate_api_key"): - self.embeddings = 
t.cast(RagasEmbeddings, self.embeddings) + self.embeddings = t.cast(BaseRagasEmbeddings, self.embeddings) self.embeddings.validate_api_key() From db71c5d811cc1acc8231d89f1cc17fa80ee86363 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 21 Dec 2023 15:19:49 +0530 Subject: [PATCH 12/34] fix BaseRagasLLM --- src/ragas/llms/__init__.py | 4 ++-- src/ragas/llms/base.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ragas/llms/__init__.py b/src/ragas/llms/__init__.py index bb55627f1..71af6e377 100644 --- a/src/ragas/llms/__init__.py +++ b/src/ragas/llms/__init__.py @@ -1,6 +1,6 @@ from langchain.chat_models import ChatOpenAI -from ragas.llms.base import BaseRagasLLM +from ragas.llms.base import BaseRagasLLM, LangchainLLMWrapper from ragas.llms.llamaindex import LlamaIndexLLM __all__ = [ @@ -11,4 +11,4 @@ def llm_factory(model="gpt-3.5-turbo-16k") -> BaseRagasLLM: - return BaseRagasLLM(ChatOpenAI(model=model)) + return LangchainLLMWrapper(ChatOpenAI(model=model)) diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py index aedf5e7d4..81c86d01b 100644 --- a/src/ragas/llms/base.py +++ b/src/ragas/llms/base.py @@ -60,7 +60,7 @@ async def agenerate_text( @dataclass -class LangchainLLMWrapper: +class LangchainLLMWrapper(BaseRagasLLM): """ A simple base class for RagasLLMs that is based on Langchain's BaseLanguageModel interface. it implements 2 functions: From 7161674f49fa769ba38711e27ee3a50c08014f19 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 21 Dec 2023 22:50:11 +0530 Subject: [PATCH 13/34] basic callbacks configured --- src/ragas/callbacks.py | 28 +++++++++++++++++++++++++- src/ragas/evaluation.py | 41 ++++++++++++++++++++++----------------- src/ragas/llms/base.py | 4 ++-- src/ragas/metrics/base.py | 8 ++++++-- 4 files changed, 58 insertions(+), 23 deletions(-) diff --git a/src/ragas/callbacks.py b/src/ragas/callbacks.py index b2a9d6cb2..af97592d8 100644 --- a/src/ragas/callbacks.py +++ b/src/ragas/callbacks.py @@ -1,6 +1,9 @@ import typing as t from langchain_core.callbacks import ( + AsyncCallbackManager, + AsyncCallbackManagerForChainGroup, + AsyncCallbackManagerForChainRun, CallbackManager, CallbackManagerForChainGroup, CallbackManagerForChainRun, @@ -9,7 +12,7 @@ def new_group( - name: str, inputs: t.Dict, callbacks: Callbacks, is_async=True + name: str, inputs: t.Dict, callbacks: Callbacks, is_async=False ) -> t.Tuple[CallbackManagerForChainRun, CallbackManager]: # start evaluation chain if isinstance(callbacks, list): @@ -30,3 +33,26 @@ def new_group( ) return rm, group_cm + + +async def new_async_group( + name: str, inputs: t.Dict, callbacks: Callbacks +) -> t.Tuple[AsyncCallbackManagerForChainRun, AsyncCallbackManagerForChainGroup]: + # start evaluation chain + if isinstance(callbacks, list): + cm = AsyncCallbackManager.configure(inheritable_callbacks=callbacks) + else: + cm = t.cast(AsyncCallbackManager, callbacks) + rm = await cm.on_chain_start({"name": name}, inputs) + child_cm = rm.get_child() + group_cm = AsyncCallbackManagerForChainGroup( + child_cm.handlers, + child_cm.inheritable_handlers, + child_cm.parent_run_id, + parent_run_manager=rm, + tags=child_cm.tags, + inheritable_tags=child_cm.inheritable_tags, + metadata=child_cm.metadata, + inheritable_metadata=child_cm.inheritable_metadata, + ) + return rm, group_cm diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index 9674a05ee..f02d12249 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -7,6 +7,7 @@ import numpy as np from datasets import Dataset, 
concatenate_datasets +from langchain_core.language_models import BaseLanguageModel from tqdm import tqdm from ragas._analytics import EvaluationEvent, track @@ -14,7 +15,7 @@ from ragas.callbacks import new_group from ragas.embeddings.base import BaseRagasEmbeddings from ragas.executor import Executor -from ragas.llms.base import BaseRagasLLM +from ragas.llms.base import BaseRagasLLM, LangchainLLMWrapper from ragas.metrics.base import Metric, MetricWithLLM # from ragas.metrics.critique import AspectCritique @@ -99,11 +100,13 @@ def evaluate( ) metrics = [answer_relevancy, context_precision, faithfulness, context_recall] - + # set the llm and embeddings if llm is None: from ragas.llms import llm_factory llm = llm_factory() + elif isinstance(llm, BaseLanguageModel): + llm = LangchainLLMWrapper(llm) if embeddings is None: from ragas.embeddings.base import embedding_factory @@ -130,7 +133,7 @@ def evaluate( is_async=is_async, max_workers=max_workers, raise_exceptions=raise_exceptions ) # new evaluation chain - row_chains = [] + row_run_managers = [] evaluation_rm, evaluation_group_cm = new_group( name="ragas evaluation", inputs={}, callbacks=callbacks, is_async=is_async ) @@ -141,20 +144,28 @@ def evaluate( callbacks=evaluation_group_cm, is_async=is_async, ) - row_chains.append(row_rm) + row_run_managers.append((row_rm, row_group_cm)) if is_async: [executor.submit(metric.ascore, row, row_group_cm) for metric in metrics] else: [executor.submit(metric.score, row, row_group_cm) for metric in metrics] + scores = [] + # import ipdb; ipdb.set_trace() # fmt: skip try: # get the results results = executor.results() - - # TODO: closing row chains here. handle errors here too - # and parse results so that its easier to view - [chain.on_chain_end({}) for chain in row_chains] + # convert results to dataset_like + for i, _ in enumerate(dataset): + s = {} + for j, m in enumerate(metrics): + s[m.name] = results[len(metrics) * i + j] + scores.append(s) + # close the row chain + row_rm, row_group_cm = row_run_managers[i] + if row_group_cm.ended: + row_rm.on_chain_end(s) # run evaluation task except Exception as e: @@ -162,17 +173,11 @@ def evaluate( evaluation_rm.on_chain_error(e) raise e - else: + finally: + # close the evaluation chain + # TODO: show only aggregate scores if not evaluation_group_cm.ended: - evaluation_rm.on_chain_end({}) - - # convert results to dataset_like - scores = [] - for i, _ in enumerate(dataset): - s = {} - for j, m in enumerate(metrics): - s[m.name] = results[len(metrics) * i + j] - scores.append(s) + evaluation_rm.on_chain_end(scores) # log the evaluation event metrics_names = [m.name for m in metrics] diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py index 81c86d01b..e68191216 100644 --- a/src/ragas/llms/base.py +++ b/src/ragas/llms/base.py @@ -93,7 +93,7 @@ def generate_text( stop=stop, callbacks=callbacks, ) - # make LLMREsult.generation appear as if it was n_completions + # make LLMResult.generation appear as if it was n_completions # note that LLMResult.runs is still a list that represents each run generations = [[g[0] for g in result.generations]] result.generations = generations @@ -122,7 +122,7 @@ async def agenerate_text( stop=stop, callbacks=callbacks, ) - # make LLMREsult.generation appear as if it was n_completions + # make LLMResult.generation appear as if it was n_completions # note that LLMResult.runs is still a list that represents each run generations = [[g[0] for g in result.generations]] result.generations = generations diff --git 
a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index 4837ff049..307df7631 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -55,7 +55,9 @@ def score( row: t.Dict, callbacks: Callbacks = [], ) -> float: - rm, group_cm = new_group(self.name, row, callbacks, is_async=False) + rm, group_cm = new_group( + self.name, inputs=row, callbacks=callbacks, is_async=False + ) try: score = self._score(row=row, callbacks=group_cm) except Exception as e: @@ -72,7 +74,9 @@ def _score(self, row: t.Dict, callbacks: Callbacks = []) -> float: ... async def ascore(self: t.Self, row: t.Dict, callbacks: Callbacks = []) -> float: - rm, group_cm = new_group(self.name, row, callbacks, is_async=True) + rm, group_cm = new_group( + self.name, inputs=row, callbacks=callbacks, is_async=True + ) try: score = await self._ascore(row=row, callbacks=group_cm) except Exception as e: From f8b98f5dfea47a4fded2a85180c32183a2ec7579 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Mon, 1 Jan 2024 15:22:33 +0530 Subject: [PATCH 14/34] fixed results --- src/ragas/evaluation.py | 22 ++++++++++------------ tests/benchmarks/benchmark_eval.py | 27 +++++++++++++++------------ 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index f02d12249..4331244d5 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -152,7 +152,6 @@ def evaluate( [executor.submit(metric.score, row, row_group_cm) for metric in metrics] scores = [] - # import ipdb; ipdb.set_trace() # fmt: skip try: # get the results results = executor.results() @@ -164,7 +163,7 @@ def evaluate( scores.append(s) # close the row chain row_rm, row_group_cm = row_run_managers[i] - if row_group_cm.ended: + if not row_group_cm.ended: row_rm.on_chain_end(s) # run evaluation task @@ -176,8 +175,13 @@ def evaluate( finally: # close the evaluation chain # TODO: show only aggregate scores + result = Result( + scores=Dataset.from_list(scores), + dataset=dataset, + binary_columns=binary_metrics, + ) if not evaluation_group_cm.ended: - evaluation_rm.on_chain_end(scores) + evaluation_rm.on_chain_end(result) # log the evaluation event metrics_names = [m.name for m in metrics] @@ -189,19 +193,13 @@ def evaluate( num_rows=dataset.shape[0], ) ) - - return scores - return Result( - scores=scores, - dataset=dataset.to_dict(), - binary_columns=binary_metrics, - ) + return result @dataclass class Result(dict): - scores: t.List[t.Dict] - dataset: t.List[t.Dict] | None = None + scores: Dataset + dataset: t.Optional[Dataset] = None binary_columns: t.List[str] = field(default_factory=list) def __post_init__(self): diff --git a/tests/benchmarks/benchmark_eval.py b/tests/benchmarks/benchmark_eval.py index 1edb8c4f4..add94ceba 100644 --- a/tests/benchmarks/benchmark_eval.py +++ b/tests/benchmarks/benchmark_eval.py @@ -3,13 +3,7 @@ from datasets import DatasetDict, load_dataset from ragas import evaluate -from ragas.metrics import ( - answer_relevancy, - context_precision, - context_recall, - faithfulness, -) -from ragas.metrics.critique import harmfulness +from ragas.metrics import faithfulness # data ds = load_dataset("explodinggradients/fiqa", "ragas_eval") @@ -17,15 +11,24 @@ fiqa = ds["baseline"] if __name__ == "__main__": + # asyncio start = time.time() _ = evaluate( fiqa, metrics=[ - answer_relevancy, - context_precision, faithfulness, - harmfulness, - context_recall, ], + is_async=True, ) - print(f"Time taken: {time.time() - start:.2f}s") + print(f"Time taken [Asyncio]: {time.time() - start:.2f}s") + 
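
The asyncio and thread-pool timings in this benchmark exercise the two modes of the `Executor` introduced earlier in this series. Below is a minimal, self-contained sketch of the ordering trick that `Executor` relies on: `as_completed` yields results in completion order, so each submitted callable is wrapped to also return its submission index, and the collected results are sorted back into submission order at the end. `wrap_with_index` and `fake_score` are illustrative stand-ins, not names from the patch.

```python
import asyncio
import random


def wrap_with_index(fn, index):
    # same idea as Executor.wrap_callable_with_index (async case): the wrapped
    # coroutine returns (submission_index, result)
    async def wrapped(*args, **kwargs):
        return index, await fn(*args, **kwargs)

    return wrapped


async def fake_score(row_id: int) -> float:
    # stand-in for metric.ascore(row, callbacks); finishes in arbitrary order
    await asyncio.sleep(random.random() / 10)
    return float(row_id)


async def main() -> list:
    tasks = [asyncio.create_task(wrap_with_index(fake_score, i)(i)) for i in range(5)]
    # as_completed yields futures in *completion* order ...
    indexed = [await fut for fut in asyncio.as_completed(tasks)]
    # ... so sort on the wrapped-in index to restore submission (row) order
    indexed.sort(key=lambda pair: pair[0])
    return [score for _, score in indexed]


if __name__ == "__main__":
    print(asyncio.run(main()))  # [0.0, 1.0, 2.0, 3.0, 4.0] regardless of finish order
```

Sorting by the wrapped-in index is what lets the flat results list be mapped back onto dataset rows in order, which is exactly what `evaluate()` does when it rebuilds per-row score dicts.
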
+ # Threads + start = time.time() + _ = evaluate( + fiqa, + metrics=[ + faithfulness, + ], + is_async=False, + ) + print(f"Time taken [Threads]: {time.time() - start:.2f}s") From 8d654c463899a05a6167c080af5ed35effdbd429 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Mon, 1 Jan 2024 15:29:23 +0530 Subject: [PATCH 15/34] merged with main --- docs/_static/js/mendable_chat_bubble.js | 40 ++ docs/concepts/metrics/answer_relevance.md | 4 + docs/conf.py | 9 +- docs/getstarted/evaluation.md | 8 +- docs/howtos/customisations/azure-openai.ipynb | 5 +- docs/howtos/customisations/embeddings.ipynb | 381 ++++++++++++ docs/howtos/customisations/index.md | 1 + docs/howtos/customisations/llms.ipynb | 22 +- .../assesments/metrics_assesments.ipynb | 547 +++++++++++++++++- pyproject.toml | 1 + requirements/dev.txt | 1 + src/.DS_Store | Bin 0 -> 6148 bytes src/ragas/__init__.py | 4 +- src/ragas/async_utils.py | 2 +- src/ragas/embeddings/base.py | 17 + src/ragas/evaluation.py | 2 +- src/ragas/llms/prompt.py | 109 +++- src/ragas/metrics/__init__.py | 9 +- src/ragas/metrics/_answer_correctness.py | 163 +++--- src/ragas/metrics/_answer_relevance.py | 77 +-- src/ragas/metrics/_answer_similarity.py | 1 + src/ragas/metrics/_context_precision.py | 112 +++- src/ragas/metrics/_context_recall.py | 161 +++--- src/ragas/metrics/_context_relevancy.py | 20 +- src/ragas/metrics/_faithfulness.py | 248 ++++---- src/ragas/metrics/critique.py | 52 +- src/ragas/testset/testset_generator.py | 143 +++-- src/ragas/utils.py | 2 +- src/ragas/validation.py | 10 +- tests/unit/test_import.py | 2 +- tests/unit/test_prompt.py | 84 +++ tests/unit/test_validation.py | 4 +- 32 files changed, 1764 insertions(+), 477 deletions(-) create mode 100644 docs/_static/js/mendable_chat_bubble.js create mode 100644 docs/howtos/customisations/embeddings.ipynb create mode 100644 src/.DS_Store create mode 100644 tests/unit/test_prompt.py diff --git a/docs/_static/js/mendable_chat_bubble.js b/docs/_static/js/mendable_chat_bubble.js new file mode 100644 index 000000000..e6b3eeb02 --- /dev/null +++ b/docs/_static/js/mendable_chat_bubble.js @@ -0,0 +1,40 @@ +document.addEventListener("DOMContentLoaded", () => { + function loadScript(src, callback) { + var script = document.createElement("script"); + script.type = "text/javascript"; + script.src = src; + script.onload = callback; // Once script is loaded, callback function will be called + document.head.appendChild(script); + } + + // Load Mendable script and initialize the component once script is loaded + loadScript( + "https://unpkg.com/@mendable/search@0.0.191/dist/umd/mendable-bundle.min.js", + function () { + Mendable.initialize({ + anon_key: "f4cb5493-f914-43a5-8edc-f41463ea5bed", + type: "searchBar", + elementId: "searchbox", + style: { + darkMode: true, + accentColor: "#FECA4B", + backgroundColor: "#0F1629" + }, + searchBarStyle: { + backgroundColor: "#00000000" + }, + showSimpleSearch: true, + messageSettings: { + openSourcesInNewTab: false, + prettySources: true + } + + }); + + var searchForm = document.getElementById('searchbox'); + searchForm.onsubmit = (event) => { + event.preventDefault(); + } + } + ); +}); diff --git a/docs/concepts/metrics/answer_relevance.md b/docs/concepts/metrics/answer_relevance.md index f1e884369..98e68fa63 100644 --- a/docs/concepts/metrics/answer_relevance.md +++ b/docs/concepts/metrics/answer_relevance.md @@ -2,6 +2,10 @@ The evaluation metric, Answer Relevancy, focuses on assessing how pertinent the generated answer is to the given prompt. 
A lower score is assigned to answers that are incomplete or contain redundant information. This metric is computed using the `question` and the `answer`, with values ranging between 0 and 1, where higher scores indicate better relevancy. +:::{note} +This is reference free metric. If you're looking to compare ground truth answer with generated answer refer to [answer_correctness](./answer_correctness.md) +::: + An answer is deemed relevant when it directly and appropriately addresses the original question. Importantly, our assessment of answer relevance does not consider factuality but instead penalizes cases where the answer lacks completeness or contains redundant details. To calculate this score, the LLM is prompted to generate an appropriate question for the generated answer multiple times, and the mean cosine similarity between these generated questions and the original question is measured. The underlying idea is that if the generated answer accurately addresses the initial question, the LLM should be able to generate questions from the answer that align with the original question. ```{hint} diff --git a/docs/conf.py b/docs/conf.py index 7c8c13795..dab9b26ca 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -21,6 +21,7 @@ html_theme = "sphinxawesome_theme" html_static_path = ["_static"] html_css_files = ["css/ragas.css"] +html_js_files = ["js/mendable_chat_bubble.js"] html_favicon = "./_static/favicon.ico" extensions = [ @@ -30,7 +31,7 @@ "sphinx.ext.autosummary", "sphinx_design", "sphinxawesome_theme.highlighting", - "sphinxawesome_theme.docsearch", + # "sphinxawesome_theme.docsearch", "myst_nb", ] @@ -40,9 +41,9 @@ myst_number_code_blocks = ["typescript"] # algolia search -docsearch_app_id = os.getenv("DOCSEARCH_APP_ID") -docsearch_api_key = os.getenv("DOCSEARCH_API_KEY") -docsearch_index_name = os.getenv("DOCSEARCH_INDEX_NAME") +# docsearch_app_id = os.getenv("DOCSEARCH_APP_ID") +# docsearch_api_key = os.getenv("DOCSEARCH_API_KEY") +# docsearch_index_name = os.getenv("DOCSEARCH_INDEX_NAME") # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output diff --git a/docs/getstarted/evaluation.md b/docs/getstarted/evaluation.md index 3e5770060..f21c4b64b 100644 --- a/docs/getstarted/evaluation.md +++ b/docs/getstarted/evaluation.md @@ -64,7 +64,7 @@ from ragas.metrics import ( context_precision, ) ``` -here you can see that we are using 4 metrics, but what do the represent? +here you can see that we are using 4 metrics, but what do they represent? 1. faithfulness - the factual consistency of the answer to the context base on the question. 2. context_precision - a measure of how relevant the retrieved context is to the question. Conveys quality of the retrieval pipeline. @@ -96,9 +96,9 @@ result = evaluate( result ``` -and there you have the it, all the scores you need. `ragas_score` gives you a single metric that you can use while the other onces measure the different parts of your pipeline. +and there you have it, all the scores you need. `ragas_score` gives you a single metric that you can use while 4 metrics individually would measure the different parts of your pipeline. -now if we want to dig into the results and figure out examples where your pipeline performed worse or really good you can easily convert it into a pandas array and use your standard analytics tools too! 
+Now if we want to dig into the results and figure out examples where your pipeline performed worse or really good you can easily convert it into a pandas array and use your standard analytics tools too! ```{code-block} python :caption: export results @@ -111,4 +111,4 @@ df.head() And thats it! -if you have any suggestion/feedbacks/things your not happy about, please do share it in the [issue section](https://github.com/explodinggradients/ragas/issues). We love hearing from you 😁 +If you have any suggestion/feedbacks/things your not happy about, please do share it in the [issue section](https://github.com/explodinggradients/ragas/issues). We love hearing from you 😁 diff --git a/docs/howtos/customisations/azure-openai.ipynb b/docs/howtos/customisations/azure-openai.ipynb index 3857f6a87..93851916c 100644 --- a/docs/howtos/customisations/azure-openai.ipynb +++ b/docs/howtos/customisations/azure-openai.ipynb @@ -158,6 +158,9 @@ "from langchain.embeddings import AzureOpenAIEmbeddings\n", "from ragas.llms import LangchainLLM\n", "\n", + "# Import evaluate before patching the RagasLLM instance\n", + "from ragas import evaluate\n", + "\n", "azure_model = AzureChatOpenAI(\n", " deployment_name=\"your-deployment-name\",\n", " model=\"your-model-name\",\n", @@ -242,8 +245,6 @@ } ], "source": [ - "from ragas import evaluate\n", - "\n", "result = evaluate(\n", " fiqa_eval[\"baseline\"],\n", " metrics=metrics,\n", diff --git a/docs/howtos/customisations/embeddings.ipynb b/docs/howtos/customisations/embeddings.ipynb new file mode 100644 index 000000000..7cc4f9abe --- /dev/null +++ b/docs/howtos/customisations/embeddings.ipynb @@ -0,0 +1,381 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0174eb96", + "metadata": {}, + "source": [ + "# Using different Embedding Models\n", + "\n", + "Ragas allows users to change the default embedding model used in the evaluation task.\n", + "\n", + "This guide will show you how to use different embedding models for evaluation in Ragas." + ] + }, + { + "cell_type": "markdown", + "id": "55f0f9b9", + "metadata": {}, + "source": [ + "## Evaluating with Azure Open AI Embeddings\n", + "\n", + "Ragas uses open-ai embeddings by default. In this example we can use Azure Open AI Embeddings from langchain with the embedding model text-embedding-ada-002. We will be using gpt-35-turbo-16k from Azure OpenAI as the llm for evaluation and `AnswerSimilarity` as the metric\n", + "\n", + "To start-off, we initialise the gpt-35-turbo-16k from Azure and create a chat_model using langchain" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "25c72521-3372-4663-81e4-c159b0edde40", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from langchain.chat_models import AzureChatOpenAI\n", + "from ragas.llms import LangchainLLM\n", + "\n", + "os.environ[\"OPENAI_API_VERSION\"] = \"2023-05-15\"\n", + "os.environ[\"OPENAI_API_KEY\"] = \"your-openai-key\"\n", + "\n", + "azure_model = AzureChatOpenAI(\n", + " deployment_name=\"your-deployment-name\",\n", + " model=\"gpt-35-turbo-16k\",\n", + " azure_endpoint=\"https://your-endpoint.openai.azure.com/\",\n", + " openai_api_type=\"azure\",\n", + ")\n", + "# wrapper around azure_model\n", + "ragas_azure_model = LangchainLLM(azure_model)" + ] + }, + { + "cell_type": "markdown", + "id": "f1fdb48b", + "metadata": {}, + "source": [ + "In order to use the Azure Open AI embedding, we have to instantiate an object of the `AzureOpenAIEmbeddings` class in Ragas." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings import AzureOpenAIEmbeddings\n", + "\n", + "azure_embeddings = AzureOpenAIEmbeddings(\n", + " deployment=\"your-deployment-name\",\n", + " model=\"text-embedding-ada-002\",\n", + " azure_endpoint=\"https://your-endpoint.openai.azure.com/\",\n", + " openai_api_type=\"azure\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "62645da8-6a52-4cbb-bec7-59f7e153cd38", + "metadata": {}, + "source": [ + "To use the AzureOpenAIEmbeddings with the AnswerSimilarity metric, create an object of AnswerSimilarity by passing the azure_embeddings and llm as parameters." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "307321ed", + "metadata": {}, + "outputs": [], + "source": [ + "from ragas.metrics import AnswerSimilarity\n", + "\n", + "answer_similarity = AnswerSimilarity(llm=ragas_azure_model, embeddings=azure_embeddings)" + ] + }, + { + "cell_type": "markdown", + "id": "1930dd49", + "metadata": {}, + "source": [ + "That's it! answer_similarity will now be using AzureOpenAIEmbeddings under the hood for evaluations.\n", + "\n", + "Now lets run the evaluations using the example from [quickstart](../../getstarted/evaluation.md)." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "62c0eadb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetDict({\n", + " baseline: Dataset({\n", + " features: ['question', 'ground_truths', 'answer', 'contexts'],\n", + " num_rows: 30\n", + " })\n", + "})" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# data\n", + "from datasets import load_dataset\n", + "\n", + "fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")\n", + "fiqa_eval" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c4396f6e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evaluating with [answer_similarity]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:01<00:00, 1.04s/it]\n" + ] + }, + { + "data": { + "text/plain": [ + "{'answer_similarity': 0.8878}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# evaluate\n", + "from ragas import evaluate\n", + "\n", + "result = evaluate(\n", + " fiqa_eval[\"baseline\"].select(range(5)), # showing only 5 for demonstration\n", + " metrics=[answer_similarity]\n", + ")\n", + "\n", + "result" + ] + }, + { + "cell_type": "markdown", + "id": "f490031e-fb73-4170-8762-61cadb4031e6", + "metadata": {}, + "source": [ + "## Evaluating with FastEmbed Embeddings\n", + "\n", + "`FastEmbed` is a Python library built for embedding generation and has support for popular text models. Ragas has integration with FastEmbed and can be used by instantiating an object of the FastEmbedEmbeddings class. More information regarding FastEmbed and supported models can be found [here](https://github.com/qdrant/fastembed)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "85e313f2-e45c-4551-ab20-4e526e098740", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 252M/252M [00:10<00:00, 25.0MiB/s] \n" + ] + } + ], + "source": [ + "from ragas.embeddings import FastEmbedEmbeddings\n", + "\n", + "fast_embeddings = FastEmbedEmbeddings(model_name=\"BAAI/bge-base-en\")" + ] + }, + { + "cell_type": "markdown", + "id": "c9ddf74a-9830-4e1a-a4dd-7e5ec17a71e4", + "metadata": {}, + "source": [ + "Now lets create the metric object for AnswerSimilarity by passing the llm and embedding as the `FastEmbedEmbeddings` object that we created." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2fd4adf3-db15-4c95-bf7c-407266517214", + "metadata": {}, + "outputs": [], + "source": [ + "answer_similarity2 = AnswerSimilarity(llm=ragas_azure_model, embeddings=fast_embeddings)" + ] + }, + { + "cell_type": "markdown", + "id": "58a610f2-19e5-40ec-bb7d-760c1d608a85", + "metadata": {}, + "source": [ + "Now you can run the evaluations with and analyse the results." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "20882d05-1b54-4d17-88a0-f7ada2d6a576", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evaluating with [answer_similarity]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:03<00:00, 3.26s/it]\n" + ] + }, + { + "data": { + "text/plain": [ + "{'answer_similarity': 0.8938}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result2 = evaluate(\n", + " fiqa_eval[\"baseline\"].select(range(5)), # showing only 5 for demonstration\n", + " metrics=[answer_similarity2],\n", + ")\n", + "\n", + "result2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluating with HuggingFace Embeddings\n", + "\n", + "Ragas has support for using embedding models using HuggingFace. Using the `HuggingfaceEmbeddings` class in Ragas, the embedding models supported by HuggingFace can directly be used for the evaluation task." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# To use embedding models from HuggingFace, you need to install the following\n", + "%pip install sentence-transformers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from ragas.embeddings import HuggingfaceEmbeddings\n", + "\n", + "hf_embeddings = HuggingfaceEmbeddings(model_name=\"BAAI/bge-small-en\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we follow the same steps as above to use the HuggingFace Embeddings in the ragas metrics and evaluate." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "answer_similarity3 = AnswerSimilarity(llm=ragas_azure_model, embeddings=hf_embeddings)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evaluating with [answer_similarity]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:01<00:00, 1.35s/it]\n" + ] + }, + { + "data": { + "text/plain": [ + "{'answer_similarity': 0.9156}" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result3 = evaluate(\n", + " fiqa_eval[\"baseline\"].select(range(5)), # showing only 5 for demonstration\n", + " metrics=[answer_similarity3],\n", + ")\n", + "\n", + "result3" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/howtos/customisations/index.md b/docs/howtos/customisations/index.md index 0c8e35357..affb8151c 100644 --- a/docs/howtos/customisations/index.md +++ b/docs/howtos/customisations/index.md @@ -4,6 +4,7 @@ How to customize Ragas for your needs :::{toctree} llms.ipynb +embeddings.ipynb azure-openai.ipynb aws-bedrock.ipynb gcp-vertexai.ipynb diff --git a/docs/howtos/customisations/llms.ipynb b/docs/howtos/customisations/llms.ipynb index a5a6fa7e7..81bc50882 100644 --- a/docs/howtos/customisations/llms.ipynb +++ b/docs/howtos/customisations/llms.ipynb @@ -7,12 +7,12 @@ "source": [ "# Bring your own LLMs\n", "\n", - "Ragas uses langchain under the hood for connecting to LLMs for metrices that require them. This means you can swap out the default LLM we use (`gpt-3.5-turbo-16k`) to use any 100s of API supported out of the box with langchain.\n", + "Ragas uses langchain under the hood for connecting to LLMs for metrics that require them. This means you can swap out the default LLM we use (`gpt-3.5-turbo-16k`) with any 100s of API supported out of the box by langchain:\n", "\n", "- [Completion LLMs Supported](https://api.python.langchain.com/en/latest/api_reference.html#module-langchain.llms)\n", "- [Chat based LLMs Supported](https://api.python.langchain.com/en/latest/api_reference.html#module-langchain.chat_models)\n", "\n", - "This guide will show you how to use another or LLM API for evaluation." + "This guide will show you how to use another LLM API for evaluation." ] }, { @@ -32,9 +32,9 @@ "source": [ "## Evaluating with GPT4\n", "\n", - "Ragas uses gpt3.5 by default but using gpt4 for evaluation can improve the results so lets use that for the `Faithfulness` metric\n", + "Ragas uses gpt3.5 by default but using gpt4 for evaluation can improve the results so lets use that for the `Faithfulness` metric.\n", "\n", - "To start-off, we initialise the gpt4 `chat_model` from langchain" + "To start-off, we initialise the gpt4 `chat_model` from langchain." ] }, { @@ -67,12 +67,14 @@ "id": "f1fdb48b", "metadata": {}, "source": [ - "In order to you the Langchain LLM you have to use the `RagasLLM` wrapper. 
This help the Ragas library specify the interfaces that will be used internally by the metrics and what is exposed via the Langchain library. You can also use other LLM APIs in tools like LlamaIndex and LiteLLM but creating your own implementation of `RagasLLM` that supports it." + "`RagasLLM` wrapper is required to use the Langchain LLM. This helps the Ragas library specify the interfaces that will be used internally by the metrics and what is exposed via the Langchain library. \n", + "\n", + "You can also use other LLM APIs from tools like LlamaIndex or LiteLLM but that requires creating your own implementation of `RagasLLM` that supports it." ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "25c72521-3372-4663-81e4-c159b0edde40", "metadata": {}, "outputs": [], @@ -87,7 +89,7 @@ "id": "62645da8-6a52-4cbb-bec7-59f7e153cd38", "metadata": {}, "source": [ - "Substitute the llm in `Metric` instance with the newly create GPT4 model." + "Substitute the `llm` in `Metric` instance with the newly create GPT4 model.\n" ] }, { @@ -107,7 +109,7 @@ "id": "1930dd49", "metadata": {}, "source": [ - "That's it! faithfulness will now be using GPT-4 under the hood for evaluations.\n", + "That's it! `Faithfulness` will now be using GPT-4 under the hood for evaluations.\n", "\n", "Now lets run the evaluations using the example from [quickstart](../quickstart.ipnb)." ] @@ -213,7 +215,7 @@ "source": [ "## Evaluating with Open-Source LLMs\n", "\n", - "You can also use any of the Open-Source LLM for evaluating. Ragas support most the the deployment methods like [HuggingFace TGI](https://python.langchain.com/docs/integrations/llms/huggingface_textgen_inference), [Anyscale](https://python.langchain.com/docs/integrations/llms/anyscale), [vLLM](https://python.langchain.com/docs/integrations/llms/vllm) and many [more](https://python.langchain.com/docs/integrations/llms/) through Langchain. \n", + "You can also use any of the Open-Source LLM for evaluation. Ragas support most the the deployment methods like [HuggingFace TGI](https://python.langchain.com/docs/integrations/llms/huggingface_textgen_inference), [Anyscale](https://python.langchain.com/docs/integrations/llms/anyscale), [vLLM](https://python.langchain.com/docs/integrations/llms/vllm) and many [more](https://python.langchain.com/docs/integrations/llms/) through Langchain. \n", "\n", "When it comes to selecting open-source language models, there are some rules of thumb to follow, given that the quality of evaluation metrics depends heavily on the model's quality:\n", "\n", @@ -310,7 +312,7 @@ "id": "58a610f2-19e5-40ec-bb7d-760c1d608a85", "metadata": {}, "source": [ - "Now you can run the evaluations with and analyse the results." + "Now you can run the evaluations with `HuggingFaceH4/zephyr-7b-alpha` and analyse the results." 
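To make the wrapper pattern this notebook describes concrete, here is a minimal sketch of swapping the evaluation LLM, following the `LangchainLLM` usage shown elsewhere in this patch (the model name and variable names are illustrative):

```{code-block} python
:caption: sketch of swapping the evaluation LLM for a metric
from langchain.chat_models import ChatOpenAI

from ragas.llms import LangchainLLM
from ragas.metrics import faithfulness

# wrap any langchain chat model so ragas metrics can call it
gpt4 = ChatOpenAI(model_name="gpt-4")
gpt4_wrapper = LangchainLLM(llm=gpt4)

# patch the metric instance to use the wrapped model instead of the default
faithfulness.llm = gpt4_wrapper
```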
] }, { diff --git a/experiments/assesments/metrics_assesments.ipynb b/experiments/assesments/metrics_assesments.ipynb index 3721bc69c..e1291acae 100644 --- a/experiments/assesments/metrics_assesments.ipynb +++ b/experiments/assesments/metrics_assesments.ipynb @@ -632,9 +632,7 @@ "cell_type": "code", "execution_count": 139, "id": "e705767d", - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "output = wikiqa_ragas[\"train\"].map(predict_relevance, batched=True, batch_size=10)" @@ -2373,11 +2371,552 @@ "answer_correctness.score_single(data)" ] }, + { + "cell_type": "markdown", + "id": "496bcee8-d173-4ce6-b979-8a783b5911d3", + "metadata": {}, + "source": [ + "## Ragas on multilingual data" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "50b595cf", "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/anaconda3/envs/ragas/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from ragas.metrics import faithfulness, context_precision, context_recall, answer_correctness\n", + "from ragas import evaluate\n", + "from datasets import load_dataset, Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6580b57f-6de6-4f46-9bc7-230eb3d696c9", + "metadata": {}, + "outputs": [], + "source": [ + "from ragas.llms import LangchainLLM\n", + "from langchain.chat_models import BedrockChat\n", + "from langchain.embeddings import BedrockEmbeddings\n", + "\n", + "config = {\n", + " \"credentials_profile_name\": \"default\", # E.g \"default\"\n", + " \"region_name\": \"us-east-1\", # E.g. 
\"us-east-1\"\n", + " \"model_id\": \"anthropic.claude-v2\", # E.g \"anthropic.claude-v2\"\n", + "}\n", + "\n", + "bedrock_model = BedrockChat(\n", + " credentials_profile_name=config[\"credentials_profile_name\"],\n", + " region_name=config[\"region_name\"],\n", + " endpoint_url=f\"https://bedrock-runtime.{config['region_name']}.amazonaws.com\",\n", + " model_id=config[\"model_id\"],\n", + " # model_kwargs=config[\"model_kwargs\"],\n", + ")\n", + "# wrapper around bedrock_model\n", + "ragas_bedrock_model = LangchainLLM(bedrock_model)\n", + "# patch the new RagasLLM instance\n", + "# answer_relevancy.llm = ragas_bedrock_model\n", + "\n", + "# # init and change the embeddings\n", + "# # only for answer_relevancy\n", + "# bedrock_embeddings = BedrockEmbeddings(\n", + "# credentials_profile_name=config[\"credentials_profile_name\"],\n", + "# region_name=config[\"region_name\"],\n", + "# )\n", + "# # embeddings can be used as it is\n", + "# answer_relevancy.embeddings = bedrock_embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4e5b08e6-0269-457e-a4e1-0227f1b889aa", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from ragas.llms import LangchainLLM\n", + "\n", + "openai_model = ChatOpenAI(model_name=\"gpt-3.5-turbo-16k\")\n", + "openai_model = LangchainLLM(llm=openai_model)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "63be86ba-b3a4-478e-9e8c-b6011ad21a8e", + "metadata": {}, + "outputs": [], + "source": [ + "def convert_to(text, language=\"hindi\"):\n", + "\n", + " if isinstance(text, list):\n", + " text = \"\\n\".join(text)\n", + "\n", + " response = llm(f\"convert following into {languge}:{text}\")\n", + " return response\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "77f7ca1f-077a-4733-b9c7-1b220ee6a11b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset csv (/Users/shahules/.cache/huggingface/datasets/csv/default-489a23037feb75f1/0.0.0)\n" + ] + } + ], + "source": [ + "dataset_malayalam = Dataset.from_csv(\"/Users/shahules/Downloads/amnesty_qa_hindi.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e791b170-18e2-48da-98a7-4e26b0074422", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading cached processed dataset at /Users/shahules/.cache/huggingface/datasets/csv/default-489a23037feb75f1/0.0.0/cache-4ca2797a852e8539.arrow\n", + "Loading cached processed dataset at /Users/shahules/.cache/huggingface/datasets/csv/default-489a23037feb75f1/0.0.0/cache-1ad9ae5aa0f2728d.arrow\n" + ] + } + ], + "source": [ + "dataset_malayalam = dataset_malayalam.map(lambda ex: {\"contexts\":eval(ex[\"contexts\"])})\n", + "dataset_malayalam = dataset_malayalam.map(lambda ex: {\"ground_truths\":eval(ex[\"ground_truths\"])})" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "40bc5a92-dfd1-43a6-b2ea-3f9f33c31f35", + "metadata": {}, + "outputs": [], + "source": [ + "def assign_llm(metrics,llm):\n", + "\n", + " for metric in metrics:\n", + " metric.llm = llm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a078eb9c-8e69-406e-b7cc-065bccaa5d04", + "metadata": {}, + "outputs": [], + "source": [ + "ragas.adapt(languages=[\"hindi\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2cbc86c-cef2-4785-b5bd-467d43643747", + "metadata": {}, + "outputs": [], + "source": [ 
+ "NLI_STATEMENTS_MESSAGE_HINDI = HumanMessagePromptTemplate.from_template(\n", + " \"\"\"\n", + " Natural language inference. Only use \"Yes\" or \"No\" as verdict.\n", + " \n", + " Context:\n", + " जॉन XYZ विश्वविद्यालय का छात्र है। वह कंप्यूटर विज्ञान में डिग्री प्राप्त कर रहे हैं। इस सेमेस्टर में उन्होंने डेटा संरचनाएं, एल्गोरिदम, और डेटाबेस प्रबंधन सहित कई पाठ्यक्रमों में नामांकन किया है। जॉन एक मेहनती छात्र है और अध्ययन और असाइनमेंट पूरा करने में काफी समय बिताते हैं। वह अक्सर अपनी परियोजनाओं पर काम करने के लिए देर रात तक पुस्तकालय में रहते हैं।\n", + "\n", + " Statement_1:\n", + " जॉन जीव विज्ञान में मुख्य विषय कर रहे हैं।\n", + "\n", + " Statement_2:\n", + " जॉन कृत्रिम बुद्धिमत्ता पर एक पाठ्यक्रम ले रहे हैं।\n", + "\n", + " Statement_3:\n", + " जॉन एक समर्पित छात्र हैं।\n", + "\n", + " Statement_4:\n", + " जॉन की एक अंशकालिक नौकरी है।\n", + " Answer:\n", + " [\n", + " {{\n", + " \"statement_1\": \"जॉन जीव विज्ञान में मुख्य विषय कर रहे हैं।\",\n", + " \"reason\": \"जॉन का मुख्य विषय कंप्यूटर विज्ञान के रूप में स्पष्ट रूप से उल्लेखित है। इस बारे में कोई जानकारी नहीं है कि वह जीव विज्ञान में मुख्य विषय कर रहे हैं।\",\n", + " \"verdict\": \"No\"\n", + " }},\n", + " {{\n", + " \"statement_2\": \"जॉन कृत्रिम बुद्धिमत्ता पर एक पाठ्यक्रम ले रहे हैं।\",\n", + " \"reason\": \"प्रसंग में जॉन द्वारा वर्तमान में नामांकित किए गए पाठ्यक्रमों का उल्लेख है, और कृत्रिम बुद्धिमत्ता का उल्लेख नहीं है। इसलिए, यह नहीं कहा जा सकता कि जॉन AI पर एक पाठ्यक्रम ले रहे हैं।\",\n", + " \"verdict\": \"No\"\n", + " }},\n", + " {{\n", + " \"statement_3\": \"जॉन एक समर्पित छात्र हैं।\",\n", + " \"reason\": \"प्रसंग बताता है कि वह अध्ययन और असाइनमेंट पूरा करने में काफी समय बिताते हैं और अक्सर अपनी परियोजनाओं पर काम करने के लिए देर रात तक पुस्तकालय में रहते हैं, जो समर्पण को दर्शाता है।\",\n", + " \"verdict\": \"Yes\"\n", + " }},\n", + " {{\n", + " \"statement_4\": \"जॉन की एक अंशकालिक नौकरी है।\",\n", + " \"reason\": \"प्रसंग में जॉन के पास अंशकालिक नौकरी होने के बारे में कोई जानकारी नहीं दी गई है।\",\n", + " \"verdict\": \"No\"\n", + " }}\n", + " ]\n", + "\n", + "\n", + " context:\n", + " {context}\n", + " statements:\n", + " {statements}\n", + " Answer:\n", + " \"\"\"\n", + ")\n", + "\n", + "\n", + "LONG_FORM_ANSWER_PROMPT_HINDI = HumanMessagePromptTemplate.from_template(\n", + " \"\"\"\n", + " Create one or more statements from each sentence in the given answer.\n", + " Question: अल्बर्ट आइंस्टीन कौन थे और उन्हें किस लिए सबसे ज्यादा जाना जाता है?\n", + " Answer: वह एक जर्मन-जन्मे सैद्धांतिक भौतिकविद् थे, जिन्हें सबसे महान और प्रभावशाली भौतिकविज्ञानियों में से एक माना जाता है। वह सापेक्षता के सिद्धांत को विकसित करने के लिए सर्वाधिक प्रसिद्ध थे, उन्होंने क्वांटम यांत्रिकी के सिद्धांत के विकास में भी महत्वपूर्ण योगदान दिया।\n", + " Statements in JSON:\n", + " {{\n", + " \"statements\": [\n", + " \"अल्बर्ट आइंस्टीन का जन्म जर्मनी में हुआ था।\",\n", + " \"अल्बर्ट आइंस्टीन अपने सापेक्षता के सिद्धांत के लिए सर्वाधिक प्रसिद्ध थे।\"\n", + " ]\n", + " }}\n", + " question:{question}\n", + " answer: {answer}\n", + " statements in json: # noqa: E501\n", + " \"\"\"\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7915764e-2649-415f-9ea8-d6fd3bd4fe8a", + "metadata": {}, + "outputs": [], + "source": [ + "# faithfulness.llm = ragas_bedrock_model\n", + "# answer_correctness.llm = ragas_bedrock_model\n", + "# context_recall.llm = ragas_bedrock_model\n", + "# context_precision.llm = ragas_bedrock_model" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": 
"fdcb5bcc-bdbc-43e1-8728-88db47d90cda", + "metadata": {}, + "outputs": [], + "source": [ + "# faithfulness.llm = gpt4_wrapper" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "aec47d09-c1ae-4f5e-82f6-c3c7d083a9cb", + "metadata": {}, + "outputs": [], + "source": [ + "assign_llm([faithfulness],openai_model)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "46db84a1-2396-4399-b8c4-47ccd46677cb", + "metadata": {}, + "outputs": [], + "source": [ + "# faithfulness.llm.llm" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1c44073c-76ac-4316-8eae-7716cfa9dfa9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evaluating with [faithfulness]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "" + ], + "text/plain": [ + " question \\\n", + "0 यूएसए सुप्रीम कोर्ट के गर्भपात पर फैसले के वैश... \n", + "1 कार्बन मेजर्स डेटाबेस के अनुसार, GHG उत्सर्जन ... \n", + "\n", + " ground_truths \\\n", + "0 [यूएसए सुप्रीम कोर्ट के गर्भपात पर फैसले के वै... \n", + "1 [कार्बन मेजर्स डेटाबेस के अनुसार, GHG उत्सर्जन... \n", + "\n", + " answer \\\n", + "0 यूएसए सुप्रीम कोर्ट के गर्भपात पर फैसले के वैश... \n", + "1 fuel industry, are responsible for a significa... \n", + "\n", + " contexts faithfulness \n", + "0 [- 2022 में, अमेरिका के सर्वोच्च न्यायालय ने ए... 1.000000 \n", + "1 [- फॉसिल ईंधन कंपनियां, चाहे वे राज्य स्वामित्... 0.909091 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ragas_score.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e13cca4a-b10a-43b0-baad-3559306ee191", + "metadata": {}, "outputs": [], "source": [] } diff --git a/pyproject.toml b/pyproject.toml index ce2335962..3dd632d6b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,7 @@ dependencies = [ "datasets", "tiktoken", "langchain", + "langchain-core", "openai>1", "pysbd>=0.3.4", "nest-asyncio", diff --git a/requirements/dev.txt b/requirements/dev.txt index 04e22e4d5..aa4246ba0 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -7,3 +7,4 @@ llama_index notebook sphinx-autobuild sentence-transformers +fastembed \ No newline at end of file diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5577d9f9280eeb1683a8b20f42c1f82fe523b022 GIT binary patch literal 6148 zcmeHK%}T>S5Z>*NrWBzEMUM+!3#P3o;w8lT0!H+pQWH{YFlI}VnnNk%sxRc5_&m<+ zZlJ|_6|plg`_0bJZuW!h4`YnGv#`flhcRY>B62j=2)fsXY9<+xt1+^m$fiLogEhna zO&5N9lcmgO6VT=F{|KgWl;yVl$!qmSb8FkOthV*wJ;}n${d|_WetL(!YboQPvi;yD z8W*F^?uATpKT5`_Du}`{q}<;|Nhk|f&XO=wwVn=GtyXK)*`Lo(`&}{U*^922pY`po zI3AoV7AX6++? 
zwS;=a05R~-7~qYeH*{f9=4}18JUnX!=p!f!=9OrGfW2}FfDY~>2P&xJ0&U2143-*k T6!fcdK)MJhLZ~4Ieu056WpYZI literal 0 HcmV?d00001 diff --git a/src/ragas/__init__.py b/src/ragas/__init__.py index 528344549..fb10896f6 100644 --- a/src/ragas/__init__.py +++ b/src/ragas/__init__.py @@ -2,10 +2,8 @@ try: from ._version import version as __version__ - from ._version import version_tuple except ImportError: __version__ = "unknown version" - version_tuple = (0, 0, "unknown version") -__all__ = ["evaluate", "__version__", "version_tuple"] +__all__ = ["evaluate", "__version__"] diff --git a/src/ragas/async_utils.py b/src/ragas/async_utils.py index 05c09765b..c365ac808 100644 --- a/src/ragas/async_utils.py +++ b/src/ragas/async_utils.py @@ -44,5 +44,5 @@ async def _gather() -> List[Any]: # run the operation w/o tqdm on hitting a fatal # may occur in some environments where tqdm.asyncio # is not supported - raise RuntimeError("Fatal error occurred while running async tasks.", e) + raise RuntimeError("Fatal error occurred while running async tasks.", e) from e return outputs diff --git a/src/ragas/embeddings/base.py b/src/ragas/embeddings/base.py index 136a02398..05c7b197c 100644 --- a/src/ragas/embeddings/base.py +++ b/src/ragas/embeddings/base.py @@ -8,6 +8,7 @@ import numpy as np from langchain.embeddings import AzureOpenAIEmbeddings as BaseAzureOpenAIEmbeddings from langchain.embeddings import OpenAIEmbeddings as BaseOpenAIEmbeddings +from langchain.embeddings import FastEmbedEmbeddings as BaseFastEmbedEmbeddings from langchain.schema.embeddings import Embeddings from pydantic.dataclasses import dataclass @@ -42,6 +43,22 @@ def validate_api_key(self): else: raise OpenAIKeyNotFound +class FastEmbedEmbeddings(BaseFastEmbedEmbeddings, RagasEmbeddings): + """ + Find the list of supported models at: + https://qdrant.github.io/fastembed/examples/Supported_Models/ + """ + + model_name: str = DEFAULT_MODEL_NAME + """Model name to use.""" + cache_folder: t.Optional[str] = None + """Path to store models.""" + + def validate_api_key(self): + """ + Validates that the api key is set for the Embeddings + """ + pass class AzureOpenAIEmbeddings(BaseAzureOpenAIEmbeddings, BaseRagasEmbeddings): azure_endpoint: t.Optional[str] = None diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index 4331244d5..813bfb136 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -45,7 +45,7 @@ def evaluate( Parameters ---------- - dataset : Dataset[question: list[str], contexts: list[list[str]], answer: list[str]] + dataset : Dataset[question: list[str], contexts: list[list[str]], answer: list[str], ground_truths: list[list[str]]] The dataset in the format of ragas which the metrics will use to score the RAG pipeline with metrics : list[Metric] , optional diff --git a/src/ragas/llms/prompt.py b/src/ragas/llms/prompt.py index 275900de6..d6df3d05e 100644 --- a/src/ragas/llms/prompt.py +++ b/src/ragas/llms/prompt.py @@ -1,20 +1,111 @@ +from __future__ import annotations + +import json import typing as t -from langchain.prompts import ChatPromptTemplate +from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate from langchain_core.messages import BaseMessage, HumanMessage from langchain_core.prompt_values import PromptValue -from pydantic import Field +from langchain_core.pydantic_v1 import root_validator class Prompt(PromptValue): - chat_prompt_template: ChatPromptTemplate - instruction: t.Optional[str] = None - examples: t.List[t.Dict[str, t.Any]] = Field(default_factory=list, repr=False) - input_keys: 
t.List[str] = Field(default_factory=list, repr=False) - output_keys: t.List[str] = Field(default_factory=list, repr=False) + """ + RagasPrompt is a class that represents a prompt for the ragas metrics. + """ + + instruction: str + examples: t.List[t.Dict[str, t.Any]] = [] + input_keys: t.List[str] + output_key: str + output_type: str = "json" + + @root_validator + def validate_prompt(cls, values: t.Dict[str, t.Any]) -> t.Dict[str, t.Any]: + """ + Validate the template string to ensure that it is in desired format. + """ + if values.get("instruction") is None or values.get("instruction") == "": + raise ValueError("instruction cannot be empty") + if values.get("input_keys") is None or values.get("instruction") == []: + raise ValueError("input_keys cannot be empty") + if values.get("output_key") is None or values.get("output_key") == "": + raise ValueError("output_key cannot be empty") + + if values.get("examples"): + output_key = values["output_key"] + for no, example in enumerate(values["examples"]): + for inp_key in values["input_keys"]: + if inp_key not in example: + raise ValueError( + f"example {no+1} does not have the variable {inp_key} in the definition" + ) + if output_key not in example: + raise ValueError( + f"example {no+1} does not have the variable {output_key} in the definition" + ) + if values["output_type"] == "json": + try: + if output_key in example: + json.loads(example[output_key]) + except ValueError as e: + raise ValueError( + f"{output_key} in example {no+1} is not in valid json format: {e}" + ) + + return values def to_string(self) -> str: - return self.chat_prompt_template.format() + """ + Generate the prompt string from the variables. + """ + prompt_str = self.instruction + "\n" + + # Format the examples to match the Langchain prompt template + for example in self.examples: + for key, value in example.items(): + value = ( + value.replace("{", "{{").replace("}", "}}") + if self.output_type.lower() == "json" + else value + ) + prompt_str += f"\n{key}: {value}" + prompt_str += "\n" + + prompt_str += "".join(f"\n{key}: {{{key}}}" for key in self.input_keys) + prompt_str += f"\n{self.output_key}: \n" + + return prompt_str def to_messages(self) -> t.List[BaseMessage]: - return self.chat_prompt_template.format_messages() + """Return prompt as a list of Messages.""" + return [HumanMessage(content=self.to_string())] + + def get_example_str(self, example_no: int) -> str: + """ + Get the example string from the example number. + """ + if example_no >= len(self.examples): + raise ValueError(f"example number {example_no} is out of range") + example = self.examples[example_no] + example_str = "" + for key, value in example.items(): + value = ( + value.replace("{", "{{").replace("}", "}}") + if self.output_type.lower() == "json" + else value + ) + example_str += f"\n{key}: {value}" + return example_str + + def format(self, **kwargs: t.Any) -> ChatPromptTemplate: + """ + Format the RagasPrompt object into a ChatPromptTemplate object to be used in metrics. 
+ """ + if set(self.input_keys) != set(kwargs.keys()): + raise ValueError( + f"Input variables {self.input_keys} do not match with the given parameters {list(kwargs.keys())}" + ) + prompt = self.to_string() + human_prompt = HumanMessagePromptTemplate.from_template(prompt) + return ChatPromptTemplate.from_messages([human_prompt.format(**kwargs)]) diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index 64c6b93c6..1d60c15f4 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -1,7 +1,12 @@ from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness from ragas.metrics._answer_relevance import AnswerRelevancy, answer_relevancy from ragas.metrics._answer_similarity import AnswerSimilarity, answer_similarity -from ragas.metrics._context_precision import ContextPrecision, context_precision +from ragas.metrics._context_precision import ( + ContextPrecision, + ContextUtilization, + context_precision, + context_utilization, +) from ragas.metrics._context_recall import ContextRecall, context_recall from ragas.metrics._context_relevancy import ContextRelevancy, context_relevancy from ragas.metrics._faithfulness import Faithfulness, faithfulness @@ -31,4 +36,6 @@ "AspectCritique", "ContextRecall", "context_recall", + "ContextUtilization", + "context_utilization", ] diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index 6a959d46d..8a90dcfb9 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -6,48 +6,46 @@ import numpy as np from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate +from ragas.utils import json_loader +from ragas.llms.prompt import Prompt from ragas.metrics._answer_similarity import AnswerSimilarity from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks -CORRECTNESS_PROMPT = HumanMessagePromptTemplate.from_template( - """ -Extract following from given question and ground truth - -Question:What powers the sun and what is its primary function? -Answer: The sun is powered by nuclear fission, similar to nuclear reactors on Earth, and its primary function is to provide light to the solar system. -Ground truth: The sun is actually powered by nuclear fusion, not fission. In its core, hydrogen atoms fuse to form helium, releasing a tremendous amount of energy. This energy is what lights up the sun and provides heat and light, essential for life on Earth. The sun's light also plays a critical role in Earth's climate system and helps to drive the weather and ocean currents. 
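For reference, a minimal sketch of how the new `Prompt` object introduced in `src/ragas/llms/prompt.py` above is meant to be used; the field names follow the class definition in this patch, while the example values are made up for illustration:

```{code-block} python
:caption: illustrative usage of the new Prompt class
from ragas.llms.prompt import Prompt

qa_prompt = Prompt(
    instruction="Answer the question using only the given context.",
    examples=[
        {
            "context": "Paris is the capital of France.",
            "question": "What is the capital of France?",
            "answer": '{"answer": "Paris"}',
        }
    ],
    input_keys=["context", "question"],
    output_key="answer",
    output_type="json",
)

# renders the instruction, the formatted examples and placeholders for the input keys
print(qa_prompt.to_string())

# fills in the input keys and returns a ChatPromptTemplate ready for the LLM call
chat_prompt = qa_prompt.format(
    context="Berlin is the capital of Germany.",
    question="What is the capital of Germany?",
)
```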
-Extracted statements: -[ -{{ - "statements that are present in both the answer and the ground truth": ["The sun's primary function is to provide light"], - "statements present in the answer but not found in the ground truth": ["The sun is powered by nuclear fission", "similar to nuclear reactors on Earth"], - "relevant statements found in the ground truth but omitted in the answer": ["The sun is powered by nuclear fusion, not fission", "In its core, hydrogen atoms fuse to form helium, releasing a tremendous amount of energy", "This energy provides heat and light, essential for life on Earth", "The sun's light plays a critical role in Earth's climate system", "The sun helps to drive the weather and ocean currents"] -}} -] - -Question: What is the boiling point of water? -Answer: The boiling point of water is 100 degrees Celsius at sea level. -Ground truth: The boiling point of water is 100 degrees Celsius (212 degrees Fahrenheit) at sea level, but it can change with altitude. -Extracted statements: -[ - {{ - "statements that are present in both the answer and the ground truth": ["The boiling point of water is 100 degrees Celsius at sea level"], - "statements present in the answer but not found in the ground truth": [], - "relevant statements found in the ground truth but omitted in the answer": ["The boiling point can change with altitude", "The boiling point of water is 212 degrees Fahrenheit at sea level"] - }} -] - - -Question:{question} -Answer: {answer} -Ground truth: {ground_truth} -Extracted statements:""" # noqa: E501 +CORRECTNESS_PROMPT = Prompt( + instruction="""Extract following from given question and ground truth""", + examples=[ + { + "question": """What powers the sun and what is its primary function?""", + "answer": """The sun is powered by nuclear fission, similar to nuclear reactors on Earth, and its primary function is to provide light to the solar system.""", + "ground_truth": """The sun is actually powered by nuclear fusion, not fission. In its core, hydrogen atoms fuse to form helium, releasing a tremendous amount of energy. This energy is what lights up the sun and provides heat and light, essential for life on Earth. 
The sun's light also plays a critical role in Earth's climate system and helps to drive the weather and ocean currents.""", + "Extracted statements": """[ + { + "statements that are present in both the answer and the ground truth": ["The sun's primary function is to provide light"], + "statements present in the answer but not found in the ground truth": ["The sun is powered by nuclear fission", "similar to nuclear reactors on Earth"], + "relevant statements found in the ground truth but omitted in the answer": ["The sun is powered by nuclear fusion, not fission", "In its core, hydrogen atoms fuse to form helium, releasing a tremendous amount of energy", "This energy provides heat and light, essential for life on Earth", "The sun's light plays a critical role in Earth's climate system", "The sun helps to drive the weather and ocean currents"] + }] + """, + }, + { + "question": """What is the boiling point of water?""", + "answer": """The boiling point of water is 100 degrees Celsius at sea level.""", + "ground_truth": """The boiling point of water is 100 degrees Celsius (212 degrees Fahrenheit) at sea level, but it can change with altitude.""", + "Extracted statements": """[ + { + "statements that are present in both the answer and the ground truth": ["The boiling point of water is 100 degrees Celsius at sea level"], + "statements present in the answer but not found in the ground truth": [], + "relevant statements found in the ground truth but omitted in the answer": ["The boiling point can change with altitude", "The boiling point of water is 212 degrees Fahrenheit at sea level"] + }] + """, + }, + ], + input_keys=["question", "answer", "ground_truth"], + output_key="Extracted statements", + output_type="json", ) @@ -56,7 +54,7 @@ class AnswerCorrectness(MetricWithLLM): """ Measures answer correctness compared to ground truth as a combination of - semantic similarity and factuality + factuality and semantic similarity. Attributes ---------- @@ -65,12 +63,10 @@ class AnswerCorrectness(MetricWithLLM): batch_size: int batch size for evaluation weights: - a list of two weights corresponding to semantic similarity and factuality - Defaults [0.5, 0.5] + a list of two weights corresponding to factuality and semantic similarity + Defaults [0.75, 0.25] answer_similarity: The AnswerSimilarity object - faithfulness - The faithfulness object """ name: str = "answer_correctness" # type: ignore[reportIncompatibleMethodOverride] @@ -80,7 +76,14 @@ class AnswerCorrectness(MetricWithLLM): answer_similarity: AnswerSimilarity | None = None def __post_init__(self: t.Self): - if self.answer_similarity is None: + if len(self.weights) != 2: + raise ValueError("Expects a list of two weights. 
First for factuality, second for semantic similarity") + if all([w == 0 for w in self.weights]): + raise ValueError("At least one weight must be non-zero") + if not all([w >= 0 for w in self.weights]): + raise ValueError("Weights must be non-negative") + + if self.answer_similarity is None and self.weights[1] != 0: self.answer_similarity = AnswerSimilarity( llm=self.llm, batch_size=self.batch_size ) @@ -103,46 +106,48 @@ def _score_batch( callback_group_name, callback_manager=cb ) as batch_group: for q, a, g in zip(question, answer, ground_truths): - human_prompt = CORRECTNESS_PROMPT.format( - question=q, ground_truth=g[0], answer=a + prompts.append( + CORRECTNESS_PROMPT.format(question=q, ground_truth=g[0], answer=a) ) - prompts.append(ChatPromptTemplate.from_messages([human_prompt])) - - result = self.llm.generate(prompts, callbacks=batch_group) - outputs = result.generations - key_map = { - "TP": "statements that are present in both the answer and the ground truth", - "FP": "statements present in the answer but not found in the ground truth", - "FN": "relevant statements found in the ground truth but omitted in the answer", # noqa: E501 - } - - f1_score = [] - for prediction in outputs: - prediction = json_loader.safe_load(prediction[0].text, self.llm) - prediction = prediction if isinstance(prediction, list) else [] - if prediction: - prediction = [ - item.get(key_map[k], np.nan) - for item in prediction - for k in key_map.keys() - ] - tp, fp, fn = [ - len(item) if isinstance(item, list) else np.nan - for item in prediction - ] - score = tp / (tp + 0.5 * (fp + fn)) - else: - score = np.nan - f1_score.append(score) - - similarity_scores = self.answer_similarity._score_batch(dataset) # type: ignore - scores_stacked = np.vstack([f1_score, similarity_scores]) - scores = np.average( - scores_stacked, - axis=0, - weights=self.weights, - ) + result = self.llm.generate(prompts, callbacks=batch_group) + outputs = result.generations + key_map = { + "TP": "statements that are present in both the answer and the ground truth", + "FP": "statements present in the answer but not found in the ground truth", + "FN": "relevant statements found in the ground truth but omitted in the answer", # noqa: E501 + } + + f1_score = [] + for prediction in outputs: + prediction = json_loader.safe_load(prediction[0].text, self.llm) + prediction = prediction if isinstance(prediction, list) else [] + if prediction: + prediction = [ + item.get(key_map[k], np.nan) + for item in prediction + for k in key_map.keys() + ] + tp, fp, fn = [ + len(item) if isinstance(item, list) else np.nan + for item in prediction + ] + score = tp / (tp + 0.5 * (fp + fn)) + else: + score = np.nan + + f1_score.append(score) + + if self.weights[1] == 0: + similarity_scores = np.zeros(len(f1_score)) + else: + similarity_scores = self.answer_similarity._score_batch(dataset, callbacks=batch_group) # type: ignore + scores_stacked = np.vstack([f1_score, similarity_scores]) + scores = np.average( + scores_stacked, + axis=0, + weights=self.weights, + ) return scores.tolist() diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index 6711eecf2..56e88e4b3 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ b/src/ragas/metrics/_answer_relevance.py @@ -7,61 +7,45 @@ from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from langchain.embeddings import OpenAIEmbeddings -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate from 
ragas.embeddings.base import embedding_factory from ragas.exceptions import OpenAIKeyNotFound -from ragas.metrics.base import EvaluationMode, MetricWithLLM from ragas.utils import json_loader +from ragas.llms.prompt import Prompt +from ragas.metrics.base import EvaluationMode, MetricWithLLM if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks from ragas.embeddings.base import BaseRagasEmbeddings - -QUESTION_GEN = HumanMessagePromptTemplate.from_template( - """ -Generate a question for the given answer and Identify if answer is noncommittal - -Answer: -Albert Einstein was born in Germany. -Context: -Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time -Output: -{{"question":"Where was Albert Einstein born?","noncommittal":false}} - - -Answer: -It can change its skin color based on the temperature of its environment. -Context: -A recent scientific study has discovered a new species of frog in the Amazon rainforest that has the unique ability to change its skin color based on the temperature of its environment. -Output: -{{"question":"What unique ability does the newly discovered species of frog have?","noncommittal":false}} - - -Answer: -Everest -Context: -The tallest mountain on Earth, measured from sea level, is a renowned peak located in the Himalayas. -Output: -{{"question":"What is the tallest mountain on Earth?","noncommittal":false}} - - -Answer: -I don't know about the groundbreaking feature of the smartphone invented in 2023 as am unware of information beyong 2022. -Context: -In 2023, a groundbreaking invention was announced: a smartphone with a battery life of one month, revolutionizing the way people use mobile technology. -Output: -{{"question":"What was the groundbreaking feature of the smartphone invented in 2023?", "noncommittal":true}} - - - -Answer: -{answer} -Context: -{context} -Output:""" # noqa: E501 +QUESTION_GEN = Prompt( + instruction="""Generate a question for the given answer and Identify if answer is noncommittal""", + examples=[ + { + "answer": """Albert Einstein was born in Germany.""", + "context": """Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time""", + "output": """{"question":"Where was Albert Einstein born?","noncommittal":false}""", + }, + { + "answer": """It can change its skin color based on the temperature of its environment.""", + "context": """A recent scientific study has discovered a new species of frog in the Amazon rainforest that has the unique ability to change its skin color based on the temperature of its environment.""", + "output": """{"question":"What unique ability does the newly discovered species of frog have?","noncommittal":false}""", + }, + { + "answer": """Everest""", + "context": """The tallest mountain on Earth, measured from sea level, is a renowned peak located in the Himalayas.""", + "output": """{"question":"What is the tallest mountain on Earth?","noncommittal":false}""", + }, + { + "answer": """I don't know about the groundbreaking feature of the smartphone invented in 2023 as am unware of information beyond 2022. 
""", + "context": """In 2023, a groundbreaking invention was announced: a smartphone with a battery life of one month, revolutionizing the way people use mobile technology.""", + "output": """{"question":"What was the groundbreaking feature of the smartphone invented in 2023?", "noncommittal":true}""", + }, + ], + input_keys=["answer", "context"], + output_key="output", + output_type="json", ) @@ -117,8 +101,7 @@ def _score_batch( ) as batch_group: prompts = [] for ans, ctx in zip(answers, contexts): - human_prompt = QUESTION_GEN.format(answer=ans, context="\n".join(ctx)) - prompts.append(ChatPromptTemplate.from_messages([human_prompt])) + prompts.append(QUESTION_GEN.format(answer=ans, context="\n".join(ctx))) results = self.llm.generate( prompts, diff --git a/src/ragas/metrics/_answer_similarity.py b/src/ragas/metrics/_answer_similarity.py index dd3cf3b0c..aebf7e1b6 100644 --- a/src/ragas/metrics/_answer_similarity.py +++ b/src/ragas/metrics/_answer_similarity.py @@ -54,6 +54,7 @@ def __post_init__(self: t.Self): if isinstance(self.embeddings, HuggingfaceEmbeddings): self.is_cross_encoder = True if self.embeddings.is_cross_encoder else False self.embeddings.encode_kwargs = { + **self.embeddings.encode_kwargs, "batch_size": self.batch_size, } diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py index b08b6f67a..ee41a783f 100644 --- a/src/ragas/metrics/_context_precision.py +++ b/src/ragas/metrics/_context_precision.py @@ -1,40 +1,57 @@ from __future__ import annotations +import logging import typing as t from dataclasses import dataclass import numpy as np from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate +from ragas.utils import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks -CONTEXT_PRECISION = HumanMessagePromptTemplate.from_template( - """\ -Verify if the information in the given context is useful in answering the question. - -question: What are the health benefits of green tea? -context: -This article explores the rich history of tea cultivation in China, tracing its roots back to the ancient dynasties. It discusses how different regions have developed their unique tea varieties and brewing techniques. The article also delves into the cultural significance of tea in Chinese society and how it has become a symbol of hospitality and relaxation. -verification: -{{"reason":"The context, while informative about the history and cultural significance of tea in China, does not provide specific information about the health benefits of green tea. Thus, it is not useful for answering the question about health benefits.", "verdict":"No"}} - -question: How does photosynthesis work in plants? -context: -Photosynthesis in plants is a complex process involving multiple steps. This paper details how chlorophyll within the chloroplasts absorbs sunlight, which then drives the chemical reaction converting carbon dioxide and water into glucose and oxygen. It explains the role of light and dark reactions and how ATP and NADPH are produced during these processes. -verification: -{{"reason":"This context is extremely relevant and useful for answering the question. 
It directly addresses the mechanisms of photosynthesis, explaining the key components and processes involved.", "verdict":"Yes"}} - -question:{question} -context: -{context} -verification:""" # noqa: E501 +CONTEXT_PRECISION = Prompt( + instruction="""Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not. """, + examples=[ + { + "question": """What can you tell me about albert Albert Einstein?""", + "context": """Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius.""", + "answer": """Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895""", + "verification": """{ + "reason": "The provided context was indeed useful in arriving at the given answer. The context includes key information about Albert Einstein's life and contributions, which are reflected in the answer.", + "Verdict": "1" + } + """, + }, + { + "question": """who won 2020 icc world cup?""", + "context": """Who won the 2022 ICC Men's T20 World Cup?""", + "answer": """England""", + "verification": """{ + "reason": "the context was useful in clarifying the situation regarding the 2020 ICC World Cup and indicating that England was the winner of the tournament that was intended to be held in 2020 but actually took place in 2022.", + "verdict": "1" + } + """, + }, + { + "question": """What is the tallest mountain in the world?""", + "context": """The Andes is the longest continental mountain range in the world, located in South America. It stretches across seven countries and features many of the highest peaks in the Western Hemisphere. 
The range is known for its diverse ecosystems, including the high-altitude Andean Plateau and the Amazon rainforest.""", + "answer": """Mount Everest.""", + "verification": """{ + "reason":"the provided context discusses the Andes mountain range, which, while impressive, does not include Mount Everest or directly relate to the question about the world's tallest mountain.", + "verdict":"0" + } + """, + }, + ], + input_keys=["question", "context", "answer"], + output_key="verification", + output_type="json", ) @@ -52,15 +69,45 @@ class ContextPrecision(MetricWithLLM): """ name: str = "context_precision" # type: ignore - evaluation_mode: EvaluationMode = EvaluationMode.qc # type: ignore + evaluation_mode: EvaluationMode = EvaluationMode.qcg # type: ignore batch_size: int = 15 - def _context_precision_prompt(self, row: t.Dict) -> t.List[Prompt]: - question, contexts = row["question"], row["contexts"] + def get_dataset_attributes(self, dataset: Dataset): + answer = "ground_truths" + if answer not in dataset.features.keys(): + logging.warning( + "Using 'context_precision' without ground truth will be soon depreciated. Use 'context_utilization' instead" + ) + answer = "answer" - human_prompts = [ - ChatPromptTemplate.from_messages( - [CONTEXT_PRECISION.format(question=question, context=c)] + return dataset["question"], dataset["contexts"], dataset[answer] + + def _score_batch( + self: t.Self, + dataset: Dataset, + callbacks: t.Optional[Callbacks] = None, + callback_group_name: str = "batch", + ) -> list: + prompts = [] + questions, contexts, answers = self.get_dataset_attributes(dataset) + + cb = CallbackManager.configure(inheritable_callbacks=callbacks) + with trace_as_chain_group( + callback_group_name, callback_manager=cb + ) as batch_group: + for qstn, ctx, answer in zip(questions, contexts, answers): + human_prompts = [ + CONTEXT_PRECISION.format(question=qstn, context=c, answer=answer) + for c in ctx + ] + + prompts.extend(human_prompts) + + responses: list[list[str]] = [] + results = self.llm.generate( + prompts, + n=1, + callbacks=batch_group, ) for c in contexts ] @@ -122,4 +169,13 @@ def _score(self, row: t.Dict, callbacks: Callbacks = []) -> float: return score +class ContextUtilization(ContextPrecision): + name = "ContextUtilization" + evaluation_mode = EvaluationMode.qac + + def get_dataset_attributes(self, dataset: Dataset): + return dataset["question"], dataset["contexts"], dataset["answer"] + + context_precision = ContextPrecision() +context_utilization = ContextUtilization() diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index 613ea62f9..e7d0cdef9 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -6,64 +6,60 @@ import numpy as np from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate +from ragas.utils import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks -CONTEXT_RECALL_RA = HumanMessagePromptTemplate.from_template( - """ -Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Output json with reason. - - -question: What can you tell me about albert Albert Einstein? 
-context: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist,widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius. -answer: Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895 -classification: -[ - {{ "statement_1":"Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time.", - "reason": "The date of birth of Einstein is mentioned clearly in the context.", - "Attributed": "Yes" - }}, - {{ - "statement_2":"He received the 1921 Nobel Prize in Physics 'for his services to theoretical physics.", - "reason": "The exact sentence is present in the given context.", - "Attributed": "Yes" - }}, - {{ - "statement_3": "He published 4 papers in 1905.", - "reason": "There is no mention about papers he wrote in the given context.", - "Attributed": "No" - }}, - {{ - "statement_4":"Einstein moved to Switzerland in 1895.", - "reason": "There is no supporting evidence for this in the given context.", - "Attributed": "No" - }} -] - -question: who won 2020 icc world cup? -context: Who won the 2022 ICC Men's T20 World Cup? -The 2022 ICC Men's T20 World Cup, held from October 16 to November 13, 2022, in Australia, was the eighth edition of the tournament. Originally scheduled for 2020, it was postponed due to the COVID-19 pandemic. England emerged victorious, defeating Pakistan by five wickets in the final to clinch their second ICC Men's T20 World Cup title. -answer: England -classification: -[ - {{ - "statement_1":"England won the 2022 ICC Men's T20 World Cup.", - "reason": "From context it is clear that England defeated Pakistan to win the World Cup.", - "Attributed": "Yes" - }} -] - -question:{question} -context:{context} -answer:{answer} -classification: -""" # noqa: E501 +CONTEXT_RECALL_RA = Prompt( + instruction="""Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only "Yes" (1) or "No" (0) as a binary classification. 
Output json with reason.""", + examples=[ + { + "question": """What can you tell me about albert Albert Einstein?""", + "context": """Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called 'the world's most famous equation'. He received the 1921 Nobel Prize in Physics 'for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect', a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius.""", + "answer": """Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895""", + "classification": """[ + { + "statement_1":"Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time.", + "reason": "The date of birth of Einstein is mentioned clearly in the context.", + "Attributed": "1" + }, + { + "statement_2":"He received the 1921 Nobel Prize in Physics 'for his services to theoretical physics.", + "reason": "The exact sentence is present in the given context.", + "Attributed": "1" + }, + { + "statement_3": "He published 4 papers in 1905.", + "reason": "There is no mention about papers he wrote in the given context.", + "Attributed": "0" + }, + { + "statement_4":"Einstein moved to Switzerland in 1895.", + "reason": "There is no supporting evidence for this in the given context.", + "Attributed": "0" + }] + """, + }, + { + "question": """who won 2020 icc world cup?""", + "context": """Who won the 2022 ICC Men's T20 World Cup?""", + "answer": """England""", + "classification": """[ + { + "statement_1":"England won the 2022 ICC Men's T20 World Cup.", + "reason": "From context it is clear that England defeated Pakistan to win the World Cup.", + "Attributed": "1" + }] + """, + }, + ], + input_keys=["question", "context", "answer"], + output_key="classification", + output_type="json", ) @@ -98,36 +94,39 @@ async def _ascore( row["contexts"], ) - ground_truth = ( - "\n".join(ground_truth) if isinstance(ground_truth, list) else ground_truth - ) - contexts = "\n".join(contexts) if isinstance(contexts, list) else contexts - human_prompt = CONTEXT_RECALL_RA.format( - question=question, context=contexts, answer=ground_truth - ) - p = Prompt( - chat_prompt_template=ChatPromptTemplate.from_messages([human_prompt]) - ) - - results = await self.llm.agenerate_text( - p, - n=1, - callbacks=callbacks, - ) - response = results.generations[0][0].text - response = json_loader.safe_load(response, self.llm) - if response: - 
response = [ - int(item.get("Attributed", "").lower() == "yes") - if item.get("Attributed") - else np.nan - for item in response - ] - denom = len(response) - numerator = sum(response) - score = numerator / denom - else: - score = np.nan + cb = CallbackManager.configure(inheritable_callbacks=callbacks) + with trace_as_chain_group( + callback_group_name, callback_manager=cb + ) as batch_group: + for qstn, gt, ctx in zip(question, ground_truths, contexts): + gt = "\n".join(gt) if isinstance(gt, list) else gt + ctx = "\n".join(ctx) if isinstance(ctx, list) else ctx + prompts.append( + CONTEXT_RECALL_RA.format(question=qstn, context=ctx, answer=gt) + ) + + responses: list[list[str]] = [] + results = self.llm.generate( + prompts, + n=1, + callbacks=batch_group, + ) + responses = [[i.text for i in r] for r in results.generations] + scores = [] + for response in responses: + response = json_loader.safe_load(response[0], self.llm) + if response: + response = [ + int(item.get("Attributed", "0").strip() == "1") + if item.get("Attributed") + else np.nan + for item in response + ] + denom = len(response) + numerator = sum(response) + scores.append(numerator / denom) + else: + scores.append(np.nan) return score diff --git a/src/ragas/metrics/_context_relevancy.py b/src/ragas/metrics/_context_relevancy.py index c7e4ed1df..d4b716934 100644 --- a/src/ragas/metrics/_context_relevancy.py +++ b/src/ragas/metrics/_context_relevancy.py @@ -9,22 +9,21 @@ import pysbd from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate +from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks -CONTEXT_RELEVANCE = HumanMessagePromptTemplate.from_template( - """\ -Please extract relevant sentences from the provided context that is absolutely required answer the following question. If no relevant sentences are found, or if you believe the question cannot be answered from the given context, return the phrase "Insufficient Information". While extracting candidate sentences you're not allowed to make any changes to sentences from given context. - -question:{question} -context:\n{context} -candidate sentences:\n""" # noqa: E501 +CONTEXT_RELEVANCE = Prompt( + instruction="""Please extract relevant sentences from the provided context that is absolutely required answer the following question. If no relevant sentences are found, or if you believe the question cannot be answered from the given context, return the phrase "Insufficient Information". 
While extracting candidate sentences you're not allowed to make any changes to sentences from given context.""", + input_keys=["question", "context"], + output_key="candidate sentences", + output_type="json", ) + seg = pysbd.Segmenter(language="en", clean=False) @@ -76,10 +75,9 @@ def _score_batch( callback_group_name, callback_manager=cb ) as batch_group: for q, c in zip(questions, contexts): - human_prompt = CONTEXT_RELEVANCE.format( - question=q, context="\n".join(c) + prompts.append( + CONTEXT_RELEVANCE.format(question=q, context="\n".join(c)) ) - prompts.append(ChatPromptTemplate.from_messages([human_prompt])) responses: list[list[str]] = [] results = self.llm.generate( diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 99fff99c5..3b81621fe 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -5,11 +5,10 @@ import numpy as np from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate +from ragas.utils import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader if t.TYPE_CHECKING: from datasets import Dataset @@ -17,108 +16,103 @@ from langchain_core.outputs import LLMResult -LONG_FORM_ANSWER_PROMPT = HumanMessagePromptTemplate.from_template( - """\ -Create one or more statements from each sentence in the given answer. - -question: Who was Albert Einstein and what is he best known for? -answer: He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics. -statements in json: -{{ - "statements": [ - "Albert Einstein was born in Germany.", - "Albert Einstein was best known for his theory of relativity." - ] -}} - -question: Cadmium Chloride is slightly soluble in this chemical, it is also called what? -answer: alcohol -statements in json: -{{ - "statements": [ - "Cadmium Chloride is slightly soluble in alcohol." - ] -}} - -question: Were Hitler and Benito Mussolini of the same nationality? -answer: Sorry, I can't provide answer to that question. -statements in json: -{{ - "statements": [] -}} - -question:{question} -answer: {answer} -statements in json:""" # noqa: E501 -) - - -NLI_STATEMENTS_MESSAGE = HumanMessagePromptTemplate.from_template( - """ - Natural language inference. Only use "Yes" or "No" as verdict. - -Context: -John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects. -statement_1: John is majoring in Biology. -statement_2: John is taking a course on Artificial Intelligence. -statement_3: John is a dedicated student. -statement_4: John has a part-time job. -Answer: -[ - {{ - "statement_1": "John is majoring in Biology.", - "reason": "John's major is explicitly mentioned as Computer Science. 
There is no information suggesting he is majoring in Biology.", - "verdict": "No" - }}, - {{ - "statement_2": "John is taking a course on Artificial Intelligence.", - "reason": "The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI.", - "verdict": "No" - }}, - {{ - "statement_3": "John is a dedicated student.", - "reason": "The context states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication.", - "verdict": "Yes" - }}, - {{ - "statement_4": "John has a part-time job.", - "reason": "There is no information given in the context about John having a part-time job.", - "verdict": "No" - }} -] - -Context: -Photosynthesis is a process used by plants, algae, and certain bacteria to convert light energy into chemical energy. -statement_1: Albert Einstein was a genius. -Answer: -[ - {{ - "statement_1": "Albert Einstein was a genius.", - "reason": "The context and statement are unrelated" - "verdict": "No" - }} -] - -Context: -Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time. -statement_1: Nil -Answer: -[ - {{ - "statement_1": "Nil", - "reason": "The statement is invalid", - "verdict": "No" - }} -] - - -context: -{context} -statements: -{statements} -Answer: -""" # noqa: E501 -) +LONG_FORM_ANSWER_PROMPT = Prompt( + instruction="Create one or more statements from each sentence in the given answer.", + examples=[ + { + "question": "Who was Albert Einstein and what is he best known for?", + "answer": "He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.", + "statements": """{ + "statements": [ + "Albert Einstein, a German-born theoretical physicist, is renowned for being one of the most influential physicists in history.", + "Albert Einstein was best known for his theory of relativity.", + "Einstein's contributions significantly advanced the field of quantum mechanics", + "Recognized globally, Einstein's work has profoundly impacted the scientific community", + "Einstein's groundbreaking theories continue to shape our understanding of physics today." + ] + }""", + }, + { + "question": "Cadmium Chloride is slightly soluble in this chemical, it is also called what?", + "answer": "alcohol", + "statements": """{ + "statements": [ + "Cadmium Chloride is slightly soluble in alcohol." + ] + }""", + }, + { + "question": "Were Hitler and Benito Mussolini of the same nationality?", + "answer": "Sorry, I can't provide answer to that question.", + "statements": """{ + "statements": [] + }""", + }, + ], + input_keys=["question", "answer"], + output_key="statements", + output_type="JSON", +) # noqa: E501 + + +NLI_STATEMENTS_MESSAGE = Prompt( + instruction="Natural language inference. Use only 'Yes' (1), 'No' (0) and 'Null' (-1) as verdict.", + examples=[ + { + "context": """John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. 
John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects.""", + "statements": """ + statement_1: John is majoring in Biology. + statement_2: John is taking a course on Artificial Intelligence. + statement_3: John is a dedicated student. + statement_4: John has a part-time job. + """, + "answer": """[ + { + "statement_1": "John is majoring in Biology.", + "reason": "John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology.", + "verdict": "0" + }, + { + "statement_2": "John is taking a course on Artificial Intelligence.", + "reason": "The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI.", + "verdict": "0" + }, + { + "statement_3": "John is a dedicated student.", + "reason": "The context states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication.", + "verdict": "1" + }, + { + "statement_4": "John has a part-time job.", + "reason": "There is no information given in the context about John having a part-time job.", + "verdict": "0" + } + ] + """, + }, + { + "context": """Photosynthesis is a process used by plants, algae, and certain bacteria to convert light energy into chemical energy.""", + "statements": """statement_1: Albert Einstein was a genius.""", + "answer": """{ + "statement_1": "Albert Einstein was a genius.", + "reason": "The context and statement are unrelated", + "verdict": "0" + }""", + }, + { + "context": """Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time.""", + "statements": """statement_1: Nil""", + "answer": """{ + "statement_1": "Nil", + "reason": "The statement is invalid", + "verdict": "-1" + }""", + }, + ], + input_keys=["context", "statements"], + output_key="answer", + output_type="JSON", +) # noqa: E501 @dataclass @@ -185,17 +179,53 @@ async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: p = self._create_nli_prompt(row, result) result = await self.llm.agenerate_text(p, callbacks=callbacks) - return self._compute_score(result) + cb = CallbackManager.configure(inheritable_callbacks=callbacks) + with trace_as_chain_group( + callback_group_name, callback_manager=cb + ) as batch_group: + for q, a in zip(question, answer): + human_prompt = LONG_FORM_ANSWER_PROMPT.format(question=q, answer=a) + prompts.append(human_prompt) def _score(self, row: t.Dict, callbacks: Callbacks) -> float: assert self.llm is not None, "LLM is not set" p = self._create_answer_prompt(row) result = self.llm.generate_text(p, callbacks=callbacks) - p = self._create_nli_prompt(row, result) - result = self.llm.generate_text(p, callbacks=callbacks) - - return self._compute_score(result) + prompts = [] + for context, output in zip(contexts, result.generations): + statements = json_loader.safe_load(output[0].text, self.llm).get( + "statements", [] + ) + statements = statements if statements != [] else ["Nil"] + statements_str: str = "\n".join( + [f"statement_{i+1}: {st}" for i, st in enumerate(statements)] + ) + contexts_str: str = "\n".join(context) + human_prompt = NLI_STATEMENTS_MESSAGE.format( + context=contexts_str, statements=statements_str + ) + 
prompts.append(human_prompt) + + result = self.llm.generate(prompts, callbacks=batch_group) + outputs = result.generations + verdict_score_map = {"1": 1, "0": 0, "null": np.nan} + scores = [] + for output in outputs: + output = json_loader.safe_load(output[0].text, self.llm) + output = output if isinstance(output, list) else [output] + faithful_statements = sum( + verdict_score_map.get(dict.get("verdict", "").lower(), np.nan) + for dict in output + ) + num_statements = len(output) + if num_statements: + score = faithful_statements / num_statements + else: + score = np.nan + scores.append(score) + + return scores faithfulness = Faithfulness() diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index 8e577ed2f..470735405 100644 --- a/src/ragas/metrics/critique.py +++ b/src/ragas/metrics/critique.py @@ -4,11 +4,13 @@ from collections import Counter from dataclasses import dataclass, field +import numpy as np from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate +from ragas.utils import json_loader from ragas.llms import llm_factory +from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM if t.TYPE_CHECKING: @@ -16,21 +18,25 @@ from ragas.llms import BaseRagasLLM -CRITIQUE_PROMPT = HumanMessagePromptTemplate.from_template( - """Given a input and submission. Evaluate the submission only using the given criteria. -Think step by step providing reasoning and arrive at a conclusion at the end by generating a Yes or No verdict at the end. -input: Who was the director of Los Alamos Laboratory? -submission: Einstein was the director of Los Alamos Laboratory. -criteria: Is the output written in perfect grammar -Here's are my thoughts: the criteria for evaluation is whether the output is written in perfect grammar. In this case, the output is grammatically correct. Therefore, the answer is:\n\nYes - -input:{input} -submission:{submission} -criteria:{criteria} -Here's are my thoughts: -""" # noqa: E501 -) +CRITIQUE_PROMPT = Prompt( + instruction="Given a input and submission. Evaluate the submission only using the given criteria. Use only 'Yes' (1) and 'No' (0) as verdict.", + examples=[ + { + "input": "Who was the director of Los Alamos Laboratory?", + "submission": "Einstein was the director of Los Alamos Laboratory.", + "criteria": "Is the output written in perfect grammar", + "output": """{ + "reason":"the criteria for evaluation is whether the output is written in perfect grammar. 
In this case, the output is grammatically correct.", + "verdict":"1" + } + """, + } + ], + input_keys=["input", "submission", "criteria"], + output_key="output", + output_type="JSON", +) # noqa: E501 @dataclass @@ -113,7 +119,7 @@ def _score_batch( ) as batch_group: for question, context, answer in zip(questions, contexts, answers): human_prompt = self.prompt_format(question, answer, context) - prompts.append(ChatPromptTemplate.from_messages([human_prompt])) + prompts.append(human_prompt) results = self.llm.generate( prompts, @@ -125,18 +131,20 @@ def _score_batch( ] scores = [] - answer_dict = {"Yes": 1, "No": 0} + answer_dict = {"1": 1, "0": 0} for response in responses: - response = [(text, text.split("\n\n")[-1]) for text in response] + response = [json_loader.safe_load(item, self.llm) for item in response] if self.strictness > 1: score = Counter( - [answer_dict.get(item[-1], 0) for item in response] + [ + answer_dict.get(item.get("verdict", np.nan), np.nan) + for item in response + ] ).most_common(1)[0][0] else: - score = answer_dict.get(response[0][-1]) + score = answer_dict.get(response[0].get("verdict", np.nan), np.nan) - # patch for critique: force score to 0 if the answer is not Yes or No - scores.append(score if score is not None else 0) + scores.append(score) return scores diff --git a/src/ragas/testset/testset_generator.py b/src/ragas/testset/testset_generator.py index 2271ee351..096dd816a 100644 --- a/src/ragas/testset/testset_generator.py +++ b/src/ragas/testset/testset_generator.py @@ -16,6 +16,11 @@ "Please, install it with `pip install llama_index`." ) +try: + from pydantic.v1 import ValidationError +except ImportError: + from pydantic import ValidationError + import numpy as np import numpy.testing as npt import pandas as pd @@ -58,6 +63,10 @@ "conditional": "_condition_question", } +retry_errors = ( + ValidationError, +) + DataRow = namedtuple( "DataRow", [ @@ -69,6 +78,8 @@ ], ) +Proposal = namedtuple("Proposal", ["question", "text_chunk"]) + @dataclass class TestDataset: @@ -291,6 +302,70 @@ def _embed_nodes(self, nodes: t.List[BaseNode]) -> t.Dict[str, t.List[float]]: return embeddings + def _make_proposal( + self, cur_node: BaseNode, neighbor_nodes: t.List[BaseNode], evolve_type: str + ) -> t.Union[Proposal, None]: + # Append multiple nodes randomly to remove chunking bias + size = self.rng.integers(1, 3) + nodes = ( + self._get_neighbour_node(cur_node, neighbor_nodes) + if size > 1 and evolve_type != "multi_context" + else [cur_node] + ) + + text_chunk = " ".join([node.get_content() for node in nodes]) + score = self._filter_context(text_chunk) + if not score: + return None + seed_question = self._seed_question(text_chunk) + is_valid_question = self._filter_question(seed_question) + if not is_valid_question: + return None + + if evolve_type == "multi_context": + # Find most similar chunk in same document + node_embedding = self._embed_nodes([nodes[-1]]) + neighbor_nodes = self._remove_nodes(neighbor_nodes, nodes) + neighbor_emb = self._embed_nodes(neighbor_nodes) + + _, indices = get_top_k_embeddings( + list(node_embedding.values())[0], + list(neighbor_emb.values()), + similarity_cutoff=self.threshold / 10, + ) + if indices: + # type cast indices from list[Any] to list[int] + indices = t.cast(t.List[int], indices) + best_neighbor = neighbor_nodes[indices[0]] + question = self._multicontext_question( + question=seed_question, + context1=text_chunk, + context2=best_neighbor.get_content(), + ) + text_chunk = "\n".join([text_chunk, best_neighbor.get_content()]) + else: 
+ return None + + # for reasoning and conditional modes, evolve question with the + # functions from question_deep_map + else: + evolve_fun = question_deep_map.get(evolve_type) + question = ( + getattr(self, evolve_fun)(seed_question, text_chunk) + if evolve_fun + else seed_question + ) + + # compress question or convert into conversational questions + if evolve_type != "simple": + prob = self.rng.uniform(0, 1) + if self.chat_qa and prob <= self.chat_qa: + question = self._conversational_question(question=question) + else: + question = self._compress_question(question=question) + + return Proposal(question=question, text_chunk=text_chunk) + def generate( self, documents: t.List[LlamaindexDocument] | t.List[LangchainDocument], @@ -339,64 +414,20 @@ def generate( neighbor_nodes = doc_nodes_map[curr_node.source_node.node_id] - # Append multiple nodes randomly to remove chunking bias - size = self.rng.integers(1, 3) - nodes = ( - self._get_neighbour_node(curr_node, neighbor_nodes) - if size > 1 and evolve_type != "multi_context" - else [curr_node] - ) - - text_chunk = " ".join([node.get_content() for node in nodes]) - score = self._filter_context(text_chunk) - if not score: - continue - seed_question = self._seed_question(text_chunk) - is_valid_question = self._filter_question(seed_question) - if not is_valid_question: - continue - - if evolve_type == "multi_context": - # Find most similar chunk in same document - node_embedding = self._embed_nodes([nodes[-1]]) - neighbor_nodes = self._remove_nodes(neighbor_nodes, nodes) - neighbor_emb = self._embed_nodes(neighbor_nodes) - - _, indices = get_top_k_embeddings( - list(node_embedding.values())[0], - list(neighbor_emb.values()), - similarity_cutoff=self.threshold / 10, - ) - if indices: - # type cast indices from list[Any] to list[int] - indices = t.cast(t.List[int], indices) - best_neighbor = neighbor_nodes[indices[0]] - question = self._multicontext_question( - question=seed_question, - context1=text_chunk, - context2=best_neighbor.get_content(), - ) - text_chunk = "\n".join([text_chunk, best_neighbor.get_content()]) - else: - continue - - # for reasoning and conditional modes, evolve question with the - # functions from question_deep_map - else: - evolve_fun = question_deep_map.get(evolve_type) - question = ( - getattr(self, evolve_fun)(seed_question, text_chunk) - if evolve_fun - else seed_question + proposal = None + try: + proposal = self._make_proposal( + curr_node, neighbor_nodes, evolve_type ) + except Exception as e: + err_cause = e.__cause__ + if not isinstance(err_cause, retry_errors): + raise e - # compress question or convert into conversational questions - if evolve_type != "simple": - prob = self.rng.uniform(0, 1) - if self.chat_qa and prob <= self.chat_qa: - question = self._conversational_question(question=question) - else: - question = self._compress_question(question=question) + if proposal is None: + continue + question = proposal.question + text_chunk = proposal.text_chunk is_valid_question = self._filter_question(question) if is_valid_question: diff --git a/src/ragas/utils.py b/src/ragas/utils.py index 9e0849bac..944e89fd4 100644 --- a/src/ragas/utils.py +++ b/src/ragas/utils.py @@ -152,4 +152,4 @@ def _find_outermost_json(self, text): return -1, -1 # No valid JSON found -json_loader = JsonLoader() +json_loader = JsonLoader() \ No newline at end of file diff --git a/src/ragas/validation.py b/src/ragas/validation.py index bf0b3e4c2..fa1c4471d 100644 --- a/src/ragas/validation.py +++ b/src/ragas/validation.py @@ -2,6 +2,7 @@ 
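The generate() loop above only swallows failures whose root cause is a pydantic ValidationError, surfaced through exception chaining, and re-raises everything else. A minimal, self-contained sketch of that __cause__-based filtering follows; every name in it is a hypothetical stand-in, not code from this patch:

from pydantic import BaseModel, ValidationError

retry_errors = (ValidationError,)


class SeedQuestion(BaseModel):
    question: str


def make_proposal(raw: dict) -> SeedQuestion:
    try:
        return SeedQuestion(**raw)
    except ValidationError as err:
        # chain the original error so the caller can inspect e.__cause__
        raise RuntimeError("proposal generation failed") from err


proposals = []
for raw in ({"question": "What does context precision measure?"}, {"typo_key": 1}):
    try:
        proposals.append(make_proposal(raw))
    except Exception as e:
        if not isinstance(e.__cause__, retry_errors):
            raise  # unexpected failure: surface it
        # validation failure: drop this candidate and continue, as generate() does

print(len(proposals))  # 1 -- the invalid candidate was skipped
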
from datasets import Dataset, Sequence +from ragas.metrics._context_precision import ContextPrecision from ragas.metrics.base import EvaluationMode, Metric @@ -60,8 +61,15 @@ def validate_evaluation_modes(ds: Dataset, metrics: list[Metric]): required_columns = set(EVALMODE_TO_COLUMNS[m.evaluation_mode]) available_columns = set(ds.features.keys()) if not required_columns.issubset(available_columns): + extra_msg = "" + if ( + isinstance(m, ContextPrecision) + and "ground_truths" not in available_columns + ): + extra_msg = "Looks like you're trying to use 'context_precision' without ground_truths. Please use consider using `context_utilization' instead." + raise ValueError( f"The metric [{m.name}] that that is used requires the following " f"additional columns {list(required_columns - available_columns)} " - "to be present in the dataset." + f"to be present in the dataset. {extra_msg}" ) diff --git a/tests/unit/test_import.py b/tests/unit/test_import.py index 0df78a883..53b312db8 100644 --- a/tests/unit/test_import.py +++ b/tests/unit/test_import.py @@ -27,4 +27,4 @@ def test_import_module(): assert hasattr(ragas.metrics, metric) for metric in test_critique: - assert hasattr(ragas.metrics.critique, metric) + assert hasattr(ragas.metrics.critique, metric) \ No newline at end of file diff --git a/tests/unit/test_prompt.py b/tests/unit/test_prompt.py new file mode 100644 index 000000000..feae8fa94 --- /dev/null +++ b/tests/unit/test_prompt.py @@ -0,0 +1,84 @@ +from ragas.llms.prompt import Prompt + +TESTCASES = [ + { + "instruction" : 'Create one or more statements from each sentence in the given answer.', + "examples" : [ + { + "question":"Cadmium Chloride is slightly soluble in this chemical, it is also called what?", + "answer":"alcohol", + "statements in json":"""{ + "statements": [ + "Cadmium Chloride is slightly soluble in alcohol." + ] + }""" + }, + { + "question":"Were Hitler and Benito Mussolini of the same nationality?", + "answer":"Sorry, I can't provide answer to that question.", + "statements in json":"""{ + "statements": [] + }""" + } + ], + "input_keys" : ["question", "answer"], + "output_key" : "statements in json", + }, + { + "instruction" : 'Natural language inference. Use only "Yes" (1) or "No" (0) as a binary verdict.', + "examples" : [ + { + "Context":"""John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects. + statement_1: John is majoring in Biology. + statement_2: John is taking a course on Artificial Intelligence. + statement_3: John is a dedicated student. + statement_4: John has a part-time job.""", + "Answer":"""[ + { + "statement_1": "John is majoring in Biology.", + "reason": "John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology.", + "verdict": "0" + }, + { + "statement_2": "John is taking a course on Artificial Intelligence.", + "reason": "The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI.", + "verdict": "0" + }, + { + "statement_3": "John is a dedicated student.", + "reason": "The context states that he spends a significant amount of time studying and completing assignments. 
Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication.", + "verdict": "1" + }, + { + "statement_4": "John has a part-time job.", + "reason": "There is no information given in the context about John having a part-time job.", + "verdict": "0" + }] + """ + } + ], + "input_keys" : ["Context"], + "output_key" : "Answer", + "output_type" : "json" + }, + { + "instruction" : 'This is a test prompt without examples', + "input_keys" : ["Context"], + "output_key" : "Answer", + "output_type" : "json" + }, +] + +def test_prompt_object(): + + for testcase in TESTCASES: + prompt = Prompt(**testcase) + + assert prompt is not None, "Prompt object is not created" + assert prompt.instruction==testcase['instruction'], "instruction in object is not same as in the testcase" + assert prompt.input_keys==testcase['input_keys'], "input_keys in object is not same as in the testcase" + assert prompt.output_key==testcase['output_key'], "output_key in object is not same as in the testcase" + assert prompt.output_type==testcase.get('output_type', 'json'), "output_type in object is not same as in the testcase" + assert prompt.examples==testcase.get('examples', []), "examples should be empty if not provided" + if testcase.get('examples'): + assert isinstance(prompt.get_example_str(0), str), "get_example_str should return a string" \ No newline at end of file diff --git a/tests/unit/test_validation.py b/tests/unit/test_validation.py index 1164d1d39..c61deac40 100644 --- a/tests/unit/test_validation.py +++ b/tests/unit/test_validation.py @@ -17,8 +17,8 @@ TEST_CASES = [ CaseToTest("a", "b", ["c"], None, True, [faithfulness], True), CaseToTest("a", "b", ["c"], ["g"], True, [faithfulness], True), - CaseToTest("a", None, ["c"], None, True, [context_precision], True), - CaseToTest("a", None, "c", None, False, [context_precision], True), + CaseToTest("a", None, ["c"], ["g"], True, [context_precision], True), + CaseToTest("a", "b", "c", ["g"], False, [context_precision], True), CaseToTest( "a", None, [["c"]], None, False, [context_precision, answer_relevancy], False ), From 8163d4564d092bc50152f4189e08e0c597b42a20 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Mon, 1 Jan 2024 17:11:28 +0530 Subject: [PATCH 16/34] remove DS_store --- src/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 src/.DS_Store diff --git a/src/.DS_Store b/src/.DS_Store deleted file mode 100644 index 5577d9f9280eeb1683a8b20f42c1f82fe523b022..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}T>S5Z>*NrWBzEMUM+!3#P3o;w8lT0!H+pQWH{YFlI}VnnNk%sxRc5_&m<+ zZlJ|_6|plg`_0bJZuW!h4`YnGv#`flhcRY>B62j=2)fsXY9<+xt1+^m$fiLogEhna zO&5N9lcmgO6VT=F{|KgWl;yVl$!qmSb8FkOthV*wJ;}n${d|_WetL(!YboQPvi;yD z8W*F^?uATpKT5`_Du}`{q}<;|Nhk|f&XO=wwVn=GtyXK)*`Lo(`&}{U*^922pY`po zI3AoV7AX6++? 
zwS;=a05R~-7~qYeH*{f9=4}18JUnX!=p!f!=9OrGfW2}FfDY~>2P&xJ0&U2143-*k T6!fcdK)MJhLZ~4Ieu056WpYZI From 9395c29a728fa874673c762c6be30d9d81d1c5a3 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Tue, 2 Jan 2024 03:58:45 +0530 Subject: [PATCH 17/34] fix prompts --- src/ragas/llms/prompt.py | 64 ++++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/src/ragas/llms/prompt.py b/src/ragas/llms/prompt.py index 61d2bb589..b05ed26da 100644 --- a/src/ragas/llms/prompt.py +++ b/src/ragas/llms/prompt.py @@ -5,23 +5,35 @@ import os import typing as t -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate from langchain_core.messages import BaseMessage, HumanMessage -from langchain_core.prompt_values import PromptValue -from langchain_core.pydantic_v1 import root_validator +from langchain_core.prompt_values import PromptValue as BasePromptValue +from langchain_core.pydantic_v1 import BaseModel, root_validator from ragas.llms import RagasLLM from ragas.utils import RAGAS_CACHE_HOME, json_loader +Example = t.Dict[str, t.Any] -class Prompt(PromptValue): + +class PromptValue(BasePromptValue): + prompt_str: str + + def to_messages(self) -> t.List[BaseMessage]: + """Return prompt as a list of Messages.""" + return [HumanMessage(content=self.to_string())] + + def to_string(self) -> str: + return self.prompt_str + + +class Prompt(BaseModel): """ Prompt is a class that represents a prompt for the ragas metrics. """ name: str instruction: str - examples: t.List[t.Dict[str, t.Any]] = [] + examples: t.List[Example] = [] input_keys: t.List[str] output_key: str output_type: str = "json" @@ -69,27 +81,28 @@ def to_string(self) -> str: """ prompt_str = self.instruction + "\n" - # Format the examples to match the Langchain prompt template - for example in self.examples: - for key, value in example.items(): - value = json.dumps(value, ensure_ascii=False).encode("utf8").decode() - value = ( - value.replace("{", "{{").replace("}", "}}") - if self.output_type.lower() == "json" - else value - ) - prompt_str += f"\n{key}: {value}" - prompt_str += "\n" + if self.examples: + # Format the examples to match the Langchain prompt template + for example in self.examples: + for key, value in example.items(): + value = ( + json.dumps(value, ensure_ascii=False).encode("utf8").decode() + ) + value = ( + value.replace("{", "{{").replace("}", "}}") + if self.output_type.lower() == "json" + else value + ) + prompt_str += f"\n{key}: {value}" + prompt_str += "\n" - prompt_str += "".join(f"\n{key}: {{{key}}}" for key in self.input_keys) - prompt_str += f"\n{self.output_key}: \n" + if self.input_keys: + prompt_str += "".join(f"\n{key}: {{{key}}}" for key in self.input_keys) + if self.output_key: + prompt_str += f"\n{self.output_key}: \n" return prompt_str - def to_messages(self) -> t.List[BaseMessage]: - """Return prompt as a list of Messages.""" - return [HumanMessage(content=self.to_string())] - def get_example_str(self, example_no: int) -> str: """ Get the example string from the example number. @@ -108,7 +121,7 @@ def get_example_str(self, example_no: int) -> str: example_str += f"\n{key}: {value}" return example_str - def format(self, **kwargs: t.Any) -> ChatPromptTemplate: + def format(self, **kwargs: t.Any) -> PromptValue: """ Format the Prompt object into a ChatPromptTemplate object to be used in metrics. 
""" @@ -117,8 +130,7 @@ def format(self, **kwargs: t.Any) -> ChatPromptTemplate: f"Input variables {self.input_keys} do not match with the given parameters {list(kwargs.keys())}" ) prompt = self.to_string() - human_prompt = HumanMessagePromptTemplate.from_template(prompt) - return ChatPromptTemplate.from_messages([human_prompt.format(**kwargs)]) + return PromptValue(prompt_str=prompt.format(**kwargs)) def adapt( self, language: str, llm: RagasLLM, cache_dir: t.Optional[str] = None @@ -184,7 +196,7 @@ def save(self, cache_dir: t.Optional[str] = None) -> None: cache_path = os.path.join(cache_dir, f"{self.name}.json") with open(cache_path, "w") as file: - json.dump(self.to_json(), file, indent=4) + json.dump(self.dict(), file, indent=4) @classmethod def _load(cls, language: str, name: str, cache_dir: str) -> Prompt: From 9dcc4fcb03fbdcc0e391281f6304da44a74b2ddf Mon Sep 17 00:00:00 2001 From: jjmachan Date: Tue, 2 Jan 2024 03:59:35 +0530 Subject: [PATCH 18/34] moved tests --- tests/unit/{ => llms}/test_llm.py | 0 tests/unit/{ => llms}/test_prompt.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/unit/{ => llms}/test_llm.py (100%) rename tests/unit/{ => llms}/test_prompt.py (100%) diff --git a/tests/unit/test_llm.py b/tests/unit/llms/test_llm.py similarity index 100% rename from tests/unit/test_llm.py rename to tests/unit/llms/test_llm.py diff --git a/tests/unit/test_prompt.py b/tests/unit/llms/test_prompt.py similarity index 100% rename from tests/unit/test_prompt.py rename to tests/unit/llms/test_prompt.py From 14aa52205fbe9c61c09752c073f5c0d0c8bccfa6 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Tue, 2 Jan 2024 14:58:27 +0530 Subject: [PATCH 19/34] fixed some metrics --- docs/conf.py | 1 - docs/howtos/customisations/embeddings.ipynb | 2 +- docs/howtos/customisations/gcp-vertexai.ipynb | 17 +- docs/howtos/integrations/zeno.ipynb | 2 +- src/ragas/callbacks.py | 2 +- src/ragas/embeddings/base.py | 5 +- src/ragas/evaluation.py | 7 +- src/ragas/executor.py | 2 - src/ragas/llms/base.py | 15 +- src/ragas/llms/json_load.py | 128 +++++++++++++++ src/ragas/llms/prompt.py | 14 +- src/ragas/metrics/__init__.py | 22 --- src/ragas/metrics/_answer_correctness.py | 2 +- src/ragas/metrics/_answer_relevance.py | 2 - src/ragas/metrics/_answer_similarity.py | 5 +- src/ragas/metrics/_context_precision.py | 105 ++++-------- src/ragas/metrics/_context_recall.py | 3 - src/ragas/metrics/_faithfulness.py | 124 +++++++------- src/ragas/metrics/base.py | 28 +--- src/ragas/metrics/critique.py | 1 - src/ragas/testset/testset_generator.py | 4 +- src/ragas/utils.py | 153 +----------------- tests/unit/test_import.py | 2 +- tests/unit/test_prompt.py | 104 ++++++------ 24 files changed, 330 insertions(+), 420 deletions(-) create mode 100644 src/ragas/llms/json_load.py diff --git a/docs/conf.py b/docs/conf.py index dab9b26ca..b4715c4ca 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,4 +1,3 @@ -import os from dataclasses import asdict from sphinxawesome_theme import ThemeOptions diff --git a/docs/howtos/customisations/embeddings.ipynb b/docs/howtos/customisations/embeddings.ipynb index 7cc4f9abe..8c31c816f 100644 --- a/docs/howtos/customisations/embeddings.ipynb +++ b/docs/howtos/customisations/embeddings.ipynb @@ -169,7 +169,7 @@ "\n", "result = evaluate(\n", " fiqa_eval[\"baseline\"].select(range(5)), # showing only 5 for demonstration\n", - " metrics=[answer_similarity]\n", + " metrics=[answer_similarity],\n", ")\n", "\n", "result" diff --git a/docs/howtos/customisations/gcp-vertexai.ipynb 
b/docs/howtos/customisations/gcp-vertexai.ipynb index 7ddf68f15..810968b0b 100644 --- a/docs/howtos/customisations/gcp-vertexai.ipynb +++ b/docs/howtos/customisations/gcp-vertexai.ipynb @@ -98,7 +98,7 @@ "source": [ "from ragas.metrics import (\n", " context_precision,\n", - " answer_relevancy, # AnswerRelevancy\n", + " answer_relevancy, # AnswerRelevancy\n", " faithfulness,\n", " context_recall,\n", ")\n", @@ -110,7 +110,7 @@ " answer_relevancy,\n", " context_recall,\n", " context_precision,\n", - " harmfulness\n", + " harmfulness,\n", "]" ] }, @@ -137,7 +137,6 @@ "from langchain.embeddings import VertexAIEmbeddings\n", "\n", "\n", - "\n", "config = {\n", " \"project_id\": \"tmp-project-404003\",\n", "}\n", @@ -170,7 +169,7 @@ "for m in metrics:\n", " # change LLM for metric\n", " m.__setattr__(\"llm\", ragas_vertexai_llm)\n", - " \n", + "\n", " # check if this metric needs embeddings\n", " if hasattr(m, \"embeddings\"):\n", " # if so change with VertexAI Embeddings\n", @@ -276,13 +275,15 @@ ], "source": [ "from ragas import evaluate\n", - "import nest_asyncio # CHECK NOTES\n", + "import nest_asyncio # CHECK NOTES\n", "\n", - "# NOTES: Only used when running on a jupyter notebook, otherwise comment or remove this function. \n", - "nest_asyncio.apply() \n", + "# NOTES: Only used when running on a jupyter notebook, otherwise comment or remove this function.\n", + "nest_asyncio.apply()\n", "\n", "result = evaluate(\n", - " fiqa_eval[\"baseline\"].select(range(1)), # using 1 as example due to quota constrains\n", + " fiqa_eval[\"baseline\"].select(\n", + " range(1)\n", + " ), # using 1 as example due to quota constrains\n", " metrics=metrics,\n", ")\n", "\n", diff --git a/docs/howtos/integrations/zeno.ipynb b/docs/howtos/integrations/zeno.ipynb index 8e00f56b3..1f1891638 100644 --- a/docs/howtos/integrations/zeno.ipynb +++ b/docs/howtos/integrations/zeno.ipynb @@ -186,7 +186,7 @@ " ]\n", "].copy()\n", "\n", - "output_df['output'] = df.apply(\n", + "output_df[\"output\"] = df.apply(\n", " lambda x: {\"answer\": x[\"answer\"], \"ground_truths\": list(x[\"ground_truths\"])}, axis=1\n", ")\n", "output_df[\"id\"] = output_df.index\n", diff --git a/src/ragas/callbacks.py b/src/ragas/callbacks.py index af97592d8..87e286a51 100644 --- a/src/ragas/callbacks.py +++ b/src/ragas/callbacks.py @@ -13,7 +13,7 @@ def new_group( name: str, inputs: t.Dict, callbacks: Callbacks, is_async=False -) -> t.Tuple[CallbackManagerForChainRun, CallbackManager]: +) -> t.Tuple[CallbackManagerForChainRun, CallbackManagerForChainGroup]: # start evaluation chain if isinstance(callbacks, list): cm = CallbackManager.configure(inheritable_callbacks=callbacks) diff --git a/src/ragas/embeddings/base.py b/src/ragas/embeddings/base.py index aff623477..8dc27e5e5 100644 --- a/src/ragas/embeddings/base.py +++ b/src/ragas/embeddings/base.py @@ -9,7 +9,6 @@ from langchain.embeddings import AzureOpenAIEmbeddings as BaseAzureOpenAIEmbeddings from langchain.embeddings import FastEmbedEmbeddings as BaseFastEmbedEmbeddings from langchain.embeddings import OpenAIEmbeddings as BaseOpenAIEmbeddings -from langchain.embeddings import FastEmbedEmbeddings as BaseFastEmbedEmbeddings from langchain.schema.embeddings import Embeddings from pydantic.dataclasses import dataclass @@ -44,7 +43,8 @@ def validate_api_key(self): else: raise OpenAIKeyNotFound -class FastEmbedEmbeddings(BaseFastEmbedEmbeddings, RagasEmbeddings): + +class FastEmbedEmbeddings(BaseFastEmbedEmbeddings, BaseRagasEmbeddings): """ Find the list of supported models at: 
https://qdrant.github.io/fastembed/examples/Supported_Models/ @@ -61,6 +61,7 @@ def validate_api_key(self): """ pass + class AzureOpenAIEmbeddings(BaseAzureOpenAIEmbeddings, BaseRagasEmbeddings): azure_endpoint: t.Optional[str] = None deployment: t.Optional[str] = None diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index 813bfb136..6fa42c321 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -1,17 +1,13 @@ from __future__ import annotations -import asyncio import typing as t -from concurrent.futures import ALL_COMPLETED, ThreadPoolExecutor, wait from dataclasses import dataclass, field import numpy as np from datasets import Dataset, concatenate_datasets from langchain_core.language_models import BaseLanguageModel -from tqdm import tqdm from ragas._analytics import EvaluationEvent, track -from ragas.async_utils import run_async_tasks from ragas.callbacks import new_group from ragas.embeddings.base import BaseRagasEmbeddings from ragas.executor import Executor @@ -138,6 +134,7 @@ def evaluate( name="ragas evaluation", inputs={}, callbacks=callbacks, is_async=is_async ) for i, row in enumerate(dataset): + row = t.cast(t.Dict[str, t.Any], row) row_rm, row_group_cm = new_group( name=f"row {i}", inputs=row, @@ -173,8 +170,6 @@ def evaluate( raise e finally: - # close the evaluation chain - # TODO: show only aggregate scores result = Result( scores=Dataset.from_list(scores), dataset=dataset, diff --git a/src/ragas/executor.py b/src/ragas/executor.py index 422ea2d59..16fc105ff 100644 --- a/src/ragas/executor.py +++ b/src/ragas/executor.py @@ -4,8 +4,6 @@ from dataclasses import dataclass, field import numpy as np -from nltk.classify.textcat import re -from sqlalchemy import except_ from tqdm.auto import tqdm diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py index e68191216..f03b677ae 100644 --- a/src/ragas/llms/base.py +++ b/src/ragas/llms/base.py @@ -4,18 +4,15 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from langchain.chat_models import AzureChatOpenAI, BedrockChat, ChatOpenAI, ChatVertexAI -from langchain.llms import AmazonAPIGateway, AzureOpenAI, Bedrock, OpenAI, VertexAI +from langchain.chat_models import AzureChatOpenAI, ChatOpenAI, ChatVertexAI +from langchain.llms import AzureOpenAI, OpenAI, VertexAI from langchain_core.language_models import BaseLanguageModel from langchain_core.outputs import LLMResult if t.TYPE_CHECKING: - from langchain.prompts import ChatPromptTemplate from langchain_core.callbacks import Callbacks from langchain_core.prompt_values import PromptValue - from ragas.llms.prompt import Prompt - MULTIPLE_COMPLETION_SUPPORTED = [ OpenAI, ChatOpenAI, @@ -39,7 +36,7 @@ class BaseRagasLLM(ABC): @abstractmethod def generate_text( self, - prompt: Prompt, + prompt: PromptValue, n: int = 1, temperature: float = 1e-8, stop: t.Optional[t.List[str]] = None, @@ -50,7 +47,7 @@ def generate_text( @abstractmethod async def agenerate_text( self, - prompt: Prompt, + prompt: PromptValue, n: int = 1, temperature: float = 1e-8, stop: t.Optional[t.List[str]] = None, @@ -72,7 +69,7 @@ class LangchainLLMWrapper(BaseRagasLLM): def generate_text( self, - prompt: Prompt, + prompt: PromptValue, n: int = 1, temperature: float = 1e-8, stop: t.Optional[t.List[str]] = None, @@ -101,7 +98,7 @@ def generate_text( async def agenerate_text( self, - prompt: Prompt, + prompt: PromptValue, n: int = 1, temperature: float = 1e-8, stop: t.Optional[t.List[str]] = None, diff --git a/src/ragas/llms/json_load.py b/src/ragas/llms/json_load.py 
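For reference, the trimmed-down BaseRagasLLM contract above exposes two methods that take a PromptValue and return a langchain LLMResult. A rough, illustrative stub is sketched below; it assumes the import paths shown in this patch and that generate_text/agenerate_text are the only abstract members, and is not part of the change set itself:

import typing as t

from langchain_core.outputs import Generation, LLMResult

from ragas.llms.base import BaseRagasLLM
from ragas.llms.prompt import PromptValue


class EchoLLM(BaseRagasLLM):
    """Toy LLM that echoes the prompt back -- handy for unit tests."""

    def generate_text(
        self,
        prompt: PromptValue,
        n: int = 1,
        temperature: float = 1e-8,
        stop: t.Optional[t.List[str]] = None,
        callbacks=None,
    ) -> LLMResult:
        # wrap the prompt text in the LLMResult shape the metrics expect
        return LLMResult(generations=[[Generation(text=prompt.to_string())] * n])

    async def agenerate_text(
        self,
        prompt: PromptValue,
        n: int = 1,
        temperature: float = 1e-8,
        stop: t.Optional[t.List[str]] = None,
        callbacks=None,
    ) -> LLMResult:
        return self.generate_text(prompt, n, temperature, stop, callbacks)


result = EchoLLM().generate_text(PromptValue(prompt_str="ping"))
print(result.generations[0][0].text)  # "ping"
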
new file mode 100644 index 000000000..bebfdc80f --- /dev/null +++ b/src/ragas/llms/json_load.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +import json +import logging +import typing as t +from dataclasses import dataclass + +logger = logging.getLogger(__name__) + +if t.TYPE_CHECKING: + from langchain_core.callbacks import Callbacks + + from ragas.llms.base import BaseRagasLLM + + +def load_as_json(text) -> t.Dict: + """ + validate and return given text as json + """ + + try: + return json.loads(text) + except ValueError as e: + logger.warn(f"Invalid json: {e}") + return {} + + +# not migrating to Prompt format to avoid circular imports +JSON_PROMPT = """\ +Rewrite the input into valid json + +Input: +{{ + "name": "John Doe", + "age": 30, + "isStudent": false + "address": {{ + "street": "123 Main St", + "city": "Anytown", + "state": "CA", + }} + "hobbies": ["reading", "swimming", "cycling"] +}} +Output: +{{ + "name": "John Doe", + "age": 30, + "isStudent": false, + "address": {{ + "street": "123 Main St", + "city": "Anytown", + "state": "CA" + }}, + "hobbies": ["reading", "swimming", "cycling"] +}} + + +Input: +{{ + "statement": "The Earth is also known as "Terra" " +}} +Output: +{{ + "statement": "The Earth is also known as 'Terra'" +}} + +Input: +{input} + +Output: +""" + + +@dataclass +class JsonLoader: + max_retries: int = 2 + + def safe_load(self, text: str, llm: BaseRagasLLM, callbacks: Callbacks = None): + retry = 0 + while retry <= self.max_retries: + try: + start, end = self._find_outermost_json(text) + return json.loads(text[start:end]) + except ValueError: + text = self._fix_to_json(text, llm, callbacks) + retry += 1 + + return {} + + def _fix_to_json(self, text: str, llm: BaseRagasLLM, callbacks: Callbacks): + from ragas.llms.prompt import PromptValue + + # TODO (executor) + results = llm.generate_text( + PromptValue(prompt_str=JSON_PROMPT.format(input=text)), + n=1, + callbacks=callbacks, + ) + return results.generations[0][0].text + + def _find_outermost_json(self, text): + stack = [] + start_index = -1 + + for i, char in enumerate(text): + if char in "{[": + if len(stack) == 0: + start_index = i + stack.append(char) + + elif char in "}]": + if len(stack) > 0: + last = stack.pop() + if (char == "}" and last != "{") or (char == "]" and last != "["): + # Mismatched closing brace/bracket, invalid JSON + break + + if len(stack) == 0 and start_index != -1: + # Found a valid outermost JSON + return ( + start_index, + i + 1, + ) # Add 1 to include the closing brace/bracket in the range + + return -1, -1 # No valid JSON found + + +json_loader = JsonLoader() diff --git a/src/ragas/llms/prompt.py b/src/ragas/llms/prompt.py index b05ed26da..71e9bd873 100644 --- a/src/ragas/llms/prompt.py +++ b/src/ragas/llms/prompt.py @@ -9,8 +9,9 @@ from langchain_core.prompt_values import PromptValue as BasePromptValue from langchain_core.pydantic_v1 import BaseModel, root_validator -from ragas.llms import RagasLLM -from ragas.utils import RAGAS_CACHE_HOME, json_loader +from ragas.llms import BaseRagasLLM +from ragas.llms.json_load import json_loader +from ragas.utils import get_cache_dir Example = t.Dict[str, t.Any] @@ -133,10 +134,10 @@ def format(self, **kwargs: t.Any) -> PromptValue: return PromptValue(prompt_str=prompt.format(**kwargs)) def adapt( - self, language: str, llm: RagasLLM, cache_dir: t.Optional[str] = None + self, language: str, llm: BaseRagasLLM, cache_dir: t.Optional[str] = None ) -> Prompt: # TODO: Add callbacks - cache_dir = cache_dir if cache_dir else 
RAGAS_CACHE_HOME + cache_dir = cache_dir if cache_dir else get_cache_dir() if os.path.exists(os.path.join(cache_dir, language, f"{self.name}.json")): return self._load(language, self.name, cache_dir) @@ -160,7 +161,10 @@ def adapt( ) ) - results = [result[0].text for result in llm.generate(prompts).generations] + # NOTE: this is a slow loop, consider Executor to fasten this + results = [] + for p in prompts: + results.append(llm.generate_text(p).generations[0][0].text) per_example_items = len(self.input_keys) + 1 grouped_results = [ results[i : i + per_example_items] diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index 1d60c15f4..f73aa1f90 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -1,5 +1,3 @@ -from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness -from ragas.metrics._answer_relevance import AnswerRelevancy, answer_relevancy from ragas.metrics._answer_similarity import AnswerSimilarity, answer_similarity from ragas.metrics._context_precision import ( ContextPrecision, @@ -7,35 +5,15 @@ context_precision, context_utilization, ) -from ragas.metrics._context_recall import ContextRecall, context_recall -from ragas.metrics._context_relevancy import ContextRelevancy, context_relevancy from ragas.metrics._faithfulness import Faithfulness, faithfulness -from ragas.metrics.critique import AspectCritique - -DEFAULT_METRICS = [ - answer_relevancy, - context_precision, - faithfulness, - context_recall, - context_relevancy, -] __all__ = [ "Faithfulness", "faithfulness", - "AnswerRelevancy", - "answer_relevancy", "AnswerSimilarity", "answer_similarity", - "AnswerCorrectness", - "answer_correctness", - "ContextRelevancy", - "context_relevancy", "ContextPrecision", "context_precision", - "AspectCritique", - "ContextRecall", - "context_recall", "ContextUtilization", "context_utilization", ] diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index e0c50c488..5a666e133 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -8,10 +8,10 @@ from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from ragas.utils import json_loader from ragas.llms.prompt import Prompt from ragas.metrics._answer_similarity import AnswerSimilarity from ragas.metrics.base import EvaluationMode, MetricWithLLM +from ragas.utils import json_loader logger = logging.getLogger(__name__) diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index a5f1af6bf..9ee560c91 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ b/src/ragas/metrics/_answer_relevance.py @@ -14,8 +14,6 @@ from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM from ragas.utils import json_loader -from ragas.llms.prompt import Prompt -from ragas.metrics.base import EvaluationMode, MetricWithLLM logger = logging.getLogger(__name__) diff --git a/src/ragas/metrics/_answer_similarity.py b/src/ragas/metrics/_answer_similarity.py index 8bd86dd98..b70a61ba5 100644 --- a/src/ragas/metrics/_answer_similarity.py +++ b/src/ragas/metrics/_answer_similarity.py @@ -5,14 +5,11 @@ from dataclasses import dataclass, field import numpy as np -from datasets import Dataset from ragas.embeddings.base import ( HuggingfaceEmbeddings, - OpenAIEmbeddings, embedding_factory, ) -from ragas.exceptions import OpenAIKeyNotFound from ragas.metrics.base import EvaluationMode, 
MetricWithLLM if t.TYPE_CHECKING: @@ -64,7 +61,7 @@ def __post_init__(self: t.Self): def init_model(self): super().init_model() - def _score(self, row: t.Dict, callbacks: Callbacks = ...) -> float: + def _score(self, row: t.Dict, callbacks: Callbacks) -> float: ground_truths, answers = row["ground_truths"], row["answer"] ground_truths = [item[0] for item in ground_truths] diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py index 8e020316c..493f504f9 100644 --- a/src/ragas/metrics/_context_precision.py +++ b/src/ragas/metrics/_context_precision.py @@ -6,17 +6,16 @@ import numpy as np from datasets import Dataset -from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from ragas.llms.prompt import Prompt -from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader -from ragas.llms.prompt import Prompt +from ragas.llms.json_load import json_loader +from ragas.llms.prompt import Prompt, PromptValue from ragas.metrics.base import EvaluationMode, MetricWithLLM if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks +logger = logging.getLogger(__name__) + CONTEXT_PRECISION = Prompt( name="context_precision", instruction="""Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not. """, @@ -76,63 +75,30 @@ class ContextPrecision(MetricWithLLM): context_precision_prompt: Prompt = field(default_factory=lambda: CONTEXT_PRECISION) batch_size: int = 15 - def adapt(self, language: str, cache_dir: str | None = None) -> None: - logging.info(f"Adapting Context Precision to {language}") - self.context_precision_prompt = self.context_precision_prompt.adapt( - language, self.llm, cache_dir - ) - - def save(self, cache_dir: str | None = None) -> None: - self.context_precision_prompt.save(cache_dir) - - def get_dataset_attributes(self, dataset: Dataset): + def _get_row_attributes(self, row: t.Dict) -> t.Tuple[str, t.List[str], t.Any]: answer = "ground_truths" - if answer not in dataset.features.keys(): - logging.warning( + if answer not in row.keys(): + logger.warning( "Using 'context_precision' without ground truth will be soon depreciated. 
Use 'context_utilization' instead" ) answer = "answer" - return dataset["question"], dataset["contexts"], dataset[answer] + return row["question"], row["contexts"], row[answer] - def _score_batch( - self: t.Self, - dataset: Dataset, - callbacks: t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list: - prompts = [] - questions, contexts, answers = self.get_dataset_attributes(dataset) - - cb = CallbackManager.configure(inheritable_callbacks=callbacks) - with trace_as_chain_group( - callback_group_name, callback_manager=cb - ) as batch_group: - for qstn, ctx, answer in zip(questions, contexts, answers): - human_prompts = [ - self.context_precision_prompt.format( - question=qstn, context=c, answer=answer - ) - for c in ctx - ] - - prompts.extend(human_prompts) - - responses: list[list[str]] = [] - results = self.llm.generate( - prompts, - n=1, - callbacks=batch_group, + def _context_precision_prompt(self, row: t.Dict) -> t.List[PromptValue]: + question, contexts, answer = self._get_row_attributes(row) + return [ + self.context_precision_prompt.format( + question=question, context=c, answer=answer ) for c in contexts ] - return [Prompt(chat_prompt_template=hp) for hp in human_prompts] def _calculate_average_precision(self, responses: t.List[str]) -> float: score = np.nan response = [json_loader.safe_load(item, self.llm) for item in responses] response = [ - int("yes" in resp.get("verdict", " ").lower()) + int("1" == resp.get("verdict", "0").strip()) if resp.get("verdict") else np.nan for resp in response @@ -147,29 +113,13 @@ def _calculate_average_precision(self, responses: t.List[str]) -> float: score = numerator / denominator return score - for response in grouped_responses: - response = [ - json_loader.safe_load(item, self.llm) for item in sum(response, []) - ] - response = [ - int("1" == resp.get("verdict", "0").strip()) - if resp.get("verdict") - else np.nan - for resp in response - ] - denominator = sum(response) + 1e-10 - numerator = sum( - [ - (sum(response[: i + 1]) / (i + 1)) * response[i] - for i in range(len(response)) - ] - ) - scores.append(numerator / denominator) + def _score(self, row: t.Dict, callbacks: Callbacks = []) -> float: + assert self.llm is not None, "LLM is not set" human_prompts = self._context_precision_prompt(row) responses: t.List[str] = [] for hp in human_prompts: - result = await self.llm.agenerate_text( + result = self.llm.generate_text( hp, n=1, callbacks=callbacks, @@ -179,13 +129,17 @@ def _calculate_average_precision(self, responses: t.List[str]) -> float: score = self._calculate_average_precision(responses) return score - def _score(self, row: t.Dict, callbacks: Callbacks = []) -> float: + async def _ascore( + self: t.Self, + row: t.Dict, + callbacks: Callbacks = [], + ) -> float: assert self.llm is not None, "LLM is not set" human_prompts = self._context_precision_prompt(row) responses: t.List[str] = [] for hp in human_prompts: - result = self.llm.generate_text( + result = await self.llm.agenerate_text( hp, n=1, callbacks=callbacks, @@ -195,13 +149,14 @@ def _score(self, row: t.Dict, callbacks: Callbacks = []) -> float: score = self._calculate_average_precision(responses) return score + def adapt(self, language: str, cache_dir: str | None = None) -> None: + logging.info(f"Adapting Context Precision to {language}") + self.context_precision_prompt = self.context_precision_prompt.adapt( + language, self.llm, cache_dir + ) -class ContextUtilization(ContextPrecision): - name = "ContextUtilization" - evaluation_mode = 
EvaluationMode.qac - - def get_dataset_attributes(self, dataset: Dataset): - return dataset["question"], dataset["contexts"], dataset["answer"] + def save(self, cache_dir: str | None = None) -> None: + self.context_precision_prompt.save(cache_dir) class ContextUtilization(ContextPrecision): diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index 476ef1b81..2efad34cc 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -5,14 +5,11 @@ from dataclasses import dataclass, field import numpy as np -from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM from ragas.utils import json_loader -from ragas.llms.prompt import Prompt -from ragas.metrics.base import EvaluationMode, MetricWithLLM if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 6faa6313f..fb96f3842 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -5,19 +5,17 @@ from dataclasses import dataclass, field import numpy as np -from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from ragas.llms.prompt import Prompt -from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader +from ragas.llms.json_load import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM if t.TYPE_CHECKING: - from datasets import Dataset from langchain_core.callbacks import Callbacks from langchain_core.outputs import LLMResult + from ragas.llms.prompt import PromptValue + logger = logging.getLogger(__name__) LONG_FORM_ANSWER_PROMPT = Prompt( @@ -133,25 +131,50 @@ class Faithfulness(MetricWithLLM): ) batch_size: int = 15 - def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: - logger.info(f"Adapting Faithfulness metric to {language}") - self.long_form_answer_prompt = self.long_form_answer_prompt.adapt( - language, self.llm, cache_dir + def _create_answer_prompt(self, row: t.Dict) -> PromptValue: + question, answer = row["question"], row["answer"] + + # extract statements from answer given the question + prompt_value = LONG_FORM_ANSWER_PROMPT.format(question=question, answer=answer) + return prompt_value + + def _create_nli_prompt(self, row: t.Dict, answer_result: LLMResult) -> PromptValue: + contexts = row["contexts"] + # check if the statements are support in the contexts + contexts_str: str = "\n".join(contexts) + statements = json_loader.safe_load( + answer_result.generations[0][0].text, self.llm + ).get("statements", []) + statements = statements if statements != [] else ["Nil"] + statements_str: str = "\n".join( + [f"statement_{i+1}: {st}" for i, st in enumerate(statements)] ) - self.nli_statements_message = self.nli_statements_message.adapt( - language, self.llm, cache_dir + prompt_value = NLI_STATEMENTS_MESSAGE.format( + context=contexts_str, statements=statements_str + ) + return prompt_value + + def _compute_score(self, result: LLMResult): + # check the verdicts and compute the score + output = result.generations[0][0] + verdict_score_map = {"1": 1, "0": 0, "null": np.nan} + output = json_loader.safe_load(output.text, self.llm) + output = output if isinstance(output, list) else [output] + faithful_statements = sum( + verdict_score_map.get( + 
statement_with_validation.get("verdict", "").lower(), np.nan + ) + for statement_with_validation in output ) + num_statements = len(output) + if num_statements: + score = faithful_statements / num_statements + else: + score = np.nan - def save(self, cache_dir: t.Optional[str] = None) -> None: - self.long_form_answer_prompt.save(cache_dir) - self.nli_statements_message.save(cache_dir) + return score - def _score_batch( - self: t.Self, - dataset: Dataset, - callbacks: t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list[float]: + async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: """ returns the NLI score for each (q, c, a) pair """ @@ -162,53 +185,32 @@ def _score_batch( p = self._create_nli_prompt(row, result) result = await self.llm.agenerate_text(p, callbacks=callbacks) - cb = CallbackManager.configure(inheritable_callbacks=callbacks) - with trace_as_chain_group( - callback_group_name, callback_manager=cb - ) as batch_group: - for q, a in zip(question, answer): - human_prompt = LONG_FORM_ANSWER_PROMPT.format(question=q, answer=a) - prompts.append(human_prompt) + return self._compute_score(result) def _score(self, row: t.Dict, callbacks: Callbacks) -> float: assert self.llm is not None, "LLM is not set" p = self._create_answer_prompt(row) result = self.llm.generate_text(p, callbacks=callbacks) - prompts = [] - for context, output in zip(contexts, result.generations): - statements = json_loader.safe_load(output[0].text, self.llm).get( - "statements", [] - ) - statements = statements if statements != [] else ["Nil"] - statements_str: str = "\n".join( - [f"statement_{i+1}: {st}" for i, st in enumerate(statements)] - ) - contexts_str: str = "\n".join(context) - human_prompt = self.nli_statements_message.format( - context=contexts_str, statements=statements_str - ) - prompts.append(human_prompt) - - result = self.llm.generate(prompts, callbacks=batch_group) - outputs = result.generations - verdict_score_map = {"1": 1, "0": 0, "null": np.nan} - scores = [] - for output in outputs: - output = json_loader.safe_load(output[0].text, self.llm) - output = output if isinstance(output, list) else [output] - faithful_statements = sum( - verdict_score_map.get(dict.get("verdict", "").lower(), np.nan) - for dict in output - ) - num_statements = len(output) - if num_statements: - score = faithful_statements / num_statements - else: - score = np.nan - scores.append(score) - - return scores + p = self._create_nli_prompt(row, result) + result = self.llm.generate_text(p, callbacks=callbacks) + + return self._compute_score(result) + + def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: + assert self.llm is not None, "LLM is not set" + + logger.info(f"Adapting Faithfulness metric to {language}") + self.long_form_answer_prompt = self.long_form_answer_prompt.adapt( + language, self.llm, cache_dir + ) + self.nli_statements_message = self.nli_statements_message.adapt( + language, self.llm, cache_dir + ) + + def save(self, cache_dir: t.Optional[str] = None) -> None: + self.long_form_answer_prompt.save(cache_dir) + self.nli_statements_message.save(cache_dir) faithfulness = Faithfulness() diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index ee3ee432c..0b04d03d1 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -8,17 +8,10 @@ import typing as t from abc import ABC, abstractmethod -from dataclasses import dataclass, field +from dataclasses import dataclass from enum import Enum -from math import floor - 
-from datasets import Dataset -from langchain_core.callbacks import CallbackManager, CallbackManagerForChainGroup -from tqdm import tqdm from ragas.callbacks import new_group -from ragas.embeddings.base import BaseRagasEmbeddings -from ragas.llms import llm_factory if t.TYPE_CHECKING: from langchain_core.callbacks import Callbacks @@ -50,19 +43,21 @@ def init_model(self): """ ... - # @abstractmethod def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: """ Adapt the metric to a different language. """ - pass + raise NotImplementedError( + "adapt() is not implemented for {} metric".format(self.name) + ) - # @abstractmethod def save(self, cache_dir: t.Optional[str] = None) -> None: """ Save the metric to a path. """ - pass + raise NotImplementedError( + "adapt() is not implemented for {} metric".format(self.name) + ) def score( self: t.Self, @@ -83,7 +78,7 @@ def score( rm.on_chain_end({"output": score}) return score - # @abstractmethod + @abstractmethod def _score(self, row: t.Dict, callbacks: Callbacks = []) -> float: ... @@ -121,10 +116,3 @@ def init_model(self): raise ValueError( f"Metric '{self.name}' has no valid LLM provided (self.llm is None). Please initantiate a the metric with an LLM to run." # noqa ) - if hasattr(self.llm, "validate_api_key"): - self.llm.validate_api_key() - if hasattr(self, "embeddings"): - # since we are using Langchain Embeddings directly, we need to check this - if hasattr(self.embeddings, "validate_api_key"): - self.embeddings = t.cast(BaseRagasEmbeddings, self.embeddings) - self.embeddings.validate_api_key() diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index fca25eea2..096acc330 100644 --- a/src/ragas/metrics/critique.py +++ b/src/ragas/metrics/critique.py @@ -9,7 +9,6 @@ from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from ragas.utils import json_loader from ragas.llms import llm_factory from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM diff --git a/src/ragas/testset/testset_generator.py b/src/ragas/testset/testset_generator.py index 44a3366f7..f53206ba9 100644 --- a/src/ragas/testset/testset_generator.py +++ b/src/ragas/testset/testset_generator.py @@ -414,9 +414,7 @@ def generate( proposal = None try: - proposal = self._make_proposal( - curr_node, neighbor_nodes, evolve_type - ) + proposal = self._make_proposal(curr_node, neighbor_nodes, evolve_type) except Exception as e: err_cause = e.__cause__ if not isinstance(err_cause, retry_errors): diff --git a/src/ragas/utils.py b/src/ragas/utils.py index fa66fc3fc..5f449a6ea 100644 --- a/src/ragas/utils.py +++ b/src/ragas/utils.py @@ -1,27 +1,20 @@ from __future__ import annotations -import json import os -import typing as t -import warnings -from dataclasses import dataclass from functools import lru_cache -from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate - -if t.TYPE_CHECKING: - from ragas.llms import RagasLLM - DEBUG_ENV_VAR = "RAGAS_DEBUG" # constant to tell us that there is no key passed to the llm/embeddings NO_KEY = "no-key" -# Cache location -DEFAULT_XDG_CACHE_HOME = "~/.cache" -XDG_CACHE_HOME = os.getenv("XDG_CACHE_HOME", DEFAULT_XDG_CACHE_HOME) -DEFAULT_RAGAS_CACHE_HOME = os.path.join(XDG_CACHE_HOME, "ragas") -RAGAS_CACHE_HOME = os.path.expanduser(os.getenv("RAGAS_HOME", DEFAULT_RAGAS_CACHE_HOME)) + +@lru_cache(maxsize=1) +def 
get_cache_dir() -> str: + "get cache location" + DEFAULT_XDG_CACHE_HOME = "~/.cache" + xdg_cache = os.getenv("XDG_CACHE_HOME", DEFAULT_XDG_CACHE_HOME) + default_ragas_cache = os.path.join(xdg_cache, "ragas") + return os.path.expanduser(os.getenv("RAGAS_HOME", default_ragas_cache)) @lru_cache(maxsize=1) @@ -30,133 +23,3 @@ def get_debug_mode() -> bool: return True else: return False - - -def load_as_json(text): - """ - validate and return given text as json - """ - - try: - return json.loads(text) - except ValueError as e: - warnings.warn(f"Invalid json: {e}") - - return {} - - -# not migrating to Prompt format to avoid circular imports -JSON_PROMPT = HumanMessagePromptTemplate.from_template( - """ - -Rewrite the input into valid json - - -Input: -{{ - "name": "John Doe", - "age": 30, - "isStudent": false - "address": {{ - "street": "123 Main St", - "city": "Anytown", - "state": "CA", - }} - "hobbies": ["reading", "swimming", "cycling"] -}} -Output: -{{ - "name": "John Doe", - "age": 30, - "isStudent": false, - "address": {{ - "street": "123 Main St", - "city": "Anytown", - "state": "CA" - }}, - "hobbies": ["reading", "swimming", "cycling"] -}} - - -Input: -{{ - "statement": "The Earth is also known as "Terra" " -}} -Output: -{{ - "statement": "The Earth is also known as 'Terra'" -}} - -Input: -{input} - -Output: -""" -) - - -@dataclass -class JsonLoader: - max_retries: int = 2 - - def safe_load(self, text: str, llm: RagasLLM): - retry = 0 - while retry <= self.max_retries: - try: - start, end = self._find_outermost_json(text) - return json.loads(text[start:end]) - except ValueError: - text = self._fix_to_json(text, llm) - retry += 1 - - return {} - - def _fix_to_json( - self, - text, - llm, - callbacks: t.Optional[CallbackManager] = None, - callback_group_name: str = "batch", - ): - # TODO (executor) - with trace_as_chain_group( - callback_group_name, callback_manager=callbacks - ) as batch_group: - human_prompt = ChatPromptTemplate.from_messages( - [JSON_PROMPT.format(input=text)] - ) - results = llm.generate( - [human_prompt], - n=1, - callbacks=batch_group, - ) - return results.generations[0][0].text - - def _find_outermost_json(self, text): - stack = [] - start_index = -1 - - for i, char in enumerate(text): - if char in "{[": - if len(stack) == 0: - start_index = i - stack.append(char) - - elif char in "}]": - if len(stack) > 0: - last = stack.pop() - if (char == "}" and last != "{") or (char == "]" and last != "["): - # Mismatched closing brace/bracket, invalid JSON - break - - if len(stack) == 0 and start_index != -1: - # Found a valid outermost JSON - return ( - start_index, - i + 1, - ) # Add 1 to include the closing brace/bracket in the range - - return -1, -1 # No valid JSON found - - -json_loader = JsonLoader() \ No newline at end of file diff --git a/tests/unit/test_import.py b/tests/unit/test_import.py index 53b312db8..0df78a883 100644 --- a/tests/unit/test_import.py +++ b/tests/unit/test_import.py @@ -27,4 +27,4 @@ def test_import_module(): assert hasattr(ragas.metrics, metric) for metric in test_critique: - assert hasattr(ragas.metrics.critique, metric) \ No newline at end of file + assert hasattr(ragas.metrics.critique, metric) diff --git a/tests/unit/test_prompt.py b/tests/unit/test_prompt.py index feae8fa94..62bae0b66 100644 --- a/tests/unit/test_prompt.py +++ b/tests/unit/test_prompt.py @@ -1,39 +1,39 @@ from ragas.llms.prompt import Prompt TESTCASES = [ - { - "instruction" : 'Create one or more statements from each sentence in the given answer.', - "examples" : [ - { - 
"question":"Cadmium Chloride is slightly soluble in this chemical, it is also called what?", - "answer":"alcohol", - "statements in json":"""{ + { + "instruction": "Create one or more statements from each sentence in the given answer.", + "examples": [ + { + "question": "Cadmium Chloride is slightly soluble in this chemical, it is also called what?", + "answer": "alcohol", + "statements in json": """{ "statements": [ "Cadmium Chloride is slightly soluble in alcohol." ] - }""" - }, - { - "question":"Were Hitler and Benito Mussolini of the same nationality?", - "answer":"Sorry, I can't provide answer to that question.", - "statements in json":"""{ + }""", + }, + { + "question": "Were Hitler and Benito Mussolini of the same nationality?", + "answer": "Sorry, I can't provide answer to that question.", + "statements in json": """{ "statements": [] - }""" - } - ], - "input_keys" : ["question", "answer"], - "output_key" : "statements in json", - }, - { - "instruction" : 'Natural language inference. Use only "Yes" (1) or "No" (0) as a binary verdict.', - "examples" : [ - { - "Context":"""John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects. + }""", + }, + ], + "input_keys": ["question", "answer"], + "output_key": "statements in json", + }, + { + "instruction": 'Natural language inference. Use only "Yes" (1) or "No" (0) as a binary verdict.', + "examples": [ + { + "Context": """John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects. statement_1: John is majoring in Biology. statement_2: John is taking a course on Artificial Intelligence. statement_3: John is a dedicated student. statement_4: John has a part-time job.""", - "Answer":"""[ + "Answer": """[ { "statement_1": "John is majoring in Biology.", "reason": "John's major is explicitly mentioned as Computer Science. 
There is no information suggesting he is majoring in Biology.", @@ -54,31 +54,43 @@ "reason": "There is no information given in the context about John having a part-time job.", "verdict": "0" }] - """ - } - ], - "input_keys" : ["Context"], - "output_key" : "Answer", - "output_type" : "json" - }, - { - "instruction" : 'This is a test prompt without examples', - "input_keys" : ["Context"], - "output_key" : "Answer", - "output_type" : "json" - }, + """, + } + ], + "input_keys": ["Context"], + "output_key": "Answer", + "output_type": "json", + }, + { + "instruction": "This is a test prompt without examples", + "input_keys": ["Context"], + "output_key": "Answer", + "output_type": "json", + }, ] -def test_prompt_object(): +def test_prompt_object(): for testcase in TESTCASES: prompt = Prompt(**testcase) assert prompt is not None, "Prompt object is not created" - assert prompt.instruction==testcase['instruction'], "instruction in object is not same as in the testcase" - assert prompt.input_keys==testcase['input_keys'], "input_keys in object is not same as in the testcase" - assert prompt.output_key==testcase['output_key'], "output_key in object is not same as in the testcase" - assert prompt.output_type==testcase.get('output_type', 'json'), "output_type in object is not same as in the testcase" - assert prompt.examples==testcase.get('examples', []), "examples should be empty if not provided" - if testcase.get('examples'): - assert isinstance(prompt.get_example_str(0), str), "get_example_str should return a string" \ No newline at end of file + assert ( + prompt.instruction == testcase["instruction"] + ), "instruction in object is not same as in the testcase" + assert ( + prompt.input_keys == testcase["input_keys"] + ), "input_keys in object is not same as in the testcase" + assert ( + prompt.output_key == testcase["output_key"] + ), "output_key in object is not same as in the testcase" + assert prompt.output_type == testcase.get( + "output_type", "json" + ), "output_type in object is not same as in the testcase" + assert prompt.examples == testcase.get( + "examples", [] + ), "examples should be empty if not provided" + if testcase.get("examples"): + assert isinstance( + prompt.get_example_str(0), str + ), "get_example_str should return a string" From 777db4d746e1a4431d0f51ad7de2449b4e7c1577 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Tue, 2 Jan 2024 17:16:53 +0530 Subject: [PATCH 20/34] answer_correctness ported --- src/ragas/metrics/__init__.py | 3 + src/ragas/metrics/_answer_correctness.py | 123 ++++++++++++++++------- src/ragas/metrics/_faithfulness.py | 4 + 3 files changed, 94 insertions(+), 36 deletions(-) diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index f73aa1f90..4b77a2485 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -1,3 +1,4 @@ +from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness from ragas.metrics._answer_similarity import AnswerSimilarity, answer_similarity from ragas.metrics._context_precision import ( ContextPrecision, @@ -8,6 +9,8 @@ from ragas.metrics._faithfulness import Faithfulness, faithfulness __all__ = [ + "AnswerCorrectness", + "answer_correctness", "Faithfulness", "faithfulness", "AnswerSimilarity", diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index 5a666e133..09b4d78ee 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -8,17 +8,19 @@ from datasets import Dataset from 
langchain.callbacks.manager import CallbackManager, trace_as_chain_group +from ragas.llms.json_load import json_loader from ragas.llms.prompt import Prompt from ragas.metrics._answer_similarity import AnswerSimilarity from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader logger = logging.getLogger(__name__) if t.TYPE_CHECKING: - from langchain.callbacks.base import Callbacks + from langchain_core.callbacks import Callbacks + from langchain_core.outputs import LLMResult CORRECTNESS_PROMPT = Prompt( + name="answer_correctness", instruction="""Extract following from given question and ground truth""", examples=[ { @@ -94,14 +96,79 @@ def __post_init__(self: t.Self): llm=self.llm, batch_size=self.batch_size ) - def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: - logger.info(f"Adapting AnswerCorrectness metric to {language}") - self.correctness_prompt = self.correctness_prompt.adapt( - language, self.llm, cache_dir + def _compute_statement_presence(self, result: LLMResult) -> float: + assert self.llm is not None, "LLM must be set" + + key_map = { + "TP": "statements that are present in both the answer and the ground truth", + "FP": "statements present in the answer but not found in the ground truth", + "FN": "relevant statements found in the ground truth but omitted in the answer", # noqa: E501 + } + outputs = result.generations[0] + + prediction = json_loader.safe_load(outputs[0].text, self.llm) + prediction = prediction if isinstance(prediction, list) else [prediction] + if prediction: + prediction = [ + item.get(key_map[k], np.nan) + for item in prediction + for k in key_map.keys() + ] + tp, fp, fn = [ + len(item) if isinstance(item, list) else np.nan for item in prediction + ] + score = tp / (tp + 0.5 * (fp + fn)) + else: + score = np.nan + + return score + + def _score(self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "LLM must be set" + q, a, g = row["question"], row["answer"], row["ground_truths"][0] + p_value = self.correctness_prompt.format(question=q, ground_truth=g, answer=a) + is_statement_present = self.llm.generate_text(p_value, callbacks=callbacks) + + f1_score = self._compute_statement_presence(is_statement_present) + + if self.weights[1] == 0: + similarity_score = 0 + else: + similarity_score = self.answer_similarity.score(row, callbacks=callbacks) # type: ignore + + score = np.average( + [f1_score, similarity_score], + weights=self.weights, ) - def save(self, cache_dir: t.Optional[str] = None) -> None: - self.correctness_prompt.save(cache_dir) + return score + + async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "LLM must be set" + + q, a, g = row["question"], row["answer"], row["ground_truths"][0] + p_value = self.correctness_prompt.format(question=q, ground_truth=g, answer=a) + is_statement_present = await self.llm.agenerate_text( + p_value, callbacks=callbacks + ) + + f1_score = self._compute_statement_presence(is_statement_present) + + if self.weights[1] == 0: + similarity_score = 0 + else: + assert self.answer_similarity is not None, "AnswerSimilarity must be set" + + similarity_score = await self.answer_similarity.ascore( + row, callbacks=callbacks + ) + + score = np.average( + [f1_score, similarity_score], + weights=self.weights, + ) + + return score def _score_batch( self: t.Self, @@ -128,39 +195,12 @@ def _score_batch( ) result = self.llm.generate(prompts, callbacks=batch_group) - outputs = result.generations - key_map = 
{ - "TP": "statements that are present in both the answer and the ground truth", - "FP": "statements present in the answer but not found in the ground truth", - "FN": "relevant statements found in the ground truth but omitted in the answer", # noqa: E501 - } - - f1_score = [] - for prediction in outputs: - prediction = json_loader.safe_load(prediction[0].text, self.llm) - prediction = ( - prediction if isinstance(prediction, list) else [prediction] - ) - if prediction: - prediction = [ - item.get(key_map[k], np.nan) - for item in prediction - for k in key_map.keys() - ] - tp, fp, fn = [ - len(item) if isinstance(item, list) else np.nan - for item in prediction - ] - score = tp / (tp + 0.5 * (fp + fn)) - else: - score = np.nan - - f1_score.append(score) if self.weights[1] == 0: similarity_scores = np.zeros(len(f1_score)) else: similarity_scores = self.answer_similarity._score_batch(dataset, callbacks=batch_group) # type: ignore + scores_stacked = np.vstack([f1_score, similarity_scores]) scores = np.average( scores_stacked, @@ -170,5 +210,16 @@ def _score_batch( return scores.tolist() + def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: + assert self.llm is not None, "llm must be set to compute score" + + logger.info(f"Adapting AnswerCorrectness metric to {language}") + self.correctness_prompt = self.correctness_prompt.adapt( + language, self.llm, cache_dir + ) + + def save(self, cache_dir: t.Optional[str] = None) -> None: + self.correctness_prompt.save(cache_dir) + answer_correctness = AnswerCorrectness() diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index fb96f3842..ab8078c8d 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -139,6 +139,8 @@ def _create_answer_prompt(self, row: t.Dict) -> PromptValue: return prompt_value def _create_nli_prompt(self, row: t.Dict, answer_result: LLMResult) -> PromptValue: + assert self.llm is not None, "llm must be set to compute score" + contexts = row["contexts"] # check if the statements are support in the contexts contexts_str: str = "\n".join(contexts) @@ -155,6 +157,8 @@ def _create_nli_prompt(self, row: t.Dict, answer_result: LLMResult) -> PromptVal return prompt_value def _compute_score(self, result: LLMResult): + assert self.llm is not None, "llm must be set to compute score" + # check the verdicts and compute the score output = result.generations[0][0] verdict_score_map = {"1": 1, "0": 0, "null": np.nan} From baebebc57fa7d52ee7af30a25de6d1f2871986cc Mon Sep 17 00:00:00 2001 From: jjmachan Date: Tue, 2 Jan 2024 20:19:42 +0530 Subject: [PATCH 21/34] formating --- src/ragas/metrics/_answer_correctness.py | 2 +- src/ragas/metrics/_answer_relevance.py | 108 ++++++++++++----------- src/ragas/metrics/_answer_similarity.py | 5 +- src/ragas/metrics/_context_recall.py | 21 ++--- src/ragas/metrics/_context_relevancy.py | 2 +- src/ragas/metrics/base.py | 4 +- 6 files changed, 71 insertions(+), 71 deletions(-) diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index 09b4d78ee..469613560 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -194,7 +194,7 @@ def _score_batch( ) ) - result = self.llm.generate(prompts, callbacks=batch_group) + self.llm.generate(prompts, callbacks=batch_group) if self.weights[1] == 0: similarity_scores = np.zeros(len(f1_score)) diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index 
9ee560c91..5891919ed 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ b/src/ragas/metrics/_answer_relevance.py @@ -5,22 +5,22 @@ from dataclasses import dataclass, field import numpy as np -from datasets import Dataset -from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from langchain.embeddings import OpenAIEmbeddings from ragas.embeddings.base import embedding_factory from ragas.exceptions import OpenAIKeyNotFound +from ragas.llms.json_load import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader logger = logging.getLogger(__name__) if t.TYPE_CHECKING: - from langchain.callbacks.base import Callbacks + from langchain_core.callbacks import Callbacks + from langchain_core.llms import LLMResult from ragas.embeddings.base import BaseRagasEmbeddings + from ragas.llms.prompt import PromptValue QUESTION_GEN = Prompt( name="question_generation", @@ -88,55 +88,6 @@ def init_model(self): if self.embeddings.openai_api_key == "no-key": raise OpenAIKeyNotFound - def adapt(self, language: str, cache_dir: str | None = None) -> None: - logger.info(f"Adapting AnswerRelevancy metric to {language}") - self.question_generation = self.question_generation.adapt( - language, self.llm, cache_dir - ) - - def save(self, cache_dir: str | None = None) -> None: - self.question_generation.save(cache_dir) - - def _score_batch( - self: t.Self, - dataset: Dataset, - callbacks: t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list[float]: - questions, answers, contexts = ( - dataset["question"], - dataset["answer"], - dataset["contexts"], - ) - - cb = CallbackManager.configure(inheritable_callbacks=callbacks) - with trace_as_chain_group( - callback_group_name, callback_manager=cb - ) as batch_group: - prompts = [] - for ans, ctx in zip(answers, contexts): - prompts.append( - self.question_generation.format(answer=ans, context="\n".join(ctx)) - ) - - results = self.llm.generate( - prompts, - n=self.strictness, - callbacks=batch_group, - ) - results = [ - [json_loader.safe_load(i.text, self.llm) for i in r] - for r in results.generations - ] - scores = [] - for question, result in zip(questions, results): - gen_questions = [item.get("question", "") for item in result] - committal = np.any([item.get("noncommittal", False) for item in result]) - cosine_sim = self.calculate_similarity(question, gen_questions) - scores.append(cosine_sim.mean() * int(not committal)) - - return scores - def calculate_similarity( self: t.Self, question: str, generated_questions: list[str] ): @@ -155,5 +106,56 @@ def calculate_similarity( / norm ) + def _calculate_score(self, result: LLMResult, row: t.Dict) -> float: + assert self.llm is not None, "LLM is not set" + + result = [ + json_loader.safe_load(r.text, self.llm) for r in result.generations[0] + ] + question = row["question"] + gen_questions = [item.get("question", "") for item in result] + committal = np.any([item.get("noncommittal", False) for item in result]) + cosine_sim = self.calculate_similarity(question, gen_questions) + score = cosine_sim.mean() * int(not committal) + + return score + + def _create_question_gen_prompt(self, row: t.Dict) -> PromptValue: + ans, ctx = row["answer"], row["contexts"] + return self.question_generation.format(answer=ans, context="\n".join(ctx)) + + def _score(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "LLM is not set" + + prompt = 
self._create_question_gen_prompt(row) + result = self.llm.generate_text( + prompt, + n=self.strictness, + callbacks=callbacks, + ) + + return self._calculate_score(result, row) + + async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "LLM is not set" + + prompt = self._create_question_gen_prompt(row) + result = await self.llm.agenerate_text( + prompt, + n=self.strictness, + callbacks=callbacks, + ) + + return self._calculate_score(result, row) + + def adapt(self, language: str, cache_dir: str | None = None) -> None: + logger.info(f"Adapting AnswerRelevancy metric to {language}") + self.question_generation = self.question_generation.adapt( + language, self.llm, cache_dir + ) + + def save(self, cache_dir: str | None = None) -> None: + self.question_generation.save(cache_dir) + answer_relevancy = AnswerRelevancy() diff --git a/src/ragas/metrics/_answer_similarity.py b/src/ragas/metrics/_answer_similarity.py index b70a61ba5..09b13de9b 100644 --- a/src/ragas/metrics/_answer_similarity.py +++ b/src/ragas/metrics/_answer_similarity.py @@ -6,10 +6,7 @@ import numpy as np -from ragas.embeddings.base import ( - HuggingfaceEmbeddings, - embedding_factory, -) +from ragas.embeddings.base import HuggingfaceEmbeddings, embedding_factory from ragas.metrics.base import EvaluationMode, MetricWithLLM if t.TYPE_CHECKING: diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index 2efad34cc..ed6a71540 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -5,6 +5,7 @@ from dataclasses import dataclass, field import numpy as np +from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from ragas.llms.prompt import Prompt @@ -60,7 +61,7 @@ """, }, ], - input_keys=["question", "context", "answer"], + input=["question", "context", "answer"], output_key="classification", output_type="json", ) @@ -96,15 +97,15 @@ def save(self, cache_dir: str | None = None) -> None: def _score_batch( self: t.Self, - row: t.Dict, + dataset: Dataset, callbacks: t.Optional[Callbacks] = None, - ) -> float: - assert self.llm is not None, "LLM is not set" - - question, ground_truth, contexts = ( - row["question"], - row["ground_truths"], - row["contexts"], + callback_group_name: str = "batch", + ) -> list: + prompts = [] + question, ground_truths, contexts = ( + dataset["question"], + dataset["ground_truths"], + dataset["contexts"], ) cb = CallbackManager.configure(inheritable_callbacks=callbacks) @@ -143,7 +144,7 @@ def _score_batch( else: scores.append(np.nan) - return score + return scores context_recall = ContextRecall() diff --git a/src/ragas/metrics/_context_relevancy.py b/src/ragas/metrics/_context_relevancy.py index a43fa4454..b3ced9e1a 100644 --- a/src/ragas/metrics/_context_relevancy.py +++ b/src/ragas/metrics/_context_relevancy.py @@ -21,7 +21,7 @@ CONTEXT_RELEVANCE = Prompt( name="context_relevancy", instruction="""Please extract relevant sentences from the provided context that is absolutely required answer the following question. If no relevant sentences are found, or if you believe the question cannot be answered from the given context, return the phrase "Insufficient Information". 
While extracting candidate sentences you're not allowed to make any changes to sentences from given context.""", - input_keys=["question", "context"], + input=["question", "context"], output_key="candidate sentences", output_type="json", ) diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index 0b04d03d1..79e832ab7 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -79,7 +79,7 @@ def score( return score @abstractmethod - def _score(self, row: t.Dict, callbacks: Callbacks = []) -> float: + def _score(self, row: t.Dict, callbacks: Callbacks) -> float: ... async def ascore(self: t.Self, row: t.Dict, callbacks: Callbacks = []) -> float: @@ -98,7 +98,7 @@ async def ascore(self: t.Self, row: t.Dict, callbacks: Callbacks = []) -> float: return score @abstractmethod - async def _ascore(self, row: t.Dict, callbacks: Callbacks = []) -> float: + async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: ... From 4318d1b131ea041afd2094c478fa2e57aedbc191 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Tue, 2 Jan 2024 23:06:44 +0530 Subject: [PATCH 22/34] critique ported --- src/ragas/metrics/__init__.py | 2 ++ src/ragas/metrics/critique.py | 56 ++++++++++++++++++++++++++++++----- 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index 4b77a2485..7079103b3 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -7,6 +7,7 @@ context_utilization, ) from ragas.metrics._faithfulness import Faithfulness, faithfulness +from ragas.metrics.critique import AspectCritique __all__ = [ "AnswerCorrectness", @@ -19,4 +20,5 @@ "context_precision", "ContextUtilization", "context_utilization", + "AspectCritique", ] diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index 096acc330..67cbd0b75 100644 --- a/src/ragas/metrics/critique.py +++ b/src/ragas/metrics/critique.py @@ -12,7 +12,7 @@ from ragas.llms import llm_factory from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader +from ragas.llms.json_load import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks @@ -86,13 +86,6 @@ def __post_init__(self: t.Self): self.strictness if self.strictness % 2 != 0 else self.strictness + 1 ) - def adapt(self, language: str, cache_dir: str | None = None) -> None: - logger.info(f"Adapting Critic to {language}") - self.critic_prompt.adapt(language, self.llm, cache_dir) - - def save(self, cache_dir: str | None = None) -> None: - self.critic_prompt.save(cache_dir) - def prompt_format( self: t.Self, question: str, @@ -107,6 +100,46 @@ def prompt_format( input=question, submission=answer, criteria=self.definition ) + def _compute_score(self, safe_loaded_responses): + ANSWER_DICT = {"1": 1, "0": 0} + if self.strictness > 1: + score = Counter( + [ + ANSWER_DICT.get(item.get("verdict", np.nan), np.nan) + for item in safe_loaded_responses + ] + ).most_common(1)[0][0] + else: + score = ANSWER_DICT.get( + safe_loaded_responses[0].get("verdict", np.nan), np.nan + ) + + return score + + def _score(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: + q, c, a = row["question"], row["contexts"], row["answer"] + + result = self.llm.generate_text( + self.prompt_format(q, a, c), callbacks=callbacks + ) + + responses = [r.text for r in result.generations[0]] + safe_loaded_responses = [json_loader.safe_load(r, self.llm) for r in responses] + + return 
self._compute_score(safe_loaded_responses) + + async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: + q, c, a = row["question"], row["contexts"], row["answer"] + + result = await self.llm.agenerate_text( + self.prompt_format(q, a, c), callbacks=callbacks + ) + + responses = [r.text for r in result.generations[0]] + safe_loaded_responses = [json_loader.safe_load(r, self.llm) for r in responses] + + return self._compute_score(safe_loaded_responses) + def _score_batch( self: t.Self, dataset: Dataset, @@ -159,6 +192,13 @@ def _score_batch( return scores + def adapt(self, language: str, cache_dir: str | None = None) -> None: + logger.info(f"Adapting Critic to {language}") + self.critic_prompt.adapt(language, self.llm, cache_dir) + + def save(self, cache_dir: str | None = None) -> None: + self.critic_prompt.save(cache_dir) + harmfulness = AspectCritique( name="harmfulness", From 306ebe1af947ec0c9cc886c4765e4b5e0fc904a9 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Wed, 3 Jan 2024 12:02:14 +0530 Subject: [PATCH 23/34] contex_recall ported --- src/ragas/metrics/__init__.py | 7 ++- src/ragas/metrics/_context_precision.py | 2 +- src/ragas/metrics/_context_recall.py | 61 ++++++++++++++++++++----- 3 files changed, 55 insertions(+), 15 deletions(-) diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index 7079103b3..79205a7bf 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -1,4 +1,7 @@ from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness +from ragas.metrics._faithfulness import Faithfulness, faithfulness +from ragas.metrics.critique import AspectCritique +from ragas.metrics._context_recall import ContextRecall, context_recall from ragas.metrics._answer_similarity import AnswerSimilarity, answer_similarity from ragas.metrics._context_precision import ( ContextPrecision, @@ -6,8 +9,6 @@ context_precision, context_utilization, ) -from ragas.metrics._faithfulness import Faithfulness, faithfulness -from ragas.metrics.critique import AspectCritique __all__ = [ "AnswerCorrectness", @@ -20,5 +21,7 @@ "context_precision", "ContextUtilization", "context_utilization", + "ContextRecall", + "context_recall", "AspectCritique", ] diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py index 493f504f9..8e896af4d 100644 --- a/src/ragas/metrics/_context_precision.py +++ b/src/ragas/metrics/_context_precision.py @@ -12,7 +12,7 @@ from ragas.metrics.base import EvaluationMode, MetricWithLLM if t.TYPE_CHECKING: - from langchain.callbacks.base import Callbacks + from langchain_core.callbacks import Callbacks logger = logging.getLogger(__name__) diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index ed6a71540..0e4b04591 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -10,10 +10,11 @@ from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader +from ragas.llms.json_load import json_loader if t.TYPE_CHECKING: - from langchain.callbacks.base import Callbacks + from langchain_core.callbacks import Callbacks + from ragas.llms.prompt import PromptValue logger = logging.getLogger(__name__) @@ -61,7 +62,7 @@ """, }, ], - input=["question", "context", "answer"], + input_keys=["question", "context", "answer"], output_key="classification", output_type="json", ) @@ -86,15 +87,6 @@ class ContextRecall(MetricWithLLM): 
context_recall_prompt: Prompt = field(default_factory=lambda: CONTEXT_RECALL_RA) batch_size: int = 15 - def adapt(self, language: str, cache_dir: str | None = None) -> None: - logger.info(f"Adapting Context Recall to {language}") - self.context_recall_prompt = self.context_recall_prompt.adapt( - language, self.llm, cache_dir - ) - - def save(self, cache_dir: str | None = None) -> None: - self.context_recall_prompt.save(cache_dir) - def _score_batch( self: t.Self, dataset: Dataset, @@ -146,5 +138,50 @@ def _score_batch( return scores + def _create_context_recall_prompt(self, row: t.Dict) -> PromptValue: + qstn, ctx, gt = row["question"], row["contexts"], row["ground_truths"] + gt = "\n".join(gt) if isinstance(gt, list) else gt + ctx = "\n".join(ctx) if isinstance(ctx, list) else ctx + + return self.context_recall_prompt.format(question=qstn, context=ctx, answer=gt) + + def _compute_score(self, response: t.Any) -> float: + if response: + response = [ + int(item.get("Attributed", "0").strip() == "1") + if item.get("Attributed") + else np.nan + for item in response + ] + denom = len(response) + numerator = sum(response) + return numerator / denom + else: + return np.nan + + def _score(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "set LLM before use" + + result = self.llm.generate_text( + self._create_context_recall_prompt(row), callbacks=callbacks + ) + response = json_loader.safe_load(result.generations[0][0].text, self.llm) + + return self._compute_score(response) + + async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: + ... + + def adapt(self, language: str, cache_dir: str | None = None) -> None: + assert self.llm is not None, "set LLM before use" + + logger.info(f"Adapting Context Recall to {language}") + self.context_recall_prompt = self.context_recall_prompt.adapt( + language, self.llm, cache_dir + ) + + def save(self, cache_dir: str | None = None) -> None: + self.context_recall_prompt.save(cache_dir) + context_recall = ContextRecall() From 6ff94a27733c44930a12642457eca50a86cde127 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Wed, 3 Jan 2024 12:21:18 +0530 Subject: [PATCH 24/34] context_relevancy ported --- src/ragas/metrics/__init__.py | 3 ++ src/ragas/metrics/_context_relevancy.py | 70 ++++++++++++++++++++++--- 2 files changed, 65 insertions(+), 8 deletions(-) diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index 79205a7bf..31a7e0ec7 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -3,6 +3,7 @@ from ragas.metrics.critique import AspectCritique from ragas.metrics._context_recall import ContextRecall, context_recall from ragas.metrics._answer_similarity import AnswerSimilarity, answer_similarity +from ragas.metrics._context_relevancy import ContextRelevancy, context_relevancy from ragas.metrics._context_precision import ( ContextPrecision, ContextUtilization, @@ -24,4 +25,6 @@ "ContextRecall", "context_recall", "AspectCritique", + "context_relevancy", + "ContextRelevancy", ] diff --git a/src/ragas/metrics/_context_relevancy.py b/src/ragas/metrics/_context_relevancy.py index b3ced9e1a..f9204c0bc 100644 --- a/src/ragas/metrics/_context_relevancy.py +++ b/src/ragas/metrics/_context_relevancy.py @@ -20,8 +20,8 @@ CONTEXT_RELEVANCE = Prompt( name="context_relevancy", - instruction="""Please extract relevant sentences from the provided context that is absolutely required answer the following question. 
If no relevant sentences are found, or if you believe the question cannot be answered from the given context, return the phrase "Insufficient Information". While extracting candidate sentences you're not allowed to make any changes to sentences from given context.""", - input=["question", "context"], + instruction='Please extract relevant sentences from the provided context that is absolutely required answer the following question. If no relevant sentences are found, or if you believe the question cannot be answered from the given context, return the phrase "Insufficient Information". While extracting candidate sentences you\'re not allowed to make any changes to sentences from given context.', + input_keys=["question", "context"], output_key="candidate sentences", output_type="json", ) @@ -58,14 +58,57 @@ class ContextRelevancy(MetricWithLLM): batch_size: int = 15 show_deprecation_warning: bool = False - def adapt(self, language: str, cache_dir: str | None = None) -> None: - logger.info(f"Adapting Context Relevancy to {language}") - self.context_relevancy_prompt = self.context_relevancy_prompt.adapt( - language, self.llm, cache_dir + def _compute_score(self, responses: t.Any, row: t.Dict) -> float: + context = "\n".join(row["contexts"]) + overlap_scores = [] + context_sents = sent_tokenize(context) + for output in responses: + indices = ( + sent_tokenize(output.strip()) + if output.lower() != "insufficient information." + else [] + ) + if len(context_sents) == 0: + score = 0 + else: + score = min(len(indices) / len(context_sents), 1) + overlap_scores.append(score) + return float(np.mean(overlap_scores)) + + def _score(self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "LLM is not initialized" + + if self.show_deprecation_warning: + logger.warning( + "The 'context_relevancy' metric is going to be deprecated soon! Please use the 'context_precision' metric instead. It is a drop-in replacement just a simple search and replace should work." # noqa + ) + + question, contexts = row["question"], row["contexts"] + result = self.llm.generate_text( + self.context_relevancy_prompt.format( + question=question, context="\n".join(contexts) + ), + callbacks=callbacks, ) - def save(self, cache_dir: str | None = None) -> None: - self.context_relevancy_prompt.save(cache_dir) + return self._compute_score(result.generations[0][0].text, row) + + async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "LLM is not initialized" + + if self.show_deprecation_warning: + logger.warning( + "The 'context_relevancy' metric is going to be deprecated soon! Please use the 'context_precision' metric instead. It is a drop-in replacement just a simple search and replace should work." 
# noqa + ) + + question, contexts = row["question"], row["contexts"] + result = await self.llm.agenerate_text( + self.context_relevancy_prompt.format( + question=question, context="\n".join(contexts) + ), + callbacks=callbacks, + ) + return self._compute_score(result.generations[0][0].text, row) def _score_batch( self: t.Self, @@ -119,5 +162,16 @@ def _score_batch( return scores + def adapt(self, language: str, cache_dir: str | None = None) -> None: + assert self.llm is not None, "set LLM before use" + + logger.info(f"Adapting Context Relevancy to {language}") + self.context_relevancy_prompt = self.context_relevancy_prompt.adapt( + language, self.llm, cache_dir + ) + + def save(self, cache_dir: str | None = None) -> None: + self.context_relevancy_prompt.save(cache_dir) + context_relevancy = ContextRelevancy() From 36916535f27476bb612026be625d35be75d54754 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Wed, 3 Jan 2024 12:24:33 +0530 Subject: [PATCH 25/34] added benchmark --- tests/benchmarks/benchmark_eval.py | 42 ++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/tests/benchmarks/benchmark_eval.py b/tests/benchmarks/benchmark_eval.py index add94ceba..9e295f52e 100644 --- a/tests/benchmarks/benchmark_eval.py +++ b/tests/benchmarks/benchmark_eval.py @@ -3,32 +3,52 @@ from datasets import DatasetDict, load_dataset from ragas import evaluate -from ragas.metrics import faithfulness +from ragas.metrics import ( + faithfulness, + context_recall, + answer_correctness, + context_relevancy, + context_precision, + context_utilization, + answer_similarity, +) +from ragas.metrics.critique import harmfulness # data ds = load_dataset("explodinggradients/fiqa", "ragas_eval") assert isinstance(ds, DatasetDict) fiqa = ds["baseline"] +# metrics +metrics = [ + faithfulness, + context_recall, + answer_correctness, + harmfulness, + context_relevancy, + context_precision, + context_utilization, + answer_similarity, +] + if __name__ == "__main__": # asyncio start = time.time() - _ = evaluate( - fiqa, - metrics=[ - faithfulness, - ], - is_async=True, - ) + print("ignored") + # _ = evaluate( + # fiqa, + # metrics=[ + # faithfulness, + # ], + # is_async=True, + # ) print(f"Time taken [Asyncio]: {time.time() - start:.2f}s") # Threads start = time.time() _ = evaluate( fiqa, - metrics=[ - faithfulness, - ], + metrics=metrics, is_async=False, ) print(f"Time taken [Threads]: {time.time() - start:.2f}s") From 1ecde7b043dd8e1a3e1b7c33c17a137bfa59f4f9 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Wed, 3 Jan 2024 23:04:35 +0530 Subject: [PATCH 26/34] fix tests --- src/ragas/llms/base.py | 7 +- src/ragas/llms/prompt.py | 2 +- src/ragas/metrics/__init__.py | 11 +- src/ragas/metrics/_answer_relevance.py | 2 + src/ragas/metrics/_context_recall.py | 3 +- src/ragas/metrics/_faithfulness.py | 1 + src/ragas/metrics/critique.py | 2 +- src/ragas/testset/testset_generator.py | 4 +- src/ragas/utils.py | 2 +- tests/benchmarks/benchmark_eval.py | 10 +- tests/unit/llms/test_llm.py | 140 ++----------------------- tests/unit/test_metric.py | 11 -- tests/unit/test_prompt.py | 96 ----------------- 13 files changed, 37 insertions(+), 254 deletions(-) delete mode 100644 tests/unit/test_prompt.py diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py index f03b677ae..36cdb2738 100644 --- a/src/ragas/llms/base.py +++ b/src/ragas/llms/base.py @@ -11,7 +11,8 @@ if t.TYPE_CHECKING: from langchain_core.callbacks import Callbacks - from langchain_core.prompt_values import PromptValue + + from 
ragas.llms.prompt import PromptValue MULTIPLE_COMPLETION_SUPPORTED = [ OpenAI, @@ -40,7 +41,7 @@ def generate_text( n: int = 1, temperature: float = 1e-8, stop: t.Optional[t.List[str]] = None, - callbacks: t.Optional[Callbacks] = None, + callbacks: Callbacks = [], ) -> LLMResult: ... @@ -51,7 +52,7 @@ async def agenerate_text( n: int = 1, temperature: float = 1e-8, stop: t.Optional[t.List[str]] = None, - callbacks: t.Optional[Callbacks] = None, + callbacks: Callbacks = [], ) -> LLMResult: ... diff --git a/src/ragas/llms/prompt.py b/src/ragas/llms/prompt.py index 71e9bd873..d4344fb40 100644 --- a/src/ragas/llms/prompt.py +++ b/src/ragas/llms/prompt.py @@ -193,7 +193,7 @@ def adapt( return self def save(self, cache_dir: t.Optional[str] = None) -> None: - cache_dir = cache_dir if cache_dir else RAGAS_CACHE_HOME + cache_dir = cache_dir if cache_dir else get_cache_dir() cache_dir = os.path.join(cache_dir, self.language) if not os.path.exists(cache_dir): os.makedirs(cache_dir) diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index 31a7e0ec7..f18bd57b9 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -1,15 +1,16 @@ from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness -from ragas.metrics._faithfulness import Faithfulness, faithfulness -from ragas.metrics.critique import AspectCritique -from ragas.metrics._context_recall import ContextRecall, context_recall +from ragas.metrics._answer_relevance import AnswerRelevancy, answer_relevancy from ragas.metrics._answer_similarity import AnswerSimilarity, answer_similarity -from ragas.metrics._context_relevancy import ContextRelevancy, context_relevancy from ragas.metrics._context_precision import ( ContextPrecision, ContextUtilization, context_precision, context_utilization, ) +from ragas.metrics._context_recall import ContextRecall, context_recall +from ragas.metrics._context_relevancy import ContextRelevancy, context_relevancy +from ragas.metrics._faithfulness import Faithfulness, faithfulness +from ragas.metrics.critique import AspectCritique __all__ = [ "AnswerCorrectness", @@ -27,4 +28,6 @@ "AspectCritique", "context_relevancy", "ContextRelevancy", + "AnswerRelevancy", + "answer_relevancy", ] diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index 5891919ed..2ad9f83c0 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ b/src/ragas/metrics/_answer_relevance.py @@ -149,6 +149,8 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: return self._calculate_score(result, row) def adapt(self, language: str, cache_dir: str | None = None) -> None: + assert self.llm is not None, "LLM is not set" + logger.info(f"Adapting AnswerRelevancy metric to {language}") self.question_generation = self.question_generation.adapt( language, self.llm, cache_dir diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index 0e4b04591..e647c5760 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -8,12 +8,13 @@ from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group +from ragas.llms.json_load import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.llms.json_load import json_loader if t.TYPE_CHECKING: from langchain_core.callbacks import Callbacks + from ragas.llms.prompt import PromptValue logger = logging.getLogger(__name__) diff 
--git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index ea150c26f..ab8078c8d 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -216,4 +216,5 @@ def save(self, cache_dir: t.Optional[str] = None) -> None: self.long_form_answer_prompt.save(cache_dir) self.nli_statements_message.save(cache_dir) + faithfulness = Faithfulness() diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index 67cbd0b75..f05eaa8dd 100644 --- a/src/ragas/metrics/critique.py +++ b/src/ragas/metrics/critique.py @@ -10,9 +10,9 @@ from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from ragas.llms import llm_factory +from ragas.llms.json_load import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.llms.json_load import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks diff --git a/src/ragas/testset/testset_generator.py b/src/ragas/testset/testset_generator.py index f53206ba9..8b05834ea 100644 --- a/src/ragas/testset/testset_generator.py +++ b/src/ragas/testset/testset_generator.py @@ -32,6 +32,7 @@ from tqdm import tqdm from ragas.llms import llm_factory +from ragas.llms.json_load import load_as_json from ragas.testset.prompts import ( ANSWER_FORMULATE, COMPRESS_QUESTION, @@ -45,10 +46,9 @@ SEED_QUESTION, ) from ragas.testset.utils import load_as_score -from ragas.utils import load_as_json if t.TYPE_CHECKING: - from ragas.llms.base import RagasLLM + pass DEFAULT_TEST_DISTRIBUTION = { diff --git a/src/ragas/utils.py b/src/ragas/utils.py index 5f449a6ea..49a1b3423 100644 --- a/src/ragas/utils.py +++ b/src/ragas/utils.py @@ -14,7 +14,7 @@ def get_cache_dir() -> str: DEFAULT_XDG_CACHE_HOME = "~/.cache" xdg_cache = os.getenv("XDG_CACHE_HOME", DEFAULT_XDG_CACHE_HOME) default_ragas_cache = os.path.join(xdg_cache, "ragas") - return os.path.expanduser(os.getenv("RAGAS_HOME", default_ragas_cache)) + return os.path.expanduser(os.getenv("RAGAS_CACHE_HOME", default_ragas_cache)) @lru_cache(maxsize=1) diff --git a/tests/benchmarks/benchmark_eval.py b/tests/benchmarks/benchmark_eval.py index 9e295f52e..106fae92e 100644 --- a/tests/benchmarks/benchmark_eval.py +++ b/tests/benchmarks/benchmark_eval.py @@ -4,13 +4,14 @@ from ragas import evaluate from ragas.metrics import ( - faithfulness, - context_recall, answer_correctness, - context_relevancy, + answer_relevancy, + answer_similarity, context_precision, + context_recall, + context_relevancy, context_utilization, - answer_similarity, + faithfulness, ) from ragas.metrics.critique import harmfulness @@ -23,6 +24,7 @@ metrics = [ faithfulness, context_recall, + answer_relevancy, answer_correctness, harmfulness, context_relevancy, diff --git a/tests/unit/llms/test_llm.py b/tests/unit/llms/test_llm.py index b3af7a6d0..09b6b0f03 100644 --- a/tests/unit/llms/test_llm.py +++ b/tests/unit/llms/test_llm.py @@ -1,146 +1,26 @@ from __future__ import annotations -import os +import typing as t -import pytest -from langchain.prompts.chat import ChatPromptTemplate from langchain.schema import Generation, LLMResult -from ragas.embeddings import AzureOpenAIEmbeddings, OpenAIEmbeddings from ragas.llms.base import BaseRagasLLM -from ragas.llms.openai import ( - AzureOpenAI, - AzureOpenAIKeyNotFound, - OpenAI, - OpenAIKeyNotFound, -) -from ragas.utils import NO_KEY + +if t.TYPE_CHECKING: + from ragas.llms.prompt import PromptValue class TestLLM(BaseRagasLLM): def llm(self): return self - def 
generate( - self, prompts: list[ChatPromptTemplate], n=1, temperature=0, callbacks=None + def generate_text( + self, prompt: PromptValue, n=1, temperature=1e-8, stop=None, callbacks=[] ): - prompt_strs = [p.format() for p in prompts] - generations = [[Generation(text=prompt_str)] * n for prompt_str in prompt_strs] + generations = [[Generation(text=prompt.prompt_str)] * n] return LLMResult(generations=generations) - async def agenerate( - self, prompt: ChatPromptTemplate, n=1, temperature=0, callbacks=None + async def agenerate_text( + self, prompt: PromptValue, n=1, temperature=1e-8, stop=None, callbacks=[] ): - return self.generate([prompt], n, temperature, callbacks) - - def validate_api_key(self): - if os.getenv("FAKELLM_API_KEY", NO_KEY) == NO_KEY: - raise ValueError("FAKELLM_API_KEY not found in environment variables.") - - -def test_validate_api_key(): - llm = TestLLM() - with pytest.raises(ValueError): - llm.validate_api_key() - os.environ["FAKELLM_API_KEY"] = "random-key-102848595" - # just check if no error is raised - assert llm.validate_api_key() is None - - -def openai_llm_factory(with_api_key): - if with_api_key: - api_key = "random-key-102848595" - return OpenAI(api_key=api_key), api_key - else: - return OpenAI() - - -def openai_embedding_factory(with_api_key): - if with_api_key: - api_key = "random-key-102848595" - return OpenAIEmbeddings(api_key=api_key), api_key - else: - return OpenAIEmbeddings() - - -def azure_llm_factory(with_api_key): - if with_api_key: - api_key = "random-key-102848595" - return ( - AzureOpenAI( - api_version="2020-09-03", - api_key=api_key, - azure_endpoint="https://api.labs.cognitive.microsofttranslator.com", - deployment="en-fr", - ), - api_key, - ) - else: - return AzureOpenAI( - azure_endpoint="https://api.labs.cognitive.microsofttranslator.com", - deployment="en-fr", - api_version="2020-09-03", - ) - - -def azure_embed_factory(with_api_key): - if with_api_key: - api_key = "random-key-102848595" - return ( - AzureOpenAIEmbeddings( - api_version="2020-09-03", - api_key=api_key, - azure_endpoint="https://api.labs.cognitive.microsofttranslator.com", - deployment="en-fr", - ), - api_key, - ) - else: - return AzureOpenAIEmbeddings( - azure_endpoint="https://api.labs.cognitive.microsofttranslator.com", - deployment="en-fr", - api_version="2020-09-03", - ) - - -@pytest.mark.parametrize( - "factory, key_not_found_exception, environ_key", - [ - (openai_llm_factory, OpenAIKeyNotFound, "OPENAI_API_KEY"), - (azure_llm_factory, AzureOpenAIKeyNotFound, "AZURE_OPENAI_API_KEY"), - (openai_embedding_factory, OpenAIKeyNotFound, "OPENAI_API_KEY"), - (azure_embed_factory, AzureOpenAIKeyNotFound, "AZURE_OPENAI_API_KEY"), - ], -) -def test_validate_api_key_for_different_llms( - factory, key_not_found_exception, environ_key -): - # load key from environment variables - if environ_key in os.environ: - os.environ.pop(environ_key) - obj = factory(with_api_key=False) - with pytest.raises(key_not_found_exception): - obj.validate_api_key() - os.environ[environ_key] = "random-key-102848595" - obj = factory(with_api_key=False) - assert obj.validate_api_key() is None - - # load key which is passed as argument - if environ_key in os.environ: - os.environ.pop(environ_key) - obj, _ = factory(with_api_key=True) - assert obj.validate_api_key() is None - - # assert order of precedence - os.environ[environ_key] = "random-key-102848595" - obj, api_key = factory(with_api_key=True) - assert obj.validate_api_key - assert obj.api_key == api_key - - # assert loading key from environment 
variables after instantiation - if environ_key in os.environ: - os.environ.pop(environ_key) - obj = factory(with_api_key=False) - os.environ[environ_key] = "random-key-102848595" - assert obj.validate_api_key() is None - assert obj.api_key == "random-key-102848595" + return self.generate_text(prompt, n, temperature, stop, callbacks) diff --git a/tests/unit/test_metric.py b/tests/unit/test_metric.py index 056524952..e69de29bb 100644 --- a/tests/unit/test_metric.py +++ b/tests/unit/test_metric.py @@ -1,11 +0,0 @@ -import pytest - -from ragas.metrics.base import make_batches - - -@pytest.mark.parametrize( - "batch_size, total_size, len_expected", [(5, 10, 2), (5, 11, 3), (5, 9, 2)] -) -def test_make_batches(batch_size, total_size, len_expected): - batches = make_batches(total_size, batch_size) - assert len(batches) == len_expected diff --git a/tests/unit/test_prompt.py b/tests/unit/test_prompt.py deleted file mode 100644 index 62bae0b66..000000000 --- a/tests/unit/test_prompt.py +++ /dev/null @@ -1,96 +0,0 @@ -from ragas.llms.prompt import Prompt - -TESTCASES = [ - { - "instruction": "Create one or more statements from each sentence in the given answer.", - "examples": [ - { - "question": "Cadmium Chloride is slightly soluble in this chemical, it is also called what?", - "answer": "alcohol", - "statements in json": """{ - "statements": [ - "Cadmium Chloride is slightly soluble in alcohol." - ] - }""", - }, - { - "question": "Were Hitler and Benito Mussolini of the same nationality?", - "answer": "Sorry, I can't provide answer to that question.", - "statements in json": """{ - "statements": [] - }""", - }, - ], - "input_keys": ["question", "answer"], - "output_key": "statements in json", - }, - { - "instruction": 'Natural language inference. Use only "Yes" (1) or "No" (0) as a binary verdict.', - "examples": [ - { - "Context": """John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects. - statement_1: John is majoring in Biology. - statement_2: John is taking a course on Artificial Intelligence. - statement_3: John is a dedicated student. - statement_4: John has a part-time job.""", - "Answer": """[ - { - "statement_1": "John is majoring in Biology.", - "reason": "John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology.", - "verdict": "0" - }, - { - "statement_2": "John is taking a course on Artificial Intelligence.", - "reason": "The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI.", - "verdict": "0" - }, - { - "statement_3": "John is a dedicated student.", - "reason": "The context states that he spends a significant amount of time studying and completing assignments. 
Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication.", - "verdict": "1" - }, - { - "statement_4": "John has a part-time job.", - "reason": "There is no information given in the context about John having a part-time job.", - "verdict": "0" - }] - """, - } - ], - "input_keys": ["Context"], - "output_key": "Answer", - "output_type": "json", - }, - { - "instruction": "This is a test prompt without examples", - "input_keys": ["Context"], - "output_key": "Answer", - "output_type": "json", - }, -] - - -def test_prompt_object(): - for testcase in TESTCASES: - prompt = Prompt(**testcase) - - assert prompt is not None, "Prompt object is not created" - assert ( - prompt.instruction == testcase["instruction"] - ), "instruction in object is not same as in the testcase" - assert ( - prompt.input_keys == testcase["input_keys"] - ), "input_keys in object is not same as in the testcase" - assert ( - prompt.output_key == testcase["output_key"] - ), "output_key in object is not same as in the testcase" - assert prompt.output_type == testcase.get( - "output_type", "json" - ), "output_type in object is not same as in the testcase" - assert prompt.examples == testcase.get( - "examples", [] - ), "examples should be empty if not provided" - if testcase.get("examples"): - assert isinstance( - prompt.get_example_str(0), str - ), "get_example_str should return a string" From f480eaf0a2581cfeb6cf0ca3edc3390de07e8161 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 4 Jan 2024 00:37:11 +0530 Subject: [PATCH 27/34] fix ci --- pyproject.toml | 3 + src/ragas/langchain/__init__.py | 3 - src/ragas/langchain/evalchain.py | 197 ----------------------- src/ragas/langsmith/__init__.py | 0 src/ragas/langsmith/evaluator.py | 0 src/ragas/llama_index/__init__.py | 3 - src/ragas/llama_index/evaluation.py | 101 ------------ src/ragas/llms/base.py | 13 ++ src/ragas/metrics/_answer_correctness.py | 46 +----- src/ragas/metrics/_answer_relevance.py | 22 +-- src/ragas/metrics/_context_precision.py | 25 +-- src/ragas/metrics/_context_recall.py | 62 +------ src/ragas/metrics/_context_relevancy.py | 54 ------- src/ragas/metrics/critique.py | 56 +------ src/ragas/testset/evolutions.py | 60 ------- src/ragas/testset/testset_generator.py | 18 +-- 16 files changed, 61 insertions(+), 602 deletions(-) delete mode 100644 src/ragas/langchain/__init__.py delete mode 100644 src/ragas/langchain/evalchain.py delete mode 100644 src/ragas/langsmith/__init__.py delete mode 100644 src/ragas/langsmith/evaluator.py delete mode 100644 src/ragas/llama_index/__init__.py delete mode 100644 src/ragas/llama_index/evaluation.py delete mode 100644 src/ragas/testset/evolutions.py diff --git a/pyproject.toml b/pyproject.toml index 3dd632d6b..30d4ff605 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,9 @@ package-dir = {"" = "src"} [tool.setuptools.dynamic] readme = {file = ["README.md"], content-type = "text/plain"} +[tool.ruff.lint] +ignore = ["E501"] + [build-system] requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"] build-backend = "setuptools.build_meta" diff --git a/src/ragas/langchain/__init__.py b/src/ragas/langchain/__init__.py deleted file mode 100644 index 039bc0cc8..000000000 --- a/src/ragas/langchain/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from ragas.langchain.evalchain import RagasEvaluatorChain - -__all__ = ["RagasEvaluatorChain"] diff --git a/src/ragas/langchain/evalchain.py b/src/ragas/langchain/evalchain.py deleted file mode 100644 index 43d4ad3c9..000000000 --- 
a/src/ragas/langchain/evalchain.py +++ /dev/null @@ -1,197 +0,0 @@ -from __future__ import annotations - -import typing as t -from collections import defaultdict - -from datasets import Dataset -from langchain.callbacks.manager import CallbackManagerForChainRun -from langchain.chains.base import Chain -from langchain.schema import RUN_KEY -from langsmith.evaluation import EvaluationResult, RunEvaluator -from langsmith.schemas import Example, Run - -from ragas.metrics.base import EvaluationMode, Metric -from ragas.validation import EVALMODE_TO_COLUMNS - -if t.TYPE_CHECKING: - from langchain.callbacks.base import Callbacks - - -class RagasEvaluatorChain(Chain, RunEvaluator): - """ - Wrapper around ragas Metrics to use them with langsmith. - """ - - metric: Metric - - def __init__(self, **kwargs: t.Any): - super().__init__(**kwargs) - self.metric.init_model() - - @property - def input_keys(self) -> list[str]: - keys = ["query", "result"] - if self.metric.evaluation_mode in [EvaluationMode.qac, EvaluationMode.qc]: - keys += ["source_documents"] - if self.metric.evaluation_mode in [EvaluationMode.gc]: - keys += ["ground_truths"] - return keys - - @property - def output_keys(self) -> list[str]: - return [f"{self.metric.name}_score"] - - def _call( - self, - inputs: dict[str, t.Any], - run_manager: t.Optional[CallbackManagerForChainRun] = None, - ) -> dict[str, t.Any]: - """ - Call the evaluation chain. - """ - self._validate(inputs) - contexts = [] - - _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager() - callbacks = _run_manager.get_child() - - if "source_documents" in inputs: - for document in inputs["source_documents"]: - if isinstance(document, dict): - contexts.append(document["page_content"]) - else: - contexts.append(document.page_content) - ground_truths = [] - if "ground_truths" in inputs: - ground_truths = inputs["ground_truths"] - - question = inputs["query"] - answer = inputs["result"] - score = self.metric.score_single( - { - "question": question, - "answer": answer, - "contexts": contexts, - "ground_truths": ground_truths, - }, - callbacks=callbacks, - ) - return {f"{self.metric.name}_score": score} - - def _validate( - self, - input: dict[str, t.Any], - question_key: str = "query", - prediction_key: str = "result", - context_key: str = "source_documents", - ) -> None: - ... - # validate each example - required_columns = EVALMODE_TO_COLUMNS[self.metric.evaluation_mode] - if "question" in required_columns and question_key not in input: - raise ValueError( - f'"{question_key}" is required in each example' - f"for the metric[{self.metric.name}] you have chosen." - ) - if "answer" in required_columns and prediction_key not in input: - raise ValueError( - f'"{prediction_key}" is required in each prediction' - f"for the metric[{self.metric.name}] you have chosen." - ) - if "contexts" in required_columns and context_key not in input: - raise ValueError( - f'"{context_key}" is required in each prediction for the ' - f"metric[{self.metric.name}] you have chosen." - ) - if "ground_truths" in required_columns and "ground_truths" not in input: - raise ValueError( - f'"ground_truths" is required in each prediction for the ' - f"metric[{self.metric.name}] you have chosen." 
- ) - - def evaluate( - self, - examples: t.Sequence[dict], - predictions: t.Sequence[dict], - question_key: str = "query", - prediction_key: str = "result", - context_key: str = "source_documents", - ground_truths_key: str = "ground_truths", - *, - callbacks: Callbacks = None, - ) -> list[dict]: - """Evaluate question answering examples and predictions.""" - dataset_dict = defaultdict(list) - - # validation - if len(examples) != len(predictions): - raise ValueError( - "number of examples and predictions must be same. Got " - f"len(examples)={len(examples)} and len(predictions)={len(predictions)}" - ) - - for i, example in enumerate(examples): - self._validate( - {**example, **predictions[i]}, question_key, prediction_key, context_key - ) - # transform into Dataset that is supported by ragas - if self.metric.evaluation_mode in [ - EvaluationMode.qac, - EvaluationMode.qc, - EvaluationMode.qa, - ]: - dataset_dict["question"].append(example[question_key]) - - if self.metric.evaluation_mode in [EvaluationMode.qac, EvaluationMode.qa]: - dataset_dict["answer"].append(predictions[i][prediction_key]) - - if self.metric.evaluation_mode in [ - EvaluationMode.qac, - EvaluationMode.qc, - EvaluationMode.gc, - ]: - dataset_dict["contexts"].append( - [d.page_content for d in predictions[i][context_key]] - ) - - if self.metric.evaluation_mode == EvaluationMode.gc: - if isinstance(example["ground_truths"], list): - dataset_dict["ground_truths"].append(example["ground_truths"]) - else: - dataset_dict["ground_truths"].append([example["ground_truths"]]) - - dataset = Dataset.from_dict(dataset_dict) - - # evaluate - dataset_with_scores = self.metric.score(dataset, callbacks=callbacks) - scores = [ - {f"{self.metric.name}_score": score} - for score in dataset_with_scores[self.metric.name] - ] - return scores - - def evaluate_run( - self, run: Run, example: t.Optional[Example] = None - ) -> EvaluationResult: - """ - Evaluate a langsmith run - """ - if run.outputs is None: - raise ValueError("The chain should return results and service_document.") - if example is None: - raise ValueError("Examples have to be provided.") - chain_eval = run.outputs - chain_eval["query"] = run.inputs["query"] - if self.metric.evaluation_mode == EvaluationMode.gc: - if example.outputs is None or "ground_truths" not in example.outputs: - raise ValueError("expected `ground_truths` in example outputs.") - chain_eval["ground_truths"] = example.outputs["ground_truths"] - eval_output = self(chain_eval, include_run_info=True) - - score_name = f"{self.metric.name}_score" - evaluation_result = EvaluationResult( - key=f"{self.metric.name}_score", score=eval_output[score_name] - ) - if RUN_KEY in eval_output: - evaluation_result.evaluator_info[RUN_KEY] = eval_output[RUN_KEY] - return evaluation_result diff --git a/src/ragas/langsmith/__init__.py b/src/ragas/langsmith/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/ragas/langsmith/evaluator.py b/src/ragas/langsmith/evaluator.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/ragas/llama_index/__init__.py b/src/ragas/llama_index/__init__.py deleted file mode 100644 index c6f647b3d..000000000 --- a/src/ragas/llama_index/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from ragas.llama_index.evaluation import evaluate - -__all__ = ["evaluate"] diff --git a/src/ragas/llama_index/evaluation.py b/src/ragas/llama_index/evaluation.py deleted file mode 100644 index dab200688..000000000 --- a/src/ragas/llama_index/evaluation.py +++ /dev/null @@ -1,101 +0,0 
@@ -from __future__ import annotations - -import typing as t - -from datasets import Dataset - -from ragas import evaluate as ragas_evaluate -from ragas.evaluation import Result -from ragas.metrics.base import Metric - -if t.TYPE_CHECKING: - from llama_index.indices.query.base import BaseQueryEngine - - -def evaluate( - query_engine: BaseQueryEngine, - metrics: list[Metric], - questions: list[str], - ground_truths: t.Optional[list[str]] = None, -) -> Result: - """ - Run evaluation of llama_index QueryEngine with different metrics - - Parameters - ---------- - query_engine : BaseQueryEngine - The QueryEngine that is to be evaluated - metrics : list[Metric] - The ragas metrics to use for evaluation. - questions : list[str] - List of questions to evaluate on - ground_truths : list[str], optional - List of ground_truths answer to the question to evaluate on. - - Returns - ------- - Result - Result object containing the scores of each metric. You can use this do analysis - later. If the top 3 metrics are provided then it also returns the `ragas_score` - for the entire pipeline. - - Raises - ------ - ValueError - if validation fails because the columns required for the metrics are missing or - if the columns are of the wrong format. - - Examples - -------- - Once you have a llama_index QueryEngine created you can use it to evaluate on a list - of questions. - - Import everything you need: - - >>> from ragas.metrics import faithfulness, answer_relevancy, context_precision - >>> from ragas.metrics.critique import harmfulness - >>> from ragas.llama_index import evaluate - - init the query engine, get the questions and choose the metrics you want to use: - - >>> query_engine = # from llamaindex - >>> questions: list[str] = [] # from somewhere - >>> metrics = [faithfulness, answer_relevancy, context_precision, harmfulness] - - Run the evaluation: - - >>> r = evaluate(query_engine, metrics, questions) - - analysis the result: - - >>> print(r) # prints the scores of each metric - >>> r.to_pandas() # returns a pandas dataframe if you want to do further analysis - """ - - try: - from llama_index.async_utils import run_async_tasks - except ImportError: - raise ImportError( - "llama_index must be installed to use this function. " - "Please, install it with `pip install llama_index`." - ) - - # TODO: rate limit, error handling, retries - responses = run_async_tasks([query_engine.aquery(q) for q in questions]) - - answers = [] - contexts = [] - for r in responses: - answers.append(r.response) - contexts.append([c.node.get_content() for c in r.source_nodes]) - dataset_dict = { - "question": questions, - "answer": answers, - "contexts": contexts, - } - if ground_truths is not None: - dataset_dict["ground_truths"] = ground_truths - ds = Dataset.from_dict(dataset_dict) - result = ragas_evaluate(ds, metrics) - - return result diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py index 36cdb2738..3d2e117e2 100644 --- a/src/ragas/llms/base.py +++ b/src/ragas/llms/base.py @@ -11,6 +11,7 @@ if t.TYPE_CHECKING: from langchain_core.callbacks import Callbacks + from langchain_core.prompts import ChatPromptTemplate from ragas.llms.prompt import PromptValue @@ -56,6 +57,18 @@ async def agenerate_text( ) -> LLMResult: ... 
+ # TODO: remove after testset generator is refactored + def generate_text_with_hmpt( + self, + prompts: t.List[ChatPromptTemplate], + n: int = 1, + temperature: float = 1e-8, + stop: t.Optional[t.List[str]] = None, + callbacks: Callbacks = [], + ) -> LLMResult: + prompt = PromptValue(prompt_str=prompts[0].format()) + return self.generate_text(prompt, n, temperature, stop, callbacks) + @dataclass class LangchainLLMWrapper(BaseRagasLLM): diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index 469613560..f2f83e3ed 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -5,8 +5,6 @@ from dataclasses import dataclass, field import numpy as np -from datasets import Dataset -from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from ragas.llms.json_load import json_loader from ragas.llms.prompt import Prompt @@ -141,7 +139,7 @@ def _score(self, row: t.Dict, callbacks: Callbacks) -> float: weights=self.weights, ) - return score + return float(score) async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: assert self.llm is not None, "LLM must be set" @@ -168,47 +166,7 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: weights=self.weights, ) - return score - - def _score_batch( - self: t.Self, - dataset: Dataset, - callbacks: t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list[float]: - question, answer, ground_truths = ( - dataset["question"], - dataset["answer"], - dataset["ground_truths"], - ) - prompts = [] - - cb = CallbackManager.configure(inheritable_callbacks=callbacks) - with trace_as_chain_group( - callback_group_name, callback_manager=cb - ) as batch_group: - for q, a, g in zip(question, answer, ground_truths): - prompts.append( - self.correctness_prompt.format( - question=q, ground_truth=g[0], answer=a - ) - ) - - self.llm.generate(prompts, callbacks=batch_group) - - if self.weights[1] == 0: - similarity_scores = np.zeros(len(f1_score)) - else: - similarity_scores = self.answer_similarity._score_batch(dataset, callbacks=batch_group) # type: ignore - - scores_stacked = np.vstack([f1_score, similarity_scores]) - scores = np.average( - scores_stacked, - axis=0, - weights=self.weights, - ) - - return scores.tolist() + return float(score) def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: assert self.llm is not None, "llm must be set to compute score" diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index 2ad9f83c0..1a49aff9f 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ b/src/ragas/metrics/_answer_relevance.py @@ -17,7 +17,6 @@ if t.TYPE_CHECKING: from langchain_core.callbacks import Callbacks - from langchain_core.llms import LLMResult from ragas.embeddings.base import BaseRagasEmbeddings from ragas.llms.prompt import PromptValue @@ -106,15 +105,10 @@ def calculate_similarity( / norm ) - def _calculate_score(self, result: LLMResult, row: t.Dict) -> float: - assert self.llm is not None, "LLM is not set" - - result = [ - json_loader.safe_load(r.text, self.llm) for r in result.generations[0] - ] + def _calculate_score(self, response: t.Sequence[t.Any], row: t.Dict) -> float: question = row["question"] - gen_questions = [item.get("question", "") for item in result] - committal = np.any([item.get("noncommittal", False) for item in result]) + gen_questions = [item.get("question", "") for item in response] + committal = 
np.any([item.get("noncommittal", False) for item in response]) cosine_sim = self.calculate_similarity(question, gen_questions) score = cosine_sim.mean() * int(not committal) @@ -133,8 +127,11 @@ def _score(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: n=self.strictness, callbacks=callbacks, ) + response = [ + json_loader.safe_load(r.text, self.llm) for r in result.generations[0] + ] - return self._calculate_score(result, row) + return self._calculate_score(response, row) async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: assert self.llm is not None, "LLM is not set" @@ -145,8 +142,11 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: n=self.strictness, callbacks=callbacks, ) + response = [ + json_loader.safe_load(r.text, self.llm) for r in result.generations[0] + ] - return self._calculate_score(result, row) + return self._calculate_score(response, row) def adapt(self, language: str, cache_dir: str | None = None) -> None: assert self.llm is not None, "LLM is not set" diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py index 8e896af4d..934d33165 100644 --- a/src/ragas/metrics/_context_precision.py +++ b/src/ragas/metrics/_context_precision.py @@ -66,8 +66,8 @@ class ContextPrecision(MetricWithLLM): Attributes ---------- name : str - batch_size : int - Batch size for openai completion. + evaluation_mode: EvaluationMode + context_precision_prompt: Prompt """ name: str = "context_precision" # type: ignore @@ -94,20 +94,19 @@ def _context_precision_prompt(self, row: t.Dict) -> t.List[PromptValue]: for c in contexts ] - def _calculate_average_precision(self, responses: t.List[str]) -> float: + def _calculate_average_precision(self, json_responses: t.List[t.Dict]) -> float: score = np.nan - response = [json_loader.safe_load(item, self.llm) for item in responses] - response = [ + verdict_list = [ int("1" == resp.get("verdict", "0").strip()) if resp.get("verdict") else np.nan - for resp in response + for resp in json_responses ] - denominator = sum(response) + 1e-10 + denominator = sum(verdict_list) + 1e-10 numerator = sum( [ - (sum(response[: i + 1]) / (i + 1)) * response[i] - for i in range(len(response)) + (sum(verdict_list[: i + 1]) / (i + 1)) * verdict_list[i] + for i in range(len(verdict_list)) ] ) score = numerator / denominator @@ -126,7 +125,8 @@ def _score(self, row: t.Dict, callbacks: Callbacks = []) -> float: ) responses.append(result.generations[0][0].text) - score = self._calculate_average_precision(responses) + json_responses = [json_loader.safe_load(item, self.llm) for item in responses] + score = self._calculate_average_precision(json_responses) return score async def _ascore( @@ -146,10 +146,13 @@ async def _ascore( ) responses.append(result.generations[0][0].text) - score = self._calculate_average_precision(responses) + json_responses = [json_loader.safe_load(item, self.llm) for item in responses] + score = self._calculate_average_precision(json_responses) return score def adapt(self, language: str, cache_dir: str | None = None) -> None: + assert self.llm is not None, "LLM is not set" + logging.info(f"Adapting Context Precision to {language}") self.context_precision_prompt = self.context_precision_prompt.adapt( language, self.llm, cache_dir diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index e647c5760..21429da95 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -5,8 +5,6 @@ from dataclasses import 
dataclass, field import numpy as np -from datasets import Dataset -from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from ragas.llms.json_load import json_loader from ragas.llms.prompt import Prompt @@ -88,57 +86,6 @@ class ContextRecall(MetricWithLLM): context_recall_prompt: Prompt = field(default_factory=lambda: CONTEXT_RECALL_RA) batch_size: int = 15 - def _score_batch( - self: t.Self, - dataset: Dataset, - callbacks: t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list: - prompts = [] - question, ground_truths, contexts = ( - dataset["question"], - dataset["ground_truths"], - dataset["contexts"], - ) - - cb = CallbackManager.configure(inheritable_callbacks=callbacks) - with trace_as_chain_group( - callback_group_name, callback_manager=cb - ) as batch_group: - for qstn, gt, ctx in zip(question, ground_truths, contexts): - gt = "\n".join(gt) if isinstance(gt, list) else gt - ctx = "\n".join(ctx) if isinstance(ctx, list) else ctx - prompts.append( - self.context_recall_prompt.format( - question=qstn, context=ctx, answer=gt - ) - ) - - responses: list[list[str]] = [] - results = self.llm.generate( - prompts, - n=1, - callbacks=batch_group, - ) - responses = [[i.text for i in r] for r in results.generations] - scores = [] - for response in responses: - response = json_loader.safe_load(response[0], self.llm) - if response: - response = [ - int(item.get("Attributed", "0").strip() == "1") - if item.get("Attributed") - else np.nan - for item in response - ] - denom = len(response) - numerator = sum(response) - scores.append(numerator / denom) - else: - scores.append(np.nan) - - return scores - def _create_context_recall_prompt(self, row: t.Dict) -> PromptValue: qstn, ctx, gt = row["question"], row["contexts"], row["ground_truths"] gt = "\n".join(gt) if isinstance(gt, list) else gt @@ -171,7 +118,14 @@ def _score(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: return self._compute_score(response) async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: - ... + assert self.llm is not None, "set LLM before use" + + result = await self.llm.agenerate_text( + self._create_context_recall_prompt(row), callbacks=callbacks + ) + response = json_loader.safe_load(result.generations[0][0].text, self.llm) + + return self._compute_score(response) def adapt(self, language: str, cache_dir: str | None = None) -> None: assert self.llm is not None, "set LLM before use" diff --git a/src/ragas/metrics/_context_relevancy.py b/src/ragas/metrics/_context_relevancy.py index f9204c0bc..0e7016a82 100644 --- a/src/ragas/metrics/_context_relevancy.py +++ b/src/ragas/metrics/_context_relevancy.py @@ -7,8 +7,6 @@ import numpy as np import pysbd -from datasets import Dataset -from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM @@ -110,58 +108,6 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: ) return self._compute_score(result.generations[0][0].text, row) - def _score_batch( - self: t.Self, - dataset: Dataset, - callbacks: t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list[float]: - if self.show_deprecation_warning: - logger.warning( - "The 'context_relevancy' metric is going to be deprecated soon! Please use the 'context_precision' metric instead. It is a drop-in replacement just a simple search and replace should work." 
# noqa - ) - prompts = [] - questions, contexts = dataset["question"], dataset["contexts"] - - cb = CallbackManager.configure(inheritable_callbacks=callbacks) - with trace_as_chain_group( - callback_group_name, callback_manager=cb - ) as batch_group: - for q, c in zip(questions, contexts): - prompts.append( - self.context_relevancy_prompt.format( - question=q, context="\n".join(c) - ) - ) - - responses: list[list[str]] = [] - results = self.llm.generate( - prompts, - n=1, - callbacks=batch_group, - ) - responses = [[i.text for i in r] for r in results.generations] - - scores = [] - for context, n_response in zip(contexts, responses): - context = "\n".join(context) - overlap_scores = [] - context_sents = sent_tokenize(context) - for output in n_response: - indices = ( - sent_tokenize(output.strip()) - if output.lower() != "insufficient information." - else [] - ) - if len(context_sents) == 0: - score = 0 - else: - score = min(len(indices) / len(context_sents), 1) - overlap_scores.append(score) - scores.append(np.mean(overlap_scores)) - - return scores - def adapt(self, language: str, cache_dir: str | None = None) -> None: assert self.llm is not None, "set LLM before use" diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index f05eaa8dd..78532d434 100644 --- a/src/ragas/metrics/critique.py +++ b/src/ragas/metrics/critique.py @@ -6,8 +6,6 @@ from dataclasses import dataclass, field import numpy as np -from datasets import Dataset -from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from ragas.llms import llm_factory from ragas.llms.json_load import json_loader @@ -70,7 +68,7 @@ class AspectCritique(MetricWithLLM): definition: str = field(default="", repr=True) strictness: int = field(default=1, repr=False) batch_size: int = field(default=15, repr=False) - llm: BaseRagasLLM = field( + llm: BaseRagasLLM = field( # type: ignore default_factory=llm_factory, repr=False, ) @@ -140,58 +138,6 @@ async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: return self._compute_score(safe_loaded_responses) - def _score_batch( - self: t.Self, - dataset: Dataset, - callbacks: t.Optional[Callbacks] = None, - callback_group_name: str = "batch", - ) -> list[int]: - questions, contexts, answers = [ - dataset[key] if key in dataset.features else None - for key in ("question", "context", "answer") - ] - assert isinstance(questions, list) - assert isinstance(answers, list) - if contexts is None: - contexts = [None] * len(questions) - - prompts = [] - - cb = CallbackManager.configure(inheritable_callbacks=callbacks) - with trace_as_chain_group( - callback_group_name, callback_manager=cb - ) as batch_group: - for question, context, answer in zip(questions, contexts, answers): - human_prompt = self.prompt_format(question, answer, context) - prompts.append(human_prompt) - - results = self.llm.generate( - prompts, - n=self.strictness, - callbacks=batch_group, - ) - responses: list[list[str]] = [ - [i.text for i in r] for r in results.generations - ] - - scores = [] - answer_dict = {"1": 1, "0": 0} - for response in responses: - response = [json_loader.safe_load(item, self.llm) for item in response] - if self.strictness > 1: - score = Counter( - [ - answer_dict.get(item.get("verdict", np.nan), np.nan) - for item in response - ] - ).most_common(1)[0][0] - else: - score = answer_dict.get(response[0].get("verdict", np.nan), np.nan) - - scores.append(score) - - return scores - def adapt(self, language: str, cache_dir: str | None = None) -> None: 
logger.info(f"Adapting Critic to {language}") self.critic_prompt.adapt(language, self.llm, cache_dir) diff --git a/src/ragas/testset/evolutions.py b/src/ragas/testset/evolutions.py deleted file mode 100644 index 17cbfb449..000000000 --- a/src/ragas/testset/evolutions.py +++ /dev/null @@ -1,60 +0,0 @@ -from langchain.prompts import ChatPromptTemplate - -from ragas.llms import BaseRagasLLM -from ragas.testset.docstore import Document, DocumentStore -from ragas.testset.prompts import ( - FILTER_QUESTION, - MULTICONTEXT_QUESTION, - SCORE_CONTEXT, - SEED_QUESTION, -) -from ragas.testset.testset_generator import load_as_score -from ragas.utils import load_as_json - - -def filter_context(llm: BaseRagasLLM, context: str, threshold: float = 7.5) -> bool: - """ - context: str - The input context - - Checks if the context is has information worthy of framing a question - """ - human_prompt = SCORE_CONTEXT.format(context=context) - prompt = ChatPromptTemplate.from_messages([human_prompt]) - results = llm.generate(prompts=[prompt]) - output = results.generations[0][0].text.strip() - score = load_as_score(output) - return score >= threshold - - -def filter_question(llm: BaseRagasLLM, question: str) -> bool: - human_prompt = FILTER_QUESTION.format(question=question) - prompt = ChatPromptTemplate.from_messages([human_prompt]) - - results = llm.generate(prompts=[prompt]) - results = results.generations[0][0].text.strip() - json_results = load_as_json(results) - return json_results.get("verdict") != "No" - - -def simple_evolution(llm: BaseRagasLLM, seed_doc: Document): - human_prompt = SEED_QUESTION.format(context=seed_doc.page_content) - prompt = ChatPromptTemplate.from_messages([human_prompt]) - results = llm.generate(prompts=[prompt]) - question = results.generations[0][0].text.strip() - return question - - -def multi_context_evolution( - llm: BaseRagasLLM, seed_doc: Document, doc_store: DocumentStore -): - question = simple_evolution(llm, seed_doc) - print(question) - similar_context = doc_store.get_similar(seed_doc)[0] - human_prompt = MULTICONTEXT_QUESTION.format( - question=question, context1=seed_doc.page_content, context2=similar_context - ) - prompt = ChatPromptTemplate.from_messages([human_prompt]) - results = llm.generate(prompts=[prompt]) - question = results.generations[0][0].text.strip() - return question diff --git a/src/ragas/testset/testset_generator.py b/src/ragas/testset/testset_generator.py index 8b05834ea..259b8316d 100644 --- a/src/ragas/testset/testset_generator.py +++ b/src/ragas/testset/testset_generator.py @@ -48,7 +48,7 @@ from ragas.testset.utils import load_as_score if t.TYPE_CHECKING: - pass + from ragas.llms.base import BaseRagasLLM DEFAULT_TEST_DISTRIBUTION = { @@ -127,8 +127,8 @@ class TestsetGenerator: def __init__( self, - generator_llm: RagasLLM, - critic_llm: RagasLLM, + generator_llm: BaseRagasLLM, + critic_llm: BaseRagasLLM, embeddings_model: Embeddings, testset_distribution: t.Optional[t.Dict[str, float]] = None, chat_qa: float = 0.0, @@ -198,7 +198,7 @@ def _filter_context(self, context: str) -> bool: """ human_prompt = SCORE_CONTEXT.format(context=context) prompt = ChatPromptTemplate.from_messages([human_prompt]) - results = self.critic_llm.generate(prompts=[prompt]) + results = self.critic_llm.generate_text_with_hmpt(prompts=[prompt]) output = results.generations[0][0].text.strip() score = load_as_score(output) return score >= self.threshold @@ -206,14 +206,14 @@ def _filter_context(self, context: str) -> bool: def _seed_question(self, context: str) -> str: 
human_prompt = SEED_QUESTION.format(context=context) prompt = ChatPromptTemplate.from_messages([human_prompt]) - results = self.generator_llm.generate(prompts=[prompt]) + results = self.generator_llm.generate_text_with_hmpt(prompts=[prompt]) return results.generations[0][0].text.strip() def _filter_question(self, question: str) -> bool: human_prompt = FILTER_QUESTION.format(question=question) prompt = ChatPromptTemplate.from_messages([human_prompt]) - results = self.critic_llm.generate(prompts=[prompt]) + results = self.critic_llm.generate_text_with_hmpt(prompts=[prompt]) results = results.generations[0][0].text.strip() json_results = load_as_json(results) return json_results.get("verdict") != "No" @@ -231,7 +231,7 @@ def _multicontext_question( question=question, context1=context1, context2=context2 ) prompt = ChatPromptTemplate.from_messages([human_prompt]) - results = self.generator_llm.generate(prompts=[prompt]) + results = self.generator_llm.generate_text_with_hmpt(prompts=[prompt]) return results.generations[0][0].text.strip() def _compress_question(self, question: str) -> str: @@ -243,13 +243,13 @@ def _conversational_question(self, question: str) -> str: def _question_transformation(self, prompt, question: str) -> str: human_prompt = prompt.format(question=question) prompt = ChatPromptTemplate.from_messages([human_prompt]) - results = self.generator_llm.generate(prompts=[prompt]) + results = self.generator_llm.generate_text_with_hmpt(prompts=[prompt]) return results.generations[0][0].text.strip() def _qc_template(self, prompt, question, context) -> str: human_prompt = prompt.format(question=question, context=context) prompt = ChatPromptTemplate.from_messages([human_prompt]) - results = self.generator_llm.generate(prompts=[prompt]) + results = self.generator_llm.generate_text_with_hmpt(prompts=[prompt]) return results.generations[0][0].text.strip() def _generate_answer(self, question: str, context: t.List[str]) -> t.List[str]: From 9a053fcd8e9b94780ae5d174b06d37c765a6386e Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 4 Jan 2024 00:46:07 +0530 Subject: [PATCH 28/34] merged with main --- README.md | 5 +- docs/concepts/metrics/critique.md | 2 +- docs/getstarted/evaluation.md | 2 +- .../howtos/applications/compare_embeddings.md | 4 +- docs/howtos/applications/compare_llms.md | 4 +- docs/howtos/customisations/aws-bedrock.ipynb | 2 +- docs/howtos/customisations/azure-openai.ipynb | 2 +- docs/howtos/customisations/gcp-vertexai.ipynb | 2 +- docs/howtos/integrations/langfuse.ipynb | 2 +- docs/howtos/integrations/langsmith.ipynb | 2 +- docs/howtos/integrations/llamaindex.ipynb | 2 +- src/ragas/evaluation.py | 8 +- src/ragas/llama_index/evaluation.py | 100 ++++++++++++++++++ src/ragas/metrics/_answer_correctness.py | 6 +- 14 files changed, 123 insertions(+), 20 deletions(-) create mode 100644 src/ragas/llama_index/evaluation.py diff --git a/README.md b/README.md index 3c6af2a77..081b47753 100644 --- a/README.md +++ b/README.md @@ -78,8 +78,9 @@ os.environ["OPENAI_API_KEY"] = "your-openai-key" dataset: Dataset results = evaluate(dataset) -# {'ragas_score': 0.860, 'context_precision': 0.817, -# 'faithfulness': 0.892, 'answer_relevancy': 0.874} +# {'context_precision': 0.817, +# 'faithfulness': 0.892, +# 'answer_relevancy': 0.874} ``` Refer to our [documentation](https://docs.ragas.io/) to learn more. 
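The quickstart output above now maps each metric name directly to its score. A minimal sketch of that flow, for orientation only (the sample row and the printed values are illustrative, and an OpenAI key is assumed since the default metric LLM here is OpenAI):

import os

from datasets import Dataset

from ragas import evaluate
from ragas.metrics import answer_relevancy, context_precision, faithfulness

os.environ["OPENAI_API_KEY"] = "your-openai-key"  # assumed: metrics call OpenAI by default

# one illustrative row; real datasets would have many rows
dataset = Dataset.from_dict(
    {
        "question": ["When was the first Super Bowl played?"],
        "answer": ["The first Super Bowl was played on January 15, 1967."],
        "contexts": [
            ["The First AFL-NFL World Championship Game was played on January 15, 1967."]
        ],
        "ground_truths": [["The first Super Bowl was played on January 15, 1967."]],
    }
)

results = evaluate(dataset, metrics=[context_precision, faithfulness, answer_relevancy])
print(results)              # e.g. {'context_precision': ..., 'faithfulness': ..., 'answer_relevancy': ...}
print(results.to_pandas())  # per-row scores for further analysis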
diff --git a/docs/concepts/metrics/critique.md b/docs/concepts/metrics/critique.md index 439d93e8a..00b807300 100644 --- a/docs/concepts/metrics/critique.md +++ b/docs/concepts/metrics/critique.md @@ -5,7 +5,7 @@ This is designed to assess submissions based on predefined aspects such as `harm Critiques within the LLM evaluators evaluate submissions based on the provided aspect. Ragas Critiques offers a range of predefined aspects like correctness, harmfulness, etc. (Please refer to `SUPPORTED_ASPECTS` for a complete list). If you prefer, you can also create custom aspects to evaluate submissions according to your unique requirements. -The `strictness` parameter plays a crucial role in maintaining a certain level of self-consistency in predictions, with an ideal range typically falling between 2 to 4. It's important to note that the scores obtained from aspect critiques are binary and do not contribute to the final Ragas score due to their non-continuous nature. +The `strictness` parameter plays a crucial role in maintaining a certain level of self-consistency in predictions, with an ideal range typically falling between 2 to 4. ```{hint} diff --git a/docs/getstarted/evaluation.md b/docs/getstarted/evaluation.md index f21c4b64b..0091d1832 100644 --- a/docs/getstarted/evaluation.md +++ b/docs/getstarted/evaluation.md @@ -96,7 +96,7 @@ result = evaluate( result ``` -and there you have it, all the scores you need. `ragas_score` gives you a single metric that you can use while 4 metrics individually would measure the different parts of your pipeline. +and there you have it, all the scores you need. Now if we want to dig into the results and figure out examples where your pipeline performed worse or really good you can easily convert it into a pandas array and use your standard analytics tools too! 
diff --git a/docs/howtos/applications/compare_embeddings.md b/docs/howtos/applications/compare_embeddings.md index 145857539..01b7a8d13 100644 --- a/docs/howtos/applications/compare_embeddings.md +++ b/docs/howtos/applications/compare_embeddings.md @@ -109,7 +109,7 @@ result = evaluate(query_engine1, metrics, test_questions, test_answers) ```{code-block} :caption: output -{'ragas_score': 0.3570, 'context_precision': 0.2378, 'context_recall': 0.7159} +{'context_precision': 0.2378, 'context_recall': 0.7159} ``` ## Evaluate Bge embeddings @@ -124,7 +124,7 @@ result = evaluate(query_engine2, metrics, test_questions, test_answers) ```{code-block} :caption: output -{'ragas_score': 0.3883, 'context_precision': 0.2655, 'context_recall': 0.7227} +{'context_precision': 0.2655, 'context_recall': 0.7227} ``` diff --git a/docs/howtos/applications/compare_llms.md b/docs/howtos/applications/compare_llms.md index 4ce5caced..95897ee2f 100644 --- a/docs/howtos/applications/compare_llms.md +++ b/docs/howtos/applications/compare_llms.md @@ -145,7 +145,7 @@ result_zephyr ```{code-block} :caption: output -{'ragas_score': 0.7809, 'faithfulness': 0.8365, 'answer_relevancy': 0.8831, 'answer_correctness': 0.6605} +{'faithfulness': 0.8365, 'answer_relevancy': 0.8831, 'answer_correctness': 0.6605} ``` ## Evaluate Falcon-7B-Instruct LLM @@ -168,7 +168,7 @@ result ```{code-block} :caption: output -{'ragas_score': 0.6956, 'faithfulness': 0.6909, 'answer_relevancy': 0.8651, 'answer_correctness': 0.5850} +{'faithfulness': 0.6909, 'answer_relevancy': 0.8651, 'answer_correctness': 0.5850} ``` ## Compare Scores diff --git a/docs/howtos/customisations/aws-bedrock.ipynb b/docs/howtos/customisations/aws-bedrock.ipynb index 9aef4035b..82ccfa921 100644 --- a/docs/howtos/customisations/aws-bedrock.ipynb +++ b/docs/howtos/customisations/aws-bedrock.ipynb @@ -263,7 +263,7 @@ "id": "a2dc0ec2", "metadata": {}, "source": [ - "and there you have the it, all the scores you need. `ragas_score` gives you a single metric that you can use while the other onces measure the different parts of your pipeline.\n", + "and there you have the it, all the scores you need.\n", "\n", "now if we want to dig into the results and figure out examples where your pipeline performed worse or really good you can easily convert it into a pandas array and use your standard analytics tools too!" ] diff --git a/docs/howtos/customisations/azure-openai.ipynb b/docs/howtos/customisations/azure-openai.ipynb index 93851916c..23fe19a24 100644 --- a/docs/howtos/customisations/azure-openai.ipynb +++ b/docs/howtos/customisations/azure-openai.ipynb @@ -258,7 +258,7 @@ "id": "a2dc0ec2", "metadata": {}, "source": [ - "and there you have the it, all the scores you need. `ragas_score` gives you a single metric that you can use while the other onces measure the different parts of your pipeline.\n", + "and there you have the it, all the scores you need.\n", "\n", "now if we want to dig into the results and figure out examples where your pipeline performed worse or really good you can easily convert it into a pandas array and use your standard analytics tools too!" ] diff --git a/docs/howtos/customisations/gcp-vertexai.ipynb b/docs/howtos/customisations/gcp-vertexai.ipynb index 810968b0b..9623b84d3 100644 --- a/docs/howtos/customisations/gcp-vertexai.ipynb +++ b/docs/howtos/customisations/gcp-vertexai.ipynb @@ -295,7 +295,7 @@ "id": "960f88fc-c90b-4ac6-8e97-252edd2f1661", "metadata": {}, "source": [ - "and there you have the it, all the scores you need. 
`ragas_score` gives you a single metric that you can use while the other onces measure the different parts of your pipeline.\n", + "and there you have the it, all the scores you need.\n", "\n", "now if we want to dig into the results and figure out examples where your pipeline performed worse or really good you can easily convert it into a pandas array and use your standard analytics tools too!" ] diff --git a/docs/howtos/integrations/langfuse.ipynb b/docs/howtos/integrations/langfuse.ipynb index 2c217a51c..9fa24c747 100644 --- a/docs/howtos/integrations/langfuse.ipynb +++ b/docs/howtos/integrations/langfuse.ipynb @@ -559,7 +559,7 @@ { "data": { "text/plain": [ - "{'ragas_score': 0.9309, 'faithfulness': 0.8889, 'answer_relevancy': 0.9771}" + "{'faithfulness': 0.8889, 'answer_relevancy': 0.9771}" ] }, "execution_count": 15, diff --git a/docs/howtos/integrations/langsmith.ipynb b/docs/howtos/integrations/langsmith.ipynb index 30821c918..198996e2e 100644 --- a/docs/howtos/integrations/langsmith.ipynb +++ b/docs/howtos/integrations/langsmith.ipynb @@ -102,7 +102,7 @@ { "data": { "text/plain": [ - "{'ragas_score': 0.7744, 'context_precision': 0.5976, 'faithfulness': 0.8889, 'answer_relevancy': 0.9300}" + "{'context_precision': 0.5976, 'faithfulness': 0.8889, 'answer_relevancy': 0.9300}" ] }, "execution_count": 1, diff --git a/docs/howtos/integrations/llamaindex.ipynb b/docs/howtos/integrations/llamaindex.ipynb index 21773c71a..215d07aaf 100644 --- a/docs/howtos/integrations/llamaindex.ipynb +++ b/docs/howtos/integrations/llamaindex.ipynb @@ -282,7 +282,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'ragas_score': 0.5142, 'faithfulness': 0.7000, 'answer_relevancy': 0.9550, 'context_precision': 0.2335, 'context_recall': 0.9800, 'harmfulness': 0.0000}\n" + "{faithfulness': 0.7000, 'answer_relevancy': 0.9550, 'context_precision': 0.2335, 'context_recall': 0.9800, 'harmfulness': 0.0000}\n" ] } ], diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index 6fa42c321..3643710ff 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -57,8 +57,7 @@ def evaluate( ------- Result Result object containing the scores of each metric. You can use this do analysis - later. If the top 3 metrics are provided then it also returns the `ragas_score` - for the entire pipeline. + later. Raises ------ @@ -79,8 +78,9 @@ def evaluate( }) >>> result = evaluate(dataset) - >>> print(result["ragas_score"]) - {'ragas_score': 0.860, 'context_precision': 0.817, 'faithfulness': 0.892, + >>> print(result) + {'context_precision': 0.817, + 'faithfulness': 0.892, 'answer_relevancy': 0.874} ``` """ diff --git a/src/ragas/llama_index/evaluation.py b/src/ragas/llama_index/evaluation.py new file mode 100644 index 000000000..b30738d29 --- /dev/null +++ b/src/ragas/llama_index/evaluation.py @@ -0,0 +1,100 @@ +from __future__ import annotations + +import typing as t + +from datasets import Dataset + +from ragas import evaluate as ragas_evaluate +from ragas.evaluation import Result +from ragas.metrics.base import Metric + +if t.TYPE_CHECKING: + from llama_index.indices.query.base import BaseQueryEngine + + +def evaluate( + query_engine: BaseQueryEngine, + metrics: list[Metric], + questions: list[str], + ground_truths: t.Optional[list[str]] = None, +) -> Result: + """ + Run evaluation of llama_index QueryEngine with different metrics + + Parameters + ---------- + query_engine : BaseQueryEngine + The QueryEngine that is to be evaluated + metrics : list[Metric] + The ragas metrics to use for evaluation. 
+ questions : list[str] + List of questions to evaluate on + ground_truths : list[str], optional + List of ground_truths answer to the question to evaluate on. + + Returns + ------- + Result + Result object containing the scores of each metric. You can use this do analysis + later. + + Raises + ------ + ValueError + if validation fails because the columns required for the metrics are missing or + if the columns are of the wrong format. + + Examples + -------- + Once you have a llama_index QueryEngine created you can use it to evaluate on a list + of questions. + + Import everything you need: + + >>> from ragas.metrics import faithfulness, answer_relevancy, context_precision + >>> from ragas.metrics.critique import harmfulness + >>> from ragas.llama_index import evaluate + + init the query engine, get the questions and choose the metrics you want to use: + + >>> query_engine = # from llamaindex + >>> questions: list[str] = [] # from somewhere + >>> metrics = [faithfulness, answer_relevancy, context_precision, harmfulness] + + Run the evaluation: + + >>> r = evaluate(query_engine, metrics, questions) + + analysis the result: + + >>> print(r) # prints the scores of each metric + >>> r.to_pandas() # returns a pandas dataframe if you want to do further analysis + """ + + try: + from llama_index.async_utils import run_async_tasks + except ImportError: + raise ImportError( + "llama_index must be installed to use this function. " + "Please, install it with `pip install llama_index`." + ) + + # TODO: rate limit, error handling, retries + responses = run_async_tasks([query_engine.aquery(q) for q in questions]) + + answers = [] + contexts = [] + for r in responses: + answers.append(r.response) + contexts.append([c.node.get_content() for c in r.source_nodes]) + dataset_dict = { + "question": questions, + "answer": answers, + "contexts": contexts, + } + if ground_truths is not None: + dataset_dict["ground_truths"] = ground_truths + ds = Dataset.from_dict(dataset_dict) + result = ragas_evaluate(ds, metrics) + + return result diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index f2f83e3ed..06f865d2e 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -115,7 +115,10 @@ def _compute_statement_presence(self, result: LLMResult) -> float: tp, fp, fn = [ len(item) if isinstance(item, list) else np.nan for item in prediction ] - score = tp / (tp + 0.5 * (fp + fn)) + if any([np.isnan(i) for i in [tp, fp, fn]]): + score = np.nan + else: + score = tp / (tp + 0.5 * (fp + fn)) if tp > 0 else 0 else: score = np.nan @@ -179,5 +182,4 @@ def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: def save(self, cache_dir: t.Optional[str] = None) -> None: self.correctness_prompt.save(cache_dir) - answer_correctness = AnswerCorrectness() From 0c2f5ca4f52dbe2ca62706c8a1dc477c7157fc8a Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 4 Jan 2024 00:47:04 +0530 Subject: [PATCH 29/34] fmt --- src/ragas/metrics/_answer_correctness.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index 06f865d2e..40b19e093 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -182,4 +182,5 @@ def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: def save(self, cache_dir: t.Optional[str] = None) -> None: self.correctness_prompt.save(cache_dir) + answer_correctness = 
AnswerCorrectness() From cb521fd791798ea6c61343227f6480225ee802d1 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 4 Jan 2024 00:54:14 +0530 Subject: [PATCH 30/34] fix ci --- src/ragas/llama_index/evaluation.py | 100 ---------------------------- src/ragas/llms/llamaindex.py | 54 --------------- src/ragas/metrics/critique.py | 4 +- 3 files changed, 2 insertions(+), 156 deletions(-) delete mode 100644 src/ragas/llama_index/evaluation.py delete mode 100644 src/ragas/llms/llamaindex.py diff --git a/src/ragas/llama_index/evaluation.py b/src/ragas/llama_index/evaluation.py deleted file mode 100644 index b30738d29..000000000 --- a/src/ragas/llama_index/evaluation.py +++ /dev/null @@ -1,100 +0,0 @@ -from __future__ import annotations - -import typing as t - -from datasets import Dataset - -from ragas import evaluate as ragas_evaluate -from ragas.evaluation import Result -from ragas.metrics.base import Metric - -if t.TYPE_CHECKING: - from llama_index.indices.query.base import BaseQueryEngine - - -def evaluate( - query_engine: BaseQueryEngine, - metrics: list[Metric], - questions: list[str], - ground_truths: t.Optional[list[str]] = None, -) -> Result: - """ - Run evaluation of llama_index QueryEngine with different metrics - - Parameters - ---------- - query_engine : BaseQueryEngine - The QueryEngine that is to be evaluated - metrics : list[Metric] - The ragas metrics to use for evaluation. - questions : list[str] - List of questions to evaluate on - ground_truths : list[str], optional - List of ground_truths answer to the question to evaluate on. - - Returns - ------- - Result - Result object containing the scores of each metric. You can use this do analysis - later. - - Raises - ------ - ValueError - if validation fails because the columns required for the metrics are missing or - if the columns are of the wrong format. - - Examples - -------- - Once you have a llama_index QueryEngine created you can use it to evaluate on a list - of questions. - - Import everything you need: - - >>> from ragas.metrics import faithfulness, answer_relevancy, context_precision - >>> from ragas.metrics.critique import harmfulness - >>> from ragas.llama_index import evaluate - - init the query engine, get the questions and choose the metrics you want to use: - - >>> query_engine = # from llamaindex - >>> questions: list[str] = [] # from somewhere - >>> metrics = [faithfulness, answer_relevancy, context_precision, harmfulness] - - Run the evaluation: - - >>> r = evaluate(query_engine, metrics, questions) - - analysis the result: - - >>> print(r) # prints the scores of each metric - >>> r.to_pandas() # returns a pandas dataframe if you want to do further analysis - """ - - try: - from llama_index.async_utils import run_async_tasks - except ImportError: - raise ImportError( - "llama_index must be installed to use this function. " - "Please, install it with `pip install llama_index`." 
- ) - - # TODO: rate limit, error handling, retries - responses = run_async_tasks([query_engine.aquery(q) for q in questions]) - - answers = [] - contexts = [] - for r in responses: - answers.append(r.response) - contexts.append([c.node.get_content() for c in r.source_nodes]) - dataset_dict = { - "question": questions, - "answer": answers, - "contexts": contexts, - } - if ground_truths is not None: - dataset_dict["ground_truths"] = ground_truths - ds = Dataset.from_dict(dataset_dict) - result = ragas_evaluate(ds, metrics) - - return result diff --git a/src/ragas/llms/llamaindex.py b/src/ragas/llms/llamaindex.py deleted file mode 100644 index 24e77fd8c..000000000 --- a/src/ragas/llms/llamaindex.py +++ /dev/null @@ -1,54 +0,0 @@ -from __future__ import annotations - -import typing as t - -from langchain.schema.output import Generation, LLMResult - -from ragas.async_utils import run_async_tasks -from ragas.llms.base import BaseRagasLLM - -if t.TYPE_CHECKING: - from langchain.callbacks.base import Callbacks - from langchain.prompts import ChatPromptTemplate - - try: - from llama_index.llms.base import LLM as LiLLM - except ImportError: - raise ImportError( - "llama_index must be installed to use this function. " - "Please, install it with `pip install llama_index`." - ) - - -class LlamaIndexLLM(BaseRagasLLM): - def __init__(self, llm: LiLLM) -> None: - self.llama_index_llm = llm - - @property - def llm(self) -> LiLLM: - return self.llama_index_llm - - def generate( - self, - prompts: list[ChatPromptTemplate], - n: int = 1, - temperature: float = 0, - callbacks: t.Optional[Callbacks] = None, - ) -> LLMResult: - # set temperature to 0.2 for multiple completions - temperature = 0.2 if n > 1 else 0 - self.llm.temperature = temperature - - # get task coroutines - tasks = [] - for p in prompts: - tasks.extend([self.llm.acomplete(p.format()) for _ in range(n)]) - - # process results to LLMResult - # token usage is note included for now - results = run_async_tasks(tasks) - results2D = [results[i : i + n] for i in range(0, len(results), n)] - generations = [ - [Generation(text=r.text) for r in result] for result in results2D - ] - return LLMResult(generations=generations) diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index 78532d434..f6c745b8d 100644 --- a/src/ragas/metrics/critique.py +++ b/src/ragas/metrics/critique.py @@ -68,8 +68,8 @@ class AspectCritique(MetricWithLLM): definition: str = field(default="", repr=True) strictness: int = field(default=1, repr=False) batch_size: int = field(default=15, repr=False) - llm: BaseRagasLLM = field( # type: ignore - default_factory=llm_factory, + llm: BaseRagasLLM | None = field( + default=None, repr=False, ) From 60fc29fe9b4ae55b8ddb5bfaf48bc7a364e042c5 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 4 Jan 2024 14:55:05 +0530 Subject: [PATCH 31/34] fix tests --- src/ragas/llms/__init__.py | 2 -- src/ragas/metrics/critique.py | 7 ++++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ragas/llms/__init__.py b/src/ragas/llms/__init__.py index 71af6e377..9d6285b6e 100644 --- a/src/ragas/llms/__init__.py +++ b/src/ragas/llms/__init__.py @@ -1,11 +1,9 @@ from langchain.chat_models import ChatOpenAI from ragas.llms.base import BaseRagasLLM, LangchainLLMWrapper -from ragas.llms.llamaindex import LlamaIndexLLM __all__ = [ "BaseRagasLLM", - "LlamaIndexLLM", "llm_factory", ] diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index f6c745b8d..9ac8800ea 100644 --- a/src/ragas/metrics/critique.py 
+++ b/src/ragas/metrics/critique.py @@ -7,7 +7,6 @@ import numpy as np -from ragas.llms import llm_factory from ragas.llms.json_load import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM @@ -115,6 +114,8 @@ def _compute_score(self, safe_loaded_responses): return score def _score(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "set LLM before use" + q, c, a = row["question"], row["contexts"], row["answer"] result = self.llm.generate_text( @@ -127,6 +128,8 @@ def _score(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: return self._compute_score(safe_loaded_responses) async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: + assert self.llm is not None, "set LLM before use" + q, c, a = row["question"], row["contexts"], row["answer"] result = await self.llm.agenerate_text( @@ -139,6 +142,8 @@ async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: return self._compute_score(safe_loaded_responses) def adapt(self, language: str, cache_dir: str | None = None) -> None: + assert self.llm is not None, "set LLM before use" + logger.info(f"Adapting Critic to {language}") self.critic_prompt.adapt(language, self.llm, cache_dir) From fa4b24afe3e72461f944263106380620c8bfc253 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 4 Jan 2024 15:59:38 +0530 Subject: [PATCH 32/34] async=False is defualt --- src/ragas/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index 3643710ff..5ff48be70 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -31,7 +31,7 @@ def evaluate( llm: t.Optional[BaseRagasLLM] = None, embeddings: t.Optional[BaseRagasEmbeddings] = None, callbacks: Callbacks = [], - is_async: bool = True, + is_async: bool = False, max_workers: t.Optional[int] = None, raise_exceptions: bool = True, column_map: t.Dict[str, str] = {}, From eb06f2f47f9c0aa38dfd6645e464a5e9eb18a6e8 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 4 Jan 2024 17:46:31 +0530 Subject: [PATCH 33/34] fix prompt --- src/ragas/metrics/_context_relevancy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ragas/metrics/_context_relevancy.py b/src/ragas/metrics/_context_relevancy.py index 0e7016a82..85b2ad20a 100644 --- a/src/ragas/metrics/_context_relevancy.py +++ b/src/ragas/metrics/_context_relevancy.py @@ -18,7 +18,7 @@ CONTEXT_RELEVANCE = Prompt( name="context_relevancy", - instruction='Please extract relevant sentences from the provided context that is absolutely required answer the following question. If no relevant sentences are found, or if you believe the question cannot be answered from the given context, return the phrase "Insufficient Information". While extracting candidate sentences you\'re not allowed to make any changes to sentences from given context.', + instruction="""Please extract relevant sentences from the provided context that is absolutely required answer the following question. If no relevant sentences are found, or if you believe the question cannot be answered from the given context, return the phrase "Insufficient Information". 
While extracting candidate sentences you're not allowed to make any changes to sentences from given context.""", input_keys=["question", "context"], output_key="candidate sentences", output_type="json", @@ -56,7 +56,7 @@ class ContextRelevancy(MetricWithLLM): batch_size: int = 15 show_deprecation_warning: bool = False - def _compute_score(self, responses: t.Any, row: t.Dict) -> float: + def _compute_score(self, responses: str, row: t.Dict) -> float: context = "\n".join(row["contexts"]) overlap_scores = [] context_sents = sent_tokenize(context) From 8f1ce4a9d026b631e808af2f855c8f71d5d510aa Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 4 Jan 2024 17:59:36 +0530 Subject: [PATCH 34/34] fixed context_relevancy bug --- src/ragas/metrics/_context_relevancy.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/src/ragas/metrics/_context_relevancy.py b/src/ragas/metrics/_context_relevancy.py index 85b2ad20a..a7031adf4 100644 --- a/src/ragas/metrics/_context_relevancy.py +++ b/src/ragas/metrics/_context_relevancy.py @@ -56,22 +56,19 @@ class ContextRelevancy(MetricWithLLM): batch_size: int = 15 show_deprecation_warning: bool = False - def _compute_score(self, responses: str, row: t.Dict) -> float: + def _compute_score(self, response: str, row: t.Dict) -> float: context = "\n".join(row["contexts"]) - overlap_scores = [] context_sents = sent_tokenize(context) - for output in responses: - indices = ( - sent_tokenize(output.strip()) - if output.lower() != "insufficient information." - else [] - ) - if len(context_sents) == 0: - score = 0 - else: - score = min(len(indices) / len(context_sents), 1) - overlap_scores.append(score) - return float(np.mean(overlap_scores)) + indices = ( + sent_tokenize(response.strip()) + if response.lower() != "insufficient information." + else [] + ) + # print(len(indices)) + if len(context_sents) == 0: + return 0 + else: + return min(len(indices) / len(context_sents), 1) def _score(self, row: t.Dict, callbacks: Callbacks) -> float: assert self.llm is not None, "LLM is not initialized"
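
The change that patches 30 and 31 repeat across the critique metric is worth spelling out: AspectCritique no longer constructs a default LLM through llm_factory; its llm field now defaults to None, and every method that touches it is guarded with assert self.llm is not None, "set LLM before use", so the caller (presumably the evaluate() entry point) has to inject an LLM before scoring. A minimal, self-contained sketch of that pattern, using stand-in names rather than ragas source:

from __future__ import annotations

import typing as t
from dataclasses import dataclass, field


class FakeLLM:
    """Stand-in for a BaseRagasLLM implementation."""

    def generate_text(self, prompt: str) -> str:
        return '{"verdict": "0"}'


@dataclass
class CritiqueLikeMetric:
    name: str = "harmfulness"
    strictness: int = 1
    # no llm_factory default any more: the LLM must be injected before use
    llm: t.Optional[FakeLLM] = field(default=None, repr=False)

    def score(self, row: t.Dict[str, str]) -> str:
        assert self.llm is not None, "set LLM before use"
        return self.llm.generate_text(row["question"])


metric = CritiqueLikeMetric()
metric.llm = FakeLLM()  # injection normally done by the evaluation entry point
print(metric.score({"question": "Is the answer harmful?"}))

The same guard is what the assert lines added to _score, _ascore and adapt implement in the critique.py diff above.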
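
Patch 34 reduces ContextRelevancy._compute_score from averaging over a list of completions to scoring a single response: the score is the number of sentences the LLM extracted divided by the number of sentences in the retrieved context, capped at 1, with an "Insufficient Information" response scoring 0. A small illustrative sketch of that rule, assuming nltk's punkt sentence tokenizer data is already installed (this is not ragas' public API):

from __future__ import annotations

from nltk.tokenize import sent_tokenize  # assumes punkt data is available


def context_relevancy_score(response: str, contexts: list[str]) -> float:
    # Fraction of context sentences that the LLM extracted as relevant.
    context = "\n".join(contexts)
    context_sents = sent_tokenize(context)
    extracted = (
        sent_tokenize(response.strip())
        if response.lower() != "insufficient information."
        else []
    )
    if len(context_sents) == 0:
        return 0.0
    return min(len(extracted) / len(context_sents), 1.0)


contexts = ["Paris is the capital of France. The Eiffel Tower is in Paris."]
print(context_relevancy_score("Paris is the capital of France.", contexts))  # 0.5

With two context sentences and one extracted sentence, the example prints 0.5.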