From 8b8d1fe7c74caf2e648f00a4f48f3466e03cacc5 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 19 Oct 2023 17:22:25 +0530 Subject: [PATCH 01/41] add langchain loaders to docs --- docs/concepts/testset_generation.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/docs/concepts/testset_generation.md b/docs/concepts/testset_generation.md index 99ca0328a..abe95067a 100644 --- a/docs/concepts/testset_generation.md +++ b/docs/concepts/testset_generation.md @@ -35,6 +35,27 @@ Moving forward, we are will be expanding the range of evolution techniques to of ## Example +```{code-block} python +:caption: loading documents using langchain +from langchain.document_loaders import PubMedLoader + +loader = PubMedLoader("liver", load_max_docs=10) +documents = loader.load() +``` +Checkout [langchain](https://python.langchain.com/docs/modules/data_connection/document_loaders/) document loaders to see more examples + +```{code-block} python +:caption: loading documents using llama-index +from llama_index import download_loader + +SemanticScholarReader = download_loader("SemanticScholarReader") +loader = SemanticScholarReader() +query_space = "large language models" +documents = loader.load_data(query=query_space, limit=10) +``` +Checkout [llama-index](https://gpt-index.readthedocs.io/en/stable/core_modules/data_modules/connector/root.html) document loaders to see more examples + + ```{code-block} python :caption: Customising test set generation from ragas.testset import TestsetGenerator From 716807bd06a4f8ff251ff30583ce2578d2bc8537 Mon Sep 17 00:00:00 2001 From: Tino Max Date: Sat, 16 Dec 2023 20:12:24 -0600 Subject: [PATCH 02/41] Added prompt class --- src/ragas/prompts/__init__.py | 7 +++ src/ragas/prompts/base.py | 101 ++++++++++++++++++++++++++++++++++ tests/unit/test_import.py | 1 + 3 files changed, 109 insertions(+) create mode 100644 src/ragas/prompts/__init__.py create mode 100644 src/ragas/prompts/base.py diff --git a/src/ragas/prompts/__init__.py b/src/ragas/prompts/__init__.py new file mode 100644 index 000000000..d5cdf8f41 --- /dev/null +++ b/src/ragas/prompts/__init__.py @@ -0,0 +1,7 @@ +from ragas.prompts.base import ( + RagasPrompt, +) + +__all__ = [ + "RagasPrompt", +] diff --git a/src/ragas/prompts/base.py b/src/ragas/prompts/base.py new file mode 100644 index 000000000..61e1ad075 --- /dev/null +++ b/src/ragas/prompts/base.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +import json +import typing as t + +from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate +from langchain_core.prompt_values import PromptValue +from langchain_core.messages import BaseMessage + +try: + from pydantic.v1 import root_validator +except ImportError: + from pydantic import root_validator + +class RagasPrompt(PromptValue): + """ + RagasPrompt is a class that represents a prompt for the ragas metrics. + """ + instruction: str + examples: t.Optional[t.List[dict[str, t.Any]]] = [] + input_keys: t.List[str] + output_key: str + output_type: t.Optional[str] = 'JSON' + + def to_string(self) -> str: + """Return prompt value as string.""" + ... + + def to_messages(self) -> t.List[BaseMessage]: + """Return prompt as a list of Messages.""" + ... + + @root_validator + def validate(cls, values: t.Dict) -> t.Dict: + """ + Validate the template string to ensure that it is in desired format. 
+ """ + if values.get("instruction") == None or values.get("instruction") == "": + raise ValueError( + "Instruction cannot be empty" + ) + if values.get("input_keys") == None or values.get("instruction") == []: + raise ValueError( + "Input keys cannot be empty" + ) + if values.get("output_key") == None or values.get("output_key") == "": + raise ValueError( + "Output key cannot be empty" + ) + + if values.get("examples"): + output_key = values.get("output_key") + for no, example in enumerate(values.get("examples")): + for inp_key in values.get("input_keys"): + if inp_key not in example: + raise ValueError( + f"Example {no+1} does not have the variable {inp_key} in the definition" + ) + if output_key not in example: + raise ValueError( + f"Example {no+1} does not have the variable {output_key} in the definition" + ) + if values.get("output_type") == 'JSON': + try: + example_json = json.loads(example[output_key]) + except ValueError as e: + raise ValueError( + f"{output_key} in example {no+1} is not in valid JSON format: {e}" + ) + + return values + + def generate_prompt_string(self) -> str: + """ + Generate the prompt string from the variables. + """ + prompt_str = self.instruction + '\n' + + # Format the examples to match the Langchain prompt template + for example in self.examples: + for key, value in example.items(): + value = value.replace('{','{{').replace('}','}}') if self.output_type == 'JSON' else value + prompt_str += f'\n{key}: {value}' + prompt_str += '\n' + + prompt_str += ''.join(f'\n{key}: {{{key}}}' for key in self.input_keys) + prompt_str += f'\n{self.output_key}: \n' + + return prompt_str + + def format(self, **kwargs: t.Any) -> str: + """ + Format the RagasPrompt object into a ChatPromptTemplate object to be used in metrics. + """ + if set(self.input_keys) != set(kwargs.keys()): + raise ValueError( + f"Input variables {self.input_keys} do not match with the given parameters {list(kwargs.keys())}" + ) + prompt = self.generate_prompt_string() + human_prompt = HumanMessagePromptTemplate.from_template(prompt) + return ChatPromptTemplate.from_messages([human_prompt.format(**kwargs)]) diff --git a/tests/unit/test_import.py b/tests/unit/test_import.py index 0df78a883..807cba7de 100644 --- a/tests/unit/test_import.py +++ b/tests/unit/test_import.py @@ -28,3 +28,4 @@ def test_import_module(): for metric in test_critique: assert hasattr(ragas.metrics.critique, metric) +test_import_module() \ No newline at end of file From 22dd97b2c5334bef9bc76c636419db12a6d9b291 Mon Sep 17 00:00:00 2001 From: Tino Max Date: Sat, 16 Dec 2023 20:22:32 -0600 Subject: [PATCH 03/41] fixed Lint errors --- src/ragas/prompts/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ragas/prompts/base.py b/src/ragas/prompts/base.py index 61e1ad075..343567742 100644 --- a/src/ragas/prompts/base.py +++ b/src/ragas/prompts/base.py @@ -35,15 +35,15 @@ def validate(cls, values: t.Dict) -> t.Dict: """ Validate the template string to ensure that it is in desired format. 
""" - if values.get("instruction") == None or values.get("instruction") == "": + if values.get("instruction") is None or values.get("instruction") == "": raise ValueError( "Instruction cannot be empty" ) - if values.get("input_keys") == None or values.get("instruction") == []: + if values.get("input_keys") is None or values.get("instruction") == []: raise ValueError( "Input keys cannot be empty" ) - if values.get("output_key") == None or values.get("output_key") == "": + if values.get("output_key") is None or values.get("output_key") == "": raise ValueError( "Output key cannot be empty" ) @@ -62,7 +62,7 @@ def validate(cls, values: t.Dict) -> t.Dict: ) if values.get("output_type") == 'JSON': try: - example_json = json.loads(example[output_key]) + json.loads(example[output_key]) except ValueError as e: raise ValueError( f"{output_key} in example {no+1} is not in valid JSON format: {e}" From 9dbcf496c6263937842d820447be4d3010a17280 Mon Sep 17 00:00:00 2001 From: Tino Max Date: Sat, 16 Dec 2023 22:54:09 -0600 Subject: [PATCH 04/41] resolve type issue --- src/ragas/prompts/base.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/ragas/prompts/base.py b/src/ragas/prompts/base.py index 343567742..2a86e0f88 100644 --- a/src/ragas/prompts/base.py +++ b/src/ragas/prompts/base.py @@ -17,10 +17,10 @@ class RagasPrompt(PromptValue): RagasPrompt is a class that represents a prompt for the ragas metrics. """ instruction: str - examples: t.Optional[t.List[dict[str, t.Any]]] = [] + examples: t.List[t.Dict[str, t.Any]] = [] input_keys: t.List[str] output_key: str - output_type: t.Optional[str] = 'JSON' + output_type: str = 'JSON' def to_string(self) -> str: """Return prompt value as string.""" @@ -29,38 +29,37 @@ def to_string(self) -> str: def to_messages(self) -> t.List[BaseMessage]: """Return prompt as a list of Messages.""" ... - - @root_validator - def validate(cls, values: t.Dict) -> t.Dict: + @root_validator() + def validate_prompt(cls, value: t.Dict[str, str]) -> t.Dict[str, str]: """ Validate the template string to ensure that it is in desired format. 
""" - if values.get("instruction") is None or values.get("instruction") == "": + if value.get("instruction") is None or value.get("instruction") == "": raise ValueError( "Instruction cannot be empty" ) - if values.get("input_keys") is None or values.get("instruction") == []: + if value.get("input_keys") is None or value.get("instruction") == []: raise ValueError( "Input keys cannot be empty" ) - if values.get("output_key") is None or values.get("output_key") == "": + if value.get("output_key") is None or value.get("output_key") == "": raise ValueError( "Output key cannot be empty" ) - if values.get("examples"): - output_key = values.get("output_key") - for no, example in enumerate(values.get("examples")): - for inp_key in values.get("input_keys"): - if inp_key not in example: + if value.get("examples"): + output_key = value["output_key"] + for no, example in enumerate(value['examples']): + for inp_key in value['input_keys']: + if not inp_key in example: raise ValueError( f"Example {no+1} does not have the variable {inp_key} in the definition" ) - if output_key not in example: + if not output_key in example: raise ValueError( f"Example {no+1} does not have the variable {output_key} in the definition" ) - if values.get("output_type") == 'JSON': + if value["output_type"] == 'JSON': try: json.loads(example[output_key]) except ValueError as e: @@ -68,7 +67,7 @@ def validate(cls, values: t.Dict) -> t.Dict: f"{output_key} in example {no+1} is not in valid JSON format: {e}" ) - return values + return value def generate_prompt_string(self) -> str: """ @@ -88,7 +87,7 @@ def generate_prompt_string(self) -> str: return prompt_str - def format(self, **kwargs: t.Any) -> str: + def format(self, **kwargs: t.Any) -> ChatPromptTemplate: """ Format the RagasPrompt object into a ChatPromptTemplate object to be used in metrics. """ From 677732b659eaff168b4951dabc014b8480cf756a Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Sun, 17 Dec 2023 20:15:37 +0530 Subject: [PATCH 05/41] prompt adaptation --- src/ragas/prompts/base.py | 132 ++++++++++++++++++++++++++++++-------- 1 file changed, 106 insertions(+), 26 deletions(-) diff --git a/src/ragas/prompts/base.py b/src/ragas/prompts/base.py index 2a86e0f88..2dedeacf0 100644 --- a/src/ragas/prompts/base.py +++ b/src/ragas/prompts/base.py @@ -4,62 +4,62 @@ import typing as t from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate -from langchain_core.prompt_values import PromptValue from langchain_core.messages import BaseMessage +from langchain_core.prompt_values import PromptValue + +from ragas.llms.base import RagasLLM +from ragas.utils import json_loader try: from pydantic.v1 import root_validator except ImportError: from pydantic import root_validator + class RagasPrompt(PromptValue): """ RagasPrompt is a class that represents a prompt for the ragas metrics. """ + instruction: str examples: t.List[t.Dict[str, t.Any]] = [] input_keys: t.List[str] output_key: str - output_type: str = 'JSON' - + output_type: str = "JSON" + def to_string(self) -> str: """Return prompt value as string.""" ... - + def to_messages(self) -> t.List[BaseMessage]: """Return prompt as a list of Messages.""" ... + @root_validator() def validate_prompt(cls, value: t.Dict[str, str]) -> t.Dict[str, str]: """ Validate the template string to ensure that it is in desired format. 
""" if value.get("instruction") is None or value.get("instruction") == "": - raise ValueError( - "Instruction cannot be empty" - ) + raise ValueError("Instruction cannot be empty") if value.get("input_keys") is None or value.get("instruction") == []: - raise ValueError( - "Input keys cannot be empty" - ) + raise ValueError("Input keys cannot be empty") if value.get("output_key") is None or value.get("output_key") == "": - raise ValueError( - "Output key cannot be empty" - ) - + raise ValueError("Output key cannot be empty") + if value.get("examples"): output_key = value["output_key"] - for no, example in enumerate(value['examples']): - for inp_key in value['input_keys']: - if not inp_key in example: + for no, example in enumerate(value["examples"]): + for inp_key in value["input_keys"]: + if inp_key not in example: raise ValueError( f"Example {no+1} does not have the variable {inp_key} in the definition" ) - if not output_key in example: + if output_key not in example: raise ValueError( f"Example {no+1} does not have the variable {output_key} in the definition" ) - if value["output_type"] == 'JSON': + if value["output_type"] == "JSON": try: json.loads(example[output_key]) except ValueError as e: @@ -73,17 +73,21 @@ def generate_prompt_string(self) -> str: """ Generate the prompt string from the variables. """ - prompt_str = self.instruction + '\n' + prompt_str = self.instruction + "\n" # Format the examples to match the Langchain prompt template for example in self.examples: for key, value in example.items(): - value = value.replace('{','{{').replace('}','}}') if self.output_type == 'JSON' else value - prompt_str += f'\n{key}: {value}' - prompt_str += '\n' - - prompt_str += ''.join(f'\n{key}: {{{key}}}' for key in self.input_keys) - prompt_str += f'\n{self.output_key}: \n' + value = ( + value.replace("{", "{{").replace("}", "}}") + if self.output_type == "JSON" + else value + ) + prompt_str += f"\n{key}: {value}" + prompt_str += "\n" + + prompt_str += "".join(f"\n{key}: {{{key}}}" for key in self.input_keys) + prompt_str += f"\n{self.output_key}: \n" return prompt_str @@ -98,3 +102,79 @@ def format(self, **kwargs: t.Any) -> ChatPromptTemplate: prompt = self.generate_prompt_string() human_prompt = HumanMessagePromptTemplate.from_template(prompt) return ChatPromptTemplate.from_messages([human_prompt.format(**kwargs)]) + + def adapt(self, languge: str, llm: RagasLLM) -> None: + # TODO: Add callbacks + prompts = [] + for example in self.examples: + prompts.extend( + [ + str_translation.format(translate_to=languge, input=example.get(key)) + for key in self.input_keys + ] + ) + prompts.append( + json_translatation.format( + translate_to=languge, input=example.get(self.output_key) + ) + if self.output_type.lower() == "json" + else str_translation.format(languge, example.get(self.output_key)) + ) + + results = [result[0].text for result in llm.generate(prompts).generations] + per_example_items = len(self.input_keys) + 1 + grouped_results = [ + results[i : i + per_example_items] + for i in range(0, len(results), per_example_items) + ] + assert len(grouped_results) == len( + self.examples + ), "examples and adapted examples must be of equal length" + for i, example in enumerate(grouped_results): + example_dict = {} + example_dict.update( + {k: v for k, v in zip(self.input_keys, example[: len(self.input_keys)])} + ) + example_dict[self.output_key] = ( + json_loader.safe_load(example[-1], llm=llm) + if self.output_type.lower() == "json" + else example[-1] + ) + self.examples[i] = example_dict + + 
+str_translation = RagasPrompt( + instruction="Language translation", + examples=[ + { + "translate_to": "hindi", + "input": "Who was Albert Einstein and what is he best known for?", + "output": "अल्बर्ट आइंस्टीन कौन थे और वे किसके लिए सबसे ज्यादा प्रसिद्ध हैं?", + }, + ], + input_keys=["translate_to", "input"], + output_key="output", + output_type="str", +) + +json_translatation = RagasPrompt( + instruction="Translate values in given json to target language ", + examples=[ + { + "translate_to": "hindi", + "input": """{ + "statements": [ + "Albert Einstein was born in Germany.", + "Albert Einstein was best known for his theory of relativity." + ]}""", + "output": """{ + "statements": [ + "अल्बर्ट आइंस्टीन का जन्म जर्मनी में हुआ था।", + "अल्बर्ट आइंस्टीन अपने सापेक्षता के सिद्धांत के लिए सबसे अधिक प्रसिद्ध थे।" + ]}""", + } + ], + input_keys=["translate_to", "input"], + output_key="output", + output_type="JSON", +) From 93e184a63011e51fdad018e72d02bc923cc295e5 Mon Sep 17 00:00:00 2001 From: Tino Max Date: Thu, 21 Dec 2023 01:06:42 -0600 Subject: [PATCH 06/41] added prompt objects to metrics --- src/ragas/metrics/_answer_correctness.py | 72 ++++++++-------- src/ragas/metrics/_answer_relevance.py | 74 +++++++--------- src/ragas/metrics/_context_precision.py | 83 +++++++++--------- src/ragas/metrics/_context_recall.py | 103 +++++++++++------------ src/ragas/metrics/_context_relevancy.py | 21 ++--- 5 files changed, 167 insertions(+), 186 deletions(-) diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index b5b22275e..53c2e8d6c 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -8,6 +8,7 @@ from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate +from ragas.llms.prompt import Prompt from ragas.metrics._answer_similarity import AnswerSimilarity from ragas.metrics.base import EvaluationMode, MetricWithLLM from ragas.utils import json_loader @@ -15,39 +16,37 @@ if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks -CORRECTNESS_PROMPT = HumanMessagePromptTemplate.from_template( - """ -Extract following from given question and ground truth - -Question:What powers the sun and what is its primary function? -Answer: The sun is powered by nuclear fission, similar to nuclear reactors on Earth, and its primary function is to provide light to the solar system. -Ground truth: The sun is actually powered by nuclear fusion, not fission. In its core, hydrogen atoms fuse to form helium, releasing a tremendous amount of energy. This energy is what lights up the sun and provides heat and light, essential for life on Earth. The sun's light also plays a critical role in Earth's climate system and helps to drive the weather and ocean currents. 
-Extracted statements: -[ -{{ - "statements that are present in both the answer and the ground truth": ["The sun's primary function is to provide light"], - "statements present in the answer but not found in the ground truth": ["The sun is powered by nuclear fission", "similar to nuclear reactors on Earth"], - "relevant statements found in the ground truth but omitted in the answer": ["The sun is powered by nuclear fusion, not fission", "In its core, hydrogen atoms fuse to form helium, releasing a tremendous amount of energy", "This energy provides heat and light, essential for life on Earth", "The sun's light plays a critical role in Earth's climate system", "The sun helps to drive the weather and ocean currents"] -}} -] - -Question: What is the boiling point of water? -Answer: The boiling point of water is 100 degrees Celsius at sea level. -Ground truth: The boiling point of water is 100 degrees Celsius (212 degrees Fahrenheit) at sea level, but it can change with altitude. -Extracted statements: -[ - {{ - "statements that are present in both the answer and the ground truth": ["The boiling point of water is 100 degrees Celsius at sea level"], - "statements present in the answer but not found in the ground truth": [], - "relevant statements found in the ground truth but omitted in the answer": ["The boiling point can change with altitude", "The boiling point of water is 212 degrees Fahrenheit at sea level"] - }} -] - - -Question:{question} -Answer: {answer} -Ground truth: {ground_truth} -Extracted statements:""" # noqa: E501 +CORRECTNESS_PROMPT = Prompt( + instruction="""Extract following from given question and ground truth""", + examples=[ + { + "question": """What powers the sun and what is its primary function?""", + "answer": """The sun is powered by nuclear fission, similar to nuclear reactors on Earth, and its primary function is to provide light to the solar system.""", + "ground_truth": """The sun is actually powered by nuclear fusion, not fission. In its core, hydrogen atoms fuse to form helium, releasing a tremendous amount of energy. This energy is what lights up the sun and provides heat and light, essential for life on Earth. 
The sun's light also plays a critical role in Earth's climate system and helps to drive the weather and ocean currents.""", + "Extracted statements": """[ + { + "statements that are present in both the answer and the ground truth": ["The sun's primary function is to provide light"], + "statements present in the answer but not found in the ground truth": ["The sun is powered by nuclear fission", "similar to nuclear reactors on Earth"], + "relevant statements found in the ground truth but omitted in the answer": ["The sun is powered by nuclear fusion, not fission", "In its core, hydrogen atoms fuse to form helium, releasing a tremendous amount of energy", "This energy provides heat and light, essential for life on Earth", "The sun's light plays a critical role in Earth's climate system", "The sun helps to drive the weather and ocean currents"] + }] + """ + }, + { + "question": """What is the boiling point of water?""", + "answer": """The boiling point of water is 100 degrees Celsius at sea level.""", + "ground_truth": """The boiling point of water is 100 degrees Celsius (212 degrees Fahrenheit) at sea level, but it can change with altitude.""", + "Extracted statements": """[ + { + "statements that are present in both the answer and the ground truth": ["The boiling point of water is 100 degrees Celsius at sea level"], + "statements present in the answer but not found in the ground truth": [], + "relevant statements found in the ground truth but omitted in the answer": ["The boiling point can change with altitude", "The boiling point of water is 212 degrees Fahrenheit at sea level"] + }] + """ + } + ], + input_keys=["question", "answer", "ground_truth"], + output_key="Extracted statements", + output_type="json" ) @@ -101,10 +100,11 @@ def _score_batch( callback_group_name, callback_manager=cb ) as batch_group: for q, a, g in zip(question, answer, ground_truths): - human_prompt = CORRECTNESS_PROMPT.format( - question=q, ground_truth=g[0], answer=a + prompts.append( + CORRECTNESS_PROMPT.format( + question=q, ground_truth=g[0], answer=a + ) ) - prompts.append(ChatPromptTemplate.from_messages([human_prompt])) result = self.llm.generate(prompts, callbacks=batch_group) outputs = result.generations diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index 9b19ac0c1..1324d2420 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ b/src/ragas/metrics/_answer_relevance.py @@ -11,6 +11,7 @@ from ragas.embeddings.base import embedding_factory from ragas.exceptions import OpenAIKeyNotFound +from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM from ragas.utils import json_loader @@ -19,49 +20,33 @@ from ragas.embeddings.base import RagasEmbeddings - -QUESTION_GEN = HumanMessagePromptTemplate.from_template( - """ -Generate a question for the given answer and Identify if answer is noncommittal - -Answer: -Albert Einstein was born in Germany. -Context: -Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time -Output: -{{"question":"Where was Albert Einstein born?","noncommittal":false}} - - -Answer: -It can change its skin color based on the temperature of its environment. -Context: -A recent scientific study has discovered a new species of frog in the Amazon rainforest that has the unique ability to change its skin color based on the temperature of its environment. 
-Output: -{{"question":"What unique ability does the newly discovered species of frog have?","noncommittal":false}} - - -Answer: -Everest -Context: -The tallest mountain on Earth, measured from sea level, is a renowned peak located in the Himalayas. -Output: -{{"question":"What is the tallest mountain on Earth?","noncommittal":false}} - - -Answer: -I don't know about the groundbreaking feature of the smartphone invented in 2023 as am unware of information beyong 2022. -Context: -In 2023, a groundbreaking invention was announced: a smartphone with a battery life of one month, revolutionizing the way people use mobile technology. -Output: -{{"question":"What was the groundbreaking feature of the smartphone invented in 2023?", "noncommittal":true}} - - - -Answer: -{answer} -Context: -{context} -Output:""" # noqa: E501 +QUESTION_GEN = Prompt( + instruction="""Generate a question for the given answer and Identify if answer is noncommittal""", + examples=[ + { + "answer": """Albert Einstein was born in Germany.""", + "context": """Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time""", + "output": """{"question":"Where was Albert Einstein born?","noncommittal":false}""" + }, + { + "answer": """It can change its skin color based on the temperature of its environment.""", + "context": """A recent scientific study has discovered a new species of frog in the Amazon rainforest that has the unique ability to change its skin color based on the temperature of its environment.""", + "output": """{"question":"What unique ability does the newly discovered species of frog have?","noncommittal":false}""" + }, + { + "answer": """Everest""", + "context": """The tallest mountain on Earth, measured from sea level, is a renowned peak located in the Himalayas.""", + "output": """{"question":"What is the tallest mountain on Earth?","noncommittal":false}""" + }, + { + "answer": """I don't know about the groundbreaking feature of the smartphone invented in 2023 as am unware of information beyond 2022. 
""", + "context": """In 2023, a groundbreaking invention was announced: a smartphone with a battery life of one month, revolutionizing the way people use mobile technology.""", + "output": """{"question":"What was the groundbreaking feature of the smartphone invented in 2023?", "noncommittal":true}""" + } + ], + input_keys=["answer", "context"], + output_key="output", + output_type="json" ) @@ -117,8 +102,7 @@ def _score_batch( ) as batch_group: prompts = [] for ans, ctx in zip(answers, contexts): - human_prompt = QUESTION_GEN.format(answer=ans, context="\n".join(ctx)) - prompts.append(ChatPromptTemplate.from_messages([human_prompt])) + prompts.append(QUESTION_GEN.format(answer=ans, context="\n".join(ctx))) results = self.llm.generate( prompts, diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py index 74dfde972..dc62b67c7 100644 --- a/src/ragas/metrics/_context_precision.py +++ b/src/ragas/metrics/_context_precision.py @@ -9,49 +9,50 @@ from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate +from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM from ragas.utils import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks -CONTEXT_PRECISION = HumanMessagePromptTemplate.from_template( - """Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not. - -question: What can you tell me about albert Albert Einstein? -context: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist,widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius. -answer: Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895 -verification: -{{ - "reason": "The provided context was indeed useful in arriving at the given answer. The context includes key information about Albert Einstein's life and contributions, which are reflected in the answer.", - "Verdict": "1" -}} - -question: who won 2020 icc world cup? -context: Who won the 2022 ICC Men's T20 World Cup? -The 2022 ICC Men's T20 World Cup, held from October 16 to November 13, 2022, in Australia, was the eighth edition of the tournament. 
Originally scheduled for 2020, it was postponed due to the COVID-19 pandemic. England emerged victorious, defeating Pakistan by five wickets in the final to clinch their second ICC Men's T20 World Cup title. -answer: England -verification: -{{ - "reason": "the context was useful in clarifying the situation regarding the 2020 ICC World Cup and indicating that England was the winner of the tournament that was intended to be held in 2020 but actually took place in 2022.", - "verdict": "1" -}} - - -question: What is the tallest mountain in the world? -context: The Andes is the longest continental mountain range in the world, located in South America. It stretches across seven countries and features many of the highest peaks in the Western Hemisphere. The range is known for its diverse ecosystems, including the high-altitude Andean Plateau and the Amazon rainforest. -answer: Mount Everest. -verification: -{{ - "reason":"the provided context discusses the Andes mountain range, which, while impressive, does not include Mount Everest or directly relate to the question about the world's tallest mountain.", - "verdict":"0" -}} - - -question:{question} -context:{context} -answer:{answer} -verification:""" +CONTEXT_PRECISION = Prompt( + instruction="""Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not. """, + examples=[ + { + "question": """What can you tell me about albert Albert Einstein?""", + "context": """Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius.""", + "answer": """Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895""", + "verification": """{ + "reason": "The provided context was indeed useful in arriving at the given answer. 
The context includes key information about Albert Einstein's life and contributions, which are reflected in the answer.", + "Verdict": "1" + } + """ + }, + { + "question": """who won 2020 icc world cup?""", + "context": """Who won the 2022 ICC Men's T20 World Cup?""", + "answer": """England""", + "verification": """{ + "reason": "the context was useful in clarifying the situation regarding the 2020 ICC World Cup and indicating that England was the winner of the tournament that was intended to be held in 2020 but actually took place in 2022.", + "verdict": "1" + } + """ + }, + { + "question": """What is the tallest mountain in the world?""", + "context": """The Andes is the longest continental mountain range in the world, located in South America. It stretches across seven countries and features many of the highest peaks in the Western Hemisphere. The range is known for its diverse ecosystems, including the high-altitude Andean Plateau and the Amazon rainforest.""", + "answer": """Mount Everest.""", + "verification": """{ + "reason":"the provided context discusses the Andes mountain range, which, while impressive, does not include Mount Everest or directly relate to the question about the world's tallest mountain.", + "verdict":"0" + } + """ + } + ], + input_keys=["question", "context", "answer"], + output_key="verification", + output_type="json" ) @@ -97,12 +98,8 @@ def _score_batch( ) as batch_group: for qstn, ctx, answer in zip(questions, contexts, answers): human_prompts = [ - ChatPromptTemplate.from_messages( - [ - CONTEXT_PRECISION.format( - question=qstn, context=c, answer=answer - ) - ] + CONTEXT_PRECISION.format( + question=qstn, context=c, answer=answer ) for c in ctx ] diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index 22ee2b928..5e483a283 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -8,61 +8,59 @@ from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate +from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM from ragas.utils import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks -CONTEXT_RECALL_RA = HumanMessagePromptTemplate.from_template( - """ -Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only "Yes" (1) or "No" (0) as a binary classification. Output json with reason. - - -question: What can you tell me about albert Albert Einstein? -context: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist,widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. 
In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius. -answer: Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895 -classification: -[ - {{ "statement_1":"Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time.", - "reason": "The date of birth of Einstein is mentioned clearly in the context.", - "Attributed": "1" - }}, - {{ - "statement_2":"He received the 1921 Nobel Prize in Physics 'for his services to theoretical physics.", - "reason": "The exact sentence is present in the given context.", - "Attributed": "1" - }}, - {{ - "statement_3": "He published 4 papers in 1905.", - "reason": "There is no mention about papers he wrote in the given context.", - "Attributed": "0" - }}, - {{ - "statement_4":"Einstein moved to Switzerland in 1895.", - "reason": "There is no supporting evidence for this in the given context.", - "Attributed": "0" - }} -] - -question: who won 2020 icc world cup? -context: Who won the 2022 ICC Men's T20 World Cup? -The 2022 ICC Men's T20 World Cup, held from October 16 to November 13, 2022, in Australia, was the eighth edition of the tournament. Originally scheduled for 2020, it was postponed due to the COVID-19 pandemic. England emerged victorious, defeating Pakistan by five wickets in the final to clinch their second ICC Men's T20 World Cup title. -answer: England -classification: -[ - {{ - "statement_1":"England won the 2022 ICC Men's T20 World Cup.", - "reason": "From context it is clear that England defeated Pakistan to win the World Cup.", - "Attributed": "1" - }} -] - -question:{question} -context:{context} -answer:{answer} -classification: -""" # noqa: E501 +CONTEXT_RECALL_RA = Prompt( + instruction="""Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only "Yes" (1) or "No" (0) as a binary classification. Output json with reason.""", + examples=[ + { + "question": """What can you tell me about albert Albert Einstein?""", + "context": """Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called 'the world's most famous equation'. He received the 1921 Nobel Prize in Physics 'for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect', a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. 
In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius.""", + "answer": """Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895""", + "classification": """[ + { + "statement_1":"Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time.", + "reason": "The date of birth of Einstein is mentioned clearly in the context.", + "Attributed": "1" + }, + { + "statement_2":"He received the 1921 Nobel Prize in Physics 'for his services to theoretical physics.", + "reason": "The exact sentence is present in the given context.", + "Attributed": "1" + }, + { + "statement_3": "He published 4 papers in 1905.", + "reason": "There is no mention about papers he wrote in the given context.", + "Attributed": "0" + }, + { + "statement_4":"Einstein moved to Switzerland in 1895.", + "reason": "There is no supporting evidence for this in the given context.", + "Attributed": "0" + }] + """ + }, + { + "question": """who won 2020 icc world cup?""", + "context": """Who won the 2022 ICC Men's T20 World Cup?""", + "answer": """England""", + "classification": """[ + { + "statement_1":"England won the 2022 ICC Men's T20 World Cup.", + "reason": "From context it is clear that England defeated Pakistan to win the World Cup.", + "Attributed": "1" + }] + """ + } + ], + input_keys=["question", "context", "answer"], + output_key="classification", + output_type="json" ) @@ -104,10 +102,11 @@ def _score_batch( for qstn, gt, ctx in zip(question, ground_truths, contexts): gt = "\n".join(gt) if isinstance(gt, list) else gt ctx = "\n".join(ctx) if isinstance(ctx, list) else ctx - human_prompt = CONTEXT_RECALL_RA.format( - question=qstn, context=ctx, answer=gt + prompts.append( + CONTEXT_RECALL_RA.format( + question=qstn, context=ctx, answer=gt + ) ) - prompts.append(ChatPromptTemplate.from_messages([human_prompt])) responses: list[list[str]] = [] results = self.llm.generate( diff --git a/src/ragas/metrics/_context_relevancy.py b/src/ragas/metrics/_context_relevancy.py index c7e4ed1df..77505a7a5 100644 --- a/src/ragas/metrics/_context_relevancy.py +++ b/src/ragas/metrics/_context_relevancy.py @@ -11,20 +11,20 @@ from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate +from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks -CONTEXT_RELEVANCE = HumanMessagePromptTemplate.from_template( - """\ -Please extract relevant sentences from the provided context that is absolutely required answer the following question. If no relevant sentences are found, or if you believe the question cannot be answered from the given context, return the phrase "Insufficient Information". While extracting candidate sentences you're not allowed to make any changes to sentences from given context. 
- -question:{question} -context:\n{context} -candidate sentences:\n""" # noqa: E501 +CONTEXT_RELEVANCE = Prompt( + instruction="""Please extract relevant sentences from the provided context that is absolutely required answer the following question. If no relevant sentences are found, or if you believe the question cannot be answered from the given context, return the phrase "Insufficient Information". While extracting candidate sentences you're not allowed to make any changes to sentences from given context.""", + input_keys=["question", "context"], + output_key="candidate sentences", + output_type="json" ) + seg = pysbd.Segmenter(language="en", clean=False) @@ -76,10 +76,11 @@ def _score_batch( callback_group_name, callback_manager=cb ) as batch_group: for q, c in zip(questions, contexts): - human_prompt = CONTEXT_RELEVANCE.format( - question=q, context="\n".join(c) + prompts.append( + CONTEXT_RELEVANCE.format( + question=q, context="\n".join(c) + ) ) - prompts.append(ChatPromptTemplate.from_messages([human_prompt])) responses: list[list[str]] = [] results = self.llm.generate( From dfb8f0305bbaed0657fd3848db3e9fb87976f6cb Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 25 Dec 2023 10:07:45 +0530 Subject: [PATCH 07/41] added adaption --- src/ragas/metrics/_faithfulness.py | 233 ++++++++++++++++------------- src/ragas/metrics/base.py | 26 +++- src/ragas/prompts/base.py | 38 ++++- 3 files changed, 181 insertions(+), 116 deletions(-) diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 3717499c5..0908fe725 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -5,126 +5,143 @@ import numpy as np from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate +from langchain.prompts import ChatPromptTemplate +from ragas.llms import RagasLLM +import os from ragas.metrics.base import EvaluationMode, MetricWithLLM from ragas.utils import json_loader - +from ragas.prompts import RagasPrompt if t.TYPE_CHECKING: from datasets import Dataset from langchain.callbacks.base import Callbacks - -LONG_FORM_ANSWER_PROMPT = HumanMessagePromptTemplate.from_template( - """\ -Create one or more statements from each sentence in the given answer. - -question: Who was Albert Einstein and what is he best known for? -answer: He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics. -statements in json: -{{ - "statements": [ - "Albert Einstein was born in Germany.", - "Albert Einstein was best known for his theory of relativity." - ] -}} - -question: Cadmium Chloride is slightly soluble in this chemical, it is also called what? -answer: alcohol -statements in json: -{{ - "statements": [ - "Cadmium Chloride is slightly soluble in alcohol." - ] -}} - -question: Were Hitler and Benito Mussolini of the same nationality? -answer: Sorry, I can't provide answer to that question. 
-statements in json: -{{ - "statements": [] -}} - -question:{question} -answer: {answer} -statements in json:""" # noqa: E501 +LONG_FORM_ANSWER_PROMPT = RagasPrompt( + name="long_form_answer", + instruction="Create one or more statements from each sentence in the given answer.", + examples=[ + { + "question": "Who was Albert Einstein and what is he best known for?", + "answer": "He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.", + "statements": """{ + "statements": [ + "Albert Einstein, a German-born theoretical physicist, is renowned for being one of the most influential physicists in history.", + "Albert Einstein was best known for his theory of relativity.", + "Einstein's contributions significantly advanced the field of quantum mechanics", + "Recognized globally, Einstein's work has profoundly impacted the scientific community", + "Einstein's groundbreaking theories continue to shape our understanding of physics today." + ] + }""" + }, + { + "question": "Cadmium Chloride is slightly soluble in this chemical, it is also called what?", + "answer": "alcohol", + "statements": """{ + "statements": [ + "Cadmium Chloride is slightly soluble in alcohol." + ] + }""" + }, + { + "question":"Were Hitler and Benito Mussolini of the same nationality?", + "answer":"Sorry, I can't provide answer to that question.", + "statements":"""{ + "statements": [] + }""" + } + ], + input_keys=["question", "answer"], + output_key="statements", + output_type="JSON", ) -NLI_STATEMENTS_MESSAGE = HumanMessagePromptTemplate.from_template( - """ - Natural language inference. Use only "Yes" (1), "No" (0) and "Null" (-1) as verdict. - -Context: -John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects. -statement_1: John is majoring in Biology. -statement_2: John is taking a course on Artificial Intelligence. -statement_3: John is a dedicated student. -statement_4: John has a part-time job. -Answer: -[ - {{ - "statement_1": "John is majoring in Biology.", - "reason": "John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology.", - "verdict": "0" - }}, - {{ - "statement_2": "John is taking a course on Artificial Intelligence.", - "reason": "The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI.", - "verdict": "0" - }}, - {{ - "statement_3": "John is a dedicated student.", - "reason": "The context states that he spends a significant amount of time studying and completing assignments. 
Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication.", - "verdict": "1" - }}, - {{ - "statement_4": "John has a part-time job.", - "reason": "There is no information given in the context about John having a part-time job.", - "verdict": "0" - }} -] - -Context: -Photosynthesis is a process used by plants, algae, and certain bacteria to convert light energy into chemical energy. -statement_1: Albert Einstein was a genius. -Answer: -[ - {{ - "statement_1": "Albert Einstein was a genius.", - "reason": "The context and statement are unrelated" - "verdict": "0" - }} -] - -Context: -Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time. -statement_1: Nil -Answer: -[ - {{ - "statement_1": "Nil", - "reason": "The statement is invalid", - "verdict": "-1" - }} -] - - -context: -{context} -statements: -{statements} -Answer: -""" # noqa: E501 +NLI_STATEMENTS_MESSAGE = RagasPrompt( + name="nli_statements", + instruction="Natural language inference. Use only 'Yes' (1), 'No' (0) and 'Null' (-1) as verdict.", + examples=[ + { + "context": """John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects.""", + "statements":""" + statement_1: John is majoring in Biology. + statement_2: John is taking a course on Artificial Intelligence. + statement_3: John is a dedicated student. + statement_4: John has a part-time job. + """, + "answer": """[ + { + "statement_1": "John is majoring in Biology.", + "reason": "John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology.", + "verdict": "0" + }, + { + "statement_2": "John is taking a course on Artificial Intelligence.", + "reason": "The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI.", + "verdict": "0" + }, + { + "statement_3": "John is a dedicated student.", + "reason": "The context states that he spends a significant amount of time studying and completing assignments. 
Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication.", + "verdict": "1" + }, + { + "statement_4": "John has a part-time job.", + "reason": "There is no information given in the context about John having a part-time job.", + "verdict": "0" + } + ] + """ + + + }, + { + "context": """Photosynthesis is a process used by plants, algae, and certain bacteria to convert light energy into chemical energy.""", + "statements":"""statement_1: Albert Einstein was a genius.""", + "answer":"""{ + "statement_1": "Albert Einstein was a genius.", + "reason": "The context and statement are unrelated", + "verdict": "0" + }""" + }, + { + "context": """Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time.""", + "statements":"""statement_1: Nil""", + "answer":"""{ + "statement_1": "Nil", + "reason": "The statement is invalid", + "verdict": "-1" + }""" + } + ], + input_keys=["context", "statements"], + output_key="answer", + output_type="JSON", ) - @dataclass class Faithfulness(MetricWithLLM): name: str = "faithfulness" # type: ignore evaluation_mode: EvaluationMode = EvaluationMode.qac # type: ignore batch_size: int = 15 - + + def __post_init__(self: t.Self): + + self.long_form_answer_prompt = LONG_FORM_ANSWER_PROMPT + self.nli_statements_message = NLI_STATEMENTS_MESSAGE + + def adapt(self, languge: str, cache_dir: str = "~/.cache/ragas/metrics/") -> None: + + cache_dir = os.path.join(cache_dir, self.name) + self.long_form_answer_prompt.adapt(languge, cache_dir, self.llm) + self.nli_statements_message.adapt(languge, cache_dir, self.llm) + + def save(self, cache_dir: str = "~/.cache/ragas/metrics/") -> None: + #cache_dir/metric_name/language/prompt_name.pkl + + cache_dir = os.path.join(cache_dir, self.name) + self.long_form_answer_prompt.save(cache_dir) + self.nli_statements_message.save(cache_dir) + def _score_batch( self: t.Self, dataset: Dataset, @@ -147,8 +164,8 @@ def _score_batch( callback_group_name, callback_manager=cb ) as batch_group: for q, a in zip(question, answer): - human_prompt = LONG_FORM_ANSWER_PROMPT.format(question=q, answer=a) - prompts.append(ChatPromptTemplate.from_messages([human_prompt])) + human_prompt = self.long_form_answer_prompt.format(question=q, answer=a) + prompts.append([human_prompt]) result = self.llm.generate(prompts, callbacks=batch_group) @@ -162,14 +179,14 @@ def _score_batch( [f"statement_{i+1}: {st}" for i, st in enumerate(statements)] ) contexts_str: str = "\n".join(context) - human_prompt = NLI_STATEMENTS_MESSAGE.format( + human_prompt = self.nli_statements_message.format( context=contexts_str, statements=statements_str ) - prompts.append(ChatPromptTemplate.from_messages([human_prompt])) + prompts.append([human_prompt]) result = self.llm.generate(prompts, callbacks=batch_group) outputs = result.generations - verdict_score_map = {"1": 1, "0": 0, "-1": np.nan} + verdict_score_map = {"1": 1, "0": 0, "1": np.nan} scores = [] for output in outputs: output = json_loader.safe_load(output[0].text, self.llm) diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index 097058733..cd69510d3 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -17,7 +17,7 @@ from tqdm import tqdm from ragas.embeddings.base import RagasEmbeddings -from ragas.llms import llm_factory +from ragas.llms import RagasLLM, llm_factory if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks @@ -63,6 +63,27 @@ 
def init_model(self): This method will lazy initialize the model. """ ... + + # @abstractmethod + def adapt(self, languge: str) -> None: + """ + Adapt the metric to a different language. + """ + pass + + # @abstractmethod + def save(self, path: t.Optional[str]=None) -> None: + """ + Save the metric to a path. + """ + pass + + # @abstractmethod + def load(self, path: t.Optional[str]=None) -> None: + """ + Load the metric from a path. + """ + pass def score( self: t.Self, @@ -111,7 +132,8 @@ def get_batches(self, dataset_size: int) -> list[range]: @dataclass class MetricWithLLM(Metric): llm: RagasLLM = field(default_factory=llm_factory) - + + def init_model(self): """ Init any models in the metric, this is invoked before evaluate() diff --git a/src/ragas/prompts/base.py b/src/ragas/prompts/base.py index 2dedeacf0..23eb3115b 100644 --- a/src/ragas/prompts/base.py +++ b/src/ragas/prompts/base.py @@ -6,6 +6,8 @@ from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate from langchain_core.messages import BaseMessage from langchain_core.prompt_values import PromptValue +import pickle +import os from ragas.llms.base import RagasLLM from ragas.utils import json_loader @@ -20,12 +22,13 @@ class RagasPrompt(PromptValue): """ RagasPrompt is a class that represents a prompt for the ragas metrics. """ - + name: str instruction: str examples: t.List[t.Dict[str, t.Any]] = [] input_keys: t.List[str] output_key: str output_type: str = "JSON" + language: str = "en" def to_string(self) -> str: """Return prompt value as string.""" @@ -103,22 +106,26 @@ def format(self, **kwargs: t.Any) -> ChatPromptTemplate: human_prompt = HumanMessagePromptTemplate.from_template(prompt) return ChatPromptTemplate.from_messages([human_prompt.format(**kwargs)]) - def adapt(self, languge: str, llm: RagasLLM) -> None: + def adapt(self, language: str, cache_dir: str, llm: RagasLLM) -> None: # TODO: Add callbacks + cache_dir = cache_dir if cache_dir else "~/.cache/ragas/prompts" + if os.path.exists(os.path.join(cache_dir, language, f"{self.name}.json")): + self._load(language, self.name, cache_dir) + prompts = [] for example in self.examples: prompts.extend( [ - str_translation.format(translate_to=languge, input=example.get(key)) + str_translation.format(translate_to=language, input=example.get(key)) for key in self.input_keys ] ) prompts.append( json_translatation.format( - translate_to=languge, input=example.get(self.output_key) + translate_to=language, input=example.get(self.output_key) ) if self.output_type.lower() == "json" - else str_translation.format(languge, example.get(self.output_key)) + else str_translation.format(translate_to=language, input=example.get(self.output_key)) ) results = [result[0].text for result in llm.generate(prompts).generations] @@ -141,9 +148,27 @@ def adapt(self, languge: str, llm: RagasLLM) -> None: else example[-1] ) self.examples[i] = example_dict - + + self.language = language + + def save(self, cache_dir: str = "~/.cache/ragas/prompts") -> None: + + cache_dir = os.path.join(cache_dir, self.language) + if not os.path.exists(cache_dir): + os.makedirs(cache_dir) + + cache_path = os.path.join(cache_dir, f"{self.name}.json") + self.to_json(cache_path) + + + @classmethod + def _load(cls, language: str, name: str, cache_dir: str ) -> RagasPrompt: + + path = os.path.join(cache_dir, language, f"{name}.json") + cls(**json.load(open(path))) str_translation = RagasPrompt( + name="str_translation", instruction="Language translation", examples=[ { @@ -158,6 +183,7 @@ def adapt(self, 
languge: str, llm: RagasLLM) -> None: ) json_translatation = RagasPrompt( + name="json_translation", instruction="Translate values in given json to target language ", examples=[ { From d090df5efe3a0615a198bf4a264197d5de7b0fd7 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 25 Dec 2023 11:16:57 +0530 Subject: [PATCH 08/41] noqa --- src/ragas/metrics/_faithfulness.py | 211 ++++++++++++++--------------- src/ragas/metrics/critique.py | 52 ++++--- 2 files changed, 134 insertions(+), 129 deletions(-) diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 3717499c5..5abb27800 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -5,9 +5,9 @@ import numpy as np from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate from ragas.metrics.base import EvaluationMode, MetricWithLLM +from ragas.prompts import RagasPrompt from ragas.utils import json_loader if t.TYPE_CHECKING: @@ -15,108 +15,105 @@ from langchain.callbacks.base import Callbacks -LONG_FORM_ANSWER_PROMPT = HumanMessagePromptTemplate.from_template( - """\ -Create one or more statements from each sentence in the given answer. - -question: Who was Albert Einstein and what is he best known for? -answer: He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics. -statements in json: -{{ - "statements": [ - "Albert Einstein was born in Germany.", - "Albert Einstein was best known for his theory of relativity." - ] -}} - -question: Cadmium Chloride is slightly soluble in this chemical, it is also called what? -answer: alcohol -statements in json: -{{ - "statements": [ - "Cadmium Chloride is slightly soluble in alcohol." - ] -}} - -question: Were Hitler and Benito Mussolini of the same nationality? -answer: Sorry, I can't provide answer to that question. -statements in json: -{{ - "statements": [] -}} - -question:{question} -answer: {answer} -statements in json:""" # noqa: E501 -) - - -NLI_STATEMENTS_MESSAGE = HumanMessagePromptTemplate.from_template( - """ - Natural language inference. Use only "Yes" (1), "No" (0) and "Null" (-1) as verdict. - -Context: -John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects. -statement_1: John is majoring in Biology. -statement_2: John is taking a course on Artificial Intelligence. -statement_3: John is a dedicated student. -statement_4: John has a part-time job. -Answer: -[ - {{ - "statement_1": "John is majoring in Biology.", - "reason": "John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology.", - "verdict": "0" - }}, - {{ - "statement_2": "John is taking a course on Artificial Intelligence.", - "reason": "The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. 
Therefore, it cannot be deduced that John is taking a course on AI.", - "verdict": "0" - }}, - {{ - "statement_3": "John is a dedicated student.", - "reason": "The context states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication.", - "verdict": "1" - }}, - {{ - "statement_4": "John has a part-time job.", - "reason": "There is no information given in the context about John having a part-time job.", - "verdict": "0" - }} -] - -Context: -Photosynthesis is a process used by plants, algae, and certain bacteria to convert light energy into chemical energy. -statement_1: Albert Einstein was a genius. -Answer: -[ - {{ - "statement_1": "Albert Einstein was a genius.", - "reason": "The context and statement are unrelated" - "verdict": "0" - }} -] - -Context: -Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time. -statement_1: Nil -Answer: -[ - {{ - "statement_1": "Nil", - "reason": "The statement is invalid", - "verdict": "-1" - }} -] - - -context: -{context} -statements: -{statements} -Answer: -""" # noqa: E501 -) +LONG_FORM_ANSWER_PROMPT = RagasPrompt( + name="long_form_answer", + instruction="Create one or more statements from each sentence in the given answer.", + examples=[ + { + "question": "Who was Albert Einstein and what is he best known for?", + "answer": "He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.", + "statements": """{ + "statements": [ + "Albert Einstein, a German-born theoretical physicist, is renowned for being one of the most influential physicists in history.", + "Albert Einstein was best known for his theory of relativity.", + "Einstein's contributions significantly advanced the field of quantum mechanics", + "Recognized globally, Einstein's work has profoundly impacted the scientific community", + "Einstein's groundbreaking theories continue to shape our understanding of physics today." + ] + }""", + }, + { + "question": "Cadmium Chloride is slightly soluble in this chemical, it is also called what?", + "answer": "alcohol", + "statements": """{ + "statements": [ + "Cadmium Chloride is slightly soluble in alcohol." + ] + }""", + }, + { + "question": "Were Hitler and Benito Mussolini of the same nationality?", + "answer": "Sorry, I can't provide answer to that question.", + "statements": """{ + "statements": [] + }""", + }, + ], + input_keys=["question", "answer"], + output_key="statements", + output_type="JSON", +) # noqa: E501 + + +NLI_STATEMENTS_MESSAGE = RagasPrompt( + name="nli_statements", + instruction="Natural language inference. Use only 'Yes' (1), 'No' (0) and 'Null' (-1) as verdict.", + examples=[ + { + "context": """John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects.""", + "statements": """ + statement_1: John is majoring in Biology. 
+ statement_2: John is taking a course on Artificial Intelligence. + statement_3: John is a dedicated student. + statement_4: John has a part-time job. + """, + "answer": """[ + { + "statement_1": "John is majoring in Biology.", + "reason": "John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology.", + "verdict": "0" + }, + { + "statement_2": "John is taking a course on Artificial Intelligence.", + "reason": "The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI.", + "verdict": "0" + }, + { + "statement_3": "John is a dedicated student.", + "reason": "The context states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication.", + "verdict": "1" + }, + { + "statement_4": "John has a part-time job.", + "reason": "There is no information given in the context about John having a part-time job.", + "verdict": "0" + } + ] + """, + }, + { + "context": """Photosynthesis is a process used by plants, algae, and certain bacteria to convert light energy into chemical energy.""", + "statements": """statement_1: Albert Einstein was a genius.""", + "answer": """{ + "statement_1": "Albert Einstein was a genius.", + "reason": "The context and statement are unrelated", + "verdict": "0" + }""", + }, + { + "context": """Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time.""", + "statements": """statement_1: Nil""", + "answer": """{ + "statement_1": "Nil", + "reason": "The statement is invalid", + "verdict": "-1" + }""", + }, + ], + input_keys=["context", "statements"], + output_key="answer", + output_type="JSON", +) # noqa: E501 @dataclass @@ -148,7 +145,7 @@ def _score_batch( ) as batch_group: for q, a in zip(question, answer): human_prompt = LONG_FORM_ANSWER_PROMPT.format(question=q, answer=a) - prompts.append(ChatPromptTemplate.from_messages([human_prompt])) + prompts.append(human_prompt) result = self.llm.generate(prompts, callbacks=batch_group) @@ -165,15 +162,15 @@ def _score_batch( human_prompt = NLI_STATEMENTS_MESSAGE.format( context=contexts_str, statements=statements_str ) - prompts.append(ChatPromptTemplate.from_messages([human_prompt])) + prompts.append(human_prompt) result = self.llm.generate(prompts, callbacks=batch_group) outputs = result.generations - verdict_score_map = {"1": 1, "0": 0, "-1": np.nan} + verdict_score_map = {"1": 1, "0": 0, "null": np.nan} scores = [] for output in outputs: output = json_loader.safe_load(output[0].text, self.llm) - output = output if isinstance(output, list) else [] + output = output if isinstance(output, list) else [output] faithful_statements = sum( verdict_score_map.get(dict.get("verdict", "").lower(), np.nan) for dict in output diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index 4075acb34..aa8706235 100644 --- a/src/ragas/metrics/critique.py +++ b/src/ragas/metrics/critique.py @@ -4,33 +4,39 @@ from collections import Counter from dataclasses import dataclass, field +import numpy as np from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate from ragas.llms import llm_factory from 
ragas.metrics.base import EvaluationMode, MetricWithLLM +from ragas.prompts import RagasPrompt +from ragas.utils import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks from ragas.llms import RagasLLM -CRITIQUE_PROMPT = HumanMessagePromptTemplate.from_template( - """Given a input and submission. Evaluate the submission only using the given criteria. -Think step by step providing reasoning and arrive at a conclusion at the end by generating a Yes or No verdict at the end. -input: Who was the director of Los Alamos Laboratory? -submission: Einstein was the director of Los Alamos Laboratory. -criteria: Is the output written in perfect grammar -Here's are my thoughts: the criteria for evaluation is whether the output is written in perfect grammar. In this case, the output is grammatically correct. Therefore, the answer is:\n\nYes - -input:{input} -submission:{submission} -criteria:{criteria} -Here's are my thoughts: -""" # noqa: E501 -) +CRITIQUE_PROMPT = RagasPrompt( + instruction="Given a input and submission. Evaluate the submission only using the given criteria. Use only 'Yes' (1) and 'No' (0) as verdict.", + examples=[ + { + "input": "Who was the director of Los Alamos Laboratory?", + "submission": "Einstein was the director of Los Alamos Laboratory.", + "criteria": "Is the output written in perfect grammar", + "output": """{ + "reason":"the criteria for evaluation is whether the output is written in perfect grammar. In this case, the output is grammatically correct.", + "verdict":"1" + } + """, + } + ], + input_keys=["input", "submission", "criteria"], + output_key="output", + output_type="JSON", +) # noqa: E501 @dataclass @@ -113,7 +119,7 @@ def _score_batch( ) as batch_group: for question, context, answer in zip(questions, contexts, answers): human_prompt = self.prompt_format(question, answer, context) - prompts.append(ChatPromptTemplate.from_messages([human_prompt])) + prompts.append(human_prompt) results = self.llm.generate( prompts, @@ -125,18 +131,20 @@ def _score_batch( ] scores = [] - answer_dict = {"Yes": 1, "No": 0} + answer_dict = {"1": 1, "0": 0} for response in responses: - response = [(text, text.split("\n\n")[-1]) for text in response] + response = [json_loader.safe_load(item, self.llm) for item in response] if self.strictness > 1: score = Counter( - [answer_dict.get(item[-1], 0) for item in response] + [ + answer_dict.get(item.get("verdict", np.nan), np.nan) + for item in response + ] ).most_common(1)[0][0] else: - score = answer_dict.get(response[0][-1]) + score = answer_dict.get(response[0].get("verdict", np.nan), np.nan) - # patch for critique: force score to 0 if the answer is not Yes or No - scores.append(score if score is not None else 0) + scores.append(score) return scores From 3deb30ef70bde39c0205ca05611fca6aa4335da7 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 25 Dec 2023 12:00:29 +0530 Subject: [PATCH 09/41] prompt --- src/ragas/metrics/_faithfulness.py | 10 +++++----- src/ragas/metrics/critique.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 5abb27800..1f645f008 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -6,8 +6,8 @@ import numpy as np from langchain.callbacks.manager import CallbackManager, trace_as_chain_group +from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.prompts import RagasPrompt from ragas.utils import 
json_loader if t.TYPE_CHECKING: @@ -15,7 +15,7 @@ from langchain.callbacks.base import Callbacks -LONG_FORM_ANSWER_PROMPT = RagasPrompt( +LONG_FORM_ANSWER_PROMPT = Prompt( name="long_form_answer", instruction="Create one or more statements from each sentence in the given answer.", examples=[ @@ -55,7 +55,7 @@ ) # noqa: E501 -NLI_STATEMENTS_MESSAGE = RagasPrompt( +NLI_STATEMENTS_MESSAGE = Prompt( name="nli_statements", instruction="Natural language inference. Use only 'Yes' (1), 'No' (0) and 'Null' (-1) as verdict.", examples=[ @@ -112,8 +112,8 @@ ], input_keys=["context", "statements"], output_key="answer", - output_type="JSON", -) # noqa: E501 + output_type="JSON", +) # noqa: E501 @dataclass diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index aa8706235..7470cc788 100644 --- a/src/ragas/metrics/critique.py +++ b/src/ragas/metrics/critique.py @@ -9,8 +9,8 @@ from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from ragas.llms import llm_factory +from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.prompts import RagasPrompt from ragas.utils import json_loader if t.TYPE_CHECKING: @@ -19,7 +19,7 @@ from ragas.llms import RagasLLM -CRITIQUE_PROMPT = RagasPrompt( +CRITIQUE_PROMPT = Prompt( instruction="Given a input and submission. Evaluate the submission only using the given criteria. Use only 'Yes' (1) and 'No' (0) as verdict.", examples=[ { From d7937987ddba443e5373a37e68494629b15eb5fb Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 25 Dec 2023 16:53:16 +0530 Subject: [PATCH 10/41] added name to prompts --- src/ragas/metrics/_answer_correctness.py | 1 + src/ragas/metrics/_answer_relevance.py | 1 + src/ragas/metrics/_context_precision.py | 1 + src/ragas/metrics/_context_recall.py | 2 +- src/ragas/metrics/_context_relevancy.py | 1 + src/ragas/metrics/_faithfulness.py | 6 ++++++ src/ragas/metrics/critique.py | 1 + 7 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index 53c2e8d6c..a9fe7df9c 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -17,6 +17,7 @@ from langchain.callbacks.base import Callbacks CORRECTNESS_PROMPT = Prompt( + name="answer_correctness", instruction="""Extract following from given question and ground truth""", examples=[ { diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index 1324d2420..89d41b81b 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ b/src/ragas/metrics/_answer_relevance.py @@ -21,6 +21,7 @@ from ragas.embeddings.base import RagasEmbeddings QUESTION_GEN = Prompt( + name="question_generation", instruction="""Generate a question for the given answer and Identify if answer is noncommittal""", examples=[ { diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py index dc62b67c7..9fce851ea 100644 --- a/src/ragas/metrics/_context_precision.py +++ b/src/ragas/metrics/_context_precision.py @@ -17,6 +17,7 @@ from langchain.callbacks.base import Callbacks CONTEXT_PRECISION = Prompt( + name="context_precision", instruction="""Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not. 
""", examples=[ { diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index 5e483a283..6be8ac72b 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -6,7 +6,6 @@ import numpy as np from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM @@ -16,6 +15,7 @@ from langchain.callbacks.base import Callbacks CONTEXT_RECALL_RA = Prompt( + name="context_recall", instruction="""Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only "Yes" (1) or "No" (0) as a binary classification. Output json with reason.""", examples=[ { diff --git a/src/ragas/metrics/_context_relevancy.py b/src/ragas/metrics/_context_relevancy.py index 77505a7a5..e5c1e628b 100644 --- a/src/ragas/metrics/_context_relevancy.py +++ b/src/ragas/metrics/_context_relevancy.py @@ -18,6 +18,7 @@ from langchain.callbacks.base import Callbacks CONTEXT_RELEVANCE = Prompt( + name="context_relevancy", instruction="""Please extract relevant sentences from the provided context that is absolutely required answer the following question. If no relevant sentences are found, or if you believe the question cannot be answered from the given context, return the phrase "Insufficient Information". While extracting candidate sentences you're not allowed to make any changes to sentences from given context.""", input_keys=["question", "context"], output_key="candidate sentences", diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index bbbed957f..316d3d1ad 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -10,6 +10,8 @@ from ragas.metrics.base import EvaluationMode, MetricWithLLM from ragas.utils import json_loader from ragas.llms.prompt import Prompt +import os + if t.TYPE_CHECKING: from datasets import Dataset from langchain.callbacks.base import Callbacks @@ -202,3 +204,7 @@ def _score_batch( faithfulness = Faithfulness() + +if __name__ == "__main__": + + faithfulness.adapt(languge="hindi") diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index 7470cc788..28397d6c5 100644 --- a/src/ragas/metrics/critique.py +++ b/src/ragas/metrics/critique.py @@ -20,6 +20,7 @@ CRITIQUE_PROMPT = Prompt( + name="critique", instruction="Given a input and submission. Evaluate the submission only using the given criteria. Use only 'Yes' (1) and 'No' (0) as verdict.", examples=[ { From ebc33da934b3f4ac5d200cc6127541c07895a0cf Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 25 Dec 2023 16:54:00 +0530 Subject: [PATCH 11/41] added attr to prompts --- src/ragas/llms/prompt.py | 106 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 2 deletions(-) diff --git a/src/ragas/llms/prompt.py b/src/ragas/llms/prompt.py index 742590bd5..db14bf28f 100644 --- a/src/ragas/llms/prompt.py +++ b/src/ragas/llms/prompt.py @@ -2,6 +2,8 @@ import json import typing as t +import os +from ragas.utils import json_loader from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate from langchain_core.messages import BaseMessage, HumanMessage @@ -11,8 +13,9 @@ class Prompt(PromptValue): """ - RagasPrompt is a class that represents a prompt for the ragas metrics. 
+ Prompt is a class that represents a prompt for the ragas metrics. """ + name: str instruction: str examples: t.List[t.Dict[str, t.Any]] = [] input_keys: t.List[str] @@ -99,12 +102,111 @@ def get_example_str(self, example_no: int) -> str: def format(self, **kwargs: t.Any) -> ChatPromptTemplate: """ - Format the RagasPrompt object into a ChatPromptTemplate object to be used in metrics. + Format the Prompt object into a ChatPromptTemplate object to be used in metrics. """ if set(self.input_keys) != set(kwargs.keys()): raise ValueError( f"Input variables {self.input_keys} do not match with the given parameters {list(kwargs.keys())}" ) prompt = self.to_string() + print(prompt) human_prompt = HumanMessagePromptTemplate.from_template(prompt) return ChatPromptTemplate.from_messages([human_prompt.format(**kwargs)]) + + def adapt(self, language: str, cache_dir: str, llm: RagasLLM) -> None: + # TODO: Add callbacks + cache_dir = cache_dir if cache_dir else "~/.cache/ragas/prompts" + if os.path.exists(os.path.join(cache_dir, language, f"{self.name}.json")): + self._load(language, self.name, cache_dir) + + prompts = [] + for example in self.examples: + prompts.extend( + [ + str_translation.format(translate_to=language, input=example.get(key)) + for key in self.input_keys + ] + ) + prompts.append( + json_translatation.format( + translate_to=language, input=example.get(self.output_key) + ) + if self.output_type.lower() == "json" + else str_translation.format(translate_to=language, input=example.get(self.output_key)) + ) + + results = [result[0].text for result in llm.generate(prompts).generations] + per_example_items = len(self.input_keys) + 1 + grouped_results = [ + results[i : i + per_example_items] + for i in range(0, len(results), per_example_items) + ] + assert len(grouped_results) == len( + self.examples + ), "examples and adapted examples must be of equal length" + for i, example in enumerate(grouped_results): + example_dict = {} + example_dict.update( + {k: v for k, v in zip(self.input_keys, example[: len(self.input_keys)])} + ) + example_dict[self.output_key] = ( + json_loader.safe_load(example[-1], llm=llm) + if self.output_type.lower() == "json" + else example[-1] + ) + self.examples[i] = example_dict + + self.language = language + + def save(self, cache_dir: str = "~/.cache/ragas/prompts") -> None: + + cache_dir = os.path.join(cache_dir, self.language) + if not os.path.exists(cache_dir): + os.makedirs(cache_dir) + + cache_path = os.path.join(cache_dir, f"{self.name}.json") + self.to_json(cache_path) + + + @classmethod + def _load(cls, language: str, name: str, cache_dir: str ) -> Prompt: + + path = os.path.join(cache_dir, language, f"{name}.json") + cls(**json.load(open(path))) + +str_translation = Prompt( + name="str_translation", + instruction="Language translation", + examples=[ + { + "translate_to": "hindi", + "input": "Who was Albert Einstein and what is he best known for?", + "output": "अल्बर्ट आइंस्टीन कौन थे और वे किसके लिए सबसे ज्यादा प्रसिद्ध हैं?", + }, + ], + input_keys=["translate_to", "input"], + output_key="output", + output_type="str", +) + +json_translatation = Prompt( + name="json_translation", + instruction="Translate values in given json to target language ", + examples=[ + { + "translate_to": "hindi", + "input": """{"statements": [ + "Albert Einstein was born in Germany.", + "Albert Einstein was best known for his theory of relativity." 
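
For orientation, the translation helper prompts being defined in this hunk are ordinary Prompt objects, and adapt() simply drives them one example field at a time. A rough hand-rolled equivalent of a single translation call (llm_factory() assumes an OpenAI key is configured, and the input question is arbitrary):

    from ragas.llms import llm_factory
    from ragas.llms.prompt import str_translation

    llm = llm_factory()

    # One translation request per example field, mirroring what adapt() does
    # internally before it rewrites self.examples in place.
    translation_prompt = str_translation.format(
        translate_to="hindi",
        input="Who was Albert Einstein and what is he best known for?",
    )
    result = llm.generate([translation_prompt])
    print(result.generations[0][0].text)
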
+ ]}""", + "output": """{"statements": [ + "अल्बर्ट आइंस्टीन का जन्म जर्मनी में हुआ था।", + "अल्बर्ट आइंस्टीन अपने सापेक्षता के सिद्धांत के लिए सबसे अधिक प्रसिद्ध थे।" + ]}""", + } + ], + input_keys=["translate_to", "input"], + output_key="output", + output_type="JSON", +) + From 92261d9d56d168cc11a4a643212ede49e18d0627 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 25 Dec 2023 17:17:06 +0530 Subject: [PATCH 12/41] convert json loader --- src/ragas/utils.py | 80 +++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 43 deletions(-) diff --git a/src/ragas/utils.py b/src/ragas/utils.py index 9e0849bac..3a35987a4 100644 --- a/src/ragas/utils.py +++ b/src/ragas/utils.py @@ -8,7 +8,8 @@ from functools import lru_cache from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate + +from ragas.llms.prompt import Prompt if t.TYPE_CHECKING: from ragas.llms import RagasLLM @@ -39,52 +40,46 @@ def load_as_json(text): return {} -JSON_PROMPT = HumanMessagePromptTemplate.from_template( - """ - -Rewrite the input into valid json - - -Input: -{{ +JSON_PROMPT = Prompt( + instruction="Rewrite the input into valid json", + examples=[ + { + "input": """{ "name": "John Doe", "age": 30, "isStudent": false - "address": {{ + "address": { "street": "123 Main St", "city": "Anytown", "state": "CA", - }} - "hobbies": ["reading", "swimming", "cycling"] -}} -Output: -{{ - "name": "John Doe", - "age": 30, - "isStudent": false, - "address": {{ - "street": "123 Main St", - "city": "Anytown", - "state": "CA" - }}, + } "hobbies": ["reading", "swimming", "cycling"] -}} - - -Input: -{{ - "statement": "The Earth is also known as "Terra" " -}} -Output: -{{ - "statement": "The Earth is also known as 'Terra'" -}} - -Input: -{input} - -Output: -""" + }""", + "output": """{ + "name": "John Doe", + "age": 30, + "isStudent": false, + "address": { + "street": "123 Main St", + "city": "Anytown", + "state": "CA" + }, + "hobbies": ["reading", "swimming", "cycling"] + }""", + }, + { + "input": """{ + "statement": "The Earth is also known as "Terra" " + }""", + "output": """{ + "statement": "The Earth is also known as 'Terra'" + } +""", + }, + ], + input_keys=["input"], + output_key="output", + output_type="JSON", ) @@ -115,9 +110,8 @@ def _fix_to_json( with trace_as_chain_group( callback_group_name, callback_manager=callbacks ) as batch_group: - human_prompt = ChatPromptTemplate.from_messages( - [JSON_PROMPT.format(input=text)] - ) + human_prompt = JSON_PROMPT.format(input=text) + results = llm.generate( [human_prompt], n=1, From 0b5f190b6864ba6519a6bfb8eed49bfe6c263c4e Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 25 Dec 2023 17:17:56 +0530 Subject: [PATCH 13/41] rmv unused imports --- src/ragas/metrics/_answer_correctness.py | 19 ++++++++----------- src/ragas/metrics/_answer_relevance.py | 13 ++++++------- src/ragas/metrics/_context_precision.py | 15 ++++++--------- src/ragas/metrics/_context_recall.py | 13 +++++-------- src/ragas/metrics/_context_relevancy.py | 7 ++----- 5 files changed, 27 insertions(+), 40 deletions(-) diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index 53c2e8d6c..cb49b4795 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -6,7 +6,6 @@ import numpy as np from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from 
langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate from ragas.llms.prompt import Prompt from ragas.metrics._answer_similarity import AnswerSimilarity @@ -29,7 +28,7 @@ "statements present in the answer but not found in the ground truth": ["The sun is powered by nuclear fission", "similar to nuclear reactors on Earth"], "relevant statements found in the ground truth but omitted in the answer": ["The sun is powered by nuclear fusion, not fission", "In its core, hydrogen atoms fuse to form helium, releasing a tremendous amount of energy", "This energy provides heat and light, essential for life on Earth", "The sun's light plays a critical role in Earth's climate system", "The sun helps to drive the weather and ocean currents"] }] - """ + """, }, { "question": """What is the boiling point of water?""", @@ -41,12 +40,12 @@ "statements present in the answer but not found in the ground truth": [], "relevant statements found in the ground truth but omitted in the answer": ["The boiling point can change with altitude", "The boiling point of water is 212 degrees Fahrenheit at sea level"] }] - """ - } + """, + }, ], input_keys=["question", "answer", "ground_truth"], output_key="Extracted statements", - output_type="json" + output_type="json", ) @@ -101,9 +100,7 @@ def _score_batch( ) as batch_group: for q, a, g in zip(question, answer, ground_truths): prompts.append( - CORRECTNESS_PROMPT.format( - question=q, ground_truth=g[0], answer=a - ) + CORRECTNESS_PROMPT.format(question=q, ground_truth=g[0], answer=a) ) result = self.llm.generate(prompts, callbacks=batch_group) @@ -113,7 +110,7 @@ def _score_batch( "FP": "statements present in the answer but not found in the ground truth", "FN": "relevant statements found in the ground truth but omitted in the answer", # noqa: E501 } - + f1_score = [] for prediction in outputs: prediction = json_loader.safe_load(prediction[0].text, self.llm) @@ -131,9 +128,9 @@ def _score_batch( score = tp / (tp + 0.5 * (fp + fn)) else: score = np.nan - + f1_score.append(score) - + similarity_scores = self.answer_similarity._score_batch(dataset, callbacks=batch_group) # type: ignore scores_stacked = np.vstack([f1_score, similarity_scores]) scores = np.average( diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index 1324d2420..a3c82698d 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ b/src/ragas/metrics/_answer_relevance.py @@ -7,7 +7,6 @@ from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from langchain.embeddings import OpenAIEmbeddings -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate from ragas.embeddings.base import embedding_factory from ragas.exceptions import OpenAIKeyNotFound @@ -26,27 +25,27 @@ { "answer": """Albert Einstein was born in Germany.""", "context": """Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time""", - "output": """{"question":"Where was Albert Einstein born?","noncommittal":false}""" + "output": """{"question":"Where was Albert Einstein born?","noncommittal":false}""", }, { "answer": """It can change its skin color based on the temperature of its environment.""", "context": """A recent scientific study has discovered a new species of frog in the Amazon rainforest that has the unique ability to change its skin color based on the temperature of its environment.""", - "output": """{"question":"What unique 
ability does the newly discovered species of frog have?","noncommittal":false}""" + "output": """{"question":"What unique ability does the newly discovered species of frog have?","noncommittal":false}""", }, { "answer": """Everest""", "context": """The tallest mountain on Earth, measured from sea level, is a renowned peak located in the Himalayas.""", - "output": """{"question":"What is the tallest mountain on Earth?","noncommittal":false}""" + "output": """{"question":"What is the tallest mountain on Earth?","noncommittal":false}""", }, { "answer": """I don't know about the groundbreaking feature of the smartphone invented in 2023 as am unware of information beyond 2022. """, "context": """In 2023, a groundbreaking invention was announced: a smartphone with a battery life of one month, revolutionizing the way people use mobile technology.""", - "output": """{"question":"What was the groundbreaking feature of the smartphone invented in 2023?", "noncommittal":true}""" - } + "output": """{"question":"What was the groundbreaking feature of the smartphone invented in 2023?", "noncommittal":true}""", + }, ], input_keys=["answer", "context"], output_key="output", - output_type="json" + output_type="json", ) diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py index dc62b67c7..c8fa32c5d 100644 --- a/src/ragas/metrics/_context_precision.py +++ b/src/ragas/metrics/_context_precision.py @@ -7,7 +7,6 @@ import numpy as np from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM @@ -27,7 +26,7 @@ "reason": "The provided context was indeed useful in arriving at the given answer. 
The context includes key information about Albert Einstein's life and contributions, which are reflected in the answer.", "Verdict": "1" } - """ + """, }, { "question": """who won 2020 icc world cup?""", @@ -37,7 +36,7 @@ "reason": "the context was useful in clarifying the situation regarding the 2020 ICC World Cup and indicating that England was the winner of the tournament that was intended to be held in 2020 but actually took place in 2022.", "verdict": "1" } - """ + """, }, { "question": """What is the tallest mountain in the world?""", @@ -47,12 +46,12 @@ "reason":"the provided context discusses the Andes mountain range, which, while impressive, does not include Mount Everest or directly relate to the question about the world's tallest mountain.", "verdict":"0" } - """ - } + """, + }, ], input_keys=["question", "context", "answer"], output_key="verification", - output_type="json" + output_type="json", ) @@ -98,9 +97,7 @@ def _score_batch( ) as batch_group: for qstn, ctx, answer in zip(questions, contexts, answers): human_prompts = [ - CONTEXT_PRECISION.format( - question=qstn, context=c, answer=answer - ) + CONTEXT_PRECISION.format(question=qstn, context=c, answer=answer) for c in ctx ] diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index 5e483a283..751456d5b 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -6,7 +6,6 @@ import numpy as np from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM @@ -43,7 +42,7 @@ "reason": "There is no supporting evidence for this in the given context.", "Attributed": "0" }] - """ + """, }, { "question": """who won 2020 icc world cup?""", @@ -55,12 +54,12 @@ "reason": "From context it is clear that England defeated Pakistan to win the World Cup.", "Attributed": "1" }] - """ - } + """, + }, ], input_keys=["question", "context", "answer"], output_key="classification", - output_type="json" + output_type="json", ) @@ -103,9 +102,7 @@ def _score_batch( gt = "\n".join(gt) if isinstance(gt, list) else gt ctx = "\n".join(ctx) if isinstance(ctx, list) else ctx prompts.append( - CONTEXT_RECALL_RA.format( - question=qstn, context=ctx, answer=gt - ) + CONTEXT_RECALL_RA.format(question=qstn, context=ctx, answer=gt) ) responses: list[list[str]] = [] diff --git a/src/ragas/metrics/_context_relevancy.py b/src/ragas/metrics/_context_relevancy.py index 77505a7a5..d4b716934 100644 --- a/src/ragas/metrics/_context_relevancy.py +++ b/src/ragas/metrics/_context_relevancy.py @@ -9,7 +9,6 @@ import pysbd from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM @@ -21,7 +20,7 @@ instruction="""Please extract relevant sentences from the provided context that is absolutely required answer the following question. If no relevant sentences are found, or if you believe the question cannot be answered from the given context, return the phrase "Insufficient Information". 
While extracting candidate sentences you're not allowed to make any changes to sentences from given context.""", input_keys=["question", "context"], output_key="candidate sentences", - output_type="json" + output_type="json", ) @@ -77,9 +76,7 @@ def _score_batch( ) as batch_group: for q, c in zip(questions, contexts): prompts.append( - CONTEXT_RELEVANCE.format( - question=q, context="\n".join(c) - ) + CONTEXT_RELEVANCE.format(question=q, context="\n".join(c)) ) responses: list[list[str]] = [] From c57c1d7c7aa54e844c5c34ee1db912cc4a64788e Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 25 Dec 2023 18:43:51 +0530 Subject: [PATCH 14/41] added json loader --- src/ragas/json_loader.py | 120 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 src/ragas/json_loader.py diff --git a/src/ragas/json_loader.py b/src/ragas/json_loader.py new file mode 100644 index 000000000..687dda3af --- /dev/null +++ b/src/ragas/json_loader.py @@ -0,0 +1,120 @@ +from __future__ import annotations + +import json +import typing as t +from dataclasses import dataclass + +from langchain.callbacks.manager import CallbackManager, trace_as_chain_group + +from ragas.llms.prompt import Prompt + +if t.TYPE_CHECKING: + from ragas.llms import RagasLLM + +JSON_PROMPT = Prompt( + instruction="Rewrite the input into valid json", + examples=[ + { + "input": """{ + "name": "John Doe", + "age": 30, + "isStudent": false + "address": { + "street": "123 Main St", + "city": "Anytown", + "state": "CA", + } + "hobbies": ["reading", "swimming", "cycling"] + }""", + "output": """{ + "name": "John Doe", + "age": 30, + "isStudent": false, + "address": { + "street": "123 Main St", + "city": "Anytown", + "state": "CA" + }, + "hobbies": ["reading", "swimming", "cycling"] + }""", + }, + { + "input": """{ + "statement": "The Earth is also known as "Terra" " + }""", + "output": """{ + "statement": "The Earth is also known as 'Terra'" + } +""", + }, + ], + input_keys=["input"], + output_key="output", + output_type="JSON", +) + + +@dataclass +class JsonLoader: + max_retries: int = 2 + + def safe_load(self, text: str, llm: RagasLLM): + retry = 0 + while retry <= self.max_retries: + try: + start, end = self._find_outermost_json(text) + return json.loads(text[start:end]) + except ValueError: + text = self._fix_to_json(text, llm) + retry += 1 + + return {} + + def _fix_to_json( + self, + text, + llm, + callbacks: t.Optional[CallbackManager] = None, + callback_group_name: str = "batch", + ): + # TODO (executor) + with trace_as_chain_group( + callback_group_name, callback_manager=callbacks + ) as batch_group: + human_prompt = JSON_PROMPT.format(input=text) + + results = llm.generate( + [human_prompt], + n=1, + callbacks=batch_group, + ) + return results.generations[0][0].text + + def _find_outermost_json(self, text): + stack = [] + start_index = -1 + + for i, char in enumerate(text): + if char in "{[": + if len(stack) == 0: + start_index = i + stack.append(char) + + elif char in "}]": + if len(stack) > 0: + last = stack.pop() + if (char == "}" and last != "{") or (char == "]" and last != "["): + # Mismatched closing brace/bracket, invalid JSON + break + + if len(stack) == 0 and start_index != -1: + # Found a valid outermost JSON + return ( + start_index, + i + 1, + ) # Add 1 to include the closing brace/bracket in the range + + return -1, -1 # No valid JSON found + + +json_loader = JsonLoader() From 6d9ac0d169f7790f78f3afe714132c92df286108 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 25 Dec 2023 
18:44:16 +0530 Subject: [PATCH 15/41] moved json loader --- src/ragas/utils.py | 119 --------------------------------------------- 1 file changed, 119 deletions(-) diff --git a/src/ragas/utils.py b/src/ragas/utils.py index 3a35987a4..c5c9ee386 100644 --- a/src/ragas/utils.py +++ b/src/ragas/utils.py @@ -1,19 +1,9 @@ from __future__ import annotations -import json import os -import typing as t import warnings -from dataclasses import dataclass from functools import lru_cache -from langchain.callbacks.manager import CallbackManager, trace_as_chain_group - -from ragas.llms.prompt import Prompt - -if t.TYPE_CHECKING: - from ragas.llms import RagasLLM - DEBUG_ENV_VAR = "RAGAS_DEBUG" # constant to tell us that there is no key passed to the llm/embeddings NO_KEY = "no-key" @@ -38,112 +28,3 @@ def load_as_json(text): warnings.warn(f"Invalid json: {e}") return {} - - -JSON_PROMPT = Prompt( - instruction="Rewrite the input into valid json", - examples=[ - { - "input": """{ - "name": "John Doe", - "age": 30, - "isStudent": false - "address": { - "street": "123 Main St", - "city": "Anytown", - "state": "CA", - } - "hobbies": ["reading", "swimming", "cycling"] - }""", - "output": """{ - "name": "John Doe", - "age": 30, - "isStudent": false, - "address": { - "street": "123 Main St", - "city": "Anytown", - "state": "CA" - }, - "hobbies": ["reading", "swimming", "cycling"] - }""", - }, - { - "input": """{ - "statement": "The Earth is also known as "Terra" " - }""", - "output": """{ - "statement": "The Earth is also known as 'Terra'" - } -""", - }, - ], - input_keys=["input"], - output_key="output", - output_type="JSON", -) - - -@dataclass -class JsonLoader: - max_retries: int = 2 - - def safe_load(self, text: str, llm: RagasLLM): - retry = 0 - while retry <= self.max_retries: - try: - start, end = self._find_outermost_json(text) - return json.loads(text[start:end]) - except ValueError: - text = self._fix_to_json(text, llm) - retry += 1 - - return {} - - def _fix_to_json( - self, - text, - llm, - callbacks: t.Optional[CallbackManager] = None, - callback_group_name: str = "batch", - ): - # TODO (executor) - with trace_as_chain_group( - callback_group_name, callback_manager=callbacks - ) as batch_group: - human_prompt = JSON_PROMPT.format(input=text) - - results = llm.generate( - [human_prompt], - n=1, - callbacks=batch_group, - ) - return results.generations[0][0].text - - def _find_outermost_json(self, text): - stack = [] - start_index = -1 - - for i, char in enumerate(text): - if char in "{[": - if len(stack) == 0: - start_index = i - stack.append(char) - - elif char in "}]": - if len(stack) > 0: - last = stack.pop() - if (char == "}" and last != "{") or (char == "]" and last != "["): - # Mismatched closing brace/bracket, invalid JSON - break - - if len(stack) == 0 and start_index != -1: - # Found a valid outermost JSON - return ( - start_index, - i + 1, - ) # Add 1 to include the closing brace/bracket in the range - - return -1, -1 # No valid JSON found - - -json_loader = JsonLoader() From b8f9ca1f3c7d01fa43d2c7cf7cf3960be7b3c080 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 25 Dec 2023 18:45:31 +0530 Subject: [PATCH 16/41] change json loader path --- src/ragas/metrics/_answer_correctness.py | 2 +- src/ragas/metrics/_answer_relevance.py | 2 +- src/ragas/metrics/_context_precision.py | 2 +- src/ragas/metrics/_context_recall.py | 2 +- src/ragas/metrics/_faithfulness.py | 2 +- src/ragas/metrics/critique.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git 
a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index cb49b4795..81f1e0e4d 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -7,10 +7,10 @@ from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group +from ragas.json_loader import json_loader from ragas.llms.prompt import Prompt from ragas.metrics._answer_similarity import AnswerSimilarity from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index a3c82698d..2c2df6072 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ b/src/ragas/metrics/_answer_relevance.py @@ -10,9 +10,9 @@ from ragas.embeddings.base import embedding_factory from ragas.exceptions import OpenAIKeyNotFound +from ragas.json_loader import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py index c8fa32c5d..72e01a2d8 100644 --- a/src/ragas/metrics/_context_precision.py +++ b/src/ragas/metrics/_context_precision.py @@ -8,9 +8,9 @@ from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group +from ragas.json_loader import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index 751456d5b..c914893ca 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -7,9 +7,9 @@ from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group +from ragas.json_loader import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 1f645f008..1f3006aad 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -6,9 +6,9 @@ import numpy as np from langchain.callbacks.manager import CallbackManager, trace_as_chain_group +from ragas.json_loader import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader if t.TYPE_CHECKING: from datasets import Dataset diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index 7470cc788..f51b8954e 100644 --- a/src/ragas/metrics/critique.py +++ b/src/ragas/metrics/critique.py @@ -8,10 +8,10 @@ from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group +from ragas.json_loader import json_loader from ragas.llms import llm_factory from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM -from ragas.utils import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks From 
b4087b8ae3f797dc49c24f0d744a5c3c14e55dc1 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 25 Dec 2023 18:46:14 +0530 Subject: [PATCH 17/41] linting change --- src/ragas/llms/prompt.py | 57 ++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/src/ragas/llms/prompt.py b/src/ragas/llms/prompt.py index 742590bd5..d6df3d05e 100644 --- a/src/ragas/llms/prompt.py +++ b/src/ragas/llms/prompt.py @@ -13,11 +13,12 @@ class Prompt(PromptValue): """ RagasPrompt is a class that represents a prompt for the ragas metrics. """ + instruction: str examples: t.List[t.Dict[str, t.Any]] = [] input_keys: t.List[str] output_key: str - output_type: str = 'json' + output_type: str = "json" @root_validator def validate_prompt(cls, values: t.Dict[str, t.Any]) -> t.Dict[str, t.Any]: @@ -25,22 +26,16 @@ def validate_prompt(cls, values: t.Dict[str, t.Any]) -> t.Dict[str, t.Any]: Validate the template string to ensure that it is in desired format. """ if values.get("instruction") is None or values.get("instruction") == "": - raise ValueError( - "instruction cannot be empty" - ) + raise ValueError("instruction cannot be empty") if values.get("input_keys") is None or values.get("instruction") == []: - raise ValueError( - "input_keys cannot be empty" - ) + raise ValueError("input_keys cannot be empty") if values.get("output_key") is None or values.get("output_key") == "": - raise ValueError( - "output_key cannot be empty" - ) - + raise ValueError("output_key cannot be empty") + if values.get("examples"): output_key = values["output_key"] - for no, example in enumerate(values['examples']): - for inp_key in values['input_keys']: + for no, example in enumerate(values["examples"]): + for inp_key in values["input_keys"]: if inp_key not in example: raise ValueError( f"example {no+1} does not have the variable {inp_key} in the definition" @@ -49,7 +44,7 @@ def validate_prompt(cls, values: t.Dict[str, t.Any]) -> t.Dict[str, t.Any]: raise ValueError( f"example {no+1} does not have the variable {output_key} in the definition" ) - if values["output_type"] == 'json': + if values["output_type"] == "json": try: if output_key in example: json.loads(example[output_key]) @@ -64,37 +59,43 @@ def to_string(self) -> str: """ Generate the prompt string from the variables. """ - prompt_str = self.instruction + '\n' + prompt_str = self.instruction + "\n" # Format the examples to match the Langchain prompt template for example in self.examples: for key, value in example.items(): - value = value.replace('{','{{').replace('}','}}') if self.output_type == 'json' else value - prompt_str += f'\n{key}: {value}' - prompt_str += '\n' - - prompt_str += ''.join(f'\n{key}: {{{key}}}' for key in self.input_keys) - prompt_str += f'\n{self.output_key}: \n' + value = ( + value.replace("{", "{{").replace("}", "}}") + if self.output_type.lower() == "json" + else value + ) + prompt_str += f"\n{key}: {value}" + prompt_str += "\n" + + prompt_str += "".join(f"\n{key}: {{{key}}}" for key in self.input_keys) + prompt_str += f"\n{self.output_key}: \n" return prompt_str def to_messages(self) -> t.List[BaseMessage]: """Return prompt as a list of Messages.""" return [HumanMessage(content=self.to_string())] - + def get_example_str(self, example_no: int) -> str: """ Get the example string from the example number. 
""" if example_no >= len(self.examples): - raise ValueError( - f"example number {example_no} is out of range" - ) + raise ValueError(f"example number {example_no} is out of range") example = self.examples[example_no] - example_str = '' + example_str = "" for key, value in example.items(): - value = value.replace('{','{{').replace('}','}}') if self.output_type == 'json' else value - example_str += f'\n{key}: {value}' + value = ( + value.replace("{", "{{").replace("}", "}}") + if self.output_type.lower() == "json" + else value + ) + example_str += f"\n{key}: {value}" return example_str def format(self, **kwargs: t.Any) -> ChatPromptTemplate: From da3a2edce6da876bb079f5aeee17d1677c30d665 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 25 Dec 2023 20:10:20 +0530 Subject: [PATCH 18/41] prompt adaption fix --- src/ragas/llms/prompt.py | 59 +++++++++++++++++------------- src/ragas/metrics/_faithfulness.py | 32 +++++----------- 2 files changed, 42 insertions(+), 49 deletions(-) diff --git a/src/ragas/llms/prompt.py b/src/ragas/llms/prompt.py index 42a69d6ae..c1f154101 100644 --- a/src/ragas/llms/prompt.py +++ b/src/ragas/llms/prompt.py @@ -1,26 +1,29 @@ from __future__ import annotations import json -import typing as t import os -from ragas.utils import json_loader +import typing as t from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate from langchain_core.messages import BaseMessage, HumanMessage from langchain_core.prompt_values import PromptValue from langchain_core.pydantic_v1 import root_validator +from ragas.utils import RAGAS_CACHE_HOME + class Prompt(PromptValue): """ Prompt is a class that represents a prompt for the ragas metrics. """ + name: str instruction: str examples: t.List[t.Dict[str, t.Any]] = [] input_keys: t.List[str] output_key: str output_type: str = "json" + language = "en" @root_validator def validate_prompt(cls, values: t.Dict[str, t.Any]) -> t.Dict[str, t.Any]: @@ -109,21 +112,24 @@ def format(self, **kwargs: t.Any) -> ChatPromptTemplate: f"Input variables {self.input_keys} do not match with the given parameters {list(kwargs.keys())}" ) prompt = self.to_string() - print(prompt) human_prompt = HumanMessagePromptTemplate.from_template(prompt) return ChatPromptTemplate.from_messages([human_prompt.format(**kwargs)]) - - def adapt(self, language: str, cache_dir: str, llm: RagasLLM) -> None: + + def adapt( + self, language: str, llm: RagasLLM, cache_dir: t.Optional[str] = None + ) -> None: # TODO: Add callbacks - cache_dir = cache_dir if cache_dir else "~/.cache/ragas/prompts" + cache_dir = cache_dir if cache_dir else RAGAS_CACHE_HOME if os.path.exists(os.path.join(cache_dir, language, f"{self.name}.json")): self._load(language, self.name, cache_dir) - + prompts = [] for example in self.examples: prompts.extend( [ - str_translation.format(translate_to=language, input=example.get(key)) + str_translation.format( + translate_to=language, input=example.get(key) + ) for key in self.input_keys ] ) @@ -132,7 +138,9 @@ def adapt(self, language: str, cache_dir: str, llm: RagasLLM) -> None: translate_to=language, input=example.get(self.output_key) ) if self.output_type.lower() == "json" - else str_translation.format(translate_to=language, input=example.get(self.output_key)) + else str_translation.format( + translate_to=language, input=example.get(self.output_key) + ) ) results = [result[0].text for result in llm.generate(prompts).generations] @@ -149,30 +157,30 @@ def adapt(self, language: str, cache_dir: str, llm: RagasLLM) -> None: example_dict.update( {k: 
v for k, v in zip(self.input_keys, example[: len(self.input_keys)])} ) - example_dict[self.output_key] = ( - json_loader.safe_load(example[-1], llm=llm) - if self.output_type.lower() == "json" - else example[-1] - ) + # TODO : safe load json - now circular import error + example_dict[self.output_key] = example[-1] + self.examples[i] = example_dict - + self.language = language - - def save(self, cache_dir: str = "~/.cache/ragas/prompts") -> None: - + + def save(self, cache_dir: t.Optional[str] = None) -> None: + cache_dir = cache_dir if cache_dir else RAGAS_CACHE_HOME cache_dir = os.path.join(cache_dir, self.language) if not os.path.exists(cache_dir): os.makedirs(cache_dir) - + cache_path = os.path.join(cache_dir, f"{self.name}.json") - self.to_json(cache_path) - + print(cache_path) + with open(cache_path, "w") as file: + json.dump(self.to_json(), file, indent=4) - @classmethod - def _load(cls, language: str, name: str, cache_dir: str ) -> Prompt: - + @classmethod + def _load(cls, language: str, name: str, cache_dir: str) -> Prompt: path = os.path.join(cache_dir, language, f"{name}.json") - cls(**json.load(open(path))) + print("loading from", path) + cls(**json.load(open(path))["kwargs"]) + str_translation = Prompt( name="str_translation", @@ -209,4 +217,3 @@ def _load(cls, language: str, name: str, cache_dir: str ) -> Prompt: output_key="output", output_type="JSON", ) - diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index f2c6bfd7b..13cefa9cc 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -9,12 +9,6 @@ from ragas.json_loader import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM -<<<<<<< HEAD -from ragas.utils import json_loader -from ragas.llms.prompt import Prompt -import os -======= ->>>>>>> add-metric-prompts if t.TYPE_CHECKING: from datasets import Dataset @@ -121,29 +115,25 @@ output_type="JSON", ) # noqa: E501 + @dataclass class Faithfulness(MetricWithLLM): name: str = "faithfulness" # type: ignore evaluation_mode: EvaluationMode = EvaluationMode.qac # type: ignore batch_size: int = 15 - + def __post_init__(self: t.Self): - self.long_form_answer_prompt = LONG_FORM_ANSWER_PROMPT self.nli_statements_message = NLI_STATEMENTS_MESSAGE - - def adapt(self, languge: str, cache_dir: str = "~/.cache/ragas/metrics/") -> None: - - cache_dir = os.path.join(cache_dir, self.name) - self.long_form_answer_prompt.adapt(languge, cache_dir, self.llm) - self.nli_statements_message.adapt(languge, cache_dir, self.llm) - - def save(self, cache_dir: str = "~/.cache/ragas/metrics/") -> None: - - cache_dir = os.path.join(cache_dir, self.name) + + def adapt(self, languge: str, cache_dir: t.Optional[str] = None) -> None: + self.long_form_answer_prompt.adapt(languge, self.llm, cache_dir) + self.nli_statements_message.adapt(languge, self.llm, cache_dir) + + def save(self, cache_dir: t.Optional[str] = None) -> None: self.long_form_answer_prompt.save(cache_dir) self.nli_statements_message.save(cache_dir) - + def _score_batch( self: t.Self, dataset: Dataset, @@ -208,7 +198,3 @@ def _score_batch( faithfulness = Faithfulness() - -if __name__ == "__main__": - - faithfulness.adapt(languge="hindi") From 3693b5a51ee4fe53a283db94db0ba1f0d712b211 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 25 Dec 2023 20:10:38 +0530 Subject: [PATCH 19/41] add ragas cache --- src/ragas/utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/ragas/utils.py b/src/ragas/utils.py 
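
Combined with the cache-home constant introduced in the utils.py diff that follows, the save()/_load() round trip above keys cached prompts by language and prompt name. A small sketch of where the files land (the language is arbitrary; "long_form_answer" and "nli_statements" are the faithfulness prompt names given earlier in the series):

    import os

    # Default cache root from the utils.py change below:
    #   $XDG_CACHE_HOME/ragas, overridable via the RAGAS_HOME env var.
    cache_home = os.path.expanduser(
        os.getenv(
            "RAGAS_HOME",
            os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "ragas"),
        )
    )

    # Prompt.save() writes one JSON file per adapted prompt:
    #   <cache_home>/<language>/<prompt name>.json
    # e.g. after faithfulness.adapt("hindi"); faithfulness.save():
    print(os.path.join(cache_home, "hindi", "long_form_answer.json"))
    print(os.path.join(cache_home, "hindi", "nli_statements.json"))
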
index c5c9ee386..822cf6068 100644
--- a/src/ragas/utils.py
+++ b/src/ragas/utils.py
@@ -8,6 +8,12 @@
 # constant to tell us that there is no key passed to the llm/embeddings
 NO_KEY = "no-key"
+# Cache location
+DEFAULT_XDG_CACHE_HOME = "~/.cache"
+XDG_CACHE_HOME = os.getenv("XDG_CACHE_HOME", DEFAULT_XDG_CACHE_HOME)
+DEFAULT_RAGAS_CACHE_HOME = os.path.join(XDG_CACHE_HOME, "ragas")
+RAGAS_CACHE_HOME = os.path.expanduser(os.getenv("RAGAS_HOME", DEFAULT_RAGAS_CACHE_HOME))
+
 @lru_cache(maxsize=1)
 def get_debug_mode() -> bool:

From d4a423ab0fda09380e4dfddbb2f2d50e91eabc2c Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Mon, 25 Dec 2023 20:11:17 +0530
Subject: [PATCH 20/41] add name

---
 src/ragas/json_loader.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/ragas/json_loader.py b/src/ragas/json_loader.py
index 687dda3af..3d448dc66 100644
--- a/src/ragas/json_loader.py
+++ b/src/ragas/json_loader.py
@@ -12,6 +12,7 @@
     from ragas.llms import RagasLLM

 JSON_PROMPT = Prompt(
+    name="json_safeloader",
     instruction="Rewrite the input into valid json",
     examples=[
         {

From c82861c65e5004026c1e6f709773a6077546bf85 Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Tue, 26 Dec 2023 18:10:33 +0530
Subject: [PATCH 21/41] added adapt and save

---
 src/ragas/metrics/_answer_correctness.py | 13 +++++++++++--
 src/ragas/metrics/_answer_relevance.py | 15 +++++++++++++--
 src/ragas/metrics/_context_precision.py | 15 +++++++++++++--
 src/ragas/metrics/_context_recall.py | 15 +++++++++++++--
 src/ragas/metrics/_context_relevancy.py | 12 ++++++++++--
 src/ragas/metrics/_faithfulness.py | 8 ++++----
 src/ragas/metrics/base.py | 18 +++++-------------
 src/ragas/metrics/critique.py | 9 ++++++++-
 8 files changed, 77 insertions(+), 28 deletions(-)

diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py
index 910136fb0..355f43207 100644
--- a/src/ragas/metrics/_answer_correctness.py
+++ b/src/ragas/metrics/_answer_correctness.py
@@ -7,10 +7,10 @@
 from datasets import Dataset
 from langchain.callbacks.manager import CallbackManager, trace_as_chain_group
-from ragas.json_loader import json_loader
 from ragas.llms.prompt import Prompt
 from ragas.metrics._answer_similarity import AnswerSimilarity
 from ragas.metrics.base import EvaluationMode, MetricWithLLM
+from ragas.utils import json_loader

 if t.TYPE_CHECKING:
     from langchain.callbacks.base import Callbacks
@@ -81,6 +81,13 @@ def __post_init__(self: t.Self):
         self.answer_similarity = AnswerSimilarity(
             llm=self.llm, batch_size=self.batch_size
         )
+        self.correctness_prompt = CORRECTNESS_PROMPT
+
+    def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None:
+        self.correctness_prompt.adapt(language, self.llm, cache_dir)
+
+    def save(self, cache_dir: t.Optional[str] = None) -> None:
+        self.correctness_prompt.save(cache_dir)

     def _score_batch(
         self: t.Self,
@@ -101,7 +108,9 @@ def _score_batch(
         ) as batch_group:
             for q, a, g in zip(question, answer, ground_truths):
                 prompts.append(
-                    CORRECTNESS_PROMPT.format(question=q, ground_truth=g[0], answer=a)
+                    self.correctness_prompt.format(
+                        question=q, ground_truth=g[0], answer=a
+                    )
                 )

             result = self.llm.generate(prompts, callbacks=batch_group)

diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py
index 944e67715..30731af1b 100644
--- a/src/ragas/metrics/_answer_relevance.py
+++ b/src/ragas/metrics/_answer_relevance.py
@@ -10,9 +10,9 @@
 from ragas.embeddings.base import embedding_factory
 from ragas.exceptions import OpenAIKeyNotFound
-from ragas.json_loader
import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM +from ragas.utils import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks @@ -84,6 +84,15 @@ def init_model(self): if self.embeddings.openai_api_key == "no-key": raise OpenAIKeyNotFound + def __post_init__(self: t.Self): + self.question_generation = QUESTION_GEN + + def adapt(self, language: str, cache_dir: str | None = None) -> None: + self.question_generation.adapt(language, self.llm, cache_dir) + + def save(self, cache_dir: str | None = None) -> None: + self.question_generation.save(cache_dir) + def _score_batch( self: t.Self, dataset: Dataset, @@ -102,7 +111,9 @@ def _score_batch( ) as batch_group: prompts = [] for ans, ctx in zip(answers, contexts): - prompts.append(QUESTION_GEN.format(answer=ans, context="\n".join(ctx))) + prompts.append( + self.question_generation.format(answer=ans, context="\n".join(ctx)) + ) results = self.llm.generate( prompts, diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py index a4274c488..9e18c7ee2 100644 --- a/src/ragas/metrics/_context_precision.py +++ b/src/ragas/metrics/_context_precision.py @@ -8,9 +8,9 @@ from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from ragas.json_loader import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM +from ragas.utils import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks @@ -73,6 +73,15 @@ class ContextPrecision(MetricWithLLM): evaluation_mode: EvaluationMode = EvaluationMode.qcg # type: ignore batch_size: int = 15 + def __post_init__(self: t.Self): + self.context_precision_prompt = CONTEXT_PRECISION + + def adapt(self, language: str, cache_dir: str | None = None) -> None: + self.context_precision_prompt.adapt(language, self.llm, cache_dir) + + def save(self, cache_dir: str | None = None) -> None: + self.context_precision_prompt.save(cache_dir) + def get_dataset_attributes(self, dataset: Dataset): answer = "ground_truths" if answer not in dataset.features.keys(): @@ -98,7 +107,9 @@ def _score_batch( ) as batch_group: for qstn, ctx, answer in zip(questions, contexts, answers): human_prompts = [ - CONTEXT_PRECISION.format(question=qstn, context=c, answer=answer) + self.context_precision_prompt.format( + question=qstn, context=c, answer=answer + ) for c in ctx ] diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index c5e2bdfe4..615e3a7d1 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -7,9 +7,9 @@ from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from ragas.json_loader import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM +from ragas.utils import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks @@ -82,6 +82,15 @@ class ContextRecall(MetricWithLLM): evaluation_mode: EvaluationMode = EvaluationMode.qcg # type: ignore batch_size: int = 15 + def __post_init__(self: t.Self): + self.context_recall_prompt = CONTEXT_RECALL_RA + + def adapt(self, language: str, cache_dir: str | None = None) -> None: + self.context_recall_prompt.adapt(language, self.llm, cache_dir) + + def save(self, cache_dir: str | None = None) -> None: + 
self.context_recall_prompt.save(cache_dir) + def _score_batch( self: t.Self, dataset: Dataset, @@ -103,7 +112,9 @@ def _score_batch( gt = "\n".join(gt) if isinstance(gt, list) else gt ctx = "\n".join(ctx) if isinstance(ctx, list) else ctx prompts.append( - CONTEXT_RECALL_RA.format(question=qstn, context=ctx, answer=gt) + self.context_recall_prompt.format( + question=qstn, context=ctx, answer=gt + ) ) responses: list[list[str]] = [] diff --git a/src/ragas/metrics/_context_relevancy.py b/src/ragas/metrics/_context_relevancy.py index 77ec5c258..30d9b4748 100644 --- a/src/ragas/metrics/_context_relevancy.py +++ b/src/ragas/metrics/_context_relevancy.py @@ -56,7 +56,13 @@ class ContextRelevancy(MetricWithLLM): show_deprecation_warning: bool = False def __post_init__(self: t.Self): - pass + self.context_relevancy_prompt = CONTEXT_RELEVANCE + + def adapt(self, language: str, cache_dir: str | None = None) -> None: + self.context_relevancy_prompt.adapt(language, self.llm, cache_dir) + + def save(self, cache_dir: str | None = None) -> None: + self.context_relevancy_prompt.save(cache_dir) def _score_batch( self: t.Self, @@ -77,7 +83,9 @@ def _score_batch( ) as batch_group: for q, c in zip(questions, contexts): prompts.append( - CONTEXT_RELEVANCE.format(question=q, context="\n".join(c)) + self.context_relevancy_prompt.format( + question=q, context="\n".join(c) + ) ) responses: list[list[str]] = [] diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 13cefa9cc..50aa345ac 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -6,9 +6,9 @@ import numpy as np from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from ragas.json_loader import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM +from ragas.utils import json_loader if t.TYPE_CHECKING: from datasets import Dataset @@ -126,9 +126,9 @@ def __post_init__(self: t.Self): self.long_form_answer_prompt = LONG_FORM_ANSWER_PROMPT self.nli_statements_message = NLI_STATEMENTS_MESSAGE - def adapt(self, languge: str, cache_dir: t.Optional[str] = None) -> None: - self.long_form_answer_prompt.adapt(languge, self.llm, cache_dir) - self.nli_statements_message.adapt(languge, self.llm, cache_dir) + def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: + self.long_form_answer_prompt.adapt(language, self.llm, cache_dir) + self.nli_statements_message.adapt(language, self.llm, cache_dir) def save(self, cache_dir: t.Optional[str] = None) -> None: self.long_form_answer_prompt.save(cache_dir) diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index cd69510d3..612e21505 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -63,27 +63,20 @@ def init_model(self): This method will lazy initialize the model. """ ... - + # @abstractmethod - def adapt(self, languge: str) -> None: + def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: """ Adapt the metric to a different language. """ pass - + # @abstractmethod - def save(self, path: t.Optional[str]=None) -> None: + def save(self, cache_dir: t.Optional[str] = None) -> None: """ Save the metric to a path. """ pass - - # @abstractmethod - def load(self, path: t.Optional[str]=None) -> None: - """ - Load the metric from a path. 
- """ - pass def score( self: t.Self, @@ -132,8 +125,7 @@ def get_batches(self, dataset_size: int) -> list[range]: @dataclass class MetricWithLLM(Metric): llm: RagasLLM = field(default_factory=llm_factory) - - + def init_model(self): """ Init any models in the metric, this is invoked before evaluate() diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index 9510c31df..e88eba85a 100644 --- a/src/ragas/metrics/critique.py +++ b/src/ragas/metrics/critique.py @@ -8,10 +8,10 @@ from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from ragas.json_loader import json_loader from ragas.llms import llm_factory from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM +from ragas.utils import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks @@ -82,6 +82,13 @@ def __post_init__(self: t.Self): self.strictness = ( self.strictness if self.strictness % 2 != 0 else self.strictness + 1 ) + self.critic_prompt = CRITIQUE_PROMPT + + def adapt(self, language: str, cache_dir: str | None = None) -> None: + self.critic_prompt.adapt(language, self.llm, cache_dir) + + def save(self, cache_dir: str | None = None) -> None: + self.critic_prompt.save(cache_dir) def prompt_format( self: t.Self, From 7b9f90683e9d65c2b81f32acbdb83787ecf08a1a Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Tue, 26 Dec 2023 18:11:11 +0530 Subject: [PATCH 22/41] removed json_loader --- src/ragas/json_loader.py | 121 --------------------------------------- 1 file changed, 121 deletions(-) delete mode 100644 src/ragas/json_loader.py diff --git a/src/ragas/json_loader.py b/src/ragas/json_loader.py deleted file mode 100644 index 3d448dc66..000000000 --- a/src/ragas/json_loader.py +++ /dev/null @@ -1,121 +0,0 @@ -from __future__ import annotations - -import json -import typing as t -from dataclasses import dataclass - -from langchain.callbacks.manager import CallbackManager, trace_as_chain_group - -from ragas.llms.prompt import Prompt - -if t.TYPE_CHECKING: - from ragas.llms import RagasLLM - -JSON_PROMPT = Prompt( - name="json_safeloader", - instruction="Rewrite the input into valid json", - examples=[ - { - "input": """{ - "name": "John Doe", - "age": 30, - "isStudent": false - "address": { - "street": "123 Main St", - "city": "Anytown", - "state": "CA", - } - "hobbies": ["reading", "swimming", "cycling"] - }""", - "output": """{ - "name": "John Doe", - "age": 30, - "isStudent": false, - "address": { - "street": "123 Main St", - "city": "Anytown", - "state": "CA" - }, - "hobbies": ["reading", "swimming", "cycling"] - }""", - }, - { - "input": """{ - "statement": "The Earth is also known as "Terra" " - }""", - "output": """{ - "statement": "The Earth is also known as 'Terra'" - } -""", - }, - ], - input_keys=["input"], - output_key="output", - output_type="JSON", -) - - -@dataclass -class JsonLoader: - max_retries: int = 2 - - def safe_load(self, text: str, llm: RagasLLM): - retry = 0 - while retry <= self.max_retries: - try: - start, end = self._find_outermost_json(text) - return json.loads(text[start:end]) - except ValueError: - text = self._fix_to_json(text, llm) - retry += 1 - - return {} - - def _fix_to_json( - self, - text, - llm, - callbacks: t.Optional[CallbackManager] = None, - callback_group_name: str = "batch", - ): - # TODO (executor) - with trace_as_chain_group( - callback_group_name, callback_manager=callbacks - ) as batch_group: - human_prompt = JSON_PROMPT.format(input=text) - - 
results = llm.generate( - [human_prompt], - n=1, - callbacks=batch_group, - ) - return results.generations[0][0].text - - def _find_outermost_json(self, text): - stack = [] - start_index = -1 - - for i, char in enumerate(text): - if char in "{[": - if len(stack) == 0: - start_index = i - stack.append(char) - - elif char in "}]": - if len(stack) > 0: - last = stack.pop() - if (char == "}" and last != "{") or (char == "]" and last != "["): - # Mismatched closing brace/bracket, invalid JSON - break - - if len(stack) == 0 and start_index != -1: - # Found a valid outermost JSON - return ( - start_index, - i + 1, - ) # Add 1 to include the closing brace/bracket in the range - - return -1, -1 # No valid JSON found - - -json_loader = JsonLoader() From 92f31161b2e255cc8129d561d6a7708df0f7b4bb Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Tue, 26 Dec 2023 18:11:30 +0530 Subject: [PATCH 23/41] json loader without circular import error --- src/ragas/utils.py | 127 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 118 insertions(+), 9 deletions(-) diff --git a/src/ragas/utils.py b/src/ragas/utils.py index 822cf6068..33046a2e4 100644 --- a/src/ragas/utils.py +++ b/src/ragas/utils.py @@ -1,9 +1,14 @@ from __future__ import annotations +import json import os -import warnings +import typing as t +from dataclasses import dataclass from functools import lru_cache +from langchain.callbacks.manager import CallbackManager, trace_as_chain_group +from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate + DEBUG_ENV_VAR = "RAGAS_DEBUG" # constant to tell us that there is no key passed to the llm/embeddings NO_KEY = "no-key" @@ -23,14 +28,118 @@ def get_debug_mode() -> bool: return False -def load_as_json(text): - """ - validate and return given text as json +# not migrating to Prompt format to avoid circular imports +JSON_PROMPT = HumanMessagePromptTemplate.from_template( """ - try: - return json.loads(text) - except ValueError as e: - warnings.warn(f"Invalid json: {e}") +Rewrite the input into valid json + + +Input: +{{ + "name": "John Doe", + "age": 30, + "isStudent": false + "address": {{ + "street": "123 Main St", + "city": "Anytown", + "state": "CA", + }} + "hobbies": ["reading", "swimming", "cycling"] +}} +Output: +{{ + "name": "John Doe", + "age": 30, + "isStudent": false, + "address": {{ + "street": "123 Main St", + "city": "Anytown", + "state": "CA" + }}, + "hobbies": ["reading", "swimming", "cycling"] +}} + + +Input: +{{ + "statement": "The Earth is also known as "Terra" " +}} +Output: +{{ + "statement": "The Earth is also known as 'Terra'" +}} + +Input: +{input} + +Output: +""" +) + + +@dataclass +class JsonLoader: + max_retries: int = 2 + + def safe_load(self, text: str, llm: RagasLLM): + retry = 0 + while retry <= self.max_retries: + try: + start, end = self._find_outermost_json(text) + return json.loads(text[start:end]) + except ValueError: + text = self._fix_to_json(text, llm) + retry += 1 + + return {} + + def _fix_to_json( + self, + text, + llm, + callbacks: t.Optional[CallbackManager] = None, + callback_group_name: str = "batch", + ): + # TODO (executor) + with trace_as_chain_group( + callback_group_name, callback_manager=callbacks + ) as batch_group: + human_prompt = ChatPromptTemplate.from_messages( + [JSON_PROMPT.format(input=text)] + ) + results = llm.generate( + [human_prompt], + n=1, + callbacks=batch_group, + ) + return results.generations[0][0].text + + def _find_outermost_json(self, text): + stack = [] + start_index = -1 + + for i, char in 
enumerate(text): + if char in "{[": + if len(stack) == 0: + start_index = i + stack.append(char) + + elif char in "}]": + if len(stack) > 0: + last = stack.pop() + if (char == "}" and last != "{") or (char == "]" and last != "["): + # Mismatched closing brace/bracket, invalid JSON + break + + if len(stack) == 0 and start_index != -1: + # Found a valid outermost JSON + return ( + start_index, + i + 1, + ) # Add 1 to include the closing brace/bracket in the range + + return -1, -1 # No valid JSON found + - return {} +json_loader = JsonLoader() From 29d8667726dc6d279f2c6e3b350d2f176446f102 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Tue, 26 Dec 2023 18:11:51 +0530 Subject: [PATCH 24/41] safe load json --- src/ragas/llms/prompt.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/ragas/llms/prompt.py b/src/ragas/llms/prompt.py index c1f154101..c612f0f00 100644 --- a/src/ragas/llms/prompt.py +++ b/src/ragas/llms/prompt.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +import logging import os import typing as t @@ -9,7 +10,7 @@ from langchain_core.prompt_values import PromptValue from langchain_core.pydantic_v1 import root_validator -from ragas.utils import RAGAS_CACHE_HOME +from ragas.utils import RAGAS_CACHE_HOME, json_loader class Prompt(PromptValue): @@ -157,8 +158,11 @@ def adapt( example_dict.update( {k: v for k, v in zip(self.input_keys, example[: len(self.input_keys)])} ) - # TODO : safe load json - now circular import error - example_dict[self.output_key] = example[-1] + example_dict[self.output_key] = ( + json.dumps(json_loader.safe_load(example[-1], llm)) + if self.output_type.lower() == "json" + else example[-1] + ) self.examples[i] = example_dict @@ -171,14 +175,13 @@ def save(self, cache_dir: t.Optional[str] = None) -> None: os.makedirs(cache_dir) cache_path = os.path.join(cache_dir, f"{self.name}.json") - print(cache_path) with open(cache_path, "w") as file: json.dump(self.to_json(), file, indent=4) @classmethod def _load(cls, language: str, name: str, cache_dir: str) -> Prompt: + logging.log(logging.INFO, f"Loading {name} from {cache_dir}") path = os.path.join(cache_dir, language, f"{name}.json") - print("loading from", path) cls(**json.load(open(path))["kwargs"]) From 5d8760184cdd198dd74e52f0a305b623f8f05290 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Sun, 31 Dec 2023 10:35:07 +0530 Subject: [PATCH 25/41] merge metrics-prompts --- src/ragas/metrics/_context_recall.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index 96e14c219..af54334d7 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -112,13 +112,9 @@ def _score_batch( gt = "\n".join(gt) if isinstance(gt, list) else gt ctx = "\n".join(ctx) if isinstance(ctx, list) else ctx prompts.append( -<<<<<<< HEAD self.context_recall_prompt.format( question=qstn, context=ctx, answer=gt ) -======= - CONTEXT_RECALL_RA.format(question=qstn, context=ctx, answer=gt) ->>>>>>> 84cb885d4bb26073bc1bd43f563238699f5c6cd7 ) responses: list[list[str]] = [] From 6da261bee3eb31a81d4c146f2493ccf45123dec8 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Sun, 31 Dec 2023 10:42:07 +0530 Subject: [PATCH 26/41] fix tests --- src/ragas/metrics/_context_relevancy.py | 4 ---- src/ragas/metrics/_faithfulness.py | 5 ----- src/ragas/utils.py | 12 ++++++++++++ 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/ragas/metrics/_context_relevancy.py 
b/src/ragas/metrics/_context_relevancy.py index 9f55d536d..30d9b4748 100644 --- a/src/ragas/metrics/_context_relevancy.py +++ b/src/ragas/metrics/_context_relevancy.py @@ -83,13 +83,9 @@ def _score_batch( ) as batch_group: for q, c in zip(questions, contexts): prompts.append( -<<<<<<< HEAD self.context_relevancy_prompt.format( question=q, context="\n".join(c) ) -======= - CONTEXT_RELEVANCE.format(question=q, context="\n".join(c)) ->>>>>>> 84cb885d4bb26073bc1bd43f563238699f5c6cd7 ) responses: list[list[str]] = [] diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 720c7993b..7107cfc0f 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -6,11 +6,6 @@ import numpy as np from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -<<<<<<< HEAD -from ragas.llms.prompt import Prompt -from ragas.metrics.base import EvaluationMode, MetricWithLLM -======= ->>>>>>> 84cb885d4bb26073bc1bd43f563238699f5c6cd7 from ragas.utils import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM diff --git a/src/ragas/utils.py b/src/ragas/utils.py index bc7d2b64d..100149673 100644 --- a/src/ragas/utils.py +++ b/src/ragas/utils.py @@ -3,6 +3,7 @@ import json import os import typing as t +import warnings from dataclasses import dataclass from functools import lru_cache @@ -26,7 +27,18 @@ def get_debug_mode() -> bool: return True else: return False + +def load_as_json(text): + """ + validate and return given text as json + """ + + try: + return json.loads(text) + except ValueError as e: + warnings.warn(f"Invalid json: {e}") + return {} # not migrating to Prompt format to avoid circular imports JSON_PROMPT = HumanMessagePromptTemplate.from_template( From 8d8d60c8754b0831e3a55f952c5ca99d65a3ea6e Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Sun, 31 Dec 2023 10:54:06 +0530 Subject: [PATCH 27/41] fix tests --- tests/unit/test_prompt.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_prompt.py b/tests/unit/test_prompt.py index feae8fa94..ec676e37d 100644 --- a/tests/unit/test_prompt.py +++ b/tests/unit/test_prompt.py @@ -1,7 +1,7 @@ from ragas.llms.prompt import Prompt TESTCASES = [ - { + { "name":"test-prompt", "instruction" : 'Create one or more statements from each sentence in the given answer.', "examples" : [ { @@ -25,6 +25,7 @@ "output_key" : "statements in json", }, { + "name":"test-prompt", "instruction" : 'Natural language inference. 
Use only "Yes" (1) or "No" (0) as a binary verdict.', "examples" : [ { @@ -62,6 +63,7 @@ "output_type" : "json" }, { + "name":"test-prompt", "instruction" : 'This is a test prompt without examples', "input_keys" : ["Context"], "output_key" : "Answer", From fc48c9b34d3e5b85de1cbe64bbd61ff184a9c5de Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Sun, 31 Dec 2023 10:54:55 +0530 Subject: [PATCH 28/41] remove loader --- src/ragas/json_loader.py | 155 --------------------------------------- 1 file changed, 155 deletions(-) delete mode 100644 src/ragas/json_loader.py diff --git a/src/ragas/json_loader.py b/src/ragas/json_loader.py deleted file mode 100644 index 944e89fd4..000000000 --- a/src/ragas/json_loader.py +++ /dev/null @@ -1,155 +0,0 @@ -from __future__ import annotations - -import json -import os -import typing as t -import warnings -from dataclasses import dataclass -from functools import lru_cache - -from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate - -if t.TYPE_CHECKING: - from ragas.llms import RagasLLM - -DEBUG_ENV_VAR = "RAGAS_DEBUG" -# constant to tell us that there is no key passed to the llm/embeddings -NO_KEY = "no-key" - - -@lru_cache(maxsize=1) -def get_debug_mode() -> bool: - if os.environ.get(DEBUG_ENV_VAR, str(False)).lower() == "true": - return True - else: - return False - - -def load_as_json(text): - """ - validate and return given text as json - """ - - try: - return json.loads(text) - except ValueError as e: - warnings.warn(f"Invalid json: {e}") - - return {} - - -JSON_PROMPT = HumanMessagePromptTemplate.from_template( - """ - -Rewrite the input into valid json - - -Input: -{{ - "name": "John Doe", - "age": 30, - "isStudent": false - "address": {{ - "street": "123 Main St", - "city": "Anytown", - "state": "CA", - }} - "hobbies": ["reading", "swimming", "cycling"] -}} -Output: -{{ - "name": "John Doe", - "age": 30, - "isStudent": false, - "address": {{ - "street": "123 Main St", - "city": "Anytown", - "state": "CA" - }}, - "hobbies": ["reading", "swimming", "cycling"] -}} - - -Input: -{{ - "statement": "The Earth is also known as "Terra" " -}} -Output: -{{ - "statement": "The Earth is also known as 'Terra'" -}} - -Input: -{input} - -Output: -""" -) - - -@dataclass -class JsonLoader: - max_retries: int = 2 - - def safe_load(self, text: str, llm: RagasLLM): - retry = 0 - while retry <= self.max_retries: - try: - start, end = self._find_outermost_json(text) - return json.loads(text[start:end]) - except ValueError: - text = self._fix_to_json(text, llm) - retry += 1 - - return {} - - def _fix_to_json( - self, - text, - llm, - callbacks: t.Optional[CallbackManager] = None, - callback_group_name: str = "batch", - ): - # TODO (executor) - with trace_as_chain_group( - callback_group_name, callback_manager=callbacks - ) as batch_group: - human_prompt = ChatPromptTemplate.from_messages( - [JSON_PROMPT.format(input=text)] - ) - results = llm.generate( - [human_prompt], - n=1, - callbacks=batch_group, - ) - return results.generations[0][0].text - - def _find_outermost_json(self, text): - stack = [] - start_index = -1 - - for i, char in enumerate(text): - if char in "{[": - if len(stack) == 0: - start_index = i - stack.append(char) - - elif char in "}]": - if len(stack) > 0: - last = stack.pop() - if (char == "}" and last != "{") or (char == "]" and last != "["): - # Mismatched closing brace/bracket, invalid JSON - break - - if len(stack) == 0 and start_index != -1: - # Found a 
valid outermost JSON - return ( - start_index, - i + 1, - ) # Add 1 to include the closing brace/bracket in the range - - return -1, -1 # No valid JSON found - - -json_loader = JsonLoader() \ No newline at end of file From 258f17924398036f8295a13bf70f3f1c2310b126 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Sun, 31 Dec 2023 11:54:35 +0530 Subject: [PATCH 29/41] linting --- src/ragas/embeddings/base.py | 4 +- src/ragas/llms/prompt.py | 1 + src/ragas/metrics/_answer_correctness.py | 6 +- src/ragas/metrics/_answer_relevance.py | 2 +- src/ragas/metrics/_context_precision.py | 2 +- src/ragas/metrics/_context_recall.py | 2 +- src/ragas/metrics/_faithfulness.py | 2 +- src/ragas/metrics/base.py | 2 - src/ragas/metrics/critique.py | 1 - src/ragas/prompts/__init__.py | 4 +- src/ragas/prompts/base.py | 28 +++--- src/ragas/testset/testset_generator.py | 8 +- src/ragas/utils.py | 9 +- tests/unit/test_prompt.py | 109 +++++++++++++---------- 14 files changed, 98 insertions(+), 82 deletions(-) diff --git a/src/ragas/embeddings/base.py b/src/ragas/embeddings/base.py index 8c722b21d..355697161 100644 --- a/src/ragas/embeddings/base.py +++ b/src/ragas/embeddings/base.py @@ -7,8 +7,8 @@ import numpy as np from langchain.embeddings import AzureOpenAIEmbeddings as BaseAzureOpenAIEmbeddings -from langchain.embeddings import OpenAIEmbeddings as BaseOpenAIEmbeddings from langchain.embeddings import FastEmbedEmbeddings as BaseFastEmbedEmbeddings +from langchain.embeddings import OpenAIEmbeddings as BaseOpenAIEmbeddings from langchain.schema.embeddings import Embeddings from pydantic.dataclasses import dataclass @@ -47,6 +47,7 @@ def validate_api_key(self): else: raise OpenAIKeyNotFound + class FastEmbedEmbeddings(BaseFastEmbedEmbeddings, RagasEmbeddings): """ Find the list of supported models at: @@ -64,6 +65,7 @@ def validate_api_key(self): """ pass + class AzureOpenAIEmbeddings(BaseAzureOpenAIEmbeddings, RagasEmbeddings): azure_endpoint: t.Optional[str] = None deployment: t.Optional[str] = None diff --git a/src/ragas/llms/prompt.py b/src/ragas/llms/prompt.py index c612f0f00..d840ee216 100644 --- a/src/ragas/llms/prompt.py +++ b/src/ragas/llms/prompt.py @@ -10,6 +10,7 @@ from langchain_core.prompt_values import PromptValue from langchain_core.pydantic_v1 import root_validator +from ragas.llms import RagasLLM from ragas.utils import RAGAS_CACHE_HOME, json_loader diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index 4e7cdd9fa..ff8faea0c 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -7,10 +7,10 @@ from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from ragas.utils import json_loader from ragas.llms.prompt import Prompt from ragas.metrics._answer_similarity import AnswerSimilarity from ragas.metrics.base import EvaluationMode, MetricWithLLM +from ragas.utils import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks @@ -78,7 +78,9 @@ class AnswerCorrectness(MetricWithLLM): def __post_init__(self: t.Self): if len(self.weights) != 2: - raise ValueError("Expects a list of two weights. First for factuality, second for semantic similarity") + raise ValueError( + "Expects a list of two weights. 
First for factuality, second for semantic similarity" + ) if all([w == 0 for w in self.weights]): raise ValueError("At least one weight must be non-zero") if not all([w >= 0 for w in self.weights]): diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index 9b21f3285..30731af1b 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ b/src/ragas/metrics/_answer_relevance.py @@ -10,9 +10,9 @@ from ragas.embeddings.base import embedding_factory from ragas.exceptions import OpenAIKeyNotFound -from ragas.utils import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM +from ragas.utils import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py index 474753ce2..9e18c7ee2 100644 --- a/src/ragas/metrics/_context_precision.py +++ b/src/ragas/metrics/_context_precision.py @@ -8,9 +8,9 @@ from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from ragas.utils import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM +from ragas.utils import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index af54334d7..615e3a7d1 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -7,9 +7,9 @@ from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from ragas.utils import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM +from ragas.utils import json_loader if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 7107cfc0f..50aa345ac 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -6,9 +6,9 @@ import numpy as np from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from ragas.utils import json_loader from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM +from ragas.utils import json_loader if t.TYPE_CHECKING: from datasets import Dataset diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index 612e21505..cfe7225c9 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -22,8 +22,6 @@ if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks - from ragas.llms import RagasLLM - def make_batches(total_size: int, batch_size: int) -> list[range]: """ diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index 8c7a2257d..e88eba85a 100644 --- a/src/ragas/metrics/critique.py +++ b/src/ragas/metrics/critique.py @@ -8,7 +8,6 @@ from datasets import Dataset from langchain.callbacks.manager import CallbackManager, trace_as_chain_group -from ragas.utils import json_loader from ragas.llms import llm_factory from ragas.llms.prompt import Prompt from ragas.metrics.base import EvaluationMode, MetricWithLLM diff --git a/src/ragas/prompts/__init__.py b/src/ragas/prompts/__init__.py index d5cdf8f41..e08d385e5 100644 --- a/src/ragas/prompts/__init__.py +++ b/src/ragas/prompts/__init__.py @@ -1,6 +1,4 @@ -from ragas.prompts.base import ( - RagasPrompt, -) +from 
ragas.prompts.base import RagasPrompt __all__ = [ "RagasPrompt", diff --git a/src/ragas/prompts/base.py b/src/ragas/prompts/base.py index 23eb3115b..e5dfced6a 100644 --- a/src/ragas/prompts/base.py +++ b/src/ragas/prompts/base.py @@ -1,13 +1,12 @@ from __future__ import annotations import json +import os import typing as t from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate from langchain_core.messages import BaseMessage from langchain_core.prompt_values import PromptValue -import pickle -import os from ragas.llms.base import RagasLLM from ragas.utils import json_loader @@ -22,6 +21,7 @@ class RagasPrompt(PromptValue): """ RagasPrompt is a class that represents a prompt for the ragas metrics. """ + name: str instruction: str examples: t.List[t.Dict[str, t.Any]] = [] @@ -111,12 +111,14 @@ def adapt(self, language: str, cache_dir: str, llm: RagasLLM) -> None: cache_dir = cache_dir if cache_dir else "~/.cache/ragas/prompts" if os.path.exists(os.path.join(cache_dir, language, f"{self.name}.json")): self._load(language, self.name, cache_dir) - + prompts = [] for example in self.examples: prompts.extend( [ - str_translation.format(translate_to=language, input=example.get(key)) + str_translation.format( + translate_to=language, input=example.get(key) + ) for key in self.input_keys ] ) @@ -125,7 +127,9 @@ def adapt(self, language: str, cache_dir: str, llm: RagasLLM) -> None: translate_to=language, input=example.get(self.output_key) ) if self.output_type.lower() == "json" - else str_translation.format(translate_to=language, input=example.get(self.output_key)) + else str_translation.format( + translate_to=language, input=example.get(self.output_key) + ) ) results = [result[0].text for result in llm.generate(prompts).generations] @@ -148,25 +152,23 @@ def adapt(self, language: str, cache_dir: str, llm: RagasLLM) -> None: else example[-1] ) self.examples[i] = example_dict - + self.language = language - + def save(self, cache_dir: str = "~/.cache/ragas/prompts") -> None: - cache_dir = os.path.join(cache_dir, self.language) if not os.path.exists(cache_dir): os.makedirs(cache_dir) - + cache_path = os.path.join(cache_dir, f"{self.name}.json") self.to_json(cache_path) - - @classmethod - def _load(cls, language: str, name: str, cache_dir: str ) -> RagasPrompt: - + @classmethod + def _load(cls, language: str, name: str, cache_dir: str) -> RagasPrompt: path = os.path.join(cache_dir, language, f"{name}.json") cls(**json.load(open(path))) + str_translation = RagasPrompt( name="str_translation", instruction="Language translation", diff --git a/src/ragas/testset/testset_generator.py b/src/ragas/testset/testset_generator.py index 096dd816a..f53206ba9 100644 --- a/src/ragas/testset/testset_generator.py +++ b/src/ragas/testset/testset_generator.py @@ -63,9 +63,7 @@ "conditional": "_condition_question", } -retry_errors = ( - ValidationError, -) +retry_errors = (ValidationError,) DataRow = namedtuple( "DataRow", @@ -416,9 +414,7 @@ def generate( proposal = None try: - proposal = self._make_proposal( - curr_node, neighbor_nodes, evolve_type - ) + proposal = self._make_proposal(curr_node, neighbor_nodes, evolve_type) except Exception as e: err_cause = e.__cause__ if not isinstance(err_cause, retry_errors): diff --git a/src/ragas/utils.py b/src/ragas/utils.py index 100149673..00cc57d5f 100644 --- a/src/ragas/utils.py +++ b/src/ragas/utils.py @@ -10,6 +10,9 @@ from langchain.callbacks.manager import CallbackManager, trace_as_chain_group from langchain.prompts import ChatPromptTemplate, 
HumanMessagePromptTemplate +if t.TYPE_CHECKING: + from ragas.llms import RagasLLM + DEBUG_ENV_VAR = "RAGAS_DEBUG" # constant to tell us that there is no key passed to the llm/embeddings NO_KEY = "no-key" @@ -27,7 +30,8 @@ def get_debug_mode() -> bool: return True else: return False - + + def load_as_json(text): """ validate and return given text as json @@ -40,6 +44,7 @@ def load_as_json(text): return {} + # not migrating to Prompt format to avoid circular imports JSON_PROMPT = HumanMessagePromptTemplate.from_template( """ @@ -154,4 +159,4 @@ def _find_outermost_json(self, text): return -1, -1 # No valid JSON found -json_loader = JsonLoader() \ No newline at end of file +json_loader = JsonLoader() diff --git a/tests/unit/test_prompt.py b/tests/unit/test_prompt.py index ec676e37d..4fcd1d487 100644 --- a/tests/unit/test_prompt.py +++ b/tests/unit/test_prompt.py @@ -1,40 +1,41 @@ from ragas.llms.prompt import Prompt TESTCASES = [ - { "name":"test-prompt", - "instruction" : 'Create one or more statements from each sentence in the given answer.', - "examples" : [ - { - "question":"Cadmium Chloride is slightly soluble in this chemical, it is also called what?", - "answer":"alcohol", - "statements in json":"""{ + { + "name": "test-prompt", + "instruction": "Create one or more statements from each sentence in the given answer.", + "examples": [ + { + "question": "Cadmium Chloride is slightly soluble in this chemical, it is also called what?", + "answer": "alcohol", + "statements in json": """{ "statements": [ "Cadmium Chloride is slightly soluble in alcohol." ] - }""" - }, - { - "question":"Were Hitler and Benito Mussolini of the same nationality?", - "answer":"Sorry, I can't provide answer to that question.", - "statements in json":"""{ + }""", + }, + { + "question": "Were Hitler and Benito Mussolini of the same nationality?", + "answer": "Sorry, I can't provide answer to that question.", + "statements in json": """{ "statements": [] - }""" - } - ], - "input_keys" : ["question", "answer"], - "output_key" : "statements in json", - }, - { - "name":"test-prompt", - "instruction" : 'Natural language inference. Use only "Yes" (1) or "No" (0) as a binary verdict.', - "examples" : [ - { - "Context":"""John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects. + }""", + }, + ], + "input_keys": ["question", "answer"], + "output_key": "statements in json", + }, + { + "name": "test-prompt", + "instruction": 'Natural language inference. Use only "Yes" (1) or "No" (0) as a binary verdict.', + "examples": [ + { + "Context": """John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects. statement_1: John is majoring in Biology. statement_2: John is taking a course on Artificial Intelligence. statement_3: John is a dedicated student. statement_4: John has a part-time job.""", - "Answer":"""[ + "Answer": """[ { "statement_1": "John is majoring in Biology.", "reason": "John's major is explicitly mentioned as Computer Science. 
There is no information suggesting he is majoring in Biology.", @@ -55,32 +56,44 @@ "reason": "There is no information given in the context about John having a part-time job.", "verdict": "0" }] - """ - } - ], - "input_keys" : ["Context"], - "output_key" : "Answer", - "output_type" : "json" - }, - { - "name":"test-prompt", - "instruction" : 'This is a test prompt without examples', - "input_keys" : ["Context"], - "output_key" : "Answer", - "output_type" : "json" - }, + """, + } + ], + "input_keys": ["Context"], + "output_key": "Answer", + "output_type": "json", + }, + { + "name": "test-prompt", + "instruction": "This is a test prompt without examples", + "input_keys": ["Context"], + "output_key": "Answer", + "output_type": "json", + }, ] -def test_prompt_object(): +def test_prompt_object(): for testcase in TESTCASES: prompt = Prompt(**testcase) assert prompt is not None, "Prompt object is not created" - assert prompt.instruction==testcase['instruction'], "instruction in object is not same as in the testcase" - assert prompt.input_keys==testcase['input_keys'], "input_keys in object is not same as in the testcase" - assert prompt.output_key==testcase['output_key'], "output_key in object is not same as in the testcase" - assert prompt.output_type==testcase.get('output_type', 'json'), "output_type in object is not same as in the testcase" - assert prompt.examples==testcase.get('examples', []), "examples should be empty if not provided" - if testcase.get('examples'): - assert isinstance(prompt.get_example_str(0), str), "get_example_str should return a string" \ No newline at end of file + assert ( + prompt.instruction == testcase["instruction"] + ), "instruction in object is not same as in the testcase" + assert ( + prompt.input_keys == testcase["input_keys"] + ), "input_keys in object is not same as in the testcase" + assert ( + prompt.output_key == testcase["output_key"] + ), "output_key in object is not same as in the testcase" + assert prompt.output_type == testcase.get( + "output_type", "json" + ), "output_type in object is not same as in the testcase" + assert prompt.examples == testcase.get( + "examples", [] + ), "examples should be empty if not provided" + if testcase.get("examples"): + assert isinstance( + prompt.get_example_str(0), str + ), "get_example_str should return a string" From c1f6d6bdd464537b1fdbe8bba54cb0606ba73b29 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Sun, 31 Dec 2023 13:55:24 +0530 Subject: [PATCH 30/41] adapting logs --- src/ragas/metrics/_answer_correctness.py | 1 + src/ragas/metrics/_answer_relevance.py | 1 + src/ragas/metrics/_context_precision.py | 1 + src/ragas/metrics/_context_recall.py | 1 + src/ragas/metrics/_faithfulness.py | 1 + src/ragas/metrics/critique.py | 1 + 6 files changed, 6 insertions(+) diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index ff8faea0c..e59c3b0ce 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -93,6 +93,7 @@ def __post_init__(self: t.Self): self.correctness_prompt = CORRECTNESS_PROMPT def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: + print(f"Adapting AnswerCorrectness metric to {language}") self.correctness_prompt.adapt(language, self.llm, cache_dir) def save(self, cache_dir: t.Optional[str] = None) -> None: diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index 30731af1b..6be51bb8b 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ 
b/src/ragas/metrics/_answer_relevance.py @@ -88,6 +88,7 @@ def __post_init__(self: t.Self): self.question_generation = QUESTION_GEN def adapt(self, language: str, cache_dir: str | None = None) -> None: + print(f"Adapting AnswerRelevancy metric to {language}") self.question_generation.adapt(language, self.llm, cache_dir) def save(self, cache_dir: str | None = None) -> None: diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py index 9e18c7ee2..55e86f5bb 100644 --- a/src/ragas/metrics/_context_precision.py +++ b/src/ragas/metrics/_context_precision.py @@ -77,6 +77,7 @@ def __post_init__(self: t.Self): self.context_precision_prompt = CONTEXT_PRECISION def adapt(self, language: str, cache_dir: str | None = None) -> None: + print(f"Adapting Context Precision to {language}") self.context_precision_prompt.adapt(language, self.llm, cache_dir) def save(self, cache_dir: str | None = None) -> None: diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index 615e3a7d1..593caefaf 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -86,6 +86,7 @@ def __post_init__(self: t.Self): self.context_recall_prompt = CONTEXT_RECALL_RA def adapt(self, language: str, cache_dir: str | None = None) -> None: + print(f"Adapting Context Recall to {language}") self.context_recall_prompt.adapt(language, self.llm, cache_dir) def save(self, cache_dir: str | None = None) -> None: diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 50aa345ac..8531ce5aa 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -127,6 +127,7 @@ def __post_init__(self: t.Self): self.nli_statements_message = NLI_STATEMENTS_MESSAGE def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: + print(f"Adapting Faithfulness metric to {language}") self.long_form_answer_prompt.adapt(language, self.llm, cache_dir) self.nli_statements_message.adapt(language, self.llm, cache_dir) diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index e88eba85a..5a7e210c5 100644 --- a/src/ragas/metrics/critique.py +++ b/src/ragas/metrics/critique.py @@ -85,6 +85,7 @@ def __post_init__(self: t.Self): self.critic_prompt = CRITIQUE_PROMPT def adapt(self, language: str, cache_dir: str | None = None) -> None: + print(f"Adapting Critic to {language}") self.critic_prompt.adapt(language, self.llm, cache_dir) def save(self, cache_dir: str | None = None) -> None: From 62db651b99e098d3c016b60bc29dc7fbe51d042e Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Sun, 31 Dec 2023 13:55:48 +0530 Subject: [PATCH 31/41] add support to dict type --- src/ragas/llms/prompt.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ragas/llms/prompt.py b/src/ragas/llms/prompt.py index d840ee216..243c8af6e 100644 --- a/src/ragas/llms/prompt.py +++ b/src/ragas/llms/prompt.py @@ -71,6 +71,7 @@ def to_string(self) -> str: # Format the examples to match the Langchain prompt template for example in self.examples: for key, value in example.items(): + value = json.dumps(value, ensure_ascii=False).encode('utf8').decode() value = ( value.replace("{", "{{").replace("}", "}}") if self.output_type.lower() == "json" @@ -97,6 +98,7 @@ def get_example_str(self, example_no: int) -> str: example = self.examples[example_no] example_str = "" for key, value in example.items(): + value = json.dumps(value, ensure_ascii=False).encode('utf8').decode() value = ( 
value.replace("{", "{{").replace("}", "}}") if self.output_type.lower() == "json" @@ -160,7 +162,7 @@ def adapt( {k: v for k, v in zip(self.input_keys, example[: len(self.input_keys)])} ) example_dict[self.output_key] = ( - json.dumps(json_loader.safe_load(example[-1], llm)) + json_loader.safe_load(example[-1], llm) if self.output_type.lower() == "json" else example[-1] ) From f68f0085e26238ac71c122455fb22176117caf4f Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Sun, 31 Dec 2023 18:30:18 +0530 Subject: [PATCH 32/41] add support to dict type --- src/ragas/llms/prompt.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/ragas/llms/prompt.py b/src/ragas/llms/prompt.py index 243c8af6e..5da099015 100644 --- a/src/ragas/llms/prompt.py +++ b/src/ragas/llms/prompt.py @@ -51,10 +51,11 @@ def validate_prompt(cls, values: t.Dict[str, t.Any]) -> t.Dict[str, t.Any]: raise ValueError( f"example {no+1} does not have the variable {output_key} in the definition" ) - if values["output_type"] == "json": + if values["output_type"].lower() == "json": try: if output_key in example: - json.loads(example[output_key]) + if isinstance(example[output_key], str): + json.loads(example[output_key]) except ValueError as e: raise ValueError( f"{output_key} in example {no+1} is not in valid json format: {e}" @@ -170,6 +171,8 @@ def adapt( self.examples[i] = example_dict self.language = language + + # TODO:Validate the prompt after adaptation def save(self, cache_dir: t.Optional[str] = None) -> None: cache_dir = cache_dir if cache_dir else RAGAS_CACHE_HOME From 1fd1fb08497e0d82e4bca814a4c2f69f58369504 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Sun, 31 Dec 2023 18:32:06 +0530 Subject: [PATCH 33/41] add support to dict type --- src/ragas/llms/prompt.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ragas/llms/prompt.py b/src/ragas/llms/prompt.py index 5da099015..cc35e243a 100644 --- a/src/ragas/llms/prompt.py +++ b/src/ragas/llms/prompt.py @@ -72,7 +72,7 @@ def to_string(self) -> str: # Format the examples to match the Langchain prompt template for example in self.examples: for key, value in example.items(): - value = json.dumps(value, ensure_ascii=False).encode('utf8').decode() + value = json.dumps(value, ensure_ascii=False).encode("utf8").decode() value = ( value.replace("{", "{{").replace("}", "}}") if self.output_type.lower() == "json" @@ -99,7 +99,7 @@ def get_example_str(self, example_no: int) -> str: example = self.examples[example_no] example_str = "" for key, value in example.items(): - value = json.dumps(value, ensure_ascii=False).encode('utf8').decode() + value = json.dumps(value, ensure_ascii=False).encode("utf8").decode() value = ( value.replace("{", "{{").replace("}", "}}") if self.output_type.lower() == "json" @@ -171,7 +171,7 @@ def adapt( self.examples[i] = example_dict self.language = language - + # TODO:Validate the prompt after adaptation def save(self, cache_dir: t.Optional[str] = None) -> None: From 4ae911364ba7867f850c4583929d7a7e25dfc435 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Sun, 31 Dec 2023 18:42:41 +0530 Subject: [PATCH 34/41] remove prompts --- src/ragas/prompts/__init__.py | 5 - src/ragas/prompts/base.py | 208 ---------------------------------- 2 files changed, 213 deletions(-) delete mode 100644 src/ragas/prompts/__init__.py delete mode 100644 src/ragas/prompts/base.py diff --git a/src/ragas/prompts/__init__.py b/src/ragas/prompts/__init__.py deleted file mode 100644 index e08d385e5..000000000 --- 
a/src/ragas/prompts/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from ragas.prompts.base import RagasPrompt - -__all__ = [ - "RagasPrompt", -] diff --git a/src/ragas/prompts/base.py b/src/ragas/prompts/base.py deleted file mode 100644 index e5dfced6a..000000000 --- a/src/ragas/prompts/base.py +++ /dev/null @@ -1,208 +0,0 @@ -from __future__ import annotations - -import json -import os -import typing as t - -from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate -from langchain_core.messages import BaseMessage -from langchain_core.prompt_values import PromptValue - -from ragas.llms.base import RagasLLM -from ragas.utils import json_loader - -try: - from pydantic.v1 import root_validator -except ImportError: - from pydantic import root_validator - - -class RagasPrompt(PromptValue): - """ - RagasPrompt is a class that represents a prompt for the ragas metrics. - """ - - name: str - instruction: str - examples: t.List[t.Dict[str, t.Any]] = [] - input_keys: t.List[str] - output_key: str - output_type: str = "JSON" - language: str = "en" - - def to_string(self) -> str: - """Return prompt value as string.""" - ... - - def to_messages(self) -> t.List[BaseMessage]: - """Return prompt as a list of Messages.""" - ... - - @root_validator() - def validate_prompt(cls, value: t.Dict[str, str]) -> t.Dict[str, str]: - """ - Validate the template string to ensure that it is in desired format. - """ - if value.get("instruction") is None or value.get("instruction") == "": - raise ValueError("Instruction cannot be empty") - if value.get("input_keys") is None or value.get("instruction") == []: - raise ValueError("Input keys cannot be empty") - if value.get("output_key") is None or value.get("output_key") == "": - raise ValueError("Output key cannot be empty") - - if value.get("examples"): - output_key = value["output_key"] - for no, example in enumerate(value["examples"]): - for inp_key in value["input_keys"]: - if inp_key not in example: - raise ValueError( - f"Example {no+1} does not have the variable {inp_key} in the definition" - ) - if output_key not in example: - raise ValueError( - f"Example {no+1} does not have the variable {output_key} in the definition" - ) - if value["output_type"] == "JSON": - try: - json.loads(example[output_key]) - except ValueError as e: - raise ValueError( - f"{output_key} in example {no+1} is not in valid JSON format: {e}" - ) - - return value - - def generate_prompt_string(self) -> str: - """ - Generate the prompt string from the variables. - """ - prompt_str = self.instruction + "\n" - - # Format the examples to match the Langchain prompt template - for example in self.examples: - for key, value in example.items(): - value = ( - value.replace("{", "{{").replace("}", "}}") - if self.output_type == "JSON" - else value - ) - prompt_str += f"\n{key}: {value}" - prompt_str += "\n" - - prompt_str += "".join(f"\n{key}: {{{key}}}" for key in self.input_keys) - prompt_str += f"\n{self.output_key}: \n" - - return prompt_str - - def format(self, **kwargs: t.Any) -> ChatPromptTemplate: - """ - Format the RagasPrompt object into a ChatPromptTemplate object to be used in metrics. 
- """ - if set(self.input_keys) != set(kwargs.keys()): - raise ValueError( - f"Input variables {self.input_keys} do not match with the given parameters {list(kwargs.keys())}" - ) - prompt = self.generate_prompt_string() - human_prompt = HumanMessagePromptTemplate.from_template(prompt) - return ChatPromptTemplate.from_messages([human_prompt.format(**kwargs)]) - - def adapt(self, language: str, cache_dir: str, llm: RagasLLM) -> None: - # TODO: Add callbacks - cache_dir = cache_dir if cache_dir else "~/.cache/ragas/prompts" - if os.path.exists(os.path.join(cache_dir, language, f"{self.name}.json")): - self._load(language, self.name, cache_dir) - - prompts = [] - for example in self.examples: - prompts.extend( - [ - str_translation.format( - translate_to=language, input=example.get(key) - ) - for key in self.input_keys - ] - ) - prompts.append( - json_translatation.format( - translate_to=language, input=example.get(self.output_key) - ) - if self.output_type.lower() == "json" - else str_translation.format( - translate_to=language, input=example.get(self.output_key) - ) - ) - - results = [result[0].text for result in llm.generate(prompts).generations] - per_example_items = len(self.input_keys) + 1 - grouped_results = [ - results[i : i + per_example_items] - for i in range(0, len(results), per_example_items) - ] - assert len(grouped_results) == len( - self.examples - ), "examples and adapted examples must be of equal length" - for i, example in enumerate(grouped_results): - example_dict = {} - example_dict.update( - {k: v for k, v in zip(self.input_keys, example[: len(self.input_keys)])} - ) - example_dict[self.output_key] = ( - json_loader.safe_load(example[-1], llm=llm) - if self.output_type.lower() == "json" - else example[-1] - ) - self.examples[i] = example_dict - - self.language = language - - def save(self, cache_dir: str = "~/.cache/ragas/prompts") -> None: - cache_dir = os.path.join(cache_dir, self.language) - if not os.path.exists(cache_dir): - os.makedirs(cache_dir) - - cache_path = os.path.join(cache_dir, f"{self.name}.json") - self.to_json(cache_path) - - @classmethod - def _load(cls, language: str, name: str, cache_dir: str) -> RagasPrompt: - path = os.path.join(cache_dir, language, f"{name}.json") - cls(**json.load(open(path))) - - -str_translation = RagasPrompt( - name="str_translation", - instruction="Language translation", - examples=[ - { - "translate_to": "hindi", - "input": "Who was Albert Einstein and what is he best known for?", - "output": "अल्बर्ट आइंस्टीन कौन थे और वे किसके लिए सबसे ज्यादा प्रसिद्ध हैं?", - }, - ], - input_keys=["translate_to", "input"], - output_key="output", - output_type="str", -) - -json_translatation = RagasPrompt( - name="json_translation", - instruction="Translate values in given json to target language ", - examples=[ - { - "translate_to": "hindi", - "input": """{ - "statements": [ - "Albert Einstein was born in Germany.", - "Albert Einstein was best known for his theory of relativity." 
- ]}""", - "output": """{ - "statements": [ - "अल्बर्ट आइंस्टीन का जन्म जर्मनी में हुआ था।", - "अल्बर्ट आइंस्टीन अपने सापेक्षता के सिद्धांत के लिए सबसे अधिक प्रसिद्ध थे।" - ]}""", - } - ], - input_keys=["translate_to", "input"], - output_key="output", - output_type="JSON", -) From 48355105c4a89a8f39961ac7954d7793a8de5d66 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Sun, 31 Dec 2023 18:54:53 +0530 Subject: [PATCH 35/41] accept dict return type --- src/ragas/metrics/_answer_correctness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index e59c3b0ce..abd476772 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -134,7 +134,7 @@ def _score_batch( f1_score = [] for prediction in outputs: prediction = json_loader.safe_load(prediction[0].text, self.llm) - prediction = prediction if isinstance(prediction, list) else [] + prediction = prediction if isinstance(prediction, list) else [prediction] if prediction: prediction = [ item.get(key_map[k], np.nan) From a2aa64fd9b282775fc4b0085e6f69c0267134726 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Sun, 31 Dec 2023 19:39:21 +0530 Subject: [PATCH 36/41] fix return type --- src/ragas/llms/prompt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragas/llms/prompt.py b/src/ragas/llms/prompt.py index cc35e243a..28e5924d0 100644 --- a/src/ragas/llms/prompt.py +++ b/src/ragas/llms/prompt.py @@ -185,7 +185,7 @@ def save(self, cache_dir: t.Optional[str] = None) -> None: json.dump(self.to_json(), file, indent=4) @classmethod - def _load(cls, language: str, name: str, cache_dir: str) -> Prompt: + def _load(cls, language: str, name: str, cache_dir: str) -> None: logging.log(logging.INFO, f"Loading {name} from {cache_dir}") path = os.path.join(cache_dir, language, f"{name}.json") cls(**json.load(open(path))["kwargs"]) From 2cb33c8e036c345e838377f612735f1311d74802 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 1 Jan 2024 00:05:56 +0530 Subject: [PATCH 37/41] accept dict return type --- src/ragas/metrics/_answer_correctness.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index abd476772..35ddd098f 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -134,7 +134,9 @@ def _score_batch( f1_score = [] for prediction in outputs: prediction = json_loader.safe_load(prediction[0].text, self.llm) - prediction = prediction if isinstance(prediction, list) else [prediction] + prediction = ( + prediction if isinstance(prediction, list) else [prediction] + ) if prediction: prediction = [ item.get(key_map[k], np.nan) From dce0eeda305ac92640450762f726fd22f1b2e81a Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 1 Jan 2024 11:42:08 +0530 Subject: [PATCH 38/41] change to logging --- src/ragas/metrics/_answer_correctness.py | 7 +++++-- src/ragas/metrics/_answer_relevance.py | 7 +++++-- src/ragas/metrics/_context_precision.py | 6 ++++-- src/ragas/metrics/_context_recall.py | 7 +++++-- src/ragas/metrics/_context_relevancy.py | 4 +++- src/ragas/metrics/_faithfulness.py | 11 ++++++++--- src/ragas/metrics/critique.py | 3 ++- 7 files changed, 32 insertions(+), 13 deletions(-) diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index 35ddd098f..8436de854 100644 --- 
a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import typing as t from dataclasses import dataclass, field @@ -93,8 +94,10 @@ def __post_init__(self: t.Self): self.correctness_prompt = CORRECTNESS_PROMPT def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: - print(f"Adapting AnswerCorrectness metric to {language}") - self.correctness_prompt.adapt(language, self.llm, cache_dir) + logging.info(f"Adapting AnswerCorrectness metric to {language}") + self.correctness_prompt = self.correctness_prompt.adapt( + language, self.llm, cache_dir + ) def save(self, cache_dir: t.Optional[str] = None) -> None: self.correctness_prompt.save(cache_dir) diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index 6be51bb8b..f534a959b 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ b/src/ragas/metrics/_answer_relevance.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import typing as t from dataclasses import dataclass, field @@ -88,8 +89,10 @@ def __post_init__(self: t.Self): self.question_generation = QUESTION_GEN def adapt(self, language: str, cache_dir: str | None = None) -> None: - print(f"Adapting AnswerRelevancy metric to {language}") - self.question_generation.adapt(language, self.llm, cache_dir) + logging.info(f"Adapting AnswerRelevancy metric to {language}") + self.question_generation = self.question_generation.adapt( + language, self.llm, cache_dir + ) def save(self, cache_dir: str | None = None) -> None: self.question_generation.save(cache_dir) diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py index 55e86f5bb..ccde0dcce 100644 --- a/src/ragas/metrics/_context_precision.py +++ b/src/ragas/metrics/_context_precision.py @@ -77,8 +77,10 @@ def __post_init__(self: t.Self): self.context_precision_prompt = CONTEXT_PRECISION def adapt(self, language: str, cache_dir: str | None = None) -> None: - print(f"Adapting Context Precision to {language}") - self.context_precision_prompt.adapt(language, self.llm, cache_dir) + logging.info(f"Adapting Context Precision to {language}") + self.context_precision_prompt = self.context_precision_prompt.adapt( + language, self.llm, cache_dir + ) def save(self, cache_dir: str | None = None) -> None: self.context_precision_prompt.save(cache_dir) diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index 593caefaf..2c93bbd70 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import typing as t from dataclasses import dataclass @@ -86,8 +87,10 @@ def __post_init__(self: t.Self): self.context_recall_prompt = CONTEXT_RECALL_RA def adapt(self, language: str, cache_dir: str | None = None) -> None: - print(f"Adapting Context Recall to {language}") - self.context_recall_prompt.adapt(language, self.llm, cache_dir) + logging.info(f"Adapting Context Recall to {language}") + self.context_recall_prompt = self.context_recall_prompt.adapt( + language, self.llm, cache_dir + ) def save(self, cache_dir: str | None = None) -> None: self.context_recall_prompt.save(cache_dir) diff --git a/src/ragas/metrics/_context_relevancy.py b/src/ragas/metrics/_context_relevancy.py index 30d9b4748..f9acf6a29 100644 --- a/src/ragas/metrics/_context_relevancy.py +++ b/src/ragas/metrics/_context_relevancy.py @@ -59,7 +59,9 @@ def 
__post_init__(self: t.Self): self.context_relevancy_prompt = CONTEXT_RELEVANCE def adapt(self, language: str, cache_dir: str | None = None) -> None: - self.context_relevancy_prompt.adapt(language, self.llm, cache_dir) + self.context_relevancy_prompt = self.context_relevancy_prompt.adapt( + language, self.llm, cache_dir + ) def save(self, cache_dir: str | None = None) -> None: self.context_relevancy_prompt.save(cache_dir) diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 8531ce5aa..d0045d46e 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import typing as t from dataclasses import dataclass @@ -127,9 +128,13 @@ def __post_init__(self: t.Self): self.nli_statements_message = NLI_STATEMENTS_MESSAGE def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: - print(f"Adapting Faithfulness metric to {language}") - self.long_form_answer_prompt.adapt(language, self.llm, cache_dir) - self.nli_statements_message.adapt(language, self.llm, cache_dir) + logging.info(f"Adapting Faithfulness metric to {language}") + self.long_form_answer_prompt = self.long_form_answer_prompt.adapt( + language, self.llm, cache_dir + ) + self.nli_statements_message = self.nli_statements_message.adapt( + language, self.llm, cache_dir + ) def save(self, cache_dir: t.Optional[str] = None) -> None: self.long_form_answer_prompt.save(cache_dir) diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index 5a7e210c5..03bb5847e 100644 --- a/src/ragas/metrics/critique.py +++ b/src/ragas/metrics/critique.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import typing as t from collections import Counter from dataclasses import dataclass, field @@ -85,7 +86,7 @@ def __post_init__(self: t.Self): self.critic_prompt = CRITIQUE_PROMPT def adapt(self, language: str, cache_dir: str | None = None) -> None: - print(f"Adapting Critic to {language}") + logging.info(f"Adapting Critic to {language}") self.critic_prompt.adapt(language, self.llm, cache_dir) def save(self, cache_dir: str | None = None) -> None: From 27ceb4b492fc75d192f04efefc063fadf5c3f4c5 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Mon, 1 Jan 2024 11:42:26 +0530 Subject: [PATCH 39/41] return prompt objects --- src/ragas/llms/prompt.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/ragas/llms/prompt.py b/src/ragas/llms/prompt.py index 28e5924d0..61d2bb589 100644 --- a/src/ragas/llms/prompt.py +++ b/src/ragas/llms/prompt.py @@ -122,11 +122,11 @@ def format(self, **kwargs: t.Any) -> ChatPromptTemplate: def adapt( self, language: str, llm: RagasLLM, cache_dir: t.Optional[str] = None - ) -> None: + ) -> Prompt: # TODO: Add callbacks cache_dir = cache_dir if cache_dir else RAGAS_CACHE_HOME if os.path.exists(os.path.join(cache_dir, language, f"{self.name}.json")): - self._load(language, self.name, cache_dir) + return self._load(language, self.name, cache_dir) prompts = [] for example in self.examples: @@ -174,6 +174,8 @@ def adapt( # TODO:Validate the prompt after adaptation + return self + def save(self, cache_dir: t.Optional[str] = None) -> None: cache_dir = cache_dir if cache_dir else RAGAS_CACHE_HOME cache_dir = os.path.join(cache_dir, self.language) @@ -185,10 +187,10 @@ def save(self, cache_dir: t.Optional[str] = None) -> None: json.dump(self.to_json(), file, indent=4) @classmethod - def _load(cls, language: str, name: str, cache_dir: str) -> 
None: + def _load(cls, language: str, name: str, cache_dir: str) -> Prompt: logging.log(logging.INFO, f"Loading {name} from {cache_dir}") path = os.path.join(cache_dir, language, f"{name}.json") - cls(**json.load(open(path))["kwargs"]) + return cls(**json.load(open(path))["kwargs"]) str_translation = Prompt( From ad0a464f4b24bd618c7b850d5c0996fba088e497 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Mon, 1 Jan 2024 20:05:35 +0530 Subject: [PATCH 40/41] logging and classes --- src/ragas/metrics/_answer_correctness.py | 6 ++++-- src/ragas/metrics/_answer_relevance.py | 8 ++++---- src/ragas/metrics/_answer_similarity.py | 3 +++ src/ragas/metrics/_context_recall.py | 8 ++++---- src/ragas/metrics/_context_relevancy.py | 9 +++++---- src/ragas/metrics/_faithfulness.py | 9 ++++----- src/ragas/metrics/critique.py | 5 +++-- 7 files changed, 27 insertions(+), 21 deletions(-) diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index 8436de854..eeb437192 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -13,6 +13,8 @@ from ragas.metrics.base import EvaluationMode, MetricWithLLM from ragas.utils import json_loader +logger = logging.getLogger(__name__) + if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks @@ -73,6 +75,7 @@ class AnswerCorrectness(MetricWithLLM): name: str = "answer_correctness" # type: ignore[reportIncompatibleMethodOverride] evaluation_mode: EvaluationMode = EvaluationMode.qga # type: ignore[reportIncompatibleMethodOverride] + correctness_prompt: Prompt = CORRECTNESS_PROMPT batch_size: int = 15 weights: list[float] = field(default_factory=lambda: [0.75, 0.25]) answer_similarity: AnswerSimilarity | None = None @@ -91,10 +94,9 @@ def __post_init__(self: t.Self): self.answer_similarity = AnswerSimilarity( llm=self.llm, batch_size=self.batch_size ) - self.correctness_prompt = CORRECTNESS_PROMPT def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: - logging.info(f"Adapting AnswerCorrectness metric to {language}") + logger.info(f"Adapting AnswerCorrectness metric to {language}") self.correctness_prompt = self.correctness_prompt.adapt( language, self.llm, cache_dir ) diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index f534a959b..c66e1fd92 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ b/src/ragas/metrics/_answer_relevance.py @@ -15,6 +15,8 @@ from ragas.metrics.base import EvaluationMode, MetricWithLLM from ragas.utils import json_loader +logger = logging.getLogger(__name__) + if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks @@ -74,6 +76,7 @@ class AnswerRelevancy(MetricWithLLM): name: str = "answer_relevancy" # type: ignore evaluation_mode: EvaluationMode = EvaluationMode.qac # type: ignore + question_generation: Prompt = QUESTION_GEN batch_size: int = 15 strictness: int = 3 embeddings: RagasEmbeddings = field(default_factory=embedding_factory) @@ -85,11 +88,8 @@ def init_model(self): if self.embeddings.openai_api_key == "no-key": raise OpenAIKeyNotFound - def __post_init__(self: t.Self): - self.question_generation = QUESTION_GEN - def adapt(self, language: str, cache_dir: str | None = None) -> None: - logging.info(f"Adapting AnswerRelevancy metric to {language}") + logger.info(f"Adapting AnswerRelevancy metric to {language}") self.question_generation = self.question_generation.adapt( language, self.llm, cache_dir ) diff --git a/src/ragas/metrics/_answer_similarity.py 
b/src/ragas/metrics/_answer_similarity.py index df42554e4..350c44c70 100644 --- a/src/ragas/metrics/_answer_similarity.py +++ b/src/ragas/metrics/_answer_similarity.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import typing as t from dataclasses import dataclass, field @@ -19,6 +20,8 @@ from ragas.embeddings.base import RagasEmbeddings +logger = logging.getLogger(__name__) + @dataclass class AnswerSimilarity(MetricWithLLM): diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index 2c93bbd70..b256a3b06 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -15,6 +15,8 @@ if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks +logger = logging.getLogger(__name__) + CONTEXT_RECALL_RA = Prompt( name="context_recall", instruction="""Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only "Yes" (1) or "No" (0) as a binary classification. Output json with reason.""", @@ -81,13 +83,11 @@ class ContextRecall(MetricWithLLM): name: str = "context_recall" # type: ignore evaluation_mode: EvaluationMode = EvaluationMode.qcg # type: ignore + context_recall_prompt: Prompt = CONTEXT_RECALL_RA batch_size: int = 15 - def __post_init__(self: t.Self): - self.context_recall_prompt = CONTEXT_RECALL_RA - def adapt(self, language: str, cache_dir: str | None = None) -> None: - logging.info(f"Adapting Context Recall to {language}") + logger.info(f"Adapting Context Recall to {language}") self.context_recall_prompt = self.context_recall_prompt.adapt( language, self.llm, cache_dir ) diff --git a/src/ragas/metrics/_context_relevancy.py b/src/ragas/metrics/_context_relevancy.py index f9acf6a29..c23566589 100644 --- a/src/ragas/metrics/_context_relevancy.py +++ b/src/ragas/metrics/_context_relevancy.py @@ -16,6 +16,8 @@ if t.TYPE_CHECKING: from langchain.callbacks.base import Callbacks +logger = logging.getLogger(__name__) + CONTEXT_RELEVANCE = Prompt( name="context_relevancy", instruction="""Please extract relevant sentences from the provided context that is absolutely required answer the following question. If no relevant sentences are found, or if you believe the question cannot be answered from the given context, return the phrase "Insufficient Information". While extracting candidate sentences you're not allowed to make any changes to sentences from given context.""", @@ -52,13 +54,12 @@ class ContextRelevancy(MetricWithLLM): name: str = "context_relevancy" # type: ignore evaluation_mode: EvaluationMode = EvaluationMode.qc # type: ignore + context_relevancy_prompt: Prompt = CONTEXT_RELEVANCE batch_size: int = 15 show_deprecation_warning: bool = False - def __post_init__(self: t.Self): - self.context_relevancy_prompt = CONTEXT_RELEVANCE - def adapt(self, language: str, cache_dir: str | None = None) -> None: + logger.info(f"Adapting Context Relevancy to {language}") self.context_relevancy_prompt = self.context_relevancy_prompt.adapt( language, self.llm, cache_dir ) @@ -73,7 +74,7 @@ def _score_batch( callback_group_name: str = "batch", ) -> list[float]: if self.show_deprecation_warning: - logging.warning( + logger.warning( "The 'context_relevancy' metric is going to be deprecated soon! Please use the 'context_precision' metric instead. It is a drop-in replacement just a simple search and replace should work." 
# noqa ) prompts = [] diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index d0045d46e..46592c69d 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -15,6 +15,7 @@ from datasets import Dataset from langchain.callbacks.base import Callbacks +logger = logging.getLogger(__name__) LONG_FORM_ANSWER_PROMPT = Prompt( name="long_form_answer", @@ -121,14 +122,12 @@ class Faithfulness(MetricWithLLM): name: str = "faithfulness" # type: ignore evaluation_mode: EvaluationMode = EvaluationMode.qac # type: ignore + long_form_answer_prompt: Prompt = LONG_FORM_ANSWER_PROMPT + nli_statements_message: Prompt = NLI_STATEMENTS_MESSAGE batch_size: int = 15 - def __post_init__(self: t.Self): - self.long_form_answer_prompt = LONG_FORM_ANSWER_PROMPT - self.nli_statements_message = NLI_STATEMENTS_MESSAGE - def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: - logging.info(f"Adapting Faithfulness metric to {language}") + logger.info(f"Adapting Faithfulness metric to {language}") self.long_form_answer_prompt = self.long_form_answer_prompt.adapt( language, self.llm, cache_dir ) diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index 03bb5847e..1b0f88dc8 100644 --- a/src/ragas/metrics/critique.py +++ b/src/ragas/metrics/critique.py @@ -19,6 +19,7 @@ from ragas.llms import RagasLLM +logger = logging.getLogger(__name__) CRITIQUE_PROMPT = Prompt( name="critique", @@ -65,6 +66,7 @@ class AspectCritique(MetricWithLLM): name: str = field(default="", repr=True) # type: ignore evaluation_mode: EvaluationMode = EvaluationMode.qac # type: ignore + critic_prompt: Prompt = CRITIQUE_PROMPT definition: str = field(default="", repr=True) strictness: int = field(default=1, repr=False) batch_size: int = field(default=15, repr=False) @@ -83,10 +85,9 @@ def __post_init__(self: t.Self): self.strictness = ( self.strictness if self.strictness % 2 != 0 else self.strictness + 1 ) - self.critic_prompt = CRITIQUE_PROMPT def adapt(self, language: str, cache_dir: str | None = None) -> None: - logging.info(f"Adapting Critic to {language}") + logger.info(f"Adapting Critic to {language}") self.critic_prompt.adapt(language, self.llm, cache_dir) def save(self, cache_dir: str | None = None) -> None: From 1595a27086e2a6a0cbf0965d798764c316e805e4 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Mon, 1 Jan 2024 20:35:08 +0530 Subject: [PATCH 41/41] fixed default factor error --- src/ragas/metrics/_answer_correctness.py | 2 +- src/ragas/metrics/_answer_relevance.py | 2 +- src/ragas/metrics/_context_precision.py | 6 ++---- src/ragas/metrics/_context_recall.py | 4 ++-- src/ragas/metrics/_context_relevancy.py | 4 ++-- src/ragas/metrics/_faithfulness.py | 10 +++++++--- src/ragas/metrics/critique.py | 2 +- 7 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py index eeb437192..89f83005e 100644 --- a/src/ragas/metrics/_answer_correctness.py +++ b/src/ragas/metrics/_answer_correctness.py @@ -75,7 +75,7 @@ class AnswerCorrectness(MetricWithLLM): name: str = "answer_correctness" # type: ignore[reportIncompatibleMethodOverride] evaluation_mode: EvaluationMode = EvaluationMode.qga # type: ignore[reportIncompatibleMethodOverride] - correctness_prompt: Prompt = CORRECTNESS_PROMPT + correctness_prompt: Prompt = field(default_factory=lambda: CORRECTNESS_PROMPT) batch_size: int = 15 weights: list[float] = field(default_factory=lambda: [0.75, 0.25]) 
answer_similarity: AnswerSimilarity | None = None diff --git a/src/ragas/metrics/_answer_relevance.py b/src/ragas/metrics/_answer_relevance.py index c66e1fd92..6b736aabd 100644 --- a/src/ragas/metrics/_answer_relevance.py +++ b/src/ragas/metrics/_answer_relevance.py @@ -76,7 +76,7 @@ class AnswerRelevancy(MetricWithLLM): name: str = "answer_relevancy" # type: ignore evaluation_mode: EvaluationMode = EvaluationMode.qac # type: ignore - question_generation: Prompt = QUESTION_GEN + question_generation: Prompt = field(default_factory=lambda: QUESTION_GEN) batch_size: int = 15 strictness: int = 3 embeddings: RagasEmbeddings = field(default_factory=embedding_factory) diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py index ccde0dcce..a8f40b814 100644 --- a/src/ragas/metrics/_context_precision.py +++ b/src/ragas/metrics/_context_precision.py @@ -2,7 +2,7 @@ import logging import typing as t -from dataclasses import dataclass +from dataclasses import dataclass, field import numpy as np from datasets import Dataset @@ -71,11 +71,9 @@ class ContextPrecision(MetricWithLLM): name: str = "context_precision" # type: ignore evaluation_mode: EvaluationMode = EvaluationMode.qcg # type: ignore + context_precision_prompt: Prompt = field(default_factory=lambda: CONTEXT_PRECISION) batch_size: int = 15 - def __post_init__(self: t.Self): - self.context_precision_prompt = CONTEXT_PRECISION - def adapt(self, language: str, cache_dir: str | None = None) -> None: logging.info(f"Adapting Context Precision to {language}") self.context_precision_prompt = self.context_precision_prompt.adapt( diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py index b256a3b06..faabce302 100644 --- a/src/ragas/metrics/_context_recall.py +++ b/src/ragas/metrics/_context_recall.py @@ -2,7 +2,7 @@ import logging import typing as t -from dataclasses import dataclass +from dataclasses import dataclass, field import numpy as np from datasets import Dataset @@ -83,7 +83,7 @@ class ContextRecall(MetricWithLLM): name: str = "context_recall" # type: ignore evaluation_mode: EvaluationMode = EvaluationMode.qcg # type: ignore - context_recall_prompt: Prompt = CONTEXT_RECALL_RA + context_recall_prompt: Prompt = field(default_factory=lambda: CONTEXT_RECALL_RA) batch_size: int = 15 def adapt(self, language: str, cache_dir: str | None = None) -> None: diff --git a/src/ragas/metrics/_context_relevancy.py b/src/ragas/metrics/_context_relevancy.py index c23566589..a43fa4454 100644 --- a/src/ragas/metrics/_context_relevancy.py +++ b/src/ragas/metrics/_context_relevancy.py @@ -2,7 +2,7 @@ import logging import typing as t -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import List import numpy as np @@ -54,7 +54,7 @@ class ContextRelevancy(MetricWithLLM): name: str = "context_relevancy" # type: ignore evaluation_mode: EvaluationMode = EvaluationMode.qc # type: ignore - context_relevancy_prompt: Prompt = CONTEXT_RELEVANCE + context_relevancy_prompt: Prompt = field(default_factory=lambda: CONTEXT_RELEVANCE) batch_size: int = 15 show_deprecation_warning: bool = False diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 46592c69d..9df7ed897 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -2,7 +2,7 @@ import logging import typing as t -from dataclasses import dataclass +from dataclasses import dataclass, field import numpy as np from langchain.callbacks.manager 
import CallbackManager, trace_as_chain_group @@ -122,8 +122,12 @@ class Faithfulness(MetricWithLLM): name: str = "faithfulness" # type: ignore evaluation_mode: EvaluationMode = EvaluationMode.qac # type: ignore - long_form_answer_prompt: Prompt = LONG_FORM_ANSWER_PROMPT - nli_statements_message: Prompt = NLI_STATEMENTS_MESSAGE + long_form_answer_prompt: Prompt = field( + default_factory=lambda: LONG_FORM_ANSWER_PROMPT + ) + nli_statements_message: Prompt = field( + default_factory=lambda: NLI_STATEMENTS_MESSAGE + ) batch_size: int = 15 def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: diff --git a/src/ragas/metrics/critique.py b/src/ragas/metrics/critique.py index 1b0f88dc8..a4b3db30b 100644 --- a/src/ragas/metrics/critique.py +++ b/src/ragas/metrics/critique.py @@ -66,7 +66,7 @@ class AspectCritique(MetricWithLLM): name: str = field(default="", repr=True) # type: ignore evaluation_mode: EvaluationMode = EvaluationMode.qac # type: ignore - critic_prompt: Prompt = CRITIQUE_PROMPT + critic_prompt: Prompt = field(default_factory=lambda: CRITIQUE_PROMPT) definition: str = field(default="", repr=True) strictness: int = field(default=1, repr=False) batch_size: int = field(default=15, repr=False)