From 99e4848df6a4b53ec59eb89bc4394c3103006eb8 Mon Sep 17 00:00:00 2001 From: sahusiddharth Date: Mon, 12 Aug 2024 23:16:14 +0530 Subject: [PATCH 01/14] Basic implementation of Noise sensitivity metrics from RAGChecker --- .../testset/generators/base.py | 3 +- .../testset/questions/base.py | 12 +- src/ragas/async_utils.py | 1 + src/ragas/integrations/langchain.py | 6 +- src/ragas/llms/base.py | 6 +- src/ragas/metrics/__init__.py | 2 + src/ragas/metrics/_faithfulness.py | 3 +- src/ragas/metrics/_noise_sensitivity.py | 375 ++++++++++++++++++ src/ragas/metrics/base.py | 9 +- src/ragas/testset/docstore.py | 21 +- src/ragas/testset/evolutions.py | 6 +- src/ragas/testset/extractor.py | 3 +- 12 files changed, 404 insertions(+), 43 deletions(-) create mode 100644 src/ragas/metrics/_noise_sensitivity.py diff --git a/src/experimental/ragas_experimental/testset/generators/base.py b/src/experimental/ragas_experimental/testset/generators/base.py index ce40adc38..1a9730f5a 100644 --- a/src/experimental/ragas_experimental/testset/generators/base.py +++ b/src/experimental/ragas_experimental/testset/generators/base.py @@ -48,8 +48,7 @@ def generate( docs: t.Sequence[Document], test_size: int, distribution: QADistribution, - ) -> TestDataset: - ... + ) -> TestDataset: ... def generate_with_langchain_docs( self, diff --git a/src/experimental/ragas_experimental/testset/questions/base.py b/src/experimental/ragas_experimental/testset/questions/base.py index 1ab7c9c38..fc2290192 100644 --- a/src/experimental/ragas_experimental/testset/questions/base.py +++ b/src/experimental/ragas_experimental/testset/questions/base.py @@ -46,12 +46,12 @@ class QAC: @dataclass class StyleLengthDistribution: - style_length_distribution: t.Dict[ - t.Tuple[QuestionStyle, QuestionLength], float - ] = field( - default_factory=lambda: { - (QuestionStyle.PERFECT_GRAMMAR, QuestionLength.MEDIUM): 1.0 - } + style_length_distribution: t.Dict[t.Tuple[QuestionStyle, QuestionLength], float] = ( + field( + default_factory=lambda: { + (QuestionStyle.PERFECT_GRAMMAR, QuestionLength.MEDIUM): 1.0 + } + ) ) def __post_init__(self): diff --git a/src/ragas/async_utils.py b/src/ragas/async_utils.py index c365ac808..6937b4617 100644 --- a/src/ragas/async_utils.py +++ b/src/ragas/async_utils.py @@ -1,4 +1,5 @@ """Async utils.""" + import asyncio from typing import Any, Coroutine, List diff --git a/src/ragas/integrations/langchain.py b/src/ragas/integrations/langchain.py index 44279187f..99f03b2ed 100644 --- a/src/ragas/integrations/langchain.py +++ b/src/ragas/integrations/langchain.py @@ -48,9 +48,9 @@ def __init__(self, metric: Metric, **kwargs: t.Any): t.cast(MetricWithLLM, self.metric).llm = LangchainLLMWrapper(llm) if isinstance(self.metric, MetricWithEmbeddings): embeddings = get_or_init(kwargs, "embeddings", OpenAIEmbeddings) - t.cast( - MetricWithEmbeddings, self.metric - ).embeddings = LangchainEmbeddingsWrapper(embeddings) + t.cast(MetricWithEmbeddings, self.metric).embeddings = ( + LangchainEmbeddingsWrapper(embeddings) + ) self.metric.init(run_config) @property diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py index c41747da0..2642b3c54 100644 --- a/src/ragas/llms/base.py +++ b/src/ragas/llms/base.py @@ -62,8 +62,7 @@ def generate_text( temperature: float = 1e-8, stop: t.Optional[t.List[str]] = None, callbacks: Callbacks = None, - ) -> LLMResult: - ... + ) -> LLMResult: ... 
@abstractmethod async def agenerate_text( @@ -73,8 +72,7 @@ async def agenerate_text( temperature: t.Optional[float] = None, stop: t.Optional[t.List[str]] = None, callbacks: Callbacks = None, - ) -> LLMResult: - ... + ) -> LLMResult: ... async def generate( self, diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index e22d3fa57..dc29e2496 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -13,6 +13,7 @@ ) from ragas.metrics._context_recall import ContextRecall, context_recall from ragas.metrics._faithfulness import Faithfulness, faithfulness +from ragas.metrics._noise_sensitivity import noise_sensitivity from ragas.metrics._summarization import SummarizationScore, summarization_score from ragas.metrics.critique import AspectCritique @@ -36,4 +37,5 @@ "context_entity_recall", "SummarizationScore", "summarization_score", + "noise_sensitivity", ] diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 8db5807e9..fe8b2715e 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -21,8 +21,7 @@ class HasSegmentMethod(Protocol): - def segment(self, text) -> Any: - ... + def segment(self, text) -> Any: ... logger = logging.getLogger(__name__) diff --git a/src/ragas/metrics/_noise_sensitivity.py b/src/ragas/metrics/_noise_sensitivity.py new file mode 100644 index 000000000..4ac2ff8f2 --- /dev/null +++ b/src/ragas/metrics/_noise_sensitivity.py @@ -0,0 +1,375 @@ +from __future__ import annotations + +import inspect +import json +import logging +import typing as t +from dataclasses import dataclass, field + +import numpy as np +from langchain_core.pydantic_v1 import BaseModel, Field + +from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions +from ragas.llms.prompt import Prompt +from ragas.metrics.base import EvaluationMode, MetricWithLLM, ensembler, get_segmenter + +if t.TYPE_CHECKING: + from langchain_core.callbacks import Callbacks + + from ragas.llms.prompt import PromptValue + +from typing import Any, Protocol + + +class HasSegmentMethod(Protocol): + def segment(self, text) -> Any: ... + + +logger = logging.getLogger(__name__) + + +class Statements(BaseModel): + sentence_index: int = Field( + ..., description="Index of the sentence from the statement list" + ) + simpler_statements: t.List[str] = Field(..., description="the simpler statements") + + +class StatementsAnswers(BaseModel): + __root__: t.List[Statements] + + def dicts(self) -> t.List[t.Dict]: + return self.dict()["__root__"] + + +_statements_output_instructions = get_json_format_instructions(StatementsAnswers) +_statements_output_parser = RagasoutputParser(pydantic_object=StatementsAnswers) + + +LONG_FORM_ANSWER_PROMPT = Prompt( + name="long_form_answer", + output_format_instruction=_statements_output_instructions, + instruction="Given a question, an answer, and sentences from the answer analyze the complexity of each sentence given under 'sentences' and break down each sentence into one or more fully understandable statements while also ensuring no pronouns are used in each statement. Format the outputs in JSON.", + examples=[ + { + "question": "Who was Albert Einstein and what is he best known for?", + "answer": "He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. 
He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.", + "sentences": """ + 0:He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. + 1:He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics. + """, + "analysis": StatementsAnswers.parse_obj( + [ + { + "sentence_index": 0, + "simpler_statements": [ + "Albert Einstein was a German-born theoretical physicist.", + "Albert Einstein is recognized as one of the greatest and most influential physicists of all time.", + ], + }, + { + "sentence_index": 1, + "simpler_statements": [ + "Albert Einstein was best known for developing the theory of relativity.", + "Albert Einstein also made important contributions to the development of the theory of quantum mechanics.", + ], + }, + ] + ).dicts(), + } + ], + input_keys=["question", "answer", "sentences"], + output_key="analysis", + language="english", +) + + +class StatementFaithfulnessAnswer(BaseModel): + statement: str = Field(..., description="the original statement, word-by-word") + reason: str = Field(..., description="the reason of the verdict") + verdict: int = Field(..., description="the verdict(0/1) of the faithfulness.") + + +class StatementFaithfulnessAnswers(BaseModel): + __root__: t.List[StatementFaithfulnessAnswer] + + def dicts(self) -> t.List[t.Dict]: + return self.dict()["__root__"] + + +_faithfulness_output_instructions = get_json_format_instructions( + StatementFaithfulnessAnswers +) +_faithfulness_output_parser = RagasoutputParser( + pydantic_object=StatementFaithfulnessAnswers +) + +NLI_STATEMENTS_MESSAGE = Prompt( + name="nli_statements", + instruction="Your task is to judge the faithfulness of a series of statements based on a given context. For each statement you must return verdict as 1 if the statement can be directly inferred based on the context or 0 if the statement can not be directly inferred based on the context.", + output_format_instruction=_faithfulness_output_instructions, + examples=[ + { + "context": """John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects.""", + "statements": [ + "John is majoring in Biology.", + "John is taking a course on Artificial Intelligence.", + "John is a dedicated student.", + "John has a part-time job.", + ], + "answer": StatementFaithfulnessAnswers.parse_obj( + [ + { + "statement": "John is majoring in Biology.", + "reason": "John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology.", + "verdict": 0, + }, + { + "statement": "John is taking a course on Artificial Intelligence.", + "reason": "The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI.", + "verdict": 0, + }, + { + "statement": "John is a dedicated student.", + "reason": "The context states that he spends a significant amount of time studying and completing assignments. 
Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication.", + "verdict": 1, + }, + { + "statement": "John has a part-time job.", + "reason": "There is no information given in the context about John having a part-time job.", + "verdict": 0, + }, + ] + ).dicts(), + }, + { + "context": """Photosynthesis is a process used by plants, algae, and certain bacteria to convert light energy into chemical energy.""", + "statements": ["Albert Einstein was a genius."], + "answer": StatementFaithfulnessAnswers.parse_obj( + [ + { + "statement": "Albert Einstein was a genius.", + "reason": "The context and statement are unrelated", + "verdict": 0, + } + ] + ).dicts(), + }, + ], + input_keys=["context", "statements"], + output_key="answer", + output_type="json", + language="english", +) # noqa: E501 + + +@dataclass +class NoiseSensitivity(MetricWithLLM): + name: str = "noise_sensitivity" # type: ignore + evaluation_mode: EvaluationMode = EvaluationMode.qga # type: ignore + nli_statements_message: Prompt = field( + default_factory=lambda: NLI_STATEMENTS_MESSAGE + ) + statement_prompt: Prompt = field(default_factory=lambda: LONG_FORM_ANSWER_PROMPT) + sentence_segmenter: t.Optional[HasSegmentMethod] = None + max_retries: int = 1 + _reproducibility: int = 1 + + @property + def reproducibility(self): + return self._reproducibility + + @reproducibility.setter + def reproducibility(self, value): + if value < 1: + logger.warning("reproducibility cannot be less than 1, setting to 1") + value = 1 + elif value % 2 == 0: + logger.warning( + "reproducibility level cannot be set to even number, setting to odd" + ) + value += 1 + self._reproducibility = value + + def __post_init__(self): + if self.sentence_segmenter is None: + language = self.nli_statements_message.language + self.sentence_segmenter = get_segmenter(language=language, clean=False) + + def _create_nli_prompt(self, contexts: str, statements: t.List[str]) -> PromptValue: + assert self.llm is not None, "llm must be set to compute score" + + # check if the statements are support in the contexts + contexts_str: str = "\n".join(contexts) + statements_str: str = json.dumps(statements) + prompt_value = self.nli_statements_message.format( + context=contexts_str, statements=statements_str + ) + return prompt_value + + def _create_statements_prompt(self, text: str, question: str) -> PromptValue: + assert self.sentence_segmenter is not None, "sentence_segmenter is not set" + # contexts = row["contexts"] + sentences = self.sentence_segmenter.segment(text) + sentences = [ + sentence for sentence in sentences if sentence.strip().endswith(".") + ] + sentences = "\n".join([f"{i}:{x}" for i, x in enumerate(sentences)]) + prompt_value = self.statement_prompt.format( + question=question, answer=text, sentences=sentences + ) + return prompt_value + + async def _evaluate_statement_faithfulness( + self, statements, context: str, callbacks: Callbacks + ): + assert self.llm is not None, "LLM is not set" + + p_value = self._create_nli_prompt(context, statements) + nli_result = await self.llm.generate( + p_value, + callbacks=callbacks, + n=self._reproducibility, + ) + + nli_result_text = [ + nli_result.generations[0][i].text for i in range(self._reproducibility) + ] + faithfulness_list = [ + await _faithfulness_output_parser.aparse( + text, p_value, self.llm, self.max_retries + ) + for text in nli_result_text + ] + + faithfulness_list = [ + faith.dicts() for faith in faithfulness_list if faith is not None + ] + + if 
faithfulness_list: + faithfulness_list = ensembler.from_discrete( + faithfulness_list, + "verdict", + ) + + faithfulness_list = StatementFaithfulnessAnswers.parse_obj( + faithfulness_list + ) + + verdict_list = [ + 1 if statement.verdict else 0 + for statement in faithfulness_list.__root__ + ] + return np.array(verdict_list) + else: + return np.nan + + async def _decompose_answer_into_statements( + self, text: str, question: str, callbacks: Callbacks + ): + assert self.llm is not None, "LLM is not set" + + p_value = self._create_statements_prompt(text, question) + + if inspect.iscoroutinefunction(self.llm.generate): + statements_gen = await self.llm.generate( + p_value, + callbacks=callbacks, + ) + else: + statements_gen = self.llm.generate( + p_value, + callbacks=callbacks, + ) + + # Await the aparse method + statements = await _statements_output_parser.aparse( + statements_gen.generations[0][0].text, p_value, self.llm, self.max_retries # type: ignore + ) + + if statements is None: + return np.nan + + # Ensure statements is not a coroutine before calling dicts() + if inspect.iscoroutine(statements): + statements = await statements + + # Add error handling and logging + if not hasattr(statements, "dicts"): + logging.error(f"Unexpected type for statements: {type(statements)}") + logging.error(f"Statements content: {statements}") + raise AttributeError( + f"'statements' object of type {type(statements)} has no attribute 'dicts'" + ) + + statements = [item["simpler_statements"] for item in statements.dicts()] + statements = [item for sublist in statements for item in sublist] + + return statements + + def _compute_score(self, answers: t.Dict): + # check the verdicts and compute the score + relevant_retrieved = np.max(answers["retrieved2answer"], axis=0, keepdims=True) + relevant_faithful = np.max( + relevant_retrieved & answers["retrieved2response"], axis=1 + ) + incorrect = ~answers["answer2response"] + + noise_sensitivity_in_relevant = np.mean(relevant_faithful & incorrect) + return noise_sensitivity_in_relevant + + async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: + """ + returns the NLI score for each (q, c, a) pair + """ + assert self.llm is not None, "LLM is not set" + + gt_statements = await self._decompose_answer_into_statements( + row["ground_truth"], row["question"], callbacks + ) + ans_statements = await self._decompose_answer_into_statements( + row["answer"], row["question"], callbacks + ) + + answers_verdictslist = [] + response_verdictslist = [] + + for ctx in row["contexts"]: + verdicts = await self._evaluate_statement_faithfulness( + gt_statements, ctx, callbacks + ) + answers_verdictslist.append(verdicts) + + verdicts = await self._evaluate_statement_faithfulness( + ans_statements, ctx, callbacks + ) + response_verdictslist.append(verdicts) + + answers = {} + answers["retrieved2answer"] = np.array(answers_verdictslist).T + answers["retrieved2response"] = np.array(response_verdictslist).T + answers["answer2response"] = await self._evaluate_statement_faithfulness( + ans_statements, row["ground_truth"], callbacks + ) + return self._compute_score(answers) + + def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: + assert self.llm is not None, "LLM is not set" + + logger.info(f"Adapting Faithfulness metric to {language}") + + self.nli_statements_message = self.nli_statements_message.adapt( + language, self.llm, cache_dir + ) + self.statement_prompt = self.statement_prompt.adapt( + language, self.llm, cache_dir + ) + + 
self.sentence_segmenter = get_segmenter(language=language, clean=False) + + def save(self, cache_dir: t.Optional[str] = None) -> None: + self.nli_statements_message.save(cache_dir) + self.statement_prompt.save(cache_dir) + + +noise_sensitivity = NoiseSensitivity() diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index f619ad12c..5587a3656 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -63,13 +63,11 @@ def get_required_columns( class Metric(ABC): @property @abstractmethod - def name(self) -> str: - ... + def name(self) -> str: ... @property @abstractmethod - def evaluation_mode(self) -> EvaluationMode: - ... + def evaluation_mode(self) -> EvaluationMode: ... @abstractmethod def init(self, run_config: RunConfig): @@ -132,8 +130,7 @@ async def ascore( return score @abstractmethod - async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: - ... + async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: ... @dataclass diff --git a/src/ragas/testset/docstore.py b/src/ragas/testset/docstore.py index 8a9a61e08..f22efd867 100644 --- a/src/ragas/testset/docstore.py +++ b/src/ragas/testset/docstore.py @@ -99,29 +99,23 @@ def __init__(self): self.documents = {} @abstractmethod - def add_documents(self, docs: t.Sequence[Document], show_progress=True): - ... + def add_documents(self, docs: t.Sequence[Document], show_progress=True): ... @abstractmethod - def add_nodes(self, nodes: t.Sequence[Node], show_progress=True): - ... + def add_nodes(self, nodes: t.Sequence[Node], show_progress=True): ... @abstractmethod - def get_node(self, node_id: str) -> Node: - ... + def get_node(self, node_id: str) -> Node: ... @abstractmethod - def get_random_nodes(self, k=1) -> t.List[Node]: - ... + def get_random_nodes(self, k=1) -> t.List[Node]: ... @abstractmethod def get_similar( self, node: Node, threshold: float = 0.7, top_k: int = 3 - ) -> t.Union[t.List[Document], t.List[Node]]: - ... + ) -> t.Union[t.List[Document], t.List[Node]]: ... - def set_run_config(self, run_config: RunConfig): - ... + def set_run_config(self, run_config: RunConfig): ... class SimilarityMode(str, Enum): @@ -197,8 +191,7 @@ class InMemoryDocumentStore(DocumentStore): node_map: t.Dict[str, Node] = field(default_factory=dict) run_config: RunConfig = field(default_factory=RunConfig) - def _embed_items(self, items: t.Union[t.Sequence[Document], t.Sequence[Node]]): - ... + def _embed_items(self, items: t.Union[t.Sequence[Document], t.Sequence[Node]]): ... def add_documents(self, docs: t.Sequence[Document], show_progress=True): """ diff --git a/src/ragas/testset/evolutions.py b/src/ragas/testset/evolutions.py index 16a7e8ee5..096dc2a74 100644 --- a/src/ragas/testset/evolutions.py +++ b/src/ragas/testset/evolutions.py @@ -173,11 +173,9 @@ async def fix_invalid_question( @abstractmethod async def _aevolve( self, current_tries: int, current_nodes: CurrentNodes - ) -> EvolutionOutput: - ... + ) -> EvolutionOutput: ... - async def filter_and_retry(self, question): - ... + async def filter_and_retry(self, question): ... async def generate_datarow( self, diff --git a/src/ragas/testset/extractor.py b/src/ragas/testset/extractor.py index 19126514d..49b6f94a2 100644 --- a/src/ragas/testset/extractor.py +++ b/src/ragas/testset/extractor.py @@ -22,8 +22,7 @@ class Extractor(ABC): llm: BaseRagasLLM @abstractmethod - async def extract(self, node: Node, is_async: bool = True) -> t.Any: - ... + async def extract(self, node: Node, is_async: bool = True) -> t.Any: ... 
def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: """ From dfe069faf7154bbe4d0356f8a9889f0e0e7029c9 Mon Sep 17 00:00:00 2001 From: sahusiddharth Date: Tue, 13 Aug 2024 20:41:43 +0530 Subject: [PATCH 02/14] Implemented requested changes --- src/ragas/metrics/_noise_sensitivity.py | 103 +----------------------- 1 file changed, 3 insertions(+), 100 deletions(-) diff --git a/src/ragas/metrics/_noise_sensitivity.py b/src/ragas/metrics/_noise_sensitivity.py index 4ac2ff8f2..d68380d0c 100644 --- a/src/ragas/metrics/_noise_sensitivity.py +++ b/src/ragas/metrics/_noise_sensitivity.py @@ -11,6 +11,7 @@ from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions from ragas.llms.prompt import Prompt +from ragas.metrics._faithfulness import LONG_FORM_ANSWER_PROMPT, NLI_STATEMENTS_MESSAGE from ragas.metrics.base import EvaluationMode, MetricWithLLM, ensembler, get_segmenter if t.TYPE_CHECKING: @@ -42,48 +43,10 @@ def dicts(self) -> t.List[t.Dict]: return self.dict()["__root__"] -_statements_output_instructions = get_json_format_instructions(StatementsAnswers) +# _statements_output_instructions = get_json_format_instructions(StatementsAnswers) _statements_output_parser = RagasoutputParser(pydantic_object=StatementsAnswers) -LONG_FORM_ANSWER_PROMPT = Prompt( - name="long_form_answer", - output_format_instruction=_statements_output_instructions, - instruction="Given a question, an answer, and sentences from the answer analyze the complexity of each sentence given under 'sentences' and break down each sentence into one or more fully understandable statements while also ensuring no pronouns are used in each statement. Format the outputs in JSON.", - examples=[ - { - "question": "Who was Albert Einstein and what is he best known for?", - "answer": "He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.", - "sentences": """ - 0:He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. - 1:He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics. - """, - "analysis": StatementsAnswers.parse_obj( - [ - { - "sentence_index": 0, - "simpler_statements": [ - "Albert Einstein was a German-born theoretical physicist.", - "Albert Einstein is recognized as one of the greatest and most influential physicists of all time.", - ], - }, - { - "sentence_index": 1, - "simpler_statements": [ - "Albert Einstein was best known for developing the theory of relativity.", - "Albert Einstein also made important contributions to the development of the theory of quantum mechanics.", - ], - }, - ] - ).dicts(), - } - ], - input_keys=["question", "answer", "sentences"], - output_key="analysis", - language="english", -) - - class StatementFaithfulnessAnswer(BaseModel): statement: str = Field(..., description="the original statement, word-by-word") reason: str = Field(..., description="the reason of the verdict") @@ -104,64 +67,6 @@ def dicts(self) -> t.List[t.Dict]: pydantic_object=StatementFaithfulnessAnswers ) -NLI_STATEMENTS_MESSAGE = Prompt( - name="nli_statements", - instruction="Your task is to judge the faithfulness of a series of statements based on a given context. 
For each statement you must return verdict as 1 if the statement can be directly inferred based on the context or 0 if the statement can not be directly inferred based on the context.", - output_format_instruction=_faithfulness_output_instructions, - examples=[ - { - "context": """John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects.""", - "statements": [ - "John is majoring in Biology.", - "John is taking a course on Artificial Intelligence.", - "John is a dedicated student.", - "John has a part-time job.", - ], - "answer": StatementFaithfulnessAnswers.parse_obj( - [ - { - "statement": "John is majoring in Biology.", - "reason": "John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology.", - "verdict": 0, - }, - { - "statement": "John is taking a course on Artificial Intelligence.", - "reason": "The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI.", - "verdict": 0, - }, - { - "statement": "John is a dedicated student.", - "reason": "The context states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication.", - "verdict": 1, - }, - { - "statement": "John has a part-time job.", - "reason": "There is no information given in the context about John having a part-time job.", - "verdict": 0, - }, - ] - ).dicts(), - }, - { - "context": """Photosynthesis is a process used by plants, algae, and certain bacteria to convert light energy into chemical energy.""", - "statements": ["Albert Einstein was a genius."], - "answer": StatementFaithfulnessAnswers.parse_obj( - [ - { - "statement": "Albert Einstein was a genius.", - "reason": "The context and statement are unrelated", - "verdict": 0, - } - ] - ).dicts(), - }, - ], - input_keys=["context", "statements"], - output_key="answer", - output_type="json", - language="english", -) # noqa: E501 - @dataclass class NoiseSensitivity(MetricWithLLM): @@ -199,11 +104,9 @@ def __post_init__(self): def _create_nli_prompt(self, contexts: str, statements: t.List[str]) -> PromptValue: assert self.llm is not None, "llm must be set to compute score" - # check if the statements are support in the contexts - contexts_str: str = "\n".join(contexts) statements_str: str = json.dumps(statements) prompt_value = self.nli_statements_message.format( - context=contexts_str, statements=statements_str + context=contexts, statements=statements_str ) return prompt_value From d89ecb4502adbdfa282e9b5f68388c87bbb67f87 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 14 Aug 2024 10:06:12 +0530 Subject: [PATCH 03/14] remove unnecssary declarations --- src/ragas/metrics/_noise_sensitivity.py | 56 ++++--------------------- 1 file changed, 8 insertions(+), 48 deletions(-) diff --git a/src/ragas/metrics/_noise_sensitivity.py b/src/ragas/metrics/_noise_sensitivity.py index d68380d0c..1a64db57e 100644 --- a/src/ragas/metrics/_noise_sensitivity.py +++ b/src/ragas/metrics/_noise_sensitivity.py @@ -7,11 +7,16 @@ from dataclasses import dataclass, field import numpy as np 
-from langchain_core.pydantic_v1 import BaseModel, Field -from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions from ragas.llms.prompt import Prompt -from ragas.metrics._faithfulness import LONG_FORM_ANSWER_PROMPT, NLI_STATEMENTS_MESSAGE +from ragas.metrics._faithfulness import ( + LONG_FORM_ANSWER_PROMPT, + NLI_STATEMENTS_MESSAGE, + HasSegmentMethod, + StatementFaithfulnessAnswers, + _faithfulness_output_parser, + _statements_output_parser, +) from ragas.metrics.base import EvaluationMode, MetricWithLLM, ensembler, get_segmenter if t.TYPE_CHECKING: @@ -19,55 +24,10 @@ from ragas.llms.prompt import PromptValue -from typing import Any, Protocol - - -class HasSegmentMethod(Protocol): - def segment(self, text) -> Any: ... - logger = logging.getLogger(__name__) -class Statements(BaseModel): - sentence_index: int = Field( - ..., description="Index of the sentence from the statement list" - ) - simpler_statements: t.List[str] = Field(..., description="the simpler statements") - - -class StatementsAnswers(BaseModel): - __root__: t.List[Statements] - - def dicts(self) -> t.List[t.Dict]: - return self.dict()["__root__"] - - -# _statements_output_instructions = get_json_format_instructions(StatementsAnswers) -_statements_output_parser = RagasoutputParser(pydantic_object=StatementsAnswers) - - -class StatementFaithfulnessAnswer(BaseModel): - statement: str = Field(..., description="the original statement, word-by-word") - reason: str = Field(..., description="the reason of the verdict") - verdict: int = Field(..., description="the verdict(0/1) of the faithfulness.") - - -class StatementFaithfulnessAnswers(BaseModel): - __root__: t.List[StatementFaithfulnessAnswer] - - def dicts(self) -> t.List[t.Dict]: - return self.dict()["__root__"] - - -_faithfulness_output_instructions = get_json_format_instructions( - StatementFaithfulnessAnswers -) -_faithfulness_output_parser = RagasoutputParser( - pydantic_object=StatementFaithfulnessAnswers -) - - @dataclass class NoiseSensitivity(MetricWithLLM): name: str = "noise_sensitivity" # type: ignore From e029f085a0b5af7fd55df2d6e75af57be25c2d78 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 14 Aug 2024 11:06:10 +0530 Subject: [PATCH 04/14] convert verdicts to bool --- src/ragas/metrics/_noise_sensitivity.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ragas/metrics/_noise_sensitivity.py b/src/ragas/metrics/_noise_sensitivity.py index 1a64db57e..22a1fa670 100644 --- a/src/ragas/metrics/_noise_sensitivity.py +++ b/src/ragas/metrics/_noise_sensitivity.py @@ -177,7 +177,6 @@ def _compute_score(self, answers: t.Dict): relevant_retrieved & answers["retrieved2response"], axis=1 ) incorrect = ~answers["answer2response"] - noise_sensitivity_in_relevant = np.mean(relevant_faithful & incorrect) return noise_sensitivity_in_relevant @@ -214,6 +213,8 @@ async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: answers["answer2response"] = await self._evaluate_statement_faithfulness( ans_statements, row["ground_truth"], callbacks ) + answers["answer2response"] = np.array([answers["answer2response"]]) + answers = {k: v.astype(bool) for k, v in answers.items()} return self._compute_score(answers) def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: From 8128f80c64a0c48efb757d6013497372219a203d Mon Sep 17 00:00:00 2001 From: sahusiddharth Date: Wed, 14 Aug 2024 23:28:35 +0530 Subject: [PATCH 05/14] added docs --- docs/concepts/metrics/context_recall.md | 2 +- 
docs/concepts/metrics/noise_sensitivity.md | 114 +++++++++++++++++++++ src/ragas/metrics/__init__.py | 3 +- src/ragas/metrics/_noise_sensitivity.py | 48 ++++++--- 4 files changed, 153 insertions(+), 14 deletions(-) create mode 100644 docs/concepts/metrics/noise_sensitivity.md diff --git a/docs/concepts/metrics/context_recall.md b/docs/concepts/metrics/context_recall.md index 553e7355e..aa1611e7a 100644 --- a/docs/concepts/metrics/context_recall.md +++ b/docs/concepts/metrics/context_recall.md @@ -55,7 +55,7 @@ Let's examine how context recall was calculated using the low context recall exa - Statement 1: Yes - Statement 2: No -- **Step 3:** Use the formula depicted above to calculate context recall. +- **Step 4:** Use the formula depicted above to calculate context recall. ```{math} \text{context recall} = { \text{1} \over \text{2} } = 0.5 ``` diff --git a/docs/concepts/metrics/noise_sensitivity.md b/docs/concepts/metrics/noise_sensitivity.md new file mode 100644 index 000000000..1091c3a7a --- /dev/null +++ b/docs/concepts/metrics/noise_sensitivity.md @@ -0,0 +1,114 @@ + + +# Noise Sensitivity + +Noise sensitivity measures how often the system makes a mistake (i.e., gives an incorrect response) when using the retrieved documents. It is computed using `question`, `ground truth`, `answer` and the retrieved `context`, and the values range between 0 and 1, with lower values indicating better performance. +To estimate noise sensitivity, each claim in the answer is analyzed to determine whether it can be attributed to the relevant retrieved context or not. In an ideal scenario, all claims in the answer should be attributable to the relevant retrieved context. + +```{math} +\text{noise sensitivity (relevant)} = {|\text{Number of incorrect claims in answer}| \over |\text{Number of claims in the Answer}|} +``` + +```{Hint} + +Question: What is the Life Insurance Corporation of India (LIC) known for? + +Ground truth: The Life Insurance Corporation of India (LIC) is the largest insurance company in India, established in 1956 through the nationalization of the insurance industry. It is known for managing a large portfolio of investments. + +Relevant Retrieval: + - The Life Insurance Corporation of India (LIC) was established in 1956 following the nationalization of the insurance industry in India. + - LIC is the largest insurance company in India, with a vast network of policyholders and a significant role in the financial sector. + - As the largest institutional investor in India, LIC manages a substantial life fund, contributing to the financial stability of the country. + +Irrelevant Retrieval: + - The Indian economy is one of the fastest-growing major economies in the world, thanks to the secors like finance, technology, manufacturing etc. +``` + + +## Example + +```{code-block} python +:caption: Noise Sensitivity +from datasets import Dataset +from ragas.metrics import noise_sensitivity +from ragas import evaluate + +data_sample = { + "question": ["What is the Life Insurance Corporation of India (LIC) known for?"], + "ground_truth": ["The Life Insurance Corporation of India (LIC) is the largest insurance company in India, established in 1956 through the nationalization of the insurance industry. It is known for managing a large portfolio of investments."], + "answer": ["The Life Insurance Corporation of India (LIC) is the largest insurance company in India, known for its vast portfolio of investments. 
LIC contributs to the financial stability of the country."], + "contexts": [[ + "The Life Insurance Corporation of India (LIC) was established in 1956 following the nationalization of the insurance industry in India.", + "LIC is the largest insurance company in India, with a vast network of policyholders and a huge investments.", + "As the largest institutional investor in India, LIC manages a substantial funds, contributing to the financial stability of the country.", + "The Indian economy is one of the fastest-growing major economies in the world, thanks to the secors like finance, technology, manufacturing etc". + ]] +} + +dataset = Dataset.from_dict(data_sample) +score = evaluate(dataset,metrics=[noise_sensitivity]) +score.to_pandas() +``` +By default, the NoiseSensitivity metric evaluates the noise sensitivity for relevant retrievals. However, you can customize this behavior by specifying the focus parameter when instantiating the NoiseSensitivity class. The `focus` parameter allows you to choose whether to calculate the noise sensitivity for relevant retrievals, irrelevant retrievals, or both. + +```{code-block} python +from datasets import Dataset +from ragas.metrics import NoiseSensitivity +from ragas import evaluate + +# Sample dataset +dataset = Dataset.from_dict(data_sample) + +# Calculate noise sensitivity for irrelevant retrievals +noise_sensitivity = NoiseSensitivity(focus='irrelevant') + +# Alternatively, calculate noise sensitivity for both relevant and irrelevant retrievals +# noise_sensitivity = NoiseSensitivity(focus='both') + +score = evaluate(dataset, metrics=[noise_sensitivity]) +score.to_pandas() +``` +## Calculation + +Let's examine how noise sensitivity in relevant context was calculated: + +- **Step 1:** Identify the relevant contexts from which the ground truth can be inferred. + + - Ground Truth: + The Life Insurance Corporation of India (LIC) is the largest insurance company in India, established in 1956 through the nationalization of the insurance industry. It is known for managing a large portfolio of investments. + + - Contexts: + - Context 1: `The Life Insurance Corporation of India (LIC) was established in 1956` following the nationalization of the insurance industry in India. + - Context 2: `LIC is the largest insurance company in India`, with a vast network of policyholders and a significant role in the financial sector. + - Context 3: `As the largest institutional investor in India, LIC manages a substantial funds`, contributing to the financial stability of the country. + +- **Step 2:** Verify if the claims in the generated answer can be inferred from the relevant context. + + - Answer: + The Life Insurance Corporation of India (LIC) is the largest insurance company in India, known for its vast portfolio of investments. LIC contributs to the financial stability of the country. + + - Contexts: + - Context 1: The Life Insurance Corporation of India (LIC) was established in 1956 following the nationalization of the insurance industry in India. + - Context 2: `LIC is the largest insurance company in India`, with a vast network of policyholders and a significant role in the financial sector. + - Context 3: `As the largest institutional investor in India, LIC manages a substantial funds`, `contributing to the financial stability of the country`. + + +- **Step 3:** Identify any incorrect claims in the answer (i.e., answer statements that are not supported by the ground truth). 
+ + - Ground Truth: + The Life Insurance Corporation of India (LIC) is the largest insurance company in India, established in 1956 through the nationalization of the insurance industry. It is known for managing a large portfolio of investments. + + - Answer: + The Life Insurance Corporation of India (LIC) is the largest insurance company in India, known for its vast portfolio of investments. `LIC contributs to the financial stability of the country`. + + Explanation: The ground truth does not mention anything about LIC contributing to the financial stability of the country. Therefore, this statement in the answer is incorrect. + + Incorrect Statement: 1 + Total claims: 3 + +- **Step 4:** Calculate noise sensitivity using the formula: + ```{math} + \text{noise sensitivity} = { \text{1} \over \text{3} } = 0.333 + ``` +This results in a noise sensitivity score of 0.333, indicating that one out of three claims in the answer was incorrect. + diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index cbc5d2b79..98c527f86 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -13,7 +13,7 @@ ) from ragas.metrics._context_recall import ContextRecall, context_recall from ragas.metrics._faithfulness import Faithfulness, FaithulnesswithHHEM, faithfulness -from ragas.metrics._noise_sensitivity import noise_sensitivity +from ragas.metrics._noise_sensitivity import NoiseSensitivity, noise_sensitivity from ragas.metrics._summarization import SummarizationScore, summarization_score from ragas.metrics.critique import AspectCritique @@ -39,4 +39,5 @@ "SummarizationScore", "summarization_score", "noise_sensitivity", + "NoiseSensitivity", ] diff --git a/src/ragas/metrics/_noise_sensitivity.py b/src/ragas/metrics/_noise_sensitivity.py index 22a1fa670..7e206a015 100644 --- a/src/ragas/metrics/_noise_sensitivity.py +++ b/src/ragas/metrics/_noise_sensitivity.py @@ -31,6 +31,7 @@ @dataclass class NoiseSensitivity(MetricWithLLM): name: str = "noise_sensitivity" # type: ignore + focus: str = "relevant" evaluation_mode: EvaluationMode = EvaluationMode.qga # type: ignore nli_statements_message: Prompt = field( default_factory=lambda: NLI_STATEMENTS_MESSAGE @@ -60,6 +61,10 @@ def __post_init__(self): if self.sentence_segmenter is None: language = self.nli_statements_message.language self.sentence_segmenter = get_segmenter(language=language, clean=False) + if self.focus not in {"relevant", "irrelevant"}: # "all" + raise ValueError( + f"Invalid argument passed for 'focus': {self.focus}. Must be 'relevant', 'irrelevant', or 'both'." 
+ ) def _create_nli_prompt(self, contexts: str, statements: t.List[str]) -> PromptValue: assert self.llm is not None, "llm must be set to compute score" @@ -172,12 +177,32 @@ async def _decompose_answer_into_statements( def _compute_score(self, answers: t.Dict): # check the verdicts and compute the score - relevant_retrieved = np.max(answers["retrieved2answer"], axis=0, keepdims=True) + # relevant retrievals + relevant_retrieved = np.max( + answers["retrieved2ground_truth"], axis=0, keepdims=True + ) relevant_faithful = np.max( - relevant_retrieved & answers["retrieved2response"], axis=1 + relevant_retrieved & answers["retrieved2answer"], axis=1 + ) + + # irrelevant retrievals + irrelevant_retrieved = ~np.max( + answers["retrieved2ground_truth"], axis=0, keepdims=True + ) + irrelevant_faithful = np.max( + irrelevant_retrieved & answers["retrieved2answer"], axis=1 ) - incorrect = ~answers["answer2response"] + + irrelevant_faithful &= ~relevant_faithful # to keep them exclusive + + incorrect = ~answers["ground_truth2answer"] noise_sensitivity_in_relevant = np.mean(relevant_faithful & incorrect) + noise_sensitivity_in_irrelevant = np.mean(irrelevant_faithful & incorrect) + + if self.focus == "irrelevant": + return noise_sensitivity_in_irrelevant + # elif self.focus == "all": + # return noise_sensitivity_in_relevant, noise_sensitivity_in_irrelevant return noise_sensitivity_in_relevant async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: @@ -192,28 +217,27 @@ async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: ans_statements = await self._decompose_answer_into_statements( row["answer"], row["question"], callbacks ) - - answers_verdictslist = [] - response_verdictslist = [] + gt_verdictslist = [] + ans_verdictslist = [] for ctx in row["contexts"]: verdicts = await self._evaluate_statement_faithfulness( gt_statements, ctx, callbacks ) - answers_verdictslist.append(verdicts) + gt_verdictslist.append(verdicts) verdicts = await self._evaluate_statement_faithfulness( ans_statements, ctx, callbacks ) - response_verdictslist.append(verdicts) + ans_verdictslist.append(verdicts) answers = {} - answers["retrieved2answer"] = np.array(answers_verdictslist).T - answers["retrieved2response"] = np.array(response_verdictslist).T - answers["answer2response"] = await self._evaluate_statement_faithfulness( + answers["retrieved2ground_truth"] = np.array(gt_verdictslist).T + answers["retrieved2answer"] = np.array(ans_verdictslist).T + answers["ground_truth2answer"] = await self._evaluate_statement_faithfulness( ans_statements, row["ground_truth"], callbacks ) - answers["answer2response"] = np.array([answers["answer2response"]]) + answers["ground_truth2answer"] = np.array([answers["ground_truth2answer"]]) answers = {k: v.astype(bool) for k, v in answers.items()} return self._compute_score(answers) From 574323e0c237976bc93af59dfad42d2c7f8aa6f1 Mon Sep 17 00:00:00 2001 From: sahusiddharth Date: Wed, 14 Aug 2024 23:33:04 +0530 Subject: [PATCH 06/14] corrected the numbering --- docs/concepts/metrics/context_recall.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/concepts/metrics/context_recall.md b/docs/concepts/metrics/context_recall.md index aa1611e7a..553e7355e 100644 --- a/docs/concepts/metrics/context_recall.md +++ b/docs/concepts/metrics/context_recall.md @@ -55,7 +55,7 @@ Let's examine how context recall was calculated using the low context recall exa - Statement 1: Yes - Statement 2: No -- **Step 4:** Use the formula depicted above to 
calculate context recall. +- **Step 3:** Use the formula depicted above to calculate context recall. ```{math} \text{context recall} = { \text{1} \over \text{2} } = 0.5 ``` From 9731ef7222e762a9360e5102545ad5e7fddf4d9d Mon Sep 17 00:00:00 2001 From: sahusiddharth Date: Wed, 14 Aug 2024 23:54:39 +0530 Subject: [PATCH 07/14] modified content --- docs/concepts/metrics/noise_sensitivity.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/concepts/metrics/noise_sensitivity.md b/docs/concepts/metrics/noise_sensitivity.md index 1091c3a7a..54b84a120 100644 --- a/docs/concepts/metrics/noise_sensitivity.md +++ b/docs/concepts/metrics/noise_sensitivity.md @@ -2,8 +2,10 @@ # Noise Sensitivity -Noise sensitivity measures how often the system makes a mistake (i.e., gives an incorrect response) when using the retrieved documents. It is computed using `question`, `ground truth`, `answer` and the retrieved `context`, and the values range between 0 and 1, with lower values indicating better performance. -To estimate noise sensitivity, each claim in the answer is analyzed to determine whether it can be attributed to the relevant retrieved context or not. In an ideal scenario, all claims in the answer should be attributable to the relevant retrieved context. +Noise sensitivity measures how often a system makes errors by providing incorrect responses when utilizing either relevant or irrelevant retrieved documents. The score ranges from 0 to 1, with lower values indicating better performance. Noise sensitivity is computed using the question, ground truth, answer, and the retrieved context. + +To estimate noise sensitivity, each claim in the generated answer is examined to determine whether it is correct based on the ground truth and whether it can be attributed to the relevant (or irrelevant) retrieved context. Ideally, all claims in the answer should be supported by the relevant retrieved context. + ```{math} \text{noise sensitivity (relevant)} = {|\text{Number of incorrect claims in answer}| \over |\text{Number of claims in the Answer}|} @@ -49,7 +51,7 @@ dataset = Dataset.from_dict(data_sample) score = evaluate(dataset,metrics=[noise_sensitivity]) score.to_pandas() ``` -By default, the NoiseSensitivity metric evaluates the noise sensitivity for relevant retrievals. However, you can customize this behavior by specifying the focus parameter when instantiating the NoiseSensitivity class. The `focus` parameter allows you to choose whether to calculate the noise sensitivity for relevant retrievals, irrelevant retrievals, or both. +By default, the NoiseSensitivity metric evaluates the noise sensitivity for relevant retrievals. However, you can customize this behavior by specifying the focus parameter when instantiating the NoiseSensitivity class. The `focus` parameter allows you to choose whether to calculate the noise sensitivity for relevant retrievals, irrelevant retrievals. 
```{code-block} python from datasets import Dataset @@ -62,9 +64,6 @@ dataset = Dataset.from_dict(data_sample) # Calculate noise sensitivity for irrelevant retrievals noise_sensitivity = NoiseSensitivity(focus='irrelevant') -# Alternatively, calculate noise sensitivity for both relevant and irrelevant retrievals -# noise_sensitivity = NoiseSensitivity(focus='both') - score = evaluate(dataset, metrics=[noise_sensitivity]) score.to_pandas() ``` From e9426b689600dbf3374150e9f6deca77513c28c3 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 15 Aug 2024 10:12:44 +0530 Subject: [PATCH 08/14] remove changes from experimental --- .../ragas_experimental/testset/generators/base.py | 3 ++- .../ragas_experimental/testset/questions/base.py | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/experimental/ragas_experimental/testset/generators/base.py b/src/experimental/ragas_experimental/testset/generators/base.py index 1a9730f5a..ce40adc38 100644 --- a/src/experimental/ragas_experimental/testset/generators/base.py +++ b/src/experimental/ragas_experimental/testset/generators/base.py @@ -48,7 +48,8 @@ def generate( docs: t.Sequence[Document], test_size: int, distribution: QADistribution, - ) -> TestDataset: ... + ) -> TestDataset: + ... def generate_with_langchain_docs( self, diff --git a/src/experimental/ragas_experimental/testset/questions/base.py b/src/experimental/ragas_experimental/testset/questions/base.py index fc2290192..1ab7c9c38 100644 --- a/src/experimental/ragas_experimental/testset/questions/base.py +++ b/src/experimental/ragas_experimental/testset/questions/base.py @@ -46,12 +46,12 @@ class QAC: @dataclass class StyleLengthDistribution: - style_length_distribution: t.Dict[t.Tuple[QuestionStyle, QuestionLength], float] = ( - field( - default_factory=lambda: { - (QuestionStyle.PERFECT_GRAMMAR, QuestionLength.MEDIUM): 1.0 - } - ) + style_length_distribution: t.Dict[ + t.Tuple[QuestionStyle, QuestionLength], float + ] = field( + default_factory=lambda: { + (QuestionStyle.PERFECT_GRAMMAR, QuestionLength.MEDIUM): 1.0 + } ) def __post_init__(self): From 529372f35429e4e167a2d884422c06a59263f459 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 15 Aug 2024 10:13:17 +0530 Subject: [PATCH 09/14] minor fixes --- src/ragas/metrics/_noise_sensitivity.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/ragas/metrics/_noise_sensitivity.py b/src/ragas/metrics/_noise_sensitivity.py index 7e206a015..79b2412e3 100644 --- a/src/ragas/metrics/_noise_sensitivity.py +++ b/src/ragas/metrics/_noise_sensitivity.py @@ -61,10 +61,11 @@ def __post_init__(self): if self.sentence_segmenter is None: language = self.nli_statements_message.language self.sentence_segmenter = get_segmenter(language=language, clean=False) - if self.focus not in {"relevant", "irrelevant"}: # "all" + if self.focus not in {"relevant", "irrelevant"}: raise ValueError( - f"Invalid argument passed for 'focus': {self.focus}. Must be 'relevant', 'irrelevant', or 'both'." + f"Invalid argument passed for 'focus': {self.focus}. Must be 'relevant' or 'irrelevant'." 
) + self.name = f"{self.name}_{self.focus}" # type: ignore def _create_nli_prompt(self, contexts: str, statements: t.List[str]) -> PromptValue: assert self.llm is not None, "llm must be set to compute score" @@ -175,8 +176,7 @@ async def _decompose_answer_into_statements( return statements - def _compute_score(self, answers: t.Dict): - # check the verdicts and compute the score + def _compute_score(self, answers: t.Dict) -> float: # relevant retrievals relevant_retrieved = np.max( answers["retrieved2ground_truth"], axis=0, keepdims=True @@ -193,7 +193,8 @@ def _compute_score(self, answers: t.Dict): irrelevant_retrieved & answers["retrieved2answer"], axis=1 ) - irrelevant_faithful &= ~relevant_faithful # to keep them exclusive + # to keep them exclusive + irrelevant_faithful &= ~relevant_faithful incorrect = ~answers["ground_truth2answer"] noise_sensitivity_in_relevant = np.mean(relevant_faithful & incorrect) @@ -201,8 +202,7 @@ def _compute_score(self, answers: t.Dict): if self.focus == "irrelevant": return noise_sensitivity_in_irrelevant - # elif self.focus == "all": - # return noise_sensitivity_in_relevant, noise_sensitivity_in_irrelevant + return noise_sensitivity_in_relevant async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks) -> float: @@ -260,4 +260,5 @@ def save(self, cache_dir: t.Optional[str] = None) -> None: self.statement_prompt.save(cache_dir) -noise_sensitivity = NoiseSensitivity() +noise_sensitivity_relevant = NoiseSensitivity() +noise_sensitivity_irrelevant = NoiseSensitivity(focus="irrelevant") From 2fbd51cab654a6d300c55292166ef975d09065b5 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 15 Aug 2024 10:19:10 +0530 Subject: [PATCH 10/14] reflect changes in docs --- docs/concepts/metrics/noise_sensitivity.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/concepts/metrics/noise_sensitivity.md b/docs/concepts/metrics/noise_sensitivity.md index 54b84a120..7aa716879 100644 --- a/docs/concepts/metrics/noise_sensitivity.md +++ b/docs/concepts/metrics/noise_sensitivity.md @@ -32,7 +32,7 @@ Irrelevant Retrieval: ```{code-block} python :caption: Noise Sensitivity from datasets import Dataset -from ragas.metrics import noise_sensitivity +from ragas.metrics import noise_sensitivity_relevant, noise_sensitivity_irrelevant from ragas import evaluate data_sample = { @@ -43,12 +43,13 @@ data_sample = { "The Life Insurance Corporation of India (LIC) was established in 1956 following the nationalization of the insurance industry in India.", "LIC is the largest insurance company in India, with a vast network of policyholders and a huge investments.", "As the largest institutional investor in India, LIC manages a substantial funds, contributing to the financial stability of the country.", - "The Indian economy is one of the fastest-growing major economies in the world, thanks to the secors like finance, technology, manufacturing etc". + "The Indian economy is one of the fastest-growing major economies in the world, thanks to the secors like finance, technology, manufacturing etc" ]] } dataset = Dataset.from_dict(data_sample) -score = evaluate(dataset,metrics=[noise_sensitivity]) +metrics = [noise_sensitivity_relevant, noise_sensitivity_irrelevant] +score = evaluate(dataset,metrics=metrics) score.to_pandas() ``` By default, the NoiseSensitivity metric evaluates the noise sensitivity for relevant retrievals. However, you can customize this behavior by specifying the focus parameter when instantiating the NoiseSensitivity class. 
The `focus` parameter allows you to choose whether to calculate the noise sensitivity for relevant retrievals, irrelevant retrievals. From a0786f1965cc9849cf44fa6f6436a58c827070ec Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 15 Aug 2024 10:20:58 +0530 Subject: [PATCH 11/14] fix imports --- src/ragas/metrics/__init__.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index 98c527f86..4b7109f53 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -13,7 +13,11 @@ ) from ragas.metrics._context_recall import ContextRecall, context_recall from ragas.metrics._faithfulness import Faithfulness, FaithulnesswithHHEM, faithfulness -from ragas.metrics._noise_sensitivity import NoiseSensitivity, noise_sensitivity +from ragas.metrics._noise_sensitivity import ( + NoiseSensitivity, + noise_sensitivity_irrelevant, + noise_sensitivity_relevant, +) from ragas.metrics._summarization import SummarizationScore, summarization_score from ragas.metrics.critique import AspectCritique @@ -38,6 +42,7 @@ "context_entity_recall", "SummarizationScore", "summarization_score", - "noise_sensitivity", "NoiseSensitivity", + "noise_sensitivity_irrelevant", + "noise_sensitivity_relevant", ] From 8188daa6d8038e1c660ea7088cadee472700a7a0 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 15 Aug 2024 10:29:34 +0530 Subject: [PATCH 12/14] fix typo --- src/ragas/metrics/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index 96dfbcac1..cd2f566e2 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -17,6 +17,7 @@ NoiseSensitivity, noise_sensitivity_irrelevant, noise_sensitivity_relevant, +) from ragas.metrics._rubrics_based import ( LabelledRubricsScore, ReferenceFreeRubricsScore, From 1e846920308bdccd8847f8d5a76786b768f545cc Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 15 Aug 2024 10:52:27 +0530 Subject: [PATCH 13/14] add noise sensitivity to metrics --- docs/concepts/metrics/index.md | 2 ++ docs/concepts/metrics/noise_sensitivity.md | 17 ++--------------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/docs/concepts/metrics/index.md b/docs/concepts/metrics/index.md index ac52ce73d..bbd5ce19b 100644 --- a/docs/concepts/metrics/index.md +++ b/docs/concepts/metrics/index.md @@ -15,6 +15,7 @@ Just like in any machine learning system, the performance of individual componen - [Context precision](context_precision.md) - [Context utilization](context_utilization.md) - [Context entity recall](context_entities_recall.md) +- [Noise Sensitivity](noise_sensitivity.md) - [Summarization Score](summarization_score.md) ```{toctree} @@ -35,6 +36,7 @@ answer_relevance context_precision context_recall context_entities_recall +noise_sensitivity semantic_similarity answer_correctness critique diff --git a/docs/concepts/metrics/noise_sensitivity.md b/docs/concepts/metrics/noise_sensitivity.md index 7aa716879..a07309779 100644 --- a/docs/concepts/metrics/noise_sensitivity.md +++ b/docs/concepts/metrics/noise_sensitivity.md @@ -52,22 +52,7 @@ metrics = [noise_sensitivity_relevant, noise_sensitivity_irrelevant] score = evaluate(dataset,metrics=metrics) score.to_pandas() ``` -By default, the NoiseSensitivity metric evaluates the noise sensitivity for relevant retrievals. However, you can customize this behavior by specifying the focus parameter when instantiating the NoiseSensitivity class. 
The `focus` parameter allows you to choose whether to calculate the noise sensitivity for relevant retrievals, irrelevant retrievals. -```{code-block} python -from datasets import Dataset -from ragas.metrics import NoiseSensitivity -from ragas import evaluate - -# Sample dataset -dataset = Dataset.from_dict(data_sample) - -# Calculate noise sensitivity for irrelevant retrievals -noise_sensitivity = NoiseSensitivity(focus='irrelevant') - -score = evaluate(dataset, metrics=[noise_sensitivity]) -score.to_pandas() -``` ## Calculation Let's examine how noise sensitivity in relevant context was calculated: @@ -112,3 +97,5 @@ Let's examine how noise sensitivity in relevant context was calculated: ``` This results in a noise sensitivity score of 0.333, indicating that one out of three claims in the answer was incorrect. + +Credits: Noise senstivity was introduced in [RAGChecker](https://github.com/amazon-science/RAGChecker/tree/main/ragchecker) \ No newline at end of file From 1acb14b71a71f25d032e7100c45a0a3d86dc4b23 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 15 Aug 2024 10:58:15 +0530 Subject: [PATCH 14/14] remove all unneccessary linting changes --- src/ragas/llms/base.py | 6 ++++-- src/ragas/metrics/_faithfulness.py | 3 ++- src/ragas/metrics/base.py | 9 ++++++--- src/ragas/testset/docstore.py | 21 ++++++++++++++------- src/ragas/testset/evolutions.py | 6 ++++-- src/ragas/testset/extractor.py | 3 ++- 6 files changed, 32 insertions(+), 16 deletions(-) diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py index 71c7bdc3f..93356228e 100644 --- a/src/ragas/llms/base.py +++ b/src/ragas/llms/base.py @@ -63,7 +63,8 @@ def generate_text( temperature: float = 1e-8, stop: t.Optional[t.List[str]] = None, callbacks: Callbacks = None, - ) -> LLMResult: ... + ) -> LLMResult: + ... @abstractmethod async def agenerate_text( @@ -73,7 +74,8 @@ async def agenerate_text( temperature: t.Optional[float] = None, stop: t.Optional[t.List[str]] = None, callbacks: Callbacks = None, - ) -> LLMResult: ... + ) -> LLMResult: + ... async def generate( self, diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py index 76821f019..e072ae070 100644 --- a/src/ragas/metrics/_faithfulness.py +++ b/src/ragas/metrics/_faithfulness.py @@ -21,7 +21,8 @@ class HasSegmentMethod(Protocol): - def segment(self, text) -> Any: ... + def segment(self, text) -> Any: + ... logger = logging.getLogger(__name__) diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index 5587a3656..f619ad12c 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -63,11 +63,13 @@ def get_required_columns( class Metric(ABC): @property @abstractmethod - def name(self) -> str: ... + def name(self) -> str: + ... @property @abstractmethod - def evaluation_mode(self) -> EvaluationMode: ... + def evaluation_mode(self) -> EvaluationMode: + ... @abstractmethod def init(self, run_config: RunConfig): @@ -130,7 +132,8 @@ async def ascore( return score @abstractmethod - async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: ... + async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: + ... @dataclass diff --git a/src/ragas/testset/docstore.py b/src/ragas/testset/docstore.py index f22efd867..8a9a61e08 100644 --- a/src/ragas/testset/docstore.py +++ b/src/ragas/testset/docstore.py @@ -99,23 +99,29 @@ def __init__(self): self.documents = {} @abstractmethod - def add_documents(self, docs: t.Sequence[Document], show_progress=True): ... 
+ def add_documents(self, docs: t.Sequence[Document], show_progress=True): + ... @abstractmethod - def add_nodes(self, nodes: t.Sequence[Node], show_progress=True): ... + def add_nodes(self, nodes: t.Sequence[Node], show_progress=True): + ... @abstractmethod - def get_node(self, node_id: str) -> Node: ... + def get_node(self, node_id: str) -> Node: + ... @abstractmethod - def get_random_nodes(self, k=1) -> t.List[Node]: ... + def get_random_nodes(self, k=1) -> t.List[Node]: + ... @abstractmethod def get_similar( self, node: Node, threshold: float = 0.7, top_k: int = 3 - ) -> t.Union[t.List[Document], t.List[Node]]: ... + ) -> t.Union[t.List[Document], t.List[Node]]: + ... - def set_run_config(self, run_config: RunConfig): ... + def set_run_config(self, run_config: RunConfig): + ... class SimilarityMode(str, Enum): @@ -191,7 +197,8 @@ class InMemoryDocumentStore(DocumentStore): node_map: t.Dict[str, Node] = field(default_factory=dict) run_config: RunConfig = field(default_factory=RunConfig) - def _embed_items(self, items: t.Union[t.Sequence[Document], t.Sequence[Node]]): ... + def _embed_items(self, items: t.Union[t.Sequence[Document], t.Sequence[Node]]): + ... def add_documents(self, docs: t.Sequence[Document], show_progress=True): """ diff --git a/src/ragas/testset/evolutions.py b/src/ragas/testset/evolutions.py index 096dc2a74..16a7e8ee5 100644 --- a/src/ragas/testset/evolutions.py +++ b/src/ragas/testset/evolutions.py @@ -173,9 +173,11 @@ async def fix_invalid_question( @abstractmethod async def _aevolve( self, current_tries: int, current_nodes: CurrentNodes - ) -> EvolutionOutput: ... + ) -> EvolutionOutput: + ... - async def filter_and_retry(self, question): ... + async def filter_and_retry(self, question): + ... async def generate_datarow( self, diff --git a/src/ragas/testset/extractor.py b/src/ragas/testset/extractor.py index 49b6f94a2..19126514d 100644 --- a/src/ragas/testset/extractor.py +++ b/src/ragas/testset/extractor.py @@ -22,7 +22,8 @@ class Extractor(ABC): llm: BaseRagasLLM @abstractmethod - async def extract(self, node: Node, is_async: bool = True) -> t.Any: ... + async def extract(self, node: Node, is_async: bool = True) -> t.Any: + ... def adapt(self, language: str, cache_dir: t.Optional[str] = None) -> None: """
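
For orientation, here is a minimal end-to-end usage sketch of the two metrics this series ends up exporting (`noise_sensitivity_relevant` and `noise_sensitivity_irrelevant`), mirroring the example in the new `docs/concepts/metrics/noise_sensitivity.md`; the sample data below is illustrative only and is not part of the patch.

```python
# Hedged usage sketch: metric names follow the exports added to
# src/ragas/metrics/__init__.py in this series; dataset values are made up.
from datasets import Dataset

from ragas import evaluate
from ragas.metrics import noise_sensitivity_irrelevant, noise_sensitivity_relevant

data = {
    "question": ["What is the Life Insurance Corporation of India (LIC) known for?"],
    "ground_truth": ["LIC is the largest insurance company in India, established in 1956."],
    "answer": ["LIC is the largest insurance company in India and contributes to the financial stability of the country."],
    "contexts": [[
        "LIC was established in 1956 following the nationalization of the insurance industry in India.",
        "The Indian economy is one of the fastest-growing major economies in the world.",
    ]],
}

dataset = Dataset.from_dict(data)
result = evaluate(dataset, metrics=[noise_sensitivity_relevant, noise_sensitivity_irrelevant])
print(result.to_pandas())
```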
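A small NumPy sketch may also help clarify how `_compute_score` (as of the last change to it in this series) combines the three verdict matrices built in `_ascore`; the matrices below are hand-written illustrative values, not output from the LLM judge.

```python
# Hedged sketch of the scoring step only. Shapes follow _ascore():
# rows are statements, columns are retrieved contexts.
import numpy as np

answers = {
    # ground-truth statements x contexts: contexts 0 and 1 support the ground truth (relevant), context 2 does not
    "retrieved2ground_truth": np.array([[1, 1, 0],
                                        [1, 0, 0]], dtype=bool),
    # answer statements x contexts: claims 0 and 1 come from relevant contexts, claim 2 only from the irrelevant one
    "retrieved2answer": np.array([[1, 0, 0],
                                  [0, 1, 0],
                                  [0, 0, 1]], dtype=bool),
    # 1 x answer statements: claim 2 is not supported by the ground truth
    "ground_truth2answer": np.array([[1, 1, 0]], dtype=bool),
}

relevant_retrieved = np.max(answers["retrieved2ground_truth"], axis=0, keepdims=True)
relevant_faithful = np.max(relevant_retrieved & answers["retrieved2answer"], axis=1)
irrelevant_retrieved = ~relevant_retrieved
irrelevant_faithful = np.max(irrelevant_retrieved & answers["retrieved2answer"], axis=1)
irrelevant_faithful &= ~relevant_faithful  # keep the two buckets mutually exclusive
incorrect = ~answers["ground_truth2answer"]

print(np.mean(relevant_faithful & incorrect))    # noise sensitivity (relevant)   -> 0.0
print(np.mean(irrelevant_faithful & incorrect))  # noise sensitivity (irrelevant) -> 0.333...
```

In this toy case the only incorrect claim is attributable to the irrelevant context, so the relevant-noise score is 0 and the irrelevant-noise score is one out of three claims, matching the 1/3 walkthrough in the added documentation.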