From 9f274e2e7cf3abc9a82460e6cd18e7f914348302 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 23 Aug 2023 17:01:04 +0530 Subject: [PATCH 01/15] added context recall --- src/ragas/metrics/context_recall.py | 114 ++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 src/ragas/metrics/context_recall.py diff --git a/src/ragas/metrics/context_recall.py b/src/ragas/metrics/context_recall.py new file mode 100644 index 000000000..4cdb89ef4 --- /dev/null +++ b/src/ragas/metrics/context_recall.py @@ -0,0 +1,114 @@ +from __future__ import annotations + +import typing as t +from dataclasses import dataclass + +from datasets import Dataset +from langchain.callbacks.manager import CallbackManager, trace_as_chain_group +from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate + +from ragas.metrics.base import EvaluationMode, MetricWithLLM +from ragas.metrics.llms import generate + +CONTEXT_RECALL_RC = HumanMessagePromptTemplate.from_template( + """ + +Given text 1 and text 2, Analyze each sentence from text 2 and classify if the sentence is also present in text 1 or not. + +text 1: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect". +text 2: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist,widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius. +classification: +1. Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. [Present] +2. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics. [Present] +3. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect". [Present] +4. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". [Not Present] +5. His work is also known for its influence on the philosophy of science. [Not Present] +6. 
In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. [Not Present]
+7. His intellectual achievements and originality have made Einstein synonymous with genius. [Not Present]
+
+text 1: {context}
+text 2: {ground_truth}
+classification:
+""" # noqa: E501
+)
+
+CONTEXT_RECALL_RA = HumanMessagePromptTemplate.from_template(
+    """
+Given a context and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not.
+Think in steps and reason before coming to a conclusion.
+
+context: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist,widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius.
+answer: Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He published 4 papers in 1905. Einstein moved to Switzerland in 1895
+classification
+1. Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. This is mentioned clearly in the context. [Attributed]
+2. He published 4 papers in 1905. There is no mention of papers he wrote in the context. [Not Attributed]
+3. Einstein moved to Switzerland in 1895. This sentence cannot be deduced from the context. [Not Attributed]
+
+context:{context}
+answer:{ground_truth}
+classification:
+""" # noqa: E501
+)
+
+
+@dataclass
+class ContextRecall(MetricWithLLM):
+    name: str = "context_recall"
+    reference: str = "reference_answer"
+    evaluation_mode: EvaluationMode = EvaluationMode.gc
+    batch_size: int = 15
+
+    def __post_init__(self: t.Self):
+        if self.reference == "reference_answer":
+            self.prompt_format = CONTEXT_RECALL_RA
+            self.verdict_token = "[Attributed]"
+        elif self.reference == "reference_context":
+            self.prompt_format = CONTEXT_RECALL_RC
+            self.verdict_token = "[Present]"
+        else:
+            raise ValueError(
+                "reference must be either reference_answer or reference_context"
+            )
+
+    def init_model(self: t.Self):
+        ... 
+
+    def _score_batch(
+        self: t.Self,
+        dataset: Dataset,
+        callbacks: t.Optional[CallbackManager] = None,
+        callback_group_name: str = "batch",
+    ) -> list:
+        prompts = []
+        ground_truths, contexts = dataset["ground_truths"], dataset["contexts"]
+        with trace_as_chain_group(
+            callback_group_name, callback_manager=callbacks
+        ) as batch_group:
+            for gt, ctx in zip(ground_truths, contexts):
+                gt = "\n".join(gt) if isinstance(gt, list) else gt
+                ctx = "\n".join(ctx) if isinstance(ctx, list) else ctx
+                human_prompt = self.prompt_format.format(context=ctx, ground_truth=gt)
+                prompts.append(ChatPromptTemplate.from_messages([human_prompt]))
+
+            responses: list[list[str]] = []
+            results = generate(
+                prompts,
+                self.llm,
+                n=1,
+                callbacks=batch_group,
+            )
+            responses = [[i.text for i in r] for r in results.generations]
+            scores = []
+            for response in responses:
+                sentences = response[0].split("\n")
+                denom = len(sentences)
+                numerator = sum(
+                    bool(sentence.find(self.verdict_token) != -1)
+                    for sentence in sentences
+                )
+                scores.append(numerator / denom)
+
+        return scores
+
+
+context_recall = ContextRecall(reference="reference_answer")

From 0b112df74bf9f0c01a7e3df503e71210fac049d6 Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Wed, 23 Aug 2023 17:06:03 +0530
Subject: [PATCH 02/15] added context recall to docs

---
 README.md       |  6 ++++--
 docs/metrics.md | 16 ++++++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 31e9fc282..6492b3869 100644
--- a/README.md
+++ b/README.md
@@ -91,9 +91,11 @@ Ragas measures your pipeline's performance against different dimensions

 2. **Context Relevancy**: measures how relevant retrieved contexts are to the question. Ideally, the context should only contain information necessary to answer the question. The presence of redundant information in the context is penalized.

-3. **Answer Relevancy**: refers to the degree to which a response directly addresses and is appropriate for a given question or context. This does not take the factuality of the answer into consideration but rather penalizes the presence of redundant information or incomplete answers given a question.
+3. **Context Recall**: measures the recall of the retrieved context using either the annotated answer or the annotated context as ground truth.

-4. **Aspect Critiques**: Designed to judge the submission against defined aspects like harmlessness, correctness, etc. You can also define your own aspect and validate the submission against your desired aspect. The output of aspect critiques is always binary.
+4. **Answer Relevancy**: refers to the degree to which a response directly addresses and is appropriate for a given question or context. This does not take the factuality of the answer into consideration but rather penalizes the presence of redundant information or incomplete answers given a question.
+
+5. **Aspect Critiques**: Designed to judge the submission against defined aspects like harmlessness, correctness, etc. You can also define your own aspect and validate the submission against your desired aspect. The output of aspect critiques is always binary.

 The final `ragas_score` is the harmonic mean of individual metric scores. 
diff --git a/docs/metrics.md b/docs/metrics.md
index 8f3616a04..8ff750794 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -30,6 +30,22 @@ dataset: Dataset
 results = context_rel.score(dataset)
 ```

+### Context Recall
+This measures the recall of the retrieved context using either the annotated answer or the annotated context as ground truth.
+
+```python
+from ragas.metrics.context_recall import ContextRecall
+context_recall = ContextRecall(reference="reference_answer")
+# Dataset({
+#     features: ['contexts','ground_truths'],
+#     num_rows: 25
+# })
+dataset: Dataset
+
+results = context_recall.score(dataset)
+```
+
+
 ### `AnswerRelevancy`

 This measures how relevant is the generated answer to the prompt. If the generated answer is incomplete or contains redundant information the score will be low. This is quantified by working out the chance of an LLM generating the given question using the generated answer. Values range (0,1), higher the better.

From eb9e31561feb4fef0c028adbd2007149cee84869 Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Wed, 23 Aug 2023 20:53:44 +0530
Subject: [PATCH 03/15] update recall information

---
 README.md       | 2 +-
 docs/metrics.md | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 6492b3869..f0bf40940 100644
--- a/README.md
+++ b/README.md
@@ -91,7 +91,7 @@ Ragas measures your pipeline's performance against different dimensions

 2. **Context Relevancy**: measures how relevant retrieved contexts are to the question. Ideally, the context should only contain information necessary to answer the question. The presence of redundant information in the context is penalized.

-3. **Context Recall**: measures the recall of the retrieved context using either the annotated answer or the annotated context as ground truth.
+3. **Context Recall**: measures the recall of the retrieved context using the annotated answer as ground truth. The annotated answer is taken as a proxy for the ground truth context.

 4. **Answer Relevancy**: refers to the degree to which a response directly addresses and is appropriate for a given question or context. This does not take the factuality of the answer into consideration but rather penalizes the presence of redundant information or incomplete answers given a question.

diff --git a/docs/metrics.md b/docs/metrics.md
index 8ff750794..76a4a55ea 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -31,11 +31,11 @@ results = context_rel.score(dataset)
 ```

 ### Context Recall
-This measures the recall of the retrieved context using either the annotated answer or the annotated context as ground truth.
+Measures the recall of the retrieved context using the annotated answer as ground truth. The annotated answer is taken as a proxy for the ground truth context. 
```python from ragas.metrics.context_recall import ContextRecall -context_recall = ContextRecall(reference="reference_answer") +context_recall = ContextRecall() # Dataset({ # features: ['contexts','ground_truths'], # num_rows: 25 From 4fb41300da48c0c6c3cc945f2fec9c2777b56710 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 23 Aug 2023 20:54:04 +0530 Subject: [PATCH 04/15] rmv sentence comparison --- src/ragas/metrics/context_recall.py | 46 +++++------------------------ 1 file changed, 8 insertions(+), 38 deletions(-) diff --git a/src/ragas/metrics/context_recall.py b/src/ragas/metrics/context_recall.py index 4cdb89ef4..b2d12aa38 100644 --- a/src/ragas/metrics/context_recall.py +++ b/src/ragas/metrics/context_recall.py @@ -10,39 +10,18 @@ from ragas.metrics.base import EvaluationMode, MetricWithLLM from ragas.metrics.llms import generate -CONTEXT_RECALL_RC = HumanMessagePromptTemplate.from_template( - """ - -Given text 1 and text 2, Analyze each sentence from text 2 and classify if the sentence is also present in text 1 or not. - -text 1: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect". -text 2: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist,widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius. -classification: -1. Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. [Present] -2. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics. [Present] -3. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect". [Present] -4. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". [Not Present] -5. His work is also known for its influence on the philosophy of science. [Not Present] -6. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. [Not Present] -7. 
His intellectual achievements and originality have made Einstein synonymous with genius. [Not Present]
-
-text 1: {context}
-text 2: {ground_truth}
-classification:
-""" # noqa: E501
-)
-
 CONTEXT_RECALL_RA = HumanMessagePromptTemplate.from_template(
     """
 Given a context and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not.
 Think in steps and reason before coming to a conclusion.
 
 context: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist,widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius.
-answer: Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He published 4 papers in 1905. Einstein moved to Switzerland in 1895
+answer: Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics. He published 4 papers in 1905. Einstein moved to Switzerland in 1895
 classification
-1. Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. This is mentioned clearly in the context. [Attributed]
-2. He published 4 papers in 1905. There is no mention of papers he wrote in the context. [Not Attributed]
-3. Einstein moved to Switzerland in 1895. This sentence cannot be deduced from the context. [Not Attributed]
+1. Albert Einstein born in 14 March 1879 was German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. The date of birth of Einstein is mentioned clearly in the context. So [Attributed]
+2. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics. The exact sentence is present in the given context. So [Attributed]
+3. He published 4 papers in 1905. There is no mention of papers he wrote in the given context. So [Not Attributed]
+4. Einstein moved to Switzerland in 1895. There is no supporting evidence for this in the given context. 
So [Not Attributed] context:{context} answer:{ground_truth} @@ -54,21 +33,12 @@ @dataclass class ContextRecall(MetricWithLLM): name: str = "context_recall" - reference: str = "reference_answer" evaluation_mode: EvaluationMode = EvaluationMode.gc batch_size: int = 15 def __post_init__(self: t.Self): - if self.reference == "reference_answer": - self.prompt_format = CONTEXT_RECALL_RA - self.verdict_token = "[Attributed]" - elif self.reference == "reference_context": - self.prompt_format = CONTEXT_RECALL_RC - self.verdict_token = "[Present]" - else: - raise ValueError( - "reference must be either reference_answer or reference_context" - ) + self.prompt_format = CONTEXT_RECALL_RA + self.verdict_token = "[Attributed]" def init_model(self: t.Self): ... @@ -111,4 +81,4 @@ def _score_batch( return scores -context_recall = ContextRecall(reference="reference_answer") +context_recall = ContextRecall() From 58f59dc1a28d89f54ad8632e9cd3f8898872b77f Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 23 Aug 2023 20:55:22 +0530 Subject: [PATCH 05/15] added gc to eval mode --- src/ragas/evaluation.py | 6 +++--- src/ragas/validation.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index 40f677af9..b86f46d72 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -15,9 +15,9 @@ def evaluate( dataset: Dataset, metrics: list[Metric] | None = None, column_map: dict[str, str] = { - "question": "question", + # "question": "question", "contexts": "contexts", - "answer": "answer", + # "answer": "answer", "ground_truths": "ground_truths", }, ) -> Result: @@ -77,7 +77,7 @@ def evaluate( metrics = [answer_relevancy, context_relevancy, faithfulness] # select columns from the dataset - dataset = dataset.from_dict({k: dataset[v] for k, v in column_map.items()}) + # dataset = dataset.from_dict({k: dataset[v] for k, v in column_map.items()}) # validation validate_evaluation_modes(dataset, metrics) diff --git a/src/ragas/validation.py b/src/ragas/validation.py index 98c39250c..50c1487d8 100644 --- a/src/ragas/validation.py +++ b/src/ragas/validation.py @@ -29,7 +29,7 @@ def validate_column_dtypes(ds: Dataset): EvaluationMode.qac: ["question", "answer", "contexts"], EvaluationMode.qa: ["question", "answer"], EvaluationMode.qc: ["question", "contexts"], - EvaluationMode.ga: ["ground_truths", "answer"], + EvaluationMode.gc: ["ground_truths", "contexts"], } From 4830840377f34f486c2ee0e98ee156ef33c04edf Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 23 Aug 2023 21:09:02 +0530 Subject: [PATCH 06/15] added doc string --- src/ragas/metrics/context_recall.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/ragas/metrics/context_recall.py b/src/ragas/metrics/context_recall.py index b2d12aa38..61cc3e05a 100644 --- a/src/ragas/metrics/context_recall.py +++ b/src/ragas/metrics/context_recall.py @@ -32,6 +32,18 @@ @dataclass class ContextRecall(MetricWithLLM): + + """ + Estimates context recall by estimating TP and FN using annotated answer and + retrieved context. + + Attributes + ---------- + name : str + batch_size : int + Batch size for openai completion. 
+ """ + name: str = "context_recall" evaluation_mode: EvaluationMode = EvaluationMode.gc batch_size: int = 15 From 81bd31597b72636bafc90f044da1821965c5c6b1 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 23 Aug 2023 21:13:19 +0530 Subject: [PATCH 07/15] undo patchs --- src/ragas/evaluation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index b86f46d72..40f677af9 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -15,9 +15,9 @@ def evaluate( dataset: Dataset, metrics: list[Metric] | None = None, column_map: dict[str, str] = { - # "question": "question", + "question": "question", "contexts": "contexts", - # "answer": "answer", + "answer": "answer", "ground_truths": "ground_truths", }, ) -> Result: @@ -77,7 +77,7 @@ def evaluate( metrics = [answer_relevancy, context_relevancy, faithfulness] # select columns from the dataset - # dataset = dataset.from_dict({k: dataset[v] for k, v in column_map.items()}) + dataset = dataset.from_dict({k: dataset[v] for k, v in column_map.items()}) # validation validate_evaluation_modes(dataset, metrics) From cc7939c7a7fe3effc51ec04d057aed3fd403d1a1 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 23 Aug 2023 21:14:31 +0530 Subject: [PATCH 08/15] add recall to init --- src/ragas/metrics/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py index 826e8fb25..b385e9e26 100644 --- a/src/ragas/metrics/__init__.py +++ b/src/ragas/metrics/__init__.py @@ -1,5 +1,6 @@ from ragas.metrics.answer_relevance import AnswerRelevancy, answer_relevancy from ragas.metrics.context_relevance import ContextRelevancy, context_relevancy +from ragas.metrics.context_recall import ContextRecall, context_recall from ragas.metrics.critique import AspectCritique from ragas.metrics.faithfulnes import Faithfulness, faithfulness @@ -11,4 +12,6 @@ "ContextRelevancy", "context_relevancy", "AspectCritique", + "ContextRecall", + "context_recall" ] From a0e73d18f5f400a5045d91f9eabe2fa30b53c40c Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 23 Aug 2023 21:28:50 +0530 Subject: [PATCH 09/15] added gc --- src/ragas/metrics/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index 605b38547..19868c3b4 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -38,7 +38,7 @@ def make_batches(total_size: int, batch_size: int) -> list[range]: return batches -EvaluationMode = Enum("EvaluationMode", "qac qa qc ga") +EvaluationMode = Enum("EvaluationMode", "qac qa qc gc") @dataclass From 6fbf3059eacc845835f53112db279fbcf4b64530 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 23 Aug 2023 23:12:20 +0530 Subject: [PATCH 10/15] added recall metrics --- docs/quickstart.ipynb | 873 ++++++++++++++++++++++-------------------- 1 file changed, 457 insertions(+), 416 deletions(-) diff --git a/docs/quickstart.ipynb b/docs/quickstart.ipynb index 9e9411953..64883960a 100644 --- a/docs/quickstart.ipynb +++ b/docs/quickstart.ipynb @@ -1,419 +1,460 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "2e63f667", - "metadata": {}, - "source": [ - "

\n", - " \n", - " \"Open\n", - " \n", - "
\n", - "

Quickstart

\n", - "

\n", - "\n", - "welcome to the ragas quickstart. We're going to get you up and running with ragas as qickly as you can so that you can go back to improving your Retrieval Augmented Generation pipelines while this library makes sure your changes are improving your entire pipeline.\n", - "\n", - "to kick things of lets start with the data" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "57585b55", - "metadata": {}, - "outputs": [], - "source": [ - "# if using colab uncomment this\n", - "#!pip install ragas" - ] - }, - { - "cell_type": "markdown", - "id": "c77789bb", - "metadata": {}, - "source": [ - "Ragas also uses OpenAI for running some metrics so make sure you have your openai key ready and available in your environment" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "0b7179f7", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "os.environ[\"OPENAI_API_KEY\"] = \"your-openai-key\"" - ] - }, - { - "cell_type": "markdown", - "id": "06c9fc7d", - "metadata": {}, - "source": [ - "## The Data\n", - "\n", - "Ragas performs a `ground_truth` free evaluation of your RAG pipelines. This is because for most people building a gold labeled dataset which represents in the distribution they get in production is a very expensive process.\n", - "\n", - "Hence to work with ragas all you need are the following data\n", - "- question: `list[str]` - These are the questions you RAG pipeline will be evaluated on. \n", - "- answer: `list[str]` - The answer generated from the RAG pipeline and give to the user.\n", - "- contexts: `list[list[str]]` - The contexts which where passed into the LLM to answer the question.\n", - "\n", - "Ideally your list of questions should reflect the questions your users give, including those that you have been problamatic in the past.\n", - "\n", - "Here we're using an example dataset from on of the baselines we created for the [Financial Opinion Mining and Question Answering (fiqa) Dataset](https://sites.google.com/view/fiqa/) we created. If you want to want to know more about the baseline, feel free to check the `experiements/baseline` section" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "b658e02f", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Found cached dataset fiqa (/home/jjmachan/.cache/huggingface/datasets/explodinggradients___fiqa/ragas_eval/1.0.0/3dc7b639f5b4b16509a3299a2ceb78bf5fe98ee6b5fee25e7d5e4d290c88efb8)\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "986d2c6f72354b10b32d0458fe00a749", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
questionground_truthsanswercontextscontext_ relevancyfaithfulnessanswer_relevancyharmfulness
0How to deposit a cheque issued to an associate...[Have the check reissued to the proper payee.J...\\nThe best way to deposit a cheque issued to a...[Just have the associate sign the back and the...0.1324681.00.9781800
1Can I send a money order from USPS as a business?[Sure you can. You can fill in whatever you w...\\nYes, you can send a money order from USPS as...[Sure you can. You can fill in whatever you w...0.0741751.00.9094810
21 EIN doing business under multiple business n...[You're confusing a lot of things here. Compan...\\nYes, it is possible to have one EIN doing bu...[You're confusing a lot of things here. Compan...0.0000000.50.9164800
\n", - "" - ], - "text/plain": [ - " question \\\n", - "0 How to deposit a cheque issued to an associate... \n", - "1 Can I send a money order from USPS as a business? \n", - "2 1 EIN doing business under multiple business n... \n", - "\n", - " ground_truths \\\n", - "0 [Have the check reissued to the proper payee.J... \n", - "1 [Sure you can. You can fill in whatever you w... \n", - "2 [You're confusing a lot of things here. Compan... \n", - "\n", - " answer \\\n", - "0 \\nThe best way to deposit a cheque issued to a... \n", - "1 \\nYes, you can send a money order from USPS as... \n", - "2 \\nYes, it is possible to have one EIN doing bu... \n", - "\n", - " contexts context_ relevancy \\\n", - "0 [Just have the associate sign the back and the... 0.132468 \n", - "1 [Sure you can. You can fill in whatever you w... 0.074175 \n", - "2 [You're confusing a lot of things here. Compan... 0.000000 \n", - "\n", - " faithfulness answer_relevancy harmfulness \n", - "0 1.0 0.978180 0 \n", - "1 1.0 0.909481 0 \n", - "2 0.5 0.916480 0 " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = result.to_pandas()\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "f668fce1", - "metadata": {}, - "source": [ - "And thats it!\n", - "\n", - "You can check out the [ragas in action] notebook to get a feel of what is like to use it while trying to improve your pipelines.\n", - "\n", - "if you have any suggestion/feedbacks/things your not happy about, please do share it in the [issue section](https://github.com/explodinggradients/ragas/issues). We love hearing from you 😁" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } + "cells": [ + { + "cell_type": "markdown", + "id": "2e63f667", + "metadata": {}, + "source": [ + "

\n", + " \n", + " \"Open\n", + " \n", + "
\n", + "

Quickstart

\n", + "

\n", + "\n", + "welcome to the ragas quickstart. We're going to get you up and running with ragas as qickly as you can so that you can go back to improving your Retrieval Augmented Generation pipelines while this library makes sure your changes are improving your entire pipeline.\n", + "\n", + "to kick things of lets start with the data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "57585b55", + "metadata": {}, + "outputs": [], + "source": [ + "# if using colab uncomment this\n", + "#!pip install ragas" + ] + }, + { + "cell_type": "markdown", + "id": "c77789bb", + "metadata": {}, + "source": [ + "Ragas also uses OpenAI for running some metrics so make sure you have your openai key ready and available in your environment" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0b7179f7", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = \"your-openai-key\"" + ] + }, + { + "cell_type": "markdown", + "id": "06c9fc7d", + "metadata": {}, + "source": [ + "## The Data\n", + "\n", + "Ragas performs a `ground_truth` free evaluation of your RAG pipelines. This is because for most people building a gold labeled dataset which represents in the distribution they get in production is a very expensive process.\n", + "\n", + "Hence to work with ragas all you need are the following data\n", + "- question: `list[str]` - These are the questions you RAG pipeline will be evaluated on. \n", + "- answer: `list[str]` - The answer generated from the RAG pipeline and give to the user.\n", + "- contexts: `list[list[str]]` - The contexts which where passed into the LLM to answer the question.\n", + "\n", + "Ideally your list of questions should reflect the questions your users give, including those that you have been problamatic in the past.\n", + "\n", + "Here we're using an example dataset from on of the baselines we created for the [Financial Opinion Mining and Question Answering (fiqa) Dataset](https://sites.google.com/view/fiqa/) we created. If you want to want to know more about the baseline, feel free to check the `experiements/baseline` section" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b658e02f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/anaconda3/envs/ragas/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "Found cached dataset fiqa (/Users/shahules/.cache/huggingface/datasets/explodinggradients___fiqa/ragas_eval/1.0.0/3dc7b639f5b4b16509a3299a2ceb78bf5fe98ee6b5fee25e7d5e4d290c88efb8)\n",
      "100%|████████████████████████████████████████████████████| 1/1 [00:00<00:00, 284.42it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    baseline: Dataset({\n",
       "        features: ['question', 'ground_truths', 'answer', 'contexts'],\n",
       "        num_rows: 30\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# data\n",
    "from datasets import load_dataset\n",
    "\n",
    "fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")\n",
    "fiqa_eval"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "84aa640f",
   "metadata": {},
   "source": [
    "## Metrics\n",
    "\n",
    "Ragas measures your pipeline's performance against two dimensions\n",
    "\n",
    "1. Faithfulness: measures the factual consistency of the generated answer against the given context.\n",
    "2. Relevancy: measures how relevant retrieved contexts and the generated answer are to the question.\n",
    "\n",
    "Through repeated experiments, we have found that the quality of a RAG pipeline is highly dependent on these two dimensions. The final `ragas_score` is the harmonic mean of these two factors.\n",
    "\n",
    "now let's import these metrics and understand more about what they denote"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f17bcf9d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from ragas.metrics import context_relevancy, answer_relevancy, faithfulness, context_recall\n",
    "from ragas.metrics.critique import harmfulness"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ef8c5e60",
   "metadata": {},
   "source": [
    "here you can see that we are using 4 metrics, but what do they represent?\n",
    "\n",
    "1. context_relevancy - a measure of how relevant the retrieved context is to the question. Conveys the quality of the retrieval pipeline.\n",
    "2. answer_relevancy - a measure of how relevant the answer is to the question.\n",
    "3. faithfulness - the factual consistency of the answer to the context based on the question.\n",
    "4. harmfulness (AspectCritique) - in general, `AspectCritique` is a metric that can be used to quantify various aspects of the answer. Aspects like harmfulness, maliciousness, coherence, correctness, conciseness are available by default but you can easily define your own. Check the [docs](./metrics.md) for more info.\n",
    "\n",
    "**Note:** *by default these metrics are using OpenAI's API to compute the score. If you're using this metric, make sure you set the environment key `OPENAI_API_KEY` with your API key. You can also try other LLMs for evaluation, check the [llm guide](./guides/llms.ipynb) to learn more*\n",
    "\n",
    "If you're interested in learning more, feel free to check the [docs](https://github.com/explodinggradients/ragas/blob/main/docs/metrics.md)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8d6ecd5a",
   "metadata": {},
   "source": [
    "## Evaluation\n",
    "\n",
    "Running the evaluation is as simple as calling evaluate on the `Dataset` with the metrics of your choice."
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "22eb6f97", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evaluating with [context_ relevancy]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████| 1/1 [01:38<00:00, 98.08s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evaluating with [faithfulness]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████| 1/1 [01:21<00:00, 81.37s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evaluating with [answer_relevancy]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████| 1/1 [00:21<00:00, 21.56s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evaluating with [context_recall]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████| 1/1 [01:24<00:00, 84.71s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evaluating with [harmfulness]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████| 1/1 [00:18<00:00, 18.03s/it]\n" + ] + }, + { + "data": { + "text/plain": [ + "{'ragas_score': 0.5482, 'context_ relevancy': 0.3754, 'faithfulness': 0.8667, 'answer_relevancy': 0.9116, 'context_recall': 0.4197, 'harmfulness': 0.0000}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from ragas import evaluate\n", + "\n", + "result = evaluate(\n", + " fiqa_eval[\"baseline\"],\n", + " metrics=[context_relevancy, faithfulness, answer_relevancy, context_recall, harmfulness],\n", + ")\n", + "\n", + "result" + ] + }, + { + "cell_type": "markdown", + "id": "a2dc0ec2", + "metadata": {}, + "source": [ + "and there you have the it, all the scores you need. `ragas_score` gives you a single metric that you can use while the other onces measure the different parts of your pipeline.\n", + "\n", + "now if we want to dig into the results and figure out examples where your pipeline performed worse or really good you can easily convert it into a pandas array and use your standard analytics tools too!" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8686bf53", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
questioncontextsanswerground_truthscontext_ relevancyfaithfulnessanswer_relevancycontext_recallharmfulness
0How to deposit a cheque issued to an associate...[Just have the associate sign the back and the...\\nThe best way to deposit a cheque issued to a...[Have the check reissued to the proper payee.J...0.0883010.6666670.9768700.1111110
1Can I send a money order from USPS as a business?[Sure you can. You can fill in whatever you w...\\nYes, you can send a money order from USPS as...[Sure you can. You can fill in whatever you w...0.2310111.0000000.8835670.8000000
21 EIN doing business under multiple business n...[You're confusing a lot of things here. Compan...\\nYes, it is possible to have one EIN doing bu...[You're confusing a lot of things here. Compan...0.0694201.0000000.9199111.0000000
3Applying for and receiving business credit[Set up a meeting with the bank that handles y...\\nApplying for and receiving business credit c...[\"I'm afraid the great myth of limited liabili...0.9699681.0000000.8903910.1875000
4401k Transfer After Business Closure[The time horizon for your 401K/IRA is essenti...\\nIf your employer has closed and you need to ...[You should probably consult an attorney. Howe...0.5184140.6666670.8874260.0000000
\n", + "
" + ], + "text/plain": [ + " question \\\n", + "0 How to deposit a cheque issued to an associate... \n", + "1 Can I send a money order from USPS as a business? \n", + "2 1 EIN doing business under multiple business n... \n", + "3 Applying for and receiving business credit \n", + "4 401k Transfer After Business Closure \n", + "\n", + " contexts \\\n", + "0 [Just have the associate sign the back and the... \n", + "1 [Sure you can. You can fill in whatever you w... \n", + "2 [You're confusing a lot of things here. Compan... \n", + "3 [Set up a meeting with the bank that handles y... \n", + "4 [The time horizon for your 401K/IRA is essenti... \n", + "\n", + " answer \\\n", + "0 \\nThe best way to deposit a cheque issued to a... \n", + "1 \\nYes, you can send a money order from USPS as... \n", + "2 \\nYes, it is possible to have one EIN doing bu... \n", + "3 \\nApplying for and receiving business credit c... \n", + "4 \\nIf your employer has closed and you need to ... \n", + "\n", + " ground_truths context_ relevancy \\\n", + "0 [Have the check reissued to the proper payee.J... 0.088301 \n", + "1 [Sure you can. You can fill in whatever you w... 0.231011 \n", + "2 [You're confusing a lot of things here. Compan... 0.069420 \n", + "3 [\"I'm afraid the great myth of limited liabili... 0.969968 \n", + "4 [You should probably consult an attorney. Howe... 0.518414 \n", + "\n", + " faithfulness answer_relevancy context_recall harmfulness \n", + "0 0.666667 0.976870 0.111111 0 \n", + "1 1.000000 0.883567 0.800000 0 \n", + "2 1.000000 0.919911 1.000000 0 \n", + "3 1.000000 0.890391 0.187500 0 \n", + "4 0.666667 0.887426 0.000000 0 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = result.to_pandas()\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "f668fce1", + "metadata": {}, + "source": [ + "And thats it!\n", + "\n", + "You can check out the [ragas in action] notebook to get a feel of what is like to use it while trying to improve your pipelines.\n", + "\n", + "if you have any suggestion/feedbacks/things your not happy about, please do share it in the [issue section](https://github.com/explodinggradients/ragas/issues). We love hearing from you 😁" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ragas2", + "language": "python", + "name": "ragas2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } From b4686e1d53a393d93c16fcb48e59291bdfba7226 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 23 Aug 2023 23:13:14 +0530 Subject: [PATCH 11/15] removed post_init --- src/ragas/metrics/context_recall.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/ragas/metrics/context_recall.py b/src/ragas/metrics/context_recall.py index 61cc3e05a..6e5977ca9 100644 --- a/src/ragas/metrics/context_recall.py +++ b/src/ragas/metrics/context_recall.py @@ -48,10 +48,6 @@ class ContextRecall(MetricWithLLM): evaluation_mode: EvaluationMode = EvaluationMode.gc batch_size: int = 15 - def __post_init__(self: t.Self): - self.prompt_format = CONTEXT_RECALL_RA - self.verdict_token = "[Attributed]" - def init_model(self: t.Self): ... 
@@ -61,15 +57,17 @@ def _score_batch( callbacks: t.Optional[CallbackManager] = None, callback_group_name: str = "batch", ) -> list: + verdict_token = "[Attributed]" prompts = [] ground_truths, contexts = dataset["ground_truths"], dataset["contexts"] + with trace_as_chain_group( callback_group_name, callback_manager=callbacks ) as batch_group: for gt, ctx in zip(ground_truths, contexts): gt = "\n".join(gt) if isinstance(gt, list) else gt ctx = "\n".join(ctx) if isinstance(ctx, list) else ctx - human_prompt = self.prompt_format.format(context=ctx, ground_truth=gt) + human_prompt = CONTEXT_RECALL_RA.format(context=ctx, ground_truth=gt) prompts.append(ChatPromptTemplate.from_messages([human_prompt])) responses: list[list[str]] = [] @@ -85,8 +83,7 @@ def _score_batch( sentences = response[0].split("\n") denom = len(sentences) numerator = sum( - bool(sentence.find(self.verdict_token) != -1) - for sentence in sentences + bool(sentence.find(verdict_token) != -1) for sentence in sentences ) scores.append(numerator / denom) From 4c0e2b90cfe717f458765c162cf7d85c6aa3768f Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 24 Aug 2023 01:23:39 +0530 Subject: [PATCH 12/15] added RagasEvalChain importable --- src/ragas/langchain/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ragas/langchain/__init__.py b/src/ragas/langchain/__init__.py index e69de29bb..07570a8fd 100644 --- a/src/ragas/langchain/__init__.py +++ b/src/ragas/langchain/__init__.py @@ -0,0 +1 @@ +from ragas.langchain.evalchain import RagasEvaluatorChain From bdf20ce3dcbbfa47d8251fef50b6c85df997b4db Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 24 Aug 2023 04:10:49 +0530 Subject: [PATCH 13/15] fixed langchain --- docs/integrations/langchain.ipynb | 188 +++++++++++++++++++++++++----- src/ragas/langchain/evalchain.py | 61 ++++++++-- 2 files changed, 209 insertions(+), 40 deletions(-) diff --git a/docs/integrations/langchain.ipynb b/docs/integrations/langchain.ipynb index 40232d010..2e1b2f0d5 100644 --- a/docs/integrations/langchain.ipynb +++ b/docs/integrations/langchain.ipynb @@ -25,6 +25,17 @@ "nest_asyncio.apply()" ] }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8333f65e", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, { "cell_type": "markdown", "id": "842e32dc", @@ -35,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "4aa9a986", "metadata": {}, "outputs": [], @@ -51,23 +62,23 @@ "\n", "llm = ChatOpenAI()\n", "qa_chain = RetrievalQA.from_chain_type(\n", - " llm, retriever=index.vectorstore.as_retriever(), return_source_documents=True\n", + " llm, retriever=index.vectorstore.as_retriever(), return_source_documents=True,\n", ")" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "b0ebdf8d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'New York City was named in honor of the Duke of York, who would become King James II of England. King Charles II appointed the Duke as proprietor of the former territory of New Netherland, including the city of New Amsterdam, when England seized it from Dutch control.'" + "'New York City got its name in 1664 when it was renamed after the Duke of York, who later became King James II of England. 
The city was originally called New Amsterdam by Dutch colonists and was renamed New York when it came under British control.'" ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -90,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "e67ce0e0", "metadata": {}, "outputs": [], @@ -103,7 +114,16 @@ " \"What is the significance of the Statue of Liberty in New York City?\",\n", "]\n", "\n", - "queries = [{\"query\": q} for q in eval_questions]" + "eval_answers = [\n", + " \"8,804,000\", # incorrect answer\n", + " \"Queens\", # incorrect answer\n", + " \"New York City's economic significance is vast, as it serves as the global financial capital, housing Wall Street and major financial institutions. Its diverse economy spans technology, media, healthcare, education, and more, making it resilient to economic fluctuations. NYC is a hub for international business, attracting global companies, and boasts a large, skilled labor force. Its real estate market, tourism, cultural industries, and educational institutions further fuel its economic prowess. The city's transportation network and global influence amplify its impact on the world stage, solidifying its status as a vital economic player and cultural epicenter.\",\n", + " \"New York City got its name when it came under British control in 1664. King Charles II of England granted the lands to his brother, the Duke of York, who named the city New York in his own honor.\",\n", + " 'The Statue of Liberty in New York City holds great significance as a symbol of the United States and its ideals of liberty and peace. It greeted millions of immigrants who arrived in the U.S. by ship in the late 19th and early 20th centuries, representing hope and freedom for those seeking a better life. It has since become an iconic landmark and a global symbol of cultural diversity and freedom.',\n", + "]\n", + "\n", + "examples = [{\"query\": q, \"ground_truths\": [eval_answers[i]]} \n", + " for i, q in enumerate(eval_questions)]" ] }, { @@ -126,18 +146,63 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 10, + "id": "8f89d719", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'The Statue of Liberty in New York City holds great significance as a symbol of the United States and its ideals of liberty and peace. It greeted millions of immigrants who arrived in the U.S. by ship in the late 19th and early 20th centuries, representing hope and freedom for those seeking a better life. 
It has since become an iconic landmark and a global symbol of cultural diversity and freedom.'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = qa_chain({\"query\": eval_questions[4]})\n", + "result[\"result\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "81fa9c47", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'The borough of Brooklyn (Kings County) has the highest population in New York City.'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = qa_chain(examples[1])\n", + "result[\"result\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "id": "1d9266d4", "metadata": {}, "outputs": [], "source": [ "from ragas.langchain.evalchain import RagasEvaluatorChain\n", - "from ragas.metrics import faithfulness, answer_relevancy, context_relevancy\n", + "from ragas.metrics import faithfulness, answer_relevancy, context_relevancy, context_recall\n", "\n", "# create evaluation chains\n", "faithfulness_chain = RagasEvaluatorChain(metric=faithfulness)\n", "answer_rel_chain = RagasEvaluatorChain(metric=answer_relevancy)\n", - "context_rel_chain = RagasEvaluatorChain(metric=context_relevancy)" + "context_rel_chain = RagasEvaluatorChain(metric=context_relevancy)\n", + "context_recall_chain = RagasEvaluatorChain(metric=context_recall)" ] }, { @@ -152,17 +217,17 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 17, "id": "5ede32cd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "1.0" + "0.5" ] }, - "execution_count": 6, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -172,6 +237,28 @@ "eval_result[\"faithfulness_score\"]" ] }, + { + "cell_type": "code", + "execution_count": 18, + "id": "94b5544e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eval_result = context_recall_chain(result)\n", + "eval_result[\"context_recall_score\"]" + ] + }, { "cell_type": "markdown", "id": "f11295b5", @@ -184,7 +271,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 24, "id": "1ce7bff1", "metadata": {}, "outputs": [ @@ -199,7 +286,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|█████████████████████████████████████████████████████████████| 1/1 [00:38<00:00, 38.77s/it]\n" + "100%|█████████████████████████████████████████████████████████████| 1/1 [00:57<00:00, 57.41s/it]\n" ] }, { @@ -207,23 +294,65 @@ "text/plain": [ "[{'faithfulness_score': 1.0},\n", " {'faithfulness_score': 0.5},\n", - " {'faithfulness_score': 0.75},\n", + " {'faithfulness_score': 1.0},\n", " {'faithfulness_score': 1.0},\n", " {'faithfulness_score': 1.0}]" ] }, - "execution_count": 7, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# run the queries as a batch for efficiency\n", - "predictions = qa_chain.batch(queries)\n", + "predictions = qa_chain.batch(examples)\n", "\n", "# evaluate\n", "print(\"evaluating...\")\n", - "r = faithfulness_chain.evaluate(queries, predictions)\n", + "r = faithfulness_chain.evaluate(examples, predictions)\n", + "r" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "55299f14", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "evaluating...\n" + ] + }, + { + "name": 
"stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████████████████████████| 1/1 [00:54<00:00, 54.21s/it]\n" + ] + }, + { + "data": { + "text/plain": [ + "[{'context_recall_score': 0.9333333333333333},\n", + " {'context_recall_score': 0.0},\n", + " {'context_recall_score': 1.0},\n", + " {'context_recall_score': 1.0},\n", + " {'context_recall_score': 1.0}]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# evaluate context recall\n", + "print(\"evaluating...\")\n", + "r = context_recall_chain.evaluate(examples, predictions)\n", "r" ] }, @@ -244,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 48, "id": "e75144c5", "metadata": {}, "outputs": [ @@ -252,7 +381,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "using existing dataset: NYC test\n" + "Created a new dataset: NYC test\n" ] } ], @@ -274,9 +403,10 @@ " dataset = client.create_dataset(\n", " dataset_name=dataset_name, description=\"NYC test dataset\"\n", " )\n", - " for q in eval_questions:\n", + " for e in examples:\n", " client.create_example(\n", - " inputs={\"query\": q},\n", + " inputs={\"query\": e[\"query\"]},\n", + " outputs={\"ground_truths\": e[\"ground_truths\"]},\n", " dataset_id=dataset.id,\n", " )\n", "\n", @@ -297,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 27, "id": "3a6decc6", "metadata": {}, "outputs": [], @@ -322,7 +452,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 49, "id": "25f7992f", "metadata": {}, "outputs": [ @@ -330,8 +460,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "View the evaluation results for project '2023-08-22-19-28-17-RetrievalQA' at:\n", - "https://smith.langchain.com/projects/p/2133d672-b69a-4091-bc96-a4e39d150db5?eval=true\n" + "View the evaluation results for project '2023-08-24-03-36-45-RetrievalQA' at:\n", + "https://smith.langchain.com/projects/p/9fb78371-150e-49cc-a927-b1247fdb9e8d?eval=true\n" ] } ], @@ -339,10 +469,10 @@ "from langchain.smith import RunEvalConfig, run_on_dataset\n", "\n", "evaluation_config = RunEvalConfig(\n", - " custom_evaluators=[faithfulness_chain, answer_rel_chain, context_rel_chain],\n", + " custom_evaluators=[faithfulness_chain, answer_rel_chain, context_rel_chain, context_recall_chain],\n", " prediction_key=\"result\",\n", ")\n", - "\n", + " \n", "result = run_on_dataset(\n", " client,\n", " dataset_name,\n", diff --git a/src/ragas/langchain/evalchain.py b/src/ragas/langchain/evalchain.py index fd5af1210..43d4ad3c9 100644 --- a/src/ragas/langchain/evalchain.py +++ b/src/ragas/langchain/evalchain.py @@ -1,6 +1,7 @@ from __future__ import annotations import typing as t +from collections import defaultdict from datasets import Dataset from langchain.callbacks.manager import CallbackManagerForChainRun @@ -32,6 +33,8 @@ def input_keys(self) -> list[str]: keys = ["query", "result"] if self.metric.evaluation_mode in [EvaluationMode.qac, EvaluationMode.qc]: keys += ["source_documents"] + if self.metric.evaluation_mode in [EvaluationMode.gc]: + keys += ["ground_truths"] return keys @property @@ -58,6 +61,9 @@ def _call( contexts.append(document["page_content"]) else: contexts.append(document.page_content) + ground_truths = [] + if "ground_truths" in inputs: + ground_truths = inputs["ground_truths"] question = inputs["query"] answer = inputs["result"] @@ -66,6 +72,7 @@ def _call( "question": question, "answer": answer, "contexts": contexts, + 
"ground_truths": ground_truths, }, callbacks=callbacks, ) @@ -96,6 +103,11 @@ def _validate( f'"{context_key}" is required in each prediction for the ' f"metric[{self.metric.name}] you have chosen." ) + if "ground_truths" in required_columns and "ground_truths" not in input: + raise ValueError( + f'"ground_truths" is required in each prediction for the ' + f"metric[{self.metric.name}] you have chosen." + ) def evaluate( self, @@ -104,11 +116,12 @@ def evaluate( question_key: str = "query", prediction_key: str = "result", context_key: str = "source_documents", + ground_truths_key: str = "ground_truths", *, callbacks: Callbacks = None, ) -> list[dict]: """Evaluate question answering examples and predictions.""" - question, answer, contexts = [], [], [] + dataset_dict = defaultdict(list) # validation if len(examples) != len(predictions): @@ -122,13 +135,32 @@ def evaluate( {**example, **predictions[i]}, question_key, prediction_key, context_key ) # transform into Dataset that is supported by ragas - question.append(example[question_key]) - answer.append(predictions[i][prediction_key]) - if self.metric.evaluation_mode in [EvaluationMode.qac, EvaluationMode.qc]: - contexts.append([d.page_content for d in predictions[i][context_key]]) - dataset = Dataset.from_dict( - {"question": question, "answer": answer, "contexts": contexts} - ) + if self.metric.evaluation_mode in [ + EvaluationMode.qac, + EvaluationMode.qc, + EvaluationMode.qa, + ]: + dataset_dict["question"].append(example[question_key]) + + if self.metric.evaluation_mode in [EvaluationMode.qac, EvaluationMode.qa]: + dataset_dict["answer"].append(predictions[i][prediction_key]) + + if self.metric.evaluation_mode in [ + EvaluationMode.qac, + EvaluationMode.qc, + EvaluationMode.gc, + ]: + dataset_dict["contexts"].append( + [d.page_content for d in predictions[i][context_key]] + ) + + if self.metric.evaluation_mode == EvaluationMode.gc: + if isinstance(example["ground_truths"], list): + dataset_dict["ground_truths"].append(example["ground_truths"]) + else: + dataset_dict["ground_truths"].append([example["ground_truths"]]) + + dataset = Dataset.from_dict(dataset_dict) # evaluate dataset_with_scores = self.metric.score(dataset, callbacks=callbacks) @@ -145,9 +177,16 @@ def evaluate_run( Evaluate a langsmith run """ if run.outputs is None: - raise ValueError("Run outputs cannot be None") - run.outputs["query"] = run.inputs["query"] - eval_output = self(run.outputs, include_run_info=True) + raise ValueError("The chain should return results and service_document.") + if example is None: + raise ValueError("Examples have to be provided.") + chain_eval = run.outputs + chain_eval["query"] = run.inputs["query"] + if self.metric.evaluation_mode == EvaluationMode.gc: + if example.outputs is None or "ground_truths" not in example.outputs: + raise ValueError("expected `ground_truths` in example outputs.") + chain_eval["ground_truths"] = example.outputs["ground_truths"] + eval_output = self(chain_eval, include_run_info=True) score_name = f"{self.metric.name}_score" evaluation_result = EvaluationResult( From 07cdb353283a7922494f78a41f24c59f50f19f23 Mon Sep 17 00:00:00 2001 From: jjmachan Date: Thu, 24 Aug 2023 07:12:08 +0530 Subject: [PATCH 14/15] fix column_name issue --- src/ragas/evaluation.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py index 40f677af9..6d0c80896 100644 --- a/src/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -76,13 +76,15 @@ def evaluate( metrics = 
From 07cdb353283a7922494f78a41f24c59f50f19f23 Mon Sep 17 00:00:00 2001
From: jjmachan
Date: Thu, 24 Aug 2023 07:12:08 +0530
Subject: [PATCH 14/15] fix column_name issue

---
 src/ragas/evaluation.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py
index 40f677af9..6d0c80896 100644
--- a/src/ragas/evaluation.py
+++ b/src/ragas/evaluation.py
@@ -76,13 +76,15 @@ def evaluate(
         metrics = [answer_relevancy, context_relevancy, faithfulness]
 
-    # select columns from the dataset
-    dataset = dataset.from_dict({k: dataset[v] for k, v in column_map.items()})
-
     # validation
     validate_evaluation_modes(dataset, metrics)
     validate_column_dtypes(dataset)
 
+    # select columns from the dataset
+    dataset = dataset.from_dict(
+        {column_map[name]: dataset[column_map[name]] for name in dataset.column_names}
+    )
+
     # run the evaluation on dataset with different metrics
     # initialize all the models in the metrics
     [m.init_model() for m in metrics]

From 7b9d4c427c10524a14b3024a88a587684b7ee732 Mon Sep 17 00:00:00 2001
From: jjmachan
Date: Thu, 24 Aug 2023 07:47:05 +0530
Subject: [PATCH 15/15] fix docs

---
 docs/metrics.md       |   2 +-
 docs/quickstart.ipynb | 117 +++++++++++++++++++++++------------------
 2 files changed, 66 insertions(+), 53 deletions(-)

diff --git a/docs/metrics.md b/docs/metrics.md
index 76a4a55ea..1c16c5156 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -30,7 +30,7 @@
 dataset: Dataset
 
 results = context_rel.score(dataset)
 ```
 
-### Context Recall
+### `Context Recall`
 measures the recall of the retrieved context using the annotated answer as ground truth. The annotated answer is taken as a proxy for the ground truth context.
 
 ```python
diff --git a/docs/quickstart.ipynb b/docs/quickstart.ipynb
index 64883960a..89804059f 100644
--- a/docs/quickstart.ipynb
+++ b/docs/quickstart.ipynb
@@ -64,6 +64,7 @@
    "- question: `list[str]` - These are the questions your RAG pipeline will be evaluated on. \n",
    "- answer: `list[str]` - The answer generated from the RAG pipeline and given to the user.\n",
    "- contexts: `list[list[str]]` - The contexts which were passed into the LLM to answer the question.\n",
+    "- ground_truths: `list[list[str]]` - The ground truth answers to the questions.\n",
    "\n",
    "Ideally your list of questions should reflect the questions your users ask, including those that have been problematic in the past.\n",
    "\n",
@@ -80,12 +81,23 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "/opt/anaconda3/envs/ragas/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "Found cached dataset fiqa (/Users/shahules/.cache/huggingface/datasets/explodinggradients___fiqa/ragas_eval/1.0.0/3dc7b639f5b4b16509a3299a2ceb78bf5fe98ee6b5fee25e7d5e4d290c88efb8)\n", - "100%|████████████████████████████████████████████████████| 1/1 [00:00<00:00, 284.42it/s]\n" + "Found cached dataset fiqa (/home/jjmachan/.cache/huggingface/datasets/explodinggradients___fiqa/ragas_eval/1.0.0/3dc7b639f5b4b16509a3299a2ceb78bf5fe98ee6b5fee25e7d5e4d290c88efb8)\n" ] }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6c415f76ed4f4c969f87986ee05f2fb1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00\n", " \n", " question\n", - " contexts\n", - " answer\n", " ground_truths\n", + " answer\n", + " contexts\n", " context_ relevancy\n", " faithfulness\n", " answer_relevancy\n", @@ -315,12 +328,12 @@ " \n", " 0\n", " How to deposit a cheque issued to an associate...\n", - " [Just have the associate sign the back and the...\n", - " \\nThe best way to deposit a cheque issued to a...\n", " [Have the check reissued to the proper payee.J...\n", + " \\nThe best way to deposit a cheque issued to a...\n", + " [Just have the associate sign the back and the...\n", " 0.088301\n", " 0.666667\n", - " 0.976870\n", + " 0.976247\n", " 0.111111\n", " 0\n", " \n", @@ -330,9 +343,9 @@ " [Sure you can. You can fill in whatever you w...\n", " \\nYes, you can send a money order from USPS as...\n", " [Sure you can. You can fill in whatever you w...\n", - " 0.231011\n", + " 0.191611\n", " 1.000000\n", - " 0.883567\n", + " 0.883586\n", " 0.800000\n", " 0\n", " \n", @@ -344,31 +357,31 @@ " [You're confusing a lot of things here. Compan...\n", " 0.069420\n", " 1.000000\n", - " 0.919911\n", + " 0.928548\n", " 1.000000\n", " 0\n", " \n", " \n", " 3\n", " Applying for and receiving business credit\n", - " [Set up a meeting with the bank that handles y...\n", - " \\nApplying for and receiving business credit c...\n", " [\"I'm afraid the great myth of limited liabili...\n", - " 0.969968\n", + " \\nApplying for and receiving business credit c...\n", + " [Set up a meeting with the bank that handles y...\n", + " 0.408924\n", " 1.000000\n", - " 0.890391\n", + " 0.906223\n", " 0.187500\n", " 0\n", " \n", " \n", " 4\n", " 401k Transfer After Business Closure\n", - " [The time horizon for your 401K/IRA is essenti...\n", - " \\nIf your employer has closed and you need to ...\n", " [You should probably consult an attorney. Howe...\n", - " 0.518414\n", + " \\nIf your employer has closed and you need to ...\n", + " [The time horizon for your 401K/IRA is essenti...\n", + " 0.064802\n", " 0.666667\n", - " 0.887426\n", + " 0.889312\n", " 0.000000\n", " 0\n", " \n", @@ -384,12 +397,12 @@ "3 Applying for and receiving business credit \n", "4 401k Transfer After Business Closure \n", "\n", - " contexts \\\n", - "0 [Just have the associate sign the back and the... \n", + " ground_truths \\\n", + "0 [Have the check reissued to the proper payee.J... \n", "1 [Sure you can. You can fill in whatever you w... \n", "2 [You're confusing a lot of things here. Compan... \n", - "3 [Set up a meeting with the bank that handles y... \n", - "4 [The time horizon for your 401K/IRA is essenti... \n", + "3 [\"I'm afraid the great myth of limited liabili... \n", + "4 [You should probably consult an attorney. Howe... 
\n", "\n", " answer \\\n", "0 \\nThe best way to deposit a cheque issued to a... \n", @@ -398,22 +411,22 @@ "3 \\nApplying for and receiving business credit c... \n", "4 \\nIf your employer has closed and you need to ... \n", "\n", - " ground_truths context_ relevancy \\\n", - "0 [Have the check reissued to the proper payee.J... 0.088301 \n", - "1 [Sure you can. You can fill in whatever you w... 0.231011 \n", + " contexts context_ relevancy \\\n", + "0 [Just have the associate sign the back and the... 0.088301 \n", + "1 [Sure you can. You can fill in whatever you w... 0.191611 \n", "2 [You're confusing a lot of things here. Compan... 0.069420 \n", - "3 [\"I'm afraid the great myth of limited liabili... 0.969968 \n", - "4 [You should probably consult an attorney. Howe... 0.518414 \n", + "3 [Set up a meeting with the bank that handles y... 0.408924 \n", + "4 [The time horizon for your 401K/IRA is essenti... 0.064802 \n", "\n", " faithfulness answer_relevancy context_recall harmfulness \n", - "0 0.666667 0.976870 0.111111 0 \n", - "1 1.000000 0.883567 0.800000 0 \n", - "2 1.000000 0.919911 1.000000 0 \n", - "3 1.000000 0.890391 0.187500 0 \n", - "4 0.666667 0.887426 0.000000 0 " + "0 0.666667 0.976247 0.111111 0 \n", + "1 1.000000 0.883586 0.800000 0 \n", + "2 1.000000 0.928548 1.000000 0 \n", + "3 1.000000 0.906223 0.187500 0 \n", + "4 0.666667 0.889312 0.000000 0 " ] }, - "execution_count": 4, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -438,9 +451,9 @@ ], "metadata": { "kernelspec": { - "display_name": "ragas2", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "ragas2" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -452,7 +465,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.10.12" } }, "nbformat": 4,