From 72fe84a474f27cd2f8d21ee7258e73e33f7f306e Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 7 Jun 2023 21:50:55 +0530 Subject: [PATCH 1/7] added readme --- README.md | 70 +++++++++++++++++-------------------------------------- 1 file changed, 21 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index dbc673503..c94dc11d4 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ Installation | Quickstart | Metrics | + FAQ | Hugging Face

@@ -52,66 +53,37 @@ pip install -e . This is a small example program you can run to see ragas in action! ```python -from datasets import load_dataset -from ragas.metrics import ( - Evaluation, - rouge1, - bert_score, - entailment_score, -) # import the metrics you want to use - -# load the dataset -ds = load_dataset("explodinggradients/eli5-test", split="test_eli5").select(range(100)) - -# init the evaluator, this takes in the metrics you want to use -# and performs the evaluation -e = Evaluation( - metrics=[rouge1, bert_score, entailment_score,], - batched=False, - batch_size=30, -) - -# run the evaluation -results = e.eval(ds["ground_truth"], ds["generated_text"]) -print(results) -``` -If you want a more in-depth explanation of core components, check out our quick-start notebook -## :luggage: Metrics -### :3rd_place_medal: Character based +from ragas.metrics import factuality, answer_relevancy, context_relevancy +from ragas import evaluate +import os -- **Levenshtein distance** the number of single character edits (additional, insertion, deletion) required to change your generated text to ground truth text. -- **Levenshtein** **ratio** is obtained by dividing the Levenshtein distance by sum of number of characters in generated text and ground truth. This type of metrics is suitable where one works with short and precise texts. +os.environ["OPENAI_API_KEY"] = "your-openai-key" -### :2nd_place_medal: N-Gram based +ds = Dataset({ + features: ['question','context','answer'], + num_rows: 25 +}) +results = evaluate(ds, metrics=[nli, answer_relevancy, context_relevancy]) -N-gram based metrics as name indicates uses n-grams for comparing generated answer with ground truth. It is suitable to extractive and abstractive tasks but has its limitations in long free form answers due to the word based comparison. +``` +If you want a more in-depth explanation of core components, check out our quick-start notebook +## :luggage: Metrics -- **ROGUE** (Recall-Oriented Understudy for Gisting Evaluation): - - **ROUGE-N** measures the number of matching ‘n-grams’ between generated text and ground truth. These matches do not consider the ordering of words. - - **ROUGE-L** measures the longest common subsequence (LCS) between generated text and ground truth. This means is that we count the longest sequence of tokens that is shared between both +Ragas measures your pipeline's performance against two dimensions +1. Factuality: measures the factual consistency of the generated answer against the given context. +2. Relevancy: measures how relevant retrieved contexts and the generated answer are to the question. -- **BLEU** (BiLingual Evaluation Understudy) + +Through repeated experiments, we have found that the quality of a RAG pipeline is highly dependent on these two dimensions. The final `ragas_score` is the harmonic mean of these two factors. - It measures precision by comparing clipped n-grams in generated text to ground truth text. These matches do not consider the ordering of words. -### :1st_place_medal: Model Based +## :raising_hand_man: FAQ +1. Why harmonic mean? +Harmonic mean penalizes extreme values. For example, if your generated answer is fully factually consistent with the context (factuality = 1) but is not relevant to the question (relevancy = 0), a simple average would give you a score of 0.5, but the harmonic mean will give you 0.0. -Model based methods uses language models combined with NLP techniques to compare generated text with ground truth.
It is well suited for free form long or short answer types. -- **BertScore** - - Bert Score measures the similarity between ground truth text answers and generated text using SBERT vector embeddings. The common choice of similarity measure is cosine similarity for which values range between 0 to 1. It shows good correlation with human judgement. - -- **EntailmentScore** - - Textual entailment to measure factual consistency in generated text given ground truth. Score can range from 0 to 1 with latter indicating perfect factual entailment for all samples. Entailment score is highly correlated with human judgement. - -- **$Q^2$** - - Best used to measure factual consistencies between ground truth and generated text. Scores can range from 0 to 1. Higher score indicates better factual consistency between ground truth and generated answer. Employs QA-QG paradigm followed by NLI to compare ground truth and generated answer. $Q^2$ score is highly correlated with human judgement. :warning: time and resource hungry metrics. + -📜 Checkout [citations](./references.md) for related publications. From 58859f62580123760596e4e1a85e4dd25bfe233b Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 7 Jun 2023 22:10:26 +0530 Subject: [PATCH 2/7] added how to use --- README.md | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c94dc11d4..43f53a2a2 100644 --- a/README.md +++ b/README.md @@ -71,11 +71,22 @@ If you want a more in-depth explanation of core components, check out our quick- ## :luggage: Metrics Ragas measures your pipeline's performance against two dimensions -1. Factuality: measures the factual consistency of the generated answer against the given context. -2. Relevancy: measures how relevant retrieved contexts and the generated answer are to the question. +1. **Factuality**: measures the factual consistency of the generated answer against the given context. +2. **Relevancy**: measures how relevant retrieved contexts and the generated answer are to the question. Through repeated experiments, we have found that the quality of a RAG pipeline is highly dependent on these two dimensions. The final `ragas_score` is the harmonic mean of these two factors. +## :question: How to use Ragas to improve your pipeline? +*"Measurement is the first step that leads to control and eventually to improvement" - James Harrington* + +Here we assume that you already have your RAG pipeline ready. When it comes to RAG pipelines, there are mainly two parts - the retriever and the generator. A change in either of these should also impact your pipeline's quality. + +1. First, decide on one parameter that you're interested in adjusting, for example the number of retrieved documents, K. +2. Collect a set of sample prompts (at least 20) to form your test set. +3. Run your pipeline using the test set before and after the change. Each time, record the prompts with context and generated output. +4. Run ragas evaluation for each of them to generate evaluation scores. +5. Compare the scores and you will know how much the change has affected your pipeline's performance. + ## :raising_hand_man: FAQ 1. Why harmonic mean?
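The FAQ above explains that the final `ragas_score` is the harmonic mean of the factuality and relevancy scores. A minimal sketch in plain Python, assuming only that behaviour (the helper below is illustrative and not the ragas implementation), shows why a zero in either dimension drags the combined score to zero while a simple average does not:

```python
def harmonic_mean(scores: list[float]) -> float:
    """Illustrative helper, not the ragas implementation."""
    if any(s == 0 for s in scores):
        return 0.0  # one zero dimension zeroes out the whole score
    return len(scores) / sum(1.0 / s for s in scores)

# The FAQ example: a perfectly factual answer (1.0) that ignores the question (0.0).
factuality, relevancy = 1.0, 0.0
print((factuality + relevancy) / 2)            # simple average -> 0.5
print(harmonic_mean([factuality, relevancy]))  # harmonic mean  -> 0.0

# Two pipelines with the same simple average (0.8) but different balance.
print(harmonic_mean([0.8, 0.8]))               # balanced  -> 0.8
print(harmonic_mean([1.0, 0.6]))               # lopsided  -> 0.75
```

In other words, a pipeline only gets a high `ragas_score` by doing well on both dimensions at once.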
From bd526dc130b80026ec6f50a8e5fe3b17b44d8436 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 7 Jun 2023 23:30:17 +0530 Subject: [PATCH 3/7] added metrics to docs --- docs/metrics.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 docs/metrics.md diff --git a/docs/metrics.md b/docs/metrics.md new file mode 100644 index 000000000..15233ac6b --- /dev/null +++ b/docs/metrics.md @@ -0,0 +1,7 @@ +# Metrics + +1. `factuality`: measures the factual consistency of the generated answer against the given context. This is done using a multi-step paradigm that includes creating statements from the generated answer, followed by verifying each of these statements against the context. The answer is scaled to the (0,1) range. The higher the better. + +2. `answer_relevancy`: measures how relevant the generated answer is to the prompt. This is quantified using the conditional likelihood of an LLM generating the question given the answer. This is implemented using a custom model. Values range in (0,1); the higher the better. + +3. `context_relevancy`: measures how relevant the retrieved context is to the prompt. This is quantified using a custom-trained cross-encoder model. Values range in (0,1); the higher the better. \ No newline at end of file From 93a96f25056b11c591dbadc04a75114532aaf41a Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Wed, 7 Jun 2023 23:31:37 +0530 Subject: [PATCH 4/7] update readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 43f53a2a2..763f88b81 100644 --- a/README.md +++ b/README.md @@ -76,6 +76,7 @@ Ragas measures your pipeline's performance against two dimensions Through repeated experiments, we have found that the quality of a RAG pipeline is highly dependent on these two dimensions. The final `ragas_score` is the harmonic mean of these two factors. +To read more about our metrics, check out the [docs](/docs/metrics.md). ## :question: How to use Ragas to improve your pipeline?
*"Measurement is the first step that leads to control and eventually to improvement" - James Harrington* From cbffce109eaf022e65905e3a79f829755a389036 Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 8 Jun 2023 17:03:06 +0530 Subject: [PATCH 5/7] set model_max_length --- .gitignore | 1 + Makefile | 10 ++-- pyproject.toml | 12 ++-- {ragas => src/ragas}/__init__.py | 0 {ragas => src/ragas}/evaluation.py | 13 +++-- {ragas => src/ragas}/exceptions.py | 0 {ragas => src/ragas}/metrics/__init__.py | 0 .../ragas}/metrics/answer_relevance.py | 4 +- {ragas => src/ragas}/metrics/base.py | 9 ++- .../ragas}/metrics/context_relevance.py | 0 {ragas => src/ragas}/metrics/factual.py | 8 +-- {ragas => src/ragas}/metrics/llms.py | 2 + {ragas => src/ragas}/utils.py | 0 tests/benchmarks/benchmark.py | 58 ------------------- tests/benchmarks/benchmark_eval.py | 25 ++++---- 15 files changed, 47 insertions(+), 95 deletions(-) rename {ragas => src/ragas}/__init__.py (100%) rename {ragas => src/ragas}/evaluation.py (86%) rename {ragas => src/ragas}/exceptions.py (100%) rename {ragas => src/ragas}/metrics/__init__.py (100%) rename {ragas => src/ragas}/metrics/answer_relevance.py (99%) rename {ragas => src/ragas}/metrics/base.py (85%) rename {ragas => src/ragas}/metrics/context_relevance.py (100%) rename {ragas => src/ragas}/metrics/factual.py (97%) rename {ragas => src/ragas}/metrics/llms.py (97%) rename {ragas => src/ragas}/utils.py (100%) delete mode 100644 tests/benchmarks/benchmark.py diff --git a/.gitignore b/.gitignore index 07269ad76..2a54a6ed5 100644 --- a/.gitignore +++ b/.gitignore @@ -164,3 +164,4 @@ ragas/_version.py experiments/**/data experiments/**/storage **/fil-result/ +src/ragas/_version.py diff --git a/Makefile b/Makefile index 52ce482a3..19fde54a4 100644 --- a/Makefile +++ b/Makefile @@ -8,17 +8,17 @@ format: ## Running code formatter: black and isort @echo "(isort) Ordering imports..." @isort . @echo "(black) Formatting codebase..." - @black --config pyproject.toml ragas tests examples + @black --config pyproject.toml src tests examples experiments @echo "(black) Formatting stubs..." - @find ragas -name "*.pyi" ! -name "*_pb2*" -exec black --pyi --config pyproject.toml {} \; + @find src -name "*.pyi" ! -name "*_pb2*" -exec black --pyi --config pyproject.toml {} \; @echo "(ruff) Running fix only..." - @ruff check ragas examples tests --fix-only + @ruff check src examples tests --fix-only lint: ## Running lint checker: ruff @echo "(ruff) Linting development project..." - @ruff check ragas examples tests + @ruff check src examples tests type: ## Running type checker: pyright @echo "(pyright) Typechecking codebase..." - @pyright ragas + @pyright src clean: ## Clean all generated files @echo "Cleaning all generated files..." 
@cd $(GIT_ROOT)/docs && make clean diff --git a/pyproject.toml b/pyproject.toml index 1ef38b55f..094d495e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,23 +1,25 @@ [project] name = "ragas" dependencies = [ - "Levenshtein", - "rouge-score", "numpy", "transformers", "sentence-transformers", - "nltk", "datasets", - "spacy<4.0.0,>=3.0.0", "protobuf<=3.20.0", + "backoff", + "openai", ] dynamic = ["version", "readme"] +[tool.setuptools] +package-dir = {"" = "src"} + [tool.setuptools.dynamic] readme = {file = ["README.md"], content-type = "text/plain"} [build-system] requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"] build-backend = "setuptools.build_meta" + [tool.setuptools_scm] -write_to = "ragas/_version.py" +write_to = "src/ragas/_version.py" diff --git a/ragas/__init__.py b/src/ragas/__init__.py similarity index 100% rename from ragas/__init__.py rename to src/ragas/__init__.py diff --git a/ragas/evaluation.py b/src/ragas/evaluation.py similarity index 86% rename from ragas/evaluation.py rename to src/ragas/evaluation.py index f5c3d2e42..43bafbb91 100644 --- a/ragas/evaluation.py +++ b/src/ragas/evaluation.py @@ -1,12 +1,10 @@ from __future__ import annotations -import typing as t from dataclasses import dataclass from enum import Enum import numpy as np from datasets import Dataset, concatenate_datasets -from tqdm import tqdm from ragas.metrics.base import Metric @@ -44,10 +42,10 @@ def evaluate( [m.init_model() for m in metrics] scores = [] - for metric in tqdm(metrics): + for metric in metrics: scores.append(metric.score(dataset).select_columns(metric.name)) - return Result(concatenate_datasets(scores)) + return Result(concatenate_datasets(scores, axis=1)) @dataclass @@ -55,8 +53,13 @@ class Result(dict): scores: Dataset def __post_init__(self): + values = [] for cn in self.scores.column_names: - self[cn] = np.mean(self.scores[cn]) + value = np.mean(self.scores[cn]) + self[cn] = value + values.append(value) + + self["ragas_score"] = len(values) / np.sum(1.0 / np.array(values)) def describe(self): description = {} diff --git a/ragas/exceptions.py b/src/ragas/exceptions.py similarity index 100% rename from ragas/exceptions.py rename to src/ragas/exceptions.py diff --git a/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py similarity index 100% rename from ragas/metrics/__init__.py rename to src/ragas/metrics/__init__.py diff --git a/ragas/metrics/answer_relevance.py b/src/ragas/metrics/answer_relevance.py similarity index 99% rename from ragas/metrics/answer_relevance.py rename to src/ragas/metrics/answer_relevance.py index f815ce736..d0e577f59 100644 --- a/ragas/metrics/answer_relevance.py +++ b/src/ragas/metrics/answer_relevance.py @@ -24,7 +24,7 @@ class QGen: def __init__(self, model_name: str, device: str) -> None: config = AutoConfig.from_pretrained(model_name) - self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512) if self.tokenizer.pad_token is None: self.tokenizer.pad_token = "[PAD]" architecture = np.intersect1d( @@ -121,7 +121,7 @@ def predict( ) -> npt.NDArray[np.float64]: predictions = [] dataloader = DataLoader( - sentences, batch_size=batch_size, collate_fn=self.collate_fn + sentences, batch_size=batch_size, collate_fn=self.collate_fn # type: ignore ) if show_progress: diff --git a/ragas/metrics/base.py b/src/ragas/metrics/base.py similarity index 85% rename from ragas/metrics/base.py rename to src/ragas/metrics/base.py index a996966b7..c1bdacadd 100644 --- 
a/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -47,8 +47,11 @@ def get_batches(self, dataset_size: int): range(i, i + self.batch_size) for i in range(0, self.batch_size * num_batches, self.batch_size) ] - batches.append( - range(self.batch_size * num_batches, self.batch_size * num_batches + tail) - ) + if tail != 0: + batches.append( + range( + self.batch_size * num_batches, self.batch_size * num_batches + tail + ) + ) return batches diff --git a/ragas/metrics/context_relevance.py b/src/ragas/metrics/context_relevance.py similarity index 100% rename from ragas/metrics/context_relevance.py rename to src/ragas/metrics/context_relevance.py diff --git a/ragas/metrics/factual.py b/src/ragas/metrics/factual.py similarity index 97% rename from ragas/metrics/factual.py rename to src/ragas/metrics/factual.py index e59a13cd9..71db8dcac 100644 --- a/ragas/metrics/factual.py +++ b/src/ragas/metrics/factual.py @@ -28,7 +28,7 @@ statements:\nShahul and Jithin were from different countries. question:{} answer: {} -statements:\n""" +statements:\n""" # noqa: E501 NLI_STATEMENTS = """ Prompt: Natural language inference @@ -53,7 +53,7 @@ statements:\n{} Now, read the following statements and determine whether they are supported by the information present in the context. Provide a brief explanation for each statement. Also provide a Final Answer (Yes/No) at the end. Answer: -""" +""" # noqa: E501 @dataclass @@ -87,7 +87,7 @@ def _score_batch(self: t.Self, ds: Dataset) -> Dataset: response = openai_completion(prompts) list_statements: list[list[str]] = [] - for output in response["choices"]: + for output in response["choices"]: # type: ignore statements = output["text"].split("\n") list_statements.append(statements) @@ -101,7 +101,7 @@ def _score_batch(self: t.Self, ds: Dataset) -> Dataset: prompts.append(prompt) response = openai_completion(prompts) - outputs = response["choices"] + outputs = response["choices"] # type: ignore scores = [] for i, output in enumerate(outputs): diff --git a/ragas/metrics/llms.py b/src/ragas/metrics/llms.py similarity index 97% rename from ragas/metrics/llms.py rename to src/ragas/metrics/llms.py index ea7452477..88f5f4e9b 100644 --- a/ragas/metrics/llms.py +++ b/src/ragas/metrics/llms.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging import os diff --git a/ragas/utils.py b/src/ragas/utils.py similarity index 100% rename from ragas/utils.py rename to src/ragas/utils.py diff --git a/tests/benchmarks/benchmark.py b/tests/benchmarks/benchmark.py deleted file mode 100644 index c868ceff8..000000000 --- a/tests/benchmarks/benchmark.py +++ /dev/null @@ -1,58 +0,0 @@ -import typing as t - -from datasets import Dataset, arrow_dataset, load_dataset -from torch.cuda import is_available -from tqdm import tqdm -from utils import print_table, timeit - -from ragas.metrics import ( - Evaluation, - bert_score, - edit_ratio, - rouge1, -) - -DEVICE = "cuda" if is_available() else "cpu" -BATCHES = [0, 1, 30, 60] - -METRICS = { - "Rouge1": rouge1, - # "Rouge2": rouge2, - # "RougeL": rougeL, - "EditRatio": edit_ratio, - # "EditDistance": edit_distance, - "SBERTScore": bert_score, - # "EntailmentScore": entailment_score, - # "Qsquare": q_square, -} -DS = load_dataset("explodinggradients/eli5-test", split="test_eli5") -assert isinstance(DS, arrow_dataset.Dataset), "Not an arrow_dataset" -DS = DS.select(range(100)) - - -def setup() -> t.Iterator[tuple[str, Evaluation, Dataset]]: - metrics = [m for m in METRICS.values()] - for b in BATCHES: - setup_name = f"batch-{b}" - 
assert isinstance(DS, Dataset), f"{type(DS)} found in the place of Dataset!" - batched = False if b == 0 else True - e = Evaluation( - metrics=metrics, - batched=batched, - batch_size=b, - ) - yield setup_name, e, DS - - -@timeit -def evaluate(e: Evaluation, ds: Dataset): - e.eval(ds["ground_truth"], ds["generated_text"]) - - -if __name__ == "__main__": - results = {} - for setup_name, e, ds in tqdm(setup(), total=len(BATCHES)): - mean, var = evaluate(e, ds) - results[setup_name] = (mean, var) - - print_table(results) diff --git a/tests/benchmarks/benchmark_eval.py b/tests/benchmarks/benchmark_eval.py index 4b0b8fa07..4eeeddec6 100644 --- a/tests/benchmarks/benchmark_eval.py +++ b/tests/benchmarks/benchmark_eval.py @@ -1,22 +1,21 @@ -from datasets import arrow_dataset, load_dataset +import os + +from datasets import Dataset, load_dataset from torch.cuda import is_available -from ragas.metrics import Evaluation, bert_score, edit_ratio, rougeL -from ragas.metrics.factual import EntailmentScore +from ragas import evaluate +from ragas.metrics import answer_relevancy, context_relavancy, factuality DEVICE = "cuda" if is_available() else "cpu" -entailment_score = EntailmentScore(device=DEVICE, batch_size=2) -# q_square = Qsquare(device=DEVICE, batch_size=2) -DS = load_dataset("explodinggradients/ragas-webgpt", split="train") -assert isinstance(DS, arrow_dataset.Dataset), "Not an arrow_dataset" -DS = DS.select(range(500)) +PATH_TO_DATSET_GIT_REPO = "../../../datasets/fiqa/" +assert os.path.isdir(PATH_TO_DATSET_GIT_REPO), "Dataset not found" +ds = Dataset.from_json(os.path.join(PATH_TO_DATSET_GIT_REPO, "gen_ds.json")) +assert isinstance(ds, Dataset) if __name__ == "__main__": - e = Evaluation( - metrics=[rougeL, edit_ratio, bert_score, entailment_score], - batched=True, - batch_size=64, + result = evaluate( + ds, + metrics=[answer_relevancy, context_relavancy, factuality], ) - result = e.eval(DS["ground_truth"], DS["generated_text"]) print(result) From 71cf2ef6c0fcf100ad8f3d7cd47a1c6f3d3210cf Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Thu, 8 Jun 2023 17:03:29 +0530 Subject: [PATCH 6/7] fix import paths --- .../assesments/metrics_assesments.ipynb | 72 +++++++++---------- 1 file changed, 34 insertions(+), 38 deletions(-) diff --git a/experiments/assesments/metrics_assesments.ipynb b/experiments/assesments/metrics_assesments.ipynb index 2640c9b14..f15ff1544 100644 --- a/experiments/assesments/metrics_assesments.ipynb +++ b/experiments/assesments/metrics_assesments.ipynb @@ -32,10 +32,19 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 1, "id": "7bfb2480", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/anaconda3/envs/alerts/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "import json\n", "from datasets import load_dataset\n", @@ -55,7 +64,7 @@ "metadata": {}, "outputs": [], "source": [ - "os.chdir('/Users/shahules/belar/')" + "os.chdir('/Users/shahules/belar/src/')" ] }, { @@ -134,7 +143,7 @@ }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 7, "id": "f9f4280e", "metadata": {}, "outputs": [ @@ -143,7 +152,7 @@ "output_type": "stream", "text": [ "Found cached dataset parquet (/Users/shahules/.cache/huggingface/datasets/explodinggradients___parquet/explodinggradients--ragas-wikiqa-5b5116e5cb909aca/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n", - "100%|β–ˆ| 1/1 [00:00<00:00, 58.\n" + "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 242.78it/s]\n" ] } ], @@ -161,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": 153, + "execution_count": 8, "id": "eca20daf", "metadata": {}, "outputs": [], @@ -183,7 +192,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "f3e35532", "metadata": {}, "outputs": [], @@ -216,7 +225,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "335081e3", "metadata": {}, "outputs": [], @@ -248,7 +257,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 11, "id": "b2642e5b", "metadata": {}, "outputs": [], @@ -263,7 +272,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 13, "id": "26ca4af4", "metadata": {}, "outputs": [ @@ -280,7 +289,7 @@ "0" ] }, - "execution_count": 19, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -301,7 +310,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "ca1c56d6", "metadata": {}, "outputs": [], @@ -323,7 +332,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "cd7fed9c", "metadata": {}, "outputs": [], @@ -340,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "35113558", "metadata": {}, "outputs": [], @@ -351,7 +360,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "4e82d0df", "metadata": {}, "outputs": [ @@ -365,10 +374,10 @@ { "data": { "text/plain": [ - "3.514920235612768" + "3.5533440372846865" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -396,40 +405,27 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 13, "id": "cc263805", "metadata": {}, "outputs": [], "source": [ - "from experimental.relevance import QGen" + "from ragas.metrics.answer_relevance import QGen" ] }, { "cell_type": "code", - "execution_count": 125, + "execution_count": 14, "id": "38deaf06", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/anaconda3/envs/alerts/lib/python3.8/site-packages/transformers/models/t5/tokenization_t5_fast.py:155: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.\n", - "For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n", - "- Be aware that you SHOULD NOT rely on t5-base automatically 
truncating your input to 512 when padding/encoding.\n", - "- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n", - "- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ "t5_qgen = QGen(\"t5-base\",\"cpu\")\n" ] }, { "cell_type": "code", - "execution_count": 126, + "execution_count": 15, "id": "45942810", "metadata": {}, "outputs": [], @@ -454,7 +450,7 @@ }, { "cell_type": "code", - "execution_count": 127, + "execution_count": 16, "id": "ab00e4fe", "metadata": {}, "outputs": [], @@ -516,12 +512,12 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 17, "id": "b6d76ae2", "metadata": {}, "outputs": [], "source": [ - "## import cross encoder\n" + "from ragas.metrics.context_relevance import context_relavancy" ] }, { From 91c35e8fff326f673818c7b74c0d8ad25f7dd2ac Mon Sep 17 00:00:00 2001 From: Shahules786 Date: Fri, 9 Jun 2023 17:15:49 +0530 Subject: [PATCH 7/7] update readme --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 763f88b81..2d8fc4c69 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,6 @@ pip install -e . This is a small example program you can run to see ragas in action! ```python -from ragas.metrics import factuality, answer_relevancy, context_relevancy from ragas import evaluate import os @@ -64,7 +63,7 @@ ds = Dataset({ features: ['question','context','answer'], num_rows: 25 }) -results = evaluate(ds, metrics=[nli, answer_relevancy, context_relevancy]) +results = evaluate(ds) ``` If you want a more in-depth explanation of core components, check out our quick-start notebook
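To tie the final quickstart back to the individual metrics, the sketch below spells out an explicit `evaluate` call. It is illustrative rather than part of the patch series: the import path and metric names (`answer_relevancy`, `context_relavancy`, `factuality`) are assumed to match `tests/benchmarks/benchmark_eval.py` above, the column names follow the quickstart, and `factuality` needs a valid OpenAI key at runtime.

```python
# Illustrative sketch; metric names assumed from tests/benchmarks/benchmark_eval.py.
import os

from datasets import Dataset

from ragas import evaluate
from ragas.metrics import answer_relevancy, context_relavancy, factuality

os.environ["OPENAI_API_KEY"] = "your-openai-key"

# A toy dataset with the columns the quickstart expects: question, context, answer.
ds = Dataset.from_dict(
    {
        "question": ["Who painted the Mona Lisa?"],
        "context": ["The Mona Lisa was painted by Leonardo da Vinci around 1503."],
        "answer": ["It was painted by Leonardo da Vinci."],
    }
)

results = evaluate(ds, metrics=[answer_relevancy, context_relavancy, factuality])

# The result behaves like a dict: one mean per metric plus the combined
# harmonic-mean `ragas_score` added in src/ragas/evaluation.py.
print(results)
print(results["ragas_score"])
```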