From 35b34337dfac13a3c6f8489a01d723e30f0b0f21 Mon Sep 17 00:00:00 2001
From: jjmachan
Date: Thu, 8 Jun 2023 06:15:32 +0530
Subject: [PATCH 1/8] ci fixes

---
 ragas/evaluation.py           | 1 -
 tests/benchmarks/benchmark.py | 7 +------
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/ragas/evaluation.py b/ragas/evaluation.py
index f5c3d2e42..96806600f 100644
--- a/ragas/evaluation.py
+++ b/ragas/evaluation.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import typing as t
 from dataclasses import dataclass
 from enum import Enum
 
diff --git a/tests/benchmarks/benchmark.py b/tests/benchmarks/benchmark.py
index c868ceff8..e87c404d1 100644
--- a/tests/benchmarks/benchmark.py
+++ b/tests/benchmarks/benchmark.py
@@ -5,12 +5,7 @@
 from tqdm import tqdm
 from utils import print_table, timeit
 
-from ragas.metrics import (
-    Evaluation,
-    bert_score,
-    edit_ratio,
-    rouge1,
-)
+from ragas.metrics import Evaluation, bert_score, edit_ratio, rouge1
 
 DEVICE = "cuda" if is_available() else "cpu"
 BATCHES = [0, 1, 30, 60]

From 74f87c672807a75a2f3ebf1165572d546940e3b8 Mon Sep 17 00:00:00 2001
From: jjmachan
Date: Thu, 8 Jun 2023 06:38:00 +0530
Subject: [PATCH 2/8] add src directory file structure

---
 {ragas => src/ragas}/__init__.py                  | 0
 src/ragas/_version.py                             | 4 ++++
 {ragas => src/ragas}/evaluation.py                | 0
 {ragas => src/ragas}/exceptions.py                | 0
 {ragas => src/ragas}/metrics/__init__.py          | 0
 {ragas => src/ragas}/metrics/answer_relevance.py  | 0
 {ragas => src/ragas}/metrics/base.py              | 9 ++++++---
 {ragas => src/ragas}/metrics/context_relevance.py | 0
 {ragas => src/ragas}/metrics/factual.py           | 1 +
 {ragas => src/ragas}/metrics/llms.py              | 0
 {ragas => src/ragas}/utils.py                     | 0
 11 files changed, 11 insertions(+), 3 deletions(-)
 rename {ragas => src/ragas}/__init__.py (100%)
 create mode 100644 src/ragas/_version.py
 rename {ragas => src/ragas}/evaluation.py (100%)
 rename {ragas => src/ragas}/exceptions.py (100%)
 rename {ragas => src/ragas}/metrics/__init__.py (100%)
 rename {ragas => src/ragas}/metrics/answer_relevance.py (100%)
 rename {ragas => src/ragas}/metrics/base.py (85%)
 rename {ragas => src/ragas}/metrics/context_relevance.py (100%)
 rename {ragas => src/ragas}/metrics/factual.py (99%)
 rename {ragas => src/ragas}/metrics/llms.py (100%)
 rename {ragas => src/ragas}/utils.py (100%)

diff --git a/ragas/__init__.py b/src/ragas/__init__.py
similarity index 100%
rename from ragas/__init__.py
rename to src/ragas/__init__.py
diff --git a/src/ragas/_version.py b/src/ragas/_version.py
new file mode 100644
index 000000000..e2c51d598
--- /dev/null
+++ b/src/ragas/_version.py
@@ -0,0 +1,4 @@
+# file generated by setuptools_scm
+# don't change, don't track in version control
+__version__ = version = '0.0.1a8.dev6+g6f492e1'
+__version_tuple__ = version_tuple = (0, 0, 1, 'dev6', 'g6f492e1')
diff --git a/ragas/evaluation.py b/src/ragas/evaluation.py
similarity index 100%
rename from ragas/evaluation.py
rename to src/ragas/evaluation.py
diff --git a/ragas/exceptions.py b/src/ragas/exceptions.py
similarity index 100%
rename from ragas/exceptions.py
rename to src/ragas/exceptions.py
diff --git a/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py
similarity index 100%
rename from ragas/metrics/__init__.py
rename to src/ragas/metrics/__init__.py
diff --git a/ragas/metrics/answer_relevance.py b/src/ragas/metrics/answer_relevance.py
similarity index 100%
rename from ragas/metrics/answer_relevance.py
rename to src/ragas/metrics/answer_relevance.py
diff --git a/ragas/metrics/base.py b/src/ragas/metrics/base.py
similarity index 85%
rename from ragas/metrics/base.py
rename to src/ragas/metrics/base.py
index a996966b7..c1bdacadd 100644
--- a/ragas/metrics/base.py
+++ b/src/ragas/metrics/base.py
@@ -47,8 +47,11 @@ def get_batches(self, dataset_size: int):
             range(i, i + self.batch_size)
             for i in range(0, self.batch_size * num_batches, self.batch_size)
         ]
-        batches.append(
-            range(self.batch_size * num_batches, self.batch_size * num_batches + tail)
-        )
+        if tail != 0:
+            batches.append(
+                range(
+                    self.batch_size * num_batches, self.batch_size * num_batches + tail
+                )
+            )
 
         return batches
diff --git a/ragas/metrics/context_relevance.py b/src/ragas/metrics/context_relevance.py
similarity index 100%
rename from ragas/metrics/context_relevance.py
rename to src/ragas/metrics/context_relevance.py
diff --git a/ragas/metrics/factual.py b/src/ragas/metrics/factual.py
similarity index 99%
rename from ragas/metrics/factual.py
rename to src/ragas/metrics/factual.py
index e59a13cd9..a67843a9d 100644
--- a/ragas/metrics/factual.py
+++ b/src/ragas/metrics/factual.py
@@ -70,6 +70,7 @@ def init_model(self: t.Self):
 
     def score(self: t.Self, dataset: Dataset) -> Dataset:
         scores = []
         for batch in tqdm(self.get_batches(len(dataset))):
+            print(batch)
             score = self._score_batch(dataset.select(batch))
             scores.append(score)
diff --git a/ragas/metrics/llms.py b/src/ragas/metrics/llms.py
similarity index 100%
rename from ragas/metrics/llms.py
rename to src/ragas/metrics/llms.py
diff --git a/ragas/utils.py b/src/ragas/utils.py
similarity index 100%
rename from ragas/utils.py
rename to src/ragas/utils.py

From f51dcb55cc21923589385c88ff9f8f8080e6ed70 Mon Sep 17 00:00:00 2001
From: jjmachan
Date: Thu, 8 Jun 2023 06:42:41 +0530
Subject: [PATCH 3/8] update pyproject

---
 pyproject.toml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 1ef38b55f..57f4dec2a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,11 +13,15 @@ dependencies = [
 ]
 dynamic = ["version", "readme"]
 
+[tool.setuptools]
+package-dir = {"" = "src"}
+
 [tool.setuptools.dynamic]
 readme = {file = ["README.md"], content-type = "text/plain"}
 
 [build-system]
 requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"]
 build-backend = "setuptools.build_meta"
+
 [tool.setuptools_scm]
 write_to = "ragas/_version.py"

From b25d2b1105d342448df7593207520cdc68de2d63 Mon Sep 17 00:00:00 2001
From: jjmachan
Date: Thu, 8 Jun 2023 06:44:27 +0530
Subject: [PATCH 4/8] fix _version.py

---
 .gitignore            | 1 +
 pyproject.toml        | 2 +-
 src/ragas/_version.py | 4 ----
 3 files changed, 2 insertions(+), 5 deletions(-)
 delete mode 100644 src/ragas/_version.py

diff --git a/.gitignore b/.gitignore
index 07269ad76..2a54a6ed5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -164,3 +164,4 @@ ragas/_version.py
 experiments/**/data
 experiments/**/storage
 **/fil-result/
+src/ragas/_version.py
diff --git a/pyproject.toml b/pyproject.toml
index 57f4dec2a..cbed71be8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,4 +24,4 @@ requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"]
 build-backend = "setuptools.build_meta"
 
 [tool.setuptools_scm]
-write_to = "ragas/_version.py"
+write_to = "src/ragas/_version.py"
diff --git a/src/ragas/_version.py b/src/ragas/_version.py
deleted file mode 100644
index e2c51d598..000000000
--- a/src/ragas/_version.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# file generated by setuptools_scm
-# don't change, don't track in version control
-__version__ = version = '0.0.1a8.dev6+g6f492e1'
-__version_tuple__ = version_tuple = (0, 0, 1, 'dev6', 'g6f492e1')

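[Editor's note] The get_batches() change in PATCH 2 guards against appending an empty trailing range when the dataset size is an exact multiple of the batch size. Below is a minimal standalone sketch of that logic, not the library class itself; it assumes tail and num_batches are derived with % and // as the surrounding code implies:

    def get_batches(dataset_size: int, batch_size: int) -> list[range]:
        tail = dataset_size % batch_size           # assumed: leftover rows after the full batches
        num_batches = dataset_size // batch_size   # assumed: number of full batches
        batches = [
            range(i, i + batch_size)
            for i in range(0, batch_size * num_batches, batch_size)
        ]
        if tail != 0:  # the patched guard: only add a tail batch when one exists
            batches.append(
                range(batch_size * num_batches, batch_size * num_batches + tail)
            )
        return batches

    print(get_batches(60, 30))  # [range(0, 30), range(30, 60)]  (no empty range(60, 60))
    print(get_batches(65, 30))  # [range(0, 30), range(30, 60), range(60, 65)]

PATCHES 3 and 4 complete the move to a src layout: package-dir = {"" = "src"} points setuptools at src/ for package discovery, and write_to = "src/ragas/_version.py" keeps the setuptools_scm-generated version file (now git-ignored) inside the relocated package instead of tracking it.
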
From 675c1f4bfd5d18a8cbf5eb30fe40daf20e530f1b Mon Sep 17 00:00:00 2001
From: jjmachan
Date: Thu, 8 Jun 2023 06:51:28 +0530
Subject: [PATCH 5/8] fix linting and type issues

---
 Makefile                              | 10 +++++-----
 src/ragas/evaluation.py               |  4 ++--
 src/ragas/metrics/answer_relevance.py |  2 +-
 src/ragas/metrics/factual.py          |  8 ++++----
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/Makefile b/Makefile
index 52ce482a3..19fde54a4 100644
--- a/Makefile
+++ b/Makefile
@@ -8,17 +8,17 @@ format: ## Running code formatter: black and isort
 	@echo "(isort) Ordering imports..."
 	@isort .
 	@echo "(black) Formatting codebase..."
-	@black --config pyproject.toml ragas tests examples
+	@black --config pyproject.toml src tests examples experiments
 	@echo "(black) Formatting stubs..."
-	@find ragas -name "*.pyi" ! -name "*_pb2*" -exec black --pyi --config pyproject.toml {} \;
+	@find src -name "*.pyi" ! -name "*_pb2*" -exec black --pyi --config pyproject.toml {} \;
 	@echo "(ruff) Running fix only..."
-	@ruff check ragas examples tests --fix-only
+	@ruff check src examples tests --fix-only
 lint: ## Running lint checker: ruff
 	@echo "(ruff) Linting development project..."
-	@ruff check ragas examples tests
+	@ruff check src examples tests
 type: ## Running type checker: pyright
 	@echo "(pyright) Typechecking codebase..."
-	@pyright ragas
+	@pyright src
 clean: ## Clean all generated files
 	@echo "Cleaning all generated files..."
 	@cd $(GIT_ROOT)/docs && make clean
diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py
index 96806600f..0515fa2cc 100644
--- a/src/ragas/evaluation.py
+++ b/src/ragas/evaluation.py
@@ -5,7 +5,6 @@
 
 import numpy as np
 from datasets import Dataset, concatenate_datasets
-from tqdm import tqdm
 
 from ragas.metrics.base import Metric
 
@@ -43,9 +42,10 @@ def evaluate(
     [m.init_model() for m in metrics]
 
     scores = []
-    for metric in tqdm(metrics):
+    for metric in metrics:
         scores.append(metric.score(dataset).select_columns(metric.name))
 
+    print(scores)
     return Result(concatenate_datasets(scores))
 
 
diff --git a/src/ragas/metrics/answer_relevance.py b/src/ragas/metrics/answer_relevance.py
index f815ce736..0bd490875 100644
--- a/src/ragas/metrics/answer_relevance.py
+++ b/src/ragas/metrics/answer_relevance.py
@@ -121,7 +121,7 @@ def predict(
     ) -> npt.NDArray[np.float64]:
         predictions = []
         dataloader = DataLoader(
-            sentences, batch_size=batch_size, collate_fn=self.collate_fn
+            sentences, batch_size=batch_size, collate_fn=self.collate_fn  # type: ignore
         )
 
         if show_progress:
diff --git a/src/ragas/metrics/factual.py b/src/ragas/metrics/factual.py
index a67843a9d..272f1561d 100644
--- a/src/ragas/metrics/factual.py
+++ b/src/ragas/metrics/factual.py
@@ -28,7 +28,7 @@
 statements:\nShahul and Jithin were from different countries.
 question:{}
 answer: {}
-statements:\n"""
+statements:\n"""  # noqa: E501
 
 NLI_STATEMENTS = """
 Prompt: Natural language inference
@@ -53,7 +53,7 @@
 statements:\n{}
 Now, read the following statements and determine whether they are supported by the information present in the context. Provide a brief explanation for each statement. Also provide a Final Answer (Yes/No) at the end.
 Answer:
-"""
+"""  # noqa: E501
 
 
 @dataclass
@@ -88,7 +88,7 @@ def _score_batch(self: t.Self, ds: Dataset) -> Dataset:
         response = openai_completion(prompts)
 
         list_statements: list[list[str]] = []
-        for output in response["choices"]:
+        for output in response["choices"]:  # type: ignore
             statements = output["text"].split("\n")
             list_statements.append(statements)
 
@@ -102,7 +102,7 @@ def _score_batch(self: t.Self, ds: Dataset) -> Dataset:
             prompts.append(prompt)
 
         response = openai_completion(prompts)
-        outputs = response["choices"]
+        outputs = response["choices"]  # type: ignore
 
         scores = []
         for i, output in enumerate(outputs):

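[Editor's note] PATCH 5 repoints the Makefile targets at the new src/ tree and silences the remaining findings inline: "# noqa: E501" tells ruff to skip the line-length rule on the long prompt strings, while "# type: ignore" tells pyright to accept the expressions it flags (the untyped OpenAI response object and the DataLoader collate_fn). Assuming the dev tools named in the Makefile (isort, black, ruff, pyright) are installed, the same checks can be reproduced locally with:

    make format   # isort + black, then ruff --fix-only over src, tests, examples
    make lint     # ruff check src examples tests
    make type     # pyright src
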
From 6481b4db19404f7cdb09a0eda9c63f13ab70041d Mon Sep 17 00:00:00 2001
From: jjmachan
Date: Thu, 8 Jun 2023 07:11:55 +0530
Subject: [PATCH 6/8] fix dependencies

---
 pyproject.toml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index cbed71be8..094d495e1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,15 +1,13 @@
 [project]
 name = "ragas"
 dependencies = [
-    "Levenshtein",
-    "rouge-score",
     "numpy",
     "transformers",
     "sentence-transformers",
-    "nltk",
     "datasets",
-    "spacy<4.0.0,>=3.0.0",
     "protobuf<=3.20.0",
+    "backoff",
+    "openai",
 ]
 dynamic = ["version", "readme"]
 

From 996f03399bc42d0af17bda7ec617a2b3be4c9188 Mon Sep 17 00:00:00 2001
From: jjmachan
Date: Thu, 8 Jun 2023 07:17:30 +0530
Subject: [PATCH 7/8] fix annotation bug

---
 src/ragas/metrics/llms.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/ragas/metrics/llms.py b/src/ragas/metrics/llms.py
index ea7452477..88f5f4e9b 100644
--- a/src/ragas/metrics/llms.py
+++ b/src/ragas/metrics/llms.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import logging
 import os
 

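[Editor's note] PATCH 7 is the standard fix for annotations written in newer syntax: with "from __future__ import annotations" at the top of the module, annotations are stored as strings and never evaluated at runtime, so forms like list[str] or int | None no longer raise TypeError at import time on older interpreters (Python 3.8/3.9). The snippet below only illustrates that failure mode; it is not the actual contents of llms.py:

    from __future__ import annotations  # without this, the def line below breaks on Python 3.8/3.9


    def complete(prompts: list[str], n: int | None = None) -> list[str]:
        # hypothetical signature for illustration; the real module is not shown in this patch
        return ["" for _ in prompts]

PATCH 6 swaps the string-overlap dependencies (Levenshtein, rouge-score, nltk, spacy) for backoff and openai, in line with the OpenAI-backed scoring used in factual.py above.
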
From e407fd4f8aa3d722a0a8bfd4aff53620bb907be1 Mon Sep 17 00:00:00 2001
From: jjmachan
Date: Thu, 8 Jun 2023 07:53:10 +0530
Subject: [PATCH 8/8] added ragas score

---
 src/ragas/evaluation.py            | 10 ++++--
 src/ragas/metrics/factual.py       |  1 -
 tests/benchmarks/benchmark.py      | 53 ------------------------------
 tests/benchmarks/benchmark_eval.py | 25 +++++++-------
 4 files changed, 19 insertions(+), 70 deletions(-)
 delete mode 100644 tests/benchmarks/benchmark.py

diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py
index 0515fa2cc..43bafbb91 100644
--- a/src/ragas/evaluation.py
+++ b/src/ragas/evaluation.py
@@ -45,8 +45,7 @@ def evaluate(
     for metric in metrics:
         scores.append(metric.score(dataset).select_columns(metric.name))
 
-    print(scores)
-    return Result(concatenate_datasets(scores))
+    return Result(concatenate_datasets(scores, axis=1))
 
 
 @dataclass
@@ -54,8 +53,13 @@ class Result(dict):
     scores: Dataset
 
     def __post_init__(self):
+        values = []
         for cn in self.scores.column_names:
-            self[cn] = np.mean(self.scores[cn])
+            value = np.mean(self.scores[cn])
+            self[cn] = value
+            values.append(value)
+
+        self["ragas_score"] = len(values) / np.sum(1.0 / np.array(values))
 
     def describe(self):
         description = {}
diff --git a/src/ragas/metrics/factual.py b/src/ragas/metrics/factual.py
index 272f1561d..71db8dcac 100644
--- a/src/ragas/metrics/factual.py
+++ b/src/ragas/metrics/factual.py
@@ -70,7 +70,6 @@ def init_model(self: t.Self):
 
     def score(self: t.Self, dataset: Dataset) -> Dataset:
         scores = []
         for batch in tqdm(self.get_batches(len(dataset))):
-            print(batch)
             score = self._score_batch(dataset.select(batch))
             scores.append(score)
diff --git a/tests/benchmarks/benchmark.py b/tests/benchmarks/benchmark.py
deleted file mode 100644
index e87c404d1..000000000
--- a/tests/benchmarks/benchmark.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import typing as t
-
-from datasets import Dataset, arrow_dataset, load_dataset
-from torch.cuda import is_available
-from tqdm import tqdm
-from utils import print_table, timeit
-
-from ragas.metrics import Evaluation, bert_score, edit_ratio, rouge1
-
-DEVICE = "cuda" if is_available() else "cpu"
-BATCHES = [0, 1, 30, 60]
-
-METRICS = {
-    "Rouge1": rouge1,
-    # "Rouge2": rouge2,
-    # "RougeL": rougeL,
-    "EditRatio": edit_ratio,
-    # "EditDistance": edit_distance,
-    "SBERTScore": bert_score,
-    # "EntailmentScore": entailment_score,
-    # "Qsquare": q_square,
-}
-DS = load_dataset("explodinggradients/eli5-test", split="test_eli5")
-assert isinstance(DS, arrow_dataset.Dataset), "Not an arrow_dataset"
-DS = DS.select(range(100))
-
-
-def setup() -> t.Iterator[tuple[str, Evaluation, Dataset]]:
-    metrics = [m for m in METRICS.values()]
-    for b in BATCHES:
-        setup_name = f"batch-{b}"
-        assert isinstance(DS, Dataset), f"{type(DS)} found in the place of Dataset!"
-        batched = False if b == 0 else True
-        e = Evaluation(
-            metrics=metrics,
-            batched=batched,
-            batch_size=b,
-        )
-        yield setup_name, e, DS
-
-
-@timeit
-def evaluate(e: Evaluation, ds: Dataset):
-    e.eval(ds["ground_truth"], ds["generated_text"])
-
-
-if __name__ == "__main__":
-    results = {}
-    for setup_name, e, ds in tqdm(setup(), total=len(BATCHES)):
-        mean, var = evaluate(e, ds)
-        results[setup_name] = (mean, var)
-
-    print_table(results)
diff --git a/tests/benchmarks/benchmark_eval.py b/tests/benchmarks/benchmark_eval.py
index 4b0b8fa07..4eeeddec6 100644
--- a/tests/benchmarks/benchmark_eval.py
+++ b/tests/benchmarks/benchmark_eval.py
@@ -1,22 +1,21 @@
-from datasets import arrow_dataset, load_dataset
+import os
+
+from datasets import Dataset, load_dataset
 from torch.cuda import is_available
 
-from ragas.metrics import Evaluation, bert_score, edit_ratio, rougeL
-from ragas.metrics.factual import EntailmentScore
+from ragas import evaluate
+from ragas.metrics import answer_relevancy, context_relavancy, factuality
 
 DEVICE = "cuda" if is_available() else "cpu"
 
-entailment_score = EntailmentScore(device=DEVICE, batch_size=2)
-# q_square = Qsquare(device=DEVICE, batch_size=2)
-DS = load_dataset("explodinggradients/ragas-webgpt", split="train")
-assert isinstance(DS, arrow_dataset.Dataset), "Not an arrow_dataset"
-DS = DS.select(range(500))
+PATH_TO_DATSET_GIT_REPO = "../../../datasets/fiqa/"
+assert os.path.isdir(PATH_TO_DATSET_GIT_REPO), "Dataset not found"
+ds = Dataset.from_json(os.path.join(PATH_TO_DATSET_GIT_REPO, "gen_ds.json"))
+assert isinstance(ds, Dataset)
 
 if __name__ == "__main__":
-    e = Evaluation(
-        metrics=[rougeL, edit_ratio, bert_score, entailment_score],
-        batched=True,
-        batch_size=64,
+    result = evaluate(
+        ds,
+        metrics=[answer_relevancy, context_relavancy, factuality],
     )
-    result = e.eval(DS["ground_truth"], DS["generated_text"])
     print(result)

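[Editor's note] The ragas_score added in PATCH 8 is the harmonic mean of the per-metric mean scores (len(values) / sum(1 / value)), and concatenate_datasets(scores, axis=1) now places the per-metric score columns side by side instead of stacking rows. The harmonic mean drags the aggregate toward the weakest metric more than an arithmetic mean would; a quick numeric check with made-up metric means (illustrative values, not library output):

    import numpy as np

    # hypothetical per-metric means, e.g. factuality, answer relevancy, context relevancy
    values = np.array([0.9, 0.8, 0.4])

    ragas_score = len(values) / np.sum(1.0 / values)
    print(round(float(ragas_score), 3))    # 0.617  (harmonic mean)
    print(round(float(values.mean()), 3))  # 0.7    (arithmetic mean, for comparison)
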