diff --git a/.gitignore b/.gitignore
index 07269ad76..2a54a6ed5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -164,3 +164,4 @@ ragas/_version.py
 experiments/**/data
 experiments/**/storage
 **/fil-result/
+src/ragas/_version.py
diff --git a/Makefile b/Makefile
index 52ce482a3..19fde54a4 100644
--- a/Makefile
+++ b/Makefile
@@ -8,17 +8,17 @@ format: ## Running code formatter: black and isort
 	@echo "(isort) Ordering imports..."
 	@isort .
 	@echo "(black) Formatting codebase..."
-	@black --config pyproject.toml ragas tests examples
+	@black --config pyproject.toml src tests examples experiments
 	@echo "(black) Formatting stubs..."
-	@find ragas -name "*.pyi" ! -name "*_pb2*" -exec black --pyi --config pyproject.toml {} \;
+	@find src -name "*.pyi" ! -name "*_pb2*" -exec black --pyi --config pyproject.toml {} \;
 	@echo "(ruff) Running fix only..."
-	@ruff check ragas examples tests --fix-only
+	@ruff check src examples tests --fix-only
 lint: ## Running lint checker: ruff
 	@echo "(ruff) Linting development project..."
-	@ruff check ragas examples tests
+	@ruff check src examples tests
 type: ## Running type checker: pyright
 	@echo "(pyright) Typechecking codebase..."
-	@pyright ragas
+	@pyright src
 clean: ## Clean all generated files
 	@echo "Cleaning all generated files..."
 	@cd $(GIT_ROOT)/docs && make clean
diff --git a/pyproject.toml b/pyproject.toml
index 1ef38b55f..094d495e1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,23 +1,25 @@
 [project]
 name = "ragas"
 dependencies = [
-    "Levenshtein",
-    "rouge-score",
     "numpy",
     "transformers",
     "sentence-transformers",
-    "nltk",
     "datasets",
-    "spacy<4.0.0,>=3.0.0",
     "protobuf<=3.20.0",
+    "backoff",
+    "openai",
 ]
 dynamic = ["version", "readme"]
 
+[tool.setuptools]
+package-dir = {"" = "src"}
+
 [tool.setuptools.dynamic]
 readme = {file = ["README.md"], content-type = "text/plain"}
 
 [build-system]
 requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"]
 build-backend = "setuptools.build_meta"
+
 [tool.setuptools_scm]
-write_to = "ragas/_version.py"
+write_to = "src/ragas/_version.py"
diff --git a/ragas/__init__.py b/src/ragas/__init__.py
similarity index 100%
rename from ragas/__init__.py
rename to src/ragas/__init__.py
diff --git a/ragas/evaluation.py b/src/ragas/evaluation.py
similarity index 86%
rename from ragas/evaluation.py
rename to src/ragas/evaluation.py
index f5c3d2e42..43bafbb91 100644
--- a/ragas/evaluation.py
+++ b/src/ragas/evaluation.py
@@ -1,12 +1,10 @@
 from __future__ import annotations
 
-import typing as t
 from dataclasses import dataclass
 from enum import Enum
 
 import numpy as np
 from datasets import Dataset, concatenate_datasets
-from tqdm import tqdm
 
 from ragas.metrics.base import Metric
 
@@ -44,10 +42,10 @@ def evaluate(
     [m.init_model() for m in metrics]
 
     scores = []
-    for metric in tqdm(metrics):
+    for metric in metrics:
         scores.append(metric.score(dataset).select_columns(metric.name))
 
-    return Result(concatenate_datasets(scores))
+    return Result(concatenate_datasets(scores, axis=1))
 
 
 @dataclass
@@ -55,8 +53,13 @@ class Result(dict):
     scores: Dataset
 
     def __post_init__(self):
+        values = []
         for cn in self.scores.column_names:
-            self[cn] = np.mean(self.scores[cn])
+            value = np.mean(self.scores[cn])
+            self[cn] = value
+            values.append(value)
+
+        self["ragas_score"] = len(values) / np.sum(1.0 / np.array(values))
 
     def describe(self):
         description = {}
diff --git a/ragas/exceptions.py b/src/ragas/exceptions.py
similarity index 100%
rename from ragas/exceptions.py
rename to src/ragas/exceptions.py
diff --git a/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py
similarity index 100%
rename from ragas/metrics/__init__.py
rename to src/ragas/metrics/__init__.py
diff --git a/ragas/metrics/answer_relevance.py b/src/ragas/metrics/answer_relevance.py
similarity index 99%
rename from ragas/metrics/answer_relevance.py
rename to src/ragas/metrics/answer_relevance.py
index f815ce736..0bd490875 100644
--- a/ragas/metrics/answer_relevance.py
+++ b/src/ragas/metrics/answer_relevance.py
@@ -121,7 +121,7 @@ def predict(
     ) -> npt.NDArray[np.float64]:
         predictions = []
         dataloader = DataLoader(
-            sentences, batch_size=batch_size, collate_fn=self.collate_fn
+            sentences, batch_size=batch_size, collate_fn=self.collate_fn  # type: ignore
         )
 
         if show_progress:
diff --git a/ragas/metrics/base.py b/src/ragas/metrics/base.py
similarity index 85%
rename from ragas/metrics/base.py
rename to src/ragas/metrics/base.py
index a996966b7..c1bdacadd 100644
--- a/ragas/metrics/base.py
+++ b/src/ragas/metrics/base.py
@@ -47,8 +47,11 @@ def get_batches(self, dataset_size: int):
             range(i, i + self.batch_size)
             for i in range(0, self.batch_size * num_batches, self.batch_size)
         ]
-        batches.append(
-            range(self.batch_size * num_batches, self.batch_size * num_batches + tail)
-        )
+        if tail != 0:
+            batches.append(
+                range(
+                    self.batch_size * num_batches, self.batch_size * num_batches + tail
+                )
+            )
 
         return batches
diff --git a/ragas/metrics/context_relevance.py b/src/ragas/metrics/context_relevance.py
similarity index 100%
rename from ragas/metrics/context_relevance.py
rename to src/ragas/metrics/context_relevance.py
diff --git a/ragas/metrics/factual.py b/src/ragas/metrics/factual.py
similarity index 97%
rename from ragas/metrics/factual.py
rename to src/ragas/metrics/factual.py
index e59a13cd9..71db8dcac 100644
--- a/ragas/metrics/factual.py
+++ b/src/ragas/metrics/factual.py
@@ -28,7 +28,7 @@ statements:\nShahul and Jithin were from different countries.
 
 question:{}
 answer: {}
-statements:\n"""
+statements:\n"""  # noqa: E501
 
 NLI_STATEMENTS = """
 Prompt: Natural language inference
@@ -53,7 +53,7 @@ statements:\n{}
 
 Now, read the following statements and determine whether they are supported by the information present in the context. Provide a brief explanation for each statement. Also provide a Final Answer (Yes/No) at the end.
 Answer:
-"""
+"""  # noqa: E501
 
 
 @dataclass
@@ -87,7 +87,7 @@ def _score_batch(self: t.Self, ds: Dataset) -> Dataset:
         response = openai_completion(prompts)
 
         list_statements: list[list[str]] = []
-        for output in response["choices"]:
+        for output in response["choices"]:  # type: ignore
             statements = output["text"].split("\n")
             list_statements.append(statements)
 
@@ -101,7 +101,7 @@ def _score_batch(self: t.Self, ds: Dataset) -> Dataset:
             prompts.append(prompt)
 
         response = openai_completion(prompts)
-        outputs = response["choices"]
+        outputs = response["choices"]  # type: ignore
 
         scores = []
         for i, output in enumerate(outputs):
diff --git a/ragas/metrics/llms.py b/src/ragas/metrics/llms.py
similarity index 97%
rename from ragas/metrics/llms.py
rename to src/ragas/metrics/llms.py
index ea7452477..88f5f4e9b 100644
--- a/ragas/metrics/llms.py
+++ b/src/ragas/metrics/llms.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import logging
 import os
 
diff --git a/ragas/utils.py b/src/ragas/utils.py
similarity index 100%
rename from ragas/utils.py
rename to src/ragas/utils.py
diff --git a/tests/benchmarks/benchmark.py b/tests/benchmarks/benchmark.py
deleted file mode 100644
index c868ceff8..000000000
--- a/tests/benchmarks/benchmark.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import typing as t
-
-from datasets import Dataset, arrow_dataset, load_dataset
-from torch.cuda import is_available
-from tqdm import tqdm
-from utils import print_table, timeit
-
-from ragas.metrics import (
-    Evaluation,
-    bert_score,
-    edit_ratio,
-    rouge1,
-)
-
-DEVICE = "cuda" if is_available() else "cpu"
-BATCHES = [0, 1, 30, 60]
-
-METRICS = {
-    "Rouge1": rouge1,
-    # "Rouge2": rouge2,
-    # "RougeL": rougeL,
-    "EditRatio": edit_ratio,
-    # "EditDistance": edit_distance,
-    "SBERTScore": bert_score,
-    # "EntailmentScore": entailment_score,
-    # "Qsquare": q_square,
-}
-DS = load_dataset("explodinggradients/eli5-test", split="test_eli5")
-assert isinstance(DS, arrow_dataset.Dataset), "Not an arrow_dataset"
-DS = DS.select(range(100))
-
-
-def setup() -> t.Iterator[tuple[str, Evaluation, Dataset]]:
-    metrics = [m for m in METRICS.values()]
-    for b in BATCHES:
-        setup_name = f"batch-{b}"
-        assert isinstance(DS, Dataset), f"{type(DS)} found in the place of Dataset!"
-        batched = False if b == 0 else True
-        e = Evaluation(
-            metrics=metrics,
-            batched=batched,
-            batch_size=b,
-        )
-        yield setup_name, e, DS
-
-
-@timeit
-def evaluate(e: Evaluation, ds: Dataset):
-    e.eval(ds["ground_truth"], ds["generated_text"])
-
-
-if __name__ == "__main__":
-    results = {}
-    for setup_name, e, ds in tqdm(setup(), total=len(BATCHES)):
-        mean, var = evaluate(e, ds)
-        results[setup_name] = (mean, var)
-
-    print_table(results)
diff --git a/tests/benchmarks/benchmark_eval.py b/tests/benchmarks/benchmark_eval.py
index 4b0b8fa07..4eeeddec6 100644
--- a/tests/benchmarks/benchmark_eval.py
+++ b/tests/benchmarks/benchmark_eval.py
@@ -1,22 +1,21 @@
-from datasets import arrow_dataset, load_dataset
+import os
+
+from datasets import Dataset, load_dataset
 from torch.cuda import is_available
 
-from ragas.metrics import Evaluation, bert_score, edit_ratio, rougeL
-from ragas.metrics.factual import EntailmentScore
+from ragas import evaluate
+from ragas.metrics import answer_relevancy, context_relavancy, factuality
 
 DEVICE = "cuda" if is_available() else "cpu"
 
-entailment_score = EntailmentScore(device=DEVICE, batch_size=2)
-# q_square = Qsquare(device=DEVICE, batch_size=2)
-DS = load_dataset("explodinggradients/ragas-webgpt", split="train")
-assert isinstance(DS, arrow_dataset.Dataset), "Not an arrow_dataset"
-DS = DS.select(range(500))
+PATH_TO_DATSET_GIT_REPO = "../../../datasets/fiqa/"
+assert os.path.isdir(PATH_TO_DATSET_GIT_REPO), "Dataset not found"
+ds = Dataset.from_json(os.path.join(PATH_TO_DATSET_GIT_REPO, "gen_ds.json"))
+assert isinstance(ds, Dataset)
 
 if __name__ == "__main__":
-    e = Evaluation(
-        metrics=[rougeL, edit_ratio, bert_score, entailment_score],
-        batched=True,
-        batch_size=64,
+    result = evaluate(
+        ds,
+        metrics=[answer_relevancy, context_relavancy, factuality],
     )
-    result = e.eval(DS["ground_truth"], DS["generated_text"])
     print(result)
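A note on the evaluation.py hunk above: evaluate() now narrows each metric's output to a single column, so the per-metric results are joined column-wise with concatenate_datasets(scores, axis=1) rather than stacked as rows. A minimal sketch of that behaviour, using made-up metric columns and values rather than anything from the patch:

from datasets import Dataset, concatenate_datasets

# Hypothetical single-column outputs, one per metric.
factuality_scores = Dataset.from_dict({"factuality": [0.9, 0.7]})
relevancy_scores = Dataset.from_dict({"answer_relevancy": [0.8, 0.6]})

# axis=1 lines the columns up side by side, keeping one row per evaluated sample;
# the default axis=0 would append rows instead.
combined = concatenate_datasets([factuality_scores, relevancy_scores], axis=1)
print(combined.column_names)  # ['factuality', 'answer_relevancy']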
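The ragas_score added in Result.__post_init__ is the harmonic mean of the per-metric means, so a single weak metric pulls the aggregate down harder than an arithmetic mean would. A standalone sketch of the same arithmetic with illustrative numbers (not taken from any real run):

import numpy as np

# Hypothetical per-metric means, keyed by metric column name.
metric_means = {"factuality": 0.90, "answer_relevancy": 0.80, "context_relavancy": 0.60}

values = np.array(list(metric_means.values()))
# Same formula as Result.__post_init__: n / sum(1 / x_i), i.e. the harmonic mean.
ragas_score = len(values) / np.sum(1.0 / values)
print(round(ragas_score, 3))  # 0.745, versus an arithmetic mean of about 0.767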
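For the metrics/base.py hunk, the new tail guard stops get_batches from emitting an empty trailing range when the dataset size is an exact multiple of the batch size. A free-standing sketch of the fixed logic (the class wrapper is dropped, and the num_batches/tail computation is assumed to be a plain divmod since it sits outside the hunk):

def get_batches(dataset_size: int, batch_size: int) -> list[range]:
    # Assumed equivalent of the code above the hunk: full batches plus a remainder.
    num_batches, tail = divmod(dataset_size, batch_size)
    batches = [
        range(i, i + batch_size)
        for i in range(0, batch_size * num_batches, batch_size)
    ]
    if tail != 0:
        # Only append the short final batch when there is actually a remainder.
        batches.append(range(batch_size * num_batches, batch_size * num_batches + tail))
    return batches


print(get_batches(10, 5))  # [range(0, 5), range(5, 10)] -- no empty range(10, 10)
print(get_batches(11, 5))  # [range(0, 5), range(5, 10), range(10, 11)]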