From e0658eb4bc3275bbc290de644ba66d351e0b9ec7 Mon Sep 17 00:00:00 2001
From: Jithin James
Date: Sat, 20 May 2023 20:07:40 +0530
Subject: [PATCH 1/6] added init_model to baseline

---
 ragas/metrics/base.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/ragas/metrics/base.py b/ragas/metrics/base.py
index b2acdc9f5..9afd921cb 100644
--- a/ragas/metrics/base.py
+++ b/ragas/metrics/base.py
@@ -13,17 +13,33 @@ class Metric(ABC):
     @property
     @abstractmethod
     def name(self: t.Self) -> str:
+        """
+        The name of the metric.
+        """
         ...
 
     @property
     @abstractmethod
     def is_batchable(self: t.Self) -> bool:
+        """
+        Whether this metric can be scored in batches.
+        """
+        ...
+
+    @abstractmethod
+    def init_model(self: t.Self) -> None:
+        """
+        Lazily initialize the model.
+        """
         ...
 
     @abstractmethod
     def score(
         self: t.Self, ground_truth: list[str], generated_text: list[str]
     ) -> list[float]:
+        """
+        Run the metric on ground_truth and generated_text and return the scores.
+        """
         ...

From b14b194525758fcaaa3a3f79525afbdcb266e152 Mon Sep 17 00:00:00 2001
From: Jithin James
Date: Sat, 20 May 2023 20:33:44 +0530
Subject: [PATCH 2/6] added init_model to everything

---
 ragas/metrics/base.py         |  4 ++++
 ragas/metrics/factual.py      |  4 ++--
 ragas/metrics/similarity.py   |  2 +-
 ragas/metrics/simple.py       |  8 +++++++-
 tests/benchmarks/benchmark.py | 14 ++++++++------
 5 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/ragas/metrics/base.py b/ragas/metrics/base.py
index 9afd921cb..21cde9b2e 100644
--- a/ragas/metrics/base.py
+++ b/ragas/metrics/base.py
@@ -53,6 +53,10 @@ def eval(self, ground_truth: list[list[str]], generated_text: list[str]) -> Resu
         ds = Dataset.from_dict(
             {"ground_truth": ground_truth, "generated_text": generated_text}
         )
+
+        # initialize all the models in the metrics
+        [m.init_model() for m in self.metrics]
+
         ds = ds.map(
             self._get_score,
             batched=self.batched,
diff --git a/ragas/metrics/factual.py b/ragas/metrics/factual.py
index d9ab22909..7dc452fd8 100644
--- a/ragas/metrics/factual.py
+++ b/ragas/metrics/factual.py
@@ -52,7 +52,7 @@ class EntailmentScore(Metric):
     batch_size: int = 4
     device: t.Literal["cpu", "cuda"] | Device = "cpu"
 
-    def __post_init__(self):
+    def init_model(self):
         self.device = device_check(self.device)
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
         self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)
@@ -212,7 +212,7 @@ class Qsquare(Metric):
     include_nouns: bool = True
     save_results: bool = False
 
-    def __post_init__(self):
+    def init_model(self):
         self.qa = QAGQ.from_pretrained(self.qa_model_name)
         self.qg = QAGQ.from_pretrained(self.qg_model_name)
         self.nli = EntailmentScore()
diff --git a/ragas/metrics/similarity.py b/ragas/metrics/similarity.py
index 60b3d1d2e..a79468b69 100644
--- a/ragas/metrics/similarity.py
+++ b/ragas/metrics/similarity.py
@@ -18,7 +18,7 @@ class BERTScore(Metric):
     model_path: str = "all-MiniLM-L6-v2"
     batch_size: int = 1000
 
-    def __post_init__(self):
+    def init_model(self):
         self.model = SentenceTransformer(self.model_path)
 
     @property
diff --git a/ragas/metrics/simple.py b/ragas/metrics/simple.py
index 2ac8ee8b9..643d5d777 100644
--- a/ragas/metrics/simple.py
+++ b/ragas/metrics/simple.py
@@ -26,6 +26,9 @@ def name(self):
     def is_batchable(self):
         return True
 
+    def init_model(self):
+        ...
+
     def score(self, ground_truth: t.List[str], generated_text: t.List[str]):
         ground_truth_ = [[word_tokenize(text)] for text in ground_truth]
         generated_text_ = [word_tokenize(text) for text in generated_text]
@@ -45,7 +48,7 @@ class ROUGE(Metric):
     type: t.Literal[ROUGE_TYPES]
     use_stemmer: bool = False
 
-    def __post_init__(self):
+    def init_model(self):
         self.scorer = rouge_scorer.RougeScorer(
             [self.type], use_stemmer=self.use_stemmer
         )
@@ -80,6 +83,9 @@ def name(self) -> str:
     def is_batchable(self):
         return True
 
+    def init_model(self):
+        ...
+
     def score(self, ground_truth: t.List[str], generated_text: t.List[str]):
         if self.measure == "distance":
             score = [distance(s1, s2) for s1, s2 in zip(ground_truth, generated_text)]
diff --git a/tests/benchmarks/benchmark.py b/tests/benchmarks/benchmark.py
index 5bb413fc0..f98d8c564 100644
--- a/tests/benchmarks/benchmark.py
+++ b/tests/benchmarks/benchmark.py
@@ -7,8 +7,10 @@
 
 from ragas.metrics import (
     Evaluation,
+    bert_score,
     edit_distance,
     edit_ratio,
+    entailment_score,
     q_square,
     rouge1,
     rouge2,
@@ -16,17 +18,17 @@
 )
 
 DEVICE = "cuda" if is_available() else "cpu"
-BATCHES = [0, 1]
+BATCHES = [0, 1, 30, 60]
 
 METRICS = {
     "Rouge1": rouge1,
-    "Rouge2": rouge2,
-    "RougeL": rougeL,
+    # "Rouge2": rouge2,
+    # "RougeL": rougeL,
     "EditRatio": edit_ratio,
-    "EditDistance": edit_distance,
-    # "SBERTScore": bert_score,
+    # "EditDistance": edit_distance,
+    "SBERTScore": bert_score,
     # "EntailmentScore": entailment_score,
-    "Qsquare": q_square,
+    # "Qsquare": q_square,
 }
 DS = load_dataset("explodinggradients/eli5-test", split="test_eli5")
 assert isinstance(DS, arrow_dataset.Dataset), "Not an arrow_dataset"

From b9be515ee5eacd75e2c504962b89f672a3066109 Mon Sep 17 00:00:00 2001
From: Jithin James
Date: Sat, 20 May 2023 20:48:26 +0530
Subject: [PATCH 3/6] fix lint issues

---
 tests/benchmarks/benchmark.py | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/tests/benchmarks/benchmark.py b/tests/benchmarks/benchmark.py
index f98d8c564..e87c404d1 100644
--- a/tests/benchmarks/benchmark.py
+++ b/tests/benchmarks/benchmark.py
@@ -5,17 +5,7 @@
 from tqdm import tqdm
 from utils import print_table, timeit
 
-from ragas.metrics import (
-    Evaluation,
-    bert_score,
-    edit_distance,
-    edit_ratio,
-    entailment_score,
-    q_square,
-    rouge1,
-    rouge2,
-    rougeL,
-)
+from ragas.metrics import Evaluation, bert_score, edit_ratio, rouge1
 
 DEVICE = "cuda" if is_available() else "cpu"
 BATCHES = [0, 1, 30, 60]

From acf72289588621f361395cc866d5db937815002c Mon Sep 17 00:00:00 2001
From: Jithin James
Date: Mon, 22 May 2023 19:17:21 +0530
Subject: [PATCH 4/6] added init model to qsquare

---
 ragas/metrics/factual.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ragas/metrics/factual.py b/ragas/metrics/factual.py
index 7dc452fd8..fd5891569 100644
--- a/ragas/metrics/factual.py
+++ b/ragas/metrics/factual.py
@@ -216,6 +216,7 @@ def init_model(self):
         self.qa = QAGQ.from_pretrained(self.qa_model_name)
         self.qg = QAGQ.from_pretrained(self.qg_model_name)
         self.nli = EntailmentScore()
+        self.nli.init_model()
         try:
             self.nlp = spacy.load(SPACY_MODEL)
         except OSError:

From 6fd5902cfcaa177d752166334c263f5472344199 Mon Sep 17 00:00:00 2001
From: Jithin James
Date: Mon, 22 May 2023 19:33:47 +0530
Subject: [PATCH 5/6] ignore type issue

---
 ragas/metrics/factual.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ragas/metrics/factual.py b/ragas/metrics/factual.py
index fd5891569..a936d4ff8 100644
--- a/ragas/metrics/factual.py
+++ b/ragas/metrics/factual.py
@@ -327,7 +327,7 @@ def score(self, ground_truth: list[str], generated_text: list[str], **kwargs):
             )
             gnd_qans[i] = [
                 {"question": qstn, "answer": ans}
-                for qstn, ans in zip(questions, candidates)
+                for qstn, ans in zip(questions, candidates)  # type: ignore
             ]
 
         for i, gen_text in enumerate(generated_text):
@@ -335,7 +335,7 @@ def score(self, ground_truth: list[str], generated_text: list[str], **kwargs):
             gen_answers = self.generate_answers(questions, gen_text)
             _ = [
                 item.update({"predicted_answer": ans})
-                for item, ans in zip(gnd_qans[i], gen_answers)
+                for item, ans in zip(gnd_qans[i], gen_answers)  # type: ignore
             ]
 
             # del self.qa

From d833553b22bf899da32e0f3f894b751e0053dca6 Mon Sep 17 00:00:00 2001
From: Jithin James
Date: Mon, 22 May 2023 19:35:16 +0530
Subject: [PATCH 6/6] fix linting

---
 Makefile                      | 2 +-
 tests/benchmarks/benchmark.py | 7 ++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index fc5ab0fa8..52ce482a3 100644
--- a/Makefile
+++ b/Makefile
@@ -18,7 +18,7 @@ lint: ## Running lint checker: ruff
 	@ruff check ragas examples tests
 type: ## Running type checker: pyright
 	@echo "(pyright) Typechecking codebase..."
-	@pyright -p ragas
+	@pyright ragas
 clean: ## Clean all generated files
 	@echo "Cleaning all generated files..."
 	@cd $(GIT_ROOT)/docs && make clean
diff --git a/tests/benchmarks/benchmark.py b/tests/benchmarks/benchmark.py
index e87c404d1..c868ceff8 100644
--- a/tests/benchmarks/benchmark.py
+++ b/tests/benchmarks/benchmark.py
@@ -5,7 +5,12 @@
 from tqdm import tqdm
 from utils import print_table, timeit
 
-from ragas.metrics import Evaluation, bert_score, edit_ratio, rouge1
+from ragas.metrics import (
+    Evaluation,
+    bert_score,
+    edit_ratio,
+    rouge1,
+)
 
 DEVICE = "cuda" if is_available() else "cpu"
 BATCHES = [0, 1, 30, 60]
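
A note on the resulting contract: after this series, metrics no longer load tokenizers, HF models, or spaCy pipelines in __post_init__; they do it in init_model(), which Evaluation.eval() invokes on every metric just before the dataset is mapped. The sketch below illustrates what a downstream metric has to provide. It is a hypothetical example, not code from these patches: the CosineSimilarity class, its model_path default, and the comments are assumptions for illustration.

    # Illustrative sketch only: a hypothetical metric written against the
    # Metric ABC from ragas/metrics/base.py as it stands after this series.
    from __future__ import annotations

    from dataclasses import dataclass

    import numpy as np
    from sentence_transformers import SentenceTransformer

    from ragas.metrics.base import Metric


    @dataclass
    class CosineSimilarity(Metric):  # hypothetical; not part of ragas
        model_path: str = "all-MiniLM-L6-v2"  # assumed default

        @property
        def name(self) -> str:
            return "cosine_similarity"

        @property
        def is_batchable(self) -> bool:
            return True

        def init_model(self) -> None:
            # Heavy setup happens here instead of __post_init__, so that
            # constructing the metric stays cheap; Evaluation.eval() calls
            # init_model() on every metric before scoring starts.
            self.model = SentenceTransformer(self.model_path)

        def score(
            self, ground_truth: list[str], generated_text: list[str]
        ) -> list[float]:
            # Row-wise cosine similarity between paired embeddings.
            gt = self.model.encode(ground_truth, convert_to_numpy=True)
            gen = self.model.encode(generated_text, convert_to_numpy=True)
            num = (gt * gen).sum(axis=1)
            denom = np.linalg.norm(gt, axis=1) * np.linalg.norm(gen, axis=1)
            return (num / denom).tolist()

Deferring the SentenceTransformer load to init_model() keeps import and construction cheap, which means the benchmark above can import every metric up front while only paying for the models of the metrics it actually runs.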