2 changes: 1 addition & 1 deletion Makefile
@@ -18,7 +18,7 @@ lint: ## Running lint checker: ruff
@ruff check ragas examples tests
type: ## Running type checker: pyright
@echo "(pyright) Typechecking codebase..."
@pyright -p ragas
@pyright ragas
clean: ## Clean all generated files
@echo "Cleaning all generated files..."
@cd $(GIT_ROOT)/docs && make clean
20 changes: 20 additions & 0 deletions ragas/metrics/base.py
@@ -13,17 +13,33 @@ class Metric(ABC):
@property
@abstractmethod
def name(self: t.Self) -> str:
"""
The name of the metric.
"""
...

@property
@abstractmethod
def is_batchable(self: t.Self) -> bool:
"""
Whether the metric can be scored in batches.
"""
...

@abstractmethod
def init_model(self: t.Self):
"""
Lazily initialize the model; called by Evaluation.eval() before scoring.
"""
...

@abstractmethod
def score(
self: t.Self, ground_truth: list[str], generated_text: list[str]
) -> list[float]:
"""
Run the metric on the ground_truth and generated_text and return the scores.
"""
...


@@ -37,6 +53,10 @@ def eval(self, ground_truth: list[list[str]], generated_text: list[str]) -> Resu
ds = Dataset.from_dict(
{"ground_truth": ground_truth, "generated_text": generated_text}
)

# initialize all the models in the metrics
for m in self.metrics:
    m.init_model()

ds = ds.map(
self._get_score,
batched=self.batched,
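The new abstract interface moves model loading out of __post_init__ and into an explicit init_model() hook that Evaluation.eval() invokes before scoring, so constructing a metric stays cheap until it is actually evaluated. A minimal sketch of a concrete metric written against that contract (ExactMatch is illustrative and not part of this PR; it assumes Metric is importable from ragas.metrics.base as in the diff above):

from ragas.metrics.base import Metric


class ExactMatch(Metric):
    @property
    def name(self) -> str:
        return "exact_match"

    @property
    def is_batchable(self) -> bool:
        return True

    def init_model(self):
        # nothing to load for a pure string-comparison metric
        ...

    def score(self, ground_truth: list[str], generated_text: list[str]) -> list[float]:
        return [float(gt == gen) for gt, gen in zip(ground_truth, generated_text)]

Metrics that do wrap a model (EntailmentScore, BERTScore, ROUGE below) instead put their from_pretrained / SentenceTransformer / RougeScorer construction inside init_model.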
9 changes: 5 additions & 4 deletions ragas/metrics/factual.py
@@ -52,7 +52,7 @@ class EntailmentScore(Metric):
batch_size: int = 4
device: t.Literal["cpu", "cuda"] | Device = "cpu"

def __post_init__(self):
def init_model(self):
self.device = device_check(self.device)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)
@@ -212,10 +212,11 @@ class Qsquare(Metric):
include_nouns: bool = True
save_results: bool = False

def __post_init__(self):
def init_model(self):
self.qa = QAGQ.from_pretrained(self.qa_model_name)
self.qg = QAGQ.from_pretrained(self.qg_model_name)
self.nli = EntailmentScore()
self.nli.init_model()
try:
self.nlp = spacy.load(SPACY_MODEL)
except OSError:
@@ -326,15 +327,15 @@ def score(self, ground_truth: list[str], generated_text: list[str], **kwargs):
)
gnd_qans[i] = [
{"question": qstn, "answer": ans}
for qstn, ans in zip(questions, candidates)
for qstn, ans in zip(questions, candidates) # type: ignore
]

for i, gen_text in enumerate(generated_text):
questions = [item["question"] for item in gnd_qans[i]]
gen_answers = self.generate_answers(questions, gen_text)
_ = [
item.update({"predicted_answer": ans})
for item, ans in zip(gnd_qans[i], gen_answers)
for item, ans in zip(gnd_qans[i], gen_answers) # type: ignore
]

# del self.qa
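Note that Qsquare composes an EntailmentScore internally, so laziness has to be propagated by hand: the outer init_model() constructs the inner metric and then calls self.nli.init_model(). Any metric built from other metrics would follow the same pattern; a small sketch under the same assumptions (CompositeFactuality is illustrative and not part of this PR):

from dataclasses import dataclass, field

from ragas.metrics.base import Metric
from ragas.metrics.factual import EntailmentScore


@dataclass
class CompositeFactuality(Metric):
    nli: EntailmentScore = field(default_factory=EntailmentScore)

    @property
    def name(self) -> str:
        return "composite_factuality"

    @property
    def is_batchable(self) -> bool:
        return False

    def init_model(self):
        # constructing the inner metric is cheap; loading its weights happens here
        self.nli.init_model()

    def score(self, ground_truth: list[str], generated_text: list[str]) -> list[float]:
        return self.nli.score(ground_truth, generated_text)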
2 changes: 1 addition & 1 deletion ragas/metrics/similarity.py
@@ -18,7 +18,7 @@ class BERTScore(Metric):
model_path: str = "all-MiniLM-L6-v2"
batch_size: int = 1000

def __post_init__(self):
def init_model(self):
self.model = SentenceTransformer(self.model_path)

@property
8 changes: 7 additions & 1 deletion ragas/metrics/simple.py
@@ -26,6 +26,9 @@ def name(self):
def is_batchable(self):
return True

def init_model(self):
...

def score(self, ground_truth: t.List[str], generated_text: t.List[str]):
ground_truth_ = [[word_tokenize(text)] for text in ground_truth]
generated_text_ = [word_tokenize(text) for text in generated_text]
@@ -45,7 +48,7 @@ class ROUGE(Metric):
type: t.Literal[ROUGE_TYPES]
use_stemmer: bool = False

def __post_init__(self):
def init_model(self):
self.scorer = rouge_scorer.RougeScorer(
[self.type], use_stemmer=self.use_stemmer
)
@@ -80,6 +83,9 @@ def name(self) -> str:
def is_batchable(self):
return True

def init_model(self):
...

def score(self, ground_truth: t.List[str], generated_text: t.List[str]):
if self.measure == "distance":
score = [distance(s1, s2) for s1, s2 in zip(ground_truth, generated_text)]
17 changes: 7 additions & 10 deletions tests/benchmarks/benchmark.py
@@ -7,26 +7,23 @@

from ragas.metrics import (
Evaluation,
edit_distance,
bert_score,
edit_ratio,
q_square,
rouge1,
rouge2,
rougeL,
)

DEVICE = "cuda" if is_available() else "cpu"
BATCHES = [0, 1]
BATCHES = [0, 1, 30, 60]

METRICS = {
"Rouge1": rouge1,
"Rouge2": rouge2,
"RougeL": rougeL,
# "Rouge2": rouge2,
# "RougeL": rougeL,
"EditRatio": edit_ratio,
"EditDistance": edit_distance,
# "SBERTScore": bert_score,
# "EditDistance": edit_distance,
"SBERTScore": bert_score,
# "EntailmentScore": entailment_score,
"Qsquare": q_square,
# "Qsquare": q_square,
}
DS = load_dataset("explodinggradients/eli5-test", split="test_eli5")
assert isinstance(DS, arrow_dataset.Dataset), "Not an arrow_dataset"
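For reference, a hedged sketch of how one of the METRICS entries above could be driven through Evaluation with the new lazy initialization; the Evaluation(metrics=..., batched=..., batch_size=...) constructor arguments and the example strings are assumptions inferred from this diff, not copied from benchmark.py:

from ragas.metrics import Evaluation, edit_ratio, rouge1

# assumed constructor; only `metrics` and `batched` are visible in this diff
e = Evaluation(metrics=[rouge1, edit_ratio], batched=True, batch_size=30)

# eval() now calls init_model() on every metric before mapping _get_score over the dataset
result = e.eval(
    ground_truth=[["a reference answer"], ["another reference answer"]],
    generated_text=["a generated answer", "another generated answer"],
)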