Merged
22 changes: 16 additions & 6 deletions src/ragas/metrics/_answer_correctness.py
@@ -56,7 +56,7 @@ class AnswerCorrectness(MetricWithLLM):

"""
Measures answer correctness compared to ground truth as a combination of
semantic similarity and factuality
factuality and semantic similarity.

Attributes
----------
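As a quick illustration of what the reworded docstring describes, a hedged usage sketch follows; the import path and the weight values are assumptions, not taken from this diff (the first weight drives the factuality component, the second the semantic-similarity component):

```python
# Hedged usage sketch; import path and weight values are illustrative assumptions.
from ragas.metrics import AnswerCorrectness

# First weight -> factuality (statement-level F1), second -> semantic similarity.
answer_correctness = AnswerCorrectness(weights=[0.75, 0.25])

# Typically passed on to ragas.evaluate(dataset, metrics=[answer_correctness]).
```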
@@ -78,7 +78,14 @@ class AnswerCorrectness(MetricWithLLM):
     answer_similarity: AnswerSimilarity | None = None
 
     def __post_init__(self: t.Self):
-        if self.answer_similarity is None:
+        if len(self.weights) != 2:
+            raise ValueError("Expects a list of two weights. First for factuality, second for semantic similarity")
+        if all([w == 0 for w in self.weights]):
+            raise ValueError("At least one weight must be non-zero")
+        if not all([w >= 0 for w in self.weights]):
+            raise ValueError("Weights must be non-negative")
+
+        if self.answer_similarity is None and self.weights[1] != 0:
             self.answer_similarity = AnswerSimilarity(
                 llm=self.llm, batch_size=self.batch_size
             )
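A standalone sketch of the new validation, mirroring the checks added above so their effect on common configurations is easy to see (the helper name is hypothetical; the error messages are copied from the diff):

```python
# Hypothetical helper mirroring the new __post_init__ checks; not part of ragas itself.
def validate_weights(weights: list[float]) -> None:
    if len(weights) != 2:
        raise ValueError(
            "Expects a list of two weights. First for factuality, second for semantic similarity"
        )
    if all(w == 0 for w in weights):
        raise ValueError("At least one weight must be non-zero")
    if not all(w >= 0 for w in weights):
        raise ValueError("Weights must be non-negative")


validate_weights([1.0, 0.0])    # ok: factuality only; AnswerSimilarity is never constructed
# validate_weights([0.5])       # ValueError: exactly two weights are expected
# validate_weights([0.0, 0.0])  # ValueError: at least one weight must be non-zero
# validate_weights([0.3, -0.1]) # ValueError: weights must be non-negative
```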
@@ -113,7 +120,7 @@ def _score_batch(
"FP": "statements present in the answer but not found in the ground truth",
"FN": "relevant statements found in the ground truth but omitted in the answer", # noqa: E501
}

f1_score = []
for prediction in outputs:
prediction = json_loader.safe_load(prediction[0].text, self.llm)
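The TP/FP/FN statement counts defined above feed the F1-style factuality score computed in the next hunk. A small worked example with made-up counts:

```python
import numpy as np

# Hypothetical counts from the LLM-judged statement comparison.
tp, fp, fn = 3, 1, 2  # 3 statements match the ground truth, 1 extra, 2 missing

if (tp + fp + fn) > 0:
    score = tp / (tp + 0.5 * (fp + fn))  # 3 / (3 + 1.5) ≈ 0.667
else:
    score = np.nan  # no statements could be extracted at all

print(round(score, 3))  # 0.667
```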
@@ -131,10 +138,13 @@ def _score_batch(
                 score = tp / (tp + 0.5 * (fp + fn))
             else:
                 score = np.nan
 
             f1_score.append(score)
 
-        similarity_scores = self.answer_similarity._score_batch(dataset, callbacks=batch_group)  # type: ignore
+        if self.weights[1] == 0:
+            similarity_scores = np.zeros(len(f1_score))
+        else:
+            similarity_scores = self.answer_similarity._score_batch(dataset, callbacks=batch_group)  # type: ignore
         scores_stacked = np.vstack([f1_score, similarity_scores])
         scores = np.average(
             scores_stacked,
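Taken together, the change above skips the semantic-similarity pass entirely when its weight is zero, and the final score remains a per-sample weighted average of the two components. A hedged sketch of the combination step with illustrative numbers (the `axis` and `weights` arguments of the truncated `np.average` call are assumed):

```python
import numpy as np

weights = [0.75, 0.25]   # factuality, semantic similarity (illustrative values)
f1_score = [0.667, 1.0]  # per-sample factuality scores from the loop above

if weights[1] == 0:
    # Similarity weight is zero: skip the costly AnswerSimilarity pass entirely.
    similarity_scores = np.zeros(len(f1_score))
else:
    similarity_scores = [0.9, 0.8]  # would come from AnswerSimilarity._score_batch

scores_stacked = np.vstack([f1_score, similarity_scores])
scores = np.average(scores_stacked, axis=0, weights=weights)
print(scores)  # approximately [0.725, 0.95]
```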