diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py
index b8f5e03bc..169fe441c 100644
--- a/src/ragas/evaluation.py
+++ b/src/ragas/evaluation.py
@@ -37,7 +37,6 @@ def evaluate(
     dataset : Dataset[question: list[str], contexts: list[list[str]], answer: list[str]]
         The dataset in the format of ragas which the metrics will use to score the RAG
         pipeline with
-
     metrics : list[Metric] , optional
         List of metrics to use for evaluation. If not provided then ragas will run the
         evaluation on the best set of metrics to give a complete view.
diff --git a/src/ragas/metrics/context_relevance.py b/src/ragas/metrics/context_relevance.py
index a0dd0aa0d..8560ed55d 100644
--- a/src/ragas/metrics/context_relevance.py
+++ b/src/ragas/metrics/context_relevance.py
@@ -81,22 +81,37 @@ def evaluate(self, answers: List[List[str]]) -> np.float_:
 
 @dataclass
 class ContextRelevancy(Metric):
-    """
-    params
-        strictness: Integer, controls the number of times sentence extraction is
-            performed to quantify uncertainty from the LLM. Defaults to 2.
-        agreement_metric: bert_score or jaccard_score, used to measure agreement
-            between multiple samples.
-        model_name: any encoder model. Used for calculating bert_score.
+    """
+    Extracts sentences from the context that are relevant to the question with
+    self-consistency checks. The number of relevant sentences is used as the score.
+
+    Attributes
+    ----------
+    name : str
+    batch_size : int
+        Batch size for OpenAI completions.
+    strictness : int
+        Controls the number of times sentence extraction is performed to quantify
+        uncertainty from the LLM. Defaults to 2.
+    agreement_metric : str
+        "bert_score" or "jaccard_score", used to measure agreement between multiple
+        samples.
+    model_name : str
+        Any encoder model. Used for calculating bert_score.
     """
 
     name: str = "context_relavency"
     batch_size: int = 15
-    agreement_metric: str = "bert_score"
     strictness: int = 2
+    agreement_metric: str = "bert_score"
     model_name: str = "cross-encoder/stsb-TinyBERT-L-4"
 
+    def __post_init__(self: t.Self):
+        if self.agreement_metric == "bert_score" and self.model_name is None:
+            raise ValueError(
+                "model_name must be provided when agreement_metric is bert_score"
+            )
+
     def init_model(self: t.Self):
         self.sent_agreement = SentenceAgreement(
             model_name=self.model_name, metric=self.agreement_metric
@@ -104,7 +119,14 @@ def init_model(self: t.Self):
 
     def score(self: t.Self, dataset: Dataset) -> Dataset:
         """
+        Parameters
+        ----------
         dataset: Dataset[question: list[str], contexts: list[list[str]]]
+
+        Returns
+        -------
+        Dataset[question: list[str], contexts: list[list[str]], scores: list[float]]
+            Dataset with the scores for each row.
         """
         prompts = []
         questions, contexts = dataset["question"], dataset["contexts"]
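
Usage sketch (not part of the patch): a minimal example of how the interface documented above might be exercised. It assumes the module path shown in the diff (ragas.metrics.context_relevance), a Hugging Face datasets.Dataset with "question" and "contexts" columns as described in the score() docstring, and OpenAI credentials in the environment, since the metric issues completion calls; the sample data and the jaccard_score agreement choice are illustrative only.

from datasets import Dataset

from ragas.metrics.context_relevance import ContextRelevancy

# Toy dataset in the shape score() documents above:
# question: list[str], contexts: list[list[str]].
ds = Dataset.from_dict(
    {
        "question": ["When was the first Super Bowl played?"],
        "contexts": [
            [
                "The first AFL-NFL World Championship Game was played on "
                "January 15, 1967, at the Los Angeles Memorial Coliseum."
            ]
        ],
    }
)

# jaccard_score avoids loading the bert_score encoder; with
# agreement_metric="bert_score", model_name must be set, which the
# new __post_init__ check enforces.
metric = ContextRelevancy(strictness=2, agreement_metric="jaccard_score")
metric.init_model()

scored = metric.score(ds)  # dataset with a per-row relevance score added
print(scored.to_pandas())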