# RankingMetric
> Base class for ranking metrics

In [None]:
#| default_exp metric.ranking

In [None]:
#| export

import typing as t
from dataclasses import dataclass
from pydantic import BaseModel, Field
from ragas_experimental.metric import Metric, MetricResult
from ragas_experimental.metric.decorator import create_metric_decorator

@dataclass
class RankingMetric(Metric):
    """Metric whose result is a ranking of ``num_ranks`` items.

    A ranking is a list of item indices ordered from first place to last,
    i.e. a permutation of ``0 .. num_ranks - 1``. The class supplies the
    response models used to validate a ranking and the rule for merging
    several rankings into one.
    """

    # Number of items being ranked; a valid result is a permutation of
    # range(num_ranks).
    num_ranks: int

    def _get_response_model(self, with_reasoning: bool) -> t.Type[BaseModel]:
        """Return the response model for this metric, creating it on first use.

        Args:
            with_reasoning: When true, the model carries a required ``reason``
                string in addition to the ``result`` ranking.

        Returns:
            A pydantic model class whose ``model_post_init`` raises
            ``ValueError`` unless ``result`` contains each of
            ``0 .. num_ranks - 1`` exactly once. The class is stored in
            ``self._response_models`` keyed by ``with_reasoning`` and reused
            on later calls.
        """
        if with_reasoning in self._response_models:
            return self._response_models[with_reasoning]

        # Snapshot the expected ranking so the validator closure does not
        # depend on later changes to self.num_ranks.
        expected = list(range(self.num_ranks))

        def check_ranking(result: t.List[int]) -> None:
            # Compare sorted lists rather than sets: a set comparison would
            # accept rankings with repeated entries such as [0, 1, 2, 2].
            if sorted(result) != expected:
                raise ValueError(
                    f"'result' must contain exactly the numbers {expected} without repetition."
                )

        # Explicit model classes are used instead of pydantic's create_model.
        model: t.Type[BaseModel]
        if with_reasoning:
            # Model with result and reason
            class ResponseModelWithReason(BaseModel):
                result: t.List[int] = Field(...)
                reason: str = Field(...)

                def model_post_init(self, __context):
                    check_ranking(self.result)

            model = ResponseModelWithReason
        else:
            # Model with just result
            class ResponseModel(BaseModel):
                result: t.List[int] = Field(...)

                def model_post_init(self, __context):
                    check_ranking(self.result)

            model = ResponseModel

        self._response_models[with_reasoning] = model
        return model

    def _ensemble(self, results: t.List[MetricResult]) -> MetricResult:
        """Merge several rankings into one using a Borda count.

        Each ranking awards ``num_ranks - position`` points to the item at
        each position (first place earns the most). Items are then ordered by
        total points, highest first, with ties broken by the lower item index.

        Args:
            results: Rankings to merge. A single result is returned unchanged.
                With no results every item scores zero, so the ranking falls
                back to index order.

        Returns:
            A ``MetricResult`` holding the merged ranking. Its ``reason`` is a
            header line followed by every non-empty input reason, or ``None``
            when no input carried a reason.

        Raises:
            IndexError: If a ranking refers to an item outside
                ``0 .. num_ranks - 1``.
        """
        if len(results) == 1:
            return results[0]

        n_items = self.num_ranks
        borda_scores = [0] * n_items

        for result in results:
            for position_idx, item_idx in enumerate(result.result):
                # Reject negative indices explicitly: list indexing would
                # otherwise wrap around and credit the wrong item.
                if not 0 <= item_idx < n_items:
                    raise IndexError(
                        f"ranking refers to item {item_idx!r}, expected an index in range({n_items})"
                    )
                borda_scores[item_idx] += n_items - position_idx

        # Highest score first; equal scores keep ascending item order.
        final_ranking = sorted(range(n_items), key=lambda i: (-borda_scores[i], i))

        reasons = [r.reason for r in results if r.reason]
        if reasons:
            reason = "Ensemble ranking based on multiple evaluations.\n" + "\n".join(reasons)
        else:
            reason = None

        return MetricResult(result=final_ranking, reason=reason)
    

# Decorator factory bound to RankingMetric: applying `@ranking_metric(...)`
# to a function yields an object that can be scored like a metric.
# NOTE(review): accepted keyword arguments are defined by
# create_metric_decorator, which is not visible here — confirm there.
ranking_metric = create_metric_decorator(RankingMetric)

  from .autonotebook import tqdm as notebook_tqdm


### Example usage

In [None]:

#| eval: false

from ragas_experimental.llm import ragas_llm
from openai import OpenAI

# LLM wrapper handed to the metric; it is reused by later cells.
llm = ragas_llm(provider="openai", model="gpt-4o", client=OpenAI())

my_ranking_metric = RankingMetric(
    name="response_ranking",
    llm=llm,
    prompt="Rank the following responses:\n{candidates}",
    num_ranks=3,  # three candidates, so a result is a permutation of 0..2
)

# Candidate responses to rank; they fill the {candidates} slot of the prompt.
candidates = [
    "short answer.",
    "a bit more detailed.",
    "the longest and most detailed answer.",
]

# NOTE(review): n=3 presumably requests three evaluations that are merged
# into one ranking — the recorded output lists three reasons; confirm
# against Metric.score.
result = my_ranking_metric.score(candidates=candidates, n=3)
print(result)  # A ranking of candidate indices, e.g. [1, 0, 2]
print(result.reason)  # The reasoning behind the ranking



[2, 1, 0]
Ensemble ranking based on multiple evaluations.
The ranking is based on the length and detail of each response. 'the longest and most detailed answer.' is the most comprehensive, followed by 'a bit more detailed.', and 'short answer.' is the briefest.
The ranking is based on the length and detail of each response. The response 'the longest and most detailed answer.' is ranked highest (2) because it is the most detailed, followed by 'a bit more detailed.' (1), and finally 'short answer.' (0) as it is the least detailed.
The responses are ranked based on the level of detail and length. 'short answer.' is the least detailed, 'a bit more detailed.' provides more information, and 'the longest and most detailed answer.' offers the most comprehensive explanation.


### Custom ranking metric

In [None]:
#| eval: false

from ragas_experimental.metric import MetricResult

@ranking_metric(
    llm=llm,  # Your language model instance
    prompt="Rank the following responses:\n{candidates}",
    name='new_ranking_metric',
    num_ranks=3,
)
def my_ranking_metric(llm, prompt, **kwargs):
    # Put your custom logic here: call the LLM with the prompt (formatted
    # with the candidates) and return a MetricResult holding the ranking and
    # the reason for it. This example skips the LLM call and returns a fixed
    # answer.
    return MetricResult(
        # Dummy ranking: second candidate is best, then first, then third.
        result=[1, 0, 2],
        reason="Ranked based on response clarity and detail.",
    )

# The decorated function is scored through .score(...), passing the values
# that fill the prompt as keyword arguments.
responses = [
    "Response A: short answer.",
    "Response B: a bit more detailed.",
    "Response C: the longest and most detailed answer.",
]
result = my_ranking_metric.score(candidates=responses)
print(result)  # E.g., [1, 0, 2]
print(result.reason)  # E.g., "Ranked based on response clarity and detail."


[1, 0, 2]
Ranked based on response clarity and detail.
