31 changes: 17 additions & 14 deletions docs/howtos/customizations/metrics/write_your_own_metric.ipynb
@@ -62,7 +62,7 @@
"source": [
"from ragas.llms import llm_factory\n",
"\n",
"evaluator_llm = llm_factory('gpt-4o')"
"evaluator_llm = llm_factory(\"gpt-4o\")"
]
},
{
@@ -104,7 +104,7 @@
"hallucinations_binary = AspectCritic(\n",
" name=\"hallucinations_binary\",\n",
" definition=\"Did the model hallucinate or add any information that was not present in the retrieved context?\",\n",
" llm=evaluator_llm\n",
" llm=evaluator_llm,\n",
")\n",
"\n",
"await hallucinations_binary.single_turn_ascore(eval_dataset[0])"
@@ -163,9 +163,7 @@
"from ragas.metrics import RubricsScoreWithoutReference\n",
"\n",
"hallucinations_rubric = RubricsScoreWithoutReference(\n",
" name=\"hallucinations_rubric\",\n",
" llm=evaluator_llm,\n",
" rubrics=rubric\n",
" name=\"hallucinations_rubric\", llm=evaluator_llm, rubrics=rubric\n",
")\n",
"\n",
"await hallucinations_rubric.single_turn_ascore(eval_dataset[0])"
@@ -215,19 +213,28 @@
"from ragas.callbacks import Callbacks\n",
"from ragas.dataset_schema import SingleTurnSample\n",
"\n",
"\n",
"@dataclass\n",
"class HallucinationsMetric(MetricWithLLM, SingleTurnMetric):\n",
" # name of the metric\n",
" name: str = \"hallucinations_metric\"\n",
" # we need to define the required columns for the metric\n",
" _required_columns: t.Dict[MetricType, t.Set[str]] = field(default_factory=lambda: {MetricType.SINGLE_TURN: {\"user_input\", \"response\", \"retrieved_contexts\"}})\n",
" _required_columns: t.Dict[MetricType, t.Set[str]] = field(\n",
" default_factory=lambda: {\n",
" MetricType.SINGLE_TURN: {\"user_input\", \"response\", \"retrieved_contexts\"}\n",
" }\n",
" )\n",
"\n",
" def __post_init__(self):\n",
" # init the faithfulness metric\n",
" self.faithfulness_metric = Faithfulness(llm=self.llm)\n",
"\n",
" async def _single_turn_ascore(self, sample: SingleTurnSample, callbacks: Callbacks) -> float:\n",
" faithfulness_score = await self.faithfulness_metric.single_turn_ascore(sample, callbacks)\n",
" async def _single_turn_ascore(\n",
" self, sample: SingleTurnSample, callbacks: Callbacks\n",
" ) -> float:\n",
" faithfulness_score = await self.faithfulness_metric.single_turn_ascore(\n",
" sample, callbacks\n",
" )\n",
" return 1 - faithfulness_score"
]
},
@@ -269,12 +276,8 @@
"from ragas import evaluate\n",
"\n",
"results = evaluate(\n",
" eval_dataset, \n",
" metrics=[\n",
" hallucinations_metric,\n",
" hallucinations_rubric,\n",
" hallucinations_binary\n",
" ], \n",
" eval_dataset,\n",
" metrics=[hallucinations_metric, hallucinations_rubric, hallucinations_binary],\n",
")"
]
},
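Note for readers of the notebook above: once evaluate(...) returns, the result can be inspected as sketched here. This is a minimal, illustrative sketch, not part of the diff; it assumes the results, eval_dataset, and metric names defined in the notebook, and uses the to_pandas / __repr__ / __getitem__ accessors kept on EvaluationResult in this PR.

    # Sketch: inspecting the EvaluationResult returned by evaluate(...) in the notebook.
    # Assumes `results` and the three hallucination metrics defined above.
    df = results.to_pandas()                    # per-sample scores as a DataFrame
    print(results)                              # __repr__ prints the aggregate score per metric
    print(results["hallucinations_metric"])     # per-sample scores via __getitem__
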
25 changes: 13 additions & 12 deletions src/ragas/callbacks.py
@@ -57,13 +57,13 @@ class ChainType(Enum):


class ChainRun(BaseModel):
run_id: uuid.UUID
parent_run_id: t.Optional[uuid.UUID]
run_id: str
parent_run_id: t.Optional[str]
name: str
inputs: t.Dict[str, t.Any]
metadata: t.Dict[str, t.Any]
outputs: t.Dict[str, t.Any] = Field(default_factory=dict)
children: t.List[uuid.UUID] = Field(default_factory=list)
children: t.List[str] = Field(default_factory=list)


class ChainRunEncoder(json.JSONEncoder):
@@ -72,12 +72,14 @@ def default(self, o):
return str(o)
if isinstance(o, ChainType):
return o.value
# if isinstance(o, EvaluationResult):
# return ""
return json.JSONEncoder.default(self, o)


@dataclass
class RagasTracer(BaseCallbackHandler):
traces: t.Dict[uuid.UUID, ChainRun] = field(default_factory=dict)
traces: t.Dict[str, ChainRun] = field(default_factory=dict)

def on_chain_start(
self,
@@ -90,17 +92,17 @@ def on_chain_start(
metadata: t.Optional[t.Dict[str, t.Any]] = None,
**kwargs: t.Any,
) -> t.Any:
self.traces[run_id] = ChainRun(
run_id=run_id,
parent_run_id=parent_run_id,
self.traces[str(run_id)] = ChainRun(
run_id=str(run_id),
parent_run_id=str(parent_run_id) if parent_run_id else None,
name=serialized["name"],
inputs=inputs,
metadata=metadata or {},
children=[],
)

if parent_run_id and parent_run_id in self.traces:
self.traces[parent_run_id].children.append(run_id)
if parent_run_id and str(parent_run_id) in self.traces:
self.traces[str(parent_run_id)].children.append(str(run_id))

def on_chain_end(
self,
@@ -109,12 +111,11 @@
run_id: uuid.UUID,
**kwargs: t.Any,
) -> t.Any:
self.traces[run_id].outputs = outputs
self.traces[str(run_id)].outputs = outputs

def to_jsons(self) -> str:
return json.dumps(
[t.model_dump() for t in self.traces.values()],
indent=4,
cls=ChainRunEncoder,
)

@@ -131,7 +132,7 @@ def __str__(self):


def parse_run_traces(
traces: t.Dict[uuid.UUID, ChainRun],
traces: t.Dict[str, ChainRun],
) -> t.List[t.Dict[str, t.Any]]:
root_traces = [
chain_trace
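Reviewer note on the uuid.UUID → str change above: storing run ids as strings lets RagasTracer.to_jsons() serialize the trace dict directly, since json.dumps rejects UUID dictionary keys. A small standalone illustration (not part of the diff):

    import json
    import uuid

    run_id = uuid.uuid4()

    # UUID dict keys are not JSON-serializable and raise a TypeError
    try:
        json.dumps({run_id: {"name": "evaluation"}})
    except TypeError as exc:
        print(exc)  # keys must be str, int, float, bool or None, not UUID

    # str keys, as RagasTracer now stores them, serialize without any custom key handling
    print(json.dumps({str(run_id): {"name": "evaluation"}}))
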
51 changes: 42 additions & 9 deletions src/ragas/dataset_schema.py
@@ -8,13 +8,12 @@
from datasets import Dataset as HFDataset
from pydantic import BaseModel, field_validator

from ragas.callbacks import parse_run_traces
from ragas.callbacks import ChainRunEncoder, parse_run_traces
from ragas.cost import CostCallbackHandler
from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage
from ragas.utils import safe_nanmean
from ragas.utils import RAGAS_API_URL, safe_nanmean

if t.TYPE_CHECKING:
import uuid
from pathlib import Path

from datasets import Dataset as HFDataset
@@ -375,7 +374,7 @@ class EvaluationResult:
binary_columns: t.List[str] = field(default_factory=list)
cost_cb: t.Optional[CostCallbackHandler] = None
traces: t.List[t.Dict[str, t.Any]] = field(default_factory=list)
ragas_traces: t.Dict[uuid.UUID, ChainRun] = field(default_factory=dict, repr=False)
ragas_traces: t.Dict[str, ChainRun] = field(default_factory=dict, repr=False)

def __post_init__(self):
# transform scores from list of dicts to dict of lists
@@ -395,6 +394,13 @@ def __post_init__(self):
# parse the traces
self.traces = parse_run_traces(self.ragas_traces)

def __repr__(self) -> str:
score_strs = [f"'{k}': {v:0.4f}" for k, v in self._repr_dict.items()]
return "{" + ", ".join(score_strs) + "}"

def __getitem__(self, key: str) -> t.List[float]:
return self._scores_dict[key]

def to_pandas(self, batch_size: int | None = None, batched: bool = False):
"""
Convert the result to a pandas DataFrame.
@@ -487,9 +493,36 @@ def total_cost(
cost_per_input_token, cost_per_output_token, per_model_costs
)

def __repr__(self) -> str:
score_strs = [f"'{k}': {v:0.4f}" for k, v in self._repr_dict.items()]
return "{" + ", ".join(score_strs) + "}"
def upload(self, base_url: str = RAGAS_API_URL, verbose: bool = True) -> str:
from datetime import datetime, timezone

import requests

timestamp = datetime.now(timezone.utc).isoformat()
root_trace = [
trace for trace in self.ragas_traces.values() if trace.parent_run_id is None
][0]
packet = json.dumps(
{
"run_id": str(root_trace.run_id),
"created_at": timestamp,
"evaluation_run": [t.model_dump() for t in self.ragas_traces.values()],
},
cls=ChainRunEncoder,
)

def __getitem__(self, key: str) -> t.List[float]:
return self._scores_dict[key]
response = requests.post(
f"{base_url}/alignment/evaluation",
data=packet,
headers={"Content-Type": "application/json"},
)

if response.status_code != 200:
raise Exception(f"Failed to upload results: {response.text}")

evaluation_endpoint = (
f"https://app.ragas.io/alignment/evaluation/{root_trace.run_id}"
)
if verbose:
print(f"Evaluation results uploaded! View at {evaluation_endpoint}")
return evaluation_endpoint
Comment on lines +496 to +528 (Member Author): code for uploading
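A minimal usage sketch for the new EvaluationResult.upload() added here (illustrative only; the self-hosted base_url in the commented call is hypothetical):

    from ragas import evaluate

    # `results` is an EvaluationResult, e.g. from the notebook example above
    results = evaluate(eval_dataset, metrics=[hallucinations_metric])

    # POSTs the run traces to {base_url}/alignment/evaluation and returns the dashboard URL
    url = results.upload()  # base_url defaults to RAGAS_API_URL ("https://api.ragas.io")
    # url = results.upload(base_url="http://localhost:8080", verbose=False)  # hypothetical self-hosted endpoint
    print(url)  # https://app.ragas.io/alignment/evaluation/<run_id>
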

2 changes: 1 addition & 1 deletion src/ragas/evaluation.py
@@ -344,7 +344,7 @@ def evaluate(
ragas_traces=tracer.traces,
)
if not evaluation_group_cm.ended:
evaluation_rm.on_chain_end(result)
evaluation_rm.on_chain_end({"scores": result.scores})
finally:
# reset llms and embeddings if changed
for i in llm_changed:
4 changes: 3 additions & 1 deletion src/ragas/metrics/_context_entities_recall.py
@@ -23,7 +23,9 @@ class EntitiesList(BaseModel):

class ExtractEntitiesPrompt(PydanticPrompt[StringIO, EntitiesList]):
name: str = "text_entity_extraction"
instruction: str = "Given a text, extract unique entities without repetition. Ensure you consider different forms or mentions of the same entity as a single entity."
instruction: str = (
"Given a text, extract unique entities without repetition. Ensure you consider different forms or mentions of the same entity as a single entity."
)
input_model = StringIO
output_model = EntitiesList
examples = [
26 changes: 14 additions & 12 deletions src/ragas/metrics/_context_precision.py
@@ -33,7 +33,9 @@ class Verification(BaseModel):

class ContextPrecisionPrompt(PydanticPrompt[QAC, Verification]):
name: str = "context_precision"
instruction: str = 'Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output.'
instruction: str = (
'Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output.'
)
input_model = QAC
output_model = Verification
examples = [
@@ -157,17 +159,17 @@ async def _ascore(
user_input, retrieved_contexts, reference = self._get_row_attributes(row)
responses = []
for context in retrieved_contexts:
verdicts: t.List[
Verification
] = await self.context_precision_prompt.generate_multiple(
data=QAC(
question=user_input,
context=context,
answer=reference,
),
n=self.reproducibility,
llm=self.llm,
callbacks=callbacks,
verdicts: t.List[Verification] = (
await self.context_precision_prompt.generate_multiple(
data=QAC(
question=user_input,
context=context,
answer=reference,
),
n=self.reproducibility,
llm=self.llm,
callbacks=callbacks,
)
)

responses.append([result.model_dump() for result in verdicts])
26 changes: 14 additions & 12 deletions src/ragas/metrics/_context_recall.py
@@ -41,7 +41,9 @@ class ContextRecallClassificationPrompt(
PydanticPrompt[QCA, ContextRecallClassifications]
):
name: str = "context_recall_classification"
instruction: str = "Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only 'Yes' (1) or 'No' (0) as a binary classification. Output json with reason."
instruction: str = (
"Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only 'Yes' (1) or 'No' (0) as a binary classification. Output json with reason."
)
input_model = QCA
output_model = ContextRecallClassifications
examples = [
@@ -148,17 +150,17 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
assert self.llm is not None, "set LLM before use"

# run classification
classifications_list: t.List[
ContextRecallClassifications
] = await self.context_recall_prompt.generate_multiple(
data=QCA(
question=row["user_input"],
context="\n".join(row["retrieved_contexts"]),
answer=row["reference"],
),
llm=self.llm,
callbacks=callbacks,
n=self.reproducibility,
classifications_list: t.List[ContextRecallClassifications] = (
await self.context_recall_prompt.generate_multiple(
data=QCA(
question=row["user_input"],
context="\n".join(row["retrieved_contexts"]),
answer=row["reference"],
),
llm=self.llm,
callbacks=callbacks,
n=self.reproducibility,
)
)
classification_dicts = []
for classification in classifications_list:
12 changes: 9 additions & 3 deletions src/ragas/metrics/_summarization.py
@@ -31,7 +31,9 @@ class AnswersGenerated(BaseModel):

class ExtractKeyphrasePrompt(PydanticPrompt[StringIO, ExtractedKeyphrases]):
name: str = "extract_keyphrases"
instruction: str = "Extract keyphrases of type: Person, Organization, Location, Date/Time, Monetary Values, and Percentages."
instruction: str = (
"Extract keyphrases of type: Person, Organization, Location, Date/Time, Monetary Values, and Percentages."
)
input_model = StringIO
output_model = ExtractedKeyphrases
examples: t.List[t.Tuple[StringIO, ExtractedKeyphrases]] = [
@@ -62,7 +64,9 @@ class GenerateQuestionsPrompt(
PydanticPrompt[GenerateQuestionsPromptInput, QuestionsGenerated]
):
name: str = "generate_questions"
instruction: str = "Based on the given text and keyphrases, generate closed-ended questions that can be answered with '1' if the question can be answered using the text, or '0' if it cannot. The questions should ALWAYS result in a '1' based on the given text."
instruction: str = (
"Based on the given text and keyphrases, generate closed-ended questions that can be answered with '1' if the question can be answered using the text, or '0' if it cannot. The questions should ALWAYS result in a '1' based on the given text."
)
input_model = GenerateQuestionsPromptInput
output_model = QuestionsGenerated
examples: t.List[t.Tuple[GenerateQuestionsPromptInput, QuestionsGenerated]] = [
@@ -99,7 +103,9 @@ class SummaryAndQuestions(BaseModel):

class GenerateAnswersPrompt(PydanticPrompt[SummaryAndQuestions, AnswersGenerated]):
name: str = "generate_answers"
instruction: str = "Based on the list of close-ended '1' or '0' questions, generate a JSON with key 'answers', which is a list of strings that determines whether the provided summary contains sufficient information to answer EACH question. Answers should STRICTLY be either '1' or '0'. Answer '0' if the provided summary does not contain enough information to answer the question and answer '1' if the provided summary can answer the question."
instruction: str = (
"Based on the list of close-ended '1' or '0' questions, generate a JSON with key 'answers', which is a list of strings that determines whether the provided summary contains sufficient information to answer EACH question. Answers should STRICTLY be either '1' or '0'. Answer '0' if the provided summary does not contain enough information to answer the question and answer '1' if the provided summary can answer the question."
)
input_model = SummaryAndQuestions
output_model = AnswersGenerated
examples: t.List[t.Tuple[SummaryAndQuestions, AnswersGenerated]] = [
4 changes: 3 additions & 1 deletion src/ragas/metrics/_topic_adherence.py
@@ -97,7 +97,9 @@ class TopicRefusedPrompt(PydanticPrompt[TopicRefusedInput, TopicRefusedOutput]):
]


class TopicExtractionPrompt(PydanticPrompt[TopicExtractionInput, TopicExtractionOutput]):
class TopicExtractionPrompt(
PydanticPrompt[TopicExtractionInput, TopicExtractionOutput]
):
instruction: str = (
"Given an interaction between Human, Tool and AI, extract the topics from Human's input."
)
2 changes: 2 additions & 0 deletions src/ragas/utils.py
@@ -19,6 +19,8 @@
RAGAS_SUPPORTED_LANGUAGE_CODES = {
v.__name__.lower(): k for k, v in LANGUAGE_CODES.items()
}
# endpoint for uploading results
RAGAS_API_URL = "https://api.ragas.io"


@lru_cache(maxsize=1)