diff --git a/docs/howtos/customizations/metrics/write_your_own_metric.ipynb b/docs/howtos/customizations/metrics/write_your_own_metric.ipynb
index bb47013e2..66407cfe6 100644
--- a/docs/howtos/customizations/metrics/write_your_own_metric.ipynb
+++ b/docs/howtos/customizations/metrics/write_your_own_metric.ipynb
@@ -62,7 +62,7 @@
    "source": [
     "from ragas.llms import llm_factory\n",
     "\n",
-    "evaluator_llm = llm_factory('gpt-4o')"
+    "evaluator_llm = llm_factory(\"gpt-4o\")"
    ]
   },
   {
@@ -104,7 +104,7 @@
     "hallucinations_binary = AspectCritic(\n",
     "    name=\"hallucinations_binary\",\n",
     "    definition=\"Did the model hallucinate or add any information that was not present in the retrieved context?\",\n",
-    "    llm=evaluator_llm\n",
+    "    llm=evaluator_llm,\n",
     ")\n",
     "\n",
     "await hallucinations_binary.single_turn_ascore(eval_dataset[0])"
@@ -163,9 +163,7 @@
     "from ragas.metrics import RubricsScoreWithoutReference\n",
     "\n",
     "hallucinations_rubric = RubricsScoreWithoutReference(\n",
-    "    name=\"hallucinations_rubric\",\n",
-    "    llm=evaluator_llm,\n",
-    "    rubrics=rubric\n",
+    "    name=\"hallucinations_rubric\", llm=evaluator_llm, rubrics=rubric\n",
     ")\n",
     "\n",
     "await hallucinations_rubric.single_turn_ascore(eval_dataset[0])"
@@ -215,19 +213,28 @@
     "from ragas.callbacks import Callbacks\n",
     "from ragas.dataset_schema import SingleTurnSample\n",
     "\n",
+    "\n",
     "@dataclass\n",
     "class HallucinationsMetric(MetricWithLLM, SingleTurnMetric):\n",
     "    # name of the metric\n",
     "    name: str = \"hallucinations_metric\"\n",
     "    # we need to define the required columns for the metric\n",
-    "    _required_columns: t.Dict[MetricType, t.Set[str]] = field(default_factory=lambda: {MetricType.SINGLE_TURN: {\"user_input\", \"response\", \"retrieved_contexts\"}})\n",
+    "    _required_columns: t.Dict[MetricType, t.Set[str]] = field(\n",
+    "        default_factory=lambda: {\n",
+    "            MetricType.SINGLE_TURN: {\"user_input\", \"response\", \"retrieved_contexts\"}\n",
+    "        }\n",
+    "    )\n",
     "\n",
     "    def __post_init__(self):\n",
     "        # init the faithfulness metric\n",
     "        self.faithfulness_metric = Faithfulness(llm=self.llm)\n",
     "\n",
-    "    async def _single_turn_ascore(self, sample: SingleTurnSample, callbacks: Callbacks) -> float:\n",
-    "        faithfulness_score = await self.faithfulness_metric.single_turn_ascore(sample, callbacks)\n",
+    "    async def _single_turn_ascore(\n",
+    "        self, sample: SingleTurnSample, callbacks: Callbacks\n",
+    "    ) -> float:\n",
+    "        faithfulness_score = await self.faithfulness_metric.single_turn_ascore(\n",
+    "            sample, callbacks\n",
+    "        )\n",
     "        return 1 - faithfulness_score"
    ]
   },
@@ -269,12 +276,8 @@
     "from ragas import evaluate\n",
     "\n",
     "results = evaluate(\n",
-    "    eval_dataset, \n",
-    "    metrics=[\n",
-    "        hallucinations_metric,\n",
-    "        hallucinations_rubric,\n",
-    "        hallucinations_binary\n",
-    "    ], \n",
+    "    eval_dataset,\n",
+    "    metrics=[hallucinations_metric, hallucinations_rubric, hallucinations_binary],\n",
     ")"
    ]
   },
diff --git a/src/ragas/callbacks.py b/src/ragas/callbacks.py
index 014eabb23..7b16059f5 100644
--- a/src/ragas/callbacks.py
+++ b/src/ragas/callbacks.py
@@ -57,13 +57,13 @@ class ChainType(Enum):
 
 
 class ChainRun(BaseModel):
-    run_id: uuid.UUID
-    parent_run_id: t.Optional[uuid.UUID]
+    run_id: str
+    parent_run_id: t.Optional[str]
     name: str
     inputs: t.Dict[str, t.Any]
     metadata: t.Dict[str, t.Any]
     outputs: t.Dict[str, t.Any] = Field(default_factory=dict)
-    children: t.List[uuid.UUID] = Field(default_factory=list)
+    children: t.List[str] = Field(default_factory=list)
 
 
 class ChainRunEncoder(json.JSONEncoder):
@@ -72,12 +72,14 @@ def default(self, o):
             return str(o)
         if isinstance(o, ChainType):
             return o.value
+        # if isinstance(o, EvaluationResult):
+        #     return ""
        return json.JSONEncoder.default(self, o)
 
 
 @dataclass
 class RagasTracer(BaseCallbackHandler):
-    traces: t.Dict[uuid.UUID, ChainRun] = field(default_factory=dict)
+    traces: t.Dict[str, ChainRun] = field(default_factory=dict)
 
     def on_chain_start(
         self,
@@ -90,17 +92,17 @@ def on_chain_start(
         serialized: t.Dict[str, t.Any],
         inputs: t.Dict[str, t.Any],
         *,
         run_id: uuid.UUID,
         parent_run_id: t.Optional[uuid.UUID] = None,
         tags: t.Optional[t.List[str]] = None,
         metadata: t.Optional[t.Dict[str, t.Any]] = None,
         **kwargs: t.Any,
     ) -> t.Any:
-        self.traces[run_id] = ChainRun(
-            run_id=run_id,
-            parent_run_id=parent_run_id,
+        self.traces[str(run_id)] = ChainRun(
+            run_id=str(run_id),
+            parent_run_id=str(parent_run_id) if parent_run_id else None,
             name=serialized["name"],
             inputs=inputs,
             metadata=metadata or {},
             children=[],
         )
 
-        if parent_run_id and parent_run_id in self.traces:
-            self.traces[parent_run_id].children.append(run_id)
+        if parent_run_id and str(parent_run_id) in self.traces:
+            self.traces[str(parent_run_id)].children.append(str(run_id))
 
     def on_chain_end(
         self,
         outputs: t.Dict[str, t.Any],
         *,
         run_id: uuid.UUID,
         **kwargs: t.Any,
     ) -> t.Any:
-        self.traces[run_id].outputs = outputs
+        self.traces[str(run_id)].outputs = outputs
 
     def to_jsons(self) -> str:
         return json.dumps(
             [t.model_dump() for t in self.traces.values()],
-            indent=4,
             cls=ChainRunEncoder,
         )
@@ -131,7 +132,7 @@ def __str__(self):
 
 
 def parse_run_traces(
-    traces: t.Dict[uuid.UUID, ChainRun],
+    traces: t.Dict[str, ChainRun],
 ) -> t.List[t.Dict[str, t.Any]]:
     root_traces = [
         chain_trace
diff --git a/src/ragas/dataset_schema.py b/src/ragas/dataset_schema.py
index 0a59c2485..d3b4978d7 100644
--- a/src/ragas/dataset_schema.py
+++ b/src/ragas/dataset_schema.py
@@ -8,13 +8,12 @@
 from datasets import Dataset as HFDataset
 from pydantic import BaseModel, field_validator
 
-from ragas.callbacks import parse_run_traces
+from ragas.callbacks import ChainRunEncoder, parse_run_traces
 from ragas.cost import CostCallbackHandler
 from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage
-from ragas.utils import safe_nanmean
+from ragas.utils import RAGAS_API_URL, safe_nanmean
 
 if t.TYPE_CHECKING:
-    import uuid
     from pathlib import Path
 
     from datasets import Dataset as HFDataset
@@ -375,7 +374,7 @@ class EvaluationResult:
     binary_columns: t.List[str] = field(default_factory=list)
     cost_cb: t.Optional[CostCallbackHandler] = None
     traces: t.List[t.Dict[str, t.Any]] = field(default_factory=list)
-    ragas_traces: t.Dict[uuid.UUID, ChainRun] = field(default_factory=dict, repr=False)
+    ragas_traces: t.Dict[str, ChainRun] = field(default_factory=dict, repr=False)
 
     def __post_init__(self):
         # transform scores from list of dicts to dict of lists
@@ -395,6 +394,13 @@ def __post_init__(self):
         # parse the traces
         self.traces = parse_run_traces(self.ragas_traces)
 
+    def __repr__(self) -> str:
+        score_strs = [f"'{k}': {v:0.4f}" for k, v in self._repr_dict.items()]
+        return "{" + ", ".join(score_strs) + "}"
+
+    def __getitem__(self, key: str) -> t.List[float]:
+        return self._scores_dict[key]
+
     def to_pandas(self, batch_size: int | None = None, batched: bool = False):
         """
         Convert the result to a pandas DataFrame.
@@ -487,9 +493,36 @@ def total_cost(
             cost_per_input_token, cost_per_output_token, per_model_costs
         )
 
-    def __repr__(self) -> str:
-        score_strs = [f"'{k}': {v:0.4f}" for k, v in self._repr_dict.items()]
-        return "{" + ", ".join(score_strs) + "}"
+    def upload(self, base_url: str = RAGAS_API_URL, verbose: bool = True) -> str:
+        from datetime import datetime, timezone
+
+        import requests
+
+        timestamp = datetime.now(timezone.utc).isoformat()
+        root_trace = [
+            trace for trace in self.ragas_traces.values() if trace.parent_run_id is None
+        ][0]
+        packet = json.dumps(
+            {
+                "run_id": str(root_trace.run_id),
+                "created_at": timestamp,
+                "evaluation_run": [t.model_dump() for t in self.ragas_traces.values()],
+            },
+            cls=ChainRunEncoder,
+        )
 
-    def __getitem__(self, key: str) -> t.List[float]:
-        return self._scores_dict[key]
+        response = requests.post(
+            f"{base_url}/alignment/evaluation",
+            data=packet,
+            headers={"Content-Type": "application/json"},
+        )
+
+        if response.status_code != 200:
+            raise Exception(f"Failed to upload results: {response.text}")
+
+        evaluation_endpoint = (
+            f"https://app.ragas.io/alignment/evaluation/{root_trace.run_id}"
+        )
+        if verbose:
+            print(f"Evaluation results uploaded! View at {evaluation_endpoint}")
+        return evaluation_endpoint
diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py
index 81fa62cae..94faa877a 100644
--- a/src/ragas/evaluation.py
+++ b/src/ragas/evaluation.py
@@ -344,7 +344,7 @@ def evaluate(
             ragas_traces=tracer.traces,
         )
         if not evaluation_group_cm.ended:
-            evaluation_rm.on_chain_end(result)
+            evaluation_rm.on_chain_end({"scores": result.scores})
     finally:
         # reset llms and embeddings if changed
         for i in llm_changed:
diff --git a/src/ragas/metrics/_context_entities_recall.py b/src/ragas/metrics/_context_entities_recall.py
index 6976d69ce..2a40c2cf8 100644
--- a/src/ragas/metrics/_context_entities_recall.py
+++ b/src/ragas/metrics/_context_entities_recall.py
@@ -23,7 +23,9 @@ class EntitiesList(BaseModel):
 
 class ExtractEntitiesPrompt(PydanticPrompt[StringIO, EntitiesList]):
     name: str = "text_entity_extraction"
-    instruction: str = "Given a text, extract unique entities without repetition. Ensure you consider different forms or mentions of the same entity as a single entity."
+    instruction: str = (
+        "Given a text, extract unique entities without repetition. Ensure you consider different forms or mentions of the same entity as a single entity."
+    )
     input_model = StringIO
     output_model = EntitiesList
     examples = [
diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py
index f8b02a200..e935e8f02 100644
--- a/src/ragas/metrics/_context_precision.py
+++ b/src/ragas/metrics/_context_precision.py
@@ -33,7 +33,9 @@ class Verification(BaseModel):
 
 class ContextPrecisionPrompt(PydanticPrompt[QAC, Verification]):
     name: str = "context_precision"
-    instruction: str = 'Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output.'
+    instruction: str = (
+        'Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output.'
+    )
     input_model = QAC
     output_model = Verification
     examples = [
@@ -157,17 +159,17 @@ async def _ascore(
         user_input, retrieved_contexts, reference = self._get_row_attributes(row)
         responses = []
         for context in retrieved_contexts:
-            verdicts: t.List[
-                Verification
-            ] = await self.context_precision_prompt.generate_multiple(
-                data=QAC(
-                    question=user_input,
-                    context=context,
-                    answer=reference,
-                ),
-                n=self.reproducibility,
-                llm=self.llm,
-                callbacks=callbacks,
+            verdicts: t.List[Verification] = (
+                await self.context_precision_prompt.generate_multiple(
+                    data=QAC(
+                        question=user_input,
+                        context=context,
+                        answer=reference,
+                    ),
+                    n=self.reproducibility,
+                    llm=self.llm,
+                    callbacks=callbacks,
+                )
             )
             responses.append([result.model_dump() for result in verdicts])
diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py
index d685deddc..e655957e4 100644
--- a/src/ragas/metrics/_context_recall.py
+++ b/src/ragas/metrics/_context_recall.py
@@ -41,7 +41,9 @@ class ContextRecallClassificationPrompt(
     PydanticPrompt[QCA, ContextRecallClassifications]
 ):
     name: str = "context_recall_classification"
-    instruction: str = "Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only 'Yes' (1) or 'No' (0) as a binary classification. Output json with reason."
+    instruction: str = (
+        "Given a context, and an answer, analyze each sentence in the answer and classify if the sentence can be attributed to the given context or not. Use only 'Yes' (1) or 'No' (0) as a binary classification. Output json with reason."
+    )
     input_model = QCA
     output_model = ContextRecallClassifications
     examples = [
@@ -148,17 +150,17 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         assert self.llm is not None, "set LLM before use"
 
         # run classification
-        classifications_list: t.List[
-            ContextRecallClassifications
-        ] = await self.context_recall_prompt.generate_multiple(
-            data=QCA(
-                question=row["user_input"],
-                context="\n".join(row["retrieved_contexts"]),
-                answer=row["reference"],
-            ),
-            llm=self.llm,
-            callbacks=callbacks,
-            n=self.reproducibility,
+        classifications_list: t.List[ContextRecallClassifications] = (
+            await self.context_recall_prompt.generate_multiple(
+                data=QCA(
+                    question=row["user_input"],
+                    context="\n".join(row["retrieved_contexts"]),
+                    answer=row["reference"],
+                ),
+                llm=self.llm,
+                callbacks=callbacks,
+                n=self.reproducibility,
+            )
         )
         classification_dicts = []
         for classification in classifications_list:
diff --git a/src/ragas/metrics/_summarization.py b/src/ragas/metrics/_summarization.py
index ce9843110..0cdb09d63 100644
--- a/src/ragas/metrics/_summarization.py
+++ b/src/ragas/metrics/_summarization.py
@@ -31,7 +31,9 @@ class AnswersGenerated(BaseModel):
 
 class ExtractKeyphrasePrompt(PydanticPrompt[StringIO, ExtractedKeyphrases]):
     name: str = "extract_keyphrases"
-    instruction: str = "Extract keyphrases of type: Person, Organization, Location, Date/Time, Monetary Values, and Percentages."
+    instruction: str = (
+        "Extract keyphrases of type: Person, Organization, Location, Date/Time, Monetary Values, and Percentages."
+    )
     input_model = StringIO
     output_model = ExtractedKeyphrases
     examples: t.List[t.Tuple[StringIO, ExtractedKeyphrases]] = [
@@ -62,7 +64,9 @@ class GenerateQuestionsPrompt(
     PydanticPrompt[GenerateQuestionsPromptInput, QuestionsGenerated]
 ):
     name: str = "generate_questions"
-    instruction: str = "Based on the given text and keyphrases, generate closed-ended questions that can be answered with '1' if the question can be answered using the text, or '0' if it cannot. The questions should ALWAYS result in a '1' based on the given text."
+    instruction: str = (
+        "Based on the given text and keyphrases, generate closed-ended questions that can be answered with '1' if the question can be answered using the text, or '0' if it cannot. The questions should ALWAYS result in a '1' based on the given text."
+    )
     input_model = GenerateQuestionsPromptInput
     output_model = QuestionsGenerated
     examples: t.List[t.Tuple[GenerateQuestionsPromptInput, QuestionsGenerated]] = [
@@ -99,7 +103,9 @@ class SummaryAndQuestions(BaseModel):
 
 class GenerateAnswersPrompt(PydanticPrompt[SummaryAndQuestions, AnswersGenerated]):
     name: str = "generate_answers"
-    instruction: str = "Based on the list of close-ended '1' or '0' questions, generate a JSON with key 'answers', which is a list of strings that determines whether the provided summary contains sufficient information to answer EACH question. Answers should STRICTLY be either '1' or '0'. Answer '0' if the provided summary does not contain enough information to answer the question and answer '1' if the provided summary can answer the question."
+    instruction: str = (
+        "Based on the list of close-ended '1' or '0' questions, generate a JSON with key 'answers', which is a list of strings that determines whether the provided summary contains sufficient information to answer EACH question. Answers should STRICTLY be either '1' or '0'. Answer '0' if the provided summary does not contain enough information to answer the question and answer '1' if the provided summary can answer the question."
+    )
     input_model = SummaryAndQuestions
     output_model = AnswersGenerated
     examples: t.List[t.Tuple[SummaryAndQuestions, AnswersGenerated]] = [
diff --git a/src/ragas/metrics/_topic_adherence.py b/src/ragas/metrics/_topic_adherence.py
index 140962fa9..ae55ddffb 100644
--- a/src/ragas/metrics/_topic_adherence.py
+++ b/src/ragas/metrics/_topic_adherence.py
@@ -97,7 +97,9 @@ class TopicRefusedPrompt(PydanticPrompt[TopicRefusedInput, TopicRefusedOutput]):
     ]
 
 
-class TopicExtractionPrompt(PydanticPrompt[TopicExtractionInput, TopicExtractionOutput]):
+class TopicExtractionPrompt(
+    PydanticPrompt[TopicExtractionInput, TopicExtractionOutput]
+):
     instruction: str = (
         "Given an interaction between Human, Tool and AI, extract the topics from Human's input."
     )
diff --git a/src/ragas/utils.py b/src/ragas/utils.py
index a9bc34390..7f1a42037 100644
--- a/src/ragas/utils.py
+++ b/src/ragas/utils.py
@@ -19,6 +19,8 @@
 RAGAS_SUPPORTED_LANGUAGE_CODES = {
     v.__name__.lower(): k for k, v in LANGUAGE_CODES.items()
 }
+# endpoint for uploading results
+RAGAS_API_URL = "https://api.ragas.io"
 
 
 @lru_cache(maxsize=1)
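Usage note (not part of the patch): a minimal sketch of how the EvaluationResult additions above might be exercised once the patch is applied. The eval_dataset and hallucinations_metric objects are assumed to be the ones built in the notebook cells in this diff; the call shapes come straight from the changed code, everything else is illustrative.

from ragas import evaluate

# assumes `eval_dataset` and `hallucinations_metric` from the notebook above
results = evaluate(eval_dataset, metrics=[hallucinations_metric])

# __repr__ now renders a compact {'metric_name': score} summary
print(results)

# __getitem__ exposes the per-row scores of a single metric as a list of floats
hallucination_scores = results["hallucinations_metric"]

# upload() JSON-encodes results.ragas_traces with ChainRunEncoder, POSTs the
# packet to f"{RAGAS_API_URL}/alignment/evaluation" and returns the
# app.ragas.io URL for the run (a non-200 response raises an Exception)
evaluation_url = results.upload(verbose=True)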
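A second sketch, assuming only the patched ragas.callbacks module: ChainRun now stores run_id, parent_run_id and children as plain strings (RagasTracer applies str(run_id) at insertion time), so a trace entry round-trips through ChainRunEncoder without any UUID handling. The field values below are made up for illustration.

import json

from ragas.callbacks import ChainRun, ChainRunEncoder

# IDs and dict keys are plain strings after this patch
run = ChainRun(
    run_id="11111111-2222-3333-4444-555555555555",
    parent_run_id=None,
    name="ragas evaluation",
    inputs={"rows": 1},
    metadata={},
)
traces = {run.run_id: run}

# mirrors RagasTracer.to_jsons(): dump every ChainRun with the custom encoder
print(json.dumps([t.model_dump() for t in traces.values()], cls=ChainRunEncoder))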