diff --git a/README.md b/README.md
index 3c6af2a77..081b47753 100644
--- a/README.md
+++ b/README.md
@@ -78,8 +78,9 @@ os.environ["OPENAI_API_KEY"] = "your-openai-key"
 dataset: Dataset
 
 results = evaluate(dataset)
-# {'ragas_score': 0.860, 'context_precision': 0.817,
-# 'faithfulness': 0.892, 'answer_relevancy': 0.874}
+# {'context_precision': 0.817,
+# 'faithfulness': 0.892,
+# 'answer_relevancy': 0.874}
 ```
 
 Refer to our [documentation](https://docs.ragas.io/) to learn more.
diff --git a/docs/concepts/metrics/critique.md b/docs/concepts/metrics/critique.md
index 439d93e8a..00b807300 100644
--- a/docs/concepts/metrics/critique.md
+++ b/docs/concepts/metrics/critique.md
@@ -5,7 +5,7 @@ This is designed to assess submissions based on predefined aspects such as `harm
 
 Critiques within the LLM evaluators evaluate submissions based on the provided aspect. Ragas Critiques offers a range of predefined aspects like correctness, harmfulness, etc. (Please refer to `SUPPORTED_ASPECTS` for a complete list). If you prefer, you can also create custom aspects to evaluate submissions according to your unique requirements.
 
-The `strictness` parameter plays a crucial role in maintaining a certain level of self-consistency in predictions, with an ideal range typically falling between 2 to 4. It's important to note that the scores obtained from aspect critiques are binary and do not contribute to the final Ragas score due to their non-continuous nature.
+The `strictness` parameter plays a crucial role in maintaining a certain level of self-consistency in predictions, with an ideal range typically falling between 2 and 4.
 
 ```{hint}
 
diff --git a/docs/getstarted/evaluation.md b/docs/getstarted/evaluation.md
index f21c4b64b..0091d1832 100644
--- a/docs/getstarted/evaluation.md
+++ b/docs/getstarted/evaluation.md
@@ -96,7 +96,7 @@ result = evaluate(
 result
 ```
 
-and there you have it, all the scores you need. `ragas_score` gives you a single metric that you can use while 4 metrics individually would measure the different parts of your pipeline.
+and there you have it, all the scores you need.
 
 Now if we want to dig into the results and figure out examples where your pipeline performed worse or really good you can easily convert it into a pandas array and use your standard analytics tools too!
 
diff --git a/docs/howtos/applications/compare_embeddings.md b/docs/howtos/applications/compare_embeddings.md
index 145857539..01b7a8d13 100644
--- a/docs/howtos/applications/compare_embeddings.md
+++ b/docs/howtos/applications/compare_embeddings.md
@@ -109,7 +109,7 @@ result = evaluate(query_engine1, metrics, test_questions, test_answers)
 ```{code-block}
 :caption: output
 
-{'ragas_score': 0.3570, 'context_precision': 0.2378, 'context_recall': 0.7159}
+{'context_precision': 0.2378, 'context_recall': 0.7159}
 ```
 
 ## Evaluate Bge embeddings
@@ -124,7 +124,7 @@ result = evaluate(query_engine2, metrics, test_questions, test_answers)
 ```{code-block}
 :caption: output
 
-{'ragas_score': 0.3883, 'context_precision': 0.2655, 'context_recall': 0.7227}
+{'context_precision': 0.2655, 'context_recall': 0.7227}
 ```
 
diff --git a/docs/howtos/applications/compare_llms.md b/docs/howtos/applications/compare_llms.md
index 4ce5caced..95897ee2f 100644
--- a/docs/howtos/applications/compare_llms.md
+++ b/docs/howtos/applications/compare_llms.md
@@ -145,7 +145,7 @@ result_zephyr
 ```{code-block}
 :caption: output
 
-{'ragas_score': 0.7809, 'faithfulness': 0.8365, 'answer_relevancy': 0.8831, 'answer_correctness': 0.6605}
+{'faithfulness': 0.8365, 'answer_relevancy': 0.8831, 'answer_correctness': 0.6605}
 ```
 
 ## Evaluate Falcon-7B-Instruct LLM
@@ -168,7 +168,7 @@ result
 ```{code-block}
 :caption: output
 
-{'ragas_score': 0.6956, 'faithfulness': 0.6909, 'answer_relevancy': 0.8651, 'answer_correctness': 0.5850}
+{'faithfulness': 0.6909, 'answer_relevancy': 0.8651, 'answer_correctness': 0.5850}
 ```
 
 ## Compare Scores
diff --git a/docs/howtos/customisations/aws-bedrock.ipynb b/docs/howtos/customisations/aws-bedrock.ipynb
index 9aef4035b..82ccfa921 100644
--- a/docs/howtos/customisations/aws-bedrock.ipynb
+++ b/docs/howtos/customisations/aws-bedrock.ipynb
@@ -263,7 +263,7 @@
    "id": "a2dc0ec2",
    "metadata": {},
    "source": [
-    "and there you have the it, all the scores you need. `ragas_score` gives you a single metric that you can use while the other onces measure the different parts of your pipeline.\n",
+    "and there you have it, all the scores you need.\n",
     "\n",
     "now if we want to dig into the results and figure out examples where your pipeline performed worse or really good you can easily convert it into a pandas array and use your standard analytics tools too!"
    ]
diff --git a/docs/howtos/customisations/azure-openai.ipynb b/docs/howtos/customisations/azure-openai.ipynb
index 93851916c..23fe19a24 100644
--- a/docs/howtos/customisations/azure-openai.ipynb
+++ b/docs/howtos/customisations/azure-openai.ipynb
@@ -258,7 +258,7 @@
    "id": "a2dc0ec2",
    "metadata": {},
    "source": [
-    "and there you have the it, all the scores you need. `ragas_score` gives you a single metric that you can use while the other onces measure the different parts of your pipeline.\n",
+    "and there you have it, all the scores you need.\n",
     "\n",
     "now if we want to dig into the results and figure out examples where your pipeline performed worse or really good you can easily convert it into a pandas array and use your standard analytics tools too!"
    ]
diff --git a/docs/howtos/customisations/gcp-vertexai.ipynb b/docs/howtos/customisations/gcp-vertexai.ipynb
index 7ddf68f15..b3b5e24c2 100644
--- a/docs/howtos/customisations/gcp-vertexai.ipynb
+++ b/docs/howtos/customisations/gcp-vertexai.ipynb
@@ -294,7 +294,7 @@
    "id": "960f88fc-c90b-4ac6-8e97-252edd2f1661",
    "metadata": {},
    "source": [
-    "and there you have the it, all the scores you need. `ragas_score` gives you a single metric that you can use while the other onces measure the different parts of your pipeline.\n",
+    "and there you have it, all the scores you need.\n",
     "\n",
     "now if we want to dig into the results and figure out examples where your pipeline performed worse or really good you can easily convert it into a pandas array and use your standard analytics tools too!"
    ]
diff --git a/docs/howtos/integrations/langfuse.ipynb b/docs/howtos/integrations/langfuse.ipynb
index 2c217a51c..9fa24c747 100644
--- a/docs/howtos/integrations/langfuse.ipynb
+++ b/docs/howtos/integrations/langfuse.ipynb
@@ -559,7 +559,7 @@
    {
     "data": {
      "text/plain": [
-      "{'ragas_score': 0.9309, 'faithfulness': 0.8889, 'answer_relevancy': 0.9771}"
+      "{'faithfulness': 0.8889, 'answer_relevancy': 0.9771}"
      ]
     },
     "execution_count": 15,
diff --git a/docs/howtos/integrations/langsmith.ipynb b/docs/howtos/integrations/langsmith.ipynb
index 30821c918..198996e2e 100644
--- a/docs/howtos/integrations/langsmith.ipynb
+++ b/docs/howtos/integrations/langsmith.ipynb
@@ -102,7 +102,7 @@
    {
     "data": {
      "text/plain": [
-      "{'ragas_score': 0.7744, 'context_precision': 0.5976, 'faithfulness': 0.8889, 'answer_relevancy': 0.9300}"
+      "{'context_precision': 0.5976, 'faithfulness': 0.8889, 'answer_relevancy': 0.9300}"
      ]
     },
     "execution_count": 1,
diff --git a/docs/howtos/integrations/llamaindex.ipynb b/docs/howtos/integrations/llamaindex.ipynb
index 21773c71a..215d07aaf 100644
--- a/docs/howtos/integrations/llamaindex.ipynb
+++ b/docs/howtos/integrations/llamaindex.ipynb
@@ -282,7 +282,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "{'ragas_score': 0.5142, 'faithfulness': 0.7000, 'answer_relevancy': 0.9550, 'context_precision': 0.2335, 'context_recall': 0.9800, 'harmfulness': 0.0000}\n"
+     "{'faithfulness': 0.7000, 'answer_relevancy': 0.9550, 'context_precision': 0.2335, 'context_recall': 0.9800, 'harmfulness': 0.0000}\n"
     ]
    }
   ],
diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py
index 8ec1741fb..c19ca4630 100644
--- a/src/ragas/evaluation.py
+++ b/src/ragas/evaluation.py
@@ -42,8 +42,7 @@ def evaluate(
     -------
     Result
         Result object containing the scores of each metric. You can use this do analysis
-        later. If the top 3 metrics are provided then it also returns the `ragas_score`
-        for the entire pipeline.
+        later.
 
     Raises
     ------
@@ -64,8 +63,9 @@ def evaluate(
     })
 
     >>> result = evaluate(dataset)
-    >>> print(result["ragas_score"])
-    {'ragas_score': 0.860, 'context_precision': 0.817, 'faithfulness': 0.892,
+    >>> print(result)
+    {'context_precision': 0.817,
+    'faithfulness': 0.892,
     'answer_relevancy': 0.874}
     ```
     """
diff --git a/src/ragas/llama_index/evaluation.py b/src/ragas/llama_index/evaluation.py
index dab200688..b30738d29 100644
--- a/src/ragas/llama_index/evaluation.py
+++ b/src/ragas/llama_index/evaluation.py
@@ -36,8 +36,7 @@ def evaluate(
     -------
     Result
         Result object containing the scores of each metric. You can use this do analysis
-        later. If the top 3 metrics are provided then it also returns the `ragas_score`
-        for the entire pipeline.
+        later.
 
     Raises
     ------
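For readers following the updated documentation, here is a minimal sketch of the per-metric workflow it describes after this change. It is not an excerpt from the patch: the column names, the choice of metrics, and the `to_pandas()` call are assumptions based on the Ragas docs touched above, and an `OPENAI_API_KEY` must be set at runtime as in the README snippet.

```python
from datasets import Dataset

from ragas import evaluate
from ragas.metrics import answer_relevancy, faithfulness

# Toy rows; in practice these come from your RAG pipeline's outputs.
dataset = Dataset.from_dict(
    {
        "question": ["Where is the Eiffel Tower located?"],
        "answer": ["The Eiffel Tower is located in Paris, France."],
        "contexts": [["The Eiffel Tower is a wrought-iron tower in Paris, France."]],
    }
)

# Each metric is reported on its own; there is no aggregate `ragas_score`.
result = evaluate(dataset, metrics=[faithfulness, answer_relevancy])
print(result)  # e.g. {'faithfulness': ..., 'answer_relevancy': ...}

# Per-question scores for digging into good and bad examples, as the docs suggest.
df = result.to_pandas()
print(df.head())
```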