5 changes: 3 additions & 2 deletions README.md
@@ -78,8 +78,9 @@ os.environ["OPENAI_API_KEY"] = "your-openai-key"
dataset: Dataset

results = evaluate(dataset)
# {'ragas_score': 0.860, 'context_precision': 0.817,
# 'faithfulness': 0.892, 'answer_relevancy': 0.874}
# {'context_precision': 0.817,
# 'faithfulness': 0.892,
# 'answer_relevancy': 0.874}
```
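For context, a rough sketch of how the snippet above might be run end to end after this change; the `Dataset` columns, the metric imports, and the explicit `metrics=` list are assumptions drawn from the ragas docs of this era, not part of this diff.

```python
# Illustrative sketch only: build an evaluation dataset and score it with
# individual metrics, now that no aggregate ragas_score is returned.
import os

from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_relevancy, context_precision, faithfulness

os.environ["OPENAI_API_KEY"] = "your-openai-key"

dataset = Dataset.from_dict({
    "question": ["When was the first Super Bowl?"],
    "answer": ["The first Super Bowl was held on January 15, 1967."],
    "contexts": [[
        "The First AFL-NFL World Championship Game was played on January 15, 1967."
    ]],
    "ground_truths": [["The first Super Bowl was held on January 15, 1967."]],
})

results = evaluate(dataset, metrics=[context_precision, faithfulness, answer_relevancy])
print(results)  # per-metric scores only, e.g. context_precision, faithfulness, answer_relevancy
```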

Refer to our [documentation](https://docs.ragas.io/) to learn more.
2 changes: 1 addition & 1 deletion docs/concepts/metrics/critique.md
@@ -5,7 +5,7 @@ This is designed to assess submissions based on predefined aspects such as `harm

Critiques within the LLM evaluators evaluate submissions based on the provided aspect. Ragas Critiques offers a range of predefined aspects like correctness, harmfulness, etc. (Please refer to `SUPPORTED_ASPECTS` for a complete list). If you prefer, you can also create custom aspects to evaluate submissions according to your unique requirements.

The `strictness` parameter plays a crucial role in maintaining a certain level of self-consistency in predictions, with an ideal range typically falling between 2 to 4. It's important to note that the scores obtained from aspect critiques are binary and do not contribute to the final Ragas score due to their non-continuous nature.
The `strictness` parameter plays a crucial role in maintaining a certain level of self-consistency in predictions, with an ideal range typically falling between 2 and 4.
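As an illustration of the aspect-critique flow described above (not part of this diff), here is a sketch; the import path, the `AspectCritique` constructor arguments, and the custom "conciseness" aspect are assumptions based on the ragas docs rather than anything introduced by this change.

```python
# Illustrative sketch: one predefined aspect plus a custom aspect, with
# strictness in the suggested 2-4 range for self-consistent, binary verdicts.
from datasets import Dataset
from ragas import evaluate
from ragas.metrics.critique import AspectCritique, harmfulness

# Hypothetical custom aspect, shown only to demonstrate the constructor.
conciseness = AspectCritique(
    name="conciseness",
    definition="Is the submission concise and to the point?",
    strictness=3,  # number of self-consistency votes; 2-4 is the typical range
)

dataset = Dataset.from_dict({
    "question": ["When was the first Super Bowl?"],
    "answer": ["The first Super Bowl was held on January 15, 1967."],
    "contexts": [[
        "The First AFL-NFL World Championship Game was played on January 15, 1967."
    ]],
})

result = evaluate(dataset, metrics=[harmfulness, conciseness])
print(result)  # each aspect is scored 0 or 1 per sample
```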


```{hint}
2 changes: 1 addition & 1 deletion docs/getstarted/evaluation.md
@@ -96,7 +96,7 @@ result = evaluate(

result
```
and there you have it, all the scores you need. `ragas_score` gives you a single metric that you can use while 4 metrics individually would measure the different parts of your pipeline.
and there you have it, all the scores you need.

Now if we want to dig into the results and figure out examples where your pipeline performed poorly or really well, you can easily convert it into a pandas DataFrame and use your standard analytics tools too!
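A short sketch of that analysis step (not part of this diff); it assumes the `Result` object returned by `evaluate` exposes the `to_pandas()` helper shown elsewhere in the ragas docs, and that per-sample columns are named after the metrics.

```python
# Illustrative sketch: turn the Result into a DataFrame and look at the weakest rows.
df = result.to_pandas()
print(df.head())

# e.g. inspect the samples with the lowest faithfulness scores
print(df.sort_values("faithfulness").head())
```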

4 changes: 2 additions & 2 deletions docs/howtos/applications/compare_embeddings.md
@@ -109,7 +109,7 @@ result = evaluate(query_engine1, metrics, test_questions, test_answers)

```{code-block}
:caption: output
{'ragas_score': 0.3570, 'context_precision': 0.2378, 'context_recall': 0.7159}
{'context_precision': 0.2378, 'context_recall': 0.7159}
```

## Evaluate Bge embeddings
@@ -124,7 +124,7 @@ result = evaluate(query_engine2, metrics, test_questions, test_answers)

```{code-block}
:caption: output
{'ragas_score': 0.3883, 'context_precision': 0.2655, 'context_recall': 0.7227}
{'context_precision': 0.2655, 'context_recall': 0.7227}

```
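With no aggregate `ragas_score` to lean on, the two runs above are compared per metric. A sketch of one way to do that follows (not part of this diff); the names `result_openai` and `result_bge`, and dict-style access on the `Result`, are assumptions, since the document reuses the name `result` for both runs.

```python
# Illustrative sketch: per-metric comparison of the two embedding runs.
for metric in ("context_precision", "context_recall"):
    print(f"{metric}: openai={result_openai[metric]:.4f}  bge={result_bge[metric]:.4f}")
```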

4 changes: 2 additions & 2 deletions docs/howtos/applications/compare_llms.md
@@ -145,7 +145,7 @@ result_zephyr

```{code-block}
:caption: output
{'ragas_score': 0.7809, 'faithfulness': 0.8365, 'answer_relevancy': 0.8831, 'answer_correctness': 0.6605}
{'faithfulness': 0.8365, 'answer_relevancy': 0.8831, 'answer_correctness': 0.6605}
```

## Evaluate Falcon-7B-Instruct LLM
@@ -168,7 +168,7 @@

```{code-block}
:caption: output
{'ragas_score': 0.6956, 'faithfulness': 0.6909, 'answer_relevancy': 0.8651, 'answer_correctness': 0.5850}
{'faithfulness': 0.6909, 'answer_relevancy': 0.8651, 'answer_correctness': 0.5850}
```
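Since the aggregate `ragas_score` is gone, the two LLM runs can be put side by side per metric. A sketch under stated assumptions (not part of this diff): the name `result_falcon` and dict-style access on the `Result` are assumptions, while `result_zephyr` appears in the document above.

```python
# Illustrative sketch: tabulate both LLM runs per metric instead of a single score.
import pandas as pd

metrics = ["faithfulness", "answer_relevancy", "answer_correctness"]
scores = pd.DataFrame({
    "zephyr": {m: result_zephyr[m] for m in metrics},
    "falcon-7b-instruct": {m: result_falcon[m] for m in metrics},
})
print(scores)
```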

## Compare Scores
2 changes: 1 addition & 1 deletion docs/howtos/customisations/aws-bedrock.ipynb
@@ -263,7 +263,7 @@
"id": "a2dc0ec2",
"metadata": {},
"source": [
"and there you have the it, all the scores you need. `ragas_score` gives you a single metric that you can use while the other onces measure the different parts of your pipeline.\n",
"and there you have the it, all the scores you need.\n",
"\n",
"now if we want to dig into the results and figure out examples where your pipeline performed worse or really good you can easily convert it into a pandas array and use your standard analytics tools too!"
]
2 changes: 1 addition & 1 deletion docs/howtos/customisations/azure-openai.ipynb
@@ -258,7 +258,7 @@
"id": "a2dc0ec2",
"metadata": {},
"source": [
"and there you have the it, all the scores you need. `ragas_score` gives you a single metric that you can use while the other onces measure the different parts of your pipeline.\n",
"and there you have the it, all the scores you need.\n",
"\n",
"now if we want to dig into the results and figure out examples where your pipeline performed worse or really good you can easily convert it into a pandas array and use your standard analytics tools too!"
]
2 changes: 1 addition & 1 deletion docs/howtos/customisations/gcp-vertexai.ipynb
@@ -294,7 +294,7 @@
"id": "960f88fc-c90b-4ac6-8e97-252edd2f1661",
"metadata": {},
"source": [
"and there you have the it, all the scores you need. `ragas_score` gives you a single metric that you can use while the other onces measure the different parts of your pipeline.\n",
"and there you have the it, all the scores you need.\n",
"\n",
"now if we want to dig into the results and figure out examples where your pipeline performed worse or really good you can easily convert it into a pandas array and use your standard analytics tools too!"
]
2 changes: 1 addition & 1 deletion docs/howtos/integrations/langfuse.ipynb
@@ -559,7 +559,7 @@
{
"data": {
"text/plain": [
"{'ragas_score': 0.9309, 'faithfulness': 0.8889, 'answer_relevancy': 0.9771}"
"{'faithfulness': 0.8889, 'answer_relevancy': 0.9771}"
]
},
"execution_count": 15,
2 changes: 1 addition & 1 deletion docs/howtos/integrations/langsmith.ipynb
@@ -102,7 +102,7 @@
{
"data": {
"text/plain": [
"{'ragas_score': 0.7744, 'context_precision': 0.5976, 'faithfulness': 0.8889, 'answer_relevancy': 0.9300}"
"{'context_precision': 0.5976, 'faithfulness': 0.8889, 'answer_relevancy': 0.9300}"
]
},
"execution_count": 1,
2 changes: 1 addition & 1 deletion docs/howtos/integrations/llamaindex.ipynb
@@ -282,7 +282,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'ragas_score': 0.5142, 'faithfulness': 0.7000, 'answer_relevancy': 0.9550, 'context_precision': 0.2335, 'context_recall': 0.9800, 'harmfulness': 0.0000}\n"
"{faithfulness': 0.7000, 'answer_relevancy': 0.9550, 'context_precision': 0.2335, 'context_recall': 0.9800, 'harmfulness': 0.0000}\n"
]
}
],
8 changes: 4 additions & 4 deletions src/ragas/evaluation.py
@@ -42,8 +42,7 @@ def evaluate(
-------
Result
Result object containing the scores of each metric. You can use this to do analysis
later. If the top 3 metrics are provided then it also returns the `ragas_score`
for the entire pipeline.
later.

Raises
------
@@ -64,8 +63,9 @@
})

>>> result = evaluate(dataset)
>>> print(result["ragas_score"])
{'ragas_score': 0.860, 'context_precision': 0.817, 'faithfulness': 0.892,
>>> print(result)
{'context_precision': 0.817,
'faithfulness': 0.892,
'answer_relevancy': 0.874}
```
"""
3 changes: 1 addition & 2 deletions src/ragas/llama_index/evaluation.py
@@ -36,8 +36,7 @@ def evaluate(
-------
Result
Result object containing the scores of each metric. You can use this to do analysis
later. If the top 3 metrics are provided then it also returns the `ragas_score`
for the entire pipeline.
later.

Raises
------