explodinggradients · jjmachan · Jan 27, 2024 · Jan 27, 2024 · Jan 27, 2024 · Jan 27, 2024
diff --git a/src/ragas/testset/docstore.py b/src/ragas/testset/docstore.py
@@ -7,7 +7,6 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from enum import Enum
-from random import choices
 
 import numpy as np
 import numpy.typing as npt
@@ -17,13 +16,13 @@
 
 from ragas.embeddings.base import BaseRagasEmbeddings
 from ragas.executor import Executor
+from ragas.testset.utils import rng
 
 if t.TYPE_CHECKING:
     from llama_index.readers.schema import Document as LlamaindexDocument
 
 Embedding = t.Union[t.List[float], npt.NDArray[np.float64]]
 logger = logging.getLogger(__name__)
-rng = np.random.default_rng()
 
 
 class Document(LCDocument):
@@ -243,7 +242,15 @@ def get_document(self, doc_id: str) -> Node:
         raise NotImplementedError
 
     def get_random_nodes(self, k=1) -> t.List[Node]:
-        return choices(self.nodes, k=k)
+        """
+        Return ``k`` nodes sampled at random from ``self.nodes``.
+
+        Sampling is done with replacement using the shared ``rng``: positions
+        are drawn first and then looked up, so the stored objects are never
+        converted into a NumPy array.
+        """
+        positions = rng.choice(len(self.nodes), size=k)
+        return [self.nodes[i] for i in positions]
 
     def get_similar(
         self, node: Node, threshold: float = 0.7, top_k: int = 3

diff --git a/src/ragas/testset/evolutions.py b/src/ragas/testset/evolutions.py
@@ -10,12 +10,14 @@
 from numpy.random import default_rng
 
 from ragas.llms import BaseRagasLLM
+from ragas.llms.json_load import json_loader
 from ragas.llms.prompt import Prompt
 from ragas.testset.docstore import Direction, DocumentStore, Node
 from ragas.testset.filters import EvolutionFilter, NodeFilter, QuestionFilter
 from ragas.testset.prompts import (
     compress_question_prompt,
     conditional_question_prompt,
+    find_relevent_context_prompt,
     multi_context_question_prompt,
     question_answer_prompt,
     reasoning_question_prompt,
@@ -139,7 +141,37 @@ def generate_datarow(
     ):
         assert self.generator_llm is not None, "generator_llm cannot be None"
 
-        merged_nodes = self.merge_nodes(current_nodes)
+        # Label contexts from 1 so the labels line up with the numbering used
+        # in the few-shot examples of find_relevent_context_prompt.
+        node_content = [
+            f"{i + 1}\t{n.page_content}" for i, n in enumerate(current_nodes.nodes)
+        ]
+        results = self.generator_llm.generate_text(
+            prompt=find_relevent_context_prompt.format(
+                question=question, contexts=node_content
+            )
+        )
+        relevant_context_indices = json_loader.safe_load(
+            results.generations[0][0].text.strip(), llm=self.generator_llm
+        ).get("relevant_contexts", None)
+        if not isinstance(relevant_context_indices, list):
+            relevant_context_indices = []
+        # Keep only the nodes the LLM selected; labels that are not valid
+        # 1-based positions are ignored.
+        selected_nodes = [
+            current_nodes.nodes[i - 1]
+            for i in relevant_context_indices
+            if isinstance(i, int) and 1 <= i <= len(current_nodes.nodes)
+        ]
+        if selected_nodes:
+            relevant_context = CurrentNodes(
+                root_node=current_nodes.root_node, nodes=selected_nodes
+            )
+        else:
+            # No usable selection came back: fall back to every candidate node.
+            relevant_context = current_nodes
+
+        merged_nodes = self.merge_nodes(relevant_context)
         results = self.generator_llm.generate_text(
             prompt=question_answer_prompt.format(
                 question=question, context=merged_nodes.page_content

diff --git a/src/ragas/testset/prompts.py b/src/ragas/testset/prompts.py
@@ -297,3 +297,38 @@
     output_type="string",
     language="english",
 )
+
+
+# Few-shot prompt asking the LLM which of the numbered contexts best answer a
+# question. Contexts in the examples are numbered from 1, and the expected JSON
+# output lists the chosen numbers under the "relevant_contexts" key.
+find_relevent_context_prompt = Prompt(
+    name="find_relevent_context",
+    instruction="Given a question and set of contexts, find the most relevant contexts to answer the question.",
+    examples=[
+        {
+            "question": "What is the capital of France?",
+            "contexts": [
+                "1. France is a country in Western Europe. It has several cities, including Paris, Lyon, and Marseille. Paris is not only known for its cultural landmarks like the Eiffel Tower and the Louvre Museum but also as the administrative center.",
+                "2. The capital of France is Paris. It is also the most populous city in France, with a population of over 2 million people. Paris is known for its cultural landmarks like the Eiffel Tower and the Louvre Museum.",
+                "3. Paris is the capital of France. It is also the most populous city in France, with a population of over 2 million people. Paris is known for its cultural landmarks like the Eiffel Tower and the Louvre Museum.",
+            ],
+            "output": {
+                "relevant_contexts": [2, 3],
+            },
+        },
+        {
+            "question": "How does caffeine affect the body and what are its common sources?",
+            "contexts": [
+                "1. Caffeine is a central nervous system stimulant. It can temporarily ward off drowsiness and restore alertness. It primarily affects the brain, where it alters the function of neurotransmitters.",
+                "2. Regular physical activity is essential for maintaining good health. It can help control weight, combat health conditions, boost energy, and promote better sleep.",
+                "3. Common sources of caffeine include coffee, tea, cola, and energy drinks. These beverages are consumed worldwide and are known for providing a quick boost of energy.",
+            ],
+            "output": {"relevant_contexts": [1, 3]},
+        },
+    ],
+    input_keys=["question", "contexts"],
+    output_key="output",
+    output_type="json",
+    language="english",
+)
diff --git a/src/ragas/testset/utils.py b/src/ragas/testset/utils.py
@@ -3,6 +3,10 @@
 import re
 import warnings
 
+import numpy as np
+
+rng = np.random.default_rng(seed=42)
+
 
 def load_as_score(text):
     """