From 51d216dd03dce889862f0c7911307bca1c620deb Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Fri, 15 Nov 2024 09:31:29 +0530
Subject: [PATCH 1/2] handle OOC cases

---
 src/ragas/testset/transforms/base.py          |  20 +++
 .../transforms/extractors/llm_based.py        | 117 +++++++++++-------
 2 files changed, 95 insertions(+), 42 deletions(-)

diff --git a/src/ragas/testset/transforms/base.py b/src/ragas/testset/transforms/base.py
index 3c1892c81..49945e482 100644
--- a/src/ragas/testset/transforms/base.py
+++ b/src/ragas/testset/transforms/base.py
@@ -3,10 +3,15 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field

+import tiktoken
+from tiktoken.core import Encoding
+
 from ragas.llms import BaseRagasLLM, llm_factory
 from ragas.prompt import PromptMixin
 from ragas.testset.graph import KnowledgeGraph, Node, Relationship

+DEFAULT_TOKENIZER = tiktoken.get_encoding("o200k_base")
+
 logger = logging.getLogger(__name__)


@@ -188,6 +193,21 @@ async def apply_extract(node: Node):
 class LLMBasedExtractor(Extractor, PromptMixin):
     llm: BaseRagasLLM = field(default_factory=llm_factory)
     merge_if_possible: bool = True
+    max_token_limit: int = 32000
+    tokenizer: Encoding = DEFAULT_TOKENIZER
+
+    def split_text_by_token_limit(self, text, max_token_limit):
+        # Tokenize the entire input string
+        tokens = self.tokenizer.encode(text)
+
+        # Split tokens into chunks of max_token_limit or less
+        chunks = []
+        for i in range(0, len(tokens), max_token_limit):
+            chunk_tokens = tokens[i : i + max_token_limit]
+            chunks.append(self.tokenizer.decode(chunk_tokens))
+
+        return chunks


 class Splitter(BaseGraphTransformation):
diff --git a/src/ragas/testset/transforms/extractors/llm_based.py b/src/ragas/testset/transforms/extractors/llm_based.py
index dca9c66f4..fcf6ed2db 100644
--- a/src/ragas/testset/transforms/extractors/llm_based.py
+++ b/src/ragas/testset/transforms/extractors/llm_based.py
@@ -8,7 +8,11 @@
 from ragas.testset.transforms.base import LLMBasedExtractor


-# define prompts
+class TextWithExtractionLimit(BaseModel):
+    text: str
+    max_num: int = 10
+
+
 class SummaryExtractorPrompt(PydanticPrompt[StringIO, StringIO]):
     instruction: str = "Summarize the given text in less than 10 sentences."
     input_model: t.Type[StringIO] = StringIO
@@ -29,14 +33,15 @@ class Keyphrases(BaseModel):
     keyphrases: t.List[str]


-class KeyphrasesExtractorPrompt(PydanticPrompt[StringIO, Keyphrases]):
-    instruction: str = "Extract top 5 keyphrases from the given text."
-    input_model: t.Type[StringIO] = StringIO
+class KeyphrasesExtractorPrompt(PydanticPrompt[TextWithExtractionLimit, Keyphrases]):
+    instruction: str = "Extract top max_num keyphrases from the given text."
+    input_model: t.Type[TextWithExtractionLimit] = TextWithExtractionLimit
     output_model: t.Type[Keyphrases] = Keyphrases
-    examples: t.List[t.Tuple[StringIO, Keyphrases]] = [
+    examples: t.List[t.Tuple[TextWithExtractionLimit, Keyphrases]] = [
         (
-            StringIO(
-                text="Artificial intelligence\n\nArtificial intelligence is transforming various industries by automating tasks that previously required human intelligence. From healthcare to finance, AI is being used to analyze vast amounts of data quickly and accurately. This technology is also driving innovations in areas like self-driving cars and personalized recommendations."
+            TextWithExtractionLimit(
+                text="Artificial intelligence\n\nArtificial intelligence is transforming various industries by automating tasks that previously required human intelligence. From healthcare to finance, AI is being used to analyze vast amounts of data quickly and accurately. This technology is also driving innovations in areas like self-driving cars and personalized recommendations.",
+                max_num=5,
             ),
             Keyphrases(
                 keyphrases=[
@@ -69,14 +74,17 @@ class Headlines(BaseModel):
     headlines: t.List[str]


-class HeadlinesExtractorPrompt(PydanticPrompt[StringIO, Headlines]):
-    instruction: str = "Extract only level 2 and level 3 headings from the given text."
+class HeadlinesExtractorPrompt(PydanticPrompt[TextWithExtractionLimit, Headlines]):
+    instruction: str = (
+        "Extract the most important max_num headlines from the given text that can be used to split the text into independent sections. "
+        "Focus on Level 2 and Level 3 headings."
+    )

-    input_model: t.Type[StringIO] = StringIO
+    input_model: t.Type[TextWithExtractionLimit] = TextWithExtractionLimit
     output_model: t.Type[Headlines] = Headlines
-    examples: t.List[t.Tuple[StringIO, Headlines]] = [
+    examples: t.List[t.Tuple[TextWithExtractionLimit, Headlines]] = [
         (
-            StringIO(
+            TextWithExtractionLimit(
                 text="""\
                 Introduction
                 Overview of the topic...

                 Main Concepts
                 Explanation of core ideas...

                 Detailed Analysis
                 Techniques and methods for analysis...

                 Subsection: Specialized Techniques
                 Further details on specialized techniques...

                 Future Directions
                 Insights into upcoming trends...

                 Subsection: Next Steps in Research
                 Discussion of new areas of study...

                 Conclusion
                 Final remarks and summary.
-                """
+                """,
+                max_num=6,
             ),
             Headlines(
                 headlines=[
+                    "Introduction",
                     "Main Concepts",
                     "Detailed Analysis",
                     "Subsection: Specialized Techniques",
                     "Future Directions",
-                    "Subsection: Next Steps in Research",
-                ]
+                    "Conclusion",
+                ],
             ),
         ),
     ]
@@ -117,11 +127,6 @@ class NEROutput(BaseModel):
     entities: t.List[str]


-class TextWithExtractionLimit(BaseModel):
-    text: str
-    max_num: int = 10
-
-
 class NERPrompt(PydanticPrompt[TextWithExtractionLimit, NEROutput]):
     instruction: str = (
         "Extract the named entities from the given text, limiting the output to the top entities. "
@@ -171,14 +176,15 @@ async def extract(self, node: Node) -> t.Tuple[str, t.Any]:
         node_text = node.get_property("page_content")
         if node_text is None:
             return self.property_name, None
-        result = await self.prompt.generate(self.llm, data=StringIO(text=node_text))
+        chunks = self.split_text_by_token_limit(node_text, self.max_token_limit)
+        result = await self.prompt.generate(self.llm, data=StringIO(text=chunks[0]))
         return self.property_name, result.text


 @dataclass
 class KeyphrasesExtractor(LLMBasedExtractor):
     """
-    Extracts top 5 keyphrases from the given text.
+    Extracts top keyphrases from the given text.
 Attributes
 ----------
@@ -190,13 +196,20 @@ class KeyphrasesExtractor(LLMBasedExtractor):

     property_name: str = "keyphrases"
     prompt: KeyphrasesExtractorPrompt = KeyphrasesExtractorPrompt()
+    max_num: int = 5

     async def extract(self, node: Node) -> t.Tuple[str, t.Any]:
         node_text = node.get_property("page_content")
         if node_text is None:
             return self.property_name, None
-        result = await self.prompt.generate(self.llm, data=StringIO(text=node_text))
-        return self.property_name, result.keyphrases
+        chunks = self.split_text_by_token_limit(node_text, self.max_token_limit)
+        keyphrases = []
+        for chunk in chunks:
+            result = await self.prompt.generate(
+                self.llm, data=TextWithExtractionLimit(text=chunk, max_num=self.max_num)
+            )
+            keyphrases.extend(result.keyphrases)
+        return self.property_name, keyphrases


 @dataclass
@@ -219,7 +232,8 @@ async def extract(self, node: Node) -> t.Tuple[str, t.Any]:
         node_text = node.get_property("page_content")
         if node_text is None:
             return self.property_name, None
-        result = await self.prompt.generate(self.llm, data=StringIO(text=node_text))
+        chunks = self.split_text_by_token_limit(node_text, self.max_token_limit)
+        result = await self.prompt.generate(self.llm, data=StringIO(text=chunks[0]))
         return self.property_name, result.text


@@ -238,15 +252,22 @@ class HeadlinesExtractor(LLMBasedExtractor):

     property_name: str = "headlines"
     prompt: HeadlinesExtractorPrompt = HeadlinesExtractorPrompt()
+    max_num: int = 5

     async def extract(self, node: Node) -> t.Tuple[str, t.Any]:
         node_text = node.get_property("page_content")
         if node_text is None:
             return self.property_name, None
-        result = await self.prompt.generate(self.llm, data=StringIO(text=node_text))
-        if result is None:
-            return self.property_name, None
-        return self.property_name, result.headlines
+
+        chunks = self.split_text_by_token_limit(node_text, self.max_token_limit)
+        headlines = []
+        for chunk in chunks:
+            result = await self.prompt.generate(
+                self.llm, data=TextWithExtractionLimit(text=chunk, max_num=self.max_num)
+            )
+            if result:
+                headlines.extend(result.headlines)
+        return self.property_name, headlines


 @dataclass
@@ -270,11 +291,15 @@ async def extract(self, node: Node) -> t.Tuple[str, t.List[str]]:
         node_text = node.get_property("page_content")
         if node_text is None:
             return self.property_name, []
-        result = await self.prompt.generate(
-            self.llm,
-            data=TextWithExtractionLimit(text=node_text, max_num=self.max_num_entities),
-        )
-        return self.property_name, result.entities
+        chunks = self.split_text_by_token_limit(node_text, self.max_token_limit)
+        entities = []
+        for chunk in chunks:
+            result = await self.prompt.generate(
+                self.llm,
+                data=TextWithExtractionLimit(text=chunk, max_num=self.max_num_entities),
+            )
+            entities.extend(result.entities)
+        return self.property_name, entities


 class TopicDescription(BaseModel):
     description: str


 class TopicDescriptionPrompt(PydanticPrompt[StringIO, TopicDescription]):
-    instruction: str = "Provide a concise description of the main topic(s) discussed in the following text."
+    instruction: str = (
+        "Provide a concise description of the main topic(s) discussed in the following text."
+    )
     input_model: t.Type[StringIO] = StringIO
     output_model: t.Type[TopicDescription] = TopicDescription
     examples: t.List[t.Tuple[StringIO, TopicDescription]] = [
@@ -317,7 +344,8 @@ async def extract(self, node: Node) -> t.Tuple[str, t.Any]:
         node_text = node.get_property("page_content")
         if node_text is None:
             return self.property_name, None
-        result = await self.prompt.generate(self.llm, data=StringIO(text=node_text))
+        chunks = self.split_text_by_token_limit(node_text, self.max_token_limit)
+        result = await self.prompt.generate(self.llm, data=StringIO(text=chunks[0]))
         return self.property_name, result.description


@@ -372,8 +400,13 @@ async def extract(self, node: Node) -> t.Tuple[str, t.List[str]]:
         node_text = node.get_property("page_content")
         if node_text is None:
             return self.property_name, []
-        result = await self.prompt.generate(
-            self.llm,
-            data=TextWithExtractionLimit(text=node_text, max_num=self.max_num_themes),
-        )
-        return self.property_name, result.output
+        chunks = self.split_text_by_token_limit(node_text, self.max_token_limit)
+        themes = []
+        for chunk in chunks:
+            result = await self.prompt.generate(
+                self.llm,
+                data=TextWithExtractionLimit(text=chunk, max_num=self.max_num_themes),
+            )
+            themes.extend(result.output)
+
+        return self.property_name, themes

From 3a3d59542cffe516260910abbaaf5b554180c6af Mon Sep 17 00:00:00 2001
From: Shahules786
Date: Fri, 15 Nov 2024 09:38:00 +0530
Subject: [PATCH 2/2] add missing import

---
 docs/extra/components/choose_generator_llm.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/extra/components/choose_generator_llm.md b/docs/extra/components/choose_generator_llm.md
index e971dd8bf..504739444 100644
--- a/docs/extra/components/choose_generator_llm.md
+++ b/docs/extra/components/choose_generator_llm.md
@@ -16,6 +16,7 @@
     ```python
     from ragas.llms import LangchainLLMWrapper
+    from ragas.embeddings import LangchainEmbeddingsWrapper
     from langchain_openai import ChatOpenAI
     from langchain_openai import OpenAIEmbeddings
     generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
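Reviewer note (not part of the patch): the chunking path is easy to sanity-check in isolation. Below is a minimal standalone sketch of the `split_text_by_token_limit` logic added to `LLMBasedExtractor`; it assumes only that `tiktoken` is installed, and the names simply mirror the patch.

```python
# Standalone sketch of the token-window chunking the patch adds to
# LLMBasedExtractor.split_text_by_token_limit (assumes tiktoken is installed).
import tiktoken

tokenizer = tiktoken.get_encoding("o200k_base")  # same default as the patch

def split_text_by_token_limit(text: str, max_token_limit: int) -> list:
    # Encode once, slice the token stream into windows of at most
    # max_token_limit tokens, and decode each window back to text.
    tokens = tokenizer.encode(text)
    return [
        tokenizer.decode(tokens[i : i + max_token_limit])
        for i in range(0, len(tokens), max_token_limit)
    ]

if __name__ == "__main__":
    sample = "data " * 50_000  # roughly 50k tokens, past the 32k default limit
    chunks = split_text_by_token_limit(sample, 32_000)
    # Expect two chunks: the first capped at 32k tokens, the remainder second.
    print(len(chunks), [len(tokenizer.encode(c)) for c in chunks])
```

Note that the summary-style extractors (SummaryExtractor, TopicDescriptionExtractor) prompt only with chunks[0], while the list-producing extractors iterate over every chunk and merge the results, so content past the first 32k tokens will not influence summaries.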