In [1]:
from lightrag.core import Component, Generator
from lightrag.components.model_client import GroqAPIClient
from dotenv import load_dotenv
import os
os.environ["GROQ_API_KEY"] = ''

# Load environment variables from .env file
load_dotenv()

# Access the environment variable
api_key = os.getenv("GROQ_API_KEY")
print(f"GROQ_API_KEY: {api_key}")  # This should print the API key if loaded correctly

GROQ_API_KEY: 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


In [6]:

from dataclasses import dataclass, field

from lightrag.core import Component, Generator, DataClass
from lightrag.components.model_client import GroqAPIClient
from lightrag.components.output_parsers import JsonOutputParser

@dataclass
class QAOutput(DataClass):
    explanation: str = field(
        metadata={"desc": "A brief explanation of the concept in one sentence."}
    )
    example: str = field(metadata={"desc": "An example of the concept in a sentence."})



qa_template = r"""<SYS>
You are a helpful assistant.
<OUTPUT_FORMAT>
{{output_format_str}}
</OUTPUT_FORMAT>
</SYS>
User: {{input_str}}
You:"""

class QA(Component):
    def __init__(self):
        super().__init__()

        parser = JsonOutputParser(data_class=QAOutput, return_data_class=True)
        self.generator = Generator(
            model_client=GroqAPIClient(),
            model_kwargs={"model": "llama3-8b-8192"},
            template=qa_template,
            prompt_kwargs={"output_format_str": parser.format_instructions()},
            output_processors=parser,
        )

    def call(self, query: str):
        return self.generator.call({"input_str": query})

    async def acall(self, query: str):
        return await self.generator.acall({"input_str": query})

In [9]:

qa = QA()
print(qa)

# call
output = qa("What is Pakistan?")
print(output)

QA(
  (generator): Generator(
    model_kwargs={'model': 'llama3-8b-8192'}, 
    (prompt): Prompt(
      template: <SYS>
      You are a helpful assistant.
      <OUTPUT_FORMAT>
      {{output_format_str}}
      </OUTPUT_FORMAT>
      </SYS>
      User: {{input_str}}
      You:, prompt_kwargs: {'output_format_str': 'Your output should be formatted as a standard JSON instance with the following schema:\n```\n{\n    "explanation": "A brief explanation of the concept in one sentence. (str) (required)",\n    "example": "An example of the concept in a sentence. (str) (required)"\n}\n```\n-Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!\n-Use double quotes for the keys and string values.\n-DO NOT mistaken the "properties" and "type" in the schema as the actual fields in the JSON output.\n-Follow the JSON formatting conventions.'}, prompt_variables: ['output_format_str', 'input_str']
    )
    (model_client): Groq

In [8]:
qa.generator.print_prompt(
    output_format_str=qa.generator.output_processors.format_instructions(),
    input_str="What is Pakistan"
)

Prompt:
______________________
<SYS>
You are a helpful assistant.
<OUTPUT_FORMAT>
Your output should be formatted as a standard JSON instance with the following schema:
```
{
    "explanation": "A brief explanation of the concept in one sentence. (str) (required)",
    "example": "An example of the concept in a sentence. (str) (required)"
}
```
-Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!
-Use double quotes for the keys and string values.
-DO NOT mistaken the "properties" and "type" in the schema as the actual fields in the JSON output.
-Follow the JSON formatting conventions.
</OUTPUT_FORMAT>
</SYS>
User: What is Pakistan
You:


In [10]:
def jinja2_template_example(**kwargs):
   from jinja2 import Template

   template = r"""<SYS>{{ task_desc_str }}</SYS>
{# tools #}
{% if tools %}
<TOOLS>
{% for tool in tools %}
{{loop.index}}. {{ tool }}
{% endfor %}
</TOOLS>
{% endif %}
User: {{ input_str }}"""
   t = Template(template, trim_blocks=True, lstrip_blocks=True)
   print(t.render(**kwargs))

In [17]:
task_desc_str=" You are a expert in maths"
input_str="What is the formula for arithmetic series?"
tools=['google', 'wikipedia', 'wikidata']

In [18]:
jinja2_template_example(task_desc_str=task_desc_str, input_str=input_str)
jinja2_template_example(
     task_desc_str=task_desc_str, input_str=input_str, tools=tools
 )

<SYS> You are a expert in maths</SYS>
User: What is the formula for arithmetic series?
<SYS> You are a expert in maths</SYS>
<TOOLS>
1. google
2. wikipedia
3. wikidata
</TOOLS>
User: What is the formula for arithmetic series?


Using Prompt class


In [20]:
template = r"""<SYS>{{ task_desc_str }}</SYS>
{# tools #}
{% if tools %}
<TOOLS>
{% for tool in tools %}
{{loop.index}}. {{ tool }}
{% endfor %}
</TOOLS>
{% endif %}
User: {{ input_str }}"""

In [22]:
from lightrag.core.prompt_builder import Prompt

prompt = Prompt(
   template=template,
   prompt_kwargs={
      "task_desc_str": task_desc_str,
      "tools": tools,
   },
)
print(prompt)
print("The second one is",prompt(input_str=input_str)) # takes the rest arguments in keyword arguments

Prompt(
  template: <SYS>{{ task_desc_str }}</SYS>
  {# tools #}
  {% if tools %}
  <TOOLS>
  {% for tool in tools %}
  {{loop.index}}. {{ tool }}
  {% endfor %}
  </TOOLS>
  {% endif %}
  User: {{ input_str }}, prompt_kwargs: {'task_desc_str': ' You are a expert in maths', 'tools': ['google', 'wikipedia', 'wikidata']}, prompt_variables: ['input_str', 'task_desc_str', 'tools']
)
The second one is <SYS> You are a expert in maths</SYS>
<TOOLS>
1. google
2. wikipedia
3. wikidata
</TOOLS>
User: What is the formula for arithmetic series?


Default Prompt Template

In default, the Prompt class uses the DEFAULT_LIGHTRAG_SYSTEM_PROMPT as its string template if no template is provided. This default template allows you to conditionally passing seven important variables designed from the data flow diagram abo

LIGHTRAG_DEFAULT_PROMPT_ARGS = [
   "task_desc_str",  # task description
   "output_format_str",  # output format of the task
   "tools_str",  # tools used in the task
   "examples_str",  # examples of the task
   "chat_history_str",  # chat history of the user
   "context_str",  # context of the user query
   "steps_str",  # used in agent steps
   "input_str",  # user query or input
]

In [23]:
prompt = Prompt()
output = prompt(input_str=input_str)
print(output)

<SYS>
You are a helpful assistant.
</SYS>
<User>
What is the formula for arithmetic series?
</User>
You:



Generator In Action
We will create a simple one-turn chatbot to demonstrate how to use the Generator.

Minimum Example
The minimum setup to initiate a generator in the code:

In [28]:
from lightrag.core import Generator
from lightrag.components.model_client import GroqAPIClient

generator = Generator(
    model_client=GroqAPIClient(),
    model_kwargs={"model": "llama3-8b-8192"},
    
)
print(generator)

Generator(
  model_kwargs={'model': 'llama3-8b-8192'}, 
  (prompt): Prompt(
    template: <SYS>
    {# task desc #}
    {% if task_desc_str %}
    {{task_desc_str}}
    {% else %}
    You are a helpful assistant.
    {% endif %}
    {# output format #}
    {% if output_format_str %}
    <OUTPUT_FORMAT>
    {{output_format_str}}
    </OUTPUT_FORMAT>
    {% endif %}
    {# tools #}
    {% if tools_str %}
    <TOOLS>
    {{tools_str}}
    </TOOLS>
    {% endif %}
    {# example #}
    {% if examples_str %}
    <EXAMPLES>
    {{examples_str}}
    </EXAMPLES>
    {% endif %}
    {# chat history #}
    {% if chat_history_str %}
    <CHAT_HISTORY>
    {{chat_history_str}}
    </CHAT_HISTORY>
    {% endif %}
    {#contex#}
    {% if context_str %}
    <CONTEXT>
    {{context_str}}
    </CONTEXT>
    {% endif %}
    {# steps #}
    {% if steps_str %}
    <STEPS>
    {{steps_str}}
    </STEPS>
    {% endif %}
    </SYS>
    {% if input_str %}
    <User>
    {{input_str}}
    </User>
    {% endif

In [25]:
prompt_kwargs = {"input_str": "What is LLM? Explain in one sentence."}
generator.print_prompt(**prompt_kwargs)

Prompt:
______________________
<SYS>
You are a helpful assistant.
</SYS>
<User>
What is LLM? Explain in one sentence.
</User>
You:



Call the generator 

In [26]:
output = generator(
    prompt_kwargs=prompt_kwargs,
)
print(output)


GeneratorOutput(data='LLM stands for Large Language Model, which refers to a type of artificial intelligence (AI) that can process and generate human-like natural language text.', error=None, usage=None, raw_response='LLM stands for Large Language Model, which refers to a type of artificial intelligence (AI) that can process and generate human-like natural language text.', metadata=None)


Using built in prompt template 

In [32]:

generator = Generator(
    model_client=GroqAPIClient(),
    model_kwargs={"model": "llama3-8b-8192"},
   
)
print("Generator is ",generator,'\n')
prompt_kwargs = {"input_str": "What is LLM?"}

generator.print_prompt(
    **prompt_kwargs,
)
output = generator(
    prompt_kwargs=prompt_kwargs,
)

Generator is  Generator(
  model_kwargs={'model': 'llama3-8b-8192'}, 
  (prompt): Prompt(
    template: <SYS>
    {# task desc #}
    {% if task_desc_str %}
    {{task_desc_str}}
    {% else %}
    You are a helpful assistant.
    {% endif %}
    {# output format #}
    {% if output_format_str %}
    <OUTPUT_FORMAT>
    {{output_format_str}}
    </OUTPUT_FORMAT>
    {% endif %}
    {# tools #}
    {% if tools_str %}
    <TOOLS>
    {{tools_str}}
    </TOOLS>
    {% endif %}
    {# example #}
    {% if examples_str %}
    <EXAMPLES>
    {{examples_str}}
    </EXAMPLES>
    {% endif %}
    {# chat history #}
    {% if chat_history_str %}
    <CHAT_HISTORY>
    {{chat_history_str}}
    </CHAT_HISTORY>
    {% endif %}
    {#contex#}
    {% if context_str %}
    <CONTEXT>
    {{context_str}}
    </CONTEXT>
    {% endif %}
    {# steps #}
    {% if steps_str %}
    <STEPS>
    {{steps_str}}
    </STEPS>
    {% endif %}
    </SYS>
    {% if input_str %}
    <User>
    {{input_str}}
    </User

Using custom prompt template

In this example, we will use a customized template to format the prompt. We intialized the prompt with one variable task_desc_str, which is further combined with the input_str in the prompt

In [33]:
template = r"""<SYS>{{task_desc_str}}</SYS>
User: {{input_str}}
You:"""
generator = Generator(
    model_client=GroqAPIClient(),
    model_kwargs={"model": "llama3-8b-8192"},
    template=template,
    prompt_kwargs={"task_desc_str": "You are a helpful assistant"},
)
print("Generator is ",generator,'\n')
prompt_kwargs = {"input_str": "What is LLM?"}

generator.print_prompt(
    **prompt_kwargs,
)
output = generator(
    prompt_kwargs=prompt_kwargs,
)

Generator is  Generator(
  model_kwargs={'model': 'llama3-8b-8192'}, 
  (prompt): Prompt(
    template: <SYS>{{task_desc_str}}</SYS>
    User: {{input_str}}
    You:, prompt_kwargs: {'task_desc_str': 'You are a helpful assistant'}, prompt_variables: ['task_desc_str', 'input_str']
  )
  (model_client): GroqAPIClient()
) 

Prompt:
______________________
<SYS>You are a helpful assistant</SYS>
User: What is LLM?
You:


USE OUTPUT PROCESSORS 

In this example, we will instruct the LLM to output a JSON object in response. We will use the JsonParser to parse the output back to a dict object.

In [35]:
from lightrag.core import Generator
from lightrag.core.types import GeneratorOutput
from lightrag.components.model_client import GroqAPIClient
from lightrag.core.string_parser import JsonParser

output_format_str = r"""Your output should be formatted as a standard JSON object with two keys:
{
    "explanation": "A brief explanation of the concept in one sentence.",
    "example": "An example of the concept in a sentence."
}
"""

generator = Generator(
    model_client=GroqAPIClient(),
    model_kwargs={"model": "llama3-8b-8192"},
    prompt_kwargs={"output_format_str": output_format_str},
    output_processors=JsonParser(),
)

prompt_kwargs = {"input_str": "What is LLM?"}
generator.print_prompt(**prompt_kwargs)

output: GeneratorOutput = generator(prompt_kwargs=prompt_kwargs)
print("The type of output data is ",type(output.data))
print("The output data is ",output.data)

Prompt:
______________________
<SYS>
You are a helpful assistant.
<OUTPUT_FORMAT>
Your output should be formatted as a standard JSON object with two keys:
{
    "explanation": "A brief explanation of the concept in one sentence.",
    "example": "An example of the concept in a sentence."
}

</OUTPUT_FORMAT>
</SYS>
<User>
What is LLM?
</User>
You:

The type of output data is  <class 'dict'>
The outout data is  {'explanation': 'LLM stands for Large Language Model, which refers to a type of artificial intelligence trained on large datasets to generate human-like text or respond to natural language inputs.', 'example': 'For instance, an LLM can be asked a question or given a prompt and respond with a coherent and informative answer, making it suitable for applications such as chatbots, language translation, and text summarization.'}


Switch the model client

In [38]:
from lightrag.core.types import ModelClientType

generator = Generator(
    model_client=ModelClientType.GROQ(),  # or ModelClientType.GROQ()
    model_kwargs={"model": "gpt-3.5-turbo"},
)


Get Errors in GeneratorOutput

We will use an incorrect API key to delibrately create an error. We will still get a response, but it will only contain empty data and an error message. Here is an example of an API key error with OpenAI:

In [None]:
GeneratorOutput(data=None, error="Error code: 401 - {'error': {'message': 'Incorrect API key provided: ab. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}", usage=None, raw_response=None, metadata=None)

Create from Configs
As with all components, we can create the generator purely from configs.

In [39]:
from lightrag.core import Generator

config = {
    "model_client": {
        "component_name": "GroqAPIClient",
        "component_config": {},
    },
    "model_kwargs": {
        "model": "llama3-8b-8192",
    },
}

generator: Generator = Generator.from_config(config)
print(generator)

prompt_kwargs = {"input_str": "What is LLM? Explain in one sentence."}
generator.print_prompt(**prompt_kwargs)
output = generator(
    prompt_kwargs=prompt_kwargs,
)
print(output)

Generator(
  model_kwargs={'model': 'llama3-8b-8192'}, 
  (prompt): Prompt(
    template: <SYS>
    {# task desc #}
    {% if task_desc_str %}
    {{task_desc_str}}
    {% else %}
    You are a helpful assistant.
    {% endif %}
    {# output format #}
    {% if output_format_str %}
    <OUTPUT_FORMAT>
    {{output_format_str}}
    </OUTPUT_FORMAT>
    {% endif %}
    {# tools #}
    {% if tools_str %}
    <TOOLS>
    {{tools_str}}
    </TOOLS>
    {% endif %}
    {# example #}
    {% if examples_str %}
    <EXAMPLES>
    {{examples_str}}
    </EXAMPLES>
    {% endif %}
    {# chat history #}
    {% if chat_history_str %}
    <CHAT_HISTORY>
    {{chat_history_str}}
    </CHAT_HISTORY>
    {% endif %}
    {#contex#}
    {% if context_str %}
    <CONTEXT>
    {{context_str}}
    </CONTEXT>
    {% endif %}
    {# steps #}
    {% if steps_str %}
    <STEPS>
    {{steps_str}}
    </STEPS>
    {% endif %}
    </SYS>
    {% if input_str %}
    <User>
    {{input_str}}
    </User>
    {% endif

Purely from the Configs

This is even more general. This method can be used to create any component from configs. We just need to follow the config structure: component_name and component_config for all argument

In [40]:
from lightrag.utils.config import new_component
from lightrag.core import Generator

config = {
    "generator": {
        "component_name": "Generator",
        "component_config": {
            "model_client": {
                "component_name": "GroqAPIClient",
                "component_config": {},
            },
            "model_kwargs": {
                "model": "llama3-8b-8192",
            },
        },
    }
}

generator: Generator = new_component(config["generator"])
print(generator)

prompt_kwargs = {"input_str": "What is LLM? Explain in one sentence."}
generator.print_prompt(**prompt_kwargs)
output = generator(
    prompt_kwargs=prompt_kwargs,
)
print(output)

Generator(
  model_kwargs={'model': 'llama3-8b-8192'}, 
  (prompt): Prompt(
    template: <SYS>
    {# task desc #}
    {% if task_desc_str %}
    {{task_desc_str}}
    {% else %}
    You are a helpful assistant.
    {% endif %}
    {# output format #}
    {% if output_format_str %}
    <OUTPUT_FORMAT>
    {{output_format_str}}
    </OUTPUT_FORMAT>
    {% endif %}
    {# tools #}
    {% if tools_str %}
    <TOOLS>
    {{tools_str}}
    </TOOLS>
    {% endif %}
    {# example #}
    {% if examples_str %}
    <EXAMPLES>
    {{examples_str}}
    </EXAMPLES>
    {% endif %}
    {# chat history #}
    {% if chat_history_str %}
    <CHAT_HISTORY>
    {{chat_history_str}}
    </CHAT_HISTORY>
    {% endif %}
    {#contex#}
    {% if context_str %}
    <CONTEXT>
    {{context_str}}
    </CONTEXT>
    {% endif %}
    {# steps #}
    {% if steps_str %}
    <STEPS>
    {{steps_str}}
    </STEPS>
    {% endif %}
    </SYS>
    {% if input_str %}
    <User>
    {{input_str}}
    </User>
    {% endif

Output Parsers in Action

In [49]:
from dataclasses import dataclass, field
from lightrag.core import DataClass

@dataclass
class User(DataClass):
    id: int = field(default=1, metadata={"description": "User ID"})  # it will also without suing field  function
    name: str = field(default="John", metadata={"description": "User name"})

user_example = User(id=1, name="John")

JSON Output parser

In [42]:
from lightrag.components.output_parsers import JsonOutputParser

parser = JsonOutputParser(data_class=User, examples=[user_example])
print(parser)

JsonOutputParser(
  data_class=User, examples=[User(id=1, name='John')], exclude_fields=None, return_data_class=False
  (output_format_prompt): Prompt(
    template: Your output should be formatted as a standard JSON instance with the following schema:
    ```
    {{schema}}
    ```
    {% if example %}
    Examples:
    ```
    {{example}}
    ```
    {% endif %}
    -Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!
    -Use double quotes for the keys and string values.
    -DO NOT mistaken the "properties" and "type" in the schema as the actual fields in the JSON output.
    -Follow the JSON formatting conventions., prompt_variables: ['example', 'schema']
  )
  (output_processors): JsonParser()
)


In [45]:
user_to_parse = '{"id": 2, "name": "Jane"}'
parsed_user = parser(user_to_parse)
print(parsed_user)

{'id': 2, 'name': 'Jane'}


YAML OUTPUT PARSER

In [50]:
from lightrag.components.output_parsers import YamlOutputParser

parser = YamlOutputParser(data_class=User, examples=[user_example])
print(parser)

YamlOutputParser(
  data_class=User, examples=[User(id=1, name='John')], exclude_fields=None, return_data_class=False
  (output_format_prompt): Prompt(
    template: Your output should be formatted as a standard YAML instance with the following schema:
    ```
    {{schema}}
    ```
    {% if example %}
    Examples:
    ```
    {{example}}
    ```
    {% endif %}
    
    -Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output!
    -Follow the YAML formatting conventions with an indent of 2 spaces.
    -DO NOT mistaken the "properties" and "type" in the schema as the actual fields in the YAML output.
    -Quote the string values properly., prompt_variables: ['example', 'schema']
  )
  (output_processors): YamlParser()
)


In [51]:
user_to_parse = "id: 2\nname: Jane"
parsed_user = parser(user_to_parse)
print(parsed_user)

{'id': 2, 'name': 'Jane'}


Embedder: Using LOCAL EMBEDDING MODELS

In [52]:
from lightrag.core.embedder import Embedder
from lightrag.components.model_client import TransformersClient

model_kwargs = {"model": "thenlper/gte-base"}
local_embedder = Embedder(model_client=TransformersClient(), model_kwargs=model_kwargs)

In [53]:
query = "What is the capital of China?"

queries = [query] * 100



In [56]:
output = local_embedder(query)
print("The length of list is ",output.length, "The dimensions of output is",output.embedding_dim,"The output is normalized ?", output.is_normalized)
# 1 768 True

output = local_embedder(queries)
print("The length of length is ",output.length, "The dimensions of output is",output.embedding_dim,"The output is normalized ?", output.is_normalized)
# 100 768 True

The length of list is  1 The dimensions of output is 768 The output is normalized ? True
The length of length is  100 The dimensions of output is 768 The output is normalized ? True


Decreasing the dimension of the embedding
Use Output Processors

If we want to decreate the embedding dimension to only 256 to save memory, we can customize an additional output processing step and pass it to embedder via the output_processors argument.



In [57]:
from lightrag.core.types import Embedding
from lightrag.core.functional import normalize_vector
from typing import List
from lightrag.core.component import Component
from copy import deepcopy

class DecreaseEmbeddingDim(Component):
    def __init__(self, old_dim: int, new_dim: int,  normalize: bool = True):
        super().__init__()
        self.old_dim = old_dim
        self.new_dim = new_dim
        self.normalize = normalize
        assert self.new_dim < self.old_dim, "new_dim should be less than old_dim"

    def call(self, input: List[Embedding]) -> List[Embedding]:
        output: List[Embedding] = deepcopy(input)
        for embedding in output:
            old_embedding = embedding.embedding
            new_embedding = old_embedding[: self.new_dim]
            if self.normalize:
                new_embedding = normalize_vector(new_embedding)
            embedding.embedding = new_embedding
        return output

    def _extra_repr(self) -> str:
        repr_str = f"old_dim={self.old_dim}, new_dim={self.new_dim}, normalize={self.normalize}"
        return repr_str

This output procesor will process on the data field of the EmbedderOutput, which is of type List[Embedding]. Thus we have input: List[Embedding] -> output: List[Embedding] in the call method. Putting it all together, we can create a new embedder with the output processor.

In [59]:
local_embedder_256 = Embedder(
     model_client=TransformersClient(),
     model_kwargs=model_kwargs,
     output_processors=DecreaseEmbeddingDim(768, 256),
 )

print(local_embedder_256)

Embedder(
  model_kwargs={'model': 'thenlper/gte-base'}, 
  (model_client): TransformersClient()
  (output_processors): DecreaseEmbeddingDim(old_dim=768, new_dim=256, normalize=True)
)


Now, the embeddings will be generated of 256 dimesions

In [60]:
output = local_embedder_256(query)
print(output.length, output.embedding_dim, output.is_normalized)
# 1 256 True

1 256 True


BATCH EMBEDDER

 when you use parentheses with an object like batch_embedder(queries), Python attempts to call the __call__ method of that object if it is defined.

In [64]:
from lightrag.core.embedder import BatchEmbedder

batch_embedder = BatchEmbedder(embedder=local_embedder, batch_size=100)
query = "What is the capital of China?"
queries = [query] * 1000

response = batch_embedder(queries)
# 100%|██████████| 11/11 [00:04<00:00,  2.59it/s]

Batch embedding documents: 100%|██████████| 10/10 [00:18<00:00,  1.85s/it]


In [65]:
print(response[0])

EmbedderOutput(data=[Embedding(embedding=[-0.005133619997650385, 0.0028610778972506523, -0.005318059120327234, 0.003371176077052951, 0.02472376450896263, 0.031122606247663498, 0.015265182591974735, 0.07056716829538345, 0.0035839297343045473, -0.04136485978960991, 0.005012736655771732, -0.057165008038282394, -0.05562780052423477, 0.015120604075491428, -0.018120622262358665, 0.034119606018066406, 0.0476648174226284, -0.004806012846529484, -0.006379107013344765, 0.020822394639253616, -0.025776447728276253, 0.006425760220736265, 0.025007842108607292, 0.0504639632999897, 0.03291403874754906, -0.009098559617996216, -0.015141552314162254, 0.009841454215347767, -0.09457811713218689, -0.013661283068358898, -0.0033976684790104628, 0.026431547477841377, -0.005364086478948593, 0.01193434838205576, -0.004608070943504572, -0.008419710211455822, -0.023735299706459045, -0.05674968659877777, -0.00627771345898509, 0.011796836741268635, -0.02255016751587391, -0.031651098281145096, -0.027354855090379715, 

Retriever

In [73]:
from typing import Generic, TypeVar, List, Optional, Union


RetrieverQueryType = TypeVar("RetrieverQueryType", contravariant=True)
RetrieverStrQueryType = str
RetrieverQueriesType = Union[RetrieverQueryType, Sequence[RetrieverQueryType]]
RetrieverStrQueriesType = Union[str, Sequence[RetrieverStrQueryType]]

In [74]:
RetrieverDocumentType = TypeVar("RetrieverDocumentType", contravariant=True) # a single document
RetrieverDocumentsType = Sequence[RetrieverDocumentType] # The final documents types retriever can use

In [75]:
from dataclasses import dataclass, field

@dataclass
class RetrieverOutput(DataClass):
    doc_indices: List[int] = field(metadata={"desc": "List of document indices"})
    doc_scores: Optional[List[float]] = field(
        default=None, metadata={"desc": "List of document scores"}
    )
    query: Optional[RetrieverQueryType] = field(
        default=None, metadata={"desc": "The query used to retrieve the documents"}
    )
    documents: Optional[List[RetrieverDocumentType]] = field(
        default=None, metadata={"desc": "List of retrieved documents"}
    )


RetrieverOutputType = List[RetrieverOutput]  # so to support multiple queries at once

In [76]:
class Retriever(Component, Generic[RetrieverDocumentType, RetrieverQueryType]):

    ...

    def call(
        self,
        input: RetrieverQueriesType,
        top_k: Optional[int] = None,
        **kwargs,
    ) -> RetrieverOutputType:
        raise NotImplementedError(f"retrieve is not implemented")

    async def acall(
        self,
        input: RetrieverQueriesType,
        top_k: Optional[int] = None,
        **kwargs,
    ) -> RetrieverOutputType:
        raise NotImplementedError(f"Async retrieve is not implemented")

In [78]:
query_1 = "What are the benefits of renewable energy?" # gt is [0, 3]
query_2 = "How do solar panels impact the environment?" # gt is [1, 2]

documents =[
    {
        "title": "The Impact of Renewable Energy on the Economy",
        "content": "Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute significantly to the economy by creating jobs in the manufacturing and installation sectors. The growth in renewable energy usage boosts local economies through increased investment in technology and infrastructure."
    },
    {
        "title": "Understanding Solar Panels",
        "content": "Solar panels convert sunlight into electricity by allowing photons, or light particles, to knock electrons free from atoms, generating a flow of electricity. Solar panels are a type of renewable energy technology that has been found to have a significant positive effect on the environment by reducing the reliance on fossil fuels."
    },
    {
        "title": "Pros and Cons of Solar Energy",
        "content": "While solar energy offers substantial environmental benefits, such as reducing carbon footprints and pollution, it also has downsides. The production of solar panels can lead to hazardous waste, and large solar farms require significant land, which can disrupt local ecosystems."
    },
    {
        "title":  "Renewable Energy and Its Effects",
        "content": "Renewable energy sources like wind, solar, and hydro power play a crucial role in combating climate change. They do not produce greenhouse gases during operation, making them essential for sustainable development. However, the initial setup and material sourcing for these technologies can still have environmental impacts."
    }
]

FAISS RETRIEVER

In [86]:
from lightrag.core.embedder import Embedder
from lightrag.components.model_client import TransformersClient

model_kwargs = {"model": "thenlper/gte-base","dimensions": 256,
    "encoding_format": "float",}
local_embedder = Embedder(model_client=TransformersClient(), model_kwargs=model_kwargs)
# model_kwargs = {
#     "model": "text-embedding-3-small",
#     "dimensions": 256,
#     "encoding_format": "float",
# }


# embedder = Embedder(model_client =ModelClientType.GROQ(), model_kwargs=model_kwargs)
output = local_embedder(input=[doc["content"] for doc in documents])
print("Output is ",output,'\n')
documents_embeddings = [x.embedding for x in output.data]
print("Document embeddings are ",documents_embeddings)

Output is  EmbedderOutput(data=[Embedding(embedding=[0.031234975904226303, 0.006776659749448299, 0.018826505169272423, 0.0438690111041069, 0.058732613921165466, 0.04396121948957443, 0.003569175722077489, 0.0425785593688488, -0.06031818687915802, -0.055096086114645004, -0.010727199725806713, -0.008095057681202888, -0.06078498438000679, 0.031698841601610184, -0.023531673476099968, 0.04983801022171974, 0.05090096592903137, 0.0010038662003353238, 0.03749937564134598, 0.007872911170125008, -0.02095297910273075, -0.000925887085031718, 0.02084084041416645, 0.0435415580868721, 0.026637457311153412, -0.03196244686841965, -0.003608370665460825, 0.04615307226777077, -0.07347429543733597, -0.031295616179704666, 0.043329525738954544, -0.007536784745752811, -0.040572669357061386, -0.015187337063252926, 0.030087590217590332, -0.01298055611550808, -0.03638317808508873, -0.06368640810251236, -0.050168901681900024, -0.038125183433294296, -0.04273608699440956, -0.038880299776792526, -0.023293282836675644

In [82]:
from lightrag.components.retriever import FAISSRetriever
retriever = FAISSRetriever(top_k=2, embedder=local_embedder, documents=documents_embeddings)

print(retriever)

FAISSRetriever(
  top_k=2, metric=prob, dimensions=768, total_documents=4
  (embedder): Embedder(
    model_kwargs={'model': 'thenlper/gte-base', 'dimensions': 256, 'encoding_format': 'float'}, 
    (model_client): TransformersClient()
  )
)


In [84]:
retriever_1 = FAISSRetriever(top_k=2, embedder=local_embedder)
retriever_1.build_index_from_documents(documents=documents_embeddings)

In [85]:
output_1 = retriever(input=query_1)
output_2 = retriever(input=query_2)
output_3 = retriever(input = [query_1, query_2])
print(output_1)
print(output_2)
print(output_3)

[RetrieverOutput(doc_indices=[0, 2], doc_scores=[0.9559999704360962, 0.9390000104904175], query='What are the benefits of renewable energy?', documents=None)]
[RetrieverOutput(doc_indices=[2, 1], doc_scores=[0.9599999785423279, 0.9559999704360962], query='How do solar panels impact the environment?', documents=None)]
[RetrieverOutput(doc_indices=[0, 2], doc_scores=[0.9559999704360962, 0.9390000104904175], query='What are the benefits of renewable energy?', documents=None), RetrieverOutput(doc_indices=[2, 1], doc_scores=[0.9599999785423279, 0.9559999704360962], query='How do solar panels impact the environment?', documents=None)]


TEXT-SPLITTING

SPLIT BY WORD

In [87]:
from lightrag.components.data_process.text_splitter import TextSplitter
from lightrag.core.types import Document

# Configure the splitter settings
text_splitter = TextSplitter(
    split_by="word",
    chunk_size=5,
    chunk_overlap=1
)

# Example document
doc = Document(
    text="Example text. More example text. Even more text to illustrate.",
    id="doc1"
)

# Execute the splitting
splitted_docs = text_splitter.call(documents=[doc])

for doc in splitted_docs:
    print(doc)

# Output:
# Document(id=44a8aa37-0d16-40f0-9ca4-2e25ae5336c8, text='Example text. More example text. ', meta_data=None, vector=[], parent_doc_id=doc1, order=0, score=None)
# Document(id=ca0af45b-4f88-49b5-97db-163da9868ea4, text='text. Even more text to ', meta_data=None, vector=[], parent_doc_id=doc1, order=1, score=None)
# Document(id=e7b617b2-3927-4248-afce-ec0fc247ac8b, text='to illustrate.', meta_data=None, vector=[], parent_doc_id=doc1, order=2, score=None)

Splitting Documents in Batches: 100%|██████████| 1/1 [00:00<?, ?it/s]

Document(id=db342bae-eb78-4d6c-baff-0344c7418561, text='Example text. More example text. ', meta_data=None, vector=[], parent_doc_id=doc1, order=0, score=None)
Document(id=4d168d04-97fc-472b-ac96-cb100d3edf64, text='text. Even more text to ', meta_data=None, vector=[], parent_doc_id=doc1, order=1, score=None)
Document(id=edb47605-45ee-4ea6-92d5-8a5bcdfb990f, text='to illustrate.', meta_data=None, vector=[], parent_doc_id=doc1, order=2, score=None)





SPLIT BY TOKEN

In [88]:
from lightrag.components.data_process.text_splitter import TextSplitter
from lightrag.core.types import Document
import tiktoken

# Configure the splitter settings
text_splitter = TextSplitter(
    split_by="token",
    chunk_size=5,
    chunk_overlap=0
)

doc = Document(
    text="Example text. More example text. Even more text to illustrate.",
    id = "doc1"
    )

splitted_docs = (text_splitter.call(documents=[doc]))

for doc in splitted_docs:
    print(doc)

# Output:
# Document(id=27cec433-b400-4f11-8871-e386e774d150, text='Example text. More example', meta_data=None, vector=[], parent_doc_id=doc1, order=0, score=None)
# Document(id=8905dc5f-8be5-4ca4-88b1-2ae492258b53, text=' text. Even more text', meta_data=None, vector=[], parent_doc_id=doc1, order=1, score=None)
# Document(id=ba8e1e23-82fb-4aa8-bfc5-e22084984bb9, text=' to illustrate.', meta_data=None, vector=[], parent_doc_id=doc1, order=2, score=None)

Splitting Documents in Batches: 100%|██████████| 1/1 [00:00<00:00, 500.16it/s]

Document(id=46f972cd-91c2-413b-948c-80bb5d94350a, text='Example text. More example', meta_data=None, vector=[], parent_doc_id=doc1, order=0, score=None)
Document(id=ceb2af69-e55e-49e7-ab8f-d050857a5fc3, text=' text. Even more text', meta_data=None, vector=[], parent_doc_id=doc1, order=1, score=None)
Document(id=ff105da6-1427-4d27-b611-d506c680a406, text=' to illustrate.', meta_data=None, vector=[], parent_doc_id=doc1, order=2, score=None)





BM25 RETRIEVER

In [92]:
from lightrag.components.retriever.bm25_retriever import split_text_by_word_fn_then_lower_tokenized, split_text_by_word_fn

query_1_words = split_text_by_word_fn(query_1)
print(query_1_words)
query_1_tokens = split_text_by_word_fn_then_lower_tokenized(query_1)
print(query_1_tokens)

['what', 'are', 'the', 'benefits', 'of', 'renewable', 'energy?']
['what', 'are', 'the', 'benef', 'its', 'of', 're', 'new', 'able', 'energy', '?']


In [93]:
from lightrag.components.retriever import BM25Retriever

document_map_func = lambda x: x["content"]

bm25_retriever = BM25Retriever(top_k=2, documents=documents, document_map_func=document_map_func)
print(bm25_retriever)

BM25Retriever(top_k=2, k1=1.5, b=0.75, epsilon=0.25, use_tokenizer=True, total_documents=4)


In [94]:
output_1 = bm25_retriever(input=query_1)
output_2 = bm25_retriever(input=query_2)
output_3 = bm25_retriever(input = [query_1, query_2])
print(output_1)
print(output_2)
print(output_3)

[RetrieverOutput(doc_indices=[2, 1], doc_scores=[2.151683837681807, 1.6294762236217233], query='What are the benefits of renewable energy?', documents=None)]
[RetrieverOutput(doc_indices=[3, 2], doc_scores=[1.5166601493236314, 0.7790170272403408], query='How do solar panels impact the environment?', documents=None)]
[RetrieverOutput(doc_indices=[2, 1], doc_scores=[2.151683837681807, 1.6294762236217233], query='What are the benefits of renewable energy?', documents=None), RetrieverOutput(doc_indices=[3, 2], doc_scores=[1.5166601493236314, 0.7790170272403408], query='How do solar panels impact the environment?', documents=None)]


Here we see the first query returns [2, 1] while the ground truth is [0, 3]. The second query returns [3, 2] while the ground truth is [1, 2]. The performance is quite disappointing. BM25 is known for lack of semantic understanding and does not consider context. We tested on the shorter and almost key-word like version of our queries and use both the title and content, and it gives the right response using the tokenized split.

In [95]:
query_1_short = "renewable energy?"  # gt is [0, 3]
query_2_short = "solar panels?"  # gt is [1, 2]
document_map_func = lambda x: x["title"] + " " + x["content"]
bm25_retriever.build_index_from_documents(documents=documents, document_map_func=document_map_func)

In [97]:
output_1 = bm25_retriever(input=query_1_short)
output_2 = bm25_retriever(input=query_2_short)

print(output_1)
print(output_2)


[RetrieverOutput(doc_indices=[0, 3], doc_scores=[0.9498793313012154, 0.8031794089550072], query='renewable energy?', documents=None)]
[RetrieverOutput(doc_indices=[2, 1], doc_scores=[0.5343238380789569, 0.4568096570283078], query='solar panels?', documents=None)]


DATA AND RAG

In [98]:
org_documents =[
    {
        "title": "The Impact of Renewable Energy on the Economy",
        "content": "Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute significantly to the economy by creating jobs in the manufacturing and installation sectors. The growth in renewable energy usage boosts local economies through increased investment in technology and infrastructure."
    },
    {
        "title": "Understanding Solar Panels",
        "content": "Solar panels convert sunlight into electricity by allowing photons, or light particles, to knock electrons free from atoms, generating a flow of electricity. Solar panels are a type of renewable energy technology that has been found to have a significant positive effect on the environment by reducing the reliance on fossil fuels."
    },
    {
        "title": "Pros and Cons of Solar Energy",
        "content": "While solar energy offers substantial environmental benefits, such as reducing carbon footprints and pollution, it also has downsides. The production of solar panels can lead to hazardous waste, and large solar farms require significant land, which can disrupt local ecosystems."
    },
    {
        "title":  "Renewable Energy and Its Effects",
        "content": "Renewable energy sources like wind, solar, and hydro power play a crucial role in combating climate change. They do not produce greenhouse gases during operation, making them essential for sustainable development. However, the initial setup and material sourcing for these technologies can still have environmental impacts."
    }
]

turns = [
    {
        "user": "What are the benefits of renewable energy?",
        "system": "I can see you are interested in renewable energy. Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute significantly to the economy by creating jobs in the manufacturing and installation sectors. The growth in renewable energy usage boosts local economies through increased investment in technology and infrastructure.",
        "user_time": "2021-09-01T12:00:00Z",
        "system_time": "2021-09-01T12:00:01Z"
    },
    {
        "user": "How do solar panels impact the environment?",
        "system": "Solar panels convert sunlight into electricity by allowing photons, or light particles, to knock electrons free from atoms, generating a flow of electricity. Solar panels are a type of renewable energy technology that has been found to have a significant positive effect on the environment by reducing the reliance on fossil fuels.",
        "user_time": "2021-09-01T12:00:02Z",
        "system_time": "2021-09-01T12:00:03Z"
    }
]

Creating a doucment

In [99]:
from lightrag.core.types import Document

documents  = [Document(text=doc['content'], meta_data={'title': doc['title']}) for doc in org_documents]
print(documents)

[Document(id=481053e1-eadb-4420-a9ce-81b3eee8e01d, text='Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute...', meta_data={'title': 'The Impact of Renewable Energy on the Economy'}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=9757423d-01d4-4642-b16f-6ef3b570e44a, text='Solar panels convert sunlight into electricity by allowing photons, or light particles, to knock ele...', meta_data={'title': 'Understanding Solar Panels'}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=872d8de4-e4c3-4080-88a8-2d877cb81b90, text='While solar energy offers substantial environmental benefits, such as reducing carbon footprints and...', meta_data={'title': 'Pros and Cons of Solar Energy'}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=28a140de-19cc-4ecd-9531-ac97f6a198db, text='Renewable energy sources like wind, solar, and hydro power play a crucial role in combating climate ...', meta_dat

DialogTurn

In [100]:
from lightrag.core.types import DialogTurn, UserQuery, AssistantResponse

dialog_turns = [
DialogTurn(
        user_query=UserQuery(query_str=turn["user"]),
        assistant_response=AssistantResponse(response_str=turn["system"]),
        user_query_timestamp=turn["user_time"],
        assistant_response_timestamp=turn["system_time"],
    )
    for turn in turns
]
print(dialog_turns)

[DialogTurn(id='7ece1d3a-61c5-4715-83cd-f5cc1ba931bd', user_id=None, conversation_id=None, order=None, user_query=UserQuery(query_str='What are the benefits of renewable energy?', metadata=None), assistant_response=AssistantResponse(response_str='I can see you are interested in renewable energy. Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute significantly to the economy by creating jobs in the manufacturing and installation sectors. The growth in renewable energy usage boosts local economies through increased investment in technology and infrastructure.', metadata=None), user_query_timestamp='2021-09-01T12:00:00Z', assistant_response_timestamp='2021-09-01T12:00:01Z', metadata=None, vector=None), DialogTurn(id='8717b76b-4124-4d36-af0c-8722eb688f29', user_id=None, conversation_id=None, order=None, user_query=UserQuery(query_str='How do solar panels impact the environment?', metadata=None), assistant_response=AssistantResponse(response

DATA PIPELINE

In [102]:
# mapping function for org_documents
from typing import Dict

def map_to_document(doc: Dict) -> Document:
    return Document(text=doc['content'], meta_data={'title': doc['title']})

def map_dialogturn_to_document(turn: DialogTurn) -> Document:
    # it can be important to keep the original data's id
    return Document(id=turn.id, text=turn.user_query.query_str + ' ' + turn.assistant_response.response_str)

In [158]:
from lightrag.core.embedder import Embedder
from lightrag.core.types import ModelClientType
from lightrag.components.data_process.text_splitter import TextSplitter
from lightrag.components.data_process import  ToEmbeddings
from lightrag.core.container import Sequential


model_kwargs = {
    "model": "thenlper/gte-base",
    "dimensions": 256,
    "encoding_format": "float",
}

 

# splitter_config = {
#     "split_by": "word",
#     "split_length": 50,
#     "split_overlap": 10
# }

text_splitter =TextSplitter (
    split_by= "word",
    chunk_size=50,
    chunk_overlap=10

)
# splitted_docs = text_splitter.call(documents=[doc])

# splitter = DocumentSplitter(**splitter_config)
local_embedder = Embedder(model_client=TransformersClient(), model_kwargs=model_kwargs)
# embedder = Embedder(model_client =ModelClientType.GROQ(), model_kwargs=model_kwargs)
embedder_transformer = ToEmbeddings(local_embedder, batch_size=2)
data_transformer = Sequential(text_splitter, embedder_transformer)
print(data_transformer)

Sequential(
  (0): TextSplitter(split_by=word, chunk_size=50, chunk_overlap=10)
  (1): ToEmbeddings(
    batch_size=2
    (embedder): Embedder(
      model_kwargs={'model': 'thenlper/gte-base', 'dimensions': 256, 'encoding_format': 'float'}, 
      (model_client): TransformersClient()
    )
    (batch_embedder): BatchEmbedder(
      (embedder): Embedder(
        model_kwargs={'model': 'thenlper/gte-base', 'dimensions': 256, 'encoding_format': 'float'}, 
        (model_client): TransformersClient()
      )
    )
  )
)


In [159]:
print(dialog_turns)

[DialogTurn(id='7ece1d3a-61c5-4715-83cd-f5cc1ba931bd', user_id=None, conversation_id=None, order=None, user_query=UserQuery(query_str='What are the benefits of renewable energy?', metadata=None), assistant_response=AssistantResponse(response_str='I can see you are interested in renewable energy. Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute significantly to the economy by creating jobs in the manufacturing and installation sectors. The growth in renewable energy usage boosts local economies through increased investment in technology and infrastructure.', metadata=None), user_query_timestamp='2021-09-01T12:00:00Z', assistant_response_timestamp='2021-09-01T12:00:01Z', metadata=None, vector=None), DialogTurn(id='8717b76b-4124-4d36-af0c-8722eb688f29', user_id=None, conversation_id=None, order=None, user_query=UserQuery(query_str='How do solar panels impact the environment?', metadata=None), assistant_response=AssistantResponse(response

In [160]:

dialog_turns_as_documents = [map_dialogturn_to_document(turn) for turn in dialog_turns]
print(dialog_turns_as_documents)

# # apply data transformation to the documents
# output = data_transformer(dialog_turns_as_documents)
# print(output)

[Document(id=7ece1d3a-61c5-4715-83cd-f5cc1ba931bd, text='What are the benefits of renewable energy? I can see you are interested in renewable energy. Renewab...', meta_data=None, vector=[], parent_doc_id=None, order=None, score=None), Document(id=8717b76b-4124-4d36-af0c-8722eb688f29, text='How do solar panels impact the environment? Solar panels convert sunlight into electricity by allowi...', meta_data=None, vector=[], parent_doc_id=None, order=None, score=None)]


In [161]:

output = data_transformer(dialog_turns_as_documents)
print(output)

Splitting Documents in Batches: 100%|██████████| 1/1 [00:00<00:00, 199.51it/s]
Batch embedding documents: 100%|██████████| 2/2 [00:02<00:00,  1.16s/it]
Adding embeddings to documents from batch: 2it [00:00, ?it/s]

[Document(id=850176d3-718b-4403-bfcb-dbc082d28b5b, text='What are the benefits of renewable energy? I can see you are interested in renewable energy. Renewab...', meta_data=None, vector='len: 768', parent_doc_id=7ece1d3a-61c5-4715-83cd-f5cc1ba931bd, order=0, score=None), Document(id=e39f0529-ad40-4a0b-b385-ad52427350b1, text='and installation sectors. The growth in renewable energy usage boosts local economies through increa...', meta_data=None, vector='len: 768', parent_doc_id=7ece1d3a-61c5-4715-83cd-f5cc1ba931bd, order=1, score=None), Document(id=9a5e9029-10ec-4f61-b68b-109d6b62dc6a, text='How do solar panels impact the environment? Solar panels convert sunlight into electricity by allowi...', meta_data=None, vector='len: 768', parent_doc_id=8717b76b-4124-4d36-af0c-8722eb688f29, order=0, score=None), Document(id=d4fadaa4-14ab-4753-8d70-058b2dc39126, text='has been found to have a significant positive effect on the environment by reducing the reliance on ...', meta_data=None, vector='




In [164]:
for o in output:
    print(o)

Document(id=850176d3-718b-4403-bfcb-dbc082d28b5b, text='What are the benefits of renewable energy? I can see you are interested in renewable energy. Renewab...', meta_data=None, vector='len: 768', parent_doc_id=7ece1d3a-61c5-4715-83cd-f5cc1ba931bd, order=0, score=None)
Document(id=e39f0529-ad40-4a0b-b385-ad52427350b1, text='and installation sectors. The growth in renewable energy usage boosts local economies through increa...', meta_data=None, vector='len: 768', parent_doc_id=7ece1d3a-61c5-4715-83cd-f5cc1ba931bd, order=1, score=None)
Document(id=9a5e9029-10ec-4f61-b68b-109d6b62dc6a, text='How do solar panels impact the environment? Solar panels convert sunlight into electricity by allowi...', meta_data=None, vector='len: 768', parent_doc_id=8717b76b-4124-4d36-af0c-8722eb688f29, order=0, score=None)
Document(id=d4fadaa4-14ab-4753-8d70-058b2dc39126, text='has been found to have a significant positive effect on the environment by reducing the reliance on ...', meta_data=None, vector='len:

Funtion call in action

In [165]:
from dataclasses import dataclass
import numpy as np
import time
import asyncio


def multiply(a: int, b: int) -> int:
    """Multiply two numbers."""
    time.sleep(1)
    return a * b


def add(a: int, b: int) -> int:
    """Add two numbers."""
    time.sleep(1)
    return a + b


async def divide(a: float, b: float) -> float:
    """Divide two numbers."""
    await asyncio.sleep(1)
    return float(a) / b


async def search(query: str) -> List[str]:
    """Search for query and return a list of results."""
    await asyncio.sleep(1)
    return ["result1" + query, "result2" + query]


def numpy_sum(arr: np.ndarray) -> float:
    """Sum the elements of an array."""
    return np.sum(arr)


x = 2

@dataclass
class Point:
    x: int
    y: int


def add_points(p1: Point, p2: Point) -> Point:
    return Point(p1.x + p2.x, p1.y + p2.y)

In [166]:
from lightrag.core.func_tool import FunctionTool

functions =[multiply, add, divide, search, numpy_sum, add_points]
tools = [
    FunctionTool(fn=fn) for fn in functions
]
for tool in tools:
    print(tool)

FunctionTool(fn: <function multiply at 0x000001D28418FD80>, async: False, definition: FunctionDefinition(func_name='multiply', func_desc='multiply(a: int, b: int) -> int\nMultiply two numbers.', func_parameters={'type': 'object', 'properties': {'a': {'type': 'int'}, 'b': {'type': 'int'}}, 'required': ['a', 'b']}))
FunctionTool(fn: <function add at 0x000001D28418FA60>, async: False, definition: FunctionDefinition(func_name='add', func_desc='add(a: int, b: int) -> int\nAdd two numbers.', func_parameters={'type': 'object', 'properties': {'a': {'type': 'int'}, 'b': {'type': 'int'}}, 'required': ['a', 'b']}))
FunctionTool(fn: <function divide at 0x000001D28418F7E0>, async: True, definition: FunctionDefinition(func_name='divide', func_desc='divide(a: float, b: float) -> float\nDivide two numbers.', func_parameters={'type': 'object', 'properties': {'a': {'type': 'float'}, 'b': {'type': 'float'}}, 'required': ['a', 'b']}))
FunctionTool(fn: <function search at 0x000001D28418FE20>, async: True, 

In [167]:
print(tools[-2].definition.to_dict())

{'func_name': 'numpy_sum', 'func_desc': 'numpy_sum(arr: numpy.ndarray) -> float\nSum the elements of an array.', 'func_parameters': {'type': 'object', 'properties': {'arr': {'type': 'ndarray'}}, 'required': ['arr']}}


In [168]:
context_map = {tool.definition.func_name: tool for tool in tools}

To execute a function, we can do

In [169]:
function_name = "add"
function_to_call = context_map[function_name]
function_args = {"a": 1, "b": 2}
function_response = function_to_call.call(**function_args)

TOOL MANANGER

In [170]:
from lightrag.core.tool_manager import ToolManager

tool_manager = ToolManager(tools=functions)
print(tool_manager)

ToolManager(Tools: [FunctionTool(fn: <function multiply at 0x000001D28418FD80>, async: False, definition: FunctionDefinition(func_name='multiply', func_desc='multiply(a: int, b: int) -> int\nMultiply two numbers.', func_parameters={'type': 'object', 'properties': {'a': {'type': 'int'}, 'b': {'type': 'int'}}, 'required': ['a', 'b']})), FunctionTool(fn: <function add at 0x000001D28418FA60>, async: False, definition: FunctionDefinition(func_name='add', func_desc='add(a: int, b: int) -> int\nAdd two numbers.', func_parameters={'type': 'object', 'properties': {'a': {'type': 'int'}, 'b': {'type': 'int'}}, 'required': ['a', 'b']})), FunctionTool(fn: <function divide at 0x000001D28418F7E0>, async: True, definition: FunctionDefinition(func_name='divide', func_desc='divide(a: float, b: float) -> float\nDivide two numbers.', func_parameters={'type': 'object', 'properties': {'a': {'type': 'float'}, 'b': {'type': 'float'}}, 'required': ['a', 'b']})), FunctionTool(fn: <function search at 0x000001D28

FUNCTION call end to end

In [171]:
template = r"""<SYS>You have these tools available:
{% if tools %}
<TOOLS>
{% for tool in tools %}
{{ loop.index }}.
{{tool}}
------------------------
{% endfor %}
</TOOLS>
{% endif %}
<OUTPUT_FORMAT>
{{output_format_str}}
</OUTPUT_FORMAT>
</SYS>
User: {{input_str}}
You:
"""

In [172]:
from lightrag.core.prompt_builder import Prompt

prompt = Prompt(template=template)
small_tool_manager = ToolManager(tools=tools[:2])

renered_prompt = prompt(tools=small_tool_manager.yaml_definitions)
print(renered_prompt)

<SYS>You have these tools available:
<TOOLS>
1.
func_name: multiply
func_desc: 'multiply(a: int, b: int) -> int

  Multiply two numbers.'
func_parameters:
  type: object
  properties:
    a:
      type: int
    b:
      type: int
  required:
  - a
  - b

------------------------
2.
func_name: add
func_desc: 'add(a: int, b: int) -> int

  Add two numbers.'
func_parameters:
  type: object
  properties:
    a:
      type: int
    b:
      type: int
  required:
  - a
  - b

------------------------
</TOOLS>
<OUTPUT_FORMAT>
None
</OUTPUT_FORMAT>
</SYS>
User: None
You:



Pass the output format 1:

In [173]:
from lightrag.core.types import Function

output_data_class = Function
output_format_str = output_data_class.to_json_signature(exclude=["thought", "args"])

renered_prompt= prompt(output_format_str=output_format_str)
print(renered_prompt)

<SYS>You have these tools available:
<OUTPUT_FORMAT>
{
    "name": "The name of the function (str) (optional)",
    "kwargs": "The keyword arguments of the function (Optional[Dict[str, object]]) (optional)"
}
</OUTPUT_FORMAT>
</SYS>
User: None
You:



Pass the output format 2:

In [174]:
from lightrag.core.types import FunctionExpression

output_data_class = FunctionExpression
output_format_str = output_data_class.to_json_signature(exclude=["thought"])
print(prompt(output_format_str=output_format_str))

<SYS>You have these tools available:
<OUTPUT_FORMAT>
{
    "action": "FuncName(<kwargs>) Valid function call expression. Example: \"FuncName(a=1, b=2)\" Follow the data type specified in the function parameters.e.g. for Type object with x,y properties, use \"ObjectType(x=1, y=2) (str) (required)"
}
</OUTPUT_FORMAT>
</SYS>
User: None
You:



In [180]:
from lightrag.components.output_parsers import JsonOutputParser

func_parser = JsonOutputParser(data_class=Function,exclude_fields=["thought", "args"])

instructions = func_parser.format_instructions()
print(instructions)

Your output should be formatted as a standard JSON instance with the following schema:
```
{
    "name": "The name of the function (str) (optional)",
    "kwargs": "The keyword arguments of the function (Optional[Dict[str, object]]) (optional)"
}
```
-Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!
-Use double quotes for the keys and string values.
-DO NOT mistaken the "properties" and "type" in the schema as the actual fields in the JSON output.
-Follow the JSON formatting conventions.


In [182]:
from lightrag.core.generator import Generator
from lightrag.core.types import ModelClientType

model_kwargs = {"model": "llama3-8b-8192"}
prompt_kwargs = {
    "tools": tool_manager.yaml_definitions,
    "output_format_str": func_parser.format_instructions(),
}
generator = Generator(
    model_client=ModelClientType.GROQ(),
    model_kwargs=model_kwargs,
    template=template,
    prompt_kwargs=prompt_kwargs,
    output_processors=func_parser,
)

RUN QUERIES

In [183]:
queries = [
    "add 2 and 3",
    "search for something",
    "add points (1, 2) and (3, 4)",
    "sum numpy array with arr = np.array([[1, 2], [3, 4]])",
    "multiply 2 with local variable x",
    "divide 2 by 3",
    "Add 5 to variable y",
]

for idx, query in enumerate(queries):
    prompt_kwargs = {"input_str": query}
    print(f"\n{idx} Query: {query}")
    print(f"{'-'*50}")
    try:
        result = generator(prompt_kwargs=prompt_kwargs)
        # print(f"LLM raw output: {result.raw_response}")
        func = Function.from_dict(result.data)
        print(f"Function: {func}")
        func_output = tool_manager.execute_func(func)
        print(f"Function output: {func_output}")
    except Exception as e:
        print(
            f"Failed to execute the function for query: {query}, func: {result.data}, error: {e}"
        )


0 Query: add 2 and 3
--------------------------------------------------
Function: Function(thought=None, name='add', args=[], kwargs={'a': 2, 'b': 3})
Function output: FunctionOutput(name='add', input=Function(thought=None, name='add', args=(), kwargs={'a': 2, 'b': 3}), parsed_input=None, output=5, error=None)

1 Query: search for something
--------------------------------------------------
Function: Function(thought=None, name='search', args=[], kwargs={'query': 'python'})
Function output: FunctionOutput(name='search', input=Function(thought=None, name='search', args=(), kwargs={'query': 'python'}), parsed_input=None, output=['result1python', 'result2python'], error=None)

2 Query: add points (1, 2) and (3, 4)
--------------------------------------------------


Error at calling <function add_points at 0x000001D28418E700>: 'dict' object has no attribute 'x'


Function: Function(thought=None, name='add_points', args=[], kwargs={'p1': {'x': 1, 'y': 2}, 'p2': {'x': 3, 'y': 4}})
Function output: FunctionOutput(name='add_points', input=Function(thought=None, name='add_points', args=(), kwargs={'p1': {'x': 1, 'y': 2}, 'p2': {'x': 3, 'y': 4}}), parsed_input=None, output=None, error="'dict' object has no attribute 'x'")

3 Query: sum numpy array with arr = np.array([[1, 2], [3, 4]])
--------------------------------------------------


Error at parsing JSON string: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow mapping
  in "<unicode string>", line 1, column 33:
    {"name": "numpy_sum", "kwargs": {"arr": np.array([[1, 2], [3, 4]])}}
                                    ^
expected ',' or '}', but got '['
  in "<unicode string>", line 1, column 50:
     ... um", "kwargs": {"arr": np.array([[1, 2], [3, 4]])}}
                                         ^. Got JSON string: {"name": "numpy_sum", "kwargs": {"arr": np.array([[1, 2], [3, 4]])}}
Error in parsing JSON to JSON: Error: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow mapping
  in "<unicode string>", line 1, column 33:
    {"name": "numpy_sum", "kwargs": {"arr": np.array([[1, 2], [3, 4]])}}
                                    ^
expected ',' or '}', but got '['
  in "<unicode string>", line 1, column 50:
     ... um", "kwargs": {"arr": np.array([[1, 2], [3, 4]])}}
                                         ^. Got JSON s

Failed to execute the function for query: sum numpy array with arr = np.array([[1, 2], [3, 4]]), func: None, error: 'NoneType' object has no attribute 'items'

4 Query: multiply 2 with local variable x
--------------------------------------------------


Error at parsing JSON string: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow node
expected the node content, but found '?'
  in "<unicode string>", line 3, column 27:
      "kwargs": {"a": 2, "b": ?}
                              ^. Got JSON string: {
  "name": "multiply",
  "kwargs": {"a": 2, "b": ?}
}
Error in parsing JSON to JSON: Error: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow node
expected the node content, but found '?'
  in "<unicode string>", line 3, column 27:
      "kwargs": {"a": 2, "b": ?}
                              ^. Got JSON string: {
  "name": "multiply",
  "kwargs": {"a": 2, "b": ?}
}
Error processing the output processors: Error: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow node
expected the node content, but found '?'
  in "<unicode string>", line 3, column 27:
      "kwargs": {"a": 2, "b": ?}
                              ^. Got JSON string: {
  "name": "multiply",
  "kwargs": {

Failed to execute the function for query: multiply 2 with local variable x, func: None, error: 'NoneType' object has no attribute 'items'

5 Query: divide 2 by 3
--------------------------------------------------
Function: Function(thought=None, name='divide', args=[], kwargs={'a': 2, 'b': 3.0})
Function output: FunctionOutput(name='divide', input=Function(thought=None, name='divide', args=(), kwargs={'a': 2, 'b': 3.0}), parsed_input=None, output=0.6666666666666666, error=None)

6 Query: Add 5 to variable y
--------------------------------------------------


Error in parsing JSON to JSON: Error: No JSON object or array found in the text: To add 5 to a variable y, I will use the `add` function. Please note that I need to know the type of y beforehand. If y is an integer, the type should be `int`. If y is a float, the type should be `float`. Let me know the type of y.
Error processing the output processors: Error: No JSON object or array found in the text: To add 5 to a variable y, I will use the `add` function. Please note that I need to know the type of y beforehand. If y is an integer, the type should be `int`. If y is a float, the type should be `float`. Let me know the type of y.


Failed to execute the function for query: Add 5 to variable y, func: None, error: 'NoneType' object has no attribute 'items'


As LLM has problem calling add_point, we will add one example and we will generate it with core.types.FunctionExpression.from_function(). We will update our outputparser to use the example:

In [184]:
example = FunctionExpression.from_function(
        func=add_points, p1=Point(x=1, y=2), p2=Point(x=3, y=4)
)
func_parser = JsonOutputParser(
        data_class=FunctionExpression, examples=[example]
)

In [185]:
print(example)

FunctionExpression(thought=None, action='add_points(p1=Point(x=1, y=2), p2=Point(x=3, y=4))')


In [189]:
for idx in range(0, len(queries), 2):
    query = " and ".join(queries[idx : idx + 2])
    
    prompt_kwargs = {"input_str": query}
    print(f"\n{idx} Query: {query}")
    print(f"{'-'*50}")
    try:
        result = generator(prompt_kwargs=prompt_kwargs)
        # print(f"LLM raw output: {result.raw_response}")
        func_expr: List[FunctionExpression] = [
            FunctionExpression.from_dict(item) for item in result.data
        ]
        print(f"Function_expr: {func_expr}")
        for expr in func_expr:
            func_output = tool_manager.execute_func_expr_via_sandbox(expr)
            print(f"Function output: {func_output}")
    except Exception as e:
        print(
            f"Failed to execute the function for query: {query}, func: {result.data}, error: {e}"
        )


0 Query: add 2 and 3 and search for something
--------------------------------------------------
Failed to execute the function for query: add 2 and 3 and search for something, func: {'name': 'add', 'kwargs': {'a': 2, 'b': 3}}, error: 'str' object has no attribute 'items'

2 Query: add points (1, 2) and (3, 4) and sum numpy array with arr = np.array([[1, 2], [3, 4]])
--------------------------------------------------


Error at parsing JSON string: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow mapping
  in "<unicode string>", line 3, column 15:
        "kwargs": {
                  ^
expected ',' or '}', but got '['
  in "<unicode string>", line 6, column 25:
            "arr": np.array([[1, 2], [3, 4]])
                            ^. Got JSON string: {
    "name": "Add points and sum numpy array",
    "kwargs": {
        "p1": {"x": 1, "y": 2},
        "p2": {"x": 3, "y": 4},
        "arr": np.array([[1, 2], [3, 4]])
    }
}
Error in parsing JSON to JSON: Error: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow mapping
  in "<unicode string>", line 3, column 15:
        "kwargs": {
                  ^
expected ',' or '}', but got '['
  in "<unicode string>", line 6, column 25:
            "arr": np.array([[1, 2], [3, 4]])
                            ^. Got JSON string: {
    "name": "Add points and sum numpy array",
    "kwargs": {
        "p1": {"x"

Failed to execute the function for query: add points (1, 2) and (3, 4) and sum numpy array with arr = np.array([[1, 2], [3, 4]]), func: None, error: 'NoneType' object is not iterable

4 Query: multiply 2 with local variable x and divide 2 by 3
--------------------------------------------------
Failed to execute the function for query: multiply 2 with local variable x and divide 2 by 3, func: {'name': 'multiply', 'kwargs': {'a': 2, 'b': 'x'}}, error: 'str' object has no attribute 'items'

6 Query: Add 5 to variable y
--------------------------------------------------
Failed to execute the function for query: Add 5 to variable y, func: {'name': 'Add', 'kwargs': {'a': 5, 'b': 'y'}}, error: 'str' object has no attribute 'items'


Agent in action

In [8]:
from lightrag.components.agent import ReActAgent
from lightrag.core import Generator, ModelClientType, ModelClient

from dotenv import load_dotenv
import os
os.environ["GROQ_API_KEY"] = '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'
# Load environment variables from .env file
load_dotenv()
# Access the environment variable
api_key = os.getenv("GROQ_API_KEY")



# Define tools
def multiply(a: int, b: int) -> int:
   """
   Multiply two numbers.
   """
   return a * b

def add(a: int, b: int) -> int:
   """
   Add two numbers.
   """
   return a + b

def divide(a: float, b: float) -> float:
   """
   Divide two numbers.
   """
   return float(a) / b

llama3_model_kwargs = {
   "model": "llama3-70b-8192",  # llama3 70b works better than 8b here.
   "temperature": 0.0,
}
# gpt_model_kwargs = {
#    "model": "gpt-3.5-turbo",
#    "temperature": 0.0,
# }


def test_react_agent(model_client: ModelClient, model_kwargs: dict):
   tools = [multiply, add, divide]
   queries = [
      "What is the capital of France? and what is 465 times 321 then add 95297 and then divide by 13.2?",
      "Give me 5 words rhyming with cool, and make a 4-sentence poem using them",
   ]
   # define a generator without tools for comparison

   generator = Generator(
      model_client=model_client,
      model_kwargs=model_kwargs,
   )

   react = ReActAgent(
      max_steps=6,
      add_llm_as_fallback=True,
      tools=tools,
      model_client=model_client,
      model_kwargs=model_kwargs,
   )
   print("react is as ",react)
   # print(react)

   for query in queries:
      print("hello")
      print(f"Query: {query}")
      agent_response = react.call(query)
      llm_response = generator.call(prompt_kwargs={"input_str": query})
      print(f"Agent response: {agent_response}")
      print(f"LLM response: {llm_response}")
      print("")


In [9]:
test_react_agent(ModelClientType.GROQ(), llama3_model_kwargs)
# test_react_agent(ModelClientType.OPENAI(), gpt_model_kwargs)

react is as  ReActAgent(
  max_steps=6, add_llm_as_fallback=True, 
  (tool_manager): ToolManager(Tools: [FunctionTool(fn: <function multiply at 0x0000029A71FAB880>, async: False, definition: FunctionDefinition(func_name='multiply', func_desc='multiply(a: int, b: int) -> int\n\n   Multiply two numbers.\n   ', func_parameters={'type': 'object', 'properties': {'a': {'type': 'int'}, 'b': {'type': 'int'}}, 'required': ['a', 'b']})), FunctionTool(fn: <function add at 0x0000029A71FAB740>, async: False, definition: FunctionDefinition(func_name='add', func_desc='add(a: int, b: int) -> int\n\n   Add two numbers.\n   ', func_parameters={'type': 'object', 'properties': {'a': {'type': 'int'}, 'b': {'type': 'int'}}, 'required': ['a', 'b']})), FunctionTool(fn: <function divide at 0x0000029A6FC85300>, async: False, definition: FunctionDefinition(func_name='divide', func_desc='divide(a: float, b: float) -> float\n\n   Divide two numbers.\n   ', func_parameters={'type': 'object', 'properties': {'a': {'t