In [1]:
import logging
from typing import Any, Set

from automata.config.agent_config_builder import AutomataAgentConfigBuilder
from automata.config.config_types import AgentConfigName
from automata.core.agent.agent import AutomataAgent
from automata.core.agent.tools.tool_utils import (
    AgentToolFactory,
    DependencyFactory,
    build_llm_toolkits,
)
from automata.core.base.tool import ToolkitType


In [2]:
# Toolkits to give the agent, identified by their ToolkitType value.
tool_list = ["context_oracle"]

# Collect the distinct dependency names needed by the requested toolkits.
# A comprehension keeps the loop variables (in particular `_`, which IPython
# uses for the last cell output) out of the notebook's global namespace.
dependencies: Set[Any] = {
    dependency_name
    for tool in tool_list
    for dependency_name, _ in AgentToolFactory.TOOLKIT_TYPE_TO_ARGS[ToolkitType(tool)]
}

# Build every required dependency and collect the results by name so they can
# be passed on as keyword arguments.
kwargs = {}
print("  - Building dependencies...")
for dependency in dependencies:
    print(f"Building {dependency}...")
    kwargs[dependency] = DependencyFactory().get(dependency)


  - Building dependencies...
Building symbol_search...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 699/699 [00:00<00:00, 3149.05it/s]


Building symbol_doc_similarity...


In [3]:
# Build the LLM toolkits for the requested tools, passing the built dependencies as keyword arguments.
llm_toolkits = build_llm_toolkits(tool_list, **kwargs)

In [4]:
# NOTE(review): ToolkitType is already imported in the first cell, and Tool / Toolkit
# are not referenced in the visible cells — consider moving this import to the top
# cell or trimming it once confirmed unused.
from automata.core.base.tool import Tool, Toolkit, ToolkitType
# Display the first tool of the context-oracle toolkit (bare last expression -> rich repr).
llm_toolkits[ToolkitType.CONTEXT_ORACLE].tools[0]

Tool(name='context-oracle', description="\nThis tool combines SymbolSearch and SymbolSimilarity to create contexts. Given a query, it uses SymbolSimilarity calculate the similarity between each symbol's documentation and the query returns the most similar document. Then, it leverages SymbolSearch to combine Semantic Search with PageRank to find the most relevant symbols to the query. The overview documentation of these symbols is then concated to the result of the SymbolSimilarity query to create a context.\n\nFor instance, if a query reads 'Tell me about SymbolRank', it will find the most similar document to this query from the embeddings, which in this case would be the documentation for the SymbolRank class. Then, it will use SymbolSearch to fetch some of the most relevant symbols which would be 'Symbol', 'SymbolSearch', 'SymbolGraph', etc. This results in a comprehensive context for the query.\n", return_direct=True, verbose=False, func=<function ContextOracleTool.build.<locals>.<l

In [5]:
# Name of the agent configuration to load.
config_name = AgentConfigName("automata_retriever")

# Model to use; `kwargs` was filled with tool dependencies above, so unless it
# also carries a "model" entry this resolves to the "gpt-4" default.
model_name = kwargs.get("model", "gpt-4")

# Assemble the configuration step by step instead of one chained expression.
config_builder = AutomataAgentConfigBuilder.from_name(config_name)
config_builder = config_builder.with_llm_toolkits(llm_toolkits)
config_builder = config_builder.with_model(model_name)
agent_config = config_builder.build()

# Show the resulting system instruction.
print(agent_config.system_instruction)

You are Automata Retriever, an autonomous software retrieval system built by OpenAI and designed to run within a local Python repository. You receive instructions from a user in simple English and carry out the instructions with the tools you are provided. You may use the following tools:

Tools:

context-oracle:

This tool combines SymbolSearch and SymbolSimilarity to create contexts. Given a query, it uses SymbolSimilarity calculate the similarity between each symbol's documentation and the query returns the most similar document. Then, it leverages SymbolSearch to combine Semantic Search with PageRank to find the most relevant symbols to the query. The overview documentation of these symbols is then concated to the result of the SymbolSimilarity query to create a context.

For instance, if a query reads 'Tell me about SymbolRank', it will find the most similar document to this query from the embeddings, which in this case would be the documentation for the SymbolRank class. Then, it

In [6]:
# Instruction given to the retriever agent for this session.
instructions = "Explain how embeddings are used by the codebase"
# Create the agent with the configuration built above, then set it up before running.
agent = AutomataAgent(instructions, config=agent_config)
agent.setup()
# Keep the returned value; it is printed in the next cell and `agent` is reused for follow-ups.
result = agent.run()


>>> automata_retriever Agent:
- thoughts
  - I will start by retrieving the relevant context from the oracle
- actions
  - tool_query_1
    - tool_name
      - context-oracle
    - tool_args
      - Explain how embeddings are used by the codebase.


Failed to get embedding for symbol Symbol(scip-python python automata 9db05b7e7ebd49f93703df45accd7e5f9d5cedb0 `automata.cli.commands`/run_code_embedding()., scip-python, Package(python automata 9db05b7e7ebd49f93703df45accd7e5f9d5cedb0), (Descriptor(automata.cli.commands, 1), Descriptor(run_code_embedding, 4))) with error: 'Symbol Symbol(scip-python python automata 9db05b7e7ebd49f93703df45accd7e5f9d5cedb0 `automata.cli.commands`/run_code_embedding()., scip-python, Package(python automata 9db05b7e7ebd49f93703df45accd7e5f9d5cedb0), (Descriptor(automata.cli.commands, 1), Descriptor(run_code_embedding, 4))) not in database'
Failed to get embedding for symbol Symbol(scip-python python automata 9db05b7e7ebd49f93703df45accd7e5f9d5cedb0 `automata.tests.unit.test_symbol_embedding`/test_get_embedding()., scip-python, Package(python automata 9db05b7e7ebd49f93703df45accd7e5f9d5cedb0), (Descriptor(automata.tests.unit.test_symbol_embedding, 1), Descriptor(test_get_embedding, 4))) with error: 'Symbol


>>> automata_retriever Agent:
- thoughts
  - I can now return the requested information.
- actions
  - return_result_0
    - Embeddings are used in the codebase for various purposes, such as creating, updating, and retrieving embeddings for symbols in source code and documentation. The `SymbolCodeEmbedding` and `SymbolDocEmbedding` classes represent embeddings for code and documentation, respectively. The `SymbolCodeEmbeddingHandler` and `SymbolDocEmbeddingHandler` classes manage these embeddings, leveraging `EmbeddingProvider` implementations, like `OpenAIEmbedding`, to generate them. Embeddings are stored in a vector database, such as `JSONVectorDatabase`.

      Here are some examples of creating instances for different classes:

      ```python
      # SymbolCodeEmbedding
      import numpy as np
      from automata.core.symbol.symbol_types import SymbolCodeEmbedding
      from automata.core.symbol.parser import parse_symbol

      symbol_str = "scip-python python automata 7548269

In [7]:
# Print the agent's result under a "Result:" header line.
print(f"Result:\n{result}")

Result:
Embeddings are used in the codebase for various purposes, such as creating, updating, and retrieving embeddings for symbols in source code and documentation. The `SymbolCodeEmbedding` and `SymbolDocEmbedding` classes represent embeddings for code and documentation, respectively. The `SymbolCodeEmbeddingHandler` and `SymbolDocEmbeddingHandler` classes manage these embeddings, leveraging `EmbeddingProvider` implementations, like `OpenAIEmbedding`, to generate them. Embeddings are stored in a vector database, such as `JSONVectorDatabase`.
      Here are some examples of creating instances for different classes:

      ```python
      # SymbolCodeEmbedding
      import numpy as np
      from automata.core.symbol.symbol_types import SymbolCodeEmbedding
      from automata.core.symbol.parser import parse_symbol

      symbol_str = "scip-python python automata 75482692a6fe30c72db516201a6f47d9fb4af065 `automata.core.agent.agent_enums`/ActionIndicator#"
      symbol = parse_symbol(symbo

In [14]:
## Testing the resulting snippets (the JSONVectorDatabase one fails with the ValueError shown in the output) -

# SymbolCodeEmbedding: build a code embedding for a parsed symbol
import numpy as np
from automata.core.symbol.symbol_types import SymbolCodeEmbedding
from automata.core.symbol.parser import parse_symbol

symbol_str = "scip-python python automata 75482692a6fe30c72db516201a6f47d9fb4af065 `automata.core.agent.agent_enums`/ActionIndicator#"
symbol = parse_symbol(symbol_str)
source_code = "symbol_source"
vector = np.array([1, 0, 0, 0])

embedding = SymbolCodeEmbedding(symbol=symbol, source_code=source_code, vector=vector)



# OpenAIEmbedding: build an embedding for a small source string and check its type
from automata.core.embedding.embedding_types import OpenAIEmbedding
import numpy as np  # repeated import carried over from the pasted snippet; numpy is already imported above

openai_embedding = OpenAIEmbedding()
symbol_source = "def square(x): return x * x"
embedding = openai_embedding.build_embedding(symbol_source)
assert isinstance(embedding, np.ndarray)


# JSONVectorDatabase: exercise add / contains / get / update / save / load / discard / clear
from automata.core.database.vector import JSONVectorDatabase
from automata.core.symbol.symbol_types import Symbol, SymbolEmbedding

file_path = "my_database.json"
db = JSONVectorDatabase(file_path)

symbol = Symbol.from_string("example.symbol#")  # raises ValueError: Invalid symbol_str (see recorded output)
embedding = SymbolEmbedding(symbol, "embedding_source", [1, 2, 3])
db.add(embedding)

assert db.contains(symbol)

retrieved_embedding = db.get(symbol)
assert retrieved_embedding == embedding

new_embedding = SymbolEmbedding(symbol, "new_embedding_source", [4, 5, 6])
db.update(new_embedding)

db.save()
db.load()

db.discard(symbol)
db.clear()


ValueError: Invalid symbol_str: example.symbol#

In [20]:
import textwrap

# Traceback copied from the failing JSONVectorDatabase snippet (Cell In[14]).
# NOTE(review): several lines of this literal start at column 0, so textwrap.dedent
# has no common indentation to strip here — confirm whether the call is needed.
input2 = textwrap.dedent("""
ValueError                                
Traceback (most recent call last)
Cell In[14], line 34
     31 file_path = "my_database.json"
     32 db = JSONVectorDatabase(file_path)
---> 34 symbol = Symbol.from_string("example.symbol#")
     35 embedding = SymbolEmbedding(symbol, "embedding_source", [1, 2, 3])
     36 db.add(embedding)

File ~/_Automata_/automata/core/symbol/symbol_types.py:257, in Symbol.from_string(cls, symbol_str)
    255 match = re.search(r"Symbol\((.*?), (.*?), Package\((.*?)\), \((.*?)\)\)", symbol_str)
    256 if not match:
--> 257     raise ValueError(f"Invalid symbol_str: {symbol_str}")
    258 uri, _, __, ___ = match.groups()
    259 # In current implementation, only the uri is used in re-construcing the symbol

ValueError: Invalid symbol_str: example.symbol#
""")
# Send follow-up instructions to the same agent, embedding the traceback above.
# NOTE(review): the leading indentation inside this f-string is part of the text
# that is sent; it is not dedented.
result2 = agent.run_further_with_new_instructions(f'''
                                                  The first two of the previous code snippets were correct. 
                                                  However, the third generated the following error - {input2}
                                                  How do we correct this?
                                                  ''')


>>> automata_retriever Agent:
Apologies for the confusion. It appears that I made an error in the example code for creating a Symbol instance. Let's fix the example by using the `parse_symbol` function instead of the `Symbol.from_string` method.

Here's the corrected example code for using the `JSONVectorDatabase`:

```python
from automata.core.database.vector import JSONVectorDatabase
from automata.core.symbol.symbol_types import Symbol, SymbolEmbedding
from automata.core.symbol.parser import parse_symbol

file_path = "my_database.json"
db = JSONVectorDatabase(file_path)

symbol_str = "scip-python python automata 75482692a6fe30c72db516201a6f47d9fb4af065 `automata.core.agent.agent_enums`/ActionIndicator#"
symbol = parse_symbol(symbol_str)
embedding = SymbolEmbedding(symbol, "embedding_source", [1, 2, 3])
db.add(embedding)

assert db.contains(symbol)

retrieved_embedding = db.get(symbol)
assert retrieved_embedding == embedding

new_embedding = SymbolEmbedding(symbol, "new_embedding_sou

In [21]:
### Testing the updated code snippet -
# The agent's revised JSONVectorDatabase example, using parse_symbol instead of
# Symbol.from_string. It gets past symbol creation but fails at db.update (see output).

from automata.core.database.vector import JSONVectorDatabase
from automata.core.symbol.symbol_types import Symbol, SymbolEmbedding
from automata.core.symbol.parser import parse_symbol

file_path = "my_database.json"
db = JSONVectorDatabase(file_path)

symbol_str = "scip-python python automata 75482692a6fe30c72db516201a6f47d9fb4af065 `automata.core.agent.agent_enums`/ActionIndicator#"
symbol = parse_symbol(symbol_str)
embedding = SymbolEmbedding(symbol, "embedding_source", [1, 2, 3])
db.add(embedding)

assert db.contains(symbol)

retrieved_embedding = db.get(symbol)
assert retrieved_embedding == embedding

new_embedding = SymbolEmbedding(symbol, "new_embedding_source", [4, 5, 6])
# Raises KeyError "... not in database" in the recorded run, even though the
# db.contains(symbol) assertion above had to pass for execution to get here.
db.update(new_embedding)

db.save()
db.load()

db.discard(symbol)
db.clear()

KeyError: 'Symbol Symbol(scip-python python automata 75482692a6fe30c72db516201a6f47d9fb4af065 `automata.core.agent.agent_enums`/ActionIndicator#, scip-python, Package(python automata 75482692a6fe30c72db516201a6f47d9fb4af065), (Descriptor(automata.core.agent.agent_enums, 1), Descriptor(ActionIndicator, 2))) not in database'

In [23]:
import textwrap

# Traceback copied from the failing db.update call (Cell In[21]).
# NOTE(review): the quoted library code tests `embedding.symbol not in self.index`
# but then indexes with `embedding.symbol.dotpath` — this looks like a key mismatch
# inside JSONVectorDatabase.update rather than a problem in the snippet; confirm.
input3 = textwrap.dedent("""
Cell In[21], line 19
     16 assert retrieved_embedding == embedding
     18 new_embedding = SymbolEmbedding(symbol, "new_embedding_source", [4, 5, 6])
---> 19 db.update(new_embedding)
     21 db.save()
     22 db.load()

File ~/_Automata_/automata/core/database/vector.py:86, in JSONVectorDatabase.update(self, embedding)
     77 Updates an embedding in the database
     78 
   (...)
     83     KeyError: If the symbol is not in the database
     85 if embedding.symbol not in self.index:
---> 86     raise KeyError(f"Symbol {embedding.symbol} not in database")
     87 self.data[self.index[embedding.symbol.dotpath]] = embedding

KeyError: 'Symbol Symbol(scip-python python automata 75482692a6fe30c72db516201a6f47d9fb4af065 `automata.core.agent.agent_enums`/ActionIndicator#, scip-python, Package(python automata 75482692a6fe30c72db516201a6f47d9fb4af065), (Descriptor(automata.core.agent.agent_enums, 1), Descriptor(ActionIndicator, 2))) not in database'

""")
# Send a second round of follow-up instructions to the same agent.
# NOTE: the recorded run of this cell ends with an InvalidRequestError (messages
# of 8210 tokens against a maximum context length of 8192 tokens).
result3 = agent.run_further_with_new_instructions(f'''
                                                  The previous change fixed that error, however another was revealed. 
                                                  We have the following error - {input3}
                                                  How do we correct this?
                                                  ''')


>>> automata_retriever Agent:
My apologies for the confusion. It seems like the database was not updated correctly during the previous steps. Let's fix the example by first checking if the symbol is in the database before updating it.

Here's the corrected example code for using the `JSONVectorDatabase`:

```python
from automata.core.database.vector import JSONVectorDatabase
from automata.core.symbol.symbol_types import Symbol, SymbolEmbedding
from automata.core.symbol.parser import parse_symbol

file_path = "my_database.json"
db = JSONVectorDatabase(file_path)

symbol_str = "scip-python python automata 75482692a6fe30c72db516201a6f47d9fb4af065 `automata.core.agent.agent_enums`/ActionIndicator#"
symbol = parse_symbol(symbol_str)
embedding = SymbolEmbedding(symbol, "embedding_source", [1, 2, 3])
db.add(embedding)

assert db.contains(symbol)

retrieved_embedding = db.get(symbol)
assert retrieved_embedding == embedding

if db.contains(symbol):
    new_embedding = SymbolEmbedding(symbol, "

InvalidRequestError: This model's maximum context length is 8192 tokens. However, your messages resulted in 8210 tokens. Please reduce the length of the messages.

In [25]:
### Testing the last code snippet -
# The agent's third version of the JSONVectorDatabase example: identical to the
# previous one except that db.update is guarded by db.contains(symbol).

from automata.core.database.vector import JSONVectorDatabase
from automata.core.symbol.symbol_types import Symbol, SymbolEmbedding
from automata.core.symbol.parser import parse_symbol

file_path = "my_database.json"
db = JSONVectorDatabase(file_path)

symbol_str = "scip-python python automata 75482692a6fe30c72db516201a6f47d9fb4af065 `automata.core.agent.agent_enums`/ActionIndicator#"
symbol = parse_symbol(symbol_str)
embedding = SymbolEmbedding(symbol, "embedding_source", [1, 2, 3])
db.add(embedding)

assert db.contains(symbol)

retrieved_embedding = db.get(symbol)
assert retrieved_embedding == embedding

# NOTE(review): in the In[21] run db.contains(symbol) was already true when
# db.update raised KeyError, so this guard may not avoid that error — confirm
# by re-running; no output is recorded for this cell.
if db.contains(symbol):
    new_embedding = SymbolEmbedding(symbol, "new_embedding_source", [4, 5, 6])
    db.update(new_embedding)

db.save()
db.load()

db.discard(symbol)
db.clear()
