Skip to content
This repository has been archived by the owner on Mar 16, 2024. It is now read-only.

Commit

Permalink
refactor to simplify symbol similarity (#113)
Browse files Browse the repository at this point in the history
* refactor to simplify symbol similarity

* check in progress towards integration

* get tests running, add synchronizer

* Refactor SymbolSearch

* Correcting type errors

* refresh embeddings

* progress towards merge

* prep for merge

* prep for merge

* update demo symbol search

* Update nbs

* further tweaks

* Check progress towards final merge

* finalize commit
  • Loading branch information
emrgnt-cmplxty authored Jul 5, 2023
1 parent fcbc060 commit 0d4d4b2
Show file tree
Hide file tree
Showing 125 changed files with 2,200 additions and 3,338 deletions.
35 changes: 20 additions & 15 deletions .setup.sh.example
Original file line number Diff line number Diff line change
@@ -1,9 +1,3 @@
## NOTE - the code below is contained in setup.sh.example

# Clone the repository
git clone git@github.com:emrgnt-cmplxty/Automata.git
cd Automata

# Create the local environment
python3 -m venv local_env
source local_env/bin/activate
Expand All @@ -22,16 +16,27 @@ CONVERSATION_DB_PATH="$PWD/conversation_db.sqlite3"
TASK_DB_PATH="$PWD/task_db.sqlite3"
TASKS_OUTPUT_PATH="$PWD/tasks"
REPOSITORY_NAME="emrgnt-cmplxty/Automata"
sed -i "s|your_openai_api_key|$OPEN_API_KEY|" .env
sed -i "s|your_github_api_key|$GITHUB_API_KEY|" .env
sed -i "s|your_conversation_db_path|$CONVERSATION_DB_PATH|" .env
sed -i "s|your_task_db_path|$TASK_DB_PATH|" .env
sed -i "s|your_tasks_output_path|$TASKS_OUTPUT_PATH|" .env
sed -i "s|your_repository_name|$REPOSITORY_NAME|" .env

# Detect the operating system
if [[ "$OSTYPE" == "darwin"* ]]; then
# Mac OSX
sed -i '' "s|your_openai_api_key|$OPEN_API_KEY|" .env
sed -i '' "s|your_github_api_key|$GITHUB_API_KEY|" .env
sed -i '' "s|your_conversation_db_path|$CONVERSATION_DB_PATH|" .env
sed -i '' "s|your_task_db_path|$TASK_DB_PATH|" .env
sed -i '' "s|your_tasks_output_path|$TASKS_OUTPUT_PATH|" .env
sed -i '' "s|your_repository_name|$REPOSITORY_NAME|" .env
else
# Linux and others
sed -i "s|your_openai_api_key|$OPEN_API_KEY|" .env
sed -i "s|your_github_api_key|$GITHUB_API_KEY|" .env
sed -i "s|your_conversation_db_path|$CONVERSATION_DB_PATH|" .env
sed -i "s|your_task_db_path|$TASK_DB_PATH|" .env
sed -i "s|your_tasks_output_path|$TASKS_OUTPUT_PATH|" .env
sed -i "s|your_repository_name|$REPOSITORY_NAME|" .env
fi
# Additional Notes -
# Default Max Workers is 8; manually change the .env to update this quantity.
# For MAC users, the example should read as follows -
## sed -i '' "s|your_openai_api_key|$OPEN_API_KEY|" .env

# Fetch the submodules
git submodule update --init --recursive
Expand All @@ -45,4 +50,4 @@ git submodule update --init --recursive
###
### Then, initialize by running the following:
## git lfs install
## git lfs pull
## git lfs pull
22 changes: 11 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ https://github.com/emrgnt-cmplxty/Automata/assets/68796651/2e1ceb8c-ac93-432b-af
Follow these steps to set up the Automata environment

```bash
# Clone the repository
git clone git@github.com:emrgnt-cmplxty/Automata.git && cd Automata

# Copy the env and setup files
cp .setup.sh.example setup.sh && cp .env.example .env

Expand All @@ -43,7 +46,6 @@ chmod 755 setup.sh

# Update the setup and env files with your local paths
vim setup.sh
vim .env

# Run the setup script
./setup.sh
Expand All @@ -68,10 +70,10 @@ automata run-code-embedding

# "L1" docs are the docstrings written into the code
# "L2" docs are generated from the L1 docs + symbol context
automata run-doc-embedding-l2
automata run-doc-embedding --embedding-level 2

# "L3" docs are generated from the L2 docs + symbol context
automata run-doc-embedding-l3
automata run-doc-embedding --embedding-level 3
```

### Run the system
Expand All @@ -83,7 +85,7 @@ The following commands illustrate how to run the system with a trivial instructi
automata run-agent --instructions="Return true" --model=gpt-3.5-turbo-0613

# Run a single agent w/ a non-trivial instruction
automata run-agent --instructions="Explain what AutomataAgent is and how it works, include an example to initialize an instance of AutomataAgent." --model=gpt-3.5-turbo-16k
automata run-agent --instructions="Explain what AutomataAgent is and how it works, include an example to initialize an instance of AutomataAgent."
```

---
Expand All @@ -100,14 +102,12 @@ Sometimes the best way to understand a complicated system is to start by underst

```python

import logging
from automata.config.openai_agent import AutomataOpenAIAgentConfigBuilder
from automata.config.base import AgentConfigName
from automata.config.openai_agent import OpenAIAutomataAgentConfigBuilder
from automata.core.agent.providers import OpenAIAutomataAgent
from automata.core.tools.tool_utils import AgentToolFactory
from automata.core.singletons.dependency_factory import dependency_factory
from automata.core.singletons.module_loader import py_module_loader

logger = logging.getLogger(__name__)
from automata.core.singletons.py_module_loader import py_module_loader
from automata.core.tools.factory import AgentToolFactory

# Initialize the module loader to the local directory
py_module_loader.initialize()
Expand All @@ -121,7 +121,7 @@ tools = AgentToolFactory.build_tools(toolkit_list, **tool_dependencies)

# Build the agent config
agent_config = (
AutomataOpenAIAgentConfigBuilder.from_name("automata-main")
OpenAIAutomataAgentConfigBuilder.from_name("automata-main")
.with_tools(tools)
.with_model("gpt-4")
.build()
Expand Down
45 changes: 7 additions & 38 deletions automata/cli/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,25 +47,15 @@ def run_code_embedding(ctx, *args, **kwargs) -> None:
@common_options
@cli.command()
@click.pass_context
def run_doc_embedding_l2(ctx, *args, **kwargs) -> None:
"""Run the document embedding Level-2 pipeline."""
from automata.cli.scripts.run_doc_embedding_l2 import main
@click.option("--embedding-level", type=int, default=2, help="Level of the embedding.")
def run_doc_embedding(ctx, *args, **kwargs) -> None:
from automata.cli.scripts.run_doc_embedding import main

"""Run the document embedding pipeline."""
reconfigure_logging(kwargs.get("log-level", "DEBUG"))
logger.info("Calling run_doc_embedding_l2")
main(**kwargs)


@common_options
@cli.command()
@click.pass_context
def run_doc_embedding_l3(ctx, *args, **kwargs) -> None:
"""Run the document embedding Level-3 pipeline."""
from automata.cli.scripts.run_doc_embedding_l3 import main

reconfigure_logging(kwargs.get("log-level", "DEBUG"))
logger.info("Calling run_doc_embedding_l3")
main(**kwargs)
logger.info("Calling run_doc_embedding")
result = main(*args, **kwargs)
logger.info(f"Result = {result}")


@common_options
Expand All @@ -92,24 +82,3 @@ def run_agent(ctx, *args, **kwargs) -> None:
reconfigure_logging(kwargs.get("log-level", "DEBUG"))
logger.info("Running agent")
main(**kwargs)


@common_options
@agent_options
@cli.command()
@click.option("--fetch-issues", default="", help="Comma-separated list of issue numbers to fetch")
@click.pass_context
def run_agent_task(ctx, *args, **kwargs) -> None:
"""
Run an agent task.
Note - This is similar to run_agent, but executes the agent
within the task framework. This allows for more flexibility
across multiple tasks.
"""
from automata.cli.scripts.run_agent_task import main

reconfigure_logging(kwargs.get("log-level", "DEBUG"))
logger.info("Running the task")
main(**kwargs)
7 changes: 6 additions & 1 deletion automata/cli/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,15 @@ def common_options(command: click.Command, *args, **kwargs) -> click.Command:
help="Which index file to use for the embedding modifications.",
),
click.option(
"--embedding-file",
"--code-embedding-file",
default="symbol_code_embedding.json",
help="Which embedding file to save to.",
),
click.option(
"--doc-embedding-file",
default="symbol_doc_embedding_l3.json",
help="Which embedding file to save to.",
),
]
for option in reversed(options):
command = option(command)
Expand Down
7 changes: 3 additions & 4 deletions automata/cli/scripts/run_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,18 @@

from automata.config import GITHUB_API_KEY, REPOSITORY_NAME
from automata.config.base import AgentConfigName
from automata.config.openai_agent import AutomataOpenAIAgentConfigBuilder
from automata.config.openai_agent import OpenAIAutomataAgentConfigBuilder
from automata.core.agent.providers import OpenAIAutomataAgent
from automata.core.github_management.client import GitHubClient
from automata.core.singletons.dependency_factory import dependency_factory
from automata.core.singletons.module_loader import py_module_loader
from automata.core.singletons.py_module_loader import py_module_loader
from automata.core.tools.factory import AgentToolFactory

logger = logging.getLogger(__name__)

DEFAULT_ISSUES_PROMPT_PREFIX = """Provide a comprehensive explanation and full code implementation (in Markdown) which address the Github issue(s) that follow:"""

DEFAULT_ISSUES_PROMPT_SUFFIX = """You may use the context oracle (multiple times if necessary) to ensure that you have proper context to answer this question. If you are tasked with writing code, then keep to the SOLID Principles Further, pay special attention to Dependency Inversion Principle and Dependency Injection."""
# Solve the GitHub issues by writing the relevant code via the PyWriter tool. The issues begin now:"""


def process_issues(issue_numbers: List[int], github_manager: GitHubClient) -> List[str]:
Expand Down Expand Up @@ -63,7 +62,7 @@ def main(*args, **kwargs):
logger.info("Done building tools...")
config_name = AgentConfigName(kwargs.get("agent_name", "automata-main"))
agent_config = (
AutomataOpenAIAgentConfigBuilder.from_name(config_name)
OpenAIAutomataAgentConfigBuilder.from_name(config_name)
.with_tools(tools)
.with_model(kwargs.get("model", "gpt-4-0613"))
.build()
Expand Down
49 changes: 29 additions & 20 deletions automata/cli/scripts/run_code_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,10 @@
from automata.config.base import ConfigCategory
from automata.core.llm.providers.openai import OpenAIEmbeddingProvider
from automata.core.memory_store.symbol_code_embedding import SymbolCodeEmbeddingHandler
from automata.core.singletons.module_loader import py_module_loader
from automata.core.singletons.dependency_factory import dependency_factory
from automata.core.singletons.py_module_loader import py_module_loader
from automata.core.symbol.graph import SymbolGraph
from automata.core.symbol.symbol_utils import get_rankable_symbols
from automata.core.symbol_embedding.base import JSONSymbolEmbeddingVectorDatabase
from automata.core.symbol_embedding.builders import SymbolCodeEmbeddingBuilder
from automata.core.utils import get_config_fpath

logger = logging.getLogger(__name__)
Expand All @@ -23,36 +22,46 @@ def main(*args, **kwargs) -> str:

py_module_loader.initialize()

scip_path = os.path.join(
scip_fpath = os.path.join(
get_config_fpath(), ConfigCategory.SYMBOL.value, kwargs.get("index-file", "index.scip")
)
embedding_path = os.path.join(
code_embedding_fpath = os.path.join(
get_config_fpath(),
ConfigCategory.SYMBOL.value,
kwargs.get("embedding-file", "symbol_code_embedding.json"),
kwargs.get("code-embedding-file", "symbol_code_embedding.json"),
)
embedding_provider = OpenAIEmbeddingProvider()

symbol_graph = SymbolGraph(scip_path)
dependency_factory.set_overrides(
**{
"symbol_graph_scip_fpath": scip_fpath,
"code_embedding_fpath": code_embedding_fpath,
"embedding_provider": embedding_provider,
}
)

all_defined_symbols = symbol_graph.get_all_available_symbols()
filtered_symbols = sorted(get_rankable_symbols(all_defined_symbols), key=lambda x: x.dotpath)
symbol_graph: SymbolGraph = dependency_factory.get("symbol_graph")
symbol_code_embedding_handler: SymbolCodeEmbeddingHandler = dependency_factory.get(
"symbol_code_embedding_handler"
)
# Mock synchronization to allow us to build the initial embedding handler
symbol_graph.is_synchronized = True
symbol_code_embedding_handler.is_synchronized = True

embedding_db = JSONSymbolEmbeddingVectorDatabase(embedding_path)
embedding_provider = OpenAIEmbeddingProvider()
embedding_builder = SymbolCodeEmbeddingBuilder(embedding_provider)
embedding_handler = SymbolCodeEmbeddingHandler(embedding_db, embedding_builder)
all_defined_symbols = symbol_graph.get_sorted_supported_symbols()
filtered_symbols = sorted(get_rankable_symbols(all_defined_symbols), key=lambda x: x.dotpath)

for symbol in tqdm(filtered_symbols):
try:
embedding_handler.process_embedding(symbol)
embedding_db.save()
symbol_code_embedding_handler.process_embedding(symbol)
symbol_code_embedding_handler.embedding_db.save()
except Exception as e:
logger.error(f"Failed to update embedding for {symbol.dotpath}: {e}")

for symbol in embedding_handler.get_all_supported_symbols():
if symbol not in filtered_symbols:
logger.info(f"Discarding stale symbol {symbol}...")
embedding_db.discard(symbol.dotpath)
embedding_db.save()
# for symbol in symbol_code_embedding_handler.get_sorted_supported_symbols():
# if symbol not in filtered_symbols:
# logger.info(f"Discarding stale symbol {symbol}...")
# symbol_code_embedding_handler.embedding_db.discard(symbol.dotpath)
# symbol_code_embedding_handler.embedding_db.save()

return "Success"
Loading

0 comments on commit 0d4d4b2

Please sign in to comment.