From c8e47cb3ab2713a96e7fb6e00372c127824ebcef Mon Sep 17 00:00:00 2001 From: nanjiangwill Date: Sat, 28 Sep 2024 21:59:50 -0700 Subject: [PATCH 1/8] tmp --- agent/agent_utils.py | 70 +++++++++++- agent/display.py | 28 +++++ agent/run_agent.py | 19 ++-- agent/run_agent_test.py | 237 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 341 insertions(+), 13 deletions(-) create mode 100644 agent/run_agent_test.py diff --git a/agent/agent_utils.py b/agent/agent_utils.py index c6ec4d5..da792af 100644 --- a/agent/agent_utils.py +++ b/agent/agent_utils.py @@ -6,6 +6,8 @@ from pathlib import Path from typing import List import fitz +from import_deps import ModuleSet +from graphlib import TopologicalSorter, CycleError import yaml from agent.class_types import AgentConfig @@ -190,8 +192,46 @@ def _find_files_to_edit(base_dir: str, src_dir: str, test_dir: str) -> list[str] return files -def get_target_edit_files(target_dir: str, src_dir: str, test_dir: str) -> list[str]: +def ignore_cycles(graph: dict): + ts = TopologicalSorter(graph) + try: + return list(set(ts.static_order())) + except CycleError as e: + # print(f"Cycle detected: {e.args[1]}") + # You can either break the cycle by modifying the graph or handle it as needed. + # For now, let's just remove the first node in the cycle and try again. + cycle_nodes = e.args[1] + node_to_remove = cycle_nodes[0] + # print(f"Removing node {node_to_remove} to resolve cycle.") + graph.pop(node_to_remove, None) + return ignore_cycles(graph) + + +def topological_sort_based_on_dependencies(pkg_paths: list[str]) -> list[str]: + """Topological sort based on dependencies.""" + module_set = ModuleSet([str(p) for p in pkg_paths]) + + import_dependencies = {} + for path in sorted(module_set.by_path.keys()): + module_name = ".".join(module_set.by_path[path].fqn) + mod = module_set.by_name[module_name] + imports = module_set.get_imports(mod) + import_dependencies[path] = set([str(x) for x in imports]) + + import_dependencies_files = ignore_cycles(import_dependencies) + + return import_dependencies_files + + +def get_target_edit_files( + local_repo: git.Repo, + src_dir: str, + test_dir: str, + latest_commit: str, + reference_commit: str, +) -> list[str]: """Find the files with functions with the pass statement.""" + target_dir = local_repo.working_dir files = _find_files_to_edit(target_dir, src_dir, test_dir) filtered_files = [] for file_path in files: @@ -202,13 +242,33 @@ def get_target_edit_files(target_dir: str, src_dir: str, test_dir: str) -> list[ if " pass" in content: filtered_files.append(file_path) + # Change to reference commit to get the correct dependencies + local_repo.git.checkout(reference_commit) + + topological_sort_files = topological_sort_based_on_dependencies(filtered_files) + if len(topological_sort_files) != len(filtered_files): + if len(topological_sort_files) < len(filtered_files): + # Find the missing elements + missing_files = set(filtered_files) - set(topological_sort_files) + # Add the missing files to the end of the list + topological_sort_files = topological_sort_files + list(missing_files) + else: + raise ValueError( + "topological_sort_files should not be longer than filtered_files" + ) + assert len(topological_sort_files) == len( + filtered_files + ), "all files should be included" + + # change to latest commit + local_repo.git.checkout(latest_commit) + # Remove the base_dir prefix - filtered_files = [ - file.replace(target_dir, "").lstrip("/") for file in filtered_files + topological_sort_files = [ + file.replace(target_dir, "").lstrip("/") for file in topological_sort_files ] - # Only keep python files - return filtered_files + return topological_sort_files def get_message( diff --git a/agent/display.py b/agent/display.py index a5f389c..53d01fe 100644 --- a/agent/display.py +++ b/agent/display.py @@ -17,6 +17,8 @@ from rich.align import Align from collections import OrderedDict from types import TracebackType +import json +from datetime import datetime class RepoBox: @@ -404,3 +406,29 @@ def __exit__( f"{'Total':<30} {self.total_time_spent:>13.2f}s {total_files:>18} {total_money:>13.2f}$" ) print("-" * 80) + + # Write summary to JSON file + + summary_data = { + "timestamp": datetime.now().isoformat(), + "total_time_spent": self.total_time_spent, + "total_files_processed": total_files, + "total_money_spent": total_money, + "repositories": [ + { + "name": repo_name, + "time_spent": self.end_time_per_repo[repo_name] + - self.start_time_per_repo[repo_name], + "files_processed": self.total_files_per_repo[repo_name], + "money_spent": sum( + self.repo_money_spent.get(repo_name, {}).values() + ), + } + for repo_name in self.end_time_per_repo + ], + } + + with open("processing_summary.json", "w") as json_file: + json.dump(summary_data, json_file, indent=4) + + print("\nSummary has been written to processing_summary.json") diff --git a/agent/run_agent.py b/agent/run_agent.py index 3ef2a08..8a3c4de 100644 --- a/agent/run_agent.py +++ b/agent/run_agent.py @@ -66,13 +66,6 @@ def run_agent_for_repo( repo_path = os.path.join(repo_base_dir, repo_name) repo_path = os.path.abspath(repo_path) - target_edit_files = get_target_edit_files( - repo_path, example["src_dir"], example["test"]["test_dir"] - ) - # Call the commit0 get-tests command to retrieve test files - test_files_str = get_tests(repo_name, verbose=0) - test_files = sorted(list(set([i.split(":")[0] for i in test_files_str]))) - try: local_repo = Repo(repo_path) except Exception: @@ -90,7 +83,6 @@ def run_agent_for_repo( # # if branch_name is not provided, create a new branch name based on agent_config # if branch is None: # branch = args2string(agent_config) - create_branch(local_repo, branch, example["base_commit"]) # in cases where the latest commit of branch is not commit 0 @@ -99,6 +91,17 @@ def run_agent_for_repo( if latest_commit.hexsha != example["base_commit"] and override_previous_changes: local_repo.git.reset("--hard", example["base_commit"]) + target_edit_files = get_target_edit_files( + local_repo, + example["src_dir"], + example["test"]["test_dir"], + latest_commit, + example["reference_commit"], + ) + # Call the commit0 get-tests command to retrieve test files + test_files_str = get_tests(repo_name, verbose=0) + test_files = sorted(list(set([i.split(":")[0] for i in test_files_str]))) + # prepare the log dir experiment_log_dir = ( Path(log_dir) diff --git a/agent/run_agent_test.py b/agent/run_agent_test.py new file mode 100644 index 0000000..64f7ca2 --- /dev/null +++ b/agent/run_agent_test.py @@ -0,0 +1,237 @@ +import os +import yaml +import multiprocessing +from tqdm import tqdm +from datasets import load_dataset +from git import Repo +from agent.agent_utils import ( + args2string, + create_branch, + get_message, + get_target_edit_files, + get_lint_cmd, + read_yaml_config, +) +from agent.agents import AiderAgents +from typing import Optional, Type, cast +from types import TracebackType +from agent.class_types import AgentConfig +from commit0.harness.constants import SPLIT +from commit0.harness.get_pytest_ids import main as get_tests +from commit0.harness.constants import RUN_AGENT_LOG_DIR, RepoInstance +from commit0.cli import read_commit0_dot_file +from pathlib import Path +from datetime import datetime + + +class DirContext: + def __init__(self, d: str): + self.dir = d + self.cwd = os.getcwd() + + def __enter__(self): + os.chdir(self.dir) + + def __exit__( + self, + exctype: Optional[Type[BaseException]], + excinst: Optional[BaseException], + exctb: Optional[TracebackType], + ) -> None: + os.chdir(self.cwd) + + +def run_agent_for_repo( + repo_base_dir: str, + agent_config: AgentConfig, + example: RepoInstance, + branch: Optional[str] = None, + override_previous_changes: bool = False, + backend: str = "modal", + log_dir: str = str(RUN_AGENT_LOG_DIR.resolve()), +) -> None: + """Run Aider for a given repository.""" + # get repo info + _, repo_name = example["repo"].split("/") + print("Working on repo: ", repo_name) + + # repo_name = repo_name.lower() + # repo_name = repo_name.replace(".", "-") + + repo_path = os.path.join(repo_base_dir, repo_name) + repo_path = os.path.abspath(repo_path) + + try: + local_repo = Repo(repo_path) + except Exception: + raise Exception( + f"{repo_path} is not a git repo. Check if base_dir is correctly specified." + ) + + if agent_config.agent_name == "aider": + agent = AiderAgents(agent_config.max_iteration, agent_config.model_name) + else: + raise NotImplementedError( + f"{agent_config.agent_name} is not implemented; please add your implementations in baselines/agents.py." + ) + + # if branch_name is not provided, create a new branch name based on agent_config + if branch is None: + branch = args2string(agent_config) + + create_branch(local_repo, branch, example["base_commit"]) + + # in cases where the latest commit of branch is not commit 0 + # set it back to commit 0 + latest_commit = local_repo.commit(branch) + if latest_commit.hexsha != example["base_commit"] and override_previous_changes: + local_repo.git.reset("--hard", example["base_commit"]) + + # get target files to edit and test files to run + target_edit_files = get_target_edit_files( + local_repo, example["src_dir"], example["test"]["test_dir"], latest_commit, example["reference_commit"] + ) + print(target_edit_files) + return + # Call the commit0 get-tests command to retrieve test files + test_files_str = get_tests(repo_name, verbose=0) + test_files = sorted(list(set([i.split(":")[0] for i in test_files_str]))) + + # prepare the log dir + experiment_log_dir = ( + Path(log_dir) + / repo_name + / branch + / datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ) + experiment_log_dir.mkdir(parents=True, exist_ok=True) + + # write agent_config to .agent.yaml in the log_dir for record + agent_config_log_file = experiment_log_dir / ".agent.yaml" + with open(agent_config_log_file, "w") as agent_config_file: + yaml.dump(agent_config, agent_config_file) + + # TODO: make this path more general + commit0_dot_file_path = str(Path(repo_path).parent.parent / ".commit0.yaml") + + with DirContext(repo_path): + if agent_config is None: + raise ValueError("Invalid input") + + if agent_config.run_tests: + # when unit test feedback is available, iterate over test files + for test_file in test_files: + test_cmd = f"python -m commit0 test {repo_path} {test_file} --branch {branch} --backend {backend} --commit0-dot-file-path {commit0_dot_file_path}" + test_file_name = test_file.replace(".py", "").replace("/", "__") + test_log_dir = experiment_log_dir / test_file_name + lint_cmd = get_lint_cmd(repo_name, agent_config.use_lint_info) + message = get_message(agent_config, repo_path, test_file=test_file) + _ = agent.run( + message, + test_cmd, + lint_cmd, + target_edit_files, + test_log_dir, + test_first=True, + ) + # cost = agent_return.last_cost + else: + # when unit test feedback is not available, iterate over target files to edit + message = get_message( + agent_config, repo_path, test_dir=example["test"]["test_dir"] + ) + for f in target_edit_files: + file_name = f.replace(".py", "").replace("/", "__") + file_log_dir = experiment_log_dir / file_name + lint_cmd = get_lint_cmd(repo_name, agent_config.use_lint_info) + _ = agent.run(message, "", lint_cmd, [f], file_log_dir) + # cost = agent_return.last_cost + + +def run_agent( + branch: str, + override_previous_changes: bool, + backend: str, + agent_config_file: str, + log_dir: str, + max_parallel_repos: int, +) -> None: + """Main function to run Aider for a given repository. + + Will run in parallel for each repo. + """ + config = read_yaml_config(agent_config_file) + + agent_config = AgentConfig(**config) + + commit0_config = read_commit0_dot_file(".commit0.yaml") + + dataset = load_dataset( + commit0_config["dataset_name"], split=commit0_config["dataset_split"] + ) + filtered_dataset = [ + example + for example in dataset + if commit0_config["repo_split"] == "all" + or ( + isinstance(example, dict) + and "repo" in example + and isinstance(example["repo"], str) + and example["repo"].split("/")[-1] + in SPLIT.get(commit0_config["repo_split"], []) + ) + ] + assert len(filtered_dataset) > 0, "No examples available" + + # if len(filtered_dataset) > 1: + # sys.stdout = open(os.devnull, "w") + for i in range(len(filtered_dataset)): + if "python-rsa" not in filtered_dataset[i]["repo"]: + continue + run_agent_for_repo( + commit0_config["base_dir"], + agent_config, + filtered_dataset[i], + branch, + override_previous_changes, + backend, + log_dir, + ) + # with tqdm( + # total=len(filtered_dataset), smoothing=0, desc="Running Aider for repos" + # ) as pbar: + # with multiprocessing.Pool(processes=max_parallel_repos) as pool: + # results = [] + + # # Use apply_async to submit jobs and add progress bar updates + # for example in filtered_dataset: + # result = pool.apply_async( + # run_agent_for_repo, + # args=( + # commit0_config["base_dir"], + # agent_config, + # cast(RepoInstance, example), + # branch, + # override_previous_changes, + # backend, + # log_dir, + # ), + # callback=lambda _: pbar.update( + # 1 + # ), # Update progress bar on task completion + # ) + # results.append(result) + + # for result in results: + # result.wait() + + +if __name__ == "__main__": + run_agent( + "fillin", + False, + "modal", + ".agent_with_test.yaml", + "logs", + 10, + ) From 99779c9eac207447fa5e9b1d7a2d5bfa54041477 Mon Sep 17 00:00:00 2001 From: nanjiangwill Date: Sun, 29 Sep 2024 10:39:14 -0700 Subject: [PATCH 2/8] update --- agent/agent_utils.py | 9 ++++----- agent/run_agent.py | 4 ++-- agent/run_agent_test.py | 10 ++++++---- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/agent/agent_utils.py b/agent/agent_utils.py index da792af..197f358 100644 --- a/agent/agent_utils.py +++ b/agent/agent_utils.py @@ -192,10 +192,11 @@ def _find_files_to_edit(base_dir: str, src_dir: str, test_dir: str) -> list[str] return files -def ignore_cycles(graph: dict): +def ignore_cycles(graph: dict) -> list[str]: + """Ignore the cycles in the graph.""" ts = TopologicalSorter(graph) try: - return list(set(ts.static_order())) + return list(ts.static_order()) except CycleError as e: # print(f"Cycle detected: {e.args[1]}") # You can either break the cycle by modifying the graph or handle it as needed. @@ -231,7 +232,7 @@ def get_target_edit_files( reference_commit: str, ) -> list[str]: """Find the files with functions with the pass statement.""" - target_dir = local_repo.working_dir + target_dir = str(local_repo.working_dir) files = _find_files_to_edit(target_dir, src_dir, test_dir) filtered_files = [] for file_path in files: @@ -241,10 +242,8 @@ def get_target_edit_files( continue if " pass" in content: filtered_files.append(file_path) - # Change to reference commit to get the correct dependencies local_repo.git.checkout(reference_commit) - topological_sort_files = topological_sort_based_on_dependencies(filtered_files) if len(topological_sort_files) != len(filtered_files): if len(topological_sort_files) < len(filtered_files): diff --git a/agent/run_agent.py b/agent/run_agent.py index 8a3c4de..3a6425a 100644 --- a/agent/run_agent.py +++ b/agent/run_agent.py @@ -87,8 +87,8 @@ def run_agent_for_repo( # in cases where the latest commit of branch is not commit 0 # set it back to commit 0 - latest_commit = local_repo.commit(branch) - if latest_commit.hexsha != example["base_commit"] and override_previous_changes: + latest_commit = str(local_repo.commit(branch)) + if latest_commit != example["base_commit"] and override_previous_changes: local_repo.git.reset("--hard", example["base_commit"]) target_edit_files = get_target_edit_files( diff --git a/agent/run_agent_test.py b/agent/run_agent_test.py index 64f7ca2..5e1b4a9 100644 --- a/agent/run_agent_test.py +++ b/agent/run_agent_test.py @@ -1,7 +1,5 @@ import os import yaml -import multiprocessing -from tqdm import tqdm from datasets import load_dataset from git import Repo from agent.agent_utils import ( @@ -13,7 +11,7 @@ read_yaml_config, ) from agent.agents import AiderAgents -from typing import Optional, Type, cast +from typing import Optional, Type from types import TracebackType from agent.class_types import AgentConfig from commit0.harness.constants import SPLIT @@ -89,7 +87,11 @@ def run_agent_for_repo( # get target files to edit and test files to run target_edit_files = get_target_edit_files( - local_repo, example["src_dir"], example["test"]["test_dir"], latest_commit, example["reference_commit"] + local_repo, + example["src_dir"], + example["test"]["test_dir"], + latest_commit, + example["reference_commit"], ) print(target_edit_files) return From 7f4929909cb09a510bb990f21371844c88be1709 Mon Sep 17 00:00:00 2001 From: nanjiangwill Date: Sun, 29 Sep 2024 10:42:10 -0700 Subject: [PATCH 3/8] update --- agent/run_agent.py | 6 +- agent/run_agent_no_rich.py | 21 ++-- agent/run_agent_test.py | 239 ------------------------------------- 3 files changed, 16 insertions(+), 250 deletions(-) delete mode 100644 agent/run_agent_test.py diff --git a/agent/run_agent.py b/agent/run_agent.py index 3a6425a..7eaa926 100644 --- a/agent/run_agent.py +++ b/agent/run_agent.py @@ -87,15 +87,15 @@ def run_agent_for_repo( # in cases where the latest commit of branch is not commit 0 # set it back to commit 0 - latest_commit = str(local_repo.commit(branch)) - if latest_commit != example["base_commit"] and override_previous_changes: + latest_commit = local_repo.commit(branch) + if latest_commit.hexsha != example["base_commit"] and override_previous_changes: local_repo.git.reset("--hard", example["base_commit"]) target_edit_files = get_target_edit_files( local_repo, example["src_dir"], example["test"]["test_dir"], - latest_commit, + str(latest_commit), example["reference_commit"], ) # Call the commit0 get-tests command to retrieve test files diff --git a/agent/run_agent_no_rich.py b/agent/run_agent_no_rich.py index c46ae2f..754256d 100644 --- a/agent/run_agent_no_rich.py +++ b/agent/run_agent_no_rich.py @@ -61,14 +61,6 @@ def run_agent_for_repo( repo_path = os.path.join(repo_base_dir, repo_name) repo_path = os.path.abspath(repo_path) - # get target files to edit and test files to run - target_edit_files = get_target_edit_files( - repo_path, example["src_dir"], example["test"]["test_dir"] - ) - # Call the commit0 get-tests command to retrieve test files - test_files_str = get_tests(repo_name, verbose=0) - test_files = sorted(list(set([i.split(":")[0] for i in test_files_str]))) - try: local_repo = Repo(repo_path) except Exception: @@ -95,6 +87,19 @@ def run_agent_for_repo( if latest_commit.hexsha != example["base_commit"] and override_previous_changes: local_repo.git.reset("--hard", example["base_commit"]) + # get target files to edit and test files to run + target_edit_files = get_target_edit_files( + local_repo, + example["src_dir"], + example["test"]["test_dir"], + str(latest_commit), + str(example["reference_commit"]), + ) + + # Call the commit0 get-tests command to retrieve test files + test_files_str = get_tests(repo_name, verbose=0) + test_files = sorted(list(set([i.split(":")[0] for i in test_files_str]))) + # prepare the log dir experiment_log_dir = ( Path(log_dir) diff --git a/agent/run_agent_test.py b/agent/run_agent_test.py deleted file mode 100644 index 5e1b4a9..0000000 --- a/agent/run_agent_test.py +++ /dev/null @@ -1,239 +0,0 @@ -import os -import yaml -from datasets import load_dataset -from git import Repo -from agent.agent_utils import ( - args2string, - create_branch, - get_message, - get_target_edit_files, - get_lint_cmd, - read_yaml_config, -) -from agent.agents import AiderAgents -from typing import Optional, Type -from types import TracebackType -from agent.class_types import AgentConfig -from commit0.harness.constants import SPLIT -from commit0.harness.get_pytest_ids import main as get_tests -from commit0.harness.constants import RUN_AGENT_LOG_DIR, RepoInstance -from commit0.cli import read_commit0_dot_file -from pathlib import Path -from datetime import datetime - - -class DirContext: - def __init__(self, d: str): - self.dir = d - self.cwd = os.getcwd() - - def __enter__(self): - os.chdir(self.dir) - - def __exit__( - self, - exctype: Optional[Type[BaseException]], - excinst: Optional[BaseException], - exctb: Optional[TracebackType], - ) -> None: - os.chdir(self.cwd) - - -def run_agent_for_repo( - repo_base_dir: str, - agent_config: AgentConfig, - example: RepoInstance, - branch: Optional[str] = None, - override_previous_changes: bool = False, - backend: str = "modal", - log_dir: str = str(RUN_AGENT_LOG_DIR.resolve()), -) -> None: - """Run Aider for a given repository.""" - # get repo info - _, repo_name = example["repo"].split("/") - print("Working on repo: ", repo_name) - - # repo_name = repo_name.lower() - # repo_name = repo_name.replace(".", "-") - - repo_path = os.path.join(repo_base_dir, repo_name) - repo_path = os.path.abspath(repo_path) - - try: - local_repo = Repo(repo_path) - except Exception: - raise Exception( - f"{repo_path} is not a git repo. Check if base_dir is correctly specified." - ) - - if agent_config.agent_name == "aider": - agent = AiderAgents(agent_config.max_iteration, agent_config.model_name) - else: - raise NotImplementedError( - f"{agent_config.agent_name} is not implemented; please add your implementations in baselines/agents.py." - ) - - # if branch_name is not provided, create a new branch name based on agent_config - if branch is None: - branch = args2string(agent_config) - - create_branch(local_repo, branch, example["base_commit"]) - - # in cases where the latest commit of branch is not commit 0 - # set it back to commit 0 - latest_commit = local_repo.commit(branch) - if latest_commit.hexsha != example["base_commit"] and override_previous_changes: - local_repo.git.reset("--hard", example["base_commit"]) - - # get target files to edit and test files to run - target_edit_files = get_target_edit_files( - local_repo, - example["src_dir"], - example["test"]["test_dir"], - latest_commit, - example["reference_commit"], - ) - print(target_edit_files) - return - # Call the commit0 get-tests command to retrieve test files - test_files_str = get_tests(repo_name, verbose=0) - test_files = sorted(list(set([i.split(":")[0] for i in test_files_str]))) - - # prepare the log dir - experiment_log_dir = ( - Path(log_dir) - / repo_name - / branch - / datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - ) - experiment_log_dir.mkdir(parents=True, exist_ok=True) - - # write agent_config to .agent.yaml in the log_dir for record - agent_config_log_file = experiment_log_dir / ".agent.yaml" - with open(agent_config_log_file, "w") as agent_config_file: - yaml.dump(agent_config, agent_config_file) - - # TODO: make this path more general - commit0_dot_file_path = str(Path(repo_path).parent.parent / ".commit0.yaml") - - with DirContext(repo_path): - if agent_config is None: - raise ValueError("Invalid input") - - if agent_config.run_tests: - # when unit test feedback is available, iterate over test files - for test_file in test_files: - test_cmd = f"python -m commit0 test {repo_path} {test_file} --branch {branch} --backend {backend} --commit0-dot-file-path {commit0_dot_file_path}" - test_file_name = test_file.replace(".py", "").replace("/", "__") - test_log_dir = experiment_log_dir / test_file_name - lint_cmd = get_lint_cmd(repo_name, agent_config.use_lint_info) - message = get_message(agent_config, repo_path, test_file=test_file) - _ = agent.run( - message, - test_cmd, - lint_cmd, - target_edit_files, - test_log_dir, - test_first=True, - ) - # cost = agent_return.last_cost - else: - # when unit test feedback is not available, iterate over target files to edit - message = get_message( - agent_config, repo_path, test_dir=example["test"]["test_dir"] - ) - for f in target_edit_files: - file_name = f.replace(".py", "").replace("/", "__") - file_log_dir = experiment_log_dir / file_name - lint_cmd = get_lint_cmd(repo_name, agent_config.use_lint_info) - _ = agent.run(message, "", lint_cmd, [f], file_log_dir) - # cost = agent_return.last_cost - - -def run_agent( - branch: str, - override_previous_changes: bool, - backend: str, - agent_config_file: str, - log_dir: str, - max_parallel_repos: int, -) -> None: - """Main function to run Aider for a given repository. - - Will run in parallel for each repo. - """ - config = read_yaml_config(agent_config_file) - - agent_config = AgentConfig(**config) - - commit0_config = read_commit0_dot_file(".commit0.yaml") - - dataset = load_dataset( - commit0_config["dataset_name"], split=commit0_config["dataset_split"] - ) - filtered_dataset = [ - example - for example in dataset - if commit0_config["repo_split"] == "all" - or ( - isinstance(example, dict) - and "repo" in example - and isinstance(example["repo"], str) - and example["repo"].split("/")[-1] - in SPLIT.get(commit0_config["repo_split"], []) - ) - ] - assert len(filtered_dataset) > 0, "No examples available" - - # if len(filtered_dataset) > 1: - # sys.stdout = open(os.devnull, "w") - for i in range(len(filtered_dataset)): - if "python-rsa" not in filtered_dataset[i]["repo"]: - continue - run_agent_for_repo( - commit0_config["base_dir"], - agent_config, - filtered_dataset[i], - branch, - override_previous_changes, - backend, - log_dir, - ) - # with tqdm( - # total=len(filtered_dataset), smoothing=0, desc="Running Aider for repos" - # ) as pbar: - # with multiprocessing.Pool(processes=max_parallel_repos) as pool: - # results = [] - - # # Use apply_async to submit jobs and add progress bar updates - # for example in filtered_dataset: - # result = pool.apply_async( - # run_agent_for_repo, - # args=( - # commit0_config["base_dir"], - # agent_config, - # cast(RepoInstance, example), - # branch, - # override_previous_changes, - # backend, - # log_dir, - # ), - # callback=lambda _: pbar.update( - # 1 - # ), # Update progress bar on task completion - # ) - # results.append(result) - - # for result in results: - # result.wait() - - -if __name__ == "__main__": - run_agent( - "fillin", - False, - "modal", - ".agent_with_test.yaml", - "logs", - 10, - ) From e47004798c51d7b7f2433a24a8ec881f1d476dac Mon Sep 17 00:00:00 2001 From: nanjiangwill Date: Sun, 29 Sep 2024 10:57:22 -0700 Subject: [PATCH 4/8] update file finding logic --- agent/agent_utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/agent/agent_utils.py b/agent/agent_utils.py index 197f358..fcbb87e 100644 --- a/agent/agent_utils.py +++ b/agent/agent_utils.py @@ -216,8 +216,11 @@ def topological_sort_based_on_dependencies(pkg_paths: list[str]) -> list[str]: for path in sorted(module_set.by_path.keys()): module_name = ".".join(module_set.by_path[path].fqn) mod = module_set.by_name[module_name] - imports = module_set.get_imports(mod) - import_dependencies[path] = set([str(x) for x in imports]) + try: + imports = module_set.get_imports(mod) + import_dependencies[path] = set([str(x) for x in imports]) + except Exception: + import_dependencies[path] = set() import_dependencies_files = ignore_cycles(import_dependencies) @@ -236,7 +239,7 @@ def get_target_edit_files( files = _find_files_to_edit(target_dir, src_dir, test_dir) filtered_files = [] for file_path in files: - with open(file_path, "r", encoding="utf-8", errors="ignore") as file: + with open(file_path, "r", encoding="utf-8-sig", errors="ignore") as file: content = file.read() if len(content.splitlines()) > 1500: continue @@ -244,6 +247,7 @@ def get_target_edit_files( filtered_files.append(file_path) # Change to reference commit to get the correct dependencies local_repo.git.checkout(reference_commit) + topological_sort_files = topological_sort_based_on_dependencies(filtered_files) if len(topological_sort_files) != len(filtered_files): if len(topological_sort_files) < len(filtered_files): From 0932f76cd793ed9acdd884a2d1064f4b8bced816 Mon Sep 17 00:00:00 2001 From: nanjiangwill Date: Sun, 29 Sep 2024 15:32:02 -0700 Subject: [PATCH 5/8] update file finding logic --- agent/agent_utils.py | 36 +++++++++++++++++++++++++++++++----- agent/cli.py | 6 ++++++ agent/run_agent.py | 8 ++++++-- agent/run_agent_no_rich.py | 8 ++++++-- 4 files changed, 49 insertions(+), 9 deletions(-) diff --git a/agent/agent_utils.py b/agent/agent_utils.py index fcbb87e..3c5cb60 100644 --- a/agent/agent_utils.py +++ b/agent/agent_utils.py @@ -18,6 +18,7 @@ UNIT_TESTS_INFO_HEADER = "\n\n>>> Here are the Unit Tests Information:\n" LINT_INFO_HEADER = "\n\n>>> Here is the Lint Information:\n" SPEC_INFO_HEADER = "\n\n>>> Here is the Specification Information:\n" +IMPORT_DEPENDENCIES_HEADER = "\n\n>>> Here are the Import Dependencies:\n" # prefix components: space = " " branch = "│ " @@ -208,7 +209,9 @@ def ignore_cycles(graph: dict) -> list[str]: return ignore_cycles(graph) -def topological_sort_based_on_dependencies(pkg_paths: list[str]) -> list[str]: +def topological_sort_based_on_dependencies( + pkg_paths: list[str], +) -> tuple[list[str], dict]: """Topological sort based on dependencies.""" module_set = ModuleSet([str(p) for p in pkg_paths]) @@ -224,7 +227,7 @@ def topological_sort_based_on_dependencies(pkg_paths: list[str]) -> list[str]: import_dependencies_files = ignore_cycles(import_dependencies) - return import_dependencies_files + return import_dependencies_files, import_dependencies def get_target_edit_files( @@ -233,7 +236,7 @@ def get_target_edit_files( test_dir: str, latest_commit: str, reference_commit: str, -) -> list[str]: +) -> tuple[list[str], dict]: """Find the files with functions with the pass statement.""" target_dir = str(local_repo.working_dir) files = _find_files_to_edit(target_dir, src_dir, test_dir) @@ -248,7 +251,9 @@ def get_target_edit_files( # Change to reference commit to get the correct dependencies local_repo.git.checkout(reference_commit) - topological_sort_files = topological_sort_based_on_dependencies(filtered_files) + topological_sort_files, import_dependencies = ( + topological_sort_based_on_dependencies(filtered_files) + ) if len(topological_sort_files) != len(filtered_files): if len(topological_sort_files) < len(filtered_files): # Find the missing elements @@ -271,7 +276,14 @@ def get_target_edit_files( file.replace(target_dir, "").lstrip("/") for file in topological_sort_files ] - return topological_sort_files + # Remove the base_dir prefix from import dependencies + import_dependencies_without_prefix = {} + for key, value in import_dependencies.items(): + key_without_prefix = key.replace(target_dir, "").lstrip("/") + value_without_prefix = [v.replace(target_dir, "").lstrip("/") for v in value] + import_dependencies_without_prefix[key_without_prefix] = value_without_prefix + + return topological_sort_files, import_dependencies_without_prefix def get_message( @@ -331,6 +343,20 @@ def get_message( return message_to_agent +def update_message_with_dependencies(message: str, dependencies: list[str]) -> str: + """Update the message with the dependencies.""" + if len(dependencies) == 0: + return message + import_dependencies_info = f"\n{IMPORT_DEPENDENCIES_HEADER}" + for dependency in dependencies: + with open(dependency, "r") as file: + import_dependencies_info += ( + f"\nHere is the content of the file {dependency}:\n{file.read()}" + ) + message += import_dependencies_info + return message + + def get_specification(specification_pdf_path: Path) -> str: """Get the reference for a given specification PDF path.""" # TODO: after pdf_to_text is available, use it to extract the text from the PDF diff --git a/agent/cli.py b/agent/cli.py index 905191b..8d06891 100644 --- a/agent/cli.py +++ b/agent/cli.py @@ -178,6 +178,10 @@ def run( ".agent.yaml", help="Path to the agent config file", ), + commit0_config_file: str = typer.Option( + ".commit0.yaml", + help="Path to the commit0 config file", + ), log_dir: str = typer.Option( str(RUN_AGENT_LOG_DIR.resolve()), help="Log directory to store the logs", @@ -202,6 +206,7 @@ def run( override_previous_changes, backend, agent_config_file, + commit0_config_file, log_dir, max_parallel_repos, display_repo_progress_num, @@ -212,6 +217,7 @@ def run( override_previous_changes, backend, agent_config_file, + commit0_config_file, log_dir, max_parallel_repos, ) diff --git a/agent/run_agent.py b/agent/run_agent.py index 7eaa926..5315086 100644 --- a/agent/run_agent.py +++ b/agent/run_agent.py @@ -7,6 +7,7 @@ create_branch, get_message, get_target_edit_files, + update_message_with_dependencies, get_lint_cmd, read_yaml_config, ) @@ -91,7 +92,7 @@ def run_agent_for_repo( if latest_commit.hexsha != example["base_commit"] and override_previous_changes: local_repo.git.reset("--hard", example["base_commit"]) - target_edit_files = get_target_edit_files( + target_edit_files, import_dependencies = get_target_edit_files( local_repo, example["src_dir"], example["test"]["test_dir"], @@ -161,6 +162,8 @@ def run_agent_for_repo( ) for f in target_edit_files: update_queue.put(("set_current_file", (repo_name, f))) + dependencies = import_dependencies[f] + message = update_message_with_dependencies(message, dependencies) file_name = f.replace(".py", "").replace("/", "__") file_log_dir = experiment_log_dir / file_name lint_cmd = get_lint_cmd(repo_name, agent_config.use_lint_info) @@ -179,6 +182,7 @@ def run_agent( override_previous_changes: bool, backend: str, agent_config_file: str, + commit0_config_file: str, log_dir: str, max_parallel_repos: int, display_repo_progress_num: int, @@ -188,7 +192,7 @@ def run_agent( agent_config = AgentConfig(**config) - commit0_config = read_commit0_dot_file(".commit0.yaml") + commit0_config = read_commit0_dot_file(commit0_config_file) dataset = load_dataset( commit0_config["dataset_name"], split=commit0_config["dataset_split"] diff --git a/agent/run_agent_no_rich.py b/agent/run_agent_no_rich.py index 754256d..ec1334a 100644 --- a/agent/run_agent_no_rich.py +++ b/agent/run_agent_no_rich.py @@ -9,6 +9,7 @@ create_branch, get_message, get_target_edit_files, + update_message_with_dependencies, get_lint_cmd, read_yaml_config, ) @@ -88,7 +89,7 @@ def run_agent_for_repo( local_repo.git.reset("--hard", example["base_commit"]) # get target files to edit and test files to run - target_edit_files = get_target_edit_files( + target_edit_files, import_dependencies = get_target_edit_files( local_repo, example["src_dir"], example["test"]["test_dir"], @@ -144,6 +145,8 @@ def run_agent_for_repo( agent_config, repo_path, test_dir=example["test"]["test_dir"] ) for f in target_edit_files: + dependencies = import_dependencies[f] + message = update_message_with_dependencies(message, dependencies) file_name = f.replace(".py", "").replace("/", "__") file_log_dir = experiment_log_dir / file_name lint_cmd = get_lint_cmd(repo_name, agent_config.use_lint_info) @@ -156,6 +159,7 @@ def run_agent( override_previous_changes: bool, backend: str, agent_config_file: str, + commit0_config_file: str, log_dir: str, max_parallel_repos: int, ) -> None: @@ -167,7 +171,7 @@ def run_agent( agent_config = AgentConfig(**config) - commit0_config = read_commit0_dot_file(".commit0.yaml") + commit0_config = read_commit0_dot_file(commit0_config_file) dataset = load_dataset( commit0_config["dataset_name"], split=commit0_config["dataset_split"] From ebfb6566fb6126a6796ad9223b8cc66fc7050fac Mon Sep 17 00:00:00 2001 From: nanjiangwill Date: Sun, 29 Sep 2024 15:41:10 -0700 Subject: [PATCH 6/8] update files --- agent/agents.py | 3 +++ pyproject.toml | 1 + 2 files changed, 4 insertions(+) diff --git a/agent/agents.py b/agent/agents.py index 9255a9f..f99e360 100644 --- a/agent/agents.py +++ b/agent/agents.py @@ -86,6 +86,9 @@ def run( format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", ) + # Log the message + logging.info(f"Message Sent: {message} \n\n") + # Redirect print statements to the log file sys.stdout = open(log_file, "a") sys.stderr = open(log_file, "a") diff --git a/pyproject.toml b/pyproject.toml index 8befc62..7666711 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ requires-python = ">=3.11" dependencies = [ "ruff>=0.6.4", "pre-commit>=3.8.0", + "import-deps>=0.3.0", "PyMuPDF>=1.24.5", "modal==0.64.95", "typer>=0.12.0", From d9039b06b09ccf6c041d35930f7fc80c6ce05002 Mon Sep 17 00:00:00 2001 From: nanjiangwill Date: Sun, 29 Sep 2024 15:46:12 -0700 Subject: [PATCH 7/8] update files --- agent/agents.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/agent/agents.py b/agent/agents.py index f99e360..9d9c4e3 100644 --- a/agent/agents.py +++ b/agent/agents.py @@ -86,13 +86,13 @@ def run( format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", ) - # Log the message - logging.info(f"Message Sent: {message} \n\n") - # Redirect print statements to the log file sys.stdout = open(log_file, "a") sys.stderr = open(log_file, "a") + # Log the message + logging.info(f"Message Sent: {message} \n\n") + # Configure httpx and backoff logging handle_logging("httpx", log_file) handle_logging("backoff", log_file) From c34bde7142bf3b493422fd3b4386a381885204f0 Mon Sep 17 00:00:00 2001 From: nanjiangwill Date: Sun, 29 Sep 2024 15:50:22 -0700 Subject: [PATCH 8/8] update files --- agent/agents.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/agent/agents.py b/agent/agents.py index 9d9c4e3..6e7d9d8 100644 --- a/agent/agents.py +++ b/agent/agents.py @@ -91,7 +91,9 @@ def run( sys.stderr = open(log_file, "a") # Log the message - logging.info(f"Message Sent: {message} \n\n") + agent_message_log_file = log_dir / "agent_message.log" + with open(agent_message_log_file, "a") as f: + f.write(f"Message Sent: {message}\n\n") # Configure httpx and backoff logging handle_logging("httpx", log_file)