fix small error

nanjiangwill · nanjiangwill · commit 9212dcbc0eb1 · 2024-09-24T21:03:05.000-04:00
diff --git a/agent/agent_utils.py b/agent/agent_utils.py
@@ -126,7 +126,7 @@ def get_target_edit_files(target_dir: str, src_prefix: str) -> list[str]:
         for filename in filenames:
             if filename.endswith(".py"):
                 file_path = os.path.join(root, filename)
-                with open(file_path, "r") as file:
+                with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
                     if "    pass" in file.read():
                         files.append(file_path)
 
diff --git a/agent/cli.py b/agent/cli.py
@@ -1,6 +1,7 @@
 import typer
 from agent.run_agent_no_rich import run_agent as run_agent_no_rich
 from agent.run_agent import run_agent
+from agent.run_agent_joblit import run_agent as run_agent_joblit
 from commit0.harness.constants import RUN_AIDER_LOG_DIR
 import subprocess
 from agent.agent_utils import write_agent_config
@@ -243,3 +244,41 @@ def run_test_no_rich(
         log_dir,
         max_parallel_repos,
     )
+
+
+@agent_app.command()
+def run_test_joblit(  # CLI command: forwards its options unchanged to agent.run_agent_joblit.run_agent
+    branch: str = typer.Argument(
+        ...,
+        help="Branch name of current run",
+    ),
+    override_previous_changes: bool = typer.Option(
+        False,
+        help="If override the previous agent changes on `branch` or run the agent continuously on the new changes",
+    ),
+    backend: str = typer.Option(
+        "modal",
+        help="Test backend to run the agent on, ignore this option if you are not adding `test` option to agent",
+    ),
+    agent_config_file: str = typer.Option(
+        ".agent.yaml",
+        help="Path to the agent config file",
+    ),
+    log_dir: str = typer.Option(
+        str(RUN_AIDER_LOG_DIR.resolve()),
+        help="Log directory to store the logs",
+    ),
+    max_parallel_repos: int = typer.Option(
+        1,
+        help="Maximum number of repositories for agent to run in parallel",
+    ),  # NOTE(review): run_agent_joblit.run_agent never reads this value (its pool is commented out)
+) -> None:
+    """Run the agent on the repository."""
+    run_agent_joblit(  # arguments are positional, in the same order as run_agent's parameters
+        branch,
+        override_previous_changes,
+        backend,
+        agent_config_file,
+        log_dir,
+        max_parallel_repos,
+    )
diff --git a/agent/run_agent.py b/agent/run_agent.py
@@ -57,11 +57,13 @@ def run_agent_for_repo(
     # get repo info
     _, repo_name = example["repo"].split("/")
 
+    original_repo_name = repo_name
+
     repo_name = repo_name.lower()
     repo_name = repo_name.replace(".", "-")
 
     # before starting, display all information to terminal
-    update_queue.put(("start_repo", (repo_name, 0)))
+    update_queue.put(("start_repo", (original_repo_name, 0)))
 
     repo_path = os.path.join(repo_base_dir, repo_name)
     repo_path = os.path.abspath(repo_path)
@@ -128,7 +130,7 @@ def run_agent_for_repo(
             test_files_str = get_tests(repo_name, verbose=0)
             test_files = sorted(list(set([i.split(":")[0] for i in test_files_str])))
 
-            update_queue.put(("start_repo", (repo_name, len(test_files))))
+            update_queue.put(("start_repo", (original_repo_name, len(test_files))))
             # when unit test feedback is available, iterate over test files
             for test_file in test_files:
                 update_queue.put(("set_current_file", (repo_name, test_file)))
@@ -159,7 +161,9 @@ def run_agent_for_repo(
                 agent_config, repo_path, test_dir=example["test"]["test_dir"]
             )
 
-            update_queue.put(("start_repo", (repo_name, len(target_edit_files))))
+            update_queue.put(
+                ("start_repo", (original_repo_name, len(target_edit_files)))
+            )
             for f in target_edit_files:
                 update_queue.put(("set_current_file", (repo_name, f)))
                 file_name = f.replace(".py", "").replace("/", "__")
@@ -172,7 +176,7 @@ def run_agent_for_repo(
                         (repo_name, file_name, agent_return.last_cost),
                     )
                 )
-    update_queue.put(("finish_repo", repo_name))
+    update_queue.put(("finish_repo", original_repo_name))
 
 
 def run_agent(
diff --git a/agent/run_agent_joblit.py b/agent/run_agent_joblit.py
@@ -0,0 +1,225 @@
+import os
+import yaml
+import multiprocessing
+from tqdm import tqdm
+from datasets import load_dataset
+from git import Repo
+from agent.agent_utils import (
+    args2string,
+    create_branch,
+    get_message,
+    get_target_edit_files,
+    get_lint_cmd,
+    read_yaml_config,
+)
+from agent.agents import AiderAgents
+from typing import Optional, Type, cast
+from types import TracebackType
+from agent.class_types import AgentConfig
+from commit0.harness.constants import SPLIT
+from commit0.harness.get_pytest_ids import main as get_tests
+from commit0.harness.constants import RUN_AIDER_LOG_DIR, RepoInstance
+from commit0.cli import read_commit0_dot_file
+from pathlib import Path
+from datetime import datetime
+
+
+class DirContext:  # context manager: chdir into `d` on entry, back to the saved cwd on exit
+    def __init__(self, d: str) -> None:
+        self.dir = d  # directory to switch into in __enter__
+        self.cwd = os.getcwd()  # captured at construction time, not at __enter__
+
+    def __enter__(self) -> None:  # returns None, so `with DirContext(p) as x` binds x to None
+        os.chdir(self.dir)
+
+    def __exit__(
+        self,
+        exctype: Optional[Type[BaseException]],
+        excinst: Optional[BaseException],
+        exctb: Optional[TracebackType],
+    ) -> None:
+        os.chdir(self.cwd)  # always restore; the None return lets any exception propagate
+
+
+def run_agent_for_repo(
+    repo_base_dir: str,
+    agent_config: AgentConfig,
+    example: RepoInstance,
+    branch: Optional[str] = None,
+    override_previous_changes: bool = False,
+    backend: str = "modal",
+    log_dir: str = str(RUN_AIDER_LOG_DIR.resolve()),
+) -> None:
+    """Run Aider for a given repository.
+
+    Raises ValueError if `agent_config` is None, Exception if `repo_path` cannot
+    be opened as a git repo, NotImplementedError for agents other than "aider".
+    """
+    # validate before the first attribute access on agent_config
+    if agent_config is None:
+        raise ValueError("Invalid input")
+
+    # get repo info: the local directory is the lower-cased name with "." -> "-"
+    _, repo_name = example["repo"].split("/")
+    repo_name = repo_name.lower().replace(".", "-")
+
+    repo_path = os.path.abspath(os.path.join(repo_base_dir, repo_name))
+    src_dir = os.path.join(repo_path, example["src_dir"])
+
+    try:
+        local_repo = Repo(repo_path)
+    except Exception as e:
+        raise Exception(
+            f"{repo_path} is not a git repo. Check if base_dir is correctly specified."
+        ) from e
+
+    if agent_config.agent_name == "aider":
+        agent = AiderAgents(agent_config.max_iteration, agent_config.model_name)
+    else:
+        raise NotImplementedError(
+            f"{agent_config.agent_name} is not implemented; please add your implementations in agent/agents.py."
+        )
+
+    # if branch_name is not provided, create a new branch name based on agent_config
+    if branch is None:
+        branch = args2string(agent_config)
+
+    create_branch(local_repo, branch, example["base_commit"])
+
+    # in cases where the latest commit of branch is not commit 0
+    # set it back to commit 0
+    latest_commit = local_repo.commit(branch)
+    if latest_commit.hexsha != example["base_commit"] and override_previous_changes:
+        local_repo.git.reset("--hard", example["base_commit"])
+
+    # prepare the log dir
+    experiment_log_dir = (
+        Path(log_dir)
+        / repo_name
+        / branch
+        / datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    )
+    experiment_log_dir.mkdir(parents=True, exist_ok=True)
+
+    # write agent_config to .agent.yaml in the log_dir for record
+    agent_config_log_file = experiment_log_dir / ".agent.yaml"
+    with open(agent_config_log_file, "w", encoding="utf-8") as agent_config_file:
+        yaml.dump(agent_config, agent_config_file)
+
+    # TODO: make this path more general
+    commit0_dot_file_path = str(Path(repo_path).parent.parent / ".commit0.yaml")
+    # everything below runs with the repo as the current working directory
+    with DirContext(repo_path):
+        target_edit_files = get_target_edit_files(
+            src_dir, src_prefix=example["src_dir"]
+        )
+
+        if agent_config.run_tests:
+            # Call the commit0 get-tests command to retrieve test files
+            test_files_str = get_tests(repo_name, verbose=0)
+            test_files = sorted({i.split(":")[0] for i in test_files_str})
+
+            # when unit test feedback is available, iterate over test files
+            for test_file in test_files:
+                test_cmd = f"python -m commit0 test {repo_path} {test_file} --branch {branch} --backend {backend} --commit0_dot_file_path {commit0_dot_file_path}"
+                test_file_name = test_file.replace(".py", "").replace("/", "__")
+                test_log_dir = experiment_log_dir / test_file_name
+                lint_cmd = get_lint_cmd(repo_name, agent_config.use_lint_info)
+                message = get_message(agent_config, repo_path, test_file=test_file)
+                _ = agent.run(
+                    message,
+                    test_cmd,
+                    lint_cmd,
+                    target_edit_files,
+                    test_log_dir,
+                )
+        else:
+            # when unit test feedback is not available, iterate over target files to edit
+            message = get_message(
+                agent_config, repo_path, test_dir=example["test"]["test_dir"]
+            )
+            for f in target_edit_files:
+                file_name = f.replace(".py", "").replace("/", "__")
+                file_log_dir = experiment_log_dir / file_name
+                lint_cmd = get_lint_cmd(repo_name, agent_config.use_lint_info)
+                _ = agent.run(message, "", lint_cmd, [f], file_log_dir)
+
+
+def run_agent(
+    branch: str,
+    override_previous_changes: bool,
+    backend: str,
+    agent_config_file: str,
+    log_dir: str,
+    max_parallel_repos: int,
+) -> None:
+    """Run Aider sequentially on each selected example whose "repo" contains "joblib".
+
+    `max_parallel_repos` is currently unused: the multiprocessing pool that would
+    consume it is commented out after this function, so repos run one at a time.
+    """
+    config = read_yaml_config(agent_config_file)
+
+    agent_config = AgentConfig(**config)
+
+    commit0_config = read_commit0_dot_file(".commit0.yaml")
+
+    dataset = load_dataset(
+        commit0_config["dataset_name"], split=commit0_config["dataset_split"]
+    )
+
+    # look these up once instead of once per dataset row
+    repo_split = commit0_config["repo_split"]
+    allowed_repos = SPLIT.get(repo_split, [])
+    filtered_dataset = [
+        example
+        for example in dataset
+        if repo_split == "all"
+        or (
+            isinstance(example, dict)
+            and "repo" in example
+            and isinstance(example["repo"], str)
+            and example["repo"].split("/")[-1] in allowed_repos
+        )
+    ]
+    assert len(filtered_dataset) > 0, "No examples available"
+
+    # debugging runner: only joblib is processed, without echoing dataset rows
+    for example in filtered_dataset:
+        if "joblib" in example["repo"]:
+            run_agent_for_repo(
+                commit0_config["base_dir"],
+                agent_config,
+                cast(RepoInstance, example),
+                branch,
+                override_previous_changes,
+                backend,
+                log_dir,
+            )
+    # with tqdm(
+    #     total=len(filtered_dataset), smoothing=0, desc="Running Aider for repos"
+    # ) as pbar:
+    #     with multiprocessing.Pool(processes=max_parallel_repos) as pool:
+    #         results = []
+
+    #         # Use apply_async to submit jobs and add progress bar updates
+    #         for example in filtered_dataset:
+    #             result = pool.apply_async(
+    #                 run_agent_for_repo,
+    #                 args=(
+    #                     commit0_config["base_dir"],
+    #                     agent_config,
+    #                     cast(RepoInstance, example),
+    #                     branch,
+    #                     override_previous_changes,
+    #                     backend,
+    #                     log_dir,
+    #                 ),
+    #                 callback=lambda _: pbar.update(
+    #                     1
+    #                 ),  # Update progress bar on task completion
+    #             )
+    #             results.append(result)
+
+    #         for result in results:
+    #             result.wait()