Merge branch 'main' into aider

wenting-zhao · wenting-zhao · commit 9cba73fe9b30 · 2024-09-21T04:31:47.000Z
diff --git a/.github/workflows/system.yml b/.github/workflows/system.yml
@@ -25,9 +25,19 @@ jobs:
       - name: Get tests
         run: uv run commit0 get-tests simpy
       - name: Test
-        run: uv run commit0 test simpy tests/test_event.py::test_succeed --reference
+        env:
+          MODAL_TOKEN_ID: ${{secrets.MODAL_TOKEN_ID}}
+          MODAL_TOKEN_SECRET: ${{secrets.MODAL_TOKEN_SECRET}}
+        run: |
+          uv run commit0 test simpy tests/test_event.py::test_succeed --reference --rebuild
+          uv run commit0 test simpy tests/test_event.py::test_succeed --reference
       - name: Evaluate
-        run: uv run commit0 evaluate --reference
+        env:
+          MODAL_TOKEN_ID: ${{secrets.MODAL_TOKEN_ID}}
+          MODAL_TOKEN_SECRET: ${{secrets.MODAL_TOKEN_SECRET}}
+        run: |
+          uv run commit0 evaluate --reference --rebuild
+          uv run commit0 evaluate --reference
       - name: Lint
         run: uv run commit0 lint commit0/harness/lint.py
       - name: Save
diff --git a/agent/cli.py b/agent/cli.py
@@ -83,7 +83,7 @@ def config(
         help="Use the user prompt instead of the default prompt",
     ),
     user_prompt: str = typer.Option(
-        "Here is your task:\nYou need to implement all functions with '    pass' and pass the unit tests.\nDo not change the names of existing functions or classes, as they may be referenced from other code like unit tests, etc.\nWhen you generate code, you must maintain the original formatting of the function stubs (such as whitespaces), otherwise we will not able to search/replace blocks for code modifications, and therefore you will receive a score of 0 for your generated code.",
+        "Here is your task:\nYou need to complete the implementations for all functions (i.e., those with pass statements) and pass the unit tests.\nDo not change the names of existing functions or classes, as they may be referenced from other code like unit tests, etc.\nWhen you generate code, you must maintain the original formatting of the function stubs (such as whitespaces), otherwise we will not able to search/replace blocks for code modifications, and therefore you will receive a score of 0 for your generated code.",
         help="User prompt to use",
     ),
     run_tests: bool = typer.Option(
diff --git a/agent/commit0_utils.py b/agent/commit0_utils.py
@@ -119,9 +119,7 @@ def get_file_info(file_path: Path, prefix: str = "") -> str:
 
 
 def get_target_edit_files(target_dir: str) -> list[str]:
-    """Find the files with the error 'NotImplementedError('IMPLEMENT ME
-    HERE')'.
-    """
+    """Find the files that contain functions with a pass statement."""
     files = []
     for root, _, filenames in os.walk(target_dir):
         for filename in filenames:
diff --git a/commit0/cli.py b/commit0/cli.py
@@ -10,6 +10,7 @@
 import commit0.harness.lint
 import commit0.harness.save
 from commit0.harness.constants import SPLIT, SPLIT_ALL
+from commit0.harness.utils import get_active_branch
 import subprocess
 import yaml
 import os
@@ -216,12 +217,15 @@ def test(
     branch: Union[str, None] = typer.Option(
         None, help="Branch to test (branch MUST be provided or use --reference)"
     ),
-    backend: str = typer.Option("local", help="Backend to use for testing"),
+    backend: str = typer.Option("modal", help="Backend to use for testing"),
     timeout: int = typer.Option(1800, help="Timeout for tests in seconds"),
     num_cpus: int = typer.Option(1, help="Number of CPUs to use"),
     reference: Annotated[
         bool, typer.Option("--reference", help="Test the reference commit.")
     ] = False,
+    rebuild: bool = typer.Option(
+        False, "--rebuild", help="Whether to rebuild an image"
+    ),
     commit0_dot_file_path: str = typer.Option(
         ".commit0.yaml",
         help="Path to the commit0 dot file, where the setup config is stored",
@@ -242,29 +246,30 @@ def test(
 
     commit0_config = read_commit0_dot_file(commit0_dot_file_path)
 
-    if not branch and not reference:
-        raise typer.BadParameter(
-            f"Invalid {highlight('BRANCH', Colors.RED)}. Either --reference or provide a branch name.",
-            param_hint="BRANCH",
-        )
     if reference:
         branch = "reference"
-    assert branch is not None, "branch is not specified"
+    if branch is None and not reference:
+        git_path = os.path.join(
+            commit0_config["base_dir"], repo_or_repo_path.split("/")[-1]
+        )
+        branch = get_active_branch(git_path)
 
-    typer.echo(f"Running tests for repository: {repo_or_repo_path}")
-    typer.echo(f"Branch: {branch}")
-    typer.echo(f"Test IDs: {test_ids}")
+    if verbose == 2:
+        typer.echo(f"Running tests for repository: {repo_or_repo_path}")
+        typer.echo(f"Branch: {branch}")
+        typer.echo(f"Test IDs: {test_ids}")
 
     commit0.harness.run_pytest_ids.main(
         commit0_config["dataset_name"],
         commit0_config["dataset_split"],
         commit0_config["base_dir"],
         repo_or_repo_path,
-        branch,
+        branch,  # type: ignore
         test_ids,
         backend,
         timeout,
         num_cpus,
+        rebuild,
         verbose,
     )
 
@@ -274,7 +279,7 @@ def evaluate(
     branch: Union[str, None] = typer.Option(
         None, help="Branch to evaluate (branch MUST be provided or use --reference)"
     ),
-    backend: str = typer.Option("local", help="Backend to use for evaluation"),
+    backend: str = typer.Option("modal", help="Backend to use for evaluation"),
     timeout: int = typer.Option(1800, help="Timeout for evaluation in seconds"),
     num_cpus: int = typer.Option(1, help="Number of CPUs to use"),
     num_workers: int = typer.Option(8, help="Number of workers to use"),
@@ -285,17 +290,12 @@ def evaluate(
         ".commit0.yaml",
         help="Path to the commit0 dot file, where the setup config is stored",
     ),
+    rebuild: bool = typer.Option(False, "--rebuild", help="Whether to rebuild images"),
 ) -> None:
     """Evaluate Commit0 split you choose in Setup Stage."""
     check_commit0_path()
-    if not branch and not reference:
-        raise typer.BadParameter(
-            f"Invalid {highlight('BRANCH', Colors.RED)}. Either --reference or provide a branch name",
-            param_hint="BRANCH",
-        )
     if reference:
         branch = "reference"
-    assert branch is not None, "branch is not specified"
 
     commit0_config = read_commit0_dot_file(commit0_dot_file_path)
     check_valid(commit0_config["repo_split"], SPLIT)
@@ -313,6 +313,7 @@ def evaluate(
         timeout,
         num_cpus,
         num_workers,
+        rebuild,
     )
 
 
diff --git a/commit0/harness/constants.py b/commit0/harness/constants.py
@@ -16,6 +16,8 @@ class Files(TypedDict):
     patch: Dict[str, Path]
 
 
+BASE_BRANCH = "commit0"
+
 # Constants - Evaluation Log Directories
 BASE_IMAGE_BUILD_DIR = Path("logs/build_images/base")
 REPO_IMAGE_BUILD_DIR = Path("logs/build_images/repo")
diff --git a/commit0/harness/evaluate.py b/commit0/harness/evaluate.py
@@ -5,12 +5,12 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datasets import load_dataset
 from tqdm import tqdm
-from typing import Iterator
+from typing import Iterator, Union
 
 from commit0.harness.run_pytest_ids import main as run_tests
 from commit0.harness.get_pytest_ids import main as get_tests
 from commit0.harness.constants import RepoInstance, SPLIT, RUN_PYTEST_LOG_DIR
-from commit0.harness.utils import get_hash_string
+from commit0.harness.utils import get_hash_string, get_active_branch
 
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
@@ -23,24 +23,28 @@ def main(
     dataset_split: str,
     repo_split: str,
     base_dir: str,
-    branch: str,
+    branch: Union[str, None],
     backend: str,
     timeout: int,
     num_cpus: int,
     num_workers: int,
+    rebuild_image: bool,
 ) -> None:
     dataset: Iterator[RepoInstance] = load_dataset(dataset_name, split=dataset_split)  # type: ignore
     repos = SPLIT[repo_split]
-    pairs = []
+    triples = []
     log_dirs = []
     for example in dataset:
         repo_name = example["repo"].split("/")[-1]
         if repo_split != "all" and repo_name not in SPLIT[repo_split]:
             continue
-        pairs.append((repo_name, example["test"]["test_dir"]))
         hashed_test_ids = get_hash_string(example["test"]["test_dir"])
+        if branch is None:
+            git_path = os.path.join(base_dir, repo_name)
+            branch = get_active_branch(git_path)
         log_dir = RUN_PYTEST_LOG_DIR / repo_name / branch / hashed_test_ids
         log_dirs.append(str(log_dir))
+        triples.append((repo_name, example["test"]["test_dir"], branch))
 
     with tqdm(total=len(repos), smoothing=0, desc="Evaluating repos") as pbar:
         with ThreadPoolExecutor(max_workers=num_workers) as executor:
@@ -57,9 +61,10 @@ def main(
                     backend,
                     timeout,
                     num_cpus,
+                    rebuild_image=rebuild_image,
                     verbose=0,
                 ): None
-                for repo, test_dir in pairs
+                for repo, test_dir, branch in triples
             }
             # Wait for each future to complete
             for future in as_completed(futures):
diff --git a/commit0/harness/execution_context.py b/commit0/harness/execution_context.py
@@ -44,6 +44,7 @@ def __init__(
         log_dir: Path,
         files_to_copy: Optional[Files] = None,
         files_to_collect: Optional[list[str]] = None,
+        rebuild_image: bool = False,
     ):
         """Create the remote execution context
 
@@ -85,6 +86,7 @@ def __init__(
         log_dir: Path,
         files_to_copy: Optional[Files] = None,
         files_to_collect: Optional[list[str]] = None,
+        rebuild_image: bool = False,
     ):
         super().__init__(
             spec,
@@ -145,6 +147,7 @@ def __init__(
         log_dir: Path,
         files_to_copy: Optional[Files] = None,
         files_to_collect: Optional[list[str]] = None,
+        rebuild_image: bool = False,
     ):
         super().__init__(
             spec,
@@ -161,7 +164,7 @@ def __init__(
         # the image must exist on dockerhub
         reponame = spec.repo.split("/")[-1]
         image_name = f"wentingzhao/{reponame}:latest".lower()
-        image = modal.Image.from_registry(image_name)
+        image = modal.Image.from_registry(image_name, force_build=rebuild_image)
         if files_to_copy:
             for _, f in files_to_copy.items():
                 image = image.copy_local_file(f["src"], f["dest"])  # type: ignore
@@ -171,14 +174,12 @@ def exec_run_with_timeout(self, command: str) -> tuple[str, bool, float]:
         """Execute command on modal sandbox"""
         start_time = time.time()
         with modal.Volume.ephemeral() as vol:
-            cp_cmd = ""
             if self.files_to_collect:
+                command += " && "
                 for fname in self.files_to_collect:
                     remote_file = Path(self.spec.repo_directory) / fname
-                    curr_cp_cmd = f" && cp {str(remote_file)} /vol/{fname} 2>/dev/null"
-                    cp_cmd += curr_cp_cmd
-
-            command += cp_cmd
+                    cp_cmd = f"test -e {str(remote_file)} && cp {str(remote_file)} /vol/{fname}; "
+                    command += cp_cmd
             self.sandbox = modal.Sandbox.create(
                 "bash",
                 "-c",
@@ -199,7 +200,9 @@ def exec_run_with_timeout(self, command: str) -> tuple[str, bool, float]:
                 timed_out = False
 
             if self.files_to_collect:
-                for fname in self.files_to_collect:
+                fnames = vol.listdir("")
+                for fname in fnames:
+                    fname = fname.path
                     with (self.log_dir / fname).open("wb") as f:
                         for data in vol.read_file(fname):
                             f.write(data)
diff --git a/commit0/harness/run_pytest_ids.py b/commit0/harness/run_pytest_ids.py
@@ -37,6 +37,7 @@ def main(
     backend: str,
     timeout: int,
     num_cpus: int,
+    rebuild_image: bool,
     verbose: int,
 ) -> None:
     """Runs the pytests for repos in a dataset.
@@ -81,15 +82,30 @@ def main(
             )
         except Exception as e:
             raise e
+    commit_id = ""
     if branch == "reference":
         commit_id = example["reference_commit"]
     else:
-        try:
-            local_repo.git.checkout(branch)
-            local_branch = local_repo.branches[branch]
-            commit_id = local_branch.commit.hexsha
-        except Exception as e:
-            raise Exception(f"Problem checking out branch {branch}.\n{e}")
+        # Check if it's a local branch
+        if branch in local_repo.branches:
+            commit_id = local_repo.commit(branch).hexsha
+        else:
+            found_remote_branch = False
+            for remote in local_repo.remotes:
+                remote.fetch()  # Fetch latest updates from each remote
+
+                # Check if the branch exists in this remote
+                for ref in remote.refs:
+                    if (
+                        ref.remote_head == branch
+                    ):  # Compare branch name without remote prefix
+                        commit_id = local_repo.commit(ref.name).hexsha
+                        found_remote_branch = True
+                        break  # Branch found, no need to keep checking this remote
+                if found_remote_branch:
+                    break  # Stop checking other remotes if branch is found
+            if not found_remote_branch:
+                raise Exception(f"Branch {branch} does not exist locally or remotely.")
     patch = generate_patch_between_commits(
         local_repo, example["base_commit"], commit_id
     )
@@ -125,7 +141,14 @@ def main(
 
     try:
         with execution_context(
-            spec, logger, timeout, num_cpus, log_dir, files_to_copy, files_to_collect
+            spec,
+            logger,
+            timeout,
+            num_cpus,
+            log_dir,
+            files_to_copy,
+            files_to_collect,
+            rebuild_image,
         ) as context:
             output, timed_out, total_runtime = context.exec_run_with_timeout(
                 "/bin/bash /eval.sh"
diff --git a/commit0/harness/setup.py b/commit0/harness/setup.py
@@ -7,7 +7,7 @@
 from commit0.harness.utils import (
     clone_repo,
 )
-from commit0.harness.constants import RepoInstance, SPLIT
+from commit0.harness.constants import BASE_BRANCH, RepoInstance, SPLIT
 
 
 logging.basicConfig(
@@ -29,7 +29,12 @@ def main(
             continue
         clone_url = f"https://github.com/{example['repo']}.git"
         clone_dir = os.path.abspath(os.path.join(base_dir, repo_name))
-        clone_repo(clone_url, clone_dir, example["base_commit"], logger)
+        branch = dataset_name.split("/")[-1]
+        repo = clone_repo(clone_url, clone_dir, branch, logger)
+        if BASE_BRANCH in repo.branches:
+            repo.git.branch("-d", BASE_BRANCH)
+        repo.git.checkout("-b", BASE_BRANCH)
+        logger.info(f"Checked out the base branch: {BASE_BRANCH}")
 
 
 __all__ = []
diff --git a/commit0/harness/utils.py b/commit0/harness/utils.py