From a226f18e68e60b35588abd285a45ab1e7f5f6add Mon Sep 17 00:00:00 2001 From: Wenting Zhao Date: Sat, 21 Sep 2024 03:27:20 +0000 Subject: [PATCH 1/4] cleanup; remove checkout during test --- commit0/harness/constants.py | 1 + commit0/harness/run_pytest_ids.py | 24 ++++++++++++++++++------ commit0/harness/setup.py | 9 +++++++-- commit0/harness/utils.py | 11 +++++------ 4 files changed, 31 insertions(+), 14 deletions(-) diff --git a/commit0/harness/constants.py b/commit0/harness/constants.py index f2957fa..dfaba33 100644 --- a/commit0/harness/constants.py +++ b/commit0/harness/constants.py @@ -15,6 +15,7 @@ class Files(TypedDict): eval_script: Dict[str, Path] patch: Dict[str, Path] +BASE_BRANCH = "commit0" # Constants - Evaluation Log Directories BASE_IMAGE_BUILD_DIR = Path("logs/build_images/base") diff --git a/commit0/harness/run_pytest_ids.py b/commit0/harness/run_pytest_ids.py index 09fbe45..b76a75e 100644 --- a/commit0/harness/run_pytest_ids.py +++ b/commit0/harness/run_pytest_ids.py @@ -85,12 +85,24 @@ def main( if branch == "reference": commit_id = example["reference_commit"] else: - try: - local_repo.git.checkout(branch) - local_branch = local_repo.branches[branch] - commit_id = local_branch.commit.hexsha - except Exception as e: - raise Exception(f"Problem checking out branch {branch}.\n{e}") + # Check if it's a local branch + if branch in local_repo.branches: + commit_id = local_repo.commit(branch) + else: + found_remote_branch = False + for remote in local_repo.remotes: + remote.fetch() # Fetch latest updates from each remote + + # Check if the branch exists in this remote + for ref in remote.refs: + if ref.remote_head == branch: # Compare branch name without remote prefix + commit_id = local_repo.commit(ref.name) + found_remote_branch = True + break # Branch found, no need to keep checking this remote + if found_remote_branch: + break # Stop checking other remotes if branch is found + if not found_remote_branch: + raise Exception(f"Branch {branch} does not exist locally or remotely.") patch = generate_patch_between_commits( local_repo, example["base_commit"], commit_id ) diff --git a/commit0/harness/setup.py b/commit0/harness/setup.py index edaadcd..22ed00b 100644 --- a/commit0/harness/setup.py +++ b/commit0/harness/setup.py @@ -7,7 +7,7 @@ from commit0.harness.utils import ( clone_repo, ) -from commit0.harness.constants import RepoInstance, SPLIT +from commit0.harness.constants import BASE_BRANCH, RepoInstance, SPLIT logging.basicConfig( @@ -29,7 +29,12 @@ def main( continue clone_url = f"https://github.com/{example['repo']}.git" clone_dir = os.path.abspath(os.path.join(base_dir, repo_name)) - clone_repo(clone_url, clone_dir, example["base_commit"], logger) + branch = dataset_name.split('/')[-1] + repo = clone_repo(clone_url, clone_dir, branch, logger) + if BASE_BRANCH in repo.branches: + repo.git.branch('-d', BASE_BRANCH) + repo.git.checkout('-b', BASE_BRANCH) + logger.info("Checked out the base commit: commit 0") __all__ = [] diff --git a/commit0/harness/utils.py b/commit0/harness/utils.py index 8836415..c57a566 100644 --- a/commit0/harness/utils.py +++ b/commit0/harness/utils.py @@ -85,7 +85,7 @@ def extract_test_output(ss: str, pattern: str) -> str: def clone_repo( - clone_url: str, clone_dir: str, commit: str, logger: logging.Logger + clone_url: str, clone_dir: str, branch: str, logger: logging.Logger ) -> git.Repo: """Clone repo into the specified directory if it does not already exist. @@ -98,8 +98,8 @@ def clone_repo( URL of the repository to clone. clone_dir : str Directory where the repository will be cloned. - commit : str - The commit hash or branch/tag name to checkout. + branch : str + The branch/tag name to checkout. logger : logging.Logger The logger object. @@ -129,11 +129,10 @@ def clone_repo( except git.exc.GitCommandError as e: raise RuntimeError(f"Failed to clone repository: {e}") - logger.info(f"Checking out {commit}") try: - repo.git.checkout(commit) + repo.git.checkout(branch) except git.exc.GitCommandError as e: - raise RuntimeError(f"Failed to check out {commit}: {e}") + raise RuntimeError(f"Failed to check out {branch}: {e}") return repo From 3794ffcc273a2bdaa8f56ff533a8ee4c86b02a6a Mon Sep 17 00:00:00 2001 From: Wenting Zhao Date: Sat, 21 Sep 2024 03:30:25 +0000 Subject: [PATCH 2/4] precommit fixes --- commit0/harness/constants.py | 1 + commit0/harness/run_pytest_ids.py | 9 ++++++--- commit0/harness/setup.py | 6 +++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/commit0/harness/constants.py b/commit0/harness/constants.py index dfaba33..9a92337 100644 --- a/commit0/harness/constants.py +++ b/commit0/harness/constants.py @@ -15,6 +15,7 @@ class Files(TypedDict): eval_script: Dict[str, Path] patch: Dict[str, Path] + BASE_BRANCH = "commit0" # Constants - Evaluation Log Directories diff --git a/commit0/harness/run_pytest_ids.py b/commit0/harness/run_pytest_ids.py index b76a75e..fd45080 100644 --- a/commit0/harness/run_pytest_ids.py +++ b/commit0/harness/run_pytest_ids.py @@ -82,12 +82,13 @@ def main( ) except Exception as e: raise e + commit_id = "" if branch == "reference": commit_id = example["reference_commit"] else: # Check if it's a local branch if branch in local_repo.branches: - commit_id = local_repo.commit(branch) + commit_id = local_repo.commit(branch).hexsha else: found_remote_branch = False for remote in local_repo.remotes: @@ -95,8 +96,10 @@ def main( # Check if the branch exists in this remote for ref in remote.refs: - if ref.remote_head == branch: # Compare branch name without remote prefix - commit_id = local_repo.commit(ref.name) + if ( + ref.remote_head == branch + ): # Compare branch name without remote prefix + commit_id = local_repo.commit(ref.name).hexsha found_remote_branch = True break # Branch found, no need to keep checking this remote if found_remote_branch: diff --git a/commit0/harness/setup.py b/commit0/harness/setup.py index 22ed00b..b355572 100644 --- a/commit0/harness/setup.py +++ b/commit0/harness/setup.py @@ -29,11 +29,11 @@ def main( continue clone_url = f"https://github.com/{example['repo']}.git" clone_dir = os.path.abspath(os.path.join(base_dir, repo_name)) - branch = dataset_name.split('/')[-1] + branch = dataset_name.split("/")[-1] repo = clone_repo(clone_url, clone_dir, branch, logger) if BASE_BRANCH in repo.branches: - repo.git.branch('-d', BASE_BRANCH) - repo.git.checkout('-b', BASE_BRANCH) + repo.git.branch("-d", BASE_BRANCH) + repo.git.checkout("-b", BASE_BRANCH) logger.info("Checked out the base commit: commit 0") From 4b95135182bb92a6d0c1cf6316b1457de34225de Mon Sep 17 00:00:00 2001 From: Wenting Zhao Date: Sat, 21 Sep 2024 04:11:25 +0000 Subject: [PATCH 3/4] when branch is not specified, defaults to active branch. --- commit0/cli.py | 16 ++++------------ commit0/harness/evaluate.py | 5 ++++- commit0/harness/utils.py | 26 +++++++++++++++++++++++++- 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/commit0/cli.py b/commit0/cli.py index 8c82b08..c023b02 100644 --- a/commit0/cli.py +++ b/commit0/cli.py @@ -10,6 +10,7 @@ import commit0.harness.lint import commit0.harness.save from commit0.harness.constants import SPLIT, SPLIT_ALL +from commit0.harness.utils import get_active_branch import subprocess import yaml import os @@ -245,14 +246,11 @@ def test( commit0_config = read_commit0_dot_file(commit0_dot_file_path) - if not branch and not reference: - raise typer.BadParameter( - f"Invalid {highlight('BRANCH', Colors.RED)}. Either --reference or provide a branch name.", - param_hint="BRANCH", - ) if reference: branch = "reference" - assert branch is not None, "branch is not specified" + if branch is None and not reference: + git_path = os.path.join(commit0_config["base_dir"], repo_or_repo_path.split("/")[-1]) + branch = get_active_branch(git_path) if verbose == 2: typer.echo(f"Running tests for repository: {repo_or_repo_path}") @@ -294,14 +292,8 @@ def evaluate( ) -> None: """Evaluate Commit0 split you choose in Setup Stage.""" check_commit0_path() - if not branch and not reference: - raise typer.BadParameter( - f"Invalid {highlight('BRANCH', Colors.RED)}. Either --reference or provide a branch name", - param_hint="BRANCH", - ) if reference: branch = "reference" - assert branch is not None, "branch is not specified" commit0_config = read_commit0_dot_file(commit0_dot_file_path) check_valid(commit0_config["repo_split"], SPLIT) diff --git a/commit0/harness/evaluate.py b/commit0/harness/evaluate.py index 11dbc98..7776c4b 100644 --- a/commit0/harness/evaluate.py +++ b/commit0/harness/evaluate.py @@ -10,7 +10,7 @@ from commit0.harness.run_pytest_ids import main as run_tests from commit0.harness.get_pytest_ids import main as get_tests from commit0.harness.constants import RepoInstance, SPLIT, RUN_PYTEST_LOG_DIR -from commit0.harness.utils import get_hash_string +from commit0.harness.utils import get_hash_string, get_active_branch logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -40,6 +40,9 @@ def main( continue pairs.append((repo_name, example["test"]["test_dir"])) hashed_test_ids = get_hash_string(example["test"]["test_dir"]) + if branch is None: + git_path = os.path.join(base_dir, repo_name) + branch = get_active_branch(git_path) log_dir = RUN_PYTEST_LOG_DIR / repo_name / branch / hashed_test_ids log_dirs.append(str(log_dir)) diff --git a/commit0/harness/utils.py b/commit0/harness/utils.py index c57a566..c1fb257 100644 --- a/commit0/harness/utils.py +++ b/commit0/harness/utils.py @@ -6,7 +6,7 @@ import time import sys from pathlib import Path -from typing import Optional +from typing import Optional, Union from fastcore.net import HTTP404NotFoundError, HTTP403ForbiddenError # type: ignore from ghapi.core import GhApi @@ -189,4 +189,28 @@ def generate_patch_between_commits( raise Exception(f"Error generating patch: {e}") +def get_active_branch(repo_path: Union[str, Path]) -> str: + """ + Retrieve the current active branch of a Git repository. + + Args: + repo_path (Path): The path to git repo. + + Returns: + str: The name of the active branch. + + Raises: + Exception: If the repository is in a detached HEAD state. + """ + repo = git.Repo(repo_path) + try: + # Get the current active branch + branch = repo.active_branch.name + except TypeError as e: + raise Exception(f"{e}\nThis means the repository is in a detached HEAD state. " + "To proceed, please specify a valid branch.") + + return branch + + __all__ = [] From ca81e37827b6d8951021911c99b3ec991af43568 Mon Sep 17 00:00:00 2001 From: Wenting Zhao Date: Sat, 21 Sep 2024 04:17:25 +0000 Subject: [PATCH 4/4] pre-commit --- commit0/cli.py | 6 ++++-- commit0/harness/evaluate.py | 10 +++++----- commit0/harness/utils.py | 13 +++++++++---- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/commit0/cli.py b/commit0/cli.py index c023b02..c0d8596 100644 --- a/commit0/cli.py +++ b/commit0/cli.py @@ -249,7 +249,9 @@ def test( if reference: branch = "reference" if branch is None and not reference: - git_path = os.path.join(commit0_config["base_dir"], repo_or_repo_path.split("/")[-1]) + git_path = os.path.join( + commit0_config["base_dir"], repo_or_repo_path.split("/")[-1] + ) branch = get_active_branch(git_path) if verbose == 2: @@ -262,7 +264,7 @@ def test( commit0_config["dataset_split"], commit0_config["base_dir"], repo_or_repo_path, - branch, + branch, # type: ignore test_ids, backend, timeout, diff --git a/commit0/harness/evaluate.py b/commit0/harness/evaluate.py index 7776c4b..ddc4b15 100644 --- a/commit0/harness/evaluate.py +++ b/commit0/harness/evaluate.py @@ -5,7 +5,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from datasets import load_dataset from tqdm import tqdm -from typing import Iterator +from typing import Iterator, Union from commit0.harness.run_pytest_ids import main as run_tests from commit0.harness.get_pytest_ids import main as get_tests @@ -23,7 +23,7 @@ def main( dataset_split: str, repo_split: str, base_dir: str, - branch: str, + branch: Union[str, None], backend: str, timeout: int, num_cpus: int, @@ -32,19 +32,19 @@ def main( ) -> None: dataset: Iterator[RepoInstance] = load_dataset(dataset_name, split=dataset_split) # type: ignore repos = SPLIT[repo_split] - pairs = [] + triples = [] log_dirs = [] for example in dataset: repo_name = example["repo"].split("/")[-1] if repo_split != "all" and repo_name not in SPLIT[repo_split]: continue - pairs.append((repo_name, example["test"]["test_dir"])) hashed_test_ids = get_hash_string(example["test"]["test_dir"]) if branch is None: git_path = os.path.join(base_dir, repo_name) branch = get_active_branch(git_path) log_dir = RUN_PYTEST_LOG_DIR / repo_name / branch / hashed_test_ids log_dirs.append(str(log_dir)) + triples.append((repo_name, example["test"]["test_dir"], branch)) with tqdm(total=len(repos), smoothing=0, desc="Evaluating repos") as pbar: with ThreadPoolExecutor(max_workers=num_workers) as executor: @@ -64,7 +64,7 @@ def main( rebuild_image=rebuild_image, verbose=0, ): None - for repo, test_dir in pairs + for repo, test_dir, branch in triples } # Wait for each future to complete for future in as_completed(futures): diff --git a/commit0/harness/utils.py b/commit0/harness/utils.py index c1fb257..b006035 100644 --- a/commit0/harness/utils.py +++ b/commit0/harness/utils.py @@ -190,25 +190,30 @@ def generate_patch_between_commits( def get_active_branch(repo_path: Union[str, Path]) -> str: - """ - Retrieve the current active branch of a Git repository. + """Retrieve the current active branch of a Git repository. Args: + ---- repo_path (Path): The path to git repo. Returns: + ------- str: The name of the active branch. Raises: + ------ Exception: If the repository is in a detached HEAD state. + """ repo = git.Repo(repo_path) try: # Get the current active branch branch = repo.active_branch.name except TypeError as e: - raise Exception(f"{e}\nThis means the repository is in a detached HEAD state. " - "To proceed, please specify a valid branch.") + raise Exception( + f"{e}\nThis means the repository is in a detached HEAD state. " + "To proceed, please specify a valid branch by using --branch {branch}." + ) return branch