From 09760ca61a8f13af5f0d24cbe88dc2b9bbc8d482 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 30 Apr 2025 10:25:35 -0400 Subject: [PATCH 1/3] Add a helper script to bisect a regression --- bench_runner/__main__.py | 1 + bench_runner/git.py | 9 ++ bench_runner/scripts/bisect.py | 232 +++++++++++++++++++++++++++++++ bench_runner/scripts/workflow.py | 11 +- bench_runner/util.py | 19 +++ 5 files changed, 270 insertions(+), 2 deletions(-) create mode 100644 bench_runner/scripts/bisect.py diff --git a/bench_runner/__main__.py b/bench_runner/__main__.py index b843b9d3..3a02d91f 100644 --- a/bench_runner/__main__.py +++ b/bench_runner/__main__.py @@ -7,6 +7,7 @@ COMMANDS = { "backfill": "Schedule benchmarking a number of commits", + "bisect": "Run a bisect to find the commit that caused a regression", "compare": "Compare a matrix of specific results", "find_failures": "Find the benchmarks that failed in the last weekly run", "generate_results": "Create all of the derived artifacts from raw data", diff --git a/bench_runner/git.py b/bench_runner/git.py index 7f8120ed..9944c6d7 100644 --- a/bench_runner/git.py +++ b/bench_runner/git.py @@ -168,3 +168,12 @@ def clone( if depth is not None: args += ["--depth", str(depth)] subprocess.check_call(args) + + +def checkout(dirname: PathLike, ref: str) -> None: + dirname = Path(dirname) + + subprocess.check_call( + ["git", "checkout", ref], + cwd=dirname, + ) diff --git a/bench_runner/scripts/bisect.py b/bench_runner/scripts/bisect.py new file mode 100644 index 00000000..740e5e18 --- /dev/null +++ b/bench_runner/scripts/bisect.py @@ -0,0 +1,232 @@ +import argparse +import contextlib +from pathlib import Path +import subprocess +import sys +import traceback + + +import numpy as np +import rich_argparse + + +from bench_runner import flags as mflags +from bench_runner import git +from bench_runner import result +from bench_runner.scripts import run_benchmarks +from bench_runner.scripts import workflow +from bench_runner.util import PathLike, format_seconds + + +def _get_result_commandline( + benchmark: str, + good_val: float, + bad_val: float, + pgo: bool, + flags: str, + repo: PathLike, +) -> list[str]: + repo = Path(repo) + + return [ + sys.executable, + "-m", + "bench_runner.scripts.bisect", + benchmark, + str(good_val), + str(bad_val), + str(pgo), + str(flags), + str(repo.absolute()), + ] + + +def parse_result(benchmark_json: PathLike) -> float: + # The name of the benchmark in the JSON file may be different from the one + # used to select the benchmark. Therefore, just take the mean of all the + # benchmarks in the JSON file. + r = result.Result.from_arbitrary_filename(benchmark_json) + timing_data = r.get_timing_data() + return float(np.mean([x.mean() for x in timing_data.values()])) + + +def get_result( + benchmark: str, + pgo: bool = False, + flags: str = "", + cpython: PathLike = Path("cpython"), + reconfigure: bool = False, +) -> float: + cpython = Path(cpython) + + if pgo or reconfigure: + # Jumping around through commits with PGO can leave stale PGO data + # around, so we need to clean it each time. We also always do it the + # first time in case the *last* bisect run used pgo. + subprocess.run(["make", "clean"], cwd=cpython) + + workflow.compile_unix(cpython, mflags.parse_flags(flags), pgo, False, reconfigure) + run_benchmarks.run_benchmarks(cpython / "python", benchmark) + timing = parse_result(run_benchmarks.BENCHMARK_JSON) + + return timing + + +def get_log_file() -> Path: + return Path("bisect_log.txt") + + +def delete_log() -> None: + bisect_log = get_log_file() + if bisect_log.is_file(): + bisect_log.unlink() + + +def show_log() -> None: + bisect_log = get_log_file() + + print() + print("Bisect log:") + + with get_log_file().open("r") as f: + for line in f.readlines(): + print(line.strip()) + + +def log(message: str) -> None: + with get_log_file().open("a") as f: + f.write(f"{message}\n") + + +def _main( + benchmark: str, + good: str, + bad: str, + pgo: bool = False, + flags: str = "", + repo: PathLike = Path("."), + cpython: PathLike = Path("cpython"), +): + repo = Path(repo).absolute() + cpython = Path(cpython).absolute() + + delete_log() + + if not cpython.is_dir(): + git.clone( + cpython, "https://github.com/python/cpython.git", branch="main", depth=None + ) + + git.checkout(cpython, good) + good_timing = get_result(benchmark, pgo, flags, cpython=cpython, reconfigure=True) + log(f"KNOWN GOOD ({good[:7]}): {format_seconds(good_timing)}") + + git.checkout(cpython, bad) + bad_timing = get_result(benchmark, pgo, flags, cpython=cpython) + log(f"KNOWN BAD ({bad[:7]}): {format_seconds(bad_timing)}") + + try: + with contextlib.chdir(cpython): + subprocess.run(["git", "bisect", "start"]) + subprocess.run(["git", "bisect", "bad", bad]) + subprocess.run(["git", "bisect", "good", good]) + subprocess.run( + ["git", "bisect", "run"] + + _get_result_commandline( + benchmark, good_timing, bad_timing, pgo, flags, repo + ) + ) + finally: + show_log() + delete_log() + + +def main(): + # This is the entry point for the user + + parser = argparse.ArgumentParser( + description=""" + Run bisect on a benchmark to find the first regressing commit. + + A full checkout of CPython should be in the cpython directory. + If it doesn't exist, it will be cloned. + """, + formatter_class=rich_argparse.ArgumentDefaultsRichHelpFormatter, + ) + parser.add_argument( + "benchmark", + type=str, + help="The benchmark to run bisect on.", + ) + parser.add_argument( + "good", + type=str, + help="The good commit hash for the bisect.", + ) + parser.add_argument( + "bad", + type=str, + help="The bad commit hash for the bisect.", + ) + parser.add_argument( + "--pgo", + action="store_true", + ) + parser.add_argument( + "--flags", + type=str, + default="", + ) + + args = parser.parse_args() + + _main(args.benchmark, args.good, args.bad, args.pgo, args.flags) + + +if __name__ == "__main__": + # This is the entry point when we are called from `git bisect run` itself + + parser = argparse.ArgumentParser() + parser.add_argument("benchmark", type=str) + parser.add_argument("good_val", type=float) + parser.add_argument("bad_val", type=float) + parser.add_argument("pgo", type=str) + parser.add_argument("flags", type=str) + parser.add_argument("repo", type=str) + args = parser.parse_args() + + mid_point = (args.good_val + args.bad_val) / 2.0 + + repo = Path(args.repo) + cpython = repo / "cpython" + + try: + with contextlib.chdir(repo): + timing = get_result( + args.benchmark, args.pgo == "True", args.flags, cpython=cpython + ) + except Exception as e: + # If there was any exception, display that exception and traceback and + # then abort the git bisect with -1 + traceback.print_exception(e) + sys.exit(-1) + + # The confidence is 0.0 at the mid-point, 1.0 at the good and bad values, + # and > 1.0 outside of that. + confidence = abs((timing - mid_point) / ((args.bad_val - args.good_val) / 2.0)) + + with contextlib.chdir(repo): + if timing > mid_point: + log( + f"BISECT BAD ({git.get_git_hash(cpython)[:7]}): " + f"{format_seconds(timing)} (confidence {confidence:.02f})" + ) + print(f"BAD: {timing} vs. ({args.good_val}, {args.bad_val})") + sys.exit(1) + else: + log( + f"BISECT GOOD ({git.get_git_hash(cpython)[:7]}): " + f"{format_seconds(timing)} (confidence {confidence:.02f})" + ) + print(f"GOOD: {timing} vs. ({args.good_val}, {args.bad_val})") + sys.exit(0) diff --git a/bench_runner/scripts/workflow.py b/bench_runner/scripts/workflow.py index fe6baf0b..5e9ca59e 100644 --- a/bench_runner/scripts/workflow.py +++ b/bench_runner/scripts/workflow.py @@ -130,7 +130,13 @@ def checkout_benchmarks(): ) -def compile_unix(cpython: PathLike, flags: list[str], pgo: bool, pystats: bool) -> None: +def compile_unix( + cpython: PathLike, + flags: list[str], + pgo: bool, + pystats: bool, + reconfigure: bool = True, +) -> None: cpython = Path(cpython) cfg = config.get_config_for_current_runner() @@ -166,7 +172,8 @@ def compile_unix(cpython: PathLike, flags: list[str], pgo: bool, pystats: bool) make_args.extend(["-j"]) with contextlib.chdir(cpython): - subprocess.check_call(["./configure", *args], env=env) + if reconfigure: + subprocess.check_call(["./configure", *args], env=env) subprocess.check_call(["make", *make_args], env=env) diff --git a/bench_runner/util.py b/bench_runner/util.py index af711a2d..b600d826 100644 --- a/bench_runner/util.py +++ b/bench_runner/util.py @@ -87,6 +87,25 @@ def get_simple_platform() -> Literal["linux", "macos", "windows"]: raise RuntimeError(f"Unsupported platform {sys.platform}.") +def format_seconds(value: float) -> str: + """ + Given a float value in seconds, formats it into a human-readable string with the appropriate precision. + """ + _TIMEDELTA_UNITS = ("sec", "ms", "us", "ns") + + for i in range(2, -9, -1): + if value >= 10.0**i: + break + else: + i = -9 + + precision = 2 - i % 3 + k = -(i // 3) if i < 0 else 0 + factor = 10 ** (k * 3) + unit = _TIMEDELTA_UNITS[k] + return f"{value * factor:.{precision}f} {unit}" + + if os.getenv("GITHUB_ACTIONS") == "true": @contextlib.contextmanager From ec0eed3cdc8d98d59705b6abede188479ae7cd58 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 30 Apr 2025 10:46:32 -0400 Subject: [PATCH 2/3] Lint --- bench_runner/git.py | 2 +- bench_runner/scripts/bisect.py | 2 -- bench_runner/util.py | 3 ++- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/bench_runner/git.py b/bench_runner/git.py index 9944c6d7..8070c398 100644 --- a/bench_runner/git.py +++ b/bench_runner/git.py @@ -138,7 +138,7 @@ def clone( url: str, *, branch: str | None = None, - depth: int = 1, + depth: int | None = 1, ) -> None: is_hash = re.match(r"^[0-9a-f]{40}$", branch) if branch else False diff --git a/bench_runner/scripts/bisect.py b/bench_runner/scripts/bisect.py index 740e5e18..4a8fc190 100644 --- a/bench_runner/scripts/bisect.py +++ b/bench_runner/scripts/bisect.py @@ -83,8 +83,6 @@ def delete_log() -> None: def show_log() -> None: - bisect_log = get_log_file() - print() print("Bisect log:") diff --git a/bench_runner/util.py b/bench_runner/util.py index b600d826..d2332664 100644 --- a/bench_runner/util.py +++ b/bench_runner/util.py @@ -89,7 +89,8 @@ def get_simple_platform() -> Literal["linux", "macos", "windows"]: def format_seconds(value: float) -> str: """ - Given a float value in seconds, formats it into a human-readable string with the appropriate precision. + Given a float value in seconds, formats it into a human-readable string with + the appropriate precision. """ _TIMEDELTA_UNITS = ("sec", "ms", "us", "ns") From faa685fad6354f8ef022d6bd6405abbf2f54e1b8 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 30 Apr 2025 11:38:16 -0400 Subject: [PATCH 3/3] Fix tests --- bench_runner/__main__.py | 12 +++++++++++- bench_runner/scripts/workflow.py | 9 --------- tests/test_run_benchmarks.py | 12 +++++++++--- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/bench_runner/__main__.py b/bench_runner/__main__.py index 3a02d91f..9b191221 100644 --- a/bench_runner/__main__.py +++ b/bench_runner/__main__.py @@ -1,4 +1,5 @@ import importlib +import os import sys @@ -25,6 +26,15 @@ } if __name__ == "__main__": + # This lets pytest-cov collect coverage data in a subprocess + if "COV_CORE_SOURCE" in os.environ: + try: + from pytest_cov.embed import init + + init() + except Exception: + sys.stderr.write("pytest-cov: Failed to setup subprocess coverage.") + command = len(sys.argv) >= 2 and sys.argv[1] or "" if command not in COMMANDS: @@ -37,5 +47,5 @@ sys.exit(1) sys.argv = [sys.argv[0], *sys.argv[2:]] - mod = importlib.import_module(f"bench_runner.scripts.{command}") + mod = importlib.import_module(f".{command}", "bench_runner.scripts") mod.main() diff --git a/bench_runner/scripts/workflow.py b/bench_runner/scripts/workflow.py index 5e9ca59e..254df840 100644 --- a/bench_runner/scripts/workflow.py +++ b/bench_runner/scripts/workflow.py @@ -398,13 +398,4 @@ def main(): if __name__ == "__main__": - # This lets pytest-cov collect coverage data in a subprocess - if "COV_CORE_SOURCE" in os.environ: - try: - from pytest_cov.embed import init - - init() - except Exception: - sys.stderr.write("pytest-cov: Failed to setup subprocess coverage.") - main() diff --git a/tests/test_run_benchmarks.py b/tests/test_run_benchmarks.py index 4a5896c3..3cdf6572 100644 --- a/tests/test_run_benchmarks.py +++ b/tests/test_run_benchmarks.py @@ -83,7 +83,9 @@ def test_run_benchmarks(benchmarks_checkout, monkeypatch): subprocess.check_call( [ venv_python, - run_benchmarks.__file__, + "-m", + "bench_runner", + "run_benchmarks", "benchmark", sys.executable, "python", @@ -132,7 +134,9 @@ def test_run_benchmarks(benchmarks_checkout, monkeypatch): returncode = subprocess.call( [ venv_python, - run_benchmarks.__file__, + "-m", + "bench_runner", + "run_benchmarks", "benchmark", sys.executable, "python", @@ -164,7 +168,9 @@ def test_run_benchmarks_flags(benchmarks_checkout): subprocess.check_call( [ venv_python, - run_benchmarks.__file__, + "-m", + "bench_runner", + "run_benchmarks", "benchmark", sys.executable, "python",