From 09760ca61a8f13af5f0d24cbe88dc2b9bbc8d482 Mon Sep 17 00:00:00 2001
From: Michael Droettboom <mdboom@gmail.com>
Date: Wed, 30 Apr 2025 10:25:35 -0400
Subject: [PATCH 1/3] Add a helper script to bisect a regression

---
 bench_runner/__main__.py         |   1 +
 bench_runner/git.py              |   9 ++
 bench_runner/scripts/bisect.py   | 232 +++++++++++++++++++++++++++++++
 bench_runner/scripts/workflow.py |  11 +-
 bench_runner/util.py             |  19 +++
 5 files changed, 270 insertions(+), 2 deletions(-)
 create mode 100644 bench_runner/scripts/bisect.py

diff --git a/bench_runner/__main__.py b/bench_runner/__main__.py
index b843b9d3..3a02d91f 100644
--- a/bench_runner/__main__.py
+++ b/bench_runner/__main__.py
@@ -7,6 +7,7 @@
 
 COMMANDS = {
     "backfill": "Schedule benchmarking a number of commits",
+    "bisect": "Run a bisect to find the commit that caused a regression",
     "compare": "Compare a matrix of specific results",
     "find_failures": "Find the benchmarks that failed in the last weekly run",
     "generate_results": "Create all of the derived artifacts from raw data",
diff --git a/bench_runner/git.py b/bench_runner/git.py
index 7f8120ed..9944c6d7 100644
--- a/bench_runner/git.py
+++ b/bench_runner/git.py
@@ -168,3 +168,12 @@ def clone(
         if depth is not None:
             args += ["--depth", str(depth)]
         subprocess.check_call(args)
+
+
+def checkout(dirname: PathLike, ref: str) -> None:
+    dirname = Path(dirname)
+
+    subprocess.check_call(
+        ["git", "checkout", ref],
+        cwd=dirname,
+    )
diff --git a/bench_runner/scripts/bisect.py b/bench_runner/scripts/bisect.py
new file mode 100644
index 00000000..740e5e18
--- /dev/null
+++ b/bench_runner/scripts/bisect.py
@@ -0,0 +1,232 @@
+import argparse
+import contextlib
+from pathlib import Path
+import subprocess
+import sys
+import traceback
+
+
+import numpy as np
+import rich_argparse
+
+
+from bench_runner import flags as mflags
+from bench_runner import git
+from bench_runner import result
+from bench_runner.scripts import run_benchmarks
+from bench_runner.scripts import workflow
+from bench_runner.util import PathLike, format_seconds
+
+
+def _get_result_commandline(
+    benchmark: str,
+    good_val: float,
+    bad_val: float,
+    pgo: bool,
+    flags: str,
+    repo: PathLike,
+) -> list[str]:
+    repo = Path(repo)
+
+    return [
+        sys.executable,
+        "-m",
+        "bench_runner.scripts.bisect",
+        benchmark,
+        str(good_val),
+        str(bad_val),
+        str(pgo),
+        str(flags),
+        str(repo.absolute()),
+    ]
+
+
+def parse_result(benchmark_json: PathLike) -> float:
+    # The name of the benchmark in the JSON file may be different from the one
+    # used to select the benchmark. Therefore, just take the mean of all the
+    # benchmarks in the JSON file.
+    r = result.Result.from_arbitrary_filename(benchmark_json)
+    timing_data = r.get_timing_data()
+    return float(np.mean([x.mean() for x in timing_data.values()]))
+
+
+def get_result(
+    benchmark: str,
+    pgo: bool = False,
+    flags: str = "",
+    cpython: PathLike = Path("cpython"),
+    reconfigure: bool = False,
+) -> float:
+    cpython = Path(cpython)
+
+    if pgo or reconfigure:
+        # Jumping around through commits with PGO can leave stale PGO data
+        # around, so we need to clean it each time.  We also always do it the
+        # first time in case the *last* bisect run used pgo.
+        subprocess.run(["make", "clean"], cwd=cpython)
+
+    workflow.compile_unix(cpython, mflags.parse_flags(flags), pgo, False, reconfigure)
+    run_benchmarks.run_benchmarks(cpython / "python", benchmark)
+    timing = parse_result(run_benchmarks.BENCHMARK_JSON)
+
+    return timing
+
+
+def get_log_file() -> Path:
+    return Path("bisect_log.txt")
+
+
+def delete_log() -> None:
+    bisect_log = get_log_file()
+    if bisect_log.is_file():
+        bisect_log.unlink()
+
+
+def show_log() -> None:
+    bisect_log = get_log_file()
+
+    print()
+    print("Bisect log:")
+
+    with get_log_file().open("r") as f:
+        for line in f.readlines():
+            print(line.strip())
+
+
+def log(message: str) -> None:
+    with get_log_file().open("a") as f:
+        f.write(f"{message}\n")
+
+
+def _main(
+    benchmark: str,
+    good: str,
+    bad: str,
+    pgo: bool = False,
+    flags: str = "",
+    repo: PathLike = Path("."),
+    cpython: PathLike = Path("cpython"),
+):
+    repo = Path(repo).absolute()
+    cpython = Path(cpython).absolute()
+
+    delete_log()
+
+    if not cpython.is_dir():
+        git.clone(
+            cpython, "https://github.com/python/cpython.git", branch="main", depth=None
+        )
+
+    git.checkout(cpython, good)
+    good_timing = get_result(benchmark, pgo, flags, cpython=cpython, reconfigure=True)
+    log(f"KNOWN  GOOD ({good[:7]}): {format_seconds(good_timing)}")
+
+    git.checkout(cpython, bad)
+    bad_timing = get_result(benchmark, pgo, flags, cpython=cpython)
+    log(f"KNOWN  BAD  ({bad[:7]}): {format_seconds(bad_timing)}")
+
+    try:
+        with contextlib.chdir(cpython):
+            subprocess.run(["git", "bisect", "start"])
+            subprocess.run(["git", "bisect", "bad", bad])
+            subprocess.run(["git", "bisect", "good", good])
+            subprocess.run(
+                ["git", "bisect", "run"]
+                + _get_result_commandline(
+                    benchmark, good_timing, bad_timing, pgo, flags, repo
+                )
+            )
+    finally:
+        show_log()
+        delete_log()
+
+
+def main():
+    # This is the entry point for the user
+
+    parser = argparse.ArgumentParser(
+        description="""
+        Run bisect on a benchmark to find the first regressing commit.
+
+        A full checkout of CPython should be in the cpython directory.
+        If it doesn't exist, it will be cloned.
+        """,
+        formatter_class=rich_argparse.ArgumentDefaultsRichHelpFormatter,
+    )
+    parser.add_argument(
+        "benchmark",
+        type=str,
+        help="The benchmark to run bisect on.",
+    )
+    parser.add_argument(
+        "good",
+        type=str,
+        help="The good commit hash for the bisect.",
+    )
+    parser.add_argument(
+        "bad",
+        type=str,
+        help="The bad commit hash for the bisect.",
+    )
+    parser.add_argument(
+        "--pgo",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--flags",
+        type=str,
+        default="",
+    )
+
+    args = parser.parse_args()
+
+    _main(args.benchmark, args.good, args.bad, args.pgo, args.flags)
+
+
+if __name__ == "__main__":
+    # This is the entry point when we are called from `git bisect run` itself
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("benchmark", type=str)
+    parser.add_argument("good_val", type=float)
+    parser.add_argument("bad_val", type=float)
+    parser.add_argument("pgo", type=str)
+    parser.add_argument("flags", type=str)
+    parser.add_argument("repo", type=str)
+    args = parser.parse_args()
+
+    mid_point = (args.good_val + args.bad_val) / 2.0
+
+    repo = Path(args.repo)
+    cpython = repo / "cpython"
+
+    try:
+        with contextlib.chdir(repo):
+            timing = get_result(
+                args.benchmark, args.pgo == "True", args.flags, cpython=cpython
+            )
+    except Exception as e:
+        # If there was any exception, display that exception and traceback and
+        # then abort the git bisect with -1
+        traceback.print_exception(e)
+        sys.exit(-1)
+
+    # The confidence is 0.0 at the mid-point, 1.0 at the good and bad values,
+    # and > 1.0 outside of that.
+    confidence = abs((timing - mid_point) / ((args.bad_val - args.good_val) / 2.0))
+
+    with contextlib.chdir(repo):
+        if timing > mid_point:
+            log(
+                f"BISECT BAD  ({git.get_git_hash(cpython)[:7]}): "
+                f"{format_seconds(timing)} (confidence {confidence:.02f})"
+            )
+            print(f"BAD: {timing} vs. ({args.good_val}, {args.bad_val})")
+            sys.exit(1)
+        else:
+            log(
+                f"BISECT GOOD ({git.get_git_hash(cpython)[:7]}): "
+                f"{format_seconds(timing)} (confidence {confidence:.02f})"
+            )
+            print(f"GOOD: {timing} vs. ({args.good_val}, {args.bad_val})")
+            sys.exit(0)
diff --git a/bench_runner/scripts/workflow.py b/bench_runner/scripts/workflow.py
index fe6baf0b..5e9ca59e 100644
--- a/bench_runner/scripts/workflow.py
+++ b/bench_runner/scripts/workflow.py
@@ -130,7 +130,13 @@ def checkout_benchmarks():
         )
 
 
-def compile_unix(cpython: PathLike, flags: list[str], pgo: bool, pystats: bool) -> None:
+def compile_unix(
+    cpython: PathLike,
+    flags: list[str],
+    pgo: bool,
+    pystats: bool,
+    reconfigure: bool = True,
+) -> None:
     cpython = Path(cpython)
     cfg = config.get_config_for_current_runner()
 
@@ -166,7 +172,8 @@ def compile_unix(cpython: PathLike, flags: list[str], pgo: bool, pystats: bool)
         make_args.extend(["-j"])
 
     with contextlib.chdir(cpython):
-        subprocess.check_call(["./configure", *args], env=env)
+        if reconfigure:
+            subprocess.check_call(["./configure", *args], env=env)
         subprocess.check_call(["make", *make_args], env=env)
 
 
diff --git a/bench_runner/util.py b/bench_runner/util.py
index af711a2d..b600d826 100644
--- a/bench_runner/util.py
+++ b/bench_runner/util.py
@@ -87,6 +87,25 @@ def get_simple_platform() -> Literal["linux", "macos", "windows"]:
     raise RuntimeError(f"Unsupported platform {sys.platform}.")
 
 
+def format_seconds(value: float) -> str:
+    """
+    Given a float value in seconds, formats it into a human-readable string with the appropriate precision.
+    """
+    _TIMEDELTA_UNITS = ("sec", "ms", "us", "ns")
+
+    for i in range(2, -9, -1):
+        if value >= 10.0**i:
+            break
+    else:
+        i = -9
+
+    precision = 2 - i % 3
+    k = -(i // 3) if i < 0 else 0
+    factor = 10 ** (k * 3)
+    unit = _TIMEDELTA_UNITS[k]
+    return f"{value * factor:.{precision}f} {unit}"
+
+
 if os.getenv("GITHUB_ACTIONS") == "true":
 
     @contextlib.contextmanager

From ec0eed3cdc8d98d59705b6abede188479ae7cd58 Mon Sep 17 00:00:00 2001
From: Michael Droettboom <mdboom@gmail.com>
Date: Wed, 30 Apr 2025 10:46:32 -0400
Subject: [PATCH 2/3] Lint

---
 bench_runner/git.py            | 2 +-
 bench_runner/scripts/bisect.py | 2 --
 bench_runner/util.py           | 3 ++-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/bench_runner/git.py b/bench_runner/git.py
index 9944c6d7..8070c398 100644
--- a/bench_runner/git.py
+++ b/bench_runner/git.py
@@ -138,7 +138,7 @@ def clone(
     url: str,
     *,
     branch: str | None = None,
-    depth: int = 1,
+    depth: int | None = 1,
 ) -> None:
     is_hash = re.match(r"^[0-9a-f]{40}$", branch) if branch else False
 
diff --git a/bench_runner/scripts/bisect.py b/bench_runner/scripts/bisect.py
index 740e5e18..4a8fc190 100644
--- a/bench_runner/scripts/bisect.py
+++ b/bench_runner/scripts/bisect.py
@@ -83,8 +83,6 @@ def delete_log() -> None:
 
 
 def show_log() -> None:
-    bisect_log = get_log_file()
-
     print()
     print("Bisect log:")
 
diff --git a/bench_runner/util.py b/bench_runner/util.py
index b600d826..d2332664 100644
--- a/bench_runner/util.py
+++ b/bench_runner/util.py
@@ -89,7 +89,8 @@ def get_simple_platform() -> Literal["linux", "macos", "windows"]:
 
 def format_seconds(value: float) -> str:
     """
-    Given a float value in seconds, formats it into a human-readable string with the appropriate precision.
+    Given a float value in seconds, formats it into a human-readable string with
+    the appropriate precision.
     """
     _TIMEDELTA_UNITS = ("sec", "ms", "us", "ns")
 

From faa685fad6354f8ef022d6bd6405abbf2f54e1b8 Mon Sep 17 00:00:00 2001
From: Michael Droettboom <mdboom@gmail.com>
Date: Wed, 30 Apr 2025 11:38:16 -0400
Subject: [PATCH 3/3] Fix tests

---
 bench_runner/__main__.py         | 12 +++++++++++-
 bench_runner/scripts/workflow.py |  9 ---------
 tests/test_run_benchmarks.py     | 12 +++++++++---
 3 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/bench_runner/__main__.py b/bench_runner/__main__.py
index 3a02d91f..9b191221 100644
--- a/bench_runner/__main__.py
+++ b/bench_runner/__main__.py
@@ -1,4 +1,5 @@
 import importlib
+import os
 import sys
 
 
@@ -25,6 +26,15 @@
 }
 
 if __name__ == "__main__":
+    # This lets pytest-cov collect coverage data in a subprocess
+    if "COV_CORE_SOURCE" in os.environ:
+        try:
+            from pytest_cov.embed import init
+
+            init()
+        except Exception:
+            sys.stderr.write("pytest-cov: Failed to setup subprocess coverage.")
+
     command = len(sys.argv) >= 2 and sys.argv[1] or ""
 
     if command not in COMMANDS:
@@ -37,5 +47,5 @@
         sys.exit(1)
 
     sys.argv = [sys.argv[0], *sys.argv[2:]]
-    mod = importlib.import_module(f"bench_runner.scripts.{command}")
+    mod = importlib.import_module(f".{command}", "bench_runner.scripts")
     mod.main()
diff --git a/bench_runner/scripts/workflow.py b/bench_runner/scripts/workflow.py
index 5e9ca59e..254df840 100644
--- a/bench_runner/scripts/workflow.py
+++ b/bench_runner/scripts/workflow.py
@@ -398,13 +398,4 @@ def main():
 
 
 if __name__ == "__main__":
-    # This lets pytest-cov collect coverage data in a subprocess
-    if "COV_CORE_SOURCE" in os.environ:
-        try:
-            from pytest_cov.embed import init
-
-            init()
-        except Exception:
-            sys.stderr.write("pytest-cov: Failed to setup subprocess coverage.")
-
     main()
diff --git a/tests/test_run_benchmarks.py b/tests/test_run_benchmarks.py
index 4a5896c3..3cdf6572 100644
--- a/tests/test_run_benchmarks.py
+++ b/tests/test_run_benchmarks.py
@@ -83,7 +83,9 @@ def test_run_benchmarks(benchmarks_checkout, monkeypatch):
     subprocess.check_call(
         [
             venv_python,
-            run_benchmarks.__file__,
+            "-m",
+            "bench_runner",
+            "run_benchmarks",
             "benchmark",
             sys.executable,
             "python",
@@ -132,7 +134,9 @@ def test_run_benchmarks(benchmarks_checkout, monkeypatch):
     returncode = subprocess.call(
         [
             venv_python,
-            run_benchmarks.__file__,
+            "-m",
+            "bench_runner",
+            "run_benchmarks",
             "benchmark",
             sys.executable,
             "python",
@@ -164,7 +168,9 @@ def test_run_benchmarks_flags(benchmarks_checkout):
     subprocess.check_call(
         [
             venv_python,
-            run_benchmarks.__file__,
+            "-m",
+            "bench_runner",
+            "run_benchmarks",
             "benchmark",
             sys.executable,
             "python",