From 7546273f62c47ac5431d4360ce52161dc1eeb7fe Mon Sep 17 00:00:00 2001
From: Philippe Llerena
Date: Fri, 15 May 2026 16:15:33 +0200
Subject: [PATCH] perf(examples): use mimalloc as the global allocator in the bench binary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Callgrind on the bench binary showed roughly 33 % of cycles inside libc's
malloc/free family (`_int_malloc`, `_int_free`, `malloc`, `free`,
`malloc_consolidate`, `unlink_chunk`, …). The hot paths are short-lived
small allocations: `SmallVec` segments inside `Ranges` on every range
clone, the per-call `FxHashMap<&Requirement, bool>` in `reduce_by`,
hashbrown rehashes, `String::clone`s for package names. mimalloc's
small-object path is measurably cheaper than glibc's on exactly this
shape.

Wire `mimalloc::MiMalloc` as `#[global_allocator]` in the bench binary
only. Production wheels (`rer-python`) are untouched — switching the
global allocator in a PyO3 extension is more nuanced and is left for a
separate change.

## Benchmark (188 cases, release, same machine, two consecutive runs)

| Stage                              |   Total |   Mean | vs rez |
|------------------------------------|--------:|-------:|-------:|
| Baseline (main)                    |  43.0 s | 230 ms |   8.8× |
| + Shared cache (PR #66)            |  34.4 s | 183 ms |  11.1× |
| + mimalloc (this branch), run 1    |  29.8 s | 158 ms |  12.8× |
| + mimalloc (this branch), run 2    |  29.4 s | 156 ms |  13.0× |

Cumulative -32 % from main. Stacks cleanly on top of #66.

## Correctness

188/188 still match rez 1:1
(`cargo test --release -p rer-resolver --test test_rez_benchmark -- --ignored`).
Co-Authored-By: Claude Opus 4.7 --- Cargo.toml | 5 +++++ crates/examples/Cargo.toml | 3 +++ crates/examples/rez_benchmark_dataset.rs | 7 +++++++ 3 files changed, 15 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index bb932de..e8d6c2e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,3 +26,8 @@ serde_json = "1.0" rer-version = { path = "crates/rer-version", version = "0.1.0-rc.6" } rer-resolver = { path = "crates/rer-resolver", version = "0.1.0-rc.6" } pyo3 = { version = "0.23.5", features = ["extension-module"] } +# `mimalloc` is wired into the bench binary as a `#[global_allocator]`. +# Callgrind shows ~33 % of cycles in libc malloc/free; mimalloc has measurably +# lower per-call cost on the small-object churn rer creates (`SmallVec` in +# `Ranges`, per-call `FxHashMap`s in `reduce_by`, etc.). +mimalloc = "0.1" diff --git a/crates/examples/Cargo.toml b/crates/examples/Cargo.toml index 166e6e3..645642d 100644 --- a/crates/examples/Cargo.toml +++ b/crates/examples/Cargo.toml @@ -12,3 +12,6 @@ path = "rez_benchmark_dataset.rs" rer-resolver = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } +# `mimalloc` is set as the global allocator in `rez_benchmark_dataset` — see +# the comment in that file. Workspace-pinned for a consistent version. +mimalloc = { workspace = true } diff --git a/crates/examples/rez_benchmark_dataset.rs b/crates/examples/rez_benchmark_dataset.rs index d29a456..437896d 100644 --- a/crates/examples/rez_benchmark_dataset.rs +++ b/crates/examples/rez_benchmark_dataset.rs @@ -11,6 +11,13 @@ use rer_resolver::rez_solver::{ make_shared_cache, PackageRepo, Requirement, Solver, SolverStatus, }; + +// Callgrind on this binary shows ~33 % of cycles in libc malloc/free — +// `SmallVec` extends inside `Ranges`, per-call `FxHashMap`s in `reduce_by`, +// hashbrown rehashes, and `String::clone`s. mimalloc's small-object path +// outperforms glibc's `_int_malloc`/`_int_free` on exactly this workload. 
+#[global_allocator] +static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; use serde::Deserialize; use std::path::PathBuf; use std::rc::Rc;