From 925a15cca035114d923e321977d5d61ba1093a45 Mon Sep 17 00:00:00 2001
From: He-Pin
Date: Sun, 31 May 2026 18:28:37 +0800
Subject: [PATCH] perf: allocation-free ASCII bitmask for std.stripChars (1.65x faster) (#851)

Motivation:
std.stripChars/lstripChars/rstripChars built a `java.util.BitSet` per call
for multi-character strip sets, allocating a BitSet object plus its backing
`long[]` on every invocation and paying an array-load + bounds check per
membership test. The vast majority of real strip sets are ASCII
(whitespace/punctuation).

Modification:
- In StripUtils.strip, detect an all-ASCII strip set while scanning `chars`
  and build a 128-bit membership mask in two `long`s (no allocation). Strip
  via `stripAsciiMask`/`inAsciiMask`, which test membership with a
  shift+mask and no array access. Falls back to the existing BitSet path for
  BMP>127 sets and to the codepoint set for surrogates. Behavior is
  unchanged.
- Add `StripBenchmark` (JMH) as a regression guard; relax `StripUtils` to
  `private[sjsonnet]` so the bench module can exercise it in isolation.

Result:
Isolated JMH micro (StripBenchmark, all-ASCII set, long leading/trailing
runs, -f4, 48 samples): 2302.8 +/- 54.0 ns/op -> 1394.1 +/- 19.7 ns/op
(1.65x faster), and gc.alloc.rate.norm 104 -> 48 B/op (the removed per-call
BitSet; the remaining 48 B is just the result substring).

Behavior verified identical to official jsonnet v0.22.0 across ASCII,
multi-char, l/r, tab/newline, and non-ASCII fallback cases. Compiles on
Scala 3.3.7 / 2.13.18 / 2.12.21; full JVM suite green.

References: https://github.com/databricks/sjsonnet/issues/851 --- bench/src/sjsonnet/bench/StripBenchmark.scala | 40 ++++++++++++++++ .../src/sjsonnet/stdlib/StringModule.scala | 47 ++++++++++++++++++- 2 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 bench/src/sjsonnet/bench/StripBenchmark.scala diff --git a/bench/src/sjsonnet/bench/StripBenchmark.scala b/bench/src/sjsonnet/bench/StripBenchmark.scala new file mode 100644 index 00000000..c70d3ab9 --- /dev/null +++ b/bench/src/sjsonnet/bench/StripBenchmark.scala @@ -0,0 +1,40 @@ +package sjsonnet.bench + +import org.openjdk.jmh.annotations.* +import org.openjdk.jmh.infra.* +import sjsonnet.stdlib.StringModule + +import java.util.concurrent.TimeUnit + +/** + * Micro-benchmark isolating `std.stripChars` with an all-ASCII strip set — the case optimized by + * the inline two-`long` mask in `StripUtils.strip` (issue #851), replacing a per-call + * `java.util.BitSet`. The string has long leading/trailing runs of strip chars so the membership + * check is exercised; `gc.alloc.rate.norm` shows the removed per-call BitSet allocation directly. + * + * Run: ./mill bench.runJmh ".*StripBenchmark.*" -f 4 -wi 10 -i 15 -r 2 -w 1 -prof gc + */ +@BenchmarkMode(Array(Mode.AverageTime)) +@Fork(4) +@Threads(1) +@Warmup(iterations = 10, time = 1) +@Measurement(iterations = 15, time = 2) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@State(Scope.Benchmark) +class StripBenchmark { + + // Multi-char ASCII strip set -> exercises the BitSet path (baseline) / two-long mask (this PR), + // not the single-char fast path. 
+ private val chars = "ab" + private var input: String = _ + + @Setup + def setup(): Unit = { + val run = "ab" * 1000 // 2000 leading + 2000 trailing strip chars + input = run + "MIDDLE" + run + } + + @Benchmark + def strip(bh: Blackhole): Unit = + bh.consume(StringModule.StripUtils.strip(input, chars, left = true, right = true)) +} diff --git a/sjsonnet/src/sjsonnet/stdlib/StringModule.scala b/sjsonnet/src/sjsonnet/stdlib/StringModule.scala index dfb916f5..0b9a8819 100644 --- a/sjsonnet/src/sjsonnet/stdlib/StringModule.scala +++ b/sjsonnet/src/sjsonnet/stdlib/StringModule.scala @@ -260,7 +260,7 @@ object StringModule extends AbstractFunctionModule { } } - private object StripUtils { + private[sjsonnet] object StripUtils { def codePointsSet(str: String): collection.Set[Int] = { val chars = Set.newBuilder[Int] chars.sizeHint(str.codePointCount(0, str.length)) @@ -286,6 +286,24 @@ object StringModule extends AbstractFunctionModule { return stripSingleChar(str, single.toChar, left, right) } + // Common case: an all-ASCII strip set (whitespace/punctuation). Build a 128-bit membership + // mask in two `long`s — no allocation (vs a per-call java.util.BitSet) and no array bounds + // check per lookup, keeping the strip loop tight and GC-friendly (issue #851). + var lo = 0L + var hi = 0L + var allAscii = true + var i = 0 + while (allAscii && i < chars.length) { + val ch = chars.charAt(i) + if (ch < 64) lo |= 1L << ch + else if (ch < 128) hi |= 1L << (ch - 64) + else allAscii = false + i += 1 + } + if (allAscii) { + return stripAsciiMask(str, lo, hi, left, right) + } + val bmpSet = bmpNonSurrogateSet(chars) if (bmpSet != null) { return stripBmp(str, bmpSet, left, right) @@ -294,6 +312,33 @@ object StringModule extends AbstractFunctionModule { unspecializedStrip(str, codePointsSet(chars), left, right) } + /** Membership test for the inline 128-bit ASCII mask built in [[strip]]. 
*/ + @inline private def inAsciiMask(lo: Long, hi: Long, c: Char): Boolean = + if (c < 64) (lo & (1L << c)) != 0L + else if (c < 128) (hi & (1L << (c - 64))) != 0L + else false + + /** + * Fast path for an all-ASCII strip set, using the inline two-`long` mask. Non-ASCII chars in + * `str` (including surrogate halves) are never members, so surrogate pairs are left intact. + */ + private def stripAsciiMask( + str: String, + lo: Long, + hi: Long, + left: Boolean, + right: Boolean): String = { + var start = 0 + var end = str.length + if (left) { + while (start < end && inAsciiMask(lo, hi, str.charAt(start))) start += 1 + } + if (right) { + while (end > start && inAsciiMask(lo, hi, str.charAt(end - 1))) end -= 1 + } + str.substring(start, end) + } + def unspecializedStrip( str: String, charsSet: collection.Set[Int],