Faster Base decoding#15337
Conversation
|
Can you please break those into distinct benchmarks? Also, for removing whitespace, have you tried using the result of binary:match to skip the initial traversal? Finally, you have one commit from the other branch. |
|
Oh, and please make sure you measure smaller payloads too (let’s say 1kb). |
Unroll validate16XXX for 8/4/2 bytes
Fast-path remove_ignored/2
|
@josevalim yessir! I grouped the benchmarked functions by the change that affects them, and also tested payloads from 1KiB up to 1MiB. Each cell contains the before → after median times and the speedup ratio. The groups are:
- decode_name/1 (decode hot paths)
- validate16 (valid16?)
- remove_ignored
|
| op | 1KiB | 10KiB | 50KiB | 100KiB | 1MiB |
|---|---|---|---|---|---|
| valid64? CLEAN | 14.21 → 2.92μs (4.87×) | 137.88 → 22.17μs (6.22×) | 694.34 → 107.71μs (6.45×) | 1.39 → 0.22ms (6.32×) | 14.80 → 2.22ms (6.67×) |
| decode64! CLEAN | 16.63 → 4.00μs (4.16×) | 162.83 → 31.75μs (5.13×) | 819.58 → 155.75μs (5.26×) | 1.63 → 0.31ms (5.26×) | 16.86 → 3.33ms (5.06×) |
| valid64? DIRTY | 14.50 → 15.29μs (0.95×) | 141.00 → 142.38μs (0.99×) | 703.42 → 719.71μs (0.98×) | 1.42 → 1.44ms (0.99×) | 14.78 → 15.17ms (0.97×) |
| decode64! DIRTY | 16.75 → 16.42μs (1.02×) | 165.46 → 151.88μs (1.09×) | 816.29 → 771.83μs (1.06×) | 1.63 → 1.51ms (1.08×) | 17.22 → 15.84ms (1.09×) |
| valid64? (no ignore, ref) | 1.00 → 1.54μs | 9.38 → 18.63μs | 46.46 → 65.92μs | 93.54 → 93.54μs | 0.97 → 1.35ms |
| decode64! (no ignore, ref) | 3.13 → 2.08μs | 30.29 → 19.04μs | 147.00 → 94.67μs | 290.00 → 188.92μs | *3.10 → 2.06ms |
In the remove_ignored benchmark above, you can see the reference times for valid64? and decode64!, which are always called, even if we don't provide ignore: :whitespace. So, these times are the baselines if remove_ignored/2 is not called. Now, if you look at the before-and-after times of valid64? CLEAN, you can see that before the fix, it would take ~14.21µs to call remove_ignored/2 with a string that had no whitespace. That's the same time as valid64? DIRTY, which filters a string that does have whitespace. After the change, valid64? DIRTY took the same amount of time (within noise), but valid64? CLEAN dropped from 14.21µs to 2.92µs, because the binary match returns :nomatch and we don't build the binary unnecessarily.
The benchmark script
# Microbench for Base findings. Run with the SYSTEM elixir from anywhere:
#   BENCH_F=decode_name elixir bench_base.exs
#
# Selects which finding to bench via BENCH_F:
#   decode_name    — Inline decode_name/1 for base 16/32/64 (decode hot paths)
#   validate16     — Unroll validate16(upper|lower|mixed) for 4 and 8 bytes
#   remove_ignored — remove_ignored fast path via :binary.match for :whitespace
#
# Each scenario runs across payload sizes: 1KiB, 10KiB, 50KiB, 100KiB, 1MiB.
# Hot-loads the in-tree base.ex so patches take effect without `make stdlib`.
Mix.install([{:benchee, "~> 1.3"}])

# The system Base module is already loaded, so silence the redefinition
# warning and let Code.compile_file/1 replace it with the patched version.
Code.put_compiler_option(:ignore_module_conflict, true)

src = System.get_env("BENCH_SRC", "./lib/elixir/lib/base.ex")
IO.puts("# Benching Base from: #{src}")
Code.compile_file(src)

finding = System.get_env("BENCH_F", "decode_name")

sizes = [
  {"1KiB", 1 * 1024},
  {"10KiB", 10 * 1024},
  {"50KiB", 50 * 1024},
  {"100KiB", 100 * 1024},
  {"1MiB", 1024 * 1024}
]

# Shared Benchee options so every finding is measured identically.
bench_opts = [warmup: 2, time: 5, print: [fast_warning: false]]

# Builds the Benchee `inputs` map: one entry per payload size, where the
# payload is derived from fresh random bytes by `payload_fun`.
build_inputs = fn payload_fun ->
  for {label, n} <- sizes, into: %{} do
    {label, payload_fun.(:crypto.strong_rand_bytes(n))}
  end
end

# Sprinkle ~1 whitespace char per 76 chars (MIME-style line wrapping).
sprinkle_ws = fn b64 ->
  b64
  |> :erlang.binary_to_list()
  |> Enum.chunk_every(76)
  |> Enum.map_join("\n", &List.to_string/1)
end

case finding do
  "decode_name" ->
    inputs =
      build_inputs.(fn data ->
        %{
          hex_upper: Base.encode16(data),
          hex_lower: Base.encode16(data, case: :lower),
          b64: Base.encode64(data),
          b64_url: Base.url_encode64(data),
          b32: Base.encode32(data)
        }
      end)

    Benchee.run(
      %{
        "decode16! upper" => fn %{hex_upper: s} -> Base.decode16!(s) end,
        "decode16! lower" => fn %{hex_lower: s} -> Base.decode16!(s, case: :lower) end,
        "decode16! mixed" => fn %{hex_upper: s} -> Base.decode16!(s, case: :mixed) end,
        "decode64!" => fn %{b64: s} -> Base.decode64!(s) end,
        "url_decode64!" => fn %{b64_url: s} -> Base.url_decode64!(s) end,
        "decode32!" => fn %{b32: s} -> Base.decode32!(s) end
      },
      [inputs: inputs] ++ bench_opts
    )

  "validate16" ->
    inputs =
      build_inputs.(fn data ->
        %{
          hex_upper: Base.encode16(data),
          hex_lower: Base.encode16(data, case: :lower)
        }
      end)

    Benchee.run(
      %{
        "valid16? upper" => fn %{hex_upper: s} -> Base.valid16?(s) end,
        "valid16? lower" => fn %{hex_lower: s} -> Base.valid16?(s, case: :lower) end,
        "valid16? mixed" => fn %{hex_upper: s} -> Base.valid16?(s, case: :mixed) end
      },
      [inputs: inputs] ++ bench_opts
    )

  "remove_ignored" ->
    inputs =
      build_inputs.(fn data ->
        b64 = Base.encode64(data)
        %{clean: b64, dirty: sprinkle_ws.(b64)}
      end)

    Benchee.run(
      %{
        "decode64! ignore:ws CLEAN" => fn %{clean: s} ->
          Base.decode64!(s, ignore: :whitespace)
        end,
        "decode64! ignore:ws DIRTY" => fn %{dirty: s} ->
          Base.decode64!(s, ignore: :whitespace)
        end,
        "valid64? ignore:ws CLEAN" => fn %{clean: s} ->
          Base.valid64?(s, ignore: :whitespace)
        end,
        "valid64? ignore:ws DIRTY" => fn %{dirty: s} ->
          Base.valid64?(s, ignore: :whitespace)
        end,
        # Reference rows: same ops without `ignore: :whitespace` to show the
        # floor cost (no remove_ignored work at all).
        "decode64! (no ignore, ref)" => fn %{clean: s} -> Base.decode64!(s) end,
        "valid64? (no ignore, ref)" => fn %{clean: s} -> Base.valid64?(s) end
      },
      [inputs: inputs] ++ bench_opts
    )

  other ->
    raise "Unknown BENCH_F=#{other} (use decode_name, validate16, remove_ignored)"
end
And could you please elaborate what you mean with: |
|
I'm currently trying to understand why. EDIT: I re-ran the test only for the following case; the results are below:
|
| size | baseline | patched | ratio |
|---|---|---|---|
| 1KiB | 1.00μs | 1.00μs | 1.00× |
| 10KiB | 9.13μs | 9.13μs | 1.00× |
| 50KiB | 45.17μs | 45.17μs | 1.00× |
| 100KiB | 90.96μs | 91.04μs | 1.00× |
| 1MiB | 941.88μs | 945.29μs | 1.00× |
|
@PJUllrich I mean something like this: It is not going to be exactly the above but it will give a good approximation of what I meant. It may not be better though or it may require further adjustments. |
|
Also, inlining decode_name largely increases the byte code size of the module (~20%). If we want to inline it, we need to trim it. |
|
@PJUllrich if you want to optimise this module, ask your coding agent to explore SWAR techniques, as in this commit: #15255 You can first explore it for validation. It will remove the tuple lookups and should be quite more efficient (you can do 7 bytes at a time). You can do all validation cases first. If that works (which you can PR!), then we can start exploring decoding. For this PR in particular, the change for validate16 looks good, the other ones I'd not merge for now cause I worry it may be worse in other cases we haven't considered yet. |
|
@josevalim I benchmarked your suggestion, but it did not make a difference for removing the whitespaces. This might be because in my "random" data with whitespaces, the first whitespace occurs after 76 characters already, so only the first 76 bytes would be skipped. If the whitespace would be further towards the end, it might make a difference indeed. Sounds, good. I will look together with my agent into SWAR and propose changes if I can find significant improvements in another PR. I'll remove the changes other than the validate16 headers. EDIT: This was my (horrible, first draft) implementation of your suggestion if you want to validate it: defp remove_ignored(string, :whitespace) do
# Fast path: locate the first whitespace byte with :binary.match/2. If the
# string contains none (:nomatch), return it unchanged — no new binary built.
case :binary.match(string, [<<?\s>>, <<?\t>>, <<?\r>>, <<?\n>>]) do
# `prefix` is the byte offset of the first whitespace character, so the
# leading `prefix` bytes are known-clean and copied wholesale; only the
# tail (after skipping the one matched byte) is filtered byte-by-byte.
{prefix, _} ->
binary_part(string, 0, prefix) <>
for(
<<char::8 <- binary_part(string, prefix + 1, byte_size(string) - prefix - 1)>>,
# Keep every byte that is not space/tab/CR/LF (`\s` is the space escape).
char not in ~c"\s\t\r\n",
into: <<>>,
do: <<char::8>>
)
# No whitespace anywhere: reuse the original binary as-is.
:nomatch ->
string
end
end |
|
@PJUllrich your patch is correct but it makes sense we won’t see a difference for that. We need to check smaller payloads and do some white space distribution (at none, beginning, middle, and end). |
|
💚 💙 💜 💛 ❤️ |
Changes
- Inline `decode_name/1` for all three compile-time blocks (base 16/32/64), since it seems that was missing. All other functions (e.g. `validate_char_name`) were already inlined.
- Unroll `validate16(upper|lower|mixed)` for 4 and 8 bytes. It previously had only one function head, for 2 bytes; other functions (e.g. `decode_name/1`) have all of the 2/4/8 function heads. This allows for larger chunks when iterating through the byte list (I think).
- In `remove_ignored(string, :whitespace)`, check whether the string contains whitespace first, before building a new binary. This walks the string twice, but runs the costly path of building a new binary only if necessary.

Benchmarks
I generated 64 random kilobytes and encoded them in the formats 16/32/64 + 64 (URL) for `Base.url_decode64!/1`. I used Benchee to measure one iteration of each `Base.decodeX` function on the random 64 KiB. The times below are the median iteration time. I tested the `:lower`, `:mixed`, and `:upper` cases, and `ignore: :whitespace` with (dirty) and without (clean) whitespaces in the random string.

(Benchee, 1s warmup, 3s measurement; M2 Pro, 64 KiB random input)