SWAR-optimize ASCII fast paths in String.length/1 and String.slice #15255
Merged
josevalim merged 1 commit intoelixir-lang:mainfrom Apr 9, 2026
Merged
SWAR-optimize ASCII fast paths in String.length/1 and String.slice#15255josevalim merged 1 commit intoelixir-lang:mainfrom
josevalim merged 1 commit intoelixir-lang:mainfrom
Conversation
Add 56-bit SWAR (SIMD Within A Register) acceleration to skip_length/2 and byte_size_remaining_at/2, processing 8 ASCII bytes per iteration instead of one. Uses the Mycroft zero-byte detection algorithm to validate that 7+1 bytes are all ASCII with no \r in a single guard. This mirrors the approach taken in OTP's string module (OTP erlang/otp#10948).
Contributor
Author
Benchmark
Mix.install([{:benchee, "~> 1.5"}])
defmodule Baseline do
  @moduledoc false

  # Reference implementation used as the "before" side of the benchmark:
  # ASCII runs are skipped one byte per recursive call.

  import Kernel, except: [length: 1]

  # A byte that can be counted without grapheme segmentation: 7-bit ASCII
  # other than carriage return.
  defguardp is_plain_ascii(byte) when byte <= 127 and byte != ?\r

  # Counts the graphemes in `string`.
  def length(string) when is_binary(string), do: count(string, 0)

  # Fast path: when the first two bytes are plain ASCII, measure the ASCII
  # run and jump over it in one step. `skip_length/2` starts at 1 (for the
  # first byte) and scans from the third byte on, so the resumed binary
  # begins at the last byte of the run.
  defp count(<<first, second, tail::binary>> = whole, total)
       when is_plain_ascii(first) and is_plain_ascii(second) do
    consumed = skip_length(tail, 1)
    remaining = byte_size(whole) - consumed
    count(binary_part(whole, consumed, remaining), total + consumed)
  end

  # Slow path: let :unicode_util.gc/1 peel off one grapheme cluster. An
  # undecodable leading byte is counted as one grapheme and dropped.
  defp count(graphemes, total) do
    case :unicode_util.gc(graphemes) do
      [] -> total
      {:error, <<_, tail::bits>>} -> count(tail, total + 1)
      [_ | tail] -> count(tail, total + 1)
    end
  end

  # Adds the number of leading plain-ASCII bytes of the binary to `seen`.
  defp skip_length(<<byte, tail::binary>>, seen) when is_plain_ascii(byte) do
    skip_length(tail, seen + 1)
  end

  defp skip_length(_other, seen), do: seen

  # Returns the byte size of what is left of `unicode` after dropping `n`
  # graphemes, or 0 when fewer than `n + 1` graphemes are available.
  def byte_size_remaining_at(unicode, 0), do: byte_size_unicode(unicode)

  def byte_size_remaining_at(unicode, n) do
    case :unicode_util.gc(unicode) do
      [] -> 0
      [_] -> 0
      [_ | tail] -> byte_size_remaining_at(tail, n - 1)
      {:error, <<_, tail::bits>>} -> byte_size_remaining_at(tail, n - 1)
    end
  end

  # Byte size of a binary, or of a (possibly improper) list built from
  # binaries such as the remainder handed back by :unicode_util.gc/1.
  defp byte_size_unicode(bin) when is_binary(bin), do: byte_size(bin)
  defp byte_size_unicode([only]), do: byte_size_unicode(only)

  defp byte_size_unicode([first | others]) do
    byte_size_unicode(first) + byte_size_unicode(others)
  end
end
defmodule Optimised do
  @moduledoc false

  # Candidate implementation: same structure as the baseline, but ASCII runs
  # are scanned eight bytes per step using a SWAR (SIMD Within A Register)
  # guard on a 56-bit word plus one ordinary byte.

  import Kernel, except: [length: 1]

  # 56-bit SWAR guard: true when all 7 bytes packed in `w` are ASCII (< 128)
  # and none of them is \r (0x0D).
  #
  #   * The first conjunct rejects any byte with bit 7 set.
  #   * XORing with 0x0D turns every \r byte into 0x00. Subtracting 0x01 from
  #     each byte lane then borrows into bit 7 of any lane that was zero
  #     (Mycroft's zero-byte detection). Because the first conjunct already
  #     proved bit 7 is clear in every lane, a set bit 7 after the subtraction
  #     can only come from such a borrow, so the usual `& ~x` term is not
  #     needed.
  defguardp ascii_no_cr_swar?(w)
            when Bitwise.band(w, 0x80808080808080) == 0 and
                   Bitwise.band(
                     Bitwise.bxor(w, 0x0D0D0D0D0D0D0D) - 0x01010101010101,
                     0x80808080808080
                   ) == 0

  # --- length/1 ---

  # Counts the graphemes in `string`.
  def length(string) when is_binary(string), do: length(string, 0)

  # Fast path: the first two bytes are ASCII and not \r, so measure the ASCII
  # run and jump over it. `skip_length/2` starts at 1 (for byte1) and scans
  # from the third byte on, which means the resumed binary begins at the last
  # byte of the run (presumably so :unicode_util.gc/1 can still combine that
  # byte with whatever follows it).
  defp length(<<byte1, byte2, rest::binary>> = binary, acc)
       when byte1 <= 127 and byte1 != ?\r and byte2 <= 127 and byte2 != ?\r do
    skip = skip_length(rest, 1)
    length(binary_part(binary, skip, byte_size(binary) - skip), acc + skip)
  end

  # Slow path: one grapheme cluster at a time. An undecodable leading byte is
  # counted as one grapheme and dropped.
  defp length(gcs, acc) do
    case :unicode_util.gc(gcs) do
      [_ | rest] -> length(rest, acc + 1)
      [] -> acc
      {:error, <<_, rest::bits>>} -> length(rest, acc + 1)
    end
  end

  # Adds the number of leading "ASCII and not \r" bytes of the binary to `acc`.
  # SWAR fast path: 7 bytes checked as one 56-bit word plus 1 ordinary byte,
  # i.e. 8 bytes per recursive call.
  defp skip_length(<<w::56, b, rest::binary>>, acc)
       when b <= 127 and b != ?\r and ascii_no_cr_swar?(w) do
    skip_length(rest, acc + 8)
  end

  # Byte-at-a-time path for the tail of a run and for strides that contain a
  # non-ASCII byte or \r.
  defp skip_length(<<byte, rest::binary>>, acc)
       when byte <= 127 and byte != ?\r,
       do: skip_length(rest, acc + 1)

  defp skip_length(_binary, acc), do: acc

  # --- byte_size_remaining_at/2 ---

  # Returns the byte size of what is left of `unicode` after dropping `n`
  # graphemes, or 0 when fewer than `n + 1` graphemes are available.
  def byte_size_remaining_at(unicode, 0), do: byte_size_unicode(unicode)

  # Same two-byte entry as length/2. The scan is bounded by `n`: once `n`
  # bytes are known to be skippable there is no need to look at the rest of
  # the ASCII run, so dropping a few graphemes from a very long ASCII string
  # does not walk the whole string.
  def byte_size_remaining_at(<<byte1, byte2, rest::binary>> = binary, n)
      when n > 0 and byte1 <= 127 and byte1 != ?\r and byte2 <= 127 and byte2 != ?\r do
    skip = skip_length_up_to(rest, 1, n)
    byte_size_remaining_at(binary_part(binary, skip, byte_size(binary) - skip), n - skip)
  end

  def byte_size_remaining_at(unicode, n) when n > 0 do
    case :unicode_util.gc(unicode) do
      [_] -> 0
      [_ | rest] -> byte_size_remaining_at(rest, n - 1)
      [] -> 0
      {:error, <<_, bin::bits>>} -> byte_size_remaining_at(bin, n - 1)
    end
  end

  # Bounded variant of skip_length/2: returns min(skip_length(binary, acc), limit)
  # but stops scanning as soon as `acc` reaches `limit`. A SWAR stride may
  # overshoot `limit`; that is fine because all bytes counted so far were
  # valid, so the unbounded result would be at least `limit` as well.
  defp skip_length_up_to(_binary, acc, limit) when acc >= limit, do: limit

  defp skip_length_up_to(<<w::56, b, rest::binary>>, acc, limit)
       when b <= 127 and b != ?\r and ascii_no_cr_swar?(w) do
    skip_length_up_to(rest, acc + 8, limit)
  end

  defp skip_length_up_to(<<byte, rest::binary>>, acc, limit)
       when byte <= 127 and byte != ?\r,
       do: skip_length_up_to(rest, acc + 1, limit)

  defp skip_length_up_to(_binary, acc, _limit), do: acc

  # Byte size of a binary, or of a (possibly improper) list built from
  # binaries such as the remainder handed back by :unicode_util.gc/1.
  defp byte_size_unicode(binary) when is_binary(binary), do: byte_size(binary)
  defp byte_size_unicode([head]), do: byte_size_unicode(head)
  defp byte_size_unicode([head | tail]), do: byte_size_unicode(head) + byte_size_unicode(tail)
end
# Inputs for the length/1 comparison, keyed by the label Benchee reports.
# They cover the empty string, pure-ASCII runs of increasing size, an ASCII
# run broken by \r, an ASCII run broken by a multi-byte code point, and short
# strings mixing \r with an invalid byte.
length_inputs = %{
  "empty" => "",
  "ascii_256" => :binary.copy("a", 256),
  "ascii_8k" => :binary.copy("x", 8_192),
  "ascii_256k" => :binary.copy("y", 256 * 1024),
  "ascii_with_cr" => :binary.copy("a", 200) <> "\r" <> :binary.copy("b", 200),
  "mixed_tail" => :binary.copy("a", 500) <> "é" <> :binary.copy("z", 100),
  "cr_invalid_bytes_a" => "\r\xFF\t\v",
  "cr_invalid_bytes_b" => "\r\t\xFF\v"
}

# Before timing anything, make sure both implementations agree on every
# input; abort the script on the first disagreement.
Enum.each(length_inputs, fn {label, bin} ->
  baseline = Baseline.length(bin)
  optimised = Optimised.length(bin)

  if baseline !== optimised do
    raise "length mismatch on #{inspect(label)}: baseline=#{baseline} optimised=#{optimised}"
  end
end)
# Shared pure-ASCII fixtures for the byte_size_remaining_at/2 comparison.
big_ascii = :binary.copy("z", 4096)
huge_ascii = :binary.copy("p", 256 * 1024)

# Each input is a {binary, graphemes_to_skip} pair, keyed by the label
# Benchee reports.
slice_inputs = %{
  "skip0" => {big_ascii, 0},
  "skip9" => {big_ascii, 9},
  "skip64" => {big_ascii, 64},
  "skip512" => {big_ascii, 512},
  "skip8192_huge" => {huge_ascii, 8192},
  "skip_with_cr" => {:binary.copy("a", 300) <> "\r" <> :binary.copy("b", 300), 150},
  "skip_mixed" => {:binary.copy("q", 400) <> "ñ" <> :binary.copy("r", 400), 200},
  "cr_invalid" => {"\r\xFF\t\v", 2}
}

# Both implementations must return the same remaining byte size for every
# input; abort the script on the first disagreement.
Enum.each(slice_inputs, fn {label, {bin, n}} ->
  baseline = Baseline.byte_size_remaining_at(bin, n)
  optimised = Optimised.byte_size_remaining_at(bin, n)

  if baseline !== optimised do
    raise "byte_size_remaining_at mismatch on #{inspect(label)} (#{n}): baseline=#{baseline} optimised=#{optimised}"
  end
end)
# Benchee settings merged into every Benchee.run/2 call in this script.
bench_opts = [
  time: 5,
  memory_time: 0,
  print: [fast_warning: false],
  warmup: 2
]

IO.puts("== length/1 ==\n")
defmodule Erlang do
  @moduledoc false

  # Thin wrapper around OTP's :string.length/1.
  # NOTE(review): no Benchee job in this script calls this module — confirm
  # whether it was meant to be a third benchmark entry.
  def length(bin), do: :string.length(bin)
end
# Time length/1 for both implementations over every length input.
Benchee.run(
  %{
    "baseline" => fn bin -> Baseline.length(bin) end,
    "optimised" => fn bin -> Optimised.length(bin) end
  },
  Keyword.merge([inputs: length_inputs], bench_opts)
)

IO.puts("\n== byte_size_remaining_at/2 ==\n")

# Time byte_size_remaining_at/2 for both implementations; each input is a
# {binary, graphemes_to_skip} pair.
Benchee.run(
  %{
    "baseline" => fn {bin, n} -> Baseline.byte_size_remaining_at(bin, n) end,
    "optimised_hybrid" => fn {bin, n} -> Optimised.byte_size_remaining_at(bin, n) end
  },
  Keyword.merge([inputs: slice_inputs], bench_opts)
)
Contributor
Author
Results |
Member
|
💚 💙 💜 💛 ❤️ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.
Add 56-bit SWAR (SIMD Within A Register) acceleration to skip_length/2 and byte_size_remaining_at/2, processing 8 ASCII bytes per iteration instead of one. Uses the Mycroft zero-byte detection algorithm to validate that 7+1 bytes are all ASCII with no \r in a single guard.
This mirrors the approach taken in OTP's string module (OTP erlang/otp#10948).
Benchmarks and results are attached below. For
String.length/1, for example, the speedup is roughly 2-5x, and it improves the longer the pure-ASCII strings are.