Skip to content

SWAR-optimize ASCII fast paths in String.length/1 and String.slice#15255

Merged
josevalim merged 1 commit intoelixir-lang:mainfrom
NelsonVides:perf/string_swar
Apr 9, 2026
Merged

SWAR-optimize ASCII fast paths in String.length/1 and String.slice#15255
josevalim merged 1 commit intoelixir-lang:mainfrom
NelsonVides:perf/string_swar

Conversation

@NelsonVides
Copy link
Copy Markdown
Contributor

@NelsonVides NelsonVides commented Apr 9, 2026

Add 56-bit SWAR (SIMD Within A Register) acceleration to skip_length/2 and byte_size_remaining_at/2, processing 8 ASCII bytes per iteration instead of one. Uses the Mycroft zero-byte detection algorithm to validate that 7+1 bytes are all ASCII with no \r in a single guard.

This mirrors the approach taken in OTP's string module (OTP erlang/otp#10948).

Benchmarks and results are attached below. For String.length/1, for example, the speedup ranges from roughly 2x to 5x, and it grows the longer the pure-ASCII input is.

Add 56-bit SWAR (SIMD Within A Register) acceleration to skip_length/2
and byte_size_remaining_at/2, processing 8 ASCII bytes per iteration
instead of one. Uses the Mycroft zero-byte detection algorithm to
validate that 7+1 bytes are all ASCII with no \r in a single guard.

This mirrors the approach taken in OTP's string module
(OTP erlang/otp#10948).
@NelsonVides
Copy link
Copy Markdown
Contributor Author

Benchmark
Mix.install([{:benchee, "~> 1.5"}])

defmodule Baseline do
  @moduledoc false
  import Kernel, except: [length: 1]

  # Reference implementation used by the benchmark: ASCII runs are scanned one
  # byte at a time and everything else goes through :unicode_util.gc/1.

  # True for a byte that is ASCII and is not a carriage return.
  defguardp is_plain_ascii(byte) when byte < 128 and byte != ?\r

  # Counts the graphemes in `string`.
  def length(string) when is_binary(string) do
    count(string, 0)
  end

  # Two plain ASCII bytes up front: jump over the run in one step. The scan
  # starts its counter at 1 and only inspects the bytes after `second`, so the
  # final byte of the run is not skipped here; it is handled by the clause below.
  defp count(<<first, second, tail::binary>> = input, total)
       when is_plain_ascii(first) and is_plain_ascii(second) do
    advance = skip_length(tail, 1)
    remaining = binary_part(input, advance, byte_size(input) - advance)
    count(remaining, total + advance)
  end

  # General case: consume one grapheme cluster. An undecodable byte is dropped
  # and still counted as one.
  defp count(data, total) do
    case :unicode_util.gc(data) do
      [] -> total
      {:error, <<_, leftover::bits>>} -> count(leftover, total + 1)
      [_ | leftover] -> count(leftover, total + 1)
    end
  end

  # Adds to `seen` the number of leading plain ASCII bytes of the binary.
  defp skip_length(<<byte, tail::binary>>, seen) when is_plain_ascii(byte) do
    skip_length(tail, seen + 1)
  end

  defp skip_length(_other, seen), do: seen

  # Byte size of what is left of `unicode` after dropping its first `n`
  # graphemes; 0 when the input runs out first.
  def byte_size_remaining_at(unicode, 0) do
    byte_size_unicode(unicode)
  end

  def byte_size_remaining_at(unicode, n) do
    case :unicode_util.gc(unicode) do
      [] -> 0
      # A single-element list: this grapheme was the last one.
      [_] -> 0
      [_ | leftover] -> byte_size_remaining_at(leftover, n - 1)
      {:error, <<_, leftover::bits>>} -> byte_size_remaining_at(leftover, n - 1)
    end
  end

  # The remainder is either a binary or a non-empty list of such remainders.
  defp byte_size_unicode(bin) when is_binary(bin), do: byte_size(bin)
  defp byte_size_unicode([only]), do: byte_size_unicode(only)

  defp byte_size_unicode([first | others]) do
    byte_size_unicode(first) + byte_size_unicode(others)
  end
end

defmodule Optimised do
  @moduledoc false
  import Kernel, except: [length: 1]

  # 56-bit SWAR guard: true when all 7 bytes packed in `w` are ASCII (< 128)
  # and none of them is \r (0x0D).
  #
  # The first test rejects any byte whose high bit is set. The second test is
  # Mycroft's zero-byte detection applied to `w XOR 0x0D0D...`: a byte of that
  # value is zero exactly when the matching byte of `w` is \r, and subtracting
  # 0x01 from a zero byte borrows and sets its high bit. The `AND NOT v` term of
  # the textbook formula is not needed because the first test already
  # guarantees that every high bit is clear before the subtraction.
  defguardp ascii_no_cr_swar?(w)
            when Bitwise.band(w, 0x80808080808080) == 0 and
                   Bitwise.band(
                     Bitwise.bxor(w, 0x0D0D0D0D0D0D0D) - 0x01010101010101,
                     0x80808080808080
                   ) == 0

  # --- length/1 ---
  # Same structure as the baseline: a two-byte ASCII entry check followed by a
  # run scan, with :unicode_util.gc/1 for everything else. Only the run scan is
  # SWAR-optimized.

  # Counts the graphemes in `string`.
  def length(string) when is_binary(string), do: length(string, 0)

  # The scan starts its counter at 1 and only inspects the bytes after `byte2`,
  # so the final byte of the ASCII run is not skipped here; it is handled by
  # the grapheme clause below.
  defp length(<<byte1, byte2, rest::binary>> = binary, acc)
       when byte1 <= 127 and byte1 != ?\r and byte2 <= 127 and byte2 != ?\r do
    skip = skip_length(rest, 1)
    length(binary_part(binary, skip, byte_size(binary) - skip), acc + skip)
  end

  defp length(gcs, acc) do
    case :unicode_util.gc(gcs) do
      [_ | rest] -> length(rest, acc + 1)
      [] -> acc
      # An undecodable byte is dropped and still counted as one.
      {:error, <<_, rest::bits>>} -> length(rest, acc + 1)
    end
  end

  # --- ASCII run scanners ---

  # Unbounded scan: adds to `acc` the number of leading ASCII, non-\r bytes.
  # SWAR fast path: 7 bytes checked as one 56-bit word plus 1 byte checked on
  # its own, i.e. 8 bytes per stride.
  defp skip_length(<<w::56, b, rest::binary>>, acc)
       when b <= 127 and b != ?\r and ascii_no_cr_swar?(w) do
    skip_length(rest, acc + 8)
  end

  defp skip_length(<<byte, rest::binary>>, acc)
       when byte <= 127 and byte != ?\r,
       do: skip_length(rest, acc + 1)

  defp skip_length(_binary, acc), do: acc

  # Bounded scan: same as skip_length/2 but never lets `acc` grow past `limit`,
  # so the result equals `min(skip_length(binary, acc), limit)` whenever
  # `acc <= limit` on entry. Stopping at `limit` keeps the cost proportional to
  # the number of bytes actually skipped rather than to the length of the run.
  defp skip_length(<<w::56, b, rest::binary>>, acc, limit)
       when acc + 8 <= limit and b <= 127 and b != ?\r and ascii_no_cr_swar?(w) do
    skip_length(rest, acc + 8, limit)
  end

  # Fewer than 8 bytes of budget or of input left, or the stride contains a
  # byte that ends the run: finish one byte at a time.
  defp skip_length(<<byte, rest::binary>>, acc, limit)
       when acc < limit and byte <= 127 and byte != ?\r,
       do: skip_length(rest, acc + 1, limit)

  defp skip_length(_binary, acc, _limit), do: acc

  # --- byte_size_remaining_at/2 ---
  # Returns the byte size of what is left of `unicode` after dropping its first
  # `n` graphemes, or 0 when the input runs out first. Uses the same two-byte
  # entry check as length/1, with the run scan capped at `n`.

  def byte_size_remaining_at(unicode, 0), do: byte_size_unicode(unicode)

  def byte_size_remaining_at(<<byte1, byte2, rest::binary>> = binary, n)
      when n > 0 and byte1 <= 127 and byte1 != ?\r and byte2 <= 127 and byte2 != ?\r do
    # `n > 0` here, so the starting count of 1 never exceeds the limit.
    skip = skip_length(rest, 1, n)
    byte_size_remaining_at(binary_part(binary, skip, byte_size(binary) - skip), n - skip)
  end

  def byte_size_remaining_at(unicode, n) when n > 0 do
    case :unicode_util.gc(unicode) do
      # A single-element list: this grapheme was the last one.
      [_] -> 0
      [_ | rest] -> byte_size_remaining_at(rest, n - 1)
      [] -> 0
      {:error, <<_, bin::bits>>} -> byte_size_remaining_at(bin, n - 1)
    end
  end

  # The remainder is either a binary or a non-empty list of such remainders.
  defp byte_size_unicode(binary) when is_binary(binary), do: byte_size(binary)
  defp byte_size_unicode([head]), do: byte_size_unicode(head)
  defp byte_size_unicode([head | tail]), do: byte_size_unicode(head) + byte_size_unicode(tail)
end

# Inputs for the length/1 comparison, keyed by the label shown in the report.
length_inputs = %{
  "empty" => "",
  # Pure ASCII at three sizes.
  "ascii_256" => String.duplicate("a", 256),
  "ascii_8k" => String.duplicate("x", 8_192),
  "ascii_256k" => String.duplicate("y", 256 * 1024),
  # A lone \r in the middle of an otherwise ASCII string.
  "ascii_with_cr" => String.duplicate("a", 200) <> "\r" <> String.duplicate("b", 200),
  # One non-ASCII character between two ASCII runs.
  "mixed_tail" => String.duplicate("a", 500) <> "é" <> String.duplicate("z", 100),
  # \r next to 0xFF, a byte that never occurs in valid UTF-8.
  "cr_invalid_bytes_a" => "\r\xFF\t\v",
  "cr_invalid_bytes_b" => "\r\t\xFF\v"
}

# Both implementations must agree on every input before anything is timed.
Enum.each(length_inputs, fn {label, bin} ->
  expected = Baseline.length(bin)
  actual = Optimised.length(bin)

  if expected !== actual do
    raise "length mismatch on #{inspect(label)}: baseline=#{expected} optimised=#{actual}"
  end
end)

# Pure-ASCII fixtures: 4096 bytes and 256 KiB.
big_ascii = String.duplicate("z", 4096)
huge_ascii = String.duplicate("p", 256 * 1024)

# Inputs for the byte_size_remaining_at/2 comparison: {binary, graphemes to skip}.
slice_inputs = %{
  "skip0" => {big_ascii, 0},
  "skip9" => {big_ascii, 9},
  "skip64" => {big_ascii, 64},
  "skip512" => {big_ascii, 512},
  "skip8192_huge" => {huge_ascii, 8192},
  # The skip count ends before the \r / non-ASCII character is reached.
  "skip_with_cr" => {String.duplicate("a", 300) <> "\r" <> String.duplicate("b", 300), 150},
  "skip_mixed" => {String.duplicate("q", 400) <> "ñ" <> String.duplicate("r", 400), 200},
  "cr_invalid" => {"\r\xFF\t\v", 2}
}

# Both implementations must agree on every input before anything is timed.
Enum.each(slice_inputs, fn {label, {bin, n}} ->
  expected = Baseline.byte_size_remaining_at(bin, n)
  actual = Optimised.byte_size_remaining_at(bin, n)

  if expected !== actual do
    raise "byte_size_remaining_at mismatch on #{inspect(label)} (#{n}): baseline=#{expected} optimised=#{actual}"
  end
end)

# Benchee settings shared by both runs: 5 s of measurement after a 2 s warmup,
# no memory measurement, and `fast_warning` turned off in the printed output.
bench_opts = [
  time: 5,
  memory_time: 0,
  print: [fast_warning: false],
  warmup: 2
]

# Section header for the first benchmark.
IO.puts("== length/1 ==\n")

defmodule Erlang do
  @moduledoc false

  # Thin wrapper that delegates to OTP's :string.length/1.
  def length(bin), do: :string.length(bin)
end

# length/1: each job receives one binary from length_inputs.
length_jobs = %{
  "baseline" => fn input -> Baseline.length(input) end,
  "optimised" => fn input -> Optimised.length(input) end
}

Benchee.run(length_jobs, Keyword.merge([inputs: length_inputs], bench_opts))

IO.puts("\n== byte_size_remaining_at/2 ==\n")

# byte_size_remaining_at/2: each job receives a {binary, skip_count} pair from
# slice_inputs.
slice_jobs = %{
  "baseline" => fn {input, count} -> Baseline.byte_size_remaining_at(input, count) end,
  "optimised_hybrid" => fn {input, count} -> Optimised.byte_size_remaining_at(input, count) end
}

Benchee.run(slice_jobs, Keyword.merge([inputs: slice_inputs], bench_opts))

@NelsonVides
Copy link
Copy Markdown
Contributor Author

Results
 ➜ elixir string.exs
== length/1 ==

Operating System: macOS
CPU Information: Apple M4 Pro
Number of Available Cores: 14
Available memory: 48 GB
Elixir 1.18.4
Erlang 27.3.4.2
JIT enabled: true

Benchmark suite executing with the following configuration:
warmup: 2 s
time: 5 s
memory time: 0 ns
reduction time: 0 ns
parallel: 1
inputs: ascii_256, ascii_256k, ascii_8k, ascii_with_cr, cr_invalid_bytes_a, cr_invalid_bytes_b, empty, mixed_tail
Estimated total run time: 1 min 52 s
Excluding outliers: false

Benchmarking baseline with input ascii_256 ...
Benchmarking baseline with input ascii_256k ...
Benchmarking baseline with input ascii_8k ...
Benchmarking baseline with input ascii_with_cr ...
Benchmarking baseline with input cr_invalid_bytes_a ...
Benchmarking baseline with input cr_invalid_bytes_b ...
Benchmarking baseline with input empty ...
Benchmarking baseline with input mixed_tail ...
Benchmarking optimised with input ascii_256 ...
Benchmarking optimised with input ascii_256k ...
Benchmarking optimised with input ascii_8k ...
Benchmarking optimised with input ascii_with_cr ...
Benchmarking optimised with input cr_invalid_bytes_a ...
Benchmarking optimised with input cr_invalid_bytes_b ...
Benchmarking optimised with input empty ...
Benchmarking optimised with input mixed_tail ...
Calculating statistics...
Formatting results...

##### With input ascii_256 #####
Name                ips        average  deviation         median         99th %
optimised        5.93 M      168.69 ns   ±981.59%         166 ns         292 ns
baseline         3.01 M      331.93 ns  ±1010.94%         292 ns         500 ns

Comparison:
optimised        5.93 M
baseline         3.01 M - 1.97x slower +163.24 ns

##### With input ascii_256k #####
Name                ips        average  deviation         median         99th %
optimised       21.57 K       46.36 μs     ±8.19%       45.92 μs       53.88 μs
baseline         4.13 K      242.00 μs    ±10.56%      233.25 μs      336.26 μs

Comparison:
optimised       21.57 K
baseline         4.13 K - 5.22x slower +195.63 μs

##### With input ascii_8k #####
Name                ips        average  deviation         median         99th %
optimised      548.49 K        1.82 μs   ±113.00%        1.83 μs        2.29 μs
baseline       128.76 K        7.77 μs    ±47.47%        7.42 μs       10.79 μs

Comparison:
optimised      548.49 K
baseline       128.76 K - 4.26x slower +5.94 μs

##### With input ascii_with_cr #####
Name                ips        average  deviation         median         99th %
optimised        4.11 M      243.42 ns   ±907.00%         250 ns         375 ns
baseline         1.85 M      539.26 ns    ±41.88%         541 ns         750 ns

Comparison:
optimised        4.11 M
baseline         1.85 M - 2.22x slower +295.85 ns

##### With input cr_invalid_bytes_a #####
Name                ips        average  deviation         median         99th %
optimised        6.86 M      145.70 ns   ±184.35%         125 ns         209 ns
baseline         6.34 M      157.67 ns   ±409.21%         166 ns         292 ns

Comparison:
optimised        6.86 M
baseline         6.34 M - 1.08x slower +11.97 ns

##### With input cr_invalid_bytes_b #####
Name                ips        average  deviation         median         99th %
optimised        8.24 M      121.40 ns   ±700.34%      108.40 ns      233.40 ns
baseline         5.74 M      174.35 ns  ±1992.07%         166 ns         209 ns

Comparison:
optimised        8.24 M
baseline         5.74 M - 1.44x slower +52.95 ns

##### With input empty #####
Name                ips        average  deviation         median         99th %
optimised       63.13 M       15.84 ns  ±3846.88%       12.50 ns          25 ns
baseline        26.45 M       37.81 ns  ±2617.40%          42 ns          83 ns

Comparison:
optimised       63.13 M
baseline        26.45 M - 2.39x slower +21.97 ns

##### With input mixed_tail #####
Name                ips        average  deviation         median         99th %
optimised        3.43 M      291.12 ns    ±49.13%         292 ns         417 ns
baseline         1.28 M      782.89 ns   ±360.58%         750 ns        1125 ns

Comparison:
optimised        3.43 M
baseline         1.28 M - 2.69x slower +491.76 ns

== byte_size_remaining_at/2 ==

Operating System: macOS
CPU Information: Apple M4 Pro
Number of Available Cores: 14
Available memory: 48 GB
Elixir 1.18.4
Erlang 27.3.4.2
JIT enabled: true

Benchmark suite executing with the following configuration:
warmup: 2 s
time: 5 s
memory time: 0 ns
reduction time: 0 ns
parallel: 1
inputs: cr_invalid, skip0, skip512, skip64, skip8192_huge, skip9, skip_mixed, skip_with_cr
Estimated total run time: 1 min 52 s
Excluding outliers: false

Benchmarking baseline with input cr_invalid ...
Benchmarking baseline with input skip0 ...
Benchmarking baseline with input skip512 ...
Benchmarking baseline with input skip64 ...
Benchmarking baseline with input skip8192_huge ...
Benchmarking baseline with input skip9 ...
Benchmarking baseline with input skip_mixed ...
Benchmarking baseline with input skip_with_cr ...
Benchmarking optimised_hybrid with input cr_invalid ...
Benchmarking optimised_hybrid with input skip0 ...
Benchmarking optimised_hybrid with input skip512 ...
Benchmarking optimised_hybrid with input skip64 ...
Benchmarking optimised_hybrid with input skip8192_huge ...
Benchmarking optimised_hybrid with input skip9 ...
Benchmarking optimised_hybrid with input skip_mixed ...
Benchmarking optimised_hybrid with input skip_with_cr ...
Calculating statistics...
Formatting results...

##### With input cr_invalid #####
Name                       ips        average  deviation         median         99th %
baseline               15.89 M       62.91 ns  ±1559.90%       54.20 ns      104.20 ns
optimised_hybrid        6.29 M      159.01 ns  ±4737.11%          84 ns         208 ns

Comparison:
baseline               15.89 M
optimised_hybrid        6.29 M - 2.53x slower +96.10 ns

##### With input skip0 #####
Name                       ips        average  deviation         median         99th %
baseline              271.07 M        3.69 ns    ±20.68%        3.67 ns        4.58 ns
optimised_hybrid      267.83 M        3.73 ns    ±23.43%        3.67 ns        4.83 ns

Comparison:
baseline              271.07 M
optimised_hybrid      267.83 M - 1.01x slower +0.0445 ns

##### With input skip512 #####
Name                       ips        average  deviation         median         99th %
optimised_hybrid        1.28 M        0.78 μs   ±488.99%        0.75 μs        0.96 μs
baseline               0.181 M        5.53 μs   ±112.66%        5.21 μs       13.54 μs

Comparison:
optimised_hybrid        1.28 M
baseline               0.181 M - 7.05x slower +4.74 μs

##### With input skip64 #####
Name                       ips        average  deviation         median         99th %
baseline                1.36 M      732.89 ns   ±711.92%         667 ns        1625 ns
optimised_hybrid        1.27 M      784.35 ns   ±458.58%         750 ns         959 ns

Comparison:
baseline                1.36 M
optimised_hybrid        1.27 M - 1.07x slower +51.46 ns

##### With input skip8192_huge #####
Name                       ips        average  deviation         median         99th %
optimised_hybrid       21.39 K       46.75 μs    ±54.60%       45.58 μs       59.79 μs
baseline               11.40 K       87.74 μs    ±14.71%       86.25 μs      126.47 μs

Comparison:
optimised_hybrid       21.39 K
baseline               11.40 K - 1.88x slower +40.99 μs

##### With input skip9 #####
Name                       ips        average  deviation         median         99th %
baseline               10.41 M       96.10 ns   ±972.33%       87.50 ns      141.60 ns
optimised_hybrid        1.27 M      788.92 ns   ±530.06%         750 ns         959 ns

Comparison:
baseline               10.41 M
optimised_hybrid        1.27 M - 8.21x slower +692.82 ns

##### With input skip_mixed #####
Name                       ips        average  deviation         median         99th %
optimised_hybrid        7.35 M       0.136 μs  ±2530.64%       0.125 μs        0.21 μs
baseline                0.45 M        2.22 μs   ±203.68%        2.08 μs        5.04 μs

Comparison:
optimised_hybrid        7.35 M
baseline                0.45 M - 16.29x slower +2.08 μs

##### With input skip_with_cr #####
Name                       ips        average  deviation         median         99th %
optimised_hybrid        9.73 M       0.103 μs  ±3224.67%      0.0830 μs       0.167 μs
baseline                0.59 M        1.70 μs   ±340.94%        1.58 μs        2.88 μs

Comparison:
optimised_hybrid        9.73 M
baseline                0.59 M - 16.54x slower +1.60 μs

@josevalim josevalim merged commit 74bb90a into elixir-lang:main Apr 9, 2026
15 checks passed
@josevalim
Copy link
Copy Markdown
Member

💚 💙 💜 💛 ❤️

@NelsonVides NelsonVides deleted the perf/string_swar branch April 10, 2026 05:12
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Development

Successfully merging this pull request may close these issues.

2 participants