Skip to content

rewrite masking function to be tail-recursive and use binary matches#17

Merged
the-mikedavis merged 2 commits intomainfrom
masking-function-rewrite
Jun 29, 2021
Merged

rewrite masking function to be tail-recursive and use binary matches#17
the-mikedavis merged 2 commits intomainfrom
masking-function-rewrite

Conversation

@the-mikedavis
Copy link
Copy Markdown
Collaborator

@the-mikedavis the-mikedavis commented Jun 28, 2021

closes #16

saw these results with benchee:

Operating System: Linux
CPU Information: Intel(R) Core(TM) i7-9700KF CPU @ 3.60GHz
Number of Available Cores: 8
Available memory: 31.30 GB
Elixir 1.12.1
Erlang 24.0

Benchmark suite executing with the following configuration:
warmup: 2 s
time: 10 s
memory time: 2 s
parallel: 1
inputs: none specified
Estimated total run time: 28 s

Benchmarking match...
Benchmarking stream...

Name             ips        average  deviation         median         99th %
match        10.11 K       98.91 μs    ±19.70%       96.12 μs      203.03 μs
stream        1.95 K      511.57 μs     ±5.05%      505.31 μs      623.51 μs

Comparison: 
match        10.11 K
stream        1.95 K - 5.17x slower +412.66 μs

Memory usage statistics: 

Name      Memory usage
match          0.38 MB
stream         2.06 MB - 5.39x memory usage +1.68 MB

**All measurements for memory usage were the same**
the bench file
Mix.install([:benchee])

defmodule Foo do
  @moduledoc """
  Benchmark subjects: two ways of XOR-masking a binary payload with a mask.

  Both functions XOR byte `i` of the payload with byte `rem(i, 4)` of the mask
  and return the transformed binary.
  """

  @doc """
  Masks `payload` one byte at a time, cycling lazily through the mask bytes.

  The mask must be exactly four bytes. This is the baseline the benchmark
  compares against, so it is deliberately left as-is rather than optimized.
  """
  @spec stream_mask(binary(), binary()) :: binary()
  def stream_mask(payload, <<a, b, c, d>>) do
    [a, b, c, d]
    |> Stream.cycle()
    |> Enum.reduce_while({payload, _acc = <<>>}, fn
      # Payload exhausted: stop cycling and hand back what was built.
      _mask_key, {<<>>, acc} ->
        {:halt, acc}

      mask_key, {<<part_key::integer, payload_rest::binary>>, acc} ->
        {:cont, {payload_rest, <<acc::binary, Bitwise.bxor(mask_key, part_key)::integer>>}}
    end)
  end

  @doc """
  Masks `payload` up to four bytes per step using binary pattern matching.

  Returns `payload` untouched when `mask` is `nil`. `acc` holds the output
  built so far; callers normally leave it at its default of `<<>>`.
  """
  @spec match_mask(binary(), binary() | nil, binary()) :: binary()
  def match_mask(payload, mask, acc \\ <<>>)

  def match_mask(payload, nil, _acc), do: payload

  # Clauses are generated widest-first, so a full 4-byte word is always tried
  # before the shorter ones. n=4 is the happy path; n=3, 2, 1 are only reached
  # when fewer than four payload bytes remain, and then the same number of
  # leading mask bytes is used. The step is written out (`//-1`) because
  # relying on the implicit negative step of `4..1` is deprecated.
  for n <- 4..1//-1 do
    def match_mask(
          <<part_key::integer-size(8)-unit(unquote(n)), payload_rest::binary>>,
          <<mask_key::integer-size(8)-unit(unquote(n)), _::binary>> = mask,
          acc
        ) do
      match_mask(
        payload_rest,
        mask,
        <<acc::binary, Bitwise.bxor(mask_key, part_key)::integer-size(8)-unit(unquote(n))>>
      )
    end
  end

  # Nothing left to mask: the accumulator is the result.
  def match_mask(<<>>, _mask, acc), do: acc
end

# Random 10 kB payload and 4-byte mask shared by both benchmark jobs.
payload = :crypto.strong_rand_bytes(10_000)
mask = :crypto.strong_rand_bytes(4)

# Each job masks the same payload with the same mask, so only the
# implementation differs between the two measurements.
jobs = %{
  "stream" => fn -> Foo.stream_mask(payload, mask) end,
  "match" => fn -> Foo.match_mask(payload, mask) end
}

Benchee.run(jobs, time: 10, memory_time: 2)

woof that was a lot of memory consumption beforehand!


what's this masking function?

When the client sends frames to the server, it "masks" the payload of each frame using 4 random bytes. To mask a payload, you XOR each byte of the payload with a byte of the mask, walking through both in order. When you reach the end of the mask bytes, you start over from the first mask byte. When you reach the end of the payload, you're done!

From RFC6455 section 5.3 on masking:

Octet i of the transformed data ("transformed-octet-i") is the XOR of
octet i of the original data ("original-octet-i") with octet at index
i modulo 4 of the masking key ("masking-key-octet-j"):

     j                   = i MOD 4
     transformed-octet-i = original-octet-i XOR masking-key-octet-j

I don't quite remember why we do this (cache busting, some outdated idea of a security mechanism, etc.), but we do this often, so might as well make it efficient.

@the-mikedavis the-mikedavis self-assigned this Jun 28, 2021
@the-mikedavis the-mikedavis requested a review from a team June 28, 2021 22:13
@the-mikedavis
Copy link
Copy Markdown
Collaborator Author

💥 compiling inline makes it even better dang

bench file
Mix.install([:benchee])

defmodule Foo do
  @moduledoc """
  Benchmark subjects: three ways of XOR-masking a binary payload with a mask.

  All functions XOR byte `i` of the payload with byte `rem(i, 4)` of the mask
  and return the transformed binary. `match_mask/3` and `compile_mask/3` share
  the same clauses; the only difference is that `compile_mask` is compiled
  with `:inline`.
  """

  @doc """
  Masks `payload` one byte at a time, cycling lazily through the mask bytes.

  The mask must be exactly four bytes. This is the baseline the benchmark
  compares against, so it is deliberately left as-is rather than optimized.
  """
  @spec stream_mask(binary(), binary()) :: binary()
  def stream_mask(payload, <<a, b, c, d>>) do
    [a, b, c, d]
    |> Stream.cycle()
    |> Enum.reduce_while({payload, _acc = <<>>}, fn
      # Payload exhausted: stop cycling and hand back what was built.
      _mask_key, {<<>>, acc} ->
        {:halt, acc}

      mask_key, {<<part_key::integer, payload_rest::binary>>, acc} ->
        {:cont, {payload_rest, <<acc::binary, Bitwise.bxor(mask_key, part_key)::integer>>}}
    end)
  end

  @doc """
  Masks `payload` up to four bytes per step using binary pattern matching.

  Returns `payload` untouched when `mask` is `nil`. `acc` holds the output
  built so far; callers normally leave it at its default of `<<>>`.
  """
  @spec match_mask(binary(), binary() | nil, binary()) :: binary()
  def match_mask(payload, mask, acc \\ <<>>)

  def match_mask(payload, nil, _acc), do: payload

  # Clauses are generated widest-first, so a full 4-byte word is always tried
  # before the shorter ones. n=4 is the happy path; n=3, 2, 1 are only reached
  # when fewer than four payload bytes remain, and then the same number of
  # leading mask bytes is used. The step is written out (`//-1`) because
  # relying on the implicit negative step of `4..1` is deprecated.
  for n <- 4..1//-1 do
    def match_mask(
          <<part_key::integer-size(8)-unit(unquote(n)), payload_rest::binary>>,
          <<mask_key::integer-size(8)-unit(unquote(n)), _::binary>> = mask,
          acc
        ) do
      match_mask(
        payload_rest,
        mask,
        <<acc::binary, Bitwise.bxor(mask_key, part_key)::integer-size(8)-unit(unquote(n))>>
      )
    end
  end

  # Nothing left to mask: the accumulator is the result.
  def match_mask(<<>>, _mask, acc), do: acc

  # Same clauses as match_mask/3, but compiled with :inline so the benchmark
  # can measure what the compiler option alone buys.
  @compile {:inline, compile_mask: 2, compile_mask: 3}

  @doc """
  Inlined twin of `match_mask/3`: same arguments, same result.

  Returns `payload` untouched when `mask` is `nil`. `acc` holds the output
  built so far; callers normally leave it at its default of `<<>>`.
  """
  @spec compile_mask(binary(), binary() | nil, binary()) :: binary()
  def compile_mask(payload, mask, acc \\ <<>>)

  def compile_mask(payload, nil, _acc), do: payload

  # Widest-first clause generation, exactly as for match_mask/3: n=4 is the
  # happy path and n=3, 2, 1 handle a final chunk shorter than four bytes.
  for n <- 4..1//-1 do
    def compile_mask(
          <<part_key::integer-size(8)-unit(unquote(n)), payload_rest::binary>>,
          <<mask_key::integer-size(8)-unit(unquote(n)), _::binary>> = mask,
          acc
        ) do
      compile_mask(
        payload_rest,
        mask,
        <<acc::binary, Bitwise.bxor(mask_key, part_key)::integer-size(8)-unit(unquote(n))>>
      )
    end
  end

  # Nothing left to mask: the accumulator is the result.
  def compile_mask(<<>>, _mask, acc), do: acc
end

# One random 4-byte mask shared by every job and every input size.
# Payloads come from the `inputs:` option below, so no top-level payload
# binding is needed here.
mask = :crypto.strong_rand_bytes(4)

Benchee.run(
  %{
    "stream" => fn payload -> Foo.stream_mask(payload, mask) end,
    "match" => fn payload -> Foo.match_mask(payload, mask) end,
    "compile" => fn payload -> Foo.compile_mask(payload, mask) end
  },
  time: 10,
  memory_time: 2,
  # Each job is run once per input, receiving the binary as its argument.
  inputs: %{
    "Small" => :crypto.strong_rand_bytes(100),
    "Medium" => :crypto.strong_rand_bytes(10_000),
    "Large" => :crypto.strong_rand_bytes(1_000_000)
  }
)

and the results:

Operating System: Linux
CPU Information: Intel(R) Core(TM) i7-9700KF CPU @ 3.60GHz
Number of Available Cores: 8
Available memory: 31.30 GB
Elixir 1.12.1
Erlang 24.0

Benchmark suite executing with the following configuration:
warmup: 2 s
time: 10 s
memory time: 2 s
parallel: 1
inputs: Large, Medium, Small
Estimated total run time: 2.10 min

Benchmarking compile with input Large...
Benchmarking compile with input Medium...
Benchmarking compile with input Small...
Benchmarking match with input Large...
Benchmarking match with input Medium...
Benchmarking match with input Small...
Benchmarking stream with input Large...
Benchmarking stream with input Medium...
Benchmarking stream with input Small...

##### With input Large #####
Name              ips        average  deviation         median         99th %
compile        102.28        9.78 ms     ±3.34%        9.76 ms       10.69 ms
match           75.66       13.22 ms     ±3.79%       13.19 ms       14.41 ms
stream          14.29       69.99 ms     ±3.87%       69.99 ms       76.22 ms

Comparison: 
compile        102.28
match           75.66 - 1.35x slower +3.44 ms
stream          14.29 - 7.16x slower +60.21 ms

Memory usage statistics:

Name       Memory usage
compile        23.84 MB
match          38.15 MB - 1.60x memory usage +14.31 MB
stream        206.00 MB - 8.64x memory usage +182.15 MB

**All measurements for memory usage were the same**

##### With input Medium #####
Name              ips        average  deviation         median         99th %
compile        5.49 K      182.31 μs    ±34.67%      186.93 μs      330.78 μs
match          3.85 K      259.92 μs    ±25.36%      250.79 μs      405.32 μs
stream         0.85 K     1179.86 μs    ±28.18%     1170.20 μs     1905.72 μs

Comparison: 
compile        5.49 K
match          3.85 K - 1.43x slower +77.61 μs
stream         0.85 K - 6.47x slower +997.55 μs

Memory usage statistics:

Name       Memory usage
compile       245.31 KB
match         391.83 KB - 1.60x memory usage +146.52 KB
stream       2111.56 KB - 8.61x memory usage +1866.25 KB

**All measurements for memory usage were the same**

##### With input Small #####
Name              ips        average  deviation         median         99th %
compile      211.05 K        4.74 μs  ±2293.28%        1.51 μs        3.45 μs
match        162.84 K        6.14 μs  ±1770.60%        1.90 μs        4.65 μs
stream        47.81 K       20.92 μs   ±512.20%        6.76 μs      461.43 μs

Comparison: 
compile      211.05 K
match        162.84 K - 1.30x slower +1.40 μs
stream        47.81 K - 4.41x slower +16.18 μs

Memory usage statistics:

Name       Memory usage
compile         3.61 KB
match           5.11 KB - 1.42x memory usage +1.50 KB
stream         23.28 KB - 6.45x memory usage +19.67 KB

**All measurements for memory usage were the same**

@the-mikedavis
Copy link
Copy Markdown
Collaborator Author

looks like gun does this in just about the same way but without the metaprogramming: https://github.com/ninenines/cowlib/blob/0f5c2f8922c89c58f51696cce690245cbdc5f327/src/cow_ws.erl#L526-L542

%% XORs a payload with a 32-bit integer masking key, one 32-bit word at a
%% time, appending the result to the accumulator given as third argument.
%% A final chunk of 3, 2 or 1 bytes is XORed with the same number of
%% leading bytes of the key.
mask(<<>>, _Key, Out) ->
	Out;
mask(<< Word:32, Tail/bits >>, Key, Out) ->
	mask(Tail, Key, << Out/binary, (Word bxor Key):32 >>);
mask(<< Last:24 >>, Key, Out) ->
	%% Keep only the first three bytes of the key.
	<< KeyHead:24, _:8 >> = << Key:32 >>,
	<< Out/binary, (Last bxor KeyHead):24 >>;
mask(<< Last:16 >>, Key, Out) ->
	%% Keep only the first two bytes of the key.
	<< KeyHead:16, _:16 >> = << Key:32 >>,
	<< Out/binary, (Last bxor KeyHead):16 >>;
mask(<< Last:8 >>, Key, Out) ->
	%% Keep only the first byte of the key.
	<< KeyHead:8, _:24 >> = << Key:32 >>,
	<< Out/binary, (Last bxor KeyHead):8 >>.

@the-mikedavis the-mikedavis merged commit 053e79f into main Jun 29, 2021
@the-mikedavis the-mikedavis deleted the masking-function-rewrite branch June 29, 2021 15:47
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

improve masking function memory/speed

2 participants