Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[wasm] Add ConditionalSelect SIMD intrinsics #80145

Merged

Conversation

radekdoulik
Copy link
Member

It uses the existing OP_BSL, which performs the And, Not, And, and Or operations. LLVM emits it as v128.bitselect for us, so I think we don't need to use the llvm.wasm.bitselect.* intrinsics.

This should help in a few areas, such as SpanHelpers.ReplaceValueType and IndexOfAnyAsciiSearcher.IndexOfAnyLookup'1.

It improves JSON deserialization a bit:

| measurement | before | after |
| Json, non-ASCII text deserialize | 0.4343ms | 0.4275ms |
| Json, small deserialize | 0.0517ms | 0.0497ms |
| Json, large deserialize | 14.3995ms | 13.8217ms |

Example of emitted code:

> wa-info -d -f SpanHelper.*ReplaceValueType src\mono\sample\wasm\browser-bench\bin\Release\AppBundle\dotnet.wasm
(func corlib_System_SpanHelpers_ReplaceValueType_uint16_uint16__uint16__uint16_uint16_uintptr(param $0 i32, $1 i32, $2 i32, $3 i32, $4 i32, $5 i32))
...
    i16x8.eq    [SIMD]
    v128.bitselect    [SIMD]
    v128.store    [SIMD]
...

It uses the existing `OP_BSL`, which performs the And, Not, And, and Or
operations. LLVM emits it as `v128.bitselect` for us, so I think we
don't need to use the `llvm.wasm.bitselect.*` intrinsics.

This should help in a few areas, such as SpanHelpers.ReplaceValueType
and IndexOfAnyAsciiSearcher.IndexOfAnyLookup'1.

It improves JSON deserialization a bit:

| measurement | before | after |
|-:|-:|-:|
|       Json, non-ASCII text deserialize |     0.4343ms |     0.4275ms |
|                Json, small deserialize |     0.0517ms |     0.0497ms |
|                Json, large deserialize |    14.3995ms |    13.8217ms |

Example of emitted code:

    > wa-info -d -f SpanHelper.*ReplaceValueType src\mono\sample\wasm\browser-bench\bin\Release\AppBundle\dotnet.wasm
    (func corlib_System_SpanHelpers_ReplaceValueType_uint16_uint16__uint16__uint16_uint16_uintptr(param $0 i32, $1 i32, $2 i32, $3 i32, $4 i32, $5 i32))
    ...
        i16x8.eq    [SIMD]
        v128.bitselect    [SIMD]
        v128.store    [SIMD]
    ...
@radekdoulik
Copy link
Member Author

These are the changes in the emitted code for SpanHelpers.ReplaceValueType. I didn't want to put them in the commit message because the diff is quite long; the interesting parts are around the v128.bitselect instructions.

> wa-diff -d -f SpanHelper.*ReplaceValueType dotnet-before.wasm dotnet-after.wasm
(func corlib_System_SpanHelpers_ReplaceValueType_uint16_uint16__uint16__uint16_uint16_uintptr(param i32, i32, i32, i32, i32, i32))
...
   local $7 i32
   local $8 i32
-  local $9 v128
-  local $10 v128
-  local $11 i32
-  local $12 v128
-  local $13 i32
-  local $14 i64
-  local $15 i64
-  local $16 i64
+  local $9 i32
+  local $10 i32
+  local $11 v128
+  local $12 i32
+  local $13 v128
+  local $14 v128
+  local $15 i32
+  local $16 i32
   local $17 i32
-  local $18 v128
-  local $19 i32
-  local $20 i32
-  local $21 i32
   global.get $__stack_pointer
-  i32.const 256
+  i32.const 32
   i32.sub
-  local.tee $6
+  local.tee $8
   global.set $__stack_pointer
-  i32.const 4992746
+  i32.const 5001356
   i32.load8.u
   i32.eqz
   if
-   i32.const 1061380
+   i32.const 1062692
    call mono_aot_corlib_init_method
-   i32.const 4992746
+   i32.const 5001356
    i32.const 1
    i32.store8
...
      local.get $1
      i32.sub
-     local.set $19
-     i32.const 4916992
+     local.set $15
+     i32.const 4925088
      i32.load align:2
-     local.set $20
+     local.set $16
      loop
       local.get $0
-      local.get $8
+      local.get $7
       i32.const 1
       i32.shl
-      local.tee $11
+      local.tee $9
       i32.add
       i32.load16.u align:1
-      local.set $13
-      local.get $6
-      i32.const 4965720
+      local.set $10
+      local.get $8
+      i32.const 4974480
       i32.load align:2
       i32.load align:2
-      local.tee $7
+      local.tee $6
       i32.store offset:28 align:2
       block
-       local.get $7
+       local.get $6
        if
+        local.get $8
         local.get $6
-        local.get $7
         i32.store offset:12 align:2
         br
        
-       i32.const 4965720
+       i32.const 4974480
        i32.load align:2
-       local.set $7
-       local.get $6
-       i32.const 4965728
+       local.set $6
+       local.get $8
+       i32.const 4974488
        i32.load align:2
        call corlib_System_Collections_Generic_EqualityComparer_1_T_UINT16_CreateComparer
-       local.tee $21
+       local.tee $17
        i32.store offset:24 align:2
-       local.get $7
+       local.get $6
        i32.eqz
        br.if
-       local.get $7
-       local.get $7
+       local.get $6
+       local.get $6
        i32.load align:2
-       local.tee $17
+       local.tee $12
-       local.get $21
        local.get $17
+       local.get $12
        select
        i32.store align:2
-       local.get $6
-       local.get $17
+       local.get $8
+       local.get $12
        i32.store offset:20 align:2
-       local.get $7
+       local.get $6
        i32.const 9
        i32.shr.u
-       local.get $20
+       local.get $16
        i32.add
        i32.const 1
        i32.store8
-       local.get $6
-       i32.const 4965720
+       local.get $8
+       i32.const 4974480
        i32.load align:2
        i32.load align:2
-       local.tee $7
+       local.tee $6
        i32.store offset:16 align:2
+       local.get $8
        local.get $6
-       local.get $7
        i32.store offset:12 align:2
-       local.get $7
+       local.get $6
        i32.eqz
        br.if
       
-      local.get $11
-      local.get $19
+      local.get $9
+      local.get $15
       i32.eq
       br.if
       local.get $1
-      local.get $11
+      local.get $9
       i32.add
       local.get $3
-      local.get $13
-      local.get $13
+      local.get $10
+      local.get $10
       i32.const 65535
       i32.and
...
       select
       i32.store16 align:1
-      local.get $8
+      local.get $7
       i32.const 1
       i32.add
-      local.tee $8
+      local.tee $7
       local.get $4
       i32.ne
...
     i32.const 8
     i32.sub
-    local.set $13
+    local.set $6
     local.get $3
     i16x8.splat    [SIMD]
-    local.set $12
+    local.set $13
-    local.get $6
-    i32.const -1
-    i32.sub
-    i32.const 8
-    i32.or
-    local.set $7
-    local.get $6
-    i32.const 48
-    i32.add
-    i32.const 8
-    i32.or
-    local.set $4
-    local.get $6
-    i32.const 32
-    i32.add
-    i32.const 8
-    i32.or
-    local.set $3
     local.get $2
     i16x8.splat    [SIMD]
-    local.set $18
+    local.set $14
     loop
-     local.get $6
+     local.get $7
-     local.get $18
-     local.get $8
      i32.const 1
      i32.shl
-     local.tee $11
-     local.get $0
+     local.tee $9
+     local.get $1
      i32.add
-     v128.load    [SIMD]
-     local.tee $9
-     i16x8.eq    [SIMD]
      local.tee $10
-     v128.store offset:32 align:4    [SIMD]
-     local.get $6
+     i32.eqz
+     br.if
-     local.get $12
-     v128.store offset:48 align:4    [SIMD]
-     local.get $6
-     local.get $9
-     v128.store offset:64 align:4    [SIMD]
-     local.get $6
      local.get $10
-     v128.store64.lane offset:88 align:3 0    [SIMD]
-     local.get $6
+     local.get $13
+     local.get $0
-     local.get $12
-     v128.store64.lane offset:96 align:3 0    [SIMD]
-     local.get $6
-     local.get $9
-     v128.store64.lane offset:104 align:3 0    [SIMD]
-     local.get $6
-     i64.const 0
-     i64.store offset:80 align:3
-     local.get $6
-     local.get $10
-     i64x2.extract.lane 0    [SIMD]
-     i64.const -1
-     i64.xor
      local.get $9
-     i64x2.extract.lane 0    [SIMD]
-     i64.and
-     local.get $12
-     local.get $10
-     v128.and    [SIMD]
-     i64x2.extract.lane 0    [SIMD]
-     i64.or
-     i64.store offset:80 align:3
-     local.get $6
-     v128.load64.zero offset:80 align:3    [SIMD]
-     local.set $9
-     local.get $7
-     i64.load align:3
-     local.set $14
-     local.get $4
-     i64.load align:3
-     local.set $15
-     local.get $6
-     local.get $3
-     i64.load align:3
-     local.tee $16
-     i64.store offset:120 align:3
-     local.get $6
-     local.get $15
-     i64.store offset:128 align:3
-     local.get $6
-     local.get $14
-     i64.store offset:136 align:3
-     local.get $6
-     i64.const 0
-     i64.store offset:112 align:3
-     local.get $6
-     local.get $14
-     local.get $16
-     i64.const -1
-     i64.xor
-     i64.and
-     local.get $15
-     local.get $16
-     i64.and
-     i64.or
-     i64.store offset:112 align:3
-     local.get $6
-     v128.load64.zero offset:112 align:3    [SIMD]
-     local.set $10
-     local.get $1
-     local.get $11
      i32.add
+     v128.load    [SIMD]
      local.tee $11
-     i32.eqz
+     local.get $14
-     br.if
      local.get $11
-     local.get $9
-     local.get $10
+     i16x8.eq    [SIMD]
+     v128.bitselect    [SIMD]
-     i8x16.shuffle 0x17161514131211100706050403020100    [SIMD]
      v128.store    [SIMD]
-     local.get $8
+     local.get $7
      i32.const 8
      i32.add
-     local.tee $8
-     local.get $13
+     local.tee $7
+     local.get $6
      i32.lt.u
      br.if
...
     
     local.get $6
-    local.get $18
-    local.get $13
     i32.const 1
     i32.shl
-    local.tee $8
-    local.get $0
+    local.tee $7
+    local.get $1
     i32.add
-    v128.load    [SIMD]
     local.tee $9
-    i16x8.eq    [SIMD]
-    local.tee $10
-    v128.store offset:144 align:4    [SIMD]
-    local.get $6
-    local.get $12
-    v128.store offset:160 align:4    [SIMD]
-    local.get $6
-    local.get $9
-    v128.store offset:176 align:4    [SIMD]
-    local.get $6
-    local.get $10
-    v128.store64.lane offset:200 align:3 0    [SIMD]
-    local.get $6
-    local.get $12
-    v128.store64.lane offset:208 align:3 0    [SIMD]
-    local.get $6
-    local.get $9
-    v128.store64.lane offset:216 align:3 0    [SIMD]
-    local.get $6
-    i64.const 0
-    i64.store offset:192 align:3
-    local.get $6
-    local.get $10
-    i64x2.extract.lane 0    [SIMD]
-    i64.const -1
-    i64.xor
-    local.get $9
-    i64x2.extract.lane 0    [SIMD]
-    i64.and
-    local.get $12
-    local.get $10
-    v128.and    [SIMD]
-    i64x2.extract.lane 0    [SIMD]
-    i64.or
-    i64.store offset:192 align:3
-    local.get $6
-    v128.load64.zero offset:192 align:3    [SIMD]
-    local.set $9
-    local.get $6
-    local.get $6
-    i64.load offset:152 align:3
-    local.tee $14
-    i64.store offset:232 align:3
-    local.get $6
-    local.get $6
-    i64.load offset:168 align:3
-    local.tee $15
-    i64.store offset:240 align:3
-    local.get $6
-    local.get $6
-    i64.load offset:184 align:3
-    local.tee $16
-    i64.store offset:248 align:3
-    local.get $6
-    i64.const 0
-    i64.store offset:224 align:3
-    local.get $6
-    local.get $16
-    local.get $14
-    i64.const -1
-    i64.xor
-    i64.and
-    local.get $14
-    local.get $15
-    i64.and
-    i64.or
-    i64.store offset:224 align:3
-    local.get $6
-    v128.load64.zero offset:224 align:3    [SIMD]
-    local.set $10
-    local.get $1
-    local.get $8
-    i32.add
-    local.tee $8
     i32.eqz
     br.if
-    local.get $8
     local.get $9
-    local.get $10
-    i8x16.shuffle 0x17161514131211100706050403020100    [SIMD]
+    local.get $13
+    local.get $0
+    local.get $7
+    i32.add
+    v128.load    [SIMD]
+    local.tee $11
+    local.get $14
+    local.get $11
+    i16x8.eq    [SIMD]
+    v128.bitselect    [SIMD]
     v128.store    [SIMD]
    
-   local.get $6
-   i32.const 256
+   local.get $8
+   i32.const 32
    i32.add
    global.set $__stack_pointer

@build-analysis build-analysis bot mentioned this pull request Jan 4, 2023
@radekdoulik radekdoulik merged commit 3454d8c into dotnet:main Jan 4, 2023
@dotnet dotnet locked as resolved and limited conversation to collaborators Feb 3, 2023
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

2 participants