Optimize stackalloc zeroing via BLK #83255

EgorBo · 2023-03-10T11:36:29Z

Let's insert GT_BLK after GT_LCLHEAP to rely on the former to perform zeroing.

Codegen example:

void Test()
{
    var p = stackalloc byte[250];
    Consume(p);
}

Main:

; Method Program:Test():this
G_M000_IG01:                
       55                   push     rbp
       4883EC30             sub      rsp, 48
       488D6C2420           lea      rbp, [rsp+20H]
       48B8218F332784F40000 mov      rax, 0xF48427338F21 ;; GS-cookie
       48894508             mov      qword ptr [rbp+08H], rax
G_M000_IG02:                
       4883C420             add      rsp, 32
       B910000000           mov      ecx, 16
G_M000_IG03:                
       6A00                 push     0
       6A00                 push     0
       48FFC9               dec      rcx
       75F7                 jne      SHORT G_M000_IG03  ;; slow loop (zeroing 16 bytes at once)
       4883EC20             sub      rsp, 32
       488D4C2420           lea      rcx, [rsp+20H]
       FF1577AA2800         call     [Program:Consume(ulong)]
       48B9218F332784F40000 mov      rcx, 0xF48427338F21
       48394D08             cmp      qword ptr [rbp+08H], rcx
       7405                 je       SHORT G_M000_IG04
       E852F6BB5F           call     CORINFO_HELP_FAIL_FAST
G_M000_IG04:                
       90                   nop      
G_M000_IG05:                
       488D6510             lea      rsp, [rbp+10H]
       5D                   pop      rbp
       C3                   ret      
; Total bytes of code: 85

PR:

; Method Progr:Test():this
G_M435_IG01:              
       55                   push     rbp
       4883EC30             sub      rsp, 48
       C5F877               vzeroupper 
       488D6C2420           lea      rbp, [rsp+20H]
       48B878563412F0DEBC9A mov      rax, 0x9ABCDEF012345678
       48894508             mov      qword ptr [rbp+08H], rax
G_M435_IG02:              
       852424               test     dword ptr [rsp], esp
       4881EC00010000       sub      rsp, 256
       488D542420           lea      rdx, [rsp+20H]
       C5FC57C0             vxorps   ymm0, ymm0
       C5FE7F02             vmovdqu  ymmword ptr[rdx], ymm0
       C5FE7F4220           vmovdqu  ymmword ptr[rdx+20H], ymm0
       C5FE7F4240           vmovdqu  ymmword ptr[rdx+40H], ymm0
       C5FE7F4260           vmovdqu  ymmword ptr[rdx+60H], ymm0
       C5FE7F8280000000     vmovdqu  ymmword ptr[rdx+80H], ymm0
       C5FE7F82A0000000     vmovdqu  ymmword ptr[rdx+A0H], ymm0
       C5FE7F82C0000000     vmovdqu  ymmword ptr[rdx+C0H], ymm0
       C5FE7F82E0000000     vmovdqu  ymmword ptr[rdx+E0H], ymm0
       FF15F9926000         call     [Progr:Consume(ulong):this]
       48B978563412F0DEBC9A mov      rcx, 0x9ABCDEF012345678
       48394D08             cmp      qword ptr [rbp+08H], rcx
       7405                 je       SHORT G_M435_IG03
       E85472505F           call     CORINFO_HELP_FAIL_FAST
G_M435_IG03:              
       90                   nop      
G_M435_IG04:              
       488D6510             lea      rsp, [rbp+10H]
       5D                   pop      rbp
       C3                   ret      
; Total bytes of code: 131

For large constant sizes, this PR switches to calling memset, while Main's current implementation still uses the double-push loop.

Benchmark

BenchmarkSwitcher.FromAssembly(typeof(StackallocTests).Assembly).Run(args);

[CsvExporter]
public unsafe class StackallocBenchmarks
{
    [Benchmark] public void Stackalloc8() { byte* ptr = stackalloc byte[8]; Consume(ptr); }
    [Benchmark] public void Stackalloc16() { byte* ptr = stackalloc byte[16]; Consume(ptr); }
    [Benchmark] public void Stackalloc20() { byte* ptr = stackalloc byte[20]; Consume(ptr); }
    [Benchmark] public void Stackalloc32() { byte* ptr = stackalloc byte[32]; Consume(ptr); }
    [Benchmark] public void Stackalloc36() { byte* ptr = stackalloc byte[36]; Consume(ptr); }
    [Benchmark] public void Stackalloc40() { byte* ptr = stackalloc byte[40]; Consume(ptr); }
    [Benchmark] public void Stackalloc50() { byte* ptr = stackalloc byte[50]; Consume(ptr); }
    [Benchmark] public void Stackalloc64() { byte* ptr = stackalloc byte[64]; Consume(ptr); }
    [Benchmark] public void Stackalloc65() { byte* ptr = stackalloc byte[65]; Consume(ptr); }
    [Benchmark] public void Stackalloc80() { byte* ptr = stackalloc byte[80]; Consume(ptr); }
    [Benchmark] public void Stackalloc100() { byte* ptr = stackalloc byte[100]; Consume(ptr); }
    [Benchmark] public void Stackalloc110() { byte* ptr = stackalloc byte[110]; Consume(ptr); }
    [Benchmark] public void Stackalloc128() { byte* ptr = stackalloc byte[128]; Consume(ptr); }
    [Benchmark] public void Stackalloc129() { byte* ptr = stackalloc byte[129]; Consume(ptr); }
    [Benchmark] public void Stackalloc150() { byte* ptr = stackalloc byte[150]; Consume(ptr); }
    [Benchmark] public void Stackalloc180() { byte* ptr = stackalloc byte[180]; Consume(ptr); }
    [Benchmark] public void Stackalloc220() { byte* ptr = stackalloc byte[220]; Consume(ptr); }
    [Benchmark] public void Stackalloc256() { byte* ptr = stackalloc byte[256]; Consume(ptr); }
    [Benchmark] public void Stackalloc257() { byte* ptr = stackalloc byte[257]; Consume(ptr); }
    [Benchmark] public void Stackalloc300() { byte* ptr = stackalloc byte[300]; Consume(ptr); }
    [Benchmark] public void Stackalloc400() { byte* ptr = stackalloc byte[400]; Consume(ptr); }
    [Benchmark] public void Stackalloc500() { byte* ptr = stackalloc byte[500]; Consume(ptr); }
    [Benchmark] public void Stackalloc1024() { byte* ptr = stackalloc byte[1024]; Consume(ptr); }
    [Benchmark] public void Stackalloc4096() { byte* ptr = stackalloc byte[4096]; Consume(ptr); }

    [MethodImpl(MethodImplOptions.NoInlining)]
    static void Consume(byte* ptr) {}
}

Core i7 8700K

Ryzen 7950X

NOTE: 32 bytes and lower are handled separately so there are no differences for them.

ghost · 2023-03-10T11:36:43Z

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch, @kunalspathak
See info in area-owners.md if you want to be subscribed.

Issue Details

Let's see if this works - I just insert GT_BLK (basically, Unsafe.InitMemoryUnaligned) after CEE_LOCALLOC in importer to rely on that for zeroing. GT_BLK has its own logic to unroll/emit MEMSET.

   [Benchmark]
   public void Stackalloc50() { byte* ptr = stackalloc byte[40]; Consume(ptr); }
   
   [Benchmark]
   public void Stackalloc50() { byte* ptr = stackalloc byte[50]; Consume(ptr); }

   [Benchmark]
   public void Stackalloc50() { byte* ptr = stackalloc byte[64]; Consume(ptr); }
   
   [Benchmark]
   public void Stackalloc50() { byte* ptr = stackalloc byte[100]; Consume(ptr); }

   [Benchmark]
   public void Stackalloc50() { byte* ptr = stackalloc byte[128]; Consume(ptr); }

   [Benchmark]
   public void Stackalloc50() { byte* ptr = stackalloc byte[150]; Consume(ptr); }
   
   [Benchmark]
   public void Stackalloc50() { byte* ptr = stackalloc byte[256]; Consume(ptr); }
   
   [Benchmark]
   public void Stackalloc50() { byte* ptr = stackalloc byte[512]; Consume(ptr); }

   [Benchmark]
   public void Stackalloc50() { byte* ptr = stackalloc byte[1024]; Consume(ptr); }
   
   [Benchmark]
   public void Stackalloc50() { byte* ptr = stackalloc byte[4096]; Consume(ptr); }
   
   [Benchmark]
   public void Stackalloc50() { byte* ptr = stackalloc byte[8192]; Consume(ptr); }

   [MethodImpl(MethodImplOptions.NoInlining)]
   static void Consume(byte* ptr)
   {
   }

|         Method |                 Toolchain |       Mean | Ratio |
|--------------- |-------------------------- |-----------:|------:|
|   Stackalloc40 |    \Core_Root\corerun.exe |   1.493 ns |  1.00 |
|   Stackalloc40 | \Core_Root_PR\corerun.exe |   1.495 ns |  1.00 |
|                |                           |            |       |
|   Stackalloc50 |    \Core_Root\corerun.exe |   2.362 ns |  1.00 |
|   Stackalloc50 | \Core_Root_PR\corerun.exe |   1.321 ns |  0.56 |
|                |                           |            |       |
|   Stackalloc64 |    \Core_Root\corerun.exe |   2.354 ns |  1.00 |
|   Stackalloc64 | \Core_Root_PR\corerun.exe |   1.326 ns |  0.56 |
|                |                           |            |       |
|  Stackalloc100 |    \Core_Root\corerun.exe |   3.020 ns |  1.00 |
|  Stackalloc100 | \Core_Root_PR\corerun.exe |   1.316 ns |  0.44 |
|                |                           |            |       |
|  Stackalloc128 |    \Core_Root\corerun.exe |   3.230 ns |  1.00 |
|  Stackalloc128 | \Core_Root_PR\corerun.exe |   1.508 ns |  0.47 |
|                |                           |            |       |
|  Stackalloc150 |    \Core_Root\corerun.exe |   3.792 ns |  1.00 |
|  Stackalloc150 | \Core_Root_PR\corerun.exe |   4.536 ns |  1.20 |
|                |                           |            |       |
|  Stackalloc256 |    \Core_Root\corerun.exe |   6.295 ns |  1.00 |
|  Stackalloc256 | \Core_Root_PR\corerun.exe |   4.534 ns |  0.72 |
|                |                           |            |       |
|  Stackalloc512 |    \Core_Root\corerun.exe |  13.617 ns |  1.00 |
|  Stackalloc512 | \Core_Root_PR\corerun.exe |   4.760 ns |  0.35 |
|                |                           |            |       |
| Stackalloc1024 |    \Core_Root\corerun.exe |  26.741 ns |  1.00 |
| Stackalloc1024 | \Core_Root_PR\corerun.exe |   6.725 ns |  0.25 |
|                |                           |            |       |
| Stackalloc4096 |    \Core_Root\corerun.exe | 109.259 ns |  1.00 |
| Stackalloc4096 | \Core_Root_PR\corerun.exe |  30.131 ns |  0.28 |
|                |                           |            |       |
| Stackalloc8192 |    \Core_Root\corerun.exe | 217.935 ns |  1.00 |
| Stackalloc8192 | \Core_Root_PR\corerun.exe |  57.974 ns |  0.27 |

Author:	EgorBo
Assignees:	EgorBo
Labels:	`area-CodeGen-coreclr`
Milestone:	-

stephentoub · 2023-03-10T12:11:09Z

after 128 PR switches to MEMSET

For comparison, what does the graph look like if you don't do that?

EgorBo · 2023-03-10T13:33:31Z

after 128 PR switches to MEMSET

For comparison, what does the graph look like if you don't do that?

It looks like we might want to revise our heuristics, e.g. here is what Clang/LLVM does:
For a generic CPU with AVX it unrolls zeroing up to 256 bytes, e.g.: https://godbolt.org/z/59Mdodc7T (NOTE that I'm using -Os that stands for "Optimize but keep binary size sane")

For Zen4 (AMD 7xxx) it unrolls up to 512 bytes (AVX512): https://godbolt.org/z/PxvoE4P9r

For a generic CPU without AVX it unrolls up to 128 bytes: https://godbolt.org/z/b4vd13PMz

Our threshold is hard-coded to 128 bytes (and 256 bytes for ARM64).

EgorBo · 2023-03-10T13:59:25Z

NOTE: AFAIR, some (most?) libs in the BCL use skiplocalsinit globally, so they won't benefit from this change. To properly test it, I need to remove that flag.

stephentoub · 2023-03-10T14:01:31Z

some (most?) libs in BCL use skiplocalsinit globally

Yes, everything in the shared framework:

runtime/src/libraries/Directory.Build.targets

Line 209 in a923c64

    
           <SkipLocalsInit Condition="'$(SkipLocalsInit)' == '' and '$(MSBuildProjectExtension)' == '.csproj' and '$(IsNETCoreAppSrc)' == 'true' and '$(TargetFrameworkIdentifier)' == '.NETCoreApp'">true</SkipLocalsInit>

EgorBo · 2023-03-10T14:14:12Z

cc @anthonycanino (in case you're interested in adjusting the BLK unroll heuristic for AVX-512)

EgorBo · 2023-03-14T09:57:32Z

after 128 PR switches to MEMSET

For comparison, what does the graph look like if you don't do that?

Updated. Fixed via #83274

src/coreclr/jit/codegenxarch.cpp

…-simd

benaadams · 2023-03-17T20:08:58Z

Is stackalloc different from locals? .NET 7 will partially unroll and loop the local zeroing (though it currently only uses xmm):

       vxorps   xmm4, xmm4
       mov      rax, -0x2340
       vmovdqa  xmmword ptr [rbp+rax-60H], xmm4
       vmovdqa  xmmword ptr [rbp+rax-50H], xmm4
       vmovdqa  xmmword ptr [rbp+rax-40H], xmm4
       add      rax, 48
       jne      SHORT  -5 instr

Rather than this weird thing

G_M000_IG03:                
       push     0
       push     0
       dec      rcx
       jne      SHORT G_M000_IG03  ;; slow loop (zeroing 16 bytes at once)

benaadams · 2023-03-17T20:13:16Z

i.e. should stackalloc be part of locals?

EgorBo · 2023-03-17T21:21:21Z

i.e. should stackalloc be part of locals?

Good question. We do that if the stackalloc is smaller than 32 bytes. The problem with that is that we'll have to zero it all the time, whether we need it or not (since it's in the prologue). E.g. let me raise that limit to 128 bytes and check the codegen for this:

void Test(bool cond)
{
    if (cond)
    {
        // rarely taken condition
        var p = stackalloc byte[128];
        Consume(p);
    }
    else
    {
        Console.WriteLine();
    }
}

Codegen:

; Method Program:Test(bool):this
G_M34929_IG01:              ;; offset=0000H
       4881ECA8000000       sub      rsp, 168
       C5D857E4             vxorps   xmm4, xmm4
       C5F97F642420         vmovdqa  xmmword ptr [rsp+20H], xmm4
       C5F97F642430         vmovdqa  xmmword ptr [rsp+30H], xmm4
       48B8A0FFFFFFFFFFFFFF mov      rax, -96
       C5F97FA404A0000000   vmovdqa  xmmword ptr [rsp+rax+A0H], xmm4
       C5F97FA404B0000000   vmovdqa  xmmword ptr [rsp+rax+B0H], xmm4
       C5F97FA404C0000000   vmovdqa  xmmword ptr [rsp+rax+C0H], xmm4
       4883C030             add      rax, 48
       75DF                 jne      SHORT  -5 instr
       48B878563412F0DEBC9A mov      rax, 0x9ABCDEF012345678
       48898424A0000000     mov      qword ptr [rsp+A0H], rax
						;; size=84 bbWeight=1 PerfScore 13.33

G_M34929_IG02:              ;; offset=0054H
       84D2                 test     dl, dl
       742D                 je       SHORT G_M34929_IG06
						;; size=4 bbWeight=1 PerfScore 1.25

G_M34929_IG03:              ;; offset=0058H
       488D4C2420           lea      rcx, [rsp+20H]
       FF15150A7100         call     [Program:Consume(ulong)]
       48B978563412F0DEBC9A mov      rcx, 0x9ABCDEF012345678
       48398C24A0000000     cmp      qword ptr [rsp+A0H], rcx
       7405                 je       SHORT G_M34929_IG04
       E824CA4B5F           call     CORINFO_HELP_FAIL_FAST
						;; size=36 bbWeight=0.50 PerfScore 3.88

G_M34929_IG04:              ;; offset=007CH
       90                   nop      
						;; size=1 bbWeight=0.50 PerfScore 0.12

G_M34929_IG05:              ;; offset=007DH
       4881C4A8000000       add      rsp, 168
       C3                   ret      
						;; size=8 bbWeight=0.50 PerfScore 0.62

G_M34929_IG06:              ;; offset=0085H
       FF152DA69000         call     [System.Console:WriteLine()]
       48B978563412F0DEBC9A mov      rcx, 0x9ABCDEF012345678
       48398C24A0000000     cmp      qword ptr [rsp+A0H], rcx
       7405                 je       SHORT G_M34929_IG07
       E8FCC94B5F           call     CORINFO_HELP_FAIL_FAST
						;; size=31 bbWeight=0.50 PerfScore 3.62

G_M34929_IG07:              ;; offset=00A4H
       90                   nop      
						;; size=1 bbWeight=0.50 PerfScore 0.12

G_M34929_IG08:              ;; offset=00A5H
       4881C4A8000000       add      rsp, 168
       C3                   ret      
						;; size=8 bbWeight=0.50 PerfScore 0.62
; Total bytes of code: 173

Also, here we don't do stack probing.
I should probably decrease that threshold from 32 bytes to something that can be zeroed with a single instruction.

…-simd

EgorBo · 2023-03-31T14:29:18Z

@jakobbotsch @BruceForstall @dotnet/jit-contrib PTAL

I inject a BLK node in Lower for all stackalloc nodes (GT_LCLHEAP) with uses. I will enable arm64 separately: its default implementation is good, but it doesn't switch to a memset call for large buffers, so it will still benefit from BLK too.

src/coreclr/jit/importer.cpp

src/coreclr/jit/lower.cpp

Co-authored-by: SingleAccretion <62474226+SingleAccretion@users.noreply.github.com>

src/coreclr/jit/lower.cpp

ghost assigned EgorBo Mar 10, 2023

dotnet-issue-labeler bot added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Mar 10, 2023

EgorBo mentioned this pull request Mar 10, 2023

Unify unroll limits in a single entry point #83274

Merged

EgorBo marked this pull request as ready for review March 14, 2023 09:57

EgorBo marked this pull request as draft March 14, 2023 13:02

EgorBo force-pushed the stackalloc-zero-simd branch from 8c1d241 to 167ce5a Compare March 15, 2023 21:32

Better version

588b964

EgorBo force-pushed the stackalloc-zero-simd branch from a4a84ab to 588b964 Compare March 16, 2023 01:08

tannergooding reviewed Mar 16, 2023

View reviewed changes

src/coreclr/jit/codegenxarch.cpp Outdated Show resolved Hide resolved

runfoapp bot mentioned this pull request Mar 16, 2023

Infra improvements for Helix #68176

Closed

EgorBo added 2 commits March 17, 2023 18:53

Merge branch 'main' of github.com:dotnet/runtime into stackalloc-zero…

cce06f6

…-simd

Address feedback

47d8b80

This was referenced Mar 17, 2023

Test_EventSource_EtwManifestGeneration* tests failing in CI #48798

Closed

IOException running NuGet-Migrations during tests in dotnet CLI first run #80619

Closed

[release/6.0] Doublelinklist GC failures on Mono #83245

Closed

Fix build break

1662fc9

Update codegenxarch.cpp

14de9be

EgorBo marked this pull request as ready for review March 18, 2023 00:12

EgorBo added 2 commits March 31, 2023 13:06

Merge branch 'main' of github.com:dotnet/runtime into stackalloc-zero…

9f16ffe

…-simd

Move to lowering

f8e73a6

EgorBo added 4 commits March 31, 2023 15:07

Fix asserts

0d868d2

fix asserts

d0ea84b

clean up

234be45

fix build

9c94519

build-analysis bot mentioned this pull request Mar 31, 2023

Checkout failure: "Git fetch failed with exit code 128" dotnet/arcade#9009

Open

2 tasks

fix assert

e586154

SingleAccretion reviewed Mar 31, 2023

View reviewed changes

src/coreclr/jit/importer.cpp Outdated Show resolved Hide resolved

src/coreclr/jit/lower.cpp Outdated Show resolved Hide resolved

src/coreclr/jit/lower.cpp Outdated Show resolved Hide resolved

EgorBo and others added 5 commits March 31, 2023 19:41

Update src/coreclr/jit/lower.cpp

b1df7c7

Co-authored-by: SingleAccretion <62474226+SingleAccretion@users.noreply.github.com>

Address feedback

6acbebd

Fix assert

024ac3f

flip condition

29b0f7b

Fix formatting

1f0a599

BruceForstall reviewed Apr 3, 2023

View reviewed changes

src/coreclr/jit/lower.cpp Outdated Show resolved Hide resolved

EgorBo added 2 commits April 3, 2023 21:32

Update lower.cpp

cbecd3b

Update lower.cpp

0b0b25a

BruceForstall approved these changes Apr 4, 2023

View reviewed changes

EgorBo merged commit e13f0dc into dotnet:main Apr 4, 2023

EgorBo deleted the stackalloc-zero-simd branch April 4, 2023 23:36

ghost locked as resolved and limited conversation to collaborators May 5, 2023

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Optimize stackalloc zeroing via BLK #83255

Optimize stackalloc zeroing via BLK #83255

EgorBo commented Mar 10, 2023 •

edited

Loading

ghost commented Mar 10, 2023

stephentoub commented Mar 10, 2023

EgorBo commented Mar 10, 2023 •

edited

Loading

EgorBo commented Mar 10, 2023

stephentoub commented Mar 10, 2023 •

edited

Loading

EgorBo commented Mar 10, 2023 •

edited

Loading

EgorBo commented Mar 14, 2023

benaadams commented Mar 17, 2023 •

edited

Loading

benaadams commented Mar 17, 2023

EgorBo commented Mar 17, 2023 •

edited

Loading

EgorBo commented Mar 31, 2023

Optimize stackalloc zeroing via BLK #83255

Optimize stackalloc zeroing via BLK #83255

Conversation

EgorBo commented Mar 10, 2023 • edited Loading

Codegen example:

Main:

PR:

Benchmark

Core i7 8700K

Ryzen 7950X

ghost commented Mar 10, 2023

stephentoub commented Mar 10, 2023

EgorBo commented Mar 10, 2023 • edited Loading

EgorBo commented Mar 10, 2023

stephentoub commented Mar 10, 2023 • edited Loading

EgorBo commented Mar 10, 2023 • edited Loading

EgorBo commented Mar 14, 2023

benaadams commented Mar 17, 2023 • edited Loading

benaadams commented Mar 17, 2023

EgorBo commented Mar 17, 2023 • edited Loading

EgorBo commented Mar 31, 2023

EgorBo commented Mar 10, 2023 •

edited

Loading

EgorBo commented Mar 10, 2023 •

edited

Loading

stephentoub commented Mar 10, 2023 •

edited

Loading

EgorBo commented Mar 10, 2023 •

edited

Loading

benaadams commented Mar 17, 2023 •

edited

Loading

EgorBo commented Mar 17, 2023 •

edited

Loading