Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove AO from a couple of SpanHelpers methods #85819

Merged
merged 3 commits into from
May 5, 2023

Conversation

EgorBo
Copy link
Member

@EgorBo EgorBo commented May 5, 2023

Contributes to #85791 (removes a couple of methods jitted during Hello World start).

   3: JIT compiled System.SpanHelpers:IndexOfNullCharacter(ulong) [Tier1, IL size=805, code size=391]
   9: JIT compiled System.SpanHelpers:IndexOfNullByte(ulong) [Tier1, IL size=844, code size=459]
  25: JIT compiled System.SpanHelpers:SequenceCompareTo(byref,int,byref,int) [Tier1, IL size=568, code size=329]

I don't see a good reason for these to have [AO]; the R2R'd versions look good enough to me (SSE based):

R2R codegen:
; Assembly listing for method System.SpanHelpers:IndexOfNullCharacter(ulong):int
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; ReadyToRun compilation
; optimized code
; rsp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 3 single block inlinees; 0 inlinees without PGO data

G_M000_IG01:                ;; offset=0000H
       sub      rsp, 40
       vzeroupper

G_M000_IG02:                ;; offset=0007H
       xor      eax, eax
       mov      edx, 0x7FFFFFFF
       test     cl, 1
       jne      SHORT G_M000_IG04

G_M000_IG03:                ;; offset=0013H
       mov      edx, ecx
       neg      edx
       mov      r8d, edx
       shr      r8d, 31
       add      edx, r8d
       sar      edx, 1
       and      rdx, 7

G_M000_IG04:                ;; offset=0027H
       cmp      rdx, 4
       jl       SHORT G_M000_IG06

G_M000_IG05:                ;; offset=002DH
       cmp      word  ptr [rcx+2*rax], 0
       je       G_M000_IG21
       cmp      word  ptr [rcx+2*rax+02H], 0
       je       G_M000_IG20
       cmp      word  ptr [rcx+2*rax+04H], 0
       je       G_M000_IG19
       cmp      word  ptr [rcx+2*rax+06H], 0
       je       G_M000_IG18
       add      rax, 4
       add      rdx, -4
       cmp      rdx, 4
       jge      SHORT G_M000_IG05

G_M000_IG06:                ;; offset=006AH
       test     rdx, rdx
       jle      SHORT G_M000_IG08

G_M000_IG07:                ;; offset=006FH
       cmp      word  ptr [rcx+2*rax], 0
       je       G_M000_IG21
       inc      rax
       dec      rdx
       test     rdx, rdx
       jg       SHORT G_M000_IG07

G_M000_IG08:                ;; offset=0085H
       cmp      rax, 0x7FFFFFFF
       jge      G_M000_IG22
       lea      rdx, [rcx+2*rax]
       test     dl, 31
       je       SHORT G_M000_IG11

G_M000_IG09:                ;; offset=009AH
       vxorps   xmm0, xmm0, xmm0
       vpcmpeqw xmm0, xmm0, xmmword ptr [rcx+2*rax]
       vpmovmskb edx, xmm0
       test     edx, edx
       jne      SHORT G_M000_IG10
       add      rax, 8
       jmp      SHORT G_M000_IG11

G_M000_IG10:                ;; offset=00B1H
       xor      ecx, ecx
       tzcnt    ecx, edx
       shr      ecx, 1
       mov      edx, ecx
       add      eax, edx
       jmp      G_M000_IG21

G_M000_IG11:                ;; offset=00C2H
       mov      rdx, rax
       neg      rdx
       add      rdx, 0x7FFFFFFF
       and      rdx, -16
       jle      SHORT G_M000_IG13

G_M000_IG12:                ;; offset=00D5H
       vxorps   ymm0, ymm0, ymm0
       vpcmpeqw ymm0, ymm0, ymmword ptr [rcx+2*rax]
       vpmovmskb r8d, ymm0
       test     r8d, r8d
       jne      SHORT G_M000_IG15
       add      rax, 16
       add      rdx, -16
       test     rdx, rdx
       jg       SHORT G_M000_IG12

G_M000_IG13:                ;; offset=00F4H
       mov      r8, rax
       neg      r8
       add      r8, 0x7FFFFFFF
       and      r8, -8
       jle      SHORT G_M000_IG17

G_M000_IG14:                ;; offset=0107H
       vxorps   xmm0, xmm0, xmm0
       vpcmpeqw xmm0, xmm0, xmmword ptr [rcx+2*rax]
       vpmovmskb edx, xmm0
       test     edx, edx
       jne      SHORT G_M000_IG16
       add      rax, 8
       jmp      SHORT G_M000_IG17

G_M000_IG15:                ;; offset=011EH
       xor      edx, edx
       tzcnt    edx, r8d
       shr      edx, 1
       mov      ecx, edx
       add      eax, ecx
       jmp      SHORT G_M000_IG21

G_M000_IG16:                ;; offset=012DH
       tzcnt    edx, edx
       shr      edx, 1
       add      eax, edx
       jmp      SHORT G_M000_IG21

G_M000_IG17:                ;; offset=0137H
       cmp      rax, 0x7FFFFFFF
       jge      SHORT G_M000_IG22
       mov      rdx, rax
       neg      rdx
       add      rdx, 0x7FFFFFFF
       jmp      G_M000_IG04

G_M000_IG18:                ;; offset=0151H
       add      eax, 3
       jmp      SHORT G_M000_IG21

G_M000_IG19:                ;; offset=0156H
       add      eax, 2
       jmp      SHORT G_M000_IG21

G_M000_IG20:                ;; offset=015BH
       inc      eax

G_M000_IG21:                ;; offset=015DH
       vzeroupper
       add      rsp, 40
       ret

G_M000_IG22:                ;; offset=0165H
       call     [System.SpanHelpers:ThrowMustBeNullTerminatedString()]
       int3

; Total bytes of code 364



; Assembly listing for method System.SpanHelpers:IndexOfNullByte(ulong):int
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; ReadyToRun compilation
; optimized code
; rsp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 3 single block inlinees; 0 inlinees without PGO data

G_M000_IG01:                ;; offset=0000H
       sub      rsp, 40
       vzeroupper

G_M000_IG02:                ;; offset=0007H
       xor      eax, eax
       mov      edx, ecx
       and      edx, 15
       neg      edx
       add      edx, 16
       and      edx, 15

G_M000_IG03:                ;; offset=0016H
       cmp      rdx, 8
       jb       SHORT G_M000_IG05

G_M000_IG04:                ;; offset=001CH
       add      rdx, -8
       cmp      byte  ptr [rcx+rax], 0
       je       G_M000_IG19
       cmp      byte  ptr [rcx+rax+01H], 0
       je       G_M000_IG20
       cmp      byte  ptr [rcx+rax+02H], 0
       je       G_M000_IG21
       cmp      byte  ptr [rcx+rax+03H], 0
       je       G_M000_IG22
       cmp      byte  ptr [rcx+rax+04H], 0
       je       G_M000_IG23
       cmp      byte  ptr [rcx+rax+05H], 0
       je       G_M000_IG24
       cmp      byte  ptr [rcx+rax+06H], 0
       je       G_M000_IG25
       cmp      byte  ptr [rcx+rax+07H], 0
       je       G_M000_IG26
       add      rax, 8
       cmp      rdx, 8
       jae      SHORT G_M000_IG04

G_M000_IG05:                ;; offset=0081H
       cmp      rdx, 4
       jb       SHORT G_M000_IG07

G_M000_IG06:                ;; offset=0087H
       add      rdx, -4
       cmp      byte  ptr [rcx+rax], 0
       je       G_M000_IG19
       cmp      byte  ptr [rcx+rax+01H], 0
       je       G_M000_IG20
       cmp      byte  ptr [rcx+rax+02H], 0
       je       G_M000_IG21
       cmp      byte  ptr [rcx+rax+03H], 0
       je       G_M000_IG22
       add      rax, 4

G_M000_IG07:                ;; offset=00BAH
       test     rdx, rdx
       je       SHORT G_M000_IG09

G_M000_IG08:                ;; offset=00BFH
       dec      rdx
       cmp      byte  ptr [rcx+rax], 0
       je       G_M000_IG19
       inc      rax
       test     rdx, rdx
       jne      SHORT G_M000_IG08

G_M000_IG09:                ;; offset=00D4H
       cmp      rax, 0x7FFFFFFF
       jae      G_M000_IG28
       mov      edx, ecx
       add      rdx, rax
       test     dl, 31
       je       SHORT G_M000_IG11

G_M000_IG10:                ;; offset=00EAH
       vxorps   xmm0, xmm0, xmm0
       vpcmpeqb xmm0, xmm0, xmmword ptr [rcx+rax]
       vpmovmskb edx, xmm0
       test     edx, edx
       jne      SHORT G_M000_IG16
       add      rax, 16

G_M000_IG11:                ;; offset=00FFH
       mov      edx, eax
       neg      edx
       add      edx, 0x7FFFFFFF
       and      edx, -32
       cmp      rdx, rax
       jbe      SHORT G_M000_IG13

G_M000_IG12:                ;; offset=0111H
       vxorps   ymm0, ymm0, ymm0
       vpcmpeqb ymm0, ymm0, ymmword ptr [rcx+rax]
       vpmovmskb r8d, ymm0
       test     r8d, r8d
       jne      SHORT G_M000_IG17
       add      rax, 32
       cmp      rdx, rax
       ja       SHORT G_M000_IG12

G_M000_IG13:                ;; offset=012CH
       mov      edx, eax
       neg      edx
       add      edx, 0x7FFFFFFF
       and      edx, -16
       mov      r8d, edx
       cmp      r8, rax
       jbe      SHORT G_M000_IG15

G_M000_IG14:                ;; offset=0141H
       vxorps   xmm0, xmm0, xmm0
       vpcmpeqb xmm0, xmm0, xmmword ptr [rcx+rax]
       vpmovmskb edx, xmm0
       test     edx, edx
       jne      SHORT G_M000_IG18
       add      rax, 16

G_M000_IG15:                ;; offset=0156H
       cmp      rax, 0x7FFFFFFF
       jae      SHORT G_M000_IG28
       mov      rdx, rax
       neg      rdx
       add      rdx, 0x7FFFFFFF
       jmp      G_M000_IG03

G_M000_IG16:                ;; offset=0170H
       tzcnt    edx, edx
       add      eax, edx
       jmp      SHORT G_M000_IG27

G_M000_IG17:                ;; offset=0178H
       xor      edx, edx
       tzcnt    edx, r8d
       add      eax, edx
       jmp      SHORT G_M000_IG27

G_M000_IG18:                ;; offset=0183H
       tzcnt    edx, edx
       add      eax, edx
       jmp      SHORT G_M000_IG27

G_M000_IG19:                ;; offset=018BH
       jmp      SHORT G_M000_IG27

G_M000_IG20:                ;; offset=018DH
       inc      eax
       jmp      SHORT G_M000_IG27

G_M000_IG21:                ;; offset=0191H
       add      eax, 2
       jmp      SHORT G_M000_IG27

G_M000_IG22:                ;; offset=0196H
       add      eax, 3
       jmp      SHORT G_M000_IG27

G_M000_IG23:                ;; offset=019BH
       add      eax, 4
       jmp      SHORT G_M000_IG27

G_M000_IG24:                ;; offset=01A0H
       add      eax, 5
       jmp      SHORT G_M000_IG27

G_M000_IG25:                ;; offset=01A5H
       add      eax, 6
       jmp      SHORT G_M000_IG27

G_M000_IG26:                ;; offset=01AAH
       add      eax, 7

G_M000_IG27:                ;; offset=01ADH
       vzeroupper
       add      rsp, 40
       ret

G_M000_IG28:                ;; offset=01B5H
       call     [System.SpanHelpers:ThrowMustBeNullTerminatedString()]
       int3

; Total bytes of code 444

@ghost
Copy link

ghost commented May 5, 2023

Tagging subscribers to this area: @dotnet/area-system-memory
See info in area-owners.md if you want to be subscribed.

Issue Details

Contributes to #84421 (removes a couple of methods jitted during Hello World start).

I don't see a good reason for these to have [AO], R2R'd versions look good enough to me (SSE based):

R2R codegen:
; Assembly listing for method System.SpanHelpers:IndexOfNullCharacter(ulong):int
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; ReadyToRun compilation
; optimized code
; rsp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 3 single block inlinees; 0 inlinees without PGO data

G_M000_IG01:                ;; offset=0000H
       sub      rsp, 40
       vzeroupper

G_M000_IG02:                ;; offset=0007H
       xor      eax, eax
       mov      edx, 0x7FFFFFFF
       test     cl, 1
       jne      SHORT G_M000_IG04

G_M000_IG03:                ;; offset=0013H
       mov      edx, ecx
       neg      edx
       mov      r8d, edx
       shr      r8d, 31
       add      edx, r8d
       sar      edx, 1
       and      rdx, 7

G_M000_IG04:                ;; offset=0027H
       cmp      rdx, 4
       jl       SHORT G_M000_IG06

G_M000_IG05:                ;; offset=002DH
       cmp      word  ptr [rcx+2*rax], 0
       je       G_M000_IG21
       cmp      word  ptr [rcx+2*rax+02H], 0
       je       G_M000_IG20
       cmp      word  ptr [rcx+2*rax+04H], 0
       je       G_M000_IG19
       cmp      word  ptr [rcx+2*rax+06H], 0
       je       G_M000_IG18
       add      rax, 4
       add      rdx, -4
       cmp      rdx, 4
       jge      SHORT G_M000_IG05

G_M000_IG06:                ;; offset=006AH
       test     rdx, rdx
       jle      SHORT G_M000_IG08

G_M000_IG07:                ;; offset=006FH
       cmp      word  ptr [rcx+2*rax], 0
       je       G_M000_IG21
       inc      rax
       dec      rdx
       test     rdx, rdx
       jg       SHORT G_M000_IG07

G_M000_IG08:                ;; offset=0085H
       cmp      rax, 0x7FFFFFFF
       jge      G_M000_IG22
       lea      rdx, [rcx+2*rax]
       test     dl, 31
       je       SHORT G_M000_IG11

G_M000_IG09:                ;; offset=009AH
       vxorps   xmm0, xmm0, xmm0
       vpcmpeqw xmm0, xmm0, xmmword ptr [rcx+2*rax]
       vpmovmskb edx, xmm0
       test     edx, edx
       jne      SHORT G_M000_IG10
       add      rax, 8
       jmp      SHORT G_M000_IG11

G_M000_IG10:                ;; offset=00B1H
       xor      ecx, ecx
       tzcnt    ecx, edx
       shr      ecx, 1
       mov      edx, ecx
       add      eax, edx
       jmp      G_M000_IG21

G_M000_IG11:                ;; offset=00C2H
       mov      rdx, rax
       neg      rdx
       add      rdx, 0x7FFFFFFF
       and      rdx, -16
       jle      SHORT G_M000_IG13

G_M000_IG12:                ;; offset=00D5H
       vxorps   ymm0, ymm0, ymm0
       vpcmpeqw ymm0, ymm0, ymmword ptr [rcx+2*rax]
       vpmovmskb r8d, ymm0
       test     r8d, r8d
       jne      SHORT G_M000_IG15
       add      rax, 16
       add      rdx, -16
       test     rdx, rdx
       jg       SHORT G_M000_IG12

G_M000_IG13:                ;; offset=00F4H
       mov      r8, rax
       neg      r8
       add      r8, 0x7FFFFFFF
       and      r8, -8
       jle      SHORT G_M000_IG17

G_M000_IG14:                ;; offset=0107H
       vxorps   xmm0, xmm0, xmm0
       vpcmpeqw xmm0, xmm0, xmmword ptr [rcx+2*rax]
       vpmovmskb edx, xmm0
       test     edx, edx
       jne      SHORT G_M000_IG16
       add      rax, 8
       jmp      SHORT G_M000_IG17

G_M000_IG15:                ;; offset=011EH
       xor      edx, edx
       tzcnt    edx, r8d
       shr      edx, 1
       mov      ecx, edx
       add      eax, ecx
       jmp      SHORT G_M000_IG21

G_M000_IG16:                ;; offset=012DH
       tzcnt    edx, edx
       shr      edx, 1
       add      eax, edx
       jmp      SHORT G_M000_IG21

G_M000_IG17:                ;; offset=0137H
       cmp      rax, 0x7FFFFFFF
       jge      SHORT G_M000_IG22
       mov      rdx, rax
       neg      rdx
       add      rdx, 0x7FFFFFFF
       jmp      G_M000_IG04

G_M000_IG18:                ;; offset=0151H
       add      eax, 3
       jmp      SHORT G_M000_IG21

G_M000_IG19:                ;; offset=0156H
       add      eax, 2
       jmp      SHORT G_M000_IG21

G_M000_IG20:                ;; offset=015BH
       inc      eax

G_M000_IG21:                ;; offset=015DH
       vzeroupper
       add      rsp, 40
       ret

G_M000_IG22:                ;; offset=0165H
       call     [System.SpanHelpers:ThrowMustBeNullTerminatedString()]
       int3

; Total bytes of code 364



; Assembly listing for method System.SpanHelpers:IndexOfNullByte(ulong):int
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; ReadyToRun compilation
; optimized code
; rsp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 3 single block inlinees; 0 inlinees without PGO data

G_M000_IG01:                ;; offset=0000H
       sub      rsp, 40
       vzeroupper

G_M000_IG02:                ;; offset=0007H
       xor      eax, eax
       mov      edx, ecx
       and      edx, 15
       neg      edx
       add      edx, 16
       and      edx, 15

G_M000_IG03:                ;; offset=0016H
       cmp      rdx, 8
       jb       SHORT G_M000_IG05

G_M000_IG04:                ;; offset=001CH
       add      rdx, -8
       cmp      byte  ptr [rcx+rax], 0
       je       G_M000_IG19
       cmp      byte  ptr [rcx+rax+01H], 0
       je       G_M000_IG20
       cmp      byte  ptr [rcx+rax+02H], 0
       je       G_M000_IG21
       cmp      byte  ptr [rcx+rax+03H], 0
       je       G_M000_IG22
       cmp      byte  ptr [rcx+rax+04H], 0
       je       G_M000_IG23
       cmp      byte  ptr [rcx+rax+05H], 0
       je       G_M000_IG24
       cmp      byte  ptr [rcx+rax+06H], 0
       je       G_M000_IG25
       cmp      byte  ptr [rcx+rax+07H], 0
       je       G_M000_IG26
       add      rax, 8
       cmp      rdx, 8
       jae      SHORT G_M000_IG04

G_M000_IG05:                ;; offset=0081H
       cmp      rdx, 4
       jb       SHORT G_M000_IG07

G_M000_IG06:                ;; offset=0087H
       add      rdx, -4
       cmp      byte  ptr [rcx+rax], 0
       je       G_M000_IG19
       cmp      byte  ptr [rcx+rax+01H], 0
       je       G_M000_IG20
       cmp      byte  ptr [rcx+rax+02H], 0
       je       G_M000_IG21
       cmp      byte  ptr [rcx+rax+03H], 0
       je       G_M000_IG22
       add      rax, 4

G_M000_IG07:                ;; offset=00BAH
       test     rdx, rdx
       je       SHORT G_M000_IG09

G_M000_IG08:                ;; offset=00BFH
       dec      rdx
       cmp      byte  ptr [rcx+rax], 0
       je       G_M000_IG19
       inc      rax
       test     rdx, rdx
       jne      SHORT G_M000_IG08

G_M000_IG09:                ;; offset=00D4H
       cmp      rax, 0x7FFFFFFF
       jae      G_M000_IG28
       mov      edx, ecx
       add      rdx, rax
       test     dl, 31
       je       SHORT G_M000_IG11

G_M000_IG10:                ;; offset=00EAH
       vxorps   xmm0, xmm0, xmm0
       vpcmpeqb xmm0, xmm0, xmmword ptr [rcx+rax]
       vpmovmskb edx, xmm0
       test     edx, edx
       jne      SHORT G_M000_IG16
       add      rax, 16

G_M000_IG11:                ;; offset=00FFH
       mov      edx, eax
       neg      edx
       add      edx, 0x7FFFFFFF
       and      edx, -32
       cmp      rdx, rax
       jbe      SHORT G_M000_IG13

G_M000_IG12:                ;; offset=0111H
       vxorps   ymm0, ymm0, ymm0
       vpcmpeqb ymm0, ymm0, ymmword ptr [rcx+rax]
       vpmovmskb r8d, ymm0
       test     r8d, r8d
       jne      SHORT G_M000_IG17
       add      rax, 32
       cmp      rdx, rax
       ja       SHORT G_M000_IG12

G_M000_IG13:                ;; offset=012CH
       mov      edx, eax
       neg      edx
       add      edx, 0x7FFFFFFF
       and      edx, -16
       mov      r8d, edx
       cmp      r8, rax
       jbe      SHORT G_M000_IG15

G_M000_IG14:                ;; offset=0141H
       vxorps   xmm0, xmm0, xmm0
       vpcmpeqb xmm0, xmm0, xmmword ptr [rcx+rax]
       vpmovmskb edx, xmm0
       test     edx, edx
       jne      SHORT G_M000_IG18
       add      rax, 16

G_M000_IG15:                ;; offset=0156H
       cmp      rax, 0x7FFFFFFF
       jae      SHORT G_M000_IG28
       mov      rdx, rax
       neg      rdx
       add      rdx, 0x7FFFFFFF
       jmp      G_M000_IG03

G_M000_IG16:                ;; offset=0170H
       tzcnt    edx, edx
       add      eax, edx
       jmp      SHORT G_M000_IG27

G_M000_IG17:                ;; offset=0178H
       xor      edx, edx
       tzcnt    edx, r8d
       add      eax, edx
       jmp      SHORT G_M000_IG27

G_M000_IG18:                ;; offset=0183H
       tzcnt    edx, edx
       add      eax, edx
       jmp      SHORT G_M000_IG27

G_M000_IG19:                ;; offset=018BH
       jmp      SHORT G_M000_IG27

G_M000_IG20:                ;; offset=018DH
       inc      eax
       jmp      SHORT G_M000_IG27

G_M000_IG21:                ;; offset=0191H
       add      eax, 2
       jmp      SHORT G_M000_IG27

G_M000_IG22:                ;; offset=0196H
       add      eax, 3
       jmp      SHORT G_M000_IG27

G_M000_IG23:                ;; offset=019BH
       add      eax, 4
       jmp      SHORT G_M000_IG27

G_M000_IG24:                ;; offset=01A0H
       add      eax, 5
       jmp      SHORT G_M000_IG27

G_M000_IG25:                ;; offset=01A5H
       add      eax, 6
       jmp      SHORT G_M000_IG27

G_M000_IG26:                ;; offset=01AAH
       add      eax, 7

G_M000_IG27:                ;; offset=01ADH
       vzeroupper
       add      rsp, 40
       ret

G_M000_IG28:                ;; offset=01B5H
       call     [System.SpanHelpers:ThrowMustBeNullTerminatedString()]
       int3

; Total bytes of code 444
Author: EgorBo
Assignees: EgorBo
Labels:

area-System.Memory

Milestone: -

@EgorBo
Copy link
Member Author

EgorBo commented May 5, 2023

Also removed [AO] from SequenceCompareTo(ref byte first, int firstLength, ref byte second, int secondLength):

; Assembly listing for method System.SpanHelpers:SequenceCompareTo(byref,int,byref,int):int
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; ReadyToRun compilation
; optimized code
; rsp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 7 single block inlinees; 0 inlinees without PGO data

G_M000_IG01:                ;; offset=0000H
       push     rsi
       vzeroupper

G_M000_IG02:                ;; offset=0004H
       cmp      rcx, r8
       je       G_M000_IG16

G_M000_IG03:                ;; offset=000DH
       cmp      edx, r9d
       mov      eax, r9d
       cmovb    eax, edx
       mov      r10d, eax
       xor      r11d, r11d
       mov      rax, r10
       cmp      rax, 32
       jb       SHORT G_M000_IG08
       add      rax, -32
       je       SHORT G_M000_IG05

G_M000_IG04:                ;; offset=002BH
       vmovups  ymm0, ymmword ptr [rcx+r11]
       vpcmpeqb ymm0, ymm0, ymmword ptr [r8+r11]
       vpmovmskb r10d, ymm0
       cmp      r10d, -1
       jne      SHORT G_M000_IG06
       add      r11, 32
       cmp      rax, r11
       ja       SHORT G_M000_IG04

G_M000_IG05:                ;; offset=004AH
       mov      r11, rax
       vmovups  ymm0, ymmword ptr [rcx+r11]
       vpcmpeqb ymm0, ymm0, ymmword ptr [r8+r11]
       vpmovmskb r10d, ymm0
       cmp      r10d, -1
       je       G_M000_IG16

G_M000_IG06:                ;; offset=0067H
       mov      eax, r10d
       not      eax
       tzcnt    eax, eax
       add      rax, r11
       mov      r11, rax
       movzx    rax, byte  ptr [rcx+r11]
       movzx    rcx, byte  ptr [r8+r11]
       sub      eax, ecx

G_M000_IG07:                ;; offset=0082H
       vzeroupper
       pop      rsi
       ret

G_M000_IG08:                ;; offset=0087H
       cmp      r10, 16
       jb       SHORT G_M000_IG12
       add      rax, -16
       je       SHORT G_M000_IG09
       vmovups  xmm0, xmmword ptr [rcx]
       vpcmpeqb xmm0, xmm0, xmmword ptr [r8]
       vpmovmskb r10d, xmm0
       cmp      r10d, 0xFFFF
       jne      SHORT G_M000_IG10

G_M000_IG09:                ;; offset=00A9H
       mov      r11, rax
       vmovups  xmm0, xmmword ptr [rcx+r11]
       vpcmpeqb xmm0, xmm0, xmmword ptr [r8+r11]
       vpmovmskb r10d, xmm0
       cmp      r10d, 0xFFFF
       je       SHORT G_M000_IG16

G_M000_IG10:                ;; offset=00C5H
       mov      eax, r10d
       not      eax
       tzcnt    eax, eax
       add      rax, r11
       mov      r11, rax
       movzx    rax, byte  ptr [rcx+r11]
       movzx    rcx, byte  ptr [r8+r11]
       sub      eax, ecx

G_M000_IG11:                ;; offset=00E0H
       vzeroupper
       pop      rsi
       ret

G_M000_IG12:                ;; offset=00E5H
       cmp      r10, 8
       jbe      SHORT G_M000_IG14
       lea      rax, [r10-08H]
       test     rax, rax
       je       SHORT G_M000_IG14

G_M000_IG13:                ;; offset=00F4H
       mov      rsi, qword ptr [rcx+r11]
       cmp      rsi, qword ptr [r8+r11]
       jne      SHORT G_M000_IG14
       add      r11, 8
       cmp      rax, r11
       ja       SHORT G_M000_IG13

G_M000_IG14:                ;; offset=0107H
       cmp      r10, r11
       jbe      SHORT G_M000_IG16

G_M000_IG15:                ;; offset=010CH
       movzx    rax, byte  ptr [rcx+r11]
       movzx    rsi, byte  ptr [r8+r11]
       sub      eax, esi
       jne      SHORT G_M000_IG18
       inc      r11
       cmp      r10, r11
       ja       SHORT G_M000_IG15

G_M000_IG16:                ;; offset=0122H
       mov      eax, edx
       sub      eax, r9d

G_M000_IG17:                ;; offset=0127H
       vzeroupper
       pop      rsi
       ret

G_M000_IG18:                ;; offset=012CH
       vzeroupper
       pop      rsi
       ret

; Total bytes of code 305

@stephentoub
Copy link
Member

Also contributes to #71261

@@ -422,7 +422,6 @@ public static unsafe int SequenceCompareTo(ref char first, int firstLength, ref

// IndexOfNullCharacter processes memory in aligned chunks, and thus it won't crash even if it accesses memory beyond the null terminator.
// This behavior is an implementation detail of the runtime and callers outside System.Private.CoreLib must not depend on it.
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are some more SpanHelpers methods marked with AggressiveOptimization. Delete it on all of them?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are some more SpanHelpers methods marked with AggressiveOptimization. Delete it on all of them?

There are a few cases where, because of SVM, we get non-optimal codegen in R2R (similar to #84421 (comment)), so I didn't want to regress SpanHelpers.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

E.g. System.SpanHelpers:LastIndexOfValueType[short,System.SpanHelpers+DontNegate`1[short]](byref,short,int)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So only *IndexOfAnyValue* (with generic math) are left with AggressiveOptimization in SpanHelpers

Copy link
Member

@jkotas jkotas May 5, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Both R2R and Tier0 codegen for these complicated generic constructs tend to be pretty bad. I would not worry about it - we have the same problem in a number of other places.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

E.g. if I remove [AO] from LastIndexOfValueType here is what I get:

; Assembly listing for method System.SpanHelpers:LastIndexOfValueType[short](byref,short,int):int
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; ReadyToRun compilation
; optimized code
; rsp based frame
; fully interruptible
; No PGO data

G_M000_IG01:                ;; offset=0000H

G_M000_IG02:                ;; offset=0000H
       movsx    rdx, dx
       lea      rax, [(reloc 0x435488)]

G_M000_IG03:                ;; offset=000BH
       tail.jmp [rax]System.SpanHelpers:LastIndexOfValueType[short,System.SpanHelpers+DontNegate`1[short]](byref,short,int):int

; Total bytes of code 14

and that nested LastIndexOfValueType is JIT-compiled, so presumably we'll get a slow Tier0 version instead of having the AggressiveOptimization one from the start - if that is OK, I can remove it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is ok.

Copy link
Member Author

@EgorBo EgorBo May 5, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, removed them. Thus, we only have 3 uses of AO left in the corelib: one in AsyncTaskMethodBuilder, which says that AO helps it avoid allocations in Tier0, and two in CastHelpers, which have to be there since the VM special-cases them to be direct calls.

@EgorBo EgorBo merged commit 9abc5a9 into dotnet:main May 5, 2023
@EgorBo EgorBo deleted the remove-ao-spanhelpers branch May 5, 2023 16:21
@jkotas jkotas added the tenet-performance Performance related issue label May 5, 2023
@kunalspathak
Copy link
Member

@EgorBo - do you know why AO was the cause of jitting these methods on startup for x64 and not for arm64?

@stephentoub
Copy link
Member

do you know why AO was the cause of jitting these methods on startup for x64 and not for arm64?

I think the comparison was apples vs oranges... they were running different code because EventSource startup gunk was being invoked in one case and not the other.

@EgorBo
Copy link
Member Author

EgorBo commented May 9, 2023

@EgorBo - do you know why AO was the cause of jitting these methods on startup for x64 and not for arm64?

I'd say it is #85791 (comment)
I also saw that from stack traces in JIT's compileMethod

@kunalspathak
Copy link
Member

Got it. Yes @TIHan also confirmed that in #85791 (comment).

@ghost ghost locked as resolved and limited conversation to collaborators Jun 8, 2023
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants