Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Casting via generic math doesn't always inline in R2R #84421

Closed
xtqqczze opened this issue Apr 6, 2023 · 12 comments
Closed

Casting via generic math doesn't always inline in R2R #84421

xtqqczze opened this issue Apr 6, 2023 · 12 comments
Labels
Milestone

Comments

@xtqqczze
Copy link
Contributor

xtqqczze commented Apr 6, 2023

Related: #78648.

uint M0() => uint.CreateTruncating(42u);
uint M1() => uint.CreateTruncating((byte)42);
uint M2() => uint.CreateTruncating(42UL);
uint M3() => uint.CreateTruncating(42);
uint M4() => uint.CreateTruncating((sbyte)42);
int M5() => int.CreateTruncating(42);
int M6() => int.CreateTruncating((sbyte)42);
int M7() => int.CreateTruncating(42L);
int M8() => int.CreateTruncating(42u);
int M9() => int.CreateTruncating((byte)42);
// crossgen2 8.0.0-preview.4.23206.99+18e2c5fd9e2239a8b06fe49dbb6492d40f5e5e19

C:M0():uint:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
						;; size=0 bbWeight=1 PerfScore 0.00
       mov      eax, 42
						;; size=5 bbWeight=1 PerfScore 0.25
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00

C:M1():uint:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
						;; size=0 bbWeight=1 PerfScore 0.00
       lea      rax, [(reloc)]
       mov      edi, 42
						;; size=12 bbWeight=1 PerfScore 0.75
       tail.jmp [rax]System.UInt32:CreateTruncating[ubyte](ubyte):uint
						;; size=3 bbWeight=1 PerfScore 2.00

C:M2():uint:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
						;; size=0 bbWeight=1 PerfScore 0.00
       lea      rax, [(reloc)]
       mov      edi, 42
						;; size=12 bbWeight=1 PerfScore 0.75
       tail.jmp [rax]System.UInt32:CreateTruncating[ulong](ulong):uint
						;; size=3 bbWeight=1 PerfScore 2.00

C:M3():uint:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
						;; size=0 bbWeight=1 PerfScore 0.00
       lea      rax, [(reloc)]
       mov      edi, 42
						;; size=12 bbWeight=1 PerfScore 0.75
       tail.jmp [rax]System.UInt32:CreateTruncating[int](int):uint
						;; size=3 bbWeight=1 PerfScore 2.00

C:M4():uint:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
						;; size=0 bbWeight=1 PerfScore 0.00
       lea      rax, [(reloc)]
       mov      edi, 42
						;; size=12 bbWeight=1 PerfScore 0.75
       tail.jmp [rax]System.UInt32:CreateTruncating[byte](byte):uint
						;; size=3 bbWeight=1 PerfScore 2.00

C:M5():int:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
						;; size=0 bbWeight=1 PerfScore 0.00
       mov      eax, 42
						;; size=5 bbWeight=1 PerfScore 0.25
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00

C:M6():int:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
						;; size=0 bbWeight=1 PerfScore 0.00
       lea      rax, [(reloc)]
       mov      edi, 42
						;; size=12 bbWeight=1 PerfScore 0.75
       tail.jmp [rax]System.Int32:CreateTruncating[byte](byte):int
						;; size=3 bbWeight=1 PerfScore 2.00

C:M7():int:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
						;; size=0 bbWeight=1 PerfScore 0.00
       lea      rax, [(reloc)]
       mov      edi, 42
						;; size=12 bbWeight=1 PerfScore 0.75
       tail.jmp [rax]System.Int32:CreateTruncating[long](long):int
						;; size=3 bbWeight=1 PerfScore 2.00

C:M8():int:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
						;; size=0 bbWeight=1 PerfScore 0.00
       lea      rax, [(reloc)]
       mov      edi, 42
						;; size=12 bbWeight=1 PerfScore 0.75
       tail.jmp [rax]System.Int32:CreateTruncating[uint](uint):int
						;; size=3 bbWeight=1 PerfScore 2.00

C:M9():int:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
						;; size=0 bbWeight=1 PerfScore 0.00
       lea      rax, [(reloc)]
       mov      edi, 42
						;; size=12 bbWeight=1 PerfScore 0.75
       tail.jmp [rax]System.Int32:CreateTruncating[ubyte](ubyte):int
						;; size=3 bbWeight=1 PerfScore 2.00

https://csharp.godbolt.org/z/K6xM8hP6P

@dotnet-issue-labeler dotnet-issue-labeler bot added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Apr 6, 2023
@ghost ghost added the untriaged New issue has not been triaged by the area owner label Apr 6, 2023
@ghost
Copy link

ghost commented Apr 6, 2023

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch, @kunalspathak
See info in area-owners.md if you want to be subscribed.

Issue Details

See #78648.

cc: @EgorBo

https://csharp.godbolt.org/z/K6xM8hP6P

// crossgen2 8.0.0-preview.4.23206.99+18e2c5fd9e2239a8b06fe49dbb6492d40f5e5e19

C:M0():uint:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
						;; size=0 bbWeight=1 PerfScore 0.00
       mov      eax, 42
						;; size=5 bbWeight=1 PerfScore 0.25
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00

C:M1():uint:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
						;; size=0 bbWeight=1 PerfScore 0.00
       lea      rax, [(reloc)]
       mov      edi, 42
						;; size=12 bbWeight=1 PerfScore 0.75
       tail.jmp [rax]System.UInt32:CreateTruncating[ubyte](ubyte):uint
						;; size=3 bbWeight=1 PerfScore 2.00

C:M2():uint:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
						;; size=0 bbWeight=1 PerfScore 0.00
       lea      rax, [(reloc)]
       mov      edi, 42
						;; size=12 bbWeight=1 PerfScore 0.75
       tail.jmp [rax]System.UInt32:CreateTruncating[ulong](ulong):uint
						;; size=3 bbWeight=1 PerfScore 2.00

C:M3():uint:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
						;; size=0 bbWeight=1 PerfScore 0.00
       lea      rax, [(reloc)]
       mov      edi, 42
						;; size=12 bbWeight=1 PerfScore 0.75
       tail.jmp [rax]System.UInt32:CreateTruncating[int](int):uint
						;; size=3 bbWeight=1 PerfScore 2.00

C:M4():uint:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
						;; size=0 bbWeight=1 PerfScore 0.00
       lea      rax, [(reloc)]
       mov      edi, 42
						;; size=12 bbWeight=1 PerfScore 0.75
       tail.jmp [rax]System.UInt32:CreateTruncating[byte](byte):uint
						;; size=3 bbWeight=1 PerfScore 2.00

C:M5():int:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
						;; size=0 bbWeight=1 PerfScore 0.00
       mov      eax, 42
						;; size=5 bbWeight=1 PerfScore 0.25
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00

C:M6():int:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
						;; size=0 bbWeight=1 PerfScore 0.00
       lea      rax, [(reloc)]
       mov      edi, 42
						;; size=12 bbWeight=1 PerfScore 0.75
       tail.jmp [rax]System.Int32:CreateTruncating[byte](byte):int
						;; size=3 bbWeight=1 PerfScore 2.00

C:M7():int:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
						;; size=0 bbWeight=1 PerfScore 0.00
       lea      rax, [(reloc)]
       mov      edi, 42
						;; size=12 bbWeight=1 PerfScore 0.75
       tail.jmp [rax]System.Int32:CreateTruncating[long](long):int
						;; size=3 bbWeight=1 PerfScore 2.00

C:M8():int:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
						;; size=0 bbWeight=1 PerfScore 0.00
       lea      rax, [(reloc)]
       mov      edi, 42
						;; size=12 bbWeight=1 PerfScore 0.75
       tail.jmp [rax]System.Int32:CreateTruncating[uint](uint):int
						;; size=3 bbWeight=1 PerfScore 2.00

C:M9():int:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
						;; size=0 bbWeight=1 PerfScore 0.00
       lea      rax, [(reloc)]
       mov      edi, 42
						;; size=12 bbWeight=1 PerfScore 0.75
       tail.jmp [rax]System.Int32:CreateTruncating[ubyte](ubyte):int
						;; size=3 bbWeight=1 PerfScore 2.00
Author: xtqqczze
Assignees: -
Labels:

area-CodeGen-coreclr

Milestone: -

@JulieLeeMSFT JulieLeeMSFT removed the untriaged New issue has not been triaged by the area owner label Apr 7, 2023
@JulieLeeMSFT JulieLeeMSFT added this to the Future milestone Apr 7, 2023
@JulieLeeMSFT
Copy link
Member

We will push this to Future because we will not have time to work on this in .NET 8.

@EgorBo
Copy link
Member

EgorBo commented Apr 7, 2023

@xtqqczze it emits the expected codegen for JIT and NativeAOT, so this seems to be R2R-specific. From what I see, the R2R runtime throws

Internal.JitInterface.RequiresRuntimeJitException: SVM
   at Internal.JitInterface.CorInfoImpl.getCallInfo(CORINFO_RESOLVED_TOKEN& pResolvedToken, CORINFO_RESOLVED_TOKEN* pConstrainedResolvedToken, CORINFO_METHOD_STRUCT_* callerHandle, CORINFO_CALLINFO_FLAGS flags, CORINFO_CALL_INFO* pResult) in C:\prj\runtime\src\coreclr\tools\aot\ILCompiler.ReadyToRun\JitInterface\CorInfoImpl.ReadyToRun.cs:line 2109
   at Internal.JitInterface.CorInfoImpl._getCallInfo(IntPtr thisHandle, IntPtr* ppException, CORINFO_RESOLVED_TOKEN* pResolvedToken, CORINFO_RESOLVED_TOKEN* pConstrainedResolvedToken, CORINFO_METHOD_STRUCT_* callerHandle, CORINFO_CALLINFO_FLAGS flags, CORINFO_CALL_INFO* pResult) in C:\prj\runtime\src\coreclr\tools\Common\JitInterface\CorInfoImpl_generated.cs:line 2178

for getCallInfo during inlining so JIT gives up on inlining due to error

@stephentoub
Copy link
Member

We will push this to Future because we will not have time to work on this in .NET 8.

seems to be R2R specific

In case it impacts prioritization, we're on a path to make fairly heavy use of this throughout core formatting/parsing logic, as in #84469.

@EgorBo EgorBo modified the milestones: Future, 8.0.0 Apr 8, 2023
@EgorBo
Copy link
Member

EgorBo commented Apr 12, 2023

@dotnet/crossgen-contrib any idea what we can do here for crossgen, is it simply not supported?

uint M1() => uint.CreateTruncating((byte)42);

image

This exception leads to this codegen for R2R:

C:M1():uint:this:
; Emitting BLENDED_CODE for X64 CPU with SSE2 - Unix
       lea      rax, [(reloc)]
       mov      edi, 42
       tail.jmp [rax]System.UInt32:CreateTruncating[ubyte](ubyte):uint

because JIT caught the exception and decided that it can't inline it

@trylek
Copy link
Member

trylek commented Apr 12, 2023

Well, Crossgen2 doesn't yet support static virtual method resolution, that was planned for .NET 7 but it got cut as it turned out to be quite challenging and we didn't get to it early enough in the release cycle. If it becomes more of a perf problem now we can discuss its prioritization among .NET 8 Crossgen2 work. If the method is not a static virtual then that's just a bug in the check that should be easy to fix.

@EgorBo
Copy link
Member

EgorBo commented Apr 12, 2023

Yes it's static virtual TSelf CreateTruncating.

If it becomes more of a perf problem

I assume it could be e.g. for customers who disable tiering since we now use more and more of these

@trylek
Copy link
Member

trylek commented Apr 12, 2023

OK, thanks for pointing it out, I'll discuss it at today's Crossgen2 weekly sync. Maybe we could start supporting at least some cases; IIRC the main problem is precompilation of canonical methods where the resolution depends on the actual type passed at runtime. In other words, there's no way to resolve the call at compile time.

@EgorBo EgorBo removed their assignment Apr 17, 2023
@EgorBo EgorBo added area-crossgen2-coreclr and removed area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI labels Apr 17, 2023
@trylek trylek mentioned this issue May 3, 2023
46 tasks
@xtqqczze xtqqczze changed the title Casting via generic math still doesn't always inline Casting via generic math doesn't always inline in R2R May 5, 2023
@jkotas jkotas added the tenet-performance Performance related issue label May 5, 2023
@mangod9
Copy link
Member

mangod9 commented Aug 4, 2023

@EgorBo @trylek assume this will be closed once SVM support in crossgen2 is enabled?

@EgorBo
Copy link
Member

EgorBo commented Aug 4, 2023

@EgorBo @trylek assume this will be closed once SVM support in crossgen2 is enabled?

Correct

@trylek
Copy link
Member

trylek commented Aug 8, 2023

I believe this should be fixed with #87438, closing. There are still certain cases that cannot be resolved at compile time, some of them can be fixed in the future by improving the JIT interface. In particular, embedGenericHandle doesn't support passing the constrained type that would be needed if SVM lookup fails at compile time (the SVM lookup itself would still need runtime work but the method calling the SVM could be precompiled); similarly, CanInline doesn't support type constraint information so that, if there's an implementation of the SVM on the interface that defines it, we incorrectly decide we can inline it despite the fact that the constrained lookup could find a different implementation on derived interfaces or types.

@trylek trylek closed this as completed Aug 8, 2023
@xtqqczze
Copy link
Contributor Author

xtqqczze commented Aug 9, 2023

Unfortunately, with #87438, the codegen still has many missed inlining opportunities, especially for signed integers.

// crossgen2 8.0.0-rc.1.23409.99+40b39ff7df1dc2388a5865c9ad151dce88fa007d
// Emitting BLENDED_CODE for generic X64 - Unix

C:M0():uint:this (FullOpts):
 
       mov      eax, 42
 
       ret      
 
C:M1():uint:this (FullOpts):
       push     rbp
       sub      rsp, 16
       lea      rbp, [rsp+0x10]
 
       mov      dword ptr [rbp-0x08], 42
       mov      eax, dword ptr [rbp-0x08]
 
       add      rsp, 16
       pop      rbp
       ret      
 

C:M2():uint:this (FullOpts):
       push     rbp
       sub      rsp, 16
       lea      rbp, [rsp+0x10]
 
       mov      dword ptr [rbp-0x08], 42
       mov      eax, dword ptr [rbp-0x08]
 
       add      rsp, 16
       pop      rbp
       ret      
 
C:M3():uint:this (FullOpts):
       push     rbp
       sub      rsp, 16
       lea      rbp, [rsp+0x10]
 
       xor      esi, esi
       mov      dword ptr [rbp-0x08], esi
       lea      rsi, [rbp-0x08]
       mov      edi, 42
       call     [System.Int32:System.Numerics.INumberBase<System.Int32>.TryConvertToTruncating[uint](int,byref):bool]
       test     eax, eax
       je       SHORT G_M000_IG04
       mov      eax, dword ptr [rbp-0x08]
 
       add      rsp, 16
       pop      rbp
       ret      
 
G_M000_IG04:
       call     [System.ThrowHelper:ThrowNotSupportedException()]
       int3     
 
C:M4():uint:this (FullOpts):
       push     rbp
       sub      rsp, 16
       lea      rbp, [rsp+0x10]
 
       xor      esi, esi
       mov      dword ptr [rbp-0x08], esi
       lea      rsi, [rbp-0x08]
       mov      edi, 42
       call     [System.SByte:System.Numerics.INumberBase<System.SByte>.TryConvertToTruncating[uint](byte,byref):bool]
       test     eax, eax
       je       SHORT G_M000_IG04
       mov      eax, dword ptr [rbp-0x08]
 
       add      rsp, 16
       pop      rbp
       ret      
 
G_M000_IG04:
       call     [System.ThrowHelper:ThrowNotSupportedException()]
       int3     
 
C:M5():int:this (FullOpts):
 
       mov      eax, 42
 
       ret      
 
C:M6():int:this (FullOpts):
       push     rbp
       sub      rsp, 16
       lea      rbp, [rsp+0x10]
 
       lea      rsi, [rbp-0x08]
       mov      edi, 42
       call     [System.Int32:TryConvertFromTruncating[byte](byte,byref):bool]
       test     eax, eax
       jne      SHORT G_M000_IG04
 
       lea      rsi, [rbp-0x08]
       mov      edi, 42
       call     [System.SByte:System.Numerics.INumberBase<System.SByte>.TryConvertToTruncating[int](byte,byref):bool]
       test     eax, eax
       je       SHORT G_M000_IG06
 
G_M000_IG04:
       mov      eax, dword ptr [rbp-0x08]
 
       add      rsp, 16
       pop      rbp
       ret      
 
G_M000_IG06:
       call     [System.ThrowHelper:ThrowNotSupportedException()]
       int3     
 
C:M7():int:this (FullOpts):
       push     rbp
       sub      rsp, 16
       lea      rbp, [rsp+0x10]
 
       lea      rsi, [rbp-0x08]
       mov      edi, 42
       call     [System.Int32:TryConvertFromTruncating[long](long,byref):bool]
       test     eax, eax
       jne      SHORT G_M000_IG04
 
       lea      rsi, [rbp-0x08]
       mov      edi, 42
       call     [System.Int64:System.Numerics.INumberBase<System.Int64>.TryConvertToTruncating[int](long,byref):bool]
       test     eax, eax
       je       SHORT G_M000_IG06
 
G_M000_IG04:
       mov      eax, dword ptr [rbp-0x08]
 
       add      rsp, 16
       pop      rbp
       ret      
 
G_M000_IG06:
       call     [System.ThrowHelper:ThrowNotSupportedException()]
       int3     
 
C:M8():int:this (FullOpts):
       push     rbp
       sub      rsp, 16
       lea      rbp, [rsp+0x10]
 
       lea      rsi, [rbp-0x08]
       mov      edi, 42
       call     [System.Int32:TryConvertFromTruncating[uint](uint,byref):bool]
       test     eax, eax
       jne      SHORT G_M000_IG04
 
       lea      rsi, [rbp-0x08]
       mov      edi, 42
       call     [System.UInt32:System.Numerics.INumberBase<System.UInt32>.TryConvertToTruncating[int](uint,byref):bool]
       test     eax, eax
       je       SHORT G_M000_IG06
 
G_M000_IG04:
       mov      eax, dword ptr [rbp-0x08]
 
       add      rsp, 16
       pop      rbp
       ret      
 
G_M000_IG06:
       call     [System.ThrowHelper:ThrowNotSupportedException()]
       int3     
 
C:M9():int:this (FullOpts):
       push     rbp
       sub      rsp, 16
       lea      rbp, [rsp+0x10]
 
       lea      rsi, [rbp-0x08]
       mov      edi, 42
       call     [System.Int32:TryConvertFromTruncating[ubyte](ubyte,byref):bool]
       test     eax, eax
       jne      SHORT G_M000_IG04
 
       lea      rsi, [rbp-0x08]
       mov      edi, 42
       call     [System.Byte:System.Numerics.INumberBase<System.Byte>.TryConvertToTruncating[int](ubyte,byref):bool]
       test     eax, eax
       je       SHORT G_M000_IG06
 
G_M000_IG04:
       mov      eax, dword ptr [rbp-0x08]
 
       add      rsp, 16
       pop      rbp
       ret      
 
G_M000_IG06:
       call     [System.ThrowHelper:ThrowNotSupportedException()]
       int3     

@ghost ghost locked as resolved and limited conversation to collaborators Sep 8, 2023
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
Projects
None yet
Development

No branches or pull requests

7 participants