Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ineffective codegen for or pattern type checking #47920

Open
huoyaoyuan opened this issue Feb 5, 2021 · 17 comments
Open

Ineffective codegen for or pattern type checking #47920

huoyaoyuan opened this issue Feb 5, 2021 · 17 comments
Assignees
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI tenet-performance Performance related issue
Milestone

Comments

@huoyaoyuan
Copy link
Member

huoyaoyuan commented Feb 5, 2021

Description

Examine this code:

using System;
public class C
{
    public bool M0(object o)
    {
        return o is int or uint or long;
    }
    public bool M1(object o)
    {
        return o != null && (o is int or uint or long);
    }
    public bool M2(object o)
    {
        return o != null && (o.GetType() == typeof(int) || o.GetType() == typeof(uint) || o.GetType() == typeof(long));
    }
}

IL Compiled:

.method public hidebysig 
        instance bool M0 (
            object o
        ) cil managed 
    {
        // Method begins at RVA 0x2050
        // Code size 28 (0x1c)
        .maxstack 8

        IL_0000: ldarg.1
        IL_0001: isinst [System.Private.CoreLib]System.Int32
        IL_0006: brtrue.s IL_001a

        IL_0008: ldarg.1
        IL_0009: isinst [System.Private.CoreLib]System.UInt32
        IL_000e: brtrue.s IL_001a

        IL_0010: ldarg.1
        IL_0011: isinst [System.Private.CoreLib]System.Int64
        IL_0016: ldnull
        IL_0017: cgt.un
        IL_0019: ret

        IL_001a: ldc.i4.1
        IL_001b: ret
    } // end of method C::M0

    .method public hidebysig 
        instance bool M1 (
            object o
        ) cil managed 
    {
        // Method begins at RVA 0x206d
        // Code size 33 (0x21)
        .maxstack 8

        IL_0000: ldarg.1
        IL_0001: brfalse.s IL_001f

        IL_0003: ldarg.1
        IL_0004: isinst [System.Private.CoreLib]System.Int32
        IL_0009: brtrue.s IL_001d

        IL_000b: ldarg.1
        IL_000c: isinst [System.Private.CoreLib]System.UInt32
        IL_0011: brtrue.s IL_001d

        IL_0013: ldarg.1
        IL_0014: isinst [System.Private.CoreLib]System.Int64
        IL_0019: ldnull
        IL_001a: cgt.un
        IL_001c: ret

        IL_001d: ldc.i4.1
        IL_001e: ret

        IL_001f: ldc.i4.0
        IL_0020: ret
    } // end of method C::M1

    .method public hidebysig 
        instance bool M2 (
            object o
        ) cil managed 
    {
        // Method begins at RVA 0x2090
        // Code size 75 (0x4b)
        .maxstack 2

        IL_0000: ldarg.1
        IL_0001: brfalse.s IL_0049

        IL_0003: ldarg.1
        IL_0004: callvirt instance class [System.Private.CoreLib]System.Type [System.Private.CoreLib]System.Object::GetType()
        IL_0009: ldtoken [System.Private.CoreLib]System.Int32
        IL_000e: call class [System.Private.CoreLib]System.Type [System.Private.CoreLib]System.Type::GetTypeFromHandle(valuetype [System.Private.CoreLib]System.RuntimeTypeHandle)
        IL_0013: call bool [System.Private.CoreLib]System.Type::op_Equality(class [System.Private.CoreLib]System.Type, class [System.Private.CoreLib]System.Type)
        IL_0018: brtrue.s IL_0047

        IL_001a: ldarg.1
        IL_001b: callvirt instance class [System.Private.CoreLib]System.Type [System.Private.CoreLib]System.Object::GetType()
        IL_0020: ldtoken [System.Private.CoreLib]System.UInt32
        IL_0025: call class [System.Private.CoreLib]System.Type [System.Private.CoreLib]System.Type::GetTypeFromHandle(valuetype [System.Private.CoreLib]System.RuntimeTypeHandle)
        IL_002a: call bool [System.Private.CoreLib]System.Type::op_Equality(class [System.Private.CoreLib]System.Type, class [System.Private.CoreLib]System.Type)
        IL_002f: brtrue.s IL_0047

        IL_0031: ldarg.1
        IL_0032: callvirt instance class [System.Private.CoreLib]System.Type [System.Private.CoreLib]System.Object::GetType()
        IL_0037: ldtoken [System.Private.CoreLib]System.Int64
        IL_003c: call class [System.Private.CoreLib]System.Type [System.Private.CoreLib]System.Type::GetTypeFromHandle(valuetype [System.Private.CoreLib]System.RuntimeTypeHandle)
        IL_0041: call bool [System.Private.CoreLib]System.Type::op_Equality(class [System.Private.CoreLib]System.Type, class [System.Private.CoreLib]System.Type)
        IL_0046: ret

        IL_0047: ldc.i4.1
        IL_0048: ret

        IL_0049: ldc.i4.0
        IL_004a: ret
    } // end of method C::M2

codegen:

C.M0(System.Object)
    L0000: mov rax, rdx
    L0003: test rax, rax
    L0006: je short L0017
    L0008: mov rcx, 0x7ff9125bb258
    L0012: cmp [rax], rcx
    L0015: jne short L001c
    L0017: test rdx, rdx
    L001a: jne short L0058
    L001c: mov rax, rdx
    L001f: test rax, rax
    L0022: je short L0033
    L0024: mov rcx, 0x7ff9125bbc18
    L002e: cmp [rax], rcx
    L0031: jne short L0038
    L0033: test rdx, rdx
    L0036: jne short L0058
    L0038: test rdx, rdx
    L003b: je short L004e
    L003d: mov rax, 0x7ff9125bc5d8
    L0047: cmp [rdx], rax
    L004a: je short L004e
    L004c: xor edx, edx
    L004e: test rdx, rdx
    L0051: setne al
    L0054: movzx eax, al
    L0057: ret
    L0058: mov eax, 1
    L005d: ret

C.M1(System.Object)
    L0000: test rdx, rdx
    L0003: je short L004a
    L0005: mov rax, rdx
    L0008: mov rcx, 0x7ff9125bb258
    L0012: cmp [rax], rcx
    L0015: je short L0044
    L0017: mov rax, rdx
    L001a: mov rcx, 0x7ff9125bbc18
    L0024: cmp [rax], rcx
    L0027: je short L0044
    L0029: mov rax, 0x7ff9125bc5d8
    L0033: cmp [rdx], rax
    L0036: je short L003a
    L0038: xor edx, edx
    L003a: test rdx, rdx
    L003d: setne al
    L0040: movzx eax, al
    L0043: ret
    L0044: mov eax, 1
    L0049: ret
    L004a: xor eax, eax
    L004c: ret

C.M2(System.Object)
    L0000: test rdx, rdx
    L0003: je short L003d
    L0005: mov rax, 0x7ff9125bb258
    L000f: cmp [rdx], rax
    L0012: je short L0037
    L0014: mov rax, 0x7ff9125bbc18
    L001e: cmp [rdx], rax
    L0021: je short L0037
    L0023: mov rax, 0x7ff9125bc5d8
    L002d: cmp [rdx], rax
    L0030: sete al
    L0033: movzx eax, al
    L0036: ret
    L0037: mov eax, 1
    L003c: ret
    L003d: xor eax, eax
    L003f: ret

sharplab

Looking through it:
The pattern matching codegen moves from rdx to rax for every test.
If there's no null check, JIT will generate null test for each condition.

The compiled IL of pattern matching uses ldnull and cgt for the last test. This pattern may confuse the JIT.

Configuration

sharplab default x64 (5.0 timeframe)

Regression?

Probably not.

category:cq
theme:redundant-branches
skill-level:expert
cost:medium
impact:small

@huoyaoyuan huoyaoyuan added the tenet-performance Performance related issue label Feb 5, 2021
@dotnet-issue-labeler dotnet-issue-labeler bot added area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI untriaged New issue has not been triaged by the area owner labels Feb 5, 2021
@AndyAyersMS
Copy link
Member

Did you have a chance to look at 6.0 codegen? It may be better.

If not, I'll take a look when I get a chance.

@huoyaoyuan
Copy link
Member Author

I changed to master branch in sharplab, and the codegen was identical.
Editing the code may crash sharplab.

@JulieLeeMSFT JulieLeeMSFT added this to Needs Triage in .NET Core CodeGen via automation Feb 5, 2021
@AndyAyersMS
Copy link
Member

I don't think sharplab's master tests 6.0 codegen -- suspect it refers to the roslyn branch, not the runtime branch.

@huoyaoyuan
Copy link
Member Author

Historically, sharplab uses master for both roslyn and runtime master.

I've built latest master and viewed with disasmo. The problem still exists.

By the way, o is int || o is uint || o is long generates identical IL with the worst scenario. Roslyn doesn't introduce a new IL pattern for new language feature here.

@JulieLeeMSFT JulieLeeMSFT added needs-further-triage Issue has been initially triaged, but needs deeper consideration or reconsideration and removed untriaged New issue has not been triaged by the area owner labels Feb 5, 2021
@jnyrup
Copy link
Contributor

jnyrup commented Feb 6, 2021

Here are two other snippets than generates new asm, at least with SharpLab.

I don't know if they are "better", but I observe that:

  • M3 has two instructions less than M2.
  • M4 has two instructions less than M0, but three more than M1.
public bool M3(object o)
{
    return o is not null and (int or uint or long);
}

public bool M4(object o)
{
    return o switch 
    {
        int or uint or long => true,
        _ => false
    };
}
C.M3(System.Object)
    L0000: push ebp
    L0001: mov ebp, esp
    L0003: test edx, edx
    L0005: je short L002a
    L0007: mov eax, edx
    L0009: cmp dword ptr [eax], 0xca73b4
    L000f: je short L0023
    L0011: mov eax, edx
    L0013: cmp dword ptr [eax], 0xca79b8
    L0019: je short L0023
    L001b: cmp dword ptr [edx], 0xca7fbc
    L0021: jne short L002a
    L0023: mov eax, 1
    L0028: jmp short L002c
    L002a: xor eax, eax
    L002c: pop ebp
    L002d: ret

C.M4(System.Object)
    L0000: push ebp
    L0001: mov ebp, esp
    L0003: mov eax, edx
    L0005: test eax, eax
    L0007: je short L0011
    L0009: cmp dword ptr [eax], 0xca73b4
    L000f: jne short L0015
    L0011: test edx, edx
    L0013: jne short L0037
    L0015: mov eax, edx
    L0017: test eax, eax
    L0019: je short L0023
    L001b: cmp dword ptr [eax], 0xca79b8
    L0021: jne short L0027
    L0023: test edx, edx
    L0025: jne short L0037
    L0027: test edx, edx
    L0029: je short L0033
    L002b: cmp dword ptr [edx], 0xca7fbc
    L0031: jne short L003e
    L0033: test edx, edx
    L0035: je short L003e
    L0037: mov eax, 1
    L003c: jmp short L0040
    L003e: xor eax, eax
    L0040: pop ebp
    L0041: ret

sharplab

@EgorBo
Copy link
Member

EgorBo commented Feb 10, 2021

Well, even a simple

bool Test(object o) => o is int;

emits a sub-optimal codegen:

; Method Test(System.Object):bool:this
G_M7842_IG01:              ;; offset=0000H
						;; bbWeight=1    PerfScore 0.00

G_M7842_IG02:              ;; offset=0000H
       4885D2               test     rdx, rdx
       7411                 je       SHORT G_M7842_IG05
						;; bbWeight=1    PerfScore 1.25

G_M7842_IG03:              ;; offset=0005H
       48B8A09B3944FF7F0000 mov      rax, 0x7FFF44399BA0
       483902               cmp      qword ptr [rdx], rax
       7402                 je       SHORT G_M7842_IG05
						;; bbWeight=0.25 PerfScore 0.81

G_M7842_IG04:              ;; offset=0014H
       33D2                 xor      rdx, rdx
						;; bbWeight=0.13 PerfScore 0.03

G_M7842_IG05:              ;; offset=0016H
       4885D2               test     rdx, rdx
       0F95C0               setne    al
       0FB6C0               movzx    rax, al
						;; bbWeight=1    PerfScore 1.50

G_M7842_IG06:              ;; offset=001FH
       C3                   ret      
						;; bbWeight=1    PerfScore 1.00
; Total bytes of code: 32

The problem here is the fact we emit a giant tree for the following IL:

  ldarg.1
  isinst Int32
  ldnull
  cgt.un
  ret

After importer:

            [000016] -A-X--------              *  ASG       ref   
            [000015] D------N----              +--*  LCL_VAR   ref    V03 tmp1         
            [000014] ---X--------              \--*  QMARK     ref   
            [000007] Q-----------    if           +--*  EQ        int   
            [000006] ------------                 |  +--*  LCL_VAR   ref    V01 arg1         
            [000005] ------------                 |  \--*  CNS_INT   ref    null
            [000013] ---X--------    if           \--*  COLON     ref   
            [000011] ---X-------- else               +--*  QMARK     ref   
            [000004] Q--X--------    if              |  +--*  NE        int   
            [000003] #--X--------                    |  |  +--*  IND       long  
            [000002] ------------                    |  |  |  \--*  LCL_VAR   ref    V01 arg1         
            [000001] H-----------                    |  |  \--*  CNS_INT(h) long   0x7fff44389ba0 class
            [000010] ------------    if              |  \--*  COLON     ref   
            [000008] ------------ else               |     +--*  LCL_VAR   ref    V01 arg1         
            [000009] ------------ then               |     \--*  CNS_INT   ref    null
            [000012] ------------ then               \--*  LCL_VAR   ref    V01 arg1         

            [000020] ------------              *  RETURN    int   
            [000019] N--------U--              \--*  GT        int   
            [000017] ------------                 +--*  LCL_VAR   ref    V03 tmp1         
            [000018] ------------                 \--*  CNS_INT   ref    null

We emit a huge tree for isinst while not needing its return value (we only compare it with null), we only want something like this:

*  RETURN    int        
\--*  QMARK     int   
  +--*  EQ        int   
  |  +--*  LCL_VAR   ref    V01 arg1         
  |  \--*  CNS_INT   ref    null
  \--*  COLON     int   
    +--*  NE        int   
    |  +--*  IND       long  
    |  |  \--*  LCL_VAR   ref    V01 arg1         
    |  \--*  CNS_INT(h) long   0x7fff44389ba0 class
    \--*  CNS_INT   ref    0(false)  

@AndyAyersMS
Copy link
Member

6.0 codegen for M1 and M2 looks pretty good (better than the sharplab above, which is indeed showing 5.0 codegen).

Main difference from 5.0 is likely the CSE of the method table fetches enabled by #45854. M1 could improve its handling of return values a bit.

; Assembly listing for method C:M1(Object):bool:this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; discarded IBC profile data due to mismatch in ILSize
; Final local variable assignments
;
;* V00 this         [V00    ] (  0,  0   )     ref  ->  zero-ref    this class-hnd
;  V01 arg1         [V01,T00] (  5,  4   )     ref  ->  rdx         class-hnd
;# V02 OutArgs      [V02    ] (  1,  1   )  lclBlk ( 0) [rsp+0x00]   "OutgoingArgSpace"
;  V03 tmp1         [V03,T01] (  2,  2   )     ref  ->  rax         class-hnd "spilling QMark2"
;* V04 tmp2         [V04,T04] (  0,  0   )     ref  ->  zero-ref    class-hnd "spilling QMark2"
;  V05 tmp3         [V05,T02] (  3,  2.25)     ref  ->  rdx         class-hnd "spilling QMark2"
;  V06 cse0         [V06,T03] (  4,  2   )    long  ->  rax         "CSE - moderate"
;
; Lcl frame size = 0

G_M18147_IG01:
						;; bbWeight=1    PerfScore 0.00
G_M18147_IG02:
       test     rdx, rdx
       je       SHORT G_M18147_IG09
						;; bbWeight=1    PerfScore 1.25
G_M18147_IG03:
       mov      rax, rdx
       mov      rax, qword ptr [rax]
       mov      rcx, 0xD1FFAB1E
       cmp      rax, rcx
       je       SHORT G_M18147_IG07
       mov      rcx, 0xD1FFAB1E
       cmp      rax, rcx
       je       SHORT G_M18147_IG07
       mov      rcx, 0xD1FFAB1E
       cmp      rax, rcx
       je       SHORT G_M18147_IG05
						;; bbWeight=0.50 PerfScore 3.38
G_M18147_IG04:
       xor      rdx, rdx
						;; bbWeight=0.12 PerfScore 0.03
G_M18147_IG05:
       test     rdx, rdx
       setne    al
       movzx    rax, al
						;; bbWeight=0.50 PerfScore 0.75
G_M18147_IG06:
       ret      
						;; bbWeight=0.50 PerfScore 0.50
G_M18147_IG07:
       mov      eax, 1
						;; bbWeight=0.50 PerfScore 0.12
G_M18147_IG08:
       ret      
						;; bbWeight=0.50 PerfScore 0.50
G_M18147_IG09:
       xor      eax, eax
						;; bbWeight=0.50 PerfScore 0.12
G_M18147_IG10:
       ret      
						;; bbWeight=0.50 PerfScore 0.50
; Assembly listing for method C:M2(Object):bool:this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; discarded IBC profile data due to mismatch in ILSize
; Final local variable assignments
;
;* V00 this         [V00    ] (  0,  0   )     ref  ->  zero-ref    this class-hnd
;  V01 arg1         [V01,T00] (  4,  3.50)     ref  ->  rdx         class-hnd
;# V02 OutArgs      [V02    ] (  1,  1   )  lclBlk ( 0) [rsp+0x00]   "OutgoingArgSpace"
;  V03 cse0         [V03,T01] (  4,  2   )    long  ->  rax         "CSE - aggressive"
;
; Lcl frame size = 0

G_M2592_IG01:
						;; bbWeight=1    PerfScore 0.00
G_M2592_IG02:
       test     rdx, rdx
       je       SHORT G_M2592_IG07
						;; bbWeight=1    PerfScore 1.25
G_M2592_IG03:
       mov      rax, qword ptr [rdx]
       mov      rdx, 0xD1FFAB1E
       cmp      rax, rdx
       je       SHORT G_M2592_IG05
       mov      rdx, 0xD1FFAB1E
       cmp      rax, rdx
       je       SHORT G_M2592_IG05
       mov      rdx, 0xD1FFAB1E
       cmp      rax, rdx
       sete     al
       movzx    rax, al
						;; bbWeight=0.50 PerfScore 3.38
G_M2592_IG04:
       ret      
						;; bbWeight=0.50 PerfScore 0.50
G_M2592_IG05:
       mov      eax, 1
						;; bbWeight=0.50 PerfScore 0.12
G_M2592_IG06:
       ret      
						;; bbWeight=0.50 PerfScore 0.50
G_M2592_IG07:
       xor      eax, eax
						;; bbWeight=0.50 PerfScore 0.12
G_M2592_IG08:
       ret      
						;; bbWeight=0.50 PerfScore 0.50

@AndyAyersMS
Copy link
Member

However M0 is still not where it could be.

; Assembly listing for method C:M0(Object):bool:this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; discarded IBC profile data due to mismatch in ILSize
; Final local variable assignments
;
;* V00 this         [V00    ] (  0,  0   )     ref  ->  zero-ref    this class-hnd
;  V01 arg1         [V01,T00] (  7,  5   )     ref  ->  rdx         class-hnd
;# V02 OutArgs      [V02    ] (  1,  1   )  lclBlk ( 0) [rsp+0x00]   "OutgoingArgSpace"
;  V03 tmp1         [V03,T01] (  3,  4.50)     ref  ->  rax         class-hnd "spilling QMark2"
;  V04 tmp2         [V04,T03] (  3,  2.50)     ref  ->  rax         class-hnd "spilling QMark2"
;  V05 tmp3         [V05,T02] (  5,  3.75)     ref  ->  rdx         class-hnd "spilling QMark2"
;
; Lcl frame size = 0

G_M57506_IG01:
						;; bbWeight=1    PerfScore 0.00
G_M57506_IG02:
       mov      rax, rdx
       test     rax, rax
       je       SHORT G_M57506_IG04
						;; bbWeight=1    PerfScore 1.50
G_M57506_IG03:
       mov      rcx, 0xD1FFAB1E
       cmp      qword ptr [rax], rcx
       jne      SHORT G_M57506_IG05
						;; bbWeight=0.25 PerfScore 0.81
G_M57506_IG04:
       test     rdx, rdx
       jne      SHORT G_M57506_IG13
						;; bbWeight=0.50 PerfScore 0.62
G_M57506_IG05:
       mov      rax, rdx
       test     rax, rax
       je       SHORT G_M57506_IG07
						;; bbWeight=0.50 PerfScore 0.75
G_M57506_IG06:
       mov      rcx, 0xD1FFAB1E
       cmp      qword ptr [rax], rcx
       jne      SHORT G_M57506_IG08
						;; bbWeight=0.25 PerfScore 0.81
G_M57506_IG07:
       test     rdx, rdx
       jne      SHORT G_M57506_IG13
						;; bbWeight=0.50 PerfScore 0.62
G_M57506_IG08:
       test     rdx, rdx
       je       SHORT G_M57506_IG11
						;; bbWeight=0.50 PerfScore 0.62
G_M57506_IG09:
       mov      rax, 0xD1FFAB1E
       cmp      qword ptr [rdx], rax
       je       SHORT G_M57506_IG11
						;; bbWeight=0.25 PerfScore 0.81
G_M57506_IG10:
       xor      rdx, rdx
						;; bbWeight=0.12 PerfScore 0.03
G_M57506_IG11:
       test     rdx, rdx
       setne    al
       movzx    rax, al
						;; bbWeight=0.50 PerfScore 0.75
G_M57506_IG12:
       ret      
						;; bbWeight=0.50 PerfScore 0.50
G_M57506_IG13:
       mov      eax, 1
						;; bbWeight=0.50 PerfScore 0.12
G_M57506_IG14:
       ret      
						;; bbWeight=0.50 PerfScore 0.50

The jump-threading optimizations introduced in #46257 bail out as there is a temp assign (and hence side effect) in the block threading would like to avoid:

Dominator BB01 of BB06 has relop with same liberal VN:
N003 (  3,  3) [000007] J------N----              *  EQ        int    $100
N001 (  1,  1) [000006] ------------              +--*  LCL_VAR   ref    V03 tmp1         u:1 $80
N002 (  1,  1) [000005] ------------              \--*  CNS_INT   ref    null $VN.Null
 Redundant compare; current relop:
N003 (  3,  3) [000030] J------N----              *  EQ        int    $100
N001 (  1,  1) [000029] ------------              +--*  LCL_VAR   ref    V04 tmp2         u:1 $80
N002 (  1,  1) [000028] ------------              \--*  CNS_INT   ref    null $VN.Null
Both successors of IDom BB01 reach BB06 -- attempting jump threading
BB06 has side effects; no threading

 ------------ BB06 [008..???) -> BB10 (cond), preds={BB04,BB05} succs={BB07,BB10}

***** BB06
STMT00013 (IL 0x008...  ???)
N003 (  5,  4) [000075] -A------R---              *  ASG       ref    $80
N002 (  3,  2) [000074] D------N----              +--*  LCL_VAR   ref    V04 tmp2         d:1 $80
N001 (  1,  1) [000035] ------?-----              \--*  LCL_VAR   ref    V01 arg1         u:1 $80

***** BB06
STMT00011 (IL 0x008...  ???)
N004 (  5,  5) [000072] ------------              *  JTRUE     void  
N003 (  3,  3) [000030] J------N----              \--*  EQ        int    $100
N001 (  1,  1) [000029] ------------                 +--*  LCL_VAR   ref    V04 tmp2         u:1 $80
N002 (  1,  1) [000028] ------------                 \--*  CNS_INT   ref    null $VN.Null

This temp assign is consequence of the expansion of qmarks done by morph in fgExpandQmarkForCastInstOf.

At any rate we can / should extend RBO to allow duplication of the side effecting code. I didn't do this initially because:

  • if the redundant test block has PHIs then it seems we really ought to do some sensible updating of SSA, and we're not well equipped to do that;
  • we would need to introduce a new block for one of the threading out comes so that we only need to create one copy (that is, currently for a redundant test block T we group predecessors into two camps -- true preds and false preds -- and in general for both camps we simply jump to the new intended target. Instead one camp would need to jump to a clone of T and the other could continue jumping to T (like we do for fall through preds).

I don't see us getting to this in 6.0 given other priorities, though this sort of chained type test pattern is likely somewhat common (and more of it is on the way from PGO induced GDV) so we may find time to squeeze it in.

@AndyAyersMS AndyAyersMS removed the needs-further-triage Issue has been initially triaged, but needs deeper consideration or reconsideration label Feb 10, 2021
@AndyAyersMS AndyAyersMS added this to the Future milestone Feb 10, 2021
@AndyAyersMS AndyAyersMS moved this from Needs Triage to Optimizations in .NET Core CodeGen Feb 10, 2021
@AndyAyersMS
Copy link
Member

For completeness, here's the 6.0 codegen for M3 and M4.

M3 looks good and benefits from #45854. M4 has the same temp assign blocking issue as noted for M0.

; Assembly listing for method C:M3(Object):bool:this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; discarded IBC profile data due to mismatch in ILSize
; Final local variable assignments
;
;* V00 this         [V00    ] (  0,  0   )     ref  ->  zero-ref    this class-hnd
;  V01 arg1         [V01,T00] (  4,  3.50)     ref  ->  rdx         class-hnd
;  V02 loc0         [V02,T03] (  3,  2   )    bool  ->  rax        
;# V03 OutArgs      [V03    ] (  1,  1   )  lclBlk ( 0) [rsp+0x00]   "OutgoingArgSpace"
;  V04 tmp1         [V04,T01] (  2,  2   )     ref  ->  rdx         class-hnd "spilling QMark2"
;* V05 tmp2         [V05,T04] (  0,  0   )     ref  ->  zero-ref    class-hnd "spilling QMark2"
;* V06 tmp3         [V06,T05] (  0,  0   )     ref  ->  zero-ref    class-hnd "spilling QMark2"
;  V07 cse0         [V07,T02] (  4,  2   )    long  ->  rax         "CSE - moderate"
;
; Lcl frame size = 0

G_M3425_IG01:
						;; bbWeight=1    PerfScore 0.00
G_M3425_IG02:
       test     rdx, rdx
       je       SHORT G_M3425_IG05
						;; bbWeight=1    PerfScore 1.25
G_M3425_IG03:
       mov      rax, qword ptr [rdx]
       mov      rdx, 0xD1FFAB1E
       cmp      rax, rdx
       je       SHORT G_M3425_IG04
       mov      rdx, 0xD1FFAB1E
       cmp      rax, rdx
       je       SHORT G_M3425_IG04
       mov      rdx, 0xD1FFAB1E
       cmp      rax, rdx
       jne      SHORT G_M3425_IG05
						;; bbWeight=0.50 PerfScore 3.25
G_M3425_IG04:
       mov      eax, 1
       jmp      SHORT G_M3425_IG06
						;; bbWeight=0.50 PerfScore 1.12
G_M3425_IG05:
       xor      eax, eax
						;; bbWeight=0.50 PerfScore 0.12
G_M3425_IG06:
       ret      
						;; bbWeight=1    PerfScore 1.00
; Assembly listing for method C:M4(Object):bool:this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; discarded IBC profile data due to mismatch in ILSize
; Final local variable assignments
;
;* V00 this         [V00    ] (  0,  0   )     ref  ->  zero-ref    this class-hnd
;  V01 arg1         [V01,T00] (  7,  5   )     ref  ->  rdx         class-hnd
;  V02 loc0         [V02,T04] (  3,  2   )    bool  ->  rax        
;# V03 OutArgs      [V03    ] (  1,  1   )  lclBlk ( 0) [rsp+0x00]   "OutgoingArgSpace"
;  V04 tmp1         [V04,T01] (  3,  4.50)     ref  ->  rax         class-hnd "spilling QMark2"
;  V05 tmp2         [V05,T02] (  3,  2.50)     ref  ->  rax         class-hnd "spilling QMark2"
;  V06 tmp3         [V06,T03] (  3,  2.50)     ref  ->  rdx         class-hnd "spilling QMark2"
;
; Lcl frame size = 0

G_M53926_IG01:
						;; bbWeight=1    PerfScore 0.00
G_M53926_IG02:
       mov      rax, rdx
       test     rax, rax
       je       SHORT G_M53926_IG04
						;; bbWeight=1    PerfScore 1.50
G_M53926_IG03:
       mov      rcx, 0xD1FFAB1E
       cmp      qword ptr [rax], rcx
       jne      SHORT G_M53926_IG05
						;; bbWeight=0.25 PerfScore 0.81
G_M53926_IG04:
       test     rdx, rdx
       jne      SHORT G_M53926_IG10
						;; bbWeight=0.50 PerfScore 0.62
G_M53926_IG05:
       mov      rax, rdx
       test     rax, rax
       je       SHORT G_M53926_IG07
						;; bbWeight=0.50 PerfScore 0.75
G_M53926_IG06:
       mov      rcx, 0xD1FFAB1E
       cmp      qword ptr [rax], rcx
       jne      SHORT G_M53926_IG08
						;; bbWeight=0.25 PerfScore 0.81
G_M53926_IG07:
       test     rdx, rdx
       jne      SHORT G_M53926_IG10
						;; bbWeight=0.50 PerfScore 0.62
G_M53926_IG08:
       test     rdx, rdx
       je       SHORT G_M53926_IG11
						;; bbWeight=0.50 PerfScore 0.62
G_M53926_IG09:
       mov      rax, 0xD1FFAB1E
       cmp      qword ptr [rdx], rax
       jne      SHORT G_M53926_IG11
						;; bbWeight=0.25 PerfScore 0.81
G_M53926_IG10:
       mov      eax, 1
       jmp      SHORT G_M53926_IG12
						;; bbWeight=0.50 PerfScore 1.12
G_M53926_IG11:
       xor      eax, eax
						;; bbWeight=0.50 PerfScore 0.12
G_M53926_IG12:
       ret      
						;; bbWeight=1    PerfScore 1.00

@EgorBo
Copy link
Member

EgorBo commented Feb 10, 2021

@AndyAyersMS Do you think it makes sense to special case

isinst
ldnull
cgt.un

in the importer, something like this EgorBo@6659f53 or it just should be handled via #48115?

@AndyAyersMS
Copy link
Member

Handling it early is a good idea. The less IR/flow we introduce early, the better.

@huoyaoyuan
Copy link
Member Author

Is the pattern

isinst
brtrue.s / brfalse.s

also handled? It looks to be common too.

@AndyAyersMS
Copy link
Member

Need to test these out with #76476.

@huoyaoyuan
Copy link
Member Author

I've investigated myself, and successfully optimized isinst+ldnull+ceq/cgt.un and isinst+brtrue/brfalse patterns locally. However, I have no idea how to handle the isinst+dup+x pattern, which is used by as followed with ?. or ??.

The hard part here is fgExpandQmarkForCastInstOf was written in mind with as operator only. I have no knowledge about how qmark nodes can participate in further optimization either.

I also tried to introduce a new managed helper for doing the x is Exact or x as Exact check, but found that JIT doesn't allow helper call to be inlined currently.

@EgorBo do you have any suggestions around these?

@EgorBo
Copy link
Member

EgorBo commented Jun 23, 2023

@EgorBo do you have any suggestions around these?

Not really, ?? and other cases often produce multi-use boxing that we can't pattern match here

but found that JIT doesn't allow helper call to be inlined currently.

it can be implemented, just a bit of work + new JIT-EE API

@huoyaoyuan
Copy link
Member Author

it can be implemented, just a bit of work + new JIT-EE API

I'd like to try it. What information need to be passed in the interface? Eagerly load and pass the body of helper method?

@AndyAyersMS
Copy link
Member

All of M0-M4 now produce very similar code with 9p5 ish bits (M2 has slightly different layout)

; Method C:M4(System.Object):ubyte:this (FullOpts)
G_M53340_IG01:  ;; offset=0x0000
						;; size=0 bbWeight=1 PerfScore 0.00

G_M53340_IG02:  ;; offset=0x0000
       test     rdx, rdx
       je       SHORT G_M53340_IG05
						;; size=5 bbWeight=1 PerfScore 1.25

G_M53340_IG03:  ;; offset=0x0005
       mov      rax, qword ptr [rdx]
       mov      rcx, 0x7FFE205A5A10      ; System.Int32
       cmp      rax, rcx
       je       SHORT G_M53340_IG04
       mov      rcx, 0x7FFE2062CA58      ; System.UInt32
       cmp      rax, rcx
       je       SHORT G_M53340_IG04
       mov      rcx, 0x7FFE206613C0      ; System.Int64
       cmp      rax, rcx
       jne      SHORT G_M53340_IG05
						;; size=48 bbWeight=0.25 PerfScore 1.62

G_M53340_IG04:  ;; offset=0x0035
       mov      eax, 1
       jmp      SHORT G_M53340_IG06
						;; size=7 bbWeight=0.50 PerfScore 1.12

G_M53340_IG05:  ;; offset=0x003C
       xor      eax, eax
						;; size=2 bbWeight=0.50 PerfScore 0.12

G_M53340_IG06:  ;; offset=0x003E
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 63

Egor's Test example however still has some convoluted flow

; Method C:Test(System.Object):ubyte:this (FullOpts)
G_M56083_IG01:  ;; offset=0x0000
						;; size=0 bbWeight=1 PerfScore 0.00

G_M56083_IG02:  ;; offset=0x0000
       test     rdx, rdx
       je       SHORT G_M56083_IG05
						;; size=5 bbWeight=1 PerfScore 1.25

G_M56083_IG03:  ;; offset=0x0005
       mov      rax, 0x7FFE20595A10      ; System.Int32
       cmp      qword ptr [rdx], rax
       jne      SHORT G_M56083_IG05
						;; size=15 bbWeight=0.25 PerfScore 1.06

G_M56083_IG04:  ;; offset=0x0014
       jmp      SHORT G_M56083_IG06
						;; size=2 bbWeight=0.12 PerfScore 0.25

G_M56083_IG05:  ;; offset=0x0016
       xor      rdx, rdx
						;; size=2 bbWeight=0.25 PerfScore 0.06

G_M56083_IG06:  ;; offset=0x0018
       test     rdx, rdx
       setne    al
       movzx    rax, al
						;; size=9 bbWeight=1 PerfScore 1.50

G_M56083_IG07:  ;; offset=0x0021
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 34

But looks like #103391 will address that.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI tenet-performance Performance related issue
Projects
Status: Optimizations
.NET Core CodeGen
  
Optimizations
Development

No branches or pull requests

5 participants