Another RyuJIT optimization bug #1299

Closed
AndreyAkinshin opened this Issue Jul 27, 2015 · 14 comments

Projects

None yet

10 participants

@AndreyAkinshin
Contributor

This issue based on a SO question: RyuJit producing incorrect results.

Example 1

Let's start with the following simple code:

public struct NullableBool
{
    public bool? Value;

    public NullableBool(bool? value)
    {
        Value = value;
    }
}

[MethodImpl(MethodImplOptions.NoInlining)]
public static NullableBool Calculate1()
{
    bool? b = true;
    bool? c = null;
    c = b.Value ? false : c;
    var d = new NullableBool(c);
    return d;
}

public void Run1()
{
    Console.WriteLine(Calculate1().Value);
}

The expected result:

False

The actual RyuJIT result:

True

It was reproduced with .NET 4.6 Release RyuJIT (x64, Release build) and CoreCLR RyuJIT 480227b. Let's look to the asm.

; VisualStudio 2015 Disasm (.NET 4.6 Release RyuJIT-x64)
; Calculate1()
; --------------------

sub         rsp,28h  
; c = b.Value ? false : c;
mov         eax,1                  ; c.HasValue = true
movzx       edx,al                 ; c.Value = 1 (!!!!!)
; var d = new NullableBool(c);
mov         word ptr [rsp+20h],0   ; d = new NullableBool
lea         rcx,[rsp+20h]          ; rcx = &d
mov         byte ptr [rcx],al      ; d.HasValue = c.HasValue
mov         byte ptr [rcx+1],dl    ; d.Value = c.Value
; return d;
movsx       rax,word ptr [rsp+20h] ; return d
add         rsp,28h  
ret  
; CoreCLR 480227becb804cd511c23bbcaf4994b839130fd8
; > SET COMPLUS_JitDisasm=Calculate1
; > csc /optimize Program.cs
; > CoreRun.exe Program.exe
; --------------------

; Assembly listing for method Program:Calculate1():struct
; Emitting BLENDED_CODE for X64 CPU with AVX
; optimized code
; rsp based frame
; partially interruptible
; Final local variable assignments
;
;* V00 loc0         [V00    ] (  0,   0  )  struct ( 8) zero ref
;* V01 loc1         [V01    ] (  0,   0  )  struct ( 8) zero ref
;  V02 loc2         [V02    ] (  2,   2  )  struct ( 8) [rsp+0x20]   do-not-enreg[XSFB] must-init addr-exposed
;* V03 tmp0         [V03    ] (  0,   0  )  struct ( 8) zero ref
;* V04 tmp1         [V04    ] (  0,   0  )  struct ( 8) zero ref
;* V05 tmp2         [V05    ] (  0,   0  )  struct ( 8) zero ref
;* V06 tmp3         [V06,T08] (  0,   0  )    bool  ->  zero ref    V00.hasValue(offs=0x00) P-INDEP
;* V07 tmp4         [V07,T09] (  0,   0  )    bool  ->  zero ref    V00.value(offs=0x01) P-INDEP
;  V08 tmp5         [V08,T01] (  3,   3  )    bool  ->  rax         V01.hasValue(offs=0x00) P-INDEP
;  V09 tmp6         [V09,T02] (  2,   2  )    bool  ->  rdx         V01.value(offs=0x01) P-INDEP
;  V10 tmp7         [V10,T06] (  2,   1  )    bool  ->  rax         V03.hasValue(offs=0x00) P-INDEP
;* V11 tmp8         [V11,T07] (  0,   0  )    bool  ->  zero ref    V03.value(offs=0x01) P-INDEP
;  V12 tmp9         [V12,T05] (  2,   1.5)    bool  ->  rax         V04.hasValue(offs=0x00) P-INDEP
;* V13 tmp10        [V13,T10] (  0,   0  )    bool  ->  zero ref    V04.value(offs=0x01) P-INDEP
;  V14 tmp11        [V14,T03] (  2,   2  )    bool  ->  rax         V05.hasValue(offs=0x00) P-INDEP
;  V15 tmp12        [V15,T04] (  2,   2  )    bool  ->  rdx         V05.value(offs=0x01) P-INDEP
;  V16 tmp13        [V16,T00] (  3,   6  )   byref  ->  rcx         stack-byref
;  V17 OutArgs      [V17    ] (  1,   1  )  lclBlk (32) [rsp+0x00]
;
; Lcl frame size = 40

G_M40051_IG01:
       4883EC28             sub      rsp, 40
       33C0                 xor      rax, rax
       4889442420           mov      qword ptr [rsp+20H], rax

G_M40051_IG02:
       B801000000           mov      eax, 1
       0FB6D0               movzx    rdx, al
       488D4C2420           lea      rcx, bword ptr [rsp+20H]
       8801                 mov      byte  ptr [rcx], al
       885101               mov      byte  ptr [rcx+1], dl
       480FBF442420         movsx    rax, word  ptr [rsp+20H]

G_M40051_IG03:
       4883C428             add      rsp, 40
       C3                   ret

; Total bytes of code 40, prolog size 11 for method Program:Calculate1():struct

We can see that RyuJIT transforms source

bool? b = true;
bool? c = null;
c = b.Value ? false : c;

to

mov         eax,1                  ; c.HasValue = true
movzx       edx,al                 ; c.Value = 1

It is wrong.

Example 2

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static NullableBool Calculate2()
{
    bool? a = null, b = true;
    if (!b.HasValue)
        return new NullableBool();
    bool? c = null;
    c = b.Value ? false : c;
    if (a.HasValue && a.Value) { }
    return new NullableBool(c);
}        

public void Run2()
{
    Console.WriteLine(Calculate2().Value);
    Console.WriteLine(Calculate2().Value);
}

The expected result:

False
False

The actual RyuJIT result:

True
False

We can see that RyuJIT apply an optimization wrong way only first time. Let's look to the asm.

; VisualStudio 2015 Disasm (.NET 4.6 Release RyuJIT-x64)
; Run2()
; --------------------

in          al,dx  ; VisualStudio disasm bug
pop         rax    ; it actually means sub rsp,58h
; Calculate2().Value
mov         word ptr [rsp+38h],0   ; [rsp+38h] = 0
mov         edx,1                  ; c.HasValue = true
movzx       ecx,dl                 ; c.Value = true (!!!!!)
mov         word ptr [rsp+40h],0   ; [rsp+40h] = new Nullable<bool>()
lea         rax,[rsp+40h]          ; rax = [rsp+40h]
mov         byte ptr [rax],dl      ; [rsp+40h].HasValue = true
mov         byte ptr [rax+1],cl    ; [rsp+40h].Value = true (!!!!!)
movsx       rdx,word ptr [rsp+40h] ; rdx = *[rsp+40h]
mov         word ptr [rsp+50h],dx  ; [rsp+50h] = (bool?)true (!!!!!)
lea         rdx,[rsp+50h]          ; rdx = [rsp+50h]
; Console.WriteLine
mov         rcx,7FFBA21C37E0h  
call        00007FFBA32B3EC0  
mov         rcx,rax  
call        00007FFBA285A940  
; Calculate2().Value
mov         word ptr [rsp+28h],0   ; [rsp+28h] = 0
xor         edx,edx                ; c.Value = false (!!!!!)
mov         ecx,1                  ; c.HasValue = true
mov         word ptr [rsp+30h],0   ; [rsp+30h] = new Nullable<bool>()
lea         rax,[rsp+30h]          ; rax = [rsp+30h]
mov         byte ptr [rax],cl      ; [rsp+30h].HasValue = true
mov         byte ptr [rax+1],dl    ; [rsp+30h].Value = false (!!!!!)
movsx       rdx,word ptr [rsp+30h] ; rdx = *[rsp+30h]
mov         word ptr [rsp+48h],dx  ; [rsp+48h] = (bool?)false (!!!!!)
lea         rdx,[rsp+48h]          ; rdx = [rsp+48h]
; Console.WriteLine
mov         rcx,7FFBA21C37E0h  
call        00007FFBA32B3EC0  
mov         rcx,rax  
call        00007FFBA285A940  
nop  
add         rsp,58h  
; CoreCLR 480227becb804cd511c23bbcaf4994b839130fd8
; > SET COMPLUS_JitDisasm=Run2
; > csc /optimize Program.cs
; > CoreRun.exe Program.exe
; --------------------

; Assembly listing for method Program:Run2():this
; Emitting BLENDED_CODE for X64 CPU with AVX
; optimized code
; rsp based frame
; partially interruptible
; Final local variable assignments
;
;* V00 this         [V00    ] (  0,   0  )     ref  ->  zero ref    this
;  V01 tmp0         [V01    ] (  2,   4  )  struct ( 8) [rsp+0x50]   do-not-enreg[XSF] addr-exposed
;  V02 tmp1         [V02    ] (  2,   4  )  struct ( 8) [rsp+0x48]   do-not-enreg[XSF] addr-exposed
;  V03 tmp2         [V03,T06] (  2,   2  )   short  ->  rdx
;* V04 tmp3         [V04    ] (  0,   0  )  struct ( 8) zero ref
;* V05 tmp4         [V05    ] (  0,   0  )  struct ( 8) zero ref
;* V06 tmp5         [V06    ] (  0,   0  )  struct ( 8) zero ref
;* V07 tmp6         [V07    ] (  0,   0  )  struct ( 8) zero ref
;* V08 tmp7         [V08    ] (  0,   0  )  struct ( 8) zero ref
;  V09 tmp8         [V09    ] (  3,   6  )  struct ( 8) [rsp+0x40]   do-not-enreg[XSFB] addr-exposed
;  V10 tmp9         [V10,T14] (  1,   1  )  struct ( 8) [rsp+0x38]   do-not-enreg[SF]
;* V11 tmp10        [V11    ] (  0,   0  )  struct ( 8) zero ref
;  V12 tmp11        [V12,T07] (  2,   2  )   short  ->  rdx
;* V13 tmp12        [V13    ] (  0,   0  )  struct ( 8) zero ref
;* V14 tmp13        [V14    ] (  0,   0  )  struct ( 8) zero ref
;* V15 tmp14        [V15    ] (  0,   0  )  struct ( 8) zero ref
;* V16 tmp15        [V16    ] (  0,   0  )  struct ( 8) zero ref
;* V17 tmp16        [V17    ] (  0,   0  )  struct ( 8) zero ref
;  V18 tmp17        [V18    ] (  3,   6  )  struct ( 8) [rsp+0x30]   do-not-enreg[XSFB] addr-exposed
;  V19 tmp18        [V19,T15] (  1,   1  )  struct ( 8) [rsp+0x28]   do-not-enreg[SF]
;* V20 tmp19        [V20    ] (  0,   0  )  struct ( 8) zero ref
;* V21 tmp20        [V21,T13] (  0,   0  )    bool  ->  zero ref    V04.hasValue(offs=0x00) P-INDEP
;* V22 tmp21        [V22    ] (  0,   0  )    bool  ->  zero ref    V04.value(offs=0x01) P-INDEP
;* V23 tmp22        [V23,T16] (  0,   0  )    bool  ->  zero ref    V05.hasValue(offs=0x00) P-INDEP
;* V24 tmp23        [V24,T17] (  0,   0  )    bool  ->  zero ref    V05.value(offs=0x01) P-INDEP
;  V25 tmp24        [V25,T02] (  3,   2  )    bool  ->  rdx         V06.hasValue(offs=0x00) P-INDEP
;  V26 tmp25        [V26,T03] (  2,   1.5)    bool  ->  rcx         V06.value(offs=0x01) P-INDEP
;  V27 tmp26        [V27,T22] (  2,   0.5)    bool  ->  rdx         V07.hasValue(offs=0x00) P-INDEP
;* V28 tmp27        [V28,T24] (  0,   0  )    bool  ->  zero ref    V07.value(offs=0x01) P-INDEP
;  V29 tmp28        [V29,T20] (  2,   0.7)    bool  ->  rdx         V08.hasValue(offs=0x00) P-INDEP
;* V30 tmp29        [V30,T25] (  0,   0  )    bool  ->  zero ref    V08.value(offs=0x01) P-INDEP
;  V31 tmp30        [V31,T08] (  2,   2  )    bool  ->  rdx         V11.hasValue(offs=0x00) P-INDEP
;  V32 tmp31        [V32,T09] (  2,   2  )    bool  ->  rcx         V11.value(offs=0x01) P-INDEP
;  V33 tmp32        [V33,T12] (  2,   0.7)    bool  ->  rdx         V13.hasValue(offs=0x00) P-INDEP
;* V34 tmp33        [V34    ] (  0,   0  )    bool  ->  zero ref    V13.value(offs=0x01) P-INDEP
;* V35 tmp34        [V35,T18] (  0,   0  )    bool  ->  zero ref    V14.hasValue(offs=0x00) P-INDEP
;* V36 tmp35        [V36,T19] (  0,   0  )    bool  ->  zero ref    V14.value(offs=0x01) P-INDEP
;  V37 tmp36        [V37,T04] (  2,   1.5)    bool  ->  rcx         V15.hasValue(offs=0x00) P-INDEP
;  V38 tmp37        [V38,T05] (  2,   1.5)    bool  ->  rdx         V15.value(offs=0x01) P-INDEP
;  V39 tmp38        [V39,T23] (  2,   0.5)    bool  ->  rcx         V16.hasValue(offs=0x00) P-INDEP
;* V40 tmp39        [V40,T26] (  0,   0  )    bool  ->  zero ref    V16.value(offs=0x01) P-INDEP
;  V41 tmp40        [V41,T21] (  2,   0.7)    bool  ->  rcx         V17.hasValue(offs=0x00) P-INDEP
;* V42 tmp41        [V42,T27] (  0,   0  )    bool  ->  zero ref    V17.value(offs=0x01) P-INDEP
;  V43 tmp42        [V43,T10] (  2,   2  )    bool  ->  rcx         V20.hasValue(offs=0x00) P-INDEP
;  V44 tmp43        [V44,T11] (  2,   2  )    bool  ->  rdx         V20.value(offs=0x01) P-INDEP
;  V45 tmp44        [V45,T00] (  3,   6  )   byref  ->  rax         stack-byref
;  V46 tmp45        [V46,T01] (  3,   6  )   byref  ->  rax         stack-byref
;  V47 OutArgs      [V47    ] (  1,   1  )  lclBlk (32) [rsp+0x00]
;
; Lcl frame size = 88

G_M55345_IG01:
       4883EC58             sub      rsp, 88

G_M55345_IG02:
       66C74424380000       mov      word  ptr [rsp+38H], 0
       BA01000000           mov      edx, 1
       0FB6CA               movzx    rcx, dl
       66C74424400000       mov      word  ptr [rsp+40H], 0
       488D442440           lea      rax, bword ptr [rsp+40H]
       8810                 mov      byte  ptr [rax], dl
       884801               mov      byte  ptr [rax+1], cl
       480FBF542440         movsx    rdx, word  ptr [rsp+40H]
       6689542450           mov      word  ptr [rsp+50H], dx
       488D542450           lea      rdx, bword ptr [rsp+50H]
       48B9D8C9E087FB7F0000 mov      rcx, 0x7FFB87E0C9D8
       E88DFF245F           call     CORINFO_HELP_BOX_NULLABLE
       488BC8               mov      rcx, rax
       E8E5E3B05B           call     System.Console:WriteLine(ref)
       66C74424280000       mov      word  ptr [rsp+28H], 0
       33D2                 xor      edx, edx
       B901000000           mov      ecx, 1
       66C74424300000       mov      word  ptr [rsp+30H], 0
       488D442430           lea      rax, bword ptr [rsp+30H]
       8808                 mov      byte  ptr [rax], cl
       885001               mov      byte  ptr [rax+1], dl
       480FBF542430         movsx    rdx, word  ptr [rsp+30H]
       6689542448           mov      word  ptr [rsp+48H], dx
       488D542448           lea      rdx, bword ptr [rsp+48H]
       48B9D8C9E087FB7F0000 mov      rcx, 0x7FFB87E0C9D8
       E847FF245F           call     CORINFO_HELP_BOX_NULLABLE
       488BC8               mov      rcx, rax
       E89FE3B05B           call     System.Console:WriteLine(ref)
       90                   nop

G_M55345_IG03:
       4883C458             add      rsp, 88
       C3                   ret

; Total bytes of code 151, prolog size 4 for method Program:Run2():this
; ============================================================

We can see that RyuJIT transforms source

bool? a = null, b = true;
if (!b.HasValue)
    return new NullableBool();
bool? c = null;
c = b.Value ? false : c;
if (a.HasValue && a.Value) { }

to

mov         edx,1                  ; c.HasValue = true
movzx       ecx,dl                 ; c.Value = true (!!!!!)

but only for the first call. It is wrong result. For the second call we have

xor         edx,edx                ; c.Value = false (!!!!!)
mov         ecx,1                  ; c.HasValue = true

It is ok.

Conclusion

It seems that there is a tricky RyuJIT optimization bug which depends on big amount of additional conditions: any changes of the source code spoil reproducibility.

@schellap
Contributor

Andrey, thank you for the reduced repro. Your analysis is correct. I have a fix for the issue that is going through the bug fix processes.

@jgowdy
jgowdy commented Jul 29, 2015

So the current blog entry from Microsoft is describing the tail call optimization defect and stating that it's okay to leave RyuJIT on in production and simply disable tail call optimization. Yet the above bug seems to indicate a new defect in RyuJIT code generation that contradicts the idea that RyuJIT is fine for production minus tail call optimization. Should the advice in the blog entry be updated based on the above?

http://blogs.msdn.com/b/dotnet/archive/2015/07/28/ryujit-bug-advisory-in-the-net-framework-4-6.aspx

@EamonNerbonne

Given the fact that there have been 2 such nasty bugs found so quickly, and the fact that ryujit isn't a huge performance boost, it looks like holding off on ryujit in general for a while looks like better advice than merely disabling tail calls.

@danielmarbach

Agree. Updating a blog post is also not enough. There needs to be an update on the existing blog post and a general announcement which clearly says: DO NOT USE IN PRODUCTION.

@AndreyAkinshin
Contributor

@EamonNerbonne, I completely agree. In my humble opinion, RyuJIT is not ready yet.

@redknightlois

@EamonNerbonne that is without counting subpar optimization on other pretty common performance sensitive scenarios.
@AndreyAkinshin I second that.

@Eyas
Eyas commented Jul 29, 2015

Its important to realize that RyuJIT's primary prupose is that it is lighter and faster in running. It will optimize your code more quickly. As a JITer, it will not perform some of the complex optimizations that x64-JIT performed, which was based on the C++ AoT compiler.

See: http://blogs.msdn.com/b/dotnet/archive/2013/09/30/10453200.aspx

Missed optimization opportunities are absolutely issues that should be talked about to make RyuJIT even better, but just do realize that the comparison to x64 is unfair. RyuJIT gets you to a JITed state faster, reduces startup times on server applications and long-running services, etc. There is indeed a trade-off and it was well documented.

@ayende
ayende commented Jul 29, 2015

But for long running systems, it is fine to have a bit slower startup time
if it means that we are running faster afterward.

*Hibernating Rhinos Ltd *

Oren Eini* l CEO l *Mobile: + 972-52-548-6969

Office: +972-4-622-7811 *l *Fax: +972-153-4-622-7811

On Wed, Jul 29, 2015 at 5:21 PM, Eyas notifications@github.com wrote:

Its important to realize that RyuJIT's primary prupose is that it is
lighter and faster in running. It will optimize your code more quickly. As
a JITer, it will not perform some of the complex optimizations that x64-JIT
performed, which was based on the C++ AoT compiler.

See: http://blogs.msdn.com/b/dotnet/archive/2013/09/30/10453200.aspx

Missed optimization opportunities are absolutely issues that should be
talked about to make RyuJIT even better, but just do realize that the
comparison to x64 is unfair. RyuJIT gets you to a JITed state faster,
reduces startup times on server applications and long-running services,
etc. There is indeed a trade-off and it was well documented.


Reply to this email directly or view it on GitHub
#1299 (comment).

@redknightlois

@Eyas probably but if you release it as the default the expectations are that it is on-par on what you already have.

"Taking time to compile efficient code made sense when 64-bit was primarily for server code. But >“server code” today includes web apps that have to start fast."

Let's agree to disagree. While it makes sense, I dont share the feeling that is true. What's the point of being able to startup 5-10 seconds earlier if you are going to lose minutes/hours over the time the application is processing (and forced it even on my old applications that do not target 4.6)? This is definitely something to discuss in a different issue though. I stand on the position that in the current state RyuJIT is not ready for being the default.

@Eyas
Eyas commented Jul 29, 2015

@redknightlois Yep, I'm not affiliated with Microsoft in any way. Just expressing that the RyuJIT decision was announced in 2013 and made probably before that. The blog post makes the case that they intend to make. Their proposition is that RyuJIT is already better for most use cases.

Another issue might be appropriate for a larger discussion about what kinds of optimizations RyuJIT should and shouldn't be doing, that Legacy-JIT performed previously. Fine by me. But what I'm trying to prevent is having every "regression" issue devolving into a conversation about "RyuJIT is worse, how are they releasing this?" when our metric for "worse" is not the one they are using.

Edit: to clarify, We should track performance regressions, just not degrade the conversation

@ayende
ayende commented Jul 29, 2015

I would argue that as the users, our metric for worse is the one that count.

*Hibernating Rhinos Ltd *

Oren Eini* l CEO l *Mobile: + 972-52-548-6969

Office: +972-4-622-7811 *l *Fax: +972-153-4-622-7811

On Wed, Jul 29, 2015 at 5:42 PM, Eyas notifications@github.com wrote:

@redknightlois https://github.com/redknightlois Yep, I'm not affiliated
with Microsoft in any way. Just expressing that the RyuJIT decision was
announced in 2013 and made probably before that. The blog post makes the
case that they intend to make. Their proposition is that RyuJIT is already
better for most use cases.

Another issue might be appropriate for a larger discussion about what
kinds of optimizations RyuJIT should and shouldn't be doing, that
Legacy-JIT performed previously. Fine by me. But what I'm trying to prevent
is having every "regression" issue devolving into a conversation about
"RyuJIT is worse, how are they releasing this?" when our metric for "worse"
is not the one they are using.


Reply to this email directly or view it on GitHub
#1299 (comment).

@masonwheeler

@ayende is absolutely right here. The longer the program is going to run, the more important performance becomes, rather than startup time.

@Eyas
Eyas commented Jul 29, 2015

@ayende Possibly. But that is the topic of another ticket/discussion about a larger issue with what RyuJIT is even trying to solve.

@jgowdy
jgowdy commented Jul 29, 2015

I'm more concerned with generated code correctness before I start load testing my app with the new JIT. I am still excited by RyuJIT overall, but this has been released into prod with two codegen defects (so far), and the blog post about the first defect doesn't advise you to avoid the second. I don't know if there's a technical reason why they defaulted pre-4.6 assemblies to using RyuJIT but I think that was the real mistake here. People don't expect a framework upgrade to break their not-yet-upgraded projects.

@schellap schellap added a commit to schellap/coreclr that referenced this issue Jul 29, 2015
@schellap schellap Conservatively mark variable as killed if there is an embedded comma …
…assignment

GitHub Issue: Another RyuJIT optimization bug #1299
dotnet#1299

The underlying problem is there are two loops:

The first loop does copy prop
The second loop updates the stack of live definitions in the currently
traversed CFG path.

This approach works fine only if assignments are atomic statements, but
when there is an assignment embedded using a comma in a statement:
The first loop does nothing about this and continues to propagate
copies, i.e., without updating the liveness stack. This leads to using a
stale VN for comparison rather than using an updated VN due to this
definition.

Only when the second loop runs, this live definition is added to the
stack.

The right fix is to merge the two loops into a single loop that does
copy prop on the tree and at the same time updates the liveness on the
stack right after the call. But this is causing a lot of ASM diffs which
would affect stability.

I am making a conservative fix of temporarily marking the variables that
have embedded intervening defs as "killed", until the second loop runs
to update liveness.

There are no SuperASM diffs with this change.
e4ae672
@schellap schellap self-assigned this Jul 30, 2015
@schellap schellap closed this Jul 30, 2015
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment