-
Notifications
You must be signed in to change notification settings - Fork 4.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fixes and improvements for removal of redundant zero inits #37786
Conversation
1. Bug fix: when processing a zero assignment to a field of a promoted struct, check the reference count of the parent struct. 2. Bug fix: when processing a zero assignment to a promoted struct, check the reference counts of all fields. 3. Bug fix and improvement: a dependently promoted field is initialized in the prolog iff its parent struct is initialized in the prolog so we can remove a field zero initialization if the parent struct is already zero initialized. 4. Improvement: keep track of explicitly zero-initialized locals so that duplicate explicit zero initializations can be eliminated. Fixes dotnet#37666.
Framework x64 pmi diffs:
|
No diffs in benchmarks. |
Only instance of a bug fix in framework: Microsoft.CodeAnalysis.CSharp.dll
; Assembly listing for method ControlFlowPass:LabelState(LabelSymbol):LocalState:this
G_M46536_IG01:
sub rsp, 40
xor rax, rax
mov qword ptr [rsp+20H], rax
;; bbWeight=1 PerfScore 1.50
G_M46536_IG02:
mov rax, 0xD1FFAB1E
call qword ptr [rax]PreciseAbstractFlowPass`1:LabelState(LabelSymbol):LocalState:this
mov word ptr [rsp+20H], ax
+ mov byte ptr [rsp+21H], 0
movsx rax, word ptr [rsp+20H]
;; bbWeight=1 PerfScore 5.25
;; bbWeight=1 PerfScore 6.25
G_M46536_IG03:
add rsp, 40
ret
;; bbWeight=1 PerfScore 1.25
-; Total bytes of code 39, prolog size 11, PerfScore 11.90, (MethodHash=997c4a37) for method ControlFlowPass:LabelState(LabelSymbol):LocalState:this
+; Total bytes of code 44, prolog size 11, PerfScore 13.40, (MethodHash=997c4a37) for method ControlFlowPass:LabelState(LabelSymbol):LocalState:this
|
Examples of improvement diffs: Struct is explicitly zero initialized and we remove field zero inits Microsoft.CodeAnalysis.VisualBasic.dll
; Assembly listing for method ControlFlowPass:UnreachableState():LocalState:this
G_M2978_IG01:
push rax
;; bbWeight=1 PerfScore 1.00
G_M2978_IG02:
xor eax, eax
mov word ptr [rsp], ax
movzx rax, byte ptr [rcx+137]
- xor edx, edx
- mov word ptr [rsp], dx
- mov byte ptr [rsp], 0
mov byte ptr [rsp+01H], al
movsx rax, word ptr [rsp]
;; bbWeight=1 PerfScore 7.50
;; bbWeight=1 PerfScore 5.25
G_M2978_IG03:
add rsp, 8
ret
;; bbWeight=1 PerfScore 1.25
-; Total bytes of code 38, prolog size 1, PerfScore 13.55, (MethodHash=1828f45d) for method ControlFlowPass:UnreachableState():LocalState:this
+; Total bytes of code 28, prolog size 1, PerfScore 10.30, (MethodHash=1828f45d) for method ControlFlowPass:UnreachableState():LocalState:this Struct is zero initialized in the prolog and we remove field zero init Microsoft.CodeAnalysis.CSharp.dll
; Assembly listing for method SyntaxDiagnosticInfoList:GetEnumerator():Enumerator:this
G_M39539_IG01:
push rdi
push rsi
push rbx
sub rsp, 64
vxorps xmm4, xmm4
vmovdqu xmmword ptr [rsp+28H], xmm4
xor rax, rax
mov qword ptr [rsp+38H], rax
mov rsi, rdx
;; bbWeight=1 PerfScore 6.08
G_M39539_IG02:
mov rdi, gword ptr [rcx]
- xor ecx, ecx
- mov dword ptr [rsp+38H], ecx
test rdi, rdi
je G_M39539_IG08
...
-; Total bytes of code 240, prolog size 24, PerfScore 51.60, (MethodHash=0691658c) for method SyntaxDiagnosticInfoList:GetEnumerator():Enumerator:this
+; Total bytes of code 234, prolog size 24, PerfScore 49.75, (MethodHash=0691658c) for method SyntaxDiagnosticInfoList:GetEnumerator():Enumerator:this We remove duplicate explicit struct zero init Microsoft.CodeAnalysis.VisualBasic.dll
; Assembly listing for method DataFlowPass:ReachableState():LocalState:this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; Final local variable assignments
;
;* V00 this [V00 ] ( 0, 0 ) ref -> zero-ref this class-hnd
; V01 RetBuf [V01,T01] ( 4, 4 ) byref -> rbx
; V02 OutArgs [V02 ] ( 1, 1 ) lclBlk (32) [rsp+0x00] "OutgoingArgSpace"
; V03 tmp1 [V03 ] ( 4, 8 ) struct (16) [rsp+0x30] do-not-enreg[XSFB] addr-exposed "NewObj constructor temp"
; V03 tmp1 [V03 ] ( 3, 6 ) struct (16) [rsp+0x30] do-not-enreg[XSFB] addr-exposed "NewObj constructor temp"
; V04 tmp2 [V04 ] ( 4, 8 ) struct (16) [rsp+0x20] do-not-enreg[XS] must-init addr-exposed "struct address for call/obj"
;* V05 tmp3 [V05 ] ( 0, 0 ) struct (16) zero-ref "Inlining Arg"
; V06 tmp4 [V06 ] ( 2, 3 ) ref -> [rsp+0x20] do-not-enreg[X] addr-exposed V04._bits(offs=0x00) P-DEP "field V04._bits (fldOffset=0x0)"
; V07 tmp5 [V07 ] ( 2, 3 ) int -> [rsp+0x28] do-not-enreg[X] addr-exposed V04._bits0(offs=0x08) P-DEP "field V04._bits0 (fldOffset=0x8)"
; V08 tmp6 [V08 ] ( 2, 3 ) int -> [rsp+0x2C] do-not-enreg[X] addr-exposed V04._capacity(offs=0x0c) P-DEP "field V04._capacity (fldOffset=0xc)"
; V09 tmp7 [V09,T02] ( 2, 2 ) ref -> rax V05._bits(offs=0x00) P-INDEP "field V05._bits (fldOffset=0x0)"
; V10 tmp8 [V10,T03] ( 2, 2 ) int -> rdx V05._bits0(offs=0x08) P-INDEP "field V05._bits0 (fldOffset=0x8)"
; V11 tmp9 [V11,T04] ( 2, 2 ) int -> rcx V05._capacity(offs=0x0c) P-INDEP "field V05._capacity (fldOffset=0xc)"
; V12 tmp10 [V12,T00] ( 4, 8 ) byref -> r8 stack-byref "BlockOp address local"
;
; Lcl frame size = 64
G_M31598_IG01:
push rdi
push rsi
push rbx
sub rsp, 64
vzeroupper
xor rax, rax
mov qword ptr [rsp+20H], rax
mov rbx, rdx
;; bbWeight=1 PerfScore 5.75
G_M31598_IG02:
vxorps xmm0, xmm0
vmovdqu xmmword ptr [rsp+30H], xmm0
lea rcx, bword ptr [rsp+20H]
call BitVector:get_Empty():BitVector
mov rax, gword ptr [rsp+20H]
mov edx, dword ptr [rsp+28H]
mov ecx, dword ptr [rsp+2CH]
- vxorps xmm0, xmm0
- vmovdqu xmmword ptr [rsp+30H], xmm0
lea r8, bword ptr [rsp+30H]
mov gword ptr [r8], rax
mov dword ptr [r8+8], edx
mov dword ptr [r8+12], ecx
mov rdi, rbx
lea rsi, bword ptr [rsp+30H]
call CORINFO_HELP_ASSIGN_BYREF
movsq
mov rax, rbx
;; bbWeight=1 PerfScore 13.67
;; bbWeight=1 PerfScore 12.33
G_M31598_IG03:
add rsp, 64
pop rbx
pop rsi
pop rdi
ret
;; bbWeight=1 PerfScore 2.75
-; Total bytes of code 105, prolog size 17, PerfScore 33.07, (MethodHash=272a8491) for method DataFlowPass:ReachableState():LocalState:this
+; Total bytes of code 95, prolog size 17, PerfScore 30.53, (MethodHash=272a8491) for method DataFlowPass:ReachableState():LocalState:this |
@dotnet/jit-contrib @AndyAyersMS PTAL |
I recommend reviewing with "Hide whitespace changes" since I added a couple of early-outs from the loop to reduce indentation. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. Just one question about some code you didn't change.
fgRemoveStmt(block, stmt); | ||
removedExplicitZeroInit = true; | ||
lclDsc->lvSuppressedZeroInit = 1; | ||
lclDsc->setLvRefCnt(lclDsc->lvRefCnt() - 1); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We generally don't incrementally update ref counts anymore. Is this important?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added that in #37280. The idea is to detect cases where we make the struct dependently promoted because of a block assignment and then delete that block assignment. In that case we can make the field tracked and eliminate dead field assignments.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if there is any guarantee that the count is > 0 at this point? I suppose this is running not long after the first call to lvaMarkLocalVars
so the ref counts are probably still fairly accurate.
And the code to determine which locals to track is just a heuristic, so it's ok if it is based on incorrect ref count info.
Bug fix: when processing a zero assignment to a field of a promoted struct,
check the reference count of the parent struct.
Bug fix: when processing a zero assignment to a promoted struct, check the
reference counts of all fields.
Bug fix and improvement: a dependently promoted field is initialized in the prolog
iff its parent struct is initialized in the prolog so we can remove a field zero initialization
if the parent struct is already zero initialized.
Improvement: keep track of explicitly zero-initialized locals so that duplicate
explicit zero initializations can be eliminated.
Fixes #37666.