I don't think there is a perf impact. It's just an interesting case which may point to something important.
The difference is in these lines:
mov dword ptr [rcx+rax+16], r8d
lea rax, bword ptr [rcx+rax+16]
;vs
lea rax, bword ptr [rcx+rax+16]
mov dword ptr [rax], r8d
// Repro variant 1: writes `value` into four fields of entries[index],
// spelling out the array index expression for every store.
// NoInlining asks the JIT not to inline the method into its caller, so it is
// compiled (and disassembled) as a standalone method.
// NOTE(review): Entry is declared elsewhere; the 16-byte element stride in the
// listing for this method suggests a struct of four 4-byte fields -- confirm.
[MethodImpl(MethodImplOptions.NoInlining)]
private static void TestRangeCheckElimination(Entry[] entries, int index, int value)
{
// Keep these four statements exactly as written: the point of the repro is
// how the JIT handles the repeated entries[index] expression.
entries[index].hashCode = value;
entries[index].next = value;
entries[index].key = value;
entries[index].value = value;
}
// Repro variant 2: performs the same four field stores as the indexer-based
// variant, but indexes the array once and keeps the element in a ref local.
// NoInlining asks the JIT not to inline the method into its caller, so it is
// compiled (and disassembled) as a standalone method.
[MethodImpl(MethodImplOptions.NoInlining)]
private static void TestRangeCheckEliminationWithRef(Entry[] entries, int index, int value)
{
// Single array access; every store below goes through this reference.
ref Entry entry = ref entries[index];
entry.hashCode = value;
entry.next = value;
entry.key = value;
entry.value = value;
}
Assembly for TestRangeCheckElimination
; Assembly listing for method TestDotNetCore.Program:TestRangeCheckElimination(ref,int,int)
; Emitting BLENDED_CODE for X64 CPU with AVX
; optimized code
; rsp based frame
; partially interruptible
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 15, 15 ) ref -> rcx class-hnd
; V01 arg1 [V01,T03] ( 12, 12 ) int -> rdx
; V02 arg2 [V02,T01] ( 14, 14 ) int -> r8
; V03 OutArgs [V03 ] ( 1, 1 ) lclBlk (32) [rsp+0x00]
; V04 cse0 [V04,T02] ( 14, 14 ) byref -> rax
; V05 cse1 [V05,T04] ( 10, 10 ) long -> rax
; V06 cse2 [V06,T05] ( 5, 5 ) int -> rax
;
; Lcl frame size = 40
; NOTE(review): lines starting with "; NOTE(review)" are annotations, not JIT output.
; NOTE(review): per the register table above: rcx = arg0, edx = arg1, r8d = arg2.
G_M10648_IG01:
4883EC28 sub rsp, 40
G_M10648_IG02:
; NOTE(review): bounds check. The dword at [rcx+8] (presumably the array length)
; NOTE(review): is loaded into eax first, then compared; jae is an unsigned test,
; NOTE(review): so a negative index also branches to the failure path.
8B4108 mov eax, dword ptr [rcx+8]
3BD0 cmp edx, eax
7322 jae SHORT G_M10648_IG04
; NOTE(review): rax = sign-extended index * 16 (element byte offset).
4863C2 movsxd rax, edx
48C1E004 shl rax, 4
; NOTE(review): the first store uses the full [rcx+rax+16] address mode, and only
; NOTE(review): afterwards is the same address materialized in rax with lea.
; NOTE(review): This store-then-lea order is what the issue is about.
4489440110 mov dword ptr [rcx+rax+16], r8d
488D440110 lea rax, bword ptr [rcx+rax+16]
; NOTE(review): remaining three stores reuse the element address in rax.
44894004 mov dword ptr [rax+4], r8d
44894008 mov dword ptr [rax+8], r8d
4489400C mov dword ptr [rax+12], r8d
G_M10648_IG03:
4883C428 add rsp, 40
C3 ret
G_M10648_IG04:
; NOTE(review): out-of-range path; int3 after the call presumably marks the
; NOTE(review): helper as non-returning.
E86E50305F call CORINFO_HELP_RNGCHKFAIL
CC int3
; Total bytes of code 51, prolog size 4 for method
Assembly for TestRangeCheckEliminationWithRef
; Assembly listing for method TestDotNetCore.Program:TestRangeCheckEliminationWithRef(ref,int,int)
; Emitting BLENDED_CODE for X64 CPU with AVX
; optimized code
; rsp based frame
; partially interruptible
; Final local variable assignments
;
; V00 arg0 [V00,T02] ( 4, 4 ) ref -> rcx class-hnd
; V01 arg1 [V01,T03] ( 4, 4 ) int -> rdx
; V02 arg2 [V02,T01] ( 6, 6 ) int -> r8
; V03 tmp0 [V03,T00] ( 5, 10 ) byref -> rax
; V04 OutArgs [V04 ] ( 1, 1 ) lclBlk (32) [rsp+0x00]
;
; Lcl frame size = 40
; NOTE(review): lines starting with "; NOTE(review)" are annotations, not JIT output.
; NOTE(review): per the register table above: rcx = arg0, edx = arg1, r8d = arg2.
G_M60939_IG01:
4883EC28 sub rsp, 40
G_M60939_IG02:
; NOTE(review): bounds check with the memory operand folded into cmp (no
; NOTE(review): separate load of [rcx+8]); jae is an unsigned test, so a
; NOTE(review): negative index also branches to the failure path.
3B5108 cmp edx, dword ptr [rcx+8]
7320 jae SHORT G_M60939_IG04
; NOTE(review): rax = sign-extended index * 16 (element byte offset).
4863C2 movsxd rax, edx
48C1E004 shl rax, 4
; NOTE(review): the element address is computed once up front; all four stores,
; NOTE(review): including the first, then go through rax with short encodings.
488D440110 lea rax, bword ptr [rcx+rax+16]
448900 mov dword ptr [rax], r8d
44894004 mov dword ptr [rax+4], r8d
44894008 mov dword ptr [rax+8], r8d
4489400C mov dword ptr [rax+12], r8d
G_M60939_IG03:
4883C428 add rsp, 40
C3 ret
G_M60939_IG04:
; NOTE(review): out-of-range path; int3 after the call presumably marks the
; NOTE(review): helper as non-returning.
E82250305F call CORINFO_HELP_RNGCHKFAIL
CC int3
; Total bytes of code 47, prolog size 4 for method
cc @mikedn
category:cq
theme:basic-cq
skill-level:expert
cost:medium
I don't think there is a perf impact. It's just an interesting case which may point to something important.
The difference is in these lines:
Assembly for TestRangeCheckElimination
Assembly for TestRangeCheckEliminationWithRef
cc @mikedn
category:cq
theme:basic-cq
skill-level:expert
cost:medium