Using go1.10.1
Consider the following benchmark:
// Scalar is the struct value that the benchmarks in this report construct
// and copy into a package-level variable.
type Scalar struct {
	// incomparable is a zero-length array of func values. Func values cannot
	// be compared with ==, so this field makes Scalar itself non-comparable.
	incomparable [0]func()

	// A and B are pointer fields; the benchmarks leave them at nil.
	A *int
	B *int

	// C and D are the integer fields the benchmarks populate.
	C uint64
	D uint64
}
// Foo returns a Scalar whose C field is v converted to uint64 and whose D
// field is 5. All other fields are left at their zero values.
func Foo(v int64) Scalar {
return Scalar{C: uint64(v), D: 5}
}
// sink is the package-level variable that both benchmarks assign their
// Scalar result to on every iteration.
var sink Scalar
// BenchmarkA assigns a Scalar composite literal directly to sink on each of
// the b.N iterations, with C set to the loop index and D set to 5.
func BenchmarkA(b *testing.B) {
for i := 0; i < b.N; i++ {
sink = Scalar{C: uint64(i), D: 5}
}
}
// BenchmarkB stores the same value as BenchmarkA into sink on each of the b.N
// iterations, but builds it by calling Foo with the loop index instead of
// writing the composite literal inline.
func BenchmarkB(b *testing.B) {
for i := 0; i < b.N; i++ {
sink = Foo(int64(i))
}
}
On my machine, this takes:
BenchmarkA-8 200000000 6.15 ns/op
BenchmarkB-8 200000000 8.69 ns/op
which is an abnormally high amount of time to copy 32 bytes.
If you comment out the incomparable field, the benchmarks dramatically improve:
BenchmarkA-8 2000000000 0.84 ns/op
BenchmarkB-8 2000000000 0.84 ns/op
I don't quite see how a zero-length field could make such a big impact on performance.
The assembly for BenchmarkB (with incomparable):
0x0000 00000 TEXT "".BenchmarkB(SB), $104-8
0x0000 00000 MOVQ (TLS), CX
0x0009 00009 CMPQ SP, 16(CX)
0x000d 00013 JLS 241
0x0013 00019 SUBQ $104, SP
0x0017 00023 MOVQ BP, 96(SP)
0x001c 00028 LEAQ 96(SP), BP
0x0021 00033 FUNCDATA $0, gclocals·a36216b97439c93dafebe03e7f0808b5(SB)
0x0021 00033 FUNCDATA $1, gclocals·1d0ed49f611d7e40a62328b5976a2ede(SB)
0x0021 00033 XORL AX, AX
0x0023 00035 JMP 40
0x0025 00037 INCQ AX
0x0028 00040 MOVQ "".b+112(SP), CX
0x002d 00045 MOVQ 240(CX), DX
0x0034 00052 CMPQ AX, DX
0x0037 00055 JGE 231
0x003d 00061 XORPS X0, X0
0x0040 00064 MOVUPS X0, "".~r1+32(SP)
0x0045 00069 MOVUPS X0, "".~r1+48(SP)
0x004a 00074 MOVUPS X0, ""..autotmp_6+64(SP)
0x004f 00079 MOVUPS X0, ""..autotmp_6+80(SP)
0x0054 00084 MOVQ AX, ""..autotmp_6+80(SP)
0x0059 00089 MOVQ $5, ""..autotmp_6+88(SP)
0x0062 00098 LEAQ "".~r1+32(SP), DI
0x0067 00103 LEAQ ""..autotmp_6+64(SP), SI
0x006c 00108 DUFFCOPY $868
0x007f 00127 MOVL runtime.writeBarrier(SB), DX
0x0085 00133 TESTL DX, DX
0x0087 00135 JNE 173
0x0089 00137 LEAQ "".sink(SB), DI
0x0090 00144 LEAQ "".~r1+32(SP), SI
0x0095 00149 DUFFCOPY $868
0x00a8 00168 JMP 37
0x00ad 00173 MOVQ AX, ""..autotmp_7+24(SP)
0x00b2 00178 LEAQ type."".Scalar(SB), AX
0x00b9 00185 MOVQ AX, (SP)
0x00bd 00189 LEAQ "".sink(SB), AX
0x00c4 00196 MOVQ AX, 8(SP)
0x00c9 00201 LEAQ "".~r1+32(SP), CX
0x00ce 00206 MOVQ CX, 16(SP)
0x00d3 00211 PCDATA $0, $0
0x00d3 00211 CALL runtime.typedmemmove(SB)
0x00d8 00216 MOVQ ""..autotmp_7+24(SP), AX
0x00dd 00221 MOVQ "".b+112(SP), CX
0x00e2 00226 JMP 37
0x00e7 00231 MOVQ 96(SP), BP
0x00ec 00236 ADDQ $104, SP
0x00f0 00240 RET
0x00f1 00241 NOP
0x00f1 00241 PCDATA $0, $-1
0x00f1 00241 CALL runtime.morestack_noctxt(SB)
0x00f6 00246 JMP 0
The assembly for BenchmarkA (with incomparable):
0x0000 00000 TEXT "".BenchmarkA(SB), $72-8
0x0000 00000 MOVQ (TLS), CX
0x0009 00009 CMPQ SP, 16(CX)
0x000d 00013 JLS 199
0x0013 00019 SUBQ $72, SP
0x0017 00023 MOVQ BP, 64(SP)
0x001c 00028 LEAQ 64(SP), BP
0x0021 00033 FUNCDATA $0, gclocals·a36216b97439c93dafebe03e7f0808b5(SB)
0x0021 00033 FUNCDATA $1, gclocals·ff19ed39bdde8a01a800918ac3ef0ec7(SB)
0x0021 00033 XORL AX, AX
0x0023 00035 JMP 40
0x0025 00037 INCQ AX
0x0028 00040 MOVQ "".b+80(SP), CX
0x002d 00045 MOVQ 240(CX), DX
0x0034 00052 CMPQ AX, DX
0x0037 00055 JGE 189
0x003d 00061 XORPS X0, X0
0x0040 00064 MOVUPS X0, ""..autotmp_3+32(SP)
0x0045 00069 MOVUPS X0, ""..autotmp_3+48(SP)
0x004a 00074 MOVQ AX, ""..autotmp_3+48(SP)
0x004f 00079 MOVQ $5, ""..autotmp_3+56(SP)
0x0058 00088 MOVL runtime.writeBarrier(SB), DX
0x005e 00094 TESTL DX, DX
0x0060 00096 JNE 131
0x0062 00098 LEAQ "".sink(SB), DI
0x0069 00105 LEAQ ""..autotmp_3+32(SP), SI
0x006e 00110 DUFFCOPY $868
0x0081 00129 JMP 37
0x0083 00131 MOVQ AX, "".i+24(SP)
0x0088 00136 LEAQ type."".Scalar(SB), AX
0x008f 00143 MOVQ AX, (SP)
0x0093 00147 LEAQ "".sink(SB), AX
0x009a 00154 MOVQ AX, 8(SP)
0x009f 00159 LEAQ ""..autotmp_3+32(SP), CX
0x00a4 00164 MOVQ CX, 16(SP)
0x00a9 00169 PCDATA $0, $0
0x00a9 00169 CALL runtime.typedmemmove(SB)
0x00ae 00174 MOVQ "".i+24(SP), AX
0x00b3 00179 MOVQ "".b+80(SP), CX
0x00b8 00184 JMP 37
0x00bd 00189 MOVQ 64(SP), BP
0x00c2 00194 ADDQ $72, SP
0x00c6 00198 RET
0x00c7 00199 NOP
0x00c7 00199 PCDATA $0, $-1
0x00c7 00199 CALL runtime.morestack_noctxt(SB)
0x00cc 00204 JMP 0
The assembly for both BenchmarkA and BenchmarkB (without incomparable):
0x0000 00000 TEXT "".BenchmarkA(SB), $8-8
0x0000 00000 MOVQ (TLS), CX
0x0009 00009 CMPQ SP, 16(CX)
0x000d 00013 JLS 134
0x000f 00015 SUBQ $8, SP
0x0013 00019 MOVQ BP, (SP)
0x0017 00023 LEAQ (SP), BP
0x001b 00027 FUNCDATA $0, gclocals·a36216b97439c93dafebe03e7f0808b5(SB)
0x001b 00027 FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
0x001b 00027 MOVQ "".b+16(SP), CX
0x0020 00032 XORL AX, AX
0x0022 00034 JMP 39
0x0024 00036 INCQ AX
0x0027 00039 MOVQ 240(CX), DX
0x002e 00046 CMPQ AX, DX
0x0031 00049 JGE 125
0x0033 00051 MOVQ AX, "".sink+16(SB)
0x003a 00058 MOVQ $5, "".sink+24(SB)
0x0045 00069 MOVL runtime.writeBarrier(SB), DX
0x004b 00075 TESTL DX, DX
0x004d 00077 JNE 91
0x004f 00079 XORPS X0, X0
0x0052 00082 MOVUPS X0, "".sink(SB)
0x0059 00089 JMP 36
0x005b 00091 LEAQ "".sink(SB), DI
0x0062 00098 MOVQ AX, DX
0x0065 00101 XORL AX, AX
0x0067 00103 CALL runtime.gcWriteBarrier(SB)
0x006c 00108 LEAQ "".sink+8(SB), DI
0x0073 00115 CALL runtime.gcWriteBarrier(SB)
0x0078 00120 MOVQ DX, AX
0x007b 00123 JMP 36
0x007d 00125 MOVQ (SP), BP
0x0081 00129 ADDQ $8, SP
0x0085 00133 RET
0x0086 00134 NOP
0x0086 00134 PCDATA $0, $-1
0x0086 00134 CALL runtime.morestack_noctxt(SB)
0x008b 00139 JMP 0
The presence of the incomparable field seems to trigger the use of runtime.duffcopy, which is slow on short copies.
Using go1.10.1
Consider the following benchmark:
On my machine, this takes:
which is an abnormally high amount of time to copy 32 bytes.
If you comment out the incomparable field, the benchmarks dramatically improve:
I don't quite see how a zero-length field could make such a big impact on performance.
The assembly for BenchmarkB (with incomparable):
The assembly for BenchmarkA (with incomparable):
The assembly for both BenchmarkA and BenchmarkB (without incomparable):
The presence of the incomparable field seems to trigger the use of runtime.duffcopy, which is slow on short copies.