cmd/compile: improve atomic add intrinsics with ARMv8.1 new instruction
ARMv8.1 adds a new instruction (LDADDAL) for atomic memory operations. This
CL improves the existing atomic add intrinsics with the new instruction.
Since the instruction is only guaranteed to be present from ARMv8.1 onwards,
its use is guarded by a run-time check of the CPU feature.
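
For context, a rough sketch of the two instruction sequences involved (Go
assembler syntax; register choices illustrative, not taken verbatim from the
tree). The pre-ARMv8.1 path is a load-linked/store-conditional retry loop;
the ARMv8.1 path is a single LDADDAL followed by a plain ADD, since LDADDAL
returns the old value and Xadd must return the new one:

    // ARMv8.0 fallback (what LoweredAtomicAdd64 expands to)
    again:
        LDAXR    (R0), R2         // load-exclusive old value
        ADD      R1, R2           // R2 = old + delta
        STLXR    R2, (R0), R3     // try to store; R3 = 0 on success
        CBNZ     R3, again        // retry if the exclusive store failed

    // ARMv8.1 (LoweredAtomicAdd64Variant)
        LDADDALD R1, (R0), R2     // *R0 += R1 atomically; old value -> R2
        ADD      R1, R2           // R2 = old + delta = new value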

Performance results on an ARMv8.1 machine:
name        old time/op  new time/op  delta
Xadd-224    1.05µs ± 6%  0.02µs ± 4%  -98.06%  (p=0.000 n=10+8)
Xadd64-224  1.05µs ± 3%  0.02µs ±13%  -98.10%  (p=0.000 n=9+10)
[Geo mean]  1.05µs       0.02µs       -98.08%

Performance results on an ARMv8.0 machine:
name        old time/op  new time/op  delta
Xadd-46      538ns ± 1%   541ns ± 1%  +0.62%  (p=0.000 n=9+9)
Xadd64-46    505ns ± 1%   508ns ± 0%  +0.48%  (p=0.003 n=9+8)
[Geo mean]   521ns        524ns       +0.55%

Change-Id: If4b5d8d0e2d6f84fe1492a4f5de0789910ad0ee9
Reviewed-on: https://go-review.googlesource.com/81877
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Wei Xiao authored and cherrymui committed Jun 21, 2018
1 parent 1988b3e commit 0a7ac93
Showing 16 changed files with 211 additions and 6 deletions.
3 changes: 2 additions & 1 deletion src/cmd/asm/internal/arch/arm64.go
@@ -77,7 +77,8 @@ func IsARM64STLXR(op obj.As) bool {
arm64.ALDADDB, arm64.ALDADDH, arm64.ALDADDW, arm64.ALDADDD,
arm64.ALDANDB, arm64.ALDANDH, arm64.ALDANDW, arm64.ALDANDD,
arm64.ALDEORB, arm64.ALDEORH, arm64.ALDEORW, arm64.ALDEORD,
arm64.ALDORB, arm64.ALDORH, arm64.ALDORW, arm64.ALDORD:
arm64.ALDORB, arm64.ALDORH, arm64.ALDORW, arm64.ALDORD,
arm64.ALDADDALD, arm64.ALDADDALW:
return true
}
return false
2 changes: 2 additions & 0 deletions src/cmd/asm/internal/asm/testdata/arm64.s
@@ -604,6 +604,8 @@ again:
LDORH R5, (RSP), R7 // e7332578
LDORB R5, (R6), R7 // c7302538
LDORB R5, (RSP), R7 // e7332538
LDADDALD R2, (R1), R3 // 2300e2f8
LDADDALW R5, (R4), R6 // 8600e5b8

// RET
//
22 changes: 22 additions & 0 deletions src/cmd/compile/internal/arm64/ssa.go
@@ -553,6 +553,28 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p3.From.Reg = arm64.REGTMP
p3.To.Type = obj.TYPE_BRANCH
gc.Patch(p3, p)
case ssa.OpARM64LoweredAtomicAdd64Variant,
ssa.OpARM64LoweredAtomicAdd32Variant:
// LDADDAL Rarg1, (Rarg0), Rout
// ADD Rarg1, Rout
op := arm64.ALDADDALD
if v.Op == ssa.OpARM64LoweredAtomicAdd32Variant {
op = arm64.ALDADDALW
}
r0 := v.Args[0].Reg()
r1 := v.Args[1].Reg()
out := v.Reg0()
p := s.Prog(op)
p.From.Type = obj.TYPE_REG
p.From.Reg = r1
p.To.Type = obj.TYPE_MEM
p.To.Reg = r0
p.RegTo2 = out
p1 := s.Prog(arm64.AADD)
p1.From.Type = obj.TYPE_REG
p1.From.Reg = r1
p1.To.Type = obj.TYPE_REG
p1.To.Reg = out
case ssa.OpARM64LoweredAtomicCas64,
ssa.OpARM64LoweredAtomicCas32:
// LDAXR (Rarg0), Rtmp
1 change: 1 addition & 0 deletions src/cmd/compile/internal/gc/go.go
@@ -303,6 +303,7 @@ var (
racewriterange,
supportPopcnt,
supportSSE41,
arm64SupportAtomics,
typedmemclr,
typedmemmove,
Udiv,
47 changes: 45 additions & 2 deletions src/cmd/compile/internal/gc/ssa.go
@@ -78,6 +78,7 @@ func initssaconfig() {
racewriterange = sysfunc("racewriterange")
supportPopcnt = sysfunc("support_popcnt")
supportSSE41 = sysfunc("support_sse41")
arm64SupportAtomics = sysfunc("arm64_support_atomics")
typedmemclr = sysfunc("typedmemclr")
typedmemmove = sysfunc("typedmemmove")
Udiv = sysfunc("udiv")
@@ -2935,14 +2936,56 @@ func init() {
s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[TUINT32], v)
},
sys.AMD64, sys.ARM64, sys.S390X, sys.MIPS, sys.MIPS64, sys.PPC64)
sys.AMD64, sys.S390X, sys.MIPS, sys.MIPS64, sys.PPC64)
addF("runtime/internal/atomic", "Xadd64",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
v := s.newValue3(ssa.OpAtomicAdd64, types.NewTuple(types.Types[TUINT64], types.TypeMem), args[0], args[1], s.mem())
s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[TUINT64], v)
},
sys.AMD64, sys.ARM64, sys.S390X, sys.MIPS64, sys.PPC64)
sys.AMD64, sys.S390X, sys.MIPS64, sys.PPC64)

makeXaddARM64 := func(op0 ssa.Op, op1 ssa.Op, ty types.EType) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
// Target Atomic feature is identified by dynamic detection
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), arm64SupportAtomics, s.sb)
v := s.load(types.Types[TBOOL], addr)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
bTrue := s.f.NewBlock(ssa.BlockPlain)
bFalse := s.f.NewBlock(ssa.BlockPlain)
bEnd := s.f.NewBlock(ssa.BlockPlain)
b.AddEdgeTo(bTrue)
b.AddEdgeTo(bFalse)
b.Likely = ssa.BranchUnlikely // most machines don't have Atomics nowadays

// We have atomic instructions - use it directly.
s.startBlock(bTrue)
v0 := s.newValue3(op1, types.NewTuple(types.Types[ty], types.TypeMem), args[0], args[1], s.mem())
s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v0)
s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[ty], v0)
s.endBlock().AddEdgeTo(bEnd)

// Use original instruction sequence.
s.startBlock(bFalse)
v1 := s.newValue3(op0, types.NewTuple(types.Types[ty], types.TypeMem), args[0], args[1], s.mem())
s.vars[&memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v1)
s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[ty], v1)
s.endBlock().AddEdgeTo(bEnd)

// Merge results.
s.startBlock(bEnd)
return s.variable(n, types.Types[ty])
}
}

addF("runtime/internal/atomic", "Xadd",
makeXaddARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, TUINT32),
sys.ARM64)
addF("runtime/internal/atomic", "Xadd64",
makeXaddARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, TUINT64),
sys.ARM64)

addF("runtime/internal/atomic", "Cas",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
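
The makeXaddARM64 helper above builds this dispatch directly in SSA; a rough
source-level equivalent is sketched below. All names here are illustrative
stand-ins (arm64HasAtomics for the runtime's arm64_support_atomics flag, the
two callees for the AtomicAdd32 / AtomicAdd32Variant ops), and the stubs are
deliberately non-atomic:

    // arm64HasAtomics stands in for arm64_support_atomics, which cpuinit
    // fills in from cpu.ARM64.HasATOMICS at startup.
    var arm64HasAtomics bool

    // Non-atomic stubs standing in for the two SSA ops.
    func atomicAdd32Variant(ptr *uint32, delta uint32) uint32 { *ptr += delta; return *ptr } // LDADDALW + ADD
    func atomicAdd32(ptr *uint32, delta uint32) uint32        { *ptr += delta; return *ptr } // LDAXRW/ADD/STLXRW loop

    // xadd32 mirrors the control flow the intrinsic builder emits.
    func xadd32(ptr *uint32, delta uint32) uint32 {
        if arm64HasAtomics { // branch is marked BranchUnlikely
            return atomicAdd32Variant(ptr, delta)
        }
        return atomicAdd32(ptr, delta)
    }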
3 changes: 3 additions & 0 deletions src/cmd/compile/internal/ssa/gen/ARM64.rules
@@ -544,6 +544,9 @@
(AtomicAnd8 ptr val mem) -> (Select1 (LoweredAtomicAnd8 ptr val mem))
(AtomicOr8 ptr val mem) -> (Select1 (LoweredAtomicOr8 ptr val mem))

(AtomicAdd32Variant ptr val mem) -> (LoweredAtomicAdd32Variant ptr val mem)
(AtomicAdd64Variant ptr val mem) -> (LoweredAtomicAdd64Variant ptr val mem)

// Write barrier.
(WB {fn} destptr srcptr mem) -> (LoweredWB {fn} destptr srcptr mem)

7 changes: 7 additions & 0 deletions src/cmd/compile/internal/ssa/gen/ARM64Ops.go
@@ -578,6 +578,13 @@ func init() {
{name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
{name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},

// atomic add variant.
// *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
// LDADDAL (Rarg0), Rarg1, Rout
// ADD Rarg1, Rout
{name: "LoweredAtomicAdd64Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},
{name: "LoweredAtomicAdd32Variant", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true},

// atomic compare and swap.
// arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory. auxint must be zero.
// if *arg0 == arg1 {
7 changes: 7 additions & 0 deletions src/cmd/compile/internal/ssa/gen/genericOps.go
@@ -515,6 +515,13 @@ var genericOps = []opData{
{name: "AtomicAnd8", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 &= arg1. arg2=memory. Returns memory.
{name: "AtomicOr8", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 |= arg1. arg2=memory. Returns memory.

// Atomic operation variants
// These variants have the same semantics as above atomic operations.
// But they are used for generating more efficient code on certain modern machines, with run-time CPU feature detection.
// Currently, they are used on ARM64 only.
{name: "AtomicAdd32Variant", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory.
{name: "AtomicAdd64Variant", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory.

// Clobber experiment op
{name: "Clobber", argLength: 0, typ: "Void", aux: "SymOff", symEffect: "None"}, // write an invalid pointer value to the given pointer slot of a stack variable
}
48 changes: 48 additions & 0 deletions src/cmd/compile/internal/ssa/opGen.go

Generated file; diff not rendered by default.

36 changes: 36 additions & 0 deletions src/cmd/compile/internal/ssa/rewriteARM64.go

Generated file; diff not rendered by default.

2 changes: 2 additions & 0 deletions src/cmd/internal/obj/arm64/a.out.go
@@ -594,6 +594,8 @@ const (
AHVC
AIC
AISB
ALDADDALD
ALDADDALW
ALDADDB
ALDADDH
ALDADDW
2 changes: 2 additions & 0 deletions src/cmd/internal/obj/arm64/anames.go
@@ -96,6 +96,8 @@ var Anames = []string{
"HVC",
"IC",
"ISB",
"LDADDALD",
"LDADDALW",
"LDADDB",
"LDADDH",
"LDADDW",
12 changes: 9 additions & 3 deletions src/cmd/internal/obj/arm64/asm7.go
@@ -2011,6 +2011,8 @@ func buildop(ctxt *obj.Link) {
oprangeset(ASWPB, t)
oprangeset(ASWPH, t)
oprangeset(ASWPW, t)
oprangeset(ALDADDALD, t)
oprangeset(ALDADDALW, t)
oprangeset(ALDADDB, t)
oprangeset(ALDADDH, t)
oprangeset(ALDADDW, t)
@@ -3363,9 +3365,9 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
rt := p.RegTo2
rb := p.To.Reg
switch p.As {
case ASWPD, ALDADDD, ALDANDD, ALDEORD, ALDORD: // 64-bit
case ASWPD, ALDADDALD, ALDADDD, ALDANDD, ALDEORD, ALDORD: // 64-bit
o1 = 3 << 30
case ASWPW, ALDADDW, ALDANDW, ALDEORW, ALDORW: // 32-bit
case ASWPW, ALDADDALW, ALDADDW, ALDANDW, ALDEORW, ALDORW: // 32-bit
o1 = 2 << 30
case ASWPH, ALDADDH, ALDANDH, ALDEORH, ALDORH: // 16-bit
o1 = 1 << 30
@@ -3377,7 +3379,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
switch p.As {
case ASWPD, ASWPW, ASWPH, ASWPB:
o1 |= 0x20 << 10
case ALDADDD, ALDADDW, ALDADDH, ALDADDB:
case ALDADDALD, ALDADDALW, ALDADDD, ALDADDW, ALDADDH, ALDADDB:
o1 |= 0x00 << 10
case ALDANDD, ALDANDW, ALDANDH, ALDANDB:
o1 |= 0x04 << 10
@@ -3386,6 +3388,10 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
case ALDORD, ALDORW, ALDORH, ALDORB:
o1 |= 0x0c << 10
}
switch p.As {
case ALDADDALD, ALDADDALW:
o1 |= 3 << 22
}
o1 |= 0x1c1<<21 | uint32(rs&31)<<16 | uint32(rb&31)<<5 | uint32(rt&31)

case 50: /* sys/sysl */
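
As a sanity check on the encoding path above: OR-ing the same fields together
reproduces the machine words expected by the assembler test (2300e2f8 and
8600e5b8 there are little-endian byte sequences). A small standalone sketch,
with the helper name and field breakdown chosen here for illustration:

    package main

    import "fmt"

    // ldaddalEncoding mirrors the bit assembly in asm7.go for LDADDAL{D,W}:
    // size in bits 31:30, acquire/release in bits 23:22, the atomic
    // memory-operation class 0x1c1<<21, the LDADD opcode group, then Rs, Rn, Rt.
    func ldaddalEncoding(size, rs, rn, rt uint32) uint32 {
        o := size << 30          // 3 = 64-bit (LDADDALD), 2 = 32-bit (LDADDALW)
        o |= 3 << 22             // A=1, R=1: the acquire-release "AL" variant
        o |= 0x1c1 << 21         // atomic memory operation class
        o |= 0x00 << 10          // LDADD opcode group
        o |= rs<<16 | rn<<5 | rt // Rs (addend), Rn (base), Rt (old value)
        return o
    }

    func main() {
        fmt.Printf("%08x\n", ldaddalEncoding(3, 2, 1, 3)) // f8e20023: LDADDALD R2, (R1), R3
        fmt.Printf("%08x\n", ldaddalEncoding(2, 5, 4, 6)) // b8e50086: LDADDALW R5, (R4), R6
    }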
20 changes: 20 additions & 0 deletions src/runtime/internal/atomic/bench_test.go
@@ -42,3 +42,23 @@ func BenchmarkAtomicStore(b *testing.B) {
atomic.Store(&x, 0)
}
}

func BenchmarkXadd(b *testing.B) {
var x uint32
ptr := &x
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
atomic.Xadd(ptr, 1)
}
})
}

func BenchmarkXadd64(b *testing.B) {
var x uint64
ptr := &x
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
atomic.Xadd64(ptr, 1)
}
})
}
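
The -224 and -46 suffixes in the results above are the GOMAXPROCS values the
benchmarks ran with. For user code, the public sync/atomic Add functions are
aliased to the same Xadd intrinsics by the compiler (an assumption based on
the existing intrinsic aliases, not something changed in this CL), so a
benchmark like the following sketch should show a similar improvement on
ARMv8.1 hardware:

    package atomicbench // hypothetical test package

    import (
        "sync/atomic"
        "testing"
    )

    func BenchmarkAddUint64(b *testing.B) {
        var x uint64
        b.RunParallel(func(pb *testing.PB) {
            for pb.Next() {
                atomic.AddUint64(&x, 1)
            }
        })
    }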
2 changes: 2 additions & 0 deletions src/runtime/proc.go
@@ -517,6 +517,8 @@ func cpuinit() {
support_popcnt = cpu.X86.HasPOPCNT
support_sse2 = cpu.X86.HasSSE2
support_sse41 = cpu.X86.HasSSE41

arm64_support_atomics = cpu.ARM64.HasATOMICS
}

// The bootstrap sequence is:
