Skip to content

Commit

Permalink
cmd/compile: intrinsify math/bits.Add on amd64
Browse files Browse the repository at this point in the history
name             old time/op  new time/op  delta
Add-8            1.11ns ± 0%  1.18ns ± 0%   +6.31%  (p=0.029 n=4+4)
Add32-8          1.02ns ± 0%  1.02ns ± 1%     ~     (p=0.333 n=4+5)
Add64-8          1.11ns ± 1%  1.17ns ± 0%   +5.79%  (p=0.008 n=5+5)
Add64multiple-8  4.35ns ± 1%  0.86ns ± 0%  -80.22%  (p=0.000 n=5+4)

The individual ops are a bit slower (but still very fast).
Using the ops in carry chains is very fast.

Update #28273

Change-Id: Id975f76df2b930abf0e412911d327b6c5b1befe5
Reviewed-on: https://go-review.googlesource.com/c/144257
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
  • Loading branch information
randall77 committed Oct 25, 2018
1 parent f6b554f commit 899f3a2
Show file tree
Hide file tree
Showing 10 changed files with 493 additions and 12 deletions.
38 changes: 38 additions & 0 deletions src/cmd/compile/internal/amd64/ssa.go
Expand Up @@ -360,6 +360,34 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_REG
p.To.Reg = r

case ssa.OpAMD64ADDQcarry, ssa.OpAMD64ADCQ:
r := v.Reg0()
r0 := v.Args[0].Reg()
r1 := v.Args[1].Reg()
switch r {
case r0:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = r1
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case r1:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = r0
p.To.Type = obj.TYPE_REG
p.To.Reg = r
default:
v.Fatalf("output not in same register as an input %s", v.LongString())
}

case ssa.OpAMD64ADDQconstcarry, ssa.OpAMD64ADCQconst:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg0()

case ssa.OpAMD64ADDQconst, ssa.OpAMD64ADDLconst:
r := v.Reg()
a := v.Args[0].Reg()
Expand Down Expand Up @@ -946,6 +974,16 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p := s.Prog(v.Op.Asm())
p.To.Type = obj.TYPE_REG
p.To.Reg = r

case ssa.OpAMD64NEGLflags:
r := v.Reg0()
if r != v.Args[0].Reg() {
v.Fatalf("input[0] and output not in same register %s", v.LongString())
}
p := s.Prog(v.Op.Asm())
p.To.Type = obj.TYPE_REG
p.To.Reg = r

case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ, ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
Expand Down
7 changes: 7 additions & 0 deletions src/cmd/compile/internal/gc/ssa.go
Expand Up @@ -3481,6 +3481,13 @@ func init() {
},
sys.AMD64, sys.ARM64, sys.PPC64)

addF("math/bits", "Add64",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[TUINT64], types.Types[TUINT64]), args[0], args[1], args[2])
},
sys.AMD64)
alias("math/bits", "Add", "math/bits", "Add64", sys.ArchAMD64)

/******** sync/atomic ********/

// Note: these are disabled by flag_race in findIntrinsic below.
Expand Down
13 changes: 13 additions & 0 deletions src/cmd/compile/internal/ssa/gen/AMD64.rules
Expand Up @@ -29,6 +29,19 @@
(Div8u x y) -> (Select0 (DIVWU (ZeroExt8to16 x) (ZeroExt8to16 y)))
(Div(32|64)F x y) -> (DIVS(S|D) x y)

(Select0 (Add64carry x y c)) ->
(Select0 <typ.UInt64> (ADCQ x y (Select1 <types.TypeFlags> (NEGLflags c))))
(Select1 (Add64carry x y c)) ->
(NEGQ <typ.UInt64> (SBBQcarrymask <typ.UInt64> (Select1 <types.TypeFlags> (ADCQ x y (Select1 <types.TypeFlags> (NEGLflags c))))))

// Optimize ADCQ and friends
(ADCQ x (MOVQconst [c]) carry) && is32Bit(c) -> (ADCQconst x [c] carry)
(ADCQ x y (FlagEQ)) -> (ADDQcarry x y)
(ADCQconst x [c] (FlagEQ)) -> (ADDQconstcarry x [c])
(ADDQcarry x (MOVQconst [c])) && is32Bit(c) -> (ADDQconstcarry x [c])
(Select1 (NEGLflags (MOVQconst [0]))) -> (FlagEQ)
(Select1 (NEGLflags (NEGQ (SBBQcarrymask x)))) -> x

(Mul64uhilo x y) -> (MULQU2 x y)
(Div128u xhi xlo y) -> (DIVQU2 xhi xlo y)

Expand Down
32 changes: 21 additions & 11 deletions src/cmd/compile/internal/ssa/gen/AMD64Ops.go
Expand Up @@ -107,24 +107,27 @@ func init() {

// Common regInfo
var (
gp01 = regInfo{inputs: nil, outputs: gponly}
gp11 = regInfo{inputs: []regMask{gp}, outputs: gponly}
gp11sp = regInfo{inputs: []regMask{gpsp}, outputs: gponly}
gp11sb = regInfo{inputs: []regMask{gpspsb}, outputs: gponly}
gp21 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
gp21sp = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly}
gp21sb = regInfo{inputs: []regMask{gpspsb, gpsp}, outputs: gponly}
gp21shift = regInfo{inputs: []regMask{gp, cx}, outputs: []regMask{gp}}
gp11div = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{ax, dx}}
gp21hmul = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx}, clobbers: ax}
gp01 = regInfo{inputs: nil, outputs: gponly}
gp11 = regInfo{inputs: []regMask{gp}, outputs: gponly}
gp11sp = regInfo{inputs: []regMask{gpsp}, outputs: gponly}
gp11sb = regInfo{inputs: []regMask{gpspsb}, outputs: gponly}
gp21 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
gp21sp = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly}
gp21sb = regInfo{inputs: []regMask{gpspsb, gpsp}, outputs: gponly}
gp21shift = regInfo{inputs: []regMask{gp, cx}, outputs: []regMask{gp}}
gp11div = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{ax, dx}}
gp21hmul = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx}, clobbers: ax}
gp21flags = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp, 0}}
gp2flags1flags = regInfo{inputs: []regMask{gp, gp, 0}, outputs: []regMask{gp, 0}}

gp2flags = regInfo{inputs: []regMask{gpsp, gpsp}}
gp1flags = regInfo{inputs: []regMask{gpsp}}
gp0flagsLoad = regInfo{inputs: []regMask{gpspsb, 0}}
gp1flagsLoad = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
flagsgp = regInfo{inputs: nil, outputs: gponly}

gp11flags = regInfo{inputs: []regMask{gp}, outputs: []regMask{gp, 0}}
gp11flags = regInfo{inputs: []regMask{gp}, outputs: []regMask{gp, 0}}
gp1flags1flags = regInfo{inputs: []regMask{gp, 0}, outputs: []regMask{gp, 0}}

readflags = regInfo{inputs: nil, outputs: gponly}
flagsgpax = regInfo{inputs: nil, clobbers: ax, outputs: []regMask{gp &^ ax}}
Expand Down Expand Up @@ -229,6 +232,13 @@ func init() {
{name: "DIVLU", argLength: 2, reg: gp11div, typ: "(UInt32,UInt32)", asm: "DIVL", clobberFlags: true}, // [arg0 / arg1, arg0 % arg1]
{name: "DIVWU", argLength: 2, reg: gp11div, typ: "(UInt16,UInt16)", asm: "DIVW", clobberFlags: true}, // [arg0 / arg1, arg0 % arg1]

{name: "NEGLflags", argLength: 1, reg: gp11flags, typ: "(UInt32,Flags)", asm: "NEGL", resultInArg0: true}, // -arg0, flags set for 0-arg0.
// The following 4 add opcodes return the low 64 bits of the sum in the first result and
// the carry (the 65th bit) in the carry flag.
{name: "ADDQcarry", argLength: 2, reg: gp21flags, typ: "(UInt64,Flags)", asm: "ADDQ", commutative: true, resultInArg0: true}, // r = arg0+arg1
{name: "ADCQ", argLength: 3, reg: gp2flags1flags, typ: "(UInt64,Flags)", asm: "ADCQ", commutative: true, resultInArg0: true}, // r = arg0+arg1+carry(arg2)
{name: "ADDQconstcarry", argLength: 1, reg: gp11flags, typ: "(UInt64,Flags)", asm: "ADDQ", aux: "Int32", resultInArg0: true}, // r = arg0+auxint
{name: "ADCQconst", argLength: 2, reg: gp1flags1flags, typ: "(UInt64,Flags)", asm: "ADCQ", aux: "Int32", resultInArg0: true}, // r = arg0+auxint+carry(arg1)
{name: "MULQU2", argLength: 2, reg: regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx, ax}}, commutative: true, asm: "MULQ", clobberFlags: true}, // arg0 * arg1, returns (hi, lo)
{name: "DIVQU2", argLength: 3, reg: regInfo{inputs: []regMask{dx, ax, gpsp}, outputs: []regMask{ax, dx}}, asm: "DIVQ", clobberFlags: true}, // arg0:arg1 / arg2 (128-bit divided by 64-bit), returns (q, r)

Expand Down
2 changes: 2 additions & 0 deletions src/cmd/compile/internal/ssa/gen/genericOps.go
Expand Up @@ -491,6 +491,8 @@ var genericOps = []opData{
{name: "Sub32carry", argLength: 2, typ: "(UInt32,Flags)"}, // arg0 - arg1, returns (value, carry)
{name: "Sub32withcarry", argLength: 3}, // arg0 - arg1 - arg2, arg2=carry (0 or 1)

{name: "Add64carry", argLength: 3, commutative: true, typ: "(UInt64,UInt64)"}, // arg0 + arg1 + arg2, arg2 must be 0 or 1. returns (value, value>>64)

{name: "Signmask", argLength: 1, typ: "Int32"}, // 0 if arg0 >= 0, -1 if arg0 < 0
{name: "Zeromask", argLength: 1, typ: "UInt32"}, // 0 if arg0 == 0, 0xffffffff if arg0 != 0
{name: "Slicemask", argLength: 1}, // 0 if arg0 == 0, -1 if arg0 > 0, undef if arg0<0. Type is native int size.
Expand Down
93 changes: 93 additions & 0 deletions src/cmd/compile/internal/ssa/opGen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 899f3a2

Please sign in to comment.