Skip to content

Commit

Permalink
cmd/compile: arm64 intrinsics for math/bits.OnesCount
Browse files Browse the repository at this point in the history
This adds math/bits intrinsics for OnesCount on arm64.

name         old time/op  new time/op  delta
OnesCount    3.81ns ± 0%  1.60ns ± 0%  -57.96%  (p=0.000 n=7+8)
OnesCount8   1.60ns ± 0%  1.60ns ± 0%     ~     (all equal)
OnesCount16  2.41ns ± 0%  1.60ns ± 0%  -33.61%  (p=0.000 n=8+8)
OnesCount32  4.17ns ± 0%  1.60ns ± 0%  -61.58%  (p=0.000 n=8+8)
OnesCount64  3.80ns ± 0%  1.60ns ± 0%  -57.84%  (p=0.000 n=8+8)

Update #18616

Conflicts:
	src/cmd/compile/internal/gc/asm_test.go

Change-Id: I63ac2f63acafdb1f60656ab8a56be0b326eec5cb
Reviewed-on: https://go-review.googlesource.com/90835
Run-TryBot: Cherry Zhang <cherryyz@google.com>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
  • Loading branch information
bmakam-qdt authored and cherrymui committed Feb 15, 2018
1 parent c26fac8 commit fcba051
Show file tree
Hide file tree
Showing 11 changed files with 277 additions and 14 deletions.
1 change: 1 addition & 0 deletions src/cmd/asm/internal/asm/testdata/arm64enc.s
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,7 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$-8
UMULL R18, R22, R19 // d37eb29b
UXTBW R2, R6 // 461c0053
UXTHW R7, R20 // f43c0053
VCNT V0.B8, V0.B8 // 0058200e
WFE // 5f2003d5
WFI // 7f2003d5
YIELD // 3f2003d5
Expand Down
14 changes: 14 additions & 0 deletions src/cmd/compile/internal/arm64/ssa.go
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
fallthrough
case ssa.OpARM64MVN,
ssa.OpARM64NEG,
ssa.OpARM64FMOVDfpgp,
ssa.OpARM64FMOVDgpfp,
ssa.OpARM64FNEGS,
ssa.OpARM64FNEGD,
ssa.OpARM64FSQRTD,
Expand Down Expand Up @@ -563,6 +565,18 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpARM64VCNT:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = (v.Args[0].Reg()-arm64.REG_F0)&31 + arm64.REG_ARNG + ((arm64.ARNG_8B & 15) << 5)
p.To.Type = obj.TYPE_REG
p.To.Reg = (v.Reg()-arm64.REG_F0)&31 + arm64.REG_ARNG + ((arm64.ARNG_8B & 15) << 5)
case ssa.OpARM64VUADDLV:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = (v.Args[0].Reg()-arm64.REG_F0)&31 + arm64.REG_ARNG + ((arm64.ARNG_8B & 15) << 5)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg() - arm64.REG_F0 + arm64.REG_V0
case ssa.OpARM64CSELULT,
ssa.OpARM64CSELULT0:
r1 := int16(arm64.REGZERO)
Expand Down
24 changes: 24 additions & 0 deletions src/cmd/compile/internal/gc/asm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2751,6 +2751,30 @@ var linuxARM64Tests = []*asmTest{
`,
pos: []string{"TBZ"},
},
{
fn: `
func $(x uint64) int {
return bits.OnesCount64(x)
}
`,
pos: []string{"\tVCNT\t", "\tVUADDLV\t"},
},
{
fn: `
func $(x uint32) int {
return bits.OnesCount32(x)
}
`,
pos: []string{"\tVCNT\t", "\tVUADDLV\t"},
},
{
fn: `
func $(x uint16) int {
return bits.OnesCount16(x)
}
`,
pos: []string{"\tVCNT\t", "\tVUADDLV\t"},
},
// Load-combining tests.
{
fn: `
Expand Down
9 changes: 7 additions & 2 deletions src/cmd/compile/internal/gc/ssa.go
Original file line number Diff line number Diff line change
Expand Up @@ -3156,18 +3156,23 @@ func init() {
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount64, types.Types[TINT], args[0])
},
sys.PPC64)
sys.PPC64, sys.ARM64)
addF("math/bits", "OnesCount32",
makeOnesCountAMD64(ssa.OpPopCount32, ssa.OpPopCount32),
sys.AMD64)
addF("math/bits", "OnesCount32",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount32, types.Types[TINT], args[0])
},
sys.PPC64)
sys.PPC64, sys.ARM64)
addF("math/bits", "OnesCount16",
makeOnesCountAMD64(ssa.OpPopCount16, ssa.OpPopCount16),
sys.AMD64)
addF("math/bits", "OnesCount16",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount16, types.Types[TINT], args[0])
},
sys.ARM64)
// Note: no OnesCount8, the Go implementation is faster - just a table load.
addF("math/bits", "OnesCount",
makeOnesCountAMD64(ssa.OpPopCount64, ssa.OpPopCount32),
Expand Down
10 changes: 10 additions & 0 deletions src/cmd/compile/internal/ssa/gen/ARM64.rules
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,16 @@
(Ctz64 <t> x) -> (CLZ (RBIT <t> x))
(Ctz32 <t> x) -> (CLZW (RBITW <t> x))

(PopCount64 <t> x) -> (FMOVDfpgp <t> (VUADDLV <typ.Float64> (VCNT <typ.Float64> (FMOVDgpfp <typ.Float64> x))))
(PopCount32 <t> x) -> (FMOVDfpgp <t> (VUADDLV <typ.Float64> (VCNT <typ.Float64> (FMOVDgpfp <typ.Float64> (ZeroExt32to64 x)))))
(PopCount16 <t> x) -> (FMOVDfpgp <t> (VUADDLV <typ.Float64> (VCNT <typ.Float64> (FMOVDgpfp <typ.Float64> (ZeroExt16to64 x)))))

// Load args directly into the register class where it will be used.
(FMOVDgpfp <t> (Arg [off] {sym})) -> @b.Func.Entry (Arg <t> [off] {sym})
// Similarly for stores, if we see a store after FPR <-> GPR move, then redirect store to use the other register set.
(MOVDstore ptr (FMOVDfpgp val) mem) -> (FMOVDstore ptr val mem)
(FMOVDstore ptr (FMOVDgpfp val) mem) -> (MOVDstore ptr val mem)

(BitLen64 x) -> (SUB (MOVDconst [64]) (CLZ <typ.Int> x))

(Bswap64 x) -> (REV x)
Expand Down
29 changes: 17 additions & 12 deletions src/cmd/compile/internal/ssa/gen/ARM64Ops.go
Original file line number Diff line number Diff line change
Expand Up @@ -199,18 +199,20 @@ func init() {
{name: "BICconst", argLength: 1, reg: gp11, asm: "BIC", aux: "Int64"}, // arg0 &^ auxInt

// unary ops
{name: "MVN", argLength: 1, reg: gp11, asm: "MVN"}, // ^arg0
{name: "NEG", argLength: 1, reg: gp11, asm: "NEG"}, // -arg0
{name: "FNEGS", argLength: 1, reg: fp11, asm: "FNEGS"}, // -arg0, float32
{name: "FNEGD", argLength: 1, reg: fp11, asm: "FNEGD"}, // -arg0, float64
{name: "FSQRTD", argLength: 1, reg: fp11, asm: "FSQRTD"}, // sqrt(arg0), float64
{name: "REV", argLength: 1, reg: gp11, asm: "REV"}, // byte reverse, 64-bit
{name: "REVW", argLength: 1, reg: gp11, asm: "REVW"}, // byte reverse, 32-bit
{name: "REV16W", argLength: 1, reg: gp11, asm: "REV16W"}, // byte reverse in each 16-bit halfword, 32-bit
{name: "RBIT", argLength: 1, reg: gp11, asm: "RBIT"}, // bit reverse, 64-bit
{name: "RBITW", argLength: 1, reg: gp11, asm: "RBITW"}, // bit reverse, 32-bit
{name: "CLZ", argLength: 1, reg: gp11, asm: "CLZ"}, // count leading zero, 64-bit
{name: "CLZW", argLength: 1, reg: gp11, asm: "CLZW"}, // count leading zero, 32-bit
{name: "MVN", argLength: 1, reg: gp11, asm: "MVN"}, // ^arg0
{name: "NEG", argLength: 1, reg: gp11, asm: "NEG"}, // -arg0
{name: "FNEGS", argLength: 1, reg: fp11, asm: "FNEGS"}, // -arg0, float32
{name: "FNEGD", argLength: 1, reg: fp11, asm: "FNEGD"}, // -arg0, float64
{name: "FSQRTD", argLength: 1, reg: fp11, asm: "FSQRTD"}, // sqrt(arg0), float64
{name: "REV", argLength: 1, reg: gp11, asm: "REV"}, // byte reverse, 64-bit
{name: "REVW", argLength: 1, reg: gp11, asm: "REVW"}, // byte reverse, 32-bit
{name: "REV16W", argLength: 1, reg: gp11, asm: "REV16W"}, // byte reverse in each 16-bit halfword, 32-bit
{name: "RBIT", argLength: 1, reg: gp11, asm: "RBIT"}, // bit reverse, 64-bit
{name: "RBITW", argLength: 1, reg: gp11, asm: "RBITW"}, // bit reverse, 32-bit
{name: "CLZ", argLength: 1, reg: gp11, asm: "CLZ"}, // count leading zero, 64-bit
{name: "CLZW", argLength: 1, reg: gp11, asm: "CLZW"}, // count leading zero, 32-bit
{name: "VCNT", argLength: 1, reg: fp11, asm: "VCNT"}, // count set bits for each 8-bit unit and store the result in each 8-bit unit
{name: "VUADDLV", argLength: 1, reg: fp11, asm: "VUADDLV"}, // unsigned sum of eight bytes in a 64-bit value, zero extended to 64-bit.

// shifts
{name: "SLL", argLength: 2, reg: gp21, asm: "LSL"}, // arg0 << arg1, shift amount is mod 64
Expand Down Expand Up @@ -288,6 +290,9 @@ func init() {
{name: "MOVDstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of zero to arg0 + auxInt + aux. arg1=mem.
{name: "MOVQstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "STP", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 16 bytes of zero to arg0 + auxInt + aux. arg1=mem.

{name: "FMOVDgpfp", argLength: 1, reg: gpfp, asm: "FMOVD"}, // move int64 to float64 (no conversion)
{name: "FMOVDfpgp", argLength: 1, reg: fpgp, asm: "FMOVD"}, // move float64 to int64 (no conversion)

// conversions
{name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"}, // move from arg0, sign-extended from byte
{name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, unsign-extended from byte
Expand Down
56 changes: 56 additions & 0 deletions src/cmd/compile/internal/ssa/opGen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit fcba051

Please sign in to comment.