Skip to content

Commit

Permalink
cmd/compile: use prove pass to detect Ctz of non-zero values
Browse files Browse the repository at this point in the history
On amd64, Ctz must include special handling of zeros.
But the prove pass has enough information to detect whether the input
is non-zero, allowing a more efficient lowering.

Introduce new CtzNonZero ops to capture and use this information.

Benchmark code:

// BenchmarkVisitBits times a loop that visits every set bit of a value:
// it takes the index of the lowest set bit and then clears that bit,
// repeating until no bits remain.
func BenchmarkVisitBits(b *testing.B) {
	b.Run("8", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			x := uint8(0xff)
			// The x != 0 loop condition means the argument passed to
			// TrailingZeros8 is non-zero on every iteration.
			for x != 0 {
				// Assign the result to sink (declared elsewhere) so it is used.
				sink = bits.TrailingZeros8(x)
				// Clear the lowest set bit.
				x &= x - 1
			}
		}
	})

    // The 16-, 32-, and 64-bit sub-benchmarks follow the same pattern.
}

name            old time/op  new time/op  delta
VisitBits/8-8   7.27ns ± 4%  5.58ns ± 4%  -23.35%  (p=0.000 n=28+26)
VisitBits/16-8  14.7ns ± 7%  10.5ns ± 4%  -28.43%  (p=0.000 n=30+28)
VisitBits/32-8  27.6ns ± 8%  19.3ns ± 3%  -30.14%  (p=0.000 n=30+26)
VisitBits/64-8  44.0ns ±11%  38.0ns ± 5%  -13.48%  (p=0.000 n=30+30)

Fixes #25077

Change-Id: Ie6e5bd86baf39ee8a4ca7cadcf56d934e047f957
Reviewed-on: https://go-review.googlesource.com/109358
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
  • Loading branch information
josharian committed Apr 26, 2018
1 parent adbb6ec commit d9a50a6
Show file tree
Hide file tree
Showing 19 changed files with 347 additions and 32 deletions.
5 changes: 5 additions & 0 deletions src/cmd/compile/internal/ssa/gen/AMD64.rules
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@
(Ctz16 x) -> (Select0 (BSFL (BTSLconst <typ.UInt32> [16] x)))
(Ctz8 x) -> (Select0 (BSFL (BTSLconst <typ.UInt32> [ 8] x)))

(Ctz64NonZero x) -> (Select0 (BSFQ x))
(Ctz32NonZero x) -> (Select0 (BSFL x))
(Ctz16NonZero x) -> (Select0 (BSFL x))
(Ctz8NonZero x) -> (Select0 (BSFL x))

// BitLen64 of a 64 bit value x requires checking whether x == 0, since BSRQ is undefined when x == 0.
// However, for zero-extended values, we can cheat a bit, and calculate
// BSR(x<<1 + 1), which is guaranteed to be non-zero, and which conveniently
Expand Down
3 changes: 3 additions & 0 deletions src/cmd/compile/internal/ssa/gen/ARM.rules
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@

(Sqrt x) -> (SQRTD x)

// TODO: optimize this for ARMv5 and ARMv6
(Ctz32NonZero x) -> (Ctz32 x)

// count trailing zero for ARMv5 and ARMv6
// 32 - CLZ(x&-x - 1)
(Ctz32 <t> x) && objabi.GOARM<=6 -> (RSBconst [32] (CLZ <t> (SUBconst <t> (AND <t> x (RSBconst <t> [0] x)) [1])))
Expand Down
3 changes: 3 additions & 0 deletions src/cmd/compile/internal/ssa/gen/ARM64.rules
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@
(Round x) -> (FRINTAD x)
(Trunc x) -> (FRINTZD x)

(Ctz64NonZero x) -> (Ctz64 x)
(Ctz32NonZero x) -> (Ctz32 x)

(Ctz64 <t> x) -> (CLZ (RBIT <t> x))
(Ctz32 <t> x) -> (CLZW (RBITW <t> x))

Expand Down
3 changes: 3 additions & 0 deletions src/cmd/compile/internal/ssa/gen/MIPS.rules
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@

(Sqrt x) -> (SQRTD x)

// TODO: optimize this case?
(Ctz32NonZero x) -> (Ctz32 x)

// count trailing zero
// 32 - CLZ(x&-x - 1)
(Ctz32 <t> x) -> (SUB (MOVWconst [32]) (CLZ <t> (SUBconst <t> [1] (AND <t> x (NEG <t> x)))))
Expand Down
4 changes: 4 additions & 0 deletions src/cmd/compile/internal/ssa/gen/PPC64.rules
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,10 @@
(Addr {sym} base) -> (MOVDaddr {sym} base)
(OffPtr [off] ptr) -> (ADD (MOVDconst <typ.Int64> [off]) ptr)

// TODO: optimize these cases?
(Ctz32NonZero x) -> (Ctz32 x)
(Ctz64NonZero x) -> (Ctz64 x)

(Ctz64 x) -> (POPCNTD (ANDN <typ.Int64> (ADDconst <typ.Int64> [-1] x) x))
(Ctz32 x) -> (POPCNTW (MOVWZreg (ANDN <typ.Int> (ADDconst <typ.Int> [-1] x) x)))

Expand Down
4 changes: 4 additions & 0 deletions src/cmd/compile/internal/ssa/gen/S390X.rules
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@
(OffPtr [off] ptr) && is32Bit(off) -> (ADDconst [off] ptr)
(OffPtr [off] ptr) -> (ADD (MOVDconst [off]) ptr)

// TODO: optimize these cases?
(Ctz64NonZero x) -> (Ctz64 x)
(Ctz32NonZero x) -> (Ctz32 x)

// Ctz(x) = 64 - findLeftmostOne((x-1)&^x)
(Ctz64 <t> x) -> (SUB (MOVDconst [64]) (FLOGR (AND <t> (SUBconst <t> [1] x) (NOT <t> x))))
(Ctz32 <t> x) -> (SUB (MOVDconst [64]) (FLOGR (MOVWZreg (ANDW <t> (SUBWconst <t> [1] x) (NOTW <t> x)))))
Expand Down
5 changes: 5 additions & 0 deletions src/cmd/compile/internal/ssa/gen/dec64.rules
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,11 @@
(Com32 <typ.UInt32> (Int64Hi x))
(Com32 <typ.UInt32> (Int64Lo x)))

// Sadly, just because we know that x is non-zero,
// we don't know whether either component is,
// so just treat Ctz64NonZero the same as Ctz64.
(Ctz64NonZero x) -> (Ctz64 x)

(Ctz64 x) ->
(Add32 <typ.UInt32>
(Ctz32 <typ.UInt32> (Int64Lo x))
Expand Down
20 changes: 12 additions & 8 deletions src/cmd/compile/internal/ssa/gen/genericOps.go
Original file line number Diff line number Diff line change
Expand Up @@ -240,14 +240,18 @@ var genericOps = []opData{
{name: "Com32", argLength: 1},
{name: "Com64", argLength: 1},

{name: "Ctz8", argLength: 1}, // Count trailing (low order) zeroes (returns 0-8)
{name: "Ctz16", argLength: 1}, // Count trailing (low order) zeroes (returns 0-16)
{name: "Ctz32", argLength: 1}, // Count trailing (low order) zeroes (returns 0-32)
{name: "Ctz64", argLength: 1}, // Count trailing (low order) zeroes (returns 0-64)
{name: "BitLen8", argLength: 1}, // Number of bits in arg[0] (returns 0-8)
{name: "BitLen16", argLength: 1}, // Number of bits in arg[0] (returns 0-16)
{name: "BitLen32", argLength: 1}, // Number of bits in arg[0] (returns 0-32)
{name: "BitLen64", argLength: 1}, // Number of bits in arg[0] (returns 0-64)
{name: "Ctz8", argLength: 1}, // Count trailing (low order) zeroes (returns 0-8)
{name: "Ctz16", argLength: 1}, // Count trailing (low order) zeroes (returns 0-16)
{name: "Ctz32", argLength: 1}, // Count trailing (low order) zeroes (returns 0-32)
{name: "Ctz64", argLength: 1}, // Count trailing (low order) zeroes (returns 0-64)
{name: "Ctz8NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-7
{name: "Ctz16NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-15
{name: "Ctz32NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-31
{name: "Ctz64NonZero", argLength: 1}, // same as above, but arg[0] known to be non-zero, returns 0-63
{name: "BitLen8", argLength: 1}, // Number of bits in arg[0] (returns 0-8)
{name: "BitLen16", argLength: 1}, // Number of bits in arg[0] (returns 0-16)
{name: "BitLen32", argLength: 1}, // Number of bits in arg[0] (returns 0-32)
{name: "BitLen64", argLength: 1}, // Number of bits in arg[0] (returns 0-64)

{name: "Bswap32", argLength: 1}, // Swap bytes
{name: "Bswap64", argLength: 1}, // Swap bytes
Expand Down
24 changes: 24 additions & 0 deletions src/cmd/compile/internal/ssa/opGen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

62 changes: 38 additions & 24 deletions src/cmd/compile/internal/ssa/prove.go
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ var opMax = map[Op]int64{
OpAdd32: math.MaxInt32, OpSub32: math.MaxInt32,
}

// isNonNegative returns true if v is known to be non-negative.
// isNonNegative reports whether v is known to be non-negative.
func (ft *factsTable) isNonNegative(v *Value) bool {
if isNonNegative(v) {
return true
Expand Down Expand Up @@ -734,34 +734,48 @@ func addRestrictions(parent *Block, ft *factsTable, t domain, v, w *Value, r rel
}
}

var ctzNonZeroOp = map[Op]Op{OpCtz8: OpCtz8NonZero, OpCtz16: OpCtz16NonZero, OpCtz32: OpCtz32NonZero, OpCtz64: OpCtz64NonZero}

// simplifyBlock simplifies some constant values in b and evaluates
// branches to non-uniquely dominated successors of b.
func simplifyBlock(sdom SparseTree, ft *factsTable, b *Block) {
// Replace OpSlicemask operations in b with constants where possible.
for _, v := range b.Values {
if v.Op != OpSlicemask {
continue
}
x, delta := isConstDelta(v.Args[0])
if x == nil {
continue
}
// slicemask(x + y)
// if x is larger than -y (y is negative), then slicemask is -1.
lim, ok := ft.limits[x.ID]
if !ok {
continue
}
if lim.umin > uint64(-delta) {
if v.Args[0].Op == OpAdd64 {
v.reset(OpConst64)
} else {
v.reset(OpConst32)
switch v.Op {
case OpSlicemask:
// Replace OpSlicemask operations in b with constants where possible.
x, delta := isConstDelta(v.Args[0])
if x == nil {
continue
}
// slicemask(x + y)
// if x is larger than -y (y is negative), then slicemask is -1.
lim, ok := ft.limits[x.ID]
if !ok {
continue
}
if lim.umin > uint64(-delta) {
if v.Args[0].Op == OpAdd64 {
v.reset(OpConst64)
} else {
v.reset(OpConst32)
}
if b.Func.pass.debug > 0 {
b.Func.Warnl(v.Pos, "Proved slicemask not needed")
}
v.AuxInt = -1
}
case OpCtz8, OpCtz16, OpCtz32, OpCtz64:
// On some architectures, notably amd64, we can generate much better
// code for CtzNN if we know that the argument is non-zero.
// Capture that information here for use in arch-specific optimizations.
x := v.Args[0]
lim, ok := ft.limits[x.ID]
if !ok {
continue
}
if b.Func.pass.debug > 0 {
b.Func.Warnl(v.Pos, "Proved slicemask not needed")
if lim.umin > 0 || lim.min > 0 || lim.max < 0 {
v.Op = ctzNonZeroOp[v.Op]
}
v.AuxInt = -1
}
}

Expand Down Expand Up @@ -818,7 +832,7 @@ func removeBranch(b *Block, branch branch) {
}
}

// isNonNegative returns true is v is known to be greater or equal to zero.
// isNonNegative reports whether v is known to be greater than or equal to zero.
func isNonNegative(v *Value) bool {
switch v.Op {
case OpConst64:
Expand Down
76 changes: 76 additions & 0 deletions src/cmd/compile/internal/ssa/rewriteAMD64.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions src/cmd/compile/internal/ssa/rewriteARM.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit d9a50a6

Please sign in to comment.