cmd/compile: tight code optimization opportunities #47120

rsc · 2021-07-10T12:13:42Z

The generated x86 code can be improved in some fairly simple ways - hoisting computed constants out of loop bodies, and avoiding unnecessary register moves - that have a significant performance impact on tight loops. In the following example those improvements produce a 35% speedup.

Here is an alternate, DFA-based implementation of utf8.Valid that I have been playing with:

func Valid(x []byte) bool {
	state := uint64(1 * 6)
	for _, b := range x {
		state = dfa[b] >> (state & 63)
	}
	return (state & 63) == 1*6
}

const (
	s00 = 0 | (1*6)<<(1*6)
	sC0 = 0
	sC2 = 0 | (2*6)<<(1*6)
	sE0 = 0 | (3*6)<<(1*6)
	sE1 = 0 | (4*6)<<(1*6)
	sED = 0 | (5*6)<<(1*6)
	sEE = sE1
	sF0 = 0 | (6*6)<<(1*6)
	sF1 = 0 | (7*6)<<(1*6)
	sF4 = 0 | (8*6)<<(1*6)
	sF5 = 0

	s80 = 0 | (1*6)<<(2*6) | (2*6)<<(4*6) | (4*6)<<(5*6) | (4*6)<<(7*6) | (4*6)<<(8*6)
	s90 = 0 | (1*6)<<(2*6) | (2*6)<<(4*6) | (4*6)<<(5*6) | (4*6)<<(6*6) | (4*6)<<(7*6)
	sA0 = 0 | (1*6)<<(2*6) | (2*6)<<(3*6) | (2*6)<<(4*6) | (4*6)<<(6*6) | (4*6)<<(7*6)
)

var dfa = [256]uint64{
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80,
	s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90,
	sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0,
	sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0,
	sC0, sC0, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2,
	sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2,
	sE0, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sED, sEE, sEE,
	sF0, sF1, sF1, sF1, sF4, sF5, sF5, sF5, sF5, sF5, sF5, sF5, sF5, sF5, sF5, sF5,
}

There are no big benchmarks of Valid in the package, but here are some that could be added:

var valid1k = bytes.Repeat([]byte("0123456789日本語日本語日本語日abcdefghijklmnopqrstuvwx"), 16)
var valid1M = bytes.Repeat(valid1k, 1024)

func BenchmarkValid1K(b *testing.B) {
	b.SetBytes(int64(len(valid1k)))
	for i := 0; i < b.N; i++ {
		Valid(valid1k)
	}
}

func BenchmarkValid1M(b *testing.B) {
	b.SetBytes(int64(len(valid1M)))
	for i := 0; i < b.N; i++ {
		Valid(valid1M)
	}
}

The old Valid implementation runs at around 1450 MB/s.
The implementation above runs at around 1600 MB/s.
Better but not what I had hoped.
It compiles as follows:

% go test -c && go tool objdump -s 'utf8.Valid$' utf8.test
TEXT unicode/utf8.Valid(SB) /Users/rsc/go/src/unicode/utf8/utf8.go
  utf8.go:647		0x10789c0		4889442408		MOVQ AX, 0x8(SP)
  utf8.go:649		0x10789c5		31c9			XORL CX, CX
  utf8.go:649		0x10789c7		ba06000000		MOVL $0x6, DX
  utf8.go:649		0x10789cc		eb1f			JMP 0x10789ed
  utf8.go:649		0x10789ce		0fb63408		MOVZX 0(AX)(CX*1), SI
  utf8.go:649		0x10789d2		488d7901		LEAQ 0x1(CX), DI
  utf8.go:650		0x10789d6		4c8d0543ef1600		LEAQ unicode/utf8.dfa(SB), R8
  utf8.go:650		0x10789dd		498b34f0		MOVQ 0(R8)(SI*8), SI
  utf8.go:650		0x10789e1		4889d1			MOVQ DX, CX
  utf8.go:650		0x10789e4		48d3ee			SHRQ CL, SI
  utf8.go:649		0x10789e7		4889f9			MOVQ DI, CX
  utf8.go:652		0x10789ea		4889f2			MOVQ SI, DX
  utf8.go:649		0x10789ed		4839cb			CMPQ CX, BX
  utf8.go:649		0x10789f0		7fdc			JG 0x10789ce
  utf8.go:652		0x10789f2		4883e23f		ANDQ $0x3f, DX
  utf8.go:652		0x10789f6		4883fa06		CMPQ $0x6, DX
  utf8.go:652		0x10789fa		0f94c0			SETE AL
  utf8.go:652		0x10789fd		c3			RET
  :-1			0x10789fe		cc			INT $0x3
  :-1			0x10789ff		cc			INT $0x3
%

Translating this to proper non-regabi assembly I get:

#include "textflag.h"

TEXT ·Valid(SB),NOSPLIT,$0
	MOVQ	x_base+0(FP),AX // p = &x[0]
	MOVQ	x_len+8(FP),BX // n = len(x)
	XORL	CX, CX // i = 0
	MOVL	$6, DX
	JMP loop

body:
	MOVBLZX	(AX)(CX*1), SI // b = p[i]
	LEAQ	1(CX), DI // j = i+1
	LEAQ	·dfa(SB), R8
	MOVQ	(R8)(SI*8), SI // t = dfa[b]
	MOVQ	DX, CX // CX = state
	SHRQ	CX, SI // t >>= state&63
	MOVQ	DI, CX // i = j
	MOVQ	SI, DX // state = t

loop:
	CMPQ	CX, BX
	JLT	body

	ANDL	$0x3f, DX
	CMPL	DX, $6
	SETEQ	AL
	MOVB	AX, ret+24(FP)
	RET

This runs also at about 1600 MB/s.

First optimization: the LEAQ ·dfa(SB), R8 should be hoisted out of the loop body.
(I tried to do this in the Go version with dfa := &dfa but it got constant propagated away!)

That change brings it up to 1750 MB/s.

Second optimization: use DI for i instead of CX, to avoid the pressure on CX.
This lets the LEAQ 1(CX), DI and the later MOVQ DI, CX collapse to just LEAQ 1(DI), DI.

That change brings it up to 1900 MB/s.

The body is now:

body:
	MOVBLZX	(AX)(DI*1), SI // b = p[i]
	LEAQ	1(DI), DI // i++
	MOVQ	(R8)(SI*8), SI // t = dfa[b]
	MOVQ	DX, CX // CX = state
	SHRQ	CX, SI // t >>= state&63
	MOVQ	SI, DX // state = t

Third optimization: since DX is moving into CX, do that one instruction earlier, allowing the use of SI to be optimized into DX to eliminate the final MOVQ:

body:
	MOVBLZX	(AX)(DI*1), SI // b = p[i]
	LEAQ	1(DI), DI // i++
	MOVQ	DX, CX // CX = state
	MOVQ	(R8)(SI*8), DX // state = dfa[b]
	SHRQ	CX, DX // state >>= CX&63

I think this ends up being just "compute the shift amount before the shifted value".
That change brings it up to 2150 MB/s.

This is still a direct translation of the Go code: there are no tricks the compiler couldn't do.
For this particular loop, the optimizations make the code run 35% faster.

Final assembly:

#include "textflag.h"

TEXT ·Valid(SB),NOSPLIT,$0
	MOVQ	x_base+0(FP),AX // p = &x[0]
	MOVQ	x_len+8(FP),BX // n = len(x)
	XORL	DI, DI // i = 0
	MOVL	$6, DX
	LEAQ	·dfa(SB), R8
	JMP loop

body:
	MOVBLZX	(AX)(DI*1), SI // b = p[i]
	LEAQ	1(DI), DI // i++
	MOVQ	DX, CX // CX = state
	MOVQ	(R8)(SI*8), DX // t = dfa[b]
	SHRQ	CX, DX // t >>= state&63

loop:
	CMPQ	DI, BX
	JLT	body

	ANDL	$0x3f, DX
	CMPL	DX, $6
	SETEQ	AL
	MOVB	AX, ret+24(FP)
	RET

The text was updated successfully, but these errors were encountered:

rsc · 2021-07-10T12:24:11Z

Here is another, separate opportunity, for GOAMD64=v3 compilation. The SHRXQ instruction takes an explicit shift register, has separate source and destination operands, and can read source from memory. That allows reducing the loop to

body:
	MOVBLZX	(AX)(DI*1), SI // b = p[i]
	LEAQ	1(DI), DI // i++
	SHRXQ	DX, (R8)(SI*8), DX // state = dfa[b] >> (state&63)

That change runs at 3400 MB/s (!).

(The DFA tables were carefully constructed exactly to enable this implementation.)

zchee · 2021-07-14T08:18:53Z

@rsc sorry for hijacked, but what means GOAMD64=v3?

ALTree · 2021-07-14T08:23:11Z

@zchee see #45453 and https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels.

laboger · 2022-01-12T22:37:40Z

I see this hasn't had attention for a while but this is a problem I've noticed in ppc64 code too. Invariant values are not moved out of loops. I thought at one time there was work to do this but it must have been abandoned.

Here is one example:

   ff0a0:       00 00 e3 8b     lbz     r31,0(r3)  <--- nil check is not needed on each iteration
   ff0a4:       24 1f c7 78     rldicr  r7,r6,3,60  \
   ff0a8:       14 3a 03 7d     add     r8,r3,r7   /  These two could be strength-reduced?
   ff0ac:       00 00 28 e9     ld      r9,0(r8)
   ff0b0:       2a 20 47 7d     ldx     r10,r7,r4
   ff0b4:       2a 28 67 7d     ldx     r11,r7,r5
   ff0b8:       78 52 6a 7d     xor     r10,r11,r10
   ff0bc:       b0 00 61 39     addi    r11,r1,176  <---- this is invariant in the loop
   ff0c0:       2a 58 e7 7c     ldx     r7,r7,r11
   ff0c4:       78 52 e7 7c     xor     r7,r7,r10
   ff0c8:       78 3a 27 7d     xor     r7,r9,r7
   ff0cc:       00 00 e8 f8     std     r7,0(r8)
   ff0d0:       01 00 c6 38     addi    r6,r6,1
   ff0d4:       80 00 26 2c     cmpdi   r6,128
   ff0d8:       c8 ff 80 41     blt     ff0a0 <golang.org/x/crypto/argon2.processBlockGeneric+0x3a0>

gopherbot · 2022-02-11T17:18:10Z

Change https://go.dev/cl/385174 mentions this issue: cmd/compile: use shlx&shrx instruction for GOAMD64>=v3

The SHRX/SHLX instruction can take any general register as the shift count operand, and can read source from memory. This CL introduces some operators to combine load and shift to one instruction. For #47120 Change-Id: I13b48f53c7d30067a72eb2c8382242045dead36a Reviewed-on: https://go-review.googlesource.com/c/go/+/385174 Reviewed-by: Keith Randall <khr@golang.org> Trust: Cherry Mui <cherryyz@google.com>

rsc added the NeedsInvestigation Someone must examine and confirm this is a valid issue and not a duplicate of an existing one. label Jul 10, 2021

rsc added this to the Backlog milestone Jul 10, 2021

josharian added the Performance label Jul 10, 2021

gopherbot added the compiler/runtime Issues related to the Go compiler and/or runtime. label Jul 13, 2022

ndinsmore mentioned this issue Dec 12, 2022

Add native UTF-8 Validation using fast shift based DFA JuliaLang/julia#47880

Merged

thomcc mentioned this issue Feb 7, 2023

Rewrite libcore's UTF-8 validation for performance rust-lang/rust#107760

Closed

5 tasks

artyom mentioned this issue Oct 3, 2023

unicode/utf8: optimize utf8.Valid with AVX2 instructions on AMD64 #63347

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

cmd/compile: tight code optimization opportunities #47120

cmd/compile: tight code optimization opportunities #47120

rsc commented Jul 10, 2021 •

edited

Loading

rsc commented Jul 10, 2021 •

edited

Loading

zchee commented Jul 14, 2021

ALTree commented Jul 14, 2021

laboger commented Jan 12, 2022

gopherbot commented Feb 11, 2022

cmd/compile: tight code optimization opportunities #47120

cmd/compile: tight code optimization opportunities #47120

Comments

rsc commented Jul 10, 2021 • edited Loading

rsc commented Jul 10, 2021 • edited Loading

zchee commented Jul 14, 2021

ALTree commented Jul 14, 2021

laboger commented Jan 12, 2022

gopherbot commented Feb 11, 2022

rsc commented Jul 10, 2021 •

edited

Loading

rsc commented Jul 10, 2021 •

edited

Loading