Skip to content

Commit

Permalink
hash/crc32: optimize arm64 crc32 implementation
Browse files Browse the repository at this point in the history
ARMv8 defines crc32 instruction.

Comparing to the original crc32 calculation, this patch makes use of
crc32 instructions to do crc32 calculation instead of the multiple
lookup table algorithms.

ARMv8 provides IEEE and Castagnoli polynomials for crc32 calculation
so that the perfomance of these two types of crc32 get significant
improved.

name                                        old time/op   new time/op    delta
CRC32/poly=IEEE/size=15/align=0-32            117ns ± 0%      38ns ± 0%   -67.44%
CRC32/poly=IEEE/size=15/align=1-32            117ns ± 0%      38ns ± 0%   -67.52%
CRC32/poly=IEEE/size=40/align=0-32            129ns ± 0%      41ns ± 0%   -68.37%
CRC32/poly=IEEE/size=40/align=1-32            129ns ± 0%      41ns ± 0%   -68.29%
CRC32/poly=IEEE/size=512/align=0-32           828ns ± 0%     246ns ± 0%   -70.29%
CRC32/poly=IEEE/size=512/align=1-32           828ns ± 0%     132ns ± 0%   -84.06%
CRC32/poly=IEEE/size=1kB/align=0-32          1.58µs ± 0%    0.46µs ± 0%   -70.98%
CRC32/poly=IEEE/size=1kB/align=1-32          1.58µs ± 0%    0.46µs ± 0%   -70.92%
CRC32/poly=IEEE/size=4kB/align=0-32          6.06µs ± 0%    1.74µs ± 0%   -71.27%
CRC32/poly=IEEE/size=4kB/align=1-32          6.10µs ± 0%    1.74µs ± 0%   -71.44%
CRC32/poly=IEEE/size=32kB/align=0-32         48.3µs ± 0%    13.7µs ± 0%   -71.61%
CRC32/poly=IEEE/size=32kB/align=1-32         48.3µs ± 0%    13.7µs ± 0%   -71.60%
CRC32/poly=Castagnoli/size=15/align=0-32      116ns ± 0%      38ns ± 0%   -67.07%
CRC32/poly=Castagnoli/size=15/align=1-32      116ns ± 0%      38ns ± 0%   -66.90%
CRC32/poly=Castagnoli/size=40/align=0-32      127ns ± 0%      40ns ± 0%   -68.11%
CRC32/poly=Castagnoli/size=40/align=1-32      127ns ± 0%      40ns ± 0%   -68.11%
CRC32/poly=Castagnoli/size=512/align=0-32     828ns ± 0%     132ns ± 0%   -84.06%
CRC32/poly=Castagnoli/size=512/align=1-32     827ns ± 0%     132ns ± 0%   -84.04%
CRC32/poly=Castagnoli/size=1kB/align=0-32    1.59µs ± 0%    0.22µs ± 0%   -85.89%
CRC32/poly=Castagnoli/size=1kB/align=1-32    1.58µs ± 0%    0.22µs ± 0%   -85.79%
CRC32/poly=Castagnoli/size=4kB/align=0-32    6.14µs ± 0%    0.77µs ± 0%   -87.40%
CRC32/poly=Castagnoli/size=4kB/align=1-32    6.06µs ± 0%    0.77µs ± 0%   -87.25%
CRC32/poly=Castagnoli/size=32kB/align=0-32   48.3µs ± 0%     5.9µs ± 0%   -87.71%
CRC32/poly=Castagnoli/size=32kB/align=1-32   48.4µs ± 0%     6.0µs ± 0%   -87.69%
CRC32/poly=Koopman/size=15/align=0-32         104ns ± 0%     104ns ± 0%    +0.00%
CRC32/poly=Koopman/size=15/align=1-32         104ns ± 0%     104ns ± 0%    +0.00%
CRC32/poly=Koopman/size=40/align=0-32         235ns ± 0%     235ns ± 0%    +0.00%
CRC32/poly=Koopman/size=40/align=1-32         235ns ± 0%     235ns ± 0%    +0.00%
CRC32/poly=Koopman/size=512/align=0-32       2.71µs ± 0%    2.71µs ± 0%    -0.07%
CRC32/poly=Koopman/size=512/align=1-32       2.71µs ± 0%    2.71µs ± 0%    -0.04%
CRC32/poly=Koopman/size=1kB/align=0-32       5.40µs ± 0%    5.39µs ± 0%    -0.06%
CRC32/poly=Koopman/size=1kB/align=1-32       5.40µs ± 0%    5.40µs ± 0%    +0.02%
CRC32/poly=Koopman/size=4kB/align=0-32       21.5µs ± 0%    21.5µs ± 0%    -0.16%
CRC32/poly=Koopman/size=4kB/align=1-32       21.5µs ± 0%    21.5µs ± 0%    -0.05%
CRC32/poly=Koopman/size=32kB/align=0-32       172µs ± 0%     172µs ± 0%    -0.07%
CRC32/poly=Koopman/size=32kB/align=1-32       172µs ± 0%     172µs ± 0%    -0.01%

name                                        old speed     new speed      delta
CRC32/poly=IEEE/size=15/align=0-32          128MB/s ± 0%   394MB/s ± 0%  +207.95%
CRC32/poly=IEEE/size=15/align=1-32          128MB/s ± 0%   394MB/s ± 0%  +208.09%
CRC32/poly=IEEE/size=40/align=0-32          310MB/s ± 0%   979MB/s ± 0%  +216.07%
CRC32/poly=IEEE/size=40/align=1-32          310MB/s ± 0%   979MB/s ± 0%  +216.16%
CRC32/poly=IEEE/size=512/align=0-32         618MB/s ± 0%  2074MB/s ± 0%  +235.72%
CRC32/poly=IEEE/size=512/align=1-32         618MB/s ± 0%  3852MB/s ± 0%  +523.55%
CRC32/poly=IEEE/size=1kB/align=0-32         646MB/s ± 0%  2225MB/s ± 0%  +244.57%
CRC32/poly=IEEE/size=1kB/align=1-32         647MB/s ± 0%  2225MB/s ± 0%  +243.87%
CRC32/poly=IEEE/size=4kB/align=0-32         676MB/s ± 0%  2352MB/s ± 0%  +248.02%
CRC32/poly=IEEE/size=4kB/align=1-32         672MB/s ± 0%  2352MB/s ± 0%  +250.15%
CRC32/poly=IEEE/size=32kB/align=0-32        678MB/s ± 0%  2387MB/s ± 0%  +252.17%
CRC32/poly=IEEE/size=32kB/align=1-32        678MB/s ± 0%  2388MB/s ± 0%  +252.11%
CRC32/poly=Castagnoli/size=15/align=0-32    129MB/s ± 0%   393MB/s ± 0%  +205.51%
CRC32/poly=Castagnoli/size=15/align=1-32    129MB/s ± 0%   390MB/s ± 0%  +203.41%
CRC32/poly=Castagnoli/size=40/align=0-32    314MB/s ± 0%   988MB/s ± 0%  +215.04%
CRC32/poly=Castagnoli/size=40/align=1-32    314MB/s ± 0%   987MB/s ± 0%  +214.68%
CRC32/poly=Castagnoli/size=512/align=0-32   618MB/s ± 0%  3860MB/s ± 0%  +524.32%
CRC32/poly=Castagnoli/size=512/align=1-32   619MB/s ± 0%  3859MB/s ± 0%  +523.66%
CRC32/poly=Castagnoli/size=1kB/align=0-32   645MB/s ± 0%  4568MB/s ± 0%  +608.56%
CRC32/poly=Castagnoli/size=1kB/align=1-32   650MB/s ± 0%  4567MB/s ± 0%  +602.94%
CRC32/poly=Castagnoli/size=4kB/align=0-32   667MB/s ± 0%  5297MB/s ± 0%  +693.81%
CRC32/poly=Castagnoli/size=4kB/align=1-32   676MB/s ± 0%  5297MB/s ± 0%  +684.00%
CRC32/poly=Castagnoli/size=32kB/align=0-32  678MB/s ± 0%  5519MB/s ± 0%  +713.83%
CRC32/poly=Castagnoli/size=32kB/align=1-32  677MB/s ± 0%  5497MB/s ± 0%  +712.04%
CRC32/poly=Koopman/size=15/align=0-32       143MB/s ± 0%   144MB/s ± 0%    +0.27%
CRC32/poly=Koopman/size=15/align=1-32       143MB/s ± 0%   144MB/s ± 0%    +0.33%
CRC32/poly=Koopman/size=40/align=0-32       169MB/s ± 0%   170MB/s ± 0%    +0.12%
CRC32/poly=Koopman/size=40/align=1-32       170MB/s ± 0%   170MB/s ± 0%    +0.08%
CRC32/poly=Koopman/size=512/align=0-32      189MB/s ± 0%   189MB/s ± 0%    +0.07%
CRC32/poly=Koopman/size=512/align=1-32      189MB/s ± 0%   189MB/s ± 0%    +0.04%
CRC32/poly=Koopman/size=1kB/align=0-32      190MB/s ± 0%   190MB/s ± 0%    +0.05%
CRC32/poly=Koopman/size=1kB/align=1-32      190MB/s ± 0%   190MB/s ± 0%    -0.01%
CRC32/poly=Koopman/size=4kB/align=0-32      190MB/s ± 0%   190MB/s ± 0%    +0.15%
CRC32/poly=Koopman/size=4kB/align=1-32      190MB/s ± 0%   191MB/s ± 0%    +0.05%
CRC32/poly=Koopman/size=32kB/align=0-32     191MB/s ± 0%   191MB/s ± 0%    +0.06%
CRC32/poly=Koopman/size=32kB/align=1-32     191MB/s ± 0%   191MB/s ± 0%    +0.02%

Also fix a bug of arm64 assembler

The optimization is mainly contributed by Fangming.Fang <fangming.fang@arm.com>

Change-Id: I900678c2e445d7e8ad9e2a9ab3305d649230905f
Reviewed-on: https://go-review.googlesource.com/40074
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
  • Loading branch information
Wei Xiao authored and cherrymui committed Apr 13, 2017
1 parent aaf4682 commit ab636b8
Show file tree
Hide file tree
Showing 6 changed files with 159 additions and 2 deletions.
2 changes: 1 addition & 1 deletion src/cmd/internal/obj/arm64/asm7.go
Expand Up @@ -2564,7 +2564,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
}
o1 |= ((uint32(v) & 0x20) << (31 - 5)) | ((uint32(v) & 0x1F) << 19)
o1 |= uint32(c.brdist(p, 0, 14, 2) << 5)
o1 |= uint32(p.Reg)
o1 |= uint32(p.Reg & 31)

case 41: /* eret, nop, others with no operands */
o1 = c.op0(p, p.As)
Expand Down
51 changes: 51 additions & 0 deletions src/hash/crc32/crc32_arm64.go
@@ -0,0 +1,51 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// ARM64-specific hardware-assisted CRC32 algorithms. See crc32.go for a
// description of the interface that each architecture-specific file
// implements.

package crc32

func supportsCRC32() bool
func castagnoliUpdate(crc uint32, p []byte) uint32
func ieeeUpdate(crc uint32, p []byte) uint32

var hasCRC32 = supportsCRC32()

func archAvailableCastagnoli() bool {
return hasCRC32
}

func archInitCastagnoli() {
if !hasCRC32 {
panic("arch-specific crc32 instruction for Catagnoli not available")
}
}

func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
if !hasCRC32 {
panic("arch-specific crc32 instruction for Castagnoli not available")
}

return ^castagnoliUpdate(^crc, p)
}

func archAvailableIEEE() bool {
return hasCRC32
}

func archInitIEEE() {
if !hasCRC32 {
panic("arch-specific crc32 instruction for IEEE not available")
}
}

func archUpdateIEEE(crc uint32, p []byte) uint32 {
if !hasCRC32 {
panic("arch-specific crc32 instruction for IEEE not available")
}

return ^ieeeUpdate(^crc, p)
}
97 changes: 97 additions & 0 deletions src/hash/crc32/crc32_arm64.s
@@ -0,0 +1,97 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// castagnoliUpdate updates the non-inverted crc with the given data.

// func castagnoliUpdate(crc uint32, p []byte) uint32
TEXT ·castagnoliUpdate(SB),NOSPLIT,$0-36
MOVWU crc+0(FP), R9 // CRC value
MOVD p+8(FP), R13 // data pointer
MOVD p_len+16(FP), R11 // len(p)

CMP $8, R11
BLT less_than_8

update:
MOVD.P 8(R13), R10
CRC32CX R10, R9
SUB $8, R11

CMP $8, R11
BLT less_than_8

JMP update

less_than_8:
TBZ $2, R11, less_than_4

MOVWU.P 4(R13), R10
CRC32CW R10, R9

less_than_4:
TBZ $1, R11, less_than_2

MOVHU.P 2(R13), R10
CRC32CH R10, R9

less_than_2:
TBZ $0, R11, done

MOVBU (R13), R10
CRC32CB R10, R9

done:
MOVWU R9, ret+32(FP)
RET

// ieeeUpdate updates the non-inverted crc with the given data.

// func ieeeUpdate(crc uint32, p []byte) uint32
TEXT ·ieeeUpdate(SB),NOSPLIT,$0-36
MOVWU crc+0(FP), R9 // CRC value
MOVD p+8(FP), R13 // data pointer
MOVD p_len+16(FP), R11 // len(p)

CMP $8, R11
BLT less_than_8

update:
MOVD.P 8(R13), R10
CRC32X R10, R9
SUB $8, R11

CMP $8, R11
BLT less_than_8

JMP update

less_than_8:
TBZ $2, R11, less_than_4

MOVWU.P 4(R13), R10
CRC32W R10, R9

less_than_4:
TBZ $1, R11, less_than_2

MOVHU.P 2(R13), R10
CRC32H R10, R9

less_than_2:
TBZ $0, R11, done

MOVBU (R13), R10
CRC32B R10, R9

done:
MOVWU R9, ret+32(FP)
RET

// func supportsCRC32() bool
TEXT ·supportsCRC32(SB),NOSPLIT,$0-1
MOVB runtime·supportCRC32(SB), R0
MOVB R0, ret+0(FP)
RET
2 changes: 1 addition & 1 deletion src/hash/crc32/crc32_otherarch.go
Expand Up @@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !amd64,!amd64p32,!s390x,!ppc64le
// +build !amd64,!amd64p32,!s390x,!ppc64le,!arm64

package crc32

Expand Down
2 changes: 2 additions & 0 deletions src/runtime/os_darwin_arm64.go
Expand Up @@ -4,6 +4,8 @@

package runtime

var supportCRC32 = false

//go:nosplit
func cputicks() int64 {
// Currently cputicks() is used in blocking profiler and to seed runtime·fastrand().
Expand Down
7 changes: 7 additions & 0 deletions src/runtime/os_linux_arm64.go
Expand Up @@ -4,7 +4,12 @@

package runtime

const (
_ARM64_FEATURE_HAS_CRC32 = 0x80
)

var randomNumber uint32
var supportCRC32 bool

func archauxv(tag, val uintptr) {
switch tag {
Expand All @@ -14,6 +19,8 @@ func archauxv(tag, val uintptr) {
// it as a byte array.
randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 |
uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24
case _AT_HWCAP:
supportCRC32 = val & _ARM64_FEATURE_HAS_CRC32 != 0
}
}

Expand Down

2 comments on commit ab636b8

@vielmetti
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like this contribution will have a positive impact on CockroachDB and RocksDB, based on this discussion in cockroachdb/cockroach#15868

@vielmetti
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See #20370

Please sign in to comment.