From 74369b46fc6756741c016591724fd1cb8e26845f Mon Sep 17 00:00:00 2001 From: Meng Zhuo Date: Sun, 10 Feb 2019 20:59:10 +0800 Subject: [PATCH] internal/chacha20: add SIMD implementation on arm64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Inspired by Vectorization of ChaCha Stream Cipher https://eprint.iacr.org/2013/759.pdf name old time/op new time/op delta ChaCha20/32 690ns ± 0% 872ns ± 0% +26.38% (p=0.000 n=10+10) ChaCha20/63 750ns ± 0% 987ns ± 0% +31.53% (p=0.000 n=10+10) ChaCha20/64 674ns ± 0% 879ns ± 0% +30.42% (p=0.000 n=8+10) ChaCha20/256 2.28µs ± 0% 0.82µs ± 0% -64.13% (p=0.000 n=10+10) ChaCha20/1024 8.64µs ± 0% 2.92µs ± 0% -66.15% (p=0.000 n=9+9) ChaCha20/1350 11.9µs ± 0% 4.5µs ± 0% -62.51% (p=0.000 n=10+8) ChaCha20/65536 554µs ± 0% 181µs ± 0% -67.33% (p=0.000 n=10+10) name old speed new speed delta ChaCha20/32 46.3MB/s ± 0% 36.7MB/s ± 0% -20.87% (p=0.000 n=10+9) ChaCha20/63 83.9MB/s ± 0% 63.8MB/s ± 0% -23.97% (p=0.000 n=10+10) ChaCha20/64 94.9MB/s ± 0% 72.8MB/s ± 0% -23.31% (p=0.000 n=10+10) ChaCha20/256 112MB/s ± 0% 312MB/s ± 0% +178.74% (p=0.000 n=10+10) ChaCha20/1024 119MB/s ± 0% 350MB/s ± 0% +195.31% (p=0.000 n=10+9) ChaCha20/1350 114MB/s ± 0% 303MB/s ± 0% +166.73% (p=0.000 n=8+8) ChaCha20/65536 118MB/s ± 0% 362MB/s ± 0% +206.12% (p=0.000 n=10+10) Updates golang/go#22809 Change-Id: I487487faa2ae4ff29de6fd8eb1317740c2939c10 Reviewed-on: https://go-review.googlesource.com/c/107628 Reviewed-by: Filippo Valsorda --- internal/chacha20/asm_arm64.s | 308 ++++++++++++++++++++++++++++++ internal/chacha20/chacha_arm64.go | 31 +++ internal/chacha20/chacha_noasm.go | 2 +- 3 files changed, 340 insertions(+), 1 deletion(-) create mode 100644 internal/chacha20/asm_arm64.s create mode 100644 internal/chacha20/chacha_arm64.go diff --git a/internal/chacha20/asm_arm64.s b/internal/chacha20/asm_arm64.s new file mode 100644 index 0000000000..b3a16ef751 --- /dev/null +++ b/internal/chacha20/asm_arm64.s @@ -0,0 +1,308 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.11 +// +build !gccgo,!appengine + +#include "textflag.h" + +#define NUM_ROUNDS 10 + +// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) +TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0 + MOVD dst+0(FP), R1 + MOVD src+24(FP), R2 + MOVD src_len+32(FP), R3 + MOVD key+48(FP), R4 + MOVD nonce+56(FP), R6 + MOVD counter+64(FP), R7 + + MOVD $·constants(SB), R10 + MOVD $·incRotMatrix(SB), R11 + + MOVW (R7), R20 + + AND $~255, R3, R13 + ADD R2, R13, R12 // R12 for block end + AND $255, R3, R13 +loop: + MOVD $NUM_ROUNDS, R21 + VLD1 (R11), [V30.S4, V31.S4] + + // load contants + // VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4] + WORD $0x4D60E940 + + // load keys + // VLD4R 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4] + WORD $0x4DFFE884 + // VLD4R 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4] + WORD $0x4DFFE888 + SUB $32, R4 + + // load counter + nonce + // VLD1R (R7), [V12.S4] + WORD $0x4D40C8EC + + // VLD3R (R6), [V13.S4, V14.S4, V15.S4] + WORD $0x4D40E8CD + + // update counter + VADD V30.S4, V12.S4, V12.S4 + +chacha: + // V0..V3 += V4..V7 + // V12..V15 <<<= ((V12..V15 XOR V0..V3), 16) + VADD V0.S4, V4.S4, V0.S4 + VADD V1.S4, V5.S4, V1.S4 + VADD V2.S4, V6.S4, V2.S4 + VADD V3.S4, V7.S4, V3.S4 + VEOR V12.B16, V0.B16, V12.B16 + VEOR V13.B16, V1.B16, V13.B16 + VEOR V14.B16, V2.B16, V14.B16 + VEOR V15.B16, V3.B16, V15.B16 + VREV32 V12.H8, V12.H8 + VREV32 V13.H8, V13.H8 + VREV32 V14.H8, V14.H8 + VREV32 V15.H8, V15.H8 + // V8..V11 += V12..V15 + // V4..V7 <<<= ((V4..V7 XOR V8..V11), 12) + VADD V8.S4, V12.S4, V8.S4 + VADD V9.S4, V13.S4, V9.S4 + VADD V10.S4, V14.S4, V10.S4 + VADD V11.S4, V15.S4, V11.S4 + VEOR V8.B16, V4.B16, V16.B16 + VEOR V9.B16, V5.B16, V17.B16 + VEOR V10.B16, V6.B16, V18.B16 + VEOR V11.B16, V7.B16, V19.B16 + VSHL $12, V16.S4, V4.S4 + VSHL $12, V17.S4, V5.S4 + VSHL $12, V18.S4, V6.S4 + VSHL $12, V19.S4, V7.S4 + VSRI $20, V16.S4, V4.S4 + VSRI $20, V17.S4, V5.S4 + VSRI $20, V18.S4, V6.S4 + VSRI $20, V19.S4, V7.S4 + + // V0..V3 += V4..V7 + // V12..V15 <<<= ((V12..V15 XOR V0..V3), 8) + VADD V0.S4, V4.S4, V0.S4 + VADD V1.S4, V5.S4, V1.S4 + VADD V2.S4, V6.S4, V2.S4 + VADD V3.S4, V7.S4, V3.S4 + VEOR V12.B16, V0.B16, V12.B16 + VEOR V13.B16, V1.B16, V13.B16 + VEOR V14.B16, V2.B16, V14.B16 + VEOR V15.B16, V3.B16, V15.B16 + VTBL V31.B16, [V12.B16], V12.B16 + VTBL V31.B16, [V13.B16], V13.B16 + VTBL V31.B16, [V14.B16], V14.B16 + VTBL V31.B16, [V15.B16], V15.B16 + + // V8..V11 += V12..V15 + // V4..V7 <<<= ((V4..V7 XOR V8..V11), 7) + VADD V12.S4, V8.S4, V8.S4 + VADD V13.S4, V9.S4, V9.S4 + VADD V14.S4, V10.S4, V10.S4 + VADD V15.S4, V11.S4, V11.S4 + VEOR V8.B16, V4.B16, V16.B16 + VEOR V9.B16, V5.B16, V17.B16 + VEOR V10.B16, V6.B16, V18.B16 + VEOR V11.B16, V7.B16, V19.B16 + VSHL $7, V16.S4, V4.S4 + VSHL $7, V17.S4, V5.S4 + VSHL $7, V18.S4, V6.S4 + VSHL $7, V19.S4, V7.S4 + VSRI $25, V16.S4, V4.S4 + VSRI $25, V17.S4, V5.S4 + VSRI $25, V18.S4, V6.S4 + VSRI $25, V19.S4, V7.S4 + + // V0..V3 += V5..V7, V4 + // V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16) + VADD V0.S4, V5.S4, V0.S4 + VADD V1.S4, V6.S4, V1.S4 + VADD V2.S4, V7.S4, V2.S4 + VADD V3.S4, V4.S4, V3.S4 + VEOR V15.B16, V0.B16, V15.B16 + VEOR V12.B16, V1.B16, V12.B16 + VEOR V13.B16, V2.B16, V13.B16 + VEOR V14.B16, V3.B16, V14.B16 + VREV32 V12.H8, V12.H8 + VREV32 V13.H8, V13.H8 + VREV32 V14.H8, V14.H8 + VREV32 V15.H8, V15.H8 + + // V10 += V15; V5 <<<= ((V10 XOR V5), 12) + // ... + VADD V15.S4, V10.S4, V10.S4 + VADD V12.S4, V11.S4, V11.S4 + VADD V13.S4, V8.S4, V8.S4 + VADD V14.S4, V9.S4, V9.S4 + VEOR V10.B16, V5.B16, V16.B16 + VEOR V11.B16, V6.B16, V17.B16 + VEOR V8.B16, V7.B16, V18.B16 + VEOR V9.B16, V4.B16, V19.B16 + VSHL $12, V16.S4, V5.S4 + VSHL $12, V17.S4, V6.S4 + VSHL $12, V18.S4, V7.S4 + VSHL $12, V19.S4, V4.S4 + VSRI $20, V16.S4, V5.S4 + VSRI $20, V17.S4, V6.S4 + VSRI $20, V18.S4, V7.S4 + VSRI $20, V19.S4, V4.S4 + + // V0 += V5; V15 <<<= ((V0 XOR V15), 8) + // ... + VADD V5.S4, V0.S4, V0.S4 + VADD V6.S4, V1.S4, V1.S4 + VADD V7.S4, V2.S4, V2.S4 + VADD V4.S4, V3.S4, V3.S4 + VEOR V0.B16, V15.B16, V15.B16 + VEOR V1.B16, V12.B16, V12.B16 + VEOR V2.B16, V13.B16, V13.B16 + VEOR V3.B16, V14.B16, V14.B16 + VTBL V31.B16, [V12.B16], V12.B16 + VTBL V31.B16, [V13.B16], V13.B16 + VTBL V31.B16, [V14.B16], V14.B16 + VTBL V31.B16, [V15.B16], V15.B16 + + // V10 += V15; V5 <<<= ((V10 XOR V5), 7) + // ... + VADD V15.S4, V10.S4, V10.S4 + VADD V12.S4, V11.S4, V11.S4 + VADD V13.S4, V8.S4, V8.S4 + VADD V14.S4, V9.S4, V9.S4 + VEOR V10.B16, V5.B16, V16.B16 + VEOR V11.B16, V6.B16, V17.B16 + VEOR V8.B16, V7.B16, V18.B16 + VEOR V9.B16, V4.B16, V19.B16 + VSHL $7, V16.S4, V5.S4 + VSHL $7, V17.S4, V6.S4 + VSHL $7, V18.S4, V7.S4 + VSHL $7, V19.S4, V4.S4 + VSRI $25, V16.S4, V5.S4 + VSRI $25, V17.S4, V6.S4 + VSRI $25, V18.S4, V7.S4 + VSRI $25, V19.S4, V4.S4 + + SUB $1, R21 + CBNZ R21, chacha + + // VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4] + WORD $0x4D60E950 + + // VLD4R 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4] + WORD $0x4DFFE894 + VADD V30.S4, V12.S4, V12.S4 + VADD V16.S4, V0.S4, V0.S4 + VADD V17.S4, V1.S4, V1.S4 + VADD V18.S4, V2.S4, V2.S4 + VADD V19.S4, V3.S4, V3.S4 + // VLD4R 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4] + WORD $0x4DFFE898 + // restore R4 + SUB $32, R4 + + // load counter + nonce + // VLD1R (R7), [V28.S4] + WORD $0x4D40C8FC + // VLD3R (R6), [V29.S4, V30.S4, V31.S4] + WORD $0x4D40E8DD + + VADD V20.S4, V4.S4, V4.S4 + VADD V21.S4, V5.S4, V5.S4 + VADD V22.S4, V6.S4, V6.S4 + VADD V23.S4, V7.S4, V7.S4 + VADD V24.S4, V8.S4, V8.S4 + VADD V25.S4, V9.S4, V9.S4 + VADD V26.S4, V10.S4, V10.S4 + VADD V27.S4, V11.S4, V11.S4 + VADD V28.S4, V12.S4, V12.S4 + VADD V29.S4, V13.S4, V13.S4 + VADD V30.S4, V14.S4, V14.S4 + VADD V31.S4, V15.S4, V15.S4 + + VZIP1 V1.S4, V0.S4, V16.S4 + VZIP2 V1.S4, V0.S4, V17.S4 + VZIP1 V3.S4, V2.S4, V18.S4 + VZIP2 V3.S4, V2.S4, V19.S4 + VZIP1 V5.S4, V4.S4, V20.S4 + VZIP2 V5.S4, V4.S4, V21.S4 + VZIP1 V7.S4, V6.S4, V22.S4 + VZIP2 V7.S4, V6.S4, V23.S4 + VZIP1 V9.S4, V8.S4, V24.S4 + VZIP2 V9.S4, V8.S4, V25.S4 + VZIP1 V11.S4, V10.S4, V26.S4 + VZIP2 V11.S4, V10.S4, V27.S4 + VZIP1 V13.S4, V12.S4, V28.S4 + VZIP2 V13.S4, V12.S4, V29.S4 + VZIP1 V15.S4, V14.S4, V30.S4 + VZIP2 V15.S4, V14.S4, V31.S4 + VZIP1 V18.D2, V16.D2, V0.D2 + VZIP2 V18.D2, V16.D2, V4.D2 + VZIP1 V19.D2, V17.D2, V8.D2 + VZIP2 V19.D2, V17.D2, V12.D2 + VLD1.P 64(R2), [V16.B16, V17.B16, V18.B16, V19.B16] + + VZIP1 V22.D2, V20.D2, V1.D2 + VZIP2 V22.D2, V20.D2, V5.D2 + VZIP1 V23.D2, V21.D2, V9.D2 + VZIP2 V23.D2, V21.D2, V13.D2 + VLD1.P 64(R2), [V20.B16, V21.B16, V22.B16, V23.B16] + VZIP1 V26.D2, V24.D2, V2.D2 + VZIP2 V26.D2, V24.D2, V6.D2 + VZIP1 V27.D2, V25.D2, V10.D2 + VZIP2 V27.D2, V25.D2, V14.D2 + VLD1.P 64(R2), [V24.B16, V25.B16, V26.B16, V27.B16] + VZIP1 V30.D2, V28.D2, V3.D2 + VZIP2 V30.D2, V28.D2, V7.D2 + VZIP1 V31.D2, V29.D2, V11.D2 + VZIP2 V31.D2, V29.D2, V15.D2 + VLD1.P 64(R2), [V28.B16, V29.B16, V30.B16, V31.B16] + VEOR V0.B16, V16.B16, V16.B16 + VEOR V1.B16, V17.B16, V17.B16 + VEOR V2.B16, V18.B16, V18.B16 + VEOR V3.B16, V19.B16, V19.B16 + VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R1) + VEOR V4.B16, V20.B16, V20.B16 + VEOR V5.B16, V21.B16, V21.B16 + VEOR V6.B16, V22.B16, V22.B16 + VEOR V7.B16, V23.B16, V23.B16 + VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R1) + VEOR V8.B16, V24.B16, V24.B16 + VEOR V9.B16, V25.B16, V25.B16 + VEOR V10.B16, V26.B16, V26.B16 + VEOR V11.B16, V27.B16, V27.B16 + VST1.P [V24.B16, V25.B16, V26.B16, V27.B16], 64(R1) + VEOR V12.B16, V28.B16, V28.B16 + VEOR V13.B16, V29.B16, V29.B16 + VEOR V14.B16, V30.B16, V30.B16 + VEOR V15.B16, V31.B16, V31.B16 + VST1.P [V28.B16, V29.B16, V30.B16, V31.B16], 64(R1) + + ADD $4, R20 + MOVW R20, (R7) // update counter + + CMP R2, R12 + BGT loop + + RET + + +DATA ·constants+0x00(SB)/4, $0x61707865 +DATA ·constants+0x04(SB)/4, $0x3320646e +DATA ·constants+0x08(SB)/4, $0x79622d32 +DATA ·constants+0x0c(SB)/4, $0x6b206574 +GLOBL ·constants(SB), NOPTR|RODATA, $32 + +DATA ·incRotMatrix+0x00(SB)/4, $0x00000000 +DATA ·incRotMatrix+0x04(SB)/4, $0x00000001 +DATA ·incRotMatrix+0x08(SB)/4, $0x00000002 +DATA ·incRotMatrix+0x0c(SB)/4, $0x00000003 +DATA ·incRotMatrix+0x10(SB)/4, $0x02010003 +DATA ·incRotMatrix+0x14(SB)/4, $0x06050407 +DATA ·incRotMatrix+0x18(SB)/4, $0x0A09080B +DATA ·incRotMatrix+0x1c(SB)/4, $0x0E0D0C0F +GLOBL ·incRotMatrix(SB), NOPTR|RODATA, $32 diff --git a/internal/chacha20/chacha_arm64.go b/internal/chacha20/chacha_arm64.go new file mode 100644 index 0000000000..ad74e23aef --- /dev/null +++ b/internal/chacha20/chacha_arm64.go @@ -0,0 +1,31 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.11 +// +build !gccgo + +package chacha20 + +const ( + haveAsm = true + bufSize = 256 +) + +//go:noescape +func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32) + +func (c *Cipher) xorKeyStreamAsm(dst, src []byte) { + + if len(src) >= bufSize { + xorKeyStreamVX(dst, src, &c.key, &c.nonce, &c.counter) + } + + if len(src)%bufSize != 0 { + i := len(src) - len(src)%bufSize + c.buf = [bufSize]byte{} + copy(c.buf[:], src[i:]) + xorKeyStreamVX(c.buf[:], c.buf[:], &c.key, &c.nonce, &c.counter) + c.len = bufSize - copy(dst[i:], c.buf[:len(src)%bufSize]) + } +} diff --git a/internal/chacha20/chacha_noasm.go b/internal/chacha20/chacha_noasm.go index 91520d1de0..47eac0314c 100644 --- a/internal/chacha20/chacha_noasm.go +++ b/internal/chacha20/chacha_noasm.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !s390x gccgo appengine +// +build !arm64,!s390x arm64,!go1.11 gccgo appengine package chacha20