fixes for issue #15 -- ARM assembly alignment bug

This change reintroduces ARM assembly for the streaming hash with a fix for unaligned accesses. The HashXXX() functions for ARM are implemented with calls to the underlying blocks() assembler routine, and so are not performant for tiny input sizes (crossover is at around 16 bytes). Performance benchmarking from a Marvell Armada 3720 here: Before: ``` goos: linux goarch: arm pkg: github.com/dchest/siphash BenchmarkHash8 2000000 607 ns/op 13.17 MB/s BenchmarkHash16 2000000 768 ns/op 20.82 MB/s BenchmarkHash40 1000000 1208 ns/op 33.09 MB/s BenchmarkHash64 1000000 1658 ns/op 38.60 MB/s BenchmarkHash128 500000 2844 ns/op 45.00 MB/s BenchmarkHash1K 100000 19368 ns/op 52.87 MB/s BenchmarkHash1Kunaligned 100000 19348 ns/op 52.93 MB/s BenchmarkHash8K 10000 151758 ns/op 53.98 MB/s BenchmarkHash128_8 2000000 863 ns/op 9.27 MB/s BenchmarkHash128_16 1000000 1029 ns/op 15.55 MB/s BenchmarkHash128_40 1000000 1469 ns/op 27.22 MB/s BenchmarkHash128_64 1000000 1909 ns/op 33.52 MB/s BenchmarkHash128_128 500000 3084 ns/op 41.50 MB/s BenchmarkHash128_1K 100000 19458 ns/op 52.62 MB/s BenchmarkHash128_8K 10000 150805 ns/op 54.32 MB/s BenchmarkFull8 1000000 1128 ns/op 7.09 MB/s BenchmarkFull16 1000000 1282 ns/op 12.48 MB/s BenchmarkFull40 1000000 1281 ns/op 18.73 MB/s BenchmarkFull64 1000000 2236 ns/op 28.62 MB/s BenchmarkFull128 1000000 2237 ns/op 57.20 MB/s BenchmarkFull1K 100000 21799 ns/op 46.97 MB/s BenchmarkFull1Kunaligned 100000 21479 ns/op 47.67 MB/s BenchmarkFull8K 10000 164920 ns/op 49.67 MB/s BenchmarkFull128_8 1000000 2120 ns/op 3.77 MB/s BenchmarkFull128_16 1000000 2271 ns/op 7.04 MB/s BenchmarkFull128_40 1000000 2266 ns/op 10.59 MB/s BenchmarkFull128_64 500000 3251 ns/op 19.68 MB/s BenchmarkFull128_128 500000 3238 ns/op 39.53 MB/s BenchmarkFull128_1K 100000 22546 ns/op 45.42 MB/s BenchmarkFull128_8K 10000 166300 ns/op 49.26 MB/s PASS ``` After: ``` goos: linux goarch: arm pkg: github.com/dchest/siphash BenchmarkHash8 2000000 677 ns/op 11.81 MB/s BenchmarkHash16 2000000 737 
ns/op 21.71 MB/s BenchmarkHash40 2000000 963 ns/op 41.54 MB/s BenchmarkHash64 1000000 1167 ns/op 54.82 MB/s BenchmarkHash128 1000000 1719 ns/op 74.45 MB/s BenchmarkHash1K 200000 9349 ns/op 109.52 MB/s BenchmarkHash1Kunaligned 200000 11115 ns/op 92.12 MB/s BenchmarkHash8K 20000 70468 ns/op 116.25 MB/s BenchmarkHash128_8 1000000 1133 ns/op 7.06 MB/s BenchmarkHash128_16 1000000 1202 ns/op 13.31 MB/s BenchmarkHash128_40 1000000 1437 ns/op 27.83 MB/s BenchmarkHash128_64 1000000 1631 ns/op 39.23 MB/s BenchmarkHash128_128 1000000 2183 ns/op 58.62 MB/s BenchmarkHash128_1K 200000 9795 ns/op 104.54 MB/s BenchmarkHash128_8K 20000 70894 ns/op 115.55 MB/s BenchmarkFull8 2000000 694 ns/op 11.52 MB/s BenchmarkFull16 2000000 760 ns/op 21.03 MB/s BenchmarkFull40 2000000 764 ns/op 31.40 MB/s BenchmarkFull64 1000000 1186 ns/op 53.96 MB/s BenchmarkFull128 1000000 1181 ns/op 108.35 MB/s BenchmarkFull1K 200000 9399 ns/op 108.94 MB/s BenchmarkFull1Kunaligned 200000 11186 ns/op 91.54 MB/s BenchmarkFull8K 20000 70458 ns/op 116.27 MB/s BenchmarkFull128_8 1000000 2005 ns/op 3.99 MB/s BenchmarkFull128_16 1000000 2066 ns/op 7.74 MB/s BenchmarkFull128_40 1000000 2076 ns/op 11.56 MB/s BenchmarkFull128_64 500000 2495 ns/op 25.64 MB/s BenchmarkFull128_128 500000 2492 ns/op 51.36 MB/s BenchmarkFull128_1K 200000 10705 ns/op 95.65 MB/s BenchmarkFull128_8K 20000 71909 ns/op 113.92 MB/s PASS ```
dchest · Aug 17, 2018 · 4f4eba0 · 4f4eba0
1 parent ca249f4
commit 4f4eba0
Show file tree

Hide file tree

Showing 8 changed files with 243 additions and 16 deletions.
diff --git a/blocks.go b/blocks.go
@@ -1,4 +1,4 @@
-// +build !amd64 appengine gccgo
+// +build !arm,!amd64 appengine gccgo
 
 package siphash
 

diff --git a/blocks_arm.s b/blocks_arm.s
@@ -0,0 +1,144 @@
+#include "textflag.h"
+// ROUND applies two SipRounds to the state v0=R1:R0, v1=R3:R2, v2=R5:R4, v3=R7:R6 (hi:lo); it clobbers R8, R11 and the condition flags. Each ADD.S/ADC pair is a 64-bit add, each group of four shifted EORs is a 64-bit rotate-and-xor, and rotate-by-32 is done by swapping which register of a pair is treated as the low half.
+#define ROUND()\
+	ADD.S	R2,R0,R0;\
+	ADC	R3,R1,R1;\
+	EOR	R2<<13,R0,R8;\
+	EOR	R3>>19,R8,R8;\
+	EOR	R2>>19,R1,R11;\
+	EOR	R3<<13,R11,R11;\
+	ADD.S	R6,R4,R4;\
+	ADC	R7,R5,R5;\
+	EOR	R6<<16,R4,R2;\
+	EOR	R7>>16,R2,R2;\
+	EOR	R6>>16,R5,R3;\
+	EOR	R7<<16,R3,R3;\
+	ADD.S	R2,R1,R1;\
+	ADC	R3,R0,R0;\
+	EOR	R2<<21,R1,R6;\
+	EOR	R3>>11,R6,R6;\
+	EOR	R2>>11,R0,R7;\
+	EOR	R3<<21,R7,R7;\
+	ADD.S	R8,R4,R4;\
+	ADC	R11,R5,R5;\
+	EOR	R8<<17,R4,R2;\
+	EOR	R11>>15,R2,R2;\
+	EOR	R8>>15,R5,R3;\
+	EOR	R11<<17,R3,R3;\
+	ADD.S	R2,R1,R1;\
+	ADC	R3,R0,R0;\
+	EOR	R2<<13,R1,R8;\
+	EOR	R3>>19,R8,R8;\
+	EOR	R2>>19,R0,R11;\
+	EOR	R3<<13,R11,R11;\
+	ADD.S	R6,R5,R5;\
+	ADC	R7,R4,R4;\
+	EOR	R6<<16,R5,R2;\
+	EOR	R7>>16,R2,R2;\
+	EOR	R6>>16,R4,R3;\
+	EOR	R7<<16,R3,R3;\
+	ADD.S	R2,R0,R0;\
+	ADC	R3,R1,R1;\
+	EOR	R2<<21,R0,R6;\
+	EOR	R3>>11,R6,R6;\
+	EOR	R2>>11,R1,R7;\
+	EOR	R3<<21,R7,R7;\
+	ADD.S	R8,R5,R5;\
+	ADC	R11,R4,R4;\
+	EOR	R8<<17,R5,R2;\
+	EOR	R11>>15,R2,R2;\
+	EOR	R8>>15,R4,R3;\
+	EOR	R11<<17,R3,R3;
+
+// once(d *digest): absorbs the single 8-byte word stored at offset 48 of *d into the 32-byte state at the start of *d and writes the state back.
+TEXT ·once(SB),NOSPLIT,$4-4 // NOTE(review): the 4-byte frame is presumably there so LR (R14) is saved and can be used as scratch — confirm
+	MOVW	d+0(FP),R8 // R8 = d
+	MOVM.IA	(R8),[R0,R1,R2,R3,R4,R5,R6,R7] // load the first 32 bytes of *d: four 64-bit words as lo/hi register pairs
+	MOVW	48(R8),R12 // low word of the 8-byte block at d+48
+	MOVW	52(R8),R14 // high word of that block
+	EOR	R12,R6,R6 // xor the block into the fourth state word (R7:R6) before the rounds
+	EOR	R14,R7,R7
+	ROUND() // two SipRounds; clobbers R8 and R11
+	EOR	R12,R0,R0 // xor the block into the first state word (R1:R0) after the rounds
+	EOR	R14,R1,R1
+	MOVW	d+0(FP),R8 // reload d because ROUND overwrote R8
+	MOVM.IA [R0,R1,R2,R3,R4,R5,R6,R7],(R8) // store the updated state back to *d
+	RET
+
+// finalize(d *digest) uint64: absorbs the 8-byte word at offset 48 of *d, runs the closing rounds and returns the xor of the four state words; *d itself is not written back.
+TEXT ·finalize(SB),NOSPLIT,$4-12 // 12 bytes of arguments: 4-byte pointer plus 8-byte result
+	MOVW	d+0(FP),R8 // R8 = d
+	MOVM.IA	(R8),[R0,R1,R2,R3,R4,R5,R6,R7] // load the first 32 bytes of *d: four 64-bit words as lo/hi register pairs
+	MOVW	48(R8),R12 // low word of the 8-byte block at d+48
+	MOVW	52(R8),R14 // high word of that block
+	EOR	R12,R6,R6 // xor the block into the fourth state word (R7:R6)
+	EOR	R14,R7,R7
+	ROUND() // two SipRounds for the last block
+	EOR	R12,R0,R0 // xor the block into the first state word (R1:R0)
+	EOR	R14,R1,R1
+	EOR	$255,R4 // flip the low byte of the third state word (R5:R4) before the closing rounds
+	ROUND() // closing rounds: two ROUND() invocations, i.e. four SipRounds
+	ROUND()
+	EOR	R2,R0,R0 // fold: (R1:R0) ^= (R3:R2)
+	EOR	R3,R1,R1
+	EOR	R6,R4,R4 // fold: (R5:R4) ^= (R7:R6)
+	EOR	R7,R5,R5
+	EOR	R4,R0,R0 // R1:R0 now holds the xor of all four state words
+	EOR	R5,R1,R1
+	MOVW	R0,ret_lo+4(FP) // low 32 bits of the result
+	MOVW	R1,ret_hi+8(FP) // high 32 bits of the result
+	RET
+
+// blocks(d *digest, data []uint8): absorbs data into the state at the start of *d eight bytes at a time, using word loads when the data pointer is 4-byte aligned and byte loads otherwise.
+TEXT ·blocks(SB),NOSPLIT,$8-16 // 8-byte frame holds the saved R9 and the end pointer
+	MOVW	R9,sav-8(SP) // preserve R9; it is restored before each RET
+	MOVW	d+0(FP),R8 // R8 = d
+	MOVM.IA	(R8),[R0,R1,R2,R3,R4,R5,R6,R7] // load the first 32 bytes of *d: four 64-bit words as lo/hi register pairs
+	MOVW	p+4(FP),R9 // R9 = data pointer (cursor)
+	MOVW	p_len+8(FP),R11 // R11 = len(data)
+	ADD	R9,R11,R11 // R11 = end pointer (data + len)
+	MOVW	R11,endp-4(SP) // spill the end pointer: ROUND uses R11 as a temporary
+	AND.S   $3,R9,R8 // low two address bits; non-zero means the pointer is not 4-byte aligned
+	BNE     blocksunaligned
+blocksloop: // NOTE(review): the body runs before the end check and steps by 8 — assumes len(data) is a non-zero multiple of 8; confirm against callers
+	MOVM.IA.W (R9),[R12,R14] // load one 8-byte block as two words and advance R9 by 8 (writeback)
+	EOR	R12,R6,R6 // xor the block into the fourth state word (R7:R6)
+	EOR	R14,R7,R7
+	ROUND() // two SipRounds; clobbers R8 and R11
+	EOR	R12,R0,R0 // xor the block into the first state word (R1:R0)
+	EOR	R14,R1,R1
+	MOVW	endp-4(SP),R11 // reload the end pointer
+	CMP	R11,R9
+	BLO	blocksloop // continue while cursor < end (unsigned)
+	MOVW	d+0(FP),R8 // reload d because ROUND overwrote R8
+	MOVM.IA [R0,R1,R2,R3,R4,R5,R6,R7],(R8) // store the updated state back to *d
+	MOVW	sav-8(SP),R9 // restore the caller's R9
+	RET
+blocksunaligned: // same loop as above, but the block is assembled from single-byte loads so no unaligned word access occurs
+	MOVBU    (R9),R12 // low word, little-endian: byte 0
+	MOVBU    1(R9),R11
+	ORR     R11<<8,R12,R12 // byte 1
+	MOVBU    2(R9),R11
+	ORR     R11<<16,R12,R12 // byte 2
+	MOVBU    3(R9),R11
+	ORR     R11<<24,R12,R12 // byte 3
+	MOVBU    4(R9),R14 // high word, little-endian: byte 4
+	MOVBU    5(R9),R11
+	ORR     R11<<8,R14,R14 // byte 5
+	MOVBU    6(R9),R11
+	ORR     R11<<16,R14,R14 // byte 6
+	MOVBU    7(R9),R11
+	ORR     R11<<24,R14,R14 // byte 7
+	ADD     $8,R9,R9 // advance the cursor past this block
+	EOR     R12,R6,R6 // xor the block into the fourth state word (R7:R6)
+	EOR     R14,R7,R7
+	ROUND() // two SipRounds; clobbers R8 and R11
+	EOR     R12,R0,R0 // xor the block into the first state word (R1:R0)
+	EOR     R14,R1,R1
+	MOVW    endp-4(SP),R11 // reload the end pointer
+	CMP     R11,R9
+	BLO     blocksunaligned // continue while cursor < end (unsigned)
+	MOVW    d+0(FP),R8 // reload d because ROUND overwrote R8
+	MOVM.IA [R0,R1,R2,R3,R4,R5,R6,R7],(R8) // store the updated state back to *d
+	MOVW    sav-8(SP),R9 // restore the caller's R9
+	RET
diff --git a/blocks_asm.go b/blocks_asm.go
@@ -0,0 +1,21 @@
+// +build arm amd64,!appengine,!gccgo
+
+// Written in 2012 by Dmitry Chestnykh.
+//
+// To the extent possible under law, the author have dedicated all copyright
+// and related and neighboring rights to this software to the public domain
+// worldwide. This software is distributed without any warranty.
+// http://creativecommons.org/publicdomain/zero/1.0/
+
+// This file contains the function declarations for use with assembly implementations of Hash()
+
+package siphash
+
+//go:noescape
+func blocks(d *digest, p []uint8) // no Go body; on ARM the implementation is ·blocks in blocks_arm.s
+
+//go:noescape
+func finalize(d *digest) uint64 // no Go body; on ARM the implementation is ·finalize in blocks_arm.s
+
+//go:noescape
+func once(d *digest) // no Go body; on ARM the implementation is ·once in blocks_arm.s
diff --git a/hash.go b/hash.go
@@ -1,4 +1,4 @@
-// +build !amd64 appengine gccgo
+// +build !arm,!amd64 appengine gccgo
 
 // Written in 2012 by Dmitry Chestnykh.
 //

diff --git a/hash128.go b/hash128.go
@@ -1,4 +1,4 @@
-// +build !amd64 appengine gccgo
+// +build !arm,!amd64 appengine gccgo
 // Written in 2012 by Dmitry Chestnykh.
 // Modifications 2014 for 128-bit hash function by Damian Gryski.
 //

diff --git a/hash_arm.go b/hash_arm.go
@@ -0,0 +1,27 @@
+// +build arm
+
+package siphash
+
+// NB: The ARM implementation forgoes extra speed for Hash()
+// and Hash128() by simply reusing the same blocks() implementation
+// in assembly used by the streaming hash.
+
+func Hash(k0, k1 uint64, p []byte) uint64 {
+	var d digest
+	d.size = Size
+	d.k0 = k0
+	d.k1 = k1
+	d.Reset()
+	d.Write(p)
+	return d.Sum64()
+}
+
+func Hash128(k0, k1 uint64, p []byte) (uint64, uint64) {
+	var d digest
+	d.size = Size128
+	d.k0 = k0
+	d.k1 = k1
+	d.Reset()
+	d.Write(p)
+	return d.sum128()
+}
diff --git a/hash_asm.go b/hash_asm.go
@@ -22,12 +22,3 @@ func Hash(k0, k1 uint64, b []byte) uint64
 // Hash128 returns the 128-bit SipHash-2-4 of the given byte slice with two
 // 64-bit parts of 128-bit key: k0 and k1.
 func Hash128(k0, k1 uint64, b []byte) (uint64, uint64)
-
-//go:noescape
-func blocks(d *digest, p []uint8)
-
-//go:noescape
-func finalize(d *digest) uint64
-
-//go:noescape
-func once(d *digest)
diff --git a/siphash_test.go b/siphash_test.go
@@ -347,16 +347,60 @@ func TestAlign(t *testing.T) {
 	data := "0076a9143219adce9b6f0a21fd53cb17e2fd9b2b4fac40b388ac"
 	k0 := uint64(316665572293978160)
 	k1 := uint64(8573005253291875333)
-	want := uint64(16770526497674945769)
+
+	want := []uint64{
+		16380727507974277821,
+		16770526497674945769,
+		11373998677292870540,
+		10374222295991299613,
+	}
+	want128 := []uint64{
+		14802151199638645495,
+		13251497035884452880,
+		7034723853391616289,
+		16742813562040528752,
+		10468120447644272532,
+		10941274532208162335,
+		11293904790559355408,
+		15432350433573653068,
+	}
 
 	d, err := hex.DecodeString(data)
 	if err != nil {
 		t.Fatal(err)
 	}
 
-	res := Hash(k0, k1, d[1:])
-	if res != want {
-		t.Fatalf("Expected %v got %v", want, res)
+	var k [16]byte
+	binary.LittleEndian.PutUint64(k[0:], k0)
+	binary.LittleEndian.PutUint64(k[8:], k1)
+
+	for i := range want {
+		res := Hash(k0, k1, d[i:])
+		if res != want[i] {
+			t.Fatalf("Expected %v got %v", want[i], res)
+		}
+		reslo, reshi := Hash128(k0, k1, d[i:])
+		if reslo != want128[i*2] {
+			t.Fatalf("Expected %v got %v", want128[i*2], reslo)
+		}
+		if reshi != want128[i*2+1] {
+			t.Fatalf("Expected %v got %v", want128[i*2+1], reshi)
+		}
+		dig := newDigest(Size, k[:])
+		dig.Write(d[i:])
+		res = dig.Sum64()
+		if res != want[i] {
+			t.Fatalf("Expected %v got %v", want[i], res)
+		}
+		dig128 := newDigest(Size128, k[:])
+		dig128.Write(d[i:])
+		reslo, reshi = dig128.sum128()
+		if reslo != want128[i*2] {
+			t.Fatalf("Expected %v got %v", want128[i*2], reslo)
+		}
+		if reshi != want128[i*2+1] {
+			t.Fatalf("Expected %v got %v", want128[i*2+1], reshi)
+		}
 	}
 }