Skip to content

Commit

Permalink
fixes for issue #15 -- ARM assembly alignment bug
Browse files Browse the repository at this point in the history
This change reintroduces ARM assembly for the streaming hash
with a fix for unaligned accesses. The HashXXX() functions
for ARM are implemented with calls to the underlying blocks()
assembler routine, and so are not performant for tiny input
sizes (crossover is at around 16 bytes). Performance benchmarking
from a Marvell Armada 3720 here:

Before:

```
goos: linux
goarch: arm
pkg: github.com/dchest/siphash
BenchmarkHash8           	 2000000	       607 ns/op	  13.17 MB/s
BenchmarkHash16          	 2000000	       768 ns/op	  20.82 MB/s
BenchmarkHash40          	 1000000	      1208 ns/op	  33.09 MB/s
BenchmarkHash64          	 1000000	      1658 ns/op	  38.60 MB/s
BenchmarkHash128         	  500000	      2844 ns/op	  45.00 MB/s
BenchmarkHash1K          	  100000	     19368 ns/op	  52.87 MB/s
BenchmarkHash1Kunaligned 	  100000	     19348 ns/op	  52.93 MB/s
BenchmarkHash8K          	   10000	    151758 ns/op	  53.98 MB/s
BenchmarkHash128_8       	 2000000	       863 ns/op	   9.27 MB/s
BenchmarkHash128_16      	 1000000	      1029 ns/op	  15.55 MB/s
BenchmarkHash128_40      	 1000000	      1469 ns/op	  27.22 MB/s
BenchmarkHash128_64      	 1000000	      1909 ns/op	  33.52 MB/s
BenchmarkHash128_128     	  500000	      3084 ns/op	  41.50 MB/s
BenchmarkHash128_1K      	  100000	     19458 ns/op	  52.62 MB/s
BenchmarkHash128_8K      	   10000	    150805 ns/op	  54.32 MB/s
BenchmarkFull8           	 1000000	      1128 ns/op	   7.09 MB/s
BenchmarkFull16          	 1000000	      1282 ns/op	  12.48 MB/s
BenchmarkFull40          	 1000000	      1281 ns/op	  18.73 MB/s
BenchmarkFull64          	 1000000	      2236 ns/op	  28.62 MB/s
BenchmarkFull128         	 1000000	      2237 ns/op	  57.20 MB/s
BenchmarkFull1K          	  100000	     21799 ns/op	  46.97 MB/s
BenchmarkFull1Kunaligned 	  100000	     21479 ns/op	  47.67 MB/s
BenchmarkFull8K          	   10000	    164920 ns/op	  49.67 MB/s
BenchmarkFull128_8       	 1000000	      2120 ns/op	   3.77 MB/s
BenchmarkFull128_16      	 1000000	      2271 ns/op	   7.04 MB/s
BenchmarkFull128_40      	 1000000	      2266 ns/op	  10.59 MB/s
BenchmarkFull128_64      	  500000	      3251 ns/op	  19.68 MB/s
BenchmarkFull128_128     	  500000	      3238 ns/op	  39.53 MB/s
BenchmarkFull128_1K      	  100000	     22546 ns/op	  45.42 MB/s
BenchmarkFull128_8K      	   10000	    166300 ns/op	  49.26 MB/s
PASS
```

After:

```
goos: linux
goarch: arm
pkg: github.com/dchest/siphash
BenchmarkHash8           	 2000000	       677 ns/op	  11.81 MB/s
BenchmarkHash16          	 2000000	       737 ns/op	  21.71 MB/s
BenchmarkHash40          	 2000000	       963 ns/op	  41.54 MB/s
BenchmarkHash64          	 1000000	      1167 ns/op	  54.82 MB/s
BenchmarkHash128         	 1000000	      1719 ns/op	  74.45 MB/s
BenchmarkHash1K          	  200000	      9349 ns/op	 109.52 MB/s
BenchmarkHash1Kunaligned 	  200000	     11115 ns/op	  92.12 MB/s
BenchmarkHash8K          	   20000	     70468 ns/op	 116.25 MB/s
BenchmarkHash128_8       	 1000000	      1133 ns/op	   7.06 MB/s
BenchmarkHash128_16      	 1000000	      1202 ns/op	  13.31 MB/s
BenchmarkHash128_40      	 1000000	      1437 ns/op	  27.83 MB/s
BenchmarkHash128_64      	 1000000	      1631 ns/op	  39.23 MB/s
BenchmarkHash128_128     	 1000000	      2183 ns/op	  58.62 MB/s
BenchmarkHash128_1K      	  200000	      9795 ns/op	 104.54 MB/s
BenchmarkHash128_8K      	   20000	     70894 ns/op	 115.55 MB/s
BenchmarkFull8           	 2000000	       694 ns/op	  11.52 MB/s
BenchmarkFull16          	 2000000	       760 ns/op	  21.03 MB/s
BenchmarkFull40          	 2000000	       764 ns/op	  31.40 MB/s
BenchmarkFull64          	 1000000	      1186 ns/op	  53.96 MB/s
BenchmarkFull128         	 1000000	      1181 ns/op	 108.35 MB/s
BenchmarkFull1K          	  200000	      9399 ns/op	 108.94 MB/s
BenchmarkFull1Kunaligned 	  200000	     11186 ns/op	  91.54 MB/s
BenchmarkFull8K          	   20000	     70458 ns/op	 116.27 MB/s
BenchmarkFull128_8       	 1000000	      2005 ns/op	   3.99 MB/s
BenchmarkFull128_16      	 1000000	      2066 ns/op	   7.74 MB/s
BenchmarkFull128_40      	 1000000	      2076 ns/op	  11.56 MB/s
BenchmarkFull128_64      	  500000	      2495 ns/op	  25.64 MB/s
BenchmarkFull128_128     	  500000	      2492 ns/op	  51.36 MB/s
BenchmarkFull128_1K      	  200000	     10705 ns/op	  95.65 MB/s
BenchmarkFull128_8K      	   20000	     71909 ns/op	 113.92 MB/s
PASS
```
  • Loading branch information
rakitzis committed Aug 17, 2018
1 parent ca249f4 commit 4f4eba0
Show file tree
Hide file tree
Showing 8 changed files with 243 additions and 16 deletions.
2 changes: 1 addition & 1 deletion blocks.go
@@ -1,4 +1,4 @@
// +build !amd64 appengine gccgo
// +build !arm,!amd64 appengine gccgo

package siphash

Expand Down
144 changes: 144 additions & 0 deletions blocks_arm.s
@@ -0,0 +1,144 @@
#include "textflag.h"

// ROUND applies two consecutive SipHash rounds to the state, which is held
// as four 64-bit values in pairs of 32-bit registers (high:low):
//
//	v0 = R1:R0   v1 = R3:R2   v2 = R5:R4   v3 = R7:R6
//
// Every 64-bit addition is an ADD.S on the low words followed by an ADC on
// the high words. Every "rotate left by n, then xor" is built from two EORs
// per half using shifted operands (x<<n combined with the other half
// shifted right by 32-n). The rotations by 32 are never executed: after the
// first round the following instructions simply treat the low and high
// registers of v0 and v2 as swapped, and the second round swaps them back,
// so the state ends up in the same registers it started in.
//
// R8 and R11 are used as scratch and are clobbered, as are the condition
// flags (ADD.S). R9, R12 and R14 are not touched by the macro.
#define ROUND()\
ADD.S R2,R0,R0;\
ADC R3,R1,R1;\
EOR R2<<13,R0,R8;\
EOR R3>>19,R8,R8;\
EOR R2>>19,R1,R11;\
EOR R3<<13,R11,R11;\
ADD.S R6,R4,R4;\
ADC R7,R5,R5;\
EOR R6<<16,R4,R2;\
EOR R7>>16,R2,R2;\
EOR R6>>16,R5,R3;\
EOR R7<<16,R3,R3;\
ADD.S R2,R1,R1;\
ADC R3,R0,R0;\
EOR R2<<21,R1,R6;\
EOR R3>>11,R6,R6;\
EOR R2>>11,R0,R7;\
EOR R3<<21,R7,R7;\
ADD.S R8,R4,R4;\
ADC R11,R5,R5;\
EOR R8<<17,R4,R2;\
EOR R11>>15,R2,R2;\
EOR R8>>15,R5,R3;\
EOR R11<<17,R3,R3;\
ADD.S R2,R1,R1;\
ADC R3,R0,R0;\
EOR R2<<13,R1,R8;\
EOR R3>>19,R8,R8;\
EOR R2>>19,R0,R11;\
EOR R3<<13,R11,R11;\
ADD.S R6,R5,R5;\
ADC R7,R4,R4;\
EOR R6<<16,R5,R2;\
EOR R7>>16,R2,R2;\
EOR R6>>16,R4,R3;\
EOR R7<<16,R3,R3;\
ADD.S R2,R0,R0;\
ADC R3,R1,R1;\
EOR R2<<21,R0,R6;\
EOR R3>>11,R6,R6;\
EOR R2>>11,R1,R7;\
EOR R3<<21,R7,R7;\
ADD.S R8,R5,R5;\
ADC R11,R4,R4;\
EOR R8<<17,R5,R2;\
EOR R11>>15,R2,R2;\
EOR R8>>15,R4,R3;\
EOR R11<<17,R3,R3;

// once(d *digest)
//
// once mixes a single 8-byte word, read from inside *d, into the state
// stored in the first eight 32-bit words of *d, and writes the updated
// state back. With m being the word at byte offsets 48..55 of *d, it
// performs: (R7:R6) ^= m, ROUND(), (R1:R0) ^= m.
// NOTE(review): offsets 48..55 are presumably the digest's internal block
// buffer; confirm against the digest struct layout.
TEXT ·once(SB),NOSPLIT,$4-4
// R8 = d; load the eight state words into R0-R7.
MOVW d+0(FP),R8
MOVM.IA (R8),[R0,R1,R2,R3,R4,R5,R6,R7]
// R12 = low half of m, R14 = high half of m.
MOVW 48(R8),R12
MOVW 52(R8),R14
// Xor m into R7:R6 before the rounds ...
EOR R12,R6,R6
EOR R14,R7,R7
ROUND()
// ... and into R1:R0 after them.
EOR R12,R0,R0
EOR R14,R1,R1
// ROUND() clobbered R8, so reload d before storing the state back.
MOVW d+0(FP),R8
MOVM.IA [R0,R1,R2,R3,R4,R5,R6,R7],(R8)
RET

// finalize(d *digest) uint64
//
// finalize computes a 64-bit result from the state in *d and returns it.
// The state in *d is only read; nothing is stored back. Steps:
//   1. mix in the 8-byte word at byte offsets 48..55 of *d exactly as
//      once does (xor into R7:R6, ROUND(), xor into R1:R0);
//   2. xor 255 into R4, the low word of the third state value;
//   3. run ROUND() twice (four rounds in total);
//   4. return the xor of all four 64-bit state values.
// NOTE(review): offsets 48..55 are presumably the digest's internal block
// buffer, already padded by the caller; confirm against the Go side.
TEXT ·finalize(SB),NOSPLIT,$4-12
// R8 = d; load the eight state words into R0-R7.
MOVW d+0(FP),R8
MOVM.IA (R8),[R0,R1,R2,R3,R4,R5,R6,R7]
// R12:R14 = low:high halves of the final word.
MOVW 48(R8),R12
MOVW 52(R8),R14
EOR R12,R6,R6
EOR R14,R7,R7
ROUND()
EOR R12,R0,R0
EOR R14,R1,R1
// Finalization constant, applied to the low word only.
EOR $255,R4
ROUND()
ROUND()
// Fold the four 64-bit values together: (R1:R0) ^= (R3:R2),
// (R5:R4) ^= (R7:R6), then (R1:R0) ^= (R5:R4).
EOR R2,R0,R0
EOR R3,R1,R1
EOR R6,R4,R4
EOR R7,R5,R5
EOR R4,R0,R0
EOR R5,R1,R1
// Store the 64-bit result as two 32-bit halves.
MOVW R0,ret_lo+4(FP)
MOVW R1,ret_hi+8(FP)
RET

// blocks(d *digest, data []uint8)
//
// blocks mixes consecutive 8-byte words of data into the state stored in
// the first eight 32-bit words of *d and writes the updated state back.
// For every word m it performs: (R7:R6) ^= m, ROUND(), (R1:R0) ^= m.
//
// Two loops are provided. If the data pointer is 4-byte aligned, each word
// is fetched with a single two-register load; otherwise each word is
// assembled from eight single-byte loads in little-endian order, so no
// unaligned word access is ever issued.
//
// Both loops compare against the end pointer only after a word has been
// processed. The routine therefore always consumes at least 8 bytes and
// keeps going until the cursor reaches or passes data+len: callers must
// pass a non-empty slice whose length is a multiple of 8, otherwise memory
// beyond the slice is read.
//
// Frame layout: sav-8(SP) preserves the incoming R9, endp-4(SP) holds the
// end pointer (it cannot live in R11 because ROUND() clobbers R11).
TEXT ·blocks(SB),NOSPLIT,$8-16
// Preserve R9; it is used as the data cursor below.
MOVW R9,sav-8(SP)
// R8 = d; load the eight state words into R0-R7.
MOVW d+0(FP),R8
MOVM.IA (R8),[R0,R1,R2,R3,R4,R5,R6,R7]
// R9 = data pointer, R11 = data pointer + length = end pointer.
MOVW p+4(FP),R9
MOVW p_len+8(FP),R11
ADD R9,R11,R11
MOVW R11,endp-4(SP)
// Take the byte-wise path when the low two bits of the pointer are set.
AND.S $3,R9,R8
BNE blocksunaligned
blocksloop:
// Load one 8-byte word into R12 (low) and R14 (high); .W advances R9 by 8.
MOVM.IA.W (R9),[R12,R14]
EOR R12,R6,R6
EOR R14,R7,R7
ROUND()
EOR R12,R0,R0
EOR R14,R1,R1
// Reload the end pointer (R11 was clobbered by ROUND) and loop while
// the cursor is below it.
MOVW endp-4(SP),R11
CMP R11,R9
BLO blocksloop
// ROUND() clobbered R8: reload d, store the state, restore R9.
MOVW d+0(FP),R8
MOVM.IA [R0,R1,R2,R3,R4,R5,R6,R7],(R8)
MOVW sav-8(SP),R9
RET
blocksunaligned:
// R12 = bytes 0..3 combined little-endian, using R11 as a temporary.
MOVBU (R9),R12
MOVBU 1(R9),R11
ORR R11<<8,R12,R12
MOVBU 2(R9),R11
ORR R11<<16,R12,R12
MOVBU 3(R9),R11
ORR R11<<24,R12,R12
// R14 = bytes 4..7 combined little-endian.
MOVBU 4(R9),R14
MOVBU 5(R9),R11
ORR R11<<8,R14,R14
MOVBU 6(R9),R11
ORR R11<<16,R14,R14
MOVBU 7(R9),R11
ORR R11<<24,R14,R14
// Advance the cursor past the word just read.
ADD $8,R9,R9
EOR R12,R6,R6
EOR R14,R7,R7
ROUND()
EOR R12,R0,R0
EOR R14,R1,R1
// Same end-of-data test as in the aligned loop.
MOVW endp-4(SP),R11
CMP R11,R9
BLO blocksunaligned
// ROUND() clobbered R8: reload d, store the state, restore R9.
MOVW d+0(FP),R8
MOVM.IA [R0,R1,R2,R3,R4,R5,R6,R7],(R8)
MOVW sav-8(SP),R9
RET
21 changes: 21 additions & 0 deletions blocks_asm.go
@@ -0,0 +1,21 @@
// +build arm amd64,!appengine,!gccgo

// Written in 2012 by Dmitry Chestnykh.
//
// To the extent possible under law, the author have dedicated all copyright
// and related and neighboring rights to this software to the public domain
// worldwide. This software is distributed without any warranty.
// http://creativecommons.org/publicdomain/zero/1.0/

// This file contains a function definition for use with assembly implementations of Hash()

package siphash

// blocks mixes the 8-byte words of p into the state of d. It has no Go
// body; the implementation is provided in assembly.
// NOTE(review): the ARM routine processes a word before testing for the end
// of p, so p presumably must be non-empty and a multiple of 8 bytes long;
// confirm that every caller guarantees this.
//
//go:noescape
func blocks(d *digest, p []uint8)

// finalize returns the 64-bit hash value computed from the state of d. It
// has no Go body; the implementation is provided in assembly.
// NOTE(review): the ARM routine reads the last word from inside d, so the
// caller presumably has to prepare d's internal buffer first; confirm
// against the calling code.
//
//go:noescape
func finalize(d *digest) uint64

// once mixes a single 8-byte word, taken from inside d, into the state of
// d. It has no Go body; the implementation is provided in assembly.
// NOTE(review): the word is presumably d's internal block buffer; confirm
// against the digest struct layout.
//
//go:noescape
func once(d *digest)
2 changes: 1 addition & 1 deletion hash.go
@@ -1,4 +1,4 @@
// +build !amd64 appengine gccgo
// +build !arm,!amd64 appengine gccgo

// Written in 2012 by Dmitry Chestnykh.
//
Expand Down
2 changes: 1 addition & 1 deletion hash128.go
@@ -1,4 +1,4 @@
// +build !amd64 appengine gccgo
// +build !arm,!amd64 appengine gccgo
// Written in 2012 by Dmitry Chestnykh.
// Modifications 2014 for 128-bit hash function by Damian Gryski.
//
Expand Down
27 changes: 27 additions & 0 deletions hash_arm.go
@@ -0,0 +1,27 @@
// +build arm

package siphash

// NB: The ARM implementation forgoes extra speed for Hash()
// and Hash128() by simply reusing the same blocks() implementation
// in assembly used by the streaming hash.

// Hash returns the 64-bit SipHash-2-4 of the given byte slice with two
// 64-bit parts of 128-bit key: k0 and k1.
//
// This variant has no dedicated one-shot routine: it feeds p through a
// temporary streaming digest and returns that digest's 64-bit sum.
func Hash(k0, k1 uint64, p []byte) uint64 {
	state := digest{size: Size, k0: k0, k1: k1}
	state.Reset()
	state.Write(p)
	return state.Sum64()
}

// Hash128 returns the 128-bit SipHash-2-4 of the given byte slice with two
// 64-bit parts of 128-bit key: k0 and k1.
//
// This variant has no dedicated one-shot routine: it feeds p through a
// temporary streaming digest configured for the 128-bit output size and
// returns both halves of that digest's sum.
func Hash128(k0, k1 uint64, p []byte) (uint64, uint64) {
	state := digest{size: Size128, k0: k0, k1: k1}
	state.Reset()
	state.Write(p)
	return state.sum128()
}
9 changes: 0 additions & 9 deletions hash_asm.go
Expand Up @@ -22,12 +22,3 @@ func Hash(k0, k1 uint64, b []byte) uint64
// Hash128 returns the 128-bit SipHash-2-4 of the given byte slice with two
// 64-bit parts of 128-bit key: k0 and k1.
func Hash128(k0, k1 uint64, b []byte) (uint64, uint64)

//go:noescape
func blocks(d *digest, p []uint8)

//go:noescape
func finalize(d *digest) uint64

//go:noescape
func once(d *digest)
52 changes: 48 additions & 4 deletions siphash_test.go
Expand Up @@ -347,16 +347,60 @@ func TestAlign(t *testing.T) {
data := "0076a9143219adce9b6f0a21fd53cb17e2fd9b2b4fac40b388ac"
k0 := uint64(316665572293978160)
k1 := uint64(8573005253291875333)
want := uint64(16770526497674945769)

want := []uint64{
16380727507974277821,
16770526497674945769,
11373998677292870540,
10374222295991299613,
}
want128 := []uint64{
14802151199638645495,
13251497035884452880,
7034723853391616289,
16742813562040528752,
10468120447644272532,
10941274532208162335,
11293904790559355408,
15432350433573653068,
}

d, err := hex.DecodeString(data)
if err != nil {
t.Fatal(err)
}

res := Hash(k0, k1, d[1:])
if res != want {
t.Fatalf("Expected %v got %v", want, res)
var k [16]byte
binary.LittleEndian.PutUint64(k[0:], k0)
binary.LittleEndian.PutUint64(k[8:], k1)

for i := range want {
res := Hash(k0, k1, d[i:])
if res != want[i] {
t.Fatalf("Expected %v got %v", want[i], res)
}
reslo, reshi := Hash128(k0, k1, d[i:])
if reslo != want128[i*2] {
t.Fatalf("Expected %v got %v", want128[i*2], reslo)
}
if reshi != want128[i*2+1] {
t.Fatalf("Expected %v got %v", want128[i*2+1], reshi)
}
dig := newDigest(Size, k[:])
dig.Write(d[i:])
res = dig.Sum64()
if res != want[i] {
t.Fatalf("Expected %v got %v", want[i], res)
}
dig128 := newDigest(Size128, k[:])
dig128.Write(d[i:])
reslo, reshi = dig128.sum128()
if reslo != want128[i*2] {
t.Fatalf("Expected %v got %v", want128[i*2], reslo)
}
if reshi != want128[i*2+1] {
t.Fatalf("Expected %v got %v", want128[i*2+1], reshi)
}
}
}

Expand Down

0 comments on commit 4f4eba0

Please sign in to comment.