Permalink
Browse files

Borrow finalization from dchest's Go version, re-work main loops, and use a slightly re-worked compress for SSE
  • Loading branch information...
floodyberry committed Jun 24, 2012
1 parent 9b27997 commit 801e2841ddb3ab77cc7d5712bb2683a9ba4e0cf4
Showing with 93 additions and 88 deletions.
  1. +23 −19 siphash.c
  2. +29 −39 siphash_sse2.c
  3. +41 −30 siphash_ssse3.c
View
@@ -16,8 +16,8 @@ uint64_t
siphash(unsigned char key[16], const unsigned char *m, size_t len) {
uint64_t v0, v1, v2, v3;
uint64_t mi, k0, k1;
- unsigned char buf[8];
- size_t i;
+ uint64_t last7;
+ size_t i, blocks;
k0 = U8TO64_LE(key + 0);
k1 = U8TO64_LE(key + 8);
@@ -26,8 +26,7 @@ siphash(unsigned char key[16], const unsigned char *m, size_t len) {
v2 = k0 ^ 0x6c7967656e657261ull;
v3 = k1 ^ 0x7465646279746573ull;
- buf[7] = (unsigned char)len;
- if (len < 8) goto sip7bytesorless;
+ last7 = (uint64_t)(len & 0xff) << 56;
#define sipcompress() \
v0 += v1; v2 += v3; \
@@ -39,24 +38,29 @@ siphash(unsigned char key[16], const unsigned char *m, size_t len) {
v1 ^= v2; v3 ^= v0; \
v2 = ROTL64(v2,32);
-siploop:
- mi = U8TO64_LE(m);
- v3 ^= mi;
- sipcompress()
- sipcompress()
- v0 ^= mi;
- m += 8;
- len -= 8;
- if (len >= 8) goto siploop;
+ for (i = 0, blocks = (len & ~7); i < blocks; i += 8) {
+ mi = U8TO64_LE(m + i);
+ v3 ^= mi;
+ sipcompress()
+ sipcompress()
+ v0 ^= mi;
+ }
-sip7bytesorless:
- for (i = 0; i < len; i++) buf[i] = m[i];
- for (; i < 7; i++) buf[i] = 0;
- mi = U8TO64_LE(buf);
- v3 ^= mi;
+ switch (len - blocks) {
+ case 7: last7 |= (uint64_t)m[i + 6] << 48;
+ case 6: last7 |= (uint64_t)m[i + 5] << 40;
+ case 5: last7 |= (uint64_t)m[i + 4] << 32;
+ case 4: last7 |= (uint64_t)m[i + 3] << 24;
+ case 3: last7 |= (uint64_t)m[i + 2] << 16;
+ case 2: last7 |= (uint64_t)m[i + 1] << 8;
+ case 1: last7 |= (uint64_t)m[i + 0] ;
+ case 0:
+ default:;
+ };
+ v3 ^= last7;
sipcompress()
sipcompress()
- v0 ^= mi;
+ v0 ^= last7;
v2 ^= 0xff;
sipcompress()
sipcompress()
View
@@ -12,23 +12,19 @@ static const packedelem64 siphash_final = {
uint64_t
siphash(unsigned char key[16], const unsigned char *m, size_t len) {
- xmmi k;
- xmmi v02,v20,v13,v11,v33;
- xmmi mi;
- packedelem64 res;
- unsigned char buf[8];
- size_t i;
+ xmmi k,v02,v20,v13,v11,v33,mi;
+ uint64_t last7;
+ uint32_t lo, hi;
+ size_t i, blocks;
k = _mm_loadu_si128((xmmi *)(key + 0));
v02 = siphash_init[0].v;
v13 = siphash_init[1].v;
v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k));
v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k));
- buf[7] = (unsigned char)len;
- if (len < 8) goto sip7bytesorless;
+ last7 = (uint64_t)(len & 0xff) << 56;
-/*
#define sipcompress() \
v11 = v13; \
v33 = _mm_shuffle_epi32(v13, _MM_SHUFFLE(1,0,3,2)); \
@@ -46,37 +42,29 @@ siphash(unsigned char key[16], const unsigned char *m, size_t len) {
v13 = _mm_unpacklo_epi64(v11, v33); \
v13 = _mm_unpacklo_epi64(v11, v33); \
v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2)); \
- v13 = _mm_xor_si128(v13, v20);
-*/
+ v13 = _mm_xor_si128(v13, v20);
-#define sipcompress() \
- v02 = _mm_add_epi64(v02, v13); \
- v11 = _mm_or_si128(_mm_slli_epi64(v13, 13), _mm_srli_epi64(v13, 64-13)); \
- v33 = _mm_or_si128(_mm_slli_epi64(v13, 16), _mm_srli_epi64(v13, 64-16)); \
- v13 = _mm_unpacklo_epi64(v11, _mm_shuffle_epi32(v33, _MM_SHUFFLE(1,0,3,2))); \
- v13 = _mm_xor_si128(v13, v02); \
- v20 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \
- v20 = _mm_add_epi64(v20, v13); \
- v11 = _mm_or_si128(_mm_slli_epi64(v13, 17), _mm_srli_epi64(v13, 64-17)); \
- v33 = _mm_or_si128(_mm_slli_epi64(v13, 21), _mm_srli_epi64(v13, 64-21)); \
- v13 = _mm_unpacklo_epi64(v11, _mm_shuffle_epi32(v33, _MM_SHUFFLE(1,0,3,2))); \
- v13 = _mm_xor_si128(v13, v20); \
- v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2));
+ for (i = 0, blocks = (len & ~7); i < blocks; i += 8) {
+ mi = _mm_loadl_epi64((xmmi *)(m + i));
+ v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
+ sipcompress()
+ sipcompress()
+ v02 = _mm_xor_si128(v02, mi);
+ }
-siploop:
- mi = _mm_loadl_epi64((xmmi *)(m + 0));
- v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
- sipcompress()
- sipcompress()
- v02 = _mm_xor_si128(v02, mi);
- m += 8;
- len -= 8;
- if (len >= 8) goto siploop;
+ switch (len - blocks) {
+ case 7: last7 |= (uint64_t)m[i + 6] << 48;
+ case 6: last7 |= (uint64_t)m[i + 5] << 40;
+ case 5: last7 |= (uint64_t)m[i + 4] << 32;
+ case 4: last7 |= (uint64_t)m[i + 3] << 24;
+ case 3: last7 |= (uint64_t)m[i + 2] << 16;
+ case 2: last7 |= (uint64_t)m[i + 1] << 8;
+ case 1: last7 |= (uint64_t)m[i + 0] ;
+ case 0:
+ default:;
+ };
-sip7bytesorless:
- for (i = 0; i < len; i++) buf[i] = m[i];
- for (; i < 7; i++) buf[i] = 0;
- mi = _mm_loadl_epi64((xmmi *)(buf + 0));
+ mi = _mm_unpacklo_epi32(_mm_cvtsi32_si128((uint32_t)last7),_mm_cvtsi32_si128((uint32_t)(last7 >> 32)));
v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
sipcompress()
sipcompress()
@@ -88,6 +76,8 @@ siphash(unsigned char key[16], const unsigned char *m, size_t len) {
sipcompress()
v02 = _mm_xor_si128(v02, v13);
- res.v = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2)));
- return res.u[0];
+ v02 = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2)));
+ lo = _mm_cvtsi128_si32(v02);
+ hi = _mm_cvtsi128_si32(_mm_srli_si128(v02, 4));
+ return ((uint64_t)hi << 32) | lo;
}
View
@@ -16,50 +16,59 @@ static const packedelem8 siphash_rot16v3 = {
uint64_t
siphash(unsigned char key[16], const unsigned char *m, size_t len) {
- xmmi k;
- xmmi v02,v20,v13,v11,v33;
- xmmi mi;
- packedelem64 res;
- unsigned char buf[8];
- size_t i;
+ xmmi k,v02,v20,v13,v11,v33,mi;
+ uint64_t last7;
+ uint32_t lo, hi;
+ size_t i, blocks;
k = _mm_loadu_si128((xmmi *)(key + 0));
v02 = siphash_init[0].v;
v13 = siphash_init[1].v;
v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k));
v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k));
- buf[7] = (unsigned char)len;
- if (len < 8) goto sip7bytesorless;
+ last7 = (uint64_t)(len & 0xff) << 56;
#define sipcompress() \
+ v11 = v13; \
+ v33 = v13; \
+ v11 = _mm_or_si128(_mm_slli_epi64(v11, 13), _mm_srli_epi64(v11, 64-13)); \
v02 = _mm_add_epi64(v02, v13); \
- v11 = _mm_or_si128(_mm_slli_epi64(v13, 13), _mm_srli_epi64(v13, 64-13)); \
- v33 = _mm_shuffle_epi8(v13, siphash_rot16v3.v); \
+ v33 = _mm_shuffle_epi8(v33, siphash_rot16v3.v); \
v13 = _mm_unpacklo_epi64(v11, v33); \
v13 = _mm_xor_si128(v13, v02); \
v20 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \
+ v11 = v13; \
+ v33 = _mm_shuffle_epi32(v13, _MM_SHUFFLE(1,0,3,2)); \
+ v11 = _mm_or_si128(_mm_slli_epi64(v11, 17), _mm_srli_epi64(v11, 64-17)); \
v20 = _mm_add_epi64(v20, v13); \
- v11 = _mm_or_si128(_mm_slli_epi64(v13, 17), _mm_srli_epi64(v13, 64-17)); \
- v33 = _mm_or_si128(_mm_slli_epi64(v13, 21), _mm_srli_epi64(v13, 64-21)); \
- v13 = _mm_unpacklo_epi64(v11, _mm_shuffle_epi32(v33, _MM_SHUFFLE(1,0,3,2))); \
- v13 = _mm_xor_si128(v13, v20); \
- v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2));
+ v33 = _mm_or_si128(_mm_slli_epi64(v33, 21), _mm_srli_epi64(v33, 64-21)); \
+ v13 = _mm_unpacklo_epi64(v11, v33); \
+ v13 = _mm_unpacklo_epi64(v11, v33); \
+ v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2)); \
+ v13 = _mm_xor_si128(v13, v20);
-siploop:
- mi = _mm_loadl_epi64((xmmi *)(m + 0));
- v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
- sipcompress()
- sipcompress()
- v02 = _mm_xor_si128(v02, mi);
- m += 8;
- len -= 8;
- if (len >= 8) goto siploop;
+ for (i = 0, blocks = (len & ~7); i < blocks; i += 8) {
+ mi = _mm_loadl_epi64((xmmi *)(m + i));
+ v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
+ sipcompress()
+ sipcompress()
+ v02 = _mm_xor_si128(v02, mi);
+ }
+
+ switch (len - blocks) {
+ case 7: last7 |= (uint64_t)m[i + 6] << 48;
+ case 6: last7 |= (uint64_t)m[i + 5] << 40;
+ case 5: last7 |= (uint64_t)m[i + 4] << 32;
+ case 4: last7 |= (uint64_t)m[i + 3] << 24;
+ case 3: last7 |= (uint64_t)m[i + 2] << 16;
+ case 2: last7 |= (uint64_t)m[i + 1] << 8;
+ case 1: last7 |= (uint64_t)m[i + 0] ;
+ case 0:
+ default:;
+ };
-sip7bytesorless:
- for (i = 0; i < len; i++) buf[i] = m[i];
- for (; i < 7; i++) buf[i] = 0;
- mi = _mm_loadl_epi64((xmmi *)(buf + 0));
+ mi = _mm_unpacklo_epi32(_mm_cvtsi32_si128((uint32_t)last7),_mm_cvtsi32_si128((uint32_t)(last7 >> 32)));
v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
sipcompress()
sipcompress()
@@ -71,6 +80,8 @@ siphash(unsigned char key[16], const unsigned char *m, size_t len) {
sipcompress()
v02 = _mm_xor_si128(v02, v13);
- res.v = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2)));
- return res.u[0];
+ v02 = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2)));
+ lo = _mm_cvtsi128_si32(v02);
+ hi = _mm_cvtsi128_si32(_mm_srli_si128(v02, 4));
+ return ((uint64_t)hi << 32) | lo;
}

0 comments on commit 801e284

Please sign in to comment.