Skip to content

Commit

Permalink
borrow finalization from dchest's go version, re-work main loops, use slightly re-worked compress for sse
Browse files Browse the repository at this point in the history
  • Loading branch information
floodyberry committed Jun 24, 2012
1 parent 9b27997 commit 801e284
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 88 deletions.
42 changes: 23 additions & 19 deletions siphash.c
Expand Up @@ -16,8 +16,8 @@ uint64_t
siphash(unsigned char key[16], const unsigned char *m, size_t len) { siphash(unsigned char key[16], const unsigned char *m, size_t len) {
uint64_t v0, v1, v2, v3; uint64_t v0, v1, v2, v3;
uint64_t mi, k0, k1; uint64_t mi, k0, k1;
unsigned char buf[8]; uint64_t last7;
size_t i; size_t i, blocks;


k0 = U8TO64_LE(key + 0); k0 = U8TO64_LE(key + 0);
k1 = U8TO64_LE(key + 8); k1 = U8TO64_LE(key + 8);
Expand All @@ -26,8 +26,7 @@ siphash(unsigned char key[16], const unsigned char *m, size_t len) {
v2 = k0 ^ 0x6c7967656e657261ull; v2 = k0 ^ 0x6c7967656e657261ull;
v3 = k1 ^ 0x7465646279746573ull; v3 = k1 ^ 0x7465646279746573ull;


buf[7] = (unsigned char)len; last7 = (uint64_t)(len & 0xff) << 56;
if (len < 8) goto sip7bytesorless;


#define sipcompress() \ #define sipcompress() \
v0 += v1; v2 += v3; \ v0 += v1; v2 += v3; \
Expand All @@ -39,24 +38,29 @@ siphash(unsigned char key[16], const unsigned char *m, size_t len) {
v1 ^= v2; v3 ^= v0; \ v1 ^= v2; v3 ^= v0; \
v2 = ROTL64(v2,32); v2 = ROTL64(v2,32);


siploop: for (i = 0, blocks = (len & ~7); i < blocks; i += 8) {
mi = U8TO64_LE(m); mi = U8TO64_LE(m + i);
v3 ^= mi; v3 ^= mi;
sipcompress() sipcompress()
sipcompress() sipcompress()
v0 ^= mi; v0 ^= mi;
m += 8; }
len -= 8;
if (len >= 8) goto siploop;


sip7bytesorless: switch (len - blocks) {
for (i = 0; i < len; i++) buf[i] = m[i]; case 7: last7 |= (uint64_t)m[i + 6] << 48;
for (; i < 7; i++) buf[i] = 0; case 6: last7 |= (uint64_t)m[i + 5] << 40;
mi = U8TO64_LE(buf); case 5: last7 |= (uint64_t)m[i + 4] << 32;
v3 ^= mi; case 4: last7 |= (uint64_t)m[i + 3] << 24;
case 3: last7 |= (uint64_t)m[i + 2] << 16;
case 2: last7 |= (uint64_t)m[i + 1] << 8;
case 1: last7 |= (uint64_t)m[i + 0] ;
case 0:
default:;
};
v3 ^= last7;
sipcompress() sipcompress()
sipcompress() sipcompress()
v0 ^= mi; v0 ^= last7;
v2 ^= 0xff; v2 ^= 0xff;
sipcompress() sipcompress()
sipcompress() sipcompress()
Expand Down
68 changes: 29 additions & 39 deletions siphash_sse2.c
Expand Up @@ -12,23 +12,19 @@ static const packedelem64 siphash_final = {


uint64_t uint64_t
siphash(unsigned char key[16], const unsigned char *m, size_t len) { siphash(unsigned char key[16], const unsigned char *m, size_t len) {
xmmi k; xmmi k,v02,v20,v13,v11,v33,mi;
xmmi v02,v20,v13,v11,v33; uint64_t last7;
xmmi mi; uint32_t lo, hi;
packedelem64 res; size_t i, blocks;
unsigned char buf[8];
size_t i;


k = _mm_loadu_si128((xmmi *)(key + 0)); k = _mm_loadu_si128((xmmi *)(key + 0));
v02 = siphash_init[0].v; v02 = siphash_init[0].v;
v13 = siphash_init[1].v; v13 = siphash_init[1].v;
v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k)); v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k));
v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k)); v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k));


buf[7] = (unsigned char)len; last7 = (uint64_t)(len & 0xff) << 56;
if (len < 8) goto sip7bytesorless;


/*
#define sipcompress() \ #define sipcompress() \
v11 = v13; \ v11 = v13; \
v33 = _mm_shuffle_epi32(v13, _MM_SHUFFLE(1,0,3,2)); \ v33 = _mm_shuffle_epi32(v13, _MM_SHUFFLE(1,0,3,2)); \
Expand All @@ -46,37 +42,29 @@ siphash(unsigned char key[16], const unsigned char *m, size_t len) {
v13 = _mm_unpacklo_epi64(v11, v33); \ v13 = _mm_unpacklo_epi64(v11, v33); \
v13 = _mm_unpacklo_epi64(v11, v33); \ v13 = _mm_unpacklo_epi64(v11, v33); \
v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2)); \ v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2)); \
v13 = _mm_xor_si128(v13, v20); v13 = _mm_xor_si128(v13, v20);
*/


#define sipcompress() \ for (i = 0, blocks = (len & ~7); i < blocks; i += 8) {
v02 = _mm_add_epi64(v02, v13); \ mi = _mm_loadl_epi64((xmmi *)(m + i));
v11 = _mm_or_si128(_mm_slli_epi64(v13, 13), _mm_srli_epi64(v13, 64-13)); \ v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
v33 = _mm_or_si128(_mm_slli_epi64(v13, 16), _mm_srli_epi64(v13, 64-16)); \ sipcompress()
v13 = _mm_unpacklo_epi64(v11, _mm_shuffle_epi32(v33, _MM_SHUFFLE(1,0,3,2))); \ sipcompress()
v13 = _mm_xor_si128(v13, v02); \ v02 = _mm_xor_si128(v02, mi);
v20 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \ }
v20 = _mm_add_epi64(v20, v13); \
v11 = _mm_or_si128(_mm_slli_epi64(v13, 17), _mm_srli_epi64(v13, 64-17)); \
v33 = _mm_or_si128(_mm_slli_epi64(v13, 21), _mm_srli_epi64(v13, 64-21)); \
v13 = _mm_unpacklo_epi64(v11, _mm_shuffle_epi32(v33, _MM_SHUFFLE(1,0,3,2))); \
v13 = _mm_xor_si128(v13, v20); \
v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2));


siploop: switch (len - blocks) {
mi = _mm_loadl_epi64((xmmi *)(m + 0)); case 7: last7 |= (uint64_t)m[i + 6] << 48;
v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8)); case 6: last7 |= (uint64_t)m[i + 5] << 40;
sipcompress() case 5: last7 |= (uint64_t)m[i + 4] << 32;
sipcompress() case 4: last7 |= (uint64_t)m[i + 3] << 24;
v02 = _mm_xor_si128(v02, mi); case 3: last7 |= (uint64_t)m[i + 2] << 16;
m += 8; case 2: last7 |= (uint64_t)m[i + 1] << 8;
len -= 8; case 1: last7 |= (uint64_t)m[i + 0] ;
if (len >= 8) goto siploop; case 0:
default:;
};


sip7bytesorless: mi = _mm_unpacklo_epi32(_mm_cvtsi32_si128((uint32_t)last7),_mm_cvtsi32_si128((uint32_t)(last7 >> 32)));
for (i = 0; i < len; i++) buf[i] = m[i];
for (; i < 7; i++) buf[i] = 0;
mi = _mm_loadl_epi64((xmmi *)(buf + 0));
v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8)); v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
sipcompress() sipcompress()
sipcompress() sipcompress()
Expand All @@ -88,6 +76,8 @@ siphash(unsigned char key[16], const unsigned char *m, size_t len) {
sipcompress() sipcompress()


v02 = _mm_xor_si128(v02, v13); v02 = _mm_xor_si128(v02, v13);
res.v = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2))); v02 = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2)));
return res.u[0]; lo = _mm_cvtsi128_si32(v02);
hi = _mm_cvtsi128_si32(_mm_srli_si128(v02, 4));
return ((uint64_t)hi << 32) | lo;
} }
71 changes: 41 additions & 30 deletions siphash_ssse3.c
Expand Up @@ -16,50 +16,59 @@ static const packedelem8 siphash_rot16v3 = {


uint64_t uint64_t
siphash(unsigned char key[16], const unsigned char *m, size_t len) { siphash(unsigned char key[16], const unsigned char *m, size_t len) {
xmmi k; xmmi k,v02,v20,v13,v11,v33,mi;
xmmi v02,v20,v13,v11,v33; uint64_t last7;
xmmi mi; uint32_t lo, hi;
packedelem64 res; size_t i, blocks;
unsigned char buf[8];
size_t i;


k = _mm_loadu_si128((xmmi *)(key + 0)); k = _mm_loadu_si128((xmmi *)(key + 0));
v02 = siphash_init[0].v; v02 = siphash_init[0].v;
v13 = siphash_init[1].v; v13 = siphash_init[1].v;
v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k)); v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k));
v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k)); v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k));


buf[7] = (unsigned char)len; last7 = (uint64_t)(len & 0xff) << 56;
if (len < 8) goto sip7bytesorless;


#define sipcompress() \ #define sipcompress() \
v11 = v13; \
v33 = v13; \
v11 = _mm_or_si128(_mm_slli_epi64(v11, 13), _mm_srli_epi64(v11, 64-13)); \
v02 = _mm_add_epi64(v02, v13); \ v02 = _mm_add_epi64(v02, v13); \
v11 = _mm_or_si128(_mm_slli_epi64(v13, 13), _mm_srli_epi64(v13, 64-13)); \ v33 = _mm_shuffle_epi8(v33, siphash_rot16v3.v); \
v33 = _mm_shuffle_epi8(v13, siphash_rot16v3.v); \
v13 = _mm_unpacklo_epi64(v11, v33); \ v13 = _mm_unpacklo_epi64(v11, v33); \
v13 = _mm_xor_si128(v13, v02); \ v13 = _mm_xor_si128(v13, v02); \
v20 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \ v20 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \
v11 = v13; \
v33 = _mm_shuffle_epi32(v13, _MM_SHUFFLE(1,0,3,2)); \
v11 = _mm_or_si128(_mm_slli_epi64(v11, 17), _mm_srli_epi64(v11, 64-17)); \
v20 = _mm_add_epi64(v20, v13); \ v20 = _mm_add_epi64(v20, v13); \
v11 = _mm_or_si128(_mm_slli_epi64(v13, 17), _mm_srli_epi64(v13, 64-17)); \ v33 = _mm_or_si128(_mm_slli_epi64(v33, 21), _mm_srli_epi64(v33, 64-21)); \
v33 = _mm_or_si128(_mm_slli_epi64(v13, 21), _mm_srli_epi64(v13, 64-21)); \ v13 = _mm_unpacklo_epi64(v11, v33); \
v13 = _mm_unpacklo_epi64(v11, _mm_shuffle_epi32(v33, _MM_SHUFFLE(1,0,3,2))); \ v13 = _mm_unpacklo_epi64(v11, v33); \
v13 = _mm_xor_si128(v13, v20); \ v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2)); \
v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2)); v13 = _mm_xor_si128(v13, v20);


siploop: for (i = 0, blocks = (len & ~7); i < blocks; i += 8) {
mi = _mm_loadl_epi64((xmmi *)(m + 0)); mi = _mm_loadl_epi64((xmmi *)(m + i));
v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8)); v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
sipcompress() sipcompress()
sipcompress() sipcompress()
v02 = _mm_xor_si128(v02, mi); v02 = _mm_xor_si128(v02, mi);
m += 8; }
len -= 8;
if (len >= 8) goto siploop; switch (len - blocks) {
case 7: last7 |= (uint64_t)m[i + 6] << 48;
case 6: last7 |= (uint64_t)m[i + 5] << 40;
case 5: last7 |= (uint64_t)m[i + 4] << 32;
case 4: last7 |= (uint64_t)m[i + 3] << 24;
case 3: last7 |= (uint64_t)m[i + 2] << 16;
case 2: last7 |= (uint64_t)m[i + 1] << 8;
case 1: last7 |= (uint64_t)m[i + 0] ;
case 0:
default:;
};


sip7bytesorless: mi = _mm_unpacklo_epi32(_mm_cvtsi32_si128((uint32_t)last7),_mm_cvtsi32_si128((uint32_t)(last7 >> 32)));
for (i = 0; i < len; i++) buf[i] = m[i];
for (; i < 7; i++) buf[i] = 0;
mi = _mm_loadl_epi64((xmmi *)(buf + 0));
v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8)); v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
sipcompress() sipcompress()
sipcompress() sipcompress()
Expand All @@ -71,6 +80,8 @@ siphash(unsigned char key[16], const unsigned char *m, size_t len) {
sipcompress() sipcompress()


v02 = _mm_xor_si128(v02, v13); v02 = _mm_xor_si128(v02, v13);
res.v = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2))); v02 = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2)));
return res.u[0]; lo = _mm_cvtsi128_si32(v02);
hi = _mm_cvtsi128_si32(_mm_srli_si128(v02, 4));
return ((uint64_t)hi << 32) | lo;
} }

0 comments on commit 801e284

Please sign in to comment.