Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

initial commit

  • Loading branch information...
commit d96a2fb87626ce09bba173ecef6cabec9f1ab3ff 0 parents
Andrew M. authored June 21, 2012
36  Makefile
... ...
@@ -0,0 +1,36 @@
  1
# Builds the SipHash self-test/benchmark binaries (test_<compiler><bits>_<variant>)
# with gcc or icc, for 32- and 64-bit targets, in plain C, SSE2 and SSSE3 variants.
#
# Every target here is a command name, not the file its recipe produces, so all
# of them are phony: they must run even if a file with the same name exists.
.PHONY: default deafult clean gcc64 icc64 gcc32 icc32
.PHONY: gcc64_siphash gcc64_siphash_sse2 gcc64_siphash_ssse3
.PHONY: icc64_siphash icc64_siphash_sse2 icc64_siphash_ssse3
.PHONY: gcc32_siphash gcc32_siphash_sse2 gcc32_siphash_ssse3
.PHONY: icc32_siphash icc32_siphash_sse2 icc32_siphash_ssse3

default: gcc64
# Original (misspelled) name of the default target, kept as an alias.
deafult: default

gcc64: gcc64_siphash gcc64_siphash_sse2 gcc64_siphash_ssse3
icc64: icc64_siphash icc64_siphash_sse2 icc64_siphash_ssse3
gcc32: gcc32_siphash gcc32_siphash_sse2 gcc32_siphash_ssse3
icc32: icc32_siphash icc32_siphash_sse2 icc32_siphash_ssse3

gcc64_siphash:
	gcc siphash.c test.c -m64 -O3 -o test_gcc64_siphash -Wall
gcc64_siphash_sse2:
	gcc siphash_sse2.c test.c -m64 -msse2 -O3 -o test_gcc64_siphash_sse2 -Wall
gcc64_siphash_ssse3:
	gcc siphash_ssse3.c test.c -m64 -mssse3 -O3 -o test_gcc64_siphash_ssse3 -Wall

icc64_siphash:
	icc siphash.c test.c -m64 -O3 -o test_icc64_siphash -Wall
icc64_siphash_sse2:
	icc siphash_sse2.c test.c -m64 -msse2 -O3 -o test_icc64_siphash_sse2 -Wall
icc64_siphash_ssse3:
	icc siphash_ssse3.c test.c -m64 -mssse3 -O3 -o test_icc64_siphash_ssse3 -Wall

gcc32_siphash:
	gcc siphash.c test.c -m32 -O3 -o test_gcc32_siphash -Wall
gcc32_siphash_sse2:
	gcc siphash_sse2.c test.c -m32 -msse2 -O3 -o test_gcc32_siphash_sse2 -Wall
gcc32_siphash_ssse3:
	gcc siphash_ssse3.c test.c -m32 -mssse3 -O3 -o test_gcc32_siphash_ssse3 -Wall

icc32_siphash:
	icc siphash.c test.c -m32 -O3 -o test_icc32_siphash -Wall
icc32_siphash_sse2:
	icc siphash_sse2.c test.c -m32 -msse2 -O3 -o test_icc32_siphash_sse2 -Wall
icc32_siphash_ssse3:
	icc siphash_ssse3.c test.c -m32 -mssse3 -O3 -o test_icc32_siphash_ssse3 -Wall

# Removes every built test binary.
clean:
	rm -f test_*
4  README.md
Source Rendered
... ...
@@ -0,0 +1,4 @@
  1
+Basic, SSE2, and SSSE3 implementations of [SipHash-2-4](http://131002.net/siphash/), based on the paper. On an E5200, icc gives
  2
+the best speeds for all SSE versions and 32-bit versions; gcc gives the best speed for the 64-bit basic version.
  3
+
  4
+Done to see how difficult implementing the spec was, as their source is not up (yet), and out of curiosity about 32-bit performance.
67  siphash.c
... ...
@@ -0,0 +1,67 @@
  1
+#include "siphash_impl.h"
  2
+
  3
+static uint64_t INLINE
  4
+U8TO64_LE(const unsigned char *p) {
  5
+	return *(const uint64_t *)p;
  6
+}
  7
+
  8
+/*
  9
+static void INLINE
  10
+U64TO8_LE(unsigned char *p, const uint64_t v) {
  11
+	*(uint64_t *)p = v;
  12
+}
  13
+*/
  14
+
  15
/* One SipRound: mixes the four 64-bit state words v0..v3 in place. */
#define SIPROUND() \
	do { \
		v0 += v1; v2 += v3; \
		v1 = ROTL64(v1,13); v3 = ROTL64(v3,16); \
		v1 ^= v0; v3 ^= v2; \
		v0 = ROTL64(v0,32); \
		v2 += v1; v0 += v3; \
		v1 = ROTL64(v1,17); v3 = ROTL64(v3,21); \
		v1 ^= v2; v3 ^= v0; \
		v2 = ROTL64(v2,32); \
	} while (0)

/*
 * Keyed 64-bit hash of the len bytes at m (scalar implementation).
 *
 * key: 16 bytes, read as two 64-bit words k0 (bytes 0..7) and k1 (bytes 8..15).
 * m:   message bytes; consumed 8 at a time, two rounds per 8-byte word.
 * len: message length in bytes.
 *
 * The final word is the remaining 0..7 message bytes, zero padded, with the
 * low 8 bits of the total length in its last byte. After it is absorbed,
 * v2 is XORed with 0xff and four more rounds are run. The result is the XOR
 * of the four state words.
 */
uint64_t
siphash(unsigned char key[16], const unsigned char *m, size_t len) {
	const uint64_t k0 = U8TO64_LE(key + 0);
	const uint64_t k1 = U8TO64_LE(key + 8);
	uint64_t v0 = k0 ^ 0x736f6d6570736575ull;
	uint64_t v1 = k1 ^ 0x646f72616e646f6dull;
	uint64_t v2 = k0 ^ 0x6c7967656e657261ull;
	uint64_t v3 = k1 ^ 0x7465646279746573ull;
	uint64_t word;
	unsigned char tail[8];
	size_t n;

	/* Must be captured now: len is decremented while whole words are consumed. */
	tail[7] = (unsigned char)len;

	/* Absorb every complete 8-byte word. */
	while (len >= 8) {
		word = U8TO64_LE(m);
		v3 ^= word;
		SIPROUND();
		SIPROUND();
		v0 ^= word;
		m += 8;
		len -= 8;
	}

	/* Build the last word: leftover bytes, then zero padding up to byte 6. */
	for (n = 0; n < 7; n++)
		tail[n] = (n < len) ? m[n] : 0;

	word = U8TO64_LE(tail);
	v3 ^= word;
	SIPROUND();
	SIPROUND();
	v0 ^= word;

	/* Finalization. */
	v2 ^= 0xff;
	SIPROUND();
	SIPROUND();
	SIPROUND();
	SIPROUND();

	return v0 ^ v1 ^ v2 ^ v3;
}
  67
+
8  siphash.h
... ...
@@ -0,0 +1,8 @@
  1
+#ifndef SIPHASH_H
  2
+#define SIPHASH_H
  3
+
  4
+#include "siphash_impl.h"
  5
+
  6
+uint64_t siphash(unsigned char key[16], const unsigned char *m, size_t len);
  7
+
  8
+#endif // SIPHASH_H
45  siphash_impl.h
... ...
@@ -0,0 +1,45 @@
  1
#ifndef SIPHASH_IMPL_H
#define SIPHASH_IMPL_H

/*
 * Compiler portability layer shared by all siphash implementations:
 *   INLINE / NOINLINE  inlining control attributes
 *   ROTL64(a,b)        rotate the 64-bit value a left by b bits (0 < b < 64)
 *   MM16               16-byte alignment attribute
 *   uint32_t/uint64_t  fixed-width integer types
 */
#if defined(_MSC_VER)
	#include <intrin.h>
	#include <stddef.h> /* size_t, used by the siphash() signature */

	#define INLINE __forceinline
	#define NOINLINE __declspec(noinline)
	#define ROTL64(a,b) _rotl64(a,b)
	#define MM16 __declspec(align(16))

	typedef unsigned int uint32_t;
	typedef unsigned __int64 uint64_t;
#else
	#include <stdint.h>
	#include <stdlib.h>

	#define INLINE __attribute__((always_inline))
	#define NOINLINE __attribute__((noinline))
	/* b is fully parenthesized so expression arguments (e.g. x+y) yield the right shift counts. */
	#define ROTL64(a,b) (((a)<<(b))|((a)>>(64-(b))))
	#define MM16 __attribute__((aligned(16)))
#endif

/* SSE2: 128-bit integer vector type plus unions for building vector constants. */
#if defined(__SSE2__)
	#include <emmintrin.h>
	typedef __m128i xmmi;
	typedef __m64 qmm;

	/* A 128-bit vector viewed as two 64-bit words. */
	typedef union packedelem64_t {
		uint64_t u[2];
		xmmi v;
	} packedelem64;

	/* A 128-bit vector viewed as sixteen bytes. */
	typedef union packedelem8_t {
		unsigned char u[16];
		xmmi v;
	} packedelem8;
#endif

/* SSSE3: brings in the byte-shuffle intrinsic used by siphash_ssse3.c. */
#if defined(__SSSE3__)
	#include <tmmintrin.h>
#endif

#endif /* SIPHASH_IMPL_H */
  45
+
94  siphash_sse2.c
... ...
@@ -0,0 +1,94 @@
  1
+#include "siphash_impl.h"
  2
+
  3
+/* 0,2,1,3 */
  4
+static const packedelem64 siphash_init[2] = {
  5
+	{{0x736f6d6570736575ull,0x6c7967656e657261ull}},
  6
+	{{0x646f72616e646f6dull,0x7465646279746573ull}}
  7
+};
  8
+
  9
+static const packedelem64 siphash_final = {
  10
+	{0x0000000000000000ull,0x00000000000000ffull}
  11
+};
  12
+
  13
+uint64_t
  14
+siphash(unsigned char key[16], const unsigned char *m, size_t len) {
  15
+	xmmi k;
  16
+	xmmi v02,v20,v13,v11,v33;
  17
+	xmmi mi;
  18
+	uint64_t MM16 res[2];
  19
+	unsigned char buf[8];
  20
+	size_t i;
  21
+
  22
+	k = _mm_loadu_si128((xmmi *)(key + 0));
  23
+	v02 = siphash_init[0].v;
  24
+	v13 = siphash_init[1].v;
  25
+	v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k));
  26
+	v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k));
  27
+
  28
+	buf[7] = (unsigned char)len;
  29
+	if (len < 8) goto sip7bytesorless;
  30
+
  31
+/*
  32
+#define sipcompress() \
  33
+	v11 = v13; \
  34
+	v33 = _mm_shuffle_epi32(v13, _MM_SHUFFLE(1,0,3,2)); \
  35
+	v11 = _mm_or_si128(_mm_slli_epi64(v11, 13), _mm_srli_epi64(v11, 64-13)); \
  36
+	v02 = _mm_add_epi64(v02, v13); \
  37
+	v33 = _mm_or_si128(_mm_slli_epi64(v33, 16), _mm_srli_epi64(v33, 64-16)); \
  38
+	v13 = _mm_unpacklo_epi64(v11, v33); \
  39
+	v13 = _mm_xor_si128(v13, v02); \
  40
+	v20 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \
  41
+	v11 = v13; \
  42
+	v33 = _mm_shuffle_epi32(v13, _MM_SHUFFLE(1,0,3,2)); \
  43
+	v11 = _mm_or_si128(_mm_slli_epi64(v11, 17), _mm_srli_epi64(v11, 64-17)); \
  44
+	v20 = _mm_add_epi64(v20, v13); \
  45
+	v33 = _mm_or_si128(_mm_slli_epi64(v33, 21), _mm_srli_epi64(v33, 64-21)); \
  46
+	v13 = _mm_unpacklo_epi64(v11, v33); \
  47
+	v13 = _mm_unpacklo_epi64(v11, v33); \
  48
+	v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2)); \
  49
+	v13 = _mm_xor_si128(v13, v20);	
  50
+*/
  51
+
  52
+#define sipcompress() \
  53
+	v02 = _mm_add_epi64(v02, v13); \
  54
+	v11 = _mm_or_si128(_mm_slli_epi64(v13, 13), _mm_srli_epi64(v13, 64-13)); \
  55
+	v33 = _mm_or_si128(_mm_slli_epi64(v13, 16), _mm_srli_epi64(v13, 64-16)); \
  56
+	v13 = _mm_unpacklo_epi64(v11, _mm_shuffle_epi32(v33, _MM_SHUFFLE(1,0,3,2))); \
  57
+	v13 = _mm_xor_si128(v13, v02); \
  58
+	v20 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \
  59
+	v20 = _mm_add_epi64(v20, v13); \
  60
+	v11 = _mm_or_si128(_mm_slli_epi64(v13, 17), _mm_srli_epi64(v13, 64-17)); \
  61
+	v33 = _mm_or_si128(_mm_slli_epi64(v13, 21), _mm_srli_epi64(v13, 64-21)); \
  62
+	v13 = _mm_unpacklo_epi64(v11, _mm_shuffle_epi32(v33, _MM_SHUFFLE(1,0,3,2))); \
  63
+	v13 = _mm_xor_si128(v13, v20); \
  64
+	v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2));
  65
+
  66
+siploop:
  67
+	mi = _mm_loadl_epi64((xmmi *)(m + 0));
  68
+	v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
  69
+	sipcompress()
  70
+	sipcompress()
  71
+	v02 = _mm_xor_si128(v02, mi);
  72
+	m += 8;
  73
+	len -= 8;
  74
+	if (len >= 8) goto siploop;
  75
+
  76
+sip7bytesorless:
  77
+	for (i = 0; i < len; i++) buf[i] = m[i];
  78
+	for (; i < 7; i++) buf[i] = 0;
  79
+	mi = _mm_loadl_epi64((xmmi *)(buf + 0));
  80
+	v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
  81
+	sipcompress()
  82
+	sipcompress()
  83
+	v02 = _mm_xor_si128(v02, mi);
  84
+	v02 = _mm_xor_si128(v02, siphash_final.v);
  85
+	sipcompress()
  86
+	sipcompress()
  87
+	sipcompress()
  88
+	sipcompress()
  89
+
  90
+	v02 = _mm_xor_si128(v02, v13);
  91
+	v02 = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2)));
  92
+	_mm_store_si128((xmmi *)res, v02);
  93
+	return res[0];
  94
+}
77  siphash_ssse3.c
... ...
@@ -0,0 +1,77 @@
  1
+#include "siphash_impl.h"
  2
+
  3
+/* 0,2,1,3 */
  4
+static const packedelem64 siphash_init[2] = {
  5
+	{{0x736f6d6570736575ull,0x6c7967656e657261ull}},
  6
+	{{0x646f72616e646f6dull,0x7465646279746573ull}}
  7
+};
  8
+
  9
+static const packedelem64 siphash_final = {
  10
+	{0x0000000000000000ull,0x00000000000000ffull}
  11
+};
  12
+
  13
+static const packedelem8 siphash_rot16v3 = {
  14
+	{14,15,8,9,10,11,12,13,8,9,10,11,12,13,14,15}
  15
+};
  16
+
  17
+uint64_t
  18
+siphash(unsigned char key[16], const unsigned char *m, size_t len) {
  19
+	xmmi k;
  20
+	xmmi v02,v20,v13,v11,v33;
  21
+	xmmi mi;
  22
+	uint64_t MM16 res[2];
  23
+	unsigned char buf[8];
  24
+	size_t i;
  25
+
  26
+	k = _mm_loadu_si128((xmmi *)(key + 0));
  27
+	v02 = siphash_init[0].v;
  28
+	v13 = siphash_init[1].v;
  29
+	v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k));
  30
+	v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k));
  31
+
  32
+	buf[7] = (unsigned char)len;
  33
+	if (len < 8) goto sip7bytesorless;
  34
+
  35
+#define sipcompress() \
  36
+	v02 = _mm_add_epi64(v02, v13); \
  37
+	v11 = _mm_or_si128(_mm_slli_epi64(v13, 13), _mm_srli_epi64(v13, 64-13)); \
  38
+	v33 = _mm_shuffle_epi8(v13, siphash_rot16v3.v); \
  39
+	v13 = _mm_unpacklo_epi64(v11, v33); \
  40
+	v13 = _mm_xor_si128(v13, v02); \
  41
+	v20 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \
  42
+	v20 = _mm_add_epi64(v20, v13); \
  43
+	v11 = _mm_or_si128(_mm_slli_epi64(v13, 17), _mm_srli_epi64(v13, 64-17)); \
  44
+	v33 = _mm_or_si128(_mm_slli_epi64(v13, 21), _mm_srli_epi64(v13, 64-21)); \
  45
+	v13 = _mm_unpacklo_epi64(v11, _mm_shuffle_epi32(v33, _MM_SHUFFLE(1,0,3,2))); \
  46
+	v13 = _mm_xor_si128(v13, v20); \
  47
+	v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2));
  48
+
  49
+siploop:
  50
+	mi = _mm_loadl_epi64((xmmi *)(m + 0));
  51
+	v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
  52
+	sipcompress()
  53
+	sipcompress()
  54
+	v02 = _mm_xor_si128(v02, mi);
  55
+	m += 8;
  56
+	len -= 8;
  57
+	if (len >= 8) goto siploop;
  58
+
  59
+sip7bytesorless:
  60
+	for (i = 0; i < len; i++) buf[i] = m[i];
  61
+	for (; i < 7; i++) buf[i] = 0;
  62
+	mi = _mm_loadl_epi64((xmmi *)(buf + 0));
  63
+	v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
  64
+	sipcompress()
  65
+	sipcompress()
  66
+	v02 = _mm_xor_si128(v02, mi);
  67
+	v02 = _mm_xor_si128(v02, siphash_final.v);
  68
+	sipcompress()
  69
+	sipcompress()
  70
+	sipcompress()
  71
+	sipcompress()
  72
+
  73
+	v02 = _mm_xor_si128(v02, v13);
  74
+	v02 = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2)));
  75
+	_mm_store_si128((xmmi *)res, v02);
  76
+	return res[0];
  77
+}
52  test.c
... ...
@@ -0,0 +1,52 @@
  1
+#include <stdio.h>
  2
+#include "siphash.h"
  3
+
  4
+#if defined(_MSC_VER)
  5
+	static uint64_t INLINE
  6
+	get_ticks(void) {
  7
+		return __rdtsc();
  8
+	}
  9
+#else
  10
+	static uint64_t INLINE
  11
+	get_ticks(void) {
  12
+		uint32_t lo, hi;
  13
+		__asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
  14
+		return ((uint64_t)lo | ((uint64_t)hi << 32));
  15
+	}
  16
+#endif
  17
+
  18
/*
 * Self-test and benchmark driver.
 *
 * First checks siphash() against one known answer (16-byte key 00..0f,
 * 15-byte message 00..0e). On mismatch it prints "fail" and exits with a
 * non-zero status so scripts can detect the failure.
 *
 * Then, for every message length 1..1023, it times inner_reps back-to-back
 * calls, repeats that measurement outer_reps times, keeps the fastest
 * repetition, and prints cycles per byte and cycles per call.
 */
int main(void) {
	static unsigned char key[16] = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f};
	static unsigned char msg[15] = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e};
	static unsigned char buf[1024] = {0xff};
	static const size_t outer_reps = 80, inner_reps = 128;
	size_t i, j, len;
	uint64_t t, sum, tempsum;
	double cycles;

	uint64_t res = siphash(key, msg, 15);
	if (res != 0xa129ca6149be45e5ull) {
		printf("fail\n");
		return 1;
	}

	/*
	 * Untimed run before measuring (presumably a warm-up). Each result is
	 * folded back into the key, so every call depends on the previous one.
	 */
	for (i = 0; i < 500000; i++)
		key[0] += (unsigned char)siphash(key, buf, 1024);

	for (len = 1; len < 1024; len++) {
		sum = 1000000000000000ull;
		for (j = 0; j < outer_reps; j++) {
			/*
			 * Restart the timer for each repetition. If it were started
			 * once before this loop, the elapsed time would only grow
			 * and the minimum would always be the first repetition.
			 */
			t = get_ticks();
			for (i = 0; i < inner_reps; i++)
				key[0] += (unsigned char)siphash(key, buf, len);
			tempsum = (get_ticks() - t);
			if (tempsum < sum)
				sum = tempsum;
		}

		cycles = (double)sum / (double)inner_reps;
		printf("%u bytes, %.4f cycles/byte, %.4f cycles\n", (unsigned int)len, cycles / len, cycles);
	}

	return 0;
}

0 notes on commit d96a2fb

Please sign in to comment.
Something went wrong with that request. Please try again.