supercop-20091009

commit c3194be6ecfd771d4f9171338b7aa4bdd6e6e08c 1 parent 82ebbfe
Daniel J. Bernstein authored and committed
9 crypto_hash/blake32/ssse3/README
@@ -0,0 +1,9 @@
+BLAKE-32 ssse3 eBASH implementation
+authors: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
+ Peter Schwabe <peter@cryptojedi.org>
+ Samuel Neves <sneves@dei.uc.pt>
+
+This implementation assumes that no salt is used.
+
+Level of copyright protection: 0
+Level of patent protection: 0
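
For context, hash.c below exposes the eBASH crypto_hash() interface with a 32-byte digest (CRYPTO_BYTES in api.h). A minimal caller sketch, assuming crypto_hash resolves to the function defined in hash.c (SUPERCOP normally namespaces the symbol through its build glue); note that this implementation declares crypto_hash as returning void and takes the input length in bytes:

    #include <stdio.h>
    #include <string.h>

    #define CRYPTO_BYTES 32   /* from api.h */
    /* prototype as defined in hash.c below */
    extern void crypto_hash(unsigned char *out, const unsigned char *in,
                            unsigned long long inlen);

    int main(void) {
      const unsigned char msg[] = "abc";
      unsigned char digest[CRYPTO_BYTES];
      int i;
      crypto_hash(digest, msg, strlen((const char *)msg));  /* length in bytes */
      for (i = 0; i < CRYPTO_BYTES; ++i) printf("%02X", digest[i]);
      printf("\n");
      return 0;
    }
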
1  crypto_hash/blake32/ssse3/api.h
@@ -0,0 +1 @@
+#define CRYPTO_BYTES 32
351 crypto_hash/blake32/ssse3/hash.c
@@ -0,0 +1,351 @@
+#include <string.h>
+#include <stdio.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include "crypto_hash.h"
+
+typedef unsigned long long u64;
+typedef unsigned int u32;
+typedef unsigned char u8;
+
+#define U8TO32(p) \
+ (((u32)((p)[0]) << 24) | ((u32)((p)[1]) << 16) | \
+ ((u32)((p)[2]) << 8) | ((u32)((p)[3]) ))
+#define U32TO8(p, v) \
+ (p)[0] = (u8)((v) >> 24); (p)[1] = (u8)((v) >> 16); \
+ (p)[2] = (u8)((v) >> 8); (p)[3] = (u8)((v) );
+
+typedef struct {
+ u32 h[8], s[4], t[2];
+ int buflen, nullt;
+ u8 buf[64];
+} state;
+
+const u8 sigma[][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 },
+ {14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3 },
+ {11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4 },
+ { 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8 },
+ { 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13 },
+ { 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9 },
+ {12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11 },
+ {13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10 },
+ { 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5 },
+ {10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 },
+ {14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3 },
+ {11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4 },
+ { 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8 },
+ { 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13 },
+ { 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9 },
+ {12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11 },
+ {13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10 },
+ { 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5 },
+ {10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0 }};
+
+const u32 cst[16] = {
+ 0x243F6A88,0x85A308D3,0x13198A2E,0x03707344,
+ 0xA4093822,0x299F31D0,0x082EFA98,0xEC4E6C89,
+ 0x452821E6,0x38D01377,0xBE5466CF,0x34E90C6C,
+ 0xC0AC29B7,0xC97C50DD,0x3F84D5B5,0xB5470917};
+
+const u8 padding[] =
+ {0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+
+
+static int blake32_compress( state * state, const u8 * datablock ) {
+
+ __m128i row1,row2,row3,row4;
+ __m128i buf1,buf2;
+ static const u8 rot8[16] = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12};
+ static const u8 rot16[16] = {2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13};
+ __m128i r8 = _mm_load_si128((__m128i*)rot8);
+ __m128i r16 = _mm_load_si128((__m128i*)rot16);
+
+
+ u32 m[16];
+ int r;
+ u64 t;
+
+ static const int sig[][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+ };
+ static const u32 z[16] = {
+ 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344,
+ 0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89,
+ 0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C,
+ 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917
+ };
+
+ /* get message */
+ m[ 0] = U8TO32(datablock + 0);
+ m[ 1] = U8TO32(datablock + 4);
+ m[ 2] = U8TO32(datablock + 8);
+ m[ 3] = U8TO32(datablock +12);
+ m[ 4] = U8TO32(datablock +16);
+ m[ 5] = U8TO32(datablock +20);
+ m[ 6] = U8TO32(datablock +24);
+ m[ 7] = U8TO32(datablock +28);
+ m[ 8] = U8TO32(datablock +32);
+ m[ 9] = U8TO32(datablock +36);
+ m[10] = U8TO32(datablock +40);
+ m[11] = U8TO32(datablock +44);
+ m[12] = U8TO32(datablock +48);
+ m[13] = U8TO32(datablock +52);
+ m[14] = U8TO32(datablock +56);
+ m[15] = U8TO32(datablock +60);
+
+ row1 = _mm_set_epi32(state->h[ 3], state->h[ 2],
+ state->h[ 1], state->h[ 0]);
+ row2 = _mm_set_epi32(state->h[ 7], state->h[ 6],
+ state->h[ 5], state->h[ 4]);
+ row3 = _mm_set_epi32(0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88);
+
+ if (state->nullt)
+ row4 = _mm_set_epi32(0xEC4E6C89, 0x082EFA98, 0x299F31D0, 0xA4093822);
+ else
+ row4 = _mm_set_epi32(0xEC4E6C89^state->t[1], 0x082EFA98^state->t[1],
+ 0x299F31D0^state->t[0], 0xA4093822^state->t[0]);
+
+#define round(r) \
+ /* column step */ \
+ buf1 = _mm_set_epi32(m[sig[r][ 6]], \
+ m[sig[r][ 4]], \
+ m[sig[r][ 2]], \
+ m[sig[r][ 0]]); \
+ buf2 = _mm_set_epi32(z[sig[r][ 7]], \
+ z[sig[r][ 5]], \
+ z[sig[r][ 3]], \
+ z[sig[r][ 1]]); \
+ buf1 = _mm_xor_si128( buf1, buf2); \
+ row1 = _mm_add_epi32( _mm_add_epi32( row1, buf1), row2 ); \
+ buf1 = _mm_set_epi32(z[sig[r][ 6]], \
+ z[sig[r][ 4]], \
+ z[sig[r][ 2]], \
+ z[sig[r][ 0]]); \
+ buf2 = _mm_set_epi32(m[sig[r][ 7]], \
+ m[sig[r][ 5]], \
+ m[sig[r][ 3]], \
+ m[sig[r][ 1]]); \
+ row4 = _mm_xor_si128( row4, row1 ); \
+ row4 = _mm_shuffle_epi8(row4, r16); \
+ row3 = _mm_add_epi32( row3, row4 ); \
+ row2 = _mm_xor_si128( row2, row3 ); \
+ buf1 = _mm_xor_si128( buf1, buf2); \
+ row2 = _mm_xor_si128(_mm_srli_epi32( row2, 12 ),_mm_slli_epi32( row2, 20 )); \
+ row1 = _mm_add_epi32( _mm_add_epi32( row1, buf1), row2 ); \
+ row4 = _mm_xor_si128( row4, row1 ); \
+ row4 = _mm_shuffle_epi8(row4, r8); \
+ row3 = _mm_add_epi32( row3, row4 ); \
+ row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(2,1,0,3) ); \
+ row2 = _mm_xor_si128( row2, row3 ); \
+ row2 = _mm_xor_si128(_mm_srli_epi32( row2, 7 ),_mm_slli_epi32( row2, 25 )); \
+\
+ row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \
+ row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(0,3,2,1) ); \
+\
+ /* diagonal step */ \
+ buf1 = _mm_set_epi32(m[sig[r][14]], \
+ m[sig[r][12]], \
+ m[sig[r][10]], \
+ m[sig[r][ 8]]); \
+ buf2 = _mm_set_epi32(z[sig[r][15]], \
+ z[sig[r][13]], \
+ z[sig[r][11]], \
+ z[sig[r][ 9]]); \
+ buf1 = _mm_xor_si128( buf1, buf2); \
+ row1 = _mm_add_epi32( _mm_add_epi32( row1, buf1 ), row2 ); \
+ buf1 = _mm_set_epi32(z[sig[r][14]], \
+ z[sig[r][12]], \
+ z[sig[r][10]], \
+ z[sig[r][ 8]]); \
+ buf2 = _mm_set_epi32(m[sig[r][15]], \
+ m[sig[r][13]], \
+ m[sig[r][11]], \
+ m[sig[r][ 9]]); \
+ row4 = _mm_xor_si128( row4, row1 ); \
+ buf1 = _mm_xor_si128( buf1, buf2); \
+ row4 = _mm_shuffle_epi8(row4, r16); \
+ row3 = _mm_add_epi32( row3, row4 ); \
+ row2 = _mm_xor_si128( row2, row3 ); \
+ row2 = _mm_xor_si128(_mm_srli_epi32( row2, 12 ),_mm_slli_epi32( row2, 20 )); \
+ row1 = _mm_add_epi32( _mm_add_epi32( row1, buf1 ), row2 ); \
+ row4 = _mm_xor_si128( row4, row1 ); \
+ row4 = _mm_shuffle_epi8(row4, r8); \
+ row3 = _mm_add_epi32( row3, row4 ); \
+ row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(0,3,2,1) ); \
+ row2 = _mm_xor_si128( row2, row3 ); \
+ row2 = _mm_xor_si128(_mm_srli_epi32( row2, 7 ),_mm_slli_epi32( row2, 25 )); \
+\
+ row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \
+ row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(2,1,0,3) ); \
+\
+
+ round(0);
+ round(1);
+ round(2);
+ round(3);
+ round(4);
+ round(5);
+ round(6);
+ round(7);
+ round(8);
+ round(9);
+
+ _mm_store_si128( (__m128i *)m, _mm_xor_si128(row1,row3));
+ state->h[0] ^= m[ 0];
+ state->h[1] ^= m[ 1];
+ state->h[2] ^= m[ 2];
+ state->h[3] ^= m[ 3];
+ _mm_store_si128( (__m128i *)m, _mm_xor_si128(row2,row4));
+ state->h[4] ^= m[ 0];
+ state->h[5] ^= m[ 1];
+ state->h[6] ^= m[ 2];
+ state->h[7] ^= m[ 3];
+
+ return 0;
+}
+
+
+void blake32_init( state *S ) {
+
+ S->h[0]=0x6A09E667;
+ S->h[1]=0xBB67AE85;
+ S->h[2]=0x3C6EF372;
+ S->h[3]=0xA54FF53A;
+ S->h[4]=0x510E527F;
+ S->h[5]=0x9B05688C;
+ S->h[6]=0x1F83D9AB;
+ S->h[7]=0x5BE0CD19;
+ S->t[0]=S->t[1]=S->buflen=S->nullt=0;
+ S->s[0]=S->s[1]=S->s[2]=S->s[3] =0;
+}
+
+
+void blake32_update( state *S, const u8 *data, u64 datalen ) {
+
+ int left=S->buflen >> 3;
+ int fill=64 - left;
+
+ if( left && ( ((datalen >> 3) & 0x3F) >= fill ) ) {
+ memcpy( (void*) (S->buf + left), (void*) data, fill );
+ S->t[0] += 512;
+ if (S->t[0] == 0) S->t[1]++;
+ blake32_compress( S, S->buf );
+ data += fill;
+ datalen -= (fill << 3);
+ left = 0;
+ }
+
+ while( datalen >= 512 ) {
+ S->t[0] += 512;
+ if (S->t[0] == 0) S->t[1]++;
+ blake32_compress( S, data );
+ data += 64;
+ datalen -= 512;
+ }
+
+ if( datalen > 0 ) {
+ memcpy( (void*) (S->buf + left), (void*) data, datalen>>3 );
+ S->buflen = (left<<3) + datalen;
+ }
+ else S->buflen=0;
+}
+
+
+void blake32_final( state *S, u8 *digest ) {
+
+ u8 msglen[8], zo=0x01, oo=0x81;
+ u32 lo=S->t[0] + S->buflen, hi=S->t[1];
+ if ( lo < S->buflen ) hi++;
+ U32TO8( msglen + 0, hi );
+ U32TO8( msglen + 4, lo );
+
+ if ( S->buflen == 440 ) { /* one padding byte */
+ S->t[0] -= 8;
+ blake32_update( S, &oo, 8 );
+ }
+ else {
+ if ( S->buflen < 440 ) { /* enough space to fill the block */
+ if ( !S->buflen ) S->nullt=1;
+ S->t[0] -= 440 - S->buflen;
+ blake32_update( S, padding, 440 - S->buflen );
+ }
+ else { /* need 2 compressions */
+ S->t[0] -= 512 - S->buflen;
+ blake32_update( S, padding, 512 - S->buflen );
+ S->t[0] -= 440;
+ blake32_update( S, padding+1, 440 );
+ S->nullt = 1;
+ }
+ blake32_update( S, &zo, 8 );
+ S->t[0] -= 8;
+ }
+ S->t[0] -= 64;
+ blake32_update( S, msglen, 64 );
+
+ U32TO8( digest + 0, S->h[0]);
+ U32TO8( digest + 4, S->h[1]);
+ U32TO8( digest + 8, S->h[2]);
+ U32TO8( digest +12, S->h[3]);
+ U32TO8( digest +16, S->h[4]);
+ U32TO8( digest +20, S->h[5]);
+ U32TO8( digest +24, S->h[6]);
+ U32TO8( digest +28, S->h[7]);
+}
+
+
+void crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen ) {
+
+ state S;
+ blake32_init( &S );
+ blake32_update( &S, in, inlen*8 );
+ blake32_final( &S, out );
+}
+
+/*
+int main() {
+
+ int i, v;
+ u8 data[72], digest[32];
+ u8 test1[]= {0xD1, 0xE3, 0x9B, 0x45, 0x7D, 0x22, 0x50, 0xB4, 0xF5, 0xB1, 0x52, 0xE7, 0x41, 0x57, 0xFB, 0xA4, \
+ 0xC1, 0xB4, 0x23, 0xB8, 0x75, 0x49, 0x10, 0x6B, 0x07, 0xFD, 0x3A, 0x3E, 0x7F, 0x4A, 0xEB, 0x28};
+ u8 test2[]= {0x8A, 0x63, 0x84, 0x88, 0xC3, 0x18, 0xC5, 0xA8, 0x22, 0x2A, 0x18, 0x13, 0x17, 0x4C, 0x36, 0xB4, \
+ 0xBB, 0x66, 0xE4, 0x5B, 0x09, 0xAF, 0xDD, 0xFD, 0x7F, 0x2B, 0x2F, 0xE3, 0x16, 0x1B, 0x7A, 0x6D};
+
+ for(i=0; i<72; ++i) data[i]=0;
+
+ crypto_hash( digest, data, 1 );
+ v=0;
+ for(i=0; i<32; ++i) {
+ printf("%02X", digest[i]);
+ if ( digest[i] != test1[i]) v=1;
+ }
+ if (v) printf("\nerror\n");
+ else printf("\nok\n");
+
+ for(i=0; i<72; ++i) data[i]=0;
+
+ crypto_hash( digest, data, 72 );
+ v=0;
+ for(i=0; i<32; ++i) {
+ printf("%02X", digest[i]);
+ if ( digest[i] != test2[i]) v=1;
+ }
+ if (v) printf("\nerror\n");
+ else printf("\nok\n");
+
+ return 0;
+}
+*/
9 crypto_hash/blake64/ssse3/README
@@ -0,0 +1,9 @@
+BLAKE-64 ssse3 eBASH implementation
+authors: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
+ Peter Schwabe <peter@cryptojedi.org>
+ Samuel Neves <sneves@dei.uc.pt>
+
+This implementation assumes that no salt is used.
+
+Level of copyright protection: 0
+Level of patent protection: 0
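
The SSSE3-specific trick in the BLAKE-64 code below is doing the 64-bit rotation by 16 with one byte shuffle (_mm_shuffle_epi8 and the rot16 mask) instead of two shifts plus an XOR; the rotation by 32 is a pshufd, while the 25- and 11-bit rotations fall back to shift/shift/xor. A small standalone check of that mask, illustrative only (the mask bytes are copied from blake64_compress below; compile with -mssse3):

    #include <stdio.h>
    #include <stdint.h>
    #include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 */

    int main(void) {
      /* same byte mask as rot16[] in blake64_compress */
      static const uint8_t rot16[16] = {2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9};
      uint64_t in[2] = {0x0123456789ABCDEFULL, 0xFEDCBA9876543210ULL}, out[2];
      __m128i v = _mm_loadu_si128((const __m128i *)in);
      v = _mm_shuffle_epi8(v, _mm_loadu_si128((const __m128i *)rot16));
      _mm_storeu_si128((__m128i *)out, v);
      /* each 64-bit lane should now be rotated right by 16 bits */
      if (out[0] == ((in[0] >> 16) | (in[0] << 48)) &&
          out[1] == ((in[1] >> 16) | (in[1] << 48))) printf("ok\n");
      else printf("error\n");
      return 0;
    }
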
1  crypto_hash/blake64/ssse3/api.h
@@ -0,0 +1 @@
+#define CRYPTO_BYTES 64
437 crypto_hash/blake64/ssse3/hash.c
@@ -0,0 +1,437 @@
+#include <string.h>
+#include <stdio.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include "crypto_hash.h"
+
+typedef unsigned long long u64;
+typedef unsigned int u32;
+typedef unsigned char u8;
+
+#define U8TO32(p) \
+ (((u32)((p)[0]) << 24) | ((u32)((p)[1]) << 16) | \
+ ((u32)((p)[2]) << 8) | ((u32)((p)[3]) ))
+#define U8TO64(p) \
+ (((u64)U8TO32(p) << 32) | (u64)U8TO32((p) + 4))
+#define U32TO8(p, v) \
+ (p)[0] = (u8)((v) >> 24); (p)[1] = (u8)((v) >> 16); \
+ (p)[2] = (u8)((v) >> 8); (p)[3] = (u8)((v) );
+#define U64TO8(p, v) \
+ U32TO8((p), (u32)((v) >> 32)); \
+ U32TO8((p) + 4, (u32)((v) ));
+
+typedef struct {
+ u64 h[8], s[4], t[2];
+ int buflen, nullt;
+ u8 buf[128];
+} state;
+
+const u8 sigma[][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 },
+ {14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3 },
+ {11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4 },
+ { 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8 },
+ { 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13 },
+ { 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9 },
+ {12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11 },
+ {13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10 },
+ { 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5 },
+ {10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 },
+ {14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3 },
+ {11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4 },
+ { 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8 },
+ { 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13 },
+ { 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9 },
+ {12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11 },
+ {13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10 },
+ { 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5 },
+ {10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0 }
+ };
+
+const u64 cst[16] = {
+ 0x243F6A8885A308D3ULL,0x13198A2E03707344ULL,0xA4093822299F31D0ULL,0x082EFA98EC4E6C89ULL,
+ 0x452821E638D01377ULL,0xBE5466CF34E90C6CULL,0xC0AC29B7C97C50DDULL,0x3F84D5B5B5470917ULL,
+ 0x9216D5D98979FB1BULL,0xD1310BA698DFB5ACULL,0x2FFD72DBD01ADFB7ULL,0xB8E1AFED6A267E96ULL,
+ 0xBA7C9045F12C7F99ULL,0x24A19947B3916CF7ULL,0x0801F2E2858EFC16ULL,0x636920D871574E69ULL
+};
+
+static const u8 padding[129] =
+ { 0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+
+static int blake64_compress( state * state, const u8 * datablock ) {
+
+ __m128i row1a,row1b;
+ __m128i row2a,row2b;
+ __m128i row3a,row3b;
+ __m128i row4a,row4b;
+ __m128i buf1a,buf2a;
+ static const u8 rot16[16] = {2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9};
+ __m128i r16 = _mm_load_si128((__m128i*)rot16);
+
+
+ u64 m[16];
+ u64 y[16];
+
+ /* constants and permutation */
+ static const int sig[][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
+ };
+
+ static const u64 z[16] = {
+ 0x243F6A8885A308D3ULL,0x13198A2E03707344ULL,
+ 0xA4093822299F31D0ULL,0x082EFA98EC4E6C89ULL,
+ 0x452821E638D01377ULL,0xBE5466CF34E90C6CULL,
+ 0xC0AC29B7C97C50DDULL,0x3F84D5B5B5470917ULL,
+ 0x9216D5D98979FB1BULL,0xD1310BA698DFB5ACULL,
+ 0x2FFD72DBD01ADFB7ULL,0xB8E1AFED6A267E96ULL,
+ 0xBA7C9045F12C7F99ULL,0x24A19947B3916CF7ULL,
+ 0x0801F2E2858EFC16ULL,0x636920D871574E69ULL
+ };
+
+ /* get message */
+ m[ 0] = U8TO64(datablock + 0);
+ m[ 1] = U8TO64(datablock + 8);
+ m[ 2] = U8TO64(datablock + 16);
+ m[ 3] = U8TO64(datablock + 24);
+ m[ 4] = U8TO64(datablock + 32);
+ m[ 5] = U8TO64(datablock + 40);
+ m[ 6] = U8TO64(datablock + 48);
+ m[ 7] = U8TO64(datablock + 56);
+ m[ 8] = U8TO64(datablock + 64);
+ m[ 9] = U8TO64(datablock + 72);
+ m[10] = U8TO64(datablock + 80);
+ m[11] = U8TO64(datablock + 88);
+ m[12] = U8TO64(datablock + 96);
+ m[13] = U8TO64(datablock +104);
+ m[14] = U8TO64(datablock +112);
+ m[15] = U8TO64(datablock +120);
+
+ row1b = _mm_set_epi64((__m64)state->h[3],(__m64)state->h[2]);
+ row1a = _mm_set_epi64((__m64)state->h[1],(__m64)state->h[0]);
+ row2b = _mm_set_epi64((__m64)state->h[7],(__m64)state->h[6]);
+ row2a = _mm_set_epi64((__m64)state->h[5],(__m64)state->h[4]);
+ row3b = _mm_set_epi64((__m64)0x082EFA98EC4E6C89ULL,
+ (__m64)0xA4093822299F31D0ULL);
+ row3a = _mm_set_epi64((__m64)0x13198A2E03707344ULL,
+ (__m64)0x243F6A8885A308D3ULL);
+
+ if (state->nullt) {
+ row4b = _mm_set_epi64((__m64)0x3F84D5B5B5470917ULL,
+ (__m64)0xC0AC29B7C97C50DDULL);
+ row4a = _mm_set_epi64((__m64)0xBE5466CF34E90C6CULL,
+ (__m64)0x452821E638D01377ULL);
+ }
+ else {
+ row4b = _mm_set_epi64((__m64)(0x3F84D5B5B5470917ULL^state->t[1]),
+ (__m64)(0xC0AC29B7C97C50DDULL^state->t[1]));
+ row4a = _mm_set_epi64((__m64)(0xBE5466CF34E90C6CULL^state->t[0]),
+ (__m64)(0x452821E638D01377ULL^state->t[0]));
+ }
+ /* initialization ok (beware of bug on Celeron and P4!) */
+
+
+
+#define round(r)\
+ /* column step */\
+ /***************************************************/\
+ /* high-order side: words 0, 1, 4, 5, 8, 9, 12, 13 */ \
+ buf2a = _mm_set_epi64( (__m64)m[sig[r][ 2]], (__m64)m[sig[r][ 0]] ); \
+ buf1a = _mm_set_epi64( (__m64)z[sig[r][ 3]], (__m64)z[sig[r][ 1]] ); \
+ buf1a = _mm_xor_si128( buf1a, buf2a ); \
+ row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a ); \
+ row4a = _mm_xor_si128( row4a, row1a ); \
+ row4a = _mm_shuffle_epi32(row4a, 0xB1); \
+ row3a = _mm_add_epi64( row3a, row4a ); \
+ row2a = _mm_xor_si128( row2a, row3a ); \
+ row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 25 ),_mm_slli_epi64( row2a, 39 )); \
+ \
+ buf2a = _mm_set_epi64( (__m64)m[sig[r][ 3]], (__m64)m[sig[r][ 1]] ); \
+ buf1a = _mm_set_epi64( (__m64)z[sig[r][ 2]], (__m64)z[sig[r][ 0]] ); \
+ buf1a = _mm_xor_si128( buf1a, buf2a ); \
+ row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a ); \
+ row4a = _mm_xor_si128( row4a, row1a ); \
+ row4a = _mm_shuffle_epi8(row4a, r16); \
+ row3a = _mm_add_epi64( row3a, row4a ); \
+ row2a = _mm_xor_si128( row2a, row3a ); \
+ row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 11 ),_mm_slli_epi64( row2a, 53 )); \
+ \
+ /* same stuff for low-order side */\
+ buf2a = _mm_set_epi64( (__m64)m[sig[r][ 6]], (__m64)m[sig[r][ 4]] );\
+ buf1a = _mm_set_epi64( (__m64)z[sig[r][ 7]], (__m64)z[sig[r][ 5]] );\
+ buf1a = _mm_xor_si128( buf1a, buf2a ); \
+ row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b ); \
+ row4b = _mm_xor_si128( row4b, row1b ); \
+ row4b = _mm_shuffle_epi32(row4b, 0xB1); \
+ row3b = _mm_add_epi64( row3b, row4b ); \
+ row2b = _mm_xor_si128( row2b, row3b ); \
+ row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 25 ),_mm_slli_epi64( row2b, 39 )); \
+\
+ buf2a = _mm_set_epi64( (__m64)m[sig[r][ 7]], (__m64)m[sig[r][ 5]] ); \
+ buf1a = _mm_set_epi64( (__m64)z[sig[r][ 6]], (__m64)z[sig[r][ 4]] ); \
+ buf1a = _mm_xor_si128( buf1a, buf2a ); \
+ row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b ); \
+ row4b = _mm_xor_si128( row4b, row1b ); \
+ row4b = _mm_shuffle_epi8(row4b, r16); \
+ row3b = _mm_add_epi64( row3b, row4b ); \
+ row2b = _mm_xor_si128( row2b, row3b ); \
+ row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 11 ),_mm_slli_epi64( row2b, 53 )); \
+\
+ /* shuffle */\
+ _mm_store_si128( 0+ (__m128i *)y, row4a); \
+ _mm_store_si128( 1+ (__m128i *)y, row4b); \
+ row4a = row3a;\
+ row3a = row3b;\
+ row3b = row4a;\
+ row4a = _mm_set_epi64( (__m64)y[0], (__m64)y[3] );\
+ row4b = _mm_set_epi64( (__m64)y[2], (__m64)y[1] );\
+ _mm_store_si128( 0+ (__m128i *)y, row2a); \
+ _mm_store_si128( 1+ (__m128i *)y, row2b); \
+ row2a = _mm_set_epi64( (__m64)y[2], (__m64)y[1] ); \
+ row2b = _mm_set_epi64( (__m64)y[0], (__m64)y[3] ); \
+ /* diagonal step */\
+ /***************************************************/\
+ /* high-order side: words 0, 1, 4, 5, 8, 9, 12, 13 */\
+ buf2a = _mm_set_epi64( (__m64)m[sig[r][10]], (__m64)m[sig[r][ 8]] );\
+ buf1a = _mm_set_epi64( (__m64)z[sig[r][11]], (__m64)z[sig[r][ 9]] );\
+ buf1a = _mm_xor_si128( buf1a, buf2a );\
+ row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a );\
+ row4a = _mm_xor_si128( row4a, row1a ); \
+ row4a = _mm_shuffle_epi32(row4a, 0xB1); \
+ row3a = _mm_add_epi64( row3a, row4a ); \
+ row2a = _mm_xor_si128( row2a, row3a ); \
+ row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 25 ),_mm_slli_epi64( row2a, 39 )); \
+\
+ buf2a = _mm_set_epi64( (__m64)m[sig[r][11]], (__m64)m[sig[r][ 9]] );\
+ buf1a = _mm_set_epi64( (__m64)z[sig[r][10]], (__m64)z[sig[r][ 8]] );\
+ buf1a = _mm_xor_si128( buf1a, buf2a );\
+ row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a );\
+ row4a = _mm_xor_si128( row4a, row1a ); \
+ row4a = _mm_shuffle_epi8(row4a, r16); \
+ row3a = _mm_add_epi64( row3a, row4a ); \
+ row2a = _mm_xor_si128( row2a, row3a ); \
+ row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 11 ),_mm_slli_epi64( row2a, 53 )); \
+\
+ /* same stuff for low-order side */\
+ buf2a = _mm_set_epi64( (__m64)m[sig[r][14]], (__m64)m[sig[r][12]] );\
+ buf1a = _mm_set_epi64( (__m64)z[sig[r][15]], (__m64)z[sig[r][13]] );\
+ buf1a = _mm_xor_si128( buf1a, buf2a );\
+ row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b );\
+ row4b = _mm_xor_si128( row4b, row1b ); \
+ buf2a = _mm_set_epi64( (__m64)m[sig[r][15]], (__m64)m[sig[r][13]] );\
+ row4b = _mm_shuffle_epi32(row4b, 0xB1); \
+ row3b = _mm_add_epi64( row3b, row4b ); \
+ row2b = _mm_xor_si128( row2b, row3b ); \
+ buf1a = _mm_set_epi64( (__m64)z[sig[r][14]], (__m64)z[sig[r][12]] );\
+ row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 25 ),_mm_slli_epi64( row2b, 39 )); \
+\
+ buf1a = _mm_xor_si128( buf1a, buf2a );\
+ row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b );\
+ row4b = _mm_xor_si128( row4b, row1b ); \
+ row4b = _mm_shuffle_epi8(row4b, r16); \
+ row3b = _mm_add_epi64( row3b, row4b ); \
+ row2b = _mm_xor_si128( row2b, row3b ); \
+ row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 11 ),_mm_slli_epi64( row2b, 53 )); \
+\
+ /* shuffle back */\
+ buf1a = row3a;\
+ row3a = row3b;\
+ row3b = buf1a;\
+ _mm_store_si128( 0+ (__m128i *)y, row2a); \
+ _mm_store_si128( 1+ (__m128i *)y, row2b); \
+ row2a = _mm_set_epi64( (__m64)y[0], (__m64)y[3] ); \
+ row2b = _mm_set_epi64( (__m64)y[2], (__m64)y[1] ); \
+ _mm_store_si128( 0+ (__m128i *)y, row4a); \
+ _mm_store_si128( 1+ (__m128i *)y, row4b); \
+ row4a = _mm_set_epi64( (__m64)y[2], (__m64)y[1] ); \
+ row4b = _mm_set_epi64( (__m64)y[0], (__m64)y[3] ); \
+ \
+
+ round(0);
+ round(1);
+ round(2);
+ round(3);
+ round(4);
+ round(5);
+ round(6);
+ round(7);
+ round(8);
+ round(9);
+ round(10);
+ round(11);
+ round(12);
+ round(13);
+
+ row1a = _mm_xor_si128(row3a,row1a);
+ row1b = _mm_xor_si128(row3b,row1b);
+ _mm_store_si128( (__m128i *)m, row1a);
+ state->h[0] ^= m[ 0];
+ state->h[1] ^= m[ 1];
+ _mm_store_si128( (__m128i *)m, row1b);
+ state->h[2] ^= m[ 0];
+ state->h[3] ^= m[ 1];
+
+ row2a = _mm_xor_si128(row4a,row2a);
+ row2b = _mm_xor_si128(row4b,row2b);
+ _mm_store_si128( (__m128i *)m, row2a);
+ state->h[4] ^= m[ 0];
+ state->h[5] ^= m[ 1];
+ _mm_store_si128( (__m128i *)m, row2b);
+ state->h[6] ^= m[ 0];
+ state->h[7] ^= m[ 1];
+
+ return 0;
+}
+
+void blake64_init( state * S ) {
+
+ S->h[0]=0x6A09E667F3BCC908ULL;
+ S->h[1]=0xBB67AE8584CAA73BULL;
+ S->h[2]=0x3C6EF372FE94F82BULL;
+ S->h[3]=0xA54FF53A5F1D36F1ULL;
+ S->h[4]=0x510E527FADE682D1ULL;
+ S->h[5]=0x9B05688C2B3E6C1FULL;
+ S->h[6]=0x1F83D9ABFB41BD6BULL;
+ S->h[7]=0x5BE0CD19137E2179ULL;
+ S->t[0]=S->t[1]=S->buflen=S->nullt=0;
+ S->s[0]=S->s[1]=S->s[2]=S->s[3] =0;
+
+}
+
+
+void blake64_update( state * S, const u8 * data, u64 datalen ) {
+
+
+ int left = (S->buflen >> 3);
+ int fill = 128 - left;
+
+ if( left && ( ((datalen >> 3) & 0x7F) >= fill ) ) {
+ memcpy( (void *) (S->buf + left), (void *) data, fill );
+ S->t[0] += 1024;
+ blake64_compress( S, S->buf );
+ data += fill;
+ datalen -= (fill << 3);
+ left = 0;
+ }
+
+ while( datalen >= 1024 ) {
+ S->t[0] += 1024;
+ blake64_compress( S, data );
+ data += 128;
+ datalen -= 1024;
+ }
+
+ if( datalen > 0 ) {
+ memcpy( (void *) (S->buf + left), (void *) data, ( datalen>>3 ) & 0x7F );
+ S->buflen = (left<<3) + datalen;
+ }
+ else S->buflen=0;
+}
+
+
+void blake64_final( state * S, u8 * digest ) {
+
+ u8 msglen[16], zo=0x01,oo=0x81;
+ u64 lo=S->t[0] + S->buflen, hi = S->t[1];
+ if ( lo < S->buflen ) hi++;
+ U64TO8( msglen + 0, hi );
+ U64TO8( msglen + 8, lo );
+
+ if ( S->buflen == 888 ) { /* one padding byte */
+ S->t[0] -= 8;
+ blake64_update( S, &oo, 8 );
+ }
+ else {
+ if ( S->buflen < 888 ) { /* enough space to fill the block */
+ if ( S->buflen == 0 ) S->nullt=1;
+ S->t[0] -= 888 - S->buflen;
+ blake64_update( S, padding, 888 - S->buflen );
+ }
+ else { /* NOT enough space, need 2 compressions */
+ S->t[0] -= 1024 - S->buflen;
+ blake64_update( S, padding, 1024 - S->buflen );
+ S->t[0] -= 888;
+ blake64_update( S, padding+1, 888 );
+ S->nullt = 1;
+ }
+ blake64_update( S, &zo, 8 );
+ S->t[0] -= 8;
+ }
+ S->t[0] -= 128;
+ blake64_update( S, msglen, 128 );
+
+ U64TO8( digest + 0, S->h[0]);
+ U64TO8( digest + 8, S->h[1]);
+ U64TO8( digest +16, S->h[2]);
+ U64TO8( digest +24, S->h[3]);
+ U64TO8( digest +32, S->h[4]);
+ U64TO8( digest +40, S->h[5]);
+ U64TO8( digest +48, S->h[6]);
+ U64TO8( digest +56, S->h[7]);
+}
+
+
+void crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen ) {
+
+ state S;
+ blake64_init( &S );
+ blake64_update( &S, in, inlen*8 );
+ blake64_final( &S, out );
+}
+
+/*
+int main() {
+
+ int i, v;
+ u8 data[144], digest[64];
+ u8 test1[]= {0x76, 0x5F, 0x70, 0x84, 0x54, 0x82, 0x26, 0xC3, 0xE6, 0xF4, 0x77, 0x9B, 0x95, 0x46, 0x61, 0xDF, \
+ 0x49, 0xA2, 0x72, 0xE2, 0xBA, 0x16, 0x63, 0x5F, 0x17, 0xA3, 0x09, 0x37, 0x56, 0xAA, 0x93, 0x64, \
+ 0x2A, 0x92, 0xE5, 0xBD, 0xDB, 0x21, 0xA3, 0x21, 0x8F, 0x72, 0xB7, 0xFD, 0x44, 0xE9, 0xFA, 0x19, \
+ 0xF8, 0x6A, 0x86, 0x33, 0x4E, 0xBE, 0xDA, 0x0F, 0x4D, 0x42, 0x04, 0xBF, 0x3B, 0x6B, 0xED, 0x68};
+ u8 test2[]= {0xEA, 0xB7, 0x30, 0x28, 0x04, 0x28, 0x21, 0x05, 0x71, 0xF3, 0xF8, 0xDE, 0xE6, 0x78, 0xA9, 0xB1, \
+ 0xBB, 0xEF, 0x58, 0xDF, 0x55, 0x47, 0x12, 0x65, 0xB7, 0x1E, 0x26, 0x2B, 0x8E, 0xFF, 0xBA, 0x25, \
+ 0x33, 0xC1, 0x53, 0x17, 0xC3, 0xE9, 0xF8, 0x97, 0xB2, 0x69, 0xED, 0x41, 0x46, 0xAE, 0xD0, 0xF3, \
+ 0xA2, 0x98, 0x27, 0x06, 0x00, 0x55, 0xCA, 0x14, 0x65, 0x27, 0x53, 0xEF, 0xE2, 0x0A, 0x91, 0x3E};
+
+ for(i=0; i<144; ++i) data[i]=0;
+
+ crypto_hash( digest, data, 1 );
+ v=0;
+ for(i=0; i<64; ++i) {
+ printf("%02X", digest[i]);
+ if ( digest[i] != test1[i]) v=1;
+ }
+ if (v) printf("\nerror\n");
+ else printf("\nok\n");
+
+ for(i=0; i<144; ++i) data[i]=0;
+
+ crypto_hash( digest, data, 144 );
+ v=0;
+ for(i=0; i<64; ++i) {
+ printf("%02X", digest[i]);
+ if ( digest[i] != test2[i]) v=1;
+ }
+ if (v) printf("\nerror\n");
+ else printf("\nok\n");
+
+ return 0;
+}
+*/
2  crypto_stream/chacha12/amd64-ssse3/api.h
@@ -0,0 +1,2 @@
+#define crypto_stream_chacha12_e_amd64_ssse3_KEYBYTES 32
+#define crypto_stream_chacha12_e_amd64_ssse3_NONCEBYTES 8
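
The assembly file that follows exports the eSTREAM ECRYPT entry points (ECRYPT_keysetup, ECRYPT_ivsetup, ECRYPT_encrypt_bytes, ECRYPT_keystream_bytes), which SUPERCOP's crypto_stream wrapper drives with the key and nonce sizes given in api.h above. A caller sketch, assuming the standard eSTREAM prototypes; the context type below is a hypothetical stand-in for ECRYPT_ctx and is kept 16-byte aligned (likewise the 64-byte buffers), since the assembly uses aligned SSE loads and stores on the state and on full single blocks:

    #include <stdio.h>
    #include <stdint.h>

    /* Assumed eSTREAM ECRYPT prototypes; the symbols themselves are defined
       in chacha.s below.  Key/IV sizes are passed in bits, data lengths in bytes. */
    typedef struct {
      uint32_t input[16];                       /* constants, key, counter, nonce */
    } __attribute__((aligned(16))) chacha_ctx;

    extern void ECRYPT_keysetup(chacha_ctx *x, const uint8_t *k,
                                uint32_t kbits, uint32_t ivbits);
    extern void ECRYPT_ivsetup(chacha_ctx *x, const uint8_t *iv);
    extern void ECRYPT_encrypt_bytes(chacha_ctx *x, const uint8_t *m,
                                     uint8_t *c, uint32_t bytes);

    int main(void) {
      static const uint8_t key[32] = {0};                 /* KEYBYTES   = 32 */
      static const uint8_t iv[8]   = {0};                 /* NONCEBYTES = 8  */
      uint8_t m[64] __attribute__((aligned(16))) = {0};
      uint8_t c[64] __attribute__((aligned(16)));
      chacha_ctx x;
      int i;
      ECRYPT_keysetup(&x, key, 256, 64);
      ECRYPT_ivsetup(&x, iv);
      ECRYPT_encrypt_bytes(&x, m, c, sizeof c);           /* keystream, since m is zero */
      for (i = 0; i < 16; ++i) printf("%02X", c[i]);
      printf("\n");
      return 0;
    }
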
1,533 crypto_stream/chacha12/amd64-ssse3/chacha.s
@@ -0,0 +1,1533 @@
+# Author: Samuel Neves
+# ChaCha stream cipher
+# Derived from the 'amd64-xmm6' implementation by Daniel Bernstein
+# Requires SSSE3 extensions (i.e. Core 2, Core i7, Atom)
+
+.data
+
+.globl R16
+.globl R08
+
+.p2align 6
+
+R16: .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+R08: .byte 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
+
+.text
+.p2align 5
+.globl _ECRYPT_keystream_bytes
+.globl ECRYPT_keystream_bytes
+_ECRYPT_keystream_bytes:
+ECRYPT_keystream_bytes:
+mov %rsp,%r11
+and $31,%r11
+add $384,%r11
+sub %r11,%rsp
+mov %rdi,%r8
+mov %rsi,%rsi
+mov %rsi,%rdi
+mov %rdx,%rdx
+cmp $0,%rdx
+
+jbe ._done
+
+mov $0,%rax
+
+mov %rdx,%rcx
+
+rep stosb
+
+sub %rdx,%rdi
+
+jmp ._start
+
+.text
+.p2align 5
+.globl _ECRYPT_decrypt_bytes
+.globl ECRYPT_decrypt_bytes
+_ECRYPT_decrypt_bytes:
+ECRYPT_decrypt_bytes:
+mov %rsp,%r11
+and $31,%r11
+add $384,%r11
+sub %r11,%rsp
+
+mov %rdi,%r8
+
+mov %rsi,%rsi
+
+mov %rdx,%rdi
+
+mov %rcx,%rdx
+
+cmp $0,%rdx
+
+jbe ._done
+
+jmp ._start
+
+.text
+.p2align 5
+.globl _ECRYPT_encrypt_bytes
+.globl ECRYPT_encrypt_bytes
+_ECRYPT_encrypt_bytes:
+ECRYPT_encrypt_bytes:
+mov %rsp,%r11
+and $31,%r11
+add $384,%r11
+sub %r11,%rsp
+
+mov %rdi,%r8
+
+mov %rsi,%rsi
+
+mov %rdx,%rdi
+
+mov %rcx,%rdx
+
+cmp $0,%rdx
+
+jbe ._done
+
+._start:
+
+cmp $256,%rdx
+
+jb ._bytesbetween1and255
+
+movdqa 0(%r8),%xmm0
+
+pshufd $0x55,%xmm0,%xmm1
+
+pshufd $0xaa,%xmm0,%xmm2
+
+pshufd $0xff,%xmm0,%xmm3
+
+pshufd $0x00,%xmm0,%xmm0
+
+movdqa %xmm0,0(%rsp)
+
+movdqa %xmm1,16(%rsp)
+
+movdqa %xmm2,32(%rsp)
+
+movdqa %xmm3,48(%rsp)
+
+movdqa 16(%r8),%xmm0
+
+pshufd $0x55,%xmm0,%xmm1
+
+pshufd $0xaa,%xmm0,%xmm2
+
+pshufd $0xff,%xmm0,%xmm3
+
+pshufd $0x00,%xmm0,%xmm0
+
+movdqa %xmm0,64(%rsp)
+
+movdqa %xmm1,80(%rsp)
+
+movdqa %xmm2,96(%rsp)
+
+movdqa %xmm3,112(%rsp)
+
+movdqa 32(%r8),%xmm0
+
+pshufd $0x55,%xmm0,%xmm1
+
+pshufd $0xaa,%xmm0,%xmm2
+
+pshufd $0xff,%xmm0,%xmm3
+
+pshufd $0x00,%xmm0,%xmm0
+
+movdqa %xmm0,128(%rsp)
+
+movdqa %xmm1,144(%rsp)
+
+movdqa %xmm2,160(%rsp)
+
+movdqa %xmm3,176(%rsp)
+
+movdqa 48(%r8),%xmm0
+
+pshufd $0xaa,%xmm0,%xmm1
+
+pshufd $0xff,%xmm0,%xmm0
+
+movdqa %xmm1,192(%rsp)
+
+movdqa %xmm0,208(%rsp)
+
+._bytesatleast256:
+
+movq %rdx,288(%rsp)
+
+movq 48(%r8),%rdx
+
+lea 1(%rdx),%rcx
+
+lea 2(%rdx),%r9
+
+lea 3(%rdx),%rax
+
+lea 4(%rdx),%r10
+
+movl %edx,224(%rsp)
+
+movl %ecx,4+224(%rsp)
+
+movl %r9d,8+224(%rsp)
+
+movl %eax,12+224(%rsp)
+
+shr $32,%rdx
+
+shr $32,%rcx
+
+shr $32,%r9
+
+shr $32,%rax
+
+movl %edx,240(%rsp)
+
+movl %ecx,4+240(%rsp)
+
+movl %r9d,8+240(%rsp)
+
+movl %eax,12+240(%rsp)
+
+movq %r10,48(%r8)
+
+mov $12,%rdx
+
+movdqa 32(%rsp),%xmm0
+
+movdqa 96(%rsp),%xmm1
+
+movdqa 160(%rsp),%xmm2
+
+movdqa 192(%rsp),%xmm3
+
+movdqa 48(%rsp),%xmm4
+
+movdqa 112(%rsp),%xmm5
+
+movdqa 176(%rsp),%xmm6
+
+movdqa 208(%rsp),%xmm7
+
+movdqa 0(%rsp),%xmm8
+
+movdqa 64(%rsp),%xmm9
+
+movdqa 128(%rsp),%xmm10
+
+movdqa 224(%rsp),%xmm11
+
+movdqa 16(%rsp),%xmm12
+
+movdqa 80(%rsp),%xmm13
+
+movdqa 144(%rsp),%xmm14
+
+movdqa 240(%rsp),%xmm15
+
+movdqa %xmm6,256(%rsp)
+
+._mainloop1:
+
+movdqa R16, %xmm6 # load
+
+paddd %xmm9,%xmm8
+
+pxor %xmm8,%xmm11
+
+paddd %xmm13,%xmm12
+
+pxor %xmm12,%xmm15
+
+#movdqa %xmm11,%xmm6
+#psrld $16,%xmm11
+#pslld $16,%xmm6
+#pxor %xmm6,%xmm11
+pshufb %xmm6, %xmm11
+
+#movdqa %xmm15,%xmm6
+#psrld $16,%xmm15
+#pslld $16,%xmm6
+#pxor %xmm6,%xmm15
+pshufb %xmm6, %xmm15
+
+paddd %xmm11,%xmm10
+
+pxor %xmm10,%xmm9
+
+paddd %xmm15,%xmm14
+
+pxor %xmm14,%xmm13
+
+movdqa %xmm9,%xmm6
+
+psrld $20,%xmm9
+
+pslld $12,%xmm6
+
+pxor %xmm6,%xmm9
+
+movdqa %xmm13,%xmm6
+
+psrld $20,%xmm13
+
+pslld $12,%xmm6
+
+pxor %xmm6,%xmm13
+
+movdqa R08, %xmm6 # load
+
+paddd %xmm9,%xmm8
+
+pxor %xmm8,%xmm11
+
+#movdqa %xmm11,%xmm6
+#psrld $24,%xmm11
+#pslld $8,%xmm6
+#pxor %xmm6,%xmm11
+pshufb %xmm6, %xmm11
+
+paddd %xmm13,%xmm12
+
+pxor %xmm12,%xmm15
+
+#movdqa %xmm15,%xmm6
+#psrld $24,%xmm15
+#pslld $8,%xmm6
+#pxor %xmm6,%xmm15
+pshufb %xmm6, %xmm15
+
+paddd %xmm11,%xmm10
+
+pxor %xmm10,%xmm9
+
+movdqa %xmm9,%xmm6
+
+psrld $25,%xmm9
+
+pslld $7,%xmm6
+
+pxor %xmm6,%xmm9
+
+paddd %xmm15,%xmm14
+
+pxor %xmm14,%xmm13
+
+movdqa %xmm13,%xmm6
+
+psrld $25,%xmm13
+
+pslld $7,%xmm6
+
+pxor %xmm6,%xmm13
+
+movdqa %xmm14,272(%rsp)
+
+movdqa 256(%rsp),%xmm6
+
+movdqa R16, %xmm14 # load
+
+paddd %xmm1,%xmm0
+
+pxor %xmm0,%xmm3
+
+#movdqa %xmm3,%xmm14
+#psrld $16,%xmm3
+#pslld $16,%xmm14
+#pxor %xmm14,%xmm3
+pshufb %xmm14, %xmm3
+
+paddd %xmm5,%xmm4
+
+pxor %xmm4,%xmm7
+
+#movdqa %xmm7,%xmm14
+#psrld $16,%xmm7
+#pslld $16,%xmm14
+#pxor %xmm14,%xmm7
+pshufb %xmm14, %xmm7
+
+paddd %xmm3,%xmm2
+
+pxor %xmm2,%xmm1
+
+movdqa %xmm1,%xmm14
+
+psrld $20,%xmm1
+
+pslld $12,%xmm14
+
+pxor %xmm14,%xmm1
+
+paddd %xmm7,%xmm6
+
+pxor %xmm6,%xmm5
+
+movdqa %xmm5,%xmm14
+
+psrld $20,%xmm5
+
+pslld $12,%xmm14
+
+pxor %xmm14,%xmm5
+
+movdqa R08, %xmm14 # load
+
+paddd %xmm1,%xmm0
+
+pxor %xmm0,%xmm3
+
+#movdqa %xmm3,%xmm14
+#psrld $24,%xmm3
+#pslld $8,%xmm14
+#pxor %xmm14,%xmm3
+pshufb %xmm14, %xmm3
+
+paddd %xmm5,%xmm4
+
+pxor %xmm4,%xmm7
+
+#movdqa %xmm7,%xmm14
+#psrld $24,%xmm7
+#pslld $8,%xmm14
+#pxor %xmm14,%xmm7
+pshufb %xmm14, %xmm7
+
+paddd %xmm3,%xmm2
+
+pxor %xmm2,%xmm1
+
+movdqa %xmm1,%xmm14
+
+psrld $25,%xmm1
+
+pslld $7,%xmm14
+
+pxor %xmm14,%xmm1
+
+paddd %xmm7,%xmm6
+
+pxor %xmm6,%xmm5
+
+movdqa %xmm5,%xmm14
+
+psrld $25,%xmm5
+
+pslld $7,%xmm14
+
+pxor %xmm14,%xmm5
+
+movdqa R16, %xmm14 # load
+
+paddd %xmm13,%xmm8
+
+pxor %xmm8,%xmm7
+
+#movdqa %xmm7,%xmm14
+#psrld $16,%xmm7
+#pslld $16,%xmm14
+#pxor %xmm14,%xmm7
+pshufb %xmm14, %xmm7
+
+paddd %xmm1,%xmm12
+
+pxor %xmm12,%xmm11
+
+#movdqa %xmm11,%xmm14
+#psrld $16,%xmm11
+#pslld $16,%xmm14
+#pxor %xmm14,%xmm11
+pshufb %xmm14, %xmm11
+
+paddd %xmm7,%xmm2
+
+pxor %xmm2,%xmm13
+
+movdqa %xmm13,%xmm14
+
+psrld $20,%xmm13
+
+pslld $12,%xmm14
+
+pxor %xmm14,%xmm13
+
+paddd %xmm11,%xmm6
+
+pxor %xmm6,%xmm1
+
+movdqa %xmm1,%xmm14
+
+psrld $20,%xmm1
+
+pslld $12,%xmm14
+
+pxor %xmm14,%xmm1
+
+movdqa R08, %xmm14 # load
+
+paddd %xmm13,%xmm8
+
+pxor %xmm8,%xmm7
+
+#movdqa %xmm7,%xmm14
+#psrld $24,%xmm7
+#pslld $8,%xmm14
+#pxor %xmm14,%xmm7
+pshufb %xmm14, %xmm7
+
+paddd %xmm1,%xmm12
+
+pxor %xmm12,%xmm11
+
+#movdqa %xmm11,%xmm14
+#psrld $24,%xmm11
+#pslld $8,%xmm14
+#pxor %xmm14,%xmm11
+pshufb %xmm14, %xmm11
+
+paddd %xmm7,%xmm2
+
+pxor %xmm2,%xmm13
+
+movdqa %xmm13,%xmm14
+
+psrld $25,%xmm13
+
+pslld $7,%xmm14
+
+pxor %xmm14,%xmm13
+
+paddd %xmm11,%xmm6
+
+pxor %xmm6,%xmm1
+
+movdqa %xmm1,%xmm14
+
+psrld $25,%xmm1
+
+pslld $7,%xmm14
+
+pxor %xmm14,%xmm1
+
+movdqa %xmm6,256(%rsp)
+
+movdqa 272(%rsp),%xmm14
+
+movdqa R16, %xmm6 # load
+
+paddd %xmm5,%xmm0
+
+pxor %xmm0,%xmm15
+
+#movdqa %xmm15,%xmm6
+#psrld $16,%xmm15
+#pslld $16,%xmm6
+#pxor %xmm6,%xmm15
+pshufb %xmm6, %xmm15
+
+paddd %xmm9,%xmm4
+
+pxor %xmm4,%xmm3
+
+#movdqa %xmm3,%xmm6
+#psrld $16,%xmm3
+#pslld $16,%xmm6
+#pxor %xmm6,%xmm3
+pshufb %xmm6, %xmm3
+
+paddd %xmm15,%xmm10
+
+pxor %xmm10,%xmm5
+
+movdqa %xmm5,%xmm6
+
+psrld $20,%xmm5
+
+pslld $12,%xmm6
+
+pxor %xmm6,%xmm5
+
+sub $2,%rdx
+
+paddd %xmm3,%xmm14
+
+pxor %xmm14,%xmm9
+
+movdqa %xmm9,%xmm6
+
+psrld $20,%xmm9
+
+pslld $12,%xmm6
+
+pxor %xmm6,%xmm9
+
+movdqa R08, %xmm6 # load
+
+paddd %xmm5,%xmm0
+
+pxor %xmm0,%xmm15
+
+#movdqa %xmm15,%xmm6
+#psrld $24,%xmm15
+#pslld $8,%xmm6
+#pxor %xmm6,%xmm15
+pshufb %xmm6, %xmm15
+
+paddd %xmm9,%xmm4
+
+pxor %xmm4,%xmm3
+
+#movdqa %xmm3,%xmm6
+#psrld $24,%xmm3
+#pslld $8,%xmm6
+#pxor %xmm6,%xmm3
+pshufb %xmm6, %xmm3
+
+paddd %xmm15,%xmm10
+
+pxor %xmm10,%xmm5
+
+movdqa %xmm5,%xmm6
+
+psrld $25,%xmm5
+
+pslld $7,%xmm6
+
+pxor %xmm6,%xmm5
+
+paddd %xmm3,%xmm14
+
+pxor %xmm14,%xmm9
+
+movdqa %xmm9,%xmm6
+
+psrld $25,%xmm9
+
+pslld $7,%xmm6
+
+pxor %xmm6,%xmm9
+
+ja ._mainloop1
+
+movdqa 256(%rsp),%xmm6
+
+paddd 0(%rsp),%xmm8
+
+paddd 16(%rsp),%xmm12
+
+paddd 32(%rsp),%xmm0
+
+paddd 48(%rsp),%xmm4
+
+movd %xmm8,%rdx
+
+movd %xmm12,%rcx
+
+movd %xmm0,%r9
+
+movd %xmm4,%rax
+
+pshufd $0x39,%xmm8,%xmm8
+
+pshufd $0x39,%xmm12,%xmm12
+
+pshufd $0x39,%xmm0,%xmm0
+
+pshufd $0x39,%xmm4,%xmm4
+
+xorl 0(%rsi),%edx
+
+xorl 4(%rsi),%ecx
+
+xorl 8(%rsi),%r9d
+
+xorl 12(%rsi),%eax
+
+movl %edx,0(%rdi)
+
+movl %ecx,4(%rdi)
+
+movl %r9d,8(%rdi)
+
+movl %eax,12(%rdi)
+
+movd %xmm8,%rdx
+
+movd %xmm12,%rcx
+
+movd %xmm0,%r9
+
+movd %xmm4,%rax
+
+pshufd $0x39,%xmm8,%xmm8
+
+pshufd $0x39,%xmm12,%xmm12
+
+pshufd $0x39,%xmm0,%xmm0
+
+pshufd $0x39,%xmm4,%xmm4
+
+xorl 64(%rsi),%edx
+
+xorl 68(%rsi),%ecx
+
+xorl 72(%rsi),%r9d
+
+xorl 76(%rsi),%eax
+
+movl %edx,64(%rdi)
+
+movl %ecx,68(%rdi)
+
+movl %r9d,72(%rdi)
+
+movl %eax,76(%rdi)
+
+movd %xmm8,%rdx
+
+movd %xmm12,%rcx
+
+movd %xmm0,%r9
+
+movd %xmm4,%rax
+
+pshufd $0x39,%xmm8,%xmm8
+
+pshufd $0x39,%xmm12,%xmm12
+
+pshufd $0x39,%xmm0,%xmm0
+
+pshufd $0x39,%xmm4,%xmm4
+
+xorl 128(%rsi),%edx
+
+xorl 132(%rsi),%ecx
+
+xorl 136(%rsi),%r9d
+
+xorl 140(%rsi),%eax
+
+movl %edx,128(%rdi)
+
+movl %ecx,132(%rdi)
+
+movl %r9d,136(%rdi)
+
+movl %eax,140(%rdi)
+
+movd %xmm8,%rdx
+
+movd %xmm12,%rcx
+
+movd %xmm0,%r9
+
+movd %xmm4,%rax
+
+xorl 192(%rsi),%edx
+
+xorl 196(%rsi),%ecx
+
+xorl 200(%rsi),%r9d
+
+xorl 204(%rsi),%eax
+
+movl %edx,192(%rdi)
+
+movl %ecx,196(%rdi)
+
+movl %r9d,200(%rdi)
+
+movl %eax,204(%rdi)
+
+paddd 64(%rsp),%xmm9
+
+paddd 80(%rsp),%xmm13
+
+paddd 96(%rsp),%xmm1
+
+paddd 112(%rsp),%xmm5
+
+movd %xmm9,%rdx
+
+movd %xmm13,%rcx
+
+movd %xmm1,%r9
+
+movd %xmm5,%rax
+
+pshufd $0x39,%xmm9,%xmm9
+
+pshufd $0x39,%xmm13,%xmm13
+
+pshufd $0x39,%xmm1,%xmm1
+
+pshufd $0x39,%xmm5,%xmm5
+
+xorl 16(%rsi),%edx
+
+xorl 20(%rsi),%ecx
+
+xorl 24(%rsi),%r9d
+
+xorl 28(%rsi),%eax
+
+movl %edx,16(%rdi)
+
+movl %ecx,20(%rdi)
+
+movl %r9d,24(%rdi)
+
+movl %eax,28(%rdi)
+
+movd %xmm9,%rdx
+
+movd %xmm13,%rcx
+
+movd %xmm1,%r9
+
+movd %xmm5,%rax
+
+pshufd $0x39,%xmm9,%xmm9
+
+pshufd $0x39,%xmm13,%xmm13
+
+pshufd $0x39,%xmm1,%xmm1
+
+pshufd $0x39,%xmm5,%xmm5
+
+xorl 80(%rsi),%edx
+
+xorl 84(%rsi),%ecx
+
+xorl 88(%rsi),%r9d
+
+xorl 92(%rsi),%eax
+
+movl %edx,80(%rdi)
+
+movl %ecx,84(%rdi)
+
+movl %r9d,88(%rdi)
+
+movl %eax,92(%rdi)
+
+movd %xmm9,%rdx
+
+movd %xmm13,%rcx
+
+movd %xmm1,%r9
+
+movd %xmm5,%rax
+
+pshufd $0x39,%xmm9,%xmm9
+
+pshufd $0x39,%xmm13,%xmm13
+
+pshufd $0x39,%xmm1,%xmm1
+
+pshufd $0x39,%xmm5,%xmm5
+
+xorl 144(%rsi),%edx
+
+xorl 148(%rsi),%ecx
+
+xorl 152(%rsi),%r9d
+
+xorl 156(%rsi),%eax
+
+movl %edx,144(%rdi)
+
+movl %ecx,148(%rdi)
+
+movl %r9d,152(%rdi)
+
+movl %eax,156(%rdi)
+
+movd %xmm9,%rdx
+
+movd %xmm13,%rcx
+
+movd %xmm1,%r9
+
+movd %xmm5,%rax
+
+xorl 208(%rsi),%edx
+
+xorl 212(%rsi),%ecx
+
+xorl 216(%rsi),%r9d
+
+xorl 220(%rsi),%eax
+
+movl %edx,208(%rdi)
+
+movl %ecx,212(%rdi)
+
+movl %r9d,216(%rdi)
+
+movl %eax,220(%rdi)
+
+paddd 128(%rsp),%xmm10
+
+paddd 144(%rsp),%xmm14
+
+paddd 160(%rsp),%xmm2
+
+paddd 176(%rsp),%xmm6
+
+movd %xmm10,%rdx
+
+movd %xmm14,%rcx
+
+movd %xmm2,%r9
+
+movd %xmm6,%rax
+
+pshufd $0x39,%xmm10,%xmm10
+
+pshufd $0x39,%xmm14,%xmm14
+
+pshufd $0x39,%xmm2,%xmm2
+
+pshufd $0x39,%xmm6,%xmm6
+
+xorl 32(%rsi),%edx
+
+xorl 36(%rsi),%ecx
+
+xorl 40(%rsi),%r9d
+
+xorl 44(%rsi),%eax
+
+movl %edx,32(%rdi)
+
+movl %ecx,36(%rdi)
+
+movl %r9d,40(%rdi)
+
+movl %eax,44(%rdi)
+
+movd %xmm10,%rdx
+
+movd %xmm14,%rcx
+
+movd %xmm2,%r9
+
+movd %xmm6,%rax
+
+pshufd $0x39,%xmm10,%xmm10
+
+pshufd $0x39,%xmm14,%xmm14
+
+pshufd $0x39,%xmm2,%xmm2
+
+pshufd $0x39,%xmm6,%xmm6
+
+xorl 96(%rsi),%edx
+
+xorl 100(%rsi),%ecx
+
+xorl 104(%rsi),%r9d
+
+xorl 108(%rsi),%eax
+
+movl %edx,96(%rdi)
+
+movl %ecx,100(%rdi)
+
+movl %r9d,104(%rdi)
+
+movl %eax,108(%rdi)
+
+movd %xmm10,%rdx
+
+movd %xmm14,%rcx
+
+movd %xmm2,%r9
+
+movd %xmm6,%rax
+
+pshufd $0x39,%xmm10,%xmm10
+
+pshufd $0x39,%xmm14,%xmm14
+
+pshufd $0x39,%xmm2,%xmm2
+
+pshufd $0x39,%xmm6,%xmm6
+
+xorl 160(%rsi),%edx
+
+xorl 164(%rsi),%ecx
+
+xorl 168(%rsi),%r9d
+
+xorl 172(%rsi),%eax
+
+movl %edx,160(%rdi)
+
+movl %ecx,164(%rdi)
+
+movl %r9d,168(%rdi)
+
+movl %eax,172(%rdi)
+
+movd %xmm10,%rdx
+
+movd %xmm14,%rcx
+
+movd %xmm2,%r9
+
+movd %xmm6,%rax
+
+xorl 224(%rsi),%edx
+
+xorl 228(%rsi),%ecx
+
+xorl 232(%rsi),%r9d
+
+xorl 236(%rsi),%eax
+
+movl %edx,224(%rdi)
+
+movl %ecx,228(%rdi)
+
+movl %r9d,232(%rdi)
+
+movl %eax,236(%rdi)
+
+paddd 224(%rsp),%xmm11
+
+paddd 240(%rsp),%xmm15
+
+paddd 192(%rsp),%xmm3
+
+paddd 208(%rsp),%xmm7
+
+movd %xmm11,%rdx
+
+movd %xmm15,%rcx
+
+movd %xmm3,%r9
+
+movd %xmm7,%rax
+
+pshufd $0x39,%xmm11,%xmm11
+
+pshufd $0x39,%xmm15,%xmm15
+
+pshufd $0x39,%xmm3,%xmm3
+
+pshufd $0x39,%xmm7,%xmm7
+
+xorl 48(%rsi),%edx
+
+xorl 52(%rsi),%ecx
+
+xorl 56(%rsi),%r9d
+
+xorl 60(%rsi),%eax
+
+movl %edx,48(%rdi)
+
+movl %ecx,52(%rdi)
+
+movl %r9d,56(%rdi)
+
+movl %eax,60(%rdi)
+
+movd %xmm11,%rdx
+
+movd %xmm15,%rcx
+
+movd %xmm3,%r9
+
+movd %xmm7,%rax
+
+pshufd $0x39,%xmm11,%xmm11
+
+pshufd $0x39,%xmm15,%xmm15
+
+pshufd $0x39,%xmm3,%xmm3
+
+pshufd $0x39,%xmm7,%xmm7
+
+xorl 112(%rsi),%edx
+
+xorl 116(%rsi),%ecx
+
+xorl 120(%rsi),%r9d
+
+xorl 124(%rsi),%eax
+
+movl %edx,112(%rdi)
+
+movl %ecx,116(%rdi)
+
+movl %r9d,120(%rdi)
+
+movl %eax,124(%rdi)
+
+movd %xmm11,%rdx
+
+movd %xmm15,%rcx
+
+movd %xmm3,%r9
+
+movd %xmm7,%rax
+
+pshufd $0x39,%xmm11,%xmm11
+
+pshufd $0x39,%xmm15,%xmm15
+
+pshufd $0x39,%xmm3,%xmm3
+
+pshufd $0x39,%xmm7,%xmm7
+
+xorl 176(%rsi),%edx
+
+xorl 180(%rsi),%ecx
+
+xorl 184(%rsi),%r9d
+
+xorl 188(%rsi),%eax
+
+movl %edx,176(%rdi)
+
+movl %ecx,180(%rdi)
+
+movl %r9d,184(%rdi)
+
+movl %eax,188(%rdi)
+
+movd %xmm11,%rdx
+
+movd %xmm15,%rcx
+
+movd %xmm3,%r9
+
+movd %xmm7,%rax
+
+xorl 240(%rsi),%edx
+
+xorl 244(%rsi),%ecx
+
+xorl 248(%rsi),%r9d
+
+xorl 252(%rsi),%eax
+
+movl %edx,240(%rdi)
+
+movl %ecx,244(%rdi)
+
+movl %r9d,248(%rdi)
+
+movl %eax,252(%rdi)
+
+movq 288(%rsp),%rdx
+
+sub $256,%rdx
+
+add $256,%rsi
+
+add $256,%rdi
+
+cmp $256,%rdx
+
+jae ._bytesatleast256
+
+cmp $0,%rdx
+
+jbe ._done
+
+._bytesbetween1and255:
+
+cmp $64,%rdx
+
+jae ._nocopy
+
+mov %rdi,%r9
+
+leaq 320(%rsp),%rdi
+
+mov %rdx,%rcx
+
+rep movsb
+
+leaq 320(%rsp),%rdi
+
+leaq 320(%rsp),%rsi
+
+._nocopy:
+
+movq %rdx,288(%rsp)
+
+movdqa 0(%r8),%xmm0
+
+movdqa 16(%r8),%xmm1
+
+movdqa 32(%r8),%xmm2
+
+movdqa 48(%r8),%xmm3
+
+mov $12,%rdx # 12 rounds, matching the four-block path (._mainloop1) above
+
+._mainloop2:
+
+paddd %xmm1,%xmm0
+
+pxor %xmm0,%xmm3
+
+#movdqa %xmm3,%xmm4
+#pslld $16,%xmm3
+#psrld $16,%xmm4
+#pxor %xmm4,%xmm3
+pshufb (R16), %xmm3
+
+paddd %xmm3,%xmm2
+
+pxor %xmm2,%xmm1
+
+movdqa %xmm1,%xmm4
+
+pslld $12,%xmm1
+
+psrld $20,%xmm4
+
+pxor %xmm4,%xmm1
+
+paddd %xmm1,%xmm0
+
+pxor %xmm0,%xmm3
+
+#movdqa %xmm3,%xmm4
+#pslld $8,%xmm3
+#psrld $24,%xmm4
+
+pshufd $0x93,%xmm0,%xmm0
+
+#pxor %xmm4,%xmm3
+pshufb (R08), %xmm3
+
+paddd %xmm3,%xmm2
+
+pshufd $0x4e,%xmm3,%xmm3
+
+pxor %xmm2,%xmm1
+
+pshufd $0x39,%xmm2,%xmm2
+
+movdqa %xmm1,%xmm4
+
+pslld $7,%xmm1
+
+psrld $25,%xmm4
+
+pxor %xmm4,%xmm1
+
+sub $2,%rdx
+
+paddd %xmm1,%xmm0
+
+pxor %xmm0,%xmm3
+
+#movdqa %xmm3,%xmm4
+#pslld $16,%xmm3
+#psrld $16,%xmm4
+#pxor %xmm4,%xmm3
+pshufb (R16), %xmm3
+
+paddd %xmm3,%xmm2
+
+pxor %xmm2,%xmm1
+
+movdqa %xmm1,%xmm4
+
+pslld $12,%xmm1
+
+psrld $20,%xmm4
+
+pxor %xmm4,%xmm1
+
+paddd %xmm1,%xmm0
+
+pxor %xmm0,%xmm3
+
+#movdqa %xmm3,%xmm4
+#pslld $8,%xmm3
+#psrld $24,%xmm4
+
+pshufd $0x39,%xmm0,%xmm0
+
+#pxor %xmm4,%xmm3
+pshufb (R08), %xmm3
+
+paddd %xmm3,%xmm2
+
+pshufd $0x4e,%xmm3,%xmm3
+
+pxor %xmm2,%xmm1
+
+pshufd $0x93,%xmm2,%xmm2
+
+movdqa %xmm1,%xmm4
+
+pslld $7,%xmm1
+
+psrld $25,%xmm4
+
+pxor %xmm4,%xmm1
+
+ja ._mainloop2
+
+paddd 0(%r8),%xmm0
+
+paddd 16(%r8),%xmm1
+
+paddd 32(%r8),%xmm2
+
+paddd 48(%r8),%xmm3
+
+pxor 0(%rsi),%xmm0
+
+pxor 16(%rsi),%xmm1
+
+pxor 32(%rsi),%xmm2
+
+pxor 48(%rsi),%xmm3
+
+movdqa %xmm0,0(%rdi)
+
+movdqa %xmm1,16(%rdi)
+
+movdqa %xmm2,32(%rdi)
+
+movdqa %xmm3,48(%rdi)
+
+movq 288(%rsp),%rdx
+
+movl 48(%r8),%ecx
+
+movl 52(%r8),%eax
+
+add $1,%rcx
+
+shl $32,%rax
+
+add %rax,%rcx
+
+mov %rcx,%rax
+
+shr $32,%rax
+
+movl %ecx,48(%r8)
+
+movl %eax,52(%r8)
+
+cmp $64,%rdx
+
+ja ._bytesatleast65
+
+jae ._bytesatleast64
+
+mov %rdi,%rsi
+
+mov %r9,%rdi
+
+mov %rdx,%rcx
+
+rep movsb
+
+._bytesatleast64:
+
+._done:
+
+add %r11,%rsp
+mov %rdi,%rax
+mov %rsi,%rdx
+ret
+
+._bytesatleast65:
+
+sub $64,%rdx
+
+add $64,%rdi
+
+add $64,%rsi
+
+jmp ._bytesbetween1and255
+
+.text
+.p2align 5
+.globl _ECRYPT_init
+.globl ECRYPT_init
+_ECRYPT_init:
+ECRYPT_init:
+mov %rsp,%r11
+and $31,%r11
+add $384,%r11
+sub %r11,%rsp
+
+add %r11,%rsp
+mov %rdi,%rax
+mov %rsi,%rdx
+ret
+
+.text
+.p2align 5
+.globl _ECRYPT_keysetup
+.globl ECRYPT_keysetup
+_ECRYPT_keysetup:
+ECRYPT_keysetup:
+mov %rsp,%r11
+and $31,%r11
+add $384,%r11
+sub %r11,%rsp
+
+mov %rsi,%rsi
+
+mov %rdx,%rdx
+
+mov %rdi,%rdi
+
+movl 0(%rsi),%r8d
+
+movl 4(%rsi),%r9d
+
+movl 8(%rsi),%eax
+
+movl 12(%rsi),%r10d
+
+movl %r8d,16(%rdi)
+
+movl %r9d,20(%rdi)
+
+movl %eax,24(%rdi)
+
+movl %r10d,28(%rdi)
+
+cmp $256,%rdx
+
+jb ._kbits128
+
+._kbits256:
+
+movl 16(%rsi),%edx
+
+movl 20(%rsi),%ecx
+
+movl 24(%rsi),%r8d
+
+movl 28(%rsi),%esi
+
+movl %edx,32(%rdi)
+
+movl %ecx,36(%rdi)
+
+movl %r8d,40(%rdi)
+
+movl %esi,44(%rdi)
+
+mov $1634760805,%rsi
+
+mov $857760878,%rdx
+
+mov $2036477234,%rcx
+
+mov $1797285236,%r8
+
+movl %esi,0(%rdi)
+
+movl %edx,4(%rdi)
+
+movl %ecx,8(%rdi)
+
+movl %r8d,12(%rdi)
+
+jmp ._keysetupdone
+
+._kbits128:
+
+movl 0(%rsi),%edx
+
+movl 4(%rsi),%ecx
+
+movl 8(%rsi),%r8d
+
+movl 12(%rsi),%esi
+
+movl %edx,32(%rdi)
+
+movl %ecx,36(%rdi)
+
+movl %r8d,40(%rdi)
+
+movl %esi,44(%rdi)
+
+mov $1634760805,%rsi
+
+mov $824206446,%rdx
+
+mov $2036477238,%rcx
+
+mov $1797285236,%r8
+
+movl %esi,0(%rdi)
+
+movl %edx,4(%rdi)
+
+movl %ecx,8(%rdi)
+
+movl %r8d,12(%rdi)
+
+._keysetupdone:
+
+add %r11,%rsp
+mov %rdi,%rax
+mov %rsi,%rdx
+ret
+
+.text
+.p2align 5
+.globl _ECRYPT_ivsetup
+.globl ECRYPT_ivsetup
+_ECRYPT_ivsetup:
+ECRYPT_ivsetup:
+mov %rsp,%r11
+and $31,%r11
+add $384,%r11
+sub %r11,%rsp
+
+mov %rsi,%rsi
+
+mov %rdi,%rdi
+
+mov $0,%r8
+
+mov $0,%r9
+
+movl 0(%rsi),%eax
+
+movl 4(%rsi),%esi
+
+movl %r8d,48(%rdi)
+
+movl %r9d,52(%rdi)
+
+movl %eax,56(%rdi)
+
+movl %esi,60(%rdi)
+
+add %r11,%rsp
+mov %rdi,%rax
+mov %rsi,%rdx
+ret
+
+
2  crypto_stream/chacha20/amd64-ssse3/api.h
@@ -0,0 +1,2 @@
+#define crypto_stream_chacha20_e_amd64_ssse3_KEYBYTES 32
+#define crypto_stream_chacha20_e_amd64_ssse3_NONCEBYTES 8
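
For reference, the scalar operation that the vectorized double rounds below implement is the ChaCha quarter-round; its rotation distances of 16, 12, 8 and 7 bits are all visible in the assembly (16 and 8 via the R16/R08 pshufb masks, 12 and 7 via shift/shift/xor). A plain-C rendering for comparison, illustrative only and not part of the package:

    #include <stdio.h>
    #include <stdint.h>

    #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

    /* One ChaCha quarter-round on four 32-bit state words. */
    static void quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
      *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
      *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
      *a += *b; *d ^= *a; *d = ROTL32(*d,  8);
      *c += *d; *b ^= *c; *b = ROTL32(*b,  7);
    }

    int main(void) {
      uint32_t a = 0x11111111, b = 0x01020304, c = 0x9b8d6f43, d = 0x01234567;
      quarterround(&a, &b, &c, &d);
      printf("%08x %08x %08x %08x\n", a, b, c, d);
      return 0;
    }
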
1,533 crypto_stream/chacha20/amd64-ssse3/chacha.s
@@ -0,0 +1,1533 @@
+# Author: Samuel Neves
+# ChaCha stream cipher
+# Derived from the 'amd64-xmm6' implementation by Daniel Bernstein
+# Requires SSSE3 extensions (i.e. Core 2, Core i7, Atom)
+
+.data
+
+.globl R16
+.globl R08
+
+.p2align 6
+
+R16: .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+R08: .byte 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
+
+.text
+.p2align 5
+.globl _ECRYPT_keystream_bytes
+.globl ECRYPT_keystream_bytes
+_ECRYPT_keystream_bytes:
+ECRYPT_keystream_bytes:
+mov %rsp,%r11
+and $31,%r11
+add $384,%r11
+sub %r11,%rsp
+mov %rdi,%r8
+mov %rsi,%rsi
+mov %rsi,%rdi
+mov %rdx,%rdx
+cmp $0,%rdx
+
+jbe ._done
+
+mov $0,%rax
+
+mov %rdx,%rcx
+
+rep stosb
+
+sub %rdx,%rdi
+
+jmp ._start
+
+.text
+.p2align 5
+.globl _ECRYPT_decrypt_bytes
+.globl ECRYPT_decrypt_bytes
+_ECRYPT_decrypt_bytes:
+ECRYPT_decrypt_bytes:
+mov %rsp,%r11
+and $31,%r11
+add $384,%r11
+sub %r11,%rsp
+
+mov %rdi,%r8
+
+mov %rsi,%rsi
+
+mov %rdx,%rdi
+
+mov %rcx,%rdx
+
+cmp $0,%rdx
+
+jbe ._done
+
+jmp ._start
+
+.text
+.p2align 5
+.globl _ECRYPT_encrypt_bytes
+.globl ECRYPT_encrypt_bytes
+_ECRYPT_encrypt_bytes:
+ECRYPT_encrypt_bytes:
+mov %rsp,%r11
+and $31,%r11
+add $384,%r11
+sub %r11,%rsp
+
+mov %rdi,%r8
+
+mov %rsi,%rsi
+
+mov %rdx,%rdi
+
+mov %rcx,%rdx
+
+cmp $0,%rdx
+
+jbe ._done
+
+._start:
+
+cmp $256,%rdx
+
+jb ._bytesbetween1and255
+
+movdqa 0(%r8),%xmm0
+
+pshufd $0x55,%xmm0,%xmm1
+
+pshufd $0xaa,%xmm0,%xmm2
+
+pshufd $0xff,%xmm0,%xmm3
+
+pshufd $0x00,%xmm0,%xmm0
+
+movdqa %xmm0,0(%rsp)
+
+movdqa %xmm1,16(%rsp)
+
+movdqa %xmm2,32(%rsp)
+
+movdqa %xmm3,48(%rsp)
+
+movdqa 16(%r8),%xmm0
+
+pshufd $0x55,%xmm0,%xmm1
+
+pshufd $0xaa,%xmm0,%xmm2
+
+pshufd $0xff,%xmm0,%xmm3
+
+pshufd $0x00,%xmm0,%xmm0
+
+movdqa %xmm0,64(%rsp)
+
+movdqa %xmm1,80(%rsp)
+
+movdqa %xmm2,96(%rsp)
+
+movdqa %xmm3,112(%rsp)
+
+movdqa 32(%r8),%xmm0
+
+pshufd $0x55,%xmm0,%xmm1
+
+pshufd $0xaa,%xmm0,%xmm2
+
+pshufd $0xff,%xmm0,%xmm3
+
+pshufd $0x00,%xmm0,%xmm0
+
+movdqa %xmm0,128(%rsp)
+
+movdqa %xmm1,144(%rsp)
+
+movdqa %xmm2,160(%rsp)
+
+movdqa %xmm3,176(%rsp)
+
+movdqa 48(%r8),%xmm0
+
+pshufd $0xaa,%xmm0,%xmm1
+
+pshufd $0xff,%xmm0,%xmm0
+
+movdqa %xmm1,192(%rsp)
+
+movdqa %xmm0,208(%rsp)
+
+._bytesatleast256:
+
+movq %rdx,288(%rsp)
+
+movq 48(%r8),%rdx
+
+lea 1(%rdx),%rcx
+
+lea 2(%rdx),%r9
+
+lea 3(%rdx),%rax
+
+lea 4(%rdx),%r10
+
+movl %edx,224(%rsp)
+
+movl %ecx,4+224(%rsp)
+
+movl %r9d,8+224(%rsp)
+
+movl %eax,12+224(%rsp)
+
+shr $32,%rdx
+
+shr $32,%rcx
+
+shr $32,%r9
+
+shr $32,%rax
+
+movl %edx,240(%rsp)
+
+movl %ecx,4+240(%rsp)
+
+movl %r9d,8+240(%rsp)
+
+movl %eax,12+240(%rsp)
+
+movq %r10,48(%r8)
+
+mov $20,%rdx
+
+movdqa 32(%rsp),%xmm0
+
+movdqa 96(%rsp),%xmm1
+
+movdqa 160(%rsp),%xmm2
+
+movdqa 192(%rsp),%xmm3
+
+movdqa 48(%rsp),%xmm4
+
+movdqa 112(%rsp),%xmm5
+
+movdqa 176(%rsp),%xmm6
+
+movdqa 208(%rsp),%xmm7
+
+movdqa 0(%rsp),%xmm8
+
+movdqa 64(%rsp),%xmm9
+
+movdqa 128(%rsp),%xmm10
+
+movdqa 224(%rsp),%xmm11
+
+movdqa 16(%rsp),%xmm12
+
+movdqa 80(%rsp),%xmm13
+
+movdqa 144(%rsp),%xmm14
+
+movdqa 240(%rsp),%xmm15
+
+movdqa %xmm6,256(%rsp)
+
+._mainloop1:
+
+movdqa R16, %xmm6 # load
+
+paddd %xmm9,%xmm8
+
+pxor %xmm8,%xmm11
+
+paddd %xmm13,%xmm12
+
+pxor %xmm12,%xmm15
+
+#movdqa %xmm11,%xmm6
+#psrld $16,%xmm11
+#pslld $16,%xmm6
+#pxor %xmm6,%xmm11
+pshufb %xmm6, %xmm11
+
+#movdqa %xmm15,%xmm6
+#psrld $16,%xmm15
+#pslld $16,%xmm6
+#pxor %xmm6,%xmm15
+pshufb %xmm6, %xmm15
+
+paddd %xmm11,%xmm10
+
+pxor %xmm10,%xmm9
+
+paddd %xmm15,%xmm14
+
+pxor %xmm14,%xmm13
+
+movdqa %xmm9,%xmm6
+
+psrld $20,%xmm9
+
+pslld $12,%xmm6
+
+pxor %xmm6,%xmm9
+
+movdqa %xmm13,%xmm6
+
+psrld $20,%xmm13
+
+pslld $12,%xmm6
+
+pxor %xmm6,%xmm13
+
+movdqa R08, %xmm6 # load
+
+paddd %xmm9,%xmm8
+
+pxor %xmm8,%xmm11
+
+#movdqa %xmm11,%xmm6
+#psrld $24,%xmm11
+#pslld $8,%xmm6
+#pxor %xmm6,%xmm11
+pshufb %xmm6, %xmm11
+
+paddd %xmm13,%xmm12
+
+pxor %xmm12,%xmm15
+
+#movdqa %xmm15,%xmm6
+#psrld $24,%xmm15
+#pslld $8,%xmm6
+#pxor %xmm6,%xmm15
+pshufb %xmm6, %xmm15
+
+paddd %xmm11,%xmm10
+
+pxor %xmm10,%xmm9
+
+movdqa %xmm9,%xmm6
+
+psrld $25,%xmm9
+
+pslld $7,%xmm6
+
+pxor %xmm6,%xmm9
+
+paddd %xmm15,%xmm14
+
+pxor %xmm14,%xmm13
+
+movdqa %xmm13,%xmm6
+
+psrld $25,%xmm13
+
+pslld $7,%xmm6
+
+pxor %xmm6,%xmm13
+
+movdqa %xmm14,272(%rsp)
+
+movdqa 256(%rsp),%xmm6
+
+movdqa R16, %xmm14 # load
+
+paddd %xmm1,%xmm0
+
+pxor %xmm0,%xmm3
+
+#movdqa %xmm3,%xmm14
+#psrld $16,%xmm3
+#pslld $16,%xmm14
+#pxor %xmm14,%xmm3
+pshufb %xmm14, %xmm3
+
+paddd %xmm5,%xmm4
+
+pxor %xmm4,%xmm7
+
+#movdqa %xmm7,%xmm14
+#psrld $16,%xmm7
+#pslld $16,%xmm14
+#pxor %xmm14,%xmm7
+pshufb %xmm14, %xmm7
+
+paddd %xmm3,%xmm2
+
+pxor %xmm2,%xmm1
+
+movdqa %xmm1,%xmm14
+
+psrld $20,%xmm1
+
+pslld $12,%xmm14
+
+pxor %xmm14,%xmm1
+
+paddd %xmm7,%xmm6
+
+pxor %xmm6,%xmm5
+
+movdqa %xmm5,%xmm14
+
+psrld $20,%xmm5
+
+pslld $12,%xmm14
+
+pxor %xmm14,%xmm5
+
+movdqa R08, %xmm14 # load
+
+paddd %xmm1,%xmm0
+
+pxor %xmm0,%xmm3
+
+#movdqa %xmm3,%xmm14
+#psrld $24,%xmm3
+#pslld $8,%xmm14
+#pxor %xmm14,%xmm3
+pshufb %xmm14, %xmm3
+
+paddd %xmm5,%xmm4
+
+pxor %xmm4,%xmm7
+
+#movdqa %xmm7,%xmm14
+#psrld $24,%xmm7
+#pslld $8,%xmm14
+#pxor %xmm14,%xmm7
+pshufb %xmm14, %xmm7
+
+paddd %xmm3,%xmm2
+
+pxor %xmm2,%xmm1
+
+movdqa %xmm1,%xmm14
+
+psrld $25,%xmm1
+
+pslld $7,%xmm14
+
+pxor %xmm14,%xmm1
+
+paddd %xmm7,%xmm6
+
+pxor %xmm6,%xmm5
+
+movdqa %xmm5,%xmm14
+
+psrld $25,%xmm5
+
+pslld $7,%xmm14
+
+pxor %xmm14,%xmm5
+
+movdqa R16, %xmm14 # load
+
+paddd %xmm13,%xmm8
+
+pxor %xmm8,%xmm7
+
+#movdqa %xmm7,%xmm14
+#psrld $16,%xmm7
+#pslld $16,%xmm14
+#pxor %xmm14,%xmm7
+pshufb %xmm14, %xmm7
+
+paddd %xmm1,%xmm12
+
+pxor %xmm12,%xmm11
+
+#movdqa %xmm11,%xmm14
+#psrld $16,%xmm11
+#pslld $16,%xmm14
+#pxor %xmm14,%xmm11
+pshufb %xmm14, %xmm11
+
+paddd %xmm7,%xmm2
+
+pxor %xmm2,%xmm13
+
+movdqa %xmm13,%xmm14
+
+psrld $20,%xmm13
+
+pslld $12,%xmm14
+
+pxor %xmm14,%xmm13
+
+paddd %xmm11,%xmm6
+
+pxor %xmm6,%xmm1
+
+movdqa %xmm1,%xmm14
+
+psrld $20,%xmm1
+
+pslld $12,%xmm14
+
+pxor %xmm14,%xmm1
+
+movdqa R08, %xmm14 # load
+
+paddd %xmm13,%xmm8
+
+pxor %xmm8,%xmm7
+
+#movdqa %xmm7,%xmm14
+#psrld $24,%xmm7
+#pslld $8,%xmm14
+#pxor %xmm14,%xmm7
+pshufb %xmm14, %xmm7
+
+paddd %xmm1,%xmm12
+
+pxor %xmm12,%xmm11
+
+#movdqa %xmm11,%xmm14
+#psrld $24,%xmm11