diff --git a/changelog.dd b/changelog.dd index 4c589c74343..74ad61cd4dd 100644 --- a/changelog.dd +++ b/changelog.dd @@ -2,7 +2,7 @@ $(VERSION 061, ddd mm, 2012, =================================================, $(WHATSNEW $(LI std.digest: Added new package for digests. This replaces std.md5 and the internal crc32 implementation.) - $(LI std.digest.sha: Added SHA1 digest implementation.) + $(LI std.digest.sha: Added SHA1 digest implementation, including a fast SSSE3 version.) $(LI std.digest.ripemd: Added RIPEMD-160 digest implementation.) $(LI std.uuid: Support SHA1 UUIDs.) $(LI std.uuid: md5UUID and sha1UUID can now be used in pure code.) diff --git a/posix.mak b/posix.mak index ea2064a7969..92d09d35b3d 100644 --- a/posix.mak +++ b/posix.mak @@ -192,6 +192,7 @@ EXTRA_DOCUMENTABLES += $(addprefix etc/c/,curl sqlite3 zlib) $(addprefix \ std/c/, fenv locale math process stdarg stddef stdio stdlib string \ time wcharh) EXTRA_MODULES += $(EXTRA_DOCUMENTABLES) $(addprefix \ + std/internal/digest/, sha_SSSE3 ) $(addprefix \ std/internal/math/, biguintcore biguintnoasm biguintx86 \ gammafunction errorfunction) $(addprefix std/internal/, \ processinit uni uni_tab) diff --git a/std/digest/sha.d b/std/digest/sha.d index 045f258c1a1..62152cd782b 100644 --- a/std/digest/sha.d +++ b/std/digest/sha.d @@ -125,9 +125,25 @@ unittest hash = sha.finish(); } +version(OSX) +{ + // Do not use. 
+} +else version(D_InlineAsm_X86) +{ + private version = USE_SSSE3; +} +else version(D_InlineAsm_X86_64) +{ + private version = USE_SSSE3; +} + import std.ascii : hexDigits; import std.exception : assumeUnique; import core.bitop : bswap; +version(USE_SSSE3) import core.cpuid : hasSSSE3Support = ssse3; +version(USE_SSSE3) import std.internal.digest.sha_SSSE3 : transformSSSE3; + version(unittest) { @@ -212,7 +228,19 @@ private nothrow pure uint rotateLeft(uint x, uint n) */ struct SHA1 { - alias transformX86 transform; + version(USE_SSSE3) + { + private __gshared immutable nothrow pure void function(uint[5]* state, const(ubyte[64])* block) transform; + + shared static this() + { + transform = hasSSSE3Support() ? &transformSSSE3 : &transformX86; + } + } + else + { + alias transformX86 transform; + } private: uint state[5] = /* state (ABCDE) */ diff --git a/std/internal/digest/sha_SSSE3.d b/std/internal/digest/sha_SSSE3.d new file mode 100644 index 00000000000..2363a88e981 --- /dev/null +++ b/std/internal/digest/sha_SSSE3.d @@ -0,0 +1,709 @@ +// Written in the D programming language. + +/** + * Computes SHA1 digests of arbitrary data, using an optimized algorithm with SSSE3 instructions. + * + * Authors: + * The general idea is described by Dean Gaudet. + * Another important observation is published by Max Locktyukhin. + * (Both implementations are public domain.) + * Translation to X86 and D by Kai Nacke + * + * References: + * $(LINK2 http://arctic.org/~dean/crypto/sha1.html) + * $(LINK2 http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/, Fast implementation of SHA1) + */ +module std.internal.digest.sha_SSSE3; + +import std.conv; + +version(OSX) +{ + // Do not use. +} +else version(D_InlineAsm_X86) +{ + private version = USE_SSSE3; + private version = _32Bit; +} +else version(D_InlineAsm_X86_64) +{ + private version = USE_SSSE3; + private version = _64Bit; +} + +/* + * The idea is quite simple. 
The SHA-1 specification defines the following message schedule: + * W[i] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1 + * + * To employ SSE, simply write down the formula four times: + * W[i ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1 + * W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1 + * W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1 + * W[i+3] = (W[i ] ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1 + * The last formula requires value W[i] computed with the first formula. + * Because the rotate operation distributes over the xor operation, we can replace the + * last formula with + * W[i+3] = ( 0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1 + * and then calculate + * W[i+3] ^= W[i] rol 1 + * which unfortunately requires many additional operations. This approach was described by + * Dean Gaudet. + * + * Max Locktyukhin observed that + * W[i] = W[i-A] ^ W[i-B] + * is equivalent to + * W[i] = W[i-2*A] ^ W[i-2*B] + * (if the indices are still in valid ranges). Using this observation, the formula is + * translated to + * W[i] = (W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32]) rol 2 + * Again, to employ SSE the formula is used four times. + * + * Later on, the expression W[i] + K(i) is used. (K(i) is the constant used in round i.) + * Once the 4 W[i] are calculated, we can also add the four K(i) values with one SSE instruction. + * + * The 32bit and 64bit implementations are almost identical. The main difference is that there + * are only 8 XMM registers in 32bit mode. Therefore, space on the stack is needed to save + * computed values. + */ + +version(USE_SSSE3) +{ + /* + * The general idea is to use the XMM registers as a sliding window over + * the message schedule. XMM0 to XMM7 are used to store the last 64 bytes of + * the message schedule. In 64 bit mode this is fine because of the number of + * registers. The main difference of the 32 bit code is that a part of the + * calculated message schedule is saved on the stack because 2 temporary + * registers are needed. 
+     */
+
+    /* How many message words are computed ahead of the round that consumes them. */
+    private immutable int PRECALC_AHEAD = 16;
+
+    /* Scratch registers that receive intermediate results of the round functions. */
+    private immutable string T1 = "EAX";
+    private immutable string T2 = "EBX";
+
+    /* General purpose registers holding the five SHA-1 working variables. */
+    private immutable string A = "ECX";
+    private immutable string B = "ESI";
+    private immutable string C = "EDI";
+    private immutable string D = "EBP";
+    private immutable string E = "EDX";
+
+    /* Register assignment that differs between the 32 bit and the 64 bit code. */
+    version(_32Bit)
+    {
+        private immutable string SP = "ESP";
+        private immutable string BUFFER_PTR = "EAX";
+        private immutable string STATE_PTR = "EBX";
+
+        // Holds the pshufb control mask (only used in round 0-15)
+        private immutable string X_SHUFFLECTL = "XMM6";
+
+        // Holds the round constant (only used in round 0-15)
+        private immutable string X_CONSTANT = "XMM7";
+    }
+    version(_64Bit)
+    {
+        private immutable string SP = "RSP";
+        private immutable string BUFFER_PTR = "R9";
+        private immutable string STATE_PTR = "R8";
+
+        // Scratch vector registers (XMM10 and XMM11 are also used temporarily)
+        private immutable string W_TMP = "XMM8";
+        private immutable string W_TMP2 = "XMM9";
+
+        // Holds the pshufb control mask (only used in round 0-15)
+        private immutable string X_SHUFFLECTL = "XMM12";
+
+        // Holds the current round constant
+        private immutable string X_CONSTANT = "XMM13";
+    }
+
+    /* Byte shuffle mask used with pshufb to reverse the bytes of each 32 bit word. */
+    align(16) private immutable uint[4] bswap_shufb_ctl =
+    [
+        0x0001_0203, 0x0405_0607, 0x0809_0a0b, 0x0c0d_0e0f
+    ];
+
+    /* The round constants, each replicated four times to fill one XMM register. */
+    align(16) private immutable uint[16] constants =
+    [
+        // Constants for round 0-19
+        0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999,
+        // Constants for round 20-39
+        0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1,
+        // Constants for round 40-59
+        0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc,
+        // Constants for round 60-79
+        0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
+    ];
+
+    /**
+     * Minimal decimal formatter for values below 100.
+     *
+     * Params:
+     *   i = value to format; must be less than 100 (checked with assert)
+     *
+     * Returns: the decimal digits of i without leading zero.
+     */
+    private nothrow pure string to_string(uint i)
+    {
+        assert(i < 100);
+        immutable char ones = cast(char)('0' + i % 10);
+        if (i < 10)
+            return "" ~ ones;
+        immutable char tens = cast(char)('0' + (i / 10) % 10);
+        return "" ~ tens ~ ones;
+    }
+
+    /**
+     * Builds the memory operand addressing the round constant of round i.
+     * Every block of 20 rounds shares one 16 byte entry of the constants table.
+     */
+    private nothrow pure string constant(uint i)
+    {
+        immutable uint group = i / 20;
+        return "[constants + 16*" ~ to_string(group) ~ "]";
+    }
+
+    /**
+     * Maps message word index i to the number of the XMM register holding it:
+     * four words per register, eight registers in rotation.
+     */
+    private nothrow pure uint regno(uint i)
+    {
+        immutable uint vectorIndex = i / 4;
+        return vectorIndex & 7;
+    }
+
+    /** Builds the stack operand of the slot storing vector W[i..i+4] (8 slots). */
+    private nothrow pure string WiV(uint i)
+    {
+        immutable string slot = to_string(regno(i));
+        return "[" ~ SP ~ " + WI_PTR + " ~ slot ~ "*16]";
+    }
+
+    /** Builds the stack operand of the slot storing vector (W + K)[i..i+4] (4 slots). */
+    private nothrow pure string WiKiV(uint i)
+    {
+        immutable string slot = to_string((i / 4) & 3);
+        return "[" ~ SP ~ " + WI_PLUS_KI_PTR + " ~ slot ~ "*16]";
+    }
+
+    /** Builds the stack operand of the single 32 bit value W[i] + K[i] (16 slots). */
+    private nothrow pure string WiKi(uint i)
+    {
+        immutable string slot = to_string(i & 15);
+        return "[" ~ SP ~ " + WI_PLUS_KI_PTR + 4*" ~ slot ~ "]";
+    }
+
+    /**
+     * Selects one of two instruction sequences depending on the memory model.
+     *
+     * Params:
+     *   insn32 = sequence returned when compiling the 32 bit code
+     *   insn64 = sequence returned when compiling the 64 bit code
+     */
+    private nothrow pure string[] swt3264(string[] insn32, string[] insn64)
+    {
+        version(_32Bit)
+            return insn32;
+        else version(_64Bit)
+            return insn64;
+    }
+
+    /**
+     * Flattens the instruction sequence and wraps it in an asm block.
+     */
+    private nothrow pure string wrap(string[] insn)
+    {
+        // join(insn, "; \n") was noted as not usable in CTFE, hence the manual loop.
+        string s = "asm {";
+        foreach (t; insn) s ~= (t ~ "; \n");
+        s ~= "}";
+        return s;
+    }
+
+    /**
+     * Weaves the 2 instruction sequences together.
+     *
+     * The result alternates between one element of seq2 and up to dist
+     * elements of seq1 (seq2 first) until both sequences are exhausted. The
+     * relative order inside each sequence is preserved.
+     *
+     * Params:
+     *   seq1 = sequence consumed dist elements at a time
+     *   seq2 = sequence consumed one element at a time
+     *   dist = number of seq1 elements emitted per step; must be positive
+     *          unless seq1 is empty
+     *
+     * Returns: the interleaved sequence.
+     */
+    private nothrow pure string[] weave(string[] seq1, string[] seq2, uint dist = 1)
+    {
+        // A zero step would never advance through seq1 and loop forever.
+        assert(dist > 0 || seq1.length == 0, "dist must be positive");
+
+        string[] res = [];
+        // size_t indices avoid mixing signed int with the unsigned array lengths.
+        size_t i1 = 0, i2 = 0;
+        while (i1 < seq1.length || i2 < seq2.length)
+        {
+            if (i2 < seq2.length)
+            {
+                res ~= seq2[i2 .. i2 + 1];
+                i2 += 1;
+            }
+            if (i1 < seq1.length)
+            {
+                // Clamp the slice end by hand: this module does not import
+                // std.algorithm, so a fully qualified std.algorithm.min would
+                // only resolve through an import leaked by another module.
+                immutable size_t stop =
+                    (seq1.length - i1 < dist) ? seq1.length : i1 + dist;
+                res ~= seq1[i1 .. stop];
+                i1 = stop;
+            }
+        }
+        return res;
+    }
+
+    /**
+     * Generates instructions to load state from memory into registers.
+     *
+     * Params:
+     *   base = register holding the address of the five 32 bit state words
+     *   a = register receiving word 0
+     *   b = register receiving word 1
+     *   c = register receiving word 2
+     *   d = register receiving word 3
+     *   e = register receiving word 4
+     *
+     * Returns: five mov instructions, one per state word.
+     */
+    private nothrow pure string[] loadstate(string base, string a, string b, string c, string d, string e)
+    {
+        return ["mov "~a~",["~base~" + 0*4]",
+                "mov "~b~",["~base~" + 1*4]",
+                "mov "~c~",["~base~" + 2*4]",
+                "mov "~d~",["~base~" + 3*4]",
+                "mov "~e~",["~base~" + 4*4]" ];
+    }
+
+    /**
+     * Generates instructions to update state from registers, saving result in memory.
+     */
+    private nothrow pure string[] savestate(string base, string a, string b, string c, string d, string e)
+    {
+        // One "add [base + n*4],reg" per state word, n = 0 .. 4.
+        string[] insn;
+        foreach (idx, reg; [a, b, c, d, e])
+            insn ~= "add [" ~ base ~ " + " ~ "01234"[idx] ~ "*4]," ~ reg;
+        return insn;
+    }
+
+    /**
+     * Emits code for Ch(x, y, z) = z ^ (x & (y ^ z)).
+     * The result is left in T1.
+     */
+    private nothrow pure string[] Ch(string x, string y, string z)
+    {
+        string[] insn;
+        insn ~= "mov "~T1~","~y;
+        insn ~= "xor "~T1~","~z;
+        insn ~= "and "~T1~","~x;
+        insn ~= "xor "~T1~","~z;
+        return insn;
+    }
+
+    /**
+     * Emits code for Parity(x, y, z) = x ^ y ^ z.
+     * The result is left in T1.
+     */
+    private nothrow pure string[] Parity(string x, string y, string z)
+    {
+        string[] insn;
+        insn ~= "mov "~T1~","~z;
+        insn ~= "xor "~T1~","~y;
+        insn ~= "xor "~T1~","~x;
+        return insn;
+    }
+
+    /**
+     * Emits code for the majority function, computed as
+     * Maj(x, y, z) = (x & y) | (z & (x | y)).
+     * The result is left in T1; T2 is overwritten.
+     */
+    private nothrow pure string[] Maj(string x, string y, string z)
+    {
+        string[] insn;
+        insn ~= "mov "~T1~","~y;
+        insn ~= "mov "~T2~","~x;
+        insn ~= "or "~T1~","~x;
+        insn ~= "and "~T2~","~y;
+        insn ~= "and "~T1~","~z;
+        insn ~= "or "~T1~","~T2;
+        return insn;
+    }
+
+    /**
+     * Selects the logical function of round i and emits its code.
+     * The emitted code returns its result in T1 and may destroy T2.
+     *
+     * Params:
+     *   i = round number, 0 .. 79; any other value is a coding error
+     *   b = first operand
+     *   c = second operand
+     *   d = third operand
+     */
+    private nothrow pure string[] F(int i, string b, string c, string d)
+    {
+        if (i < 0 || i > 79)
+            assert(false, "Coding error");
+        if (i <= 19)
+            return Ch(b, c, d);
+        if (i >= 40 && i <= 59)
+            return Maj(b, c, d);
+        // Rounds 20-39 and 60-79 both use Parity.
+        return Parity(b, c, d);
+    }
+
+    /**
+     * Emits the register setup needed before the precalculation of word i.
+     *
+     * For i == 0 the shuffle mask and the first round constant are loaded.
+     * The 64 bit code additionally reloads the round constant register at
+     * every multiple of 20. All other values of i need no setup.
+     */
+    private nothrow pure string[] xsetup(int i)
+    {
+        if (i == 0)
+        {
+            // Both memory models use the same two loads here.
+            string[] setup = ["movdqa "~X_SHUFFLECTL~",[bswap_shufb_ctl]"];
+            setup ~= "movdqa "~X_CONSTANT~","~constant(i);
+            return setup;
+        }
+        version(_64Bit)
+        {
+            if (i%20 == 0)
+                return ["movdqa "~X_CONSTANT~","~constant(i)];
+        }
+        return [];
+    }
+
+    /**
+     * Loads the message words and performs the little to big endian conversion.
+     * Requires that the shuffle control word and the round constant are loaded
+     * into the required XMM registers. The BUFFER_PTR register must point to the
+     * buffer.
+     *
+     * The work for one 16 byte vector is spread over four consecutive values
+     * of i, selected by (i & 3):
+     *   0: unaligned load of the vector from the buffer into W
+     *   1: byte shuffle of W (the 32 bit code additionally stores W at WiV(i))
+     *   2: W_TMP = W + round constant
+     *   3: store W_TMP at WiKiV(i)
+     *
+     * Params:
+     *   i = index of the message word; precalc() calls this for 0 <= i < 16
+     *
+     * Returns: the instructions of step i.
+     */
+    private nothrow pure string[] precalc_00_15(int i)
+    {
+        int regno = regno(i);
+
+        string W = "XMM" ~ to_string(regno);
+        version(_32Bit)
+        {
+            string W_TMP = "XMM" ~ to_string(regno+2);
+        }
+        version(_64Bit)
+        {
+            string W_TMP = "XMM" ~ to_string(regno+8);
+        }
+
+        if ((i & 3) == 0)
+        {
+            return ["movdqu "~W~",["~BUFFER_PTR~" + "~to_string(regno)~"*16]"];
+        }
+        else if ((i & 3) == 1)
+        {
+            return ["pshufb "~W~","~X_SHUFFLECTL] ~
+                   swt3264(["movdqa "~WiV(i)~","~W], []);
+        }
+        else if ((i & 3) == 2)
+        {
+            return ["movdqa "~W_TMP~","~W,
+                    "paddd "~W_TMP~","~X_CONSTANT,
+                   ];
+        }
+        else
+        {
+            return ["movdqa "~WiKiV(i)~","~W_TMP,
+                   ];
+        }
+    }
+
+    /**
+     * Done on 4 consecutive W[i] values in a single XMM register
+     * W[i  ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
+     * W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
+     * W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
+     * W[i+3] = (   0   ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
+     *
+     * This additional calculation unfortunately requires many additional operations
+     * W[i+3] ^= W[i] rol 1
+     *
+     * Once we have 4 W[i] values in XMM we can also add four K values with one instruction
+     * W[i:i+3] += {K,K,K,K}
+     *
+     * As in the other precalc functions, the instructions are split into four
+     * steps selected by (i & 3); the last step stores W + K at WiKiV(i) and, in
+     * the 32 bit code, W at WiV(i). The 32 bit code picks W_TMP and W_TMP2
+     * relative to the current register number; the 64 bit code uses the
+     * module level W_TMP and W_TMP2.
+     *
+     * Params:
+     *   i = index of the message word; precalc() calls this for 16 <= i < 32
+     *
+     * Returns: the instructions of step i.
+     */
+    private nothrow pure string[] precalc_16_31(int i)
+    {
+        int regno = regno(i);
+
+        string W = "XMM" ~ to_string(regno);
+        string W_minus_4 = "XMM" ~ to_string((regno-1)&7);
+        string W_minus_8 = "XMM" ~ to_string((regno-2)&7);
+        string W_minus_12 = "XMM" ~ to_string((regno-3)&7);
+        string W_minus_16 = "XMM" ~ to_string((regno-4)&7);
+        version(_32Bit)
+        {
+            string W_TMP = "XMM" ~ to_string((regno+1)&7);
+            string W_TMP2 = "XMM" ~ to_string((regno+2)&7);
+        }
+
+        if ((i & 3) == 0)
+        {
+            return ["movdqa "~W~","~W_minus_12,
+                    "palignr "~W~","~W_minus_16~",8", // W[i] = W[i-14]
+                    "pxor "~W~","~W_minus_16, // W[i] ^= W[i-16]
+                    "pxor "~W~","~W_minus_8, // W[i] ^= W[i-8]
+                    "movdqa "~W_TMP~","~W_minus_4,
+                   ];
+        }
+        else if ((i & 3) == 1)
+        {
+            return ["psrldq "~W_TMP~",4", // W[i-3]
+                    "pxor "~W~","~W_TMP, // W[i] ^= W[i-3]
+                    "movdqa "~W_TMP~","~W,
+                    "psrld "~W~",31",
+                    "pslld "~W_TMP~",1",
+                   ];
+        }
+        else if ((i & 3) == 2)
+        {
+            return ["por "~W~","~W_TMP,
+                    "movdqa "~W_TMP~","~W,
+                    "pslldq "~W_TMP~",12",
+                    "movdqa "~W_TMP2~","~W_TMP,
+                    "pslld "~W_TMP~",1",
+                   ];
+        }
+        else
+        {
+            return ["psrld "~W_TMP2~",31",
+                    "por "~W_TMP~","~W_TMP2,
+                    "pxor "~W~","~W_TMP,
+                    "movdqa "~W_TMP~","~W ] ~
+                   swt3264(["movdqa "~WiV(i)~","~W,
+                            "paddd "~W_TMP~","~constant(i) ],
+                           ["paddd "~W_TMP~","~X_CONSTANT ]) ~
+                   ["movdqa "~WiKiV(i)~","~W_TMP];
+        }
+    }
+
+    /**
+     * Performs the main calculation as described above, i.e. the rewritten schedule
+     * W[i] = (W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32]) rol 2
+     * on four message words at once, split into four steps selected by (i & 3).
+     *
+     * In the 32 bit code W[i-28] and W[i-32] are read from the stack slots
+     * below WI_PTR; in the 64 bit code they are read from XMM registers.
+     * The last step stores W + K at WiKiV(i). For i >= 76 the new W vector is
+     * no longer copied back, only W + K is stored.
+     *
+     * Params:
+     *   i = index of the message word; precalc() calls this for 32 <= i < 80
+     *
+     * Returns: the instructions of step i.
+     */
+    private nothrow pure string[] precalc_32_79(int i)
+    {
+        int regno = regno(i);
+
+        string W = "XMM" ~ to_string(regno);
+        string W_minus_4 = "XMM" ~ to_string((regno-1)&7);
+        string W_minus_8 = "XMM" ~ to_string((regno-2)&7);
+        string W_minus_16 = "XMM" ~ to_string((regno-4)&7);
+        version(_32Bit)
+        {
+            string W_minus_28 = "[ESP + WI_PTR + "~ to_string((regno-7)&7)~"*16]";
+            string W_minus_32 = "[ESP + WI_PTR + "~ to_string((regno-8)&7)~"*16]";
+            string W_TMP = "XMM" ~ to_string((regno+1)&7);
+            string W_TMP2 = "XMM" ~ to_string((regno+2)&7);
+        }
+        version(_64Bit)
+        {
+            string W_minus_28 = "XMM" ~ to_string((regno-7)&7);
+            string W_minus_32 = "XMM" ~ to_string((regno-8)&7);
+        }
+
+        if ((i & 3) == 0)
+        {
+            return swt3264(["movdqa "~W~","~W_minus_32], []) ~
+                   ["movdqa "~W_TMP~","~W_minus_4,
+                    "pxor "~W~","~W_minus_28, // W is W_minus_32 before xor
+                    "palignr "~W_TMP~","~W_minus_8~",8",
+                   ];
+        }
+        else if ((i & 3) == 1)
+        {
+            return ["pxor "~W~","~W_minus_16,
+                    "pxor "~W~","~W_TMP,
+                    "movdqa "~W_TMP~","~W,
+                   ];
+        }
+        else if ((i & 3) == 2)
+        {
+            return ["psrld "~W~",30", // shift right by 30 and left by 2, then or: rol 2
+                    "pslld "~W_TMP~",2",
+                    "por "~W_TMP~","~W,
+                   ];
+        }
+        else
+        {
+            if (i < 76)
+                return ["movdqa "~W~","~W_TMP] ~
+                       swt3264(["movdqa "~WiV(i)~","~W,
+                                "paddd "~W_TMP~","~constant(i)],
+                               ["paddd "~W_TMP~","~X_CONSTANT]) ~
+                       ["movdqa "~WiKiV(i)~","~W_TMP];
+            else
+                return swt3264(["paddd "~W_TMP~","~constant(i)],
+                               ["paddd "~W_TMP~","~X_CONSTANT]) ~
+                       ["movdqa "~WiKiV(i)~","~W_TMP];
+        }
+    }
+
+    /**
+     * Choose right precalc method.
+     *
+     * Dispatches on the range of i: 0 .. 15, 16 .. 31 and 32 .. 79. For any
+     * other i an empty instruction sequence is returned.
+     */
+    private nothrow pure string[] precalc(int i)
+    {
+        if (i >= 0 && i < 16) return precalc_00_15(i);
+        if (i >= 16 && i < 32) return precalc_16_31(i);
+        if (i >= 32 && i < 80) return precalc_32_79(i);
+        return [];
+    }
+
+    /**
+     * Return code for round i and i+1.
+     * Performs the following rotation:
+     * in=>out: A=>D, B=>E, C=>A, D=>B, E=>C
+     *
+     * The scalar instructions of both rounds are woven (two at a time) with
+     * the precalculation steps PRECALC_AHEAD + i and PRECALC_AHEAD + i + 1.
+     * The sequence starts with the setup returned by xsetup(PRECALC_AHEAD + i).
+     *
+     * Params:
+     *   i = number of the first of the two rounds
+     *   a = register holding working variable A on entry
+     *   b = register holding working variable B on entry
+     *   c = register holding working variable C on entry
+     *   d = register holding working variable D on entry
+     *   e = register holding working variable E on entry
+     */
+    private nothrow pure string[] round(int i, string a, string b, string c, string d, string e)
+    {
+        return xsetup(PRECALC_AHEAD + i) ~
+               weave(F(i, b, c, d) ~ // Returns result in T1; may destroy T2
+                     ["add "~e~","~WiKi(i),
+                      "ror "~b~",2",
+                      "mov "~T2~","~a,
+                      "add "~d~","~WiKi(i+1),
+                      "rol "~T2~",5",
+                      "add "~e~","~T1 ],
+                     precalc(PRECALC_AHEAD + i), 2) ~
+               weave(
+                     ["add "~T2~","~e, // T2 = (A <<< 5) + F(B, C, D) + Wi + Ki + E
+                      "mov "~e~","~T2,
+                      "rol "~T2~",5",
+                      "add "~d~","~T2 ] ~
+                     F(i+1, a, b, c) ~ // Returns result in T1; may destroy T2
+                     ["add "~d~","~T1,
+                      "ror "~a~",2"],
+                     precalc(PRECALC_AHEAD + i+1), 2);
+    }
+
+    // Offset into stack (see below)
+    version(_32Bit)
+    {
+        private enum { STATE_OFS = 4, WI_PLUS_KI_PTR = 8, WI_PTR = 72 };
+    }
+    version(_64Bit)
+    {
+        private enum { WI_PLUS_KI_PTR = 0 };
+    }
+
+    /**
+     * The prologue sequence.
+     *
+     * Saves registers, moves the parameters into STATE_PTR / BUFFER_PTR and
+     * reserves the stack area described in the layout comments below.
+     */
+    private nothrow pure string[] prologue()
+    {
+        version(_32Bit)
+        {
+            /*
+             * Parameters:
+             *   EAX contains pointer to input buffer
+             *
+             * Stack layout as follows:
+             * +----------------+
+             * | ptr to state   |
+             * +----------------+
+             * | return address |
+             * +----------------+
+             * | EBP            |
+             * +----------------+
+             * | ESI            |
+             * +----------------+
+             * | EDI            |
+             * +----------------+
+             * | EBX            |
+             * +----------------+
+             * | Space for      |
+             * | Wi             | <- ESP+72
+             * +----------------+
+             * | Space for      |
+             * | Wi+Ki          | <- ESP+8
+             * +----------------+ <- 16byte aligned
+             * | ptr to state   | <- ESP+4
+             * +----------------+
+             * | old ESP        | <- ESP
+             * +----------------+
+             */
+            static assert(BUFFER_PTR == "EAX");
+            static assert(STATE_PTR == "EBX");
+            return [// Save registers according to calling convention
+                    "push EBP",
+                    "push ESI",
+                    "push EDI",
+                    "push EBX",
+                    // Load parameters
+                    "mov EBX, [ESP + 5*4]", //pointer to state
+                    // Align stack
+                    "mov EBP, ESP",
+                    "sub ESP, 4*16 + 8*16",
+                    "and ESP, 0xffff_fff0",
+                    "push EBX",
+                    "push EBP",
+                   ];
+        }
+        version(_64Bit)
+        {
+            /*
+             * Parameters:
+             *   RSI contains pointer to state
+             *   RDI contains pointer to input buffer
+             *
+             * Stack layout as follows:
+             * +----------------+
+             * | return address |
+             * +----------------+
+             * | RBP            |
+             * +----------------+
+             * | RBX            |
+             * +----------------+
+             * | Unused         |
+             * +----------------+
+             * | Space for      |
+             * | Wi+Ki          | <- RSP
+             * +----------------+ <- 16byte aligned
+             */
+            return [// Save registers according to calling convention
+                    "push RBP",
+                    "push RBX",
+                    // Save parameters
+                    "mov "~STATE_PTR~", RSI", //pointer to state
+                    "mov "~BUFFER_PTR~", RDI", //pointer to buffer
+                    // Align stack
+                    "sub RSP, 4*16+8",
+                   ];
+        }
+    }
+
+    /**
+     * The epilogue sequence. Just pop the saved registers from stack and return to caller.
+     */
+    private nothrow pure string[] epilogue()
+    {
+        version(_32Bit)
+        {
+            return ["pop ESP", // top of stack holds the ESP value saved by the prologue
+                    "pop EBX",
+                    "pop EDI",
+                    "pop ESI",
+                    "pop EBP",
+                    "ret 4", // releases 4 bytes of stack arguments (the state pointer, per the prologue layout)
+                   ];
+        }
+        version(_64Bit)
+        {
+            return ["add RSP,4*16+8", // undoes the stack reservation of the prologue
+                    "pop RBX",
+                    "pop RBP",
+                    "ret 0",
+                   ];
+        }
+    }
+
+    /**
+     * Processes one 64 byte block and adds the result to the SHA-1 state.
+     *
+     * The body is a naked function assembled at compile time from string
+     * mixins: prologue, precalculation of the first 16 message words, 40 calls
+     * of round() (each emitting two rounds, 80 rounds in total), savestate and
+     * epilogue. The register arguments passed to round() rotate from call to
+     * call, as documented on round().
+     *
+     * Params:
+     *   state  = pointer to the five 32 bit state words; they are loaded
+     *            before the first round and updated with add instructions
+     *            after the last round
+     *   buffer = pointer to the 64 byte message block
+     *
+     * Because the function is naked, the parameters are not accessed by
+     * name; the prologue expects them at the locations described in its
+     * comments.
+     */
+    public nothrow pure void transformSSSE3(uint[5]* state, const(ubyte[64])* buffer)
+    {
+        mixin(wrap(["naked;"] ~ prologue()));
+        // Precalc first 4*16=64 bytes
+        mixin(wrap(xsetup(0)));
+        mixin(wrap(weave(precalc(0)~precalc(1)~precalc(2)~precalc(3),
+                         precalc(4)~precalc(5)~precalc(6)~precalc(7))));
+        mixin(wrap(weave(loadstate(STATE_PTR, A, B, C, D, E),
+                         weave(precalc(8)~precalc(9)~precalc(10)~precalc(11),
+                               precalc(12)~precalc(13)~precalc(14)~precalc(15)))));
+        // Round 1
+        mixin(wrap(round( 0, A, B, C, D, E)));
+        mixin(wrap(round( 2, D, E, A, B, C)));
+        mixin(wrap(round( 4, B, C, D, E, A)));
+        mixin(wrap(round( 6, E, A, B, C, D)));
+        mixin(wrap(round( 8, C, D, E, A, B)));
+        mixin(wrap(round(10, A, B, C, D, E)));
+        mixin(wrap(round(12, D, E, A, B, C)));
+        mixin(wrap(round(14, B, C, D, E, A)));
+        mixin(wrap(round(16, E, A, B, C, D)));
+        mixin(wrap(round(18, C, D, E, A, B)));
+        // Round 2
+        mixin(wrap(round(20, A, B, C, D, E)));
+        mixin(wrap(round(22, D, E, A, B, C)));
+        mixin(wrap(round(24, B, C, D, E, A)));
+        mixin(wrap(round(26, E, A, B, C, D)));
+        mixin(wrap(round(28, C, D, E, A, B)));
+        mixin(wrap(round(30, A, B, C, D, E)));
+        mixin(wrap(round(32, D, E, A, B, C)));
+        mixin(wrap(round(34, B, C, D, E, A)));
+        mixin(wrap(round(36, E, A, B, C, D)));
+        mixin(wrap(round(38, C, D, E, A, B)));
+        // Round 3
+        mixin(wrap(round(40, A, B, C, D, E)));
+        mixin(wrap(round(42, D, E, A, B, C)));
+        mixin(wrap(round(44, B, C, D, E, A)));
+        mixin(wrap(round(46, E, A, B, C, D)));
+        mixin(wrap(round(48, C, D, E, A, B)));
+        mixin(wrap(round(50, A, B, C, D, E)));
+        mixin(wrap(round(52, D, E, A, B, C)));
+        mixin(wrap(round(54, B, C, D, E, A)));
+        mixin(wrap(round(56, E, A, B, C, D)));
+        mixin(wrap(round(58, C, D, E, A, B)));
+        // Round 4
+        mixin(wrap(round(60, A, B, C, D, E)));
+        mixin(wrap(round(62, D, E, A, B, C)));
+        mixin(wrap(round(64, B, C, D, E, A)));
+        mixin(wrap(round(66, E, A, B, C, D)));
+        mixin(wrap(round(68, C, D, E, A, B)));
+        mixin(wrap(round(70, A, B, C, D, E)));
+        mixin(wrap(round(72, D, E, A, B, C)));
+        mixin(wrap(round(74, B, C, D, E, A)));
+        mixin(wrap(round(76, E, A, B, C, D)));
+        mixin(wrap(round(78, C, D, E, A, B)));
+        version(_32Bit)
+        {
+            // Load pointer to state
+            mixin(wrap(["mov "~STATE_PTR~",[ESP + STATE_OFS]"]));
+        }
+        mixin(wrap(savestate(STATE_PTR, A, B, C, D, E)));
+        mixin(wrap(epilogue()));
+    }
+}
+
diff --git a/win32.mak b/win32.mak index 8bfd10427bf..ee96e69bcd2 100644 --- a/win32.mak +++ b/win32.mak @@ -191,6 +191,8 @@ SRC_STD_C_FREEBSD= std\c\freebsd\socket.d SRC_STD_INTERNAL= std\internal\processinit.d std\internal\uni.d std\internal\uni_tab.d +SRC_STD_INTERNAL_DIGEST= std\internal\digest\sha_SSSE3.d + SRC_STD_INTERNAL_MATH= std\internal\math\biguintcore.d \ std\internal\math\biguintnoasm.d std\internal\math\biguintx86.d \ std\internal\math\gammafunction.d std\internal\math\errorfunction.d @@ -207,6 +209,7 @@ SRC_TO_COMPILE_NOT_STD= crc32.d \ $(SRC_STD_WIN) \ $(SRC_STD_C_WIN) \ $(SRC_STD_INTERNAL) \ + $(SRC_STD_INTERNAL_DIGEST) \ $(SRC_STD_INTERNAL_MATH) \ $(SRC_STD_INTERNAL_WINDOWS) \ $(SRC_ETC) \ @@ -719,7 +722,8 @@ zip : win32.mak win64.mak posix.mak $(STDDOC) $(SRC) \ $(SRC_STD) $(SRC_STD_C) $(SRC_STD_WIN) \ $(SRC_STD_C_WIN) $(SRC_STD_C_LINUX) $(SRC_STD_C_OSX) $(SRC_STD_C_FREEBSD) \ $(SRC_ETC) $(SRC_ETC_C) $(SRC_ZLIB) $(SRC_STD_NET) $(SRC_STD_DIGEST) \ - $(SRC_STD_INTERNAL) $(SRC_STD_INTERNAL_MATH) $(SRC_STD_INTERNAL_WINDOWS) + $(SRC_STD_INTERNAL) $(SRC_STD_INTERNAL_DIGEST) $(SRC_STD_INTERNAL_MATH) \ + $(SRC_STD_INTERNAL_WINDOWS) del phobos.zip zip32 -u phobos win32.mak win64.mak posix.mak $(STDDOC) zip32 -u phobos $(SRC) @@ -731,6 +735,7 @@ zip : win32.mak win64.mak posix.mak $(STDDOC) $(SRC) \ zip32 -u 
phobos $(SRC_STD_C_OSX) zip32 -u phobos $(SRC_STD_C_FREEBSD) zip32 -u phobos $(SRC_STD_INTERNAL) + zip32 -u phobos $(SRC_STD_INTERNAL_DIGEST) zip32 -u phobos $(SRC_STD_INTERNAL_MATH) zip32 -u phobos $(SRC_STD_INTERNAL_WINDOWS) zip32 -u phobos $(SRC_ETC) $(SRC_ETC_C) diff --git a/win64.mak b/win64.mak index 41e2837802f..f0bd3e801f9 100644 --- a/win64.mak +++ b/win64.mak @@ -213,6 +213,8 @@ SRC_STD_C_FREEBSD= std\c\freebsd\socket.d SRC_STD_INTERNAL= std\internal\processinit.d std\internal\uni.d std\internal\uni_tab.d +SRC_STD_INTERNAL_DIGEST= std\internal\digest\sha_SSSE3.d + SRC_STD_INTERNAL_MATH= std\internal\math\biguintcore.d \ std\internal\math\biguintnoasm.d std\internal\math\biguintx86.d \ std\internal\math\gammafunction.d std\internal\math\errorfunction.d @@ -229,6 +231,7 @@ SRC_TO_COMPILE_NOT_STD= crc32.d \ $(SRC_STD_WIN) \ $(SRC_STD_C_WIN) \ $(SRC_STD_INTERNAL) \ + $(SRC_STD_INTERNAL_DIGEST) \ $(SRC_STD_INTERNAL_MATH) \ $(SRC_STD_INTERNAL_WINDOWS) \ $(SRC_ETC) \ @@ -772,7 +775,8 @@ zip : win32.mak win64.mak posix.mak $(STDDOC) $(SRC) \ $(SRC_STD) $(SRC_STD_C) $(SRC_STD_WIN) \ $(SRC_STD_C_WIN) $(SRC_STD_C_LINUX) $(SRC_STD_C_OSX) $(SRC_STD_C_FREEBSD) \ $(SRC_ETC) $(SRC_ETC_C) $(SRC_ZLIB) $(SRC_STD_NET) $(SRC_STD_DIGEST)\ - $(SRC_STD_INTERNAL) $(SRC_STD_INTERNAL_MATH) $(SRC_STD_INTERNAL_WINDOWS) + $(SRC_STD_INTERNAL) $(SRC_STD_INTERNAL_DIGEST) $(SRC_STD_INTERNAL_MATH) \ + $(SRC_STD_INTERNAL_WINDOWS) del phobos.zip zip32 -u phobos win32.mak win64.mak posix.mak $(STDDOC) zip32 -u phobos $(SRC) @@ -784,6 +788,7 @@ zip : win32.mak win64.mak posix.mak $(STDDOC) $(SRC) \ zip32 -u phobos $(SRC_STD_C_OSX) zip32 -u phobos $(SRC_STD_C_FREEBSD) zip32 -u phobos $(SRC_STD_INTERNAL) + zip32 -u phobos $(SRC_STD_INTERNAL_DIGEST) zip32 -u phobos $(SRC_STD_INTERNAL_MATH) zip32 -u phobos $(SRC_STD_INTERNAL_WINDOWS) zip32 -u phobos $(SRC_ETC) $(SRC_ETC_C)