diff --git a/changelog.dd b/changelog.dd
index 4c589c74343..74ad61cd4dd 100644
--- a/changelog.dd
+++ b/changelog.dd
@@ -2,7 +2,7 @@ $(VERSION 061, ddd mm, 2012, =================================================,
     $(WHATSNEW
         $(LI std.digest: Added new package for digests. This replaces std.md5 and the internal crc32
              implementation.)
-        $(LI std.digest.sha: Added SHA1 digest implementation.)
+        $(LI std.digest.sha: Added SHA1 digest implementation, including a fast SSSE3 version.)
         $(LI std.digest.ripemd: Added RIPEMD-160 digest implementation.)
         $(LI std.uuid: Support SHA1 UUIDs.)
         $(LI std.uuid: md5UUID and sha1UUID can now be used in pure code.)
diff --git a/posix.mak b/posix.mak
index ea2064a7969..92d09d35b3d 100644
--- a/posix.mak
+++ b/posix.mak
@@ -192,6 +192,7 @@ EXTRA_DOCUMENTABLES += $(addprefix etc/c/,curl sqlite3 zlib) $(addprefix	\
 std/c/, fenv locale math process stdarg stddef stdio stdlib string	\
 time wcharh)
 EXTRA_MODULES += $(EXTRA_DOCUMENTABLES) $(addprefix			\
+	std/internal/digest/, sha_SSSE3 ) $(addprefix \
 	std/internal/math/, biguintcore biguintnoasm biguintx86	\
 	gammafunction errorfunction) $(addprefix std/internal/, \
 	processinit uni uni_tab)
diff --git a/std/digest/sha.d b/std/digest/sha.d
index 045f258c1a1..62152cd782b 100644
--- a/std/digest/sha.d
+++ b/std/digest/sha.d
@@ -125,9 +125,25 @@ unittest
     hash = sha.finish();
 }
 
+// The SSSE3 code is not used on OSX. It is also disabled on Win64: the 64 bit
+// assembly takes its arguments from RSI/RDI, which Win64 does not use for them.
+version(OSX) {}
+else version(Win64) {}
+else version(D_InlineAsm_X86)
+{
+    private version = USE_SSSE3;
+}
+else version(D_InlineAsm_X86_64)
+{
+    private version = USE_SSSE3;
+}
+
 import std.ascii : hexDigits;
 import std.exception : assumeUnique;
 import core.bitop : bswap;
+version(USE_SSSE3) import core.cpuid : hasSSSE3Support = ssse3;
+version(USE_SSSE3) import std.internal.digest.sha_SSSE3 : transformSSSE3;
+
 
 version(unittest)
 {
@@ -212,7 +228,19 @@ private nothrow pure uint rotateLeft(uint x, uint n)
  */
 struct SHA1
 {
-    alias transformX86 transform;
+    version(USE_SSSE3)
+    {
+        private __gshared immutable nothrow pure void function(uint[5]* state, const(ubyte[64])* block) transform;
+
+        shared static this()
+        {
+            transform = hasSSSE3Support() ? &transformSSSE3 : &transformX86;
+        }
+    }
+    else
+    {
+        alias transformX86 transform;
+    }
 
     private:
         uint state[5] =                                   /* state (ABCDE) */
diff --git a/std/internal/digest/sha_SSSE3.d b/std/internal/digest/sha_SSSE3.d
new file mode 100644
index 00000000000..2363a88e981
--- /dev/null
+++ b/std/internal/digest/sha_SSSE3.d
@@ -0,0 +1,709 @@
+// Written in the D programming language.
+
+/**
+ * Computes SHA1 digests of arbitrary data, using an optimized algorithm with SSSE3 instructions.
+ *
+ * Authors:
+ * The general idea is described by Dean Gaudet.
+ * Another important observation is published by Max Locktyukhin.
+ * (Both implementations are public domain.)
+ * Translation to X86 and D by Kai Nacke <kai@redstar.de>
+ *
+ * References:
+ *      $(LINK2 http://arctic.org/~dean/crypto/sha1.html)
+ *      $(LINK2 http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/, Fast implementation of SHA1)
+ */
+module std.internal.digest.sha_SSSE3;
+
+import std.conv;
+
+version(OSX)
+{
+    // Do not use.
+}
+else version(D_InlineAsm_X86)
+{
+    private version = USE_SSSE3;
+    private version = _32Bit;
+}
+else version(D_InlineAsm_X86_64)
+{
+    private version = USE_SSSE3;
+    private version = _64Bit;
+}
+
+/*
+ * The idea is quite simple. The SHA-1 specification defines the following message schedule:
+ *     W[i] = (W[i-3] ^ W[i-8]  ^ W[i-14] ^ W[i-16]) rol 1
+ *
+ * To employ SSE, simply write down the formula four times:
+ *     W[i  ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
+ *     W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
+ *     W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
+ *     W[i+3] = (W[i  ] ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
+ * The last formula requires value W[i] computed with the first formula.
+ * Because the rotate operation distributes over the xor operation, we can replace the
+ * last formula with
+ *     W[i+3] = (     0 ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
+ * and then calculate
+ *     W[i+3] ^= W[i] rol 1
+ * which unfortunately requires many additional operations. This approach was described by
+ * Dean Gaudet.
+ *
+ * Max Locktyukhin observed that
+ *     W[i] = W[i-A] ^ W[i-B]
+ * is equivalent to
+ *     W[i] = W[i-2*A] ^ W[i-2*B]
+ * (if the indices are still in valid ranges). Using this observation, the formula is
+ * translated to
+ *     W[i] = (W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32]) rol 2
+ * Again, to employ SSE the formula is used four times.
+ *
+ * Later on, the expression W[i] + K(i) is used. (K(i) is the constant used in round i.)
+ * Once the 4 W[i] are calculated, we can also add the four K(i) values with one SSE instruction.
+ *
+ * The 32bit and 64bit implementations are almost identical. The main difference is that there
+ * are only 8 XMM registers in 32bit mode. Therefore, space on the stack is needed to save
+ * computed values.
+ */
+
+version(USE_SSSE3)
+{
+    /*
+     * The general idea is to use the XMM registers as a sliding window over
+     * the message schedule. XMM0 to XMM7 are used to store the last 64 bytes of
+     * the message schedule. In 64 bit mode this is fine because of the number of
+     * registers. The main difference of the 32 bit code is that a part of the
+     * calculated message schedule is saved on the stack because 2 temporary
+     * registers are needed.
+     */
+
+    /* Number of message words we are precalculating. */
+    private immutable int PRECALC_AHEAD = 16;
+
+    /* T1 and T2 are used for intermediate results of computations. */
+    private immutable string T1 = "EAX";
+    private immutable string T2 = "EBX";
+
+    /* The registers used for the SHA-1 variables. */
+    private immutable string A = "ECX";
+    private immutable string B = "ESI";
+    private immutable string C = "EDI";
+    private immutable string D = "EBP";
+    private immutable string E = "EDX";
+
+    /* Register assignments that differ between the 32 bit and the 64 bit code. */
+    version(_32Bit)
+    {
+        private immutable string SP = "ESP";
+        private immutable string BUFFER_PTR = "EAX";
+        private immutable string STATE_PTR = "EBX";
+
+        // Control byte for shuffle instruction (only used in round 0-15)
+        private immutable string X_SHUFFLECTL = "XMM6";
+
+        // Round constant (only used in round 0-15)
+        private immutable string X_CONSTANT = "XMM7";
+    }
+    version(_64Bit)
+    {
+        private immutable string SP = "RSP";
+        private immutable string BUFFER_PTR = "R9";
+        private immutable string STATE_PTR = "R8";
+
+        // Registers for temporary results (XMM10 and XMM11 are also used temporarily)
+        private immutable string W_TMP = "XMM8";
+        private immutable string W_TMP2 = "XMM9";
+
+        // Control byte for shuffle instruction (only used in round 0-15)
+        private immutable string X_SHUFFLECTL = "XMM12";
+
+        // Round constant
+        private immutable string X_CONSTANT = "XMM13";
+    }
+
+    /* The control words for the byte shuffle instruction. */
+    align(16) private immutable uint[4] bswap_shufb_ctl =
+    [
+        0x0001_0203, 0x0405_0607, 0x0809_0a0b, 0x0c0d_0e0f
+    ];
+
+    /* The round constants. */
+    align(16) private immutable uint[16] constants =
+    [
+        // Constants for round 0-19
+        0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999,
+        // Constants for round 20-39
+        0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1,
+        // Constants for round 40-59
+        0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc,
+        // Constants for round 60-79
+        0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
+    ];
+
+    /** Formats a number below 100 as a decimal string without a leading zero. */
+    private nothrow pure string to_string(uint i)
+    {
+        assert(i < 100);
+        immutable char ones = cast(char)('0' + i % 10);
+        if (i < 10)
+            return "" ~ ones;
+        return "" ~ cast(char)('0' + (i / 10) % 10) ~ ones;
+    }
+
+    /** Returns the reference to constant used in round i. */
+    private nothrow pure string constant(uint i)
+    {
+        return "[constants + 16*"~to_string(i/20)~"]";
+    }
+
+    /** Returns the XMM register number used in round i */
+    private nothrow pure uint regno(uint i)
+    {
+        return (i/4)&7;
+    }
+
+    /** Returns reference to storage of vector W[i..i+4]. */
+    private nothrow pure string WiV(uint i)
+    {
+        return "["~SP~" + WI_PTR + "~to_string((i/4)&7)~"*16]";
+    }
+
+    /** Returns reference to storage of vector (W + K)[i..i+4]. */
+    private nothrow pure string WiKiV(uint i)
+    {
+        return "["~SP~" + WI_PLUS_KI_PTR + "~to_string((i/4)&3)~"*16]";
+    }
+
+    /** Returns reference to storage of value W[i] + K[i]. */
+    private nothrow pure string WiKi(uint i)
+    {
+        return "["~SP~" + WI_PLUS_KI_PTR + 4*"~to_string(i&15)~"]";
+    }
+
+    /**
+     * Chooses the instruction sequence based on the 32bit or 64bit model.
+     */
+    private nothrow pure string[] swt3264(string[] insn32, string[] insn64)
+    {
+        version(_32Bit)
+        {
+            return insn32;
+        }
+        version(_64Bit)
+        {
+            return insn64;
+        }
+    }
+
+    /**
+     * Joins the instructions into the source text of a single asm block;
+     * every instruction is terminated by "; " and a newline.
+     */
+    private nothrow pure string wrap(string[] insn)
+    {
+        // Plain loop on purpose: join() was not usable in CTFE when this was written.
+        string code = "asm {";
+        foreach (line; insn)
+            code ~= line ~ "; \n";
+        return code ~ "}";
+    }
+
+    /**
+     * Weaves two instruction sequences: emits one entry of seq2, then up to
+     * dist entries of seq1, repeating until both are exhausted (dist must be > 0).
+     */
+    private nothrow pure string[] weave(string[] seq1, string[] seq2, uint dist = 1)
+    {
+        assert(dist > 0, "dist must be positive");
+        string[] res;
+        size_t i1 = 0, i2 = 0;
+        while (i1 < seq1.length || i2 < seq2.length)
+        {
+            if (i2 < seq2.length)
+                res ~= seq2[i2++];
+            if (i1 < seq1.length)
+            {
+                immutable end = (seq1.length - i1 < dist) ? seq1.length : i1 + dist;
+                res ~= seq1[i1 .. end];
+                i1 = end;
+            }
+        }
+        return res;
+    }
+
+    /**
+     * Generates instructions to load state from memory into registers.
+     */
+    private nothrow pure string[] loadstate(string base, string a, string b, string c, string d, string e)
+    {
+        return ["mov "~a~",["~base~" + 0*4]",
+                "mov "~b~",["~base~" + 1*4]",
+                "mov "~c~",["~base~" + 2*4]",
+                "mov "~d~",["~base~" + 3*4]",
+                "mov "~e~",["~base~" + 4*4]" ];
+    }
+
+    /**
+     * Generates instructions to update state from registers, saving result in memory.
+     */
+    private nothrow pure string[] savestate(string base, string a, string b, string c, string d, string e)
+    {
+        return ["add ["~base~" + 0*4],"~a,
+                "add ["~base~" + 1*4],"~b,
+                "add ["~base~" + 2*4],"~c,
+                "add ["~base~" + 3*4],"~d,
+                "add ["~base~" + 4*4],"~e ];
+    }
+
+    /** Calculates Ch(x, y, z) = z ^ (x & (y ^ z)) */
+    private nothrow pure string[] Ch(string x, string y, string z)
+    {
+        return ["mov "~T1~","~y,
+                "xor "~T1~","~z,
+                "and "~T1~","~x,
+                "xor "~T1~","~z ];
+    }
+
+    /** Calculates Parity(x, y, z) = x ^ y ^ z */
+    private nothrow pure string[] Parity(string x, string y, string z)
+    {
+        return ["mov "~T1~","~z,
+                "xor "~T1~","~y,
+                "xor "~T1~","~x ];
+    }
+
+    /** Calculates Maj(x, y, z) = (x & y) | (z & (x ^ y)) */
+    private nothrow pure string[] Maj(string x, string y, string z)
+    {
+        return ["mov "~T1~","~y,
+                "mov "~T2~","~x,
+                "or  "~T1~","~x,
+                "and "~T2~","~y,
+                "and "~T1~","~z,
+                "or  "~T1~","~T2 ];
+    }
+
+    /** Returns function for round i. Function returns result in T1 and may destroy T2. */
+    private nothrow pure string[] F(int i, string b, string c, string d)
+    {
+        string[] insn;
+        if (i >=  0 && i <= 19) insn = Ch(b, c, d);
+        else if (i >= 20 && i <= 39) insn = Parity(b, c, d);
+        else if (i >= 40 && i <= 59) insn = Maj(b, c, d);
+        else if (i >= 60 && i <= 79) insn = Parity(b, c, d);
+        else assert(false, "Coding error");
+        return insn;
+    }
+
+    /**
+     * Returns the setup instructions for precalc step i. Step 0 loads the shuffle
+     * mask and the first round constant. In 64 bit mode the constant register is
+     * reloaded at each multiple of 20; steps >= 80 have no constant to load.
+     */
+    private nothrow pure string[] xsetup(int i)
+    {
+        if (i == 0)
+            return ["movdqa "~X_SHUFFLECTL~",[bswap_shufb_ctl]",
+                    "movdqa "~X_CONSTANT~","~constant(i)];
+        version(_64Bit)
+        {
+            // i can reach 80 (PRECALC_AHEAD + 64); constant(80) would address
+            // 16 bytes past the end of the constants table, so skip the load.
+            if (i%20 == 0 && i < 80)
+                return ["movdqa "~X_CONSTANT~","~constant(i)];
+        }
+        return [];
+    }
+
+    /**
+     * Loads the message words and performs the little to big endian conversion.
+     * Requires that the shuffle control word and the round constant is loaded
+     * into required XMM register. The BUFFER_PTR register must point to the
+     * buffer.
+     */
+    private nothrow pure string[] precalc_00_15(int i)
+    {
+        int regno = regno(i);
+
+        string W = "XMM" ~ to_string(regno);
+        version(_32Bit)
+        {
+            string W_TMP = "XMM" ~ to_string(regno+2);
+        }
+        version(_64Bit)
+        {
+            string W_TMP = "XMM" ~ to_string(regno+8);
+        }
+
+        if ((i & 3) == 0)
+        {
+            return ["movdqu "~W~",["~BUFFER_PTR~" + "~to_string(regno)~"*16]"];
+        }
+        else if ((i & 3) == 1)
+        {
+            return ["pshufb "~W~","~X_SHUFFLECTL] ~
+                    swt3264(["movdqa "~WiV(i)~","~W], []);
+        }
+        else if ((i & 3) == 2)
+        {
+            return ["movdqa "~W_TMP~","~W,
+                    "paddd "~W_TMP~","~X_CONSTANT,
+                   ];
+        }
+        else
+        {
+            return ["movdqa "~WiKiV(i)~","~W_TMP,
+                   ];
+        }
+    }
+
+    /**
+     * Done on 4 consecutive W[i] values in a single XMM register
+     *  W[i  ] = (W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]) rol 1
+     *  W[i+1] = (W[i-2] ^ W[i-7] ^ W[i-13] ^ W[i-15]) rol 1
+     *  W[i+2] = (W[i-1] ^ W[i-6] ^ W[i-12] ^ W[i-14]) rol 1
+     *  W[i+3] = (   0   ^ W[i-5] ^ W[i-11] ^ W[i-13]) rol 1
+     *
+     * This additional calculation unfortunately requires many additional operations
+     *  W[i+3] ^= W[i] rol 1
+     *
+     * Once we have 4 W[i] values in XMM we can also add four K values with one instruction
+     *   W[i:i+3] += {K,K,K,K}
+     */
+    private nothrow pure string[] precalc_16_31(int i)
+    {
+        int regno = regno(i);
+
+        string W = "XMM" ~ to_string(regno);
+        string W_minus_4 = "XMM" ~ to_string((regno-1)&7);
+        string W_minus_8 = "XMM" ~ to_string((regno-2)&7);
+        string W_minus_12 = "XMM" ~ to_string((regno-3)&7);
+        string W_minus_16 = "XMM" ~ to_string((regno-4)&7);
+        version(_32Bit)
+        {
+            string W_TMP = "XMM" ~ to_string((regno+1)&7);
+            string W_TMP2 = "XMM" ~ to_string((regno+2)&7);
+        }
+
+        if ((i & 3) == 0)
+        {
+            return ["movdqa "~W~","~W_minus_12,
+                    "palignr "~W~","~W_minus_16~",8",   // W[i] = W[i-14]
+                    "pxor "~W~","~W_minus_16,           // W[i] ^= W[i-16]
+                    "pxor "~W~","~W_minus_8,            // W[i] ^= W[i-8]
+                    "movdqa "~W_TMP~","~W_minus_4,
+            ];
+        }
+        else if ((i & 3) == 1)
+        {
+            return ["psrldq "~W_TMP~",4",               // W[i-3]
+                    "pxor "~W~","~W_TMP,                // W[i] ^= W[i-3]
+                    "movdqa "~W_TMP~","~W,
+                    "psrld "~W~",31",
+                    "pslld "~W_TMP~",1",
+            ];
+        }
+        else if ((i & 3) == 2)
+        {
+            return ["por "~W~","~W_TMP,
+                    "movdqa "~W_TMP~","~W,
+                    "pslldq "~W_TMP~",12",
+                    "movdqa "~W_TMP2~","~W_TMP,
+                    "pslld "~W_TMP~",1",
+            ];
+        }
+        else
+        {
+            return ["psrld "~W_TMP2~",31",
+                    "por "~W_TMP~","~W_TMP2,
+                    "pxor "~W~","~W_TMP,
+                    "movdqa "~W_TMP~","~W ] ~
+                   swt3264(["movdqa "~WiV(i)~","~W,
+                            "paddd "~W_TMP~","~constant(i) ],
+                           ["paddd "~W_TMP~","~X_CONSTANT ]) ~
+                   ["movdqa "~WiKiV(i)~","~W_TMP];
+        }
+    }
+
+    /** Performs the main calculation as described above. */
+    private nothrow pure string[] precalc_32_79(int i)
+    {
+        int regno = regno(i);
+
+        string W = "XMM" ~ to_string(regno);
+        string W_minus_4 = "XMM" ~ to_string((regno-1)&7);
+        string W_minus_8 = "XMM" ~ to_string((regno-2)&7);
+        string W_minus_16 = "XMM" ~ to_string((regno-4)&7);
+        version(_32Bit)
+        {
+            string W_minus_28 = "[ESP + WI_PTR + "~ to_string((regno-7)&7)~"*16]";
+            string W_minus_32 = "[ESP + WI_PTR + "~ to_string((regno-8)&7)~"*16]";
+            string W_TMP = "XMM" ~ to_string((regno+1)&7);
+            string W_TMP2 = "XMM" ~ to_string((regno+2)&7);
+        }
+        version(_64Bit)
+        {
+            string W_minus_28 = "XMM" ~ to_string((regno-7)&7);
+            string W_minus_32 = "XMM" ~ to_string((regno-8)&7);
+        }
+
+        if ((i & 3) == 0)
+        {
+            return swt3264(["movdqa "~W~","~W_minus_32], []) ~
+                   ["movdqa "~W_TMP~","~W_minus_4,
+                    "pxor "~W~","~W_minus_28,         // W is W_minus_32 before xor
+                    "palignr "~W_TMP~","~W_minus_8~",8",
+            ];
+        }
+        else if ((i & 3) == 1)
+        {
+            return ["pxor "~W~","~W_minus_16,
+                    "pxor "~W~","~W_TMP,
+                    "movdqa "~W_TMP~","~W,
+            ];
+        }
+        else if ((i & 3) == 2)
+        {
+            return ["psrld "~W~",30",
+                    "pslld "~W_TMP~",2",
+                    "por "~W_TMP~","~W,
+            ];
+        }
+        else
+        {
+            if (i < 76)
+                return ["movdqa "~W~","~W_TMP] ~
+                       swt3264(["movdqa "~WiV(i)~","~W,
+                                "paddd "~W_TMP~","~constant(i)],
+                               ["paddd "~W_TMP~","~X_CONSTANT]) ~
+                       ["movdqa "~WiKiV(i)~","~W_TMP];
+            else
+                return swt3264(["paddd "~W_TMP~","~constant(i)],
+                               ["paddd "~W_TMP~","~X_CONSTANT]) ~
+                       ["movdqa "~WiKiV(i)~","~W_TMP];
+        }
+    }
+
+    /** Choose right precalc method. */
+    private nothrow pure string[] precalc(int i)
+    {
+        if (i >= 0 && i < 16) return precalc_00_15(i);
+        if (i >= 16 && i < 32) return precalc_16_31(i);
+        if (i >= 32 && i < 80) return precalc_32_79(i);
+        return [];
+    }
+
+    /**
+     * Return code for round i and i+1.
+     * Performs the following rotation:
+     * in=>out: A=>D, B=>E, C=>A, D=>B, E=>C
+     */
+    private nothrow pure string[] round(int i, string a, string b, string c, string d, string e)
+    {
+        return xsetup(PRECALC_AHEAD + i) ~
+               weave(F(i, b, c, d) ~ // Returns result in T1; may destroy T2
+               ["add "~e~","~WiKi(i),
+                "ror "~b~",2",
+                "mov "~T2~","~a,
+                "add "~d~","~WiKi(i+1),
+                "rol "~T2~",5",
+                "add "~e~","~T1 ],
+                precalc(PRECALC_AHEAD + i), 2) ~
+               weave(
+               ["add "~T2~","~e,  // T2 = (A <<< 5) + F(B, C, D) + Wi + Ki + E
+                "mov "~e~","~T2,
+                "rol "~T2~",5",
+                "add "~d~","~T2 ] ~
+               F(i+1, a, b, c) ~ // Returns result in T1; may destroy T2
+               ["add "~d~","~T1,
+                "ror "~a~",2"],
+                precalc(PRECALC_AHEAD + i+1), 2);
+    }
+
+    // Offset into stack (see below)
+    version(_32Bit)
+    {
+        private enum { STATE_OFS = 4, WI_PLUS_KI_PTR = 8, WI_PTR = 72 };
+    }
+    version(_64Bit)
+    {
+        private enum { WI_PLUS_KI_PTR = 0 };
+    }
+
+    /** The prologue sequence. */
+    private nothrow pure string[] prologue()
+    {
+        version(_32Bit)
+        {
+            /*
+             * Parameters:
+             *   EAX contains pointer to input buffer
+             *
+             * Stack layout as follows:
+             * +----------------+
+             * | ptr to state   |
+             * +----------------+
+             * | return address |
+             * +----------------+
+             * | EBP            |
+             * +----------------+
+             * | ESI            |
+             * +----------------+
+             * | EDI            |
+             * +----------------+
+             * | EBX            |
+             * +----------------+
+             * | Space for      |
+             * | Wi             | <- ESP+72
+             * +----------------+
+             * | Space for      |
+             * | Wi+Ki          | <- ESP+8
+             * +----------------+ <- 16byte aligned
+             * | ptr to state   | <- ESP+4
+             * +----------------+
+             * | old ESP        | <- ESP
+             * +----------------+
+             */
+            static assert(BUFFER_PTR == "EAX");
+            static assert(STATE_PTR == "EBX");
+            return [// Save registers according to calling convention
+                    "push EBP",
+                    "push ESI",
+                    "push EDI",
+                    "push EBX",
+                    // Load parameters
+                    "mov EBX, [ESP + 5*4]", //pointer to state
+                    // Align stack
+                    "mov EBP, ESP",
+                    "sub ESP, 4*16 + 8*16",
+                    "and ESP, 0xffff_fff0",
+                    "push EBX",
+                    "push EBP",
+            ];
+        }
+        version(_64Bit)
+        {
+            /*
+             * Parameters:
+             *   RSI contains pointer to state
+             *   RDI contains pointer to input buffer
+             *
+             * Stack layout as follows:
+             * +----------------+
+             * | return address |
+             * +----------------+
+             * | RBP            |
+             * +----------------+
+             * | RBX            |
+             * +----------------+
+             * | Unused         |
+             * +----------------+
+             * | Space for      |
+             * | Wi+Ki          | <- RSP
+             * +----------------+ <- 16byte aligned
+             */
+            return [// Save registers according to calling convention
+                    "push RBP",
+                    "push RBX",
+                    // Save parameters
+                    "mov "~STATE_PTR~", RSI", //pointer to state
+                    "mov "~BUFFER_PTR~", RDI", //pointer to buffer
+                    // Align stack
+                    "sub RSP, 4*16+8",
+            ];
+        }
+    }
+
+    /**
+      * The epilogue sequence. Just pop the saved registers from stack and return to caller.
+      */
+    private nothrow pure string[] epilogue()
+    {
+        version(_32Bit)
+        {
+            return ["pop ESP",
+                    "pop EBX",
+                    "pop EDI",
+                    "pop ESI",
+                    "pop EBP",
+                    "ret 4",
+                   ];
+        }
+        version(_64Bit)
+        {
+            return ["add RSP,4*16+8",
+                    "pop RBX",
+                    "pop RBP",
+                    "ret 0",
+                   ];
+        }
+    }
+
+    /**
+     * Processes one 64 byte block with SSSE3 code and adds the result to state.
+     */
+    public nothrow pure void transformSSSE3(uint[5]* state, const(ubyte[64])* buffer)
+    {
+        mixin(wrap(["naked;"] ~ prologue()));
+        // Precalc first 4*16=64 bytes
+        mixin(wrap(xsetup(0)));
+        mixin(wrap(weave(precalc(0)~precalc(1)~precalc(2)~precalc(3),
+                         precalc(4)~precalc(5)~precalc(6)~precalc(7))));
+        mixin(wrap(weave(loadstate(STATE_PTR, A, B, C, D, E),
+                   weave(precalc(8)~precalc(9)~precalc(10)~precalc(11),
+                         precalc(12)~precalc(13)~precalc(14)~precalc(15)))));
+        // Round 1
+        mixin(wrap(round( 0, A, B, C, D, E)));
+        mixin(wrap(round( 2, D, E, A, B, C)));
+        mixin(wrap(round( 4, B, C, D, E, A)));
+        mixin(wrap(round( 6, E, A, B, C, D)));
+        mixin(wrap(round( 8, C, D, E, A, B)));
+        mixin(wrap(round(10, A, B, C, D, E)));
+        mixin(wrap(round(12, D, E, A, B, C)));
+        mixin(wrap(round(14, B, C, D, E, A)));
+        mixin(wrap(round(16, E, A, B, C, D)));
+        mixin(wrap(round(18, C, D, E, A, B)));
+        // Round 2
+        mixin(wrap(round(20, A, B, C, D, E)));
+        mixin(wrap(round(22, D, E, A, B, C)));
+        mixin(wrap(round(24, B, C, D, E, A)));
+        mixin(wrap(round(26, E, A, B, C, D)));
+        mixin(wrap(round(28, C, D, E, A, B)));
+        mixin(wrap(round(30, A, B, C, D, E)));
+        mixin(wrap(round(32, D, E, A, B, C)));
+        mixin(wrap(round(34, B, C, D, E, A)));
+        mixin(wrap(round(36, E, A, B, C, D)));
+        mixin(wrap(round(38, C, D, E, A, B)));
+        // Round 3
+        mixin(wrap(round(40, A, B, C, D, E)));
+        mixin(wrap(round(42, D, E, A, B, C)));
+        mixin(wrap(round(44, B, C, D, E, A)));
+        mixin(wrap(round(46, E, A, B, C, D)));
+        mixin(wrap(round(48, C, D, E, A, B)));
+        mixin(wrap(round(50, A, B, C, D, E)));
+        mixin(wrap(round(52, D, E, A, B, C)));
+        mixin(wrap(round(54, B, C, D, E, A)));
+        mixin(wrap(round(56, E, A, B, C, D)));
+        mixin(wrap(round(58, C, D, E, A, B)));
+        // Round 4
+        mixin(wrap(round(60, A, B, C, D, E)));
+        mixin(wrap(round(62, D, E, A, B, C)));
+        mixin(wrap(round(64, B, C, D, E, A)));
+        mixin(wrap(round(66, E, A, B, C, D)));
+        mixin(wrap(round(68, C, D, E, A, B)));
+        mixin(wrap(round(70, A, B, C, D, E)));
+        mixin(wrap(round(72, D, E, A, B, C)));
+        mixin(wrap(round(74, B, C, D, E, A)));
+        mixin(wrap(round(76, E, A, B, C, D)));
+        mixin(wrap(round(78, C, D, E, A, B)));
+        version(_32Bit)
+        {
+            // Load pointer to state
+            mixin(wrap(["mov "~STATE_PTR~",[ESP + STATE_OFS]"]));
+        }
+        mixin(wrap(savestate(STATE_PTR, A, B, C, D, E)));
+        mixin(wrap(epilogue()));
+    }
+}
+
diff --git a/win32.mak b/win32.mak
index 8bfd10427bf..ee96e69bcd2 100644
--- a/win32.mak
+++ b/win32.mak
@@ -191,6 +191,8 @@ SRC_STD_C_FREEBSD= std\c\freebsd\socket.d
 
 SRC_STD_INTERNAL= std\internal\processinit.d std\internal\uni.d std\internal\uni_tab.d
 
+SRC_STD_INTERNAL_DIGEST= std\internal\digest\sha_SSSE3.d
+
 SRC_STD_INTERNAL_MATH= std\internal\math\biguintcore.d \
 	std\internal\math\biguintnoasm.d std\internal\math\biguintx86.d \
 	std\internal\math\gammafunction.d std\internal\math\errorfunction.d
@@ -207,6 +209,7 @@ SRC_TO_COMPILE_NOT_STD= crc32.d \
 	$(SRC_STD_WIN) \
 	$(SRC_STD_C_WIN) \
 	$(SRC_STD_INTERNAL) \
+	$(SRC_STD_INTERNAL_DIGEST) \
 	$(SRC_STD_INTERNAL_MATH) \
 	$(SRC_STD_INTERNAL_WINDOWS) \
 	$(SRC_ETC) \
@@ -719,7 +722,8 @@ zip : win32.mak win64.mak posix.mak $(STDDOC) $(SRC) \
 	$(SRC_STD) $(SRC_STD_C) $(SRC_STD_WIN) \
 	$(SRC_STD_C_WIN) $(SRC_STD_C_LINUX) $(SRC_STD_C_OSX) $(SRC_STD_C_FREEBSD) \
 	$(SRC_ETC) $(SRC_ETC_C) $(SRC_ZLIB) $(SRC_STD_NET) $(SRC_STD_DIGEST) \
-	$(SRC_STD_INTERNAL) $(SRC_STD_INTERNAL_MATH) $(SRC_STD_INTERNAL_WINDOWS)
+	$(SRC_STD_INTERNAL) $(SRC_STD_INTERNAL_DIGEST) $(SRC_STD_INTERNAL_MATH) \
+	$(SRC_STD_INTERNAL_WINDOWS)
 	del phobos.zip
 	zip32 -u phobos win32.mak win64.mak posix.mak $(STDDOC)
 	zip32 -u phobos $(SRC)
@@ -731,6 +735,7 @@ zip : win32.mak win64.mak posix.mak $(STDDOC) $(SRC) \
 	zip32 -u phobos $(SRC_STD_C_OSX)
 	zip32 -u phobos $(SRC_STD_C_FREEBSD)
 	zip32 -u phobos $(SRC_STD_INTERNAL)
+	zip32 -u phobos $(SRC_STD_INTERNAL_DIGEST)
 	zip32 -u phobos $(SRC_STD_INTERNAL_MATH)
 	zip32 -u phobos $(SRC_STD_INTERNAL_WINDOWS)
 	zip32 -u phobos $(SRC_ETC) $(SRC_ETC_C)
diff --git a/win64.mak b/win64.mak
index 41e2837802f..f0bd3e801f9 100644
--- a/win64.mak
+++ b/win64.mak
@@ -213,6 +213,8 @@ SRC_STD_C_FREEBSD= std\c\freebsd\socket.d
 
 SRC_STD_INTERNAL= std\internal\processinit.d std\internal\uni.d std\internal\uni_tab.d
 
+SRC_STD_INTERNAL_DIGEST= std\internal\digest\sha_SSSE3.d
+
 SRC_STD_INTERNAL_MATH= std\internal\math\biguintcore.d \
 	std\internal\math\biguintnoasm.d std\internal\math\biguintx86.d \
     std\internal\math\gammafunction.d std\internal\math\errorfunction.d
@@ -229,6 +231,7 @@ SRC_TO_COMPILE_NOT_STD= crc32.d \
 	$(SRC_STD_WIN) \
 	$(SRC_STD_C_WIN) \
 	$(SRC_STD_INTERNAL) \
+	$(SRC_STD_INTERNAL_DIGEST) \
 	$(SRC_STD_INTERNAL_MATH) \
 	$(SRC_STD_INTERNAL_WINDOWS) \
 	$(SRC_ETC) \
@@ -772,7 +775,8 @@ zip : win32.mak win64.mak posix.mak $(STDDOC) $(SRC) \
 	$(SRC_STD) $(SRC_STD_C) $(SRC_STD_WIN) \
 	$(SRC_STD_C_WIN) $(SRC_STD_C_LINUX) $(SRC_STD_C_OSX) $(SRC_STD_C_FREEBSD) \
 	$(SRC_ETC) $(SRC_ETC_C) $(SRC_ZLIB) $(SRC_STD_NET) $(SRC_STD_DIGEST)\
-	$(SRC_STD_INTERNAL) $(SRC_STD_INTERNAL_MATH) $(SRC_STD_INTERNAL_WINDOWS)
+	$(SRC_STD_INTERNAL) $(SRC_STD_INTERNAL_DIGEST) $(SRC_STD_INTERNAL_MATH) \
+	$(SRC_STD_INTERNAL_WINDOWS)
 	del phobos.zip
 	zip32 -u phobos win32.mak win64.mak posix.mak $(STDDOC)
 	zip32 -u phobos $(SRC)
@@ -784,6 +788,7 @@ zip : win32.mak win64.mak posix.mak $(STDDOC) $(SRC) \
 	zip32 -u phobos $(SRC_STD_C_OSX)
 	zip32 -u phobos $(SRC_STD_C_FREEBSD)
 	zip32 -u phobos $(SRC_STD_INTERNAL)
+	zip32 -u phobos $(SRC_STD_INTERNAL_DIGEST)
 	zip32 -u phobos $(SRC_STD_INTERNAL_MATH)
 	zip32 -u phobos $(SRC_STD_INTERNAL_WINDOWS)
 	zip32 -u phobos $(SRC_ETC) $(SRC_ETC_C)