From 3e2d76b00e6a942a85079eb93311cad3c169e1e9 Mon Sep 17 00:00:00 2001
From: Jonathan Crapuchettes
Date: Fri, 6 Jun 2014 11:44:05 -0700
Subject: [PATCH] Adding 64-bit sse and sse2 code for array float operations.

---
 src/rt/arrayfloat.d | 168 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 168 insertions(+)

diff --git a/src/rt/arrayfloat.d b/src/rt/arrayfloat.d
index dfbfe92f1e0..20d1b1e413a 100644
--- a/src/rt/arrayfloat.d
+++ b/src/rt/arrayfloat.d
@@ -143,6 +143,51 @@ private template CodeGenSliceSliceOp(string opD, string opSSE, string op3DNow)
             }
         }
     }
+    else version (D_InlineAsm_X86_64)
+    {
+        // All known X86_64 have SSE2
+        if (b.length >= 16)
+        {
+            auto n = aptr + (b.length & ~15);
+
+            // Unaligned case
+            asm
+            {
+                mov RAX, bptr; // left operand
+                mov RCX, cptr; // right operand
+                mov RSI, aptr; // destination operand
+                mov RDI, n;    // end comparison
+
+                align 8;
+            startsseloopb:
+                movups XMM0, [RAX];
+                movups XMM1, [RAX+16];
+                movups XMM2, [RAX+32];
+                movups XMM3, [RAX+48];
+                add RAX, 64;
+                movups XMM4, [RCX];
+                movups XMM5, [RCX+16];
+                movups XMM6, [RCX+32];
+                movups XMM7, [RCX+48];
+                add RSI, 64;
+                ` ~ opSSE ~ ` XMM0, XMM4;
+                ` ~ opSSE ~ ` XMM1, XMM5;
+                ` ~ opSSE ~ ` XMM2, XMM6;
+                ` ~ opSSE ~ ` XMM3, XMM7;
+                add RCX, 64;
+                movups [RSI+ 0-64], XMM0;
+                movups [RSI+16-64], XMM1;
+                movups [RSI+32-64], XMM2;
+                movups [RSI+48-64], XMM3;
+                cmp RSI, RDI;
+                jb startsseloopb;
+
+                mov aptr, RSI;
+                mov bptr, RAX;
+                mov cptr, RCX;
+            }
+        }
+    }
 
     // Handle remainder
     while (aptr < aend)
@@ -407,6 +452,43 @@ private template CodeGenExpSliceOpAssign(string opD, string opSSE, string op3DNo
             }
         }
     }
+    else version (D_InlineAsm_X86_64)
+    {
+        // All known X86_64 have SSE2
+        if (a.length >= 16)
+        {
+            auto n = aptr + (a.length & ~15);
+            if (aptr < n)
+
+            asm
+            {
+                mov RSI, aptr;
+                mov RDI, n;
+                movss XMM4, value;
+                shufps XMM4, XMM4, 0;
+
+                align 8;
+            startsseloopa:
+                movups XMM0, [RSI];
+                movups XMM1, [RSI+16];
+                movups XMM2, [RSI+32];
+                movups XMM3, [RSI+48];
+                add RSI, 64;
+                ` ~ opSSE ~ ` XMM0, XMM4;
+                ` ~ opSSE ~ ` XMM1, XMM4;
+                ` ~ opSSE ~ ` XMM2, XMM4;
+                ` ~ opSSE ~ ` XMM3, XMM4;
+                movups [RSI+ 0-64], XMM0;
+                movups [RSI+16-64], XMM1;
+                movups [RSI+32-64], XMM2;
+                movups [RSI+48-64], XMM3;
+                cmp RSI, RDI;
+                jb startsseloopa;
+
+                mov aptr, RSI;
+            }
+        }
+    }
 
     while (aptr < aend)
         *aptr++ ` ~ opD ~ ` value;
@@ -709,6 +791,46 @@ private template CodeGenSliceExpOp(string opD, string opSSE, string op3DNow)
             }
         }
     }
+    else version (D_InlineAsm_X86_64)
+    {
+        // All known X86_64 have SSE2
+        if (a.length >= 16)
+        {
+            auto n = aptr + (a.length & ~15);
+
+            // Unaligned case
+            asm
+            {
+                mov RAX, bptr;
+                mov RSI, aptr;
+                mov RDI, n;
+                movss XMM4, value;
+                shufps XMM4, XMM4, 0;
+
+                align 8;
+            startsseloop:
+                add RSI, 64;
+                movups XMM0, [RAX];
+                movups XMM1, [RAX+16];
+                movups XMM2, [RAX+32];
+                movups XMM3, [RAX+48];
+                add RAX, 64;
+                ` ~ opSSE ~ ` XMM0, XMM4;
+                ` ~ opSSE ~ ` XMM1, XMM4;
+                ` ~ opSSE ~ ` XMM2, XMM4;
+                ` ~ opSSE ~ ` XMM3, XMM4;
+                movups [RSI+ 0-64], XMM0;
+                movups [RSI+16-64], XMM1;
+                movups [RSI+32-64], XMM2;
+                movups [RSI+48-64], XMM3;
+                cmp RSI, RDI;
+                jb startsseloop;
+
+                mov aptr, RSI;
+                mov bptr, RAX;
+            }
+        }
+    }
 
     while (aptr < aend)
         *aptr++ = *bptr++ ` ~ opD ~ ` value;
@@ -921,6 +1043,10 @@ unittest
 /* ======================================================================== */
 /* ======================================================================== */
 
+/* template for the case
+ * a[] ?= b[]
+ * with some binary operator ?
+ */
 private template CodeGenSliceOpAssign(string opD, string opSSE, string op3DNow)
 {
     const CodeGenSliceOpAssign = `
@@ -1006,6 +1132,48 @@ private template CodeGenSliceOpAssign(string opD, string opSSE, string op3DNow)
             }
         }
     }
+    else version (D_InlineAsm_X86_64)
+    {
+        // All known X86_64 have SSE2
+        if (a.length >= 16)
+        {
+            auto n = aptr + (a.length & ~15);
+
+            // Unaligned case
+            asm
+            {
+                mov RCX, bptr; // right operand
+                mov RSI, aptr; // destination operand
+                mov RDI, n;    // end comparison
+
+                align 8;
+            startsseloopb:
+                movups XMM0, [RSI];
+                movups XMM1, [RSI+16];
+                movups XMM2, [RSI+32];
+                movups XMM3, [RSI+48];
+                add RSI, 64;
+                movups XMM4, [RCX];
+                movups XMM5, [RCX+16];
+                movups XMM6, [RCX+32];
+                movups XMM7, [RCX+48];
+                add RCX, 64;
+                ` ~ opSSE ~ ` XMM0, XMM4;
+                ` ~ opSSE ~ ` XMM1, XMM5;
+                ` ~ opSSE ~ ` XMM2, XMM6;
+                ` ~ opSSE ~ ` XMM3, XMM7;
+                movups [RSI+ 0-64], XMM0;
+                movups [RSI+16-64], XMM1;
+                movups [RSI+32-64], XMM2;
+                movups [RSI+48-64], XMM3;
+                cmp RSI, RDI;
+                jb startsseloopb;
+
+                mov aptr, RSI;
+                mov bptr, RCX;
+            }
+        }
+    }
 
     while (aptr < aend)
         *aptr++ ` ~ opD ~ ` *bptr++;
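
----------------------------------------------------------------------

Note (not part of the patch): all four new X86_64 blocks share one shape.
`length & ~15` rounds the element count down to a multiple of 16 floats,
so each loop iteration produces 64 bytes of the destination through
unaligned 16-byte movups accesses and four `opSSE` instructions, and the
pre-existing scalar while-loop then handles the 0..15 leftover elements.
Below is a hypothetical unittest, in the style of the ones already in
arrayfloat.d, sketching the kind of check that exercises both paths; the
length 19 and the values are arbitrary choices (19 & ~15 == 16, so 16
elements take the SSE2 loop and 3 take the scalar tail):

    unittest
    {
        // Length 19 forces both paths: 16 elements through the SSE2
        // loop and 3 through the scalar remainder loop.
        float[] a = new float[19];
        float[] b = new float[19];
        float[] c = new float[19];

        foreach (i; 0 .. b.length)
        {
            b[i] = i;
            c[i] = i * 2;
        }

        a[] = b[] + c[];   // slice = slice op slice (CodeGenSliceSliceOp)
        foreach (i; 0 .. a.length)
            assert(a[i] == b[i] + c[i]);

        a[] += 4.0f;       // slice op= scalar (CodeGenExpSliceOpAssign)
        foreach (i; 0 .. a.length)
            assert(a[i] == b[i] + c[i] + 4.0f);

        a[] = b[] * 2.0f;  // slice = slice op scalar (CodeGenSliceExpOp)
        foreach (i; 0 .. a.length)
            assert(a[i] == b[i] * 2.0f);

        a[] -= b[];        // slice op= slice (CodeGenSliceOpAssign)
        foreach (i; 0 .. a.length)
            assert(a[i] == b[i]);
    }

All operands are small integers, which are exact in float, so the exact
`==` comparisons are safe here.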