From 6991cd9191cb01a15a00001869c806ac28638e91 Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 3 Oct 2018 16:05:09 -0700 Subject: [PATCH 1/7] scale modified to handle both aligned and unaligned read\writes --- src/Microsoft.ML.CpuMath/Avx.cs | 2 +- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 144 ++++++++++++++--- .../CpuMathUtils.netcoreapp.cs | 4 +- src/Microsoft.ML.CpuMath/Sse.cs | 6 +- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 147 ++++++++++++++++-- src/Microsoft.ML.CpuMath/Thunk.cs | 2 +- src/Native/CpuMathNative/Sse.cpp | 118 +++++++++++++- .../AvxPerformanceTests.cs | 4 +- .../CpuMathNativeUtils.cs | 4 +- .../NativePerformanceTests.cs | 4 +- .../SsePerformanceTests.cs | 4 +- .../UnitTests.cs | 2 +- 12 files changed, 385 insertions(+), 56 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/Avx.cs b/src/Microsoft.ML.CpuMath/Avx.cs index 6dcf898b6f..5d4610d9bc 100644 --- a/src/Microsoft.ML.CpuMath/Avx.cs +++ b/src/Microsoft.ML.CpuMath/Avx.cs @@ -625,7 +625,7 @@ public static void Scale(float a, float[] dst, int count) unsafe { fixed (float* pd = &dst[0]) - Thunk.ScaleU(a, pd, count); + Thunk.Scale(a, pd, count); } } diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index b31a427139..22706b9ef8 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -13,11 +13,36 @@ using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +using nuint = System.UInt64; namespace Microsoft.ML.Runtime.Internal.CpuMath { internal static class AvxIntrinsics { + public static readonly uint[] LeadingAlignmentMask = new uint[64] + { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, + }; + + public static readonly uint[] TrailingAlignmentMask = new uint[64] + { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + }; + private static readonly Vector256 _absMask256 = Avx.StaticCast(Avx.SetAllVector256(0x7FFFFFFF)); private const int Vector256Alignment = 32; @@ -451,45 +476,118 @@ public static unsafe void AddScalarU(float 
scalar, Span dst) } } - public static unsafe void ScaleU(float scale, Span dst) + public static unsafe void Scale(float scale, Span dst) { - fixed (float* pdst = dst) + fixed (uint* pLeadingAlignmentMask = &LeadingAlignmentMask[0]) + fixed (uint* pTrailingAlignmentMask = &TrailingAlignmentMask[0]) + fixed (float* pd = dst) { - float* pDstCurrent = pdst; - float* pEnd = pdst + dst.Length; + float* pdLim = pd + dst.Length; + int length = dst.Length; Vector256 scaleVector256 = Avx.SetAllVector256(scale); - while (pDstCurrent + 8 <= pEnd) + if (length < 8) { - Vector256 dstVector = Avx.LoadVector256(pDstCurrent); + for(int i = 0; i < length; i++) + { + dst[i] *= scale; + } + return; + } - dstVector = Avx.Multiply(scaleVector256, dstVector); - Avx.Store(pDstCurrent, dstVector); + nuint address = (nuint)(pd); + int misalignment = (int)(address % 32); + int remainder = 0; + float* pDstCurrent = pd; - pDstCurrent += 8; + if ((misalignment & 3) != 0) + { + // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations + remainder = length % 8; + + for (float* pEnd = pd + (length - remainder); pDstCurrent < pEnd; pDstCurrent += 8) + { + Vector256 temp = Avx.LoadVector256(pDstCurrent); + temp = Avx.Multiply(scaleVector256, temp); + Avx.Store(pDstCurrent, temp); + } } + else + { + if (misalignment != 0) + { + // Handle cases where the data is not 256-bit aligned by doing an unaligned read and then + // masking any elements that will be included in the first aligned read - Vector128 scaleVector128 = Sse.SetAllVector128(scale); + misalignment >>= 2; + misalignment = 8 - misalignment; - if (pDstCurrent + 4 <= pEnd) - { - Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + Vector256 result = Avx.LoadVector256(pDstCurrent); - dstVector = Sse.Multiply(scaleVector128, dstVector); - Sse.Store(pDstCurrent, dstVector); + Vector256 leadingMask = Avx.LoadVector256(((float*)(pLeadingAlignmentMask)) + (misalignment * 8)); + Vector256 trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (( 8 - misalignment) * 8)); - pDstCurrent += 4; - } + Vector256 temp = Avx.And(result, leadingMask); + result = Avx.And(result, trailingMask); - while (pDstCurrent < pEnd) - { - Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + temp = Avx.Multiply(scaleVector256, temp); + result = Avx.Or(temp, result); - dstVector = Sse.MultiplyScalar(scaleVector128, dstVector); - Sse.StoreScalar(pDstCurrent, dstVector); + Avx.Store(pDstCurrent, result); - pDstCurrent++; + pDstCurrent += misalignment; + length -= misalignment; + } + + if (length > 7) + { + // Handle all the 256-bit blocks that we can now that we have offset to an aligned address + + remainder = length % 8; + + for (float* pEnd = pDstCurrent + (length - remainder); pDstCurrent < pEnd; pDstCurrent += 8) + { + // The JIT will only fold away unaligned loads due to the semantics behind + // the VEX-encoding of the memory operand for `ins xmm, xmm, [mem]`. Since + // modern hardware has unaligned loads that are as fast as aligned loads, + // when it doesn't cross a cache-line/page boundary, we will just assert + // that the alignment is correct and allow for the more-efficient codegen. + + Contracts.Assert(((nuint)(pDstCurrent) % 32) == 0); + Vector256 temp = Avx.LoadVector256(pDstCurrent); + temp = Avx.Multiply(scaleVector256, temp); + Avx.Store(pDstCurrent, temp); + } + } + else + { + // Handle the "worst-case" scenario, which is when we have 8-16 elements and the input is not + // 256-bit aligned. 
This means we can't do any aligned loads and will just end up doing two + // unaligned loads where we mask the input each time. + remainder = length; + } + + if (remainder != 0) + { + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed + + pDstCurrent -= (8 - remainder); + + Vector256 result = Avx.LoadVector256(pDstCurrent); + + Vector256 trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (remainder * 8)); + Vector256 leadingMask = Avx.LoadVector256(((float*)(pLeadingAlignmentMask)) + (( 8 - remainder) * 8)); + + Vector256 temp = Avx.And(result, trailingMask); + result = Avx.And(result, leadingMask); + + temp = Avx.Multiply(scaleVector256, temp); + temp = Avx.Or(temp, result); + + Avx.Store(pDstCurrent, temp); + } } } } diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index f15f5c3938..09738684f5 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -248,11 +248,11 @@ private static void Scale(float a, Span dst) { if (Avx.IsSupported) { - AvxIntrinsics.ScaleU(a, dst); + AvxIntrinsics.Scale(a, dst); } else if (Sse.IsSupported) { - SseIntrinsics.ScaleU(a, dst); + SseIntrinsics.Scale(a, dst); } else { diff --git a/src/Microsoft.ML.CpuMath/Sse.cs b/src/Microsoft.ML.CpuMath/Sse.cs index 13de22dd5b..d541c02533 100644 --- a/src/Microsoft.ML.CpuMath/Sse.cs +++ b/src/Microsoft.ML.CpuMath/Sse.cs @@ -606,7 +606,7 @@ public static void Scale(float a, AlignedArray dst) unsafe { fixed (float* pdst = &dst.Items[0]) - Thunk.ScaleA(a, Ptr(dst, pdst), dst.Size); + Thunk.Scale(a, Ptr(dst, pdst), dst.Size); } } @@ -618,7 +618,7 @@ public static void Scale(float a, float[] dst, int count) unsafe { fixed (float* pd = &dst[0]) - Thunk.ScaleU(a, pd, count); + Thunk.Scale(a, pd, count); } } @@ -631,7 +631,7 @@ public static void Scale(float a, float[] dst, int offset, int count) unsafe { fixed (float* pd = &dst[offset]) - Thunk.ScaleU(a, pd, count); + Thunk.Scale(a, pd, count); } } diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 4c36d0094e..77eded3f55 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -17,6 +17,7 @@ using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +using nuint = System.UInt64; namespace Microsoft.ML.Runtime.Internal.CpuMath { @@ -29,6 +30,22 @@ internal static class SseIntrinsics // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray private const int Vector128Alignment = 16; + public static readonly uint[] LeadingAlignmentMask = new uint[16] + { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, + }; + + public static readonly uint[] TrailingAlignmentMask = new uint[16] + { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + }; + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static bool HasCompatibleAlignment(AlignedArray alignedArray) { @@ -437,33 +454,131 @@ public static unsafe 
void AddScalarU(float scalar, Span dst) } } - public static unsafe void ScaleU(float scale, Span dst) + public static unsafe void Scale(float scale, Span dst) { - fixed (float* pdst = dst) + fixed (uint* pLeadingAlignmentMask = &LeadingAlignmentMask[0]) + fixed (uint* pTrailingAlignmentMask = &TrailingAlignmentMask[0]) + fixed (float* pd = dst) { - float* pDstCurrent = pdst; - float* pEnd = pdst + dst.Length; + float* pdLim = pd + dst.Length; - Vector128 scaleVector = Sse.SetAllVector128(scale); + int length = dst.Length; + Vector128 scaleVector128 = Sse.SetAllVector128(scale); - while (pDstCurrent + 4 <= pEnd) + if (length < 4) { - Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + // Handle cases where we have less than 128-bits total and can't ever use SIMD acceleration. + for (int i = 0; i < length; i++) + { + dst[i] *= scale; + } + return; + } - dstVector = Sse.Multiply(scaleVector, dstVector); - Sse.Store(pDstCurrent, dstVector); + nuint address = (nuint)(pd); + int misalignment = (int)(address % 16); + int remainder = 0; + float* pDstCurrent = pd; - pDstCurrent += 4; - } + if ((misalignment & 3) != 0) + { + // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations + remainder = length % 4; - while (pDstCurrent < pEnd) + for (float* pEnd = pd + (length - remainder); pDstCurrent < pEnd; pDstCurrent += 4) + { + Vector128 temp = Sse.LoadVector128(pDstCurrent); + temp = Sse.Multiply(scaleVector128, temp); + Sse.Store(pDstCurrent, temp); + } + } + else { - Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + if (misalignment != 0) + { + // Handle cases where the data is not 128-bit aligned by doing an unaligned read and then + // masking any elements that will be included in the first aligned read - dstVector = Sse.MultiplyScalar(scaleVector, dstVector); - Sse.StoreScalar(pDstCurrent, dstVector); + misalignment >>= 2; + misalignment = 4 - misalignment; - pDstCurrent++; + Vector128 result = Sse.LoadVector128(pDstCurrent); + + Vector128 leadingMask = Sse.LoadVector128(((float*)(pLeadingAlignmentMask)) + (misalignment * 4)); + Vector128 trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + ((4 - misalignment) * 4)); + + Vector128 temp = Sse.And(result, leadingMask); + result = Sse.And(result, trailingMask); + + temp = Sse.Multiply(scaleVector128, temp); + result = Sse.Or(temp, result); + + Sse.Store(pDstCurrent, result); + + pDstCurrent += misalignment; + length -= misalignment; + } + + if (length > 4) + { + // Handle all the 128-bit blocks that we can now that we have offset to an aligned address + + remainder = length % 4; + + for (float* pEnd = pDstCurrent + (length - remainder); pDstCurrent < pEnd; pDstCurrent += 4) + { + if (Avx.IsSupported) + { + // The JIT will only fold away unaligned loads due to the semantics behind + // the VEX-encoding of the memory operand for `ins xmm, xmm, [mem]`. Since + // modern hardware has unaligned loads that are as fast as aligned loads, + // when it doesn't cross a cache-line/page boundary, we will just assert + // that the alignment is correct and allow for the more-efficient codegen. + + Contracts.Assert(((nuint)(pDstCurrent) % 16) == 0); + Vector128 temp = Sse.LoadVector128(pDstCurrent); + temp = Sse.Multiply(scaleVector128, temp); + Sse.Store(pDstCurrent, temp); + } + else + { + // If we aren't using the VEX-encoding, then the reverse is true and the JIT + // will only fold away aligned loads (due to semantics of the legacy encoding). 
+ // We don't need an assert, since the instruction will throw for unaligned inputs. + Vector128 temp = Sse.LoadAlignedVector128(pDstCurrent); + temp = Sse.Multiply(scaleVector128, temp); + Sse.Store(pDstCurrent, temp); + } + } + } + else + { + // Handle the "worst-case" scenario, which is when we have 4-8 elements and the input is not + // 128-bit aligned. This means we can't do any aligned loads and will just end up doing two + // unaligned loads where we mask the input each time. + remainder = length; + } + + if (remainder != 0) + { + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed + + pDstCurrent -= (4 - remainder); + + Vector128 result = Sse.LoadVector128(pDstCurrent); + + Vector128 trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + (remainder * 4)); + Vector128 leadingMask = Sse.LoadVector128(((float*)(pLeadingAlignmentMask)) + ((4 - remainder) * 4)); + + Vector128 temp = Sse.And(result, trailingMask); + result = Sse.And(result, leadingMask); + + temp = Sse.Multiply(scaleVector128, temp); + temp = Sse.Or(temp, result); + + Sse.Store(pDstCurrent, temp); + } } } } diff --git a/src/Microsoft.ML.CpuMath/Thunk.cs b/src/Microsoft.ML.CpuMath/Thunk.cs index 1053f75b75..a7451df8be 100644 --- a/src/Microsoft.ML.CpuMath/Thunk.cs +++ b/src/Microsoft.ML.CpuMath/Thunk.cs @@ -169,7 +169,7 @@ public static extern void AddXYTranGradPX(/*const*/ float* px, /*const*/ int* pp float decay, float cond, int crow, int ccol); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ScaleU(float a, float* pd, int c); + public static extern void Scale(float a, float* pd, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern void ScaleA(float a, float* pd, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] diff --git a/src/Native/CpuMathNative/Sse.cpp b/src/Native/CpuMathNative/Sse.cpp index c5221f0020..1777497a3b 100644 --- a/src/Native/CpuMathNative/Sse.cpp +++ b/src/Native/CpuMathNative/Sse.cpp @@ -85,6 +85,22 @@ DWORD xmmYmmStateSupport() #endif +const unsigned int LeadingAlignmentMask[16] = +{ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, +}; + +const unsigned int TrailingAlignmentMask[16] = +{ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, +}; + // Test whether Avx is available. 
EXPORT_API(bool) ChkAvx() { @@ -1429,7 +1445,7 @@ EXPORT_API(void) AddScalarU(float a, _Inout_ float * pd, int c) } } -EXPORT_API(void) ScaleU(float a, _Inout_ float * pd, int c) +EXPORT_API(void) Scale(float a, _Inout_ float * pd, int c) { float * pdLim = pd + c; @@ -1449,6 +1465,106 @@ EXPORT_API(void) ScaleU(float a, _Inout_ float * pd, int c) } } +EXPORT_API(void) ScaleU(float a, _Inout_ float * pd, int c) +{ + float * pdLim = pd + c; + __m128 x1 = _mm_set1_ps(a); + + if (c < 4) + { + for (; pd < pdLim; pd++) + { + __m128 x2 = _mm_load_ss(pd); + x2 = _mm_mul_ss(x1, x2); + _mm_store_ss(pd, x2); + } + return; + } + + uintptr_t address = (uintptr_t)(pd); + uintptr_t misalignment = address % 16; + int remainder = 0; + + if ((misalignment & 3) != 0) + { + // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations + remainder = c % 4; + + for (const float* pEnd = pd + (c - remainder); pd < pEnd; pd += 4) + { + __m128 x2 = _mm_loadu_ps(pd); + x2 = _mm_mul_ps(x1, x2); + _mm_storeu_ps(pd, x2); + } + } + else + { + if (misalignment != 0) + { + // Handle cases where the data is not 128-bit aligned by doing an unaligned read and then + // masking any elements that will be included in the first aligned read + misalignment >>= 2; + misalignment = 4 - misalignment; + + __m128 result = _mm_loadu_ps(pd); + + __m128 leadingMask = _mm_loadu_ps(((float*)(&LeadingAlignmentMask)) + (misalignment * 4)); + __m128 trailingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + ((4 - misalignment) * 4)); + + __m128 temp = _mm_and_ps(result, leadingMask); + result = _mm_and_ps(result, trailingMask); + + temp = _mm_mul_ps(temp, x1); + result = _mm_or_ps(temp, result); + + _mm_storeu_ps(pd, result); + + pd += misalignment; + c -= misalignment; + } + + if (c > 3) + { + // Handle all the 128-bit blocks that we can now that we have offset to an aligned address + remainder = c % 4; + for (const float* pEnd = pd + (c - remainder); pd < pEnd; pd += 4) + { + __m128 x2 = _mm_load_ps(pd); + x2 = _mm_mul_ps(x1, x2); + _mm_storeu_ps(pd, x2); + } + } + else + { + // Handle the "worst-case" scenario, which is when we have 4-8 elements and the input is not + // 128-bit aligned. This means we can't do any aligned loads and will just end up doing two + // unaligned loads where we mask the input each time. 
+ remainder = c; + } + } + + if (remainder != 0) + { + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed + + pd -= (4 - remainder); + __m128 result = _mm_loadu_ps(pd); + + __m128 trailingMask = _mm_loadu_ps(((float*)(&LeadingAlignmentMask)) + (remainder * 4)); + __m128 leadingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + ((4 - remainder) * 4)); + + __m128 temp = _mm_and_ps(result, trailingMask); + result = _mm_and_ps(result, leadingMask); + + temp = _mm_mul_ps(temp, x1); + result = _mm_or_ps(temp, result); + + _mm_storeu_ps(pd, result); + return; + } +} + EXPORT_API(void) ScaleA(float a, _Inout_ float * pd, int c) { float * pdLim = pd + c; diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs index 8e691b0847..5930090bc5 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs @@ -16,8 +16,8 @@ public void AddScalarU() => AvxIntrinsics.AddScalarU(DefaultScale, new Span(dst, 0, Length)); [Benchmark] - public void ScaleU() - => AvxIntrinsics.ScaleU(DefaultScale, new Span(dst, 0, Length)); + public void Scale() + => AvxIntrinsics.Scale(DefaultScale, new Span(dst, 0, Length)); [Benchmark] public void ScaleSrcU() diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs index 8df3352556..83b412e61d 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs @@ -23,8 +23,8 @@ internal static class CpuMathNativeUtils [DllImport("CpuMathNative", EntryPoint = "AddScalarU"), SuppressUnmanagedCodeSecurity] internal static extern unsafe float AddScalarU(float a, /*_Inout_*/ float* pd, int c); - [DllImport("CpuMathNative", EntryPoint = "ScaleU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void ScaleU(float a, /*_Inout_*/ float* pd, int c); + [DllImport("CpuMathNative", EntryPoint = "Scale"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void Scale(float a, /*_Inout_*/ float* pd, int c); [DllImport("CpuMathNative", EntryPoint = "ScaleSrcU"), SuppressUnmanagedCodeSecurity] internal static extern unsafe void ScaleSrcU(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c); diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/NativePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/NativePerformanceTests.cs index b992ed3001..3cce45046c 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/NativePerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/NativePerformanceTests.cs @@ -21,11 +21,11 @@ public unsafe void AddScalarU() } [Benchmark] - public unsafe void ScaleU() + public unsafe void Scale() { fixed (float* pdst = dst) { - CpuMathNativeUtils.ScaleU(DefaultScale, pdst, Length); + CpuMathNativeUtils.Scale(DefaultScale, pdst, Length); } } diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs index 046e065955..923d7c539f 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs @@ -16,8 +16,8 @@ public void AddScalarU() => 
SseIntrinsics.AddScalarU(DefaultScale, new Span(dst, 0, Length)); [Benchmark] - public void ScaleU() - => SseIntrinsics.ScaleU(DefaultScale, new Span(dst, 0, Length)); + public void Scale() + => SseIntrinsics.Scale(DefaultScale, new Span(dst, 0, Length)); [Benchmark] public void ScaleSrcU() diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs index 7284ce6c1a..6c0f1cbaf0 100644 --- a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs +++ b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs @@ -233,7 +233,7 @@ public void AddScalarUTest(int test) [Theory] [InlineData(0)] [InlineData(1)] - public void ScaleUTest(int test) + public void ScaleTest(int test) { float[] dst = (float[])_testArrays[test].Clone(); float[] expected = (float[])dst.Clone(); From f003a9c7f2309580f7da7d13890e6e70cea85400 Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 3 Oct 2018 16:33:48 -0700 Subject: [PATCH 2/7] fixing build for uniz --- src/Native/CpuMathNative/Sse.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Native/CpuMathNative/Sse.cpp b/src/Native/CpuMathNative/Sse.cpp index 1777497a3b..be49be0957 100644 --- a/src/Native/CpuMathNative/Sse.cpp +++ b/src/Native/CpuMathNative/Sse.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #define _load1(ps, pi) \ _mm_set_ss(ps[pi[0]]) From 9a54d54e27614651ba8c76d876989163135b37b5 Mon Sep 17 00:00:00 2001 From: Anipik Date: Thu, 4 Oct 2018 10:28:41 -0700 Subject: [PATCH 3/7] Removing dead code --- src/Microsoft.ML.CpuMath/Thunk.cs | 2 -- src/Native/CpuMathNative/Sse.cpp | 37 ++----------------------------- 2 files changed, 2 insertions(+), 37 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/Thunk.cs b/src/Microsoft.ML.CpuMath/Thunk.cs index a7451df8be..23192fc277 100644 --- a/src/Microsoft.ML.CpuMath/Thunk.cs +++ b/src/Microsoft.ML.CpuMath/Thunk.cs @@ -171,8 +171,6 @@ public static extern void AddXYTranGradPX(/*const*/ float* px, /*const*/ int* pp [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern void Scale(float a, float* pd, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ScaleA(float a, float* pd, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern void ScaleX(float a, float* pd, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern void ScaleSrcU(float a, /*const*/ float* ps, float* pd, int c); diff --git a/src/Native/CpuMathNative/Sse.cpp b/src/Native/CpuMathNative/Sse.cpp index be49be0957..101af8a5f8 100644 --- a/src/Native/CpuMathNative/Sse.cpp +++ b/src/Native/CpuMathNative/Sse.cpp @@ -1447,26 +1447,6 @@ EXPORT_API(void) AddScalarU(float a, _Inout_ float * pd, int c) } EXPORT_API(void) Scale(float a, _Inout_ float * pd, int c) -{ - float * pdLim = pd + c; - - __m128 x1 = _mm_set1_ps(a); - for (; pd + 4 <= pdLim; pd += 4) - { - __m128 x2 = _mm_loadu_ps(pd); - x2 = _mm_mul_ps(x1, x2); - _mm_storeu_ps(pd, x2); - } - - for (; pd < pdLim; pd++) - { - __m128 x2 = _mm_load_ss(pd); - x2 = _mm_mul_ss(x1, x2); - _mm_store_ss(pd, x2); - } -} - -EXPORT_API(void) ScaleU(float a, _Inout_ float * pd, int c) { float * pdLim = pd + c; __m128 x1 = _mm_set1_ps(a); @@ -1552,8 +1532,8 @@ EXPORT_API(void) ScaleU(float a, _Inout_ float * pd, int c) pd -= (4 - remainder); __m128 result = _mm_loadu_ps(pd); - __m128 trailingMask = _mm_loadu_ps(((float*)(&LeadingAlignmentMask)) + (remainder * 4)); - __m128 leadingMask = 
_mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + ((4 - remainder) * 4)); + __m128 trailingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + (remainder * 4)); + __m128 leadingMask = _mm_loadu_ps(((float*)(&LeadingAlignmentMask)) + ((4 - remainder) * 4)); __m128 temp = _mm_and_ps(result, trailingMask); result = _mm_and_ps(result, leadingMask); @@ -1566,19 +1546,6 @@ EXPORT_API(void) ScaleU(float a, _Inout_ float * pd, int c) } } -EXPORT_API(void) ScaleA(float a, _Inout_ float * pd, int c) -{ - float * pdLim = pd + c; - - __m128 x1 = _mm_set1_ps(a); - for (; pd < pdLim; pd += 4) - { - __m128 x2 = _mm_load_ps(pd); - x2 = _mm_mul_ps(x1, x2); - _mm_store_ps(pd, x2); - } -} - EXPORT_API(void) ScaleSrcU(float a, _In_ const float * ps, _Inout_ float * pd, int c) { float * pdLim = pd + c; From 0dbd303ec4da1ff304ff7cdaa190258c9d8aaf75 Mon Sep 17 00:00:00 2001 From: Anipik Date: Mon, 8 Oct 2018 14:46:02 -0700 Subject: [PATCH 4/7] using switch statement for lesser length input --- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 14 +++++++++----- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 10 +++++----- src/Native/CpuMathNative/Sse.cpp | 9 ++++----- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index 22706b9ef8..5e450a0e84 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -482,16 +482,21 @@ public static unsafe void Scale(float scale, Span dst) fixed (uint* pTrailingAlignmentMask = &TrailingAlignmentMask[0]) fixed (float* pd = dst) { - float* pdLim = pd + dst.Length; - + float* pDstCurrent = pd; int length = dst.Length; Vector256 scaleVector256 = Avx.SetAllVector256(scale); if (length < 8) { - for(int i = 0; i < length; i++) + switch(length) { - dst[i] *= scale; + case 7: dst[6] *= scale; goto case 6; + case 6: dst[5] *= scale; goto case 5; + case 5: dst[4] *= scale; goto case 4; + case 4: dst[3] *= scale; goto case 3; + case 3: dst[2] *= scale; goto case 2; + case 2: dst[1] *= scale; goto case 1; + case 1: dst[0] *= scale; break; } return; } @@ -499,7 +504,6 @@ public static unsafe void Scale(float scale, Span dst) nuint address = (nuint)(pd); int misalignment = (int)(address % 32); int remainder = 0; - float* pDstCurrent = pd; if ((misalignment & 3) != 0) { diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 77eded3f55..5b50e667fe 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -460,17 +460,18 @@ public static unsafe void Scale(float scale, Span dst) fixed (uint* pTrailingAlignmentMask = &TrailingAlignmentMask[0]) fixed (float* pd = dst) { - float* pdLim = pd + dst.Length; - + float* pDstCurrent = pd; int length = dst.Length; Vector128 scaleVector128 = Sse.SetAllVector128(scale); if (length < 4) { // Handle cases where we have less than 128-bits total and can't ever use SIMD acceleration. 
- for (int i = 0; i < length; i++) + switch (length) { - dst[i] *= scale; + case 3: dst[2] *= scale; goto case 2; + case 2: dst[1] *= scale; goto case 1; + case 1: dst[0] *= scale; break; } return; } @@ -478,7 +479,6 @@ public static unsafe void Scale(float scale, Span dst) nuint address = (nuint)(pd); int misalignment = (int)(address % 16); int remainder = 0; - float* pDstCurrent = pd; if ((misalignment & 3) != 0) { diff --git a/src/Native/CpuMathNative/Sse.cpp b/src/Native/CpuMathNative/Sse.cpp index 101af8a5f8..dab94d1b7f 100644 --- a/src/Native/CpuMathNative/Sse.cpp +++ b/src/Native/CpuMathNative/Sse.cpp @@ -1448,16 +1448,15 @@ EXPORT_API(void) AddScalarU(float a, _Inout_ float * pd, int c) EXPORT_API(void) Scale(float a, _Inout_ float * pd, int c) { - float * pdLim = pd + c; __m128 x1 = _mm_set1_ps(a); if (c < 4) { - for (; pd < pdLim; pd++) + switch(c) { - __m128 x2 = _mm_load_ss(pd); - x2 = _mm_mul_ss(x1, x2); - _mm_store_ss(pd, x2); + case 3: pd[2] *= a; + case 2: pd[1] *= a; + case 1: pd[0] *= a; } return; } From ed64154adc752531cdf1ca6b928c73b35d0d22aa Mon Sep 17 00:00:00 2001 From: Anipik Date: Tue, 9 Oct 2018 11:49:20 -0700 Subject: [PATCH 5/7] Remainder block corrected --- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 28 +++++++++++------------ src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 28 +++++++++++------------ 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index 5e450a0e84..d27729497b 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -571,27 +571,27 @@ public static unsafe void Scale(float scale, Span dst) // unaligned loads where we mask the input each time. remainder = length; } + } - if (remainder != 0) - { - // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next - // unaligned load will read to the end of the array and then mask out any elements already processed + if (remainder != 0) + { + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed - pDstCurrent -= (8 - remainder); + pDstCurrent -= (8 - remainder); - Vector256 result = Avx.LoadVector256(pDstCurrent); + Vector256 result = Avx.LoadVector256(pDstCurrent); - Vector256 trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (remainder * 8)); - Vector256 leadingMask = Avx.LoadVector256(((float*)(pLeadingAlignmentMask)) + (( 8 - remainder) * 8)); + Vector256 trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (remainder * 8)); + Vector256 leadingMask = Avx.LoadVector256(((float*)(pLeadingAlignmentMask)) + ((8 - remainder) * 8)); - Vector256 temp = Avx.And(result, trailingMask); - result = Avx.And(result, leadingMask); + Vector256 temp = Avx.And(result, trailingMask); + result = Avx.And(result, leadingMask); - temp = Avx.Multiply(scaleVector256, temp); - temp = Avx.Or(temp, result); + temp = Avx.Multiply(scaleVector256, temp); + temp = Avx.Or(temp, result); - Avx.Store(pDstCurrent, temp); - } + Avx.Store(pDstCurrent, temp); } } } diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 5b50e667fe..2faca55058 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -558,27 +558,27 @@ public static unsafe void Scale(float scale, Span dst) // 
unaligned loads where we mask the input each time. remainder = length; } + } - if (remainder != 0) - { - // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next - // unaligned load will read to the end of the array and then mask out any elements already processed + if (remainder != 0) + { + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed - pDstCurrent -= (4 - remainder); + pDstCurrent -= (4 - remainder); - Vector128 result = Sse.LoadVector128(pDstCurrent); + Vector128 result = Sse.LoadVector128(pDstCurrent); - Vector128 trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + (remainder * 4)); - Vector128 leadingMask = Sse.LoadVector128(((float*)(pLeadingAlignmentMask)) + ((4 - remainder) * 4)); + Vector128 trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + (remainder * 4)); + Vector128 leadingMask = Sse.LoadVector128(((float*)(pLeadingAlignmentMask)) + ((4 - remainder) * 4)); - Vector128 temp = Sse.And(result, trailingMask); - result = Sse.And(result, leadingMask); + Vector128 temp = Sse.And(result, trailingMask); + result = Sse.And(result, leadingMask); - temp = Sse.Multiply(scaleVector128, temp); - temp = Sse.Or(temp, result); + temp = Sse.Multiply(scaleVector128, temp); + temp = Sse.Or(temp, result); - Sse.Store(pDstCurrent, temp); - } + Sse.Store(pDstCurrent, temp); } } } From c49e6a24db7cc313fc7cbc33778ffd48d5162f60 Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 10 Oct 2018 10:39:12 -0700 Subject: [PATCH 6/7] avx.isSupported path removed --- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 30 ++++++----------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 2faca55058..70c645ced6 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -89,7 +89,7 @@ internal static unsafe void Store4(in Vector128 x, float* dst, int* idx) } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - internal static Vector128 VectorSum128(in Vector128 vector) + internal static Vector128 VectorSum128(in Vector128 vector) { if (Sse3.IsSupported) { @@ -527,28 +527,12 @@ public static unsafe void Scale(float scale, Span dst) for (float* pEnd = pDstCurrent + (length - remainder); pDstCurrent < pEnd; pDstCurrent += 4) { - if (Avx.IsSupported) - { - // The JIT will only fold away unaligned loads due to the semantics behind - // the VEX-encoding of the memory operand for `ins xmm, xmm, [mem]`. Since - // modern hardware has unaligned loads that are as fast as aligned loads, - // when it doesn't cross a cache-line/page boundary, we will just assert - // that the alignment is correct and allow for the more-efficient codegen. - - Contracts.Assert(((nuint)(pDstCurrent) % 16) == 0); - Vector128 temp = Sse.LoadVector128(pDstCurrent); - temp = Sse.Multiply(scaleVector128, temp); - Sse.Store(pDstCurrent, temp); - } - else - { - // If we aren't using the VEX-encoding, then the reverse is true and the JIT - // will only fold away aligned loads (due to semantics of the legacy encoding). - // We don't need an assert, since the instruction will throw for unaligned inputs. 
- Vector128 temp = Sse.LoadAlignedVector128(pDstCurrent); - temp = Sse.Multiply(scaleVector128, temp); - Sse.Store(pDstCurrent, temp); - } + // If we aren't using the VEX-encoding, then the reverse is true and the JIT + // will only fold away aligned loads (due to semantics of the legacy encoding). + // We don't need an assert, since the instruction will throw for unaligned inputs. + Vector128 temp = Sse.LoadAlignedVector128(pDstCurrent); + temp = Sse.Multiply(scaleVector128, temp); + Sse.Store(pDstCurrent, temp); } } else From 061e5b34db1073506f29cef5cb5ab8eadbf3a819 Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 10 Oct 2018 12:50:22 -0700 Subject: [PATCH 7/7] correcting comments and removing return statement --- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 5 ++--- src/Native/CpuMathNative/Sse.cpp | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 70c645ced6..ede8a7da97 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -522,13 +522,12 @@ public static unsafe void Scale(float scale, Span dst) if (length > 4) { // Handle all the 128-bit blocks that we can now that we have offset to an aligned address - remainder = length % 4; for (float* pEnd = pDstCurrent + (length - remainder); pDstCurrent < pEnd; pDstCurrent += 4) { - // If we aren't using the VEX-encoding, then the reverse is true and the JIT - // will only fold away aligned loads (due to semantics of the legacy encoding). + // If we aren't using the VEX-encoding, the JIT will only fold away aligned loads + // (due to semantics of the legacy encoding). // We don't need an assert, since the instruction will throw for unaligned inputs. Vector128 temp = Sse.LoadAlignedVector128(pDstCurrent); temp = Sse.Multiply(scaleVector128, temp); diff --git a/src/Native/CpuMathNative/Sse.cpp b/src/Native/CpuMathNative/Sse.cpp index dab94d1b7f..4a2d30e979 100644 --- a/src/Native/CpuMathNative/Sse.cpp +++ b/src/Native/CpuMathNative/Sse.cpp @@ -1541,7 +1541,6 @@ EXPORT_API(void) Scale(float a, _Inout_ float * pd, int c) result = _mm_or_ps(temp, result); _mm_storeu_ps(pd, result); - return; } }