diff --git a/src/Microsoft.ML.CpuMath/Avx.cs b/src/Microsoft.ML.CpuMath/Avx.cs index 6dcf898b6f..5d4610d9bc 100644 --- a/src/Microsoft.ML.CpuMath/Avx.cs +++ b/src/Microsoft.ML.CpuMath/Avx.cs @@ -625,7 +625,7 @@ public static void Scale(float a, float[] dst, int count) unsafe { fixed (float* pd = &dst[0]) - Thunk.ScaleU(a, pd, count); + Thunk.Scale(a, pd, count); } } diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index b31a427139..d27729497b 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -13,11 +13,36 @@ using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +using nuint = System.UInt64; namespace Microsoft.ML.Runtime.Internal.CpuMath { internal static class AvxIntrinsics { + public static readonly uint[] LeadingAlignmentMask = new uint[64] + { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, + }; + + public static readonly uint[] TrailingAlignmentMask = new uint[64] + { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + }; + private static readonly Vector256 _absMask256 = Avx.StaticCast(Avx.SetAllVector256(0x7FFFFFFF)); private const int Vector256Alignment = 32; @@ -451,45 +476,122 @@ public static unsafe void AddScalarU(float scalar, Span dst) } } - public static unsafe void ScaleU(float scale, Span dst) + public static unsafe void Scale(float scale, Span dst) { - fixed (float* pdst = dst) + fixed (uint* pLeadingAlignmentMask = &LeadingAlignmentMask[0]) + fixed (uint* pTrailingAlignmentMask = &TrailingAlignmentMask[0]) + fixed (float* pd = dst) { - float* pDstCurrent = pdst; - float* pEnd = pdst + dst.Length; - + float* pDstCurrent = pd; + int length = dst.Length; Vector256 scaleVector256 = Avx.SetAllVector256(scale); - while (pDstCurrent + 8 <= pEnd) + if (length < 8) { - Vector256 dstVector = Avx.LoadVector256(pDstCurrent); + switch(length) + { + case 7: dst[6] *= scale; goto case 6; + case 6: dst[5] *= scale; goto case 5; + case 5: dst[4] *= scale; goto case 4; + case 4: dst[3] *= 
scale; goto case 3; + case 3: dst[2] *= scale; goto case 2; + case 2: dst[1] *= scale; goto case 1; + case 1: dst[0] *= scale; break; + } + return; + } - dstVector = Avx.Multiply(scaleVector256, dstVector); - Avx.Store(pDstCurrent, dstVector); + nuint address = (nuint)(pd); + int misalignment = (int)(address % 32); + int remainder = 0; - pDstCurrent += 8; + if ((misalignment & 3) != 0) + { + // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations + remainder = length % 8; + + for (float* pEnd = pd + (length - remainder); pDstCurrent < pEnd; pDstCurrent += 8) + { + Vector256 temp = Avx.LoadVector256(pDstCurrent); + temp = Avx.Multiply(scaleVector256, temp); + Avx.Store(pDstCurrent, temp); + } } + else + { + if (misalignment != 0) + { + // Handle cases where the data is not 256-bit aligned by doing an unaligned read and then + // masking any elements that will be included in the first aligned read - Vector128 scaleVector128 = Sse.SetAllVector128(scale); + misalignment >>= 2; + misalignment = 8 - misalignment; - if (pDstCurrent + 4 <= pEnd) - { - Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + Vector256 result = Avx.LoadVector256(pDstCurrent); - dstVector = Sse.Multiply(scaleVector128, dstVector); - Sse.Store(pDstCurrent, dstVector); + Vector256 leadingMask = Avx.LoadVector256(((float*)(pLeadingAlignmentMask)) + (misalignment * 8)); + Vector256 trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (( 8 - misalignment) * 8)); - pDstCurrent += 4; + Vector256 temp = Avx.And(result, leadingMask); + result = Avx.And(result, trailingMask); + + temp = Avx.Multiply(scaleVector256, temp); + result = Avx.Or(temp, result); + + Avx.Store(pDstCurrent, result); + + pDstCurrent += misalignment; + length -= misalignment; + } + + if (length > 7) + { + // Handle all the 256-bit blocks that we can now that we have offset to an aligned address + + remainder = length % 8; + + for (float* pEnd = pDstCurrent + (length - remainder); pDstCurrent < pEnd; pDstCurrent += 8) + { + // The JIT will only fold away unaligned loads due to the semantics behind + // the VEX-encoding of the memory operand for `ins xmm, xmm, [mem]`. Since + // modern hardware has unaligned loads that are as fast as aligned loads, + // when it doesn't cross a cache-line/page boundary, we will just assert + // that the alignment is correct and allow for the more-efficient codegen. + + Contracts.Assert(((nuint)(pDstCurrent) % 32) == 0); + Vector256 temp = Avx.LoadVector256(pDstCurrent); + temp = Avx.Multiply(scaleVector256, temp); + Avx.Store(pDstCurrent, temp); + } + } + else + { + // Handle the "worst-case" scenario, which is when we have 8-16 elements and the input is not + // 256-bit aligned. This means we can't do any aligned loads and will just end up doing two + // unaligned loads where we mask the input each time. 
+ remainder = length; + } } - while (pDstCurrent < pEnd) + if (remainder != 0) { - Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed - dstVector = Sse.MultiplyScalar(scaleVector128, dstVector); - Sse.StoreScalar(pDstCurrent, dstVector); + pDstCurrent -= (8 - remainder); - pDstCurrent++; + Vector256 result = Avx.LoadVector256(pDstCurrent); + + Vector256 trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (remainder * 8)); + Vector256 leadingMask = Avx.LoadVector256(((float*)(pLeadingAlignmentMask)) + ((8 - remainder) * 8)); + + Vector256 temp = Avx.And(result, trailingMask); + result = Avx.And(result, leadingMask); + + temp = Avx.Multiply(scaleVector256, temp); + temp = Avx.Or(temp, result); + + Avx.Store(pDstCurrent, temp); } } } diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index f15f5c3938..09738684f5 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -248,11 +248,11 @@ private static void Scale(float a, Span dst) { if (Avx.IsSupported) { - AvxIntrinsics.ScaleU(a, dst); + AvxIntrinsics.Scale(a, dst); } else if (Sse.IsSupported) { - SseIntrinsics.ScaleU(a, dst); + SseIntrinsics.Scale(a, dst); } else { diff --git a/src/Microsoft.ML.CpuMath/Sse.cs b/src/Microsoft.ML.CpuMath/Sse.cs index 13de22dd5b..d541c02533 100644 --- a/src/Microsoft.ML.CpuMath/Sse.cs +++ b/src/Microsoft.ML.CpuMath/Sse.cs @@ -606,7 +606,7 @@ public static void Scale(float a, AlignedArray dst) unsafe { fixed (float* pdst = &dst.Items[0]) - Thunk.ScaleA(a, Ptr(dst, pdst), dst.Size); + Thunk.Scale(a, Ptr(dst, pdst), dst.Size); } } @@ -618,7 +618,7 @@ public static void Scale(float a, float[] dst, int count) unsafe { fixed (float* pd = &dst[0]) - Thunk.ScaleU(a, pd, count); + Thunk.Scale(a, pd, count); } } @@ -631,7 +631,7 @@ public static void Scale(float a, float[] dst, int offset, int count) unsafe { fixed (float* pd = &dst[offset]) - Thunk.ScaleU(a, pd, count); + Thunk.Scale(a, pd, count); } } diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 4c36d0094e..ede8a7da97 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -17,6 +17,7 @@ using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +using nuint = System.UInt64; namespace Microsoft.ML.Runtime.Internal.CpuMath { @@ -29,6 +30,22 @@ internal static class SseIntrinsics // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray private const int Vector128Alignment = 16; + public static readonly uint[] LeadingAlignmentMask = new uint[16] + { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, + }; + + public static readonly uint[] TrailingAlignmentMask = new uint[16] + { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + }; + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static bool 
HasCompatibleAlignment(AlignedArray alignedArray) { @@ -72,7 +89,7 @@ internal static unsafe void Store4(in Vector128 x, float* dst, int* idx) } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - internal static Vector128 VectorSum128(in Vector128 vector) + internal static Vector128 VectorSum128(in Vector128 vector) { if (Sse3.IsSupported) { @@ -437,33 +454,114 @@ public static unsafe void AddScalarU(float scalar, Span dst) } } - public static unsafe void ScaleU(float scale, Span dst) + public static unsafe void Scale(float scale, Span dst) { - fixed (float* pdst = dst) + fixed (uint* pLeadingAlignmentMask = &LeadingAlignmentMask[0]) + fixed (uint* pTrailingAlignmentMask = &TrailingAlignmentMask[0]) + fixed (float* pd = dst) { - float* pDstCurrent = pdst; - float* pEnd = pdst + dst.Length; + float* pDstCurrent = pd; + int length = dst.Length; + Vector128 scaleVector128 = Sse.SetAllVector128(scale); - Vector128 scaleVector = Sse.SetAllVector128(scale); + if (length < 4) + { + // Handle cases where we have less than 128-bits total and can't ever use SIMD acceleration. + switch (length) + { + case 3: dst[2] *= scale; goto case 2; + case 2: dst[1] *= scale; goto case 1; + case 1: dst[0] *= scale; break; + } + return; + } - while (pDstCurrent + 4 <= pEnd) + nuint address = (nuint)(pd); + int misalignment = (int)(address % 16); + int remainder = 0; + + if ((misalignment & 3) != 0) { - Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations + remainder = length % 4; - dstVector = Sse.Multiply(scaleVector, dstVector); - Sse.Store(pDstCurrent, dstVector); + for (float* pEnd = pd + (length - remainder); pDstCurrent < pEnd; pDstCurrent += 4) + { + Vector128 temp = Sse.LoadVector128(pDstCurrent); + temp = Sse.Multiply(scaleVector128, temp); + Sse.Store(pDstCurrent, temp); + } + } + else + { + if (misalignment != 0) + { + // Handle cases where the data is not 128-bit aligned by doing an unaligned read and then + // masking any elements that will be included in the first aligned read - pDstCurrent += 4; + misalignment >>= 2; + misalignment = 4 - misalignment; + + Vector128 result = Sse.LoadVector128(pDstCurrent); + + Vector128 leadingMask = Sse.LoadVector128(((float*)(pLeadingAlignmentMask)) + (misalignment * 4)); + Vector128 trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + ((4 - misalignment) * 4)); + + Vector128 temp = Sse.And(result, leadingMask); + result = Sse.And(result, trailingMask); + + temp = Sse.Multiply(scaleVector128, temp); + result = Sse.Or(temp, result); + + Sse.Store(pDstCurrent, result); + + pDstCurrent += misalignment; + length -= misalignment; + } + + if (length > 4) + { + // Handle all the 128-bit blocks that we can now that we have offset to an aligned address + remainder = length % 4; + + for (float* pEnd = pDstCurrent + (length - remainder); pDstCurrent < pEnd; pDstCurrent += 4) + { + // If we aren't using the VEX-encoding, the JIT will only fold away aligned loads + // (due to semantics of the legacy encoding). + // We don't need an assert, since the instruction will throw for unaligned inputs. + Vector128 temp = Sse.LoadAlignedVector128(pDstCurrent); + temp = Sse.Multiply(scaleVector128, temp); + Sse.Store(pDstCurrent, temp); + } + } + else + { + // Handle the "worst-case" scenario, which is when we have 4-8 elements and the input is not + // 128-bit aligned. 
This means we can't do any aligned loads and will just end up doing two + // unaligned loads where we mask the input each time. + remainder = length; + } } - while (pDstCurrent < pEnd) + if (remainder != 0) { - Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed - dstVector = Sse.MultiplyScalar(scaleVector, dstVector); - Sse.StoreScalar(pDstCurrent, dstVector); + pDstCurrent -= (4 - remainder); - pDstCurrent++; + Vector128 result = Sse.LoadVector128(pDstCurrent); + + Vector128 trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + (remainder * 4)); + Vector128 leadingMask = Sse.LoadVector128(((float*)(pLeadingAlignmentMask)) + ((4 - remainder) * 4)); + + Vector128 temp = Sse.And(result, trailingMask); + result = Sse.And(result, leadingMask); + + temp = Sse.Multiply(scaleVector128, temp); + temp = Sse.Or(temp, result); + + Sse.Store(pDstCurrent, temp); } } } diff --git a/src/Microsoft.ML.CpuMath/Thunk.cs b/src/Microsoft.ML.CpuMath/Thunk.cs index 1053f75b75..23192fc277 100644 --- a/src/Microsoft.ML.CpuMath/Thunk.cs +++ b/src/Microsoft.ML.CpuMath/Thunk.cs @@ -169,9 +169,7 @@ public static extern void AddXYTranGradPX(/*const*/ float* px, /*const*/ int* pp float decay, float cond, int crow, int ccol); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ScaleU(float a, float* pd, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ScaleA(float a, float* pd, int c); + public static extern void Scale(float a, float* pd, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern void ScaleX(float a, float* pd, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] diff --git a/src/Native/CpuMathNative/Sse.cpp b/src/Native/CpuMathNative/Sse.cpp index c5221f0020..4a2d30e979 100644 --- a/src/Native/CpuMathNative/Sse.cpp +++ b/src/Native/CpuMathNative/Sse.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #define _load1(ps, pi) \ _mm_set_ss(ps[pi[0]]) @@ -85,6 +86,22 @@ DWORD xmmYmmStateSupport() #endif +const unsigned int LeadingAlignmentMask[16] = +{ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, +}; + +const unsigned int TrailingAlignmentMask[16] = +{ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, +}; + // Test whether Avx is available. 
EXPORT_API(bool) ChkAvx() { @@ -1429,36 +1446,101 @@ EXPORT_API(void) AddScalarU(float a, _Inout_ float * pd, int c) } } -EXPORT_API(void) ScaleU(float a, _Inout_ float * pd, int c) +EXPORT_API(void) Scale(float a, _Inout_ float * pd, int c) { - float * pdLim = pd + c; - __m128 x1 = _mm_set1_ps(a); - for (; pd + 4 <= pdLim; pd += 4) + + if (c < 4) { - __m128 x2 = _mm_loadu_ps(pd); - x2 = _mm_mul_ps(x1, x2); - _mm_storeu_ps(pd, x2); + switch(c) + { + case 3: pd[2] *= a; + case 2: pd[1] *= a; + case 1: pd[0] *= a; + } + return; } - for (; pd < pdLim; pd++) + uintptr_t address = (uintptr_t)(pd); + uintptr_t misalignment = address % 16; + int remainder = 0; + + if ((misalignment & 3) != 0) { - __m128 x2 = _mm_load_ss(pd); - x2 = _mm_mul_ss(x1, x2); - _mm_store_ss(pd, x2); + // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations + remainder = c % 4; + + for (const float* pEnd = pd + (c - remainder); pd < pEnd; pd += 4) + { + __m128 x2 = _mm_loadu_ps(pd); + x2 = _mm_mul_ps(x1, x2); + _mm_storeu_ps(pd, x2); + } } -} + else + { + if (misalignment != 0) + { + // Handle cases where the data is not 128-bit aligned by doing an unaligned read and then + // masking any elements that will be included in the first aligned read + misalignment >>= 2; + misalignment = 4 - misalignment; + + __m128 result = _mm_loadu_ps(pd); + + __m128 leadingMask = _mm_loadu_ps(((float*)(&LeadingAlignmentMask)) + (misalignment * 4)); + __m128 trailingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + ((4 - misalignment) * 4)); + + __m128 temp = _mm_and_ps(result, leadingMask); + result = _mm_and_ps(result, trailingMask); + + temp = _mm_mul_ps(temp, x1); + result = _mm_or_ps(temp, result); + + _mm_storeu_ps(pd, result); + + pd += misalignment; + c -= misalignment; + } -EXPORT_API(void) ScaleA(float a, _Inout_ float * pd, int c) -{ - float * pdLim = pd + c; + if (c > 3) + { + // Handle all the 128-bit blocks that we can now that we have offset to an aligned address + remainder = c % 4; + for (const float* pEnd = pd + (c - remainder); pd < pEnd; pd += 4) + { + __m128 x2 = _mm_load_ps(pd); + x2 = _mm_mul_ps(x1, x2); + _mm_storeu_ps(pd, x2); + } + } + else + { + // Handle the "worst-case" scenario, which is when we have 4-8 elements and the input is not + // 128-bit aligned. This means we can't do any aligned loads and will just end up doing two + // unaligned loads where we mask the input each time. 
+ remainder = c; + } + } - __m128 x1 = _mm_set1_ps(a); - for (; pd < pdLim; pd += 4) - { - __m128 x2 = _mm_load_ps(pd); - x2 = _mm_mul_ps(x1, x2); - _mm_store_ps(pd, x2); + if (remainder != 0) + { + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed + + pd -= (4 - remainder); + __m128 result = _mm_loadu_ps(pd); + + __m128 trailingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + (remainder * 4)); + __m128 leadingMask = _mm_loadu_ps(((float*)(&LeadingAlignmentMask)) + ((4 - remainder) * 4)); + + __m128 temp = _mm_and_ps(result, trailingMask); + result = _mm_and_ps(result, leadingMask); + + temp = _mm_mul_ps(temp, x1); + result = _mm_or_ps(temp, result); + + _mm_storeu_ps(pd, result); } } diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs index 8e691b0847..5930090bc5 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs @@ -16,8 +16,8 @@ public void AddScalarU() => AvxIntrinsics.AddScalarU(DefaultScale, new Span(dst, 0, Length)); [Benchmark] - public void ScaleU() - => AvxIntrinsics.ScaleU(DefaultScale, new Span(dst, 0, Length)); + public void Scale() + => AvxIntrinsics.Scale(DefaultScale, new Span(dst, 0, Length)); [Benchmark] public void ScaleSrcU() diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs index 8df3352556..83b412e61d 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs @@ -23,8 +23,8 @@ internal static class CpuMathNativeUtils [DllImport("CpuMathNative", EntryPoint = "AddScalarU"), SuppressUnmanagedCodeSecurity] internal static extern unsafe float AddScalarU(float a, /*_Inout_*/ float* pd, int c); - [DllImport("CpuMathNative", EntryPoint = "ScaleU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void ScaleU(float a, /*_Inout_*/ float* pd, int c); + [DllImport("CpuMathNative", EntryPoint = "Scale"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void Scale(float a, /*_Inout_*/ float* pd, int c); [DllImport("CpuMathNative", EntryPoint = "ScaleSrcU"), SuppressUnmanagedCodeSecurity] internal static extern unsafe void ScaleSrcU(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c); diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/NativePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/NativePerformanceTests.cs index b992ed3001..3cce45046c 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/NativePerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/NativePerformanceTests.cs @@ -21,11 +21,11 @@ public unsafe void AddScalarU() } [Benchmark] - public unsafe void ScaleU() + public unsafe void Scale() { fixed (float* pdst = dst) { - CpuMathNativeUtils.ScaleU(DefaultScale, pdst, Length); + CpuMathNativeUtils.Scale(DefaultScale, pdst, Length); } } diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs index 046e065955..923d7c539f 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs @@ 
-16,8 +16,8 @@ public void AddScalarU()
             => SseIntrinsics.AddScalarU(DefaultScale, new Span<float>(dst, 0, Length));
 
         [Benchmark]
-        public void ScaleU()
-            => SseIntrinsics.ScaleU(DefaultScale, new Span<float>(dst, 0, Length));
+        public void Scale()
+            => SseIntrinsics.Scale(DefaultScale, new Span<float>(dst, 0, Length));
 
         [Benchmark]
         public void ScaleSrcU()
diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs
index 7284ce6c1a..6c0f1cbaf0 100644
--- a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs
+++ b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs
@@ -233,7 +233,7 @@ public void AddScalarUTest(int test)
         [Theory]
         [InlineData(0)]
         [InlineData(1)]
-        public void ScaleUTest(int test)
+        public void ScaleTest(int test)
         {
             float[] dst = (float[])_testArrays[test].Clone();
             float[] expected = (float[])dst.Clone();
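
Note on the pattern shared by the managed and native Scale rewrites above: each variant scales a possibly misaligned head and tail with masked unaligned loads, and reserves aligned loads for the 128-bit/256-bit aligned body. The sketch below is a minimal, standalone illustration of that masking scheme with SSE intrinsics. It is not the patched ML.NET code: the helper name scale_inplace and the main() driver are invented for the example; only the LeadingAlignmentMask/TrailingAlignmentMask tables and the load/and/mul/or/store sequence are taken from the Sse.cpp changes above.

// Minimal sketch (not part of this patch): in-place scale of a float buffer with SSE,
// using the patch's leading/trailing mask tables to handle a misaligned head and tail.
#include <xmmintrin.h>
#include <cstdint>
#include <cstdio>

// Row i keeps the first i lanes (leading) or the last i lanes (trailing) of a 4-float block.
static const unsigned int LeadingAlignmentMask[16] =
{
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000,
};

static const unsigned int TrailingAlignmentMask[16] =
{
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF,
    0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF,
    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
};

static void scale_inplace(float a, float* pd, int c)
{
    __m128 x1 = _mm_set1_ps(a);

    if (c < 4)
    {
        for (int i = 0; i < c; i++)
            pd[i] *= a;
        return;
    }

    uintptr_t misalignment = ((uintptr_t)pd) % 16;
    if ((misalignment & 3) != 0)
    {
        // Data that is not even 4-byte aligned can never be realigned; use unaligned ops throughout.
        int rem = c % 4;
        for (const float* pEnd = pd + (c - rem); pd < pEnd; pd += 4)
            _mm_storeu_ps(pd, _mm_mul_ps(x1, _mm_loadu_ps(pd)));
        for (int i = 0; i < rem; i++)
            pd[i] *= a;
        return;
    }

    if (misalignment != 0)
    {
        // Head: one unaligned load/store covering the block up to the first 16-byte boundary.
        // Only the leading lanes are scaled; the trailing lanes are written back unchanged.
        int lanes = 4 - (int)(misalignment >> 2);

        __m128 block = _mm_loadu_ps(pd);
        __m128 lead  = _mm_loadu_ps((const float*)LeadingAlignmentMask  + lanes * 4);
        __m128 trail = _mm_loadu_ps((const float*)TrailingAlignmentMask + (4 - lanes) * 4);
        __m128 scaled = _mm_mul_ps(x1, _mm_and_ps(block, lead));
        _mm_storeu_ps(pd, _mm_or_ps(scaled, _mm_and_ps(block, trail)));

        pd += lanes;
        c -= lanes;
    }

    // Body: pd is now 16-byte aligned, so aligned loads/stores are safe.
    int remainder = c % 4;
    for (const float* pEnd = pd + (c - remainder); pd < pEnd; pd += 4)
        _mm_store_ps(pd, _mm_mul_ps(x1, _mm_load_ps(pd)));

    if (remainder != 0)
    {
        // Tail: back up so a final unaligned load ends exactly at the end of the buffer,
        // then scale only the trailing lanes; the leading lanes were already scaled above.
        pd -= (4 - remainder);

        __m128 block = _mm_loadu_ps(pd);
        __m128 trail = _mm_loadu_ps((const float*)TrailingAlignmentMask + remainder * 4);
        __m128 lead  = _mm_loadu_ps((const float*)LeadingAlignmentMask  + (4 - remainder) * 4);
        __m128 scaled = _mm_mul_ps(x1, _mm_and_ps(block, trail));
        _mm_storeu_ps(pd, _mm_or_ps(scaled, _mm_and_ps(block, lead)));
    }
}

int main()
{
    float buf[11];
    for (int i = 0; i < 11; i++) buf[i] = (float)i;
    scale_inplace(2.0f, buf + 1, 10);   // deliberately misaligned start, non-multiple-of-4 count
    for (int i = 0; i < 11; i++) printf("%g ", buf[i]);
    printf("\n");                       // expected: 0 2 4 6 8 10 12 14 16 18 20
    return 0;
}

The managed SseIntrinsics.Scale and AvxIntrinsics.Scale above follow the same three phases with Vector128<float> and Vector256<float>, the AVX variant using 8-float blocks and 32-byte alignment; backing up for the tail lets the head and tail each be handled with a single full-width masked store instead of a scalar cleanup loop.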