From 6991cd9191cb01a15a00001869c806ac28638e91 Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 3 Oct 2018 16:05:09 -0700 Subject: [PATCH 1/7] scale modified to handle both aligned and unaligned read\writes --- src/Microsoft.ML.CpuMath/Avx.cs | 2 +- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 144 ++++++++++++++--- .../CpuMathUtils.netcoreapp.cs | 4 +- src/Microsoft.ML.CpuMath/Sse.cs | 6 +- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 147 ++++++++++++++++-- src/Microsoft.ML.CpuMath/Thunk.cs | 2 +- src/Native/CpuMathNative/Sse.cpp | 118 +++++++++++++- .../AvxPerformanceTests.cs | 4 +- .../CpuMathNativeUtils.cs | 4 +- .../NativePerformanceTests.cs | 4 +- .../SsePerformanceTests.cs | 4 +- .../UnitTests.cs | 2 +- 12 files changed, 385 insertions(+), 56 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/Avx.cs b/src/Microsoft.ML.CpuMath/Avx.cs index 6dcf898b6f..5d4610d9bc 100644 --- a/src/Microsoft.ML.CpuMath/Avx.cs +++ b/src/Microsoft.ML.CpuMath/Avx.cs @@ -625,7 +625,7 @@ public static void Scale(float a, float[] dst, int count) unsafe { fixed (float* pd = &dst[0]) - Thunk.ScaleU(a, pd, count); + Thunk.Scale(a, pd, count); } } diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index b31a427139..22706b9ef8 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -13,11 +13,36 @@ using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +using nuint = System.UInt64; namespace Microsoft.ML.Runtime.Internal.CpuMath { internal static class AvxIntrinsics { + public static readonly uint[] LeadingAlignmentMask = new uint[64] + { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, + }; + + public static readonly uint[] TrailingAlignmentMask = new uint[64] + { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + }; + private static readonly Vector256 _absMask256 = Avx.StaticCast(Avx.SetAllVector256(0x7FFFFFFF)); private const int Vector256Alignment = 32; @@ -451,45 +476,118 @@ public static unsafe void AddScalarU(float 
scalar, Span dst) } } - public static unsafe void ScaleU(float scale, Span dst) + public static unsafe void Scale(float scale, Span dst) { - fixed (float* pdst = dst) + fixed (uint* pLeadingAlignmentMask = &LeadingAlignmentMask[0]) + fixed (uint* pTrailingAlignmentMask = &TrailingAlignmentMask[0]) + fixed (float* pd = dst) { - float* pDstCurrent = pdst; - float* pEnd = pdst + dst.Length; + float* pdLim = pd + dst.Length; + int length = dst.Length; Vector256 scaleVector256 = Avx.SetAllVector256(scale); - while (pDstCurrent + 8 <= pEnd) + if (length < 8) { - Vector256 dstVector = Avx.LoadVector256(pDstCurrent); + for(int i = 0; i < length; i++) + { + dst[i] *= scale; + } + return; + } - dstVector = Avx.Multiply(scaleVector256, dstVector); - Avx.Store(pDstCurrent, dstVector); + nuint address = (nuint)(pd); + int misalignment = (int)(address % 32); + int remainder = 0; + float* pDstCurrent = pd; - pDstCurrent += 8; + if ((misalignment & 3) != 0) + { + // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations + remainder = length % 8; + + for (float* pEnd = pd + (length - remainder); pDstCurrent < pEnd; pDstCurrent += 8) + { + Vector256 temp = Avx.LoadVector256(pDstCurrent); + temp = Avx.Multiply(scaleVector256, temp); + Avx.Store(pDstCurrent, temp); + } } + else + { + if (misalignment != 0) + { + // Handle cases where the data is not 256-bit aligned by doing an unaligned read and then + // masking any elements that will be included in the first aligned read - Vector128 scaleVector128 = Sse.SetAllVector128(scale); + misalignment >>= 2; + misalignment = 8 - misalignment; - if (pDstCurrent + 4 <= pEnd) - { - Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + Vector256 result = Avx.LoadVector256(pDstCurrent); - dstVector = Sse.Multiply(scaleVector128, dstVector); - Sse.Store(pDstCurrent, dstVector); + Vector256 leadingMask = Avx.LoadVector256(((float*)(pLeadingAlignmentMask)) + (misalignment * 8)); + Vector256 trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (( 8 - misalignment) * 8)); - pDstCurrent += 4; - } + Vector256 temp = Avx.And(result, leadingMask); + result = Avx.And(result, trailingMask); - while (pDstCurrent < pEnd) - { - Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + temp = Avx.Multiply(scaleVector256, temp); + result = Avx.Or(temp, result); - dstVector = Sse.MultiplyScalar(scaleVector128, dstVector); - Sse.StoreScalar(pDstCurrent, dstVector); + Avx.Store(pDstCurrent, result); - pDstCurrent++; + pDstCurrent += misalignment; + length -= misalignment; + } + + if (length > 7) + { + // Handle all the 256-bit blocks that we can now that we have offset to an aligned address + + remainder = length % 8; + + for (float* pEnd = pDstCurrent + (length - remainder); pDstCurrent < pEnd; pDstCurrent += 8) + { + // The JIT will only fold away unaligned loads due to the semantics behind + // the VEX-encoding of the memory operand for `ins xmm, xmm, [mem]`. Since + // modern hardware has unaligned loads that are as fast as aligned loads, + // when it doesn't cross a cache-line/page boundary, we will just assert + // that the alignment is correct and allow for the more-efficient codegen. + + Contracts.Assert(((nuint)(pDstCurrent) % 32) == 0); + Vector256 temp = Avx.LoadVector256(pDstCurrent); + temp = Avx.Multiply(scaleVector256, temp); + Avx.Store(pDstCurrent, temp); + } + } + else + { + // Handle the "worst-case" scenario, which is when we have 8-16 elements and the input is not + // 256-bit aligned. 
This means we can't do any aligned loads and will just end up doing two + // unaligned loads where we mask the input each time. + remainder = length; + } + + if (remainder != 0) + { + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed + + pDstCurrent -= (8 - remainder); + + Vector256 result = Avx.LoadVector256(pDstCurrent); + + Vector256 trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (remainder * 8)); + Vector256 leadingMask = Avx.LoadVector256(((float*)(pLeadingAlignmentMask)) + (( 8 - remainder) * 8)); + + Vector256 temp = Avx.And(result, trailingMask); + result = Avx.And(result, leadingMask); + + temp = Avx.Multiply(scaleVector256, temp); + temp = Avx.Or(temp, result); + + Avx.Store(pDstCurrent, temp); + } } } } diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index f15f5c3938..09738684f5 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -248,11 +248,11 @@ private static void Scale(float a, Span dst) { if (Avx.IsSupported) { - AvxIntrinsics.ScaleU(a, dst); + AvxIntrinsics.Scale(a, dst); } else if (Sse.IsSupported) { - SseIntrinsics.ScaleU(a, dst); + SseIntrinsics.Scale(a, dst); } else { diff --git a/src/Microsoft.ML.CpuMath/Sse.cs b/src/Microsoft.ML.CpuMath/Sse.cs index 13de22dd5b..d541c02533 100644 --- a/src/Microsoft.ML.CpuMath/Sse.cs +++ b/src/Microsoft.ML.CpuMath/Sse.cs @@ -606,7 +606,7 @@ public static void Scale(float a, AlignedArray dst) unsafe { fixed (float* pdst = &dst.Items[0]) - Thunk.ScaleA(a, Ptr(dst, pdst), dst.Size); + Thunk.Scale(a, Ptr(dst, pdst), dst.Size); } } @@ -618,7 +618,7 @@ public static void Scale(float a, float[] dst, int count) unsafe { fixed (float* pd = &dst[0]) - Thunk.ScaleU(a, pd, count); + Thunk.Scale(a, pd, count); } } @@ -631,7 +631,7 @@ public static void Scale(float a, float[] dst, int offset, int count) unsafe { fixed (float* pd = &dst[offset]) - Thunk.ScaleU(a, pd, count); + Thunk.Scale(a, pd, count); } } diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 4c36d0094e..77eded3f55 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -17,6 +17,7 @@ using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +using nuint = System.UInt64; namespace Microsoft.ML.Runtime.Internal.CpuMath { @@ -29,6 +30,22 @@ internal static class SseIntrinsics // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray private const int Vector128Alignment = 16; + public static readonly uint[] LeadingAlignmentMask = new uint[16] + { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, + }; + + public static readonly uint[] TrailingAlignmentMask = new uint[16] + { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + }; + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static bool HasCompatibleAlignment(AlignedArray alignedArray) { @@ -437,33 +454,131 @@ public static unsafe 
void AddScalarU(float scalar, Span dst) } } - public static unsafe void ScaleU(float scale, Span dst) + public static unsafe void Scale(float scale, Span dst) { - fixed (float* pdst = dst) + fixed (uint* pLeadingAlignmentMask = &LeadingAlignmentMask[0]) + fixed (uint* pTrailingAlignmentMask = &TrailingAlignmentMask[0]) + fixed (float* pd = dst) { - float* pDstCurrent = pdst; - float* pEnd = pdst + dst.Length; + float* pdLim = pd + dst.Length; - Vector128 scaleVector = Sse.SetAllVector128(scale); + int length = dst.Length; + Vector128 scaleVector128 = Sse.SetAllVector128(scale); - while (pDstCurrent + 4 <= pEnd) + if (length < 4) { - Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + // Handle cases where we have less than 128-bits total and can't ever use SIMD acceleration. + for (int i = 0; i < length; i++) + { + dst[i] *= scale; + } + return; + } - dstVector = Sse.Multiply(scaleVector, dstVector); - Sse.Store(pDstCurrent, dstVector); + nuint address = (nuint)(pd); + int misalignment = (int)(address % 16); + int remainder = 0; + float* pDstCurrent = pd; - pDstCurrent += 4; - } + if ((misalignment & 3) != 0) + { + // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations + remainder = length % 4; - while (pDstCurrent < pEnd) + for (float* pEnd = pd + (length - remainder); pDstCurrent < pEnd; pDstCurrent += 4) + { + Vector128 temp = Sse.LoadVector128(pDstCurrent); + temp = Sse.Multiply(scaleVector128, temp); + Sse.Store(pDstCurrent, temp); + } + } + else { - Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + if (misalignment != 0) + { + // Handle cases where the data is not 128-bit aligned by doing an unaligned read and then + // masking any elements that will be included in the first aligned read - dstVector = Sse.MultiplyScalar(scaleVector, dstVector); - Sse.StoreScalar(pDstCurrent, dstVector); + misalignment >>= 2; + misalignment = 4 - misalignment; - pDstCurrent++; + Vector128 result = Sse.LoadVector128(pDstCurrent); + + Vector128 leadingMask = Sse.LoadVector128(((float*)(pLeadingAlignmentMask)) + (misalignment * 4)); + Vector128 trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + ((4 - misalignment) * 4)); + + Vector128 temp = Sse.And(result, leadingMask); + result = Sse.And(result, trailingMask); + + temp = Sse.Multiply(scaleVector128, temp); + result = Sse.Or(temp, result); + + Sse.Store(pDstCurrent, result); + + pDstCurrent += misalignment; + length -= misalignment; + } + + if (length > 4) + { + // Handle all the 128-bit blocks that we can now that we have offset to an aligned address + + remainder = length % 4; + + for (float* pEnd = pDstCurrent + (length - remainder); pDstCurrent < pEnd; pDstCurrent += 4) + { + if (Avx.IsSupported) + { + // The JIT will only fold away unaligned loads due to the semantics behind + // the VEX-encoding of the memory operand for `ins xmm, xmm, [mem]`. Since + // modern hardware has unaligned loads that are as fast as aligned loads, + // when it doesn't cross a cache-line/page boundary, we will just assert + // that the alignment is correct and allow for the more-efficient codegen. + + Contracts.Assert(((nuint)(pDstCurrent) % 16) == 0); + Vector128 temp = Sse.LoadVector128(pDstCurrent); + temp = Sse.Multiply(scaleVector128, temp); + Sse.Store(pDstCurrent, temp); + } + else + { + // If we aren't using the VEX-encoding, then the reverse is true and the JIT + // will only fold away aligned loads (due to semantics of the legacy encoding). 
+ // We don't need an assert, since the instruction will throw for unaligned inputs. + Vector128 temp = Sse.LoadAlignedVector128(pDstCurrent); + temp = Sse.Multiply(scaleVector128, temp); + Sse.Store(pDstCurrent, temp); + } + } + } + else + { + // Handle the "worst-case" scenario, which is when we have 4-8 elements and the input is not + // 128-bit aligned. This means we can't do any aligned loads and will just end up doing two + // unaligned loads where we mask the input each time. + remainder = length; + } + + if (remainder != 0) + { + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed + + pDstCurrent -= (4 - remainder); + + Vector128 result = Sse.LoadVector128(pDstCurrent); + + Vector128 trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + (remainder * 4)); + Vector128 leadingMask = Sse.LoadVector128(((float*)(pLeadingAlignmentMask)) + ((4 - remainder) * 4)); + + Vector128 temp = Sse.And(result, trailingMask); + result = Sse.And(result, leadingMask); + + temp = Sse.Multiply(scaleVector128, temp); + temp = Sse.Or(temp, result); + + Sse.Store(pDstCurrent, temp); + } } } } diff --git a/src/Microsoft.ML.CpuMath/Thunk.cs b/src/Microsoft.ML.CpuMath/Thunk.cs index 1053f75b75..a7451df8be 100644 --- a/src/Microsoft.ML.CpuMath/Thunk.cs +++ b/src/Microsoft.ML.CpuMath/Thunk.cs @@ -169,7 +169,7 @@ public static extern void AddXYTranGradPX(/*const*/ float* px, /*const*/ int* pp float decay, float cond, int crow, int ccol); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ScaleU(float a, float* pd, int c); + public static extern void Scale(float a, float* pd, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern void ScaleA(float a, float* pd, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] diff --git a/src/Native/CpuMathNative/Sse.cpp b/src/Native/CpuMathNative/Sse.cpp index c5221f0020..1777497a3b 100644 --- a/src/Native/CpuMathNative/Sse.cpp +++ b/src/Native/CpuMathNative/Sse.cpp @@ -85,6 +85,22 @@ DWORD xmmYmmStateSupport() #endif +const unsigned int LeadingAlignmentMask[16] = +{ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, +}; + +const unsigned int TrailingAlignmentMask[16] = +{ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, +}; + // Test whether Avx is available. 
EXPORT_API(bool) ChkAvx() { @@ -1429,7 +1445,7 @@ EXPORT_API(void) AddScalarU(float a, _Inout_ float * pd, int c) } } -EXPORT_API(void) ScaleU(float a, _Inout_ float * pd, int c) +EXPORT_API(void) Scale(float a, _Inout_ float * pd, int c) { float * pdLim = pd + c; @@ -1449,6 +1465,106 @@ EXPORT_API(void) ScaleU(float a, _Inout_ float * pd, int c) } } +EXPORT_API(void) ScaleU(float a, _Inout_ float * pd, int c) +{ + float * pdLim = pd + c; + __m128 x1 = _mm_set1_ps(a); + + if (c < 4) + { + for (; pd < pdLim; pd++) + { + __m128 x2 = _mm_load_ss(pd); + x2 = _mm_mul_ss(x1, x2); + _mm_store_ss(pd, x2); + } + return; + } + + uintptr_t address = (uintptr_t)(pd); + uintptr_t misalignment = address % 16; + int remainder = 0; + + if ((misalignment & 3) != 0) + { + // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations + remainder = c % 4; + + for (const float* pEnd = pd + (c - remainder); pd < pEnd; pd += 4) + { + __m128 x2 = _mm_loadu_ps(pd); + x2 = _mm_mul_ps(x1, x2); + _mm_storeu_ps(pd, x2); + } + } + else + { + if (misalignment != 0) + { + // Handle cases where the data is not 128-bit aligned by doing an unaligned read and then + // masking any elements that will be included in the first aligned read + misalignment >>= 2; + misalignment = 4 - misalignment; + + __m128 result = _mm_loadu_ps(pd); + + __m128 leadingMask = _mm_loadu_ps(((float*)(&LeadingAlignmentMask)) + (misalignment * 4)); + __m128 trailingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + ((4 - misalignment) * 4)); + + __m128 temp = _mm_and_ps(result, leadingMask); + result = _mm_and_ps(result, trailingMask); + + temp = _mm_mul_ps(temp, x1); + result = _mm_or_ps(temp, result); + + _mm_storeu_ps(pd, result); + + pd += misalignment; + c -= misalignment; + } + + if (c > 3) + { + // Handle all the 128-bit blocks that we can now that we have offset to an aligned address + remainder = c % 4; + for (const float* pEnd = pd + (c - remainder); pd < pEnd; pd += 4) + { + __m128 x2 = _mm_load_ps(pd); + x2 = _mm_mul_ps(x1, x2); + _mm_storeu_ps(pd, x2); + } + } + else + { + // Handle the "worst-case" scenario, which is when we have 4-8 elements and the input is not + // 128-bit aligned. This means we can't do any aligned loads and will just end up doing two + // unaligned loads where we mask the input each time. 
+ remainder = c; + } + } + + if (remainder != 0) + { + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed + + pd -= (4 - remainder); + __m128 result = _mm_loadu_ps(pd); + + __m128 trailingMask = _mm_loadu_ps(((float*)(&LeadingAlignmentMask)) + (remainder * 4)); + __m128 leadingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + ((4 - remainder) * 4)); + + __m128 temp = _mm_and_ps(result, trailingMask); + result = _mm_and_ps(result, leadingMask); + + temp = _mm_mul_ps(temp, x1); + result = _mm_or_ps(temp, result); + + _mm_storeu_ps(pd, result); + return; + } +} + EXPORT_API(void) ScaleA(float a, _Inout_ float * pd, int c) { float * pdLim = pd + c; diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs index 8e691b0847..5930090bc5 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs @@ -16,8 +16,8 @@ public void AddScalarU() => AvxIntrinsics.AddScalarU(DefaultScale, new Span(dst, 0, Length)); [Benchmark] - public void ScaleU() - => AvxIntrinsics.ScaleU(DefaultScale, new Span(dst, 0, Length)); + public void Scale() + => AvxIntrinsics.Scale(DefaultScale, new Span(dst, 0, Length)); [Benchmark] public void ScaleSrcU() diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs index 8df3352556..83b412e61d 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs @@ -23,8 +23,8 @@ internal static class CpuMathNativeUtils [DllImport("CpuMathNative", EntryPoint = "AddScalarU"), SuppressUnmanagedCodeSecurity] internal static extern unsafe float AddScalarU(float a, /*_Inout_*/ float* pd, int c); - [DllImport("CpuMathNative", EntryPoint = "ScaleU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void ScaleU(float a, /*_Inout_*/ float* pd, int c); + [DllImport("CpuMathNative", EntryPoint = "Scale"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void Scale(float a, /*_Inout_*/ float* pd, int c); [DllImport("CpuMathNative", EntryPoint = "ScaleSrcU"), SuppressUnmanagedCodeSecurity] internal static extern unsafe void ScaleSrcU(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c); diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/NativePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/NativePerformanceTests.cs index b992ed3001..3cce45046c 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/NativePerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/NativePerformanceTests.cs @@ -21,11 +21,11 @@ public unsafe void AddScalarU() } [Benchmark] - public unsafe void ScaleU() + public unsafe void Scale() { fixed (float* pdst = dst) { - CpuMathNativeUtils.ScaleU(DefaultScale, pdst, Length); + CpuMathNativeUtils.Scale(DefaultScale, pdst, Length); } } diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs index 046e065955..923d7c539f 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs @@ -16,8 +16,8 @@ public void AddScalarU() => 
SseIntrinsics.AddScalarU(DefaultScale, new Span(dst, 0, Length)); [Benchmark] - public void ScaleU() - => SseIntrinsics.ScaleU(DefaultScale, new Span(dst, 0, Length)); + public void Scale() + => SseIntrinsics.Scale(DefaultScale, new Span(dst, 0, Length)); [Benchmark] public void ScaleSrcU() diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs index 7284ce6c1a..6c0f1cbaf0 100644 --- a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs +++ b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs @@ -233,7 +233,7 @@ public void AddScalarUTest(int test) [Theory] [InlineData(0)] [InlineData(1)] - public void ScaleUTest(int test) + public void ScaleTest(int test) { float[] dst = (float[])_testArrays[test].Clone(); float[] expected = (float[])dst.Clone(); From f003a9c7f2309580f7da7d13890e6e70cea85400 Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 3 Oct 2018 16:33:48 -0700 Subject: [PATCH 2/7] fixing build for uniz --- src/Native/CpuMathNative/Sse.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Native/CpuMathNative/Sse.cpp b/src/Native/CpuMathNative/Sse.cpp index 1777497a3b..be49be0957 100644 --- a/src/Native/CpuMathNative/Sse.cpp +++ b/src/Native/CpuMathNative/Sse.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #define _load1(ps, pi) \ _mm_set_ss(ps[pi[0]]) From 9a54d54e27614651ba8c76d876989163135b37b5 Mon Sep 17 00:00:00 2001 From: Anipik Date: Thu, 4 Oct 2018 10:28:41 -0700 Subject: [PATCH 3/7] Removing dead code --- src/Microsoft.ML.CpuMath/Thunk.cs | 2 -- src/Native/CpuMathNative/Sse.cpp | 37 ++----------------------------- 2 files changed, 2 insertions(+), 37 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/Thunk.cs b/src/Microsoft.ML.CpuMath/Thunk.cs index a7451df8be..23192fc277 100644 --- a/src/Microsoft.ML.CpuMath/Thunk.cs +++ b/src/Microsoft.ML.CpuMath/Thunk.cs @@ -171,8 +171,6 @@ public static extern void AddXYTranGradPX(/*const*/ float* px, /*const*/ int* pp [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern void Scale(float a, float* pd, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ScaleA(float a, float* pd, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern void ScaleX(float a, float* pd, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern void ScaleSrcU(float a, /*const*/ float* ps, float* pd, int c); diff --git a/src/Native/CpuMathNative/Sse.cpp b/src/Native/CpuMathNative/Sse.cpp index be49be0957..101af8a5f8 100644 --- a/src/Native/CpuMathNative/Sse.cpp +++ b/src/Native/CpuMathNative/Sse.cpp @@ -1447,26 +1447,6 @@ EXPORT_API(void) AddScalarU(float a, _Inout_ float * pd, int c) } EXPORT_API(void) Scale(float a, _Inout_ float * pd, int c) -{ - float * pdLim = pd + c; - - __m128 x1 = _mm_set1_ps(a); - for (; pd + 4 <= pdLim; pd += 4) - { - __m128 x2 = _mm_loadu_ps(pd); - x2 = _mm_mul_ps(x1, x2); - _mm_storeu_ps(pd, x2); - } - - for (; pd < pdLim; pd++) - { - __m128 x2 = _mm_load_ss(pd); - x2 = _mm_mul_ss(x1, x2); - _mm_store_ss(pd, x2); - } -} - -EXPORT_API(void) ScaleU(float a, _Inout_ float * pd, int c) { float * pdLim = pd + c; __m128 x1 = _mm_set1_ps(a); @@ -1552,8 +1532,8 @@ EXPORT_API(void) ScaleU(float a, _Inout_ float * pd, int c) pd -= (4 - remainder); __m128 result = _mm_loadu_ps(pd); - __m128 trailingMask = _mm_loadu_ps(((float*)(&LeadingAlignmentMask)) + (remainder * 4)); - __m128 leadingMask = 
_mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + ((4 - remainder) * 4)); + __m128 trailingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + (remainder * 4)); + __m128 leadingMask = _mm_loadu_ps(((float*)(&LeadingAlignmentMask)) + ((4 - remainder) * 4)); __m128 temp = _mm_and_ps(result, trailingMask); result = _mm_and_ps(result, leadingMask); @@ -1566,19 +1546,6 @@ EXPORT_API(void) ScaleU(float a, _Inout_ float * pd, int c) } } -EXPORT_API(void) ScaleA(float a, _Inout_ float * pd, int c) -{ - float * pdLim = pd + c; - - __m128 x1 = _mm_set1_ps(a); - for (; pd < pdLim; pd += 4) - { - __m128 x2 = _mm_load_ps(pd); - x2 = _mm_mul_ps(x1, x2); - _mm_store_ps(pd, x2); - } -} - EXPORT_API(void) ScaleSrcU(float a, _In_ const float * ps, _Inout_ float * pd, int c) { float * pdLim = pd + c; From 0dbd303ec4da1ff304ff7cdaa190258c9d8aaf75 Mon Sep 17 00:00:00 2001 From: Anipik Date: Mon, 8 Oct 2018 14:46:02 -0700 Subject: [PATCH 4/7] using switch statement for lesser length input --- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 14 +++++++++----- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 10 +++++----- src/Native/CpuMathNative/Sse.cpp | 9 ++++----- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index 22706b9ef8..5e450a0e84 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -482,16 +482,21 @@ public static unsafe void Scale(float scale, Span dst) fixed (uint* pTrailingAlignmentMask = &TrailingAlignmentMask[0]) fixed (float* pd = dst) { - float* pdLim = pd + dst.Length; - + float* pDstCurrent = pd; int length = dst.Length; Vector256 scaleVector256 = Avx.SetAllVector256(scale); if (length < 8) { - for(int i = 0; i < length; i++) + switch(length) { - dst[i] *= scale; + case 7: dst[6] *= scale; goto case 6; + case 6: dst[5] *= scale; goto case 5; + case 5: dst[4] *= scale; goto case 4; + case 4: dst[3] *= scale; goto case 3; + case 3: dst[2] *= scale; goto case 2; + case 2: dst[1] *= scale; goto case 1; + case 1: dst[0] *= scale; break; } return; } @@ -499,7 +504,6 @@ public static unsafe void Scale(float scale, Span dst) nuint address = (nuint)(pd); int misalignment = (int)(address % 32); int remainder = 0; - float* pDstCurrent = pd; if ((misalignment & 3) != 0) { diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 77eded3f55..5b50e667fe 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -460,17 +460,18 @@ public static unsafe void Scale(float scale, Span dst) fixed (uint* pTrailingAlignmentMask = &TrailingAlignmentMask[0]) fixed (float* pd = dst) { - float* pdLim = pd + dst.Length; - + float* pDstCurrent = pd; int length = dst.Length; Vector128 scaleVector128 = Sse.SetAllVector128(scale); if (length < 4) { // Handle cases where we have less than 128-bits total and can't ever use SIMD acceleration. 
- for (int i = 0; i < length; i++) + switch (length) { - dst[i] *= scale; + case 3: dst[2] *= scale; goto case 2; + case 2: dst[1] *= scale; goto case 1; + case 1: dst[0] *= scale; break; } return; } @@ -478,7 +479,6 @@ public static unsafe void Scale(float scale, Span dst) nuint address = (nuint)(pd); int misalignment = (int)(address % 16); int remainder = 0; - float* pDstCurrent = pd; if ((misalignment & 3) != 0) { diff --git a/src/Native/CpuMathNative/Sse.cpp b/src/Native/CpuMathNative/Sse.cpp index 101af8a5f8..dab94d1b7f 100644 --- a/src/Native/CpuMathNative/Sse.cpp +++ b/src/Native/CpuMathNative/Sse.cpp @@ -1448,16 +1448,15 @@ EXPORT_API(void) AddScalarU(float a, _Inout_ float * pd, int c) EXPORT_API(void) Scale(float a, _Inout_ float * pd, int c) { - float * pdLim = pd + c; __m128 x1 = _mm_set1_ps(a); if (c < 4) { - for (; pd < pdLim; pd++) + switch(c) { - __m128 x2 = _mm_load_ss(pd); - x2 = _mm_mul_ss(x1, x2); - _mm_store_ss(pd, x2); + case 3: pd[2] *= a; + case 2: pd[1] *= a; + case 1: pd[0] *= a; } return; } From ed64154adc752531cdf1ca6b928c73b35d0d22aa Mon Sep 17 00:00:00 2001 From: Anipik Date: Tue, 9 Oct 2018 11:49:20 -0700 Subject: [PATCH 5/7] Remainder block corrected --- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 28 +++++++++++------------ src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 28 +++++++++++------------ 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index 5e450a0e84..d27729497b 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -571,27 +571,27 @@ public static unsafe void Scale(float scale, Span dst) // unaligned loads where we mask the input each time. remainder = length; } + } - if (remainder != 0) - { - // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next - // unaligned load will read to the end of the array and then mask out any elements already processed + if (remainder != 0) + { + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed - pDstCurrent -= (8 - remainder); + pDstCurrent -= (8 - remainder); - Vector256 result = Avx.LoadVector256(pDstCurrent); + Vector256 result = Avx.LoadVector256(pDstCurrent); - Vector256 trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (remainder * 8)); - Vector256 leadingMask = Avx.LoadVector256(((float*)(pLeadingAlignmentMask)) + (( 8 - remainder) * 8)); + Vector256 trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (remainder * 8)); + Vector256 leadingMask = Avx.LoadVector256(((float*)(pLeadingAlignmentMask)) + ((8 - remainder) * 8)); - Vector256 temp = Avx.And(result, trailingMask); - result = Avx.And(result, leadingMask); + Vector256 temp = Avx.And(result, trailingMask); + result = Avx.And(result, leadingMask); - temp = Avx.Multiply(scaleVector256, temp); - temp = Avx.Or(temp, result); + temp = Avx.Multiply(scaleVector256, temp); + temp = Avx.Or(temp, result); - Avx.Store(pDstCurrent, temp); - } + Avx.Store(pDstCurrent, temp); } } } diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 5b50e667fe..2faca55058 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -558,27 +558,27 @@ public static unsafe void Scale(float scale, Span dst) // 
unaligned loads where we mask the input each time. remainder = length; } + } - if (remainder != 0) - { - // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next - // unaligned load will read to the end of the array and then mask out any elements already processed + if (remainder != 0) + { + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed - pDstCurrent -= (4 - remainder); + pDstCurrent -= (4 - remainder); - Vector128 result = Sse.LoadVector128(pDstCurrent); + Vector128 result = Sse.LoadVector128(pDstCurrent); - Vector128 trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + (remainder * 4)); - Vector128 leadingMask = Sse.LoadVector128(((float*)(pLeadingAlignmentMask)) + ((4 - remainder) * 4)); + Vector128 trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + (remainder * 4)); + Vector128 leadingMask = Sse.LoadVector128(((float*)(pLeadingAlignmentMask)) + ((4 - remainder) * 4)); - Vector128 temp = Sse.And(result, trailingMask); - result = Sse.And(result, leadingMask); + Vector128 temp = Sse.And(result, trailingMask); + result = Sse.And(result, leadingMask); - temp = Sse.Multiply(scaleVector128, temp); - temp = Sse.Or(temp, result); + temp = Sse.Multiply(scaleVector128, temp); + temp = Sse.Or(temp, result); - Sse.Store(pDstCurrent, temp); - } + Sse.Store(pDstCurrent, temp); } } } From c49e6a24db7cc313fc7cbc33778ffd48d5162f60 Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 10 Oct 2018 10:39:12 -0700 Subject: [PATCH 6/7] avx.isSupported path removed --- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 30 ++++++----------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 2faca55058..70c645ced6 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -89,7 +89,7 @@ internal static unsafe void Store4(in Vector128 x, float* dst, int* idx) } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - internal static Vector128 VectorSum128(in Vector128 vector) + internal static Vector128 VectorSum128(in Vector128 vector) { if (Sse3.IsSupported) { @@ -527,28 +527,12 @@ public static unsafe void Scale(float scale, Span dst) for (float* pEnd = pDstCurrent + (length - remainder); pDstCurrent < pEnd; pDstCurrent += 4) { - if (Avx.IsSupported) - { - // The JIT will only fold away unaligned loads due to the semantics behind - // the VEX-encoding of the memory operand for `ins xmm, xmm, [mem]`. Since - // modern hardware has unaligned loads that are as fast as aligned loads, - // when it doesn't cross a cache-line/page boundary, we will just assert - // that the alignment is correct and allow for the more-efficient codegen. - - Contracts.Assert(((nuint)(pDstCurrent) % 16) == 0); - Vector128 temp = Sse.LoadVector128(pDstCurrent); - temp = Sse.Multiply(scaleVector128, temp); - Sse.Store(pDstCurrent, temp); - } - else - { - // If we aren't using the VEX-encoding, then the reverse is true and the JIT - // will only fold away aligned loads (due to semantics of the legacy encoding). - // We don't need an assert, since the instruction will throw for unaligned inputs. 
- Vector128 temp = Sse.LoadAlignedVector128(pDstCurrent); - temp = Sse.Multiply(scaleVector128, temp); - Sse.Store(pDstCurrent, temp); - } + // If we aren't using the VEX-encoding, then the reverse is true and the JIT + // will only fold away aligned loads (due to semantics of the legacy encoding). + // We don't need an assert, since the instruction will throw for unaligned inputs. + Vector128 temp = Sse.LoadAlignedVector128(pDstCurrent); + temp = Sse.Multiply(scaleVector128, temp); + Sse.Store(pDstCurrent, temp); } } else From 061e5b34db1073506f29cef5cb5ab8eadbf3a819 Mon Sep 17 00:00:00 2001 From: Anipik Date: Wed, 10 Oct 2018 12:50:22 -0700 Subject: [PATCH 7/7] correcting comments and removing return statement --- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 5 ++--- src/Native/CpuMathNative/Sse.cpp | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 70c645ced6..ede8a7da97 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -522,13 +522,12 @@ public static unsafe void Scale(float scale, Span dst) if (length > 4) { // Handle all the 128-bit blocks that we can now that we have offset to an aligned address - remainder = length % 4; for (float* pEnd = pDstCurrent + (length - remainder); pDstCurrent < pEnd; pDstCurrent += 4) { - // If we aren't using the VEX-encoding, then the reverse is true and the JIT - // will only fold away aligned loads (due to semantics of the legacy encoding). + // If we aren't using the VEX-encoding, the JIT will only fold away aligned loads + // (due to semantics of the legacy encoding). // We don't need an assert, since the instruction will throw for unaligned inputs. Vector128 temp = Sse.LoadAlignedVector128(pDstCurrent); temp = Sse.Multiply(scaleVector128, temp); diff --git a/src/Native/CpuMathNative/Sse.cpp b/src/Native/CpuMathNative/Sse.cpp index dab94d1b7f..4a2d30e979 100644 --- a/src/Native/CpuMathNative/Sse.cpp +++ b/src/Native/CpuMathNative/Sse.cpp @@ -1541,7 +1541,6 @@ EXPORT_API(void) Scale(float a, _Inout_ float * pd, int c) result = _mm_or_ps(temp, result); _mm_storeu_ps(pd, result); - return; } }