diff --git a/src/Microsoft.ML.CpuMath/Avx.cs b/src/Microsoft.ML.CpuMath/Avx.cs index 6dcf898b6f..5d4610d9bc 100644 --- a/src/Microsoft.ML.CpuMath/Avx.cs +++ b/src/Microsoft.ML.CpuMath/Avx.cs @@ -625,7 +625,7 @@ public static void Scale(float a, float[] dst, int count) unsafe { fixed (float* pd = &dst[0]) - Thunk.ScaleU(a, pd, count); + Thunk.Scale(a, pd, count); } } diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index b31a427139..d27729497b 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -13,11 +13,36 @@ using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +using nuint = System.UInt64; namespace Microsoft.ML.Runtime.Internal.CpuMath { internal static class AvxIntrinsics { + public static readonly uint[] LeadingAlignmentMask = new uint[64] + { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, + }; + + public static readonly uint[] TrailingAlignmentMask = new uint[64] + { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + }; + private static readonly Vector256 _absMask256 = Avx.StaticCast(Avx.SetAllVector256(0x7FFFFFFF)); private const int Vector256Alignment = 32; @@ -451,45 +476,122 @@ public static unsafe void AddScalarU(float scalar, Span dst) } } - public static unsafe void ScaleU(float scale, Span dst) + public static unsafe void Scale(float scale, Span dst) { - fixed (float* pdst = dst) + fixed (uint* pLeadingAlignmentMask = &LeadingAlignmentMask[0]) + fixed (uint* pTrailingAlignmentMask = &TrailingAlignmentMask[0]) + fixed (float* pd = dst) { - float* pDstCurrent = pdst; - float* pEnd = pdst + dst.Length; - + float* pDstCurrent = pd; + int length = dst.Length; Vector256 scaleVector256 = Avx.SetAllVector256(scale); - while (pDstCurrent + 8 <= pEnd) + if (length < 8) { - Vector256 dstVector = Avx.LoadVector256(pDstCurrent); + switch(length) + { + case 7: dst[6] *= scale; goto case 6; + case 6: dst[5] *= scale; goto case 5; + case 5: dst[4] *= scale; goto case 4; + case 4: dst[3] *= 
scale; goto case 3; + case 3: dst[2] *= scale; goto case 2; + case 2: dst[1] *= scale; goto case 1; + case 1: dst[0] *= scale; break; + } + return; + } - dstVector = Avx.Multiply(scaleVector256, dstVector); - Avx.Store(pDstCurrent, dstVector); + nuint address = (nuint)(pd); + int misalignment = (int)(address % 32); + int remainder = 0; - pDstCurrent += 8; + if ((misalignment & 3) != 0) + { + // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations + remainder = length % 8; + + for (float* pEnd = pd + (length - remainder); pDstCurrent < pEnd; pDstCurrent += 8) + { + Vector256 temp = Avx.LoadVector256(pDstCurrent); + temp = Avx.Multiply(scaleVector256, temp); + Avx.Store(pDstCurrent, temp); + } } + else + { + if (misalignment != 0) + { + // Handle cases where the data is not 256-bit aligned by doing an unaligned read and then + // masking any elements that will be included in the first aligned read - Vector128 scaleVector128 = Sse.SetAllVector128(scale); + misalignment >>= 2; + misalignment = 8 - misalignment; - if (pDstCurrent + 4 <= pEnd) - { - Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + Vector256 result = Avx.LoadVector256(pDstCurrent); - dstVector = Sse.Multiply(scaleVector128, dstVector); - Sse.Store(pDstCurrent, dstVector); + Vector256 leadingMask = Avx.LoadVector256(((float*)(pLeadingAlignmentMask)) + (misalignment * 8)); + Vector256 trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (( 8 - misalignment) * 8)); - pDstCurrent += 4; + Vector256 temp = Avx.And(result, leadingMask); + result = Avx.And(result, trailingMask); + + temp = Avx.Multiply(scaleVector256, temp); + result = Avx.Or(temp, result); + + Avx.Store(pDstCurrent, result); + + pDstCurrent += misalignment; + length -= misalignment; + } + + if (length > 7) + { + // Handle all the 256-bit blocks that we can now that we have offset to an aligned address + + remainder = length % 8; + + for (float* pEnd = pDstCurrent + (length - remainder); pDstCurrent < pEnd; pDstCurrent += 8) + { + // The JIT will only fold away unaligned loads due to the semantics behind + // the VEX-encoding of the memory operand for `ins xmm, xmm, [mem]`. Since + // modern hardware has unaligned loads that are as fast as aligned loads, + // when it doesn't cross a cache-line/page boundary, we will just assert + // that the alignment is correct and allow for the more-efficient codegen. + + Contracts.Assert(((nuint)(pDstCurrent) % 32) == 0); + Vector256 temp = Avx.LoadVector256(pDstCurrent); + temp = Avx.Multiply(scaleVector256, temp); + Avx.Store(pDstCurrent, temp); + } + } + else + { + // Handle the "worst-case" scenario, which is when we have 8-16 elements and the input is not + // 256-bit aligned. This means we can't do any aligned loads and will just end up doing two + // unaligned loads where we mask the input each time. 
+ remainder = length; + } } - while (pDstCurrent < pEnd) + if (remainder != 0) { - Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed - dstVector = Sse.MultiplyScalar(scaleVector128, dstVector); - Sse.StoreScalar(pDstCurrent, dstVector); + pDstCurrent -= (8 - remainder); - pDstCurrent++; + Vector256 result = Avx.LoadVector256(pDstCurrent); + + Vector256 trailingMask = Avx.LoadVector256(((float*)(pTrailingAlignmentMask)) + (remainder * 8)); + Vector256 leadingMask = Avx.LoadVector256(((float*)(pLeadingAlignmentMask)) + ((8 - remainder) * 8)); + + Vector256 temp = Avx.And(result, trailingMask); + result = Avx.And(result, leadingMask); + + temp = Avx.Multiply(scaleVector256, temp); + temp = Avx.Or(temp, result); + + Avx.Store(pDstCurrent, temp); } } } diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index f15f5c3938..09738684f5 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -248,11 +248,11 @@ private static void Scale(float a, Span dst) { if (Avx.IsSupported) { - AvxIntrinsics.ScaleU(a, dst); + AvxIntrinsics.Scale(a, dst); } else if (Sse.IsSupported) { - SseIntrinsics.ScaleU(a, dst); + SseIntrinsics.Scale(a, dst); } else { diff --git a/src/Microsoft.ML.CpuMath/Sse.cs b/src/Microsoft.ML.CpuMath/Sse.cs index 13de22dd5b..d541c02533 100644 --- a/src/Microsoft.ML.CpuMath/Sse.cs +++ b/src/Microsoft.ML.CpuMath/Sse.cs @@ -606,7 +606,7 @@ public static void Scale(float a, AlignedArray dst) unsafe { fixed (float* pdst = &dst.Items[0]) - Thunk.ScaleA(a, Ptr(dst, pdst), dst.Size); + Thunk.Scale(a, Ptr(dst, pdst), dst.Size); } } @@ -618,7 +618,7 @@ public static void Scale(float a, float[] dst, int count) unsafe { fixed (float* pd = &dst[0]) - Thunk.ScaleU(a, pd, count); + Thunk.Scale(a, pd, count); } } @@ -631,7 +631,7 @@ public static void Scale(float a, float[] dst, int offset, int count) unsafe { fixed (float* pd = &dst[offset]) - Thunk.ScaleU(a, pd, count); + Thunk.Scale(a, pd, count); } } diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 4c36d0094e..ede8a7da97 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -17,6 +17,7 @@ using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +using nuint = System.UInt64; namespace Microsoft.ML.Runtime.Internal.CpuMath { @@ -29,6 +30,22 @@ internal static class SseIntrinsics // The count of bytes in Vector128, corresponding to _cbAlign in AlignedArray private const int Vector128Alignment = 16; + public static readonly uint[] LeadingAlignmentMask = new uint[16] + { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, + }; + + public static readonly uint[] TrailingAlignmentMask = new uint[16] + { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, + }; + [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] private static bool 
HasCompatibleAlignment(AlignedArray alignedArray) { @@ -72,7 +89,7 @@ internal static unsafe void Store4(in Vector128 x, float* dst, int* idx) } [MethodImplAttribute(MethodImplOptions.AggressiveInlining)] - internal static Vector128 VectorSum128(in Vector128 vector) + internal static Vector128 VectorSum128(in Vector128 vector) { if (Sse3.IsSupported) { @@ -437,33 +454,114 @@ public static unsafe void AddScalarU(float scalar, Span dst) } } - public static unsafe void ScaleU(float scale, Span dst) + public static unsafe void Scale(float scale, Span dst) { - fixed (float* pdst = dst) + fixed (uint* pLeadingAlignmentMask = &LeadingAlignmentMask[0]) + fixed (uint* pTrailingAlignmentMask = &TrailingAlignmentMask[0]) + fixed (float* pd = dst) { - float* pDstCurrent = pdst; - float* pEnd = pdst + dst.Length; + float* pDstCurrent = pd; + int length = dst.Length; + Vector128 scaleVector128 = Sse.SetAllVector128(scale); - Vector128 scaleVector = Sse.SetAllVector128(scale); + if (length < 4) + { + // Handle cases where we have less than 128-bits total and can't ever use SIMD acceleration. + switch (length) + { + case 3: dst[2] *= scale; goto case 2; + case 2: dst[1] *= scale; goto case 1; + case 1: dst[0] *= scale; break; + } + return; + } - while (pDstCurrent + 4 <= pEnd) + nuint address = (nuint)(pd); + int misalignment = (int)(address % 16); + int remainder = 0; + + if ((misalignment & 3) != 0) { - Vector128 dstVector = Sse.LoadVector128(pDstCurrent); + // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations + remainder = length % 4; - dstVector = Sse.Multiply(scaleVector, dstVector); - Sse.Store(pDstCurrent, dstVector); + for (float* pEnd = pd + (length - remainder); pDstCurrent < pEnd; pDstCurrent += 4) + { + Vector128 temp = Sse.LoadVector128(pDstCurrent); + temp = Sse.Multiply(scaleVector128, temp); + Sse.Store(pDstCurrent, temp); + } + } + else + { + if (misalignment != 0) + { + // Handle cases where the data is not 128-bit aligned by doing an unaligned read and then + // masking any elements that will be included in the first aligned read - pDstCurrent += 4; + misalignment >>= 2; + misalignment = 4 - misalignment; + + Vector128 result = Sse.LoadVector128(pDstCurrent); + + Vector128 leadingMask = Sse.LoadVector128(((float*)(pLeadingAlignmentMask)) + (misalignment * 4)); + Vector128 trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + ((4 - misalignment) * 4)); + + Vector128 temp = Sse.And(result, leadingMask); + result = Sse.And(result, trailingMask); + + temp = Sse.Multiply(scaleVector128, temp); + result = Sse.Or(temp, result); + + Sse.Store(pDstCurrent, result); + + pDstCurrent += misalignment; + length -= misalignment; + } + + if (length > 4) + { + // Handle all the 128-bit blocks that we can now that we have offset to an aligned address + remainder = length % 4; + + for (float* pEnd = pDstCurrent + (length - remainder); pDstCurrent < pEnd; pDstCurrent += 4) + { + // If we aren't using the VEX-encoding, the JIT will only fold away aligned loads + // (due to semantics of the legacy encoding). + // We don't need an assert, since the instruction will throw for unaligned inputs. + Vector128 temp = Sse.LoadAlignedVector128(pDstCurrent); + temp = Sse.Multiply(scaleVector128, temp); + Sse.Store(pDstCurrent, temp); + } + } + else + { + // Handle the "worst-case" scenario, which is when we have 4-8 elements and the input is not + // 128-bit aligned. 
This means we can't do any aligned loads and will just end up doing two + // unaligned loads where we mask the input each time. + remainder = length; + } } - while (pDstCurrent < pEnd) + if (remainder != 0) { - Vector128 dstVector = Sse.LoadScalarVector128(pDstCurrent); + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed - dstVector = Sse.MultiplyScalar(scaleVector, dstVector); - Sse.StoreScalar(pDstCurrent, dstVector); + pDstCurrent -= (4 - remainder); - pDstCurrent++; + Vector128 result = Sse.LoadVector128(pDstCurrent); + + Vector128 trailingMask = Sse.LoadVector128(((float*)(pTrailingAlignmentMask)) + (remainder * 4)); + Vector128 leadingMask = Sse.LoadVector128(((float*)(pLeadingAlignmentMask)) + ((4 - remainder) * 4)); + + Vector128 temp = Sse.And(result, trailingMask); + result = Sse.And(result, leadingMask); + + temp = Sse.Multiply(scaleVector128, temp); + temp = Sse.Or(temp, result); + + Sse.Store(pDstCurrent, temp); } } } diff --git a/src/Microsoft.ML.CpuMath/Thunk.cs b/src/Microsoft.ML.CpuMath/Thunk.cs index 1053f75b75..23192fc277 100644 --- a/src/Microsoft.ML.CpuMath/Thunk.cs +++ b/src/Microsoft.ML.CpuMath/Thunk.cs @@ -169,9 +169,7 @@ public static extern void AddXYTranGradPX(/*const*/ float* px, /*const*/ int* pp float decay, float cond, int crow, int ccol); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ScaleU(float a, float* pd, int c); - [DllImport(NativePath), SuppressUnmanagedCodeSecurity] - public static extern void ScaleA(float a, float* pd, int c); + public static extern void Scale(float a, float* pd, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] public static extern void ScaleX(float a, float* pd, int c); [DllImport(NativePath), SuppressUnmanagedCodeSecurity] diff --git a/src/Native/CpuMathNative/Sse.cpp b/src/Native/CpuMathNative/Sse.cpp index c5221f0020..4a2d30e979 100644 --- a/src/Native/CpuMathNative/Sse.cpp +++ b/src/Native/CpuMathNative/Sse.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #define _load1(ps, pi) \ _mm_set_ss(ps[pi[0]]) @@ -85,6 +86,22 @@ DWORD xmmYmmStateSupport() #endif +const unsigned int LeadingAlignmentMask[16] = +{ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, +}; + +const unsigned int TrailingAlignmentMask[16] = +{ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF, + 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, + 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, +}; + // Test whether Avx is available. 
EXPORT_API(bool) ChkAvx() { @@ -1429,36 +1446,101 @@ EXPORT_API(void) AddScalarU(float a, _Inout_ float * pd, int c) } } -EXPORT_API(void) ScaleU(float a, _Inout_ float * pd, int c) +EXPORT_API(void) Scale(float a, _Inout_ float * pd, int c) { - float * pdLim = pd + c; - __m128 x1 = _mm_set1_ps(a); - for (; pd + 4 <= pdLim; pd += 4) + + if (c < 4) { - __m128 x2 = _mm_loadu_ps(pd); - x2 = _mm_mul_ps(x1, x2); - _mm_storeu_ps(pd, x2); + switch(c) + { + case 3: pd[2] *= a; + case 2: pd[1] *= a; + case 1: pd[0] *= a; + } + return; } - for (; pd < pdLim; pd++) + uintptr_t address = (uintptr_t)(pd); + uintptr_t misalignment = address % 16; + int remainder = 0; + + if ((misalignment & 3) != 0) { - __m128 x2 = _mm_load_ss(pd); - x2 = _mm_mul_ss(x1, x2); - _mm_store_ss(pd, x2); + // Handles cases where the data is not 32-bit aligned and we can't ever use aligned operations + remainder = c % 4; + + for (const float* pEnd = pd + (c - remainder); pd < pEnd; pd += 4) + { + __m128 x2 = _mm_loadu_ps(pd); + x2 = _mm_mul_ps(x1, x2); + _mm_storeu_ps(pd, x2); + } } -} + else + { + if (misalignment != 0) + { + // Handle cases where the data is not 128-bit aligned by doing an unaligned read and then + // masking any elements that will be included in the first aligned read + misalignment >>= 2; + misalignment = 4 - misalignment; + + __m128 result = _mm_loadu_ps(pd); + + __m128 leadingMask = _mm_loadu_ps(((float*)(&LeadingAlignmentMask)) + (misalignment * 4)); + __m128 trailingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + ((4 - misalignment) * 4)); + + __m128 temp = _mm_and_ps(result, leadingMask); + result = _mm_and_ps(result, trailingMask); + + temp = _mm_mul_ps(temp, x1); + result = _mm_or_ps(temp, result); + + _mm_storeu_ps(pd, result); + + pd += misalignment; + c -= misalignment; + } -EXPORT_API(void) ScaleA(float a, _Inout_ float * pd, int c) -{ - float * pdLim = pd + c; + if (c > 3) + { + // Handle all the 128-bit blocks that we can now that we have offset to an aligned address + remainder = c % 4; + for (const float* pEnd = pd + (c - remainder); pd < pEnd; pd += 4) + { + __m128 x2 = _mm_load_ps(pd); + x2 = _mm_mul_ps(x1, x2); + _mm_storeu_ps(pd, x2); + } + } + else + { + // Handle the "worst-case" scenario, which is when we have 4-8 elements and the input is not + // 128-bit aligned. This means we can't do any aligned loads and will just end up doing two + // unaligned loads where we mask the input each time. 
+ remainder = c; + } + } - __m128 x1 = _mm_set1_ps(a); - for (; pd < pdLim; pd += 4) - { - __m128 x2 = _mm_load_ps(pd); - x2 = _mm_mul_ps(x1, x2); - _mm_store_ps(pd, x2); + if (remainder != 0) + { + // Handle any trailing elements that don't fit into a 128-bit block by moving back so that the next + // unaligned load will read to the end of the array and then mask out any elements already processed + + pd -= (4 - remainder); + __m128 result = _mm_loadu_ps(pd); + + __m128 trailingMask = _mm_loadu_ps(((float*)(&TrailingAlignmentMask)) + (remainder * 4)); + __m128 leadingMask = _mm_loadu_ps(((float*)(&LeadingAlignmentMask)) + ((4 - remainder) * 4)); + + __m128 temp = _mm_and_ps(result, trailingMask); + result = _mm_and_ps(result, leadingMask); + + temp = _mm_mul_ps(temp, x1); + result = _mm_or_ps(temp, result); + + _mm_storeu_ps(pd, result); } } diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs index 8e691b0847..5930090bc5 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/AvxPerformanceTests.cs @@ -16,8 +16,8 @@ public void AddScalarU() => AvxIntrinsics.AddScalarU(DefaultScale, new Span(dst, 0, Length)); [Benchmark] - public void ScaleU() - => AvxIntrinsics.ScaleU(DefaultScale, new Span(dst, 0, Length)); + public void Scale() + => AvxIntrinsics.Scale(DefaultScale, new Span(dst, 0, Length)); [Benchmark] public void ScaleSrcU() diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs index 8df3352556..83b412e61d 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs @@ -23,8 +23,8 @@ internal static class CpuMathNativeUtils [DllImport("CpuMathNative", EntryPoint = "AddScalarU"), SuppressUnmanagedCodeSecurity] internal static extern unsafe float AddScalarU(float a, /*_Inout_*/ float* pd, int c); - [DllImport("CpuMathNative", EntryPoint = "ScaleU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void ScaleU(float a, /*_Inout_*/ float* pd, int c); + [DllImport("CpuMathNative", EntryPoint = "Scale"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void Scale(float a, /*_Inout_*/ float* pd, int c); [DllImport("CpuMathNative", EntryPoint = "ScaleSrcU"), SuppressUnmanagedCodeSecurity] internal static extern unsafe void ScaleSrcU(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c); diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/NativePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/NativePerformanceTests.cs index b992ed3001..3cce45046c 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/NativePerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/NativePerformanceTests.cs @@ -21,11 +21,11 @@ public unsafe void AddScalarU() } [Benchmark] - public unsafe void ScaleU() + public unsafe void Scale() { fixed (float* pdst = dst) { - CpuMathNativeUtils.ScaleU(DefaultScale, pdst, Length); + CpuMathNativeUtils.Scale(DefaultScale, pdst, Length); } } diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs index 046e065955..923d7c539f 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs @@ 
-16,8 +16,8 @@ public void AddScalarU()
             => SseIntrinsics.AddScalarU(DefaultScale, new Span<float>(dst, 0, Length));
 
         [Benchmark]
-        public void ScaleU()
-            => SseIntrinsics.ScaleU(DefaultScale, new Span<float>(dst, 0, Length));
+        public void Scale()
+            => SseIntrinsics.Scale(DefaultScale, new Span<float>(dst, 0, Length));
 
         [Benchmark]
         public void ScaleSrcU()
diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs
index 7284ce6c1a..6c0f1cbaf0 100644
--- a/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs
+++ b/test/Microsoft.ML.CpuMath.UnitTests.netcoreapp/UnitTests.cs
@@ -233,7 +233,7 @@ public void AddScalarUTest(int test)
         [Theory]
         [InlineData(0)]
         [InlineData(1)]
-        public void ScaleUTest(int test)
+        public void ScaleTest(int test)
         {
             float[] dst = (float[])_testArrays[test].Clone();
             float[] expected = (float[])dst.Clone();
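
Note on the pattern shared by the managed and native Scale rewrites above: each variant scales a possibly misaligned head and tail with masked unaligned loads, and reserves aligned loads for the 128-bit/256-bit aligned body. The sketch below is a minimal, standalone illustration of that masking scheme with SSE intrinsics. It is not the patched ML.NET code: the helper name scale_inplace and the main() driver are invented for the example; only the LeadingAlignmentMask/TrailingAlignmentMask tables and the load/and/mul/or/store sequence are taken from the Sse.cpp changes above.

// Minimal sketch (not part of this patch): in-place scale of a float buffer with SSE,
// using the patch's leading/trailing mask tables to handle a misaligned head and tail.
#include <xmmintrin.h>
#include <cstdint>
#include <cstdio>

// Row i keeps the first i lanes (leading) or the last i lanes (trailing) of a 4-float block.
static const unsigned int LeadingAlignmentMask[16] =
{
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000,
};

static const unsigned int TrailingAlignmentMask[16] =
{
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF,
    0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF,
    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
};

static void scale_inplace(float a, float* pd, int c)
{
    __m128 x1 = _mm_set1_ps(a);

    if (c < 4)
    {
        for (int i = 0; i < c; i++)
            pd[i] *= a;
        return;
    }

    uintptr_t misalignment = ((uintptr_t)pd) % 16;
    if ((misalignment & 3) != 0)
    {
        // Data that is not even 4-byte aligned can never be realigned; use unaligned ops throughout.
        int rem = c % 4;
        for (const float* pEnd = pd + (c - rem); pd < pEnd; pd += 4)
            _mm_storeu_ps(pd, _mm_mul_ps(x1, _mm_loadu_ps(pd)));
        for (int i = 0; i < rem; i++)
            pd[i] *= a;
        return;
    }

    if (misalignment != 0)
    {
        // Head: one unaligned load/store covering the block up to the first 16-byte boundary.
        // Only the leading lanes are scaled; the trailing lanes are written back unchanged.
        int lanes = 4 - (int)(misalignment >> 2);

        __m128 block = _mm_loadu_ps(pd);
        __m128 lead  = _mm_loadu_ps((const float*)LeadingAlignmentMask  + lanes * 4);
        __m128 trail = _mm_loadu_ps((const float*)TrailingAlignmentMask + (4 - lanes) * 4);
        __m128 scaled = _mm_mul_ps(x1, _mm_and_ps(block, lead));
        _mm_storeu_ps(pd, _mm_or_ps(scaled, _mm_and_ps(block, trail)));

        pd += lanes;
        c -= lanes;
    }

    // Body: pd is now 16-byte aligned, so aligned loads/stores are safe.
    int remainder = c % 4;
    for (const float* pEnd = pd + (c - remainder); pd < pEnd; pd += 4)
        _mm_store_ps(pd, _mm_mul_ps(x1, _mm_load_ps(pd)));

    if (remainder != 0)
    {
        // Tail: back up so a final unaligned load ends exactly at the end of the buffer,
        // then scale only the trailing lanes; the leading lanes were already scaled above.
        pd -= (4 - remainder);

        __m128 block = _mm_loadu_ps(pd);
        __m128 trail = _mm_loadu_ps((const float*)TrailingAlignmentMask + remainder * 4);
        __m128 lead  = _mm_loadu_ps((const float*)LeadingAlignmentMask  + (4 - remainder) * 4);
        __m128 scaled = _mm_mul_ps(x1, _mm_and_ps(block, trail));
        _mm_storeu_ps(pd, _mm_or_ps(scaled, _mm_and_ps(block, lead)));
    }
}

int main()
{
    float buf[11];
    for (int i = 0; i < 11; i++) buf[i] = (float)i;
    scale_inplace(2.0f, buf + 1, 10);   // deliberately misaligned start, non-multiple-of-4 count
    for (int i = 0; i < 11; i++) printf("%g ", buf[i]);
    printf("\n");                       // expected: 0 2 4 6 8 10 12 14 16 18 20
    return 0;
}

The managed SseIntrinsics.Scale and AvxIntrinsics.Scale above follow the same three phases with Vector128<float> and Vector256<float>, the AVX variant using 8-float blocks and 32-byte alignment; backing up for the tail lets the head and tail each be handled with a single full-width masked store instead of a scalar cleanup loop.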