From ab45576ab4f2658713083804bb7581fc32381488 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 5 Jan 2022 14:09:32 -0800 Subject: [PATCH 1/2] Change the BitArray(bool[]) constructor to use the xplat intrinsics --- .../src/System/Collections/BitArray.cs | 87 +++++-------------- 1 file changed, 20 insertions(+), 67 deletions(-) diff --git a/src/libraries/System.Collections/src/System/Collections/BitArray.cs b/src/libraries/System.Collections/src/System/Collections/BitArray.cs index 9f8531a9e818a..a643a75f1b595 100644 --- a/src/libraries/System.Collections/src/System/Collections/BitArray.cs +++ b/src/libraries/System.Collections/src/System/Collections/BitArray.cs @@ -4,9 +4,11 @@ using System.Buffers.Binary; using System.Diagnostics; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics.Arm; +using Internal.Runtime.CompilerServices; namespace System.Collections { @@ -145,81 +147,32 @@ public unsafe BitArray(bool[] values) // (true for any non-zero values, false for 0) - any values between 2-255 will be interpreted as false. // Instead, We compare with zeroes (== false) then negate the result to ensure compatibility. - if (Avx2.IsSupported) + ref byte value = ref Unsafe.As(ref MemoryMarshal.GetArrayDataReference(values)); + + if (Vector256.IsHardwareAccelerated) { - // JIT does not support code hoisting for SIMD yet - Vector256 zero = Vector256.Zero; - fixed (bool* ptr = values) + for (; (i + Vector256ByteCount) <= (uint)values.Length; i += Vector256ByteCount) { - for (; (i + Vector256ByteCount) <= (uint)values.Length; i += Vector256ByteCount) - { - Vector256 vector = Avx.LoadVector256((byte*)ptr + i); - Vector256 isFalse = Avx2.CompareEqual(vector, zero); - int result = Avx2.MoveMask(isFalse); - m_array[i / 32u] = ~result; - } + Vector256 vector = Vector256.LoadUnsafe(ref value, i); + Vector256 isFalse = Vector256.Equals(vector, Vector256.Zero); + + uint result = isFalse.ExtractMostSignificantBits(); + m_array[i / 32u] = (int)(~result); } } - else if (Sse2.IsSupported) + else if (Vector128.IsHardwareAccelerated) { - // JIT does not support code hoisting for SIMD yet - Vector128 zero = Vector128.Zero; - fixed (bool* ptr = values) + for (; (i + Vector128ByteCount * 2u) <= (uint)values.Length; i += Vector128ByteCount * 2u) { - for (; (i + Vector128ByteCount * 2u) <= (uint)values.Length; i += Vector128ByteCount * 2u) - { - Vector128 lowerVector = Sse2.LoadVector128((byte*)ptr + i); - Vector128 lowerIsFalse = Sse2.CompareEqual(lowerVector, zero); - int lowerPackedIsFalse = Sse2.MoveMask(lowerIsFalse); + Vector128 lowerVector = Vector128.LoadUnsafe(ref value, i); + Vector128 lowerIsFalse = Vector128.Equals(lowerVector, Vector128.Zero); + uint lowerResult = lowerIsFalse.ExtractMostSignificantBits(); - Vector128 upperVector = Sse2.LoadVector128((byte*)ptr + i + Vector128.Count); - Vector128 upperIsFalse = Sse2.CompareEqual(upperVector, zero); - int upperPackedIsFalse = Sse2.MoveMask(upperIsFalse); + Vector128 upperVector = Vector128.LoadUnsafe(ref value, i + Vector128ByteCount); + Vector128 upperIsFalse = Vector128.Equals(upperVector, Vector128.Zero); + uint upperResult = upperIsFalse.ExtractMostSignificantBits(); - m_array[i / 32u] = ~((upperPackedIsFalse << 16) | lowerPackedIsFalse); - } - } - } - else if (AdvSimd.Arm64.IsSupported) - { - // JIT does not support code hoisting for SIMD yet - // However comparison against zero can be replaced to cmeq against zero (vceqzq_s8) - // See dotnet/runtime#33972 for details - Vector128 zero = Vector128.Zero; - Vector128 bitMask128 = BitConverter.IsLittleEndian ? - Vector128.Create(0x80402010_08040201).AsByte() : - Vector128.Create(0x01020408_10204080).AsByte(); - - fixed (bool* ptr = values) - { - for (; (i + Vector128ByteCount * 2u) <= (uint)values.Length; i += Vector128ByteCount * 2u) - { - // Same logic as SSE2 path, however we lack MoveMask (equivalent) instruction - // As a workaround, mask out the relevant bit after comparison - // and combine by ORing all of them together (In this case, adding all of them does the same thing) - Vector128 lowerVector = AdvSimd.LoadVector128((byte*)ptr + i); - Vector128 lowerIsFalse = AdvSimd.CompareEqual(lowerVector, zero); - Vector128 bitsExtracted1 = AdvSimd.And(lowerIsFalse, bitMask128); - bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1); - bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1); - bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1); - Vector128 lowerPackedIsFalse = bitsExtracted1.AsInt16(); - - Vector128 upperVector = AdvSimd.LoadVector128((byte*)ptr + i + Vector128.Count); - Vector128 upperIsFalse = AdvSimd.CompareEqual(upperVector, zero); - Vector128 bitsExtracted2 = AdvSimd.And(upperIsFalse, bitMask128); - bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2); - bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2); - bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2); - Vector128 upperPackedIsFalse = bitsExtracted2.AsInt16(); - - int result = AdvSimd.Arm64.ZipLow(lowerPackedIsFalse, upperPackedIsFalse).AsInt32().ToScalar(); - if (!BitConverter.IsLittleEndian) - { - result = BinaryPrimitives.ReverseEndianness(result); - } - m_array[i / 32u] = ~result; - } + m_array[i / 32u] = (int)(~((upperResult << 16) | lowerResult)); } } From 9b6ea8db9b65fc761bae964596d8f629b7f5108b Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 5 Jan 2022 15:23:34 -0800 Subject: [PATCH 2/2] Change the And, Or, Xor, and Not methods to use the xplat intrinsics --- .../src/System/Collections/BitArray.cs | 167 +++++------------- 1 file changed, 47 insertions(+), 120 deletions(-) diff --git a/src/libraries/System.Collections/src/System/Collections/BitArray.cs b/src/libraries/System.Collections/src/System/Collections/BitArray.cs index a643a75f1b595..925f175f7fcf8 100644 --- a/src/libraries/System.Collections/src/System/Collections/BitArray.cs +++ b/src/libraries/System.Collections/src/System/Collections/BitArray.cs @@ -353,43 +353,24 @@ public unsafe BitArray And(BitArray value) } uint i = 0; - if (Avx2.IsSupported) - { - fixed (int* leftPtr = thisArray) - fixed (int* rightPtr = valueArray) - { - for (; i < (uint)count - (Vector256IntCount - 1u); i += Vector256IntCount) - { - Vector256 leftVec = Avx.LoadVector256(leftPtr + i); - Vector256 rightVec = Avx.LoadVector256(rightPtr + i); - Avx.Store(leftPtr + i, Avx2.And(leftVec, rightVec)); - } - } - } - else if (Sse2.IsSupported) + + ref int left = ref MemoryMarshal.GetArrayDataReference(thisArray); + ref int right = ref MemoryMarshal.GetArrayDataReference(valueArray); + + if (Vector256.IsHardwareAccelerated) { - fixed (int* leftPtr = thisArray) - fixed (int* rightPtr = valueArray) + for (; i < (uint)count - (Vector256IntCount - 1u); i += Vector256IntCount) { - for (; i < (uint)count - (Vector128IntCount - 1u); i += Vector128IntCount) - { - Vector128 leftVec = Sse2.LoadVector128(leftPtr + i); - Vector128 rightVec = Sse2.LoadVector128(rightPtr + i); - Sse2.Store(leftPtr + i, Sse2.And(leftVec, rightVec)); - } + Vector256 result = Vector256.LoadUnsafe(ref left, i) & Vector256.LoadUnsafe(ref right, i); + result.StoreUnsafe(ref left, i); } } - else if (AdvSimd.IsSupported) + else if (Vector128.IsHardwareAccelerated) { - fixed (int* leftPtr = thisArray) - fixed (int* rightPtr = valueArray) + for (; i < (uint)count - (Vector128IntCount - 1u); i += Vector128IntCount) { - for (; i < (uint)count - (Vector128IntCount - 1u); i += Vector128IntCount) - { - Vector128 leftVec = AdvSimd.LoadVector128(leftPtr + i); - Vector128 rightVec = AdvSimd.LoadVector128(rightPtr + i); - AdvSimd.Store(leftPtr + i, AdvSimd.And(leftVec, rightVec)); - } + Vector128 result = Vector128.LoadUnsafe(ref left, i) & Vector128.LoadUnsafe(ref right, i); + result.StoreUnsafe(ref left, i); } } @@ -439,43 +420,24 @@ public unsafe BitArray Or(BitArray value) } uint i = 0; - if (Avx2.IsSupported) - { - fixed (int* leftPtr = thisArray) - fixed (int* rightPtr = valueArray) - { - for (; i < (uint)count - (Vector256IntCount - 1u); i += Vector256IntCount) - { - Vector256 leftVec = Avx.LoadVector256(leftPtr + i); - Vector256 rightVec = Avx.LoadVector256(rightPtr + i); - Avx.Store(leftPtr + i, Avx2.Or(leftVec, rightVec)); - } - } - } - else if (Sse2.IsSupported) + + ref int left = ref MemoryMarshal.GetArrayDataReference(thisArray); + ref int right = ref MemoryMarshal.GetArrayDataReference(valueArray); + + if (Vector256.IsHardwareAccelerated) { - fixed (int* leftPtr = thisArray) - fixed (int* rightPtr = valueArray) + for (; i < (uint)count - (Vector256IntCount - 1u); i += Vector256IntCount) { - for (; i < (uint)count - (Vector128IntCount - 1u); i += Vector128IntCount) - { - Vector128 leftVec = Sse2.LoadVector128(leftPtr + i); - Vector128 rightVec = Sse2.LoadVector128(rightPtr + i); - Sse2.Store(leftPtr + i, Sse2.Or(leftVec, rightVec)); - } + Vector256 result = Vector256.LoadUnsafe(ref left, i) | Vector256.LoadUnsafe(ref right, i); + result.StoreUnsafe(ref left, i); } } - else if (AdvSimd.IsSupported) + else if (Vector128.IsHardwareAccelerated) { - fixed (int* leftPtr = thisArray) - fixed (int* rightPtr = valueArray) + for (; i < (uint)count - (Vector128IntCount - 1u); i += Vector128IntCount) { - for (; i < (uint)count - (Vector128IntCount - 1u); i += Vector128IntCount) - { - Vector128 leftVec = AdvSimd.LoadVector128(leftPtr + i); - Vector128 rightVec = AdvSimd.LoadVector128(rightPtr + i); - AdvSimd.Store(leftPtr + i, AdvSimd.Or(leftVec, rightVec)); - } + Vector128 result = Vector128.LoadUnsafe(ref left, i) | Vector128.LoadUnsafe(ref right, i); + result.StoreUnsafe(ref left, i); } } @@ -525,43 +487,24 @@ public unsafe BitArray Xor(BitArray value) } uint i = 0; - if (Avx2.IsSupported) - { - fixed (int* leftPtr = m_array) - fixed (int* rightPtr = value.m_array) - { - for (; i < (uint)count - (Vector256IntCount - 1u); i += Vector256IntCount) - { - Vector256 leftVec = Avx.LoadVector256(leftPtr + i); - Vector256 rightVec = Avx.LoadVector256(rightPtr + i); - Avx.Store(leftPtr + i, Avx2.Xor(leftVec, rightVec)); - } - } - } - else if (Sse2.IsSupported) + + ref int left = ref MemoryMarshal.GetArrayDataReference(thisArray); + ref int right = ref MemoryMarshal.GetArrayDataReference(valueArray); + + if (Vector256.IsHardwareAccelerated) { - fixed (int* leftPtr = thisArray) - fixed (int* rightPtr = valueArray) + for (; i < (uint)count - (Vector256IntCount - 1u); i += Vector256IntCount) { - for (; i < (uint)count - (Vector128IntCount - 1u); i += Vector128IntCount) - { - Vector128 leftVec = Sse2.LoadVector128(leftPtr + i); - Vector128 rightVec = Sse2.LoadVector128(rightPtr + i); - Sse2.Store(leftPtr + i, Sse2.Xor(leftVec, rightVec)); - } + Vector256 result = Vector256.LoadUnsafe(ref left, i) ^ Vector256.LoadUnsafe(ref right, i); + result.StoreUnsafe(ref left, i); } } - else if (AdvSimd.IsSupported) + else if (Vector128.IsHardwareAccelerated) { - fixed (int* leftPtr = thisArray) - fixed (int* rightPtr = valueArray) + for (; i < (uint)count - (Vector128IntCount - 1u); i += Vector128IntCount) { - for (; i < (uint)count - (Vector128IntCount - 1u); i += Vector128IntCount) - { - Vector128 leftVec = AdvSimd.LoadVector128(leftPtr + i); - Vector128 rightVec = AdvSimd.LoadVector128(rightPtr + i); - AdvSimd.Store(leftPtr + i, AdvSimd.Xor(leftVec, rightVec)); - } + Vector128 result = Vector128.LoadUnsafe(ref left, i) ^ Vector128.LoadUnsafe(ref right, i); + result.StoreUnsafe(ref left, i); } } @@ -603,39 +546,23 @@ public unsafe BitArray Not() } uint i = 0; - if (Avx2.IsSupported) - { - Vector256 ones = Vector256.Create(-1); - fixed (int* ptr = thisArray) - { - for (; i < (uint)count - (Vector256IntCount - 1u); i += Vector256IntCount) - { - Vector256 vec = Avx.LoadVector256(ptr + i); - Avx.Store(ptr + i, Avx2.Xor(vec, ones)); - } - } - } - else if (Sse2.IsSupported) + + ref int value = ref MemoryMarshal.GetArrayDataReference(thisArray); + + if (Vector256.IsHardwareAccelerated) { - Vector128 ones = Vector128.Create(-1); - fixed (int* ptr = thisArray) + for (; i < (uint)count - (Vector256IntCount - 1u); i += Vector256IntCount) { - for (; i < (uint)count - (Vector128IntCount - 1u); i += Vector128IntCount) - { - Vector128 vec = Sse2.LoadVector128(ptr + i); - Sse2.Store(ptr + i, Sse2.Xor(vec, ones)); - } + Vector256 result = ~Vector256.LoadUnsafe(ref value, i); + result.StoreUnsafe(ref value, i); } } - else if (AdvSimd.IsSupported) + else if (Vector128.IsHardwareAccelerated) { - fixed (int* leftPtr = thisArray) + for (; i < (uint)count - (Vector128IntCount - 1u); i += Vector128IntCount) { - for (; i < (uint)count - (Vector128IntCount - 1u); i += Vector128IntCount) - { - Vector128 leftVec = AdvSimd.LoadVector128(leftPtr + i); - AdvSimd.Store(leftPtr + i, AdvSimd.Not(leftVec)); - } + Vector128 result = ~Vector128.LoadUnsafe(ref value, i); + result.StoreUnsafe(ref value, i); } }