try to port ASCIIUtility.WidenAsciiToUtf16 to x-plat intrinsics #73055

Merged (16 commits), Sep 9, 2022
266 changes: 79 additions & 187 deletions src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs
@@ -1370,6 +1370,27 @@ public static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAsciiBu
goto Finish;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool VectorContainsNonAsciiChar(Vector128<byte> asciiVector)
{
// The maximum ASCII character is 0b_0111_1111, so the most significant bit (0x80) of any byte
// tells whether the vector contains non-ASCII data.

// Prefer architecture-specific intrinsics, as they offer better perf.
if (Sse41.IsSupported)
{
return !Sse41.TestZ(asciiVector, Vector128.Create((byte)0x80));
}
else if (AdvSimd.Arm64.IsSupported)
{
// MaxPairwise folds adjacent byte pairs to their maximum, so after taking the lower 64 bits
// we only need to test the sign bit of each of the 8 remaining bytes.
Vector128<byte> maxBytes = AdvSimd.Arm64.MaxPairwise(asciiVector, asciiVector);
return (maxBytes.AsUInt64().ToScalar() & 0x8080808080808080) != 0;
}
else
{
return asciiVector.ExtractMostSignificantBits() != 0;
}
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool VectorContainsNonAsciiChar(Vector128<ushort> utf16Vector)
{
@@ -1557,16 +1578,59 @@ public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buf
// Intrinsified in mono interpreter
nuint currentOffset = 0;

// If SSE2 is supported, use those specific intrinsics instead of the generic vectorized
// code below. This has two benefits: (a) we can take advantage of specific instructions like
// pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
// this method is running.

if (Sse2.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian))
if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= (uint)Vector128<byte>.Count)
{
if (elementCount >= 2 * (uint)Unsafe.SizeOf<Vector128<byte>>())
ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer;

if (Vector256.IsHardwareAccelerated && elementCount >= (uint)Vector256<byte>.Count)
{
currentOffset = WidenAsciiToUtf16_Intrinsified(pAsciiBuffer, pUtf16Buffer, elementCount);
// Calculating the destination address outside the loop results in significant
// perf wins vs. relying on the JIT to fold memory addressing logic into the
// write instructions. See: https://github.com/dotnet/runtime/issues/33002
nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector256<byte>.Count;

do
{
Vector256<byte> asciiVector = Vector256.Load(pAsciiBuffer + currentOffset);

if (asciiVector.ExtractMostSignificantBits() != 0)
stephentoub (Member), Sep 8, 2022:

https://github.com/dotnet/runtime/pull/73055/files?diff=split&w=1#diff-66bbe89271f826c9232bd146abb678844754515dc027f70ad0ce36f751da46ebR1378 suggests that Sse41.TestZ is faster than ExtractMostSignificantBits for 128 bits. Does the same not hold for Avx.TestZ for 256 bits?

Member (Author) replied:

It does not:

using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
using System.Reflection.Metadata;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace VectorBenchmarks
{
    internal class Program
    {
        static void Main(string[] args) => BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args);
    }

    public unsafe class ContainsNonAscii
    {
        private const int Size = 1024;
        private byte* _bytes;
            
        [GlobalSetup]
        public void Setup()
        {
            _bytes = (byte*)NativeMemory.AlignedAlloc(new UIntPtr(Size), new UIntPtr(32));
            new Span<byte>(_bytes, Size).Clear();
        }

        [GlobalCleanup]
        public void Free() => NativeMemory.AlignedFree(_bytes);

        [Benchmark]
        public bool ExtractMostSignificantBits()
        {
            ref byte searchSpace = ref *_bytes;
            nuint currentOffset = 0;
            nuint finalOffsetWhereCanRunLoop = Size - (uint)Vector256<byte>.Count;

            do
            {
                Vector256<byte> asciiVector = Vector256.LoadUnsafe(ref searchSpace, currentOffset);

                if (asciiVector.ExtractMostSignificantBits() != 0)
                {
                    return true;
                }

                currentOffset += (nuint)Vector256<byte>.Count;
            } while (currentOffset <= finalOffsetWhereCanRunLoop);

            return false;
        }

        [Benchmark]
        public bool TestZ()
        {
            ref byte searchSpace = ref *_bytes;
            nuint currentOffset = 0;
            nuint finalOffsetWhereCanRunLoop = Size - (uint)Vector256<byte>.Count;

            do
            {
                Vector256<byte> asciiVector = Vector256.LoadUnsafe(ref searchSpace, currentOffset);

                if (!Avx.TestZ(asciiVector, Vector256.Create((byte)0x80)))
                {
                    return true;
                }

                currentOffset += (nuint)Vector256<byte>.Count;
            } while (currentOffset <= finalOffsetWhereCanRunLoop);

            return false;
        }
    }
}
BenchmarkDotNet=v0.13.2, OS=Windows 11 (10.0.22000.856/21H2)
AMD Ryzen Threadripper PRO 3945WX 12-Cores, 1 CPU, 24 logical and 12 physical cores
.NET SDK=7.0.100-rc.1.22423.16
  [Host]     : .NET 7.0.0 (7.0.22.42223), X64 RyuJIT AVX2
  DefaultJob : .NET 7.0.0 (7.0.22.42223), X64 RyuJIT AVX2
Method                     | Mean     | Error    | StdDev   | Code Size
ExtractMostSignificantBits | 11.78 ns | 0.042 ns | 0.040 ns | 57 B
TestZ                      | 14.76 ns | 0.320 ns | 0.416 ns | 68 B

.NET 7.0.0 (7.0.22.42223), X64 RyuJIT AVX2

; VectorBenchmarks.ContainsNonAscii.ExtractMostSignificantBits()
       vzeroupper
       mov       rax,[rcx+8]
       xor       edx,edx
       nop       dword ptr [rax]
M00_L00:
       vmovdqu   ymm0,ymmword ptr [rax+rdx]
       vpmovmskb ecx,ymm0
       test      ecx,ecx
       jne       short M00_L01
       add       rdx,20
       cmp       rdx,3E0
       jbe       short M00_L00
       xor       eax,eax
       vzeroupper
       ret
M00_L01:
       mov       eax,1
       vzeroupper
       ret
; Total bytes of code 57

.NET 7.0.0 (7.0.22.42223), X64 RyuJIT AVX2

; VectorBenchmarks.ContainsNonAscii.TestZ()
       vzeroupper
       mov       rax,[rcx+8]
       xor       edx,edx
       vmovupd   ymm0,[7FF9E3D94D60]
       nop       dword ptr [rax]
       nop       dword ptr [rax]
M00_L00:
       vptest    ymm0,ymmword ptr [rax+rdx]
       jne       short M00_L01
       add       rdx,20
       cmp       rdx,3E0
       jbe       short M00_L00
       xor       eax,eax
       vzeroupper
       ret
M00_L01:
       mov       eax,1
       vzeroupper
       ret
; Total bytes of code 68

Member replied:

> It does not:

That's not what I see on my machine.

Method                         | Mean
ExtractMostSignificantBits_128 | 31.77 ns
TestZ_128                      | 25.58 ns
ExtractMostSignificantBits_256 | 15.58 ns
TestZ_256                      | 11.66 ns
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Running;
using System;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics.X86;
using System.Runtime.Intrinsics;

[HideColumns("Error", "StdDev", "Median", "RatioSD")]
public unsafe partial class Program
{
    static void Main(string[] args) => BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args);

    private const int Size = 1024;
    private byte* _bytes;

    [GlobalSetup]
    public void Setup()
    {
        _bytes = (byte*)NativeMemory.AlignedAlloc(new UIntPtr(Size), new UIntPtr(32));
        new Span<byte>(_bytes, Size).Clear();
    }

    [GlobalCleanup]
    public void Free() => NativeMemory.AlignedFree(_bytes);

    [Benchmark]
    public bool ExtractMostSignificantBits_128()
    {
        ref byte searchSpace = ref *_bytes;
        nuint currentOffset = 0;
        nuint finalOffsetWhereCanRunLoop = Size - (uint)Vector128<byte>.Count;

        do
        {
            Vector128<byte> asciiVector = Vector128.LoadUnsafe(ref searchSpace, currentOffset);

            if (asciiVector.ExtractMostSignificantBits() != 0)
            {
                return true;
            }

            currentOffset += (nuint)Vector128<byte>.Count;
        } while (currentOffset <= finalOffsetWhereCanRunLoop);

        return false;
    }

    [Benchmark]
    public bool TestZ_128()
    {
        ref byte searchSpace = ref *_bytes;
        nuint currentOffset = 0;
        nuint finalOffsetWhereCanRunLoop = Size - (uint)Vector128<byte>.Count;

        do
        {
            Vector128<byte> asciiVector = Vector128.LoadUnsafe(ref searchSpace, currentOffset);

            if (!Sse41.TestZ(asciiVector, Vector128.Create((byte)0x80)))
            {
                return true;
            }

            currentOffset += (nuint)Vector128<byte>.Count;
        } while (currentOffset <= finalOffsetWhereCanRunLoop);

        return false;
    }

    [Benchmark]
    public bool ExtractMostSignificantBits_256()
    {
        ref byte searchSpace = ref *_bytes;
        nuint currentOffset = 0;
        nuint finalOffsetWhereCanRunLoop = Size - (uint)Vector256<byte>.Count;

        do
        {
            Vector256<byte> asciiVector = Vector256.LoadUnsafe(ref searchSpace, currentOffset);

            if (asciiVector.ExtractMostSignificantBits() != 0)
            {
                return true;
            }

            currentOffset += (nuint)Vector256<byte>.Count;
        } while (currentOffset <= finalOffsetWhereCanRunLoop);

        return false;
    }

    [Benchmark]
    public bool TestZ_256()
    {
        ref byte searchSpace = ref *_bytes;
        nuint currentOffset = 0;
        nuint finalOffsetWhereCanRunLoop = Size - (uint)Vector256<byte>.Count;

        do
        {
            Vector256<byte> asciiVector = Vector256.LoadUnsafe(ref searchSpace, currentOffset);

            if (!Avx.TestZ(asciiVector, Vector256.Create((byte)0x80)))
            {
                return true;
            }

            currentOffset += (nuint)Vector256<byte>.Count;
        } while (currentOffset <= finalOffsetWhereCanRunLoop);

        return false;
    }
}

EgorBo (Member), Sep 9, 2022:

@stephentoub I think it depends on CPU, I even had to revert TestZ from Vector.Equals because it produced regressions #67902

stephentoub (Member), Sep 9, 2022:

> I think it depends on CPU, I even had to revert TestZ from Vector.Equals because it produced regressions #67902

That change reverted it from both the 256-bit and 128-bit code paths. This PR uses TestZ for 128-bit. Why is that ok?

I'm questioning the non-symmetrical usage.

Member replied:

There are some helpful comments in these SO links (and related) https://stackoverflow.com/questions/60446759/sse2-test-xmm-bitmask-directly-without-using-pmovmskb

stephentoub (Member), Sep 9, 2022:

Thanks, but I'm not seeing the answer there to my question.

I'll restate:
This PR is adding additional code to prefer using TestZ with Vector128:
https://github.com/dotnet/runtime/pull/73055/files#diff-66bbe89271f826c9232bd146abb678844754515dc027f70ad0ce36f751da46ebR1379-R1391
Your #67902 reverted other changes that preferred using TestZ, not just on 256-bit but also on 128-bit vectors.
Does it still make sense for this PR to be adding additional code to use TestZ with Vector128?

(Part of why I'm pushing on this is with a goal of avoiding needing to drop down to direct intrinsics as much as possible. I'd hope we can get to a point where the obvious code to write is the best code to write in as many situations as possible.)

Member replied:

Right, I am not saying the current non-symmetrical usage is correct; I'd probably change both to use ExtractMostSignificantBits.

C++ compilers also do different things here, e.g. LLVM folds even direct MoveMask usage to testz: https://godbolt.org/z/MobvxvzGK

tannergooding (Member), Sep 9, 2022:

What is "better" is going to depend on a few factors.

On x86/x64, ExtractMostSignificantBits is likely faster because it can always emit exactly as movmsk and there are some CPUs where TestZ can be slower, particularly for "small inputs" where the match is sooner. When the match is later, TestZ typically wins out regardless.

On Arm64, doing the early comparison against == Zero is likely better because it is a single instruction vs the multi-instruction sequence required to emulate x64's movmsk.

I think the best choice here is to use == Zero (and therefore TestZ) as I believe it will, on average, produce the best/most consistent code. The cases where it might be a bit slower will typically be for smaller inputs where we're already returning quickly and the extra couple nanoseconds won't really matter.
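For illustration, here is a minimal sketch (not code from this PR) of the "== Zero" shape described above; the JIT may lower the masked comparison to ptest/testz on x64, while on Arm64 it avoids emulating movmsk:

// Hypothetical helper; the name and exact shape are illustrative only.
private static bool VectorContainsNonAsciiChar_ZeroCompare(Vector128<byte> asciiVector)
{
    // Keep only the sign bit of every byte; the input is all-ASCII exactly when the result is zero.
    return (asciiVector & Vector128.Create((byte)0x80)) != Vector128<byte>.Zero;
}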

{
break;
}

(Vector256<ushort> low, Vector256<ushort> upper) = Vector256.Widen(asciiVector);
low.Store(pCurrentWriteAddress);
upper.Store(pCurrentWriteAddress + Vector256<ushort>.Count);

currentOffset += (nuint)Vector256<byte>.Count;
pCurrentWriteAddress += (nuint)Vector256<byte>.Count;
} while (currentOffset <= finalOffsetWhereCanRunLoop);
Member commented:

On every iteration of the loop we're bumping currentOffset and then also adding that to pAsciiBuffer. Would it be faster to instead compute the upper bound as an address, just bump the current pointer in the loop, and then after the loop compute the currentOffset if needed based on the ending/starting difference?

Member (Author) replied:

It produces better codegen, but the reported time difference is within the range of error (0 to 0.3 ns gain or loss):

public unsafe class Widen
{
    private const int Size = 1024;
    private byte* _bytes;
    private char* _chars;

    [GlobalSetup]
    public void Setup()
    {
        _bytes = (byte*)NativeMemory.AlignedAlloc(new UIntPtr(Size), new UIntPtr(32));
        new Span<byte>(_bytes, Size).Fill((byte)'a');
        _chars = (char*)NativeMemory.AlignedAlloc(new UIntPtr(Size * sizeof(char)), new UIntPtr(32));
    }

    [GlobalCleanup]
    public void Free()
    {
        NativeMemory.AlignedFree(_bytes);
        NativeMemory.AlignedFree(_chars);
    }

    [Benchmark]
    public void Current()
    {
        ref byte searchSpace = ref *_bytes;
        ushort* pCurrentWriteAddress = (ushort*)_chars;
        nuint currentOffset = 0;
        nuint finalOffsetWhereCanRunLoop = Size - (uint)Vector256<byte>.Count;

        do
        {
            Vector256<byte> asciiVector = Vector256.Load(_bytes + currentOffset);

            if (asciiVector.ExtractMostSignificantBits() != 0)
            {
                break;
            }

            (Vector256<ushort> low, Vector256<ushort> upper) = Vector256.Widen(asciiVector);
            low.Store(pCurrentWriteAddress);
            upper.Store(pCurrentWriteAddress + Vector256<ushort>.Count);

            currentOffset += (nuint)Vector256<byte>.Count;
            pCurrentWriteAddress += (nuint)Vector256<byte>.Count;
        } while (currentOffset <= finalOffsetWhereCanRunLoop);
    }

    [Benchmark]
    public void Suggested()
    {
        ref byte currentSearchSpace = ref *_bytes;
        ref ushort currentWriteAddress = ref Unsafe.As<char, ushort>(ref *_chars);
        ref byte oneVectorAwayFromEnd = ref Unsafe.Add(ref currentSearchSpace, Size - Vector256<byte>.Count);

        do
        {
            Vector256<byte> asciiVector = Vector256.LoadUnsafe(ref currentSearchSpace);

            if (asciiVector.ExtractMostSignificantBits() != 0)
            {
                break;
            }

            (Vector256<ushort> low, Vector256<ushort> upper) = Vector256.Widen(asciiVector);
            low.StoreUnsafe(ref currentWriteAddress);
            upper.StoreUnsafe(ref currentWriteAddress, (nuint)Vector256<ushort>.Count);

            currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector256<byte>.Count);
            currentWriteAddress = ref Unsafe.Add(ref currentWriteAddress, Vector256<byte>.Count);
        } while (!Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd));
    }
}
BenchmarkDotNet=v0.13.2, OS=Windows 11 (10.0.22000.856/21H2)
AMD Ryzen Threadripper PRO 3945WX 12-Cores, 1 CPU, 24 logical and 12 physical cores
.NET SDK=7.0.100-rc.1.22423.16
  [Host]     : .NET 7.0.0 (7.0.22.42223), X64 RyuJIT AVX2
  DefaultJob : .NET 7.0.0 (7.0.22.42223), X64 RyuJIT AVX2
Method    | Mean     | Error    | StdDev   | Code Size
Current   | 44.29 ns | 0.171 ns | 0.152 ns | 81 B
Suggested | 44.54 ns | 0.042 ns | 0.032 ns | 77 B

.NET 7.0.0 (7.0.22.42223), X64 RyuJIT AVX2

; VectorBenchmarks.Widen.Current()
       vzeroupper
       mov       eax,[rcx+8]
       mov       rax,[rcx+10]
       xor       edx,edx
M00_L00:
       mov       r8,[rcx+8]
       vmovdqu   ymm0,ymmword ptr [r8+rdx]
       vpmovmskb r8d,ymm0
       test      r8d,r8d
       jne       short M00_L01
       vmovaps   ymm1,ymm0
       vpmovzxbw ymm1,xmm1
       vextractf128 xmm0,ymm0,1
       vpmovzxbw ymm0,xmm0
       vmovdqu   ymmword ptr [rax],ymm1
       vmovdqu   ymmword ptr [rax+20],ymm0
       add       rdx,20
       add       rax,40
       cmp       rdx,3E0
       jbe       short M00_L00
M00_L01:
       vzeroupper
       ret
; Total bytes of code 81

.NET 7.0.0 (7.0.22.42223), X64 RyuJIT AVX2

; VectorBenchmarks.Widen.Suggested()
       vzeroupper
       mov       rax,[rcx+8]
       mov       rdx,[rcx+10]
       lea       rcx,[rax+3E0]
M00_L00:
       vmovdqu   ymm0,ymmword ptr [rax]
       vpmovmskb r8d,ymm0
       test      r8d,r8d
       jne       short M00_L01
       vmovaps   ymm1,ymm0
       vpmovzxbw ymm1,xmm1
       vextractf128 xmm0,ymm0,1
       vpmovzxbw ymm0,xmm0
       vmovdqu   ymmword ptr [rdx],ymm1
       vmovdqu   ymmword ptr [rdx+20],ymm0
       add       rax,20
       add       rdx,40
       cmp       rax,rcx
       jbe       short M00_L00
M00_L01:
       vzeroupper
       ret
; Total bytes of code 77

If you don't mind, I am going to merge it as it is and apply your suggestion in my next PR.

}
else
{
// Calculating the destination address outside the loop results in significant
// perf wins vs. relying on the JIT to fold memory addressing logic into the
// write instructions. See: https://github.com/dotnet/runtime/issues/33002
nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector128<byte>.Count;

do
{
Vector128<byte> asciiVector = Vector128.Load(pAsciiBuffer + currentOffset);

if (VectorContainsNonAsciiChar(asciiVector))
{
break;
}

// Vector128.Widen is not used here as it is less performant on ARM64
Member commented:

Do we know why? Naively I'd expect the JIT to be able to produce the same code for both.

Member (Author) replied:

I am sorry, but I don't. It's not that I did not try to find out, it's the arm64 tooling that makes it hard for me.

Member replied:

@tannergooding?

If this is by design, ok. But if it's something we can/should be fixing in the JIT, I want to make sure we're not sweeping such issues under the rug. Ideally the obvious code is also the best performing code.

Member replied:

Codegen looks good to me:
[image: Arm64 codegen disassembly]
(add could be contained but it's unrelated here)

tannergooding (Member), Sep 9, 2022:

I'd need a comparison between the two code paths to see where the difference is.

I would expect these to be identical except for a case where the original code was making some assumption (based on knowing the inputs were restricted to a subset of all possible values) and therefore skipping an otherwise "required" step that would be necessary to ensure deterministic results for "any input".
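For comparison, here is a sketch (not from the PR) of the tuple-returning Vector128.Widen form that the "// Vector128.Widen is not used here" comment refers to; semantically it matches the WidenLower/WidenUpper pair in the diff just below:

(Vector128<ushort> lower, Vector128<ushort> upper) = Vector128.Widen(asciiVector);
lower.Store(pCurrentWriteAddress);
upper.Store(pCurrentWriteAddress + Vector128<ushort>.Count);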

Vector128<ushort> utf16HalfVector = Vector128.WidenLower(asciiVector);
utf16HalfVector.Store(pCurrentWriteAddress);
utf16HalfVector = Vector128.WidenUpper(asciiVector);
utf16HalfVector.Store(pCurrentWriteAddress + Vector128<ushort>.Count);

currentOffset += (nuint)Vector128<byte>.Count;
pCurrentWriteAddress += (nuint)Vector128<byte>.Count;
} while (currentOffset <= finalOffsetWhereCanRunLoop);
}
}
else if (Vector.IsHardwareAccelerated)
stephentoub (Member), Sep 8, 2022:

Why is the Vector<T> path still needed?

Member (Author) replied:

> Why is the Vector<T> path still needed?

Some Mono variants don't support Vector128 for all configs yet.

Member replied:

> Some Mono variants don't support Vector128 for all configs yet

Which support Vector<T> and not Vector128?

Just the presence of these paths is keeping the methods from being R2R'd, it seems.

Member replied:

Mono-LLVM supports both, Mono without LLVM (e.g. default Mono jit mode or AOT) supports only Vector<>

Member replied:

> Mono-LLVM supports both, Mono without LLVM (e.g. default Mono jit mode or AOT) supports only Vector<>

Is that getting fixed?

We now have multiple vectorized implementations that don't have a Vector<T> code path. Why is this one special that it still needs one?
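For context, a minimal sketch (an assumption; the PR's actual Vector<T> path is collapsed in this diff) of what a Vector<T> widening fallback loop typically looks like:

// Hypothetical Vector<T> fallback shape; details are illustrative, not the file's actual code.
nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector<byte>.Count;
do
{
    Vector<byte> asciiVector = Unsafe.ReadUnaligned<Vector<byte>>(pAsciiBuffer + currentOffset);

    // Any byte with its sign bit set is non-ASCII.
    if ((asciiVector & new Vector<byte>((byte)0x80)) != Vector<byte>.Zero)
    {
        break;
    }

    Vector.Widen(asciiVector, out Vector<ushort> lower, out Vector<ushort> upper);
    Unsafe.WriteUnaligned(pUtf16Buffer + currentOffset, lower);
    Unsafe.WriteUnaligned(pUtf16Buffer + currentOffset + Vector<ushort>.Count, upper);

    currentOffset += (nuint)Vector<byte>.Count;
} while (currentOffset <= finalOffsetWhereCanRunLoop);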

@@ -1697,177 +1761,6 @@ public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buf
goto Finish;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool ContainsNonAsciiByte(Vector128<byte> value)
{
if (!AdvSimd.Arm64.IsSupported)
{
throw new PlatformNotSupportedException();
}
value = AdvSimd.Arm64.MaxPairwise(value, value);
return (value.AsUInt64().ToScalar() & 0x8080808080808080) != 0;
}

private static unsafe nuint WidenAsciiToUtf16_Intrinsified(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
{
// JIT turns the below into constants

uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);

// This method is written such that control generally flows top-to-bottom, avoiding
// jumps as much as possible in the optimistic case of "all ASCII". If we see non-ASCII
// data, we jump out of the hot paths to targets at the end of the method.

Debug.Assert(Sse2.IsSupported || AdvSimd.Arm64.IsSupported);
Debug.Assert(BitConverter.IsLittleEndian);
Debug.Assert(elementCount >= 2 * SizeOfVector128);

// We're going to get the best performance when we have aligned writes, so we'll take the
// hit of potentially unaligned reads in order to hit this sweet spot.

Vector128<byte> asciiVector;
Vector128<byte> utf16FirstHalfVector;
bool containsNonAsciiBytes;

// First, perform an unaligned read of the first part of the input buffer.

if (Sse2.IsSupported)
{
asciiVector = Sse2.LoadVector128(pAsciiBuffer); // unaligned load
containsNonAsciiBytes = (uint)Sse2.MoveMask(asciiVector) != 0;
}
else if (AdvSimd.Arm64.IsSupported)
{
asciiVector = AdvSimd.LoadVector128(pAsciiBuffer);
containsNonAsciiBytes = ContainsNonAsciiByte(asciiVector);
}
else
{
throw new PlatformNotSupportedException();
}

// If there's non-ASCII data in the first 8 elements of the vector, there's nothing we can do.

if (containsNonAsciiBytes)
{
return 0;
}

// Then perform an unaligned write of the first part of the input buffer.

Vector128<byte> zeroVector = Vector128<byte>.Zero;

if (Sse2.IsSupported)
{
utf16FirstHalfVector = Sse2.UnpackLow(asciiVector, zeroVector);
Sse2.Store((byte*)pUtf16Buffer, utf16FirstHalfVector); // unaligned
}
else if (AdvSimd.IsSupported)
{
utf16FirstHalfVector = AdvSimd.ZeroExtendWideningLower(asciiVector.GetLower()).AsByte();
AdvSimd.Store((byte*)pUtf16Buffer, utf16FirstHalfVector); // unaligned
}
else
{
throw new PlatformNotSupportedException();
}

// Calculate how many elements we wrote in order to get pOutputBuffer to its next alignment
Member (Author) commented:

Based on the benchmarking that I've done, this was not improving perf in a noticeable way, but it was increasing the code complexity. I've removed it and added a Vector256 code path (less code = less code to duplicate).

// point, then use that as the base offset going forward. Remember the >> 1 to account for
// that we wrote chars, not bytes. This means we may re-read data in the next iteration of
// the loop, but this is ok.

nuint currentOffset = (SizeOfVector128 >> 1) - (((nuint)pUtf16Buffer >> 1) & (MaskOfAllBitsInVector128 >> 1));
Debug.Assert(0 < currentOffset && currentOffset <= SizeOfVector128 / sizeof(char));
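// Worked example: with SizeOfVector128 = 16 and a pUtf16Buffer address ending in 0x06,
// ((nuint)pUtf16Buffer >> 1) & 7 == 3, so currentOffset = 8 - 3 = 5; after writing 5 chars
// (10 bytes) the write pointer reaches a 16-byte boundary, which the aligned stores below rely on.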

nuint finalOffsetWhereCanRunLoop = elementCount - SizeOfVector128;

// Calculating the destination address outside the loop results in significant
// perf wins vs. relying on the JIT to fold memory addressing logic into the
// write instructions. See: https://github.com/dotnet/runtime/issues/33002

char* pCurrentWriteAddress = pUtf16Buffer + currentOffset;

do
{
// In a loop, perform an unaligned read, widen to two vectors, then aligned write the two vectors.

if (Sse2.IsSupported)
{
asciiVector = Sse2.LoadVector128(pAsciiBuffer + currentOffset); // unaligned load
containsNonAsciiBytes = (uint)Sse2.MoveMask(asciiVector) != 0;
}
else if (AdvSimd.Arm64.IsSupported)
{
asciiVector = AdvSimd.LoadVector128(pAsciiBuffer + currentOffset);
containsNonAsciiBytes = ContainsNonAsciiByte(asciiVector);
}
else
{
throw new PlatformNotSupportedException();
}

if (containsNonAsciiBytes)
{
// non-ASCII byte somewhere
goto NonAsciiDataSeenInInnerLoop;
}

if (Sse2.IsSupported)
{
Vector128<byte> low = Sse2.UnpackLow(asciiVector, zeroVector);
Sse2.StoreAligned((byte*)pCurrentWriteAddress, low);

Vector128<byte> high = Sse2.UnpackHigh(asciiVector, zeroVector);
Sse2.StoreAligned((byte*)pCurrentWriteAddress + SizeOfVector128, high);
}
else if (AdvSimd.Arm64.IsSupported)
{
Vector128<ushort> low = AdvSimd.ZeroExtendWideningLower(asciiVector.GetLower());
Vector128<ushort> high = AdvSimd.ZeroExtendWideningUpper(asciiVector);
AdvSimd.Arm64.StorePair((ushort*)pCurrentWriteAddress, low, high);
}
else
{
throw new PlatformNotSupportedException();
}

currentOffset += SizeOfVector128;
pCurrentWriteAddress += SizeOfVector128;
} while (currentOffset <= finalOffsetWhereCanRunLoop);

Finish:

return currentOffset;

NonAsciiDataSeenInInnerLoop:

// Can we at least widen the first part of the vector?

if (!containsNonAsciiBytes)
Member (Author) commented:

I've removed this code: the condition could never be satisfied, because the jump is performed only when the flag is set to true:

if (containsNonAsciiBytes)
{
// non-ASCII byte somewhere
goto NonAsciiDataSeenInInnerLoop;
}

{
// First part was all ASCII, widen
if (Sse2.IsSupported)
{
utf16FirstHalfVector = Sse2.UnpackLow(asciiVector, zeroVector);
Sse2.StoreAligned((byte*)(pUtf16Buffer + currentOffset), utf16FirstHalfVector);
}
else if (AdvSimd.Arm64.IsSupported)
{
Vector128<ushort> lower = AdvSimd.ZeroExtendWideningLower(asciiVector.GetLower());
AdvSimd.Store((ushort*)(pUtf16Buffer + currentOffset), lower);
}
else
{
throw new PlatformNotSupportedException();
}
currentOffset += SizeOfVector128 / 2;
}

goto Finish;
}

/// <summary>
/// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and
/// writes them to the output buffer with machine endianness.
@@ -1877,19 +1770,18 @@ internal static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputB
{
Debug.Assert(AllBytesInUInt32AreAscii(value));

if (Sse2.X64.IsSupported)
{
Debug.Assert(BitConverter.IsLittleEndian, "SSE2 widening assumes little-endian.");
Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64();
Unsafe.WriteUnaligned<ulong>(ref Unsafe.As<char, byte>(ref outputBuffer), Sse2.X64.ConvertToUInt64(vecWide));
}
else if (AdvSimd.Arm64.IsSupported)
if (AdvSimd.Arm64.IsSupported)
{
Vector128<byte> vecNarrow = AdvSimd.DuplicateToVector128(value).AsByte();
Vector128<ulong> vecWide = AdvSimd.Arm64.ZipLow(vecNarrow, Vector128<byte>.Zero).AsUInt64();
Unsafe.WriteUnaligned<ulong>(ref Unsafe.As<char, byte>(ref outputBuffer), vecWide.ToScalar());
}
else if (Vector128.IsHardwareAccelerated)
{
Vector128<byte> vecNarrow = Vector128.CreateScalar(value).AsByte();
Vector128<ulong> vecWide = Vector128.WidenLower(vecNarrow).AsUInt64();
Unsafe.WriteUnaligned<ulong>(ref Unsafe.As<char, byte>(ref outputBuffer), vecWide.ToScalar());
}
else
{
if (BitConverter.IsLittleEndian)