Skip to content

Commit

Permalink
add Vector256 code path
Browse files Browse the repository at this point in the history
  • Loading branch information
adamsitnik committed Jul 29, 2022
1 parent 349aca7 commit 764d038
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3722,6 +3722,7 @@ public static bool TryCopyTo<T>(this Vector256<T> vector, Span<T> destination)
/// <param name="source">The vector whose elements are to be widened.</param>
/// <returns>A pair of vectors that contain the widened lower and upper halves of <paramref name="source" />.</returns>
[CLSCompliant(false)]
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe (Vector256<ushort> Lower, Vector256<ushort> Upper) Widen(Vector256<byte> source)
{
    // Each half is widened by its dedicated helper; the lower half is
    // computed first, then the upper half, and both are returned as a pair.
    Vector256<ushort> lower = WidenLower(source);
    Vector256<ushort> upper = WidenUpper(source);

    return (lower, upper);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1569,12 +1569,11 @@ public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buf
// Intrinsified in mono interpreter
nuint currentOffset = 0;

if (Vector128.IsHardwareAccelerated && BitConverter.IsLittleEndian)
if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= 2 * (uint)Vector<byte>.Count)
{
if (elementCount >= 2 * (uint)Unsafe.SizeOf<Vector128<byte>>())
{
currentOffset = WidenAsciiToUtf16_Intrinsified(pAsciiBuffer, pUtf16Buffer, elementCount);
}
currentOffset = Vector256.IsHardwareAccelerated
? WidenAsciiToUtf16_Vector256(pAsciiBuffer, pUtf16Buffer, elementCount)
: WidenAsciiToUtf16_Vector128(pAsciiBuffer, pUtf16Buffer, elementCount);
}
else if (Vector.IsHardwareAccelerated)
{
Expand Down Expand Up @@ -1704,10 +1703,7 @@ public static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16Buf
goto Finish;
}

/// <summary>
/// Returns <see langword="true"/> when at least one byte of <paramref name="asciiVector"/>
/// has its most significant bit set, i.e. lies outside the 0x00..0x7F ASCII range.
/// </summary>
/// <param name="asciiVector">The vector of bytes to inspect.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool ContainsNonAsciiByte(Vector128<byte> asciiVector)
{
    // One mask bit per byte lane, taken from that lane's high bit.
    // A vector of pure ASCII bytes therefore yields an all-zero mask.
    uint highBitMask = asciiVector.ExtractMostSignificantBits();

    return highBitMask != 0;
}

private static unsafe nuint WidenAsciiToUtf16_Intrinsified(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
private static unsafe nuint WidenAsciiToUtf16_Vector128(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
{
Debug.Assert(Vector128.IsHardwareAccelerated);
Debug.Assert(BitConverter.IsLittleEndian);
Expand Down Expand Up @@ -1743,6 +1739,40 @@ private static unsafe nuint WidenAsciiToUtf16_Intrinsified(byte* pAsciiBuffer, c
return currentOffset;
}

/// <summary>
/// Widens ASCII bytes from <paramref name="pAsciiBuffer"/> into UTF-16 code units at
/// <paramref name="pUtf16Buffer"/>, one <see cref="Vector256{T}"/> of bytes per iteration.
/// Stops as soon as a loaded vector contains a byte with its high bit set, or when fewer
/// than a full vector of input bytes remains.
/// </summary>
/// <param name="pAsciiBuffer">Pointer to the source bytes.</param>
/// <param name="pUtf16Buffer">Pointer to the destination UTF-16 buffer.</param>
/// <param name="elementCount">Number of source elements; asserted to be at least two vectors' worth.</param>
/// <returns>
/// The number of elements widened, always a multiple of <c>Vector256&lt;byte&gt;.Count</c>.
/// Elements from this offset onward were not written by this method.
/// </returns>
private static unsafe nuint WidenAsciiToUtf16_Vector256(byte* pAsciiBuffer, char* pUtf16Buffer, nuint elementCount)
{
    Debug.Assert(Vector256.IsHardwareAccelerated);
    Debug.Assert(BitConverter.IsLittleEndian);
    Debug.Assert(elementCount >= 2 * (uint)Vector256<byte>.Count);

    // Largest source offset from which a whole vector can still be loaded.
    // The unsigned subtraction cannot wrap given the asserted precondition above.
    nuint lastFullVectorOffset = elementCount - (uint)Vector256<byte>.Count;

    // Calculating the destination address outside the loop results in significant
    // perf wins vs. relying on the JIT to fold memory addressing logic into the
    // write instructions. See: https://github.com/dotnet/runtime/issues/33002
    ushort* pDestination = (ushort*)pUtf16Buffer;

    nuint processed = 0;

    while (true)
    {
        Vector256<byte> block = Vector256.Load(pAsciiBuffer + processed);

        // Any set high bit means this vector holds a non-ASCII byte: leave it
        // untouched and report how far we got.
        if (block.ExtractMostSignificantBits() != 0)
        {
            return processed;
        }

        // One vector of bytes widens into two vectors of ushorts, stored back to back.
        (Vector256<ushort> lowerHalf, Vector256<ushort> upperHalf) = Vector256.Widen(block);
        lowerHalf.Store(pDestination);
        upperHalf.Store(pDestination + Vector256<ushort>.Count);

        // Source offset and destination pointer both advance by the number of
        // elements consumed (the destination pointer is element-typed, not byte-typed).
        processed += (nuint)Vector256<byte>.Count;
        pDestination += (nuint)Vector256<byte>.Count;

        if (processed > lastFullVectorOffset)
        {
            return processed;
        }
    }
}

/// <summary>
/// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and
/// writes them to the output buffer with machine endianness.
Expand Down

0 comments on commit 764d038

Please sign in to comment.