Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ascii.Equals and Ascii.EqualsIgnoreCase #84886

Merged
merged 6 commits into from
Apr 22, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
21 changes: 5 additions & 16 deletions src/libraries/Common/src/System/Net/CaseInsensitiveAscii.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections;
using System.Diagnostics;
using System.Text;

namespace System.Net
{
Expand Down Expand Up @@ -106,22 +108,9 @@ public new bool Equals(object? firstObject, object? secondObject)
}
if (secondString != null)
{
int index = firstString.Length;
if (index == secondString.Length)
{
if (FastGetHashCode(firstString) == FastGetHashCode(secondString))
{
while (index > 0)
{
index--;
if (AsciiToLower[firstString[index]] != AsciiToLower[secondString[index]])
{
return false;
}
}
return true;
}
}
Debug.Assert(Ascii.IsValid(firstString) || Ascii.IsValid(secondString), "Expected at least one of the inputs to be valid ASCII");

return Ascii.EqualsIgnoreCase(firstString, secondString);
}
return false;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,10 +137,11 @@ private static void UseFullString(ReadOnlySpan<string> uniqueStrings, bool ignor
results = new(allAscii, ignoreCase, 0, 0, 0, 0);
}

// TODO https://github.com/dotnet/runtime/issues/28230:
// Replace this once Ascii.IsValid exists.
internal static unsafe bool IsAllAscii(ReadOnlySpan<char> s)
{
#if NET8_0_OR_GREATER
return System.Text.Ascii.IsValid(s);
#else
fixed (char* src = s)
{
uint* ptrUInt32 = (uint*)src;
Expand Down Expand Up @@ -172,6 +173,7 @@ internal static unsafe bool IsAllAscii(ReadOnlySpan<char> s)

[MethodImpl(MethodImplOptions.AggressiveInlining)]
static bool AllCharsInUInt32AreAscii(uint value) => (value & ~0x007F_007Fu) == 0;
#endif
}

private static double GetUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,61 +2,26 @@
// The .NET Foundation licenses this file to you under the MIT license.

using System.Diagnostics;
using System.Text;

namespace System
{
internal static class ByteArrayHelpers
adamsitnik marked this conversation as resolved.
Show resolved Hide resolved
{
// TODO: https://github.com/dotnet/runtime/issues/28230
// Use Ascii.Equals* when it's available.

internal static bool EqualsOrdinalAsciiIgnoreCase(string left, ReadOnlySpan<byte> right)
{
Debug.Assert(left != null, "Expected non-null string");
Debug.Assert(Ascii.IsValid(left) || Ascii.IsValid(right), "Expected at least one of the inputs to be valid ASCII");

if (left.Length != right.Length)
{
return false;
}

for (int i = 0; i < left.Length; i++)
{
uint charA = left[i];
uint charB = right[i];

// We're only interested in ASCII characters here.
if ((charA - 'a') <= ('z' - 'a'))
charA -= ('a' - 'A');
if ((charB - 'a') <= ('z' - 'a'))
charB -= ('a' - 'A');

if (charA != charB)
{
return false;
}
}

return true;
return Ascii.EqualsIgnoreCase(right, left);
}

internal static bool EqualsOrdinalAscii(string left, ReadOnlySpan<byte> right)
{
Debug.Assert(left != null, "Expected non-null string");
Debug.Assert(Ascii.IsValid(left) || Ascii.IsValid(right), "Expected at least one of the inputs to be valid ASCII");

if (left.Length != right.Length)
{
return false;
}

for (int i = 0; i < left.Length; i++)
{
if (left[i] != right[i])
{
return false;
}
}

return true;
return Ascii.Equals(right, left);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1080,6 +1080,7 @@
<Compile Include="$(MSBuildThisFileDirectory)System\SystemException.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Ascii.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Ascii.CaseConversion.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Ascii.Equality.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Ascii.Transcoding.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Ascii.Trimming.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Text\Ascii.Utility.cs" />
Expand Down
230 changes: 230 additions & 0 deletions src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Equality.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Text.Unicode;
using System.Numerics;

namespace System.Text
{
public static partial class Ascii
{
/// <summary>
/// Determines whether the provided buffers contain equal ASCII characters.
/// </summary>
/// <param name="left">The buffer to compare with <paramref name="right" />.</param>
/// <param name="right">The buffer to compare with <paramref name="left" />.</param>
/// <returns><see langword="true" /> if the corresponding elements in <paramref name="left" /> and <paramref name="right" /> were equal and ASCII. <see langword="false" /> otherwise.</returns>
/// <remarks>If both buffers contain equal, but non-ASCII characters, the method returns <see langword="false" />.</remarks>
public static bool Equals(ReadOnlySpan<byte> left, ReadOnlySpan<char> right)
{
if (left.Length != right.Length)
{
return false;
}

if (!Vector128.IsHardwareAccelerated || right.Length < Vector128<ushort>.Count)
{
for (int i = 0; i < right.Length; i++)
{
char c = right[i];
byte b = left[i];
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we care about the bound check here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we've already confirmed left.Length == right.Length above, I think we're OK without another bounds check here.


if (c != b)
{
return false;
}

if (!UnicodeUtility.IsAsciiCodePoint(c) || !UnicodeUtility.IsAsciiCodePoint(b))
{
return false;
}
}
}
else if (Vector256.IsHardwareAccelerated && right.Length >= Vector256<ushort>.Count)
{
ref ushort currentCharsSearchSpace = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(right));
ref ushort oneVectorAwayFromCharsEnd = ref Unsafe.Add(ref currentCharsSearchSpace, right.Length - Vector256<ushort>.Count);
ref byte currentBytesSearchSpace = ref MemoryMarshal.GetReference(left);
ref byte oneVectorAwayFromBytesEnd = ref Unsafe.Add(ref currentBytesSearchSpace, left.Length - Vector128<byte>.Count);

Vector128<byte> byteValues;
Vector256<ushort> charValues;

// Loop until either we've finished all elements or there's less than a vector's-worth remaining.
do
{
charValues = Vector256.LoadUnsafe(ref currentCharsSearchSpace);
byteValues = Vector128.LoadUnsafe(ref currentBytesSearchSpace);

// it's OK to widen the bytes, it's NOT OK to narrow the chars (we could loose some information)
if (Vector256.Equals(Widen(byteValues), charValues) != Vector256<ushort>.AllBitsSet)
{
return false;
}

if (charValues.AsByte().ExtractMostSignificantBits() != 0 || byteValues.ExtractMostSignificantBits() != 0)
adamsitnik marked this conversation as resolved.
Show resolved Hide resolved
{
return false;
}

currentCharsSearchSpace = ref Unsafe.Add(ref currentCharsSearchSpace, Vector256<ushort>.Count);
currentBytesSearchSpace = ref Unsafe.Add(ref currentBytesSearchSpace, Vector128<byte>.Count);
}
while (!Unsafe.IsAddressGreaterThan(ref currentCharsSearchSpace, ref oneVectorAwayFromCharsEnd));

// If any elements remain, process the last vector in the search space.
if ((uint)right.Length % Vector256<ushort>.Count != 0)
{
charValues = Vector256.LoadUnsafe(ref oneVectorAwayFromCharsEnd);
byteValues = Vector128.LoadUnsafe(ref oneVectorAwayFromBytesEnd);

// it's OK to widen the bytes, it's NOT OK to narrow the chars (we could loose some information)
if (Vector256.Equals(Widen(byteValues), charValues) != Vector256<ushort>.AllBitsSet)
{
return false;
}

if (charValues.AsByte().ExtractMostSignificantBits() != 0 || byteValues.ExtractMostSignificantBits() != 0)
{
return false;
}
}
}
else
{
ref ushort currentCharsSearchSpace = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(right));
ref ushort oneVectorAwayFromCharsEnd = ref Unsafe.Add(ref currentCharsSearchSpace, right.Length - Vector128<ushort>.Count);
ref byte currentBytesSearchSpace = ref MemoryMarshal.GetReference(left);
ref byte oneVectorAwayFromBytesEnd = ref Unsafe.Add(ref currentBytesSearchSpace, left.Length - Vector64<byte>.Count);

Vector64<byte> byteValues;
Vector128<ushort> charValues;

// Loop until either we've finished all elements or there's less than a vector's-worth remaining.
do
{
charValues = Vector128.LoadUnsafe(ref currentCharsSearchSpace);
byteValues = Vector64.LoadUnsafe(ref currentBytesSearchSpace);

// it's OK to widen the bytes, it's NOT OK to narrow the chars (we could loose some information)
if (Vector128.Equals(Widen(byteValues), charValues) != Vector128<ushort>.AllBitsSet)
{
return false;
}

if (VectorContainsNonAsciiChar(charValues) || VectorContainsNonAsciiChar(byteValues))
{
return false;
}

currentCharsSearchSpace = ref Unsafe.Add(ref currentCharsSearchSpace, Vector128<ushort>.Count);
currentBytesSearchSpace = ref Unsafe.Add(ref currentBytesSearchSpace, Vector64<byte>.Count);
}
while (!Unsafe.IsAddressGreaterThan(ref currentCharsSearchSpace, ref oneVectorAwayFromCharsEnd));

// If any elements remain, process the last vector in the search space.
if ((uint)right.Length % Vector128<ushort>.Count != 0)
{
charValues = Vector128.LoadUnsafe(ref oneVectorAwayFromCharsEnd);
byteValues = Vector64.LoadUnsafe(ref oneVectorAwayFromBytesEnd);

// it's OK to widen the bytes, it's NOT OK to narrow the chars (we could loose some information)
if (Vector128.Equals(Widen(byteValues), charValues) != Vector128<ushort>.AllBitsSet)
{
return false;
}

if (VectorContainsNonAsciiChar(charValues) || VectorContainsNonAsciiChar(byteValues))
{
return false;
}
}
}

return true;
}

/// <summary>
/// Determines whether the provided buffers contain equal ASCII characters, ignoring case considerations.
/// </summary>
/// <param name="left">The buffer to compare with <paramref name="right" />.</param>
/// <param name="right">The buffer to compare with <paramref name="left" />.</param>
/// <returns><see langword="true" /> if the corresponding elements in <paramref name="left" /> and <paramref name="right" /> were equal ignoring case considerations and ASCII. <see langword="false" /> otherwise.</returns>
/// <remarks>If both buffers contain equal, but non-ASCII characters, the method returns <see langword="false" />.</remarks>
public static bool EqualsIgnoreCase(ReadOnlySpan<byte> left, ReadOnlySpan<byte> right)
=> left.Length == right.Length && SequenceEqualIgnoreCase(left, right);

/// <inheritdoc cref="EqualsIgnoreCase(ReadOnlySpan{byte}, ReadOnlySpan{byte})"/>
public static bool EqualsIgnoreCase(ReadOnlySpan<byte> left, ReadOnlySpan<char> right)
=> left.Length == right.Length && SequenceEqualIgnoreCase(right, left);

/// <inheritdoc cref="EqualsIgnoreCase(ReadOnlySpan{byte}, ReadOnlySpan{byte})"/>
public static bool EqualsIgnoreCase(ReadOnlySpan<char> left, ReadOnlySpan<char> right)
=> left.Length == right.Length && SequenceEqualIgnoreCase(right, left);

private static bool SequenceEqualIgnoreCase<TText, TValue>(ReadOnlySpan<TText> text, ReadOnlySpan<TValue> value)
where TText : unmanaged, INumberBase<TText>
where TValue : unmanaged, INumberBase<TValue>
{
Debug.Assert(text.Length == value.Length);

for (int i = 0; i < text.Length; i++)
{
uint valueA = uint.CreateTruncating(text[i]);
uint valueB = uint.CreateTruncating(value[i]);

if (!UnicodeUtility.IsAsciiCodePoint(valueA) || !UnicodeUtility.IsAsciiCodePoint(valueB))
adamsitnik marked this conversation as resolved.
Show resolved Hide resolved
{
return false;
}

if (valueA == valueB)
{
continue; // exact match
}

valueA |= 0x20u;
if ((uint)(valueA - 'a') > (uint)('z' - 'a'))
{
return false; // not exact match, and first input isn't in [A-Za-z]
}

if (valueA != (valueB | 0x20u))
{
return false;
}
}

return true;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<ushort> Widen(Vector64<byte> bytes)
{
if (AdvSimd.IsSupported)
{
return AdvSimd.ZeroExtendWideningLower(bytes);
}
else
{
(Vector64<ushort> lower, Vector64<ushort> upper) = Vector64.Widen(bytes);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC Vector64 uses the software fallback on x64.
crossgen2 with main-branch shows that too.

I don't know if there are plans or WIP to hw-accelerate Vector64 in the .NET 8 timeframe.
Otherwise maybe read the value as long into a Vector128 and do the widening then. Something like

static Vector128<ushort> LoadAndWiden(ref byte ptr)
{
    if (AdvSimd.IsSupported)
    {
        Vector64<byte> vec64 = Vector64.LoadUnsafe(ref ptr);
        return AdvSimd.ZeroExtendWideningLower(vec64);
    }
    else if (Sse2.IsSupported)
    {
        ulong value = Unsafe.ReadUnaligned<ulong>(ref ptr);
        Vector128<byte> vec = Vector128.CreateScalarUnsafe(value).AsByte();
        return Sse2.UnpackLow(vec, Vector128<byte>.Zero).AsUInt16();
    }
    else
    {
        Vector64<byte> vec64 = Vector64.LoadUnsafe(ref ptr);
        (Vector64<ushort> lower, Vector64<ushort> upper) = Vector64.Widen(vec64);
        return Vector128.Create(lower, upper);
    }
}

return Vector128.Create(lower, upper);
}
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<ushort> Widen(Vector128<byte> bytes)
{
(Vector128<ushort> lower, Vector128<ushort> upper) = Vector128.Widen(bytes);
return Vector256.Create(lower, upper);
}

private static bool VectorContainsNonAsciiChar(Vector64<byte> bytes)
=> !Utf8Utility.AllBytesInUInt64AreAscii(bytes.AsUInt64().ToScalar());
Copy link
Member

@gfoidl gfoidl Apr 17, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same concern here for Vector64. Pass in ulong as Utf8Utility.AllBytesInUInt64AreAscii expects that.
So get rid of Vector64 where possible and use ulong instead?

Edit: the vector is stored to stack, then the ulong read from there. Not ideal, but not too bad.

}
}
4 changes: 4 additions & 0 deletions src/libraries/System.Runtime/ref/System.Runtime.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13904,6 +13904,10 @@ namespace System.Text
{
public static class Ascii
{
public static bool Equals(System.ReadOnlySpan<byte> left, System.ReadOnlySpan<char> right) { throw null; }
public static bool EqualsIgnoreCase(System.ReadOnlySpan<byte> left, System.ReadOnlySpan<byte> right) { throw null; }
public static bool EqualsIgnoreCase(System.ReadOnlySpan<byte> left, System.ReadOnlySpan<char> right) { throw null; }
public static bool EqualsIgnoreCase(System.ReadOnlySpan<char> left, System.ReadOnlySpan<char> right) { throw null; }
public static bool IsValid(System.ReadOnlySpan<byte> value) { throw null; }
public static bool IsValid(System.ReadOnlySpan<char> value) { throw null; }
public static bool IsValid(byte value) { throw null; }
Expand Down