Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ordinal Ignore Case Optimization #40910

Merged
merged 2 commits into from
Aug 17, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/libraries/Common/src/Interop/Interop.Casing.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,8 @@ internal static partial class Globalization

[DllImport(Libraries.GlobalizationNative, CharSet = CharSet.Unicode, EntryPoint = "GlobalizationNative_ChangeCaseTurkish")]
internal static extern unsafe void ChangeCaseTurkish(char* src, int srcLen, char* dstBuffer, int dstBufferCapacity, bool bToUpper);

[DllImport(Libraries.GlobalizationNative, CharSet = CharSet.Unicode, EntryPoint = "GlobalizationNative_InitOrdinalCasingPage")]
internal static extern unsafe void InitOrdinalCasingPage(int pageNumber, char* pTarget);
}
}
9 changes: 0 additions & 9 deletions src/libraries/Common/src/Interop/Interop.Collation.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,6 @@ internal static partial class Globalization
[DllImport(Libraries.GlobalizationNative, CharSet = CharSet.Unicode, EntryPoint = "GlobalizationNative_LastIndexOf")]
internal static extern unsafe int LastIndexOf(IntPtr sortHandle, char* target, int cwTargetLength, char* pSource, int cwSourceLength, CompareOptions options, int* matchLengthPtr);

[DllImport(Libraries.GlobalizationNative, CharSet = CharSet.Unicode, EntryPoint = "GlobalizationNative_IndexOfOrdinalIgnoreCase")]
internal static extern unsafe int IndexOfOrdinalIgnoreCase(string target, int cwTargetLength, char* pSource, int cwSourceLength, bool findLast);

[DllImport(Libraries.GlobalizationNative, CharSet = CharSet.Unicode, EntryPoint = "GlobalizationNative_IndexOfOrdinalIgnoreCase")]
internal static extern unsafe int IndexOfOrdinalIgnoreCase(char* target, int cwTargetLength, char* pSource, int cwSourceLength, bool findLast);

[DllImport(Libraries.GlobalizationNative, CharSet = CharSet.Unicode, EntryPoint = "GlobalizationNative_StartsWith")]
[return: MarshalAs(UnmanagedType.Bool)]
internal static extern unsafe bool StartsWith(IntPtr sortHandle, char* target, int cwTargetLength, char* source, int cwSourceLength, CompareOptions options, int* matchedLength);
Expand All @@ -49,9 +43,6 @@ internal static partial class Globalization
[DllImport(Libraries.GlobalizationNative, CharSet = CharSet.Unicode, EntryPoint = "GlobalizationNative_GetSortKey")]
internal static extern unsafe int GetSortKey(IntPtr sortHandle, char* str, int strLength, byte* sortKey, int sortKeyLength, CompareOptions options);

[DllImport(Libraries.GlobalizationNative, CharSet = CharSet.Unicode, EntryPoint = "GlobalizationNative_CompareStringOrdinalIgnoreCase")]
internal static extern unsafe int CompareStringOrdinalIgnoreCase(char* lpStr1, int cwStr1Len, char* lpStr2, int cwStr2Len);

[DllImport(Libraries.GlobalizationNative, EntryPoint = "GlobalizationNative_GetSortVersion")]
internal static extern int GetSortVersion(IntPtr sortHandle);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ FCFuncStart(gPalGlobalizationNative)
QCFuncElement("ChangeCaseTurkish", GlobalizationNative_ChangeCaseTurkish)
QCFuncElement("CloseSortHandle", GlobalizationNative_CloseSortHandle)
QCFuncElement("CompareString", GlobalizationNative_CompareString)
QCFuncElement("CompareStringOrdinalIgnoreCase", GlobalizationNative_CompareStringOrdinalIgnoreCase)
QCFuncElement("EndsWith", GlobalizationNative_EndsWith)
QCFuncElement("EnumCalendarInfo", GlobalizationNative_EnumCalendarInfo)
QCFuncElement("GetCalendarInfo", GlobalizationNative_GetCalendarInfo)
Expand All @@ -49,8 +48,8 @@ FCFuncStart(gPalGlobalizationNative)
QCFuncElement("GetSortVersion", GlobalizationNative_GetSortVersion)
QCFuncElement("GetTimeZoneDisplayName", GlobalizationNative_GetTimeZoneDisplayName)
QCFuncElement("IndexOf", GlobalizationNative_IndexOf)
QCFuncElement("IndexOfOrdinalIgnoreCase", GlobalizationNative_IndexOfOrdinalIgnoreCase)
QCFuncElement("InitICUFunctions", GlobalizationNative_InitICUFunctions)
QCFuncElement("InitOrdinalCasingPage", GlobalizationNative_InitOrdinalCasingPage)
QCFuncElement("IsNormalized", GlobalizationNative_IsNormalized)
QCFuncElement("IsPredefinedLocale", GlobalizationNative_IsPredefinedLocale)
QCFuncElement("LastIndexOf", GlobalizationNative_LastIndexOf)
Expand Down
18 changes: 18 additions & 0 deletions src/libraries/Native/Unix/System.Globalization.Native/pal_casing.c
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,24 @@ void GlobalizationNative_ChangeCaseTurkish(
}
}

void GlobalizationNative_InitOrdinalCasingPage(int32_t pageNumber, UChar* pTarget)
{
pageNumber <<= 8;
for (int i = 0; i < 256; i++)
{
// Unfortunately, to ensure one-to-one simple mapping we have to call u_toupper on every character.
// Using string casing ICU APIs cannot give such results even when using NULL locale to force root behavior.
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW, this is because Unicode itself doesn't have 1:1 case mapping.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, if we limit the functionality to UnicodeData.txt, it will be 1:1. Yes, I understand that in general Unicode casing is not 1:1.

pTarget[i] = (UChar) u_toupper((UChar32)(pageNumber + i));
}

if (pageNumber == 0x0100)
{
// Disable Turkish I behavior on Ordinal operations
pTarget[0x31] = (UChar)0x0131; // Turkish lowercase i
pTarget[0x7F] = (UChar)0x017F; // 017F;LATIN SMALL LETTER LONG S
}
}

#ifdef __clang__
#pragma clang diagnostic pop
#endif
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@ PALEXPORT void GlobalizationNative_ChangeCaseTurkish(const UChar* lpSrc,
UChar* lpDst,
int32_t cwDstLength,
int32_t bToUpper);

PALEXPORT void GlobalizationNative_InitOrdinalCasingPage(int32_t pageNumber, UChar* pTarget);
103 changes: 2 additions & 101 deletions src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c
Original file line number Diff line number Diff line change
Expand Up @@ -454,7 +454,7 @@ int32_t GlobalizationNative_CompareString(
}
if (lpStr2 == NULL)
{
lpStr2 = &dummyChar;
lpStr2 = &dummyChar;
}

result = ucol_strcoll(pColl, lpStr1, cwStr1Length, lpStr2, cwStr2Length);
Expand Down Expand Up @@ -497,7 +497,7 @@ int32_t GlobalizationNative_IndexOf(

return (result == UCOL_EQUAL) ? 0 : -1;
}

UErrorCode err = U_ZERO_ERROR;
const UCollator* pColl = GetCollatorFromSortHandle(pSortHandle, options, &err);

Expand Down Expand Up @@ -605,61 +605,6 @@ static int AreEqualOrdinalIgnoreCase(UChar32 one, UChar32 two)
return u_toupper(one) == u_toupper(two);
}

/*
Function:
IndexOfOrdinalIgnoreCase

Searches lpSource for lpTarget, comparing code point by code point with
AreEqualOrdinalIgnoreCase. Candidate start positions are visited from the
beginning of lpSource, advancing one code point at a time.

Returns the UTF-16 index in lpSource of the first match, or of the last match
when findLast is non-zero. Returns -1 when no candidate position matches.
*/
int32_t GlobalizationNative_IndexOfOrdinalIgnoreCase(
    const UChar* lpTarget, int32_t cwTargetLength, const UChar* lpSource, int32_t cwSourceLength, int32_t findLast)
{
    // Last source index at which the target could still fit.
    const int32_t lastStart = cwSourceLength - cwTargetLength;
    assert(lastStart >= 0);

    int32_t found = -1;

    for (int32_t start = 0; start <= lastStart; )
    {
        int32_t sourcePos = start;
        int32_t targetPos = 0;
        int32_t mismatch = FALSE;

        // Walk both strings in lock step until the target is exhausted or a pair differs.
        while (!mismatch && targetPos < cwTargetLength)
        {
            UChar32 sourceCodepoint, targetCodepoint;

#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wsign-conversion"
#endif
            U16_NEXT(lpSource, sourcePos, cwSourceLength, sourceCodepoint);
            U16_NEXT(lpTarget, targetPos, cwTargetLength, targetCodepoint);
#ifdef __clang__
#pragma clang diagnostic pop
#endif

            mismatch = !AreEqualOrdinalIgnoreCase(sourceCodepoint, targetCodepoint);
        }

        if (!mismatch)
        {
            if (!findLast)
            {
                // First match is all the caller asked for.
                return start;
            }

            // Remember this match and keep scanning for a later one.
            found = start;
        }

        // Step to the next candidate start position (one code point forward).
        U16_FWD_1(lpSource, start, cwSourceLength);
    }

    return found;
}

/*
collation element is an int used for sorting. It consists of 3 components:
* primary - first 16 bits, representing the base letter
Expand Down Expand Up @@ -934,47 +879,3 @@ int32_t GlobalizationNative_GetSortKey(

return result;
}

/*
Function:
CompareStringOrdinalIgnoreCase

Compares two UTF-16 strings code point by code point, ignoring case by folding
each code point with u_toupper.

Returns -1 if lpStr1 sorts before lpStr2, 1 if it sorts after, and 0 if the two
strings are equal under the fold. When one string is a case-insensitive prefix
of the other, the shorter string sorts first.
*/
int32_t GlobalizationNative_CompareStringOrdinalIgnoreCase(
    const UChar* lpStr1, int32_t cwStr1Length, const UChar* lpStr2, int32_t cwStr2Length)
{
    assert(lpStr1 != NULL);
    assert(cwStr1Length >= 0);
    assert(lpStr2 != NULL);
    assert(cwStr2Length >= 0);

    int32_t str1Idx = 0;
    int32_t str2Idx = 0;

    while (str1Idx < cwStr1Length && str2Idx < cwStr2Length)
    {
        UChar32 str1Codepoint, str2Codepoint;

#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wsign-conversion"
#endif
        U16_NEXT(lpStr1, str1Idx, cwStr1Length, str1Codepoint);
        U16_NEXT(lpStr2, str2Idx, cwStr2Length, str2Codepoint);
#ifdef __clang__
#pragma clang diagnostic pop
#endif

        // Identical code points need no folding; only fold when they differ.
        if (str1Codepoint != str2Codepoint)
        {
            UChar32 str1Upper = u_toupper(str1Codepoint);
            UChar32 str2Upper = u_toupper(str2Codepoint);

            if (str1Upper != str2Upper)
            {
                // Order by the folded values so that the sign of the result is
                // consistent with the equality test above; ordering by the raw
                // code points would make the result depend on the input casing.
                return str1Upper < str2Upper ? -1 : 1;
            }
        }
    }

    // All compared code points were equal under the fold: the shorter string sorts first.
    if (cwStr1Length < cwStr2Length)
    {
        return -1;
    }

    if (cwStr2Length < cwStr1Length)
    {
        return 1;
    }

    return 0;
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,6 @@ PALEXPORT int32_t GlobalizationNative_LastIndexOf(SortHandle* pSortHandle,
int32_t options,
int32_t* pMatchedLength);

PALEXPORT int32_t GlobalizationNative_IndexOfOrdinalIgnoreCase(const UChar* lpTarget,
int32_t cwTargetLength,
const UChar* lpSource,
int32_t cwSourceLength,
int32_t findLast);

PALEXPORT int32_t GlobalizationNative_StartsWith(SortHandle* pSortHandle,
const UChar* lpTarget,
int32_t cwTargetLength,
Expand All @@ -67,8 +61,3 @@ PALEXPORT int32_t GlobalizationNative_GetSortKey(SortHandle* pSortHandle,
uint8_t* sortKey,
int32_t cbSortKeyLength,
int32_t options);

PALEXPORT int32_t GlobalizationNative_CompareStringOrdinalIgnoreCase(const UChar* lpStr1,
int32_t cwStr1Length,
const UChar* lpStr2,
int32_t cwStr2Length);
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
PER_FUNCTION_BLOCK(u_getVersion, libicuuc) \
PER_FUNCTION_BLOCK(u_strlen, libicuuc) \
PER_FUNCTION_BLOCK(u_strncpy, libicuuc) \
PER_FUNCTION_BLOCK(u_strToUpper, libicuuc) \
PER_FUNCTION_BLOCK(u_tolower, libicuuc) \
PER_FUNCTION_BLOCK(u_toupper, libicuuc) \
PER_FUNCTION_BLOCK(ucal_add, libicui18n) \
Expand Down Expand Up @@ -187,6 +188,7 @@ FOR_ALL_ICU_FUNCTIONS
#define u_getVersion(...) u_getVersion_ptr(__VA_ARGS__)
#define u_strlen(...) u_strlen_ptr(__VA_ARGS__)
#define u_strncpy(...) u_strncpy_ptr(__VA_ARGS__)
#define u_strToUpper(...) u_strToUpper_ptr(__VA_ARGS__)
#define u_tolower(...) u_tolower_ptr(__VA_ARGS__)
#define u_toupper(...) u_toupper_ptr(__VA_ARGS__)
#define ucal_add(...) ucal_add_ptr(__VA_ARGS__)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,7 @@ void u_charsToUChars(const char * cs, UChar * us, int32_t length);
void u_getVersion(UVersionInfo versionArray);
int32_t u_strlen(const UChar * s);
UChar * u_strncpy(UChar * dst, const UChar * src, int32_t n);
int32_t u_strToUpper (UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, UErrorCode *pErrorCode);
UChar32 u_tolower(UChar32 c);
UChar32 u_toupper(UChar32 c);
void ucal_add(UCalendar * cal, UCalendarDateFields field, int32_t amount, UErrorCode * status);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,8 @@
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\Normalization.Nls.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\NumberFormatInfo.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\NumberStyles.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\Ordinal.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\OrdinalCasing.Icu.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\PersianCalendar.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\RegionInfo.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\SortKey.cs" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,135 +35,6 @@ private void IcuInitSortHandle()
}
}

/// <summary>
/// Finds an ordinal occurrence of <paramref name="value"/> inside <paramref name="source"/>.
/// </summary>
/// <param name="source">The text to search.</param>
/// <param name="value">The text to look for; callers must pass a non-empty span.</param>
/// <param name="ignoreCase">
/// When <see langword="true"/>, the search is delegated to the native
/// <c>IndexOfOrdinalIgnoreCase</c> routine; otherwise characters are compared exactly.
/// </param>
/// <param name="fromBeginning">
/// <see langword="true"/> to return the first occurrence, <see langword="false"/> to return the last one.
/// </param>
/// <returns>The index of the match in <paramref name="source"/>, or -1 if there is none.</returns>
private static unsafe int IcuIndexOfOrdinalCore(ReadOnlySpan<char> source, ReadOnlySpan<char> value, bool ignoreCase, bool fromBeginning)
{
    Debug.Assert(!GlobalizationMode.Invariant);
    Debug.Assert(!GlobalizationMode.UseNls);
    Debug.Assert(!value.IsEmpty);

    // Ordinal (non-linguistic) comparisons require the length of the target string to be no greater
    // than the length of the search space. Since our caller already checked for empty target strings,
    // the below check also handles the case of empty search space strings.

    if (source.Length < value.Length)
    {
        return -1;
    }

    Debug.Assert(!source.IsEmpty);

    if (ignoreCase)
    {
        fixed (char* pSource = &MemoryMarshal.GetReference(source))
        fixed (char* pValue = &MemoryMarshal.GetReference(value))
        {
            return Interop.Globalization.IndexOfOrdinalIgnoreCase(pValue, value.Length, pSource, source.Length, findLast: !fromBeginning);
        }
    }

    // Exact (case-sensitive) search: the span helpers perform the same element-wise
    // comparison as a hand-written scan and return the first / last match respectively.
    return fromBeginning ? source.IndexOf(value) : source.LastIndexOf(value);
}

/// <summary>
/// Finds the last ordinal occurrence of <paramref name="value"/> within the
/// <paramref name="count"/> characters of <paramref name="source"/> that end at <paramref name="startIndex"/>.
/// </summary>
/// <param name="source">The string to search.</param>
/// <param name="value">The string to look for.</param>
/// <param name="startIndex">The index in <paramref name="source"/> at which the backward search starts.</param>
/// <param name="count">The number of characters, ending at <paramref name="startIndex"/>, to examine.</param>
/// <param name="ignoreCase">
/// When <see langword="true"/>, the search is delegated to the native
/// <c>IndexOfOrdinalIgnoreCase</c> routine; otherwise characters are compared exactly.
/// </param>
/// <returns>
/// <paramref name="startIndex"/> if <paramref name="value"/> is empty; otherwise the index in
/// <paramref name="source"/> of the last match, or -1 if there is none.
/// </returns>
private static unsafe int IcuLastIndexOfOrdinalCore(string source, string value, int startIndex, int count, bool ignoreCase)
{
    Debug.Assert(!GlobalizationMode.Invariant);
    Debug.Assert(!GlobalizationMode.UseNls);

    Debug.Assert(source != null);
    Debug.Assert(value != null);

    if (value.Length == 0)
    {
        return startIndex;
    }

    if (count < value.Length)
    {
        return -1;
    }

    // startIndex is the index into source where we start search backwards from.
    // leftStartIndex is the index into source of the start of the string that is
    // count characters away from startIndex.
    int leftStartIndex = startIndex - count + 1;

    // Index of the last match relative to leftStartIndex, or -1.
    int lastIndex;

    if (ignoreCase)
    {
        fixed (char* pSource = source)
        {
            lastIndex = Interop.Globalization.IndexOfOrdinalIgnoreCase(value, value.Length, pSource + leftStartIndex, count, findLast: true);
        }
    }
    else
    {
        // Exact (case-sensitive) search restricted to the [leftStartIndex, startIndex] window.
        lastIndex = source.AsSpan(leftStartIndex, count).LastIndexOf(value.AsSpan());
    }

    // Translate the window-relative index back into an index into source.
    return lastIndex != -1 ?
        leftStartIndex + lastIndex :
        -1;
}

/// <summary>
/// Pins both character buffers and forwards them to the native
/// <c>CompareStringOrdinalIgnoreCase</c> routine.
/// </summary>
/// <param name="string1">Reference to the first character of the first buffer.</param>
/// <param name="count1">Number of characters in the first buffer; asserted to be positive.</param>
/// <param name="string2">Reference to the first character of the second buffer.</param>
/// <param name="count2">Number of characters in the second buffer; asserted to be positive.</param>
/// <returns>The value returned by the native comparison.</returns>
private static unsafe int IcuCompareStringOrdinalIgnoreCase(ref char string1, int count1, ref char string2, int count2)
{
    Debug.Assert(!GlobalizationMode.Invariant);
    Debug.Assert(!GlobalizationMode.UseNls);

    Debug.Assert(count1 > 0);
    Debug.Assert(count2 > 0);

    int comparison;

    // Keep both buffers pinned for the duration of the native call.
    fixed (char* char1 = &string1)
    {
        fixed (char* char2 = &string2)
        {
            Debug.Assert(char1 != null);
            Debug.Assert(char2 != null);
            comparison = Interop.Globalization.CompareStringOrdinalIgnoreCase(char1, count1, char2, count2);
        }
    }

    return comparison;
}

private unsafe int IcuCompareString(ReadOnlySpan<char> string1, ReadOnlySpan<char> string2, CompareOptions options)
{
Debug.Assert(!GlobalizationMode.Invariant);
Expand Down