-
Notifications
You must be signed in to change notification settings - Fork 4.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[API Proposal]: Introduce new string NormalizationForm for NFKC_Casefold #102269
Comments
Tagging subscribers to this area: @dotnet/area-system-globalization |
This is currently possible via icu-dotnet, but requires another fairly heavy dependency. |
Another implementation is in ICU4N |
@tarekgh do you have any recommendation for a .NET 8 project that needs an NFKC_Casefold implementation? Are there any other options aside from icu-dotnet and ICU4N (which is still in alpha)? |
The only way to do it without using ICU4N or icu-dotnet is to call the ICU library manually. The challenge is getting the correct ICU library name loaded in the process. One way to work around this is to use app-local ICU, which carries ICU with your app, so you know the exact library name ahead of time. Or, if you are running on Windows and not using app-local ICU, Windows ships the ICU library under the name `icu` (icu.dll), which the following sample loads:
public unsafe class Program
{
// Managed mirror of the ICU error/status codes passed to and from the native calls below.
// Per the member comments: negative values are warnings / informational results
// (semantically successful), zero means no error, and positive values indicate failure.
// The numeric values are part of the native contract and must not be changed.
// Note: U_USING_FALLBACK_WARNING and U_ERROR_WARNING_START intentionally share the value -128.
private enum UErrorCode
{
U_USING_FALLBACK_WARNING = -128, // A resource bundle lookup returned a fallback result (not an error)
U_ERROR_WARNING_START = -128, // Start of information results (semantically successful)
U_USING_DEFAULT_WARNING = -127, // A resource bundle lookup returned a result from the root locale (not an error)
U_SAFECLONE_ALLOCATED_WARNING = -126, // A SafeClone operation required allocating memory (informational only)
U_STATE_OLD_WARNING = -125, // ICU has to use compatibility layer to construct the service. Expect performance/memory usage degradation. Consider upgrading
U_STRING_NOT_TERMINATED_WARNING = -124, // An output string could not be NUL-terminated because output length==destCapacity.
U_SORT_KEY_TOO_SHORT_WARNING = -123, // Number of levels requested in getBound is higher than the number of levels in the sort key
U_AMBIGUOUS_ALIAS_WARNING = -122, // This converter alias can go to different converter implementations
U_DIFFERENT_UCA_VERSION = -121, // ucol_open encountered a mismatch between UCA version and collator image version, so the collator was constructed from rules. No impact to further function
U_PLUGIN_CHANGED_LEVEL_WARNING = -120, // A plugin caused a level change. May not be an error, but later plugins may not load.
U_ZERO_ERROR = 0, // No error, no warning.
U_ILLEGAL_ARGUMENT_ERROR = 1, // Start of codes indicating failure
U_MISSING_RESOURCE_ERROR = 2, // The requested resource cannot be found
U_INVALID_FORMAT_ERROR = 3, // Data format is not what is expected
U_FILE_ACCESS_ERROR = 4, // The requested file cannot be found
U_INTERNAL_PROGRAM_ERROR = 5, // Indicates a bug in the library code
U_MESSAGE_PARSE_ERROR = 6, // Unable to parse a message (message format)
U_MEMORY_ALLOCATION_ERROR = 7, // Memory allocation error
U_INDEX_OUTOFBOUNDS_ERROR = 8, // Trying to access the index that is out of bounds
U_PARSE_ERROR = 9, // Equivalent to Java ParseException
U_INVALID_CHAR_FOUND = 10, // Character conversion: Unmappable input sequence. In other APIs: Invalid character.
U_TRUNCATED_CHAR_FOUND = 11, // Character conversion: Incomplete input sequence.
U_ILLEGAL_CHAR_FOUND = 12, // Character conversion: Illegal input sequence/combination of input units.
U_INVALID_TABLE_FORMAT = 13, // Conversion table file found, but corrupted
U_INVALID_TABLE_FILE = 14, // Conversion table file not found
U_BUFFER_OVERFLOW_ERROR = 15, // A result would not fit in the supplied buffer
U_UNSUPPORTED_ERROR = 16, // Requested operation not supported in current context
U_RESOURCE_TYPE_MISMATCH = 17, // An operation is requested over a resource that does not support it
U_ILLEGAL_ESCAPE_SEQUENCE = 18, // ISO-2022 illegal escape sequence
U_UNSUPPORTED_ESCAPE_SEQUENCE = 19, // ISO-2022 unsupported escape sequence
U_NO_SPACE_AVAILABLE = 20, // No space available for in-buffer expansion for Arabic shaping
U_CE_NOT_FOUND_ERROR = 21, // Currently used only while setting variable top, but can be used generally
U_PRIMARY_TOO_LONG_ERROR = 22, // User tried to set variable top to a primary that is longer than two bytes
U_STATE_TOO_OLD_ERROR = 23, // ICU cannot construct a service from this state, as it is no longer supported
U_TOO_MANY_ALIASES_ERROR = 24, // There are too many aliases in the path to the requested resource.
// It is very possible that a circular alias definition has occurred
U_ENUM_OUT_OF_SYNC_ERROR = 25, // UEnumeration out of sync with underlying collection
U_INVARIANT_CONVERSION_ERROR = 26, // Unable to convert a UChar* string to char* with the invariant converter.
U_INVALID_STATE_ERROR = 27, // Requested operation can not be completed with ICU in its current state
U_COLLATOR_VERSION_MISMATCH = 28, // Collator version is not compatible with the base version
U_USELESS_COLLATOR_ERROR = 29, // Collator is options only and no base is specified
U_NO_WRITE_PERMISSION = 30, // Attempt to modify read-only or constant data.
}
/// <summary>
/// Managed signature for ICU's <c>unorm2_getNFKCCasefoldInstance</c>.
/// Returns an opaque <c>const UNormalizer2*</c>; the status is written through <paramref name="pErrorCode"/>.
/// </summary>
/// <remarks>
/// ICU exports use the C (cdecl) calling convention. Without this attribute the delegate
/// defaults to the platform convention, which is stdcall on 32-bit Windows and would not match.
/// </remarks>
[UnmanagedFunctionPointer(CallingConvention.Cdecl)]
private delegate void* Unorm2_getNFKCCasefoldInstanceDelegate(UErrorCode* pErrorCode);

/// <summary>
/// Managed signature for ICU's <c>unorm2_normalize</c>.
/// </summary>
/// <param name="norm2">Opaque <c>const UNormalizer2*</c> obtained from ICU.</param>
/// <param name="src">Pointer to the UTF-16 source text.</param>
/// <param name="length">Number of UTF-16 code units in <paramref name="src"/>.</param>
/// <param name="dest">Pointer to the UTF-16 destination buffer.</param>
/// <param name="capacity">Size of <paramref name="dest"/> in UTF-16 code units.</param>
/// <param name="pErrorCode">In/out ICU status code.</param>
/// <returns>The length of the normalized text as reported by ICU.</returns>
[UnmanagedFunctionPointer(CallingConvention.Cdecl)]
private delegate int Unorm2_normalizeDelegate(/*const UNormalizer2*/ void* norm2, char* src, int length, char* dest, int capacity, UErrorCode* pErrorCode);
/// <summary>
/// Sample entry point: loads the native ICU library, resolves the NFKC_Casefold normalizer
/// and the <c>unorm2_normalize</c> export, normalizes a fixed test string and prints the result.
/// Progress and failures are reported on the console; nothing is thrown for missing exports.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
public static void Main(string[] args)
{
    const string Input = "Hello World";
    const int BufferCapacity = 100;

    if (!NativeLibrary.TryLoad("icu", out IntPtr icuHandle))
    {
        // Report the failure instead of exiting silently.
        Console.WriteLine($"ICU is NOT Loaded");
        return;
    }

    try
    {
        Console.WriteLine($"ICU is Loaded");

        if (!NativeLibrary.TryGetExport(icuHandle, "unorm2_getNFKCCasefoldInstance", out IntPtr unorm2_getNFKCCasefoldInstancePtr))
        {
            Console.WriteLine($"unorm2_getNFKCCasefoldInstance is NOT Loaded");
            return;
        }

        Console.WriteLine($"unorm2_getNFKCCasefoldInstance is Loaded");
        Unorm2_getNFKCCasefoldInstanceDelegate getNfkcCasefoldInstance =
            Marshal.GetDelegateForFunctionPointer<Unorm2_getNFKCCasefoldInstanceDelegate>(unorm2_getNFKCCasefoldInstancePtr);

        UErrorCode errorCode = UErrorCode.U_ZERO_ERROR;
        void* uNormalizer2 = getNfkcCasefoldInstance(&errorCode);
        Console.WriteLine($"errorCode: {errorCode} ... uNormalizer2: {(long)uNormalizer2}");

        // Values above U_ZERO_ERROR are failures (negative values are only warnings).
        // Never hand a null or failed normalizer instance to unorm2_normalize.
        if (errorCode > UErrorCode.U_ZERO_ERROR || uNormalizer2 == null)
        {
            Console.WriteLine($"unorm2_getNFKCCasefoldInstance failed");
            return;
        }

        if (!NativeLibrary.TryGetExport(icuHandle, "unorm2_normalize", out IntPtr unorm2_normalizePtr))
        {
            Console.WriteLine($"unorm2_normalize is NOT Loaded");
            return;
        }

        Console.WriteLine($"unorm2_normalize is Loaded");
        Unorm2_normalizeDelegate unorm2_normalize =
            Marshal.GetDelegateForFunctionPointer<Unorm2_normalizeDelegate>(unorm2_normalizePtr);

        char* buffer = stackalloc char[BufferCapacity];
        fixed (char* src = Input)
        {
            int normalizedLen = unorm2_normalize(uNormalizer2, src, Input.Length, buffer, BufferCapacity, &errorCode);

            // On failure (e.g. U_BUFFER_OVERFLOW_ERROR) the reported length can exceed the
            // buffer capacity; building a string from it would read past the stack buffer.
            if (errorCode > UErrorCode.U_ZERO_ERROR || normalizedLen < 0 || normalizedLen > BufferCapacity)
            {
                Console.WriteLine($"normalizedLen: {normalizedLen} ... errorCode: {errorCode} ... normalization failed");
                return;
            }

            Console.WriteLine($"normalizedLen: {normalizedLen} ... errorCode: {errorCode} ... Normalized: {new string(buffer, 0, normalizedLen)}");
            Console.WriteLine($"");
        }
    }
    finally
    {
        // Release the handle obtained from TryLoad; no ICU pointers are used past this point.
        NativeLibrary.Free(icuHandle);
    }
}
}
Note: this code is just a sample to show how you can do it. Use it at your own risk, as I haven't tested it or tried it on different platforms. |
Thank you @tarekgh |
Background and motivation
NFKC normalization is very often performed along with casefolding.
http://www.unicode.org/reports/tr36/
The NFKC_Casefold algorithm is already implemented in ICU as another form of Unicode normalization:
I think introducing an NFKC_Casefold method would be complementary to the currently planned casefolding support in .NET.
API Proposal
API Usage
Alternative Designs
No response
Risks
No response
The text was updated successfully, but these errors were encountered: