diff --git a/src/coreclr/pal/tests/palsuite/locale_info/MultiByteToWideChar/test4/test4.cpp b/src/coreclr/pal/tests/palsuite/locale_info/MultiByteToWideChar/test4/test4.cpp index cab71f15e7098e..53a45ed008a491 100644 --- a/src/coreclr/pal/tests/palsuite/locale_info/MultiByteToWideChar/test4/test4.cpp +++ b/src/coreclr/pal/tests/palsuite/locale_info/MultiByteToWideChar/test4/test4.cpp @@ -11,6 +11,7 @@ **==========================================================================*/ #include +#include PALTEST(locale_info_MultiByteToWideChar_test4_paltest_multibytetowidechar_test4, "locale_info/MultiByteToWideChar/test4/paltest_multibytetowidechar_test4") { @@ -222,6 +223,41 @@ PALTEST(locale_info_MultiByteToWideChar_test4_paltest_multibytetowidechar_test4, free(wideBuffer); } + +#if BIGENDIAN + { + const char* ascii = "ABCDEFGHIJKLMNOPQRSTUVWXYZ123456"; + const size_t asciiLengthWithTerminator = strlen(ascii) + 1; + const unsigned int flags = MINIPAL_TREAT_AS_LITTLE_ENDIAN; + + size_t requiredLength = minipal_get_length_utf8_to_utf16(ascii, asciiLengthWithTerminator, flags); + if (requiredLength != asciiLengthWithTerminator) + { + Fail("minipal_get_length_utf8_to_utf16 with MINIPAL_TREAT_AS_LITTLE_ENDIAN returned %zu, expected %zu\n", + requiredLength, asciiLengthWithTerminator); + } + + WCHAR wideBuffer[64]; + size_t convertedLength = minipal_convert_utf8_to_utf16(ascii, asciiLengthWithTerminator, + (CHAR16_T*)wideBuffer, sizeof(wideBuffer) / sizeof(wideBuffer[0]), flags); + + if (convertedLength != asciiLengthWithTerminator) + { + Fail("minipal_convert_utf8_to_utf16 with MINIPAL_TREAT_AS_LITTLE_ENDIAN returned %zu, expected %zu\n", + convertedLength, asciiLengthWithTerminator); + } + + for (size_t i = 0; i < asciiLengthWithTerminator; i++) + { + WCHAR expected = (WCHAR)(((unsigned char)ascii[i]) << 8); + if (wideBuffer[i] != expected) + { + Fail("minipal_convert_utf8_to_utf16 with MINIPAL_TREAT_AS_LITTLE_ENDIAN mismatch at %zu: got 0x%04x expected 0x%04x\n", + i, 
wideBuffer[i], expected); + } + } + } +#endif PAL_Terminate(); diff --git a/src/coreclr/pal/tests/palsuite/locale_info/WideCharToMultiByte/test5/test5.cpp b/src/coreclr/pal/tests/palsuite/locale_info/WideCharToMultiByte/test5/test5.cpp index bf2dabedefa880..697730b9dbd026 100644 --- a/src/coreclr/pal/tests/palsuite/locale_info/WideCharToMultiByte/test5/test5.cpp +++ b/src/coreclr/pal/tests/palsuite/locale_info/WideCharToMultiByte/test5/test5.cpp @@ -11,6 +11,7 @@ **==========================================================================*/ #include +#include PALTEST(locale_info_WideCharToMultiByte_test5_paltest_widechartomultibyte_test5, "locale_info/WideCharToMultiByte/test5/paltest_widechartomultibyte_test5") { @@ -146,6 +147,42 @@ PALTEST(locale_info_WideCharToMultiByte_test5_paltest_widechartomultibyte_test5, free(utf8Buffer); } + +#if BIGENDIAN + { + const char* expected = "ABCDEFGHIJKLMNOPQRSTUVWXYZ123456"; + const size_t expectedLengthWithTerminator = strlen(expected) + 1; + const unsigned int flags = MINIPAL_TREAT_AS_LITTLE_ENDIAN; + + WCHAR littleEndianWide[64]; + for (size_t i = 0; i < expectedLengthWithTerminator; i++) + { + littleEndianWide[i] = (WCHAR)(((unsigned char)expected[i]) << 8); + } + + size_t requiredLength = minipal_get_length_utf16_to_utf8((const CHAR16_T*)littleEndianWide, + expectedLengthWithTerminator, flags); + if (requiredLength != expectedLengthWithTerminator) + { + Fail("minipal_get_length_utf16_to_utf8 with MINIPAL_TREAT_AS_LITTLE_ENDIAN returned %zu, expected %zu\n", + requiredLength, expectedLengthWithTerminator); + } + + CHAR utf8Buffer[64]; + size_t convertedLength = minipal_convert_utf16_to_utf8((const CHAR16_T*)littleEndianWide, + expectedLengthWithTerminator, utf8Buffer, sizeof(utf8Buffer), flags); + if (convertedLength != expectedLengthWithTerminator) + { + Fail("minipal_convert_utf16_to_utf8 with MINIPAL_TREAT_AS_LITTLE_ENDIAN returned %zu, expected %zu\n", + convertedLength, expectedLengthWithTerminator); + } + + if 
(memcmp(utf8Buffer, expected, expectedLengthWithTerminator) != 0) + { + Fail("minipal_convert_utf16_to_utf8 with MINIPAL_TREAT_AS_LITTLE_ENDIAN produced unexpected bytes\n"); + } + } +#endif PAL_Terminate(); diff --git a/src/native/minipal/utf8.c b/src/native/minipal/utf8.c index 6580d20b201ea1..e0b25fce415473 100644 --- a/src/native/minipal/utf8.c +++ b/src/native/minipal/utf8.c @@ -981,6 +981,11 @@ static size_t GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCoun // Didn't throw, just use this buffer size. break; } +#if BIGENDIAN + if (self->treatAsLE) + *pTarget = ((CHAR16_T)ch)<<8; + else +#endif *pTarget = (CHAR16_T)ch; ENSURE_BUFFER_INC @@ -1008,6 +1013,11 @@ static size_t GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCoun if (ch > 0x7F) goto ProcessChar; +#if BIGENDIAN + if (self->treatAsLE) + *pTarget = ((CHAR16_T)ch)<<8; + else +#endif *pTarget = (CHAR16_T)ch; ENSURE_BUFFER_INC } @@ -1031,7 +1041,11 @@ static size_t GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCoun pSrc++; if (ch > 0x7F) goto LongCode; - +#if BIGENDIAN + if (self->treatAsLE) + *pTarget = ((CHAR16_T)ch)<<8; + else +#endif *pTarget = (CHAR16_T)ch; ENSURE_BUFFER_INC @@ -1041,7 +1055,11 @@ static size_t GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCoun ch = *pSrc; pSrc++; if (ch > 0x7F) goto LongCode; - +#if BIGENDIAN + if (self->treatAsLE) + *pTarget = ((CHAR16_T)ch)<<8; + else +#endif *pTarget = (CHAR16_T)ch; ENSURE_BUFFER_INC } @@ -1071,10 +1089,17 @@ static size_t GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCoun else #endif { +#if BIGENDIAN + *pTarget = ((CHAR16_T)((ch >> 8) & 0x7F))<<8; + pSrc += 2; + *(pTarget + 1) = ((CHAR16_T)(ch & 0x7F))<<8; + pTarget += 2; +#else *pTarget = (CHAR16_T)(ch & 0x7F); pSrc += 2; *(pTarget + 1) = (CHAR16_T)((ch >> 8) & 0x7F); pTarget += 2; +#endif } } @@ -1109,6 +1134,18 @@ static size_t GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCoun else #endif { +#if 
BIGENDIAN + *pTarget = ((CHAR16_T)((ch >> 24) & 0x7F)) << 8; + *(pTarget + 1) = ((CHAR16_T)((ch >> 16) & 0x7F)) << 8; + *(pTarget + 2) = ((CHAR16_T)((ch >> 8) & 0x7F)) << 8; + *(pTarget + 3) = ((CHAR16_T)(ch & 0x7F)) << 8; + pSrc += 8; + *(pTarget + 4) = ((CHAR16_T)((chb >> 24) & 0x7F)) << 8; + *(pTarget + 5) = ((CHAR16_T)((chb >> 16) & 0x7F)) << 8; + *(pTarget + 6) = ((CHAR16_T)((chb >> 8) & 0x7F)) << 8; + *(pTarget + 7) = ((CHAR16_T)(chb & 0x7F)) << 8; + pTarget += 8; +#else *pTarget = (CHAR16_T)(ch & 0x7F); *(pTarget + 1) = (CHAR16_T)((ch >> 8) & 0x7F); *(pTarget + 2) = (CHAR16_T)((ch >> 16) & 0x7F); @@ -1119,6 +1156,7 @@ static size_t GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCoun *(pTarget + 6) = (CHAR16_T)((chb >> 16) & 0x7F); *(pTarget + 7) = (CHAR16_T)((chb >> 24) & 0x7F); pTarget += 8; +#endif } } break; @@ -1141,6 +1179,11 @@ static size_t GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCoun pSrc++; if (ch <= 0x7F) { +#if BIGENDIAN + if (self->treatAsLE) + *pTarget = ((CHAR16_T)ch)<<8; + else +#endif *pTarget = (CHAR16_T)ch; ENSURE_BUFFER_INC continue; @@ -1393,6 +1436,11 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un // read next char. 
The JIT optimization seems to be getting confused when // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead +#if BIGENDIAN + if (self->treatAsLE) + ch = (*pSrc) >> 8; + else +#endif ch = *pSrc; pSrc++; @@ -1531,6 +1579,11 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un CHAR16_T* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered while (pSrc < pLocalEnd) { +#if BIGENDIAN + if (self->treatAsLE) + ch = (*pSrc) >> 8; + else +#endif ch = *pSrc; pSrc++; @@ -1564,6 +1617,11 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un while (pSrc < pStop) { +#if BIGENDIAN + if (self->treatAsLE) + ch = (*pSrc) >> 8; + else +#endif ch = *pSrc; pSrc++; @@ -1575,6 +1633,11 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un // get pSrc aligned if (((size_t)pSrc & 0x2) != 0) { +#if BIGENDIAN + if (self->treatAsLE) + ch = (*pSrc) >> 8; + else +#endif ch = *pSrc; pSrc++; if (ch > 0x7F) goto LongCode; @@ -1588,9 +1651,14 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un { ch = *(int*)pSrc; int chc = *(int*)(pSrc + 2); - +#if BIGENDIAN + if (self->treatAsLE){ + if (((ch | chc) & (int)0x80FF80FF) != 0) goto LongCodeWithMask; + } + else // host-order input uses the regular ASCII mask test that follows the #endif +#endif if (((ch | chc) & (int)0xFF80FF80) != 0) goto LongCodeWithMask; if (pTarget + 4 > pAllocatedBufferEnd) { errno = MINIPAL_ERROR_INSUFFICIENT_BUFFER; @@ -1611,12 +1679,21 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un else #endif { +#if BIGENDIAN + *(pTarget)= (unsigned char)((ch >> 16) >> 8); + *(pTarget+1) = (unsigned char)(ch >> 8); + pSrc += 4; + *(pTarget + 2) = (unsigned char)((chc >> 16) >> 8); + *(pTarget + 3) = (unsigned char)(chc >> 8); + pTarget += 4; +#else *pTarget = (unsigned char)ch; *(pTarget + 1) = (unsigned char)(ch >> 16); pSrc += 4; *(pTarget + 2) = (unsigned char)chc; *(pTarget + 3) = (unsigned char)(chc >> 16); pTarget += 4; 
+#endif } } continue; @@ -1787,7 +1864,13 @@ static size_t GetByteCount(UTF8Encoding* self, CHAR16_T *chars, size_t count) // use separate helper variables for local contexts so that the jit optimizations // won't get confused about the variable lifetimes - int cha = *pSrc; + int cha; +#if BIGENDIAN + if (self->treatAsLE) + cha = (*pSrc) >> 8; + else +#endif + cha = *pSrc; // count the pending surrogate byteCount++; @@ -1825,6 +1908,11 @@ static size_t GetByteCount(UTF8Encoding* self, CHAR16_T *chars, size_t count) // read next char. The JIT optimization seems to be getting confused when // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead +#if BIGENDIAN + if (self->treatAsLE) + ch = (*pSrc) >> 8; + else +#endif ch = *pSrc; pSrc++; @@ -1900,8 +1988,13 @@ static size_t GetByteCount(UTF8Encoding* self, CHAR16_T *chars, size_t count) CHAR16_T* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered while (pSrc < pLocalEnd) { - ch = *pSrc; - pSrc++; +#if BIGENDIAN + if (self->treatAsLE) + ch = (*pSrc) >> 8; + else +#endif + ch = *pSrc; + pSrc++; if (ch > 0x7F) goto ProcessChar; } @@ -1922,6 +2015,11 @@ static size_t GetByteCount(UTF8Encoding* self, CHAR16_T *chars, size_t count) while (pSrc < pStop) { +#if BIGENDIAN + if (self->treatAsLE) + ch = (*pSrc) >> 8; + else +#endif ch = *pSrc; pSrc++; @@ -1939,6 +2037,11 @@ static size_t GetByteCount(UTF8Encoding* self, CHAR16_T *chars, size_t count) // get pSrc aligned if (((size_t)pSrc & 0x2) != 0) { +#if BIGENDIAN + if (self->treatAsLE) + ch = (*pSrc) >> 8; + else +#endif ch = *pSrc; pSrc++; if (ch > 0x7F) // Not ASCII @@ -1958,6 +2061,13 @@ static size_t GetByteCount(UTF8Encoding* self, CHAR16_T *chars, size_t count) { ch = *(int*)pSrc; int chc = *(int*)(pSrc + 2); +#if BIGENDIAN + if (self->treatAsLE) + { + ch = ((((ch) & 0xff000000u) >> 24) | (((ch) & 0x00ff0000u) >> 8) | (((ch) & 0x0000ff00u) << 8) | (((ch) & 0x000000ffu) << 24)); + chc = ((((chc) & 0xff000000u) >> 24) | (((chc) & 0x00ff0000u) >> 8) 
| (((chc) & 0x0000ff00u) << 8) | (((chc) & 0x000000ffu) << 24)); + } +#endif if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII { if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte @@ -1979,6 +2089,14 @@ static size_t GetByteCount(UTF8Encoding* self, CHAR16_T *chars, size_t count) ch = *(int*)pSrc; chc = *(int*)(pSrc + 2); +#if BIGENDIAN + if (self->treatAsLE) + { + ch = ((((ch) & 0xff000000u) >> 24) | (((ch) & 0x00ff0000u) >> 8) | (((ch) & 0x0000ff00u) << 8) | (((ch) & 0x000000ffu) << 24)); + chc = ((((chc) & 0xff000000u) >> 24) | (((chc) & 0x00ff0000u) >> 8) | (((chc) & 0x0000ff00u) << 8) | (((chc) & 0x000000ffu) << 24)); + } +#endif + if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII { if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte @@ -2002,10 +2120,17 @@ static size_t GetByteCount(UTF8Encoding* self, CHAR16_T *chars, size_t count) LongCodeWithMask: #if BIGENDIAN // be careful about the sign extension - if (!self->treatAsLE) ch = (int)(((unsigned int)ch) >> 16); + if (self->treatAsLE) + { + ch = (CHAR16_T)ch; // ch was already swapped to host order by the unrolled read, so the first code unit is its low half + } else -#endif + { + ch = (int)(((unsigned int)ch) >> 16); + } +#else ch = (CHAR16_T)ch; +#endif pSrc++; @@ -2022,8 +2147,13 @@ static size_t GetByteCount(UTF8Encoding* self, CHAR16_T *chars, size_t count) if (InRange(ch, HIGH_SURROGATE_START, LOW_SURROGATE_END)) { // 4 byte encoding - high surrogate + low surrogate - - int chd = *pSrc; + int chd; +#if BIGENDIAN + if (self->treatAsLE) + chd = (CHAR16_T)(((*pSrc) >> 8) | ((*pSrc) << 8)); // full 16-bit swap: ">> 8" alone drops the unit's high byte, so the low-surrogate test below could never match + else +#endif + chd = *pSrc; if ( ch > HIGH_SURROGATE_END || !InRange(chd, LOW_SURROGATE_START, LOW_SURROGATE_END))