Permalink
Browse files

wxWidgets Utf8 conversion could not handle strings that where not nul…

…l terminated. Fixed with backport.
  • Loading branch information...
1 parent 0b997e2 commit c5dcb04f69fbd079c874b38e9f5b6b6d5ecf83de Alexander Stigsen committed Aug 10, 2009
Showing with 161 additions and 2 deletions.
  1. +159 −2 src/Utf.cpp
  2. +2 −0 src/Utf.h
View
161 src/Utf.cpp
@@ -103,10 +103,167 @@ size_t ConvertFromUTF8toString(const wxCharBuffer& utf8_buff, size_t utf8_buff_l
wxChar* buff = text.GetWriteBuf(utf8_buff_len);
// Convert to widechar
- const size_t wchar_len = wxConvUTF8.MB2WC(buff, utf8_buff, utf8_buff_len);
- if (wchar_len == (size_t)-1) return (size_t)-1; // invalid conversion
+ const size_t wchar_len = UTF8ToWChar(buff, utf8_buff_len, utf8_buff, utf8_buff_len);
+ if (wchar_len == wxCONV_FAILED) return wxCONV_FAILED; // invalid conversion
text.UngetWriteBuf(wchar_len);
return wchar_len;
}
+
+// this table gives the length of the UTF-8 encoding from its first character:
+const unsigned char tableUtf8Lengths[256] = {
+ // single-byte sequences (ASCII):
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00..0F
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10..1F
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20..2F
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30..3F
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50..5F
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60..6F
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70..7F
+
+ // these are invalid:
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80..8F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 90..9F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A0..AF
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0..BF
+ 0, 0, // C0,C1
+
+ // two-byte sequences:
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C2..CF
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0..DF
+
+ // three-byte sequences:
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0..EF
+
+ // four-byte sequences:
+ 4, 4, 4, 4, 4, // F0..F4
+
+ // these are invalid again (5- or 6-byte
+ // sequences and sequences for code points
+ // above U+10FFFF, as restricted by RFC 3629):
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F5..FF
+};
+
+
+// Backport of wxMBConvStrictUTF8::ToWChar
+size_t UTF8ToWChar(wchar_t *dst, size_t dstLen, const char *src, size_t srcLen) { // static
+ wchar_t *out = dstLen ? dst : NULL;
+ size_t written = 0;
+
+ if ( srcLen == wxNO_LEN )
+ srcLen = strlen(src) + 1;
+
+ for ( const char *p = src; ; p++ )
+ {
+ if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
+ {
+ // all done successfully, just add the trailing NULL if we are not
+ // using explicit length
+ if ( srcLen == wxNO_LEN )
+ {
+ if ( out )
+ {
+ if ( !dstLen )
+ break;
+
+ *out = L'\0';
+ }
+
+ written++;
+ }
+
+ return written;
+ }
+
+ if ( out && !dstLen-- )
+ break;
+
+ wxUint32 code;
+ unsigned char c = *p;
+
+ if ( c < 0x80 )
+ {
+ if ( srcLen == 0 ) // the test works for wxNO_LEN too
+ break;
+
+ if ( srcLen != wxNO_LEN )
+ srcLen--;
+
+ code = c;
+ }
+ else
+ {
+ unsigned len = tableUtf8Lengths[c];
+ if ( !len )
+ break;
+
+ if ( srcLen < len ) // the test works for wxNO_LEN too
+ break;
+
+ if ( srcLen != wxNO_LEN )
+ srcLen -= len;
+
+ // Char. number range | UTF-8 octet sequence
+ // (hexadecimal) | (binary)
+ // ----------------------+----------------------------------------
+ // 0000 0000 - 0000 007F | 0xxxxxxx
+ // 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
+ // 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+ // 0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ //
+ // Code point value is stored in bits marked with 'x',
+ // lowest-order bit of the value on the right side in the diagram
+ // above. (from RFC 3629)
+
+ // mask to extract lead byte's value ('x' bits above), by sequence
+ // length:
+ static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
+
+ // mask and value of lead byte's most significant bits, by length:
+ static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
+ static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
+
+ len--; // it's more convenient to work with 0-based length here
+
+ // extract the lead byte's value bits:
+ if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
+ break;
+
+ code = c & leadValueMask[len];
+
+ // all remaining bytes, if any, are handled in the same way
+ // regardless of sequence's length:
+ for ( ; len; --len )
+ {
+ c = *++p;
+ if ( (c & 0xC0) != 0x80 )
+ return wxCONV_FAILED;
+
+ code <<= 6;
+ code |= c & 0x3F;
+ }
+ }
+
+#ifdef WC_UTF16
+ // cast is ok because wchar_t == wxUint16 if WC_UTF16
+ if ( encode_utf16(code, (wxUint16 *)out) == 2 )
+ {
+ if ( out )
+ out++;
+ written++;
+ }
+#else // !WC_UTF16
+ if ( out )
+ *out = code;
+#endif // WC_UTF16/!WC_UTF16
+
+ if ( out )
+ out++;
+
+ written++;
+ }
+
+ return wxCONV_FAILED;
+}
View
2 src/Utf.h
@@ -31,4 +31,6 @@ size_t ConvertFromUTF8(const wxCharBuffer& utf8_buff, const wxMBConv& conv, wxWC
size_t ConvertFromUTF8toString(const wxCharBuffer& utf8_buff, size_t utf8_buff_len, wxString& text);
+size_t UTF8ToWChar(wchar_t *dst, size_t dstLen, const char *src, size_t srcLen);
+
#endif // __UTF_H__

0 comments on commit c5dcb04

Please sign in to comment.