From e668e6e00f2edca2d21758bd339a8c16628c2059 Mon Sep 17 00:00:00 2001 From: Mikhail Zabaluev Date: Wed, 17 Mar 2010 02:34:51 +0200 Subject: [PATCH] Made g_utf8_to_ucs4_fast() even faster https://bugzilla.gnome.org/show_bug.cgi?id=619435 --- glib/gutf8.c | 45 ++++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/glib/gutf8.c b/glib/gutf8.c index 98f470716..dd9872b55 100644 --- a/glib/gutf8.c +++ b/glib/gutf8.c @@ -851,7 +851,6 @@ g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written) { - gint j, charlen; gunichar *result; gint n_chars, i; const gchar *p; @@ -882,49 +881,37 @@ g_utf8_to_ucs4_fast (const gchar *str, p = str; for (i=0; i < n_chars; i++) { - gunichar wc = ((unsigned char *)p)[0]; + gunichar wc = (guchar)*p++; if (wc < 0x80) { result[i] = wc; - p++; } else { - if (wc < 0xe0) - { - charlen = 2; - wc &= 0x1f; - } - else if (wc < 0xf0) - { - charlen = 3; - wc &= 0x0f; - } - else if (wc < 0xf8) - { - charlen = 4; - wc &= 0x07; - } - else if (wc < 0xfc) - { - charlen = 5; - wc &= 0x03; - } - else + gunichar mask = 0x40; + + if (G_UNLIKELY ((wc & mask) == 0)) { - charlen = 6; - wc &= 0x01; + /* It's an out-of-sequence 10xxxxxxx byte. + * Rather than making an ugly hash of this and the next byte + * and overrunning the buffer, it's more useful to treat it + * with a replacement character */ + result[i] = 0xfffd; + continue; } - for (j = 1; j < charlen; j++) + do { wc <<= 6; - wc |= ((unsigned char *)p)[j] & 0x3f; + wc |= (guchar)(*p++) & 0x3f; + mask <<= 5; } + while((wc & mask) != 0); + + wc &= mask - 1; result[i] = wc; - p += charlen; } } result[i] = 0;