Skip to content

Commit

Permalink
Merge pull request #8 from pixelglow/utf8-codes-up-to-4-bytes
Browse files (browse the repository at this point in the history)
UTF-8 codes up to 4 bytes
  • Loading branch information
ellson committed Dec 6, 2013
2 parents 34fde71 + b755721 commit 970d387
Showing 1 changed file with 37 additions and 40 deletions.
77 changes: 37 additions & 40 deletions lib/common/utils.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -1410,6 +1410,9 @@ char* htmlEntityUTF8 (char* s, graph_t* g)
unsigned int v;
int ignored;

int uc;
int ui;

NOTUSED(ignored);

if (lastg != g) {
Expand All @@ -1420,14 +1423,30 @@ char* htmlEntityUTF8 (char* s, graph_t* g)
agxbinit(&xb, BUFSIZ, buf);

while ((c = *(unsigned char*)s++)) {
if (c < 0xC0) {
if (c < 0xC0)
/*
* Handles properly formed UTF-8 characters between
* 0x01 and 0x7F. Also treats \0 and naked trail
* bytes 0x80 to 0xBF as valid characters representing
* themselves.
*/
if (c == '&') {
uc = 0;
else if (c < 0xE0)
uc = 1;
else if (c < 0xF0)
uc = 2;
else if (c < 0xF8)
uc = 3;
else {
uc = -1;
if (!warned) {
agerr(AGWARN, "UTF8 codes > 4 bytes are not currently supported (graph %s) - treated as Latin-1. Perhaps \"-Gcharset=latin1\" is needed?\n", agnameof(g));
warned = 1;
}
c = cvtAndAppend (c, &xb);
}

if (uc == 0 && c == '&') {
/* replace html entity sequences like: &amp;
* and: &#123; with their UTF8 equivalents */
v = htmlEntity (&s);
Expand All @@ -1443,45 +1462,23 @@ char* htmlEntityUTF8 (char* s, graph_t* g)
ignored = agxbputc(&xb, ((v >> 6) & 0x3F) | 0x80);
c = (v & 0x3F) | 0x80;
}
}
}
}
else if (c < 0xE0) { /* copy 2 byte UTF8 characters */
if ((*s & 0xC0) == 0x80) {
ignored = agxbputc(&xb, c);
c = *(unsigned char*)s++;
}
else {
if (!warned) {
agerr(AGWARN, "Invalid 2-byte UTF8 found in input of graph %s - treated as Latin-1. Perhaps \"-Gcharset=latin1\" is needed?\n", agnameof(g));
warned = 1;
}
c = cvtAndAppend (c, &xb);
}
}
else if (c < 0xF0) { /* copy 3 byte UTF8 characters */
if (((*s & 0xC0) == 0x80) && ((s[1] & 0xC0) == 0x80)) {
ignored = agxbputc(&xb, c);
c = *(unsigned char*)s++;
ignored = agxbputc(&xb, c);
c = *(unsigned char*)s++;
}
else {
if (!warned) {
agerr(AGWARN, "Invalid 3-byte UTF8 found in input of graph %s - treated as Latin-1. Perhaps \"-Gcharset=latin1\" is needed?\n", agnameof(g));
warned = 1;
}
c = cvtAndAppend (c, &xb);
}
}
else {
if (!warned) {
agerr(AGWARN, "UTF8 codes > 3 bytes are not currently supported (graph %s) - treated as Latin-1. Perhaps \"-Gcharset=latin1\" is needed?\n", agnameof(g));
warned = 1;
}
c = cvtAndAppend (c, &xb);
}
}
ignored = agxbputc(&xb, c);
else /* copy n byte UTF8 characters */
for (ui = 0; ui < uc; ++ui)
if ((*s & 0xC0) == 0x80) {
ignored = agxbputc(&xb, c);
c = *(unsigned char*)s++;
}
else {
if (!warned) {
agerr(AGWARN, "Invalid %d-byte UTF8 found in input of graph %s - treated as Latin-1. Perhaps \"-Gcharset=latin1\" is needed?\n", uc + 1, agnameof(g));
warned = 1;
}
c = cvtAndAppend (c, &xb);
break;
}
ignored = agxbputc(&xb, c);
}
ns = strdup (agxbuse(&xb));
agxbfree(&xb);
Expand Down

0 comments on commit 970d387

Please sign in to comment.