Skip to content

Commit

Permalink
Much improved UTF8 support
Browse files Browse the repository at this point in the history
* If we're not given a valid UTF8 code sequence then we don't lex the string as UTF8 (more compatible with old code)
* If we got non-ASCII chars and *then* a proper UTF8 code, convert all previous non-ASCII chars in the string to UTF8
* Array.join/String.split fixed for UTF8
  • Loading branch information
gfwilliams committed Jun 16, 2023
1 parent 1209324 commit aa64b17
Show file tree
Hide file tree
Showing 12 changed files with 265 additions and 36 deletions.
24 changes: 19 additions & 5 deletions libs/graphics/jswrap_graphics.c
Original file line number Diff line number Diff line change
Expand Up @@ -2162,7 +2162,7 @@ void _jswrap_graphics_stringMetrics(JsGraphics *gfx, JsVar *var, int lineStartIn
int fontHeight = _jswrap_graphics_getFontHeightInternal(gfx, &info);
JsVar *str = jsvAsString(var);
JsvStringIterator it;
jsvStringIteratorNew(&it, str, (lineStartIndex<0)?0:lineStartIndex);
jsvStringIteratorNewUTF8(&it, str, (lineStartIndex<0)?0:lineStartIndex);
int width = 0;
int height = fontHeight;
int maxWidth = 0;
Expand Down Expand Up @@ -2285,7 +2285,7 @@ JsVar *jswrap_graphics_wrapString(JsVar *parent, JsVar *str, int maxWidth) {
bool wasNewLine = false;

JsvStringIterator it;
jsvStringIteratorNew(&it, str, 0);
jsvStringIteratorNewUTF8(&it, str, 0);

while (jsvStringIteratorHasChar(&it) || endOfText) {
int ch = jsvStringIteratorGetUTF8CharAndNext(&it);
Expand All @@ -2302,12 +2302,21 @@ JsVar *jswrap_graphics_wrapString(JsVar *parent, JsVar *str, int maxWidth) {
lineWidth += wordWidth;
} else { // doesn't fit one one line - move to new line
lineWidth = wordWidth;
if (jsvGetStringLength(currentLine) || wasNewLine)
if (jsvGetStringLength(currentLine) || wasNewLine) {
#ifdef ESPR_UNICODE_SUPPORT
if (jsvIsUTF8String(str))
currentLine = jsvNewUTF8StringAndUnLock(currentLine);
#endif
jsvArrayPush(lines, currentLine);
}
jsvUnLock(currentLine);
if (wordIdxAtMaxWidth) {
// word is too long to fit on a line
currentLine = jsvNewFromStringVar(str, wordStartIdx, wordIdxAtMaxWidth-(wordStartIdx+1));
#ifdef ESPR_UNICODE_SUPPORT
if (jsvIsUTF8String(str))
currentLine = jsvNewUTF8StringAndUnLock(currentLine);
#endif
jsvArrayPushAndUnLock(lines, currentLine);
wordStartIdx = wordIdxAtMaxWidth-1;
lineWidth -= wordWidthAtMaxWidth;
Expand Down Expand Up @@ -2345,8 +2354,13 @@ JsVar *jswrap_graphics_wrapString(JsVar *parent, JsVar *str, int maxWidth) {
}
jsvStringIteratorFree(&it);
// deal with final line
if (jsvGetStringLength(currentLine))
if (jsvGetStringLength(currentLine)) {
#ifdef ESPR_UNICODE_SUPPORT
if (jsvIsUTF8String(str))
currentLine = jsvNewUTF8StringAndUnLock(currentLine);
#endif
jsvArrayPush(lines, currentLine);
}
jsvUnLock2(str,currentLine);
_jswrap_graphics_freeFontInfo(&info);
return lines;
Expand Down Expand Up @@ -2450,7 +2464,7 @@ JsVar *jswrap_graphics_drawString(JsVar *parent, JsVar *var, int x, int y, bool
JsVar *str = jsvAsString(var);
#endif
JsvStringIterator it;
jsvStringIteratorNew(&it, str, 0);
jsvStringIteratorNewUTF8(&it, str, 0);
while (jsvStringIteratorHasChar(&it)) {
int ch = jsvStringIteratorGetUTF8CharAndNext(&it);
if (ch=='\n') {
Expand Down
83 changes: 75 additions & 8 deletions src/jslex.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ static JSLEX_INLINE void jslTokenAppendChar(char ch) {
}

// Check if a token matches (IGNORING FIRST CHAR)
static bool jslCheckToken(const char *token, int tokenId) {
static bool jslCheckToken(const char *token, short int tokenId) {
int i;
token--; // because we add 1 in for loop
for (i=1;i<lex->tokenl;i++) {
Expand Down Expand Up @@ -281,6 +281,39 @@ static JSLEX_INLINE void jslSingleChar() {
jslGetNextCh();
}

#ifdef ESPR_UNICODE_SUPPORT
/* Called part-way through lexing a String that has NOT been treated as UTF8
 so far, at the point where a genuine UTF8 character turns up. For instance:
   "F\xF6n F\u00F6n"
 where '\xF6' wouldn't have made the string Unicode but '\u00F6' would.

 Everything lexed so far (lex->tokenValue) is copied into a fresh String:
 each char that jsUTF8IsStartChar() accepts is re-encoded with jsUTF8Encode(),
 every other char is copied across unchanged. `it` is freed and re-created on
 the new String so the caller can carry on appending to it, and
 lex->tokenValue is replaced by the new String.

 If the new String can't be created, nothing is touched.
*/
static void jslConvertTokenValueUTF8(JsvStringIterator *it) {
  JsVar *converted = jsvNewFromEmptyString();
  if (!converted) return; // no String available - leave token and iterator as they were
  // the caller's iterator is still attached to the old token value - release it first
  jsvStringIteratorFree(it);
  JsvStringIterator oldIt;
  jsvStringIteratorNew(&oldIt, lex->tokenValue, 0);
  jsvStringIteratorNew(it, converted, 0);
  while (jsvStringIteratorHasChar(&oldIt)) {
    char ch = jsvStringIteratorGetCharAndNext(&oldIt);
    if (!jsUTF8IsStartChar(ch)) {
      // nothing to re-encode - copy straight across
      jsvStringIteratorAppend(it, ch);
      continue;
    }
    // expand the single byte into its UTF8 sequence
    char encoded[4];
    unsigned int encodedLen = jsUTF8Encode((unsigned char)ch, encoded);
    unsigned int n = 0;
    while (n < encodedLen) {
      jsvStringIteratorAppend(it, encoded[n]);
      n++;
    }
  }
  jsvStringIteratorFree(&oldIt);
  // swap the old token value for the converted one
  jsvUnLock(lex->tokenValue);
  lex->tokenValue = converted;
}
#endif

static void jslLexString() {
char delim = lex->currCh;
JsvStringIterator it;
Expand All @@ -299,6 +332,7 @@ static void jslLexString() {
char lastCh = delim;
int nesting = 0;
#ifdef ESPR_UNICODE_SUPPORT
bool hadCharsInUTF8Range = false;
int high_surrogate = 0;
lex->isUTF8 = false;
#endif // ESPR_UNICODE_SUPPORT
Expand Down Expand Up @@ -361,8 +395,11 @@ static void jslLexString() {
jsExceptionHere(JSET_ERROR, "Unmatched Unicode surrogate");
}
len = jsUTF8Encode(codepoint, buf);
if (jsUTF8IsStartChar(buf[0]))
if (jsUTF8IsStartChar(buf[0])) {
if (!lex->isUTF8 && hadCharsInUTF8Range)
jslConvertTokenValueUTF8(&it);
lex->isUTF8 = true;
}
ch = buf[len-1]; // last char is in 'ch' as jsvStringIteratorAppend(..., ch) is called later on
if (len>1) {
n=0;
Expand All @@ -371,9 +408,13 @@ static void jslLexString() {
jsvStringIteratorAppend(&it, c);
}
}
} else
} else { // !isUTF8
hadCharsInUTF8Range |= jsUTF8IsStartChar((char)codepoint);
#else
{
#endif
ch = (char)codepoint;
}
} break;
default:
if (lex->currCh>='0' && lex->currCh<='7') {
Expand Down Expand Up @@ -404,12 +445,38 @@ static void jslLexString() {
break;
} else {
#ifdef ESPR_UNICODE_SUPPORT
if (jsUTF8IsStartChar(lex->currCh))
lex->isUTF8 = true;
if (jsUTF8IsStartChar(lex->currCh)) {
char buf[4];
buf[0] = lex->currCh;
bool isValidUTF8 = true;
unsigned int len = jsUTF8LengthFromChar(lex->currCh);
for (unsigned int i=1;i<len;i++) {
jslGetNextCh();
buf[i] = lex->currCh;
if ((lex->currCh&0xC0) != 0x80) {
// not a valid UTF8 sequence! We'll actually just carry
// on as we would if we were a non-UTF8 Espruino implementation
isValidUTF8 = false;
len = i+1;
break;
}
}
if (isValidUTF8) {
if (!lex->isUTF8 && hadCharsInUTF8Range)
jslConvertTokenValueUTF8(&it);
lex->isUTF8 = true;
} else
hadCharsInUTF8Range = true;
// copy data back in
for (unsigned int i=0;i<len-1;i++)
jsvStringIteratorAppend(&it, buf[i]);
}
#endif
jsvStringIteratorAppend(&it, lex->currCh);
lastCh = lex->currCh;
jslGetNextCh();
{
jsvStringIteratorAppend(&it, lex->currCh);
lastCh = lex->currCh;
jslGetNextCh();
}
}
#ifdef ESPR_UNICODE_SUPPORT
if (high_surrogate) {
Expand Down
2 changes: 1 addition & 1 deletion src/jsutils.c
Original file line number Diff line number Diff line change
Expand Up @@ -826,7 +826,7 @@ void vcbprintf(
buf[1] = 0;
if (jsvIsString(v)) {
JsvStringIterator it;
jsvStringIteratorNew(&it, v, 0);
jsvStringIteratorNewUTF8(&it, v, 0);
if (quoted) {
int ch = jsvStringIteratorGetUTF8CharAndNext(&it);
while (jsvStringIteratorHasChar(&it) || ch>=0) {
Expand Down
48 changes: 43 additions & 5 deletions src/jsvar.c
Original file line number Diff line number Diff line change
Expand Up @@ -1117,6 +1117,8 @@ JsVar *jsvNewUTF8StringAndUnLock(JsVar* dataString) {
}
#endif



JsVar *jsvNewFromInteger(JsVarInt value) {
JsVar *var = jsvNewWithFlags(JSV_INTEGER);
if (!var) return 0; // no memory
Expand Down Expand Up @@ -1912,6 +1914,10 @@ JsVar *jsvNewFromStringVar(const JsVar *str, size_t stridx, size_t maxLength) {
}
JsVar *var = jsvNewFromEmptyString();
if (var) jsvAppendStringVar(var, str, stridx, maxLength);
#ifdef ESPR_UNICODE_SUPPORT
if (jsvIsUTF8String(str))
var = jsvNewUTF8StringAndUnLock(var);
#endif
return var;
}

Expand All @@ -1924,7 +1930,7 @@ int jsvGetCharInString(JsVar *v, size_t idx) {
if (!jsvIsString(v)) return 0;

JsvStringIterator it;
jsvStringIteratorNew(&it, v, idx);
jsvStringIteratorNewUTF8(&it, v, idx);
int ch = jsvStringIteratorGetUTF8CharAndNext(&it);
jsvStringIteratorFree(&it);
return ch;
Expand Down Expand Up @@ -1955,6 +1961,28 @@ int jsvGetStringIndexOf(JsVar *str, char ch) {
return -1;
}

#ifdef ESPR_UNICODE_SUPPORT
/// For a UTF8 String, return a lock on the String behind it (its first child).
/// For anything else, return another lock on `str` itself, or 0 if `str` is 0.
JsVar *jsvGetUTF8BackingString(JsVar *str) {
  if (jsvIsUTF8String(str))
    return jsvLock(jsvGetFirstChild(str));
  if (!str) return 0;
  return jsvLockAgain(str);
}

/// Convert a UTF8 index in a String to a String index in the backing String. On non-UTF8 builds it passes straight through
/// NOTE(review): idx is converted to size_t when handed to jsvStringIteratorNewUTF8, so a
/// negative idx would become a very large start index - confirm callers never pass one.
int jsvConvertFromUTF8Index(JsVar *str, int idx) {
  JsvStringIterator utf8It;
  // start an iterator at the requested UTF8 index...
  jsvStringIteratorNewUTF8(&utf8It, str, idx);
  // ...then read its position back: jsvStringIteratorGetIndex still reports back as non-UTF8
  int backingIdx = jsvStringIteratorGetIndex(&utf8It);
  jsvStringIteratorFree(&utf8It);
  return backingIdx;
}
#else
/// Convert a UTF8 index in a String to a String index in the backing String.
/// This is the non-UTF8 build, so the index is handed back exactly as given.
int jsvConvertFromUTF8Index(JsVar *str, int idx) {
  (void)str; // not needed here - the index passes straight through
  return idx;
}
#endif

/** Does this string contain only Numeric characters (with optional '-'/'+' at the front)? NOT '.'/'e' and similar (allowDecimalPoint is for '.' only) */
bool jsvIsStringNumericInt(const JsVar *var, bool allowDecimalPoint) {
assert(jsvIsString(var));
Expand Down Expand Up @@ -2515,8 +2543,8 @@ bool jsvIsStringIEqualAndUnLock(JsVar *var, const char *str) {
* */
int jsvCompareString(JsVar *va, JsVar *vb, size_t starta, size_t startb, bool equalAtEndOfString) {
JsvStringIterator ita, itb;
jsvStringIteratorNew(&ita, va, starta);
jsvStringIteratorNew(&itb, vb, startb);
jsvStringIteratorNewUTF8(&ita, va, starta);
jsvStringIteratorNewUTF8(&itb, vb, startb);
// step to first positions
while (true) {
int ca = jsvStringIteratorGetUTF8CharAndNext(&ita);
Expand All @@ -2543,8 +2571,8 @@ JsVar *jsvGetCommonCharacters(JsVar *va, JsVar *vb) {
JsVar *v = jsvNewFromEmptyString();
if (!v) return 0;
JsvStringIterator ita, itb;
jsvStringIteratorNew(&ita, va, 0);
jsvStringIteratorNew(&itb, vb, 0);
jsvStringIteratorNewUTF8(&ita, va, 0);
jsvStringIteratorNewUTF8(&itb, vb, 0);
int ca = jsvStringIteratorGetUTF8CharAndNext(&ita);
int cb = jsvStringIteratorGetUTF8CharAndNext(&itb);
while (ca>0 && cb>0 && ca == cb) {
Expand Down Expand Up @@ -3491,6 +3519,9 @@ JsVar *jsvArrayJoin(JsVar *arr, JsVar *filler, bool ignoreNull) {
if (!str) return 0; // out of memory
assert(!filler || jsvIsString(filler));

#ifdef ESPR_UNICODE_SUPPORT
bool wasUTF8 = false;
#endif
JsvIterator it;
jsvIteratorNew(&it, arr, JSIF_EVERY_ARRAY_ELEMENT);
JsvStringIterator itdst;
Expand All @@ -3508,6 +3539,9 @@ JsVar *jsvArrayJoin(JsVar *arr, JsVar *filler, bool ignoreNull) {
if (value && (!ignoreNull || !jsvIsNull(value))) {
JsVar *valueStr = jsvAsString(value);
if (valueStr) { // could be out of memory
#ifdef ESPR_UNICODE_SUPPORT
wasUTF8 |= jsvIsUTF8String(valueStr);
#endif
jsvStringIteratorAppendString(&itdst, valueStr, 0, JSVAPPENDSTRINGVAR_MAXLENGTH);
jsvUnLock(valueStr);
}
Expand All @@ -3519,6 +3553,10 @@ JsVar *jsvArrayJoin(JsVar *arr, JsVar *filler, bool ignoreNull) {
}
jsvIteratorFree(&it);
jsvStringIteratorFree(&itdst);
#ifdef ESPR_UNICODE_SUPPORT
if (wasUTF8)
str = jsvNewUTF8StringAndUnLock(str);
#endif
return str;
}

Expand Down
7 changes: 7 additions & 0 deletions src/jsvar.h
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,13 @@ int jsvGetCharInString(JsVar *v, size_t idx); ///< Get a character at the given
void jsvSetCharInString(JsVar *v, size_t idx, char ch, bool bitwiseOR); ///< Set a character at the given index in the String. If bitwiseOR, ch will be ORed with the character already at that position.
int jsvGetStringIndexOf(JsVar *str, char ch); ///< Get the index of a character in a string, or -1

#ifdef ESPR_UNICODE_SUPPORT
/// If `str` is a UTF8 String, return the String behind it (its first child), otherwise return `str` itself (or 0 if `str` is 0).
/// The returned var has had a lock taken on it - presumably the caller must jsvUnLock it when done (confirm against callers).
JsVar *jsvGetUTF8BackingString(JsVar *str);
#endif
/// Convert a UTF8 index in a String to a String index in the backing String. On non-UTF8 builds `idx` passes straight through.
int jsvConvertFromUTF8Index(JsVar *str, int idx);

JsVarInt jsvGetInteger(const JsVar *v);
void jsvSetInteger(JsVar *v, JsVarInt value); ///< Set an integer value (use carefully!)
JsVarFloat jsvGetFloat(const JsVar *v); ///< Get the floating point representation of this var
Expand Down
26 changes: 14 additions & 12 deletions src/jsvariterator.c
Original file line number Diff line number Diff line change
Expand Up @@ -257,8 +257,8 @@ void jsvStringIteratorNew(JsvStringIterator *it, JsVar *str, size_t startIdx) {
assert(jsvHasCharacterData(str));
#ifdef ESPR_UNICODE_SUPPORT
it->isUTF8 = jsvIsUTF8String(str);
if (it->isUTF8) {
it->var = jsvLock(jsvGetFirstChild(str));
if (it->isUTF8) { // if it's UTF8, skip the UTF8 tag and go straight to the data
it->var = jsvGetUTF8BackingString(str);
assert(jsvHasCharacterData(it->var));
} else
#endif
Expand All @@ -274,31 +274,33 @@ void jsvStringIteratorNew(JsvStringIterator *it, JsVar *str, size_t startIdx) {
} else if (jsvIsFlashString(it->var)) {
it->charsInVar = 0;
it->charIdx = startIdx; // if it's not UTF8 we can just load up the bit we want immediately
#ifdef ESPR_UNICODE_SUPPORT
if (it->isUTF8)
it->charIdx = 0; // otherwise we'll have to iterate below this
jsvStringIteratorLoadFlashString(it);
if (!it->isUTF8) return; // nothing else to do here if not UTF8
#else
return jsvStringIteratorLoadFlashString(it);
#endif
#endif
} else{
it->ptr = &it->var->varData.str[0];
}
it->charIdx = startIdx;
jsvStringIteratorCatchUp(it);
}

void jsvStringIteratorNewUTF8(JsvStringIterator *it, JsVar *str, size_t startIdx) {
#ifdef ESPR_UNICODE_SUPPORT
jsvStringIteratorNew(it, str, 0);
if (it->isUTF8) {
it->charIdx = 0;
while (startIdx) {
jsvStringIteratorNextUTF8(it);
startIdx--;
}
} else
#endif
it->charIdx = startIdx;
it->charIdx = startIdx;
jsvStringIteratorCatchUp(it);
#else
jsvStringIteratorNew(it, str, startIdx);
#endif
}


void jsvStringIteratorClone(JsvStringIterator *dstit, JsvStringIterator *it) {
*dstit = *it;
if (dstit->var) {
Expand Down Expand Up @@ -746,7 +748,7 @@ void jsvIteratorNew(JsvIterator *it, JsVar *obj, JsvIteratorFlags flags) {
} else if (jsvIsUTF8String(obj)) {
it->type = JSVI_UNICODE;
it->it.unicode.index = 0;
jsvStringIteratorNew(&it->it.unicode.str, jsvLock(jsvGetFirstChild(obj)), 0);
jsvStringIteratorNew(&it->it.unicode.str, jsvGetUTF8BackingString(obj), 0);
jsvIteratorUTF8Next(it); // read a char as the current char
#endif
} else if (jsvHasCharacterData(obj)) {
Expand Down
Loading

0 comments on commit aa64b17

Please sign in to comment.