Much improved UTF8 support

* If we're not given a valid UTF8 code sequence then we don't lex the string as UTF8 (more compatible with old code)
* If we got non-ASCII chars and *then* a proper UTF8 code, convert all previous chars
* String.join/split fixed for UTF8
espruino · Jun 16, 2023 · aa64b17 · aa64b17
1 parent 1209324
commit aa64b17
Show file tree

Hide file tree

Showing 12 changed files with 265 additions and 36 deletions.
diff --git a/libs/graphics/jswrap_graphics.c b/libs/graphics/jswrap_graphics.c
@@ -2162,7 +2162,7 @@ void _jswrap_graphics_stringMetrics(JsGraphics *gfx, JsVar *var, int lineStartIn
   int fontHeight = _jswrap_graphics_getFontHeightInternal(gfx, &info);
   JsVar *str = jsvAsString(var);
   JsvStringIterator it;
-  jsvStringIteratorNew(&it, str, (lineStartIndex<0)?0:lineStartIndex);
+  jsvStringIteratorNewUTF8(&it, str, (lineStartIndex<0)?0:lineStartIndex);
   int width = 0;
   int height = fontHeight;
   int maxWidth = 0;
@@ -2285,7 +2285,7 @@ JsVar *jswrap_graphics_wrapString(JsVar *parent, JsVar *str, int maxWidth) {
   bool wasNewLine = false;
 
   JsvStringIterator it;
-  jsvStringIteratorNew(&it, str, 0);
+  jsvStringIteratorNewUTF8(&it, str, 0);
 
   while (jsvStringIteratorHasChar(&it) || endOfText) {
     int ch = jsvStringIteratorGetUTF8CharAndNext(&it);
@@ -2302,12 +2302,21 @@ JsVar *jswrap_graphics_wrapString(JsVar *parent, JsVar *str, int maxWidth) {
         lineWidth += wordWidth;
       } else { // doesn't fit one one line - move to new line
         lineWidth = wordWidth;
-        if (jsvGetStringLength(currentLine) || wasNewLine)
+        if (jsvGetStringLength(currentLine) || wasNewLine) {
+#ifdef ESPR_UNICODE_SUPPORT
+          if (jsvIsUTF8String(str))
+            currentLine = jsvNewUTF8StringAndUnLock(currentLine);
+#endif
           jsvArrayPush(lines, currentLine);
+        }
         jsvUnLock(currentLine);
         if (wordIdxAtMaxWidth) {
           // word is too long to fit on a line
           currentLine = jsvNewFromStringVar(str, wordStartIdx, wordIdxAtMaxWidth-(wordStartIdx+1));
+#ifdef ESPR_UNICODE_SUPPORT
+          if (jsvIsUTF8String(str))
+            currentLine = jsvNewUTF8StringAndUnLock(currentLine);
+#endif
           jsvArrayPushAndUnLock(lines, currentLine);
           wordStartIdx = wordIdxAtMaxWidth-1;
           lineWidth -= wordWidthAtMaxWidth;
@@ -2345,8 +2354,13 @@ JsVar *jswrap_graphics_wrapString(JsVar *parent, JsVar *str, int maxWidth) {
   }
   jsvStringIteratorFree(&it);
   // deal with final line
-  if (jsvGetStringLength(currentLine))
+  if (jsvGetStringLength(currentLine)) {
+#ifdef ESPR_UNICODE_SUPPORT
+    if (jsvIsUTF8String(str))
+      currentLine = jsvNewUTF8StringAndUnLock(currentLine);
+#endif
     jsvArrayPush(lines, currentLine);
+  }
   jsvUnLock2(str,currentLine);
    _jswrap_graphics_freeFontInfo(&info);
   return lines;
@@ -2450,7 +2464,7 @@ JsVar *jswrap_graphics_drawString(JsVar *parent, JsVar *var, int x, int y, bool
   JsVar *str = jsvAsString(var);
 #endif
   JsvStringIterator it;
-  jsvStringIteratorNew(&it, str, 0);
+  jsvStringIteratorNewUTF8(&it, str, 0);
   while (jsvStringIteratorHasChar(&it)) {
     int ch = jsvStringIteratorGetUTF8CharAndNext(&it);
     if (ch=='\n') {

diff --git a/src/jslex.c b/src/jslex.c
@@ -97,7 +97,7 @@ static JSLEX_INLINE void jslTokenAppendChar(char ch) {
 }
 
 // Check if a token matches (IGNORING FIRST CHAR)
-static bool jslCheckToken(const char *token, int tokenId) {
+static bool jslCheckToken(const char *token, short int tokenId) {
   int i;
   token--; // because we add 1 in for loop
   for (i=1;i<lex->tokenl;i++) {
@@ -281,6 +281,39 @@ static JSLEX_INLINE void jslSingleChar() {
   jslGetNextCh();
 }
 
+#ifdef ESPR_UNICODE_SUPPORT
+/* We've now parsed some of a String and we didn't think it was UTF8,
+but we hit a UTF8 character. For instance:
+
+"F\xF6n F\u00F6n" where '\xF6' wouldn't have made the string Unicode but '\u00F6' would
+
+We go back over the String parsed so far (lex->tokenValue) and copy it into a new String, re-encoding every char
+that jsUTF8IsStartChar accepts as a UTF8 sequence. 'it' is freed and re-created on the new String, which then replaces lex->tokenValue.
+If the new String can't be allocated, nothing is changed. */
+static void jslConvertTokenValueUTF8(JsvStringIterator *it) {
+  JsVar *utf8str = jsvNewFromEmptyString();
+  if (!utf8str) return; // allocation failed: 'it' and lex->tokenValue are left exactly as they were
+  jsvStringIteratorFree(it); // release the caller's iterator - it is re-created on utf8str below
+  JsvStringIterator src;
+  jsvStringIteratorNew(&src, lex->tokenValue, 0); // reads the String that was lexed so far
+  jsvStringIteratorNew(it, utf8str, 0); // the caller's iterator now refers to the new String
+  while (jsvStringIteratorHasChar(&src)) {
+    char ch = jsvStringIteratorGetCharAndNext(&src);
+    if (jsUTF8IsStartChar(ch)) {
+      // convert to a UTF8 sequence
+      char utf8[4];
+      unsigned int l = jsUTF8Encode((unsigned char)ch, utf8); // cast so the byte is passed as a value in 0..255
+      for (unsigned int i=0;i<l;i++)
+        jsvStringIteratorAppend(it, utf8[i]);
+    } else // normal ASCII
+      jsvStringIteratorAppend(it, ch);
+  }
+  jsvStringIteratorFree(&src);
+  jsvUnLock(lex->tokenValue); // drop our lock on the old String before swapping in the new one
+  lex->tokenValue = utf8str;
+}
+#endif
+
 static void jslLexString() {
   char delim = lex->currCh;
   JsvStringIterator it;
@@ -299,6 +332,7 @@ static void jslLexString() {
   char lastCh = delim;
   int nesting = 0;
 #ifdef ESPR_UNICODE_SUPPORT
+  bool hadCharsInUTF8Range = false;
   int high_surrogate = 0;
   lex->isUTF8 = false;
 #endif  // ESPR_UNICODE_SUPPORT
@@ -361,8 +395,11 @@ static void jslLexString() {
             jsExceptionHere(JSET_ERROR, "Unmatched Unicode surrogate");
           }
           len = jsUTF8Encode(codepoint, buf);
-          if (jsUTF8IsStartChar(buf[0]))
+          if (jsUTF8IsStartChar(buf[0])) {
+            if (!lex->isUTF8 && hadCharsInUTF8Range)
+              jslConvertTokenValueUTF8(&it);
             lex->isUTF8 = true;
+          }
           ch = buf[len-1]; // last char is in 'ch' as jsvStringIteratorAppend(..., ch) is called later on
           if (len>1) {
             n=0;
@@ -371,9 +408,13 @@ static void jslLexString() {
               jsvStringIteratorAppend(&it, c);
             }
           }
-        } else
+        } else { // !isUTF8
+          hadCharsInUTF8Range |= jsUTF8IsStartChar((char)codepoint);
+#else
+        {
 #endif
           ch = (char)codepoint;
+        }
       } break;
       default:
         if (lex->currCh>='0' && lex->currCh<='7') {
@@ -404,12 +445,38 @@ static void jslLexString() {
       break;
     } else {
 #ifdef ESPR_UNICODE_SUPPORT
-      if (jsUTF8IsStartChar(lex->currCh))
-        lex->isUTF8 = true;
+      if (jsUTF8IsStartChar(lex->currCh)) {
+        char buf[4];
+        buf[0] = lex->currCh;
+        bool isValidUTF8 = true;
+        unsigned int len = jsUTF8LengthFromChar(lex->currCh);
+        for (unsigned int i=1;i<len;i++) {
+          jslGetNextCh();
+          buf[i] = lex->currCh;
+          if ((lex->currCh&0xC0) != 0x80) {
+            // not a valid UTF8 sequence! We'll actually just carry
+            // on as we would if we were a non-UTF8 Espruino implementation
+            isValidUTF8 = false;
+            len = i+1;
+            break;
+          }
+        }
+        if (isValidUTF8) {
+          if (!lex->isUTF8 && hadCharsInUTF8Range)
+            jslConvertTokenValueUTF8(&it);
+          lex->isUTF8 = true;
+        } else
+          hadCharsInUTF8Range = true;
+        // copy data back in
+        for (unsigned int i=0;i<len-1;i++)
+            jsvStringIteratorAppend(&it, buf[i]);
+      }
 #endif
-      jsvStringIteratorAppend(&it, lex->currCh);
-      lastCh = lex->currCh;
-      jslGetNextCh();
+      {
+        jsvStringIteratorAppend(&it, lex->currCh);
+        lastCh = lex->currCh;
+        jslGetNextCh();
+      }
     }
 #ifdef ESPR_UNICODE_SUPPORT
     if (high_surrogate) {

diff --git a/src/jsutils.c b/src/jsutils.c
@@ -826,7 +826,7 @@ void vcbprintf(
         buf[1] = 0;
         if (jsvIsString(v)) {
           JsvStringIterator it;
-          jsvStringIteratorNew(&it, v, 0);
+          jsvStringIteratorNewUTF8(&it, v, 0);
           if (quoted) {
             int ch = jsvStringIteratorGetUTF8CharAndNext(&it);
             while (jsvStringIteratorHasChar(&it) || ch>=0) {

diff --git a/src/jsvar.c b/src/jsvar.c
@@ -1117,6 +1117,8 @@ JsVar *jsvNewUTF8StringAndUnLock(JsVar* dataString) {
 }
 #endif
 
+
+
 JsVar *jsvNewFromInteger(JsVarInt value) {
   JsVar *var = jsvNewWithFlags(JSV_INTEGER);
   if (!var) return 0; // no memory
@@ -1912,6 +1914,10 @@ JsVar *jsvNewFromStringVar(const JsVar *str, size_t stridx, size_t maxLength) {
   }
   JsVar *var = jsvNewFromEmptyString();
   if (var) jsvAppendStringVar(var, str, stridx, maxLength);
+#ifdef ESPR_UNICODE_SUPPORT
+  if (jsvIsUTF8String(str))
+    var = jsvNewUTF8StringAndUnLock(var);
+#endif
   return var;
 }
 
@@ -1924,7 +1930,7 @@ int jsvGetCharInString(JsVar *v, size_t idx) {
   if (!jsvIsString(v)) return 0;
 
   JsvStringIterator it;
-  jsvStringIteratorNew(&it, v, idx);
+  jsvStringIteratorNewUTF8(&it, v, idx);
   int ch = jsvStringIteratorGetUTF8CharAndNext(&it);
   jsvStringIteratorFree(&it);
   return ch;
@@ -1955,6 +1961,28 @@ int jsvGetStringIndexOf(JsVar *str, char ch) {
   return -1;
 }
 
+#ifdef ESPR_UNICODE_SUPPORT
+/// If we have a UTF8 string return the string behind it (its first child), or just return what was passed in (0 stays 0). NOTE(review): both paths take a lock, so presumably the caller must jsvUnLock the result - confirm
+JsVar *jsvGetUTF8BackingString(JsVar *str) {
+  if (!jsvIsUTF8String(str)) return str?jsvLockAgain(str):0; // not UTF8: same var, locked again (or 0 if none was given)
+  return jsvLock(jsvGetFirstChild(str)); // UTF8: lock and return the first child
+}
+
+/// Convert a UTF8 index in a String to a String index in the backing String. On non-UTF8 builds it passes straight through. NOTE(review): idx is handed to a size_t parameter, so a negative idx would turn into a very large start index - confirm callers never pass one
+int jsvConvertFromUTF8Index(JsVar *str, int idx) {
+  JsvStringIterator it;
+  jsvStringIteratorNewUTF8(&it, str, idx); // position the iterator using the UTF8 index
+  idx = jsvStringIteratorGetIndex(&it); // jsvStringIteratorGetIndex still reports back as non-UTF8
+  jsvStringIteratorFree(&it);
+  return idx;
+}
+#else
+/// Convert a UTF8 index in a String to a String index in the backing String. On non-UTF8 builds (this variant) it passes straight through and 'str' is not used
+int jsvConvertFromUTF8Index(JsVar *str, int idx) {
+  return idx; // no Unicode support compiled in: the index is returned unchanged
+}
+#endif
+
 /** Does this string contain only Numeric characters (with optional '-'/'+' at the front)? NOT '.'/'e' and similar (allowDecimalPoint is for '.' only) */
 bool jsvIsStringNumericInt(const JsVar *var, bool allowDecimalPoint) {
   assert(jsvIsString(var));
@@ -2515,8 +2543,8 @@ bool jsvIsStringIEqualAndUnLock(JsVar *var, const char *str) {
  *  */
 int jsvCompareString(JsVar *va, JsVar *vb, size_t starta, size_t startb, bool equalAtEndOfString) {
   JsvStringIterator ita, itb;
-  jsvStringIteratorNew(&ita, va, starta);
-  jsvStringIteratorNew(&itb, vb, startb);
+  jsvStringIteratorNewUTF8(&ita, va, starta);
+  jsvStringIteratorNewUTF8(&itb, vb, startb);
   // step to first positions
   while (true) {
     int ca = jsvStringIteratorGetUTF8CharAndNext(&ita);
@@ -2543,8 +2571,8 @@ JsVar *jsvGetCommonCharacters(JsVar *va, JsVar *vb) {
   JsVar *v = jsvNewFromEmptyString();
   if (!v) return 0;
   JsvStringIterator ita, itb;
-  jsvStringIteratorNew(&ita, va, 0);
-  jsvStringIteratorNew(&itb, vb, 0);
+  jsvStringIteratorNewUTF8(&ita, va, 0);
+  jsvStringIteratorNewUTF8(&itb, vb, 0);
   int ca = jsvStringIteratorGetUTF8CharAndNext(&ita);
   int cb = jsvStringIteratorGetUTF8CharAndNext(&itb);
   while (ca>0 && cb>0 && ca == cb) {
@@ -3491,6 +3519,9 @@ JsVar *jsvArrayJoin(JsVar *arr, JsVar *filler, bool ignoreNull) {
   if (!str) return 0; // out of memory
   assert(!filler || jsvIsString(filler));
 
+#ifdef ESPR_UNICODE_SUPPORT
+  bool wasUTF8 = false;
+#endif
   JsvIterator it;
   jsvIteratorNew(&it, arr, JSIF_EVERY_ARRAY_ELEMENT);
   JsvStringIterator itdst;
@@ -3508,6 +3539,9 @@ JsVar *jsvArrayJoin(JsVar *arr, JsVar *filler, bool ignoreNull) {
       if (value && (!ignoreNull || !jsvIsNull(value))) {
         JsVar *valueStr = jsvAsString(value);
         if (valueStr) { // could be out of memory
+#ifdef ESPR_UNICODE_SUPPORT
+          wasUTF8 |= jsvIsUTF8String(valueStr);
+#endif
           jsvStringIteratorAppendString(&itdst, valueStr, 0, JSVAPPENDSTRINGVAR_MAXLENGTH);
           jsvUnLock(valueStr);
         }
@@ -3519,6 +3553,10 @@ JsVar *jsvArrayJoin(JsVar *arr, JsVar *filler, bool ignoreNull) {
   }
   jsvIteratorFree(&it);
   jsvStringIteratorFree(&itdst);
+#ifdef ESPR_UNICODE_SUPPORT
+  if (wasUTF8)
+    str = jsvNewUTF8StringAndUnLock(str);
+#endif
   return str;
 }
 

diff --git a/src/jsvar.h b/src/jsvar.h
@@ -522,6 +522,13 @@ int jsvGetCharInString(JsVar *v, size_t idx); ///< Get a character at the given
 void jsvSetCharInString(JsVar *v, size_t idx, char ch, bool bitwiseOR); ///< Set a character at the given index in the String. If bitwiseOR, ch will be ORed with the character already at that position.
 int jsvGetStringIndexOf(JsVar *str, char ch); ///< Get the index of a character in a string, or -1
 
+#ifdef ESPR_UNICODE_SUPPORT
+/// If we have a UTF8 string return the string behind it, or just return what was passed in
+JsVar *jsvGetUTF8BackingString(JsVar *str);
+#endif
+/// Convert an UTF8 index in a String to a String index in the backing String. On non-UTF8 builds it passes straight through
+int jsvConvertFromUTF8Index(JsVar *str, int idx);
+
 JsVarInt jsvGetInteger(const JsVar *v);
 void jsvSetInteger(JsVar *v, JsVarInt value); ///< Set an integer value (use carefully!)
 JsVarFloat jsvGetFloat(const JsVar *v); ///< Get the floating point representation of this var

diff --git a/src/jsvariterator.c b/src/jsvariterator.c
@@ -257,8 +257,8 @@ void jsvStringIteratorNew(JsvStringIterator *it, JsVar *str, size_t startIdx) {
   assert(jsvHasCharacterData(str));
 #ifdef ESPR_UNICODE_SUPPORT
   it->isUTF8 = jsvIsUTF8String(str);
-  if (it->isUTF8) {
-    it->var =  jsvLock(jsvGetFirstChild(str));
+  if (it->isUTF8) { // if it's UTF8, skip the UTF8 tag and go straight to the data
+    it->var =  jsvGetUTF8BackingString(str);
     assert(jsvHasCharacterData(it->var));
   } else
 #endif
@@ -274,31 +274,33 @@ void jsvStringIteratorNew(JsvStringIterator *it, JsVar *str, size_t startIdx) {
   } else if (jsvIsFlashString(it->var)) {
     it->charsInVar = 0;
     it->charIdx = startIdx; // if it's not UTF8 we can just load up the bit we want immediately
-#ifdef ESPR_UNICODE_SUPPORT
-    if (it->isUTF8)
-      it->charIdx = 0; // otherwise we'll have to iterate below this
-    jsvStringIteratorLoadFlashString(it);
-    if (!it->isUTF8) return; // nothing else to do here if not UTF8
-#else
     return jsvStringIteratorLoadFlashString(it);
-#endif
 #endif
   } else{
     it->ptr = &it->var->varData.str[0];
   }
+  it->charIdx = startIdx;
+  jsvStringIteratorCatchUp(it);
+}
+
+void jsvStringIteratorNewUTF8(JsvStringIterator *it, JsVar *str, size_t startIdx) { // as jsvStringIteratorNew, but for a UTF8 String startIdx is reached by stepping startIdx times with jsvStringIteratorNextUTF8
 #ifdef ESPR_UNICODE_SUPPORT
+  jsvStringIteratorNew(it, str, 0); // always create at index 0 (this also sets it->isUTF8); startIdx is applied below
   if (it->isUTF8) {
     it->charIdx = 0;
     while (startIdx) {
       jsvStringIteratorNextUTF8(it);
       startIdx--;
     }
   } else
-#endif
-    it->charIdx = startIdx;
+  it->charIdx = startIdx; // despite the indentation this is the body of the 'else' above: non-UTF8 Strings jump straight to startIdx
   jsvStringIteratorCatchUp(it);
+#else
+ jsvStringIteratorNew(it, str, startIdx); // no Unicode support compiled in: just forwards to jsvStringIteratorNew
+#endif
 }
 
+
 void jsvStringIteratorClone(JsvStringIterator *dstit, JsvStringIterator *it) {
   *dstit = *it;
   if (dstit->var) {
@@ -746,7 +748,7 @@ void jsvIteratorNew(JsvIterator *it, JsVar *obj, JsvIteratorFlags flags) {
   } else if (jsvIsUTF8String(obj)) {
     it->type = JSVI_UNICODE;
     it->it.unicode.index = 0;
-    jsvStringIteratorNew(&it->it.unicode.str, jsvLock(jsvGetFirstChild(obj)), 0);
+    jsvStringIteratorNew(&it->it.unicode.str, jsvGetUTF8BackingString(obj), 0);
     jsvIteratorUTF8Next(it); // read a char as the current char
 #endif
   } else if (jsvHasCharacterData(obj)) {