code · pull · May 31, 2026 · May 31, 2026
diff --git a/packages/lexical-playground/__tests__/unit/AutocompleteExtensionHelpers.test.ts b/packages/lexical-playground/__tests__/unit/AutocompleteExtensionHelpers.test.ts
@@ -24,6 +24,22 @@ describe('detectLanguage', () => {
     {expected: 'ja', input: 'あり', label: 'hiragana'},
     {expected: 'ja', input: 'カタカナ', label: 'katakana'},
     {expected: 'ja', input: '漢字', label: 'CJK Unified ideograph'},
+    {expected: 'ja', input: '㐀', label: 'CJK Extension A (BMP)'},
+    {
+      expected: 'ja',
+      input: '\u{20000}',
+      label: 'supplementary-plane CJK ideograph (Ext B)',
+    },
+    {
+      expected: 'ja',
+      input: 'foo \u{20000}',
+      label: 'trailing supplementary-plane ideograph after ascii',
+    },
+    {
+      expected: 'en',
+      input: '\u{1F642}',
+      label: 'astral emoji is not a CJK/Hangul script',
+    },
     {
       expected: 'en',
       input: '안녕 hi',
@@ -146,4 +162,125 @@ describe('createWordlistDictionary', () => {
     const dict = createWordlistDictionary(words, {minPrefixLength: 3});
     expect(dict.minPrefixLength).toBe(3);
   });
+
+  // The list is a priority order (most-common / highest-ranked first), so
+  // `query` must return the *earliest-listed* completion for a prefix —
+  // never merely the shortest one that happens to share it.
+  test('returns the earliest-listed match, not the shortest, for nested words', () => {
+    // 'conditions' precedes the shorter 'condition' it contains; the
+    // earlier entry must still win.
+    const dict = createWordlistDictionary(['conditions', 'condition'], {
+      minPrefixLength: 4,
+    });
+    expect(dict.query('cond')).toBe('itions');
+  });
+
+  test('a sorted nested pair resolves to the earlier (shorter) entry', () => {
+    const dict = createWordlistDictionary(['condition', 'conditions'], {
+      minPrefixLength: 4,
+    });
+    expect(dict.query('cond')).toBe('ition');
+  });
+
+  test('picks the highest-priority of several matches at each prefix length', () => {
+    const dict = createWordlistDictionary(['apply', 'apple', 'application']);
+    expect(dict.query('app')).toBe('ly');
+    expect(dict.query('appl')).toBe('y');
+  });
+
+  test('a word equal to the prefix is never its own suggestion', () => {
+    // 'form' leaves no suffix, so the longer 'formal' answers instead.
+    const dict = createWordlistDictionary(['form', 'formal']);
+    expect(dict.query('form')).toBe('al');
+  });
+
+  test('returns null when the only match equals the prefix', () => {
+    const dict = createWordlistDictionary(['form']);
+    expect(dict.query('form')).toBeNull();
+  });
+
+  test('returns null for a prefix longer than every entry', () => {
+    const dict = createWordlistDictionary(['test']);
+    expect(dict.query('testing')).toBeNull();
+  });
+
+  test('an empty wordlist yields no suggestions', () => {
+    const dict = createWordlistDictionary([]);
+    expect(dict.query('anything')).toBeNull();
+  });
+
+  test('builds and queries multi-byte (Hangul) words', () => {
+    const dict = createWordlistDictionary(['사용', '사용법', '사용자']);
+    expect(dict.query('사용')).toBe('법'); // 사용법 is listed first
+    expect(dict.query('사')).toBeNull(); // below the default minPrefixLength
+  });
+
+  test('matches case-insensitively while preserving the source casing in the suffix', () => {
+    const dict = createWordlistDictionary(['JavaScript']);
+    expect(dict.query('java')).toBe('Script');
+    expect(dict.query('JAVA')).toBe('Script');
+  });
+
+  test('minPrefixLength shortens the set of queryable prefixes', () => {
+    const dict = createWordlistDictionary(['testing'], {minPrefixLength: 4});
+    expect(dict.query('tes')).toBeNull();
+    expect(dict.query('test')).toBe('ing');
+  });
+
+  test('duplicate entries are handled and the first occurrence wins', () => {
+    const dict = createWordlistDictionary(['testing', 'testing', 'tester']);
+    expect(dict.query('test')).toBe('ing');
+  });
+});
+
+describe('createWordlistDictionary — Korean (non-ASCII) wordlist', () => {
+  // A small slice of real multi-syllable Korean nouns. Several share the
+  // '사용' ("use") and '학' ("study / school") stems, so prefix lookups,
+  // priority order, and minPrefixLength gating are all exercised on
+  // Hangul rather than ASCII. Hangul syllables are single UTF-16 units,
+  // so a two-syllable prefix has length 2.
+  const koreanWords = [
+    '사용', // use
+    '사용법', // instructions
+    '사용자', // user
+    '학교', // school
+    '학생', // student
+    '학생회', // student council
+  ];
+
+  test('completes a Hangul prefix to its earliest-listed longer word', () => {
+    const dict = createWordlistDictionary(koreanWords);
+    expect(dict.query('사용')).toBe('법'); // 사용법 precedes 사용자
+    expect(dict.query('학생')).toBe('회'); // 학생회 is the only longer 학생*
+  });
+
+  test('respects the default minPrefixLength of 2 on Hangul', () => {
+    const dict = createWordlistDictionary(koreanWords);
+    expect(dict.query('사')).toBeNull(); // single syllable, below the minimum
+    expect(dict.query('학')).toBeNull();
+  });
+
+  test('a complete word with no longer entry yields no suggestion', () => {
+    const dict = createWordlistDictionary(koreanWords);
+    expect(dict.query('학교')).toBeNull(); // 학교 is a leaf
+  });
+
+  test('returns null when no entry shares the Hangul prefix', () => {
+    const dict = createWordlistDictionary(koreanWords);
+    expect(dict.query('컴퓨')).toBeNull(); // 컴퓨터 (computer) is not in the list
+  });
+
+  test('honours the priority order among words sharing a stem', () => {
+    // Re-ordered so 사용자 outranks 사용법; the earliest-listed wins.
+    const dict = createWordlistDictionary(['사용자', '사용법', '사용권']);
+    expect(dict.query('사용')).toBe('자');
+  });
+
+  test('a custom minPrefixLength gates shorter Hangul prefixes', () => {
+    const dict = createWordlistDictionary(['학생회', '학생회장'], {
+      minPrefixLength: 3,
+    });
+    expect(dict.query('학생')).toBeNull(); // 2 syllables < 3
+    expect(dict.query('학생회')).toBe('장'); // 3-syllable prefix completes
+  });
 });
diff --git a/packages/lexical-playground/src/plugins/AutocompleteExtension/detectLanguage.ts b/packages/lexical-playground/src/plugins/AutocompleteExtension/detectLanguage.ts
@@ -7,7 +7,8 @@
  */
 
 // Script-range helpers. Add a new entry when introducing a new language;
-// the BMP ranges below cover the common ones for IME-using languages.
+// the ranges below cover the common scripts for IME-using languages,
+// including the supplementary-plane CJK ideographs.
 
 function isHangul(cp: number): boolean {
   return (
@@ -26,8 +27,13 @@ function isJapaneseKana(cp: number): boolean {
   );
 }
 
-function isCJKUnified(cp: number): boolean {
-  return cp >= 0x4e00 && cp <= 0x9fff;
+function isCJKIdeograph(cp: number): boolean {
+  return (
+    (cp >= 0x3400 && cp <= 0x4dbf) || // CJK Unified Ideographs Extension A
+    (cp >= 0x4e00 && cp <= 0x9fff) || // CJK Unified Ideographs
+    (cp >= 0xf900 && cp <= 0xfaff) || // CJK Compatibility Ideographs
+    (cp >= 0x20000 && cp <= 0x2fa1f) // Supplementary Ideographic Plane (Ext B–F + Compatibility Supplement)
+  );
 }
 
 function isZeroWidthOrControl(cp: number): boolean {
@@ -50,26 +56,31 @@ function isZeroWidthOrControl(cp: number): boolean {
  *
  * Returns `'en'` for ASCII / Latin (the catch-all), `'ko'` for Hangul,
  * and `'ja'` for both kana-bearing prefixes and prefixes whose last
- * visible codepoint is a CJK Unified Ideograph. The CJK Unified range
- * is shared between Japanese kanji and Chinese hanzi; the default
+ * visible codepoint is a CJK ideograph (CJK Unified, Extension A,
+ * Compatibility Ideographs, or the supplementary plane). Those ranges
+ * are shared between Japanese kanji and Chinese hanzi; the default
  * dictionary set covers English and Korean only, so kana and CJK
- * Unified prefixes produce no suggestion until a host registers a
+ * ideograph prefixes produce no suggestion until a host registers a
  * dictionary under the `ja` (or `zh`) key. Japanese is omitted from
  * the defaults because the platform IMEs already provide their own
  * dropdown autocompletion. Hosts that need to distinguish Chinese
  * from Japanese should pass a custom `detectLanguage` that uses
  * application context (locale, user preference).
  */
 export function detectLanguage(text: string): string {
-  for (let i = text.length - 1; i >= 0; i--) {
-    const cp = text.codePointAt(i);
+  // Iterate real code points from the end so a trailing supplementary-
+  // plane ideograph is read whole rather than as its lone low surrogate,
+  // skipping zero-width and control characters.
+  const codePoints = Array.from(text);
+  for (let i = codePoints.length - 1; i >= 0; i--) {
+    const cp = codePoints[i].codePointAt(0);
     if (cp === undefined || isZeroWidthOrControl(cp)) {
       continue;
     }
     if (isHangul(cp)) {
       return 'ko';
     }
-    if (isJapaneseKana(cp) || isCJKUnified(cp)) {
+    if (isJapaneseKana(cp) || isCJKIdeograph(cp)) {
       return 'ja';
     }
     return 'en';

diff --git a/packages/lexical-playground/src/plugins/AutocompleteExtension/dictionary.ts b/packages/lexical-playground/src/plugins/AutocompleteExtension/dictionary.ts
@@ -51,6 +51,12 @@ export interface WordlistDictionaryOptions {
  * wordlist. Returns the first word in the list that starts with the
  * prefix (and is longer than it), case-insensitive by default.
  *
+ * Case-insensitive matching assumes `toLowerCase()` preserves string
+ * length, which holds for the wordlists this targets (English, Hangul,
+ * kana, Han). Words where lowercasing changes length — e.g. Turkish
+ * `İ` (U+0130) becomes two code units — are not handled specially;
+ * pass `caseSensitive: true` for wordlists containing them.
+ *
  * For most languages this is the minimum useful implementation. Plug
  * one in as:
  *
@@ -67,66 +73,62 @@ export function createWordlistDictionary(
   options: WordlistDictionaryOptions = {},
 ): AutocompleteDictionary {
   const {minPrefixLength = 2, caseSensitive = false} = options;
-  // Trie keyed on the case-folded character; each terminal stores the
-  // original word so the returned suffix preserves the source casing.
-  // Traversal order matches `words` insertion (Map iteration), so
-  // `query` returns the earliest-listed match for a prefix — same
-  // semantics as the previous linear scan, but O(prefix.length) per
-  // lookup instead of O(N * prefix.length).
-  const root: TrieNode = {children: new Map(), word: null};
-  for (const word of words) {
-    const key = caseSensitive ? word : word.toLowerCase();
-    let node = root;
-    for (const char of key) {
-      let child = node.children.get(char);
-      if (child === undefined) {
-        child = {children: new Map(), word: null};
-        node.children.set(char, child);
-      }
-      node = child;
-    }
-    if (node.word === null) {
-      node.word = word;
-    }
-  }
+  const fold = (text: string): string =>
+    caseSensitive ? text : text.toLowerCase();
+  // Index the wordlist for prefix lookups with a single integer of
+  // overhead per word. `order` holds the word indices sorted by their
+  // case-folded text, so the words sharing any given prefix form one
+  // contiguous block. `query` binary-searches for the start of that
+  // block and scans it for the earliest-listed (highest-priority) word
+  // longer than the prefix — the same word a linear `Array.find` scan
+  // would return, but without scanning the whole list. Folded text is
+  // precomputed only to build the order; lookup recomputes it on the
+  // fly, so `query` reads just `words` and `order` (a `Uint32Array`).
+  const folded = words.map(fold);
+  const order = Uint32Array.from(
+    words
+      .map((_, index) => index)
+      .sort((a, b) =>
+        folded[a] < folded[b] ? -1 : folded[a] > folded[b] ? 1 : a - b,
+      ),
+  );
   return {
     minPrefixLength,
     query(prefix: string): null | string {
       if (prefix.length < minPrefixLength) {
         return null;
       }
-      const needle = caseSensitive ? prefix : prefix.toLowerCase();
-      let node = root;
-      for (const char of needle) {
-        const next = node.children.get(char);
-        if (next === undefined) {
-          return null;
+      const needle = fold(prefix);
+      // Lower bound: first position whose folded word is >= needle.
+      let lo = 0;
+      let hi = order.length;
+      while (lo < hi) {
+        const mid = (lo + hi) >>> 1;
+        if (fold(words[order[mid]]) < needle) {
+          lo = mid + 1;
+        } else {
+          hi = mid;
+        }
+      }
+      // Scan the contiguous block of words that start with `needle` for
+      // the earliest-listed entry strictly longer than the prefix.
+      let bestIndex = -1;
+      let bestWord: string | null = null;
+      for (let k = lo; k < order.length; k++) {
+        const index = order[k];
+        const word = words[index];
+        if (!fold(word).startsWith(needle)) {
+          break;
+        }
+        if (
+          word.length > prefix.length &&
+          (bestIndex === -1 || index < bestIndex)
+        ) {
+          bestIndex = index;
+          bestWord = word;
         }
-        node = next;
       }
-      const match = findFirstSuggestion(node, prefix.length);
-      return match === null ? null : match.substring(prefix.length);
+      return bestWord === null ? null : bestWord.substring(prefix.length);
     },
   };
 }
-
-interface TrieNode {
-  children: Map<string, TrieNode>;
-  word: string | null;
-}
-
-function findFirstSuggestion(
-  node: TrieNode,
-  prefixLength: number,
-): string | null {
-  if (node.word !== null && node.word.length > prefixLength) {
-    return node.word;
-  }
-  for (const child of node.children.values()) {
-    const found = findFirstSuggestion(child, prefixLength);
-    if (found !== null) {
-      return found;
-    }
-  }
-  return null;
-}