diff --git a/packages/lexical-playground/__tests__/unit/AutocompleteExtensionHelpers.test.ts b/packages/lexical-playground/__tests__/unit/AutocompleteExtensionHelpers.test.ts index baf5c961036..a6ed04949e9 100644 --- a/packages/lexical-playground/__tests__/unit/AutocompleteExtensionHelpers.test.ts +++ b/packages/lexical-playground/__tests__/unit/AutocompleteExtensionHelpers.test.ts @@ -24,6 +24,22 @@ describe('detectLanguage', () => { {expected: 'ja', input: 'あり', label: 'hiragana'}, {expected: 'ja', input: 'カタカナ', label: 'katakana'}, {expected: 'ja', input: '漢字', label: 'CJK Unified ideograph'}, + {expected: 'ja', input: '㐀', label: 'CJK Extension A (BMP)'}, + { + expected: 'ja', + input: '\u{20000}', + label: 'supplementary-plane CJK ideograph (Ext B)', + }, + { + expected: 'ja', + input: 'foo \u{20000}', + label: 'trailing supplementary-plane ideograph after ascii', + }, + { + expected: 'en', + input: '\u{1F642}', + label: 'astral emoji is not a CJK/Hangul script', + }, { expected: 'en', input: '안녕 hi', @@ -146,4 +162,125 @@ describe('createWordlistDictionary', () => { const dict = createWordlistDictionary(words, {minPrefixLength: 3}); expect(dict.minPrefixLength).toBe(3); }); + + // The list is a priority order (most-common / highest-ranked first), so + // `query` must return the *earliest-listed* completion for a prefix — + // never merely the shortest one that happens to share it. + test('returns the earliest-listed match, not the shortest, for nested words', () => { + // 'conditions' precedes the shorter 'condition' it contains; the + // earlier entry must still win. + const dict = createWordlistDictionary(['conditions', 'condition'], { + minPrefixLength: 4, + }); + expect(dict.query('cond')).toBe('itions'); + }); + + test('a sorted nested pair resolves to the earlier (shorter) entry', () => { + const dict = createWordlistDictionary(['condition', 'conditions'], { + minPrefixLength: 4, + }); + expect(dict.query('cond')).toBe('ition'); + }); + + test('picks the highest-priority of several matches at each prefix length', () => { + const dict = createWordlistDictionary(['apply', 'apple', 'application']); + expect(dict.query('app')).toBe('ly'); + expect(dict.query('appl')).toBe('y'); + }); + + test('a word equal to the prefix is never its own suggestion', () => { + // 'form' leaves no suffix, so the longer 'formal' answers instead. + const dict = createWordlistDictionary(['form', 'formal']); + expect(dict.query('form')).toBe('al'); + }); + + test('returns null when the only match equals the prefix', () => { + const dict = createWordlistDictionary(['form']); + expect(dict.query('form')).toBeNull(); + }); + + test('returns null for a prefix longer than every entry', () => { + const dict = createWordlistDictionary(['test']); + expect(dict.query('testing')).toBeNull(); + }); + + test('an empty wordlist yields no suggestions', () => { + const dict = createWordlistDictionary([]); + expect(dict.query('anything')).toBeNull(); + }); + + test('builds and queries multi-byte (Hangul) words', () => { + const dict = createWordlistDictionary(['사용', '사용법', '사용자']); + expect(dict.query('사용')).toBe('법'); // 사용법 is listed first + expect(dict.query('사')).toBeNull(); // below the default minPrefixLength + }); + + test('matches case-insensitively while preserving the source casing in the suffix', () => { + const dict = createWordlistDictionary(['JavaScript']); + expect(dict.query('java')).toBe('Script'); + expect(dict.query('JAVA')).toBe('Script'); + }); + + test('minPrefixLength shortens the set of queryable prefixes', () => { + const dict = createWordlistDictionary(['testing'], {minPrefixLength: 4}); + expect(dict.query('tes')).toBeNull(); + expect(dict.query('test')).toBe('ing'); + }); + + test('duplicate entries are handled and the first occurrence wins', () => { + const dict = createWordlistDictionary(['testing', 'testing', 'tester']); + expect(dict.query('test')).toBe('ing'); + }); +}); + +describe('createWordlistDictionary — Korean (non-ASCII) wordlist', () => { + // A small slice of real multi-syllable Korean nouns. Several share the + // '사용' ("use") and '학' ("study / school") stems, so prefix lookups, + // priority order, and minPrefixLength gating are all exercised on + // Hangul rather than ASCII. Hangul syllables are single UTF-16 units, + // so a two-syllable prefix has length 2. + const koreanWords = [ + '사용', // use + '사용법', // instructions + '사용자', // user + '학교', // school + '학생', // student + '학생회', // student council + ]; + + test('completes a Hangul prefix to its earliest-listed longer word', () => { + const dict = createWordlistDictionary(koreanWords); + expect(dict.query('사용')).toBe('법'); // 사용법 precedes 사용자 + expect(dict.query('학생')).toBe('회'); // 학생회 is the only longer 학생* + }); + + test('respects the default minPrefixLength of 2 on Hangul', () => { + const dict = createWordlistDictionary(koreanWords); + expect(dict.query('사')).toBeNull(); // single syllable, below the minimum + expect(dict.query('학')).toBeNull(); + }); + + test('a complete word with no longer entry yields no suggestion', () => { + const dict = createWordlistDictionary(koreanWords); + expect(dict.query('학교')).toBeNull(); // 학교 is a leaf + }); + + test('returns null when no entry shares the Hangul prefix', () => { + const dict = createWordlistDictionary(koreanWords); + expect(dict.query('컴퓨')).toBeNull(); // 컴퓨터 (computer) is not in the list + }); + + test('honours the priority order among words sharing a stem', () => { + // Re-ordered so 사용자 outranks 사용법; the earliest-listed wins. + const dict = createWordlistDictionary(['사용자', '사용법', '사용권']); + expect(dict.query('사용')).toBe('자'); + }); + + test('a custom minPrefixLength gates shorter Hangul prefixes', () => { + const dict = createWordlistDictionary(['학생회', '학생회장'], { + minPrefixLength: 3, + }); + expect(dict.query('학생')).toBeNull(); // 2 syllables < 3 + expect(dict.query('학생회')).toBe('장'); // 3-syllable prefix completes + }); }); diff --git a/packages/lexical-playground/src/plugins/AutocompleteExtension/detectLanguage.ts b/packages/lexical-playground/src/plugins/AutocompleteExtension/detectLanguage.ts index fb1107de6c1..1311bdd9d14 100644 --- a/packages/lexical-playground/src/plugins/AutocompleteExtension/detectLanguage.ts +++ b/packages/lexical-playground/src/plugins/AutocompleteExtension/detectLanguage.ts @@ -7,7 +7,8 @@ */ // Script-range helpers. Add a new entry when introducing a new language; -// the BMP ranges below cover the common ones for IME-using languages. +// the ranges below cover the common scripts for IME-using languages, +// including the supplementary-plane CJK ideographs. function isHangul(cp: number): boolean { return ( @@ -26,8 +27,13 @@ function isJapaneseKana(cp: number): boolean { ); } -function isCJKUnified(cp: number): boolean { - return cp >= 0x4e00 && cp <= 0x9fff; +function isCJKIdeograph(cp: number): boolean { + return ( + (cp >= 0x3400 && cp <= 0x4dbf) || // CJK Unified Ideographs Extension A + (cp >= 0x4e00 && cp <= 0x9fff) || // CJK Unified Ideographs + (cp >= 0xf900 && cp <= 0xfaff) || // CJK Compatibility Ideographs + (cp >= 0x20000 && cp <= 0x2fa1f) // Supplementary Ideographic Plane (Ext B–F + Compatibility Supplement) + ); } function isZeroWidthOrControl(cp: number): boolean { @@ -50,10 +56,11 @@ function isZeroWidthOrControl(cp: number): boolean { * * Returns `'en'` for ASCII / Latin (the catch-all), `'ko'` for Hangul, * and `'ja'` for both kana-bearing prefixes and prefixes whose last - * visible codepoint is a CJK Unified Ideograph. The CJK Unified range - * is shared between Japanese kanji and Chinese hanzi; the default + * visible codepoint is a CJK ideograph (CJK Unified, Extension A, + * Compatibility Ideographs, or the supplementary plane). That range is + * shared between Japanese kanji and Chinese hanzi; the default * dictionary set covers English and Korean only, so kana and CJK - * Unified prefixes produce no suggestion until a host registers a + * ideograph prefixes produce no suggestion until a host registers a * dictionary under the `ja` (or `zh`) key. Japanese is omitted from * the defaults because the platform IMEs already provide their own * dropdown autocompletion. Hosts that need to distinguish Chinese @@ -61,15 +68,19 @@ function isZeroWidthOrControl(cp: number): boolean { * application context (locale, user preference). */ export function detectLanguage(text: string): string { - for (let i = text.length - 1; i >= 0; i--) { - const cp = text.codePointAt(i); + // Iterate real code points from the end so a trailing supplementary- + // plane ideograph is read whole rather than as its lone low surrogate, + // skipping zero-width and control characters. + const codePoints = Array.from(text); + for (let i = codePoints.length - 1; i >= 0; i--) { + const cp = codePoints[i].codePointAt(0); if (cp === undefined || isZeroWidthOrControl(cp)) { continue; } if (isHangul(cp)) { return 'ko'; } - if (isJapaneseKana(cp) || isCJKUnified(cp)) { + if (isJapaneseKana(cp) || isCJKIdeograph(cp)) { return 'ja'; } return 'en'; diff --git a/packages/lexical-playground/src/plugins/AutocompleteExtension/dictionary.ts b/packages/lexical-playground/src/plugins/AutocompleteExtension/dictionary.ts index d8bb854c35c..58cf4cbc359 100644 --- a/packages/lexical-playground/src/plugins/AutocompleteExtension/dictionary.ts +++ b/packages/lexical-playground/src/plugins/AutocompleteExtension/dictionary.ts @@ -51,6 +51,12 @@ export interface WordlistDictionaryOptions { * wordlist. Returns the first word in the list that starts with the * prefix (and is longer than it), case-insensitive by default. * + * Case-insensitive matching assumes case folding preserves length, + * which holds for the scripts this targets (Latin, Hangul, kana, Han). + * Scripts where `toLowerCase()` changes length — e.g. Turkish `İ` + * (U+0130) folds to two code points — are not handled specially; pass + * `caseSensitive: true` for wordlists in those scripts. + * * For most languages this is the minimum useful implementation. Plug * one in as: * @@ -67,66 +73,62 @@ export function createWordlistDictionary( options: WordlistDictionaryOptions = {}, ): AutocompleteDictionary { const {minPrefixLength = 2, caseSensitive = false} = options; - // Trie keyed on the case-folded character; each terminal stores the - // original word so the returned suffix preserves the source casing. - // Traversal order matches `words` insertion (Map iteration), so - // `query` returns the earliest-listed match for a prefix — same - // semantics as the previous linear scan, but O(prefix.length) per - // lookup instead of O(N * prefix.length). - const root: TrieNode = {children: new Map(), word: null}; - for (const word of words) { - const key = caseSensitive ? word : word.toLowerCase(); - let node = root; - for (const char of key) { - let child = node.children.get(char); - if (child === undefined) { - child = {children: new Map(), word: null}; - node.children.set(char, child); - } - node = child; - } - if (node.word === null) { - node.word = word; - } - } + const fold = (text: string): string => + caseSensitive ? text : text.toLowerCase(); + // Index the wordlist for prefix lookups with a single integer of + // overhead per word. `order` holds the word indices sorted by their + // case-folded text, so the words sharing any given prefix form one + // contiguous block. `query` binary-searches for the start of that + // block and scans it for the earliest-listed (highest-priority) word + // longer than the prefix — the same word a linear `Array.find` scan + // would return, but without scanning the whole list. Folded text is + // only needed to build the order and is recomputed on the fly during + // lookup, so nothing but `order` (a `Uint32Array`) is retained. + const folded = words.map(fold); + const order = Uint32Array.from( + words + .map((_, index) => index) + .sort((a, b) => + folded[a] < folded[b] ? -1 : folded[a] > folded[b] ? 1 : a - b, + ), + ); return { minPrefixLength, query(prefix: string): null | string { if (prefix.length < minPrefixLength) { return null; } - const needle = caseSensitive ? prefix : prefix.toLowerCase(); - let node = root; - for (const char of needle) { - const next = node.children.get(char); - if (next === undefined) { - return null; + const needle = fold(prefix); + // Lower bound: first position whose folded word is >= needle. + let lo = 0; + let hi = order.length; + while (lo < hi) { + const mid = (lo + hi) >>> 1; + if (fold(words[order[mid]]) < needle) { + lo = mid + 1; + } else { + hi = mid; + } + } + // Scan the contiguous block of words that start with `needle` for + // the earliest-listed entry strictly longer than the prefix. + let bestIndex = -1; + let bestWord: string | null = null; + for (let k = lo; k < order.length; k++) { + const index = order[k]; + const word = words[index]; + if (!fold(word).startsWith(needle)) { + break; + } + if ( + word.length > prefix.length && + (bestIndex === -1 || index < bestIndex) + ) { + bestIndex = index; + bestWord = word; } - node = next; } - const match = findFirstSuggestion(node, prefix.length); - return match === null ? null : match.substring(prefix.length); + return bestWord === null ? null : bestWord.substring(prefix.length); }, }; } - -interface TrieNode { - children: Map; - word: string | null; -} - -function findFirstSuggestion( - node: TrieNode, - prefixLength: number, -): string | null { - if (node.word !== null && node.word.length > prefixLength) { - return node.word; - } - for (const child of node.children.values()) { - const found = findFirstSuggestion(child, prefixLength); - if (found !== null) { - return found; - } - } - return null; -}