Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,22 @@ describe('detectLanguage', () => {
{expected: 'ja', input: 'あり', label: 'hiragana'},
{expected: 'ja', input: 'カタカナ', label: 'katakana'},
{expected: 'ja', input: '漢字', label: 'CJK Unified ideograph'},
{expected: 'ja', input: '㐀', label: 'CJK Extension A (BMP)'},
{
expected: 'ja',
input: '\u{20000}',
label: 'supplementary-plane CJK ideograph (Ext B)',
},
{
expected: 'ja',
input: 'foo \u{20000}',
label: 'trailing supplementary-plane ideograph after ascii',
},
{
expected: 'en',
input: '\u{1F642}',
label: 'astral emoji is not a CJK/Hangul script',
},
{
expected: 'en',
input: '안녕 hi',
Expand Down Expand Up @@ -146,4 +162,125 @@ describe('createWordlistDictionary', () => {
const dict = createWordlistDictionary(words, {minPrefixLength: 3});
expect(dict.minPrefixLength).toBe(3);
});

// The list is a priority order (most-common / highest-ranked first), so
// `query` must return the *earliest-listed* completion for a prefix —
// never merely the shortest one that happens to share it.
test('returns the earliest-listed match, not the shortest, for nested words', () => {
  // List position is the ranking: 'conditions' is listed ahead of the
  // shorter 'condition' it contains, so 'conditions' must be completed.
  const rankedWords = ['conditions', 'condition'];
  const dictionary = createWordlistDictionary(rankedWords, {
    minPrefixLength: 4,
  });
  const completion = dictionary.query('cond');
  expect(completion).toBe('itions');
});

test('a sorted nested pair resolves to the earlier (shorter) entry', () => {
  // With the shorter word listed first, it is the one that gets completed.
  const rankedWords = ['condition', 'conditions'];
  const dictionary = createWordlistDictionary(rankedWords, {
    minPrefixLength: 4,
  });
  const completion = dictionary.query('cond');
  expect(completion).toBe('ition');
});

test('picks the highest-priority of several matches at each prefix length', () => {
  const dictionary = createWordlistDictionary(['apply', 'apple', 'application']);
  // [prefix, expected suffix] — 'apply' is listed first, so it answers both.
  const expectations: Array<[string, string]> = [
    ['app', 'ly'],
    ['appl', 'y'],
  ];
  for (const [prefix, suffix] of expectations) {
    expect(dictionary.query(prefix)).toBe(suffix);
  }
});

test('a word equal to the prefix is never its own suggestion', () => {
  // 'form' matches the prefix exactly and so has nothing left to suggest;
  // the next entry, 'formal', supplies the completion.
  const rankedWords = ['form', 'formal'];
  const dictionary = createWordlistDictionary(rankedWords);
  const completion = dictionary.query('form');
  expect(completion).toBe('al');
});

test('returns null when the only match equals the prefix', () => {
  // The sole entry is the prefix itself, so there is no suffix to offer.
  const dictionary = createWordlistDictionary(['form']);
  const completion = dictionary.query('form');
  expect(completion).toBeNull();
});

test('returns null for a prefix longer than every entry', () => {
  // 'testing' extends past the only entry, 'test', so nothing can match.
  const dictionary = createWordlistDictionary(['test']);
  const completion = dictionary.query('testing');
  expect(completion).toBeNull();
});

test('an empty wordlist yields no suggestions', () => {
  // With no entries at all, every query comes back empty.
  const emptyDictionary = createWordlistDictionary([]);
  const completion = emptyDictionary.query('anything');
  expect(completion).toBeNull();
});

test('builds and queries multi-byte (Hangul) words', () => {
  const hangulWords = ['사용', '사용법', '사용자'];
  const dictionary = createWordlistDictionary(hangulWords);

  // 사용 equals the prefix, so the first longer entry, 사용법, is completed.
  const twoSyllables = dictionary.query('사용');
  expect(twoSyllables).toBe('법');

  // One syllable is shorter than the default minPrefixLength.
  const oneSyllable = dictionary.query('사');
  expect(oneSyllable).toBeNull();
});

test('matches case-insensitively while preserving the source casing in the suffix', () => {
  const dictionary = createWordlistDictionary(['JavaScript']);
  // Whatever the casing of the typed prefix, the suffix keeps the
  // wordlist's own casing.
  for (const typed of ['java', 'JAVA']) {
    expect(dictionary.query(typed)).toBe('Script');
  }
});

test('minPrefixLength shortens the set of queryable prefixes', () => {
  const dictionary = createWordlistDictionary(['testing'], {minPrefixLength: 4});

  // Three characters is below the configured minimum of four.
  const tooShort = dictionary.query('tes');
  expect(tooShort).toBeNull();

  // Four characters meets the minimum and completes normally.
  const longEnough = dictionary.query('test');
  expect(longEnough).toBe('ing');
});

test('duplicate entries are handled and the first occurrence wins', () => {
  // 'testing' appears twice ahead of 'tester'; it must still be the answer.
  const rankedWords = ['testing', 'testing', 'tester'];
  const dictionary = createWordlistDictionary(rankedWords);
  const completion = dictionary.query('test');
  expect(completion).toBe('ing');
});
});

describe('createWordlistDictionary — Korean (non-ASCII) wordlist', () => {
  // Real multi-syllable Korean nouns in priority order. Several share the
  // '사용' ("use") or '학' ("study / school") stem, so prefix lookup,
  // list-order priority, and minPrefixLength gating all run against Hangul
  // instead of ASCII. Each precomposed Hangul syllable is one UTF-16 unit,
  // which makes a two-syllable prefix have length 2.
  const hangulNouns = [
    '사용', // use
    '사용법', // instructions
    '사용자', // user
    '학교', // school
    '학생', // student
    '학생회', // student council
  ];

  // Fresh dictionary over the shared nouns with default options.
  const buildDefault = () => createWordlistDictionary(hangulNouns);

  test('completes a Hangul prefix to its earliest-listed longer word', () => {
    const dictionary = buildDefault();
    // 사용법 is listed ahead of 사용자.
    expect(dictionary.query('사용')).toBe('법');
    // 학생회 is the only entry longer than 학생 that starts with it.
    expect(dictionary.query('학생')).toBe('회');
  });

  test('respects the default minPrefixLength of 2 on Hangul', () => {
    const dictionary = buildDefault();
    // A single syllable falls below the minimum.
    for (const singleSyllable of ['사', '학']) {
      expect(dictionary.query(singleSyllable)).toBeNull();
    }
  });

  test('a complete word with no longer entry yields no suggestion', () => {
    const dictionary = buildDefault();
    // Nothing in the list extends 학교.
    const completion = dictionary.query('학교');
    expect(completion).toBeNull();
  });

  test('returns null when no entry shares the Hangul prefix', () => {
    const dictionary = buildDefault();
    // 컴퓨터 (computer) is absent from the list.
    const completion = dictionary.query('컴퓨');
    expect(completion).toBeNull();
  });

  test('honours the priority order among words sharing a stem', () => {
    // Here 사용자 is listed ahead of 사용법, so it is the one completed.
    const reordered = ['사용자', '사용법', '사용권'];
    const dictionary = createWordlistDictionary(reordered);
    expect(dictionary.query('사용')).toBe('자');
  });

  test('a custom minPrefixLength gates shorter Hangul prefixes', () => {
    const dictionary = createWordlistDictionary(['학생회', '학생회장'], {
      minPrefixLength: 3,
    });
    // Two syllables is below the configured minimum of three.
    expect(dictionary.query('학생')).toBeNull();
    // Three syllables meets the minimum and completes.
    expect(dictionary.query('학생회')).toBe('장');
  });
});
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
*/

// Script-range helpers. Add a new entry when introducing a new language;
// the BMP ranges below cover the common ones for IME-using languages.
// the ranges below cover the common scripts for IME-using languages,
// including the supplementary-plane CJK ideographs.

function isHangul(cp: number): boolean {
return (
Expand All @@ -26,8 +27,13 @@ function isJapaneseKana(cp: number): boolean {
);
}

function isCJKUnified(cp: number): boolean {
return cp >= 0x4e00 && cp <= 0x9fff;
function isCJKIdeograph(cp: number): boolean {
return (
(cp >= 0x3400 && cp <= 0x4dbf) || // CJK Unified Ideographs Extension A
(cp >= 0x4e00 && cp <= 0x9fff) || // CJK Unified Ideographs
(cp >= 0xf900 && cp <= 0xfaff) || // CJK Compatibility Ideographs
(cp >= 0x20000 && cp <= 0x2fa1f) // Supplementary Ideographic Plane (Ext B–F + Compatibility Supplement)
);
}

function isZeroWidthOrControl(cp: number): boolean {
Expand All @@ -50,26 +56,31 @@ function isZeroWidthOrControl(cp: number): boolean {
*
* Returns `'en'` for ASCII / Latin (the catch-all), `'ko'` for Hangul,
* and `'ja'` for both kana-bearing prefixes and prefixes whose last
* visible codepoint is a CJK Unified Ideograph. The CJK Unified range
* is shared between Japanese kanji and Chinese hanzi; the default
* visible codepoint is a CJK ideograph (CJK Unified, Extension A,
* Compatibility Ideographs, or the supplementary plane). That range is
* shared between Japanese kanji and Chinese hanzi; the default
* dictionary set covers English and Korean only, so kana and CJK
* Unified prefixes produce no suggestion until a host registers a
* ideograph prefixes produce no suggestion until a host registers a
* dictionary under the `ja` (or `zh`) key. Japanese is omitted from
* the defaults because the platform IMEs already provide their own
* dropdown autocompletion. Hosts that need to distinguish Chinese
* from Japanese should pass a custom `detectLanguage` that uses
* application context (locale, user preference).
*/
export function detectLanguage(text: string): string {
for (let i = text.length - 1; i >= 0; i--) {
const cp = text.codePointAt(i);
// Iterate real code points from the end so a trailing supplementary-
// plane ideograph is read whole rather than as its lone low surrogate,
// skipping zero-width and control characters.
const codePoints = Array.from(text);
for (let i = codePoints.length - 1; i >= 0; i--) {
const cp = codePoints[i].codePointAt(0);
if (cp === undefined || isZeroWidthOrControl(cp)) {
continue;
}
if (isHangul(cp)) {
return 'ko';
}
if (isJapaneseKana(cp) || isCJKUnified(cp)) {
if (isJapaneseKana(cp) || isCJKIdeograph(cp)) {
return 'ja';
}
return 'en';
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,12 @@ export interface WordlistDictionaryOptions {
* wordlist. Returns the first word in the list that starts with the
* prefix (and is longer than it), case-insensitive by default.
*
* Case-insensitive matching assumes that `toLowerCase()` preserves string
* length, which holds for the text this targets (ASCII and most Latin
* letters, Hangul, kana, Han). Characters whose lower-case form has a
* different length — e.g. Turkish `İ` (U+0130), which lower-cases to the
* two code points U+0069 U+0307 — are not handled specially; pass
* `caseSensitive: true` for wordlists that contain them.
*
* For most languages this is the minimum useful implementation. Plug
* one in as:
*
Expand All @@ -67,66 +73,62 @@ export function createWordlistDictionary(
options: WordlistDictionaryOptions = {},
): AutocompleteDictionary {
const {minPrefixLength = 2, caseSensitive = false} = options;
// Trie keyed on the case-folded character; each terminal stores the
// original word so the returned suffix preserves the source casing.
// Traversal order matches `words` insertion (Map iteration), so
// `query` returns the earliest-listed match for a prefix — same
// semantics as the previous linear scan, but O(prefix.length) per
// lookup instead of O(N * prefix.length).
const root: TrieNode = {children: new Map(), word: null};
for (const word of words) {
const key = caseSensitive ? word : word.toLowerCase();
let node = root;
for (const char of key) {
let child = node.children.get(char);
if (child === undefined) {
child = {children: new Map(), word: null};
node.children.set(char, child);
}
node = child;
}
if (node.word === null) {
node.word = word;
}
}
const fold = (text: string): string =>
caseSensitive ? text : text.toLowerCase();
// Index the wordlist for prefix lookups with a single integer of
// overhead per word. `order` holds the word indices sorted by their
// case-folded text, so the words sharing any given prefix form one
// contiguous block. `query` binary-searches for the start of that
// block and scans it for the earliest-listed (highest-priority) word
// longer than the prefix — the same word a linear `Array.find` scan
// would return, but without scanning the whole list. Folded text is
// only needed to build the order and is recomputed on the fly during
// lookup, so nothing but `order` (a `Uint32Array`) is retained.
const folded = words.map(fold);
const order = Uint32Array.from(
words
.map((_, index) => index)
.sort((a, b) =>
folded[a] < folded[b] ? -1 : folded[a] > folded[b] ? 1 : a - b,
),
);
return {
minPrefixLength,
query(prefix: string): null | string {
if (prefix.length < minPrefixLength) {
return null;
}
const needle = caseSensitive ? prefix : prefix.toLowerCase();
let node = root;
for (const char of needle) {
const next = node.children.get(char);
if (next === undefined) {
return null;
const needle = fold(prefix);
// Lower bound: first position whose folded word is >= needle.
let lo = 0;
let hi = order.length;
while (lo < hi) {
const mid = (lo + hi) >>> 1;
if (fold(words[order[mid]]) < needle) {
lo = mid + 1;
} else {
hi = mid;
}
}
// Scan the contiguous block of words that start with `needle` for
// the earliest-listed entry strictly longer than the prefix.
let bestIndex = -1;
let bestWord: string | null = null;
for (let k = lo; k < order.length; k++) {
const index = order[k];
const word = words[index];
if (!fold(word).startsWith(needle)) {
break;
}
if (
word.length > prefix.length &&
(bestIndex === -1 || index < bestIndex)
) {
bestIndex = index;
bestWord = word;
}
node = next;
}
const match = findFirstSuggestion(node, prefix.length);
return match === null ? null : match.substring(prefix.length);
return bestWord === null ? null : bestWord.substring(prefix.length);
},
};
}

/**
 * One node of the prefix trie walked by `findFirstSuggestion`.
 */
interface TrieNode {
// Child nodes by string key; `Map` iteration follows insertion order,
// which is what gives the depth-first walk its ordering.
children: Map<string, TrieNode>;
// Word stored at this node, or `null` when no word is stored here.
word: string | null;
}

/**
 * Walks the subtree rooted at `node` depth-first (pre-order, children in
 * `Map` insertion order) and returns the first stored word whose length
 * exceeds `prefixLength`.
 *
 * A node's own word is tested before any of its children, so a shorter
 * word stored at a node is returned ahead of every longer word beneath it.
 *
 * @param node - Root of the subtree to search.
 * @param prefixLength - Length a stored word must exceed to qualify.
 * @returns The first qualifying word, or `null` if the subtree has none.
 */
function findFirstSuggestion(
  node: TrieNode,
  prefixLength: number,
): string | null {
  // Explicit stack in place of recursion. Children are pushed in reverse
  // so they pop in insertion order, giving the same pre-order visit.
  const pending: TrieNode[] = [node];
  for (
    let current = pending.pop();
    current !== undefined;
    current = pending.pop()
  ) {
    const candidate = current.word;
    if (candidate !== null && candidate.length > prefixLength) {
      return candidate;
    }
    const descendants = Array.from(current.children.values());
    pending.push(...descendants.reverse());
  }
  return null;
}