From f43fd4b6a261338da70556396a9c810884674e17 Mon Sep 17 00:00:00 2001 From: weareoutman Date: Thu, 29 Oct 2020 11:39:50 +0800 Subject: [PATCH] fix: fix languages other than en and zh #9 --- .../theme/SearchBar/fetchIndexes.spec.ts | 1 + src/client/theme/SearchBar/fetchIndexes.ts | 2 +- src/client/utils/SearchSourceFactory.spec.ts | 8 ++--- src/client/utils/SearchSourceFactory.ts | 3 +- .../utils/__mocks__/proxiedGenerated.ts | 1 + src/client/utils/smartQueries.spec.ts | 13 +++++++ src/client/utils/smartTerms.spec.ts | 1 + src/client/utils/smartTerms.ts | 10 +++--- src/client/utils/tokenize.spec.ts | 33 ++++++++++++++--- src/client/utils/tokenize.ts | 23 ++++++++++-- src/declarations.ts | 1 + src/server/utils/buildIndex.spec.ts | 35 +++++++++++++++++++ src/server/utils/buildIndex.ts | 4 +++ src/server/utils/generate.spec.ts | 24 +++++++++++++ src/server/utils/generate.ts | 8 +++++ 15 files changed, 149 insertions(+), 18 deletions(-) diff --git a/src/client/theme/SearchBar/fetchIndexes.spec.ts b/src/client/theme/SearchBar/fetchIndexes.spec.ts index fa16aa47..89e4d2f9 100644 --- a/src/client/theme/SearchBar/fetchIndexes.spec.ts +++ b/src/client/theme/SearchBar/fetchIndexes.spec.ts @@ -49,6 +49,7 @@ describe("fetchIndexes", () => { index: { invertedIndex: [ ["hello"], + ["alfabetização"], ["世界"], ["和平"], ["世界"], diff --git a/src/client/theme/SearchBar/fetchIndexes.ts b/src/client/theme/SearchBar/fetchIndexes.ts index 0ffc7ccf..813ed2f0 100644 --- a/src/client/theme/SearchBar/fetchIndexes.ts +++ b/src/client/theme/SearchBar/fetchIndexes.ts @@ -34,7 +34,7 @@ export async function fetchIndexes( const zhDictionary = json.reduce((acc, item) => { for (const tuple of item.index.invertedIndex) { - if (!/\w/.test(tuple[0][0])) { + if (/\p{Unified_Ideograph}/u.test(tuple[0][0])) { acc.add(tuple[0]); } } diff --git a/src/client/utils/SearchSourceFactory.spec.ts b/src/client/utils/SearchSourceFactory.spec.ts index dd8b06d7..91469b20 100644 --- a/src/client/utils/SearchSourceFactory.spec.ts +++ b/src/client/utils/SearchSourceFactory.spec.ts @@ -1,11 +1,9 @@ import lunr from "lunr"; -import { - SearchDocument, - SearchResult, - WrappedIndex, -} from "../../shared/interfaces"; +import { SearchDocument } from "../../shared/interfaces"; import { SearchSourceFactory } from "./SearchSourceFactory"; +jest.mock("./proxiedGenerated"); + describe("SearchSourceFactory", () => { const documentsOfTitles: SearchDocument[] = [ { diff --git a/src/client/utils/SearchSourceFactory.ts b/src/client/utils/SearchSourceFactory.ts index f433b096..3084b3a3 100644 --- a/src/client/utils/SearchSourceFactory.ts +++ b/src/client/utils/SearchSourceFactory.ts @@ -9,6 +9,7 @@ import { } from "../../shared/interfaces"; import { sortSearchResults } from "./sortSearchResults"; import { processTreeStatusOfSearchResults } from "./processTreeStatusOfSearchResults"; +import { language } from "./proxiedGenerated"; export function SearchSourceFactory( wrappedIndexes: WrappedIndex[], @@ -19,7 +20,7 @@ export function SearchSourceFactory( input: string, callback: (results: SearchResult[]) => void ): void { - const rawTokens = tokenize(input); + const rawTokens = tokenize(input, language); if (rawTokens.length === 0) { callback([]); return; diff --git a/src/client/utils/__mocks__/proxiedGenerated.ts b/src/client/utils/__mocks__/proxiedGenerated.ts index e2dac823..4cf3a28f 100644 --- a/src/client/utils/__mocks__/proxiedGenerated.ts +++ b/src/client/utils/__mocks__/proxiedGenerated.ts @@ -1,3 +1,4 @@ +export const language = ["en", "zh"]; export const indexHash = "abc"; export const searchResultLimits = 8; export const searchResultContextMaxLength = 50; diff --git a/src/client/utils/smartQueries.spec.ts b/src/client/utils/smartQueries.spec.ts index 394e5c8a..97572ea1 100644 --- a/src/client/utils/smartQueries.spec.ts +++ b/src/client/utils/smartQueries.spec.ts @@ -115,6 +115,19 @@ describe("smartQueries", () => { }, ], ], + [ + ["termos", "alfabetização"], + [ + { + tokens: ["termos", "alfabetização"], + keyword: "+termos +alfabetização", + }, + { + tokens: ["termos", "alfabetização"], + keyword: "+termos +alfabetização*", + }, + ], + ], ])("smartQueries(%j, zhDictionary) should return %j", (tokens, queries) => { expect(smartQueries(tokens, zhDictionary)).toEqual(queries); }); diff --git a/src/client/utils/smartTerms.spec.ts b/src/client/utils/smartTerms.spec.ts index 870440f7..931e9340 100644 --- a/src/client/utils/smartTerms.spec.ts +++ b/src/client/utils/smartTerms.spec.ts @@ -25,6 +25,7 @@ describe("smartTerms", () => { ], ], [["hello", "world", "命"], []], + [["alfabetização"], [["alfabetização"]]], ])("smartTerms(%j, zhDictionary) should return %j", (tokens, queries) => { expect(smartTerms(tokens, zhDictionary)).toEqual(queries); }); diff --git a/src/client/utils/smartTerms.ts b/src/client/utils/smartTerms.ts index 4baf735a..ba3f6abd 100644 --- a/src/client/utils/smartTerms.ts +++ b/src/client/utils/smartTerms.ts @@ -2,7 +2,7 @@ import { SmartTerm } from "../../shared/interfaces"; import { cutZhWords } from "./cutZhWords"; /** - * Get all possible terms for a list of tokens consists of words mixed English and Chinese, + * Get all possible terms for a list of tokens consists of words mixed in Chinese and non-Chinese, * by a Chinese words dictionary. * * @param tokens - Tokens consists of English words or strings of consecutive Chinese words. @@ -22,15 +22,15 @@ export function smartTerms( return; } const token = subTokens[0]; - if (/\w/.test(token)) { - const nextCarry = carry.concat(token); - cutMixedWords(subTokens.slice(1), nextCarry); - } else { + if (/\p{Unified_Ideograph}/u.test(token)) { const terms = cutZhWords(token, zhDictionary); for (const term of terms) { const nextCarry = carry.concat(...term); cutMixedWords(subTokens.slice(1), nextCarry); } + } else { + const nextCarry = carry.concat(token); + cutMixedWords(subTokens.slice(1), nextCarry); } } diff --git a/src/client/utils/tokenize.spec.ts b/src/client/utils/tokenize.spec.ts index 1fc4a316..62dcb370 100644 --- a/src/client/utils/tokenize.spec.ts +++ b/src/client/utils/tokenize.spec.ts @@ -1,15 +1,40 @@ +import lunr from "lunr"; + +// The `require`s below are required for testing `ja`. +// eslint-disable-next-line @typescript-eslint/no-var-requires +require("lunr-languages/lunr.stemmer.support")(lunr); +// eslint-disable-next-line @typescript-eslint/no-var-requires +require("lunr-languages/tinyseg")(lunr); +// eslint-disable-next-line @typescript-eslint/no-var-requires +require(`lunr-languages/lunr.ja`)(lunr); + import { tokenize } from "./tokenize"; describe("tokenize", () => { test.each<[string, string[]]>([ - ["Hello World", ["hello", "world"]], + ["Hello-World", ["hello", "world"]], ["Hello World 「世界和平」", ["hello", "world", "世界和平"]], [ "a1b2很好c3_d4更好56也好,不错。", ["a1b2", "很好", "c3_d4", "更好", "56", "也好", "不错"], ], - ["...", []], - ])("tokenize('%s') should return %j", (text, tokens) => { - expect(tokenize(text)).toEqual(tokens); + ["…", []], + ])("tokenize('%s', ['en', 'zh']) should return %j", (text, tokens) => { + expect(tokenize(text, ["en", "zh"])).toEqual(tokens); + }); + + test.each<[string, string[]]>([ + [ + "População portuguesa é composta", + ["população", "portuguesa", "é", "composta"], + ], + ])("tokenize('%s', ['en', 'pt']) should return %j", (text, tokens) => { + expect(tokenize(text, ["en", "pt"])).toEqual(tokens); + }); + + test.each<[string, string[]]>([ + ["私は電車が好きです。", ["私", "は", "電車", "が", "好き", "です", "。"]], + ])("tokenize('%s', ['ja']) should return %j", (text, tokens) => { + expect(tokenize(text, ["ja"])).toEqual(tokens); }); }); diff --git a/src/client/utils/tokenize.ts b/src/client/utils/tokenize.ts index dea03cbb..7acf71c8 100644 --- a/src/client/utils/tokenize.ts +++ b/src/client/utils/tokenize.ts @@ -1,15 +1,34 @@ +import lunr from "lunr"; + /** * Split a sentence to tokens, considering a sequence of consecutive Chinese words as a single token. * * @param text - Text to be tokenized. + * @param language - Languages used. * * @returns Tokens. */ -export function tokenize(text: string): string[] { +export function tokenize(text: string, language: string[]): string[] { + // Some languages have their own tokenizer. + if (language.length === 1 && ["ja", "jp", "th"].includes(language[0])) { + return ((lunr as any)[language[0]] as typeof lunr) + .tokenizer(text) + .map((token) => token.toString()); + } + + let regExpMatchWords = /[^-\s]+/g; + + // Especially optimization for `zh`. + if (language.includes("zh")) { + // Currently only works fine with letters in Latin alphabet and Chinese. + // regExpMatchWords = /\p{Unified_Ideograph}+|[^-\s\p{Unified_Ideograph}]+/gu; + regExpMatchWords = /\w+|\p{Unified_Ideograph}+/gu; + } + return ( text.toLowerCase().match( // https://zhuanlan.zhihu.com/p/33335629 - /\w+|\p{Unified_Ideograph}+/gu + regExpMatchWords // https://mothereff.in/regexpu#input=const+regex+%3D+/%5Cp%7BUnified_Ideograph%7D/u%3B&unicodePropertyEscape=1 // /\w+|[\u3400-\u4DBF\u4E00-\u9FFC\uFA0E\uFA0F\uFA11\uFA13\uFA14\uFA1F\uFA21\uFA23\uFA24\uFA27-\uFA29\u{20000}-\u{2A6DD}\u{2A700}-\u{2B734}\u{2B740}-\u{2B81D}\u{2B820}-\u{2CEA1}\u{2CEB0}-\u{2EBE0}\u{30000}-\u{3134A}]+/gu ) || [] diff --git a/src/declarations.ts b/src/declarations.ts index d952c3f7..c582f447 100644 --- a/src/declarations.ts +++ b/src/declarations.ts @@ -18,6 +18,7 @@ declare module "autocomplete.js" { } declare module "*/generated.js" { + export const language: string[]; export const indexHash: string | undefined; export const searchResultLimits: number; export const searchResultContextMaxLength: number; diff --git a/src/server/utils/buildIndex.spec.ts b/src/server/utils/buildIndex.spec.ts index 3e4a65e0..937802c0 100644 --- a/src/server/utils/buildIndex.spec.ts +++ b/src/server/utils/buildIndex.spec.ts @@ -19,6 +19,10 @@ describe("buildIndex", () => { i: 3, t: "Hola Mundo", }, + { + i: 4, + t: "私は電車が好きです。", + }, ], ]; let buildIndex: typeof _buildIndex; @@ -94,6 +98,37 @@ describe("buildIndex", () => { ]); }); + test('should work for ["ja"]', () => { + const wrappedIndexes = buildIndex( + allDocuments as SearchDocument[][], + { + language: ["ja"], + removeDefaultStopWordFilter: false, + } as ProcessedPluginOptions + ); + + expect(wrappedIndexes[0].index.search("hello")).toEqual([ + expect.objectContaining({ + ref: "1", + }), + ]); + expect(wrappedIndexes[0].index.search("世界")).toEqual([ + expect.objectContaining({ + ref: "2", + }), + ]); + expect(wrappedIndexes[0].index.search("hola")).toEqual([ + expect.objectContaining({ + ref: "3", + }), + ]); + expect(wrappedIndexes[0].index.search("好き")).toEqual([ + expect.objectContaining({ + ref: "4", + }), + ]); + }); + test('should work for ["en", "zh]', () => { const wrappedIndexes = buildIndex( allDocuments as SearchDocument[][], diff --git a/src/server/utils/buildIndex.ts b/src/server/utils/buildIndex.ts index 31d30e7a..374439b3 100644 --- a/src/server/utils/buildIndex.ts +++ b/src/server/utils/buildIndex.ts @@ -13,6 +13,10 @@ export function buildIndex( // eslint-disable-next-line @typescript-eslint/no-var-requires require("lunr-languages/lunr.stemmer.support")(lunr); } + if (language.includes("ja") || language.includes("jp")) { + // eslint-disable-next-line @typescript-eslint/no-var-requires + require("lunr-languages/tinyseg")(lunr); + } for (const lang of language.filter( (item) => item !== "en" && item !== "zh" )) { diff --git a/src/server/utils/generate.spec.ts b/src/server/utils/generate.spec.ts index 796dbe33..8e1fb36c 100644 --- a/src/server/utils/generate.spec.ts +++ b/src/server/utils/generate.spec.ts @@ -19,6 +19,7 @@ describe("generate", () => { ["en"], [ expect.stringMatching(/^import lunr from ".+\/lunr\/lunr\.js";$/), + 'export const language = ["en"];', 'export const indexHash = "abc";', "export const searchResultLimits = 8;", "export const searchResultContextMaxLength = 50;", @@ -32,6 +33,7 @@ describe("generate", () => { /^require\(".+\/lunr-languages\/lunr\.stemmer\.support\.js"\)\(lunr\);$/ ), 'require("@easyops-cn/docusaurus-search-local/dist/client/shared/lunrLanguageZh").lunrLanguageZh(lunr);', + 'export const language = ["zh"];', 'export const indexHash = "abc";', "export const searchResultLimits = 8;", "export const searchResultContextMaxLength = 50;", @@ -47,6 +49,26 @@ describe("generate", () => { expect.stringMatching( /^require\(".+\/lunr-languages\/lunr\.es\.js"\)\(lunr\);$/ ), + 'export const language = ["es"];', + 'export const indexHash = "abc";', + "export const searchResultLimits = 8;", + "export const searchResultContextMaxLength = 50;", + ], + ], + [ + ["ja"], + [ + expect.stringMatching(/^import lunr from ".+\/lunr\/lunr\.js";$/), + expect.stringMatching( + /^require\(".+\/lunr-languages\/lunr\.stemmer\.support\.js"\)\(lunr\);$/ + ), + expect.stringMatching( + /^require\(".+\/lunr-languages\/tinyseg\.js"\)\(lunr\);$/ + ), + expect.stringMatching( + /^require\(".+\/lunr-languages\/lunr\.ja\.js"\)\(lunr\);$/ + ), + 'export const language = ["ja"];', 'export const indexHash = "abc";', "export const searchResultLimits = 8;", "export const searchResultContextMaxLength = 50;", @@ -63,6 +85,7 @@ describe("generate", () => { expect.stringMatching( /^require\(".+\/lunr-languages\/lunr\.multi\.js"\)\(lunr\);$/ ), + 'export const language = ["en","zh"];', 'export const indexHash = "abc";', "export const searchResultLimits = 8;", "export const searchResultContextMaxLength = 50;", @@ -82,6 +105,7 @@ describe("generate", () => { expect.stringMatching( /^require\(".+\/lunr-languages\/lunr\.multi\.js"\)\(lunr\);$/ ), + 'export const language = ["en","es","zh"];', 'export const indexHash = "abc";', "export const searchResultLimits = 8;", "export const searchResultContextMaxLength = 50;", diff --git a/src/server/utils/generate.ts b/src/server/utils/generate.ts index 59a0ef86..18c67763 100644 --- a/src/server/utils/generate.ts +++ b/src/server/utils/generate.ts @@ -16,6 +16,13 @@ export function generate(config: ProcessedPluginOptions, dir: string): void { )})(lunr);` ); } + if (language.includes("ja") || language.includes("jp")) { + contents.push( + `require(${JSON.stringify( + require.resolve("lunr-languages/tinyseg") + )})(lunr);` + ); + } for (const lang of language.filter( (item) => item !== "en" && item !== "zh" )) { @@ -37,6 +44,7 @@ export function generate(config: ProcessedPluginOptions, dir: string): void { )})(lunr);` ); } + contents.push(`export const language = ${JSON.stringify(language)};`); contents.push(`export const indexHash = ${JSON.stringify(indexHash)};`); contents.push( `export const searchResultLimits = ${JSON.stringify(searchResultLimits)};`,