Merge pull request #10 from easyops-cn/steve/fix-lang

fix: fix languages other than en and zh

weareoutman committed Oct 29, 2020
2 parents f85da0e + f43fd4b commit 7ee4e10
Showing 15 changed files with 149 additions and 18 deletions.
1 change: 1 addition & 0 deletions src/client/theme/SearchBar/fetchIndexes.spec.ts
@@ -49,6 +49,7 @@ describe("fetchIndexes", () => {
index: {
invertedIndex: [
["hello"],
["alfabetização"],
["世界"],
["和平"],
["世界"],
2 changes: 1 addition & 1 deletion src/client/theme/SearchBar/fetchIndexes.ts
@@ -34,7 +34,7 @@ export async function fetchIndexes(

const zhDictionary = json.reduce((acc, item) => {
for (const tuple of item.index.invertedIndex) {
if (!/\w/.test(tuple[0][0])) {
if (/\p{Unified_Ideograph}/u.test(tuple[0][0])) {
acc.add(tuple[0]);
}
}
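This one-line change is the core of the fix. \w only matches [A-Za-z0-9_], so the old negated test classified every term whose first character falls outside that set (including accented Latin letters) as Chinese and pushed it into zhDictionary. The Unicode property escape keys the check to actual CJK ideographs instead. A quick illustration of the difference, using nothing but the two regexes:

// Old check: "first character is NOT a \w character", which is too broad.
!/\w/.test("é");                      // true:  "é" was wrongly sent to zhDictionary
!/\w/.test("世");                     // true:  correct, but only by accident
// New check: "first character is a CJK unified ideograph".
/\p{Unified_Ideograph}/u.test("é");   // false: accented Latin stays out
/\p{Unified_Ideograph}/u.test("世");  // true:  ideographs are still collected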
8 changes: 3 additions & 5 deletions src/client/utils/SearchSourceFactory.spec.ts
@@ -1,11 +1,9 @@
import lunr from "lunr";
import {
SearchDocument,
SearchResult,
WrappedIndex,
} from "../../shared/interfaces";
import { SearchDocument } from "../../shared/interfaces";
import { SearchSourceFactory } from "./SearchSourceFactory";

jest.mock("./proxiedGenerated");

describe("SearchSourceFactory", () => {
const documentsOfTitles: SearchDocument[] = [
{
3 changes: 2 additions & 1 deletion src/client/utils/SearchSourceFactory.ts
@@ -9,6 +9,7 @@ import {
} from "../../shared/interfaces";
import { sortSearchResults } from "./sortSearchResults";
import { processTreeStatusOfSearchResults } from "./processTreeStatusOfSearchResults";
import { language } from "./proxiedGenerated";

export function SearchSourceFactory(
wrappedIndexes: WrappedIndex[],
@@ -19,7 +20,7 @@ export function SearchSourceFactory(
input: string,
callback: (results: SearchResult[]) => void
): void {
const rawTokens = tokenize(input);
const rawTokens = tokenize(input, language);
if (rawTokens.length === 0) {
callback([]);
return;
1 change: 1 addition & 0 deletions src/client/utils/__mocks__/proxiedGenerated.ts
@@ -1,3 +1,4 @@
export const language = ["en", "zh"];
export const indexHash = "abc";
export const searchResultLimits = 8;
export const searchResultContextMaxLength = 50;
13 changes: 13 additions & 0 deletions src/client/utils/smartQueries.spec.ts
@@ -115,6 +115,19 @@ describe("smartQueries", () => {
},
],
],
[
["termos", "alfabetização"],
[
{
tokens: ["termos", "alfabetização"],
keyword: "+termos +alfabetização",
},
{
tokens: ["termos", "alfabetização"],
keyword: "+termos +alfabetização*",
},
],
],
])("smartQueries(%j, zhDictionary) should return %j", (tokens, queries) => {
expect(smartQueries(tokens, zhDictionary)).toEqual(queries);
});
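The new cases pin down the query expansion for Portuguese tokens: one strict conjunctive query, plus a copy whose last term carries a trailing wildcard. Both keywords are plain lunr query syntax, so they can be fed straight into any lunr index (index construction elided in this sketch):

// "+term" marks a required term; a trailing "*" allows prefix matches.
index.search("+termos +alfabetização");  // both terms must match exactly
index.search("+termos +alfabetização*"); // the last term may match as a prefix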
1 change: 1 addition & 0 deletions src/client/utils/smartTerms.spec.ts
@@ -25,6 +25,7 @@ describe("smartTerms", () => {
],
],
[["hello", "world", "命"], []],
[["alfabetização"], [["alfabetização"]]],
])("smartTerms(%j, zhDictionary) should return %j", (tokens, queries) => {
expect(smartTerms(tokens, zhDictionary)).toEqual(queries);
});
10 changes: 5 additions & 5 deletions src/client/utils/smartTerms.ts
@@ -2,7 +2,7 @@ import { SmartTerm } from "../../shared/interfaces";
import { cutZhWords } from "./cutZhWords";

/**
* Get all possible terms for a list of tokens consists of words mixed English and Chinese,
* Get all possible terms for a list of tokens consisting of words mixed in Chinese and non-Chinese,
* by a Chinese words dictionary.
*
* @param tokens - Tokens consisting of English words or strings of consecutive Chinese words.
@@ -22,15 +22,15 @@ export function smartTerms(
return;
}
const token = subTokens[0];
if (/\w/.test(token)) {
const nextCarry = carry.concat(token);
cutMixedWords(subTokens.slice(1), nextCarry);
} else {
if (/\p{Unified_Ideograph}/u.test(token)) {
const terms = cutZhWords(token, zhDictionary);
for (const term of terms) {
const nextCarry = carry.concat(...term);
cutMixedWords(subTokens.slice(1), nextCarry);
}
} else {
const nextCarry = carry.concat(token);
cutMixedWords(subTokens.slice(1), nextCarry);
}
}

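Swapping the branches flips the default: only tokens that are runs of CJK ideographs go through cutZhWords, and anything else is carried through as a word, including a purely accented token like "é" that contains no \w character at all and previously fell into the Chinese branch. A sketch of the resulting behavior, assuming a zhDictionary containing 世界 and 和平 (the first call mirrors the spec above; the second is illustrative):

smartTerms(["alfabetização"], zhDictionary);
// → [["alfabetização"]]  (accented tokens are no longer cut as Chinese)

smartTerms(["hello", "世界和平"], zhDictionary);
// → [["hello", "世界", "和平"]]  (ideograph runs are still cut against the dictionary)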
33 changes: 29 additions & 4 deletions src/client/utils/tokenize.spec.ts
@@ -1,15 +1,40 @@
import lunr from "lunr";

// The `require`s below are required for testing `ja`.
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("lunr-languages/lunr.stemmer.support")(lunr);
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("lunr-languages/tinyseg")(lunr);
// eslint-disable-next-line @typescript-eslint/no-var-requires
require(`lunr-languages/lunr.ja`)(lunr);

import { tokenize } from "./tokenize";

describe("tokenize", () => {
test.each<[string, string[]]>([
["Hello World", ["hello", "world"]],
["Hello-World", ["hello", "world"]],
["Hello World 「世界和平」", ["hello", "world", "世界和平"]],
[
"a1b2很好c3_d4更好56也好,不错。",
["a1b2", "很好", "c3_d4", "更好", "56", "也好", "不错"],
],
["...", []],
])("tokenize('%s') should return %j", (text, tokens) => {
expect(tokenize(text)).toEqual(tokens);
["…", []],
])("tokenize('%s', ['en', 'zh']) should return %j", (text, tokens) => {
expect(tokenize(text, ["en", "zh"])).toEqual(tokens);
});

test.each<[string, string[]]>([
[
"População portuguesa é composta",
["população", "portuguesa", "é", "composta"],
],
])("tokenize('%s', ['en', 'pt']) should return %j", (text, tokens) => {
expect(tokenize(text, ["en", "pt"])).toEqual(tokens);
});

test.each<[string, string[]]>([
["私は電車が好きです。", ["私", "は", "電車", "が", "好き", "です", "。"]],
])("tokenize('%s', ['ja']) should return %j", (text, tokens) => {
expect(tokenize(text, ["ja"])).toEqual(tokens);
});
});
23 changes: 21 additions & 2 deletions src/client/utils/tokenize.ts
@@ -1,15 +1,34 @@
import lunr from "lunr";

/**
* Split a sentence to tokens, considering a sequence of consecutive Chinese words as a single token.
*
* @param text - Text to be tokenized.
* @param language - Languages used.
*
* @returns Tokens.
*/
export function tokenize(text: string): string[] {
export function tokenize(text: string, language: string[]): string[] {
// Some languages have their own tokenizer.
if (language.length === 1 && ["ja", "jp", "th"].includes(language[0])) {
return ((lunr as any)[language[0]] as typeof lunr)
.tokenizer(text)
.map((token) => token.toString());
}

let regExpMatchWords = /[^-\s]+/g;

// Especially optimization for `zh`.
if (language.includes("zh")) {
// Currently only works fine with letters in Latin alphabet and Chinese.
// regExpMatchWords = /\p{Unified_Ideograph}+|[^-\s\p{Unified_Ideograph}]+/gu;
regExpMatchWords = /\w+|\p{Unified_Ideograph}+/gu;
}

return (
text.toLowerCase().match(
// https://zhuanlan.zhihu.com/p/33335629
/\w+|\p{Unified_Ideograph}+/gu
regExpMatchWords
// https://mothereff.in/regexpu#input=const+regex+%3D+/%5Cp%7BUnified_Ideograph%7D/u%3B&unicodePropertyEscape=1
// /\w+|[\u3400-\u4DBF\u4E00-\u9FFC\uFA0E\uFA0F\uFA11\uFA13\uFA14\uFA1F\uFA21\uFA23\uFA24\uFA27-\uFA29\u{20000}-\u{2A6DD}\u{2A700}-\u{2B734}\u{2B740}-\u{2B81D}\u{2B820}-\u{2CEA1}\u{2CEB0}-\u{2EBE0}\u{30000}-\u{3134A}]+/gu
) || []
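With the language list threaded through, tokenize has three paths: delegate to a dedicated lunr tokenizer for ja/jp/th, use the Unicode-aware regex when zh is present, and otherwise split only on whitespace and hyphens, which is what keeps accented words intact. A usage sketch matching the specs above:

tokenize("Hello 世界和平", ["en", "zh"]);
// → ["hello", "世界和平"]  (\w+ words and ideograph runs become separate tokens)

tokenize("População é composta", ["en", "pt"]);
// → ["população", "é", "composta"]  (accents survive the plain split)

tokenize("私は電車が好きです。", ["ja"]);
// → ["私", "は", "電車", "が", "好き", "です", "。"]  (lunr.ja's tokenizer; tinyseg must be loaded)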
1 change: 1 addition & 0 deletions src/declarations.ts
@@ -18,6 +18,7 @@ declare module "autocomplete.js" {
}

declare module "*/generated.js" {
export const language: string[];
export const indexHash: string | undefined;
export const searchResultLimits: number;
export const searchResultContextMaxLength: number;
35 changes: 35 additions & 0 deletions src/server/utils/buildIndex.spec.ts
@@ -19,6 +19,10 @@ describe("buildIndex", () => {
i: 3,
t: "Hola Mundo",
},
{
i: 4,
t: "私は電車が好きです。",
},
],
];
let buildIndex: typeof _buildIndex;
@@ -94,6 +98,37 @@ describe("buildIndex", () => {
]);
});

test('should work for ["ja"]', () => {
const wrappedIndexes = buildIndex(
allDocuments as SearchDocument[][],
{
language: ["ja"],
removeDefaultStopWordFilter: false,
} as ProcessedPluginOptions
);

expect(wrappedIndexes[0].index.search("hello")).toEqual([
expect.objectContaining({
ref: "1",
}),
]);
expect(wrappedIndexes[0].index.search("世界")).toEqual([
expect.objectContaining({
ref: "2",
}),
]);
expect(wrappedIndexes[0].index.search("hola")).toEqual([
expect.objectContaining({
ref: "3",
}),
]);
expect(wrappedIndexes[0].index.search("好き")).toEqual([
expect.objectContaining({
ref: "4",
}),
]);
});

test('should work for ["en", "zh]', () => {
const wrappedIndexes = buildIndex(
allDocuments as SearchDocument[][],
4 changes: 4 additions & 0 deletions src/server/utils/buildIndex.ts
@@ -13,6 +13,10 @@ export function buildIndex(
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("lunr-languages/lunr.stemmer.support")(lunr);
}
if (language.includes("ja") || language.includes("jp")) {
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("lunr-languages/tinyseg")(lunr);
}
for (const lang of language.filter(
(item) => item !== "en" && item !== "zh"
)) {
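tinyseg is the TinySegmenter plugin that lunr-languages ships for Japanese word segmentation; lunr.ja depends on it, so it must be registered before the per-language plugins load. A standalone sketch of the load order this code establishes (the index setup itself is illustrative, mirroring the spec above):

import lunr from "lunr";

require("lunr-languages/lunr.stemmer.support")(lunr);
require("lunr-languages/tinyseg")(lunr); // TinySegmenter, required by lunr.ja
require("lunr-languages/lunr.ja")(lunr);

const index = lunr(function () {
  this.use((lunr as any).ja);
  this.ref("i");
  this.field("t");
  this.add({ i: "4", t: "私は電車が好きです。" });
});

index.search("電車"); // → one result, with ref "4"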
24 changes: 24 additions & 0 deletions src/server/utils/generate.spec.ts
@@ -19,6 +19,7 @@ describe("generate", () => {
["en"],
[
expect.stringMatching(/^import lunr from ".+\/lunr\/lunr\.js";$/),
'export const language = ["en"];',
'export const indexHash = "abc";',
"export const searchResultLimits = 8;",
"export const searchResultContextMaxLength = 50;",
@@ -32,6 +33,7 @@
/^require\(".+\/lunr-languages\/lunr\.stemmer\.support\.js"\)\(lunr\);$/
),
'require("@easyops-cn/docusaurus-search-local/dist/client/shared/lunrLanguageZh").lunrLanguageZh(lunr);',
'export const language = ["zh"];',
'export const indexHash = "abc";',
"export const searchResultLimits = 8;",
"export const searchResultContextMaxLength = 50;",
@@ -47,6 +49,26 @@
expect.stringMatching(
/^require\(".+\/lunr-languages\/lunr\.es\.js"\)\(lunr\);$/
),
'export const language = ["es"];',
'export const indexHash = "abc";',
"export const searchResultLimits = 8;",
"export const searchResultContextMaxLength = 50;",
],
],
[
["ja"],
[
expect.stringMatching(/^import lunr from ".+\/lunr\/lunr\.js";$/),
expect.stringMatching(
/^require\(".+\/lunr-languages\/lunr\.stemmer\.support\.js"\)\(lunr\);$/
),
expect.stringMatching(
/^require\(".+\/lunr-languages\/tinyseg\.js"\)\(lunr\);$/
),
expect.stringMatching(
/^require\(".+\/lunr-languages\/lunr\.ja\.js"\)\(lunr\);$/
),
'export const language = ["ja"];',
'export const indexHash = "abc";',
"export const searchResultLimits = 8;",
"export const searchResultContextMaxLength = 50;",
@@ -63,6 +85,7 @@
expect.stringMatching(
/^require\(".+\/lunr-languages\/lunr\.multi\.js"\)\(lunr\);$/
),
'export const language = ["en","zh"];',
'export const indexHash = "abc";',
"export const searchResultLimits = 8;",
"export const searchResultContextMaxLength = 50;",
@@ -82,6 +105,7 @@
expect.stringMatching(
/^require\(".+\/lunr-languages\/lunr\.multi\.js"\)\(lunr\);$/
),
'export const language = ["en","es","zh"];',
'export const indexHash = "abc";',
"export const searchResultLimits = 8;",
"export const searchResultContextMaxLength = 50;",
8 changes: 8 additions & 0 deletions src/server/utils/generate.ts
@@ -16,6 +16,13 @@ export function generate(config: ProcessedPluginOptions, dir: string): void {
)})(lunr);`
);
}
if (language.includes("ja") || language.includes("jp")) {
contents.push(
`require(${JSON.stringify(
require.resolve("lunr-languages/tinyseg")
)})(lunr);`
);
}
for (const lang of language.filter(
(item) => item !== "en" && item !== "zh"
)) {
@@ -37,6 +44,7 @@
)})(lunr);`
);
}
contents.push(`export const language = ${JSON.stringify(language)};`);
contents.push(`export const indexHash = ${JSON.stringify(indexHash)};`);
contents.push(
`export const searchResultLimits = ${JSON.stringify(searchResultLimits)};`,
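The appended export is what the client-side proxiedGenerated module picks up, closing the loop: tokenize and SearchSourceFactory above read the same language list the server used at index time. Judging by the spec above, the emitted module for ["ja"] boils down to this (paths shortened, hash and limits as mocked):

import lunr from ".../lunr/lunr.js";
require(".../lunr-languages/lunr.stemmer.support.js")(lunr);
require(".../lunr-languages/tinyseg.js")(lunr);
require(".../lunr-languages/lunr.ja.js")(lunr);
export const language = ["ja"];
export const indexHash = "abc";
export const searchResultLimits = 8;
export const searchResultContextMaxLength = 50;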
