From f43fd4b6a261338da70556396a9c810884674e17 Mon Sep 17 00:00:00 2001
From: weareoutman <wangshenwei@qq.com>
Date: Thu, 29 Oct 2020 11:39:50 +0800
Subject: [PATCH] fix: fix languages other than en and zh

#9
---
 .../theme/SearchBar/fetchIndexes.spec.ts      |  1 +
 src/client/theme/SearchBar/fetchIndexes.ts    |  2 +-
 src/client/utils/SearchSourceFactory.spec.ts  |  8 ++---
 src/client/utils/SearchSourceFactory.ts       |  3 +-
 .../utils/__mocks__/proxiedGenerated.ts       |  1 +
 src/client/utils/smartQueries.spec.ts         | 13 +++++++
 src/client/utils/smartTerms.spec.ts           |  1 +
 src/client/utils/smartTerms.ts                | 10 +++---
 src/client/utils/tokenize.spec.ts             | 33 ++++++++++++++---
 src/client/utils/tokenize.ts                  | 23 ++++++++++--
 src/declarations.ts                           |  1 +
 src/server/utils/buildIndex.spec.ts           | 35 +++++++++++++++++++
 src/server/utils/buildIndex.ts                |  4 +++
 src/server/utils/generate.spec.ts             | 24 +++++++++++++
 src/server/utils/generate.ts                  |  8 +++++
 15 files changed, 149 insertions(+), 18 deletions(-)

diff --git a/src/client/theme/SearchBar/fetchIndexes.spec.ts b/src/client/theme/SearchBar/fetchIndexes.spec.ts
index fa16aa47..89e4d2f9 100644
--- a/src/client/theme/SearchBar/fetchIndexes.spec.ts
+++ b/src/client/theme/SearchBar/fetchIndexes.spec.ts
@@ -49,6 +49,7 @@ describe("fetchIndexes", () => {
             index: {
               invertedIndex: [
                 ["hello"],
+                ["alfabetização"],
                 ["世界"],
                 ["和平"],
                 ["世界"],
diff --git a/src/client/theme/SearchBar/fetchIndexes.ts b/src/client/theme/SearchBar/fetchIndexes.ts
index 0ffc7ccf..813ed2f0 100644
--- a/src/client/theme/SearchBar/fetchIndexes.ts
+++ b/src/client/theme/SearchBar/fetchIndexes.ts
@@ -34,7 +34,7 @@ export async function fetchIndexes(
 
     const zhDictionary = json.reduce((acc, item) => {
       for (const tuple of item.index.invertedIndex) {
-        if (!/\w/.test(tuple[0][0])) {
+        if (/\p{Unified_Ideograph}/u.test(tuple[0][0])) {
           acc.add(tuple[0]);
         }
       }
diff --git a/src/client/utils/SearchSourceFactory.spec.ts b/src/client/utils/SearchSourceFactory.spec.ts
index dd8b06d7..91469b20 100644
--- a/src/client/utils/SearchSourceFactory.spec.ts
+++ b/src/client/utils/SearchSourceFactory.spec.ts
@@ -1,11 +1,9 @@
 import lunr from "lunr";
-import {
-  SearchDocument,
-  SearchResult,
-  WrappedIndex,
-} from "../../shared/interfaces";
+import { SearchDocument } from "../../shared/interfaces";
 import { SearchSourceFactory } from "./SearchSourceFactory";
 
+jest.mock("./proxiedGenerated");
+
 describe("SearchSourceFactory", () => {
   const documentsOfTitles: SearchDocument[] = [
     {
diff --git a/src/client/utils/SearchSourceFactory.ts b/src/client/utils/SearchSourceFactory.ts
index f433b096..3084b3a3 100644
--- a/src/client/utils/SearchSourceFactory.ts
+++ b/src/client/utils/SearchSourceFactory.ts
@@ -9,6 +9,7 @@ import {
 } from "../../shared/interfaces";
 import { sortSearchResults } from "./sortSearchResults";
 import { processTreeStatusOfSearchResults } from "./processTreeStatusOfSearchResults";
+import { language } from "./proxiedGenerated";
 
 export function SearchSourceFactory(
   wrappedIndexes: WrappedIndex[],
@@ -19,7 +20,7 @@ export function SearchSourceFactory(
     input: string,
     callback: (results: SearchResult[]) => void
   ): void {
-    const rawTokens = tokenize(input);
+    const rawTokens = tokenize(input, language);
     if (rawTokens.length === 0) {
       callback([]);
       return;
diff --git a/src/client/utils/__mocks__/proxiedGenerated.ts b/src/client/utils/__mocks__/proxiedGenerated.ts
index e2dac823..4cf3a28f 100644
--- a/src/client/utils/__mocks__/proxiedGenerated.ts
+++ b/src/client/utils/__mocks__/proxiedGenerated.ts
@@ -1,3 +1,4 @@
+export const language = ["en", "zh"];
 export const indexHash = "abc";
 export const searchResultLimits = 8;
 export const searchResultContextMaxLength = 50;
diff --git a/src/client/utils/smartQueries.spec.ts b/src/client/utils/smartQueries.spec.ts
index 394e5c8a..97572ea1 100644
--- a/src/client/utils/smartQueries.spec.ts
+++ b/src/client/utils/smartQueries.spec.ts
@@ -115,6 +115,19 @@ describe("smartQueries", () => {
         },
       ],
     ],
+    [
+      ["termos", "alfabetização"],
+      [
+        {
+          tokens: ["termos", "alfabetização"],
+          keyword: "+termos +alfabetização",
+        },
+        {
+          tokens: ["termos", "alfabetização"],
+          keyword: "+termos +alfabetização*",
+        },
+      ],
+    ],
   ])("smartQueries(%j, zhDictionary) should return %j", (tokens, queries) => {
     expect(smartQueries(tokens, zhDictionary)).toEqual(queries);
   });
diff --git a/src/client/utils/smartTerms.spec.ts b/src/client/utils/smartTerms.spec.ts
index 870440f7..931e9340 100644
--- a/src/client/utils/smartTerms.spec.ts
+++ b/src/client/utils/smartTerms.spec.ts
@@ -25,6 +25,7 @@ describe("smartTerms", () => {
       ],
     ],
     [["hello", "world", "命"], []],
+    [["alfabetização"], [["alfabetização"]]],
   ])("smartTerms(%j, zhDictionary) should return %j", (tokens, queries) => {
     expect(smartTerms(tokens, zhDictionary)).toEqual(queries);
   });
diff --git a/src/client/utils/smartTerms.ts b/src/client/utils/smartTerms.ts
index 4baf735a..ba3f6abd 100644
--- a/src/client/utils/smartTerms.ts
+++ b/src/client/utils/smartTerms.ts
@@ -2,7 +2,7 @@ import { SmartTerm } from "../../shared/interfaces";
 import { cutZhWords } from "./cutZhWords";
 
 /**
- * Get all possible terms for a list of tokens consists of words mixed English and Chinese,
+ * Get all possible terms for a list of tokens consisting of a mix of Chinese and non-Chinese words,
  * by a Chinese words dictionary.
  *
  * @param tokens - Tokens consists of English words or strings of consecutive Chinese words.
@@ -22,15 +22,15 @@ export function smartTerms(
       return;
     }
     const token = subTokens[0];
-    if (/\w/.test(token)) {
-      const nextCarry = carry.concat(token);
-      cutMixedWords(subTokens.slice(1), nextCarry);
-    } else {
+    if (/\p{Unified_Ideograph}/u.test(token)) {
       const terms = cutZhWords(token, zhDictionary);
       for (const term of terms) {
         const nextCarry = carry.concat(...term);
         cutMixedWords(subTokens.slice(1), nextCarry);
       }
+    } else {
+      const nextCarry = carry.concat(token);
+      cutMixedWords(subTokens.slice(1), nextCarry);
     }
   }
 
diff --git a/src/client/utils/tokenize.spec.ts b/src/client/utils/tokenize.spec.ts
index 1fc4a316..62dcb370 100644
--- a/src/client/utils/tokenize.spec.ts
+++ b/src/client/utils/tokenize.spec.ts
@@ -1,15 +1,40 @@
+import lunr from "lunr";
+
+// The `require`s below are required for testing `ja`.
+// eslint-disable-next-line @typescript-eslint/no-var-requires
+require("lunr-languages/lunr.stemmer.support")(lunr);
+// eslint-disable-next-line @typescript-eslint/no-var-requires
+require("lunr-languages/tinyseg")(lunr);
+// eslint-disable-next-line @typescript-eslint/no-var-requires
+require(`lunr-languages/lunr.ja`)(lunr);
+
 import { tokenize } from "./tokenize";
 
 describe("tokenize", () => {
   test.each<[string, string[]]>([
-    ["Hello World", ["hello", "world"]],
+    ["Hello-World", ["hello", "world"]],
     ["Hello World 「世界和平」", ["hello", "world", "世界和平"]],
     [
       "a1b2很好c3_d4更好56也好，不错。",
       ["a1b2", "很好", "c3_d4", "更好", "56", "也好", "不错"],
     ],
-    ["...", []],
-  ])("tokenize('%s') should return %j", (text, tokens) => {
-    expect(tokenize(text)).toEqual(tokens);
+    ["…", []],
+  ])("tokenize('%s', ['en', 'zh']) should return %j", (text, tokens) => {
+    expect(tokenize(text, ["en", "zh"])).toEqual(tokens);
+  });
+
+  test.each<[string, string[]]>([
+    [
+      "População portuguesa é composta",
+      ["população", "portuguesa", "é", "composta"],
+    ],
+  ])("tokenize('%s', ['en', 'pt']) should return %j", (text, tokens) => {
+    expect(tokenize(text, ["en", "pt"])).toEqual(tokens);
+  });
+
+  test.each<[string, string[]]>([
+    ["私は電車が好きです。", ["私", "は", "電車", "が", "好き", "です", "。"]],
+  ])("tokenize('%s', ['ja']) should return %j", (text, tokens) => {
+    expect(tokenize(text, ["ja"])).toEqual(tokens);
   });
 });
diff --git a/src/client/utils/tokenize.ts b/src/client/utils/tokenize.ts
index dea03cbb..7acf71c8 100644
--- a/src/client/utils/tokenize.ts
+++ b/src/client/utils/tokenize.ts
@@ -1,15 +1,34 @@
+import lunr from "lunr";
+
 /**
  * Split a sentence to tokens, considering a sequence of consecutive Chinese words as a single token.
  *
  * @param text - Text to be tokenized.
+ * @param language - Languages used.
  *
  * @returns Tokens.
  */
-export function tokenize(text: string): string[] {
+export function tokenize(text: string, language: string[]): string[] {
+  // Some languages have their own tokenizer.
+  if (language.length === 1 && ["ja", "jp", "th"].includes(language[0])) {
+    return ((lunr as any)[language[0]] as typeof lunr)
+      .tokenizer(text)
+      .map((token) => token.toString());
+  }
+
+  let regExpMatchWords = /[^-\s]+/g;
+
+  // Special-case optimization for `zh`.
+  if (language.includes("zh")) {
+    // Currently only handles ASCII word characters (`\w`) and Chinese ideographs; anything else acts as a separator.
+    // regExpMatchWords = /\p{Unified_Ideograph}+|[^-\s\p{Unified_Ideograph}]+/gu;
+    regExpMatchWords = /\w+|\p{Unified_Ideograph}+/gu;
+  }
+
   return (
     text.toLowerCase().match(
       // https://zhuanlan.zhihu.com/p/33335629
-      /\w+|\p{Unified_Ideograph}+/gu
+      regExpMatchWords
       // https://mothereff.in/regexpu#input=const+regex+%3D+/%5Cp%7BUnified_Ideograph%7D/u%3B&unicodePropertyEscape=1
       // /\w+|[\u3400-\u4DBF\u4E00-\u9FFC\uFA0E\uFA0F\uFA11\uFA13\uFA14\uFA1F\uFA21\uFA23\uFA24\uFA27-\uFA29\u{20000}-\u{2A6DD}\u{2A700}-\u{2B734}\u{2B740}-\u{2B81D}\u{2B820}-\u{2CEA1}\u{2CEB0}-\u{2EBE0}\u{30000}-\u{3134A}]+/gu
     ) || []
diff --git a/src/declarations.ts b/src/declarations.ts
index d952c3f7..c582f447 100644
--- a/src/declarations.ts
+++ b/src/declarations.ts
@@ -18,6 +18,7 @@ declare module "autocomplete.js" {
 }
 
 declare module "*/generated.js" {
+  export const language: string[];
   export const indexHash: string | undefined;
   export const searchResultLimits: number;
   export const searchResultContextMaxLength: number;
diff --git a/src/server/utils/buildIndex.spec.ts b/src/server/utils/buildIndex.spec.ts
index 3e4a65e0..937802c0 100644
--- a/src/server/utils/buildIndex.spec.ts
+++ b/src/server/utils/buildIndex.spec.ts
@@ -19,6 +19,10 @@ describe("buildIndex", () => {
         i: 3,
         t: "Hola Mundo",
       },
+      {
+        i: 4,
+        t: "私は電車が好きです。",
+      },
     ],
   ];
   let buildIndex: typeof _buildIndex;
@@ -94,6 +98,37 @@ describe("buildIndex", () => {
     ]);
   });
 
+  test('should work for ["ja"]', () => {
+    const wrappedIndexes = buildIndex(
+      allDocuments as SearchDocument[][],
+      {
+        language: ["ja"],
+        removeDefaultStopWordFilter: false,
+      } as ProcessedPluginOptions
+    );
+
+    expect(wrappedIndexes[0].index.search("hello")).toEqual([
+      expect.objectContaining({
+        ref: "1",
+      }),
+    ]);
+    expect(wrappedIndexes[0].index.search("世界")).toEqual([
+      expect.objectContaining({
+        ref: "2",
+      }),
+    ]);
+    expect(wrappedIndexes[0].index.search("hola")).toEqual([
+      expect.objectContaining({
+        ref: "3",
+      }),
+    ]);
+    expect(wrappedIndexes[0].index.search("好き")).toEqual([
+      expect.objectContaining({
+        ref: "4",
+      }),
+    ]);
+  });
+
   test('should work for ["en", "zh]', () => {
     const wrappedIndexes = buildIndex(
       allDocuments as SearchDocument[][],
diff --git a/src/server/utils/buildIndex.ts b/src/server/utils/buildIndex.ts
index 31d30e7a..374439b3 100644
--- a/src/server/utils/buildIndex.ts
+++ b/src/server/utils/buildIndex.ts
@@ -13,6 +13,10 @@ export function buildIndex(
     // eslint-disable-next-line @typescript-eslint/no-var-requires
     require("lunr-languages/lunr.stemmer.support")(lunr);
   }
+  if (language.includes("ja") || language.includes("jp")) {
+    // eslint-disable-next-line @typescript-eslint/no-var-requires
+    require("lunr-languages/tinyseg")(lunr);
+  }
   for (const lang of language.filter(
     (item) => item !== "en" && item !== "zh"
   )) {
diff --git a/src/server/utils/generate.spec.ts b/src/server/utils/generate.spec.ts
index 796dbe33..8e1fb36c 100644
--- a/src/server/utils/generate.spec.ts
+++ b/src/server/utils/generate.spec.ts
@@ -19,6 +19,7 @@ describe("generate", () => {
       ["en"],
       [
         expect.stringMatching(/^import lunr from ".+\/lunr\/lunr\.js";$/),
+        'export const language = ["en"];',
         'export const indexHash = "abc";',
         "export const searchResultLimits = 8;",
         "export const searchResultContextMaxLength = 50;",
@@ -32,6 +33,7 @@ describe("generate", () => {
           /^require\(".+\/lunr-languages\/lunr\.stemmer\.support\.js"\)\(lunr\);$/
         ),
         'require("@easyops-cn/docusaurus-search-local/dist/client/shared/lunrLanguageZh").lunrLanguageZh(lunr);',
+        'export const language = ["zh"];',
         'export const indexHash = "abc";',
         "export const searchResultLimits = 8;",
         "export const searchResultContextMaxLength = 50;",
@@ -47,6 +49,26 @@ describe("generate", () => {
         expect.stringMatching(
           /^require\(".+\/lunr-languages\/lunr\.es\.js"\)\(lunr\);$/
         ),
+        'export const language = ["es"];',
+        'export const indexHash = "abc";',
+        "export const searchResultLimits = 8;",
+        "export const searchResultContextMaxLength = 50;",
+      ],
+    ],
+    [
+      ["ja"],
+      [
+        expect.stringMatching(/^import lunr from ".+\/lunr\/lunr\.js";$/),
+        expect.stringMatching(
+          /^require\(".+\/lunr-languages\/lunr\.stemmer\.support\.js"\)\(lunr\);$/
+        ),
+        expect.stringMatching(
+          /^require\(".+\/lunr-languages\/tinyseg\.js"\)\(lunr\);$/
+        ),
+        expect.stringMatching(
+          /^require\(".+\/lunr-languages\/lunr\.ja\.js"\)\(lunr\);$/
+        ),
+        'export const language = ["ja"];',
         'export const indexHash = "abc";',
         "export const searchResultLimits = 8;",
         "export const searchResultContextMaxLength = 50;",
@@ -63,6 +85,7 @@ describe("generate", () => {
         expect.stringMatching(
           /^require\(".+\/lunr-languages\/lunr\.multi\.js"\)\(lunr\);$/
         ),
+        'export const language = ["en","zh"];',
         'export const indexHash = "abc";',
         "export const searchResultLimits = 8;",
         "export const searchResultContextMaxLength = 50;",
@@ -82,6 +105,7 @@ describe("generate", () => {
         expect.stringMatching(
           /^require\(".+\/lunr-languages\/lunr\.multi\.js"\)\(lunr\);$/
         ),
+        'export const language = ["en","es","zh"];',
         'export const indexHash = "abc";',
         "export const searchResultLimits = 8;",
         "export const searchResultContextMaxLength = 50;",
diff --git a/src/server/utils/generate.ts b/src/server/utils/generate.ts
index 59a0ef86..18c67763 100644
--- a/src/server/utils/generate.ts
+++ b/src/server/utils/generate.ts
@@ -16,6 +16,13 @@ export function generate(config: ProcessedPluginOptions, dir: string): void {
       )})(lunr);`
     );
   }
+  if (language.includes("ja") || language.includes("jp")) {
+    contents.push(
+      `require(${JSON.stringify(
+        require.resolve("lunr-languages/tinyseg")
+      )})(lunr);`
+    );
+  }
   for (const lang of language.filter(
     (item) => item !== "en" && item !== "zh"
   )) {
@@ -37,6 +44,7 @@ export function generate(config: ProcessedPluginOptions, dir: string): void {
       )})(lunr);`
     );
   }
+  contents.push(`export const language = ${JSON.stringify(language)};`);
   contents.push(`export const indexHash = ${JSON.stringify(indexHash)};`);
   contents.push(
     `export const searchResultLimits = ${JSON.stringify(searchResultLimits)};`,