fix: process stop words for search keyword

weareoutman committed Oct 30, 2020
1 parent cec795f commit df3d789

Showing 16 changed files with 352 additions and 98 deletions.
7 changes: 7 additions & 0 deletions src/client/utils/SearchSourceFactory.spec.ts
@@ -2,6 +2,13 @@ import lunr from "lunr";
import { SearchDocument } from "../../shared/interfaces";
import { SearchSourceFactory } from "./SearchSourceFactory";

// eslint-disable-next-line @typescript-eslint/no-var-requires
require("lunr-languages/lunr.stemmer.support")(lunr);
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("../../shared/lunrLanguageZh").lunrLanguageZh(lunr);
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("lunr-languages/lunr.multi")(lunr);

jest.mock("./proxiedGenerated");

describe("SearchSourceFactory", () => {
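The spec now registers the same lunr plugins the runtime uses: stemmer support, the project's Chinese plugin, and lunr.multi. As a rough sketch of how these plugins are typically combined when an index is built — not code from this commit; the field names and document shape are invented, and "zh" is assumed to be the language id registered by lunrLanguageZh:

import lunr from "lunr";

// Plugins mutate the shared `lunr` object, so they must be registered
// before any index is constructed; stemmer support has to come first.
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("lunr-languages/lunr.stemmer.support")(lunr);
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("../../shared/lunrLanguageZh").lunrLanguageZh(lunr);
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("lunr-languages/lunr.multi")(lunr);

const index = lunr(function () {
  // `multiLanguage` comes from lunr.multi and merges the pipelines of the
  // listed languages.
  this.use((lunr as any).multiLanguage("en", "zh"));
  this.ref("i");
  this.field("t");
  this.add({ i: "1", t: "hello world 研究生命科学" });
});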
11 changes: 9 additions & 2 deletions src/client/utils/SearchSourceFactory.ts
@@ -29,11 +29,18 @@ export function SearchSourceFactory(
const queries = smartQueries(rawTokens, zhDictionary);
const results: InitialSearchResult[] = [];

search: for (const { keyword, tokens } of queries) {
search: for (const { term, tokens } of queries) {
for (const { documents, index, type } of wrappedIndexes) {
results.push(
...index
.search(keyword)
.query((query) => {
for (const item of term) {
query.term(item.value, {
wildcard: item.wildcard,
presence: item.presence,
});
}
})
.slice(0, resultsLimit)
// Remove duplicated results.
.filter(
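The core change in SearchSourceFactory.ts: instead of handing lunr a query string via index.search(keyword), the factory now drives lunr's query builder directly and passes presence and wildcard per term. A minimal sketch of the two forms, assuming an existing index idx and the term shape produced by smartQueries (not code from this commit):

import lunr from "lunr";

declare const idx: lunr.Index; // assumed: an already-built lunr index

// Before: term options were encoded into a query string and re-parsed by lunr.
const before = idx.search("+研究 +生命科学*");

// After: the same query expressed through the builder, one clause per term,
// with presence and wildcard passed as structured options.
const after = idx.query((query) => {
  query.term("研究", { presence: lunr.Query.presence.REQUIRED });
  query.term("生命科学", {
    presence: lunr.Query.presence.REQUIRED,
    wildcard: lunr.Query.wildcard.TRAILING,
  });
});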
11 changes: 10 additions & 1 deletion src/client/utils/__mocks__/proxiedGenerated.ts
@@ -1,4 +1,13 @@
export const language = ["en", "zh"];
export let language = ["en", "zh"];
export let removeDefaultStopWordFilter = false;
export const indexHash = "abc";
export const searchResultLimits = 8;
export const searchResultContextMaxLength = 50;

export function __setLanguage(value: string[]): void {
language = value;
}

export function __setRemoveDefaultStopWordFilter(value: boolean): void {
removeDefaultStopWordFilter = value;
}
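Switching the mocked exports from const to let, together with the __set* helpers, lets each test change the generated values after the module has been imported. A hedged sketch of how a consuming spec can use this, mirroring the pattern in smartQueries.spec.ts further down:

import {
  __setLanguage,
  __setRemoveDefaultStopWordFilter,
} from "./proxiedGenerated";

jest.mock("./proxiedGenerated"); // hoisted by Jest, so modules under test see the mock

beforeEach(() => {
  // Put the mocked values back to a known default before every test.
  __setLanguage(["en", "zh"]);
  __setRemoveDefaultStopWordFilter(false);
});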
11 changes: 7 additions & 4 deletions src/client/utils/cutZhWords.spec.ts
@@ -1,10 +1,9 @@
import { SmartTerm } from "../../shared/interfaces";
import { cutZhWords } from "./cutZhWords";

const zhDictionary = ["研究生", "研究", "生命", "科学", "生命科学"];

describe("cutZhWords", () => {
test.each<[string, SmartTerm[]]>([
test.each<[string, string[][]]>([
[
"研究生命科学",
[
@@ -33,7 +32,11 @@ describe("cutZhWords", () => {
["研究生我", [["研究生"], ["研究", "生*"]]],
["我", []],
["命", []],
])("cutZhWords('%s', zhDictionary) should return %j", (token, queries) => {
expect(cutZhWords(token, zhDictionary)).toEqual(queries);
])("cutZhWords('%s', zhDictionary) should work", (token, terms) => {
expect(
cutZhWords(token, zhDictionary).map((term) =>
term.map((item) => `${item.value}${item.trailing ? "*" : ""}`)
)
).toEqual(terms);
});
});
16 changes: 8 additions & 8 deletions src/client/utils/cutZhWords.ts
@@ -1,4 +1,4 @@
import { SmartTerm } from "../../shared/interfaces";
import { SmartTerm, WrappedTerm } from "../../shared/interfaces";

/**
* Get all possible terms for a string of consecutive Chinese words,
@@ -22,7 +22,9 @@ export function cutZhWords(token: string, zhDictionary: string[]): SmartTerm[] {
if (subToken.substr(0, words.length) === words) {
const nextCarry = {
missed: carry.missed,
term: carry.term.concat(words),
term: carry.term.concat({
value: words,
}),
};
if (subToken.length > words.length) {
cut(subToken.substr(words.length), nextCarry);
@@ -41,7 +43,10 @@ export function cutZhWords(token: string, zhDictionary: string[]): SmartTerm[] {
matchedLastIndex = lastIndex;
const nextCarry = {
missed: carry.missed,
term: carry.term.concat(`${subWords}*`),
term: carry.term.concat({
value: subWords,
trailing: true,
}),
};
if (subToken.length > lastIndex) {
cut(subToken.substr(lastIndex), nextCarry);
@@ -82,8 +87,3 @@ export function cutZhWords(token: string, zhDictionary: string[]): SmartTerm[] {
})
.map((item) => item.term);
}

interface WrappedTerm {
missed: number;
term: SmartTerm;
}
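With this change cutZhWords stops emitting plain strings with a "*" suffix and instead builds term items that carry the wildcard as data, while WrappedTerm moves into the shared interfaces. A hedged reconstruction of the shapes this diff implies — the real definitions live in src/shared/interfaces and may differ in detail:

// Each element of a SmartTerm is one dictionary hit (or prefix hit).
interface SmartTermItem {
  value: string;
  // Set when only a prefix matched, so the query should add a trailing
  // wildcard for this term (rendered as "value*" in the specs above).
  trailing?: boolean;
}

type SmartTerm = SmartTermItem[];

// Used internally while cutting: candidates with fewer missed characters
// are preferred when the results are ranked and mapped to terms.
interface WrappedTerm {
  missed: number;
  term: SmartTerm;
}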
194 changes: 156 additions & 38 deletions src/client/utils/smartQueries.spec.ts
@@ -1,10 +1,36 @@
import { SmartQuery } from "../../shared/interfaces";
import lunr from "lunr";
import { smartQueries } from "./smartQueries";
import {
__setLanguage,
__setRemoveDefaultStopWordFilter,
} from "./proxiedGenerated";
import { SmartQuery } from "../../shared/interfaces";

// eslint-disable-next-line @typescript-eslint/no-var-requires
require("lunr-languages/lunr.stemmer.support")(lunr);
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("../../shared/lunrLanguageZh").lunrLanguageZh(lunr);
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("lunr-languages/lunr.multi")(lunr);

(lunr as any).fake = {};

jest.mock("./proxiedGenerated");

const zhDictionary = ["研究生", "研究", "生命", "科学", "生命科学"];

interface TestQuery {
tokens: string[];
keyword: string;
}

describe("smartQueries", () => {
test.each<[string[], SmartQuery[]]>([
beforeEach(() => {
__setLanguage(["en", "zh"]);
__setRemoveDefaultStopWordFilter(false);
});

test.each<[string[], TestQuery[]]>([
[
["hello"],
[
@@ -32,77 +58,118 @@ describe("smartQueries", () => {
],
],
[
["hello", "world", "研究生命科学"],
["研究生命科学"],
[
{
tokens: ["hello", "world", "研究", "生命科学"],
keyword: "+hello +world +研究 +生命科学",
tokens: ["研究", "生命科学"],
keyword: "+研究 +生命科学",
},
{
tokens: ["研究", "生命", "科学"],
keyword: "+研究 +生命 +科学",
},
{
tokens: ["研究生", "科学"],
keyword: "+研究生 +科学",
},
{
tokens: ["hello", "world", "研究", "生命", "科学"],
keyword: "+hello +world +研究 +生命 +科学",
tokens: ["研究", "生命科学"],
keyword: "+研究 +生命科学*",
},
{
tokens: ["hello", "world", "研究生", "科学"],
keyword: "+hello +world +研究生 +科学",
tokens: ["研究", "生命", "科学"],
keyword: "+研究 +生命 +科学*",
},
{
tokens: ["hello", "world", "研究", "生命科学"],
keyword: "+hello +world +研究 +生命科学*",
tokens: ["研究生", "科学"],
keyword: "+研究生 +科学*",
},
{
tokens: ["hello", "world", "研究", "生命", "科学"],
keyword: "+hello +world +研究 +生命 +科学*",
tokens: ["研究", "生命"],
keyword: "+研究 +生命",
},
{
tokens: ["hello", "world", "研究生", "科学"],
keyword: "+hello +world +研究生 +科学*",
tokens: ["研究", "科学"],
keyword: "+研究 +科学",
},
{
tokens: ["生命", "科学"],
keyword: "+生命 +科学",
},
{
tokens: ["研究", "科学"],
keyword: "+研究 +科学*",
},
{
tokens: ["生命", "科学"],
keyword: "+生命 +科学*",
},
],
],
[
["hello", "world", "研究生"],
["研究生"],
[
{
tokens: ["hello", "world", "研究生"],
keyword: "+hello +world +研究生",
tokens: ["研究生"],
keyword: "+研究生",
},
{
tokens: ["hello", "world", "研究", "生"],
keyword: "+hello +world +研究 +生*",
tokens: ["研究", "生"],
keyword: "+研究 +生*",
},
{
tokens: ["hello", "world", "研究生"],
keyword: "+hello +world +研究生*",
tokens: ["研究生"],
keyword: "+研究生*",
},
],
],
[
["hello", "world", "生命科学", "研究生"],
/* [
["生命科学", "研究生"],
[
{
tokens: ["hello", "world", "生命科学", "研究生"],
keyword: "+hello +world +生命科学 +研究生",
tokens: ["生命科学", "研究生"],
keyword: "+生命科学 +研究生",
},
{
tokens: ["生命科学", "研究", "生"],
keyword: "+生命科学 +研究 +生*",
},
{
tokens: ["生命", "科学", "研究生"],
keyword: "+生命 +科学 +研究生",
},
{
tokens: ["生命", "科学", "研究", "生"],
keyword: "+生命 +科学 +研究 +生*",
},
{
tokens: ["生命科学", "研究生"],
keyword: "+生命科学 +研究生*",
},
{
tokens: ["hello", "world", "生命科学", "研究", "生"],
keyword: "+hello +world +生命科学 +研究 +生*",
tokens: ["生命", "科学", "研究生"],
keyword: "+生命 +科学 +研究生*",
},
],
], */
[
["a", "hello", "world"],
[
{
tokens: ["hello", "world", "生命", "科学", "研究生"],
keyword: "+hello +world +生命 +科学 +研究生",
tokens: ["a", "hello", "world"],
keyword: "+a +hello +world",
},
{
tokens: ["hello", "world", "生命", "科学", "研究", "生"],
keyword: "+hello +world +生命 +科学 +研究 +生*",
tokens: ["hello", "world"],
keyword: "+hello +world",
},
{
tokens: ["hello", "world", "生命科学", "研究生"],
keyword: "+hello +world +生命科学 +研究生*",
tokens: ["a", "hello", "world"],
keyword: "+a +hello +world*",
},
{
tokens: ["hello", "world", "生命", "科学", "研究生"],
keyword: "+hello +world +生命 +科学 +研究生*",
tokens: ["hello", "world"],
keyword: "+hello +world*",
},
],
],
@@ -128,7 +195,58 @@ describe("smartQueries", () => {
},
],
],
])("smartQueries(%j, zhDictionary) should return %j", (tokens, queries) => {
expect(smartQueries(tokens, zhDictionary)).toEqual(queries);
])("smartQueries(%j, zhDictionary) should work", (tokens, queries) => {
expect(smartQueries(tokens, zhDictionary).map(transformQuery)).toEqual(
queries
);
});
});

describe("smartQueries with no stop words filter", () => {
beforeEach(() => {
__setLanguage(["en", "fake"]);
__setRemoveDefaultStopWordFilter(true);
});

test.each<[string[], TestQuery[]]>([
[
["a", "hello"],
[
{
tokens: ["a", "hello"],
keyword: "+a +hello",
},
{
tokens: ["a", "hello"],
keyword: "+a +hello*",
},
],
],
])("smartQueries(%j, zhDictionary) should work", (tokens, queries) => {
expect(smartQueries(tokens, zhDictionary).map(transformQuery)).toEqual(
queries
);
});
});

function transformQuery(query: SmartQuery): TestQuery {
return {
tokens: query.tokens,
keyword: query.term
.map(
(item) =>
`${item.presence === lunr.Query.presence.REQUIRED ? "+" : ""}${
(item.wildcard & lunr.Query.wildcard.LEADING) ===
lunr.Query.wildcard.LEADING
? "*"
: ""
}${item.value}${
(item.wildcard & lunr.Query.wildcard.TRAILING) ===
lunr.Query.wildcard.TRAILING
? "*"
: ""
}`
)
.join(" "),
};
}
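transformQuery turns each structured clause back into the old "+term" / "term*" string notation so the test tables stay readable; lunr's wildcard field is a bit mask, which is why it is checked with bitwise AND. A small illustration of those flags, using the values lunr defines:

import lunr from "lunr";

const w = lunr.Query.wildcard; // NONE = 0, LEADING = 1, TRAILING = 2
const both = w.LEADING | w.TRAILING; // 3, i.e. "*term*"

// transformQuery renders a clause like
//   query.term("科学", { presence: REQUIRED, wildcard: TRAILING })
// as "+科学*", because the TRAILING bit is set and LEADING is not.
console.log((both & w.TRAILING) === w.TRAILING); // true
console.log((w.TRAILING & w.LEADING) === w.LEADING); // false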