Merge pull request #152 from F3n67u/jieba-rs
feat: replace nodejieba with @node-rs/jieba
weareoutman committed Mar 10, 2022
2 parents df2680e + 024dcc2 commit 95b2284
Showing 5 changed files with 118 additions and 349 deletions.
18 changes: 2 additions & 16 deletions README.md
@@ -58,24 +58,13 @@ module.exports = {
        // ```
        // language: ["en", "zh"],
        // ```
-       // When applying `zh` in language, please install `nodejieba` in your project.
      },
    ],
  ],
};
````

-> Notice!
->
-> - We present this as a theme instead of plugin now, see [this comment](https://github.com/facebook/docusaurus/issues/6488#issuecomment-1024124096).
->
-> - When applying `"zh"` in language, please also install `nodejieba` in your project, which is required for tokenizing Chinese words. It is removed from peerDependencies since v0.20.0, so you have to install it manually even if you're using npm v7+.

-```shell
-npm install nodejieba
-# or
-yarn add nodejieba
-```
+> Notice: We present this as a theme instead of plugin now, see [this comment](https://github.com/facebook/docusaurus/issues/6488#issuecomment-1024124096).
## Theme Options

@@ -94,7 +83,7 @@ yarn add nodejieba
| highlightSearchTermsOnTargetPage | boolean | `false` | Highlight search terms on target page. |
| searchResultLimits | number | `8` | Limit the search results. |
| searchResultContextMaxLength | number | `50` | Set the max length of characters of each search result to show. |
-| translations | TranslationMap | - | Set translations of this theme, see [docs below](#translations). |
+| translations | TranslationMap | - | Set translations of this theme, see [docs below](#translations). |
| ignoreFiles | string \| RegExp \| (string \| RegExp)[] | /__meta__$/ | Set the match rules to ignore some files. |

### Translations
@@ -164,9 +153,6 @@ In case some specific errors occurred:
- Try using @easyops-cn/docusaurus-search-local >= v0.16.0 with Docusaurus >= v2.0.0-alpha.73
- Try using @easyops-cn/docusaurus-search-local between v0.14.0 and v0.15.1 with Docusaurus between v2.0.0-alpha.68 and v2.0.0-alpha.72
- Or try using @easyops-cn/docusaurus-search-local <= v0.13.1 with Docusaurus <= v2.0.0-alpha.66
-- `Error: Command failed with signal "SIGSEGV"`:
-  - This is probably caused by a [known issue](https://github.com/yanyiwu/nodejieba/issues/187) introduced by `nodejieba@2.5.2`, if you enabled language of zh.
-  - Try downgrading `nodejieba` to `2.4.2` and it will work again, see discussions in [#47](https://github.com/easyops-cn/docusaurus-search-local/issues/47).

## Further Reading

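With `@node-rs/jieba` becoming a direct dependency of the theme (see the package.json diff below), enabling Chinese search no longer requires installing a tokenizer by hand. A sketch of the resulting user-side setup, matching the config shape shown in the README diff above (`hashed` and `language` are documented theme options; the rest is illustrative):

```js
// docusaurus.config.js — sketch of the setup after this change; the manual
// `npm install nodejieba` step from the old README is no longer needed.
module.exports = {
  themes: [
    [
      require.resolve("@easyops-cn/docusaurus-search-local"),
      {
        hashed: true,
        // Chinese tokenization is now handled by @node-rs/jieba,
        // bundled with the theme itself.
        language: ["en", "zh"],
      },
    ],
  ],
};
```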
4 changes: 2 additions & 2 deletions package.json
@@ -37,7 +37,8 @@
"lunr": "^2.3.9",
"lunr-languages": "^1.4.0",
"mark.js": "^8.11.1",
"tslib": "^2.2.0"
"tslib": "^2.2.0",
"@node-rs/jieba": "^1.6.0"
},
"devDependencies": {
"@babel/core": "^7.12.3",
@@ -73,7 +74,6 @@
"identity-obj-proxy": "^3.0.0",
"jest": "^26.5.3",
"lint-staged": "^10.4.1",
"nodejieba": "^2.4.1",
"prettier": "^2.1.2",
"rimraf": "^3.0.2",
"standard-version": "^9.0.0",
21 changes: 10 additions & 11 deletions src/server/utils/tokenizer.spec.ts
@@ -1,14 +1,6 @@
-import nodejieba from "nodejieba";
import { MatchMetadata } from "../../shared/interfaces";
import { tokenizer } from "./tokenizer";

-jest.mock("nodejieba");
-(nodejieba.cut as jest.MockedFunction<typeof nodejieba.cut>).mockImplementation(
-  (input) => {
-    return [input.substr(0, 2), input.substr(2)];
-  }
-);

describe("tokenizer", () => {
test.each<[string | string[] | null | undefined, MatchMetadata, any[]]>([
[null, {}, []],
@@ -54,20 +46,27 @@ describe("tokenizer", () => {
      {
        metadata: {
          index: 3,
-         position: [13, 2],
+         position: [13, 1],
        },
-       str: "很好",
+       str: "很",
      },
      {
        metadata: {
          index: 4,
+         position: [14, 1],
+       },
+       str: "好",
+     },
+     {
+       metadata: {
+         index: 5,
          position: [15, 1],
        },
        str: "用",
      },
      {
        metadata: {
-         index: 5,
+         index: 6,
          position: [17, 4],
        },
        str: "good",
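The reshuffled expectations above reflect switching from a mock to the real segmenter: the old spec stubbed `nodejieba.cut` to split its input at a fixed offset, while the new spec runs `@node-rs/jieba` directly, which cuts the sample's "很好用" into the single-character tokens "很", "好", and "用" (each with a position of width 1). A minimal sketch of that difference, assuming `@node-rs/jieba` ^1.6.0 with its bundled dictionary:

```ts
import jieba from "@node-rs/jieba";

// What the old spec's jest mock did: split at a fixed offset of 2,
// regardless of the actual words in the input.
const mockedCut = (input: string): string[] => [
  input.substr(0, 2),
  input.substr(2),
];

console.log(mockedCut("很好用")); // ["很好", "用"] — what the old expectations encoded
console.log(jieba.cut("很好用")); // ["很", "好", "用"] — per the updated expectations
```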
6 changes: 2 additions & 4 deletions src/server/utils/tokenizer.ts
@@ -1,13 +1,11 @@
import lunr from "lunr";
-import nodejieba from "nodejieba";
+import jieba from "@node-rs/jieba";
import { MatchMetadata } from "../../shared/interfaces";
import { cutWordByUnderscore } from "./cutWordByUnderscore";

// https://zhuanlan.zhihu.com/p/33335629
const RegExpConsecutiveWord = /\w+|\p{Unified_Ideograph}+/u;

-nodejieba.load();

export function tokenizer(
input: string | string[] | null | undefined,
metadata: MatchMetadata
@@ -64,7 +62,7 @@ export function tokenizer(

      start += word.length;
    } else {
-     for (const zhWord of nodejieba.cut(word)) {
+     for (const zhWord of jieba.cut(word)) {
        tokens.push(
          new lunr.Token(zhWord, {
            ...(lunr.utils as any).clone(metadata),
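The migration in this file is nearly mechanical: `@node-rs/jieba` exposes a `cut` function that is call-compatible with the old `nodejieba.cut`, and the diff drops `nodejieba.load()` without adding a replacement, since `@node-rs/jieba` works from its bundled default dictionary without an explicit load step. A standalone sketch of the new call (the sample sentence and its segmentation are illustrative, taken from jieba's usual examples):

```ts
import jieba from "@node-rs/jieba";

// No jieba.load() call here: unlike nodejieba, the default dictionary
// does not have to be loaded explicitly before the first cut().
const words: string[] = jieba.cut("小明硕士毕业于中国科学院计算所");

// e.g. ["小明", "硕士", "毕业", "于", "中国科学院", "计算所"]
console.log(words);
```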
