fix: process stop words for search keyword

weareoutman committed Oct 30, 2020
1 parent cec795f commit df3d789

Showing 16 changed files with 352 additions and 98 deletions.
7 changes: 7 additions & 0 deletions src/client/utils/SearchSourceFactory.spec.ts
@@ -2,6 +2,13 @@ import lunr from "lunr";
import { SearchDocument } from "../../shared/interfaces";
import { SearchSourceFactory } from "./SearchSourceFactory";

// eslint-disable-next-line @typescript-eslint/no-var-requires
require("lunr-languages/lunr.stemmer.support")(lunr);
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("../../shared/lunrLanguageZh").lunrLanguageZh(lunr);
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("lunr-languages/lunr.multi")(lunr);

jest.mock("./proxiedGenerated");

describe("SearchSourceFactory", () => {
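The spec now registers the same lunr plugins the runtime uses: stemmer support, the project's Chinese plugin, and lunr.multi. As a rough sketch of how these plugins are typically combined when an index is built — not code from this commit; the field names and document shape are invented, and "zh" is assumed to be the language id registered by lunrLanguageZh:

import lunr from "lunr";

// Plugins mutate the shared `lunr` object, so they must be registered
// before any index is constructed; stemmer support has to come first.
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("lunr-languages/lunr.stemmer.support")(lunr);
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("../../shared/lunrLanguageZh").lunrLanguageZh(lunr);
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("lunr-languages/lunr.multi")(lunr);

const index = lunr(function () {
  // `multiLanguage` comes from lunr.multi and merges the pipelines of the
  // listed languages.
  this.use((lunr as any).multiLanguage("en", "zh"));
  this.ref("i");
  this.field("t");
  this.add({ i: "1", t: "hello world 研究生命科学" });
});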
11 changes: 9 additions & 2 deletions src/client/utils/SearchSourceFactory.ts
@@ -29,11 +29,18 @@ export function SearchSourceFactory(
const queries = smartQueries(rawTokens, zhDictionary);
const results: InitialSearchResult[] = [];

search: for (const { keyword, tokens } of queries) {
search: for (const { term, tokens } of queries) {
for (const { documents, index, type } of wrappedIndexes) {
results.push(
...index
.search(keyword)
.query((query) => {
for (const item of term) {
query.term(item.value, {
wildcard: item.wildcard,
presence: item.presence,
});
}
})
.slice(0, resultsLimit)
// Remove duplicated results.
.filter(
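The core change in SearchSourceFactory.ts: instead of handing lunr a query string via index.search(keyword), the factory now drives lunr's query builder directly and passes presence and wildcard per term. A minimal sketch of the two forms, assuming an existing index idx and the term shape produced by smartQueries (not code from this commit):

import lunr from "lunr";

declare const idx: lunr.Index; // assumed: an already-built lunr index

// Before: term options were encoded into a query string and re-parsed by lunr.
const before = idx.search("+研究 +生命科学*");

// After: the same query expressed through the builder, one clause per term,
// with presence and wildcard passed as structured options.
const after = idx.query((query) => {
  query.term("研究", { presence: lunr.Query.presence.REQUIRED });
  query.term("生命科学", {
    presence: lunr.Query.presence.REQUIRED,
    wildcard: lunr.Query.wildcard.TRAILING,
  });
});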
11 changes: 10 additions & 1 deletion src/client/utils/__mocks__/proxiedGenerated.ts
@@ -1,4 +1,13 @@
export const language = ["en", "zh"];
export let language = ["en", "zh"];
export let removeDefaultStopWordFilter = false;
export const indexHash = "abc";
export const searchResultLimits = 8;
export const searchResultContextMaxLength = 50;

export function __setLanguage(value: string[]): void {
language = value;
}

export function __setRemoveDefaultStopWordFilter(value: boolean): void {
removeDefaultStopWordFilter = value;
}
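Switching the mocked exports from const to let, together with the __set* helpers, lets each test change the generated values after the module has been imported. A hedged sketch of how a consuming spec can use this, mirroring the pattern in smartQueries.spec.ts further down:

import {
  __setLanguage,
  __setRemoveDefaultStopWordFilter,
} from "./proxiedGenerated";

jest.mock("./proxiedGenerated"); // hoisted by Jest, so modules under test see the mock

beforeEach(() => {
  // Put the mocked values back to a known default before every test.
  __setLanguage(["en", "zh"]);
  __setRemoveDefaultStopWordFilter(false);
});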
11 changes: 7 additions & 4 deletions src/client/utils/cutZhWords.spec.ts
@@ -1,10 +1,9 @@
import { SmartTerm } from "../../shared/interfaces";
import { cutZhWords } from "./cutZhWords";

const zhDictionary = ["研究生", "研究", "生命", "科学", "生命科学"];

describe("cutZhWords", () => {
test.each<[string, SmartTerm[]]>([
test.each<[string, string[][]]>([
[
"研究生命科学",
[
@@ -33,7 +32,11 @@ describe("cutZhWords", () => {
["研究生我", [["研究生"], ["研究", "生*"]]],
["我", []],
["命", []],
])("cutZhWords('%s', zhDictionary) should return %j", (token, queries) => {
expect(cutZhWords(token, zhDictionary)).toEqual(queries);
])("cutZhWords('%s', zhDictionary) should work", (token, terms) => {
expect(
cutZhWords(token, zhDictionary).map((term) =>
term.map((item) => `${item.value}${item.trailing ? "*" : ""}`)
)
).toEqual(terms);
});
});
16 changes: 8 additions & 8 deletions src/client/utils/cutZhWords.ts
@@ -1,4 +1,4 @@
import { SmartTerm } from "../../shared/interfaces";
import { SmartTerm, WrappedTerm } from "../../shared/interfaces";

/**
* Get all possible terms for a string of consecutive Chinese words,
@@ -22,7 +22,9 @@ export function cutZhWords(token: string, zhDictionary: string[]): SmartTerm[] {
if (subToken.substr(0, words.length) === words) {
const nextCarry = {
missed: carry.missed,
term: carry.term.concat(words),
term: carry.term.concat({
value: words,
}),
};
if (subToken.length > words.length) {
cut(subToken.substr(words.length), nextCarry);
@@ -41,7 +43,10 @@ export function cutZhWords(token: string, zhDictionary: string[]): SmartTerm[] {
matchedLastIndex = lastIndex;
const nextCarry = {
missed: carry.missed,
term: carry.term.concat(`${subWords}*`),
term: carry.term.concat({
value: subWords,
trailing: true,
}),
};
if (subToken.length > lastIndex) {
cut(subToken.substr(lastIndex), nextCarry);
@@ -82,8 +87,3 @@ export function cutZhWords(token: string, zhDictionary: string[]): SmartTerm[] {
})
.map((item) => item.term);
}

interface WrappedTerm {
missed: number;
term: SmartTerm;
}
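With this change cutZhWords stops emitting plain strings with a "*" suffix and instead builds term items that carry the wildcard as data, while WrappedTerm moves into the shared interfaces. A hedged reconstruction of the shapes this diff implies — the real definitions live in src/shared/interfaces and may differ in detail:

// Each element of a SmartTerm is one dictionary hit (or prefix hit).
interface SmartTermItem {
  value: string;
  // Set when only a prefix matched, so the query should add a trailing
  // wildcard for this term (rendered as "value*" in the specs above).
  trailing?: boolean;
}

type SmartTerm = SmartTermItem[];

// Used internally while cutting: candidates with fewer missed characters
// are preferred when the results are ranked and mapped to terms.
interface WrappedTerm {
  missed: number;
  term: SmartTerm;
}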
194 changes: 156 additions & 38 deletions src/client/utils/smartQueries.spec.ts
@@ -1,10 +1,36 @@
import { SmartQuery } from "../../shared/interfaces";
import lunr from "lunr";
import { smartQueries } from "./smartQueries";
import {
__setLanguage,
__setRemoveDefaultStopWordFilter,
} from "./proxiedGenerated";
import { SmartQuery } from "../../shared/interfaces";

// eslint-disable-next-line @typescript-eslint/no-var-requires
require("lunr-languages/lunr.stemmer.support")(lunr);
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("../../shared/lunrLanguageZh").lunrLanguageZh(lunr);
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("lunr-languages/lunr.multi")(lunr);

(lunr as any).fake = {};

jest.mock("./proxiedGenerated");

const zhDictionary = ["研究生", "研究", "生命", "科学", "生命科学"];

interface TestQuery {
tokens: string[];
keyword: string;
}

describe("smartQueries", () => {
test.each<[string[], SmartQuery[]]>([
beforeEach(() => {
__setLanguage(["en", "zh"]);
__setRemoveDefaultStopWordFilter(false);
});

test.each<[string[], TestQuery[]]>([
[
["hello"],
[
@@ -32,77 +58,118 @@ describe("smartQueries", () => {
],
],
[
["hello", "world", "研究生命科学"],
["研究生命科学"],
[
{
tokens: ["hello", "world", "研究", "生命科学"],
keyword: "+hello +world +研究 +生命科学",
tokens: ["研究", "生命科学"],
keyword: "+研究 +生命科学",
},
{
tokens: ["研究", "生命", "科学"],
keyword: "+研究 +生命 +科学",
},
{
tokens: ["研究生", "科学"],
keyword: "+研究生 +科学",
},
{
tokens: ["hello", "world", "研究", "生命", "科学"],
keyword: "+hello +world +研究 +生命 +科学",
tokens: ["研究", "生命科学"],
keyword: "+研究 +生命科学*",
},
{
tokens: ["hello", "world", "研究生", "科学"],
keyword: "+hello +world +研究生 +科学",
tokens: ["研究", "生命", "科学"],
keyword: "+研究 +生命 +科学*",
},
{
tokens: ["hello", "world", "研究", "生命科学"],
keyword: "+hello +world +研究 +生命科学*",
tokens: ["研究生", "科学"],
keyword: "+研究生 +科学*",
},
{
tokens: ["hello", "world", "研究", "生命", "科学"],
keyword: "+hello +world +研究 +生命 +科学*",
tokens: ["研究", "生命"],
keyword: "+研究 +生命",
},
{
tokens: ["hello", "world", "研究生", "科学"],
keyword: "+hello +world +研究生 +科学*",
tokens: ["研究", "科学"],
keyword: "+研究 +科学",
},
{
tokens: ["生命", "科学"],
keyword: "+生命 +科学",
},
{
tokens: ["研究", "科学"],
keyword: "+研究 +科学*",
},
{
tokens: ["生命", "科学"],
keyword: "+生命 +科学*",
},
],
],
[
["hello", "world", "研究生"],
["研究生"],
[
{
tokens: ["hello", "world", "研究生"],
keyword: "+hello +world +研究生",
tokens: ["研究生"],
keyword: "+研究生",
},
{
tokens: ["hello", "world", "研究", "生"],
keyword: "+hello +world +研究 +生*",
tokens: ["研究", "生"],
keyword: "+研究 +生*",
},
{
tokens: ["hello", "world", "研究生"],
keyword: "+hello +world +研究生*",
tokens: ["研究生"],
keyword: "+研究生*",
},
],
],
[
["hello", "world", "生命科学", "研究生"],
/* [
["生命科学", "研究生"],
[
{
tokens: ["hello", "world", "生命科学", "研究生"],
keyword: "+hello +world +生命科学 +研究生",
tokens: ["生命科学", "研究生"],
keyword: "+生命科学 +研究生",
},
{
tokens: ["生命科学", "研究", "生"],
keyword: "+生命科学 +研究 +生*",
},
{
tokens: ["生命", "科学", "研究生"],
keyword: "+生命 +科学 +研究生",
},
{
tokens: ["生命", "科学", "研究", "生"],
keyword: "+生命 +科学 +研究 +生*",
},
{
tokens: ["生命科学", "研究生"],
keyword: "+生命科学 +研究生*",
},
{
tokens: ["hello", "world", "生命科学", "研究", "生"],
keyword: "+hello +world +生命科学 +研究 +生*",
tokens: ["生命", "科学", "研究生"],
keyword: "+生命 +科学 +研究生*",
},
],
], */
[
["a", "hello", "world"],
[
{
tokens: ["hello", "world", "生命", "科学", "研究生"],
keyword: "+hello +world +生命 +科学 +研究生",
tokens: ["a", "hello", "world"],
keyword: "+a +hello +world",
},
{
tokens: ["hello", "world", "生命", "科学", "研究", "生"],
keyword: "+hello +world +生命 +科学 +研究 +生*",
tokens: ["hello", "world"],
keyword: "+hello +world",
},
{
tokens: ["hello", "world", "生命科学", "研究生"],
keyword: "+hello +world +生命科学 +研究生*",
tokens: ["a", "hello", "world"],
keyword: "+a +hello +world*",
},
{
tokens: ["hello", "world", "生命", "科学", "研究生"],
keyword: "+hello +world +生命 +科学 +研究生*",
tokens: ["hello", "world"],
keyword: "+hello +world*",
},
],
],
@@ -128,7 +195,58 @@ describe("smartQueries", () => {
},
],
],
])("smartQueries(%j, zhDictionary) should return %j", (tokens, queries) => {
expect(smartQueries(tokens, zhDictionary)).toEqual(queries);
])("smartQueries(%j, zhDictionary) should work", (tokens, queries) => {
expect(smartQueries(tokens, zhDictionary).map(transformQuery)).toEqual(
queries
);
});
});

describe("smartQueries with no stop words filter", () => {
beforeEach(() => {
__setLanguage(["en", "fake"]);
__setRemoveDefaultStopWordFilter(true);
});

test.each<[string[], TestQuery[]]>([
[
["a", "hello"],
[
{
tokens: ["a", "hello"],
keyword: "+a +hello",
},
{
tokens: ["a", "hello"],
keyword: "+a +hello*",
},
],
],
])("smartQueries(%j, zhDictionary) should work", (tokens, queries) => {
expect(smartQueries(tokens, zhDictionary).map(transformQuery)).toEqual(
queries
);
});
});

function transformQuery(query: SmartQuery): TestQuery {
return {
tokens: query.tokens,
keyword: query.term
.map(
(item) =>
`${item.presence === lunr.Query.presence.REQUIRED ? "+" : ""}${
(item.wildcard & lunr.Query.wildcard.LEADING) ===
lunr.Query.wildcard.LEADING
? "*"
: ""
}${item.value}${
(item.wildcard & lunr.Query.wildcard.TRAILING) ===
lunr.Query.wildcard.TRAILING
? "*"
: ""
}`
)
.join(" "),
};
}
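transformQuery turns each structured clause back into the old "+term" / "term*" string notation so the test tables stay readable; lunr's wildcard field is a bit mask, which is why it is checked with bitwise AND. A small illustration of those flags, using the values lunr defines:

import lunr from "lunr";

const w = lunr.Query.wildcard; // NONE = 0, LEADING = 1, TRAILING = 2
const both = w.LEADING | w.TRAILING; // 3, i.e. "*term*"

// transformQuery renders a clause like
//   query.term("科学", { presence: REQUIRED, wildcard: TRAILING })
// as "+科学*", because the TRAILING bit is set and LEADING is not.
console.log((both & w.TRAILING) === w.TRAILING); // true
console.log((w.TRAILING & w.LEADING) === w.LEADING); // false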