From 5d080b74e0916b315db965aa2a56ef29b60f2a59 Mon Sep 17 00:00:00 2001 From: Peter Bengtsson Date: Tue, 16 Nov 2021 15:22:09 -0500 Subject: [PATCH 1/2] Search with and on title first and favor that more (#22730) * search improvements * expose search result score on debug=1 qs (#22452) * Search index improvements (#22428) * concatenate headings for search ranking (#22445) * search index improvements * update index * add data-search property used to create search records * direct match title and normalize results to one per article * the search index is getting huge * add chinese lang, intro, and boost * create a article level record with no heading for each article * remove chinese language and exact match, boost breadcrumbs * break out debug mode into separate branch * break adding chinese out into another branch * one record per article remove limits * revert added spaces * revert adding property on article page * concatenate headings for search ranking * remove slug property, fix test * update record format in tests * revert adding this property back * scrape article-body not contents * adding heading property to tests * add headings to content property * Update script/search/parse-page-sections-into-records.js Co-authored-by: Peter Bengtsson * Update script/search/parse-page-sections-into-records.js Co-authored-by: Peter Bengtsson Co-authored-by: Peter Bengtsson * search by popularity (#22496) * search by popularity * adjust factors based on length of query and normalize Lunr score * Apply suggestions from code review Co-authored-by: Rachael Sewell Co-authored-by: Rachael Sewell * turn html into text with appropriate spaces (#22528) * turn HTML into text with appropriate spaces Part of #1141 * fix unit test * improve tests * small refactor * slice the search results .content (#22514) * search by popularity * adjust factors based on length of query and normalize Lunr score * slice the search result .content Part of #1142 * Update lib/search/lunr-search.js Co-authored-by: Rachael Sewell Co-authored-by: Rachael Sewell * truncate translated search index records (#22475) * truncate translated search index records * page entire page object * revert search indexes * try reverting search indexes again... * out of sync with main in parent branch so reverting last commit again * rever deleted indexes * remove debug lines * correct package-lock.json * slice content string better (#22586) Part of #1149 * include home pages in the search (#22568) * Search remove hyphen as token (#22584) * slice better when not present in content (#22672) * fix missing search breadcrumbs (#22681) * fix missing search breadcrumbs * hopefully fix tests Co-authored-by: Peter Bengtsson * Highlight based on combined matchData (#22692) * highlight based on combined matchData * refactoring * remove results when lang or version changes (#22697) * search with AND on title first and favor that more * commented out code * Update lib/search/lunr-search.js * only apply the wildcard to the title search if the last word is short Co-authored-by: Rachael Sewell --- lib/search/lunr-search.js | 111 +++++++++++++++++++++++++++++++------- 1 file changed, 93 insertions(+), 18 deletions(-) diff --git a/lib/search/lunr-search.js b/lib/search/lunr-search.js index 75f02a863f60..3d39f77350c7 100644 --- a/lib/search/lunr-search.js +++ b/lib/search/lunr-search.js @@ -88,6 +88,25 @@ export default async function loadLunrResults({ version, language, query, limit // want to make sure this number accounts for that. const TITLE_FIRST = queryLength <= 2 ? 45 : queryLength <= 6 ? 25 : 10 + // Multiplication bonus given to matches that were made on the + // the search where ALL tokens are required. + // E.g. you search for 'foo bar' and we have three records: + // + // A) "This foo is very special" + // B) "With bar and foo you can't go wrong" + // C) "Only bar can save you" + // + // What will happen is that it only finds record (B) when it's + // requires to match both 'foo' *and* 'bar'. So you get these scores: + // + // A) score = result.score + popularity + // B) score = MATCH_PHRASE * (result.score + popularity) + // C) score = result.score + popularity + // + // So it's very powerful multiplier. But that's fine because a + // "phrase match" is a very accurate thing. + const MATCH_PHRASE = 5 + // Imagine that we have 1,000 documents. 100 of them contain the word // 'foobar'. Of those 100, we want to display the top 10 "best". // But if we only do `lunrindex.search('foobar').slice(0, 10)` we @@ -101,28 +120,84 @@ export default async function loadLunrResults({ version, language, query, limit // records that we finally return. const PRE_LIMIT = 500 - let titleQuery = query.trim() - if (titleQuery.length <= 3 && !titleQuery.endsWith('*s')) { - // When the search input is really short, force it to search with - // the "forward wild card". I.e. you typed `go` we turn it into a - // search for `go*` which means it can find things like `Google`. - titleQuery += '*' - } + const titleQuery = query.trim() let highestTitleScore = 0.0 + + const andTitleResults = [] + + // This will turn something like 'foo and bar' into: + // [ + // { str: 'foo', metadata: { position: [Array], index: 0 } }, + // { str: 'bar', metadata: { position: [Array], index: 1 } } + // ] + // Note how the stopword gets omitted. + // It's important to omit the stopwords because even if the record + // actually contains the stopword, it won't match then. + // E.g. you have a record called "Foo And Bar" and you search for + // {foo AND and AND bar} it will actually not find anything. + // But if you change it to {foo AND bar} it will match "Foo And Bar" + // Same goes if any other stopwords were used like "Foo the Bar with for a". + // That also needs to become an AND-search of {foo AND bar} ...only. + const titleQueryTokenized = lunr.tokenizer(titleQuery).filter(lunr.stopWordFilter) + + if (titleQueryTokenized.length > 1) { + andTitleResults.push( + ...index + .query((q) => { + for (const { str } of titleQueryTokenized) { + q.term(str, { fields: ['title'], presence: lunr.Query.presence.REQUIRED }) + } + }) + .slice(0, PRE_LIMIT) + .map((result) => { + const { popularity } = records[result.ref] + if (result.score > highestTitleScore) { + highestTitleScore = result.score + } + const score = result.score / highestTitleScore + return { + result, + _score: MATCH_PHRASE * TITLE_FIRST * (score + POPULARITY_FACTOR * (popularity || 0.0)), + } + }) + ) + } + const titleResults = index .query((q) => { - if (/['"]/.test(titleQuery)) { - // If the query contains a quotation marks, you can't easily - // enough break it up into individual words. - q.term(titleQuery, { fields: ['title'] }) - } else { - // This is the structured way of doing turning 'foo bar' - // into `title:foo title:bar'. - titleQuery.split(/ /g).forEach((part) => { - q.term(part, { fields: ['title'] }) + // The objective is to create an OR-query specifically for the 'title' + // because *we* value matches on that much higher than any other + // field in our records. + // But we want to make sure that the last word is always treated + // like a forward-tokenized token. I.e. you typed "google ku" + // becomes a search for "google ku*". + // Note that it's import that use the `lunr.tokenizer()` function when + // using the `index.query()` function because, for starters, it will + // normalize the input. + // If you use `index.search()` is the higher abstraction of basically + // doing this: + // (pseudo code) + // + // Index.prototype.search = function(input) { + // lunr.tokenize(input).forEach(token => { + // Index.query(callback => { + // callback(token) + // }) + // }) + // } + // + // If we didn't use the tokenized form, we'd get different results + // for searching for "SSH agent" and "ssh AgenT" for example. + titleQueryTokenized.forEach(({ str }, i) => { + const isLastToken = i === titleQueryTokenized.length - 1 + const isShort = str.length <= 3 + q.term(str, { + fields: ['title'], + wildcard: + isLastToken && isShort ? lunr.Query.wildcard.TRAILING : lunr.Query.wildcard.NONE, }) - } + }) }) .slice(0, PRE_LIMIT) .map((result) => { @@ -170,7 +245,7 @@ export default async function loadLunrResults({ version, language, query, limit const _unique = new Set() const combinedMatchData = {} const results = [] - for (const matches of [titleResults, allResults]) { + for (const matches of [andTitleResults, titleResults, allResults]) { for (const match of matches) { const { result } = match // We need to loop over all results (both from title searches and From 5760b8512d25517c52b5f3de4efa2cc8b370ebd6 Mon Sep 17 00:00:00 2001 From: Peter Bengtsson Date: Tue, 16 Nov 2021 15:40:08 -0500 Subject: [PATCH 2/2] make FailBot unit tests independent of env (#22871) Part of #1229 --- lib/failbot.js | 2 +- tests/unit/failbot.js | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/lib/failbot.js b/lib/failbot.js index 42e7399060d7..e63d344f69e2 100644 --- a/lib/failbot.js +++ b/lib/failbot.js @@ -31,7 +31,7 @@ async function retryingGot(url, args) { ) } -export function report(error, metadata) { +export async function report(error, metadata) { // If there's no HAYSTACK_URL set, bail early if (!process.env.HAYSTACK_URL) return diff --git a/tests/unit/failbot.js b/tests/unit/failbot.js index 63503b0919d7..eb6d7825e72e 100644 --- a/tests/unit/failbot.js +++ b/tests/unit/failbot.js @@ -5,6 +5,12 @@ describe('FailBot', () => { const requestBodiesSent = [] beforeEach(() => { + delete process.env.HAYSTACK_URL + + // Always reset the array to an empty one between tests + // so it doesn't intefere across tests. + requestBodiesSent.length = 0 + nock('https://haystack.example.com') .post('/') .reply(200, (uri, requestBody) => { @@ -15,15 +21,13 @@ describe('FailBot', () => { afterEach(() => { delete process.env.HAYSTACK_URL - // Reset the array to an empty one between tests - // so it doesn't intefere across tests. - requestBodiesSent.length = 0 }) describe('.report', () => { it('returns early if `HAYSTACK_URL` is not set', async () => { const result = await FailBot.report() expect(result).toBeUndefined() + expect(requestBodiesSent.length).toBe(0) }) it('sends the expected report', async () => {