From 5d080b74e0916b315db965aa2a56ef29b60f2a59 Mon Sep 17 00:00:00 2001
From: Peter Bengtsson <peterbe@github.com>
Date: Tue, 16 Nov 2021 15:22:09 -0500
Subject: [PATCH 1/2] Search with and on title first and favor that more
 (#22730)

* search improvements

* expose search result score on debug=1 qs (#22452)

* Search index improvements (#22428)

* concatenate headings for search ranking (#22445)

* search index improvements

* update index

* add data-search property used to create search records

* direct match title and normalize results to one per article

* the search index is getting huge

* add chinese lang, intro, and boost

* create a article level record with no heading for each article

* remove chinese language and exact match, boost breadcrumbs

* break out debug mode into separate branch

* break adding chinese out into another branch

* one record per article remove limits

* revert added spaces

* revert adding property on article page

* concatenate headings for search ranking

* remove slug property, fix test

* update record format in tests

* revert adding this property back

* scrape article-body not contents

* adding heading property to tests

* add headings to content property

* Update script/search/parse-page-sections-into-records.js

Co-authored-by: Peter Bengtsson <peterbe@github.com>

* Update script/search/parse-page-sections-into-records.js

Co-authored-by: Peter Bengtsson <peterbe@github.com>

Co-authored-by: Peter Bengtsson <peterbe@github.com>

* search by popularity (#22496)

* search by popularity

* adjust factors based on length of query and normalize Lunr score

* Apply suggestions from code review

Co-authored-by: Rachael Sewell <rachmari@github.com>

Co-authored-by: Rachael Sewell <rachmari@github.com>

* turn html into text with appropriate spaces (#22528)

* turn HTML into text with appropriate spaces

Part of #1141

* fix unit test

* improve tests

* small refactor

* slice the search results .content (#22514)

* search by popularity

* adjust factors based on length of query and normalize Lunr score

* slice the search result .content

Part of #1142

* Update lib/search/lunr-search.js

Co-authored-by: Rachael Sewell <rachmari@github.com>

Co-authored-by: Rachael Sewell <rachmari@github.com>

* truncate translated search index records (#22475)

* truncate translated search index records

* page entire page object

* revert search indexes

* try reverting search indexes again...

* out of sync with main in parent branch so reverting last commit again

* rever deleted indexes

* remove debug lines

* correct package-lock.json

* slice content string better (#22586)

Part of #1149

* include home pages in the search (#22568)

* Search remove hyphen as token (#22584)

* slice better when not present in content (#22672)

* fix missing search breadcrumbs (#22681)

* fix missing search breadcrumbs

* hopefully fix tests

Co-authored-by: Peter Bengtsson <mail@peterbe.com>

* Highlight based on combined matchData (#22692)

* highlight based on combined matchData

* refactoring

* remove results when lang or version changes (#22697)

* search with AND on title first and favor that more

* commented out code

* Update lib/search/lunr-search.js

* only apply the wildcard to the title search if the last word is short

Co-authored-by: Rachael Sewell <rachmari@github.com>
---
 lib/search/lunr-search.js | 111 +++++++++++++++++++++++++++++++-------
 1 file changed, 93 insertions(+), 18 deletions(-)

diff --git a/lib/search/lunr-search.js b/lib/search/lunr-search.js
index 75f02a863f60..3d39f77350c7 100644
--- a/lib/search/lunr-search.js
+++ b/lib/search/lunr-search.js
@@ -88,6 +88,25 @@ export default async function loadLunrResults({ version, language, query, limit
   // want to make sure this number accounts for that.
   const TITLE_FIRST = queryLength <= 2 ? 45 : queryLength <= 6 ? 25 : 10
 
+  // Multiplication bonus given to matches that were made on the
+  // the search where ALL tokens are required.
+  // E.g. you search for 'foo bar' and we have three records:
+  //
+  //  A)  "This foo is very special"
+  //  B)  "With bar and foo you can't go wrong"
+  //  C)  "Only bar can save you"
+  //
+  // What will happen is that it only finds record (B) when it's
+  // requires to match both 'foo' *and* 'bar'. So you get these scores:
+  //
+  //  A) score = result.score + popularity
+  //  B) score = MATCH_PHRASE * (result.score + popularity)
+  //  C) score = result.score + popularity
+  //
+  // So it's very powerful multiplier. But that's fine because a
+  // "phrase match" is a very accurate thing.
+  const MATCH_PHRASE = 5
+
   // Imagine that we have 1,000 documents. 100 of them contain the word
   // 'foobar'. Of those 100, we want to display the top 10 "best".
   // But if we only do `lunrindex.search('foobar').slice(0, 10)` we
@@ -101,28 +120,84 @@ export default async function loadLunrResults({ version, language, query, limit
   // records that we finally return.
   const PRE_LIMIT = 500
 
-  let titleQuery = query.trim()
-  if (titleQuery.length <= 3 && !titleQuery.endsWith('*s')) {
-    // When the search input is really short, force it to search with
-    // the "forward wild card". I.e. you typed `go` we turn it into a
-    // search for `go*` which means it can find things like `Google`.
-    titleQuery += '*'
-  }
+  const titleQuery = query.trim()
 
   let highestTitleScore = 0.0
+
+  const andTitleResults = []
+
+  // This will turn something like 'foo and bar' into:
+  // [
+  //   { str: 'foo', metadata: { position: [Array], index: 0 } },
+  //   { str: 'bar', metadata: { position: [Array], index: 1 } }
+  // ]
+  // Note how the stopword gets omitted.
+  // It's important to omit the stopwords because even if the record
+  // actually contains the stopword, it won't match then.
+  // E.g. you have a record called "Foo And Bar" and you search for
+  // {foo AND and AND bar} it will actually not find anything.
+  // But if you change it to {foo AND bar} it will match "Foo And Bar"
+  // Same goes if any other stopwords were used like "Foo the Bar with for a".
+  // That also needs to become an AND-search of {foo AND bar} ...only.
+  const titleQueryTokenized = lunr.tokenizer(titleQuery).filter(lunr.stopWordFilter)
+
+  if (titleQueryTokenized.length > 1) {
+    andTitleResults.push(
+      ...index
+        .query((q) => {
+          for (const { str } of titleQueryTokenized) {
+            q.term(str, { fields: ['title'], presence: lunr.Query.presence.REQUIRED })
+          }
+        })
+        .slice(0, PRE_LIMIT)
+        .map((result) => {
+          const { popularity } = records[result.ref]
+          if (result.score > highestTitleScore) {
+            highestTitleScore = result.score
+          }
+          const score = result.score / highestTitleScore
+          return {
+            result,
+            _score: MATCH_PHRASE * TITLE_FIRST * (score + POPULARITY_FACTOR * (popularity || 0.0)),
+          }
+        })
+    )
+  }
+
   const titleResults = index
     .query((q) => {
-      if (/['"]/.test(titleQuery)) {
-        // If the query contains a quotation marks, you can't easily
-        // enough break it up into individual words.
-        q.term(titleQuery, { fields: ['title'] })
-      } else {
-        // This is the structured way of doing turning 'foo bar'
-        // into `title:foo title:bar'.
-        titleQuery.split(/ /g).forEach((part) => {
-          q.term(part, { fields: ['title'] })
+      // The objective is to create an OR-query specifically for the 'title'
+      // because *we* value matches on that much higher than any other
+      // field in our records.
+      // But we want to make sure that the last word is always treated
+      // like a forward-tokenized token. I.e. you typed "google ku"
+      // becomes a search for "google ku*".
+      // Note that it's import that use the `lunr.tokenizer()` function when
+      // using the `index.query()` function because, for starters, it will
+      // normalize the input.
+      // If you use `index.search()` is the higher abstraction of basically
+      // doing this:
+      // (pseudo code)
+      //
+      //    Index.prototype.search = function(input) {
+      //       lunr.tokenize(input).forEach(token => {
+      //          Index.query(callback => {
+      //              callback(token)
+      //          })
+      //       })
+      //    }
+      //
+      // If we didn't use the tokenized form, we'd get different results
+      // for searching for "SSH agent" and "ssh AgenT" for example.
+      titleQueryTokenized.forEach(({ str }, i) => {
+        const isLastToken = i === titleQueryTokenized.length - 1
+        const isShort = str.length <= 3
+        q.term(str, {
+          fields: ['title'],
+          wildcard:
+            isLastToken && isShort ? lunr.Query.wildcard.TRAILING : lunr.Query.wildcard.NONE,
         })
-      }
+      })
     })
     .slice(0, PRE_LIMIT)
     .map((result) => {
@@ -170,7 +245,7 @@ export default async function loadLunrResults({ version, language, query, limit
   const _unique = new Set()
   const combinedMatchData = {}
   const results = []
-  for (const matches of [titleResults, allResults]) {
+  for (const matches of [andTitleResults, titleResults, allResults]) {
     for (const match of matches) {
       const { result } = match
       // We need to loop over all results (both from title searches and

From 5760b8512d25517c52b5f3de4efa2cc8b370ebd6 Mon Sep 17 00:00:00 2001
From: Peter Bengtsson <peterbe@github.com>
Date: Tue, 16 Nov 2021 15:40:08 -0500
Subject: [PATCH 2/2] make FailBot unit tests independent of env (#22871)

Part of #1229
---
 lib/failbot.js        |  2 +-
 tests/unit/failbot.js | 10 +++++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/lib/failbot.js b/lib/failbot.js
index 42e7399060d7..e63d344f69e2 100644
--- a/lib/failbot.js
+++ b/lib/failbot.js
@@ -31,7 +31,7 @@ async function retryingGot(url, args) {
   )
 }
 
-export function report(error, metadata) {
+export async function report(error, metadata) {
   // If there's no HAYSTACK_URL set, bail early
   if (!process.env.HAYSTACK_URL) return
 
diff --git a/tests/unit/failbot.js b/tests/unit/failbot.js
index 63503b0919d7..eb6d7825e72e 100644
--- a/tests/unit/failbot.js
+++ b/tests/unit/failbot.js
@@ -5,6 +5,12 @@ describe('FailBot', () => {
   const requestBodiesSent = []
 
   beforeEach(() => {
+    delete process.env.HAYSTACK_URL
+
+    // Always reset the array to an empty one between tests
+    // so it doesn't intefere across tests.
+    requestBodiesSent.length = 0
+
     nock('https://haystack.example.com')
       .post('/')
       .reply(200, (uri, requestBody) => {
@@ -15,15 +21,13 @@ describe('FailBot', () => {
 
   afterEach(() => {
     delete process.env.HAYSTACK_URL
-    // Reset the array to an empty one between tests
-    // so it doesn't intefere across tests.
-    requestBodiesSent.length = 0
   })
 
   describe('.report', () => {
     it('returns early if `HAYSTACK_URL` is not set', async () => {
       const result = await FailBot.report()
       expect(result).toBeUndefined()
+      expect(requestBodiesSent.length).toBe(0)
     })
 
     it('sends the expected report', async () => {