Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lib/failbot.js
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ async function retryingGot(url, args) {
)
}

export function report(error, metadata) {
export async function report(error, metadata) {
// If there's no HAYSTACK_URL set, bail early
if (!process.env.HAYSTACK_URL) return

Expand Down
111 changes: 93 additions & 18 deletions lib/search/lunr-search.js
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,25 @@ export default async function loadLunrResults({ version, language, query, limit
// want to make sure this number accounts for that.
const TITLE_FIRST = queryLength <= 2 ? 45 : queryLength <= 6 ? 25 : 10

// Multiplication bonus given to matches that were made on the
// search where ALL tokens are required.
// E.g. you search for 'foo bar' and we have three records:
//
// A) "This foo is very special"
// B) "With bar and foo you can't go wrong"
// C) "Only bar can save you"
//
// What will happen is that it only finds record (B) when it's
// required to match both 'foo' *and* 'bar'. So you get these scores:
//
// A) score = result.score + popularity
// B) score = MATCH_PHRASE * (result.score + popularity)
// C) score = result.score + popularity
//
// So it's a very powerful multiplier. But that's fine because a
// "phrase match" is a very accurate thing.
const MATCH_PHRASE = 5

// Imagine that we have 1,000 documents. 100 of them contain the word
// 'foobar'. Of those 100, we want to display the top 10 "best".
// But if we only do `lunrindex.search('foobar').slice(0, 10)` we
Expand All @@ -101,28 +120,84 @@ export default async function loadLunrResults({ version, language, query, limit
// records that we finally return.
const PRE_LIMIT = 500

let titleQuery = query.trim()
if (titleQuery.length <= 3 && !titleQuery.endsWith('*s')) {
// When the search input is really short, force it to search with
// the "forward wild card". I.e. you typed `go` we turn it into a
// search for `go*` which means it can find things like `Google`.
titleQuery += '*'
}
const titleQuery = query.trim()

let highestTitleScore = 0.0

const andTitleResults = []

// This will turn something like 'foo and bar' into:
// [
// { str: 'foo', metadata: { position: [Array], index: 0 } },
// { str: 'bar', metadata: { position: [Array], index: 1 } }
// ]
// Note how the stopword gets omitted.
// It's important to omit the stopwords because even if the record
// actually contains the stopword, it won't match then.
// E.g. you have a record called "Foo And Bar" and you search for
// {foo AND and AND bar} it will actually not find anything.
// But if you change it to {foo AND bar} it will match "Foo And Bar"
// Same goes if any other stopwords were used like "Foo the Bar with for a".
// That also needs to become an AND-search of {foo AND bar} ...only.
const titleQueryTokenized = lunr.tokenizer(titleQuery).filter(lunr.stopWordFilter)

if (titleQueryTokenized.length > 1) {
andTitleResults.push(
...index
.query((q) => {
for (const { str } of titleQueryTokenized) {
q.term(str, { fields: ['title'], presence: lunr.Query.presence.REQUIRED })
}
})
.slice(0, PRE_LIMIT)
.map((result) => {
const { popularity } = records[result.ref]
if (result.score > highestTitleScore) {
highestTitleScore = result.score
}
const score = result.score / highestTitleScore
return {
result,
_score: MATCH_PHRASE * TITLE_FIRST * (score + POPULARITY_FACTOR * (popularity || 0.0)),
}
})
)
}

const titleResults = index
.query((q) => {
if (/['"]/.test(titleQuery)) {
// If the query contains quotation marks, you can't easily
// break it up into individual words.
q.term(titleQuery, { fields: ['title'] })
} else {
// This is the structured way of turning 'foo bar'
// into `title:foo title:bar`.
titleQuery.split(/ /g).forEach((part) => {
q.term(part, { fields: ['title'] })
// The objective is to create an OR-query specifically for the 'title'
// because *we* value matches on that much higher than any other
// field in our records.
// But we want to make sure that the last word is always treated
// like a forward-tokenized token. I.e. if you typed "google ku"
// it becomes a search for "google ku*".
// Note that it's important to use the `lunr.tokenizer()` function when
// using the `index.query()` function because, for starters, it will
// normalize the input.
// By contrast, `index.search()` is the higher abstraction of basically
// doing this:
// (pseudo code)
//
// Index.prototype.search = function(input) {
// lunr.tokenize(input).forEach(token => {
// Index.query(callback => {
// callback(token)
// })
// })
// }
//
// If we didn't use the tokenized form, we'd get different results
// for searching for "SSH agent" and "ssh AgenT" for example.
titleQueryTokenized.forEach(({ str }, i) => {
const isLastToken = i === titleQueryTokenized.length - 1
const isShort = str.length <= 3
q.term(str, {
fields: ['title'],
wildcard:
isLastToken && isShort ? lunr.Query.wildcard.TRAILING : lunr.Query.wildcard.NONE,
})
}
})
})
.slice(0, PRE_LIMIT)
.map((result) => {
Expand Down Expand Up @@ -170,7 +245,7 @@ export default async function loadLunrResults({ version, language, query, limit
const _unique = new Set()
const combinedMatchData = {}
const results = []
for (const matches of [titleResults, allResults]) {
for (const matches of [andTitleResults, titleResults, allResults]) {
for (const match of matches) {
const { result } = match
// We need to loop over all results (both from title searches and
Expand Down
10 changes: 7 additions & 3 deletions tests/unit/failbot.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ describe('FailBot', () => {
const requestBodiesSent = []

beforeEach(() => {
delete process.env.HAYSTACK_URL

// Always reset the array to an empty one between tests
// so it doesn't interfere across tests.
requestBodiesSent.length = 0

nock('https://haystack.example.com')
.post('/')
.reply(200, (uri, requestBody) => {
Expand All @@ -15,15 +21,13 @@ describe('FailBot', () => {

afterEach(() => {
delete process.env.HAYSTACK_URL
// Reset the array to an empty one between tests
// so it doesn't interfere across tests.
requestBodiesSent.length = 0
})

describe('.report', () => {
it('returns early if `HAYSTACK_URL` is not set', async () => {
const result = await FailBot.report()
expect(result).toBeUndefined()
expect(requestBodiesSent.length).toBe(0)
})

it('sends the expected report', async () => {
Expand Down