diff --git a/app/lib/search/token_index.dart b/app/lib/search/token_index.dart
index 40a2410fd8..0e1909abb9 100644
--- a/app/lib/search/token_index.dart
+++ b/app/lib/search/token_index.dart
@@ -147,15 +147,11 @@ class TokenIndex {
   /// Maps token Strings to a weighted documents (addressed via indexes).
   final _inverseIds = <String, Map<int, double>>{};
 
-  /// {id: size} map to store a value representative to the document length
-  late final List<double> _docWeights;
-
-  late final _length = _docWeights.length;
+  late final _length = _ids.length;
 
   TokenIndex(List<String> ids, List<String?> values) : _ids = ids {
     assert(ids.length == values.length);
     final length = values.length;
-    _docWeights = List.filled(length, 0.0);
     for (var i = 0; i < length; i++) {
       final text = values[i];
 
@@ -166,12 +162,12 @@
       if (tokens == null || tokens.isEmpty) {
         continue;
       }
+      // Document weight is a highly scaled-down proxy of the length.
+      final dw = 1 + math.log(1 + tokens.length) / 100;
       for (final token in tokens.keys) {
         final weights = _inverseIds.putIfAbsent(token, () => {});
-        weights[i] = math.max(weights[i] ?? 0.0, tokens[token]!);
+        weights[i] = math.max(weights[i] ?? 0.0, tokens[token]! / dw);
       }
-      // Document weight is a highly scaled-down proxy of the length.
-      _docWeights[i] = 1 + math.log(1 + tokens.length) / 100;
     }
   }
 
@@ -215,7 +211,7 @@ class TokenIndex {
   /// When [limitToIds] is specified, the result will contain only the set of
   /// identifiers in it.
   Map<String, double> _scoreDocs(TokenMatch tokenMatch,
-      {double weight = 1.0, int wordCount = 1, Set<String>? limitToIds}) {
+      {double weight = 1.0, Set<String>? limitToIds}) {
     // Summarize the scores for the documents.
     final docScores = List.filled(_length, 0.0);
     for (final token in tokenMatch.tokens) {
@@ -226,11 +222,6 @@
       }
     }
 
-    // In multi-word queries we will penalize the score with the document size
-    // for each word separately. As these scores will be multiplied, we need to
-    // compensate the formula in order to prevent multiple exponential penalties.
-    final double wordSizeExponent = 1.0 / wordCount;
-
     final result = <String, double>{};
     // post-process match weights
     for (var i = 0; i < _length; i++) {
@@ -242,11 +233,7 @@
       if (limitToIds != null && !limitToIds.contains(id)) {
         continue;
       }
-      var dw = _docWeights[i];
-      if (wordCount > 1) {
-        dw = math.pow(dw, wordSizeExponent).toDouble();
-      }
-      result[id] = w * weight / dw;
+      result[id] = w * weight;
     }
     return result;
   }
@@ -255,7 +242,7 @@
   /// scoring.
   @visibleForTesting
   Map<String, double> search(String text) {
-    return _scoreDocs(lookupTokens(text));
+    return searchWords(splitForQuery(text))._values;
   }
 
   /// Search the index for [words], with a (term-match / document coverage percent)
@@ -271,7 +258,6 @@
       final values = _scoreDocs(
         tokens,
         weight: weight,
-        wordCount: words.length,
        limitToIds: limitToIds,
      );
      if (values.isEmpty) {
diff --git a/app/test/search/token_index_test.dart b/app/test/search/token_index_test.dart
index 3bcf2bb6e4..d9b8983e6f 100644
--- a/app/test/search/token_index_test.dart
+++ b/app/test/search/token_index_test.dart
@@ -68,7 +68,7 @@ void main() {
     });
 
     expect(index.search('riak client'), {
-      'uri://riak_client': closeTo(0.99, 0.01),
+      'uri://riak_client': closeTo(0.98, 0.01),
     });
   });
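
Note on the scoring change (an illustration, not part of the diff): the document-length penalty used to be applied at query time, with a 1/wordCount exponent so multi-word queries would not compound it. After this change the same `1 + log(1 + tokenCount) / 100` factor is baked into the stored token weights at index build time. A standalone Dart sketch of how small that factor stays across document sizes (the variable names here are hypothetical, not from the repo):

import 'dart:math' as math;

void main() {
  // Hypothetical token counts for documents of growing size.
  for (final tokenCount in [10, 100, 1000]) {
    // Same dampening formula as the diff introduces at build time.
    final dw = 1 + math.log(1 + tokenCount) / 100;
    // A raw token weight of 1.0 as it would be stored after scaling.
    final stored = 1.0 / dw;
    print('tokens=$tokenCount  dw=${dw.toStringAsFixed(4)}  '
        'stored=${stored.toStringAsFixed(4)}');
  }
}

Even at 1,000 tokens `dw` is only about 1.07, so stored weights barely move. Since the compensation exponent is gone, a two-word query now effectively divides by `dw` once per word rather than once per query, which plausibly accounts for the expected test score dropping from 0.99 to 0.98.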