From 361835bbac2935e406c039fd76f73b668c656f09 Mon Sep 17 00:00:00 2001 From: Istvan Soos Date: Mon, 4 Nov 2024 17:16:42 +0100 Subject: [PATCH] Reduce the memory allocation during search with mutable IndexedScore. --- CHANGELOG.md | 1 + app/lib/search/mem_index.dart | 83 ++++++++---------- app/lib/search/token_index.dart | 113 +++++++++++++++---------- app/test/search/api_doc_page_test.dart | 4 +- 4 files changed, 109 insertions(+), 92 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d30a01b5ca..7df7830a8e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ Important changes to data models, configuration, and migrations between each AppEngine version, listed here to ease deployment and troubleshooting. ## Next Release (replace with git tag when deployed) + * `search` uses the `IndexedScore` to reduce memory allocations. ## `20241031t095600-all` * Bumped runtimeVersion to `2024.10.29`. diff --git a/app/lib/search/mem_index.dart b/app/lib/search/mem_index.dart index 0aabd9af30..b0d3f4b0c6 100644 --- a/app/lib/search/mem_index.dart +++ b/app/lib/search/mem_index.dart @@ -311,32 +311,28 @@ class InMemoryPackageIndex { // We cannot update the main `packages` variable yet, as the dartdoc API // symbols are added on top of the core results, and `packages` is used // there too. 
- final coreScores = []; - var wordScopedPackages = packages; + final coreScores = IndexedScore(_packageNameIndex._packageNames); + for (var i = 0; i < _documents.length; i++) { + if (packages.contains(_documents[i].package)) { + coreScores.setValue(i, 1.0); + } + } + for (final word in words) { - final nameScore = _packageNameIndex.searchWord(word, - filterOnPackages: wordScopedPackages); if (includeNameMatches && _documentsByName.containsKey(word)) { nameMatches ??= {}; nameMatches.add(word); } - final descr = _descrIndex - .searchWords([word], weight: 0.90, limitToIds: wordScopedPackages); - final readme = _readmeIndex - .searchWords([word], weight: 0.75, limitToIds: wordScopedPackages); - final score = Score.max([nameScore, descr, readme]); - coreScores.add(score); - // don't update if the query is single-word - if (words.length > 1) { - wordScopedPackages = score.keys.toSet(); - if (wordScopedPackages.isEmpty) { - break; - } - } + final wordScore = + _packageNameIndex.searchWord(word, filterOnNonZeros: coreScores); + _descrIndex.searchAndAccumulate(word, + weight: 0.90.toDouble(), score: wordScore); + _readmeIndex.searchAndAccumulate(word, + weight: 0.75.toDouble(), score: wordScore); + coreScores.multiplyAllFrom(wordScore); } - - final core = Score.multiply(coreScores); + final core = coreScores.toScore(); var symbolPages = Score.empty; if (!checkAborted()) { @@ -502,16 +498,13 @@ class _TextResults { @visibleForTesting class PackageNameIndex { final List _packageNames; - late final Map _data; + late final List<_PkgNameData> _data; PackageNameIndex(this._packageNames) { - _data = Map.fromEntries(_packageNames.map((package) { + _data = _packageNames.map((package) { final collapsed = _collapseName(package); - return MapEntry( - package, - _PkgNameData(collapsed, trigrams(collapsed).toSet()), - ); - })); + return _PkgNameData(collapsed, trigrams(collapsed).toSet()); + }).toList(); } /// Maps package name to a reduced form of the name: @@ -522,45 +515,43 @@ class 
PackageNameIndex { /// Search [text] and return the matching packages with scores. @visibleForTesting Score search(String text) { - Score? score; + IndexedScore? score; for (final w in splitForQuery(text)) { - final s = searchWord(w, filterOnPackages: score?.keys); + final s = searchWord(w, filterOnNonZeros: score); if (score == null) { score = s; } else { - // Note: on one hand, it is inefficient to multiply the [Score] on each - // iteration. However, (1) this is only happening in test, (2) it may be - // better for the next iteration to work on a more limited `filterOnPackages`, - // and (3) it will be updated to a more efficient in-place update (#8225). - score = Score.multiply([score, s]); + score.multiplyAllFrom(s); } } - return score ?? Score.empty; + return score?.toScore() ?? Score.empty; + } /// Search using the parsed [word] and return the matching packages with scores - /// as a new [Score] instance. + /// as a new [IndexedScore] instance. /// - /// When [filterOnPackages] is present, only the names present are evaluated. - Score searchWord( + /// When [filterOnNonZeros] is present, only the indexes with an already + /// non-zero value are evaluated. + IndexedScore searchWord( String word, { - Iterable? filterOnPackages, + IndexedScore? filterOnNonZeros, }) { - final pkgNamesToCheck = filterOnPackages ?? _packageNames; - final values = {}; + final score = IndexedScore(_packageNames); final singularWord = word.length <= 3 || !word.endsWith('s') ? word : word.substring(0, word.length - 1); final collapsedWord = _collapseName(singularWord); final parts = collapsedWord.length <= 3 ? [collapsedWord] : trigrams(collapsedWord); - for (final pkg in pkgNamesToCheck) { - final entry = _data[pkg]; - if (entry == null) { + for (var i = 0; i < _data.length; i++) { + if (filterOnNonZeros?.isNotPositive(i) ?? 
false) { continue; } + + final entry = _data[i]; if (entry.collapsed.contains(collapsedWord)) { - values[pkg] = 1.0; + score.setValue(i, 1.0); continue; } var matched = 0; @@ -574,11 +565,11 @@ class PackageNameIndex { if (matched > 0) { final v = matched / parts.length; if (v >= 0.5) { - values[pkg] = v; + score.setValue(i, v); } } } - return Score(values); + return score; } } diff --git a/app/lib/search/token_index.dart b/app/lib/search/token_index.dart index fda0f91cf7..4af4d43bb6 100644 --- a/app/lib/search/token_index.dart +++ b/app/lib/search/token_index.dart @@ -206,37 +206,22 @@ class TokenIndex { return tokenMatch; } - /// Returns an {id: score} map of the documents stored in the [TokenIndex]. - /// The tokens in [tokenMatch] will be used to calculate a weighted sum of scores. - /// - /// When [limitToIds] is specified, the result will contain only the set of - /// identifiers in it. - Map _scoreDocs(TokenMatch tokenMatch, - {double weight = 1.0, Set? limitToIds}) { - // Summarize the scores for the documents. - final docScores = List.filled(_length, 0.0); + /// Searches the index with [word] and stores the results in [score], using + /// an accumulation operation on the already existing values. + void searchAndAccumulate( + String word, { + double weight = 1.0, + required IndexedScore score, + }) { + assert(score.length == _length); + final tokenMatch = lookupTokens(word); for (final token in tokenMatch.tokens) { - final docWeights = _inverseIds[token]!; - for (final e in docWeights.entries) { - final i = e.key; - docScores[i] = math.max(docScores[i], tokenMatch[token]! 
* e.value); + final matchWeight = tokenMatch[token]!; + final tokenWeight = _inverseIds[token]!; + for (final e in tokenWeight.entries) { + score.setValueMaxOf(e.key, matchWeight * e.value * weight); } } - - final result = {}; - // post-process match weights - for (var i = 0; i < _length; i++) { - final id = _ids[i]; - final w = docScores[i]; - if (w <= 0.0) { - continue; - } - if (limitToIds != null && !limitToIds.contains(id)) { - continue; - } - result[id] = w * weight; - } - return result; } /// Search the index for [text], with a (term-match / document coverage percent) @@ -248,24 +233,64 @@ class TokenIndex { /// Search the index for [words], with a (term-match / document coverage percent) /// scoring. - Score searchWords(List words, - {double weight = 1.0, Set? limitToIds}) { - if (limitToIds != null && limitToIds.isEmpty) { - return Score.empty; - } - final scores = []; + Score searchWords(List words, {double weight = 1.0}) { + IndexedScore? score; for (final w in words) { - final tokens = lookupTokens(w); - final values = _scoreDocs( - tokens, - weight: weight, - limitToIds: limitToIds, - ); - if (values.isEmpty) { - return Score.empty; + final s = IndexedScore(_ids); + searchAndAccumulate(w, score: s, weight: weight); + if (score == null) { + score = s; + // reset weight + weight = 1.0; + } else { + score.multiplyAllFrom(s); + } + } + return score?.toScore() ?? Score.empty; + } +} + +/// Mutable score list that can be accessed via integer index. 
+class IndexedScore { + final List _keys; + final List _values; + + IndexedScore._(this._keys, this._values); + + factory IndexedScore(List keys) => + IndexedScore._(keys, List.filled(keys.length, 0.0)); + + late final length = _values.length; + + bool isNotPositive(int index) { + return _values[index] <= 0.0; + } + + void setValue(int index, double value) { + _values[index] = value; + } + + void setValueMaxOf(int index, double value) { + _values[index] = math.max(_values[index], value); + } + + void multiplyAllFrom(IndexedScore other) { + assert(other._values.length == _values.length); + for (var i = 0; i < _values.length; i++) { + if (_values[i] == 0.0) continue; + final v = other._values[i]; + _values[i] = v == 0.0 ? 0.0 : _values[i] * v; + } + } + + Score toScore() { + final map = {}; + for (var i = 0; i < _values.length; i++) { + final v = _values[i]; + if (v > 0.0) { + map[_keys[i]] = v; } - scores.add(Score(values)); } - return Score.multiply(scores); + return Score._(map); } } diff --git a/app/test/search/api_doc_page_test.dart b/app/test/search/api_doc_page_test.dart index 40b5ca9606..b5d565f201 100644 --- a/app/test/search/api_doc_page_test.dart +++ b/app/test/search/api_doc_page_test.dart @@ -99,7 +99,7 @@ void main() { 'packageHits': [ { 'package': 'foo', - 'score': closeTo(0.18, 0.01), // find WebPageGenerator + 'score': closeTo(0.26, 0.01), // find WebPageGenerator 'apiPages': [ {'path': 'generator.html'}, ], @@ -119,7 +119,7 @@ void main() { 'packageHits': [ { 'package': 'foo', - 'score': closeTo(0.11, 0.01), // find WebPageGenerator + 'score': closeTo(0.15, 0.01), // find WebPageGenerator 'apiPages': [ {'path': 'generator.html'}, ],