From 60c0155a60aa0ae29c3e4c79e5f7a73990d6d37e Mon Sep 17 00:00:00 2001 From: Istvan Soos Date: Thu, 7 Nov 2024 10:14:36 +0100 Subject: [PATCH 1/2] Use IndexedScore to accumulate name, description and readme scores. --- app/lib/search/mem_index.dart | 80 +++++++++++++++------------------ app/lib/search/token_index.dart | 42 +++++++++++++++++ 2 files changed, 77 insertions(+), 45 deletions(-) diff --git a/app/lib/search/mem_index.dart b/app/lib/search/mem_index.dart index 2cd6811545..a0fb1ed001 100644 --- a/app/lib/search/mem_index.dart +++ b/app/lib/search/mem_index.dart @@ -304,32 +304,28 @@ class InMemoryPackageIndex { // We cannot update the main `packages` variable yet, as the dartdoc API // symbols are added on top of the core results, and `packages` is used // there too. - final coreScores = []; - var wordScopedPackages = packages; + final coreScores = IndexedScore(_packageNameIndex._packageNames); + for (var i = 0; i < _documents.length; i++) { + if (packages.contains(_documents[i].package)) { + coreScores.setValue(i, 1.0); + } + } for (final word in words) { - final nameScore = _packageNameIndex.searchWord(word, - filterOnPackages: wordScopedPackages); if (includeNameMatches && _documentsByName.containsKey(word)) { nameMatches ??= {}; nameMatches.add(word); } - final descr = _descrIndex - .searchWords([word], weight: 0.90, limitToIds: wordScopedPackages); - final readme = _readmeIndex - .searchWords([word], weight: 0.75, limitToIds: wordScopedPackages); - final score = Score.max([nameScore, descr, readme]); - coreScores.add(score); - // don't update if the query is single-word - if (words.length > 1) { - wordScopedPackages = score.keys.toSet(); - if (wordScopedPackages.isEmpty) { - break; - } - } + final wordScore = + _packageNameIndex.searchWord(word, filterOnNonZeros: coreScores); + _descrIndex.searchAndAccumulate(word, + weight: 0.90.toDouble(), score: wordScore); + _readmeIndex.searchAndAccumulate(word, + weight: 0.75.toDouble(), score: wordScore); + coreScores.multiplyAllFrom(wordScore); } - final core = Score.multiply(coreScores); + final core = coreScores.toScore(); var symbolPages = Score.empty; if (!checkAborted()) { @@ -495,16 +491,13 @@ class _TextResults { @visibleForTesting class PackageNameIndex { final List _packageNames; - late final Map _data; + late final List<_PkgNameData> _data; PackageNameIndex(this._packageNames) { - _data = Map.fromEntries(_packageNames.map((package) { + _data = _packageNames.map((package) { final collapsed = _collapseName(package); - return MapEntry( - package, - _PkgNameData(collapsed, trigrams(collapsed).toSet()), - ); - })); + return _PkgNameData(collapsed, trigrams(collapsed).toSet()); + }).toList(); } /// Maps package name to a reduced form of the name: @@ -515,45 +508,42 @@ class PackageNameIndex { /// Search [text] and return the matching packages with scores. @visibleForTesting Score search(String text) { - Score? score; + IndexedScore? score; for (final w in splitForQuery(text)) { - final s = searchWord(w, filterOnPackages: score?.keys); + final s = searchWord(w, filterOnNonZeros: score); if (score == null) { score = s; } else { - // Note: on one hand, it is inefficient to multiply the [Score] on each - // iteration. However, (1) this is only happening in test, (2) it may be - // better for the next iteration to work on a more limited `filterOnPackages`, - // and (3) it will be updated to a more efficient in-place update (#8225). - score = Score.multiply([score, s]); + score.multiplyAllFrom(s); } } - return score ?? Score.empty; + return score?.toScore() ?? Score.empty; } /// Search using the parsed [word] and return the matching packages with scores - /// as a new [Score] instance. + /// as a new [IndexedScore] instance. /// - /// When [filterOnPackages] is present, only the names present are evaluated. - Score searchWord( + /// When [filterOnNonZeros] is present, only the indexes with an already + /// non-zero value are evaluated. + IndexedScore searchWord( String word, { - Iterable? filterOnPackages, + IndexedScore? filterOnNonZeros, }) { - final pkgNamesToCheck = filterOnPackages ?? _packageNames; - final values = {}; + final score = IndexedScore(_packageNames); final singularWord = word.length <= 3 || !word.endsWith('s') ? word : word.substring(0, word.length - 1); final collapsedWord = _collapseName(singularWord); final parts = collapsedWord.length <= 3 ? [collapsedWord] : trigrams(collapsedWord); - for (final pkg in pkgNamesToCheck) { - final entry = _data[pkg]; - if (entry == null) { + for (var i = 0; i < _data.length; i++) { + if (filterOnNonZeros?.isNotPositive(i) ?? false) { continue; } + + final entry = _data[i]; if (entry.collapsed.contains(collapsedWord)) { - values[pkg] = 1.0; + score.setValue(i, 1.0); continue; } var matched = 0; @@ -567,11 +557,11 @@ class PackageNameIndex { if (matched > 0) { final v = matched / parts.length; if (v >= 0.5) { - values[pkg] = v; + score.setValue(i, v); } } } - return Score(values); + return score; } } diff --git a/app/lib/search/token_index.dart b/app/lib/search/token_index.dart index 95d42a46ba..b0f69ccb48 100644 --- a/app/lib/search/token_index.dart +++ b/app/lib/search/token_index.dart @@ -267,6 +267,24 @@ class TokenIndex { } return Score.multiply(scores); } + + /// Searches the index with [word] and stores the results in [score], using + /// accumulation operation on the already existing values. + void searchAndAccumulate( + String word, { + double weight = 1.0, + required IndexedScore score, + }) { + assert(score.length == _length); + final tokenMatch = lookupTokens(word); + for (final token in tokenMatch.tokens) { + final matchWeight = tokenMatch[token]!; + final tokenWeight = _inverseIds[token]!; + for (final e in tokenWeight.entries) { + score.setValueMaxOf(e.key, matchWeight * e.value * weight); + } + } + } } /// Mutable score list that can accessed via integer index. @@ -285,6 +303,10 @@ class IndexedScore { return _values[index] <= 0.0; } + void setValue(int index, double value) { + _values[index] = value; + } + void setValueMaxOf(int index, double value) { _values[index] = math.max(_values[index], value); } @@ -307,6 +329,15 @@ class IndexedScore { } } + void multiplyAllFrom(IndexedScore other) { + assert(other._values.length == _values.length); + for (var i = 0; i < _values.length; i++) { + if (_values[i] == 0.0) continue; + final v = other._values[i]; + _values[i] = v == 0.0 ? 0.0 : _values[i] * v; + } + } + Set toKeySet() { final set = {}; for (var i = 0; i < _values.length; i++) { @@ -317,4 +348,15 @@ class IndexedScore { } return set; } + + Score toScore() { + final map = {}; + for (var i = 0; i < _values.length; i++) { + final v = _values[i]; + if (v > 0.0) { + map[_keys[i]] = v; + } + } + return Score._(map); + } } From f1ab69e37a5f396115dcb702ceccc82db0bc6d88 Mon Sep 17 00:00:00 2001 From: Istvan Soos Date: Thu, 7 Nov 2024 10:18:19 +0100 Subject: [PATCH 2/2] Changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d30a01b5ca..7df7830a8e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ Important changes to data models, configuration, and migrations between each AppEngine version, listed here to ease deployment and troubleshooting. ## Next Release (replace with git tag when deployed) + * `search` uses the `IndexedScore` to reduce memory allocations. ## `20241031t095600-all` * Bumped runtimeVersion to `2024.10.29`.