From 60d8db0724063d990c78c4e60f7d7e55b44617a6 Mon Sep 17 00:00:00 2001 From: Istvan Soos Date: Thu, 17 Oct 2024 08:59:40 +0200 Subject: [PATCH 1/3] Refactor TokenIndex initialization. --- app/lib/search/mem_index.dart | 93 ++++++++++++++++----------- app/lib/search/sdk_mem_index.dart | 10 ++- app/lib/search/token_index.dart | 45 +++++++------ app/test/search/token_index_test.dart | 51 +++++++-------- 4 files changed, 112 insertions(+), 87 deletions(-) diff --git a/app/lib/search/mem_index.dart b/app/lib/search/mem_index.dart index 68e5b765ff..6e72f70e8e 100644 --- a/app/lib/search/mem_index.dart +++ b/app/lib/search/mem_index.dart @@ -19,11 +19,12 @@ final _logger = Logger('search.mem_index'); final _textSearchTimeout = Duration(milliseconds: 500); class InMemoryPackageIndex { - final Map _packages = {}; + final List _documents; + final _documentsByName = {}; final _packageNameIndex = PackageNameIndex(); - final TokenIndex _descrIndex = TokenIndex(); - final TokenIndex _readmeIndex = TokenIndex(); - final TokenIndex _apiSymbolIndex = TokenIndex(); + late final TokenIndex _descrIndex; + late final TokenIndex _readmeIndex; + late final TokenIndex _apiSymbolIndex; /// Adjusted score takes the overall score and transforms /// it linearly into the [0.4-1.0] range. 
@@ -39,13 +40,38 @@ class InMemoryPackageIndex { InMemoryPackageIndex({ required Iterable documents, - }) { - for (final doc in documents) { - _addPackage(doc); + }) : _documents = [...documents] { + final apiDocPageKeys = []; + final apiDocPageValues = []; + for (final doc in _documents) { + _documentsByName[doc.package] = doc; + _packageNameIndex.add(doc.package); + + final apiDocPages = doc.apiDocPages; + if (apiDocPages != null) { + for (final page in apiDocPages) { + if (page.symbols != null && page.symbols!.isNotEmpty) { + apiDocPageKeys.add(_apiDocPageId(doc.package, page)); + apiDocPageValues.add(page.symbols!.join(' ')); + } + } + } } + + final packageKeys = _documents.map((d) => d.package).toList(); + _descrIndex = TokenIndex( + packageKeys, + _documents.map((d) => d.description).toList(), + ); + _readmeIndex = TokenIndex( + packageKeys, + _documents.map((d) => d.readme).toList(), + ); + _apiSymbolIndex = TokenIndex(apiDocPageKeys, apiDocPageValues); + // update like scores only if they were not set (should happen only in local tests) - if (_packages.values.any((e) => e.likeScore == null)) { - _packages.values.updateLikeScores(); + if (_documentsByName.values.any((e) => e.likeScore == null)) { + _documentsByName.values.updateLikeScores(); } _updateOverallScores(); _lastUpdated = clock.now().toUtc(); @@ -64,34 +90,22 @@ class InMemoryPackageIndex { IndexInfo indexInfo() { return IndexInfo( isReady: true, - packageCount: _packages.length, + packageCount: _documentsByName.length, lastUpdated: _lastUpdated, ); } - void _addPackage(PackageDocument doc) { - _packages[doc.package] = doc; - _packageNameIndex.add(doc.package); - _descrIndex.add(doc.package, doc.description); - _readmeIndex.add(doc.package, doc.readme); - - for (final ApiDocPage page in doc.apiDocPages ?? 
const []) { - final pageId = _apiDocPageId(doc.package, page); - if (page.symbols != null && page.symbols!.isNotEmpty) { - _apiSymbolIndex.add(pageId, page.symbols!.join(' ')); - } - } - } - PackageSearchResult search(ServiceSearchQuery query) { - final packages = Set.of(_packages.keys); + final packages = Set.of(_documentsByName.keys); // filter on package prefix if (query.parsedQuery.packagePrefix != null) { final String prefix = query.parsedQuery.packagePrefix!.toLowerCase(); packages.removeWhere( - (package) => - !_packages[package]!.package.toLowerCase().startsWith(prefix), + (package) => !_documentsByName[package]! + .package + .toLowerCase() + .startsWith(prefix), ); } @@ -99,14 +113,14 @@ class InMemoryPackageIndex { final combinedTagsPredicate = query.tagsPredicate.appendPredicate(query.parsedQuery.tagsPredicate); if (combinedTagsPredicate.isNotEmpty) { - packages.retainWhere((package) => - combinedTagsPredicate.matches(_packages[package]!.tagsForLookup)); + packages.retainWhere((package) => combinedTagsPredicate + .matches(_documentsByName[package]!.tagsForLookup)); } // filter on dependency if (query.parsedQuery.hasAnyDependency) { packages.removeWhere((package) { - final doc = _packages[package]!; + final doc = _documentsByName[package]!; if (doc.dependencies.isEmpty) return true; for (final dependency in query.parsedQuery.allDependencies) { if (!doc.dependencies.containsKey(dependency)) return true; @@ -122,7 +136,7 @@ class InMemoryPackageIndex { // filter on points if (query.minPoints != null && query.minPoints! 
> 0) { packages.removeWhere((package) { - final doc = _packages[package]!; + final doc = _documentsByName[package]!; return doc.grantedPoints < query.minPoints!; }); } @@ -132,7 +146,7 @@ class InMemoryPackageIndex { if (updatedDuration != null && updatedDuration > Duration.zero) { final now = clock.now(); packages.removeWhere((package) { - final doc = _packages[package]!; + final doc = _documentsByName[package]!; final diff = now.difference(doc.updated); return diff > updatedDuration; }); @@ -163,7 +177,8 @@ class InMemoryPackageIndex { .map((key, value) => value * _adjustedOverallScores[key]!); // If the search hits have an exact name match, we move it to the front of the result list. final parsedQueryText = query.parsedQuery.text; - if (parsedQueryText != null && _packages.containsKey(parsedQueryText)) { + if (parsedQueryText != null && + _documentsByName.containsKey(parsedQueryText)) { nameMatches = [parsedQueryText]; } packageHits = _rankWithValues(overallScore.getValues()); @@ -215,7 +230,7 @@ class InMemoryPackageIndex { /// Update the overall score both on [PackageDocument] and in the [_adjustedOverallScores] map. void _updateOverallScores() { - for (final doc in _packages.values) { + for (final doc in _documentsByName.values) { final downloadScore = doc.popularityScore ?? 0.0; final likeScore = doc.likeScore ?? 
0.0; final popularity = (downloadScore + likeScore) / 2; @@ -316,7 +331,7 @@ class InMemoryPackageIndex { if (!aborted && phrases.isNotEmpty) { final matched = {}; for (final package in score.getKeys()) { - final doc = _packages[package]!; + final doc = _documentsByName[package]!; final bool matchedAllPhrases = phrases.every((phrase) => doc.package.contains(phrase) || doc.description!.contains(phrase) || @@ -341,7 +356,8 @@ class InMemoryPackageIndex { final int scoreCompare = -a.score!.compareTo(b.score!); if (scoreCompare != 0) return scoreCompare; // if two packages got the same score, order by last updated - return _compareUpdated(_packages[a.package]!, _packages[b.package]!); + return _compareUpdated( + _documentsByName[a.package]!, _documentsByName[b.package]!); }); return list; } @@ -350,11 +366,12 @@ class InMemoryPackageIndex { int Function(PackageDocument a, PackageDocument b) compare, { double Function(PackageDocument doc)? score, }) { - final list = _packages.values + final list = _documentsByName.values .map((doc) => PackageHit( package: doc.package, score: score == null ? null : score(doc))) .toList(); - list.sort((a, b) => compare(_packages[a.package]!, _packages[b.package]!)); + list.sort((a, b) => + compare(_documentsByName[a.package]!, _documentsByName[b.package]!)); return list; } diff --git a/app/lib/search/sdk_mem_index.dart b/app/lib/search/sdk_mem_index.dart index 949fbdc539..de7248af01 100644 --- a/app/lib/search/sdk_mem_index.dart +++ b/app/lib/search/sdk_mem_index.dart @@ -80,6 +80,7 @@ class SdkMemIndex { DartdocIndex index, { Set? 
allowedLibraries, }) async { + final textsPerLibrary = >{}; for (final f in index.entries) { final library = f.qualifiedName?.split('.').first; if (library == null) continue; @@ -92,10 +93,15 @@ class SdkMemIndex { if (f.isLibrary) { _baseUriPerLibrary[library] = _baseUri.resolve(f.href!).toString(); } - final tokens = _tokensPerLibrary.putIfAbsent(library, () => TokenIndex()); final text = f.qualifiedName?.replaceAll('.', ' ').replaceAll(':', ' '); - tokens.add(f.href!, text); + if (text != null && text.isNotEmpty) { + final texts = textsPerLibrary.putIfAbsent(library, () => {}); + texts[f.href!] = text; + } + } + for (final e in textsPerLibrary.entries) { + _tokensPerLibrary[e.key] = TokenIndex.fromMap(e.value); } } diff --git a/app/lib/search/token_index.dart b/app/lib/search/token_index.dart index 36df845317..ccf904c808 100644 --- a/app/lib/search/token_index.dart +++ b/app/lib/search/token_index.dart @@ -161,25 +161,34 @@ class TokenIndex { /// {id: size} map to store a value representative to the document length final _docSizes = {}; - /// The number of tokens stored in the index. - int get tokenCount => _inverseIds.length; - - int get documentCount => _docSizes.length; - - void add(String id, String? text) { - if (text == null) return; - final tokens = tokenize(text); - if (tokens == null || tokens.isEmpty) { - return; - } - for (final token in tokens.keys) { - final Map weights = - _inverseIds.putIfAbsent(token, () => {}); - weights[id] = math.max(weights[id] ?? 0.0, tokens[token]!); + TokenIndex(List keys, List values) { + assert(keys.length == values.length); + for (var i = 0; i < keys.length; i++) { + final id = keys[i]; + final text = values[i]; + + if (text == null) { + continue; + } + final tokens = tokenize(text); + if (tokens == null || tokens.isEmpty) { + continue; + } + for (final token in tokens.keys) { + final weights = + _inverseIds.putIfAbsent(token, () => {}); + weights[id] = math.max(weights[id] ?? 
0.0, tokens[token]!); + } + // Document size is a highly scaled-down proxy of the length. + final docSize = 1 + math.log(1 + tokens.length) / 100; + _docSizes[id] = docSize; } - // Document size is a highly scaled-down proxy of the length. - final docSize = 1 + math.log(1 + tokens.length) / 100; - _docSizes[id] = docSize; + } + + factory TokenIndex.fromMap(Map map) { + final keys = map.keys.toList(); + final values = map.values.toList(); + return TokenIndex(keys, values); } /// Match the text against the corpus and return the tokens or diff --git a/app/test/search/token_index_test.dart b/app/test/search/token_index_test.dart index dbd8c1f6b6..47f2393597 100644 --- a/app/test/search/token_index_test.dart +++ b/app/test/search/token_index_test.dart @@ -8,7 +8,7 @@ import 'package:test/test.dart'; void main() { group('TokenIndex', () { test('partial token lookup', () { - final index = TokenIndex()..add('x', 'SomeCamelCasedWord and others'); + final index = TokenIndex.fromMap({'x': 'SomeCamelCasedWord and others'}); expect(index.lookupTokens('word').tokenWeights, {'word': 1.0}); expect(index.lookupTokens('OtherCased').tokenWeights, {'cased': closeTo(0.70, 0.01)}); @@ -18,9 +18,10 @@ void main() { }); test('No match', () { - final TokenIndex index = TokenIndex() - ..add('uri://http', 'http') - ..add('uri://http_magic', 'http_magic'); + final TokenIndex index = TokenIndex.fromMap({ + 'uri://http': 'http', + 'uri://http_magic': 'http_magic', + }); expect(index.search('xml'), { // no match for http @@ -29,9 +30,10 @@ void main() { }); test('Scoring exact and partial matches', () { - final TokenIndex index = TokenIndex() - ..add('uri://http', 'http') - ..add('uri://http_magic', 'http_magic'); + final TokenIndex index = TokenIndex.fromMap({ + 'uri://http': 'http', + 'uri://http_magic': 'http_magic', + }); expect(index.search('http'), { 'uri://http': closeTo(0.993, 0.001), 'uri://http_magic': closeTo(0.989, 0.001), @@ -40,10 +42,11 @@ void main() { test('CamelCase indexing', 
() { final String queueText = '.DoubleLinkedQueue()'; - final TokenIndex index = TokenIndex() - ..add('queue', queueText) - ..add('queue_lower', queueText.toLowerCase()) - ..add('unmodifiable', 'CustomUnmodifiableMapBase'); + final TokenIndex index = TokenIndex.fromMap({ + 'queue': queueText, + 'queue_lower': queueText.toLowerCase(), + 'unmodifiable': 'CustomUnmodifiableMapBase', + }); expect(index.search('queue'), { 'queue': closeTo(0.53, 0.01), }); @@ -54,10 +57,11 @@ void main() { }); test('Wierd cases: riak client', () { - final TokenIndex index = TokenIndex() - ..add('uri://cli', 'cli') - ..add('uri://riak_client', 'riak_client') - ..add('uri://teamspeak', 'teamspeak'); + final TokenIndex index = TokenIndex.fromMap({ + 'uri://cli': 'cli', + 'uri://riak_client': 'riak_client', + 'uri://teamspeak': 'teamspeak', + }); expect(index.search('riak'), { 'uri://riak_client': closeTo(0.99, 0.01), @@ -68,24 +72,15 @@ void main() { }); }); - test('Free up memory', () { - final TokenIndex index = TokenIndex(); - expect(index.tokenCount, 0); - index.add('url1', 'text'); - expect(index.tokenCount, 1); - index.add('url2', 'another'); - expect(index.tokenCount, 2); - }); - test('Do not overweight partial matches', () { - final index = TokenIndex()..add('flutter_qr_reader', 'flutter_qr_reader'); + final index = + TokenIndex.fromMap({'flutter_qr_reader': 'flutter_qr_reader'}); final data = index.search('ByteDataReader'); // The partial match should not return more than 0.65 as score. 
expect(data, {'flutter_qr_reader': lessThan(0.65)}); }); test('longer words', () { - final index = TokenIndex(); final names = [ 'location', 'geolocator', @@ -98,9 +93,7 @@ void main() { 'location_picker', 'background_location_updates', ]; - for (final name in names) { - index.add(name, name); - } + final index = TokenIndex.fromMap(Map.fromIterables(names, names)); final match = index.search('location'); // location should be the top value, everything else should be lower final locationValue = match['location']; From 2c55514c7815429b6e77d44398f0b39e8e745a33 Mon Sep 17 00:00:00 2001 From: Istvan Soos Date: Thu, 17 Oct 2024 08:59:40 +0200 Subject: [PATCH 2/3] Better efficiency in TokenIndex. --- app/lib/search/token_index.dart | 63 +++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/app/lib/search/token_index.dart b/app/lib/search/token_index.dart index ccf904c808..df6a6e607e 100644 --- a/app/lib/search/token_index.dart +++ b/app/lib/search/token_index.dart @@ -155,16 +155,21 @@ class TokenMatch { /// Stores a token -> documentId inverted index with weights. class TokenIndex { - /// Maps token Strings to a weighted map of document ids. - final _inverseIds = >{}; + final List _ids; + + /// Maps token Strings to weighted documents (addressed via indexes). 
+ final _inverseIds = >{}; /// {id: size} map to store a value representative to the document length - final _docSizes = {}; + late final List _docWeights; + + late final _length = _docWeights.length; - TokenIndex(List keys, List values) { - assert(keys.length == values.length); - for (var i = 0; i < keys.length; i++) { - final id = keys[i]; + TokenIndex(List ids, List values) : _ids = ids { + assert(ids.length == values.length); + final length = values.length; + _docWeights = List.filled(length, 0.0); + for (var i = 0; i < length; i++) { final text = values[i]; if (text == null) { @@ -175,13 +180,11 @@ class TokenIndex { continue; } for (final token in tokens.keys) { - final weights = - _inverseIds.putIfAbsent(token, () => {}); - weights[id] = math.max(weights[id] ?? 0.0, tokens[token]!); + final weights = _inverseIds.putIfAbsent(token, () => {}); + weights[i] = math.max(weights[i] ?? 0.0, tokens[token]!); } - // Document size is a highly scaled-down proxy of the length. - final docSize = 1 + math.log(1 + tokens.length) / 100; - _docSizes[id] = docSize; + // Document weight is a highly scaled-down proxy of the length. + _docWeights[i] = 1 + math.log(1 + tokens.length) / 100; } } @@ -200,9 +203,8 @@ class TokenIndex { for (final word in splitForIndexing(text)) { final tokens = tokenize(word, isSplit: true) ?? {}; - final present = tokens.keys - .where((token) => (_inverseIds[token]?.length ?? 0) > 0) - .toList(); + final present = + tokens.keys.where((token) => _inverseIds.containsKey(token)).toList(); if (present.isEmpty) { return TokenMatch(); } @@ -228,14 +230,12 @@ class TokenIndex { Map _scoreDocs(TokenMatch tokenMatch, {double weight = 1.0, int wordCount = 1, Set? limitToIds}) { // Summarize the scores for the documents. 
- final docScores = {}; + final docScores = List.filled(_length, 0.0); for (final token in tokenMatch.tokens) { final docWeights = _inverseIds[token]!; for (final e in docWeights.entries) { - if (limitToIds != null && !limitToIds.contains(e.key)) continue; - final double prevValue = docScores[e.key] ?? 0.0; - final double currentValue = tokenMatch[token]! * e.value; - docScores[e.key] = math.max(prevValue, currentValue); + final i = e.key; + docScores[i] = math.max(docScores[i], tokenMatch[token]! * e.value); } } @@ -244,15 +244,24 @@ class TokenIndex { // compensate the formula in order to prevent multiple exponential penalties. final double wordSizeExponent = 1.0 / wordCount; + final result = {}; // post-process match weights - docScores.updateAll((id, docScore) { - var docSize = _docSizes[id]!; + for (var i = 0; i < _length; i++) { + final id = _ids[i]; + final w = docScores[i]; + if (w <= 0.0) { + continue; + } + if (limitToIds != null && !limitToIds.contains(id)) { + continue; + } + var dw = _docWeights[i]; if (wordCount > 1) { - docSize = math.pow(docSize, wordSizeExponent).toDouble(); + dw = math.pow(dw, wordSizeExponent).toDouble(); } - return weight * docScore / docSize; - }); - return docScores; + result[id] = w * weight / dw; + } + return result; } /// Search the index for [text], with a (term-match / document coverage percent) From e8da10138ffd8a7cd35c1e10fb3d333d50021cc2 Mon Sep 17 00:00:00 2001 From: Istvan Soos Date: Thu, 17 Oct 2024 10:20:51 +0200 Subject: [PATCH 3/3] Benchmark tool --- app/bin/tools/search_benchmark.dart | 41 +++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 app/bin/tools/search_benchmark.dart diff --git a/app/bin/tools/search_benchmark.dart b/app/bin/tools/search_benchmark.dart new file mode 100644 index 0000000000..1b4394fb79 --- /dev/null +++ b/app/bin/tools/search_benchmark.dart @@ -0,0 +1,41 @@ +// Copyright (c) 2024, the Dart project authors. Please see the AUTHORS file +// for details. 
All rights reserved. Use of this source code is governed by a +// BSD-style license that can be found in the LICENSE file. + +import 'dart:convert'; +import 'dart:io'; + +import 'package:pub_dev/package/overrides.dart'; +import 'package:pub_dev/search/mem_index.dart'; +import 'package:pub_dev/search/models.dart'; +import 'package:pub_dev/search/search_service.dart'; + +/// Loads a search snapshot and executes queries on it, benchmarking their total time to complete. +Future main(List args) async { + // Assumes that the first argument is a search snapshot file. + final file = File(args.first); + final content = + json.decode(utf8.decode(gzip.decode(await file.readAsBytes()))) + as Map; + final snapshot = SearchSnapshot.fromJson(content); + snapshot.documents! + .removeWhere((packageName, doc) => isSoftRemoved(packageName)); + final index = InMemoryPackageIndex(documents: snapshot.documents!.values); + + // NOTE: please add more queries to this list, especially if there is a performance bottleneck. + final queries = [ + 'json', + 'camera', + 'android camera', + 'sql database', + ]; + + final sw = Stopwatch()..start(); + var count = 0; + for (var i = 0; i < 100; i++) { + index.search(ServiceSearchQuery.parse(query: queries[i % queries.length])); + count++; + } + sw.stop(); + print('${(sw.elapsedMilliseconds / count).toStringAsFixed(2)} ms/request'); +}