41 changes: 41 additions & 0 deletions app/bin/tools/search_benchmark.dart
@@ -0,0 +1,41 @@
// Copyright (c) 2024, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.

import 'dart:convert';
import 'dart:io';

import 'package:pub_dev/package/overrides.dart';
import 'package:pub_dev/search/mem_index.dart';
import 'package:pub_dev/search/models.dart';
import 'package:pub_dev/search/search_service.dart';

/// Loads a search snapshot and executes queries on it, benchmarking the average time per query.
Future<void> main(List<String> args) async {
// Assumes that the first argument is a search snapshot file.
final file = File(args.first);
final content =
json.decode(utf8.decode(gzip.decode(await file.readAsBytes())))
as Map<String, Object?>;
final snapshot = SearchSnapshot.fromJson(content);
snapshot.documents!
.removeWhere((packageName, doc) => isSoftRemoved(packageName));
final index = InMemoryPackageIndex(documents: snapshot.documents!.values);

// NOTE: please add more queries to this list, especially if there is a performance bottleneck.
final queries = [
'json',
'camera',
'android camera',
'sql database',
];

final sw = Stopwatch()..start();
var count = 0;
for (var i = 0; i < 100; i++) {
index.search(ServiceSearchQuery.parse(query: queries[i % queries.length]));
count++;
}
sw.stop();
print('${(sw.elapsedMilliseconds / count).toStringAsFixed(2)} ms/request');
}
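
A minimal sketch of running the tool, assuming the working directory is app/ and a gzipped JSON search snapshot has been downloaded to search-snapshot.json.gz (hypothetical path):

dart bin/tools/search_benchmark.dart search-snapshot.json.gz

The tool prints the mean latency over all iterations in the format produced above, e.g. 12.34 ms/request.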
93 changes: 55 additions & 38 deletions app/lib/search/mem_index.dart
@@ -19,11 +19,12 @@ final _logger = Logger('search.mem_index');
final _textSearchTimeout = Duration(milliseconds: 500);

class InMemoryPackageIndex {
final Map<String, PackageDocument> _packages = <String, PackageDocument>{};
final List<PackageDocument> _documents;
final _documentsByName = <String, PackageDocument>{};
final _packageNameIndex = PackageNameIndex();
final TokenIndex _descrIndex = TokenIndex();
final TokenIndex _readmeIndex = TokenIndex();
final TokenIndex _apiSymbolIndex = TokenIndex();
late final TokenIndex _descrIndex;
late final TokenIndex _readmeIndex;
late final TokenIndex _apiSymbolIndex;

/// Adjusted score takes the overall score and transforms
/// it linearly into the [0.4-1.0] range.
@@ -39,13 +40,38 @@ class InMemoryPackageIndex {

InMemoryPackageIndex({
required Iterable<PackageDocument> documents,
}) {
for (final doc in documents) {
_addPackage(doc);
}) : _documents = [...documents] {
final apiDocPageKeys = <String>[];
final apiDocPageValues = <String>[];
for (final doc in _documents) {
_documentsByName[doc.package] = doc;
_packageNameIndex.add(doc.package);

final apiDocPages = doc.apiDocPages;
if (apiDocPages != null) {
for (final page in apiDocPages) {
if (page.symbols != null && page.symbols!.isNotEmpty) {
apiDocPageKeys.add(_apiDocPageId(doc.package, page));
apiDocPageValues.add(page.symbols!.join(' '));
}
}
}
}

final packageKeys = _documents.map((d) => d.package).toList();
_descrIndex = TokenIndex(
packageKeys,
_documents.map((d) => d.description).toList(),
);
_readmeIndex = TokenIndex(
packageKeys,
_documents.map((d) => d.readme).toList(),
);
_apiSymbolIndex = TokenIndex(apiDocPageKeys, apiDocPageValues);

// update like scores only if they were not set (should happen only in local tests)
if (_packages.values.any((e) => e.likeScore == null)) {
_packages.values.updateLikeScores();
if (_documentsByName.values.any((e) => e.likeScore == null)) {
_documentsByName.values.updateLikeScores();
}
_updateOverallScores();
_lastUpdated = clock.now().toUtc();
@@ -64,49 +90,37 @@ class InMemoryPackageIndex {
IndexInfo indexInfo() {
return IndexInfo(
isReady: true,
packageCount: _packages.length,
packageCount: _documentsByName.length,
lastUpdated: _lastUpdated,
);
}

void _addPackage(PackageDocument doc) {
_packages[doc.package] = doc;
_packageNameIndex.add(doc.package);
_descrIndex.add(doc.package, doc.description);
_readmeIndex.add(doc.package, doc.readme);

for (final ApiDocPage page in doc.apiDocPages ?? const []) {
final pageId = _apiDocPageId(doc.package, page);
if (page.symbols != null && page.symbols!.isNotEmpty) {
_apiSymbolIndex.add(pageId, page.symbols!.join(' '));
}
}
}

PackageSearchResult search(ServiceSearchQuery query) {
final packages = Set<String>.of(_packages.keys);
final packages = Set<String>.of(_documentsByName.keys);

// filter on package prefix
if (query.parsedQuery.packagePrefix != null) {
final String prefix = query.parsedQuery.packagePrefix!.toLowerCase();
packages.removeWhere(
(package) =>
!_packages[package]!.package.toLowerCase().startsWith(prefix),
(package) => !_documentsByName[package]!
.package
.toLowerCase()
.startsWith(prefix),
);
}

// filter on tags
final combinedTagsPredicate =
query.tagsPredicate.appendPredicate(query.parsedQuery.tagsPredicate);
if (combinedTagsPredicate.isNotEmpty) {
packages.retainWhere((package) =>
combinedTagsPredicate.matches(_packages[package]!.tagsForLookup));
packages.retainWhere((package) => combinedTagsPredicate
.matches(_documentsByName[package]!.tagsForLookup));
}

// filter on dependency
if (query.parsedQuery.hasAnyDependency) {
packages.removeWhere((package) {
final doc = _packages[package]!;
final doc = _documentsByName[package]!;
if (doc.dependencies.isEmpty) return true;
for (final dependency in query.parsedQuery.allDependencies) {
if (!doc.dependencies.containsKey(dependency)) return true;
@@ -122,7 +136,7 @@ class InMemoryPackageIndex {
// filter on points
if (query.minPoints != null && query.minPoints! > 0) {
packages.removeWhere((package) {
final doc = _packages[package]!;
final doc = _documentsByName[package]!;
return doc.grantedPoints < query.minPoints!;
});
}
@@ -132,7 +146,7 @@ class InMemoryPackageIndex {
if (updatedDuration != null && updatedDuration > Duration.zero) {
final now = clock.now();
packages.removeWhere((package) {
final doc = _packages[package]!;
final doc = _documentsByName[package]!;
final diff = now.difference(doc.updated);
return diff > updatedDuration;
});
@@ -163,7 +177,8 @@ class InMemoryPackageIndex {
.map((key, value) => value * _adjustedOverallScores[key]!);
// If the search hits have an exact name match, we move it to the front of the result list.
final parsedQueryText = query.parsedQuery.text;
if (parsedQueryText != null && _packages.containsKey(parsedQueryText)) {
if (parsedQueryText != null &&
_documentsByName.containsKey(parsedQueryText)) {
nameMatches = <String>[parsedQueryText];
}
packageHits = _rankWithValues(overallScore.getValues());
@@ -215,7 +230,7 @@ class InMemoryPackageIndex {

/// Update the overall score both on [PackageDocument] and in the [_adjustedOverallScores] map.
void _updateOverallScores() {
for (final doc in _packages.values) {
for (final doc in _documentsByName.values) {
final downloadScore = doc.popularityScore ?? 0.0;
final likeScore = doc.likeScore ?? 0.0;
final popularity = (downloadScore + likeScore) / 2;
@@ -316,7 +331,7 @@ class InMemoryPackageIndex {
if (!aborted && phrases.isNotEmpty) {
final matched = <String, double>{};
for (final package in score.getKeys()) {
final doc = _packages[package]!;
final doc = _documentsByName[package]!;
final bool matchedAllPhrases = phrases.every((phrase) =>
doc.package.contains(phrase) ||
doc.description!.contains(phrase) ||
@@ -341,7 +356,8 @@ class InMemoryPackageIndex {
final int scoreCompare = -a.score!.compareTo(b.score!);
if (scoreCompare != 0) return scoreCompare;
// if two packages got the same score, order by last updated
return _compareUpdated(_packages[a.package]!, _packages[b.package]!);
return _compareUpdated(
_documentsByName[a.package]!, _documentsByName[b.package]!);
});
return list;
}
@@ -350,11 +366,12 @@ class InMemoryPackageIndex {
int Function(PackageDocument a, PackageDocument b) compare, {
double Function(PackageDocument doc)? score,
}) {
final list = _packages.values
final list = _documentsByName.values
.map((doc) => PackageHit(
package: doc.package, score: score == null ? null : score(doc)))
.toList();
list.sort((a, b) => compare(_packages[a.package]!, _packages[b.package]!));
list.sort((a, b) =>
compare(_documentsByName[a.package]!, _documentsByName[b.package]!));
return list;
}

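The changes above replace the incremental _addPackage/TokenIndex.add path with a single bulk build: package keys and texts are collected first, then each TokenIndex is constructed in one pass. A minimal sketch of the new pattern, assuming the package's TokenIndex and PackageDocument types are imported (buildDescriptionIndex is an illustrative helper, not part of the PR):

// Illustrative sketch: bulk-building a TokenIndex from parallel lists,
// mirroring the InMemoryPackageIndex constructor above.
TokenIndex buildDescriptionIndex(List<PackageDocument> documents) {
  final keys = documents.map((d) => d.package).toList();
  final values = documents.map((d) => d.description).toList();
  // The ids and values lists must stay index-aligned; the constructor
  // asserts that their lengths match.
  return TokenIndex(keys, values);
}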
10 changes: 8 additions & 2 deletions app/lib/search/sdk_mem_index.dart
@@ -80,6 +80,7 @@ class SdkMemIndex {
DartdocIndex index, {
Set<String>? allowedLibraries,
}) async {
final textsPerLibrary = <String, Map<String, String>>{};
for (final f in index.entries) {
final library = f.qualifiedName?.split('.').first;
if (library == null) continue;
@@ -92,10 +93,15 @@
if (f.isLibrary) {
_baseUriPerLibrary[library] = _baseUri.resolve(f.href!).toString();
}
final tokens = _tokensPerLibrary.putIfAbsent(library, () => TokenIndex());

final text = f.qualifiedName?.replaceAll('.', ' ').replaceAll(':', ' ');
tokens.add(f.href!, text);
if (text != null && text.isNotEmpty) {
final texts = textsPerLibrary.putIfAbsent(library, () => {});
texts[f.href!] = text;
}
}
for (final e in textsPerLibrary.entries) {
_tokensPerLibrary[e.key] = TokenIndex.fromMap(e.value);
}
}

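TokenIndex.fromMap (added in token_index.dart below) is a thin wrapper that splits a map into the parallel key/value lists the constructor expects, which is what lets this code accumulate href-to-text pairs per library and build each index in one step. A sketch of the equivalent call, with hypothetical hrefs and texts:

// Hypothetical per-library input collected by the loop above.
final texts = <String, String>{
  'dart-async/Future-class.html': 'dart async Future',
  'dart-async/Stream-class.html': 'dart async Stream',
};
// Equivalent to TokenIndex(texts.keys.toList(), texts.values.toList()).
final libraryIndex = TokenIndex.fromMap(texts);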
84 changes: 51 additions & 33 deletions app/lib/search/token_index.dart
@@ -155,31 +155,43 @@ class TokenMatch {

/// Stores a token -> documentId inverted index with weights.
class TokenIndex {
/// Maps token Strings to a weighted map of document ids.
final _inverseIds = <String, Map<String, double>>{};
final List<String> _ids;

/// Maps token Strings to weighted documents (addressed via their list index).
final _inverseIds = <String, Map<int, double>>{};

/// {id: size} map to store a value representative to the document length
final _docSizes = <String, double>{};
late final List<double> _docWeights;

/// The number of tokens stored in the index.
int get tokenCount => _inverseIds.length;
late final _length = _docWeights.length;

int get documentCount => _docSizes.length;
TokenIndex(List<String> ids, List<String?> values) : _ids = ids {
assert(ids.length == values.length);
final length = values.length;
_docWeights = List<double>.filled(length, 0.0);
for (var i = 0; i < length; i++) {
final text = values[i];

void add(String id, String? text) {
if (text == null) return;
final tokens = tokenize(text);
if (tokens == null || tokens.isEmpty) {
return;
}
for (final token in tokens.keys) {
final Map<String, double> weights =
_inverseIds.putIfAbsent(token, () => <String, double>{});
weights[id] = math.max(weights[id] ?? 0.0, tokens[token]!);
if (text == null) {
continue;
}
final tokens = tokenize(text);
if (tokens == null || tokens.isEmpty) {
continue;
}
for (final token in tokens.keys) {
final weights = _inverseIds.putIfAbsent(token, () => {});
weights[i] = math.max(weights[i] ?? 0.0, tokens[token]!);
}
// Document weight is a highly scaled-down proxy of the length.
_docWeights[i] = 1 + math.log(1 + tokens.length) / 100;
}
// Document size is a highly scaled-down proxy of the length.
final docSize = 1 + math.log(1 + tokens.length) / 100;
_docSizes[id] = docSize;
}

factory TokenIndex.fromMap(Map<String, String> map) {
final keys = map.keys.toList();
final values = map.values.toList();
return TokenIndex(keys, values);
}

/// Match the text against the corpus and return the tokens or
@@ -191,9 +203,8 @@ class TokenIndex {
for (final word in splitForIndexing(text)) {
final tokens = tokenize(word, isSplit: true) ?? {};

final present = tokens.keys
.where((token) => (_inverseIds[token]?.length ?? 0) > 0)
.toList();
final present =
tokens.keys.where((token) => _inverseIds.containsKey(token)).toList();
if (present.isEmpty) {
return TokenMatch();
}
@@ -219,14 +230,12 @@ class TokenIndex {
Map<String, double> _scoreDocs(TokenMatch tokenMatch,
{double weight = 1.0, int wordCount = 1, Set<String>? limitToIds}) {
// Summarize the scores for the documents.
final docScores = <String, double>{};
final docScores = List<double>.filled(_length, 0.0);
for (final token in tokenMatch.tokens) {
final docWeights = _inverseIds[token]!;
for (final e in docWeights.entries) {
if (limitToIds != null && !limitToIds.contains(e.key)) continue;
final double prevValue = docScores[e.key] ?? 0.0;
final double currentValue = tokenMatch[token]! * e.value;
docScores[e.key] = math.max(prevValue, currentValue);
final i = e.key;
docScores[i] = math.max(docScores[i], tokenMatch[token]! * e.value);
}
}

@@ -235,15 +244,24 @@ class TokenIndex {
// compensate the formula in order to prevent multiple exponential penalties.
final double wordSizeExponent = 1.0 / wordCount;

final result = <String, double>{};
// post-process match weights
docScores.updateAll((id, docScore) {
var docSize = _docSizes[id]!;
for (var i = 0; i < _length; i++) {
final id = _ids[i];
final w = docScores[i];
if (w <= 0.0) {
continue;
}
if (limitToIds != null && !limitToIds.contains(id)) {
continue;
}
var dw = _docWeights[i];
if (wordCount > 1) {
docSize = math.pow(docSize, wordSizeExponent).toDouble();
dw = math.pow(dw, wordSizeExponent).toDouble();
}
return weight * docScore / docSize;
});
return docScores;
result[id] = w * weight / dw;
}
return result;
}

/// Search the index for [text], with a (term-match / document coverage percent)
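For reference, the constructor above stores a per-document weight of 1 + ln(1 + tokenCount) / 100, and _scoreDocs divides each accumulated score by that weight (raised to 1/wordCount for multi-word queries) before applying the external weight. A worked sketch of the dampening, with an assumed token count:

import 'dart:math' as math;

void main() {
  // Assumed: a document that tokenizes into 50 tokens.
  final docWeight = 1 + math.log(1 + 50) / 100; // ~1.039
  // An accumulated match score of 0.8 on a two-word query:
  final dampened = 0.8 / math.pow(docWeight, 1.0 / 2);
  print(dampened); // ~0.785: longer documents are penalized slightly.
}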