From 60d8db0724063d990c78c4e60f7d7e55b44617a6 Mon Sep 17 00:00:00 2001
From: Istvan Soos <istvan.soos@gmail.com>
Date: Thu, 17 Oct 2024 08:59:40 +0200
Subject: [PATCH 1/3] Refactor TokenIndex initialization.

---
 app/lib/search/mem_index.dart         | 93 ++++++++++++++++-----------
 app/lib/search/sdk_mem_index.dart     | 10 ++-
 app/lib/search/token_index.dart       | 45 +++++++------
 app/test/search/token_index_test.dart | 51 +++++++--------
 4 files changed, 112 insertions(+), 87 deletions(-)
diff --git a/app/lib/search/mem_index.dart b/app/lib/search/mem_index.dart
index 68e5b765ff..6e72f70e8e 100644
--- a/app/lib/search/mem_index.dart
+++ b/app/lib/search/mem_index.dart
@@ -19,11 +19,12 @@ final _logger = Logger('search.mem_index');
 final _textSearchTimeout = Duration(milliseconds: 500);
 
 class InMemoryPackageIndex {
-  final Map<String, PackageDocument> _packages = <String, PackageDocument>{};
+  final List<PackageDocument> _documents;
+  final _documentsByName = <String, PackageDocument>{};
   final _packageNameIndex = PackageNameIndex();
-  final TokenIndex _descrIndex = TokenIndex();
-  final TokenIndex _readmeIndex = TokenIndex();
-  final TokenIndex _apiSymbolIndex = TokenIndex();
+  late final TokenIndex _descrIndex;
+  late final TokenIndex _readmeIndex;
+  late final TokenIndex _apiSymbolIndex;
 
   /// Adjusted score takes the overall score and transforms
   /// it linearly into the [0.4-1.0] range.
@@ -39,13 +40,38 @@ class InMemoryPackageIndex {
 
   InMemoryPackageIndex({
     required Iterable<PackageDocument> documents,
-  }) {
-    for (final doc in documents) {
-      _addPackage(doc);
+  }) : _documents = [...documents] {
+    final apiDocPageKeys = <String>[];
+    final apiDocPageValues = <String>[];
+    for (final doc in _documents) {
+      _documentsByName[doc.package] = doc;
+      _packageNameIndex.add(doc.package);
+
+      final apiDocPages = doc.apiDocPages;
+      if (apiDocPages != null) {
+        for (final page in apiDocPages) {
+          if (page.symbols != null && page.symbols!.isNotEmpty) {
+            apiDocPageKeys.add(_apiDocPageId(doc.package, page));
+            apiDocPageValues.add(page.symbols!.join(' '));
+          }
+        }
+      }
     }
+
+    final packageKeys = _documents.map((d) => d.package).toList();
+    _descrIndex = TokenIndex(
+      packageKeys,
+      _documents.map((d) => d.description).toList(),
+    );
+    _readmeIndex = TokenIndex(
+      packageKeys,
+      _documents.map((d) => d.readme).toList(),
+    );
+    _apiSymbolIndex = TokenIndex(apiDocPageKeys, apiDocPageValues);
+
     // update like scores only if they were not set (should happen only in local tests)
-    if (_packages.values.any((e) => e.likeScore == null)) {
-      _packages.values.updateLikeScores();
+    if (_documentsByName.values.any((e) => e.likeScore == null)) {
+      _documentsByName.values.updateLikeScores();
     }
     _updateOverallScores();
     _lastUpdated = clock.now().toUtc();
@@ -64,34 +90,22 @@ class InMemoryPackageIndex {
   IndexInfo indexInfo() {
     return IndexInfo(
       isReady: true,
-      packageCount: _packages.length,
+      packageCount: _documentsByName.length,
       lastUpdated: _lastUpdated,
     );
   }
 
-  void _addPackage(PackageDocument doc) {
-    _packages[doc.package] = doc;
-    _packageNameIndex.add(doc.package);
-    _descrIndex.add(doc.package, doc.description);
-    _readmeIndex.add(doc.package, doc.readme);
-
-    for (final ApiDocPage page in doc.apiDocPages ?? const []) {
-      final pageId = _apiDocPageId(doc.package, page);
-      if (page.symbols != null && page.symbols!.isNotEmpty) {
-        _apiSymbolIndex.add(pageId, page.symbols!.join(' '));
-      }
-    }
-  }
-
   PackageSearchResult search(ServiceSearchQuery query) {
-    final packages = Set<String>.of(_packages.keys);
+    final packages = Set<String>.of(_documentsByName.keys);
 
     // filter on package prefix
     if (query.parsedQuery.packagePrefix != null) {
       final String prefix = query.parsedQuery.packagePrefix!.toLowerCase();
       packages.removeWhere(
-        (package) =>
-            !_packages[package]!.package.toLowerCase().startsWith(prefix),
+        (package) => !_documentsByName[package]!
+            .package
+            .toLowerCase()
+            .startsWith(prefix),
       );
     }
 
@@ -99,14 +113,14 @@ class InMemoryPackageIndex {
     final combinedTagsPredicate =
         query.tagsPredicate.appendPredicate(query.parsedQuery.tagsPredicate);
     if (combinedTagsPredicate.isNotEmpty) {
-      packages.retainWhere((package) =>
-          combinedTagsPredicate.matches(_packages[package]!.tagsForLookup));
+      packages.retainWhere((package) => combinedTagsPredicate
+          .matches(_documentsByName[package]!.tagsForLookup));
     }
 
     // filter on dependency
     if (query.parsedQuery.hasAnyDependency) {
       packages.removeWhere((package) {
-        final doc = _packages[package]!;
+        final doc = _documentsByName[package]!;
         if (doc.dependencies.isEmpty) return true;
         for (final dependency in query.parsedQuery.allDependencies) {
           if (!doc.dependencies.containsKey(dependency)) return true;
@@ -122,7 +136,7 @@ class InMemoryPackageIndex {
     // filter on points
     if (query.minPoints != null && query.minPoints! > 0) {
       packages.removeWhere((package) {
-        final doc = _packages[package]!;
+        final doc = _documentsByName[package]!;
         return doc.grantedPoints < query.minPoints!;
       });
     }
@@ -132,7 +146,7 @@ class InMemoryPackageIndex {
     if (updatedDuration != null && updatedDuration > Duration.zero) {
       final now = clock.now();
       packages.removeWhere((package) {
-        final doc = _packages[package]!;
+        final doc = _documentsByName[package]!;
         final diff = now.difference(doc.updated);
         return diff > updatedDuration;
       });
@@ -163,7 +177,8 @@ class InMemoryPackageIndex {
             .map((key, value) => value * _adjustedOverallScores[key]!);
         // If the search hits have an exact name match, we move it to the front of the result list.
         final parsedQueryText = query.parsedQuery.text;
-        if (parsedQueryText != null && _packages.containsKey(parsedQueryText)) {
+        if (parsedQueryText != null &&
+            _documentsByName.containsKey(parsedQueryText)) {
           nameMatches = <String>[parsedQueryText];
         }
         packageHits = _rankWithValues(overallScore.getValues());
@@ -215,7 +230,7 @@ class InMemoryPackageIndex {
 
   /// Update the overall score both on [PackageDocument] and in the [_adjustedOverallScores] map.
   void _updateOverallScores() {
-    for (final doc in _packages.values) {
+    for (final doc in _documentsByName.values) {
       final downloadScore = doc.popularityScore ?? 0.0;
       final likeScore = doc.likeScore ?? 0.0;
       final popularity = (downloadScore + likeScore) / 2;
@@ -316,7 +331,7 @@ class InMemoryPackageIndex {
       if (!aborted && phrases.isNotEmpty) {
         final matched = <String, double>{};
         for (final package in score.getKeys()) {
-          final doc = _packages[package]!;
+          final doc = _documentsByName[package]!;
           final bool matchedAllPhrases = phrases.every((phrase) =>
               doc.package.contains(phrase) ||
               doc.description!.contains(phrase) ||
@@ -341,7 +356,8 @@ class InMemoryPackageIndex {
       final int scoreCompare = -a.score!.compareTo(b.score!);
       if (scoreCompare != 0) return scoreCompare;
       // if two packages got the same score, order by last updated
-      return _compareUpdated(_packages[a.package]!, _packages[b.package]!);
+      return _compareUpdated(
+          _documentsByName[a.package]!, _documentsByName[b.package]!);
     });
     return list;
   }
@@ -350,11 +366,12 @@ class InMemoryPackageIndex {
     int Function(PackageDocument a, PackageDocument b) compare, {
     double Function(PackageDocument doc)? score,
   }) {
-    final list = _packages.values
+    final list = _documentsByName.values
         .map((doc) => PackageHit(
             package: doc.package, score: score == null ? null : score(doc)))
         .toList();
-    list.sort((a, b) => compare(_packages[a.package]!, _packages[b.package]!));
+    list.sort((a, b) =>
+        compare(_documentsByName[a.package]!, _documentsByName[b.package]!));
     return list;
   }
 
diff --git a/app/lib/search/sdk_mem_index.dart b/app/lib/search/sdk_mem_index.dart
index 949fbdc539..de7248af01 100644
--- a/app/lib/search/sdk_mem_index.dart
+++ b/app/lib/search/sdk_mem_index.dart
@@ -80,6 +80,7 @@ class SdkMemIndex {
     DartdocIndex index, {
     Set<String>? allowedLibraries,
   }) async {
+    final textsPerLibrary = <String, Map<String, String>>{};
     for (final f in index.entries) {
       final library = f.qualifiedName?.split('.').first;
       if (library == null) continue;
@@ -92,10 +93,15 @@ class SdkMemIndex {
       if (f.isLibrary) {
         _baseUriPerLibrary[library] = _baseUri.resolve(f.href!).toString();
       }
-      final tokens = _tokensPerLibrary.putIfAbsent(library, () => TokenIndex());
 
       final text = f.qualifiedName?.replaceAll('.', ' ').replaceAll(':', ' ');
-      tokens.add(f.href!, text);
+      if (text != null && text.isNotEmpty) {
+        final texts = textsPerLibrary.putIfAbsent(library, () => {});
+        texts[f.href!] = text;
+      }
+    }
+    for (final e in textsPerLibrary.entries) {
+      _tokensPerLibrary[e.key] = TokenIndex.fromMap(e.value);
     }
   }
 
diff --git a/app/lib/search/token_index.dart b/app/lib/search/token_index.dart
index 36df845317..ccf904c808 100644
--- a/app/lib/search/token_index.dart
+++ b/app/lib/search/token_index.dart
@@ -161,25 +161,34 @@ class TokenIndex {
   /// {id: size} map to store a value representative to the document length
   final _docSizes = <String, double>{};
 
-  /// The number of tokens stored in the index.
-  int get tokenCount => _inverseIds.length;
-
-  int get documentCount => _docSizes.length;
-
-  void add(String id, String? text) {
-    if (text == null) return;
-    final tokens = tokenize(text);
-    if (tokens == null || tokens.isEmpty) {
-      return;
-    }
-    for (final token in tokens.keys) {
-      final Map<String, double> weights =
-          _inverseIds.putIfAbsent(token, () => <String, double>{});
-      weights[id] = math.max(weights[id] ?? 0.0, tokens[token]!);
+  TokenIndex(List<String> keys, List<String?> values) {
+    assert(keys.length == values.length);
+    for (var i = 0; i < keys.length; i++) {
+      final id = keys[i];
+      final text = values[i];
+
+      if (text == null) {
+        continue;
+      }
+      final tokens = tokenize(text);
+      if (tokens == null || tokens.isEmpty) {
+        continue;
+      }
+      for (final token in tokens.keys) {
+        final weights =
+            _inverseIds.putIfAbsent(token, () => <String, double>{});
+        weights[id] = math.max(weights[id] ?? 0.0, tokens[token]!);
+      }
+      // Document size is a highly scaled-down proxy of the length.
+      final docSize = 1 + math.log(1 + tokens.length) / 100;
+      _docSizes[id] = docSize;
     }
-    // Document size is a highly scaled-down proxy of the length.
-    final docSize = 1 + math.log(1 + tokens.length) / 100;
-    _docSizes[id] = docSize;
+  }
+
+  factory TokenIndex.fromMap(Map<String, String> map) {
+    final keys = map.keys.toList();
+    final values = map.values.toList();
+    return TokenIndex(keys, values);
   }
 
   /// Match the text against the corpus and return the tokens or
diff --git a/app/test/search/token_index_test.dart b/app/test/search/token_index_test.dart
index dbd8c1f6b6..47f2393597 100644
--- a/app/test/search/token_index_test.dart
+++ b/app/test/search/token_index_test.dart
@@ -8,7 +8,7 @@ import 'package:test/test.dart';
 void main() {
   group('TokenIndex', () {
     test('partial token lookup', () {
-      final index = TokenIndex()..add('x', 'SomeCamelCasedWord and others');
+      final index = TokenIndex.fromMap({'x': 'SomeCamelCasedWord and others'});
       expect(index.lookupTokens('word').tokenWeights, {'word': 1.0});
       expect(index.lookupTokens('OtherCased').tokenWeights,
           {'cased': closeTo(0.70, 0.01)});
@@ -18,9 +18,10 @@ void main() {
     });
 
     test('No match', () {
-      final TokenIndex index = TokenIndex()
-        ..add('uri://http', 'http')
-        ..add('uri://http_magic', 'http_magic');
+      final TokenIndex index = TokenIndex.fromMap({
+        'uri://http': 'http',
+        'uri://http_magic': 'http_magic',
+      });
 
       expect(index.search('xml'), {
         // no match for http
@@ -29,9 +30,10 @@ void main() {
     });
 
     test('Scoring exact and partial matches', () {
-      final TokenIndex index = TokenIndex()
-        ..add('uri://http', 'http')
-        ..add('uri://http_magic', 'http_magic');
+      final TokenIndex index = TokenIndex.fromMap({
+        'uri://http': 'http',
+        'uri://http_magic': 'http_magic',
+      });
       expect(index.search('http'), {
         'uri://http': closeTo(0.993, 0.001),
         'uri://http_magic': closeTo(0.989, 0.001),
@@ -40,10 +42,11 @@ void main() {
 
     test('CamelCase indexing', () {
       final String queueText = '.DoubleLinkedQueue()';
-      final TokenIndex index = TokenIndex()
-        ..add('queue', queueText)
-        ..add('queue_lower', queueText.toLowerCase())
-        ..add('unmodifiable', 'CustomUnmodifiableMapBase');
+      final TokenIndex index = TokenIndex.fromMap({
+        'queue': queueText,
+        'queue_lower': queueText.toLowerCase(),
+        'unmodifiable': 'CustomUnmodifiableMapBase',
+      });
       expect(index.search('queue'), {
         'queue': closeTo(0.53, 0.01),
       });
@@ -54,10 +57,11 @@ void main() {
     });
 
     test('Wierd cases: riak client', () {
-      final TokenIndex index = TokenIndex()
-        ..add('uri://cli', 'cli')
-        ..add('uri://riak_client', 'riak_client')
-        ..add('uri://teamspeak', 'teamspeak');
+      final TokenIndex index = TokenIndex.fromMap({
+        'uri://cli': 'cli',
+        'uri://riak_client': 'riak_client',
+        'uri://teamspeak': 'teamspeak',
+      });
 
       expect(index.search('riak'), {
         'uri://riak_client': closeTo(0.99, 0.01),
@@ -68,24 +72,15 @@ void main() {
       });
     });
 
-    test('Free up memory', () {
-      final TokenIndex index = TokenIndex();
-      expect(index.tokenCount, 0);
-      index.add('url1', 'text');
-      expect(index.tokenCount, 1);
-      index.add('url2', 'another');
-      expect(index.tokenCount, 2);
-    });
-
     test('Do not overweight partial matches', () {
-      final index = TokenIndex()..add('flutter_qr_reader', 'flutter_qr_reader');
+      final index =
+          TokenIndex.fromMap({'flutter_qr_reader': 'flutter_qr_reader'});
       final data = index.search('ByteDataReader');
       // The partial match should not return more than 0.65 as score.
       expect(data, {'flutter_qr_reader': lessThan(0.65)});
     });
 
     test('longer words', () {
-      final index = TokenIndex();
       final names = [
         'location',
         'geolocator',
@@ -98,9 +93,7 @@ void main() {
         'location_picker',
         'background_location_updates',
       ];
-      for (final name in names) {
-        index.add(name, name);
-      }
+      final index = TokenIndex.fromMap(Map.fromIterables(names, names));
       final match = index.search('location');
       // location should be the top value, everything else should be lower
       final locationValue = match['location'];

From 2c55514c7815429b6e77d44398f0b39e8e745a33 Mon Sep 17 00:00:00 2001
From: Istvan Soos <istvan.soos@gmail.com>
Date: Thu, 17 Oct 2024 08:59:40 +0200
Subject: [PATCH 2/3] Better efficiency in TokenIndex.

---
 app/lib/search/token_index.dart | 63 +++++++++++++++++++--------------
 1 file changed, 36 insertions(+), 27 deletions(-)

diff --git a/app/lib/search/token_index.dart b/app/lib/search/token_index.dart
index ccf904c808..df6a6e607e 100644
--- a/app/lib/search/token_index.dart
+++ b/app/lib/search/token_index.dart
@@ -155,16 +155,21 @@ class TokenMatch {
 
 /// Stores a token -> documentId inverted index with weights.
 class TokenIndex {
-  /// Maps token Strings to a weighted map of document ids.
-  final _inverseIds = <String, Map<String, double>>{};
+  final List<String> _ids;
+
+  /// Maps token strings to weighted documents (addressed by index in [_ids]).
+  final _inverseIds = <String, Map<int, double>>{};
 
   /// {id: size} map to store a value representative to the document length
-  final _docSizes = <String, double>{};
+  late final List<double> _docWeights;
+
+  late final _length = _docWeights.length;
 
-  TokenIndex(List<String> keys, List<String?> values) {
-    assert(keys.length == values.length);
-    for (var i = 0; i < keys.length; i++) {
-      final id = keys[i];
+  TokenIndex(List<String> ids, List<String?> values) : _ids = ids {
+    assert(ids.length == values.length);
+    final length = values.length;
+    _docWeights = List<double>.filled(length, 0.0);
+    for (var i = 0; i < length; i++) {
       final text = values[i];
 
       if (text == null) {
@@ -175,13 +180,11 @@ class TokenIndex {
         continue;
       }
       for (final token in tokens.keys) {
-        final weights =
-            _inverseIds.putIfAbsent(token, () => <String, double>{});
-        weights[id] = math.max(weights[id] ?? 0.0, tokens[token]!);
+        final weights = _inverseIds.putIfAbsent(token, () => {});
+        weights[i] = math.max(weights[i] ?? 0.0, tokens[token]!);
       }
-      // Document size is a highly scaled-down proxy of the length.
-      final docSize = 1 + math.log(1 + tokens.length) / 100;
-      _docSizes[id] = docSize;
+      // Document weight is a highly scaled-down proxy of the length.
+      _docWeights[i] = 1 + math.log(1 + tokens.length) / 100;
     }
   }
 
@@ -200,9 +203,8 @@ class TokenIndex {
     for (final word in splitForIndexing(text)) {
       final tokens = tokenize(word, isSplit: true) ?? {};
 
-      final present = tokens.keys
-          .where((token) => (_inverseIds[token]?.length ?? 0) > 0)
-          .toList();
+      final present =
+          tokens.keys.where((token) => _inverseIds.containsKey(token)).toList();
       if (present.isEmpty) {
         return TokenMatch();
       }
@@ -228,14 +230,12 @@ class TokenIndex {
   Map<String, double> _scoreDocs(TokenMatch tokenMatch,
       {double weight = 1.0, int wordCount = 1, Set<String>? limitToIds}) {
     // Summarize the scores for the documents.
-    final docScores = <String, double>{};
+    final docScores = List<double>.filled(_length, 0.0);
     for (final token in tokenMatch.tokens) {
       final docWeights = _inverseIds[token]!;
       for (final e in docWeights.entries) {
-        if (limitToIds != null && !limitToIds.contains(e.key)) continue;
-        final double prevValue = docScores[e.key] ?? 0.0;
-        final double currentValue = tokenMatch[token]! * e.value;
-        docScores[e.key] = math.max(prevValue, currentValue);
+        final i = e.key;
+        docScores[i] = math.max(docScores[i], tokenMatch[token]! * e.value);
       }
     }
 
@@ -244,15 +244,24 @@ class TokenIndex {
     // compensate the formula in order to prevent multiple exponential penalties.
     final double wordSizeExponent = 1.0 / wordCount;
 
+    final result = <String, double>{};
     // post-process match weights
-    docScores.updateAll((id, docScore) {
-      var docSize = _docSizes[id]!;
+    for (var i = 0; i < _length; i++) {
+      final id = _ids[i];
+      final w = docScores[i];
+      if (w <= 0.0) {
+        continue;
+      }
+      if (limitToIds != null && !limitToIds.contains(id)) {
+        continue;
+      }
+      var dw = _docWeights[i];
       if (wordCount > 1) {
-        docSize = math.pow(docSize, wordSizeExponent).toDouble();
+        dw = math.pow(dw, wordSizeExponent).toDouble();
       }
-      return weight * docScore / docSize;
-    });
-    return docScores;
+      result[id] = w * weight / dw;
+    }
+    return result;
   }
 
   /// Search the index for [text], with a (term-match / document coverage percent)

From e8da10138ffd8a7cd35c1e10fb3d333d50021cc2 Mon Sep 17 00:00:00 2001
From: Istvan Soos <istvan.soos@gmail.com>
Date: Thu, 17 Oct 2024 10:20:51 +0200
Subject: [PATCH 3/3] Benchmark tool

---
 app/bin/tools/search_benchmark.dart | 41 +++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 app/bin/tools/search_benchmark.dart

diff --git a/app/bin/tools/search_benchmark.dart b/app/bin/tools/search_benchmark.dart
new file mode 100644
index 0000000000..1b4394fb79
--- /dev/null
+++ b/app/bin/tools/search_benchmark.dart
@@ -0,0 +1,41 @@
+// Copyright (c) 2024, the Dart project authors.  Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+
+import 'dart:convert';
+import 'dart:io';
+
+import 'package:pub_dev/package/overrides.dart';
+import 'package:pub_dev/search/mem_index.dart';
+import 'package:pub_dev/search/models.dart';
+import 'package:pub_dev/search/search_service.dart';
+
+/// Loads a search snapshot, runs queries on it, and prints the average time per request.
+Future<void> main(List<String> args) async {
+  // The first argument is the gzip-compressed JSON snapshot file; `args.first` throws if it is missing.
+  final file = File(args.first);
+  final content =
+      json.decode(utf8.decode(gzip.decode(await file.readAsBytes())))
+          as Map<String, Object?>;
+  final snapshot = SearchSnapshot.fromJson(content);
+  snapshot.documents!
+      .removeWhere((packageName, doc) => isSoftRemoved(packageName));
+  final index = InMemoryPackageIndex(documents: snapshot.documents!.values);
+
+  // NOTE: please add more queries to this list, especially if there is a performance bottleneck.
+  final queries = [
+    'json',
+    'camera',
+    'android camera',
+    'sql database',
+  ];
+
+  final sw = Stopwatch()..start();
+  var count = 0;
+  for (var i = 0; i < 100; i++) { // 100 requests, cycling through queries
+    index.search(ServiceSearchQuery.parse(query: queries[i % queries.length]));
+    count++;
+  }
+  sw.stop();
+  print('${(sw.elapsedMilliseconds / count).toStringAsFixed(2)} ms/request');
+}