Support kNN vectors in disk usage action (#88785)
This change adds support for kNN vector fields to the `_disk_usage` API. The
strategy:
* Iterate the vector values (using the same strategy as for doc values) to
estimate the vector data size
* Run some random vector searches to estimate the vector index size 
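
For reference, a minimal sketch of how a client might exercise the endpoint (plain JDK HTTP client; the index name `my-index` and the local address are assumptions, not part of this change):

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class DiskUsageExample {
    public static void main(String[] args) throws Exception {
        // run_expensive_tasks=true is required: the analysis reads through the
        // index and is too costly to run by default.
        HttpRequest request = HttpRequest.newBuilder()
            .uri(URI.create("http://localhost:9200/my-index/_disk_usage?run_expensive_tasks=true"))
            .POST(HttpRequest.BodyPublishers.noBody())
            .build();
        HttpResponse<String> response = HttpClient.newHttpClient()
            .send(request, HttpResponse.BodyHandlers.ofString());
        // With this change, the per-field breakdown in the response also
        // includes knn_vectors / knn_vectors_in_bytes.
        System.out.println(response.body());
    }
}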

Co-authored-by: Yannick Welsch <yannick@welsch.lu>

Closes #84801
jtibshirani committed Jul 26, 2022
1 parent 9b5cd67 commit abd561a
Showing 6 changed files with 198 additions and 7 deletions.
6 changes: 6 additions & 0 deletions docs/changelog/88785.yaml
@@ -0,0 +1,6 @@
pr: 88785
summary: Support kNN vectors in disk usage action
area: Search
type: enhancement
issues:
- 84801
16 changes: 12 additions & 4 deletions docs/reference/indices/diskusage.asciidoc
@@ -100,7 +100,9 @@ The API returns:
"norms": "2.3kb",
"norms_in_bytes": 2356,
"term_vectors": "2.2kb",
"term_vectors_in_bytes": 2310
"term_vectors_in_bytes": 2310,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"fields": {
"_id": {
@@ -119,7 +121,9 @@ The API returns:
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"_primary_term": {...},
"_seq_no": {...},
@@ -137,7 +141,9 @@ The API returns:
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"context": {
"total": "28.6mb",
@@ -155,7 +161,9 @@ The API returns:
"norms": "2.3kb",
"norms_in_bytes": 2356,
"term_vectors": "2.2kb",
"term_vectors_in_bytes": 2310
"term_vectors_in_bytes": 2310,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"context.keyword": {...},
"message": {...},
@@ -18,6 +18,7 @@ setup:
genre:
type: keyword
doc_values: true

- do:
index:
index: testindex
@@ -126,3 +127,52 @@ setup:
- gt: { testindex.all_fields.doc_values_in_bytes: 0 }
- gt: { testindex.all_fields.points_in_bytes: 0 }
- match: { testindex.all_fields.term_vectors_in_bytes: 0 }

---
"Dense vectors":
- skip:
version: " - 8.3.99"
reason: disk usage support for dense vectors was added in 8.4

- do:
indices.put_mapping:
index: testindex
body:
properties:
vector:
type: dense_vector
dims: 3
index: true
similarity: l2_norm
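# Note: index: true builds the ANN index structures that the size estimate
# probes; l2_norm scores neighbors by Euclidean distance.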
- do:
index:
index: testindex
body: { "quantity": 0 }
- do:
index:
index: testindex
body: { "quantity": 99, "vector": [0.1, 0.2, 0.3] }
- do:
index:
index: testindex
body: { "quantity": 1000, "vector": [1.3, 0.2, 4.5] }

- do:
indices.disk_usage:
index: testindex
run_expensive_tasks: true

- gt: { testindex.store_size_in_bytes: 100 }

# all_fields
- gt: { testindex.all_fields.knn_vectors_in_bytes: 0 }

# quantity
- gt: { testindex.fields.quantity.total_in_bytes: 0 }
- gt: { testindex.fields.quantity.points_in_bytes: 0 }
- match: { testindex.fields.quantity.knn_vectors_in_bytes: 0 }

# vector
- gt: { testindex.fields.vector.total_in_bytes: 0 }
- match: { testindex.fields.vector.points_in_bytes: 0 }
- gt: { testindex.fields.vector.knn_vectors_in_bytes: 0 }
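# With two indexed vectors of 3 float32 dimensions, the raw vector data is only
# 2 * 3 * 4 = 24 bytes; the ANN index structures account for the rest, hence
# the gt-0 assertions above.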
@@ -13,6 +13,7 @@
import org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.StoredFieldsReader;
@@ -37,6 +38,7 @@
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FilterDirectory;
@@ -118,6 +120,10 @@ void doAnalyze(IndexDiskUsageStats stats) throws IOException {
startTimeInNanos = System.nanoTime();
analyzeTermVectors(reader, stats);
executionTime.termVectorsTimeInNanos += System.nanoTime() - startTimeInNanos;

startTimeInNanos = System.nanoTime();
analyzeKnnVectors(reader, stats);
executionTime.knnVectorsTimeInNanos += System.nanoTime() - startTimeInNanos;
}
}
logger.debug("analyzing the disk usage took {} stats: {}", executionTime, stats);
@@ -510,6 +516,36 @@ void visitField(Fields vectors, String fieldName) throws IOException {
}
}

void analyzeKnnVectors(SegmentReader reader, IndexDiskUsageStats stats) throws IOException {
KnnVectorsReader vectorReader = reader.getVectorReader();
if (vectorReader == null) {
return;
}
for (FieldInfo field : reader.getFieldInfos()) {
cancellationChecker.checkForCancellation();
directory.resetBytesRead();
if (field.getVectorDimension() > 0) {
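// First pass: read every vector (mirroring the doc-values iteration) so the
// tracking directory observes the full extent of the raw vector data.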
iterateDocValues(reader.maxDoc(), () -> vectorReader.getVectorValues(field.name), vectors -> {
cancellationChecker.logEvent();
vectors.vectorValue();
});

// do a couple of randomized searches to figure out min and max offsets of index file
VectorValues vectorValues = vectorReader.getVectorValues(field.name);
int numDocsToVisit = reader.maxDoc() < 10 ? reader.maxDoc() : 10 * (int) Math.log10(reader.maxDoc());
int skipFactor = Math.max(reader.maxDoc() / numDocsToVisit, 1);
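// e.g. maxDoc = 1_000_000 gives numDocsToVisit = 10 * 6 = 60 and
// skipFactor = 16_666, spreading the roughly 60 sampled searches
// across the whole segment.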
for (int i = 0; i < reader.maxDoc(); i += skipFactor) {
if ((i = vectorValues.advance(i)) == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
cancellationChecker.checkForCancellation();
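// Each probe search reads through the vector index structures, widening
// the min/max read offsets the tracking directory uses for the estimate.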
vectorReader.search(field.name, vectorValues.vectorValue(), 100, null, Integer.MAX_VALUE);
}
stats.addKnnVectors(field.name, directory.getBytesRead());
}
}
}

private static class TrackingReadBytesDirectory extends FilterDirectory {
private final Map<String, BytesReadTracker> trackers = new HashMap<>();

@@ -602,7 +638,7 @@ public void readBytes(byte[] b, int offset, int len) throws IOException {
}

/**
-* Lucene Codec organizes data field by field for doc values, points, postings, and norms; and document by document
+* Lucene Codec organizes data field by field for doc values, points, postings, vectors, and norms; and document by document
* for stored fields and term vectors. BytesReadTracker then can simply track the min and max read positions.
* This would allow us to traverse only two ends of each partition.
*/
Expand Down Expand Up @@ -698,10 +734,11 @@ private static class ExecutionTime {
long pointsTimeInNanos;
long normsTimeInNanos;
long termVectorsTimeInNanos;
long knnVectorsTimeInNanos;

long totalInNanos() {
return invertedIndexTimeInNanos + storedFieldsTimeInNanos + docValuesTimeInNanos + pointsTimeInNanos + normsTimeInNanos
-    + termVectorsTimeInNanos;
+    + termVectorsTimeInNanos + knnVectorsTimeInNanos;
}

@Override
@@ -726,6 +763,9 @@ public String toString() {
+ "ms"
+ ", term vectors: "
+ termVectorsTimeInNanos / 1000_000
+ "ms"
+ ", knn vectors: "
+ knnVectorsTimeInNanos / 1000_000
+ "ms";
}
}
@@ -8,6 +8,7 @@

package org.elasticsearch.action.admin.indices.diskusage;

import org.elasticsearch.Version;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
@@ -39,6 +40,8 @@ public final class IndexDiskUsageStats implements ToXContentFragment, Writeable
public static final String NORMS_IN_BYTES = "norms_in_bytes";
public static final String TERM_VECTORS = "term_vectors";
public static final String TERM_VECTORS_IN_BYTES = "term_vectors_in_bytes";
public static final String KNN_VECTORS = "knn_vectors";
public static final String KNN_VECTORS_IN_BYTES = "knn_vectors_in_bytes";

public static final String STORE_SIZE = "store_size";
public static final String STORE_SIZE_IN_BYTES = "store_size_in_bytes";
@@ -119,6 +122,11 @@ public void addTermVectors(String fieldName, long bytes) {
getOrAdd(fieldName).termVectorsBytes += bytes;
}

public void addKnnVectors(String fieldName, long bytes) {
checkByteSize(bytes);
getOrAdd(fieldName).knnVectorsBytes += bytes;
}

public IndexDiskUsageStats add(IndexDiskUsageStats other) {
other.fields.forEach((k, v) -> getOrAdd(k).add(v));
this.indexSizeInBytes += other.indexSizeInBytes;
@@ -168,6 +176,7 @@ public static final class PerFieldDiskUsage implements ToXContentFragment, Writeable
private long pointsBytes;
private long normsBytes;
private long termVectorsBytes;
private long knnVectorsBytes;

private PerFieldDiskUsage() {

@@ -180,6 +189,9 @@ private PerFieldDiskUsage(StreamInput in) throws IOException {
pointsBytes = in.readVLong();
normsBytes = in.readVLong();
termVectorsBytes = in.readVLong();
if (in.getVersion().onOrAfter(Version.V_8_4_0)) {
knnVectorsBytes = in.readVLong();
}
}

@Override
@@ -190,6 +202,9 @@ public void writeTo(StreamOutput out) throws IOException {
out.writeVLong(pointsBytes);
out.writeVLong(normsBytes);
out.writeVLong(termVectorsBytes);
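// Only exchange the new stat when the remote node also understands it
// (8.4.0+); this keeps the wire format compatible during rolling upgrades.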
if (out.getVersion().onOrAfter(Version.V_8_4_0)) {
out.writeVLong(knnVectorsBytes);
}
}

private void add(PerFieldDiskUsage other) {
@@ -199,6 +214,7 @@ private void add(PerFieldDiskUsage other) {
pointsBytes += other.pointsBytes;
normsBytes += other.normsBytes;
termVectorsBytes += other.termVectorsBytes;
knnVectorsBytes += other.knnVectorsBytes;
}

public long getInvertedIndexBytes() {
@@ -225,8 +241,12 @@ public long getTermVectorsBytes() {
return termVectorsBytes;
}

public long getKnnVectorsBytes() {
return knnVectorsBytes;
}

long totalBytes() {
-return invertedIndexBytes + storedFieldBytes + docValuesBytes + pointsBytes + normsBytes + termVectorsBytes;
+return invertedIndexBytes + storedFieldBytes + docValuesBytes + pointsBytes + normsBytes + termVectorsBytes + knnVectorsBytes;
}

@Override
@@ -254,6 +274,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {

builder.field(TERM_VECTORS, new ByteSizeValue(termVectorsBytes));
builder.field(TERM_VECTORS_IN_BYTES, termVectorsBytes);

builder.field(KNN_VECTORS, new ByteSizeValue(knnVectorsBytes));
builder.field(KNN_VECTORS_IN_BYTES, knnVectorsBytes);
return builder;
}

