diff --git a/docs/changelog/88785.yaml b/docs/changelog/88785.yaml
new file mode 100644
index 0000000000000..00b890f8aa99b
--- /dev/null
+++ b/docs/changelog/88785.yaml
@@ -0,0 +1,6 @@
+pr: 88785
+summary: Support kNN vectors in disk usage action
+area: Search
+type: enhancement
+issues:
+ - 84801
diff --git a/docs/reference/indices/diskusage.asciidoc b/docs/reference/indices/diskusage.asciidoc
index 20a2e0e1baa2e..bbfe6ae3eda0b 100644
--- a/docs/reference/indices/diskusage.asciidoc
+++ b/docs/reference/indices/diskusage.asciidoc
@@ -100,7 +100,9 @@ The API returns:
         "norms": "2.3kb",
         "norms_in_bytes": 2356,
         "term_vectors": "2.2kb",
-        "term_vectors_in_bytes": 2310
+        "term_vectors_in_bytes": 2310,
+        "knn_vectors": "0b",
+        "knn_vectors_in_bytes": 0
     },
     "fields": {
         "_id": {
@@ -119,7 +121,9 @@ The API returns:
             "norms": "0b",
             "norms_in_bytes": 0,
             "term_vectors": "0b",
-            "term_vectors_in_bytes": 0
+            "term_vectors_in_bytes": 0,
+            "knn_vectors": "0b",
+            "knn_vectors_in_bytes": 0
         },
         "_primary_term": {...},
         "_seq_no": {...},
@@ -137,7 +141,9 @@ The API returns:
             "norms": "0b",
             "norms_in_bytes": 0,
             "term_vectors": "0b",
-            "term_vectors_in_bytes": 0
+            "term_vectors_in_bytes": 0,
+            "knn_vectors": "0b",
+            "knn_vectors_in_bytes": 0
         },
         "context": {
             "total": "28.6mb",
@@ -155,7 +161,9 @@ The API returns:
             "norms": "2.3kb",
             "norms_in_bytes": 2356,
             "term_vectors": "2.2kb",
-            "term_vectors_in_bytes": 2310
+            "term_vectors_in_bytes": 2310,
+            "knn_vectors": "0b",
+            "knn_vectors_in_bytes": 0
         },
         "context.keyword": {...},
         "message": {...},
diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.stats/50_disk_usage.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.stats/50_disk_usage.yml
index 5641d24a72c00..3fe6edf0823e7 100644
--- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.stats/50_disk_usage.yml
+++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/indices.stats/50_disk_usage.yml
@@ -18,6 +18,7 @@ setup:
           genre:
             type: keyword
             doc_values: true
+
   - do:
       index:
         index: testindex
@@ -126,3 +127,52 @@ setup:
   - gt: { testindex.all_fields.doc_values_in_bytes: 0 }
   - gt: { testindex.all_fields.points_in_bytes: 0 }
   - match: { testindex.all_fields.term_vectors_in_bytes: 0 }
+
+---
+"Dense vectors":
+  - skip:
+      version: " - 8.3.99"
+      reason: knn vectors support in the disk usage API was added in 8.4
+
+  - do:
+      indices.put_mapping:
+        index: testindex
+        body:
+          properties:
+            vector:
+              type: dense_vector
+              dims: 3
+              index: true
+              similarity: l2_norm
+  - do:
+      index:
+        index: testindex
+        body: { "quantity": 0 }
+  - do:
+      index:
+        index: testindex
+        body: { "quantity": 99, "vector": [0.1, 0.2, 0.3] }
+  - do:
+      index:
+        index: testindex
+        body: { "quantity": 1000, "vector": [1.3, 0.2, 4.5] }
+
+  - do:
+      indices.disk_usage:
+        index: testindex
+        run_expensive_tasks: true
+
+  - gt: { testindex.store_size_in_bytes: 100 }
+
+  # all_fields
+  - gt: { testindex.all_fields.knn_vectors_in_bytes: 0 }
+
+  # quantity
+  - gt: { testindex.fields.quantity.total_in_bytes: 0 }
+  - gt: { testindex.fields.quantity.points_in_bytes: 0 }
+  - match: { testindex.fields.quantity.knn_vectors_in_bytes: 0 }
+
+  # vector
+  - gt: { testindex.fields.vector.total_in_bytes: 0 }
+  - match: { testindex.fields.vector.points_in_bytes: 0 }
+  - gt: { testindex.fields.vector.knn_vectors_in_bytes: 0 }
diff --git a/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java b/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java
index a0d4a2af94c85..ba47c28c869d4 100644
--- a/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java
+++ b/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzer.java
@@ -13,6 +13,7 @@
 import org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat;
 import org.apache.lucene.codecs.DocValuesProducer;
 import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.KnnVectorsReader;
 import org.apache.lucene.codecs.NormsProducer;
 import org.apache.lucene.codecs.PointsReader;
 import org.apache.lucene.codecs.StoredFieldsReader;
@@ -37,6 +38,7 @@
 import org.apache.lucene.index.TermState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.VectorValues;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FilterDirectory;
@@ -118,6 +120,10 @@ void doAnalyze(IndexDiskUsageStats stats) throws IOException {
                 startTimeInNanos = System.nanoTime();
                 analyzeTermVectors(reader, stats);
                 executionTime.termVectorsTimeInNanos += System.nanoTime() - startTimeInNanos;
+
+                startTimeInNanos = System.nanoTime();
+                analyzeKnnVectors(reader, stats);
+                executionTime.knnVectorsTimeInNanos += System.nanoTime() - startTimeInNanos;
             }
         }
         logger.debug("analyzing the disk usage took {} stats: {}", executionTime, stats);
@@ -510,6 +516,36 @@ void visitField(Fields vectors, String fieldName) throws IOException {
         }
     }
 
+    void analyzeKnnVectors(SegmentReader reader, IndexDiskUsageStats stats) throws IOException {
+        KnnVectorsReader vectorReader = reader.getVectorReader();
+        if (vectorReader == null) {
+            return;
+        }
+        for (FieldInfo field : reader.getFieldInfos()) {
+            cancellationChecker.checkForCancellation();
+            directory.resetBytesRead();
+            if (field.getVectorDimension() > 0) {
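+                // read every vector value once so the byte tracker observes the full extent of the raw vector data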
+                iterateDocValues(reader.maxDoc(), () -> vectorReader.getVectorValues(field.name), vectors -> {
+                    cancellationChecker.logEvent();
+                    vectors.vectorValue();
+                });
+
+                // do a couple of randomized searches to figure out min and max offsets of index file
+                VectorValues vectorValues = vectorReader.getVectorValues(field.name);
+                int numDocsToVisit = reader.maxDoc() < 10 ? reader.maxDoc() : 10 * (int) Math.log10(reader.maxDoc());
+                int skipFactor = Math.max(reader.maxDoc() / numDocsToVisit, 1);
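+                // the sampled searches below traverse the vector index (the HNSW graph), so the tracker sees that file too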
+                for (int i = 0; i < reader.maxDoc(); i += skipFactor) {
+                    if ((i = vectorValues.advance(i)) == DocIdSetIterator.NO_MORE_DOCS) {
+                        break;
+                    }
+                    cancellationChecker.checkForCancellation();
+                    vectorReader.search(field.name, vectorValues.vectorValue(), 100, null, Integer.MAX_VALUE);
+                }
+                stats.addKnnVectors(field.name, directory.getBytesRead());
+            }
+        }
+    }
+
     private static class TrackingReadBytesDirectory extends FilterDirectory {
         private final Map<String, BytesReadTracker> trackers = new HashMap<>();
 
@@ -602,7 +638,7 @@ public void readBytes(byte[] b, int offset, int len) throws IOException {
         }
 
         /**
-         * Lucene Codec organizes data field by field for doc values, points, postings, and norms; and document by document
+         * Lucene Codec organizes data field by field for doc values, points, postings, vectors, and norms; and document by document
          * for stored fields and term vectors. BytesReadTracker then can simply track the min and max read positions.
         * This would allow us to traverse only two ends of each partition.
         */
@@ -698,10 +734,11 @@ private static class ExecutionTime {
         long pointsTimeInNanos;
         long normsTimeInNanos;
         long termVectorsTimeInNanos;
+        long knnVectorsTimeInNanos;
 
         long totalInNanos() {
             return invertedIndexTimeInNanos + storedFieldsTimeInNanos + docValuesTimeInNanos + pointsTimeInNanos + normsTimeInNanos
-                + termVectorsTimeInNanos;
+                + termVectorsTimeInNanos + knnVectorsTimeInNanos;
         }
 
         @Override
@@ -726,6 +763,9 @@ public String toString() {
                 + "ms"
                 + ", term vectors: "
                 + termVectorsTimeInNanos / 1000_000
+                + "ms"
+                + ", knn vectors: "
+                + knnVectorsTimeInNanos / 1000_000
                 + "ms";
         }
     }
diff --git a/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageStats.java b/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageStats.java
index bde645a139e15..ca51768f516cd 100644
--- a/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageStats.java
+++ b/server/src/main/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageStats.java
@@ -8,6 +8,7 @@
 
 package org.elasticsearch.action.admin.indices.diskusage;
 
+import org.elasticsearch.Version;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
@@ -39,6 +40,8 @@ public final class IndexDiskUsageStats implements ToXContentFragment, Writeable
     public static final String NORMS_IN_BYTES = "norms_in_bytes";
     public static final String TERM_VECTORS = "term_vectors";
     public static final String TERM_VECTORS_IN_BYTES = "term_vectors_in_bytes";
+    public static final String KNN_VECTORS = "knn_vectors";
+    public static final String KNN_VECTORS_IN_BYTES = "knn_vectors_in_bytes";
     public static final String STORE_SIZE = "store_size";
     public static final String STORE_SIZE_IN_BYTES = "store_size_in_bytes";
 
@@ -119,6 +122,11 @@ public void addTermVectors(String fieldName, long bytes) {
         getOrAdd(fieldName).termVectorsBytes += bytes;
     }
 
+    public void addKnnVectors(String fieldName, long bytes) {
+        checkByteSize(bytes);
+        getOrAdd(fieldName).knnVectorsBytes += bytes;
+    }
+
     public IndexDiskUsageStats add(IndexDiskUsageStats other) {
         other.fields.forEach((k, v) -> getOrAdd(k).add(v));
         this.indexSizeInBytes += other.indexSizeInBytes;
@@ -168,6 +176,7 @@ public static final class PerFieldDiskUsage implements ToXContentFragment, Write
         private long pointsBytes;
         private long normsBytes;
         private long termVectorsBytes;
+        private long knnVectorsBytes;
 
         private PerFieldDiskUsage() {
 
@@ -180,6 +189,9 @@ private PerFieldDiskUsage(StreamInput in) throws IOException {
             pointsBytes = in.readVLong();
             normsBytes = in.readVLong();
             termVectorsBytes = in.readVLong();
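+            // streams from nodes before 8.4 do not include this stat; it remains 0 in that case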
+            if (in.getVersion().onOrAfter(Version.V_8_4_0)) {
+                knnVectorsBytes = in.readVLong();
+            }
         }
 
         @Override
@@ -190,6 +202,9 @@ public void writeTo(StreamOutput out) throws IOException {
             out.writeVLong(pointsBytes);
             out.writeVLong(normsBytes);
             out.writeVLong(termVectorsBytes);
+            if (out.getVersion().onOrAfter(Version.V_8_4_0)) {
+                out.writeVLong(knnVectorsBytes);
+            }
         }
 
         private void add(PerFieldDiskUsage other) {
@@ -199,6 +214,7 @@ private void add(PerFieldDiskUsage other) {
             pointsBytes += other.pointsBytes;
             normsBytes += other.normsBytes;
             termVectorsBytes += other.termVectorsBytes;
+            knnVectorsBytes += other.knnVectorsBytes;
         }
 
         public long getInvertedIndexBytes() {
@@ -225,8 +241,12 @@ public long getTermVectorsBytes() {
             return termVectorsBytes;
         }
 
+        public long getKnnVectorsBytes() {
+            return knnVectorsBytes;
+        }
+
         long totalBytes() {
-            return invertedIndexBytes + storedFieldBytes + docValuesBytes + pointsBytes + normsBytes + termVectorsBytes;
+            return invertedIndexBytes + storedFieldBytes + docValuesBytes + pointsBytes + normsBytes + termVectorsBytes + knnVectorsBytes;
         }
 
         @Override
@@ -254,6 +274,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
 
             builder.field(TERM_VECTORS, new ByteSizeValue(termVectorsBytes));
             builder.field(TERM_VECTORS_IN_BYTES, termVectorsBytes);
+
+            builder.field(KNN_VECTORS, new ByteSizeValue(knnVectorsBytes));
+            builder.field(KNN_VECTORS_IN_BYTES, knnVectorsBytes);
 
             return builder;
         }
diff --git a/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java b/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java
index 84022bc234436..f5797f8b1de8d 100644
--- a/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java
+++ b/server/src/test/java/org/elasticsearch/action/admin/indices/diskusage/IndexDiskUsageAnalyzerTests.java
@@ -9,11 +9,14 @@
 package org.elasticsearch.action.admin.indices.diskusage;
 
 import org.apache.lucene.codecs.DocValuesFormat;
+import org.apache.lucene.codecs.KnnVectorsFormat;
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
 import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat;
 import org.apache.lucene.codecs.lucene92.Lucene92Codec;
+import org.apache.lucene.codecs.lucene92.Lucene92HnswVectorsFormat;
 import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
+import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
 import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
 import org.apache.lucene.document.BinaryDocValuesField;
 import org.apache.lucene.document.BinaryPoint;
@@ -21,6 +24,7 @@
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.IntPoint;
+import org.apache.lucene.document.KnnVectorField;
 import org.apache.lucene.document.LatLonShape;
 import org.apache.lucene.document.LongPoint;
 import org.apache.lucene.document.NumericDocValuesField;
@@ -41,6 +45,7 @@
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.SegmentInfo;
 import org.apache.lucene.index.SegmentReader;
+import org.apache.lucene.index.VectorSimilarityFunction;
 import org.apache.lucene.search.ConstantScoreScorer;
 import org.apache.lucene.search.ConstantScoreWeight;
 import org.apache.lucene.search.IndexSearcher;
@@ -80,6 +85,7 @@
 import static org.hamcrest.Matchers.notNullValue;
 
 public class IndexDiskUsageAnalyzerTests extends ESTestCase {
+    private static final int DEFAULT_VECTOR_DIMENSION = 128;
 
     protected static Directory createNewDirectory() {
         final Directory dir = LuceneTestCase.newDirectory();
@@ -241,6 +247,26 @@ public void testBinaryPoints() throws Exception {
         }
     }
 
+    public void testKnnVectors() throws Exception {
+        try (Directory dir = createNewDirectory()) {
+            final CodecMode codec = randomFrom(CodecMode.values());
+            VectorSimilarityFunction similarity = randomFrom(VectorSimilarityFunction.values());
+            int numDocs = between(100, 1000);
+            int dimension = between(10, 200);
+
+            indexRandomly(dir, codec, numDocs, doc -> {
+                float[] vector = randomVector(dimension);
+                doc.add(new KnnVectorField("vector", vector, similarity));
+            });
+            final IndexDiskUsageStats stats = IndexDiskUsageAnalyzer.analyze(testShardId(), lastCommit(dir), () -> {});
+            logger.info("--> stats {}", stats);
+
+            int dataBytes = numDocs * dimension * Float.BYTES; // size of flat vector data
+            int indexBytesEstimate = numDocs * Integer.BYTES * Lucene92HnswVectorsFormat.DEFAULT_MAX_CONN * 2; // rough size of HNSW graph
+            assertTrue(stats.total().getKnnVectorsBytes() > dataBytes + indexBytesEstimate);
+        }
+    }
+
     public void testTriangle() throws Exception {
         try (Directory dir = createNewDirectory()) {
             final CodecMode codec = randomFrom(CodecMode.values());
@@ -488,6 +514,21 @@ static void addRandomTermVectors(Document doc) {
         }
     }
 
+    static void addRandomKnnVectors(Document doc) {
+        int numFields = randomFrom(1, 3);
+        for (int f = 0; f < numFields; f++) {
+            doc.add(new KnnVectorField("knnvector-" + f, randomVector(DEFAULT_VECTOR_DIMENSION)));
+        }
+    }
+
+    private static float[] randomVector(int dimension) {
+        float[] vec = new float[dimension];
+        for (int i = 0; i < vec.length; i++) {
+            vec[i] = randomFloat();
+        }
+        return vec;
+    }
+
     static void addRandomFields(Document doc) {
         if (randomBoolean()) {
             addRandomDocValuesField(doc);
@@ -517,11 +558,15 @@ static void addRandomFields(Document doc) {
         if (randomBoolean()) {
             addRandomTermVectors(doc);
         }
+        if (randomBoolean()) {
+            addRandomKnnVectors(doc);
+        }
     }
 
     static class FieldLookup {
         private final Map<String, FieldInfo> dvSuffixes = new HashMap<>();
         private final Map<String, FieldInfo> postingsSuffixes = new HashMap<>();
+        private final Map<String, FieldInfo> vectorSuffixes = new HashMap<>();
 
         FieldLookup(FieldInfos fieldInfos) {
             for (FieldInfo field : fieldInfos) {
@@ -535,6 +580,10 @@ static class FieldLookup {
                     if (dvSuffix != null) {
                         dvSuffixes.put(dvSuffix, field);
                     }
+                    String vectorSuffix = attributes.get(PerFieldKnnVectorsFormat.PER_FIELD_SUFFIX_KEY);
+                    if (vectorSuffix != null) {
+                        vectorSuffixes.put(vectorSuffix, field);
+                    }
                 }
             }
         }
@@ -574,6 +623,13 @@ String getPostingsField(String fileName) {
             assertThat("postingsSuffixes[" + postingsSuffixes + "] fileName[" + fileName + "]", field, notNullValue());
             return field.name;
         }
+
+        String getVectorsField(String fileName) {
+            final String suffix = parseSuffix(fileName);
+            final FieldInfo field = vectorSuffixes.get(suffix);
+            assertThat("vectorSuffixes[" + vectorSuffixes + "] fileName[" + fileName + "]", field, notNullValue());
+            return field.name;
+        }
     }
 
     static void rewriteIndexWithPerFieldCodec(Directory source, CodecMode mode, Directory dst) throws IOException {
@@ -591,6 +647,11 @@ public DocValuesFormat getDocValuesFormatForField(String field) {
                 return new Lucene90DocValuesFormat();
             }
 
+            @Override
+            public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
+                return new Lucene92HnswVectorsFormat();
+            }
+
             @Override
             public String toString() {
                 return super.toString();
@@ -645,6 +706,7 @@ static void collectPerFieldStats(SegmentReader reader, IndexDiskUsageStats stats
                         stats.addStoredField("_all_stored_fields", bytes);
                     case TVX, TVD -> stats.addTermVectors("_all_vectors_fields", bytes);
                     case NVD, NVM -> stats.addNorms("_all_norms_fields", bytes);
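+                    // VEC files hold the raw vector data, VEM the per-field metadata, and VEX the HNSW graph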
logger.info("--> stats {}", stats); + + int dataBytes = numDocs * dimension * Float.BYTES; // size of flat vector data + int indexBytesEstimate = numDocs * Integer.BYTES * Lucene92HnswVectorsFormat.DEFAULT_MAX_CONN * 2; // rough size of HNSW graph + assertTrue(stats.total().getKnnVectorsBytes() > dataBytes + indexBytesEstimate); + } + } + public void testTriangle() throws Exception { try (Directory dir = createNewDirectory()) { final CodecMode codec = randomFrom(CodecMode.values()); @@ -488,6 +514,21 @@ static void addRandomTermVectors(Document doc) { } } + static void addRandomKnnVectors(Document doc) { + int numFields = randomFrom(1, 3); + for (int f = 0; f < numFields; f++) { + doc.add(new KnnVectorField("knnvector-" + f, randomVector(DEFAULT_VECTOR_DIMENSION))); + } + } + + private static float[] randomVector(int dimension) { + float[] vec = new float[dimension]; + for (int i = 0; i < vec.length; i++) { + vec[i] = randomFloat(); + } + return vec; + } + static void addRandomFields(Document doc) { if (randomBoolean()) { addRandomDocValuesField(doc); @@ -517,11 +558,15 @@ static void addRandomFields(Document doc) { if (randomBoolean()) { addRandomTermVectors(doc); } + if (randomBoolean()) { + addRandomKnnVectors(doc); + } } static class FieldLookup { private final Map dvSuffixes = new HashMap<>(); private final Map postingsSuffixes = new HashMap<>(); + private final Map vectorSuffixes = new HashMap<>(); FieldLookup(FieldInfos fieldInfos) { for (FieldInfo field : fieldInfos) { @@ -535,6 +580,10 @@ static class FieldLookup { if (dvSuffix != null) { dvSuffixes.put(dvSuffix, field); } + String vectorSuffix = attributes.get(PerFieldKnnVectorsFormat.PER_FIELD_SUFFIX_KEY); + if (vectorSuffix != null) { + vectorSuffixes.put(vectorSuffix, field); + } } } } @@ -574,6 +623,13 @@ String getPostingsField(String fileName) { assertThat("postingsSuffixes[" + postingsSuffixes + "] fileName[" + fileName + "]", field, notNullValue()); return field.name; } + + String getVectorsField(String fileName) { + final String suffix = parseSuffix(fileName); + final FieldInfo field = vectorSuffixes.get(suffix); + assertThat("vectorSuffixes[" + vectorSuffixes + "] fileName[" + fileName + "]", field, notNullValue()); + return field.name; + } } static void rewriteIndexWithPerFieldCodec(Directory source, CodecMode mode, Directory dst) throws IOException { @@ -591,6 +647,11 @@ public DocValuesFormat getDocValuesFormatForField(String field) { return new Lucene90DocValuesFormat(); } + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return new Lucene92HnswVectorsFormat(); + } + @Override public String toString() { return super.toString(); @@ -645,6 +706,7 @@ static void collectPerFieldStats(SegmentReader reader, IndexDiskUsageStats stats stats.addStoredField("_all_stored_fields", bytes); case TVX, TVD -> stats.addTermVectors("_all_vectors_fields", bytes); case NVD, NVM -> stats.addNorms("_all_norms_fields", bytes); + case VEM, VEC, VEX -> stats.addKnnVectors(fieldLookup.getVectorsField(file), bytes); } } } finally { @@ -674,6 +736,8 @@ private static void assertStats(IndexDiskUsageStats actualStats, IndexDiskUsageS 0.01, 2048 ); + + assertFieldStats(field, "knn vectors", actualField.getKnnVectorsBytes(), expectedField.getKnnVectorsBytes(), 0.01, 1024); } // We are not able to collect per field stats for stored, vector, points, and norms IndexDiskUsageStats.PerFieldDiskUsage actualTotal = actualStats.total();