Support kNN vectors in disk usage action (#88785)
This change adds support for kNN vector fields to the `_disk_usage` API. The
strategy:
* Iterate the vector values (using the same strategy as for doc values) to
estimate the vector data size
* Run some random vector searches to estimate the vector index size 
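
For reference, a minimal sketch of how a client might exercise the endpoint (plain JDK HTTP client; the index name `my-index` and the local address are assumptions, not part of this change):

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class DiskUsageExample {
    public static void main(String[] args) throws Exception {
        // run_expensive_tasks=true is required: the analysis reads through the
        // index and is too costly to run by default.
        HttpRequest request = HttpRequest.newBuilder()
            .uri(URI.create("http://localhost:9200/my-index/_disk_usage?run_expensive_tasks=true"))
            .POST(HttpRequest.BodyPublishers.noBody())
            .build();
        HttpResponse<String> response = HttpClient.newHttpClient()
            .send(request, HttpResponse.BodyHandlers.ofString());
        // With this change, the per-field breakdown in the response also
        // includes knn_vectors / knn_vectors_in_bytes.
        System.out.println(response.body());
    }
}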

Co-authored-by: Yannick Welsch <yannick@welsch.lu>

Closes #84801
jtibshirani committed Jul 26, 2022
1 parent 9b5cd67 commit abd561a
Showing 6 changed files with 198 additions and 7 deletions.
6 changes: 6 additions & 0 deletions docs/changelog/88785.yaml
@@ -0,0 +1,6 @@
pr: 88785
summary: Support kNN vectors in disk usage action
area: Search
type: enhancement
issues:
- 84801
16 changes: 12 additions & 4 deletions docs/reference/indices/diskusage.asciidoc
@@ -100,7 +100,9 @@ The API returns:
"norms": "2.3kb",
"norms_in_bytes": 2356,
"term_vectors": "2.2kb",
"term_vectors_in_bytes": 2310
"term_vectors_in_bytes": 2310,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"fields": {
"_id": {
@@ -119,7 +121,9 @@ The API returns:
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"_primary_term": {...},
"_seq_no": {...},
@@ -137,7 +141,9 @@ The API returns:
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"context": {
"total": "28.6mb",
@@ -155,7 +161,9 @@ The API returns:
"norms": "2.3kb",
"norms_in_bytes": 2356,
"term_vectors": "2.2kb",
"term_vectors_in_bytes": 2310
"term_vectors_in_bytes": 2310,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"context.keyword": {...},
"message": {...},
@@ -18,6 +18,7 @@ setup:
genre:
type: keyword
doc_values: true

- do:
index:
index: testindex
@@ -126,3 +127,52 @@ setup:
- gt: { testindex.all_fields.doc_values_in_bytes: 0 }
- gt: { testindex.all_fields.points_in_bytes: 0 }
- match: { testindex.all_fields.term_vectors_in_bytes: 0 }

---
"Dense vectors":
- skip:
version: " - 8.3.99"
reason: disk usage support for dense vectors was added in 8.4

- do:
indices.put_mapping:
index: testindex
body:
properties:
vector:
type: dense_vector
dims: 3
index: true
similarity: l2_norm
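# Note: index: true builds the ANN index structures that the size estimate
# probes; l2_norm scores neighbors by Euclidean distance.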
- do:
index:
index: testindex
body: { "quantity": 0 }
- do:
index:
index: testindex
body: { "quantity": 99, "vector": [0.1, 0.2, 0.3] }
- do:
index:
index: testindex
body: { "quantity": 1000, "vector": [1.3, 0.2, 4.5] }

- do:
indices.disk_usage:
index: testindex
run_expensive_tasks: true

- gt: { testindex.store_size_in_bytes: 100 }

# all_fields
- gt: { testindex.all_fields.knn_vectors_in_bytes: 0 }

# quantity
- gt: { testindex.fields.quantity.total_in_bytes: 0 }
- gt: { testindex.fields.quantity.points_in_bytes: 0 }
- match: { testindex.fields.quantity.knn_vectors_in_bytes: 0 }

# vector
- gt: { testindex.fields.vector.total_in_bytes: 0 }
- match: { testindex.fields.vector.points_in_bytes: 0 }
- gt: { testindex.fields.vector.knn_vectors_in_bytes: 0 }
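# With two indexed vectors of 3 float32 dimensions, the raw vector data is only
# 2 * 3 * 4 = 24 bytes; the ANN index structures account for the rest, hence
# the gt-0 assertions above.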
@@ -13,6 +13,7 @@
import org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.StoredFieldsReader;
@@ -37,6 +38,7 @@
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FilterDirectory;
@@ -118,6 +120,10 @@ void doAnalyze(IndexDiskUsageStats stats) throws IOException {
startTimeInNanos = System.nanoTime();
analyzeTermVectors(reader, stats);
executionTime.termVectorsTimeInNanos += System.nanoTime() - startTimeInNanos;

startTimeInNanos = System.nanoTime();
analyzeKnnVectors(reader, stats);
executionTime.knnVectorsTimeInNanos += System.nanoTime() - startTimeInNanos;
}
}
logger.debug("analyzing the disk usage took {} stats: {}", executionTime, stats);
@@ -510,6 +516,36 @@ void visitField(Fields vectors, String fieldName) throws IOException {
}
}

void analyzeKnnVectors(SegmentReader reader, IndexDiskUsageStats stats) throws IOException {
KnnVectorsReader vectorReader = reader.getVectorReader();
if (vectorReader == null) {
return;
}
for (FieldInfo field : reader.getFieldInfos()) {
cancellationChecker.checkForCancellation();
directory.resetBytesRead();
if (field.getVectorDimension() > 0) {
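// First pass: read every vector (mirroring the doc-values iteration) so the
// tracking directory observes the full extent of the raw vector data.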
iterateDocValues(reader.maxDoc(), () -> vectorReader.getVectorValues(field.name), vectors -> {
cancellationChecker.logEvent();
vectors.vectorValue();
});

// do a couple of randomized searches to figure out min and max offsets of index file
VectorValues vectorValues = vectorReader.getVectorValues(field.name);
int numDocsToVisit = reader.maxDoc() < 10 ? reader.maxDoc() : 10 * (int) Math.log10(reader.maxDoc());
int skipFactor = Math.max(reader.maxDoc() / numDocsToVisit, 1);
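// e.g. maxDoc = 1_000_000 gives numDocsToVisit = 10 * 6 = 60 and
// skipFactor = 16_666, spreading the roughly 60 sampled searches
// across the whole segment.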
for (int i = 0; i < reader.maxDoc(); i += skipFactor) {
if ((i = vectorValues.advance(i)) == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
cancellationChecker.checkForCancellation();
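// Each probe search reads through the vector index structures, widening
// the min/max read offsets the tracking directory uses for the estimate.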
vectorReader.search(field.name, vectorValues.vectorValue(), 100, null, Integer.MAX_VALUE);
}
stats.addKnnVectors(field.name, directory.getBytesRead());
}
}
}

private static class TrackingReadBytesDirectory extends FilterDirectory {
private final Map<String, BytesReadTracker> trackers = new HashMap<>();

@@ -602,7 +638,7 @@ public void readBytes(byte[] b, int offset, int len) throws IOException {
}

/**
-* Lucene Codec organizes data field by field for doc values, points, postings, and norms; and document by document
+* Lucene Codec organizes data field by field for doc values, points, postings, vectors, and norms; and document by document
* for stored fields and term vectors. BytesReadTracker then can simply track the min and max read positions.
* This would allow us to traverse only two ends of each partition.
*/
Expand Down Expand Up @@ -698,10 +734,11 @@ private static class ExecutionTime {
long pointsTimeInNanos;
long normsTimeInNanos;
long termVectorsTimeInNanos;
long knnVectorsTimeInNanos;

long totalInNanos() {
return invertedIndexTimeInNanos + storedFieldsTimeInNanos + docValuesTimeInNanos + pointsTimeInNanos + normsTimeInNanos
-    + termVectorsTimeInNanos;
+    + termVectorsTimeInNanos + knnVectorsTimeInNanos;
}

@Override
@@ -726,6 +763,9 @@ public String toString() {
+ "ms"
+ ", term vectors: "
+ termVectorsTimeInNanos / 1000_000
+ "ms"
+ ", knn vectors: "
+ knnVectorsTimeInNanos / 1000_000
+ "ms";
}
}
@@ -8,6 +8,7 @@

package org.elasticsearch.action.admin.indices.diskusage;

import org.elasticsearch.Version;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
@@ -39,6 +40,8 @@ public final class IndexDiskUsageStats implements ToXContentFragment, Writeable
public static final String NORMS_IN_BYTES = "norms_in_bytes";
public static final String TERM_VECTORS = "term_vectors";
public static final String TERM_VECTORS_IN_BYTES = "term_vectors_in_bytes";
public static final String KNN_VECTORS = "knn_vectors";
public static final String KNN_VECTORS_IN_BYTES = "knn_vectors_in_bytes";

public static final String STORE_SIZE = "store_size";
public static final String STORE_SIZE_IN_BYTES = "store_size_in_bytes";
@@ -119,6 +122,11 @@ public void addTermVectors(String fieldName, long bytes) {
getOrAdd(fieldName).termVectorsBytes += bytes;
}

public void addKnnVectors(String fieldName, long bytes) {
checkByteSize(bytes);
getOrAdd(fieldName).knnVectorsBytes += bytes;
}

public IndexDiskUsageStats add(IndexDiskUsageStats other) {
other.fields.forEach((k, v) -> getOrAdd(k).add(v));
this.indexSizeInBytes += other.indexSizeInBytes;
@@ -168,6 +176,7 @@ public static final class PerFieldDiskUsage implements ToXContentFragment, Writeable
private long pointsBytes;
private long normsBytes;
private long termVectorsBytes;
private long knnVectorsBytes;

private PerFieldDiskUsage() {

@@ -180,6 +189,9 @@ private PerFieldDiskUsage(StreamInput in) throws IOException {
pointsBytes = in.readVLong();
normsBytes = in.readVLong();
termVectorsBytes = in.readVLong();
if (in.getVersion().onOrAfter(Version.V_8_4_0)) {
knnVectorsBytes = in.readVLong();
}
}

@Override
@@ -190,6 +202,9 @@ public void writeTo(StreamOutput out) throws IOException {
out.writeVLong(pointsBytes);
out.writeVLong(normsBytes);
out.writeVLong(termVectorsBytes);
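// Only exchange the new stat when the remote node also understands it
// (8.4.0+); this keeps the wire format compatible during rolling upgrades.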
if (out.getVersion().onOrAfter(Version.V_8_4_0)) {
out.writeVLong(knnVectorsBytes);
}
}

private void add(PerFieldDiskUsage other) {
@@ -199,6 +214,7 @@ private void add(PerFieldDiskUsage other) {
pointsBytes += other.pointsBytes;
normsBytes += other.normsBytes;
termVectorsBytes += other.termVectorsBytes;
knnVectorsBytes += other.knnVectorsBytes;
}

public long getInvertedIndexBytes() {
@@ -225,8 +241,12 @@ public long getTermVectorsBytes() {
return termVectorsBytes;
}

public long getKnnVectorsBytes() {
return knnVectorsBytes;
}

long totalBytes() {
-return invertedIndexBytes + storedFieldBytes + docValuesBytes + pointsBytes + normsBytes + termVectorsBytes;
+return invertedIndexBytes + storedFieldBytes + docValuesBytes + pointsBytes + normsBytes + termVectorsBytes + knnVectorsBytes;
}

@Override
@@ -254,6 +274,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {

builder.field(TERM_VECTORS, new ByteSizeValue(termVectorsBytes));
builder.field(TERM_VECTORS_IN_BYTES, termVectorsBytes);

builder.field(KNN_VECTORS, new ByteSizeValue(knnVectorsBytes));
builder.field(KNN_VECTORS_IN_BYTES, knnVectorsBytes);
return builder;
}

