Support kNN vectors in disk usage action #88785

Merged (18 commits) on Jul 26, 2022
6 changes: 6 additions & 0 deletions docs/changelog/88785.yaml
@@ -0,0 +1,6 @@
pr: 88785
summary: Support kNN vectors in disk usage action
area: Search
type: enhancement
issues:
- 84801
16 changes: 12 additions & 4 deletions docs/reference/indices/diskusage.asciidoc
@@ -100,7 +100,9 @@ The API returns:
"norms": "2.3kb",
"norms_in_bytes": 2356,
"term_vectors": "2.2kb",
"term_vectors_in_bytes": 2310
"term_vectors_in_bytes": 2310,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"fields": {
"_id": {
@@ -119,7 +121,9 @@ The API returns:
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"_primary_term": {...},
"_seq_no": {...},
@@ -137,7 +141,9 @@ The API returns:
"norms": "0b",
"norms_in_bytes": 0,
"term_vectors": "0b",
"term_vectors_in_bytes": 0
"term_vectors_in_bytes": 0,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"context": {
"total": "28.6mb",
@@ -155,7 +161,9 @@ The API returns:
"norms": "2.3kb",
"norms_in_bytes": 2356,
"term_vectors": "2.2kb",
"term_vectors_in_bytes": 2310
"term_vectors_in_bytes": 2310,
"knn_vectors": "0b",
"knn_vectors_in_bytes": 0
},
"context.keyword": {...},
"message": {...},
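For context, the response above comes from the analyze index disk usage API (POST /<index>/_disk_usage?run_expensive_tasks=true). A minimal sketch of fetching it with the low-level Java REST client, assuming a locally running cluster and an illustrative index name (not part of this change):

import org.apache.http.HttpHost;
import org.apache.http.util.EntityUtils;
import org.elasticsearch.client.Request;
import org.elasticsearch.client.Response;
import org.elasticsearch.client.RestClient;

public class DiskUsageExample {
    public static void main(String[] args) throws Exception {
        try (RestClient client = RestClient.builder(new HttpHost("localhost", 9200, "http")).build()) {
            Request request = new Request("POST", "/my-index/_disk_usage");
            // The API only runs when this flag is set explicitly.
            request.addParameter("run_expensive_tasks", "true");
            Response response = client.performRequest(request);
            // The JSON body now contains per-field "knn_vectors" / "knn_vectors_in_bytes" entries.
            System.out.println(EntityUtils.toString(response.getEntity()));
        }
    }
}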
@@ -18,26 +18,31 @@ setup:
genre:
type: keyword
doc_values: true
vector:
type: dense_vector
dims: 3
index: true
similarity: l2_norm
- do:
index:
index: testindex
body: { "name": "foo", "quantity": 0, "genre": [ "rock", "pop" ] }
- do:
index:
index: testindex
body: { "name": "bar", "quantity": 99, "genre": "pop" }
body: { "name": "bar", "quantity": 99, "genre": "pop", "vector": [0.1, 0.2, 0.3] }
- do:
index:
index: testindex
body: { "name": "baz", "quantity": 50, "genre": "jazz" }
- do:
index:
index: testindex
body: { "name": "bar & baz", "quantity": 1000, "genre": "blue" }
body: { "name": "bar & baz", "quantity": 1000, "genre": "blue", "vector": [0.1, 1.2, 2.4] }
- do:
index:
index: testindex
body: { "name": "foobar", "quantity": 1000, "genre": "country" }
body: { "name": "foobar", "quantity": 1000, "genre": "country", "vector": [1.3, 0.2, 4.5] }

---
"Name the index":
@@ -81,6 +86,15 @@ setup:
- match: { testindex.fields.quantity.norms_in_bytes: 0 }
- match: { testindex.fields.quantity.term_vectors_in_bytes: 0 }

# vector
- gt: { testindex.fields.vector.total_in_bytes: 0 }
- match: { testindex.fields.vector.inverted_index.total_in_bytes: 0 }
- match: { testindex.fields.vector.stored_fields_in_bytes: 0 }
- match: { testindex.fields.vector.doc_values_in_bytes: 0 }
- match: { testindex.fields.vector.points_in_bytes: 0 }
- match: { testindex.fields.vector.norms_in_bytes: 0 }
- match: { testindex.fields.vector.term_vectors_in_bytes: 0 }

# _source
- gt: { testindex.fields._source.total_in_bytes: 0 }
- match: { testindex.fields._source.inverted_index.total_in_bytes: 0 }
@@ -126,3 +140,29 @@ setup:
- gt: { testindex.all_fields.doc_values_in_bytes: 0 }
- gt: { testindex.all_fields.points_in_bytes: 0 }
- match: { testindex.all_fields.term_vectors_in_bytes: 0 }

---
"Dense vectors":
- skip:
version: " - 8.3.99"
reason: dense vector support was added in 8.4

- do:
indices.disk_usage:
index: testindex
run_expensive_tasks: true

- gt: { testindex.store_size_in_bytes: 100 }

# all_fields
- gt: { testindex.all_fields.knn_vectors_in_bytes: 0 }

# quantity
- gt: { testindex.fields.quantity.total_in_bytes: 0 }
- gt: { testindex.fields.quantity.points_in_bytes: 0 }
- match: { testindex.fields.quantity.knn_vectors_in_bytes: 0 }

# vector
- gt: { testindex.fields.vector.total_in_bytes: 0 }
- match: { testindex.fields.vector.points_in_bytes: 0 }
- gt: { testindex.fields.vector.knn_vectors_in_bytes: 0 }
@@ -13,6 +13,7 @@
import org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.StoredFieldsReader;
@@ -37,6 +38,7 @@
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FilterDirectory;
@@ -118,6 +120,10 @@ void doAnalyze(IndexDiskUsageStats stats) throws IOException {
startTimeInNanos = System.nanoTime();
analyzeTermVectors(reader, stats);
executionTime.termVectorsTimeInNanos += System.nanoTime() - startTimeInNanos;

startTimeInNanos = System.nanoTime();
analyzeKnnVectors(reader, stats);
executionTime.knnVectorsTimeInNanos += System.nanoTime() - startTimeInNanos;
}
}
logger.debug("analyzing the disk usage took {} stats: {}", executionTime, stats);
@@ -510,6 +516,36 @@ void visitField(Fields vectors, String fieldName) throws IOException {
}
}

void analyzeKnnVectors(SegmentReader reader, IndexDiskUsageStats stats) throws IOException {
KnnVectorsReader vectorReader = reader.getVectorReader();
if (vectorReader == null) {
return;
}
for (FieldInfo field : reader.getFieldInfos()) {
cancellationChecker.checkForCancellation();
directory.resetBytesRead();
if (field.getVectorDimension() > 0) {
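// First pass: read every vector value so the tracking directory records the bytes pulled from the vector data files.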
iterateDocValues(reader.maxDoc(), () -> vectorReader.getVectorValues(field.name), vectors -> {
cancellationChecker.logEvent();
vectors.vectorValue();
});

// do a couple of randomized searches to figure out min and max offsets of index file
VectorValues vectorValues = vectorReader.getVectorValues(field.name);
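// Sample roughly 10 * log10(maxDoc) documents (all of them for tiny segments), spread evenly across the doc ID space, and run a kNN search for each so the reads also touch the vector index structures.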
int numDocsToVisit = reader.maxDoc() < 10 ? reader.maxDoc() : 10 * (int) Math.log10(reader.maxDoc());
int skipFactor = Math.max(reader.maxDoc() / numDocsToVisit, 1);
for (int i = 0; i < reader.maxDoc(); i += skipFactor) {
if ((i = vectorValues.advance(i)) == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
cancellationChecker.checkForCancellation();
vectorReader.search(field.name, vectorValues.vectorValue(), 100, null, Integer.MAX_VALUE);
}
stats.addKnnVectors(field.name, directory.getBytesRead());
}
}
}

private static class TrackingReadBytesDirectory extends FilterDirectory {
private final Map<String, BytesReadTracker> trackers = new HashMap<>();

@@ -602,7 +638,7 @@ public void readBytes(byte[] b, int offset, int len) throws IOException {
}

/**
* Lucene Codec organizes data field by field for doc values, points, postings, and norms; and document by document
* Lucene Codec organizes data field by field for doc values, points, postings, vectors, and norms; and document by document
* for stored fields and term vectors. BytesReadTracker then can simply track the min and max read positions.
* This would allow us to traverse only two ends of each partition.
*/
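To make the idea above concrete, here is a simplified, hypothetical sketch of min/max read-position tracking (illustrative only; it is not the BytesReadTracker implementation in this change):

class MinMaxReadTracker {
    private long minPosition = Long.MAX_VALUE;
    private long maxPosition = -1;

    // Record a read of `length` bytes starting at `position`.
    void trackRead(long position, int length) {
        minPosition = Math.min(minPosition, position);
        maxPosition = Math.max(maxPosition, position + length);
    }

    // Because the codec writes each field's data contiguously, the distance between
    // the lowest and highest positions read approximates the field's on-disk footprint.
    long bytesRead() {
        return maxPosition >= minPosition ? maxPosition - minPosition : 0L;
    }
}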
@@ -698,10 +734,11 @@ private static class ExecutionTime {
long pointsTimeInNanos;
long normsTimeInNanos;
long termVectorsTimeInNanos;
long knnVectorsTimeInNanos;

long totalInNanos() {
return invertedIndexTimeInNanos + storedFieldsTimeInNanos + docValuesTimeInNanos + pointsTimeInNanos + normsTimeInNanos
+ termVectorsTimeInNanos;
+ termVectorsTimeInNanos + knnVectorsTimeInNanos;
}

@Override
@@ -726,6 +763,9 @@ public String toString() {
+ "ms"
+ ", term vectors: "
+ termVectorsTimeInNanos / 1000_000
+ "ms"
+ ", knn vectors: "
+ knnVectorsTimeInNanos / 1000_000
+ "ms";
}
}
@@ -8,6 +8,7 @@

package org.elasticsearch.action.admin.indices.diskusage;

import org.elasticsearch.Version;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
@@ -39,6 +40,8 @@ public final class IndexDiskUsageStats implements ToXContentFragment, Writeable
public static final String NORMS_IN_BYTES = "norms_in_bytes";
public static final String TERM_VECTORS = "term_vectors";
public static final String TERM_VECTORS_IN_BYTES = "term_vectors_in_bytes";
public static final String KNN_VECTORS = "knn_vectors";
public static final String KNN_VECTORS_IN_BYTES = "knn_vectors_in_bytes";

public static final String STORE_SIZE = "store_size";
public static final String STORE_SIZE_IN_BYTES = "store_size_in_bytes";
@@ -119,6 +122,11 @@ public void addTermVectors(String fieldName, long bytes) {
getOrAdd(fieldName).termVectorsBytes += bytes;
}

public void addKnnVectors(String fieldName, long bytes) {
checkByteSize(bytes);
getOrAdd(fieldName).knnVectorsBytes += bytes;
}

public IndexDiskUsageStats add(IndexDiskUsageStats other) {
other.fields.forEach((k, v) -> getOrAdd(k).add(v));
this.indexSizeInBytes += other.indexSizeInBytes;
@@ -168,6 +176,7 @@ public static final class PerFieldDiskUsage implements ToXContentFragment, Writeable {
private long pointsBytes;
private long normsBytes;
private long termVectorsBytes;
private long knnVectorsBytes;

private PerFieldDiskUsage() {

@@ -180,6 +189,9 @@ private PerFieldDiskUsage(StreamInput in) throws IOException {
pointsBytes = in.readVLong();
normsBytes = in.readVLong();
termVectorsBytes = in.readVLong();
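// kNN vector stats were added in 8.4.0; older nodes do not include them on the wire.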
if (in.getVersion().onOrAfter(Version.V_8_4_0)) {
knnVectorsBytes = in.readVLong();
}
}

@Override
@@ -190,6 +202,9 @@ public void writeTo(StreamOutput out) throws IOException {
out.writeVLong(pointsBytes);
out.writeVLong(normsBytes);
out.writeVLong(termVectorsBytes);
if (out.getVersion().onOrAfter(Version.V_8_4_0)) {
out.writeVLong(knnVectorsBytes);
}
}

private void add(PerFieldDiskUsage other) {
@@ -199,6 +214,7 @@ private void add(PerFieldDiskUsage other) {
pointsBytes += other.pointsBytes;
normsBytes += other.normsBytes;
termVectorsBytes += other.termVectorsBytes;
knnVectorsBytes += other.knnVectorsBytes;
}

public long getInvertedIndexBytes() {
@@ -225,8 +241,12 @@ public long getTermVectorsBytes() {
return termVectorsBytes;
}

public long getKnnVectorsBytes() {
return knnVectorsBytes;
}

long totalBytes() {
return invertedIndexBytes + storedFieldBytes + docValuesBytes + pointsBytes + normsBytes + termVectorsBytes;
return invertedIndexBytes + storedFieldBytes + docValuesBytes + pointsBytes + normsBytes + termVectorsBytes + knnVectorsBytes;
}

@Override
@@ -254,6 +274,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {

builder.field(TERM_VECTORS, new ByteSizeValue(termVectorsBytes));
builder.field(TERM_VECTORS_IN_BYTES, termVectorsBytes);

builder.field(KNN_VECTORS, new ByteSizeValue(knnVectorsBytes));
builder.field(KNN_VECTORS_IN_BYTES, knnVectorsBytes);
return builder;
}
