LUCENE-9378: Make it possible to configure how to trade speed for compression on doc values. (apache#2069)

This adds a switch to `Lucene80DocValuesFormat` that makes it possible to
configure whether to prioritize retrieval speed over compression ratio or
the other way around. When retrieval speed is prioritized, binary doc
values are written in the exact same format that was used before the more
aggressive compression was introduced.
jpountz authored and epugh@opensourceconnections.com committed Jan 15, 2021
1 parent 5c40b06 commit 4f6b7fd
Showing 14 changed files with 247 additions and 36 deletions.
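
For context, here is a minimal sketch of how the new mode could be selected when configuring an index writer. It is not part of this commit: the `IndexWriterConfig`/`FSDirectory` setup is standard Lucene API, and the `org.apache.lucene.codecs.lucene90` package location of `Lucene90Codec` is assumed.

```java
// Illustrative only (not part of this commit): selecting the new
// speed-vs-compression trade-off for doc values via the codec mode.
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.lucene90.Lucene90Codec;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class DocValuesCompressionExample {
  public static void main(String[] args) throws Exception {
    try (Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"))) {
      IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
      // BEST_COMPRESSION trades retrieval speed for a smaller index;
      // BEST_SPEED keeps the pre-compression binary doc-values layout.
      config.setCodec(new Lucene90Codec(Lucene90Codec.Mode.BEST_COMPRESSION));
      try (IndexWriter writer = new IndexWriter(dir, config)) {
        // Index documents as usual; newly flushed/merged segments use the
        // selected stored-fields and doc-values modes.
      }
    }
  }
}
```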
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
@@ -216,6 +216,9 @@ New Features

* LUCENE-9553: New XYPoint query that accepts an array of XYGeometries. (Ignacio Vera)

* LUCENE-9378: Doc values now allow configuring how to trade compression for
retrieval speed. (Adrien Grand)

Improvements
---------------------

Lucene87Codec.java
@@ -17,6 +17,8 @@

package org.apache.lucene.backward_codecs.lucene87;

import java.util.Objects;

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
@@ -34,6 +36,7 @@
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat;
import org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat;
import org.apache.lucene.codecs.lucene80.Lucene80NormsFormat;
import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat;
import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat;
@@ -54,6 +57,23 @@
* @lucene.experimental
*/
public class Lucene87Codec extends Codec {

/** Configuration option for the codec. */
public static enum Mode {
/** Trade compression ratio for retrieval speed. */
BEST_SPEED(Lucene87StoredFieldsFormat.Mode.BEST_SPEED, Lucene80DocValuesFormat.Mode.BEST_SPEED),
/** Trade retrieval speed for compression ratio. */
BEST_COMPRESSION(Lucene87StoredFieldsFormat.Mode.BEST_COMPRESSION, Lucene80DocValuesFormat.Mode.BEST_COMPRESSION);

private final Lucene87StoredFieldsFormat.Mode storedMode;
private final Lucene80DocValuesFormat.Mode dvMode;

private Mode(Lucene87StoredFieldsFormat.Mode storedMode, Lucene80DocValuesFormat.Mode dvMode) {
this.storedMode = Objects.requireNonNull(storedMode);
this.dvMode = Objects.requireNonNull(dvMode);
}
}

private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
private final FieldInfosFormat fieldInfosFormat = new Lucene60FieldInfosFormat();
private final SegmentInfoFormat segmentInfosFormat = new Lucene86SegmentInfoFormat();
@@ -85,6 +105,7 @@ public Lucene87Codec() {
super("Lucene87");
this.storedFieldsFormat = new Lucene87StoredFieldsFormat();
this.defaultFormat = new Lucene84PostingsFormat();
this.defaultDVFormat = new Lucene80DocValuesFormat();
}

@Override
@@ -161,7 +182,7 @@ public final DocValuesFormat docValuesFormat() {
return docValuesFormat;
}

private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene80");
private final DocValuesFormat defaultDVFormat;

private final NormsFormat normsFormat = new Lucene80NormsFormat();

Lucene80DocValuesConsumer.java
@@ -64,12 +64,14 @@
/** writer for {@link Lucene80DocValuesFormat} */
final class Lucene80DocValuesConsumer extends DocValuesConsumer implements Closeable {

final Lucene80DocValuesFormat.Mode mode;
IndexOutput data, meta;
final int maxDoc;
private final SegmentWriteState state;

/** expert: Creates a new writer */
public Lucene80DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
public Lucene80DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension, Lucene80DocValuesFormat.Mode mode) throws IOException {
this.mode = mode;
boolean success = false;
try {
this.state = state;
@@ -490,13 +492,86 @@ public void close() throws IOException {
}

}


@Override
public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
meta.writeInt(field.number);
meta.writeByte(Lucene80DocValuesFormat.BINARY);

switch (mode) {
case BEST_SPEED:
meta.writeByte((byte) 0);
doAddUncompressedBinaryField(field, valuesProducer);
break;
case BEST_COMPRESSION:
meta.writeByte((byte) 1);
doAddCompressedBinaryField(field, valuesProducer);
break;
default:
throw new AssertionError();
}
}

private void doAddUncompressedBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
BinaryDocValues values = valuesProducer.getBinary(field);
long start = data.getFilePointer();
meta.writeLong(start); // dataOffset
int numDocsWithField = 0;
int minLength = Integer.MAX_VALUE;
int maxLength = 0;
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
numDocsWithField++;
BytesRef v = values.binaryValue();
int length = v.length;
data.writeBytes(v.bytes, v.offset, v.length);
minLength = Math.min(length, minLength);
maxLength = Math.max(length, maxLength);
}
assert numDocsWithField <= maxDoc;
meta.writeLong(data.getFilePointer() - start); // dataLength

if (numDocsWithField == 0) {
meta.writeLong(-2); // docsWithFieldOffset
meta.writeLong(0L); // docsWithFieldLength
meta.writeShort((short) -1); // jumpTableEntryCount
meta.writeByte((byte) -1); // denseRankPower
} else if (numDocsWithField == maxDoc) {
meta.writeLong(-1); // docsWithFieldOffset
meta.writeLong(0L); // docsWithFieldLength
meta.writeShort((short) -1); // jumpTableEntryCount
meta.writeByte((byte) -1); // denseRankPower
} else {
long offset = data.getFilePointer();
meta.writeLong(offset); // docsWithFieldOffset
values = valuesProducer.getBinary(field);
final short jumpTableEntryCount = IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
meta.writeShort(jumpTableEntryCount);
meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
}

meta.writeInt(numDocsWithField);
meta.writeInt(minLength);
meta.writeInt(maxLength);
if (maxLength > minLength) {
start = data.getFilePointer();
meta.writeLong(start);
meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);

final DirectMonotonicWriter writer = DirectMonotonicWriter.getInstance(meta, data, numDocsWithField + 1, DIRECT_MONOTONIC_BLOCK_SHIFT);
long addr = 0;
writer.add(addr);
values = valuesProducer.getBinary(field);
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
addr += values.binaryValue().length;
writer.add(addr);
}
writer.finish();
meta.writeLong(data.getFilePointer() - start);
}
}

private void doAddCompressedBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
try (CompressedBinaryBlockWriter blockWriter = new CompressedBinaryBlockWriter()){
BinaryDocValues values = valuesProducer.getBinary(field);
long start = data.getFilePointer();
@@ -542,7 +617,6 @@ public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) th
meta.writeInt(maxLength);

blockWriter.writeMetaData();

}

}
Lucene80DocValuesFormat.java
@@ -18,6 +18,7 @@


import java.io.IOException;
import java.util.Objects;

import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesFormat;
@@ -131,14 +132,30 @@
*/
public final class Lucene80DocValuesFormat extends DocValuesFormat {

/** Sole Constructor */
/** Configuration option for doc values. */
public static enum Mode {
/** Trade compression ratio for retrieval speed. */
BEST_SPEED,
/** Trade retrieval speed for compression ratio. */
BEST_COMPRESSION
}

private final Mode mode;

/** Default constructor. */
public Lucene80DocValuesFormat() {
this(Mode.BEST_SPEED);
}

/** Constructor */
public Lucene80DocValuesFormat(Mode mode) {
super("Lucene80");
this.mode = Objects.requireNonNull(mode);
}

@Override
public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
return new Lucene80DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
return new Lucene80DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION, mode);
}

@Override
@@ -152,7 +169,8 @@ public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOExcepti
static final String META_EXTENSION = "dvm";
static final int VERSION_START = 0;
static final int VERSION_BIN_COMPRESSED = 1;
static final int VERSION_CURRENT = VERSION_BIN_COMPRESSED;
static final int VERSION_CONFIGURABLE_COMPRESSION = 2;
static final int VERSION_CURRENT = VERSION_CONFIGURABLE_COMPRESSION;

// indicates docvalues type
static final byte NUMERIC = 0;
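
Because `Lucene80DocValuesFormat` now exposes a `Mode` constructor and `Lucene90Codec.getDocValuesFormatForField` is public, the trade-off can also be chosen per field. The following sketch is illustrative only and not from this commit; the field name and the anonymous-subclass pattern are assumptions, as is the `org.apache.lucene.codecs.lucene90` package location of `Lucene90Codec`.

```java
// Illustrative per-field wiring, not part of this commit: store one
// binary-heavy field with BEST_COMPRESSION doc values and everything
// else with BEST_SPEED.
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat;
import org.apache.lucene.codecs.lucene90.Lucene90Codec;

public class PerFieldDocValuesModeExample {
  public static Codec perFieldCodec() {
    DocValuesFormat fast =
        new Lucene80DocValuesFormat(Lucene80DocValuesFormat.Mode.BEST_SPEED);
    DocValuesFormat compact =
        new Lucene80DocValuesFormat(Lucene80DocValuesFormat.Mode.BEST_COMPRESSION);
    return new Lucene90Codec() {
      @Override
      public DocValuesFormat getDocValuesFormatForField(String field) {
        // "large_blob" is a made-up field name for illustration.
        return "large_blob".equals(field) ? compact : fast;
      }
    };
  }
}
```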
Lucene80DocValuesProducer.java
@@ -174,6 +174,20 @@ private void readNumeric(ChecksumIndexInput meta, NumericEntry entry) throws IOE

private BinaryEntry readBinary(ChecksumIndexInput meta) throws IOException {
BinaryEntry entry = new BinaryEntry();
if (version >= Lucene80DocValuesFormat.VERSION_CONFIGURABLE_COMPRESSION) {
int b = meta.readByte();
switch (b) {
case 0:
case 1:
// valid
break;
default:
throw new CorruptIndexException("Unexpected byte: " + b + ", expected 0 or 1", meta);
}
entry.compressed = b != 0;
} else {
entry.compressed = version >= Lucene80DocValuesFormat.VERSION_BIN_COMPRESSED;
}
entry.dataOffset = meta.readLong();
entry.dataLength = meta.readLong();
entry.docsWithFieldOffset = meta.readLong();
@@ -183,19 +197,19 @@ private BinaryEntry readBinary(ChecksumIndexInput meta) throws IOException {
entry.numDocsWithField = meta.readInt();
entry.minLength = meta.readInt();
entry.maxLength = meta.readInt();
if ((version >= Lucene80DocValuesFormat.VERSION_BIN_COMPRESSED && entry.numDocsWithField > 0) || entry.minLength < entry.maxLength) {
if ((entry.compressed && entry.numDocsWithField > 0) || entry.minLength < entry.maxLength) {
entry.addressesOffset = meta.readLong();

// Old count of uncompressed addresses
long numAddresses = entry.numDocsWithField + 1L;
// New count of compressed addresses - the number of compresseed blocks
if (version >= Lucene80DocValuesFormat.VERSION_BIN_COMPRESSED) {
if (entry.compressed) {
entry.numCompressedChunks = meta.readVInt();
entry.docsPerChunkShift = meta.readVInt();
entry.maxUncompressedChunkSize = meta.readVInt();
numAddresses = entry.numCompressedChunks;
}

final int blockShift = meta.readVInt();
entry.addressesMeta = DirectMonotonicReader.loadMeta(meta, numAddresses, blockShift);
ramBytesUsed += entry.addressesMeta.ramBytesUsed();
@@ -303,6 +317,7 @@ private static class NumericEntry {
}

private static class BinaryEntry {
boolean compressed;
long dataOffset;
long dataLength;
long docsWithFieldOffset;
@@ -680,9 +695,7 @@ public boolean advanceExact(int target) throws IOException {
}
}

// BWC - old binary format
private BinaryDocValues getUncompressedBinary(FieldInfo field) throws IOException {
BinaryEntry entry = binaries.get(field.name);
private BinaryDocValues getUncompressedBinary(BinaryEntry entry) throws IOException {
if (entry.docsWithFieldOffset == -2) {
return DocValues.emptyBinary();
}
@@ -844,11 +857,16 @@ BytesRef decode(int docNumber) throws IOException {

@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
if (version < Lucene80DocValuesFormat.VERSION_BIN_COMPRESSED) {
return getUncompressedBinary(field);
BinaryEntry entry = binaries.get(field.name);
if (entry.compressed) {
return getCompressedBinary(entry);
} else {
return getUncompressedBinary(entry);
}
}

private BinaryDocValues getCompressedBinary(BinaryEntry entry) throws IOException {

BinaryEntry entry = binaries.get(field.name);
if (entry.docsWithFieldOffset == -2) {
return DocValues.emptyBinary();
}
Lucene90Codec.java
@@ -34,6 +34,7 @@
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
import org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat;
import org.apache.lucene.codecs.lucene80.Lucene80NormsFormat;
import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat;
import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat;
@@ -53,6 +54,23 @@
* @lucene.experimental
*/
public class Lucene90Codec extends Codec {

/** Configuration option for the codec. */
public static enum Mode {
/** Trade compression ratio for retrieval speed. */
BEST_SPEED(Lucene87StoredFieldsFormat.Mode.BEST_SPEED, Lucene80DocValuesFormat.Mode.BEST_SPEED),
/** Trade retrieval speed for compression ratio. */
BEST_COMPRESSION(Lucene87StoredFieldsFormat.Mode.BEST_COMPRESSION, Lucene80DocValuesFormat.Mode.BEST_COMPRESSION);

private final Lucene87StoredFieldsFormat.Mode storedMode;
private final Lucene80DocValuesFormat.Mode dvMode;

private Mode(Lucene87StoredFieldsFormat.Mode storedMode, Lucene80DocValuesFormat.Mode dvMode) {
this.storedMode = Objects.requireNonNull(storedMode);
this.dvMode = Objects.requireNonNull(dvMode);
}
}

private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
private final FieldInfosFormat fieldInfosFormat = new Lucene90FieldInfosFormat();
private final SegmentInfoFormat segmentInfosFormat = new Lucene86SegmentInfoFormat();
@@ -82,7 +100,7 @@ public DocValuesFormat getDocValuesFormatForField(String field) {
* Instantiates a new codec.
*/
public Lucene90Codec() {
this(Lucene87StoredFieldsFormat.Mode.BEST_SPEED);
this(Mode.BEST_SPEED);
}

/**
@@ -91,10 +109,11 @@ public Lucene90Codec() {
* @param mode stored fields compression mode to use for newly
* flushed/merged segments.
*/
public Lucene90Codec(Lucene87StoredFieldsFormat.Mode mode) {
public Lucene90Codec(Mode mode) {
super("Lucene90");
this.storedFieldsFormat = new Lucene87StoredFieldsFormat(Objects.requireNonNull(mode));
this.storedFieldsFormat = new Lucene87StoredFieldsFormat(Objects.requireNonNull(mode).storedMode);
this.defaultFormat = new Lucene84PostingsFormat();
this.defaultDVFormat = new Lucene80DocValuesFormat(mode.dvMode);
}

@Override
@@ -172,7 +191,7 @@ public final DocValuesFormat docValuesFormat() {
return docValuesFormat;
}

private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene80");
private final DocValuesFormat defaultDVFormat;

private final NormsFormat normsFormat = new Lucene80NormsFormat();
