Skip optimization if the index has duplicate data #43121

Merged
@@ -236,6 +236,9 @@ static boolean execute(SearchContext searchContext,
System.arraycopy(oldFormats, 0, newFormats, 1, oldFormats.length);
sortAndFormatsForRewrittenNumericSort = searchContext.sort(); // stash SortAndFormats to restore it later
searchContext.sort(new SortAndFormats(new Sort(newSortFields), newFormats));
if (LOGGER.isTraceEnabled()) {
LOGGER.trace("Sort optimization on the field [" + oldSortFields[0].getField() + "] was enabled!");
}
Contributor:
Not sure that this helps the debugging ;)

Contributor Author:
@jimczi What would be the way to see if the optimization was used? Is LOGGER.trace not a good way?

}
}

@@ -387,8 +390,12 @@ private static Query tryRewriteLongSort(SearchContext searchContext, IndexReader
((sortField.getReverse() == false) && (missingValue == Long.MAX_VALUE));
if (missingValuesAccordingToSort == false) return null;

int docCount = PointValues.getDocCount(reader, fieldName);
// it is not worth running the optimization on a small index
if (docCount <= 512) return null;
jimczi marked this conversation as resolved.

// check for multiple values
if (PointValues.size(reader, fieldName) != PointValues.getDocCount(reader, fieldName)) return null; //TODO: handle multiple values
if (PointValues.size(reader, fieldName) != docCount) return null; //TODO: handle multiple values

// check if the optimization makes sense with the track_total_hits setting
if (searchContext.trackTotalHitsUpTo() == Integer.MAX_VALUE) {
@@ -408,6 +415,7 @@ private static Query tryRewriteLongSort(SearchContext searchContext, IndexReader
if (minValue == maxValue) {
rewrittenQuery = new DocValuesFieldExistsQuery(fieldName);
} else {
if (indexFieldHasDuplicateData(reader, fieldName)) return null;
long origin = (sortField.getReverse()) ? maxValue : minValue;
long pivotDistance = (maxValue - minValue) >>> 1; // division by 2 on the unsigned representation to avoid overflow
if (pivotDistance == 0) { // 0 if maxValue = (minValue + 1)
@@ -469,5 +477,76 @@ private static boolean canEarlyTerminate(IndexReader reader, SortAndFormats sort
return true;
}

/**
* Returns true if more than 50% of the data in the index has the same value.
* The evaluation is an approximation based on finding the median value and estimating its count.
* Returns true if the total count of the median value is greater than or equal to half of the total document count.
*/
static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
long globalDocCount = 0;
long globalMedianCount = 0;
for (LeafReaderContext lrc : reader.leaves()) {
PointValues pointValues = lrc.reader().getPointValues(field);
if (pointValues == null) continue;
int docCount = pointValues.getDocCount();
Contributor:
Can you add a comment (or an assert) that the doc count is equals to the number of points. This is important since we'll need to change the logic here if we handle multiple values per docs (https://github.com/elastic/elasticsearch/pull/43121/files#diff-ec88da77f16eaf2fff65965789ea44beR398). Or maybe you can use PointValues#size here and add an assert that PointValues#size == PointValues#getDocCount to ensure that we don't forget to revise the logic if/when we handle multiple values per doc.
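A minimal sketch of what the suggested assertion could look like (hypothetical, not part of the PR), assuming it sits right after pointValues is obtained in indexFieldHasDuplicateData; the local variable size is introduced here only for illustration:

// Hypothetical sketch of the reviewer's suggestion: read PointValues#size and assert that it
// equals PointValues#getDocCount, so the single-value-per-doc assumption is explicit and the
// logic gets revisited if/when multi-valued fields are handled.
long size = pointValues.size();
assert size == pointValues.getDocCount()
    : "indexFieldHasDuplicateData assumes a single point per document; revise for multi-valued fields";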

if (docCount <= 512) { // skip small segments, as the median count estimation doesn't work well on them
continue;
}
globalDocCount += docCount;
long medianValue = estimateMedianValue(pointValues);
long medianCount = estimatePointCount(pointValues, medianValue, medianValue);
globalMedianCount += medianCount;
}
return (globalMedianCount >= globalDocCount/2);
}

static long estimateMedianValue(PointValues pointValues) throws IOException {
long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
while (minValue < maxValue) {
long avgValue = minValue + ((maxValue - minValue) >>> 1); // overflow-safe floor midpoint
long countLeft = estimatePointCount(pointValues, minValue, avgValue);
long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
if (countLeft >= countRight) {
maxValue = avgValue;
} else {
minValue = avgValue + 1;
}
}
return maxValue;
}

static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
final byte[] minValueAsBytes = new byte[Long.BYTES];
LongPoint.encodeDimension(minValue, minValueAsBytes, 0);
final byte[] maxValueAsBytes = new byte[Long.BYTES];
LongPoint.encodeDimension(maxValue, maxValueAsBytes, 0);

PointValues.IntersectVisitor visitor = new PointValues.IntersectVisitor() {
@Override
public void grow(int count) {}

@Override
public void visit(int docID) {}

@Override
public void visit(int docID, byte[] packedValue) {}

@Override
public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
if (Arrays.compareUnsigned(minPackedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0 ||
Arrays.compareUnsigned(maxPackedValue, 0, Long.BYTES, minValueAsBytes, 0, Long.BYTES) < 0) {
return PointValues.Relation.CELL_OUTSIDE_QUERY;
}
if (Arrays.compareUnsigned(minPackedValue, 0, Long.BYTES, minValueAsBytes, 0, Long.BYTES) < 0 ||
Arrays.compareUnsigned(maxPackedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0) {
return PointValues.Relation.CELL_CROSSES_QUERY;
}
return PointValues.Relation.CELL_INSIDE_QUERY;
}
};
return pointValues.estimatePointCount(visitor);
}

private static class TimeExceededException extends RuntimeException {}
}
@@ -31,6 +31,7 @@
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NoMergePolicy;
@@ -65,8 +66,13 @@
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.bkd.BKDReader;
import org.apache.lucene.util.bkd.BKDWriter;
import org.elasticsearch.action.search.SearchTask;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.mapper.DateFieldMapper;
@@ -88,10 +94,14 @@
import java.util.Collections;
import java.util.List;

import static org.elasticsearch.search.query.QueryPhase.estimateMedianValue;
import static org.elasticsearch.search.query.QueryPhase.estimatePointCount;
import static org.hamcrest.Matchers.anyOf;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.lessThan;
import static org.hamcrest.Matchers.lessThanOrEqualTo;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import static org.mockito.Mockito.spy;
@@ -652,9 +662,9 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
TestSearchContext searchContext = spy(new TestSearchContext(null, indexShard));
when(searchContext.mapperService()).thenReturn(mapperService);

final int numDocs = scaledRandomIntBetween(50, 100);
final int numDocs = 4000;
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
for (int i = 0; i < numDocs; ++i) {
Document doc = new Document();
long longValue = randomLongBetween(-10000000L, 10000000L);
@@ -708,6 +718,68 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
dir.close();
}

public void testIndexHasDuplicateData() throws IOException {
int valuesCount = 5000;
int maxPointsInLeafNode = 40;
long expectedMedianCount = (long)(valuesCount * 0.6);
long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);

try (Directory dir = newDirectory()) {
BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
byte[] longBytes = new byte[8];
for (int docId = 0; docId < valuesCount; docId++) {
long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
LongPoint.encodeDimension(value, longBytes, 0);
w.add(longBytes, docId);
}
long indexFP;
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
indexFP = w.finish(out);
}
try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
in.seek(indexFP);
BKDReader r = new BKDReader(in);
long medianValue = estimateMedianValue(r);
long medianCount = estimatePointCount(r, medianValue, medianValue);

assertEquals(expectedMedianValue, medianValue);
assertThat(medianCount, greaterThanOrEqualTo((long) (valuesCount/2))); // assert that the index has duplicate data
assertThat(medianCount, greaterThanOrEqualTo((long) (0.75 * expectedMedianCount)));
assertThat(medianCount, lessThanOrEqualTo((long) (1.25 * expectedMedianCount)));
}
}
}

public void testIndexHasNotDuplicateData() throws IOException {
int valuesCount = 5000;
int maxPointsInLeafNode = 40;
long expectedMedianCount = (long)(valuesCount * 0.35);
long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);

try (Directory dir = newDirectory()) {
BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
byte[] longBytes = new byte[8];
for (int docId = 0; docId < valuesCount; docId++) {
long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
LongPoint.encodeDimension(value, longBytes, 0);
w.add(longBytes, docId);
}
long indexFP;
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
indexFP = w.finish(out);
}
try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
in.seek(indexFP);
BKDReader r = new BKDReader(in);
long medianValue = estimateMedianValue(r);
long medianCount = estimatePointCount(r, medianValue, medianValue);

// can't make any assertion about the values of medianValue and medianCount
// as BKDReader::estimatePointCount can be really off for non-duplicate data
assertThat(medianCount, lessThan((long) (valuesCount/2))); // assert that the index does NOT have duplicate data
}
}
}

public void testMaxScoreQueryVisitor() {
BitSetProducer producer = context -> new FixedBitSet(1);
@@ -760,42 +832,6 @@ public void testMaxScoreQueryVisitor() {
}
}

public void testNumericLongSortOptimizationDocsHaveTheSameValue() throws Exception {
final String fieldNameLong = "long-field";
MappedFieldType fieldTypeLong = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.LONG);
MapperService mapperService = mock(MapperService.class);
when(mapperService.fullName(fieldNameLong)).thenReturn(fieldTypeLong);
TestSearchContext searchContext = spy(new TestSearchContext(null, indexShard));
when(searchContext.mapperService()).thenReturn(mapperService);

final int numDocs = scaledRandomIntBetween(5, 10);
long longValue = randomLongBetween(-10000000L, 10000000L); // all docs have the same value
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
for (int i = 0; i < numDocs; ++i) {
Document doc = new Document();
doc.add(new LongPoint(fieldNameLong, longValue));
doc.add(new NumericDocValuesField(fieldNameLong, longValue));
writer.addDocument(doc);
}
writer.close();
final IndexReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = getAssertingSortOptimizedSearcher(reader, 1);

final SortField sortFieldLong = new SortField(fieldNameLong, SortField.Type.LONG);
sortFieldLong.setMissingValue(Long.MAX_VALUE);
final Sort longSort = new Sort(sortFieldLong);
SortAndFormats sortAndFormats = new SortAndFormats(longSort, new DocValueFormat[]{DocValueFormat.RAW});
searchContext.sort(sortAndFormats);
searchContext.parsedQuery(new ParsedQuery(new MatchAllDocsQuery()));
searchContext.setTask(new SearchTask(123L, "", "", "", null, Collections.emptyMap()));
searchContext.setSize(10);
QueryPhase.execute(searchContext, searcher, checkCancelled -> {});
assertSortResults(searchContext.queryResult().topDocs().topDocs, (long) numDocs, false);
reader.close();
dir.close();
}

// used to check that numeric long or date sort optimization was run
private static IndexSearcher getAssertingSortOptimizedSearcher(IndexReader reader, int queryType) {
return new IndexSearcher(reader) {