Skip optimization if the index has duplicate data (#43121)
Skip the sort optimization if 50% or more of the index's data
has the same value.
When an index has a lot of docs with the same value, the sort
optimization doesn't make sense, as DistanceFeatureQuery
will produce the same score for these docs, and Lucene
will fall back to the secondary sort to break ties. This can be slower
than a regular sort.
mayya-sharipova committed Jul 3, 2019
1 parent 1a9deae commit 3c29734
Showing 2 changed files with 152 additions and 39 deletions.
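
For illustration only (not part of this commit), below is a minimal, self-contained Java sketch of the same heuristic applied to a plain sorted long[] instead of BKD point values: binary-search the value range for an approximate median, estimate how many documents hold it, and treat the field as duplicate-heavy when that count reaches half of the documents. All class and method names in the sketch are made up for the example.

class DuplicateDataHeuristicSketch {

    // Stands in for PointValues#estimatePointCount: counts values in [min, max].
    static long countInRange(long[] sortedValues, long min, long max) {
        return lowerBound(sortedValues, max + 1) - lowerBound(sortedValues, min); // assumes max < Long.MAX_VALUE
    }

    // Index of the first element >= target.
    static int lowerBound(long[] sortedValues, long target) {
        int lo = 0, hi = sortedValues.length;
        while (lo < hi) {
            int mid = (lo + hi) >>> 1;
            if (sortedValues[mid] < target) lo = mid + 1; else hi = mid;
        }
        return lo;
    }

    // Same shape as estimateMedianValue: keep the half of the value range that holds more documents.
    static long approximateMedian(long[] sortedValues) {
        long minValue = sortedValues[0];
        long maxValue = sortedValues[sortedValues.length - 1];
        while (minValue < maxValue) {
            long avgValue = Math.floorDiv(minValue + maxValue, 2);
            long countLeft = countInRange(sortedValues, minValue, avgValue);
            long countRight = countInRange(sortedValues, avgValue + 1, maxValue);
            if (countLeft >= countRight) {
                maxValue = avgValue;
            } else {
                minValue = avgValue + 1;
            }
        }
        return maxValue;
    }

    // Mirrors indexFieldHasDuplicateData: true when the median value covers >= 50% of documents.
    static boolean hasDuplicateData(long[] sortedValues) {
        long median = approximateMedian(sortedValues);
        return countInRange(sortedValues, median, median) >= sortedValues.length / 2;
    }

    public static void main(String[] args) {
        long[] duplicateHeavy = {3, 7, 7, 7, 7, 7, 9, 12, 15, 40};   // value 7 covers 5 of 10 docs
        long[] spreadOut      = {1, 2, 3, 5, 8, 13, 21, 34, 55, 89}; // all values distinct
        System.out.println(hasDuplicateData(duplicateHeavy)); // true  -> the sort rewrite is skipped
        System.out.println(hasDuplicateData(spreadOut));      // false -> the sort rewrite can proceed
    }
}

In the actual change the counts come from BKD estimatePointCount, so they are only estimates; that is also why segments of 512 docs or fewer are skipped below.
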
@@ -387,8 +387,12 @@ private static Query tryRewriteLongSort(SearchContext searchContext, IndexReader
((sortField.getReverse() == false) && (missingValue == Long.MAX_VALUE));
if (missingValuesAccordingToSort == false) return null;

int docCount = PointValues.getDocCount(reader, fieldName);
// it is not worth running the optimization on a small index
if (docCount <= 512) return null;

// check for multiple values
if (PointValues.size(reader, fieldName) != PointValues.getDocCount(reader, fieldName)) return null; //TODO: handle multiple values
if (PointValues.size(reader, fieldName) != docCount) return null; //TODO: handle multiple values

// check if the optimization makes sense with the track_total_hits setting
if (searchContext.trackTotalHitsUpTo() == Integer.MAX_VALUE) {
@@ -408,6 +412,7 @@ private static Query tryRewriteLongSort(SearchContext searchContext, IndexReader
if (minValue == maxValue) {
rewrittenQuery = new DocValuesFieldExistsQuery(fieldName);
} else {
if (indexFieldHasDuplicateData(reader, fieldName)) return null;
long origin = (sortField.getReverse()) ? maxValue : minValue;
long pivotDistance = (maxValue - minValue) >>> 1; // division by 2 on the unsigned representation to avoid overflow
if (pivotDistance == 0) { // 0 if maxValue = (minValue + 1)
@@ -469,5 +474,77 @@ private static boolean canEarlyTerminate(IndexReader reader, SortAndFormats sort
return true;
}

/**
* Returns true if 50% or more of the documents in the index have the same value.
* The evaluation is an approximation based on finding the median value and estimating its count:
* returns true if the estimated count of documents with the median value is greater than or equal to half of the total document count.
*/
static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
long globalDocCount = 0;
long globalMedianCount = 0;
for (LeafReaderContext lrc : reader.leaves()) {
PointValues pointValues = lrc.reader().getPointValues(field);
if (pointValues == null) continue;
int docCount = pointValues.getDocCount();
if (docCount <= 512) { // skip small segments, as the median estimation doesn't work well on them
continue;
}
assert(pointValues.size() == docCount); // TODO: modify the code to handle multiple values
globalDocCount += docCount;
long medianValue = estimateMedianValue(pointValues);
long medianCount = estimatePointCount(pointValues, medianValue, medianValue);
globalMedianCount += medianCount;
}
return (globalMedianCount >= globalDocCount/2);
}

static long estimateMedianValue(PointValues pointValues) throws IOException {
long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
while (minValue < maxValue) {
long avgValue = Math.floorDiv(minValue + maxValue, 2);
long countLeft = estimatePointCount(pointValues, minValue, avgValue);
long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
if (countLeft >= countRight) {
maxValue = avgValue;
} else {
minValue = avgValue + 1;
}
}
return maxValue;
}

static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
final byte[] minValueAsBytes = new byte[Long.BYTES];
LongPoint.encodeDimension(minValue, minValueAsBytes, 0);
final byte[] maxValueAsBytes = new byte[Long.BYTES];
LongPoint.encodeDimension(maxValue, maxValueAsBytes, 0);

PointValues.IntersectVisitor visitor = new PointValues.IntersectVisitor() {
@Override
public void grow(int count) {}

@Override
public void visit(int docID) {}

@Override
public void visit(int docID, byte[] packedValue) {}

@Override
public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
if (Arrays.compareUnsigned(minPackedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0 ||
Arrays.compareUnsigned(maxPackedValue, 0, Long.BYTES, minValueAsBytes, 0, Long.BYTES) < 0) {
return PointValues.Relation.CELL_OUTSIDE_QUERY;
}
if (Arrays.compareUnsigned(minPackedValue, 0, Long.BYTES, minValueAsBytes, 0, Long.BYTES) < 0 ||
Arrays.compareUnsigned(maxPackedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0) {
return PointValues.Relation.CELL_CROSSES_QUERY;
}
return PointValues.Relation.CELL_INSIDE_QUERY;
}
};
return pointValues.estimatePointCount(visitor);
}

private static class TimeExceededException extends RuntimeException {}
}
@@ -31,6 +31,7 @@
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NoMergePolicy;
@@ -65,8 +66,13 @@
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.bkd.BKDReader;
import org.apache.lucene.util.bkd.BKDWriter;
import org.elasticsearch.action.search.SearchTask;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.mapper.DateFieldMapper;
@@ -88,10 +94,14 @@
import java.util.Collections;
import java.util.List;

import static org.elasticsearch.search.query.QueryPhase.estimateMedianValue;
import static org.elasticsearch.search.query.QueryPhase.estimatePointCount;
import static org.hamcrest.Matchers.anyOf;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.lessThan;
import static org.hamcrest.Matchers.lessThanOrEqualTo;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import static org.mockito.Mockito.spy;
@@ -652,9 +662,9 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
TestSearchContext searchContext = spy(new TestSearchContext(null, indexShard));
when(searchContext.mapperService()).thenReturn(mapperService);

final int numDocs = scaledRandomIntBetween(50, 100);
final int numDocs = 4000;
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
for (int i = 0; i < numDocs; ++i) {
Document doc = new Document();
long longValue = randomLongBetween(-10000000L, 10000000L);
@@ -708,6 +718,68 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
dir.close();
}

public void testIndexHasDuplicateData() throws IOException {
int valuesCount = 5000;
int maxPointsInLeafNode = 40;
long expectedMedianCount = (long)(valuesCount * 0.6);
long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);

try (Directory dir = newDirectory()) {
BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
byte[] longBytes = new byte[8];
for (int docId = 0; docId < valuesCount; docId++) {
long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
LongPoint.encodeDimension(value, longBytes, 0);
w.add(longBytes, docId);
}
long indexFP;
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
indexFP = w.finish(out);
}
try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
in.seek(indexFP);
BKDReader r = new BKDReader(in);
long medianValue = estimateMedianValue(r);
long medianCount = estimatePointCount(r, medianValue, medianValue);

assertEquals(expectedMedianValue, medianValue);
assertThat(medianCount, greaterThanOrEqualTo((long) (valuesCount/2))); //assert that Index has duplicate data
assertThat(medianCount, greaterThanOrEqualTo((long) (0.75 * expectedMedianCount)));
assertThat(medianCount, lessThanOrEqualTo((long) (1.25 * expectedMedianCount)));
}
}
}

public void testIndexHasNotDuplicateData() throws IOException {
int valuesCount = 5000;
int maxPointsInLeafNode = 40;
long expectedMedianCount = (long)(valuesCount * 0.35);
long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);

try (Directory dir = newDirectory()) {
BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
byte[] longBytes = new byte[8];
for (int docId = 0; docId < valuesCount; docId++) {
long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
LongPoint.encodeDimension(value, longBytes, 0);
w.add(longBytes, docId);
}
long indexFP;
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
indexFP = w.finish(out);
}
try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
in.seek(indexFP);
BKDReader r = new BKDReader(in);
long medianValue = estimateMedianValue(r);
long medianCount = estimatePointCount(r, medianValue, medianValue);

// can't make any assertion about the values of medianValue and medianCount
// as BKDReader::estimatePointCount can be really off for non-duplicate data
assertThat(medianCount, lessThan((long) (valuesCount/2))); //assert that Index does NOT have duplicate data
}
}
}

public void testMaxScoreQueryVisitor() {
BitSetProducer producer = context -> new FixedBitSet(1);
@@ -760,42 +832,6 @@ public void testMaxScoreQueryVisitor() {
}
}

public void testNumericLongSortOptimizationDocsHaveTheSameValue() throws Exception {
final String fieldNameLong = "long-field";
MappedFieldType fieldTypeLong = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.LONG);
MapperService mapperService = mock(MapperService.class);
when(mapperService.fullName(fieldNameLong)).thenReturn(fieldTypeLong);
TestSearchContext searchContext = spy(new TestSearchContext(null, indexShard));
when(searchContext.mapperService()).thenReturn(mapperService);

final int numDocs = scaledRandomIntBetween(5, 10);
long longValue = randomLongBetween(-10000000L, 10000000L); // all docs have the same value
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
for (int i = 0; i < numDocs; ++i) {
Document doc = new Document();
doc.add(new LongPoint(fieldNameLong, longValue));
doc.add(new NumericDocValuesField(fieldNameLong, longValue));
writer.addDocument(doc);
}
writer.close();
final IndexReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = getAssertingSortOptimizedSearcher(reader, 1);

final SortField sortFieldLong = new SortField(fieldNameLong, SortField.Type.LONG);
sortFieldLong.setMissingValue(Long.MAX_VALUE);
final Sort longSort = new Sort(sortFieldLong);
SortAndFormats sortAndFormats = new SortAndFormats(longSort, new DocValueFormat[]{DocValueFormat.RAW});
searchContext.sort(sortAndFormats);
searchContext.parsedQuery(new ParsedQuery(new MatchAllDocsQuery()));
searchContext.setTask(new SearchTask(123L, "", "", "", null, Collections.emptyMap()));
searchContext.setSize(10);
QueryPhase.execute(searchContext, searcher, checkCancelled -> {});
assertSortResults(searchContext.queryResult().topDocs().topDocs, (long) numDocs, false);
reader.close();
dir.close();
}

// used to check that numeric long or date sort optimization was run
private static IndexSearcher getAssertingSortOptimizedSearcher(IndexReader reader, int queryType) {
return new IndexSearcher(reader) {