-
Notifications
You must be signed in to change notification settings - Fork 24.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Skip optimization if the index has duplicate data #43121
Changes from 3 commits
ab174c5
e13761b
9c71827
05df4e9
80c11b4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -236,6 +236,9 @@ static boolean execute(SearchContext searchContext, | |
System.arraycopy(oldFormats, 0, newFormats, 1, oldFormats.length); | ||
sortAndFormatsForRewrittenNumericSort = searchContext.sort(); // stash SortAndFormats to restore it later | ||
searchContext.sort(new SortAndFormats(new Sort(newSortFields), newFormats)); | ||
if (LOGGER.isTraceEnabled()) { | ||
LOGGER.trace("Sort optimization on the field [" + oldSortFields[0].getField() + "] was enabled!"); | ||
} | ||
} | ||
} | ||
|
||
|
@@ -387,8 +390,12 @@ private static Query tryRewriteLongSort(SearchContext searchContext, IndexReader | |
((sortField.getReverse() == false) && (missingValue == Long.MAX_VALUE)); | ||
if (missingValuesAccordingToSort == false) return null; | ||
|
||
int docCount = PointValues.getDocCount(reader, fieldName); | ||
// is not worth to run optimization on small index | ||
if (docCount <= 512) return null; | ||
jimczi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
// check for multiple values | ||
if (PointValues.size(reader, fieldName) != PointValues.getDocCount(reader, fieldName)) return null; //TODO: handle multiple values | ||
if (PointValues.size(reader, fieldName) != docCount) return null; //TODO: handle multiple values | ||
|
||
// check if the optimization makes sense with the track_total_hits setting | ||
if (searchContext.trackTotalHitsUpTo() == Integer.MAX_VALUE) { | ||
|
@@ -408,6 +415,7 @@ private static Query tryRewriteLongSort(SearchContext searchContext, IndexReader | |
if (minValue == maxValue) { | ||
rewrittenQuery = new DocValuesFieldExistsQuery(fieldName); | ||
} else { | ||
if (indexFieldHasDuplicateData(reader, fieldName)) return null; | ||
long origin = (sortField.getReverse()) ? maxValue : minValue; | ||
long pivotDistance = (maxValue - minValue) >>> 1; // division by 2 on the unsigned representation to avoid overflow | ||
if (pivotDistance == 0) { // 0 if maxValue = (minValue + 1) | ||
|
@@ -469,5 +477,76 @@ private static boolean canEarlyTerminate(IndexReader reader, SortAndFormats sort | |
return true; | ||
} | ||
|
||
/** | ||
* Returns true if more than 50% of data in the index have the same value | ||
* The evaluation is approximation based on finding the median value and estimating its count | ||
* Returns true if the total count of median values is greater or equal to half of the total count of documents | ||
*/ | ||
static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException { | ||
long globalDocCount = 0; | ||
long globalMedianCount = 0; | ||
for (LeafReaderContext lrc : reader.leaves()) { | ||
PointValues pointValues = lrc.reader().getPointValues(field); | ||
if (pointValues == null) continue; | ||
int docCount = pointValues.getDocCount(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add a comment (or an assert) that the doc count is equals to the number of points. This is important since we'll need to change the logic here if we handle multiple values per docs (https://github.com/elastic/elasticsearch/pull/43121/files#diff-ec88da77f16eaf2fff65965789ea44beR398). Or maybe you can use |
||
if (docCount <= 512) { // skipping small segments as estimateMedianCount doesn't work well on them | ||
continue; | ||
} | ||
globalDocCount += docCount; | ||
long medianValue = estimateMedianValue(pointValues); | ||
long medianCount = estimatePointCount(pointValues, medianValue, medianValue); | ||
globalMedianCount += medianCount; | ||
} | ||
return (globalMedianCount >= globalDocCount/2); | ||
} | ||
|
||
static long estimateMedianValue(PointValues pointValues) throws IOException { | ||
long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0); | ||
long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0); | ||
while (minValue < maxValue) { | ||
long avgValue = Math.floorDiv(minValue + maxValue, 2); | ||
long countLeft = estimatePointCount(pointValues, minValue, avgValue); | ||
long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue); | ||
if (countLeft >= countRight) { | ||
maxValue = avgValue; | ||
} else { | ||
minValue = avgValue + 1; | ||
} | ||
} | ||
return maxValue; | ||
} | ||
|
||
static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) { | ||
final byte[] minValueAsBytes = new byte[Long.BYTES]; | ||
LongPoint.encodeDimension(minValue, minValueAsBytes, 0); | ||
final byte[] maxValueAsBytes = new byte[Long.BYTES]; | ||
LongPoint.encodeDimension(maxValue, maxValueAsBytes, 0); | ||
|
||
PointValues.IntersectVisitor visitor = new PointValues.IntersectVisitor() { | ||
@Override | ||
public void grow(int count) {} | ||
|
||
@Override | ||
public void visit(int docID) {} | ||
|
||
@Override | ||
public void visit(int docID, byte[] packedValue) {} | ||
|
||
@Override | ||
public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { | ||
if (Arrays.compareUnsigned(minPackedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0 || | ||
Arrays.compareUnsigned(maxPackedValue, 0, Long.BYTES, minValueAsBytes, 0, Long.BYTES) < 0) { | ||
return PointValues.Relation.CELL_OUTSIDE_QUERY; | ||
} | ||
if (Arrays.compareUnsigned(minPackedValue, 0, Long.BYTES, minValueAsBytes, 0, Long.BYTES) < 0 || | ||
Arrays.compareUnsigned(maxPackedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0) { | ||
return PointValues.Relation.CELL_CROSSES_QUERY; | ||
} | ||
return PointValues.Relation.CELL_INSIDE_QUERY; | ||
} | ||
}; | ||
return pointValues.estimatePointCount(visitor); | ||
} | ||
|
||
// Unchecked exception type with no state or behaviour of its own; where it is thrown and caught is outside this excerpt.
private static class TimeExceededException extends RuntimeException {} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure that this helps the debugging ;)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jimczi What would be the way to see if the optimization was used? Is LOGGER.trace not a good way?