Add test for finding duplicate data in BKDtree
The test builds the BKD tree through BKDWriter directly, which allows controlling the number of points in a leaf node
mayya-sharipova committed Jun 25, 2019
1 parent e13761b commit 6f39a40
Showing 2 changed files with 80 additions and 56 deletions.
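The commit message above refers to controlling the number of points in a leaf node: the new tests build the BKD tree directly through Lucene's BKDWriter instead of going through an IndexWriter, so the leaf size can be chosen explicitly. A minimal sketch of that setup follows; the per-argument roles are inferred from the constructor call in the test below and are my assumption, not something stated in this commit.

import java.io.IOException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.bkd.BKDWriter;

class BkdSetupSketch {
    // Builds a writer for a single-dimension field of 8-byte (long) points.
    static BKDWriter newWriter(Directory dir, int valuesCount, int maxPointsInLeafNode) throws IOException {
        return new BKDWriter(
            valuesCount,          // maxDoc
            dir,                  // directory used for temp files
            "tmp",                // temp file name prefix
            1,                    // data dimensions (assumed)
            1,                    // index dimensions (assumed)
            8,                    // bytes per dimension, i.e. Long.BYTES
            maxPointsInLeafNode,  // points per leaf node, the knob this commit exercises
            1,                    // max MB of heap for sorting (assumed)
            valuesCount);         // total point count
    }
}

The tests below use valuesCount = 5000 and maxPointsInLeafNode = 40.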
server/src/main/java/org/elasticsearch/search/query/QueryPhase.java (35 changes: 11 additions & 24 deletions)
@@ -487,40 +487,36 @@ static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
long globalMedianCount = 0;
for (LeafReaderContext lrc : reader.leaves()) {
PointValues pointValues = lrc.reader().getPointValues(field);
if (pointValues == null) continue;
int docCount = pointValues.getDocCount();
if (docCount <= 512) { // skipping small segments as estimateMedianCount doesn't work well on them
continue;
}
globalDocCount += docCount;
- byte[] minValueAsBytes = pointValues.getMinPackedValue();
- byte[] maxValueAsBytes = pointValues.getMaxPackedValue();
- long minValue = LongPoint.decodeDimension(minValueAsBytes, 0);
- long maxValue = LongPoint.decodeDimension(maxValueAsBytes, 0);
- long medianCount = estimateMedianCount(pointValues, minValue, maxValue, docCount/2);
+ long medianValue = estimateMedianValue(pointValues);
+ long medianCount = estimatePointCount(pointValues, medianValue, medianValue);
globalMedianCount += medianCount;
}
return (globalMedianCount >= globalDocCount/2);
}

- private static long estimateMedianCount(PointValues pointValues, long minValue, long maxValue, long threshold) {
+ static long estimateMedianValue(PointValues pointValues) throws IOException {
+ long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
+ long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
while (minValue < maxValue) {
long avgValue = Math.floorDiv(minValue + maxValue, 2);
long countLeft = estimatePointCount(pointValues, minValue, avgValue);
- if (countLeft >= threshold) {
+ long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
+ if (countLeft >= countRight) {
maxValue = avgValue;
- threshold = countLeft/2;
} else {
- long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
minValue = avgValue + 1;
- threshold = countRight/2;
}
}
- // maxValue is the approximate median value, estimate its count
- long medianCount = estimatePointCount(pointValues, maxValue, maxValue);
- return medianCount;
+ return maxValue;
}

- private static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
+ static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
final byte[] minValueAsBytes = new byte[Long.BYTES];
LongPoint.encodeDimension(minValue, minValueAsBytes, 0);
final byte[] maxValueAsBytes = new byte[Long.BYTES];
@@ -534,16 +530,7 @@ public void grow(int count) {}
public void visit(int docID) {}

@Override
- public void visit(int docID, byte[] packedValue) {
- if (Arrays.compareUnsigned(packedValue, 0, Long.BYTES, minValueAsBytes, 0, Long.BYTES) < 0) {
- // Doc's value is too low, in this dimension
- return;
- }
- if (Arrays.compareUnsigned(packedValue, 0, Long.BYTES, maxValueAsBytes, 0, Long.BYTES) > 0) {
- // Doc's value is too high, in this dimension
- return;
- }
- }
+ public void visit(int docID, byte[] packedValue) {}

@Override
public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
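For readers following the refactoring above: estimateMedianValue repeatedly bisects the [minValue, maxValue] range and keeps whichever half estimatePointCount says holds more points, so it converges on an approximate median of the values (which, for heavily duplicated data, is the duplicated value itself); indexFieldHasDuplicateData then treats the field as having duplicate data when the count at that value reaches half of the documents. Below is a standalone illustration of the same loop over a plain long[], with an exact range count standing in for the BKD estimate; it is a sketch, not Elasticsearch code.

import java.util.Arrays;

class MedianBisectionSketch {
    // Exact stand-in for PointValues#estimatePointCount on a toy data set.
    static long countInRange(long[] values, long min, long max) {
        return Arrays.stream(values).filter(v -> v >= min && v <= max).count();
    }

    // Mirrors the loop in QueryPhase.estimateMedianValue: repeatedly halve the value
    // range, keeping the half that holds more points.
    static long approximateMedian(long[] values) {
        long minValue = Arrays.stream(values).min().getAsLong();
        long maxValue = Arrays.stream(values).max().getAsLong();
        while (minValue < maxValue) {
            long avgValue = Math.floorDiv(minValue + maxValue, 2);
            long countLeft = countInRange(values, minValue, avgValue);
            long countRight = countInRange(values, avgValue + 1, maxValue);
            if (countLeft >= countRight) {
                maxValue = avgValue;
            } else {
                minValue = avgValue + 1;
            }
        }
        return maxValue;
    }

    public static void main(String[] args) {
        // 6 of 10 points share the value 42, so the field counts as "duplicate data".
        long[] values = {42, 42, 42, 42, 42, 42, 1, 7, 100, 250};
        long median = approximateMedian(values);
        long medianCount = countInRange(values, median, median);
        System.out.println("median=" + median + " count=" + medianCount
            + " hasDuplicateData=" + (medianCount >= values.length / 2));
    }
}

With six of the ten values equal to 42, the bisection homes in on 42 and reports a count of 6, which is at least half of the points, so the field would be classified as duplicate-heavy.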
server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java (101 changes: 69 additions & 32 deletions)
@@ -66,8 +66,13 @@
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
+ import org.apache.lucene.store.IOContext;
+ import org.apache.lucene.store.IndexInput;
+ import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
+ import org.apache.lucene.util.bkd.BKDReader;
+ import org.apache.lucene.util.bkd.BKDWriter;
import org.elasticsearch.action.search.SearchTask;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.mapper.DateFieldMapper;
@@ -89,11 +94,14 @@
import java.util.Collections;
import java.util.List;

- import static org.elasticsearch.search.query.QueryPhase.indexFieldHasDuplicateData;
+ import static org.elasticsearch.search.query.QueryPhase.estimateMedianValue;
+ import static org.elasticsearch.search.query.QueryPhase.estimatePointCount;
import static org.hamcrest.Matchers.anyOf;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
import static org.hamcrest.Matchers.instanceOf;
+ import static org.hamcrest.Matchers.lessThan;
+ import static org.hamcrest.Matchers.lessThanOrEqualTo;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import static org.mockito.Mockito.spy;
@@ -654,7 +662,7 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
TestSearchContext searchContext = spy(new TestSearchContext(null, indexShard));
when(searchContext.mapperService()).thenReturn(mapperService);

- final int numDocs = 1000;
+ final int numDocs = 4000;
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
for (int i = 0; i < numDocs; ++i) {
@@ -710,39 +718,68 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
dir.close();
}

- public void testIndexFieldHasDuplicateData() throws IOException {
- final int numDocs = 10000;
- final int threshold1 = numDocs * 60 / 100;
- final int threshold2 = numDocs * 40 / 100;
- final int threshold3 = numDocs * 5 / 100;
-
- final String fieldName = "duplicateField";
- final String fieldName2 = "notMuchDuplicateField";
- final String fieldName3 = "notDuplicateField";
-
- long duplicateValue = randomLongBetween(-10000000L, 10000000L);
- long value, value2, value3;
- Directory dir = newDirectory();
- IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
- for (int i = 0; i < numDocs; ++i) {
- value = i < threshold1 ? duplicateValue : i;
- value2 = i < threshold2 ? duplicateValue : i;
- value3 = i < threshold3 ? duplicateValue : i;
- Document doc = new Document();
- doc.add(new LongPoint(fieldName, value));
- doc.add(new LongPoint(fieldName2, value2));
- doc.add(new LongPoint(fieldName3, value3));
- writer.addDocument(doc);
+ public void testIndexHasDuplicateData() throws IOException {
+ int valuesCount = 5000;
+ int maxPointsInLeafNode = 40;
+ long expectedMedianCount = (long)(valuesCount * 0.6);
+ long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);
+
+ try (Directory dir = newDirectory()) {
+ BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
+ byte[] longBytes = new byte[8];
+ for (int docId = 0; docId < valuesCount; docId++) {
+ long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
+ LongPoint.encodeDimension(value, longBytes, 0);
+ w.add(longBytes, docId);
+ }
+ long indexFP;
+ try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+ indexFP = w.finish(out);
+ }
+ try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+ in.seek(indexFP);
+ BKDReader r = new BKDReader(in);
+ long medianValue = estimateMedianValue(r);
+ long medianCount = estimatePointCount(r, medianValue, medianValue);
+
+ assertEquals(expectedMedianValue, medianValue);
+ assertThat(medianCount, greaterThanOrEqualTo((long) (valuesCount/2))); //assert that Index has duplicate data
+ assertThat(medianCount, greaterThanOrEqualTo((long) (0.75 * expectedMedianCount)));
+ assertThat(medianCount, lessThanOrEqualTo((long) (1.25 * expectedMedianCount)));
+ }
}
- writer.close();
- final IndexReader reader = DirectoryReader.open(dir);
- assertTrue(indexFieldHasDuplicateData(reader, fieldName));
- assertFalse(indexFieldHasDuplicateData(reader, fieldName2));
- assertFalse(indexFieldHasDuplicateData(reader, fieldName3));
- reader.close();
- dir.close();
}

+ public void testIndexHasNotDuplicateData() throws IOException {
+ int valuesCount = 5000;
+ int maxPointsInLeafNode = 40;
+ long expectedMedianCount = (long)(valuesCount * 0.35);
+ long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);
+
+ try (Directory dir = newDirectory()) {
+ BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
+ byte[] longBytes = new byte[8];
+ for (int docId = 0; docId < valuesCount; docId++) {
+ long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
+ LongPoint.encodeDimension(value, longBytes, 0);
+ w.add(longBytes, docId);
+ }
+ long indexFP;
+ try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
+ indexFP = w.finish(out);
+ }
+ try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
+ in.seek(indexFP);
+ BKDReader r = new BKDReader(in);
+ long medianValue = estimateMedianValue(r);
+ long medianCount = estimatePointCount(r, medianValue, medianValue);
+
+ // can't make any assertion about the values of medianValue and medianCount
+ // as BKDReader::estimatePointCount can be really off for non-duplicate data
+ assertThat(medianCount, lessThan((long) (valuesCount/2))); //assert that Index does NOT have duplicate data
+ }
+ }
+ }

public void testMaxScoreQueryVisitor() {
BitSetProducer producer = context -> new FixedBitSet(1);
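A note on the assertions in the two new tests: testIndexHasDuplicateData writes 60% of the 5000 points with a single value and then checks that the estimated count at the median both clears the half-of-all-points threshold used by indexFieldHasDuplicateData and lands within 25% of the true duplicate count, while testIndexHasNotDuplicateData only checks that the estimate stays below half, since, as its comment notes, BKDReader's point-count estimate can be far off for non-duplicate data. My reading (an assumption, not stated in the commit) is that the small maxPointsInLeafNode of 40 is what keeps the estimate inside that window, because the estimate works at leaf granularity. The asserted bounds work out as follows:

// Plain arithmetic behind the asserted bounds; no Lucene involved.
public class DuplicateDataTestBounds {
    public static void main(String[] args) {
        int valuesCount = 5000;
        long expectedMedianCount = (long) (valuesCount * 0.6);                                   // 3000 duplicated points
        System.out.println("half of all points      = " + (valuesCount / 2));                    // 2500
        System.out.println("lower tolerance (0.75x) = " + (long) (0.75 * expectedMedianCount));  // 2250
        System.out.println("upper tolerance (1.25x) = " + (long) (1.25 * expectedMedianCount));  // 3750
    }
}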
