Skip to content

Commit

Permalink
Fielddata: Switch to Lucene DV APIs.
Browse files Browse the repository at this point in the history
This commits removes BytesValues/LongValues/DoubleValues/... and tries to use
Lucene's APIs such as NumericDocValues or RandomAccessOrds instead whenever
possible.

The next step would be to take advantage of the fact that APIs are the same in
Lucene and Elasticsearch in order to remove our custom comparators and use
Lucene's.

There are a few side-effects to this change:
 - GeoDistanceComparator has been removed, DoubleValuesComparator is used instead
   on top of dynamically computed values (was easier than migrating
   GeoDistanceComparator).
 - SortedNumericDocValues doesn't guarantee uniqueness so long/double terms
   aggregators have been updated to make sure a document cannot fall twice in
   the same bucket.
 - Sorting by maximum value of a field or running a `max` aggregation is
   potentially significantly faster thanks to the random-access API.

Our aggs and p/c aggregations benchmarks don't report differences with this
change on uninverted field data. However the fact that doc values don't need
to be wrapped anymore seems to help a lot. For example
TermsAggregationSearchBenchmark reports ~30% faster terms aggregations on doc
values on string fields with this change, which are now only ~18% slower than
uninverted field data although stored on disk.
  • Loading branch information
jpountz committed Jul 18, 2014
1 parent f22f3db commit 8f45f46
Show file tree
Hide file tree
Showing 204 changed files with 4,991 additions and 6,174 deletions.
40 changes: 40 additions & 0 deletions src/main/java/org/elasticsearch/common/geo/GeoDistance.java
Expand Up @@ -19,9 +19,11 @@

package org.elasticsearch.common.geo;

import org.apache.lucene.util.Bits;
import org.apache.lucene.util.SloppyMath;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.common.unit.DistanceUnit;
import org.elasticsearch.index.fielddata.*;

import java.util.Locale;

Expand Down Expand Up @@ -368,4 +370,42 @@ public double calculate(double targetLatitude, double targetLongitude) {
return SLOPPY_ARC.calculate(sourceLatitude, sourceLongitude, targetLatitude, targetLongitude, unit);
}
}

/**
* Return a {@link SortedNumericDoubleValues} instance that returns the distance to a given geo-point for each document.
*/
public static SortedNumericDoubleValues distanceValues(final FixedSourceDistance distance, final MultiGeoPointValues geoPointValues) {
final GeoPointValues singleValues = FieldData.unwrapSingleton(geoPointValues);
if (singleValues != null) {
final Bits docsWithField = FieldData.unwrapSingletonBits(geoPointValues);
return FieldData.singleton(new NumericDoubleValues() {

@Override
public double get(int docID) {
if (docsWithField != null && !docsWithField.get(docID)) {
return 0d;
}
final GeoPoint point = singleValues.get(docID);
return distance.calculate(point.lat(), point.lon());
}

}, docsWithField);
} else {
return new SortingNumericDoubleValues() {

@Override
public void setDocument(int doc) {
geoPointValues.setDocument(doc);
count = geoPointValues.count();
grow();
for (int i = 0; i < count; ++i) {
final GeoPoint point = geoPointValues.valueAt(i);
values[i] = distance.calculate(point.lat(), point.lon());
}
sort();
}

};
}
}
}
Expand Up @@ -22,8 +22,8 @@
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.search.Explanation;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.index.fielddata.DoubleValues;
import org.elasticsearch.index.fielddata.IndexNumericFieldData;
import org.elasticsearch.index.fielddata.SortedNumericDoubleValues;

import java.util.Locale;

Expand All @@ -37,7 +37,7 @@ public class FieldValueFactorFunction extends ScoreFunction {
private final float boostFactor;
private final Modifier modifier;
private final IndexNumericFieldData indexFieldData;
private DoubleValues values;
private SortedNumericDoubleValues values;

public FieldValueFactorFunction(String field, float boostFactor, Modifier modifierType, IndexNumericFieldData indexFieldData) {
super(CombineFunction.MULT);
Expand All @@ -54,9 +54,10 @@ public void setNextReader(AtomicReaderContext context) {

@Override
public double score(int docId, float subQueryScore) {
final int numValues = this.values.setDocument(docId);
this.values.setDocument(docId);
final int numValues = this.values.count();
if (numValues > 0) {
double val = this.values.nextValue() * boostFactor;
double val = this.values.valueAt(0) * boostFactor;
double result = modifier.apply(val);
if (Double.isNaN(result) || Double.isInfinite(result)) {
throw new ElasticsearchException("Result of field modification [" + modifier.toString() +
Expand Down

This file was deleted.

Expand Up @@ -16,35 +16,30 @@
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.fielddata;

import org.apache.lucene.index.RandomAccessOrds;

/**
* <code>FilterDoubleValues</code> contains another {@link DoubleValues}, which it
* uses as its basic source of data, possibly transforming the data along the
* way or providing additional functionality.
* Base implementation of a {@link RandomAccessOrds} instance.
*/
public abstract class FilterDoubleValues extends DoubleValues {

protected final DoubleValues delegate;
// TODO: should it be merged into Lucene's RandomAccessOrds?
public abstract class AbstractRandomAccessOrds extends RandomAccessOrds {

protected FilterDoubleValues(DoubleValues delegate) {
super(delegate.isMultiValued());
this.delegate = delegate;
}
int i = 0;

@Override
public int setDocument(int docId) {
return delegate.setDocument(docId);
}
protected abstract void doSetDocument(int docID);

@Override
public double nextValue() {
return delegate.nextValue();
public final void setDocument(int docID) {
doSetDocument(docID);
i = 0;
}

@Override
public AtomicFieldData.Order getOrder() {
return delegate.getOrder();
public long nextOrd() {
return ordAt(i++);
}

This comment has been minimized.

Copy link
@rmuir

rmuir Jul 18, 2014

Contributor

This class doesnt make a lot of sense to me. RandomAccessOrds already extends SortedDocValues, which itself has a nextOrd method. why the code duplication?

This comment has been minimized.

Copy link
@jpountz

jpountz Jul 18, 2014

Author Contributor

I don't think it duplicates. I just didn't want to have to implement nextOrd for every RandomAccessOrds impl while it could be handled by a base class by delegating to ordAt.



}
106 changes: 5 additions & 101 deletions src/main/java/org/elasticsearch/index/fielddata/AtomicFieldData.java
Expand Up @@ -20,117 +20,21 @@
package org.elasticsearch.index.fielddata;

import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.index.fielddata.ScriptDocValues.Strings;
import org.elasticsearch.common.lease.Releasable;

/**
* The thread safe {@link org.apache.lucene.index.AtomicReader} level cache of the data.
*/
public interface AtomicFieldData<Script extends ScriptDocValues> extends Accountable {

/**
* Use a non thread safe (lightweight) view of the values as bytes.
*/
BytesValues getBytesValues();
public interface AtomicFieldData extends Accountable, Releasable {

/**
* Returns a "scripting" based values.
*/
Script getScriptValues();
ScriptDocValues getScriptValues();

/**
* Close the field data.
* Return a String representation of the values.
*/
void close();

interface WithOrdinals<Script extends ScriptDocValues> extends AtomicFieldData<Script> {

public static final WithOrdinals<ScriptDocValues.Strings> EMPTY = new WithOrdinals<ScriptDocValues.Strings>() {

@Override
public Strings getScriptValues() {
return new ScriptDocValues.Strings(getBytesValues());
}

@Override
public void close() {
}

@Override
public long ramBytesUsed() {
return 0;
}

@Override
public BytesValues.WithOrdinals getBytesValues() {
return new BytesValues.WithOrdinals(false) {

@Override
public int setDocument(int docId) {
return 0;
}
SortedBinaryDocValues getBytesValues();

@Override
public long nextOrd() {
return MISSING_ORDINAL;
}

@Override
public BytesRef getValueByOrd(long ord) {
throw new UnsupportedOperationException();
}

@Override
public long getOrd(int docId) {
return MISSING_ORDINAL;
}

@Override
public long getMaxOrd() {
return 0;
}
};
}

};

/**
* Use a non thread safe (lightweight) view of the values as bytes.
* @param needsHashes
*/
BytesValues.WithOrdinals getBytesValues();

}

/**
* This enum provides information about the order of the values for
* a given document. For instance {@link BytesValues} by default
* return values in {@link #BYTES} order but if the interface
* wraps a numeric variant the sort order might change to {@link #NUMERIC}.
* In that case the values might not be returned in byte sort order but in numeric
* order instead while maintaining the property of <tt>N < N+1</tt> during the
* value iterations.
*
* @see org.elasticsearch.index.fielddata.BytesValues#getOrder()
* @see org.elasticsearch.index.fielddata.DoubleValues#getOrder()
* @see org.elasticsearch.index.fielddata.LongValues#getOrder()
*/
public enum Order {
/**
* Donates Byte sort order
*/
BYTES,
/**
* Donates Numeric sort order
*/
NUMERIC,
/**
* Donates custom sort order
*/
CUSTOM,
/**
* Donates no sort order
*/
NONE
}
}
Expand Up @@ -18,35 +18,15 @@
*/
package org.elasticsearch.index.fielddata;

import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.geo.GeoHashUtils;
import org.elasticsearch.common.geo.GeoPoint;

/**
* {@link AtomicFieldData} specialization for geo points.
*/
public abstract class AtomicGeoPointFieldData<Script extends ScriptDocValues> implements AtomicFieldData<Script> {
public interface AtomicGeoPointFieldData extends AtomicFieldData {

public abstract GeoPointValues getGeoPointValues();

@Override
public BytesValues getBytesValues() {
final GeoPointValues values = getGeoPointValues();
return new BytesValues(values.isMultiValued()) {
private final BytesRef scratch = new BytesRef();

@Override
public int setDocument(int docId) {
return values.setDocument(docId);
}

@Override
public BytesRef nextValue() {
GeoPoint value = values.nextValue();
scratch.copyChars(GeoHashUtils.encode(value.lat(), value.lon()));
return scratch;
}

};
}
/**
* Return geo-point values.
*/
MultiGeoPointValues getGeoPointValues();

}

0 comments on commit 8f45f46

Please sign in to comment.