Fielddata: Switch to Lucene DV APIs.

This commits removes BytesValues/LongValues/DoubleValues/... and tries to use Lucene's APIs such as NumericDocValues or RandomAccessOrds instead whenever possible. The next step would be to take advantage of the fact that APIs are the same in Lucene and Elasticsearch in order to remove our custom comparators and use Lucene's. There are a few side-effects to this change: - GeoDistanceComparator has been removed, DoubleValuesComparator is used instead on top of dynamically computed values (was easier than migrating GeoDistanceComparator). - SortedNumericDocValues doesn't guarantee uniqueness so long/double terms aggregators have been updated to make sure a document cannot fall twice in the same bucket. - Sorting by maximum value of a field or running a `max` aggregation is potentially significantly faster thanks to the random-access API. Our aggs and p/c aggregations benchmarks don't report differences with this change on uninverted field data. However the fact that doc values don't need to be wrapped anymore seems to help a lot. For example TermsAggregationSearchBenchmark reports ~30% faster terms aggregations on doc values on string fields with this change, which are now only ~18% slower than uninverted field data although stored on disk. Close #6908
elastic · Jul 22, 2014 · 3c142e5 · 3c142e5
1 parent 0de30e1
commit 3c142e5
Show file tree

Hide file tree

Showing 202 changed files with 4,964 additions and 6,294 deletions.
diff --git a/src/main/java/org/elasticsearch/common/geo/GeoDistance.java b/src/main/java/org/elasticsearch/common/geo/GeoDistance.java
@@ -19,9 +19,11 @@
 
 package org.elasticsearch.common.geo;
 
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.SloppyMath;
 import org.elasticsearch.ElasticsearchIllegalArgumentException;
 import org.elasticsearch.common.unit.DistanceUnit;
+import org.elasticsearch.index.fielddata.*;
 
 import java.util.Locale;
 
@@ -368,4 +370,42 @@ public double calculate(double targetLatitude, double targetLongitude) {
             return SLOPPY_ARC.calculate(sourceLatitude, sourceLongitude, targetLatitude, targetLongitude, unit);
         }
     }
+
+    /**
+     * Return a {@link SortedNumericDoubleValues} instance that returns the distance to a given geo-point for each document.
+     */
+    public static SortedNumericDoubleValues distanceValues(final FixedSourceDistance distance, final MultiGeoPointValues geoPointValues) {
+        final GeoPointValues singleValues = FieldData.unwrapSingleton(geoPointValues);
+        if (singleValues != null) {
+            final Bits docsWithField = FieldData.unwrapSingletonBits(geoPointValues);
+            return FieldData.singleton(new NumericDoubleValues() {
+
+                @Override
+                public double get(int docID) {
+                    if (docsWithField != null && !docsWithField.get(docID)) {
+                        return 0d;
+                    }
+                    final GeoPoint point = singleValues.get(docID);
+                    return distance.calculate(point.lat(), point.lon());
+                }
+
+            }, docsWithField);
+        } else {
+            return new SortingNumericDoubleValues() {
+
+                @Override
+                public void setDocument(int doc) {
+                    geoPointValues.setDocument(doc);
+                    count = geoPointValues.count();
+                    grow();
+                    for (int i = 0; i < count; ++i) {
+                        final GeoPoint point = geoPointValues.valueAt(i);
+                        values[i] = distance.calculate(point.lat(), point.lon());
+                    }
+                    sort();
+                }
+
+            };
+        }
+    }
 }
diff --git a/src/main/java/org/elasticsearch/common/lucene/search/function/FieldValueFactorFunction.java b/src/main/java/org/elasticsearch/common/lucene/search/function/FieldValueFactorFunction.java
@@ -22,8 +22,8 @@
 import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.search.Explanation;
 import org.elasticsearch.ElasticsearchException;
-import org.elasticsearch.index.fielddata.DoubleValues;
 import org.elasticsearch.index.fielddata.IndexNumericFieldData;
+import org.elasticsearch.index.fielddata.SortedNumericDoubleValues;
 
 import java.util.Locale;
 
@@ -37,7 +37,7 @@ public class FieldValueFactorFunction extends ScoreFunction {
     private final float boostFactor;
     private final Modifier modifier;
     private final IndexNumericFieldData indexFieldData;
-    private DoubleValues values;
+    private SortedNumericDoubleValues values;
 
     public FieldValueFactorFunction(String field, float boostFactor, Modifier modifierType, IndexNumericFieldData indexFieldData) {
         super(CombineFunction.MULT);
@@ -54,9 +54,10 @@ public void setNextReader(AtomicReaderContext context) {
 
     @Override
     public double score(int docId, float subQueryScore) {
-        final int numValues = this.values.setDocument(docId);
+        this.values.setDocument(docId);
+        final int numValues = this.values.count();
         if (numValues > 0) {
-            double val = this.values.nextValue() * boostFactor;
+            double val = this.values.valueAt(0) * boostFactor;
             double result = modifier.apply(val);
             if (Double.isNaN(result) || Double.isInfinite(result)) {
                 throw new ElasticsearchException("Result of field modification [" + modifier.toString() +

diff --git a/src/main/java/org/elasticsearch/index/fielddata/AbstractAtomicNumericFieldData.java b/src/main/java/org/elasticsearch/index/fielddata/AbstractAtomicNumericFieldData.java
diff --git a/...ex/fielddata/plain/DenseDoubleValues.java → ...x/fielddata/AbstractRandomAccessOrds.java b/...ex/fielddata/plain/DenseDoubleValues.java → ...x/fielddata/AbstractRandomAccessOrds.java
@@ -16,22 +16,29 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.elasticsearch.index.fielddata.plain;
 
-import org.elasticsearch.index.fielddata.DoubleValues;
+package org.elasticsearch.index.fielddata;
+
+import org.apache.lucene.index.RandomAccessOrds;
 
 /**
- * Package private base class for dense double values.
+ * Base implementation of a {@link RandomAccessOrds} instance.
  */
-abstract class DenseDoubleValues extends DoubleValues {
+public abstract class AbstractRandomAccessOrds extends RandomAccessOrds {
+
+    int i = 0;
+
+    protected abstract void doSetDocument(int docID);
 
-    protected DenseDoubleValues(boolean multiValued) {
-        super(multiValued);
+    @Override
+    public final void setDocument(int docID) {
+        doSetDocument(docID);
+        i = 0;
     }
 
     @Override
-    public final int setDocument(int docId) {
-        this.docId = docId;
-        return 1;
+    public long nextOrd() {
+        return ordAt(i++);
     }
-}
+
+}
diff --git a/src/main/java/org/elasticsearch/index/fielddata/AtomicFieldData.java b/src/main/java/org/elasticsearch/index/fielddata/AtomicFieldData.java
@@ -20,117 +20,21 @@
 package org.elasticsearch.index.fielddata;
 
 import org.apache.lucene.util.Accountable;
-import org.apache.lucene.util.BytesRef;
-import org.elasticsearch.index.fielddata.ScriptDocValues.Strings;
+import org.elasticsearch.common.lease.Releasable;
 
 /**
  * The thread safe {@link org.apache.lucene.index.AtomicReader} level cache of the data.
  */
-public interface AtomicFieldData<Script extends ScriptDocValues> extends Accountable {
-
-    /**
-     * Use a non thread safe (lightweight) view of the values as bytes.
-     */
-    BytesValues getBytesValues();
+public interface AtomicFieldData extends Accountable, Releasable {
 
     /**
      * Returns a "scripting" based values.
      */
-    Script getScriptValues();
+    ScriptDocValues getScriptValues();
 
     /**
-     * Close the field data.
+     * Return a String representation of the values.
      */
-    void close();
-
-    interface WithOrdinals<Script extends ScriptDocValues> extends AtomicFieldData<Script> {
-
-        public static final WithOrdinals<ScriptDocValues.Strings> EMPTY = new WithOrdinals<ScriptDocValues.Strings>() {
-
-            @Override
-            public Strings getScriptValues() {
-                return new ScriptDocValues.Strings(getBytesValues());
-            }
-
-            @Override
-            public void close() {
-            }
-
-            @Override
-            public long ramBytesUsed() {
-                return 0;
-            }
-
-            @Override
-            public BytesValues.WithOrdinals getBytesValues() {
-                return new BytesValues.WithOrdinals(false) {
-
-                    @Override
-                    public int setDocument(int docId) {
-                        return 0;
-                    }
+    SortedBinaryDocValues getBytesValues();
 
-                    @Override
-                    public long nextOrd() {
-                        return MISSING_ORDINAL;
-                    }
-
-                    @Override
-                    public BytesRef getValueByOrd(long ord) {
-                        throw new UnsupportedOperationException();
-                    }
-
-                    @Override
-                    public long getOrd(int docId) {
-                        return MISSING_ORDINAL;
-                    }
-
-                    @Override
-                    public long getMaxOrd() {
-                        return 0;
-                    }
-                };
-            }
-
-        };
-
-        /**
-         * Use a non thread safe (lightweight) view of the values as bytes.
-         * @param needsHashes
-         */
-        BytesValues.WithOrdinals getBytesValues();
-
-    }
-
-    /**
-     * This enum provides information about the order of the values for
-     * a given document. For instance {@link BytesValues} by default
-     * return values in {@link #BYTES} order but if the interface
-     * wraps a numeric variant the sort order might change to {@link #NUMERIC}.
-     * In that case the values might not be returned in byte sort order but in numeric
-     * order instead while maintaining the property of <tt>N < N+1</tt> during the
-     * value iterations.
-     *
-     * @see org.elasticsearch.index.fielddata.BytesValues#getOrder()
-     * @see org.elasticsearch.index.fielddata.DoubleValues#getOrder()
-     * @see org.elasticsearch.index.fielddata.LongValues#getOrder()
-     */
-    public enum Order {
-        /**
-         * Donates Byte sort order
-         */
-        BYTES,
-        /**
-         * Donates Numeric sort order
-         */
-        NUMERIC,
-        /**
-         * Donates custom sort order
-         */
-        CUSTOM,
-        /**
-         * Donates no sort order
-         */
-        NONE
-    }
 }
diff --git a/src/main/java/org/elasticsearch/index/fielddata/AtomicGeoPointFieldData.java b/src/main/java/org/elasticsearch/index/fielddata/AtomicGeoPointFieldData.java
@@ -18,35 +18,15 @@
  */
 package org.elasticsearch.index.fielddata;
 
-import org.apache.lucene.util.BytesRef;
-import org.elasticsearch.common.geo.GeoHashUtils;
-import org.elasticsearch.common.geo.GeoPoint;
 
 /**
+ * {@link AtomicFieldData} specialization for geo points.
  */
-public abstract class AtomicGeoPointFieldData<Script extends ScriptDocValues> implements AtomicFieldData<Script> {
+public interface AtomicGeoPointFieldData extends AtomicFieldData {
 
-    public abstract GeoPointValues getGeoPointValues();
-
-    @Override
-    public BytesValues getBytesValues() {
-        final GeoPointValues values = getGeoPointValues();
-        return new BytesValues(values.isMultiValued()) {
-            private final BytesRef scratch = new BytesRef();
-
-            @Override
-            public int setDocument(int docId) {
-                return values.setDocument(docId);
-            }
-
-            @Override
-            public BytesRef nextValue() {
-                GeoPoint value = values.nextValue();
-                scratch.copyChars(GeoHashUtils.encode(value.lat(), value.lon()));
-                return scratch;
-            }
-
-        };
-    }
+    /**
+     * Return geo-point values.
+     */
+    MultiGeoPointValues getGeoPointValues();
 
 }