elastic · nik9000 · Jul 29, 2015 · Aug 24, 2015 · mikemccand · Aug 11, 2015
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisService.java b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisService.java
@@ -29,6 +29,7 @@
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.AbstractIndexComponent;
 import org.elasticsearch.index.Index;
+import org.elasticsearch.index.mapper.core.StringFieldMapper;
 import org.elasticsearch.index.settings.IndexSettings;
 import org.elasticsearch.indices.analysis.IndicesAnalysisService;
 
@@ -215,19 +216,38 @@ public AnalysisService(Index index, @IndexSettings Settings indexSettings, @Null
 
         Map<String, NamedAnalyzer> analyzers = newHashMap();
         for (AnalyzerProvider analyzerFactory : analyzerProviders.values()) {
+            /*
+             * Lucene defaults positionOffsetGap to 0 in all analyzers but
+             * Elasticsearch defaults them to 0 only before version 2.1
+             * and 100 afterwards so we override the positionOffsetGap if it
+             * doesn't match here.
+             */
+            int overridePositionOffsetGap = StringFieldMapper.Defaults.positionOffsetGap(Version.indexCreated(indexSettings));
             if (analyzerFactory instanceof CustomAnalyzerProvider) {
                 ((CustomAnalyzerProvider) analyzerFactory).build(this);
+                /*
+                 * Custom analyzers already default to the correct, version
+                 * dependent positionOffsetGap and the user is be able to
+                 * configure the positionOffsetGap directly on the analyzer so
+                 * we disable overriding the positionOffsetGap to preserve the
+                 * user's setting.
+                 */
+                overridePositionOffsetGap = Integer.MIN_VALUE;
             }
             Analyzer analyzerF = analyzerFactory.get();
             if (analyzerF == null) {
                 throw new IllegalArgumentException("analyzer [" + analyzerFactory.name() + "] created null analyzer");
             }
             NamedAnalyzer analyzer;
-            // if we got a named analyzer back, use it...
             if (analyzerF instanceof NamedAnalyzer) {
+                // if we got a named analyzer back, use it...
                 analyzer = (NamedAnalyzer) analyzerF;
+                if (overridePositionOffsetGap >= 0 && analyzer.getPositionIncrementGap(analyzer.name()) != overridePositionOffsetGap) {
+                    // unless the positionOffsetGap needs to be overridden
+                    analyzer = new NamedAnalyzer(analyzer, overridePositionOffsetGap);
+                }
             } else {
-                analyzer = new NamedAnalyzer(analyzerFactory.name(), analyzerFactory.scope(), analyzerF);
+                analyzer = new NamedAnalyzer(analyzerFactory.name(), analyzerFactory.scope(), analyzerF, overridePositionOffsetGap);
             }
             analyzers.put(analyzerFactory.name(), analyzer);
             analyzers.put(Strings.toCamelCase(analyzerFactory.name()), analyzer);

diff --git a/core/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java b/core/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java
@@ -19,10 +19,12 @@
 
 package org.elasticsearch.index.analysis;
 
+import org.elasticsearch.Version;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
+import org.elasticsearch.index.mapper.core.StringFieldMapper;
 import org.elasticsearch.index.settings.IndexSettings;
 
 import java.util.List;
@@ -77,7 +79,8 @@ public void build(AnalysisService analysisService) {
             tokenFilters.add(tokenFilter);
         }
 
-        int positionOffsetGap = analyzerSettings.getAsInt("position_offset_gap", 0);
+        int positionOffsetGapDefault = StringFieldMapper.Defaults.positionOffsetGap(Version.indexCreated(indexSettings));
+        int positionOffsetGap = analyzerSettings.getAsInt("position_offset_gap", positionOffsetGapDefault);
         int offsetGap = analyzerSettings.getAsInt("offset_gap", -1);
 
         this.customAnalyzer = new CustomAnalyzer(tokenizer,

diff --git a/core/src/main/java/org/elasticsearch/index/mapper/core/StringFieldMapper.java b/core/src/main/java/org/elasticsearch/index/mapper/core/StringFieldMapper.java
@@ -24,6 +24,7 @@
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.Version;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.xcontent.XContentBuilder;
@@ -52,6 +53,7 @@
 public class StringFieldMapper extends FieldMapper implements AllFieldMapper.IncludeInAll {
 
     public static final String CONTENT_TYPE = "string";
+    private static final int POSITION_OFFSET_GAP_USE_ANALYZER = -1;
 
     public static class Defaults {
         public static final MappedFieldType FIELD_TYPE = new StringFieldType();
@@ -62,15 +64,36 @@ public static class Defaults {
 
         // NOTE, when adding defaults here, make sure you add them in the builder
         public static final String NULL_VALUE = null;
-        public static final int POSITION_OFFSET_GAP = 0;
+        /**
+         * Post 2.1 default for position_offset_gap. Set to 100 so that
+         * phrase queries of reasonably high slop will not match across field
+         * values.
+         */
+        public static final int POSITION_OFFSET_GAP = 100;
+        public static final int POSITION_OFFSET_GAP_PRE_2_1 = 0;
         public static final int IGNORE_ABOVE = -1;
+
+        /**
+         * The default position_offset_gap for a particular version of Elasticsearch.
+         */
+        public static int positionOffsetGap(Version version) {
+            if (version.before(Version.V_2_1_0)) {
+                return POSITION_OFFSET_GAP_PRE_2_1;
+            }
+            return POSITION_OFFSET_GAP;
+        }
     }
 
     public static class Builder extends FieldMapper.Builder<Builder, StringFieldMapper> {
 
         protected String nullValue = Defaults.NULL_VALUE;
 
-        protected int positionOffsetGap = Defaults.POSITION_OFFSET_GAP;
+        /**
+         * The distance between tokens from different values in the same field.
+         * POSITION_OFFSET_GAP_USE_ANALYZER means default to the analyzer's
+         * setting which in turn defaults to Defaults.POSITION_OFFSET_GAP.
+         */
+        protected int positionOffsetGap = POSITION_OFFSET_GAP_USE_ANALYZER;
 
         protected int ignoreAbove = Defaults.IGNORE_ABOVE;
 
@@ -102,7 +125,7 @@ public Builder ignoreAbove(int ignoreAbove) {
 
         @Override
         public StringFieldMapper build(BuilderContext context) {
-            if (positionOffsetGap > 0) {
+            if (positionOffsetGap != POSITION_OFFSET_GAP_USE_ANALYZER) {
                 fieldType.setIndexAnalyzer(new NamedAnalyzer(fieldType.indexAnalyzer(), positionOffsetGap));
                 fieldType.setSearchAnalyzer(new NamedAnalyzer(fieldType.searchAnalyzer(), positionOffsetGap));
                 fieldType.setSearchQuoteAnalyzer(new NamedAnalyzer(fieldType.searchQuoteAnalyzer(), positionOffsetGap));
@@ -154,7 +177,11 @@ public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext
                     builder.searchQuotedAnalyzer(analyzer);
                     iterator.remove();
                 } else if (propName.equals("position_offset_gap")) {
-                    builder.positionOffsetGap(XContentMapValues.nodeIntegerValue(propNode, -1));
+                    int newPositionOffsetGap = XContentMapValues.nodeIntegerValue(propNode, -1);
+                    if (newPositionOffsetGap < 0) {
+                        throw new MapperParsingException("positions_offset_gap less than 0 aren't allowed.");
+                    }
+                    builder.positionOffsetGap(newPositionOffsetGap);
                     // we need to update to actual analyzers if they are not set in this case...
                     // so we can inject the position offset gap...
                     if (builder.fieldType().indexAnalyzer() == null) {
@@ -354,7 +381,7 @@ protected void doXContentBody(XContentBuilder builder, boolean includeDefaults,
             builder.field("include_in_all", false);
         }
 
-        if (includeDefaults || positionOffsetGap != Defaults.POSITION_OFFSET_GAP) {
+        if (includeDefaults || positionOffsetGap != POSITION_OFFSET_GAP_USE_ANALYZER) {
             builder.field("position_offset_gap", positionOffsetGap);
         }
         NamedAnalyzer searchQuoteAnalyzer = fieldType().searchQuoteAnalyzer();

diff --git a/core/src/test/java/org/elasticsearch/bwcompat/OldIndexBackwardsCompatibilityIT.java b/core/src/test/java/org/elasticsearch/bwcompat/OldIndexBackwardsCompatibilityIT.java
@@ -21,6 +21,7 @@
 
 import com.google.common.base.Predicate;
 import com.google.common.util.concurrent.ListenableFuture;
+
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;
@@ -40,6 +41,7 @@
 import org.elasticsearch.common.xcontent.XContentHelper;
 import org.elasticsearch.env.NodeEnvironment;
 import org.elasticsearch.index.engine.EngineConfig;
+import org.elasticsearch.index.mapper.string.StringFieldMapperPositionOffsetGapTests;
 import org.elasticsearch.index.query.QueryBuilders;
 import org.elasticsearch.index.shard.MergePolicyConfig;
 import org.elasticsearch.indices.recovery.RecoverySettings;
@@ -330,6 +332,7 @@ void assertOldIndexWorks(String index) throws Exception {
         assertNewReplicasWork(indexName);
         assertUpgradeWorks(indexName, isLatestLuceneVersion(version));
         assertDeleteByQueryWorked(indexName, version);
+        assertPositionOffsetGapDefaults(indexName, version);
         unloadIndex(indexName);
     }
 
@@ -442,6 +445,14 @@ void assertDeleteByQueryWorked(String indexName, Version version) throws Excepti
         assertEquals(0, searchReq.get().getHits().getTotalHits());
     }
 
+    void assertPositionOffsetGapDefaults(String indexName, Version version) throws Exception {
+        if (version.before(Version.V_2_1_0)) {
+            StringFieldMapperPositionOffsetGapTests.assertGapIsZero(client(), indexName, "doc");
+        } else {
+            StringFieldMapperPositionOffsetGapTests.assertGapIsOneHundred(client(), indexName, "doc");
+        }
+    }
+
     void assertUpgradeWorks(String indexName, boolean alreadyLatest) throws Exception {
         if (alreadyLatest == false) {
             UpgradeIT.assertNotUpgraded(client(), indexName);

diff --git a/...t/java/org/elasticsearch/index/mapper/string/StringFieldMapperPositionOffsetGapTests.java b/...t/java/org/elasticsearch/index/mapper/string/StringFieldMapperPositionOffsetGapTests.java
@@ -0,0 +1,158 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper.string;
+
+import com.google.common.collect.ImmutableList;
+
+import org.elasticsearch.ExceptionsHelper;
+import org.elasticsearch.client.Client;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.XContentFactory;
+import org.elasticsearch.index.mapper.MapperParsingException;
+import org.elasticsearch.test.ESSingleNodeTestCase;
+
+import java.io.IOException;
+
+import static org.elasticsearch.index.query.QueryBuilders.matchPhraseQuery;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
+import static org.hamcrest.Matchers.containsString;
+
+/**
+ * Tests that position_offset_gap is read from the mapper and applies as
+ * expected in queries.
+ */
+public class StringFieldMapperPositionOffsetGapTests extends ESSingleNodeTestCase {
+    /**
+     * The default position_offset_gap should be large enough that most
+     * "sensible" queries phrase slops won't match across values.
+     */
+    public void testDefault() throws IOException {
+        assertGapIsOneHundred(client(), "test", "test");
+    }
+
+    /**
+     * Asserts that the post-2.0 default is being applied.
+     */
+    public static void assertGapIsOneHundred(Client client, String indexName, String type) throws IOException {
+        testGap(client(), indexName, type, 100);
+
+        // No match across gap using default slop with default positionOffsetGap
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two")).get(), 0);
+
+        // Nor with small-ish values
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two").slop(5)).get(), 0);
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two").slop(50)).get(), 0);
+
+        // But huge-ish values still match
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two").slop(500)).get(), 1);
+    }
+
+    public void testZero() throws IOException {
+        setupGapInMapping(0);
+        assertGapIsZero(client(), "test", "test");
+    }
+
+    /**
+     * Asserts that the pre-2.0 default has been applied or explicitly
+     * configured.
+     */
+    public static void assertGapIsZero(Client client, String indexName, String type) throws IOException {
+        testGap(client, indexName, type, 0);
+        /*
+         * Phrases match across different values using default slop with pre-2.0 default
+         * position_offset_gap.
+         */
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two")).get(), 1);
+    }
+
+    public void testLargerThanDefault() throws IOException {
+        setupGapInMapping(10000);
+        testGap(client(), "test", "test", 10000);
+    }
+
+    public void testSmallerThanDefault() throws IOException {
+        setupGapInMapping(2);
+        testGap(client(), "test", "test", 2);
+    }
+
+    public void testNegativeIsError() throws IOException {
+        try {
+            setupGapInMapping(-1);
+            fail("Expected an error");
+        } catch (MapperParsingException e) {
+            assertThat(ExceptionsHelper.detailedMessage(e), containsString("positions_offset_gap less than 0 aren't allowed"));
+        }
+    }
+
+    /**
+     * Tests that the default actually defaults to the position_offset_gap
+     * configured in the analyzer. This behavior is very old and a little
+     * strange but not worth breaking some thought.
+     */
+    public void testDefaultDefaultsToAnalyzer() throws IOException {
+        XContentBuilder settings = XContentFactory.jsonBuilder().startObject().startObject("analysis").startObject("analyzer")
+                .startObject("gappy");
+        settings.field("type", "custom");
+        settings.field("tokenizer", "standard");
+        settings.field("position_offset_gap", 2);
+        setupAnalyzer(settings, "gappy");
+        testGap(client(), "test", "test", 2);
+    }
+
+    /**
+     * Build an index named "test" with a field named "string" with the provided
+     * positionOffsetGap that uses the standard analyzer.
+     */
+    private void setupGapInMapping(int positionOffsetGap) throws IOException {
+        XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("properties").startObject("string");
+        mapping.field("type", "string");
+        mapping.field("position_offset_gap", positionOffsetGap);
+        client().admin().indices().prepareCreate("test").addMapping("test", mapping).get();
+    }
+
+    /**
+     * Build an index named "test" with the provided settings and and a field
+     * named "string" that uses the specified analyzer and default
+     * position_offset_gap.
+     */
+    private void setupAnalyzer(XContentBuilder settings, String analyzer) throws IOException {
+        XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("properties").startObject("string");
+        mapping.field("type", "string");
+        mapping.field("analyzer", analyzer);
+        client().admin().indices().prepareCreate("test").addMapping("test", mapping).setSettings(settings).get();
+    }
+
+    private static void testGap(Client client, String indexName, String type, int positionOffsetGap) throws IOException {
+        client.prepareIndex(indexName, type, "position_gap_test").setSource("string", ImmutableList.of("one", "two three")).setRefresh(true).get();
+
+        // Baseline - phrase query finds matches in the same field value
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "two three")).get(), 1);
+
+        if (positionOffsetGap > 0) {
+            // No match across gaps when slop < position gap
+            assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two").slop(positionOffsetGap - 1)).get(),
+                    0);
+        }
+
+        // Match across gaps when slop >= position gap
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two").slop(positionOffsetGap)).get(), 1);
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two").slop(positionOffsetGap + 1)).get(), 1);
+    }
+}
diff --git a/docs/reference/analysis/analyzers/custom-analyzer.asciidoc b/docs/reference/analysis/analyzers/custom-analyzer.asciidoc
@@ -20,8 +20,10 @@ filters.
 |`char_filter` |An optional list of logical / registered name of char
 filters.
 
-|`position_offset_gap` |An optional number of positions to increment 
-between each field value of a field using this analyzer.
+|`position_offset_gap` |An optional number of positions to increment
+between each field value of a field using this analyzer. Defaults to 100.
+100 was chosen because it prevents phrase queries with reasonably large
+slops (less than 100) from matching terms across field values.
 |=======================================================================
 
 Here is an example:
@@ -30,7 +32,7 @@ Here is an example:
 --------------------------------------------------
 index :
     analysis :
-        analyzer : 
+        analyzer :
             myAnalyzer2 :
                 type : custom
                 tokenizer : myTokenizer1

diff --git a/docs/reference/mapping/types/string.asciidoc b/docs/reference/mapping/types/string.asciidoc
@@ -145,6 +145,11 @@ Defaults depend on the <<mapping-index,`index`>> setting:
 
     The number of fake term positions which should be inserted between
     each element of an array of strings. Defaults to 0.
+    The number of fake term position which should be inserted between each
+    element of an array of strings. Defaults to the position_offset_gap
+    configured on the analyzer which defaults to 100. 100 was chosen because it
+    prevents phrase queries with reasonably large slops (less than 100) from
+    matching terms across field values.
 
 <<mapping-store,`store`>>::
 
@@ -166,5 +171,3 @@ Defaults depend on the <<mapping-index,`index`>> setting:
 
     Whether term vectors should be stored for an <<mapping-index,`analyzed`>>
     field. Defaults to `no`.
-
-