From e49b98cf0d2b0e080b48b7b5326c5b3896fc7706 Mon Sep 17 00:00:00 2001
From: Nik Everett
Date: Wed, 29 Jul 2015 17:07:26 -0400
Subject: [PATCH] Mapping: Default position_offset_gap to 100

This is much more fiddly than you'd expect it to be because of the way
position_offset_gap is applied in StringFieldMapper. Instead of setting the
default to 100 it's simpler to make sure that all the analyzers default to
100 and that StringFieldMapper doesn't override the default unless the user
specifies something different. Unless the index was created before 2.1, in
which case the old default of 0 still applies. Also, position_offset_gap
values less than 0 aren't allowed at all.

New tests test that:
1. the new default doesn't match phrases across values with reasonably low
slop (5)
2. the new default doesn't match phrases across values with reasonably high
slop (50)
3. you can override the value and phrases work as you'd expect
4. if you leave the value undefined in the mapping and define it on a custom
analyzer, the value from the custom analyzer shines through

Closes #7268
---
 .../index/analysis/AnalysisService.java       |  24 +-
 .../analysis/CustomAnalyzerProvider.java      |   5 +-
 .../index/mapper/core/StringFieldMapper.java  |  37 +-
 .../OldIndexBackwardsCompatibilityIT.java     |  11 +
 ...ringFieldMapperPositionOffsetGapTests.java | 158 +++++
 .../analyzers/custom-analyzer.asciidoc        |   8 +-
 .../mapping/types/core-types.asciidoc         | 651 ++++++++++++++++++
 docs/reference/migration/migrate_2_0.asciidoc |   2 +-
 docs/reference/migration/migrate_2_1.asciidoc |  10 +
 9 files changed, 894 insertions(+), 12 deletions(-)
 create mode 100644 core/src/test/java/org/elasticsearch/index/mapper/string/StringFieldMapperPositionOffsetGapTests.java
 create mode 100644 docs/reference/mapping/types/core-types.asciidoc

diff --git a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisService.java b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisService.java
index 1cc37b8cda8fc..55d6f572c5eb8 100644
--- a/core/src/main/java/org/elasticsearch/index/analysis/AnalysisService.java
+++ b/core/src/main/java/org/elasticsearch/index/analysis/AnalysisService.java
@@ -29,6 +29,7 @@
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.AbstractIndexComponent;
 import org.elasticsearch.index.Index;
+import org.elasticsearch.index.mapper.core.StringFieldMapper;
 import org.elasticsearch.index.settings.IndexSettings;
 import org.elasticsearch.indices.analysis.IndicesAnalysisService;
@@ -215,19 +216,38 @@ public AnalysisService(Index index, @IndexSettings Settings indexSettings, @Null
         Map analyzers = newHashMap();
         for (AnalyzerProvider analyzerFactory : analyzerProviders.values()) {
+            /*
+             * Lucene defaults positionOffsetGap to 0 in all analyzers but
+             * Elasticsearch defaults it to 0 only before version 2.1 and to
+             * 100 afterwards, so we override the positionOffsetGap if it
+             * doesn't match here.
+             */
+            int overridePositionOffsetGap = StringFieldMapper.Defaults.positionOffsetGap(Version.indexCreated(indexSettings));
             if (analyzerFactory instanceof CustomAnalyzerProvider) {
                 ((CustomAnalyzerProvider) analyzerFactory).build(this);
+                /*
+                 * Custom analyzers already default to the correct, version
+                 * dependent positionOffsetGap and the user is able to
+                 * configure the positionOffsetGap directly on the analyzer so
+                 * we disable overriding the positionOffsetGap to preserve the
+                 * user's setting.
+                 */
+                overridePositionOffsetGap = Integer.MIN_VALUE;
             }
             Analyzer analyzerF = analyzerFactory.get();
             if (analyzerF == null) {
                 throw new IllegalArgumentException("analyzer [" + analyzerFactory.name() + "] created null analyzer");
             }
             NamedAnalyzer analyzer;
-            // if we got a named analyzer back, use it...
             if (analyzerF instanceof NamedAnalyzer) {
+                // if we got a named analyzer back, use it...
                 analyzer = (NamedAnalyzer) analyzerF;
+                if (overridePositionOffsetGap >= 0 && analyzer.getPositionIncrementGap(analyzer.name()) != overridePositionOffsetGap) {
+                    // unless the positionOffsetGap needs to be overridden
+                    analyzer = new NamedAnalyzer(analyzer, overridePositionOffsetGap);
+                }
             } else {
-                analyzer = new NamedAnalyzer(analyzerFactory.name(), analyzerFactory.scope(), analyzerF);
+                analyzer = new NamedAnalyzer(analyzerFactory.name(), analyzerFactory.scope(), analyzerF, overridePositionOffsetGap);
             }
             analyzers.put(analyzerFactory.name(), analyzer);
             analyzers.put(Strings.toCamelCase(analyzerFactory.name()), analyzer);
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java b/core/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java
index 8975ba0044072..d3599dbceefac 100644
--- a/core/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java
+++ b/core/src/main/java/org/elasticsearch/index/analysis/CustomAnalyzerProvider.java
@@ -19,10 +19,12 @@
 package org.elasticsearch.index.analysis;
 
+import org.elasticsearch.Version;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
+import org.elasticsearch.index.mapper.core.StringFieldMapper;
 import org.elasticsearch.index.settings.IndexSettings;
 
 import java.util.List;
@@ -77,7 +79,8 @@ public void build(AnalysisService analysisService) {
             tokenFilters.add(tokenFilter);
         }
 
-        int positionOffsetGap = analyzerSettings.getAsInt("position_offset_gap", 0);
+        int positionOffsetGapDefault = StringFieldMapper.Defaults.positionOffsetGap(Version.indexCreated(indexSettings));
+        int positionOffsetGap = analyzerSettings.getAsInt("position_offset_gap", positionOffsetGapDefault);
         int offsetGap = analyzerSettings.getAsInt("offset_gap", -1);
 
         this.customAnalyzer = new CustomAnalyzer(tokenizer,
diff --git a/core/src/main/java/org/elasticsearch/index/mapper/core/StringFieldMapper.java b/core/src/main/java/org/elasticsearch/index/mapper/core/StringFieldMapper.java
index a255a2f106f8b..903660388730d 100644
--- a/core/src/main/java/org/elasticsearch/index/mapper/core/StringFieldMapper.java
+++ b/core/src/main/java/org/elasticsearch/index/mapper/core/StringFieldMapper.java
@@ -24,6 +24,7 @@
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.Version;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.xcontent.XContentBuilder;
@@ -52,6 +53,7 @@ public class StringFieldMapper extends FieldMapper implements AllFieldMapper.IncludeInAll {
 
     public static final String CONTENT_TYPE = "string";
+    private static final int POSITION_OFFSET_GAP_USE_ANALYZER = -1;
 
     public static class Defaults {
         public static final MappedFieldType FIELD_TYPE = new StringFieldType();
@@ -62,15 +64,36 @@ public static class Defaults {
         // NOTE, when adding defaults here, make sure you add them in the builder
         public static final String
NULL_VALUE = null;
-        public static final int POSITION_OFFSET_GAP = 0;
+        /**
+         * The post-2.1 default for position_offset_gap. Set to 100 so that
+         * phrase queries of reasonably high slop will not match across field
+         * values.
+         */
+        public static final int POSITION_OFFSET_GAP = 100;
+        public static final int POSITION_OFFSET_GAP_PRE_2_1 = 0;
         public static final int IGNORE_ABOVE = -1;
+
+        /**
+         * The default position_offset_gap for a particular version of
+         * Elasticsearch.
+         */
+        public static int positionOffsetGap(Version version) {
+            if (version.before(Version.V_2_1_0)) {
+                return POSITION_OFFSET_GAP_PRE_2_1;
+            }
+            return POSITION_OFFSET_GAP;
+        }
     }
 
     public static class Builder extends FieldMapper.Builder {
         protected String nullValue = Defaults.NULL_VALUE;
 
-        protected int positionOffsetGap = Defaults.POSITION_OFFSET_GAP;
+        /**
+         * The distance between tokens from different values in the same
+         * field. POSITION_OFFSET_GAP_USE_ANALYZER means default to the
+         * analyzer's setting, which in turn defaults to
+         * Defaults.POSITION_OFFSET_GAP.
+         */
+        protected int positionOffsetGap = POSITION_OFFSET_GAP_USE_ANALYZER;
 
         protected int ignoreAbove = Defaults.IGNORE_ABOVE;
 
@@ -102,7 +125,7 @@ public Builder ignoreAbove(int ignoreAbove) {
 
         @Override
         public StringFieldMapper build(BuilderContext context) {
-            if (positionOffsetGap > 0) {
+            if (positionOffsetGap != POSITION_OFFSET_GAP_USE_ANALYZER) {
                 fieldType.setIndexAnalyzer(new NamedAnalyzer(fieldType.indexAnalyzer(), positionOffsetGap));
                 fieldType.setSearchAnalyzer(new NamedAnalyzer(fieldType.searchAnalyzer(), positionOffsetGap));
                 fieldType.setSearchQuoteAnalyzer(new NamedAnalyzer(fieldType.searchQuoteAnalyzer(), positionOffsetGap));
@@ -154,7 +177,11 @@ public Mapper.Builder parse(String name, Map node, ParserContext
                 builder.searchQuotedAnalyzer(analyzer);
                 iterator.remove();
             } else if (propName.equals("position_offset_gap")) {
-                builder.positionOffsetGap(XContentMapValues.nodeIntegerValue(propNode, -1));
+                int newPositionOffsetGap = XContentMapValues.nodeIntegerValue(propNode, -1);
+                if (newPositionOffsetGap < 0) {
+                    throw new MapperParsingException("position_offset_gap less than 0 isn't allowed.");
+                }
+                builder.positionOffsetGap(newPositionOffsetGap);
                 // we need to update to actual analyzers if they are not set in this case...
                 // so we can inject the position offset gap...
                if (builder.fieldType().indexAnalyzer() == null) {
@@ -354,7 +381,7 @@ protected void doXContentBody(XContentBuilder builder, boolean includeDefaults,
             builder.field("include_in_all", false);
         }
 
-        if (includeDefaults || positionOffsetGap != Defaults.POSITION_OFFSET_GAP) {
+        if (includeDefaults || positionOffsetGap != POSITION_OFFSET_GAP_USE_ANALYZER) {
             builder.field("position_offset_gap", positionOffsetGap);
         }
         NamedAnalyzer searchQuoteAnalyzer = fieldType().searchQuoteAnalyzer();
diff --git a/core/src/test/java/org/elasticsearch/bwcompat/OldIndexBackwardsCompatibilityIT.java b/core/src/test/java/org/elasticsearch/bwcompat/OldIndexBackwardsCompatibilityIT.java
index db464c31516e8..1efc9e888a56c 100644
--- a/core/src/test/java/org/elasticsearch/bwcompat/OldIndexBackwardsCompatibilityIT.java
+++ b/core/src/test/java/org/elasticsearch/bwcompat/OldIndexBackwardsCompatibilityIT.java
@@ -21,6 +21,7 @@
 import com.google.common.base.Predicate;
 import com.google.common.util.concurrent.ListenableFuture;
+
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;
@@ -40,6 +41,7 @@
 import org.elasticsearch.common.xcontent.XContentHelper;
 import org.elasticsearch.env.NodeEnvironment;
 import org.elasticsearch.index.engine.EngineConfig;
+import org.elasticsearch.index.mapper.string.StringFieldMapperPositionOffsetGapTests;
 import org.elasticsearch.index.query.QueryBuilders;
 import org.elasticsearch.index.shard.MergePolicyConfig;
 import org.elasticsearch.indices.recovery.RecoverySettings;
@@ -330,6 +332,7 @@ void assertOldIndexWorks(String index) throws Exception {
         assertNewReplicasWork(indexName);
         assertUpgradeWorks(indexName, isLatestLuceneVersion(version));
         assertDeleteByQueryWorked(indexName, version);
+        assertPositionOffsetGapDefaults(indexName, version);
         unloadIndex(indexName);
     }
@@ -442,6 +445,14 @@ void assertDeleteByQueryWorked(String indexName, Version version) throws Excepti
         assertEquals(0, searchReq.get().getHits().getTotalHits());
     }
 
+    void assertPositionOffsetGapDefaults(String indexName, Version version) throws Exception {
+        if (version.before(Version.V_2_1_0)) {
+            StringFieldMapperPositionOffsetGapTests.assertGapIsZero(client(), indexName, "doc");
+        } else {
+            StringFieldMapperPositionOffsetGapTests.assertGapIsOneHundred(client(), indexName, "doc");
+        }
+    }
+
     void assertUpgradeWorks(String indexName, boolean alreadyLatest) throws Exception {
         if (alreadyLatest == false) {
             UpgradeIT.assertNotUpgraded(client(), indexName);
diff --git a/core/src/test/java/org/elasticsearch/index/mapper/string/StringFieldMapperPositionOffsetGapTests.java b/core/src/test/java/org/elasticsearch/index/mapper/string/StringFieldMapperPositionOffsetGapTests.java
new file mode 100644
index 0000000000000..86701f83c5ea9
--- /dev/null
+++ b/core/src/test/java/org/elasticsearch/index/mapper/string/StringFieldMapperPositionOffsetGapTests.java
@@ -0,0 +1,158 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.mapper.string;
+
+import com.google.common.collect.ImmutableList;
+
+import org.elasticsearch.ExceptionsHelper;
+import org.elasticsearch.client.Client;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.XContentFactory;
+import org.elasticsearch.index.mapper.MapperParsingException;
+import org.elasticsearch.test.ESSingleNodeTestCase;
+
+import java.io.IOException;
+
+import static org.elasticsearch.index.query.QueryBuilders.matchPhraseQuery;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
+import static org.hamcrest.Matchers.containsString;
+
+/**
+ * Tests that position_offset_gap is read from the mapper and applied as
+ * expected in queries.
+ */
+public class StringFieldMapperPositionOffsetGapTests extends ESSingleNodeTestCase {
+    /**
+     * The default position_offset_gap should be large enough that most
+     * "sensible" phrase slops won't match across values.
+     */
+    public void testDefault() throws IOException {
+        assertGapIsOneHundred(client(), "test", "test");
+    }
+
+    /**
+     * Asserts that the post-2.1 default is being applied.
+     */
+    public static void assertGapIsOneHundred(Client client, String indexName, String type) throws IOException {
+        testGap(client, indexName, type, 100);
+
+        // No match across the gap using the default slop with the default positionOffsetGap
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two")).get(), 0);
+
+        // Nor with small-ish values
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two").slop(5)).get(), 0);
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two").slop(50)).get(), 0);
+
+        // But huge-ish values still match
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two").slop(500)).get(), 1);
+    }
+
+    public void testZero() throws IOException {
+        setupGapInMapping(0);
+        assertGapIsZero(client(), "test", "test");
+    }
+
+    /**
+     * Asserts that the pre-2.1 default has been applied or explicitly
+     * configured.
+     */
+    public static void assertGapIsZero(Client client, String indexName, String type) throws IOException {
+        testGap(client, indexName, type, 0);
+        /*
+         * Phrases match across different values using the default slop with
+         * the pre-2.1 default position_offset_gap.
+         */
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two")).get(), 1);
+    }
+
+    public void testLargerThanDefault() throws IOException {
+        setupGapInMapping(10000);
+        testGap(client(), "test", "test", 10000);
+    }
+
+    public void testSmallerThanDefault() throws IOException {
+        setupGapInMapping(2);
+        testGap(client(), "test", "test", 2);
+    }
+
+    public void testNegativeIsError() throws IOException {
+        try {
+            setupGapInMapping(-1);
+            fail("Expected an error");
+        } catch (MapperParsingException e) {
+            assertThat(ExceptionsHelper.detailedMessage(e), containsString("position_offset_gap less than 0 isn't allowed"));
+        }
+    }
+
+    /**
+     * Tests that the default actually defaults to the position_offset_gap
+     * configured in the analyzer. This behavior is very old and a little
+     * strange but not worth breaking without some thought.
+     */
+    public void testDefaultDefaultsToAnalyzer() throws IOException {
+        XContentBuilder settings = XContentFactory.jsonBuilder().startObject().startObject("analysis").startObject("analyzer")
+                .startObject("gappy");
+        settings.field("type", "custom");
+        settings.field("tokenizer", "standard");
+        settings.field("position_offset_gap", 2);
+        setupAnalyzer(settings, "gappy");
+        testGap(client(), "test", "test", 2);
+    }
+
+    /**
+     * Builds an index named "test" with a field named "string" that uses the
+     * standard analyzer and the provided positionOffsetGap.
+     */
+    private void setupGapInMapping(int positionOffsetGap) throws IOException {
+        XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("properties").startObject("string");
+        mapping.field("type", "string");
+        mapping.field("position_offset_gap", positionOffsetGap);
+        client().admin().indices().prepareCreate("test").addMapping("test", mapping).get();
+    }
+
+    /**
+     * Builds an index named "test" with the provided settings and a field
+     * named "string" that uses the specified analyzer and the default
+     * position_offset_gap.
+     */
+    private void setupAnalyzer(XContentBuilder settings, String analyzer) throws IOException {
+        XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("properties").startObject("string");
+        mapping.field("type", "string");
+        mapping.field("analyzer", analyzer);
+        client().admin().indices().prepareCreate("test").addMapping("test", mapping).setSettings(settings).get();
+    }
+
+    private static void testGap(Client client, String indexName, String type, int positionOffsetGap) throws IOException {
+        client.prepareIndex(indexName, type, "position_gap_test").setSource("string", ImmutableList.of("one", "two three")).setRefresh(true).get();
+
+        // Baseline - phrase query finds matches in the same field value
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "two three")).get(), 1);
+
+        if (positionOffsetGap > 0) {
+            // No match across gaps when slop < position gap
+            assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two").slop(positionOffsetGap - 1)).get(),
+                    0);
+        }
+
+        // Match across gaps when slop >= position gap
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two").slop(positionOffsetGap)).get(), 1);
+        assertHitCount(client.prepareSearch(indexName).setQuery(matchPhraseQuery("string", "one two").slop(positionOffsetGap + 1)).get(), 1);
+    }
+}
diff --git a/docs/reference/analysis/analyzers/custom-analyzer.asciidoc b/docs/reference/analysis/analyzers/custom-analyzer.asciidoc
index bdc03a0998bea..d11cb7f95b660 100644
--- a/docs/reference/analysis/analyzers/custom-analyzer.asciidoc
+++ b/docs/reference/analysis/analyzers/custom-analyzer.asciidoc
@@ -20,8 +20,10 @@ filters.
 |`char_filter` |An optional list of logical / registered name of char
 filters.
 
-|`position_offset_gap` |An optional number of positions to increment
-between each field value of a field using this analyzer.
+|`position_offset_gap` |An optional number of positions to increment
+between each field value of a field using this analyzer. Defaults to 100.
+100 was chosen because it prevents phrase queries with reasonably large
+slops (less than 100) from matching terms across field values.
 |=======================================================================
 
 Here is an example:
@@ -30,7 +32,7 @@ Here is an example:
 index :
     analysis :
-        analyzer :
+        analyzer :
             myAnalyzer2 :
                 type : custom
                 tokenizer : myTokenizer1
diff --git a/docs/reference/mapping/types/core-types.asciidoc b/docs/reference/mapping/types/core-types.asciidoc
new file mode 100644
index 0000000000000..50c017e2e1131
--- /dev/null
+++ b/docs/reference/mapping/types/core-types.asciidoc
@@ -0,0 +1,651 @@
+[[mapping-core-types]]
+=== Core Types
+
+Each JSON field can be mapped to a specific core type. JSON itself
+already provides us with some typing, with its support for `string`,
+`integer`/`long`, `float`/`double`, `boolean`, and `null`.
+
+The following sample tweet JSON document will be used to explain the
+core types:
+
+[source,js]
+--------------------------------------------------
+{
+    "tweet" : {
+        "user" : "kimchy",
+        "message" : "This is a tweet!",
+        "postDate" : "2009-11-15T14:12:12",
+        "priority" : 4,
+        "rank" : 12.3
+    }
+}
+--------------------------------------------------
+
+Explicit mapping for the above JSON tweet can be:
+
+[source,js]
+--------------------------------------------------
+{
+    "tweet" : {
+        "properties" : {
+            "user" : {"type" : "string", "index" : "not_analyzed"},
+            "message" : {"type" : "string", "null_value" : "na"},
+            "postDate" : {"type" : "date"},
+            "priority" : {"type" : "integer"},
+            "rank" : {"type" : "float"}
+        }
+    }
+}
+--------------------------------------------------
+
+[float]
+[[string]]
+==== String
+
+The text-based string type is the most basic type, and contains one or
+more characters. An example mapping can be:
+
+[source,js]
+--------------------------------------------------
+{
+    "tweet" : {
+        "properties" : {
+            "message" : {
+                "type" : "string",
+                "store" : true,
+                "index" : "analyzed",
+                "null_value" : "na"
+            },
+            "user" : {
+                "type" : "string",
+                "index" : "not_analyzed",
+                "norms" : {
+                    "enabled" : false
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+The above mapping defines a `string` `message` property/field within the
+`tweet` type. The field is stored in the index (so it can later be
+retrieved using selective loading when searching), and it gets analyzed
+(broken down into searchable terms). If the message has a `null` value,
+then the value that will be stored is `na`. There is also a `string`
+`user` field which is indexed as-is (not broken down into tokens) and has
+norms disabled (so that matching this field is a binary decision, no match
+is better than another one).
+
+The following table lists all the attributes that can be used with the
+`string` type:
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Attribute |Description
+|`index_name` |The name of the field that will be stored in the index.
+Defaults to the property/field name.
+
+|`store` |Set to `true` to actually store the field in the index, `false` to not
+store it. Since by default Elasticsearch stores all fields of the source
+document in the special `_source` field, this option is primarily useful when
+the `_source` field has been disabled in the type definition. Defaults to
+`false`.
+
+|`index` |Set to `analyzed` for the field to be indexed and searchable
+after being broken down into tokens using an analyzer. `not_analyzed`
+means that it's still searchable, but it does not go through any analysis
+process, nor is it broken down into tokens. `no` means that it won't be
+searchable at all (as an individual field; it may still be included in
+`_all`). Setting to `no` disables `include_in_all`. Defaults to
+`analyzed`.
+
+|`doc_values` |Set to `true` to store field values in a column-stride fashion.
+Automatically set to `true` when the <<fielddata-formats,`fielddata` format>>
+is `doc_values`.
+
+|`term_vector` |Possible values are `no`, `yes`, `with_offsets`,
+`with_positions`, `with_positions_offsets`. Defaults to `no`.
+
+|`boost` |The boost value. Defaults to `1.0`.
+
+|`null_value` |When there is a (JSON) null value for the field, use the
+`null_value` as the field value. Defaults to not adding the field at
+all.
+
+|`norms: {enabled: <value>}` |Boolean value if norms should be enabled or
+not.
+Defaults to `true` for `analyzed` fields, and to `false` for
+`not_analyzed` fields. See the <<norms,norms section>>.
+
+|`norms: {loading: <value>}` |Describes how norms should be loaded, possible
+values are `eager` and `lazy` (default). It is possible to change the default
+value to `eager` for all fields by configuring the index setting
+`index.norms.loading` to `eager`.
+
+|`index_options` |Allows setting the indexing
+options; possible values are `docs` (only doc numbers are indexed),
+`freqs` (doc numbers and term frequencies), and `positions` (doc
+numbers, term frequencies and positions). Defaults to `positions` for
+`analyzed` fields, and to `docs` for `not_analyzed` fields. It
+is also possible to set it to `offsets` (doc numbers, term
+frequencies, positions and offsets).
+
+|`analyzer` |The analyzer used to analyze the text contents when
+`analyzed` during indexing and searching.
+Defaults to the globally configured analyzer.
+
+|`search_analyzer` |The analyzer used to analyze the field when searching,
+which overrides the value of `analyzer`. Can be updated on an existing field.
+
+|`include_in_all` |Should the field be included in the `_all` field (if
+enabled). If `index` is set to `no` this defaults to `false`, otherwise,
+defaults to `true` or to the parent `object` type setting.
+
+|`ignore_above` |The analyzer will ignore strings larger than this size.
+Useful for generic `not_analyzed` fields that should ignore long text.
+
+This option is also useful for protecting against Lucene's term byte-length
+limit of `32766`. Note: the value for `ignore_above` is the _character count_,
+but Lucene counts bytes, so if you have UTF-8 text, you may want to set the
+limit to `32766 / 3 = 10922` since UTF-8 characters may occupy at most 3
+bytes.
+
+|`position_offset_gap` |Position increment gap between field instances
+with the same field name. Defaults to the analyzer's `position_offset_gap`,
+which in turn defaults to 100. 100 was chosen because it prevents phrase
+queries with reasonably large slops (less than 100) from matching terms
+across field values.
+|=======================================================================
+
+The `string` type also supports custom indexing parameters associated
+with the indexed value. For example:
+
+[source,js]
+--------------------------------------------------
+{
+    "message" : {
+        "_value": "boosted value",
+        "_boost": 2.0
+    }
+}
+--------------------------------------------------
+
+The mapping is required to disambiguate the meaning of the document.
+Otherwise, the structure would interpret "message" as a value of type
+"object". The key `_value` (or `value`) in the inner document specifies
+the real string content that should eventually be indexed. The `_boost`
+(or `boost`) key specifies the per-field document boost (here 2.0).
+
+[float]
+[[norms]]
+===== Norms
+
+Norms store various normalization factors that are later used (at query time)
+in order to compute the score of a document relative to a query.
+
+Although useful for scoring, norms also require quite a lot of memory
+(typically in the order of one byte per document per field in your index,
+even for documents that don't have this specific field). As a consequence, if
+you don't need scoring on a specific field, it is highly recommended to disable
+norms on it. In particular, this is the case for fields that are used solely
+for filtering or aggregations.
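+
+For example, if you know up front that a field will be used only for
+filtering, a sketch of disabling norms at index creation time (the
+`my_index` index, `my_type` type, and `status` field names are purely
+illustrative):
+
+[source,js]
+------------
+# illustrative names; norms are disabled from the start for this field
+PUT my_index
+{
+  "mappings": {
+    "my_type": {
+      "properties": {
+        "status": {
+          "type": "string",
+          "norms": {
+            "enabled": false
+          }
+        }
+      }
+    }
+  }
+}
+------------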
+
+In case you would like to disable norms after the fact, it is possible to do
+so by using the <<indices-put-mapping,PUT mapping API>>, like this:
+
+[source,js]
+------------
+PUT my_index/_mapping/my_type
+{
+  "properties": {
+    "title": {
+      "type": "string",
+      "norms": {
+        "enabled": false
+      }
+    }
+  }
+}
+------------
+
+Please note, however, that norms won't be removed instantly, but will be
+removed as old segments are merged into new segments as you continue indexing
+new documents. Any score computation on a field that has had norms removed
+might return inconsistent results since some documents won't have norms
+anymore while other documents might still have norms.
+
+[float]
+[[number]]
+==== Number
+
+A number based type supporting `float`, `double`, `byte`, `short`,
+`integer`, and `long`. It uses specific constructs within Lucene in
+order to support numeric values. The number types have the same ranges
+as the corresponding
+http://docs.oracle.com/javase/tutorial/java/nutsandbolts/datatypes.html[Java
+types]. An example mapping can be:
+
+[source,js]
+--------------------------------------------------
+{
+    "tweet" : {
+        "properties" : {
+            "rank" : {
+                "type" : "float",
+                "null_value" : 1.0
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+The following table lists all the attributes that can be used with a
+number type:
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Attribute |Description
+|`type` |The type of the number. Can be `float`, `double`, `integer`,
+`long`, `short`, `byte`. Required.
+
+|`index_name` |The name of the field that will be stored in the index.
+Defaults to the property/field name.
+
+|`store` |Set to `true` to store the actual field in the index, `false` to not
+store it. Defaults to `false` (note, the JSON document itself is stored,
+and it can be retrieved from it).
+
+|`index` |Set to `no` if the value should not be indexed. Setting to
+`no` disables `include_in_all`. If set to `no` the field should be either stored
+in `_source`, have `include_in_all` enabled, or `store` be set to
+`true` for this to be useful.
+
+|`doc_values` |Set to `true` to store field values in a column-stride fashion.
+Automatically set to `true` when the fielddata format is `doc_values`.
+
+|`precision_step` |The precision step (influences the number of terms
+generated for each number value). Defaults to `16` for `long`, `double`,
+`8` for `short`, `integer`, `float`, and `2147483647` for `byte`.
+
+|`boost` |The boost value. Defaults to `1.0`.
+
+|`null_value` |When there is a (JSON) null value for the field, use the
+`null_value` as the field value. Defaults to not adding the field at
+all.
+
+|`include_in_all` |Should the field be included in the `_all` field (if
+enabled). If `index` is set to `no` this defaults to `false`, otherwise,
+defaults to `true` or to the parent `object` type setting.
+
+|`ignore_malformed` |Ignore malformed numbers. Defaults to `false`.
+
+|`coerce` |Try to convert strings to numbers and truncate fractions for
+integers. Defaults to `true`.
+
+|=======================================================================
+
+[float]
+[[token_count]]
+==== Token Count
+The `token_count` type maps to the JSON string type but indexes and stores
+the number of tokens in the string rather than the string itself.
+For example:
+
+[source,js]
+--------------------------------------------------
+{
+    "tweet" : {
+        "properties" : {
+            "name" : {
+                "type" : "string",
+                "fields" : {
+                    "word_count": {
+                        "type" : "token_count",
+                        "store" : "yes",
+                        "analyzer" : "standard"
+                    }
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+All the configuration that can be specified for a number can be specified
+for a `token_count`. The only extra configuration is the required
+`analyzer` field, which specifies which analyzer to use to break the string
+into tokens. For best performance, use an analyzer with no token filters.
+
+[NOTE]
+===================================================================
+Technically the `token_count` type sums position increments rather than
+counting tokens. This means that even if the analyzer filters out stop
+words they are included in the count.
+===================================================================
+
+[float]
+[[date]]
+==== Date
+
+The date type is a special type which maps to the JSON string type. It
+follows a specific format that can be explicitly set. All dates are
+`UTC`. Internally, a date maps to a number type `long`, with the added
+parsing stage from string to long and from long to string. An example
+mapping:
+
+[source,js]
+--------------------------------------------------
+{
+    "tweet" : {
+        "properties" : {
+            "postDate" : {
+                "type" : "date",
+                "format" : "YYYY-MM-dd"
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+The date type will also accept a long number representing UTC
+milliseconds since the epoch, regardless of the format it can handle.
+
+The following table lists all the attributes that can be used with a
+date type:
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Attribute |Description
+|`index_name` |The name of the field that will be stored in the index.
+Defaults to the property/field name.
+
+|`format` |The <<mapping-date-format,date format>>. Defaults to
+`epoch_millis||strictDateOptionalTime`.
+
+|`store` |Set to `true` to store the actual field in the index, `false` to not
+store it. Defaults to `false` (note, the JSON document itself is stored,
+and it can be retrieved from it).
+
+|`index` |Set to `no` if the value should not be indexed. Setting to
+`no` disables `include_in_all`. If set to `no` the field should be either stored
+in `_source`, have `include_in_all` enabled, or `store` be set to
+`true` for this to be useful.
+
+|`doc_values` |Set to `true` to store field values in a column-stride fashion.
+Automatically set to `true` when the fielddata format is `doc_values`.
+
+|`precision_step` |The precision step (influences the number of terms
+generated for each number value). Defaults to `16`.
+
+|`boost` |The boost value. Defaults to `1.0`.
+
+|`null_value` |When there is a (JSON) null value for the field, use the
+`null_value` as the field value. Defaults to not adding the field at
+all.
+
+|`include_in_all` |Should the field be included in the `_all` field (if
+enabled). If `index` is set to `no` this defaults to `false`, otherwise,
+defaults to `true` or to the parent `object` type setting.
+
+|`ignore_malformed` |Ignore malformed dates. Defaults to `false`.
+
+|=======================================================================
+
+[float]
+[[boolean]]
+==== Boolean
+
+The boolean type maps to the JSON boolean type. It ends up storing
+within the index either `T` or `F`, with automatic translation to `true`
+and `false` respectively.
+
+[source,js]
+--------------------------------------------------
+{
+    "tweet" : {
+        "properties" : {
+            "hes_my_special_tweet" : {
+                "type" : "boolean"
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+The boolean type also supports passing the value as a number or a string
+(in this case `0`, an empty string, `false`, `off` and `no` are
+`false`, all other values are `true`).
+
+The following table lists all the attributes that can be used with the
+boolean type:
+
+[cols="<,<",options="header",]
+|=======================================================================
+|Attribute |Description
+|`index_name` |The name of the field that will be stored in the index.
+Defaults to the property/field name.
+
+|`store` |Set to `true` to store the actual field in the index, `false` to not
+store it. Defaults to `false` (note, the JSON document itself is stored,
+and it can be retrieved from it).
+
+|`index` |Set to `no` if the value should not be indexed. Setting to
+`no` disables `include_in_all`. If set to `no` the field should be either stored
+in `_source`, have `include_in_all` enabled, or `store` be set to
+`true` for this to be useful.
+
+|`doc_values` |Set to `true` to store field values in a column-stride fashion.
+Automatically set to `true` when the fielddata format is `doc_values`.
+
+|`boost` |The boost value. Defaults to `1.0`.
+
+|`null_value` |When there is a (JSON) null value for the field, use the
+`null_value` as the field value. Defaults to not adding the field at
+all.
+|=======================================================================
+
+[float]
+[[binary]]
+==== Binary
+
+The binary type is a base64 representation of binary data that can be
+stored in the index. The field is not stored by default and not indexed at
+all.
+
+[source,js]
+--------------------------------------------------
+{
+    "tweet" : {
+        "properties" : {
+            "image" : {
+                "type" : "binary"
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+The following table lists all the attributes that can be used with the
+binary type:
+
+[horizontal]
+
+`index_name`::
+
+    The name of the field that will be stored in the index. Defaults to the
+    property/field name.
+
+`store`::
+
+    Set to `true` to store the actual field in the index, `false` to not store
+    it. Defaults to `false` (note, the JSON document itself is already stored,
+    so the binary field can be retrieved from there).
+
+`doc_values`::
+
+    Set to `true` to store field values in a column-stride fashion.
+
+[float]
+[[fielddata-filters]]
+==== Fielddata filters
+
+It is possible to control which field values are loaded into memory,
+which is particularly useful for aggregations on string fields, using
+fielddata filters, which are explained in detail in the
+<<index-modules-fielddata,fielddata>> section.
+
+Fielddata filters can exclude terms which do not match a regex, or which
+don't fall between a `min` and `max` frequency range:
+
+[source,js]
+--------------------------------------------------
+{
+    tweet: {
+        type:      "string",
+        analyzer:  "whitespace",
+        fielddata: {
+            filter: {
+                regex: {
+                    "pattern": "^#.*"
+                },
+                frequency: {
+                    min:              0.001,
+                    max:              0.1,
+                    min_segment_size: 500
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+These filters can be updated on an existing field mapping and will take
+effect the next time the fielddata for a segment is loaded. Use the
+<<indices-clearcache,Clear Cache>> API
+to reload the fielddata using the new filters.
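+
+For instance, after updating a filter you might force a reload by clearing
+the fielddata cache; a sketch, assuming the Clear Cache API's `fielddata`
+parameter (the index name is illustrative):
+
+[source,js]
+--------------------------------------------------
+# illustrative index name; clears only the fielddata cache
+POST /my_index/_cache/clear?fielddata=true
+--------------------------------------------------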
+
+[float]
+==== Similarity
+
+Elasticsearch allows you to configure a similarity (scoring algorithm) per
+field. The `similarity` setting provides a simple way of choosing a similarity
+algorithm other than the default TF/IDF, such as `BM25`.
+
+You can configure similarities via the
+<<index-modules-similarity,similarity module>>.
+
+[float]
+===== Configuring Similarity per Field
+
+Defining the Similarity for a field is done via the `similarity` mapping
+property, as this example shows:
+
+[source,js]
+--------------------------------------------------
+{
+    "book" : {
+        "properties" : {
+            "title" : {
+                "type" : "string", "similarity" : "BM25"
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+The following Similarities are configured out-of-box:
+
+`default`::
+    The Default TF/IDF algorithm used by Elasticsearch and
+    Lucene in previous versions.
+
+`BM25`::
+    The BM25 algorithm.
+    http://en.wikipedia.org/wiki/Okapi_BM25[See Okapi_BM25] for more
+    details.
+
+
+[[copy-to]]
+[float]
+===== Copy to field
+
+Adding the `copy_to` parameter to any field mapping will cause all values of
+this field to be copied to the fields specified in the parameter. In the
+following example all values from the fields `title` and `abstract` will be
+copied to the field `meta_data`. The field which is being copied to will be
+indexed (i.e. searchable, and available through `fielddata_field`) but the
+original source will not be modified.
+
+[source,js]
+--------------------------------------------------
+{
+  "book" : {
+    "properties" : {
+      "title" : { "type" : "string", "copy_to" : "meta_data" },
+      "abstract" : { "type" : "string", "copy_to" : "meta_data" },
+      "meta_data" : { "type" : "string" }
+    }
+  }
+}
+--------------------------------------------------
+
+Multiple fields are also supported:
+
+[source,js]
+--------------------------------------------------
+{
+  "book" : {
+    "properties" : {
+      "title" : { "type" : "string", "copy_to" : ["meta_data", "article_info"] }
+    }
+  }
+}
+--------------------------------------------------
+
+[float]
+[[multi-fields]]
+===== Multi fields
+
+The `fields` option allows mapping several core type fields to a single
+JSON source field. This can be useful if a single field needs to be
+used in different ways, for example when a single field is to be used for
+both free text search and sorting.
+
+[source,js]
+--------------------------------------------------
+{
+    "tweet" : {
+        "properties" : {
+            "name" : {
+                "type" : "string",
+                "index" : "analyzed",
+                "fields" : {
+                    "raw" : {"type" : "string", "index" : "not_analyzed"}
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+In the above example the field `name` gets processed twice. The first time it
+gets processed as an analyzed string, and this version is accessible under the
+field name `name`; this is the main field and is in fact just like any other
+field. The second time it gets processed as a not-analyzed string and is
+accessible under the name `name.raw`.
+
+[float]
+==== Include in All
+
+The `include_in_all` setting is ignored on any field that is defined in
+the `fields` options. Setting `include_in_all` only makes sense on
+the main field, since it is the raw field value that is copied to the `_all`
+field, not the tokens.
+
+[float]
+==== Updating a field
+
+In essence a field cannot be updated. However, multi fields can be
+added to existing fields. This allows, for example, having a different
+`analyzer` configuration in addition to the `analyzer`
+configuration already specified in the main and other multi fields.
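+
+For example, a sketch of adding a not-analyzed `raw` multi field to an
+existing `name` field with the PUT mapping API (the index, type, and field
+names are illustrative):
+
+[source,js]
+--------------------------------------------------
+# illustrative names; merges a new multi field into the existing field
+PUT my_index/_mapping/my_type
+{
+  "properties": {
+    "name": {
+      "type": "string",
+      "fields": {
+        "raw": {"type": "string", "index": "not_analyzed"}
+      }
+    }
+  }
+}
+--------------------------------------------------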
+
+Also, the new multi field will only be applied to documents that have been
+added after the multi field was added; the new multi field doesn't exist in
+documents indexed before it was added.
+
+Another important note is that new multi fields will be merged into the
+list of existing multi fields, so when adding new multi fields for a field,
+previously added multi fields don't need to be specified.
diff --git a/docs/reference/migration/migrate_2_0.asciidoc b/docs/reference/migration/migrate_2_0.asciidoc
index bc664c2920b4c..e8209f1a74913 100644
--- a/docs/reference/migration/migrate_2_0.asciidoc
+++ b/docs/reference/migration/migrate_2_0.asciidoc
@@ -55,4 +55,4 @@ include::migrate_2_0/settings.asciidoc[]
 
 include::migrate_2_0/stats.asciidoc[]
 
-include::migrate_2_0/java.asciidoc[]
\ No newline at end of file
+include::migrate_2_0/java.asciidoc[]
diff --git a/docs/reference/migration/migrate_2_1.asciidoc b/docs/reference/migration/migrate_2_1.asciidoc
index 7542fb3d1dfdb..9c21b4d2be3b9 100644
--- a/docs/reference/migration/migrate_2_1.asciidoc
+++ b/docs/reference/migration/migrate_2_1.asciidoc
@@ -25,3 +25,13 @@ GET /my_index/_search?scroll=2m
 Scroll requests sorted by `_doc` have been optimized to more efficiently resume
 from where the previous request stopped, so this will have the same performance
 characteristics as the former `scan` search type.
+
+=== Mapping changes
+
+==== position_offset_gap
+
+The default `position_offset_gap` is now 100. Indexes created in Elasticsearch
+2.1.0 or later will default to 100, and indexes created before that will
+continue to use the old default of 0. This was done to prevent phrase queries
+from matching across different values of the same field unexpectedly.
+Specifically, 100 was chosen to cause phrase queries with slops up to 99 to
+match only within a single value of a field.
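+
+If you need the old behavior for a particular field, you can set the gap back
+explicitly in the mapping; a sketch (the index, type, and field names are
+illustrative):
+
+[source,js]
+--------------------------------------------------
+# illustrative names; restores the pre-2.1 behavior for one field
+PUT my_index
+{
+  "mappings": {
+    "my_type": {
+      "properties": {
+        "message": {
+          "type": "string",
+          "position_offset_gap": 0
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------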