Skip to content

Commit

Permalink
Allow doc-values only search on boolean fields (#82925)
Browse files Browse the repository at this point in the history
Allows searching on boolean fields that are not indexed (`index: false`) but have doc values enabled.

This enables searches on archive data, which has access to doc values but not index structures. When combined with
searchable snapshots, it allows downloading only data for a given (doc value) field to quickly filter down to a select set
of documents.

Relates #81210 and #52728
  • Loading branch information
ywelsch committed Jan 24, 2022
1 parent 717cb68 commit 0592c4c
Show file tree
Hide file tree
Showing 10 changed files with 218 additions and 72 deletions.
3 changes: 2 additions & 1 deletion docs/reference/mapping/params/doc-values.asciidoc
Expand Up @@ -17,7 +17,8 @@ makes this data access pattern possible. They store the same values as the
sorting and aggregations. Doc values are supported on almost all field types,
with the __notable exception of `text` and `annotated_text` fields__.

<<number,Numeric types>>, <<date,date types>>, and the <<keyword, keyword type>>
<<number,Numeric types>>, <<date,date types>>, the <<boolean,boolean type>>
and the <<keyword,keyword type>>
can also be queried using term or range-based queries
when they are not <<mapping-index,indexed>> but only have doc values enabled.
Query performance on doc values is much slower than on index structures, but
Expand Down
5 changes: 4 additions & 1 deletion docs/reference/mapping/types/boolean.asciidoc
Expand Up @@ -174,7 +174,10 @@ The following parameters are accepted by `boolean` fields:

<<mapping-index,`index`>>::

Should the field be searchable? Accepts `true` (default) and `false`.
Should the field be quickly searchable? Accepts `true` (default) and
`false`. Fields that only have <<doc-values,`doc_values`>>
enabled can still be queried using term or range-based queries,
albeit slower.

<<null-value,`null_value`>>::

Expand Down
2 changes: 1 addition & 1 deletion docs/reference/query-dsl.asciidoc
Expand Up @@ -33,7 +33,7 @@ the stability of the cluster. Those queries can be categorised as follows:

* Queries that need to do linear scans to identify matches:
** <<query-dsl-script-query,`script` queries>>
** queries on <<number,numeric>>, <<date,date>>, or <<keyword,keyword>> fields that are not indexed
** queries on <<number,numeric>>, <<date,date>>, <<boolean,boolean>>, or <<keyword,keyword>> fields that are not indexed
but have <<doc-values,doc values>> enabled

* Queries that have a high up-front cost:
Expand Down
Expand Up @@ -89,6 +89,9 @@ setup:
non_indexed_keyword:
type: keyword
index: false
non_indexed_boolean:
type: boolean
index: false
geo:
type: keyword
object:
Expand Down Expand Up @@ -240,6 +243,18 @@ setup:

- match: {fields.non_indexed_keyword.keyword.searchable: true}

---
"Field caps for boolean field with only doc values":
- skip:
version: " - 8.0.99"
reason: "doc values search was added in 8.1.0"
- do:
field_caps:
index: 'test1,test2,test3'
fields: non_indexed_boolean

- match: {fields.non_indexed_boolean.boolean.searchable: true}

---
"Get object and nested field caps":

Expand Down
Expand Up @@ -39,6 +39,9 @@ setup:
keyword:
type: keyword
index: false
boolean:
type: boolean
index: false

- do:
index:
Expand All @@ -54,6 +57,7 @@ setup:
short: 1
date: "2017/01/01"
keyword: "key1"
boolean: "false"

- do:
index:
Expand All @@ -69,6 +73,7 @@ setup:
short: 2
date: "2017/01/02"
keyword: "key2"
boolean: "true"

- do:
indices.refresh: {}
Expand Down Expand Up @@ -252,3 +257,30 @@ setup:
index: test
body: { query: { range: { keyword: { gte: "key1" } } } }
- length: { hits.hits: 2 }

---
"Test match query on boolean field where only doc values are enabled":

- do:
search:
index: test
body: { query: { match: { boolean: { query: "false" } } } }
- length: { hits.hits: 1 }

---
"Test terms query on boolean field where only doc values are enabled":

- do:
search:
index: test
body: { query: { terms: { boolean: [ "false", "true" ] } } }
- length: { hits.hits: 2 }

---
"Test range query on boolean field where only doc values are enabled":

- do:
search:
index: test
body: { query: { range: { boolean: { gte: "false" } } } }
- length: { hits.hits: 2 }
Expand Up @@ -14,6 +14,10 @@
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.BytesRef;
Expand All @@ -37,6 +41,7 @@

import java.io.IOException;
import java.time.ZoneId;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -155,18 +160,27 @@ public BooleanFieldType(
}

public BooleanFieldType(String name) {
this(name, true, false, true, false, null, Collections.emptyMap());
this(name, true);
}

public BooleanFieldType(String name, boolean searchable) {
this(name, searchable, false, true, false, null, Collections.emptyMap());
public BooleanFieldType(String name, boolean isIndexed) {
this(name, isIndexed, true);
}

public BooleanFieldType(String name, boolean isIndexed, boolean hasDocValues) {
this(name, isIndexed, isIndexed, hasDocValues, false, null, Collections.emptyMap());
}

// Reports this field type's name, which is the CONTENT_TYPE constant.
@Override
public String typeName() {
return CONTENT_TYPE;
}

// Searchable when the field is indexed, or otherwise when it has doc values.
@Override
public boolean isSearchable() {
    if (isIndexed()) {
        return true;
    }
    return hasDocValues();
}

@Override
public ValueFetcher valueFetcher(SearchExecutionContext context, String format) {
if (format != null) {
Expand Down Expand Up @@ -209,6 +223,15 @@ public BytesRef indexedValueForSearch(Object value) {
};
}

// Maps a search value to its numeric doc-value form: 1 when
// indexedValueForSearch(value) equals Values.TRUE, and 0 for anything else.
private long docValueForSearch(Object value) {
    final boolean matchesTrue = Values.TRUE.equals(indexedValueForSearch(value));
    return matchesTrue ? 1L : 0L;
}

@Override
public Boolean valueForDisplay(Object value) {
if (value == null) {
Expand All @@ -234,6 +257,30 @@ public DocValueFormat docValueFormat(@Nullable String format, ZoneId timeZone) {
return DocValueFormat.BOOLEAN;
}

@Override
public Query termQuery(Object value, SearchExecutionContext context) {
    failIfNotIndexedNorDocValuesFallback(context);
    if (isIndexed() == false) {
        // Not indexed: fall back to an exact match against the numeric doc values.
        return SortedNumericDocValuesField.newSlowExactQuery(name(), docValueForSearch(value));
    }
    // Indexed: the inherited term query applies.
    return super.termQuery(value, context);
}

@Override
public Query termsQuery(Collection<?> values, SearchExecutionContext context) {
    failIfNotIndexedNorDocValuesFallback(context);
    if (isIndexed() == false) {
        // Not indexed: OR together one doc-values term query per requested value
        // and wrap the disjunction in a constant-score query.
        final BooleanQuery.Builder disjunction = new BooleanQuery.Builder();
        values.forEach(candidate -> disjunction.add(termQuery(candidate, context), BooleanClause.Occur.SHOULD));
        return new ConstantScoreQuery(disjunction.build());
    }
    // Indexed: the inherited terms query applies.
    return super.termsQuery(values, context);
}

@Override
public Query rangeQuery(
Object lowerTerm,
Expand All @@ -242,14 +289,35 @@ public Query rangeQuery(
boolean includeUpper,
SearchExecutionContext context
) {
failIfNotIndexed();
return new TermRangeQuery(
name(),
lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
upperTerm == null ? null : indexedValueForSearch(upperTerm),
includeLower,
includeUpper
);
failIfNotIndexedNorDocValuesFallback(context);
if (isIndexed()) {
    // Indexed: run the range over the indexed terms.
    return new TermRangeQuery(
        name(),
        lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
        upperTerm == null ? null : indexedValueForSearch(upperTerm),
        includeLower,
        includeUpper
    );
} else {
    // Doc-values fallback. docValueForSearch maps values to 0 or 1, so the
    // widest possible range is [0, 1]; a missing bound keeps that default.
    long l = 0;
    long u = 1;
    if (lowerTerm != null) {
        l = docValueForSearch(lowerTerm);
        if (includeLower == false) {
            // Exclusive lower bound: move to the next value up.
            l = Math.max(1, l + 1);
        }
    }
    if (upperTerm != null) {
        u = docValueForSearch(upperTerm);
        if (includeUpper == false) {
            // Exclusive upper bound: move the UPPER bound down. Adjusting the
            // lower bound here would widen the range instead of narrowing it.
            u = Math.min(0, u - 1);
        }
    }
    if (l > u) {
        // Empty range, e.g. "less than false" or "greater than true".
        return new MatchNoDocsQuery();
    }
    return SortedNumericDocValuesField.newSlowRangeQuery(name(), l, u);
}
}

Expand Down
Expand Up @@ -20,6 +20,7 @@
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.sandbox.search.DocValuesTermsQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
Expand Down Expand Up @@ -50,6 +51,7 @@

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
Expand Down Expand Up @@ -339,13 +341,51 @@ public KeywordFieldType(String name, NamedAnalyzer analyzer) {
}

@Override
protected boolean allowDocValueBasedQueries() {
return true;
public boolean isSearchable() {
return isIndexed() || hasDocValues();
}

@Override
public boolean isSearchable() {
return isIndexed() || hasDocValues();
public Query termQuery(Object value, SearchExecutionContext context) {
    failIfNotIndexedNorDocValuesFallback(context);
    if (isIndexed() == false) {
        // Not indexed: fall back to an exact match against the sorted-set doc values.
        return SortedSetDocValuesField.newSlowExactQuery(name(), indexedValueForSearch(value));
    }
    // Indexed: the inherited term query applies.
    return super.termQuery(value, context);
}

@Override
public Query termsQuery(Collection<?> values, SearchExecutionContext context) {
    failIfNotIndexedNorDocValuesFallback(context);
    if (isIndexed() == false) {
        // Not indexed: convert each value to its search form and match on doc values.
        final BytesRef[] terms = new BytesRef[values.size()];
        int next = 0;
        for (Object value : values) {
            terms[next++] = indexedValueForSearch(value);
        }
        return new DocValuesTermsQuery(name(), terms);
    }
    // Indexed: the inherited terms query applies.
    return super.termsQuery(values, context);
}

@Override
public Query rangeQuery(
    Object lowerTerm,
    Object upperTerm,
    boolean includeLower,
    boolean includeUpper,
    SearchExecutionContext context
) {
    failIfNotIndexedNorDocValuesFallback(context);
    if (isIndexed()) {
        // Indexed: the inherited range query applies.
        return super.rangeQuery(lowerTerm, upperTerm, includeLower, includeUpper, context);
    }
    // Not indexed: run the range over the sorted-set doc values; a null bound stays open.
    final BytesRef lowerBound = lowerTerm == null ? null : indexedValueForSearch(lowerTerm);
    final BytesRef upperBound = upperTerm == null ? null : indexedValueForSearch(upperTerm);
    return SortedSetDocValuesField.newSlowRangeQuery(name(), lowerBound, upperBound, includeLower, includeUpper);
}

@Override
Expand Down
Expand Up @@ -9,7 +9,6 @@
package org.elasticsearch.index.mapper;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.FuzzyQuery;
Expand Down Expand Up @@ -211,27 +210,13 @@ public Query rangeQuery(
+ "' is set to false."
);
}
if (allowDocValueBasedQueries()) {
failIfNotIndexedNorDocValuesFallback(context);
} else {
failIfNotIndexed();
}
if (isIndexed()) {
return new TermRangeQuery(
name(),
lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
upperTerm == null ? null : indexedValueForSearch(upperTerm),
includeLower,
includeUpper
);
} else {
return SortedSetDocValuesField.newSlowRangeQuery(
name(),
lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
upperTerm == null ? null : indexedValueForSearch(upperTerm),
includeLower,
includeUpper
);
}
failIfNotIndexed();
return new TermRangeQuery(
name(),
lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
upperTerm == null ? null : indexedValueForSearch(upperTerm),
includeLower,
includeUpper
);
}
}

0 comments on commit 0592c4c

Please sign in to comment.