Skip to content

Commit

Permalink
Allow mixing set-based and regexp-based include and exclude (#63325)
Browse files Browse the repository at this point in the history
* Allow mixing set-based and regexp-based include and exclude

* Coding style

* Disallow having both set and regexp include (resp. exclude)

* Test correctness of every combination of include/exclude
  • Loading branch information
hchargois committed Oct 21, 2020
1 parent f2bcc77 commit ff736f0
Show file tree
Hide file tree
Showing 5 changed files with 301 additions and 197 deletions.
2 changes: 2 additions & 0 deletions docs/reference/aggregations/bucket/terms-aggregation.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -599,6 +599,8 @@ expire then we may be missing accounts of interest and have set our numbers too
Ultimately this is a balancing act between managing the Elasticsearch resources required to process a single request and the volume
of requests that the client application must issue to complete a task.

WARNING: Partitions cannot be used together with an `exclude` parameter.

==== Multi-field terms aggregation

The `terms` aggregation does not support collecting terms from multiple fields
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.Version;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
Expand Down Expand Up @@ -78,17 +79,8 @@ public static IncludeExclude merge(IncludeExclude include, IncludeExclude exclud
if (include.isPartitionBased()) {
throw new IllegalArgumentException("Cannot specify any excludes when using a partition-based include");
}
String includeMethod = include.isRegexBased() ? "regex" : "set";
String excludeMethod = exclude.isRegexBased() ? "regex" : "set";
if (includeMethod.equals(excludeMethod) == false) {
throw new IllegalArgumentException("Cannot mix a " + includeMethod + "-based include with a "
+ excludeMethod + "-based method");
}
if (include.isRegexBased()) {
return new IncludeExclude(include.include, exclude.exclude);
} else {
return new IncludeExclude(include.includeValues, exclude.excludeValues);
}

return new IncludeExclude(include.include, exclude.exclude, include.includeValues, exclude.excludeValues);
}

public static IncludeExclude parseInclude(XContentParser parser) throws IOException {
Expand Down Expand Up @@ -196,46 +188,39 @@ public boolean accept(BytesRef value) {
}
}

static class AutomatonBackedStringFilter extends StringFilter {
class SetAndRegexStringFilter extends StringFilter {

private final ByteRunAutomaton runAutomaton;

private AutomatonBackedStringFilter(Automaton automaton) {
this.runAutomaton = new ByteRunAutomaton(automaton);
}

/**
* Returns whether the given value is accepted based on the {@code include} & {@code exclude} patterns.
*/
@Override
public boolean accept(BytesRef value) {
return runAutomaton.run(value.bytes, value.offset, value.length);
}
}

static class TermListBackedStringFilter extends StringFilter {

private final Set<BytesRef> valids;
private final Set<BytesRef> invalids;

TermListBackedStringFilter(Set<BytesRef> includeValues, Set<BytesRef> excludeValues) {
this.valids = includeValues;
this.invalids = excludeValues;
private SetAndRegexStringFilter(DocValueFormat format) {
Automaton automaton = toAutomaton();
this.runAutomaton = automaton == null ? null : new ByteRunAutomaton(automaton);
this.valids = parseForDocValues(includeValues, format);
this.invalids = parseForDocValues(excludeValues, format);
}

/**
* Returns whether the given value is accepted based on the
* {@code include} &amp; {@code exclude} sets.
* Returns whether the given value is accepted based on the {@code includeValues} &amp; {@code excludeValues}
* sets, as well as the {@code include} &amp; {@code exclude} patterns.
*/
@Override
public boolean accept(BytesRef value) {
return ((valids == null) || (valids.contains(value))) && ((invalids == null) || (!invalids.contains(value)));
if (valids != null && valids.contains(value) == false) {
return false;
}

if (runAutomaton != null && runAutomaton.run(value.bytes, value.offset, value.length) == false) {
return false;
}

return invalids == null || invalids.contains(value) == false;
}
}

/**
* Base type for filters that decide acceptance per global ordinal instead of per term value.
*/
public abstract static class OrdinalsFilter extends Filter {
/**
* Computes which global ordinals this filter accepts.
*
* @param globalOrdinals the global-ordinal doc values whose ordinals are evaluated
* @return a bit set in which the bit of every accepted ordinal is set
* @throws IOException propagated from the doc-values access performed by implementations
*/
public abstract LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException;

}

class PartitionedOrdinalsFilter extends OrdinalsFilter {
Expand All @@ -258,59 +243,64 @@ public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) thro
}
}

static class AutomatonBackedOrdinalsFilter extends OrdinalsFilter {
class SetAndRegexOrdinalsFilter extends OrdinalsFilter {

private final CompiledAutomaton compiled;
private final SortedSet<BytesRef> valids;
private final SortedSet<BytesRef> invalids;

private AutomatonBackedOrdinalsFilter(Automaton automaton) {
this.compiled = new CompiledAutomaton(automaton);
private SetAndRegexOrdinalsFilter(DocValueFormat format) {
Automaton automaton = toAutomaton();
this.compiled = automaton == null ? null : new CompiledAutomaton(automaton);
this.valids = parseForDocValues(includeValues, format);
this.invalids = parseForDocValues(excludeValues, format);
}

/**
* Computes which global ordinals are accepted by this IncludeExclude instance.
*
* Computes which global ordinals are accepted by this IncludeExclude instance, based on the combination of
* the {@code includeValues} &amp; {@code excludeValues} sets, as well as the {@code include} &amp;
* {@code exclude} patterns.
*/
@Override
public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException {
LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
TermsEnum globalTermsEnum;
Terms globalTerms = new DocValuesTerms(globalOrdinals);
// TODO: specialize based on compiled.type: for ALL and prefixes (sinkState >= 0 ) we can avoid i/o and just set bits.
globalTermsEnum = compiled.getTermsEnum(globalTerms);
for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
acceptedGlobalOrdinals.set(globalTermsEnum.ord());
}
return acceptedGlobalOrdinals;
}

}

static class TermListBackedOrdinalsFilter extends OrdinalsFilter {

private final SortedSet<BytesRef> includeValues;
private final SortedSet<BytesRef> excludeValues;

TermListBackedOrdinalsFilter(SortedSet<BytesRef> includeValues, SortedSet<BytesRef> excludeValues) {
this.includeValues = includeValues;
this.excludeValues = excludeValues;
}

@Override
public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException {
LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
if (includeValues != null) {
for (BytesRef term : includeValues) {
LongBitSet acceptedGlobalOrdinals = null;
if (valids != null) {
acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
for (BytesRef term : valids) {
long ord = globalOrdinals.lookupTerm(term);
if (ord >= 0) {
acceptedGlobalOrdinals.set(ord);
}
}
} else if (acceptedGlobalOrdinals.length() > 0) {
// default to all terms being acceptable
acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length());
}
if (excludeValues != null) {
for (BytesRef term : excludeValues) {

if (compiled != null) {
LongBitSet automatonGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
TermsEnum globalTermsEnum;
Terms globalTerms = new DocValuesTerms(globalOrdinals);
// TODO: specialize based on compiled.type: for ALL and prefixes (sinkState >= 0 ) we can avoid i/o and just set bits.
globalTermsEnum = compiled.getTermsEnum(globalTerms);
for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
automatonGlobalOrdinals.set(globalTermsEnum.ord());
}

if (acceptedGlobalOrdinals == null) {
acceptedGlobalOrdinals = automatonGlobalOrdinals;
} else {
acceptedGlobalOrdinals.and(automatonGlobalOrdinals);
}
}

if (acceptedGlobalOrdinals == null) {
acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
if (acceptedGlobalOrdinals.length() > 0) {
// default to all terms being acceptable
acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length());
}
}

if (invalids != null) {
for (BytesRef term : invalids) {
long ord = globalOrdinals.lookupTerm(term);
if (ord >= 0) {
acceptedGlobalOrdinals.clear(ord);
Expand All @@ -319,9 +309,9 @@ public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) thro
}
return acceptedGlobalOrdinals;
}

}


private final RegExp include, exclude;
private final SortedSet<BytesRef> includeValues, excludeValues;
private final int incZeroBasedPartition;
Expand All @@ -332,17 +322,36 @@ public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) thro
* @param exclude The regular expression pattern for the terms to be excluded
*/
public IncludeExclude(RegExp include, RegExp exclude) {
if (include == null && exclude == null) {
this(include, exclude, null, null);
}

public IncludeExclude(RegExp include, RegExp exclude, SortedSet<BytesRef> includeValues, SortedSet<BytesRef> excludeValues) {
if (include == null && exclude == null && includeValues == null && excludeValues == null) {
throw new IllegalArgumentException();
}
if (include != null && includeValues != null) {
throw new IllegalArgumentException();
}
if (exclude != null && excludeValues != null) {
throw new IllegalArgumentException();
}
this.include = include;
this.exclude = exclude;
this.includeValues = null;
this.excludeValues = null;
this.includeValues = includeValues;
this.excludeValues = excludeValues;
this.incZeroBasedPartition = 0;
this.incNumPartitions = 0;
}

/**
* Creates an instance from pattern strings and term arrays, allowing patterns and term sets to be combined.
* Each non-null pattern string is compiled into a {@link RegExp}; a {@code null} pattern is forwarded as
* {@code null}. The term arrays are passed through {@code convertToBytesRefSet} before delegating.
*
* NOTE(review): the constructor this delegates to appears to reject the all-null case as well as a pattern and a
* term set being given for the same side (include or exclude) — confirm against the committed file.
*
* @param include The regular expression pattern for the terms to be included, or {@code null}
* @param exclude The regular expression pattern for the terms to be excluded, or {@code null}
* @param includeValues The terms to be included
* @param excludeValues The terms to be excluded
*/
public IncludeExclude(String include, String exclude, String[] includeValues, String[] excludeValues) {
this(
include == null ? null : new RegExp(include),
exclude == null ? null : new RegExp(exclude),
convertToBytesRefSet(includeValues),
convertToBytesRefSet(excludeValues)
);
}

/**
* Creates a pattern-only instance from pattern strings.
* A {@code null} argument is forwarded as {@code null}; any other value is compiled into a {@link RegExp}.
*
* @param include The regular expression pattern for the terms to be included, or {@code null}
* @param exclude The regular expression pattern for the terms to be excluded, or {@code null}
*/
public IncludeExclude(String include, String exclude) {
this(include == null ? null : new RegExp(include), exclude == null ? null : new RegExp(exclude));
}
Expand All @@ -352,15 +361,7 @@ public IncludeExclude(String include, String exclude) {
* @param excludeValues The terms to be excluded
*/
public IncludeExclude(SortedSet<BytesRef> includeValues, SortedSet<BytesRef> excludeValues) {
if (includeValues == null && excludeValues == null) {
throw new IllegalArgumentException();
}
this.include = null;
this.exclude = null;
this.incZeroBasedPartition = 0;
this.incNumPartitions = 0;
this.includeValues = includeValues;
this.excludeValues = excludeValues;
this(null, null, includeValues, excludeValues);
}

public IncludeExclude(String[] includeValues, String[] excludeValues) {
Expand Down Expand Up @@ -395,18 +396,21 @@ public IncludeExclude(int partition, int numPartitions) {
*/
public IncludeExclude(StreamInput in) throws IOException {
if (in.readBoolean()) {
includeValues = null;
excludeValues = null;
incZeroBasedPartition = 0;
incNumPartitions = 0;
String includeString = in.readOptionalString();
include = includeString == null ? null : new RegExp(includeString);
String excludeString = in.readOptionalString();
exclude = excludeString == null ? null : new RegExp(excludeString);
return;
if (in.getVersion().before(Version.V_8_0_0)) {
incZeroBasedPartition = 0;
incNumPartitions = 0;
includeValues = null;
excludeValues = null;
return;
}
} else {
include = null;
exclude = null;
}
include = null;
exclude = null;
if (in.readBoolean()) {
int size = in.readVInt();
includeValues = new TreeSet<>();
Expand Down Expand Up @@ -436,26 +440,28 @@ public void writeTo(StreamOutput out) throws IOException {
if (regexBased) {
out.writeOptionalString(include == null ? null : include.getOriginalString());
out.writeOptionalString(exclude == null ? null : exclude.getOriginalString());
} else {
boolean hasIncludes = includeValues != null;
out.writeBoolean(hasIncludes);
if (hasIncludes) {
out.writeVInt(includeValues.size());
for (BytesRef value : includeValues) {
out.writeBytesRef(value);
}
if (out.getVersion().before(Version.V_8_0_0)) {
return;
}
boolean hasExcludes = excludeValues != null;
out.writeBoolean(hasExcludes);
if (hasExcludes) {
out.writeVInt(excludeValues.size());
for (BytesRef value : excludeValues) {
out.writeBytesRef(value);
}
}
boolean hasIncludes = includeValues != null;
out.writeBoolean(hasIncludes);
if (hasIncludes) {
out.writeVInt(includeValues.size());
for (BytesRef value : includeValues) {
out.writeBytesRef(value);
}
out.writeVInt(incNumPartitions);
out.writeVInt(incZeroBasedPartition);
}
boolean hasExcludes = excludeValues != null;
out.writeBoolean(hasExcludes);
if (hasExcludes) {
out.writeVInt(excludeValues.size());
for (BytesRef value : excludeValues) {
out.writeBytesRef(value);
}
}
out.writeVInt(incNumPartitions);
out.writeVInt(incZeroBasedPartition);
}

private static SortedSet<BytesRef> convertToBytesRefSet(String[] values) {
Expand Down Expand Up @@ -573,29 +579,25 @@ public boolean isPartitionBased() {

/**
* Builds the automaton that describes the accepted terms.
*
* NOTE(review): this listing comes from a diff view without +/- markers, so the {@code includeValues} /
* {@code excludeValues} branches below may be lines that the change removed — confirm against the committed file.
*
* @return the resulting automaton, or {@code null} when neither an {@code include} nor an {@code exclude}
* pattern is set
*/
private Automaton toAutomaton() {
Automaton a = null;
// Without any pattern there is nothing to compile; callers receive null.
if (include == null && exclude == null) {
return a;
}
// Starting language: the include pattern, else the include terms, else every string.
if (include != null) {
a = include.toAutomaton();
} else if (includeValues != null) {
a = Automata.makeStringUnion(includeValues);
} else {
a = Automata.makeAnyString();
}
// Subtract whatever the exclude side matches from the starting language.
if (exclude != null) {
a = Operations.minus(a, exclude.toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
} else if (excludeValues != null) {
a = Operations.minus(a, Automata.makeStringUnion(excludeValues), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}
return a;
}

public StringFilter convertToStringFilter(DocValueFormat format) {
if (isRegexBased()) {
return new AutomatonBackedStringFilter(toAutomaton());
}
if (isPartitionBased()){
return new PartitionedStringFilter();
}
return new TermListBackedStringFilter(parseForDocValues(includeValues, format), parseForDocValues(excludeValues, format));
return new SetAndRegexStringFilter(format);
}

private static SortedSet<BytesRef> parseForDocValues(SortedSet<BytesRef> endUserFormattedValues, DocValueFormat format) {
Expand All @@ -612,15 +614,11 @@ private static SortedSet<BytesRef> parseForDocValues(SortedSet<BytesRef> endUser
}

public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format) {

if (isRegexBased()) {
return new AutomatonBackedOrdinalsFilter(toAutomaton());
}
if (isPartitionBased()){
return new PartitionedOrdinalsFilter();
}

return new TermListBackedOrdinalsFilter(parseForDocValues(includeValues, format), parseForDocValues(excludeValues, format));
return new SetAndRegexOrdinalsFilter(format);
}

public LongFilter convertToLongFilter(DocValueFormat format) {
Expand Down

0 comments on commit ff736f0

Please sign in to comment.