diff --git a/docs/changelog/127975.yaml b/docs/changelog/127975.yaml new file mode 100644 index 0000000000000..35a76a2474a34 --- /dev/null +++ b/docs/changelog/127975.yaml @@ -0,0 +1,5 @@ +pr: 127975 +summary: Fix `significant_terms` subset size for empty buckets when used as a sub-aggregation +area: Aggregations +type: bug +issues: [] diff --git a/modules/aggregations/src/yamlRestTest/resources/rest-api-spec/test/aggregations/sig_terms.yml b/modules/aggregations/src/yamlRestTest/resources/rest-api-spec/test/aggregations/sig_terms.yml index e8b0419589ed3..16f14c9601654 100644 --- a/modules/aggregations/src/yamlRestTest/resources/rest-api-spec/test/aggregations/sig_terms.yml +++ b/modules/aggregations/src/yamlRestTest/resources/rest-api-spec/test/aggregations/sig_terms.yml @@ -73,7 +73,7 @@ - match: {aggregations.class.buckets.1.sig_terms.buckets.0.key: "good"} --- -"Test background filter count ": +"Test background filter count": - requires: cluster_features: ["gte_v7.15.0"] reason: bugfix introduced in 7.15.0 @@ -153,6 +153,257 @@ index: goodbad* body: {"aggs": {"sig_terms": {"significant_terms": {"field": "text", "background_filter": {"bool": {"filter": [{"term": {"class": "good" }}]}}}}}} - match: { aggregations.sig_terms.bg_count: 2 } + +--- +"Test background filter count as sub - global ords": + - requires: + capabilities: + - method: POST + path: /_search + capabilities: [ significant_terms_background_filter_as_sub ] + test_runner_features: capabilities + reason: "bug fix" + + - do: + indices.create: + index: goodbad + body: + settings: + number_of_shards: 1 + mappings: + properties: + text: + type: keyword + class: + type: keyword + - do: + indices.create: + index: goodbad-2 + body: + settings: + number_of_shards: 1 + mappings: + properties: + text: + type: keyword + class: + type: keyword + + - do: + index: + index: goodbad-2 + id: "1" + body: { group: 1, class: "bad" } + - do: + index: + index: goodbad-2 + id: "2" + body: { group: 1, class: "bad" } + + - do: + index: + index: goodbad + id: "1" + body: { 
group: 1, text: "good", class: "good" } + - do: + index: + index: goodbad + id: "2" + body: { group: 1, text: "good", class: "good" } + - do: + index: + index: goodbad + id: "3" + body: { group: 1, text: "bad", class: "bad" } + - do: + index: + index: goodbad + id: "4" + body: { group: 2, text: "bad", class: "bad" } + + - do: + indices.refresh: + index: [goodbad, goodbad-2] + + - do: + search: + rest_total_hits_as_int: true + index: goodbad* + - match: {hits.total: 6} + + - do: + search: + index: goodbad* + body: + aggs: + group: + range: + field: group + ranges: + # Having many ranges helps catch an issue building no hits buckets + - to: 1 + - from: 1 + to: 2 + - from: 2 + to: 3 + - from: 3 + to: 4 + - from: 4 + to: 5 + - from: 5 + to: 6 + aggs: + sig_terms: + significant_terms: + execution_hint: global_ordinals + field: text + background_filter: + bool: + filter: [{term: {class: good }}] + - match: { aggregations.group.buckets.0.key: "*-1.0" } + - match: { aggregations.group.buckets.0.sig_terms.doc_count: 0 } + - match: { aggregations.group.buckets.0.sig_terms.bg_count: 2 } + - match: { aggregations.group.buckets.1.key: "1.0-2.0" } + - match: { aggregations.group.buckets.1.sig_terms.doc_count: 5 } + - match: { aggregations.group.buckets.1.sig_terms.bg_count: 2 } + - match: { aggregations.group.buckets.2.key: "2.0-3.0" } + - match: { aggregations.group.buckets.2.sig_terms.doc_count: 1 } + - match: { aggregations.group.buckets.2.sig_terms.bg_count: 2 } + - match: { aggregations.group.buckets.3.key: "3.0-4.0" } + - match: { aggregations.group.buckets.3.sig_terms.doc_count: 0 } + - match: { aggregations.group.buckets.3.sig_terms.bg_count: 2 } + - match: { aggregations.group.buckets.4.key: "4.0-5.0" } + - match: { aggregations.group.buckets.4.sig_terms.doc_count: 0 } + - match: { aggregations.group.buckets.4.sig_terms.bg_count: 2 } + - match: { aggregations.group.buckets.5.key: "5.0-6.0" } + - match: { aggregations.group.buckets.5.sig_terms.doc_count: 0 } + - match: { 
aggregations.group.buckets.5.sig_terms.bg_count: 2 } + +--- +"Test background filter count as sub - map": + - requires: + capabilities: + - method: POST + path: /_search + capabilities: [ significant_terms_background_filter_as_sub ] + test_runner_features: capabilities + reason: "bug fix" + + - do: + indices.create: + index: goodbad + body: + settings: + number_of_shards: 1 + mappings: + properties: + text: + type: keyword + class: + type: keyword + - do: + indices.create: + index: goodbad-2 + body: + settings: + number_of_shards: 1 + mappings: + properties: + text: + type: keyword + class: + type: keyword + + - do: + index: + index: goodbad-2 + id: "1" + body: { group: 1, class: "bad" } + - do: + index: + index: goodbad-2 + id: "2" + body: { group: 1, class: "bad" } + + - do: + index: + index: goodbad + id: "1" + body: { group: 1, text: "good", class: "good" } + - do: + index: + index: goodbad + id: "2" + body: { group: 1, text: "good", class: "good" } + - do: + index: + index: goodbad + id: "3" + body: { group: 1, text: "bad", class: "bad" } + - do: + index: + index: goodbad + id: "4" + body: { group: 2, text: "bad", class: "bad" } + + - do: + indices.refresh: + index: [goodbad, goodbad-2] + + - do: + search: + rest_total_hits_as_int: true + index: goodbad* + - match: {hits.total: 6} + + - do: + search: + index: goodbad* + body: + aggs: + group: + range: + field: group + ranges: + # Having many ranges helps catch an issue building no hits buckets + - to: 1 + - from: 1 + to: 2 + - from: 2 + to: 3 + - from: 3 + to: 4 + - from: 4 + to: 5 + - from: 5 + to: 6 + aggs: + sig_terms: + significant_terms: + execution_hint: map + field: text + background_filter: + bool: + filter: [{term: {class: good }}] + - match: { aggregations.group.buckets.0.key: "*-1.0" } + - match: { aggregations.group.buckets.0.sig_terms.doc_count: 0 } + - match: { aggregations.group.buckets.0.sig_terms.bg_count: 2 } + - match: { aggregations.group.buckets.1.key: "1.0-2.0" } + - match: { 
aggregations.group.buckets.1.sig_terms.doc_count: 5 } + - match: { aggregations.group.buckets.1.sig_terms.bg_count: 2 } + - match: { aggregations.group.buckets.2.key: "2.0-3.0" } + - match: { aggregations.group.buckets.2.sig_terms.doc_count: 1 } + - match: { aggregations.group.buckets.2.sig_terms.bg_count: 2 } + - match: { aggregations.group.buckets.3.key: "3.0-4.0" } + - match: { aggregations.group.buckets.3.sig_terms.doc_count: 0 } + - match: { aggregations.group.buckets.3.sig_terms.bg_count: 2 } + - match: { aggregations.group.buckets.4.key: "4.0-5.0" } + - match: { aggregations.group.buckets.4.sig_terms.doc_count: 0 } + - match: { aggregations.group.buckets.4.sig_terms.bg_count: 2 } + - match: { aggregations.group.buckets.5.key: "5.0-6.0" } + - match: { aggregations.group.buckets.5.sig_terms.doc_count: 0 } + - match: { aggregations.group.buckets.5.sig_terms.bg_count: 2 } + --- "IP test": - do: diff --git a/server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java b/server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java index 8231046c6586f..93ca885ef9c96 100644 --- a/server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java +++ b/server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java @@ -44,6 +44,8 @@ private SearchCapabilities() {} private static final String HIGHLIGHT_MAX_ANALYZED_OFFSET_DEFAULT = "highlight_max_analyzed_offset_default"; + private static final String SIGNIFICANT_TERMS_BACKGROUND_FILTER_AS_SUB = "significant_terms_background_filter_as_sub"; + public static final Set<String> CAPABILITIES; static { HashSet<String> capabilities = new HashSet<>(); @@ -60,6 +62,7 @@ private SearchCapabilities() {} capabilities.add(K_DEFAULT_TO_SIZE); capabilities.add(KQL_QUERY_SUPPORTED); capabilities.add(HIGHLIGHT_MAX_ANALYZED_OFFSET_DEFAULT); + capabilities.add(SIGNIFICANT_TERMS_BACKGROUND_FILTER_AS_SUB); CAPABILITIES = Set.copyOf(capabilities); } } diff --git 
a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 439b61cc43ddf..fea8166753138 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -1089,7 +1089,7 @@ SignificantStringTerms buildEmptyResult() { @Override SignificantStringTerms buildNoValuesResult(long owningBucketOrdinal) { - return buildEmptySignificantTermsAggregation(subsetSizes.get(owningBucketOrdinal), supersetSize, significanceHeuristic); + return buildEmptySignificantTermsAggregation(subsetSize(owningBucketOrdinal), supersetSize, significanceHeuristic); } @Override diff --git a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java index 026912a583ef3..5b6e2436ae1e8 100644 --- a/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java +++ b/server/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/MapStringTermsAggregator.java @@ -649,7 +649,7 @@ ObjectArrayPriorityQueue<BucketAndOrd<SignificantStringTerms.Bucket>> buildPrior @Override BucketUpdater bucketUpdater(long owningBucketOrd) { - long subsetSize = subsetSizes.get(owningBucketOrd); + long subsetSize = subsetSize(owningBucketOrd); return (spare, ordsEnum, docCount) -> { ordsEnum.readValue(spare.termBytes); spare.subsetDf = docCount; @@ -696,7 +696,7 @@ SignificantStringTerms buildResult(long owningBucketOrd, long otherDocCount, Sig bucketCountThresholds.getMinDocCount(), metadata(), format, - subsetSizes.get(owningBucketOrd), + subsetSize(owningBucketOrd), supersetSize, significanceHeuristic, 
Arrays.asList(topBuckets) @@ -712,5 +712,10 @@ SignificantStringTerms buildEmptyResult() { public void close() { Releasables.close(backgroundFrequencies, subsetSizes); } + + private long subsetSize(long owningBucketOrd) { + // if the owningBucketOrd is not in the array that means the bucket is empty so the size has to be 0 + return owningBucketOrd < subsetSizes.size() ? subsetSizes.get(owningBucketOrd) : 0; + } } }