From 227f5fa06f085db3d3a42e9efc463e917d281b27 Mon Sep 17 00:00:00 2001
From: Alexander Spies <alexander.spies@elastic.co>
Date: Mon, 28 Apr 2025 13:50:08 +0200
Subject: [PATCH] ESQL: Fix count optimization with pushable union types
 (#127225)

When pushing down STATS count(field::type) to Lucene for a union-typed field, use the correct field name in the Lucene query and not the synthetic attribute name $$field$converted_to$type.
---
 docs/changelog/127225.yaml                    |  6 ++
 .../src/main/resources/union_types.csv-spec   | 67 +++++++++++++++++++
 .../xpack/esql/action/EsqlCapabilities.java   |  6 ++
 .../local/ReplaceMissingFieldWithNull.java    |  6 +-
 .../physical/local/PushStatsToSource.java     |  8 ++-
 5 files changed, 88 insertions(+), 5 deletions(-)
 create mode 100644 docs/changelog/127225.yaml

diff --git a/docs/changelog/127225.yaml b/docs/changelog/127225.yaml
new file mode 100644
index 0000000000000..b161a5c40bfdf
--- /dev/null
+++ b/docs/changelog/127225.yaml
@@ -0,0 +1,6 @@
+pr: 127225
+summary: Fix count optimization with pushable union types
+area: ES|QL
+type: bug
+issues:
+ - 127200
diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/union_types.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/union_types.csv-spec
index 8b19bc589fcff..aa08799943ade 100644
--- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/union_types.csv-spec
+++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/union_types.csv-spec
@@ -393,6 +393,73 @@ count:long | @timestamp:date
          0 | 2023-10-23T13:50:00.000Z
 ;
 
+multiIndexIpStatsNonPushableCount
+// Could be pushed to Lucene if we knew whether ip fields are single valued or not.
+// May still be pushed down on multi-node environments if a node has only the index where client_ip is keyword.
+required_capability: union_types
+required_capability: fix_count_pushdown_for_union_types
+
+FROM sample_data, sample_data_str
+| STATS count=count(client_ip::ip)
+;
+
+count:long
+14
+;
+
+multiIndexIpStatsNonPushableCountEval
+// Could be pushed to Lucene if we knew whether ip fields are single valued or not.
+// May still be pushed down on multi-node environments if a node has only the index where client_ip is keyword.
+required_capability: union_types
+required_capability: fix_count_pushdown_for_union_types
+
+FROM sample_data, sample_data_str
+| EVAL client_ip = client_ip::ip
+| STATS count=count(client_ip)
+;
+
+count:long
+14
+;
+
+multiIndexIpStatsNonPushableCountWithFilter
+// Currently not pushable: has 2 aggs and we don't consider multi-typed fields aggregatable.
+required_capability: union_types
+required_capability: fix_count_pushdown_for_union_types
+
+FROM sample_data, sample_data_ts_long
+| STATS count_matching=count(@timestamp::long) WHERE @timestamp::long >= 1698069301543,
+        total_count=count(@timestamp::long)
+;
+
+count_matching:long | total_count:long
+2                   | 14
+;
+
+multiIndexIpStatsPushableCount
+required_capability: union_types
+required_capability: fix_count_pushdown_for_union_types
+
+FROM sample_data, sample_data_ts_long
+| STATS count=count(@timestamp::long)
+;
+
+count:long
+14
+;
+
+multiIndexIpStatsPushableCountEval
+required_capability: union_types
+required_capability: fix_count_pushdown_for_union_types
+
+FROM sample_data, sample_data_ts_long
+| EVAL @timestamp = @timestamp::long
+| STATS count=count(@timestamp)
+;
+
+count:long
+14
+;
 
 multiIndexIpStringStatsInline2
 required_capability: union_types
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java
index f01e2043d5c13..65e1237ef6b9e 100644
--- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java
+++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java
@@ -375,6 +375,12 @@ public enum Cap {
          */
         UNION_TYPES_AGG_CAST,
 
+        /**
+         * When pushing down {@code STATS count(field::type)} for a union type field, we wrongly used a synthetic attribute name in the
+         * query instead of the actual field name. This led to 0 counts instead of the correct result.
+         */
+        FIX_COUNT_PUSHDOWN_FOR_UNION_TYPES,
+
         /**
          * Fix to GROK validation in case of multiple fields with same name and different types
          * https://github.com/elastic/elasticsearch/issues/110533
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/local/ReplaceMissingFieldWithNull.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/local/ReplaceMissingFieldWithNull.java
index 64512351644b4..0441539a92a82 100644
--- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/local/ReplaceMissingFieldWithNull.java
+++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/logical/local/ReplaceMissingFieldWithNull.java
@@ -65,12 +65,12 @@ public LogicalPlan apply(LogicalPlan plan, LocalLogicalOptimizerContext localLog
 
     private LogicalPlan missingToNull(LogicalPlan plan, Predicate<FieldAttribute> shouldBeRetained) {
         if (plan instanceof EsRelation relation) {
-            // Remove missing fields from the EsRelation because this is not where we will obtain them from; replace them by an Eval right
-            // after, instead. This allows us to safely re-use the attribute ids of the corresponding FieldAttributes.
+            // For any missing field, place an Eval right after the EsRelation to assign null values to that attribute (using the same name
+            // id!), thus avoiding that InsertFieldExtrations inserts a field extraction later.
             // This means that an EsRelation[field1, field2, field3] where field1 and field 3 are missing will be replaced by
             // Project[field1, field2, field3] <- keeps the ordering intact
             // \_Eval[field1 = null, field3 = null]
-            // \_EsRelation[field2]
+            // \_EsRelation[field1, field2, field3]
             List<Attribute> relationOutput = relation.output();
             Map<DataType, Alias> nullLiterals = Maps.newLinkedHashMapWithExpectedSize(DataType.types().size());
             List<NamedExpression> newProjections = new ArrayList<>(relationOutput.size());
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/PushStatsToSource.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/PushStatsToSource.java
index e3954688aed41..3fec1644b92d2 100644
--- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/PushStatsToSource.java
+++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/optimizer/rules/physical/local/PushStatsToSource.java
@@ -94,15 +94,19 @@ private Tuple<List<Attribute>, List<EsStatsQueryExec.Stat>> pushableStats(
                             // check if regular field
                             else {
                                 if (target instanceof FieldAttribute fa) {
-                                    var fName = fa.name();
+                                    var fName = fa.fieldName();
                                     if (context.searchStats().isSingleValue(fName)) {
-                                        fieldName = fa.name();
+                                        fieldName = fName;
                                         query = QueryBuilders.existsQuery(fieldName);
                                     }
                                 }
                             }
                             if (fieldName != null) {
                                 if (count.hasFilter()) {
+                                    // Note: currently, it seems like we never actually perform stats pushdown if we reach here.
+                                    // That's because stats pushdown only works for 1 agg function (without BY); but in that case, filters
+                                    // are extracted into a separate filter node upstream from the aggregation (and hopefully pushed into
+                                    // the EsQueryExec separately).
                                     if (canPushToSource(count.filter()) == false) {
                                         return null; // can't push down
                                     }