Skip to content

Commit

Permalink
SQL: Fix MIN, MAX, SUM aggs data type handling (backport of #71525) (#…
Browse files Browse the repository at this point in the history
…71953)

This fixes the way the MIN, MAX and SUM handle the returned data types:
- MIN and MAX must return the same data type as input's.
- SUM must return long/BIGINT for integral types and double otherwise.

The fix concerns both data returned in projections, as well as aggs
filtering.

(cherry picked from commit be36c1a)
  • Loading branch information
bpintea committed Apr 20, 2021
1 parent 0e324d7 commit 6d7360b
Show file tree
Hide file tree
Showing 21 changed files with 207 additions and 96 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ class org.elasticsearch.xpack.ql.expression.function.scalar.whitelist.InternalQl
boolean nullSafeFilter(Boolean)
double nullSafeSortNumeric(Number)
String nullSafeSortString(Object)
Number nullSafeCastNumeric(Number, String)

#
# ASCII Functions
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,18 @@
import org.elasticsearch.xpack.ql.expression.function.Function;
import org.elasticsearch.xpack.ql.expression.function.aggregate.AggregateFunction;
import org.elasticsearch.xpack.ql.expression.function.grouping.GroupingFunction;
import org.elasticsearch.xpack.ql.expression.gen.script.ParamsBuilder;
import org.elasticsearch.xpack.ql.expression.gen.script.ScriptTemplate;
import org.elasticsearch.xpack.ql.expression.gen.script.Scripts;
import org.elasticsearch.xpack.ql.tree.Source;
import org.elasticsearch.xpack.ql.type.DataType;
import org.elasticsearch.xpack.ql.util.DateUtils;

import static java.util.Collections.emptyList;
import static org.elasticsearch.xpack.ql.expression.gen.script.ParamsBuilder.paramsBuilder;
import static org.elasticsearch.xpack.ql.expression.gen.script.Scripts.PARAM;
import static org.elasticsearch.xpack.ql.type.DataTypes.DATETIME;
import static org.elasticsearch.xpack.ql.type.DataTypes.LONG;

/**
* A {@code ScalarFunction} is a {@code Function} that takes values from some
Expand Down Expand Up @@ -110,31 +114,42 @@ protected ScriptTemplate scriptWithScalar(ScalarFunction scalar) {
}

protected ScriptTemplate scriptWithAggregate(AggregateFunction aggregate) {
String template = basicTemplate(aggregate);
return new ScriptTemplate(processScript(template),
paramsBuilder().agg(aggregate).build(),
dataType());
String template = PARAM;
ParamsBuilder paramsBuilder = paramsBuilder().agg(aggregate);

DataType nullSafeCastDataType = null;
DataType dataType = aggregate.dataType();
if (dataType.name().equals("DATE") || dataType == DATETIME ||
// Aggregations on date_nanos are returned as string
aggregate.field().dataType() == DATETIME) {

template = "{sql}.asDateTime({})";
} else if (dataType.isInteger()) {
// MAX, MIN need to retain field's data type, so that possible operations on integral types (like division) work
// correctly -> perform a cast in the aggs filtering script, the bucket selector for HAVING.
// SQL function classes not available in QL: filter by name
String fn = aggregate.functionName();
if ("MAX".equals(fn) || "MIN".equals(fn)) {
nullSafeCastDataType = dataType;
} else if ("SUM".equals(fn)) {
// SUM(integral_type) requires returning a LONG value
nullSafeCastDataType = LONG;
}
}
if (nullSafeCastDataType != null) {
template = "{ql}.nullSafeCastNumeric({},{})";
paramsBuilder.variable(nullSafeCastDataType.name());
}
return new ScriptTemplate(processScript(template), paramsBuilder.build(), dataType());
}

// This method isn't actually used at the moment, since there is no grouping function (ie HISTOGRAM)
// that currently results in a script being generated
protected ScriptTemplate scriptWithGrouping(GroupingFunction grouping) {
String template = basicTemplate(grouping);
String template = PARAM;
return new ScriptTemplate(processScript(template),
paramsBuilder().grouping(grouping).build(),
dataType());
}

// FIXME: this needs to be refactored to account for different datatypes in different projects (ie DATE from SQL)
private String basicTemplate(Function function) {
if (function.dataType().name().equals("DATE") || function.dataType() == DATETIME ||
// Aggregations on date_nanos are returned as string
(function instanceof AggregateFunction && ((AggregateFunction) function).field().dataType() == DATETIME)) {

return "{sql}.asDateTime({})";
} else {
return "{}";
}
paramsBuilder().grouping(grouping).build(),
dataType());
}

protected ScriptTemplate scriptWithField(FieldAttribute field) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
import java.util.List;
import java.util.Map;

import static org.elasticsearch.xpack.ql.type.DataTypeConverter.convert;
import static org.elasticsearch.xpack.ql.type.DataTypes.fromTypeName;

public class InternalQlScriptUtils {

//
Expand Down Expand Up @@ -51,6 +54,10 @@ public static String nullSafeSortString(Object sort) {
return sort == null ? StringUtils.EMPTY : sort.toString();
}

public static Number nullSafeCastNumeric(Number number, String typeName) {
return number == null || Double.isNaN(number.doubleValue()) ? null : (Number) convert(number, fromTypeName(typeName));
}


//
// Operators
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
aggSumWithColumnRepeatedWithOrderAsc
SELECT gender AS g, gender, SUM(salary) AS s3, SUM(salary), SUM(salary) AS s5 FROM test_emp GROUP BY gender ORDER BY SUM(salary);

g:s | gender:s | s3:i | SUM(salary):i | s5:i
g:s | gender:s | s3:l | SUM(salary):l | s5:l
null |null |487605 |487605 |487605
F |F |1666196|1666196 |1666196
M |M |2671054|2671054 |2671054
Expand All @@ -10,7 +10,7 @@ M |M |2671054|2671054 |2671054
aggSumWithAliasWithColumnRepeatedWithOrderDesc
SELECT gender AS g, gender, SUM(salary) AS s3, SUM(salary), SUM(salary) AS s5 FROM test_emp GROUP BY g ORDER BY s5 DESC;

g:s | gender:s | s3:i | SUM(salary):i | s5:i
g:s | gender:s | s3:l | SUM(salary):l | s5:l
M |M |2671054|2671054 |2671054
F |F |1666196|1666196 |1666196
null |null |487605 |487605 |487605
Expand All @@ -19,7 +19,7 @@ null |null |487605 |487605 |487605
aggSumWithNumericRefWithColumnRepeatedWithOrderDesc
SELECT gender AS g, gender, SUM(salary) AS s3, SUM(salary), SUM(salary) AS s5 FROM test_emp GROUP BY 2 ORDER BY 3 DESC;

g:s | gender:s | s3:i | SUM(salary):i | s5:i
g:s | gender:s | s3:l | SUM(salary):l | s5:l
M |M |2671054|2671054 |2671054
F |F |1666196|1666196 |1666196
null |null |487605 |487605 |487605
Expand Down
28 changes: 14 additions & 14 deletions x-pack/plugin/sql/qa/server/src/main/resources/agg.csv-spec
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,8 @@ M |10096.2232 |10092.362 |15.350877192
sum
SELECT SUM(salary) FROM test_emp;

SUM(salary)
---------------
SUM(salary):l
----------------
4824855
;

Expand Down Expand Up @@ -193,7 +193,7 @@ SELECT MAX(languages) max, MIN(languages) min, SUM(languages) sum, AVG(languages
KURTOSIS(languages) kurtosis, SKEWNESS(languages) skewness
FROM test_emp GROUP BY languages ORDER BY languages ASC LIMIT 5;

max:bt | min:bt | sum:bt | avg:d | percent:d | percent_rank:d| kurtosis:d | skewness:d
max:bt | min:bt | sum:l | avg:d | percent:d | percent_rank:d| kurtosis:d | skewness:d
---------------+---------------+---------------+--------------+---------------+---------------+---------------+---------------
null |null |null |null |null |null |null |null
1 |1 |15 |1 |1.0 |100.0 |NaN |NaN
Expand All @@ -205,7 +205,7 @@ null |null |null |null |null |n
aggSumWithColumnRepeated
SELECT gender AS g, gender, SUM(salary) AS s3, SUM(salary), SUM(salary) AS s5 FROM test_emp GROUP BY gender;

g:s | gender:s | s3:i | SUM(salary):i | s5:i
g:s | gender:s | s3:l | SUM(salary):l | s5:l
null |null |487605 |487605 |487605
F |F |1666196|1666196 |1666196
M |M |2671054|2671054 |2671054
Expand All @@ -214,7 +214,7 @@ M |M |2671054|2671054 |2671054
aggSumWithAliasWithColumnRepeated
SELECT gender AS g, gender, SUM(salary) AS s3, SUM(salary), SUM(salary) AS s5 FROM test_emp GROUP BY g;

g:s | gender:s | s3:i | SUM(salary):i | s5:i
g:s | gender:s | s3:l | SUM(salary):l | s5:l
null |null |487605 |487605 |487605
F |F |1666196|1666196 |1666196
M |M |2671054|2671054 |2671054
Expand All @@ -223,7 +223,7 @@ M |M |2671054|2671054 |2671054
aggSumWithNumericRefWithColumnRepeated
SELECT gender AS g, gender, SUM(salary) AS s3, SUM(salary), SUM(salary) AS s5 FROM test_emp GROUP BY 2;

g:s | gender:s | s3:i | SUM(salary):i | s5:i
g:s | gender:s | s3:l | SUM(salary):l | s5:l
null |null |487605 |487605 |487605
F |F |1666196|1666196 |1666196
M |M |2671054|2671054 |2671054
Expand All @@ -232,7 +232,7 @@ M |M |2671054|2671054 |2671054
sumLiteralWithTrueConditionAndHavingWithCount
SELECT SUM(1) AS c FROM test_emp WHERE 'a'='a' HAVING COUNT(1) > 0;

c:i
c:l
---------------
100
;
Expand Down Expand Up @@ -272,7 +272,7 @@ Bezalel |1
sumFieldWithSumLiteralAsCondition
SELECT first_name, last_name, SUM(salary) AS s, birth_date AS y, COUNT(1) FROM test_emp GROUP BY 1, 2, 4 HAVING ((SUM(1) >= 1) AND (SUM(1) <= 577)) AND ((SUM(salary) >= 35000) AND (SUM(salary) <= 45000));

first_name:s | last_name:s | s:i | y:ts | COUNT(1):l
first_name:s | last_name:s | s:l | y:ts | COUNT(1):l
---------------+---------------+---------------+------------------------+---------------
null |Brender |36051 |1959-10-01T00:00:00.000Z|1
null |Joslin |37716 |1959-01-27T00:00:00.000Z|1
Expand Down Expand Up @@ -308,7 +308,7 @@ Zvonko |Nyanchama |42716 |null |1
mirrorIifForNumericAggregate
SELECT IIF(COUNT(1)=0, NULL, 123)+5, AVG(123), MIN(123)+5, IIF(COUNT(1)=0, NULL, 30*COUNT(1)), SUM(30) FROM test_emp;

IIF(COUNT(1)=0, NULL, 123)+5:i| AVG(123):d | MIN(123)+5:i |IIF(COUNT(1)=0, NULL, 30*COUNT(1)):l| SUM(30):l
IIF(COUNT(1)=0, NULL, 123)+5:i| AVG(123):d | MIN(123)+5:i |IIF(COUNT(1)=0, NULL, 30*COUNT(1)):l| SUM(30):l
------------------------------+-----------------+-----------------+------------------------------------+---------------
128 |123 |128 |3000 |3000
;
Expand Down Expand Up @@ -1452,7 +1452,7 @@ null


allZerosWithSum
schema::SUM_AllZeros:i
schema::SUM_AllZeros:l
SELECT SUM(bytes_in) as "SUM_AllZeros" FROM logs WHERE bytes_in = 0;

SUM_AllZeros
Expand All @@ -1462,7 +1462,7 @@ SELECT SUM(bytes_in) as "SUM_AllZeros" FROM logs WHERE bytes_in = 0;


allNullsWithSum
schema::SUM_AllNulls:i
schema::SUM_AllNulls:l
SELECT SUM(bytes_out) as "SUM_AllNulls" FROM logs WHERE bytes_out IS NULL;

SUM_AllNulls
Expand Down Expand Up @@ -1671,7 +1671,7 @@ null
;

nullsAndZerosCombined
schema::COUNT(*):l|COUNT_AllZeros:l|COUNT_AllNulls:l|FIRST_AllZeros:i|FIRST_AllNulls:i|SUM_AllZeros:i|SUM_AllNulls:i
schema::COUNT(*):l|COUNT_AllZeros:l|COUNT_AllNulls:l|FIRST_AllZeros:i|FIRST_AllNulls:i|SUM_AllZeros:l|SUM_AllNulls:l
SELECT
COUNT(*),
COUNT(bytes_in) AS "COUNT_AllZeros",
Expand All @@ -1690,7 +1690,7 @@ WHERE bytes_in = 0 AND bytes_out IS NULL;


groupedByNullsAndZeros
schema::bytes_in:i|COUNT(*):l|SUM(bytes_in):i|MIN(bytes_in):i|MAX(bytes_in):i|AVG(bytes_in):d
schema::bytes_in:i|COUNT(*):l|SUM(bytes_in):l|MIN(bytes_in):i|MAX(bytes_in):i|AVG(bytes_in):d
SELECT
bytes_in,
COUNT(*),
Expand All @@ -1710,7 +1710,7 @@ null |1 |null |null |null |
;

groupedByMultipleSumsWithNullsAndZeros
schema::SUM(bytes_in):i|SUM(bytes_out):i|client_ip:s|c:l
schema::SUM(bytes_in):l|SUM(bytes_out):l|client_ip:s|c:l
SELECT
SUM(bytes_in),
SUM(bytes_out),
Expand Down
31 changes: 31 additions & 0 deletions x-pack/plugin/sql/qa/server/src/main/resources/agg.sql-spec
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,8 @@ aggMinWithAlias
SELECT gender g, MIN(emp_no) m FROM "test_emp" GROUP BY g ORDER BY gender;
aggMinOnDateTime
SELECT gender, MIN(birth_date) m FROM "test_emp" GROUP BY gender ORDER BY gender;
aggMinOnNull
SELECT MIN(languages) AS min FROM test_emp WHERE languages IS NULL;
aggMinOnDateTimeCastAsDate
SELECT gender, YEAR(CAST(MIN(birth_date) AS DATE)) m FROM "test_emp" GROUP BY gender ORDER BY gender;

Expand All @@ -340,6 +342,10 @@ aggMinWithMultipleHavingBetween
SELECT gender g, MIN(emp_no) m FROM "test_emp" GROUP BY g HAVING m BETWEEN 10 AND 99999 ORDER BY g LIMIT 1;
aggMinWithMultipleHavingOnAliasAndFunction
SELECT gender g, MIN(emp_no) m FROM "test_emp" GROUP BY g HAVING m > 10 AND MIN(emp_no) < 99999 ORDER BY gender;
aggMinWithHavingOnIntegralDivision
SELECT MIN(salary)/100 AS min FROM test_emp HAVING min <= 253;
aggMinWithHavingAndNull
SELECT MIN(languages) AS min FROM test_emp GROUP BY languages HAVING min < 2;

aggMinWithHavingGroupMultiGroupBy
SELECT gender g, languages l, MIN(emp_no) m FROM "test_emp" GROUP BY g, l HAVING MIN(emp_no) > 10 ORDER BY gender, languages;
Expand Down Expand Up @@ -380,6 +386,8 @@ aggMaxWithAlias
SELECT gender g, MAX(emp_no) m FROM "test_emp" GROUP BY g ORDER BY gender;
aggMaxOnDateTime
SELECT gender, MAX(birth_date) m FROM "test_emp" GROUP BY gender ORDER BY gender;
aggMaxOnNull
SELECT MAX(languages) AS max FROM test_emp WHERE languages IS NULL;
aggMaxOnDateTimeCastAsDate
SELECT gender, YEAR(CAST(MAX(birth_date) AS DATE)) m FROM "test_emp" GROUP BY gender ORDER BY gender;
aggAvgAndMaxWithLikeFilter
Expand All @@ -400,6 +408,10 @@ aggMaxWithMultipleHavingBetweenWithLimit
SELECT gender g, MAX(emp_no) m FROM "test_emp" GROUP BY g HAVING m BETWEEN 10 AND 99999 ORDER BY g LIMIT 1;
aggMaxWithMultipleHavingOnAliasAndFunction
SELECT gender g, MAX(emp_no) m FROM "test_emp" GROUP BY g HAVING m > 10 AND MAX(emp_no) < 99999 ORDER BY gender;
aggMaxWithHavingOnIntegralDivision
SELECT MAX(salary)/100 AS max FROM test_emp HAVING max > 749;
aggMaxWithHavingAndNull
SELECT MAX(languages) AS max FROM test_emp GROUP BY languages HAVING max < 2;

// SUM
aggSumImplicitWithCast
Expand All @@ -418,6 +430,8 @@ aggSumWithCastAndCountWithFilterAndLimit
SELECT gender g, CAST(SUM(emp_no) AS BIGINT) s, COUNT(1) c FROM "test_emp" WHERE emp_no > 10000 GROUP BY g ORDER BY g LIMIT 1;
aggSumWithAlias
SELECT gender g, CAST(SUM(emp_no) AS BIGINT) s FROM "test_emp" GROUP BY g ORDER BY gender;
aggSumOnNull
SELECT SUM(languages) AS sum FROM test_emp WHERE languages IS NULL;

// Conditional SUM
aggSumWithHaving
Expand All @@ -434,6 +448,11 @@ aggSumWithMultipleHavingBetweenWithLimit
SELECT gender g, CAST(SUM(emp_no) AS INT) s FROM "test_emp" GROUP BY g HAVING s BETWEEN 10 AND 10000000 ORDER BY g LIMIT 1;
aggSumWithMultipleHavingOnAliasAndFunction
SELECT gender g, CAST(SUM(emp_no) AS INT) s FROM "test_emp" GROUP BY g HAVING s > 10 AND SUM(emp_no) > 10000000 ORDER BY gender;
aggSumWithHavingOnIntegralDivision
SELECT SUM(salary)/100 AS sum FROM test_emp HAVING sum <= 48248;
// AwaitsFix https://github.com/elastic/elasticsearch/issues/45251
aggSumWithHavingOnNull-Ignore
SELECT SUM(languages) AS sum FROM test_emp GROUP BY languages HAVING sum < 20 ORDER BY sum;

// AVG
aggAvgImplicitWithCast
Expand Down Expand Up @@ -595,6 +614,8 @@ selectLangGroupByLangHavingNotEquality
SELECT languages, COUNT(*) c FROM test_emp GROUP BY languages HAVING NOT COUNT(*) = 1 ORDER BY languages DESC;
selectLangGroupByLangHavingDifferent
SELECT languages, COUNT(*) c FROM test_emp GROUP BY languages HAVING COUNT(*) <> 1 ORDER BY languages DESC;
selectNullValueLangWithAgg
SELECT MAX(languages) AS max FROM test_emp GROUP BY languages HAVING max < 2;


// filter with IN
Expand All @@ -603,6 +624,16 @@ SELECT MIN(salary) min, MAX(salary) max, gender g, COUNT(*) c FROM "test_emp" WH
aggMultiGroupByMultiWithHavingUsingInAndNullHandling
SELECT MIN(salary) min, MAX(salary) max, gender g, languages l, COUNT(*) c FROM "test_emp" WHERE languages > 0 GROUP BY g, languages HAVING max IN (74500, null, 74600) ORDER BY gender, languages;

// aggs filtering with integral types
maxIntegralAggFiltering
SELECT MAX(salary)/100 AS max FROM test_emp HAVING max > 749;
minIntegralAggFiltering
SELECT MIN(salary)/100 AS min FROM test_emp HAVING min <= 253;
sumIntegralAggFiltering
SELECT SUM(salary)/100 AS sum FROM test_emp HAVING sum <= 48248;
nullIntegralAggFiltering
SELECT MAX(languages) AS max FROM test_emp GROUP BY languages HAVING max < 2;

// group by with literal
implicitGroupByWithLiteral
SELECT 10, MAX("salary") FROM test_emp;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ Female |24.5
aggSumWithAliasWithColumnRepeatedWithOrderDesc
SELECT extra_gender AS g, extra_gender, SUM(salary) AS s3, SUM(salary), SUM(salary) AS s5 FROM test_emp_copy GROUP BY extra_gender;

g:s | extra_gender:s | s3:i | SUM(salary):i | s5:i
g:s | extra_gender:s | s3:l | SUM(salary):l | s5:l
---------------+----------------+---------------+---------------+---------------
Female |Female |4824855 |4824855 |4824855
;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ SELECT WEEK(birth_date) week, birth_date FROM test_emp ORDER BY WEEK(birth_date)
isoDayOfWeek
SELECT ISO_DAY_OF_WEEK(birth_date) AS d, SUM(salary) s FROM test_emp GROUP BY d ORDER BY d DESC;

d:i | s:i
d:i | s:l
---------------+---------------
7 |386466
6 |643304
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -676,9 +676,9 @@ SELECT gender AS g, ROUND((MIN(salary) / 100)) AS salary FROM emp GROUP BY gende

g | salary
---------------+---------------
null |253
F |260
M |259
null |253
F |259
M |259
// end::groupByAndAggExpression
;

Expand Down
4 changes: 2 additions & 2 deletions x-pack/plugin/sql/qa/server/src/main/resources/pivot.csv-spec
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ null |48396.28571428572|62140.666666666664
;

sumWithoutSubquery
schema::birth_date:ts|emp_no:i|first_name:s|gender:s|hire_date:ts|last_name:s|name:s|1:i|2:i
schema::birth_date:ts|emp_no:i|first_name:s|gender:s|hire_date:ts|last_name:s|name:s|1:l|2:l
// tag::sumWithoutSubquery
SELECT * FROM test_emp PIVOT (SUM(salary) FOR languages IN (1, 2)) LIMIT 5;

Expand All @@ -205,7 +205,7 @@ SELECT *
FROM (SELECT client_ip, status, bytes_in FROM logs WHERE NVL(bytes_in, 0) = 0)
PIVOT (SUM(bytes_in) FOR status IN ('OK','Error'));

client_ip | 'OK' | 'Error'
client_ip:s | 'OK':l | 'Error':l
---------------+---------------+---------------
10.0.1.199 |0 |null
10.0.1.205 |0 |null
Expand Down

0 comments on commit 6d7360b

Please sign in to comment.