Skip to content

Commit

Permalink
SQL: Fix bug regarding histograms usage in scripting (#36866)
Browse files Browse the repository at this point in the history
Allow scripts to correctly reference grouping functions
Fix bug in translation of date/time functions mixed with histograms.
Enhance Verifier to prevent histograms being nested inside other
 functions inside GROUP BY (as it implies double grouping)
Extend Histogram docs
  • Loading branch information
costin committed Dec 20, 2018
1 parent 4cd5705 commit ac032a0
Show file tree
Hide file tree
Showing 15 changed files with 234 additions and 47 deletions.
24 changes: 24 additions & 0 deletions docs/reference/sql/functions/grouping.asciidoc
Expand Up @@ -36,6 +36,8 @@ The histogram function takes all matching values and divides them into buckets w
bucket_key = Math.floor(value / interval) * interval
----

NOTE:: The histogram in SQL does *NOT* return empty buckets for missing intervals as the traditional <<search-aggregations-bucket-histogram-aggregation, histogram>> and <<search-aggregations-bucket-datehistogram-aggregation, date histogram>>. Such behavior does not fit conceptually in SQL which treats all missing values as `NULL`; as such the histogram places all missing values in the `NULL` group.

`Histogram` can be applied on either numeric fields:


Expand All @@ -51,4 +53,26 @@ or date/time fields:
include-tagged::{sql-specs}/docs.csv-spec[histogramDate]
----

Expressions inside the histogram are also supported as long as the
return type is numeric:

["source","sql",subs="attributes,callouts,macros"]
----
include-tagged::{sql-specs}/docs.csv-spec[histogramNumericExpression]
----

Do note that histograms (and grouping functions in general) allow custom expressions but cannot have any functions applied to them in the `GROUP BY`. In other words, the following statement is *NOT* allowed:

["source","sql",subs="attributes,callouts,macros"]
----
include-tagged::{sql-specs}/docs.csv-spec[expressionOnHistogramNotAllowed]
----

as it requires two groupings (one for histogram followed by a second for applying the function on top of the histogram groups).

Instead one can rewrite the query to move the expression on the histogram _inside_ of it:

["source","sql",subs="attributes,callouts,macros"]
----
include-tagged::{sql-specs}/docs.csv-spec[histogramDateExpression]
----
50 changes: 46 additions & 4 deletions x-pack/plugin/sql/qa/src/main/resources/agg.csv-spec
Expand Up @@ -262,9 +262,51 @@ SELECT HISTOGRAM(birth_date, INTERVAL 1 YEAR) AS h, COUNT(*) as c FROM test_emp
null |10
;

histogramDateWithDateFunction-Ignore
SELECT YEAR(HISTOGRAM(birth_date, INTERVAL 1 YEAR)) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;
histogramDateWithMonthOnTop
schema::h:i|c:l
SELECT HISTOGRAM(MONTH(birth_date), 2) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;

h | c
---------------+---------------
12 |7
10 |17
8 |16
6 |16
4 |18
2 |10
0 |6
null |10
;

histogramDateWithYearOnTop
schema::h:i|c:l
SELECT HISTOGRAM(YEAR(birth_date), 2) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;
h | c
---------------+---------------
1964 |5
1962 |13
1960 |16
1958 |16
1956 |9
1954 |12
1952 |19
null |10
;



histogramNumericWithExpression
schema::h:i|c:l
SELECT HISTOGRAM(emp_no % 100, 10) AS h, COUNT(*) as c FROM test_emp GROUP BY h ORDER BY h DESC;

h | c
---------------+---------------
90 |10
80 |10
70 |10
60 |10
50 |10
40 |10
30 |10
20 |10
10 |10
0 |10
;
45 changes: 45 additions & 0 deletions x-pack/plugin/sql/qa/src/main/resources/docs.csv-spec
Expand Up @@ -725,6 +725,27 @@ SELECT HISTOGRAM(salary, 5000) AS h FROM emp GROUP BY h;
// end::histogramNumeric
;

histogramNumericExpression
schema::h:i|c:l
// tag::histogramNumericExpression
SELECT HISTOGRAM(salary % 100, 10) AS h, COUNT(*) AS c FROM emp GROUP BY h;

h | c
---------------+---------------
0 |10
10 |15
20 |10
30 |14
40 |9
50 |9
60 |8
70 |13
80 |3
90 |9

// end::histogramNumericExpression
;

histogramDate
schema::h:ts|c:l
// tag::histogramDate
Expand Down Expand Up @@ -752,6 +773,30 @@ null |10
// end::histogramDate
;

expressionOnHistogramNotAllowed-Ignore
// tag::expressionOnHistogramNotAllowed
SELECT MONTH(HISTOGRAM(birth_date), 2)) AS h, COUNT(*) as c FROM emp GROUP BY h ORDER BY h DESC;
// end::expressionOnHistogramNotAllowed

histogramDateExpression
schema::h:i|c:l
// tag::histogramDateExpression
SELECT HISTOGRAM(MONTH(birth_date), 2) AS h, COUNT(*) as c FROM emp GROUP BY h ORDER BY h DESC;

h | c
---------------+---------------
12 |7
10 |17
8 |16
6 |16
4 |18
2 |10
0 |6
null |10

// end::histogramDateExpression
;

///////////////////////////////
//
// Date/Time
Expand Down
Expand Up @@ -18,6 +18,8 @@
import org.elasticsearch.xpack.sql.expression.function.FunctionAttribute;
import org.elasticsearch.xpack.sql.expression.function.Functions;
import org.elasticsearch.xpack.sql.expression.function.Score;
import org.elasticsearch.xpack.sql.expression.function.aggregate.AggregateFunctionAttribute;
import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;
import org.elasticsearch.xpack.sql.expression.function.scalar.ScalarFunction;
import org.elasticsearch.xpack.sql.expression.predicate.conditional.ConditionalFunction;
import org.elasticsearch.xpack.sql.expression.predicate.operator.comparison.In;
Expand Down Expand Up @@ -224,6 +226,7 @@ Collection<Failure> verify(LogicalPlan plan) {
validateConditional(p, localFailures);

checkFilterOnAggs(p, localFailures);
checkFilterOnGrouping(p, localFailures);

if (!groupingFailures.contains(p)) {
checkGroupBy(p, localFailures, resolvedFunctions, groupingFailures);
Expand Down Expand Up @@ -419,7 +422,7 @@ private static boolean checkGroupByHavingHasOnlyAggs(Expression e, Node<?> sourc
return true;
}
// skip aggs (allowed to refer to non-group columns)
if (Functions.isAggregate(e)) {
if (Functions.isAggregate(e) || Functions.isGrouping(e)) {
return true;
}

Expand Down Expand Up @@ -448,6 +451,21 @@ private static boolean checkGroupByAgg(LogicalPlan p, Set<Failure> localFailures
}
}));

a.groupings().forEach(e -> {
if (Functions.isGrouping(e) == false) {
e.collectFirstChildren(c -> {
if (Functions.isGrouping(c)) {
localFailures.add(fail(c,
"Cannot combine [%s] grouping function inside GROUP BY, found [%s];"
+ " consider moving the expression inside the histogram",
Expressions.name(c), Expressions.name(e)));
return true;
}
return false;
});
}
});

if (!localFailures.isEmpty()) {
return false;
}
Expand Down Expand Up @@ -547,19 +565,30 @@ private static void checkFilterOnAggs(LogicalPlan p, Set<Failure> localFailures)
if (p instanceof Filter) {
Filter filter = (Filter) p;
if ((filter.child() instanceof Aggregate) == false) {
filter.condition().forEachDown(f -> {
if (Functions.isAggregate(f) || Functions.isGrouping(f)) {
String type = Functions.isAggregate(f) ? "aggregate" : "grouping";
localFailures.add(fail(f,
"Cannot use WHERE filtering on %s function [%s], use HAVING instead", type, Expressions.name(f)));
filter.condition().forEachDown(e -> {
if (Functions.isAggregate(e) || e instanceof AggregateFunctionAttribute) {
localFailures.add(
fail(e, "Cannot use WHERE filtering on aggregate function [%s], use HAVING instead", Expressions.name(e)));
}

}, Function.class);
}, Expression.class);
}
}
}


private static void checkFilterOnGrouping(LogicalPlan p, Set<Failure> localFailures) {
if (p instanceof Filter) {
Filter filter = (Filter) p;
filter.condition().forEachDown(e -> {
if (Functions.isGrouping(e) || e instanceof GroupingFunctionAttribute) {
localFailures
.add(fail(e, "Cannot filter on grouping function [%s], use its argument instead", Expressions.name(e)));
}
}, Expression.class);
}
}


private static void checkForScoreInsideFunctions(LogicalPlan p, Set<Failure> localFailures) {
// Make sure that SCORE is only used in "top level" functions
p.forEachExpressions(e ->
Expand Down Expand Up @@ -647,4 +676,4 @@ private static boolean areTypesCompatible(DataType left, DataType right) {
(left.isNumeric() && right.isNumeric());
}
}
}
}
Expand Up @@ -346,6 +346,9 @@ public static Integer weekOfYear(Object dateTime, String tzId) {
}

public static ZonedDateTime asDateTime(Object dateTime) {
if (dateTime == null) {
return null;
}
if (dateTime instanceof JodaCompatibleZonedDateTime) {
return ((JodaCompatibleZonedDateTime) dateTime).getZonedDateTime();
}
Expand Down
@@ -0,0 +1,24 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License;
* you may not use this file except in compliance with the Elastic License.
*/
package org.elasticsearch.xpack.sql.expression.gen.script;

import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;

class Grouping extends Param<GroupingFunctionAttribute> {

Grouping(GroupingFunctionAttribute groupRef) {
super(groupRef);
}

String groupName() {
return value().functionId();
}

@Override
public String prefix() {
return "g";
}
}
Expand Up @@ -85,25 +85,15 @@ Map<String, String> asAggPaths() {
String s = a.aggProperty() != null ? a.aggProperty() : a.aggName();
map.put(p.prefix() + aggs++, s);
}
}

return map;
}

// return the agg refs
List<String> asAggRefs() {
List<String> refs = new ArrayList<>();

for (Param<?> p : params) {
if (p instanceof Agg) {
refs.add(((Agg) p).aggName());
if (p instanceof Grouping) {
Grouping g = (Grouping) p;
map.put(p.prefix() + aggs++, g.groupName());
}
}

return refs;
return map;
}


private static List<Param<?>> flatten(List<Param<?>> params) {
List<Param<?>> flatten = emptyList();

Expand All @@ -116,6 +106,9 @@ private static List<Param<?>> flatten(List<Param<?>> params) {
else if (p instanceof Agg) {
flatten.add(p);
}
else if (p instanceof Grouping) {
flatten.add(p);
}
else if (p instanceof Var) {
flatten.add(p);
}
Expand All @@ -131,4 +124,4 @@ else if (p instanceof Var) {
public String toString() {
return params.toString();
}
}
}
Expand Up @@ -6,6 +6,7 @@
package org.elasticsearch.xpack.sql.expression.gen.script;

import org.elasticsearch.xpack.sql.expression.function.aggregate.AggregateFunctionAttribute;
import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;

import java.util.ArrayList;
import java.util.List;
Expand All @@ -28,6 +29,11 @@ public ParamsBuilder agg(AggregateFunctionAttribute agg) {
return this;
}

public ParamsBuilder grouping(GroupingFunctionAttribute grouping) {
params.add(new Grouping(grouping));
return this;
}

public ParamsBuilder script(Params ps) {
params.add(new Script(ps));
return this;
Expand Down
Expand Up @@ -44,10 +44,6 @@ public Params params() {
return params;
}

public List<String> aggRefs() {
return params.asAggRefs();
}

public Map<String, String> aggPaths() {
return params.asAggPaths();
}
Expand Down
Expand Up @@ -12,6 +12,7 @@
import org.elasticsearch.xpack.sql.expression.Expressions;
import org.elasticsearch.xpack.sql.expression.FieldAttribute;
import org.elasticsearch.xpack.sql.expression.function.aggregate.AggregateFunctionAttribute;
import org.elasticsearch.xpack.sql.expression.function.grouping.GroupingFunctionAttribute;
import org.elasticsearch.xpack.sql.expression.function.scalar.ScalarFunctionAttribute;
import org.elasticsearch.xpack.sql.expression.literal.IntervalDayTime;
import org.elasticsearch.xpack.sql.expression.literal.IntervalYearMonth;
Expand All @@ -37,6 +38,9 @@ default ScriptTemplate asScript(Expression exp) {
if (attr instanceof AggregateFunctionAttribute) {
return scriptWithAggregate((AggregateFunctionAttribute) attr);
}
if (attr instanceof GroupingFunctionAttribute) {
return scriptWithGrouping((GroupingFunctionAttribute) attr);
}
if (attr instanceof FieldAttribute) {
return scriptWithField((FieldAttribute) attr);
}
Expand Down Expand Up @@ -83,6 +87,16 @@ default ScriptTemplate scriptWithAggregate(AggregateFunctionAttribute aggregate)
dataType());
}

default ScriptTemplate scriptWithGrouping(GroupingFunctionAttribute grouping) {
String template = "{}";
if (grouping.dataType() == DataType.DATE) {
template = "{sql}.asDateTime({})";
}
return new ScriptTemplate(processScript(template),
paramsBuilder().grouping(grouping).build(),
dataType());
}

default ScriptTemplate scriptWithField(FieldAttribute field) {
return new ScriptTemplate(processScript("doc[{}].value"),
paramsBuilder().variable(field.name()).build(),
Expand Down

0 comments on commit ac032a0

Please sign in to comment.