Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Union operator with collations #12078

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
30 changes: 27 additions & 3 deletions src/execution/physical_plan/plan_set_operation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,35 @@ unique_ptr<PhysicalOperator> PhysicalPlanGenerator::CreatePlan(LogicalSetOperati

// if the ALL specifier is not given, we have to ensure distinct results. Hence, push a GROUP BY ALL
if (!op.setop_all) { // no ALL, use distinct semantics
auto &types = result->GetTypes();
auto &types = result->types;
vector<unique_ptr<Expression>> groups, aggregates /* left empty */;
for (idx_t i = 0; i < types.size(); i++) {
groups.push_back(make_uniq<BoundReferenceExpression>(types[i], i));
if (op.collation_info.empty()) {
for (idx_t i = 0; i < types.size(); i++) {
groups.push_back(make_uniq<BoundReferenceExpression>(types[i], i));
}
} else { // project the collations and ordinary columns
vector<unique_ptr<Expression>> expressions;
idx_t info_idx = 0;
auto &info = op.collation_info;
for (idx_t proj_idx = 0; proj_idx < types.size(); ++proj_idx) {
if (info_idx < info.size() && proj_idx == info[info_idx].collation_idx) {
// project collation
auto &bound_collation_expr = info[info_idx].bound_collation_expr;
types[proj_idx] = bound_collation_expr->return_type;
expressions.push_back(std::move(bound_collation_expr));
info_idx++;
} else {
// ordinary columns
expressions.push_back(make_uniq<BoundReferenceExpression>(types[proj_idx], proj_idx));
}
groups.push_back(make_uniq<BoundReferenceExpression>(types[proj_idx], proj_idx));
}
auto projection =
make_uniq<PhysicalProjection>(std::move(types), std::move(expressions), op.estimated_cardinality);
projection->children.push_back(std::move(result));
result = std::move(projection);
}

auto groupby = make_uniq<PhysicalHashAggregate>(context, op.types, std::move(aggregates), std::move(groups),
result->estimated_cardinality);
groupby->children.push_back(std::move(result));
Expand Down
3 changes: 3 additions & 0 deletions src/include/duckdb/planner/binder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "duckdb/planner/logical_operator.hpp"
#include "duckdb/planner/joinside.hpp"
#include "duckdb/common/reference_map.hpp"
#include "duckdb/planner/query_node/bound_set_operation_node.hpp"

namespace duckdb {
class BoundResultModifier;
Expand Down Expand Up @@ -394,6 +395,8 @@ class Binder : public enable_shared_from_this<Binder> {
unique_ptr<BoundTableRef> BindShowTable(ShowRef &ref);
unique_ptr<BoundTableRef> BindSummarize(ShowRef &ref);

void BindCollationGroup(unique_ptr<BoundSetOperationNode> &bound_set_op);

public:
// This should really be a private constructor, but make_shared_ptr does not allow it...
// If you are thinking about calling this, you should probably call Binder::CreateBinder
Expand Down
13 changes: 13 additions & 0 deletions src/include/duckdb/planner/operator/logical_set_operation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#pragma once

#include "duckdb/planner/logical_operator.hpp"
#include "duckdb/planner/query_node/bound_select_node.hpp"
#include "duckdb/planner/query_node/bound_set_operation_node.hpp"

namespace duckdb {

Expand All @@ -34,12 +36,23 @@ class LogicalSetOperation : public LogicalOperator {
children.push_back(std::move(bottom));
}

//! Constructs a set operation that additionally carries collation information.
//! All regular arguments are forwarded unchanged to the constructor overload without collation info;
//! `info` is taken by value and moved into `collation_info`.
LogicalSetOperation(idx_t table_index, idx_t column_count, unique_ptr<LogicalOperator> top,
unique_ptr<LogicalOperator> bottom, LogicalOperatorType type, bool setop_all,
bool allow_out_of_order, vector<CollationGroupInfo> info)
: LogicalSetOperation(table_index, column_count, std::move(top), std::move(bottom), type, setop_all,
allow_out_of_order) {
// a delegating constructor cannot have further member initializers, hence the assignment in the body
collation_info = std::move(info);
}

idx_t table_index;
idx_t column_count;
bool setop_all;
//! Whether or not UNION statements can be executed out of order
bool allow_out_of_order;

//! Collation expressions for the collated columns of this set operation; empty when no column is collated
vector<CollationGroupInfo> collation_info;

public:
vector<ColumnBinding> GetColumnBindings() override {
return GenerateColumnBindings(table_index, column_count);
Expand Down
2 changes: 2 additions & 0 deletions src/include/duckdb/planner/query_node/bound_select_node.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ class BoundSelectNode : public BoundQueryNode {
idx_t prune_index;
bool need_prune = false;

vector<idx_t> collation_sel_idx;

public:
idx_t GetRootIndex() override {
return need_prune ? prune_index : projection_index;
Expand Down
15 changes: 15 additions & 0 deletions src/include/duckdb/planner/query_node/bound_set_operation_node.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,19 @@

namespace duckdb {

class CollationGroupInfo {
public:
CollationGroupInfo(idx_t coll_idx, unique_ptr<Expression> coll_expr)
: collation_idx(coll_idx), bound_collation_expr(std::move(coll_expr)) {
}

public:
//! collation index in the select list
idx_t collation_idx;
//! bound collation fucntion with a child of type BoundReferenceExpression that references to collation_idx
unique_ptr<Expression> bound_collation_expr;
};

//! Bound equivalent of SetOperationNode
class BoundSetOperationNode : public BoundQueryNode {
public:
Expand Down Expand Up @@ -49,6 +62,8 @@ class BoundSetOperationNode : public BoundQueryNode {
vector<idx_t> left_reorder_idx;
vector<idx_t> right_reorder_idx;

vector<CollationGroupInfo> collation_group_info;

public:
idx_t GetRootIndex() override {
return setop_index;
Expand Down
4 changes: 4 additions & 0 deletions src/planner/binder/query_node/bind_select_node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,10 @@ unique_ptr<BoundQueryNode> Binder::BindSelectNode(SelectNode &statement, unique_
}
bind_state.projection_map[*expr] = i;
bind_state.original_expressions.push_back(expr->Copy());

if (expr->GetExpressionClass() == ExpressionClass::COLLATE) {
result->collation_sel_idx.push_back(i);
}
}
result->column_count = statement.select_list.size();

Expand Down
56 changes: 56 additions & 0 deletions src/planner/binder/query_node/bind_setop_node.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include "duckdb/parser/expression/collate_expression.hpp"
#include "duckdb/parser/expression/columnref_expression.hpp"
#include "duckdb/parser/expression/constant_expression.hpp"
#include "duckdb/parser/expression_map.hpp"
Expand All @@ -6,6 +7,7 @@
#include "duckdb/planner/binder.hpp"
#include "duckdb/planner/expression/bound_columnref_expression.hpp"
#include "duckdb/planner/expression/bound_constant_expression.hpp"
#include "duckdb/planner/expression/bound_reference_expression.hpp"
#include "duckdb/planner/expression_binder/order_binder.hpp"
#include "duckdb/planner/query_node/bound_select_node.hpp"
#include "duckdb/planner/query_node/bound_set_operation_node.hpp"
Expand Down Expand Up @@ -182,6 +184,57 @@ static void BuildUnionByNameInfo(ClientContext &context, BoundSetOperationNode &
}
}

//! Gathers the COLLATE clauses used on either side of a set operation and, for every collated column,
//! appends a (column index, bound collation expression) entry to bound_set_op->collation_group_info.
//!
//! Only set operations whose two children are both SELECT nodes are handled; for any other child type the
//! function returns without touching bound_set_op.
//!
//! \param bound_set_op The bound set operation whose collation_group_info is filled in.
//! \throws BinderException if both sides collate the same column with different collations.
void Binder::BindCollationGroup(unique_ptr<BoundSetOperationNode> &bound_set_op) {
	if (bound_set_op->left->type != QueryNodeType::SELECT_NODE ||
	    bound_set_op->right->type != QueryNodeType::SELECT_NODE) {
		return;
	}
	auto &left_node = bound_set_op->left->Cast<BoundSelectNode>();
	auto &right_node = bound_set_op->right->Cast<BoundSelectNode>();
	auto &left_originals = left_node.bind_state.original_expressions;
	auto &right_originals = right_node.bind_state.original_expressions;

	// an ordered set removes columns collated on both sides and yields the indexes in ascending order,
	// so collation_group_info ends up sorted by column index
	std::set<idx_t> collation_indexes(left_node.collation_sel_idx.begin(), left_node.collation_sel_idx.end());
	collation_indexes.insert(right_node.collation_sel_idx.begin(), right_node.collation_sel_idx.end());

	for (idx_t collate_idx : collation_indexes) {
		// An index recorded on one side is not guaranteed to exist on the other side: the two select lists
		// may have different lengths (presumably only for UNION BY NAME - TODO confirm). A positional
		// comparison is meaningless in that case, so the column is skipped instead of indexing out of range.
		if (collate_idx >= left_originals.size() || collate_idx >= right_originals.size()) {
			continue;
		}
		auto &left_expr = left_originals[collate_idx];
		auto &right_expr = right_originals[collate_idx];
		const bool left_collated = left_expr->GetExpressionClass() == ExpressionClass::COLLATE;
		const bool right_collated = right_expr->GetExpressionClass() == ExpressionClass::COLLATE;
		// the index came from one of the two collation lists, so at least one side must be a COLLATE
		D_ASSERT(left_collated || right_collated);

		if (left_collated && right_collated) {
			// both sides are collated: the collations have to agree
			auto &left_collation = left_expr->Cast<CollateExpression>().collation;
			auto &right_collation = right_expr->Cast<CollateExpression>().collation;
			if (left_collation != right_collation) {
				throw BinderException("Different collations in a set operation at column: %lld.", collate_idx + 1);
			}
		}

		// the collated type comes from the left-hand side whenever it is collated, otherwise from the right-hand side
		auto &collated_node = left_collated ? left_node : right_node;
		const LogicalType collation_type = collated_node.select_list[collate_idx]->return_type;

		//! create a reference to the collated column and push the collation function on top of it
		unique_ptr<Expression> bound_collation_expr = make_uniq<BoundReferenceExpression>(collation_type, collate_idx);
		// Pass the local copy of the type: PushCollation receives the expression by mutable reference and may
		// replace it, so a reference to bound_collation_expr->return_type could be left dangling.
		ExpressionBinder::PushCollation(context, bound_collation_expr, collation_type, true);
		bound_set_op->collation_group_info.push_back({collate_idx, std::move(bound_collation_expr)});
	}
}

unique_ptr<BoundQueryNode> Binder::BindNode(SetOperationNode &statement) {
auto result = make_uniq<BoundSetOperationNode>();
result->setop_type = statement.setop_type;
Expand Down Expand Up @@ -253,6 +306,9 @@ unique_ptr<BoundQueryNode> Binder::BindNode(SetOperationNode &statement) {

// finally bind the types of the ORDER/DISTINCT clause expressions
BindModifiers(*result, result->setop_index, result->names, result->types, bind_state);

BindCollationGroup(result);

return std::move(result);
}

Expand Down
5 changes: 3 additions & 2 deletions src/planner/binder/query_node/plan_setop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,9 @@ unique_ptr<LogicalOperator> Binder::CreatePlan(BoundSetOperationNode &node) {
break;
}

auto root = make_uniq<LogicalSetOperation>(node.setop_index, node.types.size(), std::move(left_node),
std::move(right_node), logical_type, node.setop_all);
auto root =
make_uniq<LogicalSetOperation>(node.setop_index, node.types.size(), std::move(left_node), std::move(right_node),
logical_type, node.setop_all, true, std::move(node.collation_group_info));

return VisitQueryNode(node, std::move(root));
}
Expand Down
130 changes: 130 additions & 0 deletions test/sql/collate/collate_union.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# name: test/sql/collate/collate_union.test
# description: Test collation in Union operator
# group: [collate]

require icu

query I
SELECT 'A' COLLATE NOCASE UNION SELECT 'a'
----
a

query I
SELECT 'a' UNION SELECT 'A' COLLATE NOCASE
----
a

query II
SELECT 'a', 'B' UNION SELECT 'A' COLLATE NOCASE, 'B'
----
a B


query II
SELECT 'a', 'b' UNION SELECT 'A' COLLATE NOCASE, 'B' COLLATE NOCASE
----
a b


query II
SELECT 'A' COLLATE NOCASE, 'B' COLLATE NOCASE UNION SELECT 'a', 'b'
----
a b

query II
SELECT * FROM (SELECT 'A', 'B' UNION SELECT 'a', 'b') ORDER BY 1
----
A B
a b

#! multiple collations
query II
SELECT NULL::VARCHAR COLLATE NOCASE, 'A' COLLATE NOCASE UNION SELECT 'a', 'b' ORDER BY 1
----
a b
NULL a

#! different collations for the same resulting column
statement error
SELECT 'A' COLLATE NOCASE UNION SELECT 'a' COLLATE POSIX
----
Binder Error: Different collations in a set operation at column: 1.

#! testing NULL
query II
SELECT NULL::VARCHAR COLLATE NOCASE, 'A' COLLATE NOCASE UNION SELECT 'a', 'b' ORDER BY 1
----
a b
NULL a

statement ok
CREATE TABLE t1(c1 VARCHAR)

#! UNION with an empty table
query I
SELECT 'a' UNION SELECT c1 COLLATE NOCASE FROM t1
----
a

statement ok
INSERT INTO t1 VALUES('A')

#! literal and column
query I
SELECT 'a' UNION SELECT c1 COLLATE NOCASE FROM t1
----
a

statement ok
INSERT INTO t1 VALUES('b')

query I
SELECT 'c' UNION SELECT c1 COLLATE NOCASE FROM t1 ORDER BY 1
----
a
b
c

#! UNION two tables
query I
SELECT c1 COLLATE NOCASE FROM t1 UNION SELECT * FROM t1 ORDER BY 1
----
a
b

statement ok
ALTER TABLE t1 ADD COLUMN c2 VARCHAR

#! UNION two tables with two columns, only the first one collated
query II
SELECT c1 COLLATE NOCASE, c2 FROM t1 UNION SELECT * FROM t1 ORDER BY 1
----
a NULL
b NULL

statement ok
DROP TABLE t1

statement ok
CREATE TABLE t1(c1 VARCHAR)

statement ok
INSERT INTO t1 VALUES('a'),('à'),('á'),('A')

#! testing NOACCENT collation
query I
SELECT c1 COLLATE NOACCENT FROM T1 UNION SELECT * FROM t1 ORDER BY 1
----
A
a

statement ok
INSERT INTO t1 VALUES('b'),('B')

#! testing combined NOACCENT.NOCASE collation
query I
SELECT c1 COLLATE NOACCENT.NOCASE FROM T1 UNION SELECT * FROM t1 ORDER BY 1
----
a
b

Loading