LIMIT BY clause was implemented #293
DB/DataStreams/LimitByBlockInputStream.h (new file, +41 lines):

```cpp
#pragma once

#include <DB/DataStreams/IProfilingBlockInputStream.h>

#include <DB/Common/HashTable/HashMap.h>
#include <DB/Common/SipHash.h>
#include <DB/Common/UInt128.h>

namespace DB
{

/** Implements the LIMIT BY clause, which can be used to obtain a "top N by subgroup".
  *
  * For example, if you have a table T like this (Num: 1 1 3 3 3 4 4 5 7 7 7 7),
  * the query SELECT Num FROM T LIMIT 2 BY Num
  * will give you the following result: (Num: 1 1 3 3 4 4 5 7 7).
  */
class LimitByBlockInputStream : public IProfilingBlockInputStream
{
public:
    LimitByBlockInputStream(BlockInputStreamPtr input_, size_t group_size_, Names columns_);

    String getName() const override { return "LimitBy"; }

    String getID() const override;

protected:
    Block readImpl() override;

private:
    ConstColumnPlainPtrs getKeyColumns(Block & block) const;

private:
    using MapHashed = HashMap<UInt128, UInt64, UInt128TrivialHash>;

    const Names columns_names;
    const size_t group_size;
    MapHashed keys_counts;
};

}
```

Review comment on the MapHashed alias: Missing comment.
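The class comment above fixes the semantics: for each distinct key, keep the first group_size rows in input order. A minimal standalone sketch of that filtering rule, with a plain std::unordered_map standing in for the UInt128-keyed HashMap (illustrative only, not PR code):

```cpp
#include <cstddef>
#include <iostream>
#include <unordered_map>
#include <vector>

// Keep at most group_size occurrences of each distinct value, preserving
// input order: the same rule LimitByBlockInputStream applies per block.
std::vector<int> limitBy(const std::vector<int> & values, std::size_t group_size)
{
    std::unordered_map<int, std::size_t> counts;
    std::vector<int> result;
    for (int v : values)
        if (counts[v]++ < group_size)
            result.push_back(v);
    return result;
}

int main()
{
    // The example from the class comment: SELECT Num FROM T LIMIT 2 BY Num.
    for (int v : limitBy({1, 1, 3, 3, 3, 4, 4, 5, 7, 7, 7, 7}, 2))
        std::cout << v << ' ';  // prints: 1 1 3 3 4 4 5 7 7
    std::cout << '\n';
}
```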
LimitByBlockInputStream.cpp (new file, +83 lines):

```cpp
#include <DB/DataStreams/LimitByBlockInputStream.h>

namespace DB
{

LimitByBlockInputStream::LimitByBlockInputStream(BlockInputStreamPtr input_, size_t group_size_, Names columns_)
    : columns_names(columns_)
    , group_size(group_size_)
{
    children.push_back(input_);
}

String LimitByBlockInputStream::getID() const
{
    std::stringstream res;
    res << "LimitBy(" << this << ")";
    return res.str();
}

Block LimitByBlockInputStream::readImpl()
{
    /// Execute until the end of the stream or until
    /// a block with some new records is obtained.
    while (true)
    {
        Block block = children[0]->read();
        if (!block)
            return Block();

        const ConstColumnPlainPtrs column_ptrs(getKeyColumns(block));
        const size_t rows = block.rows();
        IColumn::Filter filter(rows);
        size_t inserted_count = 0;

        for (size_t i = 0; i < rows; ++i)
        {
            UInt128 key;
            SipHash hash;

            for (auto & column : column_ptrs)
                column->updateHashWithValue(i, hash);

            hash.get128(key.first, key.second);

            if (keys_counts[key]++ < group_size)
            {
                inserted_count++;
                filter[i] = 1;
            }
            else
                filter[i] = 0;
        }

        /// Just go to the next block if there aren't any new records in the current one.
        if (!inserted_count)
            continue;

        size_t all_columns = block.columns();
        for (size_t i = 0; i < all_columns; ++i)
            block.getByPosition(i).column = block.getByPosition(i).column->filter(filter, inserted_count);

        return block;
    }
}

ConstColumnPlainPtrs LimitByBlockInputStream::getKeyColumns(Block & block) const
{
    ConstColumnPlainPtrs column_ptrs;
    column_ptrs.reserve(columns_names.size());

    for (const auto & name : columns_names)
    {
        auto & column = block.getByName(name).column;

        /// Ignore all constant columns.
        if (!column->isConst())
            column_ptrs.emplace_back(column.get());
    }

    return column_ptrs;
}

}
```

Review comment on getID(): The getID() mechanism is experimental and not used in production. Despite this, it is better to implement this method properly.

Review comment on getKeyColumns(): This method will show up in the profiler in the case of very small blocks. OK for now.

Review comment: Woo, this is a really nice feature for me. Thanks, guys!
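readImpl() above collapses each row's values across all key columns into a single 128-bit SipHash digest and counts occurrences of that digest, rather than materializing a tuple key. A rough standalone sketch of the same composite-key idea, using an FNV-style combine of std::hash values in place of SipHash (the rowKey helper is hypothetical; a real implementation wants a wide, collision-resistant digest such as the UInt128 used here):

```cpp
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Fold each key column's value for one row into a single hash state,
// mirroring the updateHashWithValue() loop in readImpl().
std::uint64_t rowKey(const std::vector<std::string> & key_values)
{
    std::uint64_t h = 1469598103934665603ULL;  // FNV offset basis
    for (const auto & value : key_values)
        h = (h ^ std::hash<std::string>{}(value)) * 1099511628211ULL;
    return h;
}

int main()
{
    // LIMIT 1 BY (a, b): rows with the same (a, b) pair map to the same key.
    std::unordered_map<std::uint64_t, std::uint64_t> keys_counts;
    const std::size_t group_size = 1;

    std::vector<std::vector<std::string>> rows = {{"x", "1"}, {"x", "1"}, {"x", "2"}};

    for (const auto & row : rows)
        std::cout << (keys_counts[rowKey(row)]++ < group_size) << ' ';
    std::cout << '\n';  // prints: 1 0 1
}
```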
InterpreterSelectQuery.cpp:

```diff
@@ -3,6 +3,7 @@
 #include <DB/DataStreams/ExpressionBlockInputStream.h>
 #include <DB/DataStreams/FilterBlockInputStream.h>
 #include <DB/DataStreams/LimitBlockInputStream.h>
+#include <DB/DataStreams/LimitByBlockInputStream.h>
 #include <DB/DataStreams/PartialSortingBlockInputStream.h>
 #include <DB/DataStreams/MergeSortingBlockInputStream.h>
 #include <DB/DataStreams/MergingSortedBlockInputStream.h>
```
```diff
@@ -190,19 +191,25 @@ void InterpreterSelectQuery::initQueryAnalyzer()

 InterpreterSelectQuery::InterpreterSelectQuery(ASTPtr query_ptr_, const Context & context_, QueryProcessingStage::Enum to_stage_,
     size_t subquery_depth_, BlockInputStreamPtr input_)
-    : query_ptr(query_ptr_), query(typeid_cast<ASTSelectQuery &>(*query_ptr)),
-    context(context_), to_stage(to_stage_), subquery_depth(subquery_depth_),
-    is_first_select_inside_union_all(query.isUnionAllHead()),
-    log(&Logger::get("InterpreterSelectQuery"))
+    : query_ptr(query_ptr_)
+    , query(typeid_cast<ASTSelectQuery &>(*query_ptr))
+    , context(context_)
+    , to_stage(to_stage_)
+    , subquery_depth(subquery_depth_)
+    , is_first_select_inside_union_all(query.isUnionAllHead())
+    , log(&Logger::get("InterpreterSelectQuery"))
 {
     init(input_);
 }

 InterpreterSelectQuery::InterpreterSelectQuery(OnlyAnalyzeTag, ASTPtr query_ptr_, const Context & context_)
-    : query_ptr(query_ptr_), query(typeid_cast<ASTSelectQuery &>(*query_ptr)),
-    context(context_), to_stage(QueryProcessingStage::Complete), subquery_depth(0),
-    is_first_select_inside_union_all(false), only_analyze(true),
-    log(&Logger::get("InterpreterSelectQuery"))
+    : query_ptr(query_ptr_)
+    , query(typeid_cast<ASTSelectQuery &>(*query_ptr))
+    , context(context_)
+    , to_stage(QueryProcessingStage::Complete)
+    , subquery_depth(0)
+    , is_first_select_inside_union_all(false), only_analyze(true)
+    , log(&Logger::get("InterpreterSelectQuery"))
 {
     init({});
 }
```
```diff
@@ -217,10 +224,14 @@ InterpreterSelectQuery::InterpreterSelectQuery(ASTPtr query_ptr_, const Context
 InterpreterSelectQuery::InterpreterSelectQuery(ASTPtr query_ptr_, const Context & context_,
     const Names & required_column_names_,
     const NamesAndTypesList & table_column_names_, QueryProcessingStage::Enum to_stage_, size_t subquery_depth_, BlockInputStreamPtr input_)
-    : query_ptr(query_ptr_), query(typeid_cast<ASTSelectQuery &>(*query_ptr)),
-    context(context_), to_stage(to_stage_), subquery_depth(subquery_depth_), table_column_names(table_column_names_),
-    is_first_select_inside_union_all(query.isUnionAllHead()),
-    log(&Logger::get("InterpreterSelectQuery"))
+    : query_ptr(query_ptr_)
+    , query(typeid_cast<ASTSelectQuery &>(*query_ptr))
+    , context(context_)
+    , to_stage(to_stage_)
+    , subquery_depth(subquery_depth_)
+    , table_column_names(table_column_names_)
+    , is_first_select_inside_union_all(query.isUnionAllHead())
+    , log(&Logger::get("InterpreterSelectQuery"))
 {
     init(input_, required_column_names_);
 }
```
```diff
@@ -305,7 +316,7 @@ void InterpreterSelectQuery::getDatabaseAndTableNames(String & database_name, St
 DataTypes InterpreterSelectQuery::getReturnTypes()
 {
     DataTypes res;
-    NamesAndTypesList columns = query_analyzer->getSelectSampleBlock().getColumnsList();
+    const NamesAndTypesList & columns = query_analyzer->getSelectSampleBlock().getColumnsList();
     for (auto & column : columns)
         res.push_back(column.type);
```
```diff
@@ -553,8 +564,7 @@ void InterpreterSelectQuery::executeSingleQuery()
      * but there is an ORDER or LIMIT,
      * then perform the preliminary sorting and LIMIT on the remote server.
      */
-    if (!second_stage
-        && !need_aggregate && !has_having)
+    if (!second_stage && !need_aggregate && !has_having)
     {
         if (has_order_by)
             executeOrder();
```
```diff
@@ -619,21 +629,28 @@ void InterpreterSelectQuery::executeSingleQuery()
     /** Optimization: if there are several sources and there is a LIMIT, first apply a preliminary LIMIT
      * that restricts the number of records in each source to offset + limit.
      */
-    if (query.limit_length && hasMoreThanOneStream() && !query.distinct)
+    if (query.limit_length && hasMoreThanOneStream() && !query.distinct && !query.limit_by_expression_list)
         executePreLimit();

     if (need_second_distinct_pass)
         union_within_single_query = true;

+    /// To execute LIMIT BY we should merge all streams together.
+    if (query.limit_by_expression_list && hasMoreThanOneStream())
+        union_within_single_query = true;
+
     if (union_within_single_query || stream_with_non_joined_data)
         executeUnion();

     if (streams.size() == 1)
     {
-        /// If there was more than one source, DISTINCT needs to be performed once more after merging them.
+        /** If there was more than one stream,
+         * then DISTINCT needs to be performed once again after merging all streams.
+         */
         if (need_second_distinct_pass)
             executeDistinct(false, Names());

+        executeLimitBy();
         executeLimit();
     }
 }
```

Review exchange on the stream merge for LIMIT BY:

Reviewer: I do not understand.

Author: We should merge all streams together, or the condition streams.size() == 1 on line 645 will be false and executeLimit() won't be called.
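The exchange above is the heart of this hunk: LimitByBlockInputStream keeps its per-key counters as instance state, so giving each of several streams its own instance would let every stream admit up to group_size rows per key. A small sketch of that over-delivery, assuming two independent per-stream counters (illustrative only, not PR code):

```cpp
#include <cstddef>
#include <iostream>
#include <unordered_map>
#include <vector>

// Each "stream" gets its own counter map, as it would if LIMIT BY ran
// before the union. With LIMIT 2 BY key and the key 7 split across two
// streams, 4 rows survive instead of the expected 2.
int main()
{
    const std::size_t group_size = 2;
    std::vector<std::vector<int>> streams = {{7, 7, 7}, {7, 7}};

    std::size_t total_kept = 0;
    for (const auto & stream : streams)
    {
        std::unordered_map<int, std::size_t> counts;  // per-stream state
        for (int key : stream)
            if (counts[key]++ < group_size)
                ++total_kept;
    }

    std::cout << total_kept << '\n';  // prints: 4
}
```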
```diff
@@ -770,7 +787,7 @@ QueryProcessingStage::Enum InterpreterSelectQuery::executeFetchColumns()
     size_t limit_offset = 0;
     getLimitLengthAndOffset(query, limit_length, limit_offset);

-    /** Optimization: if DISTINCT, WHERE, GROUP, HAVING and ORDER are not specified, but LIMIT is, and limit + offset < max_block_size,
+    /** Optimization: if DISTINCT, WHERE, GROUP, HAVING, ORDER and LIMIT BY are not specified, but LIMIT is, and limit + offset < max_block_size,
      * then use limit + offset as the block size (so as not to read more from the table than requested),
      * and also set the number of threads to 1.
      */
```
```diff
@@ -780,6 +797,7 @@ QueryProcessingStage::Enum InterpreterSelectQuery::executeFetchColumns()
         && !query.group_expression_list
         && !query.having_expression
         && !query.order_expression_list
+        && !query.limit_by_expression_list
         && query.limit_length
         && !query_analyzer->hasAggregation()
         && limit_length + limit_offset < settings.max_block_size)
```
```diff
@@ -1024,9 +1042,9 @@ static SortDescription getSortDescription(ASTSelectQuery & query)

 static size_t getLimitForSorting(ASTSelectQuery & query)
 {
-    /// If there is a LIMIT and no DISTINCT, a partial sort can be done.
+    /// Partial sort can be done if there is LIMIT but no DISTINCT or LIMIT BY.
     size_t limit = 0;
-    if (!query.distinct)
+    if (!query.distinct && !query.limit_by_expression_list)
     {
         size_t limit_length = 0;
         size_t limit_offset = 0;
```
```diff
@@ -1156,7 +1174,7 @@ void InterpreterSelectQuery::executePreLimit()
 {
     transformStreams([&](auto & stream)
     {
-        stream = std::make_shared<LimitBlockInputStream>(stream, limit_length + limit_offset, 0);
+        stream = std::make_shared<LimitBlockInputStream>(stream, limit_length + limit_offset, false);
     });

     if (hasMoreThanOneStream())
```
```diff
@@ -1165,6 +1183,28 @@
 }


+void InterpreterSelectQuery::executeLimitBy()
+{
+    if (!query.limit_by_value)
+        return;
+
+    Names columns;
+    size_t value = safeGet<UInt64>(typeid_cast<ASTLiteral &>(*query.limit_by_value).value);
+
+    for (const auto & elem : query.limit_by_expression_list->children)
+    {
+        columns.emplace_back(elem->getAliasOrColumnName());
+    }
+
+    transformStreams([&](auto & stream)
+    {
+        stream = std::make_shared<LimitByBlockInputStream>(
+            stream, value, columns
+        );
+    });
+}
+
+
 void InterpreterSelectQuery::executeLimit()
 {
     size_t limit_length = 0;
```

Review comment on executeLimitBy(): Missing comment with detailed description.
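executeLimitBy() is wired in just before executeLimit() in executeSingleQuery(), so the per-group cap is applied first and the plain LIMIT then truncates the already-filtered rows. A minimal standalone sketch of that ordering (illustrative values, not PR code):

```cpp
#include <cstddef>
#include <iostream>
#include <unordered_map>
#include <vector>

int main()
{
    // Roughly: SELECT Num FROM T LIMIT 2 BY Num LIMIT 5
    std::vector<int> input = {1, 1, 3, 3, 3, 4, 4, 5, 7, 7, 7, 7};
    const std::size_t group_size = 2;  // LIMIT 2 BY Num
    const std::size_t limit = 5;       // LIMIT 5

    std::unordered_map<int, std::size_t> counts;
    std::vector<int> result;

    for (int v : input)
        if (counts[v]++ < group_size)  // LIMIT BY stage runs first
            result.push_back(v);

    if (result.size() > limit)         // plain LIMIT stage runs afterwards
        result.resize(limit);

    for (int v : result)
        std::cout << v << ' ';         // prints: 1 1 3 3 4
    std::cout << '\n';
}
```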