From ffbe9b83ea2f57ee5f2d5ddee1493873814f6a3c Mon Sep 17 00:00:00 2001 From: Dejan Mircevski Date: Sun, 8 Aug 2021 07:43:45 +0200 Subject: [PATCH] cql3: Demand ALLOW FILTERING for unlimited, sliced partitions When a query requests a partition slice but doesn't limit the number of partitions, require that it also says ALLOW FILTERING. Although do_filter() isn't invoked for such queries, the performance can still be unexpectedly slow, and we want to signal that to the user by demanding they explicitly say ALLOW FILTERING. Because we now reject queries that worked fine before, existing applications can break. Therefore, the behavior is controlled by a flag currently defaulting to off. We will default to "on" in the next Scylla version. Fixes #7608; see comments there for justification. Signed-off-by: Dejan Mircevski --- cql3/statements/raw/select_statement.hh | 5 +++- cql3/statements/select_statement.cc | 38 +++++++++++++++++++++---- db/config.cc | 5 ++++ db/config.hh | 1 + test/boost/restrictions_test.cc | 34 ++++++++++++++++++++++ test/cql-pytest/test_allow_filtering.py | 9 ++++-- 6 files changed, 82 insertions(+), 10 deletions(-) diff --git a/cql3/statements/raw/select_statement.hh b/cql3/statements/raw/select_statement.hh index bd35291f7f0f..e42abf1e8894 100644 --- a/cql3/statements/raw/select_statement.hh +++ b/cql3/statements/raw/select_statement.hh @@ -45,6 +45,7 @@ #include "cql3/statements/prepared_statement.hh" #include "cql3/relation.hh" #include "cql3/attributes.hh" +#include "db/config.hh" #include namespace cql3 { @@ -150,7 +151,9 @@ private: bool is_reversed(const schema& schema) const; /** If ALLOW FILTERING was not specified, this verifies that it is not needed */ - void check_needs_filtering(const restrictions::statement_restrictions& restrictions); + void check_needs_filtering( + const restrictions::statement_restrictions& restrictions, + db::tri_mode_restriction_t::mode strict_allow_filtering); void ensure_filtering_columns_retrieval(database& db, selection::selection& selection, diff --git a/cql3/statements/select_statement.cc b/cql3/statements/select_statement.cc index e5a69e78269f..68d10682a3a0 100644 --- a/cql3/statements/select_statement.cc +++ b/cql3/statements/select_statement.cc @@ -68,6 +68,8 @@ bool is_system_keyspace(std::string_view name); +static logging::logger mylog("select_statement"); + namespace cql3 { namespace statements { @@ -1413,7 +1415,7 @@ std::unique_ptr select_statement::prepare(database& db, cql_ is_reversed_ = is_reversed(*schema); } - check_needs_filtering(*restrictions); + check_needs_filtering(*restrictions, db.get_config().strict_allow_filtering()); ensure_filtering_columns_retrieval(db, *selection, *restrictions); auto group_by_cell_indices = ::make_shared>(prepare_group_by(*schema, *selection)); @@ -1627,15 +1629,39 @@ bool select_statement::is_reversed(const schema& schema) const { return is_reversed_; } +/// True iff restrictions require ALLOW FILTERING despite there being no coordinator-side filtering. +static bool needs_allow_filtering_anyway( + const restrictions::statement_restrictions& restrictions, + db::tri_mode_restriction_t::mode strict_allow_filtering) { + using flag_t = db::tri_mode_restriction_t::mode; + if (strict_allow_filtering == flag_t::FALSE) { + return false; + } + const auto& ck_restrictions = *restrictions.get_clustering_columns_restrictions(); + const auto& pk_restrictions = *restrictions.get_partition_key_restrictions(); + // Even if no filtering happens on the coordinator, we still warn about poor performance when partition + // slice is defined but in potentially unlimited number of partitions (see #7608). + if ((pk_restrictions.empty() || has_token(pk_restrictions.expression)) // Potentially unlimited partitions. + && !ck_restrictions.empty() // Slice defined. + && !restrictions.uses_secondary_indexing()) { // Base-table is used. (Index-table use always limits partitions.) + if (strict_allow_filtering == flag_t::WARN) { + mylog.warn("This query should use ALLOW FILTERING and will be rejected in future versions: {}", + restrictions.to_string()); + return false; + } + return true; + } + return false; +} + /** If ALLOW FILTERING was not specified, this verifies that it is not needed */ -void select_statement::check_needs_filtering(const restrictions::statement_restrictions& restrictions) +void select_statement::check_needs_filtering( + const restrictions::statement_restrictions& restrictions, + db::tri_mode_restriction_t::mode strict_allow_filtering) { // non-key-range non-indexed queries cannot involve filtering underneath if (!_parameters->allow_filtering() && (restrictions.is_key_range() || restrictions.uses_secondary_indexing())) { - // We will potentially filter data if either: - // - Have more than one IndexExpression - // - Have no index expression and the column filter is not the identity - if (restrictions.need_filtering()) { + if (restrictions.need_filtering() || needs_allow_filtering_anyway(restrictions, strict_allow_filtering)) { throw exceptions::invalid_request_exception( "Cannot execute this query as it might involve data filtering and " "thus may have unpredictable performance. If you want to execute " diff --git a/db/config.cc b/db/config.cc index 00897e100a10..8a01958b97bb 100644 --- a/db/config.cc +++ b/db/config.cc @@ -229,6 +229,10 @@ class convert> { #define str(x) #x #define _mk_init(name, type, deflt, status, desc, ...) , name(this, str(name), value_status::status, type(deflt), desc) +static db::tri_mode_restriction_t::mode strict_allow_filtering_default() { + return db::tri_mode_restriction_t::mode::WARN; // TODO: make it TRUE after Scylla 4.6. +} + db::config::config(std::shared_ptr exts) : utils::config_file() @@ -802,6 +806,7 @@ db::config::config(std::shared_ptr exts) "Maximum number of concurrent requests a single shard can handle before it starts shedding extra load. By default, no requests will be shed.") , cdc_dont_rewrite_streams(this, "cdc_dont_rewrite_streams", value_status::Used, false, "Disable rewriting streams from cdc_streams_descriptions to cdc_streams_descriptions_v2. Should not be necessary, but the procedure is expensive and prone to failures; this config option is left as a backdoor in case some user requires manual intervention.") + , strict_allow_filtering(this, "strict_allow_filtering", value_status::Used, strict_allow_filtering_default(), "Match Cassandra in requiring ALLOW FILTERING on slow queries. Can be true, false, or warn. When false, Scylla accepts some slow queries even without ALLOW FILTERING that Cassandra rejects. Warn is same as false, but with warning.") , alternator_port(this, "alternator_port", value_status::Used, 0, "Alternator API port") , alternator_https_port(this, "alternator_https_port", value_status::Used, 0, "Alternator API HTTPS port") , alternator_address(this, "alternator_address", value_status::Used, "0.0.0.0", "Alternator API listening address") diff --git a/db/config.hh b/db/config.hh index c87b813f9bd0..82f5b02a7c97 100644 --- a/db/config.hh +++ b/db/config.hh @@ -336,6 +336,7 @@ public: named_value schema_registry_grace_period; named_value max_concurrent_requests_per_shard; named_value cdc_dont_rewrite_streams; + named_value strict_allow_filtering; named_value alternator_port; named_value alternator_https_port; diff --git a/test/boost/restrictions_test.cc b/test/boost/restrictions_test.cc index 9c3d9a00bd00..2281186d2593 100644 --- a/test/boost/restrictions_test.cc +++ b/test/boost/restrictions_test.cc @@ -26,6 +26,7 @@ #include "cql3/cql_config.hh" #include "cql3/values.hh" +#include "db/config.hh" #include "test/lib/cql_assertions.hh" #include "test/lib/cql_test_env.hh" #include "test/lib/exception_utils.hh" @@ -927,3 +928,36 @@ SEASTAR_THREAD_TEST_CASE(tuples) { require_rows(e, "select q from t where q>(99) allow filtering", {}); }).get(); } + +SEASTAR_THREAD_TEST_CASE(strict_allow_filtering) { + auto cfg = make_shared(); + cfg->strict_allow_filtering(db::tri_mode_restriction_t::mode::TRUE); + do_with_cql_env_thread([](cql_test_env& e) { + cquery_nofail(e, "create table t (p int, c int, primary key (p,c))"); + require_rows(e, "select p from t where c>0 allow filtering", {}); + BOOST_REQUIRE_EXCEPTION( + e.execute_cql("select p from t where c>0").get(), + exceptions::invalid_request_exception, + exception_predicate::message_contains("use ALLOW FILTERING")); + }, cfg).get(); +} + +SEASTAR_THREAD_TEST_CASE(nonstrict_allow_filtering) { + auto cfg = make_shared(); + cfg->strict_allow_filtering(db::tri_mode_restriction_t::mode::FALSE); + do_with_cql_env_thread([](cql_test_env& e) { + cquery_nofail(e, "create table t (p int, c int, primary key (p,c))"); + require_rows(e, "select p from t where c>0 allow filtering", {}); + require_rows(e, "select p from t where c>0", {}); + }, cfg).get(); +} + +SEASTAR_THREAD_TEST_CASE(warn_strict_allow_filtering) { + auto cfg = make_shared(); + cfg->strict_allow_filtering(db::tri_mode_restriction_t::mode::WARN); + do_with_cql_env_thread([](cql_test_env& e) { + cquery_nofail(e, "create table t (p int, c int, primary key (p,c))"); + require_rows(e, "select p from t where c>0 allow filtering", {}); + require_rows(e, "select p from t where c>0", {}); + }, cfg).get(); +} diff --git a/test/cql-pytest/test_allow_filtering.py b/test/cql-pytest/test_allow_filtering.py index 338214d3ba7d..856c471b4fd2 100644 --- a/test/cql-pytest/test_allow_filtering.py +++ b/test/cql-pytest/test_allow_filtering.py @@ -121,7 +121,6 @@ def test_allow_filtering_regular_column(cql, table1): # partition it needs to inspect the promoted index and in the worst case, # also read a chunk of rows to look for the matching clustering-column value. # Reproduces issue #7608. -@pytest.mark.xfail(reason="issue #7608") def test_allow_filtering_clustering_key(cql, table1): check_af_mandatory(cql, table1, 'c=2', lambda row: row.c==2) check_af_mandatory(cql, table1, 'c>2', lambda row: row.c>2) @@ -136,7 +135,6 @@ def test_allow_filtering_clustering_key(cql, table1): # ranges - because although they are finite, they may still contain a lot # of partitions. # Reproduces issue #7608. -@pytest.mark.xfail(reason="issue #7608") def test_allow_filtering_clustering_key_token_range(cql, table1): # TODO: the current implementation of check_af_mandatory() doesn't know # how to request or compare results with the token, so we pass None to @@ -153,7 +151,6 @@ def test_allow_filtering_clustering_key_token_range(cql, table1): # this case. Note that 'c=2 AND k = 123' (without the "token"), which # is guaranteed to match just one partition, doesn't need ALLOW FILTERING. # Reproduces issue #7608. -@pytest.mark.xfail(reason="issue #7608") def test_allow_filtering_clustering_key_token_specific(cql, table1): check_af_mandatory(cql, table1, 'c=2 AND token(k) = 123', None) check_af_optional(cql, table1, 'c=2 AND k = 123', None) @@ -237,6 +234,12 @@ def table3(cql, test_keyspace): yield (table, everything) cql.execute("DROP TABLE " + table) +def test_allow_filtering_multi_column(cql, table3): + """Multi-column restrictions are just like other clustering restrictions""" + check_af_mandatory(cql, table3, '(c1,c2)=(10,10)', lambda row: row.c1==10 and row.c2==10) + check_af_optional(cql, table3, '(c1,c2)=(10,10) and k1=1 and k2=1', + lambda row: row.c1==10 and row.c2==10 and row.k1==1 and row.k2==1) + # In test_allow_filtering_indexed_a_and_k() above we noted that the # combination of an indexed column and the partition key can be searched # without filtering, and explained why. The same cannot be said for the