Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Find big allocations without memory limits checks #48401

Merged
merged 3 commits into from
Apr 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
27 changes: 27 additions & 0 deletions src/Common/MemoryTracker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,26 @@ inline std::string_view toDescription(OvercommitResult result)
}
}

/// Debug aid: log (with a stack trace) any big allocation that goes through the
/// no-throw path, i.e. without checking memory limits. Such allocations may
/// easily lead to OOM and are otherwise hard to attribute.
/// Only active in ABORT_ON_LOGICAL_ERROR (debug/sanitizer) builds; a no-op in release.
inline void debugLogBigAllocationWithoutCheck(Int64 size [[maybe_unused]])
{
    /// Big allocations through allocNoThrow (without checking memory limits) may easily lead to OOM (and it's hard to debug).
    /// Let's find them.
#ifdef ABORT_ON_LOGICAL_ERROR
    /// Negative "size" means a deallocation — not interesting here.
    if (size < 0)
        return;

    constexpr Int64 threshold = 16 * 1024 * 1024; /// The choice is arbitrary (maybe we should decrease it)
    if (size < threshold)
        return;

    /// Block the tracker while logging so the log message's own allocations
    /// are not accounted (and cannot recurse back into the tracker).
    MemoryTrackerBlockerInThread blocker;
    LOG_TEST(&Poco::Logger::get("MemoryTracker"), "Too big allocation ({} bytes) without checking memory limits, "
             "it may lead to OOM. Stack trace: {}", size, StackTrace().toString());
#else
    return; /// Avoid trash logging in release builds
#endif
}

}

namespace ProfileEvents
Expand Down Expand Up @@ -235,7 +255,10 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT
formatReadableSizeWithBinarySuffix(current_hard_limit));
}
else
{
memory_limit_exceeded_ignored = true;
debugLogBigAllocationWithoutCheck(size);
}
}

Int64 limit_to_check = current_hard_limit;
Expand Down Expand Up @@ -303,7 +326,10 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT
}
}
else
{
memory_limit_exceeded_ignored = true;
debugLogBigAllocationWithoutCheck(size);
}
}

bool peak_updated = false;
Expand All @@ -323,6 +349,7 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT
{
bool log_memory_usage = false;
peak_updated = updatePeak(will_be, log_memory_usage);
debugLogBigAllocationWithoutCheck(size);
}
}

Expand Down
4 changes: 2 additions & 2 deletions src/Interpreters/Session.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ class NamedSessionsStorage
if (it == sessions.end())
{
if (throw_if_not_found)
throw Exception(ErrorCodes::SESSION_NOT_FOUND, "Session not found.");
throw Exception(ErrorCodes::SESSION_NOT_FOUND, "Session {} not found", session_id);

/// Create a new session from current context.
auto context = Context::createCopy(global_context);
Expand All @@ -129,7 +129,7 @@ class NamedSessionsStorage
LOG_TEST(log, "Reuse session from storage with session_id: {}, user_id: {}", key.second, key.first);

if (!session.unique())
throw Exception(ErrorCodes::SESSION_IS_LOCKED, "Session is locked by a concurrent client.");
throw Exception(ErrorCodes::SESSION_IS_LOCKED, "Session {} is locked by a concurrent client", session_id);
return {session, false};
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ void StorageEmbeddedRocksDB::mutate(const MutationCommands & commands, ContextPt

assert(commands.front().type == MutationCommand::Type::UPDATE);
if (commands.front().column_to_update_expression.contains(primary_key))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Primary key cannot be updated");
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Primary key cannot be updated (cannot update column {})", primary_key);

auto interpreter = std::make_unique<MutationsInterpreter>(
storage_ptr, metadata_snapshot, commands, context_, /*can_execute_*/ true, /*return_all_columns*/ true);
Expand Down
2 changes: 1 addition & 1 deletion src/Storages/StorageKeeperMap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -827,7 +827,7 @@ void StorageKeeperMap::mutate(const MutationCommands & commands, ContextPtr loca

assert(commands.front().type == MutationCommand::Type::UPDATE);
if (commands.front().column_to_update_expression.contains(primary_key))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Primary key cannot be updated");
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Primary key cannot be updated (cannot update column {})", primary_key);

auto interpreter = std::make_unique<MutationsInterpreter>(
storage_ptr, metadata_snapshot, commands, local_context, /*can_execute_*/ true, /*return_all_columns*/ true);
Expand Down
16 changes: 14 additions & 2 deletions tests/clickhouse-test
Original file line number Diff line number Diff line change
Expand Up @@ -2076,15 +2076,27 @@ def reportLogStats(args):
'AST is too deep. Maximum: {}', 'Array sizes are too large: {}', 'Unable to connect to HDFS: {}',
'Shutdown is called for table', 'File is not inside {}',
'Table {} doesn''t exist', 'Database {} doesn''t exist', 'Table {}.{} doesn''t exist',
'File {} doesn''t exist', 'No such attribute ''{}''', 'User name ''{}'' is reserved'
'File {} doesn''t exist', 'No such attribute ''{}''', 'User name ''{}'' is reserved',
'Could not find table: {}', 'Detached part "{}" not found', 'Unknown data type family: {}',
'Unknown input format {}', 'Cannot UPDATE key column {}', 'Substitution {} is not set',
'Cannot OPTIMIZE table: {}', 'User name is empty', 'Table name is empty', 'AST is too big. Maximum: {}',
'Unsupported cipher mode', 'Unknown explain kind ''{}''', 'Table {} was suddenly removed',
'No cache found by path: {}', 'No such column {} in table {}', 'There is no port named {}',
'Function {} cannot resize {}', 'Function {} is not parametric', 'Unknown key attribute ''{}''',
'Transaction was cancelled', 'Unknown parent id {}', 'Session {} not found', 'Mutation {} was killed',
'Table {}.{} doesn''t exist.', 'Table is not initialized yet', '{} is not an identifier',
'Column ''{}'' already exists', 'No macro {} in config', 'Invalid origin H3 index: {}',
'Invalid session timeout: ''{}''', 'Tuple cannot be empty', 'Database name is empty',
'Table {} is not a Dictionary', 'Expected function, got: {}', 'Unknown identifier: ''{}''',
'Failed to {} input ''{}''', '{}.{} is not a VIEW', 'Cannot convert NULL to {}', 'Dictionary {} doesn''t exist'
) AS known_short_messages
SELECT count() AS c, message_format_string, substr(any(message), 1, 120)
FROM system.text_log
WHERE (now() - toIntervalMinute(120)) < event_time
AND (length(message_format_string) < 16
OR (length(message_format_string) < 30 AND message ilike '%DB::Exception%'))
AND message_format_string NOT IN known_short_messages
GROUP BY message_format_string ORDER BY c DESC LIMIT 30 FORMAT TSVWithNamesAndTypes
GROUP BY message_format_string ORDER BY c DESC LIMIT 50 FORMAT TSVWithNamesAndTypes
"""
value = clickhouse_execute(args, query).decode(errors="replace")
print("\nTop short messages:\n")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
runtime messages 0.001
runtime exceptions 0.05
messages shorter than 10 10
messages shorter than 16 40
exceptions shorter than 30 120
messages shorter than 10 0
messages shorter than 16 2
exceptions shorter than 30 27
noisy messages 0.3
noisy Trace messages 0.16
noisy Debug messages 0.09
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,52 @@ select 'runtime messages', max2(coalesce(sum(length(message_format_string) = 0)
-- Check the same for exceptions. The value was 0.03
select 'runtime exceptions', max2(coalesce(sum(length(message_format_string) = 0) / countOrNull(), 0), 0.05) from logs where message like '%DB::Exception%';

-- FIXME some of the following messages are not informative and it has to be fixed
create temporary table known_short_messages (s String) as select * from (select
['', '({}) Keys: {}', '({}) {}', 'Aggregating', 'Became leader', 'Cleaning queue', 'Creating set.',
'Cyclic aliases', 'Detaching {}', 'Executing {}', 'Fire events: {}', 'Found part {}', 'Loaded queue',
'No sharding key', 'No tables', 'Query: {}', 'Removed', 'Removed part {}', 'Removing parts.',
'Request URI: {}', 'Sending part {}', 'Sent handshake', 'Starting {}', 'Will mimic {}', 'Writing to {}',
'dropIfEmpty', 'loadAll {}', '{} ({}:{})', '{} -> {}', '{} {}', '{}: {}', 'Query was cancelled',
'Table {} already exists.', '{}%', 'Cancelled merging parts', 'All replicas are lost',
'Cancelled mutating parts', 'Read object: {}', 'New segment: {}', 'Unknown geometry type {}',
'Table {} is not replicated', '{} {}.{} already exists', 'Attempt to read after eof',
'Replica {} already exists', 'Convert overflow', 'key must be a tuple', 'Division by zero',
'No part {} in committed state', 'Files set to {}', 'Bytes set to {}', 'Sharding key {} is not used',
'Cannot parse datetime', 'Bad get: has {}, requested {}', 'There is no {} in {}', 'Numeric overflow',
'Polygon is not valid: {}', 'Decimal math overflow', '{} only accepts maps', 'Dictionary ({}) not found',
'Unknown format {}', 'Invalid IPv4 value', 'Invalid IPv6 value', 'Unknown setting {}',
'Unknown table function {}', 'Database {} already exists.', 'Table {} doesn''t exist',
'Invalid credentials', 'Part {} already exists', 'Invalid mode: {}', 'Log pulling is cancelled',
'JOIN {} cannot get JOIN keys', 'Unknown function {}{}', 'Cannot parse IPv6 {}',
'Not found address of host: {}', '{} must contain a tuple', 'Unknown codec family: {}',
'Expected const String column', 'Invalid partition format: {}', 'Cannot parse IPv4 {}',
'AST is too deep. Maximum: {}', 'Array sizes are too large: {}', 'Unable to connect to HDFS: {}',
'Shutdown is called for table', 'File is not inside {}',
'Table {} doesn''t exist', 'Database {} doesn''t exist', 'Table {}.{} doesn''t exist',
'File {} doesn''t exist', 'No such attribute ''{}''', 'User name ''{}'' is reserved',
'Could not find table: {}', 'Detached part "{}" not found', 'Unknown data type family: {}',
'Unknown input format {}', 'Cannot UPDATE key column {}', 'Substitution {} is not set',
'Cannot OPTIMIZE table: {}', 'User name is empty', 'Table name is empty', 'AST is too big. Maximum: {}',
'Unsupported cipher mode', 'Unknown explain kind ''{}''', 'Table {} was suddenly removed',
'No cache found by path: {}', 'No such column {} in table {}', 'There is no port named {}',
'Function {} cannot resize {}', 'Function {} is not parametric', 'Unknown key attribute ''{}''',
'Transaction was cancelled', 'Unknown parent id {}', 'Session {} not found', 'Mutation {} was killed',
'Table {}.{} doesn''t exist.', 'Table is not initialized yet', '{} is not an identifier',
'Column ''{}'' already exists', 'No macro {} in config', 'Invalid origin H3 index: {}',
'Invalid session timeout: ''{}''', 'Tuple cannot be empty', 'Database name is empty',
'Table {} is not a Dictionary', 'Expected function, got: {}', 'Unknown identifier: ''{}''',
'Failed to {} input ''{}''', '{}.{} is not a VIEW', 'Cannot convert NULL to {}', 'Dictionary {} doesn''t exist'
] as arr) array join arr;

-- Check that we don't have too many short meaningless message patterns.
select 'messages shorter than 10', max2(countDistinctOrDefault(message_format_string), 10) from logs where length(message_format_string) < 10;
select 'messages shorter than 10', max2(countDistinctOrDefault(message_format_string), 0) from logs where length(message_format_string) < 10 and message_format_string not in known_short_messages;

-- Same as above. Feel free to update the threshold or remove this query if really necessary
select 'messages shorter than 16', max2(countDistinctOrDefault(message_format_string), 40) from logs where length(message_format_string) < 16;
select 'messages shorter than 16', max2(countDistinctOrDefault(message_format_string), 2) from logs where length(message_format_string) < 16 and message_format_string not in known_short_messages;

-- Same as above, but exceptions must be more informative. Feel free to update the threshold or remove this query if really necessary
select 'exceptions shorter than 30', max2(countDistinctOrDefault(message_format_string), 120) from logs where length(message_format_string) < 30 and message ilike '%DB::Exception%';
select 'exceptions shorter than 30', max2(countDistinctOrDefault(message_format_string), 27) from logs where length(message_format_string) < 30 and message ilike '%DB::Exception%' and message_format_string not in known_short_messages;


-- Avoid too noisy messages: top 1 message frequency must be less than 30%. We should reduce the threshold
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)


echo "Using non-existent session with the 'session_check' flag will throw exception:"
${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&session_id=nonexistent&session_check=1" --data-binary "SELECT 1" | grep -c -F 'Session not found'
${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&session_id=nonexistent&session_check=1" --data-binary "SELECT 1" | grep -c -F 'SESSION_NOT_FOUND'

echo "Using non-existent session without the 'session_check' flag will create a new session:"
${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&session_id=${CLICKHOUSE_DATABASE}_1" --data-binary "SELECT 1"
Expand All @@ -30,7 +30,7 @@ ${CLICKHOUSE_CLIENT} --multiquery --query "DROP USER IF EXISTS test_00463; CREAT
${CLICKHOUSE_CURL} -sS -X POST "${CLICKHOUSE_URL}&session_id=${CLICKHOUSE_DATABASE}_6&session_timeout=600" --data-binary "CREATE TEMPORARY TABLE t (s String)"
${CLICKHOUSE_CURL} -sS -X POST "${CLICKHOUSE_URL}&session_id=${CLICKHOUSE_DATABASE}_6" --data-binary "INSERT INTO t VALUES ('Hello')"

${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&user=test_00463&session_id=${CLICKHOUSE_DATABASE}_6&session_check=1" --data-binary "SELECT 1" | grep -c -F 'Session not found'
${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&user=test_00463&session_id=${CLICKHOUSE_DATABASE}_6&session_check=1" --data-binary "SELECT 1" | grep -c -F 'SESSION_NOT_FOUND'
${CLICKHOUSE_CURL} -sS -X POST "${CLICKHOUSE_URL}&user=test_00463&session_id=${CLICKHOUSE_DATABASE}_6&session_timeout=600" --data-binary "CREATE TEMPORARY TABLE t (s String)"
${CLICKHOUSE_CURL} -sS -X POST "${CLICKHOUSE_URL}&user=test_00463&session_id=${CLICKHOUSE_DATABASE}_6" --data-binary "INSERT INTO t VALUES ('World')"

Expand All @@ -53,7 +53,7 @@ do
${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&session_id=${CLICKHOUSE_DATABASE}_7&session_timeout=1" --data-binary "SELECT 1"
${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&session_id=${CLICKHOUSE_DATABASE}_7&session_check=1" --data-binary "SELECT 1"
sleep 3
${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&session_id=${CLICKHOUSE_DATABASE}_7&session_check=1" --data-binary "SELECT 1" | grep -c -F 'Session not found'
${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&session_id=${CLICKHOUSE_DATABASE}_7&session_check=1" --data-binary "SELECT 1" | grep -c -F 'SESSION_NOT_FOUND'
) | tr -d '\n' | grep -F '111' && break || sleep 1
done

Expand Down Expand Up @@ -82,6 +82,6 @@ do
${CLICKHOUSE_CLIENT} --query "SELECT count() > 0 FROM system.processes WHERE query_id = '${CLICKHOUSE_DATABASE}_9'" | grep -F '1' && break || sleep 1
done

${CLICKHOUSE_CURL} -sS -X POST "${CLICKHOUSE_URL}&session_id=${CLICKHOUSE_DATABASE}_9" --data-binary "SELECT 1" | grep -c -F 'Session is locked'
${CLICKHOUSE_CURL} -sS -X POST "${CLICKHOUSE_URL}&session_id=${CLICKHOUSE_DATABASE}_9" --data-binary "SELECT 1" | grep -c -F 'SESSION_IS_LOCKED'
${CLICKHOUSE_CLIENT} --multiquery --query "KILL QUERY WHERE query_id = '${CLICKHOUSE_DATABASE}_9' SYNC FORMAT Null";
wait