Skip to content

Commit

Permalink
osd doc mon mgr: To milliseconds for config value, user input and thr…
Browse files Browse the repository at this point in the history
…eshold out

Signed-off-by: David Zafman <dzafman@redhat.com>
  • Loading branch information
dzafman committed Sep 4, 2019
1 parent 9d02e5d commit 5f83a61
Show file tree
Hide file tree
Showing 7 changed files with 31 additions and 16 deletions.
2 changes: 1 addition & 1 deletion PendingReleaseNotes
Expand Up @@ -116,7 +116,7 @@
option ``mon_warn_on_slow_ping_ratio`` specifies a percentage of
``osd_heartbeat_grace`` to determine the threshold. A value of zero
disables the warning. New configuration option
``mon_warn_on_slow_ping_time`` specified in microseconds over-rides the
``mon_warn_on_slow_ping_time``, specified in milliseconds, overrides the
computed value and causes a warning
when OSD heartbeat pings take longer than the specified amount.
The new admin command ``ceph daemon mgr.# dump_osd_network [threshold]`` will
Expand Down
2 changes: 1 addition & 1 deletion doc/rados/configuration/mon-config-ref.rst
Expand Up @@ -407,7 +407,7 @@ by setting it in the ``[mon]`` section of the configuration file.
:Description: Override ``mon warn on slow ping ratio`` with a specific value.
Issue a ``HEALTH_WARN`` in the cluster log if any heartbeat
between OSDs exceeds ``mon warn on slow ping time``
microseconds. The default is 0 (disabled).
milliseconds. The default is 0 (disabled).
:Type: Integer
:Default: ``0``

Expand Down
18 changes: 12 additions & 6 deletions qa/standalone/misc/network-ping.sh
Expand Up @@ -43,11 +43,11 @@ function TEST_network_ping_test1() {

CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) dump_osd_network | tee $dir/json
test "$(cat $dir/json | jq '.entries | length')" = "0" || return 1
test "$(cat $dir/json | jq '.threshold')" = "1000000" || return 1
test "$(cat $dir/json | jq '.threshold')" = "1000" || return 1

CEPH_ARGS='' ceph daemon $(get_asok_path mgr.x) dump_osd_network | tee $dir/json
test "$(cat $dir/json | jq '.entries | length')" = "0" || return 1
test "$(cat $dir/json | jq '.threshold')" = "1000000" || return 1
test "$(cat $dir/json | jq '.threshold')" = "1000" || return 1

CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) dump_osd_network 0 | tee $dir/json
test "$(cat $dir/json | jq '.entries | length')" = "4" || return 1
Expand All @@ -62,11 +62,11 @@ function TEST_network_ping_test1() {
flush_pg_stats
CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) dump_osd_network | tee $dir/json
test "$(cat $dir/json | jq '.entries | length')" = "0" || return 1
test "$(cat $dir/json | jq '.threshold')" = "1000000" || return 1
test "$(cat $dir/json | jq '.threshold')" = "1000" || return 1

CEPH_ARGS='' ceph daemon $(get_asok_path mgr.x) dump_osd_network | tee $dir/json
test "$(cat $dir/json | jq '.entries | length')" = "0" || return 1
test "$(cat $dir/json | jq '.threshold')" = "1000000" || return 1
test "$(cat $dir/json | jq '.threshold')" = "1000" || return 1

CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) dump_osd_network 0 | tee $dir/json
test "$(cat $dir/json | jq '.entries | length')" = "4" || return 1
Expand All @@ -82,11 +82,11 @@ function TEST_network_ping_test1() {
flush_pg_stats
CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) dump_osd_network | tee $dir/json
test "$(cat $dir/json | jq '.entries | length')" = "0" || return 1
test "$(cat $dir/json | jq '.threshold')" = "1000000" || return 1
test "$(cat $dir/json | jq '.threshold')" = "1000" || return 1

CEPH_ARGS='' ceph daemon $(get_asok_path mgr.x) dump_osd_network | tee $dir/json
test "$(cat $dir/json | jq '.entries | length')" = "0" || return 1
test "$(cat $dir/json | jq '.threshold')" = "1000000" || return 1
test "$(cat $dir/json | jq '.threshold')" = "1000" || return 1

CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) dump_osd_network 0 | tee $dir/json
test "$(cat $dir/json | jq '.entries | length')" = "4" || return 1
Expand All @@ -96,6 +96,12 @@ function TEST_network_ping_test1() {
test "$(cat $dir/json | jq '.entries | length')" = "12" || return 1
test "$(cat $dir/json | jq '.threshold')" = "0" || return 1

# Just check the threshold output matches the input
CEPH_ARGS='' ceph daemon $(get_asok_path mgr.x) dump_osd_network 99 | tee $dir/json
test "$(cat $dir/json | jq '.threshold')" = "99" || return 1
CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) dump_osd_network 98 | tee $dir/json
test "$(cat $dir/json | jq '.threshold')" = "98" || return 1

rm -f $dir/json
}

Expand Down
6 changes: 3 additions & 3 deletions src/common/options.cc
Expand Up @@ -1712,13 +1712,13 @@ std::vector<Option> get_global_options() {
.add_service("mgr")
.set_description("Issue a health warning if there are fewer OSDs than osd_pool_default_size"),

Option("mon_warn_on_slow_ping_time", Option::TYPE_UINT, Option::LEVEL_BASIC)
Option("mon_warn_on_slow_ping_time", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(0)
.add_service("mgr")
.set_description("Override mon_warn_on_slow_ping_ratio with specified threshold in microseconds")
.set_description("Override mon_warn_on_slow_ping_ratio with specified threshold in milliseconds")
.add_see_also("mon_warn_on_slow_ping_ratio"),

Option("mon_warn_on_slow_ping_ratio", Option::TYPE_FLOAT, Option::LEVEL_BASIC)
Option("mon_warn_on_slow_ping_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(.05)
.add_service("mgr")
.set_description("Issue a health warning if heartbeat ping longer than percentage of osd_heartbeat_grace")
Expand Down
8 changes: 6 additions & 2 deletions src/mgr/ClusterState.cc
Expand Up @@ -223,12 +223,16 @@ bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t&
int64_t value = 0;
// Default to health warning level if nothing specified
if (!(cmd_getval(g_ceph_context, cmdmap, "value", value))) {
value = static_cast<int64_t>(g_ceph_context->_conf.get_val<uint64_t>("mon_warn_on_slow_ping_time"));
// Convert milliseconds to microseconds
value = static_cast<int64_t>(g_ceph_context->_conf.get_val<uint64_t>("mon_warn_on_slow_ping_time")) * 1000;
if (value == 0) {
double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
}
} else {
// Convert user input to microseconds
value *= 1000;
}
if (value < 0)
value = 0;
Expand Down Expand Up @@ -321,7 +325,7 @@ bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t&

// Network ping times (1min 5min 15min)
f->open_object_section("network_ping_times");
f->dump_int("threshold", value);
f->dump_int("threshold", value / 1000);
f->open_array_section("entries");
for (auto &sitem : boost::adaptors::reverse(sorted)) {
ceph_assert(!value || sitem.pingtime >= value);
Expand Down
3 changes: 2 additions & 1 deletion src/mon/PGMap.cc
Expand Up @@ -2762,7 +2762,8 @@ void PGMap::get_health_checks(
}

// SLOW_PING_TIME
auto warn_slow_ping_time = cct->_conf.get_val<uint64_t>("mon_warn_on_slow_ping_time");
// Convert milliseconds to microseconds
auto warn_slow_ping_time = cct->_conf.get_val<uint64_t>("mon_warn_on_slow_ping_time") * 1000;
auto grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
if (warn_slow_ping_time == 0) {
double ratio = cct->_conf.get_val<double>("mon_warn_on_slow_ping_ratio");
Expand Down
8 changes: 6 additions & 2 deletions src/osd/OSD.cc
Expand Up @@ -2567,12 +2567,16 @@ will start to track new ops received afterwards.";
} else if (admin_command == "dump_osd_network") {
int64_t value = 0;
if (!(cmd_getval(cct, cmdmap, "value", value))) {
value = static_cast<int64_t>(g_conf().get_val<uint64_t>("mon_warn_on_slow_ping_time"));
// Convert milliseconds to microseconds
value = static_cast<int64_t>(g_conf().get_val<uint64_t>("mon_warn_on_slow_ping_time")) * 1000;
if (value == 0) {
double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
}
} else {
// Convert user input to microseconds
value *= 1000;
}
if (value < 0) value = 0;

Expand Down Expand Up @@ -2650,7 +2654,7 @@ will start to track new ops received afterwards.";
//
// Network ping times (1min 5min 15min)
f->open_object_section("network_ping_times");
f->dump_int("threshold", value);
f->dump_int("threshold", value / 1000);
f->open_array_section("entries");
for (auto &sitem : boost::adaptors::reverse(sorted)) {
ceph_assert(sitem.pingtime >= value);
Expand Down

0 comments on commit 5f83a61

Please sign in to comment.