From 4eb065d81a28046b76f001335de6d6e46d9438c0 Mon Sep 17 00:00:00 2001 From: Steve Watanabe Date: Thu, 2 May 2024 16:12:29 -0700 Subject: [PATCH] MB-61751: Add counters for [graceful] failover status This adds cm_failover_total and cm_graceful_failover_total stats. The counters for these stats were already in place. We now report them to prometheus. Change-Id: I4157c4a6aacd82acf790348f7698a84918e722e7 Reviewed-on: https://review.couchbase.org/c/ns_server/+/209431 Reviewed-by: Navdeep S Boparai Well-Formed: Build Bot Well-Formed: Restriction Checker Tested-by: Build Bot --- etc/metrics_metadata.json | 24 +++++++++++++++++++++++- src/ns_orchestrator.erl | 2 ++ src/ns_server_stats.erl | 36 +++++++++++++++++++++++++++--------- 3 files changed, 52 insertions(+), 10 deletions(-) diff --git a/etc/metrics_metadata.json b/etc/metrics_metadata.json index ad15c2ae66..d3b1007dd2 100644 --- a/etc/metrics_metadata.json +++ b/etc/metrics_metadata.json @@ -117,12 +117,34 @@ } ] }, + "cm_failover_total": { + "type": "counter", + "help": "Number of non-graceful failover results", + "added": "7.6.2", + "labels": [ + { + "name": "event", + "help": "failover result (initiated/completed/failed/stopped)" + } + ] + }, "cm_gc_duration_seconds": { "type": "histogram", "help": "Time to perform erlang garbage collection", "added": "7.6.0", "stability": "committed" }, + "cm_graceful_failover_total": { + "type": "counter", + "help": "Number of graceful failover results", + "added": "7.6.2", + "labels": [ + { + "name": "event", + "help": "graceful failover result (initiated/completed/failed/stopped)" + } + ] + }, "cm_http_requests_seconds": { "type": "histogram", "help": "Number of bucket HTTP requests", @@ -462,7 +484,7 @@ "labels": [ { "name": "event", - "help": "rebalance result (initiated/completed/failed/interrupted/stopped)" + "help": "rebalance result (initiated/completed/failed/stopped)" } ] }, diff --git a/src/ns_orchestrator.erl b/src/ns_orchestrator.erl index ad6b07fd4d..c8d9fb4394 100644 --- a/src/ns_orchestrator.erl +++ b/src/ns_orchestrator.erl @@ -1703,6 +1703,8 @@ rebalance_type2text(service_upgrade) -> <<"Service upgrade">>. update_rebalance_counters(Reason, #rebalancing_state{type = Type}) -> + %% If any new counter is added a corresponding convert_to_reported_event + %% must be added to ns_server_stats.erl. Counter = case Reason of normal -> diff --git a/src/ns_server_stats.erl b/src/ns_server_stats.erl index 47aeb427f6..e54ab8f1b2 100644 --- a/src/ns_server_stats.erl +++ b/src/ns_server_stats.erl @@ -403,9 +403,13 @@ report_ns_server_hc_stats(ReportFun) -> convert_to_reported_event(<<"start">>) -> <<"initiated">>; convert_to_reported_event(<<"success">>) -> <<"completed">>; convert_to_reported_event(<<"fail">>) -> <<"failed">>; -convert_to_reported_event(<<"interrupted">>) -> <<"interrupted">>; convert_to_reported_event(<<"stop">>) -> <<"stopped">>; -convert_to_reported_event(Other) -> Other. +%% We only want the orchestrator counters which use the above suffixes to +%% 'failover_'. The counters generated by the failover module also start +%% with 'failover_' but also include graceful failovers. Fortunately the +%% trailing portions of the 'failover_' stats don't overlap between the +%% two modules. +convert_to_reported_event(_) -> skip. %% Report cluster-wide stats (stored in chronicle). report_cluster_stats(ReportFun) -> @@ -413,13 +417,27 @@ report_cluster_stats(ReportFun) -> lists:foreach( fun ({Key, Val}) -> KeyBin = key_to_binary(Key), - case KeyBin of - <<"rebalance_", Event/binary>> -> - Label = [{<<"event">>, convert_to_reported_event(Event)}], - ReportFun({<<"cm">>, <<"rebalance_total">>, Label, Val}); - _ -> - ok - end + {Event, StatName} = + case KeyBin of + <<"rebalance_", Event0/binary>> -> + {convert_to_reported_event(Event0), + <<"rebalance_total">>}; + <<"failover_", Event0/binary>> -> + {convert_to_reported_event(Event0), + <<"failover_total">>}; + <<"graceful_failover_", Event0/binary>> -> + {convert_to_reported_event(Event0), + <<"graceful_failover_total">>}; + _ -> + {skip, undefined} + end, + case Event of + skip -> + ok; + _ -> + Label = [{<<"event">>, Event}], + ReportFun({<<"cm">>, StatName, Label, Val}) + end end, Counters). %% Delete stats for the specified bucket.