Skip to content

Commit

Permalink
MB-51374 do not call IsSafe api on the nodes that might be dead
Browse files Browse the repository at this point in the history
...but are not getting failed over because the maximum number of
failed over nodes has been reached.

Change-Id: I10173166043f72be5973e14486676ff75ed46e4b
Reviewed-on: https://review.couchbase.org/c/ns_server/+/172229
Well-Formed: Restriction Checker
Well-Formed: Build Bot <build@couchbase.com>
Tested-by: Build Bot <build@couchbase.com>
Reviewed-by: Dave Finlay <dave.finlay@couchbase.com>
  • Loading branch information
vzasade committed Mar 17, 2022
1 parent 7cf2a3c commit 2065366
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 6 deletions.
16 changes: 12 additions & 4 deletions src/auto_failover.erl
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
is_enabled/0,
is_enabled/1,
validate_kv/2,
validate_services_safety/2]).
validate_services_safety/3]).

%% For email alert notifications
-export([alert_keys/0]).
Expand Down Expand Up @@ -601,7 +601,8 @@ failover_nodes(Nodes, S, DownNodes, NodeStatuses, UpdateCount) ->
try_autofailover(Nodes, DownNodes, FailoverReasons) ->
case ns_cluster_membership:service_nodes(Nodes, kv) of
[] ->
{ValidNodes, UnsafeNodes} = validate_services_safety(Nodes, []),
{ValidNodes, UnsafeNodes} =
validate_services_safety(Nodes, DownNodes, []),
case ValidNodes of
[] ->
{ok, UnsafeNodes};
Expand Down Expand Up @@ -1097,8 +1098,15 @@ validate_services_safety([Service | Rest], DownNodes, UUIDDict, Cache) ->
{{error, Error}, Service, NewCache}
end.

validate_services_safety(DownNodes, KVNodes) ->
NonKVNodes = DownNodes -- KVNodes,
%% Returns the list of nodes that are OK to fail over and the list of
%% those that are not, taking into account the service safety check.
%% Note: the service safety check may involve an RPC to
%% the service on a remote node.
%% Note: NodesToFailover should be a subset of DownNodes.
-spec validate_services_safety([node()], [node()], [node()]) ->
{[node()], [{node(), {atom(), list()}}]}.
validate_services_safety(NodesToFailover, DownNodes, KVNodes) ->
NonKVNodes = NodesToFailover -- KVNodes,
UUIDDict = ns_config:get_node_uuid_map(ns_config:latest()),

{ValidNodes, UnsafeNodes, _} =
Expand Down
5 changes: 3 additions & 2 deletions src/failover.erl
Original file line number Diff line number Diff line change
Expand Up @@ -375,9 +375,10 @@ do_failover_bucket(membase, Bucket, BucketConfig, Nodes, Options) ->

failover_services(Nodes, _, #{skip_safety_check := true}) ->
{failover_services(Nodes), []};
failover_services(Nodes, KVNodes, #{auto := true}) ->
failover_services(Nodes, KVNodes, #{auto := true,
down_nodes := DownNodes}) ->
{ValidNodes, UnsafeNodes} =
auto_failover:validate_services_safety(Nodes, KVNodes),
auto_failover:validate_services_safety(Nodes, DownNodes, KVNodes),
{case ValidNodes of
[] ->
[];
Expand Down

0 comments on commit 2065366

Please sign in to comment.