Skip to content

Commit

Permalink
Merge branch 'trinity'
Browse files Browse the repository at this point in the history
* trinity:
  MB-59880: Short circuit failover checks if failover already in progress

Change-Id: Idee09eabfbfc4ee98035ea76483e27fe034fb3ce
  • Loading branch information
BenHuddleston committed May 10, 2024
2 parents 52792a7 + 0ca1a67 commit 209d793
Showing 1 changed file with 24 additions and 21 deletions.
45 changes: 24 additions & 21 deletions apps/ns_server/src/auto_failover.erl
Expand Up @@ -328,7 +328,7 @@ handle_info(tick_period_updated, #state{timeout = Timeout} = State) ->
NewState#state{auto_failover_logic_state = init_logic_state(NewState)}};

%% @doc Check if nodes should/could be auto-failovered on every tick
handle_info(tick, State0) ->
handle_info(tick, #state{auto_failover_ref = AutoFailoverRef} = State0) ->
Ref = send_tick_msg(State0),
Config = ns_config:get(),
Snapshot = ns_cluster_membership:get_snapshot(#{ns_config => Config}),
Expand Down Expand Up @@ -362,16 +362,28 @@ handle_info(tick, State0) ->
ns_cluster_membership:attach_node_uuids(CurrentlyDown, NodeUUIDs),
State#state.auto_failover_logic_state,
ServicesConfig),
NewState = lists:foldl(
fun(Action, S) ->
process_action(Action, S, DownNodes, NodeStatuses,
Snapshot)
end, State#state{auto_failover_logic_state = LogicState},
Actions),

NewState1 = update_reported_flags_by_actions(Actions, NewState),

{noreply, NewState1#state{tick_ref = Ref}};
%% Short circuit any actions that we may have taken if a failover is already
%% in progress. Failovers could not complete and we could prematurely send
%% failure emails if we found that we could not auto-failover anyways.
NewState = case AutoFailoverRef of
undefined ->
%% Time to take action
PostActionState =
lists:foldl(
fun(Action, S) ->
process_action(Action, S, DownNodes,
NodeStatuses, Snapshot)
end, State#state{
auto_failover_logic_state = LogicState},
Actions),
update_reported_flags_by_actions(Actions,
PostActionState);
_ ->
State#state{auto_failover_logic_state = LogicState}
end,

{noreply, NewState#state{tick_ref = Ref}};

handle_info({auto_failover_complete, Nodes, DownNodes, NodeStatuses, Result},
State) ->
Expand Down Expand Up @@ -642,9 +654,8 @@ try_autofailover(Nodes, DownNodeNames, DownNodes, NodeStatuses,
DownNodes, #{}, State)
end.

%% Failover not in progress, we can start one
trigger_autofailover(Nodes, NodeStatuses, DownNodeNames, DownNodes, Opts,
#state{auto_failover_ref = undefined} = State) ->
State) ->
FailoverReasons = failover_reasons(Nodes, DownNodes, NodeStatuses),
Self = self(),
Pid = erlang:spawn_link(
Expand All @@ -658,15 +669,7 @@ trigger_autofailover(Nodes, NodeStatuses, DownNodeNames, DownNodes, Opts,
Self ! {auto_failover_complete, Nodes, DownNodes,
NodeStatuses, R}
end),
State#state{auto_failover_ref = Pid};
%% Failover in progress already
trigger_autofailover(_Nodes, _NodeStatuses, _DownNodeNames, _DownNodes, _Opts,
#state{auto_failover_ref = _} = State) ->
?log_debug("Skipping failover request, one already in progress"),
%% Note that the return value is essentially the same here as it is for the
%% clause in which a failover is not already in progress. This is because we
%% rely on the next tick to "retry" the auto-failover if still relevant.
State.
State#state{auto_failover_ref = Pid}.

log_failover_success(Node, DownNodes, NodeStatuses) ->
case failover_reason(Node, DownNodes, NodeStatuses) of
Expand Down

0 comments on commit 209d793

Please sign in to comment.