Disconnect of a node from a cluster of global nodes #6264

Merged
68 changes: 68 additions & 0 deletions lib/kernel/doc/src/global.xml
@@ -146,6 +146,74 @@
</desc>
</func>

<func>
<name name="disconnect" arity="0" since=""/>
<fsummary>Disconnect from all other nodes known to global</fsummary>
<desc>
<p>
Disconnect from all other nodes known to <c>global</c>. A list
of node names (in an unspecified order) is returned, corresponding
to the nodes that were disconnected. All disconnect operations
performed have completed when <c>global:disconnect/0</c> returns.
</p>

<p>
The disconnects are made in such a way that only the current
node is removed from the cluster of <c>global</c> nodes. If
<seeerl marker="#prevent_overlapping_partitions">
<c>prevent_overlapping_partitions</c></seeerl> is enabled and you
disconnect from other nodes in the cluster of <c>global</c> nodes
by other means, <c>global</c> on the other nodes may partition the
remaining nodes in order to ensure that no overlapping partitions
appear. Even if <c>prevent_overlapping_partitions</c> is disabled,
you should preferably use <c>global:disconnect/0</c> to remove the
current node from a cluster of <c>global</c> nodes, since otherwise
you most likely <em>will</em> create overlapping partitions, which
might <seeerl marker="#prevent_overlapping_partitions">cause
problems</seeerl>.
</p>

<p>
Note that if the node is going to be halted, there is <em>no</em>
need to remove it from the cluster of <c>global</c> nodes explicitly
by calling <c>global:disconnect/0</c> before halting it. The removal
from the cluster is taken care of automatically when the node halts,
regardless of whether <c>prevent_overlapping_partitions</c> is
enabled.
</p>

<p>
If the current node has been configured to be part of a
<seeerl marker="global_group"><i>global group</i></seeerl>, only
connected and/or synchronized nodes in that group are known to
<c>global</c>, so <c>global:disconnect/0</c> will <em>only</em>
disconnect from those nodes. If the current node is <em>not</em>
part of a <i>global group</i>, all
<seemfa marker="erts:erlang#nodes/0">connected visible nodes</seemfa>
are known to <c>global</c>, so <c>global:disconnect/0</c> will
disconnect from all of those nodes.
</p>
<p>
Note that information about connected nodes does not reach
<c>global</c> instantaneously, so the caller might see a node in
the result returned by
<seemfa marker="erts:erlang#nodes/0"><c>nodes()</c></seemfa> that is
not yet known to <c>global</c>. The disconnect operation will,
however, still not cause any overlapping partitions when
<c>prevent_overlapping_partitions</c> is enabled. If
<c>prevent_overlapping_partitions</c> is disabled, overlapping
partitions might form in this case.
</p>
<p>
Note that when <c>prevent_overlapping_partitions</c> is enabled,
you may see warning reports on other nodes when they detect that
the current node has disconnected. In this case these reports are
completely harmless and can be ignored.
</p>
</desc>
</func>

<func>
<name name="notify_all_name" arity="3" since=""/>
<fsummary>Name resolving function that notifies both pids.</fsummary>
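A minimal usage sketch of the documented API (not part of the patch): the function name leave_global_cluster/0 is hypothetical, and the global:sync() call is only an optional precaution against the timing window described above, where recently connected nodes may not yet be known to global.

    %% Take the local node out of the cluster of global nodes without
    %% halting it, e.g. before an orderly application shutdown.
    leave_global_cluster() ->
        ok = global:sync(),                 %% optional: let global catch up first
        Disconnected = global:disconnect(), %% nodes we disconnected from
        logger:info("Left global cluster; disconnected from ~p", [Disconnected]),
        Disconnected.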
36 changes: 34 additions & 2 deletions lib/kernel/src/global.erl
@@ -36,7 +36,8 @@
set_lock/1, set_lock/2, set_lock/3,
del_lock/1, del_lock/2,
trans/2, trans/3, trans/4,
-random_exit_name/3, random_notify_name/3, notify_all_name/3]).
+random_exit_name/3, random_notify_name/3, notify_all_name/3,
+disconnect/0]).

%% Internal exports
-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2,
@@ -484,6 +485,11 @@ trans(Id, Fun, Nodes, Retries) ->
info() ->
    gen_server:call(global_name_server, info, infinity).

-spec disconnect() -> [node()].

disconnect() ->
    gen_server:call(global_name_server, disconnect, infinity).

%%%-----------------------------------------------------------------
%%% Call-back functions from gen_server
%%%-----------------------------------------------------------------
@@ -821,6 +827,32 @@ handle_call(get_names_ext, _From, S) ->
handle_call(info, _From, S) ->
    {reply, S, S};

handle_call(disconnect, _From, #state{known = Known} = S0) ->
    %% Disconnect from all nodes global knows of without
    %% sending any lost_connection messages...
    Nodes = maps:fold(fun ({connection_id, N}, _, Ns) when is_atom(N) ->
                              case global_group:participant(N) of
                                  false ->
                                      Ns;
                                  true ->
                                      ?trace({'####', disconnect, {node,N}}),
                                      net_kernel:async_disconnect(N),
                                      [N|Ns]
                              end;
                          (_, _, Ns) ->
                              Ns
                      end, [], Known),
    S1 = lists:foldl(fun (N, SAcc0) ->
                             receive {nodedown, N, I} -> ok end,
                             ?trace({'####', nodedown, {node,N,I}}),
                             SAcc1 = trace_message(SAcc0, {nodedown, N, I}, []),
                             SAcc2 = handle_nodedown(N, SAcc1, ignore_node),
                             NewKnown = maps:remove({connection_id, N},
                                                    SAcc2#state.known),
                             SAcc2#state{known = NewKnown}
                     end, S0, Nodes),
    {reply, Nodes, S1};
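The clause above fans out asynchronous disconnect requests first and only then collects the corresponding nodedown notifications, so the disconnects proceed in parallel rather than one by one. A standalone sketch of the same fan-out/collect pattern, using only public APIs (a hypothetical module, not part of the patch; erlang:disconnect_node/1 and net_kernel:monitor_nodes/1 stand in for the internal async cast and global's own nodedown subscription):

    %% Hypothetical module, for illustration only. Disconnect from Nodes in
    %% parallel, then wait for one nodedown notification per node. Assumes
    %% every node in Nodes is currently connected, otherwise the receive
    %% below would block.
    -module(disconnect_sketch).
    -export([disconnect_all/1]).

    -spec disconnect_all([node()]) -> [node()].
    disconnect_all(Nodes) ->
        ok = net_kernel:monitor_nodes(true),
        _ = [erlang:disconnect_node(N) || N <- Nodes],          %% phase 1: fire all requests
        Down = [receive {nodedown, N} -> N end || N <- Nodes],  %% phase 2: collect, any order
        ok = net_kernel:monitor_nodes(false),
        Down.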

%% "High level trace". For troubleshooting only.
handle_call(high_level_trace_start, _From, S) ->
    S#state.the_locker ! {do_trace, true},
@@ -2641,7 +2673,7 @@ pid_locks(Ref) ->
     ref_is_locking(Ref, PidRefs)].

ref_is_locking(Ref, PidRefs) ->
    lists:keyfind(Ref, 2, PidRefs) =/= false.

handle_nodeup(Node, #state{the_locker = TheLocker,
                           resolvers = Rs,
39 changes: 23 additions & 16 deletions lib/kernel/src/net_kernel.erl
@@ -344,7 +344,7 @@ passive_cnct(Node) ->
disconnect(Node) -> request({disconnect, Node}).

async_disconnect(Node) ->
-   gen_server:cast(net_kernel, {disconnect, Node}).
+   gen_server:cast(net_kernel, {async_disconnect, Node}).

%% Should this node publish itself on Node?
publish_on_node(Node) when is_atom(Node) ->
@@ -747,7 +747,7 @@ handle_call({disconnect, Node}, From, State) when Node =:= node() ->
    async_reply({reply, false, State}, From);
handle_call({disconnect, Node}, From, State) ->
    verbose({disconnect, Node}, 1, State),
-   {Reply, State1} = do_disconnect(Node, State),
+   {Reply, State1} = do_disconnect(Node, State, false),
    async_reply({reply, Reply, State1}, From);

%%
@@ -914,11 +914,11 @@ handle_call(_Msg, _From, State) ->
%% handle_cast.
%% ------------------------------------------------------------

-handle_cast({disconnect, Node}, State) when Node =:= node() ->
+handle_cast({async_disconnect, Node}, State) when Node =:= node() ->
    {noreply, State};
-handle_cast({disconnect, Node}, State) ->
-   verbose({disconnect, Node}, 1, State),
-   {_Reply, State1} = do_disconnect(Node, State),
+handle_cast({async_disconnect, Node}, State) ->
+   verbose({async_disconnect, Node}, 1, State),
+   {_Reply, State1} = do_disconnect(Node, State, true),
    {noreply, State1};

handle_cast(_, State) ->
@@ -1172,7 +1172,7 @@ handle_info({From,registered_send,To,Mess},State) ->
handle_info({From,badcookie,_To,_Mess}, State) ->
    error_logger:error_msg("~n** Got OLD cookie from ~w~n",
                           [getnode(From)]),
-   {_Reply, State1} = do_disconnect(getnode(From), State),
+   {_Reply, State1} = do_disconnect(getnode(From), State, false),
    {noreply,State1};

%%
@@ -1636,23 +1636,30 @@ mk_monitor_nodes_error(_Flag, Opts) ->

% -------------------------------------------------------------

-do_disconnect(Node, State) ->
+do_disconnect(Node, State, Async) ->
    case ets:lookup(sys_dist, Node) of
        [Conn] when Conn#connection.state =:= up ->
-           disconnect_ctrlr(Conn#connection.ctrlr, State);
+           disconnect_ctrlr(Conn#connection.ctrlr, State, Async);
        [Conn] when Conn#connection.state =:= up_pending ->
-           disconnect_ctrlr(Conn#connection.ctrlr, State);
+           disconnect_ctrlr(Conn#connection.ctrlr, State, Async);
        _ ->
            {false, State}
    end.

-disconnect_ctrlr(Ctrlr, State) ->
+disconnect_ctrlr(Ctrlr, S0, Async) ->
    exit(Ctrlr, disconnect),
-   receive
-       {'EXIT',Ctrlr,Reason} ->
-           {_,State1} = handle_exit(Ctrlr, Reason, State),
-           {true, State1}
-   end.
+   S2 = case Async of
+            true ->
+                S0;
+            false ->
+                receive
+                    {'EXIT',Ctrlr,Reason} ->
+                        {_,S1} = handle_exit(Ctrlr, Reason, S0),
+                        S1
+                end
+        end,
+   {true, S2}.
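A rough sketch of the caller-visible difference between the two paths above (not part of the patch; both helper names are hypothetical): the synchronous request returns only after the connection controller has been taken down, while the asynchronous cast returns immediately and the caller waits for the nodedown itself, which is essentially what global's handle_call(disconnect, ...) clause does.

    %% Hypothetical helpers, for illustration only. Assume the calling process
    %% has subscribed with net_kernel:monitor_nodes(true) and that Node is
    %% currently connected.
    sync_disconnect(Node) ->
        true = net_kernel:disconnect(Node),        %% returns after the connection is down
        ok.

    async_disconnect_and_wait(Node) ->
        ok = net_kernel:async_disconnect(Node),    %% cast; returns immediately
        receive {nodedown, Node} -> ok end.        %% caller collects the nodedown itself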


%%
%%