diff --git a/erts/doc/src/erl_dist_protocol.xml b/erts/doc/src/erl_dist_protocol.xml index 8cb8e0961553..a8ec5bbaeb2a 100644 --- a/erts/doc/src/erl_dist_protocol.xml +++ b/erts/doc/src/erl_dist_protocol.xml @@ -430,9 +430,6 @@ io:format("old/unused name ~ts at port ~p, fd = ~p ~n",

where n = Length - 1.

-

The current implementation of Erlang does not care if the connection - to the EPMD is broken.

-

The response for a STOP_REQ is as follows:

diff --git a/lib/kernel/doc/src/erl_epmd.xml b/lib/kernel/doc/src/erl_epmd.xml index 03aa9495160d..f6fe3c0a9ebc 100644 --- a/lib/kernel/doc/src/erl_epmd.xml +++ b/lib/kernel/doc/src/erl_epmd.xml @@ -56,8 +56,10 @@

Registers the node with epmd and tells epmd what port will be used for the current node. It returns a creation number. This number is - incremented on each register to help with identifying if a node is - reconnecting to epmd.

+ incremented on each registration to help distinguish a new node instance + connecting to epmd under the same name.

+

After the node has successfully registered with epmd, it will automatically + attempt to reconnect to the daemon if the connection is broken.

diff --git a/lib/kernel/src/erl_epmd.erl b/lib/kernel/src/erl_epmd.erl index 7cc84b24756b..96806ae3e7a5 100644 --- a/lib/kernel/src/erl_epmd.erl +++ b/lib/kernel/src/erl_epmd.erl @@ -53,13 +53,15 @@ -import(lists, [reverse/1]). --record(state, {socket, port_no = -1, name = ""}). +-record(state, {socket, port_no = -1, name = "", family}). -type state() :: #state{}. -include("inet_int.hrl"). -include("erl_epmd.hrl"). -include_lib("kernel/include/inet.hrl"). +-define(RECONNECT_TIME, 2000). + %%%---------------------------------------------------------------------- %%% API %%%---------------------------------------------------------------------- @@ -228,7 +230,8 @@ handle_call({register, Name, PortNo, Family}, _From, State) -> {alive, Socket, Creation} -> S = State#state{socket = Socket, port_no = PortNo, - name = Name}, + name = Name, + family = Family}, {reply, {ok, Creation}, S}; Error -> case init:get_argument(erl_epmd_port) of @@ -263,7 +266,17 @@ handle_cast(_, State) -> -spec handle_info(term(), state()) -> {'noreply', state()}. handle_info({tcp_closed, Socket}, State) when State#state.socket =:= Socket -> + erlang:send_after(?RECONNECT_TIME, self(), reconnect), {noreply, State#state{socket = -1}}; +handle_info(reconnect, State) when State#state.socket =:= -1 -> + case do_register_node(State#state.name, State#state.port_no, State#state.family) of + {alive, Socket, _Creation} -> + %% ignore the received creation + {noreply, State#state{socket = Socket}}; + _Error -> + erlang:send_after(?RECONNECT_TIME, self(), reconnect), + {noreply, State} + end; handle_info(_, State) -> {noreply, State}. 
diff --git a/lib/kernel/test/erl_distribution_SUITE.erl b/lib/kernel/test/erl_distribution_SUITE.erl index 67faa4911c53..1c7b067375db 100644 --- a/lib/kernel/test/erl_distribution_SUITE.erl +++ b/lib/kernel/test/erl_distribution_SUITE.erl @@ -30,6 +30,7 @@ nodenames/1, hostnames/1, illegal_nodenames/1, hidden_node/1, dyn_node_name/1, + epmd_reconnect/1, setopts/1, table_waste/1, net_setuptime/1, inet_dist_options_options/1, @@ -54,6 +55,7 @@ tick_serv_test/2, tick_serv_test1/1, run_remote_test/1, dyn_node_name_do/2, + epmd_reconnect_do/2, setopts_do/2, keep_conn/1, time_ping/1]). @@ -64,6 +66,8 @@ -export([pinger/1]). -define(DUMMY_NODE,dummy@test01). +-define(ALT_EPMD_PORT, "12321"). +-define(ALT_EPMD_CMD, "epmd -port "++?ALT_EPMD_PORT). %%----------------------------------------------------------------- %% The distribution is mainly tested in the big old test_suite. @@ -82,6 +86,7 @@ all() -> tick, tick_change, nodenames, hostnames, illegal_nodenames, connect_node, dyn_node_name, + epmd_reconnect, hidden_node, setopts, table_waste, net_setuptime, inet_dist_options_options, {group, monitor_nodes}, @@ -117,9 +122,15 @@ init_per_testcase(TC, Config) when TC == hostnames; file:make_dir("hostnames_nodedir"), file:write_file("hostnames_nodedir/ignore_core_files",""), Config; +init_per_testcase(epmd_reconnect, Config) -> + [] = os:cmd(?ALT_EPMD_CMD++" -relaxed_command_check -daemon"), + Config; init_per_testcase(Func, Config) when is_atom(Func), is_list(Config) -> Config. +end_per_testcase(epmd_reconnect, _Config) -> + os:cmd(?ALT_EPMD_CMD++" -kill"), + ok; end_per_testcase(_Func, _Config) -> ok. @@ -427,6 +438,83 @@ tick_cli_test1(Node) -> end end. 
+epmd_reconnect(Config) when is_list(Config) -> + NodeNames = [N1,N2,N3] = get_nodenames(3, ?FUNCTION_NAME), + Nodes = [atom_to_list(full_node_name(NN)) || NN <- NodeNames], + + DCfg = "-epmd_port "++?ALT_EPMD_PORT, + + {_N1F,Port1} = start_node_unconnected(DCfg, N1, ?MODULE, run_remote_test, + ["epmd_reconnect_do", atom_to_list(node()), "1" | Nodes]), + {_N2F,Port2} = start_node_unconnected(DCfg, N2, ?MODULE, run_remote_test, + ["epmd_reconnect_do", atom_to_list(node()), "2" | Nodes]), + {_N3F,Port3} = start_node_unconnected(DCfg, N3, ?MODULE, run_remote_test, + ["epmd_reconnect_do", atom_to_list(node()), "3" | Nodes]), + Ports = [Port1, Port2, Port3], + + ok = reap_ports(Ports), + + ok. + +reap_ports([]) -> + ok; +reap_ports(Ports) -> + case (receive M -> M end) of + {Port, Message} -> + case lists:member(Port, Ports) andalso Message of + {data,String} -> + io:format("~p: ~s\n", [Port, String]), + reap_ports(Ports); + {exit_status,0} -> + reap_ports(Ports -- [Port]) + end + end. + +epmd_reconnect_do(_Node, ["1", Node1, Node2, Node3]) -> + Names = [Name || Name <- [hd(string:tokens(Node, "@")) || Node <- [Node1, Node2, Node3]]], + %% wait until all nodes are registered + ok = wait_for_names(Names), + "Killed" ++_ = os:cmd(?ALT_EPMD_CMD++" -kill"), + open_port({spawn, ?ALT_EPMD_CMD}, []), + %% check that all nodes reregister with epmd + ok = wait_for_names(Names), + lists:foreach(fun(Node) -> + ANode = list_to_atom(Node), + pong = net_adm:ping(ANode), + {epmd_reconnect_do, ANode} ! {stop, Node1, Node} + end, [Node2, Node3]), + ok; +epmd_reconnect_do(_Node, ["2", Node1, Node2, _Node3]) -> + register(epmd_reconnect_do, self()), + receive {stop, Node1, Node2} -> + ok + after 7000 -> + exit(timeout) + end; +epmd_reconnect_do(_Node, ["3", Node1, _Node2, Node3]) -> + register(epmd_reconnect_do, self()), + receive {stop, Node1, Node3} -> + ok + after 7000 -> + exit(timeout) + end. 
+ +wait_for_names(Names) -> + %% wait for up to 3 seconds (the current retry timer in erl_epmd is 2s) + wait_for_names(lists:sort(Names), 30, 100). + +wait_for_names(Names, N, Wait) when N > 0 -> + try + {ok, Info} = erl_epmd:names(), + Names = lists:sort([Name || {Name, _Port} <- Info]), + ok + catch + error:{badmatch, _} -> + timer:sleep(Wait), + wait_for_names(Names, N-1, Wait) + end. + + dyn_node_name(Config) when is_list(Config) -> %%run_dist_configs(fun dyn_node_name/2, Config). dyn_node_name("", Config).