Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(nodetool): increase graceful stop timeout #11567

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions bin/emqx
Original file line number Diff line number Diff line change
Expand Up @@ -812,7 +812,7 @@ is_down() {
if ps -p "$PID" >/dev/null; then
# still around
# shellcheck disable=SC2009 # this grep pattern is not a part of the program names
if ps -efp "$PID" | $GREP -q 'defunct'; then
if ps -fp "$PID" | $GREP -q 'defunct'; then
# zombie state, print parent pid
parent="$(ps -o ppid= -p "$PID" | tr -d ' ')"
logwarn "$PID is marked <defunct>, parent: $(ps -p "$parent")"
Expand All @@ -831,7 +831,7 @@ wait_for() {
shift
CMD="$*"
while true; do
if $CMD >/dev/null 2>&1; then
if $CMD; then
return 0
fi
if [ "$WAIT_TIME" -le 0 ]; then
Expand Down
24 changes: 23 additions & 1 deletion bin/nodetool
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
%% -------------------------------------------------------------------
-mode(compile).

%% Timeout, in milliseconds, passed to the rpc:call that asks the target
%% node to run emqx_machine:graceful_shutdown/0 for the "stop" command.
-define(SHUTDOWN_TIMEOUT_MS, 120_000).

main(Args) ->
case os:type() of
{win32, nt} -> ok;
Expand Down Expand Up @@ -85,9 +87,17 @@ do(Args) ->
%% a "pong"
io:format("pong\n");
["stop"] ->
case rpc:call(TargetNode, emqx_machine, graceful_shutdown, [], 60000) of
Pid = start_shutdown_status(),
Res = rpc:call(TargetNode, emqx_machine, graceful_shutdown, [], ?SHUTDOWN_TIMEOUT_MS),
true = stop_shutdown_status(Pid),
case Res of
ok ->
ok;
{badrpc, timeout} ->
io:format("EMQX is still shutting down, it failed to stop gracefully "
"within the configured timeout of: ~ps\n",
[erlang:convert_time_unit(?SHUTDOWN_TIMEOUT_MS, millisecond, second)]),
halt(1);
{badrpc, nodedown} ->
%% nodetool commands are always executed after a ping
%% which if the code gets here, it's because the target node
Expand Down Expand Up @@ -145,6 +155,18 @@ do(Args) ->
end,
net_kernel:stop().

%% Spawns a process, linked to the caller, that runs
%% shutdown_status_loop/0. Returns the pid of the new process.
start_shutdown_status() ->
    erlang:spawn_link(fun() -> shutdown_status_loop() end).

%% Terminates the status reporter process.
%% The link is removed first so that the exit signal sent afterwards
%% (reason `stop`) cannot propagate back to the caller through the link.
%% Returns `true`.
stop_shutdown_status(Reporter) ->
    true = erlang:unlink(Reporter),
    true = erlang:exit(Reporter, stop).

%% Prints a "please wait" notice every 10 seconds, forever.
%% Never returns on its own; it only ends when the process is killed
%% from outside (e.g. by an exit signal).
shutdown_status_loop() ->
    %% An empty receive with a timeout suspends the process for 10 s
    %% without consuming any message from the mailbox.
    receive
    after 10_000 ->
        io:format("EMQX is shutting down, please wait...\n")
    end,
    shutdown_status_loop().

parse_eval_args(Args) ->
% shells may process args into more than one, and end up stripping
% spaces, so this converts all of that to a single string to parse
Expand Down
4 changes: 4 additions & 0 deletions changes/ce/fix-11567.en.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Improve EMQX graceful shutdown (`emqx stop` command):
- increase timeout from 1 to 2 minutes
- print an error message if EMQX can't stop gracefully within the configured timeout
- print periodic status messages while EMQX is shutting down