Skip to content

Commit

Permalink
Merge pull request #10052 from zmstone/0301-optimize-boot-error-log
Browse files Browse the repository at this point in the history
0301 optimize boot failure logs
  • Loading branch information
zmstone committed Mar 6, 2023
2 parents a638cc6 + 4eb9a69 commit a4aece3
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 36 deletions.
5 changes: 5 additions & 0 deletions apps/emqx_machine/src/emqx_machine.erl
Expand Up @@ -19,6 +19,7 @@
-export([
start/0,
graceful_shutdown/0,
brutal_shutdown/0,
is_ready/0,

node_status/0,
Expand Down Expand Up @@ -47,6 +48,10 @@ start() ->
graceful_shutdown() ->
emqx_machine_terminator:graceful_wait().

%% only used when failed to boot
brutal_shutdown() ->
init:stop().

set_backtrace_depth() ->
{ok, Depth} = application:get_env(emqx_machine, backtrace_depth),
_ = erlang:system_flag(backtrace_depth, Depth),
Expand Down
109 changes: 73 additions & 36 deletions bin/emqx
Expand Up @@ -451,6 +451,20 @@ call_hocon() {
|| die "call_hocon_failed: $*" $?
}

find_emqx_process() {
## Find the running node from 'ps -ef'
## * The grep args like '[e]mqx' but not 'emqx' is to avoid greping the grep command itself
## * The running 'remsh' and 'nodetool' processes must be excluded
if [ -n "${EMQX_NODE__NAME:-}" ]; then
# if node name is provided, filter by node name
# shellcheck disable=SC2009
ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -E "\s\-s?name\s${EMQX_NODE__NAME}" | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true
else
# shellcheck disable=SC2009
ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true
fi
}

## Resolve boot configs in a batch
## This is because starting the Erlang beam with all modules loaded
## and parsing HOCON config + environment variables is a non-trivial task
Expand All @@ -468,17 +482,7 @@ fi
# Turn off debug as the ps output can be quite noisy
set +x

## Find the running node from 'ps -ef'
## * The grep args like '[e]mqx' but not 'emqx' is to avoid greping the grep command itself
## * The running 'remsh' and 'nodetool' processes must be excluded
if [ -n "${EMQX_NODE__NAME:-}" ]; then
# if node name is provided, filter by node name
# shellcheck disable=SC2009
PS_LINE="$(ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -E "\s\-s?name\s${EMQX_NODE__NAME}" | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true)"
else
# shellcheck disable=SC2009
PS_LINE="$(ps -ef | $GREP '[e]mqx' | $GREP -v -E '(remsh|nodetool)' | $GREP -oE "\-[r]oot ${RUNNER_ROOT_DIR}.*" || true)"
fi
PS_LINE="$(find_emqx_process)"
logdebug "PS_LINE=$PS_LINE"
RUNNING_NODES_COUNT="$(echo -e "$PS_LINE" | sed '/^\s*$/d' | wc -l)"
[ "$RUNNING_NODES_COUNT" -gt 1 ] && logdebug "More than one running node found: count=$RUNNING_NODES_COUNT"
Expand Down Expand Up @@ -927,6 +931,7 @@ case "$NAME" in
NAME_TYPE='-sname'
esac
SHORT_NAME="$(echo "$NAME" | awk -F'@' '{print $1}')"
HOST_NAME="$(echo "$NAME" | awk -F'@' '{print $2}')"
if ! (echo "$SHORT_NAME" | grep -q '^[0-9A-Za-z_\-]\+$'); then
logerr "Invalid node name, should be of format '^[0-9A-Za-z_-]+$'."
exit 1
Expand Down Expand Up @@ -969,6 +974,59 @@ if [[ "$IS_BOOT_COMMAND" == 'yes' && "$(get_boot_config 'node.db_backend')" == "
fi
fi

diagnose_boot_failure_and_die() {
local ps_line
ps_line="$(find_emqx_process)"
if [ -z "$ps_line" ]; then
echo "Find more information in the latest log file: ${RUNNER_LOG_DIR}/erlang.log.*"
exit 1
fi
if ! relx_nodetool "ping" > /dev/null; then
logerr "$NAME seems to be running, but not responding to pings."
echo "Make sure '$HOST_NAME' is a resolvable and reachable hostname."
pipe_shutdown
exit 2
fi
if ! relx_nodetool 'eval' 'true = emqx:is_running()' > /dev/null; then
logerr "$NAME node is started, but failed to complete the boot sequence in time."
echo "Please collect the logs in ${RUNNER_LOG_DIR} and report a bug to EMQX team at https://github.com/emqx/emqx/issues/new/choose"
pipe_shutdown
exit 3
fi
}

## Only works when started in daemon mode
pipe_shutdown() {
if [ -d "$PIPE_DIR" ]; then
echo "Shutting down $NAME from to_erl pipe."
## can not evaluate init:stop() or erlang:halt() because the shell is restricted
echo 'emqx_machine:brutal_shutdown().' | "$BINDIR/to_erl" "$PIPE_DIR"
fi
}

## Call nodetool to stop EMQX
nodetool_shutdown() {
# Wait for the node to completely stop...
PID="$(relx_get_pid)"
if ! relx_nodetool "stop"; then
die "Graceful shutdown failed PID=[$PID]"
fi
WAIT_TIME="${EMQX_WAIT_FOR_STOP:-120}"
if ! wait_for "$WAIT_TIME" 'is_down' "$PID"; then
msg="dangling after ${WAIT_TIME} seconds"
# also log to syslog
logger -t "${REL_NAME}[${PID}]" "STOP: $msg"
# log to user console
set +x
logerr "Stop failed, $msg"
echo "ERROR: $PID is still around"
ps -p "$PID"
exit 1
fi
echo "ok"
logger -t "${REL_NAME}[${PID}]" "STOP: OK"
}

cd "$RUNNER_ROOT_DIR"

case "${COMMAND}" in
Expand Down Expand Up @@ -1014,33 +1072,15 @@ case "${COMMAND}" in
echo "$EMQX_DESCRIPTION $REL_VSN is started successfully!"
exit 0
else
echo "$EMQX_DESCRIPTION $REL_VSN failed to start in ${WAIT_TIME} seconds."
echo "Please find more information in erlang.log.N"
echo "Or run 'env DEBUG=1 $0 console' to have logs printed to console."
exit 1
logerr "${EMQX_DESCRIPTION} ${REL_VSN} using node name '${NAME}' failed ${WAIT_TIME} probes."
diagnose_boot_failure_and_die
fi
;;

stop)
# Wait for the node to completely stop...
PID="$(relx_get_pid)"
if ! relx_nodetool "stop"; then
die "Graceful shutdown failed PID=[$PID]"
fi
WAIT_TIME="${EMQX_WAIT_FOR_STOP:-120}"
if ! wait_for "$WAIT_TIME" 'is_down' "$PID"; then
msg="dangling after ${WAIT_TIME} seconds"
# also log to syslog
logger -t "${REL_NAME}[${PID}]" "STOP: $msg"
# log to user console
set +x
logerr "Stop failed, $msg"
echo "ERROR: $PID is still around"
ps -p "$PID"
exit 1
if ! nodetool_shutdown; then
pipe_shutdown
fi
echo "ok"
logger -t "${REL_NAME}[${PID}]" "STOP: OK"
;;

pid)
Expand All @@ -1063,9 +1103,6 @@ case "${COMMAND}" in
;;

attach)
assert_node_alive

shift
exec "$BINDIR/to_erl" "$PIPE_DIR"
;;

Expand Down
12 changes: 12 additions & 0 deletions changes/ce/fix-10052.en.md
@@ -0,0 +1,12 @@
Improve daemon mode startup failure logs.

Before this change, it was difficult for users to understand the reason for EMQX 'start' command failed to boot the node.
The only information they received was that the node did not start within the expected time frame,
and they were instructed to boot the node with 'console' command in the hope of obtaining some logs.
However, the node might actually be running, which could cause 'console' mode to fail for a different reason.

With this new change, when daemon mode fails to boot, a diagnosis is issued. Here are the possible scenarios:

* If the node cannot be found from `ps -ef`, the user is instructed to find information in log files `erlang.log.*`.
* If the node is found to be running but not responding to pings, the user is advised to check if the host name is resolvable and reachable.
* If the node is responding to pings, but the EMQX app is not running, it is likely a bug. In this case, the user is advised to report a Github issue.
11 changes: 11 additions & 0 deletions changes/ce/fix-10052.zh.md
@@ -0,0 +1,11 @@
优化 EMQX daemon 模式启动启动失败的日志。

在进行此更改之前,当 EMQX 用 `start` 命令启动失败时,用户很难理解出错的原因。
所知道的仅仅是节点未能在预期时间内启动,然后被指示以 `console` 式引导节点以获取一些日志。
然而,节点实际上可能正在运行,这可能会导致 `console` 模式因不同的原因而失败。

此次修复后,启动脚本会发出诊断:

* 如果无法从 `ps -ef` 中找到节点,则指示用户在 `erlang.log.*` 中查找信息。
* 如果发现节点正在运行但不响应 ping,则建议用户检查节点主机名是否有效并可达。
* 如果节点响应 ping 但 EMQX 应用程序未运行,则很可能是一个错误。在这种情况下,建议用户报告一个Github issue。
1 change: 1 addition & 0 deletions deploy/docker/README.md
Expand Up @@ -59,6 +59,7 @@ EMQX_DASHBOARD__DEFAULT_PASSWORD <--> dashboard.default_password
EMQX_NODE__COOKIE <--> node.cookie
EMQX_LISTENERS__SSL__default__ENABLE <--> listeners.ssl.default.enable
```
Note: The lowercase use of 'default' is not a typo. It is used to demonstrate that lowercase environment variables are equivalent.

+ Prefix `EMQX_` is removed
+ All upper case letters is replaced with lower case letters
Expand Down

0 comments on commit a4aece3

Please sign in to comment.