diff --git a/docs/en/userguide/troubleshooting.md b/docs/en/userguide/troubleshooting.md index 6b624043b9..9b3d2b4744 100644 --- a/docs/en/userguide/troubleshooting.md +++ b/docs/en/userguide/troubleshooting.md @@ -1,21 +1,46 @@ # Troubleshooting -You may encounter various problems during installation or development in Fluid. Usually, logs are useful for debugging. But the Runtime containers where Fluid's underlying Distributed Cache Engine is running, are distributed on different hosts under distributed environment, so it's quite annoying to collect these logs one by one. To make this troublesome work easier, we provided a [shell script](https://raw.githubusercontent.com/fluid-cloudnative/fluid/master/tools/diagnose-fluid.sh) to help users collect logs more quickly. This document describes how to use that script. +You may encounter various problems during installation or development in Fluid. Usually, logs are useful for debugging. However, the Runtime containers that run Fluid's underlying distributed cache engine are spread across different hosts in a distributed environment, so collecting their logs one by one is tedious. +To make this easier, Fluid provides a diagnostic shell script for each Runtime that helps users collect the logs quickly. This document describes how to use these scripts. + + +Alluxio: + +```wget https://raw.githubusercontent.com/fluid-cloudnative/fluid/master/tools/diagnose-fluid-alluxio.sh``` + +JuiceFS: + +```wget https://raw.githubusercontent.com/fluid-cloudnative/fluid/master/tools/diagnose-fluid-juicefs.sh``` + +GooseFS: + +```wget https://raw.githubusercontent.com/fluid-cloudnative/fluid/master/tools/diagnose-fluid-goosefs.sh``` ## Diagnose Fluid using Script -1. Make sure that script is executable +1. Fluid provides different diagnostic scripts for different Runtimes, but the usage is the same. 
Download the diagnostic script for the Runtime you use: + + ```shell + # Alluxio: + wget https://raw.githubusercontent.com/fluid-cloudnative/fluid/master/tools/diagnose-fluid-alluxio.sh + # JuiceFS: + wget https://raw.githubusercontent.com/fluid-cloudnative/fluid/master/tools/diagnose-fluid-juicefs.sh + # GooseFS: + wget https://raw.githubusercontent.com/fluid-cloudnative/fluid/master/tools/diagnose-fluid-goosefs.sh + ``` + +2. Make sure the script is executable (taking `diagnose-fluid-alluxio.sh` as an example): ```shell - $ chmod a+x diagnose-fluid.sh + $ chmod a+x diagnose-fluid-alluxio.sh ``` -2. Get help message +3. Get help message ```shell - $ ./diagnose-fluid.sh + $ ./diagnose-fluid-alluxio.sh Usage: - ./diagnose-fluid.sh COMMAND [OPTIONS] + ./diagnose-fluid-alluxio.sh COMMAND [OPTIONS] COMMAND: help Display this help message. @@ -28,12 +53,12 @@ You may encounter various problems during installation or development in Fluid. Set the namespace of runtime. ``` -3. Collect logs +4. Collect logs You can collect all the Runtime container logs for given name and namespace with: ```shell - $ ./diagnose-fluid.sh collect --name cifar10 --namespace default + $ ./diagnose-fluid-alluxio.sh collect --name cifar10 --namespace default ``` > **NOTES**: diff --git a/docs/zh/userguide/troubleshooting.md b/docs/zh/userguide/troubleshooting.md index 9eaf3b8d75..0a1cbfe203 100644 --- a/docs/zh/userguide/troubleshooting.md +++ b/docs/zh/userguide/troubleshooting.md @@ -1,20 +1,33 @@ # Fluid问题诊断 -您可能会在部署、开发Fluid的过程中遇到各种问题,而查看日志可以协助我们定位问题原因。但在分布式环境下,Fluid底层的分布式缓存引擎(Runtime)运行在不同主机的容器上,手动收集这些容器的日志效率低下。因此,Fluid提供了shell脚本[diagnose-fluid.sh](https://raw.githubusercontent.com/fluid-cloudnative/fluid/master/tools/diagnose-fluid.sh),帮助使用者快速收集Fluid系统和Runtime容器的日志信息。 +您可能会在部署、开发Fluid的过程中遇到各种问题,而查看日志可以协助我们定位问题原因。但在分布式环境下,Fluid底层的分布式缓存引擎(Runtime)运行在不同主机的容器上,手动收集这些容器的日志效率低下。 +因此,Fluid提供了shell脚本,帮助使用者快速收集Fluid系统和Runtime容器的日志信息。 ## 如何使用脚本收集日志 -1. 首先,确保shell脚本有运行权限 +1. 
下载诊断脚本 + 针对不同的 Runtime,Fluid 提供了不同的诊断脚本,但使用方式是一致的。您可以下载您使用的 Runtime 诊断脚本: + + ```shell + # Alluxio: + wget https://raw.githubusercontent.com/fluid-cloudnative/fluid/master/tools/diagnose-fluid-alluxio.sh + # JuiceFS: + wget https://raw.githubusercontent.com/fluid-cloudnative/fluid/master/tools/diagnose-fluid-juicefs.sh + # GooseFS: + wget https://raw.githubusercontent.com/fluid-cloudnative/fluid/master/tools/diagnose-fluid-goosefs.sh + ``` + +2. 确保shell脚本有运行权限,以 `diagnose-fluid-alluxio.sh` 为例: ```bash - $ chmod a+x diagnose-fluid.sh + $ chmod a+x diagnose-fluid-alluxio.sh ``` -2. 查看帮助信息 +3. 查看帮助信息 ```bash - $ ./diagnose-fluid.sh + $ ./diagnose-fluid-alluxio.sh Usage: - ./diagnose-fluid.sh COMMAND [OPTIONS] + ./diagnose-fluid-alluxio.sh COMMAND [OPTIONS] COMMAND: help Display this help message. @@ -27,12 +40,12 @@ Set the namespace of runtime. ``` -3. 收集日志 +4. 收集日志 - 运行`diagnose-fluid.sh`,`--name`指定了Runtime的name,`--namespace`指定了Runtime的namespace + 运行`diagnose-fluid-alluxio.sh`,`--name`指定了Runtime的name,`--namespace`指定了Runtime的namespace ```bash - $ ./diagnose-fluid.sh collect --name cifar10 --namespace default + $ ./diagnose-fluid-alluxio.sh collect --name cifar10 --namespace default ``` shell脚本会将收集的日志信息打包到执行路径下的一个压缩包里。 diff --git a/tools/diagnose-fluid.sh b/tools/diagnose-fluid-alluxio.sh similarity index 96% rename from tools/diagnose-fluid.sh rename to tools/diagnose-fluid-alluxio.sh index 872f77a313..b4b8a8bb42 100644 --- a/tools/diagnose-fluid.sh +++ b/tools/diagnose-fluid-alluxio.sh @@ -3,7 +3,7 @@ set +x print_usage() { echo "Usage:" - echo " ./diagnose-fluid.sh COMMAND [OPTIONS]" + echo " ./diagnose-fluid-alluxio.sh COMMAND [OPTIONS]" echo "COMMAND:" echo " help" echo " Display this help message." 
@@ -77,7 +77,7 @@ kubectl_resource() {
-  # runtime, dataset, pv and pvc should have the same name
+  # runtime, dataset and pvc share the runtime name; the pv is named <namespace>-<name>
   kubectl describe dataset --namespace ${runtime_namespace} ${runtime_name} &>"${diagnose_dir}/dataset-${runtime_name}.yaml" 2>&1
-  kubectl describe alluxioruntime --namespace ${runtime_namespace} ${name} &>"${diagnose_dir}/alluxioruntime-${runtime_name}.yaml" 2>&1
+  kubectl describe alluxioruntime --namespace ${runtime_namespace} ${runtime_name} &>"${diagnose_dir}/alluxioruntime-${runtime_name}.yaml" 2>&1
-  kubectl describe pv ${runtime_name} &>"${diagnose_dir}/pv-${runtime_name}.yaml" 2>&1
+  kubectl describe pv ${runtime_namespace}-${runtime_name} &>"${diagnose_dir}/pv-${runtime_name}.yaml" 2>&1
   kubectl describe pvc ${runtime_name} --namespace ${runtime_namespace} &>"${diagnose_dir}/pvc-${runtime_name}.yaml" 2>&1
 }
 
diff --git a/tools/diagnose-fluid-goosefs.sh b/tools/diagnose-fluid-goosefs.sh
index 9917cb23e9..1c16bdcfbe 100755
--- a/tools/diagnose-fluid-goosefs.sh
+++ b/tools/diagnose-fluid-goosefs.sh
@@ -3,7 +3,7 @@ set +x
 
 print_usage() {
   echo "Usage:"
-  echo " ./diagnose-fluid.sh COMMAND [OPTIONS]"
+  echo " ./diagnose-fluid-goosefs.sh COMMAND [OPTIONS]"
   echo "COMMAND:"
   echo " help"
   echo " Display this help message."
@@ -77,7 +77,7 @@ kubectl_resource() {
-  # runtime, dataset, pv and pvc should have the same name
+  # runtime, dataset and pvc share the runtime name; the pv is named <namespace>-<name>
   kubectl describe dataset --namespace ${runtime_namespace} ${runtime_name} &>"${diagnose_dir}/dataset-${runtime_name}.yaml" 2>&1
-  kubectl describe goosefsruntime --namespace ${runtime_namespace} ${name} &>"${diagnose_dir}/goosefsruntime-${runtime_name}.yaml" 2>&1
+  kubectl describe goosefsruntime --namespace ${runtime_namespace} ${runtime_name} &>"${diagnose_dir}/goosefsruntime-${runtime_name}.yaml" 2>&1
-  kubectl describe pv ${runtime_name} &>"${diagnose_dir}/pv-${runtime_name}.yaml" 2>&1
+  kubectl describe pv ${runtime_namespace}-${runtime_name} &>"${diagnose_dir}/pv-${runtime_name}.yaml" 2>&1
   kubectl describe pvc ${runtime_name} --namespace ${runtime_namespace} &>"${diagnose_dir}/pvc-${runtime_name}.yaml" 2>&1
 }
 
diff --git a/tools/diagnose-fluid-juicefs.sh b/tools/diagnose-fluid-juicefs.sh
new file mode 100644
index 0000000000..7ac7f84b20
--- /dev/null
+++ b/tools/diagnose-fluid-juicefs.sh
@@ -0,0 +1,184 @@
+#!/usr/bin/env bash
+#
+# Collect diagnostic information for a Fluid JuiceFSRuntime: helm release
+# details, pod listings, controller/CSI/runtime container logs and the
+# descriptions of the dataset, runtime, pv and pvc. Everything is written to
+# /tmp/diagnose_fluid_<timestamp> and archived into the current directory.
+#
+# Usage: ./diagnose-fluid-juicefs.sh collect --name NAME [--namespace NS]
+set +x
+
+# Print the command line help.
+print_usage() {
+  echo "Usage:"
+  echo " ./diagnose-fluid-juicefs.sh COMMAND [OPTIONS]"
+  echo "COMMAND:"
+  echo " help"
+  echo " Display this help message."
+  echo " collect"
+  echo " Collect pods logs of controller and runtime."
+  echo "OPTIONS:"
+  echo " -r, --name name"
+  echo " Set the name of runtime."
+  echo " -n, --namespace name"
+  echo " Set the namespace of runtime."
+}
+
+# Run "$@" under a 10 second timeout, framed by banner lines, and report
+# (without aborting) when the command fails or times out.
+run() {
+  echo
+  echo "-----------------run $*------------------"
+  if ! timeout 10s "$@"; then
+    echo "failed to collect info: $*"
+  fi
+  echo "------------End of ${1}----------------"
+}
+
+# Dump `helm get all` for release $1 into helm-<release>.yaml.
+# NOTE(review): every release is looked up in ${runtime_namespace}, including
+# the Fluid release itself - confirm that is where Fluid is installed.
+helm_get() {
+  run helm get all -n "${runtime_namespace}" "${1}" &>"${diagnose_dir}/helm-${1}.yaml"
+}
+
+# List the pods of namespace $1 (default: "default") into pods-<namespace>.log.
+pod_status() {
+  local namespace=${1:-"default"}
+  run kubectl get po -owide -n "${namespace}" &>"${diagnose_dir}/pods-${namespace}.log"
+}
+
+# Collect logs of the Fluid controllers and CSI plugin containers.
+fluid_pod_logs() {
+  core_component "${fluid_namespace}" "manager" "control-plane=juicefsruntime-controller"
+  core_component "${fluid_namespace}" "manager" "control-plane=dataset-controller"
+  core_component "${fluid_namespace}" "plugins" "app=csi-nodeplugin-fluid"
+  core_component "${fluid_namespace}" "node-driver-registrar" "app=csi-nodeplugin-fluid"
+}
+
+# Collect logs of the JuiceFS worker and fuse containers of the runtime.
+runtime_pod_logs() {
+  core_component "${runtime_namespace}" "juicefs-worker" "role=juicefs-worker" "release=${runtime_name}"
+  core_component "${runtime_namespace}" "juicefs-fuse" "role=juicefs-fuse" "release=${runtime_name}"
+}
+
+# Save the logs of one container from every pod matching the selectors.
+# Arguments: $1 - namespace, $2 - container name, $3... - label selectors
+# (ANDed together).
+core_component() {
+  local namespace="$1"
+  local container="$2"
+  shift 2
+  local selectors="$*"
+  local constrains
+  local pods
+  local po
+  local -a selector_args=()
+  constrains=$(echo "${selectors}" | tr ' ' ',')
+  if [[ -n ${constrains} ]]; then
+    # Pass the flag and its value as separate words, and pass nothing at all
+    # (instead of an empty argument) when no selector was given.
+    selector_args=(-l "${constrains}")
+  fi
+  mkdir -p "${diagnose_dir}/pods-${namespace}"
+  pods=$(kubectl get po -n "${namespace}" "${selector_args[@]}" | awk '{print $1}' | grep -v NAME)
+  for po in ${pods}; do
+    kubectl logs "${po}" -c "${container}" -n "${namespace}" &>"${diagnose_dir}/pods-${namespace}/${po}-${container}.log"
+  done
+}
+
+# Describe the dataset, runtime, pv and pvc that belong to the runtime.
+kubectl_resource() {
+  # runtime, dataset and pvc share the runtime name; the pv is named
+  # <namespace>-<name>
+  kubectl describe dataset --namespace "${runtime_namespace}" "${runtime_name}" &>"${diagnose_dir}/dataset-${runtime_name}.yaml"
+  # use runtime_name here: "name" is never set, and an empty name would
+  # describe every juicefsruntime in the namespace
+  kubectl describe juicefsruntime --namespace "${runtime_namespace}" "${runtime_name}" &>"${diagnose_dir}/juicefsruntime-${runtime_name}.yaml"
+  kubectl describe pv "${runtime_namespace}-${runtime_name}" &>"${diagnose_dir}/pv-${runtime_name}.yaml"
+  kubectl describe pvc "${runtime_name}" --namespace "${runtime_namespace}" &>"${diagnose_dir}/pvc-${runtime_name}.yaml"
+}
+
+# Pack the collected files into diagnose_fluid_<timestamp>.tar.gz in the
+# directory the script was started from.
+archive() {
+  tar -zcvf "${current_dir}/diagnose_fluid_${timestamp}.tar.gz" "${diagnose_dir}"
+  echo "please get diagnose_fluid_${timestamp}.tar.gz for diagnostics"
+}
+
+# Run every collection step, then archive the result.
+pd_collect() {
+  echo "Start collecting, runtime-name=${runtime_name}, runtime-namespace=${runtime_namespace}"
+  helm_get "${fluid_name}"
+  helm_get "${runtime_name}"
+  pod_status "${fluid_namespace}"
+  pod_status "${runtime_namespace}"
+  runtime_pod_logs
+  fluid_pod_logs
+  kubectl_resource
+  archive
+}
+
+# Fill in defaults, create the working directory and start collecting.
+# Exits with an error when no runtime name was given.
+collect() {
+  # ensure params
+  fluid_name=${fluid_name:-"fluid"}
+  fluid_namespace=${fluid_namespace:-"fluid-system"}
+  runtime_name=${runtime_name:?"the name of runtime must be set"}
+  runtime_namespace=${runtime_namespace:-"default"}
+
+  current_dir=$(pwd)
+  timestamp=$(date +%s)
+  diagnose_dir="/tmp/diagnose_fluid_${timestamp}"
+  mkdir -p "${diagnose_dir}"
+
+  pd_collect
+}
+
+# Parse the command line and dispatch to the requested command.
+main() {
+  if [[ $# -eq 0 ]]; then
+    print_usage
+    exit 1
+  fi
+
+  action="help"
+
+  while [[ $# -gt 0 ]]; do
+    case $1 in
+      -h | --help | "-?")
+        print_usage
+        exit 0
+        ;;
+      collect | help)
+        action=$1
+        ;;
+      -r | --name)
+        runtime_name=$2
+        shift
+        ;;
+      -n | --namespace)
+        runtime_namespace=$2
+        shift
+        ;;
+      *)
+        echo "Error: unsupported option $1" >&2
+        print_usage
+        exit 1
+        ;;
+    esac
+    shift
+  done
+
+  case ${action} in
+    collect)
+      collect
+      ;;
+    help)
+      print_usage
+      ;;
+  esac
+}
+
+main "$@"