diff --git a/manager/debug.sh b/manager/debug.sh
index f5ed3c597f..3275ed0424 100755
--- a/manager/debug.sh
+++ b/manager/debug.sh
@@ -40,16 +40,52 @@ done
 
 mkdir -p /cortex-debug/logs
 kubectl get pods --all-namespaces -o json | jq '.items[] | . as $parent | $parent.spec.containers[]? | "kubectl logs -n \($parent.metadata.namespace) \($parent.metadata.name) \(.name) --timestamps --tail=10000 > /cortex-debug/logs/\($parent.metadata.namespace).\($parent.metadata.name).\(.name) 2>&1 && echo -n ."' | xargs -n 1 bash -c
+echo -n "."
 kubectl get pods --all-namespaces -o json | jq '.items[] | . as $parent | $parent.spec.initContainers[]? | "kubectl logs -n \($parent.metadata.namespace) \($parent.metadata.name) \(.name) --timestamps --tail=10000 > /cortex-debug/logs/\($parent.metadata.namespace).\($parent.metadata.name).init.\(.name) 2>&1 && echo -n ."' | xargs -n 1 bash -c
+echo -n "."
 
 kubectl top pods --all-namespaces --containers=true > "/cortex-debug/k8s/top_pods" 2>&1
+echo -n "."
 kubectl top nodes > "/cortex-debug/k8s/top_nodes" 2>&1
+echo -n "."
 
 mkdir -p /cortex-debug/aws/amis
-aws autoscaling describe-auto-scaling-groups --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asgs" 2>&1
+
+# find the on-demand worker ASG by its eksctl tags (JSON output is forced because jq parses the result)
+asg_on_demand_info=$(aws autoscaling describe-auto-scaling-groups --region "$CORTEX_REGION" --output json --query "AutoScalingGroups[?contains(Tags[?Key==\`alpha.eksctl.io/cluster-name\`].Value, \`$CORTEX_CLUSTER_NAME\`)]|[?contains(Tags[?Key==\`alpha.eksctl.io/nodegroup-name\`].Value, \`ng-cortex-worker-on-demand\`)]")
 echo -n "."
-aws autoscaling describe-scaling-activities --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-activities" 2>&1
+asg_on_demand_name=""
+# the length defaults to 0 below so that an empty lookup result skips this section instead of breaking the arithmetic test
+asg_on_demand_length=$(jq -r 'length' <<<"$asg_on_demand_info")
+if (( ${asg_on_demand_length:-0} > 0 )); then
+  asg_on_demand_name=$(jq -r 'first | .AutoScalingGroupName' <<<"$asg_on_demand_info")
+  aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names "$asg_on_demand_name" --region="$CORTEX_REGION" --output json > "/cortex-debug/aws/asg-on-demand" 2>&1
+  echo -n "."
+  aws autoscaling describe-scaling-activities --max-items 1000 --auto-scaling-group-name "$asg_on_demand_name" --region="$CORTEX_REGION" --output json > "/cortex-debug/aws/asg-activities-on-demand" 2>&1
+  echo -n "."
+fi
+
+# same lookup for the spot worker ASG
+asg_spot_info=$(aws autoscaling describe-auto-scaling-groups --region "$CORTEX_REGION" --output json --query "AutoScalingGroups[?contains(Tags[?Key==\`alpha.eksctl.io/cluster-name\`].Value, \`$CORTEX_CLUSTER_NAME\`)]|[?contains(Tags[?Key==\`alpha.eksctl.io/nodegroup-name\`].Value, \`ng-cortex-worker-spot\`)]")
 echo -n "."
+asg_spot_name=""
+asg_spot_length=$(jq -r 'length' <<<"$asg_spot_info")
+if (( ${asg_spot_length:-0} > 0 )); then
+  asg_spot_name=$(jq -r 'first | .AutoScalingGroupName' <<<"$asg_spot_info")
+  aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names "$asg_spot_name" --region="$CORTEX_REGION" --output json > "/cortex-debug/aws/asg-spot" 2>&1
+  echo -n "."
+  aws autoscaling describe-scaling-activities --max-items 1000 --auto-scaling-group-name "$asg_spot_name" --region="$CORTEX_REGION" --output json > "/cortex-debug/aws/asg-activities-spot" 2>&1
+  echo -n "."
+fi
+
+# failsafe: if neither asg could be located, dump all asgs in the region and up to 1000 scaling activities
+if [[ -z "$asg_on_demand_name" && -z "$asg_spot_name" ]]; then
+  aws autoscaling describe-auto-scaling-groups --region="$CORTEX_REGION" --output json > "/cortex-debug/aws/asgs" 2>&1
+  echo -n "."
+  aws autoscaling describe-scaling-activities --max-items 1000 --region="$CORTEX_REGION" --output json > "/cortex-debug/aws/asg-activities" 2>&1
+  echo -n "."
+fi
+
 aws ec2 describe-instances --filters Name=tag:cortex.dev/cluster-name,Values=$CORTEX_CLUSTER_NAME --region=$CORTEX_REGION --output json > "/cortex-debug/aws/instances" 2>&1
 echo -n "."
 aws ec2 describe-instance-status --include-all-instances --region=$CORTEX_REGION --output json > "/cortex-debug/aws/instance-statuses" 2>&1