From 1d184b3a2d7b33f3b65ff377cd1f7a3cec4f33bd Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Sat, 21 Nov 2020 20:47:54 -0800 Subject: [PATCH 1/3] Update ASG cluster debug info --- manager/debug.sh | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/manager/debug.sh b/manager/debug.sh index f5ed3c597f..867d853231 100755 --- a/manager/debug.sh +++ b/manager/debug.sh @@ -40,15 +40,46 @@ done mkdir -p /cortex-debug/logs kubectl get pods --all-namespaces -o json | jq '.items[] | . as $parent | $parent.spec.containers[]? | "kubectl logs -n \($parent.metadata.namespace) \($parent.metadata.name) \(.name) --timestamps --tail=10000 > /cortex-debug/logs/\($parent.metadata.namespace).\($parent.metadata.name).\(.name) 2>&1 && echo -n ."' | xargs -n 1 bash -c +echo -n "." kubectl get pods --all-namespaces -o json | jq '.items[] | . as $parent | $parent.spec.initContainers[]? | "kubectl logs -n \($parent.metadata.namespace) \($parent.metadata.name) \(.name) --timestamps --tail=10000 > /cortex-debug/logs/\($parent.metadata.namespace).\($parent.metadata.name).init.\(.name) 2>&1 && echo -n ."' | xargs -n 1 bash -c +echo -n "." kubectl top pods --all-namespaces --containers=true > "/cortex-debug/k8s/top_pods" 2>&1 +echo -n "." kubectl top nodes > "/cortex-debug/k8s/top_nodes" 2>&1 +echo -n "." mkdir -p /cortex-debug/aws/amis -aws autoscaling describe-auto-scaling-groups --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asgs" 2>&1 + +asg_on_demand_info=$(aws autoscaling describe-auto-scaling-groups --region $CORTEX_REGION --query "AutoScalingGroups[?contains(Tags[?Key==\`alpha.eksctl.io/cluster-name\`].Value, \`$CORTEX_CLUSTER_NAME\`)]|[?contains(Tags[?Key==\`alpha.eksctl.io/nodegroup-name\`].Value, \`ng-cortex-worker-on-demand\`)]") +echo -n "." +asg_on_demand_length=$(echo "$asg_on_demand_info" | jq -r 'length') +if (( "$asg_on_demand_length" > "0" )); then + asg_on_demand_name=$(echo "$asg_on_demand_info" | jq -r 'first | .AutoScalingGroupName') + aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names $asg_on_demand_name --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-info-on-demand" 2>&1 + echo -n "." + aws autoscaling describe-scaling-activities --auto-scaling-group-name $asg_on_demand_name --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-activities-on-demand" 2>&1 + echo -n "." +else + # failsafe in case the asg could not be located + aws autoscaling describe-auto-scaling-groups --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asgs" 2>&1 + echo -n "." + aws autoscaling describe-scaling-activities --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-activities" 2>&1 + echo -n "." +fi + +asg_spot_info=$(aws autoscaling describe-auto-scaling-groups --region $CORTEX_REGION --query "AutoScalingGroups[?contains(Tags[?Key==\`alpha.eksctl.io/cluster-name\`].Value, \`$CORTEX_CLUSTER_NAME\`)]|[?contains(Tags[?Key==\`alpha.eksctl.io/nodegroup-name\`].Value, \`ng-cortex-worker-spot\`)]") echo -n "." -aws autoscaling describe-scaling-activities --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-activities" 2>&1 +asg_spot_length=$(echo "$asg_spot_info" | jq -r 'length') +if (( "$asg_spot_length" > "0" )); then + asg_spot_name=$(echo "$asg_spot_info" | jq -r 'first | .AutoScalingGroupName') + aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names $asg_spot_name --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-info-spot" 2>&1 + echo -n "." + aws autoscaling describe-scaling-activities --auto-scaling-group-name $asg_spot_name --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-activities-spot" 2>&1 + echo -n "." +fi +echo -n "." + echo -n "." aws ec2 describe-instances --filters Name=tag:cortex.dev/cluster-name,Values=$CORTEX_CLUSTER_NAME --region=$CORTEX_REGION --output json > "/cortex-debug/aws/instances" 2>&1 echo -n "." From 20c4d2c5747dc19ad0ea25c97316330b678d9da0 Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Sat, 21 Nov 2020 20:56:52 -0800 Subject: [PATCH 2/3] Specify max items --- manager/debug.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/manager/debug.sh b/manager/debug.sh index 867d853231..38b0c2813c 100755 --- a/manager/debug.sh +++ b/manager/debug.sh @@ -58,13 +58,13 @@ if (( "$asg_on_demand_length" > "0" )); then asg_on_demand_name=$(echo "$asg_on_demand_info" | jq -r 'first | .AutoScalingGroupName') aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names $asg_on_demand_name --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-info-on-demand" 2>&1 echo -n "." - aws autoscaling describe-scaling-activities --auto-scaling-group-name $asg_on_demand_name --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-activities-on-demand" 2>&1 + aws autoscaling describe-scaling-activities --max-items 1000 --auto-scaling-group-name $asg_on_demand_name --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-activities-on-demand" 2>&1 echo -n "." else # failsafe in case the asg could not be located aws autoscaling describe-auto-scaling-groups --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asgs" 2>&1 echo -n "." - aws autoscaling describe-scaling-activities --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-activities" 2>&1 + aws autoscaling describe-scaling-activities --max-items 1000 --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-activities" 2>&1 echo -n "." fi @@ -75,7 +75,7 @@ if (( "$asg_spot_length" > "0" )); then asg_spot_name=$(echo "$asg_spot_info" | jq -r 'first | .AutoScalingGroupName') aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names $asg_spot_name --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-info-spot" 2>&1 echo -n "." - aws autoscaling describe-scaling-activities --auto-scaling-group-name $asg_spot_name --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-activities-spot" 2>&1 + aws autoscaling describe-scaling-activities --max-items 1000 --auto-scaling-group-name $asg_spot_name --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-activities-spot" 2>&1 echo -n "." fi echo -n "." From aafaa3b55e9e85e62c133b12990e128b328ea674 Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Mon, 23 Nov 2020 11:34:05 -0800 Subject: [PATCH 3/3] Update debug.sh --- manager/debug.sh | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/manager/debug.sh b/manager/debug.sh index 38b0c2813c..3275ed0424 100755 --- a/manager/debug.sh +++ b/manager/debug.sh @@ -53,34 +53,36 @@ mkdir -p /cortex-debug/aws/amis asg_on_demand_info=$(aws autoscaling describe-auto-scaling-groups --region $CORTEX_REGION --query "AutoScalingGroups[?contains(Tags[?Key==\`alpha.eksctl.io/cluster-name\`].Value, \`$CORTEX_CLUSTER_NAME\`)]|[?contains(Tags[?Key==\`alpha.eksctl.io/nodegroup-name\`].Value, \`ng-cortex-worker-on-demand\`)]") echo -n "." +asg_on_demand_name="" asg_on_demand_length=$(echo "$asg_on_demand_info" | jq -r 'length') if (( "$asg_on_demand_length" > "0" )); then asg_on_demand_name=$(echo "$asg_on_demand_info" | jq -r 'first | .AutoScalingGroupName') - aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names $asg_on_demand_name --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-info-on-demand" 2>&1 + aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names $asg_on_demand_name --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-on-demand" 2>&1 echo -n "." aws autoscaling describe-scaling-activities --max-items 1000 --auto-scaling-group-name $asg_on_demand_name --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-activities-on-demand" 2>&1 echo -n "." -else - # failsafe in case the asg could not be located - aws autoscaling describe-auto-scaling-groups --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asgs" 2>&1 - echo -n "." - aws autoscaling describe-scaling-activities --max-items 1000 --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-activities" 2>&1 - echo -n "." fi asg_spot_info=$(aws autoscaling describe-auto-scaling-groups --region $CORTEX_REGION --query "AutoScalingGroups[?contains(Tags[?Key==\`alpha.eksctl.io/cluster-name\`].Value, \`$CORTEX_CLUSTER_NAME\`)]|[?contains(Tags[?Key==\`alpha.eksctl.io/nodegroup-name\`].Value, \`ng-cortex-worker-spot\`)]") echo -n "." +asg_spot_name="" asg_spot_length=$(echo "$asg_spot_info" | jq -r 'length') if (( "$asg_spot_length" > "0" )); then asg_spot_name=$(echo "$asg_spot_info" | jq -r 'first | .AutoScalingGroupName') - aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names $asg_spot_name --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-info-spot" 2>&1 + aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names $asg_spot_name --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-spot" 2>&1 echo -n "." aws autoscaling describe-scaling-activities --max-items 1000 --auto-scaling-group-name $asg_spot_name --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-activities-spot" 2>&1 echo -n "." fi -echo -n "." -echo -n "." +# failsafe in case the asg(s) could not be located +if [ "$asg_on_demand_name" == "" ] && [ "$asg_spot_name" == "" ]; then + aws autoscaling describe-auto-scaling-groups --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asgs" 2>&1 + echo -n "." + aws autoscaling describe-scaling-activities --max-items 1000 --region=$CORTEX_REGION --output json > "/cortex-debug/aws/asg-activities" 2>&1 + echo -n "." +fi + aws ec2 describe-instances --filters Name=tag:cortex.dev/cluster-name,Values=$CORTEX_CLUSTER_NAME --region=$CORTEX_REGION --output json > "/cortex-debug/aws/instances" 2>&1 echo -n "." aws ec2 describe-instance-status --include-all-instances --region=$CORTEX_REGION --output json > "/cortex-debug/aws/instance-statuses" 2>&1