From 6b727a47964c986afc9344216923da5677101d7a Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Mon, 29 Mar 2021 18:12:14 +0300 Subject: [PATCH 1/6] Fix downloader daemonsets --- manager/install.sh | 55 +++++++++++++++++---- manager/manifests/image-downloader-cpu.yaml | 13 +++-- manager/manifests/image-downloader-gpu.yaml | 14 ++---- manager/manifests/image-downloader-inf.yaml | 14 ++---- 4 files changed, 61 insertions(+), 35 deletions(-) diff --git a/manager/install.sh b/manager/install.sh index 2206d4a88c..92f46054ff 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -352,30 +352,65 @@ function start_pre_download_images() { export CORTEX_IMAGE_TENSORFLOW_SERVING_INF="${registry}/tensorflow-serving-inf:${CORTEX_VERSION}" export CORTEX_IMAGE_TENSORFLOW_PREDICTOR="${registry}/tensorflow-predictor:${CORTEX_VERSION}" - if [[ "$CORTEX_INSTANCE_TYPE" == p* ]] || [[ "$CORTEX_INSTANCE_TYPE" == g* ]] || [ -n "$CORTEX_ACCELERATOR_TYPE" ]; then + envsubst < manifests/image-downloader-cpu.yaml | kubectl apply -f - &>/dev/null + + has_gpu="false" + has_inf="false" + + cluster_config_len=$(cat /in/cluster_${CORTEX_CLUSTER_NAME}_${CORTEX_REGION}.yaml | yq -r .node_groups | yq -r length) + for idx in $(seq 0 $(($cluster_config_len-1))); do + ng_instance_type=$(cat /in/cluster_${CORTEX_CLUSTER_NAME}_${CORTEX_REGION}.yaml | yq -r .node_groups[$idx].instance_type) + if [[ "$has_gpu" == "false" && ( "$ng_instance_type" == p* || "$ng_instance_type" == g* ) ]]; then + has_gpu="true" + fi + if [[ "$has_inf" == "false" && "$ng_instance_type" == inf* ]]; then + has_inf="true" + fi + done + + if [ "$has_gpu" == "true" ]; then envsubst < manifests/image-downloader-gpu.yaml | kubectl apply -f - &>/dev/null - elif [[ "$CORTEX_INSTANCE_TYPE" == inf* ]]; then + fi + + if [ "$has_inf" == "true" ]; then envsubst < manifests/image-downloader-inf.yaml | kubectl apply -f - &>/dev/null - else - envsubst < manifests/image-downloader-cpu.yaml | kubectl apply -f - &>/dev/null fi } function await_pre_download_images() { - if kubectl get daemonset image-downloader -n=default &>/dev/null; then - echo -n "○ downloading docker images " + daemonsets=( "image-downloader-cpu" ) + + cluster_config_len=$(cat /in/cluster_${CORTEX_CLUSTER_NAME}_${CORTEX_REGION}.yaml | yq -r .node_groups | yq -r length) + for idx in $(seq 0 $(($cluster_config_len-1))); do + ng_instance_type=$(cat /in/cluster_${CORTEX_CLUSTER_NAME}_${CORTEX_REGION}.yaml | yq -r .node_groups[$idx].instance_type) + if [[ "$has_gpu" == "false" && ( "$ng_instance_type" == p* || "$ng_instance_type" == g* ) ]]; then + daemonsets+=( "image-downloader-gpu" ) + fi + if [[ "$has_inf" == "false" && "$ng_instance_type" == inf* ]]; then + daemonsets+=( "image-downloader-inf" ) + fi + done + + echo -n "○ downloading docker images " + printed_dot_count=0 + for ds_name in ${daemonsets[@]}; do printed_dot="false" i=0 - until [ "$(kubectl get daemonset image-downloader -n=default -o 'jsonpath={.status.numberReady}')" == "$(kubectl get daemonset image-downloader -n=default -o 'jsonpath={.status.desiredNumberScheduled}')" ]; do + until [ "$(kubectl get daemonset $ds_name -n=default -o 'jsonpath={.status.numberReady}')" == "$(kubectl get daemonset $ds_name -n=default -o 'jsonpath={.status.desiredNumberScheduled}')" ]; do if [ $i -eq 120 ]; then break; fi # give up after 6 minutes echo -n "." printed_dot="true" ((i=i+1)) sleep 3 done - kubectl -n=default delete --ignore-not-found=true daemonset image-downloader &>/dev/null - if [ "$printed_dot" == "true" ]; then echo " ✓"; else echo "✓"; fi - fi + kubectl -n=default delete --ignore-not-found=true daemonset $ds_name &>/dev/null + if [ "$printed_dot" == "true" ]; then + printed_dot_count=$(($printed_dot_count+1)) + fi + done + + + if [ "$printed_dot_count" == "${#daemonsets[@]}" ]; then echo " ✓"; else echo "✓"; fi } function validate_cortex() { diff --git a/manager/manifests/image-downloader-cpu.yaml b/manager/manifests/image-downloader-cpu.yaml index 263b6ab1d4..e0ccbda0d6 100644 --- a/manager/manifests/image-downloader-cpu.yaml +++ b/manager/manifests/image-downloader-cpu.yaml @@ -15,16 +15,16 @@ apiVersion: apps/v1 kind: DaemonSet metadata: - name: image-downloader + name: image-downloader-cpu namespace: default spec: selector: matchLabels: - name: image-downloader + name: image-downloader-cpu template: metadata: labels: - name: image-downloader + name: image-downloader-cpu spec: nodeSelector: workload: "true" @@ -33,6 +33,13 @@ spec: value: "true" operator: Equal effect: NoSchedule + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + - key: aws.amazon.com/neuron + value: "true" + operator: Equal + effect: NoSchedule terminationGracePeriodSeconds: 0 containers: - name: python-predictor-cpu diff --git a/manager/manifests/image-downloader-gpu.yaml b/manager/manifests/image-downloader-gpu.yaml index 15e0465c1c..a3c17b4b78 100644 --- a/manager/manifests/image-downloader-gpu.yaml +++ b/manager/manifests/image-downloader-gpu.yaml @@ -15,16 +15,16 @@ apiVersion: apps/v1 kind: DaemonSet metadata: - name: image-downloader + name: image-downloader-gpu namespace: default spec: selector: matchLabels: - name: image-downloader + name: image-downloader-gpu template: metadata: labels: - name: image-downloader + name: image-downloader-gpu spec: nodeSelector: workload: "true" @@ -46,11 +46,3 @@ spec: image: $CORTEX_IMAGE_TENSORFLOW_SERVING_GPU command: ["/bin/sh"] args: ["-c", "sleep 1000000"] - - name: tensorflow-predictor - image: $CORTEX_IMAGE_TENSORFLOW_PREDICTOR - command: ["/bin/sh"] - args: ["-c", "sleep 1000000"] - - name: downloader - image: $CORTEX_IMAGE_DOWNLOADER - command: ["/bin/sh"] - args: ["-c", "sleep 1000000"] diff --git a/manager/manifests/image-downloader-inf.yaml b/manager/manifests/image-downloader-inf.yaml index 5e97d54728..c69c28f97c 100644 --- a/manager/manifests/image-downloader-inf.yaml +++ b/manager/manifests/image-downloader-inf.yaml @@ -15,16 +15,16 @@ apiVersion: apps/v1 kind: DaemonSet metadata: - name: image-downloader + name: image-downloader-inf namespace: default spec: selector: matchLabels: - name: image-downloader + name: image-downloader-inf template: metadata: labels: - name: image-downloader + name: image-downloader-inf spec: nodeSelector: workload: "true" @@ -47,14 +47,6 @@ spec: image: $CORTEX_IMAGE_TENSORFLOW_SERVING_INF command: ["/bin/sh"] args: ["-c", "sleep 1000000"] - - name: tensorflow-predictor - image: $CORTEX_IMAGE_TENSORFLOW_PREDICTOR - command: ["/bin/sh"] - args: ["-c", "sleep 1000000"] - - name: downloader - image: $CORTEX_IMAGE_DOWNLOADER - command: ["/bin/sh"] - args: ["-c", "sleep 1000000"] - name: neuron-rtd image: $CORTEX_IMAGE_NEURON_RTD command: ["/bin/sh"] From 33d0e9c819ac445ba5bc44ea31485daf9728c5cf Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Mon, 29 Mar 2021 21:54:54 +0300 Subject: [PATCH 2/6] Fix downloader ds logic --- manager/install.sh | 5 +++++ manager/manifests/image-downloader-gpu.yaml | 1 + manager/manifests/image-downloader-inf.yaml | 1 + 3 files changed, 7 insertions(+) diff --git a/manager/install.sh b/manager/install.sh index 92f46054ff..467da57d02 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -380,14 +380,19 @@ function start_pre_download_images() { function await_pre_download_images() { daemonsets=( "image-downloader-cpu" ) + has_gpu="false" + has_inf="false" + cluster_config_len=$(cat /in/cluster_${CORTEX_CLUSTER_NAME}_${CORTEX_REGION}.yaml | yq -r .node_groups | yq -r length) for idx in $(seq 0 $(($cluster_config_len-1))); do ng_instance_type=$(cat /in/cluster_${CORTEX_CLUSTER_NAME}_${CORTEX_REGION}.yaml | yq -r .node_groups[$idx].instance_type) if [[ "$has_gpu" == "false" && ( "$ng_instance_type" == p* || "$ng_instance_type" == g* ) ]]; then daemonsets+=( "image-downloader-gpu" ) + has_gpu="true" fi if [[ "$has_inf" == "false" && "$ng_instance_type" == inf* ]]; then daemonsets+=( "image-downloader-inf" ) + has_inf="true" fi done diff --git a/manager/manifests/image-downloader-gpu.yaml b/manager/manifests/image-downloader-gpu.yaml index a3c17b4b78..415cdaedcf 100644 --- a/manager/manifests/image-downloader-gpu.yaml +++ b/manager/manifests/image-downloader-gpu.yaml @@ -28,6 +28,7 @@ spec: spec: nodeSelector: workload: "true" + nvidia.com/gpu: "true" tolerations: - key: workload value: "true" diff --git a/manager/manifests/image-downloader-inf.yaml b/manager/manifests/image-downloader-inf.yaml index c69c28f97c..2243b2c4bc 100644 --- a/manager/manifests/image-downloader-inf.yaml +++ b/manager/manifests/image-downloader-inf.yaml @@ -28,6 +28,7 @@ spec: spec: nodeSelector: workload: "true" + aws.amazon.com/neuron: "true" tolerations: - key: workload value: "true" From eb4761f2feb73829f0e4bca2dab5d2912dabe633 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Tue, 30 Mar 2021 01:33:35 +0300 Subject: [PATCH 3/6] Address some PR comments --- manager/install.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/manager/install.sh b/manager/install.sh index 467da57d02..58e4946a7e 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -360,10 +360,10 @@ function start_pre_download_images() { cluster_config_len=$(cat /in/cluster_${CORTEX_CLUSTER_NAME}_${CORTEX_REGION}.yaml | yq -r .node_groups | yq -r length) for idx in $(seq 0 $(($cluster_config_len-1))); do ng_instance_type=$(cat /in/cluster_${CORTEX_CLUSTER_NAME}_${CORTEX_REGION}.yaml | yq -r .node_groups[$idx].instance_type) - if [[ "$has_gpu" == "false" && ( "$ng_instance_type" == p* || "$ng_instance_type" == g* ) ]]; then + if [[ "$ng_instance_type" == p* || "$ng_instance_type" == g* ]]; then has_gpu="true" fi - if [[ "$has_inf" == "false" && "$ng_instance_type" == inf* ]]; then + if [[ "$ng_instance_type" == inf* ]]; then has_inf="true" fi done From ee539e8b21529c320e4bd5d1d28db9f21a082958 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Tue, 30 Mar 2021 02:06:28 +0300 Subject: [PATCH 4/6] Remove the tracking of printed dots --- manager/install.sh | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/manager/install.sh b/manager/install.sh index 58e4946a7e..6330d8ffd1 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -397,25 +397,18 @@ function await_pre_download_images() { done echo -n "○ downloading docker images " - printed_dot_count=0 for ds_name in ${daemonsets[@]}; do - printed_dot="false" i=0 until [ "$(kubectl get daemonset $ds_name -n=default -o 'jsonpath={.status.numberReady}')" == "$(kubectl get daemonset $ds_name -n=default -o 'jsonpath={.status.desiredNumberScheduled}')" ]; do if [ $i -eq 120 ]; then break; fi # give up after 6 minutes echo -n "." - printed_dot="true" ((i=i+1)) sleep 3 done kubectl -n=default delete --ignore-not-found=true daemonset $ds_name &>/dev/null - if [ "$printed_dot" == "true" ]; then - printed_dot_count=$(($printed_dot_count+1)) - fi done - - if [ "$printed_dot_count" == "${#daemonsets[@]}" ]; then echo " ✓"; else echo "✓"; fi + echo " ✓" } function validate_cortex() { From 54f4dfa0ad16d1095005d66ef8d276092e80e579 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Tue, 30 Mar 2021 02:40:59 +0300 Subject: [PATCH 5/6] Address PR comments --- manager/install.sh | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/manager/install.sh b/manager/install.sh index 6330d8ffd1..b83b1786be 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -378,26 +378,13 @@ function start_pre_download_images() { } function await_pre_download_images() { - daemonsets=( "image-downloader-cpu" ) + daemonsets=( "image-downloader-cpu" "image-downloader-gpu" "image-downloader-inf" ) - has_gpu="false" - has_inf="false" - - cluster_config_len=$(cat /in/cluster_${CORTEX_CLUSTER_NAME}_${CORTEX_REGION}.yaml | yq -r .node_groups | yq -r length) - for idx in $(seq 0 $(($cluster_config_len-1))); do - ng_instance_type=$(cat /in/cluster_${CORTEX_CLUSTER_NAME}_${CORTEX_REGION}.yaml | yq -r .node_groups[$idx].instance_type) - if [[ "$has_gpu" == "false" && ( "$ng_instance_type" == p* || "$ng_instance_type" == g* ) ]]; then - daemonsets+=( "image-downloader-gpu" ) - has_gpu="true" - fi - if [[ "$has_inf" == "false" && "$ng_instance_type" == inf* ]]; then - daemonsets+=( "image-downloader-inf" ) - has_inf="true" - fi - done - - echo -n "○ downloading docker images " + echo -n "○ downloading docker images ." for ds_name in ${daemonsets[@]}; do + if ! kubectl get daemonset $ds_name > /dev/null 2>&1; then + continue + fi i=0 until [ "$(kubectl get daemonset $ds_name -n=default -o 'jsonpath={.status.numberReady}')" == "$(kubectl get daemonset $ds_name -n=default -o 'jsonpath={.status.desiredNumberScheduled}')" ]; do if [ $i -eq 120 ]; then break; fi # give up after 6 minutes From 250cb194ab40a56dc040468a0be16f11493539b1 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Tue, 30 Mar 2021 02:42:44 +0300 Subject: [PATCH 6/6] Remove ds array --- manager/install.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/manager/install.sh b/manager/install.sh index b83b1786be..9992b18920 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -378,10 +378,8 @@ function start_pre_download_images() { } function await_pre_download_images() { - daemonsets=( "image-downloader-cpu" "image-downloader-gpu" "image-downloader-inf" ) - echo -n "○ downloading docker images ." - for ds_name in ${daemonsets[@]}; do + for ds_name in image-downloader-cpu image-downloader-gpu image-downloader-inf; do if ! kubectl get daemonset $ds_name > /dev/null 2>&1; then continue fi