diff --git a/manager/install.sh b/manager/install.sh
index 2206d4a88c..9992b18920 100755
--- a/manager/install.sh
+++ b/manager/install.sh
@@ -352,30 +352,53 @@ function start_pre_download_images() {
   export CORTEX_IMAGE_TENSORFLOW_SERVING_INF="${registry}/tensorflow-serving-inf:${CORTEX_VERSION}"
   export CORTEX_IMAGE_TENSORFLOW_PREDICTOR="${registry}/tensorflow-predictor:${CORTEX_VERSION}"
 
-  if [[ "$CORTEX_INSTANCE_TYPE" == p* ]] || [[ "$CORTEX_INSTANCE_TYPE" == g* ]] || [ -n "$CORTEX_ACCELERATOR_TYPE" ]; then
+  # the cpu downloader is applied unconditionally; gpu/inf downloaders are added below only when a matching node group exists
+  envsubst < manifests/image-downloader-cpu.yaml | kubectl apply -f - &>/dev/null
+
+  has_gpu="false"
+  has_inf="false"
+
+  # read every node group's instance type with a single yq call; "// []" keeps a
+  # cluster config without node_groups from failing the jq filter
+  cluster_config_file="/in/cluster_${CORTEX_CLUSTER_NAME}_${CORTEX_REGION}.yaml"
+  ng_instance_types="$(yq -r '(.node_groups // [])[].instance_type' "$cluster_config_file")"
+  while IFS= read -r ng_instance_type; do
+    if [[ "$ng_instance_type" == p* || "$ng_instance_type" == g* ]]; then
+      has_gpu="true"
+    fi
+    if [[ "$ng_instance_type" == inf* ]]; then
+      has_inf="true"
+    fi
+  done <<< "$ng_instance_types"
+
+  if [[ "$has_gpu" == "true" ]]; then
     envsubst < manifests/image-downloader-gpu.yaml | kubectl apply -f - &>/dev/null
-  elif [[ "$CORTEX_INSTANCE_TYPE" == inf* ]]; then
+  fi
+
+  if [[ "$has_inf" == "true" ]]; then
     envsubst < manifests/image-downloader-inf.yaml | kubectl apply -f - &>/dev/null
-  else
-    envsubst < manifests/image-downloader-cpu.yaml | kubectl apply -f - &>/dev/null
   fi
 }
 
 function await_pre_download_images() {
-  if kubectl get daemonset image-downloader -n=default &>/dev/null; then
-    echo -n "○ downloading docker images "
-    printed_dot="false"
+  # wait for each image-downloader daemonset that exists to become ready, then delete it
+  echo -n "○ downloading docker images ."
+  for ds_name in image-downloader-cpu image-downloader-gpu image-downloader-inf; do
+    # check the same namespace (default) that is polled and cleaned up below
+    if ! kubectl get daemonset "$ds_name" -n=default &>/dev/null; then
+      continue
+    fi
     i=0
-    until [ "$(kubectl get daemonset image-downloader -n=default -o 'jsonpath={.status.numberReady}')" == "$(kubectl get daemonset image-downloader -n=default -o 'jsonpath={.status.desiredNumberScheduled}')" ]; do
+    until [ "$(kubectl get daemonset "$ds_name" -n=default -o 'jsonpath={.status.numberReady}')" == "$(kubectl get daemonset "$ds_name" -n=default -o 'jsonpath={.status.desiredNumberScheduled}')" ]; do
       if [ $i -eq 120 ]; then break; fi # give up after 6 minutes
       echo -n "."
-      printed_dot="true"
       ((i=i+1))
       sleep 3
     done
-    kubectl -n=default delete --ignore-not-found=true daemonset image-downloader &>/dev/null
-    if [ "$printed_dot" == "true" ]; then echo " ✓"; else echo "✓"; fi
-  fi
+    kubectl -n=default delete --ignore-not-found=true daemonset "$ds_name" &>/dev/null
+  done
+
+  echo " ✓"
 }
 
 function validate_cortex() {
diff --git a/manager/manifests/image-downloader-cpu.yaml b/manager/manifests/image-downloader-cpu.yaml
index 263b6ab1d4..e0ccbda0d6 100644
--- a/manager/manifests/image-downloader-cpu.yaml
+++ b/manager/manifests/image-downloader-cpu.yaml
@@ -15,16 +15,16 @@
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
-  name: image-downloader
+  name: image-downloader-cpu
   namespace: default
 spec:
   selector:
     matchLabels:
-      name: image-downloader
+      name: image-downloader-cpu
   template:
     metadata:
       labels:
-        name: image-downloader
+        name: image-downloader-cpu
     spec:
       nodeSelector:
         workload: "true"
@@ -33,6 +33,13 @@ spec:
           value: "true"
           operator: Equal
           effect: NoSchedule
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+        - key: aws.amazon.com/neuron
+          value: "true"
+          operator: Equal
+          effect: NoSchedule
       terminationGracePeriodSeconds: 0
       containers:
         - name: python-predictor-cpu
diff --git a/manager/manifests/image-downloader-gpu.yaml b/manager/manifests/image-downloader-gpu.yaml
index 15e0465c1c..415cdaedcf 100644
--- a/manager/manifests/image-downloader-gpu.yaml
+++ b/manager/manifests/image-downloader-gpu.yaml
@@ -15,19 +15,20 @@
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
-  name: image-downloader
+  name: image-downloader-gpu
   namespace: default
 spec:
   selector:
     matchLabels:
-      name: image-downloader
+      name: image-downloader-gpu
   template:
     metadata:
       labels:
-        name: image-downloader
+        name: image-downloader-gpu
     spec:
       nodeSelector:
         workload: "true"
+        nvidia.com/gpu: "true"
       tolerations:
         - key: workload
           value: "true"
@@ -46,11 +47,3 @@ spec:
           image: $CORTEX_IMAGE_TENSORFLOW_SERVING_GPU
           command: ["/bin/sh"]
           args: ["-c", "sleep 1000000"]
-        - name: tensorflow-predictor
-          image: $CORTEX_IMAGE_TENSORFLOW_PREDICTOR
-          command: ["/bin/sh"]
-          args: ["-c", "sleep 1000000"]
-        - name: downloader
-          image: $CORTEX_IMAGE_DOWNLOADER
-          command: ["/bin/sh"]
-          args: ["-c", "sleep 1000000"]
diff --git a/manager/manifests/image-downloader-inf.yaml b/manager/manifests/image-downloader-inf.yaml
index 5e97d54728..2243b2c4bc 100644
--- a/manager/manifests/image-downloader-inf.yaml
+++ b/manager/manifests/image-downloader-inf.yaml
@@ -15,19 +15,20 @@
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
-  name: image-downloader
+  name: image-downloader-inf
   namespace: default
 spec:
   selector:
     matchLabels:
-      name: image-downloader
+      name: image-downloader-inf
   template:
     metadata:
       labels:
-        name: image-downloader
+        name: image-downloader-inf
     spec:
       nodeSelector:
         workload: "true"
+        aws.amazon.com/neuron: "true"
       tolerations:
         - key: workload
           value: "true"
@@ -47,14 +48,6 @@ spec:
           image: $CORTEX_IMAGE_TENSORFLOW_SERVING_INF
           command: ["/bin/sh"]
           args: ["-c", "sleep 1000000"]
-        - name: tensorflow-predictor
-          image: $CORTEX_IMAGE_TENSORFLOW_PREDICTOR
-          command: ["/bin/sh"]
-          args: ["-c", "sleep 1000000"]
-        - name: downloader
-          image: $CORTEX_IMAGE_DOWNLOADER
-          command: ["/bin/sh"]
-          args: ["-c", "sleep 1000000"]
         - name: neuron-rtd
           image: $CORTEX_IMAGE_NEURON_RTD
           command: ["/bin/sh"]