cortexlabs · RobertLucian · Mar 29, 2021 · Mar 29, 2021 · Mar 29, 2021 · Mar 29, 2021
diff --git a/manager/install.sh b/manager/install.sh
@@ -352,30 +352,48 @@ function start_pre_download_images() {
   export CORTEX_IMAGE_TENSORFLOW_SERVING_INF="${registry}/tensorflow-serving-inf:${CORTEX_VERSION}"
   export CORTEX_IMAGE_TENSORFLOW_PREDICTOR="${registry}/tensorflow-predictor:${CORTEX_VERSION}"
 
-  if [[ "$CORTEX_INSTANCE_TYPE" == p* ]] || [[ "$CORTEX_INSTANCE_TYPE" == g* ]] || [ -n "$CORTEX_ACCELERATOR_TYPE" ]; then
+  envsubst < manifests/image-downloader-cpu.yaml | kubectl apply -f - &>/dev/null
+
+  has_gpu="false"
+  has_inf="false"
+
+  cluster_config_len=$(cat /in/cluster_${CORTEX_CLUSTER_NAME}_${CORTEX_REGION}.yaml | yq -r .node_groups | yq -r length)
+  for idx in $(seq 0 $(($cluster_config_len-1))); do
+    ng_instance_type=$(cat /in/cluster_${CORTEX_CLUSTER_NAME}_${CORTEX_REGION}.yaml | yq -r .node_groups[$idx].instance_type)
+    if [[ "$ng_instance_type" == p* || "$ng_instance_type" == g* ]]; then
+      has_gpu="true"
+    fi
+    if [[ "$ng_instance_type" == inf* ]]; then
+      has_inf="true"
+    fi
+  done
+
+  if [ "$has_gpu" == "true" ]; then
     envsubst < manifests/image-downloader-gpu.yaml | kubectl apply -f - &>/dev/null
-  elif [[ "$CORTEX_INSTANCE_TYPE" == inf* ]]; then
+  fi
+
+  if [ "$has_inf" == "true" ]; then
     envsubst < manifests/image-downloader-inf.yaml | kubectl apply -f - &>/dev/null
-  else
-    envsubst < manifests/image-downloader-cpu.yaml | kubectl apply -f - &>/dev/null
   fi
 }
 
 function await_pre_download_images() {
-  if kubectl get daemonset image-downloader -n=default &>/dev/null; then
-    echo -n "￮ downloading docker images "
-    printed_dot="false"
+  echo -n "￮ downloading docker images ."
+  for ds_name in image-downloader-cpu image-downloader-gpu image-downloader-inf; do  # one daemonset per manifest; gpu/inf may not have been applied
+    if ! kubectl get daemonset "$ds_name" -n=default &>/dev/null; then  # same namespace as the wait/delete calls below
+      continue  # this downloader does not exist in the cluster, nothing to wait for
+    fi
     i=0
-    until [ "$(kubectl get daemonset image-downloader -n=default -o 'jsonpath={.status.numberReady}')" == "$(kubectl get daemonset image-downloader -n=default -o 'jsonpath={.status.desiredNumberScheduled}')" ]; do
+    until [ "$(kubectl get daemonset "$ds_name" -n=default -o 'jsonpath={.status.numberReady}')" == "$(kubectl get daemonset "$ds_name" -n=default -o 'jsonpath={.status.desiredNumberScheduled}')" ]; do
       if [ $i -eq 120 ]; then break; fi  # give up after 6 minutes
       echo -n "."
-      printed_dot="true"
       ((i=i+1))
       sleep 3
     done
-    kubectl -n=default delete --ignore-not-found=true daemonset image-downloader &>/dev/null
-    if [ "$printed_dot" == "true" ]; then echo " ✓"; else echo "✓"; fi
-  fi
+    kubectl -n=default delete --ignore-not-found=true daemonset "$ds_name" &>/dev/null  # remove the downloader whether it became ready or timed out
+  done
+
+  echo " ✓"
 }
 
 function validate_cortex() {

diff --git a/manager/manifests/image-downloader-cpu.yaml b/manager/manifests/image-downloader-cpu.yaml
@@ -15,16 +15,16 @@
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
-  name: image-downloader
+  name: image-downloader-cpu
   namespace: default
 spec:
   selector:
     matchLabels:
-      name: image-downloader
+      name: image-downloader-cpu
   template:
     metadata:
       labels:
-        name: image-downloader
+        name: image-downloader-cpu
     spec:
       nodeSelector:
         workload: "true"
@@ -33,6 +33,13 @@ spec:
           value: "true"
           operator: Equal
           effect: NoSchedule
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+        - key: aws.amazon.com/neuron
+          value: "true"
+          operator: Equal
+          effect: NoSchedule
       terminationGracePeriodSeconds: 0
       containers:
         - name: python-predictor-cpu

diff --git a/manager/manifests/image-downloader-gpu.yaml b/manager/manifests/image-downloader-gpu.yaml
@@ -15,19 +15,20 @@
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
-  name: image-downloader
+  name: image-downloader-gpu
   namespace: default
 spec:
   selector:
     matchLabels:
-      name: image-downloader
+      name: image-downloader-gpu
   template:
     metadata:
       labels:
-        name: image-downloader
+        name: image-downloader-gpu
     spec:
       nodeSelector:
         workload: "true"
+        nvidia.com/gpu: "true"
       tolerations:
         - key: workload
           value: "true"
@@ -46,11 +47,3 @@ spec:
           image: $CORTEX_IMAGE_TENSORFLOW_SERVING_GPU
           command: ["/bin/sh"]
           args: ["-c", "sleep 1000000"]
-        - name: tensorflow-predictor
-          image: $CORTEX_IMAGE_TENSORFLOW_PREDICTOR
-          command: ["/bin/sh"]
-          args: ["-c", "sleep 1000000"]
-        - name: downloader
-          image: $CORTEX_IMAGE_DOWNLOADER
-          command: ["/bin/sh"]
-          args: ["-c", "sleep 1000000"]
diff --git a/manager/manifests/image-downloader-inf.yaml b/manager/manifests/image-downloader-inf.yaml
@@ -15,19 +15,20 @@
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
-  name: image-downloader
+  name: image-downloader-inf
   namespace: default
 spec:
   selector:
     matchLabels:
-      name: image-downloader
+      name: image-downloader-inf
   template:
     metadata:
       labels:
-        name: image-downloader
+        name: image-downloader-inf
     spec:
       nodeSelector:
         workload: "true"
+        aws.amazon.com/neuron: "true"
       tolerations:
         - key: workload
           value: "true"
@@ -47,14 +48,6 @@ spec:
           image: $CORTEX_IMAGE_TENSORFLOW_SERVING_INF
           command: ["/bin/sh"]
           args: ["-c", "sleep 1000000"]
-        - name: tensorflow-predictor
-          image: $CORTEX_IMAGE_TENSORFLOW_PREDICTOR
-          command: ["/bin/sh"]
-          args: ["-c", "sleep 1000000"]
-        - name: downloader
-          image: $CORTEX_IMAGE_DOWNLOADER
-          command: ["/bin/sh"]
-          args: ["-c", "sleep 1000000"]
         - name: neuron-rtd
           image: $CORTEX_IMAGE_NEURON_RTD
           command: ["/bin/sh"]