Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 30 additions & 12 deletions manager/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -352,30 +352,48 @@ function start_pre_download_images() {
export CORTEX_IMAGE_TENSORFLOW_SERVING_INF="${registry}/tensorflow-serving-inf:${CORTEX_VERSION}"
export CORTEX_IMAGE_TENSORFLOW_PREDICTOR="${registry}/tensorflow-predictor:${CORTEX_VERSION}"

if [[ "$CORTEX_INSTANCE_TYPE" == p* ]] || [[ "$CORTEX_INSTANCE_TYPE" == g* ]] || [ -n "$CORTEX_ACCELERATOR_TYPE" ]; then
envsubst < manifests/image-downloader-cpu.yaml | kubectl apply -f - &>/dev/null

has_gpu="false"
has_inf="false"

cluster_config_len=$(cat /in/cluster_${CORTEX_CLUSTER_NAME}_${CORTEX_REGION}.yaml | yq -r .node_groups | yq -r length)
for idx in $(seq 0 $(($cluster_config_len-1))); do
ng_instance_type=$(cat /in/cluster_${CORTEX_CLUSTER_NAME}_${CORTEX_REGION}.yaml | yq -r .node_groups[$idx].instance_type)
if [[ "$ng_instance_type" == p* || "$ng_instance_type" == g* ]]; then
has_gpu="true"
fi
if [[ "$ng_instance_type" == inf* ]]; then
has_inf="true"
fi
done

if [ "$has_gpu" == "true" ]; then
envsubst < manifests/image-downloader-gpu.yaml | kubectl apply -f - &>/dev/null
elif [[ "$CORTEX_INSTANCE_TYPE" == inf* ]]; then
fi

if [ "$has_inf" == "true" ]; then
envsubst < manifests/image-downloader-inf.yaml | kubectl apply -f - &>/dev/null
else
envsubst < manifests/image-downloader-cpu.yaml | kubectl apply -f - &>/dev/null
fi
}

# Waits for the image pre-download daemonsets to finish, then removes them.
#
# For each of the image-downloader-cpu / -gpu / -inf daemonsets that exists in
# the "default" namespace, polls until .status.numberReady equals
# .status.desiredNumberScheduled (or the poll budget is exhausted), and then
# deletes the daemonset.
#
# Arguments: none
# Outputs:   a single progress line on stdout ("○ downloading docker images ."
#            followed by one "." per poll and a trailing " ✓")
function await_pre_download_images() {
  local -r max_polls=120      # 120 polls * 3s sleep: give up after 6 minutes
  local -r poll_interval=3
  local ds_name
  local i

  echo -n "○ downloading docker images ."

  for ds_name in image-downloader-cpu image-downloader-gpu image-downloader-inf; do
    # Only the daemonsets that were actually applied exist; skip the others.
    # The namespace is pinned here to match every other kubectl call below.
    if ! kubectl get daemonset "$ds_name" -n=default &>/dev/null; then
      continue
    fi

    i=0
    until [ "$(kubectl get daemonset "$ds_name" -n=default -o 'jsonpath={.status.numberReady}')" == "$(kubectl get daemonset "$ds_name" -n=default -o 'jsonpath={.status.desiredNumberScheduled}')" ]; do
      if [ "$i" -eq "$max_polls" ]; then break; fi
      echo -n "."
      i=$((i + 1))
      sleep "$poll_interval"
    done

    # Best-effort cleanup: the daemonset is removed whether it became ready or
    # the wait timed out.
    kubectl -n=default delete --ignore-not-found=true daemonset "$ds_name" &>/dev/null
  done

  echo " ✓"
}

function validate_cortex() {
Expand Down
13 changes: 10 additions & 3 deletions manager/manifests/image-downloader-cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,16 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: image-downloader
name: image-downloader-cpu
namespace: default
spec:
selector:
matchLabels:
name: image-downloader
name: image-downloader-cpu
template:
metadata:
labels:
name: image-downloader
name: image-downloader-cpu
spec:
nodeSelector:
workload: "true"
Expand All @@ -33,6 +33,13 @@ spec:
value: "true"
operator: Equal
effect: NoSchedule
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
- key: aws.amazon.com/neuron
value: "true"
operator: Equal
effect: NoSchedule
terminationGracePeriodSeconds: 0
containers:
- name: python-predictor-cpu
Expand Down
15 changes: 4 additions & 11 deletions manager/manifests/image-downloader-gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,20 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: image-downloader
name: image-downloader-gpu
namespace: default
spec:
selector:
matchLabels:
name: image-downloader
name: image-downloader-gpu
template:
metadata:
labels:
name: image-downloader
name: image-downloader-gpu
spec:
nodeSelector:
workload: "true"
nvidia.com/gpu: "true"
tolerations:
- key: workload
value: "true"
Expand All @@ -46,11 +47,3 @@ spec:
image: $CORTEX_IMAGE_TENSORFLOW_SERVING_GPU
command: ["/bin/sh"]
args: ["-c", "sleep 1000000"]
- name: tensorflow-predictor
image: $CORTEX_IMAGE_TENSORFLOW_PREDICTOR
command: ["/bin/sh"]
args: ["-c", "sleep 1000000"]
- name: downloader
image: $CORTEX_IMAGE_DOWNLOADER
command: ["/bin/sh"]
args: ["-c", "sleep 1000000"]
15 changes: 4 additions & 11 deletions manager/manifests/image-downloader-inf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,20 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: image-downloader
name: image-downloader-inf
namespace: default
spec:
selector:
matchLabels:
name: image-downloader
name: image-downloader-inf
template:
metadata:
labels:
name: image-downloader
name: image-downloader-inf
spec:
nodeSelector:
workload: "true"
aws.amazon.com/neuron: "true"
tolerations:
- key: workload
value: "true"
Expand All @@ -47,14 +48,6 @@ spec:
image: $CORTEX_IMAGE_TENSORFLOW_SERVING_INF
command: ["/bin/sh"]
args: ["-c", "sleep 1000000"]
- name: tensorflow-predictor
image: $CORTEX_IMAGE_TENSORFLOW_PREDICTOR
command: ["/bin/sh"]
args: ["-c", "sleep 1000000"]
- name: downloader
image: $CORTEX_IMAGE_DOWNLOADER
command: ["/bin/sh"]
args: ["-c", "sleep 1000000"]
- name: neuron-rtd
image: $CORTEX_IMAGE_NEURON_RTD
command: ["/bin/sh"]
Expand Down