From d3b6f5a649042d9f9142a73501f042b1dd46b55b Mon Sep 17 00:00:00 2001 From: Omer Spillinger Date: Wed, 3 Jul 2019 15:18:26 -0700 Subject: [PATCH] Add GPU support --- Makefile | 2 + cortex.sh | 2 + dev/eks.sh | 45 ---------- dev/kops.sh | 163 ---------------------------------- dev/registry.sh | 2 + docs/apis/compute.md | 4 +- docs/cluster/config.md | 1 + docs/cluster/development.md | 1 + images/nvidia/Dockerfile | 1 + manager/install_cortex.sh | 1 + manager/install_eks.sh | 3 +- manager/manifests/nvidia.yaml | 56 ++++++++++++ manager/uninstall_operator.sh | 5 +- 13 files changed, 75 insertions(+), 211 deletions(-) delete mode 100755 dev/eks.sh delete mode 100755 dev/kops.sh create mode 100644 images/nvidia/Dockerfile create mode 100644 manager/manifests/nvidia.yaml diff --git a/Makefile b/Makefile index 324b3e0bea..2b5df8f02f 100644 --- a/Makefile +++ b/Makefile @@ -136,6 +136,7 @@ ci-build-images: @./build/build-image.sh images/argo-executor argo-executor @./build/build-image.sh images/python-packager python-packager @./build/build-image.sh images/cluster-autoscaler cluster-autoscaler + @./build/build-image.sh images/nvidia nvidia @./build/build-image.sh images/metrics-server metrics-server ci-push-images: @@ -155,6 +156,7 @@ ci-push-images: @./build/push-image.sh argo-executor @./build/push-image.sh python-packager @./build/push-image.sh cluster-autoscaler + @./build/push-image.sh nvidia @./build/push-image.sh metrics-server diff --git a/cortex.sh b/cortex.sh index 44ab0a4070..ba73b0bc45 100755 --- a/cortex.sh +++ b/cortex.sh @@ -131,6 +131,7 @@ export CORTEX_IMAGE_PYTHON_PACKAGER="${CORTEX_IMAGE_PYTHON_PACKAGER:-cortexlabs/ export CORTEX_IMAGE_TF_SERVE_GPU="${CORTEX_IMAGE_TF_SERVE_GPU:-cortexlabs/tf-serve-gpu:$CORTEX_VERSION_STABLE}" export CORTEX_IMAGE_TF_TRAIN_GPU="${CORTEX_IMAGE_TF_TRAIN_GPU:-cortexlabs/tf-train-gpu:$CORTEX_VERSION_STABLE}" export 
CORTEX_IMAGE_CLUSTER_AUTOSCALER="${CORTEX_IMAGE_CLUSTER_AUTOSCALER:-cortexlabs/cluster-autoscaler:$CORTEX_VERSION_STABLE}" +export CORTEX_IMAGE_NVIDIA="${CORTEX_IMAGE_NVIDIA:-cortexlabs/nvidia:$CORTEX_VERSION_STABLE}" export CORTEX_IMAGE_METRICS_SERVER="${CORTEX_IMAGE_METRICS_SERVER:-cortexlabs/metrics-server:$CORTEX_VERSION_STABLE}" export CORTEX_ENABLE_TELEMETRY="${CORTEX_ENABLE_TELEMETRY:-""}" @@ -182,6 +183,7 @@ function install_cortex() { -e CORTEX_IMAGE_TF_SERVE_GPU=$CORTEX_IMAGE_TF_SERVE_GPU \ -e CORTEX_IMAGE_TF_TRAIN_GPU=$CORTEX_IMAGE_TF_TRAIN_GPU \ -e CORTEX_IMAGE_CLUSTER_AUTOSCALER=$CORTEX_IMAGE_CLUSTER_AUTOSCALER \ + -e CORTEX_IMAGE_NVIDIA=$CORTEX_IMAGE_NVIDIA \ -e CORTEX_IMAGE_METRICS_SERVER=$CORTEX_IMAGE_METRICS_SERVER \ -e CORTEX_ENABLE_TELEMETRY=$CORTEX_ENABLE_TELEMETRY \ $CORTEX_IMAGE_MANAGER diff --git a/dev/eks.sh b/dev/eks.sh deleted file mode 100755 index 0509e86f13..0000000000 --- a/dev/eks.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Cortex Labs, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -set -euo pipefail - -ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. 
>/dev/null && pwd)" -source $ROOT/dev/config/k8s.sh - -function eks_set_cluster() { - eksctl utils write-kubeconfig --name=$K8S_NAME - kubectl config set-context $(kubectl config current-context) --namespace="cortex" -} - -if [ "$1" = "start" ]; then - eksctl create cluster --version=1.11 --name=$K8S_NAME --region $K8S_REGION --nodes-max $K8S_NODES_MAX_COUNT --nodes-min $K8S_NODES_MIN_COUNT --node-type=$K8S_NODE_INSTANCE_TYPE - if [ $K8S_GPU_NODES_MIN_COUNT -gt 0 ] || [ $K8S_GPU_NODES_MAX_COUNT -gt 0 ]; then - eksctl create nodegroup --version=1.11 --cluster=$K8S_NAME --nodes-max=$K8S_GPU_NODES_MAX_COUNT --nodes-min=$K8S_GPU_NODES_MIN_COUNT --node-type=$K8S_GPU_NODE_INSTANCE_TYPE --node-ami=$K8S_GPU_NODE_AMI - echo "Once the GPU nodegroup joins the cluster, run:" - echo "kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v1.11/nvidia-device-plugin.yml" - fi - eks_set_cluster - -elif [ "$1" = "update" ]; then - echo "Not implemented" - -elif [ "$1" = "stop" ]; then - eksctl delete cluster --name=$K8S_NAME - -elif [ "$1" = "set" ]; then - eks_set_cluster -fi diff --git a/dev/kops.sh b/dev/kops.sh deleted file mode 100755 index 79b24f2b57..0000000000 --- a/dev/kops.sh +++ /dev/null @@ -1,163 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Cortex Labs, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -set -euo pipefail - -ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. 
>/dev/null && pwd)" - -source $ROOT/dev/config/k8s.sh -export K8S_NAME="${K8S_NAME}.k8s.local" - -function kops_set_cluster() { - kops export kubecfg --name=$K8S_NAME --state="s3://$K8S_KOPS_BUCKET" - kubectl config set-context $(kubectl config current-context) --namespace="cortex" -} - -function create_kops_config() { - cat > $ROOT/dev/config/.k8s_kops_config.yaml << EOM -apiVersion: kops/v1alpha2 -kind: Cluster -metadata: - creationTimestamp: null - name: ${K8S_NAME} -spec: - api: - loadBalancer: - type: Public - authorization: - rbac: {} - channel: stable - cloudProvider: aws - configBase: s3://${K8S_KOPS_BUCKET}/kops/${K8S_NAME} - etcdClusters: - - etcdMembers: - - instanceGroup: master-${K8S_ZONE} - name: a - name: main - - etcdMembers: - - instanceGroup: master-${K8S_ZONE} - name: a - name: events - iam: - allowContainerRegistry: true - legacy: false - kubernetesApiAccess: - - 0.0.0.0/0 - kubernetesVersion: 1.11.6 - masterPublicName: api.${K8S_NAME} - networkCIDR: 172.20.0.0/16 - networking: - kubenet: {} - nonMasqueradeCIDR: 100.64.0.0/10 - sshAccess: - - 0.0.0.0/0 - subnets: - - cidr: 172.20.32.0/19 - name: ${K8S_ZONE} - type: Public - zone: ${K8S_ZONE} - topology: - dns: - type: Public - masters: public - nodes: public - additionalPolicies: - node: | - [ - { - "Effect": "Allow", - "Action": "s3:*", - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": "logs:DescribeLogGroups", - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": "logs:*", - "Resource": [ - "arn:aws:logs:${K8S_REGION}:*:log-group:*:*:*" - ] - } - ] - ---- - -apiVersion: kops/v1alpha2 -kind: InstanceGroup -metadata: - creationTimestamp: null - labels: - kops.k8s.io/cluster: ${K8S_NAME} - name: master-${K8S_ZONE} -spec: - image: kope.io/k8s-1.11-debian-stretch-amd64-hvm-ebs-2018-08-17 - machineType: ${K8S_MASTER_INSTANCE_TYPE} - rootVolumeSize: ${K8S_MASTER_VOLUME_SIZE} - maxSize: 1 - minSize: 1 - nodeLabels: - kops.k8s.io/instancegroup: master-${K8S_ZONE} - role: Master - 
subnets: - - ${K8S_ZONE} - ---- - -apiVersion: kops/v1alpha2 -kind: InstanceGroup -metadata: - creationTimestamp: null - labels: - kops.k8s.io/cluster: ${K8S_NAME} - name: nodes -spec: - image: kope.io/k8s-1.11-debian-stretch-amd64-hvm-ebs-2018-08-17 - machineType: ${K8S_NODE_INSTANCE_TYPE} - rootVolumeSize: ${K8S_NODE_VOLUME_SIZE} - maxSize: ${K8S_NODES_MAX_COUNT} - minSize: ${K8S_NODES_MIN_COUNT} - nodeLabels: - kops.k8s.io/instancegroup: nodes - role: Node - subnets: - - ${K8S_ZONE} -EOM -} - -if [ "$1" = "start" ]; then - create_kops_config - kops create -f $ROOT/dev/config/.k8s_kops_config.yaml --state=s3://$K8S_KOPS_BUCKET - kops create secret --state=s3://$K8S_KOPS_BUCKET --name=$K8S_NAME sshpublickey admin -i ~/.ssh/kops.pub - kops update cluster --yes --name=$K8S_NAME --state=s3://$K8S_KOPS_BUCKET - kops_set_cluster - until kops validate cluster --name=$K8S_NAME --state=s3://$K8S_KOPS_BUCKET; do - sleep 10 - done - -elif [ "$1" = "update" ]; then - echo "Not implemented" - -elif [ "$1" = "stop" ]; then - kops delete cluster --yes --name=$K8S_NAME --state=s3://$K8S_KOPS_BUCKET - aws s3 rm s3://$K8S_KOPS_BUCKET/$K8S_NAME --recursive - -elif [ "$1" = "set" ]; then - kops_set_cluster -fi diff --git a/dev/registry.sh b/dev/registry.sh index dcc11dd4ab..1e38ba314a 100755 --- a/dev/registry.sh +++ b/dev/registry.sh @@ -51,6 +51,7 @@ function create_registry() { aws ecr create-repository --repository-name=cortexlabs/tf-train-gpu --region=$REGISTRY_REGION || true aws ecr create-repository --repository-name=cortexlabs/tf-serve-gpu --region=$REGISTRY_REGION || true aws ecr create-repository --repository-name=cortexlabs/cluster-autoscaler --region=$REGISTRY_REGION || true + aws ecr create-repository --repository-name=cortexlabs/nvidia --region=$REGISTRY_REGION || true aws ecr create-repository --repository-name=cortexlabs/metrics-server --region=$REGISTRY_REGION || true } @@ -139,6 +140,7 @@ elif [ "$cmd" = "update" ]; then build_and_push $ROOT/images/tf-serve-gpu 
tf-serve-gpu latest build_and_push $ROOT/images/python-packager python-packager latest build_and_push $ROOT/images/cluster-autoscaler cluster-autoscaler latest + build_and_push $ROOT/images/nvidia nvidia latest build_and_push $ROOT/images/metrics-server metrics-server latest fi diff --git a/docs/apis/compute.md b/docs/apis/compute.md index 8174b3f875..e77edf20af 100644 --- a/docs/apis/compute.md +++ b/docs/apis/compute.md @@ -25,4 +25,6 @@ One unit of memory is one byte. Memory can be expressed as an integer or by usin ## GPU -One unit of GPU corresponds to one virtual GPU on AWS. Fractional requests are not allowed. Here's some information on [adding GPU enabled nodes on EKS](https://docs.aws.amazon.com/en_ca/eks/latest/userguide/gpu-ami.html). +1. Please make sure your AWS account is subscribed to the [EKS-optimized AMI with GPU Support](https://aws.amazon.com/marketplace/pp/B07GRHFXGM). +2. Set CORTEX_NODE_TYPE to an AWS GPU instance (e.g. p2.xlarge) before installing Cortex. +3. Note that one unit of GPU corresponds to one virtual GPU on AWS. Fractional requests are not allowed. diff --git a/docs/cluster/config.md b/docs/cluster/config.md index 68f1e61c56..40fd396415 100644 --- a/docs/cluster/config.md +++ b/docs/cluster/config.md @@ -52,6 +52,7 @@ export CORTEX_IMAGE_TF_TRAIN_GPU="cortexlabs/tf-train-gpu:master" export CORTEX_IMAGE_TF_SERVE_GPU="cortexlabs/tf-serve-gpu:master" export CORTEX_IMAGE_PYTHON_PACKAGER="cortexlabs/python-packager:master" export CORTEX_IMAGE_CLUSTER_AUTOSCALER="cortexlabs/cluster-autoscaler:master" +export CORTEX_IMAGE_NVIDIA="cortexlabs/nvidia:master" export CORTEX_IMAGE_METRICS_SERVER="cortexlabs/metrics-server:master" # Flag to enable collecting error reports and usage stats. If flag is not set to either "true" or "false", you will be prompted. 
diff --git a/docs/cluster/development.md b/docs/cluster/development.md index 4d7b4a212e..85e6cf2030 100644 --- a/docs/cluster/development.md +++ b/docs/cluster/development.md @@ -86,6 +86,7 @@ export CORTEX_IMAGE_TF_TRAIN_GPU="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/corte export CORTEX_IMAGE_TF_TRANSFORM="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/tf-transform:latest" export CORTEX_IMAGE_PYTHON_PACKAGER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/python-packager:latest" export CORTEX_IMAGE_CLUSTER_AUTOSCALER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/cluster-autoscaler:latest" +export CORTEX_IMAGE_NVIDIA="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/nvidia:latest" export CORTEX_IMAGE_METRICS_SERVER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/metrics-server:latest" export AWS_ACCESS_KEY_ID="XXXXXX" diff --git a/images/nvidia/Dockerfile b/images/nvidia/Dockerfile new file mode 100644 index 0000000000..e43cb27d7a --- /dev/null +++ b/images/nvidia/Dockerfile @@ -0,0 +1 @@ +FROM nvidia/k8s-device-plugin:1.0.0-beta diff --git a/manager/install_cortex.sh b/manager/install_cortex.sh index f9d97ada9f..f16400a3c8 100755 --- a/manager/install_cortex.sh +++ b/manager/install_cortex.sh @@ -169,5 +169,6 @@ envsubst < manifests/fluentd.yaml | kubectl apply -f - >/dev/null envsubst < manifests/operator.yaml | kubectl apply -f - >/dev/null envsubst < manifests/cluster-autoscaler.yaml | kubectl apply -f - >/dev/null envsubst < manifests/metrics-server.yaml | kubectl apply -f - >/dev/null +envsubst < manifests/nvidia.yaml | kubectl apply -f - >/dev/null validate_cortex diff --git a/manager/install_eks.sh b/manager/install_eks.sh index ed1ccad97c..4ba799f8fb 100755 --- a/manager/install_eks.sh +++ b/manager/install_eks.sh @@ -24,7 +24,8 @@ eksctl create cluster --name=$CORTEX_CLUSTER \ --node-type=$CORTEX_NODE_TYPE \ --nodes-min=$CORTEX_NODES_MIN \ --nodes-max=$CORTEX_NODES_MAX \ - --version=1.13 \ + --node-ami=auto \ + --version=1.11 \ 
--asg-access echo -e "\n✓ Spun up the cluster" diff --git a/manager/manifests/nvidia.yaml b/manager/manifests/nvidia.yaml new file mode 100644 index 0000000000..55bf7a59bd --- /dev/null +++ b/manager/manifests/nvidia.yaml @@ -0,0 +1,56 @@ +# Copyright 2019 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Source: https://github.com/NVIDIA/k8s-device-plugin/blob/1.0.0-beta/nvidia-device-plugin.yml + +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + name: nvidia-device-plugin-daemonset + namespace: kube-system +spec: + updateStrategy: + type: RollingUpdate + template: + metadata: + # Mark this pod as a critical add-on; when enabled, the critical add-on scheduler + # reserves resources for critical add-on pods so that they can be rescheduled after + # a failure. This annotation works in tandem with the toleration below. + annotations: + scheduler.alpha.kubernetes.io/critical-pod: "" + labels: + name: nvidia-device-plugin-ds + spec: + tolerations: + # Allow this pod to be rescheduled while the node is in "critical add-ons only" mode. + # This, along with the annotation above marks this pod as a critical add-on. 
+ - key: CriticalAddonsOnly + operator: Exists + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + containers: + - image: $CORTEX_IMAGE_NVIDIA + name: nvidia-device-plugin-ctr + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins diff --git a/manager/uninstall_operator.sh b/manager/uninstall_operator.sh index a059e41001..9f70194607 100755 --- a/manager/uninstall_operator.sh +++ b/manager/uninstall_operator.sh @@ -23,6 +23,9 @@ eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER --region=$CORTEX_REGION | g echo -e "\nUninstalling the Cortex operator ..." kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true deployment operator >/dev/null 2>&1 -kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true daemonset fluentd >/dev/null 2>&1 # Pods in DaemonSets cannot be modified + +# Pods in DaemonSets cannot be modified +kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true daemonset fluentd >/dev/null 2>&1 +kubectl -n=kube-system delete --ignore-not-found=true daemonset nvidia-device-plugin-daemonset >/dev/null 2>&1 # manifests/nvidia.yaml creates this DaemonSet in kube-system, not $CORTEX_NAMESPACE echo "✓ Uninstalled the Cortex operator"