From d3b6f5a649042d9f9142a73501f042b1dd46b55b Mon Sep 17 00:00:00 2001
From: Omer Spillinger <ospillinger@users.noreply.github.com>
Date: Wed, 3 Jul 2019 15:18:26 -0700
Subject: [PATCH] Add GPU support

---
 Makefile                      |   2 +
 cortex.sh                     |   2 +
 dev/eks.sh                    |  45 ----------
 dev/kops.sh                   | 163 ----------------------------------
 dev/registry.sh               |   2 +
 docs/apis/compute.md          |   4 +-
 docs/cluster/config.md        |   1 +
 docs/cluster/development.md   |   1 +
 images/nvidia/Dockerfile      |   1 +
 manager/install_cortex.sh     |   1 +
 manager/install_eks.sh        |   3 +-
 manager/manifests/nvidia.yaml |  56 ++++++++++++
 manager/uninstall_operator.sh |   5 +-
 13 files changed, 75 insertions(+), 211 deletions(-)
 delete mode 100755 dev/eks.sh
 delete mode 100755 dev/kops.sh
 create mode 100644 images/nvidia/Dockerfile
 create mode 100644 manager/manifests/nvidia.yaml

diff --git a/Makefile b/Makefile
index 324b3e0bea..2b5df8f02f 100644
--- a/Makefile
+++ b/Makefile
@@ -136,6 +136,7 @@ ci-build-images:
 	@./build/build-image.sh images/argo-executor argo-executor
 	@./build/build-image.sh images/python-packager python-packager
 	@./build/build-image.sh images/cluster-autoscaler cluster-autoscaler
+	@./build/build-image.sh images/nvidia nvidia
 	@./build/build-image.sh images/metrics-server metrics-server
 
 ci-push-images:
@@ -155,6 +156,7 @@ ci-push-images:
 	@./build/push-image.sh argo-executor
 	@./build/push-image.sh python-packager
 	@./build/push-image.sh cluster-autoscaler
+	@./build/push-image.sh nvidia
 	@./build/push-image.sh metrics-server
 
 
diff --git a/cortex.sh b/cortex.sh
index 44ab0a4070..ba73b0bc45 100755
--- a/cortex.sh
+++ b/cortex.sh
@@ -131,6 +131,7 @@ export CORTEX_IMAGE_PYTHON_PACKAGER="${CORTEX_IMAGE_PYTHON_PACKAGER:-cortexlabs/
 export CORTEX_IMAGE_TF_SERVE_GPU="${CORTEX_IMAGE_TF_SERVE_GPU:-cortexlabs/tf-serve-gpu:$CORTEX_VERSION_STABLE}"
 export CORTEX_IMAGE_TF_TRAIN_GPU="${CORTEX_IMAGE_TF_TRAIN_GPU:-cortexlabs/tf-train-gpu:$CORTEX_VERSION_STABLE}"
 export CORTEX_IMAGE_CLUSTER_AUTOSCALER="${CORTEX_IMAGE_CLUSTER_AUTOSCALER:-cortexlabs/cluster-autoscaler:$CORTEX_VERSION_STABLE}"
+export CORTEX_IMAGE_NVIDIA="${CORTEX_IMAGE_NVIDIA:-cortexlabs/nvidia:$CORTEX_VERSION_STABLE}"
 export CORTEX_IMAGE_METRICS_SERVER="${CORTEX_IMAGE_METRICS_SERVER:-cortexlabs/metrics-server:$CORTEX_VERSION_STABLE}"
 
 export CORTEX_ENABLE_TELEMETRY="${CORTEX_ENABLE_TELEMETRY:-""}"
@@ -182,6 +183,7 @@ function install_cortex() {
     -e CORTEX_IMAGE_TF_SERVE_GPU=$CORTEX_IMAGE_TF_SERVE_GPU \
     -e CORTEX_IMAGE_TF_TRAIN_GPU=$CORTEX_IMAGE_TF_TRAIN_GPU \
     -e CORTEX_IMAGE_CLUSTER_AUTOSCALER=$CORTEX_IMAGE_CLUSTER_AUTOSCALER \
+    -e CORTEX_IMAGE_NVIDIA=$CORTEX_IMAGE_NVIDIA \
     -e CORTEX_IMAGE_METRICS_SERVER=$CORTEX_IMAGE_METRICS_SERVER \
     -e CORTEX_ENABLE_TELEMETRY=$CORTEX_ENABLE_TELEMETRY \
     $CORTEX_IMAGE_MANAGER
diff --git a/dev/eks.sh b/dev/eks.sh
deleted file mode 100755
index 0509e86f13..0000000000
--- a/dev/eks.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 Cortex Labs, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-set -euo pipefail
-
-ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. >/dev/null && pwd)"
-source $ROOT/dev/config/k8s.sh
-
-function eks_set_cluster() {
-  eksctl utils write-kubeconfig --name=$K8S_NAME
-  kubectl config set-context $(kubectl config current-context) --namespace="cortex"
-}
-
-if [ "$1" = "start" ]; then
-  eksctl create cluster --version=1.11 --name=$K8S_NAME  --region $K8S_REGION --nodes-max $K8S_NODES_MAX_COUNT --nodes-min $K8S_NODES_MIN_COUNT --node-type=$K8S_NODE_INSTANCE_TYPE
-  if [ $K8S_GPU_NODES_MIN_COUNT -gt 0 ] || [ $K8S_GPU_NODES_MAX_COUNT -gt 0 ]; then
-    eksctl create nodegroup --version=1.11 --cluster=$K8S_NAME --nodes-max=$K8S_GPU_NODES_MAX_COUNT --nodes-min=$K8S_GPU_NODES_MIN_COUNT  --node-type=$K8S_GPU_NODE_INSTANCE_TYPE --node-ami=$K8S_GPU_NODE_AMI
-    echo "Once the GPU nodegroup joins the cluster, run:"
-    echo "kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v1.11/nvidia-device-plugin.yml"
-  fi
-  eks_set_cluster
-
-elif [ "$1" = "update" ]; then
-  echo "Not implemented"
-
-elif [ "$1" = "stop" ]; then
-  eksctl delete cluster --name=$K8S_NAME
-
-elif [ "$1" = "set" ]; then
-  eks_set_cluster
-fi
diff --git a/dev/kops.sh b/dev/kops.sh
deleted file mode 100755
index 79b24f2b57..0000000000
--- a/dev/kops.sh
+++ /dev/null
@@ -1,163 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 Cortex Labs, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-set -euo pipefail
-
-ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. >/dev/null && pwd)"
-
-source $ROOT/dev/config/k8s.sh
-export K8S_NAME="${K8S_NAME}.k8s.local"
-
-function kops_set_cluster() {
-  kops export kubecfg --name=$K8S_NAME --state="s3://$K8S_KOPS_BUCKET"
-  kubectl config set-context $(kubectl config current-context) --namespace="cortex"
-}
-
-function create_kops_config() {
-  cat > $ROOT/dev/config/.k8s_kops_config.yaml << EOM
-apiVersion: kops/v1alpha2
-kind: Cluster
-metadata:
-  creationTimestamp: null
-  name: ${K8S_NAME}
-spec:
-  api:
-    loadBalancer:
-      type: Public
-  authorization:
-    rbac: {}
-  channel: stable
-  cloudProvider: aws
-  configBase: s3://${K8S_KOPS_BUCKET}/kops/${K8S_NAME}
-  etcdClusters:
-  - etcdMembers:
-    - instanceGroup: master-${K8S_ZONE}
-      name: a
-    name: main
-  - etcdMembers:
-    - instanceGroup: master-${K8S_ZONE}
-      name: a
-    name: events
-  iam:
-    allowContainerRegistry: true
-    legacy: false
-  kubernetesApiAccess:
-  - 0.0.0.0/0
-  kubernetesVersion: 1.11.6
-  masterPublicName: api.${K8S_NAME}
-  networkCIDR: 172.20.0.0/16
-  networking:
-    kubenet: {}
-  nonMasqueradeCIDR: 100.64.0.0/10
-  sshAccess:
-  - 0.0.0.0/0
-  subnets:
-  - cidr: 172.20.32.0/19
-    name: ${K8S_ZONE}
-    type: Public
-    zone: ${K8S_ZONE}
-  topology:
-    dns:
-      type: Public
-    masters: public
-    nodes: public
-  additionalPolicies:
-    node: |
-      [
-        {
-          "Effect": "Allow",
-          "Action": "s3:*",
-          "Resource": "*"
-        },
-        {
-          "Effect": "Allow",
-          "Action": "logs:DescribeLogGroups",
-          "Resource": "*"
-        },
-        {
-          "Effect": "Allow",
-          "Action": "logs:*",
-          "Resource": [
-            "arn:aws:logs:${K8S_REGION}:*:log-group:*:*:*"
-          ]
-        }
-      ]
-
----
-
-apiVersion: kops/v1alpha2
-kind: InstanceGroup
-metadata:
-  creationTimestamp: null
-  labels:
-    kops.k8s.io/cluster: ${K8S_NAME}
-  name: master-${K8S_ZONE}
-spec:
-  image: kope.io/k8s-1.11-debian-stretch-amd64-hvm-ebs-2018-08-17
-  machineType: ${K8S_MASTER_INSTANCE_TYPE}
-  rootVolumeSize: ${K8S_MASTER_VOLUME_SIZE}
-  maxSize: 1
-  minSize: 1
-  nodeLabels:
-    kops.k8s.io/instancegroup: master-${K8S_ZONE}
-  role: Master
-  subnets:
-  - ${K8S_ZONE}
-
----
-
-apiVersion: kops/v1alpha2
-kind: InstanceGroup
-metadata:
-  creationTimestamp: null
-  labels:
-    kops.k8s.io/cluster: ${K8S_NAME}
-  name: nodes
-spec:
-  image: kope.io/k8s-1.11-debian-stretch-amd64-hvm-ebs-2018-08-17
-  machineType: ${K8S_NODE_INSTANCE_TYPE}
-  rootVolumeSize: ${K8S_NODE_VOLUME_SIZE}
-  maxSize: ${K8S_NODES_MAX_COUNT}
-  minSize: ${K8S_NODES_MIN_COUNT}
-  nodeLabels:
-    kops.k8s.io/instancegroup: nodes
-  role: Node
-  subnets:
-  - ${K8S_ZONE}
-EOM
-}
-
-if [ "$1" = "start" ]; then
-  create_kops_config
-  kops create -f $ROOT/dev/config/.k8s_kops_config.yaml --state=s3://$K8S_KOPS_BUCKET
-  kops create secret --state=s3://$K8S_KOPS_BUCKET --name=$K8S_NAME sshpublickey admin -i ~/.ssh/kops.pub
-  kops update cluster --yes --name=$K8S_NAME --state=s3://$K8S_KOPS_BUCKET
-  kops_set_cluster
-  until kops validate cluster --name=$K8S_NAME --state=s3://$K8S_KOPS_BUCKET; do
-    sleep 10
-  done
-
-elif [ "$1" = "update" ]; then
-  echo "Not implemented"
-
-elif [ "$1" = "stop" ]; then
-  kops delete cluster --yes --name=$K8S_NAME --state=s3://$K8S_KOPS_BUCKET
-  aws s3 rm s3://$K8S_KOPS_BUCKET/$K8S_NAME --recursive
-
-elif [ "$1" = "set" ]; then
-  kops_set_cluster
-fi
diff --git a/dev/registry.sh b/dev/registry.sh
index dcc11dd4ab..1e38ba314a 100755
--- a/dev/registry.sh
+++ b/dev/registry.sh
@@ -51,6 +51,7 @@ function create_registry() {
   aws ecr create-repository --repository-name=cortexlabs/tf-train-gpu --region=$REGISTRY_REGION || true
   aws ecr create-repository --repository-name=cortexlabs/tf-serve-gpu --region=$REGISTRY_REGION || true
   aws ecr create-repository --repository-name=cortexlabs/cluster-autoscaler --region=$REGISTRY_REGION || true
+  aws ecr create-repository --repository-name=cortexlabs/nvidia --region=$REGISTRY_REGION || true
   aws ecr create-repository --repository-name=cortexlabs/metrics-server --region=$REGISTRY_REGION || true
 }
 
@@ -139,6 +140,7 @@ elif [ "$cmd" = "update" ]; then
     build_and_push $ROOT/images/tf-serve-gpu tf-serve-gpu latest
     build_and_push $ROOT/images/python-packager python-packager latest
     build_and_push $ROOT/images/cluster-autoscaler cluster-autoscaler latest
+    build_and_push $ROOT/images/nvidia nvidia latest
     build_and_push $ROOT/images/metrics-server metrics-server latest
   fi
 
diff --git a/docs/apis/compute.md b/docs/apis/compute.md
index 8174b3f875..e77edf20af 100644
--- a/docs/apis/compute.md
+++ b/docs/apis/compute.md
@@ -25,4 +25,6 @@ One unit of memory is one byte. Memory can be expressed as an integer or by usin
 
 ## GPU
 
-One unit of GPU corresponds to one virtual GPU on AWS. Fractional requests are not allowed. Here's some information on [adding GPU enabled nodes on EKS](https://docs.aws.amazon.com/en_ca/eks/latest/userguide/gpu-ami.html).
+1. Please make sure your AWS account is subscribed to the [EKS-optimized AMI with GPU Support](https://aws.amazon.com/marketplace/pp/B07GRHFXGM).
+2. Set `CORTEX_NODE_TYPE` to an AWS GPU instance type (e.g. `p2.xlarge`) before installing Cortex.
+3. Note that one unit of GPU corresponds to one virtual GPU on AWS. Fractional requests are not allowed.
diff --git a/docs/cluster/config.md b/docs/cluster/config.md
index 68f1e61c56..40fd396415 100644
--- a/docs/cluster/config.md
+++ b/docs/cluster/config.md
@@ -52,6 +52,7 @@ export CORTEX_IMAGE_TF_TRAIN_GPU="cortexlabs/tf-train-gpu:master"
 export CORTEX_IMAGE_TF_SERVE_GPU="cortexlabs/tf-serve-gpu:master"
 export CORTEX_IMAGE_PYTHON_PACKAGER="cortexlabs/python-packager:master"
 export CORTEX_IMAGE_CLUSTER_AUTOSCALER="cortexlabs/cluster-autoscaler:master"
+export CORTEX_IMAGE_NVIDIA="cortexlabs/nvidia:master"
 export CORTEX_IMAGE_METRICS_SERVER="cortexlabs/metrics-server:master"
 
 # Flag to enable collecting error reports and usage stats. If flag is not set to either "true" or "false", you will be prompted.
diff --git a/docs/cluster/development.md b/docs/cluster/development.md
index 4d7b4a212e..85e6cf2030 100644
--- a/docs/cluster/development.md
+++ b/docs/cluster/development.md
@@ -86,6 +86,7 @@ export CORTEX_IMAGE_TF_TRAIN_GPU="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/corte
 export CORTEX_IMAGE_TF_TRANSFORM="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/tf-transform:latest"
 export CORTEX_IMAGE_PYTHON_PACKAGER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/python-packager:latest"
 export CORTEX_IMAGE_CLUSTER_AUTOSCALER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/cluster-autoscaler:latest"
+export CORTEX_IMAGE_NVIDIA="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/nvidia:latest"
 export CORTEX_IMAGE_METRICS_SERVER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/metrics-server:latest"
 
 export AWS_ACCESS_KEY_ID="XXXXXX"
diff --git a/images/nvidia/Dockerfile b/images/nvidia/Dockerfile
new file mode 100644
index 0000000000..e43cb27d7a
--- /dev/null
+++ b/images/nvidia/Dockerfile
@@ -0,0 +1 @@
+FROM nvidia/k8s-device-plugin:1.0.0-beta
diff --git a/manager/install_cortex.sh b/manager/install_cortex.sh
index f9d97ada9f..f16400a3c8 100755
--- a/manager/install_cortex.sh
+++ b/manager/install_cortex.sh
@@ -169,5 +169,6 @@ envsubst < manifests/fluentd.yaml | kubectl apply -f - >/dev/null
 envsubst < manifests/operator.yaml | kubectl apply -f - >/dev/null
 envsubst < manifests/cluster-autoscaler.yaml | kubectl apply -f - >/dev/null
 envsubst < manifests/metrics-server.yaml | kubectl apply -f - >/dev/null
+envsubst < manifests/nvidia.yaml | kubectl apply -f - >/dev/null
 
 validate_cortex
diff --git a/manager/install_eks.sh b/manager/install_eks.sh
index ed1ccad97c..4ba799f8fb 100755
--- a/manager/install_eks.sh
+++ b/manager/install_eks.sh
@@ -24,7 +24,8 @@ eksctl create cluster --name=$CORTEX_CLUSTER \
                       --node-type=$CORTEX_NODE_TYPE \
                       --nodes-min=$CORTEX_NODES_MIN \
                       --nodes-max=$CORTEX_NODES_MAX \
-                      --version=1.13 \
+                      --node-ami=auto \
+                      --version=1.11 \
                       --asg-access
 
 echo -e "\n✓ Spun up the cluster"
diff --git a/manager/manifests/nvidia.yaml b/manager/manifests/nvidia.yaml
new file mode 100644
index 0000000000..55bf7a59bd
--- /dev/null
+++ b/manager/manifests/nvidia.yaml
@@ -0,0 +1,62 @@
+# Copyright 2019 Cortex Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Source: https://github.com/NVIDIA/k8s-device-plugin/blob/1.0.0-beta/nvidia-device-plugin.yml
+# Changed from upstream: uses apps/v1 instead of extensions/v1beta1 (the latter no longer
+# serves DaemonSets as of Kubernetes 1.16); apps/v1 requires an explicit spec.selector.
+
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nvidia-device-plugin-daemonset
+  namespace: kube-system
+spec:
+  # Required by apps/v1; must match the pod template labels below.
+  selector:
+    matchLabels:
+      name: nvidia-device-plugin-ds
+  updateStrategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      # Mark this pod as a critical add-on; when enabled, the critical add-on scheduler
+      # reserves resources for critical add-on pods so that they can be rescheduled after
+      # a failure.  This annotation works in tandem with the toleration below.
+      annotations:
+        scheduler.alpha.kubernetes.io/critical-pod: ""
+      labels:
+        name: nvidia-device-plugin-ds
+    spec:
+      tolerations:
+      # Allow this pod to be rescheduled while the node is in "critical add-ons only" mode.
+      # This, along with the annotation above marks this pod as a critical add-on.
+      - key: CriticalAddonsOnly
+        operator: Exists
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+      containers:
+      - image: $CORTEX_IMAGE_NVIDIA
+        name: nvidia-device-plugin-ctr
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop: ["ALL"]
+        volumeMounts:
+          - name: device-plugin
+            mountPath: /var/lib/kubelet/device-plugins
+      volumes:
+        - name: device-plugin
+          hostPath:
+            path: /var/lib/kubelet/device-plugins
diff --git a/manager/uninstall_operator.sh b/manager/uninstall_operator.sh
index a059e41001..9f70194607 100755
--- a/manager/uninstall_operator.sh
+++ b/manager/uninstall_operator.sh
@@ -23,6 +23,9 @@ eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER --region=$CORTEX_REGION | g
 echo -e "\nUninstalling the Cortex operator ..."
 
 kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true deployment operator >/dev/null 2>&1
-kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true daemonset fluentd >/dev/null 2>&1  # Pods in DaemonSets cannot be modified
+
+# Pods in DaemonSets cannot be modified, so the DaemonSets themselves are deleted
+kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true daemonset fluentd >/dev/null 2>&1
+kubectl -n=kube-system delete --ignore-not-found=true daemonset nvidia-device-plugin-daemonset >/dev/null 2>&1  # manifests/nvidia.yaml installs it into kube-system
 
 echo "✓ Uninstalled the Cortex operator"