From c74baf29006bc510c0b41a23549a33b03e5c593d Mon Sep 17 00:00:00 2001 From: vishal Date: Mon, 7 Oct 2019 09:56:15 -0700 Subject: [PATCH 01/24] NodeGroup spot instances --- dev-cluster.yaml | 83 ++++++++++++++++++++++++++ images/cluster-autoscaler/Dockerfile | 2 +- manager/install_cortex.sh | 10 ++-- manager/manifests/fluentd.yaml | 7 +++ manager/manifests/nvidia.yaml | 3 + manager/manifests/statsd.yaml | 9 +++ pkg/lib/k8s/pod.go | 11 ++++ pkg/operator/workloads/api_workload.go | 10 ++++ pkg/operator/workloads/workflow.go | 30 +++++----- 9 files changed, 144 insertions(+), 21 deletions(-) create mode 100644 dev-cluster.yaml create mode 100644 dev-cluster.yaml diff --git a/dev-cluster.yaml b/dev-cluster.yaml new file mode 100644 index 0000000000..814134822a --- /dev/null +++ b/dev-cluster.yaml @@ -0,0 +1,83 @@ +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: cortex + region: us-west-2 + version: "1.14" + +# nodeGroups: +# # spot workers NG - multi AZ, scale from 3 +# - name: ng-4 +# ami: auto +# instanceType: mixed +# minSize: 1 +# maxSize: 5 +# volumeSize: 100 +# volumeType: gp2 +# volumeEncrypted: true +# iam: +# withAddonPolicies: +# autoScaler: true +# instancesDistribution: +# instanceTypes: [t3.medium, t3.large] +# onDemandPercentageAboveBaseCapacity: 0 +# spotInstancePools: 2 +# taints: +# workload: "true:NoSchedule" +# tags: +# k8s.io/cluster-autoscaler/enabled: 'true' +# k8s.io/cluster-autoscaler/node-template/taint/dedicated: workload=true +# k8s.io/cluster-autoscaler/node-template/label/workload: 'true' +# k8s.io/cluster-autoscaler/node-template/label/lifecycle: 'Ec2Spot' +# labels: +# lifecycle: Ec2Spot +# kubeletExtraConfig: +# kubeReserved: +# cpu: 150m +# memory: 300Mi +# ephemeral-storage: 1Gi +# kubeReservedCgroup: /kube-reserved +# systemReserved: +# cpu: 150m +# memory: 300Mi +# ephemeral-storage: 1Gi +# evictionHard: +# memory.available: 200Mi +# nodefs.available: 5% + + +# nodeGroups: +# - name: spot-ng +# ami: auto +# instanceType: mixed +# desiredCapacity: 0 +# minSize: 0 +# maxSize: 2 +# volumeSize: 100 +# volumeType: gp2 +# volumeEncrypted: true +# instancesDistribution: +# instanceTypes: [p2.xlarge, p2.8xlarge] +# iam: +# withAddonPolicies: +# autoScaler: true +# # tags: +# # k8s.io/cluster-autoscaler/node-template/taint/dedicated: nvidia.com/gpu=true +# # k8s.io/cluster-autoscaler/node-template/label/nvidia.com/gpu: 'true' +# # k8s.io/cluster-autoscaler/enabled: 'true' +# kubeletExtraConfig: +# kubeReserved: +# cpu: 150m +# memory: 300Mi +# ephemeral-storage: 1Gi +# kubeReservedCgroup: /kube-reserved +# systemReserved: +# cpu: 150m +# memory: 300Mi +# ephemeral-storage: 1Gi +# evictionHard: +# memory.available: 200Mi +# nodefs.available: 5% +# taints: +# nvidia.com/gpu: "true:NoSchedule" \ No newline at end of file diff --git a/images/cluster-autoscaler/Dockerfile b/images/cluster-autoscaler/Dockerfile index 27b9b571c6..f73836359b 100644 --- a/images/cluster-autoscaler/Dockerfile +++ b/images/cluster-autoscaler/Dockerfile @@ -1 +1 @@ -FROM k8s.gcr.io/cluster-autoscaler:v1.12.3 +FROM gcr.io/google-containers/cluster-autoscaler:v1.14.5 diff --git a/manager/install_cortex.sh b/manager/install_cortex.sh index 0324330b4a..e884fab9b2 100755 --- a/manager/install_cortex.sh +++ b/manager/install_cortex.sh @@ -199,10 +199,12 @@ envsubst < manifests/metrics-server.yaml | kubectl apply -f - >/dev/null envsubst < manifests/statsd.yaml | kubectl apply -f - >/dev/null echo "✓ Configured metrics" -if [[ "$CORTEX_NODE_TYPE" == p* ]] || [[ "$CORTEX_NODE_TYPE" == g* ]]; then - envsubst <
manifests/nvidia.yaml | kubectl apply -f - >/dev/null - echo "✓ Configured GPU support" -fi +envsubst < manifests/nvidia.yaml | kubectl apply -f - >/dev/null +echo "✓ Configured GPU support" + +# if [[ "$CORTEX_NODE_TYPE" == p* ]] || [[ "$CORTEX_NODE_TYPE" == g* ]]; then + +# fi envsubst < manifests/operator.yaml | kubectl apply -f - >/dev/null echo "✓ Started operator" diff --git a/manager/manifests/fluentd.yaml b/manager/manifests/fluentd.yaml index 6b4844dea0..16b341199f 100644 --- a/manager/manifests/fluentd.yaml +++ b/manager/manifests/fluentd.yaml @@ -157,6 +157,13 @@ spec: readOnly: true - name: config mountPath: /fluentd/etc + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + - key: workload + operator: Exists + effect: NoSchedule terminationGracePeriodSeconds: 30 volumes: - name: varlog diff --git a/manager/manifests/nvidia.yaml b/manager/manifests/nvidia.yaml index 55bf7a59bd..aede5fdbf0 100644 --- a/manager/manifests/nvidia.yaml +++ b/manager/manifests/nvidia.yaml @@ -40,6 +40,9 @@ spec: - key: nvidia.com/gpu operator: Exists effect: NoSchedule + - key: workload + operator: Exists + effect: NoSchedule containers: - image: $CORTEX_IMAGE_NVIDIA name: nvidia-device-plugin-ctr diff --git a/manager/manifests/statsd.yaml b/manager/manifests/statsd.yaml index 555a275387..2b9b34190e 100644 --- a/manager/manifests/statsd.yaml +++ b/manager/manifests/statsd.yaml @@ -93,8 +93,17 @@ spec: volumeMounts: - name: cwagentconfig mountPath: /etc/cwagentconfig + nodeSelector: + lifecycle: "Ec2Spot" volumes: - name: cwagentconfig configMap: name: cwagentstatsdconfig terminationGracePeriodSeconds: 60 + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + - key: workload + operator: Exists + effect: NoSchedule diff --git a/pkg/lib/k8s/pod.go b/pkg/lib/k8s/pod.go index af92ab440d..c9af5c66a6 100644 --- a/pkg/lib/k8s/pod.go +++ b/pkg/lib/k8s/pod.go @@ -333,3 +333,14 @@ func (c *Client) StalledPods() ([]kcore.Pod, error) { return stalledPods, nil } + +func Tolerations() []kcore.Toleration { + return []kcore.Toleration{ + { + Key: "workload", + Operator: kcore.TolerationOpEqual, + Value: "true", + Effect: kcore.TaintEffectNoSchedule, + }, + } +} diff --git a/pkg/operator/workloads/api_workload.go b/pkg/operator/workloads/api_workload.go index e229959908..7605b4023c 100644 --- a/pkg/operator/workloads/api_workload.go +++ b/pkg/operator/workloads/api_workload.go @@ -246,6 +246,7 @@ func tfAPISpec( apiResourceList := kcore.ResourceList{} tfServingResourceList := kcore.ResourceList{} tfServingLimitsList := kcore.ResourceList{} + tolerations := k8s.Tolerations() q1, q2 := api.Compute.CPU.SplitInTwo() apiResourceList[kcore.ResourceCPU] = *q1 @@ -412,6 +413,10 @@ func tfAPISpec( }, }, }, + NodeSelector: map[string]string{ + "lifecycle": "Ec2Spot", + }, + Tolerations: tolerations, Volumes: k8s.DefaultVolumes(), ServiceAccountName: "default", }, @@ -429,6 +434,7 @@ func onnxAPISpec( servingImage := config.Cortex.ONNXServeImage resourceList := kcore.ResourceList{} resourceLimitsList := kcore.ResourceList{} + tolerations := k8s.Tolerations() resourceList[kcore.ResourceCPU] = api.Compute.CPU.Quantity if api.Compute.Mem != nil { @@ -553,6 +559,10 @@ func onnxAPISpec( }, }, }, + NodeSelector: map[string]string{ + "lifecycle": "Ec2Spot", + }, + Tolerations: tolerations, Volumes: k8s.DefaultVolumes(), ServiceAccountName: "default", }, diff --git a/pkg/operator/workloads/workflow.go b/pkg/operator/workloads/workflow.go index 66211fe1dd..c16072b7e7 100644 --- 
a/pkg/operator/workloads/workflow.go +++ b/pkg/operator/workloads/workflow.go @@ -17,7 +17,6 @@ limitations under the License. package workloads import ( - "fmt" "path/filepath" kresource "k8s.io/apimachinery/pkg/api/resource" @@ -27,7 +26,6 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/sets/strset" "github.com/cortexlabs/cortex/pkg/operator/api/context" "github.com/cortexlabs/cortex/pkg/operator/api/resource" - "github.com/cortexlabs/cortex/pkg/operator/api/userconfig" "github.com/cortexlabs/cortex/pkg/operator/config" ) @@ -327,19 +325,19 @@ func ValidateDeploy(ctx *context.Context) error { } } - for _, api := range ctx.APIs { - if maxCPU.Cmp(api.Compute.CPU.Quantity) < 0 { - return errors.Wrap(ErrorNoAvailableNodeComputeLimit("CPU", api.Compute.CPU.String(), maxCPU.String()), userconfig.Identify(api)) - } - if api.Compute.Mem != nil { - if maxMem.Cmp(api.Compute.Mem.Quantity) < 0 { - return errors.Wrap(ErrorNoAvailableNodeComputeLimit("Memory", api.Compute.Mem.String(), maxMem.String()), userconfig.Identify(api)) - } - } - gpu := api.Compute.GPU - if gpu > maxGPU { - return errors.Wrap(ErrorNoAvailableNodeComputeLimit("GPU", fmt.Sprintf("%d", gpu), fmt.Sprintf("%d", maxGPU)), userconfig.Identify(api)) - } - } + // for _, api := range ctx.APIs { + // if maxCPU.Cmp(api.Compute.CPU.Quantity) < 0 { + // return errors.Wrap(ErrorNoAvailableNodeComputeLimit("CPU", api.Compute.CPU.String(), maxCPU.String()), userconfig.Identify(api)) + // } + // if api.Compute.Mem != nil { + // if maxMem.Cmp(api.Compute.Mem.Quantity) < 0 { + // return errors.Wrap(ErrorNoAvailableNodeComputeLimit("Memory", api.Compute.Mem.String(), maxMem.String()), userconfig.Identify(api)) + // } + // } + // gpu := api.Compute.GPU + // if gpu > maxGPU { + // return errors.Wrap(ErrorNoAvailableNodeComputeLimit("GPU", fmt.Sprintf("%d", gpu), fmt.Sprintf("%d", maxGPU)), userconfig.Identify(api)) + // } + // } return nil } From f4fd69c95152ff0ca88591ccf8ab09b731c4f0bf Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Mon, 7 Oct 2019 10:07:15 -0700 Subject: [PATCH 02/24] Update cluster-autoscaler.yaml --- dev/versions.md | 4 +- manager/manifests/cluster-autoscaler.yaml | 90 +++++++++++------------ 2 files changed, 44 insertions(+), 50 deletions(-) diff --git a/dev/versions.md b/dev/versions.md index 11ea89d2c2..cd8cb62eef 100644 --- a/dev/versions.md +++ b/dev/versions.md @@ -135,8 +135,8 @@ Note: overriding horizontal-pod-autoscaler-sync-period on EKS is currently not s ## Cluster autoscaler -1. Find the latest release on [GitHub](https://github.com/kubernetes/autoscaler/releases) and check the changelog -1. In the [GitHub Repo](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws), set the tree to the tag for the latest release, and open `cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml` (e.g. ) +1. Find the latest patch release for our current version of k8s (e.g. k8s v1.14 -> cluster-autoscaler v1.14.5) on [GitHub](https://github.com/kubernetes/autoscaler/releases) and check the changelog +1. In the [GitHub Repo](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws), set the tree to the tag for the chosen release, and open `cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml` (e.g. ) 1. Copy the contents to `manager/manifests/cluster-autoscaler.yaml` 1.
Update this line of config: diff --git a/manager/manifests/cluster-autoscaler.yaml b/manager/manifests/cluster-autoscaler.yaml index c39bb817ff..f189f7f0b4 100644 --- a/manager/manifests/cluster-autoscaler.yaml +++ b/manager/manifests/cluster-autoscaler.yaml @@ -12,8 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Source: https://github.com/kubernetes/autoscaler/blob/cluster-autoscaler-1.15.1/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml -# README: https://github.com/kubernetes/autoscaler/blob/cluster-autoscaler-1.15.1/cluster-autoscaler/cloudprovider/aws +# Source: https://github.com/kubernetes/autoscaler/blob/cluster-autoscaler-1.14.5/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml --- apiVersion: v1 @@ -33,45 +32,40 @@ metadata: k8s-addon: cluster-autoscaler.addons.k8s.io k8s-app: cluster-autoscaler rules: - - apiGroups: [""] - resources: ["events", "endpoints"] - verbs: ["create", "patch"] - - apiGroups: [""] - resources: ["pods/eviction"] - verbs: ["create"] - - apiGroups: [""] - resources: ["pods/status"] - verbs: ["update"] - - apiGroups: [""] - resources: ["endpoints"] - resourceNames: ["cluster-autoscaler"] - verbs: ["get", "update"] - - apiGroups: [""] - resources: ["nodes"] - verbs: ["watch", "list", "get", "update"] - - apiGroups: [""] - resources: - - "pods" - - "services" - - "replicationcontrollers" - - "persistentvolumeclaims" - - "persistentvolumes" - verbs: ["watch", "list", "get"] - - apiGroups: ["extensions"] - resources: ["replicasets", "daemonsets"] - verbs: ["watch", "list", "get"] - - apiGroups: ["policy"] - resources: ["poddisruptionbudgets"] - verbs: ["watch", "list"] - - apiGroups: ["apps"] - resources: ["statefulsets", "replicasets", "daemonsets"] - verbs: ["watch", "list", "get"] - - apiGroups: ["storage.k8s.io"] - resources: ["storageclasses"] - verbs: ["watch", "list", "get"] - - apiGroups: ["batch", "extensions"] - resources: ["jobs"] - verbs: ["get", "list", "watch", "patch"] +- apiGroups: [""] + resources: ["events","endpoints"] + verbs: ["create", "patch"] +- apiGroups: [""] + resources: ["pods/eviction"] + verbs: ["create"] +- apiGroups: [""] + resources: ["pods/status"] + verbs: ["update"] +- apiGroups: [""] + resources: ["endpoints"] + resourceNames: ["cluster-autoscaler"] + verbs: ["get","update"] +- apiGroups: [""] + resources: ["nodes"] + verbs: ["watch","list","get","update"] +- apiGroups: [""] + resources: ["pods","services","replicationcontrollers","persistentvolumeclaims","persistentvolumes"] + verbs: ["watch","list","get"] +- apiGroups: ["extensions"] + resources: ["replicasets","daemonsets"] + verbs: ["watch","list","get"] +- apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["watch","list"] +- apiGroups: ["apps"] + resources: ["statefulsets", "replicasets", "daemonsets"] + verbs: ["watch","list","get"] +- apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["watch","list","get"] +- apiGroups: ["batch", "extensions"] + resources: ["jobs"] + verbs: ["get", "list", "watch", "patch"] --- apiVersion: rbac.authorization.k8s.io/v1 @@ -83,13 +77,13 @@ metadata: k8s-addon: cluster-autoscaler.addons.k8s.io k8s-app: cluster-autoscaler rules: - - apiGroups: [""] - resources: ["configmaps"] - verbs: ["create","list","watch"] - - apiGroups: [""] - resources: ["configmaps"] - resourceNames: ["cluster-autoscaler-status", "cluster-autoscaler-priority-expander"] - verbs: ["delete", 
"get", "update", "watch"] +- apiGroups: [""] + resources: ["configmaps"] + verbs: ["create"] +- apiGroups: [""] + resources: ["configmaps"] + resourceNames: ["cluster-autoscaler-status"] + verbs: ["delete","get","update"] --- apiVersion: rbac.authorization.k8s.io/v1 From abfe18fdf822c60c087b80fea9e64d11d53c1c77 Mon Sep 17 00:00:00 2001 From: vishal Date: Mon, 7 Oct 2019 15:39:12 -0700 Subject: [PATCH 03/24] Update autoscaler to version 1.16 --- dev-cluster.yaml | 78 +++++++-------- manager/manifests/cluster-autoscaler.yaml | 114 ++++++++++------------ 2 files changed, 92 insertions(+), 100 deletions(-) diff --git a/dev-cluster.yaml b/dev-cluster.yaml index 814134822a..8194b916d7 100644 --- a/dev-cluster.yaml +++ b/dev-cluster.yaml @@ -6,45 +6,45 @@ metadata: region: us-west-2 version: "1.14" -# nodeGroups: -# # spot workers NG - multi AZ, scale from 3 -# - name: ng-4 -# ami: auto -# instanceType: mixed -# minSize: 1 -# maxSize: 5 -# volumeSize: 100 -# volumeType: gp2 -# volumeEncrypted: true -# iam: -# withAddonPolicies:a -# autoScaler: true -# instancesDistribution: -# instanceTypes: [t3.medium, t3.large] -# onDemandPercentageAboveBaseCapacity: 0 -# spotInstancePools: 2 -# taints: -# workload: "true:NoSchedule" -# tags: -# k8s.io/cluster-autoscaler/enabled: 'true' -# k8s.io/cluster-autoscaler/node-template/taint/dedicated: workload=true -# k8s.io/cluster-autoscaler/node-template/label/workload: 'true' -# k8s.io/cluster-autoscaler/node-template/label/lifecycle: 'Ec2Spot' -# labels: -# lifecycle: Ec2Spot -# kubeletExtraConfig: -# kubeReserved: -# cpu: 150m -# memory: 300Mi -# ephemeral-storage: 1Gi -# kubeReservedCgroup: /kube-reserved -# systemReserved: -# cpu: 150m -# memory: 300Mi -# ephemeral-storage: 1Gi -# evictionHard: -# memory.available: 200Mi -# nodefs.available: 5% +nodeGroups: + # spot workers NG - multi AZ, scale from 3 + - name: ng-3 + ami: auto + instanceType: mixed + minSize: 0 + maxSize: 5 + volumeSize: 100 + volumeType: gp2 + volumeEncrypted: true + iam: + withAddonPolicies: + autoScaler: true + instancesDistribution: + instanceTypes: [t3.medium, t3.large] + onDemandPercentageAboveBaseCapacity: 0 + spotInstancePools: 2 + taints: + workload: "true:NoSchedule" + tags: + k8s.io/cluster-autoscaler/enabled: 'true' + k8s.io/cluster-autoscaler/node-template/taint/dedicated: workload=true + k8s.io/cluster-autoscaler/node-template/label/workload: 'true' + k8s.io/cluster-autoscaler/node-template/label/lifecycle: 'Ec2Spot' + labels: + lifecycle: Ec2Spot + kubeletExtraConfig: + kubeReserved: + cpu: 150m + memory: 300Mi + ephemeral-storage: 1Gi + kubeReservedCgroup: /kube-reserved + systemReserved: + cpu: 150m + memory: 300Mi + ephemeral-storage: 1Gi + evictionHard: + memory.available: 200Mi + nodefs.available: 5% # nodeGroups: diff --git a/manager/manifests/cluster-autoscaler.yaml b/manager/manifests/cluster-autoscaler.yaml index f189f7f0b4..21dcc52848 100644 --- a/manager/manifests/cluster-autoscaler.yaml +++ b/manager/manifests/cluster-autoscaler.yaml @@ -1,19 +1,3 @@ -# Copyright 2019 Cortex Labs, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -# Source: https://github.com/kubernetes/autoscaler/blob/cluster-autoscaler-1.14.5/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml - --- apiVersion: v1 kind: ServiceAccount @@ -32,41 +16,49 @@ metadata: k8s-addon: cluster-autoscaler.addons.k8s.io k8s-app: cluster-autoscaler rules: -- apiGroups: [""] - resources: ["events","endpoints"] - verbs: ["create", "patch"] -- apiGroups: [""] - resources: ["pods/eviction"] - verbs: ["create"] -- apiGroups: [""] - resources: ["pods/status"] - verbs: ["update"] -- apiGroups: [""] - resources: ["endpoints"] - resourceNames: ["cluster-autoscaler"] - verbs: ["get","update"] -- apiGroups: [""] - resources: ["nodes"] - verbs: ["watch","list","get","update"] -- apiGroups: [""] - resources: ["pods","services","replicationcontrollers","persistentvolumeclaims","persistentvolumes"] - verbs: ["watch","list","get"] -- apiGroups: ["extensions"] - resources: ["replicasets","daemonsets"] - verbs: ["watch","list","get"] -- apiGroups: ["policy"] - resources: ["poddisruptionbudgets"] - verbs: ["watch","list"] -- apiGroups: ["apps"] - resources: ["statefulsets", "replicasets", "daemonsets"] - verbs: ["watch","list","get"] -- apiGroups: ["storage.k8s.io"] - resources: ["storageclasses"] - verbs: ["watch","list","get"] -- apiGroups: ["batch", "extensions"] - resources: ["jobs"] - verbs: ["get", "list", "watch", "patch"] - + - apiGroups: ["storage.k8s.io"] + resources: ["csinodes"] + verbs: ["watch", "list", "get"] + - apiGroups: [""] + resources: ["events", "endpoints"] + verbs: ["create", "patch"] + - apiGroups: [""] + resources: ["pods/eviction"] + verbs: ["create"] + - apiGroups: [""] + resources: ["pods/status"] + verbs: ["update"] + - apiGroups: [""] + resources: ["endpoints"] + resourceNames: ["cluster-autoscaler"] + verbs: ["get", "update"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["watch", "list", "get", "update"] + - apiGroups: [""] + resources: + - "pods" + - "services" + - "replicationcontrollers" + - "persistentvolumeclaims" + - "persistentvolumes" + verbs: ["watch", "list", "get"] + - apiGroups: ["extensions"] + resources: ["replicasets", "daemonsets"] + verbs: ["watch", "list", "get"] + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["watch", "list"] + - apiGroups: ["apps"] + resources: ["statefulsets", "replicasets", "daemonsets"] + verbs: ["watch", "list", "get"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["watch", "list", "get"] + - apiGroups: ["batch", "extensions"] + resources: ["jobs"] + verbs: ["get", "list", "watch", "patch"] + --- apiVersion: rbac.authorization.k8s.io/v1 kind: Role @@ -77,13 +69,13 @@ metadata: k8s-addon: cluster-autoscaler.addons.k8s.io k8s-app: cluster-autoscaler rules: -- apiGroups: [""] - resources: ["configmaps"] - verbs: ["create"] -- apiGroups: [""] - resources: ["configmaps"] - resourceNames: ["cluster-autoscaler-status"] - verbs: ["delete","get","update"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["create","list","watch"] + - apiGroups: [""] + resources: ["configmaps"] + resourceNames: ["cluster-autoscaler-status", "cluster-autoscaler-priority-expander"] + verbs: ["delete", "get", "update", "watch"] --- apiVersion: rbac.authorization.k8s.io/v1 @@ -140,7 +132,7 @@ spec: spec: serviceAccountName: cluster-autoscaler containers: - - image: $CORTEX_IMAGE_CLUSTER_AUTOSCALER + - image: 
k8s.gcr.io/cluster-autoscaler:v1.16.1 name: cluster-autoscaler resources: limits: @@ -156,7 +148,7 @@ spec: - --cloud-provider=aws - --skip-nodes-with-local-storage=false - --expander=least-waste - - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/$CORTEX_CLUSTER + - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/cortex volumeMounts: - name: ssl-certs mountPath: /etc/ssl/certs/ca-certificates.crt @@ -165,4 +157,4 @@ spec: volumes: - name: ssl-certs hostPath: - path: "/etc/ssl/certs/ca-bundle.crt" + path: "/etc/ssl/certs/ca-bundle.crt" \ No newline at end of file From fdc8201a84ffc3b454b4b8db46fa2123b4171a4a Mon Sep 17 00:00:00 2001 From: vishal Date: Thu, 10 Oct 2019 15:21:31 -0700 Subject: [PATCH 04/24] Calculate allocatable resources more accurately --- cortex.sh | 5 ++ dev-cluster.yaml | 74 ++++++++++++------------ dev/operator_local.sh | 6 +++ docs/cluster/config.md | 6 +-- images/manager/Dockerfile | 2 + manager/eks.yaml | 38 +++++++++++-- manager/install_cortex.sh | 22 +++++--- manager/instance_metadata.py | 86 ++++++++++++++++++++++++++++++ manager/manifests/fluentd.yaml | 7 +++ manager/manifests/nvidia.yaml | 7 +++ manager/requirements.txt | 2 + pkg/operator/config/config.go | 43 +++++++++++---- pkg/operator/workloads/workflow.go | 60 ++++++++------------- 13 files changed, 259 insertions(+), 99 deletions(-) create mode 100644 manager/instance_metadata.py create mode 100644 manager/requirements.txt diff --git a/cortex.sh b/cortex.sh index e283e433aa..ec81a80e38 100755 --- a/cortex.sh +++ b/cortex.sh @@ -213,6 +213,10 @@ fi function install_eks() { echo + export CORTEX_DESIRED_COUNT=$CORTEX_NODES_MIN + if [ $CORTEX_DESIRED_COUNT -lt 1 ]; then + export CORTEX_DESIRED_COUNT=1 + fi docker run -it --entrypoint /root/install_eks.sh \ -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ @@ -221,6 +225,7 @@ function install_eks() { -e CORTEX_NODE_TYPE=$CORTEX_NODE_TYPE \ -e CORTEX_NODES_MIN=$CORTEX_NODES_MIN \ -e CORTEX_NODES_MAX=$CORTEX_NODES_MAX \ + -e CORTEX_DESIRED_COUNT=$CORTEX_DESIRED_COUNT \ $CORTEX_IMAGE_MANAGER } diff --git a/dev-cluster.yaml b/dev-cluster.yaml index 8194b916d7..01125b9116 100644 --- a/dev-cluster.yaml +++ b/dev-cluster.yaml @@ -8,43 +8,43 @@ metadata: nodeGroups: # spot workers NG - multi AZ, scale from 3 - - name: ng-3 - ami: auto - instanceType: mixed - minSize: 0 - maxSize: 5 - volumeSize: 100 - volumeType: gp2 - volumeEncrypted: true - iam: - withAddonPolicies: - autoScaler: true - instancesDistribution: - instanceTypes: [t3.medium, t3.large] - onDemandPercentageAboveBaseCapacity: 0 - spotInstancePools: 2 - taints: - workload: "true:NoSchedule" - tags: - k8s.io/cluster-autoscaler/enabled: 'true' - k8s.io/cluster-autoscaler/node-template/taint/dedicated: workload=true - k8s.io/cluster-autoscaler/node-template/label/workload: 'true' - k8s.io/cluster-autoscaler/node-template/label/lifecycle: 'Ec2Spot' - labels: - lifecycle: Ec2Spot - kubeletExtraConfig: - kubeReserved: - cpu: 150m - memory: 300Mi - ephemeral-storage: 1Gi - kubeReservedCgroup: /kube-reserved - systemReserved: - cpu: 150m - memory: 300Mi - ephemeral-storage: 1Gi - evictionHard: - memory.available: 200Mi - nodefs.available: 5% + # - name: ng-3 + # ami: auto + # instanceType: mixed + # minSize: 0 + # maxSize: 5 + # volumeSize: 100 + # volumeType: gp2 + # volumeEncrypted: true + # iam: + # withAddonPolicies: + # autoScaler: true + # instancesDistribution: + #
instanceTypes: [t3.medium, t3.large] + # onDemandPercentageAboveBaseCapacity: 0 + # spotInstancePools: 2 + # taints: + # workload: "true:NoSchedule" + # tags: + # k8s.io/cluster-autoscaler/enabled: 'true' + # k8s.io/cluster-autoscaler/node-template/taint/dedicated: workload=true + # k8s.io/cluster-autoscaler/node-template/label/workload: 'true' + # k8s.io/cluster-autoscaler/node-template/label/lifecycle: 'Ec2Spot' + # labels: + # lifecycle: Ec2Spot + # kubeletExtraConfig: + # kubeReserved: + # cpu: 150m + # memory: 300Mi + # ephemeral-storage: 1Gi + # kubeReservedCgroup: /kube-reserved + # systemReserved: + # cpu: 150m + # memory: 300Mi + # ephemeral-storage: 1Gi + # evictionHard: + # memory.available: 200Mi + # nodefs.available: 5% # nodeGroups: diff --git a/dev/operator_local.sh b/dev/operator_local.sh index 968719190f..1fbc1a1a2a 100755 --- a/dev/operator_local.sh +++ b/dev/operator_local.sh @@ -23,6 +23,12 @@ source $ROOT/dev/config/cortex.sh export CORTEX_OPERATOR_IN_CLUSTER=false +pip3 install -r $ROOT/manager/requirements.txt + +export CORTEX_NODE_CPU=$(python3 $ROOT/manager/instance_metadata.py --region=$CORTEX_REGION --instance-type=$CORTEX_NODE_TYPE --cache-dir="$HOME/.cortex/ec2-metadata.json" --feature="cpu") +export CORTEX_NODE_MEM=$(python3 $ROOT/manager/instance_metadata.py --region=$CORTEX_REGION --instance-type=$CORTEX_NODE_TYPE --cache-dir="$HOME/.cortex/ec2-metadata.json" --feature="mem") +export CORTEX_NODE_GPU=$(python3 $ROOT/manager/instance_metadata.py --region=$CORTEX_REGION --instance-type=$CORTEX_NODE_TYPE --cache-dir="$HOME/.cortex/ec2-metadata.json" --feature="gpu") + kill $(pgrep -f rerun) >/dev/null 2>&1 || true updated_config=$(cat $HOME/.cortex/default.json | jq '.cortex_url = "http://localhost:8888"') && echo $updated_config > $HOME/.cortex/default.json rerun -watch $ROOT/pkg $ROOT/cli -ignore $ROOT/vendor $ROOT/bin -run sh -c \ diff --git a/docs/cluster/config.md b/docs/cluster/config.md index 5d789dfbe9..ec70e2ff85 100644 --- a/docs/cluster/config.md +++ b/docs/cluster/config.md @@ -27,13 +27,13 @@ export CORTEX_REGION="us-west-2" # The name of the EKS cluster Cortex will use export CORTEX_CLUSTER="cortex" -# The AWS node type Cortex will use +# The AWS node type Cortex will use for worker nodes export CORTEX_NODE_TYPE="m5.large" -# Minimum number of nodes in the cluster +# Minimum number of worker nodes in the cluster export CORTEX_NODES_MIN=2 -# Maximum number of nodes in the cluster +# Maximum number of worker nodes in the cluster export CORTEX_NODES_MAX=5 # The name of the Kubernetes namespace Cortex will use diff --git a/images/manager/Dockerfile b/images/manager/Dockerfile index 86181ec481..4d78b591e0 100644 --- a/images/manager/Dockerfile +++ b/images/manager/Dockerfile @@ -35,4 +35,6 @@ RUN ISTIO_VERSION=1.3.0 && \ COPY manager /root +RUN pip install -r /root/requirements.txt + ENTRYPOINT ["/bin/bash"] diff --git a/manager/eks.yaml b/manager/eks.yaml index 2254a8b280..48b265b72e 100644 --- a/manager/eks.yaml +++ b/manager/eks.yaml @@ -21,15 +21,47 @@ metadata: version: "1.14" nodeGroups: - - name: ng-1 + - name: ng-cortex-operator + instanceType: t3.medium + minSize: 2 + maxSize: 2 + desiredCapacity: 2 + ami: auto + iam: + withAddonPolicies: + autoScaler: true + kubeletExtraConfig: + kubeReserved: + cpu: 150m + memory: 300Mi + ephemeral-storage: 1Gi + kubeReservedCgroup: /kube-reserved + systemReserved: + cpu: 150m + memory: 300Mi + ephemeral-storage: 1Gi + evictionHard: + memory.available: 200Mi + nodefs.available: 5% + + - name: 
ng-cortex-workers + ami: auto instanceType: $CORTEX_NODE_TYPE minSize: $CORTEX_NODES_MIN maxSize: $CORTEX_NODES_MAX - desiredCapacity: $CORTEX_NODES_MIN - ami: auto + desiredCapacity: $CORTEX_DESIRED_COUNT iam: withAddonPolicies: autoScaler: true + taints: + workload: "true:NoSchedule" + tags: + k8s.io/cluster-autoscaler/enabled: 'true' + k8s.io/cluster-autoscaler/node-template/taint/dedicated: workload=true + k8s.io/cluster-autoscaler/node-template/label/workload: 'true' + k8s.io/cluster-autoscaler/node-template/label/lifecycle: 'Ec2Spot' + labels: + lifecycle: Ec2Spot kubeletExtraConfig: kubeReserved: cpu: 150m diff --git a/manager/install_cortex.sh b/manager/install_cortex.sh index e884fab9b2..eb31721d2d 100755 --- a/manager/install_cortex.sh +++ b/manager/install_cortex.sh @@ -63,6 +63,10 @@ function setup_configmap() { --from-literal='IMAGE_DOWNLOADER'=$CORTEX_IMAGE_DOWNLOADER \ --from-literal='IMAGE_TF_SERVE_GPU'=$CORTEX_IMAGE_TF_SERVE_GPU \ --from-literal='ENABLE_TELEMETRY'=$CORTEX_ENABLE_TELEMETRY \ + --from-literal='NODE_TYPE'=$CORTEX_NODE_TYPE \ + --from-literal='NODE_MEM'=$CORTEX_NODE_MEM \ + --from-literal='NODE_CPU'=$CORTEX_NODE_CPU \ + --from-literal='NODE_GPU'=$CORTEX_NODE_GPU \ -o yaml --dry-run | kubectl apply -f - >/dev/null } @@ -170,6 +174,14 @@ function validate_cortex() { echo -e "\n✓ Load balancers are ready" } +export CORTEX_NODE_CPU=$(python instance_metadata.py --region=$CORTEX_REGION --instance-type=$CORTEX_NODE_TYPE --cache-dir="./metadata.json" --feature="cpu") +export CORTEX_NODE_MEM=$(python instance_metadata.py --region=$CORTEX_REGION --instance-type=$CORTEX_NODE_TYPE --cache-dir="./metadata.json" --feature="mem") +export CORTEX_NODE_GPU=$(python instance_metadata.py --region=$CORTEX_REGION --instance-type=$CORTEX_NODE_TYPE --cache-dir="./metadata.json" --feature="gpu") + +echo $CORTEX_NODE_CPU +echo $CORTEX_NODE_MEM +echo $CORTEX_NODE_GPU + eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER --region=$CORTEX_REGION | grep -v "saved kubeconfig as" | grep -v "using region" || true # https://docs.aws.amazon.com/eks/latest/userguide/cni-upgrades.html @@ -199,12 +211,10 @@ envsubst < manifests/metrics-server.yaml | kubectl apply -f - >/dev/null envsubst < manifests/statsd.yaml | kubectl apply -f - >/dev/null echo "✓ Configured metrics" -envsubst < manifests/nvidia.yaml | kubectl apply -f - >/dev/null -echo "✓ Configured GPU support" - -# if [[ "$CORTEX_NODE_TYPE" == p* ]] || [[ "$CORTEX_NODE_TYPE" == g* ]]; then - -# fi +if [[ "$CORTEX_NODE_TYPE" == p* ]] || [[ "$CORTEX_NODE_TYPE" == g* ]]; then + envsubst < manifests/nvidia.yaml | kubectl apply -f - >/dev/null + echo "✓ Configured GPU support" +fi envsubst < manifests/operator.yaml | kubectl apply -f - >/dev/null echo "✓ Started operator" diff --git a/manager/instance_metadata.py b/manager/instance_metadata.py new file mode 100644 index 0000000000..223d97da3b --- /dev/null +++ b/manager/instance_metadata.py @@ -0,0 +1,86 @@ +import boto3 +import requests +import argparse +import re +import os +import pathlib +import json + +PRICING_ENDPOINT_TEMPLATE = ( + "https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{}/index.json" +) + + +def download_metadata(args): + response = requests.get(PRICING_ENDPOINT_TEMPLATE.format(args.region)) + offers = response.json() + + instance_mapping = {} + + for product_id, product in offers["products"].items(): + if product.get("attributes") is None: + continue + if product["attributes"].get("servicecode") != "AmazonEC2": + continue + if 
product["attributes"].get("tenancy") != "Shared": + continue + if product["attributes"].get("operatingSystem") != "Linux": + continue + if product["attributes"].get("capacitystatus") != "Used": + continue + if product["attributes"].get("operation") != "RunInstances": + continue + price_dimensions = list(offers["terms"]["OnDemand"][product["sku"]].values())[0][ + "priceDimensions" + ] + + price = list(price_dimensions.values())[0]["pricePerUnit"]["USD"] + + instance_type = product["attributes"]["instanceType"] + metadata = { + "sku": product["sku"], + "instance_type": instance_type, + "cpu": int(product["attributes"]["vcpu"]), + "mem": int( + float(re.sub("[^0-9\\.]", "", product["attributes"]["memory"].split(" ")[0])) * 1024 + ), + "price": float(price), + } + if product["attributes"].get("gpu") is not None: + metadata["gpu"] = product["attributes"]["gpu"] + instance_mapping[instance_type] = metadata + with open(args.cache_dir, "w") as outfile: + json.dump(instance_mapping, outfile) + + return instance_mapping + + +def get_metadata(args): + if pathlib.Path(args.cache_dir).exists(): + return json.load(open(args.cache_dir)) + else: + return download_metadata(args) + + +units = {"mem": "Mi"} + + +def set_ec2_metadata(args): + instance_mapping = get_metadata(args) + instance_type = instance_mapping[args.instance_type] + print("{}{}".format(instance_type.get(args.feature, "0"), units.get(args.feature, ""))) + + +def main(): + parser = argparse.ArgumentParser() + na = parser.add_argument_group("required named arguments") + na.add_argument("--region", required=True, help="AWS Region") + na.add_argument("--instance-type", required=True, help="Instance type") + na.add_argument("--feature", required=True, help="Feature to get") + na.add_argument("--cache-dir", required=True, help="Cache dir") + args = parser.parse_args() + set_ec2_metadata(args) + + +if __name__ == "__main__": + main() diff --git a/manager/manifests/fluentd.yaml b/manager/manifests/fluentd.yaml index 16b341199f..11b8230715 100644 --- a/manager/manifests/fluentd.yaml +++ b/manager/manifests/fluentd.yaml @@ -149,6 +149,13 @@ spec: secretKeyRef: name: aws-credentials key: AWS_SECRET_ACCESS_KEY + resources: + limits: + cpu: 200m + memory: 200Mi + requests: + cpu: 200m + memory: 200Mi volumeMounts: - name: varlog mountPath: /var/log diff --git a/manager/manifests/nvidia.yaml b/manager/manifests/nvidia.yaml index aede5fdbf0..7852b6038d 100644 --- a/manager/manifests/nvidia.yaml +++ b/manager/manifests/nvidia.yaml @@ -53,6 +53,13 @@ spec: volumeMounts: - name: device-plugin mountPath: /var/lib/kubelet/device-plugins + resources: # https://github.com/kubernetes/kubernetes/blob/master/cluster/addons/device-plugins/nvidia-gpu/daemonset.yaml#L44 + requests: + cpu: 50m + memory: 10Mi + limits: + cpu: 50m + memory: 10Mi volumes: - name: device-plugin hostPath: diff --git a/manager/requirements.txt b/manager/requirements.txt new file mode 100644 index 0000000000..f3667cfc01 --- /dev/null +++ b/manager/requirements.txt @@ -0,0 +1,2 @@ +requests==2.22.0 +boto3==1.9.199 diff --git a/pkg/operator/config/config.go b/pkg/operator/config/config.go index d474f8cab4..e267a2ef55 100644 --- a/pkg/operator/config/config.go +++ b/pkg/operator/config/config.go @@ -19,6 +19,8 @@ package config import ( "path/filepath" + kresource "k8s.io/apimachinery/pkg/api/resource" + "github.com/cortexlabs/cortex/pkg/consts" "github.com/cortexlabs/cortex/pkg/lib/aws" "github.com/cortexlabs/cortex/pkg/lib/configreader" @@ -37,12 +39,17 @@ var ( ) type CortexConfig struct { - 
ID string `json:"id"` - APIVersion string `json:"api_version"` - Bucket string `json:"bucket"` - LogGroup string `json:"log_group"` - Region string `json:"region"` - Namespace string `json:"namespace"` + ID string `json:"id"` + APIVersion string `json:"api_version"` + Bucket string `json:"bucket"` + LogGroup string `json:"log_group"` + Region string `json:"region"` + Namespace string `json:"namespace"` + NodeType string `json:"node_type"` + NodeCPU kresource.Quantity `json:"node_cpu"` + NodeMem kresource.Quantity `json:"node_mem"` + NodeGPU kresource.Quantity `json:"node_gpu"` + OperatorImage string `json:"operator_image"` TFServeImage string `json:"tf_serve_image"` TFAPIImage string `json:"tf_api_image"` @@ -58,11 +65,16 @@ type CortexConfig struct { func Init() error { Cortex = &CortexConfig{ - APIVersion: consts.CortexVersion, - Bucket: getStr("BUCKET"), - LogGroup: getStr("LOG_GROUP"), - Region: getStr("REGION"), - Namespace: getStr("NAMESPACE"), + APIVersion: consts.CortexVersion, + Bucket: getStr("BUCKET"), + LogGroup: getStr("LOG_GROUP"), + Region: getStr("REGION"), + Namespace: getStr("NAMESPACE"), + NodeType: getStr("NODE_TYPE"), + NodeCPU: getQuantity("NODE_CPU"), + NodeMem: getQuantity("NODE_MEM"), + NodeGPU: getQuantity("NODE_GPU"), + OperatorImage: getStr("IMAGE_OPERATOR"), TFServeImage: getStr("IMAGE_TF_SERVE"), TFAPIImage: getStr("IMAGE_TF_API"), @@ -75,6 +87,7 @@ func Init() error { EnableTelemetry: getBool("ENABLE_TELEMETRY", false), OperatorInCluster: getBool("OPERATOR_IN_CLUSTER", true), } + Cortex.ID = hash.String(Cortex.Bucket + Cortex.Region + Cortex.LogGroup) var err error @@ -114,6 +127,14 @@ func getStr(name string) string { return configreader.MustStringFromEnvOrFile(envVarName, filePath, v) } +func getQuantity(name string) kresource.Quantity { + v := &configreader.StringValidation{Required: true} + envVarName, filePath := getPaths(name) + value := configreader.MustStringFromEnvOrFile(envVarName, filePath, v) + + return kresource.MustParse(value) +} + func getBool(name string, defaultVal bool) bool { envVarName, filePath := getPaths(name) v := &configreader.BoolValidation{Default: defaultVal} diff --git a/pkg/operator/workloads/workflow.go b/pkg/operator/workloads/workflow.go index c16072b7e7..cf561672d8 100644 --- a/pkg/operator/workloads/workflow.go +++ b/pkg/operator/workloads/workflow.go @@ -17,6 +17,7 @@ limitations under the License. 
package workloads import ( + "fmt" "path/filepath" kresource "k8s.io/apimachinery/pkg/api/resource" @@ -27,7 +26,6 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/sets/strset" "github.com/cortexlabs/cortex/pkg/operator/api/context" "github.com/cortexlabs/cortex/pkg/operator/api/resource" + "github.com/cortexlabs/cortex/pkg/operator/api/userconfig" "github.com/cortexlabs/cortex/pkg/operator/config" ) +var cortexCPUReserve = kresource.MustParse("800m") // FluentD (200), Nvidia (50), StatsD (100), Kube Proxy (100), node capacity - node allocatable (300) CPU +var cortexMemReserve = kresource.MustParse("1500Mi") // FluentD (200), Nvidia (50), StatsD (100), KubeReserved (800), AWS node memory - node capacity (200) + func Init() error { err := reloadCurrentContexts() if err != nil { @@ -296,48 +301,25 @@ func GetDeploymentStatus(appName string) (resource.DeploymentStatus, error) { } func ValidateDeploy(ctx *context.Context) error { - nodes, err := config.Kubernetes.ListNodes(nil) - if err != nil { - return err - } - - var maxCPU, maxMem kresource.Quantity - var maxGPU int64 - for _, node := range nodes { - curCPU := node.Status.Capacity.Cpu() - curMem := node.Status.Capacity.Memory() - - var curGPU int64 - if GPUQuantity, ok := node.Status.Allocatable["nvidia.com/gpu"]; ok { - curGPU, _ = GPUQuantity.AsInt64() + maxCPU := config.Cortex.NodeCPU.Copy() + maxCPU.Sub(cortexCPUReserve) + maxMem := config.Cortex.NodeMem.Copy() + maxMem.Sub(cortexMemReserve) + maxGPU := config.Cortex.NodeGPU.Copy() + + for _, api := range ctx.APIs { + if maxCPU.Cmp(api.Compute.CPU.Quantity) < 0 { + return errors.Wrap(ErrorNoAvailableNodeComputeLimit("CPU", api.Compute.CPU.String(), maxCPU.String()), userconfig.Identify(api)) } - - if curCPU != nil && maxCPU.Cmp(*curCPU) < 0 { - maxCPU = *curCPU - } - - if curMem != nil && maxMem.Cmp(*curMem) < 0 { - maxMem = *curMem + if api.Compute.Mem != nil { + if maxMem.Cmp(api.Compute.Mem.Quantity) < 0 { + return errors.Wrap(ErrorNoAvailableNodeComputeLimit("Memory", api.Compute.Mem.String(), maxMem.String()), userconfig.Identify(api)) + } } - - if curGPU > maxGPU { - maxGPU = curGPU + gpu := api.Compute.GPU + if gpu > maxGPU.Value() { + return errors.Wrap(ErrorNoAvailableNodeComputeLimit("GPU", fmt.Sprintf("%d", gpu), fmt.Sprintf("%d", maxGPU.Value())), userconfig.Identify(api)) } } - - // for _, api := range ctx.APIs { - // if maxCPU.Cmp(api.Compute.CPU.Quantity) < 0 { - // return errors.Wrap(ErrorNoAvailableNodeComputeLimit("CPU", api.Compute.CPU.String(), maxCPU.String()), userconfig.Identify(api)) - // } - // if api.Compute.Mem != nil { - // if maxMem.Cmp(api.Compute.Mem.Quantity) < 0 { - // return errors.Wrap(ErrorNoAvailableNodeComputeLimit("Memory", api.Compute.Mem.String(), maxMem.String()), userconfig.Identify(api)) - // } - // } - // gpu := api.Compute.GPU - // if gpu > maxGPU { - // return errors.Wrap(ErrorNoAvailableNodeComputeLimit("GPU", fmt.Sprintf("%d", gpu), fmt.Sprintf("%d", maxGPU)), userconfig.Identify(api)) - // } - // } return nil } From b0e0fa616df5298afdd3b9fa31e84e069f710fac Mon Sep 17 00:00:00 2001 From: vishal Date: Tue, 12 Nov 2019 17:36:43 +0000 Subject: [PATCH 05/24] Separate nodegroups --- Makefile | 8 +- cli/cmd/cluster.go | 2 +- cli/cmd/lib_manager.go | 8 +- dev-cluster.yaml | 62 +++++++++++-- dev/operator_local.sh | 4 +- dev/registry.sh | 13 +-- docs/cluster/config.md | 2 +- examples/pytorch/iris-classifier/cortex.yaml | 3 + manager/eks.yaml | 20 ++-- manager/eks_gpu.yaml | 80 ++++++++++++++++ manager/install.sh | 8 +-
manager/instance_metadata.py | 40 ++++---- manager/manifests/cluster-autoscaler.yaml | 2 +- pkg/lib/clusterconfig/clusterconfig.go | 93 ++++++++++++------- pkg/lib/errors/errors.go | 4 +- .../api/userconfig => lib/k8s}/quantity.go | 7 +- pkg/operator/api/userconfig/compute.go | 27 +++--- pkg/operator/workloads/api_workload.go | 26 +++--- pkg/operator/workloads/workflow.go | 22 ++--- 19 files changed, 290 insertions(+), 141 deletions(-) create mode 100644 manager/eks_gpu.yaml rename pkg/{operator/api/userconfig => lib/k8s}/quantity.go (95%) diff --git a/Makefile b/Makefile index c7aceea538..cc931cccf5 100644 --- a/Makefile +++ b/Makefile @@ -29,11 +29,11 @@ kubectl: @kubectl config set-context --current --namespace=cortex >/dev/null cluster-up: - @$(MAKE) registry-all - @$(MAKE) cli - @kill $(shell pgrep -f rerun) >/dev/null 2>&1 || true + # @$(MAKE) registry-all + # @$(MAKE) cli + # @kill $(shell pgrep -f rerun) >/dev/null 2>&1 || true @./bin/cortex -c=./dev/config/cluster.yaml cluster up - @$(MAKE) kubectl + # @$(MAKE) kubectl cluster-down: @$(MAKE) manager-local diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go index 246dc1d498..66852920bb 100644 --- a/cli/cmd/cluster.go +++ b/cli/cmd/cluster.go @@ -75,7 +75,7 @@ var upCmd = &cobra.Command{ errors.Exit(err) } - promptForEmail() + //promptForEmail() clusterConfig, awsCreds, err := getInstallClusterConfig() if err != nil { diff --git a/cli/cmd/lib_manager.go b/cli/cmd/lib_manager.go index 69834ec689..af57a2cd50 100644 --- a/cli/cmd/lib_manager.go +++ b/cli/cmd/lib_manager.go @@ -127,9 +127,11 @@ func runManagerCommand(entrypoint string, clusterConfig *clusterconfig.ClusterCo } containerConfig := &container.Config{ - Image: clusterConfig.ImageManager, - Entrypoint: []string{"/bin/bash", "-c"}, - Cmd: []string{"sleep 0.1 && eval $(python /root/cluster_config_env.py /.cortex/cluster.yaml) && " + entrypoint}, + Image: clusterConfig.ImageManager, + Entrypoint: []string{"/bin/bash", "-c"}, + // Cmd: []string{"sleep 0.1 && python /root/instance_metadata.py /.cortex/cluster.yaml && eval $(python /root/cluster_config_env.py /.cortex/cluster.yaml) && echo $CORTEX_INSTANCE_CPU"}, + // Cmd: []string{"sleep 0.1 eval $(python /root/instance_metadata.py /.cortex/cluster.yaml) && eval $(python /root/cluster_config_env.py /.cortex/cluster.yaml) && " + entrypoint}, + Cmd: []string{"sleep 0.1 && eval $(python /root/cluster_config_env.py /.cortex/cluster.yaml /root/eks.yaml) && " + entrypoint}, Tty: true, AttachStdout: true, AttachStderr: true, diff --git a/dev-cluster.yaml b/dev-cluster.yaml index 01125b9116..decd7d4735 100644 --- a/dev-cluster.yaml +++ b/dev-cluster.yaml @@ -1,12 +1,12 @@ -apiVersion: eksctl.io/v1alpha5 -kind: ClusterConfig +# apiVersion: eksctl.io/v1alpha5 +# kind: ClusterConfig -metadata: - name: cortex - region: us-west-2 - version: "1.14" +# metadata: +# name: cortex +# region: us-west-2 +# version: "1.14" -nodeGroups: +# nodeGroups: # spot workers NG - multi AZ, scale from 3 # - name: ng-3 # ami: auto @@ -47,6 +47,52 @@ nodeGroups: # nodefs.available: 5% +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: cortex + region: us-west-2 + version: "1.14" + +nodeGroups: + - name: ng-cortex-worker1 + instanceType: p2.xlarge + minSize: 0 + maxSize: 2 + desiredCapacity: 0 + ami: auto + iam: + withAddonPolicies: + autoScaler: true + taints: + workload: "true:NoSchedule" + tags: + k8s.io/cluster-autoscaler/enabled: 'true' + k8s.io/cluster-autoscaler/node-template/label/nvidia.com/gpu: 'true' + 
k8s.io/cluster-autoscaler/node-template/taint/dedicated: workload=true + k8s.io/cluster-autoscaler/node-template/label/workload: 'true' + k8s.io/cluster-autoscaler/node-template/label/lifecycle: 'Ec2Spot' + labels: + lifecycle: Ec2Spot + workload: "true" + nvidia.com/gpu: 'true' + kubeletExtraConfig: + kubeReserved: + cpu: 150m + memory: 300Mi + ephemeral-storage: 1Gi + kubeReservedCgroup: /kube-reserved + systemReserved: + cpu: 150m + memory: 300Mi + ephemeral-storage: 1Gi + evictionHard: + memory.available: 200Mi + nodefs.available: 5% + + + # nodeGroups: # - name: spot-ng # ami: auto @@ -80,4 +126,4 @@ nodeGroups: # memory.available: 200Mi # nodefs.available: 5% # taints: -# nvidia.com/gpu: "true:NoSchedule" \ No newline at end of file +# nvidia.com/gpu: "true:NoSchedule" diff --git a/dev/operator_local.sh b/dev/operator_local.sh index 8c05cf1608..18d23530e0 100755 --- a/dev/operator_local.sh +++ b/dev/operator_local.sh @@ -24,9 +24,7 @@ export CORTEX_CLUSTER_CONFIG_PATH=$ROOT/dev/config/cluster.yaml pip3 install -r $ROOT/manager/requirements.txt -export CORTEX_NODE_CPU=$(python3 $ROOT/manager/instance_metadata.py --region=$CORTEX_REGION --instance-type=$CORTEX_NODE_TYPE --cache-dir="$HOME/.cortex/ec2-metadata.json" --feature="cpu") -export CORTEX_NODE_MEM=$(python3 $ROOT/manager/instance_metadata.py --region=$CORTEX_REGION --instance-type=$CORTEX_NODE_TYPE --cache-dir="$HOME/.cortex/ec2-metadata.json" --feature="mem") -export CORTEX_NODE_GPU=$(python3 $ROOT/manager/instance_metadata.py --region=$CORTEX_REGION --instance-type=$CORTEX_NODE_TYPE --cache-dir="$HOME/.cortex/ec2-metadata.json" --feature="gpu") +python3 $ROOT/manager/instance_metadata.py $CORTEX_CLUSTER_CONFIG_PATH kill $(pgrep -f rerun) >/dev/null 2>&1 || true updated_cli_config=$(cat $HOME/.cortex/default.json | jq '.cortex_url = "http://localhost:8888"') && echo $updated_cli_config > $HOME/.cortex/default.json diff --git a/dev/registry.sh b/dev/registry.sh index 18446007c2..7c23c3176e 100755 --- a/dev/registry.sh +++ b/dev/registry.sh @@ -141,12 +141,13 @@ elif [ "$cmd" = "update" ]; then build_and_push $ROOT/images/istio-galley istio-galley latest fi - build_and_push $ROOT/images/predictor-serve predictor-serve latest - build_and_push $ROOT/images/predictor-serve-gpu predictor-serve-gpu latest - build_and_push $ROOT/images/tf-api tf-api latest - build_and_push $ROOT/images/onnx-serve onnx-serve latest - build_and_push $ROOT/images/onnx-serve-gpu onnx-serve-gpu latest - build_and_push $ROOT/images/downloader downloader latest + # build_and_push $ROOT/images/predictor-serve predictor-serve latest + # build_and_push $ROOT/images/predictor-serve-gpu predictor-serve-gpu latest + # build_and_push $ROOT/images/tf-api tf-api latest + # build_and_push $ROOT/images/onnx-serve onnx-serve latest + # build_and_push $ROOT/images/onnx-serve-gpu onnx-serve-gpu latest + # build_and_push $ROOT/images/downloader downloader latest + build_and_push $ROOT/images/manager manager latest cleanup fi diff --git a/docs/cluster/config.md b/docs/cluster/config.md index c6b0bc3ed1..69159cf3e2 100644 --- a/docs/cluster/config.md +++ b/docs/cluster/config.md @@ -18,7 +18,7 @@ cortex_aws_secret_access_key: *** # Instance type Cortex will use instance_type: m5.large -# Minimum and maximum number of instances in the cluster +# Minimum and maximum number of instances in the cluster to run your API min_instances: 2 max_instances: 5 diff --git a/examples/pytorch/iris-classifier/cortex.yaml b/examples/pytorch/iris-classifier/cortex.yaml index 
937c38d7a9..d3d011e9ec 100644 --- a/examples/pytorch/iris-classifier/cortex.yaml +++ b/examples/pytorch/iris-classifier/cortex.yaml @@ -9,3 +9,6 @@ model: s3://cortex-examples/pytorch/iris-classifier/weights.pth tracker: model_type: classification + compute: + cpu: 1.2 + mem: 6.5Gi diff --git a/manager/eks.yaml b/manager/eks.yaml index 6d6547c7de..32a3ca7390 100644 --- a/manager/eks.yaml +++ b/manager/eks.yaml @@ -22,10 +22,10 @@ metadata: nodeGroups: - name: ng-cortex-operator - instanceType: t3.medium - minSize: 2 - maxSize: 2 - desiredCapacity: 2 + instanceType: t3.large + minSize: 1 + maxSize: 1 + desiredCapacity: 1 ami: auto iam: withAddonPolicies: @@ -44,14 +44,7 @@ nodeGroups: memory.available: 200Mi nodefs.available: 5% - - name: ng-cortex-workers - ami: auto - instanceType: $CORTEX_NODE_TYPE - minSize: $CORTEX_NODES_MIN - maxSize: $CORTEX_NODES_MAX - desiredCapacity: $CORTEX_DESIRED_COUNT - - - name: ng-1 + - name: ng-cortex-worker instanceType: $CORTEX_INSTANCE_TYPE minSize: $CORTEX_MIN_INSTANCES maxSize: $CORTEX_MAX_INSTANCES @@ -69,6 +62,7 @@ nodeGroups: k8s.io/cluster-autoscaler/node-template/label/lifecycle: 'Ec2Spot' labels: lifecycle: Ec2Spot + workload: "true" kubeletExtraConfig: kubeReserved: cpu: 150m @@ -80,5 +74,5 @@ nodeGroups: memory: 300Mi ephemeral-storage: 1Gi evictionHard: - memory.available: 200Mi + memory.available: 200Mi nodefs.available: 5% diff --git a/manager/eks_gpu.yaml b/manager/eks_gpu.yaml new file mode 100644 index 0000000000..fab6a51334 --- /dev/null +++ b/manager/eks_gpu.yaml @@ -0,0 +1,80 @@ +# Copyright 2019 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
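+# eks_gpu.yaml is the GPU variant of eks.yaml: install.sh applies this
+# template when the configured instance type reports a nonzero GPU count.
+# The worker nodegroup below combines a "workload" NoSchedule taint with
+# cluster-autoscaler node-template tags that mirror its labels and taints,
+# which is what lets the autoscaler scale the tainted, GPU-labeled ASG up
+# from zero nodes.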
+
+apiVersion: eksctl.io/v1alpha5
+kind: ClusterConfig
+
+metadata:
+  name: $CORTEX_CLUSTER_NAME
+  region: $CORTEX_REGION
+  version: "1.14"
+
+nodeGroups:
+  - name: ng-cortex-operator
+    instanceType: t3.medium
+    minSize: 2
+    maxSize: 2
+    desiredCapacity: 2
+    ami: auto
+    iam:
+      withAddonPolicies:
+        autoScaler: true
+    kubeletExtraConfig:
+      kubeReserved:
+        cpu: 150m
+        memory: 300Mi
+        ephemeral-storage: 1Gi
+      kubeReservedCgroup: /kube-reserved
+      systemReserved:
+        cpu: 150m
+        memory: 300Mi
+        ephemeral-storage: 1Gi
+      evictionHard:
+        memory.available: 200Mi
+        nodefs.available: 5%
+
+  - name: ng-cortex-worker
+    instanceType: $CORTEX_INSTANCE_TYPE
+    minSize: $CORTEX_MIN_INSTANCES
+    maxSize: $CORTEX_MAX_INSTANCES
+    desiredCapacity: $CORTEX_MIN_INSTANCES
+    ami: auto
+    iam:
+      withAddonPolicies:
+        autoScaler: true
+    taints:
+      workload: "true:NoSchedule"
+    tags:
+      k8s.io/cluster-autoscaler/enabled: 'true'
+      k8s.io/cluster-autoscaler/node-template/label/nvidia.com/gpu: 'true'
+      k8s.io/cluster-autoscaler/node-template/taint/dedicated: workload=true
+      k8s.io/cluster-autoscaler/node-template/label/workload: 'true'
+      k8s.io/cluster-autoscaler/node-template/label/lifecycle: 'Ec2Spot'
+    labels:
+      lifecycle: Ec2Spot
+      workload: "true"
+      nvidia.com/gpu: 'true'
+    kubeletExtraConfig:
+      kubeReserved:
+        cpu: 150m
+        memory: 300Mi
+        ephemeral-storage: 1Gi
+      kubeReservedCgroup: /kube-reserved
+      systemReserved:
+        cpu: 150m
+        memory: 300Mi
+        ephemeral-storage: 1Gi
+      evictionHard:
+        memory.available: 200Mi
+        nodefs.available: 5%
diff --git a/manager/install.sh b/manager/install.sh
index 0652fe9b21..9eccfb2494 100755
--- a/manager/install.sh
+++ b/manager/install.sh
@@ -26,7 +26,13 @@ function ensure_eks() {
         fi
 
         echo -e "\n○ Spinning up the cluster ... (this will take about 15 minutes)\n"
-        envsubst < eks.yaml | eksctl create cluster -f -
+        if [ $CORTEX_INSTANCE_GPU -ne 0 ]; then
+            echo "GPU"
+            envsubst < eks_gpu.yaml | eksctl create cluster -f -
+        else
+            echo "CPU"
+            envsubst < eks.yaml | eksctl create cluster -f -
+        fi
         echo -e "\n✓ Spun up the cluster"
         return
     fi
diff --git a/manager/instance_metadata.py b/manager/instance_metadata.py
index 223d97da3b..fc5eb51033 100644
--- a/manager/instance_metadata.py
+++ b/manager/instance_metadata.py
@@ -1,18 +1,19 @@
 import boto3
 import requests
-import argparse
+import sys
 import re
 import os
 import pathlib
 import json
+import yaml
 
 PRICING_ENDPOINT_TEMPLATE = (
     "https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{}/index.json"
 )
 
 
-def download_metadata(args):
-    response = requests.get(PRICING_ENDPOINT_TEMPLATE.format(args.region))
+def download_metadata(cluster_config):
+    response = requests.get(PRICING_ENDPOINT_TEMPLATE.format(cluster_config["region"]))
     offers = response.json()
     instance_mapping = {}
@@ -49,37 +50,30 @@ def download_metadata(args):
         if product["attributes"].get("gpu") is not None:
             metadata["gpu"] = product["attributes"]["gpu"]
         instance_mapping[instance_type] = metadata
-    with open(args.cache_dir, "w") as outfile:
-        json.dump(instance_mapping, outfile)
     return instance_mapping
 
 
-def get_metadata(args):
-    if pathlib.Path(args.cache_dir).exists():
-        return json.load(open(args.cache_dir))
-    else:
-        return download_metadata(args)
+def get_metadata(cluster_config):
+    return download_metadata(cluster_config)
 
 
-units = {"mem": "Mi"}
+def set_ec2_metadata(cluster_config_path):
+    with open(cluster_config_path, "r") as cluster_config_file:
+        cluster_config = yaml.safe_load(cluster_config_file)
 
+    instance_mapping = get_metadata(cluster_config)
+    instance_type = instance_mapping[cluster_config["instance_type"]]
 
+    cluster_config["instance_mem"] = str(instance_type["mem"]) + "Mi"
+    cluster_config["instance_cpu"] = str(instance_type["cpu"])
+    cluster_config["instance_gpu"] = int(instance_type.get("gpu", 0))
 
-def set_ec2_metadata(args):
-    instance_mapping = get_metadata(args)
-    instance_type = instance_mapping[args.instance_type]
-    print("{}{}".format(instance_type.get(args.feature, "0"), units.get(args.feature, "")))
+    with open(cluster_config_path, "w") as cluster_config_file:
+        yaml.dump(cluster_config, cluster_config_file, default_flow_style=False)
 
 
 def main():
-    parser = argparse.ArgumentParser()
-    na = parser.add_argument_group("required named arguments")
-    na.add_argument("--region", required=True, help="AWS Region")
-    na.add_argument("--instance-type", required=True, help="Instance type")
-    na.add_argument("--feature", required=True, help="Feature to get")
-    na.add_argument("--cache-dir", required=True, help="Cache dir")
-    args = parser.parse_args()
-    set_ec2_metadata(args)
+    set_ec2_metadata(sys.argv[1])
 
 
 if __name__ == "__main__":
diff --git a/manager/manifests/cluster-autoscaler.yaml b/manager/manifests/cluster-autoscaler.yaml
index 7c585d9a85..33c0ecc355 100644
--- a/manager/manifests/cluster-autoscaler.yaml
+++ b/manager/manifests/cluster-autoscaler.yaml
@@ -148,7 +148,7 @@ spec:
     spec:
       serviceAccountName: cluster-autoscaler
       containers:
-        - image: k8s.gcr.io/cluster-autoscaler:v1.16.1
+        - image: $CORTEX_IMAGE_CLUSTER_AUTOSCALER
          name: cluster-autoscaler
          resources:
            limits:
diff --git a/pkg/lib/clusterconfig/clusterconfig.go b/pkg/lib/clusterconfig/clusterconfig.go
index d1a97446d6..e835fdb75d 100644
--- a/pkg/lib/clusterconfig/clusterconfig.go
+++ b/pkg/lib/clusterconfig/clusterconfig.go
@@ -21,42 +21,48 @@ import (
 	"github.com/cortexlabs/cortex/pkg/consts"
 	"github.com/cortexlabs/cortex/pkg/lib/aws"
+	"github.com/cortexlabs/cortex/pkg/lib/table"
+	kresource "k8s.io/apimachinery/pkg/api/resource"
+
 	cr "github.com/cortexlabs/cortex/pkg/lib/configreader"
 	"github.com/cortexlabs/cortex/pkg/lib/errors"
 	"github.com/cortexlabs/cortex/pkg/lib/hash"
+	"github.com/cortexlabs/cortex/pkg/lib/k8s"
 	"github.com/cortexlabs/cortex/pkg/lib/pointer"
 	"github.com/cortexlabs/cortex/pkg/lib/prompt"
-	"github.com/cortexlabs/cortex/pkg/lib/table"
 )
 
 type ClusterConfig struct {
-	InstanceType           *string `json:"instance_type" yaml:"instance_type"`
-	MinInstances           *int64  `json:"min_instances" yaml:"min_instances"`
-	MaxInstances           *int64  `json:"max_instances" yaml:"max_instances"`
-	ClusterName            string  `json:"cluster_name" yaml:"cluster_name"`
-	Region                 string  `json:"region" yaml:"region"`
-	Bucket                 string  `json:"bucket" yaml:"bucket"`
-	LogGroup               string  `json:"log_group" yaml:"log_group"`
-	Telemetry              bool    `json:"telemetry" yaml:"telemetry"`
-	ImagePredictorServe    string  `json:"image_predictor_serve" yaml:"image_predictor_serve"`
-	ImagePredictorServeGPU string  `json:"image_predictor_serve_gpu" yaml:"image_predictor_serve_gpu"`
-	ImageTFServe           string  `json:"image_tf_serve" yaml:"image_tf_serve"`
-	ImageTFServeGPU        string  `json:"image_tf_serve_gpu" yaml:"image_tf_serve_gpu"`
-	ImageTFAPI             string  `json:"image_tf_api" yaml:"image_tf_api"`
-	ImageONNXServe         string  `json:"image_onnx_serve" yaml:"image_onnx_serve"`
-	ImageONNXServeGPU      string  `json:"image_onnx_serve_gpu" yaml:"image_onnx_serve_gpu"`
-	ImageOperator          string  `json:"image_operator" yaml:"image_operator"`
-	ImageManager           string  `json:"image_manager" yaml:"image_manager"`
-	ImageDownloader        string  `json:"image_downloader" yaml:"image_downloader"`
-	ImageClusterAutoscaler string  `json:"image_cluster_autoscaler" yaml:"image_cluster_autoscaler"`
-	ImageMetricsServer     string  `json:"image_metrics_server" yaml:"image_metrics_server"`
-	ImageNvidia            string  `json:"image_nvidia" yaml:"image_nvidia"`
-	ImageFluentd           string  `json:"image_fluentd" yaml:"image_fluentd"`
-	ImageStatsd            string  `json:"image_statsd" yaml:"image_statsd"`
-	ImageIstioProxy        string  `json:"image_istio_proxy" yaml:"image_istio_proxy"`
-	ImageIstioPilot        string  `json:"image_istio_pilot" yaml:"image_istio_pilot"`
-	ImageIstioCitadel      string  `json:"image_istio_citadel" yaml:"image_istio_citadel"`
-	ImageIstioGalley       string  `json:"image_istio_galley" yaml:"image_istio_galley"`
+	InstanceType           *string       `json:"instance_type" yaml:"instance_type"`
+	InstanceCPU            *k8s.Quantity `json:"instance_cpu" yaml:"instance_cpu"`
+	InstanceMem            *k8s.Quantity `json:"instance_mem" yaml:"instance_mem"`
+	InstanceGPU            int64         `json:"instance_gpu" yaml:"instance_gpu"`
+	MinInstances           *int64        `json:"min_instances" yaml:"min_instances"`
+	MaxInstances           *int64        `json:"max_instances" yaml:"max_instances"`
+	ClusterName            string        `json:"cluster_name" yaml:"cluster_name"`
+	Region                 string        `json:"region" yaml:"region"`
+	Bucket                 string        `json:"bucket" yaml:"bucket"`
+	LogGroup               string        `json:"log_group" yaml:"log_group"`
+	Telemetry              bool          `json:"telemetry" yaml:"telemetry"`
+	ImagePredictorServe    string        `json:"image_predictor_serve" yaml:"image_predictor_serve"`
+	ImagePredictorServeGPU string        `json:"image_predictor_serve_gpu" yaml:"image_predictor_serve_gpu"`
+	ImageTFServe           string        `json:"image_tf_serve" yaml:"image_tf_serve"`
+	ImageTFServeGPU        string        `json:"image_tf_serve_gpu" yaml:"image_tf_serve_gpu"`
+	ImageTFAPI             string        `json:"image_tf_api" yaml:"image_tf_api"`
+	ImageONNXServe         string        `json:"image_onnx_serve" yaml:"image_onnx_serve"`
+	ImageONNXServeGPU      string        `json:"image_onnx_serve_gpu" yaml:"image_onnx_serve_gpu"`
+	ImageOperator          string        `json:"image_operator" yaml:"image_operator"`
+	ImageManager           string        `json:"image_manager" yaml:"image_manager"`
+	ImageDownloader        string        `json:"image_downloader" yaml:"image_downloader"`
+	ImageClusterAutoscaler string        `json:"image_cluster_autoscaler" yaml:"image_cluster_autoscaler"`
+	ImageMetricsServer     string        `json:"image_metrics_server" yaml:"image_metrics_server"`
+	ImageNvidia            string        `json:"image_nvidia" yaml:"image_nvidia"`
+	ImageFluentd           string        `json:"image_fluentd" yaml:"image_fluentd"`
+	ImageStatsd            string        `json:"image_statsd" yaml:"image_statsd"`
+	ImageIstioProxy        string        `json:"image_istio_proxy" yaml:"image_istio_proxy"`
+	ImageIstioPilot        string        `json:"image_istio_pilot" yaml:"image_istio_pilot"`
+	ImageIstioCitadel      string        `json:"image_istio_citadel" yaml:"image_istio_citadel"`
+	ImageIstioGalley       string        `json:"image_istio_galley" yaml:"image_istio_galley"`
 }
 
 type InternalClusterConfig struct {
@@ -74,10 +80,31 @@ var Validation = &cr.StructValidation{
 			Validator: validateInstanceType,
 		},
 	},
+	{
+		StructField:         "InstanceCPU",
+		StringPtrValidation: &cr.StringPtrValidation{},
+		Parser: k8s.QuantityParser(&k8s.QuantityValidation{
+			GreaterThan: k8s.QuantityPtr(kresource.MustParse("0")),
+		}),
+	},
+	{
+		StructField:         "InstanceMem",
+		StringPtrValidation: &cr.StringPtrValidation{},
+		Parser: k8s.QuantityParser(&k8s.QuantityValidation{
+			GreaterThan: k8s.QuantityPtr(kresource.MustParse("0")),
+		}),
+	},
+	{
+		StructField: "InstanceGPU",
+		Int64Validation: &cr.Int64Validation{
+			Default:              0,
+			GreaterThanOrEqualTo: pointer.Int64(0),
+		},
+	},
 	{
 		StructField: "MinInstances",
 		Int64PtrValidation: &cr.Int64PtrValidation{
-			GreaterThan: pointer.Int64(0),
+			GreaterThanOrEqualTo: pointer.Int64(0),
 		},
 	},
 	{
@@ -168,7 +195,7 @@ var Validation = &cr.StructValidation{
 	{
 		StructField: "ImageManager",
 		StringValidation: &cr.StringValidation{
-			Default: "cortexlabs/manager:" + consts.CortexVersion,
+			Default: "969758392368.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/manager:latest",
 		},
 	},
 	{
@@ -259,7 +286,7 @@ func PromptValidation(skipPopulatedFields bool, promptInstanceType bool, default
 		defaults.InstanceType = pointer.String("m5.large")
 	}
 	if defaults.MinInstances == nil {
-		defaults.MinInstances = pointer.Int64(2)
+		defaults.MinInstances = pointer.Int64(1)
 	}
 	if defaults.MaxInstances == nil {
 		defaults.MaxInstances = pointer.Int64(5)
@@ -326,8 +353,8 @@ func validateInstanceType(instanceType string) (string, error) {
 
 // This does not set defaults for fields that are prompted from the user
 func SetFileDefaults(clusterConfig *ClusterConfig) error {
-	var emtpyMap interface{} = map[interface{}]interface{}{}
-	errs := cr.Struct(clusterConfig, emtpyMap, Validation)
+	var emptyMap interface{} = map[interface{}]interface{}{}
+	errs := cr.Struct(clusterConfig, emptyMap, Validation)
 	if errors.HasErrors(errs) {
 		return errors.FirstError(errs...)
 	}
diff --git a/pkg/lib/errors/errors.go b/pkg/lib/errors/errors.go
index 60a7c845e9..4f798397f2 100644
--- a/pkg/lib/errors/errors.go
+++ b/pkg/lib/errors/errors.go
@@ -144,14 +144,14 @@ func Panic(items ...interface{}) {
 		os.Exit(1)
 	}
 	err := MergeErrItems(items...)
-	// PrintStacktrace(err)
+	PrintStacktrace(err)
 	panic(err)
 }
 
 func PrintError(err error, strs ...string) {
 	wrappedErr := Wrap(err, strs...)
 	fmt.Println("error:", wrappedErr.Error())
-	// PrintStacktrace(wrappedErr)
+	PrintStacktrace(wrappedErr)
 }
 
 func PrintStacktrace(err error) {
diff --git a/pkg/operator/api/userconfig/quantity.go b/pkg/lib/k8s/quantity.go
similarity index 95%
rename from pkg/operator/api/userconfig/quantity.go
rename to pkg/lib/k8s/quantity.go
index d3ef0597bf..778b3e26c9 100644
--- a/pkg/operator/api/userconfig/quantity.go
+++ b/pkg/lib/k8s/quantity.go
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package userconfig
+package k8s
 
 import (
 	"encoding/json"
@@ -23,7 +23,6 @@ import (
 	kresource "k8s.io/apimachinery/pkg/api/resource"
 
 	"github.com/cortexlabs/cortex/pkg/lib/configreader"
-	"github.com/cortexlabs/cortex/pkg/lib/k8s"
 	s "github.com/cortexlabs/cortex/pkg/lib/strings"
 )
 
@@ -43,7 +42,7 @@ func QuantityParser(v *QuantityValidation) func(string) (interface{}, error) {
 	return func(str string) (interface{}, error) {
 		k8sQuantity, err := kresource.ParseQuantity(str)
 		if err != nil {
-			return Quantity{}, k8s.ErrorParseQuantity(str)
+			return Quantity{}, ErrorParseQuantity(str)
 		}
 
 		if v.GreaterThan != nil {
@@ -107,7 +106,7 @@ func (quantity *Quantity) ID() string {
 	return s.Int64(quantity.MilliValue())
 }
 
-func k8sQuantityPtr(k8sQuantity kresource.Quantity) *kresource.Quantity {
+func QuantityPtr(k8sQuantity kresource.Quantity) *kresource.Quantity {
 	return &k8sQuantity
 }
 
diff --git a/pkg/operator/api/userconfig/compute.go b/pkg/operator/api/userconfig/compute.go
index 980e7c1ac8..bec487514e 100644
--- a/pkg/operator/api/userconfig/compute.go
+++ b/pkg/operator/api/userconfig/compute.go
@@ -32,18 +32,19 @@ import (
 
 	cr "github.com/cortexlabs/cortex/pkg/lib/configreader"
 	"github.com/cortexlabs/cortex/pkg/lib/hash"
+	"github.com/cortexlabs/cortex/pkg/lib/k8s"
 	"github.com/cortexlabs/cortex/pkg/lib/pointer"
 	s "github.com/cortexlabs/cortex/pkg/lib/strings"
 )
 
 type APICompute struct {
-	MinReplicas          int32     `json:"min_replicas" yaml:"min_replicas"`
-	MaxReplicas          int32     `json:"max_replicas" yaml:"max_replicas"`
-	InitReplicas         int32     `json:"init_replicas" yaml:"init_replicas"`
-	TargetCPUUtilization int32     `json:"target_cpu_utilization" yaml:"target_cpu_utilization"`
-	CPU                  Quantity  `json:"cpu" yaml:"cpu"`
-	Mem                  *Quantity `json:"mem" yaml:"mem"`
-	GPU                  int64     `json:"gpu" yaml:"gpu"`
+	MinReplicas          int32         `json:"min_replicas" yaml:"min_replicas"`
+	MaxReplicas          int32         `json:"max_replicas" yaml:"max_replicas"`
+	InitReplicas         int32         `json:"init_replicas" yaml:"init_replicas"`
+	TargetCPUUtilization int32         `json:"target_cpu_utilization" yaml:"target_cpu_utilization"`
+	CPU                  k8s.Quantity  `json:"cpu" yaml:"cpu"`
+	Mem                  *k8s.Quantity `json:"mem" yaml:"mem"`
+	GPU                  int64         `json:"gpu" yaml:"gpu"`
 }
 
 var apiComputeFieldValidation = &cr.StructFieldValidation{
@@ -85,8 +86,8 @@ var apiComputeFieldValidation = &cr.StructFieldValidation{
 				Default:     "200m",
 				CastNumeric: true,
 			},
-			Parser: QuantityParser(&QuantityValidation{
-				GreaterThan: k8sQuantityPtr(kresource.MustParse("0")),
+			Parser: k8s.QuantityParser(&k8s.QuantityValidation{
+				GreaterThan: k8s.QuantityPtr(kresource.MustParse("0")),
 			}),
 		},
 		{
@@ -94,8 +95,8 @@ var apiComputeFieldValidation = &cr.StructFieldValidation{
 			StringPtrValidation: &cr.StringPtrValidation{
 				Default: nil,
 			},
-			Parser: QuantityParser(&QuantityValidation{
-				GreaterThan: k8sQuantityPtr(kresource.MustParse("0")),
+			Parser: k8s.QuantityParser(&k8s.QuantityValidation{
+				GreaterThan: k8s.QuantityPtr(kresource.MustParse("0")),
 			}),
 		},
 		{
@@ -150,7 +151,7 @@ func (ac *APICompute) ID() string {
 	buf.WriteString(s.Int32(ac.InitReplicas))
 	buf.WriteString(s.Int32(ac.TargetCPUUtilization))
 	buf.WriteString(ac.CPU.ID())
-	buf.WriteString(QuantityPtrID(ac.Mem))
+	buf.WriteString(k8s.QuantityPtrID(ac.Mem))
 	buf.WriteString(s.Int64(ac.GPU))
 	return hash.Bytes(buf.Bytes())
 }
@@ -159,7 +160,7 @@ func (ac *APICompute) IDWithoutReplicas() string {
 	var buf bytes.Buffer
 	buf.WriteString(ac.CPU.ID())
-	buf.WriteString(QuantityPtrID(ac.Mem))
+	buf.WriteString(k8s.QuantityPtrID(ac.Mem))
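 	// note: only CPU, mem, and GPU are hashed below; replica counts are deliberately excluded from this ID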
 	buf.WriteString(s.Int64(ac.GPU))
 	return hash.Bytes(buf.Bytes())
 }
diff --git a/pkg/operator/workloads/api_workload.go b/pkg/operator/workloads/api_workload.go
index 38e1af1374..188ff6319b 100644
--- a/pkg/operator/workloads/api_workload.go
+++ b/pkg/operator/workloads/api_workload.go
@@ -425,7 +425,7 @@ func tfAPISpec(
 					},
 				},
 				NodeSelector: map[string]string{
-					"lifecycle": "Ec2Spot",
+					"workload": "true",
 				},
 				Tolerations:        tolerations,
 				Volumes:            defaultVolumes(),
@@ -580,9 +580,9 @@ func predictorAPISpec(
 					},
 				},
 				NodeSelector: map[string]string{
-					"lifecycle": "Ec2Spot",
+					"workload": "true",
 				},
-				Tolerations:        tolerations,
+				Tolerations:        k8s.Tolerations(),
 				Volumes:            defaultVolumes(),
 				ServiceAccountName: "default",
 			},
@@ -734,10 +734,10 @@ func onnxAPISpec(
 					},
 				},
 				NodeSelector: map[string]string{
-					"lifecycle": "Ec2Spot",
+					"workload": "true",
 				},
 				Tolerations:        tolerations,
-				Volumes:            k8s.DefaultVolumes(),
+				Volumes:            defaultVolumes(),
 				ServiceAccountName: "default",
 			},
 		},
@@ -788,10 +788,10 @@ func doesAPIComputeNeedsUpdating(api *context.API, k8sDeployment *kapps.Deployme
 	}
 
 	curCPU, curMem, curGPU := APIPodCompute(k8sDeployment.Spec.Template.Spec.Containers)
-	if !userconfig.QuantityPtrsEqual(curCPU, &api.Compute.CPU) {
+	if !k8s.QuantityPtrsEqual(curCPU, &api.Compute.CPU) {
 		return true
 	}
-	if !userconfig.QuantityPtrsEqual(curMem, api.Compute.Mem) {
+	if !k8s.QuantityPtrsEqual(curMem, api.Compute.Mem) {
 		return true
 	}
 	if curGPU != api.Compute.GPU {
@@ -887,7 +887,7 @@ func APIsBaseURL() (string, error) {
 func APIPodComputeID(containers []kcore.Container) string {
 	cpu, mem, gpu := APIPodCompute(containers)
 	if cpu == nil {
-		cpu = &userconfig.Quantity{} // unexpected, since 0 is disallowed
+		cpu = &k8s.Quantity{} // unexpected, since 0 is disallowed
 	}
 	podAPICompute := userconfig.APICompute{
 		CPU: *cpu,
@@ -897,9 +897,9 @@ func APIPodComputeID(containers []kcore.Container) string {
 	return podAPICompute.IDWithoutReplicas()
 }
 
-func APIPodCompute(containers []kcore.Container) (*userconfig.Quantity, *userconfig.Quantity, int64) {
-	var totalCPU *userconfig.Quantity
-	var totalMem *userconfig.Quantity
+func APIPodCompute(containers []kcore.Container) (*k8s.Quantity, *k8s.Quantity, int64) {
+	var totalCPU *k8s.Quantity
+	var totalMem *k8s.Quantity
 	var totalGPU int64
 
 	for _, container := range containers {
@@ -914,13 +914,13 @@ func APIPodCompute(containers []kcore.Container) (*userconfig.Quantity, *usercon
 
 		if cpu, ok := requests[kcore.ResourceCPU]; ok {
 			if totalCPU == nil {
-				totalCPU = &userconfig.Quantity{}
+				totalCPU = &k8s.Quantity{}
 			}
 			totalCPU.Add(cpu)
 		}
 		if mem, ok := requests[kcore.ResourceMemory]; ok {
 			if totalMem == nil {
-				totalMem = &userconfig.Quantity{}
+				totalMem = &k8s.Quantity{}
 			}
 			totalMem.Add(mem)
 		}
diff --git a/pkg/operator/workloads/workflow.go b/pkg/operator/workloads/workflow.go
index fd559a637e..5b994a3e98 100644
--- a/pkg/operator/workloads/workflow.go
+++ b/pkg/operator/workloads/workflow.go
@@ -32,7 +32,7 @@ import (
 	"github.com/cortexlabs/cortex/pkg/operator/config"
 )
 
-var cortexCPUReserve = kresource.MustParse("800m")   // FluentD (200), Nvidia (50), StatsD (100), Kube Procy, (100) Node capacity - Node availability 300 CPU
+var cortexCPUReserve = kresource.MustParse("800m")   // FluentD (200), Nvidia (50), StatsD (100), Kube Proxy, (100) Node capacity - Node availability 300 CPU
 var cortexMemReserve = kresource.MustParse("1500Mi") // FluentD (200), Nvidia (50), StatsD (100), KubeReserved (800), AWS node memory - Node capacity (200)
 
 func Init() error {
@@ -302,11 +302,6 @@ func GetDeploymentStatus(appName string) (resource.DeploymentStatus, error) {
 }
 
 func ValidateDeploy(ctx *context.Context) error {
-	// maxCPU := config.Cortex.NodeCPU.Copy()
-	// maxCPU.Sub(cortexCPUReserve)
-	// maxMem := config.Cortex.NodeMem.Copy()
-	// maxMem.Sub(cortexMemReserve)
-	// maxGPU := config.Cortex.NodeGPU.Copy()
 	if err := CheckAPIEndpointCollisions(ctx); err != nil {
 		return err
 	}
@@ -316,8 +311,11 @@ func ValidateDeploy(ctx *context.Context) error {
 		return err
 	}
 
-	var maxCPU, maxMem kresource.Quantity
-	var maxGPU int64
+	maxCPU := config.Cluster.InstanceCPU.Copy()
+	//maxCPU.Sub(cortexCPUReserve)
+	maxMem := config.Cluster.InstanceMem.Copy()
+	//maxMem.Sub(cortexMemReserve)
+	maxGPU := config.Cluster.InstanceGPU
 	for _, node := range nodes {
 		curCPU := node.Status.Capacity.Cpu()
 		curMem := node.Status.Capacity.Memory()
@@ -328,11 +326,11 @@ func ValidateDeploy(ctx *context.Context) error {
 		}
 
 		if curCPU != nil && maxCPU.Cmp(*curCPU) < 0 {
-			maxCPU = *curCPU
+			maxCPU = curCPU
 		}
 
 		if curMem != nil && maxMem.Cmp(*curMem) < 0 {
-			maxMem = *curMem
+			maxMem = curMem
 		}
 
 		if curGPU > maxGPU {
@@ -350,8 +348,8 @@ func ValidateDeploy(ctx *context.Context) error {
 			}
 		}
 		gpu := api.Compute.GPU
-		if gpu > maxGPU.Value() {
-			return errors.Wrap(ErrorNoAvailableNodeComputeLimit("GPU", fmt.Sprintf("%d", gpu), fmt.Sprintf("%d", maxGPU.Value())), userconfig.Identify(api))
+		if gpu > maxGPU {
+			return errors.Wrap(ErrorNoAvailableNodeComputeLimit("GPU", fmt.Sprintf("%d", gpu), fmt.Sprintf("%d", maxGPU)), userconfig.Identify(api))
 		}
 	}
 	return nil

From 3268382b9c98776f790ed969902ca5a0a214faac Mon Sep 17 00:00:00 2001
From: vishal
Date: Tue, 12 Nov 2019 16:15:19 -0500
Subject: [PATCH 06/24] Add desired instances

---
 Makefile             |  2 +-
 dev-cluster.yaml     | 95 +++-----------------------------------------
 dev/registry.sh      | 12 +++---
 manager/eks.yaml     | 16 ++++----
 manager/eks_gpu.yaml |  9 +++--
 manager/install.sh   | 12 ++++--
 6 files changed, 34 insertions(+), 112 deletions(-)

diff --git a/Makefile b/Makefile
index 0c2a244435..cc931cccf5 100644
--- a/Makefile
+++ b/Makefile
@@ -25,7 +25,7 @@ devstart:
 	@./dev/operator_local.sh || true
 
 kubectl:
-	@eval $$(python ./manager/cluster_config_env.py ./dev/config/cluster.yaml) && eksctl utils write-kubeconfig --cluster="$$CORTEX_CLUSTER_NAME" | grep -v "saved kubeconfig as" | grep -v "using region" | grep -v "eksctl version" || true
+	@eval $$(python ./manager/cluster_config_env.py ./dev/config/cluster.yaml) && eksctl utils write-kubeconfig --name="$$CORTEX_CLUSTER_NAME" | grep -v "saved kubeconfig as" | grep -v "using region" | grep -v "eksctl version" || true
 	@kubectl config set-context --current --namespace=cortex >/dev/null
 
 cluster-up:
diff --git a/dev-cluster.yaml b/dev-cluster.yaml
index decd7d4735..b54d482010 100644
--- a/dev-cluster.yaml
+++ b/dev-cluster.yaml
@@ -1,52 +1,3 @@
-# apiVersion: eksctl.io/v1alpha5
-# kind: ClusterConfig
-
-# metadata:
-#   name: cortex
-#   region: us-west-2
-#   version: "1.14"
-
-# nodeGroups:
-#   # spot workers NG - multi AZ, scale from 3
-#   - name: ng-3
-#     ami: auto
-#     instanceType: mixed
-#     minSize: 0
-#     maxSize: 5
-#     volumeSize: 100
-#     volumeType: gp2
-#     volumeEncrypted: true
-#     iam:
-#       withAddonPolicies:
-#         autoScaler: true
-#     instancesDistribution:
-#       instanceTypes: [t3.medium, t3.large]
-#       onDemandPercentageAboveBaseCapacity: 0
-#       spotInstancePools: 2
-#     taints:
-#       workload: "true:NoSchedule"
-#     tags:
-#       k8s.io/cluster-autoscaler/enabled: 'true'
-#       k8s.io/cluster-autoscaler/node-template/taint/dedicated: workload=true
-#       k8s.io/cluster-autoscaler/node-template/label/workload: 'true'
-#       k8s.io/cluster-autoscaler/node-template/label/lifecycle: 'Ec2Spot'
-#     labels:
-#       lifecycle: Ec2Spot
-#     kubeletExtraConfig:
-#       kubeReserved:
-#         cpu: 150m
-#         memory: 300Mi
-#         ephemeral-storage: 1Gi
-#       kubeReservedCgroup: /kube-reserved
-#       systemReserved:
-#         cpu: 150m
-#         memory: 300Mi
-#         ephemeral-storage: 1Gi
-#       evictionHard:
-#         memory.available: 200Mi
-#         nodefs.available: 5%
-
-
 apiVersion: eksctl.io/v1alpha5
 kind: ClusterConfig
 
@@ -60,23 +11,24 @@ nodeGroups:
     instanceType: p2.xlarge
     minSize: 0
     maxSize: 2
-    desiredCapacity: 0
+    desiredCapacity: 1
     ami: auto
     iam:
       withAddonPolicies:
         autoScaler: true
-    taints:
-      workload: "true:NoSchedule"
     tags:
       k8s.io/cluster-autoscaler/enabled: 'true'
       k8s.io/cluster-autoscaler/node-template/label/nvidia.com/gpu: 'true'
-      k8s.io/cluster-autoscaler/node-template/taint/dedicated: workload=true
+      k8s.io/cluster-autoscaler/node-template/taint/dedicated: nvidia.com/gpu=true
       k8s.io/cluster-autoscaler/node-template/label/workload: 'true'
      k8s.io/cluster-autoscaler/node-template/label/lifecycle: 'Ec2Spot'
     labels:
       lifecycle: Ec2Spot
       workload: "true"
       nvidia.com/gpu: 'true'
+    taints:
+      nvidia.com/gpu: "true:NoSchedule"
+      workload: "true:NoSchedule"
     kubeletExtraConfig:
       kubeReserved:
         cpu: 150m
@@ -90,40 +42,3 @@ nodeGroups:
       evictionHard:
         memory.available: 200Mi
         nodefs.available: 5%
-
-
-
-# nodeGroups:
-#   - name: spot-ng
-#     ami: auto
-#     instanceType: mixed
-#     desiredCapacity: 0
-#     minSize: 0
-#     maxSize: 2
-#     volumeSize: 100
-#     volumeType: gp2
-#     volumeEncrypted: true
-#     instancesDistribution:
-#       instanceTypes: [p2.xlarge, p2.8xlarge]
-#     iam:
-#       withAddonPolicies:
-#         autoScaler: true
-#     # tags:
-#     #   k8s.io/cluster-autoscaler/node-template/taint/dedicated: nvidia.com/gpu=true
-#     #   k8s.io/cluster-autoscaler/node-template/label/nvidia.com/gpu: 'true'
-#     #   k8s.io/cluster-autoscaler/enabled: 'true'
-#     kubeletExtraConfig:
-#       kubeReserved:
-#         cpu: 150m
-#         memory: 300Mi
-#         ephemeral-storage: 1Gi
-#       kubeReservedCgroup: /kube-reserved
-#       systemReserved:
-#         cpu: 150m
-#         memory: 300Mi
-#         ephemeral-storage: 1Gi
-#       evictionHard:
-#         memory.available: 200Mi
-#         nodefs.available: 5%
-#     taints:
-#       nvidia.com/gpu: "true:NoSchedule"
diff --git a/dev/registry.sh b/dev/registry.sh
index 7c23c3176e..3fea7b319e 100755
--- a/dev/registry.sh
+++ b/dev/registry.sh
@@ -141,12 +141,12 @@ elif [ "$cmd" = "update" ]; then
     build_and_push $ROOT/images/istio-galley istio-galley latest
   fi
 
-  # build_and_push $ROOT/images/predictor-serve predictor-serve latest
-  # build_and_push $ROOT/images/predictor-serve-gpu predictor-serve-gpu latest
-  # build_and_push $ROOT/images/tf-api tf-api latest
-  # build_and_push $ROOT/images/onnx-serve onnx-serve latest
-  # build_and_push $ROOT/images/onnx-serve-gpu onnx-serve-gpu latest
-  # build_and_push $ROOT/images/downloader downloader latest
+  build_and_push $ROOT/images/predictor-serve predictor-serve latest
+  build_and_push $ROOT/images/predictor-serve-gpu predictor-serve-gpu latest
+  build_and_push $ROOT/images/tf-api tf-api latest
+  build_and_push $ROOT/images/onnx-serve onnx-serve latest
+  build_and_push $ROOT/images/onnx-serve-gpu onnx-serve-gpu latest
+  build_and_push $ROOT/images/downloader downloader latest
 
   build_and_push $ROOT/images/manager manager latest
 
   cleanup
fi
diff --git a/manager/eks.yaml b/manager/eks.yaml
index 32a3ca7390..116c1bea52 100644
--- a/manager/eks.yaml
+++ b/manager/eks.yaml
@@ -22,10 +22,10 @@ metadata:
 
 nodeGroups:
   - name: ng-cortex-operator
-    instanceType: t3.large
-    minSize: 1
-    maxSize: 1
-    desiredCapacity: 1
+    instanceType: t3.medium
+    minSize: 2
+    maxSize: 2
+    desiredCapacity: 2
     ami: auto
     iam:
       withAddonPolicies:
@@ -48,21 +48,21 @@ nodeGroups:
     instanceType: $CORTEX_INSTANCE_TYPE
     minSize: $CORTEX_MIN_INSTANCES
     maxSize: $CORTEX_MAX_INSTANCES
-    desiredCapacity: $CORTEX_MIN_INSTANCES
+    desiredCapacity: $CORTEX_DESIRED_INSTANCES
     ami: auto
     iam:
       withAddonPolicies:
         autoScaler: true
-    taints:
-      workload: "true:NoSchedule"
     tags:
       k8s.io/cluster-autoscaler/enabled: 'true'
-      k8s.io/cluster-autoscaler/node-template/taint/dedicated: workload=true
+      k8s.io/cluster-autoscaler/node-template/label/nvidia.com/gpu: 'true'
       k8s.io/cluster-autoscaler/node-template/label/workload: 'true'
       k8s.io/cluster-autoscaler/node-template/label/lifecycle: 'Ec2Spot'
     labels:
       lifecycle: Ec2Spot
       workload: "true"
+    taints:
+      workload: "true:NoSchedule"
     kubeletExtraConfig:
       kubeReserved:
         cpu: 150m
diff --git a/manager/eks_gpu.yaml b/manager/eks_gpu.yaml
index fab6a51334..aab794852b 100644
--- a/manager/eks_gpu.yaml
+++ b/manager/eks_gpu.yaml
@@ -48,23 +48,24 @@ nodeGroups:
     instanceType: $CORTEX_INSTANCE_TYPE
     minSize: $CORTEX_MIN_INSTANCES
     maxSize: $CORTEX_MAX_INSTANCES
-    desiredCapacity: $CORTEX_MIN_INSTANCES
+    desiredCapacity: $CORTEX_DESIRED_INSTANCES
     ami: auto
     iam:
       withAddonPolicies:
         autoScaler: true
-    taints:
-      workload: "true:NoSchedule"
     tags:
       k8s.io/cluster-autoscaler/enabled: 'true'
       k8s.io/cluster-autoscaler/node-template/label/nvidia.com/gpu: 'true'
-      k8s.io/cluster-autoscaler/node-template/taint/dedicated: workload=true
+      k8s.io/cluster-autoscaler/node-template/taint/dedicated: nvidia.com/gpu=true
       k8s.io/cluster-autoscaler/node-template/label/workload: 'true'
       k8s.io/cluster-autoscaler/node-template/label/lifecycle: 'Ec2Spot'
     labels:
       lifecycle: Ec2Spot
       workload: "true"
       nvidia.com/gpu: 'true'
+    taints:
+      nvidia.com/gpu: "true:NoSchedule"
+      workload: "true:NoSchedule"
     kubeletExtraConfig:
       kubeReserved:
         cpu: 150m
diff --git a/manager/install.sh b/manager/install.sh
index 2cb4572175..d59d601519 100755
--- a/manager/install.sh
+++ b/manager/install.sh
@@ -32,6 +32,12 @@ function ensure_eks() {
         exit 1
     fi
 
+    if [ $CORTEX_MIN_INSTANCES -lt 1 ]; then
+        CORTEX_DESIRED_INSTANCES=1
+    else
+        CORTEX_DESIRED_INSTANCES=$CORTEX_MIN_INSTANCES
+    fi
+
     echo -e "○ Spinning up the cluster ... (this will take about 15 minutes)\n"
     if [ $CORTEX_INSTANCE_GPU -ne 0 ]; then
         echo "GPU"
@@ -66,15 +72,15 @@ function ensure_eks() {
     echo "✓ Cluster is running"
 
     # Check if instance type changed
-    ng_info=$(eksctl get nodegroup --cluster=$CORTEX_CLUSTER_NAME --region=$CORTEX_REGION --name ng-1 -o json)
-    ng_instance_type=$(echo "$ng_info" | jq -r ".[] | select( .Cluster == \"$CORTEX_CLUSTER_NAME\" ) | select( .Name == \"ng-1\" ) | .InstanceType")
+    ng_info=$(eksctl get nodegroup --cluster=$CORTEX_CLUSTER_NAME --region=$CORTEX_REGION --name ng-cortex-worker -o json)
+    ng_instance_type=$(echo "$ng_info" | jq -r ".[] | select( .Cluster == \"$CORTEX_CLUSTER_NAME\" ) | select( .Name == \"ng-cortex-worker\" ) | .InstanceType")
     if [ "$ng_instance_type" != "$CORTEX_INSTANCE_TYPE" ]; then
         echo -e "\nerror: Cortex does not currently support changing the instance type of a running cluster; please run \`cortex cluster down\` followed by \`cortex cluster up\` to create a new cluster"
         exit 1
     fi
 
     # Check for change in min/max instances
-    asg_info=$(aws autoscaling describe-auto-scaling-groups --region $CORTEX_REGION --query 'AutoScalingGroups[?contains(Tags[?Key==`alpha.eksctl.io/nodegroup-name`].Value, `ng-1`)]')
+    asg_info=$(aws autoscaling describe-auto-scaling-groups --region $CORTEX_REGION --query 'AutoScalingGroups[?contains(Tags[?Key==`alpha.eksctl.io/nodegroup-name`].Value, `ng-cortex-worker`)]')
     asg_name=$(echo "$asg_info" | jq -r 'first | .AutoScalingGroupName')
     asg_min_size=$(echo "$asg_info" | jq -r 'first | .MinSize')
     asg_max_size=$(echo "$asg_info" | jq -r 'first | .MaxSize')

From 8d4ea323c664913b507db6d2f5bf6c093e721bab Mon Sep 17 00:00:00 2001
From: vishal
Date: Tue, 12 Nov 2019 16:20:17 -0500
Subject: [PATCH 07/24] Minor cleanup

---
 Makefile                                     | 8 ++++----
 docs/cluster/config.md                       | 2 +-
 examples/pytorch/iris-classifier/cortex.yaml | 3 ---
 images/manager/Dockerfile                    | 4 ++--
 manager/manifests/statsd.yaml                | 2 +-
 5 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/Makefile b/Makefile
index cc931cccf5..c7aceea538 100644
--- a/Makefile
+++ b/Makefile
@@ -29,11 +29,11 @@ kubectl:
 	@kubectl config set-context --current --namespace=cortex >/dev/null
 
 cluster-up:
-	# @$(MAKE) registry-all
-	# @$(MAKE) cli
-	# @kill $(shell pgrep -f rerun) >/dev/null 2>&1 || true
+	@$(MAKE) registry-all
+	@$(MAKE) cli
+	@kill $(shell pgrep -f rerun) >/dev/null 2>&1 || true
 	@./bin/cortex -c=./dev/config/cluster.yaml cluster up
-	# @$(MAKE) kubectl
+	@$(MAKE) kubectl
 
 cluster-down:
 	@$(MAKE) manager-local
diff --git a/docs/cluster/config.md b/docs/cluster/config.md
index e59b3c19c5..4fdb411390 100644
--- a/docs/cluster/config.md
+++ b/docs/cluster/config.md
@@ -18,7 +18,7 @@ cortex_aws_secret_access_key: ***
 # Instance type Cortex will use
 instance_type: m5.large
 
-# Minimum and maximum number of instances in the cluster to run your API
+# Minimum and maximum number of instances in the cluster
 min_instances: 2
 max_instances: 5
 
diff --git a/examples/pytorch/iris-classifier/cortex.yaml b/examples/pytorch/iris-classifier/cortex.yaml
index d3d011e9ec..937c38d7a9 100644
--- a/examples/pytorch/iris-classifier/cortex.yaml
+++ b/examples/pytorch/iris-classifier/cortex.yaml
@@ -9,6 +9,3 @@
   model: s3://cortex-examples/pytorch/iris-classifier/weights.pth
   tracker:
     model_type: classification
-  compute:
-    cpu: 1.2
-    mem: 6.5Gi
diff --git a/images/manager/Dockerfile b/images/manager/Dockerfile
index 07589de56b..54b0fc0893 100644
--- a/images/manager/Dockerfile
+++ b/images/manager/Dockerfile
@@ -8,6 +8,8 @@ RUN pip install --upgrade pip && \
     pip install awscli --upgrade --user && \
     rm -rf /root/.cache/pip*
 
+RUN pip install -r /root/requirements.txt
+
 RUN apk add --no-cache bash curl gettext jq openssl
 
 RUN curl --location "https://github.com/weaveworks/eksctl/releases/download/0.8.0/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp && \
@@ -36,6 +38,4 @@ RUN ISTIO_VERSION=1.3.4 && \
 
 COPY manager /root
 
-RUN pip install -r /root/requirements.txt
-
 ENTRYPOINT ["/bin/bash"]
diff --git a/manager/manifests/statsd.yaml b/manager/manifests/statsd.yaml
index 49bdbcc9c0..7d8331594f 100644
--- a/manager/manifests/statsd.yaml
+++ b/manager/manifests/statsd.yaml
@@ -94,7 +94,7 @@ spec:
         - name: cwagentconfig
           mountPath: /etc/cwagentconfig
       nodeSelector:
-        lifecycle: "Ec2Spot"
+        workload: "true"
       volumes:
         - name: cwagentconfig
           configMap:

From e95239298a8509cae0c5e0fb3d0a0432fc04de08 Mon Sep 17 00:00:00 2001
From: vishal
Date: Wed, 13 Nov 2019 09:06:28 -0500
Subject: [PATCH 08/24] Remove debug statements

---
 cli/cmd/lib_manager.go                 |  9 +++---
 dev-cluster.yaml                       | 44 --------------------------
 images/manager/Dockerfile              |  2 ++
 manager/install.sh                     |  5 +--
 manager/manifests/nvidia.yaml          |  7 ++--
 pkg/lib/k8s/pod.go                     |  6 ++++
 pkg/lib/k8s/quantity.go                | 18 +++++++++--
 pkg/operator/workloads/api_workload.go |  3 +-
 pkg/operator/workloads/workflow.go     |  8 ++---
 9 files changed, 39 insertions(+), 63 deletions(-)
 delete mode 100644 dev-cluster.yaml

diff --git a/cli/cmd/lib_manager.go b/cli/cmd/lib_manager.go
index e7d3150de0..b1e902281a 100644
--- a/cli/cmd/lib_manager.go
+++ b/cli/cmd/lib_manager.go
@@ -124,16 +124,15 @@ func runManagerCommand(entrypoint string, clusterConfig *clusterconfig.ClusterCo
 	if err != nil {
 		return "", errors.WithStack(err)
 	}
+
 	if err := files.WriteFile(clusterConfigBytes, cachedClusterConfigPath); err != nil {
 		return "", err
 	}
 
 	containerConfig := &container.Config{
-		Image:      clusterConfig.ImageManager,
-		Entrypoint: []string{"/bin/bash", "-c"},
-		// Cmd: []string{"sleep 0.1 && python /root/instance_metadata.py /.cortex/cluster.yaml && eval $(python /root/cluster_config_env.py /.cortex/cluster.yaml) && echo $CORTEX_INSTANCE_CPU"},
-		// Cmd: []string{"sleep 0.1 eval $(python /root/instance_metadata.py /.cortex/cluster.yaml) && eval $(python /root/cluster_config_env.py /.cortex/cluster.yaml) && " + entrypoint},
-		Cmd: []string{"sleep 0.1 && eval $(python /root/cluster_config_env.py /.cortex/cluster.yaml /root/eks.yaml) && " + entrypoint},
+		Image:        clusterConfig.ImageManager,
+		Entrypoint:   []string{"/bin/bash", "-c"},
+		Cmd:          []string{"sleep 0.1 && eval $(python /root/instance_metadata.py /.cortex/cluster.yaml) && eval $(python /root/cluster_config_env.py /.cortex/cluster.yaml) && " + entrypoint},
 		Tty:          true,
 		AttachStdout: true,
 		AttachStderr: true,
diff --git a/dev-cluster.yaml b/dev-cluster.yaml
deleted file mode 100644
index b54d482010..0000000000
--- a/dev-cluster.yaml
+++ /dev/null
@@ -1,44 +0,0 @@
-apiVersion: eksctl.io/v1alpha5
-kind: ClusterConfig
-
-metadata:
-  name: cortex
-  region: us-west-2
-  version: "1.14"
-
-nodeGroups:
-  - name: ng-cortex-worker1
-    instanceType: p2.xlarge
-    minSize: 0
-    maxSize: 2
-    desiredCapacity: 1
-    ami: auto
-    iam:
-      withAddonPolicies:
-        autoScaler: true
-    tags:
-      k8s.io/cluster-autoscaler/enabled: 'true'
-      k8s.io/cluster-autoscaler/node-template/label/nvidia.com/gpu: 'true'
-      k8s.io/cluster-autoscaler/node-template/taint/dedicated: nvidia.com/gpu=true
-      k8s.io/cluster-autoscaler/node-template/label/workload: 'true'
-      k8s.io/cluster-autoscaler/node-template/label/lifecycle: 'Ec2Spot'
-    labels:
-      lifecycle: Ec2Spot
-      workload: "true"
-      nvidia.com/gpu: 'true'
-    taints:
-      nvidia.com/gpu: "true:NoSchedule"
-      workload: "true:NoSchedule"
-    kubeletExtraConfig:
-      kubeReserved:
-        cpu: 150m
-        memory: 300Mi
-        ephemeral-storage: 1Gi
-      kubeReservedCgroup: /kube-reserved
-      systemReserved:
-        cpu: 150m
-        memory: 300Mi
-        ephemeral-storage: 1Gi
-      evictionHard:
-        memory.available: 200Mi
-        nodefs.available: 5%
diff --git a/images/manager/Dockerfile b/images/manager/Dockerfile
index 54b0fc0893..e1d329c383 100644
--- a/images/manager/Dockerfile
+++ b/images/manager/Dockerfile
@@ -8,6 +8,8 @@ RUN pip install --upgrade pip && \
     pip install awscli --upgrade --user && \
     rm -rf /root/.cache/pip*
 
+COPY manager/requirements.txt /root/requirements.txt
+
 RUN pip install -r /root/requirements.txt
 
 RUN apk add --no-cache bash curl gettext jq openssl
diff --git a/manager/install.sh b/manager/install.sh
index d59d601519..4527341c43 100755
--- a/manager/install.sh
+++ b/manager/install.sh
@@ -40,10 +40,8 @@ function ensure_eks() {
 
     echo -e "○ Spinning up the cluster ... (this will take about 15 minutes)\n"
     if [ $CORTEX_INSTANCE_GPU -ne 0 ]; then
-        echo "GPU"
         envsubst < eks_gpu.yaml | eksctl create cluster -f -
     else
-        echo "CPU"
         envsubst < eks.yaml | eksctl create cluster -f -
     fi
     echo -e "\n✓ Spun up the cluster"
@@ -73,7 +71,10 @@ function ensure_eks() {
 
     # Check if instance type changed
     ng_info=$(eksctl get nodegroup --cluster=$CORTEX_CLUSTER_NAME --region=$CORTEX_REGION --name ng-cortex-worker -o json)
+    echo $ng_info
     ng_instance_type=$(echo "$ng_info" | jq -r ".[] | select( .Cluster == \"$CORTEX_CLUSTER_NAME\" ) | select( .Name == \"ng-cortex-worker\" ) | .InstanceType")
+    echo $ng_instance_type
+    echo $CORTEX_INSTANCE_TYPE
     if [ "$ng_instance_type" != "$CORTEX_INSTANCE_TYPE" ]; then
         echo -e "\nerror: Cortex does not currently support changing the instance type of a running cluster; please run \`cortex cluster down\` followed by \`cortex cluster up\` to create a new cluster"
         exit 1
diff --git a/manager/manifests/nvidia.yaml b/manager/manifests/nvidia.yaml
index 0794cee088..854ba9475a 100644
--- a/manager/manifests/nvidia.yaml
+++ b/manager/manifests/nvidia.yaml
@@ -62,11 +62,10 @@ spec:
           mountPath: /var/lib/kubelet/device-plugins
         resources: # https://github.com/kubernetes/kubernetes/blob/master/cluster/addons/device-plugins/nvidia-gpu/daemonset.yaml#L44
           requests:
-            cpu: 50m
-            memory: 10Mi
+            cpu: 100m
+            memory: 100Mi
           limits:
-            cpu: 50m
-            memory: 10Mi
+            memory: 100Mi
       volumes:
         - name: device-plugin
           hostPath:
diff --git a/pkg/lib/k8s/pod.go b/pkg/lib/k8s/pod.go
index c9af5c66a6..7a60398eed 100644
--- a/pkg/lib/k8s/pod.go
+++ b/pkg/lib/k8s/pod.go
@@ -342,5 +342,11 @@ func Tolerations() []kcore.Toleration {
 			Value:    "true",
 			Effect:   kcore.TaintEffectNoSchedule,
 		},
+		{
+			Key:      "nvidia.com/gpu",
+			Operator: kcore.TolerationOpEqual,
+			Value:    "true",
+			Effect:   kcore.TaintEffectNoSchedule,
+		},
 	}
 }
diff --git a/pkg/lib/k8s/quantity.go b/pkg/lib/k8s/quantity.go
index 778b3e26c9..dbbfafed91 100644
--- a/pkg/lib/k8s/quantity.go
+++ b/pkg/lib/k8s/quantity.go
@@ -38,7 +38,7 @@ type QuantityValidation struct {
 	LessThanOrEqualTo *kresource.Quantity
 }
 
-func QuantityParser(v *QuantityValidation) func(string) (interface{}, error) {
+func K8sQuantityParser(v *QuantityValidation) func(string) (interface{}, error) {
 	return func(str string) (interface{}, error) {
 		k8sQuantity, err := kresource.ParseQuantity(str)
 		if err != nil {
@@ -65,9 +65,19 @@ func K8sQuantityParser(v *QuantityValidation) func(string) (interface{}, error)
 			return nil, configreader.ErrorMustBeLessThanOrEqualTo(str, *v.LessThanOrEqualTo)
 		}
 	}
+		return k8sQuantity, nil
+	}
+}
+
+func QuantityParser(v *QuantityValidation) func(string) (interface{}, error) {
+	return func(str string) (interface{}, error) {
+		k8sQuantity, err := K8sQuantityParser(v)(str)
+		if err != nil {
+			return Quantity{}, err
+		}
 
 		return Quantity{
-			Quantity:   k8sQuantity,
+			Quantity:   k8sQuantity.(kresource.Quantity),
 			UserString: str,
 		}, nil
 	}
@@ -132,6 +142,10 @@ type quantityMarshalable struct {
 	UserString string
 }
 
+func (quantity Quantity) MarshalYAML() (interface{}, error) {
+	return quantity.String(), nil
+}
+
 func (quantity Quantity) MarshalJSON() ([]byte, error) {
 	marshalable := quantityMarshalable{
 		Quantity:   quantity.Quantity,
diff --git a/pkg/operator/workloads/api_workload.go b/pkg/operator/workloads/api_workload.go
index ba6e2ccc39..f502979f1e 100644
--- a/pkg/operator/workloads/api_workload.go
+++ b/pkg/operator/workloads/api_workload.go
@@ -250,7 +250,6 @@ func tfAPISpec(
 	apiResourceList := kcore.ResourceList{}
 	tfServingResourceList := kcore.ResourceList{}
 	tfServingLimitsList := kcore.ResourceList{}
-	tolerations := k8s.Tolerations()
 
 	q1, q2 := api.Compute.CPU.SplitInTwo()
 	apiResourceList[kcore.ResourceCPU] = *q1
@@ -429,7 +428,7 @@ func tfAPISpec(
 				NodeSelector: map[string]string{
 					"workload": "true",
 				},
-				Tolerations:        tolerations,
+				Tolerations:        k8s.Tolerations(),
 				Volumes:            defaultVolumes(),
 				ServiceAccountName: "default",
 			},
diff --git a/pkg/operator/workloads/workflow.go b/pkg/operator/workloads/workflow.go
index 5b994a3e98..cc32b5b4e3 100644
--- a/pkg/operator/workloads/workflow.go
+++ b/pkg/operator/workloads/workflow.go
@@ -32,8 +32,8 @@ import (
 	"github.com/cortexlabs/cortex/pkg/operator/config"
 )
 
-var cortexCPUReserve = kresource.MustParse("800m")   // FluentD (200), Nvidia (50), StatsD (100), Kube Proxy, (100) Node capacity - Node availability 300 CPU
-var cortexMemReserve = kresource.MustParse("1500Mi") // FluentD (200), Nvidia (50), StatsD (100), KubeReserved (800), AWS node memory - Node capacity (200)
+var cortexCPUReserve = kresource.MustParse("800m")   // FluentD (200), Nvidia (100), StatsD (100), Kube Proxy, (100) Node capacity - Node availability 300 CPU
+var cortexMemReserve = kresource.MustParse("1500Mi") // FluentD (200), Nvidia (100), StatsD (100), KubeReserved (800), AWS node memory - Node capacity (200)
 
 func Init() error {
 	err := reloadCurrentContexts()
@@ -312,9 +312,9 @@ func ValidateDeploy(ctx *context.Context) error {
 	}
 
 	maxCPU := config.Cluster.InstanceCPU.Copy()
-	//maxCPU.Sub(cortexCPUReserve)
+	maxCPU.Sub(cortexCPUReserve)
 	maxMem := config.Cluster.InstanceMem.Copy()
-	//maxMem.Sub(cortexMemReserve)
+	maxMem.Sub(cortexMemReserve)
 	maxGPU := config.Cluster.InstanceGPU
 	for _, node := range nodes {
 		curCPU := node.Status.Capacity.Cpu()

From 351e68bba0a4660d21165467fd7df0d5fbc53b72 Mon Sep 17 00:00:00 2001
From: vishal
Date: Wed, 13 Nov 2019 09:18:41 -0500
Subject: [PATCH 09/24] Remove more debugging helpers

---
 cli/cmd/cluster.go                     |  2 +-
 dev/registry.sh                        |  1 -
 go.mod                                 |  3 +++
 manager/eks.yaml                       |  2 --
 manager/eks_gpu.yaml                   |  2 --
 pkg/lib/clusterconfig/clusterconfig.go |  3 +--
 pkg/lib/errors/errors.go               |  4 ++--
 pkg/lib/k8s/quantity.go                | 14 ++------------
 pkg/operator/endpoints/info.go         |  2 +-
 9 files changed, 10 insertions(+), 23 deletions(-)

diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go
index 66852920bb..246dc1d498 100644
--- a/cli/cmd/cluster.go
+++ b/cli/cmd/cluster.go
@@ -75,7 +75,7 @@ var upCmd = &cobra.Command{
 			errors.Exit(err)
 		}
 
-		//promptForEmail()
+		promptForEmail()
 
 		clusterConfig, awsCreds, err := getInstallClusterConfig()
 		if err != nil {
diff --git a/dev/registry.sh b/dev/registry.sh
index 3fea7b319e..18446007c2 100755
--- a/dev/registry.sh
+++ b/dev/registry.sh
@@ -147,7 +147,6 @@ elif [ "$cmd" = "update" ]; then
     build_and_push $ROOT/images/onnx-serve onnx-serve latest
     build_and_push $ROOT/images/onnx-serve-gpu onnx-serve-gpu latest
     build_and_push $ROOT/images/downloader downloader latest
-    build_and_push $ROOT/images/manager manager latest
 
     cleanup
 fi
diff --git a/go.mod b/go.mod
index 5d2b434a55..f56c14206f 100644
--- a/go.mod
+++ b/go.mod
@@ -30,9 +30,12 @@ require (
 	github.com/tcnksm/go-input v0.0.0-20180404061846-548a7d7a8ee8
 	github.com/ugorji/go/codec v1.1.7
 	github.com/xlab/treeprint v0.0.0-20181112141820-a009c3971eca
+	golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3 // indirect
+	golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135 // indirect
 	google.golang.org/grpc v1.25.1 // indirect
 	gopkg.in/karalabe/cookiejar.v2 v2.0.0-20150724131613-8dcd6a7f4951
 	gotest.tools v2.2.0+incompatible // indirect
+	honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc // indirect
 	k8s.io/api v0.0.0-20190620084959-7cf5895f2711
 	k8s.io/apimachinery v0.0.0-20190612205821-1799e75a0719
 	k8s.io/client-go v0.0.0-20190620085101-78d2af792bab
diff --git a/manager/eks.yaml b/manager/eks.yaml
index 116c1bea52..c572a8dbcf 100644
--- a/manager/eks.yaml
+++ b/manager/eks.yaml
@@ -55,9 +55,7 @@ nodeGroups:
         autoScaler: true
     tags:
       k8s.io/cluster-autoscaler/enabled: 'true'
-      k8s.io/cluster-autoscaler/node-template/label/nvidia.com/gpu: 'true'
       k8s.io/cluster-autoscaler/node-template/label/workload: 'true'
-      k8s.io/cluster-autoscaler/node-template/label/lifecycle: 'Ec2Spot'
     labels:
       lifecycle: Ec2Spot
       workload: "true"
diff --git a/manager/eks_gpu.yaml b/manager/eks_gpu.yaml
index aab794852b..4ee7a90d41 100644
--- a/manager/eks_gpu.yaml
+++ b/manager/eks_gpu.yaml
@@ -57,8 +57,6 @@ nodeGroups:
       k8s.io/cluster-autoscaler/enabled: 'true'
       k8s.io/cluster-autoscaler/node-template/label/nvidia.com/gpu: 'true'
       k8s.io/cluster-autoscaler/node-template/taint/dedicated: nvidia.com/gpu=true
-      k8s.io/cluster-autoscaler/node-template/label/workload: 'true'
-      k8s.io/cluster-autoscaler/node-template/label/lifecycle: 'Ec2Spot'
     labels:
       lifecycle: Ec2Spot
       workload: "true"
diff --git a/pkg/lib/clusterconfig/clusterconfig.go b/pkg/lib/clusterconfig/clusterconfig.go
index e835fdb75d..c2ab47af6d 100644
--- a/pkg/lib/clusterconfig/clusterconfig.go
+++ b/pkg/lib/clusterconfig/clusterconfig.go
@@ -195,8 +195,7 @@ var Validation = &cr.StructValidation{
 	{
 		StructField: "ImageManager",
 		StringValidation: &cr.StringValidation{
-			Default: "969758392368.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/manager:latest",
-		},
+			Default: "cortexlabs/manager:" + consts.CortexVersion},
 	},
 	{
 		StructField: "ImageDownloader",
diff --git a/pkg/lib/errors/errors.go b/pkg/lib/errors/errors.go
index 4f798397f2..60a7c845e9 100644
--- a/pkg/lib/errors/errors.go
+++ b/pkg/lib/errors/errors.go
@@ -144,14 +144,14 @@ func Panic(items ...interface{}) {
 		os.Exit(1)
 	}
 	err := MergeErrItems(items...)
-	PrintStacktrace(err)
+	// PrintStacktrace(err)
 	panic(err)
 }
 
 func PrintError(err error, strs ...string) {
 	wrappedErr := Wrap(err, strs...)
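 	// Wrap attaches the caller-supplied context strings to err before it is printed below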
fmt.Println("error:", wrappedErr.Error()) - PrintStacktrace(wrappedErr) + // PrintStacktrace(wrappedErr) } func PrintStacktrace(err error) { diff --git a/pkg/lib/k8s/quantity.go b/pkg/lib/k8s/quantity.go index dbbfafed91..1d34146b2a 100644 --- a/pkg/lib/k8s/quantity.go +++ b/pkg/lib/k8s/quantity.go @@ -38,7 +38,7 @@ type QuantityValidation struct { LessThanOrEqualTo *kresource.Quantity } -func K8sQuantityParser(v *QuantityValidation) func(string) (interface{}, error) { +func QuantityParser(v *QuantityValidation) func(string) (interface{}, error) { return func(str string) (interface{}, error) { k8sQuantity, err := kresource.ParseQuantity(str) if err != nil { @@ -65,19 +65,9 @@ func K8sQuantityParser(v *QuantityValidation) func(string) (interface{}, error) return nil, configreader.ErrorMustBeLessThanOrEqualTo(str, *v.LessThanOrEqualTo) } } - return k8sQuantity, nil - } -} - -func QuantityParser(v *QuantityValidation) func(string) (interface{}, error) { - return func(str string) (interface{}, error) { - k8sQuantity, err := K8sQuantityParser(v)(str) - if err != nil { - return Quantity{}, err - } return Quantity{ - Quantity: k8sQuantity.(kresource.Quantity), + Quantity: k8sQuantity, UserString: str, }, nil } diff --git a/pkg/operator/endpoints/info.go b/pkg/operator/endpoints/info.go index 7007eedd05..6628c36490 100644 --- a/pkg/operator/endpoints/info.go +++ b/pkg/operator/endpoints/info.go @@ -26,7 +26,7 @@ import ( ) func Info(w http.ResponseWriter, r *http.Request) { - asgs, err := config.AWS.AutoscalingGroups(map[string]string{"alpha.eksctl.io/nodegroup-name": "ng-1"}) + asgs, err := config.AWS.AutoscalingGroups(map[string]string{"alpha.eksctl.io/nodegroup-name": "ng-cortex-worker"}) if err != nil { RespondError(w, errors.WithStack(err)) return From 58e49334ec90c7df3b9c02f6fd34817c71d072d8 Mon Sep 17 00:00:00 2001 From: vishal Date: Wed, 13 Nov 2019 09:20:03 -0500 Subject: [PATCH 10/24] Reset go.mod --- go.mod | 3 --- 1 file changed, 3 deletions(-) diff --git a/go.mod b/go.mod index f56c14206f..5d2b434a55 100644 --- a/go.mod +++ b/go.mod @@ -30,12 +30,9 @@ require ( github.com/tcnksm/go-input v0.0.0-20180404061846-548a7d7a8ee8 github.com/ugorji/go/codec v1.1.7 github.com/xlab/treeprint v0.0.0-20181112141820-a009c3971eca - golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3 // indirect - golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135 // indirect google.golang.org/grpc v1.25.1 // indirect gopkg.in/karalabe/cookiejar.v2 v2.0.0-20150724131613-8dcd6a7f4951 gotest.tools v2.2.0+incompatible // indirect - honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc // indirect k8s.io/api v0.0.0-20190620084959-7cf5895f2711 k8s.io/apimachinery v0.0.0-20190612205821-1799e75a0719 k8s.io/client-go v0.0.0-20190620085101-78d2af792bab From c56ca3ea66234365a06607fc06303ff5b0015526 Mon Sep 17 00:00:00 2001 From: vishal Date: Wed, 13 Nov 2019 09:21:00 -0500 Subject: [PATCH 11/24] Remove more echo statements --- manager/install.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/manager/install.sh b/manager/install.sh index 4527341c43..db84ae3af8 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -71,10 +71,7 @@ function ensure_eks() { # Check if instance type changed ng_info=$(eksctl get nodegroup --cluster=$CORTEX_CLUSTER_NAME --region=$CORTEX_REGION --name ng-cortex-worker -o json) - echo $ng_info ng_instance_type=$(echo "$ng_info" | jq -r ".[] | select( .Cluster == \"$CORTEX_CLUSTER_NAME\" ) | select( .Name == \"ng-cortex-worker\" ) | .InstanceType") - echo $ng_instance_type - echo 
$CORTEX_INSTANCE_TYPE if [ "$ng_instance_type" != "$CORTEX_INSTANCE_TYPE" ]; then echo -e "\nerror: Cortex does not currently support changing the instance type of a running cluster; please run \`cortex cluster down\` followed by \`cortex cluster up\` to create a new cluster" exit 1 From cdf862e8df575cefcd44d1ec77ef4614c4505bdd Mon Sep 17 00:00:00 2001 From: vishal Date: Wed, 13 Nov 2019 09:27:13 -0500 Subject: [PATCH 12/24] Remove unnecessary boto3 dependency --- manager/instance_metadata.py | 1 - manager/requirements.txt | 1 - 2 files changed, 2 deletions(-) diff --git a/manager/instance_metadata.py b/manager/instance_metadata.py index fc5eb51033..11235ad87c 100644 --- a/manager/instance_metadata.py +++ b/manager/instance_metadata.py @@ -1,4 +1,3 @@ -import boto3 import requests import sys import re diff --git a/manager/requirements.txt b/manager/requirements.txt index f3667cfc01..566083cb6b 100644 --- a/manager/requirements.txt +++ b/manager/requirements.txt @@ -1,2 +1 @@ requests==2.22.0 -boto3==1.9.199 From 1f18d5259f5c756cd9244a3943a2120e363fd127 Mon Sep 17 00:00:00 2001 From: vishal Date: Wed, 13 Nov 2019 14:59:06 -0500 Subject: [PATCH 13/24] Address some PR comments and fix linting --- Makefile | 2 +- docs/cluster/config.md | 2 +- images/manager/Dockerfile | 7 +- manager/eks.yaml | 6 +- manager/eks_gpu.yaml | 7 +- manager/install.sh | 4 +- manager/instance_metadata.py | 28 +++++--- manager/manifests/cluster-autoscaler.yaml | 81 ++++++++++------------- manager/manifests/fluentd.yaml | 5 +- manager/manifests/istio-values.yaml | 4 +- manager/manifests/metrics-server.yaml | 7 +- manager/manifests/nvidia.yaml | 2 + manager/manifests/statsd.yaml | 1 - pkg/lib/clusterconfig/clusterconfig.go | 5 +- pkg/operator/workloads/workflow.go | 28 +------- pkg/workloads/cortex/lib/context.py | 1 - 16 files changed, 86 insertions(+), 104 deletions(-) diff --git a/Makefile b/Makefile index c7aceea538..149785a979 100644 --- a/Makefile +++ b/Makefile @@ -25,7 +25,7 @@ devstart: @./dev/operator_local.sh || true kubectl: - @eval $$(python ./manager/cluster_config_env.py ./dev/config/cluster.yaml) && eksctl utils write-kubeconfig --name="$$CORTEX_CLUSTER_NAME" | grep -v "saved kubeconfig as" | grep -v "using region" | grep -v "eksctl version" || true + @eval $$(python ./manager/cluster_config_env.py ./dev/config/cluster.yaml) && eksctl utils write-kubeconfig --cluster="$$CORTEX_CLUSTER_NAME" | grep -v "saved kubeconfig as" | grep -v "using region" | grep -v "eksctl version" || true @kubectl config set-context --current --namespace=cortex >/dev/null cluster-up: diff --git a/docs/cluster/config.md b/docs/cluster/config.md index 4fdb411390..0201bd6de2 100644 --- a/docs/cluster/config.md +++ b/docs/cluster/config.md @@ -19,7 +19,7 @@ cortex_aws_secret_access_key: *** instance_type: m5.large # Minimum and maximum number of instances in the cluster -min_instances: 2 +min_instances: 1 max_instances: 5 # Name of the S3 bucket Cortex will use diff --git a/images/manager/Dockerfile b/images/manager/Dockerfile index e1d329c383..2d442c841d 100644 --- a/images/manager/Dockerfile +++ b/images/manager/Dockerfile @@ -4,14 +4,13 @@ WORKDIR /root ENV PATH /root/.local/bin:$PATH +COPY manager/requirements.txt /root/requirements.txt + RUN pip install --upgrade pip && \ pip install awscli --upgrade --user && \ + pip install -r /root/requirements.txt && \ rm -rf /root/.cache/pip* -COPY manager/requirements.txt /root/requirements.txt - -RUN pip install -r /root/requirements.txt - RUN apk add --no-cache bash curl gettext jq 
openssl RUN curl --location "https://github.com/weaveworks/eksctl/releases/download/0.8.0/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp && \ diff --git a/manager/eks.yaml b/manager/eks.yaml index c572a8dbcf..381227b45c 100644 --- a/manager/eks.yaml +++ b/manager/eks.yaml @@ -23,13 +23,15 @@ metadata: nodeGroups: - name: ng-cortex-operator instanceType: t3.medium - minSize: 2 + minSize: 1 maxSize: 2 - desiredCapacity: 2 + desiredCapacity: 1 ami: auto iam: withAddonPolicies: autoScaler: true + tags: + k8s.io/cluster-autoscaler/enabled: 'true' kubeletExtraConfig: kubeReserved: cpu: 150m diff --git a/manager/eks_gpu.yaml b/manager/eks_gpu.yaml index 4ee7a90d41..8f60afa21d 100644 --- a/manager/eks_gpu.yaml +++ b/manager/eks_gpu.yaml @@ -23,13 +23,15 @@ metadata: nodeGroups: - name: ng-cortex-operator instanceType: t3.medium - minSize: 2 + minSize: 1 maxSize: 2 - desiredCapacity: 2 + desiredCapacity: 1 ami: auto iam: withAddonPolicies: autoScaler: true + tags: + k8s.io/cluster-autoscaler/enabled: 'true' kubeletExtraConfig: kubeReserved: cpu: 150m @@ -57,6 +59,7 @@ nodeGroups: k8s.io/cluster-autoscaler/enabled: 'true' k8s.io/cluster-autoscaler/node-template/label/nvidia.com/gpu: 'true' k8s.io/cluster-autoscaler/node-template/taint/dedicated: nvidia.com/gpu=true + k8s.io/cluster-autoscaler/node-template/label/workload: 'true' labels: lifecycle: Ec2Spot workload: "true" diff --git a/manager/install.sh b/manager/install.sh index db84ae3af8..a73c4906b0 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -33,9 +33,9 @@ function ensure_eks() { fi if [ $CORTEX_MIN_INSTANCES -lt 1 ]; then - CORTEX_DESIRED_INSTANCES=1 + export CORTEX_DESIRED_INSTANCES=1 else - CORTEX_DESIRED_INSTANCES=$CORTEX_MIN_INSTANCES + export CORTEX_DESIRED_INSTANCES=$CORTEX_MIN_INSTANCES fi echo -e "○ Spinning up the cluster ... (this will take about 15 minutes)\n" diff --git a/manager/instance_metadata.py b/manager/instance_metadata.py index 11235ad87c..b6163cccd8 100644 --- a/manager/instance_metadata.py +++ b/manager/instance_metadata.py @@ -1,3 +1,17 @@ +# Copyright 2019 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
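+#
+# This script looks up hardware metadata (CPU, memory, GPU) for the configured
+# instance type from the public AWS pricing endpoint and writes instance_cpu,
+# instance_mem, and instance_gpu back into the cluster config file.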
+
 import requests
 import sys
 import re
@@ -53,19 +67,15 @@ def download_metadata(cluster_config):
         if product["attributes"].get("gpu") is not None:
             metadata["gpu"] = product["attributes"]["gpu"]
         instance_mapping[instance_type] = metadata
     return instance_mapping
 
 
-def get_metadata(cluster_config):
-    return download_metadata(cluster_config)
-
-
 def set_ec2_metadata(cluster_config_path):
     with open(cluster_config_path, "r") as cluster_config_file:
         cluster_config = yaml.safe_load(cluster_config_file)
-    instance_mapping = get_metadata(cluster_config)
-    instance_type = instance_mapping[cluster_config["instance_type"]]
+    instance_mapping = download_metadata(cluster_config)
+    instance_metadata = instance_mapping[cluster_config["instance_type"]]
 
-    cluster_config["instance_mem"] = str(instance_type["mem"]) + "Mi"
-    cluster_config["instance_cpu"] = str(instance_type["cpu"])
-    cluster_config["instance_gpu"] = int(instance_type.get("gpu", 0))
+    cluster_config["instance_mem"] = str(instance_metadata["mem"]) + "Mi"
+    cluster_config["instance_cpu"] = str(instance_metadata["cpu"])
+    cluster_config["instance_gpu"] = int(instance_metadata.get("gpu", 0))
 
     with open(cluster_config_path, "w") as cluster_config_file:
         yaml.dump(cluster_config, cluster_config_file, default_flow_style=False)
diff --git a/manager/manifests/cluster-autoscaler.yaml b/manager/manifests/cluster-autoscaler.yaml
index 85da7881da..ae603458da 100644
--- a/manager/manifests/cluster-autoscaler.yaml
+++ b/manager/manifests/cluster-autoscaler.yaml
@@ -32,48 +32,40 @@ metadata:
     k8s-addon: cluster-autoscaler.addons.k8s.io
     k8s-app: cluster-autoscaler
 rules:
-  - apiGroups: ["storage.k8s.io"]
-    resources: ["csinodes"]
-    verbs: ["watch", "list", "get"]
-  - apiGroups: [""]
-    resources: ["events", "endpoints"]
-    verbs: ["create", "patch"]
-  - apiGroups: [""]
-    resources: ["pods/eviction"]
-    verbs: ["create"]
-  - apiGroups: [""]
-    resources: ["pods/status"]
-    verbs: ["update"]
-  - apiGroups: [""]
-    resources: ["endpoints"]
-    resourceNames: ["cluster-autoscaler"]
-    verbs: ["get", "update"]
-  - apiGroups: [""]
-    resources: ["nodes"]
-    verbs: ["watch", "list", "get", "update"]
-  - apiGroups: [""]
-    resources:
-      - "pods"
-      - "services"
-      - "replicationcontrollers"
-      - "persistentvolumeclaims"
-      - "persistentvolumes"
-    verbs: ["watch", "list", "get"]
-  - apiGroups: ["extensions"]
-    resources: ["replicasets", "daemonsets"]
-    verbs: ["watch", "list", "get"]
-  - apiGroups: ["policy"]
-    resources: ["poddisruptionbudgets"]
-    verbs: ["watch", "list"]
-  - apiGroups: ["apps"]
-    resources: ["statefulsets", "replicasets", "daemonsets"]
-    verbs: ["watch", "list", "get"]
-  - apiGroups: ["storage.k8s.io"]
-    resources: ["storageclasses"]
-    verbs: ["watch", "list", "get"]
-  - apiGroups: ["batch", "extensions"]
-    resources: ["jobs"]
-    verbs: ["get", "list", "watch", "patch"]
+- apiGroups: [""]
+  resources: ["events","endpoints"]
+  verbs: ["create", "patch"]
+- apiGroups: [""]
+  resources: ["pods/eviction"]
+  verbs: ["create"]
+- apiGroups: [""]
+  resources: ["pods/status"]
+  verbs: ["update"]
+- apiGroups: [""]
+  resources: ["endpoints"]
+  resourceNames: ["cluster-autoscaler"]
+  verbs: ["get","update"]
+- apiGroups: [""]
+  resources: ["nodes"]
+  verbs: ["watch","list","get","update"]
+- apiGroups: [""]
+  resources: ["pods","services","replicationcontrollers","persistentvolumeclaims","persistentvolumes"]
+  verbs: ["watch","list","get"]
+- apiGroups: ["extensions"]
+  resources: ["replicasets","daemonsets"]
+  verbs: ["watch","list","get"]
+- apiGroups: ["policy"]
+  resources: ["poddisruptionbudgets"]
+  verbs: ["watch","list"]
+- apiGroups: ["apps"]
+  resources: ["statefulsets", "replicasets", "daemonsets"]
+  verbs: ["watch","list","get"]
+- apiGroups: ["storage.k8s.io"]
+  resources: ["storageclasses"]
+  verbs: ["watch","list","get"]
+- apiGroups: ["batch", "extensions"]
+  resources: ["jobs"]
+  verbs: ["get", "list", "watch", "patch"]
 
 ---
 apiVersion: rbac.authorization.k8s.io/v1
@@ -151,12 +143,11 @@ spec:
       - image: $CORTEX_IMAGE_CLUSTER_AUTOSCALER
         name: cluster-autoscaler
         resources:
-          limits:
-            cpu: 100m
-            memory: 300Mi
           requests:
             cpu: 100m
             memory: 300Mi
+          limits:
+            memory: 300Mi
         command:
           - ./cluster-autoscaler
           - --v=4
diff --git a/manager/manifests/fluentd.yaml b/manager/manifests/fluentd.yaml
index d94311fe00..4233c535c4 100644
--- a/manager/manifests/fluentd.yaml
+++ b/manager/manifests/fluentd.yaml
@@ -150,12 +150,11 @@ spec:
               name: aws-credentials
               key: AWS_SECRET_ACCESS_KEY
         resources:
-          limits:
-            cpu: 200m
-            memory: 200Mi
           requests:
             cpu: 200m
             memory: 200Mi
+          limits:
+            memory: 200Mi
         volumeMounts:
         - name: varlog
           mountPath: /var/log
diff --git a/manager/manifests/istio-values.yaml b/manager/manifests/istio-values.yaml
index 31173b3bb9..15f099c32a 100644
--- a/manager/manifests/istio-values.yaml
+++ b/manager/manifests/istio-values.yaml
@@ -71,7 +71,7 @@ gateways:
     autoscaleMax: 5
     resources:
       requests:
-        cpu: 100m
+        cpu: 200m
         memory: 128Mi
       limits:
         cpu: 2000m
@@ -126,7 +126,7 @@ pilot:
   sidecar: false
   resources:
     requests:
-      cpu: 150m
+      cpu: 200m
       memory: 128Mi
     limits:
       cpu: 2000m
diff --git a/manager/manifests/metrics-server.yaml b/manager/manifests/metrics-server.yaml
index 322c85ccf1..7134e39083 100644
--- a/manager/manifests/metrics-server.yaml
+++ b/manager/manifests/metrics-server.yaml
@@ -111,7 +111,12 @@ spec:
         volumeMounts:
         - name: tmp-dir
           mountPath: /tmp
-
+        resources: # https://github.com/kubernetes/kubernetes/blob/master/cluster/addons/device-plugins/nvidia-gpu/daemonset.yaml#L44
+          requests:
+            cpu: 100m
+            memory: 100Mi
+          limits:
+            memory: 100Mi
 ---
 apiVersion: v1
diff --git a/manager/manifests/nvidia.yaml b/manager/manifests/nvidia.yaml
index 854ba9475a..5336b120f7 100644
--- a/manager/manifests/nvidia.yaml
+++ b/manager/manifests/nvidia.yaml
@@ -66,6 +66,8 @@ spec:
             memory: 100Mi
           limits:
             memory: 100Mi
+      nodeSelector:
+        workload: "true"
       volumes:
         - name: device-plugin
           hostPath:
diff --git a/manager/manifests/statsd.yaml b/manager/manifests/statsd.yaml
index 7d8331594f..b7f87cd266 100644
--- a/manager/manifests/statsd.yaml
+++ b/manager/manifests/statsd.yaml
@@ -66,7 +66,6 @@ spec:
             protocol: UDP
           resources:
             limits:
-              cpu: 200m
               memory: 100Mi
             requests:
               cpu: 100m
diff --git a/pkg/lib/clusterconfig/clusterconfig.go b/pkg/lib/clusterconfig/clusterconfig.go
index c2ab47af6d..073f33dc46 100644
--- a/pkg/lib/clusterconfig/clusterconfig.go
+++ b/pkg/lib/clusterconfig/clusterconfig.go
@@ -21,15 +21,14 @@ import (
 
 	"github.com/cortexlabs/cortex/pkg/consts"
 	"github.com/cortexlabs/cortex/pkg/lib/aws"
-	"github.com/cortexlabs/cortex/pkg/lib/table"
-	kresource "k8s.io/apimachinery/pkg/api/resource"
-
 	cr "github.com/cortexlabs/cortex/pkg/lib/configreader"
 	"github.com/cortexlabs/cortex/pkg/lib/errors"
 	"github.com/cortexlabs/cortex/pkg/lib/hash"
 	"github.com/cortexlabs/cortex/pkg/lib/k8s"
 	"github.com/cortexlabs/cortex/pkg/lib/pointer"
 	"github.com/cortexlabs/cortex/pkg/lib/prompt"
+	"github.com/cortexlabs/cortex/pkg/lib/table"
+	kresource "k8s.io/apimachinery/pkg/api/resource"
 )
diff --git a/pkg/operator/workloads/workflow.go b/pkg/operator/workloads/workflow.go
index cc32b5b4e3..e6425213ef 100644
--- a/pkg/operator/workloads/workflow.go
+++ b/pkg/operator/workloads/workflow.go
@@ -32,7 +32,7 @@ import (
 	"github.com/cortexlabs/cortex/pkg/operator/config"
 )
 
-var cortexCPUReserve = kresource.MustParse("800m")   // FluentD (200), Nvidia (100), StatsD (100), Kube Proxy, (100) Node capacity - Node availability 300 CPU
+var cortexCPUReserve = kresource.MustParse("800m")   // FluentD (200), Nvidia (100), StatsD (100), Kube Proxy (100), Node capacity - Node availability (300)
 var cortexMemReserve = kresource.MustParse("1500Mi") // FluentD (200), Nvidia (100), StatsD (100), KubeReserved (800), AWS node memory - Node capacity (200)
 
 func Init() error {
@@ -306,37 +306,11 @@ func ValidateDeploy(ctx *context.Context) error {
 		return err
 	}
 
-	nodes, err := config.Kubernetes.ListNodes(nil)
-	if err != nil {
-		return err
-	}
-
 	maxCPU := config.Cluster.InstanceCPU.Copy()
 	maxCPU.Sub(cortexCPUReserve)
 	maxMem := config.Cluster.InstanceMem.Copy()
 	maxMem.Sub(cortexMemReserve)
 	maxGPU := config.Cluster.InstanceGPU
-	for _, node := range nodes {
-		curCPU := node.Status.Capacity.Cpu()
-		curMem := node.Status.Capacity.Memory()
-
-		var curGPU int64
-		if GPUQuantity, ok := node.Status.Allocatable["nvidia.com/gpu"]; ok {
-			curGPU, _ = GPUQuantity.AsInt64()
-		}
-
-		if curCPU != nil && maxCPU.Cmp(*curCPU) < 0 {
-			maxCPU = curCPU
-		}
-
-		if curMem != nil && maxMem.Cmp(*curMem) < 0 {
-			maxMem = curMem
-		}
-
-		if curGPU > maxGPU {
-			maxGPU = curGPU
-		}
-	}
 
 	for _, api := range ctx.APIs {
 		if maxCPU.Cmp(api.Compute.CPU.Quantity) < 0 {
diff --git a/pkg/workloads/cortex/lib/context.py b/pkg/workloads/cortex/lib/context.py
index 5230a364ea..af75746dce 100644
--- a/pkg/workloads/cortex/lib/context.py
+++ b/pkg/workloads/cortex/lib/context.py
@@ -163,7 +163,6 @@ def get_predictor_impl(self, api_name, project_dir):
             impl = self.load_module(
                 "predictor", api["name"], os.path.join(project_dir, api["predictor"]["path"])
             )
-
         except CortexException as e:
             e.wrap("api " + api_name, "failed to load predictor", api["predictor"]["path"])
             raise

From f90f921262d312743913d1ab86f6f64b78d14ac4 Mon Sep 17 00:00:00 2001
From: David Eliahu
Date: Wed, 13 Nov 2019 12:29:43 -0800
Subject: [PATCH 14/24] Remove InternalClusterConfig

---
 cli/cmd/cluster.go                     |   2 +-
 cli/cmd/lib_cluster_config.go          |   2 +-
 pkg/lib/clusterconfig/clusterconfig.go | 123 +++++++++++++------------
 pkg/operator/api/context/context.go    |  24 ++---
 pkg/operator/api/schema/schema.go      |   2 +-
 pkg/operator/config/config.go          |   6 +-
 6 files changed, 81 insertions(+), 78 deletions(-)

diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go
index 246dc1d498..0498556d7a 100644
--- a/cli/cmd/cluster.go
+++ b/cli/cmd/cluster.go
@@ -145,7 +145,7 @@ var infoCmd = &cobra.Command{
 		if err != nil {
 			errors.Exit(err, "/info", string(httpResponse))
 		}
-		fmt.Println(infoResponse.ClusterConfig.String())
+		fmt.Println(infoResponse.ClusterConfig.UserFacingString())
 	},
 }
diff --git a/cli/cmd/lib_cluster_config.go b/cli/cmd/lib_cluster_config.go
index 3469fa02e8..d39bacf0b7 100644
--- a/cli/cmd/lib_cluster_config.go
+++ b/cli/cmd/lib_cluster_config.go
@@ -93,7 +93,7 @@ var awsCredentialsPromptValidation = &cr.PromptValidation{
 }
 
 func readClusterConfigFile(clusterConfig *clusterconfig.ClusterConfig, awsCreds *AWSCredentials, path string) error {
-	errs := cr.ParseYAMLFile(clusterConfig, clusterconfig.Validation, path)
+	errs := cr.ParseYAMLFile(clusterConfig, clusterconfig.UserValidation, path)
 	if errors.HasErrors(errs) {
 		return errors.FirstError(errs...)
} diff --git a/pkg/lib/clusterconfig/clusterconfig.go b/pkg/lib/clusterconfig/clusterconfig.go index 073f33dc46..d1d0355173 100644 --- a/pkg/lib/clusterconfig/clusterconfig.go +++ b/pkg/lib/clusterconfig/clusterconfig.go @@ -32,46 +32,44 @@ import ( ) type ClusterConfig struct { - InstanceType *string `json:"instance_type" yaml:"instance_type"` - InstanceCPU *k8s.Quantity `json:"instance_cpu" yaml:"instance_cpu"` - InstanceMem *k8s.Quantity `json:"instance_mem" yaml:"instance_mem"` - InstanceGPU int64 `json:"instance_gpu" yaml:"instance_gpu"` - MinInstances *int64 `json:"min_instances" yaml:"min_instances"` - MaxInstances *int64 `json:"max_instances" yaml:"max_instances"` - ClusterName string `json:"cluster_name" yaml:"cluster_name"` - Region string `json:"region" yaml:"region"` - Bucket string `json:"bucket" yaml:"bucket"` - LogGroup string `json:"log_group" yaml:"log_group"` - Telemetry bool `json:"telemetry" yaml:"telemetry"` - ImagePredictorServe string `json:"image_predictor_serve" yaml:"image_predictor_serve"` - ImagePredictorServeGPU string `json:"image_predictor_serve_gpu" yaml:"image_predictor_serve_gpu"` - ImageTFServe string `json:"image_tf_serve" yaml:"image_tf_serve"` - ImageTFServeGPU string `json:"image_tf_serve_gpu" yaml:"image_tf_serve_gpu"` - ImageTFAPI string `json:"image_tf_api" yaml:"image_tf_api"` - ImageONNXServe string `json:"image_onnx_serve" yaml:"image_onnx_serve"` - ImageONNXServeGPU string `json:"image_onnx_serve_gpu" yaml:"image_onnx_serve_gpu"` - ImageOperator string `json:"image_operator" yaml:"image_operator"` - ImageManager string `json:"image_manager" yaml:"image_manager"` - ImageDownloader string `json:"image_downloader" yaml:"image_downloader"` - ImageClusterAutoscaler string `json:"image_cluster_autoscaler" yaml:"image_cluster_autoscaler"` - ImageMetricsServer string `json:"image_metrics_server" yaml:"image_metrics_server"` - ImageNvidia string `json:"image_nvidia" yaml:"image_nvidia"` - ImageFluentd string `json:"image_fluentd" yaml:"image_fluentd"` - ImageStatsd string `json:"image_statsd" yaml:"image_statsd"` - ImageIstioProxy string `json:"image_istio_proxy" yaml:"image_istio_proxy"` - ImageIstioPilot string `json:"image_istio_pilot" yaml:"image_istio_pilot"` - ImageIstioCitadel string `json:"image_istio_citadel" yaml:"image_istio_citadel"` - ImageIstioGalley string `json:"image_istio_galley" yaml:"image_istio_galley"` -} + InstanceType *string `json:"instance_type" yaml:"instance_type"` + MinInstances *int64 `json:"min_instances" yaml:"min_instances"` + MaxInstances *int64 `json:"max_instances" yaml:"max_instances"` + ClusterName string `json:"cluster_name" yaml:"cluster_name"` + Region string `json:"region" yaml:"region"` + Bucket string `json:"bucket" yaml:"bucket"` + LogGroup string `json:"log_group" yaml:"log_group"` + Telemetry bool `json:"telemetry" yaml:"telemetry"` + ImagePredictorServe string `json:"image_predictor_serve" yaml:"image_predictor_serve"` + ImagePredictorServeGPU string `json:"image_predictor_serve_gpu" yaml:"image_predictor_serve_gpu"` + ImageTFServe string `json:"image_tf_serve" yaml:"image_tf_serve"` + ImageTFServeGPU string `json:"image_tf_serve_gpu" yaml:"image_tf_serve_gpu"` + ImageTFAPI string `json:"image_tf_api" yaml:"image_tf_api"` + ImageONNXServe string `json:"image_onnx_serve" yaml:"image_onnx_serve"` + ImageONNXServeGPU string `json:"image_onnx_serve_gpu" yaml:"image_onnx_serve_gpu"` + ImageOperator string `json:"image_operator" yaml:"image_operator"` + ImageManager string `json:"image_manager" 
yaml:"image_manager"` + ImageDownloader string `json:"image_downloader" yaml:"image_downloader"` + ImageClusterAutoscaler string `json:"image_cluster_autoscaler" yaml:"image_cluster_autoscaler"` + ImageMetricsServer string `json:"image_metrics_server" yaml:"image_metrics_server"` + ImageNvidia string `json:"image_nvidia" yaml:"image_nvidia"` + ImageFluentd string `json:"image_fluentd" yaml:"image_fluentd"` + ImageStatsd string `json:"image_statsd" yaml:"image_statsd"` + ImageIstioProxy string `json:"image_istio_proxy" yaml:"image_istio_proxy"` + ImageIstioPilot string `json:"image_istio_pilot" yaml:"image_istio_pilot"` + ImageIstioCitadel string `json:"image_istio_citadel" yaml:"image_istio_citadel"` + ImageIstioGalley string `json:"image_istio_galley" yaml:"image_istio_galley"` -type InternalClusterConfig struct { - ClusterConfig - ID string `json:"id"` - APIVersion string `json:"api_version"` - OperatorInCluster bool `json:"operator_in_cluster"` + // Internal + ID string `json:"id"` + APIVersion string `json:"api_version"` + OperatorInCluster bool `json:"operator_in_cluster"` + InstanceCPU *k8s.Quantity `json:"instance_cpu" yaml:"instance_cpu"` + InstanceMem *k8s.Quantity `json:"instance_mem" yaml:"instance_mem"` + InstanceGPU int64 `json:"instance_gpu" yaml:"instance_gpu"` } -var Validation = &cr.StructValidation{ +var UserValidation = &cr.StructValidation{ StructFieldValidations: []*cr.StructFieldValidation{ { StructField: "InstanceType", @@ -79,27 +77,6 @@ var Validation = &cr.StructValidation{ Validator: validateInstanceType, }, }, - { - StructField: "InstanceCPU", - StringPtrValidation: &cr.StringPtrValidation{}, - Parser: k8s.QuantityParser(&k8s.QuantityValidation{ - GreaterThan: k8s.QuantityPtr(kresource.MustParse("0")), - }), - }, - { - StructField: "InstanceMem", - StringPtrValidation: &cr.StringPtrValidation{}, - Parser: k8s.QuantityParser(&k8s.QuantityValidation{ - GreaterThan: k8s.QuantityPtr(kresource.MustParse("0")), - }), - }, - { - StructField: "InstanceGPU", - Int64Validation: &cr.Int64Validation{ - Default: 0, - GreaterThanOrEqualTo: pointer.Int64(0), - }, - }, { StructField: "MinInstances", Int64PtrValidation: &cr.Int64PtrValidation{ @@ -276,6 +253,32 @@ var Validation = &cr.StructValidation{ }, } +var InternalValidation = &cr.StructValidation{ + StructFieldValidations: append(UserValidation.StructFieldValidations, + &cr.StructFieldValidation{ + StructField: "InstanceCPU", + StringPtrValidation: &cr.StringPtrValidation{}, + Parser: k8s.QuantityParser(&k8s.QuantityValidation{ + GreaterThan: k8s.QuantityPtr(kresource.MustParse("0")), + }), + }, + &cr.StructFieldValidation{ + StructField: "InstanceMem", + StringPtrValidation: &cr.StringPtrValidation{}, + Parser: k8s.QuantityParser(&k8s.QuantityValidation{ + GreaterThan: k8s.QuantityPtr(kresource.MustParse("0")), + }), + }, + &cr.StructFieldValidation{ + StructField: "InstanceGPU", + Int64Validation: &cr.Int64Validation{ + Default: 0, + GreaterThanOrEqualTo: pointer.Int64(0), + }, + }, + ), +} + func PromptValidation(skipPopulatedFields bool, promptInstanceType bool, defaults *ClusterConfig) *cr.PromptValidation { if defaults == nil { defaults = &ClusterConfig{} @@ -352,7 +355,7 @@ func validateInstanceType(instanceType string) (string, error) { // This does not set defaults for fields that are prompted from the user func SetFileDefaults(clusterConfig *ClusterConfig) error { var emptyMap interface{} = map[interface{}]interface{}{} - errs := cr.Struct(clusterConfig, emptyMap, Validation) + errs := cr.Struct(clusterConfig, 
emptyMap, UserValidation) if errors.HasErrors(errs) { return errors.FirstError(errs...) } @@ -387,7 +390,7 @@ func (cc *ClusterConfig) SetBucket(awsAccessKeyID string, awsSecretAccessKey str return nil } -func (cc *InternalClusterConfig) String() string { +func (cc *ClusterConfig) UserFacingString() string { var items []table.KV items = append(items, table.KV{K: "cluster version", V: cc.APIVersion}) diff --git a/pkg/operator/api/context/context.go b/pkg/operator/api/context/context.go index 3dab2086d2..69c8c3c3f9 100644 --- a/pkg/operator/api/context/context.go +++ b/pkg/operator/api/context/context.go @@ -27,18 +27,18 @@ import ( ) type Context struct { - ID string `json:"id"` - Key string `json:"key"` - CreatedEpoch int64 `json:"created_epoch"` - ClusterConfig *clusterconfig.InternalClusterConfig `json:"cluster_config"` - DeploymentVersion string `json:"deployment_version"` - Root string `json:"root"` - MetadataRoot string `json:"metadata_root"` - StatusPrefix string `json:"status_prefix"` - App *App `json:"app"` - APIs APIs `json:"apis"` - ProjectID string `json:"project_id"` - ProjectKey string `json:"project_key"` + ID string `json:"id"` + Key string `json:"key"` + CreatedEpoch int64 `json:"created_epoch"` + ClusterConfig *clusterconfig.ClusterConfig `json:"cluster_config"` + DeploymentVersion string `json:"deployment_version"` + Root string `json:"root"` + MetadataRoot string `json:"metadata_root"` + StatusPrefix string `json:"status_prefix"` + App *App `json:"app"` + APIs APIs `json:"apis"` + ProjectID string `json:"project_id"` + ProjectKey string `json:"project_key"` } type Resource interface { diff --git a/pkg/operator/api/schema/schema.go b/pkg/operator/api/schema/schema.go index b6665513e9..9eebed78ba 100644 --- a/pkg/operator/api/schema/schema.go +++ b/pkg/operator/api/schema/schema.go @@ -25,7 +25,7 @@ import ( ) type InfoResponse struct { - ClusterConfig *clusterconfig.InternalClusterConfig `json:"cluster_config"` + ClusterConfig *clusterconfig.ClusterConfig `json:"cluster_config"` } type DeployResponse struct { diff --git a/pkg/operator/config/config.go b/pkg/operator/config/config.go index 29b66920f8..67b315289e 100644 --- a/pkg/operator/config/config.go +++ b/pkg/operator/config/config.go @@ -31,7 +31,7 @@ import ( ) var ( - Cluster *clusterconfig.InternalClusterConfig + Cluster *clusterconfig.ClusterConfig AWS *aws.Client Kubernetes *k8s.Client IstioKubernetes *k8s.Client @@ -41,7 +41,7 @@ var ( func Init() error { var err error - Cluster = &clusterconfig.InternalClusterConfig{ + Cluster = &clusterconfig.ClusterConfig{ APIVersion: consts.CortexVersion, OperatorInCluster: strings.ToLower(os.Getenv("CORTEX_OPERATOR_IN_CLUSTER")) != "false", } @@ -51,7 +51,7 @@ func Init() error { clusterConfigPath = consts.ClusterConfigPath } - errs := cr.ParseYAMLFile(Cluster, clusterconfig.Validation, clusterConfigPath) + errs := cr.ParseYAMLFile(Cluster, clusterconfig.InternalValidation, clusterConfigPath) if errors.HasErrors(errs) { return errors.FirstError(errs...) 
 	}

From 2703944ede07217d362ca13395fb9f4453cff15f Mon Sep 17 00:00:00 2001
From: vishal
Date: Wed, 13 Nov 2019 21:35:58 -0500
Subject: [PATCH 15/24] Address more PR comments

---
 pkg/lib/k8s/pod.go                        |  17 ----
 pkg/lib/k8s/quantity.go                   |  31 ++++--
 pkg/operator/workloads/api_workload.go    |  24 ++++-
 pkg/operator/workloads/memory_capacity.go | 112 ++++++++++++++++++++++
 pkg/operator/workloads/workflow.go        |  36 ++++++-
 5 files changed, 187 insertions(+), 33 deletions(-)
 create mode 100644 pkg/operator/workloads/memory_capacity.go

diff --git a/pkg/lib/k8s/pod.go b/pkg/lib/k8s/pod.go
index 7a60398eed..af92ab440d 100644
--- a/pkg/lib/k8s/pod.go
+++ b/pkg/lib/k8s/pod.go
@@ -333,20 +333,3 @@ func (c *Client) StalledPods() ([]kcore.Pod, error) {
 
 	return stalledPods, nil
 }
-
-func Tolerations() []kcore.Toleration {
-	return []kcore.Toleration{
-		{
-			Key:      "workload",
-			Operator: kcore.TolerationOpEqual,
-			Value:    "true",
-			Effect:   kcore.TaintEffectNoSchedule,
-		},
-		{
-			Key:      "nvidia.com/gpu",
-			Operator: kcore.TolerationOpEqual,
-			Value:    "true",
-			Effect:   kcore.TaintEffectNoSchedule,
-		},
-	}
-}
diff --git a/pkg/lib/k8s/quantity.go b/pkg/lib/k8s/quantity.go
index 1d34146b2a..58bd84e600 100644
--- a/pkg/lib/k8s/quantity.go
+++ b/pkg/lib/k8s/quantity.go
@@ -136,22 +136,37 @@ func (quantity Quantity) MarshalYAML() (interface{}, error) {
 	return quantity.String(), nil
 }
 
-func (quantity Quantity) MarshalJSON() ([]byte, error) {
-	marshalable := quantityMarshalable{
-		Quantity:   quantity.Quantity,
-		UserString: quantity.UserString,
+func (quantity *Quantity) UnmarshalYAML(unmarshal func(interface{}) error) error {
+	var userString string
+	err := unmarshal(&userString)
+	if err != nil {
+		return err
+	}
+	err = quantity.UnmarshalJSON([]byte(userString))
+	if err != nil {
+		return err
 	}
-	return json.Marshal(marshalable)
+	return nil
+}
+
+func (quantity Quantity) MarshalJSON() ([]byte, error) {
+	return json.Marshal(quantity.String())
 }
 
 func (quantity *Quantity) UnmarshalJSON(data []byte) error {
-	var unmarshaled quantityMarshalable
-	err := json.Unmarshal(data, &unmarshaled)
+	var userString string
+	if err := json.Unmarshal(data, &userString); err != nil {
+		// not a quoted JSON string; fall back to the raw bytes so plain YAML scalars (e.g. 100Mi) parse too
+		userString = string(data)
+	}
+
+	parsedQuantity, err := kresource.ParseQuantity(userString)
 	if err != nil {
 		return err
 	}
-	quantity.Quantity = unmarshaled.Quantity
-	quantity.UserString = unmarshaled.UserString
+
+	quantity.Quantity = parsedQuantity
+	quantity.UserString = userString
 	return nil
 }
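The patched MarshalJSON emits the quantity's plain user string instead of a struct, which is the same wire format the upstream k8s.io/apimachinery Quantity type uses. A standalone sketch (not part of the patch) of that round trip, written against the upstream type so it runs on its own:

package main

import (
	"encoding/json"
	"fmt"

	kresource "k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// Marshals to the plain string "500m" rather than an object.
	q := kresource.MustParse("500m")
	b, err := json.Marshal(q)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(b)) // "500m"

	// The string round-trips back into an equal quantity.
	var out kresource.Quantity
	if err := json.Unmarshal(b, &out); err != nil {
		panic(err)
	}
	fmt.Println(out.Cmp(q) == 0) // true
}

diff --git a/pkg/operator/workloads/api_workload.go b/pkg/operator/workloads/api_workload.go
index f502979f1e..7f4423bd95 100644
--- a/pkg/operator/workloads/api_workload.go
+++ b/pkg/operator/workloads/api_workload.go
@@ -428,7 +428,7 @@ func tfAPISpec(
 			NodeSelector: map[string]string{
 				"workload": "true",
 			},
-			Tolerations:        k8s.Tolerations(),
+			Tolerations:        tolerations(),
 			Volumes:            defaultVolumes(),
 			ServiceAccountName: "default",
 		},
@@ -583,7 +583,7 @@ func predictorAPISpec(
 			NodeSelector: map[string]string{
 				"workload": "true",
 			},
-			Tolerations:        k8s.Tolerations(),
+			Tolerations:        tolerations(),
 			Volumes:            defaultVolumes(),
 			ServiceAccountName: "default",
 		},
@@ -601,7 +601,6 @@ func onnxAPISpec(
 	servingImage := config.Cluster.ImageONNXServe
 	resourceList := kcore.ResourceList{}
 	resourceLimitsList := kcore.ResourceList{}
-	tolerations := k8s.Tolerations()
 
 	resourceList[kcore.ResourceCPU] = api.Compute.CPU.Quantity
 	if api.Compute.Mem != nil {
@@ -736,7 +736,7 @@ func onnxAPISpec(
 			NodeSelector: map[string]string{
 				"workload": "true",
 			},
-			Tolerations:        tolerations,
+			Tolerations:        tolerations(),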
 			Volumes:            defaultVolumes(),
 			ServiceAccountName: "default",
 		},
@@ -935,3 +934,20 @@ func APIPodCompute(containers []kcore.Container) (*k8s.Quantity, *k8s.Quantity,
 
 	return totalCPU, totalMem, totalGPU
 }
+
+func tolerations() []kcore.Toleration {
+	return []kcore.Toleration{
+		{
+			Key:      "workload",
+			Operator: kcore.TolerationOpEqual,
+			Value:    "true",
+			Effect:   kcore.TaintEffectNoSchedule,
+		},
+		{
+			Key:      "nvidia.com/gpu",
+			Operator: kcore.TolerationOpEqual,
+			Value:    "true",
+			Effect:   kcore.TaintEffectNoSchedule,
+		},
+	}
+}
diff --git a/pkg/operator/workloads/memory_capacity.go b/pkg/operator/workloads/memory_capacity.go
new file mode 100644
index 0000000000..3ebe9e6068
--- /dev/null
+++ b/pkg/operator/workloads/memory_capacity.go
@@ -0,0 +1,112 @@
+/*
+Copyright 2019 Cortex Labs, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package workloads
+
+import (
+	"github.com/cortexlabs/cortex/pkg/consts"
+	"github.com/cortexlabs/cortex/pkg/lib/k8s"
+	"github.com/cortexlabs/cortex/pkg/operator/config"
+	kresource "k8s.io/apimachinery/pkg/api/resource"
+	kmeta "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+const key = "capacity"
+const configMemoryMapName = "cortex-node-memory"
+
+func GetMemoryCapacityFromNodes() (*kresource.Quantity, error) {
+	opts := kmeta.ListOptions{
+		LabelSelector: k8s.LabelSelector(map[string]string{
+			"workload": "true",
+		}),
+	}
+	nodes, err := config.Kubernetes.ListNodes(&opts)
+	if err != nil {
+		return nil, err
+	}
+
+	var minMem *kresource.Quantity
+	for _, node := range nodes {
+		curMem := node.Status.Capacity.Memory()
+
+		if curMem != nil && minMem == nil {
+			minMem = curMem
+		}
+
+		if curMem != nil && minMem.Cmp(*curMem) > 0 {
+			minMem = curMem
+		}
+	}
+
+	return minMem, nil
+}
+
+func GetMemoryCapacityFromConfigMap() (*kresource.Quantity, error) {
+	configMap, err := config.Kubernetes.GetConfigMap(configMemoryMapName)
+	if err != nil {
+		return nil, err
+	}
+
+	if configMap == nil {
+		return nil, nil
+	}
+
+	memoryUserStr := configMap.Data[key]
+	mem, err := kresource.ParseQuantity(memoryUserStr)
+	if err != nil {
+		return nil, err
+	}
+	return &mem, nil
+}
+
+func UpdateMemoryCapacityConfigMap() (*kresource.Quantity, error) {
+	memFromConfig := config.Cluster.InstanceMem
+	memFromNodes, err := GetMemoryCapacityFromNodes()
+	if err != nil {
+		return nil, err
+	}
+
+	memFromConfigMap, err := GetMemoryCapacityFromConfigMap()
+	if err != nil {
+		return nil, err
+	}
+
+	minMem := memFromConfig.Copy()
+
+	if memFromNodes != nil && minMem.Cmp(*memFromNodes) > 0 {
+		minMem = memFromNodes
+	}
+
+	if memFromConfigMap != nil && minMem.Cmp(*memFromConfigMap) > 0 {
+		minMem = memFromConfigMap
+	}
+
+	if minMem != memFromConfigMap {
+		configMap := k8s.ConfigMap(&k8s.ConfigMapSpec{
+			Name:      configMemoryMapName,
+			Namespace: consts.K8sNamespace,
+			Data: map[string]string{
+				key: minMem.String(),
+			},
+		})
+
+		_, err := config.Kubernetes.ApplyConfigMap(configMap)
+		if err != nil {
+			return nil, err
+		}
+	}
+	return minMem, nil
+}
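The config map written above acts as a low-water mark: UpdateMemoryCapacityConfigMap starts from the configured instance memory and keeps the smallest capacity ever reported by a worker node or previously recorded in the config map. A standalone sketch of that selection rule (not part of the patch; minCapacity is a hypothetical helper and the quantities are made up):

package main

import (
	"fmt"

	kresource "k8s.io/apimachinery/pkg/api/resource"
)

// minCapacity mirrors the rule above: start from the configured instance
// memory and keep the smallest value observed from nodes or the config map.
func minCapacity(configured kresource.Quantity, observed ...*kresource.Quantity) kresource.Quantity {
	min := configured
	for _, q := range observed {
		if q != nil && min.Cmp(*q) > 0 {
			min = *q
		}
	}
	return min
}

func main() {
	configured := kresource.MustParse("7764Mi") // from instance metadata
	fromNodes := kresource.MustParse("7737Mi")  // kubelet-reported capacity
	fmt.Println(minCapacity(configured, &fromNodes).String()) // 7737Mi
}

diff --git a/pkg/operator/workloads/workflow.go b/pkg/operator/workloads/workflow.go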
index e6425213ef..bc08f9a367 100644
--- a/pkg/operator/workloads/workflow.go
+++ b/pkg/operator/workloads/workflow.go
@@ -32,14 +32,36 @@ import (
 	"github.com/cortexlabs/cortex/pkg/operator/config"
 )
 
-var cortexCPUReserve = kresource.MustParse("800m")   // FluentD (200), Nvidia (100), StatsD (100), Kube Proxy (100), Node capacity - Node availability (300)
-var cortexMemReserve = kresource.MustParse("1500Mi") // FluentD (200), Nvidia (100), StatsD (100), KubeReserved (800), AWS node memory - Node capacity (200)
+/*
+CPU Reservations:
+
+FluentD 200
+StatsD 100
+KubeProxy 100
+Reserved (150 + 150) see eks.yaml for details
+Buffer (100)
+*/
+var cortexCPUReserve = kresource.MustParse("800m") // FluentD (200), StatsD (100), KubeProxy (100), KubeReserved (150 + 150), buffer (100)
+
+/*
+Memory Reservations:
+
+FluentD 200
+StatsD 100
+Reserved (300 + 300 + 200) see eks.yaml for details
+Buffer (100)
+*/
+var cortexMemReserve = kresource.MustParse("1200Mi") // FluentD (200), StatsD (100), KubeReserved (300 + 300 + 200), buffer (100)
 
 func Init() error {
 	err := reloadCurrentContexts()
 	if err != nil {
 		return errors.Wrap(err, "init")
 	}
+
+	_, err = UpdateMemoryCapacityConfigMap()
+	if err != nil {
+		return errors.Wrap(err, "init")
+	}
 
 	go cronRunner()
 
@@ -308,9 +330,17 @@ func ValidateDeploy(ctx *context.Context) error {
 
 	maxCPU := config.Cluster.InstanceCPU.Copy()
 	maxCPU.Sub(cortexCPUReserve)
-	maxMem := config.Cluster.InstanceMem.Copy()
+	maxMem, err := UpdateMemoryCapacityConfigMap()
+	if err != nil {
+		return errors.Wrap(err, "validating memory constraint")
+	}
 	maxMem.Sub(cortexMemReserve)
 	maxGPU := config.Cluster.InstanceGPU
+	if maxGPU > 0 {
+		// Reserve resources for nvidia device plugin daemonset
+		maxCPU.Sub(kresource.MustParse("100m"))
+		maxMem.Sub(kresource.MustParse("100Mi"))
+	}
 
 	for _, api := range ctx.APIs {
 		if maxCPU.Cmp(api.Compute.CPU.Quantity) < 0 {
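To make the reservation arithmetic above concrete, here is a standalone sketch (not part of the patch) computing the headroom left for APIs on a hypothetical 2-vCPU / 7680Mi instance; real values are produced by instance_metadata.py:

package main

import (
	"fmt"

	kresource "k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// Hypothetical instance size; the operator reads the real numbers
	// from the cluster config at startup.
	maxCPU := kresource.MustParse("2000m")
	maxMem := kresource.MustParse("7680Mi")

	// Subtract the cortex reservations defined above.
	maxCPU.Sub(kresource.MustParse("800m"))   // cortexCPUReserve
	maxMem.Sub(kresource.MustParse("1200Mi")) // cortexMemReserve

	// On GPU instances, the nvidia device plugin daemonset is reserved too.
	instanceGPU := int64(1)
	if instanceGPU > 0 {
		maxCPU.Sub(kresource.MustParse("100m"))
		maxMem.Sub(kresource.MustParse("100Mi"))
	}

	// The most a single API can request on one node.
	fmt.Println(maxCPU.String(), maxMem.String()) // 1100m 6380Mi
}

From bd24c1cc5f2e6f5a9f6084f98795cd3e8d4e2533 Mon Sep 17 00:00:00 2001
From: David Eliahu
Date: Wed, 13 Nov 2019 20:37:21 -0800
Subject: [PATCH 16/24] Separate internal cluster config

---
 cli/cmd/lib_manager.go                 |  2 +-
 dev/operator_local.sh                  |  3 +-
 manager/cluster_config_env.py          | 11 ++++---
 manager/install.sh                     |  1 +
 manager/instance_metadata.py           | 20 +++++++------
 pkg/consts/consts.go                   |  5 ++--
 pkg/lib/clusterconfig/clusterconfig.go | 40 +++++++++++++++-----------
 pkg/operator/api/context/context.go    | 24 ++++++++--------
 pkg/operator/api/schema/schema.go      |  2 +-
 pkg/operator/config/config.go          | 16 +++++++++--
 10 files changed, 72 insertions(+), 52 deletions(-)

diff --git a/cli/cmd/lib_manager.go b/cli/cmd/lib_manager.go
index b1e902281a..46f8289201 100644
--- a/cli/cmd/lib_manager.go
+++ b/cli/cmd/lib_manager.go
@@ -132,7 +132,7 @@ func runManagerCommand(entrypoint string, clusterConfig *clusterconfig.ClusterCo
 	containerConfig := &container.Config{
 		Image:        clusterConfig.ImageManager,
 		Entrypoint:   []string{"/bin/bash", "-c"},
-		Cmd:          []string{"sleep 0.1 && eval $(python /root/instance_metadata.py /.cortex/cluster.yaml) && eval $(python /root/cluster_config_env.py /.cortex/cluster.yaml) && " + entrypoint},
+		Cmd:          []string{"sleep 0.1 && eval $(python /root/instance_metadata.py /.cortex/cluster.yaml /.cortex/cluster_internal.yaml) && eval $(python /root/cluster_config_env.py /.cortex/cluster.yaml /.cortex/cluster_internal.yaml) && " + entrypoint},
 		Tty:          true,
 		AttachStdout: true,
 		AttachStderr: true,
diff --git a/dev/operator_local.sh b/dev/operator_local.sh
index 18d23530e0..01c880f0ec 100755
--- a/dev/operator_local.sh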
+++ b/dev/operator_local.sh @@ -21,10 +21,11 @@ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. >/dev/null && pwd)" export CORTEX_OPERATOR_IN_CLUSTER=false export CORTEX_CLUSTER_CONFIG_PATH=$ROOT/dev/config/cluster.yaml +export CORTEX_INTERNAL_CLUSTER_CONFIG_PATH=$HOME/.cortex/cluster_internal.yaml pip3 install -r $ROOT/manager/requirements.txt -python3 $ROOT/manager/instance_metadata.py $CORTEX_CLUSTER_CONFIG_PATH +python3 $ROOT/manager/instance_metadata.py $CORTEX_CLUSTER_CONFIG_PATH $CORTEX_INTERNAL_CLUSTER_CONFIG_PATH kill $(pgrep -f rerun) >/dev/null 2>&1 || true updated_cli_config=$(cat $HOME/.cortex/default.json | jq '.cortex_url = "http://localhost:8888"') && echo $updated_cli_config > $HOME/.cortex/default.json diff --git a/manager/cluster_config_env.py b/manager/cluster_config_env.py index f02cd3f84e..685637c9c2 100644 --- a/manager/cluster_config_env.py +++ b/manager/cluster_config_env.py @@ -15,10 +15,9 @@ import sys import yaml -cluster_conifg_path = sys.argv[1] +for config_path in sys.argv[1:]: + with open(config_path, "r") as f: + config = yaml.safe_load(f) -with open(cluster_conifg_path, "r") as cluster_conifg_file: - cluster_conifg = yaml.safe_load(cluster_conifg_file) - -for key, value in cluster_conifg.items(): - print("export CORTEX_{}={}".format(key.upper(), value)) + for key, value in config.items(): + print("export CORTEX_{}={}".format(key.upper(), value)) diff --git a/manager/install.sh b/manager/install.sh index a73c4906b0..652a10ae64 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -176,6 +176,7 @@ function setup_cloudwatch_logs() { function setup_configmap() { kubectl -n=cortex create configmap 'cluster-config' \ --from-file='cluster.yaml'='/.cortex/cluster.yaml' \ + --from-file='cluster_internal.yaml'='/.cortex/cluster_internal.yaml' \ -o yaml --dry-run | kubectl apply -f - >/dev/null } diff --git a/manager/instance_metadata.py b/manager/instance_metadata.py index b6163cccd8..6ae5416d45 100644 --- a/manager/instance_metadata.py +++ b/manager/instance_metadata.py @@ -67,22 +67,24 @@ def download_metadata(cluster_config): return instance_mapping -def set_ec2_metadata(cluster_config_path): - with open(cluster_config_path, "r") as cluster_config_file: - cluster_config = yaml.safe_load(cluster_config_file) +def set_ec2_metadata(cluster_config_path, internal_cluster_config_path): + with open(cluster_config_path, "r") as f: + cluster_config = yaml.safe_load(f) instance_mapping = download_metadata(cluster_config) instance_metadata = instance_mapping[cluster_config["instance_type"]] - cluster_config["instance_mem"] = str(instance_metadata["mem"]) + "Mi" - cluster_config["instance_cpu"] = str(instance_metadata["cpu"]) - cluster_config["instance_gpu"] = int(instance_metadata.get("gpu", 0)) + internal_cluster_config = { + "instance_mem": str(instance_metadata["mem"]) + "Mi", + "instance_cpu": str(instance_metadata["cpu"]), + "instance_gpu": int(instance_metadata.get("gpu", 0)), + } - with open(cluster_config_path, "w") as cluster_config_file: - yaml.dump(cluster_config, cluster_config_file, default_flow_style=False) + with open(internal_cluster_config_path, "w") as f: + yaml.dump(internal_cluster_config, f) def main(): - set_ec2_metadata(sys.argv[1]) + set_ec2_metadata(sys.argv[1], sys.argv[2]) if __name__ == "__main__": diff --git a/pkg/consts/consts.go b/pkg/consts/consts.go index 56b9696588..817f32d957 100644 --- a/pkg/consts/consts.go +++ b/pkg/consts/consts.go @@ -23,8 +23,9 @@ var ( EmptyDirMountPath = "/mnt" EmptyDirVolumeName = "mnt" - ClusterConfigPath = 
"/configs/cluster/cluster.yaml" - ClusterConfigName = "cluster-config" + ClusterConfigPath = "/configs/cluster/cluster.yaml" + InternalClusterConfigPath = "/configs/cluster/cluster_internal.yaml" + ClusterConfigName = "cluster-config" AppsDir = "apps" DeploymentsDir = "deployments" diff --git a/pkg/lib/clusterconfig/clusterconfig.go b/pkg/lib/clusterconfig/clusterconfig.go index d1d0355173..7356aa50c0 100644 --- a/pkg/lib/clusterconfig/clusterconfig.go +++ b/pkg/lib/clusterconfig/clusterconfig.go @@ -59,14 +59,20 @@ type ClusterConfig struct { ImageIstioPilot string `json:"image_istio_pilot" yaml:"image_istio_pilot"` ImageIstioCitadel string `json:"image_istio_citadel" yaml:"image_istio_citadel"` ImageIstioGalley string `json:"image_istio_galley" yaml:"image_istio_galley"` +} + +type InternalClusterConfig struct { + ClusterConfig - // Internal - ID string `json:"id"` - APIVersion string `json:"api_version"` - OperatorInCluster bool `json:"operator_in_cluster"` - InstanceCPU *k8s.Quantity `json:"instance_cpu" yaml:"instance_cpu"` - InstanceMem *k8s.Quantity `json:"instance_mem" yaml:"instance_mem"` - InstanceGPU int64 `json:"instance_gpu" yaml:"instance_gpu"` + // Populated via internal cluster config file + InstanceCPU k8s.Quantity `json:"instance_cpu" yaml:"instance_cpu"` + InstanceMem k8s.Quantity `json:"instance_mem" yaml:"instance_mem"` + InstanceGPU int64 `json:"instance_gpu" yaml:"instance_gpu"` + + // Populated by operator + ID string `json:"id"` + APIVersion string `json:"api_version"` + OperatorInCluster bool `json:"operator_in_cluster"` } var UserValidation = &cr.StructValidation{ @@ -254,29 +260,29 @@ var UserValidation = &cr.StructValidation{ } var InternalValidation = &cr.StructValidation{ - StructFieldValidations: append(UserValidation.StructFieldValidations, - &cr.StructFieldValidation{ - StructField: "InstanceCPU", - StringPtrValidation: &cr.StringPtrValidation{}, + StructFieldValidations: []*cr.StructFieldValidation{ + { + StructField: "InstanceCPU", + StringValidation: &cr.StringValidation{}, Parser: k8s.QuantityParser(&k8s.QuantityValidation{ GreaterThan: k8s.QuantityPtr(kresource.MustParse("0")), }), }, - &cr.StructFieldValidation{ - StructField: "InstanceMem", - StringPtrValidation: &cr.StringPtrValidation{}, + { + StructField: "InstanceMem", + StringValidation: &cr.StringValidation{}, Parser: k8s.QuantityParser(&k8s.QuantityValidation{ GreaterThan: k8s.QuantityPtr(kresource.MustParse("0")), }), }, - &cr.StructFieldValidation{ + { StructField: "InstanceGPU", Int64Validation: &cr.Int64Validation{ Default: 0, GreaterThanOrEqualTo: pointer.Int64(0), }, }, - ), + }, } func PromptValidation(skipPopulatedFields bool, promptInstanceType bool, defaults *ClusterConfig) *cr.PromptValidation { @@ -390,7 +396,7 @@ func (cc *ClusterConfig) SetBucket(awsAccessKeyID string, awsSecretAccessKey str return nil } -func (cc *ClusterConfig) UserFacingString() string { +func (cc *InternalClusterConfig) UserFacingString() string { var items []table.KV items = append(items, table.KV{K: "cluster version", V: cc.APIVersion}) diff --git a/pkg/operator/api/context/context.go b/pkg/operator/api/context/context.go index 69c8c3c3f9..3dab2086d2 100644 --- a/pkg/operator/api/context/context.go +++ b/pkg/operator/api/context/context.go @@ -27,18 +27,18 @@ import ( ) type Context struct { - ID string `json:"id"` - Key string `json:"key"` - CreatedEpoch int64 `json:"created_epoch"` - ClusterConfig *clusterconfig.ClusterConfig `json:"cluster_config"` - DeploymentVersion string `json:"deployment_version"` 
- Root string `json:"root"` - MetadataRoot string `json:"metadata_root"` - StatusPrefix string `json:"status_prefix"` - App *App `json:"app"` - APIs APIs `json:"apis"` - ProjectID string `json:"project_id"` - ProjectKey string `json:"project_key"` + ID string `json:"id"` + Key string `json:"key"` + CreatedEpoch int64 `json:"created_epoch"` + ClusterConfig *clusterconfig.InternalClusterConfig `json:"cluster_config"` + DeploymentVersion string `json:"deployment_version"` + Root string `json:"root"` + MetadataRoot string `json:"metadata_root"` + StatusPrefix string `json:"status_prefix"` + App *App `json:"app"` + APIs APIs `json:"apis"` + ProjectID string `json:"project_id"` + ProjectKey string `json:"project_key"` } type Resource interface { diff --git a/pkg/operator/api/schema/schema.go b/pkg/operator/api/schema/schema.go index 9eebed78ba..b6665513e9 100644 --- a/pkg/operator/api/schema/schema.go +++ b/pkg/operator/api/schema/schema.go @@ -25,7 +25,7 @@ import ( ) type InfoResponse struct { - ClusterConfig *clusterconfig.ClusterConfig `json:"cluster_config"` + ClusterConfig *clusterconfig.InternalClusterConfig `json:"cluster_config"` } type DeployResponse struct { diff --git a/pkg/operator/config/config.go b/pkg/operator/config/config.go index 67b315289e..9027b80aa6 100644 --- a/pkg/operator/config/config.go +++ b/pkg/operator/config/config.go @@ -31,7 +31,7 @@ import ( ) var ( - Cluster *clusterconfig.ClusterConfig + Cluster *clusterconfig.InternalClusterConfig AWS *aws.Client Kubernetes *k8s.Client IstioKubernetes *k8s.Client @@ -41,7 +41,7 @@ var ( func Init() error { var err error - Cluster = &clusterconfig.ClusterConfig{ + Cluster = &clusterconfig.InternalClusterConfig{ APIVersion: consts.CortexVersion, OperatorInCluster: strings.ToLower(os.Getenv("CORTEX_OPERATOR_IN_CLUSTER")) != "false", } @@ -51,7 +51,17 @@ func Init() error { clusterConfigPath = consts.ClusterConfigPath } - errs := cr.ParseYAMLFile(Cluster, clusterconfig.InternalValidation, clusterConfigPath) + internalClusterConfigPath := os.Getenv("CORTEX_INTERNAL_CLUSTER_CONFIG_PATH") + if internalClusterConfigPath == "" { + internalClusterConfigPath = consts.InternalClusterConfigPath + } + + errs := cr.ParseYAMLFile(Cluster, clusterconfig.UserValidation, clusterConfigPath) + if errors.HasErrors(errs) { + return errors.FirstError(errs...) + } + + errs = cr.ParseYAMLFile(Cluster, clusterconfig.InternalValidation, internalClusterConfigPath) if errors.HasErrors(errs) { return errors.FirstError(errs...) } From a8c16f48a09676da74a34eeae45ffe95beb44e8e Mon Sep 17 00:00:00 2001 From: vishal Date: Thu, 14 Nov 2019 09:30:00 -0500 Subject: [PATCH 17/24] Change cortex internal cluster path for dev to be in the dev directory --- dev/operator_local.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/operator_local.sh b/dev/operator_local.sh index 01c880f0ec..88314e2cd6 100755 --- a/dev/operator_local.sh +++ b/dev/operator_local.sh @@ -21,7 +21,7 @@ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. 
>/dev/null && pwd)" export CORTEX_OPERATOR_IN_CLUSTER=false export CORTEX_CLUSTER_CONFIG_PATH=$ROOT/dev/config/cluster.yaml -export CORTEX_INTERNAL_CLUSTER_CONFIG_PATH=$HOME/.cortex/cluster_internal.yaml +export CORTEX_INTERNAL_CLUSTER_CONFIG_PATH=$ROOT/dev/config/cluster_internal.yaml pip3 install -r $ROOT/manager/requirements.txt From fad20f4b9b398fd7ed36064d32a29b205252c4e5 Mon Sep 17 00:00:00 2001 From: vishal Date: Thu, 14 Nov 2019 09:51:16 -0500 Subject: [PATCH 18/24] Update config.md docs --- docs/cluster/config.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/cluster/config.md b/docs/cluster/config.md index 0201bd6de2..dfc68c2ea9 100644 --- a/docs/cluster/config.md +++ b/docs/cluster/config.md @@ -18,8 +18,10 @@ cortex_aws_secret_access_key: *** # Instance type Cortex will use instance_type: m5.large -# Minimum and maximum number of instances in the cluster +# Minimum number of worker instances in the cluster (must be >= 0) min_instances: 1 + +# Maximum number of worker instances in the cluster (must be >= 1) max_instances: 5 # Name of the S3 bucket Cortex will use From 96005c18c40b81d39b619d1e84775eec5299fab1 Mon Sep 17 00:00:00 2001 From: vishal Date: Thu, 14 Nov 2019 09:51:40 -0500 Subject: [PATCH 19/24] Change config map key name --- pkg/operator/workloads/memory_capacity.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/operator/workloads/memory_capacity.go b/pkg/operator/workloads/memory_capacity.go index 3ebe9e6068..a8a8d9b73d 100644 --- a/pkg/operator/workloads/memory_capacity.go +++ b/pkg/operator/workloads/memory_capacity.go @@ -25,7 +25,7 @@ import ( ) const key = "capacity" -const configMemoryMapName = "cortex-node-memory" +const configMemoryMapName = "cortex-instance-memory" func GetMemoryCapacityFromNodes() (*kresource.Quantity, error) { opts := kmeta.ListOptions{ From 5848fbe572e24d7ac0c814b9dd98af86e6f26134 Mon Sep 17 00:00:00 2001 From: vishal Date: Thu, 14 Nov 2019 09:53:24 -0500 Subject: [PATCH 20/24] Remove outdated comment and minor refactor --- pkg/operator/workloads/workflow.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pkg/operator/workloads/workflow.go b/pkg/operator/workloads/workflow.go index bc08f9a367..b844315229 100644 --- a/pkg/operator/workloads/workflow.go +++ b/pkg/operator/workloads/workflow.go @@ -41,7 +41,7 @@ KubeProxy 100 Reserved (150 + 150) see eks.yaml for details Buffer (100) */ -var cortexCPUReserve = kresource.MustParse("800m") // FluentD (200), StatsD (100), KubeProxy (100), KubeReserved (150 + 150), buffer (100) +var cortexCPUReserve = kresource.MustParse("800m") /* Memory Reservations: @@ -51,7 +51,10 @@ StatsD 100 Reserved (300 + 300 + 200) see eks.yaml for details Buffer (100) */ -var cortexMemReserve = kresource.MustParse("1200Mi") // FluentD (200), StatsD (100), KubeReserved (300 + 300 + 200), buffer (100) +var cortexMemReserve = kresource.MustParse("1200Mi") + +var nvidiaCPUReserve = kresource.MustParse("100m") +var nvidiaMemReserve = kresource.MustParse("100Mi") func Init() error { err := reloadCurrentContexts() @@ -338,8 +341,8 @@ func ValidateDeploy(ctx *context.Context) error { maxGPU := config.Cluster.InstanceGPU if maxGPU > 0 { // Reserve resources for nvidia device plugin daemonset - maxCPU.Sub(kresource.MustParse("100m")) - maxMem.Sub(kresource.MustParse("100Mi")) + maxCPU.Sub(nvidiaCPUReserve) + maxMem.Sub(nvidiaMemReserve) } for _, api := range ctx.APIs { From d37914cdcb02128547e555d44b5413627a466861 Mon Sep 17 00:00:00 2001 From: 
David Eliahu Date: Thu, 14 Nov 2019 08:58:21 -0800 Subject: [PATCH 21/24] Fix formatting --- pkg/lib/clusterconfig/clusterconfig.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/lib/clusterconfig/clusterconfig.go b/pkg/lib/clusterconfig/clusterconfig.go index 7356aa50c0..a1bf17709d 100644 --- a/pkg/lib/clusterconfig/clusterconfig.go +++ b/pkg/lib/clusterconfig/clusterconfig.go @@ -177,7 +177,8 @@ var UserValidation = &cr.StructValidation{ { StructField: "ImageManager", StringValidation: &cr.StringValidation{ - Default: "cortexlabs/manager:" + consts.CortexVersion}, + Default: "cortexlabs/manager:" + consts.CortexVersion, + }, }, { StructField: "ImageDownloader", From acf0058c1413443f6315af6822bae0a60d0feaa3 Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Thu, 14 Nov 2019 09:01:36 -0800 Subject: [PATCH 22/24] Update api_workload.go --- pkg/operator/workloads/api_workload.go | 34 ++++++++++++-------------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/pkg/operator/workloads/api_workload.go b/pkg/operator/workloads/api_workload.go index 7f4423bd95..9ea0fb03b6 100644 --- a/pkg/operator/workloads/api_workload.go +++ b/pkg/operator/workloads/api_workload.go @@ -428,7 +428,7 @@ func tfAPISpec( NodeSelector: map[string]string{ "workload": "true", }, - Tolerations: tolerations(), + Tolerations: tolerations, Volumes: defaultVolumes(), ServiceAccountName: "default", }, @@ -583,7 +583,7 @@ func predictorAPISpec( NodeSelector: map[string]string{ "workload": "true", }, - Tolerations: tolerations(), + Tolerations: tolerations, Volumes: defaultVolumes(), ServiceAccountName: "default", }, @@ -736,7 +736,7 @@ func onnxAPISpec( NodeSelector: map[string]string{ "workload": "true", }, - Tolerations: tolerations(), + Tolerations: tolerations, Volumes: defaultVolumes(), ServiceAccountName: "default", }, @@ -935,19 +935,17 @@ func APIPodCompute(containers []kcore.Container) (*k8s.Quantity, *k8s.Quantity, return totalCPU, totalMem, totalGPU } -func tolerations() []kcore.Toleration { - return []kcore.Toleration{ - { - Key: "workload", - Operator: kcore.TolerationOpEqual, - Value: "true", - Effect: kcore.TaintEffectNoSchedule, - }, - { - Key: "nvidia.com/gpu", - Operator: kcore.TolerationOpEqual, - Value: "true", - Effect: kcore.TaintEffectNoSchedule, - }, - } +var tolerations = []kcore.Toleration{ + { + Key: "workload", + Operator: kcore.TolerationOpEqual, + Value: "true", + Effect: kcore.TaintEffectNoSchedule, + }, + { + Key: "nvidia.com/gpu", + Operator: kcore.TolerationOpEqual, + Value: "true", + Effect: kcore.TaintEffectNoSchedule, + }, } From 19fe4ff339bb882d689decd129cd6690b7954f92 Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Thu, 14 Nov 2019 09:26:58 -0800 Subject: [PATCH 23/24] Update memory_capacity.go --- pkg/operator/workloads/memory_capacity.go | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pkg/operator/workloads/memory_capacity.go b/pkg/operator/workloads/memory_capacity.go index a8a8d9b73d..7cc4d4a01d 100644 --- a/pkg/operator/workloads/memory_capacity.go +++ b/pkg/operator/workloads/memory_capacity.go @@ -24,8 +24,8 @@ import ( kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" ) -const key = "capacity" -const configMemoryMapName = "cortex-instance-memory" +const memConfigMapName = "cortex-instance-memory" +const memConfigMapKey = "capacity" func GetMemoryCapacityFromNodes() (*kresource.Quantity, error) { opts := kmeta.ListOptions{ @@ -55,16 +55,16 @@ func GetMemoryCapacityFromNodes() (*kresource.Quantity, error) { } 
 func GetMemoryCapacityFromConfigMap() (*kresource.Quantity, error) {
-	configMap, err := config.Kubernetes.GetConfigMap(configMemoryMapName)
+	configMapData, err := config.Kubernetes.GetConfigMapData(memConfigMapName)
 	if err != nil {
 		return nil, err
 	}
 
-	if configMap == nil {
+	if len(configMapData) == 0 {
 		return nil, nil
 	}
 
-	memoryUserStr := configMap.Data[key]
+	memoryUserStr := configMapData[memConfigMapKey]
 	mem, err := kresource.ParseQuantity(memoryUserStr)
 	if err != nil {
 		return nil, err
@@ -94,12 +94,12 @@ func UpdateMemoryCapacityConfigMap() (*kresource.Quantity, error) {
 		minMem = memFromConfigMap
 	}
 
-	if minMem != memFromConfigMap {
+	if memFromConfigMap == nil || minMem.Cmp(*memFromConfigMap) != 0 {
 		configMap := k8s.ConfigMap(&k8s.ConfigMapSpec{
-			Name:      configMemoryMapName,
+			Name:      memConfigMapName,
 			Namespace: consts.K8sNamespace,
 			Data: map[string]string{
-				key: minMem.String(),
+				memConfigMapKey: minMem.String(),
 			},
 		})
 
@@ -108,5 +108,6 @@ func UpdateMemoryCapacityConfigMap() (*kresource.Quantity, error) {
 			return nil, err
 		}
 	}
+
 	return minMem, nil
 }

From 3f9a62feb46b4a890c0d93e0588d3195c20cb2d5 Mon Sep 17 00:00:00 2001
From: David Eliahu
Date: Thu, 14 Nov 2019 09:34:16 -0800
Subject: [PATCH 24/24] Update metrics-server.yaml

---
 manager/manifests/metrics-server.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/manager/manifests/metrics-server.yaml b/manager/manifests/metrics-server.yaml
index 7134e39083..5493539c83 100644
--- a/manager/manifests/metrics-server.yaml
+++ b/manager/manifests/metrics-server.yaml
@@ -111,7 +111,7 @@ spec:
         volumeMounts:
         - name: tmp-dir
           mountPath: /tmp
-        resources: # https://github.com/kubernetes/kubernetes/blob/master/cluster/addons/device-plugins/nvidia-gpu/daemonset.yaml#L44
+        resources:
           requests:
             cpu: 100m
            memory: 100Mi
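Taken together, these patches establish a simple scheduling contract: worker nodes carry a workload=true:NoSchedule taint (plus nvidia.com/gpu=true:NoSchedule on GPU node groups), the system daemonsets tolerate those taints, and API pods both select the worker label and tolerate the taints. A minimal standalone sketch of the resulting pod spec (illustrative only; workerPodSpec is a hypothetical name):

package main

import (
	"fmt"

	kcore "k8s.io/api/core/v1"
)

func workerPodSpec() kcore.PodSpec {
	return kcore.PodSpec{
		// Only land on nodes in the tainted worker node group...
		NodeSelector: map[string]string{"workload": "true"},
		// ...and tolerate the taints that keep everything else off of them.
		Tolerations: []kcore.Toleration{
			{Key: "workload", Operator: kcore.TolerationOpEqual, Value: "true", Effect: kcore.TaintEffectNoSchedule},
			{Key: "nvidia.com/gpu", Operator: kcore.TolerationOpEqual, Value: "true", Effect: kcore.TaintEffectNoSchedule},
		},
	}
}

func main() {
	fmt.Printf("%+v\n", workerPodSpec())
}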