Merged
29 commits
c74baf2
NodeGroup spot instances
vishalbollu Oct 7, 2019
f4fd69c
Update cluster-autoscaler.yaml
deliahu Oct 7, 2019
abfe18f
Update autoscaler to version 1.16
vishalbollu Oct 7, 2019
a6a096b
Merge branch 'spot-instances' into separate-operator-workload-nodegroup
vishalbollu Oct 8, 2019
fdc8201
Calculate allocatable resources more accurately
vishalbollu Oct 10, 2019
6f12aa9
Merge branch 'master' into separate-operator-workload-nodegroup
vishalbollu Nov 5, 2019
b0e0fa6
Separate nodegroups
vishalbollu Nov 12, 2019
12ee06f
Merge branch 'master' into separate-operator-workload-nodegroup
vishalbollu Nov 12, 2019
3268382
Add desired instances
vishalbollu Nov 12, 2019
8d4ea32
Minor cleanup
vishalbollu Nov 12, 2019
e952392
Remove debug statements
vishalbollu Nov 13, 2019
607c545
Merge branch 'master' into separate-operator-workload-nodegroup
vishalbollu Nov 13, 2019
351e68b
Remove more debugging helpers
vishalbollu Nov 13, 2019
58e4933
Reset go.mod
vishalbollu Nov 13, 2019
c56ca3e
Remove more echo statements
vishalbollu Nov 13, 2019
cdf862e
Remove unnecessary boto3 dependency
vishalbollu Nov 13, 2019
1f18d52
Address some PR comments and fix linting
vishalbollu Nov 13, 2019
f90f921
Remove InternalClusterConfig
deliahu Nov 13, 2019
2703944
Address more PR comments
vishalbollu Nov 14, 2019
bd24c1c
Separate internal cluster config
deliahu Nov 14, 2019
a8c16f4
Change cortex internal cluster path for dev to be in the dev directory
vishalbollu Nov 14, 2019
fad20f4
Update config.md docs
vishalbollu Nov 14, 2019
96005c1
Change config map key name
vishalbollu Nov 14, 2019
5848fbe
Remove outdated comment and minor refactor
vishalbollu Nov 14, 2019
d37914c
Fix formatting
deliahu Nov 14, 2019
acf0058
Update api_workload.go
deliahu Nov 14, 2019
19fe4ff
Update memory_capacity.go
deliahu Nov 14, 2019
3f9a62f
Update metrics-server.yaml
deliahu Nov 14, 2019
1c40e7e
Merge branch 'master' into separate-operator-workload-nodegroup
vishalbollu Nov 14, 2019
2 changes: 1 addition & 1 deletion cli/cmd/cluster.go
@@ -145,7 +145,7 @@ var infoCmd = &cobra.Command{
if err != nil {
errors.Exit(err, "/info", string(httpResponse))
}
fmt.Println(infoResponse.ClusterConfig.String())
fmt.Println(infoResponse.ClusterConfig.UserFacingString())
},
}

2 changes: 1 addition & 1 deletion cli/cmd/lib_cluster_config.go
@@ -93,7 +93,7 @@ var awsCredentialsPromptValidation = &cr.PromptValidation{
}

func readClusterConfigFile(clusterConfig *clusterconfig.ClusterConfig, awsCreds *AWSCredentials, path string) error {
errs := cr.ParseYAMLFile(clusterConfig, clusterconfig.Validation, path)
errs := cr.ParseYAMLFile(clusterConfig, clusterconfig.UserValidation, path)
if errors.HasErrors(errs) {
return errors.FirstError(errs...)
}
3 changes: 2 additions & 1 deletion cli/cmd/lib_manager.go
@@ -124,14 +124,15 @@ func runManagerCommand(entrypoint string, clusterConfig *clusterconfig.ClusterCo
if err != nil {
return "", errors.WithStack(err)
}

if err := files.WriteFile(clusterConfigBytes, cachedClusterConfigPath); err != nil {
return "", err
}

containerConfig := &container.Config{
Image: clusterConfig.ImageManager,
Entrypoint: []string{"/bin/bash", "-c"},
Cmd: []string{"sleep 0.1 && eval $(python /root/cluster_config_env.py /.cortex/cluster.yaml) && " + entrypoint},
Cmd: []string{"sleep 0.1 && eval $(python /root/instance_metadata.py /.cortex/cluster.yaml /.cortex/cluster_internal.yaml) && eval $(python /root/cluster_config_env.py /.cortex/cluster.yaml /.cortex/cluster_internal.yaml) && " + entrypoint},
Tty: true,
AttachStdout: true,
AttachStderr: true,
5 changes: 5 additions & 0 deletions dev/operator_local.sh
@@ -21,6 +21,11 @@ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. >/dev/null && pwd)"

export CORTEX_OPERATOR_IN_CLUSTER=false
export CORTEX_CLUSTER_CONFIG_PATH=$ROOT/dev/config/cluster.yaml
export CORTEX_INTERNAL_CLUSTER_CONFIG_PATH=$ROOT/dev/config/cluster_internal.yaml

pip3 install -r $ROOT/manager/requirements.txt

python3 $ROOT/manager/instance_metadata.py $CORTEX_CLUSTER_CONFIG_PATH $CORTEX_INTERNAL_CLUSTER_CONFIG_PATH

kill $(pgrep -f rerun) >/dev/null 2>&1 || true
updated_cli_config=$(cat $HOME/.cortex/default.json | jq '.cortex_url = "http://localhost:8888"') && echo $updated_cli_config > $HOME/.cortex/default.json
6 changes: 4 additions & 2 deletions docs/cluster/config.md
@@ -18,8 +18,10 @@ cortex_aws_secret_access_key: ***
# Instance type Cortex will use
instance_type: m5.large

# Minimum and maximum number of instances in the cluster
min_instances: 2
# Minimum number of worker instances in the cluster (must be >= 0)
min_instances: 1

# Maximum number of worker instances in the cluster (must be >= 1)
max_instances: 5

# Name of the S3 bucket Cortex will use
3 changes: 3 additions & 0 deletions images/manager/Dockerfile
@@ -4,8 +4,11 @@ WORKDIR /root

ENV PATH /root/.local/bin:$PATH

COPY manager/requirements.txt /root/requirements.txt

RUN pip install --upgrade pip && \
pip install awscli --upgrade --user && \
pip install -r /root/requirements.txt && \
rm -rf /root/.cache/pip*

RUN apk add --no-cache bash curl gettext jq openssl
11 changes: 5 additions & 6 deletions manager/cluster_config_env.py
@@ -15,10 +15,9 @@
import sys
import yaml

cluster_conifg_path = sys.argv[1]
for config_path in sys.argv[1:]:
with open(config_path, "r") as f:
config = yaml.safe_load(f)

with open(cluster_conifg_path, "r") as cluster_conifg_file:
cluster_conifg = yaml.safe_load(cluster_conifg_file)

for key, value in cluster_conifg.items():
print("export CORTEX_{}={}".format(key.upper(), value))
for key, value in config.items():
print("export CORTEX_{}={}".format(key.upper(), value))
39 changes: 36 additions & 3 deletions manager/eks.yaml
@@ -21,15 +21,48 @@ metadata:
version: "1.14"

nodeGroups:
- name: ng-1
- name: ng-cortex-operator
instanceType: t3.medium
minSize: 1
maxSize: 2
desiredCapacity: 1
ami: auto
iam:
withAddonPolicies:
autoScaler: true
tags:
k8s.io/cluster-autoscaler/enabled: 'true'
kubeletExtraConfig:
kubeReserved:
cpu: 150m
memory: 300Mi
ephemeral-storage: 1Gi
kubeReservedCgroup: /kube-reserved
systemReserved:
cpu: 150m
memory: 300Mi
ephemeral-storage: 1Gi
evictionHard:
memory.available: 200Mi
nodefs.available: 5%

- name: ng-cortex-worker
instanceType: $CORTEX_INSTANCE_TYPE
minSize: $CORTEX_MIN_INSTANCES
maxSize: $CORTEX_MAX_INSTANCES
desiredCapacity: $CORTEX_MIN_INSTANCES
desiredCapacity: $CORTEX_DESIRED_INSTANCES
ami: auto
iam:
withAddonPolicies:
autoScaler: true
tags:
k8s.io/cluster-autoscaler/enabled: 'true'
k8s.io/cluster-autoscaler/node-template/label/workload: 'true'
labels:
lifecycle: Ec2Spot
workload: "true"
taints:
workload: "true:NoSchedule"
kubeletExtraConfig:
kubeReserved:
cpu: 150m
@@ -41,5 +74,5 @@ nodeGroups:
memory: 300Mi
ephemeral-storage: 1Gi
evictionHard:
memory.available: 200Mi
memory.available: 200Mi
nodefs.available: 5%
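The ng-cortex-worker group is tainted with workload=true:NoSchedule, so only pods that tolerate that taint are scheduled onto worker nodes. A minimal sketch of a matching toleration using the Kubernetes Python client (the pod and image names are assumptions for illustration):

from kubernetes import client

# Toleration matching the workload=true:NoSchedule taint on ng-cortex-worker nodes.
toleration = client.V1Toleration(
    key="workload", operator="Equal", value="true", effect="NoSchedule"
)

# Hypothetical pod spec that can land on the tainted worker nodes.
pod_spec = client.V1PodSpec(
    containers=[client.V1Container(name="example-api", image="example:latest")],
    tolerations=[toleration],
    node_selector={"workload": "true"},
)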
82 changes: 82 additions & 0 deletions manager/eks_gpu.yaml
@@ -0,0 +1,82 @@
# Copyright 2019 Cortex Labs, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig

metadata:
name: $CORTEX_CLUSTER_NAME
region: $CORTEX_REGION
version: "1.14"

nodeGroups:
- name: ng-cortex-operator
instanceType: t3.medium
minSize: 1
maxSize: 2
desiredCapacity: 1
ami: auto
iam:
withAddonPolicies:
autoScaler: true
tags:
k8s.io/cluster-autoscaler/enabled: 'true'
kubeletExtraConfig:
kubeReserved:
cpu: 150m
memory: 300Mi
ephemeral-storage: 1Gi
kubeReservedCgroup: /kube-reserved
systemReserved:
cpu: 150m
memory: 300Mi
ephemeral-storage: 1Gi
evictionHard:
memory.available: 200Mi
nodefs.available: 5%

- name: ng-cortex-worker
instanceType: $CORTEX_INSTANCE_TYPE
minSize: $CORTEX_MIN_INSTANCES
maxSize: $CORTEX_MAX_INSTANCES
desiredCapacity: $CORTEX_DESIRED_INSTANCES
ami: auto
iam:
withAddonPolicies:
autoScaler: true
tags:
k8s.io/cluster-autoscaler/enabled: 'true'
k8s.io/cluster-autoscaler/node-template/label/nvidia.com/gpu: 'true'
k8s.io/cluster-autoscaler/node-template/taint/dedicated: nvidia.com/gpu=true
k8s.io/cluster-autoscaler/node-template/label/workload: 'true'
labels:
lifecycle: Ec2Spot
workload: "true"
nvidia.com/gpu: 'true'
taints:
nvidia.com/gpu: "true:NoSchedule"
workload: "true:NoSchedule"
kubeletExtraConfig:
kubeReserved:
cpu: 150m
memory: 300Mi
ephemeral-storage: 1Gi
kubeReservedCgroup: /kube-reserved
systemReserved:
cpu: 150m
memory: 300Mi
ephemeral-storage: 1Gi
evictionHard:
memory.available: 200Mi
nodefs.available: 5%
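The kubeReserved, systemReserved, and evictionHard settings in both node groups tie into the "Calculate allocatable resources more accurately" commit: Kubernetes reports node allocatable as capacity minus those reservations. A back-of-the-envelope sketch (the 8GiB capacity is an assumed example, not a value from this PR):

# allocatable = capacity - kubeReserved - systemReserved - evictionHard
capacity_mi = 8 * 1024      # assume an instance with ~8GiB of memory
kube_reserved_mi = 300      # kubeReserved.memory above
system_reserved_mi = 300    # systemReserved.memory above
eviction_hard_mi = 200      # evictionHard memory.available above

allocatable_mi = capacity_mi - kube_reserved_mi - system_reserved_mi - eviction_hard_mi
print("{}Mi allocatable for workloads".format(allocatable_mi))  # 7392Mi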
19 changes: 15 additions & 4 deletions manager/install.sh
@@ -32,8 +32,18 @@ function ensure_eks() {
exit 1
fi

if [ $CORTEX_MIN_INSTANCES -lt 1 ]; then
export CORTEX_DESIRED_INSTANCES=1
else
export CORTEX_DESIRED_INSTANCES=$CORTEX_MIN_INSTANCES
fi

echo -e "○ Spinning up the cluster ... (this will take about 15 minutes)\n"
envsubst < eks.yaml | eksctl create cluster -f -
if [ $CORTEX_INSTANCE_GPU -ne 0 ]; then
envsubst < eks_gpu.yaml | eksctl create cluster -f -
else
envsubst < eks.yaml | eksctl create cluster -f -
fi
echo -e "\n✓ Spun up the cluster"
return
fi
@@ -60,15 +70,15 @@ function ensure_eks() {
echo "✓ Cluster is running"

# Check if instance type changed
ng_info=$(eksctl get nodegroup --cluster=$CORTEX_CLUSTER_NAME --region=$CORTEX_REGION --name ng-1 -o json)
ng_instance_type=$(echo "$ng_info" | jq -r ".[] | select( .Cluster == \"$CORTEX_CLUSTER_NAME\" ) | select( .Name == \"ng-1\" ) | .InstanceType")
ng_info=$(eksctl get nodegroup --cluster=$CORTEX_CLUSTER_NAME --region=$CORTEX_REGION --name ng-cortex-worker -o json)
ng_instance_type=$(echo "$ng_info" | jq -r ".[] | select( .Cluster == \"$CORTEX_CLUSTER_NAME\" ) | select( .Name == \"ng-cortex-worker\" ) | .InstanceType")
if [ "$ng_instance_type" != "$CORTEX_INSTANCE_TYPE" ]; then
echo -e "\nerror: Cortex does not currently support changing the instance type of a running cluster; please run \`cortex cluster down\` followed by \`cortex cluster up\` to create a new cluster"
exit 1
fi

# Check for change in min/max instances
asg_info=$(aws autoscaling describe-auto-scaling-groups --region $CORTEX_REGION --query 'AutoScalingGroups[?contains(Tags[?Key==`alpha.eksctl.io/nodegroup-name`].Value, `ng-1`)]')
asg_info=$(aws autoscaling describe-auto-scaling-groups --region $CORTEX_REGION --query 'AutoScalingGroups[?contains(Tags[?Key==`alpha.eksctl.io/nodegroup-name`].Value, `ng-cortex-worker`)]')
asg_name=$(echo "$asg_info" | jq -r 'first | .AutoScalingGroupName')
asg_min_size=$(echo "$asg_info" | jq -r 'first | .MinSize')
asg_max_size=$(echo "$asg_info" | jq -r 'first | .MaxSize')
@@ -166,6 +176,7 @@ function setup_cloudwatch_logs() {
function setup_configmap() {
kubectl -n=cortex create configmap 'cluster-config' \
--from-file='cluster.yaml'='/.cortex/cluster.yaml' \
--from-file='cluster_internal.yaml'='/.cortex/cluster_internal.yaml' \
-o yaml --dry-run | kubectl apply -f - >/dev/null
}

91 changes: 91 additions & 0 deletions manager/instance_metadata.py
@@ -0,0 +1,91 @@
# Copyright 2019 Cortex Labs, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import requests
import sys
import re
import os
import pathlib
import json
import yaml

PRICING_ENDPOINT_TEMPLATE = (
"https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{}/index.json"
)


def download_metadata(cluster_config):
response = requests.get(PRICING_ENDPOINT_TEMPLATE.format(cluster_config["region"]))
offers = response.json()

instance_mapping = {}

for product_id, product in offers["products"].items():
if product.get("attributes") is None:
continue
if product["attributes"].get("servicecode") != "AmazonEC2":
continue
if product["attributes"].get("tenancy") != "Shared":
continue
if product["attributes"].get("operatingSystem") != "Linux":
continue
if product["attributes"].get("capacitystatus") != "Used":
continue
if product["attributes"].get("operation") != "RunInstances":
continue
price_dimensions = list(offers["terms"]["OnDemand"][product["sku"]].values())[0][
"priceDimensions"
]

price = list(price_dimensions.values())[0]["pricePerUnit"]["USD"]

instance_type = product["attributes"]["instanceType"]
metadata = {
"sku": product["sku"],
"instance_type": instance_type,
"cpu": int(product["attributes"]["vcpu"]),
"mem": int(
float(re.sub("[^0-9\\.]", "", product["attributes"]["memory"].split(" ")[0])) * 1024
),
"price": float(price),
}
if product["attributes"].get("gpu") is not None:
metadata["gpu"] = product["attributes"]["gpu"]
instance_mapping[instance_type] = metadata

return instance_mapping


def set_ec2_metadata(cluster_config_path, internal_cluster_config_path):
with open(cluster_config_path, "r") as f:
cluster_config = yaml.safe_load(f)
instance_mapping = download_metadata(cluster_config)
instance_metadata = instance_mapping[cluster_config["instance_type"]]

internal_cluster_config = {
"instance_mem": str(instance_metadata["mem"]) + "Mi",
"instance_cpu": str(instance_metadata["cpu"]),
"instance_gpu": int(instance_metadata.get("gpu", 0)),
}

with open(internal_cluster_config_path, "w") as f:
yaml.dump(internal_cluster_config, f)


def main():
set_ec2_metadata(sys.argv[1], sys.argv[2])


if __name__ == "__main__":
main()
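A rough sketch of how this script is used: the manager container (and dev/operator_local.sh) runs it against the user's cluster config to produce the internal config, which the operator reads for scheduling math. The arguments below mirror the manager invocation; the printed values assume an m5.large and are illustrative:

import subprocess
import yaml

# Generate the internal cluster config from the user-facing one.
subprocess.run(
    ["python", "instance_metadata.py", "/.cortex/cluster.yaml", "/.cortex/cluster_internal.yaml"],
    check=True,
)

# Inspect what was written.
with open("/.cortex/cluster_internal.yaml") as f:
    print(yaml.safe_load(f))
# e.g. {'instance_cpu': '2', 'instance_gpu': 0, 'instance_mem': '8192Mi'}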
5 changes: 2 additions & 3 deletions manager/manifests/cluster-autoscaler.yaml
@@ -143,12 +143,11 @@ spec:
- image: $CORTEX_IMAGE_CLUSTER_AUTOSCALER
name: cluster-autoscaler
resources:
limits:
cpu: 100m
memory: 300Mi
requests:
cpu: 100m
memory: 300Mi
limits:
memory: 300Mi
command:
- ./cluster-autoscaler
- --v=4