Merged
29 commits
c74baf2
NodeGroup spot instances
vishalbollu Oct 7, 2019
f4fd69c
Update cluster-autoscaler.yaml
deliahu Oct 7, 2019
abfe18f
Update autoscaler to version 1.16
vishalbollu Oct 7, 2019
a6a096b
Merge branch 'spot-instances' into separate-operator-workload-nodegroup
vishalbollu Oct 8, 2019
fdc8201
Calculate allocatable resources more accurately
vishalbollu Oct 10, 2019
6f12aa9
Merge branch 'master' into separate-operator-workload-nodegroup
vishalbollu Nov 5, 2019
b0e0fa6
Separate nodegroups
vishalbollu Nov 12, 2019
12ee06f
Merge branch 'master' into separate-operator-workload-nodegroup
vishalbollu Nov 12, 2019
3268382
Add desired instances
vishalbollu Nov 12, 2019
8d4ea32
Minor cleanup
vishalbollu Nov 12, 2019
e952392
Remove debug statements
vishalbollu Nov 13, 2019
607c545
Merge branch 'master' into separate-operator-workload-nodegroup
vishalbollu Nov 13, 2019
351e68b
Remove more debugging helpers
vishalbollu Nov 13, 2019
58e4933
Reset go.mod
vishalbollu Nov 13, 2019
c56ca3e
Remove more echo statements
vishalbollu Nov 13, 2019
cdf862e
Remove unnecessary boto3 dependency
vishalbollu Nov 13, 2019
1f18d52
Address some PR comments and fix linting
vishalbollu Nov 13, 2019
f90f921
Remove InternalClusterConfig
deliahu Nov 13, 2019
2703944
Address more PR comments
vishalbollu Nov 14, 2019
bd24c1c
Separate internal cluster config
deliahu Nov 14, 2019
a8c16f4
Change cortex internal cluster path for dev to be in the dev directory
vishalbollu Nov 14, 2019
fad20f4
Update config.md docs
vishalbollu Nov 14, 2019
96005c1
Change config map key name
vishalbollu Nov 14, 2019
5848fbe
Remove outdated comment and minor refactor
vishalbollu Nov 14, 2019
d37914c
Fix formatting
deliahu Nov 14, 2019
acf0058
Update api_workload.go
deliahu Nov 14, 2019
19fe4ff
Update memory_capacity.go
deliahu Nov 14, 2019
3f9a62f
Update metrics-server.yaml
deliahu Nov 14, 2019
1c40e7e
Merge branch 'master' into separate-operator-workload-nodegroup
vishalbollu Nov 14, 2019
2 changes: 1 addition & 1 deletion cli/cmd/cluster.go
@@ -145,7 +145,7 @@ var infoCmd = &cobra.Command{
if err != nil {
errors.Exit(err, "/info", string(httpResponse))
}
fmt.Println(infoResponse.ClusterConfig.String())
fmt.Println(infoResponse.ClusterConfig.UserFacingString())
},
}

2 changes: 1 addition & 1 deletion cli/cmd/lib_cluster_config.go
@@ -93,7 +93,7 @@ var awsCredentialsPromptValidation = &cr.PromptValidation{
}

func readClusterConfigFile(clusterConfig *clusterconfig.ClusterConfig, awsCreds *AWSCredentials, path string) error {
errs := cr.ParseYAMLFile(clusterConfig, clusterconfig.Validation, path)
errs := cr.ParseYAMLFile(clusterConfig, clusterconfig.UserValidation, path)
if errors.HasErrors(errs) {
return errors.FirstError(errs...)
}
3 changes: 2 additions & 1 deletion cli/cmd/lib_manager.go
@@ -124,14 +124,15 @@ func runManagerCommand(entrypoint string, clusterConfig *clusterconfig.ClusterCo
if err != nil {
return "", errors.WithStack(err)
}

if err := files.WriteFile(clusterConfigBytes, cachedClusterConfigPath); err != nil {
return "", err
}

containerConfig := &container.Config{
Image: clusterConfig.ImageManager,
Entrypoint: []string{"/bin/bash", "-c"},
Cmd: []string{"sleep 0.1 && eval $(python /root/cluster_config_env.py /.cortex/cluster.yaml) && " + entrypoint},
Cmd: []string{"sleep 0.1 && eval $(python /root/instance_metadata.py /.cortex/cluster.yaml /.cortex/cluster_internal.yaml) && eval $(python /root/cluster_config_env.py /.cortex/cluster.yaml /.cortex/cluster_internal.yaml) && " + entrypoint},
Tty: true,
AttachStdout: true,
AttachStderr: true,
5 changes: 5 additions & 0 deletions dev/operator_local.sh
@@ -21,6 +21,11 @@ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. >/dev/null && pwd)"

export CORTEX_OPERATOR_IN_CLUSTER=false
export CORTEX_CLUSTER_CONFIG_PATH=$ROOT/dev/config/cluster.yaml
export CORTEX_INTERNAL_CLUSTER_CONFIG_PATH=$ROOT/dev/config/cluster_internal.yaml

pip3 install -r $ROOT/manager/requirements.txt

python3 $ROOT/manager/instance_metadata.py $CORTEX_CLUSTER_CONFIG_PATH $CORTEX_INTERNAL_CLUSTER_CONFIG_PATH

kill $(pgrep -f rerun) >/dev/null 2>&1 || true
updated_cli_config=$(cat $HOME/.cortex/default.json | jq '.cortex_url = "http://localhost:8888"') && echo $updated_cli_config > $HOME/.cortex/default.json
6 changes: 4 additions & 2 deletions docs/cluster/config.md
@@ -18,8 +18,10 @@ cortex_aws_secret_access_key: ***
# Instance type Cortex will use
instance_type: m5.large

# Minimum and maximum number of instances in the cluster
min_instances: 2
# Minimum number of worker instances in the cluster (must be >= 0)
min_instances: 1

# Maximum number of worker instances in the cluster (must be >= 1)
max_instances: 5

# Name of the S3 bucket Cortex will use
3 changes: 3 additions & 0 deletions images/manager/Dockerfile
@@ -4,8 +4,11 @@ WORKDIR /root

ENV PATH /root/.local/bin:$PATH

COPY manager/requirements.txt /root/requirements.txt

RUN pip install --upgrade pip && \
pip install awscli --upgrade --user && \
pip install -r /root/requirements.txt && \
rm -rf /root/.cache/pip*

RUN apk add --no-cache bash curl gettext jq openssl
11 changes: 5 additions & 6 deletions manager/cluster_config_env.py
@@ -15,10 +15,9 @@
import sys
import yaml

cluster_conifg_path = sys.argv[1]
for config_path in sys.argv[1:]:
with open(config_path, "r") as f:
config = yaml.safe_load(f)

with open(cluster_conifg_path, "r") as cluster_conifg_file:
cluster_conifg = yaml.safe_load(cluster_conifg_file)

for key, value in cluster_conifg.items():
print("export CORTEX_{}={}".format(key.upper(), value))
for key, value in config.items():
print("export CORTEX_{}={}".format(key.upper(), value))
39 changes: 36 additions & 3 deletions manager/eks.yaml
@@ -21,15 +21,48 @@ metadata:
version: "1.14"

nodeGroups:
- name: ng-1
- name: ng-cortex-operator
instanceType: t3.medium
minSize: 1
maxSize: 2
desiredCapacity: 1
ami: auto
iam:
withAddonPolicies:
autoScaler: true
tags:
k8s.io/cluster-autoscaler/enabled: 'true'
kubeletExtraConfig:
kubeReserved:
cpu: 150m
memory: 300Mi
ephemeral-storage: 1Gi
kubeReservedCgroup: /kube-reserved
systemReserved:
cpu: 150m
memory: 300Mi
ephemeral-storage: 1Gi
evictionHard:
memory.available: 200Mi
nodefs.available: 5%

- name: ng-cortex-worker
instanceType: $CORTEX_INSTANCE_TYPE
minSize: $CORTEX_MIN_INSTANCES
maxSize: $CORTEX_MAX_INSTANCES
desiredCapacity: $CORTEX_MIN_INSTANCES
desiredCapacity: $CORTEX_DESIRED_INSTANCES
ami: auto
iam:
withAddonPolicies:
autoScaler: true
tags:
k8s.io/cluster-autoscaler/enabled: 'true'
k8s.io/cluster-autoscaler/node-template/label/workload: 'true'
labels:
lifecycle: Ec2Spot
workload: "true"
taints:
workload: "true:NoSchedule"
kubeletExtraConfig:
kubeReserved:
cpu: 150m
@@ -41,5 +74,5 @@ nodeGroups:
memory: 300Mi
ephemeral-storage: 1Gi
evictionHard:
memory.available: 200Mi
memory.available: 200Mi
nodefs.available: 5%
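The ng-cortex-worker group is tainted with workload=true:NoSchedule, so only pods that tolerate that taint are scheduled onto worker nodes. A minimal sketch of a matching toleration using the Kubernetes Python client (the pod and image names are assumptions for illustration):

from kubernetes import client

# Toleration matching the workload=true:NoSchedule taint on ng-cortex-worker nodes.
toleration = client.V1Toleration(
    key="workload", operator="Equal", value="true", effect="NoSchedule"
)

# Hypothetical pod spec that can land on the tainted worker nodes.
pod_spec = client.V1PodSpec(
    containers=[client.V1Container(name="example-api", image="example:latest")],
    tolerations=[toleration],
    node_selector={"workload": "true"},
)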
82 changes: 82 additions & 0 deletions manager/eks_gpu.yaml
@@ -0,0 +1,82 @@
# Copyright 2019 Cortex Labs, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig

metadata:
name: $CORTEX_CLUSTER_NAME
region: $CORTEX_REGION
version: "1.14"

nodeGroups:
- name: ng-cortex-operator
instanceType: t3.medium
minSize: 1
maxSize: 2
desiredCapacity: 1
ami: auto
iam:
withAddonPolicies:
autoScaler: true
tags:
k8s.io/cluster-autoscaler/enabled: 'true'
kubeletExtraConfig:
kubeReserved:
cpu: 150m
memory: 300Mi
ephemeral-storage: 1Gi
kubeReservedCgroup: /kube-reserved
systemReserved:
cpu: 150m
memory: 300Mi
ephemeral-storage: 1Gi
evictionHard:
memory.available: 200Mi
nodefs.available: 5%

- name: ng-cortex-worker
instanceType: $CORTEX_INSTANCE_TYPE
minSize: $CORTEX_MIN_INSTANCES
maxSize: $CORTEX_MAX_INSTANCES
desiredCapacity: $CORTEX_DESIRED_INSTANCES
ami: auto
iam:
withAddonPolicies:
autoScaler: true
tags:
k8s.io/cluster-autoscaler/enabled: 'true'
k8s.io/cluster-autoscaler/node-template/label/nvidia.com/gpu: 'true'
k8s.io/cluster-autoscaler/node-template/taint/dedicated: nvidia.com/gpu=true
k8s.io/cluster-autoscaler/node-template/label/workload: 'true'
labels:
lifecycle: Ec2Spot
workload: "true"
nvidia.com/gpu: 'true'
taints:
nvidia.com/gpu: "true:NoSchedule"
workload: "true:NoSchedule"
kubeletExtraConfig:
kubeReserved:
cpu: 150m
memory: 300Mi
ephemeral-storage: 1Gi
kubeReservedCgroup: /kube-reserved
systemReserved:
cpu: 150m
memory: 300Mi
ephemeral-storage: 1Gi
evictionHard:
memory.available: 200Mi
nodefs.available: 5%
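The kubeReserved, systemReserved, and evictionHard settings in both node groups tie into the "Calculate allocatable resources more accurately" commit: Kubernetes reports node allocatable as capacity minus those reservations. A back-of-the-envelope sketch (the 8GiB capacity is an assumed example, not a value from this PR):

# allocatable = capacity - kubeReserved - systemReserved - evictionHard
capacity_mi = 8 * 1024      # assume an instance with ~8GiB of memory
kube_reserved_mi = 300      # kubeReserved.memory above
system_reserved_mi = 300    # systemReserved.memory above
eviction_hard_mi = 200      # evictionHard memory.available above

allocatable_mi = capacity_mi - kube_reserved_mi - system_reserved_mi - eviction_hard_mi
print("{}Mi allocatable for workloads".format(allocatable_mi))  # 7392Mi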
19 changes: 15 additions & 4 deletions manager/install.sh
@@ -32,8 +32,18 @@ function ensure_eks() {
exit 1
fi

if [ $CORTEX_MIN_INSTANCES -lt 1 ]; then
export CORTEX_DESIRED_INSTANCES=1
else
export CORTEX_DESIRED_INSTANCES=$CORTEX_MIN_INSTANCES
fi

echo -e "○ Spinning up the cluster ... (this will take about 15 minutes)\n"
envsubst < eks.yaml | eksctl create cluster -f -
if [ $CORTEX_INSTANCE_GPU -ne 0 ]; then
envsubst < eks_gpu.yaml | eksctl create cluster -f -
else
envsubst < eks.yaml | eksctl create cluster -f -
fi
echo -e "\n✓ Spun up the cluster"
return
fi
@@ -60,15 +70,15 @@ function ensure_eks() {
echo "✓ Cluster is running"

# Check if instance type changed
ng_info=$(eksctl get nodegroup --cluster=$CORTEX_CLUSTER_NAME --region=$CORTEX_REGION --name ng-1 -o json)
ng_instance_type=$(echo "$ng_info" | jq -r ".[] | select( .Cluster == \"$CORTEX_CLUSTER_NAME\" ) | select( .Name == \"ng-1\" ) | .InstanceType")
ng_info=$(eksctl get nodegroup --cluster=$CORTEX_CLUSTER_NAME --region=$CORTEX_REGION --name ng-cortex-worker -o json)
ng_instance_type=$(echo "$ng_info" | jq -r ".[] | select( .Cluster == \"$CORTEX_CLUSTER_NAME\" ) | select( .Name == \"ng-cortex-worker\" ) | .InstanceType")
if [ "$ng_instance_type" != "$CORTEX_INSTANCE_TYPE" ]; then
echo -e "\nerror: Cortex does not currently support changing the instance type of a running cluster; please run \`cortex cluster down\` followed by \`cortex cluster up\` to create a new cluster"
exit 1
fi

# Check for change in min/max instances
asg_info=$(aws autoscaling describe-auto-scaling-groups --region $CORTEX_REGION --query 'AutoScalingGroups[?contains(Tags[?Key==`alpha.eksctl.io/nodegroup-name`].Value, `ng-1`)]')
asg_info=$(aws autoscaling describe-auto-scaling-groups --region $CORTEX_REGION --query 'AutoScalingGroups[?contains(Tags[?Key==`alpha.eksctl.io/nodegroup-name`].Value, `ng-cortex-worker`)]')
asg_name=$(echo "$asg_info" | jq -r 'first | .AutoScalingGroupName')
asg_min_size=$(echo "$asg_info" | jq -r 'first | .MinSize')
asg_max_size=$(echo "$asg_info" | jq -r 'first | .MaxSize')
@@ -166,6 +176,7 @@ function setup_cloudwatch_logs() {
function setup_configmap() {
kubectl -n=cortex create configmap 'cluster-config' \
--from-file='cluster.yaml'='/.cortex/cluster.yaml' \
--from-file='cluster_internal.yaml'='/.cortex/cluster_internal.yaml' \
-o yaml --dry-run | kubectl apply -f - >/dev/null
}

91 changes: 91 additions & 0 deletions manager/instance_metadata.py
@@ -0,0 +1,91 @@
# Copyright 2019 Cortex Labs, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import requests
import sys
import re
import os
import pathlib
import json
import yaml

PRICING_ENDPOINT_TEMPLATE = (
"https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{}/index.json"
)


def download_metadata(cluster_config):
response = requests.get(PRICING_ENDPOINT_TEMPLATE.format(cluster_config["region"]))
offers = response.json()

instance_mapping = {}

for product_id, product in offers["products"].items():
if product.get("attributes") is None:
continue
if product["attributes"].get("servicecode") != "AmazonEC2":
continue
if product["attributes"].get("tenancy") != "Shared":
continue
if product["attributes"].get("operatingSystem") != "Linux":
continue
if product["attributes"].get("capacitystatus") != "Used":
continue
if product["attributes"].get("operation") != "RunInstances":
continue
price_dimensions = list(offers["terms"]["OnDemand"][product["sku"]].values())[0][
"priceDimensions"
]

price = list(price_dimensions.values())[0]["pricePerUnit"]["USD"]

instance_type = product["attributes"]["instanceType"]
metadata = {
"sku": product["sku"],
"instance_type": instance_type,
"cpu": int(product["attributes"]["vcpu"]),
"mem": int(
float(re.sub("[^0-9\\.]", "", product["attributes"]["memory"].split(" ")[0])) * 1024
),
"price": float(price),
}
if product["attributes"].get("gpu") is not None:
metadata["gpu"] = product["attributes"]["gpu"]
instance_mapping[instance_type] = metadata

return instance_mapping


def set_ec2_metadata(cluster_config_path, internal_cluster_config_path):
with open(cluster_config_path, "r") as f:
cluster_config = yaml.safe_load(f)
instance_mapping = download_metadata(cluster_config)
instance_metadata = instance_mapping[cluster_config["instance_type"]]

internal_cluster_config = {
"instance_mem": str(instance_metadata["mem"]) + "Mi",
"instance_cpu": str(instance_metadata["cpu"]),
"instance_gpu": int(instance_metadata.get("gpu", 0)),
}

with open(internal_cluster_config_path, "w") as f:
yaml.dump(internal_cluster_config, f)


def main():
set_ec2_metadata(sys.argv[1], sys.argv[2])


if __name__ == "__main__":
main()
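A rough sketch of how this script is used: the manager container (and dev/operator_local.sh) runs it against the user's cluster config to produce the internal config, which the operator reads for scheduling math. The arguments below mirror the manager invocation; the printed values assume an m5.large and are illustrative:

import subprocess
import yaml

# Generate the internal cluster config from the user-facing one.
subprocess.run(
    ["python", "instance_metadata.py", "/.cortex/cluster.yaml", "/.cortex/cluster_internal.yaml"],
    check=True,
)

# Inspect what was written.
with open("/.cortex/cluster_internal.yaml") as f:
    print(yaml.safe_load(f))
# e.g. {'instance_cpu': '2', 'instance_gpu': 0, 'instance_mem': '8192Mi'}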
5 changes: 2 additions & 3 deletions manager/manifests/cluster-autoscaler.yaml
@@ -143,12 +143,11 @@ spec:
- image: $CORTEX_IMAGE_CLUSTER_AUTOSCALER
name: cluster-autoscaler
resources:
limits:
cpu: 100m
memory: 300Mi
requests:
cpu: 100m
memory: 300Mi
limits:
memory: 300Mi
command:
- ./cluster-autoscaler
- --v=4