Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ ci-build-images:
@./build/build-image.sh images/argo-executor argo-executor
@./build/build-image.sh images/python-packager python-packager
@./build/build-image.sh images/cluster-autoscaler cluster-autoscaler
@./build/build-image.sh images/nvidia nvidia
@./build/build-image.sh images/metrics-server metrics-server

ci-push-images:
Expand All @@ -155,6 +156,7 @@ ci-push-images:
@./build/push-image.sh argo-executor
@./build/push-image.sh python-packager
@./build/push-image.sh cluster-autoscaler
@./build/push-image.sh nvidia
@./build/push-image.sh metrics-server


Expand Down
2 changes: 2 additions & 0 deletions cortex.sh
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ export CORTEX_IMAGE_PYTHON_PACKAGER="${CORTEX_IMAGE_PYTHON_PACKAGER:-cortexlabs/
export CORTEX_IMAGE_TF_SERVE_GPU="${CORTEX_IMAGE_TF_SERVE_GPU:-cortexlabs/tf-serve-gpu:$CORTEX_VERSION_STABLE}"
export CORTEX_IMAGE_TF_TRAIN_GPU="${CORTEX_IMAGE_TF_TRAIN_GPU:-cortexlabs/tf-train-gpu:$CORTEX_VERSION_STABLE}"
export CORTEX_IMAGE_CLUSTER_AUTOSCALER="${CORTEX_IMAGE_CLUSTER_AUTOSCALER:-cortexlabs/cluster-autoscaler:$CORTEX_VERSION_STABLE}"
export CORTEX_IMAGE_NVIDIA="${CORTEX_IMAGE_NVIDIA:-cortexlabs/nvidia:$CORTEX_VERSION_STABLE}"
export CORTEX_IMAGE_METRICS_SERVER="${CORTEX_IMAGE_METRICS_SERVER:-cortexlabs/metrics-server:$CORTEX_VERSION_STABLE}"

export CORTEX_ENABLE_TELEMETRY="${CORTEX_ENABLE_TELEMETRY:-""}"
Expand Down Expand Up @@ -182,6 +183,7 @@ function install_cortex() {
-e CORTEX_IMAGE_TF_SERVE_GPU=$CORTEX_IMAGE_TF_SERVE_GPU \
-e CORTEX_IMAGE_TF_TRAIN_GPU=$CORTEX_IMAGE_TF_TRAIN_GPU \
-e CORTEX_IMAGE_CLUSTER_AUTOSCALER=$CORTEX_IMAGE_CLUSTER_AUTOSCALER \
-e CORTEX_IMAGE_NVIDIA=$CORTEX_IMAGE_NVIDIA \
-e CORTEX_IMAGE_METRICS_SERVER=$CORTEX_IMAGE_METRICS_SERVER \
-e CORTEX_ENABLE_TELEMETRY=$CORTEX_ENABLE_TELEMETRY \
$CORTEX_IMAGE_MANAGER
Expand Down
45 changes: 0 additions & 45 deletions dev/eks.sh

This file was deleted.

163 changes: 0 additions & 163 deletions dev/kops.sh

This file was deleted.

2 changes: 2 additions & 0 deletions dev/registry.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ function create_registry() {
aws ecr create-repository --repository-name=cortexlabs/tf-train-gpu --region=$REGISTRY_REGION || true
aws ecr create-repository --repository-name=cortexlabs/tf-serve-gpu --region=$REGISTRY_REGION || true
aws ecr create-repository --repository-name=cortexlabs/cluster-autoscaler --region=$REGISTRY_REGION || true
aws ecr create-repository --repository-name=cortexlabs/nvidia --region=$REGISTRY_REGION || true
aws ecr create-repository --repository-name=cortexlabs/metrics-server --region=$REGISTRY_REGION || true
}

Expand Down Expand Up @@ -139,6 +140,7 @@ elif [ "$cmd" = "update" ]; then
build_and_push $ROOT/images/tf-serve-gpu tf-serve-gpu latest
build_and_push $ROOT/images/python-packager python-packager latest
build_and_push $ROOT/images/cluster-autoscaler cluster-autoscaler latest
build_and_push $ROOT/images/nvidia nvidia latest
build_and_push $ROOT/images/metrics-server metrics-server latest
fi

Expand Down
4 changes: 3 additions & 1 deletion docs/apis/compute.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,6 @@ One unit of memory is one byte. Memory can be expressed as an integer or by usin

## GPU

One unit of GPU corresponds to one virtual GPU on AWS. Fractional requests are not allowed. Here's some information on [adding GPU enabled nodes on EKS](https://docs.aws.amazon.com/en_ca/eks/latest/userguide/gpu-ami.html).
1. Make sure your AWS account is subscribed to the [EKS-optimized AMI with GPU Support](https://aws.amazon.com/marketplace/pp/B07GRHFXGM).
2. Set the `CORTEX_NODE_TYPE` environment variable to an AWS GPU instance type (e.g. `p2.xlarge`) before installing Cortex.
3. Note that one unit of GPU corresponds to one virtual GPU on AWS; fractional GPU requests are not allowed.
1 change: 1 addition & 0 deletions docs/cluster/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ export CORTEX_IMAGE_TF_TRAIN_GPU="cortexlabs/tf-train-gpu:master"
export CORTEX_IMAGE_TF_SERVE_GPU="cortexlabs/tf-serve-gpu:master"
export CORTEX_IMAGE_PYTHON_PACKAGER="cortexlabs/python-packager:master"
export CORTEX_IMAGE_CLUSTER_AUTOSCALER="cortexlabs/cluster-autoscaler:master"
export CORTEX_IMAGE_NVIDIA="cortexlabs/nvidia:master"
export CORTEX_IMAGE_METRICS_SERVER="cortexlabs/metrics-server:master"

# Flag to enable collecting error reports and usage stats. If flag is not set to either "true" or "false", you will be prompted.
Expand Down
1 change: 1 addition & 0 deletions docs/cluster/development.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ export CORTEX_IMAGE_TF_TRAIN_GPU="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/corte
export CORTEX_IMAGE_TF_TRANSFORM="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/tf-transform:latest"
export CORTEX_IMAGE_PYTHON_PACKAGER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/python-packager:latest"
export CORTEX_IMAGE_CLUSTER_AUTOSCALER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/cluster-autoscaler:latest"
export CORTEX_IMAGE_NVIDIA="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/nvidia:latest"
export CORTEX_IMAGE_METRICS_SERVER="XXXXXXXX.dkr.ecr.us-west-2.amazonaws.com/cortexlabs/metrics-server:latest"

export AWS_ACCESS_KEY_ID="XXXXXX"
Expand Down
1 change: 1 addition & 0 deletions images/nvidia/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# NVIDIA Kubernetes device plugin image (advertises node GPUs to the kubelet).
# Re-published under the cortexlabs registry (see Makefile ci-build-images /
# ci-push-images and dev/registry.sh) so Cortex pins the image source and tag.
FROM nvidia/k8s-device-plugin:1.0.0-beta
1 change: 1 addition & 0 deletions manager/install_cortex.sh
Original file line number Diff line number Diff line change
Expand Up @@ -169,5 +169,6 @@ envsubst < manifests/fluentd.yaml | kubectl apply -f - >/dev/null
envsubst < manifests/operator.yaml | kubectl apply -f - >/dev/null
envsubst < manifests/cluster-autoscaler.yaml | kubectl apply -f - >/dev/null
envsubst < manifests/metrics-server.yaml | kubectl apply -f - >/dev/null
envsubst < manifests/nvidia.yaml | kubectl apply -f - >/dev/null

validate_cortex
3 changes: 2 additions & 1 deletion manager/install_eks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ eksctl create cluster --name=$CORTEX_CLUSTER \
--node-type=$CORTEX_NODE_TYPE \
--nodes-min=$CORTEX_NODES_MIN \
--nodes-max=$CORTEX_NODES_MAX \
--version=1.13 \
--node-ami=auto \
--version=1.11 \
--asg-access

echo -e "\n✓ Spun up the cluster"
56 changes: 56 additions & 0 deletions manager/manifests/nvidia.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Copyright 2019 Cortex Labs, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Source: https://github.com/NVIDIA/k8s-device-plugin/blob/1.0.0-beta/nvidia-device-plugin.yml

# NOTE: extensions/v1beta1 is kept (rather than apps/v1) because the cluster is
# provisioned at Kubernetes 1.11 (see manager/install_eks.sh).
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      # Mark this pod as a critical add-on; when enabled, the critical add-on scheduler
      # reserves resources for critical add-on pods so that they can be rescheduled after
      # a failure. This annotation works in tandem with the toleration below.
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        name: nvidia-device-plugin-ds
    spec:
      tolerations:
        # Allow this pod to be rescheduled while the node is in "critical add-ons only" mode.
        # This, along with the annotation above marks this pod as a critical add-on.
        - key: CriticalAddonsOnly
          operator: Exists
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      containers:
        # $CORTEX_IMAGE_NVIDIA is substituted by envsubst in manager/install_cortex.sh.
        - image: $CORTEX_IMAGE_NVIDIA
          name: nvidia-device-plugin-ctr
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        # The device-plugin socket directory must be shared with the host kubelet.
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
5 changes: 4 additions & 1 deletion manager/uninstall_operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER --region=$CORTEX_REGION | g
echo -e "\nUninstalling the Cortex operator ..."

kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true deployment operator >/dev/null 2>&1
kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true daemonset fluentd >/dev/null 2>&1 # Pods in DaemonSets cannot be modified

# Pods in DaemonSets cannot be modified
kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true daemonset fluentd >/dev/null 2>&1
kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true daemonset nvidia-device-plugin-daemonset >/dev/null 2>&1

echo "✓ Uninstalled the Cortex operator"