
Commit 6834c97
Revert "[autoscaler][kubernetes] Ray client setup, example config sim…
Browse files Browse the repository at this point in the history
…plification, example scripts. (ray-project#13920)"

This reverts commit a4226a4.
fishbone committed Feb 16, 2021
1 parent 61a5735 commit 6834c97
Showing 18 changed files with 392 additions and 788 deletions.
2 changes: 1 addition & 1 deletion python/ray/autoscaler/_private/commands.py
@@ -149,7 +149,7 @@ def create_or_update_cluster(
redirect_command_output: Optional[bool] = False,
use_login_shells: bool = True,
no_monitor_on_head: bool = False) -> Dict[str, Any]:
"""Creates or updates an autoscaling Ray cluster from a config json."""
"""Create or updates an autoscaling Ray cluster from a config json."""
# no_monitor_on_head is an internal flag used by the Ray K8s operator.
# If True, prevents autoscaling config sync to the Ray head during cluster
# creation. See https://github.com/ray-project/ray/pull/13720.
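
For orientation, `create_or_update_cluster` is the function that backs the `ray up` CLI, which is also how the Kubernetes `defaults.yaml` below is consumed. A minimal usage sketch, assuming a config file name that is illustrative and not part of this commit:

    # Launch or update the autoscaling cluster described by a config file.
    ray up -y my-k8s-cluster.yaml

    # Attach a shell to the head node pod (relies on screen, per the image notes in defaults.yaml).
    ray attach my-k8s-cluster.yaml

    # Tear the cluster down when finished.
    ray down -y my-k8s-cluster.yaml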
237 changes: 148 additions & 89 deletions python/ray/autoscaler/kubernetes/defaults.yaml
@@ -1,8 +1,12 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: defaults
# An unique identifier for the head node and workers of this cluster.
cluster_name: default

# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0

# The maximum number of workers nodes to launch in addition to the head
# node.
# node. This takes precedence over min_workers.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
@@ -74,83 +78,127 @@ provider:
# NOTE: If you're running multiple Ray clusters with services
# on one Kubernetes cluster, they must have unique service
# names.
name: example-cluster-ray-head
name: ray-head
spec:
# This selector must match the head node pod's selector below.
selector:
component: example-cluster-ray-head
component: ray-head
ports:
- protocol: TCP
port: 8000
targetPort: 8000

# Service that maps to the worker nodes of the Ray cluster.
- apiVersion: v1
kind: Service
metadata:
# NOTE: If you're running multiple Ray clusters with services
# on one Kubernetes cluster, they must have unique service
# names.
name: ray-workers
spec:
# This selector must match the worker node pods' selector below.
selector:
component: ray-worker
ports:
- name: client
protocol: TCP
port: 10001
targetPort: 10001
- name: dashboard
protocol: TCP
port: 8265
targetPort: 8265

# Specify the pod type for the ray head node (as configured below).
head_node_type: head_node
# Specify the allowed pod types for this ray cluster and the resources they provide.
available_node_types:
worker_node:
# Minimum number of Ray workers of this Pod type.
min_workers: 0
# Maximum number of Ray workers of this Pod type. Takes precedence over min_workers.
max_workers: 2
node_config:
apiVersion: v1
kind: Pod
metadata:
- protocol: TCP
port: 8000
targetPort: 8000

# Kubernetes pod config for the head node pod.
head_node:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: example-cluster-ray-worker-
spec:
generateName: ray-head-

# Must match the head node service selector above if a head node
# service is required.
labels:
component: ray-head
spec:
# Change this if you altered the autoscaler_service_account above
# or want to provide your own.
serviceAccountName: autoscaler

# Restarting the head node automatically is not currently supported.
# If the head node goes down, `ray up` must be run again.
restartPolicy: Never

# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
medium: Memory

containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
# - screen (used for `ray attach`)
# - kubectl (used by the autoscaler to manage worker pods)
image: rayproject/ray:nightly
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ["trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 6379 # Redis port.
- containerPort: 6380 # Redis port.
- containerPort: 6381 # Redis port.
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.

# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: 1000m
memory: 512Mi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 512Mi
head_node:
node_config:
apiVersion: v1
kind: Pod
metadata:
requests:
cpu: 1000m
memory: 512Mi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 2Gi
env:
# This is used in the head_start_ray_commands below so that
# Ray can spawn the correct number of processes. Omitting this
# may lead to degraded performance.
- name: MY_CPU_REQUEST
valueFrom:
resourceFieldRef:
resource: requests.cpu

# Kubernetes pod config for worker node pods.
worker_nodes:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: example-cluster-ray-head-
# Must match the head node service selector above if a head node
generateName: ray-worker-

# Must match the worker node service selector above if a worker node
# service is required.
labels:
component: example-cluster-ray-head
spec:
# Change this if you altered the autoscaler_service_account above
# or want to provide your own.
serviceAccountName: autoscaler
component: ray-worker
spec:
serviceAccountName: default

# Worker nodes will be managed automatically by the head node, so
# do not change the restart policy.
restartPolicy: Never

# This volume allocates shared memory for Ray to use for its plasma
@@ -159,51 +207,45 @@ available_node_types:
volumes:
- name: dshm
emptyDir:
medium: Memory
medium: Memory

containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
image: rayproject/ray:nightly
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ['trap : TERM INT; sleep infinity & wait;']
args: ["trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 6379 # Redis port
- containerPort: 10001 # Used by Ray Client
- containerPort: 8265 # Used by Ray Dashboard
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.

# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: 1000m
memory: 512Mi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 512Mi


# Command to start ray on the head node. You don't need to change this.
# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379
requests:
cpu: 1000m
memory: 512Mi
limits:
# This memory limit will be detected by ray and split into
# 30% for plasma, and 70% for workers.
memory: 2Gi
env:
# This is used in the head_start_ray_commands below so that
# Ray can spawn the correct number of processes. Omitting this
# may lead to degraded performance.
- name: MY_CPU_REQUEST
valueFrom:
resourceFieldRef:
resource: requests.cpu

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@@ -224,6 +266,16 @@ cluster_synced_files: []
# should sync to the worker node continuously
file_mounts_sync_continuously: False

# Patterns for files to exclude when running rsync up or rsync down.
# This is not supported on kubernetes.
# rsync_exclude: []

# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
# This is not supported on kubernetes.
# rsync_filter: []


# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
@@ -239,6 +291,13 @@ head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

head_node: {}
# Command to start ray on the head node. You don't need to change this.
# Note webui-host is set to 0.0.0.0 so that kubernetes can port forward.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0

worker_nodes: {}
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
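
Both variants of the head start command above bind the dashboard to 0.0.0.0 precisely so that Kubernetes can port-forward to it. A rough sketch of reaching the dashboard locally, where the namespace and pod name are placeholders rather than values taken from this config:

    # List pods to find the head pod created from the generateName prefix (placeholder namespace).
    kubectl -n <namespace> get pods

    # Forward the dashboard port to the local machine; Ray's dashboard listens on 8265 by default.
    kubectl -n <namespace> port-forward <head-pod-name> 8265:8265

    # The dashboard is then reachable at http://localhost:8265.

On the memory comments above: with the 2Gi limit in the worker spec and the 30%/70% split described there, roughly 600Mi would go to the plasma object store and the remaining ~1.4Gi to worker processes.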