
Commit 6834c97
Revert "[autoscaler][kubernetes] Ray client setup, example config sim…
Browse files Browse the repository at this point in the history
…plification, example scripts. (ray-project#13920)"

This reverts commit a4226a4.
fishbone committed Feb 16, 2021
1 parent 61a5735 commit 6834c97
Showing 18 changed files with 392 additions and 788 deletions.
2 changes: 1 addition & 1 deletion python/ray/autoscaler/_private/commands.py
@@ -149,7 +149,7 @@ def create_or_update_cluster(
redirect_command_output: Optional[bool] = False,
use_login_shells: bool = True,
no_monitor_on_head: bool = False) -> Dict[str, Any]:
"""Creates or updates an autoscaling Ray cluster from a config json."""
"""Create or updates an autoscaling Ray cluster from a config json."""
# no_monitor_on_head is an internal flag used by the Ray K8s operator.
# If True, prevents autoscaling config sync to the Ray head during cluster
# creation. See https://github.com/ray-project/ray/pull/13720.
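
For orientation, `create_or_update_cluster` is the function that backs the `ray up` CLI, which is also how the Kubernetes `defaults.yaml` below is consumed. A minimal usage sketch, assuming a config file name that is illustrative and not part of this commit:

    # Launch or update the autoscaling cluster described by a config file.
    ray up -y my-k8s-cluster.yaml

    # Attach a shell to the head node pod (relies on screen, per the image notes in defaults.yaml).
    ray attach my-k8s-cluster.yaml

    # Tear the cluster down when finished.
    ray down -y my-k8s-cluster.yaml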
237 changes: 148 additions & 89 deletions python/ray/autoscaler/kubernetes/defaults.yaml
@@ -1,8 +1,12 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: defaults
# An unique identifier for the head node and workers of this cluster.
cluster_name: default

# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0

# The maximum number of workers nodes to launch in addition to the head
# node.
# node. This takes precedence over min_workers.
max_workers: 2

# The autoscaler will scale up the cluster faster with higher upscaling speed.
@@ -74,83 +78,127 @@ provider:
# NOTE: If you're running multiple Ray clusters with services
# on one Kubernetes cluster, they must have unique service
# names.
name: example-cluster-ray-head
name: ray-head
spec:
# This selector must match the head node pod's selector below.
selector:
component: example-cluster-ray-head
component: ray-head
ports:
- protocol: TCP
port: 8000
targetPort: 8000

# Service that maps to the worker nodes of the Ray cluster.
- apiVersion: v1
kind: Service
metadata:
# NOTE: If you're running multiple Ray clusters with services
# on one Kubernetes cluster, they must have unique service
# names.
name: ray-workers
spec:
# This selector must match the worker node pods' selector below.
selector:
component: ray-worker
ports:
- name: client
protocol: TCP
port: 10001
targetPort: 10001
- name: dashboard
protocol: TCP
port: 8265
targetPort: 8265

# Specify the pod type for the ray head node (as configured below).
head_node_type: head_node
# Specify the allowed pod types for this ray cluster and the resources they provide.
available_node_types:
worker_node:
# Minimum number of Ray workers of this Pod type.
min_workers: 0
# Maximum number of Ray workers of this Pod type. Takes precedence over min_workers.
max_workers: 2
node_config:
apiVersion: v1
kind: Pod
metadata:
- protocol: TCP
port: 8000
targetPort: 8000

# Kubernetes pod config for the head node pod.
head_node:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: example-cluster-ray-worker-
spec:
generateName: ray-head-

# Must match the head node service selector above if a head node
# service is required.
labels:
component: ray-head
spec:
# Change this if you altered the autoscaler_service_account above
# or want to provide your own.
serviceAccountName: autoscaler

# Restarting the head node automatically is not currently supported.
# If the head node goes down, `ray up` must be run again.
restartPolicy: Never

# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumes:
- name: dshm
emptyDir:
medium: Memory
medium: Memory

containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
# - screen (used for `ray attach`)
# - kubectl (used by the autoscaler to manage worker pods)
image: rayproject/ray:nightly
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ["trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 6379 # Redis port.
- containerPort: 6380 # Redis port.
- containerPort: 6381 # Redis port.
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.

# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: 1000m
memory: 512Mi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 512Mi
head_node:
node_config:
apiVersion: v1
kind: Pod
metadata:
requests:
cpu: 1000m
memory: 512Mi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 2Gi
env:
# This is used in the head_start_ray_commands below so that
# Ray can spawn the correct number of processes. Omitting this
# may lead to degraded performance.
- name: MY_CPU_REQUEST
valueFrom:
resourceFieldRef:
resource: requests.cpu

# Kubernetes pod config for worker node pods.
worker_nodes:
apiVersion: v1
kind: Pod
metadata:
# Automatically generates a name for the pod with this prefix.
generateName: example-cluster-ray-head-
# Must match the head node service selector above if a head node
generateName: ray-worker-

# Must match the worker node service selector above if a worker node
# service is required.
labels:
component: example-cluster-ray-head
spec:
# Change this if you altered the autoscaler_service_account above
# or want to provide your own.
serviceAccountName: autoscaler
component: ray-worker
spec:
serviceAccountName: default

# Worker nodes will be managed automatically by the head node, so
# do not change the restart policy.
restartPolicy: Never

# This volume allocates shared memory for Ray to use for its plasma
@@ -159,51 +207,45 @@ available_node_types:
volumes:
- name: dshm
emptyDir:
medium: Memory
medium: Memory

containers:
- name: ray-node
imagePullPolicy: Always
# You are free (and encouraged) to use your own container image,
# but it should have the following installed:
# - rsync (used for `ray rsync` commands and file mounts)
image: rayproject/ray:nightly
# Do not change this command - it keeps the pod alive until it is
# explicitly killed.
command: ["/bin/bash", "-c", "--"]
args: ['trap : TERM INT; sleep infinity & wait;']
args: ["trap : TERM INT; sleep infinity & wait;"]
ports:
- containerPort: 6379 # Redis port
- containerPort: 10001 # Used by Ray Client
- containerPort: 8265 # Used by Ray Dashboard
- containerPort: 12345 # Ray internal communication.
- containerPort: 12346 # Ray internal communication.

# This volume allocates shared memory for Ray to use for its plasma
# object store. If you do not provide this, Ray will fall back to
# /tmp which cause slowdowns if is not a shared memory volume.
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: 1000m
memory: 512Mi
limits:
# The maximum memory that this pod is allowed to use. The
# limit will be detected by ray and split to use 10% for
# redis, 30% for the shared memory object store, and the
# rest for application memory. If this limit is not set and
# the object store size is not set manually, ray will
# allocate a very large object store in each pod that may
# cause problems for other pods.
memory: 512Mi


# Command to start ray on the head node. You don't need to change this.
# Note dashboard-host is set to 0.0.0.0 so that kubernetes can port forward.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379
requests:
cpu: 1000m
memory: 512Mi
limits:
# This memory limit will be detected by ray and split into
# 30% for plasma, and 70% for workers.
memory: 2Gi
env:
# This is used in the head_start_ray_commands below so that
# Ray can spawn the correct number of processes. Omitting this
# may lead to degraded performance.
- name: MY_CPU_REQUEST
valueFrom:
resourceFieldRef:
resource: requests.cpu

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
@@ -224,6 +266,16 @@ cluster_synced_files: []
# should sync to the worker node continuously
file_mounts_sync_continuously: False

# Patterns for files to exclude when running rsync up or rsync down.
# This is not supported on kubernetes.
# rsync_exclude: []

# Pattern files to use for filtering out files when running rsync up or rsync down. The file is searched for
# in the source directory and recursively through all subdirectories. For example, if .gitignore is provided
# as a value, the behavior will match git's behavior for finding and using .gitignore files.
# This is not supported on kubernetes.
# rsync_filter: []


# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
@@ -239,6 +291,13 @@ head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

head_node: {}
# Command to start ray on the head node. You don't need to change this.
# Note webui-host is set to 0.0.0.0 so that kubernetes can port forward.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --num-cpus=$MY_CPU_REQUEST --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host 0.0.0.0

worker_nodes: {}
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
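
Both variants of the head start command above bind the dashboard to 0.0.0.0 precisely so that Kubernetes can port-forward to it. A rough sketch of reaching the dashboard locally, where the namespace and pod name are placeholders rather than values taken from this config:

    # List pods to find the head pod created from the generateName prefix (placeholder namespace).
    kubectl -n <namespace> get pods

    # Forward the dashboard port to the local machine; Ray's dashboard listens on 8265 by default.
    kubectl -n <namespace> port-forward <head-pod-name> 8265:8265

    # The dashboard is then reachable at http://localhost:8265.

On the memory comments above: with the 2Gi limit in the worker spec and the 30%/70% split described there, roughly 600Mi would go to the plasma object store and the remaining ~1.4Gi to worker processes.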