In [1]:
import sys
sys.path.append("/mnt/code")
from utils import ddl_cluster_scaling_client
import json


In [2]:
result = ddl_cluster_scaling_client.get_auth_headers()
print(result['Authorization'][0:20])

Bearer eyJhbGciOiJSU


## Define Cluster Kind

For ray clusters use this section

In [3]:
cluster_kind = "rayclusters"
#cluster_kind = "sparkclusters"
#cluster_kind = "daskclusters"

## Get Service Status

In [4]:
print(ddl_cluster_scaling_client.is_cluster_auto_scaler_healthy())

(200, {'status': 'healthy'})


### Get the cluster status

In [5]:
j = ddl_cluster_scaling_client.get_cluster_status(cluster_kind=cluster_kind)
print('Cluster status')
print(json.dumps(j['status'], indent=2, sort_keys=True, ensure_ascii=False))

print('Cluster Autoscaling status')
print(json.dumps(j['spec']['autoscaling'], indent=2, sort_keys=True, ensure_ascii=False))

print('Cluster worker replicas')
print(j['spec']['worker']['replicas'])

ray-68e51495a23687447b1644d7
http://ddl-cluster-scaler-svc.domino-field.svc.cluster.local/ddl_cluster_scaler/cluster/rayclusters/ray-68e51495a23687447b1644d7
Status code 200
Cluster status
{
  "clusterStatus": "Running",
  "nodes": [
    "ray-68e51495a23687447b1644d7-ray-head-0",
    "ray-68e51495a23687447b1644d7-ray-worker-0"
  ],
  "startTime": "2025-10-07T13:26:10Z",
  "workerReplicas": 1,
  "workerSelector": "app.kubernetes.io/component=worker,app.kubernetes.io/instance=ray-68e51495a23687447b1644d7,app.kubernetes.io/name=ray"
}
Cluster Autoscaling status
{
  "maxReplicas": 5,
  "minReplicas": 1
}
Cluster worker replicas
1


### Scale the cluster up using HW tier of your choice

Now scale up the cluster. You need to provide two parameters:
1. `head_hw_tier_name` - This is optional. You use it to override the HW Tier used to start the cluster for the head node
2. `worker_hw_tier_name` - This is optional. You use it to override the HW Tier used to start the cluster for the workers
3. `replicas` - This is the number of desired workers. It needs to be less than the `max_workers` for this cluster or it is capped to the 
`max_workers`

In [6]:
worker_hw_tier_name="Small"
#worker_hw_tier_name="GPU (small)"

j = ddl_cluster_scaling_client.scale_cluster(cluster_kind=cluster_kind,worker_hw_tier_name=worker_hw_tier_name, replicas=4)
json.dumps(j, indent=2, sort_keys=True, ensure_ascii=False)
#json.dumps(j, indent=2, sort_keys=True, ensure_ascii=False)
#print(j['restarted_ts'])
#restart_ts=j['restarted_ts']

4
Status code 200


'{\n  "capped": false,\n  "effective_replicas": 4,\n  "kind": "rayclusters",\n  "maxReplicas": 5,\n  "name": "ray-68e51495a23687447b1644d7",\n  "object": {\n    "apiVersion": "distributed-compute.dominodatalab.com/v1alpha1",\n    "kind": "RayCluster",\n    "metadata": {\n      "creationTimestamp": "2025-10-07T13:24:40Z",\n      "finalizers": [\n        "distributed-compute.dominodatalab.com/finalizer"\n      ],\n      "generation": 2,\n      "labels": {\n        "app.kubernetes.io/instance": "ray-68e51495a23687447b1644d7",\n        "app.kubernetes.io/name": "raycluster",\n        "dominodatalab.com/billing-tag": "navy",\n        "dominodatalab.com/execution-id": "68e51495a23687447b1644d7",\n        "dominodatalab.com/project-id": "68b08bbfdc77613e66e6364e",\n        "dominodatalab.com/project-name": "ddl-end-to-end-demo",\n        "dominodatalab.com/project-owner-id": "68ac9b59ea374e2b81c48d75",\n        "dominodatalab.com/project-owner-username": "wadkars",\n        "dominodatalab.com

In [8]:
ddl_cluster_scaling_client.wait_until_scaling_complete(cluster_kind=cluster_kind)

Waiting 10 seconds to give the cluster some time to scale before we start polling
ray-68dfd90a7a54b913b98e346d
http://ddl-cluster-scaler-svc.domino-field.svc.cluster.local/ddl_cluster_scaler/restart_status/rayclusters/ray-68dfd90a7a54b913b98e346d/worker
Status code 200
{'desired_replicas': 4, 'evaluated_with': 'statefulset_pods', 'namespace': 'domino-compute', 'node_type': 'worker', 'ok': False, 'oldestCreationTimestamp': '2025-10-03T14:09:17+00:00', 'ready_equals_running': True, 'ready_pods': 3, 'running_pods': 3, 'statefulset': 'ray-68dfd90a7a54b913b98e346d-ray-worker', 'status': 'restarted_but_counts_mismatch'}
Scaling not yet done...
ray-68dfd90a7a54b913b98e346d
http://ddl-cluster-scaler-svc.domino-field.svc.cluster.local/ddl_cluster_scaler/restart_status/rayclusters/ray-68dfd90a7a54b913b98e346d/worker
Status code 200
{'desired_replicas': 4, 'evaluated_with': 'statefulset_pods', 'namespace': 'domino-compute', 'node_type': 'worker', 'ok': False, 'oldestCreationTimestamp': '2025-10-0

True

### Optionally restart the head node

With certain clusters (Ray) the api based scaling does not cause the head node UI to correctly reflect the number of workers even though
the cluster utilizes all the workers. It is safe to restart the head code as a matter of hygiene. Once restarted it correctly
reflect the correct number of workers.

In [7]:
j = ddl_cluster_scaling_client.restart_head_node(cluster_kind=cluster_kind,head_hw_tier_name="Medium")
print(j)

Status code 200
{'head_hw_tier_id': 'medium-k8s', 'kind': 'rayclusters', 'name': 'ray-68e51495a23687447b1644d7', 'object': {}, 'restarted_ts': '2025-10-07T13:30:21Z'}


In [8]:
ddl_cluster_scaling_client.wait_until_head_restart_complete(cluster_kind=cluster_kind)

Waiting 10 seconds to give the head node some time to scale before we start polling
ray-68e51495a23687447b1644d7
http://ddl-cluster-scaler-svc.domino-field.svc.cluster.local/ddl_cluster_scaler/restart_status/rayclusters/ray-68e51495a23687447b1644d7/head
Status code 200
{'desired_replicas': 1, 'evaluated_with': 'statefulset_pods', 'namespace': 'domino-compute', 'node_type': 'head', 'ok': False, 'oldestCreationTimestamp': '2025-10-07T13:30:30+00:00', 'ready_equals_running': True, 'ready_pods': 0, 'running_pods': 0, 'statefulset': 'ray-68e51495a23687447b1644d7-ray-head', 'status': 'restarted_but_counts_mismatch'}
Head restart not yet...
ray-68e51495a23687447b1644d7
http://ddl-cluster-scaler-svc.domino-field.svc.cluster.local/ddl_cluster_scaler/restart_status/rayclusters/ray-68e51495a23687447b1644d7/worker
Status code 200
{'desired_replicas': 4, 'evaluated_with': 'statefulset_pods', 'namespace': 'domino-compute', 'node_type': 'worker', 'ok': False, 'oldestCreationTimestamp': '2025-10-07T13

True

### Use the cluster

Having fully scaled the cluster to your requirements, use your cluster. Run your hyper-parameter tuning, Deepspeed based LLM
fine tuning, large Spark queries.

### Scale Down Cluster

Having finished your work, scale down the cluster. You need to provide two parameters:
1. `worker_hw_tier_name` - This is optional. You use it to override the HW Tier used to scale the cluster. It is recommended. Use the smallest hw_tier permitted for this cluster type. Domino does not allow you to scale down your cluster to 0. This capability is provided so that if you have used an expensive HW tier for your worker nodes (Ex. GPU Tiers), when you scale down the cluster to 1 worker you can avoid paying for the expensive tier.
 
2. `replicas` - This is the number of desired workers. It needs to be more than or equal to the `min_workers` for this cluster 
or it is capped to `min_workers`. Always start your cluster so that minimum workers is 1.

In [None]:
j = ddl_cluster_scaling_client.scale_cluster(cluster_kind=cluster_kind,worker_hw_tier_name="Small", replicas=1)
json.dumps(j, indent=2, sort_keys=True, ensure_ascii=False)


In [None]:
ddl_cluster_scaling_client.wait_until_scaling_complete(cluster_kind=cluster_kind)

### Optionally restart the head node again

This is a good practice in general

In [None]:
j = ddl_cluster_scaling_client.restart_head_node(cluster_kind=cluster_kind,head_hw_tier_name="Small")

In [None]:
ddl_cluster_scaling_client.wait_until_scaling_complete(cluster_kind=cluster_kind)

### Check current status of head and worker nodes


In [None]:
from datetime import datetime, timezone

# current UTC datetime in ISO 8601 format with Zulu suffix
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
print(current_time)

In [None]:

ddl_cluster_scaling_client.get_cluster_restart_status(cluster_kind = "rayclusters",node_type ="worker")

In [None]:
ddl_cluster_scaling_client.get_cluster_restart_status(cluster_kind = "rayclusters",node_type ="head")