In [1]:
import sys
sys.path.append("/mnt/code")
from utils import ddl_cluster_scaling_client
import json


In [2]:
# Sanity check: fetch the auth headers from the scaling client and print only
# the first 20 characters of the Authorization value (not the whole header).
result = ddl_cluster_scaling_client.get_auth_headers()
authorization_prefix = result['Authorization'][:20]
print(authorization_prefix)

Bearer eyJhbGciOiJSU


## Define Cluster Kind

Select the cluster kind by leaving exactly one of the lines below uncommented. Ray clusters are selected by default.

In [3]:
# Cluster kind passed to every ddl_cluster_scaling_client call in this notebook.
# Keep exactly one assignment active; the alternatives are listed below.
cluster_kind: str = "rayclusters"
# cluster_kind = "sparkclusters"
# cluster_kind = "daskclusters"

### Get the cluster status

In [None]:
# Fetch the cluster resource once, then show the three parts of interest:
# the reported status, the autoscaling spec, and the worker replica count.
j = ddl_cluster_scaling_client.get_cluster_status(cluster_kind=cluster_kind)

# Each heading and its value are emitted on separate lines via sep='\n'.
print(
    'Cluster status',
    json.dumps(j['status'], indent=2, sort_keys=True, ensure_ascii=False),
    sep='\n',
)

print(
    'Cluster Autoscaling status',
    json.dumps(j['spec']['autoscaling'], indent=2, sort_keys=True, ensure_ascii=False),
    sep='\n',
)

print('Cluster worker replicas', j['spec']['worker']['replicas'], sep='\n')

### Scale the cluster up using HW tier of your choice

Now scale up the cluster. You can provide three parameters:
1. `head_hw_tier_name` - Optional. Use it to override the HW tier that was used to start the cluster's head node.
2. `worker_hw_tier_name` - Optional. Use it to override the HW tier that was used to start the cluster's workers.
3. `replicas` - The number of desired workers. It must not exceed `max_workers` for this cluster; a larger value is capped to `max_workers`.

In [None]:
# HW tier to use for the worker nodes while the cluster is scaled up.
worker_hw_tier_name = "Small"
#worker_hw_tier_name = "GPU (small)"

# Request the scale-up; the response is a JSON-serialisable object that
# includes the 'restarted_ts' timestamp read below.
j = ddl_cluster_scaling_client.scale_cluster(
    cluster_kind=cluster_kind,
    worker_hw_tier_name=worker_hw_tier_name,
    replicas=3,
)

# Actually display the response: a bare json.dumps(...) that is not the last
# expression of a cell is evaluated and silently discarded.
print(json.dumps(j, indent=2, sort_keys=True, ensure_ascii=False))

# Keep the timestamp so the wait cell can tell this scaling operation apart
# from earlier ones.
restart_ts = j['restarted_ts']
print(restart_ts)

In [None]:
# Wait for the scaling operation identified by `restart_ts` to complete.
ddl_cluster_scaling_client.wait_until_scaling_complete(
    cluster_kind=cluster_kind,
    scale_start_ts=restart_ts,
)

### Optionally restart the head node

With certain clusters (Ray), API-based scaling does not cause the head node UI to correctly reflect the number of workers, even though
the cluster utilizes all the workers. It is safe to restart the head node as a matter of hygiene. Once restarted, it
reflects the correct number of workers.

In [None]:
# Restart the head node on the "Small" HW tier and remember the 'started_at'
# value from the response for the wait cell that follows.
j = ddl_cluster_scaling_client.restart_head_node(
    cluster_kind=cluster_kind,
    head_hw_tier_name="Small",
)
restarts_at = j['started_at']
print(restarts_at)

In [None]:
# Wait for the head-node restart identified by `restarts_at` to complete.
ddl_cluster_scaling_client.wait_until_head_restart_complete(
    cluster_kind=cluster_kind,
    restart_ts=restarts_at,
)

### Use the cluster

Having fully scaled the cluster to your requirements, use your cluster: run your hyper-parameter tuning, DeepSpeed-based LLM
fine-tuning, or large Spark queries.

### Scale Down Cluster

Having finished your work, scale down the cluster. You need to provide two parameters:
1. `worker_hw_tier_name` - Optional, but recommended. Use it to override the HW tier used to scale the cluster, and choose the smallest hw_tier permitted for this cluster type. Domino does not allow you to scale down your cluster to 0 workers. This capability is provided so that, if you have used an expensive HW tier for your worker nodes (e.g. GPU tiers), you can avoid paying for the expensive tier when you scale down the cluster to 1 worker.
 
2. `replicas` - This is the number of desired workers. It needs to be more than or equal to the `min_workers` for this cluster 
or it is capped to `min_workers`. Always start your cluster so that minimum workers is 1.

In [None]:
# Scale the cluster down to a single worker on the "Medium" HW tier.
j = ddl_cluster_scaling_client.scale_cluster(
    cluster_kind=cluster_kind,
    worker_hw_tier_name="Medium",
    replicas=1,
)

# Actually display the response: a bare json.dumps(...) that is not the last
# expression of a cell is evaluated and silently discarded.
print(json.dumps(j, indent=2, sort_keys=True, ensure_ascii=False))

# Keep the timestamp of this scale-down so later wait calls can refer to it.
restart_ts = j['restarted_ts']

In [5]:
# Pass the timestamp of the scale-down request, as the scale-up wait does.
# Without it the wait has no reference point for *this* operation and can
# report success for pods left over from an earlier scaling operation.
ddl_cluster_scaling_client.wait_until_scaling_complete(
    cluster_kind=cluster_kind,
    scale_start_ts=restart_ts,
)

ray-68d6b04e212c1d2df121426e
http://ddl-cluster-scaler-svc.domino-field.svc.cluster.local/ddl_cluster_scaler/restart_status/rayclusters/ray-68d6b04e212c1d2df121426e/worker
Status code 200
{'desired_replicas': 3, 'evaluated_with': 'statefulset_pods', 'namespace': 'domino-compute', 'node_type': 'worker', 'ok': True, 'oldestCreationTimestamp': '2025-09-26T18:23:39+00:00', 'ready_equals_running': True, 'ready_pods': 3, 'running_pods': 3, 'statefulset': 'ray-68d6b04e212c1d2df121426e-ray-worker', 'status': 'restarted_and_ready_counts_ok'}


True

### Optionally restart the head node again

This is a good practice in general

In [None]:
# Restart the head node on the "Small" HW tier once more and keep the
# 'started_at' value from the response.
j = ddl_cluster_scaling_client.restart_head_node(
    cluster_kind=cluster_kind,
    head_hw_tier_name="Small",
)
restarts_at = j['started_at']
print(restarts_at)

In [None]:
# The preceding cell restarted the head node, so wait for that head restart
# (identified by its `restarts_at` timestamp) rather than for worker scaling.
ddl_cluster_scaling_client.wait_until_head_restart_complete(
    cluster_kind=cluster_kind,
    restart_ts=restarts_at,
)

### Check current status of head and worker nodes


In [6]:
from datetime import datetime, timezone

# Current UTC time, formatted as ISO 8601 with whole seconds and a "Z" suffix.
current_time = format(datetime.now(tz=timezone.utc), "%Y-%m-%dT%H:%M:%SZ")
print(current_time)

2025-09-26T18:42:20Z


In [8]:

# Query the restart status of the worker nodes. Use the notebook-wide
# `cluster_kind` selection instead of a hard-coded "rayclusters", so this
# cell also works when Spark or Dask is selected.
ddl_cluster_scaling_client.get_cluster_restart_status(cluster_kind=cluster_kind, node_type="worker")

ray-68d6b04e212c1d2df121426e
http://ddl-cluster-scaler-svc.domino-field.svc.cluster.local/ddl_cluster_scaler/restart_status/rayclusters/ray-68d6b04e212c1d2df121426e/worker
Status code 200


{'desired_replicas': 3,
 'evaluated_with': 'statefulset_pods',
 'namespace': 'domino-compute',
 'node_type': 'worker',
 'ok': True,
 'oldestCreationTimestamp': '2025-09-26T18:23:39+00:00',
 'ready_equals_running': True,
 'ready_pods': 3,
 'running_pods': 3,
 'statefulset': 'ray-68d6b04e212c1d2df121426e-ray-worker',
 'status': 'restarted_and_ready_counts_ok'}

In [None]:
# Query the restart status of the head node. Use the notebook-wide
# `cluster_kind` selection instead of a hard-coded "rayclusters", so this
# cell also works when Spark or Dask is selected.
ddl_cluster_scaling_client.get_cluster_restart_status(cluster_kind=cluster_kind, node_type="head")