In [None]:
# Import pieces from codeflare-sdk
from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication

In [None]:
# Build the token-based authentication object and log in with it.
# The "XXXXX" values are placeholders: fill in your own token and server,
# and avoid committing a real token together with the notebook.
auth = TokenAuthentication(
    token="XXXXX",
    server="XXXXX",
    skip_tls=False,
)
auth.login()

Here, we want to define our cluster by specifying the resources we require for our batch workload. Below, we define our cluster object (which generates a corresponding RayCluster).

In [None]:
# Create and configure our cluster object.
# Keyword arguments are grouped by role; their values are what the
# batch workload in this notebook asks for.
cluster = Cluster(
    ClusterConfiguration(
        # Identity, image and queueing
        name="raytest",
        image="quay.io/rhoai/ray:2.35.0-py311-rocm61-torch24-fa26",
        local_queue="local-queue-name",  # Specify the local queue manually
        write_to_file=False,
        # Head node resources
        head_cpus=1,
        head_memory=4,
        head_gpus=0,  # For GPU enabled workloads set the head_gpus and num_gpus
        # Worker pool sizing (min_*/max_* are presumably the per-worker
        # requests/limits -- confirm against the SDK documentation)
        num_workers=2,
        num_gpus=0,
        min_cpus="250m",
        max_cpus=1,
        min_memory=2,
        max_memory=2,
    )
)

In [None]:
# Bring up the cluster, then call wait_ready() so that the following
# cells only run once that call has returned
cluster.up()
cluster.wait_ready()

In [None]:
# Show the details of the cluster object configured earlier in this notebook
cluster.details()

### Ray Job Submission

* Initialize the Cluster Job Client
* Provide an entrypoint command that runs your job script
* Set up your runtime environment

In [None]:
# Initialize the Job Submission Client
# (read from the cluster object; every job cell below goes through `client`)
client = cluster.job_client

In [None]:
# Submit the MNIST job using the Job Submission Client.
# The entrypoint runs mnist_fashion.py; runtime_env sets the working
# directory to the current directory and points pip at requirements.txt.
submission_id = client.submit_job(
    entrypoint="python mnist_fashion.py",
    runtime_env={
        "working_dir": "./",
        "pip": "requirements.txt",
    },
)
# Keep the returned ID: the cells that follow use it to query this job
print(submission_id)

In [None]:
# Get the job's logs, identified by the submission_id returned by submit_job
client.get_job_logs(submission_id)

In [None]:
# Get the job's status for the same submission_id
client.get_job_status(submission_id)

In [None]:
# Get job-related info for the same submission_id
client.get_job_info(submission_id)

In [None]:
# List all existing jobs (this call is not filtered by submission_id)
client.list_jobs()

In [None]:
# Iterate through the logs of a job, printing each piece as it arrives.
# NOTE: a top-level `async for` only works because Jupyter/IPython cells
# support top-level async code (autoawait); in a plain Python script this
# loop would have to live inside a coroutine.
async for lines in client.tail_job_logs(submission_id):
    # end="" avoids adding an extra newline after each printed piece
    print(lines, end="")

In [None]:
# Please shut down the cluster if you are done with the job. 