In [None]:
pip install codeflare-sdk codeflare-torchx

In [None]:
from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication, RayJobClient, Authentication
import os
import sys

In [None]:
# Authenticate the CodeFlare SDK
# On OpenShift, you can retrieve the token by running `oc whoami -t`,
# and the server with `oc cluster-info`.
auth = TokenAuthentication(
    token = 'sha256~fXiyTta4BqDo6fpQf1VRAJ60LABQtz2p0blvpORnfuE',
    server = 'https://api.rosa-wswzx.e1i3.p3.openshiftapps.com:443',
    skip_tls=False
)
auth.login()

In [None]:
# Configure the Ray cluster
cluster = Cluster(ClusterConfiguration(
    head_cpu_requests=4,
    head_cpu_limits=8,
    head_memory_requests=4,
    head_memory_limits=8,
    name='dd-rl',
    namespace='hello-chris-ai-4',
    num_workers=1,
    worker_cpu_requests=4,
    worker_cpu_limits=4,
    worker_memory_requests=4,
    worker_memory_limits=4,
    # Use the following parameters with NVIDIA GPUs
    image="quay.io/cnuland/dd-kuberay-worker:latest",
    head_extended_resource_requests={'nvidia.com/gpu':1},
    worker_extended_resource_requests={'nvidia.com/gpu':1},
    write_to_file=False,
    local_queue="user-queue"
))

In [None]:
# Create the Ray cluster
cluster.up()

In [None]:
cluster.wait_ready()

In [None]:
cluster.details()

In [None]:
clusterDetails = cluster.details()

In [60]:
# Create or connect to the Ray cluster
cluster.status()
print(f"Ray Dashboard URL: {clusterDetails.dashboard}")


# Initialize RayJobClient
# The address for RayJobClient should be the Ray head service, often exposed via a route or service URL.
# CodeFlare SDK's `cluster.details()` might provide this, or `cluster.cluster_uri()` for internal access.
# For external access (like from your notebook to submit job), you might need the Ray dashboard or a client service URL.
# Let's assume client can connect to where jobs are submitted (usually Ray head/dashboard)
# If your notebook runs outside the OpenShift cluster, you might need an external route to Ray head.
# For simplicity, using cluster_uri() which is often for internal connections; adapt if needed.
# A common pattern is to use the dashboard URL for the RayJobClient if it's accessible.
try:
    # Attempt connection to Ray head, may need adjustment based on network setup
    # For jobs, it's often `address="ray://<head_service_host>:<port>"`
    # If dashboard_url is http://...:8265, client address is often the same host but port 10001 for ray client.
    # For KubeRay, the service for Ray client is often 'raycluster-kuberay-head-svc'.
    # The dashboard_url() from codeflare might be what you need directly or with minor modification.
    
    # A common approach is to use the dashboard URL if it's correctly pointing to the Ray head service endpoint.
    # However, RayJobClient expects a Ray Client server address (ray://) or HTTP proxy.
    # If your 'cluster.dashboard_url()' provides an HTTP URL for the dashboard,
    # Ray Client can connect via HTTP Proxy if enabled (Ray >= 1.13).
    # ray_client_address = "http://" + cluster.dashboard_url().split("://")[1] # Simplified, assumes standard port or proxy setup
    
    # It's safer to get the specific client connection string if available.
    # Let's assume Codeflare SDK handles this abstraction or you can derive it.
    # If running locally and port-forwarding Ray head's client port (default 10001):
    # client_address = "ray://localhost:10001"
    # When submitting from the environment that can reach the Ray head:
    #client_address = clusterDetails.dashboard # Check if this works directly or needs adjustment for RayJobClient
                                         # It may need to be like "http://<host>:<dashboard_port>"
    client_address = clusterDetails.dashboard
    print(f"Attempting to connect RayJobClient to: {client_address}")
    client = cluster.job_client


except Exception as e:
    print(f"Error initializing RayJobClient: {e}")
    print("Please ensure the Ray client server is accessible from where you are running this notebook.")
    print("You might need to set up port-forwarding or use an external route to the Ray head service.")
    client = None # Ensure client is None if connection fails

    
if client:
    print("Submitting Ray job...")

    try:
        submission_id = client.submit_job(
            entrypoint="python run-ray-dd.py",
            runtime_env={
                'working_dir': './',
                'pip': ["boto3"],
                "excludes": ["*.sh", "*.ipynb", "*.md"]
            },
        )
        print(f"Job submitted successfully! Submission ID: {submission_id}")
        print("You can monitor the job status using client.get_job_status(submission_id)")
        print("And logs using client.get_job_logs(submission_id)")

        # Example: Poll for job status
        # import time
        while True:
            status = client.get_job_status(submission_id)
            print(f"Job status: {status}")
            if status in [JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.STOPPED]:
                break
            time.sleep(5)
            print(f"Final job logs:\n{client.get_job_logs(submission_id)}")

    except Exception as e:
        print(f"Error submitting job: {e}")
else:
    print("RayJobClient not initialized. Cannot submit job.")

2025-05-19 00:18:55,581	INFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_bb69c7e70453ab7b.zip.
2025-05-19 00:18:55,582	INFO packaging.py:575 -- Creating a file package for local module './'.


Ray Dashboard URL: https://ray-dashboard-dd-rl-hello-chris-ai-4.apps.rosa.rosa-wswzx.e1i3.p3.openshiftapps.com
Attempting to connect RayJobClient to: https://ray-dashboard-dd-rl-hello-chris-ai-4.apps.rosa.rosa-wswzx.e1i3.p3.openshiftapps.com
Submitting Ray job...
Job submitted successfully! Submission ID: raysubmit_hxJeKwUd2rCDGz4b
You can monitor the job status using client.get_job_status(submission_id)
And logs using client.get_job_logs(submission_id)
Job status: PENDING
Error submitting job: name 'JobStatus' is not defined


In [None]:
client.list_jobs()

In [None]:
client.stop_job("06000000")

In [None]:
cluster.down()