# Ray RLlib Training for Zelda Oracle of Seasons

This notebook deploys a Ray cluster on OpenShift/Kubernetes and submits a distributed training job.

Based on the Double Dragon KubeRay implementation.


In [None]:
!pip install codeflare-sdk


In [None]:
from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication
import os
import time


In [None]:
# Authenticate with OpenShift
# Get your token: oc whoami -t
# Get your server: oc cluster-info
#
# The token and server are read from the OPENSHIFT_TOKEN and OPENSHIFT_SERVER
# environment variables when those are set, so real credentials do not have to
# be saved inside the notebook. When a variable is unset, the original
# placeholder string is used and can still be replaced by hand.

auth = TokenAuthentication(
    token=os.environ.get('OPENSHIFT_TOKEN', 'YOUR_TOKEN_HERE'),  # Fallback: replace with: oc whoami -t
    server=os.environ.get('OPENSHIFT_SERVER', 'YOUR_SERVER_HERE'),  # Fallback: replace with: oc cluster-info
    skip_tls=False
)
auth.login()


In [None]:
# Ray cluster definition.
# The container image is the one built for the Double Dragon project; according
# to the note that accompanied this cell it is expected to already bundle Ray,
# PyTorch, PyBoy, CUDA and the other dependencies (not verifiable from here).

cluster_config = ClusterConfiguration(
    # Identity and scheduling
    name='zelda-rl',
    namespace='YOUR_NAMESPACE_HERE',  # Replace with your namespace
    local_queue="user-queue",
    write_to_file=False,

    # Shared image for head and workers
    image="quay.io/cnuland/dd-kuberay-worker:latest",

    # Head node sizing
    head_cpu_requests=10,
    head_cpu_limits=12,
    head_memory_requests=10,
    head_memory_limits=12,
    head_extended_resource_requests={'nvidia.com/gpu': 1},

    # Worker sizing (three workers)
    num_workers=3,
    worker_cpu_requests=12,
    worker_cpu_limits=16,
    worker_memory_requests=12,
    worker_memory_limits=20,
    worker_extended_resource_requests={'nvidia.com/gpu': 1},
)

# `cluster` is the handle the remaining cells operate on.
cluster = Cluster(cluster_config)


In [None]:
# Create the Ray cluster
# up() is called on the cluster object configured above, followed by
# wait_ready() with no timeout argument (presumably this blocks until the
# cluster is usable; confirm against the codeflare-sdk documentation).
cluster.up()
cluster.wait_ready()


In [None]:
# Get cluster details and submit job
clusterDetails = cluster.details()
print(f"Ray Dashboard URL: {clusterDetails.dashboard}")

client = cluster.job_client

# Environment variables handed to the job through runtime_env.
# Each S3 setting is taken from the notebook process environment when a
# variable of the same name is set there, so real credentials do not need to be
# written into the notebook. When a variable is unset, the original placeholder
# is used and can still be replaced by hand.
env_vars = {
    # S3 Storage (MinIO)
    'S3_ACCESS_KEY_ID': os.environ.get('S3_ACCESS_KEY_ID', 'YOUR_S3_KEY'),
    'S3_SECRET_ACCESS_KEY': os.environ.get('S3_SECRET_ACCESS_KEY', 'YOUR_S3_SECRET'),
    'S3_REGION_NAME': os.environ.get('S3_REGION_NAME', 'region'),
    'S3_ENDPOINT_URL': os.environ.get('S3_ENDPOINT_URL', 'YOUR_S3_ENDPOINT'),
    'S3_BUCKET_NAME': os.environ.get('S3_BUCKET_NAME', 'YOUR_BUCKET_NAME'),

    # LLM endpoint (assumes LLM service deployed in cluster)
    'LLM_ENDPOINT': 'http://llama4-scout-service:8000/v1/chat/completions',
}

# Submit training job
# Ray's working_dir uploads our Zelda code to all workers!
submission_id = client.submit_job(
    entrypoint="python run-ray-zelda.py",
    runtime_env={
        "env_vars": env_vars,
        'working_dir': './',  # Uploads our Zelda code, configs, and ROMs
        'pip': [],  # No extra packages requested; the image is expected to provide them
        # Patterns left out of the working_dir upload
        "excludes": ["*.sh", "*.ipynb", "*.md", "__pycache__", "*.pyc", "checkpoints/", "training_runs/"],
    }
)

# Plain string: there is nothing to interpolate in this message.
print("\n✅ Job submitted successfully!")
print(f"Submission ID: {submission_id}")
print(f"Monitor at: {clusterDetails.dashboard}")


In [None]:
# Poll the submitted job until it reaches a finished state, then dump its logs.
from ray.job_submission import JobStatus

# States after which polling stops.
finished_states = (JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.STOPPED)

status = client.get_job_status(submission_id)
print(f"Job status: {status}")

while status not in finished_states:
    time.sleep(30)  # Wait 30 seconds between checks
    status = client.get_job_status(submission_id)
    print(f"Job status: {status}")

print(f"\n📊 Final job logs:\n{client.get_job_logs(submission_id)}")


In [None]:
# Get job logs (anytime)
# Requests the logs for `submission_id` from the job client, keeps them in
# `logs`, and prints them. Requires the job submission cell to have been run.
logs = client.get_job_logs(submission_id)
print(logs)


In [None]:
# Clean up: uncomment the line below and run this cell once training is complete
# cluster.down()
