# Ray RLlib Training for Zelda Oracle of Seasons

This notebook deploys a Ray cluster on OpenShift/Kubernetes and submits a distributed training job.

Based on the Double Dragon KubeRay implementation.


In [None]:
!pip install codeflare-sdk


In [None]:
# Updated imports for newer codeflare_sdk versions
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
from codeflare_sdk.cluster.auth import TokenAuthentication
import os
import time


In [None]:
# Authenticate with OpenShift.
#
# Credentials are read from environment variables so that a live token never
# has to be saved inside the notebook:
#   export OPENSHIFT_TOKEN=$(oc whoami -t)
#   export OPENSHIFT_SERVER=$(oc whoami --show-server)   # or see: oc cluster-info
#
# If a variable is not set, the placeholder string is passed through unchanged
# (same as before) and must be replaced by hand before running this cell.
auth = TokenAuthentication(
    token=os.environ.get('OPENSHIFT_TOKEN', 'YOUR_TOKEN_HERE'),
    server=os.environ.get('OPENSHIFT_SERVER', 'YOUR_SERVER_HERE'),
    skip_tls=False,  # keep TLS verification enabled
)
auth.login()


In [None]:
# List the keyword arguments that the installed ClusterConfiguration accepts,
# together with their defaults. Useful because the accepted parameter names
# differ between codeflare-sdk releases.
import inspect

sig = inspect.signature(ClusterConfiguration.__init__)
print("Available ClusterConfiguration parameters:")
for param_name, param in sig.parameters.items():
    if param_name == 'self':
        continue
    # Parameter.empty is a sentinel marking "no default"; test it by identity
    # so the check never depends on how the default value implements `!=`.
    default = "REQUIRED" if param.default is inspect.Parameter.empty else param.default
    print(f"  {param_name}: {default}")


In [None]:
# Ray cluster definition.
# The container image is the one built for the Double Dragon project; per the
# original notes it already ships Ray, PyTorch, PyBoy and the other
# dependencies, and the PyBoy emulator runs on CPU so no GPUs are requested.
# NOTE(review): the original comments state that the CPU and memory limits
# apply to both the head node and the workers - confirm against the installed
# codeflare-sdk version (see the parameter listing printed above).
cluster_settings = dict(
    name='zelda-rl',
    namespace='zelda-hybrid-rl-llm',
    num_workers=3,
    # CPU request / limit
    min_cpus=8,
    max_cpus=8,
    # Memory request / limit, in GB
    min_memory=12,
    max_memory=16,
    # CPU-only training
    num_gpus=0,
    # Reused Double Dragon worker image
    image="quay.io/cnuland/dd-kuberay-worker:latest",
)

cluster = Cluster(ClusterConfiguration(**cluster_settings))


In [None]:
# Create the Ray cluster
# Requests the cluster defined in the configuration cell, then waits on it.
# NOTE(review): wait_ready() presumably blocks this cell until the cluster
# reports ready - confirm its timeout behaviour in the codeflare-sdk docs.
cluster.up()
cluster.wait_ready()


In [None]:
# Look up the running cluster and submit the training job to it.
clusterDetails = cluster.details()
print(f"Ray Dashboard URL: {clusterDetails.dashboard}")

client = cluster.job_client

# Environment variables handed to the job through Ray's runtime_env.
# Each value is taken from the local environment when a variable of the same
# name is set there, so real S3 credentials never have to be written into the
# notebook; otherwise the placeholder / default below is used unchanged.
env_defaults = {
    # S3 Storage (MinIO)
    'S3_ACCESS_KEY_ID': 'YOUR_S3_KEY',
    'S3_SECRET_ACCESS_KEY': 'YOUR_S3_SECRET',
    'S3_REGION_NAME': 'region',
    'S3_ENDPOINT_URL': 'YOUR_S3_ENDPOINT',
    'S3_BUCKET_NAME': 'YOUR_BUCKET_NAME',

    # LLM endpoint (assumes LLM service deployed in cluster)
    'LLM_ENDPOINT': 'http://llama4-scout-service:8000/v1/chat/completions',
}
env_vars = {key: os.environ.get(key, default) for key, default in env_defaults.items()}

# Submit training job.
# working_dir makes Ray upload the current directory (Zelda code, configs and
# ROMs) to the cluster; "excludes" keeps scripts, notebooks, docs, bytecode and
# previous run artefacts out of that upload.
submission_id = client.submit_job(
    entrypoint="python run-ray-zelda.py",
    runtime_env={
        "env_vars": env_vars,
        'working_dir': './',
        'pip': [],  # nothing extra to install; the image is expected to provide all dependencies
        "excludes": ["*.sh", "*.ipynb", "*.md", "__pycache__", "*.pyc", "checkpoints/", "training_runs/"],
    }
)

print("\n✅ Job submitted successfully!")
print(f"Submission ID: {submission_id}")
print(f"Monitor at: {clusterDetails.dashboard}")


In [None]:
# Poll the submitted job until it finishes, then print its logs.
from ray.job_submission import JobStatus

# Statuses that end the polling loop.
finished_states = (JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.STOPPED)
poll_seconds = 30  # Check every 30 seconds

status = client.get_job_status(submission_id)
print(f"Job status: {status}")
while status not in finished_states:
    time.sleep(poll_seconds)
    status = client.get_job_status(submission_id)
    print(f"Job status: {status}")

print(f"\n📊 Final job logs:\n{client.get_job_logs(submission_id)}")


In [None]:
# Get job logs (anytime)
# Fetches the logs for the job identified by `submission_id` via the job
# client created in the submission cell, and prints them. Can be re-run while
# the job is still in progress.
logs = client.get_job_logs(submission_id)
print(logs)


In [None]:
# Clean up (when training is complete)
# cluster.down()
