# Ray RLlib Training for Zelda Oracle of Seasons

This notebook deploys a Ray cluster on OpenShift/Kubernetes and submits a distributed training job.

Based on the Double Dragon KubeRay implementation.


## ⚠️ RBAC Setup Required

**Before running this notebook**, you need to grant RBAC permissions to the service account.

Run these commands in a terminal:

```bash
# Run from the root of your local clone (adjust this path to match your machine)
cd /Users/cnuland/hello-chris-rl-llm-zelda

# Apply RBAC permissions
oc apply -f ops/openshift/rbac.yaml

# Verify permissions
oc auth can-i list rayclusters --as=system:serviceaccount:zelda-hybrid-rl-llm:zelda-rl-training -n zelda-hybrid-rl-llm
oc auth can-i create rayclusters --as=system:serviceaccount:zelda-hybrid-rl-llm:zelda-rl-training -n zelda-hybrid-rl-llm
```

Both `oc auth can-i` checks should print `yes` ✅

**Then proceed with the cells below.**


In [None]:
# Install the CodeFlare SDK used by the cells below (notebook shell escape).
# NOTE(review): the version is unpinned, while later cells import from
# `codeflare_sdk.cluster.cluster` — confirm the installed release still exposes that path.
!pip install codeflare-sdk


In [None]:
# Updated imports for newer codeflare_sdk versions
from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration
from codeflare_sdk.cluster.auth import TokenAuthentication
from ray.job_submission import JobSubmissionClient, JobStatus
import os
import time
import subprocess


In [None]:
# Log in to the OpenShift API with a bearer token.
#   token:  output of `oc whoami -t`
#   server: API server URL reported by `oc cluster-info`
openshift_credentials = {
    "token": 'YOUR_TOKEN_HERE',    # Replace with: oc whoami -t
    "server": 'YOUR_SERVER_HERE',  # Replace with: oc cluster-info
    "skip_tls": False,
}

auth = TokenAuthentication(**openshift_credentials)
auth.login()


In [None]:
# List every constructor parameter ClusterConfiguration accepts, together with
# its default value, so the configuration cell can be matched to the installed SDK.
import inspect

sig = inspect.signature(ClusterConfiguration.__init__)
print("Available ClusterConfiguration parameters:")
for param_name, param in sig.parameters.items():
    if param_name == 'self':
        continue
    # `Parameter.empty` is a sentinel: compare by identity so a default whose
    # type overrides `!=` can neither be misreported nor raise.
    default = "REQUIRED" if param.default is inspect.Parameter.empty else param.default
    print(f"  {param_name}: {default}")


In [None]:
# Configure the Ray cluster.
# Reuses the existing Double Dragon worker image; the PyBoy emulator runs on CPU,
# so no GPUs are requested. Requests are kept small so the pods can be scheduled
# on whatever nodes are available.

# Sizing knobs, defined once so the summary printed below is computed from the
# same numbers that are sent to the cluster instead of being maintained by hand.
NUM_WORKERS = 2
CPU_REQUEST, CPU_LIMIT = 1, 2              # CPUs per pod: request, limit
MEMORY_REQUEST_GB, MEMORY_LIMIT_GB = 2, 4  # GB per pod: request, limit
# NOTE(review): mirrors the "3 per worker" figure from the original summary; the
# real value is presumably set by the training script — confirm.
GAMES_PER_WORKER = 3

cluster = Cluster(ClusterConfiguration(
    name='zelda-rl',
    namespace='zelda-hybrid-rl-llm',
    num_workers=NUM_WORKERS,

    # CPU/memory (applies to both head and workers)
    min_cpus=CPU_REQUEST,
    max_cpus=CPU_LIMIT,
    min_memory=MEMORY_REQUEST_GB,
    max_memory=MEMORY_LIMIT_GB,

    # No GPUs needed - PyBoy runs on CPU
    num_gpus=0,

    # Existing Double Dragon image
    image="quay.io/cnuland/dd-kuberay-worker:latest",
))

# One head pod plus the workers; every pod uses the same request/limit pair.
total_pods = NUM_WORKERS + 1

print("✅ Cluster configuration created:")
print(f"   Name: {cluster.config.name}")
print(f"   Namespace: {cluster.config.namespace}")
print(f"   Workers: {cluster.config.num_workers}")
print(f"   CPUs per pod: {cluster.config.min_cpus}-{cluster.config.max_cpus}")
print(f"   Memory per pod: {cluster.config.min_memory}-{cluster.config.max_memory} GB")
print(f"   Image: {cluster.config.image}")
print(f"   Total pods: {total_pods} (1 head + {NUM_WORKERS} workers)")
print(
    f"   Total resources: {CPU_REQUEST * total_pods}-{CPU_LIMIT * total_pods} CPUs, "
    f"{MEMORY_REQUEST_GB * total_pods}-{MEMORY_LIMIT_GB * total_pods} GB RAM"
)
print(f"   Parallel games: {GAMES_PER_WORKER * NUM_WORKERS} ({GAMES_PER_WORKER} per worker)")
print("\n✅ Ultra-minimal resources - should fit on any available nodes!")
print("✅ Kueue queues (zelda-ray-queue → ray-cluster-queue) are ready!")


In [None]:
# Bring the Ray cluster up (or attach to an existing one), then wait until it is usable
from codeflare_sdk.cluster.cluster import CodeFlareClusterStatus

print("🚀 Creating/connecting to Ray cluster...")
cluster.up()

# The status result is handled both as a bare state and as a tuple whose first item is the state
print("\n📊 Checking cluster status...")
status_info = cluster.status()
if isinstance(status_info, tuple):
    cluster_state = status_info[0]
else:
    cluster_state = status_info

print(f"Cluster state: {cluster_state}")

# Wait only while the cluster is still coming up; every other state is just reported
if cluster_state == CodeFlareClusterStatus.READY:
    print("✅ Cluster is already READY!")
elif cluster_state not in (CodeFlareClusterStatus.STARTING, CodeFlareClusterStatus.UNKNOWN):
    print(f"✅ Cluster status: {cluster_state}")
else:
    try:
        print("\n⏳ Waiting for cluster to be ready (max 10 minutes)...")
        cluster.wait_ready(timeout=600)
        print("✅ Cluster is ready!")
    except Exception as e:
        # Best effort: report the problem and dump what is known about the cluster
        print(f"⚠️ Error waiting for cluster: {e}")
        print("\n📋 Cluster details:")
        print(cluster.details())

# Closing summary, framed by a 60-character rule
separator = "=" * 60
print("\n" + separator)
print("📊 FINAL CLUSTER STATUS:")
print(separator)
cluster.details()


In [None]:
# Diagnostic: inspect the Ray cluster deployment and look for image pull problems
import subprocess


def _oc(*args):
    """Run `oc` with the given arguments (10 s timeout).

    Returns the command's stdout when it exits with code 0, otherwise its stderr.
    """
    completed = subprocess.run(
        ["oc", *args],
        capture_output=True, text=True, timeout=10
    )
    if completed.returncode == 0:
        return completed.stdout
    return completed.stderr


print("🔍 DIAGNOSTIC: Checking Ray cluster deployment...\n")

try:
    # RayCluster custom resources in the namespace
    print("1️⃣ RayCluster resource:")
    print(_oc("get", "raycluster", "-n", "zelda-hybrid-rl-llm"))

    # Pods in the namespace
    print("\n2️⃣ Pods in namespace:")
    print(_oc("get", "pods", "-n", "zelda-hybrid-rl-llm"))

    # Warning events, sorted by last timestamp
    print("\n3️⃣ Recent events (looking for ImagePullBackOff errors):")
    events = _oc(
        "get", "events", "-n", "zelda-hybrid-rl-llm",
        "--sort-by=.lastTimestamp", "--field-selector=type=Warning",
    )
    if events.strip():
        print(events)
    else:
        print("No warning events found")

    # Names of secrets whose type is kubernetes.io/dockerconfigjson
    print("\n4️⃣ Image pull secrets in namespace:")
    secrets = _oc(
        "get", "secrets", "-n", "zelda-hybrid-rl-llm",
        "-o", "jsonpath={.items[?(@.type==\"kubernetes.io/dockerconfigjson\")].metadata.name}",
    )
    if secrets.strip():
        print(secrets)
    else:
        print("⚠️  No dockerconfigjson secrets found!")

except Exception as e:
    # Any failure (including a timeout) stops the remaining checks
    print(f"❌ Error running diagnostic commands: {e}")

# Static remediation hints, printed regardless of what the checks found
print("\n".join([
    "\n" + "="*60,
    "💡 IMAGE PULL FIX:",
    "="*60,
    "If you see 'ImagePullBackOff' errors, you need to:",
    "1. Check if quay.io/cnuland/dd-kuberay-worker:latest exists",
    "2. If private, create image pull secret:",
    "   oc create secret docker-registry quay-pull-secret \\",
    "     --docker-server=quay.io \\",
    "     --docker-username=YOUR_USERNAME \\",
    "     --docker-password=YOUR_PASSWORD \\",
    "     -n zelda-hybrid-rl-llm",
    "3. Then add to ClusterConfiguration:",
    "   image_pull_secrets=['quay-pull-secret']",
]))


In [None]:
# Get cluster details and submit the training job
from ray.job_submission import JobSubmissionClient

clusterDetails = cluster.details()
print(f"Ray Dashboard URL: {clusterDetails.dashboard}")

# Build the in-cluster head-service URI from the cluster configuration so it
# cannot drift from the name/namespace the cluster was defined with.
ray_cluster_uri = f"ray://{cluster.config.name}-head-svc.{cluster.config.namespace}.svc:10001"
ray_dashboard_url = clusterDetails.dashboard

print(f"Ray Cluster URI: {ray_cluster_uri}")
print(f"Connecting to Ray dashboard: {ray_dashboard_url}")

# Jobs are submitted through the dashboard URL
client = JobSubmissionClient(ray_dashboard_url)

# Environment variables passed to the job through its runtime environment
env_vars = {
    # S3/MinIO Storage (for model checkpoints and training results)
    'S3_ACCESS_KEY_ID': 'YOUR_S3_KEY',              # Replace with your MinIO access key
    'S3_SECRET_ACCESS_KEY': 'YOUR_S3_SECRET',       # Replace with your MinIO secret key
    'S3_REGION_NAME': 'us-east-1',                  # MinIO region (usually us-east-1)
    'S3_ENDPOINT_URL': 'YOUR_S3_ENDPOINT',          # Replace with MinIO endpoint URL
    'S3_BUCKET_NAME': 'zelda-rl-checkpoints',       # Replace with your bucket name

    # LLM endpoint
    'LLM_ENDPOINT': 'http://llm-d-infra-inference-gateway-istio.llm-d.svc.cluster.local/v1/chat/completions',

    # ROM path (relative to working_dir)
    'ROM_PATH': 'roms/zelda_oracle_of_seasons.gbc',

    # Config paths
    'ENV_CONFIG': 'configs/env.yaml',
    'VISION_PROMPT_CONFIG': 'configs/vision_prompt.yaml',
}

# Surface template values that were never filled in. This is a warning only:
# the job is still submitted, exactly as before.
unreplaced = [key for key, value in env_vars.items() if value.startswith('YOUR_')]
if unreplaced:
    print(f"⚠️ Placeholder values still set for: {', '.join(unreplaced)}")

# Submit the training job; working_dir is uploaded with the job, minus the excludes
submission_id = client.submit_job(
    entrypoint="python run-ray-zelda.py",
    runtime_env={
        "env_vars": env_vars,
        'working_dir': './',  # Zelda code, configs, and ROMs
        'pip': [],  # nothing extra to install on top of the image
        "excludes": [
            "*.sh", "*.ipynb", "*.md", "__pycache__", "*.pyc",
            "checkpoints/*", "training_runs/*", "strategic_test_results/*",
            ".git/*", "tmp/*", "HUD/*", "notebooks/*", "examples/*"
        ],
    }
)

print("\n✅ Job submitted successfully!")
print(f"Submission ID: {submission_id}")
print(f"Monitor at: {ray_dashboard_url}")


In [None]:
# Poll the submitted job until it reaches a terminal state, then print the tail of its logs
from ray.job_submission import JobStatus
import time

POLL_INTERVAL_SECONDS = 30
LOG_TAIL_CHARS = 5000
# States after which the job will not change again; built once, outside the loop
TERMINAL_STATES = (JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.STOPPED)

print("🔍 Monitoring job status...")
print(f"(This will check every {POLL_INTERVAL_SECONDS} seconds until completion)")
print()

while True:
    # Only the status is needed per poll; the previous extra get_job_info()
    # request was never read, so it is no longer issued.
    status = client.get_job_status(submission_id)

    print(f"[{time.strftime('%H:%M:%S')}] Job status: {status}")

    if status in TERMINAL_STATES:
        print(f"\n{'='*60}")
        print(f"📊 Job completed with status: {status}")
        print(f"{'='*60}\n")

        # A negative slice already returns the whole string when it is shorter
        # than the tail length, so no explicit length check is required.
        logs = client.get_job_logs(submission_id)
        print("📝 Final job logs:")
        print(logs[-LOG_TAIL_CHARS:])
        break

    time.sleep(POLL_INTERVAL_SECONDS)


In [None]:
# Get job logs (anytime)
# Fetches the logs for `submission_id` through the job client created in the
# submission cell and prints them in full; can be re-run while the job is running.
logs = client.get_job_logs(submission_id)
print(logs)


In [None]:
# 🗑️ CLEANUP: Delete the Ray cluster (only run when completely done!)
# 
# WARNING: This will delete the entire Ray cluster!
# - All running training jobs will be stopped
# - All pods will be terminated
# - The RayCluster resource will be deleted
#
# Uncomment the line below to delete:
# cluster.down()

# To check if cluster was deleted:
# !oc get raycluster -n zelda-hybrid-rl-llm
