adding ray scikit-learn example (#165)
* adding ray scikit-learn example

Signed-off-by: vsoch <vsoch@users.noreply.github.com>
vsoch committed May 11, 2023
1 parent a045141 commit fd12920
Showing 23 changed files with 357 additions and 107 deletions.
28 changes: 14 additions & 14 deletions .github/workflows/main.yaml
@@ -54,20 +54,20 @@ jobs:
strategy:
fail-fast: false
matrix:
test: [["hello-world", "ghcr.io/flux-framework/flux-restful-api:latest", 30],
["post", "ghcr.io/flux-framework/flux-restful-api:latest", 30],
["batch", "ghcr.io/flux-framework/flux-restful-api:latest", 30],
["singularity", "ghcr.io/rse-ops/singularity:tag-mamba", 40],
["nginx-sidecar-service", "ghcr.io/flux-framework/flux-restful-api:latest", 40],
["nginx-service", "ghcr.io/flux-framework/flux-restful-api:latest", 40],
["lammps", "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", 30],
["pokemon", "ghcr.io/rse-ops/pokemon:app-latest", 30],
["timed", "ghcr.io/flux-framework/flux-restful-api:latest", 30],
["debug", "ghcr.io/flux-framework/flux-restful-api:latest", 30],
["flux-resource-list", "ghcr.io/flux-framework/flux-restful-api:latest", 30],
["existing-volumes", "ghcr.io/flux-framework/flux-restful-api:latest", 30],
["volumes", "ghcr.io/flux-framework/flux-restful-api:latest", 30],
["snakemake", "ghcr.io/rse-ops/atacseq:app-latest", 30]]
test: [["hello-world", "ghcr.io/flux-framework/flux-restful-api:latest", 60],
["post", "ghcr.io/flux-framework/flux-restful-api:latest", 60],
["batch", "ghcr.io/flux-framework/flux-restful-api:latest", 60],
["singularity", "ghcr.io/rse-ops/singularity:tag-mamba", 60],
["nginx-sidecar-service", "ghcr.io/flux-framework/flux-restful-api:latest", 60],
["nginx-service", "ghcr.io/flux-framework/flux-restful-api:latest", 60],
["lammps", "ghcr.io/rse-ops/lammps:flux-sched-focal-v0.24.0", 60],
["pokemon", "ghcr.io/rse-ops/pokemon:app-latest", 60],
["timed", "ghcr.io/flux-framework/flux-restful-api:latest", 60],
["debug", "ghcr.io/flux-framework/flux-restful-api:latest", 60],
["flux-resource-list", "ghcr.io/flux-framework/flux-restful-api:latest", 60],
["existing-volumes", "ghcr.io/flux-framework/flux-restful-api:latest", 60],
["volumes", "ghcr.io/flux-framework/flux-restful-api:latest", 60],
["snakemake", "ghcr.io/rse-ops/atacseq:app-latest", 60]]

steps:
- name: Clone the code
2 changes: 1 addition & 1 deletion api/v1alpha1/minicluster_types.go
@@ -187,7 +187,7 @@ type MiniClusterStatus struct {

// We keep the original size of the MiniCluster request as
// this is the absolute maximum
MaximumSize int32 `json:"size"`
MaximumSize int32 `json:"maximumSize"`

// conditions hold the latest Flux Job and MiniCluster states
// +listType=atomic
4 changes: 2 additions & 2 deletions api/v1alpha1/swagger.json
@@ -529,7 +529,7 @@
"type": "object",
"required": [
"jobid",
"size"
"maximumSize"
],
"properties": {
"conditions": {
@@ -546,7 +546,7 @@
"type": "string",
"default": ""
},
"size": {
"maximumSize": {
"description": "We keep the original size of the MiniCluster request as this is the absolute maximum",
"type": "integer",
"format": "int32",
4 changes: 2 additions & 2 deletions api/v1alpha1/zz_generated.openapi.go

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions chart/templates/minicluster-crd.yaml
@@ -703,14 +703,14 @@ spec:
description: The Jobid is set internally to associate to a miniCluster
This isn't currently in use, we only have one!
type: string
size:
maximumSize:
description: We keep the original size of the MiniCluster request as
this is the absolute maximum
format: int32
type: integer
required:
- jobid
- size
- maximumSize
type: object
type: object
served: true
4 changes: 2 additions & 2 deletions config/crd/bases/flux-framework.org_miniclusters.yaml
@@ -708,14 +708,14 @@ spec:
description: The Jobid is set internally to associate to a miniCluster
This isn't currently in use, we only have one!
type: string
size:
maximumSize:
description: We keep the original size of the MiniCluster request
as this is the absolute maximum
format: int32
type: integer
required:
- jobid
- size
- maximumSize
type: object
type: object
served: true
1 change: 1 addition & 0 deletions config/manager/kustomization.yaml
@@ -13,3 +13,4 @@ kind: Kustomization
images:
- name: controller
newName: ghcr.io/flux-framework/flux-operator
newTag: test
1 change: 1 addition & 0 deletions docs/tutorials/index.md
@@ -33,6 +33,7 @@ The following tutorials are provided from their respective directories (and are
- [Pytorch MNIST](https://github.com/flux-framework/flux-operator/blob/main/examples/machine-learning/pytorch)
- [Tensorflow cifar-10](https://github.com/flux-framework/flux-operator/blob/main/examples/machine-learning/tensorflow)
- [Dask with Scikit-Learn](https://github.com/flux-framework/flux-operator/blob/main/examples/machine-learning/dask/scikit-learn)
- [Ray with Scikit-Learn](https://github.com/flux-framework/flux-operator/blob/main/examples/machine-learning/ray/scikit-learn)

### Services

4 changes: 2 additions & 2 deletions examples/dist/flux-operator.yaml
@@ -714,14 +714,14 @@ spec:
description: The Jobid is set internally to associate to a miniCluster
This isn't currently in use, we only have one!
type: string
size:
maximumSize:
description: We keep the original size of the MiniCluster request
as this is the absolute maximum
format: int32
type: integer
required:
- jobid
- size
- maximumSize
type: object
type: object
served: true
136 changes: 136 additions & 0 deletions examples/machine-learning/ray/scikit-learn/README.md
@@ -0,0 +1,136 @@
# Ray with scikit-learn and Flux!

This example tests running [Ray with scikit-learn](https://docs.ray.io/en/latest/ray-more-libs/joblib.html) on Flux.
Our goal is to keep things simple, and eventually extend this to a more complex hierarchy of jobs.
Note that I found most of the needed logic in the [Ray on-premises cluster guide](https://docs.ray.io/en/latest/cluster/vms/user-guides/launching-clusters/on-premises.html#on-prem).
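
As a preview, the core pattern (shown in full in [ray_tune.py](ray_tune.py)) is to register Ray as a
joblib backend so that scikit-learn's parallel search fans out across the Ray cluster. A condensed
sketch of that script:

```python
# Condensed sketch of ray_tune.py: register Ray as a joblib backend so that
# scikit-learn's cross-validated search runs across the Ray cluster.
import joblib
from ray.util.joblib import register_ray
from sklearn.datasets import load_digits
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

digits = load_digits()
search = RandomizedSearchCV(SVC(kernel="rbf"), {"C": [0.1, 1.0, 10.0]}, cv=5, n_iter=3)

register_ray()  # makes "ray" available as a joblib backend
with joblib.parallel_backend("ray"):
    search.fit(digits.data, digits.target)
```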

## Usage

### Create Cluster

First, let's create a kind cluster.

```bash
$ kind create cluster --config ../../../kind-config.yaml
```

And then install the operator, create the namespace, and apply the MiniCluster YAML here.

```bash
$ kubectl apply -f ../../../dist/flux-operator.yaml
$ kubectl create namespace flux-operator
$ kubectl apply -f ./minicluster.yaml
```

This uses a base container with Ray and scikit-learn, and mounts
the current directory into the pods as `/tmp/workflow`. We run [start.sh](start.sh)
to start the Ray cluster. You can watch the logs as follows (your pod names will differ;
list them with `kubectl get pods -n flux-operator`):

```bash
$ kubectl logs -n flux-operator flux-sample-0-7tx7s -f
```

<details>

<summary>Ray Expected Output</summary>

```console
2023-05-10 22:52:44,254 INFO scripts.py:892 -- Local node IP: 10.244.0.57
2023-05-10 22:52:47,567 SUCC scripts.py:904 -- --------------------
2023-05-10 22:52:47,567 SUCC scripts.py:905 -- Ray runtime started.
2023-05-10 22:52:47,567 SUCC scripts.py:906 -- --------------------
2023-05-10 22:52:47,567 INFO scripts.py:908 -- To terminate the Ray runtime, run
2023-05-10 22:52:47,567 INFO scripts.py:909 -- ray stop
2023-05-10 22:52:44,300 INFO usage_lib.py:372 -- Usage stats collection is disabled.
2023-05-10 22:52:44,300 INFO scripts.py:710 -- Local node IP: flux-sample-0.flux-service.flux-operator.svc.cluster.local
2023-05-10 22:52:47,605 SUCC scripts.py:747 -- --------------------
2023-05-10 22:52:47,605 SUCC scripts.py:748 -- Ray runtime started.
2023-05-10 22:52:47,605 SUCC scripts.py:749 -- --------------------
2023-05-10 22:52:47,605 INFO scripts.py:751 -- Next steps
2023-05-10 22:52:47,605 INFO scripts.py:754 -- To add another node to this Ray cluster, run
2023-05-10 22:52:47,605 INFO scripts.py:757 -- ray start --address='flux-sample-0.flux-service.flux-operator.svc.cluster.local:6379'
2023-05-10 22:52:47,605 INFO scripts.py:766 -- To connect to this Ray cluster:
2023-05-10 22:52:47,605 INFO scripts.py:768 -- import ray
2023-05-10 22:52:47,606 INFO scripts.py:769 -- ray.init(_node_ip_address='flux-sample-0.flux-service.flux-operator.svc.cluster.local')
2023-05-10 22:52:47,606 INFO scripts.py:781 -- To submit a Ray job using the Ray Jobs CLI:
2023-05-10 22:52:47,606 INFO scripts.py:782 -- RAY_ADDRESS='http://127.0.0.1:8265' ray job submit --working-dir . -- python my_script.py
2023-05-10 22:52:47,622 INFO scripts.py:791 -- See https://docs.ray.io/en/latest/cluster/running-applications/job-submission/index.html
2023-05-10 22:52:47,622 INFO scripts.py:795 -- for more information on submitting Ray jobs to the Ray cluster.
2023-05-10 22:52:47,622 INFO scripts.py:800 -- To terminate the Ray runtime, run
2023-05-10 22:52:47,622 INFO scripts.py:801 -- ray stop
2023-05-10 22:52:47,622 INFO scripts.py:804 -- To view the status of the cluster, use
2023-05-10 22:52:47,623 INFO scripts.py:805 -- ray status
2023-05-10 22:52:47,623 INFO scripts.py:809 -- To monitor and debug Ray, view the dashboard at
2023-05-10 22:52:47,623 INFO scripts.py:810 -- 127.0.0.1:8265
2023-05-10 22:52:47,623 INFO scripts.py:817 -- If connection to the dashboard fails, check your firewall settings and network configuration.
2023-05-10 22:52:44,270 INFO scripts.py:892 -- Local node IP: 10.244.0.59
2023-05-10 22:52:47,654 SUCC scripts.py:904 -- --------------------
2023-05-10 22:52:47,654 SUCC scripts.py:905 -- Ray runtime started.
2023-05-10 22:52:47,655 SUCC scripts.py:906 -- --------------------
2023-05-10 22:52:47,655 INFO scripts.py:908 -- To terminate the Ray runtime, run
2023-05-10 22:52:47,655 INFO scripts.py:909 -- ray stop
[2023-05-10 22:52:47,666 I 108 108] global_state_accessor.cc:356: This node has an IP address of 10.244.0.56, but we cannot find a local Raylet with the same address. This can happen when you connect to the Ray cluster with a different IP address or when connecting to a container.
2023-05-10 22:52:44,305 INFO scripts.py:892 -- Local node IP: 10.244.0.56
2023-05-10 22:52:47,718 SUCC scripts.py:904 -- --------------------
2023-05-10 22:52:47,719 SUCC scripts.py:905 -- Ray runtime started.
2023-05-10 22:52:47,719 SUCC scripts.py:906 -- --------------------
2023-05-10 22:52:47,719 INFO scripts.py:908 -- To terminate the Ray runtime, run
2023-05-10 22:52:47,719 INFO scripts.py:909 -- ray stop
```

</details>

### Run Workflow

Let's shell into the broker pod and connect to the broker flux instance:

```bash
$ kubectl exec -it -n flux-operator flux-sample-0-jlsp6 bash
$ sudo -u fluxuser -E $(env) -E HOME=/home/fluxuser flux proxy local:///run/flux/local bash
```

At this point, since this main (broker) pod is running the show, we can run our Python example that
uses Ray:

```bash
$ python3 ray_tune.py
```

Note that you could also give this command to the broker directly (no need for the interactive bash shell above):

```bash
$ kubectl exec -it -n flux-operator flux-sample-0-jlsp6 bash
$ sudo -u fluxuser -E $(env) -E HOME=/home/fluxuser flux proxy local:///run/flux/local python3 ray_tune.py
```

For either approach, you should see the training logs across the cluster:

```console
(PoolActor pid=564) [CV 1/5; 125/300] END C=0.00011721022975334806, class_weight=balanced, gamma=23.95026619987481, tol=0.0017433288221999873;, score=0.100 total time= 2.2s [repeated 5x across cluster]
(PoolActor pid=564) [CV 5/5; 129/300] START C=1000000.0, class_weight=balanced, gamma=0.14873521072935117, tol=0.03039195382313198 [repeated 9x across cluster]
(PoolActor pid=565) [CV 5/5; 126/300] START C=489.3900918477499, class_weight=None, gamma=1.8873918221350996, tol=0.014873521072935119 [repeated 10x across cluster]
(PoolActor pid=559) [CV 3/5; 130/300] END C=72.78953843983146, class_weight=None, gamma=1082.6367338740563, tol=0.0008531678524172806;, score=0.103 total time= 2.1s [repeated 9x across cluster]
(PoolActor pid=564) [CV 5/5; 129/300] END C=1000000.0, class_weight=balanced, gamma=0.14873521072935117, tol=0.03039195382313198;, score=0.103 total time= 2.2s [repeated 10x across cluster]
(PoolActor pid=563) [CV 1/5; 136/300] START C=489.3900918477499, class_weight=balanced, gamma=28072162.039411698, tol=0.003562247890262444 [repeated 12x across cluster]
(PoolActor pid=564) [CV 1/5; 133/300] START C=0.2395026619987486, class_weight=None, gamma=100000000.0, tol=0.003562247890262444 [repeated 9x across cluster]
(PoolActor pid=565) [CV 1/5; 130/300] END C=72.78953843983146, class_weight=None, gamma=1082.6367338740563, tol=0.0008531678524172806;, score=0.100 total time= 2.8s [repeated 10x across cluster]
(PoolActor pid=559) [CV 2/5; 135/300] END C=2.592943797404667e-06, class_weight=balanced, gamma=28072162.039411698, tol=0.07880462815669913;, score=0.100 total time= 2.3s [repeated 8x across cluster]
(PoolActor pid=563) [CV 2/5; 139/300] START C=1.7433288221999873e-05, class_weight=balanced, gamma=0.5298316906283702, tol=0.06210169418915616 [repeated 10x across cluster]
...
```
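
If you want to confirm that all of the pods joined the Ray cluster, a small sanity check (a sketch,
not part of this example, and assuming Ray's default address auto-detection works from the broker
pod) could be run from the same Flux instance:

```python
# Hypothetical check: connect to the Ray cluster started by start.sh and list
# the nodes that have registered with the head.
import ray

ray.init(address="auto")  # auto-detect the running head node
alive = [node for node in ray.nodes() if node["Alive"]]
print(f"{len(alive)} Ray nodes are alive")
for node in alive:
    print(node["NodeManagerAddress"])
```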

And that's it! If you create a more substantial example using Ray, please [let us know](https://github.com/flux-framework/flux-operator/issues).

### Cleanup

When you are done, clean up:

```bash
$ kubectl delete -f minicluster.yaml
```

Make sure to clean up your shared tmpdir!

```bash
$ rm *.out
$ sudo rm -rf ./tmp/*
```
36 changes: 36 additions & 0 deletions examples/machine-learning/ray/scikit-learn/minicluster.yaml
@@ -0,0 +1,36 @@
apiVersion: flux-framework.org/v1alpha1
kind: MiniCluster
metadata:
  name: flux-sample
  namespace: flux-operator
spec:
  # It is important to launch with tasks == nodes, so that the main start
  # command is told to run across nodes (and start.sh given to workers and leader)
  size: 4
  tasks: 4
  volumes:
    data:
      storageClass: hostpath
      path: /tmp/workflow

  containers:
    - image: ghcr.io/rse-ops/ray-scikit-learn:tag-mamba
      workingDir: /tmp/workflow
      commands:
        pre: mkdir -p /tmp/workflow/tmp

      environment:
        PYTHONPATH: /usr/lib/python3.10/site-packages
        TMPDIR: /tmp/workflow/tmp

      ports:
        - 8786
        - 6379
      command: /bin/bash /tmp/workflow/start.sh

      # launcher: true
      fluxUser:
        name: fluxuser
      volumes:
        data:
          path: /tmp/workflow
25 changes: 25 additions & 0 deletions examples/machine-learning/ray/scikit-learn/ray_tune.py
@@ -0,0 +1,25 @@
#!/usr/bin/env python3

# Run from the Flux broker inside the MiniCluster: python3 ray_tune.py

import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

digits = load_digits()

# Randomized hyperparameter search over an RBF-kernel SVC
param_space = {
    "C": np.logspace(-6, 6, 30),
    "gamma": np.logspace(-8, 8, 30),
    "tol": np.logspace(-4, -1, 30),
    "class_weight": [None, "balanced"],
}
model = SVC(kernel="rbf")
search = RandomizedSearchCV(model, param_space, cv=5, n_iter=300, verbose=10)

import joblib
from ray.util.joblib import register_ray

# Register Ray as a joblib backend so the search runs across the Ray cluster
register_ray()
with joblib.parallel_backend("ray"):
    search.fit(digits.data, digits.target)
24 changes: 24 additions & 0 deletions examples/machine-learning/ray/scikit-learn/start.sh
@@ -0,0 +1,24 @@
#!/bin/bash
export LC_ALL=C.UTF-8
export LANG=C.UTF-8

name=$(hostname)
echo "Hello I am ${name}"

# We only care about the head node address
address="flux-sample-0.flux-service.flux-operator.svc.cluster.local"
port="6379"
password="austinpowersyeahbaby"
echo "Head node address is ${address}:${port}"

# This is the "head node"
if [[ "${name}" == "flux-sample-0" ]]; then
    echo "ray start --head --node-ip-address=${address} --port=${port} --redis-password=${password} --temp-dir=/tmp/workflow/tmp --disable-usage-stats"
    ray start --head --node-ip-address=${address} --port=${port} --redis-password=${password} --temp-dir=/tmp/workflow/tmp --disable-usage-stats
    sleep infinity
else
    # This triggers an error when I add port to the string, so we rely on using the default
    echo "ray start --address ${address}:${port} --redis-password=${password} --disable-usage-stats"
    ray start --address ${address}:${port} --redis-password=${password} --disable-usage-stats
    sleep infinity
fi
9 changes: 9 additions & 0 deletions sdk/python/v1alpha1/.openapi-generator/FILES
@@ -53,4 +53,13 @@ setup.cfg
setup.py
test-requirements.txt
test/__init__.py
test/test_mini_cluster.py
test/test_mini_cluster_archive.py
test/test_mini_cluster_container.py
test/test_mini_cluster_existing_volume.py
test/test_mini_cluster_list.py
test/test_mini_cluster_spec.py
test/test_mini_cluster_status.py
test/test_mini_cluster_user.py
test/test_mini_cluster_volume.py
tox.ini
2 changes: 1 addition & 1 deletion sdk/python/v1alpha1/docs/MiniClusterStatus.md
@@ -7,7 +7,7 @@ Name | Type | Description | Notes
------------ | ------------- | ------------- | -------------
**conditions** | [**list[V1Condition]**](V1Condition.md) | conditions hold the latest Flux Job and MiniCluster states | [optional]
**jobid** | **str** | The Jobid is set internally to associate to a miniCluster This isn&#39;t currently in use, we only have one! | [default to '']
**size** | **int** | We keep the original size of the MiniCluster request as this is the absolute maximum | [default to 0]
**maximum_size** | **int** | We keep the original size of the MiniCluster request as this is the absolute maximum | [default to 0]

[[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md)

