In [1]:
!pip install ray==2.6.1 tensorflow==2.12.1 pyarrow tblib

Collecting ray==2.6.1
  Downloading ray-2.6.1-cp38-cp38-manylinux2014_x86_64.whl (56.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting tensorflow==2.12.1
  Downloading tensorflow-2.12.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (585.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m585.9/585.9 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting pyarrow
  Downloading pyarrow-12.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (39.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.0/39.0 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting tblib
  Downloading tblib-2.0.0-py3-none-any.whl (11 kB)
Collecting frozenlist
  Downloading frozenlist-1.4.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (220 kB)
[2

In [2]:
import argparse
from filelock import FileLock
import json
import os

import numpy as np
from ray.air.result import Result
import tensorflow as tf

from ray.train.tensorflow import TensorflowTrainer
from ray.air.integrations.keras import ReportCheckpointCallback
from ray.air.config import ScalingConfig


def mnist_dataset(batch_size: int) -> tf.data.Dataset:
    with FileLock(os.path.expanduser("~/.mnist_lock")):
        (x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
    # The `x` arrays are in uint8 and have values in the [0, 255] range.
    # You need to convert them to float32 with values in the [0, 1] range.
    x_train = x_train / np.float32(255)
    y_train = y_train.astype(np.int64)
    train_dataset = (
        tf.data.Dataset.from_tensor_slices((x_train, y_train))
        .shuffle(60000)
        .repeat()
        .batch(batch_size)
    )
    return train_dataset


def build_cnn_model() -> tf.keras.Model:
    model = tf.keras.Sequential(
        [
            tf.keras.Input(shape=(28, 28)),
            tf.keras.layers.Reshape(target_shape=(28, 28, 1)),
            tf.keras.layers.Conv2D(32, 3, activation="relu"),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(10),
        ]
    )
    return model


def train_func(config: dict):
    per_worker_batch_size = config.get("batch_size", 64)
    epochs = config.get("epochs", 3)
    steps_per_epoch = config.get("steps_per_epoch", 70)

    tf_config = json.loads(os.environ["TF_CONFIG"])
    num_workers = len(tf_config["cluster"]["worker"])

    strategy = tf.distribute.MultiWorkerMirroredStrategy()

    global_batch_size = per_worker_batch_size * num_workers
    multi_worker_dataset = mnist_dataset(global_batch_size)

    with strategy.scope():
        # Model building/compiling need to be within `strategy.scope()`.
        multi_worker_model = build_cnn_model()
        learning_rate = config.get("lr", 0.001)
        multi_worker_model.compile(
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),
            metrics=["accuracy"],
        )

    history = multi_worker_model.fit(
        multi_worker_dataset,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        callbacks=[ReportCheckpointCallback()],
    )
    results = history.history
    return results


def train_tensorflow_mnist(
    num_workers: int = 2, use_gpu: bool = False, epochs: int = 4
) -> Result:
    config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()
    return results

2023-08-21 16:41:15,652	INFO util.py:159 -- Outdated packages:
  ipywidgets==7.6.3 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2023-08-21 16:41:16,188	INFO util.py:159 -- Outdated packages:
  ipywidgets==7.6.3 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2023-08-21 16:41:16.444346: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-21 16:41:16.449478: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-21 16:41:16.534278: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-21 16:41:16.537658: I 

In [3]:
import ray

In [4]:
runtime_env = {
    'env_vars': {'RAY_AIR_NEW_OUTPUT': '0'}
}

In [5]:
ray.init(address="ray://example-cluster-head-svc:10001", runtime_env=runtime_env)

0,1
Python version:,3.8.13
Ray version:,2.6.1
Dashboard:,http://10.42.1.26:8265


In [10]:
@ray.remote(num_gpus=1, runtime_env=runtime_env)
def f():
    print(ray.get_gpu_ids())

In [11]:
f.remote()

ClientObjectRef(c2668a65bda616c1ffffffffffffffffffffffff0100000001000000)

[2m[36m(f pid=278)[0m [0]


In [12]:
train_tensorflow_mnist(num_workers=1, use_gpu=True, epochs=3)

0,1
Current time:,2023-08-21 09:43:28
Running for:,00:00:44.15
Memory:,5.0/23.5 GiB

Trial name,status,loc,iter,total time (s),loss,accuracy
TensorflowTrainer_be5a6_00000,TERMINATED,10.42.1.26:430,3,32.3227,2.08812,0.605357


[2m[36m(TunerInternal pid=314)[0m   self._maybe_warn_resource_contention()


[2m[1m[33m(autoscaler +1m24s)[0m Error: No available node types can fulfill resource request {'GPU': 2.0, 'CPU': 1.0}. Add suitable node types to this cluster to resolve this issue.


[2m[36m(pid=430)[0m 2023-08-21 09:42:50.330615: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
[2m[36m(pid=430)[0m To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2m[36m(pid=430)[0m 2023-08-21 09:42:50.525179: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(pid=430)[0m 2023-08-21 09:42:51.995754: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_P

[2m[36m(RayTrainWorker pid=527)[0m Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
   24576/11490434 [..............................] - ETA: 25s
   49152/11490434 [..............................] - ETA: 28s
  147456/11490434 [..............................] - ETA: 22s
  278528/11490434 [..............................] - ETA: 15s
  376832/11490434 [..............................] - ETA: 13s
  753664/11490434 [>.............................] - ETA: 7s
 1474560/11490434 [==>...........................] - ETA: 4s
 2064384/11490434 [====>.........................] - ETA: 3s


[2m[36m(RayTrainWorker pid=527)[0m 2023-08-21 09:43:04.389806: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:784] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_2"
[2m[36m(RayTrainWorker pid=527)[0m op: "TensorSliceDataset"
[2m[36m(RayTrainWorker pid=527)[0m input: "Placeholder/_0"
[2m[36m(RayTrainWorker pid=527)[0m input: "Placeholder/_1"
[2m[36m(RayTrainWorker pid=527)[0m attr {
[2m[36m(RayTrainWorker pid=527)[0m   key: "Toutput_types"
[2m[36m(RayTrainWorker pid=527)[0m   value {
[2m[36m(RayTrainWorker pid=527)[0m     list {
[2m[36m(RayTrainWorker pid=527)[0m       type: DT_FLOAT
[2m[36m(RayTrainWorker pid=527)[0m       type: DT_INT64
[2m[36m(RayTrainWorker pid=527)[0m     }
[2m[36m(RayTrainWorker pid=527)[0m   }
[2m[36m(RayTrainWorker pid=527)[0m }
[2m[36m(RayTrainWorker pid=527)[0m attr

[2m[36m(RayTrainWorker pid=527)[0m Epoch 1/3
 1/70 [..............................] - ETA: 3:25 - loss: 2.3263 - accuracy: 0.1562
 3/70 [>.............................] - ETA: 3s - loss: 2.3121 - accuracy: 0.1510  
 5/70 [=>............................] - ETA: 4s - loss: 2.3114 - accuracy: 0.1156
 8/70 [==>...........................] - ETA: 4s - loss: 2.3090 - accuracy: 0.1211
 9/70 [==>...........................] - ETA: 4s - loss: 2.3097 - accuracy: 0.1128
10/70 [===>..........................] - ETA: 4s - loss: 2.3102 - accuracy: 0.1125
12/70 [====>.........................] - ETA: 4s - loss: 2.3093 - accuracy: 0.1107
14/70 [=====>........................] - ETA: 4s - loss: 2.3088 - accuracy: 0.1161
16/70 [=====>........................] - ETA: 4s - loss: 2.3083 - accuracy: 0.1152
[2m[36m(RayTrainWorker pid=527)[0m Epoch 2/3
 1/70 [..............................] - ETA: 1s - loss: 2.2236 - accuracy: 0.2500
 2/70 [..............................] - ETA: 5s - loss: 2.2208 - accu

[2m[36m(TunerInternal pid=314)[0m Total run time: 48.09 seconds (44.15 seconds for the tuning loop).


AttributeError: Can't get attribute '_unpickle_block' on <module 'pandas._libs.internals' from '/opt/conda/lib/python3.8/site-packages/pandas/_libs/internals.cpython-38-x86_64-linux-gnu.so'>

[2m[1m[33m(autoscaler +2m34s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 2.0}. Add suitable node types to this cluster to resolve this issue.
