In [6]:
!pip install ray==2.5.1 tensorflow pyarrow tblib

Collecting tblib
  Downloading tblib-2.0.0-py3-none-any.whl (11 kB)
Installing collected packages: tblib
Successfully installed tblib-2.0.0


In [7]:
import argparse
from filelock import FileLock
import json
import os

import numpy as np
from ray.air.result import Result
import tensorflow as tf

from ray.train.tensorflow import TensorflowTrainer
from ray.air.integrations.keras import ReportCheckpointCallback
from ray.air.config import ScalingConfig


def mnist_dataset(batch_size: int) -> tf.data.Dataset:
    with FileLock(os.path.expanduser("~/.mnist_lock")):
        (x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
    # The `x` arrays are in uint8 and have values in the [0, 255] range.
    # You need to convert them to float32 with values in the [0, 1] range.
    x_train = x_train / np.float32(255)
    y_train = y_train.astype(np.int64)
    train_dataset = (
        tf.data.Dataset.from_tensor_slices((x_train, y_train))
        .shuffle(60000)
        .repeat()
        .batch(batch_size)
    )
    return train_dataset


def build_cnn_model() -> tf.keras.Model:
    model = tf.keras.Sequential(
        [
            tf.keras.Input(shape=(28, 28)),
            tf.keras.layers.Reshape(target_shape=(28, 28, 1)),
            tf.keras.layers.Conv2D(32, 3, activation="relu"),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(10),
        ]
    )
    return model


def train_func(config: dict):
    per_worker_batch_size = config.get("batch_size", 64)
    epochs = config.get("epochs", 3)
    steps_per_epoch = config.get("steps_per_epoch", 70)

    tf_config = json.loads(os.environ["TF_CONFIG"])
    num_workers = len(tf_config["cluster"]["worker"])

    strategy = tf.distribute.MultiWorkerMirroredStrategy()

    global_batch_size = per_worker_batch_size * num_workers
    multi_worker_dataset = mnist_dataset(global_batch_size)

    with strategy.scope():
        # Model building/compiling need to be within `strategy.scope()`.
        multi_worker_model = build_cnn_model()
        learning_rate = config.get("lr", 0.001)
        multi_worker_model.compile(
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),
            metrics=["accuracy"],
        )

    history = multi_worker_model.fit(
        multi_worker_dataset,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        callbacks=[ReportCheckpointCallback()],
    )
    results = history.history
    return results


def train_tensorflow_mnist(
    num_workers: int = 2, use_gpu: bool = False, epochs: int = 4
) -> Result:
    config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()
    return results

In [8]:
import ray

In [9]:
ray.init(address="ray://example-cluster-head-svc:10001")

RuntimeError: Ray Client is already connected. Maybe you called ray.init("ray://<address>") twice by accident?

In [None]:
train_tensorflow_mnist(num_workers=2, use_gpu=True, epochs=100)

[2m[36m(TunerInternal pid=452)[0m 2023-06-30 04:56:57.633525: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
[2m[36m(TunerInternal pid=452)[0m To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2m[36m(TunerInternal pid=452)[0m 2023-06-30 04:56:57.836576: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(TunerInternal pid=452)[0m 2023-06-30 04:56:59.050186: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open sha

0,1
Current time:,2023-06-30 04:58:34
Running for:,00:01:31.16
Memory:,11.5/47.1 GiB

Trial name,status,loc,iter,total time (s),loss,accuracy
TensorflowTrainer_38bb7_00000,RUNNING,10.42.0.75:665,2,72.4023,2.2017,0.455357


[2m[36m(pid=665)[0m 2023-06-30 04:57:08.195744: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
[2m[36m(pid=665)[0m To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2m[36m(pid=665)[0m 2023-06-30 04:57:08.457044: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(pid=665)[0m 2023-06-30 04:57:09.941149: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_P

[2m[36m(RayTrainWorker pid=934)[0m Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
   16384/11490434 [..............................] - ETA: 41s
   49152/11490434 [..............................] - ETA: 36s
   81920/11490434 [..............................] - ETA: 35s
  212992/11490434 [..............................] - ETA: 19s
  475136/11490434 [>.............................] - ETA: 10s
 1048576/11490434 [=>............................] - ETA: 5s
 2383872/11490434 [=====>........................] - ETA: 2s


[2m[36m(RayTrainWorker pid=879)[0m 2023-06-30 04:57:32.421919: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:784] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_2"
[2m[36m(RayTrainWorker pid=879)[0m op: "TensorSliceDataset"
[2m[36m(RayTrainWorker pid=879)[0m input: "Placeholder/_0"
[2m[36m(RayTrainWorker pid=879)[0m input: "Placeholder/_1"
[2m[36m(RayTrainWorker pid=879)[0m attr {
[2m[36m(RayTrainWorker pid=879)[0m   key: "Toutput_types"
[2m[36m(RayTrainWorker pid=879)[0m   value {
[2m[36m(RayTrainWorker pid=879)[0m     list {
[2m[36m(RayTrainWorker pid=879)[0m       type: DT_FLOAT
[2m[36m(RayTrainWorker pid=879)[0m       type: DT_INT64
[2m[36m(RayTrainWorker pid=879)[0m     }
[2m[36m(RayTrainWorker pid=879)[0m   }
[2m[36m(RayTrainWorker pid=879)[0m }
[2m[36m(RayTrainWorker pid=879)[0m attr

[2m[36m(RayTrainWorker pid=879)[0m Epoch 1/100
[2m[36m(RayTrainWorker pid=934)[0m Epoch 1/100
 1/70 [..............................] - ETA: 4:50 - loss: 4.5964 - accuracy: 0.2656
 1/70 [..............................] - ETA: 4:43 - loss: 4.5964 - accuracy: 0.2656
 2/70 [..............................] - ETA: 27s - loss: 4.6122 - accuracy: 0.2188 
 2/70 [..............................] - ETA: 27s - loss: 4.6122 - accuracy: 0.2188 
 3/70 [>.............................] - ETA: 27s - loss: 4.6226 - accuracy: 0.1771
 3/70 [>.............................] - ETA: 26s - loss: 4.6226 - accuracy: 0.1771
 4/70 [>.............................] - ETA: 26s - loss: 4.6151 - accuracy: 0.2070
 4/70 [>.............................] - ETA: 26s - loss: 4.6151 - accuracy: 0.2070
 5/70 [=>............................] - ETA: 24s - loss: 4.6133 - accuracy: 0.2125
 5/70 [=>............................] - ETA: 24s - loss: 4.6133 - accuracy: 0.2125
 6/70 [=>............................] - ETA: 23s - loss