In [1]:
!pip install ray==2.6.1 tensorflow==2.12.1 pyarrow tblib



In [2]:
import argparse
from filelock import FileLock
import json
import os

import numpy as np
from ray.air.result import Result
import tensorflow as tf

from ray.train.tensorflow import TensorflowTrainer
from ray.air.integrations.keras import ReportCheckpointCallback
from ray.air.config import ScalingConfig


def mnist_dataset(batch_size: int) -> tf.data.Dataset:
    """Build an endlessly repeating, shuffled, batched MNIST training dataset.

    Args:
        batch_size: Number of examples per emitted batch.

    Returns:
        A `tf.data.Dataset` yielding `(images, labels)` batches, where images are
        float32 scaled by 1/255 and labels are int64.
    """
    # The Keras loader is called while holding a file lock
    # (presumably so that several processes sharing a home directory do not
    # fetch the dataset at the same time — confirm).
    lock_path = os.path.expanduser("~/.mnist_lock")
    with FileLock(lock_path):
        train_split, _ = tf.keras.datasets.mnist.load_data()
    images, labels = train_split

    # Pixels arrive as uint8 in [0, 255]; dividing by a float32 scalar rescales
    # them to float32 in [0, 1].
    images = images / np.float32(255)
    labels = labels.astype(np.int64)

    dataset = tf.data.Dataset.from_tensor_slices((images, labels))
    dataset = dataset.shuffle(60000)
    dataset = dataset.repeat()
    return dataset.batch(batch_size)


def build_cnn_model() -> tf.keras.Model:
    """Create the (uncompiled) CNN used for MNIST classification.

    The network takes 28x28 inputs, reshapes them to 28x28x1, applies one
    3x3 convolution with 32 filters, flattens, and ends in a 10-unit dense
    layer with no activation (i.e. it outputs logits).

    Returns:
        A `tf.keras.Sequential` model.
    """
    layers = tf.keras.layers
    layer_stack = [
        tf.keras.Input(shape=(28, 28)),
        # Add the trailing channel dimension expected by Conv2D.
        layers.Reshape(target_shape=(28, 28, 1)),
        layers.Conv2D(32, 3, activation="relu"),
        layers.Flatten(),
        layers.Dense(128, activation="relu"),
        # No activation here: the loss is expected to consume raw logits.
        layers.Dense(10),
    ]
    return tf.keras.Sequential(layer_stack)


def train_func(config: dict):
    """Per-worker training loop: fit the MNIST CNN with multi-worker mirroring.

    Args:
        config: Hyperparameters. Recognised keys and their defaults:
            "batch_size" (64, per worker), "epochs" (3),
            "steps_per_epoch" (70) and "lr" (0.001).

    Returns:
        The `history.history` dict produced by Keras `fit`.

    Raises:
        KeyError: If the `TF_CONFIG` environment variable is not set.
    """
    batch_per_worker = config.get("batch_size", 64)
    n_epochs = config.get("epochs", 3)
    n_steps = config.get("steps_per_epoch", 70)
    lr = config.get("lr", 0.001)

    # The worker count is taken from the cluster spec stored in TF_CONFIG.
    cluster_spec = json.loads(os.environ["TF_CONFIG"])["cluster"]
    worker_count = len(cluster_spec["worker"])

    strategy = tf.distribute.MultiWorkerMirroredStrategy()

    # The dataset is batched at the global size (per-worker size x workers).
    dataset = mnist_dataset(batch_per_worker * worker_count)

    # Model construction and compilation must happen inside the strategy scope.
    with strategy.scope():
        model = build_cnn_model()
        model.compile(
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            optimizer=tf.keras.optimizers.SGD(learning_rate=lr),
            metrics=["accuracy"],
        )

    # The dataset repeats forever, so `steps_per_epoch` bounds each epoch.
    fit_history = model.fit(
        dataset,
        epochs=n_epochs,
        steps_per_epoch=n_steps,
        callbacks=[ReportCheckpointCallback()],
    )
    return fit_history.history


def train_tensorflow_mnist(
    num_workers: int = 2, use_gpu: bool = False, epochs: int = 4
) -> Result:
    """Run `train_func` through a Ray `TensorflowTrainer` and return its result.

    Args:
        num_workers: Number of training workers passed to `ScalingConfig`.
        use_gpu: Forwarded to `ScalingConfig(use_gpu=...)`.
        epochs: Number of epochs placed in the training-loop config.

    Returns:
        Whatever `TensorflowTrainer.fit()` returns (annotated as `Result`).
    """
    scaling = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu)
    # Learning rate and per-worker batch size are fixed here; only the epoch
    # count is configurable by the caller.
    loop_config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=loop_config,
        scaling_config=scaling,
    )
    return trainer.fit()

2023-08-21 17:47:24,184	INFO util.py:159 -- Outdated packages:
  ipywidgets==7.6.3 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2023-08-21 17:47:24,713	INFO util.py:159 -- Outdated packages:
  ipywidgets==7.6.3 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2023-08-21 17:47:24.969919: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-21 17:47:24.973308: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-21 17:47:25.049908: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-21 17:47:25.051896: I 

In [3]:
import ray

In [4]:
# Runtime environment handed to Ray; it only sets one environment variable.
# NOTE(review): RAY_AIR_NEW_OUTPUT=0 presumably switches Ray AIR back to its
# legacy console output — confirm against the Ray documentation.
runtime_env = dict(env_vars={"RAY_AIR_NEW_OUTPUT": "0"})

In [6]:
@ray.remote(num_gpus=2, runtime_env=runtime_env)
def f():
    """Print the GPU ids Ray reports for this task (the task requests 2 GPUs)."""
    assigned_gpu_ids = ray.get_gpu_ids()
    print(assigned_gpu_ids)

In [7]:
# Submit `f` as a remote task. The call yields an object reference (shown as the
# cell's output) and the task's printed GPU ids are streamed back from the worker.
# NOTE(review): this cell was executed as In [7], i.e. after the `ray.init(...)`
# cell (In [5]) that appears further down — run that cell first on a fresh kernel.
f.remote()

ClientObjectRef(c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000)

[2m[36m(f pid=299)[0m [0, 1]


In [5]:
# Connect to the cluster head service via a `ray://` address, applying the
# shared `runtime_env` to the session.
# NOTE(review): the host name is hard-coded for this particular deployment.
ray.init(
    address="ray://example-cluster-head-svc:10001",
    runtime_env=runtime_env,
)

0,1
Python version:,3.8.13
Ray version:,2.6.1
Dashboard:,http://10.42.3.10:8265


In [10]:
# Launch distributed training: 2 GPU-enabled workers for 3 epochs.
# NOTE(review): in the recorded run the trial reaches TERMINATED, but the cell then
# fails with an AttributeError about `_unpickle_block` in `pandas._libs.internals`;
# presumably a pandas version mismatch between this client and the cluster — confirm.
train_tensorflow_mnist(
    num_workers=2,
    use_gpu=True,
    epochs=3,
)

0,1
Current time:,2023-08-21 10:52:45
Running for:,00:01:44.17
Memory:,6.0/47.1 GiB

Trial name,status,loc,iter,total time (s),loss,accuracy
TensorflowTrainer_4893d_00000,TERMINATED,10.42.3.10:2665,3,92.8406,2.09379,0.529464


[2m[36m(TunerInternal pid=2548)[0m   self._maybe_warn_resource_contention()
[2m[36m(pid=2665)[0m 2023-08-21 10:51:07.317740: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
[2m[36m(pid=2665)[0m To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2m[36m(pid=2665)[0m 2023-08-21 10:51:07.510469: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(pid=2665)[0m 2023-08-21 10:51:08.744926: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnv

[2m[36m(RayTrainWorker pid=2761)[0m Epoch 1/3
[2m[36m(RayTrainWorker pid=2803)[0m Epoch 1/3
 1/70 [..............................] - ETA: 5:57 - loss: 4.5837 - accuracy: 0.3438
 1/70 [..............................] - ETA: 5:51 - loss: 4.5837 - accuracy: 0.3438
 2/70 [..............................] - ETA: 20s - loss: 4.5939 - accuracy: 0.3203 
 2/70 [..............................] - ETA: 20s - loss: 4.5939 - accuracy: 0.3203 
 3/70 [>.............................] - ETA: 20s - loss: 4.5847 - accuracy: 0.3958
 3/70 [>.............................] - ETA: 20s - loss: 4.5847 - accuracy: 0.3958
 4/70 [>.............................] - ETA: 20s - loss: 4.5858 - accuracy: 0.3789
 4/70 [>.............................] - ETA: 20s - loss: 4.5858 - accuracy: 0.3789
 5/70 [=>............................] - ETA: 19s - loss: 4.5806 - accuracy: 0.3875
 5/70 [=>............................] - ETA: 19s - loss: 4.5806 - accuracy: 0.3875
 6/70 [=>............................] - ETA: 19s - loss: 

[2m[36m(TunerInternal pid=2548)[0m Total run time: 107.78 seconds (104.17 seconds for the tuning loop).


AttributeError: Can't get attribute '_unpickle_block' on <module 'pandas._libs.internals' from '/opt/conda/lib/python3.8/site-packages/pandas/_libs/internals.cpython-38-x86_64-linux-gnu.so'>