In [1]:
import json
import os
import sys
import time

In [2]:
# Disable GPU use by hiding all CUDA devices from this process.
os.environ.update(CUDA_VISIBLE_DEVICES="-1")

# Remove TF_CONFIG if it is already set (e.g. left over from an earlier run).
if 'TF_CONFIG' in os.environ:
  del os.environ['TF_CONFIG']

# Put the current directory on the import path so that modules written
# there by later cells can be imported.
if sys.path.count('.') == 0:
  sys.path.insert(0, '.')

In [3]:
%%writefile cifar.py

# Data model
import os

import tensorflow as tf
from keras.datasets import cifar10
import numpy as np

def get_data(batch_size):
  """Build the CIFAR-10 training input pipeline.

  Loads the CIFAR-10 training images and labels (the second split returned
  by `cifar10.load_data()` is discarded), rescales the images and wraps
  everything in a `tf.data.Dataset`.

  Args:
    batch_size: Number of examples per batch produced by the dataset.

  Returns:
    A `tf.data.Dataset` of `(image, label)` batches that is shuffled with a
    buffer of 1998 elements and repeated, so it never runs out of data.
  """
  (images, labels), _ = cifar10.load_data()
  # The images are uint8 with values in [0, 255]; dividing by a float32 255
  # rescales them to float32 values in [0, 1].
  images = images / np.float32(255)
  labels = labels.astype(np.int64)

  dataset = tf.data.Dataset.from_tensor_slices((images, labels))
  dataset = dataset.shuffle(1998)
  dataset = dataset.repeat()
  dataset = dataset.batch(batch_size)
  return dataset


def build_and_compile_cnn_model():
  """Create and compile a small CNN classifier for 32x32 RGB images.

  The network is: Conv2D (32 filters, 3x3, selu) -> 2x2 max pooling ->
  Flatten -> Dense(32, selu) -> Dense(10). The final layer has no
  activation, so the loss is configured with `from_logits=True`.

  Returns:
    A compiled `tf.keras.Sequential` model using sparse categorical
    cross-entropy, SGD with a learning rate of 0.001, and an accuracy metric.
  """
  layers = tf.keras.layers
  layer_stack = [
      layers.Conv2D(32, (3, 3), activation='selu', input_shape=(32, 32, 3)),
      layers.MaxPooling2D((2, 2)),
      layers.Flatten(),
      layers.Dense(32, activation='selu'),
      layers.Dense(10),
  ]
  model = tf.keras.Sequential(layer_stack)

  loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  sgd = tf.keras.optimizers.SGD(learning_rate=0.001)
  model.compile(loss=loss_fn, optimizer=sgd, metrics=['accuracy'])
  return model

Writing cifar.py


In [4]:
import cifar

# Sanity check: train on a single worker before moving to the multi-worker setup.
batch_size = 64
single_worker_dataset = cifar.get_data(batch_size)
single_worker_model = cifar.build_and_compile_cnn_model()
# get_data() repeats its dataset indefinitely, so steps_per_epoch bounds each epoch.
single_worker_model.fit(single_worker_dataset, epochs=3, steps_per_epoch=70)

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f22d02e4410>

In [5]:
%%writefile kernel.py

import os
import json

import tensorflow as tf
import cifar

# Batch size assigned to each individual worker.
per_worker_batch_size = 64

# TF_CONFIG holds the cluster description as JSON; the number of workers is
# the number of addresses listed under cluster -> worker.
tf_config = json.loads(os.environ['TF_CONFIG'])
worker_addresses = tf_config['cluster']['worker']
num_workers = len(worker_addresses)

# The strategy is created before the dataset and the model are built.
strategy = tf.distribute.MultiWorkerMirroredStrategy()

# The dataset is batched with the global batch size
# (per-worker batch size times the number of workers).
global_batch_size = num_workers * per_worker_batch_size
multi_worker_dataset = cifar.get_data(global_batch_size)

# Model building/compiling need to be within `strategy.scope()`.
with strategy.scope():
  multi_worker_model = cifar.build_and_compile_cnn_model()


multi_worker_model.fit(
    multi_worker_dataset,
    epochs=3,
    steps_per_epoch=70,
)

Writing kernel.py


In [6]:
# Cluster description for three local workers; this first configuration
# identifies the current process as worker 0.
tf_config = dict(
    cluster=dict(
        worker=['localhost:12345', 'localhost:14464', 'localhost:23456'],
    ),
    task=dict(type='worker', index=0),
)

# Export the configuration as JSON through the TF_CONFIG environment variable.
os.environ['TF_CONFIG'] = json.dumps(tf_config)

In [7]:
# Kill any background scripts left over from previous runs before launching new workers.
%killbgscripts

All background processes were killed.


In [8]:
%%bash --bg
# Run the training script in the background; stdout and stderr both go to job_0.log.
python kernel.py &> job_0.log

Starting job # 0 in a separate thread.


In [9]:
# Pause for a few seconds so the background job has time to write to its log.
time.sleep(5)

In [10]:
%%bash
# Print what the first background job has logged so far.
cat job_0.log

2021-04-21 07:06:26.123555: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-04-21 07:06:28.437870: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-04-21 07:06:28.438796: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-04-21 07:06:28.448390: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2021-04-21 07:06:28.448455: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (f4c974da2003): /proc/driver/nvidia/version does not exist
2021-04-21 07:06:28.449472: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-04-21 07:06:28.450084: I tensorflow/compiler/jit/xla_gpu_device.cc:99] 

In [11]:
# Switch the task index to 1 and re-export TF_CONFIG for the next worker process.
tf_config['task']['index'] = 1
os.environ['TF_CONFIG'] = json.dumps(tf_config)

In [12]:
%%bash --bg
# Run the training script in the background; stdout and stderr both go to job_1.log.
python kernel.py &> job_1.log

Starting job # 2 in a separate thread.


In [13]:
# Pause for a few seconds so the background job has time to write to its log.
time.sleep(5)

In [14]:
# Print what the second background job has logged so far.
cat job_1.log

2021-04-21 07:06:38.597866: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-04-21 07:06:40.871336: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-04-21 07:06:40.872231: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-04-21 07:06:40.881574: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2021-04-21 07:06:40.881639: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (f4c974da2003): /proc/driver/nvidia/version does not exist
2021-04-21 07:06:40.882533: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-04-21 07:06:40.882933: I tensorflow/compiler/jit/xla_gpu_device.cc:99] 

In [15]:
# Switch the task index to 2 and re-export TF_CONFIG for the next worker process.
tf_config['task']['index'] = 2
os.environ['TF_CONFIG'] = json.dumps(tf_config)

In [16]:
%%bash --bg
# Run the training script in the background; stdout and stderr both go to job_2.log.
python kernel.py &> job_2.log

Starting job # 3 in a separate thread.


In [17]:
# Wait longer this time, now that all three worker processes have been launched.
time.sleep(15)

In [19]:
# Print what the third background job has logged so far.
cat job_2.log

2021-04-21 07:06:52.240035: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-04-21 07:06:54.555570: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-04-21 07:06:54.556468: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-04-21 07:06:54.566419: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2021-04-21 07:06:54.566483: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (f4c974da2003): /proc/driver/nvidia/version does not exist
2021-04-21 07:06:54.567480: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-04-21 07:06:54.567900: I tensorflow/compiler/jit/xla_gpu_device.cc:99] 

In [22]:
# Re-read the first job's log now that the other workers have been started.
cat job_0.log

2021-04-21 07:06:26.123555: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-04-21 07:06:28.437870: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-04-21 07:06:28.438796: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-04-21 07:06:28.448390: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2021-04-21 07:06:28.448455: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (f4c974da2003): /proc/driver/nvidia/version does not exist
2021-04-21 07:06:28.449472: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-04-21 07:06:28.450084: I tensorflow/compiler/jit/xla_gpu_device.cc:99] 

In [23]:
# Clean up: unset TF_CONFIG in this kernel and kill the background scripts.
os.environ.pop('TF_CONFIG', None)
%killbgscripts

All background processes were killed.
