In [None]:
import json
import os
import sys

In [None]:
# Expose no CUDA devices to this process and to any child process that
# inherits this environment, so the demo runs on CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [None]:
# Drop any TF_CONFIG already present in the environment; the None default
# makes this a no-op when the variable is not set.
os.environ.pop('TF_CONFIG', None)

In [None]:
# Put the current directory at the front of the import path (only once) so
# that cifar10.py, written by a later %%writefile cell, can be imported.
if '.' not in sys.path:
  sys.path.insert(0, '.')

In [None]:
import tensorflow as tf

In [None]:
%%writefile cifar10.py

import os
import tensorflow as tf
from tensorflow.keras import models, layers, datasets
import numpy as np

def cifar_dataset(bs):
    """Build a shuffled, endlessly repeating, batched CIFAR-10 training dataset.

    Args:
        bs: Number of examples per batch.

    Returns:
        A `tf.data.Dataset` of `(images, labels)` batches. Pixel values are
        divided by 255 as float32 and labels are cast to int64. The dataset
        repeats indefinitely, so callers must bound each epoch themselves
        (e.g. with `steps_per_epoch` in `Model.fit`).
    """
    # Only the training split is used; the test split is discarded.
    (x_train, y_train), _ = datasets.cifar10.load_data()
    x_train = x_train / np.float32(255)
    y_train = y_train.astype(np.int64)

    # Size the shuffle buffer from the loaded data instead of hard-coding
    # 50000, so the buffer always spans the full training set.
    train_dataset = (
        tf.data.Dataset.from_tensor_slices((x_train, y_train))
        .shuffle(len(x_train))
        .repeat()
        .batch(bs)
    )

    return train_dataset

def build_and_compile_cnn_model():
    """Create and compile the small CNN classifier used in this notebook.

    The network stacks three 3x3 ReLU convolutions (32, 64 and 64 filters)
    with 2x2 max-pooling after the first two, then flattens into a 64-unit
    ReLU dense layer followed by a 10-unit dense output layer.

    Returns:
        A compiled `Sequential` model using the 'adam' optimizer, sparse
        categorical cross-entropy computed from logits, and an accuracy metric.
    """
    cnn = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(10),
    ])

    # The output layer has no activation, hence from_logits=True on the loss.
    logits_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    cnn.compile(optimizer='adam', loss=logits_loss, metrics=['accuracy'])
    return cnn

Writing cifar10.py


In [None]:
# Single-worker run: train the model in this process with no distribution
# strategy, using the helpers from the cifar10.py module written above.
import cifar10

batch_size = 64

# cifar_dataset() repeats forever, so steps_per_epoch bounds each epoch.
single_worker_dataset = cifar10.cifar_dataset(batch_size)
single_worker_model = cifar10.build_and_compile_cnn_model()
single_worker_model.fit(single_worker_dataset, epochs=3, steps_per_epoch=70)

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7feb001f4438>

In [None]:
# Cluster description: three workers on localhost, one port each. The 'task'
# entry says which of those workers a launched process is; later cells change
# 'index' before starting each additional worker.
tf_config = {
    'cluster': {
        'worker': ['localhost:12345', 'localhost:23456', 'localhost:34567']
    },
    'task': {'type': 'worker', 'index': 0}
}

In [None]:
%%writefile main.py

import os
import json

import tensorflow as tf
import cifar10

# Per-worker batch size; multiplied by the worker count below to obtain the
# global batch size the dataset is batched with.
per_worker_batch_size = 64

# TF_CONFIG must be present in the environment (a KeyError is raised
# otherwise). The worker count is the length of its cluster worker list.
tf_config = json.loads(os.environ['TF_CONFIG'])
num_workers = len(tf_config['cluster']['worker'])

# NOTE(review): the strategy presumably discovers the cluster from TF_CONFIG
# itself, since nothing is passed in here - confirm against the TF docs.
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

global_batch_size = per_worker_batch_size * num_workers
multi_worker_dataset = cifar10.cifar_dataset(global_batch_size)

# The model is built and compiled inside the strategy scope; fit() is then
# called outside of it.
with strategy.scope():
    multi_worker_model = cifar10.build_and_compile_cnn_model()

# cifar_dataset() repeats forever, so steps_per_epoch bounds each epoch.
multi_worker_model.fit(multi_worker_dataset, epochs=3, steps_per_epoch=70)

Writing main.py


In [None]:
!ls *.py

cifar10.py  main.py


In [None]:
# Export the cluster spec as JSON. main.py reads it back from TF_CONFIG, and
# shell cells started from this notebook see the same environment.
os.environ['TF_CONFIG'] = json.dumps(tf_config)

In [None]:
!echo ${TF_CONFIG}

{"cluster": {"worker": ["localhost:12345", "localhost:23456", "localhost:34567"]}, "task": {"type": "worker", "index": 0}}


In [None]:
%killbgscripts

All background processes were killed.


In [None]:
%%bash --bg
# Start the first worker (task index 0) in the background; stdout and stderr
# are both redirected to job_0.log.
python main.py &> job_0.log

Starting job # 0 in a separate thread.


In [None]:
# Re-export TF_CONFIG with task index 1 so the next process launched from
# this notebook runs as the second worker.
tf_config['task']['index'] = 1
os.environ['TF_CONFIG'] = json.dumps(tf_config)

In [None]:
# Pause for ten seconds before the background worker's log is inspected.
import time
time.sleep(10)

In [None]:
%%bash
cat job_0.log

2020-11-04 12:14:03.579449: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2020-11-04 12:14:05.202549: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2020-11-04 12:14:05.211708: E tensorflow/stream_executor/cuda/cuda_driver.cc:314] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2020-11-04 12:14:05.211759: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (4928ed9abf90): /proc/driver/nvidia/version does not exist
2020-11-04 12:14:05.234860: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2200000000 Hz
2020-11-04 12:14:05.235157: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x2408a00 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-11-04 12:14:05.235214: I tensorflow/com

In [None]:
%%bash --bg
# Start the second worker (task index 1) in the background; stdout and stderr
# are both redirected to job_1.log.
python main.py &> job_1.log

Starting job # 2 in a separate thread.


In [None]:
# Pause for ten seconds before the second worker's log is inspected.
time.sleep(10)

In [None]:
%%bash
cat job_1.log

2020-11-04 12:14:13.605153: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2020-11-04 12:14:15.233413: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2020-11-04 12:14:15.242565: E tensorflow/stream_executor/cuda/cuda_driver.cc:314] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2020-11-04 12:14:15.242620: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (4928ed9abf90): /proc/driver/nvidia/version does not exist
2020-11-04 12:14:15.249676: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2200000000 Hz
2020-11-04 12:14:15.249921: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x30eaa00 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-11-04 12:14:15.249958: I tensorflow/com

In [None]:
# Re-export TF_CONFIG with task index 2 so the next process launched from
# this notebook runs as the third worker.
tf_config['task']['index'] = 2
os.environ['TF_CONFIG'] = json.dumps(tf_config)

In [None]:
%%bash
# Run the third worker (task index 2) in the foreground, so its output is
# shown directly in this cell instead of going to a log file.
python main.py

Epoch 1/3
Epoch 2/3
Epoch 3/3


2020-11-04 12:14:23.665072: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2020-11-04 12:14:25.276347: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2020-11-04 12:14:25.285453: E tensorflow/stream_executor/cuda/cuda_driver.cc:314] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2020-11-04 12:14:25.285500: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (4928ed9abf90): /proc/driver/nvidia/version does not exist
2020-11-04 12:14:25.292691: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2200000000 Hz
2020-11-04 12:14:25.292940: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x223ca00 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-11-04 12:14:25.292970: I tensorflow/com

In [None]:
%%bash
cat job_0.log

2020-11-04 12:14:03.579449: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2020-11-04 12:14:05.202549: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2020-11-04 12:14:05.211708: E tensorflow/stream_executor/cuda/cuda_driver.cc:314] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2020-11-04 12:14:05.211759: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (4928ed9abf90): /proc/driver/nvidia/version does not exist
2020-11-04 12:14:05.234860: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2200000000 Hz
2020-11-04 12:14:05.235157: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x2408a00 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-11-04 12:14:05.235214: I tensorflow/com

In [None]:
%%bash
cat job_1.log

2020-11-04 12:14:13.605153: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2020-11-04 12:14:15.233413: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2020-11-04 12:14:15.242565: E tensorflow/stream_executor/cuda/cuda_driver.cc:314] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2020-11-04 12:14:15.242620: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (4928ed9abf90): /proc/driver/nvidia/version does not exist
2020-11-04 12:14:15.249676: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2200000000 Hz
2020-11-04 12:14:15.249921: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x30eaa00 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-11-04 12:14:15.249958: I tensorflow/com

In [None]:
# Clean up: remove TF_CONFIG from the environment and stop the background
# worker processes started by the earlier `%%bash --bg` cells.
os.environ.pop('TF_CONFIG', None)
%killbgscripts

All background processes were killed.
