In [5]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras import datasets, models, layers, utils, losses, optimizers

In [2]:
# check available GPUs
tf.test.is_gpu_available()

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


2022-03-02 16:30:54.057039: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-02 16:30:59.019477: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /device:GPU:0 with 14639 MB memory:  -> device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:1b:00.0, compute capability: 7.0
2022-03-02 16:30:59.021382: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /device:GPU:1 with 14639 MB memory:  -> device: 1, name: Tesla V100-SXM2-16GB, pci bus id: 0000:1c:00.0, compute capability: 7.0
2022-03-02 16:30:59.023036: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /device:GPU:2 with 14639 MB memory:  -> device: 2, name: Tesla V100-SXM2-16GB, pci bus id: 0

True

 Created device /device:GPU:4 with 14639 MB memory:  -> device: 4, name: Tesla V100-SXM2-16GB, pci bus id: 0000:88:00.0, compute capability: 7.0
2022-03-02 16:30:59.027617: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /device:GPU:5 with 14639 MB memory:  -> device: 5, name: Tesla V100-SXM2-16GB, pci bus id: 0000:89:00.0, compute capability: 7.0
2022-03-02 16:30:59.029101: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /device:GPU:6 with 14639 MB memory:  -> device: 6, name: Tesla V100-SXM2-16GB, pci bus id: 0000:db:00.0, compute capability: 7.0
2022-03-02 16:30:59.030540: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /device:GPU:7 with 14639 MB memory:  -> device: 7, name: Tesla V100-SXM2-16GB, pci bus id: 0000:dd:00.0, compute capability: 7.0


# Prepare Data

In [3]:
num_classes = 10
img_size = 32

In [7]:
(x_train, y_train), (x_val, y_val) = datasets.cifar10.load_data()
print('x_train:', x_train.shape)
print('y_train:', y_train.shape)
print('x_val:', x_val.shape)
print('y_val:', y_val.shape)

x_train: (50000, 32, 32, 3)
y_train: (50000, 1)
x_val: (10000, 32, 32, 3)
y_val: (10000, 1)


In [8]:
# normalization 0~1
x_train = x_train/255.
x_val = x_val/255.

# one hot encoding
y_train = utils.to_categorical(y_train, num_classes= num_classes)
y_val = utils.to_categorical(y_val, num_classes= num_classes)

# Build model

In [10]:
# Build model
inputs = layers.Input(shape=(img_size, img_size, 3))
x = layers.Conv2D(filters=32, kernel_size=3, activation='relu', padding='same')(inputs)
x = layers.BatchNormalization()(x)
x = layers.MaxPooling2D(2)(x)
x = layers.Conv2D(filters=64, kernel_size=3, activation='relu', padding='same')(x)
x = layers.BatchNormalization()(x)
x = layers.MaxPooling2D(2)(x)
x = layers.Conv2D(filters=128, kernel_size=3, activation='relu', padding='same')(x)
x = layers.BatchNormalization()(x)
x = layers.Flatten()(x)
x = layers.Dense(512, activation='relu')(x)
prediction = layers.Dense(num_classes, activation='softmax')(x)

model = models.Model(inputs=inputs, outputs=prediction)

In [11]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 32, 32, 3)]       0         
                                                                 
 conv2d_1 (Conv2D)           (None, 32, 32, 32)        896       
                                                                 
 batch_normalization_1 (Batc  (None, 32, 32, 32)       128       
 hNormalization)                                                 
                                                                 
 max_pooling2d (MaxPooling2D  (None, 16, 16, 32)       0         
 )                                                               
                                                                 
 conv2d_2 (Conv2D)           (None, 16, 16, 64)        18496     
                                                                 
 batch_normalization_2 (Batc  (None, 16, 16, 64)       256   

# Multi Strategy

In [12]:
# 抓所有認得到的GPU,並在這些GPU上設定同步策略
mirrored_strategy = tf.distribute.MirroredStrategy() 

# 指定特定GPU
# mirrored_strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"])

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4', '/job:localhost/replica:0/task:0/device:GPU:5', '/job:localhost/replica:0/task:0/device:GPU:6', '/job:localhost/replica:0/task:0/device:GPU:7')


In [13]:
# 這裡採同步訓練,每個step算完會"聚合"一次梯度再更新; 異地不會聚合,每個GPU算完會依其結果直接更新
with mirrored_strategy.scope():
    model = model

model.compile(loss=losses.categorical_crossentropy, 
              optimizer=optimizers.Adam(),
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=64,
          epochs=10,
          verbose=1,
          validation_data=(x_val,y_val))

Epoch 1/10


2022-03-02 16:53:30.163130: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8200


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f61742e54c0>

# Evaluate

In [14]:
model.evaluate(x_val, y_val)



[1.733648657798767, 0.6980999708175659]

In [16]:
# 觀察每張GPU分布運算情形, 原先batch_size為64, 平均分到8張GPU後,每個step各自負責64/8=8筆資料
mirrored_strategy.num_replicas_in_sync

8