# 目录
## 1. 导入模块
## 2. GPU配置策略（所有GPU可见）
## 3. MirroredStrategy分布式策略训练
>相对于普通的 Keras 训练代码，需要注意修改的地方：<br/>
1. 需要有多个逻辑 GPU（多块物理 GPU，或把一块物理 GPU 划分成多个逻辑 GPU）<br/>
2. 训练的 batch_size 要按 GPU 数量放大：`batch_size_per_replica = 256; batch_size = len(logical_gpus) * batch_size_per_replica` <br/>
3. 在分布式策略的作用域内构造并编译模型，例如：

```python
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    model = keras.models.Sequential([ ...])
    model.compile()
```
  - ### 3.1 获取fashion mnist数据
  - ### 3.2 Keras 模型训练（与普通训练的不同之处：加一个 MirroredStrategy 策略）

## 1. 导入模块

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn
import matplotlib as mpl
import matplotlib.pyplot as plt

from tensorflow import keras
import tensorflow as tf

import sys
import os
import time
import datetime

# Report the name and version of each core library used in this notebook.
for module in (np, pd, sklearn, mpl, keras, tf):
    print(f"{module.__name__} {module.__version__}")

numpy 1.18.1
pandas 0.25.3
sklearn 0.22.1
matplotlib 3.1.2
tensorflow_core.python.keras.api._v2.keras 2.2.4-tf
tensorflow 2.1.0


## 2. GPU配置策略（所有GPU可见）

In [2]:
# Alternative for machines with several physical GPUs: skip the virtual split
# below and enable on-demand memory growth on every card instead.
#
# tf.debugging.set_log_device_placement(True)  # log which device runs each op
# tf.config.set_soft_device_placement(True)  # let TF place ops on a suitable device
# GPUS = tf.config.experimental.list_physical_devices("GPU")  # physical GPUs
# for gpu in GPUS:
#     # Memory growth (allocate only what is needed) has to be set at the
#     # very start of the program, before the GPUs are used.
#     tf.config.experimental.set_memory_growth(gpu, True)
# print("Physical GPU: {}".format(len(GPUS)))
#
# logical_gpus = tf.config.experimental.list_logical_devices("GPU")
# print("Logical GPU: {}".format(len(logical_gpus)))

# Single-GPU demo setup: split one physical GPU into several logical GPUs so
# the distributed strategy can be verified without multi-GPU hardware.
tf.debugging.set_log_device_placement(True)  # log which device runs each op
tf.config.set_soft_device_placement(True)  # let TF place ops on a suitable device
GPUS = tf.config.experimental.list_physical_devices("GPU")  # physical GPUs
if not GPUS:
    # Fail with a clear message instead of an IndexError on GPUS[0] below.
    raise RuntimeError(
        "No physical GPU found: this example needs at least one GPU to split "
        "into logical devices."
    )

# Restrict this program to the first physical GPU.
tf.config.experimental.set_visible_devices(GPUS[0], "GPU")
# Partition that GPU into four logical devices (by default one physical GPU
# maps to one logical GPU). Each partition gets the same memory_limit.
tf.config.experimental.set_virtual_device_configuration(
    GPUS[0],
    [
        tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2072)
        for _ in range(4)
    ],
)

print("Physical GPU: {}".format(len(GPUS)))

# Logical GPUs behave like partitions of a disk; without a distribution
# strategy, work runs on logical GPU 0 only.
logical_gpus = tf.config.experimental.list_logical_devices("GPU")
print("Logical GPU: {}".format(len(logical_gpus)))


Physical GPU: 1
Logical GPU: 4


## 3. MirroredStrategy分布式策略训练
>相对于普通的 Keras 训练代码，需要注意修改的地方：<br/>
1. 需要有多个逻辑 GPU（多块物理 GPU，或把一块物理 GPU 划分成多个逻辑 GPU）<br/>
2. 训练的 batch_size 要按 GPU 数量放大：`batch_size_per_replica = 256; batch_size = len(logical_gpus) * batch_size_per_replica` <br/>
3. 在分布式策略的作用域内构造并编译模型，例如：

```python
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    model = keras.models.Sequential([ ...])
    model.compile()
```

  - ### 3.1 获取fashion mnist数据

In [3]:
from sklearn.preprocessing import StandardScaler

# Load the Fashion-MNIST dataset through the Keras dataset helper.
fashion_mnist = keras.datasets.fashion_mnist
(x_train_all, y_train_all), (x_test, y_test) = fashion_mnist.load_data()

# The first 5000 examples become the validation split; the rest is training data.
x_valid, y_valid = x_train_all[:5000], y_train_all[:5000]
x_train, y_train = x_train_all[5000:], y_train_all[5000:]

# Standardization: every pixel is flattened into a single column, so the
# scaler is fitted on the training pixels only and then reused unchanged for
# the validation and test splits. Results are reshaped to (N, 28, 28, 1).
scaler = StandardScaler()

x_train_scaled = scaler.fit_transform(
    x_train.astype(np.float32).reshape(-1, 1)
).reshape(-1, 28, 28, 1)
x_valid_scaled = scaler.transform(
    x_valid.astype(np.float32).reshape(-1, 1)
).reshape(-1, 28, 28, 1)
x_test_scaled = scaler.transform(
    x_test.astype(np.float32).reshape(-1, 1)
).reshape(-1, 28, 28, 1)

# Show the shapes of each split's inputs and labels.
print(x_train_scaled.shape, y_train.shape)
print(x_valid_scaled.shape, y_valid.shape)
print(x_test_scaled.shape, y_test.shape)


# 制作dataset 数据集
def make_dataset(images, labels, epochs, batch_size, shuffle=True, *,
                 shuffle_buffer_size=10000, prefetch_buffer_size=50):
    """Build a repeating, batched ``tf.data.Dataset`` from in-memory arrays.

    Args:
        images: Input examples, sliced along their first axis.
        labels: Targets aligned with ``images`` along the first axis.
        epochs: Passed to ``Dataset.repeat``; ``None`` repeats indefinitely.
        batch_size: Number of examples per emitted batch.
        shuffle: Whether to shuffle the examples before repeating.
        shuffle_buffer_size: Buffer size given to ``Dataset.shuffle``. A
            buffer smaller than the dataset only shuffles approximately.
        prefetch_buffer_size: Number of elements given to
            ``Dataset.prefetch``. Prefetch is applied after batching, so
            this counts batches, not individual samples.

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, labels)`` batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices((images, labels))
    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.repeat(epochs).batch(batch_size)
    # Prepare upcoming batches in the background while the current one trains.
    return dataset.prefetch(prefetch_buffer_size)


# Each logical GPU handles batch_size_per_replica examples per step, so the
# batch size fed to the dataset is that amount times the number of logical GPUs.
batch_size_per_replica = 256
batch_size = batch_size_per_replica * len(logical_gpus)
# epochs is passed through to Dataset.repeat by make_dataset.
epochs = None
train_dataset = make_dataset(
    images=x_train_scaled,
    labels=y_train,
    epochs=epochs,
    batch_size=batch_size,
)

(55000, 28, 28, 1) (55000,)
(5000, 28, 28, 1) (5000,)
(10000, 28, 28, 1) (10000,)
Executing op TensorSliceDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AnonymousRandomSeedGenerator in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ShuffleDatasetV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op BatchDatasetV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0


  - ### 3.2 Keras 模型训练（与普通训练的不同之处：加一个 MirroredStrategy 策略）

In [4]:
# The only change compared with plain Keras training: build and compile the
# model inside the scope of a MirroredStrategy.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Three blocks of two 3x3 "same"-padded convolutions followed by 2x2 max
    # pooling (32, 64, then 128 filters), then a dense classifier head.
    model = keras.models.Sequential([
        keras.layers.Conv2D(filters=32, kernel_size=3, padding="same", activation="relu", input_shape=(28, 28, 1)),
        keras.layers.Conv2D(filters=32, kernel_size=3, padding="same", activation="relu"),
        keras.layers.MaxPool2D(pool_size=2),

        keras.layers.Conv2D(filters=64, kernel_size=3, padding="same", activation="relu"),
        keras.layers.Conv2D(filters=64, kernel_size=3, padding="same", activation="relu"),
        keras.layers.MaxPool2D(pool_size=2),

        keras.layers.Conv2D(filters=128, kernel_size=3, padding="same", activation="relu"),
        keras.layers.Conv2D(filters=128, kernel_size=3, padding="same", activation="relu"),
        keras.layers.MaxPool2D(pool_size=2),

        keras.layers.Flatten(),
        keras.layers.Dense(128, activation="relu"),
        keras.layers.Dense(10, activation="softmax"),
    ])

    # Use `learning_rate`: the `lr` keyword is a deprecated alias.
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=keras.optimizers.SGD(learning_rate=0.01),
        metrics=["accuracy"],
    )



INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3')
Executing op RandomUniform in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Sub in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Add in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarIsInitializedOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op LogicalNot in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Assert in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ReadVariableOp in device /job:localhos

In [5]:
# Train for 10 epochs. steps_per_epoch is the number of whole batches in the
# training set, which defines where one epoch ends.
model.fit(
    train_dataset,
    epochs=10,
    steps_per_epoch=len(x_train_scaled) // batch_size,
)

Executing op ParallelMapDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op DatasetCardinality in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RebatchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op PrefetchDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AutoShardDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Train for 53 steps
Epoch 1/10
Executing op Cast in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op OptimizeDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ModelDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MultiDeviceIterator in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MultiDeviceIteratorInit in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MultiDeviceIteratorToStringHandle in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op GeneratorDa

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
Executing op __inference_distributed_function_3379 in device /job:localhost/replica:0/task:0/device:GPU:0
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Executing op DestroyResourceOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op DeleteIterator in device /job:localhost/replica:0/task:0/device:GPU:3
Executing op DeleteIterator in device /job:localhost/replica:0/task:0/device:GPU:2
Executing op DeleteIterator in device /job:localhost/replica:0/task:0/device:GPU:1
Executing op DeleteIterator in device /job:localhost/replica:0/task:0/device:GPU:0


<tensorflow.python.keras.callbacks.History at 0x7fa1400c07b8>