In [1]:
import tensorflow as tf
import numpy as np
# 查询系统可用的 GPU
physical_devices = tf.config.experimental.list_physical_devices('GPU')
# 确保有可用的 GPU 如果没有, 则会报错
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
# 设置参数,该段务必在运行jupyter的第一段代码执行，否则会无法初始化成功
# 仅在需要时申请显存空间（程序初始运行时消耗很少的显存，随着程序的运行而动态申请显存）
tf.config.experimental.set_memory_growth(physical_devices[0], True)

#### 1. TensorBoard简介
- 是一个在深度学习中很好的可视化训练过程和模型结构的工具

在TensorFlow2.0中，训练一个神经网络主要两个方式:
- 使用tf.keras模块的Model.fit()，这种方式调用tensorboard相对简单
- 使用tf.GraientTape()求解梯度，这样可以自定义训练过程，相对复杂

对于这两种方案，都可以使用TensorBoard，下面以在MNIST数据集上训练一个图像分类模型为例介绍

#### 2.Keras训练中使用
- keras在回调函数中内置Tensorboard函数

In [2]:
tf.keras.callbacks.TensorBoard(
    log_dir='logs', # 保存 TensorBoard要解析日志文件的目录的路径
    histogram_freq=0, # 在epoch中的频率，计算模型层的激活和权重直方图，如果设置为0，则不会计算直方图，必须为直方图可视化指定验证数据或拆分
    write_graph= True, # 是否在TensorBoard中可视化图像，当write_graph为True时，日志文件可能会变得很大
    write_images= False, # 是否在TensorBoard中编写模型权重以显示为图像
    update_freq= 'epoch',
    profile_batch=2, # 分析批次以采样计算特征
    embeddings_freq=0, # 可视化嵌入层的频率
    embeddings_metadata= None # 字典，它将层名次映射到文件名，该嵌入层的元数据保存在该文件名中
)

<tensorflow.python.keras.callbacks.TensorBoard at 0x7fb7001b5e20>

#### Tensorboard界面解释
- Scalars：显示了如何将loss与每个时间段改变，还可以使用它来跟踪训练速度，学习率和其他标量值
- Graphs：进行可视化模型，在这种情况下，将显示层的Keras图，这可以帮助你确保模型正确构建
- Distributions 和 Histograms:显示张量随时间的分布，这对于可视化权重和偏差并验证他们是否以预期的方式变化很有用

In [26]:
from tensorflow.keras.layers import Dense, Flatten, Conv2D
from tensorflow.keras import Model
import numpy as np
import datetime

In [27]:
print(tf.__version__)
print(np.__version__)

2.2.0
1.18.5


In [28]:
gpus = tf.config.experimental.list_physical_devices(device_type = 'GPU')

In [29]:
tf.config.experimental.set_virtual_device_configuration(
    gpus[0],
    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit = 2048)]
)

In [30]:
tf.test.is_gpu_available()

True

In [31]:
path_tmp = '/home/hp/.local/lib/python3.8/site-packages/tensorflow/keras/datasets/'
mnist = np.load(path_tmp + "mnist.npz")
x_train, y_train, x_test, y_test = mnist['x_train'],mnist['y_train'],mnist['x_test'],mnist['y_test']


In [32]:
x_train,x_test = x_train/255.0, x_test/255.0

In [33]:
# 添加一个channels dimension
x_train = x_train[..., tf.newaxis]
x_test = x_test[..., tf.newaxis]

In [34]:
class MyModel(Model):
    def __init__(self):
        super(MyModel, self).__init__()
        self.conv1 = Conv2D(32,3,activation= 'relu')
        self.flatten = Flatten()
        self.d1 = Dense(128, activation= 'relu')
        self.d2 = Dense(10, activation= 'softmax')

    def call(self, x):
        x = self.conv1(x)
        x = self.flatten(x)
        x = self.d1(x)
        return self.d2(x)

In [3]:
model = MyModel()
model.compile(
    optimizer = 'adam',
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy']
)

tensorborad_callback = tf.keras.callbacks.TensorBoard(
    log_dir= 'keras_logv1',
    histogram_freq=1,
    profile_batch=100000000
)

NameError: name 'MyModel' is not defined

In [4]:
model.fit(
    x = x_train,
    y = y_train,
    epochs = 20,
    validation_data = (x_test,y_test),
    callbacks = [tensorborad_callback]
)

NameError: name 'model' is not defined

如果想用 localhost 为本地ip 打开 tensorboard，需要指定:

tensorboard --logdir keras_logv1 --host=本机IP --port=21886，具体参见杨璐的jisuan01最新版文档

In [5]:
%reload_ext tensorboard

In [6]:
!ls

 1_1_tensorflow2.0简介.ipynb
 2_1_张量操作和三类自定义模型.ipynb
 2_2_keras模型训练.ipynb
 2_3_tensorflow2.0模型训练.ipynb
 2_4_计算图机制详解.ipynb
 2_5_模型的保存与加载.ipynb
 3_1_自定义层详解.ipynb
 3_2_常用损失函数与自定义损失函数.ipynb
 3_3_常用评估函数和自定义评估函数.ipynb
 3_4_tensorboard可视化训练过程和模型结构.ipynb
 4_1_tf.data简介.ipynb
 4_2_tf.data简介.ipynb
 4_3_tfrecord详解.ipynb
 5_1_cnn卷积神经网络介绍.ipynb
'5_2_实战：Quick,Draw! Google涂鸦识别挑战项目（上）.ipynb'
 5_3_1_方法一构建CNN_generator读取数据.ipynb
 5_3_2_方法二构建CNN_Textlinedataset读取数据.ipynb
 5_3_3_方法三构建CNN_TFRecord读取数据.ipynb
 5_3_对涂鸦识别数据做预处理.ipynb
'5_3_实战：Quick,Draw! Google涂鸦识别挑战项目（下）.ipynb'
 6_1_循环神经网络简介.ipynb
 6_2_word2vec简介.ipynb
 6_3_实战：LSTM实现新闻分类.ipynb
 7_1_Transformer模型简介.ipynb
 7_2_实战：Transformer模型实现机器翻译算法.ipynb
 9_1_TF.hub迁移学习的使用.ipynb
 附录1：imdb数据集了解自然语言的处理方式.ipynb
 附录2：循环神经网络的理解.ipynb
 附录3：理解python装饰器.ipynb
 附录5：softmax和交叉熵的理解.ipynb
 adasd.h5
 checkpoint
 checkpoints
 keras_logv1
 keras_model_tf_version.h5
 logs
 markdown_pics
 mymodel_1.data-00000-of-00002
 mymodel_1.data-00001-

In [7]:
# 在命令行里面运行的界面可以直接打开一个网页，然后运行这个结果

%tensorboard --logdir keras_logv1 --host=192.168.254.11 --port=21866

# 然后在网页中输入  http://jisuan01.cmsc.tech:21866/ 即可查看结果  BOSS网连接

# 仍然GRAPHS显示存在问题

Reusing TensorBoard on port 21866 (pid 61356), started 166 days, 3:50:29 ago. (Use '!kill 61356' to kill it.)

#### 3.自定义训练中使用

In [40]:
tf.summary.create_file_writer(
    logdir='./test', # 定义路径
    max_queue=None, # 最多再缓存中暂存max_queue个数据，当超过max_queue个时，flush更新到日志文件中并清空缓存，默认为10
    flush_millis=None, # 至少flush_millis毫秒内进行一次flush,默认为120000毫秒
    filename_suffix=None, # 日志文件的后缀，默认为.v2
    name=None #本操作的名称
)

<tensorflow.python.ops.summary_ops_v2.ResourceSummaryWriter at 0x7f610d52d910>

##### 保存图像
- 保存形成为[k,h,w,c]的Tensor

tf.summary.image(
    name,data,step = None,max_outputs=3,description=None
)

##### 保存标量
- 保存单个数值，再tensorboard中将生成折线图

tf.summary.scalar(
    name,data,step = None,description=None
)

##### 保存文本
- 保存一个tf.string类型的Tensor

tf.summary.text(
    name,data,step = None,description=None
)

##### 保存模型权重的分布情况
- 保存一个张量，直方图或者密度图

tf.summary.histogram(
    name,data,step = None,buckets=None,description=None
)

##### 保存音频
- 保存形状为[k,t,c]的Tensor

tf.summary.audio(
    name,data,sample_rate, step = None,max_outputs=3,encoding=None, description=None
)

#### 查看 Graph和Profile信息
- tf.summary.trace_export():停止trace，并将之前trace记录到的信息写入profiler日志文件
- tf.summary.trace_off():停止trace,并舍弃之前trace记录
- tf.summary.trace_on():开始trace以记录计算图和分析信息

In [41]:
# 定义数据的本地导入
path_tmp = '/home/hp/.local/lib/python3.8/site-packages/tensorflow/keras/datasets/'
mnist = np.load(path_tmp+'mnist.npz')
x_train, y_train, x_test, y_test = mnist['x_train'],mnist['y_train'],mnist['x_test'],mnist['y_test']


x_train, x_test = x_train/255.0, x_test/255.0


# add a channels dimension
x_train = x_train[..., tf.newaxis]
x_test = x_test[..., tf.newaxis]


train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(10000).batch(32)
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(32)

In [42]:
# 定义模型
class MyModel(Model):
    def __init__(self):
        super(MyModel,self).__init__()
        self.conv1 = Conv2D(32,3,activation= 'relu')
        self.flatten = Flatten()
        self.d1 = Dense(128, activation= 'relu')
        self.d2 = Dense(10, activation='softmax')


    @tf.function # 将前向传播转换为静态图
    def call(self, x):
        x = self.conv1(x)
        x = self.flatten(x)
        x = self.d1(x)

        return self.d2(x)

In [43]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam()


# 定义损失函数，返回平均损失
train_loss = tf.keras.metrics.Mean(name = 'train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name = 'train_accuracy')

test_loss = tf.keras.metrics.Mean(name = 'test_loss')
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name = 'test_accuracy')


In [44]:
def train_step(images, labels):
    with tf.GradientTape() as tape:
        predictions = model(images)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_accuracy(labels, predictions) 




def test_step(images, labels):
    predictions = model(images)
    t_loss = loss_object(labels, predictions)

    test_loss(t_loss)
    test_accuracy(labels, predictions)
    test_tp(labels, predictions)

In [45]:
model = MyModel()

In [46]:
stamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')

In [47]:
import os
logdir = os.path.join('logs/'+stamp)

In [48]:
summary_writer = tf.summary.create_file_writer(logdir)

In [61]:
tf.summary.trace_on(graph= True,profiler= True) # 开启trance,可以记录图结构和profile信息

In [62]:
EPOCHS = 20

In [63]:
for epoch in range(EPOCHS):
    for (x_train,y_train) in train_ds:
        train_step(x_train,y_train)
    
    # 定义希望使用的记录器
    with summary_writer.as_default():
        tf.summary.scalar('loss', train_loss.result(),step = epoch)
        tf.summary.scalar('accuracy', train_accuracy.result(), step = epoch) # 还可以添加其他自定义的变量




    template = 'Epoch{}, Loss:{}, Accuracy{}, Test Loss :{}, Test Accuracy:{}'
    print(template.format(epoch + 1,
        train_loss.result(),
        train_accuracy.result() * 100,
        test_loss.result(),
        test_accuracy.result() * 100
    ))


    # 重置评估
    train_loss.reset_states()
    test_loss.reset_states()
    train_accuracy.reset_states()
    test_accuracy.reset_states()

Epoch1, Loss:0.00027669634437188506, Accuracy99.99832916259766, Test Loss :0.0, Test Accuracy:0.0


KeyboardInterrupt: 

In [None]:
# 保存trace信息到文件

with summary_writer.as_default():
    tf.summary.trace_export(name = 'model_trace',step = 3, profiler_outdir= logdir)