# TensorFlow模型存储

In [1]:
import os
import datetime

import tensorflow as tf
from tensorflow import keras

# Show the TensorFlow version in use; the outputs recorded in this notebook came from 1.15.0.
print(tf.version.VERSION)

1.15.0


In [2]:
# Load the MNIST train/test splits through the Keras dataset helper.
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()

# Keep only the first 1000 examples of each split. Images and labels are
# sliced identically so they stay aligned (presumably done to keep the demo fast).
train_images, train_labels = train_images[:1000], train_labels[:1000]
test_images, test_labels = test_images[:1000], test_labels[:1000]

# Report the resulting array shapes.
print(f"train_images shape: {train_images.shape}")
print(f"train_labels shape: {train_labels.shape}")
print()
print(f"test_images shape: {test_images.shape}")
print(f"test_labels shape: {test_labels.shape}")

train_images shape: (1000, 28, 28)
train_labels shape: (1000,)

test_images shape: (1000, 28, 28)
test_labels shape: (1000,)


In [3]:
def preprocess_images(images):
  """Flatten each image to a 784-long vector and scale raw pixels by 1/255.

  The division is applied only while the array still has an integer dtype,
  i.e. while it holds raw pixel values. Dividing yields a float array, so
  calling this again on its own output only reshapes and never rescales.
  That makes this cell safe to re-run without restarting the kernel; the
  previous in-place `x = x / 255.0` shrank the data again on every re-run.

  Args:
    images: NumPy array whose elements can be reshaped into rows of 28 * 28.
      Assumes unscaled inputs are integer-typed (the Keras MNIST loader is
      documented to return uint8); a float input is treated as already scaled.

  Returns:
    A 2-D array of shape (num_images, 784).
  """
  flat = images.reshape(-1, 28 * 28)
  # dtype.kind 'u' (unsigned) / 'i' (signed) marks raw, not-yet-scaled pixels.
  if flat.dtype.kind in ("u", "i"):
    return flat / 255.0
  return flat


train_images = preprocess_images(train_images)
test_images = preprocess_images(test_images)

print(f"train_images shape: {train_images.shape}")
print()
print(f"test_images shape: {test_images.shape}")

train_images shape: (1000, 784)

test_images shape: (1000, 784)


In [4]:
# Define a simple sequential model
def create_model():
  """Build and compile a small fully connected classifier.

  Layers, in order: Dense(512, relu) taking 784-feature inputs,
  Dropout(0.2), and Dense(10, softmax). The model is compiled with the
  'adam' optimizer, sparse categorical cross-entropy loss and the
  'accuracy' metric.

  Returns:
    The compiled Keras Sequential model.
  """
  layer_stack = [
      keras.layers.Dense(512, activation='relu', input_shape=(784,)),
      keras.layers.Dropout(0.2),
      keras.layers.Dense(10, activation='softmax'),
  ]
  model = tf.keras.models.Sequential(layer_stack)

  model.compile(
      optimizer='adam',
      loss='sparse_categorical_crossentropy',
      metrics=['accuracy'],
  )

  return model

# Create a baseline model instance (not yet trained here; it is fitted in a later cell)
trained_model = create_model()

# Display the model architecture
trained_model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               401920    
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                5130      
Total params: 407,050
Trainable params: 407,050
Non-trainable params: 0
_________________________________________________________________


## 保存Checkpoint形式的模型

**主要用途**
1. 使用 **已有模型** 继续进行模型训练；
2. 继续训练 **被中断训练** 的模型；

In [5]:
# Checkpoint file prefix for the first training run; the directory that
# holds the checkpoint files is derived from it.
checkpoint_path = "out/training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
checkpoint_dir

'out/training_1'

### 设置模型训练的Checkpoint回调

In [6]:
# Callback that saves only the model weights (not the full model) to checkpoint_path
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
)

# Train the model with the checkpoint callback attached
trained_model.fit(
    train_images,
    train_labels,
    epochs=10,
    validation_data=(test_images, test_labels),
    callbacks=[cp_callback],
)

# This writes a set of TensorFlow checkpoint files at the location given by
# checkpoint_path; the files are updated at the end of every epoch.

Train on 1000 samples, validate on 1000 samples
Epoch 1/10
Epoch 00001: saving model to out/training_1/cp.ckpt
Epoch 2/10
Epoch 00002: saving model to out/training_1/cp.ckpt
Epoch 3/10
Epoch 00003: saving model to out/training_1/cp.ckpt
Epoch 4/10
Epoch 00004: saving model to out/training_1/cp.ckpt
Epoch 5/10
Epoch 00005: saving model to out/training_1/cp.ckpt
Epoch 6/10
Epoch 00006: saving model to out/training_1/cp.ckpt
Epoch 7/10
Epoch 00007: saving model to out/training_1/cp.ckpt
Epoch 8/10
Epoch 00008: saving model to out/training_1/cp.ckpt
Epoch 9/10
Epoch 00009: saving model to out/training_1/cp.ckpt
Epoch 10/10
Epoch 00010: saving model to out/training_1/cp.ckpt


<tensorflow.python.keras.callbacks.History at 0x7fac6fd13ed0>

In [7]:
# List the checkpoint files written during training (IPython shell escape)
!ls -l {checkpoint_dir}

total 4784
-rw-r--r-- 1 sq sq      71 Jun  3 23:52 checkpoint
-rw-r--r-- 1 sq sq 4884624 Jun  3 23:52 cp.ckpt.data-00000-of-00002
-rw-r--r-- 1 sq sq    2195 Jun  3 23:52 cp.ckpt.data-00001-of-00002
-rw-r--r-- 1 sq sq    1219 Jun  3 23:52 cp.ckpt.index


### 使用已有Checkpoint文件为新创建的同构模型加载参数权重

In [8]:
# Create a new, untrained model instance with the same architecture.
# Weights can only be restored into a model whose network structure matches the original.
model = create_model()

# Evaluate the model before any weights are loaded
loss, acc = model.evaluate(test_images,  test_labels, verbose=2)
print("Untrained model, accuracy: {:5.2f}%".format(100 * acc))

1000/1000 - 0s - loss: 2.4024 - acc: 0.0510
Untrained model, accuracy:  5.10%


In [9]:
# Load the weights from the checkpoint at checkpoint_path
model.load_weights(checkpoint_path)

# Re-evaluate the model now that the saved weights are loaded
loss,acc = model.evaluate(test_images,  test_labels, verbose=2)
print("Restored model, accuracy: {:5.2f}%".format(100*acc))

1000/1000 - 0s - loss: 0.4229 - acc: 0.8620
Restored model, accuracy: 86.20%


### Checkpoint回调的可选设置

In [10]:
# Create a new model instance
model = create_model()

# 1. Use a `str.format` template so every checkpoint file name carries the epoch number
checkpoint_path = "out/training_2/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
# Save the initial (not yet trained) weights with the same template, as epoch 0
model.save_weights(checkpoint_path.format(epoch=0))
print()
print(f"checkpoint_path={checkpoint_path}")
print()

# 2. Pass `period=5` to the checkpoint callback so the weights are saved once every 5 epochs.
#    NOTE: an earlier version of this comment said TensorFlow keeps only the 5 most recent
#    checkpoints by default. The directory listing recorded in this notebook shows cp-0000
#    and all ten periodic checkpoints (cp-0005 ... cp-0050) still on disk, so nothing is
#    pruned with this callback setup.
#    NOTE(review): `period` is reported as deprecated in favour of `save_freq` in later
#    Keras releases - confirm before upgrading TensorFlow.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path, 
    verbose=1, 
    save_weights_only=True,
    period=5)

# Train the model with the new callback
model.fit(train_images, 
          train_labels,
          epochs=50, 
          callbacks=[cp_callback],
          validation_data=(test_images, test_labels),
          verbose=0)


checkpoint_path=out/training_2/cp-{epoch:04d}.ckpt


Epoch 00005: saving model to out/training_2/cp-0005.ckpt

Epoch 00010: saving model to out/training_2/cp-0010.ckpt

Epoch 00015: saving model to out/training_2/cp-0015.ckpt

Epoch 00020: saving model to out/training_2/cp-0020.ckpt

Epoch 00025: saving model to out/training_2/cp-0025.ckpt

Epoch 00030: saving model to out/training_2/cp-0030.ckpt

Epoch 00035: saving model to out/training_2/cp-0035.ckpt

Epoch 00040: saving model to out/training_2/cp-0040.ckpt

Epoch 00045: saving model to out/training_2/cp-0045.ckpt

Epoch 00050: saving model to out/training_2/cp-0050.ckpt


<tensorflow.python.keras.callbacks.History at 0x7face9b596d0>

In [11]:
# List every checkpoint file produced by the run above (IPython shell escape)
! ls {checkpoint_dir}

checkpoint			  cp-0025.ckpt.data-00001-of-00002
cp-0000.ckpt.data-00000-of-00002  cp-0025.ckpt.index
cp-0000.ckpt.data-00001-of-00002  cp-0030.ckpt.data-00000-of-00002
cp-0000.ckpt.index		  cp-0030.ckpt.data-00001-of-00002
cp-0005.ckpt.data-00000-of-00002  cp-0030.ckpt.index
cp-0005.ckpt.data-00001-of-00002  cp-0035.ckpt.data-00000-of-00002
cp-0005.ckpt.index		  cp-0035.ckpt.data-00001-of-00002
cp-0010.ckpt.data-00000-of-00002  cp-0035.ckpt.index
cp-0010.ckpt.data-00001-of-00002  cp-0040.ckpt.data-00000-of-00002
cp-0010.ckpt.index		  cp-0040.ckpt.data-00001-of-00002
cp-0015.ckpt.data-00000-of-00002  cp-0040.ckpt.index
cp-0015.ckpt.data-00001-of-00002  cp-0045.ckpt.data-00000-of-00002
cp-0015.ckpt.index		  cp-0045.ckpt.data-00001-of-00002
cp-0020.ckpt.data-00000-of-00002  cp-0045.ckpt.index
cp-0020.ckpt.data-00001-of-00002  cp-0050.ckpt.data-00000-of-00002
cp-0020.ckpt.index		  cp-0050.ckpt.data-00001-of-00002
cp-0025.ckpt.data-00000-of-00002  cp-0050.ckpt.index


In [12]:
# Pick the most recent checkpoint in checkpoint_dir
latest = tf.train.latest_checkpoint(checkpoint_dir)
latest

'out/training_2/cp-0050.ckpt'

In [13]:
# Create a new model instance
model = create_model()

# Load the previously saved weights from the latest checkpoint
model.load_weights(latest)

# Re-evaluate the model with the restored weights
loss, acc = model.evaluate(test_images,  test_labels, verbose=2)
print("Restored model, accuracy: {:5.2f}%".format(100*acc))

1000/1000 - 0s - loss: 0.4915 - acc: 0.8700
Restored model, accuracy: 87.00%


### Checkpoint文件集合介绍

Checkpoint是一个**格式化文件集合**，这些文件中仅保存二进制格式的模型参数权重；Checkpoint包含如下内容：
1. **索引文件**，指示哪些权重存储在哪个分片中，带有```*.index```后缀；
2. **分片文件**，一个或多个包含模型参数权重的分片文件；

### 手动保存模型参数的权重

In [14]:
# Location of the manually written checkpoint (used for both saving and loading)
manual_checkpoint_path = 'out/checkpoints/my_checkpoint'

# Save the current model's weights by hand
model.save_weights(manual_checkpoint_path)

# Build a fresh model instance with the same architecture
model = create_model()

# Restore the saved weights into the new instance
model.load_weights(manual_checkpoint_path)

# Evaluate the restored model
loss, acc = model.evaluate(test_images, test_labels, verbose=2)
print("Restored model, accuracy: {:5.2f}%".format(100 * acc))

1000/1000 - 0s - loss: 0.4915 - acc: 0.8700
Restored model, accuracy: 87.00%


## 保存完整的训练模型

### 将模型保存为HDF5文件
保存模型相关的所有内容：

- 模型配置(结构)
- 优化器配置
- 参数权重

In [15]:
# Create a new model instance
model = create_model()

# Train the model for 5 epochs (no validation data or callbacks in this run)
model.fit(train_images, train_labels, epochs=5)

# Save the entire model to a single HDF5 file
model.save('out/my_model.h5')

Train on 1000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
# Recreate the exact same model from the HDF5 file, including its weights and optimizer
new_model = keras.models.load_model('out/my_model.h5')

# Display the network architecture
new_model.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 512)               401920    
_________________________________________________________________
dropout_5 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 10)                5130      
Total params: 407,050
Trainable params: 407,050
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Evaluate the model that was loaded back from the HDF5 file
loss, acc = new_model.evaluate(test_images,  test_labels, verbose=2)
print("Restored model, accuracy: {:5.2f}%".format(100*acc))

1000/1000 - 0s - loss: 0.4351 - acc: 0.8560
Restored model, accuracy: 85.60%
