In [2]:
import tensorflow as tf
# 查询系统可用的 GPU
physical_devices = tf.config.experimental.list_physical_devices('GPU')
# 确保有可用的 GPU 如果没有, 则会报错
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
# 设置参数,该段务必在运行jupyter的第一段代码执行，否则会无法初始化成功
# 仅在需要时申请显存空间（程序初始运行时消耗很少的显存，随着程序的运行而动态申请显存）
tf.config.experimental.set_memory_growth(physical_devices[0], True)
import numpy as np

#### 一、神经网络的正向传播和反向传播

#### 自动求导机制
- 梯度求导函数：tf.GradientTape
- GradientTape是eager模式下计算梯度的函数，TensorFlow2.0默认以eager模式执行。

In [3]:
# 举例使用这个函数计算梯度
x =tf.constant(3.0) # 定义常数
with tf.GradientTape() as g:
    g.watch(x) # watch的作用：确保某个tensor被tape追踪
    y = x*x + x**3
dy_dx = g.gradient(y,x) #参数里面的y是指刚刚定义的函数，那么x就是对其中哪一个参数求导
tf.print(dy_dx)

33


In [4]:
# 定义一个求导公式并求取梯度值
def dy_du(u:float):
    u = tf.constant(u)
    with tf.GradientTape(persistent=False) as g:
        g.watch(u) # watch的作用：确保某个tensor被tape追踪
        y = u**2 + u**3 + u**np.exp(1)
    dy_du = g.gradient(y,u)
    return tf.print(dy_du)

In [5]:
# 求偏导数
dy_du([3.0,2.0,4.0,0,6.0])

[50.9524307 24.9443512 85.4308777 0 179.071442]


In [6]:
help(tf.GradientTape)

Help on class GradientTape in module tensorflow.python.eager.backprop:

class GradientTape(builtins.object)
 |  GradientTape(persistent=False, watch_accessed_variables=True)
 |  
 |  Record operations for automatic differentiation.
 |  
 |  Operations are recorded if they are executed within this context manager and
 |  at least one of their inputs is being "watched".
 |  
 |  Trainable variables (created by `tf.Variable` or `tf.compat.v1.get_variable`,
 |  where `trainable=True` is default in both cases) are automatically watched.
 |  Tensors can be manually watched by invoking the `watch` method on this context
 |  manager.
 |  
 |  For example, consider the function `y = x * x`. The gradient at `x = 3.0` can
 |  be computed as:
 |  
 |  ```python
 |  x = tf.constant(3.0)
 |  with tf.GradientTape() as g:
 |    g.watch(x)
 |    y = x * x
 |  dy_dx = g.gradient(y, x) # Will compute to 6.0
 |  ```
 |  
 |  GradientTapes can be nested to compute higher-order derivatives. For example,
 | 

In [7]:
tf.GradientTape(persistent=False,watch_accessed_variables=True)

<tensorflow.python.eager.backprop.GradientTape at 0x7fc9c41846d0>

- persistent:指定创建的GradientTape是否可持续调用。False表示只能调用一次gradient()函数。是否链式求导
- watch_accessed_variables：表明这个GradientTape是不是会自动追踪任何能够被训练的变量。如果是False则需要指定追踪哪些变量

gradient(target,sources)
##### 根据tape上面的上下文来计算某个或者某些tensor的梯度参数
- target 被微分的tensor，可以理解为损失值
- sources Tensors 或者变量列表
gradient(target,sources)
- 返回值：一个列表表示各个变量的梯度值，和sources中的变量列表一一对应，表明这个变量的梯度

#### 结合模型的数据调用tf.GradientTape示例

In [8]:
### 14:43

In [9]:
loss_object = tf.keras.losses.CategoricalCrossentropy() # 定义损失函数
optimizer = tf.keras.optimizers.Adam() # 定义优化方法

#### 正向传播和反向传播的调用示例

```python
with tf.GradientTape() as tape:
    predictions = model(data)
    loss = loss_object(labels,predictions)
gradients = tape.gradient(loss,model.trainable_variables) # 定义梯度
optimizer.apply_gradients(zip(gradients,model.trainable_variables)) #定义调用
```

#### 将计算出来的梯度更新到变量上面去
#### apply_gradients(grads_and_vars,name = None)

 ##### 案例1：实现模型自动求导

构建模型（前向传播）-->定义损失函数-->定义优化函数-->定义tape
-->模型得到预测值-->前向传播得到loss-->反向传播-->用优化函数将计算出来的梯度更新到变量上

In [10]:
# 定义一个类表示前向传播的过程
class MyModel(tf.keras.Model):

    def __init__(self,num_classes=10):
        super().__init__(name = 'my_model')
        self.num_classes = num_classes
        # 定义自己需要的层
        self.dense_1 = tf.keras.layers.Dense(32,activation='relu') # 隐藏层
        self.dense_2 = tf.keras.layers.Dense(num_classes) # 输出层
        
    def call(self,inputs):
        # 定义前向传播
        # 使用在__init__定义的层
        x = self.dense_1(inputs) # 经过第一个隐藏层
        return self.dense_2(x) # 返回预测值

In [11]:
import numpy as np
data = np.random.random((1000,32))
labels = np.random.random((1000,10)) # 10个类别
data = tf.cast(data, tf.float32) # numpy转换为tensor
labels = tf.cast(labels, tf.float32)

In [12]:
labels[0:2] # 每一个数组的对应位置表示这个数据归属于哪一类的概率较高

<tf.Tensor: shape=(2, 10), dtype=float32, numpy=
array([[0.22049998, 0.31213003, 0.5519902 , 0.9000708 , 0.16034448,
        0.39964816, 0.9113272 , 0.67348826, 0.37222522, 0.08691391],
       [0.62849903, 0.30306992, 0.7365023 , 0.04017013, 0.6432007 ,
        0.9647105 , 0.02479025, 0.7382544 , 0.42798683, 0.46305203]],
      dtype=float32)>

In [13]:
type(data)

tensorflow.python.framework.ops.EagerTensor

In [14]:
help(np.random.random)

Help on built-in function random:

random(...) method of numpy.random.mtrand.RandomState instance
    random(size=None)
    
    Return random floats in the half-open interval [0.0, 1.0). Alias for
    `random_sample` to ease forward-porting to the new random API.



In [15]:
model = MyModel(num_classes=10)

loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True) # 定义损失函数
optimizer = tf.keras.optimizers.Adam() # 定义优化方法

with tf.GradientTape() as tape:
    predictions = model(data)
    loss = loss_object(labels,predictions)
gradients = tape.gradient(loss,model.trainable_variables) # 定义梯度
optimizer.apply_gradients(zip(gradients,model.trainable_variables)) #定义调用，将梯度结果更新到变量上去

<tf.Variable 'UnreadVariable' shape=() dtype=int64, numpy=1>

In [16]:
type(model.trainable_variables)

list

In [17]:
len(model.trainable_variables)

4

In [18]:
# 这里包含了权重矩阵和偏置向量，0和2是权重矩阵，1和3是偏置向量
print(model.trainable_variables[0].shape)
print(model.trainable_variables[1].shape)
print(model.trainable_variables[2].shape)
print(model.trainable_variables[3].shape)

(32, 32)
(32,)
(32, 10)
(10,)


#### 案例2、使用GradientTape自定义训练模型

In [19]:
# 定义一个类表示前向传播的过程
class MyModel(tf.keras.Model):

    def __init__(self,num_classes=10):
        super(MyModel,self).__init__(name = 'my_model')
        self.num_classes = num_classes
        # 定义自己需要的层
        self.dense_1 = tf.keras.layers.Dense(32,activation='relu') # 隐藏层
        self.dense_2 = tf.keras.layers.Dense(num_classes) # 输出层
        
    def call(self,inputs):
        # 定义前向传播
        # 使用在__init__定义的层
        x = self.dense_1(inputs) # 经过第一个隐藏层
        return self.dense_2(x) # 返回预测值

In [20]:
data = np.random.random((1000,32))
labels = np.random.random((1000,10)) # 10个类别
# data = tf.cast(data, tf.float32) 
# labels = tf.cast(labels, tf.float32)

In [21]:
model = MyModel(num_classes = 10)

optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3)

loss_fn = tf.keras.losses.CategoricalCrossentropy()

batch_size = 64
train_dataset = tf.data.Dataset.from_tensor_slices((data, labels))

train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)

In [23]:
train_dataset

<BatchDataset shapes: ((None, 32), (None, 10)), types: (tf.float64, tf.float64)>

In [24]:
epochs = 3
for epoch in range(epochs):
    print(f'start of epoch {(epoch,)}')
    
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            logits = model(x_batch_train, training = True) # 执行前向传播
            
            loss_value = loss_fn(y_batch_train, logits) # 计算每一轮迭代的损失
            
        grads = tape.gradient(loss_value, model.trainable_weights)
        
        # 通过梯度更新参数
        optimizer.apply_gradients(zip(grads, model.trainable_weights))
        
        if step % 200 == 0:
            print(f"training loss for one batch at step {step}:\t{float(loss_value)}")
            print(f"seen so far:{(step+1)*64} samples")

start of epoch (0,)


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

training loss for one batch at step 0:	39.42831039428711
seen so far:64 samples
start of epoch (1,)
training loss for one batch at step 0:	12.252761840820312
seen so far:64 samples
start of epoch (2,)
training loss for one batch at step 0:	11.841941833496094
seen so far:64 samples


##### 案例3： 使用GradientTape自定义训练模型进阶（加入评估函数）
让我们将 metric添加到组合中，下面可以从头开始编写的训练循环中随时使用内置指标（或编写自定义指标）。流程如下：
- 再循环开始时初始化metric
- metric.update_state():每个batch之后更新
- metric.result():需要显示metric的当前值时调用
- metric.reset_states():需要清除metric状态时重置（通常在每个epoch的结尾）

In [25]:
# 定义一个类表示前向传播的过程
class MyModel(tf.keras.Model):

    def __init__(self,num_classes=10):
        super(MyModel,self).__init__(name = 'my_model')
        self.num_classes = num_classes
        # 定义自己需要的层
        self.dense_1 = tf.keras.layers.Dense(32,activation='relu') # 隐藏层
        self.dense_2 = tf.keras.layers.Dense(num_classes) # 输出层
        
    def call(self,inputs):
        # 定义前向传播
        # 使用在__init__定义的层
        x = self.dense_1(inputs) # 经过第一个隐藏层
        return self.dense_2(x) # 返回预测值

In [26]:
x_train = np.random.random((1000,32))
y_train = np.random.random((1000,10))

x_val = np.random.random((200,32))
y_val = np.random.random((200,10))

x_test = np.random.random((200,32))
y_test = np.random.random((200,10))

In [27]:
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3)
loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

train_acc_metric = tf.keras.metrics.CategoricalAccuracy()
val_acc_metric = tf.keras.metrics.CategoricalAccuracy()

In [28]:
batch_size = 64
train_dataset = tf.data.Dataset.from_tensor_slices((x_train,y_train))
train_dataset = train_dataset.shuffle(buffer_size = 1024).batch(batch_size)

val_dataset = tf.data.Dataset.from_tensor_slices((x_val,y_val))
val_dataset = val_dataset.batch(batch_size)

In [29]:
model = MyModel(num_classes=10)

In [30]:
epochs = 3
for epoch in range(epochs):
    print(f'start of epoch {(epoch,)}')
    
    # 上文已经定义好了batch_size大小是64
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        
        # 先定义跟踪梯度的结构
        with tf.GradientTape() as tape:
            logits = model(x_batch_train, training = True) # 预测值
            loss_value = loss_fn(y_batch_train, logits)
        
        # 对每一个变量求梯度
        grads = tape.gradient(loss_value, model.trainable_weights)
        
        # 通过梯度反向传播更新参数
        optimizer.apply_gradients(zip(grads, model.trainable_weights))
        
        # 完成反向传播以后评估预测效果
        train_acc_metric(y_batch_train,logits)
    
    train_acc = train_acc_metric.result()
    print("Training acc over epoch{(float(train_acc),)}")
    train_acc_metric.reset_states()
        
#         if step % 200 == 0:
#             print(f"training loss for one batch at step {step}:\t{float(loss_value)}")
#             print(f"seen so far:{(step+1)*64} samples")

    for x_batch_val, y_batch_val in val_dataset:
        val_logits = model(x_batch_val)
        val_acc_metric(y_batch_val, val_logits)
    val_acc = val_acc_metric.result()
    print(f"val acc {(float(val_acc),)}")
    val_acc_metric.reset_states()

start of epoch (0,)


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Training acc over epoch{(float(train_acc),)}
val acc (0.10499999672174454,)
start of epoch (1,)
Training acc over epoch{(float(train_acc),)}
val acc (0.10000000149011612,)
start of epoch (2,)
Training acc over epoch{(float(train_acc),)}
val acc (0.0949999988079071,)


In [31]:
help(tf.keras.metrics.CategoricalAccuracy.result)

Help on function result in module tensorflow.python.keras.metrics:

result(self)
    Computes and returns the metric value tensor.
    
    Result computation is an idempotent operation that simply calculates the
    metric value using the state variables.

