In [1]:
import tensorflow as tf 
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
import numpy as np
import pathlib
tf.__version__

'2.0.0'

## 批量归一化层

对全连接层和卷积层做批量归一化的方法稍有不同。下面我们将分别介绍这两种情况下的批量归一化。

### 对全连接层做批量归一化

我们先考虑如何对全连接层做批量归一化。通常，我们将批量归一化层置于全连接层中的仿射变换和激活函数之间。设全连接层的输入为$\boldsymbol{u}$，权重参数和偏差参数分别为$\boldsymbol{W}$和$\boldsymbol{b}$，激活函数为$\phi$。设批量归一化的运算符为$\text{BN}$。那么，使用批量归一化的全连接层的输出为

$$\phi(\text{BN}(\boldsymbol{x})),$$

其中批量归一化输入$\boldsymbol{x}$由仿射变换

$$\boldsymbol{x} = \boldsymbol{W\boldsymbol{u} + \boldsymbol{b}}$$

得到。考虑一个由$m$个样本组成的小批量，仿射变换的输出为一个新的小批量$\mathcal{B} = \{\boldsymbol{x}^{(1)}, \ldots, \boldsymbol{x}^{(m)} \}$。它们正是批量归一化层的输入。对于小批量$\mathcal{B}$中任意样本$\boldsymbol{x}^{(i)} \in \mathbb{R}^d, 1 \leq  i \leq m$，批量归一化层的输出同样是$d$维向量

$$\boldsymbol{y}^{(i)} = \text{BN}(\boldsymbol{x}^{(i)}),$$

并由以下几步求得。首先，对小批量$\mathcal{B}$求均值和方差：

$$\boldsymbol{\mu}_\mathcal{B} \leftarrow \frac{1}{m}\sum_{i = 1}^{m} \boldsymbol{x}^{(i)},$$
$$\boldsymbol{\sigma}_\mathcal{B}^2 \leftarrow \frac{1}{m} \sum_{i=1}^{m}(\boldsymbol{x}^{(i)} - \boldsymbol{\mu}_\mathcal{B})^2,$$

其中的平方计算是按元素求平方。接下来，使用按元素开方和按元素除法对$\boldsymbol{x}^{(i)}$标准化：

$$\hat{\boldsymbol{x}}^{(i)} \leftarrow \frac{\boldsymbol{x}^{(i)} - \boldsymbol{\mu}_\mathcal{B}}{\sqrt{\boldsymbol{\sigma}_\mathcal{B}^2 + \epsilon}},$$

这里$\epsilon > 0$是一个很小的常数，保证分母大于0。在上面标准化的基础上，批量归一化层引入了两个可以学习的模型参数，拉伸（scale）参数 $\boldsymbol{\gamma}$ 和偏移（shift）参数 $\boldsymbol{\beta}$。这两个参数和$\boldsymbol{x}^{(i)}$形状相同，皆为$d$维向量。它们与$\hat{\boldsymbol{x}}^{(i)}$分别做按元素乘法（符号$\odot$）和加法计算：

$${\boldsymbol{y}}^{(i)} \leftarrow \boldsymbol{\gamma} \odot \hat{\boldsymbol{x}}^{(i)} + \boldsymbol{\beta}.$$

至此，我们得到了$\boldsymbol{x}^{(i)}$的批量归一化的输出$\boldsymbol{y}^{(i)}$。
值得注意的是，可学习的拉伸和偏移参数保留了不对$\boldsymbol{x}^{(i)}$做批量归一化的可能：此时只需学出$\boldsymbol{\gamma} = \sqrt{\boldsymbol{\sigma}_\mathcal{B}^2 + \epsilon}$和$\boldsymbol{\beta} = \boldsymbol{\mu}_\mathcal{B}$。我们可以对此这样理解：如果批量归一化无益，理论上，学出的模型可以不使用批量归一化。


### 对卷积层做批量归一化

对卷积层来说，批量归一化发生在卷积计算之后、应用激活函数之前。如果卷积计算输出多个通道，我们需要对这些通道的输出分别做批量归一化，且每个通道都拥有独立的拉伸和偏移参数，并均为标量。设小批量中有$m$个样本。在单个通道上，假设卷积计算输出的高和宽分别为$p$和$q$。我们需要对该通道中$m \times p \times q$个元素同时做批量归一化。对这些元素做标准化计算时，我们使用相同的均值和方差，即该通道中$m \times p \times q$个元素的均值和方差。


### 预测时的批量归一化

使用批量归一化训练时，我们可以将批量大小设得大一点，从而使批量内样本的均值和方差的计算都较为准确。将训练好的模型用于预测时，我们希望模型对于任意输入都有确定的输出。因此，单个样本的输出不应取决于批量归一化所需要的随机小批量中的均值和方差。一种常用的方法是通过移动平均估算整个训练数据集的样本均值和方差，并在预测时使用它们得到确定的输出。可见，和丢弃层一样，批量归一化层在训练模式和预测模式下的计算结果也是不一样的。

In [2]:
def pure_batch_norm(X,gama,bate,epslon = 1e-3):
    if tf.rank(X).numpy() == 2:
        ### 输入是二维,在每个batch上做归一化
        mean = tf.reduce_mean(X,axis=0)
        std = tf.reduce_mean(tf.square(tf.subtract(X,mean)),axis=0)
        x_hat = tf.divide(tf.subtract(X,mean),tf.sqrt(std +epslon))
    elif tf.rank(X).numpy() == 4:
        ### 输入是四维在每个chanel上做归一化
        mean = tf.reduce_mean(x,axis=(0,1,2),keepdims=True)
        std = tf.reduce_mean(tf.square(tf.subtract(X,mean)),axis=(0,1,2),keepdims=True)
        x_hat = tf.divide(tf.subtract(X,mean),tf.sqrt(std +epslon))
    
    return x_hat

In [3]:
x = tf.reshape(tf.range(1,7,dtype=tf.float32),shape=(3,2))
x

<tf.Tensor: id=8, shape=(3, 2), dtype=float32, numpy=
array([[1., 2.],
       [3., 4.],
       [5., 6.]], dtype=float32)>

In [4]:
y = pure_batch_norm(x,0.1,2)
y

<tf.Tensor: id=20, shape=(3, 2), dtype=float32, numpy=
array([[-1.2245153, -1.2245153],
       [ 0.       ,  0.       ],
       [ 1.2245153,  1.2245153]], dtype=float32)>

In [5]:
x = tf.reshape(tf.range(1,19,dtype=tf.float32),shape=(1,3,3,2))
x

<tf.Tensor: id=29, shape=(1, 3, 3, 2), dtype=float32, numpy=
array([[[[ 1.,  2.],
         [ 3.,  4.],
         [ 5.,  6.]],

        [[ 7.,  8.],
         [ 9., 10.],
         [11., 12.]],

        [[13., 14.],
         [15., 16.],
         [17., 18.]]]], dtype=float32)>

In [6]:
y = pure_batch_norm(x,1,2)
y

<tf.Tensor: id=42, shape=(1, 3, 3, 2), dtype=float32, numpy=
array([[[[-1.5491643 , -1.5491643 ],
         [-1.1618732 , -1.1618732 ],
         [-0.77458215, -0.77458215]],

        [[-0.38729107, -0.38729107],
         [ 0.        ,  0.        ],
         [ 0.38729107,  0.38729107]],

        [[ 0.77458215,  0.77458215],
         [ 1.1618732 ,  1.1618732 ],
         [ 1.5491643 ,  1.5491643 ]]]], dtype=float32)>

In [7]:
tf.reshape(y,shape=(1, 2,3, 3))

<tf.Tensor: id=44, shape=(1, 2, 3, 3), dtype=float32, numpy=
array([[[[-1.5491643 , -1.5491643 , -1.1618732 ],
         [-1.1618732 , -0.77458215, -0.77458215],
         [-0.38729107, -0.38729107,  0.        ]],

        [[ 0.        ,  0.38729107,  0.38729107],
         [ 0.77458215,  0.77458215,  1.1618732 ],
         [ 1.1618732 ,  1.5491643 ,  1.5491643 ]]]], dtype=float32)>

In [85]:
def batch_norm(x,is_train,gamma,bate,moving_mean,moving_var,epslon,moment):
    
    if not is_train:
        x_hat = (x - moving_mean) / tf.sqrt(moving_var ** 2 + epslon)
    else:
        rank = len(tf.shape(x))
        if  rank == 2:
            mean = tf.reduce_mean(x,axis=0)
            std = tf.reduce_mean(tf.square(tf.subtract(x,mean)),axis=0)
        elif rank == 4:
            mean = tf.reduce_mean(x,axis=(0,1,2),keepdims=True)
            std = tf.reduce_mean(tf.square(tf.subtract(x,mean)),axis=(0,1,2),keepdims=True)

        moving_mean = mean * moment + (1.0-moment) * moving_mean
        moving_var = std * moving_mean + (1.0-moment ) *moving_var
        x_hat = (x - moving_var) / tf.sqrt(moving_var ** 2 + epslon)
    y = x_hat * gamma + bate
    return y,moving_mean,moving_var

In [86]:
batch_norm(x,True,1,0,0,0,1e-3,0.9)

(<tf.Tensor: id=17729, shape=(1, 3, 3, 2), dtype=float32, numpy=
 array([[[[-0.9953704 , -0.9916667 ],
          [-0.9861111 , -0.98333335],
          [-0.9768518 , -0.975     ]],
 
         [[-0.9675926 , -0.96666664],
          [-0.9583333 , -0.9583333 ],
          [-0.9490741 , -0.95      ]],
 
         [[-0.9398148 , -0.94166666],
          [-0.9305555 , -0.93333334],
          [-0.9212963 , -0.925     ]]]], dtype=float32)>,
 <tf.Tensor: id=17715, shape=(1, 1, 1, 2), dtype=float32, numpy=array([[[[8.099999, 9.      ]]]], dtype=float32)>,
 <tf.Tensor: id=17718, shape=(1, 1, 1, 2), dtype=float32, numpy=array([[[[215.99998, 240.     ]]]], dtype=float32)>)

In [93]:
class BatchNorm(layers.Layer):
    def __init__(self,num_features, num_dims,**kargs):
        super(BatchNorm,self).__init__(**kargs)
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, 1, 1,num_features)
        self.gamma = self.add_weight(name = "gamma",shape=shape,trainable=True,initializer='one')
        self.bate =  self.add_weight(name = "bate",shape=shape,trainable=True,initializer='zero')
        self.moving_mean =tf.constant(1,shape=shape,dtype = tf.float32)
        self.moving_var = tf.constant(0,shape=shape,dtype = tf.float32)
    @tf.function
    def call(self,x,training=None):
        is_train = True if not training else False
        x_hat,self.moving_mean,self.moving_var = self.batch_norm(x,is_train,self.moving_mean,self.moving_var,1e-5,0.9)
        y = x_hat * self.gamma + self.bate
        return y
    
    def batch_norm(self,x,is_train,moving_mean,moving_var,epslon,moment):
        if not is_train:
            x_hat = (x - moving_mean) / tf.sqrt(moving_var ** 2 + epslon)
        else:
            rank = len(tf.shape(x))
            if  rank == 2:
                mean = tf.reduce_mean(x,axis=0)
                std = tf.reduce_mean(tf.square(tf.subtract(x,mean)),axis=0)
            elif rank == 4:
                mean = tf.reduce_mean(x,axis=(0,1,2),keepdims=True)
                std = tf.reduce_mean(tf.square(tf.subtract(x,mean)),axis=(0,1,2),keepdims=True)
            moving_mean = mean * moment + (1.0-moment) * moving_mean
            moving_var = std * moving_var + (1.0-moment ) *moving_var
            x_hat = (x - moving_mean) / tf.sqrt(moving_var ** 2 + epslon)
        return x_hat,moving_mean,moving_var

In [94]:
bn = BatchNorm(2,4)

In [95]:
bn(x)

<tf.Tensor: id=20276, shape=(1, 3, 3, 2), dtype=float32, numpy=
array([[[[-2276.84   , -2245.2173 ],
         [-1644.3844 , -1612.7618 ],
         [-1011.92883,  -980.3063 ]],

        [[ -379.47327,  -347.85068],
         [  252.98228,   284.6049 ],
         [  885.43787,   917.0604 ]],

        [[ 1517.8934 ,  1549.516  ],
         [ 2150.3489 ,  2181.9717 ],
         [ 2782.8044 ,  2814.4272 ]]]], dtype=float32)>

In [96]:
model = keras.Sequential(
    [layers.InputLayer(input_shape=(28,28,1)),
     layers.Conv2D(filters = 6,kernel_size=5),
     BatchNorm(6,4),
     layers.Activation("sigmoid"),
     layers.MaxPooling2D(pool_size=2,strides=2),
     layers.Conv2D(filters = 16,kernel_size = 4),
     BatchNorm(16,4),
     layers.Activation("sigmoid"),
     layers.MaxPooling2D(pool_size=2,strides=2),
     layers.Flatten(),
     layers.Dense(units=120),
     BatchNorm(120,2),
     layers.Activation("sigmoid"),
     layers.Dense(84,activation="sigmoid"),
     BatchNorm(84,2),
     layers.Activation("sigmoid"),
     layers.Dense(10,activation="softmax")
    ]
)

In [97]:
model.compile(loss=tf.losses.sparse_categorical_crossentropy,metrics=["acc"])

In [73]:
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
x_train = x_train/255.0
x_test = x_test/255.0

batch_size = 256

x_1 = np.expand_dims(x_train,axis=-1)
x_2 = np.expand_dims(x_test,axis=-1)

def transformDs(ds):
    return ds.shuffle(5000).batch(batch_size).repeat()

ds_train = transformDs(tf.data.Dataset.from_tensor_slices((x_1,y_train)))
ds_test = transformDs(tf.data.Dataset.from_tensor_slices((x_2,y_test)))
steps_per_epoch = len(x_train) // batch_size
val_per_epoch =len(x_test) // batch_size

In [74]:
dome_data,dome_lable = next(iter(ds_train))

In [98]:
model.fit(ds_train,epochs=10,steps_per_epoch=steps_per_epoch,validation_steps=val_per_epoch,validation_data=ds_test)

Train for 234 steps, validate for 39 steps
Epoch 1/10
  1/234 [..............................] - ETA: 4:44

TypeError: An op outside of the function building code is being passed
a "Graph" tensor. It is possible to have Graph tensors
leak out of the function building context by including a
tf.init_scope in your function building code.
For example, the following function will fail:
  @tf.function
  def has_init_scope():
    my_constant = tf.constant(1.)
    with tf.init_scope():
      added = my_constant * 2
The graph tensor has name: cond_1/Identity_1:0

In [102]:
model = keras.Sequential(
    [layers.InputLayer(input_shape=(28,28,1)),
     layers.Conv2D(filters = 6,kernel_size=5),
     layers.BatchNormalization(),
     layers.Activation("sigmoid"),
     layers.MaxPooling2D(pool_size=2,strides=2),
     layers.Conv2D(filters = 16,kernel_size = 4),
     layers.BatchNormalization(),
     layers.Activation("sigmoid"),
     layers.MaxPooling2D(pool_size=2,strides=2),
     layers.Flatten(),
     layers.Dense(units=120),
     layers.BatchNormalization(),
     layers.Activation("sigmoid"),
     layers.Dense(84,activation="sigmoid"),
     layers.BatchNormalization(),
     layers.Activation("sigmoid"),
     layers.Dense(10,activation="softmax")
    ]
)
model.compile(loss=tf.losses.sparse_categorical_crossentropy,metrics=["acc"])

In [103]:
model.fit(ds_train,epochs=10,steps_per_epoch=steps_per_epoch,validation_steps=val_per_epoch,validation_data=ds_test)

Train for 234 steps, validate for 39 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x3e0887b8>