<a href="https://colab.research.google.com/github/cyteena/U-net/blob/main/From_numpy_mean_to_BatchNormalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np

In [4]:
# A 3x2 matrix holding 1..6: three rows (samples), two columns (features).
arr = np.arange(1, 7).reshape(3, 2)
# Reducing over axis 0 collapses the rows, leaving one mean per column.
mean_along_axis_0 = np.mean(arr, 0)
print(mean_along_axis_0)

[3. 4.]


In [5]:
arr.shape

(3, 2)

In [6]:
# A 2x2x2x2 tensor holding 1..16 in row-major order.
arr = np.arange(1, 17).reshape(2, 2, 2, 2)
# Reduce over the first and third axes at once; axes 1 and 3 survive.
reduced_axes = (0, 2)
mean_along_multiple_axes = np.mean(arr, axis=reduced_axes)
print(mean_along_multiple_axes)

[[ 6.  7.]
 [10. 11.]]


In [8]:
mean_along_multiple_axes.shape

(2, 2)

In [10]:
# A 2x2 matrix holding 1..4.
arr = np.arange(1, 5).reshape(2, 2)
# keepdims=True leaves the reduced axis in place with length 1, so the
# result stays 2-D and broadcasts cleanly against the original array.
mean_along_axis_0_with_keepdims = np.mean(arr, 0, keepdims=True)
print(mean_along_axis_0_with_keepdims)

[[2. 3.]]


In [12]:
mean_along_axis_0_with_keepdims.shape

(1, 2)

In [13]:
arr.shape

(2, 2)

In [14]:
# Rebuild the 2x2x2x2 tensor holding 1..16 in row-major order.
arr = np.arange(1, 17).reshape(2, 2, 2, 2)

In [15]:
# Average over axes 0 and 2 together; the result keeps axes 1 and 3.
mean_along_multiple_axes = arr.mean(axis=(0, 2))

In [16]:
mean_along_multiple_axes

array([[ 6.,  7.],
       [10., 11.]])

In [17]:
arr.shape

(2, 2, 2, 2)

In [18]:
arr.shape[0]

2

In [19]:
# NOTE: this rebinds the name of Python's built-in sum() for the rest of the session.
sum = 0

In [20]:
count = 0

Given arr.shape = (2, 2, 2, 2), np.mean(arr, axis=(0, 2), keepdims=True)
returns an array of shape (1, 2, 1, 2); without keepdims the shape is (2, 2).

Now let's calculate the value of output at the position (0,0,0,0).

In [21]:
# Initialise the accumulators inside this cell so that re-running it always
# yields the same answer, and use `total` so the built-in sum() is not needed
# as a variable name here.
total = 0
count = 0

# Output position (0, 0, 0, 0) averages every element whose index on the
# kept axes (1 and 3) is 0, while the reduced axes (0 and 2) run over their
# full range.
for i in range(arr.shape[0]):
    for j in range(arr.shape[2]):
        total += arr[i, 0, j, 0]
        count += 1

total / count

6.0

Now we can understand Batch Normalization.

Let's see the example implementation code from Doubao.

In [None]:
import numpy as np


class BatchNormalization2D:
    """Batch normalization for 4-D input of shape (batch, channels, height, width).

    Each channel is normalized with statistics taken over the batch and both
    spatial axes, then scaled by ``gamma`` and shifted by ``beta``.

    Set ``training`` to ``False`` to normalize with the running statistics
    instead of the statistics of the current batch.
    """

    # Axes reduced when computing per-channel statistics: everything except
    # the channel axis (axis 1).
    _REDUCE_AXES = (0, 2, 3)

    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        """
        Initialize the BatchNormalization2D layer.

        Args:
            num_features (int): The number of channels (features) in the input data.
            eps (float): A small value added to the variance to avoid division by zero.
            momentum (float): Weight given to the current batch statistics when
                updating the running mean and running variance:
                ``running = (1 - momentum) * running + momentum * batch``.
        """
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum

        # Mode flag read by forward(); a new layer starts in training mode.
        self.training = True

        # Parameters to be learned
        self.gamma = np.ones((1, num_features, 1, 1))
        self.beta = np.zeros((1, num_features, 1, 1))

        # Gradients of the learned parameters, filled in by backward()
        self.dgamma = np.zeros_like(self.gamma)
        self.dbeta = np.zeros_like(self.beta)

        # Running statistics
        self.running_mean = np.zeros((1, num_features, 1, 1))
        self.running_variance = np.ones((1, num_features, 1, 1))

        # Values cached by forward() for use in backward()
        self._x_normalized = None
        self._inv_std = None
        self._used_batch_stats = False

    def forward(self, x):
        """
        Forward pass of the BatchNormalization2D layer.

        Args:
            x (numpy.ndarray): Input data of shape (batch_size, num_features, height, width).

        Returns:
            numpy.ndarray: Output data after batch normalization, same shape as ``x``.

        Raises:
            ValueError: If ``x`` is not 4-D or its channel axis does not have
                ``num_features`` entries.
        """
        x = np.asarray(x)
        if x.ndim != 4 or x.shape[1] != self.num_features:
            raise ValueError(
                "expected input of shape (batch_size, %d, height, width), got %s"
                % (self.num_features, x.shape)
            )

        if self.training:
            # Calculate mean and variance for the current batch
            mean = np.mean(x, axis=self._REDUCE_AXES, keepdims=True)
            variance = np.var(x, axis=self._REDUCE_AXES, keepdims=True)

            # Update running mean and running variance; `momentum` is the
            # weight of the new batch, so the default 0.1 changes them slowly.
            keep = 1 - self.momentum
            self.running_mean = keep * self.running_mean + self.momentum * mean
            self.running_variance = keep * self.running_variance + self.momentum * variance
        else:
            # Use the running mean and running variance for normalization during testing
            mean = self.running_mean
            variance = self.running_variance

        inv_std = 1.0 / np.sqrt(variance + self.eps)
        x_normalized = (x - mean) * inv_std

        # Cache what backward() needs, including which statistics were used,
        # so the gradient always matches the forward pass that produced it.
        self._x_normalized = x_normalized
        self._inv_std = inv_std
        self._used_batch_stats = bool(self.training)

        # Apply the learned transformation
        return self.gamma * x_normalized + self.beta

    def backward(self, dout):
        """
        Backward pass of the BatchNormalization2D layer.

        The gradients of ``gamma`` and ``beta`` are stored in ``self.dgamma``
        and ``self.dbeta``.

        Args:
            dout (numpy.ndarray): Gradient of the loss with respect to the output of this layer.

        Returns:
            numpy.ndarray: Gradient of the loss with respect to the input of this layer.

        Raises:
            RuntimeError: If ``forward`` has not been called yet.
            ValueError: If ``dout`` does not have the shape of the last input.
        """
        if self._x_normalized is None:
            raise RuntimeError("forward() must be called before backward()")

        dout = np.asarray(dout)
        x_normalized = self._x_normalized
        if dout.shape != x_normalized.shape:
            raise ValueError(
                "dout has shape %s but the last input had shape %s"
                % (dout.shape, x_normalized.shape)
            )

        # Calculate the gradients of gamma and beta
        self.dgamma = np.sum(dout * x_normalized, axis=self._REDUCE_AXES, keepdims=True)
        self.dbeta = np.sum(dout, axis=self._REDUCE_AXES, keepdims=True)

        # Gradient with respect to the normalized input
        dx_normalized = dout * self.gamma

        if not self._used_batch_stats:
            # The running statistics are constants with respect to the input,
            # so the layer is a per-channel affine map.
            return dx_normalized * self._inv_std

        # The batch mean and variance both depend on every input element, which
        # contributes the two subtracted terms below.
        mean_dx = np.mean(dx_normalized, axis=self._REDUCE_AXES, keepdims=True)
        mean_dx_x = np.mean(
            dx_normalized * x_normalized, axis=self._REDUCE_AXES, keepdims=True
        )
        return self._inv_std * (dx_normalized - mean_dx - x_normalized * mean_dx_x)

            batch_mean = np.mean(x, axis=(0, 2, 3), keepdims=True)
            batch_variance = np.var(x, axis=(0, 2, 3), keepdims=True)

Assume the input has shape (batch_size, num_features, height, width).

For example, if input.shape is (256, 10, 64, 64), then batch_mean and
batch_variance (reduced over axes (0, 2, 3) with keepdims=True) each have shape (1, 10, 1, 1): one value per channel.