# Batch norm
1. Derivation
2. Implementation
3. Experiments

In [2]:
pwd

'/root/fastai/Playground/PytorchStarter/Notebooks'

In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

## Derivation

If a matrix $M$ of size $m \times d$ goes through a columnwise operation with a vector of length $d$ in the forward pass, the error term for $M_{ij}$ is $m \times $ the error term passed by a single element in the vector 

## Implementation

1. $Z \rightarrow$ calculate $\mu, \sigma \rightarrow Z_{norm}$
   $$ Z_{norm} = \frac{Z - \mu}{\sigma} $$
   where $\mu, \sigma$ are row vectors of length $d$
2. $Z_{norm} \rightarrow \tilde{Z}$ 
   $$\tilde{Z} = \gamma \cdot Z_{norm} + \beta$$
   where $\gamma, \beta$ are row vectors of length $d$

In [4]:
class NormalizingLayer():
    def __init__(self, tol=1e-6, eps=1e-3):
        self.mu = None
        self.std = None
        self.centered = None
        self.tol = tol
    
    def forward(self,z):
        """z: 2 dimentional numpy array of size (mxd)"""
        if isinstance(z, torch.Tensor):
            z = z.numpy()
        m,d = z.shape
        self.mu = np.mean(z,dim=0)
        self.centered = z - self.mu
        
        self.var = np.var(z, dim=0) + eps
#         var[abs(var)<tol] = eps
        self.std = self.var**.5
        self.inv = 1/self.std
        
        znorm = (z-self.mu) * self.inv
        return znorm
    
    def backward(self):
        """Computes dznorm/dz which has the same size as z"""
        flow1 = self.centered*self.inv 
        flow2 = -self.centered*(-self.std**-2)*(0.5*self.var**(-0.5))*2*self.centered
        return flow1 + flow2

In [5]:
class ShiftScaleLayer():
    def __init__(self, gamma, beta):
        """gamma: scale factor for each feature space. length d
            beta: shift factor for each feature space. length d
            Storage and computations are done in Numpy."""
        if isinstance(gamma, torch.Tensor): gamma = gamma.numpy()
        if isinstance(beta, torch.Tensor): beta = beta.numpy()
  
        self.gamma = gamma
        self.beta = beta
        self.out = None
    def forward(self,z):
        """assumes z is a numpy array"""
        self.out = self.gamma * z + self.beta
        return self.out
    
    def backward(self):
        """Returns [dout/dz, dout/dgamma, dout/dbeta]"""
        dz = np.tile(self.gamma, (z.shape[0],1)) #dout/dz
        dgamma = np.sum(z,dim=0)
        dbeta = m * np.ones_like(beta)
        return [dz, dgamma, dbeta]

## Experiments 

In [6]:
class BCModel(nn.Module):
    
    def __init__(self, inDim, H1, activation):
        """ 
        Single layer binary classification network
        Args: 
        ---
        inDim: input dimension
        H1: number of units in the first layer
        activation: (torch.nn.modules.activation) activation function instance
            eg.nn.Sigmoid() or nn.Relu()
        """
        super(BCModel, self).__init__()
        self.l1 = nn.Linear(inDim, H1)
        self.l2 = nn.Linear(H1, 1)
        
        # Hidden layers' activation
#         self.relu = nn.ReLU() #elementwise relu
        self.activation = activation
    
        # todo: batch norm layer
        self.bn1 = nn.BatchNorm1d(H1)
        
        # Last layer activation
        self.sigmoid = nn.Sigmoid() #elementwise sigmoid activation
        
    
    def forward(self, x):
        """input and output are both tensors"""
        
        out1 = self.activation(self.bn1(self.l1(x))) #todo: add batch norm layer
        out2 = self.activation(self.l2(out1))
        y_pred = self.sigmoid(self.l3(out2))
        return y_pred
    
    def print_params(self):
        for param in self.parameters():
            print(param)

In [67]:
x = np.array(range(12), dtype=np.float32).reshape((3,-1))
x = torch.from_numpy(x)
print(x)
N,D = x.shape
bn = nn.BatchNorm1d(D)


tensor([[  0.,   1.,   2.,   3.],
        [  4.,   5.,   6.,   7.],
        [  8.,   9.,  10.,  11.]])


In [74]:
mu = x.mean(0)
std = (x.var(0) + bn.eps)**0.5
print(mu)
print(std)

tensor([ 4.,  5.,  6.,  7.])
tensor([ 4.0000,  4.0000,  4.0000,  4.0000])


In [59]:
x-mu

tensor([[-4., -4., -4., -4.],
        [ 0.,  0.,  0.,  0.],
        [ 4.,  4.,  4.,  4.]])

In [75]:
xnorm1 = (x-mu)/std; print(xnorm1)

tensor([[-1.0000, -1.0000, -1.0000, -1.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  1.0000,  1.0000,  1.0000]])


check if bn is working as expected

In [68]:
gamma = bn.weight
beta = bn.bias
print(gamma,beta)

Parameter containing:
tensor([ 0.2850,  0.7858,  0.2416,  0.6030]) Parameter containing:
tensor([ 0.,  0.,  0.,  0.])


In [72]:
xmanual = (x-mu)*gamma + beta; print(xmanual)

tensor([[-1.1401, -3.1434, -0.9663, -2.4119],
        [ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.1401,  3.1434,  0.9663,  2.4119]])


In [71]:
# compare manual computation with bn's forward function 
xauto = bn(x); print(xauto)

tensor([[-0.3491, -0.9625, -0.2959, -0.7385],
        [ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.3491,  0.9625,  0.2959,  0.7385]])


Multiplication test


In [38]:
a = torch.ones(2,3); a

tensor([[ 1.,  1.,  1.],
        [ 1.,  1.,  1.]])

In [40]:
a[1,:] = 2; a

tensor([[ 1.,  1.,  1.],
        [ 2.,  2.,  2.]])

In [41]:
m = 10*torch.ones(1,3); m

tensor([[ 10.,  10.,  10.]])

In [42]:
a*m

tensor([[ 10.,  10.,  10.],
        [ 20.,  20.,  20.]])

In [44]:
m.view(3)

tensor([ 10.,  10.,  10.])

In [45]:
a*m.view(3)

tensor([[ 10.,  10.,  10.],
        [ 20.,  20.,  20.]])