<a href="https://colab.research.google.com/github/davoodwadi/davoodwadi.github.io/blob/main/Batch_normalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [190]:
import torch, numpy as np, matplotlib.pyplot as plt, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets
cifar_train = datasets.CIFAR10(root='./', train=True, download=True)
cifar_test = datasets.CIFAR10(root='./', train=False, download=True)
x_train = torch.tensor(cifar_train.data)
y_train = torch.tensor(cifar_train.targets)
x_test = torch.tensor(cifar_test.data)
y_test = torch.tensor(cifar_test.targets)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

Files already downloaded and verified
Files already downloaded and verified


(torch.Size([50000, 32, 32, 3]),
 torch.Size([10000, 32, 32, 3]),
 torch.Size([50000]),
 torch.Size([10000]))

In [191]:
cifar_train.data.min(), cifar_train.data.max(), cifar_test.data.min(), cifar_test.data.max()

(0, 255, 0, 255)

In [192]:
x_train.min(), x_train.max(), x_test.min(), x_test.max()

(tensor(0, dtype=torch.uint8),
 tensor(255, dtype=torch.uint8),
 tensor(0, dtype=torch.uint8),
 tensor(255, dtype=torch.uint8))

### Make the data `channel_first`

In [193]:
def channel_first(x):
  return x.permute(0, 3, 1, 2)

In [194]:
x_train = channel_first(x_train).float()
x_test = channel_first(x_test).float()
x_train.shape, x_test.shape

(torch.Size([50000, 3, 32, 32]), torch.Size([10000, 3, 32, 32]))

In [195]:
x_train.min(), x_train.max(), x_test.min(), x_test.max()

(tensor(0.), tensor(255.), tensor(0.), tensor(255.))

In [203]:
w = nn.Conv2d(3, 3, 3, padding = 1)
nn.init.constant_(w.weight, 1.1) # set weight as 1.1
w.weight.shape

torch.Size([3, 3, 3, 3])

## Activation explosion

In [204]:
x = x_train[:100]
for i in range(20):
  x = w(x)
  print(f'mean: {x.mean()}, SD: {x.std()}')
  print('*'*10)

mean: 3268.13818359375, SD: 1548.700927734375
**********
mean: 94226.6875, SD: 44271.546875
**********
mean: 2729341.5, SD: 1286149.5
**********
mean: 79304224.0, SD: 37587400.0
**********
mean: 2309064960.0, SD: 1102421248.0
**********
mean: 67336839168.0, SD: 32403615744.0
**********
mean: 1966051950592.0, SD: 953852428288.0
**********
mean: 57459553075200.0, SD: 28107547344896.0
**********
mean: 1680666471170048.0, SD: 828897885159424.0
**********
mean: 4.919248093918003e+16, SD: 2.4458803663601664e+16
**********
mean: 1.4406948962382643e+18, SD: 7.220532029706732e+17
**********
mean: 4.221515961022703e+19, SD: 2.1323662427273822e+19
**********
mean: 1.2375463934049523e+21, SD: 6.299153836555675e+20
**********
mean: 3.6293554633857825e+22, SD: 1.8612584626387843e+22
**********
mean: 1.064766388176034e+24, SD: 5.5006828939274566e+23
**********
mean: 3.1247896711305433e+25, SD: 1.6259125094342758e+25
**********
mean: 9.173098018426051e+26, SD: 4.8065821414591144e+26
**********
mean: 2

## Normalize the data

In [216]:
x_train = (x_train-x_train.mean())/x_train.std()
x_test = (x_test-x_test.mean())/x_test.std()
x_train.mean(), x_train.std(), x_test.mean(), x_test.std()

(tensor(1.0976e-07), tensor(1.0000), tensor(1.9932e-07), tensor(1.0000))

### Set the correct device

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [219]:
class dataset(Dataset):
  def __init__(self, x, y):
    self.x = x
    self.y = y

  def __len__(self):
    return len(self.x)

  def __getitem__(self, index):
    return self.x[index], self.y[index]

In [220]:
bs=64

trainset = dataset(x_train, y_train)
testset = dataset(x_test, y_test)
train_loader = DataLoader(trainset, batch_size=bs, shuffle=True)
valid_loader = DataLoader(testset, batch_size=bs)

In [253]:
ni = 3
nh = 8
layer_filters = [ni, nh, 2*nh, 4*nh]
layer_num = len(layer_filters)-1
kernel_size = 3
padding = 3//2
stride = 2
modules = nn.ModuleDict()
for layer_number in range(1, layer_num+1):
  modules.add_module(f'layer_{layer_number}', nn.Conv2d(layer_filters[layer_number-1], layer_filters[layer_number], kernel_size, stride, padding))
modules

ModuleDict(
  (layer_1): Conv2d(3, 8, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (layer_2): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (layer_3): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)

## let's put this in pytorch `module` class to use `forward` and `backward`

In [257]:
h, w = x_train.shape[2:]
h,w

(32, 32)

In [263]:
32/(2**layer_num)

4.0

In [299]:
class ConvNet(nn.Module):
  def __init__(self, layer_filters, stride, kernel_size=3, h=32, w=32, num_classes=10):
    super(ConvNet, self).__init__()
    self.layer_filters = layer_filters
    self.num_layers = len(layer_filters) -1
    self.layers = nn.ModuleDict()
    for layer_number in range(1, self.num_layers+1):
      self.layers.add_module(f'layer_{layer_number}', nn.Conv2d(layer_filters[layer_number-1], layer_filters[layer_number], kernel_size, stride, kernel_size//2))
    self.flatten = nn.Flatten() # [bs, layer_filters[num_layers] * h/(2**num_layers),w/(2**num_layers)]
    self.classification = nn.Linear(int(layer_filters[self.num_layers] * h/(2**self.num_layers) * w/(2**self.num_layers)), num_classes)
  def forward(self, x):
    for layer in self.layers.keys():
      x = self.layers[layer](x)
    x = self.flatten(x)
    x = self.classification(x)
    return x


In [268]:
bx, by = next(iter(train_loader))
bx.shape, by.shape, layer_filters

(torch.Size([64, 3, 32, 32]), torch.Size([64]), [3, 8, 16, 32])

In [272]:
stride = 2
kernel_size = 3

model = ConvNet(layer_filters, stride, kernel_size)
model

ConvNet(
  (layers): ModuleDict(
    (layer_1): Conv2d(3, 8, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (layer_2): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (layer_3): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (classification): Linear(in_features=512, out_features=10, bias=True)
)

### Move model and data to GPU

In [283]:
def gpu(bx, by, model):
  bx, by = bx.to(device), by.to(device)
  model = model.to(device)
  return bx, by, model

In [284]:
bx, by, model = gpu(bx, by, model)
output = model(bx)
output.shape

torch.Size([64, 10])

In [285]:
x = bx
for layer_name in model.layers.keys():
  print(layer_name)
  x = model.layers[layer_name](x)
  print(f'x mean: {x.mean()}, sd: {x.std()}')

  print('*'*10)

layer_1
x mean: 0.03805947303771973, sd: 0.4746130704879761
**********
layer_2
x mean: 0.014094870537519455, sd: 0.3058101534843445
**********
layer_3
x mean: -0.022542566061019897, sd: 0.17484818398952484
**********


### Train for 3 epochs

In [286]:
def train_eval(model, optimizer, train_loader, valid_loader, epoch=3):
  loss_fn = nn.CrossEntropyLoss()
  for e in range(epoch):
    correct = 0
    total = 0
    losses = []

    # training
    model.train()
    for bx, by in train_loader:
      bx, by, model = gpu(bx, by, model)
      output = model(bx)
      loss = loss_fn(output, by)
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()

    # validation
    model.eval()
    for bx, by in valid_loader:
      bx, by, model = gpu(bx, by, model)
      with torch.no_grad():
        output = model(bx)
      predictions = output.argmax(-1)

      correct += (predictions==by).float().sum()
      total += len(by)
      losses.append(loss.detach().item())

    accuracy = correct/total
    print(f'epoch {e}; loss {np.mean(losses):.4f}; accuracy {accuracy:.2f}')

In [300]:
stride = 2
kernel_size = 3
lr = 0.001

model = ConvNet(layer_filters, stride, kernel_size)
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
train_eval(model, optimizer, train_loader, valid_loader, epoch=10)

epoch 0; loss 2.2556; accuracy 0.22
epoch 1; loss 2.1157; accuracy 0.27
epoch 2; loss 1.9773; accuracy 0.30
epoch 3; loss 1.9661; accuracy 0.32
epoch 4; loss 2.1413; accuracy 0.34
epoch 5; loss 1.8773; accuracy 0.34
epoch 6; loss 1.9587; accuracy 0.35
epoch 7; loss 1.4923; accuracy 0.35
epoch 8; loss 1.6887; accuracy 0.36
epoch 9; loss 1.7652; accuracy 0.36


## Adding ReLU

In [312]:
class ConvNet(nn.Module):
  def __init__(self, layer_filters, stride, kernel_size=3, h=32, w=32, num_classes=10):
    super(ConvNet, self).__init__()
    self.layer_filters = layer_filters
    self.num_layers = len(layer_filters) -1
    self.layers = nn.ModuleDict()
    for layer_number in range(1, self.num_layers+1):
      self.layers.add_module(f'layer_{layer_number}', nn.Conv2d(layer_filters[layer_number-1], layer_filters[layer_number], kernel_size, stride, kernel_size//2))
    self.flatten = nn.Flatten() # [bs, layer_filters[num_layers] * h/(2**num_layers),w/(2**num_layers)]
    self.classification = nn.Linear(int(layer_filters[self.num_layers] * h/(2**self.num_layers) * w/(2**self.num_layers)), num_classes)

  def forward(self, x):
    for layer in self.layers.keys():
      x = self.layers[layer](x)
      x = nn.ReLU()(x)

    x = self.flatten(x)
    x = self.classification(x)
    return x



In [314]:
stride = 2
kernel_size = 3
layer_filters = [ni, nh, 2*nh, 4*nh]

model = ConvNet(layer_filters, stride, kernel_size)
model

ConvNet(
  (layers): ModuleDict(
    (layer_1): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (layer_2): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (layer_3): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (classification): Linear(in_features=2048, out_features=10, bias=True)
)

In [315]:
bx, by, model = gpu(bx, by, model)
x = bx
for layer_name in model.layers.keys():
  print(layer_name)
  x = model.layers[layer_name](x)
  x = nn.ReLU()(x)
  print(f'x mean: {x.mean()}, sd: {x.std()}')

  print('*'*10)

layer_1
x mean: 0.1957414448261261, sd: 0.3261364996433258
**********
layer_2
x mean: 0.07460178434848785, sd: 0.12499479949474335
**********
layer_3
x mean: 0.02778497524559498, sd: 0.046145956963300705
**********


**ReLU removes half of SD**

In [316]:
stride = 2
kernel_size = 3
lr = 0.001

model = ConvNet(layer_filters, stride, kernel_size)
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
train_eval(model, optimizer, train_loader, valid_loader, epoch=10)

epoch 0; loss 2.2766; accuracy 0.17
epoch 1; loss 2.3013; accuracy 0.20
epoch 2; loss 2.1594; accuracy 0.25
epoch 3; loss 1.9361; accuracy 0.27
epoch 4; loss 2.2897; accuracy 0.29
epoch 5; loss 1.9942; accuracy 0.32
epoch 6; loss 1.9650; accuracy 0.33
epoch 7; loss 2.1201; accuracy 0.34
epoch 8; loss 1.6206; accuracy 0.35
epoch 9; loss 1.7111; accuracy 0.36


# Making the model deep

In [328]:
layer_filters[num_layers], h/(2**num_layers) * w/(2**num_layers)

(32, 0.0625)

In [325]:
layer_filters[num_layers] * h/(2**num_layers) * w/(2**num_layers)

2.0

In [329]:
num_layers = len(layer_filters)-1
num_layers, (layer_filters[num_layers] * int(h/(2**num_layers) * w/(2**num_layers)))

(7, 0)

In [330]:
ni = 3
nh = 32
depth = 5
layer_filters = [ni] + [nh]*(depth)
stride = 2
kernel_size = 3

model = ConvNet(layer_filters, stride, kernel_size)
model

ConvNet(
  (layers): ModuleDict(
    (layer_1): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (layer_2): Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (layer_3): Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (layer_4): Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (layer_5): Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (classification): Linear(in_features=32, out_features=10, bias=True)
)

In [331]:
layer_filters

[3, 32, 32, 32, 32, 32]

In [332]:
bx, by, model = gpu(bx, by, model)
x = bx
for layer_name in model.layers.keys():
  print(layer_name)
  x = model.layers[layer_name](x)
  x = nn.ReLU()(x)
  print(x.shape)
  print(f'x mean: {x.mean()}, sd: {x.std()}')
  print('*'*10)

layer_1
torch.Size([64, 32, 16, 16])
x mean: 0.23010757565498352, sd: 0.39709943532943726
**********
layer_2
torch.Size([64, 32, 8, 8])
x mean: 0.06489626318216324, sd: 0.12451531738042831
**********
layer_3
torch.Size([64, 32, 4, 4])
x mean: 0.0378086119890213, sd: 0.0578189380466938
**********
layer_4
torch.Size([64, 32, 2, 2])
x mean: 0.009765984490513802, sd: 0.02073776163160801
**********
layer_5
torch.Size([64, 32, 1, 1])
x mean: 0.018953658640384674, sd: 0.022025499492883682
**********


In [333]:
stride = 2
kernel_size = 3
lr = 0.001

model = ConvNet(layer_filters, stride, kernel_size)
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
train_eval(model, optimizer, train_loader, valid_loader, epoch=10)

epoch 0; loss 2.3023; accuracy 0.09
epoch 1; loss 2.2850; accuracy 0.09
epoch 2; loss 2.2618; accuracy 0.09
epoch 3; loss 2.2985; accuracy 0.10
epoch 4; loss 2.3221; accuracy 0.11
epoch 5; loss 2.3224; accuracy 0.11
epoch 6; loss 2.3211; accuracy 0.11
epoch 7; loss 2.2960; accuracy 0.11
epoch 8; loss 2.3055; accuracy 0.11
epoch 9; loss 2.3051; accuracy 0.11


## Solution? Batch Normalization

In [339]:
class ConvNet(nn.Module):
  def __init__(self, layer_filters, stride, kernel_size=3, h=32, w=32, num_classes=10):
    super(ConvNet, self).__init__()
    self.layer_filters = layer_filters
    self.num_layers = len(layer_filters) - 1
    self.layers = nn.ModuleDict()
    for layer in range(1, self.num_layers+1):
      self.layers.add_module(f'layer_{layer}', nn.Conv2d(layer_filters[layer-1], layer_filters[layer], kernel_size, stride, kernel_size//2))
      self.layers.add_module(f'layer_{layer}_bn', nn.BatchNorm2d(layer_filters[layer]))

    self.flatten = nn.Flatten() # [bs, layer_filters[num_layers] * h/(2**num_layers),w/(2**num_layers)]
    self.classification = nn.Linear(int(layer_filters[self.num_layers] * h/(2**self.num_layers) * w/(2**self.num_layers)), num_classes)

  def forward(self, x):
    for layer_name in self.layers.keys():
      x = self.layers[layer_name](x)
      if 'bn' not in layer_name:
        x = nn.ReLU()(x)

    x = self.flatten(x)
    x = self.classification(x)
    return x


In [340]:
depth = 5
ni = 3
nh = 32
layer_filters = [ni] + [nh]*(depth)
stride = 1
kernel_size = 3

model = ConvNet(layer_filters, stride, kernel_size)

In [341]:
model.num_layers

5

In [342]:
bx, by, model = gpu(bx, by, model)
x = bx
for layer_name in model.layers.keys():
  print(layer_name)
  x = model.layers[layer_name](x)
  if 'bn' not in layer_name:
    x = nn.ReLU()(x)

  print(f'x mean: {x.mean()}, sd: {x.std()}')
  print('*'*10)

layer_1
x mean: 0.2258470058441162, sd: 0.37762323021888733
**********
layer_1_bn
x mean: -6.28642737865448e-09, sd: 0.9998002052307129
**********
layer_2
x mean: 0.20866098999977112, sd: 0.3313664197921753
**********
layer_2_bn
x mean: -2.561137080192566e-09, sd: 0.9999391436576843
**********
layer_3
x mean: 0.2187885195016861, sd: 0.3358442783355713
**********
layer_3_bn
x mean: 6.05359673500061e-09, sd: 0.9999446272850037
**********
layer_4
x mean: 0.2130168080329895, sd: 0.3269975781440735
**********
layer_4_bn
x mean: 5.587935447692871e-09, sd: 0.9999483823776245
**********
layer_5
x mean: 0.2182147055864334, sd: 0.33259060978889465
**********
layer_5_bn
x mean: 6.51925802230835e-09, sd: 0.9999526739120483
**********


In [174]:
model.layers.layer_1_bn.weight.shape

torch.Size([32])

In [343]:
stride = 2
kernel_size = 3
lr = 0.001

model = ConvNet(layer_filters, stride, kernel_size)
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
train_eval(model, optimizer, train_loader, valid_loader, epoch=10)

epoch 0; loss 1.7727; accuracy 0.33
epoch 1; loss 1.9622; accuracy 0.38
epoch 2; loss 1.5088; accuracy 0.41
epoch 3; loss 1.5596; accuracy 0.44
epoch 4; loss 1.4573; accuracy 0.45
epoch 5; loss 1.3441; accuracy 0.47
epoch 6; loss 1.9838; accuracy 0.48
epoch 7; loss 1.7721; accuracy 0.49
epoch 8; loss 1.2655; accuracy 0.50
epoch 9; loss 1.3226; accuracy 0.51
