<a href="https://colab.research.google.com/github/cluePrints/fastai-v3-notes/blob/master/fastai3_part2_02a_why_sqrt5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
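# Topics explored in this notebook:
#   - sqrt(5): the `a` value conv layers pass to kaiming_uniform_
#   - gain: the scale factor derived from `a`
#   - fan_in, fan_out, numel: how the fan is computed from the weight shape
#   - the std of uniform(-1, 1)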
import torch
import math

from torch import nn
from torch.nn import init
from torch import tensor
import torch.nn.functional as F

In [0]:
# Reference excerpt: conv layers initialise their weight with
# kaiming_uniform_(..., a=sqrt(5)). The string is only here for reading (the
# trailing `;` hides its echo); the `??` line shows the live source in IPython.
"""
    def reset_parameters(self):
        n = self.in_channels
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
""";
torch.nn.modules.conv._ConvNd.reset_parameters??

In [0]:
# Reference excerpt of init.kaiming_uniform_. In short:
#   std   = gain / sqrt(fan)      (fan defaults to fan_in)
#   bound = sqrt(3) * std
#   weights ~ U(-bound, bound)
"""
a: the negative slope of the rectifier used after this layer (0 for ReLU by default)

mode: either 'fan_in' (default) or 'fan_out'. Choosing `fan_in`
    preserves the magnitude of the variance of the weights in the
    forward pass. Choosing `fan_out` preserves the magnitudes in the
    backwards pass.

def kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):
    fan = _calculate_correct_fan(tensor, mode)
    gain = calculate_gain(nonlinearity, a)
    std = gain / math.sqrt(fan)
    bound = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation
    with torch.no_grad():
        return tensor.uniform_(-bound, bound)
""";
init.kaiming_uniform_??

In [4]:
# Empirical std of 100000 samples drawn from U(-1, 1).
torch.empty(100000).uniform_(-1, 1).std()

tensor(0.5779)

In [5]:
# Closed form for comparison: the std of U(-1, 1) is 1/sqrt(3). This is why
# kaiming_uniform_ multiplies the target std by sqrt(3) to get the bound.
1/math.sqrt(3)

0.5773502691896258

In [0]:
# Reference excerpt of the fan computation, kept as a string for reading.
# It appears abridged: `dimensions` is used but never assigned in the excerpt.
# For a conv weight, fan_in = size(1) * receptive field size and
# fan_out = size(0) * receptive field size, where the receptive field size is
# tensor[0][0].numel().
"""
def _calculate_fan_in_and_fan_out(tensor):
    if dimensions == 2:  # Linear
        fan_in = tensor.size(1)
        fan_out = tensor.size(0)
    else:
        num_input_fmaps = tensor.size(1)
        num_output_fmaps = tensor.size(0)
        receptive_field_size = 1
        if tensor.dim() > 2:
            receptive_field_size = tensor[0][0].numel()
        fan_in = num_input_fmaps * receptive_field_size
        fan_out = num_output_fmaps * receptive_field_size

    return fan_in, fan_out
""";
# fan-in/fan-out calc
# https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py#L191

In [7]:
# Plug the a=sqrt(5) used by conv layers into the leaky-ReLU gain formula.
negative_slope = a = math.sqrt(5)

# https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py#L44
# gain = sqrt(2 / (1 + a^2)) = sqrt(2 / 6) = 1/sqrt(3)
leaky_relu_gain = math.sqrt(2.0 / (1 + negative_slope ** 2))
leaky_relu_gain # 1/sqrt(3)

0.5773502691896257

In [0]:
def stats(w):
  """Return `(mean, std)` of tensor `w`, computed with gradient tracking disabled."""
  with torch.no_grad():
    mean, std = w.mean(), w.std()
  return mean, std

In [9]:
# A freshly constructed conv layer, i.e. with PyTorch's default initialisation.
# Its weight has only 3 * 1 * 5 * 5 = 75 values, so the stats below are noisy.
conv = nn.Conv2d(in_channels=1, out_channels=3, kernel_size=5)
stats(conv.weight)

(tensor(0.0154), tensor(0.1031))

In [0]:
# Re-derive the uniform bound by hand, following the kaiming_uniform_ steps.
# NOTE(review): `fan` here is built from the number of OUTPUT feature maps
# (3 * 25 = 75), i.e. fan_out. kaiming_uniform_ defaults to mode='fan_in',
# which for this layer is 1 * 25 = 25. Confirm which one is intended: the
# choice changes the bound by a factor of sqrt(3).
num_output_fmaps = 3
receiptive_field_size = conv.weight[0][0].numel() # 25
fan = num_output_fmaps * receiptive_field_size
gain = leaky_relu_gain
std = gain / math.sqrt(fan)
bound = math.sqrt(3.0) * std

In [11]:
# With gain = 1/sqrt(3) the sqrt(3) factor cancels, so bound = 1/sqrt(fan).
bound

0.11547005383792512

In [12]:
# std looks 2x smaller compared to conv2d
# Likely explanation: the hand-computed bound uses fan_out (75), while the layer
# itself is initialised with the default fan_in (25). That alone predicts a
# sqrt(3) ~ 1.73x gap; the rest is sampling noise from only 75 weights.
stats(torch.empty(100000).uniform_(-bound, bound))

(tensor(-0.0002), tensor(0.0667))

# Does it work well?

In [13]:
from fastai import datasets
import gzip, pickle
# Base URL of the pickled MNIST dataset; get_data passes ext='.gz' when downloading.
MNIST_URL='http://deeplearning.net/data/mnist/mnist.pkl'
def get_data():
    """Download the pickled MNIST archive and return its train/valid splits as tensors.

    Returns a lazy `map` that yields, in order: x_train, y_train, x_valid,
    y_valid. The third entry stored in the pickle is discarded.
    """
    archive = datasets.download_data(MNIST_URL, ext='.gz')
    # NOTE: pickle.load can execute arbitrary code; only use it on a trusted download.
    with gzip.open(archive, 'rb') as fh:
        train, valid, _ = pickle.load(fh, encoding='latin-1')
    (x_train, y_train), (x_valid, y_valid) = train, valid
    return map(tensor, (x_train, y_train, x_valid, y_valid))

def normalize(x, m, s):
    """Standardise `x`: subtract the mean `m`, then divide by the std `s`."""
    centered = x - m
    return centered / s

x_train,y_train,x_valid,y_valid = get_data()
# Normalise both splits with the TRAINING set's mean and std.
train_mean,train_std = x_train.mean(),x_train.std()
x_train = normalize(x_train, train_mean, train_std)
x_valid = normalize(x_valid, train_mean, train_std)

# Reshape to (N, 1, 28, 28) so the images can be fed to Conv2d.
x_train = x_train.view(-1,1,28,28)
x_valid = x_valid.view(-1,1,28,28)
x_train.shape,x_valid.shape

(torch.Size([50000, 1, 28, 28]), torch.Size([10000, 1, 28, 28]))

In [14]:
# Default-initialised conv: weight stats next to the stats of its output on the
# first 1000 (normalised) training images.
stats(conv.weight), stats(conv(x_train[:1000]))

((tensor(0.0154), tensor(0.1031)), (tensor(0.1099), tensor(0.7716)))

In [15]:
# Same conv output after a leaky ReLU with slope 0.01.
stats(F.leaky_relu(conv(x_train[:1000]), negative_slope=0.01))

(tensor(0.3366), tensor(0.5928))

In [16]:
# Re-initialise the SAME conv in place, this time with `a` matching the slope
# of the activation actually used (0.01) instead of sqrt(5). Later cells see
# this mutated `conv`.
default_negative_slope = 0.01
init.kaiming_uniform_(conv.weight, a=default_negative_slope);
stats(conv.weight)

(tensor(-0.0194), tensor(0.3159))

In [17]:
# Activation stats after the a=0.01 re-initialisation above.
stats(F.leaky_relu(conv(x_train[:1000]), negative_slope=default_negative_slope))

(tensor(0.6672), tensor(1.0841))

In [0]:
def gain(neg_slope):
    """Kaiming gain for a leaky ReLU with negative slope `neg_slope`: sqrt(2 / (1 + neg_slope**2))."""
    return math.sqrt(2.0 / (1 + neg_slope**2))

def kaiming_manual_(x, neg_slope, use_fan_out=False):
    """Hand-rolled Kaiming-uniform initialisation, applied to `x` in place.

    Args:
        x: weight tensor with at least 2 dims; dim 0 is treated as the number
            of filters (outputs) and dim 1 as the number of inputs.
        neg_slope: negative slope of the leaky ReLU assumed to follow the layer.
        use_fan_out: scale by fan_out (filters * receptive field) instead of
            the default fan_in (inputs * receptive field).

    Returns:
        `x` itself, filled from U(-bound, bound), following the convention of
        the `torch.nn.init` in-place initialisers.

    Raises:
        ValueError: if `x` has fewer than 2 dimensions.
    """
    if x.dim() < 2:
        raise ValueError("fan_in and fan_out cannot be computed for a tensor with fewer than 2 dimensions")
    n_filters, n_inputs = x.shape[:2]
    # Number of weights connecting one input map to one output map (1 for a Linear weight).
    receptive_field = x[0, 0].numel()
    fan = (n_filters if use_fan_out else n_inputs) * receptive_field
    std = gain(neg_slope) / math.sqrt(fan)
    # U(-b, b) has std b / sqrt(3), hence the sqrt(3) factor.
    bound = math.sqrt(3.) * std
    with torch.no_grad():
        return x.uniform_(-bound, bound)

In [19]:
# Init assuming a plain ReLU (neg_slope=0). Note the activation applied below is
# still F.leaky_relu with its default slope, not a pure ReLU.
kaiming_manual_(conv.weight, neg_slope=0)
stats(F.leaky_relu(conv(x_train[:1000])))

(tensor(0.5828), tensor(0.7868))

In [20]:
# Init with neg_slope=0.01, then apply F.leaky_relu with its default slope.
kaiming_manual_(conv.weight, neg_slope=0.01)
stats(F.leaky_relu(conv(x_train[:1000])))

(tensor(0.7288), tensor(1.3803))

In [21]:
# Init with the sqrt(5) that PyTorch's conv layers use; the activation is
# unchanged (F.leaky_relu with its default slope), so only the weight scale differs.
kaiming_manual_(conv.weight, neg_slope=math.sqrt(5))
stats(F.leaky_relu(conv(x_train[:1000])))

(tensor(0.2648), tensor(0.4246))

In [0]:
class Flatten(nn.Module):
    """Collapse every dimension after the first into a single one."""

    def forward(self, input):
        batch_size = input.size(0)
        return input.view(batch_size, -1)

# Small conv net for 1x28x28 inputs: four stride-2 convs (ReLU after the first
# three), then global average pooling and a flatten, giving one value per image.
model = nn.Sequential(
    nn.Conv2d(1,8, 5,stride=2,padding=2), nn.ReLU(),
    nn.Conv2d(8,16,3,stride=2,padding=1), nn.ReLU(),
    nn.Conv2d(16,32,3,stride=2,padding=1), nn.ReLU(),
    nn.Conv2d(32,1,3,stride=2,padding=1),
    nn.AdaptiveAvgPool2d(1),
    Flatten(),
)
# Forward the whole training set through the freshly built, default-initialised model.
out = model(x_train)

In [23]:
# Output stats with PyTorch's default initialisation.
stats(out)

(tensor(-0.0447), tensor(0.0094))

In [0]:
# Re-initialise every conv weight in place with the manual Kaiming init for a
# plain ReLU (neg_slope=0). Biases are left as they were.
for layer in model.children():
  if isinstance(layer, nn.Conv2d):
    kaiming_manual_(layer.weight, 0)

In [25]:
# Output stats after the Kaiming re-initialisation of the conv weights.
stats(model(x_train))

(tensor(-0.2738), tensor(0.2852))

In [0]:
# Zero every conv bias in place (inside no_grad, like the weight init) to see
# how much the biases contribute to the output statistics.
for layer in model.children():
  if isinstance(layer, nn.Conv2d):
    with torch.no_grad():
      layer.bias.zero_()

In [27]:
# Output stats with Kaiming-initialised weights and zeroed conv biases.
stats(model(x_train))

(tensor(-0.2608), tensor(0.2938))