In [1]:
import torch
from torch import nn


# Small MLP: 4 input features -> 8 hidden units (ReLU) -> 1 output.
net = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
)

# Mini-batch of 2 samples with 4 features each, drawn from N(0, 1).
X = torch.normal(mean=0, std=1, size=(2, 4))

# Forward pass; as the cell's last expression the result is displayed.
net(X)

tensor([[-0.7411],
        [-0.5263]], grad_fn=<AddmmBackward>)

In [2]:
# Display the sampled input batch (2 rows x 4 features).
X

tensor([[-1.1073,  1.1098,  0.0501,  2.7911],
        [-0.6327,  0.7352, -0.0226,  1.8213]])

In [3]:
# Accessing parameters: net[2] is the output layer (the second nn.Linear);
# its state_dict maps 'weight' and 'bias' to the layer's tensors.

print(net[2].state_dict())

OrderedDict([('weight', tensor([[-0.3002,  0.2701,  0.2222,  0.2614,  0.0269, -0.2362,  0.0534, -0.0627]])), ('bias', tensor([0.0478]))])


In [4]:
# The bias is an nn.Parameter (it carries requires_grad=True);
# `.data` exposes the plain tensor underneath it.
print(net[2].bias)
print(type(net[2].bias))
print(net[2].bias.data)


Parameter containing:
tensor([0.0478], requires_grad=True)
<class 'torch.nn.parameter.Parameter'>
tensor([0.0478])


In [7]:
# No backward pass has been run yet, so there is no gradient (prints None).
print(net[2].weight.grad)

None


In [8]:
# Forward pass; keep the output so a backward pass can be started from it.
y = net(X)

In [10]:
# Reduce the outputs to a scalar and backpropagate from it.
y.sum().backward()

In [11]:
# After backward() the output layer's weight gradient is populated.
print(net[2].weight.grad)

tensor([[2.9629, 0.0000, 0.0000, 0.0000, 0.8209, 1.4852, 0.0000, 2.3116]])


In [13]:
# (name, shape) of every parameter in the first layer.
print(*((pname, p.shape) for pname, p in net[0].named_parameters()))

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))


In [14]:
# (name, shape) of every parameter in the whole network; names are
# prefixed with the index of the layer that owns them.
print(*((pname, p.shape) for pname, p in net.named_parameters()))

('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [15]:
# Full parameter dictionary of the network, keyed by '<layer index>.<parameter name>'.
net.state_dict()

OrderedDict([('0.weight',
              tensor([[-0.1607,  0.1800, -0.4635,  0.3459],
                      [ 0.0531, -0.1090,  0.4334, -0.1813],
                      [ 0.4819, -0.3694,  0.0994, -0.2651],
                      [ 0.3065,  0.4754,  0.1272, -0.4293],
                      [ 0.3878,  0.0545, -0.2672,  0.1165],
                      [ 0.1052,  0.4395,  0.1444,  0.0753],
                      [-0.1074, -0.0442, -0.4876, -0.3685],
                      [-0.3632, -0.2482,  0.2401,  0.4332]])),
             ('0.bias',
              tensor([ 0.3842, -0.1967, -0.1540, -0.3962,  0.4326,  0.2531,  0.2553,  0.0664])),
             ('2.weight',
              tensor([[-0.3002,  0.2701,  0.2222,  0.2614,  0.0269, -0.2362,  0.0534, -0.0627]])),
             ('2.bias', tensor([0.0478]))])

In [16]:
# A single tensor can be looked up in the state_dict by its key.
net.state_dict()['0.bias']

tensor([ 0.3842, -0.1967, -0.1540, -0.3962,  0.4326,  0.2531,  0.2553,  0.0664])

In [19]:
def block1():
    """Build one MLP block: Linear(4 -> 8), ReLU, Linear(8 -> 4), ReLU.

    Returns:
        A freshly initialised nn.Sequential holding those four layers.
    """
    layers = [
        nn.Linear(4, 8),
        nn.ReLU(),
        nn.Linear(8, 4),
        nn.ReLU(),
    ]
    return nn.Sequential(*layers)

def block2(num_blocks=4):
    """Chain ``num_blocks`` copies of ``block1`` and finish with a linear head.

    Args:
        num_blocks: How many ``block1`` sub-networks to stack. Defaults to 4,
            which reproduces the previously hard-coded architecture.

    Returns:
        An nn.Sequential whose children are registered as ``block0`` ...
        ``block{num_blocks - 1}`` followed by ``final`` (``nn.Linear(4, 1)``).
    """
    # Named `stack` rather than `net` so the notebook-level `net` is not shadowed.
    stack = nn.Sequential()

    for i in range(num_blocks):
        stack.add_module(f'block{i}', block1())

    # Plain string literal: the name has no placeholders, so no f-string is needed.
    stack.add_module('final', nn.Linear(4, 1))
    return stack

# Build the nested network and run the input batch through it.
# NOTE(review): the recorded output is identical for both rows; possibly the
# stacked ReLUs zero out the input-dependent signal - verify.
regnet = block2()

regnet(X)

tensor([[-0.0202],
        [-0.0202]], grad_fn=<AddmmBackward>)

In [29]:
# Display the module tree; children are listed under the names passed to add_module.
regnet

Sequential(
  (block0): Sequential(
    (0): Linear(in_features=4, out_features=8, bias=True)
    (1): ReLU()
    (2): Linear(in_features=8, out_features=4, bias=True)
    (3): ReLU()
  )
  (block1): Sequential(
    (0): Linear(in_features=4, out_features=8, bias=True)
    (1): ReLU()
    (2): Linear(in_features=8, out_features=4, bias=True)
    (3): ReLU()
  )
  (block2): Sequential(
    (0): Linear(in_features=4, out_features=8, bias=True)
    (1): ReLU()
    (2): Linear(in_features=8, out_features=4, bias=True)
    (3): ReLU()
  )
  (block3): Sequential(
    (0): Linear(in_features=4, out_features=8, bias=True)
    (1): ReLU()
    (2): Linear(in_features=8, out_features=4, bias=True)
    (3): ReLU()
  )
  (final): Linear(in_features=4, out_features=1, bias=True)
)

In [20]:
# Parameter keys carry the path through the nesting, e.g. 'block0.0.weight'.
regnet.state_dict()

OrderedDict([('block0.0.weight',
              tensor([[-0.0116, -0.0553,  0.1173,  0.1460],
                      [-0.2185,  0.3418, -0.2037,  0.4239],
                      [-0.0380,  0.1748, -0.3710, -0.0930],
                      [-0.1280,  0.0396, -0.0988, -0.0131],
                      [ 0.0303, -0.0368,  0.1365,  0.3926],
                      [-0.2799, -0.2030,  0.0899,  0.2668],
                      [-0.0706, -0.4057,  0.4452, -0.1973],
                      [ 0.1060, -0.2885,  0.3358, -0.1146]])),
             ('block0.0.bias',
              tensor([ 0.1351, -0.2829,  0.0054,  0.1815,  0.4974,  0.4477,  0.3667,  0.3806])),
             ('block0.2.weight',
              tensor([[ 0.2167, -0.1914, -0.1014, -0.2227, -0.3228,  0.0677, -0.1550,  0.2795],
                      [ 0.1948, -0.2711, -0.2100,  0.3151, -0.2922,  0.3119, -0.2467,  0.1851],
                      [ 0.1832,  0.3343, -0.0187, -0.0944, -0.3158, -0.0391,  0.0727,  0.2851],
                      [-0.0418, -0.

In [21]:
def init_zero(m):
    """Set the weight and bias of an ``nn.Linear`` module to zero, in place.

    Written to be passed to ``nn.Module.apply``; a module of any other type
    is left untouched.

    Args:
        m: The module to (possibly) re-initialise.
    """
    # Exact type check, as in the other initialisers in this notebook.
    if type(m) is nn.Linear:
        # zeros_ states the intent directly; sampling a normal with std=0
        # produced zeros only as a side effect of a degenerate distribution.
        nn.init.zeros_(m.weight)
        nn.init.zeros_(m.bias)

# Apply the zero initialiser defined in this cell. The previous call used
# init_normal, which is not defined until a later cell.
net.apply(init_zero)

net.state_dict()

OrderedDict([('0.weight',
              tensor([[0., 0., 0., 0.],
                      [0., 0., 0., 0.],
                      [0., 0., 0., 0.],
                      [0., 0., 0., 0.],
                      [0., 0., 0., 0.],
                      [0., 0., 0., 0.],
                      [0., 0., 0., 0.],
                      [0., 0., 0., 0.]])),
             ('0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])),
             ('2.weight', tensor([[0., 0., 0., 0., 0., 0., 0., 0.]])),
             ('2.bias', tensor([0.]))])

In [24]:
def init_normal(m):
    """Re-initialise an ``nn.Linear`` in place: weights ~ N(0, 0.01^2), bias = 0.

    Modules of any other type are left unchanged.

    Args:
        m: The module to (possibly) re-initialise.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.normal_(m.weight, mean=0, std=0.01)
    nn.init.zeros_(m.bias)
        
# nn.Module.apply runs init_normal on net and on each of its submodules.
net.apply(init_normal)

net.state_dict()

OrderedDict([('0.weight',
              tensor([[ 0.0039, -0.0059, -0.0063,  0.0004],
                      [ 0.0057, -0.0064,  0.0029, -0.0129],
                      [-0.0015, -0.0004,  0.0236,  0.0088],
                      [ 0.0043, -0.0041,  0.0058, -0.0107],
                      [-0.0198,  0.0147, -0.0068,  0.0111],
                      [-0.0123,  0.0153, -0.0094,  0.0126],
                      [-0.0090, -0.0016, -0.0030,  0.0010],
                      [ 0.0120, -0.0086, -0.0122, -0.0176]])),
             ('0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])),
             ('2.weight',
              tensor([[-0.0017,  0.0039,  0.0023,  0.0049,  0.0116, -0.0089,  0.0004, -0.0099]])),
             ('2.bias', tensor([0.]))])

In [28]:
# applying different initialisers for different blocks

def xavier(m):
    """Apply Xavier-uniform initialisation to the weight of an ``nn.Linear``.

    The bias is not modified, and modules of any other type are ignored.

    Args:
        m: The module to (possibly) re-initialise.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight)

def init_42(m):
    """Fill the weight of an ``nn.Linear`` with the constant 42, in place.

    The bias is not modified, and modules of any other type are ignored.

    Args:
        m: The module to (possibly) re-initialise.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.constant_(m.weight, 42)

# apply() also works on an individual layer, so each layer can be given
# its own initialiser: Xavier for the hidden layer, constant 42 for the output layer.
net[0].apply(xavier)
net[2].apply(init_42)

net.state_dict()

OrderedDict([('0.weight',
              tensor([[-0.5783, -0.0872,  0.1788, -0.5835],
                      [-0.4008,  0.0051,  0.3491, -0.5752],
                      [-0.5578,  0.1954, -0.6183,  0.6527],
                      [-0.5420, -0.6750, -0.3802,  0.3711],
                      [ 0.4312,  0.0735,  0.5740, -0.4506],
                      [-0.3833, -0.4753, -0.3140,  0.6563],
                      [-0.4093,  0.0664,  0.0592,  0.2929],
                      [ 0.2368, -0.6250, -0.3616,  0.2094]])),
             ('0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])),
             ('2.weight', tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])),
             ('2.bias', tensor([0.]))])

In [None]:
# custom initialisation
