# Parameter Management

In [1]:
import torch
from torch import nn

In [4]:
# Small MLP built from lazy layers: each LazyLinear infers its input width
# from the first batch it sees, so only output widths are given here.
net = nn.Sequential(
    nn.LazyLinear(8),
    nn.ReLU(),
    nn.LazyLinear(1),
)
X = torch.rand(2, 4)
net(X).shape



torch.Size([2, 1])

## Parameter Access

In [3]:
# Inspect the parameters (weight and bias) held by the layer at index 2
net[2].state_dict()

OrderedDict([('weight',
              tensor([[-0.0171, -0.1557, -0.0524, -0.1929, -0.1379,  0.0217,  0.3426,  0.3249]])),
             ('bias', tensor([0.3165]))])

In [6]:
# The bias is an nn.Parameter; `.data` exposes its underlying tensor values
net[2].bias.data, type(net[2].bias)

(tensor([-0.1190]), torch.nn.parameter.Parameter)

In [8]:
# No backward pass has been run in the cells above, so no gradient is stored.
# Use an identity test: `is None` is the correct way to compare with None.
net[2].weight.grad is None

True

In [9]:
# List every parameter of the network together with its shape
[(key, tensor.shape) for key, tensor in net.named_parameters()]

[('0.weight', torch.Size([8, 4])),
 ('0.bias', torch.Size([8])),
 ('2.weight', torch.Size([1, 8])),
 ('2.bias', torch.Size([1]))]

In [11]:
# Tied Parameters
# Keep a named reference to the shared layer so that the very same module
# instance (and therefore the same parameters) can be placed at several
# positions in the network.

shared = nn.LazyLinear(8)
net = nn.Sequential(
    nn.LazyLinear(8),
    nn.ReLU(),
    shared,
    nn.ReLU(),
    shared,
    nn.ReLU(),
    nn.LazyLinear(1),
)

# A forward pass is needed before the lazy layers have concrete weights
net(X)
print(torch.eq(net[2].weight.data[0], net[4].weight.data[0]))

tensor([True, True, True, True, True, True, True, True])


In [12]:
# Positions 2 and 4 hold the same module, so a write made through one
# position is also visible through the other.
net[2].weight.data[0, 0] = 100
print(torch.eq(net[2].weight.data[0], net[4].weight.data[0]))

tensor([True, True, True, True, True, True, True, True])


# Parameter Initialization

In [13]:
# Fresh two-layer MLP for the initialization examples; the forward pass
# below gives the lazy layers concrete weights before they are initialized.
net = nn.Sequential(
    nn.LazyLinear(8),
    nn.ReLU(),
    nn.LazyLinear(1),
)
X = torch.rand(2, 4)
net(X).shape

torch.Size([2, 1])

In [18]:
# Built-in Initialization
def init_normal(module):
    """Initialize an ``nn.Linear`` layer in place.

    Weights are drawn from a normal distribution with mean 0 and standard
    deviation 0.01; the bias is set to zero. Modules whose type is not
    exactly ``nn.Linear`` are left untouched, which makes the function
    suitable for passing to ``Module.apply``.
    """
    if type(module) is not nn.Linear:
        return
    nn.init.normal_(module.weight, mean=0, std=0.01)
    nn.init.zeros_(module.bias)

# `apply` calls init_normal on every submodule of net (and on net itself)
net.apply(init_normal)
# Show the first weight row and the first bias entry of the first layer
net[0].weight.data[0], net[0].bias.data[0]

(tensor([ 0.0057, -0.0056, -0.0028, -0.0065]), tensor(0.))

In [19]:
# Constant initialization
def init_constant(module):
    """Initialize an ``nn.Linear`` layer in place with constant values.

    Every weight is set to 1 and the bias is set to zero. Modules whose
    type is not exactly ``nn.Linear`` are left untouched.
    """
    if type(module) is not nn.Linear:
        return
    nn.init.constant_(module.weight, 1)
    nn.init.zeros_(module.bias)

# Re-initialize the whole network with the constant initializer
net.apply(init_constant)
# Show the first weight row and the first bias entry of the first layer
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [21]:
# We can also apply initialization to only a certain block
def init_xavier(module):
    """Apply Xavier-uniform initialization to an ``nn.Linear`` layer's weight.

    The bias is not modified. Modules whose type is not exactly
    ``nn.Linear`` are left untouched.
    """
    if type(module) is not nn.Linear:
        return
    nn.init.xavier_uniform_(module.weight)

def init_42(module):
    """Set every weight of an ``nn.Linear`` layer to the constant 42.

    The bias is not modified. Modules whose type is not exactly
    ``nn.Linear`` are left untouched.
    """
    if type(module) is not nn.Linear:
        return
    nn.init.constant_(module.weight, 42)

# Use a different initializer per layer: Xavier-uniform weights for the
# first layer, constant-42 weights for the last layer.
net[0].apply(init_xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([-0.2403,  0.0247, -0.2942,  0.6212])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])
