In [1]:
import torch
import torch.nn as nn

In [3]:
# Small MLP: 8 hidden units with ReLU, then a single output unit.
# LazyLinear layers leave the input width unspecified at construction time.
net = nn.Sequential(
    nn.LazyLinear(8),
    nn.ReLU(),
    nn.LazyLinear(1),
)

# Random mini-batch of 2 examples with 4 features each.
X = torch.rand(2, 4)
# One forward pass; the displayed value is the output shape.
net(X).shape

torch.Size([2, 1])

In [6]:
# Inspecting the parameters of the second fully connected layer.
# Index 2 of the Sequential is that layer (index 1 is the ReLU in between).
# state_dict() returns an ordered mapping from parameter name to tensor;
# for this layer the entries are 'weight' and 'bias'.
net[2].state_dict()

OrderedDict([('weight',
              tensor([[-0.1193,  0.1529,  0.2546, -0.3269,  0.2282,  0.1378, -0.3407, -0.0472]])),
             ('bias', tensor([-0.1684]))])

In [7]:
# Extracting the bias from the second fully connected layer (net[2]).
# `.bias` is an nn.Parameter object; `.data` gives the tensor of values it holds.
# The cell displays a (type, tensor) tuple.
type(net[2].bias), net[2].bias.data

(torch.nn.parameter.Parameter, tensor([-0.1684]))

In [8]:
# Accessing the gradient of the second fully connected layer's weight.
# Backprop has not been invoked yet, so `.grad` is still unset and this
# expression evaluates to True.
# Use `is None` rather than `== None`: identity is the idiomatic None check
# and always yields a plain bool, whereas `==` is dispatched to the tensor's
# own comparison once `.grad` has been populated.
net[2].weight.grad is None

True

In [9]:
# Walk over every parameter of the network in one go and show
# (qualified name, shape) pairs; names are prefixed with the layer index.
[(key, tensor.shape) for key, tensor in net.named_parameters()]

[('0.weight', torch.Size([8, 4])),
 ('0.bias', torch.Size([8])),
 ('2.weight', torch.Size([1, 8])),
 ('2.bias', torch.Size([1]))]

In [14]:
# Parameter tying: one layer object placed at two positions of the network.
shared = nn.LazyLinear(8)
net = nn.Sequential(
    nn.LazyLinear(8), nn.ReLU(),
    shared, nn.ReLU(),   # position 2
    shared, nn.ReLU(),   # position 4 -- the very same module object
    nn.LazyLinear(1),
)
# Run one forward pass so the lazy layers have concrete weights to inspect.
net(X)

# First weight row seen through position 2 vs. position 4: element-wise equal.
print(net[2].weight.data[0] == net[4].weight.data[0])

# Overwrite a bias entry via position 2 only, then compare again via position 4.
# Equality still holds, showing both positions refer to the same storage
# rather than merely holding equal values.
net[2].bias.data[0] = 100
print(net[2].bias.data[0] == net[4].bias.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor(True)
