In [28]:
import torch
from torch import nn


net = nn.Sequential(nn.Linear(4,8), nn.ReLU(), nn.Linear(8,1))

X = torch.normal(0,1,(2,4))

net(X)

tensor([[-0.4163],
        [-0.2449]], grad_fn=<AddmmBackward>)

In [29]:
X

tensor([[-0.3203, -1.7860,  0.3747,  2.1854],
        [-0.5479, -0.9498, -0.7674,  0.4066]])

In [30]:
# accessing parameters

print(net[2].state_dict())

OrderedDict([('weight', tensor([[-0.3405,  0.1488, -0.3470,  0.0301, -0.1840,  0.0507,  0.2979, -0.1588]])), ('bias', tensor([-0.1072]))])


In [31]:
print(net[2].bias)
print(type(net[2].bias))
print(net[2].bias.data)


Parameter containing:
tensor([-0.1072], requires_grad=True)
<class 'torch.nn.parameter.Parameter'>
tensor([-0.1072])


In [32]:
print(net[2].weight.grad)

None


In [33]:
y = net(X)

In [34]:
y.sum().backward()

In [35]:
print(net[2].weight.grad)

tensor([[0.3270, 0.0000, 0.6522, 1.2486, 0.1149, 0.2409, 0.0000, 0.8672]])


In [36]:
print(*[(name, param.shape) for name, param in net[0].named_parameters()])

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))


In [37]:
print(*[(name, param.shape) for name, param in net.named_parameters()])

('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [38]:
net.state_dict()

OrderedDict([('0.weight',
              tensor([[-0.1391, -0.2921, -0.2221, -0.2146],
                      [ 0.1653,  0.4043,  0.0810,  0.2258],
                      [ 0.1089, -0.2901,  0.4060, -0.0764],
                      [-0.4446, -0.1859, -0.0752, -0.0773],
                      [ 0.0501,  0.3209,  0.3922,  0.4266],
                      [ 0.4470,  0.1577, -0.0096,  0.1195],
                      [ 0.1689,  0.4865,  0.0587, -0.0398],
                      [-0.2586,  0.4552,  0.0782,  0.4044]])),
             ('0.bias',
              tensor([-0.1099, -0.2046,  0.1553,  0.2624, -0.3751,  0.3734,  0.3709,  0.4354])),
             ('2.weight',
              tensor([[-0.3405,  0.1488, -0.3470,  0.0301, -0.1840,  0.0507,  0.2979, -0.1588]])),
             ('2.bias', tensor([-0.1072]))])

In [39]:
net.state_dict()['0.bias']

tensor([-0.1099, -0.2046,  0.1553,  0.2624, -0.3751,  0.3734,  0.3709,  0.4354])

In [40]:
def block1():
    return nn.Sequential(nn.Linear(4,8), nn.ReLU(), nn.Linear(8,4), nn.ReLU())

def block2():
    net = nn.Sequential()
    
    for i in range(4):
        net.add_module(f'block{i}', block1())
    
    net.add_module(f'final',nn.Linear(4,1))
    return net

regnet = block2()

regnet(X)

tensor([[-0.4282],
        [-0.4283]], grad_fn=<AddmmBackward>)

In [41]:
regnet

Sequential(
  (block0): Sequential(
    (0): Linear(in_features=4, out_features=8, bias=True)
    (1): ReLU()
    (2): Linear(in_features=8, out_features=4, bias=True)
    (3): ReLU()
  )
  (block1): Sequential(
    (0): Linear(in_features=4, out_features=8, bias=True)
    (1): ReLU()
    (2): Linear(in_features=8, out_features=4, bias=True)
    (3): ReLU()
  )
  (block2): Sequential(
    (0): Linear(in_features=4, out_features=8, bias=True)
    (1): ReLU()
    (2): Linear(in_features=8, out_features=4, bias=True)
    (3): ReLU()
  )
  (block3): Sequential(
    (0): Linear(in_features=4, out_features=8, bias=True)
    (1): ReLU()
    (2): Linear(in_features=8, out_features=4, bias=True)
    (3): ReLU()
  )
  (final): Linear(in_features=4, out_features=1, bias=True)
)

In [42]:
regnet.state_dict()

OrderedDict([('block0.0.weight',
              tensor([[-0.1414, -0.2763,  0.0912,  0.2628],
                      [-0.0203, -0.3154, -0.3734,  0.2009],
                      [-0.1486,  0.2502, -0.1259, -0.3609],
                      [-0.4629,  0.2864,  0.0642, -0.0444],
                      [ 0.3430,  0.4648, -0.4341, -0.3820],
                      [-0.0667, -0.0683,  0.4167, -0.2995],
                      [ 0.0370,  0.4288,  0.0596, -0.4676],
                      [-0.2145, -0.4341,  0.1977, -0.2197]])),
             ('block0.0.bias',
              tensor([0.4702, 0.1349, 0.1417, 0.0027, 0.3545, 0.2115, 0.2001, 0.3612])),
             ('block0.2.weight',
              tensor([[ 0.1718,  0.3070, -0.2701, -0.0304, -0.2666, -0.0534,  0.2603,  0.3434],
                      [-0.1674,  0.1735,  0.3278, -0.2066,  0.2596,  0.2381, -0.1595,  0.2295],
                      [ 0.1560, -0.2978, -0.3515,  0.0812, -0.0812,  0.2843, -0.0312,  0.2783],
                      [ 0.2899,  0.3045,  0

In [44]:
def init_zero(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0)
        nn.init.zeros_(m.bias)

net.apply(init_zero)

net.state_dict()

OrderedDict([('0.weight',
              tensor([[0., 0., 0., 0.],
                      [0., 0., 0., 0.],
                      [0., 0., 0., 0.],
                      [0., 0., 0., 0.],
                      [0., 0., 0., 0.],
                      [0., 0., 0., 0.],
                      [0., 0., 0., 0.],
                      [0., 0., 0., 0.]])),
             ('0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])),
             ('2.weight', tensor([[0., 0., 0., 0., 0., 0., 0., 0.]])),
             ('2.bias', tensor([0.]))])

In [45]:
def init_normal(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        nn.init.zeros_(m.bias)
        
net.apply(init_normal)

net.state_dict()

OrderedDict([('0.weight',
              tensor([[-0.0019, -0.0028,  0.0035, -0.0033],
                      [ 0.0060, -0.0035,  0.0056, -0.0163],
                      [ 0.0010,  0.0019, -0.0180, -0.0063],
                      [ 0.0021,  0.0096, -0.0001,  0.0054],
                      [-0.0084,  0.0045,  0.0005,  0.0103],
                      [-0.0115, -0.0118, -0.0023,  0.0054],
                      [-0.0066, -0.0039,  0.0018, -0.0098],
                      [ 0.0169,  0.0097, -0.0048,  0.0053]])),
             ('0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])),
             ('2.weight',
              tensor([[-0.0017,  0.0031, -0.0265,  0.0157, -0.0008,  0.0357,  0.0165,  0.0004]])),
             ('2.bias', tensor([0.]))])

In [46]:
# applying different initialisers for different blocks

def xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)

def init_42(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight,42)

net[0].apply(xavier)
net[2].apply(init_42)

net.state_dict()

OrderedDict([('0.weight',
              tensor([[ 0.1990,  0.6444, -0.2115,  0.3642],
                      [-0.6016, -0.0097,  0.6733, -0.1605],
                      [-0.3273, -0.2779, -0.4180, -0.2677],
                      [ 0.5587, -0.1814, -0.3095,  0.6183],
                      [-0.6670,  0.1813,  0.0668, -0.0179],
                      [ 0.2229,  0.0425,  0.4958,  0.3228],
                      [-0.2510,  0.6332, -0.4272, -0.6828],
                      [ 0.0983,  0.1171, -0.3441, -0.1386]])),
             ('0.bias', tensor([0., 0., 0., 0., 0., 0., 0., 0.])),
             ('2.weight', tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])),
             ('2.bias', tensor([0.]))])

In [58]:
# custom intialisation
import torch.nn.functional as F
class MyLinear(nn.Module):
    def __init__(self, in_units, units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units))
    
    def forward(self, X):
        out = torch.matmul(X, self.weight.data) + self.bias.data
        return F.relu(out)

In [59]:
net = MyLinear(4,1)

In [60]:
net.weight

Parameter containing:
tensor([[ 0.1479],
        [-0.8462],
        [ 1.4379],
        [ 0.6987]], requires_grad=True)

In [61]:
net(X)

tensor([[2.2774],
        [0.0000]])

In [62]:
net.weight

Parameter containing:
tensor([[ 0.1479],
        [-0.8462],
        [ 1.4379],
        [ 0.6987]], requires_grad=True)

In [63]:
net = nn.Sequential(MyLinear(64,32), MyLinear(32,4))

net(torch.rand(4, 64))

tensor([[0.0000, 0.0000, 5.6595, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000]])

### Exercises
1. Design a layer that takes an input and computes a tensor reduction, i.e., it returns yk = i,j Wijkxixj .

2. Design a layer that returns the leading half of the Fourier coefficients of the data.

* dontknow.

In [80]:
#1

class LayerOne(nn.Module):
    def __init__(self, first, second):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(first, second))
        self.bias = nn.Parameter(torch.randn(second))
    
    def forward(self, X1, X2):
        out = torch.matmul(X1, self.weight)
        out = torch.matmul(out, X2)
        return F.relu(out)

In [81]:
first = 5
second = 6

X1 = torch.randn(4,first)
X2 = torch.randn(second,1)

In [82]:
net = LayerOne(5,6)

In [83]:
net(X1, X2)

tensor([[0.0000],
        [1.4783],
        [6.3657],
        [0.0000]], grad_fn=<ReluBackward0>)