<a href="https://colab.research.google.com/github/gabriellaaileen/ADL/blob/main/Tugas_1_ADL_Bab_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **5.1. Layers and Blocks**

In [1]:
import torch
from torch import nn
from torch.nn import functional as F

net = nn.Sequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))

X = torch.rand(2, 20)
net(X)

tensor([[ 0.0169,  0.0129,  0.1015,  0.0335,  0.0666,  0.0897, -0.1919, -0.1297,
         -0.0138, -0.0449],
        [ 0.1290,  0.1301,  0.1427,  0.0691,  0.2014,  0.0526, -0.2709, -0.1733,
          0.0015,  0.0389]], grad_fn=<AddmmBackward0>)

# 😲 5.1.1. A Custom Block

In [2]:
class MLP(nn.Module):
    # Declare a layer with model parameters. Here, we declare two fully
    # connected layers
    def __init__(self):
        # Call the constructor of the `MLP` parent class `Module` to perform
        # the necessary initialization. In this way, other function arguments
        # can also be specified during class instantiation, such as the model
        # parameters, `params` (to be described later)
        super().__init__()
        self.hidden = nn.Linear(20, 256)  # Hidden layer
        self.out = nn.Linear(256, 10)  # Output layer

    # Define the forward propagation of the model, that is, how to return the
    # required model output based on the input `X`
    def forward(self, X):
        # Note here we use the funtional version of ReLU defined in the
        # nn.functional module.
        return self.out(F.relu(self.hidden(X)))

In [3]:
net = MLP()
net(X)

tensor([[ 0.1256,  0.0025, -0.0443,  0.0116, -0.1485,  0.2130, -0.0912, -0.1218,
         -0.0318, -0.0412],
        [ 0.1288,  0.1218,  0.0636, -0.0723, -0.1689,  0.2107, -0.1455, -0.1111,
          0.0559, -0.1314]], grad_fn=<AddmmBackward0>)

# 👕 5.1.2. The Sequential Block

In [4]:
class MySequential(nn.Module):
    def __init__(self, *args):
        super().__init__()
        for idx, module in enumerate(args):
            # Here, `module` is an instance of a `Module` subclass. We save it
            # in the member variable `_modules` of the `Module` class, and its
            # type is OrderedDict
            self._modules[str(idx)] = module

    def forward(self, X):
        # OrderedDict guarantees that members will be traversed in the order
        # they were added
        for block in self._modules.values():
            X = block(X)
        return X

In [5]:
net = MySequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
net(X)

tensor([[ 0.1287, -0.0214,  0.1135, -0.0445, -0.0556, -0.0445,  0.2277, -0.2158,
          0.1865,  0.0295],
        [ 0.0582, -0.0640,  0.1111,  0.0031, -0.0406, -0.0273,  0.2051, -0.1600,
          0.1426, -0.0554]], grad_fn=<AddmmBackward0>)

# 🦖 5.1.3. Executing Code in the Forward Propagation Function

In [6]:
class FixedHiddenMLP(nn.Module):
    def __init__(self):
        super().__init__()
        # Random weight parameters that will not compute gradients and
        # therefore keep constant during training
        self.rand_weight = torch.rand((20, 20), requires_grad=False)
        self.linear = nn.Linear(20, 20)

    def forward(self, X):
        X = self.linear(X)
        # Use the created constant parameters, as well as the `relu` and `mm`
        # functions
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        # Reuse the fully-connected layer. This is equivalent to sharing
        # parameters with two fully-connected layers
        X = self.linear(X)
        # Control flow
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()

In [7]:
net = FixedHiddenMLP()
net(X)

tensor(0.0638, grad_fn=<SumBackward0>)

In [8]:
class NestMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(20, 64), nn.ReLU(),
                                 nn.Linear(64, 32), nn.ReLU())
        self.linear = nn.Linear(32, 16)

    def forward(self, X):
        return self.linear(self.net(X))

chimera = nn.Sequential(NestMLP(), nn.Linear(16, 20), FixedHiddenMLP())
chimera(X)

tensor(0.1545, grad_fn=<SumBackward0>)

# 🐟 5.1.5. Summary
Layers are blocks.

Many layers can comprise a block.

Many blocks can comprise a block.

A block can contain code.

Blocks take care of lots of housekeeping, including parameter initialization and backpropagation.

Sequential concatenations of layers and blocks are handled by the Sequential block.

# **5.2 Parameter Management**

In [9]:
import torch
from torch import nn

net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
X = torch.rand(size=(2, 4))
net(X)

tensor([[-0.3797],
        [-0.4794]], grad_fn=<AddmmBackward0>)

# 📦 5.2.1. Parameter Access

In [10]:
print(net[2].state_dict())

OrderedDict([('weight', tensor([[-0.0788,  0.2354, -0.0432, -0.0104, -0.2144, -0.1929,  0.2134, -0.1451]])), ('bias', tensor([-0.2742]))])


**Targeted Parameters**

In [11]:
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.2742], requires_grad=True)
tensor([-0.2742])


In [12]:
net[2].weight.grad == None

True

**All Parameters at Once**

In [13]:
print(*[(name, param.shape) for name, param in net[0].named_parameters()])
print(*[(name, param.shape) for name, param in net.named_parameters()])

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [14]:
net.state_dict()['2.bias'].data

tensor([-0.2742])

**Collecting Parameters from Nested Blocks**

In [16]:
def block1():
    return nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                         nn.Linear(8, 4), nn.ReLU())

def block2():
    net = nn.Sequential()
    for i in range(4):
        # Nested here
        net.add_module(f'block {i}', block1())
    return net

rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)

tensor([[0.3952],
        [0.3952]], grad_fn=<AddmmBackward0>)

In [17]:
print(rgnet)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


**Built-in Initialization**

In [18]:
def init_normal(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        nn.init.zeros_(m.bias)
net.apply(init_normal)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([-0.0006, -0.0036, -0.0099,  0.0086]), tensor(0.))

In [19]:
def init_constant(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 1)
        nn.init.zeros_(m.bias)
net.apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [20]:
def xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
def init_42(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 42)

net[0].apply(xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([ 0.3876,  0.1793, -0.1025,  0.1875])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


**Custom Initialization**

In [21]:
def my_init(m):
    if type(m) == nn.Linear:
        print("Init", *[(name, param.shape)
                        for name, param in m.named_parameters()][0])
        nn.init.uniform_(m.weight, -10, 10)
        m.weight.data *= m.weight.data.abs() >= 5

net.apply(my_init)
net[0].weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[-6.0920,  0.0000,  0.0000,  0.0000],
        [-0.0000,  0.0000,  9.5898, -5.3194]], grad_fn=<SliceBackward0>)

In [22]:
net[0].weight.data[:] += 1
net[0].weight.data[0, 0] = 42
net[0].weight.data[0]

tensor([42.,  1.,  1.,  1.])

# 🐅5.2.3. Tied Parameters

In [23]:
# We need to give the shared layer a name so that we can refer to its
# parameters
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.Linear(8, 1))
net(X)
# Check whether the parameters are the same
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100
# Make sure that they are actually the same object rather than just having the
# same value
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


# 🦺 5.2.4. Summary
We have several ways to access, initialize, and tie model parameters.

We can use custom initialization.

# **5.3. Deferred Initialization**

# ⛳5.3.1. Instantiating a Network

In [24]:
import tensorflow as tf

net = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation=tf.nn.relu),
    tf.keras.layers.Dense(10),
])

In [25]:
[net.layers[i].get_weights() for i in range(len(net.layers))]

[[], []]

In [26]:
X = tf.random.uniform((2, 20))
net(X)
[w.shape for w in net.get_weights()]

[(20, 256), (256,), (256, 10), (10,)]

# ✨ 5.3.2. Summary
Deferred initialization can be convenient, allowing the framework to infer parameter shapes automatically, making it easy to modify architectures and eliminating one common source of errors.

We can pass data through the model to make the framework finally initialize parameters.

# **5.4. Custom Layers**

# 🥍5.4.1. Layers without Parameters

In [27]:
import torch
from torch import nn
from torch.nn import functional as F


class CenteredLayer(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, X):
        return X - X.mean()

In [28]:
layer = CenteredLayer()
layer(torch.FloatTensor([1, 2, 3, 4, 5]))

tensor([-2., -1.,  0.,  1.,  2.])

In [29]:
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())

In [30]:
Y = net(torch.rand(4, 8))
Y.mean()

tensor(2.7940e-09, grad_fn=<MeanBackward0>)

# 🍃5.4.2. Layers with Parameters

In [31]:
class MyLinear(nn.Module):
    def __init__(self, in_units, units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units,))
    def forward(self, X):
        linear = torch.matmul(X, self.weight.data) + self.bias.data
        return F.relu(linear)

In [32]:
linear = MyLinear(5, 3)
linear.weight

Parameter containing:
tensor([[ 0.2254,  0.0301,  1.4857],
        [ 0.5074, -0.3842,  1.3540],
        [-0.2418, -1.1540, -1.6451],
        [ 0.5182, -0.5010,  1.1043],
        [-0.2075, -0.6601,  0.0120]], requires_grad=True)

In [33]:
linear(torch.rand(2, 5))

tensor([[0.0255, 0.0000, 0.4383],
        [0.0000, 0.0000, 0.0000]])

In [34]:
net = nn.Sequential(MyLinear(64, 8), MyLinear(8, 1))
net(torch.rand(2, 64))

tensor([[0.],
        [0.]])

# 🥪 5.4.3. Summary
We can design custom layers via the basic layer class. This allows us to define flexible new layers that behave differently from any existing layers in the library.

Once defined, custom layers can be invoked in arbitrary contexts and architectures.

Layers can have local parameters, which can be created through built-in functions.

# **5.5. File I/O**

# 😭 5.5.1. Loading and Saving Tensors

In [35]:
import torch
from torch import nn
from torch.nn import functional as F

x = torch.arange(4)
torch.save(x, 'x-file')

In [36]:
x2 = torch.load('x-file')
x2

tensor([0, 1, 2, 3])

In [37]:
y = torch.zeros(4)
torch.save([x, y],'x-files')
x2, y2 = torch.load('x-files')
(x2, y2)

(tensor([0, 1, 2, 3]), tensor([0., 0., 0., 0.]))

In [38]:
mydict = {'x': x, 'y': y}
torch.save(mydict, 'mydict')
mydict2 = torch.load('mydict')
mydict2

{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}

# 💌 5.5.2. Loading and Saving Model Parameters

In [39]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.output = nn.Linear(256, 10)

    def forward(self, x):
        return self.output(F.relu(self.hidden(x)))

net = MLP()
X = torch.randn(size=(2, 20))
Y = net(X)

In [40]:
torch.save(net.state_dict(), 'mlp.params')

In [41]:
clone = MLP()
clone.load_state_dict(torch.load('mlp.params'))
clone.eval()

MLP(
  (hidden): Linear(in_features=20, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=10, bias=True)
)

In [42]:
Y_clone = clone(X)
Y_clone == Y

tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])

# 🦢 5.5.3. Summary
The save and load functions can be used to perform file I/O for tensor objects.

We can save and load the entire sets of parameters for a network via a parameter dictionary.

Saving the architecture has to be done in code rather than in parameters.

# **5.6. GPUs**

In [None]:
#MASIH BELUM BISA
#!pip install nvidia-smi
!nvidia-smi

# 🧭5.6.1. Computing Devices

In [47]:
import torch
from torch import nn

torch.device('cpu'), torch.device('cuda'), torch.device('cuda:1')

(device(type='cpu'), device(type='cuda'), device(type='cuda', index=1))

In [48]:
torch.cuda.device_count()

0

In [50]:
def try_gpu(i=0):  #save
    """Return gpu(i) if exists, otherwise return cpu()."""
    if torch.cuda.device_count() >= i + 1:
        return torch.device(f'cuda:{i}')
    return torch.device('cpu')

def try_all_gpus():  #save
    """Return all available GPUs, or [cpu(),] if no GPU exists."""
    devices = [torch.device(f'cuda:{i}')
             for i in range(torch.cuda.device_count())]
    return devices if devices else [torch.device('cpu')]

try_gpu(), try_gpu(10), try_all_gpus()

(device(type='cpu'), device(type='cpu'), [device(type='cpu')])

# 🍵 5.6.2. Tensors and GPUs

In [51]:
x = torch.tensor([1, 2, 3])
x.device

device(type='cpu')

In [52]:
X = torch.ones(2, 3, device=try_gpu())
X

tensor([[1., 1., 1.],
        [1., 1., 1.]])

In [53]:
Y = torch.rand(2, 3, device=try_gpu(1))
Y

tensor([[0.2547, 0.5585, 0.7683],
        [0.5970, 0.9417, 0.8266]])

In [54]:
#MASIH BELUM BISA CUDA=0
Z = X.cuda(1)
print(X)
print(Z)

RuntimeError: ignored

# 🤓 5.6.3. Neural Networks and GPUs

In [55]:
net = nn.Sequential(nn.Linear(3, 1))
net = net.to(device=try_gpu())

In [56]:
net(X)

tensor([[0.1980],
        [0.1980]], grad_fn=<AddmmBackward0>)

In [57]:
net[0].weight.data.device

device(type='cpu')

# ⛵ 5.6.4. Summary
We can specify devices for storage and calculation, such as the CPU or GPU. By default, data are created in the main memory and then use the CPU for calculations.

The deep learning framework requires all input data for calculation to be on the same device, be it CPU or the same GPU.

You can lose significant performance by moving data without care. A typical mistake is as follows: computing the loss for every minibatch on the GPU and reporting it back to the user on the command line (or logging it in a NumPy ndarray) will trigger a global interpreter lock which stalls all GPUs. It is much better to allocate memory for logging inside the GPU and only move larger logs.