# Оптимизация нейронных сетей. Метод обратного распространения ошибки (ноутбук)

> Узнаем, как устроена оптимизация нейронных сетей. Оптимизация в `PyTorch`.

## План ноутбука

1. Высокоуровневое API для обучения нейросетей в `PyTorch`
2. Обучение первой нейросети в `PyTorch`

## Высокоуровневое API для обучения нейросетей в `PyTorch`

In [1]:
import torch
import torch.nn as nn

### Создание объекта нейросети

In [2]:
# Fully connected stack 700 -> 500 -> 200 -> 10 with ReLU between the linear
# layers. The submodules are registered one by one under the positional
# names "0".."4", which is what Sequential assigns to positional arguments.
net = nn.Sequential()
net.add_module('0', nn.Linear(700, 500))
net.add_module('1', nn.ReLU())
net.add_module('2', nn.Linear(500, 200))
net.add_module('3', nn.ReLU())
net.add_module('4', nn.Linear(200, 10))

In [3]:
# Evaluating the module as the last expression of a cell displays its repr,
# which lists every registered submodule.
net

Sequential(
  (0): Linear(in_features=700, out_features=500, bias=True)
  (1): ReLU()
  (2): Linear(in_features=500, out_features=200, bias=True)
  (3): ReLU()
  (4): Linear(in_features=200, out_features=10, bias=True)
)

In [4]:
from collections import OrderedDict

# Same architecture, but every layer gets an explicit name. Keyword arguments
# keep their order in an OrderedDict, so the layers are registered (and run)
# in the order written here.
net = nn.Sequential(
    OrderedDict(
        linear1=nn.Linear(700, 500),
        relu1=nn.ReLU(),
        linear2=nn.Linear(500, 200),
        relu2=nn.ReLU(),
        linear3=nn.Linear(200, 10),
    )
)

In [5]:
# Layers registered under a name are reachable as attributes of the container.
net.linear1

Linear(in_features=700, out_features=500, bias=True)

In [6]:
# Random batch: 10 samples with 700 features each, matching the first layer.
input_tensor = torch.rand(10, 700)

# Calling the module runs the forward pass; the result has one row per sample
# and one column per output unit of the last layer.
net(input_tensor).shape

torch.Size([10, 10])

In [7]:
class CustomTaskNetwork(nn.Module):
    """Three-layer perceptron mapping 700 input features to 10 outputs.

    Layer widths are 700 -> 500 -> 200 -> 10. A single shared ReLU module is
    applied after the first two linear layers; the last layer's output is
    returned without an activation.
    """

    def __init__(self):
        super().__init__()

        # Submodules are registered in this order, which is also the order
        # shown in the module's repr and in state_dict().
        self.linear1 = nn.Linear(700, 500)
        self.linear2 = nn.Linear(500, 200)
        self.linear3 = nn.Linear(200, 10)

        self.activation = nn.ReLU()

    def forward(self, x):
        """Run the forward pass.

        Args:
            x: tensor whose last dimension has size 700.

        Returns:
            Tensor with the same leading dimensions and a last dimension of 10.
        """
        hidden = x
        for hidden_layer in (self.linear1, self.linear2):
            hidden = self.activation(hidden_layer(hidden))

        return self.linear3(hidden)

In [8]:
# Instantiate the hand-written module; it is called exactly like a Sequential.
net = CustomTaskNetwork()

# Forward pass on the batch created earlier in the notebook.
net(input_tensor).shape

torch.Size([10, 10])

In [9]:
# Module.to(...) moves/casts the parameters to a device or dtype and returns
# the module itself (hence the repr shown as the cell output).
# NOTE(review): no device or dtype is passed here, so presumably nothing is
# changed; typical use is net.to('cuda') or net.to(torch.float64).
net.to()

CustomTaskNetwork(
  (linear1): Linear(in_features=700, out_features=500, bias=True)
  (linear2): Linear(in_features=500, out_features=200, bias=True)
  (linear3): Linear(in_features=200, out_features=10, bias=True)
  (activation): ReLU()
)

In [10]:
# train() / eval() switch the module between training and evaluation mode.
# The mode only matters for layers that behave differently in the two modes
# (e.g. Dropout, BatchNorm); this network has none.
net.train()
net.eval()

# Enable gradient tracking for every parameter (the default state); the call
# returns the module, which is what the cell displays.
net.requires_grad_()

CustomTaskNetwork(
  (linear1): Linear(in_features=700, out_features=500, bias=True)
  (linear2): Linear(in_features=500, out_features=200, bias=True)
  (linear3): Linear(in_features=200, out_features=10, bias=True)
  (activation): ReLU()
)

In [11]:
# Returns a generator over the module's parameters; wrap it in list(...) to
# inspect the tensors themselves.
net.parameters()

<generator object Module.parameters at 0x0000018EBB47F3E0>

In [12]:
# Ordered mapping from parameter names ("linear1.weight", "linear1.bias", ...)
# to their tensors.
net.state_dict()

OrderedDict([('linear1.weight',
              tensor([[ 0.0242,  0.0078,  0.0109,  ..., -0.0316, -0.0249, -0.0182],
                      [ 0.0267,  0.0091,  0.0039,  ...,  0.0254, -0.0077,  0.0101],
                      [-0.0214,  0.0178, -0.0324,  ..., -0.0281,  0.0341,  0.0296],
                      ...,
                      [-0.0242,  0.0039,  0.0304,  ..., -0.0248, -0.0278, -0.0223],
                      [ 0.0284, -0.0154,  0.0161,  ..., -0.0215,  0.0067,  0.0286],
                      [ 0.0375, -0.0180,  0.0010,  ..., -0.0176, -0.0225,  0.0093]])),
             ('linear1.bias',
              tensor([-2.8662e-02,  3.7251e-02,  2.0614e-02,  2.4399e-02,  1.6917e-02,
                       1.2851e-02,  1.4385e-02, -2.8415e-02, -2.9006e-02, -1.3612e-02,
                      -1.7165e-02, -1.9116e-02, -1.8488e-02, -3.5874e-02, -1.0187e-02,
                      -8.6642e-03,  1.9877e-02, -1.6024e-02, -1.2617e-02, -2.4848e-02,
                       5.0609e-03,  2.9533e-02,  1.6946e

In [13]:
# Copy tensors from a state dict into the module. Loading the module's own
# state is a round trip that only demonstrates the API; the return value
# reports whether the keys matched.
net.load_state_dict(net.state_dict())

<All keys matched successfully>

### Оптимизаторы

In [14]:
from torch import optim

In [15]:
# Two of the optimizer classes provided by torch.optim.
optim.SGD, optim.Adam

(torch.optim.sgd.SGD, torch.optim.adam.Adam)

In [16]:
# Adam over all parameters of the network, with learning rate 1e-4.
optimizer = optim.Adam(net.parameters(), lr=0.0001)

In [17]:
# The optimizer's repr lists the hyperparameters of each parameter group.
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0001
    maximize: False
    weight_decay: 0
)

In [18]:
# Per-parameter-group options: each dict is one group. A group may override
# the keyword defaults given after the list.
#   - linear1 uses the default lr=1e-2
#   - linear2 overrides it with lr=1e-3
#   - momentum=0.9 applies to both groups
# NOTE(review): linear3's parameters are in neither group, so this optimizer
# will not update them.
optimizer = optim.SGD(
    [
        {'params': net.linear1.parameters()},
        {'params': net.linear2.parameters(), 'lr': 1e-3}
    ],
    lr=1e-2,
    momentum=0.9
)

In [19]:
# Apply one update using the gradients currently stored in each parameter's
# .grad field.
# NOTE(review): no backward() has been run on `net` in the cells above, so
# there are presumably no gradients yet and this call changes nothing.
optimizer.step()

In [20]:
# Reset the gradients of all optimized parameters; backward() accumulates
# into .grad, so this is done before every new backward pass.
optimizer.zero_grad()

### Функции потерь

In [21]:
# Loss functions are provided as module classes in torch.nn.
nn.L1Loss, nn.MSELoss, nn.CrossEntropyLoss

(torch.nn.modules.loss.L1Loss,
 torch.nn.modules.loss.MSELoss,
 torch.nn.modules.loss.CrossEntropyLoss)

In [22]:
# Mean squared error criterion with default settings.
loss = nn.MSELoss()

In [23]:
# requires_grad=True makes autograd track operations on this tensor, so
# backward() below can populate its .grad field.
# NOTE(review): the name `input` shadows the Python builtin; a later cell
# reads `input.grad`, so a rename has to be applied to both cells together.
input = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5)

# The criterion reduces the element-wise squared errors to a single scalar.
output = loss(input, target)

print(output)

# Backpropagate from the scalar loss to every tensor that requires grad.
output.backward()

tensor(2.5575, grad_fn=<MseLossBackward0>)


In [24]:
# Gradient of the loss with respect to `input`; same shape as `input`.
input.grad

tensor([[ 0.1323,  0.0927,  0.4579, -0.3076,  0.1541],
        [-0.1841, -0.3431,  0.0129, -0.0326, -0.2800],
        [ 0.1487, -0.1870,  0.1929,  0.0435, -0.0201]])

### Датасеты и даталоадеры

In [25]:
from torch.utils.data import Dataset
from torch.utils.data import TensorDataset

In [26]:
# Size of the synthetic regression problem: features per object and number
# of objects.
n_features = 2
n_objects = 300

# Fix the RNG seed so the generated data is reproducible. The trailing
# semicolon stops the notebook from echoing the call's return value.
torch.manual_seed(0);

In [27]:
# Ground-truth weights that the regression is expected to recover.
w_true = torch.randn(n_features)

# Features: uniform in [-5, 5), then column j is stretched in place by
# 2 * j + 1 so that the columns have different scales.
X = 10 * (torch.rand(n_objects, n_features) - 0.5)
X.mul_(2 * torch.arange(n_features) + 1)

# Targets: linear response plus standard normal noise, stored as a column
# of shape (n_objects, 1).
Y = torch.unsqueeze(torch.matmul(X, w_true) + torch.randn(n_objects), dim=1)

In [28]:
# Wrap the feature and target tensors so they can be indexed together.
dataset = TensorDataset(X, Y)

In [29]:
# Indexing returns a tuple with the corresponding row of every wrapped tensor.
dataset[7]

(tensor([-3.3897, -6.5319]), tensor([-3.7483]))

In [30]:
# The same pair obtained by indexing the source tensors directly.
X[7], Y[7]

(tensor([-3.3897, -6.5319]), tensor([-3.7483]))

In [31]:
class CustomDataset(Dataset):
    """Synthetic linear-regression dataset: ``y = x @ w_true + noise``.

    Features are drawn uniformly from [-5, 5) and column ``j`` is then scaled
    by ``2 * j + 1``. The noise is standard normal.

    Args:
        w_true: 1-D tensor with ``n_features`` ground-truth weights.
        n_features: number of feature columns.
        n_objects: number of generated samples.

    Attributes set on the instance:
        X: feature tensor of shape ``(n_objects, n_features)``.
        Y: target tensor of shape ``(n_objects, 1)``.
    """

    def __init__(self, w_true, n_features, n_objects):
        column_scale = torch.arange(n_features) * 2 + 1

        self.X = (torch.rand(n_objects, n_features) - 0.5) * 10
        self.X *= column_scale

        # Store targets as a column so that a batch of targets is (batch, 1),
        # the same shape a model with a single output unit produces. With 1-D
        # targets, MSELoss would broadcast (batch, 1) against (batch,) into a
        # (batch, batch) matrix and report a wrong loss.
        self.Y = (self.X @ w_true + torch.randn(n_objects)).unsqueeze(1)

    def __len__(self):
        """Return the number of samples."""
        return len(self.Y)

    def __getitem__(self, item):
        """Return the ``(features, target)`` pair stored at index ``item``."""
        return self.X[item], self.Y[item]

In [32]:
# Build the synthetic dataset from the ground-truth weights and the problem
# sizes defined earlier in the notebook.
dataset = CustomDataset(w_true, n_features, n_objects)

In [33]:
from torch.utils.data import DataLoader

In [34]:
# Serve the dataset in mini-batches of 4 samples, reshuffled on every pass.
loader = DataLoader(dataset, batch_size=4, shuffle=True)

In [35]:
# One pass over the loader: each iteration yields a (features, targets)
# batch; only the shape of the feature batch is printed.
for x, y in loader:
    print(x.shape)

torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([4, 2])
torch.Size([

In [36]:
# Shape of the full feature tensor, for comparison with the batch shapes.
X.shape

torch.Size([300, 2])

### Общая структура обучения модели

# Generic training loop. `model`, `dataloader`, `optimizer` and `loss_fn`
# are placeholders for objects created elsewhere.
model.train()

for x, y in dataloader:

    # 1. Clear the gradients left over from the previous iteration.
    optimizer.zero_grad()

    # 2. Forward pass.
    output = model(x)

    # 3. Compare predictions with targets.
    loss = loss_fn(output, y)

    # 4. Backward pass: compute gradients of the loss w.r.t. the parameters.
    loss.backward()

    # 5. Update the parameters from their gradients.
    optimizer.step()

## Обучение первой нейросети в `PyTorch`

In [37]:
class CustomTaskNetwork(nn.Module):
    """Linear regression written as a network with a single linear layer.

    Args:
        in_features: number of input features. When omitted, the
            notebook-level ``n_features`` is used, so existing
            ``CustomTaskNetwork()`` calls behave as before.
    """

    def __init__(self, in_features=None):
        super().__init__()

        # Resolve the default at construction time, exactly when the original
        # code read the global.
        if in_features is None:
            in_features = n_features

        self.linear = nn.Linear(in_features, 1)

    def forward(self, x):
        """Return one prediction per input row, shape ``(..., 1)``."""
        return self.linear(x)

# Fresh model with randomly initialised weights.
net = CustomTaskNetwork()

# Adam over all model parameters with learning rate 0.1.
optimizer = optim.Adam(net.parameters(), lr=1e-1)

loss_fn = nn.MSELoss()

# Train on the X / Y tensors generated earlier in the notebook.
dataset = TensorDataset(X, Y)

loader = DataLoader(dataset, batch_size=4, shuffle=True)

# Weight history: one row per optimisation step plus one for the final
# weights. torch.empty leaves the memory uninitialised; every row is written
# by the training cell that follows.
w_list = torch.empty(len(loader) + 1, n_features)

In [38]:
# Single pass (one epoch) over the loader, recording the weight trajectory.
net.train()

for i, (x, y) in enumerate(loader):
    # Store the weights *before* step i. detach().clone() makes an independent
    # copy, so later optimizer updates do not modify the saved row.
    w_list[i] = net.linear.weight.detach().clone()

    # Clear gradients from the previous step; backward() accumulates.
    optimizer.zero_grad()

    output = net(x)

    loss = loss_fn(output, y)

    # Loss of the current mini-batch only, hence the noisy values.
    print(f'MSE на шаге {i} {loss.item():.5f}')

    # Backpropagate, then let the optimizer update the parameters.
    loss.backward()

    optimizer.step()

# Last row: the weights after the final update.
w_list[len(loader)] = net.linear.weight.detach().clone()

MSE на шаге 0 28.63088
MSE на шаге 1 45.38396
MSE на шаге 2 1.64545
MSE на шаге 3 30.87534
MSE на шаге 4 45.28118
MSE на шаге 5 11.27283
MSE на шаге 6 9.16029
MSE на шаге 7 14.66862
MSE на шаге 8 13.91990
MSE на шаге 9 14.74506
MSE на шаге 10 10.66703
MSE на шаге 11 12.45725
MSE на шаге 12 14.94521
MSE на шаге 13 10.00435
MSE на шаге 14 2.04556
MSE на шаге 15 9.38098
MSE на шаге 16 9.90906
MSE на шаге 17 2.16970
MSE на шаге 18 1.53581
MSE на шаге 19 5.15861
MSE на шаге 20 0.65541
MSE на шаге 21 2.19203
MSE на шаге 22 1.33867
MSE на шаге 23 1.24042
MSE на шаге 24 1.43382
MSE на шаге 25 2.31887
MSE на шаге 26 0.67618
MSE на шаге 27 2.36088
MSE на шаге 28 1.00729
MSE на шаге 29 0.30867
MSE на шаге 30 3.10811
MSE на шаге 31 1.03187
MSE на шаге 32 1.54535
MSE на шаге 33 0.91494
MSE на шаге 34 0.71947
MSE на шаге 35 1.57248
MSE на шаге 36 1.05323
MSE на шаге 37 1.56655
MSE на шаге 38 1.97591
MSE на шаге 39 1.25570
MSE на шаге 40 2.83284
MSE на шаге 41 3.17145
MSE на шаге 42 1.99665
MSE на ша

In [39]:
from plot_trajectory import plot_trajectory

# plot_trajectory is a project-local helper that is not visible here;
# presumably it draws the recorded weight trajectory against the true
# weights -- confirm against its definition.
# Tensors are converted to NumPy arrays, and Y is squeezed from a column
# (n_objects, 1) back to a 1-D array.
plot_trajectory(
    w_list.numpy(),
    w_true.numpy(),
    X.numpy(),
    Y.squeeze(1).numpy(),
    'Mini-batch градиентный спуск'
)

In [40]:
# Number of full passes over the loader.
num_epochs = 5

# One row per optimisation step across all epochs, plus one for the final
# weights.
w_list = torch.empty(len(loader) * num_epochs + 1, n_features)

# Restart from a freshly initialised model with a new optimizer, so the state
# of the previous run is not carried over.
net = CustomTaskNetwork()

optimizer = optim.Adam(net.parameters(), lr=1e-1)

In [41]:
# Multi-epoch training: same step as before, repeated num_epochs times.
net.train()

for epoch in range(num_epochs):
    # Running sum of the per-batch losses within this epoch.
    total_loss = 0

    for i, (x, y) in enumerate(loader):
        # Global step index = batch index + steps done in previous epochs.
        # The row stores the weights before this step is applied.
        w_list[i + epoch * len(loader)] = net.linear.weight.detach().clone()

        # Clear gradients from the previous step; backward() accumulates.
        optimizer.zero_grad()

        output = net(x)

        loss = loss_fn(output, y)

        # .item() extracts a plain Python float, so no autograd graph is kept.
        total_loss += loss.item()

        loss.backward()

        optimizer.step()

    # Average of the per-batch mean losses over the epoch.
    total_loss /= len(loader)

    print(f'MSE после эпохи {epoch} {total_loss:.5f}')

# Last row: the weights after the final update.
w_list[len(loader) * num_epochs] = net.linear.weight.detach().clone()

MSE после эпохи 0 6.04939
MSE после эпохи 1 1.08135
MSE после эпохи 2 1.09777
MSE после эпохи 3 1.24916
MSE после эпохи 4 1.12345


In [42]:
from plot_trajectory import plot_trajectory

# Same call as for the single-epoch run, now with the weight history that
# spans all epochs. plot_trajectory is a project-local helper that is not
# visible here; presumably it draws the trajectory against the true weights
# -- confirm against its definition.
plot_trajectory(
    w_list.numpy(),
    w_true.numpy(),
    X.numpy(),
    Y.squeeze(1).numpy(),
    'Mini-batch градиентный спуск'
)