# Введение в PyTorch

In [None]:
import numpy as np
import torch

### Создание тензоров

In [None]:
# 2x3 tensor filled with ones (the shape is passed as a single tuple).
torch.ones((2, 3))

tensor([[1., 1., 1.],
        [1., 1., 1.]])

In [None]:
# Fix the RNG seed before the first random call so that every random tensor
# below is reproducible after Restart Kernel -> Run All.
torch.manual_seed(0)
torch.rand(3, 4)  # 3x4 tensor of random values

tensor([[0.9893, 0.0792, 0.3574, 0.3458],
        [0.3679, 0.4765, 0.3919, 0.8766],
        [0.9855, 0.4889, 0.2112, 0.5195]])

In [None]:
# 3x4 tensor of random values (the shape is passed as a single tuple).
torch.randn((3, 4))

tensor([[ 1.2033,  0.3764,  1.9905, -0.4725],
        [-1.7201,  0.7569, -1.5390,  0.1497],
        [-1.1193, -0.7456, -1.6319, -0.2386]])

In [None]:
# Integers 0..4; the start is spelled out, the end is exclusive.
torch.arange(0, 5)

tensor([0, 1, 2, 3, 4])

In [None]:
# Lay the integers 0..7 out as a 2x4 matrix, row by row.
torch.arange(8).view(2, 4)

tensor([[0, 1, 2, 3],
        [4, 5, 6, 7]])

In [None]:
# Build a tensor from an existing NumPy array (numpy is imported together with
# the other imports at the top of the notebook).
x = np.array([4, 5, 9])
torch.from_numpy(x)

tensor([4, 5, 9])

### Операции с тензорами

Матричное произведение

In [None]:
# Matrix product: (2, 4) @ (4, 3) -> (2, 3).
A = torch.randn((2, 4))
B = torch.randn((4, 3))
A @ B

tensor([[-0.7619, -0.8570,  0.0308],
        [-1.6654,  0.4549, -3.4008]])

Поэлементные операции

In [None]:
# Element-wise product of two tensors with the same (2, 4) shape.
C = torch.randn((2, 4))
torch.mul(A, C)

tensor([[ 0.3694, -0.0925,  0.1386,  0.0105],
        [ 0.0018, -0.0073,  0.7317, -0.2604]])

In [None]:
# Element-wise exponential; the result is a new tensor.
torch.exp(A)

tensor([[0.5177, 0.8069, 0.7253, 0.9403],
        [0.4662, 1.0201, 2.7632, 1.3464]])

In [None]:
A  # still the original values: .exp() returned a new tensor and left A alone

tensor([[-0.6584, -0.2145, -0.3212, -0.0616],
        [-0.7632,  0.0199,  1.0164,  0.2974]])

Inplace-операции (их имена оканчиваются нижним подчёркиванием) изменяют сам тензор:

In [None]:
A.exp_()  # in-place variant (trailing underscore): overwrites A with exp(A)

tensor([[0.5177, 0.8069, 0.7253, 0.9403],
        [0.4662, 1.0201, 2.7632, 1.3464]])

In [None]:
A  # now holds the exponentials: exp_() modified A itself

tensor([[0.5177, 0.8069, 0.7253, 0.9403],
        [0.4662, 1.0201, 2.7632, 1.3464]])

Индексирование

In [None]:
A[0, 1:3]  # row 0, columns 1 and 2 (the slice end is exclusive)

tensor([0.8069, 0.7253])

In [None]:
A[0, 1:3] = 100  # slice assignment writes into A in place; the scalar fills the whole slice
A

tensor([[  0.5177, 100.0000, 100.0000,   0.9403],
        [  0.4662,   1.0201,   2.7632,   1.3464]])

### Составим полносвязную нейросеть

$$Z_1 = W_1 X^T + b_1 \cdot \bar 1$$

$$Z_2 = \mathrm{ReLU}(Z_1)$$

$$Z_3 = W_2 Z_2 + b_2 \cdot \bar 1$$

In [None]:
# Forward pass of a two-layer fully connected network on random data.
X = torch.randn(32, 10)  # 32 objects, 10 features each
Y = torch.randn(32)  # one target value per object

# Layer 1: input size 10, output size 5. Objects become columns, hence X.T.
W1 = torch.randn(5, 10)
b1 = torch.randn(5)
Z1 = W1 @ X.T + b1.unsqueeze(1)  # bias is broadcast across the 32 columns

# ReLU written as an element-wise maximum with zero.
Z2 = torch.maximum(torch.zeros_like(Z1), Z1)

# Layer 2: input size 5, output size 1.
W2 = torch.randn(1, 5)
b2 = torch.randn(1)
Z3 = (W2 @ Z2 + b2.unsqueeze(1)).T  # transpose back to one row per object

In [None]:
# Sanity check: shape of the first-layer product (presenter note: delete for presentation).
(W1 @ X.T).shape

torch.Size([5, 32])

In [None]:
Z3.shape  # sanity check of the network output shape (presenter note: delete for presentation)

torch.Size([32, 1])

### Автоматическое дифференцирование

In [None]:
# Two leaf tensors tracked by autograd.
x = torch.randn(4, requires_grad=True)
y = torch.randn(4, requires_grad=True)

# Scalar function p = sum(x * y + 10); backward() fills x.grad and y.grad.
z = torch.add(torch.mul(x, y), 10)
p = torch.sum(z)
p.backward()

# For this p, dp/dx equals y and dp/dy equals x.
for label, value in (
    ('x:', x),
    ('y:', y),
    ('dp / dx:', x.grad),
    ('dp / dy:', y.grad),
):
    print(label, value)

x: tensor([-1.4044, -1.3301,  1.2115, -0.4497], requires_grad=True)
y: tensor([-0.3929,  2.2903, -0.7839,  1.7781], requires_grad=True)
dp / dx: tensor([-0.3929,  2.2903, -0.7839,  1.7781])
dp / dy: tensor([-1.4044, -1.3301,  1.2115, -0.4497])


Дифференцируем нейронную сеть

In [None]:
# Presenter note: this cell is written live. It starts as a copy of the network
# code above; requires_grad=True is added to the parameters and the loss /
# backward lines at the end are new.
X = torch.randn(32, 10)  # 32 objects, 10 features
Y = torch.randn(32)  # 32 targets, one per object

# Parameters are created with requires_grad=True so autograd tracks them.
W1 = torch.randn(5, 10, requires_grad=True)
b1 = torch.randn(5, requires_grad=True)
Z1 = torch.matmul(W1, X.transpose(1, 0)) + b1[:, None]

Z2 = torch.maximum(Z1, torch.zeros_like(Z1))  # ReLU

W2 = torch.randn(1, 5, requires_grad=True)
b2 = torch.randn(1, requires_grad=True)
Z3 = torch.matmul(W2, Z2) + b2[:, None]
Z3 = Z3.transpose(1, 0)  # shape (32, 1): one prediction per object

# Z3 is (32, 1) while Y is (32,). Subtracting them directly would broadcast to
# a (32, 32) matrix of all pairwise differences, so Y gets a trailing axis and
# the loss is the mean squared error over matching objects only.
L = ((Z3 - Y[:, None]) ** 2).mean()

# Fills .grad of every tensor created with requires_grad=True.
L.backward()

print("dL / dW1:", W1.grad)
print("dL / db1:", b1.grad)
print("dL / dW2:", W2.grad)
print("dL / db2:", b2.grad)

dL / dW1: tensor([[ 1.0110,  2.6501,  1.8997,  0.4245, -1.8000,  5.9386, -2.9659, -3.9327,
         -4.3395,  6.8481],
        [ 0.7114,  2.0024, -3.9333, -3.8857,  2.6567,  6.0890, -8.2333, -7.7922,
         -2.7761,  7.3916],
        [-0.0273,  0.2752, -0.2023,  0.6661, -0.0140,  1.5380, -0.2271, -0.7837,
         -0.7617,  1.3543],
        [ 0.3616,  0.8439, -0.8045,  0.3191,  0.2109,  0.2400, -0.1422,  0.2637,
          1.1213, -1.3847],
        [-0.4788,  0.9542,  0.4901,  0.0629,  0.1922,  1.7220, -1.3300, -1.6904,
         -1.6242,  1.8175]])
dL / db1: tensor([ 8.3458,  9.9339,  1.2457, -0.9960,  2.8252])
dL / dW2: tensor([[-30.1523, -17.7304,  -7.0576,  -1.4676, -33.8620]])
dL / db2: tensor([-9.4068])


### Об обнулении градиентов

Повторим проход вперед и проход назад еще раз и увидим, что новые и старые градиенты складываются:

In [None]:
# Repeat the forward and backward pass without clearing .grad first:
# backward() adds the new gradients to the ones already stored.
Z1 = torch.matmul(W1, X.transpose(1, 0)) + b1[:, None]
Z2 = torch.maximum(Z1, torch.zeros_like(Z1))  # ReLU
Z3 = torch.matmul(W2, Z2) + b2[:, None]
Z3 = Z3.transpose(1, 0)  # shape (32, 1)

# Y is (32,); give it a trailing axis so the difference stays (32, 1) instead
# of broadcasting to a (32, 32) matrix of pairwise differences.
L = ((Z3 - Y[:, None]) ** 2).mean()

L.backward()

print("dL / dW1:", W1.grad)
print("dL / db1:", b1.grad)
print("dL / dW2:", W2.grad)
print("dL / db2:", b2.grad)

dL / dW1: tensor([[  2.0220,   5.3002,   3.7994,   0.8491,  -3.6000,  11.8772,  -5.9317,
          -7.8653,  -8.6791,  13.6962],
        [  1.4228,   4.0049,  -7.8665,  -7.7714,   5.3134,  12.1779, -16.4667,
         -15.5844,  -5.5523,  14.7832],
        [ -0.0547,   0.5504,  -0.4046,   1.3322,  -0.0280,   3.0759,  -0.4542,
          -1.5673,  -1.5233,   2.7086],
        [  0.7231,   1.6878,  -1.6091,   0.6383,   0.4219,   0.4800,  -0.2843,
           0.5273,   2.2427,  -2.7694],
        [ -0.9576,   1.9083,   0.9802,   0.1258,   0.3845,   3.4440,  -2.6601,
          -3.3807,  -3.2484,   3.6349]])
dL / db1: tensor([16.6915, 19.8678,  2.4915, -1.9921,  5.6505])
dL / dW2: tensor([[-60.3046, -35.4608, -14.1151,  -2.9353, -67.7240]])
dL / db2: tensor([-18.8137])


Чтобы получить только новые градиенты, старые нужно обнулить:

In [None]:
# Reset the accumulated gradients in place before the next backward pass.
for param in (W1, W2, b1, b2):
    param.grad.zero_()

Z1 = torch.matmul(W1, X.transpose(1, 0)) + b1[:, None]
Z2 = torch.maximum(Z1, torch.zeros_like(Z1))  # ReLU
Z3 = torch.matmul(W2, Z2) + b2[:, None]
Z3 = Z3.transpose(1, 0)  # shape (32, 1)

# Y is (32,); give it a trailing axis so the difference stays (32, 1) instead
# of broadcasting to a (32, 32) matrix of pairwise differences.
L = ((Z3 - Y[:, None]) ** 2).mean()

L.backward()

print("dL / dW1:", W1.grad)
print("dL / db1:", b1.grad)
print("dL / dW2:", W2.grad)
print("dL / db2:", b2.grad)

dL / dW1: tensor([[ 1.0110,  2.6501,  1.8997,  0.4245, -1.8000,  5.9386, -2.9659, -3.9327,
         -4.3395,  6.8481],
        [ 0.7114,  2.0024, -3.9333, -3.8857,  2.6567,  6.0890, -8.2333, -7.7922,
         -2.7761,  7.3916],
        [-0.0273,  0.2752, -0.2023,  0.6661, -0.0140,  1.5380, -0.2271, -0.7837,
         -0.7617,  1.3543],
        [ 0.3616,  0.8439, -0.8045,  0.3191,  0.2109,  0.2400, -0.1422,  0.2637,
          1.1213, -1.3847],
        [-0.4788,  0.9542,  0.4901,  0.0629,  0.1922,  1.7220, -1.3300, -1.6904,
         -1.6242,  1.8175]])
dL / db1: tensor([ 8.3458,  9.9339,  1.2457, -0.9960,  2.8252])
dL / dW2: tensor([[-30.1523, -17.7304,  -7.0576,  -1.4676, -33.8620]])
dL / db2: tensor([-9.4068])


### Операция .detach()

Операция .detach() останавливает распространение градиентов через вершину графа вычислений:

In [None]:
# Reset the accumulated gradients in place before the next backward pass.
for param in (W1, W2, b1, b2):
    param.grad.zero_()

# .detach() cuts the W1 @ X^T product out of the autograd graph, so no gradient
# reaches W1; b1 is added after the detach and still receives one.
Z1 = torch.matmul(W1, X.transpose(1, 0)).detach() + b1[:, None]
Z2 = torch.maximum(Z1, torch.zeros_like(Z1))  # ReLU
Z3 = torch.matmul(W2, Z2) + b2[:, None]
Z3 = Z3.transpose(1, 0)  # shape (32, 1)

# Y is (32,); give it a trailing axis so the difference stays (32, 1) instead
# of broadcasting to a (32, 32) matrix of pairwise differences.
L = ((Z3 - Y[:, None]) ** 2).mean()

L.backward()

print("dL / dW1", W1.grad)
print("dL / db1", b1.grad)
print("dL / dW2", W2.grad)
print("dL / db2", b2.grad)

dL / dW1 tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
dL / db1 tensor([-2.0356e-01,  6.1586e+00,  2.3229e-01, -1.3011e-03, -2.8044e+00])
dL / dW2 tensor([[ -3.8637, -19.8146,  -2.1609,  -0.4106,  -0.3120]])
dL / db2 tensor([-3.6560])


### Хранение тензоров на CPU и GPU

In [None]:
# Fresh tensor for the device examples; this rebinds W1 from the sections above.
# torch is already imported in the first cell, so it is not imported again here.
W1 = torch.randn(5, 10)

In [None]:
# Move the tensor to the GPU when one is available; otherwise keep it on the
# CPU so the notebook still runs end-to-end on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
W1 = W1.to(device)

In [None]:
W1.device  # shows which device currently holds the tensor

device(type='cuda', index=0)

In [None]:
# Request a CPU version of the tensor; the result is displayed, W1 is not rebound.
W1.to(torch.device("cpu"))

tensor([[-0.0611,  0.9957, -1.6562,  0.6484,  0.0092,  0.1642, -2.2585,  1.4083,
          0.1257, -0.4505],
        [ 0.9882,  0.3418,  0.7005,  0.9262, -0.0867, -0.1641,  0.1878,  1.0679,
         -0.2224,  0.4452],
        [ 1.3740,  0.6503, -0.1129,  0.0949,  0.3754, -1.2442, -0.8434, -1.0158,
          1.5763, -0.5616],
        [-1.8780,  1.6868,  1.3144, -1.2419,  2.0040, -0.1158, -2.0601, -0.1759,
          2.1605, -1.0221],
        [-0.5774,  0.8734, -0.9211, -0.0608,  0.0791, -0.7660,  1.8033, -1.8280,
          0.6176, -0.2801]])

In [None]:
# Method spellings of the device moves shown above. Neither result is assigned,
# so W1 is not rebound; only the last expression is displayed.
if torch.cuda.is_available():  # .cuda() fails on machines without CUDA
    W1.cuda()
W1.cpu()

tensor([[-0.0611,  0.9957, -1.6562,  0.6484,  0.0092,  0.1642, -2.2585,  1.4083,
          0.1257, -0.4505],
        [ 0.9882,  0.3418,  0.7005,  0.9262, -0.0867, -0.1641,  0.1878,  1.0679,
         -0.2224,  0.4452],
        [ 1.3740,  0.6503, -0.1129,  0.0949,  0.3754, -1.2442, -0.8434, -1.0158,
          1.5763, -0.5616],
        [-1.8780,  1.6868,  1.3144, -1.2419,  2.0040, -0.1158, -2.0601, -0.1759,
          2.1605, -1.0221],
        [-0.5774,  0.8734, -0.9211, -0.0608,  0.0791, -0.7660,  1.8033, -1.8280,
          0.6176, -0.2801]])