# Параметры модели

In [None]:
#1
import torch
import torchvision
import torchvision.transforms as transforms

# Preprocessing pipeline: convert the image to a tensor, then apply
# x -> (x - mean) / std per channel, which maps [0, 1] onto [-1, 1].
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

batch_size = 16

# Training split: downloaded on first use, reshuffled on every pass.
trainset = torchvision.datasets.CIFAR10(
    root='./cifar10data',
    train=True,
    download=True,
    transform=transform,
)
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)

# Test split: same preprocessing, fixed order.
testset = torchvision.datasets.CIFAR10(
    root='./cifar10data',
    train=False,
    download=True,
    transform=transform,
)
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
)

# Human-readable class names, indexed by label.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

In [None]:
#2
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Net(nn.Module):
    """Small convolutional classifier: two conv+pool stages and three linear layers.

    Both convolutions use 5x5 kernels without padding and each is followed by
    a 2x2 max-pool, so ``fc1`` (which takes 16 * 5 * 5 features) fits 3-channel
    32x32 inputs. The network returns 10 raw scores per sample.

    Args:
        dbg: when true, every forward pass prints the input and output sizes.
    """

    def __init__(self, dbg=False):
        super().__init__()
        # Feature extractor (the single pooling layer is reused after each conv).
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        # Classifier head.
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)
        self.dbg = dbg

    def forward(self, x):
        """Return the class scores for the batch ``x``."""
        in_size = x.size() if self.dbg else None

        features = self.pool(F.relu(self.conv1(x)))
        features = self.pool(F.relu(self.conv2(features)))
        # Keep the batch dimension, merge all the others.
        flat = torch.flatten(features, 1)
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        scores = self.fc3(hidden)

        if self.dbg:
            print("\tIn Model: input size", in_size, "output size", scores.size())
        return scores

In [None]:
#3
# Pick the compute device. GPU #2 is preferred (the original choice), but a
# CUDA machine with fewer than three GPUs would reject 'cuda:2' as an invalid
# device ordinal, so fall back to GPU #0 there and to the CPU without CUDA.
preferred_gpu = 2
if torch.cuda.is_available():
    gpu_index = preferred_gpu if torch.cuda.device_count() > preferred_gpu else 0
    device = torch.device(f'cuda:{gpu_index}')
else:
    device = torch.device('cpu')
# On a CUDA machine this prints a CUDA device, otherwise "cpu".
print(device)

In [None]:
#4
# Build the network and move its parameters to the selected device
# (Module.to returns the module itself, so the call can be chained).
netGPU = Net().to(device)

# Classification loss and SGD with momentum over the network's parameters.
criterion = nn.CrossEntropyLoss()
optimizerGPU = optim.SGD(params=netGPU.parameters(), lr=0.001, momentum=0.9)

In [None]:
#5
# Dump every parameter tensor of the network: its name, its size and its values.
for name, param in netGPU.named_parameters():
    print(name, param.size(), param)

Упражнение: найдите общее количество параметров сети netGPU

In [None]:
#6 do not run the training during the presentation
import time

# Number of mini-batches between two loss reports. The same value is used for
# the reporting condition and for the averaging, so the printed number is the
# true mean loss over the reported window.
LOG_EVERY = 1000

start = time.time()

for epoch in range(2):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data[0].to(device), data[1].to(device)

        # zero the parameter gradients
        optimizerGPU.zero_grad()

        # forward + backward + optimize
        outputs = netGPU(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizerGPU.step()

        # print statistics
        running_loss += loss.item()
        if i % LOG_EVERY == LOG_EVERY - 1:    # print every LOG_EVERY mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / LOG_EVERY:.3f}')
            running_loss = 0.0

finish = time.time()
print('Finished Training in ', finish-start)

In [None]:
#7
# Dump every parameter tensor again (name, size, values) so the numbers can be
# compared with the dump made before the training cell.
for name, param in netGPU.named_parameters():
    print(name, param.size(), param)

# Автоматический дифференциатор в PyTorch

https://pytorch.org/tutorials/beginner/introyt/autogradyt_tutorial.html

In [None]:
#8
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import math
# Inputs of the toy computation: a constant vector x of ones and a vector w of
# 25 evenly spaced points on [0, 2*pi] that autograd is asked to track.
x = torch.ones(25)
w = torch.linspace(start=0., end=2. * math.pi, steps=25, requires_grad=True)
print(x)
print(w)

In [None]:
#9
# Elementwise product of x and w; x is all ones, so a carries the values of w
# but is the result of an operation rather than a leaf tensor.
a = x*w
print(a)
# Elementwise sine of a.
b = torch.sin(a)
print(b)
# a and b depend on w, which requires grad, so detached copies are passed to
# matplotlib (presumably why the plain plt.plot(a, b) call was abandoned).
plt.plot(a.detach(), b.detach())

In [None]:
#10
# Two more elementwise steps on top of b: scale by 2 ...
c = 2 * b
print(c)

# ... and shift by 1, giving d = 2*sin(w*x) + 1.
d = c + 1
print(d)

In [None]:
#11
# The same quantity as d, computed from a in a single expression.
dd = torch.sin(a) * 2 + 1
print(dd)

In [None]:
#12
# Reduce d to a single scalar: the sum of all its elements.
out = torch.sum(d)
print(out)

In [None]:
#13
# Walk the autograd graph backwards from d, always following the first input
# of the current node, and print the inputs recorded at each step.
print('d:')
node = d.grad_fn
print(node)
print(node.next_functions)
for hop in range(3):
    node = node.next_functions[0][0]
    print(node.next_functions)

# The grad_fn of each intermediate tensor, for comparison with the walk above.
for header, tensor in (('\nc:', c), ('\nb:', b), ('\na:', a)):
    print(header)
    print(tensor.grad_fn)

In [None]:
#14
print(b.grad_fn)
# b is an intermediate (non-leaf) result, so autograd does not keep a gradient
# for it by default: expect None here (PyTorch may also emit a warning).
print(b.grad)

In [None]:
#15
# Backpropagate from the scalar out; this fills the .grad of the leaf tensor w.
out.backward()

In [None]:
#16
print(w)
print(w.grad)
# Plot d out / d w_i against w_i; with x all ones the expected curve is 2*cos(w).
plt.plot(w.detach(), w.grad.detach())

In [None]:
#variables:    x, w              a            b            c            d              out
#values:       x, w->            w*x      ->  sin(w*x)  -> 2*sin(w*x)-> 2*sin(w*x)+1 ->sum_i(2*sin(w_i*x_i)+1)
#grad_fn   None, AccumulateGrad  MulBackward0 SinBackward0 MulBackward0 AddBackward0   SumBackward0

# d out/ d w_i = 2 cos(w_i*x_i) * x_i

#None = does not store a gradient (x was created without requires_grad)
#AccumulateGrad = leaf of the gradient computation graph (w); the gradient is accumulated into its .grad

Упражнение: вычислите при помощи torch производную функции ReLU (torch.nn.functional.relu)

# Что делать с не всюду дифференцируемыми функциями

In [None]:
#17
# Sizes used by the demo below; they are also the defaults of TinyModel.
BATCH_SIZE = 16
DIM_IN = 1000
HIDDEN_SIZE = 100
DIM_OUT = 10


class TinyModel(torch.nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        dim_in: number of input features (defaults to ``DIM_IN``).
        hidden_size: width of the hidden layer (defaults to ``HIDDEN_SIZE``).
        dim_out: number of output features (defaults to ``DIM_OUT``).

    The defaults reproduce the previously hard-coded 1000 -> 100 -> 10 layout,
    so ``TinyModel()`` builds the same architecture as before.
    """

    def __init__(self, dim_in=DIM_IN, hidden_size=HIDDEN_SIZE, dim_out=DIM_OUT):
        super().__init__()

        # Layer sizes come from the arguments instead of literals, so the
        # model stays in sync with the constants that size the demo tensors.
        self.layer1 = torch.nn.Linear(dim_in, hidden_size)
        self.relu = torch.nn.ReLU()
        self.layer2 = torch.nn.Linear(hidden_size, dim_out)

    def forward(self, x):
        """Apply layer1, the ReLU and layer2 to ``x`` and return the result."""
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x

# A random input batch and random targets; both are plain data, so autograd
# tracking is explicitly switched off for them.
some_input = torch.randn((BATCH_SIZE, DIM_IN), requires_grad=False)
ideal_output = torch.randn((BATCH_SIZE, DIM_OUT), requires_grad=False)

# The model is created after the data, with its default layer sizes.
model = TinyModel()

In [None]:
#18
print(model.layer2.weight[0][0:10]) # just a small slice
# No backward pass has been run on this model at this point, so no gradient is
# stored yet (expected to print None).
print(model.layer2.weight.grad)

In [None]:
#19
# Plain SGD over all parameters of the tiny model.
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

# Forward pass on the random batch.
prediction = model(some_input)

# Sum of squared differences between the targets and the prediction (a scalar).
loss = (ideal_output - prediction).pow(2).sum()
print(loss)

In [None]:
#20 - computing the derivatives: gradients are updated, weights are not
loss.backward()
print(model.layer2.weight[0][0:10])
print(model.layer2.weight.grad[0][0:10])

In [None]:
#21 - optimizer step: weights are updated
optimizer.step()
print(model.layer2.weight[0][0:10])
# step() only changes the weights; the stored gradient is left as it was.
print(model.layer2.weight.grad[0][0:10])

In [None]:
#21b - gradients accumulate across backward() calls until they are zeroed
print(model.layer2.weight.grad[0][0:10])

# Five forward/backward passes with no zero_grad() in between: every
# backward() adds its result to the gradient already stored in .grad.
for i in range(0, 5):
    prediction = model(some_input)
    loss = (ideal_output - prediction).pow(2).sum()
    loss.backward()

print(model.layer2.weight.grad[0][0:10])

# set_to_none=False keeps the .grad tensors and fills them with zeros, so the
# indexing in the print below still works.
optimizer.zero_grad(set_to_none=False)

print(model.layer2.weight.grad[0][0:10])

In [None]:
#22
# Walk the autograd graph backwards from loss, following the first input of
# each node. loss was built as (ideal_output - prediction).pow(2).sum(), so
# the walk goes sum -> pow -> subtraction.
sum_node = loss.grad_fn
print(sum_node)
print(sum_node.next_functions)

pow_node = sum_node.next_functions[0][0]
print(pow_node.next_functions)

# Inputs of the subtraction node, kept for the next cell.
sub_node = pow_node.next_functions[0][0]
layer2funcs = sub_node.next_functions
print(layer2funcs)

In [None]:
#23
# Entry 1 of layer2funcs is the second operand of the subtraction, i.e.
# `prediction` (the output of model.layer2); print what that node depends on.
# NOTE(review): the exact node names shown depend on the PyTorch version.
print(layer2funcs[1][0].next_functions)