In [None]:
# Task 1
# Find the roots of the quadratic equation by gradient descent
# x ** 2 - 6 * x + 4 = 0


In [None]:
Найдем корни аналитически:

In [2]:
# Closed-form check: roots of x**2 - 6*x + 4 = 0 from the quadratic formula,
# x = (6 ± sqrt(36 - 16)) / 2 = (6 ± sqrt(20)) / 2.
sqrt_disc = 20**0.5
x1, x2 = (6 + sqrt_disc) / 2, (6 - sqrt_disc) / 2
print(x1, x2)

5.23606797749979 0.7639320225002102


In [None]:
# возвести в квадрат
# посчитать производную
# надо начать движение от начальной точки в направлении антиградиента с заданным шагом
# x = x - lr * grad(x)

In [11]:
%%time
# Gradient descent on g(x) = (x**2 - 6*x + 4)**2. g is non-negative and is
# zero exactly at the roots of the quadratic, so its minima are the roots.
lr = 0.00001
max_steps = 1_000_000  # hard cap so a non-converging start cannot hang the cell
root_tol = 1e-6        # |x**2 - 6*x + 4| below this counts as a root


def quadratic(x):
    """Left-hand side of the equation x**2 - 6*x + 4 = 0."""
    return x**2 - 6 * x + 4


def squared_grad(x):
    """Derivative of quadratic(x)**2, i.e. 2 * (x**2 - 6*x + 4) * (2*x - 6)."""
    return 4 * x**3 - 36 * x**2 + 88 * x - 48


xs = []
for start in range(-25, 25, 10):
    x = float(start)
    for _ in range(max_steps):
        grad = squared_grad(x)
        x = x - lr * grad
        # Same stopping rule as before: gradient is zero to 10 decimal places.
        if round(grad, 10) == 0:
            break
    # A vanishing gradient also occurs at x = 3 (a stationary point of g that is
    # not a root), and the step cap may be hit first, so verify before recording.
    if abs(quadratic(x)) < root_tol:
        xs.append(round(x, 3))
xss = set(xs)
xss

Wall time: 358 ms


{0.764, 5.236}

In [None]:
# всегда ли сойдемся за приемлемое количество шагов?
Количество шагов влияет на скорость работы
# важна ли начальная точка?
От начальной точки зависит в какой минимум будет производиться спуск
# как найти второй корень?
Второй корень — второй элемент результата: к нему сходятся спуски из начальных точек правее x = 3
# как влияет ЛР?
ЛР влияет на скорость спуска

In [None]:
# Task 2
# Implement the forward and backward pass for a linear layer with sigmoid activation

In [12]:
import torch

In [13]:
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)) of a tensor."""
    denominator = torch.exp(-x) + 1
    return 1. / denominator

def sigmoid_backward(da, x):
    """Chain-rule step through the sigmoid.

    Multiplies the incoming gradient ``da`` by the sigmoid derivative
    s * (1 - s), where s = sigmoid(x) is evaluated at the pre-activation ``x``.
    """
    s = sigmoid(x)
    complement = 1 - s
    return da * s * complement

In [14]:
def mse_loss(t, y):
    """Element-wise squared error between target ``t`` and prediction ``y``."""
    residual = t - y
    return residual ** 2

def d_mse_loss(t, y):
    """Derivative of the squared error (t - y)**2 with respect to the prediction ``y``."""
    error = y - t
    return 2 * error

In [15]:
class LinearLayer:
    """Fully connected layer ``activ(w @ x + b)`` with a hand-written backward pass.

    Inputs are laid out one sample per column: ``x`` has shape (n_inp, m).

    Attributes:
        w: weight tensor of shape (n_out, n_inp).
        b: bias tensor of shape (n_out, 1).
        activ: the ``sigmoid`` function, or None for a purely linear layer.
        inp, lin: input and pre-activation cached by ``forward``.
        d_w, d_b: parameter gradients stored by ``backward``.
    """

    def __init__(self, n_inp, n_out, activation='sigmoid'):
        """Create a layer with small random weights and biases.

        Args:
            n_inp: number of input features.
            n_out: number of output features.
            activation: 'sigmoid', or 'None' / None for no activation.

        Raises:
            ValueError: if ``activation`` is not one of the supported values.
        """
        self.w = torch.randn(n_out, n_inp) * 0.1
        self.b = torch.randn(n_out, 1) * 0.1
        if activation == 'sigmoid':
            self.activ = sigmoid
        elif activation is None or activation == 'None':
            # The string 'None' is the historical spelling; plain None is accepted too.
            self.activ = None
        else:
            # ValueError subclasses Exception, so existing `except Exception` still works.
            raise ValueError(f'Unknown activation "{activation}"')
        self._clear_state()

    def _clear_state(self):
        """Drop the cached forward values and the stored gradients."""
        self.lin = None
        self.inp = None
        self.d_w = None
        self.d_b = None

    def forward(self, x):
        """Return ``activ(w @ x + b)`` and cache ``x`` and the pre-activation."""
        self.inp = x
        self.lin = self.w @ self.inp + self.b
        if self.activ is None:
            return self.lin
        return self.activ(self.lin)

    def backward(self, grad):
        """Back-propagate ``grad`` (gradient w.r.t. this layer's output).

        Stores ``d_w`` and ``d_b`` averaged over the m columns of the cached
        input, and returns the gradient w.r.t. the layer input.

        Raises:
            RuntimeError: if called before ``forward`` (nothing is cached).
        """
        if self.inp is None or self.lin is None:
            raise RuntimeError('backward() called before forward()')

        # The only supported activation is sigmoid, so "has an activation"
        # means "differentiate through the sigmoid".
        if self.activ is None:
            grad_lin = grad
        else:
            grad_lin = sigmoid_backward(grad, self.lin)

        m = self.inp.shape[1]  # number of samples (columns)
        self.d_w = grad_lin @ self.inp.T / m
        self.d_b = torch.sum(grad_lin, dim=1, keepdim=True) / m

        return self.w.T @ grad_lin

In [16]:
from typing import Tuple

class Model:
    """Feed-forward stack of LinearLayer objects applied in sequence.

    ``arch`` holds one (n_inp, n_out) pair per layer. Every layer except the
    last is built with ``activation``; the last layer is built with 'None'.
    """

    def __init__(self, arch: Tuple[Tuple[int, int]], activation):
        last_idx = len(arch) - 1
        self.layers = [
            LinearLayer(dims[0], dims[1],
                        activation=activation if idx < last_idx else 'None')
            for idx, dims in enumerate(arch)
        ]
        self._clear_state()

    def _clear_state(self):
        """Reset the cached state of every layer."""
        for layer in self.layers:
            layer._clear_state()

    def forward(self, x):
        """Feed ``x`` through the layers, first to last, and return the result."""
        out = x
        for layer in self.layers:
            out = layer.forward(out)
        return out

    def backward(self, grad):
        """Feed ``grad`` through the layers, last to first, and return the result."""
        upstream = grad
        for layer in self.layers[::-1]:
            upstream = layer.backward(upstream)
        return upstream

In [None]:
# Task 3
# Implement 1-2 optimizers

In [17]:
class RMSProp:
    """RMSProp optimizer for a model exposing ``layers`` with ``w``/``b``/``d_w``/``d_b``.

    Keeps a running average of squared gradients per parameter and divides
    each gradient by the square root of that average before applying ``lr``.

    Args:
        model: object with a ``layers`` list and a ``_clear_state()`` method.
        lr: base learning rate.
        rho: decay factor of the squared-gradient average.
        eps: small constant added to the denominator. Without it a gradient
            entry that is exactly zero gives lr / sqrt(0) = inf and then
            inf * 0 = NaN, which permanently corrupts the parameter.
    """

    def __init__(self, model: 'Model', lr=0.0001, rho=0.99, eps=1e-8):
        self.model = model
        self.lr = lr
        self.rho = rho
        self.eps = eps
        # One [weights, biases] pair of accumulators per layer.
        self.acc = [[torch.zeros_like(layer.w),
                     torch.zeros_like(layer.b)] for layer in model.layers]

    def step(self):
        """Update every layer in place from its stored ``d_w`` / ``d_b``."""
        for acc, layer in zip(self.acc, self.model.layers):
            acc[0] = self.rho * acc[0] + (1 - self.rho) * layer.d_w ** 2
            acc[1] = self.rho * acc[1] + (1 - self.rho) * layer.d_b ** 2
            layer.w -= self.lr * layer.d_w / (torch.sqrt(acc[0]) + self.eps)
            layer.b -= self.lr * layer.d_b / (torch.sqrt(acc[1]) + self.eps)

    def zero_grad(self):
        """Clear the model's state by delegating to ``model._clear_state()``."""
        self.model._clear_state()

In [18]:
class Adam:
    """Adam-style optimizer for a model exposing ``layers`` with ``w``/``b``/``d_w``/``d_b``.

    Keeps exponential moving averages of the gradient (``vel``) and of the
    squared gradient (``acc``) and steps by ``lr * vel / (sqrt(acc) + eps)``.

    Args:
        model: object with a ``layers`` list and a ``_clear_state()`` method.
        lr: base learning rate.
        beta1: decay factor of the gradient average.
        beta2: decay factor of the squared-gradient average.
        eps: small constant added to the denominator. Without it a gradient
            entry that is exactly zero gives 0 * (lr / sqrt(0)) = 0 * inf = NaN.
        bias_correction: when True, divide the averages by (1 - beta**t) as in
            the original Adam paper. Off by default so that existing runs keep
            their step sizes.
    """

    def __init__(self, model: 'Model', lr=0.0001, beta1=0.99, beta2=0.99,
                 eps=1e-8, bias_correction=False):
        self.model = model
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.bias_correction = bias_correction
        self.t = 0  # number of step() calls so far, used for bias correction
        # One [weights, biases] pair of accumulators per layer.
        self.acc = [[torch.zeros_like(layer.w),
                     torch.zeros_like(layer.b)] for layer in model.layers]
        self.vel = [[torch.zeros_like(layer.w),
                     torch.zeros_like(layer.b)] for layer in model.layers]

    def step(self):
        """Update every layer in place from its stored ``d_w`` / ``d_b``."""
        self.t += 1
        if self.bias_correction:
            corr1 = 1 - self.beta1 ** self.t
            corr2 = 1 - self.beta2 ** self.t
        else:
            corr1 = corr2 = 1.0

        for vel, acc, layer in zip(self.vel, self.acc, self.model.layers):
            # Slot 0 tracks the weights, slot 1 the biases.
            for slot, grad in enumerate((layer.d_w, layer.d_b)):
                vel[slot] = self.beta1 * vel[slot] + (1 - self.beta1) * grad
                acc[slot] = self.beta2 * acc[slot] + (1 - self.beta2) * grad ** 2
            layer.w -= self._delta(vel[0], acc[0], corr1, corr2)
            layer.b -= self._delta(vel[1], acc[1], corr1, corr2)

    def _delta(self, vel, acc, corr1, corr2):
        """Parameter decrement for one tensor given its two moving averages."""
        return self.lr * (vel / corr1) / (torch.sqrt(acc / corr2) + self.eps)

    def zero_grad(self):
        """Clear the model's state by delegating to ``model._clear_state()``."""
        self.model._clear_state()

In [19]:
# Synthetic regression data: 2000 inputs drawn uniformly from [-2, 2) and
# targets y = x**2 plus Gaussian noise with standard deviation 0.1.
torch.manual_seed(0)  # fixed seed so the generated data are reproducible on re-run
x = (torch.rand(2000) - 0.5) * 4
# Draw one noise value per sample; torch.randn(1) would add the same scalar to
# every target, i.e. a constant offset rather than noise.
y = x**2 + torch.randn_like(x) * 0.1

In [20]:
%%time
# Fit the (x, y) samples with a 1-100-1 sigmoid network, taking one RMSProp
# step per sample, for 20 passes over the data.
modelrms = Model(((1, 100), (100, 1)), activation='sigmoid')
optim_rms = RMSProp(modelrms, lr=0.001)
probe_inputs = (1., 2., -1., -2.)
for epoch in range(20):
    for sample, target in zip(x, y):
        optim_rms.zero_grad()
        pred = modelrms.forward(torch.tensor([[sample]]))
        loss = mse_loss(target, pred)  # computed for inspection; the update only needs its gradient
        grad = d_mse_loss(target, pred)
        modelrms.backward(grad)
        optim_rms.step()

    # After each pass, print the epoch number and the predictions at the probe inputs.
    print(epoch, *(modelrms.forward(torch.tensor([[p]])) for p in probe_inputs))

0 tensor([[0.9825]]) tensor([[0.9705]]) tensor([[1.0077]]) tensor([[1.0207]])
1 tensor([[0.9828]]) tensor([[0.9704]]) tensor([[1.0101]]) tensor([[1.0249]])
2 tensor([[0.9845]]) tensor([[0.9733]]) tensor([[1.0111]]) tensor([[1.0265]])
3 tensor([[0.9862]]) tensor([[0.9773]]) tensor([[1.0123]]) tensor([[1.0296]])
4 tensor([[0.9865]]) tensor([[0.9836]]) tensor([[1.0160]]) tensor([[1.0410]])
5 tensor([[0.9812]]) tensor([[1.0049]]) tensor([[1.0296]]) tensor([[1.0900]])
6 tensor([[0.9752]]) tensor([[1.1181]]) tensor([[1.0563]]) tensor([[1.2455]])
7 tensor([[1.0162]]) tensor([[1.5063]]) tensor([[1.0804]]) tensor([[1.6386]])
8 tensor([[1.1868]]) tensor([[2.3510]]) tensor([[1.1324]]) tensor([[2.4616]])
9 tensor([[1.3407]]) tensor([[3.2219]]) tensor([[1.2060]]) tensor([[3.3582]])
10 tensor([[1.2829]]) tensor([[3.4318]]) tensor([[1.1403]]) tensor([[3.5601]])
11 tensor([[1.2397]]) tensor([[3.5305]]) tensor([[1.1023]]) tensor([[3.6464]])
12 tensor([[1.2063]]) tensor([[3.6045]]) tensor([[1.0819]]) te

In [22]:
%%time
# Fit the (x, y) samples with a 1-100-1 sigmoid network, taking one Adam
# step per sample, for 20 passes over the data.
modeladam = Model(((1, 100), (100, 1)), activation='sigmoid')
optim_adam = Adam(modeladam, lr=0.001)
probe_inputs = (1., 2., -1., -2.)
for epoch in range(20):
    for sample, target in zip(x, y):
        optim_adam.zero_grad()
        pred = modeladam.forward(torch.tensor([[sample]]))
        loss = mse_loss(target, pred)  # computed for inspection; the update only needs its gradient
        grad = d_mse_loss(target, pred)
        modeladam.backward(grad)
        optim_adam.step()

    # After each pass, print the epoch number and the predictions at the probe inputs.
    print(epoch, *(modeladam.forward(torch.tensor([[p]])) for p in probe_inputs))


0 tensor([[1.0457]]) tensor([[1.0289]]) tensor([[1.0792]]) tensor([[1.0964]])
1 tensor([[1.0436]]) tensor([[1.0267]]) tensor([[1.0784]]) tensor([[1.0968]])
2 tensor([[1.0439]]) tensor([[1.0282]]) tensor([[1.0778]]) tensor([[1.0966]])
3 tensor([[1.0444]]) tensor([[1.0308]]) tensor([[1.0790]]) tensor([[1.1006]])
4 tensor([[1.0416]]) tensor([[1.0354]]) tensor([[1.0845]]) tensor([[1.1195]])
5 tensor([[1.0277]]) tensor([[1.0628]]) tensor([[1.0981]]) tensor([[1.1840]])
6 tensor([[1.0115]]) tensor([[1.2039]]) tensor([[1.1123]]) tensor([[1.3522]])
7 tensor([[1.0795]]) tensor([[1.6437]]) tensor([[1.1363]]) tensor([[1.7378]])
8 tensor([[1.3138]]) tensor([[2.4711]]) tensor([[1.2483]]) tensor([[2.5126]])
9 tensor([[1.1486]]) tensor([[2.9357]]) tensor([[0.9804]]) tensor([[3.0206]])
10 tensor([[1.1887]]) tensor([[3.2561]]) tensor([[1.0061]]) tensor([[3.4004]])
11 tensor([[1.1845]]) tensor([[3.3979]]) tensor([[1.0158]]) tensor([[3.5636]])
12 tensor([[1.1700]]) tensor([[3.5003]]) tensor([[1.0247]]) te