# Модификации градиентного спуска, использующиеся в нейронных сетях.

## Стохастический градиентный спуск SGD

In [1]:
import numpy as np
import torch
import matplotlib.pyplot as plt
from matplotlib import cm
import time
from keras.utils import to_categorical
from keras import models
from keras import layers

In [2]:
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``.

    The exponential is evaluated with NumPy, so array input is processed
    element-wise and the result has the same shape as ``x``.
    """
    denominator = 1 + np.exp(-x)
    return 1/denominator

def sigmoid_diff(y):
    """Return ``y * (1 - y)``.

    When ``y`` is already a sigmoid output, this product is the derivative
    of the sigmoid expressed through that output value.
    """
    complement = 1 - y
    return y*complement


In [None]:
class NeuralNetwork:
    """One-hidden-layer regression network trained with mini-batch SGD.

    The hidden layer has ``diam_s`` sigmoid units and the output is a single
    linear unit. Training minimises the squared error between ``y`` and the
    network output.

    Typical use is to alternate ``feedforward()`` and ``backprop()``: each
    ``feedforward()`` call processes the next mini-batch of a shuffled pass
    over the data, and ``backprop()`` updates the parameters from that same
    mini-batch. With the default batch size (the whole dataset) this is plain
    full-batch gradient descent.

    Args:
        x: Input matrix of shape (n, m); columns are samples.
        y: Target matrix of shape (1, m).
        batch: Mini-batch size. ``None`` (default) uses all ``m`` samples.
            The ``batch`` attribute may also be changed after construction.
    """

    def __init__(self, x, y, batch=None):  # m is the dataset size
        self.input      = x  # n x m
        self.y          = y  # 1 x m
        self.diam_s     = 3  # k, number of hidden units
        self.weights1   = np.random.rand(self.diam_s, self.input.shape[0])  # k x n
        self.weights2   = np.random.rand(1, self.diam_s)  # 1 x k
        self.output     = np.zeros(self.y.shape)  # 1 x m, aligned with y
        self.alpha      = 1  # learning rate
        self.bias       = np.random.rand(self.diam_s, 1)  # hidden bias, k x 1
        self.b          = np.random.rand()  # output bias (scalar)

        m = self.input.shape[1]
        self.batch      = m if batch is None else batch
        # Visiting order of the samples for the current pass over the data.
        self._order     = np.random.permutation(m)
        self._cursor    = 0     # position of the next mini-batch in _order
        self._batch_idx = None  # column indices used by the last feedforward

    def feedforward(self):
        """Run the forward pass on the next mini-batch.

        Stores the hidden activations of the mini-batch in ``self.layer1``
        and writes the predictions into the matching columns of
        ``self.output``, so ``self.output`` stays aligned with ``self.y``.
        A new random visiting order is drawn once every sample has been used.
        """
        m = self.input.shape[1]
        # Clamp so that a bad batch size can neither stall nor overrun.
        size = max(1, min(int(self.batch), m))
        if self._cursor >= m:
            # A fresh array is allocated, so previously sliced index views
            # are never mutated behind the caller's back.
            self._order = np.random.permutation(m)
            self._cursor = 0
        idx = self._order[self._cursor:self._cursor + size]
        self._cursor += size
        self._batch_idx = idx

        self.layer1 = sigmoid(np.dot(self.weights1, self.input[:, idx]) + self.bias)  # k x s
        self.output[:, idx] = np.dot(self.weights2, self.layer1) + self.b  # 1 x s

    def backprop(self):
        """Update all parameters from the mini-batch of the last forward pass.

        Raises:
            RuntimeError: If ``feedforward()`` has not been called yet.
        """
        if self._batch_idx is None:
            raise RuntimeError("feedforward() must be called before backprop()")
        idx = self._batch_idx
        x_batch = self.input[:, idx]                   # n x s
        err = self.y[:, idx] - self.output[:, idx]     # 1 x s
        slope = sigmoid_diff(self.layer1)              # k x s

        # The d_* terms are the NEGATIVE gradient of the summed squared
        # error, which is why the parameters are updated with "+=" below.
        d_weights2 = 2*np.dot(err, self.layer1.T)  # 1 x k
        d_weights1 = 2*self.weights2.T*np.dot(err*slope, x_batch.T)  # k x n
        d_bias     = 2*self.weights2.T*np.dot(slope, err.T)  # k x 1
        d_b        = 2*err.sum()

        # Average over the samples that actually contributed to the step.
        step = self.alpha / idx.size
        self.weights1 += step * d_weights1
        self.weights2 += step * d_weights2
        self.bias     += step * d_bias
        self.b        += step * d_b

    def test(self, t):
        """Return the network prediction for inputs ``t`` (columns are samples)."""
        hidden = sigmoid(np.dot(self.weights1, t) + self.bias)
        return np.dot(self.weights2, hidden) + self.b

## Метод Нестерова

### Нестеров

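Классический метод импульса:

$$V_i = \gamma V_{i-1} + \eta\dfrac{\partial L}{\partial W} $$

Метод Нестерова (градиент вычисляется в «заглядывающей вперёд» точке $W - \gamma V_{i-1}$):

$$V_i = \gamma V_{i-1} + \eta\dfrac{\partial L}{\partial (W - \gamma V_{i-1})} $$

$$W = W - V_i $$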

## Метод адаптивного градиента AdaGrad

### AdaGrad

$$G_i = G_{i-1} + \left(\dfrac{\partial L}{\partial W} \right)^2 $$

$$W = W - \dfrac{\eta}{\sqrt{G_i + \epsilon}}\dfrac{\partial L}{\partial W} $$

In [4]:
class NeuralNetwork:
    """One-hidden-layer regression network trained with AdaGrad.

    The hidden layer has ``diam_s`` sigmoid units and the output is a single
    linear unit. Every parameter keeps a running sum of its squared gradients
    (``G_d_*``); each update is divided by the square root of that sum, so
    parameters with a history of large gradients take smaller steps.

    Typical use is to alternate ``feedforward()`` and ``backprop()``.

    Args:
        x: Input matrix of shape (n, m); columns are samples.
        y: Target matrix of shape (1, m).
    """

    def __init__(self, x, y):  # m is the dataset size
        self.input      = x  # n x m
        self.y          = y  # 1 x m
        self.diam_s     = 3  # k, number of hidden units
        self.weights1   = np.random.rand(self.diam_s, self.input.shape[0])  # k x n
        self.weights2   = np.random.rand(1, self.diam_s)  # 1 x k
        self.output     = np.zeros(self.y.shape)  # 1 x m
        self.alpha      = 1  # learning rate
        self.bias       = np.random.rand(self.diam_s, 1)  # hidden bias, k x 1
        self.b          = np.random.rand()  # output bias (scalar)
        # Accumulated squared gradients, one accumulator per parameter.
        # They must live on the instance to persist between backprop calls.
        self.G_d_weights2 = np.zeros(self.weights2.shape)
        self.G_d_weights1 = np.zeros(self.weights1.shape)
        self.G_d_bias     = np.zeros(self.bias.shape)
        self.G_d_b        = 0
        self.eps        = 0.000000001  # keeps the square root away from zero

    def feedforward(self):
        """Run the forward pass on the whole dataset.

        Stores the hidden activations in ``self.layer1`` and the predictions
        in ``self.output``.
        """
        self.layer1 = sigmoid(np.dot(self.weights1, self.input) + self.bias)  # k x m
        self.output = np.dot(self.weights2, self.layer1) + self.b  # 1 x m

    def backprop(self):
        """Apply one AdaGrad update using the result of the last forward pass."""
        err = self.y - self.output           # 1 x m
        slope = sigmoid_diff(self.layer1)    # k x m

        # The d_* terms are the NEGATIVE gradient of the summed squared
        # error, which is why the parameters are updated with "+=" below.
        d_weights2 = 2*np.dot(err, self.layer1.T)  # 1 x k
        d_weights1 = 2*self.weights2.T*np.dot(err*slope, self.input.T)  # k x n
        d_bias     = 2*self.weights2.T*np.dot(slope, err.T)  # k x 1
        d_b        = 2*err.sum()

        self.G_d_weights2 += d_weights2**2
        self.G_d_weights1 += d_weights1**2
        self.G_d_bias     += d_bias**2
        self.G_d_b        += d_b**2

        # Each parameter is scaled by ITS OWN accumulator; the step is also
        # divided by the dataset size m.
        m = self.input.shape[1]
        self.weights1 += self.alpha * d_weights1/m/np.sqrt(self.G_d_weights1 + self.eps)
        self.weights2 += self.alpha * d_weights2/m/np.sqrt(self.G_d_weights2 + self.eps)
        self.bias     += self.alpha * d_bias/m/np.sqrt(self.G_d_bias + self.eps)
        self.b        += self.alpha * d_b/m/np.sqrt(self.G_d_b + self.eps)

    def test(self, t):
        """Return the network prediction for inputs ``t`` (columns are samples)."""
        hidden = sigmoid(np.dot(self.weights1, t) + self.bias)
        return np.dot(self.weights2, hidden) + self.b

## Adam

### Adam

$$M_i = \beta_1 M_{i-1} +(1-\beta_1)\dfrac{\partial L}{\partial W} $$

$$V_i = \beta_2 V_{i-1} +(1-\beta_2)\left(\dfrac{\partial L}{\partial W} \right)^2 $$

$$\hat{M}_i = \dfrac{M_i}{1-\beta_1^i} $$

$$\hat{V}_i = \dfrac{V_i}{1-\beta_2^i} $$

$$W = W - \dfrac{\eta}{\sqrt{\hat{V}_i + \epsilon}}\hat{M}_i $$
