In [21]:
import numpy as np

## Formula

$$ v_t = {\beta}_{1} v_{t-1} + (1 - {\beta}_{1})g_t$$
$$ s_t = {\beta}_{2} s_{t-1} + (1 - {\beta}_{2})g_t^2$$
$$\hat{v}_{t} = \frac{v_t}{1-{\beta}_{1}^t} \quad \text{and} \quad \hat{s}_{t} = \frac{s_t}{1-{\beta}_{2}^t}$$
$$ {g}_t' = \frac {\eta \hat{v}_t}{\sqrt{\hat{s}_t}+\epsilon}$$
$$  x_t\leftarrow x_{t-1} - {g}_t'$$

## Code

In [22]:
from lib.NeuralNet import *

In [23]:
def OptimizerAdam(W1,b1,W2,b2,W3,b3,dW1,dB1,dW2,dB2,dW3,dB3,vW1,vb1,vW2,vb2,vW3,vb3,sW1,sb1,sW2,sb2,sW3,sb3,lr,beta1,beta2,t,eps=1e-6):
    """Apply one Adam update to six parameter arrays.

    For every (parameter, gradient, first moment, second moment) group:

        v <- beta1 * v + (1 - beta1) * g
        s <- beta2 * s + (1 - beta2) * g * g
        v_hat = v / (1 - beta1 ** t)
        s_hat = s / (1 - beta2 ** t)
        x <- x - lr * v_hat / (sqrt(s_hat) + eps)

    Parameters
    ----------
    W1, b1, W2, b2, W3, b3 : current parameter values.
    dW1, dB1, dW2, dB2, dW3, dB3 : gradients, in the same order.
    vW1 ... vb3 : first-moment estimates from the previous step.
    sW1 ... sb3 : second-moment estimates from the previous step.
    lr : step size.
    beta1, beta2 : decay rates of the first and second moment estimates.
    t : 1-based step counter used for bias correction.
    eps : constant added to the denominator; the default 1e-6 is the value
        that used to be hard-coded here.

    Returns
    -------
    tuple of 18 values: the six updated parameters, then the six updated
    first moments, then the six updated second moments.

    Raises
    ------
    ValueError
        If ``t < 1``. The bias-correction denominators ``1 - beta ** t``
        are zero at ``t == 0``, which would otherwise turn the parameters
        into inf/NaN without any error.

    The inputs are never modified in place; every returned value is a
    newly computed object.
    """
    if t < 1:
        raise ValueError("t must be >= 1 for Adam bias correction, got %r" % (t,))

    params = (W1, b1, W2, b2, W3, b3)
    gradient = (dW1, dB1, dW2, dB2, dW3, dB3)
    momentum = (vW1, vb1, vW2, vb2, vW3, vb3)
    second_momen = (sW1, sb1, sW2, sb2, sW3, sb3)

    # The bias-correction factors depend only on t, so compute them once.
    v_correction = 1 - beta1**t
    s_correction = 1 - beta2**t

    new_params, new_momentum, new_second = [], [], []
    for x, g, v, s in zip(params, gradient, momentum, second_momen):
        v = beta1 * v + (1 - beta1) * g
        s = beta2 * s + (1 - beta2) * np.multiply(g, g)
        v_hat = v / v_correction
        s_hat = s / s_correction
        # Same expression shape as before (multiply by the reciprocal) so that
        # results with the default eps are numerically unchanged.
        g_t = np.multiply(lr * v_hat, 1 / (np.sqrt(s_hat) + eps))
        new_params.append(x - g_t)
        new_momentum.append(v)
        new_second.append(s)

    return (*new_params, *new_momentum, *new_second)
        
def UpdateParamsAdam(W1,b1,W2,b2,W3,b3,S,Y,batch_size=16,lr=1e-4,beta1=0.9,beta2=0.999):
    """Run one shuffled pass of mini-batch Adam updates over (S, Y).

    The rows of S and Y are shuffled with a shared random permutation and
    then consumed in consecutive slices of at most ``batch_size`` rows. For
    each slice, Forward and Backward (imported from lib.NeuralNet) provide
    the gradients and OptimizerAdam applies the update.

    The moment estimates and the step counter are created inside this
    function, so they start from zero / one on every call and are not
    carried over between calls.

    Parameters
    ----------
    W1, b1, W2, b2, W3, b3 : starting parameter values.
    S, Y : inputs and targets, indexed along axis 0.
    batch_size : maximum number of rows per update.
    lr : step size (also forwarded to Backward).
    beta1, beta2 : decay rates handed to OptimizerAdam.

    Returns
    -------
    tuple ``(W1, b1, W2, b2, W3, b3)`` after the last mini-batch.
    """
    total = S.shape[0]

    # A single permutation applied to both arrays keeps rows paired.
    order = np.arange(total)
    np.random.shuffle(order)
    S, Y = S[order], Y[order]

    weights = (W1, b1, W2, b2, W3, b3)
    first_moments = tuple(np.zeros_like(w) for w in weights)
    second_moments = tuple(np.zeros_like(w) for w in weights)

    batch_starts = np.arange(0, total, batch_size)
    for step, start in enumerate(batch_starts, start=1):
        # The last slice may be shorter than batch_size.
        stop = min(start + batch_size, total)
        batch_s = S[start:stop]
        batch_y = Y[start:stop]

        O1, O2, O3 = Forward(*weights, batch_s)
        dW1, dB1, dW2, dB2, dW3, dB3 = Backward(*weights, batch_s, batch_y, O1, O2, O3, lr)

        updated = OptimizerAdam(
            *weights,
            dW1, dB1, dW2, dB2, dW3, dB3,
            *first_moments,
            *second_moments,
            lr, beta1, beta2, step,
        )
        # OptimizerAdam returns parameters, first moments, second moments
        # as three consecutive groups of six.
        weights = tuple(updated[0:6])
        first_moments = tuple(updated[6:12])
        second_moments = tuple(updated[12:18])

    return weights

In [25]:
# Synthetic data: a 5000 x 100 input matrix and a 5000 x 1 target column, both
# drawn from np.random.rand, so the targets are independent of the inputs.
# No seed is set in this cell, so the printed losses differ between runs.
X = np.random.rand(5000, 100)
y = np.random.rand(5000, 1)

# FitModel comes from lib.NeuralNet; UpdateParamsAdam is passed in as the
# update rule it should use.
W1, b1, W2, b2, W3, b3 = FitModel(
    X,
    y,
    n_iter=1000,
    batch_size=500,
    lr=1e-3,
    UpdateParams=UpdateParamsAdam,
    print_stat=True,
)

Epoch  0 Loss:  0.08421819504384775
Epoch  50 Loss:  0.04793530265025331
Epoch  100 Loss:  0.026712419547392254
Epoch  150 Loss:  0.011866150218894514
Epoch  200 Loss:  0.00778059584401054
Epoch  250 Loss:  0.0055093510277443815
Epoch  300 Loss:  0.002589819141137765
Epoch  350 Loss:  0.0027693548712071406
Epoch  400 Loss:  0.0029833941529570037
Epoch  450 Loss:  0.003558220328581537
Epoch  500 Loss:  0.0015646660026048624
Epoch  550 Loss:  0.0016096674255810987
Epoch  600 Loss:  0.0016431763273473652
Epoch  650 Loss:  0.001616825541650852
Epoch  700 Loss:  0.001393229995084289
Epoch  750 Loss:  0.0014480834898836156
Epoch  800 Loss:  0.0012770644750152123
Epoch  850 Loss:  0.0007979579942583756
Epoch  900 Loss:  0.0016239848719123038
Epoch  950 Loss:  0.0010402948035949742
