In [1]:
import numpy as np
import torch

In [5]:
# Load the tab-separated airfoil self-noise table as float32.
# Source: http://d2l-data.s3-accelerate.amazonaws.com/airfoil_self_noise.dat
data = np.genfromtxt(
    './airfoil_self_noise.dat',
    delimiter='\t',
    dtype=np.float32,
)
data.shape

(1503, 6)

In [6]:
# Standardise every column to zero mean and unit (population) standard
# deviation, then wrap the result as a torch tensor.
col_mean = data.mean(axis=0)
col_std = data.std(axis=0)
data = torch.from_numpy((data - col_mean) / col_std)

In [12]:
batch_size = 10
# Every column except the last is a model input.
n_features = data.shape[1] - 1
# Use the first 1500 rows; the last column becomes the second tensor of each sample.
train_rows = data[:1500]
dataset = torch.utils.data.TensorDataset(train_rows[:, :-1], train_rows[:, -1])
data_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
)

## Adadelta

In [15]:
# Quick check of element-wise squaring (used by the Adadelta update).
torch.tensor(2).square()

tensor(4)

In [13]:
def init_adadelta_states(n_features):
    """Create zero-initialised Adadelta state for a weight/bias pair.

    Args:
        n_features: number of input features; the weight state tensors have
            shape ``(n_features, 1)`` and the bias state tensors shape ``(1,)``.

    Returns:
        ``((s_w, delta_w), (s_b, delta_b))`` -- one ``(s, delta)`` tuple per
        parameter, in ``[w, b]`` order. This is the layout ``adadelta_step``
        expects when it zips parameters with their states.
    """
    # s: running average of squared gradients.
    s_w, s_b = torch.zeros((n_features, 1)), torch.zeros(1)
    # delta: running average of squared updates.
    delta_w, delta_b = torch.zeros((n_features, 1)), torch.zeros(1)
    return (s_w, delta_w), (s_b, delta_b)

In [16]:
def adadelta_step(params, states, hyperparams):
    """Apply one in-place Adadelta update to each parameter.

    Args:
        params: iterable of tensors whose ``.grad`` is already populated.
        states: iterable aligned with ``params``; each item is an
            ``(s, delta)`` pair of tensors holding the running averages of
            squared gradients and squared updates. Both are updated in place.
        hyperparams: mapping with key ``'rho'`` (decay rate of the running
            averages) and, optionally, ``'eps'`` (stabilising constant,
            default ``1e-6``).

    The gradient of every parameter is reset to zero after its update.
    """
    rho = hyperparams['rho']
    eps = hyperparams.get('eps', 1e-6)
    for p, (s, delta) in zip(params, states):
        with torch.no_grad():
            # Slice assignment keeps the caller's state tensors alive (in-place).
            s[:] = rho * s + (1 - rho) * torch.square(p.grad)
            # Rescale the gradient by RMS(previous updates) / RMS(gradients).
            g = (torch.sqrt(delta + eps) / torch.sqrt(s + eps)) * p.grad
            p[:] -= g
            delta[:] = rho * delta + (1 - rho) * g * g
        p.grad.data.zero_()

In [17]:
def train(step_fn, states, hyperparams, data_loader, n_features, num_epochs=5):
    """Fit a linear-regression model with a custom optimiser step.

    Args:
        step_fn: callable ``step_fn(params, states, hyperparams)`` that
            updates the parameters in place and resets their gradients
            (e.g. ``adadelta_step``).
        states: optimiser state, passed through to ``step_fn`` unchanged.
        hyperparams: optimiser hyper-parameters, passed through to ``step_fn``.
        data_loader: iterable yielding ``(X, y)`` mini-batches, with ``X`` of
            shape ``(batch, n_features)``.
        n_features: number of input features (rows of the weight matrix).
        num_epochs: number of passes over ``data_loader``.

    Returns:
        A list with one float per epoch: the mean per-sample squared loss
        accumulated during that epoch (measured before each update).
        An epoch that sees no samples contributes ``nan``.
    """
    w = torch.normal(mean=0.0, std=0.01, size=(n_features, 1),
                     requires_grad=True)
    b = torch.zeros(1, requires_grad=True)

    def model(X):
        return torch.matmul(X, w) + b

    def loss_fn(y, y_hat):
        # Align y with y_hat so (batch,) vs (batch, 1) does not broadcast
        # into a (batch, batch) matrix.
        return 1 / 2 * (y_hat - y.reshape(y_hat.shape)) ** 2

    epoch_losses = []
    for _ in range(num_epochs):
        loss_sum, n_seen = 0.0, 0
        for X, y in data_loader:
            per_sample = loss_fn(y, model(X))
            per_sample.mean().backward()
            step_fn([w, b], states, hyperparams)
            loss_sum += per_sample.sum().item()
            n_seen += per_sample.numel()
        epoch_losses.append(loss_sum / n_seen if n_seen else float('nan'))
    return epoch_losses
    

## Adam

$$
\begin{split}
&m_0=0,v_0=0\\
&m_t=\beta_1m_{t-1}+(1-\beta_1)g_t\\
&v_t=\beta_2v_{t-1}+(1-\beta_2)g_t^2\\
&\hat {m}_t=\frac{m_t}{1-\beta_1^t}\\
&\hat {v}_t=\frac{v_t}{1-\beta_2^t}\\
&\theta_{t+1}=\theta_t-\frac{\alpha \hat{m}_t}{\sqrt{\hat{v}_t}+\epsilon}
\end{split}
$$

- 关于 $m_t$ 的计算，可以称之为 EMA（exponential moving average）
    - $m_t$ 是**过去所有梯度**的加权平均
    - $m_t=(1-\beta_1)g_t+\beta_1(1-\beta_1)g_{t-1}+\beta_1^2(1-\beta_1)g_{t-2}+\cdots+\beta_1^{t-1}(1-\beta_1)g_1$
    - 当 $0\lt\beta_1\lt 1$ 时，$(1-\beta_1)\gt \beta_1(1-\beta_1)\gt \beta_1^2(1-\beta_1)\gt\cdots\gt \beta_1^{t-1}(1-\beta_1)$，即越近的梯度权重越大，越早的梯度权重按指数衰减
- $\hat{m}_t$ is the bias-corrected first moment (mean) estimate of the gradients.
- $\hat{v}_t$ is the bias-corrected second moment (uncentered variance) estimate of the gradients.

- Adam modifies gradient descent by scaling each update with bias-corrected exponential moving averages of the gradient's first and second moments.