In [1]:
%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt

In [2]:
dataset = np.array([
    [1, 7],
    [2, 13],
    [3, 17],
    [4, 22],
    [5, 27],
    [6, 33],
    [7, 38],
    [8, 42],
    [9, 46],
    [10, 52]
])

In [3]:
x = dataset[:, 0:1]
y = dataset[:, 1:]

## AdaGrad

In [23]:
def AdaGrad(x, y, step=0.01, iter_count=500, batch_size=4):
    length, features = x.shape
    data = np.column_stack((x, np.ones((length, 1))))
    w = np.zeros((features + 1, 1))
    r, eta = 0, 10e-7
    start, end = 0, batch_size
    for i in range(iter_count):
        dw = np.sum((np.dot(data[start:end], w) - y[start:end]) * data[start:end], axis=0) / length        
        r = r + np.dot(dw, dw)                     
        w = w - (step / (eta + np.sqrt(r))) * dw.reshape((features + 1, 1))
        start = (start + batch_size) % length
        if start > length:
            start -= length
        end = (end + batch_size) % length
        if end > length:
            end -= length
    return w

In [24]:
AdaGrad(x, y, step=1, iter_count=1000)

array([[5.19133285],
       [1.35955132]])

In [25]:
AdaGrad(x, y, step=0.1, iter_count=1000)

array([[3.37157325],
       [0.6519457 ]])

In [27]:
AdaGrad(x, y, step=0.1, iter_count=3000)

array([[4.72572017],
       [0.91424582]])

## 改进

In [28]:
def AdaGrad(x, y, step=0.01, iter_count=500, step_count=100, batch_size=4):
    length, features = x.shape
    data = np.column_stack((x, np.ones((length, 1))))
    w = np.zeros((features + 1, 1))
    r, eta = 0, 10e-7
    start, end = 0, batch_size
    for i in range(iter_count):
        dw = np.sum((np.dot(data[start:end], w) - y[start:end]) * data[start:end], axis=0) / length
        if i > step_count:
            r = r + np.dot(dw, dw)
            w = w - (step / (eta + np.sqrt(r))) * dw.reshape((features + 1, 1))
        else:
            w -= step * dw.reshape((features + 1, 1))        
        start = (start + batch_size) % length
        if start > length:
            start -= length
        end = (end + batch_size) % length
        if end > length:
            end -= length
    return w

In [30]:
AdaGrad(x, y, iter_count=500)

array([[5.24748173],
       [1.06459711]])

In [33]:
def AdaGrad(x, y, step=0.01, iter_count=500, step_threshold=30, batch_size=4):
    length, features = x.shape
    data = np.column_stack((x, np.ones((length, 1))))
    w = np.zeros((features + 1, 1))
    r, eta = 0, 10e-7
    start, end = 0, batch_size
    for i in range(iter_count):
        dw = np.sum((np.dot(data[start:end], w) - y[start:end]) * data[start:end], axis=0) / length
        dw2 = np.dot(dw, dw)
        if dw2 < step_threshold:
            r = r + dw2
            w = w - (step / (eta + np.sqrt(r))) * dw.reshape((features + 1, 1))
        else:
            w -= step * dw.reshape((features + 1, 1))        
        start = (start + batch_size) % length
        if start > length:
            start -= length
        end = (end + batch_size) % length
        if end > length:
            end -= length
    return w

In [34]:
AdaGrad(x, y)

array([[5.12585752],
       [0.95310592]])