In [1]:
%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt

In [2]:
dataset = np.array([
    [1, 7],
    [2, 13],
    [3, 17],
    [4, 22],
    [5, 27],
    [6, 33],
    [7, 38],
    [8, 42],
    [9, 46],
    [10, 52]
])

In [3]:
x = dataset[:, 0:1]
y = dataset[:, 1:]

## 动量方法-批量梯度下降

In [4]:
def BatchGradientDescentM(x, y, step=0.001, iter_count=500, beta=0.9):
    length, features = x.shape
    data = np.column_stack((x, np.ones((length, 1))))
    w = np.zeros((features + 1, 1))
    v = np.zeros((features + 1, 1))
    for i in range(iter_count):
        v = (beta * v + (1 - beta) * np.sum((np.dot(data, w) - y) * data, axis=0).reshape((features + 1, 1))) / length         
        w -= step * v        
    return w

In [5]:
BatchGradientDescentM(x, y, iter_count=1000)

array([[5.12168864],
       [0.77728077]])

## 动量方法-随机梯度下降

In [6]:
def StochasticGradientDescentM(x, y, step=0.001, iter_count=500, beta=0.9):
    length, features = x.shape
    data = np.column_stack((x, np.ones((length, 1))))
    w = np.zeros((features + 1, 1))
    v = np.zeros((features + 1, 1))
    for i in range(iter_count):
        random_ind = np.random.randint(length)
        dw = (np.dot(data[random_ind], w) - y[random_ind]) * data[random_ind]
        v = (beta * v + (1 - beta) * dw.reshape((features + 1, 1))) / length        
        w -= step * v
    return w

In [7]:
StochasticGradientDescentM(x, y, iter_count=10000)

array([[5.1217052],
       [0.7789374]])

## 动量方法-小批量梯度下降

In [8]:
def MiniBatchGradientDescentM(x, y, step=0.001, iter_count=500, batch_size=4, beta=0.9):
    length, features = x.shape
    data = np.column_stack((x, np.ones((length, 1))))
    w = np.zeros((features + 1, 1))
    v = np.zeros((features + 1, 1))
    start, end = 0, batch_size
    for i in range(iter_count):
        v = (beta * v + (1 - beta) * np.sum((np.dot(data[start:end], w) - y[start:end]) * data[start:end], axis=0).reshape((features + 1, 1))) / length         
        w -= step * v
        start = (start + batch_size) % length
        end = (end + batch_size) % length
    return w

In [9]:
MiniBatchGradientDescentM(x, y, iter_count=10000)

array([[5.24725705],
       [1.02780271]])

## 整合写法

In [10]:
def Momentum(x, y, step=0.01, iter_count=1000, batch_size=4, beta=0.9):
    length, features = x.shape
    data = np.column_stack((x, np.ones((length, 1))))
    w = np.zeros((features + 1, 1))
    v = np.zeros((features + 1, 1))
    start, end = 0, batch_size
    for i in range(iter_count):
        v = (beta * v + (1 - beta) * np.sum((np.dot(data[start:end], w) - y[start:end]) * data[start:end], axis=0).reshape((features + 1, 1))) / length         
        w -= step * v
        start = (start + batch_size) % length
        if start > length:
            start -= length
        end = (end + batch_size) % length
        if end > length:
            end -= length
    return w

In [11]:
# 批量梯度下降
Momentum(x, y, batch_size=(x.shape[0] - 1))

array([[5.00311478],
       [0.8307453 ]])

In [12]:
# 小批量梯度下降
Momentum(x, y, batch_size=5)

array([[4.98144568],
       [1.43164128]])

In [13]:
# 随机梯度下降
Momentum(x, y, batch_size=1)

array([[4.99294353],
       [0.83128473]])

## 牛顿增量

In [16]:
def Nesterov(x, y, step=0.01, iter_count=1000, batch_size=4, beta=0.9):
    length, features = x.shape
    data = np.column_stack((x, np.ones((length, 1))))
    w = np.zeros((features + 1, 1))
    v = np.zeros((features + 1, 1))
    start, end = 0, batch_size
    for i in range(iter_count):
        w_temp = w - step * v
        v = (beta * v + (1 - beta) * np.sum((np.dot(data[start:end], w_temp) - y[start:end]) * data[start:end], axis=0).reshape((features + 1, 1))) / length         
        w -= step * v
        start = (start + batch_size) % length
        if start > length:
            start -= length
        end = (end + batch_size) % length
        if end > length:
            end -= length
    return w

In [17]:
Nesterov(x, y)

array([[5.24702554],
       [1.02779773]])