# 批量梯度下降 (batch GD)
每次迭代都需要用全部样本来计算梯度:
![bgd](img/bgd.png)

# 随机梯度下降: 每次只用一个样本来计算搜索方向
## 相比 batch GD 每次都沿着准确的梯度方向, 随机 GD 的搜索方向带有随机性, 即使已经接近最优值也可能再次跳出, 所以它的学习率应该随迭代次数逐渐减小, 而不是一直不变
## 这其实就是模拟退火的思想


$$
\eta = \frac{1}{i_{\text{iters}}}
$$

直接取迭代次数的倒数时, 前几次迭代的学习率下降得太快, 因此引入常数 a 和 b 来缓和:

$$
\eta = \frac{a}{i_{\text{iters}} + b}
$$


In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Synthetic 1-D regression data: y = 4x + 3 plus Gaussian noise (std 3).
m = 100000
# Call the seeding function. Assigning to ``np.random.seed`` would replace the
# function with an int and leave the generator unseeded.
np.random.seed(666)
x = np.random.normal(size=m)
X = x.reshape(-1, 1)  # column vector: one feature per sample
y = 4. * x + 3. + np.random.normal(0, 3, size=m)

In [3]:
def J(theta, X_b, y):
    """Return the mean squared error of the linear model ``X_b.dot(theta)``.

    Args:
        theta: Parameter vector.
        X_b: Design matrix whose rows are samples.
        y: Target values, one per row of ``X_b``.

    Returns:
        The sum of squared residuals divided by ``len(y)``, or
        ``float("inf")`` when the cost cannot be evaluated.
    """
    try:
        return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
    except Exception:
        # Best effort: a cost that cannot be computed counts as infinitely
        # bad. ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate.
        return float("inf")
    
def dJ(theta, X_b, y):
    """Return the gradient of the mean squared error with respect to theta.

    Args:
        theta: Parameter vector.
        X_b: Design matrix whose rows are samples.
        y: Target values, one per row of ``X_b``.

    Returns:
        ``X_b.T.dot(X_b.dot(theta) - y)`` scaled by ``2 / len(y)``.
    """
    residual = X_b.dot(theta) - y
    summed = X_b.T.dot(residual)
    return summed * 2. / len(y)


def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
    """Minimise the cost ``J`` with fixed-step batch gradient descent.

    Args:
        X_b: Design matrix passed through to ``J`` and ``dJ``.
        y: Target values passed through to ``J`` and ``dJ``.
        initial_theta: Starting parameter vector.
        eta: Learning rate applied to every step.
        n_iters: Upper bound on the number of steps.
        epsilon: Stop as soon as one step changes the cost by less than this.

    Returns:
        The parameter vector after the last step taken.
    """
    current = initial_theta
    step = 0
    while step < n_iters:
        previous = current
        current = previous - eta * dJ(previous, X_b, y)
        # Converged once a full step barely moves the cost.
        cost_change = abs(J(current, X_b, y) - J(previous, X_b, y))
        if cost_change < epsilon:
            return current
        step += 1

    return current
    

In [4]:
%%time
# Prepend a column of ones to X so that theta[0] multiplies a constant 1
# (the intercept term).
X_b = np.hstack((np.ones((len(X), 1)), X))
# Start from the all-zero parameter vector, one entry per column of X_b.
initial_theta = np.zeros(X_b.shape[1])
eta = 0.01  # fixed learning rate for batch gradient descent
theta = gradient_descent(X_b, y, initial_theta, eta)

Wall time: 750 ms


In [5]:
theta

array([2.98423638, 4.01323681])

## 随机梯度下降

In [6]:
def dJ_sgd(theta, X_b_i, y_i):
    """Return the squared-error gradient estimated from a single sample.

    Args:
        theta: Parameter vector.
        X_b_i: Feature row of the i-th sample.
        y_i: Target value of the i-th sample.

    Returns:
        ``X_b_i.T.dot(X_b_i.dot(theta) - y_i)`` multiplied by 2.
    """
    # With only one sample there is no division by the sample count.
    residual = X_b_i.dot(theta) - y_i
    return X_b_i.T.dot(residual) * 2.

In [7]:
def sgd(X_b, y, initial_theta, n_iters, a=5, b=50):
    """Fit theta by stochastic gradient descent, one random sample per step.

    The learning rate decays as ``a / (b + step)`` so that later, noisier
    steps move theta less and less.

    Args:
        X_b: Design matrix whose rows are samples.
        y: Target values, one per row of ``X_b``.
        initial_theta: Starting parameter vector.
        n_iters: Number of single-sample update steps to perform.
        a: Numerator of the learning-rate schedule (default 5).
        b: Offset in the schedule's denominator (default 50).

    Returns:
        The parameter vector after ``n_iters`` updates.
    """
    def learning_rate(step):
        return a / (b + step)

    # Each step uses one randomly chosen sample, so two consecutive thetas
    # may be very close by chance; an epsilon-based stopping test is therefore
    # meaningless here and the loop always runs for n_iters steps.
    theta = initial_theta
    n_samples = len(X_b)
    for cur_iter in range(n_iters):
        rand_i = np.random.randint(n_samples)
        gradient = dJ_sgd(theta, X_b[rand_i], y[rand_i])
        theta = theta - learning_rate(cur_iter) * gradient
    return theta
    

In [8]:
%%time
# Design matrix with a leading column of ones (intercept term).
X_b = np.hstack((np.ones((len(X), 1)), X))
initial_theta = np.zeros(X_b.shape[1])
# For illustration only: take len(X_b) // 3 single-sample steps, i.e. as many
# updates as one third of the data set has rows.
theta = sgd(X_b, y, initial_theta, n_iters=len(X_b)//3)

Wall time: 290 ms


In [9]:
theta

array([2.92658792, 3.97599907])

# 使用我们自己的sgd

In [10]:
from playML.LinearRegression import LinearRegression

In [11]:
lin_reg = LinearRegression()

In [12]:
# NOTE(review): playML's fit_sgd is not visible here; n_iters presumably counts
# passes over the whole data set rather than single-sample steps - confirm.
lin_reg.fit_sgd(X, y, n_iters=3)

In [13]:
lin_reg.coef_

array([4.00752182])

In [14]:
lin_reg.interception_

2.9903503958347772

# 使用真实数据

In [15]:
from sklearn import datasets
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older scikit-learn releases.
boston = datasets.load_boston()

In [16]:
X = boston.data
y = boston.target
# Keep only the samples whose target is below 50. X must be masked first,
# while y still has its original length; y is filtered afterwards.
X = X[y<50]
y = y[y<50]

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

In [19]:
# normalization
from sklearn.preprocessing import StandardScaler

In [20]:
standardScaler = StandardScaler()

In [21]:
# Fit the scaler on the training split only, then transform the training
# features with it.
standardScaler.fit(X_train)
X_train_std = standardScaler.transform(X_train)

In [22]:
# Transform the test split with the scaler that was fitted on X_train.
X_test_std = standardScaler.transform(X_test)

In [23]:
# Fresh model: time an SGD fit with n_iters=2 on the standardized training
# data, then print the score on the standardized test split.
lin_reg = LinearRegression()
%time lin_reg.fit_sgd(X_train_std, y_train, n_iters=2)
print(lin_reg.score(X_test_std, y_test))

Wall time: 9.99 ms
0.7965894986415484


In [24]:
# Same experiment with n_iters=20, on a fresh model.
lin_reg = LinearRegression()
%time lin_reg.fit_sgd(X_train_std, y_train, n_iters=20)
print(lin_reg.score(X_test_std, y_test))

Wall time: 46 ms
0.8004513916093217


In [27]:
# Same experiment with n_iters=100, on a fresh model.
lin_reg = LinearRegression()
%time lin_reg.fit_sgd(X_train_std, y_train, n_iters=100)
print(lin_reg.score(X_test_std, y_test))

Wall time: 611 ms
0.8013566058998575


# sklearn sgd

In [28]:
from sklearn.linear_model import SGDRegressor

In [32]:
# scikit-learn's SGDRegressor with max_iter=100, fitted and scored on the same
# standardized splits as the playML runs.
sgd_reg = SGDRegressor(max_iter = 100)
%time sgd_reg.fit(X_train_std, y_train)
sgd_reg.score(X_test_std, y_test)

Wall time: 6 ms


0.8003440433689902

### 可以看出sklearn中的sgd 非常的快