## 梯度下降法的向量化

In [1]:
import numpy as np
from sklearn import datasets

In [2]:
# Load the Boston housing data set bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2;
# this cell needs an older scikit-learn (or another data source) to run.
boston = datasets.load_boston()

# Keep only the samples whose target value is below 50.0; the same boolean
# mask is applied to the features and the targets so the rows stay aligned.
keep = boston.target < 50.0
X = boston.data[keep]
y = boston.target[keep]

In [3]:
# Add the parent directory to the import path; presumably that is where the
# local playML package lives.
import sys 
sys.path.append("..") 
from playML.model_selection import train_test_split

# Split into training and test sets. seed=666 is passed, presumably so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)
# Display one training sample; its feature values span very different scales.
X_train[1,:]

array([  3.67822,   0.     ,  18.1    ,   0.     ,   0.77   ,   5.362  ,
        96.2    ,   2.1036 ,  24.     , 666.     ,  20.2    , 380.79   ,
        10.19   ])

In [4]:
from playML.LinearRegression import LinearRegression

# Baseline: fit with the closed-form solver (fit_normal) and time the call.
lin_reg1 = LinearRegression()
%time lin_reg1.fit_normal(X_train, y_train)
# Score on the test set; the later cells compare against this value.
lin_reg1.score(X_test, y_test)

CPU times: user 134 ms, sys: 6.9 ms, total: 141 ms
Wall time: 144 ms


0.8129802602658466

### 使用梯度下降法

In [5]:
# Fit with gradient descent on the raw (unscaled) features, using the
# default arguments of fit_gd. This run emits overflow warnings from the
# loss computation.
lin_reg2 = LinearRegression()
lin_reg2.fit_gd(X_train, y_train)

  return umr_sum(a, axis, dtype, out, keepdims)
  return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
  if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):


LinearRegression()

In [6]:
# The features are on very different scales, so the computed gradient is large.
# With the default eta, gradient descent does not converge (every coefficient
# ends up nan), so the next step is to try a smaller eta.
lin_reg2.coef_

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])

In [7]:
# With a smaller eta the warning (RuntimeWarning: overflow encountered in square)
# no longer appears.
lin_reg2.fit_gd(X_train, y_train, eta=0.000001)

LinearRegression()

In [8]:
# The R^2 score is still fairly low. eta may now be too small, so each step
# moves only a little; more iterations are needed to reach the minimum of the
# loss function.
lin_reg2.score(X_test, y_test)

0.2755663485338923

In [9]:
# Keep the small eta but allow many more iterations, and time the fit.
# NOTE(review): n_iters is passed as the float 1e6 -- confirm that fit_gd
# accepts a non-integer iteration count.
%time lin_reg2.fit_gd(X_train, y_train, eta=0.000001, n_iters=1e6)

CPU times: user 1min 5s, sys: 542 ms, total: 1min 5s
Wall time: 34.8 s


LinearRegression()

In [10]:
# The minimum of the loss function has still not been reached: this R^2 is lower
# than the one obtained from the closed-form solution (fit_normal),
# 0.8129802602658466.
# The closed-form solution involves no search, so it does not need feature scaling.
# Gradient descent searches step by step, so features with different scales
# distort the search; the data should therefore be standardized first.
lin_reg2.score(X_test, y_test)

0.7541852353980764

### 使用梯度下降法前进行数据归一化

In [11]:
# 对数据进行归一化
from sklearn.preprocessing import StandardScaler

standardScaler = StandardScaler()
standardScaler.fit(X_train)
X_train_standard = standardScaler.transform(X_train)

lin_reg3 = LinearRegression()
%time lin_reg3.fit_gd(X_train_standard, y_train)

CPU times: user 345 ms, sys: 4.41 ms, total: 349 ms
Wall time: 220 ms


LinearRegression()

In [12]:
# The minimum of the loss function is now reached: this R^2 agrees with the
# closed-form (fit_normal) value 0.8129802602658466 to within about 1e-5.
# The test set is transformed with the scaler that was fitted on the training set.
X_test_standard = standardScaler.transform(X_test)
lin_reg3.score(X_test_standard, y_test)

0.8129880620122235

### 梯度下降法的优势

In [13]:
# For linear regression the closed-form solution relies on matrix products, so
# it becomes slow when the number of features is large (e.g. n = 5000) and the
# matrices are big.
# For many generalized linear models (GLMs) no closed-form solution exists at all.
# NOTE(review): m < n here, so the (n+1) x (n+1) normal-equation matrix has rank
# at most m and is singular -- check how fit_normal copes with that before
# reading anything other than the timing from this demo.

# Seed the global NumPy RNG so the synthetic data is identical on every run.
np.random.seed(666)

m = 1000  # number of samples
n = 5000  # number of features

big_X = np.random.normal(size=(m, n))

# Ground-truth parameters: index 0 is used as the intercept, the remaining n
# entries as the feature coefficients.
true_theta = np.random.uniform(0.0, 100.0, size=n+1)

# Linear response plus Gaussian noise with standard deviation 10.
big_y = big_X.dot(true_theta[1:]) + true_theta[0] + np.random.normal(0., 10., size=m)

In [14]:
# Time the closed-form solver (fit_normal) on the large synthetic problem.
big_reg1 = LinearRegression()
%time big_reg1.fit_normal(big_X, big_y)

CPU times: user 11.5 s, sys: 408 ms, total: 11.9 s
Wall time: 5.8 s


LinearRegression()

In [15]:
# Time gradient descent (fit_gd, default arguments) on the same data for comparison.
big_reg2 = LinearRegression()
%time big_reg2.fit_gd(big_X, big_y)

CPU times: user 3.13 s, sys: 9.75 ms, total: 3.14 s
Wall time: 1.57 s


LinearRegression()