# 正规方程法
## 原理

In [None]:
import os, sys
sys.path.append(os.path.abspath('..'))

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from playML.visualization import show_predict

In [None]:
def normal():
    """Fit a linear regression on the Boston data via the normal equation.

    The first 10 samples are held out as the test set and all remaining
    samples are used for training. The true and predicted targets of the
    held-out samples are printed side by side (one row per sample) and then
    passed to ``show_predict``.
    """
    # NOTE(review): load_boston was removed in scikit-learn 1.2, so this demo
    # needs an older scikit-learn release (or a different dataset) to run.
    boston = load_boston()
    X_train = boston.data[10:, :]
    y_train = boston.target[10:]

    X_test = boston.data[:10, :]
    y_test = boston.target[:10]

    # Normal equation: theta = (X_b^T X_b)^-1 X_b^T y, where X_b is the
    # feature matrix with a leading column of ones for the intercept.
    # Solve it as a least-squares problem rather than forming the explicit
    # inverse, which is numerically fragile when X_b^T X_b is ill-conditioned.
    X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
    theta = np.linalg.lstsq(X_b, y_train, rcond=None)[0]
    # theta[0] is the intercept, theta[1:] are the feature coefficients.

    # Predict on the held-out samples.
    X_b_test = np.hstack([np.ones((len(X_test), 1)), X_test])
    y_predict = X_b_test.dot(theta)
    print(np.hstack([y_test.reshape(-1, 1), y_predict.reshape(-1, 1)]))

    show_predict(y_test, y_predict)

In [None]:
# Run the normal-equation demo defined above.
normal()

## 测试自己封装的库

In [None]:
%%time
%run ../testML/3.多元线性回归.py normal

## 使用scikit-learn

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
def liner_model():
    """Train scikit-learn's LinearRegression on iris and report the fit.

    Loads the iris data, holds out 20% of it as a test set (fixed
    ``random_state`` of 22), fits the estimator on the rest, plots the
    predictions with ``show_predict`` and prints the coefficients, the
    intercept and the value returned by ``score`` on the test set.
    """
    # Step 1: load the data set.
    dataset = load_iris()

    # Step 2: basic processing.
    # No outlier / missing-value handling is done here; only the
    # train/test split.
    features_train, features_test, target_train, target_test = train_test_split(
        dataset.data,
        dataset.target,
        test_size=0.2,
        random_state=22,
    )

    # Step 3: feature engineering -- the original author notes that no
    # feature scaling is needed for this approach.

    # Step 4: create the estimator and train it.
    model = LinearRegression()
    model.fit(features_train, target_train)

    # Step 5: evaluate on the held-out samples.
    predicted = model.predict(features_test)
    r2_score = model.score(features_test, target_test)

    show_predict(target_test, predicted)

    print("模型的回归系数是:\n", model.coef_)
    print("模型的截距是:\n", model.intercept_)
    print("R^2误差是:\n", r2_score)

In [None]:
# Run the scikit-learn LinearRegression demo defined above.
liner_model()

# 梯度下降法
## 原理
### 梯度下降法模拟

In [None]:
%run ../testML/梯度下降法模拟.py

### 全梯度下降法
- 每次都是沿着梯度下降的方向进行搜索
- 但计算量大

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def batch_gd(_x, _y):
    """Fit a one-feature linear regression with batch gradient descent.

    Every iteration uses the gradient computed over all samples. The fitted
    intercept and coefficient are printed, and the data together with the
    fitted values are drawn as two scatter plots.

    :param _x: 1-D array-like with the single feature of each sample
    :param _y: 1-D array-like with the target of each sample
    """

    def J(theta, X_b, y):
        """Return the mean squared error at ``theta``."""
        return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b)

    def dJ(theta, X_b, y):
        """Return the gradient of ``J`` at ``theta``.

        Vectorised: the residual is computed once and projected onto every
        column of ``X_b`` instead of being recomputed per feature.
        """
        return X_b.T.dot(X_b.dot(theta) - y) * 2. / len(X_b)

    def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
        """
        Run batch gradient descent.

        :param X_b: feature matrix with a leading column of ones
        :param y: target vector
        :param initial_theta: starting point of the iteration
        :param eta: learning rate
        :param n_iters: maximum number of iterations
        :param epsilon: stop once the loss changes by less than this amount
        :return: the final iterate
        """
        theta = initial_theta
        # Cache the loss of the current iterate so each step evaluates J once.
        last_loss = J(theta, X_b, y)
        i_iter = 0

        while i_iter < n_iters:
            theta = theta - eta * dJ(theta, X_b, y)

            loss = J(theta, X_b, y)
            if abs(loss - last_loss) < epsilon:
                break

            last_loss = loss
            i_iter += 1

        return theta

    def show(x, y, theta):
        """Scatter the samples and, in green, the fitted values."""
        plt.scatter(x, y)
        plt.scatter(x, x * theta[1] + theta[0], color='g')
        plt.show()

    # Accept plain sequences as well as ndarrays.
    x_values = np.asarray(_x)
    y_train = np.asarray(_y)

    # Simulate a data set with a single feature.
    X_train = x_values.reshape(-1, 1)

    # Extended feature matrix: X_b = [1, X]
    X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
    initial_theta = np.zeros(X_b.shape[1])  # start from the zero vector
    eta = 0.01  # learning rate
    theta = gradient_descent(X_b, y_train, initial_theta, eta)
    print("截距:", theta[0])
    print("回归系数:", theta[1:])

    show(x_values, y_train, theta)

In [None]:
# Build a synthetic data set: x is drawn uniformly from [0, 2) and
# y = 3x + 4 plus standard-normal noise, then fit it with batch_gd.
np.random.seed(666)  # fix the random seed so the data is reproducible
x = 2 * np.random.random(size=100)
y = x * 3. + 4. + np.random.normal(size=100)
batch_gd(x, y)

### 随机梯度下降法
- 使用较少的数据，就可收敛
- 但不保证每次的搜索方向都是沿着收敛方向
- 精度会有一定损失

In [None]:
def sgd(_x, _y):
    """Fit a one-feature linear regression with stochastic gradient descent.

    Each step picks one random sample, uses its gradient as the search
    direction and moves with a decaying learning rate. The number of steps
    equals the number of samples. The fitted intercept and coefficient are
    printed, and the data together with the fitted values are drawn as two
    scatter plots.

    :param _x: 1-D ndarray with the single feature of each sample
    :param _y: 1-D ndarray with the target of each sample
    """
    # Constants of the learning-rate schedule t0 / (t + t1).
    decay_t0 = 5
    decay_t1 = 50

    def step_size(t):
        """Learning rate for step ``t``; shrinks as ``t`` grows."""
        return decay_t0 / (t + decay_t1)

    def sample_gradient(theta, sample, target):
        """Search direction computed from a single sample."""
        assert np.array(sample).ndim == 1, "随机梯度法: 计算搜索方向时，要求输入单个样本"
        residual = sample.dot(theta) - target
        return sample.T.dot(residual) * 2.

    def run_sgd(design, targets, start_theta, n_iters):
        """
        Run ``n_iters`` stochastic gradient steps.

        :param design: feature matrix with a leading column of ones
        :param targets: target vector
        :param start_theta: starting point of the iteration
        :param n_iters: number of steps to perform
        :return: the final iterate
        """
        current = start_theta
        for step in range(n_iters):
            pick = np.random.randint(len(design))  # draw one random sample
            direction = sample_gradient(current, design[pick], targets[pick])
            current = current - step_size(step) * direction
        return current

    # Single-feature data set, extended to X_b = [1, X].
    feature_column = _x.reshape(-1, 1)
    ones_column = np.ones((len(feature_column), 1))
    design = np.hstack([ones_column, feature_column])

    start = np.zeros(design.shape[1])  # start from the zero vector
    theta = run_sgd(design, _y, start, n_iters=len(design))
    print("截距:", theta[0])
    print("回归系数:", theta[1:])

    # Scatter the samples and, in green, the fitted values.
    plt.scatter(_x, _y)
    plt.scatter(_x, _x * theta[1] + theta[0], color='g')
    plt.show()

In [None]:
# Build a synthetic data set: x is drawn uniformly from [0, 2) and
# y = 3x + 4 plus standard-normal noise, then fit it with sgd.
np.random.seed(666)  # fix the random seed so the data is reproducible
x = 2 * np.random.random(size=100)
y = x * 3. + 4. + np.random.normal(size=100)
sgd(x, y)

## 测试自己封装的库

In [None]:
%time %run ../testML/3.多元线性回归.py gd
%time %run ../testML/3.多元线性回归.py sgd

## 使用scikit-learn

In [None]:
from sklearn.datasets import load_boston               # 1.获取数据集
from sklearn.model_selection import train_test_split   # 2.数据分割
from sklearn.preprocessing import StandardScaler       # 3.数据标准化
from sklearn.linear_model import SGDRegressor          # 4.线性回归
from sklearn.metrics import mean_squared_error         # 5.模型评估

In [None]:
def gradient_descent():
    """Train scikit-learn's SGDRegressor on standardised iris data.

    Loads the iris data, holds out 20% of it as a test set (fixed
    ``random_state`` of 22), standardises the features with statistics
    learned from the training split only, fits the estimator, plots the
    predictions with ``show_predict`` and prints the coefficients, the
    intercept and the value returned by ``score`` on the test set.
    """
    # 1. Load the data set.
    iris = load_iris()

    # 2. Basic processing.
    # 2.1 No outlier / missing-value handling is done here.

    # 2.2 Split into training and test sets.
    x_train, x_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=22, test_size=0.2)

    # 3. Feature engineering: scaling is required for gradient descent.
    # transform() returns the scaled data instead of modifying its input,
    # so the result has to be assigned back; otherwise the model would be
    # trained and evaluated on the unscaled features.
    transfer = StandardScaler()
    transfer.fit(x_train)
    x_train = transfer.transform(x_train)
    x_test = transfer.transform(x_test)

    # 4. Machine learning.
    # 4.1 Create the estimator.
    estimater = SGDRegressor()

    # 4.2 Train the model.
    estimater.fit(x_train, y_train)

    # 5. Evaluate the model on the held-out samples.
    y_predict = estimater.predict(x_test)
    ret = estimater.score(x_test, y_test)

    show_predict(y_test, y_predict)

    print("模型的回归系数是:\n", estimater.coef_)
    print("模型的截距是:\n", estimater.intercept_)
    print("R^2误差是:\n", ret)

In [None]:
# Run the scikit-learn SGDRegressor demo defined above.
gradient_descent()

# 对比
- 正规方程适合特征数较少的情况
- 如果特征数太多，使用梯度下降法可以更快地收敛（正规方程需要对 n×n 的矩阵求逆，计算量随特征数 n 约按 O(n³) 增长）

In [None]:
from playML.LinearRegression import LinearRegression
from playML.preprocessing import StandardScaler
import numpy as np

In [None]:
m = 1000
n = 500

np.random.seed(666)
big_X = np.random.random(size=(m, n))
true_theta = np.random.uniform(0.0, 100.0, size=n+1)
big_y = big_X.dot(true_theta[1:]) + true_theta[0]

# X_b = np.hstack([np.ones((len(big_X), 1)), big_X])
# big_y2 = X_b.dot(true_theta)

# 归一化
transfer = StandardScaler()
big_X_standard = transfer.fit_transform(big_X)

# 正规方程法
estimater1 = LinearRegression()
%time estimater1.fit_normal(big_X_standard, big_y)

# 梯度下降法
estimater2 = LinearRegression()
%time estimater2.fit_gd(big_X_standard, big_y)

np.vstack((true_theta, estimater1._theta, estimater1._theta)).T