In [None]:
import numpy as np
import matplotlib.pyplot as plt

# 原始数据

In [None]:
# Reproducible synthetic data: 100 samples of y = 0.5*x^2 + x + 2 plus
# standard-normal noise, with x drawn uniformly from [-3, 3).
np.random.seed(666)
x = np.random.uniform(low=-3, high=3, size=100)
X = x[:, np.newaxis]  # column-vector view of x, the 2-D layout the estimators are fitted on
y = 0.5 * x**2 + x + 2 + np.random.normal(loc=0, scale=1, size=100)

In [None]:
# Scatter plot of the raw (x, y) samples.
plt.scatter(x, y)
plt.show()

# 拟合
## 使用线性回归拟合

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit a plain linear model on the single feature x, then predict on the
# same samples it was trained on.
linear_regression = LinearRegression().fit(X, y)
y_predict = linear_regression.predict(X)

In [None]:
# Overlay the linear fit (red) on the samples.
# x is not sorted here; the predictions of the single-feature linear model all
# lie on one straight line, so the drawing order does not change the picture.
plt.scatter(x, y)
plt.plot(x, y_predict, color='r')
plt.show()

## 使用多项式回归拟合
### 转换为线性回归
- y = ax**2 + bx + c 看成是: y = ax1 + bx2 + c，其中 x1 = x**2、x2 = x，X = [x1, x2]，训练模型求合适的系数 a、b、c

In [None]:
# Two-column design matrix: column 0 is x**2, column 1 is x.
X2 = np.concatenate((X ** 2, X), axis=1)

### 线性回归训练

In [None]:
# Ordinary linear regression on the expanded features [x**2, x]; predictions
# are made on the same training samples.
linear_regression2 = LinearRegression().fit(X2, y)
y_predict2 = linear_regression2.predict(X2)

In [None]:
# Overlay the quadratic fit (red) on the samples.
plt.scatter(x, y)
plt.plot(np.sort(x), y_predict2[np.argsort(x)], color='r')  # draw in ascending x order so the curve is traced left to right
plt.show()

In [None]:
# Fitted coefficients, in the column order of X2: [x**2, x].
# The data were generated with 0.5 and 1 for these terms.
linear_regression2.coef_

In [None]:
# Fitted intercept; the constant term used to generate the data is 2.
linear_regression2.intercept_

## Polynomial回归

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


def PolynomialRegression(degree=2):
    """Build an unfitted polynomial-regression pipeline.

    The pipeline expands the input into polynomial features up to ``degree``,
    standardizes the expanded features, and fits an ordinary linear
    regression on them.

    Args:
        degree: Highest polynomial degree passed to ``PolynomialFeatures``.

    Returns:
        A ``Pipeline`` with steps "ploy", "std_scaler" and "linear_reg".
    """
    # Step names are runtime keys of the pipeline, so "ploy" is kept as spelled.
    steps = [
        ("ploy", PolynomialFeatures(degree)),
        ("std_scaler", StandardScaler()),
        ("linear_reg", LinearRegression()),
    ]
    return Pipeline(steps)


def PolynomialRegressionTest(degree=2):
    """Fit a polynomial regression of the given degree and visualize it.

    Reads the notebook-level ``x``, ``X`` and ``y``. Shows the samples with the
    fitted curve drawn in red, then prints the mean squared error measured on
    the training samples themselves.

    Args:
        degree: Polynomial degree forwarded to ``PolynomialRegression``.
    """
    model = PolynomialRegression(degree=degree)
    model.fit(X, y)
    fitted_on_samples = model.predict(X)

    plt.scatter(x, y)

    # Evaluate the model on an evenly spaced grid over [-3, 3] so the curve is
    # drawn left to right instead of following the sample order.
    grid = np.linspace(-3, 3, 100).reshape(-1, 1)
    curve = model.predict(grid)
    plt.plot(grid, curve, color='r')
    plt.axis([-3, 3, -1, 10])
    plt.show()

    # Training-set MSE.
    print(mean_squared_error(y, fitted_on_samples))

In [None]:
# Degree 2 matches the order of the quadratic used to generate y.
PolynomialRegressionTest(degree=2)

# 模型泛化能力
## 欠拟合和过拟合
- 欠拟合： 在训练集和测试集上表现都不好。一般是模型过于简单（如特征不足、多项式阶数过低）或训练不足，无法刻画数据的真实规律，偏差较大
- 过拟合： 在训练集上表现好，在测试集上表现不好。一般是噪音或无用的特征参与了训练，导致模型过于复杂，方差较大

In [None]:
# Degree 0 (no x terms): expected to underfit.
PolynomialRegressionTest(degree=0)

In [None]:
# Degree 1 (straight line): still expected to underfit the quadratic data.
PolynomialRegressionTest(degree=1)

In [None]:
# Degree 2: same order as the function that generated the data.
PolynomialRegressionTest(degree=2)

In [None]:
# Degree 10: more flexible than the data require; compare the curve and MSE with degree 2.
PolynomialRegressionTest(degree=10)

In [None]:
# Degree 100: extreme flexibility, expected to overfit heavily.
PolynomialRegressionTest(degree=100)

## 学习曲线

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error


def plot_learning_curve(algo, X_train, X_test, y_train, y_test):
    """Plot train and test RMSE as the training subset grows.

    For every size ``i`` from 1 to ``len(X_train)``, ``algo`` is refit on the
    first ``i`` training samples. The error is then measured on those same
    ``i`` samples and on the full test set.

    Args:
        algo: Estimator exposing ``fit`` and ``predict``. It is refit in place,
            so after the call it holds the fit on the whole training set.
        X_train, y_train: Training features and targets.
        X_test, y_test: Test features and targets.
    """
    subset_sizes = list(range(1, len(X_train) + 1))
    train_mse, test_mse = [], []

    for size in subset_sizes:
        X_part, y_part = X_train[:size], y_train[:size]
        algo.fit(X_part, y_part)

        train_mse.append(mean_squared_error(y_part, algo.predict(X_part)))
        test_mse.append(mean_squared_error(y_test, algo.predict(X_test)))

    # MSE values are collected above; the curves show their square root (RMSE).
    for label, errors in (('train', train_mse), ('test', test_mse)):
        plt.plot(subset_sizes, np.sqrt(errors), label=label)
    plt.legend()
    plt.axis([0, len(X_train) + 1, 0, 4])
    plt.show()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out part of the synthetic data for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

In [None]:
plot_learning_curve(LinearRegression(), X_train, X_test, y_train, y_test)  # underfitting

In [None]:
plot_learning_curve(PolynomialRegression(), X_train, X_test, y_train, y_test)  # ok: default degree 2

In [None]:
plot_learning_curve(PolynomialRegression(degree=20), X_train, X_test, y_train, y_test) # overfitting

## 交叉验证
- 使用train_test_split后的数据集来训练模型，结果有可能过拟合测试数据，一般使用交叉验证来训练模型：
  - 1.将train_test_split后的训练集划分为：训练集+验证集，进行模型训练，获取最佳的超参数
  - 2.测试模型在测试集上的表现

In [None]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier


def train_test(X_train, X_test, y_train, y_test):
    """Pick KNN hyperparameters by scoring directly on the test set.

    Tries every k in 2..5 and p in 1..4 with distance weighting, fits on the
    training data and scores on the test data, then prints the best k, p and
    score. Because the test set drives the selection, the chosen
    hyperparameters may still overfit it.
    """
    def holdout_accuracy(n_neighbors, minkowski_p):
        model = KNeighborsClassifier(weights="distance", n_neighbors=n_neighbors, p=minkowski_p)
        model.fit(X_train, y_train)
        return model.score(X_test, y_test)

    # The leading (0, 0, 0) entry is the "nothing scored above 0" fallback.
    candidates = [(0, 0, 0)]
    candidates += [
        (holdout_accuracy(k, p), p, k)
        for k in range(2, 6)
        for p in range(1, 5)
    ]
    # max() keeps the first of several equal maxima, i.e. the earliest
    # (k, p) in scan order, like a strict ">" comparison would.
    best_score, best_p, best_k = max(candidates, key=lambda entry: entry[0])

    print("best k = ", best_k)
    print("best p = ", best_p)
    print("best score = ", best_score)
    

def train_validate_test(X_train, X_test, y_train, y_test, cv=3):
    """Select KNN hyperparameters by cross-validation, then score on the test set.

    1. For every k in 2..5 and p in 1..4, compute the mean ``cv``-fold
       cross-validation score on the training data and keep the best pair.
    2. Fit a classifier with the best pair on the full training data and
       print its score on the held-out test set.

    Args:
        X_train, y_train: Training features and labels; the only data used
            for hyperparameter selection.
        X_test, y_test: Held-out data, used once for the final score.
        cv: Number of cross-validation folds passed to ``cross_val_score``.
    """
    # 1. Hyperparameter search with cross-validation on the training set.
    best_score, best_p, best_k = 0, 0, 0
    for k in range(2, 6):
        for p in range(1, 5):
            knn_clf = KNeighborsClassifier(weights="distance", n_neighbors=k, p=p)
            scores = cross_val_score(knn_clf, X_train, y_train, cv=cv)
            score = np.mean(scores)
            if score > best_score:
                best_score, best_p, best_k = score, p, k

    print("best k = ", best_k)
    print("best p = ", best_p)
    print("best score = ", best_score)

    # 2. Final model: use the selected pair. The loop variables k and p only
    #    hold the last combination tried, not the best one.
    knn_clf = KNeighborsClassifier(weights="distance", n_neighbors=best_k, p=best_p)
    knn_clf.fit(X_train, y_train)
    print("final score = ", knn_clf.score(X_test, y_test))


def test():
    """Compare the two tuning strategies on the digits dataset.

    Loads the digits data, holds out 40% as the test set (fixed random state),
    then runs ``train_test`` followed by ``train_validate_test`` on that split.
    """
    digits = load_digits()
    # train_test_split returns [X_train, X_test, y_train, y_test], which is
    # the positional order both tuning functions expect.
    split = train_test_split(digits.data, digits.target, test_size=0.4, random_state=10)

    train_test(*split)
    train_validate_test(*split)
    

# Run the comparison when this code is executed as the main program.
if __name__ == '__main__':
    test()

## 模型正则化
- 解决过拟合问题。
  - L1 正则化：高次项系数置0，直接去掉高次项。<--**Lasso回归**
  - L2 正则化：高次项系数设为很小的值，弱化高次项带来的影响。<--**Ridge回归**
  - 弹性网络 ：结合 L1、L2

### Ridge回归
- 损失函数：MSE + alpha * np.sum(theta ** 2)
- alpha越大，theta越小，正则化力度越大

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error


def RidgeRegression(degree=2, alpha=0.0001):
    """Build an unfitted polynomial ridge-regression pipeline.

    The pipeline expands the input into polynomial features up to ``degree``,
    standardizes them, and fits a ``Ridge`` model with strength ``alpha``.

    Args:
        degree: Highest polynomial degree passed to ``PolynomialFeatures``.
        alpha: Regularization strength passed to ``Ridge``.

    Returns:
        A ``Pipeline`` with steps "ploy", "std_scaler" and "ridge_reg".
    """
    # Step names are runtime keys of the pipeline, so "ploy" is kept as spelled.
    steps = [
        ("ploy", PolynomialFeatures(degree)),
        ("std_scaler", StandardScaler()),
        ("ridge_reg", Ridge(alpha=alpha)),
    ]
    return Pipeline(steps)


def RidgeRegressionTest(degree=2, alpha=0.0001):
    """Fit a polynomial ridge regression and visualize it.

    Reads the notebook-level ``x``, ``X`` and ``y``. Shows the samples with the
    fitted curve drawn in red, then prints the mean squared error measured on
    the training samples themselves.

    Args:
        degree: Polynomial degree forwarded to ``RidgeRegression``.
        alpha: Regularization strength forwarded to ``RidgeRegression``.
    """
    model = RidgeRegression(degree=degree, alpha=alpha)
    model.fit(X, y)
    fitted_on_samples = model.predict(X)

    plt.scatter(x, y)

    # Evaluate the model on an evenly spaced grid over [-3, 3] so the curve is
    # drawn left to right instead of following the sample order.
    grid = np.linspace(-3, 3, 100).reshape(-1, 1)
    curve = model.predict(grid)
    plt.plot(grid, curve, color='r')
    plt.axis([-3, 3, -1, 10])
    plt.show()

    # Training-set MSE.
    print(mean_squared_error(y, fitted_on_samples))

In [None]:
PolynomialRegressionTest(degree=100)  # plain polynomial regression, as the unregularized baseline

In [None]:
RidgeRegressionTest(degree=100, alpha=0.0001)  # ridge regression

In [None]:
# Same degree with a larger penalty (alpha=1).
RidgeRegressionTest(degree=100, alpha=1)

In [None]:
# Same degree with a much larger penalty (alpha=100).
RidgeRegressionTest(degree=100, alpha=100)

In [None]:
RidgeRegressionTest(degree=100, alpha=1000000)  # over-regularized

### Lasso回归
- 损失函数：MSE + alpha * np.sum(np.abs(theta))
- alpha越大，theta越小，正则化力度越大

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error


def LassoRegression(degree=2, alpha=0.01):
    """Build an unfitted polynomial lasso-regression pipeline.

    The pipeline expands the input into polynomial features up to ``degree``,
    standardizes them, and fits a ``Lasso`` model with strength ``alpha``.

    Args:
        degree: Highest polynomial degree passed to ``PolynomialFeatures``.
        alpha: Regularization strength passed to ``Lasso``.

    Returns:
        A ``Pipeline`` with steps "ploy", "std_scaler" and "lasso_reg".
    """
    # Step names are runtime keys of the pipeline, so "ploy" is kept as spelled.
    steps = [
        ("ploy", PolynomialFeatures(degree)),
        ("std_scaler", StandardScaler()),
        ("lasso_reg", Lasso(alpha=alpha)),
    ]
    return Pipeline(steps)


def LassoRegressionTest(degree=2, alpha=0.01):
    """Fit a polynomial lasso regression and visualize it.

    Reads the notebook-level ``x``, ``X`` and ``y``. Shows the samples with the
    fitted curve drawn in red, then prints the mean squared error measured on
    the training samples themselves.

    Args:
        degree: Polynomial degree forwarded to ``LassoRegression``.
        alpha: Regularization strength forwarded to ``LassoRegression``.
    """
    model = LassoRegression(degree=degree, alpha=alpha)
    model.fit(X, y)
    fitted_on_samples = model.predict(X)

    plt.scatter(x, y)

    # Evaluate the model on an evenly spaced grid over [-3, 3] so the curve is
    # drawn left to right instead of following the sample order.
    grid = np.linspace(-3, 3, 100).reshape(-1, 1)
    curve = model.predict(grid)
    plt.plot(grid, curve, color='r')
    plt.axis([-3, 3, -1, 10])
    plt.show()

    # Training-set MSE.
    print(mean_squared_error(y, fitted_on_samples))

In [None]:
PolynomialRegressionTest(degree=100)  # plain polynomial regression, as the unregularized baseline

In [None]:
# Lasso regression at the same degree, with a small penalty (alpha=0.01).
LassoRegressionTest(degree=100, alpha=0.01)

In [None]:
# Penalty increased to alpha=0.1.
LassoRegressionTest(degree=100, alpha=0.1)

In [None]:
# Penalty increased to alpha=1.
LassoRegressionTest(degree=100, alpha=1)

In [None]:
# Largest penalty tried (alpha=10).
LassoRegressionTest(degree=100, alpha=10)