# 模型选择

![ml_map](./ml_map.svg)

# 线性回归

相关理论部分可以参考本博客 “线性代数” tag 下的文章。

接下来的例子，线性回归预测房价。

In [1]:
# from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, LogisticRegression, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, classification_report, roc_auc_score
import joblib
import pandas as pd
import numpy as np

In [2]:
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.25, random_state=1)

print(x_train.shape)

(379, 13)


**fit_transform 用于训练集 (x_train)：**
- ``fit_transform`` 会先计算训练数据的均值和标准差（即“拟合”），然后用这些统计量对训练数据进行标准化（即“转换”）。

**transform 用于测试集 (x_test)：**
- ``transform`` 仅使用训练集计算得到的均值和标准差（即 fit 时保存的参数）来对测试数据进行标准化。
- 这保证了测试数据和训练数据使用相同的标准化规则，避免了数据泄漏（data leakage）。如果对测试集也用 ``fit_transform``，会基于测试数据重新计算均值和标准差，导致训练和测试数据的标准化不一致，模型评估可能失真。

In [3]:
std_x = StandardScaler()
x_train = std_x.fit_transform(x_train)
x_test = std_x.transform(x_test)

# 目标值进行了标准化
std_y = StandardScaler()
temp = y_train.reshape(-1, 1)
print(temp)

[[20.6]
 [23.1]
 [28. ]
 [20. ]
 [23.1]
 [25. ]
 [ 9.7]
 [23.9]
 [36.1]
 [13.4]
 [12.7]
 [39.8]
 [10.4]
 [20.6]
 [17.8]
 [19.5]
 [23.7]
 [28.5]
 [24.3]
 [23.8]
 [19.1]
 [28.4]
 [20.5]
 [33.8]
 [14.5]
 [20.4]
 [16. ]
 [13.3]
 [30.8]
 [27.5]
 [24.4]
 [24.4]
 [25.1]
 [43.8]
 [21.9]
 [26.2]
 [14.2]
 [20.8]
 [20.1]
 [23.1]
 [13.1]
 [16.2]
 [24.8]
 [20.2]
 [22.5]
 [14.8]
 [28.7]
 [20.1]
 [23.4]
 [32. ]
 [19.1]
 [50. ]
 [20.9]
 [21.7]
 [22. ]
 [17.2]
 [30.3]
 [12.3]
 [21.4]
 [20.5]
 [35.2]
 [19.6]
 [22. ]
 [21.7]
 [14.1]
 [21.1]
 [15. ]
 [11.9]
 [20. ]
 [41.3]
 [18.7]
 [50. ]
 [50. ]
 [18.4]
 [17.9]
 [28.1]
 [16.1]
 [17.2]
 [28.6]
 [23.6]
 [20.4]
 [19.6]
 [18.8]
 [22.6]
 [17.7]
 [30.5]
 [18.2]
 [20.6]
 [24.4]
 [17.3]
 [13.3]
 [22.8]
 [20.5]
 [21.2]
 [18.8]
 [18.9]
 [18.2]
 [23.1]
 [32.7]
 [24. ]
 [10.2]
 [19.5]
 [33.1]
 [13.4]
 [15.2]
 [24.8]
 [24.3]
 [ 9.5]
 [24.2]
 [18.5]
 [44. ]
 [50. ]
 [24.7]
 [21.5]
 [ 8.4]
 [21.8]
 [50. ]
 [23.8]
 [32.4]
 [24.4]
 [17.6]
 [29.8]
 [ 9.6]
 [16.7]
 [13.8]


In [4]:
# 目标值是一维的，这里需要传进去2维的
# [1, 4, 5]
# 变成
# [[1],
#  [4],
#  [5]]
y_train = std_y.fit_transform(y_train.reshape(-1, 1))  
y_test = std_y.transform(y_test.reshape(-1, 1))

y_train # 标准化后的房价

array([[-0.19582006],
       [ 0.0847902 ],
       [ 0.63478631],
       [-0.26316652],
       [ 0.0847902 ],
       [ 0.298054  ],
       [-1.41928078],
       [ 0.17458549],
       [ 1.54396354],
       [-1.0039776 ],
       [-1.08254847],
       [ 1.95926673],
       [-1.34070991],
       [-0.19582006],
       [-0.51010354],
       [-0.31928857],
       [ 0.15213666],
       [ 0.69090836],
       [ 0.21948313],
       [ 0.16336107],
       [-0.36418621],
       [ 0.67968395],
       [-0.20704447],
       [ 1.28580211],
       [-0.88050909],
       [-0.21826888],
       [-0.71214293],
       [-1.01520201],
       [ 0.9490698 ],
       [ 0.57866426],
       [ 0.23070754],
       [ 0.23070754],
       [ 0.30927841],
       [ 2.40824314],
       [-0.04990272],
       [ 0.43274692],
       [-0.91418232],
       [-0.17337123],
       [-0.25194211],
       [ 0.0847902 ],
       [-1.03765083],
       [-0.68969411],
       [ 0.27560518],
       [-0.2407177 ],
       [ 0.01744374],
       [-0

In [5]:
# estimator预测，正规方程进行线性回归
lr = LinearRegression()
lr.fit(x_train, y_train)

print('回归系数', lr.coef_)  # 回归系数可以看特征与目标之间的相关性

回归系数 [[-0.12026411  0.15044778  0.02951803  0.07470354 -0.28043353  0.22170939
   0.02190624 -0.35275513  0.29939558 -0.2028089  -0.23911894  0.06305081
  -0.45259462]]


In [6]:
y_predict = lr.predict(x_test)
# 预测测试集的房子价格，通过inverse得到真正的房子价格。否则是标准化后的房价
y_lr_predict = std_y.inverse_transform(y_predict)

# 保存训练好的模型
joblib.dump(lr, "./test.pkl")

print("正规方程测试集里面每个房子的预测价格：", y_lr_predict)
print("正规方程的均方误差：", mean_squared_error(y_test, y_predict))

正规方程测试集里面每个房子的预测价格： [[32.37816533]
 [27.95684437]
 [18.07213891]
 [21.63166556]
 [18.93029508]
 [19.96277202]
 [32.2834674 ]
 [18.06715668]
 [24.72989076]
 [26.85359369]
 [27.23326816]
 [28.57021239]
 [21.18778302]
 [26.94393815]
 [23.37892579]
 [20.89176865]
 [17.11746934]
 [37.73997945]
 [30.51980066]
 [ 8.44489436]
 [20.86557977]
 [16.21989418]
 [25.13605925]
 [24.77658813]
 [31.40497629]
 [11.02741407]
 [13.82097563]
 [16.80208261]
 [35.94637198]
 [14.7155729 ]
 [21.23939821]
 [14.15079469]
 [42.72492585]
 [17.83887162]
 [21.84610225]
 [20.40178099]
 [17.50287927]
 [27.00093206]
 [ 9.80760408]
 [20.00288662]
 [24.27066782]
 [21.06719021]
 [29.47089776]
 [16.48482565]
 [19.38852695]
 [14.54778282]
 [39.39838319]
 [18.09810655]
 [26.22164983]
 [20.60676525]
 [25.09994066]
 [24.48366723]
 [25.02297948]
 [26.84986898]
 [ 5.01517985]
 [24.12809513]
 [10.72843392]
 [26.83178157]
 [16.8023533 ]
 [35.48142073]
 [19.50937911]
 [27.43260347]
 [16.58016763]
 [19.151488  ]
 [10.9990262 ]
 [32.

# 加载保存的模型

In [7]:
model = joblib.load("./test.pkl")

# 因为目标值进行了标准化，一定要把预测后的值逆向转换回来
y_predict = model.predict(x_test)

print("保存的模型预测的结果：", y_predict)
print("正规方程的均方误差：", mean_squared_error(y_test, y_predict))
print("正规方程inverse后的均方误差：", mean_squared_error(std_y.inverse_transform(y_test),
                                                   std_y.inverse_transform(y_predict)))

保存的模型预测的结果： [[ 1.12620955]
 [ 0.62994234]
 [-0.47955756]
 [-0.08002168]
 [-0.38323459]
 [-0.26734514]
 [ 1.11558027]
 [-0.48011678]
 [ 0.26773583]
 [ 0.50610896]
 [ 0.54872518]
 [ 0.69878929]
 [-0.12984488]
 [ 0.51624959]
 [ 0.11609798]
 [-0.16307075]
 [-0.58671359]
 [ 1.72804157]
 [ 0.91761907]
 [-1.56015899]
 [-0.16601029]
 [-0.68746111]
 [ 0.31332585]
 [ 0.27297733]
 [ 1.01697482]
 [-1.27028638]
 [-0.95672557]
 [-0.62211389]
 [ 1.5267197 ]
 [-0.8563123 ]
 [-0.12405138]
 [-0.91970532]
 [ 2.28757241]
 [-0.50574043]
 [-0.05595243]
 [-0.21806897]
 [-0.54345359]
 [ 0.52264682]
 [-1.40720286]
 [-0.26284251]
 [ 0.21619076]
 [-0.14338071]
 [ 0.79988591]
 [-0.65772411]
 [-0.33180076]
 [-0.87514574]
 [ 1.91418761]
 [-0.47664284]
 [ 0.43517699]
 [-0.1950607 ]
 [ 0.30927175]
 [ 0.24009869]
 [ 0.30063331]
 [ 0.50569088]
 [-1.94512422]
 [ 0.20018782]
 [-1.30384514]
 [ 0.50366068]
 [-0.6220835 ]
 [ 1.47453167]
 [-0.31823582]
 [ 0.57109939]
 [-0.64702253]
 [-0.35840699]
 [-1.27347275]
 [ 1.08939349

In [8]:
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
mean_squared_error(y_true, y_pred)

0.375

In [9]:
#人工求均方误差
(np.square(3 - 2.5) + np.square(0.5) + 1) / 4

np.float64(0.375)

# 梯度下降

官方文档：
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html

```py
class sklearn.linear_model.SGDRegressor(loss='squared_error', *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=0.1, random_state=None, learning_rate='invscaling', eta0=0.01, power_t=0.25, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, warm_start=False, average=False)
```

SGD 调整的超参数：
- eta0: 学习率的初始值
- learning_rate: 学习率调整的方式
- early_stopping: 损失没有改进，提前停止训练
- penalty: 惩罚，分为 L1 和 L2
- alpha: 值越高，正则化力度越强

**L1 正则和 L2 的区别**：
- L1 正则化产生稀疏的权值, L2 正则化产生平滑的权值
- L1 正则化偏向于稀疏，它会自动进行**特征选择**，去掉一些没用的特征，也就是将这些特征对应的权重置为 0
- L2 主要功能是为了防止过拟合，当要求参数越小时，说明模型越简单，而模型越简单则，越趋向于平滑，从而防止过拟合

拓展：
[深入理解L1、L2正则化](https://www.cnblogs.com/zingp/p/10375691.html)
- 注意，**该文章数学推导的记法比较混乱**
- pdf 已在该 ipynb 文件所在的 github 仓库目录下备份

![pic1](./1.png)

正则化力度：
- 大，参数趋近于 0（高阶项系数趋于0）
- 小，参数变化小（高阶项权重没怎么变）

In [10]:
# 梯度下降去进行房价预测,数据量大要用这个
# 默认可以去调 eta0 = 0.008，会改变learning_rate
# learning_rate='optimal',alpha会影响学习率的值，由alpha来算学习率
sgd = SGDRegressor(eta0=0.008, penalty='l1', alpha=0.005)

# 训练
sgd.fit(x_train, y_train)

print('梯度下降的回归系数', sgd.coef_)

梯度下降的回归系数 [-0.08932943  0.08004267 -0.02474441  0.07664033 -0.1692973   0.27209708
 -0.00368455 -0.23444038  0.08832413 -0.0230712  -0.21839462  0.06705525
 -0.42362037]


  y = column_or_1d(y, warn=True)


In [11]:
# 预测测试集的房子价格
y_sgd_predict = std_y.inverse_transform(sgd.predict(x_test).reshape(-1, 1))
y_predict = sgd.predict(x_test)

print("梯度下降测试集里面每个房子的预测价格：", y_sgd_predict)
print("梯度下降的均方误差：", mean_squared_error(y_test, y_predict))
print("梯度下降的原始房价量纲均方误差：", mean_squared_error(std_y.inverse_transform(y_test), y_sgd_predict))

梯度下降测试集里面每个房子的预测价格： [[30.32391672]
 [28.32230152]
 [18.27315078]
 [22.59155785]
 [18.31734563]
 [20.78726532]
 [30.28638373]
 [18.67515813]
 [23.67291006]
 [26.96876343]
 [26.28062595]
 [29.54736346]
 [21.56481156]
 [25.87375006]
 [22.97275854]
 [19.36436317]
 [16.96231899]
 [37.94299475]
 [29.95044198]
 [ 9.76815991]
 [20.89432132]
 [17.44043789]
 [25.40570707]
 [25.24249596]
 [30.57208482]
 [10.62516533]
 [14.4816628 ]
 [19.46519839]
 [35.71953687]
 [13.84327476]
 [24.14493776]
 [14.80261213]
 [40.72452168]
 [18.00814526]
 [24.20256132]
 [20.98313239]
 [17.58344774]
 [28.26750522]
 [ 7.93931111]
 [19.38738047]
 [26.48739656]
 [22.14047884]
 [28.64006734]
 [15.41592659]
 [18.50232469]
 [15.01428712]
 [39.69118989]
 [17.54517206]
 [25.75502821]
 [20.91174696]
 [24.75367399]
 [24.58854189]
 [25.77782627]
 [26.46062541]
 [ 7.33672246]
 [24.13522963]
 [10.51127543]
 [26.50928589]
 [17.59076698]
 [35.84148585]
 [19.35906724]
 [27.56548269]
 [15.78684947]
 [17.96360506]
 [11.08460261]
 [31.

# 过拟合、欠拟合

![pic2](./2.png)

**过拟合**：在训练数据上能够获得更好的拟合，但是在训练数据外的数据集上却不能很好地拟合。（模型过于复杂）

**欠拟合**：在训练数据上不能获得更好的拟合，训练数据外的数据集上**也不能**很好地拟合数据（模型过于简单）

![pic3](./3.png)

**欠拟合**原因及解决办法：
- 原因：学习到数据的特征过少
- 解决方法：增加数据的特征数量，样本数

**过拟合**原因及解决办法：
- 原因：原始特征过多，存在一些嘈杂特征，模型过于复杂是因为模型尝试去兼顾各个测试数据点
- 解决方法
  - 进行特征选择，消除关联性大的特征（**很难做**）
  - 交叉验证（让所有数据都有过训练）
  - 正则化

特征选择：
- 过滤式：删除低方差特征
- 嵌入式：正则化，决策树，神经网络

# 岭回归

还是参考 [深入理解L1、L2正则化](https://www.cnblogs.com/zingp/p/10375691.html)，其中**线性回归L1正则化损失函数**就是对应**Lasso 回归**，**线性回归L2正则化损失函数**就是对应**岭回归**。

In [12]:
# 岭回归进行房价预测
rd = Ridge(alpha=0.05)

rd.fit(x_train, y_train)

print(rd.coef_)

[-0.12019408  0.15027489  0.02932631  0.07472724 -0.28019156  0.22179958
  0.0218258  -0.35250679  0.29879635 -0.20224632 -0.23906031  0.06305591
 -0.45246484]


In [13]:
# 预测测试集的房子价格
y_rd_predict = std_y.inverse_transform(rd.predict(x_test).reshape(-1, 1))
y_predict = rd.predict(x_test)
# print("岭回归里面每个房子的预测价格：", y_rd_predict)

print("岭回归的均方误差：", mean_squared_error(y_test, y_predict))
print("岭回归的均方误差：", mean_squared_error(std_y.inverse_transform(y_test), y_rd_predict))

岭回归的均方误差： 0.27588055100713943
岭回归的均方误差： 21.89747382596042


# 逻辑回归

```py
class sklearn.linear_model.LogisticRegression(penalty='l2', *, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='deprecated', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
```

官网链接： https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

## 原理部分

**逻辑回归是分类算法。**

逻辑回归是解决**二分类问题**的利器，经典应用有：
- 广告点击率
- 判断用户的性别
- 是否是垃圾邮件
- 是否患病
- 是否是诈骗
- 是否为虚假账号
- 预测用户是否会购买给定的商品类
- 判断一条评论是正面的还是负面的

本小节的截图来自 [文小刀是也](https://www.bilibili.com/video/BV1As411j7zw) 。

![pic4](./4.png)

![pic5](./5.png)

![pic6](./6.png)

![pic7](./7.png)

![pic8](./8.png)

## 实战

逻辑回归做二分类进行癌症预测（根据细胞的属性特征）。

In [14]:
# 构造列标签名字
column = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size',
          'Uniformity of Cell Shape', 'Marginal Adhesion',
          'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
          'Normal Nucleoli', 'Mitoses', 'Class']

# 读取数据
data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
    names=column)

print(data)

     Sample code number  Clump Thickness  Uniformity of Cell Size  \
0               1000025                5                        1   
1               1002945                5                        4   
2               1015425                3                        1   
3               1016277                6                        8   
4               1017023                4                        1   
..                  ...              ...                      ...   
694              776715                3                        1   
695              841769                2                        1   
696              888820                5                       10   
697              897471                4                        8   
698              897471                4                        8   

     Uniformity of Cell Shape  Marginal Adhesion  Single Epithelial Cell Size  \
0                           1                  1                            2   
1        

In [15]:
# 缺失值进行处理
data = data.replace(to_replace='?', value=np.nan)
# 直接删除，哪一行有空值，就删除对应的样本
data = data.dropna()
print(data)

     Sample code number  Clump Thickness  Uniformity of Cell Size  \
0               1000025                5                        1   
1               1002945                5                        4   
2               1015425                3                        1   
3               1016277                6                        8   
4               1017023                4                        1   
..                  ...              ...                      ...   
694              776715                3                        1   
695              841769                2                        1   
696              888820                5                       10   
697              897471                4                        8   
698              897471                4                        8   

     Uniformity of Cell Shape  Marginal Adhesion  Single Epithelial Cell Size  \
0                           1                  1                            2   
1        

In [16]:
# 进行数据的分割
x_train, x_test, y_train, y_test = train_test_split(data[column[1:10]], data[column[10]], test_size=0.25,random_state=1)

# 进行标准化处理
std = StandardScaler()

x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)

# 逻辑回归预测，C 是正则化力度
# solver = 'liblinear'
lg = LogisticRegression(C=0.8, solver='newton-cg')

lg.fit(x_train, y_train)
# 逻辑回归的权重参数，了解就行，因为我们做的是分类，而不是回归
print(lg.coef_)

[[1.18305349 0.1601104  0.86016915 0.6513607  0.03074468 1.16170175
  0.84023185 0.67711276 0.75036567]]


In [17]:
y_predict = lg.predict(x_test)
print(y_predict)

[2 2 2 4 2 4 2 2 4 4 2 2 4 2 2 4 2 2 2 2 2 2 2 4 4 2 2 2 4 4 2 2 4 4 2 4 2
 2 4 4 4 2 2 4 2 2 2 2 4 2 2 2 4 2 2 2 4 2 2 2 2 4 2 2 2 4 2 4 4 2 2 4 2 2
 4 2 2 2 2 2 2 2 4 2 4 4 2 2 2 4 2 2 4 2 2 4 4 2 2 4 2 2 4 4 2 2 2 2 4 2 4
 4 2 4 2 4 2 2 2 2 4 2 4 2 2 2 2 2 4 2 2 2 2 2 2 2 4 2 4 4 2 2 4 2 2 2 2 4
 2 2 2 2 2 4 2 4 2 4 2 2 4 2 4 2 4 4 2 4 2 2 2]


In [18]:
print("准确率：", lg.score(x_test, y_test))

准确率： 0.9824561403508771


In [19]:
print(lg.predict_proba(x_test))  # 得出对应分类的概率：良性、恶性

[[9.51943419e-01 4.80565808e-02]
 [9.95813075e-01 4.18692535e-03]
 [9.85423198e-01 1.45768020e-02]
 [2.48855893e-02 9.75114411e-01]
 [9.97893646e-01 2.10635369e-03]
 [4.48738201e-04 9.99551262e-01]
 [9.93155931e-01 6.84406898e-03]
 [9.93251622e-01 6.74837806e-03]
 [5.44370806e-04 9.99455629e-01]
 [4.26672626e-04 9.99573327e-01]
 [9.88986884e-01 1.10131156e-02]
 [9.97028700e-01 2.97129974e-03]
 [1.16596385e-03 9.98834036e-01]
 [7.52083407e-01 2.47916593e-01]
 [9.89627675e-01 1.03723253e-02]
 [2.13926850e-03 9.97860731e-01]
 [9.89357285e-01 1.06427146e-02]
 [9.24085404e-01 7.59145956e-02]
 [9.90622054e-01 9.37794623e-03]
 [9.49599635e-01 5.04003651e-02]
 [9.82501572e-01 1.74984282e-02]
 [9.97893646e-01 2.10635369e-03]
 [9.90367731e-01 9.63226884e-03]
 [4.56897429e-01 5.43102571e-01]
 [9.28926821e-02 9.07107318e-01]
 [9.92979851e-01 7.02014903e-03]
 [9.95141006e-01 4.85899423e-03]
 [9.95919375e-01 4.08062534e-03]
 [9.38633728e-03 9.90613663e-01]
 [3.77983719e-01 6.22016281e-01]
 [9.669141

In [20]:
# 为什么还要看下召回率，labels 和 target_names对应
# macro avg 平均值  weighted avg 加权平均值
print("召回率：", classification_report(y_test, y_predict, labels=[2, 4], target_names=["良性", "恶性"]))

召回率：               precision    recall  f1-score   support

          良性       0.97      1.00      0.99       111
          恶性       1.00      0.95      0.97        60

    accuracy                           0.98       171
   macro avg       0.99      0.97      0.98       171
weighted avg       0.98      0.98      0.98       171



In [21]:
# AUC计算要求是二分类，不需要是0和1
print("AUC指标：", roc_auc_score(y_test, y_predict))

AUC指标： 0.975


In [22]:
print(x_train.shape)
print(x_test.shape)


(512, 9)
(171, 9)
