### Sklearn中的多元线性回归算法实战


In [10]:
import numpy as np
from sklearn import datasets

# Load the Boston house-price dataset that ships with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older releases.
boston = datasets.load_boston()

# Keep only the samples whose target is below 50. A single boolean mask is
# applied to both arrays so feature rows and targets stay aligned.
# NOTE(review): presumably 50 is the value at which the target was capped -- confirm.
below_cap = boston.target < 50
X = boston.data[below_cap]
y = boston.target[below_cap]

In [2]:
# Hold out 20% of the samples as a test set; the fixed random_state makes the
# split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)


In [3]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split. fit() returns the estimator
# itself, so construction and fitting are chained.
estimator_reg = LinearRegression().fit(X_train, y_train)

# Evaluate on the held-out split (for regressors, score() is R^2).
estimator_reg.score(X_test, y_test)


0.8509323089087225

查看正规方程解相关数据

In [22]:
# Inspect the fitted coefficients (one per feature column)
estimator_reg.coef_


array([-1.00287922e-01,  3.26543388e-02, -4.68509409e-02,  1.02921936e+00,
       -1.15927300e+01,  3.59511341e+00, -2.53657709e-02, -1.16387369e+00,
        2.22080674e-01, -1.21981663e-02, -8.16231253e-01,  7.16938620e-03,
       -3.50243874e-01])

In [5]:
# Inspect the fitted intercept
estimator_reg.intercept_

32.205113210463736

In [19]:
# Linear regression is interpretable because every feature maps one-to-one to a coefficient
np.argsort(estimator_reg.coef_)  # order the coefficients from smallest to largest; returns an array of indices

array([ 4,  7, 10, 12,  0,  2,  6,  9, 11,  1,  8,  3,  5], dtype=int64)

In [20]:
# Feature names listed in order of ascending coefficient
ascending = np.argsort(estimator_reg.coef_)
boston.feature_names[ascending]

array(['NOX', 'DIS', 'PTRATIO', 'LSTAT', 'CRIM', 'INDUS', 'AGE', 'TAX',
       'B', 'ZN', 'RAD', 'CHAS', 'RM'], dtype='<U7')

In [14]:
# Show the description text bundled with the dataset; print() renders its line breaks
print(boston.DESCR)


.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

根据波士顿房价数据集的描述,影响房价的特征共有13个;`np.argsort(estimator_reg.coef_)` 按系数从小到大给出特征下标,可以粗略反映各特征的影响方向和程度(注意:这里的特征未做标准化,系数大小会受各特征量纲的影响)

estimator_reg.coef_是有正有负的,正数表示正相关,负数表示负相关

解释部分排序:

In [29]:
print(np.argsort(estimator_reg.coef_))  # coefficient indices, smallest to largest

# Column indices of the features discussed here, in the order they are printed.
explained_columns = (
    # --- Negative coefficients ---
    # NOX: nitric oxides concentration (parts per 10 million).
    # The higher the concentration, the lower the price.
    4,
    # DIS: weighted distances to five Boston employment centres.
    # The shorter the distance, the higher the price.
    7,
    # PTRATIO: pupil-teacher ratio by town.
    # The ratio moves against the price; the author's reading is that areas
    # with richer educational resources have lower ratios and dearer housing.
    10,
    # --- Positive coefficients ---
    # CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
    # Homes bounding the river are priced higher.
    3,
    # RM: average number of rooms per dwelling.
    # The more rooms, the higher the price.
    5,
)

# One line per feature: its coefficient followed by its name.
for column in explained_columns:
    print(estimator_reg.coef_[column], boston.feature_names[column])



[ 4  7 10 12  0  2  6  9 11  1  8  3  5]
-11.592729992518393 NOX
-1.1638736929941789 DIS
-0.8162312532894358 PTRATIO
1.0292193644381193 CHAS
3.5951134076632307 RM


### 线性回归算法总结
 
- 线性回归是典型的参数学习算法,需要在训练过程中求解合适的模型参数;相比之下,kNN属于非参数学习

- 只能解决回归问题

- 在很多分类方法中,线性回归是基础,如逻辑回归

- 虽然还有MSE、RMSE、MAE等指标,但R方($R^2$)是最常用的模型评估方式

- 在使用线性回归算法时,对数据是有一定假设的,即特征与目标值之间存在一定的线性关系

- 对结果具有强解释性(每个特征都对应一个系数)

对比kNN, kNN既能解决分类问题,还能解决回归问题

### 使用kNN 解决回归问题

In [6]:
# Standardize the features before running the distance-based kNN model
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only (fit() returns the
# scaler itself), then apply that same transformation to both splits.
standard_scaler = StandardScaler().fit(X_train)
X_train_standard = standard_scaler.transform(X_train)
X_test_standard = standard_scaler.transform(X_test)

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

# Hyper-parameter grid: try k = 1..10 for both weighting schemes, and for
# distance weighting additionally vary the Minkowski exponent p = 1..5
# (10 + 10 * 5 = 60 candidates in total).
k = list(range(1, 11))
params_grid = [
    {'weights': ['uniform'], 'n_neighbors': k},
    {'weights': ['distance'], 'n_neighbors': k, 'p': list(range(1, 6))},
]

estimator_knn = KNeighborsRegressor()

# Cross-validated search over the grid on the standardized training data;
# n_jobs=-1 runs the fits in parallel and verbose=1 logs progress.
grid_search_cv = GridSearchCV(
    estimator_knn,
    params_grid,
    n_jobs=-1,
    verbose=1,
)
grid_search_cv.fit(X_train_standard, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    1.3s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=None,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'weights': ['uniform']},
                         {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [30]:
# Hyper-parameters of the best candidate found by the grid search
grid_search_cv.best_params_

{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}

In [None]:
# Best cross-validation score reached during the grid search
grid_search_cv.best_score_

In [9]:
# Take the best estimator found by the grid search
estimator_knn = grid_search_cv.best_estimator_
# Evaluate it on the held-out, standardized test split
estimator_knn.score(X_test_standard, y_test)

0.8622811965162893