# 美國波士頓房價問題（回歸預測使用 SVR）

使用不同核函數的 Support Vector Machine Regression 來解回歸問題。

我們使用的資料集是 sklearn 內建的波士頓房價資料集。

## Step1 載入資料集

In [1]:
# Load the Boston house-price dataset that ships with sklearn.datasets
# NOTE(review): load_boston is no longer available in recent scikit-learn
# releases — confirm the installed version still provides it.
from sklearn.datasets import load_boston
boston = load_boston()

In [2]:
# Display the loaded dataset object so its fields (data, feature_names, target, ...) can be inspected
boston

 'data': array([[  6.32000000e-03,   1.80000000e+01,   2.31000000e+00, ...,
           1.53000000e+01,   3.96900000e+02,   4.98000000e+00],
        [  2.73100000e-02,   0.00000000e+00,   7.07000000e+00, ...,
           1.78000000e+01,   3.96900000e+02,   9.14000000e+00],
        [  2.72900000e-02,   0.00000000e+00,   7.07000000e+00, ...,
           1.78000000e+01,   3.92830000e+02,   4.03000000e+00],
        ..., 
        [  6.07600000e-02,   0.00000000e+00,   1.19300000e+01, ...,
           2.10000000e+01,   3.96900000e+02,   5.64000000e+00],
        [  1.09590000e-01,   0.00000000e+00,   1.19300000e+01, ...,
           2.10000000e+01,   3.93450000e+02,   6.48000000e+00],
        [  4.74100000e-02,   0.00000000e+00,   1.19300000e+01, ...,
           2.10000000e+01,   3.96900000e+02,   7.88000000e+00]]),
 'feature_names': array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
        'TAX', 'PTRATIO', 'B', 'LSTAT'],
       dtype='|S7'),
 'target': array([ 24. ,  21.6, 

In [3]:
# Import train_test_split from sklearn.model_selection to split the data.
# The legacy sklearn.cross_validation module has been removed from
# scikit-learn; sklearn.model_selection is the module the cross-validation
# cells of this notebook already import from.
from sklearn.model_selection import train_test_split
X = boston.data
y = boston.target
# Hold out 25% of the samples as the test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33, test_size=0.25)



## Step2 資料前置處理（標準化）

In [4]:
from sklearn.preprocessing import StandardScaler
# The author notes that the target house prices differ widely from one
# another, so the targets are standardized in addition to the features.
# Keep a reference to the unscaled test targets before y_test is rebound.
original_y_test = y_test
# Feature scaler: statistics are learned from the training split only and
# then applied to both splits.
ss_X = StandardScaler().fit(X_train)
X_train = ss_X.transform(X_train)
X_test = ss_X.transform(X_test)
# Target scaler: StandardScaler works on 2-D input, hence the reshape of
# the target vectors into single-column arrays.
ss_y = StandardScaler().fit(y_train.reshape(-1, 1))
y_train = ss_y.transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))

In [5]:
# Flatten the single-column scaled targets back to 1-D vectors for the estimators
y_train, y_test = y_train.ravel(), y_test.ravel()
# Show the standardized training targets
y_train

array([  1.17202994e+00,  -2.82736122e-01,  -1.37111665e+00,
        -9.95433590e-02,  -1.85751718e-01,   1.37529628e-01,
         6.54779782e-01,  -2.41110450e-02,  -1.21095449e-01,
         1.29056643e+00,   2.45290077e-01,  -2.18079853e-01,
        -3.79720526e-01,  -3.15064256e-01,  -9.61626948e-01,
         7.73316275e-01,   1.09659762e+00,  -2.50407987e-01,
        -3.48870898e-02,   1.12892576e+00,  -3.04288212e-01,
        -1.33878852e+00,  -7.89210231e-01,  -6.59897692e-01,
        -3.48870898e-02,   1.25823829e+00,  -3.36616346e-01,
        -5.52137243e-01,  -9.95433590e-02,  -5.95241423e-01,
        -8.10762320e-01,  -6.70673737e-01,  -3.48870898e-02,
        -8.43090455e-01,  -1.64199628e-01,  -8.21538365e-01,
        -1.67284591e+00,  -9.72402993e-01,  -5.73689333e-01,
         2.23737987e-01,   2.56213972e+00,  -6.27569558e-01,
         1.89931345e-02,  -4.55152840e-01,  -1.10319404e-01,
        -4.33600750e-01,   4.06930750e-01,  -6.72152244e-02,
         2.23737987e-01,

## Step3 訓練與預測

In [6]:
from sklearn.svm import SVR
# Fit an SVR with a linear kernel on the standardized training data
# (fit returns the estimator itself), then predict the standardized
# targets of the test set.
linear_svr = SVR(kernel='linear').fit(X_train, y_train)
linear_svr_y_predict = linear_svr.predict(X_test)
print(linear_svr_y_predict)

[-0.34500432 -1.23533524 -1.25462523 -0.60293702 -0.0330217  -0.18556008
  1.1177844  -0.81048078  0.28485659 -0.19413807  0.3606654   1.24166678
 -1.82134886  0.09026404 -1.74174867  0.11498961 -0.69491951 -0.63922632
  0.91124334 -0.19623126 -1.29095911 -0.7989655  -0.89402744 -0.70292723
  0.95860969 -1.21944654  0.13513391  0.17461006 -1.30693244  1.09427519
 -0.83218209  0.25379401 -2.07128713 -0.57200772  0.57749427  1.09089651
  0.23918729 -1.87410259 -0.52459263  0.39416606 -0.77055417 -1.14951794
  0.83452979 -1.06221481  0.56496771 -0.35946162 -0.47577697 -0.56424051
  0.27454263 -0.4052508  -0.66780617  1.27431825 -1.31270032 -0.91793512
  0.12692247 -0.7541031   0.25327483 -1.3342555  -0.14814413  0.07663348
 -0.5470557  -0.54128125  1.3879567  -0.13950679 -0.6226309   0.20033115
  0.48880067 -2.55276515 -0.95650778  0.66432403 -0.16331456 -0.52197557
 -1.18417151  0.15797875 -0.60208625  1.61793398 -0.46950632 -2.3809819
 -0.72385642  0.23726564 -0.17150121 -1.45258133 -0.

In [7]:
# Fit an SVR with a polynomial kernel on the standardized training data
# and predict the standardized targets of the test set.
poly_svr = SVR(kernel='poly').fit(X_train, y_train)
poly_svr_y_predict = poly_svr.predict(X_test)

In [8]:
# Fit an SVR with a radial basis function (RBF) kernel on the standardized
# training data and predict the standardized targets of the test set.
rbf_svr = SVR(kernel='rbf').fit(X_train, y_train)
rbf_svr_y_predict = rbf_svr.predict(X_test)

## Step4 評估

In [9]:
# Evaluate the linear-kernel SVR on the test set
from sklearn.metrics import r2_score, mean_squared_error
# The estimator's default score and r2_score are both computed on the standardized targets
print('The value of default measurement of Linear SVR is', linear_svr.score(X_test, y_test))
print('The value of R-squared of Linear SVR is', r2_score(y_test, linear_svr_y_predict))
# Map targets and predictions back to the original price scale before
# computing the MSE. inverse_transform expects 2-D input, so the 1-D
# vectors are reshaped into single-column arrays first.
print('The mean squared error of Linear SVR is', mean_squared_error(ss_y.inverse_transform(y_test.reshape(-1, 1)), ss_y.inverse_transform(linear_svr_y_predict.reshape(-1, 1))))

('The value of default measurement of Linear SVR is', 0.65171709742960804)
('The value of R-squared of Linear SVR is', 0.65171709742960804)
('The mean squared error of Linear SVR is', 27.006307139324299)


In [10]:
# SVR polynomial kernel 評估準確度
print('The value of default measurement of Poly SVR is', poly_svr.score(X_test, y_test))
print('The value of R-squared of Poly SVR is', r2_score(y_test, poly_svr_y_predict))
print('The mean squared error of Poly SVR is', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(poly_svr_y_predict)))

('The value of default measurement of Poly SVR is', 0.40445405800289286)
('The value of R-squared of Poly SVR is', 0.40445405800289291)
('The mean squared error of Poly SVR is', 46.179403313952299)


In [11]:
# SVR Radial basis function kernel 評估準確度
print('The value of default measurement of RBF SVR is', rbf_svr.score(X_test, y_test))
print('The value of R-squared of RBF SVR is', r2_score(y_test, rbf_svr_y_predict))
print('The mean squared error of RBF SVR is', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rbf_svr_y_predict)))

('The value of default measurement of RBF SVR is', 0.75640689122739346)
('The value of R-squared of RBF SVR is', 0.75640689122739346)
('The mean squared error of RBF SVR is', 18.888525000753493)


## Cross Validation 技巧

In [12]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of an RBF SVR with default hyper-parameters on the training split
rbf_svr1 = SVR(kernel='rbf')
scores = cross_val_score(estimator=rbf_svr1, X=X_train, y=y_train, cv=5)
print(scores)
# NOTE(review): no scoring argument is passed, so the values come from the
# estimator's default score (reported as equal to R-squared in the
# evaluation cells) — the "Accuracy" label is a misnomer for a regressor.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), 2 * scores.std()))

[ 0.84721361  0.8060778   0.71554159  0.86009238  0.86583531]
Accuracy: 0.82 (+/- 0.11)


In [13]:
# 5-fold cross-validation of an RBF SVR with C=2 on the training split
rbf_svr2 = SVR(kernel='rbf', C=2)
scores = cross_val_score(estimator=rbf_svr2, X=X_train, y=y_train, cv=5)
print(scores)
# Mean fold score with a two-standard-deviation spread
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), 2 * scores.std()))

[ 0.80909537  0.82899498  0.75576345  0.90190047  0.8935655 ]
Accuracy: 0.84 (+/- 0.11)


In [14]:
# 5-fold cross-validation of an RBF SVR with C=3 on the training split
rbf_svr3 = SVR(kernel='rbf', C=3)
scores = cross_val_score(estimator=rbf_svr3, X=X_train, y=y_train, cv=5)
print(scores)
# Mean fold score with a two-standard-deviation spread
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), 2 * scores.std()))

[ 0.84291141  0.83911851  0.79038852  0.92475129  0.90329718]
Accuracy: 0.86 (+/- 0.10)


In [15]:
# Refit the RBF SVR with C=3 on the full training split and evaluate it on the test set
rbf_svr = SVR(kernel = 'rbf', C=3)
rbf_svr.fit(X_train, y_train)
rbf_svr_y_predict = rbf_svr.predict(X_test)
# The estimator's default score and r2_score are both computed on the standardized targets
print('The value of default measurement of RBF SVR is', rbf_svr.score(X_test, y_test))
print('The value of R-squared of RBF SVR is', r2_score(y_test, rbf_svr_y_predict))
# Map targets and predictions back to the original price scale before
# computing the MSE. inverse_transform expects 2-D input, so the 1-D
# vectors are reshaped into single-column arrays first.
print('The mean squared error of RBF SVR is', mean_squared_error(ss_y.inverse_transform(y_test.reshape(-1, 1)), ss_y.inverse_transform(rbf_svr_y_predict.reshape(-1, 1))))

('The value of default measurement of RBF SVR is', 0.78068864696167373)
('The value of R-squared of RBF SVR is', 0.78068864696167384)
('The mean squared error of RBF SVR is', 17.005686226864828)


In [16]:
# 網格搜索
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
pipls = Pipeline([('svr', SVR())])
parameters = {'svr__kernel': ['rbf', 'linear'], 'svr__C': [0.01, 0.1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
gs = GridSearchCV(pipls, parameters, verbose=2, refit=True, cv=5)
%time _= gs.fit(X_train,y_train)
gs.best_params_, gs.best_score_

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] svr__kernel=rbf, svr__C=0.01 ....................................
[CV] ........................... svr__kernel=rbf, svr__C=0.01 -   0.0s
[CV] svr__kernel=rbf, svr__C=0.01 ....................................
[CV] ........................... svr__kernel=rbf, svr__C=0.01 -   0.0s
[CV] svr__kernel=rbf, svr__C=0.01 ....................................
[CV] ........................... svr__kernel=rbf, svr__C=0.01 -   0.0s
[CV] svr__kernel=rbf, svr__C=0.01 ....................................
[CV] ........................... svr__kernel=rbf, svr__C=0.01 -   0.0s
[CV] svr__kernel=rbf, svr__C=0.01 ....................................
[CV] ........................... svr__kernel=rbf, svr__C=0.01 -   0.0s
[CV] svr__kernel=linear, svr__C=0.01 .................................
[CV] ........................ svr__kernel=linear, svr__C=0.01 -   0.0s
[CV] svr__kernel=linear, svr__C=0.01 .................................
[CV] ..........

[CV] ........................... svr__kernel=linear, svr__C=4 -   0.1s
[CV] svr__kernel=rbf, svr__C=5 .......................................
[CV] .............................. svr__kernel=rbf, svr__C=5 -   0.0s
[CV] svr__kernel=rbf, svr__C=5 .......................................
[CV] .............................. svr__kernel=rbf, svr__C=5 -   0.0s
[CV] svr__kernel=rbf, svr__C=5 .......................................
[CV] .............................. svr__kernel=rbf, svr__C=5 -   0.0s
[CV] svr__kernel=rbf, svr__C=5 .......................................
[CV] .............................. svr__kernel=rbf, svr__C=5 -   0.0s
[CV] svr__kernel=rbf, svr__C=5 .......................................
[CV] .............................. svr__kernel=rbf, svr__C=5 -   0.0s
[CV] svr__kernel=linear, svr__C=5 ....................................
[CV] ........................... svr__kernel=linear, svr__C=5 -   0.0s
[CV] svr__kernel=linear, svr__C=5 ....................................
[CV] .

[CV] .......................... svr__kernel=linear, svr__C=10 -   0.1s
[CV] svr__kernel=linear, svr__C=10 ...................................
[CV] .......................... svr__kernel=linear, svr__C=10 -   0.1s
CPU times: user 4.66 s, sys: 216 ms, total: 4.88 s
Wall time: 4.82 s


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    4.8s finished


({'svr__C': 7, 'svr__kernel': 'rbf'}, 0.879559306603956)

In [17]:
# Score the grid search (constructed with refit=True) on the held-out test split
print(gs.score(X_test, y_test))

0.794745841287
