# 美國波士頓房價問題（回歸預測）

使用 Linear Regression、Stochastic Gradient Descent Regression 來解回歸預測問題。

我們使用的資料集是 sklearn 內建的波士頓房價資料集。

## Step1 載入資料集

In [1]:
# Load the Boston house-price dataset from sklearn.datasets.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn release — confirm the version.
from sklearn.datasets import load_boston
boston = load_boston()
# DESCR holds the dataset's human-readable description.
print(boston.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [2]:
# Display the loaded dataset object (its data, feature_names and target arrays).
boston

 'data': array([[  6.32000000e-03,   1.80000000e+01,   2.31000000e+00, ...,
           1.53000000e+01,   3.96900000e+02,   4.98000000e+00],
        [  2.73100000e-02,   0.00000000e+00,   7.07000000e+00, ...,
           1.78000000e+01,   3.96900000e+02,   9.14000000e+00],
        [  2.72900000e-02,   0.00000000e+00,   7.07000000e+00, ...,
           1.78000000e+01,   3.92830000e+02,   4.03000000e+00],
        ..., 
        [  6.07600000e-02,   0.00000000e+00,   1.19300000e+01, ...,
           2.10000000e+01,   3.96900000e+02,   5.64000000e+00],
        [  1.09590000e-01,   0.00000000e+00,   1.19300000e+01, ...,
           2.10000000e+01,   3.93450000e+02,   6.48000000e+00],
        [  4.74100000e-02,   0.00000000e+00,   1.19300000e+01, ...,
           2.10000000e+01,   3.96900000e+02,   7.88000000e+00]]),
 'feature_names': array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
        'TAX', 'PTRATIO', 'B', 'LSTAT'],
       dtype='|S7'),
 'target': array([ 24. ,  21.6, 

In [3]:
# Split the data into training and test sets with train_test_split.
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer
# sklearn.model_selection and fall back only on releases that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
X = boston.data
y = boston.target
# Hold out 25% of the samples for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33, test_size=0.25)



In [4]:
# Inspect the range and mean of the regression target values.
import numpy as np
print("The max target value is", np.max(boston.target))
print("The min target value is", np.min(boston.target))
print("The average target value is", np.mean(boston.target))

('The max target value is', 50.0)
('The min target value is', 5.0)
('The average target value is', 22.532806324110677)


## Step2 資料前置處理（標準化）

In [5]:
# The summary statistics above show a wide spread in target prices, so the
# target values are standardized as well as the features.
# Keep the unscaled test targets; y_test is rebound (not mutated) below.
original_y_test = y_test
from sklearn.preprocessing import StandardScaler
ss_X = StandardScaler()
ss_y = StandardScaler()
# Fit each scaler on the training split only, then reuse it for the test split.
X_train = ss_X.fit_transform(X_train)
X_test = ss_X.transform(X_test)
# reshape(-1, 1) turns the 1-D targets into a single column for the scaler.
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))

In [6]:
# Inspect the standardized training targets (still a single-column 2-D array).
y_train

array([[  1.17202994e+00],
       [ -2.82736122e-01],
       [ -1.37111665e+00],
       [ -9.95433590e-02],
       [ -1.85751718e-01],
       [  1.37529628e-01],
       [  6.54779782e-01],
       [ -2.41110450e-02],
       [ -1.21095449e-01],
       [  1.29056643e+00],
       [  2.45290077e-01],
       [ -2.18079853e-01],
       [ -3.79720526e-01],
       [ -3.15064256e-01],
       [ -9.61626948e-01],
       [  7.73316275e-01],
       [  1.09659762e+00],
       [ -2.50407987e-01],
       [ -3.48870898e-02],
       [  1.12892576e+00],
       [ -3.04288212e-01],
       [ -1.33878852e+00],
       [ -7.89210231e-01],
       [ -6.59897692e-01],
       [ -3.48870898e-02],
       [  1.25823829e+00],
       [ -3.36616346e-01],
       [ -5.52137243e-01],
       [ -9.95433590e-02],
       [ -5.95241423e-01],
       [ -8.10762320e-01],
       [ -6.70673737e-01],
       [ -3.48870898e-02],
       [ -8.43090455e-01],
       [ -1.64199628e-01],
       [ -8.21538365e-01],
       [ -1.67284591e+00],
 

In [7]:
# Flatten both standardized target arrays from column vectors to 1-D, then
# show the flattened training targets.
y_train, y_test = y_train.ravel(), y_test.ravel()
y_train

array([  1.17202994e+00,  -2.82736122e-01,  -1.37111665e+00,
        -9.95433590e-02,  -1.85751718e-01,   1.37529628e-01,
         6.54779782e-01,  -2.41110450e-02,  -1.21095449e-01,
         1.29056643e+00,   2.45290077e-01,  -2.18079853e-01,
        -3.79720526e-01,  -3.15064256e-01,  -9.61626948e-01,
         7.73316275e-01,   1.09659762e+00,  -2.50407987e-01,
        -3.48870898e-02,   1.12892576e+00,  -3.04288212e-01,
        -1.33878852e+00,  -7.89210231e-01,  -6.59897692e-01,
        -3.48870898e-02,   1.25823829e+00,  -3.36616346e-01,
        -5.52137243e-01,  -9.95433590e-02,  -5.95241423e-01,
        -8.10762320e-01,  -6.70673737e-01,  -3.48870898e-02,
        -8.43090455e-01,  -1.64199628e-01,  -8.21538365e-01,
        -1.67284591e+00,  -9.72402993e-01,  -5.73689333e-01,
         2.23737987e-01,   2.56213972e+00,  -6.27569558e-01,
         1.89931345e-02,  -4.55152840e-01,  -1.10319404e-01,
        -4.33600750e-01,   4.06930750e-01,  -6.72152244e-02,
         2.23737987e-01,

## Step3 訓練與預測

In [8]:
# Fit a Linear Regression model on the standardized training data and predict
# the standardized test targets.
from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so construction and fitting are chained.
lr = LinearRegression().fit(X_train, y_train)
lr_y_predict = lr.predict(X_test)
print(lr_y_predict)

[ -2.83526824e-01  -1.26091979e+00  -9.83920602e-01  -5.12380538e-01
  -3.31528710e-02  -2.35525258e-01   1.50485678e+00  -8.89019080e-01
   3.81387076e-02  -6.74529800e-02   2.78621030e-01   1.48469365e+00
  -1.89913987e+00   2.69918875e-01  -1.27402149e+00   9.70427270e-02
  -5.76908677e-01  -3.95949308e-01   1.01228575e+00  -4.61324119e-02
  -1.00385430e+00  -3.41726582e-01  -4.98786961e-01  -4.85354534e-01
   1.20215702e+00  -8.07974872e-01   2.61750194e-01   2.17152707e-01
  -1.21822603e+00   1.26614654e+00  -6.92497131e-01   3.19638975e-01
  -1.93379894e+00  -7.34736381e-01   7.62482696e-01   1.16073959e+00
   2.46903115e-01  -1.92202911e+00  -3.03167668e-01   6.42952958e-01
  -5.67166478e-01  -9.95483406e-01   8.04224449e-01  -7.72138573e-01
   8.07342536e-01  -2.77430949e-01  -1.34742793e-01  -6.10703462e-01
   1.41597269e-01  -1.86089726e-01  -5.95558666e-01   1.42831873e+00
  -1.23633688e+00  -7.05100426e-01   1.87003305e-01  -9.30323385e-01
   2.79891811e-01  -8.40003155e-01



In [9]:
# Fit an SGD regressor (constructor defaults) on the standardized training
# data and predict the standardized test targets.
from sklearn.linear_model import SGDRegressor
# fit() returns the estimator itself, so construction and fitting are chained.
sgdr = SGDRegressor().fit(X_train, y_train)
sgdr_y_predict = sgdr.predict(X_test)
print(sgdr_y_predict)

[ -5.29049875e-01  -1.22706714e+00  -1.06543242e+00  -4.81187520e-01
  -2.04442781e-02  -2.15859789e-01   1.36165928e+00  -7.80816587e-01
   2.84380406e-01  -3.31782686e-01   3.74560830e-01   1.45144038e+00
  -1.80754374e+00   2.44655801e-01  -1.41126085e+00   1.51605854e-01
  -5.36817652e-01  -4.61589496e-01   9.83389716e-01  -7.76320839e-02
  -9.95491378e-01  -5.05396500e-01  -6.47855271e-01  -3.56636161e-01
   1.03678290e+00  -9.47824029e-01   2.35609597e-01   2.24694959e-01
  -1.18555762e+00   1.27342697e+00  -7.58818631e-01   3.67119915e-01
  -1.98843564e+00  -4.36589392e-01   8.54078570e-01   1.01071622e+00
   2.55215473e-01  -2.02014549e+00  -3.79964236e-01   5.96955166e-01
  -6.20475971e-01  -9.89543731e-01   9.89267416e-01  -9.30538197e-01
   7.11897674e-01  -2.69038755e-01   1.12181189e-01  -6.33009749e-01
   2.69977817e-01  -3.02833251e-01  -5.84741534e-01   1.48068056e+00
  -1.02330308e+00  -6.35879980e-01   2.15896642e-01  -6.30012195e-01
   3.58644953e-01  -1.00721662e+00



In [10]:
# Show the test targets in their original (unstandardized) units for comparison.
original_y_test

array([ 20.5,   5.6,  13.4,  12.6,  21.2,  19.7,  32.4,  14.8,  33. ,
        21.4,  30.1,  36. ,   8.4,  21.6,  16.3,  23. ,  14.9,  14.1,
        31.1,  11.9,  12.7,  27.9,  20.8,  19.6,  32. ,  21.9,  23.2,
        23.8,  10.8,  34.9,  19.1,  26.5,  10.5,  17.5,  24. ,  36.1,
        25.3,  13.8,  27.5,  24.6,  12.7,   9.5,  32.7,  13.8,  23.5,
        17.7,  15.6,  22.5,  26.2,  20.6,  14.1,  33.3,  15.2,  14.9,
        21.6,  17.2,  23.1,  11.7,  20.6,  22.2,  23.1,  18.4,  43.8,
        21.1,  14.9,  28.7,  23.3,  13.8,  19.7,  30.5,  19. ,  19.1,
        19. ,  26.6,  17.5,  21.9,  13.8,   8.8,  19.4,  28.1,  21. ,
        11.8,   7.2,  24.1,  20. ,  18.9,  50. ,  13.3,  50. ,  41.3,
        28.7,  19.9,  16.5,  10.9,  13.4,  32.9,  20.6,  25. ,  19.5,
        19.9,  15.4,  21.7,  31.5,  27.1,   8.3,  13.6,   8.8,  22.5,
         7.5,  28.6,  50. ,  11.5,  13.5,  24.4,  36.2,  21.4,  18.5,
        22.6,  24.8,  19.3,  29.8,  16.4,   8.4,  24.7,  20.1,  13.1,  35.2])

In [11]:
# Eyeball the Linear Regression predictions after mapping them back through
# ss_y to the original target scale.
ss_y.inverse_transform(lr_y_predict.reshape(-1, 1)).ravel()

array([ 20.29266241,  11.22260952,  13.79311841,  18.16893596,
        22.61609328,  20.73810938,  36.88858075,  14.67378958,
        23.2776679 ,  22.29779367,  25.509306  ,  36.70147008,
         5.30002765,  25.42855137,  11.10102787,  23.82428793,
        17.57012499,  19.24939945,  32.31759913,  22.49564518,
        13.6081368 ,  19.75257782,  18.29508222,  18.41973303,
        34.07957463,  15.42586708,  25.3527473 ,  24.93888963,
        11.61880094,  34.6733873 ,  16.49748234,  25.88994608,
         4.97839697,  16.10550875,  29.99946609,  33.69522742,
        25.21496874,   5.08761915,  20.11039846,  28.89024894,
        17.66053107,  13.6858174 ,  30.38682299,  15.7584223 ,
        30.41575834,  20.34923116,  21.67335479,  17.25651471,
        24.23774707,  21.19686334,  17.39705604,  36.17831967,
        11.45073509,  16.38052575,  24.65910793,  14.29049214,
        25.52109864,  15.12864966,  22.57296032,  23.75118969,
        16.89538022,  18.80530107,  35.6670707 ,  22.07

In [12]:
# Eyeball the SGD Regressor predictions after mapping them back through
# ss_y to the original target scale.
ss_y.inverse_transform(sgdr_y_predict.reshape(-1, 1)).ravel()

array([ 18.01424714,  11.53675679,  13.03670152,  18.45840226,
        22.73402702,  20.92060184,  35.55973045,  15.67789169,
        25.5627521 ,  19.8448556 ,  26.39961224,  36.39288494,
         6.15002531,  25.19411402,   9.82746817,  24.33062516,
        17.9421634 ,  18.64026884,  32.04944846,  22.20333318,
        13.68574342,  18.23374674,  16.91175177,  19.61421927,
        32.54492883,  14.12808892,  25.11016667,  25.00888055,
        11.92195858,  34.74094855,  15.88202926,  26.33056172,
         4.47137694,  18.87226587,  30.84946138,  32.30303407,
        25.29210611,   4.17711458,  19.39773841,  28.46339667,
        17.16582737,  13.74093665,  32.10399259,  14.28849872,
        29.53004505,  20.4271094 ,  23.96477047,  17.04951588,
        25.42909834,  20.1135018 ,  17.49743732,  36.66422921,
        13.42765517,  17.02288059,  24.92723355,  17.07733271,
        26.2519154 ,  13.57693503,  22.27898728,  24.05045462,
        17.41631537,  18.82654599,  36.27956208,  21.04

## Step4 評估

In [13]:
# Evaluate Linear Regression with the estimator's built-in default score on
# the standardized test split.
print('The value of default measurement of LinearRegression is', lr.score(X_test, y_test))

('The value of default measurement of LinearRegression is', 0.67634038309987021)


In [14]:
# Evaluate Linear Regression with r2_score and mean_squared_error.
from sklearn.metrics import r2_score, mean_squared_error
# R-squared is computed directly on the standardized targets and predictions.
print('The value of R-squared of LinearRegression is', r2_score(y_test, lr_y_predict))
# MSE is reported in the original target units, so both arrays are mapped back
# through ss_y first. y_test and lr_y_predict are 1-D at this point, while the
# scaler works on a single column, hence the reshape(-1, 1) before each
# inverse_transform call.
print('The mean squared error of LinearRegression is', mean_squared_error(
    ss_y.inverse_transform(y_test.reshape(-1, 1)),
    ss_y.inverse_transform(lr_y_predict.reshape(-1, 1))))

('The value of R-squared of LinearRegression is', 0.67634038309987021)
('The mean squared error of LinearRegression is', 25.096985692067715)


In [15]:
# Evaluate the SGD Regressor: built-in default score, R-squared, and MSE.
print('The value of default measurement of SGD Regressor is', sgdr.score(X_test, y_test))
# R-squared is computed directly on the standardized targets and predictions.
print('The value of R-squared of SGD Regressor is', r2_score(y_test, sgdr_y_predict))
# MSE is reported in the original target units, so both arrays are mapped back
# through ss_y first. y_test and sgdr_y_predict are 1-D at this point, while
# the scaler works on a single column, hence the reshape(-1, 1) before each
# inverse_transform call.
print('The mean squared error of SGD Regressor is', mean_squared_error(
    ss_y.inverse_transform(y_test.reshape(-1, 1)),
    ss_y.inverse_transform(sgdr_y_predict.reshape(-1, 1))))

('The value of default measurement of SGD Regressor is', 0.65968001104722851)
('The value of R-squared of SGD Regressor is', 0.65968001104722851)
('The mean squared error of SGD Regressor is', 26.38885250892395)
