# The Boston Housing Dataset

* http://archive.ics.uci.edu/ml/datasets/Housing

* 집 가격 예측하는 machine learning
* 방크기, 인구데이터 등을 기반으로 집값 예측
* 회귀분석 모델

In [70]:
from sklearn.datasets import load_boston 
import matplotlib.pyplot as plt 
import numpy as np


In [71]:
boston = load_boston() 
boston["data"]


array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [73]:
x_data = boston.data 
y_data = boston.target.reshape(boston.target.size,1)


In [74]:
y_data.shape

(506, 1)

In [75]:
# scale 조정한 데이터
from sklearn import preprocessing
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,5)).fit(x_data) # standard_scale = preprocessing.StandardScaler().fit(x_data) 
x_scaled_data = minmax_scale.transform(x_data)
x_scaled_data[:3]

array([[0.00000000e+00, 9.00000000e-01, 3.39076246e-01, 0.00000000e+00,
        1.57407407e+00, 2.88752635e+00, 3.20803296e+00, 1.34601570e+00,
        0.00000000e+00, 1.04007634e+00, 1.43617021e+00, 5.00000000e+00,
        4.48399558e-01],
       [1.17961270e-03, 0.00000000e+00, 1.21151026e+00, 0.00000000e+00,
        8.64197531e-01, 2.73998850e+00, 3.91349125e+00, 1.74480990e+00,
        2.17391304e-01, 5.24809160e-01, 2.76595745e+00, 5.00000000e+00,
        1.02235099e+00],
       [1.17848872e-03, 0.00000000e+00, 1.21151026e+00, 0.00000000e+00,
        8.64197531e-01, 3.47192949e+00, 2.99691040e+00, 1.74480990e+00,
        2.17391304e-01, 5.24809160e-01, 2.76595745e+00, 4.94868627e+00,
        3.17328918e-01]])

In [77]:
# scale 조정 대이터 이용해 test와 train으로 데이터 나눔.
# test_size=0.33 (33%)
# 나머지는 training size로 할당됨.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_scaled_data, y_data, test_size=0.33)

In [78]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((339, 13), (167, 13), (339, 1), (167, 1))

In [79]:
# 알고리즘이 학습을 하도록 fit함.
# 학습을 하면 모델이 만들어짐.
# 가지고 있는 cpu 몇 개 사용할지 n_jobs=8

from sklearn import linear_model
regr = linear_model.LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=8)

regr.fit(X_train, y_train) 
regr


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=8, normalize=False)

In [80]:
# 학습한 상태기 때문에 나오는 값들
regr.coef_ , regr.intercept_


(array([[-1.84674863,  1.07198455,  0.34500056,  0.48053252, -1.85360669,
          3.28762403,  0.14156723, -3.3026099 ,  1.57916098, -1.68039234,
         -1.92322456,  0.47319234, -3.98114346]]), array([30.21597925]))

In [81]:
# # The coefficients 
print('Coefficients: ', regr.coef_) 
print('intercept: ', regr.intercept_)


Coefficients:  [[-1.84674863  1.07198455  0.34500056  0.48053252 -1.85360669  3.28762403
   0.14156723 -3.3026099   1.57916098 -1.68039234 -1.92322456  0.47319234
  -3.98114346]]
intercept:  [30.21597925]


In [82]:
# 이제 데이터를 넣어주면
# prediction 예측을 할 수 있음.


regr.predict(x_data[:5])


array([[-290.61998429],
       [-238.69718106],
       [-220.28754582],
       [-189.71374457],
       [-196.54324885]])

In [83]:
x_data[:5].dot(regr.coef_.T) + regr.intercept_

array([[-290.61998429],
       [-238.69718106],
       [-220.28754582],
       [-189.71374457],
       [-196.54324885]])

In [84]:
# 예측의 정확성/신뢰성을 알아보기 위해
from sklearn.metrics import r2_score 
from sklearn.metrics import mean_absolute_error 
from sklearn.metrics import mean_squared_error


In [86]:
# 훈련하지 않았던 test값을 넣어줘서 
# 실제 값과 얼마나 정확한지 값을 확인
y_true = y_test 
y_hat = regr.predict(X_test)

r2_score(y_true, y_hat), mean_absolute_error(y_true, y_hat), mean_squared_error(y_true, y_hat) 

(0.7520815433364153, 3.3340760988124374, 21.09436372196986)

In [87]:
y_true = y_train 
y_hat = regr.predict(X_train)
r2_score(y_true, y_hat), mean_absolute_error(y_true, y_hat), mean_squared_error(y_true, y_hat)

(0.7300685610811818, 3.3364853411896553, 22.69069677248655)

In [89]:
regr.score(X_test, y_test)

0.7520815433364153