# 보스턴 집값 예측 모델 
- 데이터셋 : boston.csv
- 학습방법 : 지도학습 -> 회귀
- 피쳐/독립 : 13개 
- 타겟/종속 : 1개 

In [2]:
# 모듈 로딩 
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split

In [3]:
# CSV -> DAtaFrame
dataDF = pd.read_csv('../data/boston.csv')
dataDF.head(2)

dataDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


## [2] 전처리
### [2-1] 데이터 정제

- 결측치, 중복값, 이상치, 칼럼별 고유값 추출로 이상 데이터 체크

[2-2] 표준화 & 정규화 ==> 진행여부에 따라 성능의 변화는 경우에 따라 다름!!
 - 정규분포 데이터셋을 기반으로 한 모델 -> StandardScaler, MinMaxScaler, Log 변환
 = 피쳐의 값의 범위 차이를 줄이기 => 피쳐 스케일링, MinMaxScaler, RobustScaler
 - 범주형 피쳐 => 수치화 인코딩 OneHotEncoder, OrdinalEncoder
 - 문자열 타겟 => 정수 라벨 인코딩 LabelEncoder

## [2-2] 표준화 & 정규화 
-> 진행 여부에 따라 성능의 변화는 경우에 따라 다름
- 정규분포 데이터셋을 기반으로 한 모델 -> StandardScaler, MinMaxScaler, Log 변환 
- 피쳐의 값의 범위 차이를 줄이기 -> 피쳐 스케일링, MinMaxScaler, RobustScaler....
- 범주형 피쳐 -> 수치화 인코딩 : OneHotEncoder, OrdinalEncoder
- 문자열 타겟 -> 정수 라벨 인코딩 : LabelEncoder

[2-3] 피쳐와 타겟 분리

In [4]:
featureDF = dataDF.iloc[:, :-1]
targetSR = dataDF['MEDV']

In [5]:
print(f'featureDF : {featureDF.shape}, targetSR : {targetSR.shape}')

featureDF : (506, 13), targetSR : (506,)


[3] 학습 준비

[3-1] 학습용 데이터셋과 테스트용 데이터셋 분리

In [6]:
X_train, X_test, y_train, y_test = train_test_split(featureDF, targetSR, random_state=10)

In [7]:
print(f'X_train : {X_train.shape}, y_train : {y_train.shape}')
print(f'X_test : {X_test.shape}, y_test : {y_test.shape}')

X_train : (379, 13), y_train : (379,)
X_test : (127, 13), y_test : (127,)


[3-2] 학습용 데이터셋으로 스케일러 생성

In [8]:
### - 수치 피쳐 값의 범위 차가 큼 ==> Scaling 진행
ssScaler = StandardScaler()
ssScaler.fit(X_train)

In [9]:
X_train_scaled = ssScaler.transform(X_train)
X_test_Scaled = ssScaler.transform(X_test)

[4] 학습 진행 ==> 교차검증으로 진행

In [29]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [28]:
### 모델 성능을 좌우하는 Hyper-parameter 제어 즉, 튜닝
alpha_values = [0.,1.0,10,100]

for value in alpha_values:
    ridge_model = Ridge(alpha=value)
# 모델 인스턴스 생성
    ridge_model = Ridge(alpha=1.0) # 기본값 1.0

# 학습 진행
# - cv : 3개
# - scoring : 'mean_squared_error', 'r2'
# return_train_score
    result = cross_validate(ridge_model,
                            X_train_scaled,y_train,
                            cv = 3,
                            scoring=['neg_mean_squared_error','r2'],
                            return_train_score=True,
                            return_estimator=True)
    
    resultDF = pd.DataFrame(result)[['test_r2', 'train_r2']]

    resultDF['diff'] = abs(resultDF['test_r2'] - resultDF['train_r2'])
    best_idx = resultDF['diff'].sort_values()[0]
    # print(best_idx)
    print(result['estimator'][0].coef_)
    # print(f'[Ridge(alpha={value})]')
    # print(resultDF, end='\n\n')

[-1.39035961  1.53043843  0.11109741  0.6621853  -2.29024619  2.34249774
  0.10030677 -3.52062389  2.57481444 -2.20749462 -1.86406784  1.03607796
 -3.48102887]
[-1.39035961  1.53043843  0.11109741  0.6621853  -2.29024619  2.34249774
  0.10030677 -3.52062389  2.57481444 -2.20749462 -1.86406784  1.03607796
 -3.48102887]
[-1.39035961  1.53043843  0.11109741  0.6621853  -2.29024619  2.34249774
  0.10030677 -3.52062389  2.57481444 -2.20749462 -1.86406784  1.03607796
 -3.48102887]
[-1.39035961  1.53043843  0.11109741  0.6621853  -2.29024619  2.34249774
  0.10030677 -3.52062389  2.57481444 -2.20749462 -1.86406784  1.03607796
 -3.48102887]


In [30]:
alpha_values = [0.,1.0,10,100]
for value in alpha_values:
    ridge_model = Lasso(alpha=value, max_iter=3) # 기본값 1.0

    result = cross_validate(ridge_model,
                            X_train_scaled,y_train,
                            cv = 3,
                            scoring=['neg_mean_squared_error','r2'],
                            return_train_score=True,
                            return_estimator=True)
    
    resultDF = pd.DataFrame(result)[['test_r2', 'train_r2']]

    resultDF['diff'] = abs(resultDF['test_r2'] - resultDF['train_r2'])
    best_idx = resultDF['diff'].sort_values()[0]
    # print(best_idx)
    print(result['estimator'][0].coef_)
    # print(f'[Ridge(alpha={value})]')
    # print(resultDF, end='\n\n')

[-0.76918209  1.30798802 -1.3660128   0.70871821 -1.12810945  3.13078874
  0.20140226 -3.18951128  0.40006951 -1.02796444 -1.33246342  1.05170534
 -2.85931196]
[-0.12685525  0.         -0.68948499  0.         -0.35867851  3.50097227
 -0.         -0.         -0.02775436 -0.34045443 -1.07046702  0.47097032
 -2.11146537]
[-0.  0. -0.  0. -0.  0. -0.  0. -0. -0. -0.  0. -0.]
[-0.  0. -0.  0. -0.  0. -0.  0. -0. -0. -0.  0. -0.]


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


- 하이퍼 파라미터 튜닝과 교차 검증을 동시에 진행

In [31]:
from sklearn.model_selection import GridSearchCV

In [32]:
# Ridge의 Hyper-parameter 값 설정
params = {'alpha':[0.,0.1,0.5,1.0],
          'max_iter':[3,5]}
    # (self) 8개의 모델이 만들어진다. 조합이 되는 것.
    # ==> 0., 3 => model    #==> 0., 5 -> Model
    # ==> 0.1, 3 => model    #==> 0.1, 5 -> Model
    # ==> 0.5, 3 => model    #==> 0.5, 5 -> Model
    # ==> 1.0, 3 => model    #==> 1.0, 5 -> Model
    # ==> 8개의 Ridge 모델 생성

In [34]:
# 인스턴스 생성
rModel = Ridge()

searchCV = GridSearchCV(rModel, params, cv =3, verbose=True, return_train_score=True)

In [35]:
searchCV.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [36]:
# fit() 진행 후 모델 파라미터 확인
searchCV.best_params_

{'alpha': 1.0, 'max_iter': 3}

In [37]:
searchCV.best_index_

6

In [40]:
bestModel = searchCV.best_estimator_
bestModel

In [41]:
resultDF = pd.DataFrame(searchCV.cv_results_)
resultDF

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_max_iter,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.001671,0.0004728315,0.001003,1.123916e-07,0.0,3,"{'alpha': 0.0, 'max_iter': 3}",0.747022,0.756482,0.680801,0.728101,0.033669,7,0.75572,0.740082,0.786156,0.760653,0.019131
1,0.001663,0.0004845202,0.000334,0.0004721571,0.0,5,"{'alpha': 0.0, 'max_iter': 5}",0.747022,0.756482,0.680801,0.728101,0.033669,7,0.75572,0.740082,0.786156,0.760653,0.019131
2,0.002005,1.123916e-07,0.0,0.0,0.1,3,"{'alpha': 0.1, 'max_iter': 3}",0.747159,0.756462,0.680831,0.728151,0.033675,5,0.75572,0.740081,0.786156,0.760652,0.019131
3,0.001003,1.123916e-07,0.001002,2.973602e-07,0.1,5,"{'alpha': 0.1, 'max_iter': 5}",0.747159,0.756462,0.680831,0.728151,0.033675,5,0.75572,0.740081,0.786156,0.760652,0.019131
4,0.001003,1.94668e-07,0.001003,4.052337e-07,0.5,3,"{'alpha': 0.5, 'max_iter': 3}",0.747682,0.756385,0.680927,0.728331,0.033708,3,0.755705,0.74007,0.786141,0.760639,0.019129
5,0.001337,0.0004726067,0.000343,0.0004846326,0.5,5,"{'alpha': 0.5, 'max_iter': 5}",0.747682,0.756385,0.680927,0.728331,0.033708,3,0.755705,0.74007,0.786141,0.760639,0.019129
6,0.001003,2.247832e-07,0.001003,1.94668e-07,1.0,3,"{'alpha': 1.0, 'max_iter': 3}",0.748283,0.756292,0.680991,0.728522,0.033768,1,0.755663,0.740039,0.786097,0.7606,0.019124
7,0.001002,2.973602e-07,0.000677,0.0004788495,1.0,5,"{'alpha': 1.0, 'max_iter': 5}",0.748283,0.756292,0.680991,0.728522,0.033768,1,0.755663,0.740039,0.786097,0.7606,0.019124


: 