#### 라이브러리 불러오기

In [11]:
import numpy as np
import tensorflow as tf
from datetime import datetime

# 회귀모델 평가 지표
from sklearn.metrics import mean_squared_error
# RMSE(Root Mean Squared Error) = np.sqrt(mse)
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

from sklearn.model_selection import GridSearchCV

# 사용할 모델
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from xgboost import XGBRegressor
import lightgbm as lgb

#### 데이터 불러오기
* 데이터는 `keras.datasets`에서 가져왔으며, numpy.ndarray로 되어 있음

In [2]:
(train_x, train_y), (test_x, test_y) = tf.keras.datasets.boston_housing.load_data()

In [3]:
train_x.shape, type(train_x)

((404, 13), numpy.ndarray)

#### 공통변수

In [4]:
SEED = 1234

#### 평가함수

In [5]:
def evaluation_func(test_y, pred_y, verbose=1):
    mse = mean_squared_error(test_y, pred_y)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(test_y, pred_y)
    mape = mean_absolute_percentage_error(test_y, pred_y)
    
    if verbose != 0:
        print(f'MSE: {mse:.6f}, RMSE: {rmse:.6f}, MAE: {mae:.6f}, MAPE: {mape:.6f}')
        
    return mse, rmse, mae, mape

#### ML 라이브러리

##### Lasso

In [6]:
model_l = Lasso().fit(train_x, train_y)
pred_l = model_l.predict(test_x)
_ = evaluation_func(test_y, pred_l)

MSE: 25.821651, RMSE: 5.081501, MAE: 3.405307, MAPE: 0.162639


##### Ridge

In [7]:
model_r = Ridge().fit(train_x, train_y)
pred_r = model_r.predict(test_x)
_ = evaluation_func(test_y, pred_r)

MSE: 22.548159, RMSE: 4.748490, MAE: 3.402405, MAPE: 0.172457


##### 기타 성능 지표

In [10]:
# 모델 예측성능 지표(모델이 데이터를 잘 설명하는지 여부[0~1])
# 0: 전혀 못함, 1: 잘 설명함
model_r.score(test_x, test_y)

0.729131232028436

In [13]:
# score와 동일(높으면 좋음)
r2_score(test_y, pred_r)

0.729131232028436

##### KNeighborsRegressor

In [8]:
model_knr = KNeighborsRegressor().fit(train_x, train_y)
pred_knr = model_knr.predict(test_x)
_ = evaluation_func(test_y, pred_knr)

MSE: 36.969314, RMSE: 6.080240, MAE: 4.376275, MAPE: 0.215995


##### DecisionTreeRegressor

In [42]:
model_dtr = DecisionTreeRegressor().fit(train_x, train_y)
pred_dtr = model_dtr.predict(test_x)
_ = evaluation_func(test_y, pred_dtr)

MSE: 22.069706, RMSE: 4.697841, MAE: 3.267647, MAPE: 0.174349


* GridSearchCV

In [43]:
# DecisionTreeRegressor 모델 생성
model_dtr = DecisionTreeRegressor(random_state=SEED)

# 튜닝할 하이퍼파라미터 범위 지정
param_grid_dtr = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search_dtr = GridSearchCV(
    model_dtr, param_grid_dtr, cv=5, scoring='neg_mean_squared_error'
).fit(train_x, train_y)

# 최적의 하이퍼파라미터와 성능 출력
best_params_dtr = grid_search_dtr.best_params_
print("Best Hyperparameters:", best_params_dtr)

# 최적의 모델로 예측
best_model_dtr = grid_search_dtr.best_estimator_
pred_dtr = best_model_dtr.predict(test_x)
_ = evaluation_func(test_y, pred_dtr)

Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}
MSE: 21.141087, RMSE: 4.597944, MAE: 3.256056, MAPE: 0.172123


##### RandomForestRegressor

In [15]:
model_rfr = RandomForestRegressor(
    n_estimators=500, n_jobs=-1, random_state=SEED
)
model_rfr.fit(train_x, train_y)

RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=1234)

In [17]:
pred_y_rfr = model_rfr.predict(test_x)

In [20]:
_ = evaluation_func(test_y, pred_y_rfr)

MSE: 13.419559, RMSE: 3.663272, MAE: 2.414084, MAPE: 0.132021


* GridSearchCV를 이용한 하이퍼파라메터 튜닝

In [27]:
model_rfr = RandomForestRegressor(n_jobs=-1, random_state=SEED)

param_grid = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [None, 10, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search_rfr = GridSearchCV(
    model_rfr, param_grid, cv=5, scoring='neg_mean_squared_error'
)

_start = datetime.now()
grid_search_rfr.fit(train_x, train_y)
print(f'GridSearchCV 처리시간: {datetime.now()-_start}')

best_params = grid_search_rfr.best_params_
print(f'Best Hyperparameters: {best_params}')
model_rfr_best = grid_search_rfr.best_estimator_

pred_rfr_b = model_rfr_best.predict(test_x)
evaluation_func(test_y, pred_rfr_b)

GridSearchCV 처리시간: 0:19:37.644257
Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
MSE: 13.512921, RMSE: 3.675992, MAE: 2.418914, MAPE: 0.132344


(13.512920528039215, 3.67599245483981, 2.4189137254901882, 0.1323438394952828)

##### GradientBoostingRegressor

In [22]:
model_gbr = GradientBoostingRegressor().fit(train_x, train_y)
pred_gbr = model_gbr.predict(test_x)
_ = evaluation_func(test_y, pred_gbr)

MSE: 12.679853, RMSE: 3.560878, MAE: 2.345200, MAPE: 0.130145


##### ElasticNet

In [49]:
model_en = ElasticNet(
    alpha=0.1,          # 규제강도(0~inf), 
                        # 값이 클수록 규제가 강화되고 모델이 단순화됨
    l1_ratio=0.5,       # L1/L2 규제 균형조절, 0(L2만), 1(L1만) 사용
    random_state=SEED
).fit(train_x, train_y)
pred_en = model_en.predict(test_x)
_ = evaluation_func(test_y, pred_en)

MSE: 21.769895, RMSE: 4.665822, MAE: 3.333082, MAPE: 0.170267


##### XGBRegressor

In [30]:
model_xgb = XGBRegressor(eta=0.01, n_estimators=100, n_jobs=-1).fit(train_x, train_y)

In [33]:
pred_xgb = model_xgb.predict(test_x)
_ = evaluation_func(test_y, pred_xgb)

MSE: 102.053371, RMSE: 10.102147, MAE: 8.885332, MAPE: 0.363386


In [40]:
model_xgb = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=200,
    n_jobs=-1,
    max_depth=3,
    learning_rate=0.11,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
).fit(train_x, train_y)
pred_xgb = model_xgb.predict(test_x)
_ = evaluation_func(test_y, pred_xgb)

MSE: 12.260352, RMSE: 3.501479, MAE: 2.164824, MAPE: 0.117345


##### LightGBM

In [5]:
# LightGBM 데이터셋 생성
train_ds_lgb = lgb.Dataset(train_x, label=train_y)
test_ds_lgb = lgb.Dataset(test_x, label=test_y, reference=train_ds_lgb)

In [6]:
# LightGBM 모델 설정
params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",    # 부스팅 알고리즘
                                # gbdt: Gradient Boosting Decision Tree
    "num_leaves": 31,           # 하나의 결정트리에서 사용하는 최대 잎(리프) 노드 수
                                # 크면 모델 복잡도가 증가하고, 적으면 모델이 간단해짐
    "learning_rate": 0.05,
    "feature_fraction": 0.9     # 모델 학습시 피처 샘플링 비율
                                # 일부만 사용해 과적합을 방지하고자 할 때 주로 사용
}

In [8]:
# 모델 훈련
num_round = 1000
bst = lgb.train(
    params, 
    train_ds_lgb, 
    num_round,      # num_round: GBDT에서 사용할 트리 갯 수
    valid_sets=[test_ds_lgb]
)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1021
[LightGBM] [Info] Number of data points in the train set: 404, number of used features: 13
[LightGBM] [Info] Start training from score 22.395050
[1]	valid_0's rmse: 8.771
[2]	valid_0's rmse: 8.41432
[3]	valid_0's rmse: 8.07667
[4]	valid_0's rmse: 7.78076
[5]	valid_0's rmse: 7.49969
[6]	valid_0's rmse: 7.22148
[7]	valid_0's rmse: 7.01511
[8]	valid_0's rmse: 6.77446
[9]	valid_0's rmse: 6.54845
[10]	valid_0's rmse: 6.35881
[11]	valid_0's rmse: 6.17107
[12]	valid_0's rmse: 5.98606
[13]	valid_0's rmse: 5.82741
[14]	valid_0's rmse: 5.67726
[15]	valid_0's rmse: 5.54277
[16]	valid_0's rmse: 5.4138
[17]	valid_0's rmse: 5.29538
[18]	valid_0's rmse: 5.19542
[19]	valid_0's rmse: 5.0967
[20]	valid_0's rmse: 5.00488
[21]	valid_0's rmse: 4.9114
[22]	valid_0's rmse: 4.82809
[23]	valid_0's rmse: 4.74814
[24]	valid_0's rmse: 4.66074
[25]	valid_0's rmse: 4.59113
[26]	valid_0's rmse: 4.52046
[27]	valid_0's rmse: 4.

In [11]:
# 모델 평가
pred_y_lgb = bst.predict(test_x, num_iteration=bst.best_iteration)
_, _, _, _ = evaluation_func(test_y, pred_y_lgb)

MSE: 11.971329, RMSE: 3.459961, MAE: 2.205423, MAPE: 0.118909


#### DL 라이브러리