## Stage 6. 모델 고도화

### Import Library

In [1]:
# Visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
fe = fm.FontEntry(
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf', 
    name='NanumBarunGothic')                      
fm.fontManager.ttflist.insert(0, fe)            
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'}) 
plt.rc('font', family='NanumBarunGothic')
import seaborn as sns

# Utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')

# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import eli5
from eli5.sklearn import PermutationImportance

### 1. 데이터 분석 전 준비

In [2]:
pd.options.mode.chained_assignment = None

submission = pd.read_csv('../data/sample_submission.csv')
all_data = pd.read_csv('../data/all_data.csv')

In [3]:
all_data.columns

Index(['시군구', '번지', '본번', '부번', '아파트명', '전용면적(㎡)', '계약년월', '계약일', '층', '건축년도',
       '도로명', '해제사유발생일', '등기신청일자', '거래유형', '중개사소재지', 'k-단지분류(아파트,주상복합등등)',
       'k-전화번호', 'k-팩스번호', '단지소개기존clob', 'k-세대타입(분양형태)', 'k-관리방식', 'k-복도유형',
       'k-난방방식', 'k-전체동수', 'k-전체세대수', 'k-건설사(시공사)', 'k-시행사', 'k-사용검사일-사용승인일',
       'k-연면적', 'k-주거전용면적', 'k-관리비부과면적', 'k-전용면적별세대현황(60㎡이하)',
       'k-전용면적별세대현황(60㎡~85㎡이하)', 'k-85㎡~135㎡이하', 'k-135㎡초과', 'k-홈페이지',
       'k-등록일자', 'k-수정일자', '고용보험관리번호', '경비비관리형태', '세대전기계약방법', '청소비관리형태',
       '건축면적', '주차대수', '기타/의무/임대/임의=1/2/3/4', '단지승인일', '사용허가여부', '관리비 업로드',
       '좌표X', '좌표Y', '단지신청일', 'target', 'train_test'],
      dtype='object')

### 2. RandomForest Model

In [32]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import optuna

validation_year = 2022

columns = [
    'recent_price', 'transaction_cnt', 'dt_interest_rate', 'transaction_year', 'transaction_month', 'cluster'
]

train_x = all_data.loc[(all_data['train_test'] == 'train') & (all_data['transaction_year'] < validation_year), columns]
train_y = all_data.loc[(all_data['train_test'] == 'train') & (all_data['transaction_year'] < validation_year), 'target']

val_x = all_data.loc[(all_data['train_test'] == 'train') & (all_data['transaction_year'] == validation_year), columns]
val_y = all_data.loc[(all_data['train_test'] == 'train') & (all_data['transaction_year'] == validation_year), 'target']

def objective(trial):
    # 하이퍼파라미터 탐색 대상
    n_estimators = trial.suggest_int('n_estimators', 10, 100)
    max_depth = trial.suggest_int('max_depth', 2, 32)

    # RandomForestRegressor 모델 학습
    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=7)
    model.fit(train_x, train_y)

    # 검증 데이터로 평가
    y_pred = model.predict(val_x)
    mse = mean_squared_error(val_y, y_pred)
    return mse

# Optuna를 사용하여 하이퍼파라미터 탐색
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

best_params_rf = study.best_params
print("Best Params:", best_params_rf)

[I 2024-01-17 13:26:56,161] A new study created in memory with name: no-name-b112ec2f-8173-4bb5-9408-f73d000ab2ae
[I 2024-01-17 13:28:55,246] Trial 0 finished with value: 881923158.0347974 and parameters: {'n_estimators': 89, 'max_depth': 15}. Best is trial 0 with value: 881923158.0347974.
[I 2024-01-17 13:30:41,672] Trial 1 finished with value: 862574700.3532003 and parameters: {'n_estimators': 86, 'max_depth': 13}. Best is trial 1 with value: 862574700.3532003.
[I 2024-01-17 13:31:04,241] Trial 2 finished with value: 1143350950.6095297 and parameters: {'n_estimators': 42, 'max_depth': 4}. Best is trial 1 with value: 862574700.3532003.
[I 2024-01-17 13:33:24,070] Trial 3 finished with value: 895290657.9461818 and parameters: {'n_estimators': 81, 'max_depth': 24}. Best is trial 1 with value: 862574700.3532003.
[I 2024-01-17 13:34:23,573] Trial 4 finished with value: 905661116.5577742 and parameters: {'n_estimators': 33, 'max_depth': 27}. Best is trial 1 with value: 862574700.3532003.
[

Best Params: {'n_estimators': 59, 'max_depth': 8}


In [44]:
from sklearn.metrics import mean_absolute_error

for validation_year in [2018, 2019, 2020, 2021, 2022, 2023]:
    columns = [
        'recent_price', 'transaction_cnt', 'dt_interest_rate', 'transaction_year', 'transaction_month', 'cluster', 
    ]
    train_x = all_data.loc[(all_data['train_test'] == 'train') & (all_data['transaction_year'] < validation_year), columns]
    train_y = all_data.loc[(all_data['train_test'] == 'train') & (all_data['transaction_year'] < validation_year), 'target']
    
    val_x = all_data.loc[(all_data['train_test'] == 'train') & (all_data['transaction_year'] == validation_year), columns]
    val_y = all_data.loc[(all_data['train_test'] == 'train') & (all_data['transaction_year'] == validation_year), 'target']
    
    # 모델 생성 및 훈련
    model_rf = RandomForestRegressor(n_estimators=best_params_rf['n_estimators'],
                                     max_depth=best_params_rf['max_depth'],
                                     random_state=7
                                     )
    model_rf.fit(train_x, train_y)

    # 예측
    pred_rf_ls = model_rf.predict(val_x)

    mae = mean_absolute_error(pred_rf_ls, val_y)
    print(validation_year,'년도 MAE: ' , mae)

2018 년도 MAE:  6489.6347232219205
2019 년도 MAE:  7236.038529171427
2020 년도 MAE:  7111.432044377888
2021 년도 MAE:  10079.930731122875
2022 년도 MAE:  12967.535997085104
2023 년도 MAE:  12114.45537953079


In [46]:
# 테스트 데이터 예측

from sklearn.ensemble import RandomForestRegressor

columns = [
    'recent_price', 'transaction_cnt', 'dt_interest_rate', 'transaction_year', 'transaction_month', 'cluster', 
]
train_x = all_data.loc[all_data['train_test'] == 'train', columns]
train_y = all_data.loc[all_data['train_test'] == 'train', 'target']
test_x = all_data.loc[all_data['train_test'] == 'test', columns]

# 모델 생성 및 훈련
model_rf = RandomForestRegressor(n_estimators=best_params_rf['n_estimators'],
                                 max_depth=best_params_rf['max_depth'],
                                 random_state=7
                                 )
model_rf.fit(train_x, train_y)

# 예측
pred_ls = list()
now_df = all_data.loc[all_data['train_test'] == 'train']
test = all_data.loc[all_data['train_test'] == 'test']

for idx, row in tqdm(test.iterrows(), total = test.shape[0]):
    now_df = pd.concat([now_df, test.loc[[idx]]])
    test_x.loc[idx, 'recent_price'] = get_recent_price(idx, now_df)
    
    # 예측
    pred_rf_ls = model_rf.predict(test_x.loc[idx:idx])

    now_df.loc[idx, 'target'] = pred_rf_ls
    pred_ls.append(pred_rf_ls[0])

100%|██████████| 9272/9272 [2:58:36<00:00,  1.16s/it]  


### 3. XGBoost Model

In [53]:
import xgboost as xgb

def objective(trial):
    # 하이퍼파라미터 탐색 대상
    n_estimators = trial.suggest_int('n_estimators', 10, 100)
    max_depth = trial.suggest_int('max_depth', 2, 32)
    learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 0.1)

    # xgboost 모델 학습
    model = xgb.XGBRegressor(n_estimators=n_estimators,
                             max_depth=max_depth,
                             learning_rate=learning_rate,
                             random_state=7)
    model.fit(train_x, train_y)

    # 검증 데이터로 평가
    y_pred = model.predict(val_x)
    mse = mean_squared_error(val_y, y_pred)
    return mse

# Optuna를 사용하여 하이퍼파라미터 탐색
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

best_params_xgb = study.best_params
print("Best Params:", best_params_xgb)

[I 2024-01-17 18:20:38,016] A new study created in memory with name: no-name-3f96531a-ce2c-46bf-b463-3eb08fbf4e7c
  learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 0.1)
[I 2024-01-17 18:20:40,034] Trial 0 finished with value: 413140532.91341776 and parameters: {'n_estimators': 84, 'max_depth': 12, 'learning_rate': 0.030583854125054447}. Best is trial 0 with value: 413140532.91341776.
  learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 0.1)
[I 2024-01-17 18:20:42,220] Trial 1 finished with value: 358325104.44659686 and parameters: {'n_estimators': 49, 'max_depth': 15, 'learning_rate': 0.05180585251080496}. Best is trial 1 with value: 358325104.44659686.
  learning_rate = trial.suggest_loguniform('learning_rate', 0.001, 0.1)
[I 2024-01-17 18:20:46,172] Trial 2 finished with value: 2117821946.5379415 and parameters: {'n_estimators': 18, 'max_depth': 31, 'learning_rate': 0.03857282466940319}. Best is trial 1 with value: 358325104.44659686.
  learning_rate = 

Best Params: {'n_estimators': 88, 'max_depth': 28, 'learning_rate': 0.09798003396471158}


In [54]:
from sklearn.metrics import mean_absolute_error

for validation_year in [2018, 2019, 2020, 2021, 2022, 2023]:
    columns = [
        'recent_price', 'transaction_cnt', 'dt_interest_rate', 'transaction_year', 'transaction_month', 'cluster', 
    ]
    train_x = all_data.loc[(all_data['train_test'] == 'train') & (all_data['transaction_year'] < validation_year), columns]
    train_y = all_data.loc[(all_data['train_test'] == 'train') & (all_data['transaction_year'] < validation_year), 'target']
    
    val_x = all_data.loc[(all_data['train_test'] == 'train') & (all_data['transaction_year'] == validation_year), columns]
    val_y = all_data.loc[(all_data['train_test'] == 'train') & (all_data['transaction_year'] == validation_year), 'target']

    # 모델 생성 및 훈련
    model_xgb = xgb.XGBRegressor(n_estimators=best_params_xgb['n_estimators'],
                              max_depth=best_params_xgb['max_depth'],
                              learning_rate=best_params_xgb['learning_rate'],
                              random_state=7)
    model_xgb.fit(train_x, train_y)
    pred_xgb_ls = model_xgb.predict(val_x)

In [55]:
from sklearn.ensemble import RandomForestRegressor

columns = [
    'recent_price', 'transaction_cnt', 'dt_interest_rate', 'transaction_year', 'transaction_month', 'cluster', 
]
train_x = all_data.loc[all_data['train_test'] == 'train', columns]
train_y = all_data.loc[all_data['train_test'] == 'train', 'target']
test_x = all_data.loc[all_data['train_test'] == 'test', columns]

# 모델 생성 및 훈련
model_xgb = xgb.XGBRegressor(n_estimators=best_params_xgb['n_estimators'],
                          max_depth=best_params_xgb['max_depth'],
                          learning_rate=best_params_xgb['learning_rate'],
                          random_state=7)
model_xgb.fit(train_x, train_y)

# 예측
pred_ls = list()
now_df = all_data.loc[all_data['train_test'] == 'train']
test = all_data.loc[all_data['train_test'] == 'test']

for idx, row in tqdm(test.iterrows(), total = test.shape[0]):
    now_df = pd.concat([now_df, test.loc[[idx]]])
    test_x.loc[idx, 'recent_price'] = get_recent_price(idx, now_df)
    
    # 예측
    pred_xgb_ls = model_xgb.predict(test_x.loc[idx:idx])
    now_df.loc[idx, 'target'] = pred_xgb_ls
    pred_ls.append(pred_xgb_ls[0])

100%|██████████| 9272/9272 [2:09:15<00:00,  1.20it/s]  


### 4. 모델 교차검증

In [58]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import numpy as np

for validation_year in [2018, 2019, 2020, 2021, 2022, 2023]:
    columns = [
        'recent_price', 'transaction_cnt', 'dt_interest_rate', 'transaction_year', 'transaction_month', 'cluster', 
    ]
    train_x = all_data.loc[(all_data['train_test'] == 'train') & (all_data['transaction_year'] < validation_year), columns]
    train_y = all_data.loc[(all_data['train_test'] == 'train') & (all_data['transaction_year'] < validation_year), 'target']
    
    val_x = all_data.loc[(all_data['train_test'] == 'train') & (all_data['transaction_year'] == validation_year), columns]
    val_y = all_data.loc[(all_data['train_test'] == 'train') & (all_data['transaction_year'] == validation_year), 'target']
    
    # 모델 생성 및 훈련
    model_rf = RandomForestRegressor(n_estimators=best_params_rf['n_estimators'],
                                     max_depth=best_params_rf['max_depth'],
                                     random_state=7
                                     )
    model_rf.fit(train_x, train_y)
    
    model_xgb = xgb.XGBRegressor(n_estimators=best_params_xgb['n_estimators'],
                              max_depth=best_params_xgb['max_depth'],
                              learning_rate=best_params_xgb['learning_rate'],
                              random_state=7)
    model_xgb.fit(train_x, train_y)
    
    # 예측
    pred_rf_ls = model_rf.predict(val_x)
    pred_xgb_ls = model_xgb.predict(val_x)
    blended_prediction = (pred_rf_ls + pred_xgb_ls)/2
    
    mae = mean_absolute_error(blended_prediction, val_y)
    print(validation_year,'년도 MAE: ' , mae)

        # RMSE 계산
    rmse = np.sqrt(mean_squared_error(pred_val_ls, val_y))
    print(validation_year,'년도 RMSE: ', rmse)

2018 년도 MAE:  6696.961714122557
2019 년도 MAE:  7846.3793309957755
2020 년도 MAE:  7714.378067297535
2021 년도 MAE:  11020.183134681416
2022 년도 MAE:  13940.465705311575
2023 년도 MAE:  13572.748584541521


### 5. 테스트 데이터 예측

In [59]:
from sklearn.ensemble import RandomForestRegressor

columns = [
    'recent_price', 'transaction_cnt', 'dt_interest_rate', 'transaction_year', 'transaction_month', 'cluster', 
]
train_x = all_data.loc[all_data['train_test'] == 'train', columns]
train_y = all_data.loc[all_data['train_test'] == 'train', 'target']
test_x = all_data.loc[all_data['train_test'] == 'test', columns]

# 모델 생성 및 훈련
model_rf = RandomForestRegressor(n_estimators=best_params_rf['n_estimators'],
                                 max_depth=best_params_rf['max_depth'],
                                 random_state=7
                                 )
model_rf.fit(train_x, train_y)

model_xgb = xgb.XGBRegressor(n_estimators=best_params_xgb['n_estimators'],
                          max_depth=best_params_xgb['max_depth'],
                          learning_rate=best_params_xgb['learning_rate'],
                          random_state=7)
model_xgb.fit(train_x, train_y)

# 예측
pred_ls = list()
now_df = all_data.loc[all_data['train_test'] == 'train']
test = all_data.loc[all_data['train_test'] == 'test']

for idx, row in tqdm(test.iterrows(), total = test.shape[0]):
    now_df = pd.concat([now_df, test.loc[[idx]]])
    test_x.loc[idx, 'recent_price'] = get_recent_price(idx, now_df)
    
    # 예측
    pred_rf_ls = model_rf.predict(test_x.loc[idx:idx])
    pred_xgb_ls = model_xgb.predict(test_x.loc[idx:idx])
    blended_prediction = (pred_rf_ls + pred_xgb_ls)/2
    
    now_df.loc[idx, 'target'] = blended_prediction
    pred_ls.append(blended_prediction[0])

100%|██████████| 9272/9272 [2:09:20<00:00,  1.19it/s]  


### 6. 정답 제출 파일 생성

In [52]:
submission['target'] = np.ceil(pred_ls).astype(int)
submission.to_csv('../data/RF output_DY.csv', index=False)

In [56]:
submission['target'] = np.ceil(pred_ls).astype(int)
submission.to_csv('../data/XGB output_DY.csv', index=False)

In [61]:
submission['target'] = np.ceil(pred_ls).astype(int)
submission.to_csv('../data/RF,XGB CV output.csv', index=False)