# Robust_Scaling + All features (test_submission1,2.csv)
> Public Score
> - test_submission1: 26982.13108
> - test_submission2: 38008.90479

In [110]:
import sys, os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [111]:
# 데이터 경로 설정
file_path = os.path.join(os.getcwd(), 'data')
train_origin = pd.read_csv(file_path+'\dataset.csv')
test_origin = pd.read_csv(file_path+'\problem.csv')

In [112]:
train_origin.shape, test_origin.shape

((1340, 24), (130, 23))

In [113]:
# validation set 분리
from sklearn.model_selection import train_test_split


train, val = train_test_split(train_origin, test_size=0.1, random_state=42)
test = test_origin.copy()   # test data 복사

In [114]:
train.shape, test.shape

((1206, 24), (130, 23))

In [115]:
# 메모리를 효율적으로 사용하기 위한 downcast 함수 정의
def downcast(df, verbose=True):     # verbose 옵션 추가: (True)인 경우 몇 퍼센트 압축됐는지 출력
    start_mem = df.memory_usage().sum() / 1024**2   # 초기 메모리 사용량
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            pass
        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col].round() == df[col]).all():
            df[col] = pd.to_numeric(df[col], downcast='integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        print(f'{(100*(start_mem - end_mem) / start_mem):.1f}% 압축됨')

    return df

In [116]:
downcast(train)
downcast(val)
downcast(test)

64.5% 압축됨
64.5% 압축됨
68.7% 압축됨


Unnamed: 0,Id,LotArea,Street,LotConfig,OverallQual,OverallCond,YearBuilt,YearRemodAdd,TotalBsmtSF,1stFlrSF,...,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageType,GarageYrBlt,GarageCars,GarageArea,YrSold
0,1341,11200,Pave,Inside,5,5,1965,1965,1040,1040,...,0,3,1,5,0,Detchd,1965,1,384,2008
1,1342,7200,Pave,Corner,5,7,1951,2000,900,900,...,0,3,1,5,0,Detchd,2005,2,576,2010
2,1343,16905,Pave,Inside,5,6,1959,1959,1350,1328,...,1,2,1,5,2,Attchd,1959,1,308,2007
3,1344,9180,Pave,CulDSac,5,7,1983,1983,840,884,...,0,2,1,5,0,Attchd,1983,2,504,2007
4,1345,7200,Pave,Inside,5,7,1920,1996,530,581,...,0,3,1,6,0,Detchd,1935,1,288,2007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,1466,11478,Pave,Inside,8,5,2007,2008,1704,1704,...,0,3,1,7,1,Attchd,2008,3,772,2010
126,1467,16321,Pave,CulDSac,5,6,1957,1997,1484,1600,...,0,2,1,6,2,Attchd,1957,1,319,2006
127,1468,6324,Pave,Inside,4,6,1927,1950,520,520,...,0,1,1,4,0,Detchd,1920,1,240,2008
128,1469,8500,Pave,Inside,4,4,1920,1950,649,649,...,0,3,1,6,0,Detchd,1920,1,250,2008


# 전처리

In [117]:
# 일부 column 제거(Id, GarageCars, GarageYrBlt, TotRmsAbvGrd, TotalBsmtSF)
train = train.drop(columns=['Id'])
val = val.drop(columns=['Id'])
test = test.drop(columns=['Id'])

In [118]:
print(train.duplicated().sum())

# 중복 데이터 제거
train = train.drop_duplicates()

1


In [119]:
# X, y 분리
X_train = train.drop(columns='SalePrice', axis=1)
y_train = train['SalePrice']

X_val = val.drop(columns='SalePrice', axis=1)
y_val = val['SalePrice']

In [120]:
X_train.isnull().sum()

LotArea          0
Street           0
LotConfig        0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
GrLivArea        0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageType      73
GarageYrBlt     73
GarageCars       0
GarageArea       0
YrSold           0
dtype: int64

In [121]:
# 결측치 최빈값 처리
GarageType_mode = X_train['GarageType'].mode()[0]
GarageYrBlt_mode = X_train['GarageYrBlt'].mode()[0]

X_train['GarageType'] = X_train['GarageType'].fillna(GarageType_mode)
X_train['GarageYrBlt'] = X_train['GarageYrBlt'].fillna(GarageYrBlt_mode)

X_val['GarageType'] = X_val['GarageType'].fillna(GarageType_mode)
X_val['GarageYrBlt'] = X_val['GarageYrBlt'].fillna(GarageYrBlt_mode)

In [122]:
X_train.isnull().sum().sum(), X_val.isnull().sum().sum() ,test.isnull().sum().sum()

(0, 0, 0)

In [123]:
# 범주형 변수 Label Encoding
from sklearn.preprocessing import LabelEncoder

categorical = X_train.select_dtypes(include='object').columns

for col in categorical:
    encoder = LabelEncoder()
    X_train[col] = encoder.fit_transform(X_train[col])
    X_val[col] = encoder.transform(X_val[col])
    test[col] = encoder.transform(test[col])

In [124]:
X_train.shape, X_val.shape, test.shape

((1205, 22), (134, 22), (130, 22))

In [125]:
X_train.head()

Unnamed: 0,LotArea,Street,LotConfig,OverallQual,OverallCond,YearBuilt,YearRemodAdd,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageType,GarageYrBlt,GarageCars,GarageArea,YrSold
196,15138,1,4,8,5,1995,1996,1462,1490,1304,...,1,4,1,9,1,1,1995.0,3,810,2009
447,2308,1,4,6,5,1974,1974,855,855,467,...,1,3,1,6,1,1,1974.0,2,440,2009
339,53107,1,0,6,5,1992,1992,1580,1079,874,...,1,3,1,9,2,1,1992.0,2,501,2007
464,11988,1,0,6,6,1957,1957,1244,1244,0,...,1,3,1,6,2,1,1957.0,1,336,2007
1190,17871,1,1,6,5,1967,1976,1680,1724,0,...,1,3,1,7,1,1,1967.0,2,480,2009


In [126]:
# Robust Scaling
from sklearn.preprocessing import RobustScaler

#numerical = X_train.select_dtypes(exclude='object').columns

scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_scaled = scaler.transform(test)

In [127]:
X_train_scaled.shape, X_val_scaled.shape, test_scaled.shape 

((1205, 22), (134, 22), (130, 22))

In [128]:
X_train_scaled.shape, y_train.shape

((1205, 22), (1205,))

# Model training

### XGBoost

In [129]:
# 베이지안 최적화
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

param_bounds = {
    'learning_rate' : (0.001, 0.5),
    'max_leaves': (2, 1024),
    'n_estimators': (100, 1000),
    'gamma': (0, 10),
    'max_depth': (3, 15),
    'min_child_weight': (1, 10),
    }

def eval_function(max_leaves, learning_rate, n_estimators, gamma, max_depth, min_child_weight):
    params = {
        'learning_rate' : learning_rate, 
        'max_leaves': int(max_leaves),
        'n_estimators': int(n_estimators),
        'gamma': gamma,
        'max_depth': int(max_depth),
        'min_child_weight': min_child_weight,
    }

    xgb_model = XGBRegressor(**params, random_state=42)
    xgb_model.fit(X_train_scaled, y_train)
    y_pred = xgb_model.predict(X_val_scaled)
    valid_rmse = mean_squared_error(y_val, y_pred, squared=False)
    return -valid_rmse

In [130]:
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(f=eval_function, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=5, n_iter=50)

|   iter    |  target   |   gamma   | learni... | max_depth | max_le... | min_ch... | n_esti... |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m-2.51e+04[0m | [0m3.745    [0m | [0m0.4754   [0m | [0m11.78    [0m | [0m613.8    [0m | [0m2.404    [0m | [0m240.4    [0m |
| [0m2        [0m | [0m-2.719e+0[0m | [0m0.5808   [0m | [0m0.4332   [0m | [0m10.21    [0m | [0m725.7    [0m | [0m1.185    [0m | [0m972.9    [0m |
| [95m3        [0m | [95m-2.094e+0[0m | [95m8.324    [0m | [95m0.107    [0m | [95m5.182    [0m | [95m189.4    [0m | [95m3.738    [0m | [95m572.3    [0m |
| [0m4        [0m | [0m-2.386e+0[0m | [0m4.319    [0m | [0m0.1463   [0m | [0m10.34    [0m | [0m144.6    [0m | [0m3.629    [0m | [0m429.7    [0m |
| [0m5        [0m | [0m-2.296e+0[0m | [0m4.561    [0m | [0m0.3928   [0m | [0m5.396    [0m | [0m527.5    [0m | [0m6.332    [0m | [0m14

In [131]:
# 최적화된 하이퍼파라미터로 모델 재학습
best_params = {
    'n_estimators': round(optimizer.max['params']['n_estimators']),
    'max_depth': round(optimizer.max['params']['max_depth']),
    'max_leaves': round(optimizer.max['params']['max_leaves']),
    'min_child_weight': round(optimizer.max['params']['min_child_weight']),
    'learning_rate': optimizer.max['params']['learning_rate'],
    'gamma': optimizer.max['params']['gamma'],
}

best_xgb = XGBRegressor(**best_params, random_state=42)
best_xgb.fit(X_train_scaled, y_train)

In [132]:
# validation set 예측 및 평가
y_train_pred = best_xgb.predict(X_train_scaled)
y_valid_pred = best_xgb.predict(X_val_scaled)

train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
valid_rmse = mean_squared_error(y_val, y_valid_pred, squared=False)

train_rmse, valid_rmse

(10846.331588737927, 21985.301592769203)

In [133]:
# 모델의 결정계수 확인
from sklearn.metrics import r2_score

r2_score(y_val, y_valid_pred)

0.9006237973190354

### LightGBM

In [134]:
# 베이지안 최적화
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

param_bounds = {
    'n_estimators': (100, 1000),
    'max_depth': (3, 15),
    'num_leaves': (20, 300),
    'min_child_samples': (10, 30),
    'learning_rate': (0.001, 0.5),
}

def eval_function(n_estimators, max_depth, num_leaves, min_child_samples, learning_rate):
    params = {
        'n_estimators': int(n_estimators),
        'max_depth': int(max_depth),
        'num_leaves': int(num_leaves),
        'min_child_samples': int(min_child_samples),
        'learning_rate': learning_rate,
    }
    lgbm = LGBMRegressor(**params, metric='RMSE', random_state=42, verbose=0)

    lgbm.fit(X_train_scaled, y_train)
    y_pred = lgbm.predict(X_val_scaled)
    valid_rmse = mean_squared_error(y_val, y_pred, squared=False)
    return -valid_rmse

In [135]:
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(f=eval_function, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=5, n_iter=50)

|   iter    |  target   | learni... | max_depth | min_ch... | n_esti... | num_le... |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m-2.568e+0[0m | [0m0.1879   [0m | [0m14.41    [0m | [0m24.64    [0m | [0m638.8    [0m | [0m63.69    [0m |
| [95m2        [0m | [95m-2.381e+0[0m | [95m0.07884  [0m | [95m3.697    [0m | [95m27.32    [0m | [95m641.0    [0m | [95m218.3    [0m |
| [95m3        [0m | [95m-2.166e+0[0m | [95m0.01127  [0m | [95m14.64    [0m | [95m26.65    [0m | [95m291.1    [0m | [95m70.91    [0m |
| [0m4        [0m | [0m-2.576e+0[0m | [0m0.09252  [0m | [0m6.651    [0m | [0m20.5     [0m | [0m488.8    [0m | [0m101.5    [0m |
| [0m5        [0m | [0m-2.484e+0[0m | [0m0.3063   [0m | [0m4.674    [0m | [0m15.84    [0m | [0m429.7    [0m | [0m147.7    [0m |
| [0m6        [0m | [0m-2.392e+0[0m | [0m0.1853   [0m | [0m12.21    [0m | [0m23.69    [0m | [

In [136]:
# 최적화된 하이퍼파라미터로 모델 재학습
best_params = {
    'n_estimators': round(optimizer.max['params']['n_estimators']),
    'max_depth': round(optimizer.max['params']['max_depth']),
    'num_leaves': round(optimizer.max['params']['num_leaves']),
    'min_child_samples': round(optimizer.max['params']['min_child_samples']),
    'learning_rate': optimizer.max['params']['learning_rate'],
}

best_lgbm = LGBMRegressor(**best_params, metric='RMSE', random_state=42, verbose=0)
best_lgbm.fit(X_train_scaled, y_train)



In [137]:
# validation set 예측 및 평가
y_train_pred = best_lgbm.predict(X_train_scaled)
y_valid_pred = best_lgbm.predict(X_val_scaled)

train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
valid_rmse = mean_squared_error(y_val, y_valid_pred, squared=False)

train_rmse, valid_rmse

(24634.186823987216, 22200.78518334919)

In [138]:
# 모델의 결정계수 확인
from sklearn.metrics import r2_score

r2_score(y_val, y_valid_pred)

0.8986662273807516

# 결과 제출

In [144]:
# submission 파일 생성
submission_origin = pd.read_csv(file_path+'\submission.csv')
submission = submission_origin.copy()

In [145]:
# # 최종 모델(XGBRegressor)로 test set 예측
# xgb_pred = best_xgb.predict(test_scaled)
# lgbm_pred = best_lgbm.predict(test_scaled)

# submission['SalePrice'] = xgb_pred
# submission.to_csv('test_submission1.csv', index=False)

# submission['SalePrice'] = lgbm_pred
# submission.to_csv('test_submission2.csv', index=False)