# Baseline_modeling (baseline_model.csv)
> Public Score: 27086.90876

In [174]:
import sys, os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [175]:
# 데이터 경로 설정
file_path = os.path.join(os.getcwd(), 'data')
train_origin = pd.read_csv(file_path+'\dataset.csv')
test_origin = pd.read_csv(file_path+'\problem.csv')

In [176]:
train_origin.shape, test_origin.shape

((1340, 24), (130, 23))

In [177]:
# validation set 분리
from sklearn.model_selection import train_test_split

train, valid = train_test_split(train_origin, test_size=0.1, random_state=42)

In [178]:
train.shape, valid.shape

((1206, 24), (134, 24))

In [179]:
test = test_origin.copy()   # test data 복사

In [180]:
# 메모리를 효율적으로 사용하기 위한 downcast 함수 정의
def downcast(df, verbose=True):     # verbose 옵션 추가: (True)인 경우 몇 퍼센트 압축됐는지 출력
    start_mem = df.memory_usage().sum() / 1024**2   # 초기 메모리 사용량
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            pass
        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col].round() == df[col]).all():
            df[col] = pd.to_numeric(df[col], downcast='integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        print(f'{(100*(start_mem - end_mem) / start_mem):.1f}% 압축됨')

    return df

In [181]:
downcast(train)
downcast(valid)
downcast(test)

64.5% 압축됨
64.5% 압축됨
68.7% 압축됨


Unnamed: 0,Id,LotArea,Street,LotConfig,OverallQual,OverallCond,YearBuilt,YearRemodAdd,TotalBsmtSF,1stFlrSF,...,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageType,GarageYrBlt,GarageCars,GarageArea,YrSold
0,1341,11200,Pave,Inside,5,5,1965,1965,1040,1040,...,0,3,1,5,0,Detchd,1965,1,384,2008
1,1342,7200,Pave,Corner,5,7,1951,2000,900,900,...,0,3,1,5,0,Detchd,2005,2,576,2010
2,1343,16905,Pave,Inside,5,6,1959,1959,1350,1328,...,1,2,1,5,2,Attchd,1959,1,308,2007
3,1344,9180,Pave,CulDSac,5,7,1983,1983,840,884,...,0,2,1,5,0,Attchd,1983,2,504,2007
4,1345,7200,Pave,Inside,5,7,1920,1996,530,581,...,0,3,1,6,0,Detchd,1935,1,288,2007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,1466,11478,Pave,Inside,8,5,2007,2008,1704,1704,...,0,3,1,7,1,Attchd,2008,3,772,2010
126,1467,16321,Pave,CulDSac,5,6,1957,1997,1484,1600,...,0,2,1,6,2,Attchd,1957,1,319,2006
127,1468,6324,Pave,Inside,4,6,1927,1950,520,520,...,0,1,1,4,0,Detchd,1920,1,240,2008
128,1469,8500,Pave,Inside,4,4,1920,1950,649,649,...,0,3,1,6,0,Detchd,1920,1,250,2008


# 전처리

In [182]:
# 일부 column 제거(Id, GarageCars, GarageYrBlt, TotRmsAbvGrd, TotalBsmtSF)
train = train.drop(columns=['Id', 'GarageCars', 'GarageYrBlt', 'TotRmsAbvGrd', 'TotalBsmtSF'])
valid = valid.drop(columns=['Id', 'GarageCars', 'GarageYrBlt', 'TotRmsAbvGrd', 'TotalBsmtSF'])
test = test.drop(columns=['Id', 'GarageCars', 'GarageYrBlt', 'TotRmsAbvGrd', 'TotalBsmtSF'])

In [183]:
train.duplicated().sum()

# 중복 데이터 제거
train = train.drop_duplicates()

In [184]:
# X, y 분리
X_train = train.drop(columns='SalePrice', axis=1)
y_train = train['SalePrice']

X_valid = valid.drop(columns='SalePrice', axis=1)
y_valid = valid['SalePrice']

In [185]:
X_train.isnull().sum()

LotArea          0
Street           0
LotConfig        0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
1stFlrSF         0
2ndFlrSF         0
GrLivArea        0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
Fireplaces       0
GarageType      73
GarageArea       0
YrSold           0
dtype: int64

In [186]:
# 결측치 최빈값 처리
train_mode = X_train['GarageType'].mode()[0]
X_train['GarageType'] = X_train['GarageType'].fillna(train_mode)
X_valid['GarageType'] = X_valid['GarageType'].fillna(train_mode)

In [187]:
X_train.isnull().sum().sum(), X_valid.isnull().sum().sum(), test.isnull().sum().sum()

(0, 0, 0)

In [188]:
X_train.shape, X_valid.shape, test.shape

((1205, 18), (134, 18), (130, 18))

In [189]:
# 범주형 변수 OHE
categorical = X_train.select_dtypes(include='object').columns

all_data = pd.concat([X_train, X_valid, test], sort=False)

all_data = pd.get_dummies(all_data, columns=categorical)

X_train_encoded = all_data[:len(X_train)]
X_valid_encoded = all_data[len(X_train):-len(test)]
test_encoded = all_data[-len(test):]

In [190]:
X_train_encoded.shape, X_valid_encoded.shape, test_encoded.shape

((1205, 28), (134, 28), (130, 28))

In [191]:
X_train_encoded.head()

Unnamed: 0,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,1stFlrSF,2ndFlrSF,GrLivArea,FullBath,HalfBath,...,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd
196,15138,8,5,1995,1996,1490,1304,2794,2,1,...,False,False,False,True,False,True,False,False,False,False
447,2308,6,5,1974,1974,855,467,1322,2,1,...,False,False,False,True,False,True,False,False,False,False
339,53107,6,5,1992,1992,1079,874,1953,2,1,...,False,False,False,False,False,True,False,False,False,False
464,11988,6,6,1957,1957,1244,0,1244,1,1,...,False,False,False,False,False,True,False,False,False,False
1190,17871,6,5,1967,1976,1724,0,1724,1,1,...,True,False,False,False,False,True,False,False,False,False


In [192]:
# # 수치형 변수들 Standardization
from sklearn.preprocessing import StandardScaler

#numerical = X_train.select_dtypes(exclude='object').columns

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_valid_scaled = scaler.transform(X_valid_encoded)
test_scaled = scaler.transform(test_encoded)

In [193]:
X_train_scaled.shape

(1205, 28)

In [194]:
X_train_scaled.shape

(1205, 28)

In [195]:
X_train_scaled.shape, y_train.shape

((1205, 28), (1205,))

# Model training

### Lasso

In [235]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

lasso = Lasso(alpha=0.0005, random_state=42)
lasso.fit(X_train_scaled, y_train)

  model = cd_fast.enet_coordinate_descent(


In [236]:
# validation set 예측 및 평가
y_train_pred = lasso.predict(X_train_scaled)
y_valid_pred = lasso.predict(X_valid_scaled)

train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
valid_rmse = mean_squared_error(y_valid, y_valid_pred, squared=False)

train_rmse, valid_rmse

(36765.52039384454, 29699.20903511644)

### XGBoost

In [255]:
# 베이지안 최적화
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

param_bounds = {
    'learning_rate' : (0.001, 0.5),
    'max_leaves': (2, 1024),
    'n_estimators': (100, 1000),
    'gamma': (0, 10),
    'max_depth': (3, 15),
    'min_child_weight': (1, 10),
    }

def eval_function(max_leaves, learning_rate, n_estimators, gamma, max_depth, min_child_weight):
    params = {
        'learning_rate' : learning_rate, 
        'max_leaves': int(max_leaves),
        'n_estimators': int(n_estimators),
        'gamma': gamma,
        'max_depth': int(max_depth),
        'min_child_weight': min_child_weight,
    }

    xgb_model = XGBRegressor(**params, random_state=42)
    xgb_model.fit(X_train_scaled, y_train)
    y_pred = xgb_model.predict(X_valid_scaled)
    valid_rmse = mean_squared_error(y_valid, y_pred, squared=False)
    return -valid_rmse

In [256]:
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(f=eval_function, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=5, n_iter=50)

|   iter    |  target   |   gamma   | learni... | max_depth | max_le... | min_ch... | n_esti... |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m-2.612e+0[0m | [0m3.745    [0m | [0m0.4754   [0m | [0m11.78    [0m | [0m613.8    [0m | [0m2.404    [0m | [0m240.4    [0m |
| [0m2        [0m | [0m-2.703e+0[0m | [0m0.5808   [0m | [0m0.4332   [0m | [0m10.21    [0m | [0m725.7    [0m | [0m1.185    [0m | [0m972.9    [0m |
| [95m3        [0m | [95m-2.371e+0[0m | [95m8.324    [0m | [95m0.107    [0m | [95m5.182    [0m | [95m189.4    [0m | [95m3.738    [0m | [95m572.3    [0m |
| [0m4        [0m | [0m-2.549e+0[0m | [0m4.319    [0m | [0m0.1463   [0m | [0m10.34    [0m | [0m144.6    [0m | [0m3.629    [0m | [0m429.7    [0m |
| [0m5        [0m | [0m-2.57e+04[0m | [0m4.561    [0m | [0m0.3928   [0m | [0m5.396    [0m | [0m527.5    [0m | [0m6.332    [0m | [0m14

In [257]:
# 최적화된 하이퍼파라미터로 모델 재학습
best_params = {
    'n_estimators': round(optimizer.max['params']['n_estimators']),
    'max_depth': round(optimizer.max['params']['max_depth']),
    'max_leaves': round(optimizer.max['params']['max_leaves']),
    'min_child_weight': round(optimizer.max['params']['min_child_weight']),
    'learning_rate': optimizer.max['params']['learning_rate'],
    'gamma': optimizer.max['params']['gamma'],
}

best_xgb = XGBRegressor(**best_params, random_state=42)
best_xgb.fit(X_train_scaled, y_train)

In [258]:
# validation set 예측 및 평가
y_train_pred = best_xgb.predict(X_train_scaled)
y_valid_pred = best_xgb.predict(X_valid_scaled)

train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
valid_rmse = mean_squared_error(y_valid, y_valid_pred, squared=False)

train_rmse, valid_rmse

(467.0642663695416, 22815.096664588735)

In [259]:
# 모델의 결정계수 확인
from sklearn.metrics import r2_score

r2_score(y_valid, y_valid_pred)

0.8929806845324563

### LightGBM

In [222]:
# 베이지안 최적화
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

param_bounds = {
    'n_estimators': (100, 1000),
    'max_depth': (3, 15),
    'num_leaves': (20, 300),
    'min_child_samples': (10, 30),
    'learning_rate': (0.001, 0.5),
}

def eval_function(n_estimators, max_depth, num_leaves, min_child_samples, learning_rate):
    params = {
        'n_estimators': int(n_estimators),
        'max_depth': int(max_depth),
        'num_leaves': int(num_leaves),
        'min_child_samples': int(min_child_samples),
        'learning_rate': learning_rate,
    }
    lgbm = LGBMRegressor(**params, metric='RMSE', random_state=42, verbose=0)

    lgbm.fit(X_train_scaled, y_train)
    y_pred = lgbm.predict(X_valid_scaled)
    valid_rmse = mean_squared_error(y_valid, y_pred, squared=False)
    return -valid_rmse

In [223]:
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(f=eval_function, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=5, n_iter=50)

|   iter    |  target   | learni... | max_depth | min_ch... | n_esti... | num_le... |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m-2.484e+0[0m | [0m0.1879   [0m | [0m14.41    [0m | [0m24.64    [0m | [0m638.8    [0m | [0m63.69    [0m |
| [95m2        [0m | [95m-2.343e+0[0m | [95m0.07884  [0m | [95m3.697    [0m | [95m27.32    [0m | [95m641.0    [0m | [95m218.3    [0m |
| [0m3        [0m | [0m-2.374e+0[0m | [0m0.01127  [0m | [0m14.64    [0m | [0m26.65    [0m | [0m291.1    [0m | [0m70.91    [0m |
| [0m4        [0m | [0m-2.617e+0[0m | [0m0.09252  [0m | [0m6.651    [0m | [0m20.5     [0m | [0m488.8    [0m | [0m101.5    [0m |
| [0m5        [0m | [0m-2.64e+04[0m | [0m0.3063   [0m | [0m4.674    [0m | [0m15.84    [0m | [0m429.7    [0m | [0m147.7    [0m |
| [0m6        [0m | [0m-2.565e+0[0m | [0m0.1916   [0m | [0m4.61     [0m | [0m19.56    [0m | [0m648.0

In [224]:
# 최적화된 하이퍼파라미터로 모델 재학습
best_params = {
    'n_estimators': round(optimizer.max['params']['n_estimators']),
    'max_depth': round(optimizer.max['params']['max_depth']),
    'num_leaves': round(optimizer.max['params']['num_leaves']),
    'min_child_samples': round(optimizer.max['params']['min_child_samples']),
    'learning_rate': optimizer.max['params']['learning_rate'],
}

best_lgbm = LGBMRegressor(**best_params, metric='RMSE', random_state=42, verbose=0)
best_lgbm.fit(X_train_scaled, y_train)

In [225]:
# validation set 예측 및 평가
y_train_pred = best_lgbm.predict(X_train_scaled)
y_valid_pred = best_lgbm.predict(X_valid_scaled)

train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
valid_rmse = mean_squared_error(y_valid, y_valid_pred, squared=False)

train_rmse, valid_rmse

(2867.2336439159444, 24445.91374360593)

### SVR

In [239]:
# svr
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

param_bounds = {
    'C': (0.1, 10),
    'gamma': (0.1, 10),
    }

def eval_function(C, gamma):

    svr = SVR(kernel='rbf', C=C, gamma=gamma)
    svr.fit(X_train_scaled, y_train)
    y_pred = svr.predict(X_valid_scaled)
    valid_rmse = mean_squared_error(y_valid, y_pred, squared=False)
    return -valid_rmse

In [240]:
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(f=eval_function, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=5, n_iter=50)

|   iter    |  target   |     C     |   gamma   |
-------------------------------------------------
| [0m1        [0m | [0m-7.02e+04[0m | [0m3.808    [0m | [0m9.512    [0m |
| [95m2        [0m | [95m-7.02e+04[0m | [95m7.347    [0m | [95m6.027    [0m |
| [0m3        [0m | [0m-7.02e+04[0m | [0m1.645    [0m | [0m1.644    [0m |
| [0m4        [0m | [0m-7.02e+04[0m | [0m0.675    [0m | [0m8.675    [0m |
| [0m5        [0m | [0m-7.02e+04[0m | [0m6.051    [0m | [0m7.11     [0m |
| [95m6        [0m | [95m-7.02e+04[0m | [95m8.587    [0m | [95m1.981    [0m |
| [95m7        [0m | [95m-6.995e+0[0m | [95m10.0     [0m | [95m0.1      [0m |
| [0m8        [0m | [0m-7.018e+0[0m | [0m9.929    [0m | [0m0.5254   [0m |
| [0m9        [0m | [0m-7.02e+04[0m | [0m0.1559   [0m | [0m4.691    [0m |
| [0m10       [0m | [0m-7.02e+04[0m | [0m6.709    [0m | [0m6.458    [0m |
| [0m11       [0m | [0m-7.02e+04[0m | [0m0.4215   [0m | [0m2.

In [242]:
# 최적화된 하이퍼파라미터로 모델 재학습
best_params = optimizer.max['params']

best_svr = SVR(**best_params, kernel='rbf')
best_svr.fit(X_train_scaled, y_train)

In [243]:
# validation set 예측 및 평가
y_train_pred = best_svr.predict(X_train_scaled)
y_valid_pred = best_svr.predict(X_valid_scaled)

train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
valid_rmse = mean_squared_error(y_valid, y_valid_pred, squared=False)

train_rmse, valid_rmse

(80617.07110639139, 69945.50159339218)

# 결과 제출

In [245]:
# 최종 모델(XGBRegressor)로 test set 예측
test_pred = best_xgb.predict(test_scaled)
test_pred

array([132941.92 , 123052.84 , 152929.67 , 130510.01 ,  98148.12 ,
       170934.03 , 202057.88 ,  83002.945, 232649.94 , 139623.98 ,
       208249.11 , 148903.05 , 267677.03 , 106831.47 , 293492.66 ,
       203468.89 , 222260.3  , 178613.6  , 154359.06 , 208637.66 ,
       151145.6  , 429144.28 , 109550.56 , 369975.03 , 242341.   ,
       245138.42 , 180324.77 , 233111.   , 214289.83 , 195294.52 ,
       182666.7  , 220972.06 , 338179.38 , 213143.97 , 216900.95 ,
       160156.44 , 146456.27 , 166782.42 , 177528.83 , 142751.97 ,
       158735.58 , 295022.75 , 286409.97 , 107031.06 , 235221.3  ,
       179800.7  , 167392.75 , 194821.86 , 195435.7  , 139130.52 ,
       316262.25 , 199738.94 , 202994.53 , 121208.125,  94726.05 ,
       248067.14 , 670864.9  , 134384.61 , 187254.36 , 104968.87 ,
       195271.22 , 143478.27 , 162493.73 , 142304.94 , 181751.8  ,
       225331.16 , 174996.   , 183069.83 , 153104.   , 121217.914,
       223515.22 , 131296.58 , 152384.53 , 160896.69 , 270278.

In [251]:
# submission 파일 생성
submission_origin = pd.read_csv(file_path+'\submission.csv')
submission = submission_origin.copy()
submission['SalePrice'] = test_pred
submission.to_csv('baseline_model.csv', index=False)
