In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_colwidth', 30)

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
feature_description = pd.read_excel('data_description_Kor.xlsx')

In [3]:
train_test_data = pd.concat([train_data, test_data])

In [4]:
object_columns = train_test_data.dtypes[train_test_data.dtypes == object].index.tolist()

In [5]:
for column in object_columns:
    train_test_data[column] = pd.factorize(train_test_data[column])[0]

In [6]:
def feature_processed(train_test_data):
    train_test_data['Lot_GrLiv_Area_Sum'] = train_test_data[['LotArea', 'GrLivArea']].sum(axis=1)
    train_test_data['Lot_GrLiv_Area_Ratio'] = train_test_data['GrLivArea'] / train_test_data['LotArea']
    
    train_test_data['GrLiv_Bsmt_Area_Sum'] = train_test_data[['GrLivArea', 'TotalBsmtSF']].sum(axis=1)
    train_test_data['GrLiv_Bsmt_Area_Ratio'] = train_test_data['TotalBsmtSF'] / train_test_data['GrLivArea']
    
    train_test_data['1st_2nd_Area_Sum'] = train_test_data[['1stFlrSF', '2ndFlrSF']].sum(axis=1)
    train_test_data['1st_2nd_Area_Mean'] = train_test_data[['1stFlrSF', '2ndFlrSF']].mean(axis=1)
    train_test_data['1st_2nd_Area_Std'] = train_test_data[['1stFlrSF', '2ndFlrSF']].std(axis=1)
    train_test_data['1st_2nd_Area_Std'] = train_test_data['1st_2nd_Area_Std'].fillna(train_test_data['1st_2nd_Area_Std'].mean())
    
    train_test_data['Built_Remod_Year_Mean'] = train_test_data[['YearBuilt', 'YearRemodAdd']].mean(axis=1)
    
    train_test_data['Total_Bsmtfin_Mean'] = train_test_data[['BsmtFinSF1', 'BsmtFinSF2']].mean(axis=1)
    
    train_test_data['BsmtUnf_Ratio'] = train_test_data['BsmtUnfSF'] / train_test_data['TotalBsmtSF']
    
    train_test_data['Porch_All_Sum'] = train_test_data[['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']].sum(axis=1)
    train_test_data['Porch_All_Mean'] = train_test_data[['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']].mean(axis=1)
    train_test_data['Porch_All_Std'] = train_test_data[['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']].std(axis=1)
    train_test_data['Porch_All_Std'] = train_test_data['Porch_All_Std'].fillna(train_test_data['Porch_All_Std'].mean())
    
    train_test_data['Qual_All_Sum'] = train_test_data[['ExterQual', 'BsmtQual', 'KitchenQual', 'GarageQual']].sum(axis=1)
    train_test_data['Qual_All_Mean'] = train_test_data[['ExterQual', 'BsmtQual', 'KitchenQual', 'GarageQual']].mean(axis=1)
    train_test_data['Qual_All_Std'] = train_test_data[['ExterQual', 'BsmtQual', 'KitchenQual', 'GarageQual']].std(axis=1)
    train_test_data['Qual_All_Std'] = train_test_data['Qual_All_Std'].fillna(train_test_data['Qual_All_Std'].mean())
    
    train_test_data['Cond_All_Sum'] = train_test_data[['ExterCond', 'BsmtCond', 'GarageCond']].sum(axis=1)
    train_test_data['Cond_All_Mean'] = train_test_data[['ExterCond', 'BsmtCond', 'GarageCond']].mean(axis=1)
    train_test_data['Cond_All_Std'] = train_test_data[['ExterCond', 'BsmtCond', 'GarageCond']].std(axis=1)
    train_test_data['Cond_All_Std'] = train_test_data['Cond_All_Std'].fillna(train_test_data['Cond_All_Std'].mean())
    
    train_test_data['Room_Kitchen_Sum'] = train_test_data[['BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd']].sum(axis=1)
    
    return train_test_data

In [7]:
train_test_data = feature_processed(train_test_data)

In [8]:
train_data = train_test_data[~train_test_data['SalePrice'].isnull()]
test_data = train_test_data[train_test_data['SalePrice'].isnull()]
test_data = test_data.drop('SalePrice', axis=1)

In [9]:
from sklearn.model_selection import train_test_split

ftr_train = train_data.drop(['Id', 'SalePrice'], axis=1)
saleprice_train = train_data['SalePrice']
train_x, valid_x, train_y, valid_y = train_test_split(ftr_train, saleprice_train, test_size=0.3, random_state=2022)

In [10]:
from bayes_opt import BayesianOptimization
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

In [11]:
bayesian_params = {
    'max_depth': (6, 16),
    'num_leaves': (12, 48),
    'min_child_samples': (10, 100),
    'min_child_weight': (1, 50),
    'subsample': (0.5, 1),
    'colsample_bytree': (0.5, 1),
    'max_bin': (10, 500),
    'reg_lambda': (0.001, 10),
    'reg_alpha': (0.01, 50)
}

In [12]:
def lgb_eval(max_depth, num_leaves, min_child_samples, min_child_weight, subsample, colsample_bytree,
                max_bin, reg_lambda, reg_alpha):
    params = {
        'n_estimators': 300,
        'learning_rate': 0.02,
        'max_depth': int(round(max_depth)),
        'num_leaves': int(round(num_leaves)),
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample': max(min(subsample, 1), 0),
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'max_bin': max(int(round(max_bin)), 10),
        'reg_lambda': max(reg_lambda, 0),
        'reg_alpha': max(reg_alpha, 0)
    }
    lgb_model = LGBMRegressor(**params)
    lgb_model.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric='mean_squared_error',
                  verbose='50', early_stopping_rounds=50)
    valid_predict = lgb_model.predict(valid_x)
    mse = mean_squared_error(valid_y, valid_predict)
    
    return mse

In [None]:
lgbr = BayesianOptimization(lgb_eval, bayesian_params, random_state=2022)
lgbr.maximize(init_points=5, n_iter=25)

|   iter    |  target   | colsam... |  max_bin  | max_depth | min_ch... | min_ch... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
[50]	training's l2: 1.92468e+09	valid_1's l2: 2.15239e+09
[100]	training's l2: 9.46171e+08	valid_1's l2: 1.23417e+09
[150]	training's l2: 6.77562e+08	valid_1's l2: 9.71967e+08
[200]	training's l2: 5.84184e+08	valid_1's l2: 8.73196e+08
[250]	training's l2: 5.16122e+08	valid_1's l2: 8.13847e+08
[300]	training's l2: 4.63659e+08	valid_1's l2: 7.78425e+08
| [0m 1       [0m | [0m 7.784e+0[0m | [0m 0.5047  [0m | [0m 254.5   [0m | [0m 7.134   [0m | [0m 14.5    [0m | [0m 34.58   [0m | [0m 29.53   [0m | [0m 44.88   [0m | [0m 6.475   [0m | [0m 0.9485  [0m |
[50]	training's l2: 2.20202e+09	valid_1's l2: 2.40872e+09
[100]	training's l2: 1.31991e+09	valid_1's l2: 1.62028e+09
[150]	training's l2: 1.04927e+09	valid_1

In [None]:
lgbr.res

In [None]:
target_list = []
for result in lgbr.res:
    target = result['target']
    target_list.append(target)
print(target_list)
print('maximum target index:', np.argmax(np.array(target_list)))

In [None]:
max_dict = lgbr.res[np.argmax(np.array(target_list))]
print(max_dict)

In [None]:
tuning_lgbr = LGBMRegressor(
    nthread=4,
    n_estimators=800,
    learning_rate=0.02,
    max_depth = 10,
    num_leaves=13,
    colsample_bytree=0.873,
    subsample=0.745,
    max_bin=458,
    reg_alpha=14.846,
    reg_lambda=3.637,
    min_child_weight=5,
    min_child_samples=96,
    silent=-1,
    verbose=-1,
)

tuning_lgbr.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'mean_squared_error',
                verbose= 100, early_stopping_rounds= 100)

In [None]:
preds = tuning_lgbr.predict(test_data.drop('Id', axis=1))
test_data['SalePrice'] = preds

In [None]:
test_data[['Id', 'SalePrice']].to_csv('House_prices_hyperparameter_tuning_03.csv', index=False)