In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Notebook-wide pandas display settings: row/column display limits and the
# maximum printed width of a single cell.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 300),
    ('display.max_colwidth', 30),
):
    pd.set_option(option_name, option_value)

In [2]:
# Read the train and test tables plus the data-description workbook from the
# current working directory.
train_data = pd.read_csv(filepath_or_buffer='train.csv')
test_data = pd.read_csv(filepath_or_buffer='test.csv')
feature_description = pd.read_excel(io='data_description_Kor.xlsx')

In [3]:
# Stack train and test rows into one frame so that encoding and feature
# engineering are applied identically to both.
# `ignore_index=True` gives the result a fresh 0..n-1 index. Without it each
# input keeps its own row labels, so labels repeat across the two halves and
# any later index-aligned assignment or label lookup becomes ambiguous.
# Rows coming from test_data are recognized later by their missing SalePrice.
train_test_data = pd.concat([train_data, test_data], ignore_index=True)

In [4]:
# Names of every column whose dtype is `object`; the next cell integer-encodes them.
object_columns = [
    column_name
    for column_name, column_dtype in train_test_data.dtypes.items()
    if column_dtype == object
]

In [5]:
# Replace each object column with the integer codes produced by pd.factorize.
# Codes are assigned in order of first appearance (not any domain ordering),
# and missing values receive the sentinel code -1.
for column in object_columns:
    codes, _uniques = pd.factorize(train_test_data[column])
    train_test_data[column] = codes

In [6]:
def _safe_ratio(numerator, denominator):
    """Element-wise numerator / denominator, with NaN wherever the denominator is 0."""
    return numerator / denominator.replace(0, np.nan)


def _add_row_stats(frame, prefix, columns, stats):
    """Add '<prefix>_<stat>' columns to `frame` (in place), in the order given by `stats`.

    'Sum' and 'Mean' are plain row-wise aggregates over `columns`. 'Std' is the
    row-wise standard deviation; rows where it is undefined (NaN) are filled
    with the mean of that new column.
    """
    block = frame[columns]
    for stat in stats:
        values = getattr(block, stat.lower())(axis=1)
        if stat == 'Std':
            values = values.fillna(values.mean())
        frame[prefix + '_' + stat] = values


def feature_processed(train_test_data):
    """Return a copy of `train_test_data` with engineered columns appended.

    The input frame is left untouched, so the cell calling this function can be
    re-run safely. New columns are appended in a fixed order:
    area sums/ratios, floor-area stats, year mean, finished-basement mean,
    unfinished-basement ratio, porch stats, quality stats, condition stats and
    a room count sum.

    Parameters
    ----------
    train_test_data : pandas.DataFrame
        Must contain the numeric source columns referenced below
        (e.g. 'LotArea', 'GrLivArea', 'TotalBsmtSF', '1stFlrSF', ...).

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus the engineered ones.
        Ratio features are NaN (never +/-inf) where the denominator is 0.

    Raises
    ------
    KeyError
        If any of the required source columns is missing.
    """
    data = train_test_data.copy()

    data['Lot_GrLiv_Area_Sum'] = data[['LotArea', 'GrLivArea']].sum(axis=1)
    data['Lot_GrLiv_Area_Ratio'] = _safe_ratio(data['GrLivArea'], data['LotArea'])

    data['GrLiv_Bsmt_Area_Sum'] = data[['GrLivArea', 'TotalBsmtSF']].sum(axis=1)
    data['GrLiv_Bsmt_Area_Ratio'] = _safe_ratio(data['TotalBsmtSF'], data['GrLivArea'])

    _add_row_stats(data, '1st_2nd_Area', ['1stFlrSF', '2ndFlrSF'], ('Sum', 'Mean', 'Std'))

    _add_row_stats(data, 'Built_Remod_Year', ['YearBuilt', 'YearRemodAdd'], ('Mean',))

    _add_row_stats(data, 'Total_Bsmtfin', ['BsmtFinSF1', 'BsmtFinSF2'], ('Mean',))

    # A house without a basement has TotalBsmtSF == 0; the ratio is NaN there.
    data['BsmtUnf_Ratio'] = _safe_ratio(data['BsmtUnfSF'], data['TotalBsmtSF'])

    _add_row_stats(
        data, 'Porch_All',
        ['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch'],
        ('Sum', 'Mean', 'Std'),
    )

    # NOTE(review): if the *Qual / *Cond columns reach this function as
    # pd.factorize codes (order of first appearance, -1 for missing), these
    # aggregates combine arbitrary codes rather than ordinal quality scores --
    # confirm, and consider an explicit ordinal mapping upstream.
    _add_row_stats(
        data, 'Qual_All',
        ['ExterQual', 'BsmtQual', 'KitchenQual', 'GarageQual'],
        ('Sum', 'Mean', 'Std'),
    )
    _add_row_stats(
        data, 'Cond_All',
        ['ExterCond', 'BsmtCond', 'GarageCond'],
        ('Sum', 'Mean', 'Std'),
    )

    data['Room_Kitchen_Sum'] = data[['BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd']].sum(axis=1)

    return data

In [7]:
# Run the feature engineering on the combined train+test frame and rebind the
# name to the function's return value.
train_test_data = feature_processed(train_test_data)

In [8]:
# Separate the combined frame again: rows that have a SalePrice are training
# rows, rows without one are test rows (and lose the empty SalePrice column).
has_sale_price = train_test_data['SalePrice'].notnull()
train_data = train_test_data[has_sale_price]
test_data = train_test_data[~has_sale_price].drop(columns='SalePrice')

In [9]:
from sklearn.model_selection import train_test_split

# Separate predictors from the target, then hold out 30% of the labelled rows
# for validation (fixed seed for a repeatable split).
saleprice_train = train_data['SalePrice']
ftr_train = train_data.drop(columns=['Id', 'SalePrice'])
train_x, valid_x, train_y, valid_y = train_test_split(
    ftr_train,
    saleprice_train,
    test_size=0.3,
    random_state=2022,
)

In [10]:
from bayes_opt import BayesianOptimization
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score

In [11]:
# Search space handed to BayesianOptimization: one (lower, upper) bound pair
# per tuned LightGBM hyperparameter.
bayesian_params = dict(
    max_depth=(6, 16),
    num_leaves=(12, 48),
    min_child_samples=(10, 100),
    min_child_weight=(1, 50),
    subsample=(0.5, 1),
    colsample_bytree=(0.5, 1),
    max_bin=(10, 500),
    reg_lambda=(0.001, 10),
    reg_alpha=(0.01, 50),
)

In [14]:
def lgb_eval(max_depth, num_leaves, min_child_samples, min_child_weight, subsample, colsample_bytree,
                max_bin, reg_lambda, reg_alpha):
    """Objective for the Bayesian search: validation R^2 of one LightGBM fit.

    The optimizer proposes continuous values for every argument; integer-valued
    hyperparameters are rounded and the remaining ones are clamped to their
    valid range before the model is built.

    Reads the module-level `train_x`, `train_y`, `valid_x` and `valid_y`.

    Returns
    -------
    float
        `r2_score` of the fitted model's predictions on the validation split.
    """
    params = {
        'n_estimators': 300,
        'learning_rate': 0.02,
        'max_depth': int(round(max_depth)),
        'num_leaves': int(round(num_leaves)),
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        # NOTE(review): LightGBM documents that row subsampling only takes
        # effect when a bagging frequency (`subsample_freq`) > 0 is also set;
        # none is set here, so this value may be a no-op -- confirm.
        'subsample': max(min(subsample, 1), 0),
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'max_bin': max(int(round(max_bin)), 10),
        'reg_lambda': max(reg_lambda, 0),
        'reg_alpha': max(reg_alpha, 0)
    }
    lgb_model = LGBMRegressor(**params)
    lgb_model.fit(
        train_x, train_y,
        eval_set=[(train_x, train_y), (valid_x, valid_y)],
        # 'r2' is not a LightGBM metric name; the training logs only ever
        # reported l2, so name the metric that actually drives early stopping.
        eval_metric='l2',
        # Logging period must be an int (was the string '50').
        verbose=50,
        early_stopping_rounds=50,
    )
    valid_predict = lgb_model.predict(valid_x)

    # R^2 is computed here, outside LightGBM, and is what the optimizer maximizes.
    return r2_score(valid_y, valid_predict)

In [15]:
# Maximize lgb_eval over the bounds in bayesian_params: 5 initial points
# followed by 25 optimization iterations, with a fixed seed.
lgbr = BayesianOptimization(f=lgb_eval, pbounds=bayesian_params, random_state=2022)
lgbr.maximize(init_points=5, n_iter=25)

|   iter    |  target   | colsam... |  max_bin  | max_depth | min_ch... | min_ch... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
[50]	training's l2: 1.92468e+09	valid_1's l2: 2.15239e+09
[100]	training's l2: 9.46171e+08	valid_1's l2: 1.23417e+09
[150]	training's l2: 6.77562e+08	valid_1's l2: 9.71967e+08
[200]	training's l2: 5.84184e+08	valid_1's l2: 8.73196e+08
[250]	training's l2: 5.16122e+08	valid_1's l2: 8.13847e+08
[300]	training's l2: 4.63659e+08	valid_1's l2: 7.78425e+08
| [0m 1       [0m | [0m 0.8753  [0m | [0m 0.5047  [0m | [0m 254.5   [0m | [0m 7.134   [0m | [0m 14.5    [0m | [0m 34.58   [0m | [0m 29.53   [0m | [0m 44.88   [0m | [0m 6.475   [0m | [0m 0.9485  [0m |
[50]	training's l2: 2.20202e+09	valid_1's l2: 2.40872e+09
[100]	training's l2: 1.31991e+09	valid_1's l2: 1.62028e+09
[150]	training's l2: 1.04927e+09	valid_1

[50]	training's l2: 1.9476e+09	valid_1's l2: 2.1484e+09
[100]	training's l2: 1.04627e+09	valid_1's l2: 1.31828e+09
[150]	training's l2: 7.9635e+08	valid_1's l2: 1.08719e+09
[200]	training's l2: 6.92944e+08	valid_1's l2: 1.00598e+09
[250]	training's l2: 6.23711e+08	valid_1's l2: 9.50512e+08
[300]	training's l2: 5.75631e+08	valid_1's l2: 9.14026e+08
| [0m 15      [0m | [0m 0.8536  [0m | [0m 0.7231  [0m | [0m 261.5   [0m | [0m 9.406   [0m | [0m 55.4    [0m | [0m 43.93   [0m | [0m 34.61   [0m | [0m 19.09   [0m | [0m 4.325   [0m | [0m 0.6294  [0m |
[50]	training's l2: 2.02385e+09	valid_1's l2: 2.23585e+09
[100]	training's l2: 1.01865e+09	valid_1's l2: 1.28312e+09
[150]	training's l2: 7.21121e+08	valid_1's l2: 9.91729e+08
[200]	training's l2: 6.15672e+08	valid_1's l2: 8.81009e+08
[250]	training's l2: 5.47077e+08	valid_1's l2: 8.20962e+08
[300]	training's l2: 4.91343e+08	valid_1's l2: 7.7886e+08
| [0m 16      [0m | [0m 0.8753  [0m | [0m 0.5276  [0m | [0m 68.61   

[250]	training's l2: 2.25721e+08	valid_1's l2: 6.60839e+08
[300]	training's l2: 1.89521e+08	valid_1's l2: 6.48232e+08
| [0m 29      [0m | [0m 0.8962  [0m | [0m 0.5     [0m | [0m 252.1   [0m | [0m 16.0    [0m | [0m 10.0    [0m | [0m 1.0     [0m | [0m 12.0    [0m | [0m 50.0    [0m | [0m 0.001   [0m | [0m 0.5     [0m |
[50]	training's l2: 1.31947e+09	valid_1's l2: 1.70934e+09
[100]	training's l2: 4.39332e+08	valid_1's l2: 9.24381e+08
[150]	training's l2: 2.14778e+08	valid_1's l2: 7.53693e+08
[200]	training's l2: 1.36389e+08	valid_1's l2: 7.08283e+08
[250]	training's l2: 9.88339e+07	valid_1's l2: 6.98915e+08
[300]	training's l2: 7.6037e+07	valid_1's l2: 6.9265e+08
| [0m 30      [0m | [0m 0.8891  [0m | [0m 0.5     [0m | [0m 284.8   [0m | [0m 16.0    [0m | [0m 10.0    [0m | [0m 1.0     [0m | [0m 48.0    [0m | [0m 50.0    [0m | [0m 0.001   [0m | [0m 0.5     [0m |


In [16]:
# Display every evaluated point: a list of {'target': ..., 'params': {...}}
# dicts, one per optimizer iteration.
lgbr.res

[{'target': 0.8753367545907617,
  'params': {'colsample_bytree': 0.5046793069038824,
   'max_bin': 254.5383273174741,
   'max_depth': 7.133836899348352,
   'min_child_samples': 14.497661635986743,
   'min_child_weight': 34.58497211791149,
   'num_leaves': 29.531570455766293,
   'reg_alpha': 44.883884749899046,
   'reg_lambda': 6.47487325536192,
   'subsample': 0.9484815613954983}},
 {'target': 0.8141349374596167,
  'params': {'colsample_bytree': 0.8605674645823427,
   'max_bin': 417.36317618033297,
   'max_depth': 14.275680688218047,
   'min_child_samples': 85.02216256523265,
   'min_child_weight': 47.895172447477464,
   'num_leaves': 25.249599732179597,
   'reg_alpha': 24.746933117965533,
   'reg_lambda': 3.3957552364003205,
   'subsample': 0.8097146631224952}},
 {'target': 0.8711105733131626,
  'params': {'colsample_bytree': 0.9887648192106426,
   'max_bin': 57.252208042606135,
   'max_depth': 13.442062122546155,
   'min_child_samples': 36.32495266858435,
   'min_child_weight': 15.63

In [17]:
# Gather the objective value of every evaluation and report the position of
# the best one.
target_list = [result['target'] for result in lgbr.res]
print(target_list)
print('maximum target index:', np.argmax(np.array(target_list)))

[0.8753367545907617, 0.8141349374596167, 0.8711105733131626, 0.8469367233692098, 0.8058959828855055, 0.8485951102683049, 0.8039893750255797, 0.8780776096904163, 0.8366386232574425, 0.8548755450653256, 0.8543638879785388, 0.8925560091161902, 0.8615065676927419, 0.8454156795597528, 0.8536204272487271, 0.8752671211550201, 0.877824052753378, 0.8953696996979894, 0.874250457864942, 0.8896457134215765, 0.8788686317776327, 0.8861969599961282, 0.8973098875346905, 0.8911236233802434, 0.8877963926392238, 0.8964641979823766, 0.8615461914501253, 0.873595816841304, 0.8961869460070122, 0.8890734893850425]
maximum target index: 22


In [18]:
# Fetch the full record (score and parameters) of the best evaluation.
best_position = np.argmax(np.array(target_list))
max_dict = lgbr.res[best_position]
print(max_dict)

{'target': 0.8973098875346905, 'params': {'colsample_bytree': 0.5, 'max_bin': 168.35021508395596, 'max_depth': 16.0, 'min_child_samples': 10.0, 'min_child_weight': 1.0, 'num_leaves': 12.0, 'reg_alpha': 50.0, 'reg_lambda': 0.001, 'subsample': 0.5}}


In [19]:
# Final model: reuse the best hyperparameters found by the search instead of
# hand-copying them from printed output, so a re-run of the search can never
# leave stale values here. Integer-valued parameters are rounded exactly as
# lgb_eval rounds them.
best_params = max_dict['params']

tuning_lgbr = LGBMRegressor(
    n_jobs=4,
    # More boosting rounds than during the search (300); early stopping below
    # still caps training if the validation loss stalls.
    n_estimators=800,
    learning_rate=0.02,
    max_depth=int(round(best_params['max_depth'])),
    num_leaves=int(round(best_params['num_leaves'])),
    colsample_bytree=best_params['colsample_bytree'],
    subsample=best_params['subsample'],
    max_bin=int(round(best_params['max_bin'])),
    reg_alpha=best_params['reg_alpha'],
    reg_lambda=best_params['reg_lambda'],
    min_child_weight=int(round(best_params['min_child_weight'])),
    min_child_samples=int(round(best_params['min_child_samples'])),
    # `verbose=-1` already silences LightGBM's own logging; the former
    # `silent=-1` was redundant.
    verbose=-1,
)

# 'r2' is not a LightGBM metric name (the logs only ever showed l2), so the
# metric that actually drives early stopping is named explicitly.
tuning_lgbr.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric='l2',
                verbose=100, early_stopping_rounds=100)

[100]	training's l2: 6.7649e+08	valid_1's l2: 1.00191e+09
[200]	training's l2: 2.90443e+08	valid_1's l2: 6.93659e+08
[300]	training's l2: 1.93882e+08	valid_1's l2: 6.4122e+08
[400]	training's l2: 1.45202e+08	valid_1's l2: 6.3357e+08
[500]	training's l2: 1.12874e+08	valid_1's l2: 6.30698e+08
[600]	training's l2: 8.91903e+07	valid_1's l2: 6.21481e+08
[700]	training's l2: 7.0379e+07	valid_1's l2: 6.1363e+08
[800]	training's l2: 5.72364e+07	valid_1's l2: 6.11042e+08


LGBMRegressor(colsample_bytree=0.5, learning_rate=0.02, max_bin=168,
              max_depth=16, min_child_samples=10, min_child_weight=1,
              n_estimators=800, nthread=4, num_leaves=12, reg_alpha=50,
              reg_lambda=0.001, silent=-1, subsample=0.5, verbose=-1)

In [20]:
# Predict on exactly the columns (and column order) the model was trained on.
# Selecting by train_x.columns keeps this cell re-runnable: after the first
# run test_data also holds 'SalePrice', which `drop('Id')` alone would have
# passed to the model as an extra feature.
preds = tuning_lgbr.predict(test_data[train_x.columns])
test_data['SalePrice'] = preds

In [21]:
# Write the two-column submission file (Id, predicted SalePrice) without the index.
submission = test_data[['Id', 'SalePrice']]
submission.to_csv('House_prices_hyperparameter_tuning_04.csv', index=False)