In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_colwidth', 30)

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
feature_description = pd.read_excel('data_description_Kor.xlsx')

In [3]:
train_test_data = pd.concat([train_data, test_data])

In [4]:
object_columns = train_test_data.dtypes[train_test_data.dtypes == object].index.tolist()

In [5]:
for column in object_columns:
    train_test_data[column] = pd.factorize(train_test_data[column])[0]

In [6]:
def feature_processed(train_test_data):
    train_test_data['Lot_GrLiv_Area_Sum'] = train_test_data[['LotArea', 'GrLivArea']].sum(axis=1)
    train_test_data['Lot_GrLiv_Area_Ratio'] = train_test_data['GrLivArea'] / train_test_data['LotArea']
    
    train_test_data['GrLiv_Bsmt_Area_Sum'] = train_test_data[['GrLivArea', 'TotalBsmtSF']].sum(axis=1)
    train_test_data['GrLiv_Bsmt_Area_Ratio'] = train_test_data['TotalBsmtSF'] / train_test_data['GrLivArea']
    
    train_test_data['1st_2nd_Area_Sum'] = train_test_data[['1stFlrSF', '2ndFlrSF']].sum(axis=1)
    train_test_data['1st_2nd_Area_Mean'] = train_test_data[['1stFlrSF', '2ndFlrSF']].mean(axis=1)
    train_test_data['1st_2nd_Area_Std'] = train_test_data[['1stFlrSF', '2ndFlrSF']].std(axis=1)
    train_test_data['1st_2nd_Area_Std'] = train_test_data['1st_2nd_Area_Std'].fillna(train_test_data['1st_2nd_Area_Std'].mean())
    
    train_test_data['Built_Remod_Year_Mean'] = train_test_data[['YearBuilt', 'YearRemodAdd']].mean(axis=1)
    
    train_test_data['Total_Bsmtfin_Mean'] = train_test_data[['BsmtFinSF1', 'BsmtFinSF2']].mean(axis=1)
    
    train_test_data['BsmtUnf_Ratio'] = train_test_data['BsmtUnfSF'] / train_test_data['TotalBsmtSF']
    
    train_test_data['Porch_All_Sum'] = train_test_data[['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']].sum(axis=1)
    train_test_data['Porch_All_Mean'] = train_test_data[['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']].mean(axis=1)
    train_test_data['Porch_All_Std'] = train_test_data[['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']].std(axis=1)
    train_test_data['Porch_All_Std'] = train_test_data['Porch_All_Std'].fillna(train_test_data['Porch_All_Std'].mean())
    
    train_test_data['Qual_All_Sum'] = train_test_data[['ExterQual', 'BsmtQual', 'KitchenQual', 'GarageQual']].sum(axis=1)
    train_test_data['Qual_All_Mean'] = train_test_data[['ExterQual', 'BsmtQual', 'KitchenQual', 'GarageQual']].mean(axis=1)
    train_test_data['Qual_All_Std'] = train_test_data[['ExterQual', 'BsmtQual', 'KitchenQual', 'GarageQual']].std(axis=1)
    train_test_data['Qual_All_Std'] = train_test_data['Qual_All_Std'].fillna(train_test_data['Qual_All_Std'].mean())
    
    train_test_data['Cond_All_Sum'] = train_test_data[['ExterCond', 'BsmtCond', 'GarageCond']].sum(axis=1)
    train_test_data['Cond_All_Mean'] = train_test_data[['ExterCond', 'BsmtCond', 'GarageCond']].mean(axis=1)
    train_test_data['Cond_All_Std'] = train_test_data[['ExterCond', 'BsmtCond', 'GarageCond']].std(axis=1)
    train_test_data['Cond_All_Std'] = train_test_data['Cond_All_Std'].fillna(train_test_data['Cond_All_Std'].mean())
    
    train_test_data['Room_Kitchen_Sum'] = train_test_data[['BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd']].sum(axis=1)
    
    return train_test_data

In [7]:
train_test_data = feature_processed(train_test_data)

In [8]:
train_data = train_test_data[~train_test_data['SalePrice'].isnull()]
test_data = train_test_data[train_test_data['SalePrice'].isnull()]
test_data = test_data.drop('SalePrice', axis=1)

In [9]:
from sklearn.model_selection import train_test_split

ftr_train = train_data.drop(['Id', 'SalePrice'], axis=1)
saleprice_train = train_data['SalePrice']
train_x, valid_x, train_y, valid_y = train_test_split(ftr_train, saleprice_train, test_size=0.3, random_state=2022)

In [10]:
from bayes_opt import BayesianOptimization
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score

In [11]:
bayesian_params = {
    'max_depth': (6, 16),
    'num_leaves': (12, 48),
    'min_child_samples': (10, 100),
    'min_child_weight': (1, 50),
    'subsample': (0.5, 1),
    'colsample_bytree': (0.5, 1),
    'max_bin': (10, 500),
    'reg_lambda': (0.001, 10),
    'reg_alpha': (0.01, 50)
}

In [12]:
def lgb_eval(max_depth, num_leaves, min_child_samples, min_child_weight, subsample, colsample_bytree,
                max_bin, reg_lambda, reg_alpha):
    params = {
        'n_estimators': 300,
        'learning_rates': 0.02,
        'max_depth': int(round(max_depth)),
        'num_leaves': int(round(num_leaves)),
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample': max(min(subsample, 1), 0),
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'max_bin': max(int(round(max_bin)), 10),
        'reg_lambda': max(reg_lambda, 0),
        'reg_alpha': max(reg_alpha, 0)
    }
    lgb_model = LGBMRegressor(**params)
    lgb_model.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric='r2',
                  verbose='50', early_stopping_rounds=50)
    valid_predict = lgb_model.predict(valid_x)
    r2 = r2_score(valid_y, valid_predict)
    
    return r2

In [13]:
lgbr = BayesianOptimization(lgb_eval, bayesian_params, random_state=2022)
lgbr.maximize(init_points=5, n_iter=25)

|   iter    |  target   | colsam... |  max_bin  | max_depth | min_ch... | min_ch... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
[50]	training's l2: 5.34348e+08	valid_1's l2: 7.92211e+08
[100]	training's l2: 3.34831e+08	valid_1's l2: 7.02531e+08
[150]	training's l2: 2.29283e+08	valid_1's l2: 6.74106e+08
[200]	training's l2: 1.68338e+08	valid_1's l2: 6.70207e+08
| [0m 1       [0m | [0m 0.8928  [0m | [0m 0.5047  [0m | [0m 254.5   [0m | [0m 7.134   [0m | [0m 14.5    [0m | [0m 34.58   [0m | [0m 29.53   [0m | [0m 44.88   [0m | [0m 6.475   [0m | [0m 0.9485  [0m |
[50]	training's l2: 8.45521e+08	valid_1's l2: 1.20676e+09
[100]	training's l2: 6.2561e+08	valid_1's l2: 1.03852e+09
[150]	training's l2: 4.89509e+08	valid_1's l2: 9.78006e+08
[200]	training's l2: 3.9454e+08	valid_1's l2: 9.63748e+08
[250]	training's l2: 3.27484e+08	valid_1's

[50]	training's l2: 2.52994e+08	valid_1's l2: 6.35445e+08
[100]	training's l2: 1.29013e+08	valid_1's l2: 6.07048e+08
[150]	training's l2: 7.64688e+07	valid_1's l2: 5.98356e+08
[200]	training's l2: 4.67309e+07	valid_1's l2: 5.93384e+08
[250]	training's l2: 2.96679e+07	valid_1's l2: 5.94762e+08
| [95m 15      [0m | [95m 0.905   [0m | [95m 0.5     [0m | [95m 89.65   [0m | [95m 6.0     [0m | [95m 10.0    [0m | [95m 1.0     [0m | [95m 12.0    [0m | [95m 32.45   [0m | [95m 0.001   [0m | [95m 0.5     [0m |
[50]	training's l2: 4.95178e+08	valid_1's l2: 8.39329e+08
[100]	training's l2: 2.99465e+08	valid_1's l2: 7.43793e+08
[150]	training's l2: 1.97989e+08	valid_1's l2: 7.30705e+08
[200]	training's l2: 1.44744e+08	valid_1's l2: 7.26611e+08
| [0m 16      [0m | [0m 0.8838  [0m | [0m 0.9561  [0m | [0m 260.9   [0m | [0m 7.872   [0m | [0m 10.09   [0m | [0m 33.21   [0m | [0m 30.05   [0m | [0m 46.88   [0m | [0m 7.514   [0m | [0m 0.7638  [0m |
[50]	training's 

In [14]:
lgbr.res

[{'target': 0.8927939354340579,
  'params': {'colsample_bytree': 0.5046793069038824,
   'max_bin': 254.5383273174741,
   'max_depth': 7.133836899348352,
   'min_child_samples': 14.497661635986743,
   'min_child_weight': 34.58497211791149,
   'num_leaves': 29.531570455766293,
   'reg_alpha': 44.883884749899046,
   'reg_lambda': 6.47487325536192,
   'subsample': 0.9484815613954983}},
 {'target': 0.8499957207606696,
  'params': {'colsample_bytree': 0.8605674645823427,
   'max_bin': 417.36317618033297,
   'max_depth': 14.275680688218047,
   'min_child_samples': 85.02216256523265,
   'min_child_weight': 47.895172447477464,
   'num_leaves': 25.249599732179597,
   'reg_alpha': 24.746933117965533,
   'reg_lambda': 3.3957552364003205,
   'subsample': 0.8097146631224952}},
 {'target': 0.8794802395301293,
  'params': {'colsample_bytree': 0.9887648192106426,
   'max_bin': 57.252208042606135,
   'max_depth': 13.442062122546155,
   'min_child_samples': 36.32495266858435,
   'min_child_weight': 15.63

In [15]:
target_list = []
for result in lgbr.res:
    target = result['target']
    target_list.append(target)
print(target_list)
print('maximum target index:', np.argmax(np.array(target_list)))

[0.8927939354340579, 0.8499957207606696, 0.8794802395301293, 0.8725579916988855, 0.8432258716305929, 0.843431491502658, 0.8788984274271829, 0.8793508922172927, 0.8879135199983933, 0.8796202175031564, 0.8753898986613257, 0.8933674613184927, 0.8455776051963257, 0.8831954013904021, 0.9050437879082671, 0.8837902534089458, 0.889228648598289, 0.9003800261873215, 0.8852190879990316, 0.8947229897965121, 0.8778826213432521, 0.9050268295746249, 0.9035253239760407, 0.8884757860094065, 0.8823315233162305, 0.9005901766849579, 0.8905333426699937, 0.8983784660555117, 0.8909188809212794, 0.881009680096857]
maximum target index: 14


In [16]:
max_dict = lgbr.res[np.argmax(np.array(target_list))]
print(max_dict)

{'target': 0.9050437879082671, 'params': {'colsample_bytree': 0.5, 'max_bin': 89.6496779573514, 'max_depth': 6.0, 'min_child_samples': 10.0, 'min_child_weight': 1.0, 'num_leaves': 12.0, 'reg_alpha': 32.4533221024008, 'reg_lambda': 0.001, 'subsample': 0.5}}


In [17]:
tuning_lgbr = LGBMRegressor(
    njobs=-1,
    n_estimators=800,
    learning_rate=0.02,
    max_depth = 6,
    num_leaves=12,
    colsample_bytree=0.5,
    subsample=0.5,
    max_bin=89,
    reg_alpha=32.453,
    reg_lambda=0.001,
    min_child_weight=1,
    min_child_samples=10,
    silent=-1,
    verbose=-1,
)

tuning_lgbr.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'r2', verbose= 100, 
            early_stopping_rounds= 100)

[100]	training's l2: 6.85522e+08	valid_1's l2: 1.04538e+09
[200]	training's l2: 2.97023e+08	valid_1's l2: 7.16489e+08
[300]	training's l2: 2.02289e+08	valid_1's l2: 6.62427e+08
[400]	training's l2: 1.53459e+08	valid_1's l2: 6.55392e+08
[500]	training's l2: 1.19249e+08	valid_1's l2: 6.42184e+08
[600]	training's l2: 9.4327e+07	valid_1's l2: 6.3701e+08
[700]	training's l2: 7.62899e+07	valid_1's l2: 6.3427e+08
[800]	training's l2: 6.27109e+07	valid_1's l2: 6.32867e+08


LGBMRegressor(colsample_bytree=0.5, learning_rate=0.02, max_bin=89, max_depth=6,
              min_child_samples=10, min_child_weight=1, n_estimators=800,
              njobs=-1, num_leaves=12, reg_alpha=32.453, reg_lambda=0.001,
              silent=-1, subsample=0.5, verbose=-1)

In [18]:
preds = tuning_lgbr.predict(test_data.drop('Id', axis=1))
test_data['SalePrice'] = preds

In [19]:
test_data[['Id', 'SalePrice']].to_csv('House_prices_hyperparameter_tuning_01.csv', index=False)