In [179]:
import numpy as np
import pandas as pd
pd.set_option('max_columns', 5000)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
from scipy.stats import norm, skew, probplot
from scipy.special import boxcox1p
import warnings
warnings.filterwarnings(action='ignore')

from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.kernel_ridge import KernelRidge as krr
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor as rfr, GradientBoostingRegressor as gbr
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingCVRegressor
from sklearn.cluster import DBSCAN

import optuna
from functools import partial

In [424]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

nominal_vars = [
    'MSZoning', 'LandContour', 'Utilities', 'LotConfig', 'Neighborhood', 'Condition1', 
    'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 
    'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'Electrical', 'GarageType', 
    'MiscFeature', 'SaleType', 'SaleCondition'
]
order_vars = [
    'MSSubClass', 'Street', 'Alley', 'LotShape', 'LandSlope', 'ExterQual', 
    'ExterCond', 'BsmtQual','BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'HeatingQC', 'CentralAir', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'Functional', 'GarageFinish', 'PavedDrive', 'MoSold', 'YrSold'
]

In [404]:
train_id = train.Id
test_id = test.Id
train.drop(['Id'], axis=1, inplace=True)
test.drop(['Id'], axis=1, inplace=True)

In [405]:
outlier_idx = train.GrLivArea.sort_values(ascending=False)[:2].index
outlier_idx = list(outlier_idx.values)
train.drop(outlier_idx, axis=0, inplace=True)

In [406]:
train_size = train.shape[0]
y_train = train.SalePrice.values
all_data = pd.concat((train, test), axis=0).reset_index(drop=True).drop(['SalePrice'], axis=1)

In [408]:
all_data.MSZoning = all_data.groupby('Neighborhood').MSZoning.transform(lambda x: x.fillna(x.mode()[0]))
all_data.LotFrontage = all_data.groupby('Neighborhood').LotFrontage.transform(lambda x: x.fillna(x.median()))
all_data.Exterior1st = all_data.groupby('Neighborhood').Exterior1st.transform(lambda x: x.fillna(x.mode()[0]))
all_data.Exterior2nd = all_data.groupby('Neighborhood').Exterior2nd.transform(lambda x: x.fillna(x.mode()[0]))

for c in ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
          'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
          'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 'KitchenQual']:
    all_data[c] = all_data[c].fillna('None')
    
for c in ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 
         'BsmtHalfBath', 'GarageYrBlt', 'GarageCars', 'GarageArea']:
    all_data[c] = all_data[c].fillna(0)   
    
for c in ['Electrical', 'BsmtFullBath', 'Utilities', 
          'SaleType', 'Functional']:
    all_data[c] = all_data[c].fillna(all_data[c].mode()[0])

In [409]:
# Complete
all_data.isnull().sum().sum()

0

In [410]:
nominal_vars = list(set(nominal_vars) - set(['Utilities']))

In [411]:
all_data.drop(['Utilities'], axis=1, inplace=True)

In [412]:
all_data[nominal_vars] = all_data[nominal_vars].astype(str)
all_data[ranking_vars] = all_data[ranking_vars].astype(str)

In [413]:
from sklearn.preprocessing import LabelEncoder

for c in ranking_vars:
    lbl = LabelEncoder() 
    lbl.fit(list(all_data[c].values)) 
    all_data[c] = lbl.transform(list(all_data[c].values))

In [414]:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head()

Unnamed: 0,Skew
MiscVal,21.939672
PoolArea,17.688664
LotArea,13.109495
LowQualFinSF,12.084539
3SsnPorch,11.37208


In [415]:
skewness = skewness[abs(skewness) > 0.75]

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], lam)

In [416]:
all_data = pd.get_dummies(all_data)
print(all_data.shape)

(2917, 219)


In [417]:
train.SalePrice = np.log1p(train.SalePrice)

In [418]:
X_train = all_data[:ntrain]
X_test = all_data[ntrain:]

In [419]:
n_folds = 5

def rmsle(model, x, y):
    return np.sqrt(mean_squared_error(y, model.predict(x)))

def rmsle_cv(model, x, y):
    #kf = KFold(n_splits=5, shuffle=True, random_state=42)
    kf = 5
    rmse =np.sqrt(-cross_val_score(model, x.values, y, scoring='neg_mean_squared_error', cv=kf))
    return rmse

In [420]:
pipe_lasso = Pipeline([
    ('scaler', RobustScaler()),
    ('model', Lasso())
])
pipe_enet = Pipeline([
    ('scaler', RobustScaler()),
    ('model', ElasticNet(max_iter=5000))
])
pipe_krr = Pipeline([
    ('scaler', RobustScaler()),
    ('model', krr())
])

grid_param_lasso = [{
    'model__alpha': 0.0001 * np.arange(1, 100)
}]
grid_param_enet = [{
    'model__alpha': 0.0001 * np.arange(1, 100),
    'model__l1_ratio': 0.001 * np.arange(1, 10)
}]
grid_param_krr = [{
    'model__alpha': [0.0001 * np.arange(1, 100)],
    'model__degree': [1, 2, 3],
    'model__alpha': [0.6],
    'model__kernel': ['polynomial'],
    'model__coef0': [2.5]
}]

In [421]:
best_params = {
    'Lasso': None,
    'ElasticNet': None,
    'Kernel Ridge': None
}

In [323]:
search_lasso = GridSearchCV(pipe_lasso, grid_param_lasso, scoring='neg_mean_squared_error', n_jobs=-1).fit(X_train, y_train)
best_params['Lasso'] = search_lasso.best_params_

In [324]:
search_enet = GridSearchCV(pipe_enet, grid_param_enet, scoring='neg_mean_squared_error', n_jobs=-1).fit(X_train, y_train)
best_params['ElasticNet'] = search_enet.best_params_

In [422]:
search_krr = GridSearchCV(pipe_krr, grid_param_krr, scoring='neg_mean_squared_error', n_jobs=-1).fit(X_train, y_train)
best_params['Kernel Ridge'] = search_krr.best_params_

In [423]:
best_params

{'Lasso': None,
 'ElasticNet': None,
 'Kernel Ridge': {'model__alpha': 0.6,
  'model__coef0': 2.5,
  'model__degree': 3,
  'model__kernel': 'polynomial'}}

In [220]:
outlier_idx = train.GrLivArea.sort_values(ascending=False)[:2].index
train.drop(outlier_idx, axis=0, inplace=True)

In [221]:
train_size = train.shape[0]
y_train = train.SalePrice.values
all_data = pd.concat((train, test), axis=0).reset_index(drop=True).drop(['SalePrice'], axis=1)

In [255]:
# specific value replacements
all_data["PoolQC"] = all_data["PoolQC"].fillna("None")
all_data["MiscFeature"] = all_data["MiscFeature"].fillna("None")
all_data["Alley"] = all_data["Alley"].fillna("None")
all_data["Fence"] = all_data["Fence"].fillna("None")
all_data["FireplaceQu"] = all_data["FireplaceQu"].fillna("None")

for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    all_data[col] = all_data[col].fillna('None')

for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    all_data[col] = all_data[col].fillna(0)
    
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    all_data[col] = all_data[col].fillna(0)
    
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    all_data[col] = all_data[col].fillna('None')

all_data["MasVnrType"] = all_data["MasVnrType"].fillna("None")
all_data["MasVnrArea"] = all_data["MasVnrArea"].fillna(0)
all_data["Functional"] = all_data["Functional"].fillna("Typ")
all_data['MSSubClass'] = all_data['MSSubClass'].fillna("None")

In [256]:
# mean value replacements
all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])
all_data['Electrical'] = all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])
all_data['KitchenQual'] = all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])
all_data['Exterior1st'] = all_data['Exterior1st'].fillna(all_data['Exterior1st'].mode()[0])
all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0])
all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])

all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))

In [257]:
all_data.drop(['Utilities'], axis=1, inplace=True)

In [222]:
#all_data.MSZoning = all_data.groupby('Neighborhood').MSZoning.transform(lambda x: x.fillna(x.mode()[0]))
all_data.LotFrontage = all_data.groupby('Neighborhood').LotFrontage.transform(lambda x: x.fillna(x.median()))
#all_data.Exterior1st = all_data.groupby('Neighborhood').Exterior1st.transform(lambda x: x.fillna(x.mode()[0]))
#all_data.Exterior2nd = all_data.groupby('Neighborhood').Exterior2nd.transform(lambda x: x.fillna(x.mode()[0]))

for c in ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
          'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
          'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 'KitchenQual']:
    all_data[c] = all_data[c].fillna('None')
    
for c in ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath',
         'BsmtHalfBath', 'GarageYrBlt', 'GarageCars', 'GarageArea']:
    all_data[c] = all_data[c].fillna(0)   
# 'MSZoning', 'Exterior1st', 'Exterior2nd', 
for c in ['MSZoning', 'Exterior1st', 'Exterior2nd','Electrical', 'Utilities',
          'SaleType', 'Functional']:
    all_data[c] = all_data[c].fillna(all_data[c].mode()[0])

In [258]:
# Complete
all_data.isnull().sum().sum()

0

In [259]:
all_data['HasMasVnr'] = all_data.MasVnrArea.apply(lambda x: 1 if x else 0)
all_data['Has2ndFlrSF'] = all_data['2ndFlrSF'].apply(lambda x: 1 if x else 0)
all_data['HasGarageArea'] = all_data['GarageArea'].apply(lambda x: 1 if x else 0)
all_data['HasWoodDeckSF'] = all_data['WoodDeckSF'].apply(lambda x: 1 if x else 0)
all_data['HasOpenPorchSF'] = all_data['OpenPorchSF'].apply(lambda x: 1 if x else 0)
all_data['HasEnclosedPorch'] = all_data['EnclosedPorch'].apply(lambda x: 1 if x else 0)
all_data['Has3SsnPorch'] = all_data['3SsnPorch'].apply(lambda x: 1 if x else 0)
all_data['HasScreenPorch'] = all_data['ScreenPorch'].apply(lambda x: 1 if x else 0)
all_data['HasPoolArea'] = all_data['PoolArea'].apply(lambda x: 1 if x else 0)
all_data['HasMiscVal'] = all_data['MiscVal'].apply(lambda x: 1 if x else 0)

all_data['TotalBath'] = all_data['BsmtFullBath'] + all_data['BsmtHalfBath'] * 0.5\
+ all_data['FullBath'] + all_data['HalfBath'] * 0.5
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
#all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['2ndFlrSF']
all_data['TotalBsmtSF'] = all_data['TotalBsmtSF'] + all_data['BsmtFinSF1'] \
+ all_data['BsmtFinSF2']

In [260]:
nominal_vars = list(set(nominal_vars) - set(['Utilities']))
#order_vars = list(set(order_vars) - set(['Utilities']))
#all_data.drop(['Utilities'], axis=1, inplace=True)

In [261]:
all_data[nominal_vars] = all_data[nominal_vars].astype(str)
all_data[order_vars] = all_data[order_vars].astype(str)

In [262]:
for c in order_vars:
    lbl = LabelEncoder() 
    lbl.fit(list(all_data[c].values)) 
    all_data[c] = lbl.transform(list(all_data[c].values))

In [263]:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head()

Unnamed: 0,Skew
MiscVal,21.939672
PoolArea,17.688664
HasPoolArea,15.494756
LotArea,13.109495
LowQualFinSF,12.084539


In [264]:
skewness = skewness[abs(skewness) > 0.75]

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], lam)

In [265]:
all_data = pd.get_dummies(all_data)

In [231]:
y_train = np.log1p(y_train)

In [266]:
X_train, X_test = all_data.iloc[:train_size, :], all_data.iloc[train_size:, :]

In [267]:
X_train.shape, X_test.shape, y_train.shape

((1458, 231), (1459, 231), (1458,))

In [234]:
def rmsle_cv(model):
    return np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring='neg_mean_squared_error',
                   cv=5, verbose=0, n_jobs=-1))

In [271]:
model_lasso = Pipeline([
    ('scaler', RobustScaler()),
    ('model', Lasso())
])
model_elasticNet = Pipeline([
    ('scaler', RobustScaler()),
    ('model', ElasticNet(max_iter=5000))
])
model_krr = Pipeline([
    ('scaler', RobustScaler()),
    ('model', krr())
])

model_svr = Pipeline([
    ('scaler', RobustScaler()),
    ('model', SVR())
])

grid_param_lasso = {
    'model__alpha': 0.0001 * np.arange(1, 100)
}
grid_param_elasticNet = {
    'model__alpha': 0.0001 * np.arange(1, 100),
    'model__l1_ratio': 0.001 * np.arange(1, 10)
}
grid_param_krr = {
    'model__alpha': [0.099, 0.6],
    'model__degree': [1, 2, 3],
    'model__kernel': ['polynomial'],
    'model__coef0': [2.5]
}
grid_param_svr = {
    'model__C': [0.001, 0.1, 1, 10, 20],
    'model__gamma': [.0001, .0002, .0003, .0004, .0005, .0006, .0007, .0008, .0009, .001],
    'model__epsilon': [.01, .02, .03, .04, .05, .06, .07, .08, .09, .1]
}

In [236]:
search_lasso = GridSearchCV(model_lasso, grid_param_lasso, scoring='neg_mean_squared_error',
                           cv=5, n_jobs=-1, verbose=0).fit(X_train, y_train)
search_lasso.best_params_

{'model__alpha': 0.0005}

In [269]:
search_elasticNet = GridSearchCV(model_elasticNet, grid_param_elasticNet, scoring='neg_mean_squared_error',
                           cv=5, n_jobs=-1, verbose=0).fit(X_train, y_train)
search_elasticNet.best_params_

{'model__alpha': 0.0089, 'model__l1_ratio': 0.009000000000000001}

In [273]:
search_krr = GridSearchCV(model_krr, grid_param_krr, scoring='neg_mean_squared_error',
                           cv=5, n_jobs=-1, verbose=0).fit(X_train, y_train)
search_krr.best_params_

{'model__alpha': 0.099,
 'model__coef0': 2.5,
 'model__degree': 1,
 'model__kernel': 'polynomial'}

In [22]:
# search_svr = GridSearchCV(model_svr, grid_param_svr, scoring='neg_mean_squared_error',
#                            cv=5, n_jobs=-1, verbose=0).fit(X_train, y_train)
# search_svr.best_params_

In [65]:
def objective_xgbr(trial, X, y):
    param = {
        'n_estimators': 2000,
        'max_depth': trial.suggest_int('max_depth', 3, 11),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.005, 0.01),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 100),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 100),
        'subsample': trial.suggest_categorical('subsample', list(np.arange(0.4, 1.1, 0.1))),
        'colsample_bylevel': trial.suggest_categorical('colsample_bylevel', list(np.arange(0.4, 1.1, 0.1))),
        'n_jobs': -1
    }
    model = XGBRegressor(**param)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    train_scores, test_scores = [], []
    for train_idx, test_idx in kf.split(X_train):
        X_tmp_train, X_tmp_test = X_train.iloc[train_idx, :], X_train.iloc[test_idx, :]
        y_tmp_train, y_tmp_test = y_train[train_idx], y_train[test_idx]
        model.fit(X_tmp_train, y_tmp_train,
                 eval_metric=['rmse'], eval_set=[(X_tmp_test, y_tmp_test)],
                 early_stopping_rounds=30, verbose=0,
                 callbacks=[optuna.integration.XGBoostPruningCallback(trial, observation_key='validation_0-rmse')])
        train_score = np.sqrt(mse(y_tmp_train, model.predict(X_tmp_train)))
        test_score = np.sqrt(mse(y_tmp_test, model.predict(X_tmp_test)))
        train_scores.append(train_score)
        test_scores.append(test_score)
    train_score = np.array(train_scores).mean()
    test_score = np.array(test_scores).mean()
    print(f'train score: {train_score}')
    print(f'test score: {test_score}')
    return test_score

In [66]:
optimizer = partial(objective_xgb, X=X_train, y=y_train)
study = optuna.create_study(direction='minimize')
study.optimize(optimizer, n_trials=100)

[32m[I 2021-10-23 12:55:03,377][0m A new study created in memory with name: no-name-81d582f2-4e61-44a4-a0e8-b39b2fb73c17[0m
[32m[I 2021-10-23 12:56:08,923][0m Trial 0 finished with value: 0.13803421666146676 and parameters: {'max_depth': 10, 'learning_rate': 0.00955985055896822, 'subsample': 1.0, 'colsample_bylevel': 0.9, 'reg_alpha': 0.008905301379345233, 'reg_lambda': 79.12436856939524}. Best is trial 0 with value: 0.13803421666146676.[0m


train score: 0.04338052970682678
test score: 0.13803421666146676


[32m[I 2021-10-23 12:56:39,018][0m Trial 1 finished with value: 0.1420461088893113 and parameters: {'max_depth': 5, 'learning_rate': 0.005354345711295748, 'subsample': 0.5, 'colsample_bylevel': 0.6, 'reg_alpha': 5.5691097262869, 'reg_lambda': 0.008930129011477737}. Best is trial 0 with value: 0.13803421666146676.[0m


train score: 0.11362014884539902
test score: 0.1420461088893113


[32m[I 2021-10-23 12:57:18,208][0m Trial 2 finished with value: 0.12704724010907878 and parameters: {'max_depth': 7, 'learning_rate': 0.005052323955634718, 'subsample': 1.0, 'colsample_bylevel': 0.7, 'reg_alpha': 1.1805595018641346, 'reg_lambda': 0.00442156729278353}. Best is trial 2 with value: 0.12704724010907878.[0m


train score: 0.06593972708805362
test score: 0.12704724010907878


[32m[I 2021-10-23 12:57:56,928][0m Trial 3 finished with value: 0.12399574742319645 and parameters: {'max_depth': 6, 'learning_rate': 0.00850691312237557, 'subsample': 1.0, 'colsample_bylevel': 1.0, 'reg_alpha': 0.6043872358145256, 'reg_lambda': 3.2833098127885654}. Best is trial 3 with value: 0.12399574742319645.[0m


train score: 0.047155791908205624
test score: 0.12399574742319645


[32m[I 2021-10-23 12:58:32,406][0m Trial 4 finished with value: 0.11861348284067239 and parameters: {'max_depth': 9, 'learning_rate': 0.009815924323494565, 'subsample': 0.7, 'colsample_bylevel': 0.4, 'reg_alpha': 0.009267682438574731, 'reg_lambda': 4.600204686272811}. Best is trial 4 with value: 0.11861348284067239.[0m


train score: 0.02005353298112104
test score: 0.11861348284067239


[32m[I 2021-10-23 12:58:33,687][0m Trial 5 pruned. Trial was pruned at iteration 401.[0m
[32m[I 2021-10-23 12:58:33,712][0m Trial 6 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 12:58:34,208][0m Trial 7 pruned. Trial was pruned at iteration 244.[0m
[32m[I 2021-10-23 12:58:34,233][0m Trial 8 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 12:58:34,257][0m Trial 9 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 12:58:34,293][0m Trial 10 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 12:59:25,665][0m Trial 11 finished with value: 0.120178710220287 and parameters: {'max_depth': 8, 'learning_rate': 0.00864632300514989, 'subsample': 0.7, 'colsample_bylevel': 1.0, 'reg_alpha': 0.10299844816037101, 'reg_lambda': 2.4324705598187126}. Best is trial 4 with value: 0.11861348284067239.[0m


train score: 0.027001199915064538
test score: 0.120178710220287


[32m[I 2021-10-23 13:00:18,072][0m Trial 12 finished with value: 0.12076332042643692 and parameters: {'max_depth': 9, 'learning_rate': 0.00932784848655223, 'subsample': 0.7, 'colsample_bylevel': 1.0, 'reg_alpha': 0.06399492780688731, 'reg_lambda': 2.02359552553199}. Best is trial 4 with value: 0.11861348284067239.[0m
[32m[I 2021-10-23 13:00:18,109][0m Trial 13 pruned. Trial was pruned at iteration 0.[0m


train score: 0.01919198868786914
test score: 0.12076332042643692


[32m[I 2021-10-23 13:01:06,101][0m Trial 14 finished with value: 0.11846238021485767 and parameters: {'max_depth': 11, 'learning_rate': 0.009175581112633203, 'subsample': 0.7, 'colsample_bylevel': 0.9, 'reg_alpha': 0.029011232600079363, 'reg_lambda': 0.2989799146589174}. Best is trial 14 with value: 0.11846238021485767.[0m


train score: 0.012732228301952966
test score: 0.11846238021485767


[32m[I 2021-10-23 13:01:44,447][0m Trial 15 finished with value: 0.11616624737159391 and parameters: {'max_depth': 11, 'learning_rate': 0.009953903157836036, 'subsample': 0.5, 'colsample_bylevel': 0.9, 'reg_alpha': 0.0077930853733423015, 'reg_lambda': 0.28607076422543243}. Best is trial 15 with value: 0.11616624737159391.[0m


train score: 0.02069706128990351
test score: 0.11616624737159391


[32m[I 2021-10-23 13:02:26,752][0m Trial 16 finished with value: 0.11566270145070348 and parameters: {'max_depth': 11, 'learning_rate': 0.00926365717222274, 'subsample': 0.5, 'colsample_bylevel': 0.9, 'reg_alpha': 0.0046202001099162485, 'reg_lambda': 0.12586533182920503}. Best is trial 16 with value: 0.11566270145070348.[0m
[32m[I 2021-10-23 13:02:26,792][0m Trial 17 pruned. Trial was pruned at iteration 0.[0m


train score: 0.018017713265059195
test score: 0.11566270145070348


[32m[I 2021-10-23 13:03:10,865][0m Trial 18 finished with value: 0.11583732683950947 and parameters: {'max_depth': 11, 'learning_rate': 0.009211124210874944, 'subsample': 0.5, 'colsample_bylevel': 0.9, 'reg_alpha': 0.005662385004601676, 'reg_lambda': 0.12593410613221206}. Best is trial 16 with value: 0.11566270145070348.[0m
[32m[I 2021-10-23 13:03:10,905][0m Trial 19 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 13:03:10,939][0m Trial 20 pruned. Trial was pruned at iteration 0.[0m


train score: 0.017238633612781183
test score: 0.11583732683950947


[32m[I 2021-10-23 13:03:55,485][0m Trial 21 finished with value: 0.11676413618648432 and parameters: {'max_depth': 11, 'learning_rate': 0.009974587879836119, 'subsample': 0.5, 'colsample_bylevel': 0.9, 'reg_alpha': 0.013217605075951103, 'reg_lambda': 0.6337986175828852}. Best is trial 16 with value: 0.11566270145070348.[0m
[32m[I 2021-10-23 13:03:55,523][0m Trial 22 pruned. Trial was pruned at iteration 0.[0m


train score: 0.018338052043330087
test score: 0.11676413618648432


[32m[I 2021-10-23 13:04:40,449][0m Trial 23 finished with value: 0.11592544409055665 and parameters: {'max_depth': 10, 'learning_rate': 0.009448757356188223, 'subsample': 0.5, 'colsample_bylevel': 0.9, 'reg_alpha': 0.003073743997623817, 'reg_lambda': 0.2546705688282807}. Best is trial 16 with value: 0.11566270145070348.[0m
[32m[I 2021-10-23 13:04:40,485][0m Trial 24 pruned. Trial was pruned at iteration 0.[0m


train score: 0.01761194268476953
test score: 0.11592544409055665


[32m[I 2021-10-23 13:05:11,525][0m Trial 25 finished with value: 0.11763143628882464 and parameters: {'max_depth': 8, 'learning_rate': 0.009497475435250575, 'subsample': 0.8, 'colsample_bylevel': 0.7, 'reg_alpha': 0.02008060382035321, 'reg_lambda': 0.11499448301766028}. Best is trial 16 with value: 0.11566270145070348.[0m
[32m[I 2021-10-23 13:05:11,560][0m Trial 26 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 13:05:11,594][0m Trial 27 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 13:05:11,629][0m Trial 28 pruned. Trial was pruned at iteration 0.[0m


train score: 0.02011656436392839
test score: 0.11763143628882464


[32m[I 2021-10-23 13:05:47,960][0m Trial 29 finished with value: 0.11601489239941891 and parameters: {'max_depth': 10, 'learning_rate': 0.009608606254331837, 'subsample': 0.5, 'colsample_bylevel': 0.9, 'reg_alpha': 0.005778125135294363, 'reg_lambda': 0.06440800709775349}. Best is trial 16 with value: 0.11566270145070348.[0m


train score: 0.021406110386838456
test score: 0.11601489239941891


[32m[I 2021-10-23 13:06:25,757][0m Trial 30 finished with value: 0.11609164977862665 and parameters: {'max_depth': 9, 'learning_rate': 0.009492190561702835, 'subsample': 0.5, 'colsample_bylevel': 0.9, 'reg_alpha': 0.0025455266693623006, 'reg_lambda': 0.013699433814096277}. Best is trial 16 with value: 0.11566270145070348.[0m


train score: 0.01955807311247297
test score: 0.11609164977862665


[32m[I 2021-10-23 13:07:04,718][0m Trial 31 finished with value: 0.11583795527356693 and parameters: {'max_depth': 10, 'learning_rate': 0.009612454075327726, 'subsample': 0.5, 'colsample_bylevel': 0.9, 'reg_alpha': 0.006525881844068064, 'reg_lambda': 0.08968432681668093}. Best is trial 16 with value: 0.11566270145070348.[0m


train score: 0.022006304517148446
test score: 0.11583795527356693


[32m[I 2021-10-23 13:07:35,896][0m Trial 32 finished with value: 0.11486099803932044 and parameters: {'max_depth': 10, 'learning_rate': 0.0095482092318586, 'subsample': 0.5, 'colsample_bylevel': 0.6, 'reg_alpha': 0.01114656157470384, 'reg_lambda': 0.006014555451368543}. Best is trial 32 with value: 0.11486099803932044.[0m
[32m[I 2021-10-23 13:07:35,933][0m Trial 33 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 13:07:35,972][0m Trial 34 pruned. Trial was pruned at iteration 0.[0m


train score: 0.019268822229596635
test score: 0.11486099803932044


[32m[I 2021-10-23 13:07:37,410][0m Trial 35 pruned. Trial was pruned at iteration 442.[0m
[32m[I 2021-10-23 13:07:37,446][0m Trial 36 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 13:07:37,482][0m Trial 37 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 13:07:40,918][0m Trial 38 pruned. Trial was pruned at iteration 565.[0m
[32m[I 2021-10-23 13:07:40,960][0m Trial 39 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 13:07:43,060][0m Trial 40 pruned. Trial was pruned at iteration 563.[0m
[32m[I 2021-10-23 13:07:46,750][0m Trial 41 pruned. Trial was pruned at iteration 617.[0m
[32m[I 2021-10-23 13:07:46,788][0m Trial 42 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 13:08:25,577][0m Trial 43 finished with value: 0.11627036256409484 and parameters: {'max_depth': 10, 'learning_rate': 0.009794043831424244, 'subsample': 0.5, 'colsample_bylevel': 0.9, 'reg_alpha': 0.004509855314291214, 'reg_lambda': 0.399048624

train score: 0.022815702637458805
test score: 0.11627036256409484


[32m[I 2021-10-23 13:08:28,398][0m Trial 48 pruned. Trial was pruned at iteration 606.[0m
[32m[I 2021-10-23 13:08:28,434][0m Trial 49 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 13:08:28,473][0m Trial 50 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 13:09:04,416][0m Trial 51 finished with value: 0.11586458644381467 and parameters: {'max_depth': 10, 'learning_rate': 0.009716481407362526, 'subsample': 0.5, 'colsample_bylevel': 0.9, 'reg_alpha': 0.006542204188021846, 'reg_lambda': 0.07648564137344181}. Best is trial 32 with value: 0.11486099803932044.[0m


train score: 0.022341281171459063
test score: 0.11586458644381467


[32m[I 2021-10-23 13:09:14,761][0m Trial 52 pruned. Trial was pruned at iteration 849.[0m
[32m[I 2021-10-23 13:09:51,642][0m Trial 53 finished with value: 0.11559133814236615 and parameters: {'max_depth': 10, 'learning_rate': 0.009605337237880585, 'subsample': 0.5, 'colsample_bylevel': 0.9, 'reg_alpha': 0.0054634651398526675, 'reg_lambda': 0.1781410313644347}. Best is trial 32 with value: 0.11486099803932044.[0m
[32m[I 2021-10-23 13:09:51,680][0m Trial 54 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 13:09:51,721][0m Trial 55 pruned. Trial was pruned at iteration 0.[0m


train score: 0.022843404182477198
test score: 0.11559133814236615


[32m[I 2021-10-23 13:09:55,412][0m Trial 56 pruned. Trial was pruned at iteration 576.[0m
[32m[I 2021-10-23 13:09:57,007][0m Trial 57 pruned. Trial was pruned at iteration 501.[0m
[32m[I 2021-10-23 13:09:57,044][0m Trial 58 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 13:09:57,080][0m Trial 59 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 13:09:58,308][0m Trial 60 pruned. Trial was pruned at iteration 316.[0m
[32m[I 2021-10-23 13:09:58,345][0m Trial 61 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 13:10:03,364][0m Trial 62 pruned. Trial was pruned at iteration 790.[0m
[32m[I 2021-10-23 13:10:03,404][0m Trial 63 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 13:10:03,442][0m Trial 64 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 13:10:33,194][0m Trial 65 finished with value: 0.11465485263591675 and parameters: {'max_depth': 11, 'learning_rate': 0.009996194464200983, 'subsample': 

train score: 0.02049986729918013
test score: 0.11465485263591675


[32m[I 2021-10-23 13:11:05,043][0m Trial 66 finished with value: 0.11408322831476077 and parameters: {'max_depth': 11, 'learning_rate': 0.00996702130903663, 'subsample': 0.5, 'colsample_bylevel': 0.6, 'reg_alpha': 0.023958271624403407, 'reg_lambda': 0.028567741855150673}. Best is trial 66 with value: 0.11408322831476077.[0m


train score: 0.02021222565857873
test score: 0.11408322831476077


[32m[I 2021-10-23 13:11:37,348][0m Trial 67 finished with value: 0.11420035916308809 and parameters: {'max_depth': 11, 'learning_rate': 0.009967623643377955, 'subsample': 0.5, 'colsample_bylevel': 0.6, 'reg_alpha': 0.024959225607427808, 'reg_lambda': 0.025815405990824852}. Best is trial 66 with value: 0.11408322831476077.[0m


train score: 0.019705780512696596
test score: 0.11420035916308809


[32m[I 2021-10-23 13:12:08,162][0m Trial 68 finished with value: 0.11450529209667044 and parameters: {'max_depth': 11, 'learning_rate': 0.009967320428965063, 'subsample': 0.5, 'colsample_bylevel': 0.6, 'reg_alpha': 0.025869863491263786, 'reg_lambda': 0.02438255333928065}. Best is trial 66 with value: 0.11408322831476077.[0m


train score: 0.02155576000016006
test score: 0.11450529209667044


[32m[I 2021-10-23 13:12:16,443][0m Trial 69 pruned. Trial was pruned at iteration 810.[0m
[32m[I 2021-10-23 13:12:20,086][0m Trial 70 pruned. Trial was pruned at iteration 678.[0m
[32m[I 2021-10-23 13:12:52,513][0m Trial 71 finished with value: 0.11415175838592491 and parameters: {'max_depth': 11, 'learning_rate': 0.009879934720583093, 'subsample': 0.5, 'colsample_bylevel': 0.6, 'reg_alpha': 0.01628694534663974, 'reg_lambda': 0.018978829378970115}. Best is trial 66 with value: 0.11408322831476077.[0m


train score: 0.01941071173017373
test score: 0.11415175838592491


[32m[I 2021-10-23 13:13:24,971][0m Trial 72 finished with value: 0.11437451384790479 and parameters: {'max_depth': 11, 'learning_rate': 0.009997644263235817, 'subsample': 0.5, 'colsample_bylevel': 0.6, 'reg_alpha': 0.017583699982113666, 'reg_lambda': 0.022737675401273295}. Best is trial 66 with value: 0.11408322831476077.[0m


train score: 0.01905618917774498
test score: 0.11437451384790479


[32m[I 2021-10-23 13:13:29,635][0m Trial 73 pruned. Trial was pruned at iteration 810.[0m
[32m[I 2021-10-23 13:13:59,591][0m Trial 74 finished with value: 0.11435486442945056 and parameters: {'max_depth': 11, 'learning_rate': 0.00997214670909868, 'subsample': 0.5, 'colsample_bylevel': 0.6, 'reg_alpha': 0.0375475167141632, 'reg_lambda': 0.020095615750628407}. Best is trial 66 with value: 0.11408322831476077.[0m


train score: 0.024669710347406396
test score: 0.11435486442945056


[32m[I 2021-10-23 13:14:08,553][0m Trial 75 pruned. Trial was pruned at iteration 810.[0m
[32m[I 2021-10-23 13:14:08,591][0m Trial 76 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 13:14:37,817][0m Trial 77 finished with value: 0.11435769930222162 and parameters: {'max_depth': 11, 'learning_rate': 0.00986297689368371, 'subsample': 0.4, 'colsample_bylevel': 0.6, 'reg_alpha': 0.032307709611681605, 'reg_lambda': 0.013008356213110226}. Best is trial 66 with value: 0.11408322831476077.[0m
[32m[I 2021-10-23 13:14:37,857][0m Trial 78 pruned. Trial was pruned at iteration 0.[0m


train score: 0.026850474245924398
test score: 0.11435769930222162


[32m[I 2021-10-23 13:15:07,209][0m Trial 79 finished with value: 0.11408651374567329 and parameters: {'max_depth': 11, 'learning_rate': 0.009995822636002467, 'subsample': 0.4, 'colsample_bylevel': 0.6, 'reg_alpha': 0.02881482762893963, 'reg_lambda': 0.02341850822366798}. Best is trial 66 with value: 0.11408322831476077.[0m
[32m[I 2021-10-23 13:15:07,250][0m Trial 80 pruned. Trial was pruned at iteration 0.[0m


train score: 0.025957591200249104
test score: 0.11408651374567329


[32m[I 2021-10-23 13:15:11,446][0m Trial 81 pruned. Trial was pruned at iteration 797.[0m
[32m[I 2021-10-23 13:15:14,790][0m Trial 82 pruned. Trial was pruned at iteration 671.[0m
[32m[I 2021-10-23 13:15:18,792][0m Trial 83 pruned. Trial was pruned at iteration 797.[0m
[32m[I 2021-10-23 13:15:22,794][0m Trial 84 pruned. Trial was pruned at iteration 797.[0m
[32m[I 2021-10-23 13:15:25,327][0m Trial 85 pruned. Trial was pruned at iteration 523.[0m
[32m[I 2021-10-23 13:15:25,368][0m Trial 86 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 13:15:29,497][0m Trial 87 pruned. Trial was pruned at iteration 730.[0m
[32m[I 2021-10-23 13:15:29,537][0m Trial 88 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 13:15:33,756][0m Trial 89 pruned. Trial was pruned at iteration 797.[0m
[32m[I 2021-10-23 13:15:37,948][0m Trial 90 pruned. Trial was pruned at iteration 730.[0m
[32m[I 2021-10-23 13:15:37,993][0m Trial 91 pruned. Trial was pruned at 

train score: 0.023101396183393054
test score: 0.11407255681523465


[32m[I 2021-10-23 13:16:42,052][0m Trial 94 finished with value: 0.11464162714010018 and parameters: {'max_depth': 11, 'learning_rate': 0.009988688919641664, 'subsample': 0.5, 'colsample_bylevel': 0.6, 'reg_alpha': 0.0476388298304726, 'reg_lambda': 0.020137778628964303}. Best is trial 93 with value: 0.11407255681523465.[0m


train score: 0.024364396670287482
test score: 0.11464162714010018


[32m[I 2021-10-23 13:16:46,440][0m Trial 95 pruned. Trial was pruned at iteration 790.[0m
[32m[I 2021-10-23 13:16:46,487][0m Trial 96 pruned. Trial was pruned at iteration 1.[0m
[32m[I 2021-10-23 13:16:50,589][0m Trial 97 pruned. Trial was pruned at iteration 790.[0m
[32m[I 2021-10-23 13:16:50,635][0m Trial 98 pruned. Trial was pruned at iteration 0.[0m
[32m[I 2021-10-23 13:16:54,086][0m Trial 99 pruned. Trial was pruned at iteration 688.[0m


In [25]:
print(a)

NameError: name 'a' is not defined

In [67]:
study.best_params

{'max_depth': 11,
 'learning_rate': 0.009997615556977911,
 'subsample': 0.5,
 'colsample_bylevel': 0.6,
 'reg_alpha': 0.034393897207617534,
 'reg_lambda': 0.019925604450093764}

In [103]:
model_lasso = Pipeline([
    ('scaler', RobustScaler()),
    ('model', Lasso(alpha=0.0005))
])
model_elasticNet = Pipeline([
    ('scaler', RobustScaler()),
    ('model', ElasticNet(alpha=0.0091, l1_ratio=0.009000000000000001))
])
model_krr = Pipeline([
    ('scaler', RobustScaler()),
    ('model', krr(alpha=0.0099,
                        kernel='polynomial',
                        degree=1,
                        coef0=2.5))
])
model_svr = Pipeline([
    ('scaler', RobustScaler()),
    ('model', SVR(C=20,gamma=0.0003, epsilon=0.02))
])
model_xgbr = XGBRegressor(colsample_bytree=0.4, learning_rate=0.00898718134841855, max_depth=8, 
                             n_estimators=2200, reg_alpha=0.036142628805195254, reg_lambda=0.03188665185506858,
                             subsample=0.6, random_state =42)
model_gbr = gbr(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state=5)
model_lgbm = LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)
stack_gen = StackingCVRegressor(regressors=(model_lgbm, model_lasso, model_elasticNet, model_krr, model_gbr),
                               meta_regressor=model_xgbr,
                               use_features_in_secondary=True)

In [104]:
models = [
    model_lasso, model_elasticNet, model_krr, model_svr, model_gbr, model_xgbr, model_lgbm
]
cross_score = {
    'Lasso': 0,
    'ElasticNet': 0,
    'Kernel Ridge': 0,
    'SVR': 0,
    'GradientBoosting': 0,
    'XGBoost': 0,
    'LightGBM': 0,
}

for idx, model in enumerate(models):
    cross_score[list(cross_score.keys())[idx]] = rmsle_cv(model).mean()

In [105]:
for model in models:
    model = model.fit(X_train, y_train)



In [106]:
stack_gen = stack_gen.fit(X_train, y_train)



In [107]:
cross_score

{'Lasso': 0.1115607527360313,
 'ElasticNet': 0.1133347949034966,
 'Kernel Ridge': 0.1158427963943615,
 'SVR': 0.11274480912575471,
 'GradientBoosting': 0.11638980595469176,
 'XGBoost': 0.11368213481477489,
 'LightGBM': 0.11457622325507826}

In [108]:
def blend(X):
    return ((0.10 * model_lasso.predict(X)) + \
            (0.10 * model_elasticNet.predict(X)) + \
            (0.10 * model_krr.predict(X)) + \
            (0.10 * model_svr.predict(X)) + \
            (0.10 * model_xgbr.predict(X)) + \
            (0.10 * model_lgbm.predict(X)) + \
            (0.40 * stack_gen.predict(np.array(X))))

In [109]:
np.sqrt(mse(y_train, blend(X_train)))

0.046939510458975015

In [110]:
sub = pd.DataFrame()
sub['Id'] = test_id
sub['SalePrice'] = score = np.expm1(blend(X_test))
sub.to_csv('submission.csv',index=False)