In [1]:
import sys
sys.path.append('../')
import pandas as pd


from utils.GetData import DataPrep
import numpy as np
from sklearn.model_selection import train_test_split


from datetime import datetime

from scipy.stats import skew  # for some statistics
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

from mlxtend.regressor import StackingCVRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Location of the training table, relative to the notebook directory.
PATH = '../in/train.csv'
df = pd.read_csv(PATH)

# Split the target off the feature frame: `pop` returns the column and
# removes it from `df` in the same step.
y = df.pop('SalePrice')

In [43]:
# DataPrep is a project helper (utils.GetData); its transformations are not
# visible in this notebook. The same instance is reused later for the test
# set with fit=False, so it presumably keeps fitted state — TODO confirm.
data = DataPrep()

# get_data receives a copy so the raw `df` stays untouched; it returns the
# prepared features and a (possibly transformed) target, which replaces `y`.
prepared_train = data.get_data(df.copy(), y)
X, y = prepared_train

In [4]:
# Sample submission file; only its `Id` column is used later, when the
# submission frame is assembled.
pred_example = pd.read_csv(
    '../in/sample_submission.csv'
)

In [5]:
# Raw test table, read from the same input directory as the training data.
test = pd.read_csv(
    '../in/test.csv'
)

# Run the test rows through the DataPrep instance created for the training
# data. fit=False presumably reuses the already-fitted state — TODO confirm
# in utils.GetData. The second return value is not needed here.
X_test, _ = data.get_data(
    test.copy(),
    fit=False,
)

In [6]:
# Hold out 15% of the prepared rows with a fixed seed.
# NOTE(review): none of the visible cells below reference X_train / X_valid /
# y_train / y_valid — cross-validation and the final fits use the full X, y.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=7,
)

In [7]:
# Candidate regularisation strengths handed to RidgeCV.
alphas_alt = [
    14.5, 14.6, 14.7, 14.8, 14.9,
    15, 15.1, 15.2, 15.3, 15.4, 15.5,
]

# Candidate alphas handed to LassoCV.
alphas2 = [
    5e-05, 0.0001, 0.0002, 0.0003, 0.0004,
    0.0005, 0.0006, 0.0007, 0.0008,
]

# Candidate alphas and L1 ratios handed to ElasticNetCV.
e_alphas = [
    0.0001, 0.0002, 0.0003, 0.0004,
    0.0005, 0.0006, 0.0007,
]
e_l1ratio = [
    0.8, 0.85, 0.9,
    0.95, 0.99, 1,
]

In [8]:
# Shared 10-fold splitter (shuffled, fixed seed). It is used both by the
# *CV linear models for their internal alpha search and by cv_rmse.
kfolds = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [9]:
def rmsle(y, y_pred):
    """Return the square root of the mean squared error of `y_pred` vs `y`.

    No log transform is applied inside this function, so the value is a
    root-mean-squared *log* error only when both inputs are already on a
    log scale.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)


# Model scoring helper: per-fold cross-validated RMSE.
def cv_rmse(model, X=None):
    """Return the per-fold RMSE of `model` under the shared `kfolds` splitter.

    Parameters
    ----------
    model : estimator
        Passed unchanged to ``cross_val_score``.
    X : optional
        Feature matrix. When omitted, the notebook-level ``X`` is looked up
        at *call* time. A definition-time default (``X=X``) would keep
        pointing at the old feature matrix after the data-prep cell is
        re-run, while ``y`` below is always the current global.

    Returns
    -------
    Array with one RMSE value per fold, i.e. the square root of the negated
    ``neg_mean_squared_error`` scores.

    Notes
    -----
    The target ``y`` and the splitter ``kfolds`` are read from the notebook
    namespace when the function is called.
    """
    # Late-bind the default so features and target always come from the
    # same state of the notebook.
    features = globals()["X"] if X is None else X
    neg_mse = cross_val_score(model, features, y,
                              scoring="neg_mean_squared_error",
                              cv=kfolds)
    return np.sqrt(-neg_mse)

In [33]:
# Regularised linear models. Each *CV estimator picks its own alpha (and, for
# ElasticNet, l1_ratio) from the grids defined above, using the shared
# `kfolds` splitter for the internal search.
ridge = RidgeCV(alphas=alphas_alt, cv=kfolds)

# max_iter must be an integer: scikit-learn documents it as int, and releases
# with parameter validation reject the float literal 1e7.
lasso = LassoCV(max_iter=int(1e7), alphas=alphas2,
                random_state=42, cv=kfolds)

elasticnet = ElasticNetCV(max_iter=int(1e7), alphas=e_alphas,
                          cv=kfolds, l1_ratio=e_l1ratio)
                                        
# Support vector regressor with hand-picked hyper-parameters.
# NOTE(review): RobustScaler and make_pipeline are imported but unused; SVR is
# applied to X as delivered by DataPrep — confirm that scaling happens there.
svr = SVR(
    C=20,
    epsilon=0.008,
    gamma=0.0003,
)


# Gradient boosting with Huber loss and a fixed seed.
gbr = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=42,
)
                                   

# LightGBM regressor: many small trees (num_leaves=4) with a low learning
# rate, row bagging and feature sub-sampling, both seeded.
# min_data_in_leaf=2 and min_sum_hessian_in_leaf=11 were tried in an earlier
# version and are intentionally left out.
lightgbm_params = dict(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=5000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    verbose=-1,
)
lightgbm = LGBMRegressor(**lightgbm_params)
                                       

# XGBoost regressor: shallow trees, low learning rate, row and column
# sub-sampling. This same estimator object is also passed as the
# meta-regressor of the stacking ensemble.
# NOTE(review): `nthread` and `seed` look like older spellings of
# `n_jobs` / `random_state` — confirm against the installed XGBoost version.
xgboost_params = dict(
    learning_rate=0.01,
    n_estimators=3460,
    max_depth=3,
    min_child_weight=0,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:squarederror',
    nthread=-1,
    scale_pos_weight=1,
    seed=27,
    reg_alpha=0.00006,
)
xgboost = XGBRegressor(**xgboost_params)

In [34]:
# Stacked ensemble: six base regressors feed an XGBoost meta-regressor, which
# also sees the original features (use_features_in_secondary=True).
# SVR is not among the base regressors; it only enters through the final blend.
# random_state pins the fold shuffling used to build the out-of-fold
# meta-features, so the stack is reproducible like the seeded models above.
stack_gen = StackingCVRegressor(regressors=(ridge, lasso, elasticnet,
                                            gbr, xgboost, lightgbm),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True,
                                random_state=42)

In [16]:
# Cross-validated RMSE (mean and std over the folds) for each model, printed
# with a timestamp so the run time of every evaluation is visible.
print('TEST score on CV')

# (label, estimator) pairs, evaluated in this order.
# NOTE(review): the first label says "Kernel Ridge" although the estimator is
# RidgeCV; it is kept as-is so the printed text matches the recorded output.
cv_candidates = (
    ("Kernel Ridge", ridge),
    ("Lasso", lasso),
    ("ElasticNet", elasticnet),
    ("SVR", svr),
    ("Xgboost", xgboost),
)

for label, model in cv_candidates:
    score = cv_rmse(model)
    print("{} score: {:.4f} , std: {:.4f}\n".format(label, score.mean(), score.std()), datetime.now())

TEST score on CV
Kernel Ridge score: 0.1276 , std: 0.0211
 2021-12-24 14:39:04.027111
Lasso score: 0.1243 , std: 0.0228
 2021-12-24 14:39:12.942011
ElasticNet score: 0.1242 , std: 0.0228
 2021-12-24 14:40:18.955918
SVR score: 0.2432 , std: 0.0237
 2021-12-24 14:40:33.684373
Xgboost score: 0.1165 , std: 0.0204
 2021-12-24 14:45:27.601654


In [35]:
# Cross-validated RMSE of the LightGBM model, reported with a timestamp.
score = cv_rmse(lightgbm)
lightgbm_summary = "Lightgbm score: {:.4f} , std: {:.4f}\n".format(
    score.mean(),
    score.std(),
)
print(lightgbm_summary, datetime.now())

Lightgbm score: 0.1208 , std: 0.0205
 2021-12-24 15:25:06.101416


In [25]:
# Cross-validated RMSE of the gradient boosting model, reported with a timestamp.
score = cv_rmse(gbr)
gbr_summary = "GradientBoosting score: {:.4f} , std: {:.4f}\n".format(
    score.mean(),
    score.std(),
)
print(gbr_summary, datetime.now())

GradientBoosting score: 0.1204 , std: 0.0231
 2021-12-24 15:22:37.949200


In [36]:
def _fit_with_timestamp(label, estimator, as_array=False):
    """Print the current time and `label`, then fit `estimator` on the full data.

    The notebook-level X and y are used. With as_array=True both are wrapped
    in np.array before being handed to `fit`. Returns whatever `fit` returns.
    """
    print(datetime.now(), label)
    if as_array:
        return estimator.fit(np.array(X), np.array(y))
    return estimator.fit(X, y)


# Fit the stack and every individual model on the complete training set.
# The order is unchanged: stack first, then the single models.
print('START Fit')
stack_gen_model = _fit_with_timestamp('StackingCVRegressor', stack_gen, as_array=True)
elastic_model_full_data = _fit_with_timestamp('elasticnet', elasticnet)
lasso_model_full_data = _fit_with_timestamp('lasso', lasso)
ridge_model_full_data = _fit_with_timestamp('ridge', ridge)
svr_model_full_data = _fit_with_timestamp('svr', svr)
gbr_model_full_data = _fit_with_timestamp('GradientBoosting', gbr)
xgb_model_full_data = _fit_with_timestamp('xgboost', xgboost)
lgb_model_full_data = _fit_with_timestamp('lightgbm', lightgbm)

START Fit
2021-12-24 15:26:48.401359 StackingCVRegressor
2021-12-24 15:33:20.345785 elasticnet
2021-12-24 15:33:29.179359 lasso
2021-12-24 15:33:29.793178 ridge
2021-12-24 15:33:32.601879 svr
2021-12-24 15:33:34.427749 GradientBoosting
2021-12-24 15:34:00.309700 xgboost
2021-12-24 15:34:23.313437 lightgbm


In [37]:
def blend_models_predict(X):
    return ((0.1 * elastic_model_full_data.predict(X)) + \
            (0.1 * lasso_model_full_data.predict(X)) + \
            (0.1 * ridge_model_full_data.predict(X)) + \
            (0.1 * svr_model_full_data.predict(X)) + \
            (0.1 * gbr_model_full_data.predict(X)) + \
            (0.15 * xgb_model_full_data.predict(X)) + \
            (0.1 * lgb_model_full_data.predict(X)) + \
            (0.25 * stack_gen_model.predict(np.array(X))))

In [38]:
# Error of the blended model on the data it was fitted on (not a held-out
# estimate).
print('RMSLE score on train data:')
train_blend_score = rmsle(y, blend_models_predict(X))
print(train_blend_score)

RMSLE score on train data:
0.06070909660050297


In [46]:
# Blend the test predictions, map them through expm1 and round down.
# presumably the target was log1p-transformed inside DataPrep — TODO confirm.
blended_test_pred = blend_models_predict(X_test)
y_predict = np.floor(np.expm1(blended_test_pred))

# Ids come from the sample submission, predictions are matched by position.
sub = pd.DataFrame({
    'Id': pred_example.Id,
    'SalePrice': y_predict,
})
sub.to_csv('../out/mysubmission7.csv', index=False)

# Index by Id only after writing, so the CSV keeps Id as a regular column.
sub = sub.set_index('Id')