Kaggle competition House Prices - Advanced Regression Techniques 
https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques

Public score: 0.11913.

This notebook is adapted from SERIGNE's "Stacked Regressions : Top 4% on LeaderBoard": https://www.kaggle.com/code/serigne/stacked-regressions-top-4-on-leaderboard

I borrowed how to drop highly correlated variables from ERIK BRUIN's House prices: Lasso, XGBoost, and a detailed EDA https://www.kaggle.com/code/erikbruin/house-prices-lasso-xgboost-and-a-detailed-eda.

I also added a ratio feature following Kaggle Learn Course Feature Engineering https://www.kaggle.com/learn/feature-engineering.

Finally I blended models following NANASHI's #1 House Prices Solution [top 1%] https://www.kaggle.com/code/jesucristo/1-house-prices-solution-top-1#New-blending

Happy Kaggling!

In [None]:
#import some necessary librairies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points
%matplotlib inline
color = sns.color_palette()
sns.set_style('darkgrid')

from scipy import stats
from scipy.stats import norm, skew

from sklearn.preprocessing import LabelEncoder
# from category_encoders.ordinal import OrdinalEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor 
from sklearn.ensemble import StackingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

import xgboost as xgb
import lightgbm as lgb

import warnings

# Silence library warnings through the supported filter API. The previous
# approach replaced warnings.warn with a no-op, which globally changes a
# standard-library function for every other piece of code in the kernel.
warnings.filterwarnings("ignore")


# Step 1 : Load the data.

In [None]:
# Read the competition's train and test CSV files into pandas DataFrames.
train, test = (
    pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv'),
    pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv'),
)

In [None]:
# Preview the train dataset (head() shows the first five rows by default).
train.head()

In [None]:
# Preview the test dataset (head() shows the first five rows by default).
test.head()

In [None]:
# Report the number of samples and features while 'Id' is still present.
print("The train data size before dropping Id feature is : {} ".format(train.shape))
print("The test data size before dropping Id feature is : {} ".format(test.shape))

# pop() hands back the 'Id' column and removes it from the frame in one step.
# The ids are not used as a model feature; test_ID is needed again when the
# submission file is written.
train_ID = train.pop('Id')
test_ID = test.pop('Id')

# Report the shapes again now that 'Id' is gone.
print("\nThe train data size after dropping Id feature is : {} ".format(train.shape))
print("The test data size after dropping Id feature is : {} ".format(test.shape))

# Step 2: Remove the outliers.

In [None]:
# Scatter of GrLivArea against SalePrice, used to look for outliers.
fig, ax = plt.subplots()
ax.scatter(x=train['GrLivArea'], y=train['SalePrice'])
ax.set_ylabel('SalePrice', fontsize=13)
ax.set_xlabel('GrLivArea', fontsize=13)
plt.show()

In [None]:
# Remove the rows with GrLivArea above 4000 but SalePrice below 300000.
is_outlier = (train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)
train = train.drop(train[is_outlier].index)

# Re-draw the scatter to confirm the points are gone.
fig, ax = plt.subplots()
ax.scatter(train['GrLivArea'], train['SalePrice'])
ax.set_ylabel('SalePrice', fontsize=13)
ax.set_xlabel('GrLivArea', fontsize=13)
plt.show()

# Step 3: Normalize the target variable.

In [None]:
# Histogram with KDE of the raw SalePrice values.
sns.histplot(train['SalePrice'], kde=True, stat="density", kde_kws=dict(cut=3), alpha=.4, edgecolor=(1, 1, 1, .4), bins=50);

# Fit a normal distribution to SalePrice and report its parameters.
(mu, sigma) = norm.fit(train['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Overlay the fitted normal PDF across the histogram's current x-range.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, sigma)

plt.plot(x, p, 'k', linewidth=2)
# Raw string: "\m" and "\s" are not valid Python escape sequences, so the
# LaTeX backslashes must not go through escape processing.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.title('SalePrice distribution')

# QQ-plot of SalePrice on a separate figure.
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()

In [None]:
# Apply log(1+x) to the target with numpy's log1p.
# NOTE: this overwrites train["SalePrice"] in place, so re-running the cell
# would apply the transform a second time.
train["SalePrice"] = np.log1p(train["SalePrice"])

# Histogram with KDE of the transformed SalePrice values.
sns.histplot(train['SalePrice'] , kde=True, stat="density", kde_kws=dict(cut=3), alpha=.4, edgecolor=(1, 1, 1, .4), bins=50);

# Fit a normal distribution to the transformed target and report its parameters.
(mu, sigma) = norm.fit(train['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Overlay the fitted normal PDF across the histogram's current x-range.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, sigma)

plt.plot(x, p, 'k', linewidth=2)
# Raw string: "\m" and "\s" are not valid Python escape sequences, so the
# LaTeX backslashes must not go through escape processing.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.title('SalePrice distribution')

# QQ-plot of the transformed SalePrice on a separate figure.
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()

In [None]:
# Remember the row counts so the combined frame can be split back later.
ntrain, ntest = len(train), len(test)
# The (log-transformed) target is kept separately as a numpy array.
y_train = train['SalePrice'].values
# Stack train on top of test, renumber the rows, and remove the target column.
all_data = (
    pd.concat((train, test))
    .reset_index(drop=True)
    .drop(columns=['SalePrice'])
)
print("all_data size is : {}".format(all_data.shape))

# Step 4: Fill in missing data.

In [None]:
# 34 columns have missing data.

# Percentage of missing values per column; keep only columns with any missing
# values, largest first, capped at 40 entries.
na_pct = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = na_pct[na_pct != 0].sort_values(ascending=False).head(40)
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data.head(40)

In [None]:
# Bar chart of the percentage of missing values per feature.
f, ax = plt.subplots(figsize=(15, 12))
plt.xticks(rotation='vertical')
sns.barplot(x=all_data_na.index, y=all_data_na, ax=ax)
ax.set_xlabel('Features', fontsize=15)
ax.set_ylabel('Percent of missing values', fontsize=15)
ax.set_title('Percent missing data by feature', fontsize=15)

Imputing missing values

In [None]:
# Categorical columns where NA means the feature is absent, filled with "None":
#   PoolQC (no pool), MiscFeature (no misc feature), Alley (no alley access),
#   Fence (no fence), FireplaceQu (no fireplace) -- per the data description;
#   Garage* (no garage) and Bsmt* (no basement);
#   MasVnrType (NA most likely means no masonry veneer).
absent_as_none = [
    "PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu",
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    "MasVnrType",
]
for col in absent_as_none:
    all_data[col] = all_data[col].fillna("None")

# Numeric columns where NA is treated as zero: no garage means no garage
# area/cars, no basement means no basement area/baths, and no masonry veneer
# means zero veneer area.
absent_as_zero = [
    'GarageArea', 'GarageCars',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath',
    "MasVnrArea",
]
for col in absent_as_zero:
    all_data[col] = all_data[col].fillna(0)

# LotFrontage: houses in the same neighborhood most likely have similar
# frontage, so fill missing values with the neighborhood's median LotFrontage.
neighborhood_median = all_data.groupby("Neighborhood")["LotFrontage"].transform("median")
all_data["LotFrontage"] = all_data["LotFrontage"].fillna(neighborhood_median)

# GarageYrBlt: fill missing values with the overall median.
garage_year_median = all_data['GarageYrBlt'].median()
all_data['GarageYrBlt'] = all_data['GarageYrBlt'].fillna(garage_year_median)

# Columns with only a handful of missing values: fill with the most frequent
# value (MSZoning 'RL', Electrical 'SBrkr', KitchenQual 'TA', SaleType 'WD',
# and the most common string for Exterior1st / Exterior2nd).
fill_with_mode = ['MSZoning', 'Electrical', 'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType']
for col in fill_with_mode:
    most_frequent = all_data[col].mode()[0]
    all_data[col] = all_data[col].fillna(most_frequent)

# Functional: the data description says NA means typical.
all_data["Functional"] = all_data["Functional"].fillna("Typ")

# Utilities: all records are "AllPub" except one "NoSeWa" and 2 NA, and the
# "NoSeWa" house is in the training set, so the feature cannot help prediction.
all_data = all_data.drop(columns=['Utilities'])

# MSSubClass : Na most likely means No building class. We can replace missing values with None
# all_data['MSSubClass'] = all_data['MSSubClass'].fillna("None")


In [None]:
# Recompute the missing-value percentages to confirm nothing is left to impute.
na_pct = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = na_pct[na_pct != 0].sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data.head()

Transforming some numerical variables that are really categorical.

In [None]:
# MSSubClass (the building class) is a code, not a quantity: store it as text.
all_data['MSSubClass'] = all_data['MSSubClass'].apply(str)

# OverallCond, the year sold and the month sold are likewise turned into
# categorical (string) features.
for col in ('OverallCond', 'YrSold', 'MoSold'):
    all_data[col] = all_data[col].astype(str)

In [None]:
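# DISABLED ALTERNATIVE: every line of this cell is commented out; the LabelEncoder cell below is what actually runs.
# Ordinal-encode the categorical variables whose categories carry a natural order,
# using category_encoders.OrdinalEncoder with an explicit mapping per column.
# Reference: "How to map categorical data to category_encoders.OrdinalEncoder in python pandas dataframe"
# https://stackoverflow.com/questions/50092911/how-to-map-categorical-data-to-category-encoders-ordinalencoder-in-python-pandas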
# ordinal_mappings = {
#    "MSSubClass": [np.nan, 20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180, 190], 
#    "Street": [np.nan, 'Grvl', 'Pave'],
#    "Alley": ['None', 'Grvl', 'Pave'],
#    "LotShape": [np.nan, 'IR3', 'IR2', 'IR1', 'Reg'], 
#    "LandSlope": [np.nan, 'Sev', 'Mod', 'Gtl'],
#    "OverallCond": [np.nan, '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'],
#    "ExterQual": [np.nan, 'Po', 'Fa', 'TA', 'Gd', 'Ex'], 
#    "ExterCond":[np.nan, 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
#    "BsmtQual": ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], 
#    "BsmtCond": ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], 
#    "BsmtExposure": ['None', 'No', 'Mn', 'Av', 'Gd'], 
#    "BsmtFinType1": ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'], 
#    "BsmtFinType2": ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'], 
#    "HeatingQC": [np.nan, 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
#    "CentralAir": [np.nan, 'N', 'Y'],
#    "KitchenQual": [np.nan, 'Fa', 'TA', 'Gd', 'Ex'], 
#    "Functional": [np.nan, 'Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'], 
#    "FireplaceQu": ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], 
#    "GarageFinish": ['None', 'Unf', 'RFn', 'Fin'], 
#    "GarageQual": ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], 
#    "GarageCond": ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], 
#    "PavedDrive": [np.nan, 'N', 'P', 'Y'],
#    "PoolQC": ['None', 'Fa', 'TA', 'Gd', 'Ex'], 
#    "Fence": ['None', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
# }


# transform to a suitable format for OrdinalEncoder
# ce_ordinal_mappings = []
# for col, unique_values in ordinal_mappings.items():
#    local_mapping = {val:idx for idx, val in enumerate(unique_values)}
#    ce_ordinal_mappings.append({"col":col, "mapping":local_mapping})

# encoder = OrdinalEncoder(mapping=ce_ordinal_mappings, return_df=True)
# encoder.fit_transform(all_data)

Label Encoding some categorical variables that may contain information in their ordering set.

In [None]:
# Columns to replace with integer codes.
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
        'YrSold', 'MoSold')

# Fit a fresh LabelEncoder per column and overwrite the column with its codes.
# NOTE(review): the codes follow LabelEncoder's own ordering of the labels,
# which may not match the quality ordering of the categories -- confirm.
for c in cols:
    lbl = LabelEncoder()
    all_data[c] = lbl.fit_transform(list(all_data[c].values))

# shape
print('Shape all_data: {}'.format(all_data.shape))

# Step 5: Drop highly correlated variables.

According to the data description, some features may describe essentially the same quantity. For example, 'GarageYrBlt' and 'YearBuilt' often coincide, because the garage is usually built together with the house. We draw the correlation map and pick out the pairs of features whose correlation coefficient is greater than 0.8.

In [None]:
# Correlation map: pairwise correlations between the numeric columns of train
# (SalePrice is still part of train here, so its correlations are included).
numeric_cols = train.select_dtypes(include='number')
corrmat = numeric_cols.corr()

# A large canvas is essential; on a small one many annotated cells are lost.
heat_fig, heat_ax = plt.subplots(figsize=(25, 25))
sns.heatmap(corrmat, vmax=0.9, square=True, annot=True, ax=heat_ax)

The highly correlated pairs are 'GarageYrBlt' and 'YearBuilt', 'GarageArea' and 'GarageCars', 'TotalBsmtSF' and '1stFlrSF', and 'TotRmsAbvGrd' and 'GrLivArea'. From each pair we keep the feature that has the higher correlation with 'SalePrice' and drop the other.

In [None]:
# Remove one feature from each highly correlated pair.
redundant_features = ['GarageYrBlt', 'GarageArea', '1stFlrSF', 'TotRmsAbvGrd']
all_data = all_data.drop(columns=redundant_features)
print('Shape all_data: {}'.format(all_data.shape))

# Step 6: Add new features.

In [None]:
# Engineered features, appended in this order:
#   TotalSF        - basement area plus above-ground living area
#   TotalBathrooms - full baths count 1, half baths count 0.5 (incl. basement)
#   TotalOutsideSF - deck plus all porch areas
#   LivLotRatio    - living area relative to lot area
#   MedNhbdArea    - median GrLivArea of the house's Neighborhood, since a
#                    home's value often depends on how it compares to typical
#                    homes nearby
all_data = all_data.assign(
    TotalSF=lambda df: df['TotalBsmtSF'] + df['GrLivArea'],
    TotalBathrooms=lambda df: (
        df['FullBath'] + (df['HalfBath'] * 0.5)
        + df['BsmtFullBath'] + (df['BsmtHalfBath'] * 0.5)
    ),
    TotalOutsideSF=lambda df: (
        df['WoodDeckSF'] + df['OpenPorchSF'] + df['EnclosedPorch']
        + df['3SsnPorch'] + df['ScreenPorch']
    ),
    LivLotRatio=lambda df: df['GrLivArea'] / df['LotArea'],
    MedNhbdArea=lambda df: df.groupby("Neighborhood")["GrLivArea"].transform("median"),
)

# Step 7: Normalize all numerical variables.

In [None]:
# Names of every column whose dtype is not "object".
is_not_object = (all_data.dtypes != "object").to_numpy()
numeric_feats = all_data.columns[is_not_object]

# Skew of each of those columns (NaNs ignored), most positively skewed first.
per_feature_skew = all_data[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = per_feature_skew.sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew': skewed_feats})
skewness.head(10)

Box Cox Transformation of (highly) skewed features https://onlinestatbook.com/2/transformations/box-cox.html

In [None]:
from scipy.special import boxcox1p

# Keep only the features whose absolute skew exceeds 0.75.
# The rows must be selected with a boolean Series built from the 'Skew' column:
# indexing the frame with a boolean DataFrame (skewness[abs(skewness) > 0.75])
# merely replaces the failing cells with NaN and keeps every row, which made
# the count below wrong and caused every numeric feature to be transformed.
skewness = skewness[skewness['Skew'].abs() > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

# Apply the Box-Cox transform of 1 + x, with a fixed lambda, to each skewed feature.
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], lam)

In [None]:
# One-hot encode the remaining categorical columns and report the new shape.
all_data = all_data.pipe(pd.get_dummies)
print(all_data.shape)

# Step 8: Blend models.

In [None]:
# Split the combined frame back apart: the first ntrain rows are the training
# set, everything after them is the test set.
train = all_data.iloc[:ntrain]
test = all_data.iloc[ntrain:]

In [None]:
#Validation function
n_folds = 5

def rmsle_cv(model):
    """Return the per-fold cross-validated RMSE of ``model`` on the training set.

    Reads the notebook globals ``train`` (feature frame), ``y_train``
    (log1p-transformed target) and ``n_folds``.

    Parameters
    ----------
    model : estimator
        Any estimator accepted by ``cross_val_score``.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold (``n_folds`` entries).
    """
    # Pass the KFold splitter itself as `cv`. Calling .get_n_splits() on it
    # returns only the integer number of folds, which discards shuffle=True
    # and random_state and makes cross_val_score fall back to unshuffled folds.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, train.values, y_train, scoring="neg_mean_squared_error", cv=kf))
    return rmse

In [None]:
# The linear and kernel models are wrapped in a pipeline with RobustScaler so
# that scaling is fitted together with the model.

# LASSO Regression
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))

# Elastic Net Regression
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

# Kernel Ridge Regression
KRR = make_pipeline(RobustScaler(), KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5))

# SVR
svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003))

# Gradient Boosting Regression
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10,
                                   loss='huber', random_state =5)

# XGBoost
# `verbosity=0` replaces the obsolete `silent=1` flag, and `n_jobs` is the
# scikit-learn-API name for the thread count (previously passed as `nthread`).
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                             learning_rate=0.05, max_depth=3,
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, verbosity=0,
                             random_state =7, n_jobs = -1)

# LightGBM
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=4,
                                       learning_rate=0.01,
                                       n_estimators=5000,
                                       max_bin=200,
                                       bagging_fraction=0.75,
                                       bagging_freq=5,
                                       bagging_seed=7,
                                       feature_fraction=0.2,
                                       feature_fraction_seed=7
                              # min_data_in_leaf =6, min_sum_hessian_in_leaf = 11
                             )

In [None]:
# Cross-validated error of the Lasso pipeline: mean (std) over the folds.
score = rmsle_cv(lasso)
cv_mean, cv_std = score.mean(), score.std()
print("\nLasso score: {:.4f} ({:.4f})\n".format(cv_mean, cv_std))

In [None]:
# Cross-validated error of the ElasticNet pipeline: mean (std) over the folds.
score = rmsle_cv(ENet)
cv_mean, cv_std = score.mean(), score.std()
print("ElasticNet score: {:.4f} ({:.4f})\n".format(cv_mean, cv_std))

In [None]:
# Cross-validated error of the Kernel Ridge pipeline: mean (std) over the folds.
score = rmsle_cv(KRR)
cv_mean, cv_std = score.mean(), score.std()
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(cv_mean, cv_std))

In [None]:
# Cross-validated error of the SVR pipeline: mean (std) over the folds.
score = rmsle_cv(svr)
cv_mean, cv_std = score.mean(), score.std()
print("SVR score: {:.4f} ({:.4f})\n".format(cv_mean, cv_std))

In [None]:
# Cross-validated error of Gradient Boosting: mean (std) over the folds.
score = rmsle_cv(GBoost)
cv_mean, cv_std = score.mean(), score.std()
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(cv_mean, cv_std))

In [None]:
# Cross-validated error of XGBoost: mean (std) over the folds.
score = rmsle_cv(model_xgb)
cv_mean, cv_std = score.mean(), score.std()
print("Xgboost score: {:.4f} ({:.4f})\n".format(cv_mean, cv_std))

In [None]:
# Cross-validated error of LightGBM: mean (std) over the folds.
score = rmsle_cv(model_lgb)
cv_mean, cv_std = score.mean(), score.std()
print("LGBM score: {:.4f} ({:.4f})\n".format(cv_mean, cv_std))

Stacking models

In [None]:
# Stacking Ensemble Machine Learning With Python, https://machinelearningmastery.com/stacking-ensemble-machine-learning-with-python/

# Level-0 (base) learners as (name, estimator) pairs. The SVR pipeline is not
# part of the stack.
level0 = [
    ('lasso', lasso),
    ('enet', ENet),
    ('krr', KRR),
    ('gboost', GBoost),
    ('xgb', model_xgb),
    ('lgbm', model_lgb),
]
# Level-1 (meta) learner: the XGBoost estimator object is reused here.
level1 = model_xgb
model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)

# Cross-validated error of the stacked model: mean (std) over the folds.
score = rmsle_cv(model)
cv_mean, cv_std = score.mean(), score.std()
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(cv_mean, cv_std))

Ensembling all the models defined previously.

In [None]:
# Evaluation helper used for the in-sample scores below.
def rmsle(y, y_pred):
    """Return the square root of the mean squared error between ``y`` and ``y_pred``.

    The notebook calls this with log1p-transformed prices, which is why the
    result is reported as an RMSLE.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

In [None]:
# Lasso: fit on the whole training set, keep the in-sample predictions, and
# convert the test predictions back from log1p space with expm1.
lasso_train_pred = lasso.fit(train.values, y_train).predict(train.values)
lasso_pred = np.expm1(lasso.predict(test.values))
print(rmsle(y_train, lasso_train_pred))

In [None]:
# ElasticNet: fit on the whole training set, keep the in-sample predictions,
# and convert the test predictions back from log1p space with expm1.
enet_train_pred = ENet.fit(train.values, y_train).predict(train.values)
enet_pred = np.expm1(ENet.predict(test.values))
print(rmsle(y_train, enet_train_pred))

In [None]:
# Kernel Ridge: fit on the whole training set, keep the in-sample predictions,
# and convert the test predictions back from log1p space with expm1.
krr_train_pred = KRR.fit(train.values, y_train).predict(train.values)
krr_pred = np.expm1(KRR.predict(test.values))
print(rmsle(y_train, krr_train_pred))

In [None]:
# SVR: fit on the whole training set, keep the in-sample predictions, and
# convert the test predictions back from log1p space with expm1.
svr_train_pred = svr.fit(train.values, y_train).predict(train.values)
svr_pred = np.expm1(svr.predict(test.values))
print(rmsle(y_train, svr_train_pred))

In [None]:
# Gradient Boosting: fit on the whole training set, keep the in-sample
# predictions, and convert the test predictions back from log1p space.
gboost_train_pred = GBoost.fit(train.values, y_train).predict(train.values)
gboost_pred = np.expm1(GBoost.predict(test.values))
print(rmsle(y_train, gboost_train_pred))

In [None]:
# scikit-learn StackingRegressor: fit on the whole training set, keep the
# in-sample predictions, and convert the test predictions back from log1p space.
stacking_train_pred = model.fit(train.values, y_train).predict(train.values)
stacking_pred = np.expm1(model.predict(test.values))
print(rmsle(y_train, stacking_train_pred))

In [None]:
# XGBoost: fit on the training DataFrame, keep the in-sample predictions, and
# convert the test predictions back from log1p space with expm1.
xgb_train_pred = model_xgb.fit(train, y_train).predict(train)
xgb_pred = np.expm1(model_xgb.predict(test))
print(rmsle(y_train, xgb_train_pred))

In [None]:
# LightGBM: fit on the training DataFrame, keep the in-sample predictions, and
# convert the test predictions back from log1p space with expm1.
model_lgb.fit(train, y_train)
lgb_train_pred = model_lgb.predict(train)
# Predict from the DataFrame as well: the model was fitted on a DataFrame, so
# feeding it the bare `test.values` array mixed two input types for one model.
lgb_pred = np.expm1(model_lgb.predict(test))
print(rmsle(y_train, lgb_train_pred))

In [None]:
# RMSE on the entire train data for the weighted average of all models.
# Each entry is (in-sample predictions, blend weight); the weights sum to 1.
train_blend_terms = [
    (lasso_train_pred, 0.1),
    (enet_train_pred, 0.1),
    (krr_train_pred, 0.1),
    (svr_train_pred, 0.1),
    (gboost_train_pred, 0.1),
    (stacking_train_pred, 0.25),
    (xgb_train_pred, 0.15),
    (lgb_train_pred, 0.1),
]

print('RMSLE score on train data:')
print(rmsle(y_train, sum(pred * weight for pred, weight in train_blend_terms)))

In [None]:
# Ensemble prediction: weighted average of the test-set predictions.
# Each entry is (test predictions, blend weight); the weights sum to 1.
test_blend_terms = [
    (lasso_pred, 0.1),
    (enet_pred, 0.1),
    (krr_pred, 0.1),
    (svr_pred, 0.1),
    (gboost_pred, 0.1),
    (stacking_pred, 0.25),
    (xgb_pred, 0.15),
    (lgb_pred, 0.1),
]
ensemble = sum(pred * weight for pred, weight in test_blend_terms)

# Step 9: Submission.

In [None]:
# Pair the saved test ids with the blended predictions and write the
# submission file.
sub = pd.DataFrame({'Id': test_ID, 'SalePrice': ensemble})
sub.to_csv('/kaggle/working/submission.csv', index=False)
sub.head()