In [339]:
# Standard Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import math
import pdb
from collections import defaultdict

# Pandas API
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_string_dtype

# Sklearn
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LassoCV, LinearRegression, RidgeCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# scipy/stats
from scipy.stats import skew
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax


# XGBoost / LightGBM
import xgboost as xgb
import lightgbm as lgb
import catboost

from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_string_dtype


# Custom Classes and Functions

In [2]:
class FixMissing(BaseEstimator, TransformerMixin):
    """ Imputes meadian value for missing values
    
    Parameters
    ----------
    na_column : boolean, optional (default = 1)
        If true, will add columns keeping track of
        missing values on the original data

    """
    
    def __init__(self, na_column=True):
        self.na_column = na_column
    
    def fit(self, df, y=None):

        # filler data for columns with NA values
        self.na_dict = defaultdict(int)
        # boolean for columns with NA values
        self.is_na_column = []
        
        # filling dictionaries
        for name, column in df.items():
            if pd.isnull(column).sum():
                self.is_na_column.append(name)
            if(is_numeric_dtype(column)):
                filler = column.median()
                self.na_dict[name] = filler
        return self
                
    def transform(self, df):
        self.df = df.copy()
        for name, column in self.df.items():
    
            if name in self.is_na_column and self.na_column:
                self.df[name + '_na'] = pd.isnull(column)
                
            if (is_numeric_dtype(column) and pd.isnull(column).sum()):
                self.df.loc[:, name] = column.fillna(self.na_dict[name])
            # corner case where all values on the fit step are nan
            # drop the column if that's the case
            if math.isnan(self.na_dict[name]):
                self.df.drop(columns=name, inplace=True)
        return self.df

In [3]:
class MakeCategorical(BaseEstimator, TransformerMixin):
    """ Label encoding for columns with string/categorical data
        NA values are transformed into 0
        
    Parameters
    ----------
    skip : list of strings, optional (default = [])
        skip these columns
    fix_na : boolean, optional (default = 1)
        If true, NA values will be changed to zero
    max_cat : integer (default = 0)
        maximum number of categories for one-hot encoding
    """    
    
    def __init__(self, max_cat, skip = [], fix_na = True):
        self.skip = skip
        self.fix_na = fix_na
        self.max_cat = max_cat
    
    def fit(self, df, y=None):
        
        self.cat_dict = {}
        self.onehot_list = []
        
        for name, col in df.items():
            if is_string_dtype(col):
                # dictionary with categories from the training set
                self.cat_dict.update({name:col.astype('category').cat.categories})
                # list with columns for one hot encoding
                if len(self.cat_dict[name]) <= self.max_cat:
                    self.onehot_list.append(name)
        return self
    
    def transform(self, df):
        
        self.df = df.copy()
        for name, col in df.items():
            if is_string_dtype(col) and name not in self.skip:
                
                # convert to categories on the training set
                temp = col.astype('category')
                self.df[name] = temp.cat.set_categories(self.cat_dict[name])
                
                # one hot-encoding for selected columns
                if name in self.onehot_list:
                    sub_df = pd.get_dummies(self.df[name])
                    sub_df.rename(columns = {n:name+'_'+n for n in sub_df.columns}, inplace=True)
                    self.df = pd.concat((self.df, sub_df), axis=1)
                    self.df.drop(columns=name, inplace=True)
                # change to numerical values if not one hot encoded
                else:
                    self.df[name] = temp.cat.set_categories(self.cat_dict[name]).cat.codes + 1
                                    
                if not self.fix_na:
                    self.df.loc[self.df[name]==0, name] = np.nan
                    
        return self.df

In [4]:
class TargetMedianEncoding(BaseEstimator, TransformerMixin):
    """ TO DO
    """    
    
    def __init__(self, column):
        
        self.column = column
        
    def fit(self, X, y):
        
        df = pd.concat([X[self.column], y], axis=1)
        self.map = df.groupby(self.column).median().iloc[:, 0]
        return self
    
    def transform(self, X):
        
        self.df = X.copy()
        self.df[self.column+'_median'] = X[self.column].map(self.map)
        
        return self.df

In [5]:
def clip_data(data, columns, quantiles=[.01, .99]):
    
    # lower and upper limits for all variables
    lower_dict = {column:np.quantile(data[column], quantiles[0]) for column in columns}
    upper_dict = {column:np.quantile(data[column], quantiles[1]) for column in columns}
    
    # clips dataset
    for column in columns:
        data = data[(data[column] >= lower_dict[column]) & (data[column] <= upper_dict[column])]
    
    return data

In [6]:
def calculate_skewness(data):
    
    numerical_variables = [column for column in data.columns if is_numeric_dtype(data[column])]
    skewness = [skew(data[col]) for col in numerical_variables]
    
    df = pd.DataFrame(dict(variable=numerical_variables,
                  skewness=skewness)).sort_values('skewness', ascending=False)
    
    return df

# Loading Data

In [7]:
DATA_FOLDER = './data/house_prices/'
os.listdir(DATA_FOLDER)

['data_description.txt',
 'parsed_xgb',
 'sample_submission.csv',
 'test.csv',
 'train - Copy.csv',
 'train.csv']

In [8]:
sample = pd.read_csv(os.path.join(DATA_FOLDER, 'sample_submission.csv'))
train = pd.read_csv(os.path.join(DATA_FOLDER, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv'))

# Adjusting Outliers - Huge Impact

We clip the datasets based on the given quantiles, using the columns listes. Important: we calculate the outliers for every columns before clipping (not in a sequentially manner).

In [9]:
columns = ['GrLivArea', 'LotArea', 'TotalBsmtSF', '1stFlrSF', 'BsmtFinSF1']
train = clip_data(train, columns, quantiles=[0.01, 0.99])

Concatenating train and test data and transforming SalePrice:

In [10]:
y = np.log(train['SalePrice'])
data = pd.concat([train.drop(columns='SalePrice'), test])

Dropping Id:

In [11]:
data.drop(columns='Id', inplace=True)

# Fixing Missing Values

Using None for categorical variables:

In [12]:
fix_na_none = ['MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
               'GarageType', 'GarageFinish', 'GarageQual',
               'GarageCond', 'GarageYrBlt', 'BsmtQual',
               'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
               'BsmtFinType2']

In [13]:
for col in fix_na_none:
    data[col].fillna('None', inplace=True)

We add a boolean for pool and remove the original columns:

In [14]:
data['has_pool'] = np.where(data['PoolQC'].isna(), 0, 1)
data.drop(columns=['PoolQC', 'PoolArea'], inplace=True)

Using the median value of LotFrontage for missing values:

In [15]:
data['LotFrontage'] = data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

We will deal with the remaining missing values on the pipeline.

# Feature Engineering

Some features suggested on popular kaggle kernels:

In [16]:
data['YrBltAndRemod']=data['YearBuilt']+data['YearRemodAdd']
data['TotalSF']=data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF']

data['Total_sqr_footage'] = (data['BsmtFinSF1'] + data['BsmtFinSF2'] +
                                 data['1stFlrSF'] + data['2ndFlrSF'])

data['Total_Bathrooms'] = (data['FullBath'] + (0.5 * data['HalfBath']) +
                               data['BsmtFullBath'] + (0.5 * data['BsmtHalfBath']))

data['Total_porch_sf'] = (data['OpenPorchSF'] + data['3SsnPorch'] +
                              data['EnclosedPorch'] + data['ScreenPorch'] +
                              data['WoodDeckSF'])

Some other features and interactions:

In [17]:
data['TotalSF_OverallQual'] = data['TotalSF']*data['OverallQual']

good_bsm = np.where(data['BsmtFinType1']=='GLQ', 1, 0)
avg_bsm = np.where(data['BsmtFinType1']=='ALQ', 1, 0)

data['good_TotalSF'] = data['BsmtFinSF1']*good_bsm+data['1stFlrSF']+data['2ndFlrSF']
data['avg_TotalSF'] = data['BsmtFinSF1']*avg_bsm+data['1stFlrSF']+data['2ndFlrSF']


data['SaleCondition_SaleType'] = data['SaleCondition'].astype(str)+data['SaleType'].astype(str)


data['bin_TotalSF'] = pd.cut(data['TotalSF'],bins=10, labels=False)+1 

data['OverallQual*TotalBsmtSF'] = data['OverallQual']*data['TotalBsmtSF']


data['GarageCars*OverallQual'] = data['OverallQual']*data['GarageCars']


data['OverallQual*TotalBsmtSF'] = data['OverallQual']*data['TotalBsmtSF']

A few more...

In [18]:
data['TotalSF**2'] = data['TotalSF']**2
data['good_TotalSF**2'] = data['good_TotalSF']**2
data['OverallQual**2'] = data['OverallQual']**2
data['avg_TotalSF**2'] = data['good_TotalSF']**2
data['LotArea**2'] = data['LotArea']**2
data['OverallCond**2'] = data['OverallCond']**2

# Adjusting Skewness

Gettin a list of skewed variables:

In [19]:
numerical_variables = [column for column in data.columns if is_numeric_dtype(data[column])]
skew_df = calculate_skewness(data)
skewed_columns = skew_df[skew_df['skewness'] > 0.5]['variable'].values

Box Cox Transformation:

In [20]:
for col in skewed_columns:
    data[col] = boxcox1p(data[col], boxcox_normmax(data[col] + 1))



# Dropping Some Variables

In [21]:
data.drop(columns=['BsmtFinType2', 'RoofMatl', 'LandContour',
                   'BsmtFinType1', 'MasVnrType', 'LowQualFinSF',
                   'Heating', 'HouseStyle', 'BsmtFinSF2',
                   'BedroomAbvGr', 'PavedDrive', 'Alley',
                   'Exterior2nd', 'LotConfig',
                   'TotalBsmtSF', 'RoofStyle', 'YrSold',
                   'SaleType', 'ScreenPorch', 'FullBath',
                   'MSSubClass', 'MasVnrArea', 'LotFrontage',
                   'BsmtCond', 'WoodDeckSF', 'LotShape',
                   'MoSold'], inplace=True)

# Baseline Model

Going Back to Train and Test Data:

In [22]:
train = data[:len(y)]
test = data[len(y):]

Let's build our baseline pipeline and evaluate its performance using cross-validation. The steps on the pipeline are:  
1. Median Encoding the Target values based on Neighborhood;
2. Fix remaining missing values;
3. Transforming categorical variables (one-hot encoding for categories with less than max_cat features and label encoding for the rest);
4. Scalling the data;
5. LassoCV using prespecified alpha values.

In [28]:
alphas = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
scaler = RobustScaler()

lassocv_pipeline = Pipeline([
    ('encoder1', TargetMedianEncoding(column='Neighborhood')),
    ('fix_missing', FixMissing(na_column=False)),
    ('make_categorical', MakeCategorical(max_cat=60)),
    ('scaler', scaler),
    ('lasso', LassoCV(max_iter=1e7, alphas=alphas, 
                      random_state=42, cv=5))
])

kfolds = KFold(n_splits=10)

Notice that all the steps on the pipeline above are done respecting train-validation split methodology (taht is, no leakage). Let's evaluate the whole pipeline using cross-validation:

In [29]:
np.mean(np.sqrt(-cross_val_score(lassocv_pipeline, train, y, scoring="neg_mean_squared_error",
                                 cv=kfolds)))

0.10744128214262771

This model alone achieves a pretty good score on the Public Leaderbord: 12747.97 (around top 2.7% percentile). Obs: we are evaluating the models using log(y) on this notebook, while the Public Leaderboard shows results using the original scale. Since the transformation is monotonic, we can use log when doing local validaiton. Preparing the submission:

In [None]:
lassocv_pipeline.fit(train, y);
y_pred = lassocv_pipeline.predict(test)
sample['SalePrice'] = np.exp(y_pred)
sample.to_csv('./data/predictions.csv', index=False)

# XGBoost

Let's tune an XGBoost model and see if we can improve on the baseline model. We will preprocess the dataset before tuning the model, so we can iterate faster. After selecting a good set of parameters, we can evaluate the pipeline like we did above.

In [58]:
baseline_xgb = xgb.XGBRegressor()

Preprocessing the data:

In [55]:
scaler = RobustScaler()

preprocess_pipeline = Pipeline([
    ('encoder1', TargetMedianEncoding(column='Neighborhood')),
    ('fix_missing', FixMissing(na_column=False)),
    ('make_categorical', MakeCategorical(max_cat=60)),
    ('scaler', scaler)
])

In [56]:
train_proc = preprocess_pipeline.fit_transform(train, y)

Evaluating the performance using XGBoost with the default parameters:

In [59]:
np.mean(np.sqrt(-cross_val_score(baseline_xgb, 
                                 train_proc, y, 
                                 scoring="neg_mean_squared_error",
                                 cv=10)))

0.12833051853824423

Let's find the initial number of estimators for some standard parameters configurations:

In [70]:
def find_nrounds_xgb(model, X, y, metrics='rmse',
                     cv_folds=5, early_stopping_rounds=50):
    
    xgtrain = xgb.DMatrix(X, y)
    
    params = model.get_xgb_params()
    
    cvresult = xgb.cv(params, xgtrain, metrics=metrics, 
                      num_boost_round=params['n_estimators'],
                      early_stopping_rounds=early_stopping_rounds)
    
    # Setting optimal number of estimators
    n_rounds_optimal = cvresult.shape[0]
    model.set_params(n_estimators=n_rounds_optimal)
    
    print(cvresult.iloc[-1, :])
    print(f"n_estimators:{n_rounds_optimal}")
    
    return model

In [76]:
def grid_search(estimator, X, y, params, scoring, cv=4, random=True,
                n_iter=150, n_jobs=6):
    
    if random:
        random_search = RandomizedSearchCV(estimator, 
                                           param_distributions=params,
                                           n_iter=n_iter, n_jobs=n_jobs, 
                                           cv=cv, scoring=scoring,
                                           verbose=3, random_state=340)
    
    else:
        random_search = GridSearchCV(estimator, param_grid=params, 
                                       n_jobs=n_jobs, cv=cv,
                                       scoring='neg_mean_absolute_error',
                                       verbose=3)

    random_search.fit(X, y)
    
    return random_search

In [71]:
model = xgb.XGBRegressor(learning_rate=0.1, n_estimators=1000,
                         max_depth=5, min_child_weight=1,
                         gamma=0, colsample_bytree=0.8)

In [72]:
model = find_nrounds_xgb(model, train_proc, y)

train-rmse-mean    0.026588
train-rmse-std     0.001133
test-rmse-mean     0.120527
test-rmse-std      0.001983
Name: 186, dtype: float64
n_estimators:187


## **max_depth** / **min_child_weight**:

In [99]:
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}

In [100]:
results = grid_search(model, train_proc, y, param_test1, 
                      scoring='neg_mean_squared_error',
                      random=False)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:   12.1s
[Parallel(n_jobs=6)]: Done  48 out of  48 | elapsed:   38.4s finished


In [101]:
print(f"MSE: {-results.best_score_}")
print(results.best_params_)

MSE: 0.07867467679097549
{'max_depth': 3, 'min_child_weight': 5}


We hit the lower bound for maximum depth, let's add a few more options and try again:

In [102]:
param_test2 = {
 'max_depth':range(1,5,1),
 'min_child_weight':range(1,6,2)
}

In [103]:
results = grid_search(model, train_proc, y, param_test2, 
                      scoring='neg_mean_squared_error',
                      random=False)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    7.4s
[Parallel(n_jobs=6)]: Done  48 out of  48 | elapsed:   23.8s finished


Now we ended inside the range of parameters:

In [104]:
print(f"MSE: {-results.best_score_}")
print(results.best_params_)

MSE: 0.07791161190825713
{'max_depth': 4, 'min_child_weight': 5}


Updating out model:

In [107]:
model.set_params(**results.best_params_);

## **gamma**

In [108]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,51)]
}

In [109]:
results = grid_search(model, train_proc, y, param_test3, 
                      scoring='neg_mean_squared_error',
                      random=False)

Fitting 4 folds for each of 51 candidates, totalling 204 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:   14.4s
[Parallel(n_jobs=6)]: Done 116 tasks      | elapsed:  1.3min
[Parallel(n_jobs=6)]: Done 204 out of 204 | elapsed:  2.4min finished


Gamma equals 0 (minimum) is the best values:

In [111]:
print(f"MSE: {-results.best_score_}")
print(results.best_params_)

MSE: 0.07791161190825713
{'gamma': 0.0}


In [112]:
model.set_params(**results.best_params_)

XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=0.8, gamma=0.0,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=0.1, max_delta_step=None, max_depth=4,
             min_child_weight=5, missing=nan, monotone_constraints=None,
             n_estimators=187, n_jobs=None, num_parallel_tree=None,
             objective='reg:squarederror', random_state=None, reg_alpha=None,
             reg_lambda=None, scale_pos_weight=None, subsample=None,
             tree_method=None, validate_parameters=False, verbosity=None)

## **subsample / cosample_bytree**

In [113]:
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}

In [114]:
results = grid_search(model, train_proc, y, param_test4, 
                      scoring='neg_mean_squared_error',
                      random=False)

Fitting 4 folds for each of 16 candidates, totalling 64 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:   10.3s
[Parallel(n_jobs=6)]: Done  64 out of  64 | elapsed:   36.6s finished


In [115]:
print(f"MSE: {-results.best_score_}")
print(results.best_params_)

MSE: 0.07722327081465681
{'colsample_bytree': 0.6, 'subsample': 0.8}


Let's expand the bounds:

In [119]:
param_test5 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(1,10)]
}

In [120]:
results = grid_search(model, train_proc, y, param_test5, 
                      scoring='neg_mean_squared_error',
                      random=False)

Fitting 4 folds for each of 36 candidates, totalling 144 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    4.8s
[Parallel(n_jobs=6)]: Done 116 tasks      | elapsed:   47.6s
[Parallel(n_jobs=6)]: Done 144 out of 144 | elapsed:  1.1min finished


In [121]:
print(f"MSE: {-results.best_score_}")
print(results.best_params_)

MSE: 0.0759563339131957
{'colsample_bytree': 0.2, 'subsample': 0.9}


In [122]:
model.set_params(**results.best_params_)

XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=0.2, gamma=0.0,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=0.1, max_delta_step=None, max_depth=4,
             min_child_weight=5, missing=nan, monotone_constraints=None,
             n_estimators=187, n_jobs=None, num_parallel_tree=None,
             objective='reg:squarederror', random_state=None, reg_alpha=None,
             reg_lambda=None, scale_pos_weight=None, subsample=0.9,
             tree_method=None, validate_parameters=False, verbosity=None)

Finally, let's reduce the learning rate and increase the number of estimators:

In [123]:
model.set_params(**{'learning_rate':0.01, 'n_estimators':5000})

XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=0.2, gamma=0.0,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=0.01, max_delta_step=None, max_depth=4,
             min_child_weight=5, missing=nan, monotone_constraints=None,
             n_estimators=5000, n_jobs=None, num_parallel_tree=None,
             objective='reg:squarederror', random_state=None, reg_alpha=None,
             reg_lambda=None, scale_pos_weight=None, subsample=0.9,
             tree_method=None, validate_parameters=False, verbosity=None)

## Performance

Final Performance:

In [124]:
scaler = RobustScaler()

xgbcv_pipeline = Pipeline([
    ('encoder1', TargetMedianEncoding(column='Neighborhood')),
    ('fix_missing', FixMissing(na_column=False)),
    ('make_categorical', MakeCategorical(max_cat=60)),
    ('scaler', scaler),
    ('xgb', model)
])

kfolds = KFold(n_splits=10)

Slightly worse than the LassoCV:

In [127]:
np.mean(np.sqrt(-cross_val_score(xgbcv_pipeline, train, y, scoring="neg_mean_squared_error",
                                 cv=kfolds)))

0.10892592669298158

Preparing the Submission:

In [128]:
xgbcv_pipeline.fit(train, y);
y_pred = xgbcv_pipeline.predict(test)
sample['SalePrice'] = np.exp(y_pred)
sample.to_csv('./data/predictions.csv', index=False)

# Stacking

Loading baseline models:

In [239]:
alphas = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
scaler = RobustScaler()

lassocv_pipeline = Pipeline([
    ('encoder1', TargetMedianEncoding(column='Neighborhood')),
    ('fix_missing', FixMissing(na_column=False)),
    ('make_categorical', MakeCategorical(max_cat=60)),
    ('scaler', scaler),
    ('lasso', LassoCV(max_iter=1e7, alphas=alphas, 
                      random_state=42, cv=5))
])

In [240]:
scaler = RobustScaler()

xgb_pipeline = Pipeline([
    ('encoder1', TargetMedianEncoding(column='Neighborhood')),
    ('fix_missing', FixMissing(na_column=False)),
    ('make_categorical', MakeCategorical(max_cat=60)),
    ('scaler', scaler),
    ('xgb', model)
])

Let's build a stacking estimator:

In [295]:
class Stacking(BaseEstimator, ClassifierMixin):
    """ TO DO
    """    
    
    def __init__(self, stack_model, pipelines, cv_stack=5):
        
        self.stack_model = stack_model
        self.pipelines = pipelines
        self.cv_stack = cv_stack
        
    def fit(self, X, y):
        
        cv = KFold(n_splits=self.cv_stack, 
                   shuffle=True, random_state=583)
        
        # out of fold matrix with dimension (len(y), number of models)
        y_oof = np.zeros(shape=(len(y), len(self.pipelines))) # Validation
        
        # calculate the oof predictions of y for every model
        for i, pipeline in enumerate(self.pipelines):
            
            for tr, tt in cv.split(X, y):
                pipeline.fit(X.iloc[tr, :], y.iloc[tr])
                y_oof[tt, i] = pipeline.predict(X.iloc[tt, :])
        
        # fit the stacking model, using the oof ys as features and y as target
        self.y_oof = y_oof
        self.stack_model.fit(y_oof, y)
        
        # refitting the pipelines (for the prediction stage)
        for pipeline in self.pipelines:
            pipeline.fit(X, y)
        
        return self
    
    def predict(self, X):
        
        # calculate the predictions for all pipelines
        predictions = np.zeros(shape=(len(X), len(self.pipelines))) 
        for i, pipeline in enumerate(self.pipelines):
            
            predictions[:, i] = pipeline.predict(X)
        
        # combine these predictions using the stacking model
        y_final = self.stack_model.predict(predictions)
        
        return y_final

In [296]:
pipelines = [lassocv_pipeline, xgb_pipeline]
stacker = Stacking(LinearRegression(), pipelines)

In [297]:
results = stacker.fit(train, y)

Fitting Stacking (1st level and 2nd level models):

In [305]:
stacker.fit(train, y);

Stacking(cv_stack=5,
         pipelines=[Pipeline(memory=None,
                             steps=[('encoder1',
                                     TargetMedianEncoding(column='Neighborhood')),
                                    ('fix_missing',
                                     FixMissing(na_column=False)),
                                    ('make_categorical',
                                     MakeCategorical(fix_na=True, max_cat=60,
                                                     skip=[])),
                                    ('scaler',
                                     RobustScaler(copy=True,
                                                  quantile_range=(25.0, 75.0),
                                                  with_centering=True,
                                                  with_scaling=True)),
                                    ('lasso',
                                     LassoCV(alph...
                                                  missing=n

Preparing Submissions:

In [307]:
y_pred = stacker.predict(test)

In [308]:
sample['SalePrice'] = np.exp(y_pred)
sample.to_csv('./data/predictions_stacking.csv', index=False)