# **REGRESSION MODELS FOR HOUSE PRICES**

## **INITIALIZATION**

In [29]:
import pandas as pd
import pylab as pl
import numpy as np
import scipy.optimize as opt

from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

import xgboost as xgb
import lightgbm as lgb

%matplotlib inline 
import matplotlib.pyplot as plt

In [30]:
# fetch engineered train and test data
train = pd.read_csv('train_engineered.csv')
test = pd.read_csv('test_engineered.csv')
outcomes = pd.read_csv('outcomes.csv')
y_train = np.asarray(outcomes['SalePrice'].values)
train_id = train['Id']; test_id = test['Id']
del train['Id']
del test['Id']

# feature selection
features_selected =['AllSF', 'OverallQual', 'AllFlrsSF', '1stFlr_2ndFlr_Sf', 'GrLivArea',
                    'All_Liv_SF', 'ExterQual', 'TotalBath', 'KitchenQual', 'GarageCars',
                    'OverallGrade', '1stFlrSF', 'ExterGrade', 'YearBuilt', 'FullBath',
                    'YearRemodAdd', 'TotRmsAbvGrd', 'FireplaceScore', 'FireplaceQu',
                    'Foundation_PConc', 'BsmtQual', 'GarageArea', 'Fireplaces',
                    'GarageScore', 'HeatingQC', 'OpenPorchSF', 'TotalBsmtSF',
                    'KitchenScore', 'MasVnrArea', 'GarageFinish_Fin', 'GarageType_Attchd',
                    'LotArea', 'HasMasVnr', 'LotFrontage', 'GarageGrade', 'GarageQual',
                    'GarageCond', 'Neighborhood_NridgHt', 'CentralAir_Y', 'WoodDeckSF',
                    'Exterior2nd_VinylSd', 'Exterior1st_VinylSd', 'BsmtExposure',
                    'SaleType_New', 'GarageYrBlt', 'BoughtOffPlan', 'SaleCondition_Partial',
                    'HalfBath', 'MasVnrType_Stone', 'BsmtFinType1', 'RecentRemodel', 'lat',
                    'IsElectricalSBrkr', 'Electrical_SBrkr', 'PavedDrive', 'HasWoodDeck',
                    'GarageType_No', 'GarageFinish_No', 'Foundation_CBlock', 'MSZoning_RM',
                    'CentralAir_N', 'MasVnrType_None', 'GarageType_Detchd', 
                    'IsGarageDetached', 'GarageFinish_Unf', 'HasOpenPorch']

# split features observations for train and forecast
train = train[features_selected]; test = test[features_selected]
#X = np.asarray(train[features_selected])
#X_forecast = np.asarray(test[features_selected])
#print(X.shape, y_train.shape, X_forecast.shape)

## **CROSS-VALIDATION**

In [31]:
# cross-validation with shuffling
n_folds = 5
def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse = np.sqrt(-cross_val_score(model, train.values, y_train, 
                                    scoring='neg_mean_squared_error', cv=kf))
    return(rmse)

## **MODELS** from https://www.kaggle.com/pavel1988

In [32]:
# model selection
# LASSO regression made robust to outliers with RobustScaler
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0003, random_state=1))
# elastic net regression
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0003, l1_ratio=0.9, random_state=3))
# kernel ridge regression
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
# gradient boosting regression with huber loss that makes it robust to outliers
GBoost = GradientBoostingRegressor(n_estimators=5000, learning_rate=0.05, max_depth=4,
                                  max_features='sqrt', min_samples_leaf=15, 
                                   min_samples_split=10, loss='huber', random_state=5)
# xgboost
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05,
                            max_depth=3, min_child_weight=1.7817, n_estimators=2200,
                            reg_alpha=0.4640, reg_lambda=0.8571, subsample=0.5213,
                            silent=1, random_state=7, nthread=-1)
# light gbm
model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=5, learning_rate=0.05,
                             n_estimators=720, max_bin=55, bagging_fraction=0.8,
                             bagging_freq=5, feature_fraction=0.2319, 
                             feature_fraction_sed=9, bagging_seed=9, min_data_in_leaf=6,
                             min_sum_hessian_in_leaf=11)

## **BASE MODELS SCORES**

In [9]:
# evaluate the cross-validation rmsle error
score = rmsle_cv(lasso)
print('\nLasso score: {:.4f} ({:.4f})\n'.format(score.mean(), score.std()))


Lasso score: 0.1119 (0.0057)



In [11]:
score = rmsle_cv(ENet)
print('\nENet score: {:.4f} ({:.4f})\n'.format(score.mean(), score.std()))


ENet score: 0.1119 (0.0057)



In [12]:
score = rmsle_cv(KRR)
print('\nKRR score: {:.4f} ({:.4f})\n'.format(score.mean(), score.std()))


KRR score: 0.1499 (0.0213)



In [13]:
score = rmsle_cv(GBoost)
print('\nGBoost score: {:.4f} ({:.4f})\n'.format(score.mean(), score.std()))


GBoost score: 0.1140 (0.0065)



In [14]:
score = rmsle_cv(model_xgb)
print('\nxgb score: {:.4f} ({:.4f})\n'.format(score.mean(), score.std()))


xgb score: 0.1134 (0.0065)



In [15]:
score = rmsle_cv(model_lgb)
print('\nlgb score: {:.4f} ({:.4f})\n'.format(score.mean(), score.std()))


lgb score: 0.1130 (0.0073)



## **STACKING MODELS**

### **AVERAGING BASE MODELS**

In [33]:
# write a class to encapsulate model and reuse
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
    # define clones of the original models to fit in the data
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        # train cloned base models
        for model in self.models_:
            model.fit(X, y)
        return self
    # make predictions for cloned models and average predictions
    def predict(self, X):
        predictions = np.column_stack([model.predict(X) for model in self.models_])
        return np.mean(predictions, axis=1)

In [34]:
# average the base models ENet, GBoost, KRR and Lasso
averaged_models = AveragingModels(models = (ENet, GBoost, KRR, lasso))
score = rmsle_cv(averaged_models)
print('Averaged base models score: {:.4f} ({:.4f}))\n'.format(score.mean(), score.std()))

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


Averaged base models score: 0.1170 (0.0066))



### **COMPLEX STACKING: ADDING A META-MODEL**

In [None]:
# ADD A META-MODEL ON AVERAGED BASE MODELS AND USE THE OUT-OF-FOLDS PREDICTIONS OF BASE MODELS
# TO TRAIN OUR META-MODEL

In [None]:
# 1. split train set into 2 disjoint sets (train and holdout)
# 2. train several base models on the first part (train)
# 3. test the models from 2. on the second part (holdout)
# 4. use the predictions from 3. (out-of-fold predictions) as inputs,
#    and the correct outcomes (target variable) as output to train a
#    higher level learner called meta-model

### **STACKING AVERAGED MODELS CLASS**

In [35]:
# again, write a class to encapsulate
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
    # fit the data on clones of the original models
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        # train cloned base models then create out-of-fold predictions needed
        # to train the cloned meta model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
        # train cloned meta-model using the out_of_fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
        
    #make predictions of all base models on the test data and take averages as meta-features
    # for the final prediction done by the meta-model
    def predict(self, X):
        meta_features = np.column_stack([
                np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
                for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [24]:
# use the same models: average Enet, KRR and GBoost, then add lasso as meta-model
stacked_averaged_models = StackingAveragedModels(base_models=(ENet, GBoost, KRR),
                                                meta_model=lasso)
score = rmsle_cv(stacked_averaged_models)
print('Stacking Averaged models score: {:.4f} ({:.4f})\n'.format(score.mean(), score.std()))

  positive)
  positive)
  positive)
  positive)
  positive)


Stacking Averaged models score: 0.1071 (0.0060)



## **ENSEMBLING STACKEDREGRESSOR, XGBOOST AND LIGHTGBM**

In [36]:
# evaluation function
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

In [37]:
# stacked regressor
stacked_averaged_models.fit(train.values, y_train)
stacked_train_pred = stacked_averaged_models.predict(train.values)
stacked_pred = np.expm1(stacked_averaged_models.predict(test.values))
print(rmsle(y_train, stacked_train_pred))

  positive)
  positive)
  positive)
  positive)


0.07814376127966635


In [38]:
# xgboost
model_xgb.fit(train, y_train)
xgb_train_pred = model_xgb.predict(train)
xgb_pred = np.expm1(model_xgb.predict(test))
print(rmsle(y_train, xgb_train_pred))

0.08284055511870037


In [39]:
model_lgb.fit(train, y_train)
lgb_train_pred = model_lgb.predict(train)
lgb_pred = np.expm1(model_lgb.predict(test.values))
print(rmsle(y_train, lgb_train_pred))

0.07883216458873454


In [40]:
# combined scoring
'''RMSE on the entire Train data when averaging'''
print('RMSLE score on train data:')
print(rmsle(y_train, stacked_train_pred*0.70 + xgb_train_pred*0.15 + lgb_train_pred*0.15))

RMSLE score on train data:
0.07699122182532601


In [41]:
# ensemble prediction
ensemble = stacked_pred*0.70+ xgb_pred*0.15 + lgb_pred*0.15

In [42]:
# prepare for submission
sub = pd.DataFrame()
sub['Id']= test_id
sub['SalePrice'] = ensemble
sub.to_csv('submission_regression.csv', index=False)
sub.head(30)

Unnamed: 0,Id,SalePrice
0,1461,120097.273523
1,1462,161615.71162
2,1463,184195.472198
3,1464,193073.539693
4,1465,186013.338494
5,1466,173034.543152
6,1467,175605.371582
7,1468,166429.277457
8,1469,191506.117446
9,1470,121455.587709
