# Ensembling of Various Models: Voting and Stacking

#### Import Libraries

In [5]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')

#options for display
%matplotlib inline
pd.set_option('display.max_columns', 120)
pd.set_option('display.max_rows', 120)

Ensemble Packages

In [6]:
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

##### Train and test data :)

In [9]:
train_x = pd.read_csv('../Data/train_x2.csv')
train_y = pd.read_csv('../Data/train_y2.csv',header=None)
train_y = train_y.values.ravel()
test_x = pd.read_csv('../Data/test_x2.csv')

###### Scoring Metric

In [10]:
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train_x.values)
    rmse= np.sqrt(-cross_val_score(model, train_x.values, train_y, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

Libraries for specific Ensembled Models

In [11]:
#Linear Models
from sklearn.linear_model import Lasso


#Kernel Ridge Regression
from sklearn.kernel_ridge import KernelRidge


#Gradient Boosting Machines
from sklearn.ensemble import GradientBoostingRegressor

#Random Forest
from sklearn.ensemble import RandomForestRegressor

#Support Vector Machines
from sklearn import svm

#Others


### Defining Models

Lasso (Elastic went to Lasso)

In [29]:
lasso = make_pipeline(Lasso(alpha =0.0005, random_state=1))

Kernel Ridge Regression

In [30]:
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

Gradient Boost

In [31]:
gbr = GradientBoostingRegressor(max_depth = 2, max_features = 14, min_samples_split = 8, subsample = 0.5,
     random_state=42, learning_rate = 0.01, n_estimators = 8000, verbose = 0)

Random Forest

In [32]:
randomforest = RandomForestRegressor(n_jobs=-1, n_estimators=800, max_features=11,random_state=43,oob_score=True)

SVM

In [33]:
svm = svm.SVR(C=260, epsilon=.05, gamma=.0001, kernel='rbf')

##### Ok, let's average models then try a more complex ensemble

### Average Model

Building a class that takes models, fits them, then averages them

In [34]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)   

Try out a few linear models. General Linear Lasso, Kernel Ridge Regression, and Gradient Boost Regression

In [35]:
averaged_models = AveragingModels(models = (lasso, KRR, gbr, randomforest, svm))

score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

  y = column_or_1d(y, warn=True)
  # This is added back by InteractiveShellApp.init_path()
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  # This is added back by InteractiveShellApp.init_path()
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  # This is added back by InteractiveShellApp.init_path()
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  # This is added back by InteractiveShellApp.init_path()
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  # This is added back by InteractiveShellApp.init_path()
  y = column_or_1d(y, warn=True)


 Averaged base models score: 0.1163 (0.0048)



In [52]:
averaged_models.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)
  # This is added back by InteractiveShellApp.init_path()
  y = column_or_1d(y, warn=True)


AveragingModels(models=(Pipeline(steps=[('lasso', Lasso(alpha=0.0005, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=1,
   selection='cyclic', tol=0.0001, warm_start=False))]), KernelRidge(alpha=0.6, coef0=2.5, degree=2, gamma=None, kernel='polynom... epsilon=0.05, gamma=0.0001,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)))

In [55]:
predictions = averaged_models.predict(test_x)

In [56]:
prediction = pd.DataFrame({'Id' : (np.arange(len(test_x))+1461),
            'SalePrice': np.exp(predictions)})


In [65]:
prediction.to_csv(path_or_buf="../predictions.csv",index=None)

###### Oh it improves it!!!!!

### Stacking
Well let's move on to the fancy ass ensemble...
Get each model's sale price prediction (on the out of bag fold), and use that as input to another model.

Stacking Class:

In [36]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
   
    # We again fit the data on clones of the original models
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred.flatten()
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

#### Stack them up, and use a meta model to decide based on the new stack

Using Gboost as meta-model for now in case there are non-linearities

In [14]:
stacked_averaged_models = StackingAveragedModels(base_models = (lasso, GBoost, KRR),
                                                 meta_model = GBoost)

score = rmsle_cv(stacked_averaged_models)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Stacking Averaged models score: 0.1278 (0.0043)
