# Ensembling of Various Models: Voting and Stacking

This kernel documents a convoluted approach to improving a Kaggle public leaderboard score.
Predictions from several models are combined through a three-step series of weighted averages.

#### Import Libraries

In [160]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
import seaborn as sns
# Keep the current seaborn colour palette in `color`.
color = sns.color_palette()
# Use seaborn's 'darkgrid' style for plots.
sns.set_style('darkgrid')

# Display options: render plots inline and let pandas show up to 120 columns / rows.
%matplotlib inline
pd.set_option('display.max_columns', 120)
pd.set_option('display.max_rows', 120)

Ensemble Packages

In [785]:
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

In [786]:
n_folds = 5
def rmsle_cv(model):
    """Return the per-fold cross-validated RMSE of ``model`` on the training data.

    Reads the notebook-level ``train_x``, ``train_y`` and ``n_folds``.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style regressor; ``cross_val_score`` fits a
        clone of it on each fold.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold, on the scale of ``train_y``.
    """
    # Pass the KFold splitter itself as ``cv``. Calling ``.get_n_splits()`` on it
    # returns the plain integer ``n_folds``, which silently drops
    # ``shuffle=True`` and ``random_state=42``.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, train_x.values, train_y,
                              scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)

In [787]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor that predicts the unweighted mean of its base models.

    Parameters
    ----------
    models : iterable of estimators
        Base regressors. They are cloned in ``fit``; the clones, not the
        objects passed in, are the ones that get fitted.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Clone every base model, fit each clone on ``(X, y)`` and return self."""
        # The fitted clones live on ``models_``.
        self.models_ = list(map(clone, self.models))
        for fitted in self.models_:
            fitted.fit(X, y)
        return self

    def predict(self, X):
        """Return the row-wise mean of the fitted clones' predictions for ``X``."""
        per_model = [est.predict(X) for est in self.models_]
        stacked = np.column_stack(per_model)
        return stacked.mean(axis=1)

##### Train and test data :)

In [788]:
# Load the train / test feature tables from ../Data.
train_x = pd.read_csv('../Data/train_x2.csv')
test_x = pd.read_csv('../Data/test_x2.csv')
# The target file is read without a header row and flattened to a 1-D array.
train_y = pd.read_csv('../Data/train_y2.csv', header=None).values.ravel()

In [789]:
from sklearn.linear_model import ElasticNet, Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import svm
from sklearn.metrics import mean_squared_error
from sklearn import model_selection

In [790]:
def kaggle(x, start_id=1461):
    """Convert model predictions into a Kaggle-submission-formatted DataFrame.

    Parameters
    ----------
    x : array-like
        Model predictions. ``np.exp`` is applied to them to produce the
        ``SalePrice`` column.
    start_id : int, default 1461
        ``Id`` given to the first prediction; following rows count up by one.

    Returns
    -------
    pandas.DataFrame
        Columns ``Id`` and ``SalePrice``, ready to be written to csv for
        the Kaggle competition submission.
    """
    ids = np.arange(len(x)) + start_id
    return pd.DataFrame({'Id': ids, 'SalePrice': np.exp(x)})

###### Tiered Sum Weighted Average Kaggle LB Predictor
First, a weighted average was taken between Elastic Net and SVR (half and half). That ensembled prediction was then combined in a weighted average with lightly weighted GBR and ridge regression predictions. The result is a set of tiered averages: we make a prediction and post it to the Kaggle leaderboard; once we decide the score is good, we build the next prediction as a weighted average of that result and the earlier predictions from the individual models.

In [834]:
# Kernel ridge regression with a degree-2 polynomial kernel, fit on the training data.
KRR = KernelRidge(
    alpha=0.6,
    kernel='polynomial',
    degree=2,
    coef0=2.5,
)
KRR.fit(train_x, train_y)

KernelRidge(alpha=0.6, coef0=2.5, degree=2, gamma=None, kernel='polynomial',
      kernel_params=None)

In [835]:
# Elastic Net regression, fit on the training data.
elastic = ElasticNet(
    alpha=0.0049,
    fit_intercept=True,
    l1_ratio=0.61,
)
elastic.fit(train_x, train_y)

ElasticNet(alpha=0.0049, copy_X=True, fit_intercept=True, l1_ratio=0.61,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [836]:
# Gradient boosting: 4000 depth-2 trees with learning rate 0.01, fit on the training data.
gbr = GradientBoostingRegressor(
    max_depth=2,
    max_features=12,
    min_samples_split=10,
    subsample=0.7,
    random_state=42,
    learning_rate=0.01,
    n_estimators=4000,
    verbose=0,
)
gbr.fit(train_x, train_y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=2, max_features=12,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=10, min_weight_fraction_leaf=0.0,
             n_estimators=4000, presort='auto', random_state=42,
             subsample=0.7, verbose=0, warm_start=False)

In [837]:
# Ridge regression, fit on the training data.
ridge = Ridge(alpha=1.2)
ridge.fit(train_x, train_y)

Ridge(alpha=1.2, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [838]:
# Support vector regression with a degree-1 polynomial kernel, fit on the training data.
svr = svm.SVR(
    C=9,
    epsilon=.009,
    degree=1,
    kernel='poly',
)
svr.fit(train_x, train_y)

SVR(C=9, cache_size=200, coef0=0.0, degree=1, epsilon=0.009, gamma='auto',
  kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

1st Normal Average

In [839]:
# Predict on the test set with every fitted model and wrap each result in a
# submission-formatted frame; the order of models matches the order of names.
elastic_pred, svr_pred, KRR_pred, GBR_pred, ridge_pred = (
    kaggle(fitted.predict(test_x))
    for fitted in (elastic, svr, KRR, gbr, ridge)
)

Playing around with weights, with each candidate tested on the Kaggle public leaderboard. The best weights found are recorded below; they ended with a final public leaderboard score of 0.1154.

In [847]:
# Start from a copy. Binding `pred = elastic_pred` directly makes both names
# refer to the same DataFrame, so writing the blend into `pred` would also
# overwrite the Elastic Net predictions that later cells read, and re-running
# this cell would change its own inputs.
pred = elastic_pred.copy()
# First tier: weighted average of Elastic Net, SVR and ridge (weights sum to 1).
pred['SalePrice'] = (0.7*elastic_pred['SalePrice'] +
                     0.2*svr_pred['SalePrice'] +
                     0.1*ridge_pred['SalePrice'])

In [None]:
# In case you want to test original weighted average.
# index=False keeps the DataFrame index out of the file, so the csv holds only
# the Id and SalePrice columns, matching the other exports in this notebook.
pred.to_csv('KaggleHighestScore.csv', index=False)

In [849]:
# Copy `pred` so that writes to `pred2` do not modify `pred`.
pred2 = pred.copy()

In [852]:
# Second tier: blend the first-tier prediction with Elastic Net and kernel ridge.
# The raw weights (0.7, 0.7, 0.2) sum to 1.6, which on its own would scale every
# price up by 60%. Dividing by the weight total keeps the same relative weights
# while making this a true weighted average.
# NOTE(review): confirm the relative weights are the intended ones.
pred2['SalePrice'] = (0.7*pred['SalePrice'] +
                      0.7*elastic_pred['SalePrice'] +
                      0.2*KRR_pred['SalePrice']) / (0.7 + 0.7 + 0.2)

In [None]:
# Write the second-tier blend to csv without the DataFrame index.
pred2.to_csv(
    'AveragedModel.csv',
    index=False,
)

In [None]:
# NOTE: plain assignment, not a copy. `Final_Ensemble` and `pred2` are two names
# for the same DataFrame, so a write through either name is visible through both.
Final_Ensemble = pred2

In [None]:
# Third tier: blend the second-tier prediction with kernel ridge and GBR
# (weights sum to 1). Only the SalePrice columns are combined: arithmetic on the
# whole frames would also blend the Id column and yield a two-column DataFrame
# that cannot be stored in a single column.
Final_Ensemble['SalePrice'] = (0.8*pred2['SalePrice'] +
                               0.15*KRR_pred['SalePrice'] +
                               0.05*GBR_pred['SalePrice'])

In [None]:
# Write the final ensemble by its own name rather than through `pred2`, so the
# export does not depend on the two names pointing at the same DataFrame.
Final_Ensemble.to_csv('EnsembleAveragedModel.csv', index=False)