## Stacking Ensemble Prediction Approach

Finally, after the feature engineering, data formatting and individual model analysis, we create a stacking ensemble model using the models below.
    - Linear Regression
    - Negative Binomial
    - Random Forest
    - XGBoost

## Load packages

In [1]:
import numpy as np    # fundamental package for scientific computing
import pandas as pd   # Python Data Analysis Library
from pandas import Series # one-dimensional labeled array capable of holding any data type 
import seaborn as sns # library for making statistical graphics in Python
import os             # operating system dependent functionality, file descriptor..

from sklearn.model_selection import train_test_split             # Split arrays or matrices into random train and test subsets

from sklearn.metrics import mean_absolute_error                  # Mean absolute error regression loss
from sklearn.metrics import median_absolute_error                # Median absolute error regression loss

import warnings
warnings.filterwarnings('ignore')                                # For warning control

In [2]:
# plotly library for visualization
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

%matplotlib inline    
# Line magic command will make plot outputs appear and be stored within the notebook.
import matplotlib.pyplot as plt   # matplotlib's plotting framework

In [81]:
# Going to use these 4 base models for the stacking
from sklearn.linear_model import LinearRegression   # base model for linear regression
from sklearn.linear_model import Lasso              # least absolute shrinkage and selection operator
from sklearn.linear_model import Ridge              # ridge regression
import statsmodels.api as sm                        # Negative binomial regression
import statsmodels.formula.api as smf               # statistical models formula APIs
from sklearn.ensemble import RandomForestRegressor  # A random forest regressor
import xgboost as xgb                               # Gradient Boosting for regression

from sklearn.model_selection import KFold           # K-Folds cross-validator 
from sklearn.model_selection import GridSearchCV    # Grid Search for paramter tuning

In [4]:
# List the contents of the datasets directory (relative to the notebook's
# working directory) to see which pickled data files are available.
PATH="./../datasets/"
os.listdir(PATH)

['dengue_test_iq.pkl',
 'dengue_test_sj.pkl',
 'dengue_train_iq.pkl',
 'dengue_train_sj.pkl',
 'test_iq_month_1.pkl',
 'test_iq_month_10.pkl',
 'test_iq_month_11.pkl',
 'test_iq_month_12.pkl',
 'test_iq_month_2.pkl',
 'test_iq_month_3.pkl',
 'test_iq_month_4.pkl',
 'test_iq_month_5.pkl',
 'test_iq_month_6.pkl',
 'test_iq_month_7.pkl',
 'test_iq_month_8.pkl',
 'test_iq_month_9.pkl',
 'test_sj_month_1.pkl',
 'test_sj_month_10.pkl',
 'test_sj_month_11.pkl',
 'test_sj_month_12.pkl',
 'test_sj_month_2.pkl',
 'test_sj_month_3.pkl',
 'test_sj_month_4.pkl',
 'test_sj_month_5.pkl',
 'test_sj_month_6.pkl',
 'test_sj_month_7.pkl',
 'test_sj_month_8.pkl',
 'test_sj_month_9.pkl',
 'train_iq_month_1.pkl',
 'train_iq_month_10.pkl',
 'train_iq_month_11.pkl',
 'train_iq_month_12.pkl',
 'train_iq_month_2.pkl',
 'train_iq_month_3.pkl',
 'train_iq_month_4.pkl',
 'train_iq_month_5.pkl',
 'train_iq_month_6.pkl',
 'train_iq_month_7.pkl',
 'train_iq_month_8.pkl',
 'train_iq_month_9.pkl',
 'train_sj_month_1.pk

In [5]:
# Locations of the pickled, city-specific train and test frames
# (sj = San Juan, iq = Iquitos).
train_filename_sj = './../datasets/dengue_train_sj.pkl'
train_filename_iq = './../datasets/dengue_train_iq.pkl'
test_filename_sj = './../datasets/dengue_test_sj.pkl'
test_filename_iq = './../datasets/dengue_test_iq.pkl'

# NOTE: unpickling executes arbitrary code, so only load pickle files that
# come from a trusted source.
# The files are read in the order: train sj, train iq, test sj, test iq.
dengue_train_sj, dengue_train_iq, dengue_test_sj, dengue_test_iq = [
    pd.read_pickle(pickle_path)
    for pickle_path in (
        train_filename_sj,
        train_filename_iq,
        test_filename_sj,
        test_filename_iq,
    )
]

In [19]:
# Wrapper giving scikit-learn style regressors and a statsmodels Negative
# Binomial GLM one common train / predict interface for stacking.
class SklearnHelper(object):
    """Uniform train/predict wrapper around a base regressor.

    Two modes are supported:

    * ``NB=False`` (default): ``reg`` is a regressor class that is instantiated
      with ``params`` plus ``random_state=seed``. Training delegates to the
      model's ``fit`` method.
    * ``NB=True``: no model is built up front. ``train`` fits a statsmodels
      GLM with a Negative Binomial family using the formula
      ``y ~ <col[0]> + <col[1]> + ...``.

    Parameters
    ----------
    reg : callable or None
        Regressor class called as ``reg(**params)``. Ignored when ``NB`` is
        true.
    seed : int
        Value passed to the regressor as ``random_state``.
    params : dict or None
        Keyword arguments for ``reg``. The dict is copied, never modified.
    NB : bool
        Select the Negative Binomial GLM mode.
    col : sequence or None
        Feature names used on the right-hand side of the GLM formula.
    alpha : float
        ``alpha`` argument of ``sm.families.NegativeBinomial``.
    """

    def __init__(self, reg, seed=0, params=None, NB=False, col=None, alpha=0.001):
        self.NB = NB
        # Avoid a shared mutable default for the column list.
        self.col = [] if col is None else col
        self.alpha = alpha
        if self.NB:
            # The GLM is created in train(); there is nothing to build yet.
            self.model = None
        else:
            # Work on a copy so the caller's dict is left untouched, and set
            # the seed BEFORE instantiating the model so it is actually used.
            model_params = dict(params or {})
            model_params['random_state'] = seed
            self.model = reg(**model_params)

    def train(self, x_train, y_train):
        """Fit the wrapped model on ``x_train`` / ``y_train``."""
        if not self.NB:
            self.model.fit(x_train, y_train)
            return

        formula = 'y ~ ' + ' + '.join(str(feature) for feature in self.col)
        # Copy so that adding the response column cannot leak into the
        # caller's feature frame.
        train = pd.DataFrame(x_train).copy()
        train['y'] = y_train
        self.model = smf.glm(formula=formula,
                             data=train,
                             family=sm.families.NegativeBinomial(alpha=self.alpha)).fit()

    def predict(self, x):
        """Return the wrapped model's predictions for ``x``."""
        return self.model.predict(x)

    def fit(self, x, y):
        """Call ``fit`` on the wrapped model and return its result."""
        return self.model.fit(x, y)

    def feature_importances(self, x, y):
        """Refit the wrapped model on ``x`` / ``y`` and return its importances.

        Raises
        ------
        AttributeError
            In Negative Binomial mode, where no ``feature_importances_``
            attribute is available.
        """
        if self.NB:
            raise AttributeError(
                'feature_importances_ is not available for the Negative Binomial model')
        return self.model.fit(x, y).feature_importances_

### Out-of-Fold Predictions

Stacking uses the predictions of the base regressors as input features for training a second-level model.

In [33]:
def get_OutOfFoldPredictions( model, x_train, y_train, x_test, folds=None ):
    """Compute out-of-fold predictions for one base model.

    For every split, the model is trained on the training part of the fold,
    then used to predict the held-out rows of ``x_train`` and all of
    ``x_test``. The per-fold test predictions are averaged.

    Parameters
    ----------
    model : object
        Must provide ``train(x, y)`` and ``predict(x)``.
    x_train, y_train : pandas objects
        Training features and target; rows are selected with ``.iloc``.
    x_test : array-like with ``shape``
        Test features, predicted once per fold.
    folds : object or None
        Splitter with a ``split(x)`` method yielding ``(train_idx, test_idx)``
        pairs. Defaults to the notebook-level ``kf``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oof_train, oof_test)`` as column vectors of shape
        ``(len(x_train), 1)`` and ``(len(x_test), 1)``.
    """
    splitter = kf if folds is None else folds
    splits = list(splitter.split(x_train))

    # Size the buffers from the arguments rather than from notebook globals,
    # which are rebound for each city and can be stale.
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]
    oof_train = np.zeros((n_train,))
    oof_test_skf = np.empty((len(splits), n_test))

    for i, (train_index, test_index) in enumerate(splits):
        model.train(x_train.iloc[train_index], y_train.iloc[train_index])
        # Held-out rows get the prediction of the model that did not see them.
        oof_train[test_index] = model.predict(x_train.iloc[test_index])
        oof_test_skf[i, :] = model.predict(x_test)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

# Generating Base First-Level Models 

So now let us prepare four learning models as our first level regression.
- The models are listed as follows:
    - Linear Regression
    - Negative Binomial
    - Random Forest
    - XGBoost

**Parameters**

Just a quick summary of the parameters that we will be listing here for completeness,

**n_jobs** : Number of cores used for the training process. If set to -1, all cores are used.

**n_estimators** : Number of trees in the ensemble (scikit-learn's default was 10 before version 0.22 and is 100 from 0.22 onward)

**max_depth** : Maximum depth of a tree, i.e. how far a node may be expanded. Beware: setting this too high risks overfitting, because the trees are grown too deep

**verbose** : Controls whether you want to output any text during the learning process. A value of 0 suppresses all text while a value of 3 outputs the tree learning process at every iteration.

 Please check out the full description via the official Sklearn website. There you will find that there are a whole host of other useful parameters that you can play around with. 

In [8]:
# Hyper-parameters for the tree-based first-level regressors, one set per
# city (sj = San Juan, iq = Iquitos).

# Random Forest.
# NOTE(review): criterion 'mae' is the legacy spelling; newer scikit-learn
# releases call it 'absolute_error' - confirm against the installed version.
rf_params_sj = dict(
    n_estimators=220,
    max_depth=4,
    criterion='mae',
    min_samples_split=10,
    max_features='sqrt',
    bootstrap=True,
    verbose=0,
)

rf_params_iq = dict(
    n_estimators=22,
    max_depth=4,
    criterion='mae',
    min_samples_split=16,
    max_features='sqrt',
    bootstrap=True,
    verbose=0,
)

# XGBoost.
# NOTE(review): XGBRegressor's verbosity switch is normally 'verbosity';
# 'verbose' is kept as-is here - confirm it has the intended effect.
xgb_params_sj = dict(
    learning_rate=0.01,
    n_estimators=225,
    max_depth=5,
    min_child_weight=12,
    gamma=0,
    subsample=0.95,
    colsample_bytree=0.5,
    reg_alpha=0.01,
    reg_lambda=1e-5,
    objective='reg:squarederror',
    verbose=0,
)

xgb_params_iq = dict(
    learning_rate=0.1,
    n_estimators=17,
    max_depth=8,
    min_child_weight=5,
    gamma=0.01,
    subsample=0.75,
    colsample_bytree=0.6,
    reg_alpha=0.75,
    reg_lambda=0.75,
    objective='reg:squarederror',
    verbose=0,
)



In [9]:
# Feature subsets, one per city, described by the author as selected by RFE
# (recursive feature elimination) with Linear Regression. They are passed as
# the formula columns of the Negative Binomial helper and used to narrow the
# x_train_* / x_test_* frames before that model is trained.
col_RFE_sj = ['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw', 'reanalysis_air_temp_c',
       'reanalysis_avg_temp_c', 'reanalysis_dew_point_temp_c',
       'reanalysis_relative_humidity_percent',
       'reanalysis_specific_humidity_g_per_kg', 'ndvi_mean',
       'station_avg_temp_c_lagVar', 'reanalysis_max_air_temp_c_lagSum',
       'reanalysis_air_temp_c_lagSum', 'reanalysis_air_temp_c_lagMean',
       'reanalysis_air_temp_c_lagVar', 'reanalysis_avg_temp_c_lagSum',
       'reanalysis_avg_temp_c_lagMean', 'reanalysis_avg_temp_c_lagVar',
       'ndvi_ne_lagSum', 'ndvi_ne_lagMean', 'ndvi_ne_lagVar',
       'reanalysis_tdtr_c_lagSum', 'reanalysis_tdtr_c_lagVar',
       'ndvi_mean_lagVar', 'ndvi_nw_lagSum', 'ndvi_nw_lagVar',
       'ndvi_se_lagSum', 'ndvi_se_lagMean', 'ndvi_se_lagVar', 'ndvi_sw_lagSum',
       'ndvi_sw_lagMean', 'ndvi_sw_lagVar']
col_RFE_iq = ['ndvi_nw', 'ndvi_sw', 'reanalysis_dew_point_temp_c',
       'reanalysis_specific_humidity_g_per_kg',
       'reanalysis_specific_humidity_g_per_kg_lagVar',
       'reanalysis_dew_point_temp_c_lagVar', 'ndvi_se_lagVar',
       'ndvi_sw_lagVar', 'ndvi_ne_lagVar', 'ndvi_nw_lagVar']

In [10]:
SEED   = 1  # seed handed to the base models for reproducibility
NFOLDS = 5  # number of folds used for the out-of-fold predictions
# The folds are contiguous, unshuffled blocks. random_state only has an
# effect together with shuffle=True, and scikit-learn >= 0.24 raises a
# ValueError when it is passed without shuffling, so it is omitted here;
# the resulting splits are the same as before.
kf = KFold( n_splits = NFOLDS, shuffle = False )

In [11]:
# Instantiate the first-level models: a Random Forest, an XGBoost regressor
# and a Negative Binomial GLM helper for each city.

# San Juan
rf_sj = SklearnHelper(RandomForestRegressor, SEED, rf_params_sj)
gb_sj = SklearnHelper(xgb.XGBRegressor, SEED, xgb_params_sj)
nb_sj = SklearnHelper(None, SEED, NB=True, col=col_RFE_sj, alpha=1.18)

# Iquitos
rf_iq = SklearnHelper(RandomForestRegressor, SEED, rf_params_iq)
gb_iq = SklearnHelper(xgb.XGBRegressor, SEED, xgb_params_iq)
nb_iq = SklearnHelper(None, SEED, NB=True, col=col_RFE_iq, alpha=0.001)


In [21]:
# San Juan: features are every column except the identifiers and the target.
x_train_sj = pd.DataFrame(dengue_train_sj.drop(columns=['city', 'year', 'total_cases']))
y_train_sj = dengue_train_sj['total_cases']
# Give the test frame exactly the training feature columns, in the same order.
x_test_sj = pd.DataFrame(dengue_test_sj, columns=x_train_sj.columns)
# Row counts kept as notebook-level globals.
ntrain, ntest = x_train_sj.shape[0], x_test_sj.shape[0]

In [34]:
# San Juan out-of-fold predictions.
# The tree models are trained on the full feature set.
rf_oof_train_sj, rf_oof_test_sj = get_OutOfFoldPredictions(
    rf_sj, x_train_sj, y_train_sj, x_test_sj)  # Random Forest
gb_oof_train_sj, gb_oof_test_sj = get_OutOfFoldPredictions(
    gb_sj, x_train_sj, y_train_sj, x_test_sj)  # Gradient Boost

# The Negative Binomial model only uses the RFE-selected columns. Note that
# x_train_sj / x_test_sj are rebound to that subset from here on.
x_train_sj, x_test_sj = x_train_sj[col_RFE_sj], x_test_sj[col_RFE_sj]
nb_oof_train_sj, nb_oof_test_sj = get_OutOfFoldPredictions(
    nb_sj, x_train_sj, y_train_sj, x_test_sj)  # Negative Binomial

In [85]:
# Iquitos: features are every column except the identifiers and the target.
x_train_iq = pd.DataFrame(dengue_train_iq.drop(columns=['city', 'year', 'total_cases']))
y_train_iq = dengue_train_iq['total_cases']
# Give the test frame exactly the training feature columns, in the same order.
x_test_iq = pd.DataFrame(dengue_test_iq, columns=x_train_iq.columns)
# Row counts kept as notebook-level globals (these replace the San Juan values).
ntrain, ntest = x_train_iq.shape[0], x_test_iq.shape[0]

In [86]:
# Iquitos out-of-fold predictions.
# The tree models are trained on the full feature set.
rf_oof_train_iq, rf_oof_test_iq = get_OutOfFoldPredictions(
    rf_iq, x_train_iq, y_train_iq, x_test_iq)  # Random Forest
gb_oof_train_iq, gb_oof_test_iq = get_OutOfFoldPredictions(
    gb_iq, x_train_iq, y_train_iq, x_test_iq)  # Gradient Boost

# The Negative Binomial model only uses the RFE-selected columns. Note that
# x_train_iq / x_test_iq are rebound to that subset from here on.
x_train_iq, x_test_iq = x_train_iq[col_RFE_iq], x_test_iq[col_RFE_iq]
nb_oof_train_iq, nb_oof_test_iq = get_OutOfFoldPredictions(
    nb_iq, x_train_iq, y_train_iq, x_test_iq)  # Negative Binomial

In [87]:
# Refit each tree-based helper on the current x_train_* frame and read its
# feature_importances_.
# NOTE(review): assuming the cells run top to bottom, x_train_sj / x_train_iq
# have already been narrowed to the RFE columns at this point, so these
# importances come from models refit on that subset, not from the models
# trained during the out-of-fold step.
rf_sj_features = rf_sj.feature_importances( x_train_sj, y_train_sj )
gb_sj_features = gb_sj.feature_importances( x_train_sj, y_train_sj )
rf_iq_features = rf_iq.feature_importances( x_train_iq, y_train_iq )
gb_iq_features = gb_iq.feature_importances( x_train_iq, y_train_iq )

[0.00644359 0.01510076 0.00758349 0.00497379 0.01436924 0.01415707
 0.01815332 0.01596558 0.02517232 0.00853181 0.00331829 0.03345881
 0.05791077 0.03899551 0.00395463 0.04145806 0.03117509 0.00476751
 0.01819894 0.01857979 0.01069877 0.05297248 0.00940716 0.07258634
 0.05933908 0.20330589 0.01208244 0.01320407 0.04408255 0.00728822
 0.01148669 0.12127794]
[0.00934986 0.00927348 0.0154     0.01016222 0.00865264 0.00767919
 0.01794899 0.01064    0.01876744 0.01068679 0.02116974 0.03579723
 0.01893968 0.02444226 0.00603216 0.03013143 0.02269513 0.00952023
 0.0159846  0.01545951 0.05508079 0.01915999 0.01028542 0.04356719
 0.02216302 0.16475454 0.02418447 0.01679084 0.05603883 0.01806889
 0.01852562 0.23264776]
[0.06144757 0.06644634 0.29610823 0.13234999 0.0944076  0.10261989
 0.07181884 0.08078204 0.04158592 0.05243359]
[0.07176472 0.09631506 0.07714727 0.14798443 0.0782735  0.0950506
 0.15163308 0.10270847 0.07302786 0.10609501]


In [69]:
#nb_feature = nb_sj.feature_importances(x_train_sj, y_train_sj)

In [91]:
# Hard-coded feature importances. These are the same numbers as the arrays
# shown in the output of the feature-importance cell above, and they replace
# whatever that cell computed (presumably to keep the plots stable between
# runs - confirm with the author). One value per column of the RFE-narrowed
# x_train_sj (32 values) / x_train_iq (10 values).
rf_sj_features = [0.00644359, 0.01510076, 0.00758349, 0.00497379, 0.01436924, 0.01415707,
                  0.01815332, 0.01596558, 0.02517232, 0.00853181, 0.00331829, 0.03345881,
                  0.05791077, 0.03899551, 0.00395463, 0.04145806, 0.03117509, 0.00476751,
                  0.01819894, 0.01857979, 0.01069877, 0.05297248, 0.00940716, 0.07258634,
                  0.05933908, 0.20330589, 0.01208244, 0.01320407, 0.04408255, 0.00728822,
                  0.01148669, 0.12127794]
gb_sj_features = [0.00934986, 0.00927348, 0.0154    , 0.01016222, 0.00865264, 0.00767919,
                  0.01794899, 0.01064   , 0.01876744, 0.01068679, 0.02116974, 0.03579723,
                  0.01893968, 0.02444226, 0.00603216, 0.03013143, 0.02269513, 0.00952023,
                  0.0159846 , 0.01545951, 0.05508079, 0.01915999, 0.01028542, 0.04356719,
                  0.02216302, 0.16475454, 0.02418447, 0.01679084, 0.05603883, 0.01806889,
                  0.01852562, 0.23264776]
rf_iq_features = [0.06144757, 0.06644634, 0.29610823, 0.13234999, 0.0944076,  0.10261989,
                  0.07181884, 0.08078204, 0.04158592, 0.05243359]
gb_iq_features = [0.07176472, 0.09631506, 0.07714727, 0.14798443, 0.0782735,  0.0950506,
                  0.15163308, 0.10270847, 0.07302786, 0.10609501]

In [92]:
# One row per feature, with the importance reported by each tree model.
# San Juan
cols = x_train_sj.columns.values
feature_dataframe_sj = pd.DataFrame({'features': cols})
feature_dataframe_sj['Random Forest feature importances'] = rf_sj_features
feature_dataframe_sj['Gradient Boost feature importances'] = gb_sj_features

# Iquitos
cols = x_train_iq.columns.values
feature_dataframe_iq = pd.DataFrame({'features': cols})
feature_dataframe_iq['Random Forest feature importances'] = rf_iq_features
feature_dataframe_iq['Gradient Boost feature importances'] = gb_iq_features

In [95]:
def _plot_importance_scatter(frame, importance_column, title, y_automargin=False):
    """Draw one Plotly scatter of feature importance per feature.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a 'features' column and ``importance_column``.
    importance_column : str
        Column holding the importances; used for both the y position and the
        marker colour.
    title : str
        Figure title.
    y_automargin : bool
        When true, ``automargin=True`` is also set on the y axis.
    """
    importances = frame[importance_column].values
    features = frame['features'].values
    trace = go.Scatter( y = importances,
                        x = features,
                        mode = 'markers',
                        marker = dict( sizemode = 'diameter',
                                       sizeref = 1,
                                       size = 25,
                                       color = importances,
                                       colorscale = 'Portland',
                                       showscale = True ),
                        text = features )
    yaxis = dict( title = 'Feature Importance', ticklen = 5, gridwidth = 2 )
    if y_automargin:
        yaxis['automargin'] = True
    layout = go.Layout( autosize = True,
                        title = title,
                        hovermode = 'closest',
                        yaxis = yaxis,
                        xaxis = dict( automargin = True ),
                        showlegend = False )
    fig = go.Figure( data = [trace], layout = layout )
    py.iplot( fig, filename = 'scatter2010' )


# San Juan
_plot_importance_scatter( feature_dataframe_sj,
                          'Random Forest feature importances',
                          'Random Forest Feature Importance (City - San Juan)' )
_plot_importance_scatter( feature_dataframe_sj,
                          'Gradient Boost feature importances',
                          'Gradient Boosting Feature Importance (City-San Juan)',
                          y_automargin = True )

# Iquitos: these plots must read the Iquitos frame. Previously they re-plotted
# the San Juan data under an Iquitos title.
_plot_importance_scatter( feature_dataframe_iq,
                          'Random Forest feature importances',
                          'Random Forest Feature Importance (City - Iquitos)' )
_plot_importance_scatter( feature_dataframe_iq,
                          'Gradient Boost feature importances',
                          'Gradient Boosting Feature Importance (City- Iquitos)',
                          y_automargin = True )

In [96]:
# Create the new column containing the average of values
feature_dataframe_sj['mean'] = feature_dataframe_sj.mean(axis= 1) # axis = 1 computes the mean row-wise
feature_dataframe_iq['mean'] = feature_dataframe_iq.mean(axis= 1) # axis = 1 computes the mean row-wise

**Plotly Barplot of Average Feature Importances**

Having obtained the mean feature importance across all our models, we can plot them into a Plotly bar plot as follows:

In [98]:
# Bar plot of the mean feature importance - San Juan.
x = feature_dataframe_sj['features'].values
y = feature_dataframe_sj['mean'].values
bar_marker = dict( color = feature_dataframe_sj['mean'].values,
                   colorscale = 'Portland',
                   showscale = True,
                   reversescale = False )
data = [go.Bar( x = x, y = y, width = 0.5, marker = bar_marker, opacity = 0.6 )]

importance_axis = dict( title = 'Feature Importance', ticklen = 5, gridwidth = 2 )
layout = go.Layout( autosize = True,
                    title = 'Barplots of Mean Feature Importance ( City - San Juan )',
                    hovermode = 'closest',
                    yaxis = importance_axis,
                    xaxis = dict( automargin = True ),
                    showlegend = False )
fig = go.Figure( data = data, layout = layout )
py.iplot( fig, filename = 'bar-direct-labels' )

# Bar plot of the mean feature importance - Iquitos.
x = feature_dataframe_iq['features'].values
y = feature_dataframe_iq['mean'].values
bar_marker = dict( color = feature_dataframe_iq['mean'].values,
                   colorscale = 'Portland',
                   showscale = True,
                   reversescale = False )
data = [go.Bar( x = x, y = y, width = 0.5, marker = bar_marker, opacity = 0.6 )]

importance_axis = dict( title = 'Feature Importance', ticklen = 5, gridwidth = 2 )
layout = go.Layout( autosize = True,
                    title = 'Barplots of Mean Feature Importance ( City - Iquitos )',
                    hovermode = 'closest',
                    yaxis = importance_axis,
                    xaxis = dict( automargin = True ),
                    showlegend = False )
fig = go.Figure( data = data, layout = layout )
py.iplot( fig, filename = 'bar-direct-labels' )

# Second-Level Predictions from the First-level Output

**First-level output as new features**

Having now obtained our first-level predictions, one can think of it as essentially building a new set of features to be used as training data for the next regression. As per the code below, we are therefore having as our new columns the first-level predictions from our earlier regressors and we train the next regression on this.

In [99]:
# Collect the out-of-fold training predictions of the three base models,
# one column per model, and preview the first rows for each city.
base_model_names = ['RandomForest', 'GradientBoost', 'NegativeBinomial']

base_predictions_train_sj = pd.DataFrame(
    np.hstack((rf_oof_train_sj, gb_oof_train_sj, nb_oof_train_sj)),
    columns=base_model_names)
print( base_predictions_train_sj.head() )

base_predictions_train_iq = pd.DataFrame(
    np.hstack((rf_oof_train_iq, gb_oof_train_iq, nb_oof_train_iq)),
    columns=base_model_names)
print( base_predictions_train_iq.head())

   RandomForest  GradientBoost  NegativeBinomial
0     14.547727      19.508806         13.574270
1     13.150000      14.942312         14.361505
2     14.200000      15.864154         23.484834
3     15.706818      17.220106         34.883915
4     19.361364      20.001890         38.960599
   RandomForest  GradientBoost  NegativeBinomial
0      3.818182       3.631974          7.936760
1      4.590909       3.917861          7.359638
2      5.863636       4.647056          6.815955
3      4.045455       3.749698          6.367498
4      4.454545       3.192487          3.443660


**Correlation Heatmap of the Second Level Training set**

In [76]:
# Correlation between the base models' out-of-fold predictions (San Juan).
corr_sj = base_predictions_train_sj.astype( float ).corr()
model_names_sj = base_predictions_train_sj.columns.values
heatmap_sj = go.Heatmap( z = corr_sj.values,
                         x = model_names_sj,
                         y = model_names_sj,
                         colorscale = 'Viridis',
                         showscale = True,
                         reversescale = True )
data = [ heatmap_sj ]
py.iplot( data, filename = 'labelled-heatmap' )

In [100]:
# Correlation between the base models' out-of-fold predictions (Iquitos).
corr_iq = base_predictions_train_iq.astype( float ).corr()
model_names_iq = base_predictions_train_iq.columns.values
heatmap_iq = go.Heatmap( z = corr_iq.values,
                         x = model_names_iq,
                         y = model_names_iq,
                         colorscale = 'Viridis',
                         showscale = True,
                         reversescale = True )
data = [ heatmap_iq ]
py.iplot( data, filename = 'labelled-heatmap' )

In [101]:
# Second-level design matrices: the base models' column vectors placed side
# by side in the order Random Forest, Gradient Boost, Negative Binomial.
x_train_sec_sj = np.hstack([rf_oof_train_sj, gb_oof_train_sj, nb_oof_train_sj])
x_test_sec_sj = np.hstack([rf_oof_test_sj, gb_oof_test_sj, nb_oof_test_sj])
x_train_sec_iq = np.hstack([rf_oof_train_iq, gb_oof_train_iq, nb_oof_train_iq])
x_test_sec_iq = np.hstack([rf_oof_test_iq, gb_oof_test_iq, nb_oof_test_iq])

Having now concatenated and joined both the first-level train and test predictions as x_train and x_test, we can now fit a second-level learning model.

### Second level learning model via XGBoost

Here we choose the eXtremely famous library for boosted tree learning model, XGBoost. It was built to optimize large-scale boosted tree algorithms. For further information about the algorithm, check out the [official documentation][1].

  [1]: https://xgboost.readthedocs.io/en/latest/

Anyway, we create an XGBRegressor, tune it with a grid search on the first-level train predictions and the target, and use the best model found to predict the test data as follows:

In [111]:
# Second-level model for San Juan: grid-search an XGBoost regressor on the
# base models' out-of-fold predictions.
X = pd.DataFrame( x_train_sec_sj )
y = y_train_sj
# NOTE(review): a subsample of 1e-5 leaves almost no rows per tree; half of
# the candidates use it - consider dropping it from the grid.
parameters_for_testing = { 'learning_rate':[0.1,0.01],
                           'max_depth':[4,5,6],
                           'n_estimators':[100,200,500,1000],
                           'gamma':[0,0.01],
                           'min_child_weight':[5,8,12,15],
                           'colsample_bytree':[0.5,0.4,0.6],
                           'reg_alpha':[0.01],
                           'reg_lambda':[1e-5,0.075],
                           'subsample':[0.95,1e-5]
                         }
xgb_model = xgb.XGBRegressor()
# The 'iid' argument was removed from GridSearchCV in scikit-learn 0.24, and
# 'eval_metric' in fit() has no effect without an eval_set (and is rejected
# by xgboost >= 2.0), so neither is passed.
gs_xg_sj = GridSearchCV( estimator = xgb_model, param_grid = parameters_for_testing,
                      n_jobs=4, verbose=1, scoring ='neg_mean_squared_error',
                      cv= 5).fit( X, y )
# NOTE: astype(int) truncates the predictions toward zero rather than rounding.
Y_sj_pred = gs_xg_sj.best_estimator_.predict( pd.DataFrame( x_test_sec_sj, columns = X.columns ) ).astype(int)

Fitting 5 folds for each of 2304 candidates, totalling 11520 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done 279 tasks      | elapsed:   17.6s
[Parallel(n_jobs=4)]: Done 779 tasks      | elapsed:   48.3s
[Parallel(n_jobs=4)]: Done 1479 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 2373 tasks      | elapsed:  2.4min
[Parallel(n_jobs=4)]: Done 3468 tasks      | elapsed:  3.6min
[Parallel(n_jobs=4)]: Done 4500 tasks      | elapsed:  4.7min
[Parallel(n_jobs=4)]: Done 5909 tasks      | elapsed:  6.2min
[Parallel(n_jobs=4)]: Done 7519 tasks      | elapsed:  7.9min
[Parallel(n_jobs=4)]: Done 9419 tasks      | elapsed:  9.9min
[Parallel(n_jobs=4)]: Done 11432 tasks      | elapsed: 12.0min




[Parallel(n_jobs=4)]: Done 11520 out of 11520 | elapsed: 12.1min finished


In [112]:
# Show the hyper-parameter combination selected by the San Juan grid search.
gs_xg_sj.best_params_

{'colsample_bytree': 0.5,
 'gamma': 0,
 'learning_rate': 0.01,
 'max_depth': 4,
 'min_child_weight': 15,
 'n_estimators': 200,
 'reg_alpha': 0.01,
 'reg_lambda': 1e-05,
 'subsample': 0.95}

In [113]:
# Second-level model for Iquitos: grid-search an XGBoost regressor on the
# base models' out-of-fold predictions.
X = pd.DataFrame( x_train_sec_iq )
y = y_train_iq
# Values tried earlier and since removed from the grid:
# gamma 0.01, min_child_weight 5 and 8, subsample 1e-5 and 1e-4.
parameters_for_testing = { 'learning_rate':[0.1,0.01],
                           'max_depth':[2,4,5],
                           'n_estimators':[100,200,300,500],
                           'gamma':[0],
                           'min_child_weight':[12,18,15,20],
                           'colsample_bytree':[0.4,0.6],
                           'reg_alpha':[0.01],
                           'reg_lambda':[1e-5,1e-4,0.075],
                           'subsample':[0.95]
                         }
xgb_model = xgb.XGBRegressor()
# The 'iid' argument was removed from GridSearchCV in scikit-learn 0.24, and
# 'eval_metric' in fit() has no effect without an eval_set (and is rejected
# by xgboost >= 2.0), so neither is passed.
gs_xg_iq = GridSearchCV( estimator = xgb_model, param_grid = parameters_for_testing,
                      n_jobs=4, verbose=1, scoring ='neg_mean_squared_error',
                      cv= 5).fit( X, y )

# NOTE: astype(int) truncates the predictions toward zero rather than rounding.
Y_iq_pred = gs_xg_iq.best_estimator_.predict( pd.DataFrame( x_test_sec_iq, columns = X.columns )).astype(int)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:    6.9s
[Parallel(n_jobs=4)]: Done 1044 tasks      | elapsed:   28.3s
[Parallel(n_jobs=4)]: Done 2044 tasks      | elapsed:  1.0min




[Parallel(n_jobs=4)]: Done 2880 out of 2880 | elapsed:  1.4min finished


In [114]:
# Show the hyper-parameter combination selected by the Iquitos grid search.
gs_xg_iq.best_params_

{'colsample_bytree': 0.4,
 'gamma': 0,
 'learning_rate': 0.01,
 'max_depth': 2,
 'min_child_weight': 20,
 'n_estimators': 200,
 'reg_alpha': 0.01,
 'reg_lambda': 0.075,
 'subsample': 0.95}