## Stacking Ensemble Prediction Approach

Finally after the feature engineering, data formatting and individual model analysis, creating  a Stacking ensemble model using the below models.
    - Linear Regression
    - Negative Binomial
    - Random Forest
    - XGBoost

## Load packages

In [1]:
import numpy as np    # fundamental package for scientific computing
import pandas as pd   # Python Data Analysis Library
from pandas import Series # one-dimensional labeled array capable of holding any data type 
import seaborn as sns # library for making statistical graphics in Python
import os             # operating system dependent functionality, file descriptor..

from sklearn.model_selection import train_test_split             # Split arrays or matrices into random train and test subsets

from sklearn.metrics import mean_absolute_error                  # Mean absolute error regression loss
from sklearn.metrics import median_absolute_error                # Median absolute error regression loss

import warnings
warnings.filterwarnings('ignore')                                # For warning control

In [2]:
# plotly library for visalization
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

%matplotlib inline    
# Line magic command will make plot outputs appear and be stored within the notebook.
import matplotlib.pyplot as plt   # matplotlib's plotting framework

In [3]:
# Going to use these 4 base models for the stacking
from sklearn.linear_model import LinearRegression   # base model for linear regression
from sklearn.linear_model import Lasso              # least absolute shrinkage and selection operator
from sklearn.linear_model import Ridge              # ridge regression
import statsmodels.api as sm                        # Negative binomial regression
import statsmodels.formula.api as smf               # statistical models formula APIs
from sklearn.ensemble import RandomForestRegressor  # A random forest regressor
import xgboost as xgb                               # Gradient Boosting for regression

from sklearn.model_selection import KFold           # K-Folds cross-validator 
from sklearn.model_selection import GridSearchCV    # Grid Search for paramter tuning

In [4]:
# Let's check what data files are available.
PATH="./../datasets/"
os.listdir(PATH)

['dengue_test_iq.pkl',
 'dengue_test_sj.pkl',
 'dengue_train_iq.pkl',
 'dengue_train_sj.pkl',
 'test_iq_month_1.pkl',
 'test_iq_month_10.pkl',
 'test_iq_month_11.pkl',
 'test_iq_month_12.pkl',
 'test_iq_month_2.pkl',
 'test_iq_month_3.pkl',
 'test_iq_month_4.pkl',
 'test_iq_month_5.pkl',
 'test_iq_month_6.pkl',
 'test_iq_month_7.pkl',
 'test_iq_month_8.pkl',
 'test_iq_month_9.pkl',
 'test_sj_month_1.pkl',
 'test_sj_month_10.pkl',
 'test_sj_month_11.pkl',
 'test_sj_month_12.pkl',
 'test_sj_month_2.pkl',
 'test_sj_month_3.pkl',
 'test_sj_month_4.pkl',
 'test_sj_month_5.pkl',
 'test_sj_month_6.pkl',
 'test_sj_month_7.pkl',
 'test_sj_month_8.pkl',
 'test_sj_month_9.pkl',
 'train_iq_month_1.pkl',
 'train_iq_month_10.pkl',
 'train_iq_month_11.pkl',
 'train_iq_month_12.pkl',
 'train_iq_month_2.pkl',
 'train_iq_month_3.pkl',
 'train_iq_month_4.pkl',
 'train_iq_month_5.pkl',
 'train_iq_month_6.pkl',
 'train_iq_month_7.pkl',
 'train_iq_month_8.pkl',
 'train_iq_month_9.pkl',
 'train_sj_month_1.pk

In [5]:
# let's load the train and test data
train_filename_sj = ( './../datasets/dengue_train_sj.pkl' )
train_filename_iq = ( './../datasets/dengue_train_iq.pkl' )
test_filename_sj  = ( './../datasets/dengue_test_sj.pkl' )
test_filename_iq  = ( './../datasets/dengue_test_iq.pkl' )

dengue_train_sj = pd.read_pickle( train_filename_sj )
dengue_train_iq = pd.read_pickle( train_filename_iq )
dengue_test_sj  = pd.read_pickle( test_filename_sj )
dengue_test_iq  = pd.read_pickle( test_filename_iq )

In [6]:
# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, reg, seed=0, params=None, NB=False,col=[],alpha=0.001):
        self.NB = NB
        self.col = col
        self.alpha = alpha
        if( self.NB == True ):
            self.model = None
        else:
            self.model = reg(**params)
            params['random_state'] = seed

    def train(self, x_train, y_train):
        if( self.NB == False ):
            self.model.fit(x_train, y_train)
        else:
            formula = ' + '.join([ str(feature) for feature in list(self.col)])
            formula = 'y ~ ' + formula
            train      = pd.DataFrame( x_train )
            train['y'] = y_train
            self.model = smf.glm( formula = formula,
                                  data = train,
                                  family = sm.families.NegativeBinomial(alpha=self.alpha)).fit()
        

    def predict(self, x):
        return self.model.predict(x)
    
    def fit(self,x,y):
        return self.model.fit(x,y)
    
    def feature_importances(self,x,y):
        if( self.NB == False ):
            print( self.model.fit(x,y).feature_importances_ )
        else:
            print( self.model.feature_importances_ )

### Out-of-Fold Predictions

Stacking uses predictions of base regressor as input for training to a second-level model.

In [7]:
def get_OutOfFoldPredictions( model, x_train, y_train, x_test ):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train.iloc[train_index]
        y_tr = y_train.iloc[train_index]
        x_te = x_train.iloc[test_index]
        model.train(x_tr, y_tr)
        oof_train[test_index] = model.predict(x_te)
        oof_test_skf[i, :] = model.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

# Generating Base First-Level Models 

So now let us prepare four learning models as our first level regression.
- The models are listed as follows:
    - Linear Regression
    - Negative Binomial
    - Random Forest
    - XGBoost

In [8]:
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params_sj = {
    'n_estimators': 220,
    'max_depth': 4,
    'criterion': 'mae',
    'min_samples_split': 10,
    'max_features': 'sqrt',
    'bootstrap': True,
    'verbose': 0
}
        
rf_params_iq = {
    'n_estimators': 22,
    'max_depth': 4,
    'criterion': 'mae',
    'min_samples_split': 16,
    'max_features': 'sqrt',
    'bootstrap': True,
    'verbose': 0
}

# XGBoost parameters
xgb_params_sj = {
    'learning_rate': 0.01, 
    'n_estimators': 225,
    'max_depth': 5,
    'min_child_weight': 12,
    'gamma': 0,
    'subsample': 0.95,
    'colsample_bytree': 0.5,
    'reg_alpha': 0.01,
    'reg_lambda': 1e-5,
    'objective':'reg:squarederror',
    'verbose': 0
}
xgb_params_iq = {
    'learning_rate': 0.1, 
    'n_estimators': 17,
    'max_depth': 8,
    'min_child_weight': 5,
    'gamma': 0.01,
    'subsample': 0.75,
    'colsample_bytree': 0.6,
    'reg_alpha': 0.75,
    'reg_lambda': 0.75,
    'objective': 'reg:squarederror',
    'verbose': 0
}

# Linear Regression parameters ( L1 lasso regularization )
l1_params_sj = {
    'alpha': 2
}
lr_params_iq = {
}


In [9]:
#  RFE selected features from Linear Regression
col_RFE_sj = ['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw', 'reanalysis_air_temp_c',
       'reanalysis_avg_temp_c', 'reanalysis_dew_point_temp_c',
       'reanalysis_relative_humidity_percent',
       'reanalysis_specific_humidity_g_per_kg', 'ndvi_mean',
       'station_avg_temp_c_lagVar', 'reanalysis_max_air_temp_c_lagSum',
       'reanalysis_air_temp_c_lagSum', 'reanalysis_air_temp_c_lagMean',
       'reanalysis_air_temp_c_lagVar', 'reanalysis_avg_temp_c_lagSum',
       'reanalysis_avg_temp_c_lagMean', 'reanalysis_avg_temp_c_lagVar',
       'ndvi_ne_lagSum', 'ndvi_ne_lagMean', 'ndvi_ne_lagVar',
       'reanalysis_tdtr_c_lagSum', 'reanalysis_tdtr_c_lagVar',
       'ndvi_mean_lagVar', 'ndvi_nw_lagSum', 'ndvi_nw_lagVar',
       'ndvi_se_lagSum', 'ndvi_se_lagMean', 'ndvi_se_lagVar', 'ndvi_sw_lagSum',
       'ndvi_sw_lagMean', 'ndvi_sw_lagVar']
col_RFE_iq = ['ndvi_nw', 'ndvi_sw', 'reanalysis_dew_point_temp_c',
       'reanalysis_specific_humidity_g_per_kg',
       'reanalysis_specific_humidity_g_per_kg_lagVar',
       'reanalysis_dew_point_temp_c_lagVar', 'ndvi_se_lagVar',
       'ndvi_sw_lagVar', 'ndvi_ne_lagVar', 'ndvi_nw_lagVar']
col_FFS_iq = ['reanalysis_tdtr_c_lagVar', 'reanalysis_specific_humidity_g_per_kg_lagMean',
          'reanalysis_precip_amt_kg_per_m2_lagVar', 'ndvi_se_lagVar', 'ndvi_ne_lagVar',
          'reanalysis_precip_amt_kg_per_m2_lagMean', 'station_min_temp_c']

In [10]:
SEED   = 1 # for reproducibility
NFOLDS = 10 # set folds for out-of-fold prediction
kf = KFold( n_splits = NFOLDS, random_state=SEED)

In [11]:
# Create objects that represent our models

# For san Juan
rf_sj = SklearnHelper(reg=RandomForestRegressor, seed=SEED, params=rf_params_sj)
gb_sj = SklearnHelper(reg=xgb.XGBRegressor, seed=SEED, params=xgb_params_sj)
nb_sj = SklearnHelper(reg=None, seed=SEED, params=None, NB=True, col=col_RFE_sj, alpha=1.18 )
lr_sj = SklearnHelper(reg=Lasso, seed=SEED, params=l1_params_sj )

# For Iquitos
rf_iq = SklearnHelper(reg=RandomForestRegressor, seed=SEED, params=rf_params_iq)
gb_iq = SklearnHelper(reg=xgb.XGBRegressor, seed=SEED, params=xgb_params_iq)
nb_iq = SklearnHelper(reg=None, seed=SEED, params=None, NB=True, col=col_RFE_iq, alpha=0.001 )
lr_iq = SklearnHelper(reg=LinearRegression, seed=SEED, params=lr_params_iq )


In [12]:
x_train_sj = pd.DataFrame( dengue_train_sj )
x_train_sj.drop( columns = ['city','year','total_cases'], inplace = True )
y_train_sj = dengue_train_sj.total_cases
x_test_sj = pd.DataFrame( dengue_test_sj, columns = x_train_sj.columns )
ntrain = x_train_sj.shape[0]
ntest = x_test_sj.shape[0]

In [13]:
rf_oof_train_sj, rf_oof_test_sj = get_OutOfFoldPredictions( rf_sj,
                                                            x_train_sj,
                                                            y_train_sj,
                                                            x_test_sj ) # Random Forest
gb_oof_train_sj, gb_oof_test_sj = get_OutOfFoldPredictions( gb_sj,
                                                            x_train_sj,
                                                            y_train_sj,
                                                            x_test_sj ) # Gradient Boost
x_train_RFE_sj = x_train_sj[col_RFE_sj]
x_test_RFE_sj  = x_test_sj[col_RFE_sj]
nb_oof_train_sj, nb_oof_test_sj = get_OutOfFoldPredictions( nb_sj,
                                                            x_train_RFE_sj,
                                                            y_train_sj,
                                                            x_test_RFE_sj )  # Negative Binomial
lr_oof_train_sj, lr_oof_test_sj = get_OutOfFoldPredictions( lr_sj,
                                                            x_train_RFE_sj,
                                                            y_train_sj,
                                                            x_test_RFE_sj )  # Linear Regression

In [14]:
x_train_iq = pd.DataFrame( dengue_train_iq )
x_train_iq.drop( columns = ['city','year','total_cases'], inplace = True )
y_train_iq = dengue_train_iq.total_cases
x_test_iq = pd.DataFrame( dengue_test_iq, columns = x_train_iq.columns )
ntrain = x_train_iq.shape[0]
ntest  = x_test_iq.shape[0]

In [15]:
rf_oof_train_iq, rf_oof_test_iq = get_OutOfFoldPredictions( rf_iq,
                                                            x_train_iq,
                                                            y_train_iq,
                                                            x_test_iq ) # Random Forest
gb_oof_train_iq, gb_oof_test_iq = get_OutOfFoldPredictions( gb_iq,
                                                            x_train_iq,
                                                            y_train_iq,
                                                            x_test_iq ) # Gradient Boost
x_train_nb_iq = x_train_iq[col_RFE_iq]
x_test_nb_iq  = x_test_iq[col_RFE_iq]
nb_oof_train_iq, nb_oof_test_iq = get_OutOfFoldPredictions( nb_iq,
                                                            x_train_nb_iq,
                                                            y_train_iq,
                                                            x_test_nb_iq )  # Negative Binomial
x_train_lr_iq = x_train_iq[col_FFS_iq]
x_test_lr_iq  = x_test_iq[col_FFS_iq]
lr_oof_train_iq, lr_oof_test_iq = get_OutOfFoldPredictions( lr_iq,
                                                            x_train_lr_iq,
                                                            y_train_iq,
                                                            x_test_lr_iq )  # Linear Regression

In [16]:
rf_sj_features = rf_sj.feature_importances( x_train_sj, y_train_sj )
gb_sj_features = gb_sj.feature_importances( x_train_sj, y_train_sj )
rf_iq_features = rf_iq.feature_importances( x_train_iq, y_train_iq )
gb_iq_features = gb_iq.feature_importances( x_train_iq, y_train_iq )

[0.07029777 0.00219327 0.00292479 0.00101336 0.00230713 0.00081825
 0.0067825  0.00164834 0.0072039  0.00336376 0.00091378 0.00420098
 0.00245287 0.00068504 0.0076273  0.0045073  0.00347677 0.00286253
 0.00567335 0.00104624 0.00225147 0.07150726 0.03495467 0.00266393
 0.03112396 0.01990394 0.0035615  0.02299255 0.02644218 0.00148009
 0.00265207 0.00260086 0.00252744 0.01100528 0.00945348 0.00409327
 0.01005412 0.01156765 0.0017419  0.00674039 0.00820239 0.00539959
 0.0115724  0.009396   0.00248839 0.01087023 0.01258675 0.0027207
 0.00588235 0.00819142 0.00079096 0.01381779 0.01509903 0.0056947
 0.01301619 0.01444608 0.00524432 0.00479788 0.00367792 0.01055272
 0.01464253 0.01180444 0.0042838  0.00350379 0.00653898 0.00193065
 0.0049526  0.00290752 0.00287192 0.01040244 0.00766109 0.03489003
 0.00237607 0.00309256 0.00341422 0.01752735 0.02487205 0.12595353
 0.00455193 0.00773774 0.0315453  0.00317069 0.00229619 0.09130757]
[0.02062602 0.00304032 0.01259764 0.01393582 0.00207411 0.00664

In [17]:
rf_sj_features = [0.06552142, 0.00219433, 0.00140005, 0.00115677, 0.00198608, 0.        ,
 0.0059839 , 0.00354681, 0.00809153, 0.00207007, 0.00111632, 0.0029966,
 0.00186996, 0.0007398 , 0.00614606, 0.00325124, 0.0035698 , 0.00241256,
 0.00658614, 0.0027097 , 0.0008126 , 0.06134252, 0.03447339, 0.00081739,
 0.0312964 , 0.02336969, 0.00525984, 0.0281136 , 0.02517242, 0.00343475,
 0.00230136, 0.00610341, 0.0015246 , 0.00807308, 0.00708329, 0.0056468,
 0.01346309, 0.01173351, 0.00122551, 0.01123167, 0.00774375, 0.00522104,
 0.01211497, 0.00945242, 0.00038477, 0.01874965, 0.01604164, 0.0026474,
 0.00386768, 0.00583344, 0.00117332, 0.01345873, 0.01780789, 0.00653702,
 0.01248731, 0.01464591, 0.00517133, 0.00684301, 0.0028884 , 0.00898438,
 0.01530477, 0.009724  , 0.00311161, 0.00432615, 0.0053499 , 0.00239065,
 0.00369173, 0.00406703, 0.00327751, 0.00809449, 0.00902947, 0.04528149,
 0.00346488, 0.00217366, 0.00124962, 0.02209893, 0.0165887 , 0.10035036,
 0.00797168, 0.0063134 , 0.03399828, 0.00067597, 0.0028558 , 0.1107278 ]
gb_sj_features = [0.02062602, 0.00304032, 0.01259764, 0.01393582, 0.00207411, 0.00664518,
 0.00300613, 0.00335041, 0.00888941, 0.00696998, 0.00194022, 0.02007009,
 0.00331984, 0.        , 0.00916281, 0.00229268, 0.00523384, 0.00302132,
 0.00520209, 0.00421099, 0.00645869, 0.01774131, 0.01511698, 0.00117846,
 0.01797894, 0.01836765, 0.00402221, 0.01322321, 0.01345389, 0.00235425,
 0.00668141, 0.00664038, 0.02105593, 0.00860308, 0.00956656, 0.0075722,
 0.01340138, 0.0093812 , 0.0016997 , 0.00440383, 0.00250956, 0.01066318,
 0.00792598, 0.00854455, 0.01487491, 0.0162416 , 0.01397573, 0.00452146,
 0.00195694, 0.00401489, 0.00981712, 0.00623803, 0.0061285 , 0.00459388,
 0.0106931 , 0.01163398, 0.01457163, 0.00653599, 0.00545781, 0.01061611,
 0.00751453, 0.00601226, 0.00537275, 0.00772282, 0.00713389, 0.01155213,
 0.00516064, 0.01101745, 0.01132101, 0.00522063, 0.00457807, 0.05886888,
 0.00425028, 0.00412684, 0.01093994, 0.01034258, 0.01426637, 0.10897352,
 0.01035224, 0.01106404, 0.02984831, 0.01471299, 0.01772999, 0.11591283]
rf_iq_features = [0.01843478, 0.0087513 , 0.00158497, 0.        , 0.00344156, 0.00773927,
 0.        , 0.        , 0.04303735, 0.0120708 , 0.00386065, 0.0059675,
 0.00303337, 0.        , 0.05955841, 0.00316269, 0.00772704, 0.02059143,
 0.00296104, 0.00445259, 0.        , 0.01974483, 0.01848046, 0.00624665,
 0.07224036, 0.02540893, 0.0033652 , 0.01131129, 0.02344694, 0.00586568,
 0.00732532, 0.0690712 , 0.01595218, 0.01027952, 0.02928023, 0.05693126,
 0.00919114, 0.00151668, 0.01541319, 0.01088209, 0.00689789, 0.02549505,
 0.00779642, 0.        , 0.00873778, 0.00284981, 0.00210403, 0.00904322,
 0.00750833, 0.        , 0.00458447, 0.        , 0.00096516, 0.00403482,
 0.00792541, 0.00919065, 0.0119589 , 0.00175827, 0.00277846, 0.0222118,
 0.01347531, 0.02406618, 0.00780779, 0.01246299, 0.01611583, 0.01115656,
 0.00705038, 0.00384029, 0.00795892, 0.00363233, 0.        , 0.00895901,
 0.        , 0.00628657, 0.03005151, 0.00092619, 0.00468793, 0.01265017,
 0.00421725, 0.00535795, 0.01334902, 0.02224283, 0.01040133, 0.01913523]
gb_iq_features = [0.02971221, 0.00696406, 0.00463756, 0.00129092, 0.00036199, 0.00555033,
 0.00610315, 0.00412624, 0.02326182, 0.00321228, 0.00685143, 0.0156428,
 0.01165206, 0.0031447 , 0.00633739, 0.00396873, 0.00332629, 0.0047188,
 0.00223654, 0.02499755, 0.00283266, 0.16362487, 0.01411329, 0.001122,
 0.02098849, 0.00078195, 0.02263264, 0.02001691, 0.03256186, 0.03412917,
 0.01950135, 0.0291129 , 0.00322758, 0.00626948, 0.00066504, 0.01134744,
 0.00444213, 0.01027194, 0.01466779, 0.00468768, 0.00986198, 0.0458931,
 0.00526166, 0.        , 0.02060457, 0.018149  , 0.01169879, 0.00287929,
 0.02073549, 0.00301833, 0.00193202, 0.00264979, 0.00141093, 0.00651464,
 0.00125476, 0.        , 0.0095772 , 0.00520398, 0.0129438 , 0.03874254,
 0.01817854, 0.01335398, 0.00487748, 0.01285064, 0.00142793, 0.01063166,
 0.00302515, 0.00642072, 0.01147957, 0.00791891, 0.01743513, 0.00921271,
 0.00217802, 0.00193674, 0.01125835, 0.00681128, 0.00139544, 0.01041006,
 0.00960818, 0.00124423, 0.02775898, 0.00330617, 0.00115449, 0.00269979]

In [18]:
cols = x_train_sj.columns.values
# Create a dataframe with features
feature_dataframe_sj = pd.DataFrame( { 'features': cols,
                                       'Random Forest feature importances': rf_sj_features,
                                       'Gradient Boost feature importances': gb_sj_features
                                    })
cols = x_train_iq.columns.values
feature_dataframe_iq = pd.DataFrame( { 'features': cols,
                                      'Random Forest feature importances': rf_iq_features,
                                      'Gradient Boost feature importances': gb_iq_features
                                    })
# for Negative binomial and Linear regressionused the RFE features,
#  so feature importance is not considered for analysis

In [19]:
# Scatter plot 
trace = go.Scatter( y = feature_dataframe_sj['Random Forest feature importances'].values,
                    x = feature_dataframe_sj['features'].values,
                    mode='markers',
                    marker=dict( sizemode = 'diameter',
                                 sizeref = 1,
                                 size = 25,
                                 color = feature_dataframe_sj['Random Forest feature importances'].values,
                                 colorscale='Portland',
                                 showscale=True ),
                    text = feature_dataframe_sj['features'].values )
data = [trace]

layout= go.Layout(  autosize= True,
                    title= 'Random Forest Feature Importance (City - San Juan)',
                    hovermode= 'closest',
                    yaxis=dict( title= 'Feature Importance',
                                ticklen= 5,
                                gridwidth= 2 ),
                    xaxis= dict( automargin= True),
                    showlegend= False )
fig = go.Figure( data=data, layout=layout )
py.iplot( fig, filename='scatter2010' )

# Scatter plot 
trace = go.Scatter( y = feature_dataframe_sj['Gradient Boost feature importances'].values,
                    x = feature_dataframe_sj['features'].values,
                    mode='markers',
                    marker=dict( sizemode = 'diameter',
                                 sizeref = 1,
                                 size = 25,
                                 color = feature_dataframe_sj['Gradient Boost feature importances'].values,
                                 colorscale='Portland',
                                 showscale=True ),
                    text = feature_dataframe_sj['features'].values )
data = [trace]

layout= go.Layout(  autosize= True,
                    title= 'Gradient Boosting Feature Importance (City-San Juan)',
                    hovermode= 'closest',
                    yaxis=dict( title= 'Feature Importance',
                                ticklen= 5,
                                gridwidth= 2,
                              automargin= True),
                    xaxis= dict( automargin= True),
                    showlegend= False )
fig = go.Figure( data=data, layout=layout)
py.iplot( fig, filename='scatter2010' )

# Scatter plot 
trace = go.Scatter( y = feature_dataframe_sj['Random Forest feature importances'].values,
                    x = feature_dataframe_sj['features'].values,
                    mode='markers',
                    marker=dict( sizemode = 'diameter',
                                 sizeref = 1,
                                 size = 25,
                                 color = feature_dataframe_sj['Random Forest feature importances'].values,
                                 colorscale='Portland',
                                 showscale=True ),
                    text = feature_dataframe_sj['features'].values )
data = [trace]

layout= go.Layout(  autosize= True,
                    title= 'Random Forest Feature Importance (City - Iquitos)',
                    hovermode= 'closest',
                    yaxis=dict( title= 'Feature Importance',
                                ticklen= 5,
                                gridwidth= 2 ),
                    xaxis= dict( automargin= True),
                    showlegend= False )
fig = go.Figure( data=data, layout=layout )
py.iplot( fig, filename='scatter2010' )

# Scatter plot 
trace = go.Scatter( y = feature_dataframe_sj['Gradient Boost feature importances'].values,
                    x = feature_dataframe_sj['features'].values,
                    mode='markers',
                    marker=dict( sizemode = 'diameter',
                                 sizeref = 1,
                                 size = 25,
                                 color = feature_dataframe_sj['Gradient Boost feature importances'].values,
                                 colorscale='Portland',
                                 showscale=True ),
                    text = feature_dataframe_sj['features'].values )
data = [trace]

layout= go.Layout(  autosize= True,
                    title= 'Gradient Boosting Feature Importance (City- Iquitos)',
                    hovermode= 'closest',
                    yaxis=dict( title= 'Feature Importance',
                                ticklen= 5,
                                gridwidth= 2,
                              automargin= True),
                    xaxis= dict( automargin= True),
                    showlegend= False )
fig = go.Figure( data=data, layout=layout)
py.iplot( fig, filename='scatter2010' )

In [20]:
# Create the new column containing the average of values
feature_dataframe_sj['mean'] = feature_dataframe_sj.mean(axis= 1) # axis = 1 computes the mean row-wise
feature_dataframe_iq['mean'] = feature_dataframe_iq.mean(axis= 1) # axis = 1 computes the mean row-wise

**Plotly Barplot of Average Feature Importances( RF & XGB )**

Having obtained the mean feature importance across RF & XGB models, we can plot them into a Plotly bar plot as follows:

In [21]:
y = feature_dataframe_sj['mean'].values
x = feature_dataframe_sj['features'].values
data = [go.Bar( x= x,
                y= y,
                width = 0.5,
                marker=dict( color = feature_dataframe_sj['mean'].values,
                             colorscale='Portland',
                             showscale=True,
                             reversescale = False
                            ),
                opacity=0.6 )]

layout= go.Layout( autosize= True,
                   title= 'Barplots of Mean Feature Importance ( City - San Juan )',
                   hovermode= 'closest',
                   yaxis=dict( title= 'Feature Importance',
                               ticklen= 5,
                               gridwidth= 2 ),
                   xaxis = dict( automargin= True ),
                   showlegend = False )
fig = go.Figure( data=data, layout = layout )
py.iplot(fig, filename='bar-direct-labels')

y = feature_dataframe_iq['mean'].values
x = feature_dataframe_iq['features'].values
data = [go.Bar( x= x,
                y= y,
                width = 0.5,
                marker=dict( color = feature_dataframe_iq['mean'].values,
                             colorscale='Portland',
                             showscale=True,
                             reversescale = False
                            ),
                opacity=0.6 )]

layout= go.Layout( autosize= True,
                   title= 'Barplots of Mean Feature Importance ( City - Iquitos )',
                   hovermode= 'closest',
                   yaxis=dict( title= 'Feature Importance',
                               ticklen= 5,
                               gridwidth= 2 ),
                   xaxis = dict( automargin= True ),
                   showlegend = False )
fig = go.Figure( data=data, layout = layout )
py.iplot(fig, filename='bar-direct-labels')

# Second-Level Predictions from the First-level Output

**First-level output as new features**

Having now obtained our first-level predictions, one can think of it as essentially building a new set of features to be used as training data for the next regression. As per the code below, we are therefore having as our new columns the first-level predictions from our earlier regressors and we train the next regression on this.

In [22]:
base_predictions_train_sj = pd.DataFrame({'RandomForest':     rf_oof_train_sj.ravel(),
                                          'GradientBoost':    gb_oof_train_sj.ravel(),
                                          'NegativeBinomial': nb_oof_train_sj.ravel(),
                                          'LinearRegression': lr_oof_train_sj.ravel(),
                                         })
print( base_predictions_train_sj.head() )
base_predictions_train_iq = pd.DataFrame({'RandomForest':     rf_oof_train_iq.ravel(),
                                          'GradientBoost':    gb_oof_train_iq.ravel(),
                                          'NegativeBinomial': nb_oof_train_iq.ravel(),
                                          'LinearRegression': lr_oof_train_iq.ravel(),
                                         })
print( base_predictions_train_iq.head())

   RandomForest  GradientBoost  NegativeBinomial  LinearRegression
0     13.690909      12.931940         13.499543         25.193159
1     20.468182      10.624352         13.740438         29.917565
2     17.613636      11.012058         23.060024         37.932809
3     17.331818      14.802308         34.542179         44.788130
4     19.570455      18.301807         38.274167         43.894125
   RandomForest  GradientBoost  NegativeBinomial  LinearRegression
0      4.045455       2.592939          6.774370          4.265719
1      5.636364       4.557562          5.891829         10.005146
2      9.795455       4.852777          5.507552         10.668325
3      4.886364       5.683856          5.715218          9.460051
4      2.977273       3.202901          3.368411          4.559737


In [32]:
base_predictions_train_iq.shape

(516, 4)

**Correlation Heatmap of the Second Level Training set**

In [23]:
data = [ go.Heatmap( z = base_predictions_train_sj.astype( float ).corr().values,
                     x = base_predictions_train_sj.columns.values,
                     y = base_predictions_train_sj.columns.values,
                     colorscale = 'Viridis',
                     showscale = True,
                     reversescale = True )]
py.iplot( data, filename = 'labelled-heatmap' )

In [24]:
data = [ go.Heatmap( z = base_predictions_train_iq.astype( float ).corr().values,
                     x = base_predictions_train_iq.columns.values,
                     y = base_predictions_train_iq.columns.values,
                     colorscale = 'Viridis',
                     showscale = True,
                     reversescale = True )]
py.iplot( data, filename = 'labelled-heatmap' )

In [25]:
x_train_sec_sj = np.concatenate(( rf_oof_train_sj, gb_oof_train_sj, nb_oof_train_sj, lr_oof_train_sj), axis=1)
x_test_sec_sj  = np.concatenate(( rf_oof_test_sj, gb_oof_test_sj, nb_oof_test_sj, lr_oof_test_sj ), axis=1)
x_train_sec_iq = np.concatenate(( rf_oof_train_iq, gb_oof_train_iq, nb_oof_train_iq,lr_oof_train_iq ), axis=1)
x_test_sec_iq  = np.concatenate(( rf_oof_test_iq, gb_oof_test_iq, nb_oof_test_iq, lr_oof_test_iq ), axis=1)

In [33]:
x_train_sec_sj

array([[13.69090909, 12.93194008, 13.49954259, 25.19315917],
       [20.46818182, 10.62435246, 13.74043794, 29.91756516],
       [17.61363636, 11.01205826, 23.06002403, 37.93280938],
       ...,
       [11.64318182,  7.22331619, 10.96450423,  0.64054586],
       [11.70454545,  7.85787249, 11.005549  ,  7.67288334],
       [10.51363636,  7.47007513,  6.33263732, -2.14827749]])

Having now concatenated and joined both the first-level train and test predictions as x_train and x_test, we can now fit a second-level learning model.

### Second level learning model via XGBoost

Here we choose the eXtremely famous library for boosted tree learning model, XGBoost. It was built to optimize large-scale boosted tree algorithms. For further information about the algorithm, check out the [official documentation][1].

  [1]: https://xgboost.readthedocs.io/en/latest/

Anyways, we call an XGBClassifier and fit it to the first-level train and target data and use the learned model to predict the test data as follows:

In [26]:
X = pd.DataFrame( x_train_sec_sj )
y = y_train_sj
parameters_for_testing = { 'learning_rate':[0.01],#0.1,0.01
                           'max_depth':[3,4,5],#,6
                           'n_estimators':[150,200,300,250],#100,200,500,1000
                           'gamma':[0],#0,0.01
                           'min_child_weight':[5,8,12,15],
                           'colsample_bytree':[0.5,0.4,0.6],
                           'reg_alpha':[0.01],
                           'reg_lambda':[1e-5,1e-4],#0.075
                           'subsample':[0.95]#,1e-5
                         }
xgb_model = xgb.XGBRegressor()
gs_xg_sj = GridSearchCV( estimator = xgb_model, param_grid = parameters_for_testing,
                      n_jobs=4,iid=False, verbose=1, scoring ='neg_mean_squared_error',
                      cv= 5).fit( X, y, eval_metric='rmse' )
Y_sj_pred = gs_xg_sj.best_estimator_.predict( pd.DataFrame( x_test_sec_sj, columns = X.columns ) ).astype(int)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   19.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   42.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  2.1min




[Parallel(n_jobs=4)]: Done 1440 out of 1440 | elapsed:  2.4min finished


In [27]:
X = pd.DataFrame( x_train_sec_iq )
y = y_train_iq
parameters_for_testing = { 'learning_rate':[0.1,0.01],
                           'max_depth':[2,3,4,5],
                           'n_estimators':[150,200,250],#100,200,300,500
                           'gamma':[0],#,0.01
                           'min_child_weight':[12,18,15,20],#5,8,
                           'colsample_bytree':[0.4,0.6],
                           'reg_alpha':[0.01],
                           'reg_lambda':[1e-5,1e-4,0.075],
                           'subsample':[0.95]#,1e-5,1e-4
                         }
xgb_model = xgb.XGBRegressor()
gs_xg_iq = GridSearchCV( estimator = xgb_model, param_grid = parameters_for_testing,
                      n_jobs=4,iid=False, verbose=1, scoring ='neg_mean_squared_error',
                      cv= 5).fit( X, y, eval_metric='rmse' )

Y_iq_pred = gs_xg_iq.best_estimator_.predict( pd.DataFrame( x_test_sec_iq, columns = X.columns )).astype(int)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 144 tasks      | elapsed:    4.5s
[Parallel(n_jobs=4)]: Done 744 tasks      | elapsed:   26.3s
[Parallel(n_jobs=4)]: Done 1744 tasks      | elapsed:  1.0min




[Parallel(n_jobs=4)]: Done 2880 out of 2880 | elapsed:  1.9min finished


In [28]:
submission_sj = pd.DataFrame( dengue_test_sj, columns = ['city'])
submission_iq = pd.DataFrame( dengue_test_iq, columns = ['city'])
submission_sj.reset_index( inplace = True)
submission_sj['year'] = submission_sj['week_start_date'].dt.year
submission_sj['weekofyear'] = submission_sj['week_start_date'].dt.weekofyear
submission_iq.reset_index( inplace = True)
submission_iq['year'] = submission_iq['week_start_date'].dt.year
submission_iq['weekofyear'] = submission_iq['week_start_date'].dt.weekofyear
submission = submission_sj.append( submission_iq )
submission.drop( columns = ['week_start_date'], inplace = True )

In [29]:
submit_pred_sj = pd.DataFrame( dengue_test_sj, columns = ['city', 'week_start_date'])
submit_pred_sj.drop( columns=['week_start_date'], inplace = True )
submit_pred_sj.reset_index( inplace = True )
submit_pred_sj['year'] = submit_pred_sj['week_start_date'].dt.year
submit_pred_sj['weekofyear'] = submit_pred_sj['week_start_date'].dt.weekofyear
submit_pred_sj.drop( columns = ['week_start_date'], inplace = True )
submit_pred_sj['total_cases'] = Y_sj_pred

submit_pred_iq = pd.DataFrame( dengue_test_iq, columns = ['city', 'week_start_date'])
submit_pred_iq.drop( columns=['week_start_date'], inplace = True )
submit_pred_iq.reset_index( inplace = True )
submit_pred_iq['year'] = submit_pred_iq['week_start_date'].dt.year
submit_pred_iq['weekofyear'] = submit_pred_iq['week_start_date'].dt.weekofyear
submit_pred_iq.drop( columns = ['week_start_date'], inplace = True )
submit_pred_iq['total_cases'] = Y_iq_pred

submit = submit_pred_sj.append( submit_pred_iq, sort=True )
test = pd.merge( submission, submit, on =['city','year','weekofyear'], how='left')

In [30]:
test.to_csv("data/ensemble_stacking_rf_xgb_nb_linear.csv", index = False)