## Stacking Ensemble Prediction Approach

Finally, after the feature engineering, data formatting and individual model analysis, we create a stacking ensemble model using the models below.
    - Linear Regression
    - Negative Binomial
    - Random Forest
    - XGBoost

## Load packages

In [18]:
import numpy as np    # fundamental package for scientific computing
import pandas as pd   # Python Data Analysis Library
from pandas import Series # one-dimensional labeled array capable of holding any data type 
import seaborn as sns # library for making statistical graphics in Python
import os             # operating system dependent functionality, file descriptor..

from sklearn.model_selection import train_test_split             # Split arrays or matrices into random train and test subsets

from sklearn.metrics import mean_absolute_error                  # Mean absolute error regression loss
from sklearn.metrics import median_absolute_error                # Median absolute error regression loss

import warnings
warnings.filterwarnings('ignore')                                # For warning control

In [19]:
# plotly library for visualization
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

%matplotlib inline    
# Line magic command will make plot outputs appear and be stored within the notebook.
import matplotlib.pyplot as plt   # matplotlib's plotting framework

In [20]:
# Going to use these 4 base models for the stacking
from sklearn.linear_model import LinearRegression   # base model for linear regression
from sklearn.linear_model import Lasso              # least absolute shrinkage and selection operator
from sklearn.linear_model import Ridge              # ridge regression
import statsmodels.api as sm                        # Negative binomial regression
import statsmodels.formula.api as smf               # statistical models formula APIs
from sklearn.ensemble import RandomForestRegressor  # A random forest regressor
import xgboost as xgb                               # Gradient Boosting for regression

from sklearn.model_selection import KFold           # K-Folds cross-validator 
from sklearn.model_selection import GridSearchCV    # Grid Search for paramter tuning

In [21]:
# Directory that holds the pickled datasets; list it to see which files exist.
PATH = "./../datasets/"
os.listdir( PATH )

['dengue_test_iq.pkl',
 'dengue_test_sj.pkl',
 'dengue_train_iq.pkl',
 'dengue_train_sj.pkl',
 'test_iq_month_1.pkl',
 'test_iq_month_10.pkl',
 'test_iq_month_11.pkl',
 'test_iq_month_12.pkl',
 'test_iq_month_2.pkl',
 'test_iq_month_3.pkl',
 'test_iq_month_4.pkl',
 'test_iq_month_5.pkl',
 'test_iq_month_6.pkl',
 'test_iq_month_7.pkl',
 'test_iq_month_8.pkl',
 'test_iq_month_9.pkl',
 'test_sj_month_1.pkl',
 'test_sj_month_10.pkl',
 'test_sj_month_11.pkl',
 'test_sj_month_12.pkl',
 'test_sj_month_2.pkl',
 'test_sj_month_3.pkl',
 'test_sj_month_4.pkl',
 'test_sj_month_5.pkl',
 'test_sj_month_6.pkl',
 'test_sj_month_7.pkl',
 'test_sj_month_8.pkl',
 'test_sj_month_9.pkl',
 'train_iq_month_1.pkl',
 'train_iq_month_10.pkl',
 'train_iq_month_11.pkl',
 'train_iq_month_12.pkl',
 'train_iq_month_2.pkl',
 'train_iq_month_3.pkl',
 'train_iq_month_4.pkl',
 'train_iq_month_5.pkl',
 'train_iq_month_6.pkl',
 'train_iq_month_7.pkl',
 'train_iq_month_8.pkl',
 'train_iq_month_9.pkl',
 'train_sj_month_1.pk

In [22]:
# Load the full train and test frames of both cities (sj and iq).
# NOTE(review): read_pickle unpickles the file contents, so only load files
# that come from a trusted source.
train_filename_sj = './../datasets/dengue_train_sj.pkl'
train_filename_iq = './../datasets/dengue_train_iq.pkl'
test_filename_sj  = './../datasets/dengue_test_sj.pkl'
test_filename_iq  = './../datasets/dengue_test_iq.pkl'

dengue_train_sj, dengue_train_iq, dengue_test_sj, dengue_test_iq = [
    pd.read_pickle( pickle_path )
    for pickle_path in ( train_filename_sj, train_filename_iq,
                         test_filename_sj, test_filename_iq )
]

In [23]:
# Class to extend the Sklearn classifier
class SklearnHelper(object):
    """Common train/predict wrapper for the first-level models.

    Two kinds of model are supported:

    * ``NB=False``: ``reg`` is an estimator class that is instantiated with
      ``params`` and exposes ``fit`` / ``predict``.
    * ``NB=True``: a statsmodels GLM with a NegativeBinomial family, built in
      ``train`` from the formula ``y ~ <col[0]> + <col[1]> + ...``.

    Parameters
    ----------
    reg : estimator class (ignored when ``NB`` is true)
    seed : value passed to the estimator as ``random_state`` when it accepts it
    params : dict of keyword arguments for ``reg``; it is not modified
    NB : select the Negative Binomial GLM instead of ``reg``
    col : feature names used on the right-hand side of the GLM formula
    alpha : ``alpha`` argument of the NegativeBinomial family
    """

    def __init__(self, reg, seed=0, params=None, NB=False, col=None, alpha=0.001):
        self.NB = NB
        # A fresh list per instance instead of one shared mutable default.
        self.col = [] if col is None else col
        self.alpha = alpha
        if self.NB:
            # The GLM can only be built once training data is available.
            self.model = None
            return

        base_params = dict(params) if params else {}
        # The seed has to be part of the constructor arguments to take effect;
        # it is merged into a copy so the caller's dict stays untouched.
        try:
            self.model = reg(**dict(base_params, random_state=seed))
        except TypeError:
            # Estimators without a random_state argument (for example a plain
            # linear regression) are built from the given parameters only.
            self.model = reg(**base_params)

    def train(self, x_train, y_train):
        """Fit the wrapped model on ``x_train`` / ``y_train``."""
        if not self.NB:
            self.model.fit(x_train, y_train)
            return

        formula = 'y ~ ' + ' + '.join(str(feature) for feature in self.col)
        # Work on a copy so the target column is never added to the caller's frame.
        train = pd.DataFrame(x_train).copy()
        train['y'] = y_train
        self.model = smf.glm(formula=formula,
                             data=train,
                             family=sm.families.NegativeBinomial(alpha=self.alpha)).fit()

    def predict(self, x):
        """Return the wrapped model's predictions for ``x``."""
        return self.model.predict(x)

    def fit(self, x, y):
        """Fit the wrapped model and return the fitted object."""
        if self.NB:
            # There is no estimator to delegate to before training, so the GLM
            # is (re)built through train().
            self.train(x, y)
            return self.model
        return self.model.fit(x, y)

    def feature_importances(self, x, y):
        """Print the ``feature_importances_`` attribute of the wrapped model."""
        if not self.NB:
            # Refits on (x, y) before reading the attribute.
            print(self.model.fit(x, y).feature_importances_)
        else:
            # NOTE(review): this reads feature_importances_ from the fitted GLM
            # result; confirm that attribute exists for statsmodels results.
            print(self.model.feature_importances_)

### Out-of-Fold Predictions

Stacking uses the predictions of the base regressors as the training input for a second-level model.

In [24]:
def get_OutOfFoldPredictions( model, x_train, y_train, x_test, cv=None ):
    """Compute out-of-fold predictions of ``model`` for stacking.

    For every (train, held-out) split the model is retrained on the training
    rows, predicts the held-out rows of ``x_train`` and predicts all of
    ``x_test``.

    Parameters
    ----------
    model : object with ``train(x, y)`` and ``predict(x)``
    x_train, y_train : objects supporting ``.iloc`` row selection
    x_test : rows predicted by the model of every fold
    cv : optional splitter with a ``split(x)`` method; when omitted the
        module-level ``kf`` is used

    Returns
    -------
    (oof_train, oof_test) : column vectors; ``oof_train`` holds the held-out
        prediction of each training row, ``oof_test`` the test predictions
        averaged over the folds.
    """
    splitter = kf if cv is None else cv
    # Sizes come from the inputs, so a stale module-level ntrain / ntest cannot
    # produce wrongly shaped buffers.
    oof_train = np.zeros((x_train.shape[0],))
    fold_test_predictions = []

    for train_index, test_index in splitter.split(x_train):
        x_tr = x_train.iloc[train_index]
        y_tr = y_train.iloc[train_index]
        x_te = x_train.iloc[test_index]
        model.train(x_tr, y_tr)
        oof_train[test_index] = model.predict(x_te)
        fold_test_predictions.append(np.asarray(model.predict(x_test), dtype=float))

    # One row per fold; the test prediction is the mean over the folds.
    oof_test = np.mean(fold_test_predictions, axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

# Generating Base First-Level Models 

So now let us prepare four learning models as our first-level regressors.
- The models are listed as follows:
    - Linear Regression
    - Negative Binomial
    - Random Forest
    - XGBoost

In [25]:
# Hyper-parameters of the first-level models, one set per city (sj / iq).

# Random Forest
rf_params_sj = dict(
    n_estimators=220,
    max_depth=4,
    criterion='mae',
    min_samples_split=10,
    max_features='sqrt',
    bootstrap=True,
    verbose=0,
)
# Same settings as San Juan except for the forest size and the split threshold.
rf_params_iq = dict(rf_params_sj, n_estimators=22, min_samples_split=16)

# XGBoost
xgb_params_sj = dict(
    learning_rate=0.01,
    n_estimators=225,
    max_depth=5,
    min_child_weight=12,
    gamma=0,
    subsample=0.95,
    colsample_bytree=0.5,
    reg_alpha=0.01,
    reg_lambda=1e-5,
    objective='reg:squarederror',
    verbose=0,
)
xgb_params_iq = dict(
    learning_rate=0.1,
    n_estimators=17,
    max_depth=8,
    min_child_weight=5,
    gamma=0.01,
    subsample=0.75,
    colsample_bytree=0.6,
    reg_alpha=0.75,
    reg_lambda=0.75,
    objective='reg:squarederror',
    verbose=0,
)

# Linear models: San Juan uses an L1 (lasso) penalty with alpha = 2,
# Iquitos takes no extra parameters.
l1_params_sj = dict(alpha=2)
lr_params_iq = dict()



In [26]:
# Feature subsets used by the formula / linear models.
# col_RFE_*: RFE-selected features from Linear Regression.
# col_FFS_iq: a second subset for Iquitos (presumably forward feature
# selection, going by the name; confirm).
col_RFE_sj = [
    'ndvi_ne',
    'ndvi_nw',
    'ndvi_se',
    'ndvi_sw',
    'reanalysis_air_temp_c',
    'reanalysis_avg_temp_c',
    'reanalysis_dew_point_temp_c',
    'reanalysis_relative_humidity_percent',
    'reanalysis_specific_humidity_g_per_kg',
    'ndvi_mean',
    'station_avg_temp_c_lagVar',
    'reanalysis_max_air_temp_c_lagSum',
    'reanalysis_air_temp_c_lagSum',
    'reanalysis_air_temp_c_lagMean',
    'reanalysis_air_temp_c_lagVar',
    'reanalysis_avg_temp_c_lagSum',
    'reanalysis_avg_temp_c_lagMean',
    'reanalysis_avg_temp_c_lagVar',
    'ndvi_ne_lagSum',
    'ndvi_ne_lagMean',
    'ndvi_ne_lagVar',
    'reanalysis_tdtr_c_lagSum',
    'reanalysis_tdtr_c_lagVar',
    'ndvi_mean_lagVar',
    'ndvi_nw_lagSum',
    'ndvi_nw_lagVar',
    'ndvi_se_lagSum',
    'ndvi_se_lagMean',
    'ndvi_se_lagVar',
    'ndvi_sw_lagSum',
    'ndvi_sw_lagMean',
    'ndvi_sw_lagVar',
]
col_RFE_iq = [
    'ndvi_nw',
    'ndvi_sw',
    'reanalysis_dew_point_temp_c',
    'reanalysis_specific_humidity_g_per_kg',
    'reanalysis_specific_humidity_g_per_kg_lagVar',
    'reanalysis_dew_point_temp_c_lagVar',
    'ndvi_se_lagVar',
    'ndvi_sw_lagVar',
    'ndvi_ne_lagVar',
    'ndvi_nw_lagVar',
]
col_FFS_iq = [
    'reanalysis_tdtr_c_lagVar',
    'reanalysis_specific_humidity_g_per_kg_lagMean',
    'reanalysis_precip_amt_kg_per_m2_lagVar',
    'ndvi_se_lagVar',
    'ndvi_ne_lagVar',
    'reanalysis_precip_amt_kg_per_m2_lagMean',
    'station_min_temp_c',
]

In [27]:
SEED   = 1   # seed handed to the base models for reproducibility
NFOLDS = 10  # number of folds for the out-of-fold predictions
# shuffle keeps its default (False), so the folds are contiguous and already
# deterministic. random_state only has an effect together with shuffle=True,
# and newer scikit-learn releases raise an error when it is set without
# shuffling, so it is not passed here.
kf = KFold( n_splits = NFOLDS )

In [28]:
# One SklearnHelper per base model and city. The Negative Binomial helpers get
# no estimator class; they build their GLM from the listed columns on training.

# San Juan
rf_sj = SklearnHelper( RandomForestRegressor, SEED, rf_params_sj )
gb_sj = SklearnHelper( xgb.XGBRegressor, SEED, xgb_params_sj )
nb_sj = SklearnHelper( None, SEED, None, NB = True, col = col_RFE_sj, alpha = 1.18 )
lr_sj = SklearnHelper( Lasso, SEED, l1_params_sj )

# Iquitos
rf_iq = SklearnHelper( RandomForestRegressor, SEED, rf_params_iq )
gb_iq = SklearnHelper( xgb.XGBRegressor, SEED, xgb_params_iq )
nb_iq = SklearnHelper( None, SEED, None, NB = True, col = col_RFE_iq, alpha = 0.001 )
lr_iq = SklearnHelper( LinearRegression, SEED, lr_params_iq )


In [70]:
# Month-wise out-of-fold predictions for San Juan.
# Each base model is run on every monthly file; the per-month results are
# collected and joined into one flat array per model once all months are done.
# Entries are (tag, model, feature subset); None means "use all features".
base_models_sj = ( ( 'rf', rf_sj, None ),         # Random Forest
                   ( 'gb', gb_sj, None ),         # Gradient Boost
                   ( 'nb', nb_sj, col_RFE_sj ),   # Negative Binomial
                   ( 'lr', lr_sj, col_RFE_sj ) )  # Linear Regression
train_chunks_sj  = { tag: [] for tag, _, _ in base_models_sj }
test_chunks_sj   = { tag: [] for tag, _, _ in base_models_sj }
target_chunks_sj = []

for month in range( 1, 13 ):
    dengue_train_sj_month = pd.read_pickle( './../datasets/train_sj_month_{}.pkl'.format( month ) )
    dengue_test_sj_month  = pd.read_pickle( './../datasets/test_sj_month_{}.pkl'.format( month ) )

    y_train_sj_m = dengue_train_sj_month.total_cases
    x_train_sj = dengue_train_sj_month.drop( columns = ['city','year','month','total_cases'] )
    # The test frame is restricted to exactly the training columns.
    x_test_sj = pd.DataFrame( dengue_test_sj_month, columns = x_train_sj.columns )

    # Kept as module-level names because get_OutOfFoldPredictions may read
    # the row counts as globals.
    ntrain = x_train_sj.shape[0]
    ntest = x_test_sj.shape[0]

    for tag, model, feature_subset in base_models_sj:
        if feature_subset is None:
            month_train, month_test = get_OutOfFoldPredictions(
                model, x_train_sj, y_train_sj_m, x_test_sj )
        else:
            month_train, month_test = get_OutOfFoldPredictions(
                model, x_train_sj[feature_subset], y_train_sj_m, x_test_sj[feature_subset] )
        train_chunks_sj[tag].append( month_train )
        test_chunks_sj[tag].append( month_test )
    target_chunks_sj.append( y_train_sj_m )

# Flatten and join the monthly pieces; the leading empty float array keeps the
# result a float array, as repeated np.append onto np.array([]) would.
rf_oof_train_sj = np.concatenate( [ np.array([]) ] + [ np.ravel( c ) for c in train_chunks_sj['rf'] ] )
rf_oof_test_sj  = np.concatenate( [ np.array([]) ] + [ np.ravel( c ) for c in test_chunks_sj['rf'] ] )
gb_oof_train_sj = np.concatenate( [ np.array([]) ] + [ np.ravel( c ) for c in train_chunks_sj['gb'] ] )
gb_oof_test_sj  = np.concatenate( [ np.array([]) ] + [ np.ravel( c ) for c in test_chunks_sj['gb'] ] )
nb_oof_train_sj = np.concatenate( [ np.array([]) ] + [ np.ravel( c ) for c in train_chunks_sj['nb'] ] )
nb_oof_test_sj  = np.concatenate( [ np.array([]) ] + [ np.ravel( c ) for c in test_chunks_sj['nb'] ] )
lr_oof_train_sj = np.concatenate( [ np.array([]) ] + [ np.ravel( c ) for c in train_chunks_sj['lr'] ] )
lr_oof_test_sj  = np.concatenate( [ np.array([]) ] + [ np.ravel( c ) for c in test_chunks_sj['lr'] ] )
y_train_sj      = np.concatenate( [ np.array([]) ] + [ np.ravel( c ) for c in target_chunks_sj ] )

In [71]:
# Month-wise out-of-fold predictions for Iquitos.
# Each base model is run on every monthly file; the per-month results are
# collected and joined into one flat array per model once all months are done.
# Entries are (tag, model, feature subset); None means "use all features".
base_models_iq = ( ( 'rf', rf_iq, None ),         # Random Forest
                   ( 'gb', gb_iq, None ),         # Gradient Boost
                   ( 'nb', nb_iq, col_RFE_iq ),   # Negative Binomial
                   ( 'lr', lr_iq, col_FFS_iq ) )  # Linear Regression
train_chunks_iq  = { tag: [] for tag, _, _ in base_models_iq }
test_chunks_iq   = { tag: [] for tag, _, _ in base_models_iq }
target_chunks_iq = []

for month in range( 1, 13 ):
    dengue_train_iq_month = pd.read_pickle( './../datasets/train_iq_month_{}.pkl'.format( month ) )
    dengue_test_iq_month  = pd.read_pickle( './../datasets/test_iq_month_{}.pkl'.format( month ) )

    y_train_iq_m = dengue_train_iq_month.total_cases
    x_train_iq = dengue_train_iq_month.drop( columns = ['city','year','month','total_cases'] )
    # The test frame is restricted to exactly the training columns.
    x_test_iq = pd.DataFrame( dengue_test_iq_month, columns = x_train_iq.columns )

    # Kept as module-level names because get_OutOfFoldPredictions may read
    # the row counts as globals.
    ntrain = x_train_iq.shape[0]
    ntest = x_test_iq.shape[0]

    for tag, model, feature_subset in base_models_iq:
        if feature_subset is None:
            month_train, month_test = get_OutOfFoldPredictions(
                model, x_train_iq, y_train_iq_m, x_test_iq )
        else:
            month_train, month_test = get_OutOfFoldPredictions(
                model, x_train_iq[feature_subset], y_train_iq_m, x_test_iq[feature_subset] )
        train_chunks_iq[tag].append( month_train )
        test_chunks_iq[tag].append( month_test )
    target_chunks_iq.append( y_train_iq_m )

# Flatten and join the monthly pieces; the leading empty float array keeps the
# result a float array, as repeated np.append onto np.array([]) would.
rf_oof_train_iq = np.concatenate( [ np.array([]) ] + [ np.ravel( c ) for c in train_chunks_iq['rf'] ] )
rf_oof_test_iq  = np.concatenate( [ np.array([]) ] + [ np.ravel( c ) for c in test_chunks_iq['rf'] ] )
gb_oof_train_iq = np.concatenate( [ np.array([]) ] + [ np.ravel( c ) for c in train_chunks_iq['gb'] ] )
gb_oof_test_iq  = np.concatenate( [ np.array([]) ] + [ np.ravel( c ) for c in test_chunks_iq['gb'] ] )
nb_oof_train_iq = np.concatenate( [ np.array([]) ] + [ np.ravel( c ) for c in train_chunks_iq['nb'] ] )
nb_oof_test_iq  = np.concatenate( [ np.array([]) ] + [ np.ravel( c ) for c in test_chunks_iq['nb'] ] )
lr_oof_train_iq = np.concatenate( [ np.array([]) ] + [ np.ravel( c ) for c in train_chunks_iq['lr'] ] )
lr_oof_test_iq  = np.concatenate( [ np.array([]) ] + [ np.ravel( c ) for c in test_chunks_iq['lr'] ] )
y_train_iq      = np.concatenate( [ np.array([]) ] + [ np.ravel( c ) for c in target_chunks_iq ] )

# Second-Level Predictions from the First-level Output

**First-level output as new features**

Having now obtained our first-level predictions, one can think of it as essentially building a new set of features to be used as training data for the next regression. As per the code below, we are therefore having as our new columns the first-level predictions from our earlier regressors and we train the next regression on this.

In [111]:
# Second-level feature frames: one column per base model, in a fixed order.
model_labels = ['RandomForest', 'GradientBoost', 'NegativeBinomial', 'LinearRegression']

base_predictions_train_sj = pd.DataFrame( dict( zip( model_labels, (
    rf_oof_train_sj.ravel(), gb_oof_train_sj.ravel(),
    nb_oof_train_sj.ravel(), lr_oof_train_sj.ravel() ) ) ) )
print( base_predictions_train_sj.head() )

base_predictions_train_iq = pd.DataFrame( dict( zip( model_labels, (
    rf_oof_train_iq.ravel(), gb_oof_train_iq.ravel(),
    nb_oof_train_iq.ravel(), lr_oof_train_iq.ravel() ) ) ) )
print( base_predictions_train_iq.head())

base_predictions_test_sj = pd.DataFrame( dict( zip( model_labels, (
    rf_oof_test_sj.ravel(), gb_oof_test_sj.ravel(),
    nb_oof_test_sj.ravel(), lr_oof_test_sj.ravel() ) ) ) )
print( base_predictions_test_sj.head() )

base_predictions_test_iq = pd.DataFrame( dict( zip( model_labels, (
    rf_oof_test_iq.ravel(), gb_oof_test_iq.ravel(),
    nb_oof_test_iq.ravel(), lr_oof_test_iq.ravel() ) ) ) )
print( base_predictions_test_iq.head())

   RandomForest  GradientBoost  NegativeBinomial  LinearRegression
0     34.052273      29.743795         56.734418         29.754986
1     30.811364      25.842989         66.486865         36.788830
2     31.663636      26.968149         45.285490         37.443580
3     30.818182      25.467220         31.016681         23.613596
4     31.361364      27.874380         25.114499         25.279974
   RandomForest  GradientBoost  NegativeBinomial  LinearRegression
0     18.545455      15.255092          7.799189         41.623884
1     18.136364      15.255092         10.697936         45.381529
2     20.181818      18.691101          7.435928         41.298359
3     27.863636      21.634377          8.165964         41.541228
4     22.090909      19.177652         31.420405         35.738764
   RandomForest  GradientBoost  NegativeBinomial  LinearRegression
0     27.017500      25.355682          6.232939         26.396776
1     25.692500      25.339326         12.277291         27.30

**Correlation Heatmap of the Second Level Training set**

In [73]:
# Pairwise correlation between the San Juan first-level training predictions.
corr_sj = base_predictions_train_sj.astype( float ).corr()
labels_sj = base_predictions_train_sj.columns.values
data = [ go.Heatmap( z = corr_sj.values,
                     x = labels_sj,
                     y = labels_sj,
                     colorscale = 'Viridis',
                     showscale = True,
                     reversescale = True ) ]
py.iplot( data, filename = 'labelled-heatmap' )

In [74]:
# Pairwise correlation between the Iquitos first-level training predictions.
corr_iq = base_predictions_train_iq.astype( float ).corr()
labels_iq = base_predictions_train_iq.columns.values
data = [ go.Heatmap( z = corr_iq.values,
                     x = labels_iq,
                     y = labels_iq,
                     colorscale = 'Viridis',
                     showscale = True,
                     reversescale = True ) ]
py.iplot( data, filename = 'labelled-heatmap' )

In [106]:
# Plain arrays of the first-level predictions: the second-level inputs.
x_train_sec_sj, x_test_sec_sj = ( np.array( base_predictions_train_sj ),
                                  np.array( base_predictions_test_sj ) )
x_train_sec_iq, x_test_sec_iq = ( np.array( base_predictions_train_iq ),
                                  np.array( base_predictions_test_iq ) )

Having now concatenated and joined both the first-level train and test predictions as x_train and x_test, we can now fit a second-level learning model.

### Second level learning model via XGBoost

Here we choose the eXtremely famous library for boosted tree learning model, XGBoost. It was built to optimize large-scale boosted tree algorithms. For further information about the algorithm, check out the [official documentation][1].

  [1]: https://xgboost.readthedocs.io/en/latest/

Anyway, we tune an XGBRegressor with a grid search, fit it to the first-level train and target data, and use the best model found to predict the test data as follows:

In [91]:
# Second-level model for San Juan: grid-search an XGBoost regressor on the
# first-level predictions, then predict the test rows with the best estimator.
X = pd.DataFrame( x_train_sec_sj )
y = y_train_sj
# Trailing comments list other candidate values noted alongside each entry.
parameters_for_testing = dict(
    learning_rate    = [0.01],              # 0.1,0.01
    max_depth        = [3,4,5],             # ,6
    n_estimators     = [150,200,300,250],   # 100,200,500,1000
    gamma            = [0],                 # 0,0.01
    min_child_weight = [5,8,12,15],
    colsample_bytree = [0.5,0.4,0.6],
    reg_alpha        = [0.01],
    reg_lambda       = [1e-5,1e-4],         # 0.075
    subsample        = [0.95],              # ,1e-5
)
xgb_model = xgb.XGBRegressor()
gs_xg_sj = GridSearchCV( estimator = xgb_model,
                         param_grid = parameters_for_testing,
                         scoring = 'neg_mean_squared_error',
                         cv = 5,
                         n_jobs = 4,
                         iid = False,
                         verbose = 1 )
gs_xg_sj.fit( X, y, eval_metric = 'rmse' )
# The test frame reuses X's column labels; predictions are cast to integers.
second_level_test_sj = pd.DataFrame( x_test_sec_sj, columns = X.columns )
Y_sj_pred = gs_xg_sj.best_estimator_.predict( second_level_test_sj ).astype(int)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    6.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   14.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   30.9s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   51.7s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  1.3min




[Parallel(n_jobs=4)]: Done 1440 out of 1440 | elapsed:  1.5min finished


In [None]:
# Second-level model for Iquitos: grid-search an XGBoost regressor on the
# first-level predictions, then predict the test rows with the best estimator.
X = pd.DataFrame( x_train_sec_iq )
y = y_train_iq
# Trailing comments list other candidate values noted alongside each entry.
parameters_for_testing = dict(
    learning_rate    = [0.1,0.01],
    max_depth        = [2,3,4,5],
    n_estimators     = [150,200,250],       # 100,200,300,500
    gamma            = [0],                 # ,0.01
    min_child_weight = [12,18,15,20],       # 5,8,
    colsample_bytree = [0.4,0.6],
    reg_alpha        = [0.01],
    reg_lambda       = [1e-5,1e-4,0.075],
    subsample        = [0.95],              # ,1e-5,1e-4
)
xgb_model = xgb.XGBRegressor()
gs_xg_iq = GridSearchCV( estimator = xgb_model,
                         param_grid = parameters_for_testing,
                         scoring = 'neg_mean_squared_error',
                         cv = 5,
                         n_jobs = 4,
                         iid = False,
                         verbose = 1 )
gs_xg_iq.fit( X, y, eval_metric = 'rmse' )

# The test frame reuses X's column labels; predictions are cast to integers.
second_level_test_iq = pd.DataFrame( x_test_sec_iq, columns = X.columns )
Y_iq_pred = gs_xg_iq.best_estimator_.predict( second_level_test_iq ).astype(int)

In [108]:
# Random Forest as Last model
# X = pd.DataFrame( x_train_sec_sj )
# y = y_train_sj
# param_grid = { 
#     "n_estimators"      : [160,220,150],
#     "max_features"      : ["sqrt"],
#     "min_samples_split" : [10,12,18],
#     "bootstrap"         : [True],
#     "max_depth"         : [2,4,5,8]
#     }
# estimator = RandomForestRegressor( criterion='mae', oob_score=True)
# rf_est = GridSearchCV(estimator, param_grid,
#                       n_jobs=-1, cv=5, verbose=0,
#                       iid=True ).fit( X, y)
# Y_sj_pred = rf_est.best_estimator_.predict(pd.DataFrame( x_test_sec_sj, columns = X.columns )).astype(int)
# 
# X = pd.DataFrame( x_train_sec_iq )
# y = y_train_iq
# param_grid = { 
#     "n_estimators"      : [160,220,100],
#     "max_features"      : ["sqrt"],
#     "min_samples_split" : [10,16,20],
#     "bootstrap"         : [True],
#     "max_depth"         : [2,4,5,8]
#     }
# estimator = RandomForestRegressor( criterion='mae', oob_score=True)
# rf_est = GridSearchCV(estimator, param_grid,
#                       n_jobs=-1, cv=5, verbose=0, iid=True).fit( X, y)
# Y_iq_pred = rf_est.best_estimator_.predict(pd.DataFrame( x_test_sec_iq, columns = X.columns )).astype(int)

In [78]:
# Skeleton of the submission: one (city, year, weekofyear) row per test week,
# San Juan rows first, then Iquitos.
submission_sj = pd.DataFrame( dengue_test_sj, columns = ['city'] )
submission_iq = pd.DataFrame( dengue_test_iq, columns = ['city'] )
for city_frame in ( submission_sj, submission_iq ):
    # reset_index exposes the index as a column; the code then reads it back
    # as 'week_start_date' to derive the calendar fields.
    city_frame.reset_index( inplace = True )
    week_start = city_frame['week_start_date']
    city_frame['year'] = week_start.dt.year
    city_frame['weekofyear'] = week_start.dt.weekofyear
submission = submission_sj.append( submission_iq )
submission.drop( columns = ['week_start_date'], inplace = True )

In [109]:
def _monthly_test_keys( city_code ):
    """Return the (city, year, weekofyear) rows of a city's month-wise test files.

    The files for months 1..12 are read in that order and stacked, which is
    the same order in which the month-wise predictions were produced, so a
    prediction vector can be attached to the result row by row.
    """
    monthly_frames = []
    for month in range( 1, 13 ):
        test_filename = './../datasets/test_' + city_code + '_month_' + str( month ) + '.pkl'
        month_frame = pd.read_pickle( test_filename )
        monthly_frames.append( pd.DataFrame( month_frame, columns = ['city', 'week_start_date'] ) )
    keys = pd.concat( monthly_frames )
    # Drop the 'week_start_date' column selected above, then expose the index
    # as a column; the calendar fields are read from that 'week_start_date'
    # (presumably the frames are indexed by week_start_date; confirm).
    keys.drop( columns = ['week_start_date'], inplace = True )
    keys.reset_index( inplace = True )
    keys['year'] = keys['week_start_date'].dt.year
    keys['weekofyear'] = keys['week_start_date'].dt.weekofyear
    keys.drop( columns = ['week_start_date'], inplace = True )
    return keys


# Both cities use the month-ordered keys. The Iquitos frame must not be rebuilt
# from dengue_test_iq: its row order differs from the month-wise order of
# Y_iq_pred, which would attach predictions to the wrong weeks.
submit_pred_sj = _monthly_test_keys( 'sj' )
submit_pred_sj['total_cases'] = Y_sj_pred

submit_pred_iq = _monthly_test_keys( 'iq' )
submit_pred_iq['total_cases'] = Y_iq_pred

submit = pd.concat( [ submit_pred_sj, submit_pred_iq ], sort = True )
# Left merge keeps the row order of `submission`.
# NOTE(review): assumes (city, year, weekofyear) identifies a row uniquely;
# duplicate keys would multiply rows. Verify against the data.
test = pd.merge( submission, submit, on =['city','year','weekofyear'], how='left')

In [88]:
# Write the merged predictions to the submission CSV, without the index.
submission_csv = "data/ensemble_stacking_rf_xgb_nb_linear_monthwise.csv"
test.to_csv( submission_csv, index = False )