# ML Ensemble introductory example

In [1]:
import numpy as np

# ML Ensemble
from mlens.ensemble import Ensemble
from mlens.metrics import rmse

# Base Models
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

# Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline

# CV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

In [2]:
# Fix NumPy's global seed so the random draws in this notebook are repeatable
SEED = 3675
np.random.seed(SEED)

## Data

First, we need some dummy data to work with

In [3]:
# training data: 1000 samples with 10 features each, drawn uniformly from [0, 1)
X = np.random.random((1000, 10))

# target, y = x0 * x1 + x2^2 + x3 - x4^(1/4)
# NOTE: no noise term is added here, so y is a deterministic function of the
# first five columns; columns 5-9 do not enter the target at all.
# The exponent is spelled 0.25 so it can never collapse to 0 under integer
# division (as `1 / 4` does in Python 2).
y = X[:, 0] * X[:, 1] + X[:, 2] ** 2 + X[:, 3] - X[:, 4] ** 0.25

# Change scales of the informative features. This is done after y has been
# computed, so the target itself is unaffected; only the inputs the models
# see are put on different scales.
X[:, 0] *= 10
X[:, 1] += 10
X[:, 2] *= 5
X[:, 3] *= 3
X[:, 4] /= 10

# Baseline models

Next, we need a couple of baseline estimators that we want to combine

In [4]:
# One default-configured regressor per model family
ls, rf, kn, sv = (
    Lasso(),
    RandomForestRegressor(),
    KNeighborsRegressor(),
    SVR(),
)

Let's see how they perform on the raw input data

In [5]:
def check_models(names, models, parameters, n_iter=10, cv=10,
                 data_desc='raw input data'):
    """Tune each model with a randomized search and print its best result.

    For every (name, estimator, distributions) triple a ``RandomizedSearchCV``
    is fitted on the notebook-level ``X`` and ``y`` with the ``rmse`` scorer,
    and one line is printed holding the best cross-validated score, the model
    name and the best parameter setting found.

    Parameters
    ----------
    names : sequence of str
        Labels used in the printed report.
    models : sequence of estimators
        Estimators (plain models, pipelines or ensembles) to tune.
    parameters : sequence of dict
        ``param_distributions`` for each estimator, in the same order as
        ``models``.
    n_iter : int, default 10
        Number of parameter settings sampled per model.
    cv : int, default 10
        Cross-validation setting forwarded to ``RandomizedSearchCV``.
    data_desc : str, default 'raw input data'
        Description of the inputs shown in the header line. The default keeps
        the original header; pass another description when the models include
        their own preprocessing, so the header is not misleading.

    Notes
    -----
    The three sequences are combined with ``zip``, so any surplus entries in
    a longer sequence are silently ignored. Nothing is returned; results are
    only printed.
    """
    print('Best cross validated score on %s:' % (data_desc,))
    for name, estimator, distributions in zip(names, models, parameters):
        search = RandomizedSearchCV(estimator,
                                    param_distributions=distributions,
                                    n_iter=n_iter, cv=cv,
                                    scoring=rmse, n_jobs=-1)
        # X and y are the module-level arrays created in the data section
        search.fit(X, y)
        print('%.3f [%s - %r]'
              % (search.best_score_, name, search.best_params_))

In [6]:
# Search distributions for the unscaled baselines.
# scipy's uniform(loc, scale) covers [loc, loc + scale];
# randint(low, high) draws integers from low up to, but excluding, high.
ls_p = dict(alpha=uniform(loc=0.0005, scale=0.005))
rf_p = dict(
    max_depth=randint(2, 5),
    max_features=randint(3, 5),
    min_samples_leaf=randint(2, 10),
)
kn_p = dict(n_neighbors=randint(5, 20))
sv_p = dict(C=uniform(loc=5, scale=15))

check_models(
    ['Lasso', 'Random Forest', 'KNN', 'SVR'],
    (ls, rf, kn, sv),
    (ls_p, rf_p, kn_p, sv_p),
)

Best cross validated score on raw input data:
-0.127 [Lasso - {'alpha': 0.00091227021768499955}]
-0.282 [Random Forest - {'min_samples_leaf': 7, 'max_depth': 3, 'max_features': 4}]
-0.223 [KNN - {'n_neighbors': 8}]
-0.111 [SVR - {'C': 15.456574343107798}]


Let's see if preprocessing the data helps. To avoid data leakage, we need to put our models into pipelines, so that each scaler is fitted only on the training folds during cross-validation.

In [7]:
# Each model is bundled with its own scaler. make_pipeline names the steps
# after the lowercased class names (e.g. 'lasso', 'svr'), which is what the
# '<step>__<param>' keys of a parameter search refer to.
ls = make_pipeline(
    StandardScaler(),
    Lasso(alpha=0.001),
)
rf = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(n_estimators=20, max_depth=3,
                          max_features=0.7, n_jobs=1),
)
kn = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor(n_neighbors=3),
)
# The SVR is the only model paired with min-max scaling
sv = make_pipeline(
    MinMaxScaler(),
    SVR(C=0.5),
)

In [8]:
# Search distributions for the pipelines; keys follow '<step name>__<param>'.
# uniform(loc, scale) covers [loc, loc + scale]; randint excludes its upper bound.
ls_p = dict(lasso__alpha=uniform(loc=0.0005, scale=0.005))
rf_p = dict(
    randomforestregressor__max_depth=randint(2, 5),
    randomforestregressor__max_features=randint(3, 5),
    randomforestregressor__min_samples_leaf=randint(2, 10),
)
kn_p = dict(kneighborsregressor__n_neighbors=randint(1, 30))
sv_p = dict(svr__C=uniform(loc=1, scale=20))

check_models(
    ['Lasso', 'Random Forest', 'KNN', 'SVR'],
    (ls, rf, kn, sv),
    (ls_p, rf_p, kn_p, sv_p),
)

Best cross validated score on raw input data:
-0.123 [Lasso - {'lasso__alpha': 0.00053995960232497984}]
-0.235 [Random Forest - {'randomforestregressor__max_features': 4, 'randomforestregressor__max_depth': 4, 'randomforestregressor__min_samples_leaf': 4}]
-0.205 [KNN - {'kneighborsregressor__n_neighbors': 10}]
-0.058 [SVR - {'svr__C': 20.691855254901064}]


Scaling data helps the Lasso and KNN. The Random Forest is invariant to scale, so there is no real difference in performance.
The SVR does not improve with standard scaling (not shown), but Min-Max scaling helps. 
In all, transforming the data helps. However, fitting models in this way is inefficient, since several models depend on the same transformation process. At the same time, because the models do not all need the same transformation, it is not as easy as fitting all models
on the same preprocessed data. 
Luckily, the ML-Ensemble library allows us to specify separate preprocessing pipelines with associated base estimators, so we can easily handle such a situation.

# Ensemble

To improve our score, we build an estimator on top of these baselines in order to try to learn which model performs well where, and thus leverage the strength of each model across the feature space. The ensemble here has not been optimized to any greater degree: for illustration purposes we'll settle on a configuration somewhat arbitrarily.

In [9]:
# Meta estimator that turns base-estimator predictions into final predictions
meta = SVR()

# Every base pipeline is a tuple (preprocessing, estimators):
#   - preprocessing is an ordered list of transformers
#     (an empty list means the data is passed on untouched)
#   - estimators is a list of estimators; an estimator can be given a name
#     by passing a ('name', estimator) tuple instead of the bare object
standard_scaled = (
    [StandardScaler()],
    [('ls', Lasso()), ('kn', KNeighborsRegressor())],
)
min_max_scaled = (
    [MinMaxScaler()],
    [SVR()],  # objects do not have to be named
)
unprocessed = (
    [],  # no preprocessing required for this branch
    [('rf', RandomForestRegressor())],
)

base_pipelines = {
    'sc': standard_scaled,
    'mm': min_max_scaled,
    'np': unprocessed,
}

ensemble = Ensemble(
    meta,                     # the meta estimator
    base_pipelines,           # the base pipelines
    folds=10,                 # number of folds used for stacking
    shuffle=True,             # whether to shuffle data
    scorer=rmse._score_func,  # to get base estimator test scores
    # joblib can only parallelize the outer loop, which is the grid search,
    # so the ensemble itself is kept single-threaded for now.
    n_jobs=1,
    as_df=True,               # allows us to look at feature importances
)

Having instantiated the ensemble, it is easy to map parameters to specific models and pipelines:

In [10]:
print('Sample of parameters:\n')
# Only the first 16 entries are shown; the full parameter dict is long.
# Slicing replaces the manual counter/break (the old `i += 1` had no effect,
# since enumerate rebinds the index on every iteration).
for key, val in list(ensemble.get_params().items())[:16]:
    print("%r: %r" % (key, val))

Sample of parameters:

'as_df': True
'n_jobs': 1
'sc-kn__n_neighbors': 5
'meta-svr': SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
'scorer': <function rmse_scoring at 0x10e79cea0>
'meta-svr__epsilon': 0.1
'sc-ls__random_state': None
'sc-ls__alpha': 1.0
'np-rf__min_samples_split': 2
'meta-svr__degree': 3
'meta-svr__kernel': 'rbf'
'sc-ls__fit_intercept': True
'sc-kn__n_jobs': 1
'meta-svr__max_iter': -1
'sc-kn': KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')
'sc-standardscaler__copy': True


Hence, we can use the sklearn API to update ingoing parameters in any estimator or transformer:

In [11]:
# Current value of C for the SVR in the 'mm' (min-max scaled) pipeline,
# looked up by its '<pipeline>-<estimator>__<param>' key
ensemble.get_params()['mm-svr__C']

1.0

In [12]:
# We update it using the sklearn API. The key contains a hyphen, so it is not
# a valid keyword name and has to be passed through ** dict unpacking.
ensemble.set_params(**{'mm-svr__C': 5.0})

# And now check the nested list "base_estimators" that is used for fitting
# NOTE(review): index 2 held the 'mm' pipeline in the recorded run; the
# position may depend on dict ordering - confirm before relying on it.
ensemble.base_estimators[2]

('mm',
 [('svr',
   SVR(C=5.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
     kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))])

So, to fit the ensemble, we need to set up the grid search parameter dictionary, which essentially amounts to
concatenating the previous dictionaries we used:

In [13]:
# Search space for the whole ensemble. Keys follow
# '<pipeline>-<estimator>__<param>'; 'meta-svr' addresses the meta estimator.
# uniform(loc, scale) covers [loc, loc + scale]; randint excludes its upper bound.
en_p = {
    'sc-ls__alpha': uniform(loc=0.0005, scale=0.005),
    'np-rf__max_depth': randint(2, 6),
    'np-rf__max_features': randint(2, 5),
    'np-rf__min_samples_leaf': randint(5, 12),
    'sc-kn__n_neighbors': randint(6, 12),
    'mm-svr__C': uniform(loc=10, scale=20),
    'meta-svr__C': uniform(loc=10, scale=20),
}

In [14]:
# Re-run the search for the four pipelines and add the ensemble as a fifth
# candidate, so all scores come from the same search procedure
check_models(['Lasso', 'Random Forest', 'KNN', 'SVR', 'Ensemble'],
             (ls, rf, kn, sv, ensemble), (ls_p, rf_p, kn_p, sv_p, en_p))

Best cross validated score on raw input data:
-0.123 [Lasso - {'lasso__alpha': 0.0015142045314504382}]
-0.236 [Random Forest - {'randomforestregressor__max_features': 4, 'randomforestregressor__max_depth': 4, 'randomforestregressor__min_samples_leaf': 5}]
-0.207 [KNN - {'kneighborsregressor__n_neighbors': 11}]
-0.059 [SVR - {'svr__C': 15.328019931942373}]
-0.053 [Ensemble - {'np-rf__min_samples_leaf': 9, 'np-rf__max_features': 4, 'sc-kn__n_neighbors': 9, 'np-rf__max_depth': 4, 'sc-ls__alpha': 0.0014284293508642438, 'meta-svr__C': 16.626146983723014, 'mm-svr__C': 11.834807428701293}]


The ensemble improves on the best base estimator's score by about 0.006 (from 0.059 for the SVR to 0.053), or roughly 10%. Given the high accuracy the SVR already achieves, this is a fairly large improvement even with our minor parameter tuning.

# Building a final ensemble

With the grid search complete, we have a fair idea of what parameters we want to have. However,
during the grid search we deactivated parallelization to avoid clashing with the grid search. 
Naturally, when we fit the final model, we want to reactivate it.

In [18]:
# Hard-coded parameter values for the final model, plus two run-time settings
final_params = {
    'np-rf__min_samples_leaf': 9,
    'meta-svr__C': 16.626146983723014,
    'sc-kn__n_neighbors': 9,
    'np-rf__max_features': 4,
    'mm-svr__C': 11.834807428701293,
    'sc-ls__alpha': 0.0014284293508642438,
    'np-rf__max_depth': 4,
    'n_jobs': -1,   # switch parallel fitting back on for the final model
    'verbose': 3,   # print progress while fitting
}
ensemble.set_params(**final_params)

# Fit on the first 900 rows only
ensemble.fit(X[:900], y[:900])

Fitting ensemble

> fitting meta estimator
>> preprocessing folds
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.1s finished
>> fitting base estimators
[Parallel(n_jobs=-1)]: Done   5 out of  40 | elapsed:    0.0s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    0.3s finished
>> fitting meta estimator

> fitting base estimators
>> preprocessing data
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished
>> fitting base estimators
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.0s finished

Fit complete | 00:00:01



Ensemble(as_df=True,
     base_pipelines={'np': ([], [('rf', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=4,
           max_features=4, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=9, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
   ...silon=0.1,
  gamma='auto', kernel='rbf', max_iter=-1, shrinking=True, tol=0.001,
  verbose=False)])},
     folds=10,
     meta_estimator=SVR(C=16.626146983723014, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto', kernel='rbf', max_iter=-1, shrinking=True, tol=0.001,
  verbose=False),
     n_jobs=-1, random_state=None,
     scorer=<function rmse_scoring at 0x10e79cea0>, shuffle=True,
     verbose=3)

In [19]:
print('Base Estimator scores:')
# scores_ maps each base estimator's name to its score; print one pair per line
for name_and_score in ensemble.scores_.items():
    print(*name_and_score)

Base Estimator scores:
sc-kn 0.20796201837
np-rf 0.242135163725
sc-ls 0.1238471778
mm-svr 0.0618138749617


In [20]:
# Score the fitted ensemble on the last 100 rows, which were left out of fit().
# NOTE: the parameter search above was run on all 1000 rows, so these rows
# did influence the chosen parameters - this is not a fully untouched test set.
rmse(ensemble, X[900:], y[900:])

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished


-0.050417414288113208