# Multi-model with grid search

In [None]:
import logging

import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, ClassifierMixin

from sklearn.decomposition import PCA

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Emit INFO-and-above log records through basicConfig's default handler,
# stamped with time and level
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                    level=logging.INFO)

# Raise both pandas display limits (rows and columns) to 500
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [None]:
class EstimatorSelectionHelper(BaseEstimator):
    """Run one ``GridSearchCV`` per estimator and tabulate all the results.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted estimator.
    params : dict
        Maps the same names to the parameter grid handed to ``GridSearchCV``
        for that estimator. Every key of ``models`` must also be a key here.

    Raises
    ------
    ValueError
        If at least one key of ``models`` has no entry in ``params``.
    """

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        # Fitted GridSearchCV objects, keyed by estimator name (filled by fit)
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        """Fit a ``GridSearchCV`` for every registered estimator.

        ``cv``, ``n_jobs``, ``verbose``, ``scoring`` and ``refit`` are
        forwarded unchanged to each ``GridSearchCV``; train scores are always
        requested. The fitted searches are stored in ``self.grid_searches``.

        Returns
        -------
        self
            Returned so the call can be chained, as scikit-learn estimators do.
        """
        for key in self.keys:
            print("Running GridSearchCV for '%s'." % key)
            gs = GridSearchCV(self.models[key], self.params[key], cv=cv,
                              n_jobs=n_jobs, verbose=verbose, scoring=scoring,
                              refit=refit, return_train_score=True)
            gs.fit(X, y)
            self.grid_searches[key] = gs
        return self

    def score_summary(self, sort_by='mean_score'):
        """Return one row per (estimator, parameter setting) with its scores.

        Parameters
        ----------
        sort_by : str
            Column used to sort the table in descending order.

        Returns
        -------
        pandas.DataFrame
            Columns ``estimator``, ``min_score``, ``mean_score``,
            ``max_score`` and ``std_score`` (computed over the per-split test
            scores), followed by one column per searched parameter.

        Raises
        ------
        ValueError
            If no grid search has been fitted yet.
        """
        if not self.grid_searches:
            raise ValueError("No grid search results available; call fit() first.")

        def row(key, scores, params):
            d = {
                 'estimator'  : key,
                 'min_score'  : min(scores),
                 'max_score'  : max(scores),
                 'mean_score' : np.mean(scores),
                 'std_score'  : np.std(scores),
            }
            return pd.Series({**params, **d})

        rows = []
        for k, gs in self.grid_searches.items():
            print(k)
            results = gs.cv_results_
            params = results['params']
            # Use the fitted split count: ``gs.cv`` may be None or a splitter
            # object, neither of which can be passed to range().
            all_scores = np.column_stack([
                results["split{}_test_score".format(i)]
                for i in range(gs.n_splits_)
            ])
            # all_scores has one row per parameter setting, one column per split
            rows.extend(row(k, s, p) for p, s in zip(params, all_scores))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

In [None]:
# Fetch the breast-cancer dataset, then unpack its features and labels
breast_cancer = datasets.load_breast_cancer()
X_cancer, y_cancer = breast_cancer.data, breast_cancer.target

In [None]:
# Report the shape of the feature array, then of the target array
for cancer_array in (X_cancer, y_cancer):
    print(cancer_array.shape)

In [None]:
# Candidate classifiers, keyed by the name used to look up their grid
models1 = dict(
    ExtraTreesClassifier=ExtraTreesClassifier(),
    RandomForestClassifier=RandomForestClassifier(),
    AdaBoostClassifier=AdaBoostClassifier(),
    GradientBoostingClassifier=GradientBoostingClassifier(),
    SVC=SVC(),
)

# One parameter grid per candidate; SVC gets two sub-grids, one per kernel
params1 = dict(
    ExtraTreesClassifier=dict(n_estimators=[16, 32]),
    RandomForestClassifier=dict(n_estimators=[16, 32]),
    AdaBoostClassifier=dict(n_estimators=[16, 32]),
    GradientBoostingClassifier=dict(n_estimators=[16, 32],
                                    learning_rate=[0.8, 1.0]),
    SVC=[
        dict(kernel=['linear'], C=[1, 10]),
        dict(kernel=['rbf'], C=[1, 10], gamma=[0.001, 0.0001]),
    ],
)

In [None]:
# Grid-search every candidate on the breast-cancer data, scored by F1,
# using two parallel jobs
helper1 = EstimatorSelectionHelper(models=models1, params=params1)
helper1.fit(X_cancer, y_cancer, n_jobs=2, scoring='f1')

In [None]:
# Show the results table ordered by each setting's best split score
helper1.score_summary('max_score')

In [None]:
# Put PCA in front of a fresh multi-model helper, then fit the whole chain
pca = PCA()
helper1 = EstimatorSelectionHelper(models=models1, params=params1)
flow = Pipeline(steps=[('pca', pca), ('selector', helper1)])

flow.fit(X_cancer, y_cancer)

In [None]:
# Show the results table ordered by each setting's best split score
helper1.score_summary('max_score')

# pipelinehelper

## Boston house prices (regression)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn import datasets
from pipelinehelper import PipelineHelper

# Pass return_X_y by keyword: scikit-learn deprecated the positional form
X, y = datasets.load_boston(return_X_y=True)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Plot the regression target values against their sample index
plt.plot(y)

In [None]:
# Two-step pipeline in which each step is a PipelineHelper holding several
# interchangeable models; the grid search below picks one per step.
pipe = Pipeline([
    ('scaler', PipelineHelper([
        ('std', StandardScaler()),
        ('max', MaxAbsScaler()),
    ], include_bypass=True)), # this will produce one setting without scaler
    ('regressor', PipelineHelper([
        ('rf', RandomForestRegressor()),
        ('ada', AdaBoostRegressor()),
        ('gb', GradientBoostingRegressor()),
    ])),
])

# Search space: generate() expands the "<model>__<param>" entries into the
# values tried for each step's ``selected_model`` parameter.
params = {
    'scaler__selected_model': pipe.named_steps['scaler'].generate({
        'std__with_mean': [True, False],
        'std__with_std': [True, False],
        # no params for 'max' leads to using standard params
    }),
    'regressor__selected_model': pipe.named_steps['regressor'].generate({

        'rf__n_estimators': [10, 20],

        'ada__n_estimators': [10, 20],

        'gb__n_estimators': [10, 20],
        #'gb__criterion': ['friedman_mse', 'mse', 'mae'],
        #'gb__max_features': ['auto', 'sqrt', None],

    })
}
grid = GridSearchCV(pipe, params, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1, return_train_score=True)
grid.fit(X, y)
print(grid.best_params_)
print(grid.best_score_)
# The final step holds regressors, which provide predict() rather than
# decision_function(); use predict() to show the fitted model's outputs.
print(grid.best_estimator_.predict(X))

In [None]:
# Tabulate every candidate, best mean test score first
pd.DataFrame(grid.cv_results_).sort_values(by='mean_test_score', ascending=False)

## Iris (classification)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from pipelinehelper import PipelineHelper

# Pass return_X_y by keyword: scikit-learn deprecated the positional form
X, y = datasets.load_iris(return_X_y=True)

# Two-step pipeline in which each step is a PipelineHelper holding several
# interchangeable models; the grid search below picks one per step.
pipe = Pipeline([
    ('scaler', PipelineHelper([
        ('std', StandardScaler()),
        ('max', MaxAbsScaler()),
    ], include_bypass=True)), # this will produce one setting without scaler
    ('classifier', PipelineHelper([
        ('svm', SVC()),
        ('rf', RandomForestClassifier()),
        ('ada', AdaBoostClassifier()),
        ('gb', GradientBoostingClassifier()),
        ('knn', KNeighborsClassifier()),

        ('nb_pipe', Pipeline([
            # Naive Bayes needs positive numbers
            ('scaler', MinMaxScaler()),
            ('nb', MultinomialNB())
        ])),
    ])),
])

# Search space: generate() expands the "<model>__<param>" entries into the
# values tried for each step's ``selected_model`` parameter.
params = {
    'scaler__selected_model': pipe.named_steps['scaler'].generate({
        'std__with_mean': [True, False],
        'std__with_std': [True, False],
        # no params for 'max' leads to using standard params
    }),
    'classifier__selected_model': pipe.named_steps['classifier'].generate({

        'svm__C': [0.1, 1.0],
        'svm__kernel': ['linear', 'rbf'],

        'rf__n_estimators': [10, 20, 50, 100, 150],
        'rf__max_features': ['auto', 'sqrt', 'log2'],
        'rf__min_samples_split' : [2, 5, 10],
        'rf__min_samples_leaf' : [1, 2, 4],
        'rf__bootstrap': [True, False],

        'ada__n_estimators': [10, 20, 40, 100],
        'ada__algorithm': ['SAMME', 'SAMME.R'],

        'gb__n_estimators': [10, 20, 50, 100],
        'gb__criterion': ['friedman_mse', 'mse', 'mae'],
        'gb__max_features': ['auto', 'sqrt', None],

        'knn__n_neighbors': [2, 3, 5, 7, 10],
        'knn__leaf_size':[1,2,3,5],
        'knn__weights': ['uniform', 'distance'],
        'knn__algorithm': ['auto', 'ball_tree','kd_tree','brute'],

        'nb_pipe__nb__fit_prior': [True, False],
        'nb_pipe__nb__alpha': [0.1, 0.2],
    })
}
grid = GridSearchCV(pipe, params, scoring='accuracy', verbose=2, n_jobs=-1, return_train_score=True)
grid.fit(X, y)
print(grid.best_params_)
print(grid.best_score_)
# Not every candidate implements decision_function() (e.g. the random forest
# and k-NN classifiers), so use predict(), which all of them provide.
print(grid.best_estimator_.predict(X))

In [None]:
# Tabulate every candidate, best mean test score first
pd.DataFrame(grid.cv_results_).sort_values(by='mean_test_score', ascending=False)

# Custom GridSearchCV

In [None]:
import os, sys, time
import math, warnings, operator

from tqdm import tqdm_notebook as tqdm

from abc import ABCMeta, abstractmethod
from collections import Mapping, namedtuple, defaultdict, Sequence, Iterable
from functools import partial, reduce
from itertools import product

import numpy as np
from scipy.stats import rankdata

from sklearn.base import BaseEstimator, is_classifier, clone
from sklearn.base import MetaEstimatorMixin

from sklearn.model_selection import GridSearchCV, ParameterGrid, ParameterSampler
from sklearn.model_selection._split import check_cv
from sklearn.model_selection._validation import _fit_and_score, _aggregate_score_dicts, _score
from sklearn.exceptions import NotFittedError
from sklearn.externals.joblib import Parallel, delayed, logger, parallel_backend, register_parallel_backend
from sklearn.externals import six
from sklearn.utils import check_random_state
from sklearn.utils.fixes import sp_version
from sklearn.utils.fixes import MaskedArray
from sklearn.utils.random import sample_without_replacement
from sklearn.utils.validation import indexable, check_is_fitted, _is_arraylike, _num_samples
from sklearn.utils.metaestimators import if_delegate_has_method, _safe_split
from sklearn.utils.deprecation import DeprecationDict
from sklearn.metrics.scorer import _check_multimetric_scoring, check_scoring
from sklearn.metrics.scorer import mean_squared_error, r2_score, make_scorer

In [None]:
def my_fit_and_score(estimator, X, y, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, return_n_test_samples=False,
                   return_times=False, return_estimator=False, return_y=False,
                   error_score='raise-deprecating'):
    """Fit estimator and compute scores for a given dataset split.

    Variant of scikit-learn's ``_fit_and_score`` that can additionally return
    the true and predicted targets of the test split (``return_y``).

    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.
    X : array-like of shape at least 2D
        The data to fit.
    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    scorer : A single callable or dict mapping scorer name to the callable
        If it is a single callable, the return value for ``train_scores`` and
        ``test_scores`` is a single float.
        For a dict, it should be one mapping the scorer name to the scorer
        callable object / function.
        The callable object / fn should have signature
        ``scorer(estimator, X, y)``.
    train : array-like, shape (n_train_samples,)
        Indices of training samples.
    test : array-like, shape (n_test_samples,)
        Indices of test samples.
    verbose : integer
        The verbosity level.
    parameters : dict or None
        Parameters to be set on the estimator.
    fit_params : dict or None
        Parameters that will be passed to ``estimator.fit``.
    return_train_score : boolean, optional, default: False
        Compute and return score on training set.
    return_parameters : boolean, optional, default: False
        Return parameters that has been used for the estimator.
    return_n_test_samples : boolean, optional, default: False
        Whether to return the ``n_test_samples``
    return_times : boolean, optional, default: False
        Whether to return the fit/score times.
    return_estimator : boolean, optional, default: False
        Whether to return the fitted estimator.
    return_y : boolean, optional, default: False
        Whether to return the test targets and the estimator's predictions
        on the test split.
    error_score : 'raise' | 'raise-deprecating' or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised.
        If set to 'raise-deprecating', a FutureWarning is printed before the
        error is raised.
        If a numeric value is given, FitFailedWarning is raised. This parameter
        does not affect the refit step, which will always raise the error.
        Default is 'raise-deprecating' but from version 0.22 it will change
        to np.nan.

    Returns
    -------
    A list holding, in this order, only the items that were requested:

    train_scores : dict of scorer name -> float, optional
        Score on training set (for all the scorers),
        returned only if `return_train_score` is `True`.
    test_scores : dict of scorer name -> float, optional
        Score on testing set (for all the scorers).
    n_test_samples : int
        Number of test samples.
    fit_time : float
        Time spent for fitting in seconds.
    score_time : float
        Time spent for scoring in seconds.
    parameters : dict or None, optional
        The parameters that have been evaluated.
    estimator : estimator object
        The fitted estimator
    y_test : ndarray, optional
        Test targets converted with ``np.array``; only if ``return_y``.
    y_pred : ndarray, optional
        ``estimator.predict(X_test)``; a NaN array shaped like ``y_test`` when
        the fit failed and a numeric ``error_score`` was given. Only if
        ``return_y``.
    """
    # Names needed on the error / fit_params paths; imported here so the
    # function does not depend on the surrounding import cell.
    import numbers
    from traceback import format_exception_only
    from sklearn.exceptions import FitFailedWarning
    from sklearn.model_selection._validation import _index_param_value

    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                          for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = {k: _index_param_value(X, v, train)
                  for k, v in fit_params.items()}

    train_scores = {}
    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    is_multimetric = not callable(scorer)
    n_scorers = len(scorer.keys()) if is_multimetric else 1

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif error_score == 'raise-deprecating':
            warnings.warn("From version 0.22, errors during fit will result "
                          "in a cross validation score of NaN by default. Use "
                          "error_score='raise' if you want an exception "
                          "raised or error_score=np.nan to adopt the "
                          "behavior from version 0.22.",
                          FutureWarning)
            raise
        elif isinstance(error_score, numbers.Number):
            if is_multimetric:
                test_scores = dict(zip(scorer.keys(),
                                   [error_score, ] * n_scorers))
                if return_train_score:
                    train_scores = dict(zip(scorer.keys(),
                                        [error_score, ] * n_scorers))
            else:
                test_scores = error_score
                if return_train_score:
                    train_scores = error_score
            if return_y:
                # No usable model: return a NaN placeholder shaped like the
                # test targets so callers still receive a y_pred entry.
                y_pred = np.full(np.shape(y_test), np.nan)
            warnings.warn("Estimator fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%s" %
                          (error_score, format_exception_only(type(e), e)[0]),
                          FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        # _score will return dict if is_multimetric is True
        test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_scores = _score(estimator, X_train, y_train, scorer,
                                  is_multimetric)

        if return_y:
            y_pred = estimator.predict(X_test)

    if verbose > 2:
        if is_multimetric:
            for scorer_name, score in test_scores.items():
                msg += ", %s=%s" % (scorer_name, score)
        else:
            msg += ", score=%s" % test_scores
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_scores, test_scores] if return_train_score else [test_scores]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    if return_estimator:
        ret.append(estimator)
    if return_y:
        ret.append(np.array(y_test))
        ret.append(y_pred)
    return ret

In [None]:
class UltraGridSearchCV(GridSearchCV):
    """``GridSearchCV`` that also records per-split targets and predictions.

    Every (candidate, split) fit runs through ``my_fit_and_score`` with
    ``return_y=True``; the true and predicted test targets are stored in
    ``cv_results_`` under ``split<i>_y_test`` and ``split<i>_y_pred``.
    A tqdm progress bar advances as fit tasks are handed to joblib.

    The constructor arguments are forwarded unchanged to ``GridSearchCV``.
    """

    def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
                 n_jobs=1, iid='warn', refit=True, cv=None, verbose=0,
                 pre_dispatch='2*n_jobs', error_score='raise-deprecating',
                 return_train_score="warn"):
        super(UltraGridSearchCV, self).__init__(
            estimator=estimator, param_grid = param_grid, scoring=scoring, fit_params=fit_params,
            n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
            pre_dispatch=pre_dispatch, error_score=error_score,
            return_train_score=return_train_score)

    def _get_param_iterator(self):
        """Return ParameterGrid instance for the given param_grid"""
        return ParameterGrid(self.param_grid)

    def fit(self, X, y=None, groups=None, **fit_params):
        """Run fit with all sets of parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.
        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator

        Returns
        -------
        self
        """

        if self.fit_params is not None:
            warnings.warn('"fit_params" as a constructor argument was '
                          'deprecated in version 0.19 and will be removed '
                          'in version 0.21. Pass fit parameters to the '
                          '"fit" method instead.', DeprecationWarning)
            if fit_params:
                warnings.warn('Ignoring fit_params passed as a constructor '
                              'argument in favor of keyword arguments to '
                              'the "fit" method.', RuntimeWarning)
            else:
                fit_params = self.fit_params
        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        if self.multimetric_:
            if self.refit is not False and (
                    not isinstance(self.refit, six.string_types) or
                    # This will work for both dict / list (tuple)
                    self.refit not in scorers):
                raise ValueError("For multi-metric scoring, the parameter "
                                 "refit must be set to a scorer key "
                                 "to refit an estimator with the best "
                                 "parameter setting on the whole data and "
                                 "make the best_* attributes "
                                 "available for that metric. If this is not "
                                 "needed, refit should be set to False "
                                 "explicitly. %r was passed." % self.refit)
            else:
                refit_metric = self.refit
        else:
            refit_metric = 'score'

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)

        base_estimator = clone(self.estimator)

        parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                            pre_dispatch=self.pre_dispatch)

        # return_estimator / return_y are always on: _format_results expects
        # the fitted estimator, y_test and y_pred in every output tuple.
        fit_and_score_kwargs = dict(scorer=scorers,
                                    fit_params=fit_params,
                                    return_train_score=self.return_train_score,
                                    return_n_test_samples=True,
                                    return_times=True,
                                    return_parameters=True,
                                    return_estimator=True,
                                    return_y=True,
                                    error_score=self.error_score,
                                    verbose=self.verbose)
        results_container = [{}]
        with parallel:
            all_candidate_params = []
            all_out = []

            def evaluate_candidates(candidate_params):
                candidate_params = list(candidate_params)
                n_candidates = len(candidate_params)

                if self.verbose > 0:
                    print("Fitting {0} folds for each of {1} candidates,"
                          " totalling {2} fits".format(
                              n_splits, n_candidates, n_candidates * n_splits))

                # Zipping with a tqdm-wrapped range advances the progress bar
                # each time joblib pulls the next task from the generator.
                out = parallel(delayed(my_fit_and_score)(clone(base_estimator),
                                                       X, y,
                                                       train=train, test=test,
                                                       parameters=parameters,
                                                       **fit_and_score_kwargs)
                               for (parameters, (train, test)), i
                               in zip(
                                   product(candidate_params, cv.split(X, y, groups)),
                                   tqdm(range(n_candidates * n_splits))
                               )
                )

                all_candidate_params.extend(candidate_params)
                all_out.extend(out)

                # XXX: When we drop Python 2 support, we can use nonlocal
                # instead of results_container
                results_container[0] = self._format_results(
                    all_candidate_params, scorers, n_splits, all_out)
                return results_container[0]

            self._run_search(evaluate_candidates)

        results = results_container[0]

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
            self.best_params_ = results["params"][self.best_index_]
            self.best_score_ = results["mean_test_%s" % refit_metric][
                self.best_index_]

        if self.refit:
            self.best_estimator_ = clone(base_estimator).set_params(
                **self.best_params_)
            refit_start_time = time.time()
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)
            refit_end_time = time.time()
            self.refit_time_ = refit_end_time - refit_start_time

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers if self.multimetric_ else scorers['score']

        self.cv_results_ = results
        self.n_splits_ = n_splits

        return self

    def _format_results(self, candidate_params, scorers, n_splits, out):
        """Assemble the ``cv_results_`` mapping from the raw fit outputs.

        Parameters
        ----------
        candidate_params : list of dict
            Parameter settings evaluated so far.
        scorers : dict
            Scorer name -> scorer callable.
        n_splits : int
            Number of cross-validation splits per candidate.
        out : list
            One ``my_fit_and_score`` return list per (candidate, split),
            candidates varying slowest.

        Returns
        -------
        dict-like
            Timing, score, rank and parameter entries, plus
            ``split<i>_y_test`` / ``split<i>_y_pred`` for each split.
        """
        n_candidates = len(candidate_params)

        # if one choose to see train score, "out" will contain train score info
        if self.return_train_score:
            (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,
             score_time, parameters, the_estimator, y_test, y_pred) = zip(*out)
        else:
            (test_score_dicts, test_sample_counts, fit_time,
             score_time, parameters, the_estimator, y_test, y_pred) = zip(*out)

        # test_score_dicts and train_score dicts are lists of dictionaries and
        # we make them into dict of lists
        test_scores = _aggregate_score_dicts(test_score_dicts)
        if self.return_train_score:
            train_scores = _aggregate_score_dicts(train_score_dicts)

        # TODO: replace by a dict in 0.21
        results = (DeprecationDict() if self.return_train_score == 'warn'
                   else {})

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_"""
            # When iterated first by splits, then by parameters
            # We want `array` to have `n_candidates` rows and `n_splits` cols.
            array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                              n_splits)
            if splits:
                for split_i in range(n_splits):
                    # Uses closure to alter the results
                    results["split%d_%s"
                            % (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(np.average((array -
                                             array_means[:, np.newaxis]) ** 2,
                                            axis=1, weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                results["rank_%s" % key_name] = np.asarray(
                    rankdata(-array_means, method='min'), dtype=np.int32)

        _store('fit_time', fit_time)
        _store('score_time', score_time)

        #################################################################
        # Per-split true and predicted targets.

        y_test_array = np.array(y_test)
        y_pred_array = np.array(y_pred)
        # Builtin ``object`` replaces the removed ``np.object`` alias.
        # An object dtype here means the per-split arrays could not be stacked
        # into one numeric array, so each cell keeps its own array.
        if y_test_array.dtype == object:
            y_test_array = y_test_array.reshape(n_candidates, n_splits)
            y_pred_array = y_pred_array.reshape(n_candidates, n_splits)
            for split_i in range(n_splits):
                results["split%d_%s" % (split_i, 'y_test')] = y_test_array[:, split_i]
                results["split%d_%s" % (split_i, 'y_pred')] = y_pred_array[:, split_i]
        else:
            y_test_array = y_test_array.reshape(n_candidates, n_splits, -1)
            y_pred_array = y_pred_array.reshape(n_candidates, n_splits, -1)
            for split_i in range(n_splits):
                results["split%d_%s" % (split_i, 'y_test')] = list(y_test_array[:, split_i, :])
                results["split%d_%s" % (split_i, 'y_pred')] = list(y_pred_array[:, split_i, :])

        #################################################################

        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(partial(MaskedArray,
                                            np.empty(n_candidates,),
                                            mask=True,
                                            dtype=object))
        for cand_i, params in enumerate(candidate_params):
            for name, value in params.items():
                # An all masked empty array gets created for the key
                # `"param_%s" % name` at the first occurrence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)
        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        # NOTE test_sample counts (weights) remain the same for all candidates
        # Builtin ``int`` replaces the removed ``np.int`` alias.
        test_sample_counts = np.array(test_sample_counts[:n_splits], dtype=int)
        iid = self.iid
        if self.iid == 'warn':
            warn = False
            for scorer_name in scorers.keys():
                scores = test_scores[scorer_name].reshape(n_candidates,
                                                          n_splits)
                means_weighted = np.average(scores, axis=1,
                                            weights=test_sample_counts)
                means_unweighted = np.average(scores, axis=1)
                if not np.allclose(means_weighted, means_unweighted,
                                   rtol=1e-4, atol=1e-4):
                    warn = True
                    break

            if warn:
                warnings.warn("The default of the `iid` parameter will change "
                              "from True to False in version 0.22 and will be"
                              " removed in 0.24. This will change numeric"
                              " results when test-set sizes are unequal.",
                              DeprecationWarning)
            iid = True

        for scorer_name in scorers.keys():
            # Computed the (weighted) mean and std for test scores alone
            _store('test_%s' % scorer_name, test_scores[scorer_name],
                   splits=True, rank=True,
                   weights=test_sample_counts if iid else None)
            if self.return_train_score:
                prev_keys = set(results.keys())
                _store('train_%s' % scorer_name, train_scores[scorer_name],
                       splits=True)
                if self.return_train_score == 'warn':
                    for key in set(results.keys()) - prev_keys:
                        message = (
                            'You are accessing a training score ({!r}), '
                            'which will not be available by default '
                            'any more in 0.21. If you need training scores, '
                            'please set return_train_score=True').format(key)
                        # warn on key access
                        results.add_warning(key, message, FutureWarning)

        return results

# Final pipeline

In [None]:
import os, logging
import math

import numpy as np
import pandas as pd

from ultra_ml.parm_search import UltraGridSearchCV
from ultra_ml.pipeline import PipelineHelper

from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics.scorer import mean_squared_error, r2_score, make_scorer

from xgboost import XGBRegressor

import matplotlib.pyplot as plt

# Route INFO-and-above records through the root logger with a timestamped
# format (basicConfig without a `stream` argument writes to stderr).
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
)

# Let pandas print up to 500 rows / 500 columns before truncating output.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [None]:
# Load the Boston housing data as a (features, target) pair.
# `return_X_y` is passed by keyword: scikit-learn made it keyword-only
# (positional use warns from 0.23 and raises from 1.0).
# NOTE(review): load_boston itself was removed in scikit-learn 1.2, so this
# cell still requires scikit-learn < 1.2.
X, y = datasets.load_boston(return_X_y=True)

In [None]:
def all_but_first_column(X):
    """Return every row of the 2-D indexable ``X`` with column 0 removed."""
    remaining_columns = slice(1, None)
    return X[:, remaining_columns]

# Wrap the column-dropping function as a transformer and compare the shapes
# before and after the transformation.
ft = FunctionTransformer(func=all_but_first_column)
X_out = ft.fit_transform(X)
print(X.shape, X_out.shape)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer
# Two independent normalisations on disjoint column groups:
#   norm1 -> L1-normalise columns 0 and 1
#   norm2 -> L2-normalise columns 3 and 4 (slice(3, 5))
ct = ColumnTransformer(
    transformers=[
        ("norm1", Normalizer(norm='l1'), [0, 1]),
        ("norm2", Normalizer(norm='l2'), slice(3, 5)),
    ]
)
#X = np.array([[0.0, 1.0, 2.0, 2.0, 5.0],
#              [1.0, 1.0, 0.0, 1.0, -2.0]])
#ct.fit_transform(X)

In [None]:
# Three time-ordered train/test splits; `max_train_size=30` is intentionally
# left disabled so the training window is not capped.
tscv = TimeSeriesSplit(n_splits=3)

# Print the row indices and the size of every train/test fold.
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, len(train_index))
    print("TEST:", test_index, len(test_index))

In [None]:
# Two-step pipeline. Each step is a PipelineHelper holding several named
# alternatives, so the grid search can pick one of them per candidate.
pipe = Pipeline(steps=[
    (
        'scaler',
        PipelineHelper([
            ('std', StandardScaler()),
            ('max', MaxAbsScaler()),
        ], include_bypass=True),  # this will produce one setting without scaler
    ),
    # ('ct', ct),
    (
        'regressor',
        PipelineHelper([
            ('rf', RandomForestRegressor()),
            ('xgb', XGBRegressor()),
            ('gb', GradientBoostingRegressor()),
        ]),
    ),
])

# Search space for the grid search. Each pipeline step exposes a single
# `selected_model` parameter whose candidate values are produced by that
# step's PipelineHelper.generate(); the keys passed to generate() are
# prefixed with the sub-model name registered in the helper
# ('std', 'rf', 'xgb', 'gb').
# NOTE(review): presumably generate() expands each dict into the list of
# (model, settings) candidates -- confirm against ultra_ml.pipeline.PipelineHelper.
params = {
    # Scaler candidates: only the StandardScaler ('std') options are varied.
    'scaler__selected_model': pipe.named_steps['scaler'].generate({
        'std__with_mean': [True, False],
        'std__with_std': [True, False],
    }),
    # Regressor candidates: ensemble size for all three models, plus split
    # criterion and max_features for gradient boosting ('gb').
    'regressor__selected_model': pipe.named_steps['regressor'].generate({
        'rf__n_estimators': [10, 20, 50, 100],
        'xgb__n_estimators': [10, 20, 50, 100],
        'gb__n_estimators': [10, 20, 50, 100],
        'gb__criterion': ['friedman_mse', 'mse', 'mae'],
        'gb__max_features': ['auto', 'sqrt', None],
    })
}

# Metrics reported for every candidate. MSE is wrapped with
# greater_is_better=False, so make_scorer sign-flips it and "larger is
# better" holds for both entries; R2 is used as-is.
scorers = {
    'MSE': make_scorer(mean_squared_error, greater_is_better=False),
    'R2': make_scorer(r2_score),
}

# Search `params` over `pipe` using the folds from `tscv`. Both scorers are
# evaluated for every candidate; 'MSE' is the one named for refitting.
grid = UltraGridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=tscv,
    scoring=scorers,
    refit='MSE',
    verbose=20,
    n_jobs=-1,
    return_train_score=True,
)

# Run the search on the full data set.
fit_result = grid.fit(X, y)

In [None]:
#grid.cv_results_

In [None]:
# Tabulate the search results, best mean test R2 first, and display them.
result_df = (
    pd.DataFrame(grid.cv_results_)
    .sort_values(by='mean_test_R2', ascending=False)
)
result_df

In [None]:
# Show the parameter setting of the candidate whose index *label* is 8
# (its original position in cv_results_, not the 9th-best row).
result_df.loc[8, 'params']

In [None]:
# Display the pipeline's step-name -> step-object mapping.
pipe.named_steps

In [None]:
# Print each pipeline step as: name, object, separator line.
for k, v in pipe.named_steps.items():
    print(k, v, '-------', sep='\n')

# Plot

In [None]:
def plot_regression_result(result_df, cv, ncols=4, window=30):
    """Plot actual vs. predicted targets for every row of a result table.

    One subplot is drawn per row of ``result_df``. For each row the per-split
    ``y_test`` and ``y_pred`` arrays are concatenated in split order and
    plotted against their positional index, together with a shaded band of
    rolling mean +/- 2 rolling standard deviations of the predictions.

    Parameters
    ----------
    result_df : pandas.DataFrame
        Must provide the columns ``split{i}_y_test`` and ``split{i}_y_pred``
        for every ``i`` in ``range(cv.n_splits)``, plus
        ``param_regressor__selected_model``, ``param_scaler__selected_model``
        and ``mean_test_R2``.
    cv : object
        Cross-validation splitter; only its ``n_splits`` attribute is read.
    ncols : int, default 4
        Number of subplot columns in the figure grid.
    window : int, default 30
        Window length of the rolling mean/std band.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when
        ``result_df`` is empty.

    Raises
    ------
    ValueError
        If ``ncols`` is smaller than 1.
    """
    if ncols < 1:
        raise ValueError("ncols must be a positive integer, got %r" % (ncols,))

    result_size = len(result_df)
    if result_size == 0:
        # plt.subplots() rejects a zero-row grid, and there is nothing to plot.
        return

    nrows = int(math.ceil(result_size / float(ncols)))

    # squeeze=False keeps `axs` a 2-D array whatever the grid shape is, so
    # `.flat` below is always available.
    fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12, nrows * 1.5),
                            dpi=96, squeeze=False)

    fig.subplots_adjust(left=0.0, right=1.0, hspace=0.6, wspace=0.1)

    split_ids = range(cv.n_splits)

    for ax, (idx, row) in zip(axs.flat[:result_size], result_df.iterrows()):
        # Stitch the per-split arrays back together in split order.
        y1 = np.concatenate([row["split{}_{}".format(i, 'y_test')] for i in split_ids])
        y2 = np.concatenate([row["split{}_{}".format(i, 'y_pred')] for i in split_ids])
        x = np.arange(len(y2))

        # Rolling statistics of the predictions, each computed once.
        move_window = pd.Series(y2).rolling(window=window)
        rolling_mean = move_window.mean()
        band = 2 * move_window.std()

        ax.plot(x, y1, linewidth=1, label='Actual')
        ax.plot(x, y2, linewidth=1, label='Pred')
        ax.fill_between(x, rolling_mean - band, rolling_mean + band, alpha=0.33,
                        edgecolor='#CC4F1B', facecolor='#FF9848',
                        linewidth=0.5, antialiased=True)

        ax.grid()
        ax.legend(markerscale=0.5, fontsize=5, labelspacing=0.1, borderpad=0.25)
        ax.tick_params(direction='in', labelsize=8, length=2, width=1, pad=3, grid_alpha=0.5)

        # Title: the selected regressor and scaler settings, each cut to 50 chars.
        title_text = '\n'.join([
            str(row['param_regressor__selected_model'])[0:50],
            str(row['param_scaler__selected_model'])[0:50]
        ])
        ax.set_title(title_text, fontsize=7)

        # Score badge, positioned in data coordinates (x=10, y=60).
        ax.text(10, 60, "R2: %.4f" % (row['mean_test_R2']),
                size=8, rotation=15., ha="center", va="center",
                bbox=dict(boxstyle="round", ec=(1., 0.5, 0.5), fc=(1., 0.8, 0.8), alpha=0.8))

    # Style the unused axes at the end of the grid so they match the rest.
    for ax in axs.flat[result_size:]:
        ax.grid()
        ax.tick_params(direction='in', labelsize=6, length=2, width=1, pad=3, grid_alpha=0.5)

    #plt.tight_layout()
    plt.show()
    
# Draw one actual-vs-predicted panel per grid-search candidate.
plot_regression_result(result_df=result_df, cv=tscv)

# Experiment

In [None]:
from math import sqrt
import multiprocessing

# joblib is no longer vendored by scikit-learn (`sklearn.externals.joblib`
# was deprecated in 0.21 and removed in 0.23); import the standalone package
# and fall back to the vendored copy on old installations.
try:
    from joblib import Parallel, delayed
except ImportError:
    from sklearn.externals.joblib import Parallel, delayed
from tqdm import tqdm_notebook as tqdm

# Toy workload: compute sqrt(i ** 2) for 100000 integers on two workers,
# with a notebook progress bar driven by the input generator.
result = Parallel(n_jobs=2)(delayed(sqrt)(i ** 2) for i in tqdm(range(100000)))

In [None]:
# Shapes of two 1-D arrays of different length.
for a in (np.array([1, 2]), np.array([3, 5, 6])):
    print(a.shape)

#np.vstack( (np.array([1,2,5]),np.array([3,5,6])) )
# The rows have different lengths, so they can only be stored as a 1-D array
# of objects. dtype=object must be explicit: NumPy >= 1.24 raises ValueError
# for ragged input otherwise (older versions built the same object array).
b = np.array((np.array([1, 2]), np.array([3, 5, 6])), dtype=object)