In [1]:
from preprocess import Preprocessor

from ml_inference import BaselineRegressor

import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score

pd.options.mode.chained_assignment = None
INFILE = '../data/pennycook_et_al_study2_clean.csv'

In [154]:
from sklearn.model_selection import train_test_split

df = pd.read_csv(INFILE)
# df = df[df.Treatment==0]
X, treat, y = df.drop(columns='Diff'), df.Treatment, df.Diff
X, X_test, y, y_test = train_test_split(X, y)
df.head()

Unnamed: 0,SharingType_1,SharingType_2,SharingType_3,SharingType_4,SharingType_6,SharingType_5,SocialMedia_1,SocialMedia_2,SocialMedia_3,SocialMedia_4,...,Education,Income,English,Partisan,Social_Conserv,Economic_Conserv,Treatment,Diff,Party,POTUS2016
0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,17.0,9.0,1.0,3.0,1.0,2.0,1.0,-0.2,Democrat,Clinton
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,19.0,3.0,1.0,5.0,4.0,4.0,0.0,-0.666667,Republican,Trump
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,16.0,7.0,1.0,3.0,2.0,2.0,0.0,0.0,Independent,Clinton
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,13.0,4.0,1.0,2.0,4.0,4.0,0.0,0.0,Democrat,Clinton
4,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,14.0,6.0,1.0,1.0,5.0,5.0,0.0,0.2,Democrat,Clinton


In [3]:
cross_val_score(BaselineRegressor(), X, y).mean()

-0.017490228432946963

In [4]:
baseline_reg = BaselineRegressor().fit(X, y)
baseline_reg.score(X_test, y_test)

-0.0058821388028651445

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
    
linear_reg = make_pipeline(
    Preprocessor(),
    LinearRegression()
)
cross_val_score(linear_reg, X, y).mean()

-0.25900279660296033

In [6]:
linear_reg.fit(X, y)
linear_reg.score(X_test, y_test)

-0.25692287313206497

In [7]:
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, VotingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LassoLars, Ridge, ElasticNet
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import SVR
from scipy.stats import expon, uniform, poisson, randint
from xgboost import XGBRegressor


class Tuner():
    def __init__(self, preprocess=[]):
        self.preprocess = preprocess if isinstance(preprocess, list) else [preprocess]
        
    def tune(self, X, y, n_iter=10, n_jobs=None):
        param_distributions = self.get_param_distributions(X, y)
        est = RandomizedSearchCV(self.make_estimator(), param_distributions, n_iter=n_iter, n_jobs=n_jobs).fit(X, y)
        self.best_score_ = est.best_score_
        self.cv_results_ = list(zip(est.cv_results_['mean_test_score'], est.cv_results_['params']))
        return self
    
    def make_best_estimator(self, q=0, return_score=False):
        self.cv_results_.sort(key=lambda x: x[0], reverse=True)
        idx = round(q*(len(self.cv_results_)-1))
        est = self.make_estimator(**self.cv_results_[idx][1])
        return (self.cv_results_[idx][0], est) if return_score else est
    
    def rm_params(self, q=0):
        idx = round(q*(len(self.best_params_)-1))
        return self.best_params_.pop(idx)


class RandomForestRegressorTuner(Tuner):    
    def make_estimator(self, **params):
        est = make_pipeline(
            *self.preprocess,
            PolynomialFeatures(),
            PCA(),
            RandomForestRegressor()
        )
        return est.set_params(**params)
        
    def get_param_distributions(self, X, y):
        return {
            'polynomialfeatures__degree': [1, 2],
            'pca__n_components': list(range(1, X.shape[1])),
            'randomforestregressor__n_estimators': poisson(1, 2**5)
        }
    
    
class LassoLarsTuner(Tuner):
    def make_estimator(self, **params):
        est = make_pipeline(
            *self.preprocess,
            PolynomialFeatures(),
            PCA(),
            LassoLars(normalize=True)
        )
        return est.set_params(**params)
        
    def get_param_distributions(self, X, y):
        return {
            'polynomialfeatures__degree': [1, 2],
            'pca__n_components': list(range(1, X.shape[1])),
            'lassolars__alpha': expon(0, 1)
        }
    
    
class RidgeTuner(Tuner):
    def make_estimator(self, **params):
        est = make_pipeline(
            *self.preprocess,
            PolynomialFeatures(),
            PCA(),
            Ridge(normalize=True)
        )
        return est.set_params(**params)
        
    def get_param_distributions(self, X, y):
        return {
            'polynomialfeatures__degree': [1, 2],
            'pca__n_components': list(range(1, X.shape[1])),
            'ridge__alpha': expon(0, 1)
        }
    
    
class ElasticNetTuner(Tuner):
    def make_estimator(self, **params):
        est = make_pipeline(
            *self.preprocess,
            PolynomialFeatures(),
            PCA(),
            ElasticNet(normalize=True)
        )
        return est.set_params(**params)
    
    def get_param_distributions(self, X, y):
        return {
            'polynomialfeatures__degree': [1, 2],
            'pca__n_components': list(range(1, X.shape[1])),
            'elasticnet__alpha': expon(0, 1),
            'elasticnet__l1_ratio': uniform(0, 1)
        }
    
    
class KernelRidgeTuner(Tuner):
    def make_estimator(self, **params):
        est = make_pipeline(
            *self.preprocess,
            PolynomialFeatures(),
            PCA(),
            StandardScaler(),
            KernelRidge()
        )
        return est.set_params(**params)
    
    def get_param_distributions(self, X, y):
        return {
            'polynomialfeatures__degree': [1, 2],
            'pca__n_components': list(range(1, X.shape[1])),
            'kernelridge__alpha': expon(0, 1),
            'kernelridge__degree': list(range(2, 5)),
            'kernelridge__kernel': ['linear', 'poly', 'rbf', 'laplacian']
        }
    
    
class SVRTuner(Tuner):
    def make_estimator(self, **params):
        est = make_pipeline(
            *self.preprocess,
            PolynomialFeatures(),
            PCA(),
            StandardScaler(),
            SVR()
        )
        return est.set_params(**params)
        
    def get_param_distributions(self, X, y):
        return {
            'polynomialfeatures__degree': [1, 2],
            'pca__n_components': list(range(1, X.shape[1])),
            'svr__C': expon(0, 1),
            'svr__degree': list(range(2, 5)),
            'svr__kernel': ['linear', 'poly', 'rbf'],
        }
    
class KNeighborsRegressorTuner(Tuner):
    def make_estimator(self, **params):
        est = make_pipeline(
            *self.preprocess,
            PolynomialFeatures(),
            PCA(),
            StandardScaler(),
            KNeighborsRegressor()
        )
        return est.set_params(**params)
        
    def get_param_distributions(self, X, y):
        return {
            'polynomialfeatures__degree': [1, 2],
            'pca__n_components': list(range(1, X.shape[1])),
            'kneighborsregressor__n_neighbors': randint(1, .05*X.shape[0]),
            'kneighborsregressor__weights': ['uniform', 'distance']
        }
    
    
class AdaBoostRegressorTuner(Tuner):
    def make_estimator(self, **params):
        est = make_pipeline(
            *self.preprocess,
            PolynomialFeatures(),
            PCA(),
            AdaBoostRegressor()
        )
        return est.set_params(**params)
        
    def get_param_distributions(self, X, y):
        return {
            'polynomialfeatures__degree': [1, 2],
            'pca__n_components': list(range(1, X.shape[1])),
            'adaboostregressor__n_estimators': poisson(1, 2**5)
        }
    
class XGBRegressorTuner(Tuner):
    def make_estimator(self, **params):
        est = make_pipeline(
            *self.preprocess,
            PolynomialFeatures(),
            PCA(),
            XGBRegressor()
        )
        return est.set_params(**params)
    
    def get_param_distributions(self, X, y):
        return {
            'polynomialfeatures__degree': [1, 2],
            'pca__n_components': list(range(1, X.shape[1])),
            'xgbregressor__gamma': expon(0, 1),
            'xgbregressor__max_depth': list(range(10)),
            'xgbregressor__min_child_weight': expon(0, 1),
            'xgbregressor__max_delta_step': expon(0, 1),
            'xgbregressor__lambda': expon(0, 1),
            'xgbregressor__alpha': expon(0, 1),
        }
    
# tuner = XGBRegressorTuner(preprocess=Encoder())
# tuner.make_estimator().get_params()
# tuner.tune(X, y, n_iter=2)
# tuner.best_params_

In [103]:
%%time

tuners = [
    AdaBoostRegressorTuner(preprocess=Preprocessor()),
    KernelRidgeTuner(preprocess=Preprocessor()),
    RidgeTuner(preprocess=Preprocessor()),
    SVRTuner(preprocess=Preprocessor()),
    RandomForestRegressorTuner(preprocess=Preprocessor()),
    XGBRegressorTuner(preprocess=Preprocessor()),
    KNeighborsRegressorTuner(preprocess=Preprocessor()),
]
for i, tuner in enumerate(tuners):
    print('Running tuner {} of {}'.format(i+1, len(tuners)))
    tuner.tune(X, y, n_iter=2**4, n_jobs=-1)

Running tuner 1 of 7
Running tuner 2 of 7
Running tuner 3 of 7
Running tuner 4 of 7
Running tuner 5 of 7
Running tuner 6 of 7
Running tuner 7 of 7
Wall time: 55.6 s


In [8]:
from sklearn.linear_model import LinearRegression


class ConstrainedLinearRegression(LinearRegression):
    def __init__(self, constraint=0, normalize=False, copy_X=True, n_jobs=None):
        self.constraint = constraint
        super().__init__(fit_intercept=False, normalize=normalize, copy_X=copy_X, n_jobs=n_jobs)
        
    def fit(self, X, y, sample_weight=None):
        if X.shape[1] == 1:
            self.coef_ = np.array([1])
            return self
        if hasattr(X, 'values'):
            X = X.values
        X_0, X_rest = X[:,0], X[:,1:]
        X_rest = (X_rest.T - X_0).T
        y = y - self.constraint * X_0
        super().fit(X_rest, y, sample_weight)
        self.coef_ = np.insert(self.coef_, 0, self.constraint - self.coef_.sum())
        return self
    
    def predict(self, X):
        return X @ self.coef_

In [90]:
from copy import deepcopy

import numpy as np
from joblib import Parallel
from scipy.stats import loguniform
from sklearn.base import clone, is_classifier
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.ensemble._base import _fit_single_estimator
from sklearn.model_selection import check_cv, cross_val_predict
from sklearn.utils.fixes import delayed

def _predict_single_estimator(estimator, X):
    return estimator.predict(X)


class _EnsembleBase(StackingRegressor):
    def __init__(self, estimators, cv=None, n_jobs=None, verbose=0):
        super().__init__(estimators, cv=cv, n_jobs=n_jobs, verbose=verbose)
        
    def transform(self, X):
        predictions = Parallel(n_jobs=self.n_jobs)(
            delayed(_predict_single_estimator)(est, X)
            for est in self.estimators_
        )
        return self._concatenate_predictions(X, predictions)
    
    def _fit_estimators(self, X, y, estimators, sample_weight):
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_single_estimator)(clone(est), X, y, sample_weight)
            for est in estimators
        )
        
    def _check_cv(self):
        cv = check_cv(self.cv, y=y, classifier=is_classifier(self))
        if hasattr(cv, 'random_state') and cv.random_state is None:
            cv.random_state = np.random.RandomState()
        if hasattr(cv, 'shuffle'):
            cv.shuffle = True
        cv.shuffle = True
        return cv
    
    def _cross_val_predict(self, X, y, estimators, cv, sample_weight):
        fit_params = (
            {} if sample_weight is None else {'sample_weight': sample_weight}
        )
        predictions = Parallel(n_jobs=self.n_jobs)(
            delayed(cross_val_predict)(
                clone(est), X, y, 
                cv=deepcopy(cv), 
                n_jobs=self.n_jobs, 
                fit_params=fit_params, 
                verbose=self.verbose
            )
            for est in estimators
        )
        return self._concatenate_predictions(X, predictions)
    

class StackingRegressorRFECV(_EnsembleBase):   
    def fit(self, X, y, sample_weight=None):
        def get_rfe_scores(X_meta, estimators):
            rfe_progress = []
            while estimators:
                score = cross_val_score(linear_reg, X_meta, y, cv=cv).mean()
                linear_reg.fit(X_meta, y)
                rfe_progress.append((score, estimators.copy(), linear_reg.coef_))
                drop_idx = int(np.argmin(linear_reg.coef_))
                estimators.pop(drop_idx)
                X_meta = np.delete(X_meta, drop_idx, axis=1)
            return max(rfe_progress, key=lambda x: x[0])
        
        names, all_estimators = self._validate_estimators()
        cv = self._check_cv()
        X_meta = self._cross_val_predict(X, y, all_estimators, cv, sample_weight)
        linear_reg = ConstrainedLinearRegression(1)
        estimators = list(zip(names, all_estimators))
        self.best_score_, estimators, linear_reg.coef_ = get_rfe_scores(X_meta, estimators)
        self.names_, estimators = zip(*estimators)
        self._fit_estimators(X, y, estimators, sample_weight)
        self.final_estimator_ = linear_reg
        return self
    
    def make_best_estimator(self):
        estimators = list(zip(self.names_, self.estimators_))
        return VotingRegressor(
            estimators, weights=self.final_estimator_.coef_, 
            n_jobs=self.n_jobs, verbose=self.verbose
        )

In [133]:
class StackingStepwiseRegressorCV(_EnsembleBase):
    def fit(self, X, y, sample_weight=None):
        def get_stepwise_scores(X_meta, estimators):
            best_score = -np.inf
            in_estimators, X_in = [], None
            out_estimators, X_out = estimators.copy(), X_meta
            while out_estimators:
                new_scores = []
                for col in X_out.T:
                    col = col.reshape(-1, 1)
                    X = col if X_in is None else np.concatenate((X_in, col), axis=1)
                    coef_ = linear_reg.fit(X, y).coef_
                    if any(coef_ < 0):
                        # make sure estimators get non-negative weight
                        # otherwise, the ensemble has probably overfit
                        new_scores.append(-np.inf)
                    else:
                        new_scores.append(cross_val_score(linear_reg, X, y, cv=cv).mean())
                idx = np.argmax(new_scores)
                if new_scores[idx] <= best_score:
                    break
                best_score = new_scores[idx]
                col = X_out[:, idx].reshape(-1, 1)
                X_in = col if X_in is None else np.concatenate((X_in, col), axis=1)
                X_out = np.delete(X_out, idx, axis=1)
                in_estimators.append(out_estimators.pop(idx))
            return best_score, in_estimators, linear_reg.fit(X_in, y).coef_
            
        names, all_estimators = self._validate_estimators()
        cv = self._check_cv()
        X_meta = self._cross_val_predict(X, y, all_estimators, cv, sample_weight)
        linear_reg = ConstrainedLinearRegression(1)
        estimators = list(zip(names, all_estimators))
        self.best_score_, estimators, linear_reg.coef_ = get_stepwise_scores(X_meta, estimators)
        self.names_, estimators = zip(*estimators)
        self._fit_estimators(X, y, estimators, sample_weight)
        self.final_estimator_ = linear_reg
        return self
    
    def make_best_estimator(self):
        estimators = list(zip(self.names_, self.estimators_))
        return VotingRegressor(
            estimators, weights=self.final_estimator_.coef_, 
            n_jobs=self.n_jobs, verbose=self.verbose
        )

In [134]:
%%time

estimators = [tuner.make_best_estimator() for tuner in tuners]
estimators = [('estimator {}'.format(i), est) for i, est in enumerate(estimators)]
reg = StackingStepwiseRegressorCV(estimators)
reg.fit(X, y)
reg.best_score_

Wall time: 5.47 s


0.05861217266966494

In [135]:
reg.final_estimator_.coef_

array([0.23717454, 0.40588064, 0.35694482])

In [10]:
def make_default_tuners():
    return [
        RandomForestRegressorTuner(),
        LassoLarsTuner(),
        RidgeTuner(),
        ElasticNetTuner(),
        KernelRidgeTuner(),
        SVRTuner(),
        KNeighborsRegressorTuner(),
        AdaBoostRegressorTuner(),
        XGBRegressorTuner()
    ]

class AutoRegressor():
    def __init__(self, tuners=[], preprocessors=[], max_ensemble_size=100, n_ensembles=5, n_iter=10, n_jobs=None, verbose=False, cv=None):
        self.tuners = tuners or make_default_tuners()
        self.preprocessors = preprocessors if isinstance(preprocessors, list) else [preprocessors]
        self.max_ensemble_size = max_ensemble_size
        self.n_ensembles = n_ensembles
        self.n_iter = n_iter
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.cv = cv
        
    def fit(self, X, y, sample_weight=None):
        def run_tuners():
            for i, tuner in enumerate(self.tuners):
                print('Running tuner {} of {}'.format(i+1, len(self.tuners)))
                tuner.tune(X, y, n_iter=self.n_iter, n_jobs=self.n_jobs)
                print('best score', tuner.best_score_)
                
        for preprocessor in self.preprocessors:
            X = preprocessor.fit_transform(X)
        run_tuners()
        best_params = [(res, tuner) for tuner in self.tuners for res in tuner.cv_results_]
        # x[0][0] is the cv score
        best_params = sorted(best_params, key=lambda x: x[0][0], reverse=True)[:self.max_ensemble_size]
        best_estimators = [tuner.make_estimator(**params) for (_, params), tuner in best_params]
        best_estimators = [('estimator {}'.format(i), estimator) for i, estimator in enumerate(best_estimators)]
        voting_regressors = []
        for i in range(self.n_ensembles):
            print('Building ensemble {} of {}'.format(i+1, self.n_ensembles))
            stack = StackingRegressorRFECV(best_estimators, n_jobs=self.n_jobs, cv=self.cv)
            stack.fit(X, y)
            print(stack.best_score_)
            voting_regressors.append(('ensemble {}'.format(i+1), stack.make_best_estimator()))
        self.best_estimator_ = make_pipeline(
            *self.preprocessors,
            VotingRegressor(voting_regressors)
        )
        return self
    
    def predict(self, X):
        return self.best_estimator_.predict(X)
    
    def make_best_estimator(self):
        return clone(self.best_estimator_)

In [187]:
class AutoRegressor():
    def __init__(
            self, tuners=[], preprocessors=[], ensemble_method='stepwise', max_ensemble_size=100, n_ensembles=5, n_iter=10, 
            n_jobs=None, verbose=False, cv=None
        ):
        self.tuners = tuners or make_default_tuners()
        self.preprocessors = preprocessors if isinstance(preprocessors, list) else [preprocessors]
        assert ensemble_method in ('stepwise', 'rfe')
        self.ensemble_method = ensemble_method
        self.max_ensemble_size = max_ensemble_size
        self.n_ensembles = n_ensembles
        self.n_iter = n_iter
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.cv = cv
        
    def fit(self, X, y, sample_weight=None):
        def run_tuners():
            for i, tuner in enumerate(self.tuners):
                print('Running tuner {} of {}'.format(i+1, len(self.tuners)))
                tuner.tune(X, y, n_iter=self.n_iter, n_jobs=self.n_jobs)
                print('best score', tuner.best_score_)
                
        for preprocessor in self.preprocessors:
            X = preprocessor.fit_transform(X)
        run_tuners()
        best_params = [(res, tuner) for tuner in self.tuners for res in tuner.cv_results_]
        # x[0][0] is the cv score
        best_params = sorted(best_params, key=lambda x: x[0][0], reverse=True)[:self.max_ensemble_size]
        best_estimators = [tuner.make_estimator(**params) for (_, params), tuner in best_params]
        best_estimators = [('estimator {}'.format(i), estimator) for i, estimator in enumerate(best_estimators)]
        estimators = best_estimators.pop(0)
        linear_reg = ConstrainedLinearRegression(1)
        score = cross_val_score(linear_reg, X, y)
        rfe_voting_regressors = []
        stepwise_voting_regressors = []
        voting_regressors = []
        for i in range(self.n_ensembles):
            print('Building ensemble {} of {}'.format(i+1, self.n_ensembles))
#             if self.ensemble_method == 'rfe':
#                 stack = StackingRegressorRFECV(best_estimators, n_jobs=self.n_jobs, cv=self.cv)
#             elif self.ensemble_method == 'stepwise':
#                 stack = StackingStepwiseRegressorCV(best_estimators, n_jobs=self.n_jobs, cv=self.cv)
            stack = StackingRegressorRFECV(best_estimators, n_jobs=self.n_jobs, cv=self.cv)
            stack.fit(X, y)
            print('RFE best score', stack.best_score_)
            rfe_estimator = stack.make_best_estimator()
            rfe_voting_regressors.append(('ensemble {}'.format(i+1), rfe_estimator))
            stack = StackingStepwiseRegressorCV(best_estimators, n_jobs=self.n_jobs, cv=self.cv)
            stack.fit(X, y)
            print('Stepwise best score', stack.best_score_)
            stepwise_estimator = stack.make_best_estimator()
            stepwise_voting_regressors.append(('ensemble {}'.format(i+1), stepwise_estimator))
            stack = StackingStepwiseRegressorCV(
                estimators=[
                    ('rfe_estimator', rfe_estimator),
                    ('stepwise_estimator', stepwise_estimator)
                ],
                n_jobs=self.n_jobs,
                cv=self.cv
            )
            stack.fit(X, y)
            voting_regressors.append(('ensemble {}'.format(i+1), stack.make_best_estimator()))
            print('Ensemble best score', stack.best_score_)
        self.rfe_best_estimator_ = make_pipeline(
            *self.preprocessors,
            VotingRegressor(rfe_voting_regressors)
        )
        self.stepwise_best_estimator_ = make_pipeline(
            *self.preprocessors,
            VotingRegressor(stepwise_voting_regressors)
        )
        self.ensemble_best_estimator_ = make_pipeline(
            *self.preprocessors,
            VotingRegressor(voting_regressors)
        )
        return self
    
    def predict(self, X):
        return self.best_estimator_.predict(X)
    
    def make_best_estimator(self):
        return clone(self.best_estimator_)

In [188]:
%%time

from sklearn.datasets import load_diabetes
from sklearn.model_selection import KFold

# X, y = load_diabetes(return_X_y=True)
# X, X_test, y, y_test = train_test_split(X, y)
X, y = df.drop(columns='Diff'), df.Diff
X, X_test, y, y_test = train_test_split(X, y)
autoreg = AutoRegressor(preprocessors=Preprocessor(), max_ensemble_size=3, n_iter=3, n_ensembles=1, n_jobs=-1).fit(X, y)

Running tuner 1 of 9
best score 0.09400844635278931
Running tuner 2 of 9
best score -0.0026438008817306624
Running tuner 3 of 9
best score 0.1397755290624509
Running tuner 4 of 9
best score -0.0026438008817306624
Running tuner 5 of 9
best score 0.08409025885361458
Running tuner 6 of 9
best score 0.12240153875172485
Running tuner 7 of 9
best score 0.08687868226573803
Running tuner 8 of 9
best score 0.06585126758087165
Running tuner 9 of 9
best score 0.0273452549161598
Building ensemble 1 of 1
RFE best score 0.12754176389562022
Stepwise best score 0.1316988439521851
Ensemble best score 0.13228383570527272
Wall time: 31.5 s


In [205]:
%%time

from sklearn.datasets import load_diabetes, load_boston

rfe_test_score_ = []
stepwise_test_score_ = []
ensemble_test_score_ = []
for i in range(10):
    X, y = load_boston(return_X_y=True)
    X, X_test, y, y_test = train_test_split(X, y)
    autoreg = AutoRegressor(max_ensemble_size=100, n_iter=2**4, n_ensembles=3, n_jobs=-1).fit(X, y)
    reg = autoreg.rfe_best_estimator_.fit(X, y)
    rfe_test_score_.append(reg.score(X_test, y_test))
    print()
    print('RFE test score', rfe_test_score_[-1])
    reg = autoreg.stepwise_best_estimator_.fit(X, y)
    stepwise_test_score_.append(reg.score(X_test, y_test))
    print('Stepwise test score', stepwise_test_score_[-1])
    reg = autoreg.ensemble_best_estimator_.fit(X, y)
    ensemble_test_score_.append(reg.score(X_test, y_test))
    print('Ensemble test score', ensemble_test_score_[-1])
    print()

Running tuner 1 of 9
best score 0.7353581502397176
Running tuner 2 of 9
best score 0.0037945080409373944
Running tuner 3 of 9
best score 0.6241101453034259
Running tuner 4 of 9
best score 0.1637886267126912
Running tuner 5 of 9
best score 0.7430575774937813
Running tuner 6 of 9
best score 0.6360758212975578
Running tuner 7 of 9
best score 0.7091696348974517
Running tuner 8 of 9
best score 0.6850572593610561
Running tuner 9 of 9
best score 0.6336611662968309
Building ensemble 1 of 3
RFE best score 0.8168682666287393
Stepwise best score 0.8028979914562203
Ensemble best score 0.8456411446730648
Building ensemble 2 of 3
RFE best score 0.8125276916210898
Stepwise best score 0.809429202730253
Ensemble best score 0.8068824151601401
Building ensemble 3 of 3
RFE best score 0.8109926330434474
Stepwise best score 0.7845336790499869
Ensemble best score 0.8188425239722588

RFE test score 0.8274430845740518
Stepwise test score 0.7961170474563914
Ensemble test score 0.8264556434713517

Running tuner 

best score 0.675693768788288
Running tuner 9 of 9
best score 0.381918829606602
Building ensemble 1 of 3
RFE best score 0.7744437503345499
Stepwise best score 0.7570122019296291
Ensemble best score 0.7718584383848416
Building ensemble 2 of 3
RFE best score 0.7762016384321343
Stepwise best score 0.7611355119523251
Ensemble best score 0.7614614987592807
Building ensemble 3 of 3
RFE best score 0.7568451179735727
Stepwise best score 0.7666824846374725
Ensemble best score 0.7750925917370808

RFE test score 0.8317273620864167
Stepwise test score 0.8341100394992138
Ensemble test score 0.8359986557295448

Running tuner 1 of 9
best score 0.7709599048274258
Running tuner 2 of 9
best score -0.011217234902483542
Running tuner 3 of 9
best score 0.665991014261198
Running tuner 4 of 9
best score 0.629887863464878
Running tuner 5 of 9
best score 0.6954874740397545
Running tuner 6 of 9
best score 0.6346977527943339
Running tuner 7 of 9
best score 0.7520079608933152
Running tuner 8 of 9
best score 0.7053

In [206]:
test_scores = [np.array(scores) for scores in (rfe_test_score_, stepwise_test_score_, ensemble_test_score_)]
for score in test_scores:
    print(score.mean(), score.std(), score.min(), score.max())

0.8245916674202511 0.02708519349808612 0.7598120715836976 0.8620076294430448
0.8067616910769017 0.033416720624429797 0.7152632475743925 0.8356088415709901
0.8234868984485759 0.023992394974231376 0.7635026270696887 0.850732325555668


In [195]:
%%time

rfe_test_score = []
stepwise_test_score = []
ensemble_test_score = []
for i in range(10):
    X, y = df.drop(columns='Diff'), df.Diff
    X, X_test, y, y_test = train_test_split(X, y)
    autoreg = AutoRegressor(preprocessors=Preprocessor(), max_ensemble_size=100, n_iter=2**5, n_ensembles=3, n_jobs=-1).fit(X, y)
    reg = autoreg.rfe_best_estimator_.fit(X, y)
    rfe_test_score.append(reg.score(X_test, y_test))
    print()
    print('RFE test score', rfe_test_score[-1])
    reg = autoreg.stepwise_best_estimator_.fit(X, y)
    stepwise_test_score.append(reg.score(X_test, y_test))
    print('Stepwise test score', stepwise_test_score[-1])
    reg = autoreg.ensemble_best_estimator_.fit(X, y)
    ensemble_test_score.append(reg.score(X_test, y_test))
    print('Ensemble test score', ensemble_test_score[-1])
    print()

Running tuner 1 of 9
best score 0.12870141985581054
Running tuner 2 of 9
best score -0.0026147414240628385
Running tuner 3 of 9
best score 0.15351782549913248
Running tuner 4 of 9
best score 0.00031031007960604653
Running tuner 5 of 9
best score 0.15370653489449745
Running tuner 6 of 9
best score 0.13504367219993452
Running tuner 7 of 9
best score 0.14142730094193978
Running tuner 8 of 9
best score 0.13311167001058416
Running tuner 9 of 9
best score 0.11098018346930678
Building ensemble 1 of 3
RFE best score 0.16608657174614433
Stepwise best score 0.17879440809636277
Ensemble best score 0.19151255873678094
Building ensemble 2 of 3
RFE best score 0.19362065766121944
Stepwise best score 0.19633730567392668
Ensemble best score 0.17658092876254422
Building ensemble 3 of 3
RFE best score 0.1838389333733734
Stepwise best score 0.17775872985309535
Ensemble best score 0.16574444485142076

RFE test score 0.09660723034952334
Stepwise test score 0.07859982499098406
Ensemble test score 0.087257886

best score 0.0051950490926700125
Running tuner 5 of 9
best score 0.1655908368069621
Running tuner 6 of 9
best score 0.15115760146669638
Running tuner 7 of 9
best score 0.12501657903563995
Running tuner 8 of 9
best score 0.1864837133868861
Running tuner 9 of 9
best score 0.10584798552688705
Building ensemble 1 of 3
RFE best score 0.2040038598941035
Stepwise best score 0.17469702431243145
Ensemble best score 0.17267442365300137
Building ensemble 2 of 3
RFE best score 0.19676020957956988
Stepwise best score 0.1741268207630071
Ensemble best score 0.17550341307075268
Building ensemble 3 of 3
RFE best score 0.18083176199369128
Stepwise best score 0.1910331417201909
Ensemble best score 0.19381397816624793

RFE test score 0.10103701721897129
Stepwise test score 0.10425565424641425
Ensemble test score 0.10713038490613114

Running tuner 1 of 9
best score 0.10868431343797565
Running tuner 2 of 9
best score -0.010926529567007081
Running tuner 3 of 9
best score 0.15146898605950396
Running tuner 4 o

In [196]:
rfe_test_score = np.array(rfe_test_score)
stepwise_test_score = np.array(stepwise_test_score)
ensemble_test_score = np.array(ensemble_test_score)
rfe_test_score.mean(), stepwise_test_score.mean(), ensemble_test_score.mean()

(0.1431094014560752, 0.1423603204421809, 0.1450567063338882)

In [198]:
test_scores = [rfe_test_score, stepwise_test_score, ensemble_test_score]
for score in test_scores:
    print(score.mean(), score.std(), score.min(), score.max())

0.1431094014560752 0.03698229927440087 0.09660723034952334 0.20287743298118488
0.1423603204421809 0.03522963723231343 0.07859982499098406 0.1996726395700834
0.1450567063338882 0.03738906043423106 0.08725788670019274 0.20471490099460887


In [199]:
linear_reg = make_pipeline(
    Preprocessor(),
    LinearRegression()
)
linear_reg.fit(X, y)
print(linear_reg.score(X_test, y_test))
baseline_reg.fit(X, y)
print(baseline_reg.score(X_test, y_test))

0.010924311185860569
-0.0029542213699329523


In [178]:
%%time
from sklearn.model_selection import KFold

for _ in range(5):
    cv = KFold(shuffle=True)
    cv.random_state = np.random.RandomState()
    print(cross_val_score(baseline_reg, X, y, cv=cv).mean())
    print(cross_val_score(linear_reg, X, y, cv=cv).mean())
    print(cross_val_score(reg, X, y, cv=cv).mean())
    print()

-0.0062577576409958095
-0.04600090617207673
0.16307040944304638

-0.009055857140276746
-0.014232296794247045
0.15361196627922696

-0.009418446809962022
-0.027665366344261776
0.16010932609017375

-0.05502685209152669
-0.049947435246535866
0.1661552575352783

-0.005171155783737591
-0.039876971002700004
0.14715201192386776

Wall time: 16.2 s
