In [1]:
from preprocess import Preprocessor

from ml_inference.automl import AutoRegressor

import pandas as pd

# Input CSV, given relative to the notebook's working directory.
INFILE = '../data/pennycook_et_al_study2_clean.csv'

# Load every row; the Treatment == 0 filter below is currently disabled.
df = pd.read_csv(INFILE)
# df = df[df.Treatment==0]

# `Diff` is the regression target; all remaining columns are used as features.
y = df['Diff']
X = df.drop(columns=['Diff'])

In [None]:
from sklearn.datasets import load_boston

# Alternative data source: running this cell rebinds X and y, replacing the
# CSV-derived values assigned in the first cell.
# NOTE(review): `load_boston` was removed in scikit-learn 1.2, so this cell only
# runs on older releases — confirm the pinned version before relying on it.
X, y = load_boston(return_X_y=True)

In [2]:
%%time

# Run the AutoRegressor search on (X, y) and keep only its best estimator.
reg = (
    AutoRegressor(preprocessors=Preprocessor(), n_jobs=-1, n_iter=2)
    .fit(X, y)
    .best_estimator_
)


Tuning estimator 1 of 18: RandomForestRegressorCV
Best estimator score 0.10427524086356392

Tuning estimator 2 of 18: PCARandomForestRegressorCV
Best estimator score 0.07384330402998036

Tuning estimator 3 of 18: LassoLarsCV
Best estimator score -0.04962489827860379

Tuning estimator 4 of 18: PCALassoLarsCV
Best estimator score -0.04962489827860379

Tuning estimator 5 of 18: RidgeCV
Best estimator score 0.1289117992806556

Tuning estimator 6 of 18: PCARidgeCV
Best estimator score 0.09202551443054896

Tuning estimator 7 of 18: ElasticNetCV
Best estimator score -0.04962489827860379

Tuning estimator 8 of 18: PCAElasticNetCV
Best estimator score -0.04962489827860379

Tuning estimator 9 of 18: KernelRidgeCV
Best estimator score 0.12987761124467317

Tuning estimator 10 of 18: PCAKernelRidgeCV
Best estimator score 0.08591169166745191

Tuning estimator 11 of 18: SVRCV
Best estimator score 0.1190568483637291

Tuning estimator 12 of 18: PCASVRCV
Best estimator score 0.0873157625440631

Tuning 

In [3]:
# Display the `estimators` attribute of the object held in the pipeline's last step.
reg.steps[-1][-1].estimators

[('estimator 0',
  Pipeline(steps=[('standardscaler', StandardScaler()),
                  ('kernelridge',
                   KernelRidge(alpha=0.5001082590929797, degree=4,
                               kernel='rbf'))])),
 ('estimator 5',
  Pipeline(steps=[('randomforestregressor',
                   RandomForestRegressor(n_estimators=133))])),
 ('estimator 8',
  Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=1)),
                  ('standardscaler-1', StandardScaler()),
                  ('pca', PCA(n_components=67)),
                  ('standardscaler-2', StandardScaler()),
                  ('svr', SVR(C=0.6670272805927896, degree=10, kernel='linear'))])),
 ('estimator 14',
  Pipeline(steps=[('standardscaler', StandardScaler()),
                  ('kneighborsregressor', KNeighborsRegressor(n_neighbors=12))]))]

In [4]:
# Display the `weights` attribute of the object held in the pipeline's last step.
reg.steps[-1][-1].weights

array([0.36486547, 0.28486175, 0.15248981, 0.19778297])

In [5]:
%%time

from sklearn.model_selection import cross_val_score, KFold

# Repeat shuffled K-fold cross-validation ten times and print each run's mean score.
# KFold is given no random_state, so the splits (and the scores) differ per run.
for i in range(10):
    xval = cross_val_score(reg, X, y, cv=KFold(shuffle=True), n_jobs=-1)
    print(xval.mean())

0.17263651776382263
0.20086737293061718
0.17726144652729803
0.1825402260105387
0.18903829304253159
0.17405104909286445
0.1842314987773955
0.18653033211307243
0.1724284064065859
0.1746981418389027
Wall time: 49.5 s


In [None]:
from dask.distributed import Client
# see https://github.com/dask/distributed/issues/4168
import multiprocessing.popen_spawn_win32

# Start a dask.distributed client backed by four workers.
client = Client(n_workers=4)

In [None]:
import numpy as np
from sklearn.datasets import load_diabetes

# Load the diabetes data set and shuffle its rows (unseeded).
X, y = load_diabetes(return_X_y=True)
# Append the target as a last column so a single in-place shuffle permutes the
# features and the target together.
a = np.column_stack((X, y))
np.random.shuffle(a)
X = a[:, :-1]
y = a[:, -1]

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

# Baseline: standardise the features, then k-nearest-neighbours regression.
knn = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor()
)
# The cell displays the mean cross-validated score.
cross_val_score(knn, X, y).mean()

In [None]:
from sklearn.svm import SVR

# Baseline: standardise the features, then support-vector regression with default settings.
svr = make_pipeline(
    StandardScaler(),
    SVR()
)
# The cell displays the mean cross-validated score.
cross_val_score(svr, X, y).mean()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline: random forest with default settings (no random_state is set).
rf = RandomForestRegressor()
xval_score = cross_val_score(rf, X, y)
# Print the per-fold scores followed by their mean.
print(xval_score)
print(xval_score.mean())

In [None]:
from sklearn.linear_model import Ridge as RidgeBase

class Ridge(RidgeBase):
    """Ridge regression fitted around a fixed prior weight.

    ``fit`` subtracts ``(prior_weight * X).sum(axis=1)`` from the target before
    delegating to the parent class, and ``predict`` adds the same term back, so
    ``coef_`` holds the learned offset relative to ``prior_weight``.

    Parameters added on top of the parent's:
        prior_weight: weight multiplied into ``X`` to build the prior term.
        normalize_coef: when true, ``coef_`` is re-centred to zero mean after
            fitting.
    """

    def __init__(self, alpha=1., prior_weight=0, normalize_coef=False, fit_intercept=True,
                 normalize=False, copy_X=True, max_iter=None, tol=.001, solver='auto',
                 random_state=None):
        super().__init__(
            alpha,
            fit_intercept=fit_intercept,
            normalize=normalize,
            copy_X=copy_X,
            max_iter=max_iter,
            tol=tol,
            solver=solver,
            random_state=random_state,
        )
        self.prior_weight = prior_weight
        self.normalize_coef = normalize_coef

    def _prior_term(self, X):
        """Return the per-row contribution of the prior weight."""
        return (self.prior_weight * X).sum(axis=1)

    def fit(self, X, y, sample_weight=None):
        """Fit on the target with the prior term removed; returns ``self``."""
        residual = y - self._prior_term(X)
        super().fit(X, residual, sample_weight)
        if not self.normalize_coef:
            return self
        # Re-centre in place so the fitted offsets have zero mean.
        self.coef_ -= self.coef_.mean()
        return self

    def predict(self, X):
        """Parent prediction plus the prior term that ``fit`` removed."""
        return super().predict(X) + self._prior_term(X)

In [None]:
# Baseline: the custom Ridge defined above with its default alpha and prior_weight.
ridge = Ridge(normalize=True)
# The cell displays the mean cross-validated score.
cross_val_score(ridge, X, y).mean()

In [None]:
from scipy import sparse
from scipy.stats import loguniform
from sklearn.linear_model import RidgeCV as RidgeCVBase
from sklearn.model_selection import RandomizedSearchCV

class RidgeCV(RidgeCVBase):
    """Tune the custom ``Ridge``'s ``alpha`` with a randomized search.

    Parameters:
        alphas: value passed to ``RandomizedSearchCV`` as the search space for
            ``alpha`` (a log-uniform distribution by default).
        prior_weight, normalize_coef: forwarded to the underlying ``Ridge``.
        n_iter: number of sampled candidates.
        fit_intercept, normalize, scoring, cv: stored by the parent constructor
            and forwarded to ``Ridge`` / ``RandomizedSearchCV`` in ``fit``.
        n_jobs: parallelism of the search.

    After ``fit`` the instance exposes ``alpha_``, ``best_score_``, ``coef_``,
    ``intercept_`` and ``n_features_in_`` copied from the search's best estimator.
    """

    # Fitted attributes mirrored between this object and a plain ``Ridge``.
    _FITTED_ATTRS = ('coef_', 'intercept_', 'n_features_in_')

    def __init__(self, alphas=loguniform(1e-3, 1e3), prior_weight=0, normalize_coef=False,
                 n_iter=10, fit_intercept=True, normalize=False, scoring=None, cv=None,
                 n_jobs=None):
        super().__init__(fit_intercept=fit_intercept, normalize=normalize, scoring=scoring, cv=cv)
        # `alphas` is not forwarded to the parent constructor, so it is assigned afterwards.
        self.alphas = alphas
        self.prior_weight = prior_weight
        self.normalize_coef = normalize_coef
        self.n_iter = n_iter
        self.n_jobs = n_jobs

    def fit(self, X, y, sample_weight=None):
        """Run the randomized search and copy the winner's fitted state; returns ``self``."""
        base_ridge = Ridge(
            prior_weight=self.prior_weight,
            normalize_coef=self.normalize_coef,
            fit_intercept=self.fit_intercept,
            normalize=self.normalize,
            # Sparse inputs are routed to the conjugate-gradient solver.
            solver='sparse_cg' if sparse.issparse(X) else 'auto',
        )
        search = RandomizedSearchCV(
            base_ridge,
            {'alpha': self.alphas},
            n_iter=self.n_iter,
            scoring=self.scoring,
            cv=self.cv,
            n_jobs=self.n_jobs,
        )
        search.fit(X, y, sample_weight=sample_weight)

        winner = search.best_estimator_
        self.alpha_ = winner.alpha
        self.best_score_ = search.best_score_
        for attr in self._FITTED_ATTRS:
            setattr(self, attr, getattr(winner, attr))
        return self

    def predict(self, X):
        """Predict through a freshly rebuilt best estimator."""
        best = self.make_best_estimator()
        return best.predict(X)

    def make_best_estimator(self):
        """Return a plain ``Ridge`` carrying the tuned ``alpha`` and fitted state."""
        best = Ridge(
            self.alpha_,
            prior_weight=self.prior_weight,
            normalize_coef=self.normalize_coef,
            fit_intercept=self.fit_intercept,
            normalize=self.normalize,
        )
        for attr in self._FITTED_ATTRS:
            setattr(best, attr, getattr(self, attr))
        return best

In [None]:
# Tune alpha with RidgeCV, then rebind `ridge` to a plain Ridge that carries the
# tuned alpha and the fitted coefficients.
ridge = RidgeCV(normalize=True).fit(X, y).make_best_estimator()
xval_score = cross_val_score(ridge, X, y)
# Print the per-fold scores followed by their mean.
print(xval_score)
print(xval_score.mean())

In [None]:
# upper and lower bounds on alpha
# with adding estimators one at a time; see stacking estimator fit method
from scipy.stats import expon

def compute_alpha_lb(X, y, max_iter=10):
    """Search for a ridge penalty at which every weight is strictly positive.

    A ``Ridge`` with ``prior_weight = 1 / X.shape[1]``, no intercept and
    zero-mean coefficients is refit at candidate penalties; the weight of a
    column is ``coef_ + prior_weight``.

    Parameters:
        X: 2-D feature array (``X.shape[1]`` is read).
        y: regression target.
        max_iter: number of bisection steps performed per scale.

    Returns:
        The ``alpha`` of the last fit of the first scale whose final fit had
        all weights above zero.
    """
    ridge = Ridge(normalize_coef=True, fit_intercept=False, prior_weight=1./X.shape[1])

    def bisect_quantile(scale):
        # Bisect over the quantile of an exponential distribution with the given
        # scale: move to a larger penalty while any weight is negative, smaller otherwise.
        quantile, delta = .5, .25
        alpha_dist = expon(0, scale)
        for _ in range(max_iter):
            ridge.alpha = alpha_dist.ppf(quantile)
            ridge.fit(X, y)
            weight = ridge.coef_ + ridge.prior_weight
            if np.any(weight < 0):
                quantile += delta
            else:
                quantile -= delta
            delta /= 2
        return np.all(weight > 0)

    scale = 1
    while True:
        if bisect_quantile(scale):
            return ridge.alpha
        # The last fit still had a non-positive weight: retry on a 10x wider scale.
        scale *= 10

In [None]:
def compute_alpha_ub(X, y, max_iter=10, tol=.01):
    """Search for a ridge penalty at which every coefficient is within ``tol`` of zero.

    A ``Ridge`` with ``prior_weight = 1 / X.shape[1]``, no intercept and
    zero-mean coefficients is refit at candidate penalties.

    Parameters:
        X: 2-D feature array (``X.shape[1]`` is read).
        y: regression target.
        max_iter: number of bisection steps performed per scale.
        tol: magnitude below which a coefficient counts as negligible.

    Returns:
        The ``alpha`` of the last fit of the first scale whose final fit had
        all ``abs(coef_)`` below ``tol``.
    """
    ridge = Ridge(normalize_coef=True, fit_intercept=False, prior_weight=1./X.shape[1])

    def bisect_quantile(scale):
        # Bisect over the quantile of an exponential distribution with the given
        # scale: move to a larger penalty while any coefficient exceeds tol,
        # smaller otherwise.
        quantile, delta = .5, .25
        alpha_dist = expon(0, scale)
        for _ in range(max_iter):
            ridge.alpha = alpha_dist.ppf(quantile)
            ridge.fit(X, y)
            if np.any(abs(ridge.coef_) > tol):
                quantile += delta
            else:
                quantile -= delta
            delta /= 2
        return np.all(abs(ridge.coef_) < tol)

    scale = 1
    while True:
        if bisect_quantile(scale):
            return ridge.alpha
        # The last fit still had a coefficient at or above tol: retry on a 10x wider scale.
        scale *= 10

In [None]:
from copy import deepcopy

import numpy as np
from joblib import Parallel
from scipy.stats import loguniform
from sklearn.base import clone, is_classifier
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble._base import _fit_single_estimator
from sklearn.model_selection import check_cv, cross_val_predict
from sklearn.utils.fixes import delayed

def _predict_single_estimator(estimator, X):
    return estimator.predict(X)


class _StackingBase(StackingRegressor):
    """Shared helpers for the stacking estimators defined in this notebook.

    Subclasses implement ``fit``. This base provides ``transform`` (one
    prediction column per fitted base estimator) and the helpers that fit the
    base estimators, build the CV splitter and compute cross-validated
    predictions.

    The helpers take the training data explicitly through the keyword
    arguments ``X`` and ``y``. They previously read module-level ``X``/``y``,
    which made every ``fit`` train on the notebook's global data instead of the
    data it was given (for example the training fold inside ``cross_val_score``).
    Callers that omit the keywords still get the old global lookup, with a warning.
    """

    def __init__(self, estimators, cv=None, n_jobs=None, verbose=0):
        super().__init__(estimators, cv=cv, n_jobs=n_jobs, verbose=verbose)

    @staticmethod
    def _data_or_global(value, name):
        """Return ``value``, or the module-level ``name`` when ``value`` is None.

        The fallback exists only for backward compatibility with callers that
        do not pass the training data; it warns because the global may differ
        from the data given to ``fit``.
        """
        if value is not None:
            return value
        import warnings
        warnings.warn(
            "{0} was not passed explicitly; falling back to the module-level "
            "`{0}`, which may not be the data given to fit().".format(name),
            RuntimeWarning,
            stacklevel=3,
        )
        return globals()[name]

    def transform(self, X):
        """Return the base estimators' predictions for ``X`` as meta-features."""
        predictions = Parallel(n_jobs=self.n_jobs)(
            delayed(_predict_single_estimator)(est, X)
            for est in self.estimators_
        )
        return self._concatenate_predictions(X, predictions)

    def _fit_estimators(self, all_estimators, sample_weight, X=None, y=None):
        """Fit a clone of each estimator on ``(X, y)`` and store them in ``estimators_``."""
        X = self._data_or_global(X, 'X')
        y = self._data_or_global(y, 'y')
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_single_estimator)(clone(est), X, y, sample_weight)
            for est in all_estimators
        )

    def _check_cv(self, y=None):
        """Build the CV splitter for ``self.cv``.

        A splitter whose ``random_state`` is None receives a ``RandomState``
        instance so that the ``deepcopy`` handed to each estimator in
        ``_cross_val_predict`` starts from the same random state.
        """
        y = self._data_or_global(y, 'y')
        cv = check_cv(self.cv, y=y, classifier=is_classifier(self))
        if hasattr(cv, 'random_state') and cv.random_state is None:
            cv.random_state = np.random.RandomState()
        return cv

    def _cross_val_predict(self, all_estimators, cv, sample_weight, X=None, y=None):
        """Return cross-validated predictions of every estimator as meta-features."""
        X = self._data_or_global(X, 'X')
        y = self._data_or_global(y, 'y')
        fit_params = (
            {} if sample_weight is None else {'sample_weight': sample_weight}
        )
        predictions = Parallel(n_jobs=self.n_jobs)(
            delayed(cross_val_predict)(
                clone(est), X, y,
                cv=deepcopy(cv),
                n_jobs=self.n_jobs,
                fit_params=fit_params,
                verbose=self.verbose
            )
            for est in all_estimators
        )
        return self._concatenate_predictions(X, predictions)


class StackingRidge(_StackingBase):
    """Stacking regressor whose final estimator is the custom ``Ridge``.

    The final ``Ridge`` is fitted without intercept, with
    ``prior_weight = 1 / n_columns`` and zero-mean coefficients, so the
    effective weights ``coef_ + prior_weight`` sum to one.

    Parameters:
        estimators: list of ``(name, estimator)`` pairs.
        alpha: penalty of the final ``Ridge``.
        cv, n_jobs, verbose: forwarded to the base class.
    """

    def __init__(self, estimators, alpha=1., cv=None, n_jobs=None, verbose=0):
        self.alpha = alpha
        super().__init__(estimators, cv=cv, n_jobs=n_jobs, verbose=verbose)

    def fit(self, X, y, sample_weight=None):
        """Fit the base estimators and the final ``Ridge`` on ``(X, y)``; returns ``self``."""
        _, all_estimators = self._validate_estimators()
        # Pass the data explicitly so nothing is read from notebook globals.
        self._fit_estimators(all_estimators, sample_weight, X=X, y=y)
        cv = self._check_cv(y=y)
        X_meta = self._cross_val_predict(all_estimators, cv, sample_weight, X=X, y=y)

        # fit final estimator on CV predictions
        self.final_estimator_ = Ridge(
            self.alpha,
            fit_intercept=False,
            prior_weight=1./X_meta.shape[1],
            normalize_coef=True
        ).fit(X_meta, y)
        return self


class StackingRidgeCV(_StackingBase):
    """Stacking regressor that tunes the final ``Ridge``'s penalty with ``RidgeCV``.

    The search space is log-uniform between ``compute_alpha_lb`` and
    ``compute_alpha_ub`` evaluated on the cross-validated predictions.

    Parameters:
        estimators: list of ``(name, estimator)`` pairs.
        n_iter: number of penalties sampled by the search.
        cv, n_jobs, verbose: forwarded to the base class.
    """

    def __init__(self, estimators, cv=None, n_iter=10, n_jobs=None, verbose=0):
        self.n_iter = n_iter
        super().__init__(estimators, cv=cv, n_jobs=n_jobs, verbose=verbose)

    def fit(self, X, y, sample_weight=None):
        """Fit the base estimators and tune the final ``Ridge`` on ``(X, y)``; returns ``self``."""
        _, all_estimators = self._validate_estimators()
        # Pass the data explicitly so nothing is read from notebook globals.
        self._fit_estimators(all_estimators, sample_weight, X=X, y=y)
        cv = self._check_cv(y=y)
        X_meta = self._cross_val_predict(all_estimators, cv, sample_weight, X=X, y=y)
        alpha_prior = loguniform(compute_alpha_lb(X_meta, y), compute_alpha_ub(X_meta, y))

        # fit final estimator on CV predictions
        ridge_cv = RidgeCV(
            alpha_prior,
            fit_intercept=False,
            prior_weight=1./X_meta.shape[1],
            normalize_coef=True,
            cv=cv,
            n_jobs=self.n_jobs,
            n_iter=self.n_iter
        )
        # `final_estimator` keeps the fitted search object; `final_estimator_`
        # is the plain Ridge rebuilt from its best result.
        self.final_estimator = ridge_cv.fit(X_meta, y)
        self.final_estimator_ = self.final_estimator.make_best_estimator()
        return self

    def make_best_estimator(self):
        """Return a ``StackingRidge`` carrying the tuned alpha and the fitted estimators."""
        stack = StackingRidge(
            self.estimators, self.final_estimator_.alpha,
            cv=self.cv, n_jobs=self.n_jobs, verbose=self.verbose
        )
        stack.estimators_ = self.estimators_
        stack.final_estimator_ = self.final_estimator_
        return stack

In [None]:
# Stack the four models built in the earlier cells (`ridge`, `knn`, `svr`, `rf`).
estimators = [('ridge', ridge), ('knn', knn), ('svr', svr), ('rf', rf)]
# Tune the stacking penalty, then keep the resulting StackingRidge as `stack`.
stack = StackingRidgeCV(estimators).fit(X, y).make_best_estimator()

In [None]:
# Cross-validate the stacked model; print the per-fold scores followed by their mean.
xval_score = cross_val_score(stack, X, y)
print(xval_score)
print(xval_score.mean())

In [None]:
# Refit the stack on (X, y) and inspect the final Ridge.
stack.fit(X, y)
# Sum of the fitted coefficient offsets.
print(stack.final_estimator_.coef_.sum())
# Displayed value: the offsets plus the prior weight.
stack.final_estimator_.coef_ + stack.final_estimator_.prior_weight

In [None]:
from sklearn.linear_model import LinearRegression


class ConstrainedLinearRegression(LinearRegression):
    """Intercept-free linear regression whose coefficients sum to ``constraint``.

    The constraint is enforced by substitution: with ``w_0 = constraint -
    sum(w_1..w_k)`` the model ``y = sum(w_i * x_i)`` becomes
    ``y - constraint * x_0 = sum(w_i * (x_i - x_0))`` for ``i >= 1``, which is
    an unconstrained least-squares problem in the remaining coefficients.

    Parameters:
        constraint: required sum of the coefficients.
        normalize, copy_X, n_jobs: forwarded to ``LinearRegression``.
    """

    def __init__(self, constraint=0, normalize=False, copy_X=True, n_jobs=None):
        self.constraint = constraint
        super().__init__(fit_intercept=False, normalize=normalize, copy_X=copy_X, n_jobs=n_jobs)

    def fit(self, X, y, sample_weight=None):
        """Fit the constrained coefficients; returns ``self``.

        ``X`` must be 2-D; objects exposing ``.values`` are converted to
        their underlying array first.
        """
        if hasattr(X, 'values'):
            X = X.values
        if X.shape[1] == 1:
            # With a single column the sum constraint fully determines the
            # coefficient. The previous hard-coded 1 ignored `constraint` and
            # disagreed with the general branch below.
            self.coef_ = np.array([self.constraint], dtype=float)
            return self
        X_0, X_rest = X[:, 0], X[:, 1:]
        # Express every other column relative to the first one.
        X_rest = (X_rest.T - X_0).T
        y = y - self.constraint * X_0
        super().fit(X_rest, y, sample_weight)
        # Recover the first coefficient from the constraint and prepend it.
        self.coef_ = np.insert(self.coef_, 0, self.constraint - self.coef_.sum())
        return self

    def predict(self, X):
        """Return ``X @ coef_`` (no intercept)."""
        return X @ self.coef_

In [None]:
from copy import deepcopy

from sklearn.ensemble import VotingRegressor

class VotingRegressorRFECV(_StackingBase):
    """Select and weight base estimators by recursive elimination.

    A ``ConstrainedLinearRegression`` with coefficients summing to one is
    fitted on the estimators' cross-validated predictions. The estimator with
    the smallest coefficient is dropped, and the process repeats until none
    remain. The subset with the best mean cross-validated score is kept.

    After ``fit``: ``best_score_`` is that score, ``names_`` the kept
    estimator names, ``estimators_`` the kept estimators refitted on the full
    training data and ``final_estimator_`` the regression holding their weights.
    """

    @staticmethod
    def _rfe_scores(linear_reg, X_meta, y, cv, estimators):
        """Run the elimination loop.

        Returns a list of ``(score, remaining_estimators, coef_)`` tuples
        sorted by score in descending order. ``estimators`` is not mutated.
        """
        remaining = list(estimators)
        rfe_progress = []
        while remaining:
            score = cross_val_score(linear_reg, X_meta, y, cv=cv).mean()
            linear_reg.fit(X_meta, y)
            rfe_progress.append((score, remaining.copy(), linear_reg.coef_))
            # Drop the estimator that currently receives the smallest weight.
            drop_idx = int(np.argmin(linear_reg.coef_))
            remaining.pop(drop_idx)
            X_meta = np.delete(X_meta, drop_idx, axis=1)
        rfe_progress.sort(key=lambda entry: entry[0], reverse=True)
        return rfe_progress

    def fit(self, X, y, sample_weight=None):
        """Select, weight and refit the base estimators on ``(X, y)``; returns ``self``.

        Everything is computed from the ``X`` and ``y`` passed in. The base
        class helpers as originally written read module-level ``X``/``y``, so
        the splitter, the cross-validated predictions and the final fits are
        built here directly from the arguments.
        """
        names, all_estimators = self._validate_estimators()

        # Build the splitter. A splitter whose random_state is None gets a
        # RandomState instance so each deepcopy below starts from the same state.
        cv = check_cv(self.cv, y=y, classifier=is_classifier(self))
        if hasattr(cv, 'random_state') and cv.random_state is None:
            cv.random_state = np.random.RandomState()

        # Cross-validated predictions of every candidate on the given data.
        fit_params = (
            {} if sample_weight is None else {'sample_weight': sample_weight}
        )
        predictions = Parallel(n_jobs=self.n_jobs)(
            delayed(cross_val_predict)(
                clone(est), X, y,
                cv=deepcopy(cv),
                n_jobs=self.n_jobs,
                fit_params=fit_params,
                verbose=self.verbose
            )
            for est in all_estimators
        )
        X_meta = self._concatenate_predictions(X, predictions)

        linear_reg = ConstrainedLinearRegression(1)
        rfe_progress = self._rfe_scores(
            linear_reg, X_meta, y, cv, list(zip(names, all_estimators))
        )
        # Best-scoring subset and the weights fitted for it.
        self.best_score_, selected, linear_reg.coef_ = rfe_progress[0]
        self.names_, selected_estimators = zip(*selected)

        # Refit only the selected estimators on the full training data.
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_single_estimator)(clone(est), X, y, sample_weight)
            for est in selected_estimators
        )
        self.final_estimator_ = linear_reg
        return self

    def make_best_estimator(self):
        """Return an unfitted ``VotingRegressor`` over the kept estimators and weights."""
        estimators = list(zip(self.names_, self.estimators_))
        return VotingRegressor(
            estimators, weights=self.final_estimator_.coef_,
            n_jobs=self.n_jobs, verbose=self.verbose
        )

In [None]:
# Run the elimination-based selection over the four models built in the earlier cells.
estimators = [('knn', knn), ('svr', svr), ('ridge', ridge), ('rf', rf)]
stack = VotingRegressorRFECV(estimators)
stack.fit(X, y)

In [None]:
# Display the estimators that survived the elimination, as refitted by `fit`.
stack.estimators_

In [None]:
# Display the mean cross-validated score of the selected subset.
stack.best_score_

In [None]:
# Display the weights assigned to the selected estimators.
stack.final_estimator_.coef_

In [None]:
# Rebind `reg` (previously the AutoRegressor result) to a VotingRegressor built
# from the selected estimators and their weights, then display it.
reg = stack.make_best_estimator()
reg

In [None]:
# Display the mean cross-validated score of the weighted voting ensemble.
cross_val_score(reg, X, y).mean()