In [1]:
from ml_inference import BaselineRegressor, Preprocessor

import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None
# NOTE(review): no random seed is set in the visible cells, so the randomized
# searches further down are not expected to reproduce the recorded outputs exactly.
# Input CSV read by the next cell; the path is relative to the working directory.
INFILE = '../data/pennycook_et_al_study2_clean.csv'

In [2]:
# Load the study data and split it into predictors, treatment indicator and target.
df = pd.read_csv(INFILE)
X = df.drop(columns='Diff')   # every column except the target
treat = df.Treatment          # treatment column (it also remains inside X)
y = df.Diff                   # regression target
df.head()

Unnamed: 0,SharingType_1,SharingType_2,SharingType_3,SharingType_4,SharingType_6,SharingType_5,SocialMedia_1,SocialMedia_2,SocialMedia_3,SocialMedia_4,...,Party_Republican,POTUS2016_Clinton,POTUS2016_Didn't vote but could have,POTUS2016_Didn't vote in protest,POTUS2016_Other,POTUS2016_Trump,POTUS2016_Unable to vote,Diff,Party,POTUS2016
0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.2,Democrat,Clinton
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.666667,Republican,Trump
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Independent,Clinton
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Democrat,Clinton
4,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.2,Democrat,Clinton


In [3]:
# Mean cross-validated score of the project's BaselineRegressor on (X, y);
# the later cells compare their models against this number.
cross_val_score(BaselineRegressor(), X, y).mean()

-0.0496248982786037

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
    
# Categorical columns to one-hot encode, with the category set of each column
# taken from the full data so every fold sees the same output columns.
dummy_cols = ['Party', 'POTUS2016']
categories = [X[name].unique() for name in dummy_cols]
encoder = OneHotEncoder(categories=categories, sparse=False)
encoder.fit(X[dummy_cols])
# Names of the generated indicator columns.
feature_names = list(encoder.get_feature_names(dummy_cols))
    
class Encoder(Preprocessor):
    """Pipeline step that swaps the categorical columns for one-hot indicators.

    Uses the module-level ``encoder`` (already fitted on the full data),
    ``dummy_cols`` and ``feature_names``. ``fit`` is inherited from
    ``Preprocessor``, which is defined outside this notebook.
    """

    def transform(self, X):
        """Return a new frame with ``dummy_cols`` replaced by ``feature_names``.

        X: DataFrame containing every column listed in ``dummy_cols``.

        The input frame is left untouched: the indicator columns are written to
        a copy rather than to the caller's object.
        """
        encoded = encoder.transform(X[dummy_cols])
        # Work on a copy so the caller's DataFrame is not modified in place.
        X = X.copy()
        X[feature_names] = encoded
        return X.drop(columns=dummy_cols)
    
# Plain linear regression on the one-hot encoded features, scored by cross-validation.
linear_reg = make_pipeline(Encoder(), LinearRegression())
cross_val_score(linear_reg, X, y).mean()

0.04675756802886222

In [78]:
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, VotingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LassoLars, Ridge, ElasticNet
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import SVR
from scipy.stats import expon, uniform, poisson, randint
from xgboost import XGBRegressor


class Tuner():
    """Base class for a randomized hyperparameter search over one pipeline.

    Subclasses provide ``make_estimator(**params)``, returning the estimator to
    search over, and ``get_param_distributions(X, y)``, returning the
    ``param_distributions`` mapping passed to ``RandomizedSearchCV``.

    After ``tune`` has run, ``best_params_`` is a list of
    ``(mean_test_score, params)`` tuples sorted from best to worst score.
    """

    def __init__(self, preprocess=[]):
        """Store the preprocessing steps placed at the front of the pipeline.

        preprocess: a single step or a list of steps; a non-list value is
            wrapped in a one-element list.
        """
        self.preprocess = preprocess if isinstance(preprocess, list) else [preprocess]

    def tune(self, X, y, n_iter=10):
        """Run the randomized search and record every sampled candidate.

        X, y: training data handed to ``RandomizedSearchCV.fit``.
        n_iter: number of parameter settings to sample.

        Returns ``self``.
        """
        param_distributions = self.get_param_distributions(X, y)
        search = RandomizedSearchCV(self.make_estimator(), param_distributions, n_iter=n_iter).fit(X, y)
        results = list(zip(search.cv_results_['mean_test_score'], search.cv_results_['params']))
        # Highest mean test score first.
        self.best_params_ = sorted(results, key=lambda x: -x[0])
        return self

    def make_best_estimator(self, q=0, return_score=False):
        """Build an unfitted estimator from the candidate at quantile ``q``.

        q: position in the ranked candidates, 0 for the best and 1 for the worst.
        return_score: when true, return ``(score, estimator)``; otherwise
            return the estimator alone.
        """
        idx = round(q*(len(self.best_params_)-1))
        score, params = self.best_params_[idx]
        est = self.make_estimator(**params)
        # Parenthesised on purpose: without the parentheses the conditional
        # binds only to the second tuple element and a tuple is always returned.
        return (score, est) if return_score else est

    def rm_params(self, q=0):
        """Remove and return the ``(score, params)`` entry at quantile ``q``."""
        idx = round(q*(len(self.best_params_)-1))
        return self.best_params_.pop(idx)


class RandomForestRegressorTuner(Tuner):
    """Tuner for a PolynomialFeatures -> PCA -> RandomForestRegressor pipeline."""

    def make_estimator(self, **params):
        """Assemble the pipeline behind any preprocess steps and apply ``params``."""
        steps = [*self.preprocess, PolynomialFeatures(), PCA(), RandomForestRegressor()]
        pipeline = make_pipeline(*steps)
        pipeline.set_params(**params)
        return pipeline

    def get_param_distributions(self, X, y):
        """Return the search space for the pipeline; ``y`` is not used."""
        n_features = X.shape[1]
        space = {
            'polynomialfeatures__degree': [1, 2],
            # Candidate component counts 1 .. n_features - 1.
            'pca__n_components': list(range(1, n_features)),
            # scipy's poisson(mu, loc): mu=1 shifted by loc=2**5.
            'randomforestregressor__n_estimators': poisson(1, 2**5)
        }
        return space
    
    
class LassoLarsTuner(Tuner):
    """Tuner for a PolynomialFeatures -> PCA -> LassoLars pipeline."""

    def make_estimator(self, **params):
        """Assemble the pipeline behind any preprocess steps and apply ``params``."""
        steps = [*self.preprocess, PolynomialFeatures(), PCA(), LassoLars(normalize=True)]
        pipeline = make_pipeline(*steps)
        pipeline.set_params(**params)
        return pipeline

    def get_param_distributions(self, X, y):
        """Return the search space for the pipeline; ``y`` is not used."""
        n_features = X.shape[1]
        space = {
            'polynomialfeatures__degree': [1, 2],
            # Candidate component counts 1 .. n_features - 1.
            'pca__n_components': list(range(1, n_features)),
            # Exponential distribution with loc=0, scale=1.
            'lassolars__alpha': expon(0, 1)
        }
        return space
    
    
class RidgeTuner(Tuner):
    """Tuner for a PolynomialFeatures -> PCA -> Ridge pipeline."""

    def make_estimator(self, **params):
        """Assemble the pipeline behind any preprocess steps and apply ``params``."""
        steps = [*self.preprocess, PolynomialFeatures(), PCA(), Ridge(normalize=True)]
        pipeline = make_pipeline(*steps)
        pipeline.set_params(**params)
        return pipeline

    def get_param_distributions(self, X, y):
        """Return the search space for the pipeline; ``y`` is not used."""
        n_features = X.shape[1]
        space = {
            'polynomialfeatures__degree': [1, 2],
            # Candidate component counts 1 .. n_features - 1.
            'pca__n_components': list(range(1, n_features)),
            # Exponential distribution with loc=0, scale=1.
            'ridge__alpha': expon(0, 1)
        }
        return space
    
    
class ElasticNetTuner(Tuner):
    """Tuner for a PolynomialFeatures -> PCA -> ElasticNet pipeline."""

    def make_estimator(self, **params):
        """Assemble the pipeline behind any preprocess steps and apply ``params``."""
        steps = [*self.preprocess, PolynomialFeatures(), PCA(), ElasticNet(normalize=True)]
        pipeline = make_pipeline(*steps)
        pipeline.set_params(**params)
        return pipeline

    def get_param_distributions(self, X, y):
        """Return the search space for the pipeline; ``y`` is not used."""
        n_features = X.shape[1]
        space = {
            'polynomialfeatures__degree': [1, 2],
            # Candidate component counts 1 .. n_features - 1.
            'pca__n_components': list(range(1, n_features)),
            # Exponential distribution with loc=0, scale=1.
            'elasticnet__alpha': expon(0, 1),
            # Uniform on [0, 1].
            'elasticnet__l1_ratio': uniform(0, 1)
        }
        return space
    
    
class KernelRidgeTuner(Tuner):
    """Tuner for a PolynomialFeatures -> PCA -> StandardScaler -> KernelRidge pipeline."""

    def make_estimator(self, **params):
        """Assemble the pipeline behind any preprocess steps and apply ``params``."""
        steps = [
            *self.preprocess,
            PolynomialFeatures(),
            PCA(),
            StandardScaler(),
            KernelRidge(),
        ]
        pipeline = make_pipeline(*steps)
        pipeline.set_params(**params)
        return pipeline

    def get_param_distributions(self, X, y):
        """Return the search space for the pipeline; ``y`` is not used."""
        n_features = X.shape[1]
        space = {
            'polynomialfeatures__degree': [1, 2],
            # Candidate component counts 1 .. n_features - 1.
            'pca__n_components': list(range(1, n_features)),
            # Exponential distribution with loc=0, scale=1.
            'kernelridge__alpha': expon(0, 1),
            # Degrees 2, 3 and 4.
            'kernelridge__degree': list(range(2, 5)),
            'kernelridge__kernel': ['linear', 'poly', 'rbf', 'laplacian']
        }
        return space
    
    
class SVRTuner(Tuner):
    """Tuner for a PolynomialFeatures -> PCA -> StandardScaler -> SVR pipeline."""

    def make_estimator(self, **params):
        """Assemble the pipeline behind any preprocess steps and apply ``params``."""
        steps = [
            *self.preprocess,
            PolynomialFeatures(),
            PCA(),
            StandardScaler(),
            SVR(),
        ]
        pipeline = make_pipeline(*steps)
        pipeline.set_params(**params)
        return pipeline

    def get_param_distributions(self, X, y):
        """Return the search space for the pipeline; ``y`` is not used."""
        n_features = X.shape[1]
        space = {
            'polynomialfeatures__degree': [1, 2],
            # Candidate component counts 1 .. n_features - 1.
            'pca__n_components': list(range(1, n_features)),
            # Exponential distribution with loc=0, scale=1.
            'svr__C': expon(0, 1),
            # Degrees 2, 3 and 4.
            'svr__degree': list(range(2, 5)),
            'svr__kernel': ['linear', 'poly', 'rbf'],
        }
        return space
    
class KNeighborsRegressorTuner(Tuner):
    """Tuner for a PolynomialFeatures -> PCA -> StandardScaler -> KNeighborsRegressor pipeline."""

    def make_estimator(self, **params):
        """Assemble the pipeline behind any preprocess steps and apply ``params``."""
        steps = [
            *self.preprocess,
            PolynomialFeatures(),
            PCA(),
            StandardScaler(),
            KNeighborsRegressor(),
        ]
        pipeline = make_pipeline(*steps)
        pipeline.set_params(**params)
        return pipeline

    def get_param_distributions(self, X, y):
        """Return the search space for the pipeline; ``y`` is not used."""
        n_samples, n_features = X.shape[0], X.shape[1]
        space = {
            'polynomialfeatures__degree': [1, 2],
            # Candidate component counts 1 .. n_features - 1.
            'pca__n_components': list(range(1, n_features)),
            # Neighbour counts drawn from 1 up to 5% of the number of rows.
            'kneighborsregressor__n_neighbors': randint(1, .05*n_samples),
            'kneighborsregressor__weights': ['uniform', 'distance']
        }
        return space
    
    
class AdaBoostRegressorTuner(Tuner):
    """Tuner for a PolynomialFeatures -> PCA -> AdaBoostRegressor pipeline."""

    def make_estimator(self, **params):
        """Assemble the pipeline behind any preprocess steps and apply ``params``."""
        steps = [*self.preprocess, PolynomialFeatures(), PCA(), AdaBoostRegressor()]
        pipeline = make_pipeline(*steps)
        pipeline.set_params(**params)
        return pipeline

    def get_param_distributions(self, X, y):
        """Return the search space for the pipeline; ``y`` is not used."""
        n_features = X.shape[1]
        space = {
            'polynomialfeatures__degree': [1, 2],
            # Candidate component counts 1 .. n_features - 1.
            'pca__n_components': list(range(1, n_features)),
            # scipy's poisson(mu, loc): mu=1 shifted by loc=2**5.
            'adaboostregressor__n_estimators': poisson(1, 2**5)
        }
        return space
    
class XGBRegressorTuner(Tuner):
    """Tuner for a PolynomialFeatures -> PCA -> XGBRegressor pipeline."""

    def make_estimator(self, **params):
        """Assemble the pipeline behind any preprocess steps and apply ``params``."""
        steps = [*self.preprocess, PolynomialFeatures(), PCA(), XGBRegressor()]
        pipeline = make_pipeline(*steps)
        pipeline.set_params(**params)
        return pipeline

    def get_param_distributions(self, X, y):
        """Return the search space for the pipeline; ``y`` is not used."""
        n_features = X.shape[1]
        space = {
            'polynomialfeatures__degree': [1, 2],
            # Candidate component counts 1 .. n_features - 1.
            'pca__n_components': list(range(1, n_features)),
            # The expon(0, 1) entries are exponential with loc=0, scale=1.
            'xgbregressor__gamma': expon(0, 1),
            # Depths 0 through 9.
            'xgbregressor__max_depth': list(range(10)),
            'xgbregressor__min_child_weight': expon(0, 1),
            'xgbregressor__max_delta_step': expon(0, 1),
            'xgbregressor__lambda': expon(0, 1),
            'xgbregressor__alpha': expon(0, 1),
        }
        return space
    
# Example tuner that uses the one-hot Encoder as its only preprocessing step.
tuner = XGBRegressorTuner(preprocess=Encoder())
# Manual smoke test of a single tuner, left disabled:
# tuner.make_estimator().get_params()
# tuner.tune(X, y, n_iter=2)
# tuner.best_params_

In [95]:
def make_default_tuners():
    """Return one freshly constructed tuner of every kind, with no preprocessing."""
    tuner_classes = (
        RandomForestRegressorTuner,
        LassoLarsTuner,
        RidgeTuner,
        ElasticNetTuner,
        KernelRidgeTuner,
        SVRTuner,
        KNeighborsRegressorTuner,
        AdaBoostRegressorTuner,
        XGBRegressorTuner,
    )
    return [tuner_cls() for tuner_cls in tuner_classes]

class AutoML(VotingRegressor):
    """Voting ensemble assembled greedily from the candidates found by tuners.

    ``tune`` runs every tuner, then repeatedly adds the candidate estimator
    that most improves the cross-validated score of the whole ensemble, and
    stops when no candidate improves it.

    tuners: list of ``Tuner`` instances; an empty list selects
        ``make_default_tuners()``.
    preprocess: a single preprocessing step or a list of steps applied to X
        before the voting regressor sees it.
    estimators, weights, n_jobs, verbose: forwarded to ``VotingRegressor``.
    """

    def __init__(self, tuners=[], preprocess=[], estimators=[], weights=[], n_jobs=None, verbose=False):
        # Lists passed in are stored as-is (not copied).
        self.tuners = tuners or make_default_tuners()
        self.preprocess = preprocess if isinstance(preprocess, list) else [preprocess]
        super().__init__(estimators, weights=weights, n_jobs=n_jobs, verbose=verbose)

    def fit(self, X, y, *args, **kwargs):
        """Fit the preprocessing steps on X, then fit the voting ensemble."""
        return super().fit(self.preprocess_X(X), y, *args, **kwargs)

    def predict(self, X, *args, **kwargs):
        """Transform X with the already-fitted preprocessing steps and predict."""
        # fit=False: the preprocessing steps must not be re-fitted on the data
        # being predicted.
        return super().predict(self.preprocess_X(X, fit=False), *args, **kwargs)

    def preprocess_X(self, X, verbose=False, fit=True):
        """Run X through every preprocessing step in order.

        verbose: accepted but not used.
        fit: when true (the default) each step is fitted on its input before
            transforming it; when false the step is only applied.
        """
        for preprocessor in self.preprocess:
            if fit:
                X = preprocessor.fit(X).transform(X)
            else:
                X = preprocessor.transform(X)
        return X

    def tune(self, X, y, n_iter=10, quantiles=[0, .1, .2]):
        """Tune every tuner and greedily build the ensemble.

        X, y: training data.
        n_iter: number of parameter settings each tuner samples.
        quantiles: positions in each tuner's ranked candidates that are tried
            as ensemble members (0 is the tuner's best candidate).

        Returns ``self`` with ``estimators`` and ``weights`` set to the
        selected ensemble.
        """
        def tune_estimators():
            # Each tuner searches on the preprocessed data.
            for i, tuner in enumerate(self.tuners):
                print('\nRunning tuner {} of {}'.format(i+1, len(self.tuners)))
                tuner.tune(X_preproc, y, n_iter=n_iter)
                print('Best estimator score: {:.4f}'.format(tuner.best_params_[0][0]))

        def add_estimator(i, best_score):
            # Permanently add the best candidate of this round, if there is one.
            print('\nAdding estimator {}'.format(i+1))
            tuner, estimator, weight, score, q = get_best_estimator(i, best_score)
            print('Best ensemble score: {:.4f}'.format(score))
            if estimator is not None:
                # Drop the chosen parameter set so it cannot be selected again.
                tuner.rm_params(q)
                estimators.append(('estimator_'+str(i), estimator))
                weights.append(weight)
                self.set_params(estimators=estimators, weights=weights)
            return score

        def get_best_estimator(i, best_score):
            # Try each remaining candidate as one extra ensemble member and
            # keep the one with the highest cross-validated ensemble score.
            best_tuner, best_estimator, best_weight, best_quantile = None, None, None, None
            for tuner in self.tuners:
                if tuner.best_params_:
                    for q in quantiles:
                        score, estimator = tuner.make_best_estimator(q, return_score=True)
                        # A member's weight is its margin over the baseline;
                        # candidates that do not beat the baseline are skipped.
                        weight = score - baseline_score
                        if weight > 0:
                            weights.append(weight)
                            estimators.append(('estimator_'+str(i), estimator))
                            self.set_params(estimators=estimators, weights=weights)
                            # Raw X is passed here; fit/predict preprocess it.
                            cv_score = cross_val_score(self, X, y).mean()
                            if cv_score > best_score:
                                best_tuner = tuner
                                best_estimator = estimator
                                best_weight = weight
                                best_score = cv_score
                                best_quantile = q
                            # Undo the trial addition.
                            # NOTE(review): this presumably relies on set_params
                            # keeping a reference to these same list objects - confirm.
                            estimators.pop(), weights.pop()
            return best_tuner, best_estimator, best_weight, best_score, best_quantile

        X_preproc = self.preprocess_X(X)
        tune_estimators()
        baseline_score = cross_val_score(BaselineRegressor(), X_preproc, y).mean()
        estimators, weights = [], []
        i, best_score = 0, -np.inf
        # Keep adding members until a round fails to improve the ensemble score.
        while True:
            score = add_estimator(i, best_score)
            if score <= best_score:
                break
            best_score = score
            i += 1
        return self

    def get_params(self, **kwargs):
        """Return the parent's parameters plus ``tuners`` and ``preprocess``."""
        params = super().get_params(**kwargs)
        params.update({
            'tuners': self.tuners,
            'preprocess': self.preprocess
        })
        return params

    def set_params(self, tuners=[], preprocess=[], **params):
        """Set ``tuners``/``preprocess`` when non-empty; forward the rest to the parent."""
        if tuners:
            self.tuners = tuners
        if preprocess:
            self.preprocess = preprocess if isinstance(preprocess, list) else [preprocess]
        return super().set_params(**params)
        
# Build the ensemble: each default tuner samples 2**5 settings, and the
# candidates at the listed quantiles of each tuner's ranking are tried as members.
automl = AutoML(preprocess=Encoder()).tune(X, y, n_iter=2**5, quantiles=[0, .1, .2, .3])


Running tuner 1 of 9
Best estimator score: 0.0452

Running tuner 2 of 9
Best estimator score: -0.0496

Running tuner 3 of 9
Best estimator score: 0.0823

Running tuner 4 of 9
Best estimator score: -0.0496

Running tuner 5 of 9
Best estimator score: 0.1061

Running tuner 6 of 9
Best estimator score: 0.1257

Running tuner 7 of 9
Best estimator score: 0.0699

Running tuner 8 of 9
Best estimator score: 0.0563

Running tuner 9 of 9
Best estimator score: 0.0666

Adding estimator 1
Best ensemble score: 0.1254

Adding estimator 2
Best ensemble score: 0.1393

Adding estimator 3
Best ensemble score: 0.1468

Adding estimator 4
Best ensemble score: 0.1498

Adding estimator 5
Best ensemble score: 0.1498


In [96]:
# automl = AutoML(preprocess=Encoder()).tune(X, y)
# NOTE(review): automl was tuned on this same (X, y) in the previous cell, so
# this score is not an out-of-sample estimate of the whole tuning procedure.
cross_val_score(automl, X, y).mean()

0.14354502276456418