In [1]:
from ml_inference import BaselineRegressor, Preprocessor

import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score

# Relative path to the cleaned study-2 CSV that the next cell loads.
INFILE = '../data/pennycook_et_al_study2_clean.csv'
# Disable pandas' chained-assignment warning for the whole session.
pd.set_option('mode.chained_assignment', None)

In [2]:
# Load the dataset. `Diff` is the regression target; every other column
# (including `Treatment`) stays in the feature frame `X`.
df = pd.read_csv(INFILE)
y = df['Diff']
treat = df['Treatment']
X = df.drop(columns=['Diff'])
df.head()

Unnamed: 0,SharingType_1,SharingType_2,SharingType_3,SharingType_4,SharingType_6,SharingType_5,SocialMedia_1,SocialMedia_2,SocialMedia_3,SocialMedia_4,...,Party_Republican,POTUS2016_Clinton,POTUS2016_Didn't vote but could have,POTUS2016_Didn't vote in protest,POTUS2016_Other,POTUS2016_Trump,POTUS2016_Unable to vote,Diff,Party,POTUS2016
0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.2,Democrat,Clinton
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.666667,Republican,Trump
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Independent,Clinton
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Democrat,Clinton
4,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.2,Democrat,Clinton


In [3]:
# Mean cross-validated score of the baseline regressor: the reference point
# for the models fitted below.
np.mean(cross_val_score(BaselineRegressor(), X, y))

-0.0496248982786037

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
    
# Categorical columns that are expanded into one-hot indicator columns.
dummy_cols = ['Party', 'POTUS2016']
# One array of observed values per categorical column, taken from the full frame.
categories = [X[name].unique() for name in dummy_cols]
# Fit the encoder once on the full frame with dense (non-sparse) output.
encoder = OneHotEncoder(categories=categories, sparse=False)
encoder.fit(X[dummy_cols])
# Names of the generated indicator columns, as reported by the fitted encoder.
feature_names = list(encoder.get_feature_names(dummy_cols))
    
class Encoder(Preprocessor):
    """Preprocessor that one-hot encodes the `dummy_cols` columns.

    Relies on the module-level `encoder` (already fitted), `dummy_cols` and
    `feature_names`; only `transform` is overridden here.
    """
    def transform(self, X):
        """Return a new frame with `dummy_cols` replaced by indicator columns.

        The indicator columns are named by `feature_names`; existing columns
        with those names are overwritten, others are appended. The input
        frame is left untouched: the work is done on a copy, so callers
        (e.g. a frame shared across notebook cells) never see columns being
        added behind their back.
        """
        encoded = X.copy()
        encoded[feature_names] = encoder.transform(encoded[dummy_cols])
        return encoded.drop(columns=dummy_cols)
    
# Simple reference model: one-hot encode the categorical columns, then fit
# an ordinary linear regression; report its mean cross-validated score.
linear_reg = make_pipeline(Encoder(), LinearRegression())
np.mean(cross_val_score(linear_reg, X, y))

0.04675756802886222

In [5]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LassoLars, Ridge, ElasticNet
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from scipy.stats import expon, uniform, poisson


class Tuner():
    """Base class for randomized hyper-parameter search over a pipeline.

    Subclasses must implement:
      * `make_estimator(**params)` -- build the pipeline and apply `params`;
      * `get_param_distributions(X, y)` -- return the search space.

    After `tune`, `best_params_` holds `(mean_test_score, params)` tuples
    sorted from best to worst score.
    """
    def __init__(self, preprocess=[]):
        """Store optional preprocessing steps (a single step or a list of steps)."""
        # Copy list input so instances never share (or mutate) the default list.
        self.preprocess_ = list(preprocess) if isinstance(preprocess, list) else [preprocess]

    def tune(self, X, y, n_iter=10):
        """Run a randomized search with `n_iter` draws and rank the candidates.

        Returns `self` so calls can be chained.
        """
        param_distributions = self.get_param_distributions(X, y)
        search = RandomizedSearchCV(self.make_estimator(), param_distributions, n_iter=n_iter).fit(X, y)
        scored_params = list(zip(search.cv_results_['mean_test_score'], search.cv_results_['params']))
        # Highest mean test score first.
        self.best_params_ = sorted(scored_params, key=lambda pair: -pair[0])
        return self

    def make_best_estimator(self, return_score=False):
        """Build an (unfitted) estimator from the top-ranked parameter set.

        Returns the estimator alone, or `(score, estimator)` when
        `return_score` is true. Raises IndexError if `best_params_` is empty.
        """
        score, params = self.best_params_[0]
        est = self.make_estimator(**params)
        # Parenthesised on purpose: without the parentheses the conditional
        # binds only to the last tuple element and a tuple is always returned.
        return (score, est) if return_score else est


class RandomForestRegressorTuner(Tuner):
    """Tuner for a polynomial-features -> PCA -> random-forest pipeline."""

    def make_estimator(self, **params):
        """Build the pipeline (preprocessing steps first) and apply `params`."""
        steps = list(self.preprocess_)
        steps.extend([PolynomialFeatures(), PCA(), RandomForestRegressor()])
        pipeline = make_pipeline(*steps)
        return pipeline.set_params(**params)

    def get_param_distributions(self, X, y):
        """Return the random-search space; PCA sizes depend on X's column count."""
        n_columns = X.shape[1]
        search_space = {}
        search_space['polynomialfeatures__degree'] = [1, 2]
        search_space['pca__n_components'] = list(range(1, n_columns))
        # scipy's poisson takes (mu, loc): Poisson(mu=1) draws shifted by loc.
        search_space['randomforestregressor__n_estimators'] = poisson(1, 2**5)
        search_space['randomforestregressor__max_depth'] = poisson(1, 2**3)
        return search_space
    
    
class LassoLarsTuner(Tuner):
    """Tuner for a polynomial-features -> PCA -> LassoLars pipeline."""

    def make_estimator(self, **params):
        """Build the pipeline (preprocessing steps first) and apply `params`."""
        model_steps = (PolynomialFeatures(), PCA(), LassoLars(normalize=True))
        pipeline = make_pipeline(*self.preprocess_, *model_steps)
        return pipeline.set_params(**params)

    def get_param_distributions(self, X, y):
        """Return the random-search space; PCA sizes depend on X's column count."""
        n_columns = X.shape[1]
        return dict([
            ('polynomialfeatures__degree', [1, 2]),
            ('pca__n_components', list(range(1, n_columns))),
            # expon(0, 1): scipy exponential distribution with loc=0, scale=1.
            ('lassolars__alpha', expon(0, 1)),
        ])
    
    
class RidgeTuner(Tuner):
    """Tuner for a polynomial-features -> PCA -> Ridge pipeline."""

    def make_estimator(self, **params):
        """Build the pipeline (preprocessing steps first) and apply `params`."""
        regressor = Ridge(normalize=True)
        pipeline = make_pipeline(*self.preprocess_, PolynomialFeatures(), PCA(), regressor)
        pipeline.set_params(**params)
        return pipeline

    def get_param_distributions(self, X, y):
        """Return the random-search space; PCA sizes depend on X's column count."""
        component_choices = list(range(1, X.shape[1]))
        # expon(0, 1): scipy exponential distribution with loc=0, scale=1.
        alpha_distribution = expon(0, 1)
        return {
            'polynomialfeatures__degree': [1, 2],
            'pca__n_components': component_choices,
            'ridge__alpha': alpha_distribution,
        }

In [39]:
def make_default_tuners():
    """Return fresh instances of the tuners used when none are supplied."""
    default_tuner_classes = (RandomForestRegressorTuner, RidgeTuner)
    return [tuner_cls() for tuner_cls in default_tuner_classes]

class AutoML(VotingRegressor):
    """Voting ensemble assembled greedily from tuned candidate pipelines.

    `tune` runs every tuner, then repeatedly adds the candidate estimator
    that most improves the cross-validated score of the whole ensemble.
    `fit` / `predict` behave like VotingRegressor's, except that X is first
    passed through the `preprocess` steps.

    Parameters
    ----------
    tuners : list of tuner objects; `make_default_tuners()` is used when empty.
    preprocess : a single preprocessor or a list of them, applied in order.
    estimators, weights, n_jobs, verbose : forwarded to VotingRegressor.
    """
    def __init__(self, tuners=[], preprocess=[], estimators=[], weights=[], n_jobs=None, verbose=False):
        # NOTE(review): the list arguments are intentionally stored as passed
        # (no defensive copies) -- sklearn's clone() appears to check that
        # constructor arguments are kept by identity, and cross_val_score
        # clones this estimator. Confirm before changing.
        self.tuners = tuners or make_default_tuners()
        self.preprocess = preprocess if isinstance(preprocess, list) else [preprocess]
        super().__init__(estimators, weights=weights, n_jobs=n_jobs, verbose=verbose)

    def fit(self, X, y, *args, **kwargs):
        """Fit the preprocessors on X, transform it, then fit the ensemble."""
        return super().fit(self.preprocess_X(X), y, *args, **kwargs)

    def predict(self, X, *args, **kwargs):
        """Transform X with the already-fitted preprocessors and predict.

        The preprocessors are NOT re-fitted here, so prediction data never
        influences the preprocessing learned during `fit`.
        """
        return super().predict(self.preprocess_X(X, fit=False), *args, **kwargs)

    def preprocess_X(self, X, verbose=False, fit=True):
        """Run X through every preprocessing step in order.

        Parameters
        ----------
        X : data to transform.
        verbose : unused; kept for interface compatibility.
        fit : when True (default) each step is fitted on its input before
            transforming; when False only `transform` is called.
            Assumes `fit` fits the step in place -- TODO confirm for
            custom Preprocessor subclasses.
        """
        for preprocessor in self.preprocess:
            if fit:
                X = preprocessor.fit(X).transform(X)
            else:
                X = preprocessor.transform(X)
        return X

    def tune(self, X, y, n_iter=10):
        """Tune all candidates and greedily build the voting ensemble.

        Steps:
          1. Preprocess X and run every tuner on it with `n_iter` draws.
          2. Cross-validate BaselineRegressor on the preprocessed data.
          3. Each round, take every tuner's current top candidate; its vote
             weight is (tuner score - baseline score) and it is considered
             only if that weight is positive. The candidate is trial-added
             and the whole ensemble is cross-validated on the raw X.
          4. The candidate giving the best score above the current best is
             kept and removed from its tuner's list; stop when no candidate
             improves the score.

        Returns `self` (configured but not fitted).
        """
        def tune_estimators():
            # Run each tuner's randomized search on the preprocessed data.
            for i, tuner in enumerate(self.tuners):
                print('\nRunning tuner {} of {}'.format(i+1, len(self.tuners)))
                tuner.tune(X_preproc, y, n_iter=n_iter)
                print('Best estimator score: {:.4f}'.format(tuner.best_params_[0][0]))

        def get_best_estimator(i, best_score):
            # Try each tuner's top remaining candidate as ensemble member `i`
            # and return the one that beats `best_score` by the most (or Nones).
            best_tuner, best_estimator, best_weight = None, None, None
            for tuner in self.tuners:
                if tuner.best_params_:
                    score, estimator = tuner.make_best_estimator(return_score=True)
                    weight = score - baseline_score
                    if weight > 0:
                        weights.append(weight)
                        estimators.append(('estimator_'+str(i), estimator))
                        try:
                            self.set_params(estimators=estimators, weights=weights)
                            cv_score = cross_val_score(self, X, y).mean()
                            if cv_score > best_score:
                                best_tuner = tuner
                                best_estimator = estimator
                                best_weight = weight
                                best_score = cv_score
                        finally:
                            # Always undo the trial addition, even if the
                            # cross-validation raised.
                            estimators.pop()
                            weights.pop()
            return best_tuner, best_estimator, best_weight, best_score

        X_preproc = self.preprocess_X(X)
        tune_estimators()
        baseline_score = cross_val_score(BaselineRegressor(), X_preproc, y).mean()
        # These two lists are shared with the closures above and become the
        # ensemble's `estimators` / `weights` through set_params.
        estimators, weights = [], []
        i, best_score = 0, -np.inf
        best_tuner, best_estimator, best_weight, best_score = get_best_estimator(i, best_score)
        while best_estimator is not None:
            print('\nAdding estimator {}'.format(i+1))
            print('Best ensemble score: {:.4f}'.format(best_score))
            # The chosen candidate is consumed; the tuner's next-best moves up.
            best_tuner.best_params_.pop(0)
            estimators.append(('estimator_'+str(i), best_estimator))
            weights.append(best_weight)
            self.set_params(estimators=estimators, weights=weights)
            i += 1
            best_tuner, best_estimator, best_weight, best_score = get_best_estimator(i, best_score)
        return self

    def get_params(self, **kwargs):
        """Return the parent's parameters plus `tuners` and `preprocess`."""
        params = super().get_params(**kwargs)
        params.update({
            'tuners': self.tuners,
            'preprocess': self.preprocess
        })
        return params

    def set_params(self, tuners=[], preprocess=[], **params):
        """Set `tuners` / `preprocess` when given (truthy); delegate the rest."""
        if tuners:
            self.tuners = tuners
        if preprocess:
            self.preprocess = preprocess if isinstance(preprocess, list) else [preprocess]
        return super().set_params(**params)
        
# Quick run: only two random-search draws per tuner.
automl = AutoML(preprocess=Encoder())
automl = automl.tune(X, y, n_iter=2)


Running tuner 1 of 2
Best estimator score: 0.0309

Running tuner 2 of 2
Best estimator score: 0.0047

Adding estimator 1
Best ensemble score: 0.0333

Adding estimator 2
Best ensemble score: 0.0750


In [42]:
# Full run with tune's default number of search draws, followed by the
# mean cross-validated score of the resulting ensemble.
automl = AutoML(preprocess=Encoder())
automl = automl.tune(X, y)
np.mean(cross_val_score(automl, X, y))


Running tuner 1 of 2
Best estimator score: 0.0597

Running tuner 2 of 2
Best estimator score: 0.1021

Adding estimator 1
Best ensemble score: 0.1021

Adding estimator 2
Best ensemble score: 0.1059

Adding estimator 3
Best ensemble score: 0.1067

Adding estimator 4
Best ensemble score: 0.1160


0.1137929742239984