In [1]:
from ml_inference.tuners import RandomForestRegressorTuner, RidgeTuner
from preprocess import Preprocessor

import pandas as pd

# Relative path of the CSV that the data-loading cell reads with pd.read_csv.
INFILE = '../data/pennycook_et_al_study2_clean.csv'

In [2]:
# Load the study data and keep only the rows with Treatment == 1; the Treatment
# column is constant after the filter, so it is dropped.
df = pd.read_csv(INFILE)
df = df[df.Treatment == 1].drop(columns='Treatment')

# Features are every remaining column except the outcome `Diff`.
# Both X and y get a fresh 0..n-1 index so they stay label-aligned: previously
# only X was reset while y kept the filtered frame's original row labels.
# NOTE(review): the rendered output of this cell shows an 'Unnamed: 0' column,
# which remains in X as a feature unless Preprocessor drops it - confirm.
X = df.drop(columns='Diff').reset_index(drop=True)
y = df.Diff.reset_index(drop=True)

# Standardise the outcome to zero mean and unit (sample) standard deviation.
y = (y - y.mean()) / y.std()
df.head()

Unnamed: 0,SharingType_1,SharingType_2,SharingType_3,SharingType_4,SharingType_6,SharingType_5,SocialMedia_1,SocialMedia_2,SocialMedia_3,SocialMedia_4,...,Male,Education,Income,English,Partisan,Social_Conserv,Economic_Conserv,Diff,Party,POTUS2016
0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,17.0,9.0,1.0,3.0,1.0,2.0,-0.2,Democrat,Clinton
5,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,10.0,12.0,1.0,1.0,1.0,1.0,-0.2,Democrat,Clinton
7,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,7.0,1.0,5.0,3.0,3.0,0.266667,Independent,Unable to vote
8,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,2.0,1.0,4.0,3.0,3.0,0.0,Independent,Didn't vote but could have
13,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,12.0,3.0,1.0,3.0,3.0,3.0,-0.266667,Independent,Didn't vote but could have


In [4]:
from ml_inference.tuners import RandomForestRegressorTuner

# Build a random-forest tuner around a Preprocessor constructed from X and run
# its search on (X, y). n_jobs=-1 is presumably forwarded to the underlying
# search to use all cores - confirm against ml_inference.
# NOTE(review): the output recorded for this cell is a ValueError ("Found input
# variables with inconsistent numbers of samples: [420, 420, 0]"), so rf_tuner
# was never bound in the saved run. The zero-length third input is not created
# in this notebook; it presumably originates inside tune() - confirm and fix there.
rf_tuner = RandomForestRegressorTuner(Preprocessor(X)).tune(X, y, n_jobs=-1)



ValueError: Found input variables with inconsistent numbers of samples: [420, 420, 0]

In [None]:
import numpy as np
from sklearn.ensemble import VotingRegressor


class StackingRegressor(VotingRegressor):
    """Additive ensemble in which each stage is fitted to the residual of the stages before it.

    Every estimator except the last receives ``X`` exactly as passed in. The
    last estimator receives ``X`` after it has gone through each transformer
    in ``preprocess``. The prediction is the sum of all stage predictions.

    Parameters
    ----------
    estimators : list of (name, estimator) tuples
        Stages in fitting order; the last entry is the one fed preprocessed
        input. Must contain at least one entry.
    preprocess : transformer, list of transformers, or None, default=None
        Applied in order to ``X`` for the last stage only. A single
        transformer is wrapped in a one-element list; ``None`` means no
        preprocessing.
    weights, n_jobs, verbose
        Forwarded unchanged to ``VotingRegressor.__init__``. The ``fit`` and
        ``predict`` overrides below do not read them.
    """

    def __init__(self, estimators, preprocess=None, weights=None, n_jobs=None, verbose=False):
        # A literal ``[]`` default would be one list object shared by every
        # instance built without ``preprocess``; use None as the sentinel.
        if preprocess is None:
            preprocess = []
        elif not isinstance(preprocess, list):
            preprocess = [preprocess]
        # A list passed by the caller is stored as-is (not copied), so the
        # attribute is the very object that was handed to the constructor.
        self.preprocess = preprocess
        super().__init__(estimators, weights=weights, n_jobs=n_jobs, verbose=verbose)

    def fit(self, X, y, sample_weight=None):
        """Fit the stages in order, each on what the previous stages left unexplained.

        The estimators in ``self.estimators`` and the transformers in
        ``self.preprocess`` are fitted in place.

        Parameters
        ----------
        X : features, passed unchanged to every stage except the last.
        y : targets.
        sample_weight : accepted for signature compatibility only; it is not
            used by this implementation.

        Returns
        -------
        self
        """
        resid = y
        for _, est in self.estimators[:-1]:
            est.fit(X, resid)
            # Out-of-place subtraction: never mutates the caller's ``y`` and,
            # unlike ``-=``, also works for integer arrays and plain lists.
            resid = resid - est.predict(X)
        for preproc in self.preprocess:
            X = preproc.fit(X).transform(X)
        self.estimators[-1][1].fit(X, resid)
        return self

    def predict(self, X):
        """Return the sum of every stage's prediction for ``X``.

        All stages but the last predict from ``X`` as given; the last stage
        predicts from ``X`` transformed by the fitted ``preprocess`` steps.
        """
        predictions = [est.predict(X) for _, est in self.estimators[:-1]]
        for preproc in self.preprocess:
            X = preproc.transform(X)
        predictions.append(self.estimators[-1][1].predict(X))
        return np.array(predictions).sum(axis=0)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
# make_pipeline is used below; it was previously imported only in a later cell,
# so this cell raised NameError when the notebook was run top to bottom.
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR

# Two-stage residual stack: an SVR first, then a random forest on the SVR's
# residual. StackingRegressor hands the raw X to every stage except the last,
# so the SVR is wrapped in a pipeline with its own Preprocessor, while the
# final random forest gets its input through the `preprocess` argument.
reg = StackingRegressor(
    [
        ('svr', make_pipeline(Preprocessor(X), SVR())),
        ('rf', RandomForestRegressor())
    ],
    preprocess=Preprocessor(X)
)
reg.fit(X, y)
# In-sample score: evaluated on the same data the ensemble was just fitted on.
reg.score(X, y)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

# Compare mean cross-validated scores (cross_val_score defaults) of three
# baselines and the residual-stacking ensemble `reg`. Each baseline is a
# pipeline that starts with its own Preprocessor built from X.

# Baseline 1: random forest.
rf = make_pipeline(Preprocessor(X), RandomForestRegressor())
rf_cv = cross_val_score(rf, X, y)
print(rf_cv.mean())

# Baseline 2: support vector regression.
svr = make_pipeline(Preprocessor(X), SVR())
svr_cv = cross_val_score(svr, X, y)
print(svr_cv.mean())

# Baseline 3: VotingRegressor over a random forest and an SVR.
vr = make_pipeline(
    Preprocessor(X),
    VotingRegressor([('rf', RandomForestRegressor()), ('svr', SVR())]),
)
vr_cv = cross_val_score(vr, X, y)
print(vr_cv.mean())

# The stacking ensemble defined in an earlier cell.
reg_cv = cross_val_score(reg, X, y)
print(reg_cv.mean())

In [None]:
# Display the parameters the stacking ensemble exposes through get_params().
reg.get_params()

In [None]:
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LassoLars, Ridge, ElasticNet
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import SVR
from scipy.stats import expon, uniform, poisson, randint
from xgboost import XGBRegressor

from ml_inference.tuners import Tuner

class RandomForestStackingRegressorTuner(Tuner):
    """Tuner for a StackingRegressor whose final stage is a random-forest pipeline.

    The final stage is ``PolynomialFeatures -> PCA -> RandomForestRegressor``.
    It is appended, under ``name``, to the fixed ``estimators`` given at
    construction, and only that final stage's hyper-parameters are searched.

    NOTE(review): ``Tuner.__init__`` is not called here - confirm the base
    class needs no state of its own.
    """

    def __init__(self, name, estimators=None, preprocess=None):
        """
        Parameters
        ----------
        name : str
            Name under which the tunable random-forest stage is registered in
            the stacking ensemble; also the prefix of its search-space keys.
        estimators : list of (name, estimator) tuples, optional
            Earlier stages, placed before the tunable stage. Defaults to none.
        preprocess : transformer, list of transformers, or None, optional
            Passed to ``StackingRegressor`` as its ``preprocess`` argument. A
            single transformer is wrapped in a one-element list.
        """
        self.name = name
        # None sentinels instead of literal ``[]`` defaults, which would be a
        # single list object shared by every instance.
        self.estimators = [] if estimators is None else estimators
        if preprocess is None:
            preprocess = []
        elif not isinstance(preprocess, list):
            preprocess = [preprocess]
        self.preprocess = preprocess

    def make_estimator(self, **params):
        """Build the stacking ensemble and apply ``params`` to it via ``set_params``."""
        est = make_pipeline(
            PolynomialFeatures(),
            PCA(),
            RandomForestRegressor()
        )
        # Concatenation builds a new list, so self.estimators is never mutated.
        stacking_est = StackingRegressor(
            self.estimators + [(self.name, est)],
            preprocess=self.preprocess
        )
        return stacking_est.set_params(**params)

    def make_best_estimator(self, idx=0, return_score=False):
        """Build the estimator for one entry of ``self.best_params_``.

        Parameters
        ----------
        idx : int or float, default=0
            A value strictly between 0 and 1 in magnitude is read as a
            quantile and mapped onto the positions of ``best_params_``. Any
            other value is used directly as a list index, so negative
            integers count from the end as usual.
        return_score : bool, default=False
            If true, return ``(score, estimator)`` instead of the estimator.

        ``best_params_`` is read as a sequence of ``(score, params)`` pairs;
        it is not set in this class (presumably by ``Tuner.tune`` - confirm).
        """
        # The previous test (idx < 1 and idx != 0) also caught negative
        # integers such as -1 and rescaled them as if they were quantiles.
        if 0 < abs(idx) < 1:
            # interpret idx as a quantile
            idx = round(idx * (len(self.best_params_) - 1))
        score, params = self.best_params_[idx][0], self.best_params_[idx][1]
        est = self.make_estimator(**params)
        return (score, est) if return_score else est

    def get_param_distributions(self, X, y):
        """Return the search space for the tunable stage, keyed by ``self.name``.

        ``y`` is accepted but not used.
        """
        # Keys must be prefixed with the name the stage was registered under;
        # a hard-coded 'rf__' only matched when name == 'rf'.
        prefix = self.name + '__'
        return {
            prefix + 'polynomialfeatures__degree': [1, 2],
            # NOTE(review): the upper bound comes from the raw X, but PCA runs
            # after `preprocess` and PolynomialFeatures - confirm every value
            # is valid for the matrix PCA actually receives.
            prefix + 'pca__n_components': list(range(1, X.shape[1])),
            # Same distribution as poisson(1, 2**5): 32 plus a Poisson(1) draw.
            # NOTE(review): confirm a mean of 2**5 was not the intent.
            prefix + 'randomforestregressor__n_estimators': poisson(mu=1, loc=2**5)
        }

In [None]:
from ml_inference.tuners import SVRTuner

# Tune an SVR behind its own Preprocessor; n_iter=2**5 (32) is presumably the
# number of sampled configurations - confirm against ml_inference.
tuner = SVRTuner(Preprocessor(X)).tune(X, y, n_iter=2**5)
# Rebinds `svr` (an untuned pipeline in an earlier cell) to the estimator
# built by the tuner.
svr = tuner.make_best_estimator()
# cross_val_score comes from an import in an earlier cell.
cross_val_score(svr, X, y).mean()

In [None]:
# Tune a random-forest stage named 'rf' stacked after the tuned SVR; the
# Preprocessor is handed to the stacking ensemble as its `preprocess` step.
# Rebinds `tuner`, replacing the SVRTuner created earlier.
tuner = RandomForestStackingRegressorTuner('rf', estimators=[('svr', svr)], preprocess=Preprocessor(X))
tuner.tune(X, y, n_iter=2**5)

In [None]:
# First entry of the tuner's best_params_; make_best_estimator reads each entry
# as a (score, params) pair.
tuner.best_params_[0]

In [None]:
# Rebind `reg` to the stacking ensemble built from the tuner's entry at idx=0,
# print its mean cross-validated score, then display its parameters.
reg = tuner.make_best_estimator()
print(cross_val_score(reg, X, y).mean())
reg.get_params()

In [None]:
from ml_inference.tuners import Tuner


class StackingTuner(Tuner):
    # Tuner that builds a StackingRegressor from a fixed list of earlier stages
    # plus one caller-supplied final stage.
    # NOTE(review): Tuner.__init__ is not called - confirm the base class needs
    # no state of its own.
    def __init__(self, estimators=[], preprocess=[]):
        # NOTE(review): both defaults are mutable list literals; the default
        # `estimators` list is stored directly, so it is shared by every
        # instance created without arguments.
        self.estimators = estimators
        # A single transformer is wrapped in a one-element list.
        self.preprocess = preprocess if isinstance(preprocess, list) else [preprocess]
        
    def make_estimator(self, last_estimator):
        # Concatenation builds a new list, so self.estimators is not mutated.
        # NOTE(review): self.preprocess is not forwarded to StackingRegressor
        # here - confirm whether that is intended.
        return StackingRegressor(self.estimators+[last_estimator])