In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectPercentile
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingRegressor

In [2]:
class AttributeKFold:
    ''' k-fold cross validator splitting on a particular attribute
        so that all samples with a given value are either in the train or test set

        attribute value for each sample is given in the constructor, so that
        the attribute itself need not be in the features for the model

        cv:   splitter exposing ``split`` and ``get_n_splits``; it is applied
              to the array of *distinct attribute values*, not to the samples
        attr: one attribute value per sample, in sample order; a pandas
              Series, a numpy array or a plain sequence (a Series' index is
              ignored, only the position of each entry matters)
    '''
    def __init__(self, cv, attr):
        self.cv, self.attr = cv, attr

    def get_n_splits(self, *args, **kwargs):
        ''' number of folds, delegated to the wrapped splitter
        '''
        return self.cv.get_n_splits(*args, **kwargs)

    def split(self, X, y=None, groups=None):
        ''' yield ``(train_idx, test_idx)`` pairs of positional sample indices

            ``y`` and ``groups`` are accepted for splitter-API compatibility
            and are not used. ``X`` is only used to check that it has as many
            rows as there are attribute values (when it exposes ``shape``).

            raises ValueError (once iteration starts) if the number of rows
            in ``X`` differs from the number of attribute values, since the
            yielded indices would then not refer to the right samples
        '''
        # wrap lists/arrays so that .unique()/.isin() are available; a Series
        # passed by the caller is used as-is
        attr = self.attr if isinstance(self.attr, pd.Series) else pd.Series(self.attr)

        shape = getattr(X, 'shape', None)
        if shape and shape[0] != len(attr):
            raise ValueError(
                'attr has %d entries but X has %d samples' % (len(attr), shape[0])
            )

        # distinct values in order of first appearance; the folds are formed
        # over these, then mapped back to the samples carrying each value
        vals = attr.unique()
        for train_idx, test_idx in self.cv.split(vals):
            train_mask = attr.isin(vals[train_idx])
            test_mask = attr.isin(vals[test_idx])

            # positions (not index labels) of the selected samples
            yield (
                np.flatnonzero(train_mask.values),
                np.flatnonzero(test_mask.values),
            )

In [3]:
class LogUniform:
    ''' random variable X such that log(x) is distributed uniformly

        samples are ``base ** e`` with ``e`` drawn uniformly between
        ``expmin`` and ``expmax``
    '''
    def __init__(self, base, expmin, expmax):
        self.base, self.expmin, self.expmax = base, expmin, expmax

    def rvs(self, size=None, random_state=None):
        ''' draw ``size`` samples (a single scalar when ``size`` is None)

            random_state: None for a fresh unseeded generator, an integer
            seed, or an object providing ``uniform(low, high, size=...)``
            such as ``np.random.RandomState``
        '''
        # compare against None explicitly: a seed of 0 is falsy and would
        # otherwise be silently replaced by an unseeded generator
        if random_state is None:
            random_state = np.random.RandomState()
        elif isinstance(random_state, (int, np.integer)):
            random_state = np.random.RandomState(random_state)

        exp = random_state.uniform(self.expmin, self.expmax, size=size)
        return np.power(self.base, exp)


class IntDistribution:
    ''' random variable taking only integer values

        wraps another distribution ``rv`` (anything with an ``rvs`` method)
        and rounds whatever it samples to the nearest integer
    '''
    def __init__(self, rv):
        self.rv = rv

    def rvs(self, *args, **kwargs):
        ''' sample from the wrapped distribution and round the result

            all arguments are forwarded to ``rv.rvs``. A scalar draw is
            returned as a python ``int``; an array draw (e.g. when ``size``
            is given) is returned as an integer array of the same shape
        '''
        sample = self.rv.rvs(*args, **kwargs)

        if isinstance(sample, np.ndarray):
            # int(round(...)) cannot handle arrays; round element-wise
            # (np.rint rounds halves to even, like the builtin round)
            rounded = np.rint(sample).astype(int)
            return int(rounded) if rounded.ndim == 0 else rounded

        return int(round(sample))

In [4]:
def load_data(most_only, dframe_path='data/cabauw/processed-full-log.csv.gz'):
    ''' read the processed observations and build the modelling frame

        most_only:   when true, keep only rows with -2 < zL < 1
        dframe_path: gzip-compressed CSV to read ('--' marks missing values);
                     defaults to the processed Cabauw file

        returns ``(ddf, features)``: the filtered frame, with one extra
        ``wind_<z>`` / ``temp_<z>`` column per level ``z`` present in the
        data, and the list of column names used as model inputs

        the feature list names the 10, 20 and 40 level columns, so the data
        is assumed to contain those values of ``z``
    '''
    df = pd.read_csv(dframe_path, na_values='--', compression='gzip')

    # discard rows with small friction velocity, heat flux or wind speed
    df = df[(df.ustar > 0.1) & (abs(df.H) > 10) & (df.wind > 1)]
    # one dataset (ds == 201603) is excluded altogether; the reason is not
    # recorded in this notebook
    df = df[df.ds != 201603]

    # one row per (ds, tt), with wind and temp at every level z as columns
    # (pivot_table averages if several rows share the same ds, tt and z)
    wind_temp_levels = df.pivot_table(
        values=['wind', 'temp'], columns='z', index=['ds', 'tt']
    ).reset_index()
    # flatten the two-level column names: ('wind', 10) -> 'wind_10', while
    # the key columns, whose second level is empty, keep their plain name
    wind_temp_levels.columns = [
        '%s_%d' % (a, b) if b else a
        for a, b in wind_temp_levels.columns.values
    ]

    # attach the per-level columns to every observation taken at the same
    # (ds, tt), then drop rows with a missing value in any column
    ddf = df.merge(wind_temp_levels, on=['ds', 'tt']).dropna()

    if most_only:
        ddf = ddf[(ddf.zL > -2) & (ddf.zL < 1)]

    features = [
        'air_dens', 'wind', 'temp', 'virtual_temp',
        'soil_temp', 'z',
        'dewpoint', 'spec_hum', 'rel_hum', 'press', 'rain', 'co2flux',
        'soilheat', 'netrad', 'temp_10', 'temp_20', 'temp_40', 'wind_10',
        'wind_20', 'wind_40'
    ]

    return ddf, features


# Filtered observations restricted to -2 < zL < 1 (most_only=True), plus the
# names of the columns used as model inputs
df, features = load_data(most_only=True)

In [5]:
# Bagged ensemble of identical pipelines: polynomial expansion, scaling,
# dimensionality reduction, then ridge regression.
est = BaggingRegressor(Pipeline([
    ('poly', PolynomialFeatures()),
    ('scal', StandardScaler()),
    # n_components is searched below as a fraction in (0, 1). PCA with the
    # full solver reads that as the share of variance to retain, whereas
    # TruncatedSVD only accepts an integer number of components.
    ('redu', PCA(svd_solver='full')),
    ('reg', Ridge()),
]))

# scikit-learn 1.2 renamed BaggingRegressor's `base_estimator` argument to
# `estimator` (the old name was later removed); address the inner pipeline
# through whichever name the installed version exposes.
inner_prefix = (
    'estimator' if 'estimator' in est.get_params(deep=False) else 'base_estimator'
)

# Search space: lists are sampled uniformly, the other entries through `rvs`.
# stats.uniform(loc, scale) covers [loc, loc + scale].
params = {
    inner_prefix + '__poly__degree': [1, 2],
    inner_prefix + '__poly__interaction_only': [True, False],
    inner_prefix + '__redu__n_components': stats.uniform(0.1, 0.89),  # 0.10 .. 0.99
    inner_prefix + '__reg__alpha': LogUniform(10, -7, 1),             # 1e-7 .. 10
    'n_estimators': IntDistribution(stats.uniform(5, 45)),            # 5 .. 50
    'max_samples': stats.uniform(0.05, 0.15),                         # 0.05 .. 0.20
    'max_features': stats.uniform(0.25, 0.75),                        # 0.25 .. 1.00
}

In [None]:
# Seed for the optional subsample, the shuffling of the folds and the sampling
# of the candidate parameters, so that re-running this cell evaluates the same
# candidates on the same folds. It does not seed the estimator's own resampling.
SEARCH_SEED = 0

# Number of rows to subsample for a quicker trial run (e.g. 250000);
# None runs the search on every row.
N_SAMPLES = None

sdf = df if N_SAMPLES is None else df.sample(N_SAMPLES, random_state=SEARCH_SEED)

# Folds are formed over the distinct values of `ds`, so all rows sharing a
# `ds` value fall on the same side of every train/test split.
cv = AttributeKFold(
    KFold(10, shuffle=True, random_state=SEARCH_SEED),
    sdf.ds,
)

gs = RandomizedSearchCV(
    est,
    params,
    n_iter=96,
    n_jobs=6,
    pre_dispatch='n_jobs',
    cv=cv,
    verbose=2,
    scoring='neg_mean_squared_error',
    random_state=SEARCH_SEED,
)

gs.fit(sdf[features], sdf.phi_m)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


[Parallel(n_jobs=6)]: Done  35 tasks      | elapsed: 18.3min
[Parallel(n_jobs=6)]: Done 156 tasks      | elapsed: 142.6min


In [None]:
# Score of the best candidate (negated mean squared error, as requested through
# `scoring` when the search was built) and the parameters that produced it
gs.best_score_, gs.best_params_