In [1]:
import pandas as pd
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import datetime
import time
from scipy.stats import probplot
import datetime
import seaborn as sns
sns.set()
from sklearn.utils import shuffle
from keras.optimizers import RMSprop, Adam
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, TensorBoard, EarlyStopping
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, BatchNormalization, GaussianNoise, Input, PReLU, Activation, Concatenate
from keras.initializers import VarianceScaling
from keras import regularizers 
from keras.models import load_model
from keras import backend as K
from sklearn import metrics

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def load_data(dframe_path='data/cabauw/processed-full-log.csv.gz'):
    """Read the processed log CSV and keep only the rows used for modelling.

    Args:
        dframe_path: location of the gzip-compressed CSV. Defaults to the
            path this notebook has always read, so ``load_data()`` keeps
            working unchanged. Entries equal to ``'--'`` are read as NaN.

    Returns:
        DataFrame restricted to rows with ``ustar > 0.1``, ``|H| > 10`` and
        ``wind > 1``, excluding every row whose ``ds`` equals 201603.
    """
    df = pd.read_csv(dframe_path, na_values='--', compression='gzip')

    # NaN in ustar / H / wind compares False, so those rows are dropped as well
    keep = (
        (df.ustar > 0.1)
        & (df.H.abs() > 10)
        & (df.wind > 1)
        & (df.ds != 201603)
    )
    return df[keep]

# Module-level frame; run_experiment and run_cv read it as the global `df`.
df = load_data()

In [3]:
def make_index(dtimes, interval):
    """Pair every timestamp with the edges of its look-ahead / look-back window.

    Args:
        dtimes: 1-d sequence of timestamps; must already be sorted ascending.
        interval: window length, in the same units as ``dtimes``.

    Returns:
        Tuple ``(index_above, index_below)`` of integer arrays, where
        ``index_above[i]`` is the largest ``j`` such that
        ``dtimes[j] - dtimes[i] <= interval`` and ``index_below[i]`` is the
        smallest ``j`` such that ``dtimes[i] - dtimes[j] <= interval``.
        Both arrays are empty when ``dtimes`` is empty.
    """
    n = len(dtimes)
    # builtin int rather than the np.int alias, which NumPy removed in 1.24
    index_below = np.full(n, -1, dtype=int)
    index_above = np.full(n, -1, dtype=int)
    if n == 0:
        return index_above, index_below

    for i, x in enumerate(dtimes):
        # on sorted input the window start never moves backwards,
        # so resume the scan from the previous row's start
        j = index_below[i - 1] if i > 0 else 0
        while x - dtimes[j] > interval:
            j += 1

        index_below[i] = j
        # overwritten until the last i whose window starts at j
        index_above[j] = i

    # positions that never started a window are still -1; they inherit the
    # value of the closest earlier position that did
    last_above = index_above[0]
    for i in range(n):
        if index_above[i] < 0:
            index_above[i] = last_above
        else:
            last_above = index_above[i]

    return index_above, index_below


def compute_trend(df, columns, interval=3600):
    """Add a ``<col>_trend`` column for each requested column, per ``z`` level.

    Rows are sorted by ``datetime`` and processed one ``z`` value at a time.
    Each row is compared with the earliest row of the same level that lies
    within ``interval`` of it (as given by ``make_index``); the trend is the
    change in value divided by the change in ``datetime``, multiplied by 3600.
    A row that is its own window start yields 0 / 0, i.e. NaN.

    Args:
        df: frame with ``datetime`` and ``z`` columns plus every name in
            ``columns``.
        columns: names of the columns to differentiate.
        interval: look-back window, in the units of ``datetime``.

    Returns:
        Tuple of the sorted frame with the new columns and the list of new
        column names, in the order of ``columns``.
    """
    ordered = df.sort_values('datetime')

    for level in ordered.z.unique():
        level_mask = ordered.z == level
        level_rows = ordered[level_mask]

        stamps = level_rows.datetime.values
        _, window_start = make_index(stamps, interval)

        # rows at the start of each window, aligned one-to-one with level_rows
        reference_rows = level_rows.iloc[window_start]
        elapsed = stamps - reference_rows.datetime.values

        for name in columns:
            delta = level_rows[name].values - reference_rows[name].values
            ordered.loc[level_mask, name + '_trend'] = 3600 * delta / elapsed

    trend_names = [name + '_trend' for name in columns]
    return ordered, trend_names


def get_features(df, use_trend, feature_level):
    """Attach per-level wind/temp columns and choose the model's feature names.

    ``wind`` and ``temp`` are pivoted over ``z`` (one row per ``ds``/``tt``
    pair) into columns named ``wind_<z>`` / ``temp_<z>`` and merged back onto
    every row sharing that ``ds``/``tt`` pair.

    Args:
        df: input frame; needs ``ds``, ``tt``, ``z``, ``wind`` and ``temp``.
        use_trend: when true, also add ``<feature>_trend`` columns for every
            selected feature except ``z`` via ``compute_trend``.
        feature_level: how many of the feature groups below to include,
            counted from the first.

    Returns:
        Tuple of the augmented frame and the list of feature column names.
    """
    per_level = df.pivot_table(
        values=['wind', 'temp'], columns='z', index=['ds', 'tt']
    ).reset_index()

    # flatten the (variable, level) column pairs; the index columns carry an
    # empty second element and keep their plain name
    flat_names = []
    for variable, level in per_level.columns.values:
        if level:
            flat_names.append('%s_%d' % (variable, level))
        else:
            flat_names.append(variable)
    per_level.columns = flat_names

    merged = df.merge(per_level, on=['ds', 'tt'])

    feature_groups = [
        [
            'z', 'wind', 'temp', 'soil_temp',
            'wind_10', 'wind_20', 'wind_40',
            'temp_10', 'temp_20', 'temp_40',
        ],
        ['soilheat'],
        ['netrad'],
        ['rain', 'dewpoint'],
        ['H', 'LE'],
    ]

    features = []
    for group in feature_groups[:feature_level]:
        features.extend(group)

    if use_trend:
        trend_inputs = [name for name in features if name != 'z']
        merged, trend_names = compute_trend(merged, trend_inputs)
        features.extend(trend_names)

    return merged, features


def get_train_test_data_my_random_months(df, features, target, n_months=18):
    """Hold out ``n_months`` randomly chosen ``ds`` values and standardise.

    Every row whose ``ds`` is among the sampled values goes to the test set;
    the rest form the training set. Features and target are standardised
    with the mean and standard deviation of the training portion only.

    Args:
        df: frame containing ``ds``, the feature columns and the target.
        features: list of feature column names.
        target: name of the target column.
        n_months: number of distinct ``ds`` values to hold out (sampled
            without replacement via ``np.random.choice``).

    Returns:
        ``(features, train_x, train_y, test_x, test_y, mean_y, std_y)``.

    Raises:
        AssertionError: if any standardised feature value is not finite.
    """
    held_out = np.random.choice(df.ds.unique(), n_months, replace=False)
    in_test = df.ds.isin(held_out)
    in_train = ~in_test

    train_x = df.loc[in_train, features]
    train_y = df.loc[in_train, target]
    test_x = df.loc[in_test, features]
    test_y = df.loc[in_test, target]

    # statistics come from the training rows only
    mean_x, std_x = train_x.mean(), train_x.std()
    mean_y, std_y = train_y.mean(), train_y.std()

    train_x = (train_x - mean_x) / std_x
    test_x = (test_x - mean_x) / std_x

    assert np.all(np.isfinite(train_x))
    assert np.all(np.isfinite(test_x))

    train_y = (train_y - mean_y) / std_y
    test_y = (test_y - mean_y) / std_y

    return features, train_x, train_y, test_x, test_y, mean_y, std_y

In [4]:
class AttributeKFold:
    ''' k-fold cross validator splitting on a particular attribute
        so that all samples with a given value are either in the train or test set

        attribute value for each sample is given in the constructor, so that
        the attribute itself need not be in the features for the model

        cv:   splitter providing get_n_splits() and split(); it is asked to
              split the distinct attribute values, not the samples
        attr: pandas Series with one attribute value per sample
    '''
    def __init__(self, cv, attr):
        self.cv, self.attr = cv, attr

    def get_n_splits(self, *args, **kwargs):
        ''' number of folds, as reported by the wrapped splitter '''
        return self.cv.get_n_splits(*args, **kwargs)

    def split(self, X, y=None, groups=None):
        ''' yield (train_positions, test_positions) for every fold

            X, y and groups are accepted for interface compatibility only;
            the folds are derived from the attribute given to the constructor.
            Positions are 0-based row offsets suitable for .iloc
        '''
        vals = self.attr.unique()
        for train_val_idx, test_val_idx in self.cv.split(vals):
            # work on the plain boolean arrays: np.argwhere on a Series
            # depends on how NumPy dispatches nonzero() to pandas objects
            train_mask = np.asarray(self.attr.isin(vals[train_val_idx]))
            test_mask = np.asarray(self.attr.isin(vals[test_val_idx]))

            yield np.flatnonzero(train_mask), np.flatnonzero(test_mask)

In [5]:
def compute_denormalized_mse(std_y):
    """Build a metric that reports MSE in the target's original units.

    The model is trained on a standardised target, but the error has to be
    comparable with MOST, so the per-sample squared error is scaled back by
    ``std_y ** 2``.

    Args:
        std_y: standard deviation that was used to standardise the target.

    Returns:
        A function ``denormalized_mse(y_true, y_pred)`` usable as a Keras
        metric. Its name is what ``load_model`` looks up in
        ``custom_objects``.
    """
    variance_y = std_y ** 2

    def denormalized_mse(y_true, y_pred):
        squared_error = K.square(y_true - y_pred)
        return K.mean(squared_error, axis=-1) * variance_y

    return denormalized_mse


def orthonormal_regularizer(regu):
    """Build a weight regulariser that penalises non-orthonormal columns.

    Implements eq. 3 in https://arxiv.org/pdf/1703.01827.pdf:
    ``regu / 2 * sum((W^T W - I) ** 2)``.

    Args:
        regu: multiplier applied to the penalty.

    Returns:
        A function taking a 2-d weight matrix and returning the penalty.
    """
    def compute(weight_matrix):
        # K.int_shape gives plain ints on every backend version, whereas
        # `.shape[i].value` only exists on TF1 Dimension objects
        _, cols = K.int_shape(weight_matrix)
        wtw = K.dot(K.transpose(weight_matrix), weight_matrix)
        return regu * K.sum((wtw - K.eye(cols))**2) / 2
    return compute


def build_model(sizes, std_y=1):
    """Assemble and compile a feed-forward regression network from a spec list.

    ``sizes[0]`` is the number of input features. Every later entry adds one
    layer, according to its value ``n``:

    * ``n < 0``: skip connection; concatenates the latest layer with the layer
      ``-n`` positions before it. A run of consecutive negative entries is
      merged into a single Concatenate layer.
    * ``0 <= n < 1``: ``Dropout(n)``. NOTE(review): Keras documents this
      argument as the fraction of units dropped, while the earlier comment
      here called it "pkeep" - the two agree only for 0.5.
    * ``n >= 1``: fully connected layer of ``n`` units followed by PReLU.

    A final ``Dense(1)`` produces the output.

    Args:
        sizes: layer specification as described above.
        std_y: target standard deviation, forwarded to the
            denormalised-MSE metric.

    Returns:
        The compiled Keras ``Model`` (MSE loss, Adam with lr=0.001).
    """
    spec_count = len(sizes)
    layers = [Input(shape=(sizes[0],))]

    pos = 1
    while pos < spec_count:
        spec = sizes[pos]

        if spec < 0:
            # consume the whole run of negative entries in one go
            branches = [layers[-1]]
            while pos < spec_count and sizes[pos] < 0:
                branches.append(layers[sizes[pos] - 1])
                pos += 1
            new_layer = Concatenate()(branches)
        elif spec < 1:
            new_layer = Dropout(spec)(layers[-1])
            pos += 1
        else:
            activation = PReLU()
            projected = Dense(
                spec, kernel_initializer=VarianceScaling(2, 'fan_in')
            )(layers[-1])
            new_layer = activation(projected)
            pos += 1

        layers.append(new_layer)

    output = Dense(1)(layers[-1])
    layers.append(output)

    opt = Adam(lr=0.001)
    model = Model(inputs=layers[0], outputs=layers[-1])
    model.compile(
        loss='mse',
        optimizer=opt,
        metrics=[compute_denormalized_mse(std_y)],
    )
    return model

In [6]:
def run_experiment(network, use_trend, feature_level, batch_size=1024, verbose=2, most_only=False):
    """Train one network on a random-month train/test split of the global ``df``.

    Args:
        network: layer specification passed to ``build_model`` (the input
            width is prepended here).
        use_trend: whether to add trend features.
        feature_level: number of feature groups to use (see ``get_features``).
        batch_size: mini-batch size for ``fit``.
        verbose: verbosity forwarded to Keras; values > 0 also print the
            log directory.
        most_only: when true, keep only rows with ``-2 < zL < 1``.

    Returns:
        Tuple of the Keras training history and the log directory, which
        also holds the ``best.hdf5`` checkpoint.
    """
    ddf, features = get_features(df.dropna(), use_trend, feature_level)

    # trend columns are 0 / 0 = NaN wherever a row is its own window start
    # (at least the first row of every level); drop those rows exactly as
    # run_cv does, otherwise the finiteness assertion of the split helper
    # fails whenever use_trend is on
    finite = np.isfinite(ddf[features]).all(axis=1)
    ddf = ddf.loc[finite]

    if most_only:
        ddf = ddf[(ddf.zL > -2) & (ddf.zL < 1)]
    features, train_x, train_y, test_x, test_y, mean_y, std_y = get_train_test_data_my_random_months(
        ddf, features, 'phi_m'
    )

    K.clear_session()  # https://stackoverflow.com/q/35114376/521776
    model = build_model([len(features)] + network, std_y=std_y)

    # strftime instead of slicing isoformat(): the slice cut real digits
    # whenever the microsecond field happened to be zero
    dtime = datetime.datetime.utcnow().strftime('%Y%m%d-%H%M%S')
    logdir = 'dev/logs/%s-tren%s-features%s-batchsize%s-nparam%s/' % (
        dtime, use_trend, feature_level, batch_size, model.count_params()
    )

    if verbose > 0:
        print('Saving to', logdir)

    callbacks = [
        ReduceLROnPlateau(factor=0.1, verbose=verbose, min_lr=1e-6, patience=10),
        ModelCheckpoint(logdir + 'best.hdf5', verbose=verbose, save_best_only=True),
        TensorBoard(logdir, write_graph=True, write_grads=True, histogram_freq=0),
        EarlyStopping(min_delta=0.0001, patience=25),
    ]

    hist = model.fit(
        train_x, train_y,
        batch_size=batch_size,
        epochs=1000,
        verbose=verbose,
        shuffle=True,
        callbacks=callbacks,
        validation_data=(test_x, test_y)
    )

    return hist, logdir

In [7]:
def get_train_test_data_by_index(df, features, target, train_idx, test_idx, normalize):
    """Slice ``df`` into train/test parts by position, optionally standardised.

    Args:
        df: source frame.
        features: list of feature column names.
        target: name of the target column.
        train_idx: positional row indices of the training set.
        test_idx: positional row indices of the test set.
        normalize: when true, standardise features and target with the mean
            and standard deviation of the training rows.

    Returns:
        ``(train_x, train_y, test_x, test_y, mean_y, std_y)``; ``mean_y`` and
        ``std_y`` are 0 and 1 when ``normalize`` is false.
    """
    train_rows = df.iloc[train_idx]
    test_rows = df.iloc[test_idx]

    train_x, test_x = train_rows[features], test_rows[features]
    train_y, test_y = train_rows[target], test_rows[target]

    if not normalize:
        return train_x, train_y, test_x, test_y, 0, 1

    # all statistics are taken from the training rows only
    mean_x, std_x = train_x.mean(), train_x.std()
    mean_y, std_y = train_y.mean(), train_y.std()

    train_x = (train_x - mean_x) / std_x
    test_x = (test_x - mean_x) / std_x
    train_y = (train_y - mean_y) / std_y
    test_y = (test_y - mean_y) / std_y

    return train_x, train_y, test_x, test_y, mean_y, std_y


def run_cv(network, use_trend, feature_level, most_only, batch_size=1024, cv_folds=10, verbose=0):
    """Cross-validate one network on the global ``df``, folding over ``ds``.

    All rows sharing a ``ds`` value land in the same fold. For every fold a
    fresh model is trained, the best checkpoint is reloaded, and its
    predictions are scored on the de-normalised target.

    NOTE(review): the held-out fold is also passed as ``validation_data``, so
    the checkpoint kept by ``save_best_only`` is selected on the same rows it
    is then scored on.

    Args:
        network: layer specification passed to ``build_model`` (the input
            width is prepended here).
        use_trend: whether to add trend features.
        feature_level: number of feature groups to use (see ``get_features``).
        most_only: when true, keep only rows with ``-2 < zL < 1``.
        batch_size: mini-batch size for ``fit``.
        cv_folds: number of folds.
        verbose: verbosity forwarded to Keras; values > 0 also print the
            log directory of each fold.

    Returns:
        DataFrame with one row per fold and the columns
        ``explained_variance``, ``mean_absolute_error``,
        ``mean_squared_error``, ``median_absolute_error``, ``r2_score`` and
        ``mean_absolute_percent_error``.
    """
    ddf, features = get_features(df.dropna(), use_trend, feature_level)
    # discard rows with non-finite features (e.g. NaN trends)
    finite = np.isfinite(ddf[features]).all(axis=1)
    ddf = ddf.loc[finite]
    if most_only:
        ddf = ddf[(ddf.zL > -2) & (ddf.zL < 1)]

    cv = AttributeKFold(KFold(cv_folds, shuffle=True), ddf.ds)
    results = []
    for cv_idx, (train_idx, test_idx) in enumerate(cv.split(ddf.ds)):

        # prepare data
        train_x, train_y, test_x, test_y, mean_y, std_y = get_train_test_data_by_index(
            ddf, features, 'phi_m', train_idx, test_idx, normalize=True
        )

        K.clear_session()  # https://stackoverflow.com/q/35114376/521776
        model = build_model([len(features)] + network, std_y=std_y)
        # strftime instead of slicing isoformat(): the slice cut real digits
        # whenever the microsecond field happened to be zero
        dtime = datetime.datetime.utcnow().strftime('%Y%m%d-%H%M%S')
        logdir = 'dev/logs/%s-tren%s-features%s-batchsize%s-nparam%s-cv%d/' % (
            dtime, use_trend, feature_level, batch_size, model.count_params(), cv_idx
        )
        save_to = logdir + 'best.hdf5'
        if verbose > 0:
            print('Saving to', logdir)

        # fit to train data
        model.fit(
            train_x, train_y,
            batch_size=batch_size,
            epochs=500,
            verbose=verbose,
            shuffle=True,
            callbacks=[
                ReduceLROnPlateau(factor=0.1, verbose=verbose, min_lr=1e-6, patience=10),
                ModelCheckpoint(save_to, verbose=verbose, save_best_only=True),
                TensorBoard(logdir, write_graph=True, write_grads=True, histogram_freq=0),
                EarlyStopping(min_delta=0.0001, patience=25),
            ],
            validation_data=(test_x, test_y)
        )

        # evaluate on test data, using the checkpoint written above
        best = load_model(save_to, custom_objects={
            'denormalized_mse': compute_denormalized_mse(std_y)
        })

        # undo the target standardisation before scoring
        y_pred = (best.predict(test_x) * std_y + mean_y).reshape(-1)
        y_true = (test_y * std_y + mean_y).values.reshape(-1)

        results.append((
            metrics.explained_variance_score(y_true, y_pred),
            metrics.mean_absolute_error(y_true, y_pred),
            metrics.mean_squared_error(y_true, y_pred),
            metrics.median_absolute_error(y_true, y_pred),
            metrics.r2_score(y_true, y_pred),
            np.mean(np.abs((y_true - y_pred) / y_true)) * 100,
        ))

    return pd.DataFrame(results, columns=[
        'explained_variance', 'mean_absolute_error', 'mean_squared_error',
        'median_absolute_error', 'r2_score', 'mean_absolute_percent_error'
    ])


def test_setting(use_trend, feature_level, most_only, batch_size=1024, cv_folds=10, verbose=0):
    """Cross-validate every candidate architecture under one data setting.

    Each architecture is run through ``run_cv`` with the given arguments and
    a transposed ``describe()`` summary of its per-fold metrics is printed.

    Args:
        use_trend, feature_level, most_only, batch_size, cv_folds, verbose:
            forwarded unchanged to ``run_cv``.

    Returns:
        List with one per-fold metrics DataFrame per architecture, in the
        order of the candidate list below.
    """
    # NOTE(review): the 4th and 6th candidates are identical - confirm
    # whether a different architecture was intended for one of them
    candidates = [
        [128, 64, 32, 16, 8, 4, 2, 1],
        [256, 0.5, 128, 64, 32, 16, 8, 4, 2, 1],
        [256, 0.5, 128, 64, 64, 32, 16, 8, 4, 2, 1],
        [256, 0.5, 128, 64, 64, 32, 32, 16, 8, 4, 2, 1],
        [512, 0.5, 256, 0.5, 128, 64, 32, 16, 8, 4, 2, 1],
        [256, 0.5, 128, 64, 64, 32, 32, 16, 8, 4, 2, 1],
    ]

    all_scores = []
    for position in range(len(candidates)):
        fold_scores = run_cv(
            candidates[position],
            use_trend,
            feature_level,
            most_only,
            batch_size,
            cv_folds,
            verbose,
        )
        print('----  model', position)
        print(fold_scores.describe().T)
        all_scores.append(fold_scores)

    return all_scores

In [8]:
# Compare all candidate architectures with trend features enabled, all five
# feature groups, and rows limited to -2 < zL < 1 (most_only).
test_setting(
    use_trend=True,
    feature_level=5,
    most_only=True,
)



Instructions for updating:
Use the retry module or similar alternatives.


KeyboardInterrupt: 