In [35]:
import pandas as pd
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import datetime
import time
from scipy.stats import probplot
import datetime
import seaborn as sns
sns.set()
from sklearn.utils import shuffle
from keras.optimizers import RMSprop, Adam
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, TensorBoard, EarlyStopping
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, BatchNormalization, GaussianNoise, Input, PReLU, Activation, Concatenate
from keras.initializers import VarianceScaling
from keras.models import load_model
from keras import backend as K
from sklearn import metrics

In [36]:
def load_data():
    """Read the processed Cabauw CSV and return the filtered rows.

    The gzip-compressed CSV is read with ``'--'`` treated as a missing
    value.  A row is kept only when ``ustar > 0.1``, ``|H| > 10`` and
    ``wind > 1``; rows with a missing value in any of those columns fail
    the comparison and are dropped as well.  Finally every row whose
    ``ds`` equals 201603 is removed.

    Returns:
        The filtered ``pandas.DataFrame`` (original row labels are kept).
    """
    raw = pd.read_csv(
        'data/cabauw/processed-full-log.csv.gz',
        na_values='--',
        compression='gzip',
    )

    # the three quality thresholds, combined into a single row mask
    ustar_ok = raw.ustar > 0.1
    heat_flux_ok = abs(raw.H) > 10
    wind_ok = raw.wind > 1
    selected = raw[ustar_ok & heat_flux_ok & wind_ok]

    # NOTE(review): ds == 201603 is excluded on purpose; the reason is not
    # visible in this notebook - confirm before changing.
    return selected[selected.ds != 201603]

# Module-level frame: run_experiment() reads this global `df` directly,
# so this cell must be executed before any experiment is run.
df = load_data()

In [37]:
def make_index(dtimes, interval):
    """Find, for every timestamp, the bounds of its ``interval`` window.

    Args:
        dtimes: sequence of timestamps, already sorted in ascending order.
        interval: window length, in the same unit as ``dtimes``.

    Returns:
        A tuple ``(index_above, index_below)`` of integer arrays with the
        same length as ``dtimes``:

        * ``index_above[i]`` is the largest ``j`` such that
          ``dtimes[j] - dtimes[i] <= interval``;
        * ``index_below[i]`` is the smallest ``j`` such that
          ``dtimes[i] - dtimes[j] <= interval``.

        Two empty arrays are returned for empty input.
    """
    n = len(dtimes)
    # builtin int replaces the np.int alias, which NumPy 1.24 removed
    index_below = np.full(n, -1, dtype=int)
    index_above = np.full(n, -1, dtype=int)
    if n == 0:
        return index_above, index_below

    # dtimes is sorted, so the lower bound never moves backwards and can be
    # carried over from one timestamp to the next
    j = 0
    for i, x in enumerate(dtimes):
        while x - dtimes[j] > interval:
            j += 1

        index_below[i] = j
        # overwritten on every hit, so the last (largest) i wins
        index_above[j] = i

    # positions that were never a lower bound inherit the value of the
    # closest earlier position that was (position 0 always is)
    last_above = index_above[0]
    for i in range(n):
        if index_above[i] < 0:
            index_above[i] = last_above
        else:
            last_above = index_above[i]

    return index_above, index_below


def compute_trend(df, columns, interval=3600):
    """Add a ``<col>_trend`` column for every requested column.

    The frame is sorted by ``datetime`` and processed one ``z`` value at a
    time.  Within a level every row is paired with the earliest row that
    make_index() places inside ``interval`` of it, and the trend is

        3600 * (value - earlier_value) / (datetime - earlier_datetime)

    When a row is paired with itself both differences are zero, so the
    expression evaluates 0 / 0 for that row.

    Args:
        df: frame with ``datetime``, ``z`` and every column in ``columns``.
        columns: names of the columns to compute a trend for.
        interval: look-back window handed to make_index().

    Returns:
        ``(frame, names)``: the sorted frame with the trend columns added,
        and the list of the added column names in the order of ``columns``.
    """
    df = df.sort_values('datetime')
    trend_names = [name + '_trend' for name in columns]

    for level in df.z.unique():
        level_mask = df.z == level
        level_rows = df[level_mask]
        stamps = level_rows.datetime.values

        _, index_below = make_index(stamps, interval)
        earlier_rows = level_rows.iloc[index_below]
        elapsed = stamps - earlier_rows.datetime.values

        for name, trend_name in zip(columns, trend_names):
            delta = level_rows[name].values - earlier_rows[name].values
            df.loc[level_mask, trend_name] = 3600 * delta / elapsed

    return df, trend_names


def get_features(df, use_trend, feature_level):
    """Attach per-level wind/temp columns and pick the feature names.

    ``wind`` and ``temp`` are pivoted so that every ``z`` value becomes its
    own column (``wind_<z>``, ``temp_<z>``) per ``(ds, tt)`` pair, and the
    result is merged back onto ``df``.  The feature list is the
    concatenation of the first ``feature_level`` feature groups below.
    With ``use_trend`` set, compute_trend() is run on every selected
    feature except ``z`` and the resulting column names are appended.

    Args:
        df: frame with at least ``ds``, ``tt``, ``z``, ``wind``, ``temp``.
        use_trend: whether to add the ``*_trend`` columns.
        feature_level: how many of the feature groups to include.

    Returns:
        ``(frame, features)``: the augmented frame and the list of feature
        column names.
    """
    per_level = df.pivot_table(
        values=['wind', 'temp'], columns='z', index=['ds', 'tt']
    ).reset_index()

    # flatten the (variable, level) column pairs; the index columns carry an
    # empty level and keep their plain name
    flat_names = []
    for variable, level in per_level.columns.values:
        if level:
            flat_names.append('%s_%d' % (variable, level))
        else:
            flat_names.append(variable)
    per_level.columns = flat_names

    df = df.merge(per_level, on=['ds', 'tt'])

    feature_sets = [
        [
            'z', 'wind', 'temp', 'soil_temp',
            'wind_10', 'wind_20', 'wind_40',
            'temp_10', 'temp_20', 'temp_40',
        ],
        ['soilheat'],
        ['netrad'],
        ['rain', 'dewpoint'],
        ['H', 'LE'],
    ]

    features = []
    for group in feature_sets[:feature_level]:
        features.extend(group)

    if use_trend:
        trend_inputs = [name for name in features if name != 'z']
        df, added_cols = compute_trend(df, trend_inputs)
        features.extend(added_cols)

    return df, features


def get_train_test_data(df, features, target, n_months=12, random_state=None):
    """Split by month into train/test sets and standardise both.

    Feature columns that contain only nulls are discarded, then every row
    with a null in any remaining feature column is dropped.  ``n_months``
    distinct ``ds`` values are drawn without replacement as the test set;
    all other rows form the training set.  Features and target are
    standardised with the mean and standard deviation of the training set.

    Args:
        df: frame containing ``ds``, the feature columns and ``target``.
        features: candidate feature column names.
        target: name of the target column.
        n_months: number of distinct ``ds`` values held out for testing.
        random_state: optional seed for the month draw.  ``None`` (the
            default) draws from NumPy's global random state; any other
            value seeds a private ``RandomState`` so the split is
            reproducible.

    Returns:
        ``(features, train_x, train_y, test_x, test_y, mean_y, std_y)``
        where ``features`` holds the names of the columns actually kept
        (in frame order) and ``mean_y`` / ``std_y`` are the training-set
        statistics needed to undo the target normalisation.

    Raises:
        AssertionError: if the standardised training features contain a
            non-finite value (e.g. a constant feature column).
        ValueError: from the month draw when ``n_months`` exceeds the
            number of distinct ``ds`` values.
    """
    # remove feature columns with only nulls and rows with any null
    empty_columns = df.isnull().all(axis=0)
    keep_columns = ~empty_columns & df.columns.isin(features)
    missing = df.loc[:, keep_columns].isnull().any(axis=1)
    df = df[~missing]

    # get random test months, reproducibly when a seed is supplied
    if random_state is None:
        chooser = np.random
    else:
        chooser = np.random.RandomState(random_state)
    test_ds = chooser.choice(df.ds.unique(), n_months, replace=False)
    test_mask = df.ds.isin(test_ds)

    train_x, train_y = df.loc[~test_mask, keep_columns], df.loc[~test_mask, target]
    test_x, test_y = df.loc[test_mask, keep_columns], df.loc[test_mask, target]

    # statistics come from the training rows only and are reused for test
    mean_x, mean_y = train_x.mean(), train_y.mean()
    std_x, std_y = train_x.std(), train_y.std()

    train_x = (train_x - mean_x) / std_x
    test_x = (test_x - mean_x) / std_x

    # check the raw array so the result is a plain bool on any pandas version
    assert np.isfinite(train_x.values).all(), \
        'non-finite value in the standardised training features'

    train_y = (train_y - mean_y) / std_y
    test_y = (test_y - mean_y) / std_y

    features = keep_columns.index.values[keep_columns.values]
    return features, train_x, train_y, test_x, test_y, mean_y, std_y

In [38]:
def compute_denormalized_mse(std_y):
    """Build a metric that reports the MSE rescaled by ``std_y ** 2``.

    The model is trained on a normalised target; multiplying the mean
    squared error by the squared standard deviation expresses it on the
    scale of the un-normalised target, so it can be compared with MOST.

    Args:
        std_y: standard deviation used to normalise the target.

    Returns:
        A function ``denormalized_mse(y_true, y_pred)``.  The inner name is
        significant: the training history key ``val_denormalized_mse`` read
        elsewhere in this notebook is derived from it.
    """
    def denormalized_mse(y_true, y_pred):
        squared_error = K.square(y_true - y_pred)
        return K.mean(squared_error, axis=-1) * std_y ** 2

    return denormalized_mse



def build_model(sizes, std_y=1):
    """Build and compile a feed-forward regression model from a size spec.

    ``sizes[0]`` is the number of input features.  Every following entry
    adds one tensor to the stack:

    * a negative number ``-n`` is a skip connection: the current last
      tensor is concatenated with the tensor ``n`` positions before it;
      a run of consecutive negative entries is merged into a single
      Concatenate layer;
    * a value below 1 (and not negative) adds ``Dropout(value)``.
      NOTE(review): the value is passed as Dropout's first argument, which
      Keras documents as the drop rate, whereas the original comment
      described it as a keep probability - the two agree only for 0.5;
    * any other value adds a Dense layer of that width (VarianceScaling
      initialiser, scale 2, ``fan_in``) followed by PReLU.

    A final ``Dense(1)`` produces the prediction.

    Args:
        sizes: the layer specification described above.
        std_y: target standard deviation, forwarded to
            compute_denormalized_mse() for the reported metric.

    Returns:
        The model, compiled with MSE loss and Adam (``lr=0.001``).
    """
    tensors = [Input(shape=(sizes[0],))]
    total = len(sizes)
    pos = 1

    while pos < total:
        spec = sizes[pos]

        if spec < 0:
            # consume the whole run of negative entries in one go
            branches = [tensors[-1]]
            while pos < total and sizes[pos] < 0:
                branches.append(tensors[sizes[pos] - 1])
                pos += 1
            tensors.append(Concatenate()(branches))
            # pos already points at the entry after the run
            continue

        if spec < 1:
            new_tensor = Dropout(spec)(tensors[-1])
        else:
            dense = Dense(spec, kernel_initializer=VarianceScaling(2, 'fan_in'))
            new_tensor = PReLU()(dense(tensors[-1]))

        tensors.append(new_tensor)
        pos += 1

    tensors.append(Dense(1)(tensors[-1]))

    optimizer = Adam(lr=0.001)
    model = Model(inputs=tensors[0], outputs=tensors[-1])
    model.compile(
        loss='mse',
        optimizer=optimizer,
        metrics=[compute_denormalized_mse(std_y)],
    )
    return model

In [71]:
def run_experiment(network, use_trend, feature_level, batch_size=1024, verbose=2):
    """Train one network configuration on the module-level ``df``.

    Features are built with get_features(), the data is split and
    normalised with get_train_test_data() using ``'phi_m'`` as target, and
    a model made by build_model() is fitted for at most 1000 epochs.
    The held-out months are passed as ``validation_data``, so the
    validation metrics seen by the callbacks are computed on the test
    split.

    Args:
        network: layer specification appended to the input size and handed
            to build_model().
        use_trend: forwarded to get_features().
        feature_level: forwarded to get_features().
        batch_size: mini-batch size used for fitting.
        verbose: verbosity for fitting and callbacks; the log directory is
            printed when it is greater than 0.

    Returns:
        ``(hist, logdir)``: the Keras history object and the directory that
        received the checkpoints and TensorBoard logs.
    """
    ddf, features = get_features(df, use_trend, feature_level)
    features, train_x, train_y, test_x, test_y, mean_y, std_y = get_train_test_data(ddf, features, 'phi_m')

    K.clear_session()  # https://stackoverflow.com/q/35114376/521776
    model = build_model([len(features)] + network, std_y=std_y)

    # Format the UTC timestamp explicitly.  Slicing isoformat() to strip the
    # microseconds cut off the time of day whenever microsecond == 0,
    # because isoformat() omits the fraction in that case.
    dtime = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%d-%H%M%S')
    # directory name kept byte-for-byte (including 'tren') so that new runs
    # sit next to the existing log directories
    logdir = 'dev/logs/%s-tren:%s-features:%s-batchsize:%s-nparam:%s/' % (
        dtime, use_trend, feature_level, batch_size, model.count_params()
    )

    if verbose > 0:
        print('Saving to', logdir)

    callbacks = [
        ReduceLROnPlateau(factor=0.5, verbose=verbose, min_lr=1e-6, patience=20),
        ModelCheckpoint(logdir + 'weights-w.{epoch:04d}-{val_loss:.4f}.hdf5',
                        verbose=verbose, save_best_only=True),
        TensorBoard(logdir, write_graph=True, write_grads=True, histogram_freq=0),
        EarlyStopping(min_delta=0.0001, patience=50),
    ]

    hist = model.fit(
        train_x, train_y,
        batch_size=batch_size, epochs=1000,
        verbose=verbose,
        shuffle=True,
        callbacks=callbacks,
        validation_data=(test_x, test_y)
    )

    return hist, logdir

In [None]:
# Layer specifications handed to run_experiment (the input size is prepended
# there): values >= 1 are dense-layer widths, 0.5 entries are dropout layers.
models = [
    [64, 32, 16, 8, 4, 2, 1],
    [128, 64, 32, 16, 8, 4, 2, 1],
    [256, 0.5, 128, 64, 32, 16, 8, 4, 2, 1],
    [512, 0.5, 256, 0.5, 128, 64, 32, 16, 8, 4, 2, 1],
]

# Batch size for every run.  It used to be picked up from leftover kernel
# state; it is defined here so the cell works after Restart & Run All.
# 1024 is the value shown in the recorded output of this cell.
bsize = 1024

# Sweep feature level x trend on/off x architecture and report, for each
# run, the lowest validation MSE in un-normalised units and its epoch index.
for fset in [1, 2, 3, 4, 5]:
    for trend in [True, False]:
        for i, network in enumerate(models):
            hist, logdir = run_experiment(network, trend, fset, bsize, verbose=0)
            val_mse = hist.history['val_denormalized_mse']
            best_at = np.argmin(val_mse)
            best = val_mse[best_at]

            print('bsize %s fset %s trend %s network %s best %f @ %s - logdir %s' % (
                bsize, fset, trend, i, best, best_at, logdir
            ))



bsize 1024 fset 1 trend True network 0 best 0.325869 @ 31 - logdir dev/logs/20180323-084738-tren:True-features:1-batchsize:1024-nparam:4202/
bsize 1024 fset 1 trend True network 1 best 0.346951 @ 41 - logdir dev/logs/20180323-085644-tren:True-features:1-batchsize:1024-nparam:13866/
bsize 1024 fset 1 trend True network 2 best 0.310231 @ 106 - logdir dev/logs/20180323-091131-tren:True-features:1-batchsize:1024-nparam:49578/
bsize 1024 fset 1 trend True network 3 best 0.311047 @ 12 - logdir dev/logs/20180323-095544-tren:True-features:1-batchsize:1024-nparam:186538/
bsize 1024 fset 1 trend False network 0 best 0.319747 @ 69 - logdir dev/logs/20180323-103254-tren:False-features:1-batchsize:1024-nparam:3626/
bsize 1024 fset 1 trend False network 1 best 0.360080 @ 35 - logdir dev/logs/20180323-104434-tren:False-features:1-batchsize:1024-nparam:12714/
bsize 1024 fset 1 trend False network 2 best 0.354544 @ 24 - logdir dev/logs/20180323-105749-tren:False-features:1-batchsize:1024-nparam:47274/
