Here we combine the pieces from the neural network (nnet) and neural architecture search notebooks and see what happens.

In [1]:
import pandas as pd
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import datetime
import time
from scipy.stats import probplot
import datetime
import seaborn as sns
sns.set()
from sklearn.utils import shuffle
from keras.optimizers import RMSprop, Adam
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, TensorBoard, EarlyStopping
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, BatchNormalization, GaussianNoise, Input, PReLU, Activation, Concatenate
from keras.initializers import VarianceScaling
from keras import regularizers 
from keras.models import load_model
from keras import backend as K
from sklearn import metrics
import joblib

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
class ControllerRNN:
    """Recurrent controller that proposes architectures and learns from rewards.

    A GRU cell is unrolled for ``max_len`` steps. At every step two softmax
    distributions are produced from the cell output: one over layer types and
    one over a discrete layer argument. ``generate_architecture`` samples from
    these distributions, ``learn_from_rewards`` feeds the sampled actions and
    the obtained rewards back and performs one optimizer step.

    ``build`` must be called (inside the graph the controller should live in)
    before the other two methods.
    """

    def __init__(self, max_len, batch_size, type_size, arg_size,
                 learning_rate=0.001, hidden_size=32, baseline_smoothing=0.95):
        """Store the hyper-parameters; no TensorFlow ops are created here.

        Args:
            max_len: number of steps the RNN is unrolled for, i.e. the maximum
                number of (type, arg) pairs in a generated architecture.
            batch_size: number of architectures generated per call.
            type_size: number of layer types, excluding the end-of-network
                token that is added as type 0.
            arg_size: number of values the discrete layer argument can take.
            learning_rate: learning rate of the Adam optimizer.
            hidden_size: number of units of the GRU cell.
            baseline_smoothing: decay of the exponential moving average of the
                reward that is used as baseline.
        """
        self.hidden_size = hidden_size
        self.unroll_by = max_len
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.type_size = type_size + 1  # 0 is for end-of-network token
        self.arg_size = arg_size
        self.baseline_smoothing = baseline_smoothing

    def build(self):
        """Create placeholders, the unrolled RNN, the loss and the update op.

        Sets ``architecture_reward``, ``last_average_reward``, ``reward_ema``,
        ``update_reward_ema``, ``layer_probs``, ``layer_indicators``, ``loss``
        and ``optimize`` on the instance. Ops are added to the current default
        graph.
        """
        # reward for the architectures
        self.architecture_reward = tf.placeholder(tf.float32, [self.batch_size])

        # exponential moving average of the reward
        self.last_average_reward = tf.reduce_mean(self.architecture_reward)
        self.reward_ema = tf.train.ExponentialMovingAverage(self.baseline_smoothing)
        self.update_reward_ema = self.reward_ema.apply([self.last_average_reward])

        rnn = tf.contrib.rnn.GRUCell(self.hidden_size)
        # the initial state is random noise, re-drawn on every session.run
        state = tf.random_normal([self.batch_size, rnn.state_size])

        # weight matrices to transform from rnn output to layer type and discrete arg
        rnn_to_layer_type_weight = tf.Variable(tf.random_normal([rnn.output_size, self.type_size]))
        rnn_to_layer_type_gradient = []
        rnn_to_layer_arg_weight = tf.Variable(tf.random_normal([rnn.output_size, self.arg_size]))
        rnn_to_layer_arg_gradient = []

        # rnn output and gradients
        # the first input of the cell is random noise as well; afterwards the
        # previous output is fed back as input
        output = tf.random_normal([self.batch_size, rnn.output_size])

        # layer_probs contains the output from the network, namely the probabilities
        # of type and argument for every layer of every network
        self.layer_probs = []

        # layer_indicators contains one-hot indicators of type and argument
        # for every layer of every network.
        # used to select which action is used to compute the gradient.
        # fixed, must be set before updating the weights
        self.layer_indicators = []

        losses = []

        for i in range(self.unroll_by):
            # run rnn cell
            output, state = rnn(output, state)

            if i == 0:  # gru variables are only initialized now
                # trainable_variables and trainable_weights can refer to the
                # very same variable objects; keep each variable only once so
                # that a single update per variable reaches the optimizer.
                # Identity comparison avoids relying on Variable.__eq__.
                rnn_params = []
                for var in rnn.trainable_variables + rnn.trainable_weights:
                    if not any(var is seen for seen in rnn_params):
                        rnn_params.append(var)
                rnn_gradients = [[] for _ in range(len(rnn_params))]

            # compute output probabilites
            layer_type = tf.nn.softmax(tf.matmul(output, rnn_to_layer_type_weight))
            layer_arg = tf.nn.softmax(tf.matmul(output, rnn_to_layer_arg_weight))
            chosen_layer_type = tf.placeholder(tf.int32, self.batch_size)
            chosen_layer_arg = tf.placeholder(tf.int32, self.batch_size)
            self.layer_probs.append((layer_type, layer_arg))
            self.layer_indicators.append((chosen_layer_type, chosen_layer_arg))

            # aggregate gradients
            # (batch-mean reward - baseline) * log-probability of the chosen
            # type and argument; the 1e-12 keeps the log finite. The optimizer
            # descends on this quantity, so actions followed by a reward below
            # the baseline become more likely.
            baseline = self.reward_ema.average(self.last_average_reward)
            prob = (self.last_average_reward - baseline) * (
                tf.reduce_sum(
                    tf.one_hot(chosen_layer_type, depth=self.type_size) * tf.log(layer_type + 1e-12),
                    axis=1
                ) + tf.reduce_sum(
                    tf.one_hot(chosen_layer_arg, depth=self.arg_size) * tf.log(layer_arg + 1e-12),
                    axis=1
                )
            )
            losses.append(prob)

            # one gradient per unroll step and parameter, averaged later
            rnn_to_layer_arg_gradient.append(tf.gradients(prob, rnn_to_layer_arg_weight)[0])
            rnn_to_layer_type_gradient.append(tf.gradients(prob, rnn_to_layer_type_weight)[0])
            for param, grad in zip(rnn_params, rnn_gradients):
                grad.append(tf.gradients(prob, param)[0])

        self.loss = tf.reduce_mean(losses)

        def sanitize_gradient(grads):
            # average over the unroll steps, then clip the norm to 1
            avg = sum(grads) / len(grads)
            return tf.clip_by_norm(avg, 1.0)

        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.optimize = optimizer.apply_gradients([
            (sanitize_gradient(grad), param)
            for param, grad in zip(rnn_params, rnn_gradients)
        ] + [
            (sanitize_gradient(rnn_to_layer_type_gradient), rnn_to_layer_type_weight),
            (sanitize_gradient(rnn_to_layer_arg_gradient), rnn_to_layer_arg_weight),
        ])

    def generate_architecture(self, session):
        """Sample ``batch_size`` architectures from the controller.

        Args:
            session: session used to evaluate ``layer_probs``.

        Returns:
            A list with one entry per network; each entry is a list of
            ``(layer_type, layer_arg)`` tuples. A network stops growing after
            a layer of type 0 has been sampled (that layer is kept), so its
            length is between 1 and ``max_len``.
        """
        layers = session.run(self.layer_probs)
        networks = [[] for _ in range(self.batch_size)]
        for (ltype, larg) in layers:
            for i, (nnet, type_prob, arg_prob) in enumerate(zip(networks, ltype, larg)):
                # the network always has at least one layer
                if nnet and nnet[-1][0] == 0:
                    continue

                assert all(np.isfinite(type_prob))
                assert all(np.isfinite(arg_prob))

                layer_type = np.random.choice(len(type_prob), p=type_prob)
                layer_arg = np.random.choice(len(arg_prob), p=arg_prob)
                nnet.append((layer_type, layer_arg))

        return networks

    def learn_from_rewards(self, sess, networks, rewards):
        """Run one update of the controller and of the reward baseline.

        Args:
            sess: session holding the controller variables.
            networks: architectures as returned by ``generate_architecture``.
            rewards: one reward per network, ``batch_size`` values.

        Returns:
            The value of ``self.loss`` for this batch.
        """
        assert len(rewards) == self.batch_size

        # set the indicator variables, telling which action was chosen
        feed_dict = {ind: []
                     for layer_ind in self.layer_indicators
                     for ind in layer_ind}

        for nnet in networks:
            # pad network if shorter than expected
            # we set the indicators to -1, so that all one hot will be 0
            # thus not contributing to the gradient
            if len(nnet) < self.unroll_by:
                nnet = nnet + [(-1, -1)] * (self.unroll_by - len(nnet))

            assert len(nnet) == self.unroll_by
            for (itype, iarg), (ntype, narg) in zip(self.layer_indicators, nnet):
                feed_dict[itype].append(ntype)
                feed_dict[iarg].append(narg)

        feed_dict[self.architecture_reward] = rewards
        # NOTE(review): the baseline update and the optimizer step are fetched
        # in the same run call, so their relative order is not pinned here.
        loss, _, _ = sess.run([self.loss, self.update_reward_ema, self.optimize],
                              feed_dict=feed_dict)
        return loss

    
class MovingAverages:
    """Exponentially smoothed metrics with optional snapshots.

    Attributes are plain containers and may be read directly:
    ``metrics`` maps a metric name to its current smoothed value,
    ``smoothing`` maps a metric name to the smoothing factor last used for it,
    ``snapshots`` is the list of dictionaries recorded by ``snapshot``.
    """

    def __init__(self):
        self.metrics = {}
        self.smoothing = {}
        self.snapshots = []

    def update(self, metric, value, smoothing=None):
        """Fold ``value`` into the running average of ``metric``.

        Args:
            metric: name of the metric.
            value: new observation, or ``None`` to only change the smoothing.
            smoothing: weight of the previous average; when omitted, the
                factor stored for this metric is reused (0.6 if there is none).

        Returns:
            The current smoothed value, or ``None`` when the metric has not
            received any value yet.
        """
        if smoothing is None:
            smoothing = self.smoothing.get(metric, 0.6)
        self.smoothing[metric] = smoothing

        # can pass None to update smoothing
        if value is not None:
            # the first observation is used as its own "previous" average
            previous = self.metrics.get(metric, value)
            self.metrics[metric] = smoothing * previous + (1 - smoothing) * value

        # .get, because a smoothing-only update may precede the first value
        return self.metrics.get(metric)

    def update_all(self, **metrics):
        """Update several metrics at once and return their smoothed values.

        The returned list follows the order of the keyword arguments.
        """
        return [self.update(metric, value) for metric, value in metrics.items()]

    def snapshot(self, **meta):
        """Record a copy of the current metrics merged with ``meta``.

        Keys in ``meta`` override metrics of the same name in the copy. The
        snapshot is appended to ``snapshots`` and returned.
        """
        snap = dict(self.metrics)
        snap.update(meta)
        self.snapshots.append(snap)
        return snap

    
def make_index(dtimes, interval):
    """Build look-back / look-ahead position indices over sorted timestamps.

    Args:
        dtimes: sequence of timestamps supporting subtraction; it must already
            be sorted in ascending order.
        interval: largest allowed distance, in the units of ``dtimes``
            differences. Expected to be non-negative.

    Returns:
        A tuple ``(index_above, index_below)`` of integer arrays with one
        entry per timestamp:

        * ``index_below[i]`` is the smallest position ``j`` such that
          ``dtimes[i] - dtimes[j] <= interval``.
        * ``index_above[j]`` is the last position ``i`` whose
          ``index_below[i]`` equals ``j``; positions that were never selected
          as a "below" index inherit the value of the closest earlier
          position that was.

        Both arrays are empty when ``dtimes`` is empty.
    """
    count = len(dtimes)
    # -1 marks entries that have not been assigned yet
    index_below = np.full(count, -1, dtype=int)
    index_above = np.full(count, -1, dtype=int)
    if count == 0:
        return index_above, index_below

    for i, x in enumerate(dtimes):
        # timestamps are sorted, so the search can resume where the previous
        # one stopped instead of starting from the beginning
        j = index_below[i - 1] if i > 0 else 0
        while x - dtimes[j] > interval:
            j += 1

        index_below[i] = j
        index_above[j] = i

    # forward-fill the positions that no timestamp pointed back to;
    # position 0 is always assigned by the first loop iteration
    last_above = index_above[0]
    for i in range(count):
        if index_above[i] < 0:
            index_above[i] = last_above
        else:
            last_above = index_above[i]

    return index_above, index_below


def compute_trend(df, columns, interval=3600):
    """Add a ``<column>_trend`` column for every requested column.

    Rows are sorted by ``datetime`` and processed separately for every value
    of ``z``. For each row the trend is the difference between its value and
    the value of the reference row picked by ``make_index`` (second returned
    index), divided by the ``datetime`` difference between the two rows and
    multiplied by 3600. When a row is its own reference the time difference
    is zero and the division is left to NumPy.

    Args:
        df: frame with ``datetime``, ``z`` and the requested columns.
        columns: names of the columns to compute the trend of.
        interval: passed through to ``make_index``.

    Returns:
        ``(frame, names)``: the sorted frame with the new columns, and the
        list of new column names in the order of ``columns``.
    """
    df = df.sort_values('datetime')

    for level in df.z.unique():
        level_mask = df.z == level
        level_rows = df[level_mask]
        timestamps = level_rows.datetime.values

        # only the look-back index is needed here
        _, reference = make_index(timestamps, interval)
        elapsed = timestamps - timestamps[reference]

        for name in columns:
            current = level_rows[name].values
            change = current - current[reference]
            df.loc[level_mask, name + '_trend'] = 3600 * change / elapsed

    trend_names = [name + '_trend' for name in columns]
    return df, trend_names


def get_features(df, use_trend, feature_level):
    """Attach per-level wind/temperature columns and pick the feature names.

    ``wind`` and ``temp`` are pivoted on ``z`` for every ``(ds, tt)`` pair and
    merged back, so every row also carries ``wind_<z>`` / ``temp_<z>`` columns
    of the other levels.

    Args:
        df: frame with at least ``ds``, ``tt``, ``z``, ``wind`` and ``temp``.
        use_trend: when true, ``compute_trend`` is applied to every selected
            feature except ``z`` and the resulting columns are added to the
            feature list.
        feature_level: how many of the predefined feature groups to use,
            counted from the first one.

    Returns:
        ``(frame, features)``: the augmented frame and the list of feature
        column names.
    """
    per_level = df.pivot_table(
        values=['wind', 'temp'], columns='z', index=['ds', 'tt']
    ).reset_index()

    flat_names = []
    for name, level in per_level.columns.values:
        # columns coming from the index have an empty level and keep their name
        # NOTE(review): a level that is falsy (e.g. z == 0) would also keep the
        # bare name -- confirm no such level exists in the data
        flat_names.append('%s_%d' % (name, level) if level else name)
    per_level.columns = flat_names

    df = df.merge(per_level, on=['ds', 'tt'])

    feature_sets = [
        [
            'z', 'wind', 'temp', 'soil_temp',
            'wind_10', 'wind_20', 'wind_40',
            'temp_10', 'temp_20', 'temp_40',
        ],
        ['soilheat'],
        ['netrad'],
        ['rain', 'dewpoint'],
        ['H', 'LE'],
    ]

    features = []
    for group in feature_sets[:feature_level]:
        features.extend(group)

    if not use_trend:
        return df, features

    trend_inputs = [name for name in features if name != 'z']
    df, trend_names = compute_trend(df, trend_inputs)
    return df, features + trend_names


def get_train_test_data(df, features, target, samples_count, n_months=12):
    """Split by randomly chosen ``ds`` values and standardize both parts.

    Rows with missing values are dropped first. ``n_months`` distinct ``ds``
    values are drawn at random and form the test set; everything else is the
    training set. Features and target are standardized with the mean and
    standard deviation of the training part.

    Args:
        df: source frame with a ``ds`` column, the features and the target.
        features: list of feature column names.
        target: name of the target column.
        samples_count: when positive, the training set is subsampled to this
            many rows and the test set is subsampled so that the train/test
            proportion is maintained; otherwise all rows are used.
        n_months: number of distinct ``ds`` values held out for testing.

    Returns:
        ``(train_x, train_y, test_x, test_y, mean_y, std_y)`` where the last
        two are the statistics used to standardize the target.
    """
    clean = df.dropna()

    # get random test months
    held_out = np.random.choice(clean.ds.unique(), n_months, replace=False)
    in_test = clean.ds.isin(held_out)
    train_df = clean.loc[~in_test]
    test_df = clean.loc[in_test]

    if samples_count > 0:
        # the test size has to be derived before the training set shrinks
        test_size = int(samples_count * len(test_df) / len(train_df))
        train_df = train_df.sample(samples_count)
        test_df = test_df.sample(test_size)

    train_x, test_x = train_df[features], test_df[features]
    train_y, test_y = train_df[target], test_df[target]

    # statistics come from the training part only
    mean_x, std_x = train_x.mean(), train_x.std()
    mean_y, std_y = train_y.mean(), train_y.std()

    train_x = (train_x - mean_x) / std_x
    test_x = (test_x - mean_x) / std_x

    assert np.all(np.isfinite(train_x))

    train_y = (train_y - mean_y) / std_y
    test_y = (test_y - mean_y) / std_y

    return train_x, train_y, test_x, test_y, mean_y, std_y
    

def compute_denormalized_mse(std_y):
    """Return a metric function reporting the MSE in the original target units.

    Args:
        std_y: standard deviation the target was divided by.

    Returns:
        A function ``denormalized_mse(y_true, y_pred)`` built on the Keras
        backend.
    """
    def denormalized_mse(y_true, y_pred):
        # the model sees a standardized target; scaling the squared error by
        # the variance makes the value comparable with MOST
        squared_error = K.square(y_true - y_pred)
        normalized_mse = K.mean(squared_error, axis=-1)
        return normalized_mse * std_y**2

    return denormalized_mse


def build_model(input_shape, architecture, std_y=1):
    """Turn a list of ``(layer_type, layer_arg)`` pairs into a compiled model.

    Layer types:
        * 0 or 1: ``Dense`` with ``2**layer_arg`` units, followed by batch
          normalization and ``PReLU``.
        * 2: ``Dropout`` with argument ``(layer_arg + 1) / 10``.
        * 3: no layer is added; subsequent dense layers of this kind use an
          L2 kernel regularizer of strength ``6 ** -layer_arg``.

    A final ``Dense(1)`` output is always appended.

    Args:
        input_shape: number of input features.
        architecture: iterable of ``(layer_type, layer_arg)`` pairs.
        std_y: target standard deviation, forwarded to the denormalized MSE
            metric.

    Returns:
        The compiled Keras model (MSE loss, Adam optimizer).

    Raises:
        ValueError: for a layer type outside 0..3.
    """
    inputs = Input(shape=(input_shape,))
    current = inputs
    active_regularizer = None

    for layer_type, layer_arg in architecture:
        if layer_type in (0, 1):
            units = 2**layer_arg
            dense_out = Dense(
                units,
                kernel_initializer=VarianceScaling(2, 'fan_in'),
                kernel_regularizer=active_regularizer,
            )(current)
            normalized = BatchNormalization()(dense_out)
            current = PReLU()(normalized)
        elif layer_type == 2:
            # NOTE(review): this value is Dropout's first positional argument,
            # i.e. the fraction of units dropped in Keras -- confirm that is
            # intended rather than a keep probability
            rate = (layer_arg + 1) / 10
            current = Dropout(rate)(current)
        elif layer_type == 3:
            # only affects dense layers created after this point
            active_regularizer = regularizers.l2(6 ** -layer_arg)
        else:
            raise ValueError('layer type from 0 to 3')

    prediction = Dense(1)(current)

    optimizer = Adam(lr=0.001)
    model = Model(inputs=inputs, outputs=prediction)
    model.compile(
        loss='mse',
        optimizer=optimizer,
        metrics=[compute_denormalized_mse(std_y)],
    )
    return model


def evaluate_architecture(step, arch_idx, architecture, max_epochs, samples_count):
    """Train one architecture and return its best validation MSE.

    A fresh train/test split of the module-level ``ddf`` frame (columns from
    the module-level ``features``, target ``phi_m``) is drawn on every call.
    The held-out part is used as validation data during training.

    Args:
        step: controller step, only used to name the TensorBoard directory.
        arch_idx: index of the architecture within the step, also only used
            for the log directory name.
        architecture: list of ``(layer_type, layer_arg)`` pairs.
        max_epochs: upper bound on the training epochs.
        samples_count: forwarded to ``get_train_test_data``.

    Returns:
        The smallest ``val_denormalized_mse`` seen over the epochs.
    """
    dataset = get_train_test_data(
        ddf, features, 'phi_m', samples_count, n_months=12
    )
    train_x, train_y, test_x, test_y, _, std_y = dataset

    # K.clear_session() is deliberately not called here
    # (see https://stackoverflow.com/q/35114376/521776)
    n_inputs = train_x.shape[1]
    model = build_model(n_inputs, architecture, std_y=std_y)

    logdir = 'dev/logs/nas/step-%d-arch-%d' % (step, arch_idx)
    lr_schedule = ReduceLROnPlateau(factor=0.1, verbose=0, min_lr=1e-6, patience=10)
    board = TensorBoard(logdir, write_graph=True, write_grads=True, histogram_freq=0)
    stopper = EarlyStopping(min_delta=0.001, patience=25)

    history = model.fit(
        train_x, train_y,
        batch_size=1024,
        epochs=max_epochs,
        verbose=0,
        shuffle=True,
        callbacks=[lr_schedule, board, stopper],
        validation_data=(test_x, test_y)
    )

    return min(history.history['val_denormalized_mse'])

In [5]:
def load_data():
    """Read the processed Cabauw CSV and drop the rows that are not used.

    ``'--'`` is treated as a missing value while reading. Only rows with
    ``ustar > 0.1``, ``|H| > 10`` and ``wind > 1`` are kept, and every row
    whose ``ds`` equals 201603 is discarded.

    Returns:
        The filtered frame.
    """
    dframe_path = 'data/cabauw/processed-full-log.csv.gz'
    raw = pd.read_csv(dframe_path, na_values='--', compression='gzip')

    enough_ustar = raw.ustar > 0.1
    enough_flux = abs(raw.H) > 10
    enough_wind = raw.wind > 1
    not_excluded = raw.ds != 201603

    return raw[enough_ustar & enough_flux & enough_wind & not_excluded]

# Load the filtered measurements once. `ddf` and `features` are module-level
# names that evaluate_architecture reads when it builds its train/test split.
df = load_data()
# feature_level=4 selects the first four feature groups; trend columns are
# added for every selected feature except 'z'.
ddf, features = get_features(df, use_trend=True, feature_level=4)



In [8]:
# Controller producing one architecture per step (batch_size=1) of at most
# 20 (type, arg) pairs. type_size=3 plus the end-of-network token gives the
# layer types 0..3; arg_size=10 gives the arguments 0..9.
controller = ControllerRNN(
    hidden_size=64,
    max_len=20,
    batch_size=1,
    type_size=3,
    arg_size=10,
    learning_rate=0.001,
    baseline_smoothing=0.99,
)

# NOTE(review): `hist` is not used anywhere in the visible code.
hist = []
# The controller lives in its own graph and session, separate from the
# temporary graphs in which the candidate architectures are trained.
controller_graph = tf.Graph()
with controller_graph.as_default():
    controller.build()
    controller_session = tf.Session(graph=controller_graph)
    controller_session.run(tf.global_variables_initializer())

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [None]:
# Main search loop: sample architectures, train them, feed the validation
# MSE back to the controller and print smoothed statistics.
averages = MovingAverages()
averages.smoothing['time'] = 0
start_time = time.time()
for step_idx in range(10000):
    # training budget grows as the smoothed MSE of the candidates improves;
    # a samples_count of -1 means "use all rows"
    avg_mse = averages.metrics.get('inner_mse', 10)
    if avg_mse > 1:
        samples_count, max_epochs = 250000, 5
    elif avg_mse > 0.5:
        samples_count, max_epochs = 500000, 20
    elif avg_mse > 0.3:
        samples_count, max_epochs = -1, 50
    else:
        samples_count, max_epochs = -1, 500

    architectures = controller.generate_architecture(controller_session)

    # test architectures on a temporary graph
    with tf.Graph().as_default():
        # entering the session itself (instead of session.as_default())
        # installs it as the default session AND closes it on exit, so the
        # per-step session no longer leaks
        with tf.Session():
            rewards = [
                evaluate_architecture(
                    step_idx, arch_idx, arch,
                    max_epochs, samples_count
                )
                for arch_idx, arch in enumerate(architectures)
            ]

    loss = controller.learn_from_rewards(
        controller_session, architectures, rewards
    )

    averages.update_all(
        contro_loss=loss**2,
        inner_mse=np.mean(rewards),
        inner_mse_std=np.std(rewards),
        length=np.mean(list(map(len, architectures))),
        fc_count=np.mean([sum(1 for lt, _ in arch if lt <= 1) for arch in architectures]),
        dropo_count=np.mean([sum(1 for lt, _ in arch if lt == 2) for arch in architectures]),
        regu_count=np.mean([sum(1 for lt, _ in arch if lt == 3) for arch in architectures]),
    )

    # reporting period in steps; 1 reports every step
    if step_idx % 1 == 0:
        snap = averages.snapshot(step=step_idx, time=time.time() - start_time)
        print('  '.join(
            '%s=%.3f' % metric for metric in snap.items()
        ))

contro_loss=103.844  inner_mse=1.435  inner_mse_std=0.000  length=20.000  fc_count=0.000  dropo_count=1.000  regu_count=19.000  step=0.000  time=23.800
contro_loss=62.420  inner_mse=0.988  inner_mse_std=0.000  length=12.400  fc_count=0.400  dropo_count=0.600  regu_count=11.400  step=1.000  time=48.087
contro_loss=37.452  inner_mse=0.701  inner_mse_std=0.000  length=7.840  fc_count=0.640  dropo_count=0.360  regu_count=6.840  step=2.000  time=221.593
contro_loss=22.474  inner_mse=0.532  inner_mse_std=0.000  length=5.104  fc_count=0.784  dropo_count=0.216  regu_count=4.104  step=3.000  time=283.459
contro_loss=13.610  inner_mse=0.471  inner_mse_std=0.000  length=3.462  fc_count=0.870  dropo_count=0.130  regu_count=2.462  step=4.000  time=312.700
contro_loss=28.278  inner_mse=0.715  inner_mse_std=0.000  length=10.077  fc_count=4.522  dropo_count=3.678  regu_count=1.877  step=5.000  time=3454.817
contro_loss=72.059  inner_mse=0.706  inner_mse_std=0.000  length=14.046  fc_count=2.713  dropo_

contro_loss=73.191  inner_mse=0.724  inner_mse_std=0.000  length=5.936  fc_count=1.884  dropo_count=1.455  regu_count=2.598  step=54.000  time=10586.618
contro_loss=52.539  inner_mse=0.975  inner_mse_std=0.000  length=4.762  fc_count=1.530  dropo_count=1.673  regu_count=1.559  step=55.000  time=10640.526
contro_loss=31.684  inner_mse=0.713  inner_mse_std=0.000  length=5.257  fc_count=3.318  dropo_count=1.004  regu_count=0.935  step=56.000  time=10797.402
contro_loss=19.011  inner_mse=0.548  inner_mse_std=0.000  length=3.554  fc_count=2.391  dropo_count=0.602  regu_count=0.561  step=57.000  time=10887.765
contro_loss=11.530  inner_mse=0.780  inner_mse_std=0.000  length=2.933  fc_count=1.835  dropo_count=0.761  regu_count=0.337  step=58.000  time=11054.288
contro_loss=7.205  inner_mse=0.831  inner_mse_std=0.000  length=2.960  fc_count=1.901  dropo_count=0.857  regu_count=0.202  step=59.000  time=11092.624
contro_loss=4.366  inner_mse=0.624  inner_mse_std=0.000  length=2.976  fc_count=1.5

contro_loss=1.715  inner_mse=0.340  inner_mse_std=0.000  length=9.832  fc_count=8.346  dropo_count=0.100  regu_count=1.385  step=108.000  time=38337.550
contro_loss=1.041  inner_mse=0.340  inner_mse_std=0.000  length=6.299  fc_count=5.408  dropo_count=0.060  regu_count=0.831  step=109.000  time=38569.397
contro_loss=3.566  inner_mse=1.048  inner_mse_std=0.000  length=5.379  fc_count=3.645  dropo_count=0.836  regu_count=0.899  step=110.000  time=38768.225
contro_loss=13.512  inner_mse=1.130  inner_mse_std=0.000  length=5.628  fc_count=2.587  dropo_count=2.502  regu_count=0.539  step=111.000  time=38792.476
contro_loss=44.764  inner_mse=1.441  inner_mse_std=0.000  length=6.577  fc_count=3.152  dropo_count=3.101  regu_count=0.323  step=112.000  time=38879.164
contro_loss=26.869  inner_mse=1.025  inner_mse_std=0.000  length=6.746  fc_count=3.091  dropo_count=3.461  regu_count=0.194  step=113.000  time=38961.576
contro_loss=16.739  inner_mse=1.035  inner_mse_std=0.000  length=12.048  fc_cou

contro_loss=169.157  inner_mse=1.360  inner_mse_std=0.000  length=10.011  fc_count=2.998  dropo_count=3.847  regu_count=3.165  step=161.000  time=60349.195
contro_loss=101.494  inner_mse=1.091  inner_mse_std=0.000  length=6.806  fc_count=2.199  dropo_count=2.308  regu_count=2.299  step=162.000  time=60355.464
contro_loss=60.897  inner_mse=0.845  inner_mse_std=0.000  length=4.484  fc_count=1.719  dropo_count=1.385  regu_count=1.379  step=163.000  time=60378.693
contro_loss=36.562  inner_mse=0.617  inner_mse_std=0.000  length=3.090  fc_count=1.432  dropo_count=0.831  regu_count=0.828  step=164.000  time=60439.396
contro_loss=22.622  inner_mse=0.557  inner_mse_std=0.000  length=4.254  fc_count=3.259  dropo_count=0.499  regu_count=0.497  step=165.000  time=60522.222
contro_loss=13.602  inner_mse=0.498  inner_mse_std=0.000  length=2.953  fc_count=2.355  dropo_count=0.299  regu_count=0.298  step=166.000  time=60557.769
contro_loss=13.569  inner_mse=0.543  inner_mse_std=0.000  length=9.772  f

contro_loss=62.326  inner_mse=0.490  inner_mse_std=0.000  length=12.488  fc_count=2.105  dropo_count=0.153  regu_count=10.229  step=215.000  time=75586.635
contro_loss=41.152  inner_mse=1.277  inner_mse_std=0.000  length=15.493  fc_count=1.263  dropo_count=1.292  regu_count=12.938  step=216.000  time=75729.330
contro_loss=26.232  inner_mse=1.290  inner_mse_std=0.000  length=17.296  fc_count=0.758  dropo_count=1.175  regu_count=15.363  step=217.000  time=75734.928
contro_loss=17.470  inner_mse=0.957  inner_mse_std=0.000  length=13.577  fc_count=3.655  dropo_count=0.705  regu_count=9.218  step=218.000  time=75777.395
contro_loss=10.503  inner_mse=0.803  inner_mse_std=0.000  length=16.146  fc_count=2.193  dropo_count=0.423  regu_count=13.531  step=219.000  time=75801.552
contro_loss=6.303  inner_mse=0.771  inner_mse_std=0.000  length=17.688  fc_count=1.316  dropo_count=0.254  regu_count=16.118  step=220.000  time=75825.046
contro_loss=5.238  inner_mse=0.612  inner_mse_std=0.000  length=12

contro_loss=9.316  inner_mse=0.468  inner_mse_std=0.000  length=4.414  fc_count=0.856  dropo_count=0.165  regu_count=3.393  step=268.000  time=88980.976
contro_loss=5.605  inner_mse=0.415  inner_mse_std=0.000  length=3.049  fc_count=0.914  dropo_count=0.099  regu_count=2.036  step=269.000  time=89221.908
contro_loss=4.943  inner_mse=0.348  inner_mse_std=0.000  length=3.429  fc_count=2.148  dropo_count=0.059  regu_count=1.221  step=270.000  time=89828.464
contro_loss=4.258  inner_mse=0.702  inner_mse_std=0.000  length=5.258  fc_count=2.489  dropo_count=1.636  regu_count=1.133  step=271.000  time=92365.198
contro_loss=3.959  inner_mse=0.578  inner_mse_std=0.000  length=5.555  fc_count=3.093  dropo_count=0.981  regu_count=1.480  step=272.000  time=92700.392
contro_loss=2.396  inner_mse=0.482  inner_mse_std=0.000  length=3.733  fc_count=2.256  dropo_count=0.589  regu_count=0.888  step=273.000  time=92752.333
contro_loss=168.703  inner_mse=0.795  inner_mse_std=0.000  length=10.240  fc_count